Commit d8bf4ca9ca9576548628344c9725edd3786e90b1

Authored by Michal Hocko
Committed by Jiri Kosina
1 parent eb032b9837

rcu: treewide: Do not use rcu_read_lock_held when calling rcu_dereference_check

Since ca5ecddf (rcu: define __rcu address space modifier for sparse),
rcu_dereference_check() uses rcu_read_lock_held() as part of its
condition automatically, so callers do not have to pass it as well.

Signed-off-by: Michal Hocko <mhocko@suse.cz>
Acked-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Signed-off-by: Jiri Kosina <jkosina@suse.cz>

Showing 14 changed files with 6 additions and 27 deletions
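
For context, the calling pattern removed across these files looks roughly like the
sketch below. It is not taken from the diff itself; struct foo, gp_ptr and my_mutex
are made-up names used only for illustration.

#include <linux/lockdep.h>
#include <linux/mutex.h>
#include <linux/rcupdate.h>

struct foo { int data; };

static DEFINE_MUTEX(my_mutex);
static struct foo __rcu *gp_ptr;

/* Old style: the caller listed rcu_read_lock_held() explicitly. */
static struct foo *get_foo_old(void)
{
	return rcu_dereference_check(gp_ptr,
				     rcu_read_lock_held() ||
				     lockdep_is_held(&my_mutex));
}

/* New style: since ca5ecddf, rcu_dereference_check() folds in
 * rcu_read_lock_held() itself, so only the extra condition remains. */
static struct foo *get_foo_new(void)
{
	return rcu_dereference_check(gp_ptr,
				     lockdep_is_held(&my_mutex));
}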

include/linux/cgroup.h
1 #ifndef _LINUX_CGROUP_H 1 #ifndef _LINUX_CGROUP_H
2 #define _LINUX_CGROUP_H 2 #define _LINUX_CGROUP_H
3 /* 3 /*
4 * cgroup interface 4 * cgroup interface
5 * 5 *
6 * Copyright (C) 2003 BULL SA 6 * Copyright (C) 2003 BULL SA
7 * Copyright (C) 2004-2006 Silicon Graphics, Inc. 7 * Copyright (C) 2004-2006 Silicon Graphics, Inc.
8 * 8 *
9 */ 9 */
10 10
11 #include <linux/sched.h> 11 #include <linux/sched.h>
12 #include <linux/cpumask.h> 12 #include <linux/cpumask.h>
13 #include <linux/nodemask.h> 13 #include <linux/nodemask.h>
14 #include <linux/rcupdate.h> 14 #include <linux/rcupdate.h>
15 #include <linux/cgroupstats.h> 15 #include <linux/cgroupstats.h>
16 #include <linux/prio_heap.h> 16 #include <linux/prio_heap.h>
17 #include <linux/rwsem.h> 17 #include <linux/rwsem.h>
18 #include <linux/idr.h> 18 #include <linux/idr.h>
19 19
20 #ifdef CONFIG_CGROUPS 20 #ifdef CONFIG_CGROUPS
21 21
22 struct cgroupfs_root; 22 struct cgroupfs_root;
23 struct cgroup_subsys; 23 struct cgroup_subsys;
24 struct inode; 24 struct inode;
25 struct cgroup; 25 struct cgroup;
26 struct css_id; 26 struct css_id;
27 27
28 extern int cgroup_init_early(void); 28 extern int cgroup_init_early(void);
29 extern int cgroup_init(void); 29 extern int cgroup_init(void);
30 extern void cgroup_lock(void); 30 extern void cgroup_lock(void);
31 extern int cgroup_lock_is_held(void); 31 extern int cgroup_lock_is_held(void);
32 extern bool cgroup_lock_live_group(struct cgroup *cgrp); 32 extern bool cgroup_lock_live_group(struct cgroup *cgrp);
33 extern void cgroup_unlock(void); 33 extern void cgroup_unlock(void);
34 extern void cgroup_fork(struct task_struct *p); 34 extern void cgroup_fork(struct task_struct *p);
35 extern void cgroup_fork_callbacks(struct task_struct *p); 35 extern void cgroup_fork_callbacks(struct task_struct *p);
36 extern void cgroup_post_fork(struct task_struct *p); 36 extern void cgroup_post_fork(struct task_struct *p);
37 extern void cgroup_exit(struct task_struct *p, int run_callbacks); 37 extern void cgroup_exit(struct task_struct *p, int run_callbacks);
38 extern int cgroupstats_build(struct cgroupstats *stats, 38 extern int cgroupstats_build(struct cgroupstats *stats,
39 struct dentry *dentry); 39 struct dentry *dentry);
40 extern int cgroup_load_subsys(struct cgroup_subsys *ss); 40 extern int cgroup_load_subsys(struct cgroup_subsys *ss);
41 extern void cgroup_unload_subsys(struct cgroup_subsys *ss); 41 extern void cgroup_unload_subsys(struct cgroup_subsys *ss);
42 42
43 extern const struct file_operations proc_cgroup_operations; 43 extern const struct file_operations proc_cgroup_operations;
44 44
45 /* Define the enumeration of all builtin cgroup subsystems */ 45 /* Define the enumeration of all builtin cgroup subsystems */
46 #define SUBSYS(_x) _x ## _subsys_id, 46 #define SUBSYS(_x) _x ## _subsys_id,
47 enum cgroup_subsys_id { 47 enum cgroup_subsys_id {
48 #include <linux/cgroup_subsys.h> 48 #include <linux/cgroup_subsys.h>
49 CGROUP_BUILTIN_SUBSYS_COUNT 49 CGROUP_BUILTIN_SUBSYS_COUNT
50 }; 50 };
51 #undef SUBSYS 51 #undef SUBSYS
52 /* 52 /*
53 * This define indicates the maximum number of subsystems that can be loaded 53 * This define indicates the maximum number of subsystems that can be loaded
54 * at once. We limit to this many since cgroupfs_root has subsys_bits to keep 54 * at once. We limit to this many since cgroupfs_root has subsys_bits to keep
55 * track of all of them. 55 * track of all of them.
56 */ 56 */
57 #define CGROUP_SUBSYS_COUNT (BITS_PER_BYTE*sizeof(unsigned long)) 57 #define CGROUP_SUBSYS_COUNT (BITS_PER_BYTE*sizeof(unsigned long))
58 58
59 /* Per-subsystem/per-cgroup state maintained by the system. */ 59 /* Per-subsystem/per-cgroup state maintained by the system. */
60 struct cgroup_subsys_state { 60 struct cgroup_subsys_state {
61 /* 61 /*
62 * The cgroup that this subsystem is attached to. Useful 62 * The cgroup that this subsystem is attached to. Useful
63 * for subsystems that want to know about the cgroup 63 * for subsystems that want to know about the cgroup
64 * hierarchy structure 64 * hierarchy structure
65 */ 65 */
66 struct cgroup *cgroup; 66 struct cgroup *cgroup;
67 67
68 /* 68 /*
69 * State maintained by the cgroup system to allow subsystems 69 * State maintained by the cgroup system to allow subsystems
70 * to be "busy". Should be accessed via css_get(), 70 * to be "busy". Should be accessed via css_get(),
 71 * css_tryget() and css_put(). 71 * css_tryget() and css_put().
72 */ 72 */
73 73
74 atomic_t refcnt; 74 atomic_t refcnt;
75 75
76 unsigned long flags; 76 unsigned long flags;
77 /* ID for this css, if possible */ 77 /* ID for this css, if possible */
78 struct css_id __rcu *id; 78 struct css_id __rcu *id;
79 }; 79 };
80 80
81 /* bits in struct cgroup_subsys_state flags field */ 81 /* bits in struct cgroup_subsys_state flags field */
82 enum { 82 enum {
83 CSS_ROOT, /* This CSS is the root of the subsystem */ 83 CSS_ROOT, /* This CSS is the root of the subsystem */
84 CSS_REMOVED, /* This CSS is dead */ 84 CSS_REMOVED, /* This CSS is dead */
85 }; 85 };
86 86
87 /* Caller must verify that the css is not for root cgroup */ 87 /* Caller must verify that the css is not for root cgroup */
88 static inline void __css_get(struct cgroup_subsys_state *css, int count) 88 static inline void __css_get(struct cgroup_subsys_state *css, int count)
89 { 89 {
90 atomic_add(count, &css->refcnt); 90 atomic_add(count, &css->refcnt);
91 } 91 }
92 92
93 /* 93 /*
94 * Call css_get() to hold a reference on the css; it can be used 94 * Call css_get() to hold a reference on the css; it can be used
95 * for a reference obtained via: 95 * for a reference obtained via:
96 * - an existing ref-counted reference to the css 96 * - an existing ref-counted reference to the css
97 * - task->cgroups for a locked task 97 * - task->cgroups for a locked task
98 */ 98 */
99 99
100 static inline void css_get(struct cgroup_subsys_state *css) 100 static inline void css_get(struct cgroup_subsys_state *css)
101 { 101 {
102 /* We don't need to reference count the root state */ 102 /* We don't need to reference count the root state */
103 if (!test_bit(CSS_ROOT, &css->flags)) 103 if (!test_bit(CSS_ROOT, &css->flags))
104 __css_get(css, 1); 104 __css_get(css, 1);
105 } 105 }
106 106
107 static inline bool css_is_removed(struct cgroup_subsys_state *css) 107 static inline bool css_is_removed(struct cgroup_subsys_state *css)
108 { 108 {
109 return test_bit(CSS_REMOVED, &css->flags); 109 return test_bit(CSS_REMOVED, &css->flags);
110 } 110 }
111 111
112 /* 112 /*
113 * Call css_tryget() to take a reference on a css if your existing 113 * Call css_tryget() to take a reference on a css if your existing
114 * (known-valid) reference isn't already ref-counted. Returns false if 114 * (known-valid) reference isn't already ref-counted. Returns false if
115 * the css has been destroyed. 115 * the css has been destroyed.
116 */ 116 */
117 117
118 static inline bool css_tryget(struct cgroup_subsys_state *css) 118 static inline bool css_tryget(struct cgroup_subsys_state *css)
119 { 119 {
120 if (test_bit(CSS_ROOT, &css->flags)) 120 if (test_bit(CSS_ROOT, &css->flags))
121 return true; 121 return true;
122 while (!atomic_inc_not_zero(&css->refcnt)) { 122 while (!atomic_inc_not_zero(&css->refcnt)) {
123 if (test_bit(CSS_REMOVED, &css->flags)) 123 if (test_bit(CSS_REMOVED, &css->flags))
124 return false; 124 return false;
125 cpu_relax(); 125 cpu_relax();
126 } 126 }
127 return true; 127 return true;
128 } 128 }
129 129
130 /* 130 /*
131 * css_put() should be called to release a reference taken by 131 * css_put() should be called to release a reference taken by
132 * css_get() or css_tryget() 132 * css_get() or css_tryget()
133 */ 133 */
134 134
135 extern void __css_put(struct cgroup_subsys_state *css, int count); 135 extern void __css_put(struct cgroup_subsys_state *css, int count);
136 static inline void css_put(struct cgroup_subsys_state *css) 136 static inline void css_put(struct cgroup_subsys_state *css)
137 { 137 {
138 if (!test_bit(CSS_ROOT, &css->flags)) 138 if (!test_bit(CSS_ROOT, &css->flags))
139 __css_put(css, 1); 139 __css_put(css, 1);
140 } 140 }
141 141
142 /* bits in struct cgroup flags field */ 142 /* bits in struct cgroup flags field */
143 enum { 143 enum {
144 /* Control Group is dead */ 144 /* Control Group is dead */
145 CGRP_REMOVED, 145 CGRP_REMOVED,
146 /* 146 /*
147 * Control Group has previously had a child cgroup or a task, 147 * Control Group has previously had a child cgroup or a task,
148 * but no longer (only if CGRP_NOTIFY_ON_RELEASE is set) 148 * but no longer (only if CGRP_NOTIFY_ON_RELEASE is set)
149 */ 149 */
150 CGRP_RELEASABLE, 150 CGRP_RELEASABLE,
151 /* Control Group requires release notifications to userspace */ 151 /* Control Group requires release notifications to userspace */
152 CGRP_NOTIFY_ON_RELEASE, 152 CGRP_NOTIFY_ON_RELEASE,
153 /* 153 /*
 154 * A thread in rmdir() is waiting for this cgroup. 154 * A thread in rmdir() is waiting for this cgroup.
155 */ 155 */
156 CGRP_WAIT_ON_RMDIR, 156 CGRP_WAIT_ON_RMDIR,
157 /* 157 /*
158 * Clone cgroup values when creating a new child cgroup 158 * Clone cgroup values when creating a new child cgroup
159 */ 159 */
160 CGRP_CLONE_CHILDREN, 160 CGRP_CLONE_CHILDREN,
161 }; 161 };
162 162
163 /* which pidlist file are we talking about? */ 163 /* which pidlist file are we talking about? */
164 enum cgroup_filetype { 164 enum cgroup_filetype {
165 CGROUP_FILE_PROCS, 165 CGROUP_FILE_PROCS,
166 CGROUP_FILE_TASKS, 166 CGROUP_FILE_TASKS,
167 }; 167 };
168 168
169 /* 169 /*
170 * A pidlist is a list of pids that virtually represents the contents of one 170 * A pidlist is a list of pids that virtually represents the contents of one
171 * of the cgroup files ("procs" or "tasks"). We keep a list of such pidlists, 171 * of the cgroup files ("procs" or "tasks"). We keep a list of such pidlists,
172 * a pair (one each for procs, tasks) for each pid namespace that's relevant 172 * a pair (one each for procs, tasks) for each pid namespace that's relevant
173 * to the cgroup. 173 * to the cgroup.
174 */ 174 */
175 struct cgroup_pidlist { 175 struct cgroup_pidlist {
176 /* 176 /*
177 * used to find which pidlist is wanted. doesn't change as long as 177 * used to find which pidlist is wanted. doesn't change as long as
178 * this particular list stays in the list. 178 * this particular list stays in the list.
179 */ 179 */
180 struct { enum cgroup_filetype type; struct pid_namespace *ns; } key; 180 struct { enum cgroup_filetype type; struct pid_namespace *ns; } key;
181 /* array of xids */ 181 /* array of xids */
182 pid_t *list; 182 pid_t *list;
183 /* how many elements the above list has */ 183 /* how many elements the above list has */
184 int length; 184 int length;
185 /* how many files are using the current array */ 185 /* how many files are using the current array */
186 int use_count; 186 int use_count;
187 /* each of these stored in a list by its cgroup */ 187 /* each of these stored in a list by its cgroup */
188 struct list_head links; 188 struct list_head links;
189 /* pointer to the cgroup we belong to, for list removal purposes */ 189 /* pointer to the cgroup we belong to, for list removal purposes */
190 struct cgroup *owner; 190 struct cgroup *owner;
191 /* protects the other fields */ 191 /* protects the other fields */
192 struct rw_semaphore mutex; 192 struct rw_semaphore mutex;
193 }; 193 };
194 194
195 struct cgroup { 195 struct cgroup {
196 unsigned long flags; /* "unsigned long" so bitops work */ 196 unsigned long flags; /* "unsigned long" so bitops work */
197 197
198 /* 198 /*
199 * count users of this cgroup. >0 means busy, but doesn't 199 * count users of this cgroup. >0 means busy, but doesn't
200 * necessarily indicate the number of tasks in the cgroup 200 * necessarily indicate the number of tasks in the cgroup
201 */ 201 */
202 atomic_t count; 202 atomic_t count;
203 203
204 /* 204 /*
205 * We link our 'sibling' struct into our parent's 'children'. 205 * We link our 'sibling' struct into our parent's 'children'.
206 * Our children link their 'sibling' into our 'children'. 206 * Our children link their 'sibling' into our 'children'.
207 */ 207 */
208 struct list_head sibling; /* my parent's children */ 208 struct list_head sibling; /* my parent's children */
209 struct list_head children; /* my children */ 209 struct list_head children; /* my children */
210 210
211 struct cgroup *parent; /* my parent */ 211 struct cgroup *parent; /* my parent */
212 struct dentry __rcu *dentry; /* cgroup fs entry, RCU protected */ 212 struct dentry __rcu *dentry; /* cgroup fs entry, RCU protected */
213 213
214 /* Private pointers for each registered subsystem */ 214 /* Private pointers for each registered subsystem */
215 struct cgroup_subsys_state *subsys[CGROUP_SUBSYS_COUNT]; 215 struct cgroup_subsys_state *subsys[CGROUP_SUBSYS_COUNT];
216 216
217 struct cgroupfs_root *root; 217 struct cgroupfs_root *root;
218 struct cgroup *top_cgroup; 218 struct cgroup *top_cgroup;
219 219
220 /* 220 /*
221 * List of cg_cgroup_links pointing at css_sets with 221 * List of cg_cgroup_links pointing at css_sets with
222 * tasks in this cgroup. Protected by css_set_lock 222 * tasks in this cgroup. Protected by css_set_lock
223 */ 223 */
224 struct list_head css_sets; 224 struct list_head css_sets;
225 225
226 /* 226 /*
227 * Linked list running through all cgroups that can 227 * Linked list running through all cgroups that can
228 * potentially be reaped by the release agent. Protected by 228 * potentially be reaped by the release agent. Protected by
229 * release_list_lock 229 * release_list_lock
230 */ 230 */
231 struct list_head release_list; 231 struct list_head release_list;
232 232
233 /* 233 /*
234 * list of pidlists, up to two for each namespace (one for procs, one 234 * list of pidlists, up to two for each namespace (one for procs, one
235 * for tasks); created on demand. 235 * for tasks); created on demand.
236 */ 236 */
237 struct list_head pidlists; 237 struct list_head pidlists;
238 struct mutex pidlist_mutex; 238 struct mutex pidlist_mutex;
239 239
240 /* For RCU-protected deletion */ 240 /* For RCU-protected deletion */
241 struct rcu_head rcu_head; 241 struct rcu_head rcu_head;
242 242
243 /* List of events which userspace want to receive */ 243 /* List of events which userspace want to receive */
244 struct list_head event_list; 244 struct list_head event_list;
245 spinlock_t event_list_lock; 245 spinlock_t event_list_lock;
246 }; 246 };
247 247
248 /* 248 /*
249 * A css_set is a structure holding pointers to a set of 249 * A css_set is a structure holding pointers to a set of
250 * cgroup_subsys_state objects. This saves space in the task struct 250 * cgroup_subsys_state objects. This saves space in the task struct
251 * object and speeds up fork()/exit(), since a single inc/dec and a 251 * object and speeds up fork()/exit(), since a single inc/dec and a
252 * list_add()/del() can bump the reference count on the entire cgroup 252 * list_add()/del() can bump the reference count on the entire cgroup
253 * set for a task. 253 * set for a task.
254 */ 254 */
255 255
256 struct css_set { 256 struct css_set {
257 257
258 /* Reference count */ 258 /* Reference count */
259 atomic_t refcount; 259 atomic_t refcount;
260 260
261 /* 261 /*
262 * List running through all cgroup groups in the same hash 262 * List running through all cgroup groups in the same hash
263 * slot. Protected by css_set_lock 263 * slot. Protected by css_set_lock
264 */ 264 */
265 struct hlist_node hlist; 265 struct hlist_node hlist;
266 266
267 /* 267 /*
268 * List running through all tasks using this cgroup 268 * List running through all tasks using this cgroup
269 * group. Protected by css_set_lock 269 * group. Protected by css_set_lock
270 */ 270 */
271 struct list_head tasks; 271 struct list_head tasks;
272 272
273 /* 273 /*
274 * List of cg_cgroup_link objects on link chains from 274 * List of cg_cgroup_link objects on link chains from
275 * cgroups referenced from this css_set. Protected by 275 * cgroups referenced from this css_set. Protected by
276 * css_set_lock 276 * css_set_lock
277 */ 277 */
278 struct list_head cg_links; 278 struct list_head cg_links;
279 279
280 /* 280 /*
281 * Set of subsystem states, one for each subsystem. This array 281 * Set of subsystem states, one for each subsystem. This array
282 * is immutable after creation apart from the init_css_set 282 * is immutable after creation apart from the init_css_set
283 * during subsystem registration (at boot time) and modular subsystem 283 * during subsystem registration (at boot time) and modular subsystem
284 * loading/unloading. 284 * loading/unloading.
285 */ 285 */
286 struct cgroup_subsys_state *subsys[CGROUP_SUBSYS_COUNT]; 286 struct cgroup_subsys_state *subsys[CGROUP_SUBSYS_COUNT];
287 287
288 /* For RCU-protected deletion */ 288 /* For RCU-protected deletion */
289 struct rcu_head rcu_head; 289 struct rcu_head rcu_head;
290 }; 290 };
291 291
292 /* 292 /*
293 * cgroup_map_cb is an abstract callback API for reporting map-valued 293 * cgroup_map_cb is an abstract callback API for reporting map-valued
294 * control files 294 * control files
295 */ 295 */
296 296
297 struct cgroup_map_cb { 297 struct cgroup_map_cb {
298 int (*fill)(struct cgroup_map_cb *cb, const char *key, u64 value); 298 int (*fill)(struct cgroup_map_cb *cb, const char *key, u64 value);
299 void *state; 299 void *state;
300 }; 300 };
301 301
302 /* 302 /*
303 * struct cftype: handler definitions for cgroup control files 303 * struct cftype: handler definitions for cgroup control files
304 * 304 *
305 * When reading/writing to a file: 305 * When reading/writing to a file:
306 * - the cgroup to use is file->f_dentry->d_parent->d_fsdata 306 * - the cgroup to use is file->f_dentry->d_parent->d_fsdata
307 * - the 'cftype' of the file is file->f_dentry->d_fsdata 307 * - the 'cftype' of the file is file->f_dentry->d_fsdata
308 */ 308 */
309 309
310 #define MAX_CFTYPE_NAME 64 310 #define MAX_CFTYPE_NAME 64
311 struct cftype { 311 struct cftype {
312 /* 312 /*
313 * By convention, the name should begin with the name of the 313 * By convention, the name should begin with the name of the
314 * subsystem, followed by a period 314 * subsystem, followed by a period
315 */ 315 */
316 char name[MAX_CFTYPE_NAME]; 316 char name[MAX_CFTYPE_NAME];
317 int private; 317 int private;
318 /* 318 /*
319 * If not 0, file mode is set to this value, otherwise it will 319 * If not 0, file mode is set to this value, otherwise it will
320 * be figured out automatically 320 * be figured out automatically
321 */ 321 */
322 mode_t mode; 322 mode_t mode;
323 323
324 /* 324 /*
325 * If non-zero, defines the maximum length of string that can 325 * If non-zero, defines the maximum length of string that can
326 * be passed to write_string; defaults to 64 326 * be passed to write_string; defaults to 64
327 */ 327 */
328 size_t max_write_len; 328 size_t max_write_len;
329 329
330 int (*open)(struct inode *inode, struct file *file); 330 int (*open)(struct inode *inode, struct file *file);
331 ssize_t (*read)(struct cgroup *cgrp, struct cftype *cft, 331 ssize_t (*read)(struct cgroup *cgrp, struct cftype *cft,
332 struct file *file, 332 struct file *file,
333 char __user *buf, size_t nbytes, loff_t *ppos); 333 char __user *buf, size_t nbytes, loff_t *ppos);
334 /* 334 /*
335 * read_u64() is a shortcut for the common case of returning a 335 * read_u64() is a shortcut for the common case of returning a
336 * single integer. Use it in place of read() 336 * single integer. Use it in place of read()
337 */ 337 */
338 u64 (*read_u64)(struct cgroup *cgrp, struct cftype *cft); 338 u64 (*read_u64)(struct cgroup *cgrp, struct cftype *cft);
339 /* 339 /*
340 * read_s64() is a signed version of read_u64() 340 * read_s64() is a signed version of read_u64()
341 */ 341 */
342 s64 (*read_s64)(struct cgroup *cgrp, struct cftype *cft); 342 s64 (*read_s64)(struct cgroup *cgrp, struct cftype *cft);
343 /* 343 /*
344 * read_map() is used for defining a map of key/value 344 * read_map() is used for defining a map of key/value
345 * pairs. It should call cb->fill(cb, key, value) for each 345 * pairs. It should call cb->fill(cb, key, value) for each
346 * entry. The key/value pairs (and their ordering) should not 346 * entry. The key/value pairs (and their ordering) should not
347 * change between reboots. 347 * change between reboots.
348 */ 348 */
349 int (*read_map)(struct cgroup *cont, struct cftype *cft, 349 int (*read_map)(struct cgroup *cont, struct cftype *cft,
350 struct cgroup_map_cb *cb); 350 struct cgroup_map_cb *cb);
351 /* 351 /*
352 * read_seq_string() is used for outputting a simple sequence 352 * read_seq_string() is used for outputting a simple sequence
353 * using seqfile. 353 * using seqfile.
354 */ 354 */
355 int (*read_seq_string)(struct cgroup *cont, struct cftype *cft, 355 int (*read_seq_string)(struct cgroup *cont, struct cftype *cft,
356 struct seq_file *m); 356 struct seq_file *m);
357 357
358 ssize_t (*write)(struct cgroup *cgrp, struct cftype *cft, 358 ssize_t (*write)(struct cgroup *cgrp, struct cftype *cft,
359 struct file *file, 359 struct file *file,
360 const char __user *buf, size_t nbytes, loff_t *ppos); 360 const char __user *buf, size_t nbytes, loff_t *ppos);
361 361
362 /* 362 /*
363 * write_u64() is a shortcut for the common case of accepting 363 * write_u64() is a shortcut for the common case of accepting
364 * a single integer (as parsed by simple_strtoull) from 364 * a single integer (as parsed by simple_strtoull) from
365 * userspace. Use in place of write(); return 0 or error. 365 * userspace. Use in place of write(); return 0 or error.
366 */ 366 */
367 int (*write_u64)(struct cgroup *cgrp, struct cftype *cft, u64 val); 367 int (*write_u64)(struct cgroup *cgrp, struct cftype *cft, u64 val);
368 /* 368 /*
369 * write_s64() is a signed version of write_u64() 369 * write_s64() is a signed version of write_u64()
370 */ 370 */
371 int (*write_s64)(struct cgroup *cgrp, struct cftype *cft, s64 val); 371 int (*write_s64)(struct cgroup *cgrp, struct cftype *cft, s64 val);
372 372
373 /* 373 /*
374 * write_string() is passed a nul-terminated kernelspace 374 * write_string() is passed a nul-terminated kernelspace
375 * buffer of maximum length determined by max_write_len. 375 * buffer of maximum length determined by max_write_len.
376 * Returns 0 or -ve error code. 376 * Returns 0 or -ve error code.
377 */ 377 */
378 int (*write_string)(struct cgroup *cgrp, struct cftype *cft, 378 int (*write_string)(struct cgroup *cgrp, struct cftype *cft,
379 const char *buffer); 379 const char *buffer);
380 /* 380 /*
381 * trigger() callback can be used to get some kick from the 381 * trigger() callback can be used to get some kick from the
382 * userspace, when the actual string written is not important 382 * userspace, when the actual string written is not important
383 * at all. The private field can be used to determine the 383 * at all. The private field can be used to determine the
384 * kick type for multiplexing. 384 * kick type for multiplexing.
385 */ 385 */
386 int (*trigger)(struct cgroup *cgrp, unsigned int event); 386 int (*trigger)(struct cgroup *cgrp, unsigned int event);
387 387
388 int (*release)(struct inode *inode, struct file *file); 388 int (*release)(struct inode *inode, struct file *file);
389 389
390 /* 390 /*
391 * register_event() callback will be used to add new userspace 391 * register_event() callback will be used to add new userspace
392 * waiter for changes related to the cftype. Implement it if 392 * waiter for changes related to the cftype. Implement it if
393 * you want to provide this functionality. Use eventfd_signal() 393 * you want to provide this functionality. Use eventfd_signal()
394 * on eventfd to send notification to userspace. 394 * on eventfd to send notification to userspace.
395 */ 395 */
396 int (*register_event)(struct cgroup *cgrp, struct cftype *cft, 396 int (*register_event)(struct cgroup *cgrp, struct cftype *cft,
397 struct eventfd_ctx *eventfd, const char *args); 397 struct eventfd_ctx *eventfd, const char *args);
398 /* 398 /*
399 * unregister_event() callback will be called when userspace 399 * unregister_event() callback will be called when userspace
400 * closes the eventfd or on cgroup removing. 400 * closes the eventfd or on cgroup removing.
 401 * This callback must be implemented if you want to provide 401 * This callback must be implemented if you want to provide
402 * notification functionality. 402 * notification functionality.
403 */ 403 */
404 void (*unregister_event)(struct cgroup *cgrp, struct cftype *cft, 404 void (*unregister_event)(struct cgroup *cgrp, struct cftype *cft,
405 struct eventfd_ctx *eventfd); 405 struct eventfd_ctx *eventfd);
406 }; 406 };
407 407
408 struct cgroup_scanner { 408 struct cgroup_scanner {
409 struct cgroup *cg; 409 struct cgroup *cg;
410 int (*test_task)(struct task_struct *p, struct cgroup_scanner *scan); 410 int (*test_task)(struct task_struct *p, struct cgroup_scanner *scan);
411 void (*process_task)(struct task_struct *p, 411 void (*process_task)(struct task_struct *p,
412 struct cgroup_scanner *scan); 412 struct cgroup_scanner *scan);
413 struct ptr_heap *heap; 413 struct ptr_heap *heap;
414 void *data; 414 void *data;
415 }; 415 };
416 416
417 /* 417 /*
418 * Add a new file to the given cgroup directory. Should only be 418 * Add a new file to the given cgroup directory. Should only be
419 * called by subsystems from within a populate() method 419 * called by subsystems from within a populate() method
420 */ 420 */
421 int cgroup_add_file(struct cgroup *cgrp, struct cgroup_subsys *subsys, 421 int cgroup_add_file(struct cgroup *cgrp, struct cgroup_subsys *subsys,
422 const struct cftype *cft); 422 const struct cftype *cft);
423 423
424 /* 424 /*
425 * Add a set of new files to the given cgroup directory. Should 425 * Add a set of new files to the given cgroup directory. Should
426 * only be called by subsystems from within a populate() method 426 * only be called by subsystems from within a populate() method
427 */ 427 */
428 int cgroup_add_files(struct cgroup *cgrp, 428 int cgroup_add_files(struct cgroup *cgrp,
429 struct cgroup_subsys *subsys, 429 struct cgroup_subsys *subsys,
430 const struct cftype cft[], 430 const struct cftype cft[],
431 int count); 431 int count);
432 432
433 int cgroup_is_removed(const struct cgroup *cgrp); 433 int cgroup_is_removed(const struct cgroup *cgrp);
434 434
435 int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen); 435 int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen);
436 436
437 int cgroup_task_count(const struct cgroup *cgrp); 437 int cgroup_task_count(const struct cgroup *cgrp);
438 438
439 /* Return true if cgrp is a descendant of the task's cgroup */ 439 /* Return true if cgrp is a descendant of the task's cgroup */
440 int cgroup_is_descendant(const struct cgroup *cgrp, struct task_struct *task); 440 int cgroup_is_descendant(const struct cgroup *cgrp, struct task_struct *task);
441 441
442 /* 442 /*
443 * When the subsys has to access css and may add permanent refcnt to css, 443 * When the subsys has to access css and may add permanent refcnt to css,
444 * it should take care of racy conditions with rmdir(). Following set of 444 * it should take care of racy conditions with rmdir(). Following set of
445 * functions, is for stop/restart rmdir if necessary. 445 * functions, is for stop/restart rmdir if necessary.
446 * Because these will call css_get/put, "css" should be alive css. 446 * Because these will call css_get/put, "css" should be alive css.
447 * 447 *
448 * cgroup_exclude_rmdir(); 448 * cgroup_exclude_rmdir();
449 * ...do some jobs which may access arbitrary empty cgroup 449 * ...do some jobs which may access arbitrary empty cgroup
450 * cgroup_release_and_wakeup_rmdir(); 450 * cgroup_release_and_wakeup_rmdir();
451 * 451 *
452 * When someone removes a cgroup while cgroup_exclude_rmdir() holds it, 452 * When someone removes a cgroup while cgroup_exclude_rmdir() holds it,
453 * it sleeps and cgroup_release_and_wakeup_rmdir() will wake him up. 453 * it sleeps and cgroup_release_and_wakeup_rmdir() will wake him up.
454 */ 454 */
455 455
456 void cgroup_exclude_rmdir(struct cgroup_subsys_state *css); 456 void cgroup_exclude_rmdir(struct cgroup_subsys_state *css);
457 void cgroup_release_and_wakeup_rmdir(struct cgroup_subsys_state *css); 457 void cgroup_release_and_wakeup_rmdir(struct cgroup_subsys_state *css);
458 458
459 /* 459 /*
460 * Control Group subsystem type. 460 * Control Group subsystem type.
461 * See Documentation/cgroups/cgroups.txt for details 461 * See Documentation/cgroups/cgroups.txt for details
462 */ 462 */
463 463
464 struct cgroup_subsys { 464 struct cgroup_subsys {
465 struct cgroup_subsys_state *(*create)(struct cgroup_subsys *ss, 465 struct cgroup_subsys_state *(*create)(struct cgroup_subsys *ss,
466 struct cgroup *cgrp); 466 struct cgroup *cgrp);
467 int (*pre_destroy)(struct cgroup_subsys *ss, struct cgroup *cgrp); 467 int (*pre_destroy)(struct cgroup_subsys *ss, struct cgroup *cgrp);
468 void (*destroy)(struct cgroup_subsys *ss, struct cgroup *cgrp); 468 void (*destroy)(struct cgroup_subsys *ss, struct cgroup *cgrp);
469 int (*can_attach)(struct cgroup_subsys *ss, struct cgroup *cgrp, 469 int (*can_attach)(struct cgroup_subsys *ss, struct cgroup *cgrp,
470 struct task_struct *tsk); 470 struct task_struct *tsk);
471 int (*can_attach_task)(struct cgroup *cgrp, struct task_struct *tsk); 471 int (*can_attach_task)(struct cgroup *cgrp, struct task_struct *tsk);
472 void (*cancel_attach)(struct cgroup_subsys *ss, struct cgroup *cgrp, 472 void (*cancel_attach)(struct cgroup_subsys *ss, struct cgroup *cgrp,
473 struct task_struct *tsk); 473 struct task_struct *tsk);
474 void (*pre_attach)(struct cgroup *cgrp); 474 void (*pre_attach)(struct cgroup *cgrp);
475 void (*attach_task)(struct cgroup *cgrp, struct task_struct *tsk); 475 void (*attach_task)(struct cgroup *cgrp, struct task_struct *tsk);
476 void (*attach)(struct cgroup_subsys *ss, struct cgroup *cgrp, 476 void (*attach)(struct cgroup_subsys *ss, struct cgroup *cgrp,
477 struct cgroup *old_cgrp, struct task_struct *tsk); 477 struct cgroup *old_cgrp, struct task_struct *tsk);
478 void (*fork)(struct cgroup_subsys *ss, struct task_struct *task); 478 void (*fork)(struct cgroup_subsys *ss, struct task_struct *task);
479 void (*exit)(struct cgroup_subsys *ss, struct cgroup *cgrp, 479 void (*exit)(struct cgroup_subsys *ss, struct cgroup *cgrp,
480 struct cgroup *old_cgrp, struct task_struct *task); 480 struct cgroup *old_cgrp, struct task_struct *task);
481 int (*populate)(struct cgroup_subsys *ss, 481 int (*populate)(struct cgroup_subsys *ss,
482 struct cgroup *cgrp); 482 struct cgroup *cgrp);
483 void (*post_clone)(struct cgroup_subsys *ss, struct cgroup *cgrp); 483 void (*post_clone)(struct cgroup_subsys *ss, struct cgroup *cgrp);
484 void (*bind)(struct cgroup_subsys *ss, struct cgroup *root); 484 void (*bind)(struct cgroup_subsys *ss, struct cgroup *root);
485 485
486 int subsys_id; 486 int subsys_id;
487 int active; 487 int active;
488 int disabled; 488 int disabled;
489 int early_init; 489 int early_init;
490 /* 490 /*
491 * True if this subsys uses ID. ID is not available before cgroup_init() 491 * True if this subsys uses ID. ID is not available before cgroup_init()
492 * (not available in early_init time.) 492 * (not available in early_init time.)
493 */ 493 */
494 bool use_id; 494 bool use_id;
495 #define MAX_CGROUP_TYPE_NAMELEN 32 495 #define MAX_CGROUP_TYPE_NAMELEN 32
496 const char *name; 496 const char *name;
497 497
498 /* 498 /*
499 * Protects sibling/children links of cgroups in this 499 * Protects sibling/children links of cgroups in this
500 * hierarchy, plus protects which hierarchy (or none) the 500 * hierarchy, plus protects which hierarchy (or none) the
501 * subsystem is a part of (i.e. root/sibling). To avoid 501 * subsystem is a part of (i.e. root/sibling). To avoid
502 * potential deadlocks, the following operations should not be 502 * potential deadlocks, the following operations should not be
503 * undertaken while holding any hierarchy_mutex: 503 * undertaken while holding any hierarchy_mutex:
504 * 504 *
505 * - allocating memory 505 * - allocating memory
506 * - initiating hotplug events 506 * - initiating hotplug events
507 */ 507 */
508 struct mutex hierarchy_mutex; 508 struct mutex hierarchy_mutex;
509 struct lock_class_key subsys_key; 509 struct lock_class_key subsys_key;
510 510
511 /* 511 /*
512 * Link to parent, and list entry in parent's children. 512 * Link to parent, and list entry in parent's children.
513 * Protected by this->hierarchy_mutex and cgroup_lock() 513 * Protected by this->hierarchy_mutex and cgroup_lock()
514 */ 514 */
515 struct cgroupfs_root *root; 515 struct cgroupfs_root *root;
516 struct list_head sibling; 516 struct list_head sibling;
517 /* used when use_id == true */ 517 /* used when use_id == true */
518 struct idr idr; 518 struct idr idr;
519 spinlock_t id_lock; 519 spinlock_t id_lock;
520 520
521 /* should be defined only by modular subsystems */ 521 /* should be defined only by modular subsystems */
522 struct module *module; 522 struct module *module;
523 }; 523 };
524 524
525 #define SUBSYS(_x) extern struct cgroup_subsys _x ## _subsys; 525 #define SUBSYS(_x) extern struct cgroup_subsys _x ## _subsys;
526 #include <linux/cgroup_subsys.h> 526 #include <linux/cgroup_subsys.h>
527 #undef SUBSYS 527 #undef SUBSYS
528 528
529 static inline struct cgroup_subsys_state *cgroup_subsys_state( 529 static inline struct cgroup_subsys_state *cgroup_subsys_state(
530 struct cgroup *cgrp, int subsys_id) 530 struct cgroup *cgrp, int subsys_id)
531 { 531 {
532 return cgrp->subsys[subsys_id]; 532 return cgrp->subsys[subsys_id];
533 } 533 }
534 534
535 /* 535 /*
536 * function to get the cgroup_subsys_state which allows for extra 536 * function to get the cgroup_subsys_state which allows for extra
537 * rcu_dereference_check() conditions, such as locks used during the 537 * rcu_dereference_check() conditions, such as locks used during the
538 * cgroup_subsys::attach() methods. 538 * cgroup_subsys::attach() methods.
539 */ 539 */
540 #define task_subsys_state_check(task, subsys_id, __c) \ 540 #define task_subsys_state_check(task, subsys_id, __c) \
541 rcu_dereference_check(task->cgroups->subsys[subsys_id], \ 541 rcu_dereference_check(task->cgroups->subsys[subsys_id], \
542 rcu_read_lock_held() || \
543 lockdep_is_held(&task->alloc_lock) || \ 542 lockdep_is_held(&task->alloc_lock) || \
544 cgroup_lock_is_held() || (__c)) 543 cgroup_lock_is_held() || (__c))
545 544
546 static inline struct cgroup_subsys_state * 545 static inline struct cgroup_subsys_state *
547 task_subsys_state(struct task_struct *task, int subsys_id) 546 task_subsys_state(struct task_struct *task, int subsys_id)
548 { 547 {
549 return task_subsys_state_check(task, subsys_id, false); 548 return task_subsys_state_check(task, subsys_id, false);
550 } 549 }
551 550
552 static inline struct cgroup* task_cgroup(struct task_struct *task, 551 static inline struct cgroup* task_cgroup(struct task_struct *task,
553 int subsys_id) 552 int subsys_id)
554 { 553 {
555 return task_subsys_state(task, subsys_id)->cgroup; 554 return task_subsys_state(task, subsys_id)->cgroup;
556 } 555 }
557 556
558 /* A cgroup_iter should be treated as an opaque object */ 557 /* A cgroup_iter should be treated as an opaque object */
559 struct cgroup_iter { 558 struct cgroup_iter {
560 struct list_head *cg_link; 559 struct list_head *cg_link;
561 struct list_head *task; 560 struct list_head *task;
562 }; 561 };
563 562
564 /* 563 /*
565 * To iterate across the tasks in a cgroup: 564 * To iterate across the tasks in a cgroup:
566 * 565 *
567 * 1) call cgroup_iter_start to initialize an iterator 566 * 1) call cgroup_iter_start to initialize an iterator
568 * 567 *
569 * 2) call cgroup_iter_next() to retrieve member tasks until it 568 * 2) call cgroup_iter_next() to retrieve member tasks until it
570 * returns NULL or until you want to end the iteration 569 * returns NULL or until you want to end the iteration
571 * 570 *
572 * 3) call cgroup_iter_end() to destroy the iterator. 571 * 3) call cgroup_iter_end() to destroy the iterator.
573 * 572 *
574 * Or, call cgroup_scan_tasks() to iterate through every task in a 573 * Or, call cgroup_scan_tasks() to iterate through every task in a
575 * cgroup - cgroup_scan_tasks() holds the css_set_lock when calling 574 * cgroup - cgroup_scan_tasks() holds the css_set_lock when calling
576 * the test_task() callback, but not while calling the process_task() 575 * the test_task() callback, but not while calling the process_task()
577 * callback. 576 * callback.
578 */ 577 */
579 void cgroup_iter_start(struct cgroup *cgrp, struct cgroup_iter *it); 578 void cgroup_iter_start(struct cgroup *cgrp, struct cgroup_iter *it);
580 struct task_struct *cgroup_iter_next(struct cgroup *cgrp, 579 struct task_struct *cgroup_iter_next(struct cgroup *cgrp,
581 struct cgroup_iter *it); 580 struct cgroup_iter *it);
582 void cgroup_iter_end(struct cgroup *cgrp, struct cgroup_iter *it); 581 void cgroup_iter_end(struct cgroup *cgrp, struct cgroup_iter *it);
583 int cgroup_scan_tasks(struct cgroup_scanner *scan); 582 int cgroup_scan_tasks(struct cgroup_scanner *scan);
584 int cgroup_attach_task(struct cgroup *, struct task_struct *); 583 int cgroup_attach_task(struct cgroup *, struct task_struct *);
585 int cgroup_attach_task_all(struct task_struct *from, struct task_struct *); 584 int cgroup_attach_task_all(struct task_struct *from, struct task_struct *);
586 585
587 static inline int cgroup_attach_task_current_cg(struct task_struct *tsk) 586 static inline int cgroup_attach_task_current_cg(struct task_struct *tsk)
588 { 587 {
589 return cgroup_attach_task_all(current, tsk); 588 return cgroup_attach_task_all(current, tsk);
590 } 589 }
591 590
592 /* 591 /*
593 * CSS ID is ID for cgroup_subsys_state structs under subsys. This only works 592 * CSS ID is ID for cgroup_subsys_state structs under subsys. This only works
594 * if cgroup_subsys.use_id == true. It can be used for looking up and scanning. 593 * if cgroup_subsys.use_id == true. It can be used for looking up and scanning.
595 * CSS ID is assigned at cgroup allocation (create) automatically 594 * CSS ID is assigned at cgroup allocation (create) automatically
596 * and removed when subsys calls free_css_id() function. This is because 595 * and removed when subsys calls free_css_id() function. This is because
597 * the lifetime of cgroup_subsys_state is subsys's matter. 596 * the lifetime of cgroup_subsys_state is subsys's matter.
598 * 597 *
599 * Looking up and scanning function should be called under rcu_read_lock(). 598 * Looking up and scanning function should be called under rcu_read_lock().
600 * Taking cgroup_mutex()/hierarchy_mutex() is not necessary for following calls. 599 * Taking cgroup_mutex()/hierarchy_mutex() is not necessary for following calls.
601 * But the css returned by this routine can be "not populated yet" or "being 600 * But the css returned by this routine can be "not populated yet" or "being
602 * destroyed". The caller should check css and cgroup's status. 601 * destroyed". The caller should check css and cgroup's status.
603 */ 602 */
604 603
605 /* 604 /*
606 * Typically Called at ->destroy(), or somewhere the subsys frees 605 * Typically Called at ->destroy(), or somewhere the subsys frees
607 * cgroup_subsys_state. 606 * cgroup_subsys_state.
608 */ 607 */
609 void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css); 608 void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css);
610 609
611 /* Find a cgroup_subsys_state which has given ID */ 610 /* Find a cgroup_subsys_state which has given ID */
612 611
613 struct cgroup_subsys_state *css_lookup(struct cgroup_subsys *ss, int id); 612 struct cgroup_subsys_state *css_lookup(struct cgroup_subsys *ss, int id);
614 613
615 /* 614 /*
616 * Get a cgroup whose id is greater than or equal to id under tree of root. 615 * Get a cgroup whose id is greater than or equal to id under tree of root.
617 * Returning a cgroup_subsys_state or NULL. 616 * Returning a cgroup_subsys_state or NULL.
618 */ 617 */
619 struct cgroup_subsys_state *css_get_next(struct cgroup_subsys *ss, int id, 618 struct cgroup_subsys_state *css_get_next(struct cgroup_subsys *ss, int id,
620 struct cgroup_subsys_state *root, int *foundid); 619 struct cgroup_subsys_state *root, int *foundid);
621 620
622 /* Returns true if root is ancestor of cg */ 621 /* Returns true if root is ancestor of cg */
623 bool css_is_ancestor(struct cgroup_subsys_state *cg, 622 bool css_is_ancestor(struct cgroup_subsys_state *cg,
624 const struct cgroup_subsys_state *root); 623 const struct cgroup_subsys_state *root);
625 624
626 /* Get id and depth of css */ 625 /* Get id and depth of css */
627 unsigned short css_id(struct cgroup_subsys_state *css); 626 unsigned short css_id(struct cgroup_subsys_state *css);
628 unsigned short css_depth(struct cgroup_subsys_state *css); 627 unsigned short css_depth(struct cgroup_subsys_state *css);
629 struct cgroup_subsys_state *cgroup_css_from_dir(struct file *f, int id); 628 struct cgroup_subsys_state *cgroup_css_from_dir(struct file *f, int id);
630 629
631 #else /* !CONFIG_CGROUPS */ 630 #else /* !CONFIG_CGROUPS */
632 631
633 static inline int cgroup_init_early(void) { return 0; } 632 static inline int cgroup_init_early(void) { return 0; }
634 static inline int cgroup_init(void) { return 0; } 633 static inline int cgroup_init(void) { return 0; }
635 static inline void cgroup_fork(struct task_struct *p) {} 634 static inline void cgroup_fork(struct task_struct *p) {}
636 static inline void cgroup_fork_callbacks(struct task_struct *p) {} 635 static inline void cgroup_fork_callbacks(struct task_struct *p) {}
637 static inline void cgroup_post_fork(struct task_struct *p) {} 636 static inline void cgroup_post_fork(struct task_struct *p) {}
638 static inline void cgroup_exit(struct task_struct *p, int callbacks) {} 637 static inline void cgroup_exit(struct task_struct *p, int callbacks) {}
639 638
640 static inline void cgroup_lock(void) {} 639 static inline void cgroup_lock(void) {}
641 static inline void cgroup_unlock(void) {} 640 static inline void cgroup_unlock(void) {}
642 static inline int cgroupstats_build(struct cgroupstats *stats, 641 static inline int cgroupstats_build(struct cgroupstats *stats,
643 struct dentry *dentry) 642 struct dentry *dentry)
644 { 643 {
645 return -EINVAL; 644 return -EINVAL;
646 } 645 }
647 646
648 /* No cgroups - nothing to do */ 647 /* No cgroups - nothing to do */
649 static inline int cgroup_attach_task_all(struct task_struct *from, 648 static inline int cgroup_attach_task_all(struct task_struct *from,
650 struct task_struct *t) 649 struct task_struct *t)
651 { 650 {
652 return 0; 651 return 0;
653 } 652 }
654 static inline int cgroup_attach_task_current_cg(struct task_struct *t) 653 static inline int cgroup_attach_task_current_cg(struct task_struct *t)
655 { 654 {
656 return 0; 655 return 0;
657 } 656 }
658 657
659 #endif /* !CONFIG_CGROUPS */ 658 #endif /* !CONFIG_CGROUPS */
660 659
661 #endif /* _LINUX_CGROUP_H */ 660 #endif /* _LINUX_CGROUP_H */
662 661
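
With rcu_read_lock_held() implied, a subsystem that wants an extra lockdep
condition for task_subsys_state_check() only has to name its own lock. A
hypothetical caller is sketched below; my_subsys_id and my_subsys_mutex are
invented for illustration and are not part of this commit.

#include <linux/cgroup.h>
#include <linux/lockdep.h>
#include <linux/mutex.h>

#define my_subsys_id 0	/* placeholder for a real cgroup_subsys_id value */

static DEFINE_MUTEX(my_subsys_mutex);

static struct cgroup_subsys_state *my_subsys_css(struct task_struct *task)
{
	/*
	 * The dereference is valid under rcu_read_lock(), task->alloc_lock,
	 * cgroup_lock(), or my_subsys_mutex; rcu_read_lock_held() no longer
	 * has to be spelled out here.
	 */
	return task_subsys_state_check(task, my_subsys_id,
				       lockdep_is_held(&my_subsys_mutex));
}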
include/linux/cred.h
1 /* Credentials management - see Documentation/security/credentials.txt 1 /* Credentials management - see Documentation/security/credentials.txt
2 * 2 *
3 * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved. 3 * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com) 4 * Written by David Howells (dhowells@redhat.com)
5 * 5 *
6 * This program is free software; you can redistribute it and/or 6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public Licence 7 * modify it under the terms of the GNU General Public Licence
8 * as published by the Free Software Foundation; either version 8 * as published by the Free Software Foundation; either version
9 * 2 of the Licence, or (at your option) any later version. 9 * 2 of the Licence, or (at your option) any later version.
10 */ 10 */
11 11
12 #ifndef _LINUX_CRED_H 12 #ifndef _LINUX_CRED_H
13 #define _LINUX_CRED_H 13 #define _LINUX_CRED_H
14 14
15 #include <linux/capability.h> 15 #include <linux/capability.h>
16 #include <linux/init.h> 16 #include <linux/init.h>
17 #include <linux/key.h> 17 #include <linux/key.h>
18 #include <linux/selinux.h> 18 #include <linux/selinux.h>
19 #include <asm/atomic.h> 19 #include <asm/atomic.h>
20 20
21 struct user_struct; 21 struct user_struct;
22 struct cred; 22 struct cred;
23 struct inode; 23 struct inode;
24 24
25 /* 25 /*
26 * COW Supplementary groups list 26 * COW Supplementary groups list
27 */ 27 */
28 #define NGROUPS_SMALL 32 28 #define NGROUPS_SMALL 32
29 #define NGROUPS_PER_BLOCK ((unsigned int)(PAGE_SIZE / sizeof(gid_t))) 29 #define NGROUPS_PER_BLOCK ((unsigned int)(PAGE_SIZE / sizeof(gid_t)))
30 30
31 struct group_info { 31 struct group_info {
32 atomic_t usage; 32 atomic_t usage;
33 int ngroups; 33 int ngroups;
34 int nblocks; 34 int nblocks;
35 gid_t small_block[NGROUPS_SMALL]; 35 gid_t small_block[NGROUPS_SMALL];
36 gid_t *blocks[0]; 36 gid_t *blocks[0];
37 }; 37 };
38 38
39 /** 39 /**
40 * get_group_info - Get a reference to a group info structure 40 * get_group_info - Get a reference to a group info structure
41 * @group_info: The group info to reference 41 * @group_info: The group info to reference
42 * 42 *
43 * This gets a reference to a set of supplementary groups. 43 * This gets a reference to a set of supplementary groups.
44 * 44 *
45 * If the caller is accessing a task's credentials, they must hold the RCU read 45 * If the caller is accessing a task's credentials, they must hold the RCU read
46 * lock when reading. 46 * lock when reading.
47 */ 47 */
48 static inline struct group_info *get_group_info(struct group_info *gi) 48 static inline struct group_info *get_group_info(struct group_info *gi)
49 { 49 {
50 atomic_inc(&gi->usage); 50 atomic_inc(&gi->usage);
51 return gi; 51 return gi;
52 } 52 }
53 53
54 /** 54 /**
55 * put_group_info - Release a reference to a group info structure 55 * put_group_info - Release a reference to a group info structure
56 * @group_info: The group info to release 56 * @group_info: The group info to release
57 */ 57 */
58 #define put_group_info(group_info) \ 58 #define put_group_info(group_info) \
59 do { \ 59 do { \
60 if (atomic_dec_and_test(&(group_info)->usage)) \ 60 if (atomic_dec_and_test(&(group_info)->usage)) \
61 groups_free(group_info); \ 61 groups_free(group_info); \
62 } while (0) 62 } while (0)
63 63
64 extern struct group_info *groups_alloc(int); 64 extern struct group_info *groups_alloc(int);
65 extern struct group_info init_groups; 65 extern struct group_info init_groups;
66 extern void groups_free(struct group_info *); 66 extern void groups_free(struct group_info *);
67 extern int set_current_groups(struct group_info *); 67 extern int set_current_groups(struct group_info *);
68 extern int set_groups(struct cred *, struct group_info *); 68 extern int set_groups(struct cred *, struct group_info *);
69 extern int groups_search(const struct group_info *, gid_t); 69 extern int groups_search(const struct group_info *, gid_t);
70 70
71 /* access the groups "array" with this macro */ 71 /* access the groups "array" with this macro */
72 #define GROUP_AT(gi, i) \ 72 #define GROUP_AT(gi, i) \
73 ((gi)->blocks[(i) / NGROUPS_PER_BLOCK][(i) % NGROUPS_PER_BLOCK]) 73 ((gi)->blocks[(i) / NGROUPS_PER_BLOCK][(i) % NGROUPS_PER_BLOCK])
74 74
75 extern int in_group_p(gid_t); 75 extern int in_group_p(gid_t);
76 extern int in_egroup_p(gid_t); 76 extern int in_egroup_p(gid_t);
77 77
78 /* 78 /*
79 * The common credentials for a thread group 79 * The common credentials for a thread group
80 * - shared by CLONE_THREAD 80 * - shared by CLONE_THREAD
81 */ 81 */
82 #ifdef CONFIG_KEYS 82 #ifdef CONFIG_KEYS
83 struct thread_group_cred { 83 struct thread_group_cred {
84 atomic_t usage; 84 atomic_t usage;
85 pid_t tgid; /* thread group process ID */ 85 pid_t tgid; /* thread group process ID */
86 spinlock_t lock; 86 spinlock_t lock;
87 struct key __rcu *session_keyring; /* keyring inherited over fork */ 87 struct key __rcu *session_keyring; /* keyring inherited over fork */
88 struct key *process_keyring; /* keyring private to this process */ 88 struct key *process_keyring; /* keyring private to this process */
89 struct rcu_head rcu; /* RCU deletion hook */ 89 struct rcu_head rcu; /* RCU deletion hook */
90 }; 90 };
91 #endif 91 #endif
92 92
93 /* 93 /*
94 * The security context of a task 94 * The security context of a task
95 * 95 *
96 * The parts of the context break down into two categories: 96 * The parts of the context break down into two categories:
97 * 97 *
98 * (1) The objective context of a task. These parts are used when some other 98 * (1) The objective context of a task. These parts are used when some other
99 * task is attempting to affect this one. 99 * task is attempting to affect this one.
100 * 100 *
101 * (2) The subjective context. These details are used when the task is acting 101 * (2) The subjective context. These details are used when the task is acting
102 * upon another object, be that a file, a task, a key or whatever. 102 * upon another object, be that a file, a task, a key or whatever.
103 * 103 *
104 * Note that some members of this structure belong to both categories - the 104 * Note that some members of this structure belong to both categories - the
105 * LSM security pointer for instance. 105 * LSM security pointer for instance.
106 * 106 *
107 * A task has two security pointers. task->real_cred points to the objective 107 * A task has two security pointers. task->real_cred points to the objective
108 * context that defines that task's actual details. The objective part of this 108 * context that defines that task's actual details. The objective part of this
109 * context is used whenever that task is acted upon. 109 * context is used whenever that task is acted upon.
110 * 110 *
111 * task->cred points to the subjective context that defines the details of how 111 * task->cred points to the subjective context that defines the details of how
112 * that task is going to act upon another object. This may be overridden 112 * that task is going to act upon another object. This may be overridden
113 * temporarily to point to another security context, but normally points to the 113 * temporarily to point to another security context, but normally points to the
114 * same context as task->real_cred. 114 * same context as task->real_cred.
115 */ 115 */
116 struct cred { 116 struct cred {
117 atomic_t usage; 117 atomic_t usage;
118 #ifdef CONFIG_DEBUG_CREDENTIALS 118 #ifdef CONFIG_DEBUG_CREDENTIALS
119 atomic_t subscribers; /* number of processes subscribed */ 119 atomic_t subscribers; /* number of processes subscribed */
120 void *put_addr; 120 void *put_addr;
121 unsigned magic; 121 unsigned magic;
122 #define CRED_MAGIC 0x43736564 122 #define CRED_MAGIC 0x43736564
123 #define CRED_MAGIC_DEAD 0x44656144 123 #define CRED_MAGIC_DEAD 0x44656144
124 #endif 124 #endif
125 uid_t uid; /* real UID of the task */ 125 uid_t uid; /* real UID of the task */
126 gid_t gid; /* real GID of the task */ 126 gid_t gid; /* real GID of the task */
127 uid_t suid; /* saved UID of the task */ 127 uid_t suid; /* saved UID of the task */
128 gid_t sgid; /* saved GID of the task */ 128 gid_t sgid; /* saved GID of the task */
129 uid_t euid; /* effective UID of the task */ 129 uid_t euid; /* effective UID of the task */
130 gid_t egid; /* effective GID of the task */ 130 gid_t egid; /* effective GID of the task */
131 uid_t fsuid; /* UID for VFS ops */ 131 uid_t fsuid; /* UID for VFS ops */
132 gid_t fsgid; /* GID for VFS ops */ 132 gid_t fsgid; /* GID for VFS ops */
133 unsigned securebits; /* SUID-less security management */ 133 unsigned securebits; /* SUID-less security management */
134 kernel_cap_t cap_inheritable; /* caps our children can inherit */ 134 kernel_cap_t cap_inheritable; /* caps our children can inherit */
135 kernel_cap_t cap_permitted; /* caps we're permitted */ 135 kernel_cap_t cap_permitted; /* caps we're permitted */
136 kernel_cap_t cap_effective; /* caps we can actually use */ 136 kernel_cap_t cap_effective; /* caps we can actually use */
137 kernel_cap_t cap_bset; /* capability bounding set */ 137 kernel_cap_t cap_bset; /* capability bounding set */
138 #ifdef CONFIG_KEYS 138 #ifdef CONFIG_KEYS
139 unsigned char jit_keyring; /* default keyring to attach requested 139 unsigned char jit_keyring; /* default keyring to attach requested
140 * keys to */ 140 * keys to */
141 struct key *thread_keyring; /* keyring private to this thread */ 141 struct key *thread_keyring; /* keyring private to this thread */
142 struct key *request_key_auth; /* assumed request_key authority */ 142 struct key *request_key_auth; /* assumed request_key authority */
143 struct thread_group_cred *tgcred; /* thread-group shared credentials */ 143 struct thread_group_cred *tgcred; /* thread-group shared credentials */
144 #endif 144 #endif
145 #ifdef CONFIG_SECURITY 145 #ifdef CONFIG_SECURITY
146 void *security; /* subjective LSM security */ 146 void *security; /* subjective LSM security */
147 #endif 147 #endif
148 struct user_struct *user; /* real user ID subscription */ 148 struct user_struct *user; /* real user ID subscription */
149 struct user_namespace *user_ns; /* cached user->user_ns */ 149 struct user_namespace *user_ns; /* cached user->user_ns */
150 struct group_info *group_info; /* supplementary groups for euid/fsgid */ 150 struct group_info *group_info; /* supplementary groups for euid/fsgid */
151 struct rcu_head rcu; /* RCU deletion hook */ 151 struct rcu_head rcu; /* RCU deletion hook */
152 }; 152 };
153 153
154 extern void __put_cred(struct cred *); 154 extern void __put_cred(struct cred *);
155 extern void exit_creds(struct task_struct *); 155 extern void exit_creds(struct task_struct *);
156 extern int copy_creds(struct task_struct *, unsigned long); 156 extern int copy_creds(struct task_struct *, unsigned long);
157 extern const struct cred *get_task_cred(struct task_struct *); 157 extern const struct cred *get_task_cred(struct task_struct *);
158 extern struct cred *cred_alloc_blank(void); 158 extern struct cred *cred_alloc_blank(void);
159 extern struct cred *prepare_creds(void); 159 extern struct cred *prepare_creds(void);
160 extern struct cred *prepare_exec_creds(void); 160 extern struct cred *prepare_exec_creds(void);
161 extern int commit_creds(struct cred *); 161 extern int commit_creds(struct cred *);
162 extern void abort_creds(struct cred *); 162 extern void abort_creds(struct cred *);
163 extern const struct cred *override_creds(const struct cred *); 163 extern const struct cred *override_creds(const struct cred *);
164 extern void revert_creds(const struct cred *); 164 extern void revert_creds(const struct cred *);
165 extern struct cred *prepare_kernel_cred(struct task_struct *); 165 extern struct cred *prepare_kernel_cred(struct task_struct *);
166 extern int change_create_files_as(struct cred *, struct inode *); 166 extern int change_create_files_as(struct cred *, struct inode *);
167 extern int set_security_override(struct cred *, u32); 167 extern int set_security_override(struct cred *, u32);
168 extern int set_security_override_from_ctx(struct cred *, const char *); 168 extern int set_security_override_from_ctx(struct cred *, const char *);
169 extern int set_create_files_as(struct cred *, struct inode *); 169 extern int set_create_files_as(struct cred *, struct inode *);
170 extern void __init cred_init(void); 170 extern void __init cred_init(void);
171 171
172 /* 172 /*
173 * check for validity of credentials 173 * check for validity of credentials
174 */ 174 */
175 #ifdef CONFIG_DEBUG_CREDENTIALS 175 #ifdef CONFIG_DEBUG_CREDENTIALS
176 extern void __invalid_creds(const struct cred *, const char *, unsigned); 176 extern void __invalid_creds(const struct cred *, const char *, unsigned);
177 extern void __validate_process_creds(struct task_struct *, 177 extern void __validate_process_creds(struct task_struct *,
178 const char *, unsigned); 178 const char *, unsigned);
179 179
180 extern bool creds_are_invalid(const struct cred *cred); 180 extern bool creds_are_invalid(const struct cred *cred);
181 181
182 static inline void __validate_creds(const struct cred *cred, 182 static inline void __validate_creds(const struct cred *cred,
183 const char *file, unsigned line) 183 const char *file, unsigned line)
184 { 184 {
185 if (unlikely(creds_are_invalid(cred))) 185 if (unlikely(creds_are_invalid(cred)))
186 __invalid_creds(cred, file, line); 186 __invalid_creds(cred, file, line);
187 } 187 }
188 188
189 #define validate_creds(cred) \ 189 #define validate_creds(cred) \
190 do { \ 190 do { \
191 __validate_creds((cred), __FILE__, __LINE__); \ 191 __validate_creds((cred), __FILE__, __LINE__); \
192 } while(0) 192 } while(0)
193 193
194 #define validate_process_creds() \ 194 #define validate_process_creds() \
195 do { \ 195 do { \
196 __validate_process_creds(current, __FILE__, __LINE__); \ 196 __validate_process_creds(current, __FILE__, __LINE__); \
197 } while(0) 197 } while(0)
198 198
199 extern void validate_creds_for_do_exit(struct task_struct *); 199 extern void validate_creds_for_do_exit(struct task_struct *);
200 #else 200 #else
201 static inline void validate_creds(const struct cred *cred) 201 static inline void validate_creds(const struct cred *cred)
202 { 202 {
203 } 203 }
204 static inline void validate_creds_for_do_exit(struct task_struct *tsk) 204 static inline void validate_creds_for_do_exit(struct task_struct *tsk)
205 { 205 {
206 } 206 }
207 static inline void validate_process_creds(void) 207 static inline void validate_process_creds(void)
208 { 208 {
209 } 209 }
210 #endif 210 #endif
211 211
212 /** 212 /**
213 * get_new_cred - Get a reference on a new set of credentials 213 * get_new_cred - Get a reference on a new set of credentials
214 * @cred: The new credentials to reference 214 * @cred: The new credentials to reference
215 * 215 *
216 * Get a reference on the specified set of new credentials. The caller must 216 * Get a reference on the specified set of new credentials. The caller must
217 * release the reference. 217 * release the reference.
218 */ 218 */
219 static inline struct cred *get_new_cred(struct cred *cred) 219 static inline struct cred *get_new_cred(struct cred *cred)
220 { 220 {
221 atomic_inc(&cred->usage); 221 atomic_inc(&cred->usage);
222 return cred; 222 return cred;
223 } 223 }
224 224
225 /** 225 /**
226 * get_cred - Get a reference on a set of credentials 226 * get_cred - Get a reference on a set of credentials
227 * @cred: The credentials to reference 227 * @cred: The credentials to reference
228 * 228 *
229 * Get a reference on the specified set of credentials. The caller must 229 * Get a reference on the specified set of credentials. The caller must
230 * release the reference. 230 * release the reference.
231 * 231 *
232 * This is used to deal with a committed set of credentials. Although the 232 * This is used to deal with a committed set of credentials. Although the
233 * pointer is const, this will temporarily discard the const and increment the 233 * pointer is const, this will temporarily discard the const and increment the
234 * usage count. The purpose of this is to attempt to catch at compile time the 234 * usage count. The purpose of this is to attempt to catch at compile time the
235 * accidental alteration of a set of credentials that should be considered 235 * accidental alteration of a set of credentials that should be considered
236 * immutable. 236 * immutable.
237 */ 237 */
238 static inline const struct cred *get_cred(const struct cred *cred) 238 static inline const struct cred *get_cred(const struct cred *cred)
239 { 239 {
240 struct cred *nonconst_cred = (struct cred *) cred; 240 struct cred *nonconst_cred = (struct cred *) cred;
241 validate_creds(cred); 241 validate_creds(cred);
242 return get_new_cred(nonconst_cred); 242 return get_new_cred(nonconst_cred);
243 } 243 }
244 244
245 /** 245 /**
246 * put_cred - Release a reference to a set of credentials 246 * put_cred - Release a reference to a set of credentials
247 * @cred: The credentials to release 247 * @cred: The credentials to release
248 * 248 *
249 * Release a reference to a set of credentials, deleting them when the last ref 249 * Release a reference to a set of credentials, deleting them when the last ref
250 * is released. 250 * is released.
251 * 251 *
252 * This takes a const pointer to a set of credentials because the credentials 252 * This takes a const pointer to a set of credentials because the credentials
253 * on task_struct are attached by const pointers to prevent accidental 253 * on task_struct are attached by const pointers to prevent accidental
254 * alteration of otherwise immutable credential sets. 254 * alteration of otherwise immutable credential sets.
255 */ 255 */
256 static inline void put_cred(const struct cred *_cred) 256 static inline void put_cred(const struct cred *_cred)
257 { 257 {
258 struct cred *cred = (struct cred *) _cred; 258 struct cred *cred = (struct cred *) _cred;
259 259
260 validate_creds(cred); 260 validate_creds(cred);
261 if (atomic_dec_and_test(&(cred)->usage)) 261 if (atomic_dec_and_test(&(cred)->usage))
262 __put_cred(cred); 262 __put_cred(cred);
263 } 263 }
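
Editorial note, not part of this diff: a minimal sketch of the reference pattern the get_cred()/put_cred() helpers above imply — pin a credential set while it is held outside any lock, drop it when done. The helper names stash_creds()/drop_stashed_creds() and the stashed_cred pointer are illustrative only.

#include <linux/cred.h>
#include <linux/sched.h>

/* Illustrative only: pin the current task's credentials for later use. */
static const struct cred *stashed_cred;

static void stash_creds(void)
{
	/* get_cred() bumps ->usage, so the cred set cannot be freed under us. */
	stashed_cred = get_cred(current_cred());
}

static void drop_stashed_creds(void)
{
	/* put_cred() drops the reference; the final put calls __put_cred(). */
	put_cred(stashed_cred);
	stashed_cred = NULL;
}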
264 264
265 /** 265 /**
266 * current_cred - Access the current task's subjective credentials 266 * current_cred - Access the current task's subjective credentials
267 * 267 *
268 * Access the subjective credentials of the current task. 268 * Access the subjective credentials of the current task.
269 */ 269 */
270 #define current_cred() \ 270 #define current_cred() \
271 (current->cred) 271 (current->cred)
272 272
273 /** 273 /**
274 * __task_cred - Access a task's objective credentials 274 * __task_cred - Access a task's objective credentials
275 * @task: The task to query 275 * @task: The task to query
276 * 276 *
277 * Access the objective credentials of a task. The caller must hold the RCU 277 * Access the objective credentials of a task. The caller must hold the RCU
278 * read lock or the task must be dead and unable to change its own credentials. 278 * read lock or the task must be dead and unable to change its own credentials.
279 * 279 *
280 * The result of this function should not be passed directly to get_cred(); 280 * The result of this function should not be passed directly to get_cred();
281 * rather get_task_cred() should be used instead. 281 * rather get_task_cred() should be used instead.
282 */ 282 */
283 #define __task_cred(task) \ 283 #define __task_cred(task) \
284 ({ \ 284 ({ \
285 const struct task_struct *__t = (task); \ 285 const struct task_struct *__t = (task); \
286 rcu_dereference_check(__t->real_cred, \ 286 rcu_dereference_check(__t->real_cred, \
287 rcu_read_lock_held() || \
288 task_is_dead(__t)); \ 287 task_is_dead(__t)); \
289 }) 288 })
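
Editorial note, not part of this commit: a minimal sketch of how __task_cred() is normally reached — the caller brackets the dereference with rcu_read_lock()/rcu_read_unlock(), which is exactly the condition rcu_dereference_check() now verifies implicitly. The helper name task_uid_of() is hypothetical; the task_cred_xxx()/task_uid() macros further down expand to essentially the same pattern.

#include <linux/cred.h>
#include <linux/rcupdate.h>
#include <linux/sched.h>

/* Hypothetical helper: read a task's real uid under the RCU read lock. */
static uid_t task_uid_of(struct task_struct *task)
{
	uid_t uid;

	rcu_read_lock();
	/* Safe: we hold the RCU read lock, so the cred set cannot be freed. */
	uid = __task_cred(task)->uid;
	rcu_read_unlock();

	return uid;
}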
290 289
291 /** 290 /**
292 * get_current_cred - Get the current task's subjective credentials 291 * get_current_cred - Get the current task's subjective credentials
293 * 292 *
294 * Get the subjective credentials of the current task, pinning them so that 293 * Get the subjective credentials of the current task, pinning them so that
295 * they can't go away. Accessing the current task's credentials directly is 294 * they can't go away. Accessing the current task's credentials directly is
296 * not permitted. 295 * not permitted.
297 */ 296 */
298 #define get_current_cred() \ 297 #define get_current_cred() \
299 (get_cred(current_cred())) 298 (get_cred(current_cred()))
300 299
301 /** 300 /**
302 * get_current_user - Get the current task's user_struct 301 * get_current_user - Get the current task's user_struct
303 * 302 *
304 * Get the user record of the current task, pinning it so that it can't go 303 * Get the user record of the current task, pinning it so that it can't go
305 * away. 304 * away.
306 */ 305 */
307 #define get_current_user() \ 306 #define get_current_user() \
308 ({ \ 307 ({ \
309 struct user_struct *__u; \ 308 struct user_struct *__u; \
310 struct cred *__cred; \ 309 struct cred *__cred; \
311 __cred = (struct cred *) current_cred(); \ 310 __cred = (struct cred *) current_cred(); \
312 __u = get_uid(__cred->user); \ 311 __u = get_uid(__cred->user); \
313 __u; \ 312 __u; \
314 }) 313 })
315 314
316 /** 315 /**
317 * get_current_groups - Get the current task's supplementary group list 316 * get_current_groups - Get the current task's supplementary group list
318 * 317 *
319 * Get the supplementary group list of the current task, pinning it so that it 318 * Get the supplementary group list of the current task, pinning it so that it
320 * can't go away. 319 * can't go away.
321 */ 320 */
322 #define get_current_groups() \ 321 #define get_current_groups() \
323 ({ \ 322 ({ \
324 struct group_info *__groups; \ 323 struct group_info *__groups; \
325 struct cred *__cred; \ 324 struct cred *__cred; \
326 __cred = (struct cred *) current_cred(); \ 325 __cred = (struct cred *) current_cred(); \
327 __groups = get_group_info(__cred->group_info); \ 326 __groups = get_group_info(__cred->group_info); \
328 __groups; \ 327 __groups; \
329 }) 328 })
330 329
331 #define task_cred_xxx(task, xxx) \ 330 #define task_cred_xxx(task, xxx) \
332 ({ \ 331 ({ \
333 __typeof__(((struct cred *)NULL)->xxx) ___val; \ 332 __typeof__(((struct cred *)NULL)->xxx) ___val; \
334 rcu_read_lock(); \ 333 rcu_read_lock(); \
335 ___val = __task_cred((task))->xxx; \ 334 ___val = __task_cred((task))->xxx; \
336 rcu_read_unlock(); \ 335 rcu_read_unlock(); \
337 ___val; \ 336 ___val; \
338 }) 337 })
339 338
340 #define task_uid(task) (task_cred_xxx((task), uid)) 339 #define task_uid(task) (task_cred_xxx((task), uid))
341 #define task_euid(task) (task_cred_xxx((task), euid)) 340 #define task_euid(task) (task_cred_xxx((task), euid))
342 341
343 #define current_cred_xxx(xxx) \ 342 #define current_cred_xxx(xxx) \
344 ({ \ 343 ({ \
345 current->cred->xxx; \ 344 current->cred->xxx; \
346 }) 345 })
347 346
348 #define current_uid() (current_cred_xxx(uid)) 347 #define current_uid() (current_cred_xxx(uid))
349 #define current_gid() (current_cred_xxx(gid)) 348 #define current_gid() (current_cred_xxx(gid))
350 #define current_euid() (current_cred_xxx(euid)) 349 #define current_euid() (current_cred_xxx(euid))
351 #define current_egid() (current_cred_xxx(egid)) 350 #define current_egid() (current_cred_xxx(egid))
352 #define current_suid() (current_cred_xxx(suid)) 351 #define current_suid() (current_cred_xxx(suid))
353 #define current_sgid() (current_cred_xxx(sgid)) 352 #define current_sgid() (current_cred_xxx(sgid))
354 #define current_fsuid() (current_cred_xxx(fsuid)) 353 #define current_fsuid() (current_cred_xxx(fsuid))
355 #define current_fsgid() (current_cred_xxx(fsgid)) 354 #define current_fsgid() (current_cred_xxx(fsgid))
356 #define current_cap() (current_cred_xxx(cap_effective)) 355 #define current_cap() (current_cred_xxx(cap_effective))
357 #define current_user() (current_cred_xxx(user)) 356 #define current_user() (current_cred_xxx(user))
358 #define current_security() (current_cred_xxx(security)) 357 #define current_security() (current_cred_xxx(security))
359 358
360 #ifdef CONFIG_USER_NS 359 #ifdef CONFIG_USER_NS
361 #define current_user_ns() (current_cred_xxx(user_ns)) 360 #define current_user_ns() (current_cred_xxx(user_ns))
362 #else 361 #else
363 extern struct user_namespace init_user_ns; 362 extern struct user_namespace init_user_ns;
364 #define current_user_ns() (&init_user_ns) 363 #define current_user_ns() (&init_user_ns)
365 #endif 364 #endif
366 365
367 366
368 #define current_uid_gid(_uid, _gid) \ 367 #define current_uid_gid(_uid, _gid) \
369 do { \ 368 do { \
370 const struct cred *__cred; \ 369 const struct cred *__cred; \
371 __cred = current_cred(); \ 370 __cred = current_cred(); \
372 *(_uid) = __cred->uid; \ 371 *(_uid) = __cred->uid; \
373 *(_gid) = __cred->gid; \ 372 *(_gid) = __cred->gid; \
374 } while(0) 373 } while(0)
375 374
376 #define current_euid_egid(_euid, _egid) \ 375 #define current_euid_egid(_euid, _egid) \
377 do { \ 376 do { \
378 const struct cred *__cred; \ 377 const struct cred *__cred; \
379 __cred = current_cred(); \ 378 __cred = current_cred(); \
380 *(_euid) = __cred->euid; \ 379 *(_euid) = __cred->euid; \
381 *(_egid) = __cred->egid; \ 380 *(_egid) = __cred->egid; \
382 } while(0) 381 } while(0)
383 382
384 #define current_fsuid_fsgid(_fsuid, _fsgid) \ 383 #define current_fsuid_fsgid(_fsuid, _fsgid) \
385 do { \ 384 do { \
386 const struct cred *__cred; \ 385 const struct cred *__cred; \
387 __cred = current_cred(); \ 386 __cred = current_cred(); \
388 *(_fsuid) = __cred->fsuid; \ 387 *(_fsuid) = __cred->fsuid; \
389 *(_fsgid) = __cred->fsgid; \ 388 *(_fsgid) = __cred->fsgid; \
390 } while(0) 389 } while(0)
391 390
392 #endif /* _LINUX_CRED_H */ 391 #endif /* _LINUX_CRED_H */
393 392
include/linux/fdtable.h
1 /* 1 /*
2 * descriptor table internals; you almost certainly want file.h instead. 2 * descriptor table internals; you almost certainly want file.h instead.
3 */ 3 */
4 4
5 #ifndef __LINUX_FDTABLE_H 5 #ifndef __LINUX_FDTABLE_H
6 #define __LINUX_FDTABLE_H 6 #define __LINUX_FDTABLE_H
7 7
8 #include <linux/posix_types.h> 8 #include <linux/posix_types.h>
9 #include <linux/compiler.h> 9 #include <linux/compiler.h>
10 #include <linux/spinlock.h> 10 #include <linux/spinlock.h>
11 #include <linux/rcupdate.h> 11 #include <linux/rcupdate.h>
12 #include <linux/types.h> 12 #include <linux/types.h>
13 #include <linux/init.h> 13 #include <linux/init.h>
14 #include <linux/fs.h> 14 #include <linux/fs.h>
15 15
16 #include <asm/atomic.h> 16 #include <asm/atomic.h>
17 17
18 /* 18 /*
19 * The default fd array needs to be at least BITS_PER_LONG, 19 * The default fd array needs to be at least BITS_PER_LONG,
20 * as this is the granularity returned by copy_fdset(). 20 * as this is the granularity returned by copy_fdset().
21 */ 21 */
22 #define NR_OPEN_DEFAULT BITS_PER_LONG 22 #define NR_OPEN_DEFAULT BITS_PER_LONG
23 23
24 /* 24 /*
25 * The embedded_fd_set is a small fd_set, 25 * The embedded_fd_set is a small fd_set,
26 * suitable for most tasks (which open <= BITS_PER_LONG files) 26 * suitable for most tasks (which open <= BITS_PER_LONG files)
27 */ 27 */
28 struct embedded_fd_set { 28 struct embedded_fd_set {
29 unsigned long fds_bits[1]; 29 unsigned long fds_bits[1];
30 }; 30 };
31 31
32 struct fdtable { 32 struct fdtable {
33 unsigned int max_fds; 33 unsigned int max_fds;
34 struct file __rcu **fd; /* current fd array */ 34 struct file __rcu **fd; /* current fd array */
35 fd_set *close_on_exec; 35 fd_set *close_on_exec;
36 fd_set *open_fds; 36 fd_set *open_fds;
37 struct rcu_head rcu; 37 struct rcu_head rcu;
38 struct fdtable *next; 38 struct fdtable *next;
39 }; 39 };
40 40
41 /* 41 /*
42 * Open file table structure 42 * Open file table structure
43 */ 43 */
44 struct files_struct { 44 struct files_struct {
45 /* 45 /*
46 * read mostly part 46 * read mostly part
47 */ 47 */
48 atomic_t count; 48 atomic_t count;
49 struct fdtable __rcu *fdt; 49 struct fdtable __rcu *fdt;
50 struct fdtable fdtab; 50 struct fdtable fdtab;
51 /* 51 /*
52 * written part on a separate cache line in SMP 52 * written part on a separate cache line in SMP
53 */ 53 */
54 spinlock_t file_lock ____cacheline_aligned_in_smp; 54 spinlock_t file_lock ____cacheline_aligned_in_smp;
55 int next_fd; 55 int next_fd;
56 struct embedded_fd_set close_on_exec_init; 56 struct embedded_fd_set close_on_exec_init;
57 struct embedded_fd_set open_fds_init; 57 struct embedded_fd_set open_fds_init;
58 struct file __rcu * fd_array[NR_OPEN_DEFAULT]; 58 struct file __rcu * fd_array[NR_OPEN_DEFAULT];
59 }; 59 };
60 60
61 #define rcu_dereference_check_fdtable(files, fdtfd) \ 61 #define rcu_dereference_check_fdtable(files, fdtfd) \
62 (rcu_dereference_check((fdtfd), \ 62 (rcu_dereference_check((fdtfd), \
63 rcu_read_lock_held() || \
64 lockdep_is_held(&(files)->file_lock) || \ 63 lockdep_is_held(&(files)->file_lock) || \
65 atomic_read(&(files)->count) == 1 || \ 64 atomic_read(&(files)->count) == 1 || \
66 rcu_my_thread_group_empty())) 65 rcu_my_thread_group_empty()))
67 66
68 #define files_fdtable(files) \ 67 #define files_fdtable(files) \
69 (rcu_dereference_check_fdtable((files), (files)->fdt)) 68 (rcu_dereference_check_fdtable((files), (files)->fdt))
70 69
71 struct file_operations; 70 struct file_operations;
72 struct vfsmount; 71 struct vfsmount;
73 struct dentry; 72 struct dentry;
74 73
75 extern int expand_files(struct files_struct *, int nr); 74 extern int expand_files(struct files_struct *, int nr);
76 extern void free_fdtable_rcu(struct rcu_head *rcu); 75 extern void free_fdtable_rcu(struct rcu_head *rcu);
77 extern void __init files_defer_init(void); 76 extern void __init files_defer_init(void);
78 77
79 static inline void free_fdtable(struct fdtable *fdt) 78 static inline void free_fdtable(struct fdtable *fdt)
80 { 79 {
81 call_rcu(&fdt->rcu, free_fdtable_rcu); 80 call_rcu(&fdt->rcu, free_fdtable_rcu);
82 } 81 }
83 82
84 static inline struct file * fcheck_files(struct files_struct *files, unsigned int fd) 83 static inline struct file * fcheck_files(struct files_struct *files, unsigned int fd)
85 { 84 {
86 struct file * file = NULL; 85 struct file * file = NULL;
87 struct fdtable *fdt = files_fdtable(files); 86 struct fdtable *fdt = files_fdtable(files);
88 87
89 if (fd < fdt->max_fds) 88 if (fd < fdt->max_fds)
90 file = rcu_dereference_check_fdtable(files, fdt->fd[fd]); 89 file = rcu_dereference_check_fdtable(files, fdt->fd[fd]);
91 return file; 90 return file;
92 } 91 }
93 92
94 /* 93 /*
95 * Check whether the specified fd has an open file. 94 * Check whether the specified fd has an open file.
96 */ 95 */
97 #define fcheck(fd) fcheck_files(current->files, fd) 96 #define fcheck(fd) fcheck_files(current->files, fd)
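
Editorial note, not part of this diff: a minimal sketch of looking up a struct file by descriptor with fcheck(). The lookup must sit inside an RCU read-side critical section (or be covered by files->file_lock), matching the conditions rcu_dereference_check_fdtable() accepts. The helper name peek_file_flags() is hypothetical.

#include <linux/fdtable.h>
#include <linux/fs.h>
#include <linux/rcupdate.h>
#include <linux/sched.h>

/* Hypothetical helper: return f_flags of an open fd, or -EBADF. */
static int peek_file_flags(unsigned int fd)
{
	struct file *file;
	int flags = -EBADF;

	rcu_read_lock();
	file = fcheck(fd);		/* RCU-protected lookup in current->files */
	if (file)
		flags = file->f_flags;	/* valid only inside rcu_read_lock() */
	rcu_read_unlock();

	return flags;
}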
98 97
99 struct task_struct; 98 struct task_struct;
100 99
101 struct files_struct *get_files_struct(struct task_struct *); 100 struct files_struct *get_files_struct(struct task_struct *);
102 void put_files_struct(struct files_struct *fs); 101 void put_files_struct(struct files_struct *fs);
103 void reset_files_struct(struct files_struct *); 102 void reset_files_struct(struct files_struct *);
104 int unshare_files(struct files_struct **); 103 int unshare_files(struct files_struct **);
105 struct files_struct *dup_fd(struct files_struct *, int *); 104 struct files_struct *dup_fd(struct files_struct *, int *);
106 105
107 extern struct kmem_cache *files_cachep; 106 extern struct kmem_cache *files_cachep;
108 107
109 #endif /* __LINUX_FDTABLE_H */ 108 #endif /* __LINUX_FDTABLE_H */
110 109
include/linux/rtnetlink.h
1 #ifndef __LINUX_RTNETLINK_H 1 #ifndef __LINUX_RTNETLINK_H
2 #define __LINUX_RTNETLINK_H 2 #define __LINUX_RTNETLINK_H
3 3
4 #include <linux/types.h> 4 #include <linux/types.h>
5 #include <linux/netlink.h> 5 #include <linux/netlink.h>
6 #include <linux/if_link.h> 6 #include <linux/if_link.h>
7 #include <linux/if_addr.h> 7 #include <linux/if_addr.h>
8 #include <linux/neighbour.h> 8 #include <linux/neighbour.h>
9 9
10 /* rtnetlink families. Values up to 127 are reserved for real address 10 /* rtnetlink families. Values up to 127 are reserved for real address
11 * families, values above 128 may be used arbitrarily. 11 * families, values above 128 may be used arbitrarily.
12 */ 12 */
13 #define RTNL_FAMILY_IPMR 128 13 #define RTNL_FAMILY_IPMR 128
14 #define RTNL_FAMILY_IP6MR 129 14 #define RTNL_FAMILY_IP6MR 129
15 #define RTNL_FAMILY_MAX 129 15 #define RTNL_FAMILY_MAX 129
16 16
17 /**** 17 /****
18 * Routing/neighbour discovery messages. 18 * Routing/neighbour discovery messages.
19 ****/ 19 ****/
20 20
21 /* Types of messages */ 21 /* Types of messages */
22 22
23 enum { 23 enum {
24 RTM_BASE = 16, 24 RTM_BASE = 16,
25 #define RTM_BASE RTM_BASE 25 #define RTM_BASE RTM_BASE
26 26
27 RTM_NEWLINK = 16, 27 RTM_NEWLINK = 16,
28 #define RTM_NEWLINK RTM_NEWLINK 28 #define RTM_NEWLINK RTM_NEWLINK
29 RTM_DELLINK, 29 RTM_DELLINK,
30 #define RTM_DELLINK RTM_DELLINK 30 #define RTM_DELLINK RTM_DELLINK
31 RTM_GETLINK, 31 RTM_GETLINK,
32 #define RTM_GETLINK RTM_GETLINK 32 #define RTM_GETLINK RTM_GETLINK
33 RTM_SETLINK, 33 RTM_SETLINK,
34 #define RTM_SETLINK RTM_SETLINK 34 #define RTM_SETLINK RTM_SETLINK
35 35
36 RTM_NEWADDR = 20, 36 RTM_NEWADDR = 20,
37 #define RTM_NEWADDR RTM_NEWADDR 37 #define RTM_NEWADDR RTM_NEWADDR
38 RTM_DELADDR, 38 RTM_DELADDR,
39 #define RTM_DELADDR RTM_DELADDR 39 #define RTM_DELADDR RTM_DELADDR
40 RTM_GETADDR, 40 RTM_GETADDR,
41 #define RTM_GETADDR RTM_GETADDR 41 #define RTM_GETADDR RTM_GETADDR
42 42
43 RTM_NEWROUTE = 24, 43 RTM_NEWROUTE = 24,
44 #define RTM_NEWROUTE RTM_NEWROUTE 44 #define RTM_NEWROUTE RTM_NEWROUTE
45 RTM_DELROUTE, 45 RTM_DELROUTE,
46 #define RTM_DELROUTE RTM_DELROUTE 46 #define RTM_DELROUTE RTM_DELROUTE
47 RTM_GETROUTE, 47 RTM_GETROUTE,
48 #define RTM_GETROUTE RTM_GETROUTE 48 #define RTM_GETROUTE RTM_GETROUTE
49 49
50 RTM_NEWNEIGH = 28, 50 RTM_NEWNEIGH = 28,
51 #define RTM_NEWNEIGH RTM_NEWNEIGH 51 #define RTM_NEWNEIGH RTM_NEWNEIGH
52 RTM_DELNEIGH, 52 RTM_DELNEIGH,
53 #define RTM_DELNEIGH RTM_DELNEIGH 53 #define RTM_DELNEIGH RTM_DELNEIGH
54 RTM_GETNEIGH, 54 RTM_GETNEIGH,
55 #define RTM_GETNEIGH RTM_GETNEIGH 55 #define RTM_GETNEIGH RTM_GETNEIGH
56 56
57 RTM_NEWRULE = 32, 57 RTM_NEWRULE = 32,
58 #define RTM_NEWRULE RTM_NEWRULE 58 #define RTM_NEWRULE RTM_NEWRULE
59 RTM_DELRULE, 59 RTM_DELRULE,
60 #define RTM_DELRULE RTM_DELRULE 60 #define RTM_DELRULE RTM_DELRULE
61 RTM_GETRULE, 61 RTM_GETRULE,
62 #define RTM_GETRULE RTM_GETRULE 62 #define RTM_GETRULE RTM_GETRULE
63 63
64 RTM_NEWQDISC = 36, 64 RTM_NEWQDISC = 36,
65 #define RTM_NEWQDISC RTM_NEWQDISC 65 #define RTM_NEWQDISC RTM_NEWQDISC
66 RTM_DELQDISC, 66 RTM_DELQDISC,
67 #define RTM_DELQDISC RTM_DELQDISC 67 #define RTM_DELQDISC RTM_DELQDISC
68 RTM_GETQDISC, 68 RTM_GETQDISC,
69 #define RTM_GETQDISC RTM_GETQDISC 69 #define RTM_GETQDISC RTM_GETQDISC
70 70
71 RTM_NEWTCLASS = 40, 71 RTM_NEWTCLASS = 40,
72 #define RTM_NEWTCLASS RTM_NEWTCLASS 72 #define RTM_NEWTCLASS RTM_NEWTCLASS
73 RTM_DELTCLASS, 73 RTM_DELTCLASS,
74 #define RTM_DELTCLASS RTM_DELTCLASS 74 #define RTM_DELTCLASS RTM_DELTCLASS
75 RTM_GETTCLASS, 75 RTM_GETTCLASS,
76 #define RTM_GETTCLASS RTM_GETTCLASS 76 #define RTM_GETTCLASS RTM_GETTCLASS
77 77
78 RTM_NEWTFILTER = 44, 78 RTM_NEWTFILTER = 44,
79 #define RTM_NEWTFILTER RTM_NEWTFILTER 79 #define RTM_NEWTFILTER RTM_NEWTFILTER
80 RTM_DELTFILTER, 80 RTM_DELTFILTER,
81 #define RTM_DELTFILTER RTM_DELTFILTER 81 #define RTM_DELTFILTER RTM_DELTFILTER
82 RTM_GETTFILTER, 82 RTM_GETTFILTER,
83 #define RTM_GETTFILTER RTM_GETTFILTER 83 #define RTM_GETTFILTER RTM_GETTFILTER
84 84
85 RTM_NEWACTION = 48, 85 RTM_NEWACTION = 48,
86 #define RTM_NEWACTION RTM_NEWACTION 86 #define RTM_NEWACTION RTM_NEWACTION
87 RTM_DELACTION, 87 RTM_DELACTION,
88 #define RTM_DELACTION RTM_DELACTION 88 #define RTM_DELACTION RTM_DELACTION
89 RTM_GETACTION, 89 RTM_GETACTION,
90 #define RTM_GETACTION RTM_GETACTION 90 #define RTM_GETACTION RTM_GETACTION
91 91
92 RTM_NEWPREFIX = 52, 92 RTM_NEWPREFIX = 52,
93 #define RTM_NEWPREFIX RTM_NEWPREFIX 93 #define RTM_NEWPREFIX RTM_NEWPREFIX
94 94
95 RTM_GETMULTICAST = 58, 95 RTM_GETMULTICAST = 58,
96 #define RTM_GETMULTICAST RTM_GETMULTICAST 96 #define RTM_GETMULTICAST RTM_GETMULTICAST
97 97
98 RTM_GETANYCAST = 62, 98 RTM_GETANYCAST = 62,
99 #define RTM_GETANYCAST RTM_GETANYCAST 99 #define RTM_GETANYCAST RTM_GETANYCAST
100 100
101 RTM_NEWNEIGHTBL = 64, 101 RTM_NEWNEIGHTBL = 64,
102 #define RTM_NEWNEIGHTBL RTM_NEWNEIGHTBL 102 #define RTM_NEWNEIGHTBL RTM_NEWNEIGHTBL
103 RTM_GETNEIGHTBL = 66, 103 RTM_GETNEIGHTBL = 66,
104 #define RTM_GETNEIGHTBL RTM_GETNEIGHTBL 104 #define RTM_GETNEIGHTBL RTM_GETNEIGHTBL
105 RTM_SETNEIGHTBL, 105 RTM_SETNEIGHTBL,
106 #define RTM_SETNEIGHTBL RTM_SETNEIGHTBL 106 #define RTM_SETNEIGHTBL RTM_SETNEIGHTBL
107 107
108 RTM_NEWNDUSEROPT = 68, 108 RTM_NEWNDUSEROPT = 68,
109 #define RTM_NEWNDUSEROPT RTM_NEWNDUSEROPT 109 #define RTM_NEWNDUSEROPT RTM_NEWNDUSEROPT
110 110
111 RTM_NEWADDRLABEL = 72, 111 RTM_NEWADDRLABEL = 72,
112 #define RTM_NEWADDRLABEL RTM_NEWADDRLABEL 112 #define RTM_NEWADDRLABEL RTM_NEWADDRLABEL
113 RTM_DELADDRLABEL, 113 RTM_DELADDRLABEL,
114 #define RTM_DELADDRLABEL RTM_DELADDRLABEL 114 #define RTM_DELADDRLABEL RTM_DELADDRLABEL
115 RTM_GETADDRLABEL, 115 RTM_GETADDRLABEL,
116 #define RTM_GETADDRLABEL RTM_GETADDRLABEL 116 #define RTM_GETADDRLABEL RTM_GETADDRLABEL
117 117
118 RTM_GETDCB = 78, 118 RTM_GETDCB = 78,
119 #define RTM_GETDCB RTM_GETDCB 119 #define RTM_GETDCB RTM_GETDCB
120 RTM_SETDCB, 120 RTM_SETDCB,
121 #define RTM_SETDCB RTM_SETDCB 121 #define RTM_SETDCB RTM_SETDCB
122 122
123 __RTM_MAX, 123 __RTM_MAX,
124 #define RTM_MAX (((__RTM_MAX + 3) & ~3) - 1) 124 #define RTM_MAX (((__RTM_MAX + 3) & ~3) - 1)
125 }; 125 };
126 126
127 #define RTM_NR_MSGTYPES (RTM_MAX + 1 - RTM_BASE) 127 #define RTM_NR_MSGTYPES (RTM_MAX + 1 - RTM_BASE)
128 #define RTM_NR_FAMILIES (RTM_NR_MSGTYPES >> 2) 128 #define RTM_NR_FAMILIES (RTM_NR_MSGTYPES >> 2)
129 #define RTM_FAM(cmd) (((cmd) - RTM_BASE) >> 2) 129 #define RTM_FAM(cmd) (((cmd) - RTM_BASE) >> 2)
130 130
131 /* 131 /*
132 Generic structure for encapsulation of optional route information. 132 Generic structure for encapsulation of optional route information.
133 It is reminiscent of sockaddr, but with sa_family replaced 133 It is reminiscent of sockaddr, but with sa_family replaced
134 with attribute type. 134 with attribute type.
135 */ 135 */
136 136
137 struct rtattr { 137 struct rtattr {
138 unsigned short rta_len; 138 unsigned short rta_len;
139 unsigned short rta_type; 139 unsigned short rta_type;
140 }; 140 };
141 141
142 /* Macros to handle rtattributes */ 142 /* Macros to handle rtattributes */
143 143
144 #define RTA_ALIGNTO 4 144 #define RTA_ALIGNTO 4
145 #define RTA_ALIGN(len) ( ((len)+RTA_ALIGNTO-1) & ~(RTA_ALIGNTO-1) ) 145 #define RTA_ALIGN(len) ( ((len)+RTA_ALIGNTO-1) & ~(RTA_ALIGNTO-1) )
146 #define RTA_OK(rta,len) ((len) >= (int)sizeof(struct rtattr) && \ 146 #define RTA_OK(rta,len) ((len) >= (int)sizeof(struct rtattr) && \
147 (rta)->rta_len >= sizeof(struct rtattr) && \ 147 (rta)->rta_len >= sizeof(struct rtattr) && \
148 (rta)->rta_len <= (len)) 148 (rta)->rta_len <= (len))
149 #define RTA_NEXT(rta,attrlen) ((attrlen) -= RTA_ALIGN((rta)->rta_len), \ 149 #define RTA_NEXT(rta,attrlen) ((attrlen) -= RTA_ALIGN((rta)->rta_len), \
150 (struct rtattr*)(((char*)(rta)) + RTA_ALIGN((rta)->rta_len))) 150 (struct rtattr*)(((char*)(rta)) + RTA_ALIGN((rta)->rta_len)))
151 #define RTA_LENGTH(len) (RTA_ALIGN(sizeof(struct rtattr)) + (len)) 151 #define RTA_LENGTH(len) (RTA_ALIGN(sizeof(struct rtattr)) + (len))
152 #define RTA_SPACE(len) RTA_ALIGN(RTA_LENGTH(len)) 152 #define RTA_SPACE(len) RTA_ALIGN(RTA_LENGTH(len))
153 #define RTA_DATA(rta) ((void*)(((char*)(rta)) + RTA_LENGTH(0))) 153 #define RTA_DATA(rta) ((void*)(((char*)(rta)) + RTA_LENGTH(0)))
154 #define RTA_PAYLOAD(rta) ((int)((rta)->rta_len) - RTA_LENGTH(0)) 154 #define RTA_PAYLOAD(rta) ((int)((rta)->rta_len) - RTA_LENGTH(0))
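
Editorial note, not part of this diff: a minimal parsing sketch showing the canonical idiom for walking a buffer of route attributes with RTA_OK()/RTA_NEXT() and reading payloads via RTA_PAYLOAD(). The function dump_rtattrs() and its use of printk() are illustrative only.

#include <linux/kernel.h>
#include <linux/rtnetlink.h>

/* Illustrative only: print the type and length of each attribute in a buffer. */
static void dump_rtattrs(struct rtattr *rta, int len)
{
	/* RTA_OK() checks that a complete, sane header fits in the remaining
	 * length; RTA_NEXT() advances past the aligned attribute and shrinks
	 * 'len' as a side effect. */
	for (; RTA_OK(rta, len); rta = RTA_NEXT(rta, len))
		printk(KERN_DEBUG "rtattr type %d, payload %d bytes\n",
		       rta->rta_type, RTA_PAYLOAD(rta));
}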
155 155
156 156
157 157
158 158
159 /****************************************************************************** 159 /******************************************************************************
160 * Definitions used in routing table administration. 160 * Definitions used in routing table administration.
161 ****/ 161 ****/
162 162
163 struct rtmsg { 163 struct rtmsg {
164 unsigned char rtm_family; 164 unsigned char rtm_family;
165 unsigned char rtm_dst_len; 165 unsigned char rtm_dst_len;
166 unsigned char rtm_src_len; 166 unsigned char rtm_src_len;
167 unsigned char rtm_tos; 167 unsigned char rtm_tos;
168 168
169 unsigned char rtm_table; /* Routing table id */ 169 unsigned char rtm_table; /* Routing table id */
170 unsigned char rtm_protocol; /* Routing protocol; see below */ 170 unsigned char rtm_protocol; /* Routing protocol; see below */
171 unsigned char rtm_scope; /* See below */ 171 unsigned char rtm_scope; /* See below */
172 unsigned char rtm_type; /* See below */ 172 unsigned char rtm_type; /* See below */
173 173
174 unsigned rtm_flags; 174 unsigned rtm_flags;
175 }; 175 };
176 176
177 /* rtm_type */ 177 /* rtm_type */
178 178
179 enum { 179 enum {
180 RTN_UNSPEC, 180 RTN_UNSPEC,
181 RTN_UNICAST, /* Gateway or direct route */ 181 RTN_UNICAST, /* Gateway or direct route */
182 RTN_LOCAL, /* Accept locally */ 182 RTN_LOCAL, /* Accept locally */
183 RTN_BROADCAST, /* Accept locally as broadcast, 183 RTN_BROADCAST, /* Accept locally as broadcast,
184 send as broadcast */ 184 send as broadcast */
185 RTN_ANYCAST, /* Accept locally as broadcast, 185 RTN_ANYCAST, /* Accept locally as broadcast,
186 but send as unicast */ 186 but send as unicast */
187 RTN_MULTICAST, /* Multicast route */ 187 RTN_MULTICAST, /* Multicast route */
188 RTN_BLACKHOLE, /* Drop */ 188 RTN_BLACKHOLE, /* Drop */
189 RTN_UNREACHABLE, /* Destination is unreachable */ 189 RTN_UNREACHABLE, /* Destination is unreachable */
190 RTN_PROHIBIT, /* Administratively prohibited */ 190 RTN_PROHIBIT, /* Administratively prohibited */
191 RTN_THROW, /* Not in this table */ 191 RTN_THROW, /* Not in this table */
192 RTN_NAT, /* Translate this address */ 192 RTN_NAT, /* Translate this address */
193 RTN_XRESOLVE, /* Use external resolver */ 193 RTN_XRESOLVE, /* Use external resolver */
194 __RTN_MAX 194 __RTN_MAX
195 }; 195 };
196 196
197 #define RTN_MAX (__RTN_MAX - 1) 197 #define RTN_MAX (__RTN_MAX - 1)
198 198
199 199
200 /* rtm_protocol */ 200 /* rtm_protocol */
201 201
202 #define RTPROT_UNSPEC 0 202 #define RTPROT_UNSPEC 0
203 #define RTPROT_REDIRECT 1 /* Route installed by ICMP redirects; 203 #define RTPROT_REDIRECT 1 /* Route installed by ICMP redirects;
204 not used by current IPv4 */ 204 not used by current IPv4 */
205 #define RTPROT_KERNEL 2 /* Route installed by kernel */ 205 #define RTPROT_KERNEL 2 /* Route installed by kernel */
206 #define RTPROT_BOOT 3 /* Route installed during boot */ 206 #define RTPROT_BOOT 3 /* Route installed during boot */
207 #define RTPROT_STATIC 4 /* Route installed by administrator */ 207 #define RTPROT_STATIC 4 /* Route installed by administrator */
208 208
209 /* Values of protocol >= RTPROT_STATIC are not interpreted by kernel; 209 /* Values of protocol >= RTPROT_STATIC are not interpreted by kernel;
210 they are just passed from user and back as is. 210 they are just passed from user and back as is.
211 It will be used by hypothetical multiple routing daemons. 211 It will be used by hypothetical multiple routing daemons.
212 Note that protocol values should be standardized in order to 212 Note that protocol values should be standardized in order to
213 avoid conflicts. 213 avoid conflicts.
214 */ 214 */
215 215
216 #define RTPROT_GATED 8 /* Apparently, GateD */ 216 #define RTPROT_GATED 8 /* Apparently, GateD */
217 #define RTPROT_RA 9 /* RDISC/ND router advertisements */ 217 #define RTPROT_RA 9 /* RDISC/ND router advertisements */
218 #define RTPROT_MRT 10 /* Merit MRT */ 218 #define RTPROT_MRT 10 /* Merit MRT */
219 #define RTPROT_ZEBRA 11 /* Zebra */ 219 #define RTPROT_ZEBRA 11 /* Zebra */
220 #define RTPROT_BIRD 12 /* BIRD */ 220 #define RTPROT_BIRD 12 /* BIRD */
221 #define RTPROT_DNROUTED 13 /* DECnet routing daemon */ 221 #define RTPROT_DNROUTED 13 /* DECnet routing daemon */
222 #define RTPROT_XORP 14 /* XORP */ 222 #define RTPROT_XORP 14 /* XORP */
223 #define RTPROT_NTK 15 /* Netsukuku */ 223 #define RTPROT_NTK 15 /* Netsukuku */
224 #define RTPROT_DHCP 16 /* DHCP client */ 224 #define RTPROT_DHCP 16 /* DHCP client */
225 225
226 /* rtm_scope 226 /* rtm_scope
227 227
228 Really it is not a scope, but a sort of distance to the destination. 228 Really it is not a scope, but a sort of distance to the destination.
229 NOWHERE is reserved for non-existent destinations, HOST means our 229 NOWHERE is reserved for non-existent destinations, HOST means our
230 local addresses, LINK means destinations located on a directly attached 230 local addresses, LINK means destinations located on a directly attached
231 link, and UNIVERSE is everywhere in the Universe. 231 link, and UNIVERSE is everywhere in the Universe.
232 232
233 Intermediate values are also possible, e.g. interior routes 233 Intermediate values are also possible, e.g. interior routes
234 could be assigned a value between UNIVERSE and LINK. 234 could be assigned a value between UNIVERSE and LINK.
235 */ 235 */
236 236
237 enum rt_scope_t { 237 enum rt_scope_t {
238 RT_SCOPE_UNIVERSE=0, 238 RT_SCOPE_UNIVERSE=0,
239 /* User defined values */ 239 /* User defined values */
240 RT_SCOPE_SITE=200, 240 RT_SCOPE_SITE=200,
241 RT_SCOPE_LINK=253, 241 RT_SCOPE_LINK=253,
242 RT_SCOPE_HOST=254, 242 RT_SCOPE_HOST=254,
243 RT_SCOPE_NOWHERE=255 243 RT_SCOPE_NOWHERE=255
244 }; 244 };
245 245
246 /* rtm_flags */ 246 /* rtm_flags */
247 247
248 #define RTM_F_NOTIFY 0x100 /* Notify user of route change */ 248 #define RTM_F_NOTIFY 0x100 /* Notify user of route change */
249 #define RTM_F_CLONED 0x200 /* This route is cloned */ 249 #define RTM_F_CLONED 0x200 /* This route is cloned */
250 #define RTM_F_EQUALIZE 0x400 /* Multipath equalizer: NI */ 250 #define RTM_F_EQUALIZE 0x400 /* Multipath equalizer: NI */
251 #define RTM_F_PREFIX 0x800 /* Prefix addresses */ 251 #define RTM_F_PREFIX 0x800 /* Prefix addresses */
252 252
253 /* Reserved table identifiers */ 253 /* Reserved table identifiers */
254 254
255 enum rt_class_t { 255 enum rt_class_t {
256 RT_TABLE_UNSPEC=0, 256 RT_TABLE_UNSPEC=0,
257 /* User defined values */ 257 /* User defined values */
258 RT_TABLE_COMPAT=252, 258 RT_TABLE_COMPAT=252,
259 RT_TABLE_DEFAULT=253, 259 RT_TABLE_DEFAULT=253,
260 RT_TABLE_MAIN=254, 260 RT_TABLE_MAIN=254,
261 RT_TABLE_LOCAL=255, 261 RT_TABLE_LOCAL=255,
262 RT_TABLE_MAX=0xFFFFFFFF 262 RT_TABLE_MAX=0xFFFFFFFF
263 }; 263 };
264 264
265 265
266 /* Routing message attributes */ 266 /* Routing message attributes */
267 267
268 enum rtattr_type_t { 268 enum rtattr_type_t {
269 RTA_UNSPEC, 269 RTA_UNSPEC,
270 RTA_DST, 270 RTA_DST,
271 RTA_SRC, 271 RTA_SRC,
272 RTA_IIF, 272 RTA_IIF,
273 RTA_OIF, 273 RTA_OIF,
274 RTA_GATEWAY, 274 RTA_GATEWAY,
275 RTA_PRIORITY, 275 RTA_PRIORITY,
276 RTA_PREFSRC, 276 RTA_PREFSRC,
277 RTA_METRICS, 277 RTA_METRICS,
278 RTA_MULTIPATH, 278 RTA_MULTIPATH,
279 RTA_PROTOINFO, /* no longer used */ 279 RTA_PROTOINFO, /* no longer used */
280 RTA_FLOW, 280 RTA_FLOW,
281 RTA_CACHEINFO, 281 RTA_CACHEINFO,
282 RTA_SESSION, /* no longer used */ 282 RTA_SESSION, /* no longer used */
283 RTA_MP_ALGO, /* no longer used */ 283 RTA_MP_ALGO, /* no longer used */
284 RTA_TABLE, 284 RTA_TABLE,
285 RTA_MARK, 285 RTA_MARK,
286 __RTA_MAX 286 __RTA_MAX
287 }; 287 };
288 288
289 #define RTA_MAX (__RTA_MAX - 1) 289 #define RTA_MAX (__RTA_MAX - 1)
290 290
291 #define RTM_RTA(r) ((struct rtattr*)(((char*)(r)) + NLMSG_ALIGN(sizeof(struct rtmsg)))) 291 #define RTM_RTA(r) ((struct rtattr*)(((char*)(r)) + NLMSG_ALIGN(sizeof(struct rtmsg))))
292 #define RTM_PAYLOAD(n) NLMSG_PAYLOAD(n,sizeof(struct rtmsg)) 292 #define RTM_PAYLOAD(n) NLMSG_PAYLOAD(n,sizeof(struct rtmsg))
293 293
294 /* RTM_MULTIPATH --- array of struct rtnexthop. 294 /* RTM_MULTIPATH --- array of struct rtnexthop.
295 * 295 *
296 * "struct rtnexthop" describes all necessary nexthop information, 296 * "struct rtnexthop" describes all necessary nexthop information,
297 * i.e. parameters of path to a destination via this nexthop. 297 * i.e. parameters of path to a destination via this nexthop.
298 * 298 *
299 * At the moment it is impossible to set different prefsrc, mtu, window 299 * At the moment it is impossible to set different prefsrc, mtu, window
300 * and rtt for different paths from multipath. 300 * and rtt for different paths from multipath.
301 */ 301 */
302 302
303 struct rtnexthop { 303 struct rtnexthop {
304 unsigned short rtnh_len; 304 unsigned short rtnh_len;
305 unsigned char rtnh_flags; 305 unsigned char rtnh_flags;
306 unsigned char rtnh_hops; 306 unsigned char rtnh_hops;
307 int rtnh_ifindex; 307 int rtnh_ifindex;
308 }; 308 };
309 309
310 /* rtnh_flags */ 310 /* rtnh_flags */
311 311
312 #define RTNH_F_DEAD 1 /* Nexthop is dead (used by multipath) */ 312 #define RTNH_F_DEAD 1 /* Nexthop is dead (used by multipath) */
313 #define RTNH_F_PERVASIVE 2 /* Do recursive gateway lookup */ 313 #define RTNH_F_PERVASIVE 2 /* Do recursive gateway lookup */
314 #define RTNH_F_ONLINK 4 /* Gateway is forced on link */ 314 #define RTNH_F_ONLINK 4 /* Gateway is forced on link */
315 315
316 /* Macros to handle nexthops */ 316 /* Macros to handle nexthops */
317 317
318 #define RTNH_ALIGNTO 4 318 #define RTNH_ALIGNTO 4
319 #define RTNH_ALIGN(len) ( ((len)+RTNH_ALIGNTO-1) & ~(RTNH_ALIGNTO-1) ) 319 #define RTNH_ALIGN(len) ( ((len)+RTNH_ALIGNTO-1) & ~(RTNH_ALIGNTO-1) )
320 #define RTNH_OK(rtnh,len) ((rtnh)->rtnh_len >= sizeof(struct rtnexthop) && \ 320 #define RTNH_OK(rtnh,len) ((rtnh)->rtnh_len >= sizeof(struct rtnexthop) && \
321 ((int)(rtnh)->rtnh_len) <= (len)) 321 ((int)(rtnh)->rtnh_len) <= (len))
322 #define RTNH_NEXT(rtnh) ((struct rtnexthop*)(((char*)(rtnh)) + RTNH_ALIGN((rtnh)->rtnh_len))) 322 #define RTNH_NEXT(rtnh) ((struct rtnexthop*)(((char*)(rtnh)) + RTNH_ALIGN((rtnh)->rtnh_len)))
323 #define RTNH_LENGTH(len) (RTNH_ALIGN(sizeof(struct rtnexthop)) + (len)) 323 #define RTNH_LENGTH(len) (RTNH_ALIGN(sizeof(struct rtnexthop)) + (len))
324 #define RTNH_SPACE(len) RTNH_ALIGN(RTNH_LENGTH(len)) 324 #define RTNH_SPACE(len) RTNH_ALIGN(RTNH_LENGTH(len))
325 #define RTNH_DATA(rtnh) ((struct rtattr*)(((char*)(rtnh)) + RTNH_LENGTH(0))) 325 #define RTNH_DATA(rtnh) ((struct rtattr*)(((char*)(rtnh)) + RTNH_LENGTH(0)))
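
Editorial note, not part of this diff: a minimal sketch of iterating the nexthop array carried by an RTA_MULTIPATH attribute with RTNH_OK()/RTNH_NEXT(). Unlike RTA_NEXT(), RTNH_NEXT() does not shrink the remaining length, so the caller decrements it by hand. The helper name count_nexthops() is hypothetical.

#include <linux/rtnetlink.h>

/* Hypothetical helper: count the nexthops packed into an RTA_MULTIPATH payload. */
static int count_nexthops(struct rtattr *mp_attr)
{
	struct rtnexthop *rtnh = RTA_DATA(mp_attr);
	int remaining = RTA_PAYLOAD(mp_attr);
	int n = 0;

	while (RTNH_OK(rtnh, remaining)) {
		n++;
		/* Consume the aligned length of this nexthop, then advance. */
		remaining -= RTNH_ALIGN(rtnh->rtnh_len);
		rtnh = RTNH_NEXT(rtnh);
	}
	return n;
}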
326 326
327 /* RTM_CACHEINFO */ 327 /* RTM_CACHEINFO */
328 328
329 struct rta_cacheinfo { 329 struct rta_cacheinfo {
330 __u32 rta_clntref; 330 __u32 rta_clntref;
331 __u32 rta_lastuse; 331 __u32 rta_lastuse;
332 __s32 rta_expires; 332 __s32 rta_expires;
333 __u32 rta_error; 333 __u32 rta_error;
334 __u32 rta_used; 334 __u32 rta_used;
335 335
336 #define RTNETLINK_HAVE_PEERINFO 1 336 #define RTNETLINK_HAVE_PEERINFO 1
337 __u32 rta_id; 337 __u32 rta_id;
338 __u32 rta_ts; 338 __u32 rta_ts;
339 __u32 rta_tsage; 339 __u32 rta_tsage;
340 }; 340 };
341 341
342 /* RTM_METRICS --- array of struct rtattr with types of RTAX_* */ 342 /* RTM_METRICS --- array of struct rtattr with types of RTAX_* */
343 343
344 enum { 344 enum {
345 RTAX_UNSPEC, 345 RTAX_UNSPEC,
346 #define RTAX_UNSPEC RTAX_UNSPEC 346 #define RTAX_UNSPEC RTAX_UNSPEC
347 RTAX_LOCK, 347 RTAX_LOCK,
348 #define RTAX_LOCK RTAX_LOCK 348 #define RTAX_LOCK RTAX_LOCK
349 RTAX_MTU, 349 RTAX_MTU,
350 #define RTAX_MTU RTAX_MTU 350 #define RTAX_MTU RTAX_MTU
351 RTAX_WINDOW, 351 RTAX_WINDOW,
352 #define RTAX_WINDOW RTAX_WINDOW 352 #define RTAX_WINDOW RTAX_WINDOW
353 RTAX_RTT, 353 RTAX_RTT,
354 #define RTAX_RTT RTAX_RTT 354 #define RTAX_RTT RTAX_RTT
355 RTAX_RTTVAR, 355 RTAX_RTTVAR,
356 #define RTAX_RTTVAR RTAX_RTTVAR 356 #define RTAX_RTTVAR RTAX_RTTVAR
357 RTAX_SSTHRESH, 357 RTAX_SSTHRESH,
358 #define RTAX_SSTHRESH RTAX_SSTHRESH 358 #define RTAX_SSTHRESH RTAX_SSTHRESH
359 RTAX_CWND, 359 RTAX_CWND,
360 #define RTAX_CWND RTAX_CWND 360 #define RTAX_CWND RTAX_CWND
361 RTAX_ADVMSS, 361 RTAX_ADVMSS,
362 #define RTAX_ADVMSS RTAX_ADVMSS 362 #define RTAX_ADVMSS RTAX_ADVMSS
363 RTAX_REORDERING, 363 RTAX_REORDERING,
364 #define RTAX_REORDERING RTAX_REORDERING 364 #define RTAX_REORDERING RTAX_REORDERING
365 RTAX_HOPLIMIT, 365 RTAX_HOPLIMIT,
366 #define RTAX_HOPLIMIT RTAX_HOPLIMIT 366 #define RTAX_HOPLIMIT RTAX_HOPLIMIT
367 RTAX_INITCWND, 367 RTAX_INITCWND,
368 #define RTAX_INITCWND RTAX_INITCWND 368 #define RTAX_INITCWND RTAX_INITCWND
369 RTAX_FEATURES, 369 RTAX_FEATURES,
370 #define RTAX_FEATURES RTAX_FEATURES 370 #define RTAX_FEATURES RTAX_FEATURES
371 RTAX_RTO_MIN, 371 RTAX_RTO_MIN,
372 #define RTAX_RTO_MIN RTAX_RTO_MIN 372 #define RTAX_RTO_MIN RTAX_RTO_MIN
373 RTAX_INITRWND, 373 RTAX_INITRWND,
374 #define RTAX_INITRWND RTAX_INITRWND 374 #define RTAX_INITRWND RTAX_INITRWND
375 __RTAX_MAX 375 __RTAX_MAX
376 }; 376 };
377 377
378 #define RTAX_MAX (__RTAX_MAX - 1) 378 #define RTAX_MAX (__RTAX_MAX - 1)
379 379
380 #define RTAX_FEATURE_ECN 0x00000001 380 #define RTAX_FEATURE_ECN 0x00000001
381 #define RTAX_FEATURE_SACK 0x00000002 381 #define RTAX_FEATURE_SACK 0x00000002
382 #define RTAX_FEATURE_TIMESTAMP 0x00000004 382 #define RTAX_FEATURE_TIMESTAMP 0x00000004
383 #define RTAX_FEATURE_ALLFRAG 0x00000008 383 #define RTAX_FEATURE_ALLFRAG 0x00000008
384 384
385 struct rta_session { 385 struct rta_session {
386 __u8 proto; 386 __u8 proto;
387 __u8 pad1; 387 __u8 pad1;
388 __u16 pad2; 388 __u16 pad2;
389 389
390 union { 390 union {
391 struct { 391 struct {
392 __u16 sport; 392 __u16 sport;
393 __u16 dport; 393 __u16 dport;
394 } ports; 394 } ports;
395 395
396 struct { 396 struct {
397 __u8 type; 397 __u8 type;
398 __u8 code; 398 __u8 code;
399 __u16 ident; 399 __u16 ident;
400 } icmpt; 400 } icmpt;
401 401
402 __u32 spi; 402 __u32 spi;
403 } u; 403 } u;
404 }; 404 };
405 405
406 /**** 406 /****
407 * General form of address family dependent message. 407 * General form of address family dependent message.
408 ****/ 408 ****/
409 409
410 struct rtgenmsg { 410 struct rtgenmsg {
411 unsigned char rtgen_family; 411 unsigned char rtgen_family;
412 }; 412 };
413 413
414 /***************************************************************** 414 /*****************************************************************
415 * Link layer specific messages. 415 * Link layer specific messages.
416 ****/ 416 ****/
417 417
418 /* struct ifinfomsg 418 /* struct ifinfomsg
419 * passes link level specific information, not dependent 419 * passes link level specific information, not dependent
420 * on network protocol. 420 * on network protocol.
421 */ 421 */
422 422
423 struct ifinfomsg { 423 struct ifinfomsg {
424 unsigned char ifi_family; 424 unsigned char ifi_family;
425 unsigned char __ifi_pad; 425 unsigned char __ifi_pad;
426 unsigned short ifi_type; /* ARPHRD_* */ 426 unsigned short ifi_type; /* ARPHRD_* */
427 int ifi_index; /* Link index */ 427 int ifi_index; /* Link index */
428 unsigned ifi_flags; /* IFF_* flags */ 428 unsigned ifi_flags; /* IFF_* flags */
429 unsigned ifi_change; /* IFF_* change mask */ 429 unsigned ifi_change; /* IFF_* change mask */
430 }; 430 };
431 431
432 /******************************************************************** 432 /********************************************************************
433 * prefix information 433 * prefix information
434 ****/ 434 ****/
435 435
436 struct prefixmsg { 436 struct prefixmsg {
437 unsigned char prefix_family; 437 unsigned char prefix_family;
438 unsigned char prefix_pad1; 438 unsigned char prefix_pad1;
439 unsigned short prefix_pad2; 439 unsigned short prefix_pad2;
440 int prefix_ifindex; 440 int prefix_ifindex;
441 unsigned char prefix_type; 441 unsigned char prefix_type;
442 unsigned char prefix_len; 442 unsigned char prefix_len;
443 unsigned char prefix_flags; 443 unsigned char prefix_flags;
444 unsigned char prefix_pad3; 444 unsigned char prefix_pad3;
445 }; 445 };
446 446
447 enum 447 enum
448 { 448 {
449 PREFIX_UNSPEC, 449 PREFIX_UNSPEC,
450 PREFIX_ADDRESS, 450 PREFIX_ADDRESS,
451 PREFIX_CACHEINFO, 451 PREFIX_CACHEINFO,
452 __PREFIX_MAX 452 __PREFIX_MAX
453 }; 453 };
454 454
455 #define PREFIX_MAX (__PREFIX_MAX - 1) 455 #define PREFIX_MAX (__PREFIX_MAX - 1)
456 456
457 struct prefix_cacheinfo { 457 struct prefix_cacheinfo {
458 __u32 preferred_time; 458 __u32 preferred_time;
459 __u32 valid_time; 459 __u32 valid_time;
460 }; 460 };
461 461
462 462
463 /***************************************************************** 463 /*****************************************************************
464 * Traffic control messages. 464 * Traffic control messages.
465 ****/ 465 ****/
466 466
467 struct tcmsg { 467 struct tcmsg {
468 unsigned char tcm_family; 468 unsigned char tcm_family;
469 unsigned char tcm__pad1; 469 unsigned char tcm__pad1;
470 unsigned short tcm__pad2; 470 unsigned short tcm__pad2;
471 int tcm_ifindex; 471 int tcm_ifindex;
472 __u32 tcm_handle; 472 __u32 tcm_handle;
473 __u32 tcm_parent; 473 __u32 tcm_parent;
474 __u32 tcm_info; 474 __u32 tcm_info;
475 }; 475 };
476 476
477 enum { 477 enum {
478 TCA_UNSPEC, 478 TCA_UNSPEC,
479 TCA_KIND, 479 TCA_KIND,
480 TCA_OPTIONS, 480 TCA_OPTIONS,
481 TCA_STATS, 481 TCA_STATS,
482 TCA_XSTATS, 482 TCA_XSTATS,
483 TCA_RATE, 483 TCA_RATE,
484 TCA_FCNT, 484 TCA_FCNT,
485 TCA_STATS2, 485 TCA_STATS2,
486 TCA_STAB, 486 TCA_STAB,
487 __TCA_MAX 487 __TCA_MAX
488 }; 488 };
489 489
490 #define TCA_MAX (__TCA_MAX - 1) 490 #define TCA_MAX (__TCA_MAX - 1)
491 491
492 #define TCA_RTA(r) ((struct rtattr*)(((char*)(r)) + NLMSG_ALIGN(sizeof(struct tcmsg)))) 492 #define TCA_RTA(r) ((struct rtattr*)(((char*)(r)) + NLMSG_ALIGN(sizeof(struct tcmsg))))
493 #define TCA_PAYLOAD(n) NLMSG_PAYLOAD(n,sizeof(struct tcmsg)) 493 #define TCA_PAYLOAD(n) NLMSG_PAYLOAD(n,sizeof(struct tcmsg))
494 494
495 /******************************************************************** 495 /********************************************************************
496 * Neighbor Discovery userland options 496 * Neighbor Discovery userland options
497 ****/ 497 ****/
498 498
499 struct nduseroptmsg { 499 struct nduseroptmsg {
500 unsigned char nduseropt_family; 500 unsigned char nduseropt_family;
501 unsigned char nduseropt_pad1; 501 unsigned char nduseropt_pad1;
502 unsigned short nduseropt_opts_len; /* Total length of options */ 502 unsigned short nduseropt_opts_len; /* Total length of options */
503 int nduseropt_ifindex; 503 int nduseropt_ifindex;
504 __u8 nduseropt_icmp_type; 504 __u8 nduseropt_icmp_type;
505 __u8 nduseropt_icmp_code; 505 __u8 nduseropt_icmp_code;
506 unsigned short nduseropt_pad2; 506 unsigned short nduseropt_pad2;
507 unsigned int nduseropt_pad3; 507 unsigned int nduseropt_pad3;
508 /* Followed by one or more ND options */ 508 /* Followed by one or more ND options */
509 }; 509 };
510 510
511 enum { 511 enum {
512 NDUSEROPT_UNSPEC, 512 NDUSEROPT_UNSPEC,
513 NDUSEROPT_SRCADDR, 513 NDUSEROPT_SRCADDR,
514 __NDUSEROPT_MAX 514 __NDUSEROPT_MAX
515 }; 515 };
516 516
517 #define NDUSEROPT_MAX (__NDUSEROPT_MAX - 1) 517 #define NDUSEROPT_MAX (__NDUSEROPT_MAX - 1)
518 518
519 #ifndef __KERNEL__ 519 #ifndef __KERNEL__
520 /* RTnetlink multicast groups - backwards compatibility for userspace */ 520 /* RTnetlink multicast groups - backwards compatibility for userspace */
521 #define RTMGRP_LINK 1 521 #define RTMGRP_LINK 1
522 #define RTMGRP_NOTIFY 2 522 #define RTMGRP_NOTIFY 2
523 #define RTMGRP_NEIGH 4 523 #define RTMGRP_NEIGH 4
524 #define RTMGRP_TC 8 524 #define RTMGRP_TC 8
525 525
526 #define RTMGRP_IPV4_IFADDR 0x10 526 #define RTMGRP_IPV4_IFADDR 0x10
527 #define RTMGRP_IPV4_MROUTE 0x20 527 #define RTMGRP_IPV4_MROUTE 0x20
528 #define RTMGRP_IPV4_ROUTE 0x40 528 #define RTMGRP_IPV4_ROUTE 0x40
529 #define RTMGRP_IPV4_RULE 0x80 529 #define RTMGRP_IPV4_RULE 0x80
530 530
531 #define RTMGRP_IPV6_IFADDR 0x100 531 #define RTMGRP_IPV6_IFADDR 0x100
532 #define RTMGRP_IPV6_MROUTE 0x200 532 #define RTMGRP_IPV6_MROUTE 0x200
533 #define RTMGRP_IPV6_ROUTE 0x400 533 #define RTMGRP_IPV6_ROUTE 0x400
534 #define RTMGRP_IPV6_IFINFO 0x800 534 #define RTMGRP_IPV6_IFINFO 0x800
535 535
536 #define RTMGRP_DECnet_IFADDR 0x1000 536 #define RTMGRP_DECnet_IFADDR 0x1000
537 #define RTMGRP_DECnet_ROUTE 0x4000 537 #define RTMGRP_DECnet_ROUTE 0x4000
538 538
539 #define RTMGRP_IPV6_PREFIX 0x20000 539 #define RTMGRP_IPV6_PREFIX 0x20000
540 #endif 540 #endif
541 541
542 /* RTnetlink multicast groups */ 542 /* RTnetlink multicast groups */
543 enum rtnetlink_groups { 543 enum rtnetlink_groups {
544 RTNLGRP_NONE, 544 RTNLGRP_NONE,
545 #define RTNLGRP_NONE RTNLGRP_NONE 545 #define RTNLGRP_NONE RTNLGRP_NONE
546 RTNLGRP_LINK, 546 RTNLGRP_LINK,
547 #define RTNLGRP_LINK RTNLGRP_LINK 547 #define RTNLGRP_LINK RTNLGRP_LINK
548 RTNLGRP_NOTIFY, 548 RTNLGRP_NOTIFY,
549 #define RTNLGRP_NOTIFY RTNLGRP_NOTIFY 549 #define RTNLGRP_NOTIFY RTNLGRP_NOTIFY
550 RTNLGRP_NEIGH, 550 RTNLGRP_NEIGH,
551 #define RTNLGRP_NEIGH RTNLGRP_NEIGH 551 #define RTNLGRP_NEIGH RTNLGRP_NEIGH
552 RTNLGRP_TC, 552 RTNLGRP_TC,
553 #define RTNLGRP_TC RTNLGRP_TC 553 #define RTNLGRP_TC RTNLGRP_TC
554 RTNLGRP_IPV4_IFADDR, 554 RTNLGRP_IPV4_IFADDR,
555 #define RTNLGRP_IPV4_IFADDR RTNLGRP_IPV4_IFADDR 555 #define RTNLGRP_IPV4_IFADDR RTNLGRP_IPV4_IFADDR
556 RTNLGRP_IPV4_MROUTE, 556 RTNLGRP_IPV4_MROUTE,
557 #define RTNLGRP_IPV4_MROUTE RTNLGRP_IPV4_MROUTE 557 #define RTNLGRP_IPV4_MROUTE RTNLGRP_IPV4_MROUTE
558 RTNLGRP_IPV4_ROUTE, 558 RTNLGRP_IPV4_ROUTE,
559 #define RTNLGRP_IPV4_ROUTE RTNLGRP_IPV4_ROUTE 559 #define RTNLGRP_IPV4_ROUTE RTNLGRP_IPV4_ROUTE
560 RTNLGRP_IPV4_RULE, 560 RTNLGRP_IPV4_RULE,
561 #define RTNLGRP_IPV4_RULE RTNLGRP_IPV4_RULE 561 #define RTNLGRP_IPV4_RULE RTNLGRP_IPV4_RULE
562 RTNLGRP_IPV6_IFADDR, 562 RTNLGRP_IPV6_IFADDR,
563 #define RTNLGRP_IPV6_IFADDR RTNLGRP_IPV6_IFADDR 563 #define RTNLGRP_IPV6_IFADDR RTNLGRP_IPV6_IFADDR
564 RTNLGRP_IPV6_MROUTE, 564 RTNLGRP_IPV6_MROUTE,
565 #define RTNLGRP_IPV6_MROUTE RTNLGRP_IPV6_MROUTE 565 #define RTNLGRP_IPV6_MROUTE RTNLGRP_IPV6_MROUTE
566 RTNLGRP_IPV6_ROUTE, 566 RTNLGRP_IPV6_ROUTE,
567 #define RTNLGRP_IPV6_ROUTE RTNLGRP_IPV6_ROUTE 567 #define RTNLGRP_IPV6_ROUTE RTNLGRP_IPV6_ROUTE
568 RTNLGRP_IPV6_IFINFO, 568 RTNLGRP_IPV6_IFINFO,
569 #define RTNLGRP_IPV6_IFINFO RTNLGRP_IPV6_IFINFO 569 #define RTNLGRP_IPV6_IFINFO RTNLGRP_IPV6_IFINFO
570 RTNLGRP_DECnet_IFADDR, 570 RTNLGRP_DECnet_IFADDR,
571 #define RTNLGRP_DECnet_IFADDR RTNLGRP_DECnet_IFADDR 571 #define RTNLGRP_DECnet_IFADDR RTNLGRP_DECnet_IFADDR
572 RTNLGRP_NOP2, 572 RTNLGRP_NOP2,
573 RTNLGRP_DECnet_ROUTE, 573 RTNLGRP_DECnet_ROUTE,
574 #define RTNLGRP_DECnet_ROUTE RTNLGRP_DECnet_ROUTE 574 #define RTNLGRP_DECnet_ROUTE RTNLGRP_DECnet_ROUTE
575 RTNLGRP_DECnet_RULE, 575 RTNLGRP_DECnet_RULE,
576 #define RTNLGRP_DECnet_RULE RTNLGRP_DECnet_RULE 576 #define RTNLGRP_DECnet_RULE RTNLGRP_DECnet_RULE
577 RTNLGRP_NOP4, 577 RTNLGRP_NOP4,
578 RTNLGRP_IPV6_PREFIX, 578 RTNLGRP_IPV6_PREFIX,
579 #define RTNLGRP_IPV6_PREFIX RTNLGRP_IPV6_PREFIX 579 #define RTNLGRP_IPV6_PREFIX RTNLGRP_IPV6_PREFIX
580 RTNLGRP_IPV6_RULE, 580 RTNLGRP_IPV6_RULE,
581 #define RTNLGRP_IPV6_RULE RTNLGRP_IPV6_RULE 581 #define RTNLGRP_IPV6_RULE RTNLGRP_IPV6_RULE
582 RTNLGRP_ND_USEROPT, 582 RTNLGRP_ND_USEROPT,
583 #define RTNLGRP_ND_USEROPT RTNLGRP_ND_USEROPT 583 #define RTNLGRP_ND_USEROPT RTNLGRP_ND_USEROPT
584 RTNLGRP_PHONET_IFADDR, 584 RTNLGRP_PHONET_IFADDR,
585 #define RTNLGRP_PHONET_IFADDR RTNLGRP_PHONET_IFADDR 585 #define RTNLGRP_PHONET_IFADDR RTNLGRP_PHONET_IFADDR
586 RTNLGRP_PHONET_ROUTE, 586 RTNLGRP_PHONET_ROUTE,
587 #define RTNLGRP_PHONET_ROUTE RTNLGRP_PHONET_ROUTE 587 #define RTNLGRP_PHONET_ROUTE RTNLGRP_PHONET_ROUTE
588 __RTNLGRP_MAX 588 __RTNLGRP_MAX
589 }; 589 };
590 #define RTNLGRP_MAX (__RTNLGRP_MAX - 1) 590 #define RTNLGRP_MAX (__RTNLGRP_MAX - 1)
591 591
592 /* TC action piece */ 592 /* TC action piece */
593 struct tcamsg { 593 struct tcamsg {
594 unsigned char tca_family; 594 unsigned char tca_family;
595 unsigned char tca__pad1; 595 unsigned char tca__pad1;
596 unsigned short tca__pad2; 596 unsigned short tca__pad2;
597 }; 597 };
598 #define TA_RTA(r) ((struct rtattr*)(((char*)(r)) + NLMSG_ALIGN(sizeof(struct tcamsg)))) 598 #define TA_RTA(r) ((struct rtattr*)(((char*)(r)) + NLMSG_ALIGN(sizeof(struct tcamsg))))
599 #define TA_PAYLOAD(n) NLMSG_PAYLOAD(n,sizeof(struct tcamsg)) 599 #define TA_PAYLOAD(n) NLMSG_PAYLOAD(n,sizeof(struct tcamsg))
600 #define TCA_ACT_TAB 1 /* attr type must be >=1 */ 600 #define TCA_ACT_TAB 1 /* attr type must be >=1 */
601 #define TCAA_MAX 1 601 #define TCAA_MAX 1
602 602
603 /* End of information exported to user level */ 603 /* End of information exported to user level */
604 604
605 #ifdef __KERNEL__ 605 #ifdef __KERNEL__
606 606
607 #include <linux/mutex.h> 607 #include <linux/mutex.h>
608 #include <linux/netdevice.h> 608 #include <linux/netdevice.h>
609 609
610 static __inline__ int rtattr_strcmp(const struct rtattr *rta, const char *str) 610 static __inline__ int rtattr_strcmp(const struct rtattr *rta, const char *str)
611 { 611 {
612 int len = strlen(str) + 1; 612 int len = strlen(str) + 1;
613 return len > rta->rta_len || memcmp(RTA_DATA(rta), str, len); 613 return len > rta->rta_len || memcmp(RTA_DATA(rta), str, len);
614 } 614 }
615 615
616 extern int rtnetlink_send(struct sk_buff *skb, struct net *net, u32 pid, u32 group, int echo); 616 extern int rtnetlink_send(struct sk_buff *skb, struct net *net, u32 pid, u32 group, int echo);
617 extern int rtnl_unicast(struct sk_buff *skb, struct net *net, u32 pid); 617 extern int rtnl_unicast(struct sk_buff *skb, struct net *net, u32 pid);
618 extern void rtnl_notify(struct sk_buff *skb, struct net *net, u32 pid, 618 extern void rtnl_notify(struct sk_buff *skb, struct net *net, u32 pid,
619 u32 group, struct nlmsghdr *nlh, gfp_t flags); 619 u32 group, struct nlmsghdr *nlh, gfp_t flags);
620 extern void rtnl_set_sk_err(struct net *net, u32 group, int error); 620 extern void rtnl_set_sk_err(struct net *net, u32 group, int error);
621 extern int rtnetlink_put_metrics(struct sk_buff *skb, u32 *metrics); 621 extern int rtnetlink_put_metrics(struct sk_buff *skb, u32 *metrics);
622 extern int rtnl_put_cacheinfo(struct sk_buff *skb, struct dst_entry *dst, 622 extern int rtnl_put_cacheinfo(struct sk_buff *skb, struct dst_entry *dst,
623 u32 id, u32 ts, u32 tsage, long expires, 623 u32 id, u32 ts, u32 tsage, long expires,
624 u32 error); 624 u32 error);
625 625
626 extern void __rta_fill(struct sk_buff *skb, int attrtype, int attrlen, const void *data); 626 extern void __rta_fill(struct sk_buff *skb, int attrtype, int attrlen, const void *data);
627 627
628 #define RTA_PUT(skb, attrtype, attrlen, data) \ 628 #define RTA_PUT(skb, attrtype, attrlen, data) \
629 ({ if (unlikely(skb_tailroom(skb) < (int)RTA_SPACE(attrlen))) \ 629 ({ if (unlikely(skb_tailroom(skb) < (int)RTA_SPACE(attrlen))) \
630 goto rtattr_failure; \ 630 goto rtattr_failure; \
631 __rta_fill(skb, attrtype, attrlen, data); }) 631 __rta_fill(skb, attrtype, attrlen, data); })
632 632
633 #define RTA_APPEND(skb, attrlen, data) \ 633 #define RTA_APPEND(skb, attrlen, data) \
634 ({ if (unlikely(skb_tailroom(skb) < (int)(attrlen))) \ 634 ({ if (unlikely(skb_tailroom(skb) < (int)(attrlen))) \
635 goto rtattr_failure; \ 635 goto rtattr_failure; \
636 memcpy(skb_put(skb, attrlen), data, attrlen); }) 636 memcpy(skb_put(skb, attrlen), data, attrlen); })
637 637
638 #define RTA_PUT_NOHDR(skb, attrlen, data) \ 638 #define RTA_PUT_NOHDR(skb, attrlen, data) \
639 ({ RTA_APPEND(skb, RTA_ALIGN(attrlen), data); \ 639 ({ RTA_APPEND(skb, RTA_ALIGN(attrlen), data); \
640 memset(skb_tail_pointer(skb) - (RTA_ALIGN(attrlen) - attrlen), 0, \ 640 memset(skb_tail_pointer(skb) - (RTA_ALIGN(attrlen) - attrlen), 0, \
641 RTA_ALIGN(attrlen) - attrlen); }) 641 RTA_ALIGN(attrlen) - attrlen); })
642 642
643 #define RTA_PUT_U8(skb, attrtype, value) \ 643 #define RTA_PUT_U8(skb, attrtype, value) \
644 ({ u8 _tmp = (value); \ 644 ({ u8 _tmp = (value); \
645 RTA_PUT(skb, attrtype, sizeof(u8), &_tmp); }) 645 RTA_PUT(skb, attrtype, sizeof(u8), &_tmp); })
646 646
647 #define RTA_PUT_U16(skb, attrtype, value) \ 647 #define RTA_PUT_U16(skb, attrtype, value) \
648 ({ u16 _tmp = (value); \ 648 ({ u16 _tmp = (value); \
649 RTA_PUT(skb, attrtype, sizeof(u16), &_tmp); }) 649 RTA_PUT(skb, attrtype, sizeof(u16), &_tmp); })
650 650
651 #define RTA_PUT_U32(skb, attrtype, value) \ 651 #define RTA_PUT_U32(skb, attrtype, value) \
652 ({ u32 _tmp = (value); \ 652 ({ u32 _tmp = (value); \
653 RTA_PUT(skb, attrtype, sizeof(u32), &_tmp); }) 653 RTA_PUT(skb, attrtype, sizeof(u32), &_tmp); })
654 654
655 #define RTA_PUT_U64(skb, attrtype, value) \ 655 #define RTA_PUT_U64(skb, attrtype, value) \
656 ({ u64 _tmp = (value); \ 656 ({ u64 _tmp = (value); \
657 RTA_PUT(skb, attrtype, sizeof(u64), &_tmp); }) 657 RTA_PUT(skb, attrtype, sizeof(u64), &_tmp); })
658 658
659 #define RTA_PUT_SECS(skb, attrtype, value) \ 659 #define RTA_PUT_SECS(skb, attrtype, value) \
660 RTA_PUT_U64(skb, attrtype, (value) / HZ) 660 RTA_PUT_U64(skb, attrtype, (value) / HZ)
661 661
662 #define RTA_PUT_MSECS(skb, attrtype, value) \ 662 #define RTA_PUT_MSECS(skb, attrtype, value) \
663 RTA_PUT_U64(skb, attrtype, jiffies_to_msecs(value)) 663 RTA_PUT_U64(skb, attrtype, jiffies_to_msecs(value))
664 664
665 #define RTA_PUT_STRING(skb, attrtype, value) \ 665 #define RTA_PUT_STRING(skb, attrtype, value) \
666 RTA_PUT(skb, attrtype, strlen(value) + 1, value) 666 RTA_PUT(skb, attrtype, strlen(value) + 1, value)
667 667
668 #define RTA_PUT_FLAG(skb, attrtype) \ 668 #define RTA_PUT_FLAG(skb, attrtype) \
669 RTA_PUT(skb, attrtype, 0, NULL); 669 RTA_PUT(skb, attrtype, 0, NULL);
670 670
671 #define RTA_NEST(skb, type) \ 671 #define RTA_NEST(skb, type) \
672 ({ struct rtattr *__start = (struct rtattr *)skb_tail_pointer(skb); \ 672 ({ struct rtattr *__start = (struct rtattr *)skb_tail_pointer(skb); \
673 RTA_PUT(skb, type, 0, NULL); \ 673 RTA_PUT(skb, type, 0, NULL); \
674 __start; }) 674 __start; })
675 675
676 #define RTA_NEST_END(skb, start) \ 676 #define RTA_NEST_END(skb, start) \
677 ({ (start)->rta_len = skb_tail_pointer(skb) - (unsigned char *)(start); \ 677 ({ (start)->rta_len = skb_tail_pointer(skb) - (unsigned char *)(start); \
678 (skb)->len; }) 678 (skb)->len; })
679 679
680 #define RTA_NEST_COMPAT(skb, type, attrlen, data) \ 680 #define RTA_NEST_COMPAT(skb, type, attrlen, data) \
681 ({ struct rtattr *__start = (struct rtattr *)skb_tail_pointer(skb); \ 681 ({ struct rtattr *__start = (struct rtattr *)skb_tail_pointer(skb); \
682 RTA_PUT(skb, type, attrlen, data); \ 682 RTA_PUT(skb, type, attrlen, data); \
683 RTA_NEST(skb, type); \ 683 RTA_NEST(skb, type); \
684 __start; }) 684 __start; })
685 685
686 #define RTA_NEST_COMPAT_END(skb, start) \ 686 #define RTA_NEST_COMPAT_END(skb, start) \
687 ({ struct rtattr *__nest = (void *)(start) + NLMSG_ALIGN((start)->rta_len); \ 687 ({ struct rtattr *__nest = (void *)(start) + NLMSG_ALIGN((start)->rta_len); \
688 (start)->rta_len = skb_tail_pointer(skb) - (unsigned char *)(start); \ 688 (start)->rta_len = skb_tail_pointer(skb) - (unsigned char *)(start); \
689 RTA_NEST_END(skb, __nest); \ 689 RTA_NEST_END(skb, __nest); \
690 (skb)->len; }) 690 (skb)->len; })
691 691
692 #define RTA_NEST_CANCEL(skb, start) \ 692 #define RTA_NEST_CANCEL(skb, start) \
693 ({ if (start) \ 693 ({ if (start) \
694 skb_trim(skb, (unsigned char *) (start) - (skb)->data); \ 694 skb_trim(skb, (unsigned char *) (start) - (skb)->data); \
695 -1; }) 695 -1; })
696 696
697 #define RTA_GET_U8(rta) \ 697 #define RTA_GET_U8(rta) \
698 ({ if (!rta || RTA_PAYLOAD(rta) < sizeof(u8)) \ 698 ({ if (!rta || RTA_PAYLOAD(rta) < sizeof(u8)) \
699 goto rtattr_failure; \ 699 goto rtattr_failure; \
700 *(u8 *) RTA_DATA(rta); }) 700 *(u8 *) RTA_DATA(rta); })
701 701
702 #define RTA_GET_U16(rta) \ 702 #define RTA_GET_U16(rta) \
703 ({ if (!rta || RTA_PAYLOAD(rta) < sizeof(u16)) \ 703 ({ if (!rta || RTA_PAYLOAD(rta) < sizeof(u16)) \
704 goto rtattr_failure; \ 704 goto rtattr_failure; \
705 *(u16 *) RTA_DATA(rta); }) 705 *(u16 *) RTA_DATA(rta); })
706 706
707 #define RTA_GET_U32(rta) \ 707 #define RTA_GET_U32(rta) \
708 ({ if (!rta || RTA_PAYLOAD(rta) < sizeof(u32)) \ 708 ({ if (!rta || RTA_PAYLOAD(rta) < sizeof(u32)) \
709 goto rtattr_failure; \ 709 goto rtattr_failure; \
710 *(u32 *) RTA_DATA(rta); }) 710 *(u32 *) RTA_DATA(rta); })
711 711
712 #define RTA_GET_U64(rta) \ 712 #define RTA_GET_U64(rta) \
713 ({ u64 _tmp; \ 713 ({ u64 _tmp; \
714 if (!rta || RTA_PAYLOAD(rta) < sizeof(u64)) \ 714 if (!rta || RTA_PAYLOAD(rta) < sizeof(u64)) \
715 goto rtattr_failure; \ 715 goto rtattr_failure; \
716 memcpy(&_tmp, RTA_DATA(rta), sizeof(_tmp)); \ 716 memcpy(&_tmp, RTA_DATA(rta), sizeof(_tmp)); \
717 _tmp; }) 717 _tmp; })
718 718
719 #define RTA_GET_FLAG(rta) (!!(rta)) 719 #define RTA_GET_FLAG(rta) (!!(rta))
720 720
721 #define RTA_GET_SECS(rta) ((unsigned long) RTA_GET_U64(rta) * HZ) 721 #define RTA_GET_SECS(rta) ((unsigned long) RTA_GET_U64(rta) * HZ)
722 #define RTA_GET_MSECS(rta) (msecs_to_jiffies((unsigned long) RTA_GET_U64(rta))) 722 #define RTA_GET_MSECS(rta) (msecs_to_jiffies((unsigned long) RTA_GET_U64(rta)))
723 723
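All of the RTA_PUT*/RTA_GET* helpers above bail out through a local rtattr_failure label, so a dump or parse routine is expected to provide one. A minimal sketch of that pattern follows; the function name and the choice of RTA_METRICS/RTAX_MTU are purely illustrative, not taken from this commit.

#include <linux/rtnetlink.h>

/* Illustrative only: shows the goto-based error handling these macros
 * expect from their caller.
 */
static int example_fill_metrics(struct sk_buff *skb, u32 mtu)
{
	struct rtattr *nest = NULL;

	nest = RTA_NEST(skb, RTA_METRICS);	/* open a nested attribute */
	RTA_PUT_U32(skb, RTAX_MTU, mtu);	/* append one u32 attribute */
	RTA_NEST_END(skb, nest);		/* patch in the final nest length */
	return 0;

rtattr_failure:					/* taken when tailroom runs out */
	RTA_NEST_CANCEL(skb, nest);		/* trims anything partially added */
	return -EMSGSIZE;
}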
724 static inline struct rtattr * 724 static inline struct rtattr *
725 __rta_reserve(struct sk_buff *skb, int attrtype, int attrlen) 725 __rta_reserve(struct sk_buff *skb, int attrtype, int attrlen)
726 { 726 {
727 struct rtattr *rta; 727 struct rtattr *rta;
728 int size = RTA_LENGTH(attrlen); 728 int size = RTA_LENGTH(attrlen);
729 729
730 rta = (struct rtattr*)skb_put(skb, RTA_ALIGN(size)); 730 rta = (struct rtattr*)skb_put(skb, RTA_ALIGN(size));
731 rta->rta_type = attrtype; 731 rta->rta_type = attrtype;
732 rta->rta_len = size; 732 rta->rta_len = size;
733 memset(RTA_DATA(rta) + attrlen, 0, RTA_ALIGN(size) - size); 733 memset(RTA_DATA(rta) + attrlen, 0, RTA_ALIGN(size) - size);
734 return rta; 734 return rta;
735 } 735 }
736 736
737 #define __RTA_PUT(skb, attrtype, attrlen) \ 737 #define __RTA_PUT(skb, attrtype, attrlen) \
738 ({ if (unlikely(skb_tailroom(skb) < (int)RTA_SPACE(attrlen))) \ 738 ({ if (unlikely(skb_tailroom(skb) < (int)RTA_SPACE(attrlen))) \
739 goto rtattr_failure; \ 739 goto rtattr_failure; \
740 __rta_reserve(skb, attrtype, attrlen); }) 740 __rta_reserve(skb, attrtype, attrlen); })
741 741
742 extern void rtmsg_ifinfo(int type, struct net_device *dev, unsigned change); 742 extern void rtmsg_ifinfo(int type, struct net_device *dev, unsigned change);
743 743
744 /* RTNL is used as a global lock for all changes to network configuration */ 744 /* RTNL is used as a global lock for all changes to network configuration */
745 extern void rtnl_lock(void); 745 extern void rtnl_lock(void);
746 extern void rtnl_unlock(void); 746 extern void rtnl_unlock(void);
747 extern int rtnl_trylock(void); 747 extern int rtnl_trylock(void);
748 extern int rtnl_is_locked(void); 748 extern int rtnl_is_locked(void);
749 #ifdef CONFIG_PROVE_LOCKING 749 #ifdef CONFIG_PROVE_LOCKING
750 extern int lockdep_rtnl_is_held(void); 750 extern int lockdep_rtnl_is_held(void);
751 #endif /* #ifdef CONFIG_PROVE_LOCKING */ 751 #endif /* #ifdef CONFIG_PROVE_LOCKING */
752 752
753 /** 753 /**
754 * rcu_dereference_rtnl - rcu_dereference with debug checking 754 * rcu_dereference_rtnl - rcu_dereference with debug checking
755 * @p: The pointer to read, prior to dereferencing 755 * @p: The pointer to read, prior to dereferencing
756 * 756 *
757 * Do an rcu_dereference(p), but check caller either holds rcu_read_lock() 757 * Do an rcu_dereference(p), but check caller either holds rcu_read_lock()
758 * or RTNL. Note : Please prefer rtnl_dereference() or rcu_dereference() 758 * or RTNL. Note : Please prefer rtnl_dereference() or rcu_dereference()
759 */ 759 */
760 #define rcu_dereference_rtnl(p) \ 760 #define rcu_dereference_rtnl(p) \
761 rcu_dereference_check(p, rcu_read_lock_held() || \ 761 rcu_dereference_check(p, lockdep_rtnl_is_held())
762 lockdep_rtnl_is_held())
763 762
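With rcu_dereference_check() now folding rcu_read_lock_held() into its condition, passing only lockdep_rtnl_is_held() still covers both legitimate callers. A minimal sketch, assuming a hypothetical RCU-published pointer example_dev (both functions are made up for illustration):

#include <linux/rtnetlink.h>
#include <linux/netdevice.h>

static struct net_device __rcu *example_dev;	/* hypothetical RCU pointer */

static void example_reader(void)
{
	struct net_device *dev;

	rcu_read_lock();
	dev = rcu_dereference_rtnl(example_dev);	/* OK: rcu_read_lock() held */
	if (dev)
		pr_debug("reader sees %s\n", dev->name);
	rcu_read_unlock();
}

static void example_writer_path(void)
{
	struct net_device *dev;

	rtnl_lock();
	dev = rcu_dereference_rtnl(example_dev);	/* OK: RTNL held instead */
	rtnl_unlock();
	(void)dev;
}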
764 /** 763 /**
765 * rtnl_dereference - fetch RCU pointer when updates are prevented by RTNL 764 * rtnl_dereference - fetch RCU pointer when updates are prevented by RTNL
766 * @p: The pointer to read, prior to dereferencing 765 * @p: The pointer to read, prior to dereferencing
767 * 766 *
768 * Return the value of the specified RCU-protected pointer, but omit 767 * Return the value of the specified RCU-protected pointer, but omit
769 * both the smp_read_barrier_depends() and the ACCESS_ONCE(), because 768 * both the smp_read_barrier_depends() and the ACCESS_ONCE(), because
770 * caller holds RTNL. 769 * caller holds RTNL.
771 */ 770 */
772 #define rtnl_dereference(p) \ 771 #define rtnl_dereference(p) \
773 rcu_dereference_protected(p, lockdep_rtnl_is_held()) 772 rcu_dereference_protected(p, lockdep_rtnl_is_held())
774 773
775 static inline struct netdev_queue *dev_ingress_queue(struct net_device *dev) 774 static inline struct netdev_queue *dev_ingress_queue(struct net_device *dev)
776 { 775 {
777 return rtnl_dereference(dev->ingress_queue); 776 return rtnl_dereference(dev->ingress_queue);
778 } 777 }
779 778
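rtnl_dereference() is the update-side counterpart: the caller must hold RTNL, which also makes it the natural place to publish a replacement with rcu_assign_pointer(). The dev_ingress_queue() helper above is the in-tree example; the sketch below reuses the hypothetical example_dev pointer from the previous sketch.

static void example_update(struct net_device *new)
{
	struct net_device *old;

	ASSERT_RTNL();					/* caller must hold RTNL */
	old = rtnl_dereference(example_dev);		/* no barriers needed here */
	rcu_assign_pointer(example_dev, new);		/* publish for RCU readers */
	if (old)
		pr_debug("replaced %s\n", old->name);
}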
780 extern struct netdev_queue *dev_ingress_queue_create(struct net_device *dev); 779 extern struct netdev_queue *dev_ingress_queue_create(struct net_device *dev);
781 780
782 extern void rtnetlink_init(void); 781 extern void rtnetlink_init(void);
783 extern void __rtnl_unlock(void); 782 extern void __rtnl_unlock(void);
784 783
785 #define ASSERT_RTNL() do { \ 784 #define ASSERT_RTNL() do { \
786 if (unlikely(!rtnl_is_locked())) { \ 785 if (unlikely(!rtnl_is_locked())) { \
787 printk(KERN_ERR "RTNL: assertion failed at %s (%d)\n", \ 786 printk(KERN_ERR "RTNL: assertion failed at %s (%d)\n", \
788 __FILE__, __LINE__); \ 787 __FILE__, __LINE__); \
789 dump_stack(); \ 788 dump_stack(); \
790 } \ 789 } \
791 } while(0) 790 } while(0)
792 791
793 static inline u32 rtm_get_table(struct rtattr **rta, u8 table) 792 static inline u32 rtm_get_table(struct rtattr **rta, u8 table)
794 { 793 {
795 return RTA_GET_U32(rta[RTA_TABLE-1]); 794 return RTA_GET_U32(rta[RTA_TABLE-1]);
796 rtattr_failure: 795 rtattr_failure:
797 return table; 796 return table;
798 } 797 }
799 798
800 #endif /* __KERNEL__ */ 799 #endif /* __KERNEL__ */
801 800
802 801
803 #endif /* __LINUX_RTNETLINK_H */ 802 #endif /* __LINUX_RTNETLINK_H */
804 803
1 /* 1 /*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX 2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket 3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level. 4 * interface as the means of communication with the user level.
5 * 5 *
6 * Definitions for the AF_INET socket handler. 6 * Definitions for the AF_INET socket handler.
7 * 7 *
8 * Version: @(#)sock.h 1.0.4 05/13/93 8 * Version: @(#)sock.h 1.0.4 05/13/93
9 * 9 *
10 * Authors: Ross Biro 10 * Authors: Ross Biro
11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> 11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 * Corey Minyard <wf-rch!minyard@relay.EU.net> 12 * Corey Minyard <wf-rch!minyard@relay.EU.net>
13 * Florian La Roche <flla@stud.uni-sb.de> 13 * Florian La Roche <flla@stud.uni-sb.de>
14 * 14 *
15 * Fixes: 15 * Fixes:
16 * Alan Cox : Volatiles in skbuff pointers. See 16 * Alan Cox : Volatiles in skbuff pointers. See
17 * skbuff comments. May be overdone, 17 * skbuff comments. May be overdone,
18 * better to prove they can be removed 18 * better to prove they can be removed
19 * than the reverse. 19 * than the reverse.
20 * Alan Cox : Added a zapped field for tcp to note 20 * Alan Cox : Added a zapped field for tcp to note
21 * a socket is reset and must stay shut up 21 * a socket is reset and must stay shut up
22 * Alan Cox : New fields for options 22 * Alan Cox : New fields for options
23 * Pauline Middelink : identd support 23 * Pauline Middelink : identd support
24 * Alan Cox : Eliminate low level recv/recvfrom 24 * Alan Cox : Eliminate low level recv/recvfrom
25 * David S. Miller : New socket lookup architecture. 25 * David S. Miller : New socket lookup architecture.
26 * Steve Whitehouse: Default routines for sock_ops 26 * Steve Whitehouse: Default routines for sock_ops
27 * Arnaldo C. Melo : removed net_pinfo, tp_pinfo and made 27 * Arnaldo C. Melo : removed net_pinfo, tp_pinfo and made
28 * protinfo be just a void pointer, as the 28 * protinfo be just a void pointer, as the
29 * protocol specific parts were moved to 29 * protocol specific parts were moved to
30 * respective headers and ipv4/v6, etc now 30 * respective headers and ipv4/v6, etc now
31 * use private slabcaches for its socks 31 * use private slabcaches for its socks
32 * Pedro Hortas : New flags field for socket options 32 * Pedro Hortas : New flags field for socket options
33 * 33 *
34 * 34 *
35 * This program is free software; you can redistribute it and/or 35 * This program is free software; you can redistribute it and/or
36 * modify it under the terms of the GNU General Public License 36 * modify it under the terms of the GNU General Public License
37 * as published by the Free Software Foundation; either version 37 * as published by the Free Software Foundation; either version
38 * 2 of the License, or (at your option) any later version. 38 * 2 of the License, or (at your option) any later version.
39 */ 39 */
40 #ifndef _SOCK_H 40 #ifndef _SOCK_H
41 #define _SOCK_H 41 #define _SOCK_H
42 42
43 #include <linux/kernel.h> 43 #include <linux/kernel.h>
44 #include <linux/list.h> 44 #include <linux/list.h>
45 #include <linux/list_nulls.h> 45 #include <linux/list_nulls.h>
46 #include <linux/timer.h> 46 #include <linux/timer.h>
47 #include <linux/cache.h> 47 #include <linux/cache.h>
48 #include <linux/module.h> 48 #include <linux/module.h>
49 #include <linux/lockdep.h> 49 #include <linux/lockdep.h>
50 #include <linux/netdevice.h> 50 #include <linux/netdevice.h>
51 #include <linux/skbuff.h> /* struct sk_buff */ 51 #include <linux/skbuff.h> /* struct sk_buff */
52 #include <linux/mm.h> 52 #include <linux/mm.h>
53 #include <linux/security.h> 53 #include <linux/security.h>
54 #include <linux/slab.h> 54 #include <linux/slab.h>
55 #include <linux/uaccess.h> 55 #include <linux/uaccess.h>
56 56
57 #include <linux/filter.h> 57 #include <linux/filter.h>
58 #include <linux/rculist_nulls.h> 58 #include <linux/rculist_nulls.h>
59 #include <linux/poll.h> 59 #include <linux/poll.h>
60 60
61 #include <linux/atomic.h> 61 #include <linux/atomic.h>
62 #include <net/dst.h> 62 #include <net/dst.h>
63 #include <net/checksum.h> 63 #include <net/checksum.h>
64 64
65 /* 65 /*
66 * This structure really needs to be cleaned up. 66 * This structure really needs to be cleaned up.
67 * Most of it is for TCP, and not used by any of 67 * Most of it is for TCP, and not used by any of
68 * the other protocols. 68 * the other protocols.
69 */ 69 */
70 70
71 /* Define this to get the SOCK_DBG debugging facility. */ 71 /* Define this to get the SOCK_DBG debugging facility. */
72 #define SOCK_DEBUGGING 72 #define SOCK_DEBUGGING
73 #ifdef SOCK_DEBUGGING 73 #ifdef SOCK_DEBUGGING
74 #define SOCK_DEBUG(sk, msg...) do { if ((sk) && sock_flag((sk), SOCK_DBG)) \ 74 #define SOCK_DEBUG(sk, msg...) do { if ((sk) && sock_flag((sk), SOCK_DBG)) \
75 printk(KERN_DEBUG msg); } while (0) 75 printk(KERN_DEBUG msg); } while (0)
76 #else 76 #else
77 /* Validate arguments and do nothing */ 77 /* Validate arguments and do nothing */
78 static inline void __attribute__ ((format (printf, 2, 3))) 78 static inline void __attribute__ ((format (printf, 2, 3)))
79 SOCK_DEBUG(struct sock *sk, const char *msg, ...) 79 SOCK_DEBUG(struct sock *sk, const char *msg, ...)
80 { 80 {
81 } 81 }
82 #endif 82 #endif
83 83
84 /* This is the per-socket lock. The spinlock provides a synchronization 84 /* This is the per-socket lock. The spinlock provides a synchronization
85 * between user contexts and software interrupt processing, whereas the 85 * between user contexts and software interrupt processing, whereas the
86 * mini-semaphore synchronizes multiple users amongst themselves. 86 * mini-semaphore synchronizes multiple users amongst themselves.
87 */ 87 */
88 typedef struct { 88 typedef struct {
89 spinlock_t slock; 89 spinlock_t slock;
90 int owned; 90 int owned;
91 wait_queue_head_t wq; 91 wait_queue_head_t wq;
92 /* 92 /*
93 * We express the mutex-alike socket_lock semantics 93 * We express the mutex-alike socket_lock semantics
94 * to the lock validator by explicitly managing 94 * to the lock validator by explicitly managing
95 * the slock as a lock variant (in addition to 95 * the slock as a lock variant (in addition to
96 * the slock itself): 96 * the slock itself):
97 */ 97 */
98 #ifdef CONFIG_DEBUG_LOCK_ALLOC 98 #ifdef CONFIG_DEBUG_LOCK_ALLOC
99 struct lockdep_map dep_map; 99 struct lockdep_map dep_map;
100 #endif 100 #endif
101 } socket_lock_t; 101 } socket_lock_t;
102 102
103 struct sock; 103 struct sock;
104 struct proto; 104 struct proto;
105 struct net; 105 struct net;
106 106
107 /** 107 /**
108 * struct sock_common - minimal network layer representation of sockets 108 * struct sock_common - minimal network layer representation of sockets
109 * @skc_daddr: Foreign IPv4 addr 109 * @skc_daddr: Foreign IPv4 addr
110 * @skc_rcv_saddr: Bound local IPv4 addr 110 * @skc_rcv_saddr: Bound local IPv4 addr
111 * @skc_hash: hash value used with various protocol lookup tables 111 * @skc_hash: hash value used with various protocol lookup tables
112 * @skc_u16hashes: two u16 hash values used by UDP lookup tables 112 * @skc_u16hashes: two u16 hash values used by UDP lookup tables
113 * @skc_family: network address family 113 * @skc_family: network address family
114 * @skc_state: Connection state 114 * @skc_state: Connection state
115 * @skc_reuse: %SO_REUSEADDR setting 115 * @skc_reuse: %SO_REUSEADDR setting
116 * @skc_bound_dev_if: bound device index if != 0 116 * @skc_bound_dev_if: bound device index if != 0
117 * @skc_bind_node: bind hash linkage for various protocol lookup tables 117 * @skc_bind_node: bind hash linkage for various protocol lookup tables
118 * @skc_portaddr_node: second hash linkage for UDP/UDP-Lite protocol 118 * @skc_portaddr_node: second hash linkage for UDP/UDP-Lite protocol
119 * @skc_prot: protocol handlers inside a network family 119 * @skc_prot: protocol handlers inside a network family
120 * @skc_net: reference to the network namespace of this socket 120 * @skc_net: reference to the network namespace of this socket
121 * @skc_node: main hash linkage for various protocol lookup tables 121 * @skc_node: main hash linkage for various protocol lookup tables
122 * @skc_nulls_node: main hash linkage for TCP/UDP/UDP-Lite protocol 122 * @skc_nulls_node: main hash linkage for TCP/UDP/UDP-Lite protocol
123 * @skc_tx_queue_mapping: tx queue number for this connection 123 * @skc_tx_queue_mapping: tx queue number for this connection
124 * @skc_refcnt: reference count 124 * @skc_refcnt: reference count
125 * 125 *
126 * This is the minimal network layer representation of sockets, the header 126 * This is the minimal network layer representation of sockets, the header
127 * for struct sock and struct inet_timewait_sock. 127 * for struct sock and struct inet_timewait_sock.
128 */ 128 */
129 struct sock_common { 129 struct sock_common {
130 /* skc_daddr and skc_rcv_saddr must be grouped : 130 /* skc_daddr and skc_rcv_saddr must be grouped :
131 * cf INET_MATCH() and INET_TW_MATCH() 131 * cf INET_MATCH() and INET_TW_MATCH()
132 */ 132 */
133 __be32 skc_daddr; 133 __be32 skc_daddr;
134 __be32 skc_rcv_saddr; 134 __be32 skc_rcv_saddr;
135 135
136 union { 136 union {
137 unsigned int skc_hash; 137 unsigned int skc_hash;
138 __u16 skc_u16hashes[2]; 138 __u16 skc_u16hashes[2];
139 }; 139 };
140 unsigned short skc_family; 140 unsigned short skc_family;
141 volatile unsigned char skc_state; 141 volatile unsigned char skc_state;
142 unsigned char skc_reuse; 142 unsigned char skc_reuse;
143 int skc_bound_dev_if; 143 int skc_bound_dev_if;
144 union { 144 union {
145 struct hlist_node skc_bind_node; 145 struct hlist_node skc_bind_node;
146 struct hlist_nulls_node skc_portaddr_node; 146 struct hlist_nulls_node skc_portaddr_node;
147 }; 147 };
148 struct proto *skc_prot; 148 struct proto *skc_prot;
149 #ifdef CONFIG_NET_NS 149 #ifdef CONFIG_NET_NS
150 struct net *skc_net; 150 struct net *skc_net;
151 #endif 151 #endif
152 /* 152 /*
153 * fields between dontcopy_begin/dontcopy_end 153 * fields between dontcopy_begin/dontcopy_end
154 * are not copied in sock_copy() 154 * are not copied in sock_copy()
155 */ 155 */
156 /* private: */ 156 /* private: */
157 int skc_dontcopy_begin[0]; 157 int skc_dontcopy_begin[0];
158 /* public: */ 158 /* public: */
159 union { 159 union {
160 struct hlist_node skc_node; 160 struct hlist_node skc_node;
161 struct hlist_nulls_node skc_nulls_node; 161 struct hlist_nulls_node skc_nulls_node;
162 }; 162 };
163 int skc_tx_queue_mapping; 163 int skc_tx_queue_mapping;
164 atomic_t skc_refcnt; 164 atomic_t skc_refcnt;
165 /* private: */ 165 /* private: */
166 int skc_dontcopy_end[0]; 166 int skc_dontcopy_end[0];
167 /* public: */ 167 /* public: */
168 }; 168 };
169 169
170 /** 170 /**
171 * struct sock - network layer representation of sockets 171 * struct sock - network layer representation of sockets
172 * @__sk_common: shared layout with inet_timewait_sock 172 * @__sk_common: shared layout with inet_timewait_sock
173 * @sk_shutdown: mask of %SEND_SHUTDOWN and/or %RCV_SHUTDOWN 173 * @sk_shutdown: mask of %SEND_SHUTDOWN and/or %RCV_SHUTDOWN
174 * @sk_userlocks: %SO_SNDBUF and %SO_RCVBUF settings 174 * @sk_userlocks: %SO_SNDBUF and %SO_RCVBUF settings
175 * @sk_lock: synchronizer 175 * @sk_lock: synchronizer
176 * @sk_rcvbuf: size of receive buffer in bytes 176 * @sk_rcvbuf: size of receive buffer in bytes
177 * @sk_wq: sock wait queue and async head 177 * @sk_wq: sock wait queue and async head
178 * @sk_dst_cache: destination cache 178 * @sk_dst_cache: destination cache
179 * @sk_dst_lock: destination cache lock 179 * @sk_dst_lock: destination cache lock
180 * @sk_policy: flow policy 180 * @sk_policy: flow policy
181 * @sk_receive_queue: incoming packets 181 * @sk_receive_queue: incoming packets
182 * @sk_wmem_alloc: transmit queue bytes committed 182 * @sk_wmem_alloc: transmit queue bytes committed
183 * @sk_write_queue: Packet sending queue 183 * @sk_write_queue: Packet sending queue
184 * @sk_async_wait_queue: DMA copied packets 184 * @sk_async_wait_queue: DMA copied packets
185 * @sk_omem_alloc: "o" is "option" or "other" 185 * @sk_omem_alloc: "o" is "option" or "other"
186 * @sk_wmem_queued: persistent queue size 186 * @sk_wmem_queued: persistent queue size
187 * @sk_forward_alloc: space allocated forward 187 * @sk_forward_alloc: space allocated forward
188 * @sk_allocation: allocation mode 188 * @sk_allocation: allocation mode
189 * @sk_sndbuf: size of send buffer in bytes 189 * @sk_sndbuf: size of send buffer in bytes
190 * @sk_flags: %SO_LINGER (l_onoff), %SO_BROADCAST, %SO_KEEPALIVE, 190 * @sk_flags: %SO_LINGER (l_onoff), %SO_BROADCAST, %SO_KEEPALIVE,
191 * %SO_OOBINLINE settings, %SO_TIMESTAMPING settings 191 * %SO_OOBINLINE settings, %SO_TIMESTAMPING settings
192 * @sk_no_check: %SO_NO_CHECK setting, wether or not checkup packets 192 * @sk_no_check: %SO_NO_CHECK setting, wether or not checkup packets
193 * @sk_route_caps: route capabilities (e.g. %NETIF_F_TSO) 193 * @sk_route_caps: route capabilities (e.g. %NETIF_F_TSO)
194 * @sk_route_nocaps: forbidden route capabilities (e.g NETIF_F_GSO_MASK) 194 * @sk_route_nocaps: forbidden route capabilities (e.g NETIF_F_GSO_MASK)
195 * @sk_gso_type: GSO type (e.g. %SKB_GSO_TCPV4) 195 * @sk_gso_type: GSO type (e.g. %SKB_GSO_TCPV4)
196 * @sk_gso_max_size: Maximum GSO segment size to build 196 * @sk_gso_max_size: Maximum GSO segment size to build
197 * @sk_lingertime: %SO_LINGER l_linger setting 197 * @sk_lingertime: %SO_LINGER l_linger setting
198 * @sk_backlog: always used with the per-socket spinlock held 198 * @sk_backlog: always used with the per-socket spinlock held
199 * @sk_callback_lock: used with the callbacks in the end of this struct 199 * @sk_callback_lock: used with the callbacks in the end of this struct
200 * @sk_error_queue: rarely used 200 * @sk_error_queue: rarely used
201 * @sk_prot_creator: sk_prot of original sock creator (see ipv6_setsockopt, 201 * @sk_prot_creator: sk_prot of original sock creator (see ipv6_setsockopt,
202 * IPV6_ADDRFORM for instance) 202 * IPV6_ADDRFORM for instance)
203 * @sk_err: last error 203 * @sk_err: last error
204 * @sk_err_soft: errors that don't cause failure but are the cause of a 204 * @sk_err_soft: errors that don't cause failure but are the cause of a
205 * persistent failure not just 'timed out' 205 * persistent failure not just 'timed out'
206 * @sk_drops: raw/udp drops counter 206 * @sk_drops: raw/udp drops counter
207 * @sk_ack_backlog: current listen backlog 207 * @sk_ack_backlog: current listen backlog
208 * @sk_max_ack_backlog: listen backlog set in listen() 208 * @sk_max_ack_backlog: listen backlog set in listen()
209 * @sk_priority: %SO_PRIORITY setting 209 * @sk_priority: %SO_PRIORITY setting
210 * @sk_type: socket type (%SOCK_STREAM, etc) 210 * @sk_type: socket type (%SOCK_STREAM, etc)
211 * @sk_protocol: which protocol this socket belongs in this network family 211 * @sk_protocol: which protocol this socket belongs in this network family
212 * @sk_peer_pid: &struct pid for this socket's peer 212 * @sk_peer_pid: &struct pid for this socket's peer
213 * @sk_peer_cred: %SO_PEERCRED setting 213 * @sk_peer_cred: %SO_PEERCRED setting
214 * @sk_rcvlowat: %SO_RCVLOWAT setting 214 * @sk_rcvlowat: %SO_RCVLOWAT setting
215 * @sk_rcvtimeo: %SO_RCVTIMEO setting 215 * @sk_rcvtimeo: %SO_RCVTIMEO setting
216 * @sk_sndtimeo: %SO_SNDTIMEO setting 216 * @sk_sndtimeo: %SO_SNDTIMEO setting
217 * @sk_rxhash: flow hash received from netif layer 217 * @sk_rxhash: flow hash received from netif layer
218 * @sk_filter: socket filtering instructions 218 * @sk_filter: socket filtering instructions
219 * @sk_protinfo: private area, net family specific, when not using slab 219 * @sk_protinfo: private area, net family specific, when not using slab
220 * @sk_timer: sock cleanup timer 220 * @sk_timer: sock cleanup timer
221 * @sk_stamp: time stamp of last packet received 221 * @sk_stamp: time stamp of last packet received
222 * @sk_socket: Identd and reporting IO signals 222 * @sk_socket: Identd and reporting IO signals
223 * @sk_user_data: RPC layer private data 223 * @sk_user_data: RPC layer private data
224 * @sk_sndmsg_page: cached page for sendmsg 224 * @sk_sndmsg_page: cached page for sendmsg
225 * @sk_sndmsg_off: cached offset for sendmsg 225 * @sk_sndmsg_off: cached offset for sendmsg
226 * @sk_send_head: front of stuff to transmit 226 * @sk_send_head: front of stuff to transmit
227 * @sk_security: used by security modules 227 * @sk_security: used by security modules
228 * @sk_mark: generic packet mark 228 * @sk_mark: generic packet mark
229 * @sk_classid: this socket's cgroup classid 229 * @sk_classid: this socket's cgroup classid
230 * @sk_write_pending: a write to stream socket waits to start 230 * @sk_write_pending: a write to stream socket waits to start
231 * @sk_state_change: callback to indicate change in the state of the sock 231 * @sk_state_change: callback to indicate change in the state of the sock
232 * @sk_data_ready: callback to indicate there is data to be processed 232 * @sk_data_ready: callback to indicate there is data to be processed
233 * @sk_write_space: callback to indicate there is bf sending space available 233 * @sk_write_space: callback to indicate there is bf sending space available
234 * @sk_error_report: callback to indicate errors (e.g. %MSG_ERRQUEUE) 234 * @sk_error_report: callback to indicate errors (e.g. %MSG_ERRQUEUE)
235 * @sk_backlog_rcv: callback to process the backlog 235 * @sk_backlog_rcv: callback to process the backlog
236 * @sk_destruct: called at sock freeing time, i.e. when all refcnt == 0 236 * @sk_destruct: called at sock freeing time, i.e. when all refcnt == 0
237 */ 237 */
238 struct sock { 238 struct sock {
239 /* 239 /*
240 * Now struct inet_timewait_sock also uses sock_common, so please just 240 * Now struct inet_timewait_sock also uses sock_common, so please just
241 * don't add nothing before this first member (__sk_common) --acme 241 * don't add nothing before this first member (__sk_common) --acme
242 */ 242 */
243 struct sock_common __sk_common; 243 struct sock_common __sk_common;
244 #define sk_node __sk_common.skc_node 244 #define sk_node __sk_common.skc_node
245 #define sk_nulls_node __sk_common.skc_nulls_node 245 #define sk_nulls_node __sk_common.skc_nulls_node
246 #define sk_refcnt __sk_common.skc_refcnt 246 #define sk_refcnt __sk_common.skc_refcnt
247 #define sk_tx_queue_mapping __sk_common.skc_tx_queue_mapping 247 #define sk_tx_queue_mapping __sk_common.skc_tx_queue_mapping
248 248
249 #define sk_dontcopy_begin __sk_common.skc_dontcopy_begin 249 #define sk_dontcopy_begin __sk_common.skc_dontcopy_begin
250 #define sk_dontcopy_end __sk_common.skc_dontcopy_end 250 #define sk_dontcopy_end __sk_common.skc_dontcopy_end
251 #define sk_hash __sk_common.skc_hash 251 #define sk_hash __sk_common.skc_hash
252 #define sk_family __sk_common.skc_family 252 #define sk_family __sk_common.skc_family
253 #define sk_state __sk_common.skc_state 253 #define sk_state __sk_common.skc_state
254 #define sk_reuse __sk_common.skc_reuse 254 #define sk_reuse __sk_common.skc_reuse
255 #define sk_bound_dev_if __sk_common.skc_bound_dev_if 255 #define sk_bound_dev_if __sk_common.skc_bound_dev_if
256 #define sk_bind_node __sk_common.skc_bind_node 256 #define sk_bind_node __sk_common.skc_bind_node
257 #define sk_prot __sk_common.skc_prot 257 #define sk_prot __sk_common.skc_prot
258 #define sk_net __sk_common.skc_net 258 #define sk_net __sk_common.skc_net
259 socket_lock_t sk_lock; 259 socket_lock_t sk_lock;
260 struct sk_buff_head sk_receive_queue; 260 struct sk_buff_head sk_receive_queue;
261 /* 261 /*
262 * The backlog queue is special, it is always used with 262 * The backlog queue is special, it is always used with
263 * the per-socket spinlock held and requires low latency 263 * the per-socket spinlock held and requires low latency
264 * access. Therefore we special case it's implementation. 264 * access. Therefore we special case it's implementation.
265 * Note : rmem_alloc is in this structure to fill a hole 265 * Note : rmem_alloc is in this structure to fill a hole
266 * on 64bit arches, not because its logically part of 266 * on 64bit arches, not because its logically part of
267 * backlog. 267 * backlog.
268 */ 268 */
269 struct { 269 struct {
270 atomic_t rmem_alloc; 270 atomic_t rmem_alloc;
271 int len; 271 int len;
272 struct sk_buff *head; 272 struct sk_buff *head;
273 struct sk_buff *tail; 273 struct sk_buff *tail;
274 } sk_backlog; 274 } sk_backlog;
275 #define sk_rmem_alloc sk_backlog.rmem_alloc 275 #define sk_rmem_alloc sk_backlog.rmem_alloc
276 int sk_forward_alloc; 276 int sk_forward_alloc;
277 #ifdef CONFIG_RPS 277 #ifdef CONFIG_RPS
278 __u32 sk_rxhash; 278 __u32 sk_rxhash;
279 #endif 279 #endif
280 atomic_t sk_drops; 280 atomic_t sk_drops;
281 int sk_rcvbuf; 281 int sk_rcvbuf;
282 282
283 struct sk_filter __rcu *sk_filter; 283 struct sk_filter __rcu *sk_filter;
284 struct socket_wq __rcu *sk_wq; 284 struct socket_wq __rcu *sk_wq;
285 285
286 #ifdef CONFIG_NET_DMA 286 #ifdef CONFIG_NET_DMA
287 struct sk_buff_head sk_async_wait_queue; 287 struct sk_buff_head sk_async_wait_queue;
288 #endif 288 #endif
289 289
290 #ifdef CONFIG_XFRM 290 #ifdef CONFIG_XFRM
291 struct xfrm_policy *sk_policy[2]; 291 struct xfrm_policy *sk_policy[2];
292 #endif 292 #endif
293 unsigned long sk_flags; 293 unsigned long sk_flags;
294 struct dst_entry *sk_dst_cache; 294 struct dst_entry *sk_dst_cache;
295 spinlock_t sk_dst_lock; 295 spinlock_t sk_dst_lock;
296 atomic_t sk_wmem_alloc; 296 atomic_t sk_wmem_alloc;
297 atomic_t sk_omem_alloc; 297 atomic_t sk_omem_alloc;
298 int sk_sndbuf; 298 int sk_sndbuf;
299 struct sk_buff_head sk_write_queue; 299 struct sk_buff_head sk_write_queue;
300 kmemcheck_bitfield_begin(flags); 300 kmemcheck_bitfield_begin(flags);
301 unsigned int sk_shutdown : 2, 301 unsigned int sk_shutdown : 2,
302 sk_no_check : 2, 302 sk_no_check : 2,
303 sk_userlocks : 4, 303 sk_userlocks : 4,
304 sk_protocol : 8, 304 sk_protocol : 8,
305 sk_type : 16; 305 sk_type : 16;
306 kmemcheck_bitfield_end(flags); 306 kmemcheck_bitfield_end(flags);
307 int sk_wmem_queued; 307 int sk_wmem_queued;
308 gfp_t sk_allocation; 308 gfp_t sk_allocation;
309 int sk_route_caps; 309 int sk_route_caps;
310 int sk_route_nocaps; 310 int sk_route_nocaps;
311 int sk_gso_type; 311 int sk_gso_type;
312 unsigned int sk_gso_max_size; 312 unsigned int sk_gso_max_size;
313 int sk_rcvlowat; 313 int sk_rcvlowat;
314 unsigned long sk_lingertime; 314 unsigned long sk_lingertime;
315 struct sk_buff_head sk_error_queue; 315 struct sk_buff_head sk_error_queue;
316 struct proto *sk_prot_creator; 316 struct proto *sk_prot_creator;
317 rwlock_t sk_callback_lock; 317 rwlock_t sk_callback_lock;
318 int sk_err, 318 int sk_err,
319 sk_err_soft; 319 sk_err_soft;
320 unsigned short sk_ack_backlog; 320 unsigned short sk_ack_backlog;
321 unsigned short sk_max_ack_backlog; 321 unsigned short sk_max_ack_backlog;
322 __u32 sk_priority; 322 __u32 sk_priority;
323 struct pid *sk_peer_pid; 323 struct pid *sk_peer_pid;
324 const struct cred *sk_peer_cred; 324 const struct cred *sk_peer_cred;
325 long sk_rcvtimeo; 325 long sk_rcvtimeo;
326 long sk_sndtimeo; 326 long sk_sndtimeo;
327 void *sk_protinfo; 327 void *sk_protinfo;
328 struct timer_list sk_timer; 328 struct timer_list sk_timer;
329 ktime_t sk_stamp; 329 ktime_t sk_stamp;
330 struct socket *sk_socket; 330 struct socket *sk_socket;
331 void *sk_user_data; 331 void *sk_user_data;
332 struct page *sk_sndmsg_page; 332 struct page *sk_sndmsg_page;
333 struct sk_buff *sk_send_head; 333 struct sk_buff *sk_send_head;
334 __u32 sk_sndmsg_off; 334 __u32 sk_sndmsg_off;
335 int sk_write_pending; 335 int sk_write_pending;
336 #ifdef CONFIG_SECURITY 336 #ifdef CONFIG_SECURITY
337 void *sk_security; 337 void *sk_security;
338 #endif 338 #endif
339 __u32 sk_mark; 339 __u32 sk_mark;
340 u32 sk_classid; 340 u32 sk_classid;
341 void (*sk_state_change)(struct sock *sk); 341 void (*sk_state_change)(struct sock *sk);
342 void (*sk_data_ready)(struct sock *sk, int bytes); 342 void (*sk_data_ready)(struct sock *sk, int bytes);
343 void (*sk_write_space)(struct sock *sk); 343 void (*sk_write_space)(struct sock *sk);
344 void (*sk_error_report)(struct sock *sk); 344 void (*sk_error_report)(struct sock *sk);
345 int (*sk_backlog_rcv)(struct sock *sk, 345 int (*sk_backlog_rcv)(struct sock *sk,
346 struct sk_buff *skb); 346 struct sk_buff *skb);
347 void (*sk_destruct)(struct sock *sk); 347 void (*sk_destruct)(struct sock *sk);
348 }; 348 };
349 349
350 /* 350 /*
351 * Hashed lists helper routines 351 * Hashed lists helper routines
352 */ 352 */
353 static inline struct sock *sk_entry(const struct hlist_node *node) 353 static inline struct sock *sk_entry(const struct hlist_node *node)
354 { 354 {
355 return hlist_entry(node, struct sock, sk_node); 355 return hlist_entry(node, struct sock, sk_node);
356 } 356 }
357 357
358 static inline struct sock *__sk_head(const struct hlist_head *head) 358 static inline struct sock *__sk_head(const struct hlist_head *head)
359 { 359 {
360 return hlist_entry(head->first, struct sock, sk_node); 360 return hlist_entry(head->first, struct sock, sk_node);
361 } 361 }
362 362
363 static inline struct sock *sk_head(const struct hlist_head *head) 363 static inline struct sock *sk_head(const struct hlist_head *head)
364 { 364 {
365 return hlist_empty(head) ? NULL : __sk_head(head); 365 return hlist_empty(head) ? NULL : __sk_head(head);
366 } 366 }
367 367
368 static inline struct sock *__sk_nulls_head(const struct hlist_nulls_head *head) 368 static inline struct sock *__sk_nulls_head(const struct hlist_nulls_head *head)
369 { 369 {
370 return hlist_nulls_entry(head->first, struct sock, sk_nulls_node); 370 return hlist_nulls_entry(head->first, struct sock, sk_nulls_node);
371 } 371 }
372 372
373 static inline struct sock *sk_nulls_head(const struct hlist_nulls_head *head) 373 static inline struct sock *sk_nulls_head(const struct hlist_nulls_head *head)
374 { 374 {
375 return hlist_nulls_empty(head) ? NULL : __sk_nulls_head(head); 375 return hlist_nulls_empty(head) ? NULL : __sk_nulls_head(head);
376 } 376 }
377 377
378 static inline struct sock *sk_next(const struct sock *sk) 378 static inline struct sock *sk_next(const struct sock *sk)
379 { 379 {
380 return sk->sk_node.next ? 380 return sk->sk_node.next ?
381 hlist_entry(sk->sk_node.next, struct sock, sk_node) : NULL; 381 hlist_entry(sk->sk_node.next, struct sock, sk_node) : NULL;
382 } 382 }
383 383
384 static inline struct sock *sk_nulls_next(const struct sock *sk) 384 static inline struct sock *sk_nulls_next(const struct sock *sk)
385 { 385 {
386 return (!is_a_nulls(sk->sk_nulls_node.next)) ? 386 return (!is_a_nulls(sk->sk_nulls_node.next)) ?
387 hlist_nulls_entry(sk->sk_nulls_node.next, 387 hlist_nulls_entry(sk->sk_nulls_node.next,
388 struct sock, sk_nulls_node) : 388 struct sock, sk_nulls_node) :
389 NULL; 389 NULL;
390 } 390 }
391 391
392 static inline int sk_unhashed(const struct sock *sk) 392 static inline int sk_unhashed(const struct sock *sk)
393 { 393 {
394 return hlist_unhashed(&sk->sk_node); 394 return hlist_unhashed(&sk->sk_node);
395 } 395 }
396 396
397 static inline int sk_hashed(const struct sock *sk) 397 static inline int sk_hashed(const struct sock *sk)
398 { 398 {
399 return !sk_unhashed(sk); 399 return !sk_unhashed(sk);
400 } 400 }
401 401
402 static __inline__ void sk_node_init(struct hlist_node *node) 402 static __inline__ void sk_node_init(struct hlist_node *node)
403 { 403 {
404 node->pprev = NULL; 404 node->pprev = NULL;
405 } 405 }
406 406
407 static __inline__ void sk_nulls_node_init(struct hlist_nulls_node *node) 407 static __inline__ void sk_nulls_node_init(struct hlist_nulls_node *node)
408 { 408 {
409 node->pprev = NULL; 409 node->pprev = NULL;
410 } 410 }
411 411
412 static __inline__ void __sk_del_node(struct sock *sk) 412 static __inline__ void __sk_del_node(struct sock *sk)
413 { 413 {
414 __hlist_del(&sk->sk_node); 414 __hlist_del(&sk->sk_node);
415 } 415 }
416 416
417 /* NB: equivalent to hlist_del_init_rcu */ 417 /* NB: equivalent to hlist_del_init_rcu */
418 static __inline__ int __sk_del_node_init(struct sock *sk) 418 static __inline__ int __sk_del_node_init(struct sock *sk)
419 { 419 {
420 if (sk_hashed(sk)) { 420 if (sk_hashed(sk)) {
421 __sk_del_node(sk); 421 __sk_del_node(sk);
422 sk_node_init(&sk->sk_node); 422 sk_node_init(&sk->sk_node);
423 return 1; 423 return 1;
424 } 424 }
425 return 0; 425 return 0;
426 } 426 }
427 427
428 /* Grab socket reference count. This operation is valid only 428 /* Grab socket reference count. This operation is valid only
429 when sk is ALREADY grabbed f.e. it is found in hash table 429 when sk is ALREADY grabbed f.e. it is found in hash table
430 or a list and the lookup is made under lock preventing hash table 430 or a list and the lookup is made under lock preventing hash table
431 modifications. 431 modifications.
432 */ 432 */
433 433
434 static inline void sock_hold(struct sock *sk) 434 static inline void sock_hold(struct sock *sk)
435 { 435 {
436 atomic_inc(&sk->sk_refcnt); 436 atomic_inc(&sk->sk_refcnt);
437 } 437 }
438 438
439 /* Ungrab socket in the context, which assumes that socket refcnt 439 /* Ungrab socket in the context, which assumes that socket refcnt
440 cannot hit zero, f.e. it is true in context of any socketcall. 440 cannot hit zero, f.e. it is true in context of any socketcall.
441 */ 441 */
442 static inline void __sock_put(struct sock *sk) 442 static inline void __sock_put(struct sock *sk)
443 { 443 {
444 atomic_dec(&sk->sk_refcnt); 444 atomic_dec(&sk->sk_refcnt);
445 } 445 }
446 446
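Per the comment above, sock_hold() is only legal while something else already pins the socket, e.g. a hash-chain lock during lookup. A hedged sketch of that rule (the function and chain_lock are hypothetical; the reference taken here is later dropped with sock_put(), declared further down in this header):

static struct sock *example_grab(struct sock *sk, spinlock_t *chain_lock)
{
	spin_lock(chain_lock);		/* the hash chain cannot change under us */
	if (sk_hashed(sk))
		sock_hold(sk);		/* safe: sk is still reachable via the chain */
	else
		sk = NULL;
	spin_unlock(chain_lock);
	return sk;			/* caller releases with sock_put() */
}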
447 static __inline__ int sk_del_node_init(struct sock *sk) 447 static __inline__ int sk_del_node_init(struct sock *sk)
448 { 448 {
449 int rc = __sk_del_node_init(sk); 449 int rc = __sk_del_node_init(sk);
450 450
451 if (rc) { 451 if (rc) {
452 /* paranoid for a while -acme */ 452 /* paranoid for a while -acme */
453 WARN_ON(atomic_read(&sk->sk_refcnt) == 1); 453 WARN_ON(atomic_read(&sk->sk_refcnt) == 1);
454 __sock_put(sk); 454 __sock_put(sk);
455 } 455 }
456 return rc; 456 return rc;
457 } 457 }
458 #define sk_del_node_init_rcu(sk) sk_del_node_init(sk) 458 #define sk_del_node_init_rcu(sk) sk_del_node_init(sk)
459 459
460 static __inline__ int __sk_nulls_del_node_init_rcu(struct sock *sk) 460 static __inline__ int __sk_nulls_del_node_init_rcu(struct sock *sk)
461 { 461 {
462 if (sk_hashed(sk)) { 462 if (sk_hashed(sk)) {
463 hlist_nulls_del_init_rcu(&sk->sk_nulls_node); 463 hlist_nulls_del_init_rcu(&sk->sk_nulls_node);
464 return 1; 464 return 1;
465 } 465 }
466 return 0; 466 return 0;
467 } 467 }
468 468
469 static __inline__ int sk_nulls_del_node_init_rcu(struct sock *sk) 469 static __inline__ int sk_nulls_del_node_init_rcu(struct sock *sk)
470 { 470 {
471 int rc = __sk_nulls_del_node_init_rcu(sk); 471 int rc = __sk_nulls_del_node_init_rcu(sk);
472 472
473 if (rc) { 473 if (rc) {
474 /* paranoid for a while -acme */ 474 /* paranoid for a while -acme */
475 WARN_ON(atomic_read(&sk->sk_refcnt) == 1); 475 WARN_ON(atomic_read(&sk->sk_refcnt) == 1);
476 __sock_put(sk); 476 __sock_put(sk);
477 } 477 }
478 return rc; 478 return rc;
479 } 479 }
480 480
481 static __inline__ void __sk_add_node(struct sock *sk, struct hlist_head *list) 481 static __inline__ void __sk_add_node(struct sock *sk, struct hlist_head *list)
482 { 482 {
483 hlist_add_head(&sk->sk_node, list); 483 hlist_add_head(&sk->sk_node, list);
484 } 484 }
485 485
486 static __inline__ void sk_add_node(struct sock *sk, struct hlist_head *list) 486 static __inline__ void sk_add_node(struct sock *sk, struct hlist_head *list)
487 { 487 {
488 sock_hold(sk); 488 sock_hold(sk);
489 __sk_add_node(sk, list); 489 __sk_add_node(sk, list);
490 } 490 }
491 491
492 static __inline__ void sk_add_node_rcu(struct sock *sk, struct hlist_head *list) 492 static __inline__ void sk_add_node_rcu(struct sock *sk, struct hlist_head *list)
493 { 493 {
494 sock_hold(sk); 494 sock_hold(sk);
495 hlist_add_head_rcu(&sk->sk_node, list); 495 hlist_add_head_rcu(&sk->sk_node, list);
496 } 496 }
497 497
498 static __inline__ void __sk_nulls_add_node_rcu(struct sock *sk, struct hlist_nulls_head *list) 498 static __inline__ void __sk_nulls_add_node_rcu(struct sock *sk, struct hlist_nulls_head *list)
499 { 499 {
500 hlist_nulls_add_head_rcu(&sk->sk_nulls_node, list); 500 hlist_nulls_add_head_rcu(&sk->sk_nulls_node, list);
501 } 501 }
502 502
503 static __inline__ void sk_nulls_add_node_rcu(struct sock *sk, struct hlist_nulls_head *list) 503 static __inline__ void sk_nulls_add_node_rcu(struct sock *sk, struct hlist_nulls_head *list)
504 { 504 {
505 sock_hold(sk); 505 sock_hold(sk);
506 __sk_nulls_add_node_rcu(sk, list); 506 __sk_nulls_add_node_rcu(sk, list);
507 } 507 }
508 508
509 static __inline__ void __sk_del_bind_node(struct sock *sk) 509 static __inline__ void __sk_del_bind_node(struct sock *sk)
510 { 510 {
511 __hlist_del(&sk->sk_bind_node); 511 __hlist_del(&sk->sk_bind_node);
512 } 512 }
513 513
514 static __inline__ void sk_add_bind_node(struct sock *sk, 514 static __inline__ void sk_add_bind_node(struct sock *sk,
515 struct hlist_head *list) 515 struct hlist_head *list)
516 { 516 {
517 hlist_add_head(&sk->sk_bind_node, list); 517 hlist_add_head(&sk->sk_bind_node, list);
518 } 518 }
519 519
520 #define sk_for_each(__sk, node, list) \ 520 #define sk_for_each(__sk, node, list) \
521 hlist_for_each_entry(__sk, node, list, sk_node) 521 hlist_for_each_entry(__sk, node, list, sk_node)
522 #define sk_for_each_rcu(__sk, node, list) \ 522 #define sk_for_each_rcu(__sk, node, list) \
523 hlist_for_each_entry_rcu(__sk, node, list, sk_node) 523 hlist_for_each_entry_rcu(__sk, node, list, sk_node)
524 #define sk_nulls_for_each(__sk, node, list) \ 524 #define sk_nulls_for_each(__sk, node, list) \
525 hlist_nulls_for_each_entry(__sk, node, list, sk_nulls_node) 525 hlist_nulls_for_each_entry(__sk, node, list, sk_nulls_node)
526 #define sk_nulls_for_each_rcu(__sk, node, list) \ 526 #define sk_nulls_for_each_rcu(__sk, node, list) \
527 hlist_nulls_for_each_entry_rcu(__sk, node, list, sk_nulls_node) 527 hlist_nulls_for_each_entry_rcu(__sk, node, list, sk_nulls_node)
528 #define sk_for_each_from(__sk, node) \ 528 #define sk_for_each_from(__sk, node) \
529 if (__sk && ({ node = &(__sk)->sk_node; 1; })) \ 529 if (__sk && ({ node = &(__sk)->sk_node; 1; })) \
530 hlist_for_each_entry_from(__sk, node, sk_node) 530 hlist_for_each_entry_from(__sk, node, sk_node)
531 #define sk_nulls_for_each_from(__sk, node) \ 531 #define sk_nulls_for_each_from(__sk, node) \
532 if (__sk && ({ node = &(__sk)->sk_nulls_node; 1; })) \ 532 if (__sk && ({ node = &(__sk)->sk_nulls_node; 1; })) \
533 hlist_nulls_for_each_entry_from(__sk, node, sk_nulls_node) 533 hlist_nulls_for_each_entry_from(__sk, node, sk_nulls_node)
534 #define sk_for_each_safe(__sk, node, tmp, list) \ 534 #define sk_for_each_safe(__sk, node, tmp, list) \
535 hlist_for_each_entry_safe(__sk, node, tmp, list, sk_node) 535 hlist_for_each_entry_safe(__sk, node, tmp, list, sk_node)
536 #define sk_for_each_bound(__sk, node, list) \ 536 #define sk_for_each_bound(__sk, node, list) \
537 hlist_for_each_entry(__sk, node, list, sk_bind_node) 537 hlist_for_each_entry(__sk, node, list, sk_bind_node)
538 538
539 /* Sock flags */ 539 /* Sock flags */
540 enum sock_flags { 540 enum sock_flags {
541 SOCK_DEAD, 541 SOCK_DEAD,
542 SOCK_DONE, 542 SOCK_DONE,
543 SOCK_URGINLINE, 543 SOCK_URGINLINE,
544 SOCK_KEEPOPEN, 544 SOCK_KEEPOPEN,
545 SOCK_LINGER, 545 SOCK_LINGER,
546 SOCK_DESTROY, 546 SOCK_DESTROY,
547 SOCK_BROADCAST, 547 SOCK_BROADCAST,
548 SOCK_TIMESTAMP, 548 SOCK_TIMESTAMP,
549 SOCK_ZAPPED, 549 SOCK_ZAPPED,
550 SOCK_USE_WRITE_QUEUE, /* whether to call sk->sk_write_space in sock_wfree */ 550 SOCK_USE_WRITE_QUEUE, /* whether to call sk->sk_write_space in sock_wfree */
551 SOCK_DBG, /* %SO_DEBUG setting */ 551 SOCK_DBG, /* %SO_DEBUG setting */
552 SOCK_RCVTSTAMP, /* %SO_TIMESTAMP setting */ 552 SOCK_RCVTSTAMP, /* %SO_TIMESTAMP setting */
553 SOCK_RCVTSTAMPNS, /* %SO_TIMESTAMPNS setting */ 553 SOCK_RCVTSTAMPNS, /* %SO_TIMESTAMPNS setting */
554 SOCK_LOCALROUTE, /* route locally only, %SO_DONTROUTE setting */ 554 SOCK_LOCALROUTE, /* route locally only, %SO_DONTROUTE setting */
555 SOCK_QUEUE_SHRUNK, /* write queue has been shrunk recently */ 555 SOCK_QUEUE_SHRUNK, /* write queue has been shrunk recently */
556 SOCK_TIMESTAMPING_TX_HARDWARE, /* %SOF_TIMESTAMPING_TX_HARDWARE */ 556 SOCK_TIMESTAMPING_TX_HARDWARE, /* %SOF_TIMESTAMPING_TX_HARDWARE */
557 SOCK_TIMESTAMPING_TX_SOFTWARE, /* %SOF_TIMESTAMPING_TX_SOFTWARE */ 557 SOCK_TIMESTAMPING_TX_SOFTWARE, /* %SOF_TIMESTAMPING_TX_SOFTWARE */
558 SOCK_TIMESTAMPING_RX_HARDWARE, /* %SOF_TIMESTAMPING_RX_HARDWARE */ 558 SOCK_TIMESTAMPING_RX_HARDWARE, /* %SOF_TIMESTAMPING_RX_HARDWARE */
559 SOCK_TIMESTAMPING_RX_SOFTWARE, /* %SOF_TIMESTAMPING_RX_SOFTWARE */ 559 SOCK_TIMESTAMPING_RX_SOFTWARE, /* %SOF_TIMESTAMPING_RX_SOFTWARE */
560 SOCK_TIMESTAMPING_SOFTWARE, /* %SOF_TIMESTAMPING_SOFTWARE */ 560 SOCK_TIMESTAMPING_SOFTWARE, /* %SOF_TIMESTAMPING_SOFTWARE */
561 SOCK_TIMESTAMPING_RAW_HARDWARE, /* %SOF_TIMESTAMPING_RAW_HARDWARE */ 561 SOCK_TIMESTAMPING_RAW_HARDWARE, /* %SOF_TIMESTAMPING_RAW_HARDWARE */
562 SOCK_TIMESTAMPING_SYS_HARDWARE, /* %SOF_TIMESTAMPING_SYS_HARDWARE */ 562 SOCK_TIMESTAMPING_SYS_HARDWARE, /* %SOF_TIMESTAMPING_SYS_HARDWARE */
563 SOCK_FASYNC, /* fasync() active */ 563 SOCK_FASYNC, /* fasync() active */
564 SOCK_RXQ_OVFL, 564 SOCK_RXQ_OVFL,
565 }; 565 };
566 566
567 static inline void sock_copy_flags(struct sock *nsk, struct sock *osk) 567 static inline void sock_copy_flags(struct sock *nsk, struct sock *osk)
568 { 568 {
569 nsk->sk_flags = osk->sk_flags; 569 nsk->sk_flags = osk->sk_flags;
570 } 570 }
571 571
572 static inline void sock_set_flag(struct sock *sk, enum sock_flags flag) 572 static inline void sock_set_flag(struct sock *sk, enum sock_flags flag)
573 { 573 {
574 __set_bit(flag, &sk->sk_flags); 574 __set_bit(flag, &sk->sk_flags);
575 } 575 }
576 576
577 static inline void sock_reset_flag(struct sock *sk, enum sock_flags flag) 577 static inline void sock_reset_flag(struct sock *sk, enum sock_flags flag)
578 { 578 {
579 __clear_bit(flag, &sk->sk_flags); 579 __clear_bit(flag, &sk->sk_flags);
580 } 580 }
581 581
582 static inline int sock_flag(struct sock *sk, enum sock_flags flag) 582 static inline int sock_flag(struct sock *sk, enum sock_flags flag)
583 { 583 {
584 return test_bit(flag, &sk->sk_flags); 584 return test_bit(flag, &sk->sk_flags);
585 } 585 }
586 586
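A small sketch of the flag helpers (hypothetical function): SOCK_DBG is the bit behind the SOCK_DEBUG() macro near the top of this header, and setting it here is roughly what the %SO_DEBUG option handler does internally.

static void example_enable_debug(struct sock *sk)
{
	sock_set_flag(sk, SOCK_DBG);		/* mark the socket for debugging */
	if (sock_flag(sk, SOCK_DBG))
		SOCK_DEBUG(sk, "socket debugging enabled\n");
}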
587 static inline void sk_acceptq_removed(struct sock *sk) 587 static inline void sk_acceptq_removed(struct sock *sk)
588 { 588 {
589 sk->sk_ack_backlog--; 589 sk->sk_ack_backlog--;
590 } 590 }
591 591
592 static inline void sk_acceptq_added(struct sock *sk) 592 static inline void sk_acceptq_added(struct sock *sk)
593 { 593 {
594 sk->sk_ack_backlog++; 594 sk->sk_ack_backlog++;
595 } 595 }
596 596
597 static inline int sk_acceptq_is_full(struct sock *sk) 597 static inline int sk_acceptq_is_full(struct sock *sk)
598 { 598 {
599 return sk->sk_ack_backlog > sk->sk_max_ack_backlog; 599 return sk->sk_ack_backlog > sk->sk_max_ack_backlog;
600 } 600 }
601 601
602 /* 602 /*
603 * Compute minimal free write space needed to queue new packets. 603 * Compute minimal free write space needed to queue new packets.
604 */ 604 */
605 static inline int sk_stream_min_wspace(struct sock *sk) 605 static inline int sk_stream_min_wspace(struct sock *sk)
606 { 606 {
607 return sk->sk_wmem_queued >> 1; 607 return sk->sk_wmem_queued >> 1;
608 } 608 }
609 609
610 static inline int sk_stream_wspace(struct sock *sk) 610 static inline int sk_stream_wspace(struct sock *sk)
611 { 611 {
612 return sk->sk_sndbuf - sk->sk_wmem_queued; 612 return sk->sk_sndbuf - sk->sk_wmem_queued;
613 } 613 }
614 614
615 extern void sk_stream_write_space(struct sock *sk); 615 extern void sk_stream_write_space(struct sock *sk);
616 616
617 static inline int sk_stream_memory_free(struct sock *sk) 617 static inline int sk_stream_memory_free(struct sock *sk)
618 { 618 {
619 return sk->sk_wmem_queued < sk->sk_sndbuf; 619 return sk->sk_wmem_queued < sk->sk_sndbuf;
620 } 620 }
621 621
622 /* OOB backlog add */ 622 /* OOB backlog add */
623 static inline void __sk_add_backlog(struct sock *sk, struct sk_buff *skb) 623 static inline void __sk_add_backlog(struct sock *sk, struct sk_buff *skb)
624 { 624 {
625 /* dont let skb dst not refcounted, we are going to leave rcu lock */ 625 /* dont let skb dst not refcounted, we are going to leave rcu lock */
626 skb_dst_force(skb); 626 skb_dst_force(skb);
627 627
628 if (!sk->sk_backlog.tail) 628 if (!sk->sk_backlog.tail)
629 sk->sk_backlog.head = skb; 629 sk->sk_backlog.head = skb;
630 else 630 else
631 sk->sk_backlog.tail->next = skb; 631 sk->sk_backlog.tail->next = skb;
632 632
633 sk->sk_backlog.tail = skb; 633 sk->sk_backlog.tail = skb;
634 skb->next = NULL; 634 skb->next = NULL;
635 } 635 }
636 636
637 /* 637 /*
638 * Take into account size of receive queue and backlog queue 638 * Take into account size of receive queue and backlog queue
639 */ 639 */
640 static inline bool sk_rcvqueues_full(const struct sock *sk, const struct sk_buff *skb) 640 static inline bool sk_rcvqueues_full(const struct sock *sk, const struct sk_buff *skb)
641 { 641 {
642 unsigned int qsize = sk->sk_backlog.len + atomic_read(&sk->sk_rmem_alloc); 642 unsigned int qsize = sk->sk_backlog.len + atomic_read(&sk->sk_rmem_alloc);
643 643
644 return qsize + skb->truesize > sk->sk_rcvbuf; 644 return qsize + skb->truesize > sk->sk_rcvbuf;
645 } 645 }
646 646
647 /* The per-socket spinlock must be held here. */ 647 /* The per-socket spinlock must be held here. */
648 static inline __must_check int sk_add_backlog(struct sock *sk, struct sk_buff *skb) 648 static inline __must_check int sk_add_backlog(struct sock *sk, struct sk_buff *skb)
649 { 649 {
650 if (sk_rcvqueues_full(sk, skb)) 650 if (sk_rcvqueues_full(sk, skb))
651 return -ENOBUFS; 651 return -ENOBUFS;
652 652
653 __sk_add_backlog(sk, skb); 653 __sk_add_backlog(sk, skb);
654 sk->sk_backlog.len += skb->truesize; 654 sk->sk_backlog.len += skb->truesize;
655 return 0; 655 return 0;
656 } 656 }
657 657
658 static inline int sk_backlog_rcv(struct sock *sk, struct sk_buff *skb) 658 static inline int sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
659 { 659 {
660 return sk->sk_backlog_rcv(sk, skb); 660 return sk->sk_backlog_rcv(sk, skb);
661 } 661 }
662 662
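sk_add_backlog() is meant for the softirq receive path when a process context currently owns the socket; release_sock() later replays the backlog through sk_backlog_rcv(). A sketch of that usual pattern (the function name is hypothetical, modelled on what protocol ->rcv handlers do):

static int example_do_rcv(struct sock *sk, struct sk_buff *skb)
{
	int rc = 0;

	bh_lock_sock(sk);			/* the per-socket spinlock */
	if (!sock_owned_by_user(sk)) {
		rc = sk_backlog_rcv(sk, skb);	/* process directly */
	} else if (sk_add_backlog(sk, skb)) {
		/* receive queue plus backlog exceeded sk_rcvbuf: drop */
		kfree_skb(skb);
		rc = -ENOBUFS;
	}
	bh_unlock_sock(sk);
	return rc;
}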
663 static inline void sock_rps_record_flow(const struct sock *sk) 663 static inline void sock_rps_record_flow(const struct sock *sk)
664 { 664 {
665 #ifdef CONFIG_RPS 665 #ifdef CONFIG_RPS
666 struct rps_sock_flow_table *sock_flow_table; 666 struct rps_sock_flow_table *sock_flow_table;
667 667
668 rcu_read_lock(); 668 rcu_read_lock();
669 sock_flow_table = rcu_dereference(rps_sock_flow_table); 669 sock_flow_table = rcu_dereference(rps_sock_flow_table);
670 rps_record_sock_flow(sock_flow_table, sk->sk_rxhash); 670 rps_record_sock_flow(sock_flow_table, sk->sk_rxhash);
671 rcu_read_unlock(); 671 rcu_read_unlock();
672 #endif 672 #endif
673 } 673 }
674 674
675 static inline void sock_rps_reset_flow(const struct sock *sk) 675 static inline void sock_rps_reset_flow(const struct sock *sk)
676 { 676 {
677 #ifdef CONFIG_RPS 677 #ifdef CONFIG_RPS
678 struct rps_sock_flow_table *sock_flow_table; 678 struct rps_sock_flow_table *sock_flow_table;
679 679
680 rcu_read_lock(); 680 rcu_read_lock();
681 sock_flow_table = rcu_dereference(rps_sock_flow_table); 681 sock_flow_table = rcu_dereference(rps_sock_flow_table);
682 rps_reset_sock_flow(sock_flow_table, sk->sk_rxhash); 682 rps_reset_sock_flow(sock_flow_table, sk->sk_rxhash);
683 rcu_read_unlock(); 683 rcu_read_unlock();
684 #endif 684 #endif
685 } 685 }
686 686
687 static inline void sock_rps_save_rxhash(struct sock *sk, u32 rxhash) 687 static inline void sock_rps_save_rxhash(struct sock *sk, u32 rxhash)
688 { 688 {
689 #ifdef CONFIG_RPS 689 #ifdef CONFIG_RPS
690 if (unlikely(sk->sk_rxhash != rxhash)) { 690 if (unlikely(sk->sk_rxhash != rxhash)) {
691 sock_rps_reset_flow(sk); 691 sock_rps_reset_flow(sk);
692 sk->sk_rxhash = rxhash; 692 sk->sk_rxhash = rxhash;
693 } 693 }
694 #endif 694 #endif
695 } 695 }
696 696
697 #define sk_wait_event(__sk, __timeo, __condition) \ 697 #define sk_wait_event(__sk, __timeo, __condition) \
698 ({ int __rc; \ 698 ({ int __rc; \
699 release_sock(__sk); \ 699 release_sock(__sk); \
700 __rc = __condition; \ 700 __rc = __condition; \
701 if (!__rc) { \ 701 if (!__rc) { \
702 *(__timeo) = schedule_timeout(*(__timeo)); \ 702 *(__timeo) = schedule_timeout(*(__timeo)); \
703 } \ 703 } \
704 lock_sock(__sk); \ 704 lock_sock(__sk); \
705 __rc = __condition; \ 705 __rc = __condition; \
706 __rc; \ 706 __rc; \
707 }) 707 })
708 708
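sk_wait_event() releases the socket lock around the sleep and re-checks the condition after re-acquiring it, so callers wrap it in a prepare_to_wait()/finish_wait() pair on sk_sleep(sk). A sketch in the style of the sk_stream_wait_*() helpers declared just below, assuming the caller already holds the socket lock via lock_sock():

#include <linux/wait.h>

static int example_wait_for_sndbuf(struct sock *sk, long *timeo)
{
	DEFINE_WAIT(wait);
	int done;

	prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
	done = sk_wait_event(sk, timeo, sk_stream_memory_free(sk));
	finish_wait(sk_sleep(sk), &wait);

	return done ? 0 : -EAGAIN;	/* caller decides how to handle timeout */
}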
709 extern int sk_stream_wait_connect(struct sock *sk, long *timeo_p); 709 extern int sk_stream_wait_connect(struct sock *sk, long *timeo_p);
710 extern int sk_stream_wait_memory(struct sock *sk, long *timeo_p); 710 extern int sk_stream_wait_memory(struct sock *sk, long *timeo_p);
711 extern void sk_stream_wait_close(struct sock *sk, long timeo_p); 711 extern void sk_stream_wait_close(struct sock *sk, long timeo_p);
712 extern int sk_stream_error(struct sock *sk, int flags, int err); 712 extern int sk_stream_error(struct sock *sk, int flags, int err);
713 extern void sk_stream_kill_queues(struct sock *sk); 713 extern void sk_stream_kill_queues(struct sock *sk);
714 714
715 extern int sk_wait_data(struct sock *sk, long *timeo); 715 extern int sk_wait_data(struct sock *sk, long *timeo);
716 716
717 struct request_sock_ops; 717 struct request_sock_ops;
718 struct timewait_sock_ops; 718 struct timewait_sock_ops;
719 struct inet_hashinfo; 719 struct inet_hashinfo;
720 struct raw_hashinfo; 720 struct raw_hashinfo;
721 721
722 /* Networking protocol blocks we attach to sockets. 722 /* Networking protocol blocks we attach to sockets.
723 * socket layer -> transport layer interface 723 * socket layer -> transport layer interface
724 * transport -> network interface is defined by struct inet_proto 724 * transport -> network interface is defined by struct inet_proto
725 */ 725 */
726 struct proto { 726 struct proto {
727 void (*close)(struct sock *sk, 727 void (*close)(struct sock *sk,
728 long timeout); 728 long timeout);
729 int (*connect)(struct sock *sk, 729 int (*connect)(struct sock *sk,
730 struct sockaddr *uaddr, 730 struct sockaddr *uaddr,
731 int addr_len); 731 int addr_len);
732 int (*disconnect)(struct sock *sk, int flags); 732 int (*disconnect)(struct sock *sk, int flags);
733 733
734 struct sock * (*accept) (struct sock *sk, int flags, int *err); 734 struct sock * (*accept) (struct sock *sk, int flags, int *err);
735 735
736 int (*ioctl)(struct sock *sk, int cmd, 736 int (*ioctl)(struct sock *sk, int cmd,
737 unsigned long arg); 737 unsigned long arg);
738 int (*init)(struct sock *sk); 738 int (*init)(struct sock *sk);
739 void (*destroy)(struct sock *sk); 739 void (*destroy)(struct sock *sk);
740 void (*shutdown)(struct sock *sk, int how); 740 void (*shutdown)(struct sock *sk, int how);
741 int (*setsockopt)(struct sock *sk, int level, 741 int (*setsockopt)(struct sock *sk, int level,
742 int optname, char __user *optval, 742 int optname, char __user *optval,
743 unsigned int optlen); 743 unsigned int optlen);
744 int (*getsockopt)(struct sock *sk, int level, 744 int (*getsockopt)(struct sock *sk, int level,
745 int optname, char __user *optval, 745 int optname, char __user *optval,
746 int __user *option); 746 int __user *option);
747 #ifdef CONFIG_COMPAT 747 #ifdef CONFIG_COMPAT
748 int (*compat_setsockopt)(struct sock *sk, 748 int (*compat_setsockopt)(struct sock *sk,
749 int level, 749 int level,
750 int optname, char __user *optval, 750 int optname, char __user *optval,
751 unsigned int optlen); 751 unsigned int optlen);
752 int (*compat_getsockopt)(struct sock *sk, 752 int (*compat_getsockopt)(struct sock *sk,
753 int level, 753 int level,
754 int optname, char __user *optval, 754 int optname, char __user *optval,
755 int __user *option); 755 int __user *option);
756 int (*compat_ioctl)(struct sock *sk, 756 int (*compat_ioctl)(struct sock *sk,
757 unsigned int cmd, unsigned long arg); 757 unsigned int cmd, unsigned long arg);
758 #endif 758 #endif
759 int (*sendmsg)(struct kiocb *iocb, struct sock *sk, 759 int (*sendmsg)(struct kiocb *iocb, struct sock *sk,
760 struct msghdr *msg, size_t len); 760 struct msghdr *msg, size_t len);
761 int (*recvmsg)(struct kiocb *iocb, struct sock *sk, 761 int (*recvmsg)(struct kiocb *iocb, struct sock *sk,
762 struct msghdr *msg, 762 struct msghdr *msg,
763 size_t len, int noblock, int flags, 763 size_t len, int noblock, int flags,
764 int *addr_len); 764 int *addr_len);
765 int (*sendpage)(struct sock *sk, struct page *page, 765 int (*sendpage)(struct sock *sk, struct page *page,
766 int offset, size_t size, int flags); 766 int offset, size_t size, int flags);
767 int (*bind)(struct sock *sk, 767 int (*bind)(struct sock *sk,
768 struct sockaddr *uaddr, int addr_len); 768 struct sockaddr *uaddr, int addr_len);
769 769
770 int (*backlog_rcv) (struct sock *sk, 770 int (*backlog_rcv) (struct sock *sk,
771 struct sk_buff *skb); 771 struct sk_buff *skb);
772 772
773 /* Keeping track of sk's, looking them up, and port selection methods. */ 773 /* Keeping track of sk's, looking them up, and port selection methods. */
774 void (*hash)(struct sock *sk); 774 void (*hash)(struct sock *sk);
775 void (*unhash)(struct sock *sk); 775 void (*unhash)(struct sock *sk);
776 void (*rehash)(struct sock *sk); 776 void (*rehash)(struct sock *sk);
777 int (*get_port)(struct sock *sk, unsigned short snum); 777 int (*get_port)(struct sock *sk, unsigned short snum);
778 void (*clear_sk)(struct sock *sk, int size); 778 void (*clear_sk)(struct sock *sk, int size);
779 779
780 /* Keeping track of sockets in use */ 780 /* Keeping track of sockets in use */
781 #ifdef CONFIG_PROC_FS 781 #ifdef CONFIG_PROC_FS
782 unsigned int inuse_idx; 782 unsigned int inuse_idx;
783 #endif 783 #endif
784 784
785 /* Memory pressure */ 785 /* Memory pressure */
786 void (*enter_memory_pressure)(struct sock *sk); 786 void (*enter_memory_pressure)(struct sock *sk);
787 atomic_long_t *memory_allocated; /* Current allocated memory. */ 787 atomic_long_t *memory_allocated; /* Current allocated memory. */
788 struct percpu_counter *sockets_allocated; /* Current number of sockets. */ 788 struct percpu_counter *sockets_allocated; /* Current number of sockets. */
789 /* 789 /*
790 * Pressure flag: try to collapse. 790 * Pressure flag: try to collapse.
791 * Technical note: it is used by multiple contexts non-atomically.          791 * Technical note: it is used by multiple contexts non-atomically.
792 * All of __sk_mem_schedule() is of this nature: accounting          792 * All of __sk_mem_schedule() is of this nature: accounting
793 * is strict, actions are advisory and have some latency. 793 * is strict, actions are advisory and have some latency.
794 */ 794 */
795 int *memory_pressure; 795 int *memory_pressure;
796 long *sysctl_mem; 796 long *sysctl_mem;
797 int *sysctl_wmem; 797 int *sysctl_wmem;
798 int *sysctl_rmem; 798 int *sysctl_rmem;
799 int max_header; 799 int max_header;
800 bool no_autobind; 800 bool no_autobind;
801 801
802 struct kmem_cache *slab; 802 struct kmem_cache *slab;
803 unsigned int obj_size; 803 unsigned int obj_size;
804 int slab_flags; 804 int slab_flags;
805 805
806 struct percpu_counter *orphan_count; 806 struct percpu_counter *orphan_count;
807 807
808 struct request_sock_ops *rsk_prot; 808 struct request_sock_ops *rsk_prot;
809 struct timewait_sock_ops *twsk_prot; 809 struct timewait_sock_ops *twsk_prot;
810 810
811 union { 811 union {
812 struct inet_hashinfo *hashinfo; 812 struct inet_hashinfo *hashinfo;
813 struct udp_table *udp_table; 813 struct udp_table *udp_table;
814 struct raw_hashinfo *raw_hash; 814 struct raw_hashinfo *raw_hash;
815 } h; 815 } h;
816 816
817 struct module *owner; 817 struct module *owner;
818 818
819 char name[32]; 819 char name[32];
820 820
821 struct list_head node; 821 struct list_head node;
822 #ifdef SOCK_REFCNT_DEBUG 822 #ifdef SOCK_REFCNT_DEBUG
823 atomic_t socks; 823 atomic_t socks;
824 #endif 824 #endif
825 }; 825 };
826 826
827 extern int proto_register(struct proto *prot, int alloc_slab); 827 extern int proto_register(struct proto *prot, int alloc_slab);
828 extern void proto_unregister(struct proto *prot); 828 extern void proto_unregister(struct proto *prot);
829 829
830 #ifdef SOCK_REFCNT_DEBUG 830 #ifdef SOCK_REFCNT_DEBUG
831 static inline void sk_refcnt_debug_inc(struct sock *sk) 831 static inline void sk_refcnt_debug_inc(struct sock *sk)
832 { 832 {
833 atomic_inc(&sk->sk_prot->socks); 833 atomic_inc(&sk->sk_prot->socks);
834 } 834 }
835 835
836 static inline void sk_refcnt_debug_dec(struct sock *sk) 836 static inline void sk_refcnt_debug_dec(struct sock *sk)
837 { 837 {
838 atomic_dec(&sk->sk_prot->socks); 838 atomic_dec(&sk->sk_prot->socks);
839 printk(KERN_DEBUG "%s socket %p released, %d are still alive\n", 839 printk(KERN_DEBUG "%s socket %p released, %d are still alive\n",
840 sk->sk_prot->name, sk, atomic_read(&sk->sk_prot->socks)); 840 sk->sk_prot->name, sk, atomic_read(&sk->sk_prot->socks));
841 } 841 }
842 842
843 static inline void sk_refcnt_debug_release(const struct sock *sk) 843 static inline void sk_refcnt_debug_release(const struct sock *sk)
844 { 844 {
845 if (atomic_read(&sk->sk_refcnt) != 1) 845 if (atomic_read(&sk->sk_refcnt) != 1)
846 printk(KERN_DEBUG "Destruction of the %s socket %p delayed, refcnt=%d\n", 846 printk(KERN_DEBUG "Destruction of the %s socket %p delayed, refcnt=%d\n",
847 sk->sk_prot->name, sk, atomic_read(&sk->sk_refcnt)); 847 sk->sk_prot->name, sk, atomic_read(&sk->sk_refcnt));
848 } 848 }
849 #else /* SOCK_REFCNT_DEBUG */ 849 #else /* SOCK_REFCNT_DEBUG */
850 #define sk_refcnt_debug_inc(sk) do { } while (0) 850 #define sk_refcnt_debug_inc(sk) do { } while (0)
851 #define sk_refcnt_debug_dec(sk) do { } while (0) 851 #define sk_refcnt_debug_dec(sk) do { } while (0)
852 #define sk_refcnt_debug_release(sk) do { } while (0) 852 #define sk_refcnt_debug_release(sk) do { } while (0)
853 #endif /* SOCK_REFCNT_DEBUG */ 853 #endif /* SOCK_REFCNT_DEBUG */
854 854
855 855
856 #ifdef CONFIG_PROC_FS 856 #ifdef CONFIG_PROC_FS
857 /* Called with local bh disabled */ 857 /* Called with local bh disabled */
858 extern void sock_prot_inuse_add(struct net *net, struct proto *prot, int inc); 858 extern void sock_prot_inuse_add(struct net *net, struct proto *prot, int inc);
859 extern int sock_prot_inuse_get(struct net *net, struct proto *proto); 859 extern int sock_prot_inuse_get(struct net *net, struct proto *proto);
860 #else 860 #else
861 static inline void sock_prot_inuse_add(struct net *net, struct proto *prot,          861 static inline void sock_prot_inuse_add(struct net *net, struct proto *prot,
862 int inc) 862 int inc)
863 { 863 {
864 } 864 }
865 #endif 865 #endif
866 866
867 867
868 /* With per-bucket locks this operation is not-atomic, so that 868 /* With per-bucket locks this operation is not-atomic, so that
869 * this version is not worse. 869 * this version is not worse.
870 */ 870 */
871 static inline void __sk_prot_rehash(struct sock *sk) 871 static inline void __sk_prot_rehash(struct sock *sk)
872 { 872 {
873 sk->sk_prot->unhash(sk); 873 sk->sk_prot->unhash(sk);
874 sk->sk_prot->hash(sk); 874 sk->sk_prot->hash(sk);
875 } 875 }
876 876
877 void sk_prot_clear_portaddr_nulls(struct sock *sk, int size); 877 void sk_prot_clear_portaddr_nulls(struct sock *sk, int size);
878 878
879 /* About 10 seconds */ 879 /* About 10 seconds */
880 #define SOCK_DESTROY_TIME (10*HZ) 880 #define SOCK_DESTROY_TIME (10*HZ)
881 881
882 /* Sockets 0-1023 can't be bound to unless you are superuser */ 882 /* Sockets 0-1023 can't be bound to unless you are superuser */
883 #define PROT_SOCK 1024 883 #define PROT_SOCK 1024
884 884
885 #define SHUTDOWN_MASK 3 885 #define SHUTDOWN_MASK 3
886 #define RCV_SHUTDOWN 1 886 #define RCV_SHUTDOWN 1
887 #define SEND_SHUTDOWN 2 887 #define SEND_SHUTDOWN 2
888 888
889 #define SOCK_SNDBUF_LOCK 1 889 #define SOCK_SNDBUF_LOCK 1
890 #define SOCK_RCVBUF_LOCK 2 890 #define SOCK_RCVBUF_LOCK 2
891 #define SOCK_BINDADDR_LOCK 4 891 #define SOCK_BINDADDR_LOCK 4
892 #define SOCK_BINDPORT_LOCK 8 892 #define SOCK_BINDPORT_LOCK 8
893 893
894 /* sock_iocb: used to kick off async processing of socket ios */ 894 /* sock_iocb: used to kick off async processing of socket ios */
895 struct sock_iocb { 895 struct sock_iocb {
896 struct list_head list; 896 struct list_head list;
897 897
898 int flags; 898 int flags;
899 int size; 899 int size;
900 struct socket *sock; 900 struct socket *sock;
901 struct sock *sk; 901 struct sock *sk;
902 struct scm_cookie *scm; 902 struct scm_cookie *scm;
903 struct msghdr *msg, async_msg; 903 struct msghdr *msg, async_msg;
904 struct kiocb *kiocb; 904 struct kiocb *kiocb;
905 }; 905 };
906 906
907 static inline struct sock_iocb *kiocb_to_siocb(struct kiocb *iocb) 907 static inline struct sock_iocb *kiocb_to_siocb(struct kiocb *iocb)
908 { 908 {
909 return (struct sock_iocb *)iocb->private; 909 return (struct sock_iocb *)iocb->private;
910 } 910 }
911 911
912 static inline struct kiocb *siocb_to_kiocb(struct sock_iocb *si) 912 static inline struct kiocb *siocb_to_kiocb(struct sock_iocb *si)
913 { 913 {
914 return si->kiocb; 914 return si->kiocb;
915 } 915 }
916 916
917 struct socket_alloc { 917 struct socket_alloc {
918 struct socket socket; 918 struct socket socket;
919 struct inode vfs_inode; 919 struct inode vfs_inode;
920 }; 920 };
921 921
922 static inline struct socket *SOCKET_I(struct inode *inode) 922 static inline struct socket *SOCKET_I(struct inode *inode)
923 { 923 {
924 return &container_of(inode, struct socket_alloc, vfs_inode)->socket; 924 return &container_of(inode, struct socket_alloc, vfs_inode)->socket;
925 } 925 }
926 926
927 static inline struct inode *SOCK_INODE(struct socket *socket) 927 static inline struct inode *SOCK_INODE(struct socket *socket)
928 { 928 {
929 return &container_of(socket, struct socket_alloc, socket)->vfs_inode; 929 return &container_of(socket, struct socket_alloc, socket)->vfs_inode;
930 } 930 }
931 931
932 /* 932 /*
933 * Functions for memory accounting 933 * Functions for memory accounting
934 */ 934 */
935 extern int __sk_mem_schedule(struct sock *sk, int size, int kind); 935 extern int __sk_mem_schedule(struct sock *sk, int size, int kind);
936 extern void __sk_mem_reclaim(struct sock *sk); 936 extern void __sk_mem_reclaim(struct sock *sk);
937 937
938 #define SK_MEM_QUANTUM ((int)PAGE_SIZE) 938 #define SK_MEM_QUANTUM ((int)PAGE_SIZE)
939 #define SK_MEM_QUANTUM_SHIFT ilog2(SK_MEM_QUANTUM) 939 #define SK_MEM_QUANTUM_SHIFT ilog2(SK_MEM_QUANTUM)
940 #define SK_MEM_SEND 0 940 #define SK_MEM_SEND 0
941 #define SK_MEM_RECV 1 941 #define SK_MEM_RECV 1
942 942
943 static inline int sk_mem_pages(int amt) 943 static inline int sk_mem_pages(int amt)
944 { 944 {
945 return (amt + SK_MEM_QUANTUM - 1) >> SK_MEM_QUANTUM_SHIFT; 945 return (amt + SK_MEM_QUANTUM - 1) >> SK_MEM_QUANTUM_SHIFT;
946 } 946 }
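
As a quick worked example of the rounding above: assuming a 4096-byte PAGE_SIZE (so SK_MEM_QUANTUM_SHIFT is 12), sk_mem_pages(6000) evaluates to (6000 + 4095) >> 12 = 2 quanta, while sk_mem_pages(4096) is exactly 1.
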
947 947
948 static inline int sk_has_account(struct sock *sk) 948 static inline int sk_has_account(struct sock *sk)
949 { 949 {
950 /* return true if protocol supports memory accounting */ 950 /* return true if protocol supports memory accounting */
951 return !!sk->sk_prot->memory_allocated; 951 return !!sk->sk_prot->memory_allocated;
952 } 952 }
953 953
954 static inline int sk_wmem_schedule(struct sock *sk, int size) 954 static inline int sk_wmem_schedule(struct sock *sk, int size)
955 { 955 {
956 if (!sk_has_account(sk)) 956 if (!sk_has_account(sk))
957 return 1; 957 return 1;
958 return size <= sk->sk_forward_alloc || 958 return size <= sk->sk_forward_alloc ||
959 __sk_mem_schedule(sk, size, SK_MEM_SEND); 959 __sk_mem_schedule(sk, size, SK_MEM_SEND);
960 } 960 }
961 961
962 static inline int sk_rmem_schedule(struct sock *sk, int size) 962 static inline int sk_rmem_schedule(struct sock *sk, int size)
963 { 963 {
964 if (!sk_has_account(sk)) 964 if (!sk_has_account(sk))
965 return 1; 965 return 1;
966 return size <= sk->sk_forward_alloc || 966 return size <= sk->sk_forward_alloc ||
967 __sk_mem_schedule(sk, size, SK_MEM_RECV); 967 __sk_mem_schedule(sk, size, SK_MEM_RECV);
968 } 968 }
969 969
970 static inline void sk_mem_reclaim(struct sock *sk) 970 static inline void sk_mem_reclaim(struct sock *sk)
971 { 971 {
972 if (!sk_has_account(sk)) 972 if (!sk_has_account(sk))
973 return; 973 return;
974 if (sk->sk_forward_alloc >= SK_MEM_QUANTUM) 974 if (sk->sk_forward_alloc >= SK_MEM_QUANTUM)
975 __sk_mem_reclaim(sk); 975 __sk_mem_reclaim(sk);
976 } 976 }
977 977
978 static inline void sk_mem_reclaim_partial(struct sock *sk) 978 static inline void sk_mem_reclaim_partial(struct sock *sk)
979 { 979 {
980 if (!sk_has_account(sk)) 980 if (!sk_has_account(sk))
981 return; 981 return;
982 if (sk->sk_forward_alloc > SK_MEM_QUANTUM) 982 if (sk->sk_forward_alloc > SK_MEM_QUANTUM)
983 __sk_mem_reclaim(sk); 983 __sk_mem_reclaim(sk);
984 } 984 }
985 985
986 static inline void sk_mem_charge(struct sock *sk, int size) 986 static inline void sk_mem_charge(struct sock *sk, int size)
987 { 987 {
988 if (!sk_has_account(sk)) 988 if (!sk_has_account(sk))
989 return; 989 return;
990 sk->sk_forward_alloc -= size; 990 sk->sk_forward_alloc -= size;
991 } 991 }
992 992
993 static inline void sk_mem_uncharge(struct sock *sk, int size) 993 static inline void sk_mem_uncharge(struct sock *sk, int size)
994 { 994 {
995 if (!sk_has_account(sk)) 995 if (!sk_has_account(sk))
996 return; 996 return;
997 sk->sk_forward_alloc += size; 997 sk->sk_forward_alloc += size;
998 } 998 }
999 999
1000 static inline void sk_wmem_free_skb(struct sock *sk, struct sk_buff *skb) 1000 static inline void sk_wmem_free_skb(struct sock *sk, struct sk_buff *skb)
1001 { 1001 {
1002 sock_set_flag(sk, SOCK_QUEUE_SHRUNK); 1002 sock_set_flag(sk, SOCK_QUEUE_SHRUNK);
1003 sk->sk_wmem_queued -= skb->truesize; 1003 sk->sk_wmem_queued -= skb->truesize;
1004 sk_mem_uncharge(sk, skb->truesize); 1004 sk_mem_uncharge(sk, skb->truesize);
1005 __kfree_skb(skb); 1005 __kfree_skb(skb);
1006 } 1006 }
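
Taken together, the helpers above give the usual send-side accounting lifecycle: reserve forward-allocated space with sk_wmem_schedule(), charge the bytes that are actually queued with sk_mem_charge(), and let sk_wmem_free_skb()/sk_mem_reclaim() return the space when the skb is freed. A hedged sketch (the function name and the -ENOBUFS error choice are illustrative only):

static int charge_send_bytes(struct sock *sk, int len)
{
	/* May fall back to __sk_mem_schedule() to grow sk_forward_alloc. */
	if (!sk_wmem_schedule(sk, len))
		return -ENOBUFS;
	sk->sk_wmem_queued += len;
	sk_mem_charge(sk, len);		/* consumes sk_forward_alloc */
	return 0;
}
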
1007 1007
1008 /* Used by processes to "lock" a socket state, so that 1008 /* Used by processes to "lock" a socket state, so that
1009 * interrupts and bottom half handlers won't change it 1009 * interrupts and bottom half handlers won't change it
1010 * from under us. It essentially blocks any incoming 1010 * from under us. It essentially blocks any incoming
1011 * packets, so that we won't get any new data or any 1011 * packets, so that we won't get any new data or any
1012 * packets that change the state of the socket. 1012 * packets that change the state of the socket.
1013 * 1013 *
1014 * While locked, BH processing will add new packets to 1014 * While locked, BH processing will add new packets to
1015 * the backlog queue. This queue is processed by the 1015 * the backlog queue. This queue is processed by the
1016 * owner of the socket lock right before it is released. 1016 * owner of the socket lock right before it is released.
1017 * 1017 *
1018 * Since ~2.3.5 it is also an exclusive sleep lock serializing          1018 * Since ~2.3.5 it is also an exclusive sleep lock serializing
1019 * accesses from user process context. 1019 * accesses from user process context.
1020 */ 1020 */
1021 #define sock_owned_by_user(sk) ((sk)->sk_lock.owned) 1021 #define sock_owned_by_user(sk) ((sk)->sk_lock.owned)
1022 1022
1023 /* 1023 /*
1024 * Macro so as to not evaluate some arguments when 1024 * Macro so as to not evaluate some arguments when
1025 * lockdep is not enabled. 1025 * lockdep is not enabled.
1026 * 1026 *
1027 * Mark both the sk_lock and the sk_lock.slock as a 1027 * Mark both the sk_lock and the sk_lock.slock as a
1028 * per-address-family lock class. 1028 * per-address-family lock class.
1029 */ 1029 */
1030 #define sock_lock_init_class_and_name(sk, sname, skey, name, key) \ 1030 #define sock_lock_init_class_and_name(sk, sname, skey, name, key) \
1031 do { \ 1031 do { \
1032 sk->sk_lock.owned = 0; \ 1032 sk->sk_lock.owned = 0; \
1033 init_waitqueue_head(&sk->sk_lock.wq); \ 1033 init_waitqueue_head(&sk->sk_lock.wq); \
1034 spin_lock_init(&(sk)->sk_lock.slock); \ 1034 spin_lock_init(&(sk)->sk_lock.slock); \
1035 debug_check_no_locks_freed((void *)&(sk)->sk_lock, \ 1035 debug_check_no_locks_freed((void *)&(sk)->sk_lock, \
1036 sizeof((sk)->sk_lock)); \ 1036 sizeof((sk)->sk_lock)); \
1037 lockdep_set_class_and_name(&(sk)->sk_lock.slock, \ 1037 lockdep_set_class_and_name(&(sk)->sk_lock.slock, \
1038 (skey), (sname)); \ 1038 (skey), (sname)); \
1039 lockdep_init_map(&(sk)->sk_lock.dep_map, (name), (key), 0); \ 1039 lockdep_init_map(&(sk)->sk_lock.dep_map, (name), (key), 0); \
1040 } while (0) 1040 } while (0)
1041 1041
1042 extern void lock_sock_nested(struct sock *sk, int subclass); 1042 extern void lock_sock_nested(struct sock *sk, int subclass);
1043 1043
1044 static inline void lock_sock(struct sock *sk) 1044 static inline void lock_sock(struct sock *sk)
1045 { 1045 {
1046 lock_sock_nested(sk, 0); 1046 lock_sock_nested(sk, 0);
1047 } 1047 }
1048 1048
1049 extern void release_sock(struct sock *sk); 1049 extern void release_sock(struct sock *sk);
1050 1050
1051 /* BH context may only use the following locking interface. */ 1051 /* BH context may only use the following locking interface. */
1052 #define bh_lock_sock(__sk) spin_lock(&((__sk)->sk_lock.slock)) 1052 #define bh_lock_sock(__sk) spin_lock(&((__sk)->sk_lock.slock))
1053 #define bh_lock_sock_nested(__sk) \ 1053 #define bh_lock_sock_nested(__sk) \
1054 spin_lock_nested(&((__sk)->sk_lock.slock), \ 1054 spin_lock_nested(&((__sk)->sk_lock.slock), \
1055 SINGLE_DEPTH_NESTING) 1055 SINGLE_DEPTH_NESTING)
1056 #define bh_unlock_sock(__sk) spin_unlock(&((__sk)->sk_lock.slock)) 1056 #define bh_unlock_sock(__sk) spin_unlock(&((__sk)->sk_lock.slock))
1057 1057
1058 extern bool lock_sock_fast(struct sock *sk); 1058 extern bool lock_sock_fast(struct sock *sk);
1059 /** 1059 /**
1060 * unlock_sock_fast - complement of lock_sock_fast 1060 * unlock_sock_fast - complement of lock_sock_fast
1061 * @sk: socket 1061 * @sk: socket
1062 * @slow: slow mode 1062 * @slow: slow mode
1063 * 1063 *
1064 * Fast unlock of the socket for user context.          1064 * Fast unlock of the socket for user context.
1065 * If slow mode is on, we call the regular release_sock().          1065 * If slow mode is on, we call the regular release_sock().
1066 */ 1066 */
1067 static inline void unlock_sock_fast(struct sock *sk, bool slow) 1067 static inline void unlock_sock_fast(struct sock *sk, bool slow)
1068 { 1068 {
1069 if (slow) 1069 if (slow)
1070 release_sock(sk); 1070 release_sock(sk);
1071 else 1071 else
1072 spin_unlock_bh(&sk->sk_lock.slock); 1072 spin_unlock_bh(&sk->sk_lock.slock);
1073 } 1073 }
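
The locking helpers above split into a process-context owner lock and a BH spinlock: lock_sock()/release_sock() take the exclusive sleeping lock (and release_sock() also runs the backlog accumulated meanwhile), bh_lock_sock()/bh_unlock_sock() are for bottom-half context only, and lock_sock_fast() lets a short critical section skip the owner lock when nobody holds it. A small illustrative sketch (the function name is hypothetical):

static void touch_socket_state(struct sock *sk)
{
	bool slow;

	/* Process context: exclusive sleeping lock. */
	lock_sock(sk);
	/* ... modify socket state, queue data ... */
	release_sock(sk);	/* also processes the accumulated backlog */

	/* Fast variant: 'slow' is true if the full owner lock had to be taken. */
	slow = lock_sock_fast(sk);
	/* ... short critical section ... */
	unlock_sock_fast(sk, slow);
}
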
1074 1074
1075 1075
1076 extern struct sock *sk_alloc(struct net *net, int family, 1076 extern struct sock *sk_alloc(struct net *net, int family,
1077 gfp_t priority, 1077 gfp_t priority,
1078 struct proto *prot); 1078 struct proto *prot);
1079 extern void sk_free(struct sock *sk); 1079 extern void sk_free(struct sock *sk);
1080 extern void sk_release_kernel(struct sock *sk); 1080 extern void sk_release_kernel(struct sock *sk);
1081 extern struct sock *sk_clone(const struct sock *sk, 1081 extern struct sock *sk_clone(const struct sock *sk,
1082 const gfp_t priority); 1082 const gfp_t priority);
1083 1083
1084 extern struct sk_buff *sock_wmalloc(struct sock *sk, 1084 extern struct sk_buff *sock_wmalloc(struct sock *sk,
1085 unsigned long size, int force, 1085 unsigned long size, int force,
1086 gfp_t priority); 1086 gfp_t priority);
1087 extern struct sk_buff *sock_rmalloc(struct sock *sk, 1087 extern struct sk_buff *sock_rmalloc(struct sock *sk,
1088 unsigned long size, int force, 1088 unsigned long size, int force,
1089 gfp_t priority); 1089 gfp_t priority);
1090 extern void sock_wfree(struct sk_buff *skb); 1090 extern void sock_wfree(struct sk_buff *skb);
1091 extern void sock_rfree(struct sk_buff *skb); 1091 extern void sock_rfree(struct sk_buff *skb);
1092 1092
1093 extern int sock_setsockopt(struct socket *sock, int level, 1093 extern int sock_setsockopt(struct socket *sock, int level,
1094 int op, char __user *optval, 1094 int op, char __user *optval,
1095 unsigned int optlen); 1095 unsigned int optlen);
1096 1096
1097 extern int sock_getsockopt(struct socket *sock, int level, 1097 extern int sock_getsockopt(struct socket *sock, int level,
1098 int op, char __user *optval, 1098 int op, char __user *optval,
1099 int __user *optlen); 1099 int __user *optlen);
1100 extern struct sk_buff *sock_alloc_send_skb(struct sock *sk, 1100 extern struct sk_buff *sock_alloc_send_skb(struct sock *sk,
1101 unsigned long size, 1101 unsigned long size,
1102 int noblock, 1102 int noblock,
1103 int *errcode); 1103 int *errcode);
1104 extern struct sk_buff *sock_alloc_send_pskb(struct sock *sk, 1104 extern struct sk_buff *sock_alloc_send_pskb(struct sock *sk,
1105 unsigned long header_len, 1105 unsigned long header_len,
1106 unsigned long data_len, 1106 unsigned long data_len,
1107 int noblock, 1107 int noblock,
1108 int *errcode); 1108 int *errcode);
1109 extern void *sock_kmalloc(struct sock *sk, int size, 1109 extern void *sock_kmalloc(struct sock *sk, int size,
1110 gfp_t priority); 1110 gfp_t priority);
1111 extern void sock_kfree_s(struct sock *sk, void *mem, int size); 1111 extern void sock_kfree_s(struct sock *sk, void *mem, int size);
1112 extern void sk_send_sigurg(struct sock *sk); 1112 extern void sk_send_sigurg(struct sock *sk);
1113 1113
1114 #ifdef CONFIG_CGROUPS 1114 #ifdef CONFIG_CGROUPS
1115 extern void sock_update_classid(struct sock *sk); 1115 extern void sock_update_classid(struct sock *sk);
1116 #else 1116 #else
1117 static inline void sock_update_classid(struct sock *sk) 1117 static inline void sock_update_classid(struct sock *sk)
1118 { 1118 {
1119 } 1119 }
1120 #endif 1120 #endif
1121 1121
1122 /* 1122 /*
1123 * Functions to fill in entries in struct proto_ops when a protocol 1123 * Functions to fill in entries in struct proto_ops when a protocol
1124 * does not implement a particular function. 1124 * does not implement a particular function.
1125 */ 1125 */
1126 extern int sock_no_bind(struct socket *, 1126 extern int sock_no_bind(struct socket *,
1127 struct sockaddr *, int); 1127 struct sockaddr *, int);
1128 extern int sock_no_connect(struct socket *, 1128 extern int sock_no_connect(struct socket *,
1129 struct sockaddr *, int, int); 1129 struct sockaddr *, int, int);
1130 extern int sock_no_socketpair(struct socket *, 1130 extern int sock_no_socketpair(struct socket *,
1131 struct socket *); 1131 struct socket *);
1132 extern int sock_no_accept(struct socket *, 1132 extern int sock_no_accept(struct socket *,
1133 struct socket *, int); 1133 struct socket *, int);
1134 extern int sock_no_getname(struct socket *, 1134 extern int sock_no_getname(struct socket *,
1135 struct sockaddr *, int *, int); 1135 struct sockaddr *, int *, int);
1136 extern unsigned int sock_no_poll(struct file *, struct socket *, 1136 extern unsigned int sock_no_poll(struct file *, struct socket *,
1137 struct poll_table_struct *); 1137 struct poll_table_struct *);
1138 extern int sock_no_ioctl(struct socket *, unsigned int, 1138 extern int sock_no_ioctl(struct socket *, unsigned int,
1139 unsigned long); 1139 unsigned long);
1140 extern int sock_no_listen(struct socket *, int); 1140 extern int sock_no_listen(struct socket *, int);
1141 extern int sock_no_shutdown(struct socket *, int); 1141 extern int sock_no_shutdown(struct socket *, int);
1142 extern int sock_no_getsockopt(struct socket *, int , int, 1142 extern int sock_no_getsockopt(struct socket *, int , int,
1143 char __user *, int __user *); 1143 char __user *, int __user *);
1144 extern int sock_no_setsockopt(struct socket *, int, int, 1144 extern int sock_no_setsockopt(struct socket *, int, int,
1145 char __user *, unsigned int); 1145 char __user *, unsigned int);
1146 extern int sock_no_sendmsg(struct kiocb *, struct socket *, 1146 extern int sock_no_sendmsg(struct kiocb *, struct socket *,
1147 struct msghdr *, size_t); 1147 struct msghdr *, size_t);
1148 extern int sock_no_recvmsg(struct kiocb *, struct socket *, 1148 extern int sock_no_recvmsg(struct kiocb *, struct socket *,
1149 struct msghdr *, size_t, int); 1149 struct msghdr *, size_t, int);
1150 extern int sock_no_mmap(struct file *file, 1150 extern int sock_no_mmap(struct file *file,
1151 struct socket *sock, 1151 struct socket *sock,
1152 struct vm_area_struct *vma); 1152 struct vm_area_struct *vma);
1153 extern ssize_t sock_no_sendpage(struct socket *sock, 1153 extern ssize_t sock_no_sendpage(struct socket *sock,
1154 struct page *page, 1154 struct page *page,
1155 int offset, size_t size, 1155 int offset, size_t size,
1156 int flags); 1156 int flags);
1157 1157
1158 /* 1158 /*
1159 * Functions to fill in entries in struct proto_ops when a protocol 1159 * Functions to fill in entries in struct proto_ops when a protocol
1160 * uses the inet style. 1160 * uses the inet style.
1161 */ 1161 */
1162 extern int sock_common_getsockopt(struct socket *sock, int level, int optname, 1162 extern int sock_common_getsockopt(struct socket *sock, int level, int optname,
1163 char __user *optval, int __user *optlen); 1163 char __user *optval, int __user *optlen);
1164 extern int sock_common_recvmsg(struct kiocb *iocb, struct socket *sock, 1164 extern int sock_common_recvmsg(struct kiocb *iocb, struct socket *sock,
1165 struct msghdr *msg, size_t size, int flags); 1165 struct msghdr *msg, size_t size, int flags);
1166 extern int sock_common_setsockopt(struct socket *sock, int level, int optname, 1166 extern int sock_common_setsockopt(struct socket *sock, int level, int optname,
1167 char __user *optval, unsigned int optlen); 1167 char __user *optval, unsigned int optlen);
1168 extern int compat_sock_common_getsockopt(struct socket *sock, int level, 1168 extern int compat_sock_common_getsockopt(struct socket *sock, int level,
1169 int optname, char __user *optval, int __user *optlen); 1169 int optname, char __user *optval, int __user *optlen);
1170 extern int compat_sock_common_setsockopt(struct socket *sock, int level, 1170 extern int compat_sock_common_setsockopt(struct socket *sock, int level,
1171 int optname, char __user *optval, unsigned int optlen); 1171 int optname, char __user *optval, unsigned int optlen);
1172 1172
1173 extern void sk_common_release(struct sock *sk); 1173 extern void sk_common_release(struct sock *sk);
1174 1174
1175 /* 1175 /*
1176 * Default socket callbacks and setup code 1176 * Default socket callbacks and setup code
1177 */ 1177 */
1178 1178
1179 /* Initialise core socket variables */ 1179 /* Initialise core socket variables */
1180 extern void sock_init_data(struct socket *sock, struct sock *sk); 1180 extern void sock_init_data(struct socket *sock, struct sock *sk);
1181 1181
1182 extern void sk_filter_release_rcu(struct rcu_head *rcu); 1182 extern void sk_filter_release_rcu(struct rcu_head *rcu);
1183 1183
1184 /** 1184 /**
1185 * sk_filter_release - release a socket filter 1185 * sk_filter_release - release a socket filter
1186 * @fp: filter to remove 1186 * @fp: filter to remove
1187 * 1187 *
1188 * Remove a filter from a socket and release its resources. 1188 * Remove a filter from a socket and release its resources.
1189 */ 1189 */
1190 1190
1191 static inline void sk_filter_release(struct sk_filter *fp) 1191 static inline void sk_filter_release(struct sk_filter *fp)
1192 { 1192 {
1193 if (atomic_dec_and_test(&fp->refcnt)) 1193 if (atomic_dec_and_test(&fp->refcnt))
1194 call_rcu(&fp->rcu, sk_filter_release_rcu); 1194 call_rcu(&fp->rcu, sk_filter_release_rcu);
1195 } 1195 }
1196 1196
1197 static inline void sk_filter_uncharge(struct sock *sk, struct sk_filter *fp) 1197 static inline void sk_filter_uncharge(struct sock *sk, struct sk_filter *fp)
1198 { 1198 {
1199 unsigned int size = sk_filter_len(fp); 1199 unsigned int size = sk_filter_len(fp);
1200 1200
1201 atomic_sub(size, &sk->sk_omem_alloc); 1201 atomic_sub(size, &sk->sk_omem_alloc);
1202 sk_filter_release(fp); 1202 sk_filter_release(fp);
1203 } 1203 }
1204 1204
1205 static inline void sk_filter_charge(struct sock *sk, struct sk_filter *fp) 1205 static inline void sk_filter_charge(struct sock *sk, struct sk_filter *fp)
1206 { 1206 {
1207 atomic_inc(&fp->refcnt); 1207 atomic_inc(&fp->refcnt);
1208 atomic_add(sk_filter_len(fp), &sk->sk_omem_alloc); 1208 atomic_add(sk_filter_len(fp), &sk->sk_omem_alloc);
1209 } 1209 }
1210 1210
1211 /* 1211 /*
1212 * Socket reference counting postulates. 1212 * Socket reference counting postulates.
1213 * 1213 *
1214 * * Each user of socket SHOULD hold a reference count. 1214 * * Each user of socket SHOULD hold a reference count.
1215 * * Each access point to socket (a hash table bucket, reference from a list,          1215 * * Each access point to socket (a hash table bucket, reference from a list,
1216 * running timer, skb in flight) MUST hold a reference count.          1216 * running timer, skb in flight) MUST hold a reference count.
1217 * * When reference count hits 0, it means it will never increase back. 1217 * * When reference count hits 0, it means it will never increase back.
1218 * * When reference count hits 0, it means that no references from 1218 * * When reference count hits 0, it means that no references from
1219 * outside exist to this socket and current process on current CPU 1219 * outside exist to this socket and current process on current CPU
1220 * is last user and may/should destroy this socket. 1220 * is last user and may/should destroy this socket.
1221 * * sk_free is called from any context: process, BH, IRQ. When 1221 * * sk_free is called from any context: process, BH, IRQ. When
1222 * it is called, socket has no references from outside -> sk_free 1222 * it is called, socket has no references from outside -> sk_free
1223 * may release descendant resources allocated by the socket, but 1223 * may release descendant resources allocated by the socket, but
1224 * to the time when it is called, socket is NOT referenced by any 1224 * to the time when it is called, socket is NOT referenced by any
1225 * hash tables, lists etc. 1225 * hash tables, lists etc.
1226 * * Packets, delivered from outside (from network or from another process) 1226 * * Packets, delivered from outside (from network or from another process)
1227 * and enqueued on receive/error queues SHOULD NOT grab reference count, 1227 * and enqueued on receive/error queues SHOULD NOT grab reference count,
1228 * when they sit in queue. Otherwise, packets will leak into a hole when          1228 * when they sit in queue. Otherwise, packets will leak into a hole when
1229 * the socket is looked up by one CPU and unhashing is done by another CPU.          1229 * the socket is looked up by one CPU and unhashing is done by another CPU.
1230 * It is true for udp/raw, netlink (leak to receive and error queues), tcp 1230 * It is true for udp/raw, netlink (leak to receive and error queues), tcp
1231 * (leak to backlog). Packet socket does all the processing inside 1231 * (leak to backlog). Packet socket does all the processing inside
1232 * BR_NETPROTO_LOCK, so that it does not have this race condition. UNIX sockets          1232 * BR_NETPROTO_LOCK, so that it does not have this race condition. UNIX sockets
1233 * use a separate SMP lock, so that they are prone too.          1233 * use a separate SMP lock, so that they are prone too.
1234 */ 1234 */
1235 1235
1236 /* Ungrab socket and destroy it, if it was the last reference. */ 1236 /* Ungrab socket and destroy it, if it was the last reference. */
1237 static inline void sock_put(struct sock *sk) 1237 static inline void sock_put(struct sock *sk)
1238 { 1238 {
1239 if (atomic_dec_and_test(&sk->sk_refcnt)) 1239 if (atomic_dec_and_test(&sk->sk_refcnt))
1240 sk_free(sk); 1240 sk_free(sk);
1241 } 1241 }
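
sock_put() above is the drop side of the postulates listed in the comment block: every asynchronous user of the socket (running timer, hash table entry, skb in flight) pins it with a matching reference, taken with sock_hold() from earlier in this header. A minimal sketch of the pairing (the function name is illustrative):

static void use_sock_asynchronously(struct sock *sk)
{
	sock_hold(sk);		/* pin sk before publishing it elsewhere     */
	/* ... sk is now safe to use from a timer, BH or another CPU ... */
	sock_put(sk);		/* the final put ends up calling sk_free(sk) */
}
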
1242 1242
1243 extern int sk_receive_skb(struct sock *sk, struct sk_buff *skb, 1243 extern int sk_receive_skb(struct sock *sk, struct sk_buff *skb,
1244 const int nested); 1244 const int nested);
1245 1245
1246 static inline void sk_tx_queue_set(struct sock *sk, int tx_queue) 1246 static inline void sk_tx_queue_set(struct sock *sk, int tx_queue)
1247 { 1247 {
1248 sk->sk_tx_queue_mapping = tx_queue; 1248 sk->sk_tx_queue_mapping = tx_queue;
1249 } 1249 }
1250 1250
1251 static inline void sk_tx_queue_clear(struct sock *sk) 1251 static inline void sk_tx_queue_clear(struct sock *sk)
1252 { 1252 {
1253 sk->sk_tx_queue_mapping = -1; 1253 sk->sk_tx_queue_mapping = -1;
1254 } 1254 }
1255 1255
1256 static inline int sk_tx_queue_get(const struct sock *sk) 1256 static inline int sk_tx_queue_get(const struct sock *sk)
1257 { 1257 {
1258 return sk ? sk->sk_tx_queue_mapping : -1; 1258 return sk ? sk->sk_tx_queue_mapping : -1;
1259 } 1259 }
1260 1260
1261 static inline void sk_set_socket(struct sock *sk, struct socket *sock) 1261 static inline void sk_set_socket(struct sock *sk, struct socket *sock)
1262 { 1262 {
1263 sk_tx_queue_clear(sk); 1263 sk_tx_queue_clear(sk);
1264 sk->sk_socket = sock; 1264 sk->sk_socket = sock;
1265 } 1265 }
1266 1266
1267 static inline wait_queue_head_t *sk_sleep(struct sock *sk) 1267 static inline wait_queue_head_t *sk_sleep(struct sock *sk)
1268 { 1268 {
1269 BUILD_BUG_ON(offsetof(struct socket_wq, wait) != 0); 1269 BUILD_BUG_ON(offsetof(struct socket_wq, wait) != 0);
1270 return &rcu_dereference_raw(sk->sk_wq)->wait; 1270 return &rcu_dereference_raw(sk->sk_wq)->wait;
1271 } 1271 }
1272 /* Detach socket from process context. 1272 /* Detach socket from process context.
1273 * Announce socket dead, detach it from wait queue and inode. 1273 * Announce socket dead, detach it from wait queue and inode.
1274 * Note that the parent inode holds a reference count on this struct sock;          1274 * Note that the parent inode holds a reference count on this struct sock;
1275 * we do not release it in this function, because the protocol          1275 * we do not release it in this function, because the protocol
1276 * probably wants to do some additional cleanups or even continue          1276 * probably wants to do some additional cleanups or even continue
1277 * working with this socket (TCP).          1277 * working with this socket (TCP).
1278 */ 1278 */
1279 static inline void sock_orphan(struct sock *sk) 1279 static inline void sock_orphan(struct sock *sk)
1280 { 1280 {
1281 write_lock_bh(&sk->sk_callback_lock); 1281 write_lock_bh(&sk->sk_callback_lock);
1282 sock_set_flag(sk, SOCK_DEAD); 1282 sock_set_flag(sk, SOCK_DEAD);
1283 sk_set_socket(sk, NULL); 1283 sk_set_socket(sk, NULL);
1284 sk->sk_wq = NULL; 1284 sk->sk_wq = NULL;
1285 write_unlock_bh(&sk->sk_callback_lock); 1285 write_unlock_bh(&sk->sk_callback_lock);
1286 } 1286 }
1287 1287
1288 static inline void sock_graft(struct sock *sk, struct socket *parent) 1288 static inline void sock_graft(struct sock *sk, struct socket *parent)
1289 { 1289 {
1290 write_lock_bh(&sk->sk_callback_lock); 1290 write_lock_bh(&sk->sk_callback_lock);
1291 sk->sk_wq = parent->wq; 1291 sk->sk_wq = parent->wq;
1292 parent->sk = sk; 1292 parent->sk = sk;
1293 sk_set_socket(sk, parent); 1293 sk_set_socket(sk, parent);
1294 security_sock_graft(sk, parent); 1294 security_sock_graft(sk, parent);
1295 write_unlock_bh(&sk->sk_callback_lock); 1295 write_unlock_bh(&sk->sk_callback_lock);
1296 } 1296 }
1297 1297
1298 extern int sock_i_uid(struct sock *sk); 1298 extern int sock_i_uid(struct sock *sk);
1299 extern unsigned long sock_i_ino(struct sock *sk); 1299 extern unsigned long sock_i_ino(struct sock *sk);
1300 1300
1301 static inline struct dst_entry * 1301 static inline struct dst_entry *
1302 __sk_dst_get(struct sock *sk) 1302 __sk_dst_get(struct sock *sk)
1303 { 1303 {
1304 return rcu_dereference_check(sk->sk_dst_cache, rcu_read_lock_held() || 1304 return rcu_dereference_check(sk->sk_dst_cache, sock_owned_by_user(sk) ||
1305 sock_owned_by_user(sk) ||
1306 lockdep_is_held(&sk->sk_lock.slock)); 1305 lockdep_is_held(&sk->sk_lock.slock));
1307 } 1306 }
1308 1307
1309 static inline struct dst_entry * 1308 static inline struct dst_entry *
1310 sk_dst_get(struct sock *sk) 1309 sk_dst_get(struct sock *sk)
1311 { 1310 {
1312 struct dst_entry *dst; 1311 struct dst_entry *dst;
1313 1312
1314 rcu_read_lock(); 1313 rcu_read_lock();
1315 dst = rcu_dereference(sk->sk_dst_cache); 1314 dst = rcu_dereference(sk->sk_dst_cache);
1316 if (dst) 1315 if (dst)
1317 dst_hold(dst); 1316 dst_hold(dst);
1318 rcu_read_unlock(); 1317 rcu_read_unlock();
1319 return dst; 1318 return dst;
1320 } 1319 }
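
The two route-cache accessors above differ only in the protection they expect, which is what their rcu_dereference_check()/rcu_dereference() conditions spell out: __sk_dst_get() is for callers that already own the socket (user lock or slock held) and takes no extra reference, while sk_dst_get() takes its own RCU read lock and a dst reference that the caller must drop. A hedged sketch of both call sites (the peek_route name is illustrative):

static void peek_route(struct sock *sk)
{
	struct dst_entry *dst;

	/* Caller owns the socket: no extra reference is taken. */
	lock_sock(sk);
	dst = __sk_dst_get(sk);
	/* ... use dst while the lock is held ... */
	release_sock(sk);

	/* Lockless caller: sk_dst_get() returns a referenced dst. */
	dst = sk_dst_get(sk);
	if (dst) {
		/* ... use dst ... */
		dst_release(dst);
	}
}
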
1321 1320
1322 extern void sk_reset_txq(struct sock *sk); 1321 extern void sk_reset_txq(struct sock *sk);
1323 1322
1324 static inline void dst_negative_advice(struct sock *sk) 1323 static inline void dst_negative_advice(struct sock *sk)
1325 { 1324 {
1326 struct dst_entry *ndst, *dst = __sk_dst_get(sk); 1325 struct dst_entry *ndst, *dst = __sk_dst_get(sk);
1327 1326
1328 if (dst && dst->ops->negative_advice) { 1327 if (dst && dst->ops->negative_advice) {
1329 ndst = dst->ops->negative_advice(dst); 1328 ndst = dst->ops->negative_advice(dst);
1330 1329
1331 if (ndst != dst) { 1330 if (ndst != dst) {
1332 rcu_assign_pointer(sk->sk_dst_cache, ndst); 1331 rcu_assign_pointer(sk->sk_dst_cache, ndst);
1333 sk_reset_txq(sk); 1332 sk_reset_txq(sk);
1334 } 1333 }
1335 } 1334 }
1336 } 1335 }
1337 1336
1338 static inline void 1337 static inline void
1339 __sk_dst_set(struct sock *sk, struct dst_entry *dst) 1338 __sk_dst_set(struct sock *sk, struct dst_entry *dst)
1340 { 1339 {
1341 struct dst_entry *old_dst; 1340 struct dst_entry *old_dst;
1342 1341
1343 sk_tx_queue_clear(sk); 1342 sk_tx_queue_clear(sk);
1344 /* 1343 /*
1345 * This can be called while sk is owned by the caller only, 1344 * This can be called while sk is owned by the caller only,
1346 * with no state that can be checked in a rcu_dereference_check() cond 1345 * with no state that can be checked in a rcu_dereference_check() cond
1347 */ 1346 */
1348 old_dst = rcu_dereference_raw(sk->sk_dst_cache); 1347 old_dst = rcu_dereference_raw(sk->sk_dst_cache);
1349 rcu_assign_pointer(sk->sk_dst_cache, dst); 1348 rcu_assign_pointer(sk->sk_dst_cache, dst);
1350 dst_release(old_dst); 1349 dst_release(old_dst);
1351 } 1350 }
1352 1351
1353 static inline void 1352 static inline void
1354 sk_dst_set(struct sock *sk, struct dst_entry *dst) 1353 sk_dst_set(struct sock *sk, struct dst_entry *dst)
1355 { 1354 {
1356 spin_lock(&sk->sk_dst_lock); 1355 spin_lock(&sk->sk_dst_lock);
1357 __sk_dst_set(sk, dst); 1356 __sk_dst_set(sk, dst);
1358 spin_unlock(&sk->sk_dst_lock); 1357 spin_unlock(&sk->sk_dst_lock);
1359 } 1358 }
1360 1359
1361 static inline void 1360 static inline void
1362 __sk_dst_reset(struct sock *sk) 1361 __sk_dst_reset(struct sock *sk)
1363 { 1362 {
1364 __sk_dst_set(sk, NULL); 1363 __sk_dst_set(sk, NULL);
1365 } 1364 }
1366 1365
1367 static inline void 1366 static inline void
1368 sk_dst_reset(struct sock *sk) 1367 sk_dst_reset(struct sock *sk)
1369 { 1368 {
1370 spin_lock(&sk->sk_dst_lock); 1369 spin_lock(&sk->sk_dst_lock);
1371 __sk_dst_reset(sk); 1370 __sk_dst_reset(sk);
1372 spin_unlock(&sk->sk_dst_lock); 1371 spin_unlock(&sk->sk_dst_lock);
1373 } 1372 }
1374 1373
1375 extern struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie); 1374 extern struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie);
1376 1375
1377 extern struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie); 1376 extern struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie);
1378 1377
1379 static inline int sk_can_gso(const struct sock *sk) 1378 static inline int sk_can_gso(const struct sock *sk)
1380 { 1379 {
1381 return net_gso_ok(sk->sk_route_caps, sk->sk_gso_type); 1380 return net_gso_ok(sk->sk_route_caps, sk->sk_gso_type);
1382 } 1381 }
1383 1382
1384 extern void sk_setup_caps(struct sock *sk, struct dst_entry *dst); 1383 extern void sk_setup_caps(struct sock *sk, struct dst_entry *dst);
1385 1384
1386 static inline void sk_nocaps_add(struct sock *sk, int flags) 1385 static inline void sk_nocaps_add(struct sock *sk, int flags)
1387 { 1386 {
1388 sk->sk_route_nocaps |= flags; 1387 sk->sk_route_nocaps |= flags;
1389 sk->sk_route_caps &= ~flags; 1388 sk->sk_route_caps &= ~flags;
1390 } 1389 }
1391 1390
1392 static inline int skb_do_copy_data_nocache(struct sock *sk, struct sk_buff *skb, 1391 static inline int skb_do_copy_data_nocache(struct sock *sk, struct sk_buff *skb,
1393 char __user *from, char *to, 1392 char __user *from, char *to,
1394 int copy, int offset) 1393 int copy, int offset)
1395 { 1394 {
1396 if (skb->ip_summed == CHECKSUM_NONE) { 1395 if (skb->ip_summed == CHECKSUM_NONE) {
1397 int err = 0; 1396 int err = 0;
1398 __wsum csum = csum_and_copy_from_user(from, to, copy, 0, &err); 1397 __wsum csum = csum_and_copy_from_user(from, to, copy, 0, &err);
1399 if (err) 1398 if (err)
1400 return err; 1399 return err;
1401 skb->csum = csum_block_add(skb->csum, csum, offset); 1400 skb->csum = csum_block_add(skb->csum, csum, offset);
1402 } else if (sk->sk_route_caps & NETIF_F_NOCACHE_COPY) { 1401 } else if (sk->sk_route_caps & NETIF_F_NOCACHE_COPY) {
1403 if (!access_ok(VERIFY_READ, from, copy) || 1402 if (!access_ok(VERIFY_READ, from, copy) ||
1404 __copy_from_user_nocache(to, from, copy)) 1403 __copy_from_user_nocache(to, from, copy))
1405 return -EFAULT; 1404 return -EFAULT;
1406 } else if (copy_from_user(to, from, copy)) 1405 } else if (copy_from_user(to, from, copy))
1407 return -EFAULT; 1406 return -EFAULT;
1408 1407
1409 return 0; 1408 return 0;
1410 } 1409 }
1411 1410
1412 static inline int skb_add_data_nocache(struct sock *sk, struct sk_buff *skb, 1411 static inline int skb_add_data_nocache(struct sock *sk, struct sk_buff *skb,
1413 char __user *from, int copy) 1412 char __user *from, int copy)
1414 { 1413 {
1415 int err, offset = skb->len; 1414 int err, offset = skb->len;
1416 1415
1417 err = skb_do_copy_data_nocache(sk, skb, from, skb_put(skb, copy), 1416 err = skb_do_copy_data_nocache(sk, skb, from, skb_put(skb, copy),
1418 copy, offset); 1417 copy, offset);
1419 if (err) 1418 if (err)
1420 __skb_trim(skb, offset); 1419 __skb_trim(skb, offset);
1421 1420
1422 return err; 1421 return err;
1423 } 1422 }
1424 1423
1425 static inline int skb_copy_to_page_nocache(struct sock *sk, char __user *from, 1424 static inline int skb_copy_to_page_nocache(struct sock *sk, char __user *from,
1426 struct sk_buff *skb, 1425 struct sk_buff *skb,
1427 struct page *page, 1426 struct page *page,
1428 int off, int copy) 1427 int off, int copy)
1429 { 1428 {
1430 int err; 1429 int err;
1431 1430
1432 err = skb_do_copy_data_nocache(sk, skb, from, page_address(page) + off, 1431 err = skb_do_copy_data_nocache(sk, skb, from, page_address(page) + off,
1433 copy, skb->len); 1432 copy, skb->len);
1434 if (err) 1433 if (err)
1435 return err; 1434 return err;
1436 1435
1437 skb->len += copy; 1436 skb->len += copy;
1438 skb->data_len += copy; 1437 skb->data_len += copy;
1439 skb->truesize += copy; 1438 skb->truesize += copy;
1440 sk->sk_wmem_queued += copy; 1439 sk->sk_wmem_queued += copy;
1441 sk_mem_charge(sk, copy); 1440 sk_mem_charge(sk, copy);
1442 return 0; 1441 return 0;
1443 } 1442 }
1444 1443
1445 static inline int skb_copy_to_page(struct sock *sk, char __user *from, 1444 static inline int skb_copy_to_page(struct sock *sk, char __user *from,
1446 struct sk_buff *skb, struct page *page, 1445 struct sk_buff *skb, struct page *page,
1447 int off, int copy) 1446 int off, int copy)
1448 { 1447 {
1449 if (skb->ip_summed == CHECKSUM_NONE) { 1448 if (skb->ip_summed == CHECKSUM_NONE) {
1450 int err = 0; 1449 int err = 0;
1451 __wsum csum = csum_and_copy_from_user(from, 1450 __wsum csum = csum_and_copy_from_user(from,
1452 page_address(page) + off, 1451 page_address(page) + off,
1453 copy, 0, &err); 1452 copy, 0, &err);
1454 if (err) 1453 if (err)
1455 return err; 1454 return err;
1456 skb->csum = csum_block_add(skb->csum, csum, skb->len); 1455 skb->csum = csum_block_add(skb->csum, csum, skb->len);
1457 } else if (copy_from_user(page_address(page) + off, from, copy)) 1456 } else if (copy_from_user(page_address(page) + off, from, copy))
1458 return -EFAULT; 1457 return -EFAULT;
1459 1458
1460 skb->len += copy; 1459 skb->len += copy;
1461 skb->data_len += copy; 1460 skb->data_len += copy;
1462 skb->truesize += copy; 1461 skb->truesize += copy;
1463 sk->sk_wmem_queued += copy; 1462 sk->sk_wmem_queued += copy;
1464 sk_mem_charge(sk, copy); 1463 sk_mem_charge(sk, copy);
1465 return 0; 1464 return 0;
1466 } 1465 }
1467 1466
1468 /** 1467 /**
1469 * sk_wmem_alloc_get - returns write allocations 1468 * sk_wmem_alloc_get - returns write allocations
1470 * @sk: socket 1469 * @sk: socket
1471 * 1470 *
1472 * Returns sk_wmem_alloc minus initial offset of one 1471 * Returns sk_wmem_alloc minus initial offset of one
1473 */ 1472 */
1474 static inline int sk_wmem_alloc_get(const struct sock *sk) 1473 static inline int sk_wmem_alloc_get(const struct sock *sk)
1475 { 1474 {
1476 return atomic_read(&sk->sk_wmem_alloc) - 1; 1475 return atomic_read(&sk->sk_wmem_alloc) - 1;
1477 } 1476 }
1478 1477
1479 /** 1478 /**
1480 * sk_rmem_alloc_get - returns read allocations 1479 * sk_rmem_alloc_get - returns read allocations
1481 * @sk: socket 1480 * @sk: socket
1482 * 1481 *
1483 * Returns sk_rmem_alloc 1482 * Returns sk_rmem_alloc
1484 */ 1483 */
1485 static inline int sk_rmem_alloc_get(const struct sock *sk) 1484 static inline int sk_rmem_alloc_get(const struct sock *sk)
1486 { 1485 {
1487 return atomic_read(&sk->sk_rmem_alloc); 1486 return atomic_read(&sk->sk_rmem_alloc);
1488 } 1487 }
1489 1488
1490 /** 1489 /**
1491 * sk_has_allocations - check if allocations are outstanding 1490 * sk_has_allocations - check if allocations are outstanding
1492 * @sk: socket 1491 * @sk: socket
1493 * 1492 *
1494 * Returns true if socket has write or read allocations 1493 * Returns true if socket has write or read allocations
1495 */ 1494 */
1496 static inline int sk_has_allocations(const struct sock *sk) 1495 static inline int sk_has_allocations(const struct sock *sk)
1497 { 1496 {
1498 return sk_wmem_alloc_get(sk) || sk_rmem_alloc_get(sk); 1497 return sk_wmem_alloc_get(sk) || sk_rmem_alloc_get(sk);
1499 } 1498 }
1500 1499
1501 /** 1500 /**
1502 * wq_has_sleeper - check if there are any waiting processes 1501 * wq_has_sleeper - check if there are any waiting processes
1503 * @wq: struct socket_wq 1502 * @wq: struct socket_wq
1504 * 1503 *
1505 * Returns true if socket_wq has waiting processes 1504 * Returns true if socket_wq has waiting processes
1506 * 1505 *
1507 * The purpose of the wq_has_sleeper and sock_poll_wait is to wrap the memory 1506 * The purpose of the wq_has_sleeper and sock_poll_wait is to wrap the memory
1508 * barrier call. They were added due to the race found within the tcp code. 1507 * barrier call. They were added due to the race found within the tcp code.
1509 * 1508 *
1510 * Consider following tcp code paths: 1509 * Consider following tcp code paths:
1511 * 1510 *
1512 * CPU1 CPU2 1511 * CPU1 CPU2
1513 * 1512 *
1514 * sys_select receive packet 1513 * sys_select receive packet
1515 * ... ... 1514 * ... ...
1516 * __add_wait_queue update tp->rcv_nxt 1515 * __add_wait_queue update tp->rcv_nxt
1517 * ... ... 1516 * ... ...
1518 * tp->rcv_nxt check sock_def_readable 1517 * tp->rcv_nxt check sock_def_readable
1519 * ... { 1518 * ... {
1520 * schedule rcu_read_lock(); 1519 * schedule rcu_read_lock();
1521 * wq = rcu_dereference(sk->sk_wq); 1520 * wq = rcu_dereference(sk->sk_wq);
1522 * if (wq && waitqueue_active(&wq->wait)) 1521 * if (wq && waitqueue_active(&wq->wait))
1523 * wake_up_interruptible(&wq->wait) 1522 * wake_up_interruptible(&wq->wait)
1524 * ... 1523 * ...
1525 * } 1524 * }
1526 * 1525 *
1527 * The race for tcp fires when the __add_wait_queue changes done by CPU1 stay 1526 * The race for tcp fires when the __add_wait_queue changes done by CPU1 stay
1528 * in its cache, and so does the tp->rcv_nxt update on the CPU2 side. CPU1          1527 * in its cache, and so does the tp->rcv_nxt update on the CPU2 side. CPU1
1529 * could then end up calling schedule and sleep forever if there are no more          1528 * could then end up calling schedule and sleep forever if there are no more
1530 * data on the socket. 1529 * data on the socket.
1531 * 1530 *
1532 */ 1531 */
1533 static inline bool wq_has_sleeper(struct socket_wq *wq) 1532 static inline bool wq_has_sleeper(struct socket_wq *wq)
1534 { 1533 {
1535 1534
1536 /* 1535 /*
1537 * We need to be sure we are in sync with the 1536 * We need to be sure we are in sync with the
1538 * add_wait_queue modifications to the wait queue. 1537 * add_wait_queue modifications to the wait queue.
1539 * 1538 *
1540 * This memory barrier is paired in the sock_poll_wait. 1539 * This memory barrier is paired in the sock_poll_wait.
1541 */ 1540 */
1542 smp_mb(); 1541 smp_mb();
1543 return wq && waitqueue_active(&wq->wait); 1542 return wq && waitqueue_active(&wq->wait);
1544 } 1543 }
1545 1544
1546 /** 1545 /**
1547 * sock_poll_wait - place memory barrier behind the poll_wait call. 1546 * sock_poll_wait - place memory barrier behind the poll_wait call.
1548 * @filp: file 1547 * @filp: file
1549 * @wait_address: socket wait queue 1548 * @wait_address: socket wait queue
1550 * @p: poll_table 1549 * @p: poll_table
1551 * 1550 *
1552 * See the comments in the wq_has_sleeper function. 1551 * See the comments in the wq_has_sleeper function.
1553 */ 1552 */
1554 static inline void sock_poll_wait(struct file *filp, 1553 static inline void sock_poll_wait(struct file *filp,
1555 wait_queue_head_t *wait_address, poll_table *p) 1554 wait_queue_head_t *wait_address, poll_table *p)
1556 { 1555 {
1557 if (p && wait_address) { 1556 if (p && wait_address) {
1558 poll_wait(filp, wait_address, p); 1557 poll_wait(filp, wait_address, p);
1559 /* 1558 /*
1560 * We need to be sure we are in sync with the 1559 * We need to be sure we are in sync with the
1561 * socket flags modification. 1560 * socket flags modification.
1562 * 1561 *
1563 * This memory barrier is paired in the wq_has_sleeper. 1562 * This memory barrier is paired in the wq_has_sleeper.
1564 */ 1563 */
1565 smp_mb(); 1564 smp_mb();
1566 } 1565 }
1567 } 1566 }
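
In practice the two halves of the barrier pairing described above look roughly like this: the poll side registers on the wait queue through sock_poll_wait() and only then tests socket state, while the wakeup side (a sk_data_ready-style callback) checks for sleepers with wq_has_sleeper() before waking them. Illustrative sketch only; the function names and the POLLIN mask are examples, not definitions from this header:

/* Poll side (a protocol's ->poll() implementation). */
static unsigned int example_poll(struct file *file, struct socket *sock,
				 poll_table *wait)
{
	struct sock *sk = sock->sk;
	unsigned int mask = 0;

	sock_poll_wait(file, sk_sleep(sk), wait);	/* barrier after poll_wait */
	if (!skb_queue_empty(&sk->sk_receive_queue))
		mask |= POLLIN | POLLRDNORM;
	return mask;
}

/* Wakeup side, called when new data has been queued. */
static void example_data_ready(struct sock *sk, int len)
{
	struct socket_wq *wq;

	rcu_read_lock();
	wq = rcu_dereference(sk->sk_wq);
	if (wq_has_sleeper(wq))				/* barrier before the test */
		wake_up_interruptible(&wq->wait);
	rcu_read_unlock();
}
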
1568 1567
1569 /* 1568 /*
1570 * Queue a received datagram if it will fit. Stream and sequenced 1569 * Queue a received datagram if it will fit. Stream and sequenced
1571 * protocols can't normally use this as they need to fit buffers in 1570 * protocols can't normally use this as they need to fit buffers in
1572 * and play with them. 1571 * and play with them.
1573 * 1572 *
1574 * Inlined as it's very short and called for pretty much every 1573 * Inlined as it's very short and called for pretty much every
1575 * packet ever received. 1574 * packet ever received.
1576 */ 1575 */
1577 1576
1578 static inline void skb_set_owner_w(struct sk_buff *skb, struct sock *sk) 1577 static inline void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)
1579 { 1578 {
1580 skb_orphan(skb); 1579 skb_orphan(skb);
1581 skb->sk = sk; 1580 skb->sk = sk;
1582 skb->destructor = sock_wfree; 1581 skb->destructor = sock_wfree;
1583 /* 1582 /*
1584 * We used to take a refcount on sk, but the following operation          1583 * We used to take a refcount on sk, but the following operation
1585 * is enough to guarantee sk_free() won't free this sock until          1584 * is enough to guarantee sk_free() won't free this sock until
1586 * all in-flight packets are completed 1585 * all in-flight packets are completed
1587 */ 1586 */
1588 atomic_add(skb->truesize, &sk->sk_wmem_alloc); 1587 atomic_add(skb->truesize, &sk->sk_wmem_alloc);
1589 } 1588 }
1590 1589
1591 static inline void skb_set_owner_r(struct sk_buff *skb, struct sock *sk) 1590 static inline void skb_set_owner_r(struct sk_buff *skb, struct sock *sk)
1592 { 1591 {
1593 skb_orphan(skb); 1592 skb_orphan(skb);
1594 skb->sk = sk; 1593 skb->sk = sk;
1595 skb->destructor = sock_rfree; 1594 skb->destructor = sock_rfree;
1596 atomic_add(skb->truesize, &sk->sk_rmem_alloc); 1595 atomic_add(skb->truesize, &sk->sk_rmem_alloc);
1597 sk_mem_charge(sk, skb->truesize); 1596 sk_mem_charge(sk, skb->truesize);
1598 } 1597 }
1599 1598
1600 extern void sk_reset_timer(struct sock *sk, struct timer_list* timer, 1599 extern void sk_reset_timer(struct sock *sk, struct timer_list* timer,
1601 unsigned long expires); 1600 unsigned long expires);
1602 1601
1603 extern void sk_stop_timer(struct sock *sk, struct timer_list* timer); 1602 extern void sk_stop_timer(struct sock *sk, struct timer_list* timer);
1604 1603
1605 extern int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb); 1604 extern int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb);
1606 1605
1607 extern int sock_queue_err_skb(struct sock *sk, struct sk_buff *skb); 1606 extern int sock_queue_err_skb(struct sock *sk, struct sk_buff *skb);
1608 1607
1609 /* 1608 /*
1610 * Recover an error report and clear atomically 1609 * Recover an error report and clear atomically
1611 */ 1610 */
1612 1611
1613 static inline int sock_error(struct sock *sk) 1612 static inline int sock_error(struct sock *sk)
1614 { 1613 {
1615 int err; 1614 int err;
1616 if (likely(!sk->sk_err)) 1615 if (likely(!sk->sk_err))
1617 return 0; 1616 return 0;
1618 err = xchg(&sk->sk_err, 0); 1617 err = xchg(&sk->sk_err, 0);
1619 return -err; 1618 return -err;
1620 } 1619 }
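
sock_error() is meant to be consumed exactly once per reported error, so receive and send paths typically drain it before sleeping, along the lines of:

	int err = sock_error(sk);	/* 0, or a negative errno; clears sk_err */
	if (err)
		return err;
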
1621 1620
1622 static inline unsigned long sock_wspace(struct sock *sk) 1621 static inline unsigned long sock_wspace(struct sock *sk)
1623 { 1622 {
1624 int amt = 0; 1623 int amt = 0;
1625 1624
1626 if (!(sk->sk_shutdown & SEND_SHUTDOWN)) { 1625 if (!(sk->sk_shutdown & SEND_SHUTDOWN)) {
1627 amt = sk->sk_sndbuf - atomic_read(&sk->sk_wmem_alloc); 1626 amt = sk->sk_sndbuf - atomic_read(&sk->sk_wmem_alloc);
1628 if (amt < 0) 1627 if (amt < 0)
1629 amt = 0; 1628 amt = 0;
1630 } 1629 }
1631 return amt; 1630 return amt;
1632 } 1631 }
1633 1632
1634 static inline void sk_wake_async(struct sock *sk, int how, int band) 1633 static inline void sk_wake_async(struct sock *sk, int how, int band)
1635 { 1634 {
1636 if (sock_flag(sk, SOCK_FASYNC)) 1635 if (sock_flag(sk, SOCK_FASYNC))
1637 sock_wake_async(sk->sk_socket, how, band); 1636 sock_wake_async(sk->sk_socket, how, band);
1638 } 1637 }
1639 1638
1640 #define SOCK_MIN_SNDBUF 2048 1639 #define SOCK_MIN_SNDBUF 2048
1641 /* 1640 /*
1642 * Since sk_rmem_alloc sums skb->truesize, even a small frame might need 1641 * Since sk_rmem_alloc sums skb->truesize, even a small frame might need
1643 * sizeof(sk_buff) + MTU + padding, unless the net driver performs copybreak          1642 * sizeof(sk_buff) + MTU + padding, unless the net driver performs copybreak
1644 */ 1643 */
1645 #define SOCK_MIN_RCVBUF (2048 + sizeof(struct sk_buff)) 1644 #define SOCK_MIN_RCVBUF (2048 + sizeof(struct sk_buff))
1646 1645
1647 static inline void sk_stream_moderate_sndbuf(struct sock *sk) 1646 static inline void sk_stream_moderate_sndbuf(struct sock *sk)
1648 { 1647 {
1649 if (!(sk->sk_userlocks & SOCK_SNDBUF_LOCK)) { 1648 if (!(sk->sk_userlocks & SOCK_SNDBUF_LOCK)) {
1650 sk->sk_sndbuf = min(sk->sk_sndbuf, sk->sk_wmem_queued >> 1); 1649 sk->sk_sndbuf = min(sk->sk_sndbuf, sk->sk_wmem_queued >> 1);
1651 sk->sk_sndbuf = max(sk->sk_sndbuf, SOCK_MIN_SNDBUF); 1650 sk->sk_sndbuf = max(sk->sk_sndbuf, SOCK_MIN_SNDBUF);
1652 } 1651 }
1653 } 1652 }
1654 1653
1655 struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp); 1654 struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp);
1656 1655
1657 static inline struct page *sk_stream_alloc_page(struct sock *sk) 1656 static inline struct page *sk_stream_alloc_page(struct sock *sk)
1658 { 1657 {
1659 struct page *page = NULL; 1658 struct page *page = NULL;
1660 1659
1661 page = alloc_pages(sk->sk_allocation, 0); 1660 page = alloc_pages(sk->sk_allocation, 0);
1662 if (!page) { 1661 if (!page) {
1663 sk->sk_prot->enter_memory_pressure(sk); 1662 sk->sk_prot->enter_memory_pressure(sk);
1664 sk_stream_moderate_sndbuf(sk); 1663 sk_stream_moderate_sndbuf(sk);
1665 } 1664 }
1666 return page; 1665 return page;
1667 } 1666 }
1668 1667
1669 /* 1668 /*
1670 * Default write policy as shown to user space via poll/select/SIGIO 1669 * Default write policy as shown to user space via poll/select/SIGIO
1671 */ 1670 */
1672 static inline int sock_writeable(const struct sock *sk) 1671 static inline int sock_writeable(const struct sock *sk)
1673 { 1672 {
1674 return atomic_read(&sk->sk_wmem_alloc) < (sk->sk_sndbuf >> 1); 1673 return atomic_read(&sk->sk_wmem_alloc) < (sk->sk_sndbuf >> 1);
1675 } 1674 }
1676 1675
1677 static inline gfp_t gfp_any(void) 1676 static inline gfp_t gfp_any(void)
1678 { 1677 {
1679 return in_softirq() ? GFP_ATOMIC : GFP_KERNEL; 1678 return in_softirq() ? GFP_ATOMIC : GFP_KERNEL;
1680 } 1679 }
1681 1680
1682 static inline long sock_rcvtimeo(const struct sock *sk, int noblock) 1681 static inline long sock_rcvtimeo(const struct sock *sk, int noblock)
1683 { 1682 {
1684 return noblock ? 0 : sk->sk_rcvtimeo; 1683 return noblock ? 0 : sk->sk_rcvtimeo;
1685 } 1684 }
1686 1685
1687 static inline long sock_sndtimeo(const struct sock *sk, int noblock) 1686 static inline long sock_sndtimeo(const struct sock *sk, int noblock)
1688 { 1687 {
1689 return noblock ? 0 : sk->sk_sndtimeo; 1688 return noblock ? 0 : sk->sk_sndtimeo;
1690 } 1689 }
1691 1690
1692 static inline int sock_rcvlowat(const struct sock *sk, int waitall, int len) 1691 static inline int sock_rcvlowat(const struct sock *sk, int waitall, int len)
1693 { 1692 {
1694 return (waitall ? len : min_t(int, sk->sk_rcvlowat, len)) ? : 1; 1693 return (waitall ? len : min_t(int, sk->sk_rcvlowat, len)) ? : 1;
1695 } 1694 }
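
sock_rcvlowat() uses the GNU "x ?: y" shorthand, which evaluates to x when x is non-zero and to y otherwise, so the returned receive watermark is never zero. A small sketch with the same logic written out explicitly; rcvlowat_for() is a stand-in name, not the kernel helper.

#include <stdio.h>

#define min_int(a, b) ((a) < (b) ? (a) : (b))

static int rcvlowat_for(int waitall, int rcvlowat, int len)
{
        int v = waitall ? len : min_int(rcvlowat, len);

        return v ? v : 1;   /* same meaning as the GNU "v ?: 1" extension */
}

int main(void)
{
        printf("%d\n", rcvlowat_for(0, 0, 100));   /* 1: never ask for zero bytes */
        printf("%d\n", rcvlowat_for(0, 16, 100));  /* 16 */
        printf("%d\n", rcvlowat_for(1, 16, 100));  /* 100: wait for the full request */
        return 0;
}
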
1696 1695
1697 /* Alas, with timeout socket operations are not restartable. 1696 /* Alas, with timeout socket operations are not restartable.
1698 * Compare this to poll(). 1697 * Compare this to poll().
1699 */ 1698 */
1700 static inline int sock_intr_errno(long timeo) 1699 static inline int sock_intr_errno(long timeo)
1701 { 1700 {
1702 return timeo == MAX_SCHEDULE_TIMEOUT ? -ERESTARTSYS : -EINTR; 1701 return timeo == MAX_SCHEDULE_TIMEOUT ? -ERESTARTSYS : -EINTR;
1703 } 1702 }
1704 1703
1705 extern void __sock_recv_timestamp(struct msghdr *msg, struct sock *sk, 1704 extern void __sock_recv_timestamp(struct msghdr *msg, struct sock *sk,
1706 struct sk_buff *skb); 1705 struct sk_buff *skb);
1707 1706
1708 static __inline__ void 1707 static __inline__ void
1709 sock_recv_timestamp(struct msghdr *msg, struct sock *sk, struct sk_buff *skb) 1708 sock_recv_timestamp(struct msghdr *msg, struct sock *sk, struct sk_buff *skb)
1710 { 1709 {
1711 ktime_t kt = skb->tstamp; 1710 ktime_t kt = skb->tstamp;
1712 struct skb_shared_hwtstamps *hwtstamps = skb_hwtstamps(skb); 1711 struct skb_shared_hwtstamps *hwtstamps = skb_hwtstamps(skb);
1713 1712
1714 /* 1713 /*
1715 * generate control messages if 1714 * generate control messages if
1716 * - receive time stamping in software requested (SOCK_RCVTSTAMP 1715 * - receive time stamping in software requested (SOCK_RCVTSTAMP
1717 * or SOCK_TIMESTAMPING_RX_SOFTWARE) 1716 * or SOCK_TIMESTAMPING_RX_SOFTWARE)
1718 * - software time stamp available and wanted 1717 * - software time stamp available and wanted
1719 * (SOCK_TIMESTAMPING_SOFTWARE) 1718 * (SOCK_TIMESTAMPING_SOFTWARE)
1720 * - hardware time stamps available and wanted 1719 * - hardware time stamps available and wanted
1721 * (SOCK_TIMESTAMPING_SYS_HARDWARE or 1720 * (SOCK_TIMESTAMPING_SYS_HARDWARE or
1722 * SOCK_TIMESTAMPING_RAW_HARDWARE) 1721 * SOCK_TIMESTAMPING_RAW_HARDWARE)
1723 */ 1722 */
1724 if (sock_flag(sk, SOCK_RCVTSTAMP) || 1723 if (sock_flag(sk, SOCK_RCVTSTAMP) ||
1725 sock_flag(sk, SOCK_TIMESTAMPING_RX_SOFTWARE) || 1724 sock_flag(sk, SOCK_TIMESTAMPING_RX_SOFTWARE) ||
1726 (kt.tv64 && sock_flag(sk, SOCK_TIMESTAMPING_SOFTWARE)) || 1725 (kt.tv64 && sock_flag(sk, SOCK_TIMESTAMPING_SOFTWARE)) ||
1727 (hwtstamps->hwtstamp.tv64 && 1726 (hwtstamps->hwtstamp.tv64 &&
1728 sock_flag(sk, SOCK_TIMESTAMPING_RAW_HARDWARE)) || 1727 sock_flag(sk, SOCK_TIMESTAMPING_RAW_HARDWARE)) ||
1729 (hwtstamps->syststamp.tv64 && 1728 (hwtstamps->syststamp.tv64 &&
1730 sock_flag(sk, SOCK_TIMESTAMPING_SYS_HARDWARE))) 1729 sock_flag(sk, SOCK_TIMESTAMPING_SYS_HARDWARE)))
1731 __sock_recv_timestamp(msg, sk, skb); 1730 __sock_recv_timestamp(msg, sk, skb);
1732 else 1731 else
1733 sk->sk_stamp = kt; 1732 sk->sk_stamp = kt;
1734 } 1733 }
1735 1734
1736 extern void __sock_recv_ts_and_drops(struct msghdr *msg, struct sock *sk, 1735 extern void __sock_recv_ts_and_drops(struct msghdr *msg, struct sock *sk,
1737 struct sk_buff *skb); 1736 struct sk_buff *skb);
1738 1737
1739 static inline void sock_recv_ts_and_drops(struct msghdr *msg, struct sock *sk, 1738 static inline void sock_recv_ts_and_drops(struct msghdr *msg, struct sock *sk,
1740 struct sk_buff *skb) 1739 struct sk_buff *skb)
1741 { 1740 {
1742 #define FLAGS_TS_OR_DROPS ((1UL << SOCK_RXQ_OVFL) | \ 1741 #define FLAGS_TS_OR_DROPS ((1UL << SOCK_RXQ_OVFL) | \
1743 (1UL << SOCK_RCVTSTAMP) | \ 1742 (1UL << SOCK_RCVTSTAMP) | \
1744 (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE) | \ 1743 (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE) | \
1745 (1UL << SOCK_TIMESTAMPING_SOFTWARE) | \ 1744 (1UL << SOCK_TIMESTAMPING_SOFTWARE) | \
1746 (1UL << SOCK_TIMESTAMPING_RAW_HARDWARE) | \ 1745 (1UL << SOCK_TIMESTAMPING_RAW_HARDWARE) | \
1747 (1UL << SOCK_TIMESTAMPING_SYS_HARDWARE)) 1746 (1UL << SOCK_TIMESTAMPING_SYS_HARDWARE))
1748 1747
1749 if (sk->sk_flags & FLAGS_TS_OR_DROPS) 1748 if (sk->sk_flags & FLAGS_TS_OR_DROPS)
1750 __sock_recv_ts_and_drops(msg, sk, skb); 1749 __sock_recv_ts_and_drops(msg, sk, skb);
1751 else 1750 else
1752 sk->sk_stamp = skb->tstamp; 1751 sk->sk_stamp = skb->tstamp;
1753 } 1752 }
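
sock_recv_ts_and_drops() folds every flag that forces the slow path into the single FLAGS_TS_OR_DROPS mask, so the common case costs one AND instead of several separate flag tests. A hedged userspace sketch of that one-mask dispatch; the flag names below are local to the example.

#include <stdio.h>

enum { FLAG_RXQ_OVFL, FLAG_RCVTSTAMP, FLAG_RX_SOFTWARE, FLAG_SOFTWARE };

#define SLOW_PATH_MASK  ((1UL << FLAG_RXQ_OVFL)    | \
                         (1UL << FLAG_RCVTSTAMP)   | \
                         (1UL << FLAG_RX_SOFTWARE) | \
                         (1UL << FLAG_SOFTWARE))

static void receive(unsigned long sk_flags)
{
        if (sk_flags & SLOW_PATH_MASK)
                printf("slow path: build timestamp/drop control messages\n");
        else
                printf("fast path: just record the packet timestamp\n");
}

int main(void)
{
        receive(0);                        /* fast path */
        receive(1UL << FLAG_RCVTSTAMP);    /* slow path */
        return 0;
}
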
1754 1753
1755 /** 1754 /**
1756 * sock_tx_timestamp - checks whether the outgoing packet is to be time stamped 1755 * sock_tx_timestamp - checks whether the outgoing packet is to be time stamped
1757 * @sk: socket sending this packet 1756 * @sk: socket sending this packet
1758 * @tx_flags: filled with instructions for time stamping 1757 * @tx_flags: filled with instructions for time stamping
1759 * 1758 *
1760 * Currently only depends on SOCK_TIMESTAMPING* flags. Returns error code if 1759 * Currently only depends on SOCK_TIMESTAMPING* flags. Returns error code if
1761 * parameters are invalid. 1760 * parameters are invalid.
1762 */ 1761 */
1763 extern int sock_tx_timestamp(struct sock *sk, __u8 *tx_flags); 1762 extern int sock_tx_timestamp(struct sock *sk, __u8 *tx_flags);
1764 1763
1765 /** 1764 /**
1766 * sk_eat_skb - Release a skb if it is no longer needed 1765 * sk_eat_skb - Release a skb if it is no longer needed
1767 * @sk: socket to eat this skb from 1766 * @sk: socket to eat this skb from
1768 * @skb: socket buffer to eat 1767 * @skb: socket buffer to eat
1769 * @copied_early: flag indicating whether DMA operations copied this data early 1768 * @copied_early: flag indicating whether DMA operations copied this data early
1770 * 1769 *
1771 * This routine must be called with interrupts disabled or with the socket 1770 * This routine must be called with interrupts disabled or with the socket
1772 * locked so that the sk_buff queue operation is ok. 1771 * locked so that the sk_buff queue operation is ok.
1773 */ 1772 */
1774 #ifdef CONFIG_NET_DMA 1773 #ifdef CONFIG_NET_DMA
1775 static inline void sk_eat_skb(struct sock *sk, struct sk_buff *skb, int copied_early) 1774 static inline void sk_eat_skb(struct sock *sk, struct sk_buff *skb, int copied_early)
1776 { 1775 {
1777 __skb_unlink(skb, &sk->sk_receive_queue); 1776 __skb_unlink(skb, &sk->sk_receive_queue);
1778 if (!copied_early) 1777 if (!copied_early)
1779 __kfree_skb(skb); 1778 __kfree_skb(skb);
1780 else 1779 else
1781 __skb_queue_tail(&sk->sk_async_wait_queue, skb); 1780 __skb_queue_tail(&sk->sk_async_wait_queue, skb);
1782 } 1781 }
1783 #else 1782 #else
1784 static inline void sk_eat_skb(struct sock *sk, struct sk_buff *skb, int copied_early) 1783 static inline void sk_eat_skb(struct sock *sk, struct sk_buff *skb, int copied_early)
1785 { 1784 {
1786 __skb_unlink(skb, &sk->sk_receive_queue); 1785 __skb_unlink(skb, &sk->sk_receive_queue);
1787 __kfree_skb(skb); 1786 __kfree_skb(skb);
1788 } 1787 }
1789 #endif 1788 #endif
1790 1789
1791 static inline 1790 static inline
1792 struct net *sock_net(const struct sock *sk) 1791 struct net *sock_net(const struct sock *sk)
1793 { 1792 {
1794 return read_pnet(&sk->sk_net); 1793 return read_pnet(&sk->sk_net);
1795 } 1794 }
1796 1795
1797 static inline 1796 static inline
1798 void sock_net_set(struct sock *sk, struct net *net) 1797 void sock_net_set(struct sock *sk, struct net *net)
1799 { 1798 {
1800 write_pnet(&sk->sk_net, net); 1799 write_pnet(&sk->sk_net, net);
1801 } 1800 }
1802 1801
1803 /* 1802 /*
1804 * Kernel sockets, e.g. rtnl or icmp_socket, are a part of a namespace. 1803 * Kernel sockets, e.g. rtnl or icmp_socket, are a part of a namespace.
1805 * They should not hold a reference to a namespace in order to allow 1804 * They should not hold a reference to a namespace in order to allow
1806 * that namespace to be stopped. 1805 * that namespace to be stopped.
1807 * Sockets whose namespace was changed via sk_change_net() should be released using sk_release_kernel(). 1806 * Sockets whose namespace was changed via sk_change_net() should be released using sk_release_kernel().
1808 */ 1807 */
1809 static inline void sk_change_net(struct sock *sk, struct net *net) 1808 static inline void sk_change_net(struct sock *sk, struct net *net)
1810 { 1809 {
1811 put_net(sock_net(sk)); 1810 put_net(sock_net(sk));
1812 sock_net_set(sk, hold_net(net)); 1811 sock_net_set(sk, hold_net(net));
1813 } 1812 }
1814 1813
1815 static inline struct sock *skb_steal_sock(struct sk_buff *skb) 1814 static inline struct sock *skb_steal_sock(struct sk_buff *skb)
1816 { 1815 {
1817 if (unlikely(skb->sk)) { 1816 if (unlikely(skb->sk)) {
1818 struct sock *sk = skb->sk; 1817 struct sock *sk = skb->sk;
1819 1818
1820 skb->destructor = NULL; 1819 skb->destructor = NULL;
1821 skb->sk = NULL; 1820 skb->sk = NULL;
1822 return sk; 1821 return sk;
1823 } 1822 }
1824 return NULL; 1823 return NULL;
1825 } 1824 }
1826 1825
1827 extern void sock_enable_timestamp(struct sock *sk, int flag); 1826 extern void sock_enable_timestamp(struct sock *sk, int flag);
1828 extern int sock_get_timestamp(struct sock *, struct timeval __user *); 1827 extern int sock_get_timestamp(struct sock *, struct timeval __user *);
1829 extern int sock_get_timestampns(struct sock *, struct timespec __user *); 1828 extern int sock_get_timestampns(struct sock *, struct timespec __user *);
1830 1829
1831 /* 1830 /*
1832 * Enable debug/info messages 1831 * Enable debug/info messages
1833 */ 1832 */
1834 extern int net_msg_warn; 1833 extern int net_msg_warn;
1835 #define NETDEBUG(fmt, args...) \ 1834 #define NETDEBUG(fmt, args...) \
1836 do { if (net_msg_warn) printk(fmt,##args); } while (0) 1835 do { if (net_msg_warn) printk(fmt,##args); } while (0)
1837 1836
1838 #define LIMIT_NETDEBUG(fmt, args...) \ 1837 #define LIMIT_NETDEBUG(fmt, args...) \
1839 do { if (net_msg_warn && net_ratelimit()) printk(fmt,##args); } while(0) 1838 do { if (net_msg_warn && net_ratelimit()) printk(fmt,##args); } while(0)
1840 1839
1841 extern __u32 sysctl_wmem_max; 1840 extern __u32 sysctl_wmem_max;
1842 extern __u32 sysctl_rmem_max; 1841 extern __u32 sysctl_rmem_max;
1843 1842
1844 extern void sk_init(void); 1843 extern void sk_init(void);
1845 1844
1846 extern int sysctl_optmem_max; 1845 extern int sysctl_optmem_max;
1847 1846
1848 extern __u32 sysctl_wmem_default; 1847 extern __u32 sysctl_wmem_default;
1849 extern __u32 sysctl_rmem_default; 1848 extern __u32 sysctl_rmem_default;
1850 1849
1851 #endif /* _SOCK_H */ 1850 #endif /* _SOCK_H */
1852 1851
kernel/cgroup.c
1 /* 1 /*
2 * Generic process-grouping system. 2 * Generic process-grouping system.
3 * 3 *
4 * Based originally on the cpuset system, extracted by Paul Menage 4 * Based originally on the cpuset system, extracted by Paul Menage
5 * Copyright (C) 2006 Google, Inc 5 * Copyright (C) 2006 Google, Inc
6 * 6 *
7 * Notifications support 7 * Notifications support
8 * Copyright (C) 2009 Nokia Corporation 8 * Copyright (C) 2009 Nokia Corporation
9 * Author: Kirill A. Shutemov 9 * Author: Kirill A. Shutemov
10 * 10 *
11 * Copyright notices from the original cpuset code: 11 * Copyright notices from the original cpuset code:
12 * -------------------------------------------------- 12 * --------------------------------------------------
13 * Copyright (C) 2003 BULL SA. 13 * Copyright (C) 2003 BULL SA.
14 * Copyright (C) 2004-2006 Silicon Graphics, Inc. 14 * Copyright (C) 2004-2006 Silicon Graphics, Inc.
15 * 15 *
16 * Portions derived from Patrick Mochel's sysfs code. 16 * Portions derived from Patrick Mochel's sysfs code.
17 * sysfs is Copyright (c) 2001-3 Patrick Mochel 17 * sysfs is Copyright (c) 2001-3 Patrick Mochel
18 * 18 *
19 * 2003-10-10 Written by Simon Derr. 19 * 2003-10-10 Written by Simon Derr.
20 * 2003-10-22 Updates by Stephen Hemminger. 20 * 2003-10-22 Updates by Stephen Hemminger.
21 * 2004 May-July Rework by Paul Jackson. 21 * 2004 May-July Rework by Paul Jackson.
22 * --------------------------------------------------- 22 * ---------------------------------------------------
23 * 23 *
24 * This file is subject to the terms and conditions of the GNU General Public 24 * This file is subject to the terms and conditions of the GNU General Public
25 * License. See the file COPYING in the main directory of the Linux 25 * License. See the file COPYING in the main directory of the Linux
26 * distribution for more details. 26 * distribution for more details.
27 */ 27 */
28 28
29 #include <linux/cgroup.h> 29 #include <linux/cgroup.h>
30 #include <linux/ctype.h> 30 #include <linux/ctype.h>
31 #include <linux/errno.h> 31 #include <linux/errno.h>
32 #include <linux/fs.h> 32 #include <linux/fs.h>
33 #include <linux/kernel.h> 33 #include <linux/kernel.h>
34 #include <linux/list.h> 34 #include <linux/list.h>
35 #include <linux/mm.h> 35 #include <linux/mm.h>
36 #include <linux/mutex.h> 36 #include <linux/mutex.h>
37 #include <linux/mount.h> 37 #include <linux/mount.h>
38 #include <linux/pagemap.h> 38 #include <linux/pagemap.h>
39 #include <linux/proc_fs.h> 39 #include <linux/proc_fs.h>
40 #include <linux/rcupdate.h> 40 #include <linux/rcupdate.h>
41 #include <linux/sched.h> 41 #include <linux/sched.h>
42 #include <linux/backing-dev.h> 42 #include <linux/backing-dev.h>
43 #include <linux/seq_file.h> 43 #include <linux/seq_file.h>
44 #include <linux/slab.h> 44 #include <linux/slab.h>
45 #include <linux/magic.h> 45 #include <linux/magic.h>
46 #include <linux/spinlock.h> 46 #include <linux/spinlock.h>
47 #include <linux/string.h> 47 #include <linux/string.h>
48 #include <linux/sort.h> 48 #include <linux/sort.h>
49 #include <linux/kmod.h> 49 #include <linux/kmod.h>
50 #include <linux/module.h> 50 #include <linux/module.h>
51 #include <linux/delayacct.h> 51 #include <linux/delayacct.h>
52 #include <linux/cgroupstats.h> 52 #include <linux/cgroupstats.h>
53 #include <linux/hash.h> 53 #include <linux/hash.h>
54 #include <linux/namei.h> 54 #include <linux/namei.h>
55 #include <linux/pid_namespace.h> 55 #include <linux/pid_namespace.h>
56 #include <linux/idr.h> 56 #include <linux/idr.h>
57 #include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */ 57 #include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */
58 #include <linux/eventfd.h> 58 #include <linux/eventfd.h>
59 #include <linux/poll.h> 59 #include <linux/poll.h>
60 #include <linux/flex_array.h> /* used in cgroup_attach_proc */ 60 #include <linux/flex_array.h> /* used in cgroup_attach_proc */
61 61
62 #include <asm/atomic.h> 62 #include <asm/atomic.h>
63 63
64 static DEFINE_MUTEX(cgroup_mutex); 64 static DEFINE_MUTEX(cgroup_mutex);
65 65
66 /* 66 /*
67 * Generate an array of cgroup subsystem pointers. At boot time, this is 67 * Generate an array of cgroup subsystem pointers. At boot time, this is
68 * populated up to CGROUP_BUILTIN_SUBSYS_COUNT, and modular subsystems are 68 * populated up to CGROUP_BUILTIN_SUBSYS_COUNT, and modular subsystems are
69 * registered after that. The mutable section of this array is protected by 69 * registered after that. The mutable section of this array is protected by
70 * cgroup_mutex. 70 * cgroup_mutex.
71 */ 71 */
72 #define SUBSYS(_x) &_x ## _subsys, 72 #define SUBSYS(_x) &_x ## _subsys,
73 static struct cgroup_subsys *subsys[CGROUP_SUBSYS_COUNT] = { 73 static struct cgroup_subsys *subsys[CGROUP_SUBSYS_COUNT] = {
74 #include <linux/cgroup_subsys.h> 74 #include <linux/cgroup_subsys.h>
75 }; 75 };
76 76
77 #define MAX_CGROUP_ROOT_NAMELEN 64 77 #define MAX_CGROUP_ROOT_NAMELEN 64
78 78
79 /* 79 /*
80 * A cgroupfs_root represents the root of a cgroup hierarchy, 80 * A cgroupfs_root represents the root of a cgroup hierarchy,
81 * and may be associated with a superblock to form an active 81 * and may be associated with a superblock to form an active
82 * hierarchy 82 * hierarchy
83 */ 83 */
84 struct cgroupfs_root { 84 struct cgroupfs_root {
85 struct super_block *sb; 85 struct super_block *sb;
86 86
87 /* 87 /*
88 * The bitmask of subsystems intended to be attached to this 88 * The bitmask of subsystems intended to be attached to this
89 * hierarchy 89 * hierarchy
90 */ 90 */
91 unsigned long subsys_bits; 91 unsigned long subsys_bits;
92 92
93 /* Unique id for this hierarchy. */ 93 /* Unique id for this hierarchy. */
94 int hierarchy_id; 94 int hierarchy_id;
95 95
96 /* The bitmask of subsystems currently attached to this hierarchy */ 96 /* The bitmask of subsystems currently attached to this hierarchy */
97 unsigned long actual_subsys_bits; 97 unsigned long actual_subsys_bits;
98 98
99 /* A list running through the attached subsystems */ 99 /* A list running through the attached subsystems */
100 struct list_head subsys_list; 100 struct list_head subsys_list;
101 101
102 /* The root cgroup for this hierarchy */ 102 /* The root cgroup for this hierarchy */
103 struct cgroup top_cgroup; 103 struct cgroup top_cgroup;
104 104
105 /* Tracks how many cgroups are currently defined in this hierarchy. */ 105 /* Tracks how many cgroups are currently defined in this hierarchy. */
106 int number_of_cgroups; 106 int number_of_cgroups;
107 107
108 /* A list running through the active hierarchies */ 108 /* A list running through the active hierarchies */
109 struct list_head root_list; 109 struct list_head root_list;
110 110
111 /* Hierarchy-specific flags */ 111 /* Hierarchy-specific flags */
112 unsigned long flags; 112 unsigned long flags;
113 113
114 /* The path to use for release notifications. */ 114 /* The path to use for release notifications. */
115 char release_agent_path[PATH_MAX]; 115 char release_agent_path[PATH_MAX];
116 116
117 /* The name for this hierarchy - may be empty */ 117 /* The name for this hierarchy - may be empty */
118 char name[MAX_CGROUP_ROOT_NAMELEN]; 118 char name[MAX_CGROUP_ROOT_NAMELEN];
119 }; 119 };
120 120
121 /* 121 /*
122 * The "rootnode" hierarchy is the "dummy hierarchy", reserved for the 122 * The "rootnode" hierarchy is the "dummy hierarchy", reserved for the
123 * subsystems that are otherwise unattached - it never has more than a 123 * subsystems that are otherwise unattached - it never has more than a
124 * single cgroup, and all tasks are part of that cgroup. 124 * single cgroup, and all tasks are part of that cgroup.
125 */ 125 */
126 static struct cgroupfs_root rootnode; 126 static struct cgroupfs_root rootnode;
127 127
128 /* 128 /*
129 * CSS ID -- ID per subsys's Cgroup Subsys State(CSS). used only when 129 * CSS ID -- ID per subsys's Cgroup Subsys State(CSS). used only when
130 * cgroup_subsys->use_id != 0. 130 * cgroup_subsys->use_id != 0.
131 */ 131 */
132 #define CSS_ID_MAX (65535) 132 #define CSS_ID_MAX (65535)
133 struct css_id { 133 struct css_id {
134 /* 134 /*
135 * The css to which this ID points. This pointer is set to a valid value 135 * The css to which this ID points. This pointer is set to a valid value
136 * after the cgroup is populated. If the cgroup is removed, this will be NULL. 136 * after the cgroup is populated. If the cgroup is removed, this will be NULL.
137 * This pointer is expected to be RCU-safe because destroy() 137 * This pointer is expected to be RCU-safe because destroy()
138 * is called after synchronize_rcu(). But for safe use, css_is_removed() 138 * is called after synchronize_rcu(). But for safe use, css_is_removed()
139 * or css_tryget() should be used to avoid races. 139 * or css_tryget() should be used to avoid races.
140 */ 140 */
141 struct cgroup_subsys_state __rcu *css; 141 struct cgroup_subsys_state __rcu *css;
142 /* 142 /*
143 * ID of this css. 143 * ID of this css.
144 */ 144 */
145 unsigned short id; 145 unsigned short id;
146 /* 146 /*
147 * Depth in the hierarchy to which this ID belongs. 147 * Depth in the hierarchy to which this ID belongs.
148 */ 148 */
149 unsigned short depth; 149 unsigned short depth;
150 /* 150 /*
151 * ID is freed by RCU. (and lookup routine is RCU safe.) 151 * ID is freed by RCU. (and lookup routine is RCU safe.)
152 */ 152 */
153 struct rcu_head rcu_head; 153 struct rcu_head rcu_head;
154 /* 154 /*
155 * Hierarchy this CSS ID belongs to. 155 * Hierarchy this CSS ID belongs to.
156 */ 156 */
157 unsigned short stack[0]; /* Array of Length (depth+1) */ 157 unsigned short stack[0]; /* Array of Length (depth+1) */
158 }; 158 };
159 159
160 /* 160 /*
161 * cgroup_event represents events which userspace wants to receive. 161 * cgroup_event represents events which userspace wants to receive.
162 */ 162 */
163 struct cgroup_event { 163 struct cgroup_event {
164 /* 164 /*
165 * Cgroup which the event belongs to. 165 * Cgroup which the event belongs to.
166 */ 166 */
167 struct cgroup *cgrp; 167 struct cgroup *cgrp;
168 /* 168 /*
169 * Control file with which the event is associated. 169 * Control file with which the event is associated.
170 */ 170 */
171 struct cftype *cft; 171 struct cftype *cft;
172 /* 172 /*
173 * eventfd to signal userspace about the event. 173 * eventfd to signal userspace about the event.
174 */ 174 */
175 struct eventfd_ctx *eventfd; 175 struct eventfd_ctx *eventfd;
176 /* 176 /*
177 * Each of these is stored in a list by the cgroup. 177 * Each of these is stored in a list by the cgroup.
178 */ 178 */
179 struct list_head list; 179 struct list_head list;
180 /* 180 /*
181 * All fields below are needed to unregister the event when 181 * All fields below are needed to unregister the event when
182 * userspace closes the eventfd. 182 * userspace closes the eventfd.
183 */ 183 */
184 poll_table pt; 184 poll_table pt;
185 wait_queue_head_t *wqh; 185 wait_queue_head_t *wqh;
186 wait_queue_t wait; 186 wait_queue_t wait;
187 struct work_struct remove; 187 struct work_struct remove;
188 }; 188 };
189 189
190 /* The list of hierarchy roots */ 190 /* The list of hierarchy roots */
191 191
192 static LIST_HEAD(roots); 192 static LIST_HEAD(roots);
193 static int root_count; 193 static int root_count;
194 194
195 static DEFINE_IDA(hierarchy_ida); 195 static DEFINE_IDA(hierarchy_ida);
196 static int next_hierarchy_id; 196 static int next_hierarchy_id;
197 static DEFINE_SPINLOCK(hierarchy_id_lock); 197 static DEFINE_SPINLOCK(hierarchy_id_lock);
198 198
199 /* dummytop is a shorthand for the dummy hierarchy's top cgroup */ 199 /* dummytop is a shorthand for the dummy hierarchy's top cgroup */
200 #define dummytop (&rootnode.top_cgroup) 200 #define dummytop (&rootnode.top_cgroup)
201 201
202 /* This flag indicates whether tasks in the fork and exit paths should 202 /* This flag indicates whether tasks in the fork and exit paths should
203 * check for fork/exit handlers to call. This avoids us having to do 203 * check for fork/exit handlers to call. This avoids us having to do
204 * extra work in the fork/exit path if none of the subsystems need to 204 * extra work in the fork/exit path if none of the subsystems need to
205 * be called. 205 * be called.
206 */ 206 */
207 static int need_forkexit_callback __read_mostly; 207 static int need_forkexit_callback __read_mostly;
208 208
209 #ifdef CONFIG_PROVE_LOCKING 209 #ifdef CONFIG_PROVE_LOCKING
210 int cgroup_lock_is_held(void) 210 int cgroup_lock_is_held(void)
211 { 211 {
212 return lockdep_is_held(&cgroup_mutex); 212 return lockdep_is_held(&cgroup_mutex);
213 } 213 }
214 #else /* #ifdef CONFIG_PROVE_LOCKING */ 214 #else /* #ifdef CONFIG_PROVE_LOCKING */
215 int cgroup_lock_is_held(void) 215 int cgroup_lock_is_held(void)
216 { 216 {
217 return mutex_is_locked(&cgroup_mutex); 217 return mutex_is_locked(&cgroup_mutex);
218 } 218 }
219 #endif /* #else #ifdef CONFIG_PROVE_LOCKING */ 219 #endif /* #else #ifdef CONFIG_PROVE_LOCKING */
220 220
221 EXPORT_SYMBOL_GPL(cgroup_lock_is_held); 221 EXPORT_SYMBOL_GPL(cgroup_lock_is_held);
222 222
223 /* convenient tests for these bits */ 223 /* convenient tests for these bits */
224 inline int cgroup_is_removed(const struct cgroup *cgrp) 224 inline int cgroup_is_removed(const struct cgroup *cgrp)
225 { 225 {
226 return test_bit(CGRP_REMOVED, &cgrp->flags); 226 return test_bit(CGRP_REMOVED, &cgrp->flags);
227 } 227 }
228 228
229 /* bits in struct cgroupfs_root flags field */ 229 /* bits in struct cgroupfs_root flags field */
230 enum { 230 enum {
231 ROOT_NOPREFIX, /* mounted subsystems have no named prefix */ 231 ROOT_NOPREFIX, /* mounted subsystems have no named prefix */
232 }; 232 };
233 233
234 static int cgroup_is_releasable(const struct cgroup *cgrp) 234 static int cgroup_is_releasable(const struct cgroup *cgrp)
235 { 235 {
236 const int bits = 236 const int bits =
237 (1 << CGRP_RELEASABLE) | 237 (1 << CGRP_RELEASABLE) |
238 (1 << CGRP_NOTIFY_ON_RELEASE); 238 (1 << CGRP_NOTIFY_ON_RELEASE);
239 return (cgrp->flags & bits) == bits; 239 return (cgrp->flags & bits) == bits;
240 } 240 }
241 241
242 static int notify_on_release(const struct cgroup *cgrp) 242 static int notify_on_release(const struct cgroup *cgrp)
243 { 243 {
244 return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); 244 return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
245 } 245 }
246 246
247 static int clone_children(const struct cgroup *cgrp) 247 static int clone_children(const struct cgroup *cgrp)
248 { 248 {
249 return test_bit(CGRP_CLONE_CHILDREN, &cgrp->flags); 249 return test_bit(CGRP_CLONE_CHILDREN, &cgrp->flags);
250 } 250 }
251 251
252 /* 252 /*
253 * for_each_subsys() allows you to iterate on each subsystem attached to 253 * for_each_subsys() allows you to iterate on each subsystem attached to
254 * an active hierarchy 254 * an active hierarchy
255 */ 255 */
256 #define for_each_subsys(_root, _ss) \ 256 #define for_each_subsys(_root, _ss) \
257 list_for_each_entry(_ss, &_root->subsys_list, sibling) 257 list_for_each_entry(_ss, &_root->subsys_list, sibling)
258 258
259 /* for_each_active_root() allows you to iterate across the active hierarchies */ 259 /* for_each_active_root() allows you to iterate across the active hierarchies */
260 #define for_each_active_root(_root) \ 260 #define for_each_active_root(_root) \
261 list_for_each_entry(_root, &roots, root_list) 261 list_for_each_entry(_root, &roots, root_list)
262 262
263 /* the list of cgroups eligible for automatic release. Protected by 263 /* the list of cgroups eligible for automatic release. Protected by
264 * release_list_lock */ 264 * release_list_lock */
265 static LIST_HEAD(release_list); 265 static LIST_HEAD(release_list);
266 static DEFINE_SPINLOCK(release_list_lock); 266 static DEFINE_SPINLOCK(release_list_lock);
267 static void cgroup_release_agent(struct work_struct *work); 267 static void cgroup_release_agent(struct work_struct *work);
268 static DECLARE_WORK(release_agent_work, cgroup_release_agent); 268 static DECLARE_WORK(release_agent_work, cgroup_release_agent);
269 static void check_for_release(struct cgroup *cgrp); 269 static void check_for_release(struct cgroup *cgrp);
270 270
271 /* Link structure for associating css_set objects with cgroups */ 271 /* Link structure for associating css_set objects with cgroups */
272 struct cg_cgroup_link { 272 struct cg_cgroup_link {
273 /* 273 /*
274 * List running through cg_cgroup_links associated with a 274 * List running through cg_cgroup_links associated with a
275 * cgroup, anchored on cgroup->css_sets 275 * cgroup, anchored on cgroup->css_sets
276 */ 276 */
277 struct list_head cgrp_link_list; 277 struct list_head cgrp_link_list;
278 struct cgroup *cgrp; 278 struct cgroup *cgrp;
279 /* 279 /*
280 * List running through cg_cgroup_links pointing at a 280 * List running through cg_cgroup_links pointing at a
281 * single css_set object, anchored on css_set->cg_links 281 * single css_set object, anchored on css_set->cg_links
282 */ 282 */
283 struct list_head cg_link_list; 283 struct list_head cg_link_list;
284 struct css_set *cg; 284 struct css_set *cg;
285 }; 285 };
286 286
287 /* The default css_set - used by init and its children prior to any 287 /* The default css_set - used by init and its children prior to any
288 * hierarchies being mounted. It contains a pointer to the root state 288 * hierarchies being mounted. It contains a pointer to the root state
289 * for each subsystem. Also used to anchor the list of css_sets. Not 289 * for each subsystem. Also used to anchor the list of css_sets. Not
290 * reference-counted, to improve performance when child cgroups 290 * reference-counted, to improve performance when child cgroups
291 * haven't been created. 291 * haven't been created.
292 */ 292 */
293 293
294 static struct css_set init_css_set; 294 static struct css_set init_css_set;
295 static struct cg_cgroup_link init_css_set_link; 295 static struct cg_cgroup_link init_css_set_link;
296 296
297 static int cgroup_init_idr(struct cgroup_subsys *ss, 297 static int cgroup_init_idr(struct cgroup_subsys *ss,
298 struct cgroup_subsys_state *css); 298 struct cgroup_subsys_state *css);
299 299
300 /* css_set_lock protects the list of css_set objects, and the 300 /* css_set_lock protects the list of css_set objects, and the
301 * chain of tasks off each css_set. Nests outside task->alloc_lock 301 * chain of tasks off each css_set. Nests outside task->alloc_lock
302 * due to cgroup_iter_start() */ 302 * due to cgroup_iter_start() */
303 static DEFINE_RWLOCK(css_set_lock); 303 static DEFINE_RWLOCK(css_set_lock);
304 static int css_set_count; 304 static int css_set_count;
305 305
306 /* 306 /*
307 * hash table for cgroup groups. This improves the performance of finding 307 * hash table for cgroup groups. This improves the performance of finding
308 * an existing css_set. This hash doesn't (currently) take into 308 * an existing css_set. This hash doesn't (currently) take into
309 * account cgroups in empty hierarchies. 309 * account cgroups in empty hierarchies.
310 */ 310 */
311 #define CSS_SET_HASH_BITS 7 311 #define CSS_SET_HASH_BITS 7
312 #define CSS_SET_TABLE_SIZE (1 << CSS_SET_HASH_BITS) 312 #define CSS_SET_TABLE_SIZE (1 << CSS_SET_HASH_BITS)
313 static struct hlist_head css_set_table[CSS_SET_TABLE_SIZE]; 313 static struct hlist_head css_set_table[CSS_SET_TABLE_SIZE];
314 314
315 static struct hlist_head *css_set_hash(struct cgroup_subsys_state *css[]) 315 static struct hlist_head *css_set_hash(struct cgroup_subsys_state *css[])
316 { 316 {
317 int i; 317 int i;
318 int index; 318 int index;
319 unsigned long tmp = 0UL; 319 unsigned long tmp = 0UL;
320 320
321 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) 321 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++)
322 tmp += (unsigned long)css[i]; 322 tmp += (unsigned long)css[i];
323 tmp = (tmp >> 16) ^ tmp; 323 tmp = (tmp >> 16) ^ tmp;
324 324
325 index = hash_long(tmp, CSS_SET_HASH_BITS); 325 index = hash_long(tmp, CSS_SET_HASH_BITS);
326 326
327 return &css_set_table[index]; 327 return &css_set_table[index];
328 } 328 }
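
css_set_hash() sums the subsystem state pointers, folds the sum, and maps it onto one of the 1 << CSS_SET_HASH_BITS buckets. A self-contained sketch of the same bucket selection follows; the kernel's hash_long() is approximated here with a simple multiplicative hash, so the exact bucket values differ, but the idea is the same.

#include <stdio.h>

#define HASH_BITS   7
#define TABLE_SIZE  (1u << HASH_BITS)

static unsigned int hash_ptrs(void *const ptrs[], int n)
{
        unsigned long tmp = 0;
        int i;

        for (i = 0; i < n; i++)
                tmp += (unsigned long)ptrs[i];
        tmp = (tmp >> 16) ^ tmp;

        /* multiplicative hash on the low 32 bits, keep the top HASH_BITS bits */
        return ((unsigned int)tmp * 0x9e370001u) >> (32 - HASH_BITS);
}

int main(void)
{
        int a, b, c;
        void *state[] = { &a, &b, &c, 0 };   /* stand-ins for css pointers */

        printf("bucket %u of %u\n", hash_ptrs(state, 4), TABLE_SIZE);
        return 0;
}
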
329 329
330 /* We don't maintain the lists running through each css_set to its 330 /* We don't maintain the lists running through each css_set to its
331 * task until after the first call to cgroup_iter_start(). This 331 * task until after the first call to cgroup_iter_start(). This
332 * reduces the fork()/exit() overhead for people who have cgroups 332 * reduces the fork()/exit() overhead for people who have cgroups
333 * compiled into their kernel but not actually in use */ 333 * compiled into their kernel but not actually in use */
334 static int use_task_css_set_links __read_mostly; 334 static int use_task_css_set_links __read_mostly;
335 335
336 static void __put_css_set(struct css_set *cg, int taskexit) 336 static void __put_css_set(struct css_set *cg, int taskexit)
337 { 337 {
338 struct cg_cgroup_link *link; 338 struct cg_cgroup_link *link;
339 struct cg_cgroup_link *saved_link; 339 struct cg_cgroup_link *saved_link;
340 /* 340 /*
341 * Ensure that the refcount doesn't hit zero while any readers 341 * Ensure that the refcount doesn't hit zero while any readers
342 * can see it. Similar to atomic_dec_and_lock(), but for an 342 * can see it. Similar to atomic_dec_and_lock(), but for an
343 * rwlock 343 * rwlock
344 */ 344 */
345 if (atomic_add_unless(&cg->refcount, -1, 1)) 345 if (atomic_add_unless(&cg->refcount, -1, 1))
346 return; 346 return;
347 write_lock(&css_set_lock); 347 write_lock(&css_set_lock);
348 if (!atomic_dec_and_test(&cg->refcount)) { 348 if (!atomic_dec_and_test(&cg->refcount)) {
349 write_unlock(&css_set_lock); 349 write_unlock(&css_set_lock);
350 return; 350 return;
351 } 351 }
352 352
353 /* This css_set is dead. unlink it and release cgroup refcounts */ 353 /* This css_set is dead. unlink it and release cgroup refcounts */
354 hlist_del(&cg->hlist); 354 hlist_del(&cg->hlist);
355 css_set_count--; 355 css_set_count--;
356 356
357 list_for_each_entry_safe(link, saved_link, &cg->cg_links, 357 list_for_each_entry_safe(link, saved_link, &cg->cg_links,
358 cg_link_list) { 358 cg_link_list) {
359 struct cgroup *cgrp = link->cgrp; 359 struct cgroup *cgrp = link->cgrp;
360 list_del(&link->cg_link_list); 360 list_del(&link->cg_link_list);
361 list_del(&link->cgrp_link_list); 361 list_del(&link->cgrp_link_list);
362 if (atomic_dec_and_test(&cgrp->count) && 362 if (atomic_dec_and_test(&cgrp->count) &&
363 notify_on_release(cgrp)) { 363 notify_on_release(cgrp)) {
364 if (taskexit) 364 if (taskexit)
365 set_bit(CGRP_RELEASABLE, &cgrp->flags); 365 set_bit(CGRP_RELEASABLE, &cgrp->flags);
366 check_for_release(cgrp); 366 check_for_release(cgrp);
367 } 367 }
368 368
369 kfree(link); 369 kfree(link);
370 } 370 }
371 371
372 write_unlock(&css_set_lock); 372 write_unlock(&css_set_lock);
373 kfree_rcu(cg, rcu_head); 373 kfree_rcu(cg, rcu_head);
374 } 374 }
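
__put_css_set() follows the atomic_dec_and_lock() idea: a common put decrements the refcount without css_set_lock via atomic_add_unless(), and only a potentially final put takes the lock before the count is allowed to reach zero, so lock holders never observe a half-torn-down css_set. A minimal userspace sketch of that pattern, with C11 atomics and a pthread mutex standing in for css_set_lock; all names are illustrative.

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

static pthread_mutex_t table_lock = PTHREAD_MUTEX_INITIALIZER;

struct obj {
        _Atomic int refcount;
};

/* Add delta to *v unless *v already equals "unless"; returns 1 if it changed it. */
static int add_unless(_Atomic int *v, int delta, int unless)
{
        int old = atomic_load(v);

        while (old != unless) {
                if (atomic_compare_exchange_weak(v, &old, old + delta))
                        return 1;
        }
        return 0;
}

static void put_obj(struct obj *o)
{
        /* Fast path: plenty of references left, no lock taken. */
        if (add_unless(&o->refcount, -1, 1))
                return;

        /* Slow path: we may be the last user; take the lock first. */
        pthread_mutex_lock(&table_lock);
        if (atomic_fetch_sub(&o->refcount, 1) - 1 == 0)
                printf("last reference dropped: unhash and free under the lock\n");
        pthread_mutex_unlock(&table_lock);
}

int main(void)
{
        struct obj o = { 2 };

        put_obj(&o);    /* fast path */
        put_obj(&o);    /* slow path, final put */
        return 0;
}
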
375 375
376 /* 376 /*
377 * refcounted get/put for css_set objects 377 * refcounted get/put for css_set objects
378 */ 378 */
379 static inline void get_css_set(struct css_set *cg) 379 static inline void get_css_set(struct css_set *cg)
380 { 380 {
381 atomic_inc(&cg->refcount); 381 atomic_inc(&cg->refcount);
382 } 382 }
383 383
384 static inline void put_css_set(struct css_set *cg) 384 static inline void put_css_set(struct css_set *cg)
385 { 385 {
386 __put_css_set(cg, 0); 386 __put_css_set(cg, 0);
387 } 387 }
388 388
389 static inline void put_css_set_taskexit(struct css_set *cg) 389 static inline void put_css_set_taskexit(struct css_set *cg)
390 { 390 {
391 __put_css_set(cg, 1); 391 __put_css_set(cg, 1);
392 } 392 }
393 393
394 /* 394 /*
395 * compare_css_sets - helper function for find_existing_css_set(). 395 * compare_css_sets - helper function for find_existing_css_set().
396 * @cg: candidate css_set being tested 396 * @cg: candidate css_set being tested
397 * @old_cg: existing css_set for a task 397 * @old_cg: existing css_set for a task
398 * @new_cgrp: cgroup that's being entered by the task 398 * @new_cgrp: cgroup that's being entered by the task
399 * @template: desired set of css pointers in css_set (pre-calculated) 399 * @template: desired set of css pointers in css_set (pre-calculated)
400 * 400 *
401 * Returns true if "cg" matches "old_cg" except for the hierarchy 401 * Returns true if "cg" matches "old_cg" except for the hierarchy
402 * which "new_cgrp" belongs to, for which it should match "new_cgrp". 402 * which "new_cgrp" belongs to, for which it should match "new_cgrp".
403 */ 403 */
404 static bool compare_css_sets(struct css_set *cg, 404 static bool compare_css_sets(struct css_set *cg,
405 struct css_set *old_cg, 405 struct css_set *old_cg,
406 struct cgroup *new_cgrp, 406 struct cgroup *new_cgrp,
407 struct cgroup_subsys_state *template[]) 407 struct cgroup_subsys_state *template[])
408 { 408 {
409 struct list_head *l1, *l2; 409 struct list_head *l1, *l2;
410 410
411 if (memcmp(template, cg->subsys, sizeof(cg->subsys))) { 411 if (memcmp(template, cg->subsys, sizeof(cg->subsys))) {
412 /* Not all subsystems matched */ 412 /* Not all subsystems matched */
413 return false; 413 return false;
414 } 414 }
415 415
416 /* 416 /*
417 * Compare cgroup pointers in order to distinguish between 417 * Compare cgroup pointers in order to distinguish between
418 * different cgroups in hierarchies with no subsystems. We 418 * different cgroups in hierarchies with no subsystems. We
419 * could get by with just this check alone (and skip the 419 * could get by with just this check alone (and skip the
420 * memcmp above) but on most setups the memcmp check will 420 * memcmp above) but on most setups the memcmp check will
421 * avoid the need for this more expensive check on almost all 421 * avoid the need for this more expensive check on almost all
422 * candidates. 422 * candidates.
423 */ 423 */
424 424
425 l1 = &cg->cg_links; 425 l1 = &cg->cg_links;
426 l2 = &old_cg->cg_links; 426 l2 = &old_cg->cg_links;
427 while (1) { 427 while (1) {
428 struct cg_cgroup_link *cgl1, *cgl2; 428 struct cg_cgroup_link *cgl1, *cgl2;
429 struct cgroup *cg1, *cg2; 429 struct cgroup *cg1, *cg2;
430 430
431 l1 = l1->next; 431 l1 = l1->next;
432 l2 = l2->next; 432 l2 = l2->next;
433 /* See if we reached the end - both lists are equal length. */ 433 /* See if we reached the end - both lists are equal length. */
434 if (l1 == &cg->cg_links) { 434 if (l1 == &cg->cg_links) {
435 BUG_ON(l2 != &old_cg->cg_links); 435 BUG_ON(l2 != &old_cg->cg_links);
436 break; 436 break;
437 } else { 437 } else {
438 BUG_ON(l2 == &old_cg->cg_links); 438 BUG_ON(l2 == &old_cg->cg_links);
439 } 439 }
440 /* Locate the cgroups associated with these links. */ 440 /* Locate the cgroups associated with these links. */
441 cgl1 = list_entry(l1, struct cg_cgroup_link, cg_link_list); 441 cgl1 = list_entry(l1, struct cg_cgroup_link, cg_link_list);
442 cgl2 = list_entry(l2, struct cg_cgroup_link, cg_link_list); 442 cgl2 = list_entry(l2, struct cg_cgroup_link, cg_link_list);
443 cg1 = cgl1->cgrp; 443 cg1 = cgl1->cgrp;
444 cg2 = cgl2->cgrp; 444 cg2 = cgl2->cgrp;
445 /* Hierarchies should be linked in the same order. */ 445 /* Hierarchies should be linked in the same order. */
446 BUG_ON(cg1->root != cg2->root); 446 BUG_ON(cg1->root != cg2->root);
447 447
448 /* 448 /*
449 * If this hierarchy is the hierarchy of the cgroup 449 * If this hierarchy is the hierarchy of the cgroup
450 * that's changing, then we need to check that this 450 * that's changing, then we need to check that this
451 * css_set points to the new cgroup; if it's any other 451 * css_set points to the new cgroup; if it's any other
452 * hierarchy, then this css_set should point to the 452 * hierarchy, then this css_set should point to the
453 * same cgroup as the old css_set. 453 * same cgroup as the old css_set.
454 */ 454 */
455 if (cg1->root == new_cgrp->root) { 455 if (cg1->root == new_cgrp->root) {
456 if (cg1 != new_cgrp) 456 if (cg1 != new_cgrp)
457 return false; 457 return false;
458 } else { 458 } else {
459 if (cg1 != cg2) 459 if (cg1 != cg2)
460 return false; 460 return false;
461 } 461 }
462 } 462 }
463 return true; 463 return true;
464 } 464 }
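
compare_css_sets() is a two-stage test: the cheap memcmp() over the per-subsystem pointer array rejects almost every candidate, and only a candidate that passes it pays for the per-hierarchy walk, where the hierarchy being changed must point at the new cgroup and every other hierarchy must match the old css_set. A simplified sketch of the same decision, with the link-list walk reduced to an array purely for illustration.

#include <stdbool.h>
#include <stdio.h>
#include <string.h>

#define NSUBSYS 4
#define NROOTS  2

struct set {
        void *subsys[NSUBSYS];  /* one state pointer per subsystem */
        int   cgrp[NROOTS];     /* which cgroup this set uses, per hierarchy */
};

static bool sets_match(const struct set *cand, const struct set *old,
                       int changing_root, int new_cgrp)
{
        int i;

        /* Stage 1: cheap rejection of almost all candidates. */
        if (memcmp(cand->subsys, old->subsys, sizeof(old->subsys)))
                return false;

        /* Stage 2: per-hierarchy check, walking both sets in lockstep. */
        for (i = 0; i < NROOTS; i++) {
                int want = (i == changing_root) ? new_cgrp : old->cgrp[i];

                if (cand->cgrp[i] != want)
                        return false;
        }
        return true;
}

int main(void)
{
        static int a, b;
        struct set old  = { { &a, &b, NULL, NULL }, { 1, 7 } };
        struct set cand = { { &a, &b, NULL, NULL }, { 1, 9 } };

        /* The task is entering cgroup 9 in hierarchy 1; the rest must match. */
        printf("%s\n", sets_match(&cand, &old, 1, 9) ? "match" : "no match");
        return 0;
}
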
465 465
466 /* 466 /*
467 * find_existing_css_set() is a helper for 467 * find_existing_css_set() is a helper for
468 * find_css_set(), and checks to see whether an existing 468 * find_css_set(), and checks to see whether an existing
469 * css_set is suitable. 469 * css_set is suitable.
470 * 470 *
471 * oldcg: the cgroup group that we're using before the cgroup 471 * oldcg: the cgroup group that we're using before the cgroup
472 * transition 472 * transition
473 * 473 *
474 * cgrp: the cgroup that we're moving into 474 * cgrp: the cgroup that we're moving into
475 * 475 *
476 * template: location in which to build the desired set of subsystem 476 * template: location in which to build the desired set of subsystem
477 * state objects for the new cgroup group 477 * state objects for the new cgroup group
478 */ 478 */
479 static struct css_set *find_existing_css_set( 479 static struct css_set *find_existing_css_set(
480 struct css_set *oldcg, 480 struct css_set *oldcg,
481 struct cgroup *cgrp, 481 struct cgroup *cgrp,
482 struct cgroup_subsys_state *template[]) 482 struct cgroup_subsys_state *template[])
483 { 483 {
484 int i; 484 int i;
485 struct cgroupfs_root *root = cgrp->root; 485 struct cgroupfs_root *root = cgrp->root;
486 struct hlist_head *hhead; 486 struct hlist_head *hhead;
487 struct hlist_node *node; 487 struct hlist_node *node;
488 struct css_set *cg; 488 struct css_set *cg;
489 489
490 /* 490 /*
491 * Build the set of subsystem state objects that we want to see in the 491 * Build the set of subsystem state objects that we want to see in the
492 * new css_set. while subsystems can change globally, the entries here 492 * new css_set. while subsystems can change globally, the entries here
493 * won't change, so no need for locking. 493 * won't change, so no need for locking.
494 */ 494 */
495 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 495 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
496 if (root->subsys_bits & (1UL << i)) { 496 if (root->subsys_bits & (1UL << i)) {
497 /* Subsystem is in this hierarchy. So we want 497 /* Subsystem is in this hierarchy. So we want
498 * the subsystem state from the new 498 * the subsystem state from the new
499 * cgroup */ 499 * cgroup */
500 template[i] = cgrp->subsys[i]; 500 template[i] = cgrp->subsys[i];
501 } else { 501 } else {
502 /* Subsystem is not in this hierarchy, so we 502 /* Subsystem is not in this hierarchy, so we
503 * don't want to change the subsystem state */ 503 * don't want to change the subsystem state */
504 template[i] = oldcg->subsys[i]; 504 template[i] = oldcg->subsys[i];
505 } 505 }
506 } 506 }
507 507
508 hhead = css_set_hash(template); 508 hhead = css_set_hash(template);
509 hlist_for_each_entry(cg, node, hhead, hlist) { 509 hlist_for_each_entry(cg, node, hhead, hlist) {
510 if (!compare_css_sets(cg, oldcg, cgrp, template)) 510 if (!compare_css_sets(cg, oldcg, cgrp, template))
511 continue; 511 continue;
512 512
513 /* This css_set matches what we need */ 513 /* This css_set matches what we need */
514 return cg; 514 return cg;
515 } 515 }
516 516
517 /* No existing cgroup group matched */ 517 /* No existing cgroup group matched */
518 return NULL; 518 return NULL;
519 } 519 }
520 520
521 static void free_cg_links(struct list_head *tmp) 521 static void free_cg_links(struct list_head *tmp)
522 { 522 {
523 struct cg_cgroup_link *link; 523 struct cg_cgroup_link *link;
524 struct cg_cgroup_link *saved_link; 524 struct cg_cgroup_link *saved_link;
525 525
526 list_for_each_entry_safe(link, saved_link, tmp, cgrp_link_list) { 526 list_for_each_entry_safe(link, saved_link, tmp, cgrp_link_list) {
527 list_del(&link->cgrp_link_list); 527 list_del(&link->cgrp_link_list);
528 kfree(link); 528 kfree(link);
529 } 529 }
530 } 530 }
531 531
532 /* 532 /*
533 * allocate_cg_links() allocates "count" cg_cgroup_link structures 533 * allocate_cg_links() allocates "count" cg_cgroup_link structures
534 * and chains them on tmp through their cgrp_link_list fields. Returns 0 on 534 * and chains them on tmp through their cgrp_link_list fields. Returns 0 on
535 * success or a negative error code. 535 * success or a negative error code.
536 */ 536 */
537 static int allocate_cg_links(int count, struct list_head *tmp) 537 static int allocate_cg_links(int count, struct list_head *tmp)
538 { 538 {
539 struct cg_cgroup_link *link; 539 struct cg_cgroup_link *link;
540 int i; 540 int i;
541 INIT_LIST_HEAD(tmp); 541 INIT_LIST_HEAD(tmp);
542 for (i = 0; i < count; i++) { 542 for (i = 0; i < count; i++) {
543 link = kmalloc(sizeof(*link), GFP_KERNEL); 543 link = kmalloc(sizeof(*link), GFP_KERNEL);
544 if (!link) { 544 if (!link) {
545 free_cg_links(tmp); 545 free_cg_links(tmp);
546 return -ENOMEM; 546 return -ENOMEM;
547 } 547 }
548 list_add(&link->cgrp_link_list, tmp); 548 list_add(&link->cgrp_link_list, tmp);
549 } 549 }
550 return 0; 550 return 0;
551 } 551 }
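
allocate_cg_links() reserves every cg_cgroup_link it might need on a private list before any of them is published, and frees the whole batch if one allocation fails, so callers never have to unwind a half-linked css_set. A sketch of that reserve-or-roll-back pattern on a plain singly linked list; nothing here is kernel API.

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

struct link {
        struct link *next;
};

static void free_links(struct link **tmp)
{
        while (*tmp) {
                struct link *l = *tmp;

                *tmp = l->next;
                free(l);
        }
}

static int allocate_links(int count, struct link **tmp)
{
        int i;

        *tmp = NULL;
        for (i = 0; i < count; i++) {
                struct link *l = malloc(sizeof(*l));

                if (!l) {
                        free_links(tmp);        /* roll back everything */
                        return -ENOMEM;
                }
                l->next = *tmp;                 /* chain onto the private list */
                *tmp = l;
        }
        return 0;
}

int main(void)
{
        struct link *tmp;

        if (allocate_links(4, &tmp) == 0) {
                printf("reserved 4 links up front\n");
                free_links(&tmp);               /* normally consumed one by one */
        }
        return 0;
}
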
552 552
553 /** 553 /**
554 * link_css_set - a helper function to link a css_set to a cgroup 554 * link_css_set - a helper function to link a css_set to a cgroup
555 * @tmp_cg_links: cg_cgroup_link objects allocated by allocate_cg_links() 555 * @tmp_cg_links: cg_cgroup_link objects allocated by allocate_cg_links()
556 * @cg: the css_set to be linked 556 * @cg: the css_set to be linked
557 * @cgrp: the destination cgroup 557 * @cgrp: the destination cgroup
558 */ 558 */
559 static void link_css_set(struct list_head *tmp_cg_links, 559 static void link_css_set(struct list_head *tmp_cg_links,
560 struct css_set *cg, struct cgroup *cgrp) 560 struct css_set *cg, struct cgroup *cgrp)
561 { 561 {
562 struct cg_cgroup_link *link; 562 struct cg_cgroup_link *link;
563 563
564 BUG_ON(list_empty(tmp_cg_links)); 564 BUG_ON(list_empty(tmp_cg_links));
565 link = list_first_entry(tmp_cg_links, struct cg_cgroup_link, 565 link = list_first_entry(tmp_cg_links, struct cg_cgroup_link,
566 cgrp_link_list); 566 cgrp_link_list);
567 link->cg = cg; 567 link->cg = cg;
568 link->cgrp = cgrp; 568 link->cgrp = cgrp;
569 atomic_inc(&cgrp->count); 569 atomic_inc(&cgrp->count);
570 list_move(&link->cgrp_link_list, &cgrp->css_sets); 570 list_move(&link->cgrp_link_list, &cgrp->css_sets);
571 /* 571 /*
572 * Always add links to the tail of the list so that the list 572 * Always add links to the tail of the list so that the list
573 * is sorted by order of hierarchy creation 573 * is sorted by order of hierarchy creation
574 */ 574 */
575 list_add_tail(&link->cg_link_list, &cg->cg_links); 575 list_add_tail(&link->cg_link_list, &cg->cg_links);
576 } 576 }
577 577
578 /* 578 /*
579 * find_css_set() takes an existing cgroup group and a 579 * find_css_set() takes an existing cgroup group and a
580 * cgroup object, and returns a css_set object that's 580 * cgroup object, and returns a css_set object that's
581 * equivalent to the old group, but with the given cgroup 581 * equivalent to the old group, but with the given cgroup
582 * substituted into the appropriate hierarchy. Must be called with 582 * substituted into the appropriate hierarchy. Must be called with
583 * cgroup_mutex held 583 * cgroup_mutex held
584 */ 584 */
585 static struct css_set *find_css_set( 585 static struct css_set *find_css_set(
586 struct css_set *oldcg, struct cgroup *cgrp) 586 struct css_set *oldcg, struct cgroup *cgrp)
587 { 587 {
588 struct css_set *res; 588 struct css_set *res;
589 struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT]; 589 struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT];
590 590
591 struct list_head tmp_cg_links; 591 struct list_head tmp_cg_links;
592 592
593 struct hlist_head *hhead; 593 struct hlist_head *hhead;
594 struct cg_cgroup_link *link; 594 struct cg_cgroup_link *link;
595 595
596 /* First see if we already have a cgroup group that matches 596 /* First see if we already have a cgroup group that matches
597 * the desired set */ 597 * the desired set */
598 read_lock(&css_set_lock); 598 read_lock(&css_set_lock);
599 res = find_existing_css_set(oldcg, cgrp, template); 599 res = find_existing_css_set(oldcg, cgrp, template);
600 if (res) 600 if (res)
601 get_css_set(res); 601 get_css_set(res);
602 read_unlock(&css_set_lock); 602 read_unlock(&css_set_lock);
603 603
604 if (res) 604 if (res)
605 return res; 605 return res;
606 606
607 res = kmalloc(sizeof(*res), GFP_KERNEL); 607 res = kmalloc(sizeof(*res), GFP_KERNEL);
608 if (!res) 608 if (!res)
609 return NULL; 609 return NULL;
610 610
611 /* Allocate all the cg_cgroup_link objects that we'll need */ 611 /* Allocate all the cg_cgroup_link objects that we'll need */
612 if (allocate_cg_links(root_count, &tmp_cg_links) < 0) { 612 if (allocate_cg_links(root_count, &tmp_cg_links) < 0) {
613 kfree(res); 613 kfree(res);
614 return NULL; 614 return NULL;
615 } 615 }
616 616
617 atomic_set(&res->refcount, 1); 617 atomic_set(&res->refcount, 1);
618 INIT_LIST_HEAD(&res->cg_links); 618 INIT_LIST_HEAD(&res->cg_links);
619 INIT_LIST_HEAD(&res->tasks); 619 INIT_LIST_HEAD(&res->tasks);
620 INIT_HLIST_NODE(&res->hlist); 620 INIT_HLIST_NODE(&res->hlist);
621 621
622 /* Copy the set of subsystem state objects generated in 622 /* Copy the set of subsystem state objects generated in
623 * find_existing_css_set() */ 623 * find_existing_css_set() */
624 memcpy(res->subsys, template, sizeof(res->subsys)); 624 memcpy(res->subsys, template, sizeof(res->subsys));
625 625
626 write_lock(&css_set_lock); 626 write_lock(&css_set_lock);
627 /* Add reference counts and links from the new css_set. */ 627 /* Add reference counts and links from the new css_set. */
628 list_for_each_entry(link, &oldcg->cg_links, cg_link_list) { 628 list_for_each_entry(link, &oldcg->cg_links, cg_link_list) {
629 struct cgroup *c = link->cgrp; 629 struct cgroup *c = link->cgrp;
630 if (c->root == cgrp->root) 630 if (c->root == cgrp->root)
631 c = cgrp; 631 c = cgrp;
632 link_css_set(&tmp_cg_links, res, c); 632 link_css_set(&tmp_cg_links, res, c);
633 } 633 }
634 634
635 BUG_ON(!list_empty(&tmp_cg_links)); 635 BUG_ON(!list_empty(&tmp_cg_links));
636 636
637 css_set_count++; 637 css_set_count++;
638 638
639 /* Add this cgroup group to the hash table */ 639 /* Add this cgroup group to the hash table */
640 hhead = css_set_hash(res->subsys); 640 hhead = css_set_hash(res->subsys);
641 hlist_add_head(&res->hlist, hhead); 641 hlist_add_head(&res->hlist, hhead);
642 642
643 write_unlock(&css_set_lock); 643 write_unlock(&css_set_lock);
644 644
645 return res; 645 return res;
646 } 646 }
647 647
648 /* 648 /*
649 * Return the cgroup for "task" from the given hierarchy. Must be 649 * Return the cgroup for "task" from the given hierarchy. Must be
650 * called with cgroup_mutex held. 650 * called with cgroup_mutex held.
651 */ 651 */
652 static struct cgroup *task_cgroup_from_root(struct task_struct *task, 652 static struct cgroup *task_cgroup_from_root(struct task_struct *task,
653 struct cgroupfs_root *root) 653 struct cgroupfs_root *root)
654 { 654 {
655 struct css_set *css; 655 struct css_set *css;
656 struct cgroup *res = NULL; 656 struct cgroup *res = NULL;
657 657
658 BUG_ON(!mutex_is_locked(&cgroup_mutex)); 658 BUG_ON(!mutex_is_locked(&cgroup_mutex));
659 read_lock(&css_set_lock); 659 read_lock(&css_set_lock);
660 /* 660 /*
661 * No need to lock the task - since we hold cgroup_mutex the 661 * No need to lock the task - since we hold cgroup_mutex the
662 * task can't change groups, so the only thing that can happen 662 * task can't change groups, so the only thing that can happen
663 * is that it exits and its css is set back to init_css_set. 663 * is that it exits and its css is set back to init_css_set.
664 */ 664 */
665 css = task->cgroups; 665 css = task->cgroups;
666 if (css == &init_css_set) { 666 if (css == &init_css_set) {
667 res = &root->top_cgroup; 667 res = &root->top_cgroup;
668 } else { 668 } else {
669 struct cg_cgroup_link *link; 669 struct cg_cgroup_link *link;
670 list_for_each_entry(link, &css->cg_links, cg_link_list) { 670 list_for_each_entry(link, &css->cg_links, cg_link_list) {
671 struct cgroup *c = link->cgrp; 671 struct cgroup *c = link->cgrp;
672 if (c->root == root) { 672 if (c->root == root) {
673 res = c; 673 res = c;
674 break; 674 break;
675 } 675 }
676 } 676 }
677 } 677 }
678 read_unlock(&css_set_lock); 678 read_unlock(&css_set_lock);
679 BUG_ON(!res); 679 BUG_ON(!res);
680 return res; 680 return res;
681 } 681 }
682 682
683 /* 683 /*
684 * There is one global cgroup mutex. We also require taking 684 * There is one global cgroup mutex. We also require taking
685 * task_lock() when dereferencing a task's cgroup subsys pointers. 685 * task_lock() when dereferencing a task's cgroup subsys pointers.
686 * See "The task_lock() exception", at the end of this comment. 686 * See "The task_lock() exception", at the end of this comment.
687 * 687 *
688 * A task must hold cgroup_mutex to modify cgroups. 688 * A task must hold cgroup_mutex to modify cgroups.
689 * 689 *
690 * Any task can increment and decrement the count field without lock. 690 * Any task can increment and decrement the count field without lock.
691 * So in general, code holding cgroup_mutex can't rely on the count 691 * So in general, code holding cgroup_mutex can't rely on the count
692 * field not changing. However, if the count goes to zero, then only 692 * field not changing. However, if the count goes to zero, then only
693 * cgroup_attach_task() can increment it again. Because a count of zero 693 * cgroup_attach_task() can increment it again. Because a count of zero
694 * means that no tasks are currently attached, therefore there is no 694 * means that no tasks are currently attached, therefore there is no
695 * way a task attached to that cgroup can fork (the other way to 695 * way a task attached to that cgroup can fork (the other way to
696 * increment the count). So code holding cgroup_mutex can safely 696 * increment the count). So code holding cgroup_mutex can safely
697 * assume that if the count is zero, it will stay zero. Similarly, if 697 * assume that if the count is zero, it will stay zero. Similarly, if
698 * a task holds cgroup_mutex on a cgroup with zero count, it 698 * a task holds cgroup_mutex on a cgroup with zero count, it
699 * knows that the cgroup won't be removed, as cgroup_rmdir() 699 * knows that the cgroup won't be removed, as cgroup_rmdir()
700 * needs that mutex. 700 * needs that mutex.
701 * 701 *
702 * The fork and exit callbacks, cgroup_fork() and cgroup_exit(), don't 702 * The fork and exit callbacks, cgroup_fork() and cgroup_exit(), don't
703 * (usually) take cgroup_mutex. These are the two most performance 703 * (usually) take cgroup_mutex. These are the two most performance
704 * critical pieces of code here. The exception occurs on cgroup_exit(), 704 * critical pieces of code here. The exception occurs on cgroup_exit(),
705 * when a task in a notify_on_release cgroup exits. Then cgroup_mutex 705 * when a task in a notify_on_release cgroup exits. Then cgroup_mutex
706 * is taken, and if the cgroup count is zero, a usermode call is made 706 * is taken, and if the cgroup count is zero, a usermode call is made
707 * to the release agent with the name of the cgroup (path relative to 707 * to the release agent with the name of the cgroup (path relative to
708 * the root of cgroup file system) as the argument. 708 * the root of cgroup file system) as the argument.
709 * 709 *
710 * A cgroup can only be deleted if both its 'count' of using tasks 710 * A cgroup can only be deleted if both its 'count' of using tasks
711 * is zero, and its list of 'children' cgroups is empty. Since all 711 * is zero, and its list of 'children' cgroups is empty. Since all
712 * tasks in the system use _some_ cgroup, and since there is always at 712 * tasks in the system use _some_ cgroup, and since there is always at
713 * least one task in the system (init, pid == 1), therefore, top_cgroup 713 * least one task in the system (init, pid == 1), therefore, top_cgroup
714 * always has either child cgroups and/or using tasks. So we don't 714 * always has either child cgroups and/or using tasks. So we don't
715 * need a special hack to ensure that top_cgroup cannot be deleted. 715 * need a special hack to ensure that top_cgroup cannot be deleted.
716 * 716 *
717 * The task_lock() exception 717 * The task_lock() exception
718 * 718 *
719 * The need for this exception arises from the action of 719 * The need for this exception arises from the action of
720 * cgroup_attach_task(), which overwrites one task's cgroup pointer with 720 * cgroup_attach_task(), which overwrites one task's cgroup pointer with
721 * another. It does so using cgroup_mutex, however there are 721 * another. It does so using cgroup_mutex, however there are
722 * several performance critical places that need to reference 722 * several performance critical places that need to reference
723 * task->cgroup without the expense of grabbing a system global 723 * task->cgroup without the expense of grabbing a system global
724 * mutex. Therefore except as noted below, when dereferencing or, as 724 * mutex. Therefore except as noted below, when dereferencing or, as
725 * in cgroup_attach_task(), modifying a task's cgroup pointer, we use 725 * in cgroup_attach_task(), modifying a task's cgroup pointer, we use
726 * task_lock(), which acts on a spinlock (task->alloc_lock) already in 726 * task_lock(), which acts on a spinlock (task->alloc_lock) already in
727 * the task_struct routinely used for such matters. 727 * the task_struct routinely used for such matters.
728 * 728 *
729 * P.S. One more locking exception. RCU is used to guard the 729 * P.S. One more locking exception. RCU is used to guard the
730 * update of a task's cgroup pointer by cgroup_attach_task(). 730 * update of a task's cgroup pointer by cgroup_attach_task().
731 */ 731 */
732 732
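The comment above is the reader/writer contract the rest of this file relies on. A minimal reader-side sketch, not part of this patch (example_task_css_set_refcount() is a hypothetical helper assuming this file's existing includes): an RCU read-side critical section is enough to follow task->cgroups, because cgroup_attach_task() publishes the replacement pointer under task_lock() and cgroup_mutex and the old css_set is only freed after a grace period.

static int example_task_css_set_refcount(struct task_struct *tsk)
{
	int count;

	rcu_read_lock();
	/*
	 * tsk->cgroups may be swapped by cgroup_attach_task() at any
	 * time; RCU keeps the css_set seen here valid until
	 * rcu_read_unlock().
	 */
	count = atomic_read(&rcu_dereference(tsk->cgroups)->refcount);
	rcu_read_unlock();

	return count;
}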
733 /** 733 /**
734 * cgroup_lock - lock out any changes to cgroup structures 734 * cgroup_lock - lock out any changes to cgroup structures
735 * 735 *
736 */ 736 */
737 void cgroup_lock(void) 737 void cgroup_lock(void)
738 { 738 {
739 mutex_lock(&cgroup_mutex); 739 mutex_lock(&cgroup_mutex);
740 } 740 }
741 EXPORT_SYMBOL_GPL(cgroup_lock); 741 EXPORT_SYMBOL_GPL(cgroup_lock);
742 742
743 /** 743 /**
744 * cgroup_unlock - release lock on cgroup changes 744 * cgroup_unlock - release lock on cgroup changes
745 * 745 *
746 * Undo the lock taken in a previous cgroup_lock() call. 746 * Undo the lock taken in a previous cgroup_lock() call.
747 */ 747 */
748 void cgroup_unlock(void) 748 void cgroup_unlock(void)
749 { 749 {
750 mutex_unlock(&cgroup_mutex); 750 mutex_unlock(&cgroup_mutex);
751 } 751 }
752 EXPORT_SYMBOL_GPL(cgroup_unlock); 752 EXPORT_SYMBOL_GPL(cgroup_unlock);
753 753
754 /* 754 /*
755 * A couple of forward declarations are required, due to a cyclic reference loop: 755 * A couple of forward declarations are required, due to a cyclic reference loop:
756 * cgroup_mkdir -> cgroup_create -> cgroup_populate_dir -> 756 * cgroup_mkdir -> cgroup_create -> cgroup_populate_dir ->
757 * cgroup_add_file -> cgroup_create_file -> cgroup_dir_inode_operations 757 * cgroup_add_file -> cgroup_create_file -> cgroup_dir_inode_operations
758 * -> cgroup_mkdir. 758 * -> cgroup_mkdir.
759 */ 759 */
760 760
761 static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, int mode); 761 static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, int mode);
762 static struct dentry *cgroup_lookup(struct inode *, struct dentry *, struct nameidata *); 762 static struct dentry *cgroup_lookup(struct inode *, struct dentry *, struct nameidata *);
763 static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry); 763 static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry);
764 static int cgroup_populate_dir(struct cgroup *cgrp); 764 static int cgroup_populate_dir(struct cgroup *cgrp);
765 static const struct inode_operations cgroup_dir_inode_operations; 765 static const struct inode_operations cgroup_dir_inode_operations;
766 static const struct file_operations proc_cgroupstats_operations; 766 static const struct file_operations proc_cgroupstats_operations;
767 767
768 static struct backing_dev_info cgroup_backing_dev_info = { 768 static struct backing_dev_info cgroup_backing_dev_info = {
769 .name = "cgroup", 769 .name = "cgroup",
770 .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK, 770 .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK,
771 }; 771 };
772 772
773 static int alloc_css_id(struct cgroup_subsys *ss, 773 static int alloc_css_id(struct cgroup_subsys *ss,
774 struct cgroup *parent, struct cgroup *child); 774 struct cgroup *parent, struct cgroup *child);
775 775
776 static struct inode *cgroup_new_inode(mode_t mode, struct super_block *sb) 776 static struct inode *cgroup_new_inode(mode_t mode, struct super_block *sb)
777 { 777 {
778 struct inode *inode = new_inode(sb); 778 struct inode *inode = new_inode(sb);
779 779
780 if (inode) { 780 if (inode) {
781 inode->i_ino = get_next_ino(); 781 inode->i_ino = get_next_ino();
782 inode->i_mode = mode; 782 inode->i_mode = mode;
783 inode->i_uid = current_fsuid(); 783 inode->i_uid = current_fsuid();
784 inode->i_gid = current_fsgid(); 784 inode->i_gid = current_fsgid();
785 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; 785 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
786 inode->i_mapping->backing_dev_info = &cgroup_backing_dev_info; 786 inode->i_mapping->backing_dev_info = &cgroup_backing_dev_info;
787 } 787 }
788 return inode; 788 return inode;
789 } 789 }
790 790
791 /* 791 /*
792 * Call subsys's pre_destroy handler. 792 * Call subsys's pre_destroy handler.
793 * This is called before css refcnt check. 793 * This is called before css refcnt check.
794 */ 794 */
795 static int cgroup_call_pre_destroy(struct cgroup *cgrp) 795 static int cgroup_call_pre_destroy(struct cgroup *cgrp)
796 { 796 {
797 struct cgroup_subsys *ss; 797 struct cgroup_subsys *ss;
798 int ret = 0; 798 int ret = 0;
799 799
800 for_each_subsys(cgrp->root, ss) 800 for_each_subsys(cgrp->root, ss)
801 if (ss->pre_destroy) { 801 if (ss->pre_destroy) {
802 ret = ss->pre_destroy(ss, cgrp); 802 ret = ss->pre_destroy(ss, cgrp);
803 if (ret) 803 if (ret)
804 break; 804 break;
805 } 805 }
806 806
807 return ret; 807 return ret;
808 } 808 }
809 809
810 static void cgroup_diput(struct dentry *dentry, struct inode *inode) 810 static void cgroup_diput(struct dentry *dentry, struct inode *inode)
811 { 811 {
812 /* is dentry a directory ? if so, kfree() associated cgroup */ 812 /* is dentry a directory ? if so, kfree() associated cgroup */
813 if (S_ISDIR(inode->i_mode)) { 813 if (S_ISDIR(inode->i_mode)) {
814 struct cgroup *cgrp = dentry->d_fsdata; 814 struct cgroup *cgrp = dentry->d_fsdata;
815 struct cgroup_subsys *ss; 815 struct cgroup_subsys *ss;
816 BUG_ON(!(cgroup_is_removed(cgrp))); 816 BUG_ON(!(cgroup_is_removed(cgrp)));
817 /* It's possible for external users to be holding css 817 /* It's possible for external users to be holding css
818 * reference counts on a cgroup; css_put() needs to 818 * reference counts on a cgroup; css_put() needs to
819 * be able to access the cgroup after decrementing 819 * be able to access the cgroup after decrementing
820 * the reference count in order to know if it needs to 820 * the reference count in order to know if it needs to
821 * queue the cgroup to be handled by the release 821 * queue the cgroup to be handled by the release
822 * agent */ 822 * agent */
823 synchronize_rcu(); 823 synchronize_rcu();
824 824
825 mutex_lock(&cgroup_mutex); 825 mutex_lock(&cgroup_mutex);
826 /* 826 /*
827 * Release the subsystem state objects. 827 * Release the subsystem state objects.
828 */ 828 */
829 for_each_subsys(cgrp->root, ss) 829 for_each_subsys(cgrp->root, ss)
830 ss->destroy(ss, cgrp); 830 ss->destroy(ss, cgrp);
831 831
832 cgrp->root->number_of_cgroups--; 832 cgrp->root->number_of_cgroups--;
833 mutex_unlock(&cgroup_mutex); 833 mutex_unlock(&cgroup_mutex);
834 834
835 /* 835 /*
836 * Drop the active superblock reference that we took when we 836 * Drop the active superblock reference that we took when we
837 * created the cgroup 837 * created the cgroup
838 */ 838 */
839 deactivate_super(cgrp->root->sb); 839 deactivate_super(cgrp->root->sb);
840 840
841 /* 841 /*
842 * if we're getting rid of the cgroup, refcount should ensure 842 * if we're getting rid of the cgroup, refcount should ensure
843 * that there are no pidlists left. 843 * that there are no pidlists left.
844 */ 844 */
845 BUG_ON(!list_empty(&cgrp->pidlists)); 845 BUG_ON(!list_empty(&cgrp->pidlists));
846 846
847 kfree_rcu(cgrp, rcu_head); 847 kfree_rcu(cgrp, rcu_head);
848 } 848 }
849 iput(inode); 849 iput(inode);
850 } 850 }
851 851
852 static int cgroup_delete(const struct dentry *d) 852 static int cgroup_delete(const struct dentry *d)
853 { 853 {
854 return 1; 854 return 1;
855 } 855 }
856 856
857 static void remove_dir(struct dentry *d) 857 static void remove_dir(struct dentry *d)
858 { 858 {
859 struct dentry *parent = dget(d->d_parent); 859 struct dentry *parent = dget(d->d_parent);
860 860
861 d_delete(d); 861 d_delete(d);
862 simple_rmdir(parent->d_inode, d); 862 simple_rmdir(parent->d_inode, d);
863 dput(parent); 863 dput(parent);
864 } 864 }
865 865
866 static void cgroup_clear_directory(struct dentry *dentry) 866 static void cgroup_clear_directory(struct dentry *dentry)
867 { 867 {
868 struct list_head *node; 868 struct list_head *node;
869 869
870 BUG_ON(!mutex_is_locked(&dentry->d_inode->i_mutex)); 870 BUG_ON(!mutex_is_locked(&dentry->d_inode->i_mutex));
871 spin_lock(&dentry->d_lock); 871 spin_lock(&dentry->d_lock);
872 node = dentry->d_subdirs.next; 872 node = dentry->d_subdirs.next;
873 while (node != &dentry->d_subdirs) { 873 while (node != &dentry->d_subdirs) {
874 struct dentry *d = list_entry(node, struct dentry, d_u.d_child); 874 struct dentry *d = list_entry(node, struct dentry, d_u.d_child);
875 875
876 spin_lock_nested(&d->d_lock, DENTRY_D_LOCK_NESTED); 876 spin_lock_nested(&d->d_lock, DENTRY_D_LOCK_NESTED);
877 list_del_init(node); 877 list_del_init(node);
878 if (d->d_inode) { 878 if (d->d_inode) {
879 /* This should never be called on a cgroup 879 /* This should never be called on a cgroup
880 * directory with child cgroups */ 880 * directory with child cgroups */
881 BUG_ON(d->d_inode->i_mode & S_IFDIR); 881 BUG_ON(d->d_inode->i_mode & S_IFDIR);
882 dget_dlock(d); 882 dget_dlock(d);
883 spin_unlock(&d->d_lock); 883 spin_unlock(&d->d_lock);
884 spin_unlock(&dentry->d_lock); 884 spin_unlock(&dentry->d_lock);
885 d_delete(d); 885 d_delete(d);
886 simple_unlink(dentry->d_inode, d); 886 simple_unlink(dentry->d_inode, d);
887 dput(d); 887 dput(d);
888 spin_lock(&dentry->d_lock); 888 spin_lock(&dentry->d_lock);
889 } else 889 } else
890 spin_unlock(&d->d_lock); 890 spin_unlock(&d->d_lock);
891 node = dentry->d_subdirs.next; 891 node = dentry->d_subdirs.next;
892 } 892 }
893 spin_unlock(&dentry->d_lock); 893 spin_unlock(&dentry->d_lock);
894 } 894 }
895 895
896 /* 896 /*
897 * NOTE : the dentry must have been dget()'ed 897 * NOTE : the dentry must have been dget()'ed
898 */ 898 */
899 static void cgroup_d_remove_dir(struct dentry *dentry) 899 static void cgroup_d_remove_dir(struct dentry *dentry)
900 { 900 {
901 struct dentry *parent; 901 struct dentry *parent;
902 902
903 cgroup_clear_directory(dentry); 903 cgroup_clear_directory(dentry);
904 904
905 parent = dentry->d_parent; 905 parent = dentry->d_parent;
906 spin_lock(&parent->d_lock); 906 spin_lock(&parent->d_lock);
907 spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED); 907 spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
908 list_del_init(&dentry->d_u.d_child); 908 list_del_init(&dentry->d_u.d_child);
909 spin_unlock(&dentry->d_lock); 909 spin_unlock(&dentry->d_lock);
910 spin_unlock(&parent->d_lock); 910 spin_unlock(&parent->d_lock);
911 remove_dir(dentry); 911 remove_dir(dentry);
912 } 912 }
913 913
914 /* 914 /*
915 * A queue for waiters wanting to rmdir() a cgroup. A task will sleep when 915 * A queue for waiters wanting to rmdir() a cgroup. A task will sleep when
916 * cgroup->count == 0 && list_empty(&cgroup->children) && subsys has some 916 * cgroup->count == 0 && list_empty(&cgroup->children) && subsys has some
917 * reference to css->refcnt. In general, this refcnt is expected to go down 917 * reference to css->refcnt. In general, this refcnt is expected to go down
918 * to zero soon. 918 * to zero soon.
919 * 919 *
920 * CGRP_WAIT_ON_RMDIR flag is set under cgroup's inode->i_mutex; 920 * CGRP_WAIT_ON_RMDIR flag is set under cgroup's inode->i_mutex;
921 */ 921 */
922 DECLARE_WAIT_QUEUE_HEAD(cgroup_rmdir_waitq); 922 DECLARE_WAIT_QUEUE_HEAD(cgroup_rmdir_waitq);
923 923
924 static void cgroup_wakeup_rmdir_waiter(struct cgroup *cgrp) 924 static void cgroup_wakeup_rmdir_waiter(struct cgroup *cgrp)
925 { 925 {
926 if (unlikely(test_and_clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags))) 926 if (unlikely(test_and_clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags)))
927 wake_up_all(&cgroup_rmdir_waitq); 927 wake_up_all(&cgroup_rmdir_waitq);
928 } 928 }
929 929
930 void cgroup_exclude_rmdir(struct cgroup_subsys_state *css) 930 void cgroup_exclude_rmdir(struct cgroup_subsys_state *css)
931 { 931 {
932 css_get(css); 932 css_get(css);
933 } 933 }
934 934
935 void cgroup_release_and_wakeup_rmdir(struct cgroup_subsys_state *css) 935 void cgroup_release_and_wakeup_rmdir(struct cgroup_subsys_state *css)
936 { 936 {
937 cgroup_wakeup_rmdir_waiter(css->cgroup); 937 cgroup_wakeup_rmdir_waiter(css->cgroup);
938 css_put(css); 938 css_put(css);
939 } 939 }
940 940
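cgroup_exclude_rmdir() and cgroup_release_and_wakeup_rmdir() above are the pair a controller uses to pin a cgroup across a long-running operation so that a concurrent rmdir() sleeps on cgroup_rmdir_waitq until the temporary css reference goes away (the memory controller is the sort of user these were added for). A hedged sketch; example_slow_operation() is hypothetical:

static void example_slow_operation(struct cgroup_subsys_state *css)
{
	cgroup_exclude_rmdir(css);		/* take a css reference */

	/* ... long work during which rmdir() should wait rather than fail ... */

	cgroup_release_and_wakeup_rmdir(css);	/* drop it, wake any rmdir() waiter */
}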
941 /* 941 /*
942 * Call with cgroup_mutex held. Drops reference counts on modules, including 942 * Call with cgroup_mutex held. Drops reference counts on modules, including
943 * any duplicate ones that parse_cgroupfs_options took. If this function 943 * any duplicate ones that parse_cgroupfs_options took. If this function
944 * returns an error, no reference counts are touched. 944 * returns an error, no reference counts are touched.
945 */ 945 */
946 static int rebind_subsystems(struct cgroupfs_root *root, 946 static int rebind_subsystems(struct cgroupfs_root *root,
947 unsigned long final_bits) 947 unsigned long final_bits)
948 { 948 {
949 unsigned long added_bits, removed_bits; 949 unsigned long added_bits, removed_bits;
950 struct cgroup *cgrp = &root->top_cgroup; 950 struct cgroup *cgrp = &root->top_cgroup;
951 int i; 951 int i;
952 952
953 BUG_ON(!mutex_is_locked(&cgroup_mutex)); 953 BUG_ON(!mutex_is_locked(&cgroup_mutex));
954 954
955 removed_bits = root->actual_subsys_bits & ~final_bits; 955 removed_bits = root->actual_subsys_bits & ~final_bits;
956 added_bits = final_bits & ~root->actual_subsys_bits; 956 added_bits = final_bits & ~root->actual_subsys_bits;
957 /* Check that any added subsystems are currently free */ 957 /* Check that any added subsystems are currently free */
958 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 958 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
959 unsigned long bit = 1UL << i; 959 unsigned long bit = 1UL << i;
960 struct cgroup_subsys *ss = subsys[i]; 960 struct cgroup_subsys *ss = subsys[i];
961 if (!(bit & added_bits)) 961 if (!(bit & added_bits))
962 continue; 962 continue;
963 /* 963 /*
964 * Nobody should tell us to do a subsys that doesn't exist: 964 * Nobody should tell us to do a subsys that doesn't exist:
965 * parse_cgroupfs_options should catch that case and refcounts 965 * parse_cgroupfs_options should catch that case and refcounts
966 * ensure that subsystems won't disappear once selected. 966 * ensure that subsystems won't disappear once selected.
967 */ 967 */
968 BUG_ON(ss == NULL); 968 BUG_ON(ss == NULL);
969 if (ss->root != &rootnode) { 969 if (ss->root != &rootnode) {
970 /* Subsystem isn't free */ 970 /* Subsystem isn't free */
971 return -EBUSY; 971 return -EBUSY;
972 } 972 }
973 } 973 }
974 974
975 /* Currently we don't handle adding/removing subsystems when 975 /* Currently we don't handle adding/removing subsystems when
976 * any child cgroups exist. This is theoretically supportable 976 * any child cgroups exist. This is theoretically supportable
977 * but involves complex error handling, so it's being left until 977 * but involves complex error handling, so it's being left until
978 * later */ 978 * later */
979 if (root->number_of_cgroups > 1) 979 if (root->number_of_cgroups > 1)
980 return -EBUSY; 980 return -EBUSY;
981 981
982 /* Process each subsystem */ 982 /* Process each subsystem */
983 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 983 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
984 struct cgroup_subsys *ss = subsys[i]; 984 struct cgroup_subsys *ss = subsys[i];
985 unsigned long bit = 1UL << i; 985 unsigned long bit = 1UL << i;
986 if (bit & added_bits) { 986 if (bit & added_bits) {
987 /* We're binding this subsystem to this hierarchy */ 987 /* We're binding this subsystem to this hierarchy */
988 BUG_ON(ss == NULL); 988 BUG_ON(ss == NULL);
989 BUG_ON(cgrp->subsys[i]); 989 BUG_ON(cgrp->subsys[i]);
990 BUG_ON(!dummytop->subsys[i]); 990 BUG_ON(!dummytop->subsys[i]);
991 BUG_ON(dummytop->subsys[i]->cgroup != dummytop); 991 BUG_ON(dummytop->subsys[i]->cgroup != dummytop);
992 mutex_lock(&ss->hierarchy_mutex); 992 mutex_lock(&ss->hierarchy_mutex);
993 cgrp->subsys[i] = dummytop->subsys[i]; 993 cgrp->subsys[i] = dummytop->subsys[i];
994 cgrp->subsys[i]->cgroup = cgrp; 994 cgrp->subsys[i]->cgroup = cgrp;
995 list_move(&ss->sibling, &root->subsys_list); 995 list_move(&ss->sibling, &root->subsys_list);
996 ss->root = root; 996 ss->root = root;
997 if (ss->bind) 997 if (ss->bind)
998 ss->bind(ss, cgrp); 998 ss->bind(ss, cgrp);
999 mutex_unlock(&ss->hierarchy_mutex); 999 mutex_unlock(&ss->hierarchy_mutex);
1000 /* refcount was already taken, and we're keeping it */ 1000 /* refcount was already taken, and we're keeping it */
1001 } else if (bit & removed_bits) { 1001 } else if (bit & removed_bits) {
1002 /* We're removing this subsystem */ 1002 /* We're removing this subsystem */
1003 BUG_ON(ss == NULL); 1003 BUG_ON(ss == NULL);
1004 BUG_ON(cgrp->subsys[i] != dummytop->subsys[i]); 1004 BUG_ON(cgrp->subsys[i] != dummytop->subsys[i]);
1005 BUG_ON(cgrp->subsys[i]->cgroup != cgrp); 1005 BUG_ON(cgrp->subsys[i]->cgroup != cgrp);
1006 mutex_lock(&ss->hierarchy_mutex); 1006 mutex_lock(&ss->hierarchy_mutex);
1007 if (ss->bind) 1007 if (ss->bind)
1008 ss->bind(ss, dummytop); 1008 ss->bind(ss, dummytop);
1009 dummytop->subsys[i]->cgroup = dummytop; 1009 dummytop->subsys[i]->cgroup = dummytop;
1010 cgrp->subsys[i] = NULL; 1010 cgrp->subsys[i] = NULL;
1011 subsys[i]->root = &rootnode; 1011 subsys[i]->root = &rootnode;
1012 list_move(&ss->sibling, &rootnode.subsys_list); 1012 list_move(&ss->sibling, &rootnode.subsys_list);
1013 mutex_unlock(&ss->hierarchy_mutex); 1013 mutex_unlock(&ss->hierarchy_mutex);
1014 /* subsystem is now free - drop reference on module */ 1014 /* subsystem is now free - drop reference on module */
1015 module_put(ss->module); 1015 module_put(ss->module);
1016 } else if (bit & final_bits) { 1016 } else if (bit & final_bits) {
1017 /* Subsystem state should already exist */ 1017 /* Subsystem state should already exist */
1018 BUG_ON(ss == NULL); 1018 BUG_ON(ss == NULL);
1019 BUG_ON(!cgrp->subsys[i]); 1019 BUG_ON(!cgrp->subsys[i]);
1020 /* 1020 /*
1021 * a refcount was taken, but we already had one, so 1021 * a refcount was taken, but we already had one, so
1022 * drop the extra reference. 1022 * drop the extra reference.
1023 */ 1023 */
1024 module_put(ss->module); 1024 module_put(ss->module);
1025 #ifdef CONFIG_MODULE_UNLOAD 1025 #ifdef CONFIG_MODULE_UNLOAD
1026 BUG_ON(ss->module && !module_refcount(ss->module)); 1026 BUG_ON(ss->module && !module_refcount(ss->module));
1027 #endif 1027 #endif
1028 } else { 1028 } else {
1029 /* Subsystem state shouldn't exist */ 1029 /* Subsystem state shouldn't exist */
1030 BUG_ON(cgrp->subsys[i]); 1030 BUG_ON(cgrp->subsys[i]);
1031 } 1031 }
1032 } 1032 }
1033 root->subsys_bits = root->actual_subsys_bits = final_bits; 1033 root->subsys_bits = root->actual_subsys_bits = final_bits;
1034 synchronize_rcu(); 1034 synchronize_rcu();
1035 1035
1036 return 0; 1036 return 0;
1037 } 1037 }
1038 1038
1039 static int cgroup_show_options(struct seq_file *seq, struct vfsmount *vfs) 1039 static int cgroup_show_options(struct seq_file *seq, struct vfsmount *vfs)
1040 { 1040 {
1041 struct cgroupfs_root *root = vfs->mnt_sb->s_fs_info; 1041 struct cgroupfs_root *root = vfs->mnt_sb->s_fs_info;
1042 struct cgroup_subsys *ss; 1042 struct cgroup_subsys *ss;
1043 1043
1044 mutex_lock(&cgroup_mutex); 1044 mutex_lock(&cgroup_mutex);
1045 for_each_subsys(root, ss) 1045 for_each_subsys(root, ss)
1046 seq_printf(seq, ",%s", ss->name); 1046 seq_printf(seq, ",%s", ss->name);
1047 if (test_bit(ROOT_NOPREFIX, &root->flags)) 1047 if (test_bit(ROOT_NOPREFIX, &root->flags))
1048 seq_puts(seq, ",noprefix"); 1048 seq_puts(seq, ",noprefix");
1049 if (strlen(root->release_agent_path)) 1049 if (strlen(root->release_agent_path))
1050 seq_printf(seq, ",release_agent=%s", root->release_agent_path); 1050 seq_printf(seq, ",release_agent=%s", root->release_agent_path);
1051 if (clone_children(&root->top_cgroup)) 1051 if (clone_children(&root->top_cgroup))
1052 seq_puts(seq, ",clone_children"); 1052 seq_puts(seq, ",clone_children");
1053 if (strlen(root->name)) 1053 if (strlen(root->name))
1054 seq_printf(seq, ",name=%s", root->name); 1054 seq_printf(seq, ",name=%s", root->name);
1055 mutex_unlock(&cgroup_mutex); 1055 mutex_unlock(&cgroup_mutex);
1056 return 0; 1056 return 0;
1057 } 1057 }
1058 1058
1059 struct cgroup_sb_opts { 1059 struct cgroup_sb_opts {
1060 unsigned long subsys_bits; 1060 unsigned long subsys_bits;
1061 unsigned long flags; 1061 unsigned long flags;
1062 char *release_agent; 1062 char *release_agent;
1063 bool clone_children; 1063 bool clone_children;
1064 char *name; 1064 char *name;
1065 /* User explicitly requested empty subsystem */ 1065 /* User explicitly requested empty subsystem */
1066 bool none; 1066 bool none;
1067 1067
1068 struct cgroupfs_root *new_root; 1068 struct cgroupfs_root *new_root;
1069 1069
1070 }; 1070 };
1071 1071
1072 /* 1072 /*
1073 * Convert a hierarchy specifier into a bitmask of subsystems and flags. Call 1073 * Convert a hierarchy specifier into a bitmask of subsystems and flags. Call
1074 * with cgroup_mutex held to protect the subsys[] array. This function takes 1074 * with cgroup_mutex held to protect the subsys[] array. This function takes
1075 * refcounts on subsystems to be used, unless it returns an error, in which case 1075 * refcounts on subsystems to be used, unless it returns an error, in which case
1076 * no refcounts are taken. 1076 * no refcounts are taken.
1077 */ 1077 */
1078 static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) 1078 static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
1079 { 1079 {
1080 char *token, *o = data; 1080 char *token, *o = data;
1081 bool all_ss = false, one_ss = false; 1081 bool all_ss = false, one_ss = false;
1082 unsigned long mask = (unsigned long)-1; 1082 unsigned long mask = (unsigned long)-1;
1083 int i; 1083 int i;
1084 bool module_pin_failed = false; 1084 bool module_pin_failed = false;
1085 1085
1086 BUG_ON(!mutex_is_locked(&cgroup_mutex)); 1086 BUG_ON(!mutex_is_locked(&cgroup_mutex));
1087 1087
1088 #ifdef CONFIG_CPUSETS 1088 #ifdef CONFIG_CPUSETS
1089 mask = ~(1UL << cpuset_subsys_id); 1089 mask = ~(1UL << cpuset_subsys_id);
1090 #endif 1090 #endif
1091 1091
1092 memset(opts, 0, sizeof(*opts)); 1092 memset(opts, 0, sizeof(*opts));
1093 1093
1094 while ((token = strsep(&o, ",")) != NULL) { 1094 while ((token = strsep(&o, ",")) != NULL) {
1095 if (!*token) 1095 if (!*token)
1096 return -EINVAL; 1096 return -EINVAL;
1097 if (!strcmp(token, "none")) { 1097 if (!strcmp(token, "none")) {
1098 /* Explicitly have no subsystems */ 1098 /* Explicitly have no subsystems */
1099 opts->none = true; 1099 opts->none = true;
1100 continue; 1100 continue;
1101 } 1101 }
1102 if (!strcmp(token, "all")) { 1102 if (!strcmp(token, "all")) {
1103 /* Mutually exclusive option 'all' + subsystem name */ 1103 /* Mutually exclusive option 'all' + subsystem name */
1104 if (one_ss) 1104 if (one_ss)
1105 return -EINVAL; 1105 return -EINVAL;
1106 all_ss = true; 1106 all_ss = true;
1107 continue; 1107 continue;
1108 } 1108 }
1109 if (!strcmp(token, "noprefix")) { 1109 if (!strcmp(token, "noprefix")) {
1110 set_bit(ROOT_NOPREFIX, &opts->flags); 1110 set_bit(ROOT_NOPREFIX, &opts->flags);
1111 continue; 1111 continue;
1112 } 1112 }
1113 if (!strcmp(token, "clone_children")) { 1113 if (!strcmp(token, "clone_children")) {
1114 opts->clone_children = true; 1114 opts->clone_children = true;
1115 continue; 1115 continue;
1116 } 1116 }
1117 if (!strncmp(token, "release_agent=", 14)) { 1117 if (!strncmp(token, "release_agent=", 14)) {
1118 /* Specifying two release agents is forbidden */ 1118 /* Specifying two release agents is forbidden */
1119 if (opts->release_agent) 1119 if (opts->release_agent)
1120 return -EINVAL; 1120 return -EINVAL;
1121 opts->release_agent = 1121 opts->release_agent =
1122 kstrndup(token + 14, PATH_MAX - 1, GFP_KERNEL); 1122 kstrndup(token + 14, PATH_MAX - 1, GFP_KERNEL);
1123 if (!opts->release_agent) 1123 if (!opts->release_agent)
1124 return -ENOMEM; 1124 return -ENOMEM;
1125 continue; 1125 continue;
1126 } 1126 }
1127 if (!strncmp(token, "name=", 5)) { 1127 if (!strncmp(token, "name=", 5)) {
1128 const char *name = token + 5; 1128 const char *name = token + 5;
1129 /* Can't specify an empty name */ 1129 /* Can't specify an empty name */
1130 if (!strlen(name)) 1130 if (!strlen(name))
1131 return -EINVAL; 1131 return -EINVAL;
1132 /* Must match [\w.-]+ */ 1132 /* Must match [\w.-]+ */
1133 for (i = 0; i < strlen(name); i++) { 1133 for (i = 0; i < strlen(name); i++) {
1134 char c = name[i]; 1134 char c = name[i];
1135 if (isalnum(c)) 1135 if (isalnum(c))
1136 continue; 1136 continue;
1137 if ((c == '.') || (c == '-') || (c == '_')) 1137 if ((c == '.') || (c == '-') || (c == '_'))
1138 continue; 1138 continue;
1139 return -EINVAL; 1139 return -EINVAL;
1140 } 1140 }
1141 /* Specifying two names is forbidden */ 1141 /* Specifying two names is forbidden */
1142 if (opts->name) 1142 if (opts->name)
1143 return -EINVAL; 1143 return -EINVAL;
1144 opts->name = kstrndup(name, 1144 opts->name = kstrndup(name,
1145 MAX_CGROUP_ROOT_NAMELEN - 1, 1145 MAX_CGROUP_ROOT_NAMELEN - 1,
1146 GFP_KERNEL); 1146 GFP_KERNEL);
1147 if (!opts->name) 1147 if (!opts->name)
1148 return -ENOMEM; 1148 return -ENOMEM;
1149 1149
1150 continue; 1150 continue;
1151 } 1151 }
1152 1152
1153 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 1153 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
1154 struct cgroup_subsys *ss = subsys[i]; 1154 struct cgroup_subsys *ss = subsys[i];
1155 if (ss == NULL) 1155 if (ss == NULL)
1156 continue; 1156 continue;
1157 if (strcmp(token, ss->name)) 1157 if (strcmp(token, ss->name))
1158 continue; 1158 continue;
1159 if (ss->disabled) 1159 if (ss->disabled)
1160 continue; 1160 continue;
1161 1161
1162 /* Mutually exclusive option 'all' + subsystem name */ 1162 /* Mutually exclusive option 'all' + subsystem name */
1163 if (all_ss) 1163 if (all_ss)
1164 return -EINVAL; 1164 return -EINVAL;
1165 set_bit(i, &opts->subsys_bits); 1165 set_bit(i, &opts->subsys_bits);
1166 one_ss = true; 1166 one_ss = true;
1167 1167
1168 break; 1168 break;
1169 } 1169 }
1170 if (i == CGROUP_SUBSYS_COUNT) 1170 if (i == CGROUP_SUBSYS_COUNT)
1171 return -ENOENT; 1171 return -ENOENT;
1172 } 1172 }
1173 1173
1174 /* 1174 /*
1175 * If the 'all' option was specified, select all the subsystems; 1175 * If the 'all' option was specified, select all the subsystems;
1176 * otherwise, if none of 'all', 'none' or a subsystem name was 1176 * otherwise, if none of 'all', 'none' or a subsystem name was
1177 * specified, default to 'all'. 1177 * specified, default to 'all'.
1178 */ 1178 */
1179 if (all_ss || (!all_ss && !one_ss && !opts->none)) { 1179 if (all_ss || (!all_ss && !one_ss && !opts->none)) {
1180 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 1180 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
1181 struct cgroup_subsys *ss = subsys[i]; 1181 struct cgroup_subsys *ss = subsys[i];
1182 if (ss == NULL) 1182 if (ss == NULL)
1183 continue; 1183 continue;
1184 if (ss->disabled) 1184 if (ss->disabled)
1185 continue; 1185 continue;
1186 set_bit(i, &opts->subsys_bits); 1186 set_bit(i, &opts->subsys_bits);
1187 } 1187 }
1188 } 1188 }
1189 1189
1190 /* Consistency checks */ 1190 /* Consistency checks */
1191 1191
1192 /* 1192 /*
1193 * Option noprefix was introduced just for backward compatibility 1193 * Option noprefix was introduced just for backward compatibility
1194 * with the old cpuset, so we allow noprefix only if mounting just 1194 * with the old cpuset, so we allow noprefix only if mounting just
1195 * the cpuset subsystem. 1195 * the cpuset subsystem.
1196 */ 1196 */
1197 if (test_bit(ROOT_NOPREFIX, &opts->flags) && 1197 if (test_bit(ROOT_NOPREFIX, &opts->flags) &&
1198 (opts->subsys_bits & mask)) 1198 (opts->subsys_bits & mask))
1199 return -EINVAL; 1199 return -EINVAL;
1200 1200
1201 1201
1202 /* Can't specify "none" and some subsystems */ 1202 /* Can't specify "none" and some subsystems */
1203 if (opts->subsys_bits && opts->none) 1203 if (opts->subsys_bits && opts->none)
1204 return -EINVAL; 1204 return -EINVAL;
1205 1205
1206 /* 1206 /*
1207 * We either have to specify by name or by subsystems. (So all 1207 * We either have to specify by name or by subsystems. (So all
1208 * empty hierarchies must have a name). 1208 * empty hierarchies must have a name).
1209 */ 1209 */
1210 if (!opts->subsys_bits && !opts->name) 1210 if (!opts->subsys_bits && !opts->name)
1211 return -EINVAL; 1211 return -EINVAL;
1212 1212
1213 /* 1213 /*
1214 * Grab references on all the modules we'll need, so the subsystems 1214 * Grab references on all the modules we'll need, so the subsystems
1215 * don't dance around before rebind_subsystems attaches them. This may 1215 * don't dance around before rebind_subsystems attaches them. This may
1216 * take duplicate reference counts on a subsystem that's already used, 1216 * take duplicate reference counts on a subsystem that's already used,
1217 * but rebind_subsystems handles this case. 1217 * but rebind_subsystems handles this case.
1218 */ 1218 */
1219 for (i = CGROUP_BUILTIN_SUBSYS_COUNT; i < CGROUP_SUBSYS_COUNT; i++) { 1219 for (i = CGROUP_BUILTIN_SUBSYS_COUNT; i < CGROUP_SUBSYS_COUNT; i++) {
1220 unsigned long bit = 1UL << i; 1220 unsigned long bit = 1UL << i;
1221 1221
1222 if (!(bit & opts->subsys_bits)) 1222 if (!(bit & opts->subsys_bits))
1223 continue; 1223 continue;
1224 if (!try_module_get(subsys[i]->module)) { 1224 if (!try_module_get(subsys[i]->module)) {
1225 module_pin_failed = true; 1225 module_pin_failed = true;
1226 break; 1226 break;
1227 } 1227 }
1228 } 1228 }
1229 if (module_pin_failed) { 1229 if (module_pin_failed) {
1230 /* 1230 /*
1231 * oops, one of the modules was going away. this means that we 1231 * oops, one of the modules was going away. this means that we
1232 * raced with a module_delete call, and to the user this is 1232 * raced with a module_delete call, and to the user this is
1233 * essentially a "subsystem doesn't exist" case. 1233 * essentially a "subsystem doesn't exist" case.
1234 */ 1234 */
1235 for (i--; i >= CGROUP_BUILTIN_SUBSYS_COUNT; i--) { 1235 for (i--; i >= CGROUP_BUILTIN_SUBSYS_COUNT; i--) {
1236 /* drop refcounts only on the ones we took */ 1236 /* drop refcounts only on the ones we took */
1237 unsigned long bit = 1UL << i; 1237 unsigned long bit = 1UL << i;
1238 1238
1239 if (!(bit & opts->subsys_bits)) 1239 if (!(bit & opts->subsys_bits))
1240 continue; 1240 continue;
1241 module_put(subsys[i]->module); 1241 module_put(subsys[i]->module);
1242 } 1242 }
1243 return -ENOENT; 1243 return -ENOENT;
1244 } 1244 }
1245 1245
1246 return 0; 1246 return 0;
1247 } 1247 }
1248 1248
1249 static void drop_parsed_module_refcounts(unsigned long subsys_bits) 1249 static void drop_parsed_module_refcounts(unsigned long subsys_bits)
1250 { 1250 {
1251 int i; 1251 int i;
1252 for (i = CGROUP_BUILTIN_SUBSYS_COUNT; i < CGROUP_SUBSYS_COUNT; i++) { 1252 for (i = CGROUP_BUILTIN_SUBSYS_COUNT; i < CGROUP_SUBSYS_COUNT; i++) {
1253 unsigned long bit = 1UL << i; 1253 unsigned long bit = 1UL << i;
1254 1254
1255 if (!(bit & subsys_bits)) 1255 if (!(bit & subsys_bits))
1256 continue; 1256 continue;
1257 module_put(subsys[i]->module); 1257 module_put(subsys[i]->module);
1258 } 1258 }
1259 } 1259 }
1260 1260
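As a hedged illustration of the contract of parse_cgroupfs_options() and drop_parsed_module_refcounts() above (example_parse() and the option string are hypothetical; a writable buffer is needed because strsep() modifies it): an option string such as "cpuset,noprefix,name=boot" leaves the cpuset bit set in opts.subsys_bits, ROOT_NOPREFIX set in opts.flags and opts.name == "boot", and the caller owns the kmalloc'ed strings and any module references taken.

static int example_parse(void)
{
	struct cgroup_sb_opts opts;
	char data[] = "cpuset,noprefix,name=boot";	/* writable copy for strsep() */
	int ret;

	mutex_lock(&cgroup_mutex);		/* required by parse_cgroupfs_options() */
	ret = parse_cgroupfs_options(data, &opts);
	mutex_unlock(&cgroup_mutex);

	if (!ret)
		/* we are not keeping any subsystems, so give the refs back */
		drop_parsed_module_refcounts(opts.subsys_bits);
	kfree(opts.release_agent);
	kfree(opts.name);
	return ret;
}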
1261 static int cgroup_remount(struct super_block *sb, int *flags, char *data) 1261 static int cgroup_remount(struct super_block *sb, int *flags, char *data)
1262 { 1262 {
1263 int ret = 0; 1263 int ret = 0;
1264 struct cgroupfs_root *root = sb->s_fs_info; 1264 struct cgroupfs_root *root = sb->s_fs_info;
1265 struct cgroup *cgrp = &root->top_cgroup; 1265 struct cgroup *cgrp = &root->top_cgroup;
1266 struct cgroup_sb_opts opts; 1266 struct cgroup_sb_opts opts;
1267 1267
1268 mutex_lock(&cgrp->dentry->d_inode->i_mutex); 1268 mutex_lock(&cgrp->dentry->d_inode->i_mutex);
1269 mutex_lock(&cgroup_mutex); 1269 mutex_lock(&cgroup_mutex);
1270 1270
1271 /* See what subsystems are wanted */ 1271 /* See what subsystems are wanted */
1272 ret = parse_cgroupfs_options(data, &opts); 1272 ret = parse_cgroupfs_options(data, &opts);
1273 if (ret) 1273 if (ret)
1274 goto out_unlock; 1274 goto out_unlock;
1275 1275
1276 /* Don't allow flags or name to change at remount */ 1276 /* Don't allow flags or name to change at remount */
1277 if (opts.flags != root->flags || 1277 if (opts.flags != root->flags ||
1278 (opts.name && strcmp(opts.name, root->name))) { 1278 (opts.name && strcmp(opts.name, root->name))) {
1279 ret = -EINVAL; 1279 ret = -EINVAL;
1280 drop_parsed_module_refcounts(opts.subsys_bits); 1280 drop_parsed_module_refcounts(opts.subsys_bits);
1281 goto out_unlock; 1281 goto out_unlock;
1282 } 1282 }
1283 1283
1284 ret = rebind_subsystems(root, opts.subsys_bits); 1284 ret = rebind_subsystems(root, opts.subsys_bits);
1285 if (ret) { 1285 if (ret) {
1286 drop_parsed_module_refcounts(opts.subsys_bits); 1286 drop_parsed_module_refcounts(opts.subsys_bits);
1287 goto out_unlock; 1287 goto out_unlock;
1288 } 1288 }
1289 1289
1290 /* (re)populate subsystem files */ 1290 /* (re)populate subsystem files */
1291 cgroup_populate_dir(cgrp); 1291 cgroup_populate_dir(cgrp);
1292 1292
1293 if (opts.release_agent) 1293 if (opts.release_agent)
1294 strcpy(root->release_agent_path, opts.release_agent); 1294 strcpy(root->release_agent_path, opts.release_agent);
1295 out_unlock: 1295 out_unlock:
1296 kfree(opts.release_agent); 1296 kfree(opts.release_agent);
1297 kfree(opts.name); 1297 kfree(opts.name);
1298 mutex_unlock(&cgroup_mutex); 1298 mutex_unlock(&cgroup_mutex);
1299 mutex_unlock(&cgrp->dentry->d_inode->i_mutex); 1299 mutex_unlock(&cgrp->dentry->d_inode->i_mutex);
1300 return ret; 1300 return ret;
1301 } 1301 }
1302 1302
1303 static const struct super_operations cgroup_ops = { 1303 static const struct super_operations cgroup_ops = {
1304 .statfs = simple_statfs, 1304 .statfs = simple_statfs,
1305 .drop_inode = generic_delete_inode, 1305 .drop_inode = generic_delete_inode,
1306 .show_options = cgroup_show_options, 1306 .show_options = cgroup_show_options,
1307 .remount_fs = cgroup_remount, 1307 .remount_fs = cgroup_remount,
1308 }; 1308 };
1309 1309
1310 static void init_cgroup_housekeeping(struct cgroup *cgrp) 1310 static void init_cgroup_housekeeping(struct cgroup *cgrp)
1311 { 1311 {
1312 INIT_LIST_HEAD(&cgrp->sibling); 1312 INIT_LIST_HEAD(&cgrp->sibling);
1313 INIT_LIST_HEAD(&cgrp->children); 1313 INIT_LIST_HEAD(&cgrp->children);
1314 INIT_LIST_HEAD(&cgrp->css_sets); 1314 INIT_LIST_HEAD(&cgrp->css_sets);
1315 INIT_LIST_HEAD(&cgrp->release_list); 1315 INIT_LIST_HEAD(&cgrp->release_list);
1316 INIT_LIST_HEAD(&cgrp->pidlists); 1316 INIT_LIST_HEAD(&cgrp->pidlists);
1317 mutex_init(&cgrp->pidlist_mutex); 1317 mutex_init(&cgrp->pidlist_mutex);
1318 INIT_LIST_HEAD(&cgrp->event_list); 1318 INIT_LIST_HEAD(&cgrp->event_list);
1319 spin_lock_init(&cgrp->event_list_lock); 1319 spin_lock_init(&cgrp->event_list_lock);
1320 } 1320 }
1321 1321
1322 static void init_cgroup_root(struct cgroupfs_root *root) 1322 static void init_cgroup_root(struct cgroupfs_root *root)
1323 { 1323 {
1324 struct cgroup *cgrp = &root->top_cgroup; 1324 struct cgroup *cgrp = &root->top_cgroup;
1325 INIT_LIST_HEAD(&root->subsys_list); 1325 INIT_LIST_HEAD(&root->subsys_list);
1326 INIT_LIST_HEAD(&root->root_list); 1326 INIT_LIST_HEAD(&root->root_list);
1327 root->number_of_cgroups = 1; 1327 root->number_of_cgroups = 1;
1328 cgrp->root = root; 1328 cgrp->root = root;
1329 cgrp->top_cgroup = cgrp; 1329 cgrp->top_cgroup = cgrp;
1330 init_cgroup_housekeeping(cgrp); 1330 init_cgroup_housekeeping(cgrp);
1331 } 1331 }
1332 1332
1333 static bool init_root_id(struct cgroupfs_root *root) 1333 static bool init_root_id(struct cgroupfs_root *root)
1334 { 1334 {
1335 int ret = 0; 1335 int ret = 0;
1336 1336
1337 do { 1337 do {
1338 if (!ida_pre_get(&hierarchy_ida, GFP_KERNEL)) 1338 if (!ida_pre_get(&hierarchy_ida, GFP_KERNEL))
1339 return false; 1339 return false;
1340 spin_lock(&hierarchy_id_lock); 1340 spin_lock(&hierarchy_id_lock);
1341 /* Try to allocate the next unused ID */ 1341 /* Try to allocate the next unused ID */
1342 ret = ida_get_new_above(&hierarchy_ida, next_hierarchy_id, 1342 ret = ida_get_new_above(&hierarchy_ida, next_hierarchy_id,
1343 &root->hierarchy_id); 1343 &root->hierarchy_id);
1344 if (ret == -ENOSPC) 1344 if (ret == -ENOSPC)
1345 /* Try again starting from 0 */ 1345 /* Try again starting from 0 */
1346 ret = ida_get_new(&hierarchy_ida, &root->hierarchy_id); 1346 ret = ida_get_new(&hierarchy_ida, &root->hierarchy_id);
1347 if (!ret) { 1347 if (!ret) {
1348 next_hierarchy_id = root->hierarchy_id + 1; 1348 next_hierarchy_id = root->hierarchy_id + 1;
1349 } else if (ret != -EAGAIN) { 1349 } else if (ret != -EAGAIN) {
1350 /* Can only get here if the 31-bit IDR is full ... */ 1350 /* Can only get here if the 31-bit IDR is full ... */
1351 BUG_ON(ret); 1351 BUG_ON(ret);
1352 } 1352 }
1353 spin_unlock(&hierarchy_id_lock); 1353 spin_unlock(&hierarchy_id_lock);
1354 } while (ret); 1354 } while (ret);
1355 return true; 1355 return true;
1356 } 1356 }
1357 1357
1358 static int cgroup_test_super(struct super_block *sb, void *data) 1358 static int cgroup_test_super(struct super_block *sb, void *data)
1359 { 1359 {
1360 struct cgroup_sb_opts *opts = data; 1360 struct cgroup_sb_opts *opts = data;
1361 struct cgroupfs_root *root = sb->s_fs_info; 1361 struct cgroupfs_root *root = sb->s_fs_info;
1362 1362
1363 /* If we asked for a name then it must match */ 1363 /* If we asked for a name then it must match */
1364 if (opts->name && strcmp(opts->name, root->name)) 1364 if (opts->name && strcmp(opts->name, root->name))
1365 return 0; 1365 return 0;
1366 1366
1367 /* 1367 /*
1368 * If we asked for subsystems (or explicitly for no 1368 * If we asked for subsystems (or explicitly for no
1369 * subsystems) then they must match 1369 * subsystems) then they must match
1370 */ 1370 */
1371 if ((opts->subsys_bits || opts->none) 1371 if ((opts->subsys_bits || opts->none)
1372 && (opts->subsys_bits != root->subsys_bits)) 1372 && (opts->subsys_bits != root->subsys_bits))
1373 return 0; 1373 return 0;
1374 1374
1375 return 1; 1375 return 1;
1376 } 1376 }
1377 1377
1378 static struct cgroupfs_root *cgroup_root_from_opts(struct cgroup_sb_opts *opts) 1378 static struct cgroupfs_root *cgroup_root_from_opts(struct cgroup_sb_opts *opts)
1379 { 1379 {
1380 struct cgroupfs_root *root; 1380 struct cgroupfs_root *root;
1381 1381
1382 if (!opts->subsys_bits && !opts->none) 1382 if (!opts->subsys_bits && !opts->none)
1383 return NULL; 1383 return NULL;
1384 1384
1385 root = kzalloc(sizeof(*root), GFP_KERNEL); 1385 root = kzalloc(sizeof(*root), GFP_KERNEL);
1386 if (!root) 1386 if (!root)
1387 return ERR_PTR(-ENOMEM); 1387 return ERR_PTR(-ENOMEM);
1388 1388
1389 if (!init_root_id(root)) { 1389 if (!init_root_id(root)) {
1390 kfree(root); 1390 kfree(root);
1391 return ERR_PTR(-ENOMEM); 1391 return ERR_PTR(-ENOMEM);
1392 } 1392 }
1393 init_cgroup_root(root); 1393 init_cgroup_root(root);
1394 1394
1395 root->subsys_bits = opts->subsys_bits; 1395 root->subsys_bits = opts->subsys_bits;
1396 root->flags = opts->flags; 1396 root->flags = opts->flags;
1397 if (opts->release_agent) 1397 if (opts->release_agent)
1398 strcpy(root->release_agent_path, opts->release_agent); 1398 strcpy(root->release_agent_path, opts->release_agent);
1399 if (opts->name) 1399 if (opts->name)
1400 strcpy(root->name, opts->name); 1400 strcpy(root->name, opts->name);
1401 if (opts->clone_children) 1401 if (opts->clone_children)
1402 set_bit(CGRP_CLONE_CHILDREN, &root->top_cgroup.flags); 1402 set_bit(CGRP_CLONE_CHILDREN, &root->top_cgroup.flags);
1403 return root; 1403 return root;
1404 } 1404 }
1405 1405
1406 static void cgroup_drop_root(struct cgroupfs_root *root) 1406 static void cgroup_drop_root(struct cgroupfs_root *root)
1407 { 1407 {
1408 if (!root) 1408 if (!root)
1409 return; 1409 return;
1410 1410
1411 BUG_ON(!root->hierarchy_id); 1411 BUG_ON(!root->hierarchy_id);
1412 spin_lock(&hierarchy_id_lock); 1412 spin_lock(&hierarchy_id_lock);
1413 ida_remove(&hierarchy_ida, root->hierarchy_id); 1413 ida_remove(&hierarchy_ida, root->hierarchy_id);
1414 spin_unlock(&hierarchy_id_lock); 1414 spin_unlock(&hierarchy_id_lock);
1415 kfree(root); 1415 kfree(root);
1416 } 1416 }
1417 1417
1418 static int cgroup_set_super(struct super_block *sb, void *data) 1418 static int cgroup_set_super(struct super_block *sb, void *data)
1419 { 1419 {
1420 int ret; 1420 int ret;
1421 struct cgroup_sb_opts *opts = data; 1421 struct cgroup_sb_opts *opts = data;
1422 1422
1423 /* If we don't have a new root, we can't set up a new sb */ 1423 /* If we don't have a new root, we can't set up a new sb */
1424 if (!opts->new_root) 1424 if (!opts->new_root)
1425 return -EINVAL; 1425 return -EINVAL;
1426 1426
1427 BUG_ON(!opts->subsys_bits && !opts->none); 1427 BUG_ON(!opts->subsys_bits && !opts->none);
1428 1428
1429 ret = set_anon_super(sb, NULL); 1429 ret = set_anon_super(sb, NULL);
1430 if (ret) 1430 if (ret)
1431 return ret; 1431 return ret;
1432 1432
1433 sb->s_fs_info = opts->new_root; 1433 sb->s_fs_info = opts->new_root;
1434 opts->new_root->sb = sb; 1434 opts->new_root->sb = sb;
1435 1435
1436 sb->s_blocksize = PAGE_CACHE_SIZE; 1436 sb->s_blocksize = PAGE_CACHE_SIZE;
1437 sb->s_blocksize_bits = PAGE_CACHE_SHIFT; 1437 sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
1438 sb->s_magic = CGROUP_SUPER_MAGIC; 1438 sb->s_magic = CGROUP_SUPER_MAGIC;
1439 sb->s_op = &cgroup_ops; 1439 sb->s_op = &cgroup_ops;
1440 1440
1441 return 0; 1441 return 0;
1442 } 1442 }
1443 1443
1444 static int cgroup_get_rootdir(struct super_block *sb) 1444 static int cgroup_get_rootdir(struct super_block *sb)
1445 { 1445 {
1446 static const struct dentry_operations cgroup_dops = { 1446 static const struct dentry_operations cgroup_dops = {
1447 .d_iput = cgroup_diput, 1447 .d_iput = cgroup_diput,
1448 .d_delete = cgroup_delete, 1448 .d_delete = cgroup_delete,
1449 }; 1449 };
1450 1450
1451 struct inode *inode = 1451 struct inode *inode =
1452 cgroup_new_inode(S_IFDIR | S_IRUGO | S_IXUGO | S_IWUSR, sb); 1452 cgroup_new_inode(S_IFDIR | S_IRUGO | S_IXUGO | S_IWUSR, sb);
1453 struct dentry *dentry; 1453 struct dentry *dentry;
1454 1454
1455 if (!inode) 1455 if (!inode)
1456 return -ENOMEM; 1456 return -ENOMEM;
1457 1457
1458 inode->i_fop = &simple_dir_operations; 1458 inode->i_fop = &simple_dir_operations;
1459 inode->i_op = &cgroup_dir_inode_operations; 1459 inode->i_op = &cgroup_dir_inode_operations;
1460 /* directories start off with i_nlink == 2 (for "." entry) */ 1460 /* directories start off with i_nlink == 2 (for "." entry) */
1461 inc_nlink(inode); 1461 inc_nlink(inode);
1462 dentry = d_alloc_root(inode); 1462 dentry = d_alloc_root(inode);
1463 if (!dentry) { 1463 if (!dentry) {
1464 iput(inode); 1464 iput(inode);
1465 return -ENOMEM; 1465 return -ENOMEM;
1466 } 1466 }
1467 sb->s_root = dentry; 1467 sb->s_root = dentry;
1468 /* for everything else we want ->d_op set */ 1468 /* for everything else we want ->d_op set */
1469 sb->s_d_op = &cgroup_dops; 1469 sb->s_d_op = &cgroup_dops;
1470 return 0; 1470 return 0;
1471 } 1471 }
1472 1472
1473 static struct dentry *cgroup_mount(struct file_system_type *fs_type, 1473 static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1474 int flags, const char *unused_dev_name, 1474 int flags, const char *unused_dev_name,
1475 void *data) 1475 void *data)
1476 { 1476 {
1477 struct cgroup_sb_opts opts; 1477 struct cgroup_sb_opts opts;
1478 struct cgroupfs_root *root; 1478 struct cgroupfs_root *root;
1479 int ret = 0; 1479 int ret = 0;
1480 struct super_block *sb; 1480 struct super_block *sb;
1481 struct cgroupfs_root *new_root; 1481 struct cgroupfs_root *new_root;
1482 1482
1483 /* First find the desired set of subsystems */ 1483 /* First find the desired set of subsystems */
1484 mutex_lock(&cgroup_mutex); 1484 mutex_lock(&cgroup_mutex);
1485 ret = parse_cgroupfs_options(data, &opts); 1485 ret = parse_cgroupfs_options(data, &opts);
1486 mutex_unlock(&cgroup_mutex); 1486 mutex_unlock(&cgroup_mutex);
1487 if (ret) 1487 if (ret)
1488 goto out_err; 1488 goto out_err;
1489 1489
1490 /* 1490 /*
1491 * Allocate a new cgroup root. We may not need it if we're 1491 * Allocate a new cgroup root. We may not need it if we're
1492 * reusing an existing hierarchy. 1492 * reusing an existing hierarchy.
1493 */ 1493 */
1494 new_root = cgroup_root_from_opts(&opts); 1494 new_root = cgroup_root_from_opts(&opts);
1495 if (IS_ERR(new_root)) { 1495 if (IS_ERR(new_root)) {
1496 ret = PTR_ERR(new_root); 1496 ret = PTR_ERR(new_root);
1497 goto drop_modules; 1497 goto drop_modules;
1498 } 1498 }
1499 opts.new_root = new_root; 1499 opts.new_root = new_root;
1500 1500
1501 /* Locate an existing or new sb for this hierarchy */ 1501 /* Locate an existing or new sb for this hierarchy */
1502 sb = sget(fs_type, cgroup_test_super, cgroup_set_super, &opts); 1502 sb = sget(fs_type, cgroup_test_super, cgroup_set_super, &opts);
1503 if (IS_ERR(sb)) { 1503 if (IS_ERR(sb)) {
1504 ret = PTR_ERR(sb); 1504 ret = PTR_ERR(sb);
1505 cgroup_drop_root(opts.new_root); 1505 cgroup_drop_root(opts.new_root);
1506 goto drop_modules; 1506 goto drop_modules;
1507 } 1507 }
1508 1508
1509 root = sb->s_fs_info; 1509 root = sb->s_fs_info;
1510 BUG_ON(!root); 1510 BUG_ON(!root);
1511 if (root == opts.new_root) { 1511 if (root == opts.new_root) {
1512 /* We used the new root structure, so this is a new hierarchy */ 1512 /* We used the new root structure, so this is a new hierarchy */
1513 struct list_head tmp_cg_links; 1513 struct list_head tmp_cg_links;
1514 struct cgroup *root_cgrp = &root->top_cgroup; 1514 struct cgroup *root_cgrp = &root->top_cgroup;
1515 struct inode *inode; 1515 struct inode *inode;
1516 struct cgroupfs_root *existing_root; 1516 struct cgroupfs_root *existing_root;
1517 int i; 1517 int i;
1518 1518
1519 BUG_ON(sb->s_root != NULL); 1519 BUG_ON(sb->s_root != NULL);
1520 1520
1521 ret = cgroup_get_rootdir(sb); 1521 ret = cgroup_get_rootdir(sb);
1522 if (ret) 1522 if (ret)
1523 goto drop_new_super; 1523 goto drop_new_super;
1524 inode = sb->s_root->d_inode; 1524 inode = sb->s_root->d_inode;
1525 1525
1526 mutex_lock(&inode->i_mutex); 1526 mutex_lock(&inode->i_mutex);
1527 mutex_lock(&cgroup_mutex); 1527 mutex_lock(&cgroup_mutex);
1528 1528
1529 if (strlen(root->name)) { 1529 if (strlen(root->name)) {
1530 /* Check for name clashes with existing mounts */ 1530 /* Check for name clashes with existing mounts */
1531 for_each_active_root(existing_root) { 1531 for_each_active_root(existing_root) {
1532 if (!strcmp(existing_root->name, root->name)) { 1532 if (!strcmp(existing_root->name, root->name)) {
1533 ret = -EBUSY; 1533 ret = -EBUSY;
1534 mutex_unlock(&cgroup_mutex); 1534 mutex_unlock(&cgroup_mutex);
1535 mutex_unlock(&inode->i_mutex); 1535 mutex_unlock(&inode->i_mutex);
1536 goto drop_new_super; 1536 goto drop_new_super;
1537 } 1537 }
1538 } 1538 }
1539 } 1539 }
1540 1540
1541 /* 1541 /*
1542 * We're accessing css_set_count without locking 1542 * We're accessing css_set_count without locking
1543 * css_set_lock here, but that's OK - it can only be 1543 * css_set_lock here, but that's OK - it can only be
1544 * increased by someone holding cgroup_lock, and 1544 * increased by someone holding cgroup_lock, and
1545 * that's us. The worst that can happen is that we 1545 * that's us. The worst that can happen is that we
1546 * have some link structures left over 1546 * have some link structures left over
1547 */ 1547 */
1548 ret = allocate_cg_links(css_set_count, &tmp_cg_links); 1548 ret = allocate_cg_links(css_set_count, &tmp_cg_links);
1549 if (ret) { 1549 if (ret) {
1550 mutex_unlock(&cgroup_mutex); 1550 mutex_unlock(&cgroup_mutex);
1551 mutex_unlock(&inode->i_mutex); 1551 mutex_unlock(&inode->i_mutex);
1552 goto drop_new_super; 1552 goto drop_new_super;
1553 } 1553 }
1554 1554
1555 ret = rebind_subsystems(root, root->subsys_bits); 1555 ret = rebind_subsystems(root, root->subsys_bits);
1556 if (ret == -EBUSY) { 1556 if (ret == -EBUSY) {
1557 mutex_unlock(&cgroup_mutex); 1557 mutex_unlock(&cgroup_mutex);
1558 mutex_unlock(&inode->i_mutex); 1558 mutex_unlock(&inode->i_mutex);
1559 free_cg_links(&tmp_cg_links); 1559 free_cg_links(&tmp_cg_links);
1560 goto drop_new_super; 1560 goto drop_new_super;
1561 } 1561 }
1562 /* 1562 /*
1563 * There must be no failure case after here, since rebinding 1563 * There must be no failure case after here, since rebinding
1564 * takes care of subsystems' refcounts, which are explicitly 1564 * takes care of subsystems' refcounts, which are explicitly
1565 * dropped in the failure exit path. 1565 * dropped in the failure exit path.
1566 */ 1566 */
1567 1567
1568 /* EBUSY should be the only error here */ 1568 /* EBUSY should be the only error here */
1569 BUG_ON(ret); 1569 BUG_ON(ret);
1570 1570
1571 list_add(&root->root_list, &roots); 1571 list_add(&root->root_list, &roots);
1572 root_count++; 1572 root_count++;
1573 1573
1574 sb->s_root->d_fsdata = root_cgrp; 1574 sb->s_root->d_fsdata = root_cgrp;
1575 root->top_cgroup.dentry = sb->s_root; 1575 root->top_cgroup.dentry = sb->s_root;
1576 1576
1577 /* Link the top cgroup in this hierarchy into all 1577 /* Link the top cgroup in this hierarchy into all
1578 * the css_set objects */ 1578 * the css_set objects */
1579 write_lock(&css_set_lock); 1579 write_lock(&css_set_lock);
1580 for (i = 0; i < CSS_SET_TABLE_SIZE; i++) { 1580 for (i = 0; i < CSS_SET_TABLE_SIZE; i++) {
1581 struct hlist_head *hhead = &css_set_table[i]; 1581 struct hlist_head *hhead = &css_set_table[i];
1582 struct hlist_node *node; 1582 struct hlist_node *node;
1583 struct css_set *cg; 1583 struct css_set *cg;
1584 1584
1585 hlist_for_each_entry(cg, node, hhead, hlist) 1585 hlist_for_each_entry(cg, node, hhead, hlist)
1586 link_css_set(&tmp_cg_links, cg, root_cgrp); 1586 link_css_set(&tmp_cg_links, cg, root_cgrp);
1587 } 1587 }
1588 write_unlock(&css_set_lock); 1588 write_unlock(&css_set_lock);
1589 1589
1590 free_cg_links(&tmp_cg_links); 1590 free_cg_links(&tmp_cg_links);
1591 1591
1592 BUG_ON(!list_empty(&root_cgrp->sibling)); 1592 BUG_ON(!list_empty(&root_cgrp->sibling));
1593 BUG_ON(!list_empty(&root_cgrp->children)); 1593 BUG_ON(!list_empty(&root_cgrp->children));
1594 BUG_ON(root->number_of_cgroups != 1); 1594 BUG_ON(root->number_of_cgroups != 1);
1595 1595
1596 cgroup_populate_dir(root_cgrp); 1596 cgroup_populate_dir(root_cgrp);
1597 mutex_unlock(&cgroup_mutex); 1597 mutex_unlock(&cgroup_mutex);
1598 mutex_unlock(&inode->i_mutex); 1598 mutex_unlock(&inode->i_mutex);
1599 } else { 1599 } else {
1600 /* 1600 /*
1601 * We re-used an existing hierarchy - the new root (if 1601 * We re-used an existing hierarchy - the new root (if
1602 * any) is not needed 1602 * any) is not needed
1603 */ 1603 */
1604 cgroup_drop_root(opts.new_root); 1604 cgroup_drop_root(opts.new_root);
1605 /* no subsys rebinding, so refcounts don't change */ 1605 /* no subsys rebinding, so refcounts don't change */
1606 drop_parsed_module_refcounts(opts.subsys_bits); 1606 drop_parsed_module_refcounts(opts.subsys_bits);
1607 } 1607 }
1608 1608
1609 kfree(opts.release_agent); 1609 kfree(opts.release_agent);
1610 kfree(opts.name); 1610 kfree(opts.name);
1611 return dget(sb->s_root); 1611 return dget(sb->s_root);
1612 1612
1613 drop_new_super: 1613 drop_new_super:
1614 deactivate_locked_super(sb); 1614 deactivate_locked_super(sb);
1615 drop_modules: 1615 drop_modules:
1616 drop_parsed_module_refcounts(opts.subsys_bits); 1616 drop_parsed_module_refcounts(opts.subsys_bits);
1617 out_err: 1617 out_err:
1618 kfree(opts.release_agent); 1618 kfree(opts.release_agent);
1619 kfree(opts.name); 1619 kfree(opts.name);
1620 return ERR_PTR(ret); 1620 return ERR_PTR(ret);
1621 } 1621 }
1622 1622
1623 static void cgroup_kill_sb(struct super_block *sb) { 1623 static void cgroup_kill_sb(struct super_block *sb) {
1624 struct cgroupfs_root *root = sb->s_fs_info; 1624 struct cgroupfs_root *root = sb->s_fs_info;
1625 struct cgroup *cgrp = &root->top_cgroup; 1625 struct cgroup *cgrp = &root->top_cgroup;
1626 int ret; 1626 int ret;
1627 struct cg_cgroup_link *link; 1627 struct cg_cgroup_link *link;
1628 struct cg_cgroup_link *saved_link; 1628 struct cg_cgroup_link *saved_link;
1629 1629
1630 BUG_ON(!root); 1630 BUG_ON(!root);
1631 1631
1632 BUG_ON(root->number_of_cgroups != 1); 1632 BUG_ON(root->number_of_cgroups != 1);
1633 BUG_ON(!list_empty(&cgrp->children)); 1633 BUG_ON(!list_empty(&cgrp->children));
1634 BUG_ON(!list_empty(&cgrp->sibling)); 1634 BUG_ON(!list_empty(&cgrp->sibling));
1635 1635
1636 mutex_lock(&cgroup_mutex); 1636 mutex_lock(&cgroup_mutex);
1637 1637
1638 /* Rebind all subsystems back to the default hierarchy */ 1638 /* Rebind all subsystems back to the default hierarchy */
1639 ret = rebind_subsystems(root, 0); 1639 ret = rebind_subsystems(root, 0);
1640 /* Shouldn't be able to fail ... */ 1640 /* Shouldn't be able to fail ... */
1641 BUG_ON(ret); 1641 BUG_ON(ret);
1642 1642
1643 /* 1643 /*
1644 * Release all the links from css_sets to this hierarchy's 1644 * Release all the links from css_sets to this hierarchy's
1645 * root cgroup 1645 * root cgroup
1646 */ 1646 */
1647 write_lock(&css_set_lock); 1647 write_lock(&css_set_lock);
1648 1648
1649 list_for_each_entry_safe(link, saved_link, &cgrp->css_sets, 1649 list_for_each_entry_safe(link, saved_link, &cgrp->css_sets,
1650 cgrp_link_list) { 1650 cgrp_link_list) {
1651 list_del(&link->cg_link_list); 1651 list_del(&link->cg_link_list);
1652 list_del(&link->cgrp_link_list); 1652 list_del(&link->cgrp_link_list);
1653 kfree(link); 1653 kfree(link);
1654 } 1654 }
1655 write_unlock(&css_set_lock); 1655 write_unlock(&css_set_lock);
1656 1656
1657 if (!list_empty(&root->root_list)) { 1657 if (!list_empty(&root->root_list)) {
1658 list_del(&root->root_list); 1658 list_del(&root->root_list);
1659 root_count--; 1659 root_count--;
1660 } 1660 }
1661 1661
1662 mutex_unlock(&cgroup_mutex); 1662 mutex_unlock(&cgroup_mutex);
1663 1663
1664 kill_litter_super(sb); 1664 kill_litter_super(sb);
1665 cgroup_drop_root(root); 1665 cgroup_drop_root(root);
1666 } 1666 }
1667 1667
1668 static struct file_system_type cgroup_fs_type = { 1668 static struct file_system_type cgroup_fs_type = {
1669 .name = "cgroup", 1669 .name = "cgroup",
1670 .mount = cgroup_mount, 1670 .mount = cgroup_mount,
1671 .kill_sb = cgroup_kill_sb, 1671 .kill_sb = cgroup_kill_sb,
1672 }; 1672 };
1673 1673
1674 static struct kobject *cgroup_kobj; 1674 static struct kobject *cgroup_kobj;
1675 1675
1676 static inline struct cgroup *__d_cgrp(struct dentry *dentry) 1676 static inline struct cgroup *__d_cgrp(struct dentry *dentry)
1677 { 1677 {
1678 return dentry->d_fsdata; 1678 return dentry->d_fsdata;
1679 } 1679 }
1680 1680
1681 static inline struct cftype *__d_cft(struct dentry *dentry) 1681 static inline struct cftype *__d_cft(struct dentry *dentry)
1682 { 1682 {
1683 return dentry->d_fsdata; 1683 return dentry->d_fsdata;
1684 } 1684 }
1685 1685
1686 /** 1686 /**
1687 * cgroup_path - generate the path of a cgroup 1687 * cgroup_path - generate the path of a cgroup
1688 * @cgrp: the cgroup in question 1688 * @cgrp: the cgroup in question
1689 * @buf: the buffer to write the path into 1689 * @buf: the buffer to write the path into
1690 * @buflen: the length of the buffer 1690 * @buflen: the length of the buffer
1691 * 1691 *
1692 * Called with cgroup_mutex held or else with an RCU-protected cgroup 1692 * Called with cgroup_mutex held or else with an RCU-protected cgroup
1693 * reference. Writes path of cgroup into buf. Returns 0 on success, 1693 * reference. Writes path of cgroup into buf. Returns 0 on success,
1694 * -errno on error. 1694 * -errno on error.
1695 */ 1695 */
1696 int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen) 1696 int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
1697 { 1697 {
1698 char *start; 1698 char *start;
1699 struct dentry *dentry = rcu_dereference_check(cgrp->dentry, 1699 struct dentry *dentry = rcu_dereference_check(cgrp->dentry,
1700 rcu_read_lock_held() ||
1701 cgroup_lock_is_held()); 1700 cgroup_lock_is_held());
1702 1701
1703 if (!dentry || cgrp == dummytop) { 1702 if (!dentry || cgrp == dummytop) {
1704 /* 1703 /*
1705 * Inactive subsystems have no dentry for their root 1704 * Inactive subsystems have no dentry for their root
1706 * cgroup 1705 * cgroup
1707 */ 1706 */
1708 strcpy(buf, "/"); 1707 strcpy(buf, "/");
1709 return 0; 1708 return 0;
1710 } 1709 }
1711 1710
1712 start = buf + buflen; 1711 start = buf + buflen;
1713 1712
1714 *--start = '\0'; 1713 *--start = '\0';
1715 for (;;) { 1714 for (;;) {
1716 int len = dentry->d_name.len; 1715 int len = dentry->d_name.len;
1717 1716
1718 if ((start -= len) < buf) 1717 if ((start -= len) < buf)
1719 return -ENAMETOOLONG; 1718 return -ENAMETOOLONG;
1720 memcpy(start, dentry->d_name.name, len); 1719 memcpy(start, dentry->d_name.name, len);
1721 cgrp = cgrp->parent; 1720 cgrp = cgrp->parent;
1722 if (!cgrp) 1721 if (!cgrp)
1723 break; 1722 break;
1724 1723
1725 dentry = rcu_dereference_check(cgrp->dentry, 1724 dentry = rcu_dereference_check(cgrp->dentry,
1726 rcu_read_lock_held() ||
1727 cgroup_lock_is_held()); 1725 cgroup_lock_is_held());
1728 if (!cgrp->parent) 1726 if (!cgrp->parent)
1729 continue; 1727 continue;
1730 if (--start < buf) 1728 if (--start < buf)
1731 return -ENAMETOOLONG; 1729 return -ENAMETOOLONG;
1732 *start = '/'; 1730 *start = '/';
1733 } 1731 }
1734 memmove(buf, start, buf + buflen - start); 1732 memmove(buf, start, buf + buflen - start);
1735 return 0; 1733 return 0;
1736 } 1734 }
1737 EXPORT_SYMBOL_GPL(cgroup_path); 1735 EXPORT_SYMBOL_GPL(cgroup_path);
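The two hunks in cgroup_path() above are the pattern this commit applies tree-wide: since ca5ecddf, rcu_dereference_check() folds rcu_read_lock_held() into its lockdep condition, so callers only need to name the locks that are specific to them. A minimal sketch of the call-site change (shown out of context, purely for illustration):

	/*
	 * Before (redundant): the caller spelled out the RCU read lock itself.
	 *
	 *	dentry = rcu_dereference_check(cgrp->dentry,
	 *				       rcu_read_lock_held() ||
	 *				       cgroup_lock_is_held());
	 */

	/* After: rcu_read_lock_held() is implied by rcu_dereference_check(),
	 * so only the cgroup-specific condition remains.
	 */
	dentry = rcu_dereference_check(cgrp->dentry,
				       cgroup_lock_is_held());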
1738 1736
1739 /* 1737 /*
1740 * cgroup_task_migrate - move a task from one cgroup to another. 1738 * cgroup_task_migrate - move a task from one cgroup to another.
1741 * 1739 *
1742 * 'guarantee' is set if the caller promises that a new css_set for the task 1740 * 'guarantee' is set if the caller promises that a new css_set for the task
1743 * will already exist. If not set, this function might sleep, and can fail with 1741 * will already exist. If not set, this function might sleep, and can fail with
1744 * -ENOMEM. Otherwise, it can only fail with -ESRCH. 1742 * -ENOMEM. Otherwise, it can only fail with -ESRCH.
1745 */ 1743 */
1746 static int cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp, 1744 static int cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp,
1747 struct task_struct *tsk, bool guarantee) 1745 struct task_struct *tsk, bool guarantee)
1748 { 1746 {
1749 struct css_set *oldcg; 1747 struct css_set *oldcg;
1750 struct css_set *newcg; 1748 struct css_set *newcg;
1751 1749
1752 /* 1750 /*
1753 * get old css_set. we need to take task_lock and refcount it, because 1751 * get old css_set. we need to take task_lock and refcount it, because
1754 * an exiting task can change its css_set to init_css_set and drop its 1752 * an exiting task can change its css_set to init_css_set and drop its
1755 * old one without taking cgroup_mutex. 1753 * old one without taking cgroup_mutex.
1756 */ 1754 */
1757 task_lock(tsk); 1755 task_lock(tsk);
1758 oldcg = tsk->cgroups; 1756 oldcg = tsk->cgroups;
1759 get_css_set(oldcg); 1757 get_css_set(oldcg);
1760 task_unlock(tsk); 1758 task_unlock(tsk);
1761 1759
1762 /* locate or allocate a new css_set for this task. */ 1760 /* locate or allocate a new css_set for this task. */
1763 if (guarantee) { 1761 if (guarantee) {
1764 /* we know the css_set we want already exists. */ 1762 /* we know the css_set we want already exists. */
1765 struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT]; 1763 struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT];
1766 read_lock(&css_set_lock); 1764 read_lock(&css_set_lock);
1767 newcg = find_existing_css_set(oldcg, cgrp, template); 1765 newcg = find_existing_css_set(oldcg, cgrp, template);
1768 BUG_ON(!newcg); 1766 BUG_ON(!newcg);
1769 get_css_set(newcg); 1767 get_css_set(newcg);
1770 read_unlock(&css_set_lock); 1768 read_unlock(&css_set_lock);
1771 } else { 1769 } else {
1772 might_sleep(); 1770 might_sleep();
1773 /* find_css_set will give us newcg already referenced. */ 1771 /* find_css_set will give us newcg already referenced. */
1774 newcg = find_css_set(oldcg, cgrp); 1772 newcg = find_css_set(oldcg, cgrp);
1775 if (!newcg) { 1773 if (!newcg) {
1776 put_css_set(oldcg); 1774 put_css_set(oldcg);
1777 return -ENOMEM; 1775 return -ENOMEM;
1778 } 1776 }
1779 } 1777 }
1780 put_css_set(oldcg); 1778 put_css_set(oldcg);
1781 1779
1782 /* if PF_EXITING is set, the tsk->cgroups pointer is no longer safe. */ 1780 /* if PF_EXITING is set, the tsk->cgroups pointer is no longer safe. */
1783 task_lock(tsk); 1781 task_lock(tsk);
1784 if (tsk->flags & PF_EXITING) { 1782 if (tsk->flags & PF_EXITING) {
1785 task_unlock(tsk); 1783 task_unlock(tsk);
1786 put_css_set(newcg); 1784 put_css_set(newcg);
1787 return -ESRCH; 1785 return -ESRCH;
1788 } 1786 }
1789 rcu_assign_pointer(tsk->cgroups, newcg); 1787 rcu_assign_pointer(tsk->cgroups, newcg);
1790 task_unlock(tsk); 1788 task_unlock(tsk);
1791 1789
1792 /* Update the css_set linked lists if we're using them */ 1790 /* Update the css_set linked lists if we're using them */
1793 write_lock(&css_set_lock); 1791 write_lock(&css_set_lock);
1794 if (!list_empty(&tsk->cg_list)) 1792 if (!list_empty(&tsk->cg_list))
1795 list_move(&tsk->cg_list, &newcg->tasks); 1793 list_move(&tsk->cg_list, &newcg->tasks);
1796 write_unlock(&css_set_lock); 1794 write_unlock(&css_set_lock);
1797 1795
1798 /* 1796 /*
1799 * We just gained a reference on oldcg by taking it from the task. As 1797 * We just gained a reference on oldcg by taking it from the task. As
1800 * trading it for newcg is protected by cgroup_mutex, we're safe to drop 1798 * trading it for newcg is protected by cgroup_mutex, we're safe to drop
1801 * it here; it will be freed under RCU. 1799 * it here; it will be freed under RCU.
1802 */ 1800 */
1803 put_css_set(oldcg); 1801 put_css_set(oldcg);
1804 1802
1805 set_bit(CGRP_RELEASABLE, &oldcgrp->flags); 1803 set_bit(CGRP_RELEASABLE, &oldcgrp->flags);
1806 return 0; 1804 return 0;
1807 } 1805 }
1808 1806
1809 /** 1807 /**
1810 * cgroup_attach_task - attach task 'tsk' to cgroup 'cgrp' 1808 * cgroup_attach_task - attach task 'tsk' to cgroup 'cgrp'
1811 * @cgrp: the cgroup the task is attaching to 1809 * @cgrp: the cgroup the task is attaching to
1812 * @tsk: the task to be attached 1810 * @tsk: the task to be attached
1813 * 1811 *
1814 * Call holding cgroup_mutex. May take task_lock of 1812 * Call holding cgroup_mutex. May take task_lock of
1815 * the task 'tsk' during call. 1813 * the task 'tsk' during call.
1816 */ 1814 */
1817 int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) 1815 int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
1818 { 1816 {
1819 int retval; 1817 int retval;
1820 struct cgroup_subsys *ss, *failed_ss = NULL; 1818 struct cgroup_subsys *ss, *failed_ss = NULL;
1821 struct cgroup *oldcgrp; 1819 struct cgroup *oldcgrp;
1822 struct cgroupfs_root *root = cgrp->root; 1820 struct cgroupfs_root *root = cgrp->root;
1823 1821
1824 /* Nothing to do if the task is already in that cgroup */ 1822 /* Nothing to do if the task is already in that cgroup */
1825 oldcgrp = task_cgroup_from_root(tsk, root); 1823 oldcgrp = task_cgroup_from_root(tsk, root);
1826 if (cgrp == oldcgrp) 1824 if (cgrp == oldcgrp)
1827 return 0; 1825 return 0;
1828 1826
1829 for_each_subsys(root, ss) { 1827 for_each_subsys(root, ss) {
1830 if (ss->can_attach) { 1828 if (ss->can_attach) {
1831 retval = ss->can_attach(ss, cgrp, tsk); 1829 retval = ss->can_attach(ss, cgrp, tsk);
1832 if (retval) { 1830 if (retval) {
1833 /* 1831 /*
1834 * Remember on which subsystem the can_attach() 1832 * Remember on which subsystem the can_attach()
1835 * failed, so that we only call cancel_attach() 1833 * failed, so that we only call cancel_attach()
1836 * against the subsystems whose can_attach() 1834 * against the subsystems whose can_attach()
1837 * succeeded. (See below) 1835 * succeeded. (See below)
1838 */ 1836 */
1839 failed_ss = ss; 1837 failed_ss = ss;
1840 goto out; 1838 goto out;
1841 } 1839 }
1842 } 1840 }
1843 if (ss->can_attach_task) { 1841 if (ss->can_attach_task) {
1844 retval = ss->can_attach_task(cgrp, tsk); 1842 retval = ss->can_attach_task(cgrp, tsk);
1845 if (retval) { 1843 if (retval) {
1846 failed_ss = ss; 1844 failed_ss = ss;
1847 goto out; 1845 goto out;
1848 } 1846 }
1849 } 1847 }
1850 } 1848 }
1851 1849
1852 retval = cgroup_task_migrate(cgrp, oldcgrp, tsk, false); 1850 retval = cgroup_task_migrate(cgrp, oldcgrp, tsk, false);
1853 if (retval) 1851 if (retval)
1854 goto out; 1852 goto out;
1855 1853
1856 for_each_subsys(root, ss) { 1854 for_each_subsys(root, ss) {
1857 if (ss->pre_attach) 1855 if (ss->pre_attach)
1858 ss->pre_attach(cgrp); 1856 ss->pre_attach(cgrp);
1859 if (ss->attach_task) 1857 if (ss->attach_task)
1860 ss->attach_task(cgrp, tsk); 1858 ss->attach_task(cgrp, tsk);
1861 if (ss->attach) 1859 if (ss->attach)
1862 ss->attach(ss, cgrp, oldcgrp, tsk); 1860 ss->attach(ss, cgrp, oldcgrp, tsk);
1863 } 1861 }
1864 1862
1865 synchronize_rcu(); 1863 synchronize_rcu();
1866 1864
1867 /* 1865 /*
1868 * wake up rmdir() waiter. the rmdir should fail since the cgroup 1866 * wake up rmdir() waiter. the rmdir should fail since the cgroup
1869 * is no longer empty. 1867 * is no longer empty.
1870 */ 1868 */
1871 cgroup_wakeup_rmdir_waiter(cgrp); 1869 cgroup_wakeup_rmdir_waiter(cgrp);
1872 out: 1870 out:
1873 if (retval) { 1871 if (retval) {
1874 for_each_subsys(root, ss) { 1872 for_each_subsys(root, ss) {
1875 if (ss == failed_ss) 1873 if (ss == failed_ss)
1876 /* 1874 /*
1877 * This subsystem was the one that failed the 1875 * This subsystem was the one that failed the
1878 * can_attach() check earlier, so we don't need 1876 * can_attach() check earlier, so we don't need
1879 * to call cancel_attach() against it or any 1877 * to call cancel_attach() against it or any
1880 * remaining subsystems. 1878 * remaining subsystems.
1881 */ 1879 */
1882 break; 1880 break;
1883 if (ss->cancel_attach) 1881 if (ss->cancel_attach)
1884 ss->cancel_attach(ss, cgrp, tsk); 1882 ss->cancel_attach(ss, cgrp, tsk);
1885 } 1883 }
1886 } 1884 }
1887 return retval; 1885 return retval;
1888 } 1886 }
1889 1887
1890 /** 1888 /**
1891 * cgroup_attach_task_all - attach task 'tsk' to all cgroups of task 'from' 1889 * cgroup_attach_task_all - attach task 'tsk' to all cgroups of task 'from'
1892 * @from: attach to all cgroups of a given task 1890 * @from: attach to all cgroups of a given task
1893 * @tsk: the task to be attached 1891 * @tsk: the task to be attached
1894 */ 1892 */
1895 int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk) 1893 int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
1896 { 1894 {
1897 struct cgroupfs_root *root; 1895 struct cgroupfs_root *root;
1898 int retval = 0; 1896 int retval = 0;
1899 1897
1900 cgroup_lock(); 1898 cgroup_lock();
1901 for_each_active_root(root) { 1899 for_each_active_root(root) {
1902 struct cgroup *from_cg = task_cgroup_from_root(from, root); 1900 struct cgroup *from_cg = task_cgroup_from_root(from, root);
1903 1901
1904 retval = cgroup_attach_task(from_cg, tsk); 1902 retval = cgroup_attach_task(from_cg, tsk);
1905 if (retval) 1903 if (retval)
1906 break; 1904 break;
1907 } 1905 }
1908 cgroup_unlock(); 1906 cgroup_unlock();
1909 1907
1910 return retval; 1908 return retval;
1911 } 1909 }
1912 EXPORT_SYMBOL_GPL(cgroup_attach_task_all); 1910 EXPORT_SYMBOL_GPL(cgroup_attach_task_all);
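cgroup_attach_task_all() simply walks every active hierarchy under cgroup_lock() and reuses cgroup_attach_task() for each root. A hypothetical caller that wants a helper kernel thread to follow the current task's cgroups could use it as sketched below ('worker' is an illustrative task_struct pointer, not something defined in this file):

	/* Illustrative only: place 'worker' in all of current's cgroups. */
	int err = cgroup_attach_task_all(current, worker);

	if (err)
		printk(KERN_WARNING "attaching worker failed: %d\n", err);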
1913 1911
1914 /* 1912 /*
1915 * cgroup_attach_proc works in two stages, the first of which prefetches all 1913 * cgroup_attach_proc works in two stages, the first of which prefetches all
1916 * new css_sets needed (to make sure we have enough memory before committing 1914 * new css_sets needed (to make sure we have enough memory before committing
1917 * to the move) and stores them in a list of entries of the following type. 1915 * to the move) and stores them in a list of entries of the following type.
1918 * TODO: possible optimization: use css_set->rcu_head for chaining instead 1916 * TODO: possible optimization: use css_set->rcu_head for chaining instead
1919 */ 1917 */
1920 struct cg_list_entry { 1918 struct cg_list_entry {
1921 struct css_set *cg; 1919 struct css_set *cg;
1922 struct list_head links; 1920 struct list_head links;
1923 }; 1921 };
1924 1922
1925 static bool css_set_check_fetched(struct cgroup *cgrp, 1923 static bool css_set_check_fetched(struct cgroup *cgrp,
1926 struct task_struct *tsk, struct css_set *cg, 1924 struct task_struct *tsk, struct css_set *cg,
1927 struct list_head *newcg_list) 1925 struct list_head *newcg_list)
1928 { 1926 {
1929 struct css_set *newcg; 1927 struct css_set *newcg;
1930 struct cg_list_entry *cg_entry; 1928 struct cg_list_entry *cg_entry;
1931 struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT]; 1929 struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT];
1932 1930
1933 read_lock(&css_set_lock); 1931 read_lock(&css_set_lock);
1934 newcg = find_existing_css_set(cg, cgrp, template); 1932 newcg = find_existing_css_set(cg, cgrp, template);
1935 if (newcg) 1933 if (newcg)
1936 get_css_set(newcg); 1934 get_css_set(newcg);
1937 read_unlock(&css_set_lock); 1935 read_unlock(&css_set_lock);
1938 1936
1939 /* doesn't exist at all? */ 1937 /* doesn't exist at all? */
1940 if (!newcg) 1938 if (!newcg)
1941 return false; 1939 return false;
1942 /* see if it's already in the list */ 1940 /* see if it's already in the list */
1943 list_for_each_entry(cg_entry, newcg_list, links) { 1941 list_for_each_entry(cg_entry, newcg_list, links) {
1944 if (cg_entry->cg == newcg) { 1942 if (cg_entry->cg == newcg) {
1945 put_css_set(newcg); 1943 put_css_set(newcg);
1946 return true; 1944 return true;
1947 } 1945 }
1948 } 1946 }
1949 1947
1950 /* not found */ 1948 /* not found */
1951 put_css_set(newcg); 1949 put_css_set(newcg);
1952 return false; 1950 return false;
1953 } 1951 }
1954 1952
1955 /* 1953 /*
1956 * Find the new css_set and store it in the list in preparation for moving the 1954 * Find the new css_set and store it in the list in preparation for moving the
1957 * given task to the given cgroup. Returns 0 or -ENOMEM. 1955 * given task to the given cgroup. Returns 0 or -ENOMEM.
1958 */ 1956 */
1959 static int css_set_prefetch(struct cgroup *cgrp, struct css_set *cg, 1957 static int css_set_prefetch(struct cgroup *cgrp, struct css_set *cg,
1960 struct list_head *newcg_list) 1958 struct list_head *newcg_list)
1961 { 1959 {
1962 struct css_set *newcg; 1960 struct css_set *newcg;
1963 struct cg_list_entry *cg_entry; 1961 struct cg_list_entry *cg_entry;
1964 1962
1965 /* ensure a new css_set will exist for this thread */ 1963 /* ensure a new css_set will exist for this thread */
1966 newcg = find_css_set(cg, cgrp); 1964 newcg = find_css_set(cg, cgrp);
1967 if (!newcg) 1965 if (!newcg)
1968 return -ENOMEM; 1966 return -ENOMEM;
1969 /* add it to the list */ 1967 /* add it to the list */
1970 cg_entry = kmalloc(sizeof(struct cg_list_entry), GFP_KERNEL); 1968 cg_entry = kmalloc(sizeof(struct cg_list_entry), GFP_KERNEL);
1971 if (!cg_entry) { 1969 if (!cg_entry) {
1972 put_css_set(newcg); 1970 put_css_set(newcg);
1973 return -ENOMEM; 1971 return -ENOMEM;
1974 } 1972 }
1975 cg_entry->cg = newcg; 1973 cg_entry->cg = newcg;
1976 list_add(&cg_entry->links, newcg_list); 1974 list_add(&cg_entry->links, newcg_list);
1977 return 0; 1975 return 0;
1978 } 1976 }
1979 1977
1980 /** 1978 /**
1981 * cgroup_attach_proc - attach all threads in a threadgroup to a cgroup 1979 * cgroup_attach_proc - attach all threads in a threadgroup to a cgroup
1982 * @cgrp: the cgroup to attach to 1980 * @cgrp: the cgroup to attach to
1983 * @leader: the threadgroup leader task_struct of the group to be attached 1981 * @leader: the threadgroup leader task_struct of the group to be attached
1984 * 1982 *
1985 * Call holding cgroup_mutex and the threadgroup_fork_lock of the leader. Will 1983 * Call holding cgroup_mutex and the threadgroup_fork_lock of the leader. Will
1986 * take task_lock of each thread in leader's threadgroup individually in turn. 1984 * take task_lock of each thread in leader's threadgroup individually in turn.
1987 */ 1985 */
1988 int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader) 1986 int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
1989 { 1987 {
1990 int retval, i, group_size; 1988 int retval, i, group_size;
1991 struct cgroup_subsys *ss, *failed_ss = NULL; 1989 struct cgroup_subsys *ss, *failed_ss = NULL;
1992 bool cancel_failed_ss = false; 1990 bool cancel_failed_ss = false;
1993 /* guaranteed to be initialized later, but the compiler needs this */ 1991 /* guaranteed to be initialized later, but the compiler needs this */
1994 struct cgroup *oldcgrp = NULL; 1992 struct cgroup *oldcgrp = NULL;
1995 struct css_set *oldcg; 1993 struct css_set *oldcg;
1996 struct cgroupfs_root *root = cgrp->root; 1994 struct cgroupfs_root *root = cgrp->root;
1997 /* threadgroup list cursor and array */ 1995 /* threadgroup list cursor and array */
1998 struct task_struct *tsk; 1996 struct task_struct *tsk;
1999 struct flex_array *group; 1997 struct flex_array *group;
2000 /* 1998 /*
2001 * we need to make sure we have css_sets for all the tasks we're 1999 * we need to make sure we have css_sets for all the tasks we're
2002 * going to move -before- we actually start moving them, so that in 2000 * going to move -before- we actually start moving them, so that in
2003 * case we get an ENOMEM we can bail out before making any changes. 2001 * case we get an ENOMEM we can bail out before making any changes.
2004 */ 2002 */
2005 struct list_head newcg_list; 2003 struct list_head newcg_list;
2006 struct cg_list_entry *cg_entry, *temp_nobe; 2004 struct cg_list_entry *cg_entry, *temp_nobe;
2007 2005
2008 /* 2006 /*
2009 * step 0: in order to do expensive, possibly blocking operations for 2007 * step 0: in order to do expensive, possibly blocking operations for
2010 * every thread, we cannot iterate the thread group list, since it needs 2008 * every thread, we cannot iterate the thread group list, since it needs
2011 * rcu or tasklist locked. instead, build an array of all threads in the 2009 * rcu or tasklist locked. instead, build an array of all threads in the
2012 * group - threadgroup_fork_lock prevents new threads from appearing, 2010 * group - threadgroup_fork_lock prevents new threads from appearing,
2013 * and if threads exit, this will just be an over-estimate. 2011 * and if threads exit, this will just be an over-estimate.
2014 */ 2012 */
2015 group_size = get_nr_threads(leader); 2013 group_size = get_nr_threads(leader);
2016 /* flex_array supports very large thread-groups better than kmalloc. */ 2014 /* flex_array supports very large thread-groups better than kmalloc. */
2017 group = flex_array_alloc(sizeof(struct task_struct *), group_size, 2015 group = flex_array_alloc(sizeof(struct task_struct *), group_size,
2018 GFP_KERNEL); 2016 GFP_KERNEL);
2019 if (!group) 2017 if (!group)
2020 return -ENOMEM; 2018 return -ENOMEM;
2021 /* pre-allocate to guarantee space while iterating in rcu read-side. */ 2019 /* pre-allocate to guarantee space while iterating in rcu read-side. */
2022 retval = flex_array_prealloc(group, 0, group_size - 1, GFP_KERNEL); 2020 retval = flex_array_prealloc(group, 0, group_size - 1, GFP_KERNEL);
2023 if (retval) 2021 if (retval)
2024 goto out_free_group_list; 2022 goto out_free_group_list;
2025 2023
2026 /* prevent changes to the threadgroup list while we take a snapshot. */ 2024 /* prevent changes to the threadgroup list while we take a snapshot. */
2027 rcu_read_lock(); 2025 rcu_read_lock();
2028 if (!thread_group_leader(leader)) { 2026 if (!thread_group_leader(leader)) {
2029 /* 2027 /*
2030 * a race with de_thread from another thread's exec() may strip 2028 * a race with de_thread from another thread's exec() may strip
2031 * us of our leadership, making while_each_thread unsafe to use 2029 * us of our leadership, making while_each_thread unsafe to use
2032 * on this task. if this happens, there is no choice but to 2030 * on this task. if this happens, there is no choice but to
2033 * throw this task away and try again (from cgroup_procs_write); 2031 * throw this task away and try again (from cgroup_procs_write);
2034 * this is "double-double-toil-and-trouble-check locking". 2032 * this is "double-double-toil-and-trouble-check locking".
2035 */ 2033 */
2036 rcu_read_unlock(); 2034 rcu_read_unlock();
2037 retval = -EAGAIN; 2035 retval = -EAGAIN;
2038 goto out_free_group_list; 2036 goto out_free_group_list;
2039 } 2037 }
2040 /* take a reference on each task in the group to go in the array. */ 2038 /* take a reference on each task in the group to go in the array. */
2041 tsk = leader; 2039 tsk = leader;
2042 i = 0; 2040 i = 0;
2043 do { 2041 do {
2044 /* as per above, nr_threads may decrease, but not increase. */ 2042 /* as per above, nr_threads may decrease, but not increase. */
2045 BUG_ON(i >= group_size); 2043 BUG_ON(i >= group_size);
2046 get_task_struct(tsk); 2044 get_task_struct(tsk);
2047 /* 2045 /*
2048 * saying GFP_ATOMIC has no effect here because we did prealloc 2046 * saying GFP_ATOMIC has no effect here because we did prealloc
2049 * earlier, but it's good form to communicate our expectations. 2047 * earlier, but it's good form to communicate our expectations.
2050 */ 2048 */
2051 retval = flex_array_put_ptr(group, i, tsk, GFP_ATOMIC); 2049 retval = flex_array_put_ptr(group, i, tsk, GFP_ATOMIC);
2052 BUG_ON(retval != 0); 2050 BUG_ON(retval != 0);
2053 i++; 2051 i++;
2054 } while_each_thread(leader, tsk); 2052 } while_each_thread(leader, tsk);
2055 /* remember the number of threads in the array for later. */ 2053 /* remember the number of threads in the array for later. */
2056 group_size = i; 2054 group_size = i;
2057 rcu_read_unlock(); 2055 rcu_read_unlock();
2058 2056
2059 /* 2057 /*
2060 * step 1: check that we can legitimately attach to the cgroup. 2058 * step 1: check that we can legitimately attach to the cgroup.
2061 */ 2059 */
2062 for_each_subsys(root, ss) { 2060 for_each_subsys(root, ss) {
2063 if (ss->can_attach) { 2061 if (ss->can_attach) {
2064 retval = ss->can_attach(ss, cgrp, leader); 2062 retval = ss->can_attach(ss, cgrp, leader);
2065 if (retval) { 2063 if (retval) {
2066 failed_ss = ss; 2064 failed_ss = ss;
2067 goto out_cancel_attach; 2065 goto out_cancel_attach;
2068 } 2066 }
2069 } 2067 }
2070 /* a callback to be run on every thread in the threadgroup. */ 2068 /* a callback to be run on every thread in the threadgroup. */
2071 if (ss->can_attach_task) { 2069 if (ss->can_attach_task) {
2072 /* run on each task in the threadgroup. */ 2070 /* run on each task in the threadgroup. */
2073 for (i = 0; i < group_size; i++) { 2071 for (i = 0; i < group_size; i++) {
2074 tsk = flex_array_get_ptr(group, i); 2072 tsk = flex_array_get_ptr(group, i);
2075 retval = ss->can_attach_task(cgrp, tsk); 2073 retval = ss->can_attach_task(cgrp, tsk);
2076 if (retval) { 2074 if (retval) {
2077 failed_ss = ss; 2075 failed_ss = ss;
2078 cancel_failed_ss = true; 2076 cancel_failed_ss = true;
2079 goto out_cancel_attach; 2077 goto out_cancel_attach;
2080 } 2078 }
2081 } 2079 }
2082 } 2080 }
2083 } 2081 }
2084 2082
2085 /* 2083 /*
2086 * step 2: make sure css_sets exist for all threads to be migrated. 2084 * step 2: make sure css_sets exist for all threads to be migrated.
2087 * we use find_css_set, which allocates a new one if necessary. 2085 * we use find_css_set, which allocates a new one if necessary.
2088 */ 2086 */
2089 INIT_LIST_HEAD(&newcg_list); 2087 INIT_LIST_HEAD(&newcg_list);
2090 for (i = 0; i < group_size; i++) { 2088 for (i = 0; i < group_size; i++) {
2091 tsk = flex_array_get_ptr(group, i); 2089 tsk = flex_array_get_ptr(group, i);
2092 /* nothing to do if this task is already in the cgroup */ 2090 /* nothing to do if this task is already in the cgroup */
2093 oldcgrp = task_cgroup_from_root(tsk, root); 2091 oldcgrp = task_cgroup_from_root(tsk, root);
2094 if (cgrp == oldcgrp) 2092 if (cgrp == oldcgrp)
2095 continue; 2093 continue;
2096 /* get old css_set pointer */ 2094 /* get old css_set pointer */
2097 task_lock(tsk); 2095 task_lock(tsk);
2098 if (tsk->flags & PF_EXITING) { 2096 if (tsk->flags & PF_EXITING) {
2099 /* ignore this task if it's going away */ 2097 /* ignore this task if it's going away */
2100 task_unlock(tsk); 2098 task_unlock(tsk);
2101 continue; 2099 continue;
2102 } 2100 }
2103 oldcg = tsk->cgroups; 2101 oldcg = tsk->cgroups;
2104 get_css_set(oldcg); 2102 get_css_set(oldcg);
2105 task_unlock(tsk); 2103 task_unlock(tsk);
2106 /* see if the new one for us is already in the list? */ 2104 /* see if the new one for us is already in the list? */
2107 if (css_set_check_fetched(cgrp, tsk, oldcg, &newcg_list)) { 2105 if (css_set_check_fetched(cgrp, tsk, oldcg, &newcg_list)) {
2108 /* was already there, nothing to do. */ 2106 /* was already there, nothing to do. */
2109 put_css_set(oldcg); 2107 put_css_set(oldcg);
2110 } else { 2108 } else {
2111 /* we don't already have it. get new one. */ 2109 /* we don't already have it. get new one. */
2112 retval = css_set_prefetch(cgrp, oldcg, &newcg_list); 2110 retval = css_set_prefetch(cgrp, oldcg, &newcg_list);
2113 put_css_set(oldcg); 2111 put_css_set(oldcg);
2114 if (retval) 2112 if (retval)
2115 goto out_list_teardown; 2113 goto out_list_teardown;
2116 } 2114 }
2117 } 2115 }
2118 2116
2119 /* 2117 /*
2120 * step 3: now that we're guaranteed success wrt the css_sets, proceed 2118 * step 3: now that we're guaranteed success wrt the css_sets, proceed
2121 * to move all tasks to the new cgroup, calling ss->attach_task for each 2119 * to move all tasks to the new cgroup, calling ss->attach_task for each
2122 * one along the way. there are no failure cases after here, so this is 2120 * one along the way. there are no failure cases after here, so this is
2123 * the commit point. 2121 * the commit point.
2124 */ 2122 */
2125 for_each_subsys(root, ss) { 2123 for_each_subsys(root, ss) {
2126 if (ss->pre_attach) 2124 if (ss->pre_attach)
2127 ss->pre_attach(cgrp); 2125 ss->pre_attach(cgrp);
2128 } 2126 }
2129 for (i = 0; i < group_size; i++) { 2127 for (i = 0; i < group_size; i++) {
2130 tsk = flex_array_get_ptr(group, i); 2128 tsk = flex_array_get_ptr(group, i);
2131 /* leave current thread as it is if it's already there */ 2129 /* leave current thread as it is if it's already there */
2132 oldcgrp = task_cgroup_from_root(tsk, root); 2130 oldcgrp = task_cgroup_from_root(tsk, root);
2133 if (cgrp == oldcgrp) 2131 if (cgrp == oldcgrp)
2134 continue; 2132 continue;
2135 /* attach each task to each subsystem */ 2133 /* attach each task to each subsystem */
2136 for_each_subsys(root, ss) { 2134 for_each_subsys(root, ss) {
2137 if (ss->attach_task) 2135 if (ss->attach_task)
2138 ss->attach_task(cgrp, tsk); 2136 ss->attach_task(cgrp, tsk);
2139 } 2137 }
2140 /* if the thread is PF_EXITING, it can just get skipped. */ 2138 /* if the thread is PF_EXITING, it can just get skipped. */
2141 retval = cgroup_task_migrate(cgrp, oldcgrp, tsk, true); 2139 retval = cgroup_task_migrate(cgrp, oldcgrp, tsk, true);
2142 BUG_ON(retval != 0 && retval != -ESRCH); 2140 BUG_ON(retval != 0 && retval != -ESRCH);
2143 } 2141 }
2144 /* nothing is sensitive to fork() after this point. */ 2142 /* nothing is sensitive to fork() after this point. */
2145 2143
2146 /* 2144 /*
2147 * step 4: do expensive, non-thread-specific subsystem callbacks. 2145 * step 4: do expensive, non-thread-specific subsystem callbacks.
2148 * TODO: if ever a subsystem needs to know the oldcgrp for each task 2146 * TODO: if ever a subsystem needs to know the oldcgrp for each task
2149 * being moved, this call will need to be reworked to communicate that. 2147 * being moved, this call will need to be reworked to communicate that.
2150 */ 2148 */
2151 for_each_subsys(root, ss) { 2149 for_each_subsys(root, ss) {
2152 if (ss->attach) 2150 if (ss->attach)
2153 ss->attach(ss, cgrp, oldcgrp, leader); 2151 ss->attach(ss, cgrp, oldcgrp, leader);
2154 } 2152 }
2155 2153
2156 /* 2154 /*
2157 * step 5: success! and cleanup 2155 * step 5: success! and cleanup
2158 */ 2156 */
2159 synchronize_rcu(); 2157 synchronize_rcu();
2160 cgroup_wakeup_rmdir_waiter(cgrp); 2158 cgroup_wakeup_rmdir_waiter(cgrp);
2161 retval = 0; 2159 retval = 0;
2162 out_list_teardown: 2160 out_list_teardown:
2163 /* clean up the list of prefetched css_sets. */ 2161 /* clean up the list of prefetched css_sets. */
2164 list_for_each_entry_safe(cg_entry, temp_nobe, &newcg_list, links) { 2162 list_for_each_entry_safe(cg_entry, temp_nobe, &newcg_list, links) {
2165 list_del(&cg_entry->links); 2163 list_del(&cg_entry->links);
2166 put_css_set(cg_entry->cg); 2164 put_css_set(cg_entry->cg);
2167 kfree(cg_entry); 2165 kfree(cg_entry);
2168 } 2166 }
2169 out_cancel_attach: 2167 out_cancel_attach:
2170 /* same deal as in cgroup_attach_task */ 2168 /* same deal as in cgroup_attach_task */
2171 if (retval) { 2169 if (retval) {
2172 for_each_subsys(root, ss) { 2170 for_each_subsys(root, ss) {
2173 if (ss == failed_ss) { 2171 if (ss == failed_ss) {
2174 if (cancel_failed_ss && ss->cancel_attach) 2172 if (cancel_failed_ss && ss->cancel_attach)
2175 ss->cancel_attach(ss, cgrp, leader); 2173 ss->cancel_attach(ss, cgrp, leader);
2176 break; 2174 break;
2177 } 2175 }
2178 if (ss->cancel_attach) 2176 if (ss->cancel_attach)
2179 ss->cancel_attach(ss, cgrp, leader); 2177 ss->cancel_attach(ss, cgrp, leader);
2180 } 2178 }
2181 } 2179 }
2182 /* clean up the array of referenced threads in the group. */ 2180 /* clean up the array of referenced threads in the group. */
2183 for (i = 0; i < group_size; i++) { 2181 for (i = 0; i < group_size; i++) {
2184 tsk = flex_array_get_ptr(group, i); 2182 tsk = flex_array_get_ptr(group, i);
2185 put_task_struct(tsk); 2183 put_task_struct(tsk);
2186 } 2184 }
2187 out_free_group_list: 2185 out_free_group_list:
2188 flex_array_free(group); 2186 flex_array_free(group);
2189 return retval; 2187 return retval;
2190 } 2188 }
2191 2189
2192 /* 2190 /*
2193 * Find the task_struct of the task to attach by vpid and pass it along to the 2191 * Find the task_struct of the task to attach by vpid and pass it along to the
2194 * function to attach either it or all tasks in its threadgroup. Will take 2192 * function to attach either it or all tasks in its threadgroup. Will take
2195 * cgroup_mutex; may take task_lock of task. 2193 * cgroup_mutex; may take task_lock of task.
2196 */ 2194 */
2197 static int attach_task_by_pid(struct cgroup *cgrp, u64 pid, bool threadgroup) 2195 static int attach_task_by_pid(struct cgroup *cgrp, u64 pid, bool threadgroup)
2198 { 2196 {
2199 struct task_struct *tsk; 2197 struct task_struct *tsk;
2200 const struct cred *cred = current_cred(), *tcred; 2198 const struct cred *cred = current_cred(), *tcred;
2201 int ret; 2199 int ret;
2202 2200
2203 if (!cgroup_lock_live_group(cgrp)) 2201 if (!cgroup_lock_live_group(cgrp))
2204 return -ENODEV; 2202 return -ENODEV;
2205 2203
2206 if (pid) { 2204 if (pid) {
2207 rcu_read_lock(); 2205 rcu_read_lock();
2208 tsk = find_task_by_vpid(pid); 2206 tsk = find_task_by_vpid(pid);
2209 if (!tsk) { 2207 if (!tsk) {
2210 rcu_read_unlock(); 2208 rcu_read_unlock();
2211 cgroup_unlock(); 2209 cgroup_unlock();
2212 return -ESRCH; 2210 return -ESRCH;
2213 } 2211 }
2214 if (threadgroup) { 2212 if (threadgroup) {
2215 /* 2213 /*
2216 * RCU protects this access, since tsk was found in the 2214 * RCU protects this access, since tsk was found in the
2217 * tid map. a race with de_thread may cause group_leader 2215 * tid map. a race with de_thread may cause group_leader
2218 * to stop being the leader, but cgroup_attach_proc will 2216 * to stop being the leader, but cgroup_attach_proc will
2219 * detect it later. 2217 * detect it later.
2220 */ 2218 */
2221 tsk = tsk->group_leader; 2219 tsk = tsk->group_leader;
2222 } else if (tsk->flags & PF_EXITING) { 2220 } else if (tsk->flags & PF_EXITING) {
2223 /* optimization for the single-task-only case */ 2221 /* optimization for the single-task-only case */
2224 rcu_read_unlock(); 2222 rcu_read_unlock();
2225 cgroup_unlock(); 2223 cgroup_unlock();
2226 return -ESRCH; 2224 return -ESRCH;
2227 } 2225 }
2228 2226
2229 /* 2227 /*
2230 * even if we're attaching all tasks in the thread group, we 2228 * even if we're attaching all tasks in the thread group, we
2231 * only need to check permissions on one of them. 2229 * only need to check permissions on one of them.
2232 */ 2230 */
2233 tcred = __task_cred(tsk); 2231 tcred = __task_cred(tsk);
2234 if (cred->euid && 2232 if (cred->euid &&
2235 cred->euid != tcred->uid && 2233 cred->euid != tcred->uid &&
2236 cred->euid != tcred->suid) { 2234 cred->euid != tcred->suid) {
2237 rcu_read_unlock(); 2235 rcu_read_unlock();
2238 cgroup_unlock(); 2236 cgroup_unlock();
2239 return -EACCES; 2237 return -EACCES;
2240 } 2238 }
2241 get_task_struct(tsk); 2239 get_task_struct(tsk);
2242 rcu_read_unlock(); 2240 rcu_read_unlock();
2243 } else { 2241 } else {
2244 if (threadgroup) 2242 if (threadgroup)
2245 tsk = current->group_leader; 2243 tsk = current->group_leader;
2246 else 2244 else
2247 tsk = current; 2245 tsk = current;
2248 get_task_struct(tsk); 2246 get_task_struct(tsk);
2249 } 2247 }
2250 2248
2251 if (threadgroup) { 2249 if (threadgroup) {
2252 threadgroup_fork_write_lock(tsk); 2250 threadgroup_fork_write_lock(tsk);
2253 ret = cgroup_attach_proc(cgrp, tsk); 2251 ret = cgroup_attach_proc(cgrp, tsk);
2254 threadgroup_fork_write_unlock(tsk); 2252 threadgroup_fork_write_unlock(tsk);
2255 } else { 2253 } else {
2256 ret = cgroup_attach_task(cgrp, tsk); 2254 ret = cgroup_attach_task(cgrp, tsk);
2257 } 2255 }
2258 put_task_struct(tsk); 2256 put_task_struct(tsk);
2259 cgroup_unlock(); 2257 cgroup_unlock();
2260 return ret; 2258 return ret;
2261 } 2259 }
2262 2260
2263 static int cgroup_tasks_write(struct cgroup *cgrp, struct cftype *cft, u64 pid) 2261 static int cgroup_tasks_write(struct cgroup *cgrp, struct cftype *cft, u64 pid)
2264 { 2262 {
2265 return attach_task_by_pid(cgrp, pid, false); 2263 return attach_task_by_pid(cgrp, pid, false);
2266 } 2264 }
2267 2265
2268 static int cgroup_procs_write(struct cgroup *cgrp, struct cftype *cft, u64 tgid) 2266 static int cgroup_procs_write(struct cgroup *cgrp, struct cftype *cft, u64 tgid)
2269 { 2267 {
2270 int ret; 2268 int ret;
2271 do { 2269 do {
2272 /* 2270 /*
2273 * attach_proc fails with -EAGAIN if threadgroup leadership 2271 * attach_proc fails with -EAGAIN if threadgroup leadership
2274 * changes in the middle of the operation, in which case we need 2272 * changes in the middle of the operation, in which case we need
2275 * to find the task_struct for the new leader and start over. 2273 * to find the task_struct for the new leader and start over.
2276 */ 2274 */
2277 ret = attach_task_by_pid(cgrp, tgid, true); 2275 ret = attach_task_by_pid(cgrp, tgid, true);
2278 } while (ret == -EAGAIN); 2276 } while (ret == -EAGAIN);
2279 return ret; 2277 return ret;
2280 } 2278 }
2281 2279
2282 /** 2280 /**
2283 * cgroup_lock_live_group - take cgroup_mutex and check that cgrp is alive. 2281 * cgroup_lock_live_group - take cgroup_mutex and check that cgrp is alive.
2284 * @cgrp: the cgroup to be checked for liveness 2282 * @cgrp: the cgroup to be checked for liveness
2285 * 2283 *
2286 * On success, returns true; the lock should be later released with 2284 * On success, returns true; the lock should be later released with
2287 * cgroup_unlock(). On failure returns false with no lock held. 2285 * cgroup_unlock(). On failure returns false with no lock held.
2288 */ 2286 */
2289 bool cgroup_lock_live_group(struct cgroup *cgrp) 2287 bool cgroup_lock_live_group(struct cgroup *cgrp)
2290 { 2288 {
2291 mutex_lock(&cgroup_mutex); 2289 mutex_lock(&cgroup_mutex);
2292 if (cgroup_is_removed(cgrp)) { 2290 if (cgroup_is_removed(cgrp)) {
2293 mutex_unlock(&cgroup_mutex); 2291 mutex_unlock(&cgroup_mutex);
2294 return false; 2292 return false;
2295 } 2293 }
2296 return true; 2294 return true;
2297 } 2295 }
2298 EXPORT_SYMBOL_GPL(cgroup_lock_live_group); 2296 EXPORT_SYMBOL_GPL(cgroup_lock_live_group);
2299 2297
2300 static int cgroup_release_agent_write(struct cgroup *cgrp, struct cftype *cft, 2298 static int cgroup_release_agent_write(struct cgroup *cgrp, struct cftype *cft,
2301 const char *buffer) 2299 const char *buffer)
2302 { 2300 {
2303 BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX); 2301 BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX);
2304 if (strlen(buffer) >= PATH_MAX) 2302 if (strlen(buffer) >= PATH_MAX)
2305 return -EINVAL; 2303 return -EINVAL;
2306 if (!cgroup_lock_live_group(cgrp)) 2304 if (!cgroup_lock_live_group(cgrp))
2307 return -ENODEV; 2305 return -ENODEV;
2308 strcpy(cgrp->root->release_agent_path, buffer); 2306 strcpy(cgrp->root->release_agent_path, buffer);
2309 cgroup_unlock(); 2307 cgroup_unlock();
2310 return 0; 2308 return 0;
2311 } 2309 }
2312 2310
2313 static int cgroup_release_agent_show(struct cgroup *cgrp, struct cftype *cft, 2311 static int cgroup_release_agent_show(struct cgroup *cgrp, struct cftype *cft,
2314 struct seq_file *seq) 2312 struct seq_file *seq)
2315 { 2313 {
2316 if (!cgroup_lock_live_group(cgrp)) 2314 if (!cgroup_lock_live_group(cgrp))
2317 return -ENODEV; 2315 return -ENODEV;
2318 seq_puts(seq, cgrp->root->release_agent_path); 2316 seq_puts(seq, cgrp->root->release_agent_path);
2319 seq_putc(seq, '\n'); 2317 seq_putc(seq, '\n');
2320 cgroup_unlock(); 2318 cgroup_unlock();
2321 return 0; 2319 return 0;
2322 } 2320 }
2323 2321
2324 /* A buffer size big enough for numbers or short strings */ 2322 /* A buffer size big enough for numbers or short strings */
2325 #define CGROUP_LOCAL_BUFFER_SIZE 64 2323 #define CGROUP_LOCAL_BUFFER_SIZE 64
2326 2324
2327 static ssize_t cgroup_write_X64(struct cgroup *cgrp, struct cftype *cft, 2325 static ssize_t cgroup_write_X64(struct cgroup *cgrp, struct cftype *cft,
2328 struct file *file, 2326 struct file *file,
2329 const char __user *userbuf, 2327 const char __user *userbuf,
2330 size_t nbytes, loff_t *unused_ppos) 2328 size_t nbytes, loff_t *unused_ppos)
2331 { 2329 {
2332 char buffer[CGROUP_LOCAL_BUFFER_SIZE]; 2330 char buffer[CGROUP_LOCAL_BUFFER_SIZE];
2333 int retval = 0; 2331 int retval = 0;
2334 char *end; 2332 char *end;
2335 2333
2336 if (!nbytes) 2334 if (!nbytes)
2337 return -EINVAL; 2335 return -EINVAL;
2338 if (nbytes >= sizeof(buffer)) 2336 if (nbytes >= sizeof(buffer))
2339 return -E2BIG; 2337 return -E2BIG;
2340 if (copy_from_user(buffer, userbuf, nbytes)) 2338 if (copy_from_user(buffer, userbuf, nbytes))
2341 return -EFAULT; 2339 return -EFAULT;
2342 2340
2343 buffer[nbytes] = 0; /* nul-terminate */ 2341 buffer[nbytes] = 0; /* nul-terminate */
2344 if (cft->write_u64) { 2342 if (cft->write_u64) {
2345 u64 val = simple_strtoull(strstrip(buffer), &end, 0); 2343 u64 val = simple_strtoull(strstrip(buffer), &end, 0);
2346 if (*end) 2344 if (*end)
2347 return -EINVAL; 2345 return -EINVAL;
2348 retval = cft->write_u64(cgrp, cft, val); 2346 retval = cft->write_u64(cgrp, cft, val);
2349 } else { 2347 } else {
2350 s64 val = simple_strtoll(strstrip(buffer), &end, 0); 2348 s64 val = simple_strtoll(strstrip(buffer), &end, 0);
2351 if (*end) 2349 if (*end)
2352 return -EINVAL; 2350 return -EINVAL;
2353 retval = cft->write_s64(cgrp, cft, val); 2351 retval = cft->write_s64(cgrp, cft, val);
2354 } 2352 }
2355 if (!retval) 2353 if (!retval)
2356 retval = nbytes; 2354 retval = nbytes;
2357 return retval; 2355 return retval;
2358 } 2356 }
2359 2357
2360 static ssize_t cgroup_write_string(struct cgroup *cgrp, struct cftype *cft, 2358 static ssize_t cgroup_write_string(struct cgroup *cgrp, struct cftype *cft,
2361 struct file *file, 2359 struct file *file,
2362 const char __user *userbuf, 2360 const char __user *userbuf,
2363 size_t nbytes, loff_t *unused_ppos) 2361 size_t nbytes, loff_t *unused_ppos)
2364 { 2362 {
2365 char local_buffer[CGROUP_LOCAL_BUFFER_SIZE]; 2363 char local_buffer[CGROUP_LOCAL_BUFFER_SIZE];
2366 int retval = 0; 2364 int retval = 0;
2367 size_t max_bytes = cft->max_write_len; 2365 size_t max_bytes = cft->max_write_len;
2368 char *buffer = local_buffer; 2366 char *buffer = local_buffer;
2369 2367
2370 if (!max_bytes) 2368 if (!max_bytes)
2371 max_bytes = sizeof(local_buffer) - 1; 2369 max_bytes = sizeof(local_buffer) - 1;
2372 if (nbytes >= max_bytes) 2370 if (nbytes >= max_bytes)
2373 return -E2BIG; 2371 return -E2BIG;
2374 /* Allocate a dynamic buffer if we need one */ 2372 /* Allocate a dynamic buffer if we need one */
2375 if (nbytes >= sizeof(local_buffer)) { 2373 if (nbytes >= sizeof(local_buffer)) {
2376 buffer = kmalloc(nbytes + 1, GFP_KERNEL); 2374 buffer = kmalloc(nbytes + 1, GFP_KERNEL);
2377 if (buffer == NULL) 2375 if (buffer == NULL)
2378 return -ENOMEM; 2376 return -ENOMEM;
2379 } 2377 }
2380 if (nbytes && copy_from_user(buffer, userbuf, nbytes)) { 2378 if (nbytes && copy_from_user(buffer, userbuf, nbytes)) {
2381 retval = -EFAULT; 2379 retval = -EFAULT;
2382 goto out; 2380 goto out;
2383 } 2381 }
2384 2382
2385 buffer[nbytes] = 0; /* nul-terminate */ 2383 buffer[nbytes] = 0; /* nul-terminate */
2386 retval = cft->write_string(cgrp, cft, strstrip(buffer)); 2384 retval = cft->write_string(cgrp, cft, strstrip(buffer));
2387 if (!retval) 2385 if (!retval)
2388 retval = nbytes; 2386 retval = nbytes;
2389 out: 2387 out:
2390 if (buffer != local_buffer) 2388 if (buffer != local_buffer)
2391 kfree(buffer); 2389 kfree(buffer);
2392 return retval; 2390 return retval;
2393 } 2391 }
2394 2392
2395 static ssize_t cgroup_file_write(struct file *file, const char __user *buf, 2393 static ssize_t cgroup_file_write(struct file *file, const char __user *buf,
2396 size_t nbytes, loff_t *ppos) 2394 size_t nbytes, loff_t *ppos)
2397 { 2395 {
2398 struct cftype *cft = __d_cft(file->f_dentry); 2396 struct cftype *cft = __d_cft(file->f_dentry);
2399 struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent); 2397 struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);
2400 2398
2401 if (cgroup_is_removed(cgrp)) 2399 if (cgroup_is_removed(cgrp))
2402 return -ENODEV; 2400 return -ENODEV;
2403 if (cft->write) 2401 if (cft->write)
2404 return cft->write(cgrp, cft, file, buf, nbytes, ppos); 2402 return cft->write(cgrp, cft, file, buf, nbytes, ppos);
2405 if (cft->write_u64 || cft->write_s64) 2403 if (cft->write_u64 || cft->write_s64)
2406 return cgroup_write_X64(cgrp, cft, file, buf, nbytes, ppos); 2404 return cgroup_write_X64(cgrp, cft, file, buf, nbytes, ppos);
2407 if (cft->write_string) 2405 if (cft->write_string)
2408 return cgroup_write_string(cgrp, cft, file, buf, nbytes, ppos); 2406 return cgroup_write_string(cgrp, cft, file, buf, nbytes, ppos);
2409 if (cft->trigger) { 2407 if (cft->trigger) {
2410 int ret = cft->trigger(cgrp, (unsigned int)cft->private); 2408 int ret = cft->trigger(cgrp, (unsigned int)cft->private);
2411 return ret ? ret : nbytes; 2409 return ret ? ret : nbytes;
2412 } 2410 }
2413 return -EINVAL; 2411 return -EINVAL;
2414 } 2412 }
2415 2413
2416 static ssize_t cgroup_read_u64(struct cgroup *cgrp, struct cftype *cft, 2414 static ssize_t cgroup_read_u64(struct cgroup *cgrp, struct cftype *cft,
2417 struct file *file, 2415 struct file *file,
2418 char __user *buf, size_t nbytes, 2416 char __user *buf, size_t nbytes,
2419 loff_t *ppos) 2417 loff_t *ppos)
2420 { 2418 {
2421 char tmp[CGROUP_LOCAL_BUFFER_SIZE]; 2419 char tmp[CGROUP_LOCAL_BUFFER_SIZE];
2422 u64 val = cft->read_u64(cgrp, cft); 2420 u64 val = cft->read_u64(cgrp, cft);
2423 int len = sprintf(tmp, "%llu\n", (unsigned long long) val); 2421 int len = sprintf(tmp, "%llu\n", (unsigned long long) val);
2424 2422
2425 return simple_read_from_buffer(buf, nbytes, ppos, tmp, len); 2423 return simple_read_from_buffer(buf, nbytes, ppos, tmp, len);
2426 } 2424 }
2427 2425
2428 static ssize_t cgroup_read_s64(struct cgroup *cgrp, struct cftype *cft, 2426 static ssize_t cgroup_read_s64(struct cgroup *cgrp, struct cftype *cft,
2429 struct file *file, 2427 struct file *file,
2430 char __user *buf, size_t nbytes, 2428 char __user *buf, size_t nbytes,
2431 loff_t *ppos) 2429 loff_t *ppos)
2432 { 2430 {
2433 char tmp[CGROUP_LOCAL_BUFFER_SIZE]; 2431 char tmp[CGROUP_LOCAL_BUFFER_SIZE];
2434 s64 val = cft->read_s64(cgrp, cft); 2432 s64 val = cft->read_s64(cgrp, cft);
2435 int len = sprintf(tmp, "%lld\n", (long long) val); 2433 int len = sprintf(tmp, "%lld\n", (long long) val);
2436 2434
2437 return simple_read_from_buffer(buf, nbytes, ppos, tmp, len); 2435 return simple_read_from_buffer(buf, nbytes, ppos, tmp, len);
2438 } 2436 }
2439 2437
2440 static ssize_t cgroup_file_read(struct file *file, char __user *buf, 2438 static ssize_t cgroup_file_read(struct file *file, char __user *buf,
2441 size_t nbytes, loff_t *ppos) 2439 size_t nbytes, loff_t *ppos)
2442 { 2440 {
2443 struct cftype *cft = __d_cft(file->f_dentry); 2441 struct cftype *cft = __d_cft(file->f_dentry);
2444 struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent); 2442 struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);
2445 2443
2446 if (cgroup_is_removed(cgrp)) 2444 if (cgroup_is_removed(cgrp))
2447 return -ENODEV; 2445 return -ENODEV;
2448 2446
2449 if (cft->read) 2447 if (cft->read)
2450 return cft->read(cgrp, cft, file, buf, nbytes, ppos); 2448 return cft->read(cgrp, cft, file, buf, nbytes, ppos);
2451 if (cft->read_u64) 2449 if (cft->read_u64)
2452 return cgroup_read_u64(cgrp, cft, file, buf, nbytes, ppos); 2450 return cgroup_read_u64(cgrp, cft, file, buf, nbytes, ppos);
2453 if (cft->read_s64) 2451 if (cft->read_s64)
2454 return cgroup_read_s64(cgrp, cft, file, buf, nbytes, ppos); 2452 return cgroup_read_s64(cgrp, cft, file, buf, nbytes, ppos);
2455 return -EINVAL; 2453 return -EINVAL;
2456 } 2454 }
2457 2455
2458 /* 2456 /*
2459 * seqfile ops/methods for returning structured data. Currently just 2457 * seqfile ops/methods for returning structured data. Currently just
2460 * supports string->u64 maps, but can be extended in future. 2458 * supports string->u64 maps, but can be extended in future.
2461 */ 2459 */
2462 2460
2463 struct cgroup_seqfile_state { 2461 struct cgroup_seqfile_state {
2464 struct cftype *cft; 2462 struct cftype *cft;
2465 struct cgroup *cgroup; 2463 struct cgroup *cgroup;
2466 }; 2464 };
2467 2465
2468 static int cgroup_map_add(struct cgroup_map_cb *cb, const char *key, u64 value) 2466 static int cgroup_map_add(struct cgroup_map_cb *cb, const char *key, u64 value)
2469 { 2467 {
2470 struct seq_file *sf = cb->state; 2468 struct seq_file *sf = cb->state;
2471 return seq_printf(sf, "%s %llu\n", key, (unsigned long long)value); 2469 return seq_printf(sf, "%s %llu\n", key, (unsigned long long)value);
2472 } 2470 }
2473 2471
2474 static int cgroup_seqfile_show(struct seq_file *m, void *arg) 2472 static int cgroup_seqfile_show(struct seq_file *m, void *arg)
2475 { 2473 {
2476 struct cgroup_seqfile_state *state = m->private; 2474 struct cgroup_seqfile_state *state = m->private;
2477 struct cftype *cft = state->cft; 2475 struct cftype *cft = state->cft;
2478 if (cft->read_map) { 2476 if (cft->read_map) {
2479 struct cgroup_map_cb cb = { 2477 struct cgroup_map_cb cb = {
2480 .fill = cgroup_map_add, 2478 .fill = cgroup_map_add,
2481 .state = m, 2479 .state = m,
2482 }; 2480 };
2483 return cft->read_map(state->cgroup, cft, &cb); 2481 return cft->read_map(state->cgroup, cft, &cb);
2484 } 2482 }
2485 return cft->read_seq_string(state->cgroup, cft, m); 2483 return cft->read_seq_string(state->cgroup, cft, m);
2486 } 2484 }
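cgroup_seqfile_show() above is what backs the read_map interface: the handler is given a cgroup_map_cb whose ->fill() callback (cgroup_map_add() here) emits one "key value" line per call. A sketch of a subsystem-side read_map handler written against that contract (the foo_* name and the values are hypothetical):

	static int foo_stat_read_map(struct cgroup *cgrp, struct cftype *cft,
				     struct cgroup_map_cb *cb)
	{
		/* Each fill() call becomes one "key value" line in the file. */
		cb->fill(cb, "hits", 42);
		cb->fill(cb, "misses", 7);
		return 0;
	}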
2487 2485
2488 static int cgroup_seqfile_release(struct inode *inode, struct file *file) 2486 static int cgroup_seqfile_release(struct inode *inode, struct file *file)
2489 { 2487 {
2490 struct seq_file *seq = file->private_data; 2488 struct seq_file *seq = file->private_data;
2491 kfree(seq->private); 2489 kfree(seq->private);
2492 return single_release(inode, file); 2490 return single_release(inode, file);
2493 } 2491 }
2494 2492
2495 static const struct file_operations cgroup_seqfile_operations = { 2493 static const struct file_operations cgroup_seqfile_operations = {
2496 .read = seq_read, 2494 .read = seq_read,
2497 .write = cgroup_file_write, 2495 .write = cgroup_file_write,
2498 .llseek = seq_lseek, 2496 .llseek = seq_lseek,
2499 .release = cgroup_seqfile_release, 2497 .release = cgroup_seqfile_release,
2500 }; 2498 };
2501 2499
2502 static int cgroup_file_open(struct inode *inode, struct file *file) 2500 static int cgroup_file_open(struct inode *inode, struct file *file)
2503 { 2501 {
2504 int err; 2502 int err;
2505 struct cftype *cft; 2503 struct cftype *cft;
2506 2504
2507 err = generic_file_open(inode, file); 2505 err = generic_file_open(inode, file);
2508 if (err) 2506 if (err)
2509 return err; 2507 return err;
2510 cft = __d_cft(file->f_dentry); 2508 cft = __d_cft(file->f_dentry);
2511 2509
2512 if (cft->read_map || cft->read_seq_string) { 2510 if (cft->read_map || cft->read_seq_string) {
2513 struct cgroup_seqfile_state *state = 2511 struct cgroup_seqfile_state *state =
2514 kzalloc(sizeof(*state), GFP_USER); 2512 kzalloc(sizeof(*state), GFP_USER);
2515 if (!state) 2513 if (!state)
2516 return -ENOMEM; 2514 return -ENOMEM;
2517 state->cft = cft; 2515 state->cft = cft;
2518 state->cgroup = __d_cgrp(file->f_dentry->d_parent); 2516 state->cgroup = __d_cgrp(file->f_dentry->d_parent);
2519 file->f_op = &cgroup_seqfile_operations; 2517 file->f_op = &cgroup_seqfile_operations;
2520 err = single_open(file, cgroup_seqfile_show, state); 2518 err = single_open(file, cgroup_seqfile_show, state);
2521 if (err < 0) 2519 if (err < 0)
2522 kfree(state); 2520 kfree(state);
2523 } else if (cft->open) 2521 } else if (cft->open)
2524 err = cft->open(inode, file); 2522 err = cft->open(inode, file);
2525 else 2523 else
2526 err = 0; 2524 err = 0;
2527 2525
2528 return err; 2526 return err;
2529 } 2527 }
2530 2528
2531 static int cgroup_file_release(struct inode *inode, struct file *file) 2529 static int cgroup_file_release(struct inode *inode, struct file *file)
2532 { 2530 {
2533 struct cftype *cft = __d_cft(file->f_dentry); 2531 struct cftype *cft = __d_cft(file->f_dentry);
2534 if (cft->release) 2532 if (cft->release)
2535 return cft->release(inode, file); 2533 return cft->release(inode, file);
2536 return 0; 2534 return 0;
2537 } 2535 }
2538 2536
2539 /* 2537 /*
2540 * cgroup_rename - Only allow simple rename of directories in place. 2538 * cgroup_rename - Only allow simple rename of directories in place.
2541 */ 2539 */
2542 static int cgroup_rename(struct inode *old_dir, struct dentry *old_dentry, 2540 static int cgroup_rename(struct inode *old_dir, struct dentry *old_dentry,
2543 struct inode *new_dir, struct dentry *new_dentry) 2541 struct inode *new_dir, struct dentry *new_dentry)
2544 { 2542 {
2545 if (!S_ISDIR(old_dentry->d_inode->i_mode)) 2543 if (!S_ISDIR(old_dentry->d_inode->i_mode))
2546 return -ENOTDIR; 2544 return -ENOTDIR;
2547 if (new_dentry->d_inode) 2545 if (new_dentry->d_inode)
2548 return -EEXIST; 2546 return -EEXIST;
2549 if (old_dir != new_dir) 2547 if (old_dir != new_dir)
2550 return -EIO; 2548 return -EIO;
2551 return simple_rename(old_dir, old_dentry, new_dir, new_dentry); 2549 return simple_rename(old_dir, old_dentry, new_dir, new_dentry);
2552 } 2550 }
2553 2551
2554 static const struct file_operations cgroup_file_operations = { 2552 static const struct file_operations cgroup_file_operations = {
2555 .read = cgroup_file_read, 2553 .read = cgroup_file_read,
2556 .write = cgroup_file_write, 2554 .write = cgroup_file_write,
2557 .llseek = generic_file_llseek, 2555 .llseek = generic_file_llseek,
2558 .open = cgroup_file_open, 2556 .open = cgroup_file_open,
2559 .release = cgroup_file_release, 2557 .release = cgroup_file_release,
2560 }; 2558 };
2561 2559
2562 static const struct inode_operations cgroup_dir_inode_operations = { 2560 static const struct inode_operations cgroup_dir_inode_operations = {
2563 .lookup = cgroup_lookup, 2561 .lookup = cgroup_lookup,
2564 .mkdir = cgroup_mkdir, 2562 .mkdir = cgroup_mkdir,
2565 .rmdir = cgroup_rmdir, 2563 .rmdir = cgroup_rmdir,
2566 .rename = cgroup_rename, 2564 .rename = cgroup_rename,
2567 }; 2565 };
2568 2566
2569 static struct dentry *cgroup_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd) 2567 static struct dentry *cgroup_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
2570 { 2568 {
2571 if (dentry->d_name.len > NAME_MAX) 2569 if (dentry->d_name.len > NAME_MAX)
2572 return ERR_PTR(-ENAMETOOLONG); 2570 return ERR_PTR(-ENAMETOOLONG);
2573 d_add(dentry, NULL); 2571 d_add(dentry, NULL);
2574 return NULL; 2572 return NULL;
2575 } 2573 }
2576 2574
2577 /* 2575 /*
2578 * Check if a file is a control file 2576 * Check if a file is a control file
2579 */ 2577 */
2580 static inline struct cftype *__file_cft(struct file *file) 2578 static inline struct cftype *__file_cft(struct file *file)
2581 { 2579 {
2582 if (file->f_dentry->d_inode->i_fop != &cgroup_file_operations) 2580 if (file->f_dentry->d_inode->i_fop != &cgroup_file_operations)
2583 return ERR_PTR(-EINVAL); 2581 return ERR_PTR(-EINVAL);
2584 return __d_cft(file->f_dentry); 2582 return __d_cft(file->f_dentry);
2585 } 2583 }
2586 2584
2587 static int cgroup_create_file(struct dentry *dentry, mode_t mode, 2585 static int cgroup_create_file(struct dentry *dentry, mode_t mode,
2588 struct super_block *sb) 2586 struct super_block *sb)
2589 { 2587 {
2590 struct inode *inode; 2588 struct inode *inode;
2591 2589
2592 if (!dentry) 2590 if (!dentry)
2593 return -ENOENT; 2591 return -ENOENT;
2594 if (dentry->d_inode) 2592 if (dentry->d_inode)
2595 return -EEXIST; 2593 return -EEXIST;
2596 2594
2597 inode = cgroup_new_inode(mode, sb); 2595 inode = cgroup_new_inode(mode, sb);
2598 if (!inode) 2596 if (!inode)
2599 return -ENOMEM; 2597 return -ENOMEM;
2600 2598
2601 if (S_ISDIR(mode)) { 2599 if (S_ISDIR(mode)) {
2602 inode->i_op = &cgroup_dir_inode_operations; 2600 inode->i_op = &cgroup_dir_inode_operations;
2603 inode->i_fop = &simple_dir_operations; 2601 inode->i_fop = &simple_dir_operations;
2604 2602
2605 /* start off with i_nlink == 2 (for "." entry) */ 2603 /* start off with i_nlink == 2 (for "." entry) */
2606 inc_nlink(inode); 2604 inc_nlink(inode);
2607 2605
2608 /* start with the directory inode held, so that we can 2606 /* start with the directory inode held, so that we can
2609 * populate it without racing with another mkdir */ 2607 * populate it without racing with another mkdir */
2610 mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD); 2608 mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD);
2611 } else if (S_ISREG(mode)) { 2609 } else if (S_ISREG(mode)) {
2612 inode->i_size = 0; 2610 inode->i_size = 0;
2613 inode->i_fop = &cgroup_file_operations; 2611 inode->i_fop = &cgroup_file_operations;
2614 } 2612 }
2615 d_instantiate(dentry, inode); 2613 d_instantiate(dentry, inode);
2616 dget(dentry); /* Extra count - pin the dentry in core */ 2614 dget(dentry); /* Extra count - pin the dentry in core */
2617 return 0; 2615 return 0;
2618 } 2616 }
2619 2617
2620 /* 2618 /*
2621 * cgroup_create_dir - create a directory for an object. 2619 * cgroup_create_dir - create a directory for an object.
2622 * @cgrp: the cgroup we create the directory for. It must have a valid 2620 * @cgrp: the cgroup we create the directory for. It must have a valid
2623 * ->parent field. And we are going to fill its ->dentry field. 2621 * ->parent field. And we are going to fill its ->dentry field.
2624 * @dentry: dentry of the new cgroup 2622 * @dentry: dentry of the new cgroup
2625 * @mode: mode to set on new directory. 2623 * @mode: mode to set on new directory.
2626 */ 2624 */
2627 static int cgroup_create_dir(struct cgroup *cgrp, struct dentry *dentry, 2625 static int cgroup_create_dir(struct cgroup *cgrp, struct dentry *dentry,
2628 mode_t mode) 2626 mode_t mode)
2629 { 2627 {
2630 struct dentry *parent; 2628 struct dentry *parent;
2631 int error = 0; 2629 int error = 0;
2632 2630
2633 parent = cgrp->parent->dentry; 2631 parent = cgrp->parent->dentry;
2634 error = cgroup_create_file(dentry, S_IFDIR | mode, cgrp->root->sb); 2632 error = cgroup_create_file(dentry, S_IFDIR | mode, cgrp->root->sb);
2635 if (!error) { 2633 if (!error) {
2636 dentry->d_fsdata = cgrp; 2634 dentry->d_fsdata = cgrp;
2637 inc_nlink(parent->d_inode); 2635 inc_nlink(parent->d_inode);
2638 rcu_assign_pointer(cgrp->dentry, dentry); 2636 rcu_assign_pointer(cgrp->dentry, dentry);
2639 dget(dentry); 2637 dget(dentry);
2640 } 2638 }
2641 dput(dentry); 2639 dput(dentry);
2642 2640
2643 return error; 2641 return error;
2644 } 2642 }
2645 2643
2646 /** 2644 /**
2647 * cgroup_file_mode - deduce file mode of a control file 2645 * cgroup_file_mode - deduce file mode of a control file
2648 * @cft: the control file in question 2646 * @cft: the control file in question
2649 * 2647 *
2650 * returns cft->mode if ->mode is not 0 2648 * returns cft->mode if ->mode is not 0
2651 * returns S_IRUGO|S_IWUSR if it has both a read and a write handler 2649 * returns S_IRUGO|S_IWUSR if it has both a read and a write handler
2652 * returns S_IRUGO if it has only a read handler 2650 * returns S_IRUGO if it has only a read handler
2653 * returns S_IWUSR if it has only a write handler 2651 * returns S_IWUSR if it has only a write handler
2654 */ 2652 */
2655 static mode_t cgroup_file_mode(const struct cftype *cft) 2653 static mode_t cgroup_file_mode(const struct cftype *cft)
2656 { 2654 {
2657 mode_t mode = 0; 2655 mode_t mode = 0;
2658 2656
2659 if (cft->mode) 2657 if (cft->mode)
2660 return cft->mode; 2658 return cft->mode;
2661 2659
2662 if (cft->read || cft->read_u64 || cft->read_s64 || 2660 if (cft->read || cft->read_u64 || cft->read_s64 ||
2663 cft->read_map || cft->read_seq_string) 2661 cft->read_map || cft->read_seq_string)
2664 mode |= S_IRUGO; 2662 mode |= S_IRUGO;
2665 2663
2666 if (cft->write || cft->write_u64 || cft->write_s64 || 2664 if (cft->write || cft->write_u64 || cft->write_s64 ||
2667 cft->write_string || cft->trigger) 2665 cft->write_string || cft->trigger)
2668 mode |= S_IWUSR; 2666 mode |= S_IWUSR;
2669 2667
2670 return mode; 2668 return mode;
2671 } 2669 }
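Worked example of the rule above: a control file with only a read handler ends up 0444, only a write handler 0200, and both 0644. A standalone userspace sketch of the same deduction (S_IRUGO is redefined locally because it is a kernel-only macro, and the toy struct merely stands in for struct cftype):

#include <stdio.h>
#include <sys/stat.h>

#define S_IRUGO	(S_IRUSR | S_IRGRP | S_IROTH)	/* 0444, as in the kernel */

struct toy_cft {			/* stand-in: only what the rule inspects */
	mode_t mode;
	int has_read;			/* any read/read_u64/read_s64/... handler */
	int has_write;			/* any write/write_u64/.../trigger handler */
};

static mode_t toy_file_mode(const struct toy_cft *cft)
{
	mode_t mode = 0;

	if (cft->mode)
		return cft->mode;	/* an explicit mode always wins */
	if (cft->has_read)
		mode |= S_IRUGO;
	if (cft->has_write)
		mode |= S_IWUSR;
	return mode;
}

int main(void)
{
	struct toy_cft ro = { 0, 1, 0 }, wo = { 0, 0, 1 }, rw = { 0, 1, 1 };

	printf("%o %o %o\n", (unsigned)toy_file_mode(&ro),
	       (unsigned)toy_file_mode(&wo), (unsigned)toy_file_mode(&rw));
	return 0;			/* prints: 444 200 644 */
}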
2672 2670
2673 int cgroup_add_file(struct cgroup *cgrp, 2671 int cgroup_add_file(struct cgroup *cgrp,
2674 struct cgroup_subsys *subsys, 2672 struct cgroup_subsys *subsys,
2675 const struct cftype *cft) 2673 const struct cftype *cft)
2676 { 2674 {
2677 struct dentry *dir = cgrp->dentry; 2675 struct dentry *dir = cgrp->dentry;
2678 struct dentry *dentry; 2676 struct dentry *dentry;
2679 int error; 2677 int error;
2680 mode_t mode; 2678 mode_t mode;
2681 2679
2682 char name[MAX_CGROUP_TYPE_NAMELEN + MAX_CFTYPE_NAME + 2] = { 0 }; 2680 char name[MAX_CGROUP_TYPE_NAMELEN + MAX_CFTYPE_NAME + 2] = { 0 };
2683 if (subsys && !test_bit(ROOT_NOPREFIX, &cgrp->root->flags)) { 2681 if (subsys && !test_bit(ROOT_NOPREFIX, &cgrp->root->flags)) {
2684 strcpy(name, subsys->name); 2682 strcpy(name, subsys->name);
2685 strcat(name, "."); 2683 strcat(name, ".");
2686 } 2684 }
2687 strcat(name, cft->name); 2685 strcat(name, cft->name);
2688 BUG_ON(!mutex_is_locked(&dir->d_inode->i_mutex)); 2686 BUG_ON(!mutex_is_locked(&dir->d_inode->i_mutex));
2689 dentry = lookup_one_len(name, dir, strlen(name)); 2687 dentry = lookup_one_len(name, dir, strlen(name));
2690 if (!IS_ERR(dentry)) { 2688 if (!IS_ERR(dentry)) {
2691 mode = cgroup_file_mode(cft); 2689 mode = cgroup_file_mode(cft);
2692 error = cgroup_create_file(dentry, mode | S_IFREG, 2690 error = cgroup_create_file(dentry, mode | S_IFREG,
2693 cgrp->root->sb); 2691 cgrp->root->sb);
2694 if (!error) 2692 if (!error)
2695 dentry->d_fsdata = (void *)cft; 2693 dentry->d_fsdata = (void *)cft;
2696 dput(dentry); 2694 dput(dentry);
2697 } else 2695 } else
2698 error = PTR_ERR(dentry); 2696 error = PTR_ERR(dentry);
2699 return error; 2697 return error;
2700 } 2698 }
2701 EXPORT_SYMBOL_GPL(cgroup_add_file); 2699 EXPORT_SYMBOL_GPL(cgroup_add_file);
2702 2700
2703 int cgroup_add_files(struct cgroup *cgrp, 2701 int cgroup_add_files(struct cgroup *cgrp,
2704 struct cgroup_subsys *subsys, 2702 struct cgroup_subsys *subsys,
2705 const struct cftype cft[], 2703 const struct cftype cft[],
2706 int count) 2704 int count)
2707 { 2705 {
2708 int i, err; 2706 int i, err;
2709 for (i = 0; i < count; i++) { 2707 for (i = 0; i < count; i++) {
2710 err = cgroup_add_file(cgrp, subsys, &cft[i]); 2708 err = cgroup_add_file(cgrp, subsys, &cft[i]);
2711 if (err) 2709 if (err)
2712 return err; 2710 return err;
2713 } 2711 }
2714 return 0; 2712 return 0;
2715 } 2713 }
2716 EXPORT_SYMBOL_GPL(cgroup_add_files); 2714 EXPORT_SYMBOL_GPL(cgroup_add_files);
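A minimal sketch of how a controller might use the two helpers above to register its control files, for instance from its populate hook. The "foo" subsystem, foo_weight_read() and foo_populate() are hypothetical names for illustration, not part of this patch; the read_u64 signature matches the common handlers later in this file:

/* hypothetical controller code, not part of this patch */
static u64 foo_weight_read(struct cgroup *cgrp, struct cftype *cft)
{
	return 0;			/* placeholder value */
}

static const struct cftype foo_files[] = {
	{
		.name = "weight",	/* appears as "foo.weight" unless noprefix */
		.read_u64 = foo_weight_read,
	},
};

static int foo_populate(struct cgroup_subsys *ss, struct cgroup *cgrp)
{
	return cgroup_add_files(cgrp, ss, foo_files, ARRAY_SIZE(foo_files));
}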
2717 2715
2718 /** 2716 /**
2719 * cgroup_task_count - count the number of tasks in a cgroup. 2717 * cgroup_task_count - count the number of tasks in a cgroup.
2720 * @cgrp: the cgroup in question 2718 * @cgrp: the cgroup in question
2721 * 2719 *
2722 * Return the number of tasks in the cgroup. 2720 * Return the number of tasks in the cgroup.
2723 */ 2721 */
2724 int cgroup_task_count(const struct cgroup *cgrp) 2722 int cgroup_task_count(const struct cgroup *cgrp)
2725 { 2723 {
2726 int count = 0; 2724 int count = 0;
2727 struct cg_cgroup_link *link; 2725 struct cg_cgroup_link *link;
2728 2726
2729 read_lock(&css_set_lock); 2727 read_lock(&css_set_lock);
2730 list_for_each_entry(link, &cgrp->css_sets, cgrp_link_list) { 2728 list_for_each_entry(link, &cgrp->css_sets, cgrp_link_list) {
2731 count += atomic_read(&link->cg->refcount); 2729 count += atomic_read(&link->cg->refcount);
2732 } 2730 }
2733 read_unlock(&css_set_lock); 2731 read_unlock(&css_set_lock);
2734 return count; 2732 return count;
2735 } 2733 }
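The count works because every css_set linked to the cgroup is shared by refcount tasks, so summing the refcounts over the cgroup's links yields the total. A toy standalone illustration of that sum (userspace C, invented numbers):

#include <stdio.h>

int main(void)
{
	/* three css_sets attached to one cgroup, used by 3, 5 and 1 task(s) */
	int refcount[] = { 3, 5, 1 };
	int i, count = 0;

	for (i = 0; i < 3; i++)
		count += refcount[i];
	printf("cgroup_task_count() would report %d\n", count);	/* 9 */
	return 0;
}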
2736 2734
2737 /* 2735 /*
2738 * Advance a list_head iterator. The iterator should be positioned at 2736 * Advance a list_head iterator. The iterator should be positioned at
2739 * the start of a css_set 2737 * the start of a css_set
2740 */ 2738 */
2741 static void cgroup_advance_iter(struct cgroup *cgrp, 2739 static void cgroup_advance_iter(struct cgroup *cgrp,
2742 struct cgroup_iter *it) 2740 struct cgroup_iter *it)
2743 { 2741 {
2744 struct list_head *l = it->cg_link; 2742 struct list_head *l = it->cg_link;
2745 struct cg_cgroup_link *link; 2743 struct cg_cgroup_link *link;
2746 struct css_set *cg; 2744 struct css_set *cg;
2747 2745
2748 /* Advance to the next non-empty css_set */ 2746 /* Advance to the next non-empty css_set */
2749 do { 2747 do {
2750 l = l->next; 2748 l = l->next;
2751 if (l == &cgrp->css_sets) { 2749 if (l == &cgrp->css_sets) {
2752 it->cg_link = NULL; 2750 it->cg_link = NULL;
2753 return; 2751 return;
2754 } 2752 }
2755 link = list_entry(l, struct cg_cgroup_link, cgrp_link_list); 2753 link = list_entry(l, struct cg_cgroup_link, cgrp_link_list);
2756 cg = link->cg; 2754 cg = link->cg;
2757 } while (list_empty(&cg->tasks)); 2755 } while (list_empty(&cg->tasks));
2758 it->cg_link = l; 2756 it->cg_link = l;
2759 it->task = cg->tasks.next; 2757 it->task = cg->tasks.next;
2760 } 2758 }
2761 2759
2762 /* 2760 /*
2763 * To reduce the fork() overhead for systems that are not actually 2761 * To reduce the fork() overhead for systems that are not actually
2764 * using their cgroups capability, we don't maintain the lists running 2762 * using their cgroups capability, we don't maintain the lists running
2765 * through each css_set to its tasks until we see the list actually 2763 * through each css_set to its tasks until we see the list actually
2766 * used - in other words after the first call to cgroup_iter_start(). 2764 * used - in other words after the first call to cgroup_iter_start().
2767 * 2765 *
2768 * The tasklist_lock is not held here, as do_each_thread() and 2766 * The tasklist_lock is not held here, as do_each_thread() and
2769 * while_each_thread() are protected by RCU. 2767 * while_each_thread() are protected by RCU.
2770 */ 2768 */
2771 static void cgroup_enable_task_cg_lists(void) 2769 static void cgroup_enable_task_cg_lists(void)
2772 { 2770 {
2773 struct task_struct *p, *g; 2771 struct task_struct *p, *g;
2774 write_lock(&css_set_lock); 2772 write_lock(&css_set_lock);
2775 use_task_css_set_links = 1; 2773 use_task_css_set_links = 1;
2776 do_each_thread(g, p) { 2774 do_each_thread(g, p) {
2777 task_lock(p); 2775 task_lock(p);
2778 /* 2776 /*
2779 * We should check if the process is exiting, otherwise 2777 * We should check if the process is exiting, otherwise
2780 * it will race with cgroup_exit() in that the list 2778 * it will race with cgroup_exit() in that the list
2781 * entry won't be deleted though the process has exited. 2779 * entry won't be deleted though the process has exited.
2782 */ 2780 */
2783 if (!(p->flags & PF_EXITING) && list_empty(&p->cg_list)) 2781 if (!(p->flags & PF_EXITING) && list_empty(&p->cg_list))
2784 list_add(&p->cg_list, &p->cgroups->tasks); 2782 list_add(&p->cg_list, &p->cgroups->tasks);
2785 task_unlock(p); 2783 task_unlock(p);
2786 } while_each_thread(g, p); 2784 } while_each_thread(g, p);
2787 write_unlock(&css_set_lock); 2785 write_unlock(&css_set_lock);
2788 } 2786 }
2789 2787
2790 void cgroup_iter_start(struct cgroup *cgrp, struct cgroup_iter *it) 2788 void cgroup_iter_start(struct cgroup *cgrp, struct cgroup_iter *it)
2791 { 2789 {
2792 /* 2790 /*
2793 * The first time anyone tries to iterate across a cgroup, 2791 * The first time anyone tries to iterate across a cgroup,
2794 * we need to enable the list linking each css_set to its 2792 * we need to enable the list linking each css_set to its
2795 * tasks, and fix up all existing tasks. 2793 * tasks, and fix up all existing tasks.
2796 */ 2794 */
2797 if (!use_task_css_set_links) 2795 if (!use_task_css_set_links)
2798 cgroup_enable_task_cg_lists(); 2796 cgroup_enable_task_cg_lists();
2799 2797
2800 read_lock(&css_set_lock); 2798 read_lock(&css_set_lock);
2801 it->cg_link = &cgrp->css_sets; 2799 it->cg_link = &cgrp->css_sets;
2802 cgroup_advance_iter(cgrp, it); 2800 cgroup_advance_iter(cgrp, it);
2803 } 2801 }
2804 2802
2805 struct task_struct *cgroup_iter_next(struct cgroup *cgrp, 2803 struct task_struct *cgroup_iter_next(struct cgroup *cgrp,
2806 struct cgroup_iter *it) 2804 struct cgroup_iter *it)
2807 { 2805 {
2808 struct task_struct *res; 2806 struct task_struct *res;
2809 struct list_head *l = it->task; 2807 struct list_head *l = it->task;
2810 struct cg_cgroup_link *link; 2808 struct cg_cgroup_link *link;
2811 2809
2812 /* If the iterator cg is NULL, we have no tasks */ 2810 /* If the iterator cg is NULL, we have no tasks */
2813 if (!it->cg_link) 2811 if (!it->cg_link)
2814 return NULL; 2812 return NULL;
2815 res = list_entry(l, struct task_struct, cg_list); 2813 res = list_entry(l, struct task_struct, cg_list);
2816 /* Advance iterator to find next entry */ 2814 /* Advance iterator to find next entry */
2817 l = l->next; 2815 l = l->next;
2818 link = list_entry(it->cg_link, struct cg_cgroup_link, cgrp_link_list); 2816 link = list_entry(it->cg_link, struct cg_cgroup_link, cgrp_link_list);
2819 if (l == &link->cg->tasks) { 2817 if (l == &link->cg->tasks) {
2820 /* We reached the end of this task list - move on to 2818 /* We reached the end of this task list - move on to
2821 * the next cg_cgroup_link */ 2819 * the next cg_cgroup_link */
2822 cgroup_advance_iter(cgrp, it); 2820 cgroup_advance_iter(cgrp, it);
2823 } else { 2821 } else {
2824 it->task = l; 2822 it->task = l;
2825 } 2823 }
2826 return res; 2824 return res;
2827 } 2825 }
2828 2826
2829 void cgroup_iter_end(struct cgroup *cgrp, struct cgroup_iter *it) 2827 void cgroup_iter_end(struct cgroup *cgrp, struct cgroup_iter *it)
2830 { 2828 {
2831 read_unlock(&css_set_lock); 2829 read_unlock(&css_set_lock);
2832 } 2830 }
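The three functions above form the task-iteration API used by the callers later in this file; the canonical loop is sketched below. Because css_set_lock is read-held between _start and _end, the loop body must not sleep. do_something_with() is a placeholder:

/* illustrative pattern only; do_something_with() is a placeholder */
static void example_walk(struct cgroup *cgrp)
{
	struct cgroup_iter it;
	struct task_struct *p;

	cgroup_iter_start(cgrp, &it);
	while ((p = cgroup_iter_next(cgrp, &it)))
		do_something_with(p);		/* must not sleep */
	cgroup_iter_end(cgrp, &it);
}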
2833 2831
2834 static inline int started_after_time(struct task_struct *t1, 2832 static inline int started_after_time(struct task_struct *t1,
2835 struct timespec *time, 2833 struct timespec *time,
2836 struct task_struct *t2) 2834 struct task_struct *t2)
2837 { 2835 {
2838 int start_diff = timespec_compare(&t1->start_time, time); 2836 int start_diff = timespec_compare(&t1->start_time, time);
2839 if (start_diff > 0) { 2837 if (start_diff > 0) {
2840 return 1; 2838 return 1;
2841 } else if (start_diff < 0) { 2839 } else if (start_diff < 0) {
2842 return 0; 2840 return 0;
2843 } else { 2841 } else {
2844 /* 2842 /*
2845 * Arbitrarily, if two processes started at the same 2843 * Arbitrarily, if two processes started at the same
2846 * time, we'll say that the lower pointer value 2844 * time, we'll say that the lower pointer value
2847 * started first. Note that t2 may have exited by now 2845 * started first. Note that t2 may have exited by now
2848 * so this may not be a valid pointer any longer, but 2846 * so this may not be a valid pointer any longer, but
2849 * that's fine - it still serves to distinguish 2847 * that's fine - it still serves to distinguish
2850 * between two tasks started (effectively) simultaneously. 2848 * between two tasks started (effectively) simultaneously.
2851 */ 2849 */
2852 return t1 > t2; 2850 return t1 > t2;
2853 } 2851 }
2854 } 2852 }
2855 2853
2856 /* 2854 /*
2857 * This function is a callback from heap_insert() and is used to order 2855 * This function is a callback from heap_insert() and is used to order
2858 * the heap. 2856 * the heap.
2859 * In this case we order the heap in descending task start time. 2857 * In this case we order the heap in descending task start time.
2860 */ 2858 */
2861 static inline int started_after(void *p1, void *p2) 2859 static inline int started_after(void *p1, void *p2)
2862 { 2860 {
2863 struct task_struct *t1 = p1; 2861 struct task_struct *t1 = p1;
2864 struct task_struct *t2 = p2; 2862 struct task_struct *t2 = p2;
2865 return started_after_time(t1, &t2->start_time, t2); 2863 return started_after_time(t1, &t2->start_time, t2);
2866 } 2864 }
2867 2865
2868 /** 2866 /**
2869 * cgroup_scan_tasks - iterate through all the tasks in a cgroup 2867 * cgroup_scan_tasks - iterate through all the tasks in a cgroup
2870 * @scan: struct cgroup_scanner containing arguments for the scan 2868 * @scan: struct cgroup_scanner containing arguments for the scan
2871 * 2869 *
2872 * Arguments include pointers to callback functions test_task() and 2870 * Arguments include pointers to callback functions test_task() and
2873 * process_task(). 2871 * process_task().
2874 * Iterate through all the tasks in a cgroup, calling test_task() for each, 2872 * Iterate through all the tasks in a cgroup, calling test_task() for each,
2875 * and if it returns true, call process_task() for it also. 2873 * and if it returns true, call process_task() for it also.
2876 * The test_task pointer may be NULL, meaning always true (select all tasks). 2874 * The test_task pointer may be NULL, meaning always true (select all tasks).
2877 * Effectively duplicates cgroup_iter_{start,next,end}() 2875 * Effectively duplicates cgroup_iter_{start,next,end}()
2878 * but does not lock css_set_lock for the call to process_task(). 2876 * but does not lock css_set_lock for the call to process_task().
2879 * The struct cgroup_scanner may be embedded in any structure of the caller's 2877 * The struct cgroup_scanner may be embedded in any structure of the caller's
2880 * creation. 2878 * creation.
2881 * It is guaranteed that process_task() will act on every task that 2879 * It is guaranteed that process_task() will act on every task that
2882 * is a member of the cgroup for the duration of this call. This 2880 * is a member of the cgroup for the duration of this call. This
2883 * function may or may not call process_task() for tasks that exit 2881 * function may or may not call process_task() for tasks that exit
2884 * or move to a different cgroup during the call, or are forked or 2882 * or move to a different cgroup during the call, or are forked or
2885 * move into the cgroup during the call. 2883 * move into the cgroup during the call.
2886 * 2884 *
2887 * Note that test_task() may be called with locks held, and may in some 2885 * Note that test_task() may be called with locks held, and may in some
2888 * situations be called multiple times for the same task, so it should 2886 * situations be called multiple times for the same task, so it should
2889 * be cheap. 2887 * be cheap.
2890 * If the heap pointer in the struct cgroup_scanner is non-NULL, a heap has been 2888 * If the heap pointer in the struct cgroup_scanner is non-NULL, a heap has been
2891 * pre-allocated and will be used for heap operations (and its "gt" member will 2889 * pre-allocated and will be used for heap operations (and its "gt" member will
2892 * be overwritten), else a temporary heap will be used (allocation of which 2890 * be overwritten), else a temporary heap will be used (allocation of which
2893 * may cause this function to fail). 2891 * may cause this function to fail).
2894 */ 2892 */
2895 int cgroup_scan_tasks(struct cgroup_scanner *scan) 2893 int cgroup_scan_tasks(struct cgroup_scanner *scan)
2896 { 2894 {
2897 int retval, i; 2895 int retval, i;
2898 struct cgroup_iter it; 2896 struct cgroup_iter it;
2899 struct task_struct *p, *dropped; 2897 struct task_struct *p, *dropped;
2900 /* Never dereference latest_task, since it's not refcounted */ 2898 /* Never dereference latest_task, since it's not refcounted */
2901 struct task_struct *latest_task = NULL; 2899 struct task_struct *latest_task = NULL;
2902 struct ptr_heap tmp_heap; 2900 struct ptr_heap tmp_heap;
2903 struct ptr_heap *heap; 2901 struct ptr_heap *heap;
2904 struct timespec latest_time = { 0, 0 }; 2902 struct timespec latest_time = { 0, 0 };
2905 2903
2906 if (scan->heap) { 2904 if (scan->heap) {
2907 /* The caller supplied our heap and pre-allocated its memory */ 2905 /* The caller supplied our heap and pre-allocated its memory */
2908 heap = scan->heap; 2906 heap = scan->heap;
2909 heap->gt = &started_after; 2907 heap->gt = &started_after;
2910 } else { 2908 } else {
2911 /* We need to allocate our own heap memory */ 2909 /* We need to allocate our own heap memory */
2912 heap = &tmp_heap; 2910 heap = &tmp_heap;
2913 retval = heap_init(heap, PAGE_SIZE, GFP_KERNEL, &started_after); 2911 retval = heap_init(heap, PAGE_SIZE, GFP_KERNEL, &started_after);
2914 if (retval) 2912 if (retval)
2915 /* cannot allocate the heap */ 2913 /* cannot allocate the heap */
2916 return retval; 2914 return retval;
2917 } 2915 }
2918 2916
2919 again: 2917 again:
2920 /* 2918 /*
2921 * Scan tasks in the cgroup, using the scanner's "test_task" callback 2919 * Scan tasks in the cgroup, using the scanner's "test_task" callback
2922 * to determine which are of interest, and using the scanner's 2920 * to determine which are of interest, and using the scanner's
2923 * "process_task" callback to process any of them that need an update. 2921 * "process_task" callback to process any of them that need an update.
2924 * Since we don't want to hold any locks during the task updates, 2922 * Since we don't want to hold any locks during the task updates,
2925 * gather tasks to be processed in a heap structure. 2923 * gather tasks to be processed in a heap structure.
2926 * The heap is sorted by descending task start time. 2924 * The heap is sorted by descending task start time.
2927 * If the statically-sized heap fills up, we overflow tasks that 2925 * If the statically-sized heap fills up, we overflow tasks that
2928 * started later, and in future iterations only consider tasks that 2926 * started later, and in future iterations only consider tasks that
2929 * started after the latest task in the previous pass. This 2927 * started after the latest task in the previous pass. This
2930 * guarantees forward progress and that we don't miss any tasks. 2928 * guarantees forward progress and that we don't miss any tasks.
2931 */ 2929 */
2932 heap->size = 0; 2930 heap->size = 0;
2933 cgroup_iter_start(scan->cg, &it); 2931 cgroup_iter_start(scan->cg, &it);
2934 while ((p = cgroup_iter_next(scan->cg, &it))) { 2932 while ((p = cgroup_iter_next(scan->cg, &it))) {
2935 /* 2933 /*
2936 * Only affect tasks that qualify per the caller's callback, 2934 * Only affect tasks that qualify per the caller's callback,
2937 * if he provided one 2935 * if he provided one
2938 */ 2936 */
2939 if (scan->test_task && !scan->test_task(p, scan)) 2937 if (scan->test_task && !scan->test_task(p, scan))
2940 continue; 2938 continue;
2941 /* 2939 /*
2942 * Only process tasks that started after the last task 2940 * Only process tasks that started after the last task
2943 * we processed 2941 * we processed
2944 */ 2942 */
2945 if (!started_after_time(p, &latest_time, latest_task)) 2943 if (!started_after_time(p, &latest_time, latest_task))
2946 continue; 2944 continue;
2947 dropped = heap_insert(heap, p); 2945 dropped = heap_insert(heap, p);
2948 if (dropped == NULL) { 2946 if (dropped == NULL) {
2949 /* 2947 /*
2950 * The new task was inserted; the heap wasn't 2948 * The new task was inserted; the heap wasn't
2951 * previously full 2949 * previously full
2952 */ 2950 */
2953 get_task_struct(p); 2951 get_task_struct(p);
2954 } else if (dropped != p) { 2952 } else if (dropped != p) {
2955 /* 2953 /*
2956 * The new task was inserted, and pushed out a 2954 * The new task was inserted, and pushed out a
2957 * different task 2955 * different task
2958 */ 2956 */
2959 get_task_struct(p); 2957 get_task_struct(p);
2960 put_task_struct(dropped); 2958 put_task_struct(dropped);
2961 } 2959 }
2962 /* 2960 /*
2963 * Else the new task was newer than anything already in 2961 * Else the new task was newer than anything already in
2964 * the heap and wasn't inserted 2962 * the heap and wasn't inserted
2965 */ 2963 */
2966 } 2964 }
2967 cgroup_iter_end(scan->cg, &it); 2965 cgroup_iter_end(scan->cg, &it);
2968 2966
2969 if (heap->size) { 2967 if (heap->size) {
2970 for (i = 0; i < heap->size; i++) { 2968 for (i = 0; i < heap->size; i++) {
2971 struct task_struct *q = heap->ptrs[i]; 2969 struct task_struct *q = heap->ptrs[i];
2972 if (i == 0) { 2970 if (i == 0) {
2973 latest_time = q->start_time; 2971 latest_time = q->start_time;
2974 latest_task = q; 2972 latest_task = q;
2975 } 2973 }
2976 /* Process the task per the caller's callback */ 2974 /* Process the task per the caller's callback */
2977 scan->process_task(q, scan); 2975 scan->process_task(q, scan);
2978 put_task_struct(q); 2976 put_task_struct(q);
2979 } 2977 }
2980 /* 2978 /*
2981 * If we had to process any tasks at all, scan again 2979 * If we had to process any tasks at all, scan again
2982 * in case some of them were in the middle of forking 2980 * in case some of them were in the middle of forking
2983 * children that didn't get processed. 2981 * children that didn't get processed.
2984 * Not the most efficient way to do it, but it avoids 2982 * Not the most efficient way to do it, but it avoids
2985 * having to take callback_mutex in the fork path 2983 * having to take callback_mutex in the fork path
2986 */ 2984 */
2987 goto again; 2985 goto again;
2988 } 2986 }
2989 if (heap == &tmp_heap) 2987 if (heap == &tmp_heap)
2990 heap_free(&tmp_heap); 2988 heap_free(&tmp_heap);
2991 return 0; 2989 return 0;
2992 } 2990 }
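A hedged sketch of a caller of cgroup_scan_tasks(). The callback signatures are inferred from the calls above (test_task() returns nonzero to select a task; process_task() runs without css_set_lock held), and mark_task() is a placeholder:

/* illustrative caller, not part of this patch */
static int example_test(struct task_struct *p, struct cgroup_scanner *scan)
{
	return !(p->flags & PF_EXITING);	/* skip exiting tasks */
}

static void example_process(struct task_struct *p, struct cgroup_scanner *scan)
{
	mark_task(p);				/* placeholder; may take locks */
}

static int example_scan(struct cgroup *cgrp)
{
	struct cgroup_scanner scan = {
		.cg		= cgrp,
		.test_task	= example_test,
		.process_task	= example_process,
		.heap		= NULL,	/* let cgroup_scan_tasks() allocate one */
	};

	return cgroup_scan_tasks(&scan);
}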
2993 2991
2994 /* 2992 /*
2995 * Stuff for reading the 'tasks'/'procs' files. 2993 * Stuff for reading the 'tasks'/'procs' files.
2996 * 2994 *
2997 * Reading this file can return large amounts of data if a cgroup has 2995 * Reading this file can return large amounts of data if a cgroup has
2998 * *lots* of attached tasks. So it may need several calls to read(), 2996 * *lots* of attached tasks. So it may need several calls to read(),
2999 * but we cannot guarantee that the information we produce is correct 2997 * but we cannot guarantee that the information we produce is correct
3000 * unless we produce it entirely atomically. 2998 * unless we produce it entirely atomically.
3001 * 2999 *
3002 */ 3000 */
3003 3001
3004 /* 3002 /*
3005 * The following two functions "fix" the issue where there are more pids 3003 * The following two functions "fix" the issue where there are more pids
3006 * than kmalloc will give memory for; in such cases, we use vmalloc/vfree. 3004 * than kmalloc will give memory for; in such cases, we use vmalloc/vfree.
3007 * TODO: replace with a kernel-wide solution to this problem 3005 * TODO: replace with a kernel-wide solution to this problem
3008 */ 3006 */
3009 #define PIDLIST_TOO_LARGE(c) ((c) * sizeof(pid_t) > (PAGE_SIZE * 2)) 3007 #define PIDLIST_TOO_LARGE(c) ((c) * sizeof(pid_t) > (PAGE_SIZE * 2))
3010 static void *pidlist_allocate(int count) 3008 static void *pidlist_allocate(int count)
3011 { 3009 {
3012 if (PIDLIST_TOO_LARGE(count)) 3010 if (PIDLIST_TOO_LARGE(count))
3013 return vmalloc(count * sizeof(pid_t)); 3011 return vmalloc(count * sizeof(pid_t));
3014 else 3012 else
3015 return kmalloc(count * sizeof(pid_t), GFP_KERNEL); 3013 return kmalloc(count * sizeof(pid_t), GFP_KERNEL);
3016 } 3014 }
3017 static void pidlist_free(void *p) 3015 static void pidlist_free(void *p)
3018 { 3016 {
3019 if (is_vmalloc_addr(p)) 3017 if (is_vmalloc_addr(p))
3020 vfree(p); 3018 vfree(p);
3021 else 3019 else
3022 kfree(p); 3020 kfree(p);
3023 } 3021 }
3024 static void *pidlist_resize(void *p, int newcount) 3022 static void *pidlist_resize(void *p, int newcount)
3025 { 3023 {
3026 void *newlist; 3024 void *newlist;
3027 /* note: if new alloc fails, old p will still be valid either way */ 3025 /* note: if new alloc fails, old p will still be valid either way */
3028 if (is_vmalloc_addr(p)) { 3026 if (is_vmalloc_addr(p)) {
3029 newlist = vmalloc(newcount * sizeof(pid_t)); 3027 newlist = vmalloc(newcount * sizeof(pid_t));
3030 if (!newlist) 3028 if (!newlist)
3031 return NULL; 3029 return NULL;
3032 memcpy(newlist, p, newcount * sizeof(pid_t)); 3030 memcpy(newlist, p, newcount * sizeof(pid_t));
3033 vfree(p); 3031 vfree(p);
3034 } else { 3032 } else {
3035 newlist = krealloc(p, newcount * sizeof(pid_t), GFP_KERNEL); 3033 newlist = krealloc(p, newcount * sizeof(pid_t), GFP_KERNEL);
3036 } 3034 }
3037 return newlist; 3035 return newlist;
3038 } 3036 }
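With the common 4 KiB page size and a 4-byte pid_t (both assumptions of this example, not guarantees), the cutoff above keeps lists of up to 2048 pids on kmalloc() and sends anything larger to vmalloc(). A standalone check of the arithmetic:

#include <stdio.h>

#ifndef PAGE_SIZE
#define PAGE_SIZE 4096				/* assumed for this example */
#endif
#define PIDLIST_TOO_LARGE(c) ((c) * sizeof(int) > (PAGE_SIZE * 2))

int main(void)
{
	printf("2048 pids -> %s\n", PIDLIST_TOO_LARGE(2048) ? "vmalloc" : "kmalloc");
	printf("2049 pids -> %s\n", PIDLIST_TOO_LARGE(2049) ? "vmalloc" : "kmalloc");
	return 0;
}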
3039 3037
3040 /* 3038 /*
3041 * pidlist_uniq - given a kmalloc()ed list, strip out all duplicate entries 3039 * pidlist_uniq - given a kmalloc()ed list, strip out all duplicate entries
3042 * If the new stripped list is sufficiently smaller and there's enough memory 3040 * If the new stripped list is sufficiently smaller and there's enough memory
3043 * to allocate a new buffer, will let go of the unneeded memory. Returns the 3041 * to allocate a new buffer, will let go of the unneeded memory. Returns the
3044 * number of unique elements. 3042 * number of unique elements.
3045 */ 3043 */
3046 /* is the size difference enough that we should re-allocate the array? */ 3044 /* is the size difference enough that we should re-allocate the array? */
3047 #define PIDLIST_REALLOC_DIFFERENCE(old, new) ((old) - PAGE_SIZE >= (new)) 3045 #define PIDLIST_REALLOC_DIFFERENCE(old, new) ((old) - PAGE_SIZE >= (new))
3048 static int pidlist_uniq(pid_t **p, int length) 3046 static int pidlist_uniq(pid_t **p, int length)
3049 { 3047 {
3050 int src, dest = 1; 3048 int src, dest = 1;
3051 pid_t *list = *p; 3049 pid_t *list = *p;
3052 pid_t *newlist; 3050 pid_t *newlist;
3053 3051
3054 /* 3052 /*
3055 * we presume the 0th element is unique, so src starts at 1. trivial 3053 * we presume the 0th element is unique, so src starts at 1. trivial
3056 * edge cases first; no work needs to be done for either 3054 * edge cases first; no work needs to be done for either
3057 */ 3055 */
3058 if (length == 0 || length == 1) 3056 if (length == 0 || length == 1)
3059 return length; 3057 return length;
3060 /* src and dest walk down the list; dest counts unique elements */ 3058 /* src and dest walk down the list; dest counts unique elements */
3061 for (src = 1; src < length; src++) { 3059 for (src = 1; src < length; src++) {
3062 /* find next unique element */ 3060 /* find next unique element */
3063 while (list[src] == list[src-1]) { 3061 while (list[src] == list[src-1]) {
3064 src++; 3062 src++;
3065 if (src == length) 3063 if (src == length)
3066 goto after; 3064 goto after;
3067 } 3065 }
3068 /* dest always points to where the next unique element goes */ 3066 /* dest always points to where the next unique element goes */
3069 list[dest] = list[src]; 3067 list[dest] = list[src];
3070 dest++; 3068 dest++;
3071 } 3069 }
3072 after: 3070 after:
3073 /* 3071 /*
3074 * if the length difference is large enough, we want to allocate a 3072 * if the length difference is large enough, we want to allocate a
3075 * smaller buffer to save memory. if this fails due to out of memory, 3073 * smaller buffer to save memory. if this fails due to out of memory,
3076 * we'll just stay with what we've got. 3074 * we'll just stay with what we've got.
3077 */ 3075 */
3078 if (PIDLIST_REALLOC_DIFFERENCE(length, dest)) { 3076 if (PIDLIST_REALLOC_DIFFERENCE(length, dest)) {
3079 newlist = pidlist_resize(list, dest); 3077 newlist = pidlist_resize(list, dest);
3080 if (newlist) 3078 if (newlist)
3081 *p = newlist; 3079 *p = newlist;
3082 } 3080 }
3083 return dest; 3081 return dest;
3084 } 3082 }
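A standalone demo of the de-duplication pass above (userspace C). The input must already be sorted, which the "procs" path guarantees by sorting before stripping duplicates; the realloc-shrinking step is omitted here:

#include <stdio.h>

static int uniq(int *list, int length)
{
	int src, dest = 1;

	if (length < 2)
		return length;
	for (src = 1; src < length; src++) {
		while (list[src] == list[src - 1]) {
			if (++src == length)
				return dest;
		}
		list[dest++] = list[src];
	}
	return dest;
}

int main(void)
{
	int pids[] = { 3, 3, 3, 7, 7, 12 };
	int i, n = uniq(pids, 6);

	for (i = 0; i < n; i++)
		printf("%d ", pids[i]);		/* prints: 3 7 12 */
	printf("\n");
	return 0;
}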
3085 3083
3086 static int cmppid(const void *a, const void *b) 3084 static int cmppid(const void *a, const void *b)
3087 { 3085 {
3088 return *(pid_t *)a - *(pid_t *)b; 3086 return *(pid_t *)a - *(pid_t *)b;
3089 } 3087 }
3090 3088
3091 /* 3089 /*
3092 * find the appropriate pidlist for our purpose (given procs vs tasks) 3090 * find the appropriate pidlist for our purpose (given procs vs tasks)
3093 * returns with the lock on that pidlist already held, and takes care 3091 * returns with the lock on that pidlist already held, and takes care
3094 * of the use count, or returns NULL with no locks held if we're out of 3092 * of the use count, or returns NULL with no locks held if we're out of
3095 * memory. 3093 * memory.
3096 */ 3094 */
3097 static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp, 3095 static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp,
3098 enum cgroup_filetype type) 3096 enum cgroup_filetype type)
3099 { 3097 {
3100 struct cgroup_pidlist *l; 3098 struct cgroup_pidlist *l;
3101 /* don't need task_nsproxy() if we're looking at ourself */ 3099 /* don't need task_nsproxy() if we're looking at ourself */
3102 struct pid_namespace *ns = current->nsproxy->pid_ns; 3100 struct pid_namespace *ns = current->nsproxy->pid_ns;
3103 3101
3104 /* 3102 /*
3105 * We can't drop the pidlist_mutex before taking the l->mutex in case 3103 * We can't drop the pidlist_mutex before taking the l->mutex in case
3106 * the last ref-holder is trying to remove l from the list at the same 3104 * the last ref-holder is trying to remove l from the list at the same
3107 * time. Holding the pidlist_mutex precludes somebody taking whichever 3105 * time. Holding the pidlist_mutex precludes somebody taking whichever
3108 * list we find out from under us - compare release_pid_array(). 3106 * list we find out from under us - compare release_pid_array().
3109 */ 3107 */
3110 mutex_lock(&cgrp->pidlist_mutex); 3108 mutex_lock(&cgrp->pidlist_mutex);
3111 list_for_each_entry(l, &cgrp->pidlists, links) { 3109 list_for_each_entry(l, &cgrp->pidlists, links) {
3112 if (l->key.type == type && l->key.ns == ns) { 3110 if (l->key.type == type && l->key.ns == ns) {
3113 /* make sure l doesn't vanish out from under us */ 3111 /* make sure l doesn't vanish out from under us */
3114 down_write(&l->mutex); 3112 down_write(&l->mutex);
3115 mutex_unlock(&cgrp->pidlist_mutex); 3113 mutex_unlock(&cgrp->pidlist_mutex);
3116 return l; 3114 return l;
3117 } 3115 }
3118 } 3116 }
3119 /* entry not found; create a new one */ 3117 /* entry not found; create a new one */
3120 l = kmalloc(sizeof(struct cgroup_pidlist), GFP_KERNEL); 3118 l = kmalloc(sizeof(struct cgroup_pidlist), GFP_KERNEL);
3121 if (!l) { 3119 if (!l) {
3122 mutex_unlock(&cgrp->pidlist_mutex); 3120 mutex_unlock(&cgrp->pidlist_mutex);
3123 return l; 3121 return l;
3124 } 3122 }
3125 init_rwsem(&l->mutex); 3123 init_rwsem(&l->mutex);
3126 down_write(&l->mutex); 3124 down_write(&l->mutex);
3127 l->key.type = type; 3125 l->key.type = type;
3128 l->key.ns = get_pid_ns(ns); 3126 l->key.ns = get_pid_ns(ns);
3129 l->use_count = 0; /* don't increment here */ 3127 l->use_count = 0; /* don't increment here */
3130 l->list = NULL; 3128 l->list = NULL;
3131 l->owner = cgrp; 3129 l->owner = cgrp;
3132 list_add(&l->links, &cgrp->pidlists); 3130 list_add(&l->links, &cgrp->pidlists);
3133 mutex_unlock(&cgrp->pidlist_mutex); 3131 mutex_unlock(&cgrp->pidlist_mutex);
3134 return l; 3132 return l;
3135 } 3133 }
3136 3134
3137 /* 3135 /*
3138 * Load a cgroup's pidarray with either procs' tgids or tasks' pids 3136 * Load a cgroup's pidarray with either procs' tgids or tasks' pids
3139 */ 3137 */
3140 static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type, 3138 static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,
3141 struct cgroup_pidlist **lp) 3139 struct cgroup_pidlist **lp)
3142 { 3140 {
3143 pid_t *array; 3141 pid_t *array;
3144 int length; 3142 int length;
3145 int pid, n = 0; /* used for populating the array */ 3143 int pid, n = 0; /* used for populating the array */
3146 struct cgroup_iter it; 3144 struct cgroup_iter it;
3147 struct task_struct *tsk; 3145 struct task_struct *tsk;
3148 struct cgroup_pidlist *l; 3146 struct cgroup_pidlist *l;
3149 3147
3150 /* 3148 /*
3151 * If cgroup gets more users after we read count, we won't have 3149 * If cgroup gets more users after we read count, we won't have
3152 * enough space - tough. This race is indistinguishable to the 3150 * enough space - tough. This race is indistinguishable to the
3153 * caller from the case that the additional cgroup users didn't 3151 * caller from the case that the additional cgroup users didn't
3154 * show up until sometime later on. 3152 * show up until sometime later on.
3155 */ 3153 */
3156 length = cgroup_task_count(cgrp); 3154 length = cgroup_task_count(cgrp);
3157 array = pidlist_allocate(length); 3155 array = pidlist_allocate(length);
3158 if (!array) 3156 if (!array)
3159 return -ENOMEM; 3157 return -ENOMEM;
3160 /* now, populate the array */ 3158 /* now, populate the array */
3161 cgroup_iter_start(cgrp, &it); 3159 cgroup_iter_start(cgrp, &it);
3162 while ((tsk = cgroup_iter_next(cgrp, &it))) { 3160 while ((tsk = cgroup_iter_next(cgrp, &it))) {
3163 if (unlikely(n == length)) 3161 if (unlikely(n == length))
3164 break; 3162 break;
3165 /* get tgid or pid for procs or tasks file respectively */ 3163 /* get tgid or pid for procs or tasks file respectively */
3166 if (type == CGROUP_FILE_PROCS) 3164 if (type == CGROUP_FILE_PROCS)
3167 pid = task_tgid_vnr(tsk); 3165 pid = task_tgid_vnr(tsk);
3168 else 3166 else
3169 pid = task_pid_vnr(tsk); 3167 pid = task_pid_vnr(tsk);
3170 if (pid > 0) /* make sure to only use valid results */ 3168 if (pid > 0) /* make sure to only use valid results */
3171 array[n++] = pid; 3169 array[n++] = pid;
3172 } 3170 }
3173 cgroup_iter_end(cgrp, &it); 3171 cgroup_iter_end(cgrp, &it);
3174 length = n; 3172 length = n;
3175 /* now sort & (if procs) strip out duplicates */ 3173 /* now sort & (if procs) strip out duplicates */
3176 sort(array, length, sizeof(pid_t), cmppid, NULL); 3174 sort(array, length, sizeof(pid_t), cmppid, NULL);
3177 if (type == CGROUP_FILE_PROCS) 3175 if (type == CGROUP_FILE_PROCS)
3178 length = pidlist_uniq(&array, length); 3176 length = pidlist_uniq(&array, length);
3179 l = cgroup_pidlist_find(cgrp, type); 3177 l = cgroup_pidlist_find(cgrp, type);
3180 if (!l) { 3178 if (!l) {
3181 pidlist_free(array); 3179 pidlist_free(array);
3182 return -ENOMEM; 3180 return -ENOMEM;
3183 } 3181 }
3184 /* store array, freeing old if necessary - lock already held */ 3182 /* store array, freeing old if necessary - lock already held */
3185 pidlist_free(l->list); 3183 pidlist_free(l->list);
3186 l->list = array; 3184 l->list = array;
3187 l->length = length; 3185 l->length = length;
3188 l->use_count++; 3186 l->use_count++;
3189 up_write(&l->mutex); 3187 up_write(&l->mutex);
3190 *lp = l; 3188 *lp = l;
3191 return 0; 3189 return 0;
3192 } 3190 }
3193 3191
3194 /** 3192 /**
3195 * cgroupstats_build - build and fill cgroupstats 3193 * cgroupstats_build - build and fill cgroupstats
3196 * @stats: cgroupstats to fill information into 3194 * @stats: cgroupstats to fill information into
3197 * @dentry: A dentry entry belonging to the cgroup for which stats have 3195 * @dentry: A dentry entry belonging to the cgroup for which stats have
3198 * been requested. 3196 * been requested.
3199 * 3197 *
3200 * Build and fill cgroupstats so that taskstats can export it to user 3198 * Build and fill cgroupstats so that taskstats can export it to user
3201 * space. 3199 * space.
3202 */ 3200 */
3203 int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry) 3201 int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry)
3204 { 3202 {
3205 int ret = -EINVAL; 3203 int ret = -EINVAL;
3206 struct cgroup *cgrp; 3204 struct cgroup *cgrp;
3207 struct cgroup_iter it; 3205 struct cgroup_iter it;
3208 struct task_struct *tsk; 3206 struct task_struct *tsk;
3209 3207
3210 /* 3208 /*
3211 * Validate dentry by checking the superblock operations, 3209 * Validate dentry by checking the superblock operations,
3212 * and make sure it's a directory. 3210 * and make sure it's a directory.
3213 */ 3211 */
3214 if (dentry->d_sb->s_op != &cgroup_ops || 3212 if (dentry->d_sb->s_op != &cgroup_ops ||
3215 !S_ISDIR(dentry->d_inode->i_mode)) 3213 !S_ISDIR(dentry->d_inode->i_mode))
3216 goto err; 3214 goto err;
3217 3215
3218 ret = 0; 3216 ret = 0;
3219 cgrp = dentry->d_fsdata; 3217 cgrp = dentry->d_fsdata;
3220 3218
3221 cgroup_iter_start(cgrp, &it); 3219 cgroup_iter_start(cgrp, &it);
3222 while ((tsk = cgroup_iter_next(cgrp, &it))) { 3220 while ((tsk = cgroup_iter_next(cgrp, &it))) {
3223 switch (tsk->state) { 3221 switch (tsk->state) {
3224 case TASK_RUNNING: 3222 case TASK_RUNNING:
3225 stats->nr_running++; 3223 stats->nr_running++;
3226 break; 3224 break;
3227 case TASK_INTERRUPTIBLE: 3225 case TASK_INTERRUPTIBLE:
3228 stats->nr_sleeping++; 3226 stats->nr_sleeping++;
3229 break; 3227 break;
3230 case TASK_UNINTERRUPTIBLE: 3228 case TASK_UNINTERRUPTIBLE:
3231 stats->nr_uninterruptible++; 3229 stats->nr_uninterruptible++;
3232 break; 3230 break;
3233 case TASK_STOPPED: 3231 case TASK_STOPPED:
3234 stats->nr_stopped++; 3232 stats->nr_stopped++;
3235 break; 3233 break;
3236 default: 3234 default:
3237 if (delayacct_is_task_waiting_on_io(tsk)) 3235 if (delayacct_is_task_waiting_on_io(tsk))
3238 stats->nr_io_wait++; 3236 stats->nr_io_wait++;
3239 break; 3237 break;
3240 } 3238 }
3241 } 3239 }
3242 cgroup_iter_end(cgrp, &it); 3240 cgroup_iter_end(cgrp, &it);
3243 3241
3244 err: 3242 err:
3245 return ret; 3243 return ret;
3246 } 3244 }
3247 3245
3248 3246
3249 /* 3247 /*
3250 * seq_file methods for the tasks/procs files. The seq_file position is the 3248 * seq_file methods for the tasks/procs files. The seq_file position is the
3251 * next pid to display; the seq_file iterator is a pointer to the pid 3249 * next pid to display; the seq_file iterator is a pointer to the pid
3252 * in the cgroup->l->list array. 3250 * in the cgroup->l->list array.
3253 */ 3251 */
3254 3252
3255 static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos) 3253 static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos)
3256 { 3254 {
3257 /* 3255 /*
3258 * Initially we receive a position value that corresponds to 3256 * Initially we receive a position value that corresponds to
3259 * one more than the last pid shown (or 0 on the first call or 3257 * one more than the last pid shown (or 0 on the first call or
3260 * after a seek to the start). Use a binary-search to find the 3258 * after a seek to the start). Use a binary-search to find the
3261 * next pid to display, if any 3259 * next pid to display, if any
3262 */ 3260 */
3263 struct cgroup_pidlist *l = s->private; 3261 struct cgroup_pidlist *l = s->private;
3264 int index = 0, pid = *pos; 3262 int index = 0, pid = *pos;
3265 int *iter; 3263 int *iter;
3266 3264
3267 down_read(&l->mutex); 3265 down_read(&l->mutex);
3268 if (pid) { 3266 if (pid) {
3269 int end = l->length; 3267 int end = l->length;
3270 3268
3271 while (index < end) { 3269 while (index < end) {
3272 int mid = (index + end) / 2; 3270 int mid = (index + end) / 2;
3273 if (l->list[mid] == pid) { 3271 if (l->list[mid] == pid) {
3274 index = mid; 3272 index = mid;
3275 break; 3273 break;
3276 } else if (l->list[mid] <= pid) 3274 } else if (l->list[mid] <= pid)
3277 index = mid + 1; 3275 index = mid + 1;
3278 else 3276 else
3279 end = mid; 3277 end = mid;
3280 } 3278 }
3281 } 3279 }
3282 /* If we're off the end of the array, we're done */ 3280 /* If we're off the end of the array, we're done */
3283 if (index >= l->length) 3281 if (index >= l->length)
3284 return NULL; 3282 return NULL;
3285 /* Update the abstract position to be the actual pid that we found */ 3283 /* Update the abstract position to be the actual pid that we found */
3286 iter = l->list + index; 3284 iter = l->list + index;
3287 *pos = *iter; 3285 *pos = *iter;
3288 return iter; 3286 return iter;
3289 } 3287 }
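A standalone demo of the resume-after-seek search above: given the saved position (one more than the last pid shown), it returns the index of the next pid to display in the sorted array, or -1 when past the end:

#include <stdio.h>

static int find_next(const int *list, int length, int pid)
{
	int index = 0, end = length;

	while (index < end) {
		int mid = (index + end) / 2;

		if (list[mid] == pid) {
			index = mid;
			break;
		} else if (list[mid] <= pid)
			index = mid + 1;
		else
			end = mid;
	}
	return index < length ? index : -1;
}

int main(void)
{
	int pids[] = { 3, 7, 12, 40 };

	/* the last pid shown was 7, so the saved position is 8 */
	printf("resume at index %d\n", find_next(pids, 4, 8));	/* 2, i.e. pid 12 */
	return 0;
}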
3290 3288
3291 static void cgroup_pidlist_stop(struct seq_file *s, void *v) 3289 static void cgroup_pidlist_stop(struct seq_file *s, void *v)
3292 { 3290 {
3293 struct cgroup_pidlist *l = s->private; 3291 struct cgroup_pidlist *l = s->private;
3294 up_read(&l->mutex); 3292 up_read(&l->mutex);
3295 } 3293 }
3296 3294
3297 static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos) 3295 static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos)
3298 { 3296 {
3299 struct cgroup_pidlist *l = s->private; 3297 struct cgroup_pidlist *l = s->private;
3300 pid_t *p = v; 3298 pid_t *p = v;
3301 pid_t *end = l->list + l->length; 3299 pid_t *end = l->list + l->length;
3302 /* 3300 /*
3303 * Advance to the next pid in the array. If this goes off the 3301 * Advance to the next pid in the array. If this goes off the
3304 * end, we're done 3302 * end, we're done
3305 */ 3303 */
3306 p++; 3304 p++;
3307 if (p >= end) { 3305 if (p >= end) {
3308 return NULL; 3306 return NULL;
3309 } else { 3307 } else {
3310 *pos = *p; 3308 *pos = *p;
3311 return p; 3309 return p;
3312 } 3310 }
3313 } 3311 }
3314 3312
3315 static int cgroup_pidlist_show(struct seq_file *s, void *v) 3313 static int cgroup_pidlist_show(struct seq_file *s, void *v)
3316 { 3314 {
3317 return seq_printf(s, "%d\n", *(int *)v); 3315 return seq_printf(s, "%d\n", *(int *)v);
3318 } 3316 }
3319 3317
3320 /* 3318 /*
3321 * seq_operations functions for iterating on pidlists through seq_file - 3319 * seq_operations functions for iterating on pidlists through seq_file -
3322 * independent of whether it's tasks or procs 3320 * independent of whether it's tasks or procs
3323 */ 3321 */
3324 static const struct seq_operations cgroup_pidlist_seq_operations = { 3322 static const struct seq_operations cgroup_pidlist_seq_operations = {
3325 .start = cgroup_pidlist_start, 3323 .start = cgroup_pidlist_start,
3326 .stop = cgroup_pidlist_stop, 3324 .stop = cgroup_pidlist_stop,
3327 .next = cgroup_pidlist_next, 3325 .next = cgroup_pidlist_next,
3328 .show = cgroup_pidlist_show, 3326 .show = cgroup_pidlist_show,
3329 }; 3327 };
3330 3328
3331 static void cgroup_release_pid_array(struct cgroup_pidlist *l) 3329 static void cgroup_release_pid_array(struct cgroup_pidlist *l)
3332 { 3330 {
3333 /* 3331 /*
3334 * the case where we're the last user of this particular pidlist will 3332 * the case where we're the last user of this particular pidlist will
3335 * have us remove it from the cgroup's list, which entails taking the 3333 * have us remove it from the cgroup's list, which entails taking the
3336 * mutex. since in pidlist_find the pidlist->lock depends on cgroup-> 3334 * mutex. since in pidlist_find the pidlist->lock depends on cgroup->
3337 * pidlist_mutex, we have to take pidlist_mutex first. 3335 * pidlist_mutex, we have to take pidlist_mutex first.
3338 */ 3336 */
3339 mutex_lock(&l->owner->pidlist_mutex); 3337 mutex_lock(&l->owner->pidlist_mutex);
3340 down_write(&l->mutex); 3338 down_write(&l->mutex);
3341 BUG_ON(!l->use_count); 3339 BUG_ON(!l->use_count);
3342 if (!--l->use_count) { 3340 if (!--l->use_count) {
3343 /* we're the last user if refcount is 0; remove and free */ 3341 /* we're the last user if refcount is 0; remove and free */
3344 list_del(&l->links); 3342 list_del(&l->links);
3345 mutex_unlock(&l->owner->pidlist_mutex); 3343 mutex_unlock(&l->owner->pidlist_mutex);
3346 pidlist_free(l->list); 3344 pidlist_free(l->list);
3347 put_pid_ns(l->key.ns); 3345 put_pid_ns(l->key.ns);
3348 up_write(&l->mutex); 3346 up_write(&l->mutex);
3349 kfree(l); 3347 kfree(l);
3350 return; 3348 return;
3351 } 3349 }
3352 mutex_unlock(&l->owner->pidlist_mutex); 3350 mutex_unlock(&l->owner->pidlist_mutex);
3353 up_write(&l->mutex); 3351 up_write(&l->mutex);
3354 } 3352 }
3355 3353
3356 static int cgroup_pidlist_release(struct inode *inode, struct file *file) 3354 static int cgroup_pidlist_release(struct inode *inode, struct file *file)
3357 { 3355 {
3358 struct cgroup_pidlist *l; 3356 struct cgroup_pidlist *l;
3359 if (!(file->f_mode & FMODE_READ)) 3357 if (!(file->f_mode & FMODE_READ))
3360 return 0; 3358 return 0;
3361 /* 3359 /*
3362 * the seq_file will only be initialized if the file was opened for 3360 * the seq_file will only be initialized if the file was opened for
3363 * reading; hence we check if it's not null only in that case. 3361 * reading; hence we check if it's not null only in that case.
3364 */ 3362 */
3365 l = ((struct seq_file *)file->private_data)->private; 3363 l = ((struct seq_file *)file->private_data)->private;
3366 cgroup_release_pid_array(l); 3364 cgroup_release_pid_array(l);
3367 return seq_release(inode, file); 3365 return seq_release(inode, file);
3368 } 3366 }
3369 3367
3370 static const struct file_operations cgroup_pidlist_operations = { 3368 static const struct file_operations cgroup_pidlist_operations = {
3371 .read = seq_read, 3369 .read = seq_read,
3372 .llseek = seq_lseek, 3370 .llseek = seq_lseek,
3373 .write = cgroup_file_write, 3371 .write = cgroup_file_write,
3374 .release = cgroup_pidlist_release, 3372 .release = cgroup_pidlist_release,
3375 }; 3373 };
3376 3374
3377 /* 3375 /*
3378 * The following functions handle opens on a file that displays a pidlist 3376 * The following functions handle opens on a file that displays a pidlist
3379 * (tasks or procs). Prepare an array of the process/thread IDs of whoever's 3377 * (tasks or procs). Prepare an array of the process/thread IDs of whoever's
3380 * in the cgroup. 3378 * in the cgroup.
3381 */ 3379 */
3382 /* helper function for the two below it */ 3380 /* helper function for the two below it */
3383 static int cgroup_pidlist_open(struct file *file, enum cgroup_filetype type) 3381 static int cgroup_pidlist_open(struct file *file, enum cgroup_filetype type)
3384 { 3382 {
3385 struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent); 3383 struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);
3386 struct cgroup_pidlist *l; 3384 struct cgroup_pidlist *l;
3387 int retval; 3385 int retval;
3388 3386
3389 /* Nothing to do for write-only files */ 3387 /* Nothing to do for write-only files */
3390 if (!(file->f_mode & FMODE_READ)) 3388 if (!(file->f_mode & FMODE_READ))
3391 return 0; 3389 return 0;
3392 3390
3393 /* have the array populated */ 3391 /* have the array populated */
3394 retval = pidlist_array_load(cgrp, type, &l); 3392 retval = pidlist_array_load(cgrp, type, &l);
3395 if (retval) 3393 if (retval)
3396 return retval; 3394 return retval;
3397 /* configure file information */ 3395 /* configure file information */
3398 file->f_op = &cgroup_pidlist_operations; 3396 file->f_op = &cgroup_pidlist_operations;
3399 3397
3400 retval = seq_open(file, &cgroup_pidlist_seq_operations); 3398 retval = seq_open(file, &cgroup_pidlist_seq_operations);
3401 if (retval) { 3399 if (retval) {
3402 cgroup_release_pid_array(l); 3400 cgroup_release_pid_array(l);
3403 return retval; 3401 return retval;
3404 } 3402 }
3405 ((struct seq_file *)file->private_data)->private = l; 3403 ((struct seq_file *)file->private_data)->private = l;
3406 return 0; 3404 return 0;
3407 } 3405 }
3408 static int cgroup_tasks_open(struct inode *unused, struct file *file) 3406 static int cgroup_tasks_open(struct inode *unused, struct file *file)
3409 { 3407 {
3410 return cgroup_pidlist_open(file, CGROUP_FILE_TASKS); 3408 return cgroup_pidlist_open(file, CGROUP_FILE_TASKS);
3411 } 3409 }
3412 static int cgroup_procs_open(struct inode *unused, struct file *file) 3410 static int cgroup_procs_open(struct inode *unused, struct file *file)
3413 { 3411 {
3414 return cgroup_pidlist_open(file, CGROUP_FILE_PROCS); 3412 return cgroup_pidlist_open(file, CGROUP_FILE_PROCS);
3415 } 3413 }
3416 3414
3417 static u64 cgroup_read_notify_on_release(struct cgroup *cgrp, 3415 static u64 cgroup_read_notify_on_release(struct cgroup *cgrp,
3418 struct cftype *cft) 3416 struct cftype *cft)
3419 { 3417 {
3420 return notify_on_release(cgrp); 3418 return notify_on_release(cgrp);
3421 } 3419 }
3422 3420
3423 static int cgroup_write_notify_on_release(struct cgroup *cgrp, 3421 static int cgroup_write_notify_on_release(struct cgroup *cgrp,
3424 struct cftype *cft, 3422 struct cftype *cft,
3425 u64 val) 3423 u64 val)
3426 { 3424 {
3427 clear_bit(CGRP_RELEASABLE, &cgrp->flags); 3425 clear_bit(CGRP_RELEASABLE, &cgrp->flags);
3428 if (val) 3426 if (val)
3429 set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); 3427 set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
3430 else 3428 else
3431 clear_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); 3429 clear_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
3432 return 0; 3430 return 0;
3433 } 3431 }
3434 3432
3435 /* 3433 /*
3436 * Unregister event and free resources. 3434 * Unregister event and free resources.
3437 * 3435 *
3438 * Gets called from workqueue. 3436 * Gets called from workqueue.
3439 */ 3437 */
3440 static void cgroup_event_remove(struct work_struct *work) 3438 static void cgroup_event_remove(struct work_struct *work)
3441 { 3439 {
3442 struct cgroup_event *event = container_of(work, struct cgroup_event, 3440 struct cgroup_event *event = container_of(work, struct cgroup_event,
3443 remove); 3441 remove);
3444 struct cgroup *cgrp = event->cgrp; 3442 struct cgroup *cgrp = event->cgrp;
3445 3443
3446 event->cft->unregister_event(cgrp, event->cft, event->eventfd); 3444 event->cft->unregister_event(cgrp, event->cft, event->eventfd);
3447 3445
3448 eventfd_ctx_put(event->eventfd); 3446 eventfd_ctx_put(event->eventfd);
3449 kfree(event); 3447 kfree(event);
3450 dput(cgrp->dentry); 3448 dput(cgrp->dentry);
3451 } 3449 }
3452 3450
3453 /* 3451 /*
3454 * Gets called on POLLHUP on eventfd when user closes it. 3452 * Gets called on POLLHUP on eventfd when user closes it.
3455 * 3453 *
3456 * Called with wqh->lock held and interrupts disabled. 3454 * Called with wqh->lock held and interrupts disabled.
3457 */ 3455 */
3458 static int cgroup_event_wake(wait_queue_t *wait, unsigned mode, 3456 static int cgroup_event_wake(wait_queue_t *wait, unsigned mode,
3459 int sync, void *key) 3457 int sync, void *key)
3460 { 3458 {
3461 struct cgroup_event *event = container_of(wait, 3459 struct cgroup_event *event = container_of(wait,
3462 struct cgroup_event, wait); 3460 struct cgroup_event, wait);
3463 struct cgroup *cgrp = event->cgrp; 3461 struct cgroup *cgrp = event->cgrp;
3464 unsigned long flags = (unsigned long)key; 3462 unsigned long flags = (unsigned long)key;
3465 3463
3466 if (flags & POLLHUP) { 3464 if (flags & POLLHUP) {
3467 __remove_wait_queue(event->wqh, &event->wait); 3465 __remove_wait_queue(event->wqh, &event->wait);
3468 spin_lock(&cgrp->event_list_lock); 3466 spin_lock(&cgrp->event_list_lock);
3469 list_del(&event->list); 3467 list_del(&event->list);
3470 spin_unlock(&cgrp->event_list_lock); 3468 spin_unlock(&cgrp->event_list_lock);
3471 /* 3469 /*
3472 * We are in atomic context, but cgroup_event_remove() may 3470 * We are in atomic context, but cgroup_event_remove() may
3473 * sleep, so we have to call it in workqueue. 3471 * sleep, so we have to call it in workqueue.
3474 */ 3472 */
3475 schedule_work(&event->remove); 3473 schedule_work(&event->remove);
3476 } 3474 }
3477 3475
3478 return 0; 3476 return 0;
3479 } 3477 }
3480 3478
3481 static void cgroup_event_ptable_queue_proc(struct file *file, 3479 static void cgroup_event_ptable_queue_proc(struct file *file,
3482 wait_queue_head_t *wqh, poll_table *pt) 3480 wait_queue_head_t *wqh, poll_table *pt)
3483 { 3481 {
3484 struct cgroup_event *event = container_of(pt, 3482 struct cgroup_event *event = container_of(pt,
3485 struct cgroup_event, pt); 3483 struct cgroup_event, pt);
3486 3484
3487 event->wqh = wqh; 3485 event->wqh = wqh;
3488 add_wait_queue(wqh, &event->wait); 3486 add_wait_queue(wqh, &event->wait);
3489 } 3487 }
3490 3488
3491 /* 3489 /*
3492 * Parse input and register new cgroup event handler. 3490 * Parse input and register new cgroup event handler.
3493 * 3491 *
3494 * Input must be in format '<event_fd> <control_fd> <args>'. 3492 * Input must be in format '<event_fd> <control_fd> <args>'.
3495 * Interpretation of args is defined by control file implementation. 3493 * Interpretation of args is defined by control file implementation.
3496 */ 3494 */
3497 static int cgroup_write_event_control(struct cgroup *cgrp, struct cftype *cft, 3495 static int cgroup_write_event_control(struct cgroup *cgrp, struct cftype *cft,
3498 const char *buffer) 3496 const char *buffer)
3499 { 3497 {
3500 struct cgroup_event *event = NULL; 3498 struct cgroup_event *event = NULL;
3501 unsigned int efd, cfd; 3499 unsigned int efd, cfd;
3502 struct file *efile = NULL; 3500 struct file *efile = NULL;
3503 struct file *cfile = NULL; 3501 struct file *cfile = NULL;
3504 char *endp; 3502 char *endp;
3505 int ret; 3503 int ret;
3506 3504
3507 efd = simple_strtoul(buffer, &endp, 10); 3505 efd = simple_strtoul(buffer, &endp, 10);
3508 if (*endp != ' ') 3506 if (*endp != ' ')
3509 return -EINVAL; 3507 return -EINVAL;
3510 buffer = endp + 1; 3508 buffer = endp + 1;
3511 3509
3512 cfd = simple_strtoul(buffer, &endp, 10); 3510 cfd = simple_strtoul(buffer, &endp, 10);
3513 if ((*endp != ' ') && (*endp != '\0')) 3511 if ((*endp != ' ') && (*endp != '\0'))
3514 return -EINVAL; 3512 return -EINVAL;
3515 buffer = endp + 1; 3513 buffer = endp + 1;
3516 3514
3517 event = kzalloc(sizeof(*event), GFP_KERNEL); 3515 event = kzalloc(sizeof(*event), GFP_KERNEL);
3518 if (!event) 3516 if (!event)
3519 return -ENOMEM; 3517 return -ENOMEM;
3520 event->cgrp = cgrp; 3518 event->cgrp = cgrp;
3521 INIT_LIST_HEAD(&event->list); 3519 INIT_LIST_HEAD(&event->list);
3522 init_poll_funcptr(&event->pt, cgroup_event_ptable_queue_proc); 3520 init_poll_funcptr(&event->pt, cgroup_event_ptable_queue_proc);
3523 init_waitqueue_func_entry(&event->wait, cgroup_event_wake); 3521 init_waitqueue_func_entry(&event->wait, cgroup_event_wake);
3524 INIT_WORK(&event->remove, cgroup_event_remove); 3522 INIT_WORK(&event->remove, cgroup_event_remove);
3525 3523
3526 efile = eventfd_fget(efd); 3524 efile = eventfd_fget(efd);
3527 if (IS_ERR(efile)) { 3525 if (IS_ERR(efile)) {
3528 ret = PTR_ERR(efile); 3526 ret = PTR_ERR(efile);
3529 goto fail; 3527 goto fail;
3530 } 3528 }
3531 3529
3532 event->eventfd = eventfd_ctx_fileget(efile); 3530 event->eventfd = eventfd_ctx_fileget(efile);
3533 if (IS_ERR(event->eventfd)) { 3531 if (IS_ERR(event->eventfd)) {
3534 ret = PTR_ERR(event->eventfd); 3532 ret = PTR_ERR(event->eventfd);
3535 goto fail; 3533 goto fail;
3536 } 3534 }
3537 3535
3538 cfile = fget(cfd); 3536 cfile = fget(cfd);
3539 if (!cfile) { 3537 if (!cfile) {
3540 ret = -EBADF; 3538 ret = -EBADF;
3541 goto fail; 3539 goto fail;
3542 } 3540 }
3543 3541
3544 /* the process needs read permission on the control file */ 3542 /* the process needs read permission on the control file */
3545 ret = file_permission(cfile, MAY_READ); 3543 ret = file_permission(cfile, MAY_READ);
3546 if (ret < 0) 3544 if (ret < 0)
3547 goto fail; 3545 goto fail;
3548 3546
3549 event->cft = __file_cft(cfile); 3547 event->cft = __file_cft(cfile);
3550 if (IS_ERR(event->cft)) { 3548 if (IS_ERR(event->cft)) {
3551 ret = PTR_ERR(event->cft); 3549 ret = PTR_ERR(event->cft);
3552 goto fail; 3550 goto fail;
3553 } 3551 }
3554 3552
3555 if (!event->cft->register_event || !event->cft->unregister_event) { 3553 if (!event->cft->register_event || !event->cft->unregister_event) {
3556 ret = -EINVAL; 3554 ret = -EINVAL;
3557 goto fail; 3555 goto fail;
3558 } 3556 }
3559 3557
3560 ret = event->cft->register_event(cgrp, event->cft, 3558 ret = event->cft->register_event(cgrp, event->cft,
3561 event->eventfd, buffer); 3559 event->eventfd, buffer);
3562 if (ret) 3560 if (ret)
3563 goto fail; 3561 goto fail;
3564 3562
3565 if (efile->f_op->poll(efile, &event->pt) & POLLHUP) { 3563 if (efile->f_op->poll(efile, &event->pt) & POLLHUP) {
3566 event->cft->unregister_event(cgrp, event->cft, event->eventfd); 3564 event->cft->unregister_event(cgrp, event->cft, event->eventfd);
3567 ret = 0; 3565 ret = 0;
3568 goto fail; 3566 goto fail;
3569 } 3567 }
3570 3568
3571 /* 3569 /*
3572 * Events should be removed after rmdir of cgroup directory, but before 3570 * Events should be removed after rmdir of cgroup directory, but before
3573 * destroying subsystem state objects. Let's take reference to cgroup 3571 * destroying subsystem state objects. Let's take reference to cgroup
3574 * directory dentry to do that. 3572 * directory dentry to do that.
3575 */ 3573 */
3576 dget(cgrp->dentry); 3574 dget(cgrp->dentry);
3577 3575
3578 spin_lock(&cgrp->event_list_lock); 3576 spin_lock(&cgrp->event_list_lock);
3579 list_add(&event->list, &cgrp->event_list); 3577 list_add(&event->list, &cgrp->event_list);
3580 spin_unlock(&cgrp->event_list_lock); 3578 spin_unlock(&cgrp->event_list_lock);
3581 3579
3582 fput(cfile); 3580 fput(cfile);
3583 fput(efile); 3581 fput(efile);
3584 3582
3585 return 0; 3583 return 0;
3586 3584
3587 fail: 3585 fail:
3588 if (cfile) 3586 if (cfile)
3589 fput(cfile); 3587 fput(cfile);
3590 3588
3591 if (event && event->eventfd && !IS_ERR(event->eventfd)) 3589 if (event && event->eventfd && !IS_ERR(event->eventfd))
3592 eventfd_ctx_put(event->eventfd); 3590 eventfd_ctx_put(event->eventfd);
3593 3591
3594 if (!IS_ERR_OR_NULL(efile)) 3592 if (!IS_ERR_OR_NULL(efile))
3595 fput(efile); 3593 fput(efile);
3596 3594
3597 kfree(event); 3595 kfree(event);
3598 3596
3599 return ret; 3597 return ret;
3600 } 3598 }
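From userspace, the format parsed above is driven by writing "<event_fd> <control_fd> <args>" into cgroup.event_control (declared in the files[] table below). A sketch follows; the mount point and the choice of a memcg usage threshold as the control file are assumptions for illustration only:

#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <unistd.h>
#include <fcntl.h>
#include <sys/eventfd.h>

int main(void)
{
	const char *grp = "/sys/fs/cgroup/memory/mygroup";	/* assumed mount */
	char path[256], cmd[64];
	uint64_t hits;
	int efd, cfd, ctl;

	efd = eventfd(0, 0);
	snprintf(path, sizeof(path), "%s/memory.usage_in_bytes", grp);
	cfd = open(path, O_RDONLY);
	snprintf(path, sizeof(path), "%s/cgroup.event_control", grp);
	ctl = open(path, O_WRONLY);
	if (efd < 0 || cfd < 0 || ctl < 0)
		return 1;

	/* "<event_fd> <control_fd> <args>": notify when usage crosses 1 MiB */
	snprintf(cmd, sizeof(cmd), "%d %d %d", efd, cfd, 1024 * 1024);
	if (write(ctl, cmd, strlen(cmd)) < 0)
		return 1;

	read(efd, &hits, sizeof(hits));		/* blocks until the event fires */
	printf("event fired %llu time(s)\n", (unsigned long long)hits);
	return 0;
}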
3601 3599
3602 static u64 cgroup_clone_children_read(struct cgroup *cgrp, 3600 static u64 cgroup_clone_children_read(struct cgroup *cgrp,
3603 struct cftype *cft) 3601 struct cftype *cft)
3604 { 3602 {
3605 return clone_children(cgrp); 3603 return clone_children(cgrp);
3606 } 3604 }
3607 3605
3608 static int cgroup_clone_children_write(struct cgroup *cgrp, 3606 static int cgroup_clone_children_write(struct cgroup *cgrp,
3609 struct cftype *cft, 3607 struct cftype *cft,
3610 u64 val) 3608 u64 val)
3611 { 3609 {
3612 if (val) 3610 if (val)
3613 set_bit(CGRP_CLONE_CHILDREN, &cgrp->flags); 3611 set_bit(CGRP_CLONE_CHILDREN, &cgrp->flags);
3614 else 3612 else
3615 clear_bit(CGRP_CLONE_CHILDREN, &cgrp->flags); 3613 clear_bit(CGRP_CLONE_CHILDREN, &cgrp->flags);
3616 return 0; 3614 return 0;
3617 } 3615 }
3618 3616
3619 /* 3617 /*
3620 * for the common functions, 'private' gives the type of file 3618 * for the common functions, 'private' gives the type of file
3621 */ 3619 */
3622 /* for hysterical raisins, we can't put this on the older files */ 3620 /* for hysterical raisins, we can't put this on the older files */
3623 #define CGROUP_FILE_GENERIC_PREFIX "cgroup." 3621 #define CGROUP_FILE_GENERIC_PREFIX "cgroup."
3624 static struct cftype files[] = { 3622 static struct cftype files[] = {
3625 { 3623 {
3626 .name = "tasks", 3624 .name = "tasks",
3627 .open = cgroup_tasks_open, 3625 .open = cgroup_tasks_open,
3628 .write_u64 = cgroup_tasks_write, 3626 .write_u64 = cgroup_tasks_write,
3629 .release = cgroup_pidlist_release, 3627 .release = cgroup_pidlist_release,
3630 .mode = S_IRUGO | S_IWUSR, 3628 .mode = S_IRUGO | S_IWUSR,
3631 }, 3629 },
3632 { 3630 {
3633 .name = CGROUP_FILE_GENERIC_PREFIX "procs", 3631 .name = CGROUP_FILE_GENERIC_PREFIX "procs",
3634 .open = cgroup_procs_open, 3632 .open = cgroup_procs_open,
3635 .write_u64 = cgroup_procs_write, 3633 .write_u64 = cgroup_procs_write,
3636 .release = cgroup_pidlist_release, 3634 .release = cgroup_pidlist_release,
3637 .mode = S_IRUGO | S_IWUSR, 3635 .mode = S_IRUGO | S_IWUSR,
3638 }, 3636 },
3639 { 3637 {
3640 .name = "notify_on_release", 3638 .name = "notify_on_release",
3641 .read_u64 = cgroup_read_notify_on_release, 3639 .read_u64 = cgroup_read_notify_on_release,
3642 .write_u64 = cgroup_write_notify_on_release, 3640 .write_u64 = cgroup_write_notify_on_release,
3643 }, 3641 },
3644 { 3642 {
3645 .name = CGROUP_FILE_GENERIC_PREFIX "event_control", 3643 .name = CGROUP_FILE_GENERIC_PREFIX "event_control",
3646 .write_string = cgroup_write_event_control, 3644 .write_string = cgroup_write_event_control,
3647 .mode = S_IWUGO, 3645 .mode = S_IWUGO,
3648 }, 3646 },
3649 { 3647 {
3650 .name = "cgroup.clone_children", 3648 .name = "cgroup.clone_children",
3651 .read_u64 = cgroup_clone_children_read, 3649 .read_u64 = cgroup_clone_children_read,
3652 .write_u64 = cgroup_clone_children_write, 3650 .write_u64 = cgroup_clone_children_write,
3653 }, 3651 },
3654 }; 3652 };
3655 3653
3656 static struct cftype cft_release_agent = { 3654 static struct cftype cft_release_agent = {
3657 .name = "release_agent", 3655 .name = "release_agent",
3658 .read_seq_string = cgroup_release_agent_show, 3656 .read_seq_string = cgroup_release_agent_show,
3659 .write_string = cgroup_release_agent_write, 3657 .write_string = cgroup_release_agent_write,
3660 .max_write_len = PATH_MAX, 3658 .max_write_len = PATH_MAX,
3661 }; 3659 };
3662 3660
3663 static int cgroup_populate_dir(struct cgroup *cgrp) 3661 static int cgroup_populate_dir(struct cgroup *cgrp)
3664 { 3662 {
3665 int err; 3663 int err;
3666 struct cgroup_subsys *ss; 3664 struct cgroup_subsys *ss;
3667 3665
3668 /* First clear out any existing files */ 3666 /* First clear out any existing files */
3669 cgroup_clear_directory(cgrp->dentry); 3667 cgroup_clear_directory(cgrp->dentry);
3670 3668
3671 err = cgroup_add_files(cgrp, NULL, files, ARRAY_SIZE(files)); 3669 err = cgroup_add_files(cgrp, NULL, files, ARRAY_SIZE(files));
3672 if (err < 0) 3670 if (err < 0)
3673 return err; 3671 return err;
3674 3672
3675 if (cgrp == cgrp->top_cgroup) { 3673 if (cgrp == cgrp->top_cgroup) {
3676 if ((err = cgroup_add_file(cgrp, NULL, &cft_release_agent)) < 0) 3674 if ((err = cgroup_add_file(cgrp, NULL, &cft_release_agent)) < 0)
3677 return err; 3675 return err;
3678 } 3676 }
3679 3677
3680 for_each_subsys(cgrp->root, ss) { 3678 for_each_subsys(cgrp->root, ss) {
3681 if (ss->populate && (err = ss->populate(ss, cgrp)) < 0) 3679 if (ss->populate && (err = ss->populate(ss, cgrp)) < 0)
3682 return err; 3680 return err;
3683 } 3681 }
3684 /* This cgroup is ready now */ 3682 /* This cgroup is ready now */
3685 for_each_subsys(cgrp->root, ss) { 3683 for_each_subsys(cgrp->root, ss) {
3686 struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; 3684 struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
3687 /* 3685 /*
3688 * Update id->css pointer and make this css visible from 3686 * Update id->css pointer and make this css visible from
3689 * CSS ID functions. This pointer will be dereferenced 3687 * CSS ID functions. This pointer will be dereferenced
3690 * from RCU-read-side without locks. 3688 * from RCU-read-side without locks.
3691 */ 3689 */
3692 if (css->id) 3690 if (css->id)
3693 rcu_assign_pointer(css->id->css, css); 3691 rcu_assign_pointer(css->id->css, css);
3694 } 3692 }
3695 3693
3696 return 0; 3694 return 0;
3697 } 3695 }
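
Subsystem control files follow the same cftype pattern as the base files table above, and cgroup_populate_dir() asks each mounted subsystem to add its own through ->populate(). A minimal kernel-side sketch under stated assumptions: the "example.threshold" file and its module-global backing store are hypothetical, and a real subsystem would keep per-cgroup state instead.

/*
 * Hypothetical sketch of a subsystem's ->populate() hook.  The file name and
 * the global backing store are assumptions for illustration only.
 */
#include <linux/cgroup.h>
#include <linux/kernel.h>
#include <linux/stat.h>

static u64 example_threshold;

static u64 example_threshold_read(struct cgroup *cgrp, struct cftype *cft)
{
        return example_threshold;
}

static int example_threshold_write(struct cgroup *cgrp, struct cftype *cft,
                                   u64 val)
{
        example_threshold = val;
        return 0;
}

static struct cftype example_files[] = {
        {
                .name = "example.threshold",
                .read_u64 = example_threshold_read,
                .write_u64 = example_threshold_write,
                .mode = S_IRUGO | S_IWUSR,
        },
};

static int example_populate(struct cgroup_subsys *ss, struct cgroup *cgrp)
{
        /* Called from cgroup_populate_dir() for every cgroup in the hierarchy. */
        return cgroup_add_files(cgrp, ss, example_files, ARRAY_SIZE(example_files));
}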
3698 3696
3699 static void init_cgroup_css(struct cgroup_subsys_state *css, 3697 static void init_cgroup_css(struct cgroup_subsys_state *css,
3700 struct cgroup_subsys *ss, 3698 struct cgroup_subsys *ss,
3701 struct cgroup *cgrp) 3699 struct cgroup *cgrp)
3702 { 3700 {
3703 css->cgroup = cgrp; 3701 css->cgroup = cgrp;
3704 atomic_set(&css->refcnt, 1); 3702 atomic_set(&css->refcnt, 1);
3705 css->flags = 0; 3703 css->flags = 0;
3706 css->id = NULL; 3704 css->id = NULL;
3707 if (cgrp == dummytop) 3705 if (cgrp == dummytop)
3708 set_bit(CSS_ROOT, &css->flags); 3706 set_bit(CSS_ROOT, &css->flags);
3709 BUG_ON(cgrp->subsys[ss->subsys_id]); 3707 BUG_ON(cgrp->subsys[ss->subsys_id]);
3710 cgrp->subsys[ss->subsys_id] = css; 3708 cgrp->subsys[ss->subsys_id] = css;
3711 } 3709 }
3712 3710
3713 static void cgroup_lock_hierarchy(struct cgroupfs_root *root) 3711 static void cgroup_lock_hierarchy(struct cgroupfs_root *root)
3714 { 3712 {
3715 /* We need to take each hierarchy_mutex in a consistent order */ 3713 /* We need to take each hierarchy_mutex in a consistent order */
3716 int i; 3714 int i;
3717 3715
3718 /* 3716 /*
3719 * No worry about a race with rebind_subsystems that might mess up the 3717 * No worry about a race with rebind_subsystems that might mess up the
3720 * locking order, since both parties are under cgroup_mutex. 3718 * locking order, since both parties are under cgroup_mutex.
3721 */ 3719 */
3722 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 3720 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
3723 struct cgroup_subsys *ss = subsys[i]; 3721 struct cgroup_subsys *ss = subsys[i];
3724 if (ss == NULL) 3722 if (ss == NULL)
3725 continue; 3723 continue;
3726 if (ss->root == root) 3724 if (ss->root == root)
3727 mutex_lock(&ss->hierarchy_mutex); 3725 mutex_lock(&ss->hierarchy_mutex);
3728 } 3726 }
3729 } 3727 }
3730 3728
3731 static void cgroup_unlock_hierarchy(struct cgroupfs_root *root) 3729 static void cgroup_unlock_hierarchy(struct cgroupfs_root *root)
3732 { 3730 {
3733 int i; 3731 int i;
3734 3732
3735 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 3733 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
3736 struct cgroup_subsys *ss = subsys[i]; 3734 struct cgroup_subsys *ss = subsys[i];
3737 if (ss == NULL) 3735 if (ss == NULL)
3738 continue; 3736 continue;
3739 if (ss->root == root) 3737 if (ss->root == root)
3740 mutex_unlock(&ss->hierarchy_mutex); 3738 mutex_unlock(&ss->hierarchy_mutex);
3741 } 3739 }
3742 } 3740 }
3743 3741
3744 /* 3742 /*
3745 * cgroup_create - create a cgroup 3743 * cgroup_create - create a cgroup
3746 * @parent: cgroup that will be parent of the new cgroup 3744 * @parent: cgroup that will be parent of the new cgroup
3747 * @dentry: dentry of the new cgroup 3745 * @dentry: dentry of the new cgroup
3748 * @mode: mode to set on new inode 3746 * @mode: mode to set on new inode
3749 * 3747 *
3750 * Must be called with the mutex on the parent inode held 3748 * Must be called with the mutex on the parent inode held
3751 */ 3749 */
3752 static long cgroup_create(struct cgroup *parent, struct dentry *dentry, 3750 static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
3753 mode_t mode) 3751 mode_t mode)
3754 { 3752 {
3755 struct cgroup *cgrp; 3753 struct cgroup *cgrp;
3756 struct cgroupfs_root *root = parent->root; 3754 struct cgroupfs_root *root = parent->root;
3757 int err = 0; 3755 int err = 0;
3758 struct cgroup_subsys *ss; 3756 struct cgroup_subsys *ss;
3759 struct super_block *sb = root->sb; 3757 struct super_block *sb = root->sb;
3760 3758
3761 cgrp = kzalloc(sizeof(*cgrp), GFP_KERNEL); 3759 cgrp = kzalloc(sizeof(*cgrp), GFP_KERNEL);
3762 if (!cgrp) 3760 if (!cgrp)
3763 return -ENOMEM; 3761 return -ENOMEM;
3764 3762
3765 /* Grab a reference on the superblock so the hierarchy doesn't 3763 /* Grab a reference on the superblock so the hierarchy doesn't
3766 * get deleted on unmount if there are child cgroups. This 3764 * get deleted on unmount if there are child cgroups. This
3767 * can be done outside cgroup_mutex, since the sb can't 3765 * can be done outside cgroup_mutex, since the sb can't
3768 * disappear while someone has an open control file on the 3766 * disappear while someone has an open control file on the
3769 * fs */ 3767 * fs */
3770 atomic_inc(&sb->s_active); 3768 atomic_inc(&sb->s_active);
3771 3769
3772 mutex_lock(&cgroup_mutex); 3770 mutex_lock(&cgroup_mutex);
3773 3771
3774 init_cgroup_housekeeping(cgrp); 3772 init_cgroup_housekeeping(cgrp);
3775 3773
3776 cgrp->parent = parent; 3774 cgrp->parent = parent;
3777 cgrp->root = parent->root; 3775 cgrp->root = parent->root;
3778 cgrp->top_cgroup = parent->top_cgroup; 3776 cgrp->top_cgroup = parent->top_cgroup;
3779 3777
3780 if (notify_on_release(parent)) 3778 if (notify_on_release(parent))
3781 set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); 3779 set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
3782 3780
3783 if (clone_children(parent)) 3781 if (clone_children(parent))
3784 set_bit(CGRP_CLONE_CHILDREN, &cgrp->flags); 3782 set_bit(CGRP_CLONE_CHILDREN, &cgrp->flags);
3785 3783
3786 for_each_subsys(root, ss) { 3784 for_each_subsys(root, ss) {
3787 struct cgroup_subsys_state *css = ss->create(ss, cgrp); 3785 struct cgroup_subsys_state *css = ss->create(ss, cgrp);
3788 3786
3789 if (IS_ERR(css)) { 3787 if (IS_ERR(css)) {
3790 err = PTR_ERR(css); 3788 err = PTR_ERR(css);
3791 goto err_destroy; 3789 goto err_destroy;
3792 } 3790 }
3793 init_cgroup_css(css, ss, cgrp); 3791 init_cgroup_css(css, ss, cgrp);
3794 if (ss->use_id) { 3792 if (ss->use_id) {
3795 err = alloc_css_id(ss, parent, cgrp); 3793 err = alloc_css_id(ss, parent, cgrp);
3796 if (err) 3794 if (err)
3797 goto err_destroy; 3795 goto err_destroy;
3798 } 3796 }
3799 /* At error, ->destroy() callback has to free assigned ID. */ 3797 /* At error, ->destroy() callback has to free assigned ID. */
3800 if (clone_children(parent) && ss->post_clone) 3798 if (clone_children(parent) && ss->post_clone)
3801 ss->post_clone(ss, cgrp); 3799 ss->post_clone(ss, cgrp);
3802 } 3800 }
3803 3801
3804 cgroup_lock_hierarchy(root); 3802 cgroup_lock_hierarchy(root);
3805 list_add(&cgrp->sibling, &cgrp->parent->children); 3803 list_add(&cgrp->sibling, &cgrp->parent->children);
3806 cgroup_unlock_hierarchy(root); 3804 cgroup_unlock_hierarchy(root);
3807 root->number_of_cgroups++; 3805 root->number_of_cgroups++;
3808 3806
3809 err = cgroup_create_dir(cgrp, dentry, mode); 3807 err = cgroup_create_dir(cgrp, dentry, mode);
3810 if (err < 0) 3808 if (err < 0)
3811 goto err_remove; 3809 goto err_remove;
3812 3810
3813 /* The cgroup directory was pre-locked for us */ 3811 /* The cgroup directory was pre-locked for us */
3814 BUG_ON(!mutex_is_locked(&cgrp->dentry->d_inode->i_mutex)); 3812 BUG_ON(!mutex_is_locked(&cgrp->dentry->d_inode->i_mutex));
3815 3813
3816 err = cgroup_populate_dir(cgrp); 3814 err = cgroup_populate_dir(cgrp);
3817 /* If err < 0, we have a half-filled directory - oh well ;) */ 3815 /* If err < 0, we have a half-filled directory - oh well ;) */
3818 3816
3819 mutex_unlock(&cgroup_mutex); 3817 mutex_unlock(&cgroup_mutex);
3820 mutex_unlock(&cgrp->dentry->d_inode->i_mutex); 3818 mutex_unlock(&cgrp->dentry->d_inode->i_mutex);
3821 3819
3822 return 0; 3820 return 0;
3823 3821
3824 err_remove: 3822 err_remove:
3825 3823
3826 cgroup_lock_hierarchy(root); 3824 cgroup_lock_hierarchy(root);
3827 list_del(&cgrp->sibling); 3825 list_del(&cgrp->sibling);
3828 cgroup_unlock_hierarchy(root); 3826 cgroup_unlock_hierarchy(root);
3829 root->number_of_cgroups--; 3827 root->number_of_cgroups--;
3830 3828
3831 err_destroy: 3829 err_destroy:
3832 3830
3833 for_each_subsys(root, ss) { 3831 for_each_subsys(root, ss) {
3834 if (cgrp->subsys[ss->subsys_id]) 3832 if (cgrp->subsys[ss->subsys_id])
3835 ss->destroy(ss, cgrp); 3833 ss->destroy(ss, cgrp);
3836 } 3834 }
3837 3835
3838 mutex_unlock(&cgroup_mutex); 3836 mutex_unlock(&cgroup_mutex);
3839 3837
3840 /* Release the reference count that we took on the superblock */ 3838 /* Release the reference count that we took on the superblock */
3841 deactivate_super(sb); 3839 deactivate_super(sb);
3842 3840
3843 kfree(cgrp); 3841 kfree(cgrp);
3844 return err; 3842 return err;
3845 } 3843 }
3846 3844
3847 static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, int mode) 3845 static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, int mode)
3848 { 3846 {
3849 struct cgroup *c_parent = dentry->d_parent->d_fsdata; 3847 struct cgroup *c_parent = dentry->d_parent->d_fsdata;
3850 3848
3851 /* the vfs holds inode->i_mutex already */ 3849 /* the vfs holds inode->i_mutex already */
3852 return cgroup_create(c_parent, dentry, mode | S_IFDIR); 3850 return cgroup_create(c_parent, dentry, mode | S_IFDIR);
3853 } 3851 }
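
The mkdir hook above is how new cgroups come into existence: creating a directory inside a mounted hierarchy lands in cgroup_create(). A minimal userspace sketch, where the mount point and the choice of the cpu controller are assumptions.

/*
 * Hypothetical sketch: create a child cgroup by mkdir()ing inside a mounted
 * hierarchy.  The mount point and the "cpu" controller are assumptions.
 */
#include <sys/mount.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <stdio.h>

int main(void)
{
        /* Mount a cgroup hierarchy with the cpu controller attached. */
        if (mount("cgroup", "/mnt/cgrp", "cgroup", 0, "cpu") < 0) {
                perror("mount");
                return 1;
        }

        /* The VFS routes this into cgroup_mkdir() -> cgroup_create(). */
        if (mkdir("/mnt/cgrp/mygroup", 0755) < 0) {
                perror("mkdir");
                return 1;
        }
        return 0;
}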
3854 3852
3855 static int cgroup_has_css_refs(struct cgroup *cgrp) 3853 static int cgroup_has_css_refs(struct cgroup *cgrp)
3856 { 3854 {
3857 /* Check the reference count on each subsystem. Since we 3855 /* Check the reference count on each subsystem. Since we
3858 * already established that there are no tasks in the 3856 * already established that there are no tasks in the
3859 * cgroup, if the css refcount is also 1, then there should 3857 * cgroup, if the css refcount is also 1, then there should
3860 * be no outstanding references, so the subsystem is safe to 3858 * be no outstanding references, so the subsystem is safe to
3861 * destroy. We scan across all subsystems rather than using 3859 * destroy. We scan across all subsystems rather than using
3862 * the per-hierarchy linked list of mounted subsystems since 3860 * the per-hierarchy linked list of mounted subsystems since
3863 * we can be called via check_for_release() with no 3861 * we can be called via check_for_release() with no
3864 * synchronization other than RCU, and the subsystem linked 3862 * synchronization other than RCU, and the subsystem linked
3865 * list isn't RCU-safe */ 3863 * list isn't RCU-safe */
3866 int i; 3864 int i;
3867 /* 3865 /*
3868 * We won't need to lock the subsys array, because the subsystems 3866 * We won't need to lock the subsys array, because the subsystems
3869 * we're concerned about aren't going anywhere since our cgroup root 3867 * we're concerned about aren't going anywhere since our cgroup root
3870 * has a reference on them. 3868 * has a reference on them.
3871 */ 3869 */
3872 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 3870 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
3873 struct cgroup_subsys *ss = subsys[i]; 3871 struct cgroup_subsys *ss = subsys[i];
3874 struct cgroup_subsys_state *css; 3872 struct cgroup_subsys_state *css;
3875 /* Skip subsystems not present or not in this hierarchy */ 3873 /* Skip subsystems not present or not in this hierarchy */
3876 if (ss == NULL || ss->root != cgrp->root) 3874 if (ss == NULL || ss->root != cgrp->root)
3877 continue; 3875 continue;
3878 css = cgrp->subsys[ss->subsys_id]; 3876 css = cgrp->subsys[ss->subsys_id];
3879 /* When called from check_for_release() it's possible 3877 /* When called from check_for_release() it's possible
3880 * that by this point the cgroup has been removed 3878 * that by this point the cgroup has been removed
3881 * and the css deleted. But a false-positive doesn't 3879 * and the css deleted. But a false-positive doesn't
3882 * matter, since it can only happen if the cgroup 3880 * matter, since it can only happen if the cgroup
3883 * has been deleted and hence no longer needs the 3881 * has been deleted and hence no longer needs the
3884 * release agent to be called anyway. */ 3882 * release agent to be called anyway. */
3885 if (css && (atomic_read(&css->refcnt) > 1)) 3883 if (css && (atomic_read(&css->refcnt) > 1))
3886 return 1; 3884 return 1;
3887 } 3885 }
3888 return 0; 3886 return 0;
3889 } 3887 }
3890 3888
3891 /* 3889 /*
3892 * Atomically mark all (or else none) of the cgroup's CSS objects as 3890 * Atomically mark all (or else none) of the cgroup's CSS objects as
3893 * CSS_REMOVED. Return true on success, or false if the cgroup has 3891 * CSS_REMOVED. Return true on success, or false if the cgroup has
3894 * busy subsystems. Call with cgroup_mutex held 3892 * busy subsystems. Call with cgroup_mutex held
3895 */ 3893 */
3896 3894
3897 static int cgroup_clear_css_refs(struct cgroup *cgrp) 3895 static int cgroup_clear_css_refs(struct cgroup *cgrp)
3898 { 3896 {
3899 struct cgroup_subsys *ss; 3897 struct cgroup_subsys *ss;
3900 unsigned long flags; 3898 unsigned long flags;
3901 bool failed = false; 3899 bool failed = false;
3902 local_irq_save(flags); 3900 local_irq_save(flags);
3903 for_each_subsys(cgrp->root, ss) { 3901 for_each_subsys(cgrp->root, ss) {
3904 struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; 3902 struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
3905 int refcnt; 3903 int refcnt;
3906 while (1) { 3904 while (1) {
3907 /* We can only remove a CSS with a refcnt==1 */ 3905 /* We can only remove a CSS with a refcnt==1 */
3908 refcnt = atomic_read(&css->refcnt); 3906 refcnt = atomic_read(&css->refcnt);
3909 if (refcnt > 1) { 3907 if (refcnt > 1) {
3910 failed = true; 3908 failed = true;
3911 goto done; 3909 goto done;
3912 } 3910 }
3913 BUG_ON(!refcnt); 3911 BUG_ON(!refcnt);
3914 /* 3912 /*
3915 * Drop the refcnt to 0 while we check other 3913 * Drop the refcnt to 0 while we check other
3916 * subsystems. This will cause any racing 3914 * subsystems. This will cause any racing
3917 * css_tryget() to spin until we set the 3915 * css_tryget() to spin until we set the
3918 * CSS_REMOVED bits or abort 3916 * CSS_REMOVED bits or abort
3919 */ 3917 */
3920 if (atomic_cmpxchg(&css->refcnt, refcnt, 0) == refcnt) 3918 if (atomic_cmpxchg(&css->refcnt, refcnt, 0) == refcnt)
3921 break; 3919 break;
3922 cpu_relax(); 3920 cpu_relax();
3923 } 3921 }
3924 } 3922 }
3925 done: 3923 done:
3926 for_each_subsys(cgrp->root, ss) { 3924 for_each_subsys(cgrp->root, ss) {
3927 struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; 3925 struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
3928 if (failed) { 3926 if (failed) {
3929 /* 3927 /*
3930 * Restore old refcnt if we previously managed 3928 * Restore old refcnt if we previously managed
3931 * to clear it from 1 to 0 3929 * to clear it from 1 to 0
3932 */ 3930 */
3933 if (!atomic_read(&css->refcnt)) 3931 if (!atomic_read(&css->refcnt))
3934 atomic_set(&css->refcnt, 1); 3932 atomic_set(&css->refcnt, 1);
3935 } else { 3933 } else {
3936 /* Commit the fact that the CSS is removed */ 3934 /* Commit the fact that the CSS is removed */
3937 set_bit(CSS_REMOVED, &css->flags); 3935 set_bit(CSS_REMOVED, &css->flags);
3938 } 3936 }
3939 } 3937 }
3940 local_irq_restore(flags); 3938 local_irq_restore(flags);
3941 return !failed; 3939 return !failed;
3942 } 3940 }
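
The loop above is an all-or-nothing commit: each css refcount is tentatively dropped from 1 to 0, and if any css turns out to be busy, everything already dropped is restored. The following is a standalone userspace analogue of that pattern, not kernel code; the object count and values are made up, and the retry-on-racing-tryget spin is omitted for brevity.

/*
 * Userspace analogue of cgroup_clear_css_refs(): tentatively drop each
 * counter from 1 to 0, then either roll back (on failure) or commit.
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define NOBJ 3

static atomic_int refcnt[NOBJ] = { 1, 1, 2 };   /* the third object is "busy" */
static bool removed[NOBJ];

static bool clear_refs(void)
{
        bool failed = false;
        int i;

        for (i = 0; i < NOBJ; i++) {
                int expected = 1;
                /* Only an idle object (refcount == 1) may be dropped to 0. */
                if (!atomic_compare_exchange_strong(&refcnt[i], &expected, 0)) {
                        failed = true;
                        break;
                }
        }

        for (i = 0; i < NOBJ; i++) {
                if (failed) {
                        /* Roll back any counter we managed to clear. */
                        if (atomic_load(&refcnt[i]) == 0)
                                atomic_store(&refcnt[i], 1);
                } else {
                        removed[i] = true;      /* commit, like setting CSS_REMOVED */
                }
        }
        return !failed;
}

int main(void)
{
        printf("clear_refs: %s\n", clear_refs() ? "committed" : "rolled back");
        return 0;
}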
3943 3941
3944 static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry) 3942 static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
3945 { 3943 {
3946 struct cgroup *cgrp = dentry->d_fsdata; 3944 struct cgroup *cgrp = dentry->d_fsdata;
3947 struct dentry *d; 3945 struct dentry *d;
3948 struct cgroup *parent; 3946 struct cgroup *parent;
3949 DEFINE_WAIT(wait); 3947 DEFINE_WAIT(wait);
3950 struct cgroup_event *event, *tmp; 3948 struct cgroup_event *event, *tmp;
3951 int ret; 3949 int ret;
3952 3950
3953 /* the vfs holds both inode->i_mutex already */ 3951 /* the vfs holds both inode->i_mutex already */
3954 again: 3952 again:
3955 mutex_lock(&cgroup_mutex); 3953 mutex_lock(&cgroup_mutex);
3956 if (atomic_read(&cgrp->count) != 0) { 3954 if (atomic_read(&cgrp->count) != 0) {
3957 mutex_unlock(&cgroup_mutex); 3955 mutex_unlock(&cgroup_mutex);
3958 return -EBUSY; 3956 return -EBUSY;
3959 } 3957 }
3960 if (!list_empty(&cgrp->children)) { 3958 if (!list_empty(&cgrp->children)) {
3961 mutex_unlock(&cgroup_mutex); 3959 mutex_unlock(&cgroup_mutex);
3962 return -EBUSY; 3960 return -EBUSY;
3963 } 3961 }
3964 mutex_unlock(&cgroup_mutex); 3962 mutex_unlock(&cgroup_mutex);
3965 3963
3966 /* 3964 /*
3967 * In general, subsystem has no css->refcnt after pre_destroy(). But 3965 * In general, subsystem has no css->refcnt after pre_destroy(). But
3968 * in racy cases, subsystem may have to get css->refcnt after 3966 * in racy cases, subsystem may have to get css->refcnt after
3969 * pre_destroy() and it makes rmdir return with -EBUSY. This sometimes 3967 * pre_destroy() and it makes rmdir return with -EBUSY. This sometimes
3970 * makes rmdir return -EBUSY too often. To avoid that, we use a waitqueue 3968 * makes rmdir return -EBUSY too often. To avoid that, we use a waitqueue
3971 * for cgroup's rmdir. CGRP_WAIT_ON_RMDIR is for synchronizing rmdir 3969 * for cgroup's rmdir. CGRP_WAIT_ON_RMDIR is for synchronizing rmdir
3972 * and subsystem's reference count handling. Please see css_get/put 3970 * and subsystem's reference count handling. Please see css_get/put
3973 * and css_tryget() and cgroup_wakeup_rmdir_waiter() implementation. 3971 * and css_tryget() and cgroup_wakeup_rmdir_waiter() implementation.
3974 */ 3972 */
3975 set_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags); 3973 set_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
3976 3974
3977 /* 3975 /*
3978 * Call pre_destroy handlers of subsys. Notify subsystems 3976 * Call pre_destroy handlers of subsys. Notify subsystems
3979 * that an rmdir() request has arrived. 3977 * that an rmdir() request has arrived.
3980 */ 3978 */
3981 ret = cgroup_call_pre_destroy(cgrp); 3979 ret = cgroup_call_pre_destroy(cgrp);
3982 if (ret) { 3980 if (ret) {
3983 clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags); 3981 clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
3984 return ret; 3982 return ret;
3985 } 3983 }
3986 3984
3987 mutex_lock(&cgroup_mutex); 3985 mutex_lock(&cgroup_mutex);
3988 parent = cgrp->parent; 3986 parent = cgrp->parent;
3989 if (atomic_read(&cgrp->count) || !list_empty(&cgrp->children)) { 3987 if (atomic_read(&cgrp->count) || !list_empty(&cgrp->children)) {
3990 clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags); 3988 clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
3991 mutex_unlock(&cgroup_mutex); 3989 mutex_unlock(&cgroup_mutex);
3992 return -EBUSY; 3990 return -EBUSY;
3993 } 3991 }
3994 prepare_to_wait(&cgroup_rmdir_waitq, &wait, TASK_INTERRUPTIBLE); 3992 prepare_to_wait(&cgroup_rmdir_waitq, &wait, TASK_INTERRUPTIBLE);
3995 if (!cgroup_clear_css_refs(cgrp)) { 3993 if (!cgroup_clear_css_refs(cgrp)) {
3996 mutex_unlock(&cgroup_mutex); 3994 mutex_unlock(&cgroup_mutex);
3997 /* 3995 /*
3998 * Because someone may call cgroup_wakeup_rmdir_waiter() before 3996 * Because someone may call cgroup_wakeup_rmdir_waiter() before
3999 * prepare_to_wait(), we need to check this flag. 3997 * prepare_to_wait(), we need to check this flag.
4000 */ 3998 */
4001 if (test_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags)) 3999 if (test_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags))
4002 schedule(); 4000 schedule();
4003 finish_wait(&cgroup_rmdir_waitq, &wait); 4001 finish_wait(&cgroup_rmdir_waitq, &wait);
4004 clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags); 4002 clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
4005 if (signal_pending(current)) 4003 if (signal_pending(current))
4006 return -EINTR; 4004 return -EINTR;
4007 goto again; 4005 goto again;
4008 } 4006 }
4009 /* NO css_tryget() can succeed after this point. */ 4007 /* NO css_tryget() can succeed after this point. */
4010 finish_wait(&cgroup_rmdir_waitq, &wait); 4008 finish_wait(&cgroup_rmdir_waitq, &wait);
4011 clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags); 4009 clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
4012 4010
4013 spin_lock(&release_list_lock); 4011 spin_lock(&release_list_lock);
4014 set_bit(CGRP_REMOVED, &cgrp->flags); 4012 set_bit(CGRP_REMOVED, &cgrp->flags);
4015 if (!list_empty(&cgrp->release_list)) 4013 if (!list_empty(&cgrp->release_list))
4016 list_del_init(&cgrp->release_list); 4014 list_del_init(&cgrp->release_list);
4017 spin_unlock(&release_list_lock); 4015 spin_unlock(&release_list_lock);
4018 4016
4019 cgroup_lock_hierarchy(cgrp->root); 4017 cgroup_lock_hierarchy(cgrp->root);
4020 /* delete this cgroup from parent->children */ 4018 /* delete this cgroup from parent->children */
4021 list_del_init(&cgrp->sibling); 4019 list_del_init(&cgrp->sibling);
4022 cgroup_unlock_hierarchy(cgrp->root); 4020 cgroup_unlock_hierarchy(cgrp->root);
4023 4021
4024 d = dget(cgrp->dentry); 4022 d = dget(cgrp->dentry);
4025 4023
4026 cgroup_d_remove_dir(d); 4024 cgroup_d_remove_dir(d);
4027 dput(d); 4025 dput(d);
4028 4026
4029 set_bit(CGRP_RELEASABLE, &parent->flags); 4027 set_bit(CGRP_RELEASABLE, &parent->flags);
4030 check_for_release(parent); 4028 check_for_release(parent);
4031 4029
4032 /* 4030 /*
4033 * Unregister events and notify userspace. 4031 * Unregister events and notify userspace.
4034 * Notify userspace about cgroup removal only after rmdir of the cgroup 4032 * Notify userspace about cgroup removal only after rmdir of the cgroup
4035 * directory, to avoid a race between userspace and kernelspace 4033 * directory, to avoid a race between userspace and kernelspace
4036 */ 4034 */
4037 spin_lock(&cgrp->event_list_lock); 4035 spin_lock(&cgrp->event_list_lock);
4038 list_for_each_entry_safe(event, tmp, &cgrp->event_list, list) { 4036 list_for_each_entry_safe(event, tmp, &cgrp->event_list, list) {
4039 list_del(&event->list); 4037 list_del(&event->list);
4040 remove_wait_queue(event->wqh, &event->wait); 4038 remove_wait_queue(event->wqh, &event->wait);
4041 eventfd_signal(event->eventfd, 1); 4039 eventfd_signal(event->eventfd, 1);
4042 schedule_work(&event->remove); 4040 schedule_work(&event->remove);
4043 } 4041 }
4044 spin_unlock(&cgrp->event_list_lock); 4042 spin_unlock(&cgrp->event_list_lock);
4045 4043
4046 mutex_unlock(&cgroup_mutex); 4044 mutex_unlock(&cgroup_mutex);
4047 return 0; 4045 return 0;
4048 } 4046 }
4049 4047
4050 static void __init cgroup_init_subsys(struct cgroup_subsys *ss) 4048 static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
4051 { 4049 {
4052 struct cgroup_subsys_state *css; 4050 struct cgroup_subsys_state *css;
4053 4051
4054 printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name); 4052 printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name);
4055 4053
4056 /* Create the top cgroup state for this subsystem */ 4054 /* Create the top cgroup state for this subsystem */
4057 list_add(&ss->sibling, &rootnode.subsys_list); 4055 list_add(&ss->sibling, &rootnode.subsys_list);
4058 ss->root = &rootnode; 4056 ss->root = &rootnode;
4059 css = ss->create(ss, dummytop); 4057 css = ss->create(ss, dummytop);
4060 /* We don't handle early failures gracefully */ 4058 /* We don't handle early failures gracefully */
4061 BUG_ON(IS_ERR(css)); 4059 BUG_ON(IS_ERR(css));
4062 init_cgroup_css(css, ss, dummytop); 4060 init_cgroup_css(css, ss, dummytop);
4063 4061
4064 /* Update the init_css_set to contain a subsys 4062 /* Update the init_css_set to contain a subsys
4065 * pointer to this state - since the subsystem is 4063 * pointer to this state - since the subsystem is
4066 * newly registered, all tasks and hence the 4064 * newly registered, all tasks and hence the
4067 * init_css_set is in the subsystem's top cgroup. */ 4065 * init_css_set is in the subsystem's top cgroup. */
4068 init_css_set.subsys[ss->subsys_id] = dummytop->subsys[ss->subsys_id]; 4066 init_css_set.subsys[ss->subsys_id] = dummytop->subsys[ss->subsys_id];
4069 4067
4070 need_forkexit_callback |= ss->fork || ss->exit; 4068 need_forkexit_callback |= ss->fork || ss->exit;
4071 4069
4072 /* At system boot, before all subsystems have been 4070 /* At system boot, before all subsystems have been
4073 * registered, no tasks have been forked, so we don't 4071 * registered, no tasks have been forked, so we don't
4074 * need to invoke fork callbacks here. */ 4072 * need to invoke fork callbacks here. */
4075 BUG_ON(!list_empty(&init_task.tasks)); 4073 BUG_ON(!list_empty(&init_task.tasks));
4076 4074
4077 mutex_init(&ss->hierarchy_mutex); 4075 mutex_init(&ss->hierarchy_mutex);
4078 lockdep_set_class(&ss->hierarchy_mutex, &ss->subsys_key); 4076 lockdep_set_class(&ss->hierarchy_mutex, &ss->subsys_key);
4079 ss->active = 1; 4077 ss->active = 1;
4080 4078
4081 /* this function shouldn't be used with modular subsystems, since they 4079 /* this function shouldn't be used with modular subsystems, since they
4082 * need to register a subsys_id, among other things */ 4080 * need to register a subsys_id, among other things */
4083 BUG_ON(ss->module); 4081 BUG_ON(ss->module);
4084 } 4082 }
4085 4083
4086 /** 4084 /**
4087 * cgroup_load_subsys: load and register a modular subsystem at runtime 4085 * cgroup_load_subsys: load and register a modular subsystem at runtime
4088 * @ss: the subsystem to load 4086 * @ss: the subsystem to load
4089 * 4087 *
4090 * This function should be called in a modular subsystem's initcall. If the 4088 * This function should be called in a modular subsystem's initcall. If the
4091 * subsystem is built as a module, it will be assigned a new subsys_id and set 4089 * subsystem is built as a module, it will be assigned a new subsys_id and set
4092 * up for use. If the subsystem is built-in anyway, work is delegated to the 4090 * up for use. If the subsystem is built-in anyway, work is delegated to the
4093 * simpler cgroup_init_subsys. 4091 * simpler cgroup_init_subsys.
4094 */ 4092 */
4095 int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss) 4093 int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
4096 { 4094 {
4097 int i; 4095 int i;
4098 struct cgroup_subsys_state *css; 4096 struct cgroup_subsys_state *css;
4099 4097
4100 /* check name and function validity */ 4098 /* check name and function validity */
4101 if (ss->name == NULL || strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN || 4099 if (ss->name == NULL || strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN ||
4102 ss->create == NULL || ss->destroy == NULL) 4100 ss->create == NULL || ss->destroy == NULL)
4103 return -EINVAL; 4101 return -EINVAL;
4104 4102
4105 /* 4103 /*
4106 * we don't support callbacks in modular subsystems. this check is 4104 * we don't support callbacks in modular subsystems. this check is
4107 * before the ss->module check for consistency; a subsystem that could 4105 * before the ss->module check for consistency; a subsystem that could
4108 * be a module should still have no callbacks even if the user isn't 4106 * be a module should still have no callbacks even if the user isn't
4109 * compiling it as one. 4107 * compiling it as one.
4110 */ 4108 */
4111 if (ss->fork || ss->exit) 4109 if (ss->fork || ss->exit)
4112 return -EINVAL; 4110 return -EINVAL;
4113 4111
4114 /* 4112 /*
4115 * an optionally modular subsystem is built-in: we want to do nothing, 4113 * an optionally modular subsystem is built-in: we want to do nothing,
4116 * since cgroup_init_subsys will have already taken care of it. 4114 * since cgroup_init_subsys will have already taken care of it.
4117 */ 4115 */
4118 if (ss->module == NULL) { 4116 if (ss->module == NULL) {
4119 /* a few sanity checks */ 4117 /* a few sanity checks */
4120 BUG_ON(ss->subsys_id >= CGROUP_BUILTIN_SUBSYS_COUNT); 4118 BUG_ON(ss->subsys_id >= CGROUP_BUILTIN_SUBSYS_COUNT);
4121 BUG_ON(subsys[ss->subsys_id] != ss); 4119 BUG_ON(subsys[ss->subsys_id] != ss);
4122 return 0; 4120 return 0;
4123 } 4121 }
4124 4122
4125 /* 4123 /*
4126 * need to register a subsys id before anything else - for example, 4124 * need to register a subsys id before anything else - for example,
4127 * init_cgroup_css needs it. 4125 * init_cgroup_css needs it.
4128 */ 4126 */
4129 mutex_lock(&cgroup_mutex); 4127 mutex_lock(&cgroup_mutex);
4130 /* find the first empty slot in the array */ 4128 /* find the first empty slot in the array */
4131 for (i = CGROUP_BUILTIN_SUBSYS_COUNT; i < CGROUP_SUBSYS_COUNT; i++) { 4129 for (i = CGROUP_BUILTIN_SUBSYS_COUNT; i < CGROUP_SUBSYS_COUNT; i++) {
4132 if (subsys[i] == NULL) 4130 if (subsys[i] == NULL)
4133 break; 4131 break;
4134 } 4132 }
4135 if (i == CGROUP_SUBSYS_COUNT) { 4133 if (i == CGROUP_SUBSYS_COUNT) {
4136 /* maximum number of subsystems already registered! */ 4134 /* maximum number of subsystems already registered! */
4137 mutex_unlock(&cgroup_mutex); 4135 mutex_unlock(&cgroup_mutex);
4138 return -EBUSY; 4136 return -EBUSY;
4139 } 4137 }
4140 /* assign ourselves the subsys_id */ 4138 /* assign ourselves the subsys_id */
4141 ss->subsys_id = i; 4139 ss->subsys_id = i;
4142 subsys[i] = ss; 4140 subsys[i] = ss;
4143 4141
4144 /* 4142 /*
4145 * no ss->create seems to need anything important in the ss struct, so 4143 * no ss->create seems to need anything important in the ss struct, so
4146 * this can happen first (i.e. before the rootnode attachment). 4144 * this can happen first (i.e. before the rootnode attachment).
4147 */ 4145 */
4148 css = ss->create(ss, dummytop); 4146 css = ss->create(ss, dummytop);
4149 if (IS_ERR(css)) { 4147 if (IS_ERR(css)) {
4150 /* failure case - need to deassign the subsys[] slot. */ 4148 /* failure case - need to deassign the subsys[] slot. */
4151 subsys[i] = NULL; 4149 subsys[i] = NULL;
4152 mutex_unlock(&cgroup_mutex); 4150 mutex_unlock(&cgroup_mutex);
4153 return PTR_ERR(css); 4151 return PTR_ERR(css);
4154 } 4152 }
4155 4153
4156 list_add(&ss->sibling, &rootnode.subsys_list); 4154 list_add(&ss->sibling, &rootnode.subsys_list);
4157 ss->root = &rootnode; 4155 ss->root = &rootnode;
4158 4156
4159 /* our new subsystem will be attached to the dummy hierarchy. */ 4157 /* our new subsystem will be attached to the dummy hierarchy. */
4160 init_cgroup_css(css, ss, dummytop); 4158 init_cgroup_css(css, ss, dummytop);
4161 /* init_idr must be after init_cgroup_css because it sets css->id. */ 4159 /* init_idr must be after init_cgroup_css because it sets css->id. */
4162 if (ss->use_id) { 4160 if (ss->use_id) {
4163 int ret = cgroup_init_idr(ss, css); 4161 int ret = cgroup_init_idr(ss, css);
4164 if (ret) { 4162 if (ret) {
4165 dummytop->subsys[ss->subsys_id] = NULL; 4163 dummytop->subsys[ss->subsys_id] = NULL;
4166 ss->destroy(ss, dummytop); 4164 ss->destroy(ss, dummytop);
4167 subsys[i] = NULL; 4165 subsys[i] = NULL;
4168 mutex_unlock(&cgroup_mutex); 4166 mutex_unlock(&cgroup_mutex);
4169 return ret; 4167 return ret;
4170 } 4168 }
4171 } 4169 }
4172 4170
4173 /* 4171 /*
4174 * Now we need to entangle the css into the existing css_sets. unlike 4172 * Now we need to entangle the css into the existing css_sets. unlike
4175 * in cgroup_init_subsys, there are now multiple css_sets, so each one 4173 * in cgroup_init_subsys, there are now multiple css_sets, so each one
4176 * will need a new pointer to it; done by iterating the css_set_table. 4174 * will need a new pointer to it; done by iterating the css_set_table.
4177 * furthermore, modifying the existing css_sets will corrupt the hash 4175 * furthermore, modifying the existing css_sets will corrupt the hash
4178 * table state, so each changed css_set will need its hash recomputed. 4176 * table state, so each changed css_set will need its hash recomputed.
4179 * this is all done under the css_set_lock. 4177 * this is all done under the css_set_lock.
4180 */ 4178 */
4181 write_lock(&css_set_lock); 4179 write_lock(&css_set_lock);
4182 for (i = 0; i < CSS_SET_TABLE_SIZE; i++) { 4180 for (i = 0; i < CSS_SET_TABLE_SIZE; i++) {
4183 struct css_set *cg; 4181 struct css_set *cg;
4184 struct hlist_node *node, *tmp; 4182 struct hlist_node *node, *tmp;
4185 struct hlist_head *bucket = &css_set_table[i], *new_bucket; 4183 struct hlist_head *bucket = &css_set_table[i], *new_bucket;
4186 4184
4187 hlist_for_each_entry_safe(cg, node, tmp, bucket, hlist) { 4185 hlist_for_each_entry_safe(cg, node, tmp, bucket, hlist) {
4188 /* skip entries that we already rehashed */ 4186 /* skip entries that we already rehashed */
4189 if (cg->subsys[ss->subsys_id]) 4187 if (cg->subsys[ss->subsys_id])
4190 continue; 4188 continue;
4191 /* remove existing entry */ 4189 /* remove existing entry */
4192 hlist_del(&cg->hlist); 4190 hlist_del(&cg->hlist);
4193 /* set new value */ 4191 /* set new value */
4194 cg->subsys[ss->subsys_id] = css; 4192 cg->subsys[ss->subsys_id] = css;
4195 /* recompute hash and restore entry */ 4193 /* recompute hash and restore entry */
4196 new_bucket = css_set_hash(cg->subsys); 4194 new_bucket = css_set_hash(cg->subsys);
4197 hlist_add_head(&cg->hlist, new_bucket); 4195 hlist_add_head(&cg->hlist, new_bucket);
4198 } 4196 }
4199 } 4197 }
4200 write_unlock(&css_set_lock); 4198 write_unlock(&css_set_lock);
4201 4199
4202 mutex_init(&ss->hierarchy_mutex); 4200 mutex_init(&ss->hierarchy_mutex);
4203 lockdep_set_class(&ss->hierarchy_mutex, &ss->subsys_key); 4201 lockdep_set_class(&ss->hierarchy_mutex, &ss->subsys_key);
4204 ss->active = 1; 4202 ss->active = 1;
4205 4203
4206 /* success! */ 4204 /* success! */
4207 mutex_unlock(&cgroup_mutex); 4205 mutex_unlock(&cgroup_mutex);
4208 return 0; 4206 return 0;
4209 } 4207 }
4210 EXPORT_SYMBOL_GPL(cgroup_load_subsys); 4208 EXPORT_SYMBOL_GPL(cgroup_load_subsys);
4211 4209
4212 /** 4210 /**
4213 * cgroup_unload_subsys: unload a modular subsystem 4211 * cgroup_unload_subsys: unload a modular subsystem
4214 * @ss: the subsystem to unload 4212 * @ss: the subsystem to unload
4215 * 4213 *
4216 * This function should be called in a modular subsystem's exitcall. When this 4214 * This function should be called in a modular subsystem's exitcall. When this
4217 * function is invoked, the refcount on the subsystem's module will be 0, so 4215 * function is invoked, the refcount on the subsystem's module will be 0, so
4218 * the subsystem will not be attached to any hierarchy. 4216 * the subsystem will not be attached to any hierarchy.
4219 */ 4217 */
4220 void cgroup_unload_subsys(struct cgroup_subsys *ss) 4218 void cgroup_unload_subsys(struct cgroup_subsys *ss)
4221 { 4219 {
4222 struct cg_cgroup_link *link; 4220 struct cg_cgroup_link *link;
4223 struct hlist_head *hhead; 4221 struct hlist_head *hhead;
4224 4222
4225 BUG_ON(ss->module == NULL); 4223 BUG_ON(ss->module == NULL);
4226 4224
4227 /* 4225 /*
4228 * we shouldn't be called if the subsystem is in use, and the use of 4226 * we shouldn't be called if the subsystem is in use, and the use of
4229 * try_module_get in parse_cgroupfs_options should ensure that it 4227 * try_module_get in parse_cgroupfs_options should ensure that it
4230 * doesn't start being used while we're killing it off. 4228 * doesn't start being used while we're killing it off.
4231 */ 4229 */
4232 BUG_ON(ss->root != &rootnode); 4230 BUG_ON(ss->root != &rootnode);
4233 4231
4234 mutex_lock(&cgroup_mutex); 4232 mutex_lock(&cgroup_mutex);
4235 /* deassign the subsys_id */ 4233 /* deassign the subsys_id */
4236 BUG_ON(ss->subsys_id < CGROUP_BUILTIN_SUBSYS_COUNT); 4234 BUG_ON(ss->subsys_id < CGROUP_BUILTIN_SUBSYS_COUNT);
4237 subsys[ss->subsys_id] = NULL; 4235 subsys[ss->subsys_id] = NULL;
4238 4236
4239 /* remove subsystem from rootnode's list of subsystems */ 4237 /* remove subsystem from rootnode's list of subsystems */
4240 list_del_init(&ss->sibling); 4238 list_del_init(&ss->sibling);
4241 4239
4242 /* 4240 /*
4243 * disentangle the css from all css_sets attached to the dummytop. as 4241 * disentangle the css from all css_sets attached to the dummytop. as
4244 * in loading, we need to pay our respects to the hashtable gods. 4242 * in loading, we need to pay our respects to the hashtable gods.
4245 */ 4243 */
4246 write_lock(&css_set_lock); 4244 write_lock(&css_set_lock);
4247 list_for_each_entry(link, &dummytop->css_sets, cgrp_link_list) { 4245 list_for_each_entry(link, &dummytop->css_sets, cgrp_link_list) {
4248 struct css_set *cg = link->cg; 4246 struct css_set *cg = link->cg;
4249 4247
4250 hlist_del(&cg->hlist); 4248 hlist_del(&cg->hlist);
4251 BUG_ON(!cg->subsys[ss->subsys_id]); 4249 BUG_ON(!cg->subsys[ss->subsys_id]);
4252 cg->subsys[ss->subsys_id] = NULL; 4250 cg->subsys[ss->subsys_id] = NULL;
4253 hhead = css_set_hash(cg->subsys); 4251 hhead = css_set_hash(cg->subsys);
4254 hlist_add_head(&cg->hlist, hhead); 4252 hlist_add_head(&cg->hlist, hhead);
4255 } 4253 }
4256 write_unlock(&css_set_lock); 4254 write_unlock(&css_set_lock);
4257 4255
4258 /* 4256 /*
4259 * remove subsystem's css from the dummytop and free it - need to free 4257 * remove subsystem's css from the dummytop and free it - need to free
4260 * before marking as null because ss->destroy needs the cgrp->subsys 4258 * before marking as null because ss->destroy needs the cgrp->subsys
4261 * pointer to find their state. note that this also takes care of 4259 * pointer to find their state. note that this also takes care of
4262 * freeing the css_id. 4260 * freeing the css_id.
4263 */ 4261 */
4264 ss->destroy(ss, dummytop); 4262 ss->destroy(ss, dummytop);
4265 dummytop->subsys[ss->subsys_id] = NULL; 4263 dummytop->subsys[ss->subsys_id] = NULL;
4266 4264
4267 mutex_unlock(&cgroup_mutex); 4265 mutex_unlock(&cgroup_mutex);
4268 } 4266 }
4269 EXPORT_SYMBOL_GPL(cgroup_unload_subsys); 4267 EXPORT_SYMBOL_GPL(cgroup_unload_subsys);
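
The two exported helpers above define the whole lifecycle of a modular subsystem: assign a dynamic subsys_id and attach to the dummy hierarchy on load, detach and destroy on unload. Below is a minimal hypothetical module following that contract; every example_* name is an assumption, and as documented above a modular subsystem must not provide ->fork or ->exit.

/*
 * Hypothetical minimal modular subsystem using cgroup_load_subsys() and
 * cgroup_unload_subsys() as documented above.
 */
#include <linux/cgroup.h>
#include <linux/err.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/slab.h>

struct example_state {
        struct cgroup_subsys_state css;
};

static struct cgroup_subsys_state *example_create(struct cgroup_subsys *ss,
                                                  struct cgroup *cgrp)
{
        struct example_state *st = kzalloc(sizeof(*st), GFP_KERNEL);

        if (!st)
                return ERR_PTR(-ENOMEM);
        return &st->css;
}

static void example_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
{
        kfree(container_of(cgrp->subsys[ss->subsys_id],
                           struct example_state, css));
}

static struct cgroup_subsys example_subsys = {
        .name    = "example",
        .create  = example_create,
        .destroy = example_destroy,
        .module  = THIS_MODULE,
        /* no ->fork/->exit: cgroup_load_subsys() returns -EINVAL for those */
};

static int __init example_init(void)
{
        /* Assigns a dynamic subsys_id and attaches us to the dummy hierarchy. */
        return cgroup_load_subsys(&example_subsys);
}
module_init(example_init);

static void __exit example_exit(void)
{
        cgroup_unload_subsys(&example_subsys);
}
module_exit(example_exit);

MODULE_LICENSE("GPL");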
4270 4268
4271 /** 4269 /**
4272 * cgroup_init_early - cgroup initialization at system boot 4270 * cgroup_init_early - cgroup initialization at system boot
4273 * 4271 *
4274 * Initialize cgroups at system boot, and initialize any 4272 * Initialize cgroups at system boot, and initialize any
4275 * subsystems that request early init. 4273 * subsystems that request early init.
4276 */ 4274 */
4277 int __init cgroup_init_early(void) 4275 int __init cgroup_init_early(void)
4278 { 4276 {
4279 int i; 4277 int i;
4280 atomic_set(&init_css_set.refcount, 1); 4278 atomic_set(&init_css_set.refcount, 1);
4281 INIT_LIST_HEAD(&init_css_set.cg_links); 4279 INIT_LIST_HEAD(&init_css_set.cg_links);
4282 INIT_LIST_HEAD(&init_css_set.tasks); 4280 INIT_LIST_HEAD(&init_css_set.tasks);
4283 INIT_HLIST_NODE(&init_css_set.hlist); 4281 INIT_HLIST_NODE(&init_css_set.hlist);
4284 css_set_count = 1; 4282 css_set_count = 1;
4285 init_cgroup_root(&rootnode); 4283 init_cgroup_root(&rootnode);
4286 root_count = 1; 4284 root_count = 1;
4287 init_task.cgroups = &init_css_set; 4285 init_task.cgroups = &init_css_set;
4288 4286
4289 init_css_set_link.cg = &init_css_set; 4287 init_css_set_link.cg = &init_css_set;
4290 init_css_set_link.cgrp = dummytop; 4288 init_css_set_link.cgrp = dummytop;
4291 list_add(&init_css_set_link.cgrp_link_list, 4289 list_add(&init_css_set_link.cgrp_link_list,
4292 &rootnode.top_cgroup.css_sets); 4290 &rootnode.top_cgroup.css_sets);
4293 list_add(&init_css_set_link.cg_link_list, 4291 list_add(&init_css_set_link.cg_link_list,
4294 &init_css_set.cg_links); 4292 &init_css_set.cg_links);
4295 4293
4296 for (i = 0; i < CSS_SET_TABLE_SIZE; i++) 4294 for (i = 0; i < CSS_SET_TABLE_SIZE; i++)
4297 INIT_HLIST_HEAD(&css_set_table[i]); 4295 INIT_HLIST_HEAD(&css_set_table[i]);
4298 4296
4299 /* at bootup time, we don't worry about modular subsystems */ 4297 /* at bootup time, we don't worry about modular subsystems */
4300 for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) { 4298 for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
4301 struct cgroup_subsys *ss = subsys[i]; 4299 struct cgroup_subsys *ss = subsys[i];
4302 4300
4303 BUG_ON(!ss->name); 4301 BUG_ON(!ss->name);
4304 BUG_ON(strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN); 4302 BUG_ON(strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN);
4305 BUG_ON(!ss->create); 4303 BUG_ON(!ss->create);
4306 BUG_ON(!ss->destroy); 4304 BUG_ON(!ss->destroy);
4307 if (ss->subsys_id != i) { 4305 if (ss->subsys_id != i) {
4308 printk(KERN_ERR "cgroup: Subsys %s id == %d\n", 4306 printk(KERN_ERR "cgroup: Subsys %s id == %d\n",
4309 ss->name, ss->subsys_id); 4307 ss->name, ss->subsys_id);
4310 BUG(); 4308 BUG();
4311 } 4309 }
4312 4310
4313 if (ss->early_init) 4311 if (ss->early_init)
4314 cgroup_init_subsys(ss); 4312 cgroup_init_subsys(ss);
4315 } 4313 }
4316 return 0; 4314 return 0;
4317 } 4315 }
4318 4316
4319 /** 4317 /**
4320 * cgroup_init - cgroup initialization 4318 * cgroup_init - cgroup initialization
4321 * 4319 *
4322 * Register cgroup filesystem and /proc file, and initialize 4320 * Register cgroup filesystem and /proc file, and initialize
4323 * any subsystems that didn't request early init. 4321 * any subsystems that didn't request early init.
4324 */ 4322 */
4325 int __init cgroup_init(void) 4323 int __init cgroup_init(void)
4326 { 4324 {
4327 int err; 4325 int err;
4328 int i; 4326 int i;
4329 struct hlist_head *hhead; 4327 struct hlist_head *hhead;
4330 4328
4331 err = bdi_init(&cgroup_backing_dev_info); 4329 err = bdi_init(&cgroup_backing_dev_info);
4332 if (err) 4330 if (err)
4333 return err; 4331 return err;
4334 4332
4335 /* at bootup time, we don't worry about modular subsystems */ 4333 /* at bootup time, we don't worry about modular subsystems */
4336 for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) { 4334 for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
4337 struct cgroup_subsys *ss = subsys[i]; 4335 struct cgroup_subsys *ss = subsys[i];
4338 if (!ss->early_init) 4336 if (!ss->early_init)
4339 cgroup_init_subsys(ss); 4337 cgroup_init_subsys(ss);
4340 if (ss->use_id) 4338 if (ss->use_id)
4341 cgroup_init_idr(ss, init_css_set.subsys[ss->subsys_id]); 4339 cgroup_init_idr(ss, init_css_set.subsys[ss->subsys_id]);
4342 } 4340 }
4343 4341
4344 /* Add init_css_set to the hash table */ 4342 /* Add init_css_set to the hash table */
4345 hhead = css_set_hash(init_css_set.subsys); 4343 hhead = css_set_hash(init_css_set.subsys);
4346 hlist_add_head(&init_css_set.hlist, hhead); 4344 hlist_add_head(&init_css_set.hlist, hhead);
4347 BUG_ON(!init_root_id(&rootnode)); 4345 BUG_ON(!init_root_id(&rootnode));
4348 4346
4349 cgroup_kobj = kobject_create_and_add("cgroup", fs_kobj); 4347 cgroup_kobj = kobject_create_and_add("cgroup", fs_kobj);
4350 if (!cgroup_kobj) { 4348 if (!cgroup_kobj) {
4351 err = -ENOMEM; 4349 err = -ENOMEM;
4352 goto out; 4350 goto out;
4353 } 4351 }
4354 4352
4355 err = register_filesystem(&cgroup_fs_type); 4353 err = register_filesystem(&cgroup_fs_type);
4356 if (err < 0) { 4354 if (err < 0) {
4357 kobject_put(cgroup_kobj); 4355 kobject_put(cgroup_kobj);
4358 goto out; 4356 goto out;
4359 } 4357 }
4360 4358
4361 proc_create("cgroups", 0, NULL, &proc_cgroupstats_operations); 4359 proc_create("cgroups", 0, NULL, &proc_cgroupstats_operations);
4362 4360
4363 out: 4361 out:
4364 if (err) 4362 if (err)
4365 bdi_destroy(&cgroup_backing_dev_info); 4363 bdi_destroy(&cgroup_backing_dev_info);
4366 4364
4367 return err; 4365 return err;
4368 } 4366 }
4369 4367
4370 /* 4368 /*
4371 * proc_cgroup_show() 4369 * proc_cgroup_show()
4372 * - Print task's cgroup paths into seq_file, one line for each hierarchy 4370 * - Print task's cgroup paths into seq_file, one line for each hierarchy
4373 * - Used for /proc/<pid>/cgroup. 4371 * - Used for /proc/<pid>/cgroup.
4374 * - No need to task_lock(tsk) on this tsk->cgroup reference, as it 4372 * - No need to task_lock(tsk) on this tsk->cgroup reference, as it
4375 * doesn't really matter if tsk->cgroup changes after we read it, 4373 * doesn't really matter if tsk->cgroup changes after we read it,
4376 * and we take cgroup_mutex, keeping cgroup_attach_task() from changing it 4374 * and we take cgroup_mutex, keeping cgroup_attach_task() from changing it
4377 * anyway. No need to check that tsk->cgroup != NULL, thanks to 4375 * anyway. No need to check that tsk->cgroup != NULL, thanks to
4378 * the_top_cgroup_hack in cgroup_exit(), which sets an exiting tasks 4376 * the_top_cgroup_hack in cgroup_exit(), which sets an exiting tasks
4379 * cgroup to top_cgroup. 4377 * cgroup to top_cgroup.
4380 */ 4378 */
4381 4379
4382 /* TODO: Use a proper seq_file iterator */ 4380 /* TODO: Use a proper seq_file iterator */
4383 static int proc_cgroup_show(struct seq_file *m, void *v) 4381 static int proc_cgroup_show(struct seq_file *m, void *v)
4384 { 4382 {
4385 struct pid *pid; 4383 struct pid *pid;
4386 struct task_struct *tsk; 4384 struct task_struct *tsk;
4387 char *buf; 4385 char *buf;
4388 int retval; 4386 int retval;
4389 struct cgroupfs_root *root; 4387 struct cgroupfs_root *root;
4390 4388
4391 retval = -ENOMEM; 4389 retval = -ENOMEM;
4392 buf = kmalloc(PAGE_SIZE, GFP_KERNEL); 4390 buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
4393 if (!buf) 4391 if (!buf)
4394 goto out; 4392 goto out;
4395 4393
4396 retval = -ESRCH; 4394 retval = -ESRCH;
4397 pid = m->private; 4395 pid = m->private;
4398 tsk = get_pid_task(pid, PIDTYPE_PID); 4396 tsk = get_pid_task(pid, PIDTYPE_PID);
4399 if (!tsk) 4397 if (!tsk)
4400 goto out_free; 4398 goto out_free;
4401 4399
4402 retval = 0; 4400 retval = 0;
4403 4401
4404 mutex_lock(&cgroup_mutex); 4402 mutex_lock(&cgroup_mutex);
4405 4403
4406 for_each_active_root(root) { 4404 for_each_active_root(root) {
4407 struct cgroup_subsys *ss; 4405 struct cgroup_subsys *ss;
4408 struct cgroup *cgrp; 4406 struct cgroup *cgrp;
4409 int count = 0; 4407 int count = 0;
4410 4408
4411 seq_printf(m, "%d:", root->hierarchy_id); 4409 seq_printf(m, "%d:", root->hierarchy_id);
4412 for_each_subsys(root, ss) 4410 for_each_subsys(root, ss)
4413 seq_printf(m, "%s%s", count++ ? "," : "", ss->name); 4411 seq_printf(m, "%s%s", count++ ? "," : "", ss->name);
4414 if (strlen(root->name)) 4412 if (strlen(root->name))
4415 seq_printf(m, "%sname=%s", count ? "," : "", 4413 seq_printf(m, "%sname=%s", count ? "," : "",
4416 root->name); 4414 root->name);
4417 seq_putc(m, ':'); 4415 seq_putc(m, ':');
4418 cgrp = task_cgroup_from_root(tsk, root); 4416 cgrp = task_cgroup_from_root(tsk, root);
4419 retval = cgroup_path(cgrp, buf, PAGE_SIZE); 4417 retval = cgroup_path(cgrp, buf, PAGE_SIZE);
4420 if (retval < 0) 4418 if (retval < 0)
4421 goto out_unlock; 4419 goto out_unlock;
4422 seq_puts(m, buf); 4420 seq_puts(m, buf);
4423 seq_putc(m, '\n'); 4421 seq_putc(m, '\n');
4424 } 4422 }
4425 4423
4426 out_unlock: 4424 out_unlock:
4427 mutex_unlock(&cgroup_mutex); 4425 mutex_unlock(&cgroup_mutex);
4428 put_task_struct(tsk); 4426 put_task_struct(tsk);
4429 out_free: 4427 out_free:
4430 kfree(buf); 4428 kfree(buf);
4431 out: 4429 out:
4432 return retval; 4430 return retval;
4433 } 4431 }
4434 4432
4435 static int cgroup_open(struct inode *inode, struct file *file) 4433 static int cgroup_open(struct inode *inode, struct file *file)
4436 { 4434 {
4437 struct pid *pid = PROC_I(inode)->pid; 4435 struct pid *pid = PROC_I(inode)->pid;
4438 return single_open(file, proc_cgroup_show, pid); 4436 return single_open(file, proc_cgroup_show, pid);
4439 } 4437 }
4440 4438
4441 const struct file_operations proc_cgroup_operations = { 4439 const struct file_operations proc_cgroup_operations = {
4442 .open = cgroup_open, 4440 .open = cgroup_open,
4443 .read = seq_read, 4441 .read = seq_read,
4444 .llseek = seq_lseek, 4442 .llseek = seq_lseek,
4445 .release = single_release, 4443 .release = single_release,
4446 }; 4444 };
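
proc_cgroup_show() above emits one line per mounted hierarchy in the form "<hierarchy-id>:<subsys,list[,name=...]>:<path>". A small userspace sketch that reads it back; the parsing approach is an assumption and expects a non-empty controller field.

/*
 * Minimal sketch: read /proc/self/cgroup, formatted by proc_cgroup_show().
 */
#include <stdio.h>

int main(void)
{
        char line[4096];
        FILE *f = fopen("/proc/self/cgroup", "r");

        if (!f)
                return 1;

        while (fgets(line, sizeof(line), f)) {
                int id;
                char subsys[256], path[3584];

                /* Three colon-separated fields; the path may be just "/". */
                if (sscanf(line, "%d:%255[^:]:%3583[^\n]", &id, subsys, path) == 3)
                        printf("hierarchy %d (%s) -> %s\n", id, subsys, path);
        }
        fclose(f);
        return 0;
}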
4447 4445
4448 /* Display information about each subsystem and each hierarchy */ 4446 /* Display information about each subsystem and each hierarchy */
4449 static int proc_cgroupstats_show(struct seq_file *m, void *v) 4447 static int proc_cgroupstats_show(struct seq_file *m, void *v)
4450 { 4448 {
4451 int i; 4449 int i;
4452 4450
4453 seq_puts(m, "#subsys_name\thierarchy\tnum_cgroups\tenabled\n"); 4451 seq_puts(m, "#subsys_name\thierarchy\tnum_cgroups\tenabled\n");
4454 /* 4452 /*
4455 * ideally we don't want subsystems moving around while we do this. 4453 * ideally we don't want subsystems moving around while we do this.
4456 * cgroup_mutex is also necessary to guarantee an atomic snapshot of 4454 * cgroup_mutex is also necessary to guarantee an atomic snapshot of
4457 * subsys/hierarchy state. 4455 * subsys/hierarchy state.
4458 */ 4456 */
4459 mutex_lock(&cgroup_mutex); 4457 mutex_lock(&cgroup_mutex);
4460 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 4458 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
4461 struct cgroup_subsys *ss = subsys[i]; 4459 struct cgroup_subsys *ss = subsys[i];
4462 if (ss == NULL) 4460 if (ss == NULL)
4463 continue; 4461 continue;
4464 seq_printf(m, "%s\t%d\t%d\t%d\n", 4462 seq_printf(m, "%s\t%d\t%d\t%d\n",
4465 ss->name, ss->root->hierarchy_id, 4463 ss->name, ss->root->hierarchy_id,
4466 ss->root->number_of_cgroups, !ss->disabled); 4464 ss->root->number_of_cgroups, !ss->disabled);
4467 } 4465 }
4468 mutex_unlock(&cgroup_mutex); 4466 mutex_unlock(&cgroup_mutex);
4469 return 0; 4467 return 0;
4470 } 4468 }
4471 4469
4472 static int cgroupstats_open(struct inode *inode, struct file *file) 4470 static int cgroupstats_open(struct inode *inode, struct file *file)
4473 { 4471 {
4474 return single_open(file, proc_cgroupstats_show, NULL); 4472 return single_open(file, proc_cgroupstats_show, NULL);
4475 } 4473 }
4476 4474
4477 static const struct file_operations proc_cgroupstats_operations = { 4475 static const struct file_operations proc_cgroupstats_operations = {
4478 .open = cgroupstats_open, 4476 .open = cgroupstats_open,
4479 .read = seq_read, 4477 .read = seq_read,
4480 .llseek = seq_lseek, 4478 .llseek = seq_lseek,
4481 .release = single_release, 4479 .release = single_release,
4482 }; 4480 };
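
Likewise, proc_cgroupstats_show() backs /proc/cgroups with one tab-separated row per registered subsystem under the header it prints above. A matching userspace sketch:

/*
 * Minimal sketch: read /proc/cgroups, whose columns follow the
 * "#subsys_name  hierarchy  num_cgroups  enabled" header printed above.
 */
#include <stdio.h>

int main(void)
{
        char header[256], name[64];
        int hier, ncgroups, enabled;
        FILE *f = fopen("/proc/cgroups", "r");

        if (!f)
                return 1;

        /* Skip the "#subsys_name ..." header line. */
        if (!fgets(header, sizeof(header), f)) {
                fclose(f);
                return 1;
        }

        while (fscanf(f, "%63s %d %d %d", name, &hier, &ncgroups, &enabled) == 4)
                printf("%s: hierarchy=%d cgroups=%d enabled=%d\n",
                       name, hier, ncgroups, enabled);
        fclose(f);
        return 0;
}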
4483 4481
4484 /** 4482 /**
4485 * cgroup_fork - attach newly forked task to its parent's cgroup. 4483 * cgroup_fork - attach newly forked task to its parent's cgroup.
4486 * @child: pointer to task_struct of forking parent process. 4484 * @child: pointer to task_struct of forking parent process.
4487 * 4485 *
4488 * Description: A task inherits its parent's cgroup at fork(). 4486 * Description: A task inherits its parent's cgroup at fork().
4489 * 4487 *
4490 * A pointer to the shared css_set was automatically copied in 4488 * A pointer to the shared css_set was automatically copied in
4491 * fork.c by dup_task_struct(). However, we ignore that copy, since 4489 * fork.c by dup_task_struct(). However, we ignore that copy, since
4492 * it was not made under the protection of RCU or cgroup_mutex, so 4490 * it was not made under the protection of RCU or cgroup_mutex, so
4493 * might no longer be a valid cgroup pointer. cgroup_attach_task() might 4491 * might no longer be a valid cgroup pointer. cgroup_attach_task() might
4494 * have already changed current->cgroups, allowing the previously 4492 * have already changed current->cgroups, allowing the previously
4495 * referenced cgroup group to be removed and freed. 4493 * referenced cgroup group to be removed and freed.
4496 * 4494 *
4497 * At the point that cgroup_fork() is called, 'current' is the parent 4495 * At the point that cgroup_fork() is called, 'current' is the parent
4498 * task, and the passed argument 'child' points to the child task. 4496 * task, and the passed argument 'child' points to the child task.
4499 */ 4497 */
4500 void cgroup_fork(struct task_struct *child) 4498 void cgroup_fork(struct task_struct *child)
4501 { 4499 {
4502 task_lock(current); 4500 task_lock(current);
4503 child->cgroups = current->cgroups; 4501 child->cgroups = current->cgroups;
4504 get_css_set(child->cgroups); 4502 get_css_set(child->cgroups);
4505 task_unlock(current); 4503 task_unlock(current);
4506 INIT_LIST_HEAD(&child->cg_list); 4504 INIT_LIST_HEAD(&child->cg_list);
4507 } 4505 }
4508 4506
4509 /** 4507 /**
4510 * cgroup_fork_callbacks - run fork callbacks 4508 * cgroup_fork_callbacks - run fork callbacks
4511 * @child: the new task 4509 * @child: the new task
4512 * 4510 *
4513 * Called on a new task very soon before adding it to the 4511 * Called on a new task very soon before adding it to the
4514 * tasklist. No need to take any locks since no-one can 4512 * tasklist. No need to take any locks since no-one can
4515 * be operating on this task. 4513 * be operating on this task.
4516 */ 4514 */
4517 void cgroup_fork_callbacks(struct task_struct *child) 4515 void cgroup_fork_callbacks(struct task_struct *child)
4518 { 4516 {
4519 if (need_forkexit_callback) { 4517 if (need_forkexit_callback) {
4520 int i; 4518 int i;
4521 /* 4519 /*
4522 * forkexit callbacks are only supported for builtin 4520 * forkexit callbacks are only supported for builtin
4523 * subsystems, and the builtin section of the subsys array is 4521 * subsystems, and the builtin section of the subsys array is
4524 * immutable, so we don't need to lock the subsys array here. 4522 * immutable, so we don't need to lock the subsys array here.
4525 */ 4523 */
4526 for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) { 4524 for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
4527 struct cgroup_subsys *ss = subsys[i]; 4525 struct cgroup_subsys *ss = subsys[i];
4528 if (ss->fork) 4526 if (ss->fork)
4529 ss->fork(ss, child); 4527 ss->fork(ss, child);
4530 } 4528 }
4531 } 4529 }
4532 } 4530 }
4533 4531
4534 /** 4532 /**
4535 * cgroup_post_fork - called on a new task after adding it to the task list 4533 * cgroup_post_fork - called on a new task after adding it to the task list
4536 * @child: the task in question 4534 * @child: the task in question
4537 * 4535 *
4538 * Adds the task to the list running through its css_set if necessary. 4536 * Adds the task to the list running through its css_set if necessary.
4539 * Has to be after the task is visible on the task list in case we race 4537 * Has to be after the task is visible on the task list in case we race
4540 * with the first call to cgroup_iter_start() - to guarantee that the 4538 * with the first call to cgroup_iter_start() - to guarantee that the
4541 * new task ends up on its list. 4539 * new task ends up on its list.
4542 */ 4540 */
4543 void cgroup_post_fork(struct task_struct *child) 4541 void cgroup_post_fork(struct task_struct *child)
4544 { 4542 {
4545 if (use_task_css_set_links) { 4543 if (use_task_css_set_links) {
4546 write_lock(&css_set_lock); 4544 write_lock(&css_set_lock);
4547 task_lock(child); 4545 task_lock(child);
4548 if (list_empty(&child->cg_list)) 4546 if (list_empty(&child->cg_list))
4549 list_add(&child->cg_list, &child->cgroups->tasks); 4547 list_add(&child->cg_list, &child->cgroups->tasks);
4550 task_unlock(child); 4548 task_unlock(child);
4551 write_unlock(&css_set_lock); 4549 write_unlock(&css_set_lock);
4552 } 4550 }
4553 } 4551 }
4554 /** 4552 /**
4555 * cgroup_exit - detach cgroup from exiting task 4553 * cgroup_exit - detach cgroup from exiting task
4556 * @tsk: pointer to task_struct of exiting process 4554 * @tsk: pointer to task_struct of exiting process
4557 * @run_callbacks: run exit callbacks? 4555 * @run_callbacks: run exit callbacks?
4558 * 4556 *
4559 * Description: Detach cgroup from @tsk and release it. 4557 * Description: Detach cgroup from @tsk and release it.
4560 * 4558 *
4561 * Note that cgroups marked notify_on_release force every task in 4559 * Note that cgroups marked notify_on_release force every task in
4562 * them to take the global cgroup_mutex when exiting. 4560 * them to take the global cgroup_mutex when exiting.
4563 * This could impact scaling on very large systems. Be reluctant to 4561 * This could impact scaling on very large systems. Be reluctant to
4564 * use notify_on_release cgroups where very high task exit scaling 4562 * use notify_on_release cgroups where very high task exit scaling
4565 * is required on large systems. 4563 * is required on large systems.
4566 * 4564 *
4567 * the_top_cgroup_hack: 4565 * the_top_cgroup_hack:
4568 * 4566 *
4569 * Set the exiting task's cgroup to the root cgroup (top_cgroup). 4567 * Set the exiting task's cgroup to the root cgroup (top_cgroup).
4570 * 4568 *
4571 * We call cgroup_exit() while the task is still competent to 4569 * We call cgroup_exit() while the task is still competent to
4572 * handle notify_on_release(), then leave the task attached to the 4570 * handle notify_on_release(), then leave the task attached to the
4573 * root cgroup in each hierarchy for the remainder of its exit. 4571 * root cgroup in each hierarchy for the remainder of its exit.
4574 * 4572 *
4575 * To do this properly, we would increment the reference count on 4573 * To do this properly, we would increment the reference count on
4576 * top_cgroup, and near the very end of the kernel/exit.c do_exit() 4574 * top_cgroup, and near the very end of the kernel/exit.c do_exit()
4577 * code we would add a second cgroup function call, to drop that 4575 * code we would add a second cgroup function call, to drop that
4578 * reference. This would just create an unnecessary hot spot on 4576 * reference. This would just create an unnecessary hot spot on
4579 * the top_cgroup reference count, to no avail. 4577 * the top_cgroup reference count, to no avail.
4580 * 4578 *
4581 * Normally, holding a reference to a cgroup without bumping its 4579 * Normally, holding a reference to a cgroup without bumping its
4582 * count is unsafe. The cgroup could go away, or someone could 4580 * count is unsafe. The cgroup could go away, or someone could
4583 * attach us to a different cgroup, decrementing the count on 4581 * attach us to a different cgroup, decrementing the count on
4584 * the first cgroup that we never incremented. But in this case, 4582 * the first cgroup that we never incremented. But in this case,
4585 * top_cgroup isn't going away, and either task has PF_EXITING set, 4583 * top_cgroup isn't going away, and either task has PF_EXITING set,
4586 * which wards off any cgroup_attach_task() attempts, or task is a failed 4584 * which wards off any cgroup_attach_task() attempts, or task is a failed
4587 * fork, never visible to cgroup_attach_task. 4585 * fork, never visible to cgroup_attach_task.
4588 */ 4586 */
4589 void cgroup_exit(struct task_struct *tsk, int run_callbacks) 4587 void cgroup_exit(struct task_struct *tsk, int run_callbacks)
4590 { 4588 {
4591 struct css_set *cg; 4589 struct css_set *cg;
4592 int i; 4590 int i;
4593 4591
4594 /* 4592 /*
4595 * Unlink from the css_set task list if necessary. 4593 * Unlink from the css_set task list if necessary.
4596 * Optimistically check cg_list before taking 4594 * Optimistically check cg_list before taking
4597 * css_set_lock 4595 * css_set_lock
4598 */ 4596 */
4599 if (!list_empty(&tsk->cg_list)) { 4597 if (!list_empty(&tsk->cg_list)) {
4600 write_lock(&css_set_lock); 4598 write_lock(&css_set_lock);
4601 if (!list_empty(&tsk->cg_list)) 4599 if (!list_empty(&tsk->cg_list))
4602 list_del_init(&tsk->cg_list); 4600 list_del_init(&tsk->cg_list);
4603 write_unlock(&css_set_lock); 4601 write_unlock(&css_set_lock);
4604 } 4602 }
4605 4603
4606 /* Reassign the task to the init_css_set. */ 4604 /* Reassign the task to the init_css_set. */
4607 task_lock(tsk); 4605 task_lock(tsk);
4608 cg = tsk->cgroups; 4606 cg = tsk->cgroups;
4609 tsk->cgroups = &init_css_set; 4607 tsk->cgroups = &init_css_set;
4610 4608
4611 if (run_callbacks && need_forkexit_callback) { 4609 if (run_callbacks && need_forkexit_callback) {
4612 /* 4610 /*
4613 * modular subsystems can't use callbacks, so no need to lock 4611 * modular subsystems can't use callbacks, so no need to lock
4614 * the subsys array 4612 * the subsys array
4615 */ 4613 */
4616 for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) { 4614 for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
4617 struct cgroup_subsys *ss = subsys[i]; 4615 struct cgroup_subsys *ss = subsys[i];
4618 if (ss->exit) { 4616 if (ss->exit) {
4619 struct cgroup *old_cgrp = 4617 struct cgroup *old_cgrp =
4620 rcu_dereference_raw(cg->subsys[i])->cgroup; 4618 rcu_dereference_raw(cg->subsys[i])->cgroup;
4621 struct cgroup *cgrp = task_cgroup(tsk, i); 4619 struct cgroup *cgrp = task_cgroup(tsk, i);
4622 ss->exit(ss, cgrp, old_cgrp, tsk); 4620 ss->exit(ss, cgrp, old_cgrp, tsk);
4623 } 4621 }
4624 } 4622 }
4625 } 4623 }
4626 task_unlock(tsk); 4624 task_unlock(tsk);
4627 4625
4628 if (cg) 4626 if (cg)
4629 put_css_set_taskexit(cg); 4627 put_css_set_taskexit(cg);
4630 } 4628 }
4631 4629
4632 /** 4630 /**
4633 * cgroup_is_descendant - see if @cgrp is a descendant of @task's cgrp 4631 * cgroup_is_descendant - see if @cgrp is a descendant of @task's cgrp
4634 * @cgrp: the cgroup in question 4632 * @cgrp: the cgroup in question
4635 * @task: the task in question 4633 * @task: the task in question
4636 * 4634 *
4637 * See if @cgrp is a descendant of @task's cgroup in the appropriate 4635 * See if @cgrp is a descendant of @task's cgroup in the appropriate
4638 * hierarchy. 4636 * hierarchy.
4639 * 4637 *
4640 * If we are sending in dummytop, then presumably we are creating 4638 * If we are sending in dummytop, then presumably we are creating
4641 * the top cgroup in the subsystem. 4639 * the top cgroup in the subsystem.
4642 * 4640 *
4643 * Called only by the ns (nsproxy) cgroup. 4641 * Called only by the ns (nsproxy) cgroup.
4644 */ 4642 */
4645 int cgroup_is_descendant(const struct cgroup *cgrp, struct task_struct *task) 4643 int cgroup_is_descendant(const struct cgroup *cgrp, struct task_struct *task)
4646 { 4644 {
4647 int ret; 4645 int ret;
4648 struct cgroup *target; 4646 struct cgroup *target;
4649 4647
4650 if (cgrp == dummytop) 4648 if (cgrp == dummytop)
4651 return 1; 4649 return 1;
4652 4650
4653 target = task_cgroup_from_root(task, cgrp->root); 4651 target = task_cgroup_from_root(task, cgrp->root);
4654 while (cgrp != target && cgrp!= cgrp->top_cgroup) 4652 while (cgrp != target && cgrp!= cgrp->top_cgroup)
4655 cgrp = cgrp->parent; 4653 cgrp = cgrp->parent;
4656 ret = (cgrp == target); 4654 ret = (cgrp == target);
4657 return ret; 4655 return ret;
4658 } 4656 }
4659 4657
4660 static void check_for_release(struct cgroup *cgrp) 4658 static void check_for_release(struct cgroup *cgrp)
4661 { 4659 {
4662 /* All of these checks rely on RCU to keep the cgroup 4660 /* All of these checks rely on RCU to keep the cgroup
4663 * structure alive */ 4661 * structure alive */
4664 if (cgroup_is_releasable(cgrp) && !atomic_read(&cgrp->count) 4662 if (cgroup_is_releasable(cgrp) && !atomic_read(&cgrp->count)
4665 && list_empty(&cgrp->children) && !cgroup_has_css_refs(cgrp)) { 4663 && list_empty(&cgrp->children) && !cgroup_has_css_refs(cgrp)) {
4666 * Control Group is currently removable. If it's not 4664 * Control Group is currently removable. If it's not
4667 * already queued for a userspace notification, queue 4665 * already queued for a userspace notification, queue
4668 * it now */ 4666 * it now */
4669 int need_schedule_work = 0; 4667 int need_schedule_work = 0;
4670 spin_lock(&release_list_lock); 4668 spin_lock(&release_list_lock);
4671 if (!cgroup_is_removed(cgrp) && 4669 if (!cgroup_is_removed(cgrp) &&
4672 list_empty(&cgrp->release_list)) { 4670 list_empty(&cgrp->release_list)) {
4673 list_add(&cgrp->release_list, &release_list); 4671 list_add(&cgrp->release_list, &release_list);
4674 need_schedule_work = 1; 4672 need_schedule_work = 1;
4675 } 4673 }
4676 spin_unlock(&release_list_lock); 4674 spin_unlock(&release_list_lock);
4677 if (need_schedule_work) 4675 if (need_schedule_work)
4678 schedule_work(&release_agent_work); 4676 schedule_work(&release_agent_work);
4679 } 4677 }
4680 } 4678 }
4681 4679
4682 /* Caller must verify that the css is not for root cgroup */ 4680 /* Caller must verify that the css is not for root cgroup */
4683 void __css_put(struct cgroup_subsys_state *css, int count) 4681 void __css_put(struct cgroup_subsys_state *css, int count)
4684 { 4682 {
4685 struct cgroup *cgrp = css->cgroup; 4683 struct cgroup *cgrp = css->cgroup;
4686 int val; 4684 int val;
4687 rcu_read_lock(); 4685 rcu_read_lock();
4688 val = atomic_sub_return(count, &css->refcnt); 4686 val = atomic_sub_return(count, &css->refcnt);
4689 if (val == 1) { 4687 if (val == 1) {
4690 if (notify_on_release(cgrp)) { 4688 if (notify_on_release(cgrp)) {
4691 set_bit(CGRP_RELEASABLE, &cgrp->flags); 4689 set_bit(CGRP_RELEASABLE, &cgrp->flags);
4692 check_for_release(cgrp); 4690 check_for_release(cgrp);
4693 } 4691 }
4694 cgroup_wakeup_rmdir_waiter(cgrp); 4692 cgroup_wakeup_rmdir_waiter(cgrp);
4695 } 4693 }
4696 rcu_read_unlock(); 4694 rcu_read_unlock();
4697 WARN_ON_ONCE(val < 1); 4695 WARN_ON_ONCE(val < 1);
4698 } 4696 }
4699 EXPORT_SYMBOL_GPL(__css_put); 4697 EXPORT_SYMBOL_GPL(__css_put);
4700 4698
4701 /* 4699 /*
4702 * Notify userspace when a cgroup is released, by running the 4700 * Notify userspace when a cgroup is released, by running the
4703 * configured release agent with the name of the cgroup (path 4701 * configured release agent with the name of the cgroup (path
4704 * relative to the root of cgroup file system) as the argument. 4702 * relative to the root of cgroup file system) as the argument.
4705 * 4703 *
4706 * Most likely, this user command will try to rmdir this cgroup. 4704 * Most likely, this user command will try to rmdir this cgroup.
4707 * 4705 *
4708 * This races with the possibility that some other task will be 4706 * This races with the possibility that some other task will be
4709 * attached to this cgroup before it is removed, or that some other 4707 * attached to this cgroup before it is removed, or that some other
4710 * user task will 'mkdir' a child cgroup of this cgroup. That's ok. 4708 * user task will 'mkdir' a child cgroup of this cgroup. That's ok.
4711 * The presumed 'rmdir' will fail quietly if this cgroup is no longer 4709 * The presumed 'rmdir' will fail quietly if this cgroup is no longer
4712 * unused, and this cgroup will be reprieved from its death sentence, 4710 * unused, and this cgroup will be reprieved from its death sentence,
4713 * to continue to serve a useful existence. Next time it's released, 4711 * to continue to serve a useful existence. Next time it's released,
4714 * we will get notified again, if it still has 'notify_on_release' set. 4712 * we will get notified again, if it still has 'notify_on_release' set.
4715 * 4713 *
4716 * The final arg to call_usermodehelper() is UMH_WAIT_EXEC, which 4714 * The final arg to call_usermodehelper() is UMH_WAIT_EXEC, which
4717 * means only wait until the task is successfully execve()'d. The 4715 * means only wait until the task is successfully execve()'d. The
4718 * separate release agent task is forked by call_usermodehelper(), 4716 * separate release agent task is forked by call_usermodehelper(),
4719 * then control in this thread returns here, without waiting for the 4717 * then control in this thread returns here, without waiting for the
4720 * release agent task. We don't bother to wait because the caller of 4718 * release agent task. We don't bother to wait because the caller of
4721 * this routine has no use for the exit status of the release agent 4719 * this routine has no use for the exit status of the release agent
4722 * task, so no sense holding our caller up for that. 4720 * task, so no sense holding our caller up for that.
4723 */ 4721 */
4724 static void cgroup_release_agent(struct work_struct *work) 4722 static void cgroup_release_agent(struct work_struct *work)
4725 { 4723 {
4726 BUG_ON(work != &release_agent_work); 4724 BUG_ON(work != &release_agent_work);
4727 mutex_lock(&cgroup_mutex); 4725 mutex_lock(&cgroup_mutex);
4728 spin_lock(&release_list_lock); 4726 spin_lock(&release_list_lock);
4729 while (!list_empty(&release_list)) { 4727 while (!list_empty(&release_list)) {
4730 char *argv[3], *envp[3]; 4728 char *argv[3], *envp[3];
4731 int i; 4729 int i;
4732 char *pathbuf = NULL, *agentbuf = NULL; 4730 char *pathbuf = NULL, *agentbuf = NULL;
4733 struct cgroup *cgrp = list_entry(release_list.next, 4731 struct cgroup *cgrp = list_entry(release_list.next,
4734 struct cgroup, 4732 struct cgroup,
4735 release_list); 4733 release_list);
4736 list_del_init(&cgrp->release_list); 4734 list_del_init(&cgrp->release_list);
4737 spin_unlock(&release_list_lock); 4735 spin_unlock(&release_list_lock);
4738 pathbuf = kmalloc(PAGE_SIZE, GFP_KERNEL); 4736 pathbuf = kmalloc(PAGE_SIZE, GFP_KERNEL);
4739 if (!pathbuf) 4737 if (!pathbuf)
4740 goto continue_free; 4738 goto continue_free;
4741 if (cgroup_path(cgrp, pathbuf, PAGE_SIZE) < 0) 4739 if (cgroup_path(cgrp, pathbuf, PAGE_SIZE) < 0)
4742 goto continue_free; 4740 goto continue_free;
4743 agentbuf = kstrdup(cgrp->root->release_agent_path, GFP_KERNEL); 4741 agentbuf = kstrdup(cgrp->root->release_agent_path, GFP_KERNEL);
4744 if (!agentbuf) 4742 if (!agentbuf)
4745 goto continue_free; 4743 goto continue_free;
4746 4744
4747 i = 0; 4745 i = 0;
4748 argv[i++] = agentbuf; 4746 argv[i++] = agentbuf;
4749 argv[i++] = pathbuf; 4747 argv[i++] = pathbuf;
4750 argv[i] = NULL; 4748 argv[i] = NULL;
4751 4749
4752 i = 0; 4750 i = 0;
4753 /* minimal command environment */ 4751 /* minimal command environment */
4754 envp[i++] = "HOME=/"; 4752 envp[i++] = "HOME=/";
4755 envp[i++] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin"; 4753 envp[i++] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin";
4756 envp[i] = NULL; 4754 envp[i] = NULL;
4757 4755
4758 /* Drop the lock while we invoke the usermode helper, 4756 /* Drop the lock while we invoke the usermode helper,
4759 * since the exec could involve hitting disk and hence 4757 * since the exec could involve hitting disk and hence
4760 * be a slow process */ 4758 * be a slow process */
4761 mutex_unlock(&cgroup_mutex); 4759 mutex_unlock(&cgroup_mutex);
4762 call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC); 4760 call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC);
4763 mutex_lock(&cgroup_mutex); 4761 mutex_lock(&cgroup_mutex);
4764 continue_free: 4762 continue_free:
4765 kfree(pathbuf); 4763 kfree(pathbuf);
4766 kfree(agentbuf); 4764 kfree(agentbuf);
4767 spin_lock(&release_list_lock); 4765 spin_lock(&release_list_lock);
4768 } 4766 }
4769 spin_unlock(&release_list_lock); 4767 spin_unlock(&release_list_lock);
4770 mutex_unlock(&cgroup_mutex); 4768 mutex_unlock(&cgroup_mutex);
4771 } 4769 }
4772 4770
4773 static int __init cgroup_disable(char *str) 4771 static int __init cgroup_disable(char *str)
4774 { 4772 {
4775 int i; 4773 int i;
4776 char *token; 4774 char *token;
4777 4775
4778 while ((token = strsep(&str, ",")) != NULL) { 4776 while ((token = strsep(&str, ",")) != NULL) {
4779 if (!*token) 4777 if (!*token)
4780 continue; 4778 continue;
4781 /* 4779 /*
4782 * cgroup_disable, being at boot time, can't know about module 4780 * cgroup_disable, being at boot time, can't know about module
4783 * subsystems, so we don't worry about them. 4781 * subsystems, so we don't worry about them.
4784 */ 4782 */
4785 for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) { 4783 for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
4786 struct cgroup_subsys *ss = subsys[i]; 4784 struct cgroup_subsys *ss = subsys[i];
4787 4785
4788 if (!strcmp(token, ss->name)) { 4786 if (!strcmp(token, ss->name)) {
4789 ss->disabled = 1; 4787 ss->disabled = 1;
4790 printk(KERN_INFO "Disabling %s control group" 4788 printk(KERN_INFO "Disabling %s control group"
4791 " subsystem\n", ss->name); 4789 " subsystem\n", ss->name);
4792 break; 4790 break;
4793 } 4791 }
4794 } 4792 }
4795 } 4793 }
4796 return 1; 4794 return 1;
4797 } 4795 }
4798 __setup("cgroup_disable=", cgroup_disable); 4796 __setup("cgroup_disable=", cgroup_disable);
4799 4797
4800 /* 4798 /*
4801 * Functions for CSS ID. 4799 * Functions for CSS ID.
4802 */ 4800 */
4803 4801
4804 /* 4802 /*
4805 * To get ID other than 0, this should be called when !cgroup_is_removed(). 4803 * To get ID other than 0, this should be called when !cgroup_is_removed().
4806 */ 4804 */
4807 unsigned short css_id(struct cgroup_subsys_state *css) 4805 unsigned short css_id(struct cgroup_subsys_state *css)
4808 { 4806 {
4809 struct css_id *cssid; 4807 struct css_id *cssid;
4810 4808
4811 /* 4809 /*
4812 * This css_id() can return a correct value when someone has a refcnt 4810 * This css_id() can return a correct value when someone has a refcnt
4813 * on this or this is under rcu_read_lock(). Once css->id is allocated, 4811 * on this or this is under rcu_read_lock(). Once css->id is allocated,
4814 * it's unchanged until freed. 4812 * it's unchanged until freed.
4815 */ 4813 */
4816 cssid = rcu_dereference_check(css->id, 4814 cssid = rcu_dereference_check(css->id, atomic_read(&css->refcnt));
4817 rcu_read_lock_held() || atomic_read(&css->refcnt));
4818 4815
4819 if (cssid) 4816 if (cssid)
4820 return cssid->id; 4817 return cssid->id;
4821 return 0; 4818 return 0;
4822 } 4819 }
4823 EXPORT_SYMBOL_GPL(css_id); 4820 EXPORT_SYMBOL_GPL(css_id);
4824 4821
4825 unsigned short css_depth(struct cgroup_subsys_state *css) 4822 unsigned short css_depth(struct cgroup_subsys_state *css)
4826 { 4823 {
4827 struct css_id *cssid; 4824 struct css_id *cssid;
4828 4825
4829 cssid = rcu_dereference_check(css->id, 4826 cssid = rcu_dereference_check(css->id, atomic_read(&css->refcnt));
4830 rcu_read_lock_held() || atomic_read(&css->refcnt));
4831 4827
4832 if (cssid) 4828 if (cssid)
4833 return cssid->depth; 4829 return cssid->depth;
4834 return 0; 4830 return 0;
4835 } 4831 }
4836 EXPORT_SYMBOL_GPL(css_depth); 4832 EXPORT_SYMBOL_GPL(css_depth);
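The css_id()/css_depth() hunks above are the pattern this cleanup applies throughout: rcu_dereference_check() already folds in the rcu_read_lock_held() test, so a caller only needs to state the extra condition it really guarantees. A minimal sketch of the resulting idiom, with an invented struct foo and accessor used purely for illustration (not code from this patch):

	#include <linux/rcupdate.h>
	#include <linux/atomic.h>	/* assumption: older trees may need <asm/atomic.h> */

	struct bar;

	struct foo {
		struct bar __rcu *b;	/* RCU-protected pointer */
		atomic_t refcnt;	/* a held reference also keeps *b stable */
	};

	/*
	 * Valid either inside rcu_read_lock()/rcu_read_unlock(), which
	 * rcu_dereference_check() now verifies on its own, or while the
	 * caller holds a reference -- the only condition still spelled out.
	 */
	static struct bar *foo_bar(struct foo *f)
	{
		return rcu_dereference_check(f->b, atomic_read(&f->refcnt));
	}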
4837 4833
4838 /** 4834 /**
4839 * css_is_ancestor - test "root" css is an ancestor of "child" 4835 * css_is_ancestor - test "root" css is an ancestor of "child"
4840 * @child: the css to be tested. 4836 * @child: the css to be tested.
4841 * @root: the css supposed to be an ancestor of the child. 4837 * @root: the css supposed to be an ancestor of the child.
4842 * 4838 *
4843 * Returns true if "root" is an ancestor of "child" in its hierarchy. Because 4839 * Returns true if "root" is an ancestor of "child" in its hierarchy. Because
4844 * this function reads css->id, it uses rcu_dereference() and rcu_read_lock(). 4840 * this function reads css->id, it uses rcu_dereference() and rcu_read_lock().
4845 * But, considering usual usage, the csses should be valid objects after test. 4841 * But, considering usual usage, the csses should be valid objects after test.
4846 * Assuming that the caller will do some action to the child if this returns 4842 * Assuming that the caller will do some action to the child if this returns
4847 * true, the caller must take "child"'s reference count. 4843 * true, the caller must take "child"'s reference count.
4848 * If "child" is valid object and this returns true, "root" is valid, too. 4844 * If "child" is valid object and this returns true, "root" is valid, too.
4849 */ 4845 */
4850 4846
4851 bool css_is_ancestor(struct cgroup_subsys_state *child, 4847 bool css_is_ancestor(struct cgroup_subsys_state *child,
4852 const struct cgroup_subsys_state *root) 4848 const struct cgroup_subsys_state *root)
4853 { 4849 {
4854 struct css_id *child_id; 4850 struct css_id *child_id;
4855 struct css_id *root_id; 4851 struct css_id *root_id;
4856 bool ret = true; 4852 bool ret = true;
4857 4853
4858 rcu_read_lock(); 4854 rcu_read_lock();
4859 child_id = rcu_dereference(child->id); 4855 child_id = rcu_dereference(child->id);
4860 root_id = rcu_dereference(root->id); 4856 root_id = rcu_dereference(root->id);
4861 if (!child_id 4857 if (!child_id
4862 || !root_id 4858 || !root_id
4863 || (child_id->depth < root_id->depth) 4859 || (child_id->depth < root_id->depth)
4864 || (child_id->stack[root_id->depth] != root_id->id)) 4860 || (child_id->stack[root_id->depth] != root_id->id))
4865 ret = false; 4861 ret = false;
4866 rcu_read_unlock(); 4862 rcu_read_unlock();
4867 return ret; 4863 return ret;
4868 } 4864 }
4869 4865
4870 void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css) 4866 void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css)
4871 { 4867 {
4872 struct css_id *id = css->id; 4868 struct css_id *id = css->id;
4873 /* When this is called before css_id initialization, id can be NULL */ 4869 /* When this is called before css_id initialization, id can be NULL */
4874 if (!id) 4870 if (!id)
4875 return; 4871 return;
4876 4872
4877 BUG_ON(!ss->use_id); 4873 BUG_ON(!ss->use_id);
4878 4874
4879 rcu_assign_pointer(id->css, NULL); 4875 rcu_assign_pointer(id->css, NULL);
4880 rcu_assign_pointer(css->id, NULL); 4876 rcu_assign_pointer(css->id, NULL);
4881 spin_lock(&ss->id_lock); 4877 spin_lock(&ss->id_lock);
4882 idr_remove(&ss->idr, id->id); 4878 idr_remove(&ss->idr, id->id);
4883 spin_unlock(&ss->id_lock); 4879 spin_unlock(&ss->id_lock);
4884 kfree_rcu(id, rcu_head); 4880 kfree_rcu(id, rcu_head);
4885 } 4881 }
4886 EXPORT_SYMBOL_GPL(free_css_id); 4882 EXPORT_SYMBOL_GPL(free_css_id);
4887 4883
4888 /* 4884 /*
4889 * This is called by init or create(). Then, calls to this function are 4885 * This is called by init or create(). Then, calls to this function are
4890 * always serialized (By cgroup_mutex() at create()). 4886 * always serialized (By cgroup_mutex() at create()).
4891 */ 4887 */
4892 4888
4893 static struct css_id *get_new_cssid(struct cgroup_subsys *ss, int depth) 4889 static struct css_id *get_new_cssid(struct cgroup_subsys *ss, int depth)
4894 { 4890 {
4895 struct css_id *newid; 4891 struct css_id *newid;
4896 int myid, error, size; 4892 int myid, error, size;
4897 4893
4898 BUG_ON(!ss->use_id); 4894 BUG_ON(!ss->use_id);
4899 4895
4900 size = sizeof(*newid) + sizeof(unsigned short) * (depth + 1); 4896 size = sizeof(*newid) + sizeof(unsigned short) * (depth + 1);
4901 newid = kzalloc(size, GFP_KERNEL); 4897 newid = kzalloc(size, GFP_KERNEL);
4902 if (!newid) 4898 if (!newid)
4903 return ERR_PTR(-ENOMEM); 4899 return ERR_PTR(-ENOMEM);
4904 /* get id */ 4900 /* get id */
4905 if (unlikely(!idr_pre_get(&ss->idr, GFP_KERNEL))) { 4901 if (unlikely(!idr_pre_get(&ss->idr, GFP_KERNEL))) {
4906 error = -ENOMEM; 4902 error = -ENOMEM;
4907 goto err_out; 4903 goto err_out;
4908 } 4904 }
4909 spin_lock(&ss->id_lock); 4905 spin_lock(&ss->id_lock);
4910 /* Don't use 0. allocates an ID of 1-65535 */ 4906 /* Don't use 0. allocates an ID of 1-65535 */
4911 error = idr_get_new_above(&ss->idr, newid, 1, &myid); 4907 error = idr_get_new_above(&ss->idr, newid, 1, &myid);
4912 spin_unlock(&ss->id_lock); 4908 spin_unlock(&ss->id_lock);
4913 4909
4914 /* Returns error when there are no free spaces for new ID.*/ 4910 /* Returns error when there are no free spaces for new ID.*/
4915 if (error) { 4911 if (error) {
4916 error = -ENOSPC; 4912 error = -ENOSPC;
4917 goto err_out; 4913 goto err_out;
4918 } 4914 }
4919 if (myid > CSS_ID_MAX) 4915 if (myid > CSS_ID_MAX)
4920 goto remove_idr; 4916 goto remove_idr;
4921 4917
4922 newid->id = myid; 4918 newid->id = myid;
4923 newid->depth = depth; 4919 newid->depth = depth;
4924 return newid; 4920 return newid;
4925 remove_idr: 4921 remove_idr:
4926 error = -ENOSPC; 4922 error = -ENOSPC;
4927 spin_lock(&ss->id_lock); 4923 spin_lock(&ss->id_lock);
4928 idr_remove(&ss->idr, myid); 4924 idr_remove(&ss->idr, myid);
4929 spin_unlock(&ss->id_lock); 4925 spin_unlock(&ss->id_lock);
4930 err_out: 4926 err_out:
4931 kfree(newid); 4927 kfree(newid);
4932 return ERR_PTR(error); 4928 return ERR_PTR(error);
4933 4929
4934 } 4930 }
4935 4931
4936 static int __init_or_module cgroup_init_idr(struct cgroup_subsys *ss, 4932 static int __init_or_module cgroup_init_idr(struct cgroup_subsys *ss,
4937 struct cgroup_subsys_state *rootcss) 4933 struct cgroup_subsys_state *rootcss)
4938 { 4934 {
4939 struct css_id *newid; 4935 struct css_id *newid;
4940 4936
4941 spin_lock_init(&ss->id_lock); 4937 spin_lock_init(&ss->id_lock);
4942 idr_init(&ss->idr); 4938 idr_init(&ss->idr);
4943 4939
4944 newid = get_new_cssid(ss, 0); 4940 newid = get_new_cssid(ss, 0);
4945 if (IS_ERR(newid)) 4941 if (IS_ERR(newid))
4946 return PTR_ERR(newid); 4942 return PTR_ERR(newid);
4947 4943
4948 newid->stack[0] = newid->id; 4944 newid->stack[0] = newid->id;
4949 newid->css = rootcss; 4945 newid->css = rootcss;
4950 rootcss->id = newid; 4946 rootcss->id = newid;
4951 return 0; 4947 return 0;
4952 } 4948 }
4953 4949
4954 static int alloc_css_id(struct cgroup_subsys *ss, struct cgroup *parent, 4950 static int alloc_css_id(struct cgroup_subsys *ss, struct cgroup *parent,
4955 struct cgroup *child) 4951 struct cgroup *child)
4956 { 4952 {
4957 int subsys_id, i, depth = 0; 4953 int subsys_id, i, depth = 0;
4958 struct cgroup_subsys_state *parent_css, *child_css; 4954 struct cgroup_subsys_state *parent_css, *child_css;
4959 struct css_id *child_id, *parent_id; 4955 struct css_id *child_id, *parent_id;
4960 4956
4961 subsys_id = ss->subsys_id; 4957 subsys_id = ss->subsys_id;
4962 parent_css = parent->subsys[subsys_id]; 4958 parent_css = parent->subsys[subsys_id];
4963 child_css = child->subsys[subsys_id]; 4959 child_css = child->subsys[subsys_id];
4964 parent_id = parent_css->id; 4960 parent_id = parent_css->id;
4965 depth = parent_id->depth + 1; 4961 depth = parent_id->depth + 1;
4966 4962
4967 child_id = get_new_cssid(ss, depth); 4963 child_id = get_new_cssid(ss, depth);
4968 if (IS_ERR(child_id)) 4964 if (IS_ERR(child_id))
4969 return PTR_ERR(child_id); 4965 return PTR_ERR(child_id);
4970 4966
4971 for (i = 0; i < depth; i++) 4967 for (i = 0; i < depth; i++)
4972 child_id->stack[i] = parent_id->stack[i]; 4968 child_id->stack[i] = parent_id->stack[i];
4973 child_id->stack[depth] = child_id->id; 4969 child_id->stack[depth] = child_id->id;
4974 /* 4970 /*
4975 * child_id->css pointer will be set after this cgroup is available 4971 * child_id->css pointer will be set after this cgroup is available
4976 * see cgroup_populate_dir() 4972 * see cgroup_populate_dir()
4977 */ 4973 */
4978 rcu_assign_pointer(child_css->id, child_id); 4974 rcu_assign_pointer(child_css->id, child_id);
4979 4975
4980 return 0; 4976 return 0;
4981 } 4977 }
4982 4978
4983 /** 4979 /**
4984 * css_lookup - lookup css by id 4980 * css_lookup - lookup css by id
4985 * @ss: cgroup subsys to be looked into. 4981 * @ss: cgroup subsys to be looked into.
4986 * @id: the id 4982 * @id: the id
4987 * 4983 *
4988 * Returns pointer to cgroup_subsys_state if there is valid one with id. 4984 * Returns pointer to cgroup_subsys_state if there is valid one with id.
4989 * NULL if not. Should be called under rcu_read_lock() 4985 * NULL if not. Should be called under rcu_read_lock()
4990 */ 4986 */
4991 struct cgroup_subsys_state *css_lookup(struct cgroup_subsys *ss, int id) 4987 struct cgroup_subsys_state *css_lookup(struct cgroup_subsys *ss, int id)
4992 { 4988 {
4993 struct css_id *cssid = NULL; 4989 struct css_id *cssid = NULL;
4994 4990
4995 BUG_ON(!ss->use_id); 4991 BUG_ON(!ss->use_id);
4996 cssid = idr_find(&ss->idr, id); 4992 cssid = idr_find(&ss->idr, id);
4997 4993
4998 if (unlikely(!cssid)) 4994 if (unlikely(!cssid))
4999 return NULL; 4995 return NULL;
5000 4996
5001 return rcu_dereference(cssid->css); 4997 return rcu_dereference(cssid->css);
5002 } 4998 }
5003 EXPORT_SYMBOL_GPL(css_lookup); 4999 EXPORT_SYMBOL_GPL(css_lookup);
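As the kernel-doc above notes, css_lookup() must run under rcu_read_lock(): the rcu_dereference() inside it assumes a read-side critical section and, with CONFIG_PROVE_RCU, lockdep will complain otherwise. A hypothetical caller, shown only to illustrate the expected bracketing (my_subsys and id are stand-ins, not names from this patch):

	struct cgroup_subsys_state *css;

	rcu_read_lock();
	css = css_lookup(&my_subsys, id);
	if (css) {
		/* Use css here, or css_get(css) before dropping the
		 * read-side critical section if it must outlive it. */
	}
	rcu_read_unlock();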
5004 5000
5005 /** 5001 /**
5006 * css_get_next - lookup next cgroup under specified hierarchy. 5002 * css_get_next - lookup next cgroup under specified hierarchy.
5007 * @ss: pointer to subsystem 5003 * @ss: pointer to subsystem
5008 * @id: current position of iteration. 5004 * @id: current position of iteration.
5009 * @root: pointer to css. search tree under this. 5005 * @root: pointer to css. search tree under this.
5010 * @foundid: position of found object. 5006 * @foundid: position of found object.
5011 * 5007 *
5012 * Search next css under the specified hierarchy of rootid. Calling under 5008 * Search next css under the specified hierarchy of rootid. Calling under
5013 * rcu_read_lock() is necessary. Returns NULL if it reaches the end. 5009 * rcu_read_lock() is necessary. Returns NULL if it reaches the end.
5014 */ 5010 */
5015 struct cgroup_subsys_state * 5011 struct cgroup_subsys_state *
5016 css_get_next(struct cgroup_subsys *ss, int id, 5012 css_get_next(struct cgroup_subsys *ss, int id,
5017 struct cgroup_subsys_state *root, int *foundid) 5013 struct cgroup_subsys_state *root, int *foundid)
5018 { 5014 {
5019 struct cgroup_subsys_state *ret = NULL; 5015 struct cgroup_subsys_state *ret = NULL;
5020 struct css_id *tmp; 5016 struct css_id *tmp;
5021 int tmpid; 5017 int tmpid;
5022 int rootid = css_id(root); 5018 int rootid = css_id(root);
5023 int depth = css_depth(root); 5019 int depth = css_depth(root);
5024 5020
5025 if (!rootid) 5021 if (!rootid)
5026 return NULL; 5022 return NULL;
5027 5023
5028 BUG_ON(!ss->use_id); 5024 BUG_ON(!ss->use_id);
5029 /* fill start point for scan */ 5025 /* fill start point for scan */
5030 tmpid = id; 5026 tmpid = id;
5031 while (1) { 5027 while (1) {
5032 /* 5028 /*
5033 * scan next entry from bitmap(tree), tmpid is updated after 5029 * scan next entry from bitmap(tree), tmpid is updated after
5034 * idr_get_next(). 5030 * idr_get_next().
5035 */ 5031 */
5036 spin_lock(&ss->id_lock); 5032 spin_lock(&ss->id_lock);
5037 tmp = idr_get_next(&ss->idr, &tmpid); 5033 tmp = idr_get_next(&ss->idr, &tmpid);
5038 spin_unlock(&ss->id_lock); 5034 spin_unlock(&ss->id_lock);
5039 5035
5040 if (!tmp) 5036 if (!tmp)
5041 break; 5037 break;
5042 if (tmp->depth >= depth && tmp->stack[depth] == rootid) { 5038 if (tmp->depth >= depth && tmp->stack[depth] == rootid) {
5043 ret = rcu_dereference(tmp->css); 5039 ret = rcu_dereference(tmp->css);
5044 if (ret) { 5040 if (ret) {
5045 *foundid = tmpid; 5041 *foundid = tmpid;
5046 break; 5042 break;
5047 } 5043 }
5048 } 5044 }
5049 /* continue to scan from next id */ 5045 /* continue to scan from next id */
5050 tmpid = tmpid + 1; 5046 tmpid = tmpid + 1;
5051 } 5047 }
5052 return ret; 5048 return ret;
5053 } 5049 }
5054 5050
5055 /* 5051 /*
5056 * get corresponding css from file open on cgroupfs directory 5052 * get corresponding css from file open on cgroupfs directory
5057 */ 5053 */
5058 struct cgroup_subsys_state *cgroup_css_from_dir(struct file *f, int id) 5054 struct cgroup_subsys_state *cgroup_css_from_dir(struct file *f, int id)
5059 { 5055 {
5060 struct cgroup *cgrp; 5056 struct cgroup *cgrp;
5061 struct inode *inode; 5057 struct inode *inode;
5062 struct cgroup_subsys_state *css; 5058 struct cgroup_subsys_state *css;
5063 5059
5064 inode = f->f_dentry->d_inode; 5060 inode = f->f_dentry->d_inode;
5065 /* check in cgroup filesystem dir */ 5061 /* check in cgroup filesystem dir */
5066 if (inode->i_op != &cgroup_dir_inode_operations) 5062 if (inode->i_op != &cgroup_dir_inode_operations)
5067 return ERR_PTR(-EBADF); 5063 return ERR_PTR(-EBADF);
5068 5064
5069 if (id < 0 || id >= CGROUP_SUBSYS_COUNT) 5065 if (id < 0 || id >= CGROUP_SUBSYS_COUNT)
5070 return ERR_PTR(-EINVAL); 5066 return ERR_PTR(-EINVAL);
5071 5067
5072 /* get cgroup */ 5068 /* get cgroup */
5073 cgrp = __d_cgrp(f->f_dentry); 5069 cgrp = __d_cgrp(f->f_dentry);
5074 css = cgrp->subsys[id]; 5070 css = cgrp->subsys[id];
5075 return css ? css : ERR_PTR(-ENOENT); 5071 return css ? css : ERR_PTR(-ENOENT);
5076 } 5072 }
5077 5073
5078 #ifdef CONFIG_CGROUP_DEBUG 5074 #ifdef CONFIG_CGROUP_DEBUG
5079 static struct cgroup_subsys_state *debug_create(struct cgroup_subsys *ss, 5075 static struct cgroup_subsys_state *debug_create(struct cgroup_subsys *ss,
5080 struct cgroup *cont) 5076 struct cgroup *cont)
5081 { 5077 {
5082 struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL); 5078 struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL);
5083 5079
5084 if (!css) 5080 if (!css)
5085 return ERR_PTR(-ENOMEM); 5081 return ERR_PTR(-ENOMEM);
5086 5082
5087 return css; 5083 return css;
5088 } 5084 }
5089 5085
5090 static void debug_destroy(struct cgroup_subsys *ss, struct cgroup *cont) 5086 static void debug_destroy(struct cgroup_subsys *ss, struct cgroup *cont)
5091 { 5087 {
5092 kfree(cont->subsys[debug_subsys_id]); 5088 kfree(cont->subsys[debug_subsys_id]);
5093 } 5089 }
5094 5090
5095 static u64 cgroup_refcount_read(struct cgroup *cont, struct cftype *cft) 5091 static u64 cgroup_refcount_read(struct cgroup *cont, struct cftype *cft)
5096 { 5092 {
5097 return atomic_read(&cont->count); 5093 return atomic_read(&cont->count);
5098 } 5094 }
5099 5095
5100 static u64 debug_taskcount_read(struct cgroup *cont, struct cftype *cft) 5096 static u64 debug_taskcount_read(struct cgroup *cont, struct cftype *cft)
5101 { 5097 {
5102 return cgroup_task_count(cont); 5098 return cgroup_task_count(cont);
5103 } 5099 }
5104 5100
5105 static u64 current_css_set_read(struct cgroup *cont, struct cftype *cft) 5101 static u64 current_css_set_read(struct cgroup *cont, struct cftype *cft)
5106 { 5102 {
5107 return (u64)(unsigned long)current->cgroups; 5103 return (u64)(unsigned long)current->cgroups;
5108 } 5104 }
5109 5105
5110 static u64 current_css_set_refcount_read(struct cgroup *cont, 5106 static u64 current_css_set_refcount_read(struct cgroup *cont,
5111 struct cftype *cft) 5107 struct cftype *cft)
5112 { 5108 {
5113 u64 count; 5109 u64 count;
5114 5110
5115 rcu_read_lock(); 5111 rcu_read_lock();
5116 count = atomic_read(&current->cgroups->refcount); 5112 count = atomic_read(&current->cgroups->refcount);
5117 rcu_read_unlock(); 5113 rcu_read_unlock();
5118 return count; 5114 return count;
5119 } 5115 }
5120 5116
5121 static int current_css_set_cg_links_read(struct cgroup *cont, 5117 static int current_css_set_cg_links_read(struct cgroup *cont,
5122 struct cftype *cft, 5118 struct cftype *cft,
5123 struct seq_file *seq) 5119 struct seq_file *seq)
5124 { 5120 {
5125 struct cg_cgroup_link *link; 5121 struct cg_cgroup_link *link;
5126 struct css_set *cg; 5122 struct css_set *cg;
5127 5123
5128 read_lock(&css_set_lock); 5124 read_lock(&css_set_lock);
5129 rcu_read_lock(); 5125 rcu_read_lock();
5130 cg = rcu_dereference(current->cgroups); 5126 cg = rcu_dereference(current->cgroups);
5131 list_for_each_entry(link, &cg->cg_links, cg_link_list) { 5127 list_for_each_entry(link, &cg->cg_links, cg_link_list) {
5132 struct cgroup *c = link->cgrp; 5128 struct cgroup *c = link->cgrp;
5133 const char *name; 5129 const char *name;
5134 5130
5135 if (c->dentry) 5131 if (c->dentry)
5136 name = c->dentry->d_name.name; 5132 name = c->dentry->d_name.name;
5137 else 5133 else
5138 name = "?"; 5134 name = "?";
5139 seq_printf(seq, "Root %d group %s\n", 5135 seq_printf(seq, "Root %d group %s\n",
5140 c->root->hierarchy_id, name); 5136 c->root->hierarchy_id, name);
5141 } 5137 }
5142 rcu_read_unlock(); 5138 rcu_read_unlock();
5143 read_unlock(&css_set_lock); 5139 read_unlock(&css_set_lock);
5144 return 0; 5140 return 0;
5145 } 5141 }
5146 5142
5147 #define MAX_TASKS_SHOWN_PER_CSS 25 5143 #define MAX_TASKS_SHOWN_PER_CSS 25
5148 static int cgroup_css_links_read(struct cgroup *cont, 5144 static int cgroup_css_links_read(struct cgroup *cont,
5149 struct cftype *cft, 5145 struct cftype *cft,
5150 struct seq_file *seq) 5146 struct seq_file *seq)
5151 { 5147 {
5152 struct cg_cgroup_link *link; 5148 struct cg_cgroup_link *link;
5153 5149
5154 read_lock(&css_set_lock); 5150 read_lock(&css_set_lock);
5155 list_for_each_entry(link, &cont->css_sets, cgrp_link_list) { 5151 list_for_each_entry(link, &cont->css_sets, cgrp_link_list) {
5156 struct css_set *cg = link->cg; 5152 struct css_set *cg = link->cg;
5157 struct task_struct *task; 5153 struct task_struct *task;
5158 int count = 0; 5154 int count = 0;
5159 seq_printf(seq, "css_set %p\n", cg); 5155 seq_printf(seq, "css_set %p\n", cg);
5160 list_for_each_entry(task, &cg->tasks, cg_list) { 5156 list_for_each_entry(task, &cg->tasks, cg_list) {
5161 if (count++ > MAX_TASKS_SHOWN_PER_CSS) { 5157 if (count++ > MAX_TASKS_SHOWN_PER_CSS) {
5162 seq_puts(seq, " ...\n"); 5158 seq_puts(seq, " ...\n");
5163 break; 5159 break;
5164 } else { 5160 } else {
5165 seq_printf(seq, " task %d\n", 5161 seq_printf(seq, " task %d\n",
5166 task_pid_vnr(task)); 5162 task_pid_vnr(task));
5167 } 5163 }
5168 } 5164 }
5169 } 5165 }
5170 read_unlock(&css_set_lock); 5166 read_unlock(&css_set_lock);
5171 return 0; 5167 return 0;
5172 } 5168 }
5173 5169
5174 static u64 releasable_read(struct cgroup *cgrp, struct cftype *cft) 5170 static u64 releasable_read(struct cgroup *cgrp, struct cftype *cft)
5175 { 5171 {
5176 return test_bit(CGRP_RELEASABLE, &cgrp->flags); 5172 return test_bit(CGRP_RELEASABLE, &cgrp->flags);
5177 } 5173 }
5178 5174
5179 static struct cftype debug_files[] = { 5175 static struct cftype debug_files[] = {
5180 { 5176 {
5181 .name = "cgroup_refcount", 5177 .name = "cgroup_refcount",
5182 .read_u64 = cgroup_refcount_read, 5178 .read_u64 = cgroup_refcount_read,
5183 }, 5179 },
5184 { 5180 {
5185 .name = "taskcount", 5181 .name = "taskcount",
5186 .read_u64 = debug_taskcount_read, 5182 .read_u64 = debug_taskcount_read,
5187 }, 5183 },
5188 5184
5189 { 5185 {
5190 .name = "current_css_set", 5186 .name = "current_css_set",
5191 .read_u64 = current_css_set_read, 5187 .read_u64 = current_css_set_read,
5192 }, 5188 },
5193 5189
5194 { 5190 {
5195 .name = "current_css_set_refcount", 5191 .name = "current_css_set_refcount",
5196 .read_u64 = current_css_set_refcount_read, 5192 .read_u64 = current_css_set_refcount_read,
5197 }, 5193 },
5198 5194
5199 { 5195 {
5200 .name = "current_css_set_cg_links", 5196 .name = "current_css_set_cg_links",
5201 .read_seq_string = current_css_set_cg_links_read, 5197 .read_seq_string = current_css_set_cg_links_read,
5202 }, 5198 },
5203 5199
5204 { 5200 {
5205 .name = "cgroup_css_links", 5201 .name = "cgroup_css_links",
5206 .read_seq_string = cgroup_css_links_read, 5202 .read_seq_string = cgroup_css_links_read,
5207 }, 5203 },
5208 5204
5209 { 5205 {
5210 .name = "releasable", 5206 .name = "releasable",
5211 .read_u64 = releasable_read, 5207 .read_u64 = releasable_read,
5212 }, 5208 },
5213 }; 5209 };
5214 5210
5215 static int debug_populate(struct cgroup_subsys *ss, struct cgroup *cont) 5211 static int debug_populate(struct cgroup_subsys *ss, struct cgroup *cont)
5216 { 5212 {
5217 return cgroup_add_files(cont, ss, debug_files, 5213 return cgroup_add_files(cont, ss, debug_files,
5218 ARRAY_SIZE(debug_files)); 5214 ARRAY_SIZE(debug_files));
5219 } 5215 }
5220 5216
5221 struct cgroup_subsys debug_subsys = { 5217 struct cgroup_subsys debug_subsys = {
5222 .name = "debug", 5218 .name = "debug",
5223 .create = debug_create, 5219 .create = debug_create,
5224 .destroy = debug_destroy, 5220 .destroy = debug_destroy,
5225 .populate = debug_populate, 5221 .populate = debug_populate,
5226 .subsys_id = debug_subsys_id, 5222 .subsys_id = debug_subsys_id,
5227 }; 5223 };
5228 #endif /* CONFIG_CGROUP_DEBUG */ 5224 #endif /* CONFIG_CGROUP_DEBUG */
5229 5225
kernel/exit.c
1 /* 1 /*
2 * linux/kernel/exit.c 2 * linux/kernel/exit.c
3 * 3 *
4 * Copyright (C) 1991, 1992 Linus Torvalds 4 * Copyright (C) 1991, 1992 Linus Torvalds
5 */ 5 */
6 6
7 #include <linux/mm.h> 7 #include <linux/mm.h>
8 #include <linux/slab.h> 8 #include <linux/slab.h>
9 #include <linux/interrupt.h> 9 #include <linux/interrupt.h>
10 #include <linux/module.h> 10 #include <linux/module.h>
11 #include <linux/capability.h> 11 #include <linux/capability.h>
12 #include <linux/completion.h> 12 #include <linux/completion.h>
13 #include <linux/personality.h> 13 #include <linux/personality.h>
14 #include <linux/tty.h> 14 #include <linux/tty.h>
15 #include <linux/iocontext.h> 15 #include <linux/iocontext.h>
16 #include <linux/key.h> 16 #include <linux/key.h>
17 #include <linux/security.h> 17 #include <linux/security.h>
18 #include <linux/cpu.h> 18 #include <linux/cpu.h>
19 #include <linux/acct.h> 19 #include <linux/acct.h>
20 #include <linux/tsacct_kern.h> 20 #include <linux/tsacct_kern.h>
21 #include <linux/file.h> 21 #include <linux/file.h>
22 #include <linux/fdtable.h> 22 #include <linux/fdtable.h>
23 #include <linux/binfmts.h> 23 #include <linux/binfmts.h>
24 #include <linux/nsproxy.h> 24 #include <linux/nsproxy.h>
25 #include <linux/pid_namespace.h> 25 #include <linux/pid_namespace.h>
26 #include <linux/ptrace.h> 26 #include <linux/ptrace.h>
27 #include <linux/profile.h> 27 #include <linux/profile.h>
28 #include <linux/mount.h> 28 #include <linux/mount.h>
29 #include <linux/proc_fs.h> 29 #include <linux/proc_fs.h>
30 #include <linux/kthread.h> 30 #include <linux/kthread.h>
31 #include <linux/mempolicy.h> 31 #include <linux/mempolicy.h>
32 #include <linux/taskstats_kern.h> 32 #include <linux/taskstats_kern.h>
33 #include <linux/delayacct.h> 33 #include <linux/delayacct.h>
34 #include <linux/freezer.h> 34 #include <linux/freezer.h>
35 #include <linux/cgroup.h> 35 #include <linux/cgroup.h>
36 #include <linux/syscalls.h> 36 #include <linux/syscalls.h>
37 #include <linux/signal.h> 37 #include <linux/signal.h>
38 #include <linux/posix-timers.h> 38 #include <linux/posix-timers.h>
39 #include <linux/cn_proc.h> 39 #include <linux/cn_proc.h>
40 #include <linux/mutex.h> 40 #include <linux/mutex.h>
41 #include <linux/futex.h> 41 #include <linux/futex.h>
42 #include <linux/pipe_fs_i.h> 42 #include <linux/pipe_fs_i.h>
43 #include <linux/audit.h> /* for audit_free() */ 43 #include <linux/audit.h> /* for audit_free() */
44 #include <linux/resource.h> 44 #include <linux/resource.h>
45 #include <linux/blkdev.h> 45 #include <linux/blkdev.h>
46 #include <linux/task_io_accounting_ops.h> 46 #include <linux/task_io_accounting_ops.h>
47 #include <linux/tracehook.h> 47 #include <linux/tracehook.h>
48 #include <linux/fs_struct.h> 48 #include <linux/fs_struct.h>
49 #include <linux/init_task.h> 49 #include <linux/init_task.h>
50 #include <linux/perf_event.h> 50 #include <linux/perf_event.h>
51 #include <trace/events/sched.h> 51 #include <trace/events/sched.h>
52 #include <linux/hw_breakpoint.h> 52 #include <linux/hw_breakpoint.h>
53 #include <linux/oom.h> 53 #include <linux/oom.h>
54 54
55 #include <asm/uaccess.h> 55 #include <asm/uaccess.h>
56 #include <asm/unistd.h> 56 #include <asm/unistd.h>
57 #include <asm/pgtable.h> 57 #include <asm/pgtable.h>
58 #include <asm/mmu_context.h> 58 #include <asm/mmu_context.h>
59 59
60 static void exit_mm(struct task_struct * tsk); 60 static void exit_mm(struct task_struct * tsk);
61 61
62 static void __unhash_process(struct task_struct *p, bool group_dead) 62 static void __unhash_process(struct task_struct *p, bool group_dead)
63 { 63 {
64 nr_threads--; 64 nr_threads--;
65 detach_pid(p, PIDTYPE_PID); 65 detach_pid(p, PIDTYPE_PID);
66 if (group_dead) { 66 if (group_dead) {
67 detach_pid(p, PIDTYPE_PGID); 67 detach_pid(p, PIDTYPE_PGID);
68 detach_pid(p, PIDTYPE_SID); 68 detach_pid(p, PIDTYPE_SID);
69 69
70 list_del_rcu(&p->tasks); 70 list_del_rcu(&p->tasks);
71 list_del_init(&p->sibling); 71 list_del_init(&p->sibling);
72 __this_cpu_dec(process_counts); 72 __this_cpu_dec(process_counts);
73 } 73 }
74 list_del_rcu(&p->thread_group); 74 list_del_rcu(&p->thread_group);
75 } 75 }
76 76
77 /* 77 /*
78 * This function expects the tasklist_lock write-locked. 78 * This function expects the tasklist_lock write-locked.
79 */ 79 */
80 static void __exit_signal(struct task_struct *tsk) 80 static void __exit_signal(struct task_struct *tsk)
81 { 81 {
82 struct signal_struct *sig = tsk->signal; 82 struct signal_struct *sig = tsk->signal;
83 bool group_dead = thread_group_leader(tsk); 83 bool group_dead = thread_group_leader(tsk);
84 struct sighand_struct *sighand; 84 struct sighand_struct *sighand;
85 struct tty_struct *uninitialized_var(tty); 85 struct tty_struct *uninitialized_var(tty);
86 86
87 sighand = rcu_dereference_check(tsk->sighand, 87 sighand = rcu_dereference_check(tsk->sighand,
88 rcu_read_lock_held() ||
89 lockdep_tasklist_lock_is_held()); 88 lockdep_tasklist_lock_is_held());
90 spin_lock(&sighand->siglock); 89 spin_lock(&sighand->siglock);
91 90
92 posix_cpu_timers_exit(tsk); 91 posix_cpu_timers_exit(tsk);
93 if (group_dead) { 92 if (group_dead) {
94 posix_cpu_timers_exit_group(tsk); 93 posix_cpu_timers_exit_group(tsk);
95 tty = sig->tty; 94 tty = sig->tty;
96 sig->tty = NULL; 95 sig->tty = NULL;
97 } else { 96 } else {
98 /* 97 /*
99 * This can only happen if the caller is de_thread(). 98 * This can only happen if the caller is de_thread().
100 * FIXME: this is the temporary hack, we should teach 99 * FIXME: this is the temporary hack, we should teach
101 * posix-cpu-timers to handle this case correctly. 100 * posix-cpu-timers to handle this case correctly.
102 */ 101 */
103 if (unlikely(has_group_leader_pid(tsk))) 102 if (unlikely(has_group_leader_pid(tsk)))
104 posix_cpu_timers_exit_group(tsk); 103 posix_cpu_timers_exit_group(tsk);
105 104
106 /* 105 /*
107 * If there is any task waiting for the group exit 106 * If there is any task waiting for the group exit
108 * then notify it: 107 * then notify it:
109 */ 108 */
110 if (sig->notify_count > 0 && !--sig->notify_count) 109 if (sig->notify_count > 0 && !--sig->notify_count)
111 wake_up_process(sig->group_exit_task); 110 wake_up_process(sig->group_exit_task);
112 111
113 if (tsk == sig->curr_target) 112 if (tsk == sig->curr_target)
114 sig->curr_target = next_thread(tsk); 113 sig->curr_target = next_thread(tsk);
115 /* 114 /*
116 * Accumulate here the counters for all threads but the 115 * Accumulate here the counters for all threads but the
117 * group leader as they die, so they can be added into 116 * group leader as they die, so they can be added into
118 * the process-wide totals when those are taken. 117 * the process-wide totals when those are taken.
119 * The group leader stays around as a zombie as long 118 * The group leader stays around as a zombie as long
120 * as there are other threads. When it gets reaped, 119 * as there are other threads. When it gets reaped,
121 * the exit.c code will add its counts into these totals. 120 * the exit.c code will add its counts into these totals.
122 * We won't ever get here for the group leader, since it 121 * We won't ever get here for the group leader, since it
123 * will have been the last reference on the signal_struct. 122 * will have been the last reference on the signal_struct.
124 */ 123 */
125 sig->utime = cputime_add(sig->utime, tsk->utime); 124 sig->utime = cputime_add(sig->utime, tsk->utime);
126 sig->stime = cputime_add(sig->stime, tsk->stime); 125 sig->stime = cputime_add(sig->stime, tsk->stime);
127 sig->gtime = cputime_add(sig->gtime, tsk->gtime); 126 sig->gtime = cputime_add(sig->gtime, tsk->gtime);
128 sig->min_flt += tsk->min_flt; 127 sig->min_flt += tsk->min_flt;
129 sig->maj_flt += tsk->maj_flt; 128 sig->maj_flt += tsk->maj_flt;
130 sig->nvcsw += tsk->nvcsw; 129 sig->nvcsw += tsk->nvcsw;
131 sig->nivcsw += tsk->nivcsw; 130 sig->nivcsw += tsk->nivcsw;
132 sig->inblock += task_io_get_inblock(tsk); 131 sig->inblock += task_io_get_inblock(tsk);
133 sig->oublock += task_io_get_oublock(tsk); 132 sig->oublock += task_io_get_oublock(tsk);
134 task_io_accounting_add(&sig->ioac, &tsk->ioac); 133 task_io_accounting_add(&sig->ioac, &tsk->ioac);
135 sig->sum_sched_runtime += tsk->se.sum_exec_runtime; 134 sig->sum_sched_runtime += tsk->se.sum_exec_runtime;
136 } 135 }
137 136
138 sig->nr_threads--; 137 sig->nr_threads--;
139 __unhash_process(tsk, group_dead); 138 __unhash_process(tsk, group_dead);
140 139
141 /* 140 /*
142 * Do this under ->siglock, we can race with another thread 141 * Do this under ->siglock, we can race with another thread
143 * doing sigqueue_free() if we have SIGQUEUE_PREALLOC signals. 142 * doing sigqueue_free() if we have SIGQUEUE_PREALLOC signals.
144 */ 143 */
145 flush_sigqueue(&tsk->pending); 144 flush_sigqueue(&tsk->pending);
146 tsk->sighand = NULL; 145 tsk->sighand = NULL;
147 spin_unlock(&sighand->siglock); 146 spin_unlock(&sighand->siglock);
148 147
149 __cleanup_sighand(sighand); 148 __cleanup_sighand(sighand);
150 clear_tsk_thread_flag(tsk,TIF_SIGPENDING); 149 clear_tsk_thread_flag(tsk,TIF_SIGPENDING);
151 if (group_dead) { 150 if (group_dead) {
152 flush_sigqueue(&sig->shared_pending); 151 flush_sigqueue(&sig->shared_pending);
153 tty_kref_put(tty); 152 tty_kref_put(tty);
154 } 153 }
155 } 154 }
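The __exit_signal() hunk above makes the same point as the cgroup.c changes: the function is entered with tasklist_lock write-locked, so lockdep_tasklist_lock_is_held() is the only condition worth passing, and the RCU read-side case is covered by rcu_dereference_check() itself. A sketch of a helper expressing that same protection contract (sighand_of() is a hypothetical name, not a kernel function):

	#include <linux/sched.h>
	#include <linux/rcupdate.h>

	/*
	 * Sketch only: ->sighand may be dereferenced under rcu_read_lock()
	 * (checked implicitly by rcu_dereference_check()) or with
	 * tasklist_lock held, which is exactly the condition kept above.
	 */
	static struct sighand_struct *sighand_of(struct task_struct *tsk)
	{
		return rcu_dereference_check(tsk->sighand,
					     lockdep_tasklist_lock_is_held());
	}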
156 155
157 static void delayed_put_task_struct(struct rcu_head *rhp) 156 static void delayed_put_task_struct(struct rcu_head *rhp)
158 { 157 {
159 struct task_struct *tsk = container_of(rhp, struct task_struct, rcu); 158 struct task_struct *tsk = container_of(rhp, struct task_struct, rcu);
160 159
161 perf_event_delayed_put(tsk); 160 perf_event_delayed_put(tsk);
162 trace_sched_process_free(tsk); 161 trace_sched_process_free(tsk);
163 put_task_struct(tsk); 162 put_task_struct(tsk);
164 } 163 }
165 164
166 165
167 void release_task(struct task_struct * p) 166 void release_task(struct task_struct * p)
168 { 167 {
169 struct task_struct *leader; 168 struct task_struct *leader;
170 int zap_leader; 169 int zap_leader;
171 repeat: 170 repeat:
172 tracehook_prepare_release_task(p); 171 tracehook_prepare_release_task(p);
173 /* don't need to get the RCU readlock here - the process is dead and 172 /* don't need to get the RCU readlock here - the process is dead and
174 * can't be modifying its own credentials. But shut RCU-lockdep up */ 173 * can't be modifying its own credentials. But shut RCU-lockdep up */
175 rcu_read_lock(); 174 rcu_read_lock();
176 atomic_dec(&__task_cred(p)->user->processes); 175 atomic_dec(&__task_cred(p)->user->processes);
177 rcu_read_unlock(); 176 rcu_read_unlock();
178 177
179 proc_flush_task(p); 178 proc_flush_task(p);
180 179
181 write_lock_irq(&tasklist_lock); 180 write_lock_irq(&tasklist_lock);
182 tracehook_finish_release_task(p); 181 tracehook_finish_release_task(p);
183 __exit_signal(p); 182 __exit_signal(p);
184 183
185 /* 184 /*
186 * If we are the last non-leader member of the thread 185 * If we are the last non-leader member of the thread
187 * group, and the leader is zombie, then notify the 186 * group, and the leader is zombie, then notify the
188 * group leader's parent process. (if it wants notification.) 187 * group leader's parent process. (if it wants notification.)
189 */ 188 */
190 zap_leader = 0; 189 zap_leader = 0;
191 leader = p->group_leader; 190 leader = p->group_leader;
192 if (leader != p && thread_group_empty(leader) && leader->exit_state == EXIT_ZOMBIE) { 191 if (leader != p && thread_group_empty(leader) && leader->exit_state == EXIT_ZOMBIE) {
193 BUG_ON(task_detached(leader)); 192 BUG_ON(task_detached(leader));
194 do_notify_parent(leader, leader->exit_signal); 193 do_notify_parent(leader, leader->exit_signal);
195 /* 194 /*
196 * If we were the last child thread and the leader has 195 * If we were the last child thread and the leader has
197 * exited already, and the leader's parent ignores SIGCHLD, 196 * exited already, and the leader's parent ignores SIGCHLD,
198 * then we are the one who should release the leader. 197 * then we are the one who should release the leader.
199 * 198 *
200 * do_notify_parent() will have marked it self-reaping in 199 * do_notify_parent() will have marked it self-reaping in
201 * that case. 200 * that case.
202 */ 201 */
203 zap_leader = task_detached(leader); 202 zap_leader = task_detached(leader);
204 203
205 /* 204 /*
206 * This maintains the invariant that release_task() 205 * This maintains the invariant that release_task()
207 * only runs on a task in EXIT_DEAD, just for sanity. 206 * only runs on a task in EXIT_DEAD, just for sanity.
208 */ 207 */
209 if (zap_leader) 208 if (zap_leader)
210 leader->exit_state = EXIT_DEAD; 209 leader->exit_state = EXIT_DEAD;
211 } 210 }
212 211
213 write_unlock_irq(&tasklist_lock); 212 write_unlock_irq(&tasklist_lock);
214 release_thread(p); 213 release_thread(p);
215 call_rcu(&p->rcu, delayed_put_task_struct); 214 call_rcu(&p->rcu, delayed_put_task_struct);
216 215
217 p = leader; 216 p = leader;
218 if (unlikely(zap_leader)) 217 if (unlikely(zap_leader))
219 goto repeat; 218 goto repeat;
220 } 219 }
221 220
222 /* 221 /*
223 * This checks not only the pgrp, but falls back on the pid if no 222 * This checks not only the pgrp, but falls back on the pid if no
224 * satisfactory pgrp is found. I dunno - gdb doesn't work correctly 223 * satisfactory pgrp is found. I dunno - gdb doesn't work correctly
225 * without this... 224 * without this...
226 * 225 *
227 * The caller must hold rcu lock or the tasklist lock. 226 * The caller must hold rcu lock or the tasklist lock.
228 */ 227 */
229 struct pid *session_of_pgrp(struct pid *pgrp) 228 struct pid *session_of_pgrp(struct pid *pgrp)
230 { 229 {
231 struct task_struct *p; 230 struct task_struct *p;
232 struct pid *sid = NULL; 231 struct pid *sid = NULL;
233 232
234 p = pid_task(pgrp, PIDTYPE_PGID); 233 p = pid_task(pgrp, PIDTYPE_PGID);
235 if (p == NULL) 234 if (p == NULL)
236 p = pid_task(pgrp, PIDTYPE_PID); 235 p = pid_task(pgrp, PIDTYPE_PID);
237 if (p != NULL) 236 if (p != NULL)
238 sid = task_session(p); 237 sid = task_session(p);
239 238
240 return sid; 239 return sid;
241 } 240 }
242 241
243 /* 242 /*
244 * Determine if a process group is "orphaned", according to the POSIX 243 * Determine if a process group is "orphaned", according to the POSIX
245 * definition in 2.2.2.52. Orphaned process groups are not to be affected 244 * definition in 2.2.2.52. Orphaned process groups are not to be affected
246 * by terminal-generated stop signals. Newly orphaned process groups are 245 * by terminal-generated stop signals. Newly orphaned process groups are
247 * to receive a SIGHUP and a SIGCONT. 246 * to receive a SIGHUP and a SIGCONT.
248 * 247 *
249 * "I ask you, have you ever known what it is to be an orphan?" 248 * "I ask you, have you ever known what it is to be an orphan?"
250 */ 249 */
251 static int will_become_orphaned_pgrp(struct pid *pgrp, struct task_struct *ignored_task) 250 static int will_become_orphaned_pgrp(struct pid *pgrp, struct task_struct *ignored_task)
252 { 251 {
253 struct task_struct *p; 252 struct task_struct *p;
254 253
255 do_each_pid_task(pgrp, PIDTYPE_PGID, p) { 254 do_each_pid_task(pgrp, PIDTYPE_PGID, p) {
256 if ((p == ignored_task) || 255 if ((p == ignored_task) ||
257 (p->exit_state && thread_group_empty(p)) || 256 (p->exit_state && thread_group_empty(p)) ||
258 is_global_init(p->real_parent)) 257 is_global_init(p->real_parent))
259 continue; 258 continue;
260 259
261 if (task_pgrp(p->real_parent) != pgrp && 260 if (task_pgrp(p->real_parent) != pgrp &&
262 task_session(p->real_parent) == task_session(p)) 261 task_session(p->real_parent) == task_session(p))
263 return 0; 262 return 0;
264 } while_each_pid_task(pgrp, PIDTYPE_PGID, p); 263 } while_each_pid_task(pgrp, PIDTYPE_PGID, p);
265 264
266 return 1; 265 return 1;
267 } 266 }
268 267
269 int is_current_pgrp_orphaned(void) 268 int is_current_pgrp_orphaned(void)
270 { 269 {
271 int retval; 270 int retval;
272 271
273 read_lock(&tasklist_lock); 272 read_lock(&tasklist_lock);
274 retval = will_become_orphaned_pgrp(task_pgrp(current), NULL); 273 retval = will_become_orphaned_pgrp(task_pgrp(current), NULL);
275 read_unlock(&tasklist_lock); 274 read_unlock(&tasklist_lock);
276 275
277 return retval; 276 return retval;
278 } 277 }
279 278
280 static int has_stopped_jobs(struct pid *pgrp) 279 static int has_stopped_jobs(struct pid *pgrp)
281 { 280 {
282 int retval = 0; 281 int retval = 0;
283 struct task_struct *p; 282 struct task_struct *p;
284 283
285 do_each_pid_task(pgrp, PIDTYPE_PGID, p) { 284 do_each_pid_task(pgrp, PIDTYPE_PGID, p) {
286 if (!task_is_stopped(p)) 285 if (!task_is_stopped(p))
287 continue; 286 continue;
288 retval = 1; 287 retval = 1;
289 break; 288 break;
290 } while_each_pid_task(pgrp, PIDTYPE_PGID, p); 289 } while_each_pid_task(pgrp, PIDTYPE_PGID, p);
291 return retval; 290 return retval;
292 } 291 }
293 292
294 /* 293 /*
295 * Check to see if any process groups have become orphaned as 294 * Check to see if any process groups have become orphaned as
296 * a result of our exiting, and if they have any stopped jobs, 295 * a result of our exiting, and if they have any stopped jobs,
297 * send them a SIGHUP and then a SIGCONT. (POSIX 3.2.2.2) 296 * send them a SIGHUP and then a SIGCONT. (POSIX 3.2.2.2)
298 */ 297 */
299 static void 298 static void
300 kill_orphaned_pgrp(struct task_struct *tsk, struct task_struct *parent) 299 kill_orphaned_pgrp(struct task_struct *tsk, struct task_struct *parent)
301 { 300 {
302 struct pid *pgrp = task_pgrp(tsk); 301 struct pid *pgrp = task_pgrp(tsk);
303 struct task_struct *ignored_task = tsk; 302 struct task_struct *ignored_task = tsk;
304 303
305 if (!parent) 304 if (!parent)
306 /* exit: our father is in a different pgrp than 305 /* exit: our father is in a different pgrp than
307 * we are and we were the only connection outside. 306 * we are and we were the only connection outside.
308 */ 307 */
309 parent = tsk->real_parent; 308 parent = tsk->real_parent;
310 else 309 else
311 /* reparent: our child is in a different pgrp than 310 /* reparent: our child is in a different pgrp than
312 * we are, and it was the only connection outside. 311 * we are, and it was the only connection outside.
313 */ 312 */
314 ignored_task = NULL; 313 ignored_task = NULL;
315 314
316 if (task_pgrp(parent) != pgrp && 315 if (task_pgrp(parent) != pgrp &&
317 task_session(parent) == task_session(tsk) && 316 task_session(parent) == task_session(tsk) &&
318 will_become_orphaned_pgrp(pgrp, ignored_task) && 317 will_become_orphaned_pgrp(pgrp, ignored_task) &&
319 has_stopped_jobs(pgrp)) { 318 has_stopped_jobs(pgrp)) {
320 __kill_pgrp_info(SIGHUP, SEND_SIG_PRIV, pgrp); 319 __kill_pgrp_info(SIGHUP, SEND_SIG_PRIV, pgrp);
321 __kill_pgrp_info(SIGCONT, SEND_SIG_PRIV, pgrp); 320 __kill_pgrp_info(SIGCONT, SEND_SIG_PRIV, pgrp);
322 } 321 }
323 } 322 }
324 323
325 /** 324 /**
326 * reparent_to_kthreadd - Reparent the calling kernel thread to kthreadd 325 * reparent_to_kthreadd - Reparent the calling kernel thread to kthreadd
327 * 326 *
328 * If a kernel thread is launched as a result of a system call, or if 327 * If a kernel thread is launched as a result of a system call, or if
329 * it ever exits, it should generally reparent itself to kthreadd so it 328 * it ever exits, it should generally reparent itself to kthreadd so it
330 * isn't in the way of other processes and is correctly cleaned up on exit. 329 * isn't in the way of other processes and is correctly cleaned up on exit.
331 * 330 *
332 * The various task states such as scheduling policy and priority may have 331 * The various task states such as scheduling policy and priority may have
333 * been inherited from a user process, so we reset them to sane values here. 332 * been inherited from a user process, so we reset them to sane values here.
334 * 333 *
335 * NOTE that reparent_to_kthreadd() gives the caller full capabilities. 334 * NOTE that reparent_to_kthreadd() gives the caller full capabilities.
336 */ 335 */
337 static void reparent_to_kthreadd(void) 336 static void reparent_to_kthreadd(void)
338 { 337 {
339 write_lock_irq(&tasklist_lock); 338 write_lock_irq(&tasklist_lock);
340 339
341 ptrace_unlink(current); 340 ptrace_unlink(current);
342 /* Reparent to init */ 341 /* Reparent to init */
343 current->real_parent = current->parent = kthreadd_task; 342 current->real_parent = current->parent = kthreadd_task;
344 list_move_tail(&current->sibling, &current->real_parent->children); 343 list_move_tail(&current->sibling, &current->real_parent->children);
345 344
346 /* Set the exit signal to SIGCHLD so we signal init on exit */ 345 /* Set the exit signal to SIGCHLD so we signal init on exit */
347 current->exit_signal = SIGCHLD; 346 current->exit_signal = SIGCHLD;
348 347
349 if (task_nice(current) < 0) 348 if (task_nice(current) < 0)
350 set_user_nice(current, 0); 349 set_user_nice(current, 0);
351 /* cpus_allowed? */ 350 /* cpus_allowed? */
352 /* rt_priority? */ 351 /* rt_priority? */
353 /* signals? */ 352 /* signals? */
354 memcpy(current->signal->rlim, init_task.signal->rlim, 353 memcpy(current->signal->rlim, init_task.signal->rlim,
355 sizeof(current->signal->rlim)); 354 sizeof(current->signal->rlim));
356 355
357 atomic_inc(&init_cred.usage); 356 atomic_inc(&init_cred.usage);
358 commit_creds(&init_cred); 357 commit_creds(&init_cred);
359 write_unlock_irq(&tasklist_lock); 358 write_unlock_irq(&tasklist_lock);
360 } 359 }
361 360
362 void __set_special_pids(struct pid *pid) 361 void __set_special_pids(struct pid *pid)
363 { 362 {
364 struct task_struct *curr = current->group_leader; 363 struct task_struct *curr = current->group_leader;
365 364
366 if (task_session(curr) != pid) 365 if (task_session(curr) != pid)
367 change_pid(curr, PIDTYPE_SID, pid); 366 change_pid(curr, PIDTYPE_SID, pid);
368 367
369 if (task_pgrp(curr) != pid) 368 if (task_pgrp(curr) != pid)
370 change_pid(curr, PIDTYPE_PGID, pid); 369 change_pid(curr, PIDTYPE_PGID, pid);
371 } 370 }
372 371
373 static void set_special_pids(struct pid *pid) 372 static void set_special_pids(struct pid *pid)
374 { 373 {
375 write_lock_irq(&tasklist_lock); 374 write_lock_irq(&tasklist_lock);
376 __set_special_pids(pid); 375 __set_special_pids(pid);
377 write_unlock_irq(&tasklist_lock); 376 write_unlock_irq(&tasklist_lock);
378 } 377 }
379 378
380 /* 379 /*
381 * Let kernel threads use this to say that they allow a certain signal. 380 * Let kernel threads use this to say that they allow a certain signal.
382 * Must not be used if kthread was cloned with CLONE_SIGHAND. 381 * Must not be used if kthread was cloned with CLONE_SIGHAND.
383 */ 382 */
384 int allow_signal(int sig) 383 int allow_signal(int sig)
385 { 384 {
386 if (!valid_signal(sig) || sig < 1) 385 if (!valid_signal(sig) || sig < 1)
387 return -EINVAL; 386 return -EINVAL;
388 387
389 spin_lock_irq(&current->sighand->siglock); 388 spin_lock_irq(&current->sighand->siglock);
390 /* This is only needed for daemonize()'ed kthreads */ 389 /* This is only needed for daemonize()'ed kthreads */
391 sigdelset(&current->blocked, sig); 390 sigdelset(&current->blocked, sig);
392 /* 391 /*
393 * Kernel threads handle their own signals. Let the signal code 392 * Kernel threads handle their own signals. Let the signal code
394 * know it'll be handled, so that they don't get converted to 393 * know it'll be handled, so that they don't get converted to
395 * SIGKILL or just silently dropped. 394 * SIGKILL or just silently dropped.
396 */ 395 */
397 current->sighand->action[(sig)-1].sa.sa_handler = (void __user *)2; 396 current->sighand->action[(sig)-1].sa.sa_handler = (void __user *)2;
398 recalc_sigpending(); 397 recalc_sigpending();
399 spin_unlock_irq(&current->sighand->siglock); 398 spin_unlock_irq(&current->sighand->siglock);
400 return 0; 399 return 0;
401 } 400 }
402 401
403 EXPORT_SYMBOL(allow_signal); 402 EXPORT_SYMBOL(allow_signal);
404 403
405 int disallow_signal(int sig) 404 int disallow_signal(int sig)
406 { 405 {
407 if (!valid_signal(sig) || sig < 1) 406 if (!valid_signal(sig) || sig < 1)
408 return -EINVAL; 407 return -EINVAL;
409 408
410 spin_lock_irq(&current->sighand->siglock); 409 spin_lock_irq(&current->sighand->siglock);
411 current->sighand->action[(sig)-1].sa.sa_handler = SIG_IGN; 410 current->sighand->action[(sig)-1].sa.sa_handler = SIG_IGN;
412 recalc_sigpending(); 411 recalc_sigpending();
413 spin_unlock_irq(&current->sighand->siglock); 412 spin_unlock_irq(&current->sighand->siglock);
414 return 0; 413 return 0;
415 } 414 }
416 415
417 EXPORT_SYMBOL(disallow_signal); 416 EXPORT_SYMBOL(disallow_signal);
418 417
419 /* 418 /*
420 * Put all the gunge required to become a kernel thread without 419 * Put all the gunge required to become a kernel thread without
421 * attached user resources in one place where it belongs. 420 * attached user resources in one place where it belongs.
422 */ 421 */
423 422
424 void daemonize(const char *name, ...) 423 void daemonize(const char *name, ...)
425 { 424 {
426 va_list args; 425 va_list args;
427 sigset_t blocked; 426 sigset_t blocked;
428 427
429 va_start(args, name); 428 va_start(args, name);
430 vsnprintf(current->comm, sizeof(current->comm), name, args); 429 vsnprintf(current->comm, sizeof(current->comm), name, args);
431 va_end(args); 430 va_end(args);
432 431
433 /* 432 /*
434 * If we were started as a result of loading a module, close all of the 433 * If we were started as a result of loading a module, close all of the
435 * user space pages. We don't need them, and if we didn't close them 434 * user space pages. We don't need them, and if we didn't close them
436 * they would be locked into memory. 435 * they would be locked into memory.
437 */ 436 */
438 exit_mm(current); 437 exit_mm(current);
439 /* 438 /*
440 * We don't want to have TIF_FREEZE set if the system-wide hibernation 439 * We don't want to have TIF_FREEZE set if the system-wide hibernation
441 * or suspend transition begins right now. 440 * or suspend transition begins right now.
442 */ 441 */
443 current->flags |= (PF_NOFREEZE | PF_KTHREAD); 442 current->flags |= (PF_NOFREEZE | PF_KTHREAD);
444 443
445 if (current->nsproxy != &init_nsproxy) { 444 if (current->nsproxy != &init_nsproxy) {
446 get_nsproxy(&init_nsproxy); 445 get_nsproxy(&init_nsproxy);
447 switch_task_namespaces(current, &init_nsproxy); 446 switch_task_namespaces(current, &init_nsproxy);
448 } 447 }
449 set_special_pids(&init_struct_pid); 448 set_special_pids(&init_struct_pid);
450 proc_clear_tty(current); 449 proc_clear_tty(current);
451 450
452 /* Block and flush all signals */ 451 /* Block and flush all signals */
453 sigfillset(&blocked); 452 sigfillset(&blocked);
454 sigprocmask(SIG_BLOCK, &blocked, NULL); 453 sigprocmask(SIG_BLOCK, &blocked, NULL);
455 flush_signals(current); 454 flush_signals(current);
456 455
457 /* Become as one with the init task */ 456 /* Become as one with the init task */
458 457
459 daemonize_fs_struct(); 458 daemonize_fs_struct();
460 exit_files(current); 459 exit_files(current);
461 current->files = init_task.files; 460 current->files = init_task.files;
462 atomic_inc(&current->files->count); 461 atomic_inc(&current->files->count);
463 462
464 reparent_to_kthreadd(); 463 reparent_to_kthreadd();
465 } 464 }
466 465
467 EXPORT_SYMBOL(daemonize); 466 EXPORT_SYMBOL(daemonize);
468 467
469 static void close_files(struct files_struct * files) 468 static void close_files(struct files_struct * files)
470 { 469 {
471 int i, j; 470 int i, j;
472 struct fdtable *fdt; 471 struct fdtable *fdt;
473 472
474 j = 0; 473 j = 0;
475 474
476 /* 475 /*
477 * It is safe to dereference the fd table without RCU or 476 * It is safe to dereference the fd table without RCU or
478 * ->file_lock because this is the last reference to the 477 * ->file_lock because this is the last reference to the
479 * files structure. But use RCU to shut RCU-lockdep up. 478 * files structure. But use RCU to shut RCU-lockdep up.
480 */ 479 */
481 rcu_read_lock(); 480 rcu_read_lock();
482 fdt = files_fdtable(files); 481 fdt = files_fdtable(files);
483 rcu_read_unlock(); 482 rcu_read_unlock();
484 for (;;) { 483 for (;;) {
485 unsigned long set; 484 unsigned long set;
486 i = j * __NFDBITS; 485 i = j * __NFDBITS;
487 if (i >= fdt->max_fds) 486 if (i >= fdt->max_fds)
488 break; 487 break;
489 set = fdt->open_fds->fds_bits[j++]; 488 set = fdt->open_fds->fds_bits[j++];
490 while (set) { 489 while (set) {
491 if (set & 1) { 490 if (set & 1) {
492 struct file * file = xchg(&fdt->fd[i], NULL); 491 struct file * file = xchg(&fdt->fd[i], NULL);
493 if (file) { 492 if (file) {
494 filp_close(file, files); 493 filp_close(file, files);
495 cond_resched(); 494 cond_resched();
496 } 495 }
497 } 496 }
498 i++; 497 i++;
499 set >>= 1; 498 set >>= 1;
500 } 499 }
501 } 500 }
502 } 501 }
503 502
504 struct files_struct *get_files_struct(struct task_struct *task) 503 struct files_struct *get_files_struct(struct task_struct *task)
505 { 504 {
506 struct files_struct *files; 505 struct files_struct *files;
507 506
508 task_lock(task); 507 task_lock(task);
509 files = task->files; 508 files = task->files;
510 if (files) 509 if (files)
511 atomic_inc(&files->count); 510 atomic_inc(&files->count);
512 task_unlock(task); 511 task_unlock(task);
513 512
514 return files; 513 return files;
515 } 514 }
516 515
517 void put_files_struct(struct files_struct *files) 516 void put_files_struct(struct files_struct *files)
518 { 517 {
519 struct fdtable *fdt; 518 struct fdtable *fdt;
520 519
521 if (atomic_dec_and_test(&files->count)) { 520 if (atomic_dec_and_test(&files->count)) {
522 close_files(files); 521 close_files(files);
523 /* 522 /*
524 * Free the fd and fdset arrays if we expanded them. 523 * Free the fd and fdset arrays if we expanded them.
525 * If the fdtable was embedded, pass files for freeing 524 * If the fdtable was embedded, pass files for freeing
526 * at the end of the RCU grace period. Otherwise, 525 * at the end of the RCU grace period. Otherwise,
527 * you can free files immediately. 526 * you can free files immediately.
528 */ 527 */
529 rcu_read_lock(); 528 rcu_read_lock();
530 fdt = files_fdtable(files); 529 fdt = files_fdtable(files);
531 if (fdt != &files->fdtab) 530 if (fdt != &files->fdtab)
532 kmem_cache_free(files_cachep, files); 531 kmem_cache_free(files_cachep, files);
533 free_fdtable(fdt); 532 free_fdtable(fdt);
534 rcu_read_unlock(); 533 rcu_read_unlock();
535 } 534 }
536 } 535 }
537 536
538 void reset_files_struct(struct files_struct *files) 537 void reset_files_struct(struct files_struct *files)
539 { 538 {
540 struct task_struct *tsk = current; 539 struct task_struct *tsk = current;
541 struct files_struct *old; 540 struct files_struct *old;
542 541
543 old = tsk->files; 542 old = tsk->files;
544 task_lock(tsk); 543 task_lock(tsk);
545 tsk->files = files; 544 tsk->files = files;
546 task_unlock(tsk); 545 task_unlock(tsk);
547 put_files_struct(old); 546 put_files_struct(old);
548 } 547 }
549 548
550 void exit_files(struct task_struct *tsk) 549 void exit_files(struct task_struct *tsk)
551 { 550 {
552 struct files_struct * files = tsk->files; 551 struct files_struct * files = tsk->files;
553 552
554 if (files) { 553 if (files) {
555 task_lock(tsk); 554 task_lock(tsk);
556 tsk->files = NULL; 555 tsk->files = NULL;
557 task_unlock(tsk); 556 task_unlock(tsk);
558 put_files_struct(files); 557 put_files_struct(files);
559 } 558 }
560 } 559 }
561 560
562 #ifdef CONFIG_MM_OWNER 561 #ifdef CONFIG_MM_OWNER
563 /* 562 /*
564 * Task p is exiting and it owned mm, let's find a new owner for it 563 * Task p is exiting and it owned mm, let's find a new owner for it
565 */ 564 */
566 static inline int 565 static inline int
567 mm_need_new_owner(struct mm_struct *mm, struct task_struct *p) 566 mm_need_new_owner(struct mm_struct *mm, struct task_struct *p)
568 { 567 {
569 /* 568 /*
570 * If there are other users of the mm and the owner (us) is exiting 569 * If there are other users of the mm and the owner (us) is exiting
571 * we need to find a new owner to take on the responsibility. 570 * we need to find a new owner to take on the responsibility.
572 */ 571 */
573 if (atomic_read(&mm->mm_users) <= 1) 572 if (atomic_read(&mm->mm_users) <= 1)
574 return 0; 573 return 0;
575 if (mm->owner != p) 574 if (mm->owner != p)
576 return 0; 575 return 0;
577 return 1; 576 return 1;
578 } 577 }
579 578
580 void mm_update_next_owner(struct mm_struct *mm) 579 void mm_update_next_owner(struct mm_struct *mm)
581 { 580 {
582 struct task_struct *c, *g, *p = current; 581 struct task_struct *c, *g, *p = current;
583 582
584 retry: 583 retry:
585 if (!mm_need_new_owner(mm, p)) 584 if (!mm_need_new_owner(mm, p))
586 return; 585 return;
587 586
588 read_lock(&tasklist_lock); 587 read_lock(&tasklist_lock);
589 /* 588 /*
590 * Search in the children 589 * Search in the children
591 */ 590 */
592 list_for_each_entry(c, &p->children, sibling) { 591 list_for_each_entry(c, &p->children, sibling) {
593 if (c->mm == mm) 592 if (c->mm == mm)
594 goto assign_new_owner; 593 goto assign_new_owner;
595 } 594 }
596 595
597 /* 596 /*
598 * Search in the siblings 597 * Search in the siblings
599 */ 598 */
600 list_for_each_entry(c, &p->real_parent->children, sibling) { 599 list_for_each_entry(c, &p->real_parent->children, sibling) {
601 if (c->mm == mm) 600 if (c->mm == mm)
602 goto assign_new_owner; 601 goto assign_new_owner;
603 } 602 }
604 603
605 /* 604 /*
606 * Search through everything else. We should not get 605 * Search through everything else. We should not get
607 * here often 606 * here often
608 */ 607 */
609 do_each_thread(g, c) { 608 do_each_thread(g, c) {
610 if (c->mm == mm) 609 if (c->mm == mm)
611 goto assign_new_owner; 610 goto assign_new_owner;
612 } while_each_thread(g, c); 611 } while_each_thread(g, c);
613 612
614 read_unlock(&tasklist_lock); 613 read_unlock(&tasklist_lock);
615 /* 614 /*
616 * We found no owner yet mm_users > 1: this implies that we are 615 * We found no owner yet mm_users > 1: this implies that we are
617 * most likely racing with swapoff (try_to_unuse()) or /proc or 616 * most likely racing with swapoff (try_to_unuse()) or /proc or
618 * ptrace or page migration (get_task_mm()). Mark owner as NULL. 617 * ptrace or page migration (get_task_mm()). Mark owner as NULL.
619 */ 618 */
620 mm->owner = NULL; 619 mm->owner = NULL;
621 return; 620 return;
622 621
623 assign_new_owner: 622 assign_new_owner:
624 BUG_ON(c == p); 623 BUG_ON(c == p);
625 get_task_struct(c); 624 get_task_struct(c);
626 /* 625 /*
627 * The task_lock protects c->mm from changing. 626 * The task_lock protects c->mm from changing.
628 * We always want mm->owner->mm == mm 627 * We always want mm->owner->mm == mm
629 */ 628 */
630 task_lock(c); 629 task_lock(c);
631 /* 630 /*
632 * Delay read_unlock() till we have the task_lock() 631 * Delay read_unlock() till we have the task_lock()
633 * to ensure that c does not slip away underneath us 632 * to ensure that c does not slip away underneath us
634 */ 633 */
635 read_unlock(&tasklist_lock); 634 read_unlock(&tasklist_lock);
636 if (c->mm != mm) { 635 if (c->mm != mm) {
637 task_unlock(c); 636 task_unlock(c);
638 put_task_struct(c); 637 put_task_struct(c);
639 goto retry; 638 goto retry;
640 } 639 }
641 mm->owner = c; 640 mm->owner = c;
642 task_unlock(c); 641 task_unlock(c);
643 put_task_struct(c); 642 put_task_struct(c);
644 } 643 }
645 #endif /* CONFIG_MM_OWNER */ 644 #endif /* CONFIG_MM_OWNER */
646 645
647 /* 646 /*
648 * Turn us into a lazy TLB process if we 647 * Turn us into a lazy TLB process if we
649 * aren't already.. 648 * aren't already..
650 */ 649 */
651 static void exit_mm(struct task_struct * tsk) 650 static void exit_mm(struct task_struct * tsk)
652 { 651 {
653 struct mm_struct *mm = tsk->mm; 652 struct mm_struct *mm = tsk->mm;
654 struct core_state *core_state; 653 struct core_state *core_state;
655 654
656 mm_release(tsk, mm); 655 mm_release(tsk, mm);
657 if (!mm) 656 if (!mm)
658 return; 657 return;
659 /* 658 /*
660 * Serialize with any possible pending coredump. 659 * Serialize with any possible pending coredump.
661 * We must hold mmap_sem around checking core_state 660 * We must hold mmap_sem around checking core_state
662 * and clearing tsk->mm. The core-inducing thread 661 * and clearing tsk->mm. The core-inducing thread
663 * will increment ->nr_threads for each thread in the 662 * will increment ->nr_threads for each thread in the
664 * group with ->mm != NULL. 663 * group with ->mm != NULL.
665 */ 664 */
666 down_read(&mm->mmap_sem); 665 down_read(&mm->mmap_sem);
667 core_state = mm->core_state; 666 core_state = mm->core_state;
668 if (core_state) { 667 if (core_state) {
669 struct core_thread self; 668 struct core_thread self;
670 up_read(&mm->mmap_sem); 669 up_read(&mm->mmap_sem);
671 670
672 self.task = tsk; 671 self.task = tsk;
673 self.next = xchg(&core_state->dumper.next, &self); 672 self.next = xchg(&core_state->dumper.next, &self);
674 /* 673 /*
675 * Implies mb(), the result of xchg() must be visible 674 * Implies mb(), the result of xchg() must be visible
676 * to core_state->dumper. 675 * to core_state->dumper.
677 */ 676 */
678 if (atomic_dec_and_test(&core_state->nr_threads)) 677 if (atomic_dec_and_test(&core_state->nr_threads))
679 complete(&core_state->startup); 678 complete(&core_state->startup);
680 679
681 for (;;) { 680 for (;;) {
682 set_task_state(tsk, TASK_UNINTERRUPTIBLE); 681 set_task_state(tsk, TASK_UNINTERRUPTIBLE);
683 if (!self.task) /* see coredump_finish() */ 682 if (!self.task) /* see coredump_finish() */
684 break; 683 break;
685 schedule(); 684 schedule();
686 } 685 }
687 __set_task_state(tsk, TASK_RUNNING); 686 __set_task_state(tsk, TASK_RUNNING);
688 down_read(&mm->mmap_sem); 687 down_read(&mm->mmap_sem);
689 } 688 }
690 atomic_inc(&mm->mm_count); 689 atomic_inc(&mm->mm_count);
691 BUG_ON(mm != tsk->active_mm); 690 BUG_ON(mm != tsk->active_mm);
692 /* more a memory barrier than a real lock */ 691 /* more a memory barrier than a real lock */
693 task_lock(tsk); 692 task_lock(tsk);
694 tsk->mm = NULL; 693 tsk->mm = NULL;
695 up_read(&mm->mmap_sem); 694 up_read(&mm->mmap_sem);
696 enter_lazy_tlb(mm, current); 695 enter_lazy_tlb(mm, current);
697 /* We don't want this task to be frozen prematurely */ 696 /* We don't want this task to be frozen prematurely */
698 clear_freeze_flag(tsk); 697 clear_freeze_flag(tsk);
699 if (tsk->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) 698 if (tsk->signal->oom_score_adj == OOM_SCORE_ADJ_MIN)
700 atomic_dec(&mm->oom_disable_count); 699 atomic_dec(&mm->oom_disable_count);
701 task_unlock(tsk); 700 task_unlock(tsk);
702 mm_update_next_owner(mm); 701 mm_update_next_owner(mm);
703 mmput(mm); 702 mmput(mm);
704 } 703 }
705 704
706 /* 705 /*
707 * When we die, we re-parent all our children. 706 * When we die, we re-parent all our children.
708 * Try to give them to another thread in our thread 707 * Try to give them to another thread in our thread
709 * group, and if no such member exists, give it to 708 * group, and if no such member exists, give it to
710 * the child reaper process (ie "init") in our pid 709 * the child reaper process (ie "init") in our pid
711 * space. 710 * space.
712 */ 711 */
713 static struct task_struct *find_new_reaper(struct task_struct *father) 712 static struct task_struct *find_new_reaper(struct task_struct *father)
714 __releases(&tasklist_lock) 713 __releases(&tasklist_lock)
715 __acquires(&tasklist_lock) 714 __acquires(&tasklist_lock)
716 { 715 {
717 struct pid_namespace *pid_ns = task_active_pid_ns(father); 716 struct pid_namespace *pid_ns = task_active_pid_ns(father);
718 struct task_struct *thread; 717 struct task_struct *thread;
719 718
720 thread = father; 719 thread = father;
721 while_each_thread(father, thread) { 720 while_each_thread(father, thread) {
722 if (thread->flags & PF_EXITING) 721 if (thread->flags & PF_EXITING)
723 continue; 722 continue;
724 if (unlikely(pid_ns->child_reaper == father)) 723 if (unlikely(pid_ns->child_reaper == father))
725 pid_ns->child_reaper = thread; 724 pid_ns->child_reaper = thread;
726 return thread; 725 return thread;
727 } 726 }
728 727
729 if (unlikely(pid_ns->child_reaper == father)) { 728 if (unlikely(pid_ns->child_reaper == father)) {
730 write_unlock_irq(&tasklist_lock); 729 write_unlock_irq(&tasklist_lock);
731 if (unlikely(pid_ns == &init_pid_ns)) 730 if (unlikely(pid_ns == &init_pid_ns))
732 panic("Attempted to kill init!"); 731 panic("Attempted to kill init!");
733 732
734 zap_pid_ns_processes(pid_ns); 733 zap_pid_ns_processes(pid_ns);
735 write_lock_irq(&tasklist_lock); 734 write_lock_irq(&tasklist_lock);
736 /* 735 /*
737 * We can not clear ->child_reaper or leave it alone. 736 * We can not clear ->child_reaper or leave it alone.
738 * There may be stealth EXIT_DEAD tasks on ->children, 737 * There may be stealth EXIT_DEAD tasks on ->children,
739 * forget_original_parent() must move them somewhere. 738 * forget_original_parent() must move them somewhere.
740 */ 739 */
741 pid_ns->child_reaper = init_pid_ns.child_reaper; 740 pid_ns->child_reaper = init_pid_ns.child_reaper;
742 } 741 }
743 742
744 return pid_ns->child_reaper; 743 return pid_ns->child_reaper;
745 } 744 }
746 745
747 /* 746 /*
748 * Any that need to be release_task'd are put on the @dead list. 747 * Any that need to be release_task'd are put on the @dead list.
749 */ 748 */
750 static void reparent_leader(struct task_struct *father, struct task_struct *p, 749 static void reparent_leader(struct task_struct *father, struct task_struct *p,
751 struct list_head *dead) 750 struct list_head *dead)
752 { 751 {
753 list_move_tail(&p->sibling, &p->real_parent->children); 752 list_move_tail(&p->sibling, &p->real_parent->children);
754 753
755 if (task_detached(p)) 754 if (task_detached(p))
756 return; 755 return;
757 /* 756 /*
758 * If this is a threaded reparent there is no need to 757 * If this is a threaded reparent there is no need to
759 * notify anyone anything has happened. 758 * notify anyone anything has happened.
760 */ 759 */
761 if (same_thread_group(p->real_parent, father)) 760 if (same_thread_group(p->real_parent, father))
762 return; 761 return;
763 762
764 /* We don't want people slaying init. */ 763 /* We don't want people slaying init. */
765 p->exit_signal = SIGCHLD; 764 p->exit_signal = SIGCHLD;
766 765
767 /* If it has exited notify the new parent about this child's death. */ 766 /* If it has exited notify the new parent about this child's death. */
768 if (!task_ptrace(p) && 767 if (!task_ptrace(p) &&
769 p->exit_state == EXIT_ZOMBIE && thread_group_empty(p)) { 768 p->exit_state == EXIT_ZOMBIE && thread_group_empty(p)) {
770 do_notify_parent(p, p->exit_signal); 769 do_notify_parent(p, p->exit_signal);
771 if (task_detached(p)) { 770 if (task_detached(p)) {
772 p->exit_state = EXIT_DEAD; 771 p->exit_state = EXIT_DEAD;
773 list_move_tail(&p->sibling, dead); 772 list_move_tail(&p->sibling, dead);
774 } 773 }
775 } 774 }
776 775
777 kill_orphaned_pgrp(p, father); 776 kill_orphaned_pgrp(p, father);
778 } 777 }
779 778
780 static void forget_original_parent(struct task_struct *father) 779 static void forget_original_parent(struct task_struct *father)
781 { 780 {
782 struct task_struct *p, *n, *reaper; 781 struct task_struct *p, *n, *reaper;
783 LIST_HEAD(dead_children); 782 LIST_HEAD(dead_children);
784 783
785 write_lock_irq(&tasklist_lock); 784 write_lock_irq(&tasklist_lock);
786 /* 785 /*
787 * Note that exit_ptrace() and find_new_reaper() might 786 * Note that exit_ptrace() and find_new_reaper() might
788 * drop tasklist_lock and reacquire it. 787 * drop tasklist_lock and reacquire it.
789 */ 788 */
790 exit_ptrace(father); 789 exit_ptrace(father);
791 reaper = find_new_reaper(father); 790 reaper = find_new_reaper(father);
792 791
793 list_for_each_entry_safe(p, n, &father->children, sibling) { 792 list_for_each_entry_safe(p, n, &father->children, sibling) {
794 struct task_struct *t = p; 793 struct task_struct *t = p;
795 do { 794 do {
796 t->real_parent = reaper; 795 t->real_parent = reaper;
797 if (t->parent == father) { 796 if (t->parent == father) {
798 BUG_ON(task_ptrace(t)); 797 BUG_ON(task_ptrace(t));
799 t->parent = t->real_parent; 798 t->parent = t->real_parent;
800 } 799 }
801 if (t->pdeath_signal) 800 if (t->pdeath_signal)
802 group_send_sig_info(t->pdeath_signal, 801 group_send_sig_info(t->pdeath_signal,
803 SEND_SIG_NOINFO, t); 802 SEND_SIG_NOINFO, t);
804 } while_each_thread(p, t); 803 } while_each_thread(p, t);
805 reparent_leader(father, p, &dead_children); 804 reparent_leader(father, p, &dead_children);
806 } 805 }
807 write_unlock_irq(&tasklist_lock); 806 write_unlock_irq(&tasklist_lock);
808 807
809 BUG_ON(!list_empty(&father->children)); 808 BUG_ON(!list_empty(&father->children));
810 809
811 list_for_each_entry_safe(p, n, &dead_children, sibling) { 810 list_for_each_entry_safe(p, n, &dead_children, sibling) {
812 list_del_init(&p->sibling); 811 list_del_init(&p->sibling);
813 release_task(p); 812 release_task(p);
814 } 813 }
815 } 814 }
816 815
817 /* 816 /*
818 * Send signals to all our closest relatives so that they know 817 * Send signals to all our closest relatives so that they know
819 * to properly mourn us.. 818 * to properly mourn us..
820 */ 819 */
821 static void exit_notify(struct task_struct *tsk, int group_dead) 820 static void exit_notify(struct task_struct *tsk, int group_dead)
822 { 821 {
823 int signal; 822 int signal;
824 void *cookie; 823 void *cookie;
825 824
826 /* 825 /*
827 * This does two things: 826 * This does two things:
828 * 827 *
829 * A. Make init inherit all the child processes 828 * A. Make init inherit all the child processes
830 * B. Check to see if any process groups have become orphaned 829 * B. Check to see if any process groups have become orphaned
831 * as a result of our exiting, and if they have any stopped 830 * as a result of our exiting, and if they have any stopped
832 * jobs, send them a SIGHUP and then a SIGCONT. (POSIX 3.2.2.2) 831 * jobs, send them a SIGHUP and then a SIGCONT. (POSIX 3.2.2.2)
833 */ 832 */
834 forget_original_parent(tsk); 833 forget_original_parent(tsk);
835 exit_task_namespaces(tsk); 834 exit_task_namespaces(tsk);
836 835
837 write_lock_irq(&tasklist_lock); 836 write_lock_irq(&tasklist_lock);
838 if (group_dead) 837 if (group_dead)
839 kill_orphaned_pgrp(tsk->group_leader, NULL); 838 kill_orphaned_pgrp(tsk->group_leader, NULL);
840 839
841 /* Let father know we died 840 /* Let father know we died
842 * 841 *
843 * Thread signals are configurable, but you aren't going to use 842 * Thread signals are configurable, but you aren't going to use
844 * that to send signals to arbitrary processes. 843 * that to send signals to arbitrary processes.
845 * That stops right now. 844 * That stops right now.
846 * 845 *
847 * If the parent exec id doesn't match the exec id we saved 846 * If the parent exec id doesn't match the exec id we saved
848 * when we started then we know the parent has changed security 847 * when we started then we know the parent has changed security
849 * domain. 848 * domain.
850 * 849 *
851 * If our self_exec id doesn't match our parent_exec_id then 850 * If our self_exec id doesn't match our parent_exec_id then
852 * we have changed execution domain as these two values started 851 * we have changed execution domain as these two values started
853 * the same after a fork. 852 * the same after a fork.
854 */ 853 */
855 if (tsk->exit_signal != SIGCHLD && !task_detached(tsk) && 854 if (tsk->exit_signal != SIGCHLD && !task_detached(tsk) &&
856 (tsk->parent_exec_id != tsk->real_parent->self_exec_id || 855 (tsk->parent_exec_id != tsk->real_parent->self_exec_id ||
857 tsk->self_exec_id != tsk->parent_exec_id)) 856 tsk->self_exec_id != tsk->parent_exec_id))
858 tsk->exit_signal = SIGCHLD; 857 tsk->exit_signal = SIGCHLD;
859 858
860 signal = tracehook_notify_death(tsk, &cookie, group_dead); 859 signal = tracehook_notify_death(tsk, &cookie, group_dead);
861 if (signal >= 0) 860 if (signal >= 0)
862 signal = do_notify_parent(tsk, signal); 861 signal = do_notify_parent(tsk, signal);
863 862
864 tsk->exit_state = signal == DEATH_REAP ? EXIT_DEAD : EXIT_ZOMBIE; 863 tsk->exit_state = signal == DEATH_REAP ? EXIT_DEAD : EXIT_ZOMBIE;
865 864
866 /* mt-exec, de_thread() is waiting for group leader */ 865 /* mt-exec, de_thread() is waiting for group leader */
867 if (unlikely(tsk->signal->notify_count < 0)) 866 if (unlikely(tsk->signal->notify_count < 0))
868 wake_up_process(tsk->signal->group_exit_task); 867 wake_up_process(tsk->signal->group_exit_task);
869 write_unlock_irq(&tasklist_lock); 868 write_unlock_irq(&tasklist_lock);
870 869
871 tracehook_report_death(tsk, signal, cookie, group_dead); 870 tracehook_report_death(tsk, signal, cookie, group_dead);
872 871
873 /* If the process is dead, release it - nobody will wait for it */ 872 /* If the process is dead, release it - nobody will wait for it */
874 if (signal == DEATH_REAP) 873 if (signal == DEATH_REAP)
875 release_task(tsk); 874 release_task(tsk);
876 } 875 }
877 876
878 #ifdef CONFIG_DEBUG_STACK_USAGE 877 #ifdef CONFIG_DEBUG_STACK_USAGE
879 static void check_stack_usage(void) 878 static void check_stack_usage(void)
880 { 879 {
881 static DEFINE_SPINLOCK(low_water_lock); 880 static DEFINE_SPINLOCK(low_water_lock);
882 static int lowest_to_date = THREAD_SIZE; 881 static int lowest_to_date = THREAD_SIZE;
883 unsigned long free; 882 unsigned long free;
884 883
885 free = stack_not_used(current); 884 free = stack_not_used(current);
886 885
887 if (free >= lowest_to_date) 886 if (free >= lowest_to_date)
888 return; 887 return;
889 888
890 spin_lock(&low_water_lock); 889 spin_lock(&low_water_lock);
891 if (free < lowest_to_date) { 890 if (free < lowest_to_date) {
892 printk(KERN_WARNING "%s used greatest stack depth: %lu bytes " 891 printk(KERN_WARNING "%s used greatest stack depth: %lu bytes "
893 "left\n", 892 "left\n",
894 current->comm, free); 893 current->comm, free);
895 lowest_to_date = free; 894 lowest_to_date = free;
896 } 895 }
897 spin_unlock(&low_water_lock); 896 spin_unlock(&low_water_lock);
898 } 897 }
899 #else 898 #else
900 static inline void check_stack_usage(void) {} 899 static inline void check_stack_usage(void) {}
901 #endif 900 #endif
902 901
903 NORET_TYPE void do_exit(long code) 902 NORET_TYPE void do_exit(long code)
904 { 903 {
905 struct task_struct *tsk = current; 904 struct task_struct *tsk = current;
906 int group_dead; 905 int group_dead;
907 906
908 profile_task_exit(tsk); 907 profile_task_exit(tsk);
909 908
910 WARN_ON(atomic_read(&tsk->fs_excl)); 909 WARN_ON(atomic_read(&tsk->fs_excl));
911 WARN_ON(blk_needs_flush_plug(tsk)); 910 WARN_ON(blk_needs_flush_plug(tsk));
912 911
913 if (unlikely(in_interrupt())) 912 if (unlikely(in_interrupt()))
914 panic("Aiee, killing interrupt handler!"); 913 panic("Aiee, killing interrupt handler!");
915 if (unlikely(!tsk->pid)) 914 if (unlikely(!tsk->pid))
916 panic("Attempted to kill the idle task!"); 915 panic("Attempted to kill the idle task!");
917 916
918 /* 917 /*
919 * If do_exit is called because this process oopsed, it's possible 918 * If do_exit is called because this process oopsed, it's possible
920 * that get_fs() was left as KERNEL_DS, so reset it to USER_DS before 919 * that get_fs() was left as KERNEL_DS, so reset it to USER_DS before
921 * continuing. Amongst other possible reasons, this is to prevent 920 * continuing. Amongst other possible reasons, this is to prevent
922 * mm_release()->clear_child_tid() from writing to a user-controlled 921 * mm_release()->clear_child_tid() from writing to a user-controlled
923 * kernel address. 922 * kernel address.
924 */ 923 */
925 set_fs(USER_DS); 924 set_fs(USER_DS);
926 925
927 tracehook_report_exit(&code); 926 tracehook_report_exit(&code);
928 927
929 validate_creds_for_do_exit(tsk); 928 validate_creds_for_do_exit(tsk);
930 929
931 /* 930 /*
932 * We're taking recursive faults here in do_exit. Safest is to just 931 * We're taking recursive faults here in do_exit. Safest is to just
933 * leave this task alone and wait for reboot. 932 * leave this task alone and wait for reboot.
934 */ 933 */
935 if (unlikely(tsk->flags & PF_EXITING)) { 934 if (unlikely(tsk->flags & PF_EXITING)) {
936 printk(KERN_ALERT 935 printk(KERN_ALERT
937 "Fixing recursive fault but reboot is needed!\n"); 936 "Fixing recursive fault but reboot is needed!\n");
938 /* 937 /*
939 * We can do this unlocked here. The futex code uses 938 * We can do this unlocked here. The futex code uses
940 * this flag just to verify whether the pi state 939 * this flag just to verify whether the pi state
941 * cleanup has been done or not. In the worst case it 940 * cleanup has been done or not. In the worst case it
942 * loops once more. We pretend that the cleanup was 941 * loops once more. We pretend that the cleanup was
943 * done as there is no way to return. Either the 942 * done as there is no way to return. Either the
944 * OWNER_DIED bit is set by now or we push the blocked 943 * OWNER_DIED bit is set by now or we push the blocked
945 * task into the wait-forever nirvana as well. 944 * task into the wait-forever nirvana as well.
946 */ 945 */
947 tsk->flags |= PF_EXITPIDONE; 946 tsk->flags |= PF_EXITPIDONE;
948 set_current_state(TASK_UNINTERRUPTIBLE); 947 set_current_state(TASK_UNINTERRUPTIBLE);
949 schedule(); 948 schedule();
950 } 949 }
951 950
952 exit_irq_thread(); 951 exit_irq_thread();
953 952
954 exit_signals(tsk); /* sets PF_EXITING */ 953 exit_signals(tsk); /* sets PF_EXITING */
955 /* 954 /*
956 * tsk->flags are checked in the futex code to protect against 955 * tsk->flags are checked in the futex code to protect against
957 * an exiting task cleaning up the robust pi futexes. 956 * an exiting task cleaning up the robust pi futexes.
958 */ 957 */
959 smp_mb(); 958 smp_mb();
960 raw_spin_unlock_wait(&tsk->pi_lock); 959 raw_spin_unlock_wait(&tsk->pi_lock);
961 960
962 if (unlikely(in_atomic())) 961 if (unlikely(in_atomic()))
963 printk(KERN_INFO "note: %s[%d] exited with preempt_count %d\n", 962 printk(KERN_INFO "note: %s[%d] exited with preempt_count %d\n",
964 current->comm, task_pid_nr(current), 963 current->comm, task_pid_nr(current),
965 preempt_count()); 964 preempt_count());
966 965
967 acct_update_integrals(tsk); 966 acct_update_integrals(tsk);
968 /* sync mm's RSS info before statistics gathering */ 967 /* sync mm's RSS info before statistics gathering */
969 if (tsk->mm) 968 if (tsk->mm)
970 sync_mm_rss(tsk, tsk->mm); 969 sync_mm_rss(tsk, tsk->mm);
971 group_dead = atomic_dec_and_test(&tsk->signal->live); 970 group_dead = atomic_dec_and_test(&tsk->signal->live);
972 if (group_dead) { 971 if (group_dead) {
973 hrtimer_cancel(&tsk->signal->real_timer); 972 hrtimer_cancel(&tsk->signal->real_timer);
974 exit_itimers(tsk->signal); 973 exit_itimers(tsk->signal);
975 if (tsk->mm) 974 if (tsk->mm)
976 setmax_mm_hiwater_rss(&tsk->signal->maxrss, tsk->mm); 975 setmax_mm_hiwater_rss(&tsk->signal->maxrss, tsk->mm);
977 } 976 }
978 acct_collect(code, group_dead); 977 acct_collect(code, group_dead);
979 if (group_dead) 978 if (group_dead)
980 tty_audit_exit(); 979 tty_audit_exit();
981 if (unlikely(tsk->audit_context)) 980 if (unlikely(tsk->audit_context))
982 audit_free(tsk); 981 audit_free(tsk);
983 982
984 tsk->exit_code = code; 983 tsk->exit_code = code;
985 taskstats_exit(tsk, group_dead); 984 taskstats_exit(tsk, group_dead);
986 985
987 exit_mm(tsk); 986 exit_mm(tsk);
988 987
989 if (group_dead) 988 if (group_dead)
990 acct_process(); 989 acct_process();
991 trace_sched_process_exit(tsk); 990 trace_sched_process_exit(tsk);
992 991
993 exit_sem(tsk); 992 exit_sem(tsk);
994 exit_files(tsk); 993 exit_files(tsk);
995 exit_fs(tsk); 994 exit_fs(tsk);
996 check_stack_usage(); 995 check_stack_usage();
997 exit_thread(); 996 exit_thread();
998 997
999 /* 998 /*
1000 * Flush inherited counters to the parent - before the parent 999 * Flush inherited counters to the parent - before the parent
1001 * gets woken up by child-exit notifications. 1000 * gets woken up by child-exit notifications.
1002 * 1001 *
1003 * because of cgroup mode, must be called before cgroup_exit() 1002 * because of cgroup mode, must be called before cgroup_exit()
1004 */ 1003 */
1005 perf_event_exit_task(tsk); 1004 perf_event_exit_task(tsk);
1006 1005
1007 cgroup_exit(tsk, 1); 1006 cgroup_exit(tsk, 1);
1008 1007
1009 if (group_dead) 1008 if (group_dead)
1010 disassociate_ctty(1); 1009 disassociate_ctty(1);
1011 1010
1012 module_put(task_thread_info(tsk)->exec_domain->module); 1011 module_put(task_thread_info(tsk)->exec_domain->module);
1013 1012
1014 proc_exit_connector(tsk); 1013 proc_exit_connector(tsk);
1015 1014
1016 /* 1015 /*
1017 * FIXME: do that only when needed, using sched_exit tracepoint 1016 * FIXME: do that only when needed, using sched_exit tracepoint
1018 */ 1017 */
1019 ptrace_put_breakpoints(tsk); 1018 ptrace_put_breakpoints(tsk);
1020 1019
1021 exit_notify(tsk, group_dead); 1020 exit_notify(tsk, group_dead);
1022 #ifdef CONFIG_NUMA 1021 #ifdef CONFIG_NUMA
1023 task_lock(tsk); 1022 task_lock(tsk);
1024 mpol_put(tsk->mempolicy); 1023 mpol_put(tsk->mempolicy);
1025 tsk->mempolicy = NULL; 1024 tsk->mempolicy = NULL;
1026 task_unlock(tsk); 1025 task_unlock(tsk);
1027 #endif 1026 #endif
1028 #ifdef CONFIG_FUTEX 1027 #ifdef CONFIG_FUTEX
1029 if (unlikely(current->pi_state_cache)) 1028 if (unlikely(current->pi_state_cache))
1030 kfree(current->pi_state_cache); 1029 kfree(current->pi_state_cache);
1031 #endif 1030 #endif
1032 /* 1031 /*
1033 * Make sure we are holding no locks: 1032 * Make sure we are holding no locks:
1034 */ 1033 */
1035 debug_check_no_locks_held(tsk); 1034 debug_check_no_locks_held(tsk);
1036 /* 1035 /*
1037 * We can do this unlocked here. The futex code uses this flag 1036 * We can do this unlocked here. The futex code uses this flag
1038 * just to verify whether the pi state cleanup has been done 1037 * just to verify whether the pi state cleanup has been done
1039 * or not. In the worst case it loops once more. 1038 * or not. In the worst case it loops once more.
1040 */ 1039 */
1041 tsk->flags |= PF_EXITPIDONE; 1040 tsk->flags |= PF_EXITPIDONE;
1042 1041
1043 if (tsk->io_context) 1042 if (tsk->io_context)
1044 exit_io_context(tsk); 1043 exit_io_context(tsk);
1045 1044
1046 if (tsk->splice_pipe) 1045 if (tsk->splice_pipe)
1047 __free_pipe_info(tsk->splice_pipe); 1046 __free_pipe_info(tsk->splice_pipe);
1048 1047
1049 validate_creds_for_do_exit(tsk); 1048 validate_creds_for_do_exit(tsk);
1050 1049
1051 preempt_disable(); 1050 preempt_disable();
1052 exit_rcu(); 1051 exit_rcu();
1053 /* causes final put_task_struct in finish_task_switch(). */ 1052 /* causes final put_task_struct in finish_task_switch(). */
1054 tsk->state = TASK_DEAD; 1053 tsk->state = TASK_DEAD;
1055 schedule(); 1054 schedule();
1056 BUG(); 1055 BUG();
1057 /* Avoid "noreturn function does return". */ 1056 /* Avoid "noreturn function does return". */
1058 for (;;) 1057 for (;;)
1059 cpu_relax(); /* For when BUG is null */ 1058 cpu_relax(); /* For when BUG is null */
1060 } 1059 }
1061 1060
1062 EXPORT_SYMBOL_GPL(do_exit); 1061 EXPORT_SYMBOL_GPL(do_exit);
1063 1062
1064 NORET_TYPE void complete_and_exit(struct completion *comp, long code) 1063 NORET_TYPE void complete_and_exit(struct completion *comp, long code)
1065 { 1064 {
1066 if (comp) 1065 if (comp)
1067 complete(comp); 1066 complete(comp);
1068 1067
1069 do_exit(code); 1068 do_exit(code);
1070 } 1069 }
1071 1070
1072 EXPORT_SYMBOL(complete_and_exit); 1071 EXPORT_SYMBOL(complete_and_exit);
1073 1072
1074 SYSCALL_DEFINE1(exit, int, error_code) 1073 SYSCALL_DEFINE1(exit, int, error_code)
1075 { 1074 {
1076 do_exit((error_code&0xff)<<8); 1075 do_exit((error_code&0xff)<<8);
1077 } 1076 }
1078 1077
1079 /* 1078 /*
1080 * Take down every thread in the group. This is called by fatal signals 1079 * Take down every thread in the group. This is called by fatal signals
1081 * as well as by sys_exit_group (below). 1080 * as well as by sys_exit_group (below).
1082 */ 1081 */
1083 NORET_TYPE void 1082 NORET_TYPE void
1084 do_group_exit(int exit_code) 1083 do_group_exit(int exit_code)
1085 { 1084 {
1086 struct signal_struct *sig = current->signal; 1085 struct signal_struct *sig = current->signal;
1087 1086
1088 BUG_ON(exit_code & 0x80); /* core dumps don't get here */ 1087 BUG_ON(exit_code & 0x80); /* core dumps don't get here */
1089 1088
1090 if (signal_group_exit(sig)) 1089 if (signal_group_exit(sig))
1091 exit_code = sig->group_exit_code; 1090 exit_code = sig->group_exit_code;
1092 else if (!thread_group_empty(current)) { 1091 else if (!thread_group_empty(current)) {
1093 struct sighand_struct *const sighand = current->sighand; 1092 struct sighand_struct *const sighand = current->sighand;
1094 spin_lock_irq(&sighand->siglock); 1093 spin_lock_irq(&sighand->siglock);
1095 if (signal_group_exit(sig)) 1094 if (signal_group_exit(sig))
1096 /* Another thread got here before we took the lock. */ 1095 /* Another thread got here before we took the lock. */
1097 exit_code = sig->group_exit_code; 1096 exit_code = sig->group_exit_code;
1098 else { 1097 else {
1099 sig->group_exit_code = exit_code; 1098 sig->group_exit_code = exit_code;
1100 sig->flags = SIGNAL_GROUP_EXIT; 1099 sig->flags = SIGNAL_GROUP_EXIT;
1101 zap_other_threads(current); 1100 zap_other_threads(current);
1102 } 1101 }
1103 spin_unlock_irq(&sighand->siglock); 1102 spin_unlock_irq(&sighand->siglock);
1104 } 1103 }
1105 1104
1106 do_exit(exit_code); 1105 do_exit(exit_code);
1107 /* NOTREACHED */ 1106 /* NOTREACHED */
1108 } 1107 }
1109 1108
1110 /* 1109 /*
1111 * this kills every thread in the thread group. Note that any externally 1110 * this kills every thread in the thread group. Note that any externally
1112 * wait4()-ing process will get the correct exit code - even if this 1111 * wait4()-ing process will get the correct exit code - even if this
1113 * thread is not the thread group leader. 1112 * thread is not the thread group leader.
1114 */ 1113 */
1115 SYSCALL_DEFINE1(exit_group, int, error_code) 1114 SYSCALL_DEFINE1(exit_group, int, error_code)
1116 { 1115 {
1117 do_group_exit((error_code & 0xff) << 8); 1116 do_group_exit((error_code & 0xff) << 8);
1118 /* NOTREACHED */ 1117 /* NOTREACHED */
1119 return 0; 1118 return 0;
1120 } 1119 }
1121 1120
1122 struct wait_opts { 1121 struct wait_opts {
1123 enum pid_type wo_type; 1122 enum pid_type wo_type;
1124 int wo_flags; 1123 int wo_flags;
1125 struct pid *wo_pid; 1124 struct pid *wo_pid;
1126 1125
1127 struct siginfo __user *wo_info; 1126 struct siginfo __user *wo_info;
1128 int __user *wo_stat; 1127 int __user *wo_stat;
1129 struct rusage __user *wo_rusage; 1128 struct rusage __user *wo_rusage;
1130 1129
1131 wait_queue_t child_wait; 1130 wait_queue_t child_wait;
1132 int notask_error; 1131 int notask_error;
1133 }; 1132 };
1134 1133
1135 static inline 1134 static inline
1136 struct pid *task_pid_type(struct task_struct *task, enum pid_type type) 1135 struct pid *task_pid_type(struct task_struct *task, enum pid_type type)
1137 { 1136 {
1138 if (type != PIDTYPE_PID) 1137 if (type != PIDTYPE_PID)
1139 task = task->group_leader; 1138 task = task->group_leader;
1140 return task->pids[type].pid; 1139 return task->pids[type].pid;
1141 } 1140 }
1142 1141
1143 static int eligible_pid(struct wait_opts *wo, struct task_struct *p) 1142 static int eligible_pid(struct wait_opts *wo, struct task_struct *p)
1144 { 1143 {
1145 return wo->wo_type == PIDTYPE_MAX || 1144 return wo->wo_type == PIDTYPE_MAX ||
1146 task_pid_type(p, wo->wo_type) == wo->wo_pid; 1145 task_pid_type(p, wo->wo_type) == wo->wo_pid;
1147 } 1146 }
1148 1147
1149 static int eligible_child(struct wait_opts *wo, struct task_struct *p) 1148 static int eligible_child(struct wait_opts *wo, struct task_struct *p)
1150 { 1149 {
1151 if (!eligible_pid(wo, p)) 1150 if (!eligible_pid(wo, p))
1152 return 0; 1151 return 0;
1153 /* Wait for all children (clone and not) if __WALL is set; 1152 /* Wait for all children (clone and not) if __WALL is set;
1154 * otherwise, wait for clone children *only* if __WCLONE is 1153 * otherwise, wait for clone children *only* if __WCLONE is
1155 * set; otherwise, wait for non-clone children *only*. (Note: 1154 * set; otherwise, wait for non-clone children *only*. (Note:
1156 * A "clone" child here is one that reports to its parent 1155 * A "clone" child here is one that reports to its parent
1157 * using a signal other than SIGCHLD.) */ 1156 * using a signal other than SIGCHLD.) */
1158 if (((p->exit_signal != SIGCHLD) ^ !!(wo->wo_flags & __WCLONE)) 1157 if (((p->exit_signal != SIGCHLD) ^ !!(wo->wo_flags & __WCLONE))
1159 && !(wo->wo_flags & __WALL)) 1158 && !(wo->wo_flags & __WALL))
1160 return 0; 1159 return 0;
1161 1160
1162 return 1; 1161 return 1;
1163 } 1162 }
1164 1163
1165 static int wait_noreap_copyout(struct wait_opts *wo, struct task_struct *p, 1164 static int wait_noreap_copyout(struct wait_opts *wo, struct task_struct *p,
1166 pid_t pid, uid_t uid, int why, int status) 1165 pid_t pid, uid_t uid, int why, int status)
1167 { 1166 {
1168 struct siginfo __user *infop; 1167 struct siginfo __user *infop;
1169 int retval = wo->wo_rusage 1168 int retval = wo->wo_rusage
1170 ? getrusage(p, RUSAGE_BOTH, wo->wo_rusage) : 0; 1169 ? getrusage(p, RUSAGE_BOTH, wo->wo_rusage) : 0;
1171 1170
1172 put_task_struct(p); 1171 put_task_struct(p);
1173 infop = wo->wo_info; 1172 infop = wo->wo_info;
1174 if (infop) { 1173 if (infop) {
1175 if (!retval) 1174 if (!retval)
1176 retval = put_user(SIGCHLD, &infop->si_signo); 1175 retval = put_user(SIGCHLD, &infop->si_signo);
1177 if (!retval) 1176 if (!retval)
1178 retval = put_user(0, &infop->si_errno); 1177 retval = put_user(0, &infop->si_errno);
1179 if (!retval) 1178 if (!retval)
1180 retval = put_user((short)why, &infop->si_code); 1179 retval = put_user((short)why, &infop->si_code);
1181 if (!retval) 1180 if (!retval)
1182 retval = put_user(pid, &infop->si_pid); 1181 retval = put_user(pid, &infop->si_pid);
1183 if (!retval) 1182 if (!retval)
1184 retval = put_user(uid, &infop->si_uid); 1183 retval = put_user(uid, &infop->si_uid);
1185 if (!retval) 1184 if (!retval)
1186 retval = put_user(status, &infop->si_status); 1185 retval = put_user(status, &infop->si_status);
1187 } 1186 }
1188 if (!retval) 1187 if (!retval)
1189 retval = pid; 1188 retval = pid;
1190 return retval; 1189 return retval;
1191 } 1190 }
1192 1191
1193 /* 1192 /*
1194 * Handle sys_wait4 work for one task in state EXIT_ZOMBIE. We hold 1193 * Handle sys_wait4 work for one task in state EXIT_ZOMBIE. We hold
1195 * read_lock(&tasklist_lock) on entry. If we return zero, we still hold 1194 * read_lock(&tasklist_lock) on entry. If we return zero, we still hold
1196 * the lock and this task is uninteresting. If we return nonzero, we have 1195 * the lock and this task is uninteresting. If we return nonzero, we have
1197 * released the lock and the system call should return. 1196 * released the lock and the system call should return.
1198 */ 1197 */
1199 static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) 1198 static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
1200 { 1199 {
1201 unsigned long state; 1200 unsigned long state;
1202 int retval, status, traced; 1201 int retval, status, traced;
1203 pid_t pid = task_pid_vnr(p); 1202 pid_t pid = task_pid_vnr(p);
1204 uid_t uid = __task_cred(p)->uid; 1203 uid_t uid = __task_cred(p)->uid;
1205 struct siginfo __user *infop; 1204 struct siginfo __user *infop;
1206 1205
1207 if (!likely(wo->wo_flags & WEXITED)) 1206 if (!likely(wo->wo_flags & WEXITED))
1208 return 0; 1207 return 0;
1209 1208
1210 if (unlikely(wo->wo_flags & WNOWAIT)) { 1209 if (unlikely(wo->wo_flags & WNOWAIT)) {
1211 int exit_code = p->exit_code; 1210 int exit_code = p->exit_code;
1212 int why; 1211 int why;
1213 1212
1214 get_task_struct(p); 1213 get_task_struct(p);
1215 read_unlock(&tasklist_lock); 1214 read_unlock(&tasklist_lock);
1216 if ((exit_code & 0x7f) == 0) { 1215 if ((exit_code & 0x7f) == 0) {
1217 why = CLD_EXITED; 1216 why = CLD_EXITED;
1218 status = exit_code >> 8; 1217 status = exit_code >> 8;
1219 } else { 1218 } else {
1220 why = (exit_code & 0x80) ? CLD_DUMPED : CLD_KILLED; 1219 why = (exit_code & 0x80) ? CLD_DUMPED : CLD_KILLED;
1221 status = exit_code & 0x7f; 1220 status = exit_code & 0x7f;
1222 } 1221 }
1223 return wait_noreap_copyout(wo, p, pid, uid, why, status); 1222 return wait_noreap_copyout(wo, p, pid, uid, why, status);
1224 } 1223 }
1225 1224
1226 /* 1225 /*
1227 * Try to move the task's state to DEAD 1226 * Try to move the task's state to DEAD
1228 * only one thread is allowed to do this: 1227 * only one thread is allowed to do this:
1229 */ 1228 */
1230 state = xchg(&p->exit_state, EXIT_DEAD); 1229 state = xchg(&p->exit_state, EXIT_DEAD);
1231 if (state != EXIT_ZOMBIE) { 1230 if (state != EXIT_ZOMBIE) {
1232 BUG_ON(state != EXIT_DEAD); 1231 BUG_ON(state != EXIT_DEAD);
1233 return 0; 1232 return 0;
1234 } 1233 }
1235 1234
1236 traced = ptrace_reparented(p); 1235 traced = ptrace_reparented(p);
1237 /* 1236 /*
1238 * It can be ptraced but not reparented, check 1237 * It can be ptraced but not reparented, check
1239 * !task_detached() to filter out sub-threads. 1238 * !task_detached() to filter out sub-threads.
1240 */ 1239 */
1241 if (likely(!traced) && likely(!task_detached(p))) { 1240 if (likely(!traced) && likely(!task_detached(p))) {
1242 struct signal_struct *psig; 1241 struct signal_struct *psig;
1243 struct signal_struct *sig; 1242 struct signal_struct *sig;
1244 unsigned long maxrss; 1243 unsigned long maxrss;
1245 cputime_t tgutime, tgstime; 1244 cputime_t tgutime, tgstime;
1246 1245
1247 /* 1246 /*
1248 * The resource counters for the group leader are in its 1247 * The resource counters for the group leader are in its
1249 * own task_struct. Those for dead threads in the group 1248 * own task_struct. Those for dead threads in the group
1250 * are in its signal_struct, as are those for the child 1249 * are in its signal_struct, as are those for the child
1251 * processes it has previously reaped. All these 1250 * processes it has previously reaped. All these
1252 * accumulate in the parent's signal_struct c* fields. 1251 * accumulate in the parent's signal_struct c* fields.
1253 * 1252 *
1254 * We don't bother to take a lock here to protect these 1253 * We don't bother to take a lock here to protect these
1255 * p->signal fields, because they are only touched by 1254 * p->signal fields, because they are only touched by
1256 * __exit_signal, which runs with tasklist_lock 1255 * __exit_signal, which runs with tasklist_lock
1257 * write-locked anyway, and so is excluded here. We do 1256 * write-locked anyway, and so is excluded here. We do
1258 * need to protect the access to parent->signal fields, 1257 * need to protect the access to parent->signal fields,
1259 * as other threads in the parent group can be right 1258 * as other threads in the parent group can be right
1260 * here reaping other children at the same time. 1259 * here reaping other children at the same time.
1261 * 1260 *
1262 * We use thread_group_times() to get times for the thread 1261 * We use thread_group_times() to get times for the thread
1263 * group, which consolidates times for all threads in the 1262 * group, which consolidates times for all threads in the
1264 * group including the group leader. 1263 * group including the group leader.
1265 */ 1264 */
1266 thread_group_times(p, &tgutime, &tgstime); 1265 thread_group_times(p, &tgutime, &tgstime);
1267 spin_lock_irq(&p->real_parent->sighand->siglock); 1266 spin_lock_irq(&p->real_parent->sighand->siglock);
1268 psig = p->real_parent->signal; 1267 psig = p->real_parent->signal;
1269 sig = p->signal; 1268 sig = p->signal;
1270 psig->cutime = 1269 psig->cutime =
1271 cputime_add(psig->cutime, 1270 cputime_add(psig->cutime,
1272 cputime_add(tgutime, 1271 cputime_add(tgutime,
1273 sig->cutime)); 1272 sig->cutime));
1274 psig->cstime = 1273 psig->cstime =
1275 cputime_add(psig->cstime, 1274 cputime_add(psig->cstime,
1276 cputime_add(tgstime, 1275 cputime_add(tgstime,
1277 sig->cstime)); 1276 sig->cstime));
1278 psig->cgtime = 1277 psig->cgtime =
1279 cputime_add(psig->cgtime, 1278 cputime_add(psig->cgtime,
1280 cputime_add(p->gtime, 1279 cputime_add(p->gtime,
1281 cputime_add(sig->gtime, 1280 cputime_add(sig->gtime,
1282 sig->cgtime))); 1281 sig->cgtime)));
1283 psig->cmin_flt += 1282 psig->cmin_flt +=
1284 p->min_flt + sig->min_flt + sig->cmin_flt; 1283 p->min_flt + sig->min_flt + sig->cmin_flt;
1285 psig->cmaj_flt += 1284 psig->cmaj_flt +=
1286 p->maj_flt + sig->maj_flt + sig->cmaj_flt; 1285 p->maj_flt + sig->maj_flt + sig->cmaj_flt;
1287 psig->cnvcsw += 1286 psig->cnvcsw +=
1288 p->nvcsw + sig->nvcsw + sig->cnvcsw; 1287 p->nvcsw + sig->nvcsw + sig->cnvcsw;
1289 psig->cnivcsw += 1288 psig->cnivcsw +=
1290 p->nivcsw + sig->nivcsw + sig->cnivcsw; 1289 p->nivcsw + sig->nivcsw + sig->cnivcsw;
1291 psig->cinblock += 1290 psig->cinblock +=
1292 task_io_get_inblock(p) + 1291 task_io_get_inblock(p) +
1293 sig->inblock + sig->cinblock; 1292 sig->inblock + sig->cinblock;
1294 psig->coublock += 1293 psig->coublock +=
1295 task_io_get_oublock(p) + 1294 task_io_get_oublock(p) +
1296 sig->oublock + sig->coublock; 1295 sig->oublock + sig->coublock;
1297 maxrss = max(sig->maxrss, sig->cmaxrss); 1296 maxrss = max(sig->maxrss, sig->cmaxrss);
1298 if (psig->cmaxrss < maxrss) 1297 if (psig->cmaxrss < maxrss)
1299 psig->cmaxrss = maxrss; 1298 psig->cmaxrss = maxrss;
1300 task_io_accounting_add(&psig->ioac, &p->ioac); 1299 task_io_accounting_add(&psig->ioac, &p->ioac);
1301 task_io_accounting_add(&psig->ioac, &sig->ioac); 1300 task_io_accounting_add(&psig->ioac, &sig->ioac);
1302 spin_unlock_irq(&p->real_parent->sighand->siglock); 1301 spin_unlock_irq(&p->real_parent->sighand->siglock);
1303 } 1302 }
1304 1303
1305 /* 1304 /*
1306 * Now we are sure this task is interesting, and no other 1305 * Now we are sure this task is interesting, and no other
1307 * thread can reap it because we set its state to EXIT_DEAD. 1306 * thread can reap it because we set its state to EXIT_DEAD.
1308 */ 1307 */
1309 read_unlock(&tasklist_lock); 1308 read_unlock(&tasklist_lock);
1310 1309
1311 retval = wo->wo_rusage 1310 retval = wo->wo_rusage
1312 ? getrusage(p, RUSAGE_BOTH, wo->wo_rusage) : 0; 1311 ? getrusage(p, RUSAGE_BOTH, wo->wo_rusage) : 0;
1313 status = (p->signal->flags & SIGNAL_GROUP_EXIT) 1312 status = (p->signal->flags & SIGNAL_GROUP_EXIT)
1314 ? p->signal->group_exit_code : p->exit_code; 1313 ? p->signal->group_exit_code : p->exit_code;
1315 if (!retval && wo->wo_stat) 1314 if (!retval && wo->wo_stat)
1316 retval = put_user(status, wo->wo_stat); 1315 retval = put_user(status, wo->wo_stat);
1317 1316
1318 infop = wo->wo_info; 1317 infop = wo->wo_info;
1319 if (!retval && infop) 1318 if (!retval && infop)
1320 retval = put_user(SIGCHLD, &infop->si_signo); 1319 retval = put_user(SIGCHLD, &infop->si_signo);
1321 if (!retval && infop) 1320 if (!retval && infop)
1322 retval = put_user(0, &infop->si_errno); 1321 retval = put_user(0, &infop->si_errno);
1323 if (!retval && infop) { 1322 if (!retval && infop) {
1324 int why; 1323 int why;
1325 1324
1326 if ((status & 0x7f) == 0) { 1325 if ((status & 0x7f) == 0) {
1327 why = CLD_EXITED; 1326 why = CLD_EXITED;
1328 status >>= 8; 1327 status >>= 8;
1329 } else { 1328 } else {
1330 why = (status & 0x80) ? CLD_DUMPED : CLD_KILLED; 1329 why = (status & 0x80) ? CLD_DUMPED : CLD_KILLED;
1331 status &= 0x7f; 1330 status &= 0x7f;
1332 } 1331 }
1333 retval = put_user((short)why, &infop->si_code); 1332 retval = put_user((short)why, &infop->si_code);
1334 if (!retval) 1333 if (!retval)
1335 retval = put_user(status, &infop->si_status); 1334 retval = put_user(status, &infop->si_status);
1336 } 1335 }
1337 if (!retval && infop) 1336 if (!retval && infop)
1338 retval = put_user(pid, &infop->si_pid); 1337 retval = put_user(pid, &infop->si_pid);
1339 if (!retval && infop) 1338 if (!retval && infop)
1340 retval = put_user(uid, &infop->si_uid); 1339 retval = put_user(uid, &infop->si_uid);
1341 if (!retval) 1340 if (!retval)
1342 retval = pid; 1341 retval = pid;
1343 1342
1344 if (traced) { 1343 if (traced) {
1345 write_lock_irq(&tasklist_lock); 1344 write_lock_irq(&tasklist_lock);
1346 /* We dropped tasklist, ptracer could die and untrace */ 1345 /* We dropped tasklist, ptracer could die and untrace */
1347 ptrace_unlink(p); 1346 ptrace_unlink(p);
1348 /* 1347 /*
1349 * If this is not a detached task, notify the parent. 1348 * If this is not a detached task, notify the parent.
1350 * If it's still not detached after that, don't release 1349 * If it's still not detached after that, don't release
1351 * it now. 1350 * it now.
1352 */ 1351 */
1353 if (!task_detached(p)) { 1352 if (!task_detached(p)) {
1354 do_notify_parent(p, p->exit_signal); 1353 do_notify_parent(p, p->exit_signal);
1355 if (!task_detached(p)) { 1354 if (!task_detached(p)) {
1356 p->exit_state = EXIT_ZOMBIE; 1355 p->exit_state = EXIT_ZOMBIE;
1357 p = NULL; 1356 p = NULL;
1358 } 1357 }
1359 } 1358 }
1360 write_unlock_irq(&tasklist_lock); 1359 write_unlock_irq(&tasklist_lock);
1361 } 1360 }
1362 if (p != NULL) 1361 if (p != NULL)
1363 release_task(p); 1362 release_task(p);
1364 1363
1365 return retval; 1364 return retval;
1366 } 1365 }
1367 1366
1368 static int *task_stopped_code(struct task_struct *p, bool ptrace) 1367 static int *task_stopped_code(struct task_struct *p, bool ptrace)
1369 { 1368 {
1370 if (ptrace) { 1369 if (ptrace) {
1371 if (task_is_stopped_or_traced(p)) 1370 if (task_is_stopped_or_traced(p))
1372 return &p->exit_code; 1371 return &p->exit_code;
1373 } else { 1372 } else {
1374 if (p->signal->flags & SIGNAL_STOP_STOPPED) 1373 if (p->signal->flags & SIGNAL_STOP_STOPPED)
1375 return &p->signal->group_exit_code; 1374 return &p->signal->group_exit_code;
1376 } 1375 }
1377 return NULL; 1376 return NULL;
1378 } 1377 }
1379 1378
1380 /** 1379 /**
1381 * wait_task_stopped - Wait for %TASK_STOPPED or %TASK_TRACED 1380 * wait_task_stopped - Wait for %TASK_STOPPED or %TASK_TRACED
1382 * @wo: wait options 1381 * @wo: wait options
1383 * @ptrace: is the wait for ptrace 1382 * @ptrace: is the wait for ptrace
1384 * @p: task to wait for 1383 * @p: task to wait for
1385 * 1384 *
1386 * Handle sys_wait4() work for %p in state %TASK_STOPPED or %TASK_TRACED. 1385 * Handle sys_wait4() work for %p in state %TASK_STOPPED or %TASK_TRACED.
1387 * 1386 *
1388 * CONTEXT: 1387 * CONTEXT:
1389 * read_lock(&tasklist_lock), which is released if return value is 1388 * read_lock(&tasklist_lock), which is released if return value is
1390 * non-zero. Also, grabs and releases @p->sighand->siglock. 1389 * non-zero. Also, grabs and releases @p->sighand->siglock.
1391 * 1390 *
1392 * RETURNS: 1391 * RETURNS:
1393 * 0 if wait condition didn't exist and search for other wait conditions 1392 * 0 if wait condition didn't exist and search for other wait conditions
1394 * should continue. Non-zero return, -errno on failure and @p's pid on 1393 * should continue. Non-zero return, -errno on failure and @p's pid on
1395 * success, implies that tasklist_lock is released and wait condition 1394 * success, implies that tasklist_lock is released and wait condition
1396 * search should terminate. 1395 * search should terminate.
1397 */ 1396 */
1398 static int wait_task_stopped(struct wait_opts *wo, 1397 static int wait_task_stopped(struct wait_opts *wo,
1399 int ptrace, struct task_struct *p) 1398 int ptrace, struct task_struct *p)
1400 { 1399 {
1401 struct siginfo __user *infop; 1400 struct siginfo __user *infop;
1402 int retval, exit_code, *p_code, why; 1401 int retval, exit_code, *p_code, why;
1403 uid_t uid = 0; /* unneeded, required by compiler */ 1402 uid_t uid = 0; /* unneeded, required by compiler */
1404 pid_t pid; 1403 pid_t pid;
1405 1404
1406 /* 1405 /*
1407 * Traditionally we see ptrace'd stopped tasks regardless of options. 1406 * Traditionally we see ptrace'd stopped tasks regardless of options.
1408 */ 1407 */
1409 if (!ptrace && !(wo->wo_flags & WUNTRACED)) 1408 if (!ptrace && !(wo->wo_flags & WUNTRACED))
1410 return 0; 1409 return 0;
1411 1410
1412 if (!task_stopped_code(p, ptrace)) 1411 if (!task_stopped_code(p, ptrace))
1413 return 0; 1412 return 0;
1414 1413
1415 exit_code = 0; 1414 exit_code = 0;
1416 spin_lock_irq(&p->sighand->siglock); 1415 spin_lock_irq(&p->sighand->siglock);
1417 1416
1418 p_code = task_stopped_code(p, ptrace); 1417 p_code = task_stopped_code(p, ptrace);
1419 if (unlikely(!p_code)) 1418 if (unlikely(!p_code))
1420 goto unlock_sig; 1419 goto unlock_sig;
1421 1420
1422 exit_code = *p_code; 1421 exit_code = *p_code;
1423 if (!exit_code) 1422 if (!exit_code)
1424 goto unlock_sig; 1423 goto unlock_sig;
1425 1424
1426 if (!unlikely(wo->wo_flags & WNOWAIT)) 1425 if (!unlikely(wo->wo_flags & WNOWAIT))
1427 *p_code = 0; 1426 *p_code = 0;
1428 1427
1429 uid = task_uid(p); 1428 uid = task_uid(p);
1430 unlock_sig: 1429 unlock_sig:
1431 spin_unlock_irq(&p->sighand->siglock); 1430 spin_unlock_irq(&p->sighand->siglock);
1432 if (!exit_code) 1431 if (!exit_code)
1433 return 0; 1432 return 0;
1434 1433
1435 /* 1434 /*
1436 * Now we are pretty sure this task is interesting. 1435 * Now we are pretty sure this task is interesting.
1437 * Make sure it doesn't get reaped out from under us while we 1436 * Make sure it doesn't get reaped out from under us while we
1438 * give up the lock and then examine it below. We don't want to 1437 * give up the lock and then examine it below. We don't want to
1439 * keep holding onto the tasklist_lock while we call getrusage and 1438 * keep holding onto the tasklist_lock while we call getrusage and
1440 * possibly take page faults for user memory. 1439 * possibly take page faults for user memory.
1441 */ 1440 */
1442 get_task_struct(p); 1441 get_task_struct(p);
1443 pid = task_pid_vnr(p); 1442 pid = task_pid_vnr(p);
1444 why = ptrace ? CLD_TRAPPED : CLD_STOPPED; 1443 why = ptrace ? CLD_TRAPPED : CLD_STOPPED;
1445 read_unlock(&tasklist_lock); 1444 read_unlock(&tasklist_lock);
1446 1445
1447 if (unlikely(wo->wo_flags & WNOWAIT)) 1446 if (unlikely(wo->wo_flags & WNOWAIT))
1448 return wait_noreap_copyout(wo, p, pid, uid, why, exit_code); 1447 return wait_noreap_copyout(wo, p, pid, uid, why, exit_code);
1449 1448
1450 retval = wo->wo_rusage 1449 retval = wo->wo_rusage
1451 ? getrusage(p, RUSAGE_BOTH, wo->wo_rusage) : 0; 1450 ? getrusage(p, RUSAGE_BOTH, wo->wo_rusage) : 0;
1452 if (!retval && wo->wo_stat) 1451 if (!retval && wo->wo_stat)
1453 retval = put_user((exit_code << 8) | 0x7f, wo->wo_stat); 1452 retval = put_user((exit_code << 8) | 0x7f, wo->wo_stat);
1454 1453
1455 infop = wo->wo_info; 1454 infop = wo->wo_info;
1456 if (!retval && infop) 1455 if (!retval && infop)
1457 retval = put_user(SIGCHLD, &infop->si_signo); 1456 retval = put_user(SIGCHLD, &infop->si_signo);
1458 if (!retval && infop) 1457 if (!retval && infop)
1459 retval = put_user(0, &infop->si_errno); 1458 retval = put_user(0, &infop->si_errno);
1460 if (!retval && infop) 1459 if (!retval && infop)
1461 retval = put_user((short)why, &infop->si_code); 1460 retval = put_user((short)why, &infop->si_code);
1462 if (!retval && infop) 1461 if (!retval && infop)
1463 retval = put_user(exit_code, &infop->si_status); 1462 retval = put_user(exit_code, &infop->si_status);
1464 if (!retval && infop) 1463 if (!retval && infop)
1465 retval = put_user(pid, &infop->si_pid); 1464 retval = put_user(pid, &infop->si_pid);
1466 if (!retval && infop) 1465 if (!retval && infop)
1467 retval = put_user(uid, &infop->si_uid); 1466 retval = put_user(uid, &infop->si_uid);
1468 if (!retval) 1467 if (!retval)
1469 retval = pid; 1468 retval = pid;
1470 put_task_struct(p); 1469 put_task_struct(p);
1471 1470
1472 BUG_ON(!retval); 1471 BUG_ON(!retval);
1473 return retval; 1472 return retval;
1474 } 1473 }
1475 1474
1476 /* 1475 /*
1477 * Handle do_wait work for one task in a live, non-stopped state. 1476 * Handle do_wait work for one task in a live, non-stopped state.
1478 * read_lock(&tasklist_lock) on entry. If we return zero, we still hold 1477 * read_lock(&tasklist_lock) on entry. If we return zero, we still hold
1479 * the lock and this task is uninteresting. If we return nonzero, we have 1478 * the lock and this task is uninteresting. If we return nonzero, we have
1480 * released the lock and the system call should return. 1479 * released the lock and the system call should return.
1481 */ 1480 */
1482 static int wait_task_continued(struct wait_opts *wo, struct task_struct *p) 1481 static int wait_task_continued(struct wait_opts *wo, struct task_struct *p)
1483 { 1482 {
1484 int retval; 1483 int retval;
1485 pid_t pid; 1484 pid_t pid;
1486 uid_t uid; 1485 uid_t uid;
1487 1486
1488 if (!unlikely(wo->wo_flags & WCONTINUED)) 1487 if (!unlikely(wo->wo_flags & WCONTINUED))
1489 return 0; 1488 return 0;
1490 1489
1491 if (!(p->signal->flags & SIGNAL_STOP_CONTINUED)) 1490 if (!(p->signal->flags & SIGNAL_STOP_CONTINUED))
1492 return 0; 1491 return 0;
1493 1492
1494 spin_lock_irq(&p->sighand->siglock); 1493 spin_lock_irq(&p->sighand->siglock);
1495 /* Re-check with the lock held. */ 1494 /* Re-check with the lock held. */
1496 if (!(p->signal->flags & SIGNAL_STOP_CONTINUED)) { 1495 if (!(p->signal->flags & SIGNAL_STOP_CONTINUED)) {
1497 spin_unlock_irq(&p->sighand->siglock); 1496 spin_unlock_irq(&p->sighand->siglock);
1498 return 0; 1497 return 0;
1499 } 1498 }
1500 if (!unlikely(wo->wo_flags & WNOWAIT)) 1499 if (!unlikely(wo->wo_flags & WNOWAIT))
1501 p->signal->flags &= ~SIGNAL_STOP_CONTINUED; 1500 p->signal->flags &= ~SIGNAL_STOP_CONTINUED;
1502 uid = task_uid(p); 1501 uid = task_uid(p);
1503 spin_unlock_irq(&p->sighand->siglock); 1502 spin_unlock_irq(&p->sighand->siglock);
1504 1503
1505 pid = task_pid_vnr(p); 1504 pid = task_pid_vnr(p);
1506 get_task_struct(p); 1505 get_task_struct(p);
1507 read_unlock(&tasklist_lock); 1506 read_unlock(&tasklist_lock);
1508 1507
1509 if (!wo->wo_info) { 1508 if (!wo->wo_info) {
1510 retval = wo->wo_rusage 1509 retval = wo->wo_rusage
1511 ? getrusage(p, RUSAGE_BOTH, wo->wo_rusage) : 0; 1510 ? getrusage(p, RUSAGE_BOTH, wo->wo_rusage) : 0;
1512 put_task_struct(p); 1511 put_task_struct(p);
1513 if (!retval && wo->wo_stat) 1512 if (!retval && wo->wo_stat)
1514 retval = put_user(0xffff, wo->wo_stat); 1513 retval = put_user(0xffff, wo->wo_stat);
1515 if (!retval) 1514 if (!retval)
1516 retval = pid; 1515 retval = pid;
1517 } else { 1516 } else {
1518 retval = wait_noreap_copyout(wo, p, pid, uid, 1517 retval = wait_noreap_copyout(wo, p, pid, uid,
1519 CLD_CONTINUED, SIGCONT); 1518 CLD_CONTINUED, SIGCONT);
1520 BUG_ON(retval == 0); 1519 BUG_ON(retval == 0);
1521 } 1520 }
1522 1521
1523 return retval; 1522 return retval;
1524 } 1523 }
1525 1524
1526 /* 1525 /*
1527 * Consider @p for a wait by @parent. 1526 * Consider @p for a wait by @parent.
1528 * 1527 *
1529 * -ECHILD should be in ->notask_error before the first call. 1528 * -ECHILD should be in ->notask_error before the first call.
1530 * Returns nonzero for a final return, when we have unlocked tasklist_lock. 1529 * Returns nonzero for a final return, when we have unlocked tasklist_lock.
1531 * Returns zero if the search for a child should continue; 1530 * Returns zero if the search for a child should continue;
1532 * then ->notask_error is 0 if @p is an eligible child, 1531 * then ->notask_error is 0 if @p is an eligible child,
1533 * or another error from security_task_wait(), or still -ECHILD. 1532 * or another error from security_task_wait(), or still -ECHILD.
1534 */ 1533 */
1535 static int wait_consider_task(struct wait_opts *wo, int ptrace, 1534 static int wait_consider_task(struct wait_opts *wo, int ptrace,
1536 struct task_struct *p) 1535 struct task_struct *p)
1537 { 1536 {
1538 int ret = eligible_child(wo, p); 1537 int ret = eligible_child(wo, p);
1539 if (!ret) 1538 if (!ret)
1540 return ret; 1539 return ret;
1541 1540
1542 ret = security_task_wait(p); 1541 ret = security_task_wait(p);
1543 if (unlikely(ret < 0)) { 1542 if (unlikely(ret < 0)) {
1544 /* 1543 /*
1545 * If we have not yet seen any eligible child, 1544 * If we have not yet seen any eligible child,
1546 * then let this error code replace -ECHILD. 1545 * then let this error code replace -ECHILD.
1547 * A permission error will give the user a clue 1546 * A permission error will give the user a clue
1548 * to look for security policy problems, rather 1547 * to look for security policy problems, rather
1549 * than for mysterious wait bugs. 1548 * than for mysterious wait bugs.
1550 */ 1549 */
1551 if (wo->notask_error) 1550 if (wo->notask_error)
1552 wo->notask_error = ret; 1551 wo->notask_error = ret;
1553 return 0; 1552 return 0;
1554 } 1553 }
1555 1554
1556 /* dead body doesn't have much to contribute */ 1555 /* dead body doesn't have much to contribute */
1557 if (p->exit_state == EXIT_DEAD) 1556 if (p->exit_state == EXIT_DEAD)
1558 return 0; 1557 return 0;
1559 1558
1560 /* slay zombie? */ 1559 /* slay zombie? */
1561 if (p->exit_state == EXIT_ZOMBIE) { 1560 if (p->exit_state == EXIT_ZOMBIE) {
1562 /* 1561 /*
1563 * A zombie ptracee is only visible to its ptracer. 1562 * A zombie ptracee is only visible to its ptracer.
1564 * Notification and reaping will be cascaded to the real 1563 * Notification and reaping will be cascaded to the real
1565 * parent when the ptracer detaches. 1564 * parent when the ptracer detaches.
1566 */ 1565 */
1567 if (likely(!ptrace) && unlikely(task_ptrace(p))) { 1566 if (likely(!ptrace) && unlikely(task_ptrace(p))) {
1568 /* it will become visible, clear notask_error */ 1567 /* it will become visible, clear notask_error */
1569 wo->notask_error = 0; 1568 wo->notask_error = 0;
1570 return 0; 1569 return 0;
1571 } 1570 }
1572 1571
1573 /* we don't reap group leaders with subthreads */ 1572 /* we don't reap group leaders with subthreads */
1574 if (!delay_group_leader(p)) 1573 if (!delay_group_leader(p))
1575 return wait_task_zombie(wo, p); 1574 return wait_task_zombie(wo, p);
1576 1575
1577 /* 1576 /*
1578 * Allow access to stopped/continued state via zombie by 1577 * Allow access to stopped/continued state via zombie by
1579 * falling through. Clearing of notask_error is complex. 1578 * falling through. Clearing of notask_error is complex.
1580 * 1579 *
1581 * When !@ptrace: 1580 * When !@ptrace:
1582 * 1581 *
1583 * If WEXITED is set, notask_error should naturally be 1582 * If WEXITED is set, notask_error should naturally be
1584 * cleared. If not, subset of WSTOPPED|WCONTINUED is set, 1583 * cleared. If not, subset of WSTOPPED|WCONTINUED is set,
1585 * so, if there are live subthreads, there are events to 1584 * so, if there are live subthreads, there are events to
1586 * wait for. If all subthreads are dead, it's still safe 1585 * wait for. If all subthreads are dead, it's still safe
1587 * to clear - this function will be called again in a finite 1586 * to clear - this function will be called again in a finite
1588 * amount of time once all the subthreads are released and 1587 * amount of time once all the subthreads are released and
1589 * will then return without clearing. 1588 * will then return without clearing.
1590 * 1589 *
1591 * When @ptrace: 1590 * When @ptrace:
1592 * 1591 *
1593 * Stopped state is per-task and thus can't change once the 1592 * Stopped state is per-task and thus can't change once the
1594 * target task dies. Only continued and exited can happen. 1593 * target task dies. Only continued and exited can happen.
1595 * Clear notask_error if WCONTINUED | WEXITED. 1594 * Clear notask_error if WCONTINUED | WEXITED.
1596 */ 1595 */
1597 if (likely(!ptrace) || (wo->wo_flags & (WCONTINUED | WEXITED))) 1596 if (likely(!ptrace) || (wo->wo_flags & (WCONTINUED | WEXITED)))
1598 wo->notask_error = 0; 1597 wo->notask_error = 0;
1599 } else { 1598 } else {
1600 /* 1599 /*
1601 * If @p is ptraced by a task in its real parent's group, 1600 * If @p is ptraced by a task in its real parent's group,
1602 * hide group stop/continued state when looking at @p as 1601 * hide group stop/continued state when looking at @p as
1603 * the real parent; otherwise, a single stop can be 1602 * the real parent; otherwise, a single stop can be
1604 * reported twice as group and ptrace stops. 1603 * reported twice as group and ptrace stops.
1605 * 1604 *
1606 * If a ptracer wants to distinguish the two events for its 1605 * If a ptracer wants to distinguish the two events for its
1607 * own children, it should create a separate process which 1606 * own children, it should create a separate process which
1608 * takes the role of real parent. 1607 * takes the role of real parent.
1609 */ 1608 */
1610 if (likely(!ptrace) && task_ptrace(p) && 1609 if (likely(!ptrace) && task_ptrace(p) &&
1611 same_thread_group(p->parent, p->real_parent)) 1610 same_thread_group(p->parent, p->real_parent))
1612 return 0; 1611 return 0;
1613 1612
1614 /* 1613 /*
1615 * @p is alive and it's gonna stop, continue or exit, so 1614 * @p is alive and it's gonna stop, continue or exit, so
1616 * there always is something to wait for. 1615 * there always is something to wait for.
1617 */ 1616 */
1618 wo->notask_error = 0; 1617 wo->notask_error = 0;
1619 } 1618 }
1620 1619
1621 /* 1620 /*
1622 * Wait for stopped. Depending on @ptrace, different stopped state 1621 * Wait for stopped. Depending on @ptrace, different stopped state
1623 * is used and the two don't interact with each other. 1622 * is used and the two don't interact with each other.
1624 */ 1623 */
1625 ret = wait_task_stopped(wo, ptrace, p); 1624 ret = wait_task_stopped(wo, ptrace, p);
1626 if (ret) 1625 if (ret)
1627 return ret; 1626 return ret;
1628 1627
1629 /* 1628 /*
1630 * Wait for continued. There's only one continued state and the 1629 * Wait for continued. There's only one continued state and the
1631 * ptracer can consume it which can confuse the real parent. Don't 1630 * ptracer can consume it which can confuse the real parent. Don't
1632 * use WCONTINUED from ptracer. You don't need or want it. 1631 * use WCONTINUED from ptracer. You don't need or want it.
1633 */ 1632 */
1634 return wait_task_continued(wo, p); 1633 return wait_task_continued(wo, p);
1635 } 1634 }
1636 1635
1637 /* 1636 /*
1638 * Do the work of do_wait() for one thread in the group, @tsk. 1637 * Do the work of do_wait() for one thread in the group, @tsk.
1639 * 1638 *
1640 * -ECHILD should be in ->notask_error before the first call. 1639 * -ECHILD should be in ->notask_error before the first call.
1641 * Returns nonzero for a final return, when we have unlocked tasklist_lock. 1640 * Returns nonzero for a final return, when we have unlocked tasklist_lock.
1642 * Returns zero if the search for a child should continue; then 1641 * Returns zero if the search for a child should continue; then
1643 * ->notask_error is 0 if there were any eligible children, 1642 * ->notask_error is 0 if there were any eligible children,
1644 * or another error from security_task_wait(), or still -ECHILD. 1643 * or another error from security_task_wait(), or still -ECHILD.
1645 */ 1644 */
1646 static int do_wait_thread(struct wait_opts *wo, struct task_struct *tsk) 1645 static int do_wait_thread(struct wait_opts *wo, struct task_struct *tsk)
1647 { 1646 {
1648 struct task_struct *p; 1647 struct task_struct *p;
1649 1648
1650 list_for_each_entry(p, &tsk->children, sibling) { 1649 list_for_each_entry(p, &tsk->children, sibling) {
1651 int ret = wait_consider_task(wo, 0, p); 1650 int ret = wait_consider_task(wo, 0, p);
1652 if (ret) 1651 if (ret)
1653 return ret; 1652 return ret;
1654 } 1653 }
1655 1654
1656 return 0; 1655 return 0;
1657 } 1656 }
1658 1657
1659 static int ptrace_do_wait(struct wait_opts *wo, struct task_struct *tsk) 1658 static int ptrace_do_wait(struct wait_opts *wo, struct task_struct *tsk)
1660 { 1659 {
1661 struct task_struct *p; 1660 struct task_struct *p;
1662 1661
1663 list_for_each_entry(p, &tsk->ptraced, ptrace_entry) { 1662 list_for_each_entry(p, &tsk->ptraced, ptrace_entry) {
1664 int ret = wait_consider_task(wo, 1, p); 1663 int ret = wait_consider_task(wo, 1, p);
1665 if (ret) 1664 if (ret)
1666 return ret; 1665 return ret;
1667 } 1666 }
1668 1667
1669 return 0; 1668 return 0;
1670 } 1669 }
1671 1670
1672 static int child_wait_callback(wait_queue_t *wait, unsigned mode, 1671 static int child_wait_callback(wait_queue_t *wait, unsigned mode,
1673 int sync, void *key) 1672 int sync, void *key)
1674 { 1673 {
1675 struct wait_opts *wo = container_of(wait, struct wait_opts, 1674 struct wait_opts *wo = container_of(wait, struct wait_opts,
1676 child_wait); 1675 child_wait);
1677 struct task_struct *p = key; 1676 struct task_struct *p = key;
1678 1677
1679 if (!eligible_pid(wo, p)) 1678 if (!eligible_pid(wo, p))
1680 return 0; 1679 return 0;
1681 1680
1682 if ((wo->wo_flags & __WNOTHREAD) && wait->private != p->parent) 1681 if ((wo->wo_flags & __WNOTHREAD) && wait->private != p->parent)
1683 return 0; 1682 return 0;
1684 1683
1685 return default_wake_function(wait, mode, sync, key); 1684 return default_wake_function(wait, mode, sync, key);
1686 } 1685 }
1687 1686
1688 void __wake_up_parent(struct task_struct *p, struct task_struct *parent) 1687 void __wake_up_parent(struct task_struct *p, struct task_struct *parent)
1689 { 1688 {
1690 __wake_up_sync_key(&parent->signal->wait_chldexit, 1689 __wake_up_sync_key(&parent->signal->wait_chldexit,
1691 TASK_INTERRUPTIBLE, 1, p); 1690 TASK_INTERRUPTIBLE, 1, p);
1692 } 1691 }
1693 1692
1694 static long do_wait(struct wait_opts *wo) 1693 static long do_wait(struct wait_opts *wo)
1695 { 1694 {
1696 struct task_struct *tsk; 1695 struct task_struct *tsk;
1697 int retval; 1696 int retval;
1698 1697
1699 trace_sched_process_wait(wo->wo_pid); 1698 trace_sched_process_wait(wo->wo_pid);
1700 1699
1701 init_waitqueue_func_entry(&wo->child_wait, child_wait_callback); 1700 init_waitqueue_func_entry(&wo->child_wait, child_wait_callback);
1702 wo->child_wait.private = current; 1701 wo->child_wait.private = current;
1703 add_wait_queue(&current->signal->wait_chldexit, &wo->child_wait); 1702 add_wait_queue(&current->signal->wait_chldexit, &wo->child_wait);
1704 repeat: 1703 repeat:
1705 /* 1704 /*
1706 * If there is nothing that can match our criteria just get out. 1705 * If there is nothing that can match our criteria just get out.
1707 * We will clear ->notask_error to zero if we see any child that 1706 * We will clear ->notask_error to zero if we see any child that
1708 * might later match our criteria, even if we are not able to reap 1707 * might later match our criteria, even if we are not able to reap
1709 * it yet. 1708 * it yet.
1710 */ 1709 */
1711 wo->notask_error = -ECHILD; 1710 wo->notask_error = -ECHILD;
1712 if ((wo->wo_type < PIDTYPE_MAX) && 1711 if ((wo->wo_type < PIDTYPE_MAX) &&
1713 (!wo->wo_pid || hlist_empty(&wo->wo_pid->tasks[wo->wo_type]))) 1712 (!wo->wo_pid || hlist_empty(&wo->wo_pid->tasks[wo->wo_type])))
1714 goto notask; 1713 goto notask;
1715 1714
1716 set_current_state(TASK_INTERRUPTIBLE); 1715 set_current_state(TASK_INTERRUPTIBLE);
1717 read_lock(&tasklist_lock); 1716 read_lock(&tasklist_lock);
1718 tsk = current; 1717 tsk = current;
1719 do { 1718 do {
1720 retval = do_wait_thread(wo, tsk); 1719 retval = do_wait_thread(wo, tsk);
1721 if (retval) 1720 if (retval)
1722 goto end; 1721 goto end;
1723 1722
1724 retval = ptrace_do_wait(wo, tsk); 1723 retval = ptrace_do_wait(wo, tsk);
1725 if (retval) 1724 if (retval)
1726 goto end; 1725 goto end;
1727 1726
1728 if (wo->wo_flags & __WNOTHREAD) 1727 if (wo->wo_flags & __WNOTHREAD)
1729 break; 1728 break;
1730 } while_each_thread(current, tsk); 1729 } while_each_thread(current, tsk);
1731 read_unlock(&tasklist_lock); 1730 read_unlock(&tasklist_lock);
1732 1731
1733 notask: 1732 notask:
1734 retval = wo->notask_error; 1733 retval = wo->notask_error;
1735 if (!retval && !(wo->wo_flags & WNOHANG)) { 1734 if (!retval && !(wo->wo_flags & WNOHANG)) {
1736 retval = -ERESTARTSYS; 1735 retval = -ERESTARTSYS;
1737 if (!signal_pending(current)) { 1736 if (!signal_pending(current)) {
1738 schedule(); 1737 schedule();
1739 goto repeat; 1738 goto repeat;
1740 } 1739 }
1741 } 1740 }
1742 end: 1741 end:
1743 __set_current_state(TASK_RUNNING); 1742 __set_current_state(TASK_RUNNING);
1744 remove_wait_queue(&current->signal->wait_chldexit, &wo->child_wait); 1743 remove_wait_queue(&current->signal->wait_chldexit, &wo->child_wait);
1745 return retval; 1744 return retval;
1746 } 1745 }
1747 1746
1748 SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *, 1747 SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *,
1749 infop, int, options, struct rusage __user *, ru) 1748 infop, int, options, struct rusage __user *, ru)
1750 { 1749 {
1751 struct wait_opts wo; 1750 struct wait_opts wo;
1752 struct pid *pid = NULL; 1751 struct pid *pid = NULL;
1753 enum pid_type type; 1752 enum pid_type type;
1754 long ret; 1753 long ret;
1755 1754
1756 if (options & ~(WNOHANG|WNOWAIT|WEXITED|WSTOPPED|WCONTINUED)) 1755 if (options & ~(WNOHANG|WNOWAIT|WEXITED|WSTOPPED|WCONTINUED))
1757 return -EINVAL; 1756 return -EINVAL;
1758 if (!(options & (WEXITED|WSTOPPED|WCONTINUED))) 1757 if (!(options & (WEXITED|WSTOPPED|WCONTINUED)))
1759 return -EINVAL; 1758 return -EINVAL;
1760 1759
1761 switch (which) { 1760 switch (which) {
1762 case P_ALL: 1761 case P_ALL:
1763 type = PIDTYPE_MAX; 1762 type = PIDTYPE_MAX;
1764 break; 1763 break;
1765 case P_PID: 1764 case P_PID:
1766 type = PIDTYPE_PID; 1765 type = PIDTYPE_PID;
1767 if (upid <= 0) 1766 if (upid <= 0)
1768 return -EINVAL; 1767 return -EINVAL;
1769 break; 1768 break;
1770 case P_PGID: 1769 case P_PGID:
1771 type = PIDTYPE_PGID; 1770 type = PIDTYPE_PGID;
1772 if (upid <= 0) 1771 if (upid <= 0)
1773 return -EINVAL; 1772 return -EINVAL;
1774 break; 1773 break;
1775 default: 1774 default:
1776 return -EINVAL; 1775 return -EINVAL;
1777 } 1776 }
1778 1777
1779 if (type < PIDTYPE_MAX) 1778 if (type < PIDTYPE_MAX)
1780 pid = find_get_pid(upid); 1779 pid = find_get_pid(upid);
1781 1780
1782 wo.wo_type = type; 1781 wo.wo_type = type;
1783 wo.wo_pid = pid; 1782 wo.wo_pid = pid;
1784 wo.wo_flags = options; 1783 wo.wo_flags = options;
1785 wo.wo_info = infop; 1784 wo.wo_info = infop;
1786 wo.wo_stat = NULL; 1785 wo.wo_stat = NULL;
1787 wo.wo_rusage = ru; 1786 wo.wo_rusage = ru;
1788 ret = do_wait(&wo); 1787 ret = do_wait(&wo);
1789 1788
1790 if (ret > 0) { 1789 if (ret > 0) {
1791 ret = 0; 1790 ret = 0;
1792 } else if (infop) { 1791 } else if (infop) {
1793 /* 1792 /*
1794 * For a WNOHANG return, clear out all the fields 1793 * For a WNOHANG return, clear out all the fields
1795 * we would set so the user can easily tell the 1794 * we would set so the user can easily tell the
1796 * difference. 1795 * difference.
1797 */ 1796 */
1798 if (!ret) 1797 if (!ret)
1799 ret = put_user(0, &infop->si_signo); 1798 ret = put_user(0, &infop->si_signo);
1800 if (!ret) 1799 if (!ret)
1801 ret = put_user(0, &infop->si_errno); 1800 ret = put_user(0, &infop->si_errno);
1802 if (!ret) 1801 if (!ret)
1803 ret = put_user(0, &infop->si_code); 1802 ret = put_user(0, &infop->si_code);
1804 if (!ret) 1803 if (!ret)
1805 ret = put_user(0, &infop->si_pid); 1804 ret = put_user(0, &infop->si_pid);
1806 if (!ret) 1805 if (!ret)
1807 ret = put_user(0, &infop->si_uid); 1806 ret = put_user(0, &infop->si_uid);
1808 if (!ret) 1807 if (!ret)
1809 ret = put_user(0, &infop->si_status); 1808 ret = put_user(0, &infop->si_status);
1810 } 1809 }
1811 1810
1812 put_pid(pid); 1811 put_pid(pid);
1813 1812
1814 /* avoid REGPARM breakage on x86: */ 1813 /* avoid REGPARM breakage on x86: */
1815 asmlinkage_protect(5, ret, which, upid, infop, options, ru); 1814 asmlinkage_protect(5, ret, which, upid, infop, options, ru);
1816 return ret; 1815 return ret;
1817 } 1816 }
1818 1817
1819 SYSCALL_DEFINE4(wait4, pid_t, upid, int __user *, stat_addr, 1818 SYSCALL_DEFINE4(wait4, pid_t, upid, int __user *, stat_addr,
1820 int, options, struct rusage __user *, ru) 1819 int, options, struct rusage __user *, ru)
1821 { 1820 {
1822 struct wait_opts wo; 1821 struct wait_opts wo;
1823 struct pid *pid = NULL; 1822 struct pid *pid = NULL;
1824 enum pid_type type; 1823 enum pid_type type;
1825 long ret; 1824 long ret;
1826 1825
1827 if (options & ~(WNOHANG|WUNTRACED|WCONTINUED| 1826 if (options & ~(WNOHANG|WUNTRACED|WCONTINUED|
1828 __WNOTHREAD|__WCLONE|__WALL)) 1827 __WNOTHREAD|__WCLONE|__WALL))
1829 return -EINVAL; 1828 return -EINVAL;
1830 1829
1831 if (upid == -1) 1830 if (upid == -1)
1832 type = PIDTYPE_MAX; 1831 type = PIDTYPE_MAX;
1833 else if (upid < 0) { 1832 else if (upid < 0) {
1834 type = PIDTYPE_PGID; 1833 type = PIDTYPE_PGID;
1835 pid = find_get_pid(-upid); 1834 pid = find_get_pid(-upid);
1836 } else if (upid == 0) { 1835 } else if (upid == 0) {
1837 type = PIDTYPE_PGID; 1836 type = PIDTYPE_PGID;
1838 pid = get_task_pid(current, PIDTYPE_PGID); 1837 pid = get_task_pid(current, PIDTYPE_PGID);
1839 } else /* upid > 0 */ { 1838 } else /* upid > 0 */ {
1840 type = PIDTYPE_PID; 1839 type = PIDTYPE_PID;
1841 pid = find_get_pid(upid); 1840 pid = find_get_pid(upid);
1842 } 1841 }
1843 1842
1844 wo.wo_type = type; 1843 wo.wo_type = type;
1845 wo.wo_pid = pid; 1844 wo.wo_pid = pid;
1846 wo.wo_flags = options | WEXITED; 1845 wo.wo_flags = options | WEXITED;
1847 wo.wo_info = NULL; 1846 wo.wo_info = NULL;
1848 wo.wo_stat = stat_addr; 1847 wo.wo_stat = stat_addr;
1849 wo.wo_rusage = ru; 1848 wo.wo_rusage = ru;
1850 ret = do_wait(&wo); 1849 ret = do_wait(&wo);
1851 put_pid(pid); 1850 put_pid(pid);
1852 1851
1853 /* avoid REGPARM breakage on x86: */ 1852 /* avoid REGPARM breakage on x86: */
1854 asmlinkage_protect(4, ret, upid, stat_addr, options, ru); 1853 asmlinkage_protect(4, ret, upid, stat_addr, options, ru);
1855 return ret; 1854 return ret;
1856 } 1855 }
1857 1856
1858 #ifdef __ARCH_WANT_SYS_WAITPID 1857 #ifdef __ARCH_WANT_SYS_WAITPID
1859 1858
1860 /* 1859 /*
1861 * sys_waitpid() remains for compatibility. waitpid() should be 1860 * sys_waitpid() remains for compatibility. waitpid() should be
1862 * implemented by calling sys_wait4() from libc.a. 1861 * implemented by calling sys_wait4() from libc.a.
1863 */ 1862 */
1864 SYSCALL_DEFINE3(waitpid, pid_t, pid, int __user *, stat_addr, int, options) 1863 SYSCALL_DEFINE3(waitpid, pid_t, pid, int __user *, stat_addr, int, options)
1865 { 1864 {
1866 return sys_wait4(pid, stat_addr, options, NULL); 1865 return sys_wait4(pid, stat_addr, options, NULL);
1867 } 1866 }
1868 1867
1869 #endif 1868 #endif
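For reference, sys_waitid() above rejects any option bits outside WNOHANG|WNOWAIT|WEXITED|WSTOPPED|WCONTINUED and insists on at least one of WEXITED, WSTOPPED or WCONTINUED. A minimal userspace sketch of the matching calling convention (illustrative only, not part of this commit; report_child() is a made-up helper):

#define _XOPEN_SOURCE 700
#include <sys/types.h>
#include <sys/wait.h>
#include <stdio.h>

/* Reap one state change of 'child' and report how it ended up. */
int report_child(pid_t child)
{
	siginfo_t info;

	/* Mirrors the option mask checked at the top of sys_waitid(). */
	if (waitid(P_PID, child, &info, WEXITED | WSTOPPED | WCONTINUED) < 0)
		return -1;

	switch (info.si_code) {
	case CLD_EXITED:
		printf("exited, status %d\n", info.si_status);
		break;
	case CLD_KILLED:
	case CLD_DUMPED:
		printf("killed by signal %d\n", info.si_status);
		break;
	case CLD_STOPPED:
	case CLD_TRAPPED:
		printf("stopped by signal %d\n", info.si_status);
		break;
	case CLD_CONTINUED:
		printf("continued\n");
		break;
	}
	return 0;
}

The si_code/si_status pair seen by userspace here is exactly what wait_task_zombie(), wait_task_stopped() and wait_task_continued() above copy out through infop.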
1870 1869
kernel/pid.c
1 /* 1 /*
2 * Generic pidhash and scalable, time-bounded PID allocator 2 * Generic pidhash and scalable, time-bounded PID allocator
3 * 3 *
4 * (C) 2002-2003 William Irwin, IBM 4 * (C) 2002-2003 William Irwin, IBM
5 * (C) 2004 William Irwin, Oracle 5 * (C) 2004 William Irwin, Oracle
6 * (C) 2002-2004 Ingo Molnar, Red Hat 6 * (C) 2002-2004 Ingo Molnar, Red Hat
7 * 7 *
8 * pid-structures are backing objects for tasks sharing a given ID to chain 8 * pid-structures are backing objects for tasks sharing a given ID to chain
9 * against. There is very little to them aside from hashing them and 9 * against. There is very little to them aside from hashing them and
10 * parking tasks using given ID's on a list. 10 * parking tasks using given ID's on a list.
11 * 11 *
12 * The hash is always changed with the tasklist_lock write-acquired, 12 * The hash is always changed with the tasklist_lock write-acquired,
13 * and the hash is only accessed with the tasklist_lock at least 13 * and the hash is only accessed with the tasklist_lock at least
14 * read-acquired, so there's no additional SMP locking needed here. 14 * read-acquired, so there's no additional SMP locking needed here.
15 * 15 *
16 * We have a list of bitmap pages, which bitmaps represent the PID space. 16 * We have a list of bitmap pages, which bitmaps represent the PID space.
17 * Allocating and freeing PIDs is completely lockless. The worst-case 17 * Allocating and freeing PIDs is completely lockless. The worst-case
18 * allocation scenario when all but one out of 1 million PIDs possible are 18 * allocation scenario when all but one out of 1 million PIDs possible are
19 * allocated already: the scanning of 32 list entries and at most PAGE_SIZE 19 * allocated already: the scanning of 32 list entries and at most PAGE_SIZE
20 * bytes. The typical fastpath is a single successful setbit. Freeing is O(1). 20 * bytes. The typical fastpath is a single successful setbit. Freeing is O(1).
21 * 21 *
22 * Pid namespaces: 22 * Pid namespaces:
23 * (C) 2007 Pavel Emelyanov <xemul@openvz.org>, OpenVZ, SWsoft Inc. 23 * (C) 2007 Pavel Emelyanov <xemul@openvz.org>, OpenVZ, SWsoft Inc.
24 * (C) 2007 Sukadev Bhattiprolu <sukadev@us.ibm.com>, IBM 24 * (C) 2007 Sukadev Bhattiprolu <sukadev@us.ibm.com>, IBM
25 * Many thanks to Oleg Nesterov for comments and help 25 * Many thanks to Oleg Nesterov for comments and help
26 * 26 *
27 */ 27 */
28 28
29 #include <linux/mm.h> 29 #include <linux/mm.h>
30 #include <linux/module.h> 30 #include <linux/module.h>
31 #include <linux/slab.h> 31 #include <linux/slab.h>
32 #include <linux/init.h> 32 #include <linux/init.h>
33 #include <linux/rculist.h> 33 #include <linux/rculist.h>
34 #include <linux/bootmem.h> 34 #include <linux/bootmem.h>
35 #include <linux/hash.h> 35 #include <linux/hash.h>
36 #include <linux/pid_namespace.h> 36 #include <linux/pid_namespace.h>
37 #include <linux/init_task.h> 37 #include <linux/init_task.h>
38 #include <linux/syscalls.h> 38 #include <linux/syscalls.h>
39 39
40 #define pid_hashfn(nr, ns) \ 40 #define pid_hashfn(nr, ns) \
41 hash_long((unsigned long)nr + (unsigned long)ns, pidhash_shift) 41 hash_long((unsigned long)nr + (unsigned long)ns, pidhash_shift)
42 static struct hlist_head *pid_hash; 42 static struct hlist_head *pid_hash;
43 static unsigned int pidhash_shift = 4; 43 static unsigned int pidhash_shift = 4;
44 struct pid init_struct_pid = INIT_STRUCT_PID; 44 struct pid init_struct_pid = INIT_STRUCT_PID;
45 45
46 int pid_max = PID_MAX_DEFAULT; 46 int pid_max = PID_MAX_DEFAULT;
47 47
48 #define RESERVED_PIDS 300 48 #define RESERVED_PIDS 300
49 49
50 int pid_max_min = RESERVED_PIDS + 1; 50 int pid_max_min = RESERVED_PIDS + 1;
51 int pid_max_max = PID_MAX_LIMIT; 51 int pid_max_max = PID_MAX_LIMIT;
52 52
53 #define BITS_PER_PAGE (PAGE_SIZE*8) 53 #define BITS_PER_PAGE (PAGE_SIZE*8)
54 #define BITS_PER_PAGE_MASK (BITS_PER_PAGE-1) 54 #define BITS_PER_PAGE_MASK (BITS_PER_PAGE-1)
55 55
56 static inline int mk_pid(struct pid_namespace *pid_ns, 56 static inline int mk_pid(struct pid_namespace *pid_ns,
57 struct pidmap *map, int off) 57 struct pidmap *map, int off)
58 { 58 {
59 return (map - pid_ns->pidmap)*BITS_PER_PAGE + off; 59 return (map - pid_ns->pidmap)*BITS_PER_PAGE + off;
60 } 60 }
61 61
62 #define find_next_offset(map, off) \ 62 #define find_next_offset(map, off) \
63 find_next_zero_bit((map)->page, BITS_PER_PAGE, off) 63 find_next_zero_bit((map)->page, BITS_PER_PAGE, off)
64 64
65 /* 65 /*
66 * PID-map pages start out as NULL, they get allocated upon 66 * PID-map pages start out as NULL, they get allocated upon
67 * first use and are never deallocated. This way a low pid_max 67 * first use and are never deallocated. This way a low pid_max
68 * value does not cause lots of bitmaps to be allocated, but 68 * value does not cause lots of bitmaps to be allocated, but
69 * the scheme scales to up to 4 million PIDs, runtime. 69 * the scheme scales to up to 4 million PIDs, runtime.
70 */ 70 */
71 struct pid_namespace init_pid_ns = { 71 struct pid_namespace init_pid_ns = {
72 .kref = { 72 .kref = {
73 .refcount = ATOMIC_INIT(2), 73 .refcount = ATOMIC_INIT(2),
74 }, 74 },
75 .pidmap = { 75 .pidmap = {
76 [ 0 ... PIDMAP_ENTRIES-1] = { ATOMIC_INIT(BITS_PER_PAGE), NULL } 76 [ 0 ... PIDMAP_ENTRIES-1] = { ATOMIC_INIT(BITS_PER_PAGE), NULL }
77 }, 77 },
78 .last_pid = 0, 78 .last_pid = 0,
79 .level = 0, 79 .level = 0,
80 .child_reaper = &init_task, 80 .child_reaper = &init_task,
81 }; 81 };
82 EXPORT_SYMBOL_GPL(init_pid_ns); 82 EXPORT_SYMBOL_GPL(init_pid_ns);
83 83
84 int is_container_init(struct task_struct *tsk) 84 int is_container_init(struct task_struct *tsk)
85 { 85 {
86 int ret = 0; 86 int ret = 0;
87 struct pid *pid; 87 struct pid *pid;
88 88
89 rcu_read_lock(); 89 rcu_read_lock();
90 pid = task_pid(tsk); 90 pid = task_pid(tsk);
91 if (pid != NULL && pid->numbers[pid->level].nr == 1) 91 if (pid != NULL && pid->numbers[pid->level].nr == 1)
92 ret = 1; 92 ret = 1;
93 rcu_read_unlock(); 93 rcu_read_unlock();
94 94
95 return ret; 95 return ret;
96 } 96 }
97 EXPORT_SYMBOL(is_container_init); 97 EXPORT_SYMBOL(is_container_init);
98 98
99 /* 99 /*
100 * Note: disable interrupts while the pidmap_lock is held as an 100 * Note: disable interrupts while the pidmap_lock is held as an
101 * interrupt might come in and do read_lock(&tasklist_lock). 101 * interrupt might come in and do read_lock(&tasklist_lock).
102 * 102 *
103 * If we don't disable interrupts there is a nasty deadlock between 103 * If we don't disable interrupts there is a nasty deadlock between
104 * detach_pid()->free_pid() and another cpu that does 104 * detach_pid()->free_pid() and another cpu that does
105 * spin_lock(&pidmap_lock) followed by an interrupt routine that does 105 * spin_lock(&pidmap_lock) followed by an interrupt routine that does
106 * read_lock(&tasklist_lock); 106 * read_lock(&tasklist_lock);
107 * 107 *
108 * After we clean up the tasklist_lock and know there are no 108 * After we clean up the tasklist_lock and know there are no
109 * irq handlers that take it we can leave the interrupts enabled. 109 * irq handlers that take it we can leave the interrupts enabled.
110 * For now it is easier to be safe than to prove it can't happen. 110 * For now it is easier to be safe than to prove it can't happen.
111 */ 111 */
112 112
113 static __cacheline_aligned_in_smp DEFINE_SPINLOCK(pidmap_lock); 113 static __cacheline_aligned_in_smp DEFINE_SPINLOCK(pidmap_lock);
114 114
115 static void free_pidmap(struct upid *upid) 115 static void free_pidmap(struct upid *upid)
116 { 116 {
117 int nr = upid->nr; 117 int nr = upid->nr;
118 struct pidmap *map = upid->ns->pidmap + nr / BITS_PER_PAGE; 118 struct pidmap *map = upid->ns->pidmap + nr / BITS_PER_PAGE;
119 int offset = nr & BITS_PER_PAGE_MASK; 119 int offset = nr & BITS_PER_PAGE_MASK;
120 120
121 clear_bit(offset, map->page); 121 clear_bit(offset, map->page);
122 atomic_inc(&map->nr_free); 122 atomic_inc(&map->nr_free);
123 } 123 }
124 124
125 /* 125 /*
126 * If we started walking pids at 'base', is 'a' seen before 'b'? 126 * If we started walking pids at 'base', is 'a' seen before 'b'?
127 */ 127 */
128 static int pid_before(int base, int a, int b) 128 static int pid_before(int base, int a, int b)
129 { 129 {
130 /* 130 /*
131 * This is the same as saying 131 * This is the same as saying
132 * 132 *
133 * (a - base + MAXUINT) % MAXUINT < (b - base + MAXUINT) % MAXUINT 133 * (a - base + MAXUINT) % MAXUINT < (b - base + MAXUINT) % MAXUINT
134 * and that mapping orders 'a' and 'b' with respect to 'base'. 134 * and that mapping orders 'a' and 'b' with respect to 'base'.
135 */ 135 */
136 return (unsigned)(a - base) < (unsigned)(b - base); 136 return (unsigned)(a - base) < (unsigned)(b - base);
137 } 137 }
138 138
139 /* 139 /*
140 * We might be racing with someone else trying to set pid_ns->last_pid. 140 * We might be racing with someone else trying to set pid_ns->last_pid.
141 * We want the winner to have the "later" value, because if the 141 * We want the winner to have the "later" value, because if the
142 * "earlier" value prevails, then a pid may get reused immediately. 142 * "earlier" value prevails, then a pid may get reused immediately.
143 * 143 *
144 * Since pids rollover, it is not sufficient to just pick the bigger 144 * Since pids rollover, it is not sufficient to just pick the bigger
145 * value. We have to consider where we started counting from. 145 * value. We have to consider where we started counting from.
146 * 146 *
147 * 'base' is the value of pid_ns->last_pid that we observed when 147 * 'base' is the value of pid_ns->last_pid that we observed when
148 * we started looking for a pid. 148 * we started looking for a pid.
149 * 149 *
150 * 'pid' is the pid that we eventually found. 150 * 'pid' is the pid that we eventually found.
151 */ 151 */
152 static void set_last_pid(struct pid_namespace *pid_ns, int base, int pid) 152 static void set_last_pid(struct pid_namespace *pid_ns, int base, int pid)
153 { 153 {
154 int prev; 154 int prev;
155 int last_write = base; 155 int last_write = base;
156 do { 156 do {
157 prev = last_write; 157 prev = last_write;
158 last_write = cmpxchg(&pid_ns->last_pid, prev, pid); 158 last_write = cmpxchg(&pid_ns->last_pid, prev, pid);
159 } while ((prev != last_write) && (pid_before(base, last_write, pid))); 159 } while ((prev != last_write) && (pid_before(base, last_write, pid)));
160 } 160 }
161 161
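The comment above set_last_pid() is the crux of the cmpxchg loop: "later" is defined relative to the base where the scan started, not by numeric value. A standalone sketch of the same pid_before() comparison (hypothetical pid values, assuming the default pid_max of 32768 and RESERVED_PIDS of 300) shows how the unsigned subtraction handles rollover:

#include <assert.h>

/* Same comparison as pid_before() in the hunk above. */
static int pid_before(int base, int a, int b)
{
	return (unsigned)(a - base) < (unsigned)(b - base);
}

int main(void)
{
	/* No rollover: ordinary ordering relative to base. */
	assert(pid_before(100, 150, 200));

	/*
	 * Rollover: a scan that starts at 31990 reaches 32000 before it
	 * wraps back to RESERVED_PIDS and reaches 301, so 32000 counts
	 * as "before" 301 even though it is numerically larger.  A naive
	 * a < b test would invert this and let the earlier pid win.
	 */
	assert(pid_before(31990, 32000, 301));
	assert(!pid_before(31990, 301, 32000));
	return 0;
}

set_last_pid() retries only while its cmpxchg lost the race and the value already stored is still "before" the pid it allocated, so the later of two racing allocations ends up in last_pid.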
162 static int alloc_pidmap(struct pid_namespace *pid_ns) 162 static int alloc_pidmap(struct pid_namespace *pid_ns)
163 { 163 {
164 int i, offset, max_scan, pid, last = pid_ns->last_pid; 164 int i, offset, max_scan, pid, last = pid_ns->last_pid;
165 struct pidmap *map; 165 struct pidmap *map;
166 166
167 pid = last + 1; 167 pid = last + 1;
168 if (pid >= pid_max) 168 if (pid >= pid_max)
169 pid = RESERVED_PIDS; 169 pid = RESERVED_PIDS;
170 offset = pid & BITS_PER_PAGE_MASK; 170 offset = pid & BITS_PER_PAGE_MASK;
171 map = &pid_ns->pidmap[pid/BITS_PER_PAGE]; 171 map = &pid_ns->pidmap[pid/BITS_PER_PAGE];
172 /* 172 /*
173 * If last_pid points into the middle of the map->page we 173 * If last_pid points into the middle of the map->page we
174 * want to scan this bitmap block twice, the second time 174 * want to scan this bitmap block twice, the second time
175 * we start with offset == 0 (or RESERVED_PIDS). 175 * we start with offset == 0 (or RESERVED_PIDS).
176 */ 176 */
177 max_scan = DIV_ROUND_UP(pid_max, BITS_PER_PAGE) - !offset; 177 max_scan = DIV_ROUND_UP(pid_max, BITS_PER_PAGE) - !offset;
178 for (i = 0; i <= max_scan; ++i) { 178 for (i = 0; i <= max_scan; ++i) {
179 if (unlikely(!map->page)) { 179 if (unlikely(!map->page)) {
180 void *page = kzalloc(PAGE_SIZE, GFP_KERNEL); 180 void *page = kzalloc(PAGE_SIZE, GFP_KERNEL);
181 /* 181 /*
182 * Free the page if someone raced with us 182 * Free the page if someone raced with us
183 * installing it: 183 * installing it:
184 */ 184 */
185 spin_lock_irq(&pidmap_lock); 185 spin_lock_irq(&pidmap_lock);
186 if (!map->page) { 186 if (!map->page) {
187 map->page = page; 187 map->page = page;
188 page = NULL; 188 page = NULL;
189 } 189 }
190 spin_unlock_irq(&pidmap_lock); 190 spin_unlock_irq(&pidmap_lock);
191 kfree(page); 191 kfree(page);
192 if (unlikely(!map->page)) 192 if (unlikely(!map->page))
193 break; 193 break;
194 } 194 }
195 if (likely(atomic_read(&map->nr_free))) { 195 if (likely(atomic_read(&map->nr_free))) {
196 do { 196 do {
197 if (!test_and_set_bit(offset, map->page)) { 197 if (!test_and_set_bit(offset, map->page)) {
198 atomic_dec(&map->nr_free); 198 atomic_dec(&map->nr_free);
199 set_last_pid(pid_ns, last, pid); 199 set_last_pid(pid_ns, last, pid);
200 return pid; 200 return pid;
201 } 201 }
202 offset = find_next_offset(map, offset); 202 offset = find_next_offset(map, offset);
203 pid = mk_pid(pid_ns, map, offset); 203 pid = mk_pid(pid_ns, map, offset);
204 } while (offset < BITS_PER_PAGE && pid < pid_max); 204 } while (offset < BITS_PER_PAGE && pid < pid_max);
205 } 205 }
206 if (map < &pid_ns->pidmap[(pid_max-1)/BITS_PER_PAGE]) { 206 if (map < &pid_ns->pidmap[(pid_max-1)/BITS_PER_PAGE]) {
207 ++map; 207 ++map;
208 offset = 0; 208 offset = 0;
209 } else { 209 } else {
210 map = &pid_ns->pidmap[0]; 210 map = &pid_ns->pidmap[0];
211 offset = RESERVED_PIDS; 211 offset = RESERVED_PIDS;
212 if (unlikely(last == offset)) 212 if (unlikely(last == offset))
213 break; 213 break;
214 } 214 }
215 pid = mk_pid(pid_ns, map, offset); 215 pid = mk_pid(pid_ns, map, offset);
216 } 216 }
217 return -1; 217 return -1;
218 } 218 }
219 219
220 int next_pidmap(struct pid_namespace *pid_ns, unsigned int last) 220 int next_pidmap(struct pid_namespace *pid_ns, unsigned int last)
221 { 221 {
222 int offset; 222 int offset;
223 struct pidmap *map, *end; 223 struct pidmap *map, *end;
224 224
225 if (last >= PID_MAX_LIMIT) 225 if (last >= PID_MAX_LIMIT)
226 return -1; 226 return -1;
227 227
228 offset = (last + 1) & BITS_PER_PAGE_MASK; 228 offset = (last + 1) & BITS_PER_PAGE_MASK;
229 map = &pid_ns->pidmap[(last + 1)/BITS_PER_PAGE]; 229 map = &pid_ns->pidmap[(last + 1)/BITS_PER_PAGE];
230 end = &pid_ns->pidmap[PIDMAP_ENTRIES]; 230 end = &pid_ns->pidmap[PIDMAP_ENTRIES];
231 for (; map < end; map++, offset = 0) { 231 for (; map < end; map++, offset = 0) {
232 if (unlikely(!map->page)) 232 if (unlikely(!map->page))
233 continue; 233 continue;
234 offset = find_next_bit((map)->page, BITS_PER_PAGE, offset); 234 offset = find_next_bit((map)->page, BITS_PER_PAGE, offset);
235 if (offset < BITS_PER_PAGE) 235 if (offset < BITS_PER_PAGE)
236 return mk_pid(pid_ns, map, offset); 236 return mk_pid(pid_ns, map, offset);
237 } 237 }
238 return -1; 238 return -1;
239 } 239 }
240 240
241 void put_pid(struct pid *pid) 241 void put_pid(struct pid *pid)
242 { 242 {
243 struct pid_namespace *ns; 243 struct pid_namespace *ns;
244 244
245 if (!pid) 245 if (!pid)
246 return; 246 return;
247 247
248 ns = pid->numbers[pid->level].ns; 248 ns = pid->numbers[pid->level].ns;
249 if ((atomic_read(&pid->count) == 1) || 249 if ((atomic_read(&pid->count) == 1) ||
250 atomic_dec_and_test(&pid->count)) { 250 atomic_dec_and_test(&pid->count)) {
251 kmem_cache_free(ns->pid_cachep, pid); 251 kmem_cache_free(ns->pid_cachep, pid);
252 put_pid_ns(ns); 252 put_pid_ns(ns);
253 } 253 }
254 } 254 }
255 EXPORT_SYMBOL_GPL(put_pid); 255 EXPORT_SYMBOL_GPL(put_pid);
256 256
257 static void delayed_put_pid(struct rcu_head *rhp) 257 static void delayed_put_pid(struct rcu_head *rhp)
258 { 258 {
259 struct pid *pid = container_of(rhp, struct pid, rcu); 259 struct pid *pid = container_of(rhp, struct pid, rcu);
260 put_pid(pid); 260 put_pid(pid);
261 } 261 }
262 262
263 void free_pid(struct pid *pid) 263 void free_pid(struct pid *pid)
264 { 264 {
265 /* We can be called with write_lock_irq(&tasklist_lock) held */ 265 /* We can be called with write_lock_irq(&tasklist_lock) held */
266 int i; 266 int i;
267 unsigned long flags; 267 unsigned long flags;
268 268
269 spin_lock_irqsave(&pidmap_lock, flags); 269 spin_lock_irqsave(&pidmap_lock, flags);
270 for (i = 0; i <= pid->level; i++) 270 for (i = 0; i <= pid->level; i++)
271 hlist_del_rcu(&pid->numbers[i].pid_chain); 271 hlist_del_rcu(&pid->numbers[i].pid_chain);
272 spin_unlock_irqrestore(&pidmap_lock, flags); 272 spin_unlock_irqrestore(&pidmap_lock, flags);
273 273
274 for (i = 0; i <= pid->level; i++) 274 for (i = 0; i <= pid->level; i++)
275 free_pidmap(pid->numbers + i); 275 free_pidmap(pid->numbers + i);
276 276
277 call_rcu(&pid->rcu, delayed_put_pid); 277 call_rcu(&pid->rcu, delayed_put_pid);
278 } 278 }
279 279
280 struct pid *alloc_pid(struct pid_namespace *ns) 280 struct pid *alloc_pid(struct pid_namespace *ns)
281 { 281 {
282 struct pid *pid; 282 struct pid *pid;
283 enum pid_type type; 283 enum pid_type type;
284 int i, nr; 284 int i, nr;
285 struct pid_namespace *tmp; 285 struct pid_namespace *tmp;
286 struct upid *upid; 286 struct upid *upid;
287 287
288 pid = kmem_cache_alloc(ns->pid_cachep, GFP_KERNEL); 288 pid = kmem_cache_alloc(ns->pid_cachep, GFP_KERNEL);
289 if (!pid) 289 if (!pid)
290 goto out; 290 goto out;
291 291
292 tmp = ns; 292 tmp = ns;
293 for (i = ns->level; i >= 0; i--) { 293 for (i = ns->level; i >= 0; i--) {
294 nr = alloc_pidmap(tmp); 294 nr = alloc_pidmap(tmp);
295 if (nr < 0) 295 if (nr < 0)
296 goto out_free; 296 goto out_free;
297 297
298 pid->numbers[i].nr = nr; 298 pid->numbers[i].nr = nr;
299 pid->numbers[i].ns = tmp; 299 pid->numbers[i].ns = tmp;
300 tmp = tmp->parent; 300 tmp = tmp->parent;
301 } 301 }
302 302
303 get_pid_ns(ns); 303 get_pid_ns(ns);
304 pid->level = ns->level; 304 pid->level = ns->level;
305 atomic_set(&pid->count, 1); 305 atomic_set(&pid->count, 1);
306 for (type = 0; type < PIDTYPE_MAX; ++type) 306 for (type = 0; type < PIDTYPE_MAX; ++type)
307 INIT_HLIST_HEAD(&pid->tasks[type]); 307 INIT_HLIST_HEAD(&pid->tasks[type]);
308 308
309 upid = pid->numbers + ns->level; 309 upid = pid->numbers + ns->level;
310 spin_lock_irq(&pidmap_lock); 310 spin_lock_irq(&pidmap_lock);
311 for ( ; upid >= pid->numbers; --upid) 311 for ( ; upid >= pid->numbers; --upid)
312 hlist_add_head_rcu(&upid->pid_chain, 312 hlist_add_head_rcu(&upid->pid_chain,
313 &pid_hash[pid_hashfn(upid->nr, upid->ns)]); 313 &pid_hash[pid_hashfn(upid->nr, upid->ns)]);
314 spin_unlock_irq(&pidmap_lock); 314 spin_unlock_irq(&pidmap_lock);
315 315
316 out: 316 out:
317 return pid; 317 return pid;
318 318
319 out_free: 319 out_free:
320 while (++i <= ns->level) 320 while (++i <= ns->level)
321 free_pidmap(pid->numbers + i); 321 free_pidmap(pid->numbers + i);
322 322
323 kmem_cache_free(ns->pid_cachep, pid); 323 kmem_cache_free(ns->pid_cachep, pid);
324 pid = NULL; 324 pid = NULL;
325 goto out; 325 goto out;
326 } 326 }
327 327
328 struct pid *find_pid_ns(int nr, struct pid_namespace *ns) 328 struct pid *find_pid_ns(int nr, struct pid_namespace *ns)
329 { 329 {
330 struct hlist_node *elem; 330 struct hlist_node *elem;
331 struct upid *pnr; 331 struct upid *pnr;
332 332
333 hlist_for_each_entry_rcu(pnr, elem, 333 hlist_for_each_entry_rcu(pnr, elem,
334 &pid_hash[pid_hashfn(nr, ns)], pid_chain) 334 &pid_hash[pid_hashfn(nr, ns)], pid_chain)
335 if (pnr->nr == nr && pnr->ns == ns) 335 if (pnr->nr == nr && pnr->ns == ns)
336 return container_of(pnr, struct pid, 336 return container_of(pnr, struct pid,
337 numbers[ns->level]); 337 numbers[ns->level]);
338 338
339 return NULL; 339 return NULL;
340 } 340 }
341 EXPORT_SYMBOL_GPL(find_pid_ns); 341 EXPORT_SYMBOL_GPL(find_pid_ns);
342 342
343 struct pid *find_vpid(int nr) 343 struct pid *find_vpid(int nr)
344 { 344 {
345 return find_pid_ns(nr, current->nsproxy->pid_ns); 345 return find_pid_ns(nr, current->nsproxy->pid_ns);
346 } 346 }
347 EXPORT_SYMBOL_GPL(find_vpid); 347 EXPORT_SYMBOL_GPL(find_vpid);
348 348
349 /* 349 /*
350 * attach_pid() must be called with the tasklist_lock write-held. 350 * attach_pid() must be called with the tasklist_lock write-held.
351 */ 351 */
352 void attach_pid(struct task_struct *task, enum pid_type type, 352 void attach_pid(struct task_struct *task, enum pid_type type,
353 struct pid *pid) 353 struct pid *pid)
354 { 354 {
355 struct pid_link *link; 355 struct pid_link *link;
356 356
357 link = &task->pids[type]; 357 link = &task->pids[type];
358 link->pid = pid; 358 link->pid = pid;
359 hlist_add_head_rcu(&link->node, &pid->tasks[type]); 359 hlist_add_head_rcu(&link->node, &pid->tasks[type]);
360 } 360 }
361 361
362 static void __change_pid(struct task_struct *task, enum pid_type type, 362 static void __change_pid(struct task_struct *task, enum pid_type type,
363 struct pid *new) 363 struct pid *new)
364 { 364 {
365 struct pid_link *link; 365 struct pid_link *link;
366 struct pid *pid; 366 struct pid *pid;
367 int tmp; 367 int tmp;
368 368
369 link = &task->pids[type]; 369 link = &task->pids[type];
370 pid = link->pid; 370 pid = link->pid;
371 371
372 hlist_del_rcu(&link->node); 372 hlist_del_rcu(&link->node);
373 link->pid = new; 373 link->pid = new;
374 374
375 for (tmp = PIDTYPE_MAX; --tmp >= 0; ) 375 for (tmp = PIDTYPE_MAX; --tmp >= 0; )
376 if (!hlist_empty(&pid->tasks[tmp])) 376 if (!hlist_empty(&pid->tasks[tmp]))
377 return; 377 return;
378 378
379 free_pid(pid); 379 free_pid(pid);
380 } 380 }
381 381
382 void detach_pid(struct task_struct *task, enum pid_type type) 382 void detach_pid(struct task_struct *task, enum pid_type type)
383 { 383 {
384 __change_pid(task, type, NULL); 384 __change_pid(task, type, NULL);
385 } 385 }
386 386
387 void change_pid(struct task_struct *task, enum pid_type type, 387 void change_pid(struct task_struct *task, enum pid_type type,
388 struct pid *pid) 388 struct pid *pid)
389 { 389 {
390 __change_pid(task, type, pid); 390 __change_pid(task, type, pid);
391 attach_pid(task, type, pid); 391 attach_pid(task, type, pid);
392 } 392 }
393 393
394 /* transfer_pid is an optimization of attach_pid(new), detach_pid(old) */ 394 /* transfer_pid is an optimization of attach_pid(new), detach_pid(old) */
395 void transfer_pid(struct task_struct *old, struct task_struct *new, 395 void transfer_pid(struct task_struct *old, struct task_struct *new,
396 enum pid_type type) 396 enum pid_type type)
397 { 397 {
398 new->pids[type].pid = old->pids[type].pid; 398 new->pids[type].pid = old->pids[type].pid;
399 hlist_replace_rcu(&old->pids[type].node, &new->pids[type].node); 399 hlist_replace_rcu(&old->pids[type].node, &new->pids[type].node);
400 } 400 }
401 401
402 struct task_struct *pid_task(struct pid *pid, enum pid_type type) 402 struct task_struct *pid_task(struct pid *pid, enum pid_type type)
403 { 403 {
404 struct task_struct *result = NULL; 404 struct task_struct *result = NULL;
405 if (pid) { 405 if (pid) {
406 struct hlist_node *first; 406 struct hlist_node *first;
407 first = rcu_dereference_check(hlist_first_rcu(&pid->tasks[type]), 407 first = rcu_dereference_check(hlist_first_rcu(&pid->tasks[type]),
408 rcu_read_lock_held() ||
409 lockdep_tasklist_lock_is_held()); 408 lockdep_tasklist_lock_is_held());
410 if (first) 409 if (first)
411 result = hlist_entry(first, struct task_struct, pids[(type)].node); 410 result = hlist_entry(first, struct task_struct, pids[(type)].node);
412 } 411 }
413 return result; 412 return result;
414 } 413 }
415 EXPORT_SYMBOL(pid_task); 414 EXPORT_SYMBOL(pid_task);
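
For context on the hunk above: the caller no longer spells out rcu_read_lock_held() and only supplies the additional lockdep condition. A minimal caller-side sketch of that pattern follows; struct foo, gp, my_lock and read_val() are hypothetical names used for illustration only and are not part of this patch.

	#include <linux/rcupdate.h>
	#include <linux/spinlock.h>

	struct foo {
		int val;
	};

	static struct foo __rcu *gp;		/* hypothetical RCU-protected pointer */
	static DEFINE_SPINLOCK(my_lock);	/* hypothetical lock also protecting gp */

	static int read_val(void)
	{
		struct foo *p;
		int val = -1;

		rcu_read_lock();
		/*
		 * rcu_read_lock_held() is implied by rcu_dereference_check()
		 * itself, so only the extra lockdep condition is passed.
		 */
		p = rcu_dereference_check(gp, lockdep_is_held(&my_lock));
		if (p)
			val = p->val;
		rcu_read_unlock();
		return val;
	}
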
416 415
417 /* 416 /*
418 * Must be called under rcu_read_lock(). 417 * Must be called under rcu_read_lock().
419 */ 418 */
420 struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns) 419 struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns)
421 { 420 {
422 rcu_lockdep_assert(rcu_read_lock_held()); 421 rcu_lockdep_assert(rcu_read_lock_held());
423 return pid_task(find_pid_ns(nr, ns), PIDTYPE_PID); 422 return pid_task(find_pid_ns(nr, ns), PIDTYPE_PID);
424 } 423 }
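
As the comment above says, find_task_by_pid_ns() relies on the caller's rcu_read_lock(). A usage sketch follows the same pattern as get_pid_task() further down in this file; grab_task_by_pid_ns() is a hypothetical helper, not part of this patch.

	#include <linux/sched.h>
	#include <linux/pid_namespace.h>
	#include <linux/rcupdate.h>

	static struct task_struct *grab_task_by_pid_ns(pid_t nr,
						       struct pid_namespace *ns)
	{
		struct task_struct *tsk;

		rcu_read_lock();
		tsk = find_task_by_pid_ns(nr, ns);
		if (tsk)
			get_task_struct(tsk);	/* pin before leaving the RCU section */
		rcu_read_unlock();
		return tsk;
	}
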
425 424
426 struct task_struct *find_task_by_vpid(pid_t vnr) 425 struct task_struct *find_task_by_vpid(pid_t vnr)
427 { 426 {
428 return find_task_by_pid_ns(vnr, current->nsproxy->pid_ns); 427 return find_task_by_pid_ns(vnr, current->nsproxy->pid_ns);
429 } 428 }
430 429
431 struct pid *get_task_pid(struct task_struct *task, enum pid_type type) 430 struct pid *get_task_pid(struct task_struct *task, enum pid_type type)
432 { 431 {
433 struct pid *pid; 432 struct pid *pid;
434 rcu_read_lock(); 433 rcu_read_lock();
435 if (type != PIDTYPE_PID) 434 if (type != PIDTYPE_PID)
436 task = task->group_leader; 435 task = task->group_leader;
437 pid = get_pid(task->pids[type].pid); 436 pid = get_pid(task->pids[type].pid);
438 rcu_read_unlock(); 437 rcu_read_unlock();
439 return pid; 438 return pid;
440 } 439 }
441 EXPORT_SYMBOL_GPL(get_task_pid); 440 EXPORT_SYMBOL_GPL(get_task_pid);
442 441
443 struct task_struct *get_pid_task(struct pid *pid, enum pid_type type) 442 struct task_struct *get_pid_task(struct pid *pid, enum pid_type type)
444 { 443 {
445 struct task_struct *result; 444 struct task_struct *result;
446 rcu_read_lock(); 445 rcu_read_lock();
447 result = pid_task(pid, type); 446 result = pid_task(pid, type);
448 if (result) 447 if (result)
449 get_task_struct(result); 448 get_task_struct(result);
450 rcu_read_unlock(); 449 rcu_read_unlock();
451 return result; 450 return result;
452 } 451 }
453 EXPORT_SYMBOL_GPL(get_pid_task); 452 EXPORT_SYMBOL_GPL(get_pid_task);
454 453
455 struct pid *find_get_pid(pid_t nr) 454 struct pid *find_get_pid(pid_t nr)
456 { 455 {
457 struct pid *pid; 456 struct pid *pid;
458 457
459 rcu_read_lock(); 458 rcu_read_lock();
460 pid = get_pid(find_vpid(nr)); 459 pid = get_pid(find_vpid(nr));
461 rcu_read_unlock(); 460 rcu_read_unlock();
462 461
463 return pid; 462 return pid;
464 } 463 }
465 EXPORT_SYMBOL_GPL(find_get_pid); 464 EXPORT_SYMBOL_GPL(find_get_pid);
466 465
467 pid_t pid_nr_ns(struct pid *pid, struct pid_namespace *ns) 466 pid_t pid_nr_ns(struct pid *pid, struct pid_namespace *ns)
468 { 467 {
469 struct upid *upid; 468 struct upid *upid;
470 pid_t nr = 0; 469 pid_t nr = 0;
471 470
472 if (pid && ns->level <= pid->level) { 471 if (pid && ns->level <= pid->level) {
473 upid = &pid->numbers[ns->level]; 472 upid = &pid->numbers[ns->level];
474 if (upid->ns == ns) 473 if (upid->ns == ns)
475 nr = upid->nr; 474 nr = upid->nr;
476 } 475 }
477 return nr; 476 return nr;
478 } 477 }
479 478
480 pid_t pid_vnr(struct pid *pid) 479 pid_t pid_vnr(struct pid *pid)
481 { 480 {
482 return pid_nr_ns(pid, current->nsproxy->pid_ns); 481 return pid_nr_ns(pid, current->nsproxy->pid_ns);
483 } 482 }
484 EXPORT_SYMBOL_GPL(pid_vnr); 483 EXPORT_SYMBOL_GPL(pid_vnr);
485 484
486 pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type, 485 pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type,
487 struct pid_namespace *ns) 486 struct pid_namespace *ns)
488 { 487 {
489 pid_t nr = 0; 488 pid_t nr = 0;
490 489
491 rcu_read_lock(); 490 rcu_read_lock();
492 if (!ns) 491 if (!ns)
493 ns = current->nsproxy->pid_ns; 492 ns = current->nsproxy->pid_ns;
494 if (likely(pid_alive(task))) { 493 if (likely(pid_alive(task))) {
495 if (type != PIDTYPE_PID) 494 if (type != PIDTYPE_PID)
496 task = task->group_leader; 495 task = task->group_leader;
497 nr = pid_nr_ns(task->pids[type].pid, ns); 496 nr = pid_nr_ns(task->pids[type].pid, ns);
498 } 497 }
499 rcu_read_unlock(); 498 rcu_read_unlock();
500 499
501 return nr; 500 return nr;
502 } 501 }
503 EXPORT_SYMBOL(__task_pid_nr_ns); 502 EXPORT_SYMBOL(__task_pid_nr_ns);
504 503
505 pid_t task_tgid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns) 504 pid_t task_tgid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns)
506 { 505 {
507 return pid_nr_ns(task_tgid(tsk), ns); 506 return pid_nr_ns(task_tgid(tsk), ns);
508 } 507 }
509 EXPORT_SYMBOL(task_tgid_nr_ns); 508 EXPORT_SYMBOL(task_tgid_nr_ns);
510 509
511 struct pid_namespace *task_active_pid_ns(struct task_struct *tsk) 510 struct pid_namespace *task_active_pid_ns(struct task_struct *tsk)
512 { 511 {
513 return ns_of_pid(task_pid(tsk)); 512 return ns_of_pid(task_pid(tsk));
514 } 513 }
515 EXPORT_SYMBOL_GPL(task_active_pid_ns); 514 EXPORT_SYMBOL_GPL(task_active_pid_ns);
516 515
517 /* 516 /*
518 * Used by proc to find the first pid that is greater than or equal to nr. 517 * Used by proc to find the first pid that is greater than or equal to nr.
519 * 518 *
520 * If there is a pid at nr this function is exactly the same as find_pid_ns. 519 * If there is a pid at nr this function is exactly the same as find_pid_ns.
521 */ 520 */
522 struct pid *find_ge_pid(int nr, struct pid_namespace *ns) 521 struct pid *find_ge_pid(int nr, struct pid_namespace *ns)
523 { 522 {
524 struct pid *pid; 523 struct pid *pid;
525 524
526 do { 525 do {
527 pid = find_pid_ns(nr, ns); 526 pid = find_pid_ns(nr, ns);
528 if (pid) 527 if (pid)
529 break; 528 break;
530 nr = next_pidmap(ns, nr); 529 nr = next_pidmap(ns, nr);
531 } while (nr > 0); 530 } while (nr > 0);
532 531
533 return pid; 532 return pid;
534 } 533 }
535 534
536 /* 535 /*
537 * The pid hash table is scaled according to the amount of memory in the 536 * The pid hash table is scaled according to the amount of memory in the
538 * machine. From a minimum of 16 slots up to 4096 slots at one gigabyte or 537 * machine. From a minimum of 16 slots up to 4096 slots at one gigabyte or
539 * more. 538 * more.
540 */ 539 */
541 void __init pidhash_init(void) 540 void __init pidhash_init(void)
542 { 541 {
543 int i, pidhash_size; 542 int i, pidhash_size;
544 543
545 pid_hash = alloc_large_system_hash("PID", sizeof(*pid_hash), 0, 18, 544 pid_hash = alloc_large_system_hash("PID", sizeof(*pid_hash), 0, 18,
546 HASH_EARLY | HASH_SMALL, 545 HASH_EARLY | HASH_SMALL,
547 &pidhash_shift, NULL, 4096); 546 &pidhash_shift, NULL, 4096);
548 pidhash_size = 1 << pidhash_shift; 547 pidhash_size = 1 << pidhash_shift;
549 548
550 for (i = 0; i < pidhash_size; i++) 549 for (i = 0; i < pidhash_size; i++)
551 INIT_HLIST_HEAD(&pid_hash[i]); 550 INIT_HLIST_HEAD(&pid_hash[i]);
552 } 551 }
553 552
554 void __init pidmap_init(void) 553 void __init pidmap_init(void)
555 { 554 {
556 /* bump default and minimum pid_max based on number of cpus */ 555 /* bump default and minimum pid_max based on number of cpus */
557 pid_max = min(pid_max_max, max_t(int, pid_max, 556 pid_max = min(pid_max_max, max_t(int, pid_max,
558 PIDS_PER_CPU_DEFAULT * num_possible_cpus())); 557 PIDS_PER_CPU_DEFAULT * num_possible_cpus()));
559 pid_max_min = max_t(int, pid_max_min, 558 pid_max_min = max_t(int, pid_max_min,
560 PIDS_PER_CPU_MIN * num_possible_cpus()); 559 PIDS_PER_CPU_MIN * num_possible_cpus());
561 pr_info("pid_max: default: %u minimum: %u\n", pid_max, pid_max_min); 560 pr_info("pid_max: default: %u minimum: %u\n", pid_max, pid_max_min);
562 561
563 init_pid_ns.pidmap[0].page = kzalloc(PAGE_SIZE, GFP_KERNEL); 562 init_pid_ns.pidmap[0].page = kzalloc(PAGE_SIZE, GFP_KERNEL);
564 /* Reserve PID 0. We never call free_pidmap(0) */ 563 /* Reserve PID 0. We never call free_pidmap(0) */
565 set_bit(0, init_pid_ns.pidmap[0].page); 564 set_bit(0, init_pid_ns.pidmap[0].page);
566 atomic_dec(&init_pid_ns.pidmap[0].nr_free); 565 atomic_dec(&init_pid_ns.pidmap[0].nr_free);
567 566
568 init_pid_ns.pid_cachep = KMEM_CACHE(pid, 567 init_pid_ns.pid_cachep = KMEM_CACHE(pid,
569 SLAB_HWCACHE_ALIGN | SLAB_PANIC); 568 SLAB_HWCACHE_ALIGN | SLAB_PANIC);
570 } 569 }
571 570
kernel/rcutorture.c
1 /* 1 /*
2 * Read-Copy Update module-based torture test facility 2 * Read-Copy Update module-based torture test facility
3 * 3 *
4 * This program is free software; you can redistribute it and/or modify 4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by 5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or 6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version. 7 * (at your option) any later version.
8 * 8 *
9 * This program is distributed in the hope that it will be useful, 9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of 10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details. 12 * GNU General Public License for more details.
13 * 13 *
14 * You should have received a copy of the GNU General Public License 14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software 15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 16 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
17 * 17 *
18 * Copyright (C) IBM Corporation, 2005, 2006 18 * Copyright (C) IBM Corporation, 2005, 2006
19 * 19 *
20 * Authors: Paul E. McKenney <paulmck@us.ibm.com> 20 * Authors: Paul E. McKenney <paulmck@us.ibm.com>
21 * Josh Triplett <josh@freedesktop.org> 21 * Josh Triplett <josh@freedesktop.org>
22 * 22 *
23 * See also: Documentation/RCU/torture.txt 23 * See also: Documentation/RCU/torture.txt
24 */ 24 */
25 #include <linux/types.h> 25 #include <linux/types.h>
26 #include <linux/kernel.h> 26 #include <linux/kernel.h>
27 #include <linux/init.h> 27 #include <linux/init.h>
28 #include <linux/module.h> 28 #include <linux/module.h>
29 #include <linux/kthread.h> 29 #include <linux/kthread.h>
30 #include <linux/err.h> 30 #include <linux/err.h>
31 #include <linux/spinlock.h> 31 #include <linux/spinlock.h>
32 #include <linux/smp.h> 32 #include <linux/smp.h>
33 #include <linux/rcupdate.h> 33 #include <linux/rcupdate.h>
34 #include <linux/interrupt.h> 34 #include <linux/interrupt.h>
35 #include <linux/sched.h> 35 #include <linux/sched.h>
36 #include <asm/atomic.h> 36 #include <asm/atomic.h>
37 #include <linux/bitops.h> 37 #include <linux/bitops.h>
38 #include <linux/completion.h> 38 #include <linux/completion.h>
39 #include <linux/moduleparam.h> 39 #include <linux/moduleparam.h>
40 #include <linux/percpu.h> 40 #include <linux/percpu.h>
41 #include <linux/notifier.h> 41 #include <linux/notifier.h>
42 #include <linux/reboot.h> 42 #include <linux/reboot.h>
43 #include <linux/freezer.h> 43 #include <linux/freezer.h>
44 #include <linux/cpu.h> 44 #include <linux/cpu.h>
45 #include <linux/delay.h> 45 #include <linux/delay.h>
46 #include <linux/stat.h> 46 #include <linux/stat.h>
47 #include <linux/srcu.h> 47 #include <linux/srcu.h>
48 #include <linux/slab.h> 48 #include <linux/slab.h>
49 #include <asm/byteorder.h> 49 #include <asm/byteorder.h>
50 50
51 MODULE_LICENSE("GPL"); 51 MODULE_LICENSE("GPL");
52 MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and " 52 MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and "
53 "Josh Triplett <josh@freedesktop.org>"); 53 "Josh Triplett <josh@freedesktop.org>");
54 54
55 static int nreaders = -1; /* # reader threads, defaults to 2*ncpus */ 55 static int nreaders = -1; /* # reader threads, defaults to 2*ncpus */
56 static int nfakewriters = 4; /* # fake writer threads */ 56 static int nfakewriters = 4; /* # fake writer threads */
57 static int stat_interval; /* Interval between stats, in seconds. */ 57 static int stat_interval; /* Interval between stats, in seconds. */
58 /* Defaults to "only at end of test". */ 58 /* Defaults to "only at end of test". */
59 static int verbose; /* Print more debug info. */ 59 static int verbose; /* Print more debug info. */
60 static int test_no_idle_hz; /* Test RCU's support for tickless idle CPUs. */ 60 static int test_no_idle_hz; /* Test RCU's support for tickless idle CPUs. */
61 static int shuffle_interval = 3; /* Interval between shuffles (in sec)*/ 61 static int shuffle_interval = 3; /* Interval between shuffles (in sec)*/
62 static int stutter = 5; /* Start/stop testing interval (in sec) */ 62 static int stutter = 5; /* Start/stop testing interval (in sec) */
63 static int irqreader = 1; /* RCU readers from irq (timers). */ 63 static int irqreader = 1; /* RCU readers from irq (timers). */
64 static int fqs_duration = 0; /* Duration of bursts (us), 0 to disable. */ 64 static int fqs_duration = 0; /* Duration of bursts (us), 0 to disable. */
65 static int fqs_holdoff = 0; /* Hold time within burst (us). */ 65 static int fqs_holdoff = 0; /* Hold time within burst (us). */
66 static int fqs_stutter = 3; /* Wait time between bursts (s). */ 66 static int fqs_stutter = 3; /* Wait time between bursts (s). */
67 static int test_boost = 1; /* Test RCU prio boost: 0=no, 1=maybe, 2=yes. */ 67 static int test_boost = 1; /* Test RCU prio boost: 0=no, 1=maybe, 2=yes. */
68 static int test_boost_interval = 7; /* Interval between boost tests, seconds. */ 68 static int test_boost_interval = 7; /* Interval between boost tests, seconds. */
69 static int test_boost_duration = 4; /* Duration of each boost test, seconds. */ 69 static int test_boost_duration = 4; /* Duration of each boost test, seconds. */
70 static char *torture_type = "rcu"; /* What RCU implementation to torture. */ 70 static char *torture_type = "rcu"; /* What RCU implementation to torture. */
71 71
72 module_param(nreaders, int, 0444); 72 module_param(nreaders, int, 0444);
73 MODULE_PARM_DESC(nreaders, "Number of RCU reader threads"); 73 MODULE_PARM_DESC(nreaders, "Number of RCU reader threads");
74 module_param(nfakewriters, int, 0444); 74 module_param(nfakewriters, int, 0444);
75 MODULE_PARM_DESC(nfakewriters, "Number of RCU fake writer threads"); 75 MODULE_PARM_DESC(nfakewriters, "Number of RCU fake writer threads");
76 module_param(stat_interval, int, 0444); 76 module_param(stat_interval, int, 0444);
77 MODULE_PARM_DESC(stat_interval, "Number of seconds between stats printk()s"); 77 MODULE_PARM_DESC(stat_interval, "Number of seconds between stats printk()s");
78 module_param(verbose, bool, 0444); 78 module_param(verbose, bool, 0444);
79 MODULE_PARM_DESC(verbose, "Enable verbose debugging printk()s"); 79 MODULE_PARM_DESC(verbose, "Enable verbose debugging printk()s");
80 module_param(test_no_idle_hz, bool, 0444); 80 module_param(test_no_idle_hz, bool, 0444);
81 MODULE_PARM_DESC(test_no_idle_hz, "Test support for tickless idle CPUs"); 81 MODULE_PARM_DESC(test_no_idle_hz, "Test support for tickless idle CPUs");
82 module_param(shuffle_interval, int, 0444); 82 module_param(shuffle_interval, int, 0444);
83 MODULE_PARM_DESC(shuffle_interval, "Number of seconds between shuffles"); 83 MODULE_PARM_DESC(shuffle_interval, "Number of seconds between shuffles");
84 module_param(stutter, int, 0444); 84 module_param(stutter, int, 0444);
85 MODULE_PARM_DESC(stutter, "Number of seconds to run/halt test"); 85 MODULE_PARM_DESC(stutter, "Number of seconds to run/halt test");
86 module_param(irqreader, int, 0444); 86 module_param(irqreader, int, 0444);
87 MODULE_PARM_DESC(irqreader, "Allow RCU readers from irq handlers"); 87 MODULE_PARM_DESC(irqreader, "Allow RCU readers from irq handlers");
88 module_param(fqs_duration, int, 0444); 88 module_param(fqs_duration, int, 0444);
89 MODULE_PARM_DESC(fqs_duration, "Duration of fqs bursts (us)"); 89 MODULE_PARM_DESC(fqs_duration, "Duration of fqs bursts (us)");
90 module_param(fqs_holdoff, int, 0444); 90 module_param(fqs_holdoff, int, 0444);
91 MODULE_PARM_DESC(fqs_holdoff, "Holdoff time within fqs bursts (us)"); 91 MODULE_PARM_DESC(fqs_holdoff, "Holdoff time within fqs bursts (us)");
92 module_param(fqs_stutter, int, 0444); 92 module_param(fqs_stutter, int, 0444);
93 MODULE_PARM_DESC(fqs_stutter, "Wait time between fqs bursts (s)"); 93 MODULE_PARM_DESC(fqs_stutter, "Wait time between fqs bursts (s)");
94 module_param(test_boost, int, 0444); 94 module_param(test_boost, int, 0444);
95 MODULE_PARM_DESC(test_boost, "Test RCU prio boost: 0=no, 1=maybe, 2=yes."); 95 MODULE_PARM_DESC(test_boost, "Test RCU prio boost: 0=no, 1=maybe, 2=yes.");
96 module_param(test_boost_interval, int, 0444); 96 module_param(test_boost_interval, int, 0444);
97 MODULE_PARM_DESC(test_boost_interval, "Interval between boost tests, seconds."); 97 MODULE_PARM_DESC(test_boost_interval, "Interval between boost tests, seconds.");
98 module_param(test_boost_duration, int, 0444); 98 module_param(test_boost_duration, int, 0444);
99 MODULE_PARM_DESC(test_boost_duration, "Duration of each boost test, seconds."); 99 MODULE_PARM_DESC(test_boost_duration, "Duration of each boost test, seconds.");
100 module_param(torture_type, charp, 0444); 100 module_param(torture_type, charp, 0444);
101 MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, rcu_bh, srcu)"); 101 MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, rcu_bh, srcu)");
102 102
103 #define TORTURE_FLAG "-torture:" 103 #define TORTURE_FLAG "-torture:"
104 #define PRINTK_STRING(s) \ 104 #define PRINTK_STRING(s) \
105 do { printk(KERN_ALERT "%s" TORTURE_FLAG s "\n", torture_type); } while (0) 105 do { printk(KERN_ALERT "%s" TORTURE_FLAG s "\n", torture_type); } while (0)
106 #define VERBOSE_PRINTK_STRING(s) \ 106 #define VERBOSE_PRINTK_STRING(s) \
107 do { if (verbose) printk(KERN_ALERT "%s" TORTURE_FLAG s "\n", torture_type); } while (0) 107 do { if (verbose) printk(KERN_ALERT "%s" TORTURE_FLAG s "\n", torture_type); } while (0)
108 #define VERBOSE_PRINTK_ERRSTRING(s) \ 108 #define VERBOSE_PRINTK_ERRSTRING(s) \
109 do { if (verbose) printk(KERN_ALERT "%s" TORTURE_FLAG "!!! " s "\n", torture_type); } while (0) 109 do { if (verbose) printk(KERN_ALERT "%s" TORTURE_FLAG "!!! " s "\n", torture_type); } while (0)
110 110
111 static char printk_buf[4096]; 111 static char printk_buf[4096];
112 112
113 static int nrealreaders; 113 static int nrealreaders;
114 static struct task_struct *writer_task; 114 static struct task_struct *writer_task;
115 static struct task_struct **fakewriter_tasks; 115 static struct task_struct **fakewriter_tasks;
116 static struct task_struct **reader_tasks; 116 static struct task_struct **reader_tasks;
117 static struct task_struct *stats_task; 117 static struct task_struct *stats_task;
118 static struct task_struct *shuffler_task; 118 static struct task_struct *shuffler_task;
119 static struct task_struct *stutter_task; 119 static struct task_struct *stutter_task;
120 static struct task_struct *fqs_task; 120 static struct task_struct *fqs_task;
121 static struct task_struct *boost_tasks[NR_CPUS]; 121 static struct task_struct *boost_tasks[NR_CPUS];
122 122
123 #define RCU_TORTURE_PIPE_LEN 10 123 #define RCU_TORTURE_PIPE_LEN 10
124 124
125 struct rcu_torture { 125 struct rcu_torture {
126 struct rcu_head rtort_rcu; 126 struct rcu_head rtort_rcu;
127 int rtort_pipe_count; 127 int rtort_pipe_count;
128 struct list_head rtort_free; 128 struct list_head rtort_free;
129 int rtort_mbtest; 129 int rtort_mbtest;
130 }; 130 };
131 131
132 static LIST_HEAD(rcu_torture_freelist); 132 static LIST_HEAD(rcu_torture_freelist);
133 static struct rcu_torture __rcu *rcu_torture_current; 133 static struct rcu_torture __rcu *rcu_torture_current;
134 static unsigned long rcu_torture_current_version; 134 static unsigned long rcu_torture_current_version;
135 static struct rcu_torture rcu_tortures[10 * RCU_TORTURE_PIPE_LEN]; 135 static struct rcu_torture rcu_tortures[10 * RCU_TORTURE_PIPE_LEN];
136 static DEFINE_SPINLOCK(rcu_torture_lock); 136 static DEFINE_SPINLOCK(rcu_torture_lock);
137 static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], rcu_torture_count) = 137 static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], rcu_torture_count) =
138 { 0 }; 138 { 0 };
139 static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], rcu_torture_batch) = 139 static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], rcu_torture_batch) =
140 { 0 }; 140 { 0 };
141 static atomic_t rcu_torture_wcount[RCU_TORTURE_PIPE_LEN + 1]; 141 static atomic_t rcu_torture_wcount[RCU_TORTURE_PIPE_LEN + 1];
142 static atomic_t n_rcu_torture_alloc; 142 static atomic_t n_rcu_torture_alloc;
143 static atomic_t n_rcu_torture_alloc_fail; 143 static atomic_t n_rcu_torture_alloc_fail;
144 static atomic_t n_rcu_torture_free; 144 static atomic_t n_rcu_torture_free;
145 static atomic_t n_rcu_torture_mberror; 145 static atomic_t n_rcu_torture_mberror;
146 static atomic_t n_rcu_torture_error; 146 static atomic_t n_rcu_torture_error;
147 static long n_rcu_torture_boost_ktrerror; 147 static long n_rcu_torture_boost_ktrerror;
148 static long n_rcu_torture_boost_rterror; 148 static long n_rcu_torture_boost_rterror;
149 static long n_rcu_torture_boost_failure; 149 static long n_rcu_torture_boost_failure;
150 static long n_rcu_torture_boosts; 150 static long n_rcu_torture_boosts;
151 static long n_rcu_torture_timers; 151 static long n_rcu_torture_timers;
152 static struct list_head rcu_torture_removed; 152 static struct list_head rcu_torture_removed;
153 static cpumask_var_t shuffle_tmp_mask; 153 static cpumask_var_t shuffle_tmp_mask;
154 154
155 static int stutter_pause_test; 155 static int stutter_pause_test;
156 156
157 #if defined(MODULE) || defined(CONFIG_RCU_TORTURE_TEST_RUNNABLE) 157 #if defined(MODULE) || defined(CONFIG_RCU_TORTURE_TEST_RUNNABLE)
158 #define RCUTORTURE_RUNNABLE_INIT 1 158 #define RCUTORTURE_RUNNABLE_INIT 1
159 #else 159 #else
160 #define RCUTORTURE_RUNNABLE_INIT 0 160 #define RCUTORTURE_RUNNABLE_INIT 0
161 #endif 161 #endif
162 int rcutorture_runnable = RCUTORTURE_RUNNABLE_INIT; 162 int rcutorture_runnable = RCUTORTURE_RUNNABLE_INIT;
163 163
164 #if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU) 164 #if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU)
165 #define rcu_can_boost() 1 165 #define rcu_can_boost() 1
166 #else /* #if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU) */ 166 #else /* #if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU) */
167 #define rcu_can_boost() 0 167 #define rcu_can_boost() 0
168 #endif /* #else #if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU) */ 168 #endif /* #else #if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU) */
169 169
170 static unsigned long boost_starttime; /* jiffies of next boost test start. */ 170 static unsigned long boost_starttime; /* jiffies of next boost test start. */
171 DEFINE_MUTEX(boost_mutex); /* protect setting boost_starttime */ 171 DEFINE_MUTEX(boost_mutex); /* protect setting boost_starttime */
172 /* and boost task create/destroy. */ 172 /* and boost task create/destroy. */
173 173
174 /* Mediate rmmod and system shutdown. Concurrent rmmod & shutdown illegal! */ 174 /* Mediate rmmod and system shutdown. Concurrent rmmod & shutdown illegal! */
175 175
176 #define FULLSTOP_DONTSTOP 0 /* Normal operation. */ 176 #define FULLSTOP_DONTSTOP 0 /* Normal operation. */
177 #define FULLSTOP_SHUTDOWN 1 /* System shutdown with rcutorture running. */ 177 #define FULLSTOP_SHUTDOWN 1 /* System shutdown with rcutorture running. */
178 #define FULLSTOP_RMMOD 2 /* Normal rmmod of rcutorture. */ 178 #define FULLSTOP_RMMOD 2 /* Normal rmmod of rcutorture. */
179 static int fullstop = FULLSTOP_RMMOD; 179 static int fullstop = FULLSTOP_RMMOD;
180 /* 180 /*
181 * Protect fullstop transitions and spawning of kthreads. 181 * Protect fullstop transitions and spawning of kthreads.
182 */ 182 */
183 static DEFINE_MUTEX(fullstop_mutex); 183 static DEFINE_MUTEX(fullstop_mutex);
184 184
185 /* 185 /*
186 * Detect and respond to a system shutdown. 186 * Detect and respond to a system shutdown.
187 */ 187 */
188 static int 188 static int
189 rcutorture_shutdown_notify(struct notifier_block *unused1, 189 rcutorture_shutdown_notify(struct notifier_block *unused1,
190 unsigned long unused2, void *unused3) 190 unsigned long unused2, void *unused3)
191 { 191 {
192 mutex_lock(&fullstop_mutex); 192 mutex_lock(&fullstop_mutex);
193 if (fullstop == FULLSTOP_DONTSTOP) 193 if (fullstop == FULLSTOP_DONTSTOP)
194 fullstop = FULLSTOP_SHUTDOWN; 194 fullstop = FULLSTOP_SHUTDOWN;
195 else 195 else
196 printk(KERN_WARNING /* but going down anyway, so... */ 196 printk(KERN_WARNING /* but going down anyway, so... */
197 "Concurrent 'rmmod rcutorture' and shutdown illegal!\n"); 197 "Concurrent 'rmmod rcutorture' and shutdown illegal!\n");
198 mutex_unlock(&fullstop_mutex); 198 mutex_unlock(&fullstop_mutex);
199 return NOTIFY_DONE; 199 return NOTIFY_DONE;
200 } 200 }
201 201
202 /* 202 /*
203 * Absorb kthreads into a kernel function that won't return, so that 203 * Absorb kthreads into a kernel function that won't return, so that
204 * they won't ever access module text or data again. 204 * they won't ever access module text or data again.
205 */ 205 */
206 static void rcutorture_shutdown_absorb(char *title) 206 static void rcutorture_shutdown_absorb(char *title)
207 { 207 {
208 if (ACCESS_ONCE(fullstop) == FULLSTOP_SHUTDOWN) { 208 if (ACCESS_ONCE(fullstop) == FULLSTOP_SHUTDOWN) {
209 printk(KERN_NOTICE 209 printk(KERN_NOTICE
210 "rcutorture thread %s parking due to system shutdown\n", 210 "rcutorture thread %s parking due to system shutdown\n",
211 title); 211 title);
212 schedule_timeout_uninterruptible(MAX_SCHEDULE_TIMEOUT); 212 schedule_timeout_uninterruptible(MAX_SCHEDULE_TIMEOUT);
213 } 213 }
214 } 214 }
215 215
216 /* 216 /*
217 * Allocate an element from the rcu_tortures pool. 217 * Allocate an element from the rcu_tortures pool.
218 */ 218 */
219 static struct rcu_torture * 219 static struct rcu_torture *
220 rcu_torture_alloc(void) 220 rcu_torture_alloc(void)
221 { 221 {
222 struct list_head *p; 222 struct list_head *p;
223 223
224 spin_lock_bh(&rcu_torture_lock); 224 spin_lock_bh(&rcu_torture_lock);
225 if (list_empty(&rcu_torture_freelist)) { 225 if (list_empty(&rcu_torture_freelist)) {
226 atomic_inc(&n_rcu_torture_alloc_fail); 226 atomic_inc(&n_rcu_torture_alloc_fail);
227 spin_unlock_bh(&rcu_torture_lock); 227 spin_unlock_bh(&rcu_torture_lock);
228 return NULL; 228 return NULL;
229 } 229 }
230 atomic_inc(&n_rcu_torture_alloc); 230 atomic_inc(&n_rcu_torture_alloc);
231 p = rcu_torture_freelist.next; 231 p = rcu_torture_freelist.next;
232 list_del_init(p); 232 list_del_init(p);
233 spin_unlock_bh(&rcu_torture_lock); 233 spin_unlock_bh(&rcu_torture_lock);
234 return container_of(p, struct rcu_torture, rtort_free); 234 return container_of(p, struct rcu_torture, rtort_free);
235 } 235 }
236 236
237 /* 237 /*
238 * Free an element to the rcu_tortures pool. 238 * Free an element to the rcu_tortures pool.
239 */ 239 */
240 static void 240 static void
241 rcu_torture_free(struct rcu_torture *p) 241 rcu_torture_free(struct rcu_torture *p)
242 { 242 {
243 atomic_inc(&n_rcu_torture_free); 243 atomic_inc(&n_rcu_torture_free);
244 spin_lock_bh(&rcu_torture_lock); 244 spin_lock_bh(&rcu_torture_lock);
245 list_add_tail(&p->rtort_free, &rcu_torture_freelist); 245 list_add_tail(&p->rtort_free, &rcu_torture_freelist);
246 spin_unlock_bh(&rcu_torture_lock); 246 spin_unlock_bh(&rcu_torture_lock);
247 } 247 }
248 248
249 struct rcu_random_state { 249 struct rcu_random_state {
250 unsigned long rrs_state; 250 unsigned long rrs_state;
251 long rrs_count; 251 long rrs_count;
252 }; 252 };
253 253
254 #define RCU_RANDOM_MULT 39916801 /* prime */ 254 #define RCU_RANDOM_MULT 39916801 /* prime */
255 #define RCU_RANDOM_ADD 479001701 /* prime */ 255 #define RCU_RANDOM_ADD 479001701 /* prime */
256 #define RCU_RANDOM_REFRESH 10000 256 #define RCU_RANDOM_REFRESH 10000
257 257
258 #define DEFINE_RCU_RANDOM(name) struct rcu_random_state name = { 0, 0 } 258 #define DEFINE_RCU_RANDOM(name) struct rcu_random_state name = { 0, 0 }
259 259
260 /* 260 /*
261 * Crude but fast random-number generator. Uses a linear congruential 261 * Crude but fast random-number generator. Uses a linear congruential
262 * generator, with occasional help from cpu_clock(). 262 * generator, with occasional help from cpu_clock().
263 */ 263 */
264 static unsigned long 264 static unsigned long
265 rcu_random(struct rcu_random_state *rrsp) 265 rcu_random(struct rcu_random_state *rrsp)
266 { 266 {
267 if (--rrsp->rrs_count < 0) { 267 if (--rrsp->rrs_count < 0) {
268 rrsp->rrs_state += (unsigned long)local_clock(); 268 rrsp->rrs_state += (unsigned long)local_clock();
269 rrsp->rrs_count = RCU_RANDOM_REFRESH; 269 rrsp->rrs_count = RCU_RANDOM_REFRESH;
270 } 270 }
271 rrsp->rrs_state = rrsp->rrs_state * RCU_RANDOM_MULT + RCU_RANDOM_ADD; 271 rrsp->rrs_state = rrsp->rrs_state * RCU_RANDOM_MULT + RCU_RANDOM_ADD;
272 return swahw32(rrsp->rrs_state); 272 return swahw32(rrsp->rrs_state);
273 } 273 }
274 274
275 static void 275 static void
276 rcu_stutter_wait(char *title) 276 rcu_stutter_wait(char *title)
277 { 277 {
278 while (stutter_pause_test || !rcutorture_runnable) { 278 while (stutter_pause_test || !rcutorture_runnable) {
279 if (rcutorture_runnable) 279 if (rcutorture_runnable)
280 schedule_timeout_interruptible(1); 280 schedule_timeout_interruptible(1);
281 else 281 else
282 schedule_timeout_interruptible(round_jiffies_relative(HZ)); 282 schedule_timeout_interruptible(round_jiffies_relative(HZ));
283 rcutorture_shutdown_absorb(title); 283 rcutorture_shutdown_absorb(title);
284 } 284 }
285 } 285 }
286 286
287 /* 287 /*
288 * Operations vector for selecting different types of tests. 288 * Operations vector for selecting different types of tests.
289 */ 289 */
290 290
291 struct rcu_torture_ops { 291 struct rcu_torture_ops {
292 void (*init)(void); 292 void (*init)(void);
293 void (*cleanup)(void); 293 void (*cleanup)(void);
294 int (*readlock)(void); 294 int (*readlock)(void);
295 void (*read_delay)(struct rcu_random_state *rrsp); 295 void (*read_delay)(struct rcu_random_state *rrsp);
296 void (*readunlock)(int idx); 296 void (*readunlock)(int idx);
297 int (*completed)(void); 297 int (*completed)(void);
298 void (*deferred_free)(struct rcu_torture *p); 298 void (*deferred_free)(struct rcu_torture *p);
299 void (*sync)(void); 299 void (*sync)(void);
300 void (*cb_barrier)(void); 300 void (*cb_barrier)(void);
301 void (*fqs)(void); 301 void (*fqs)(void);
302 int (*stats)(char *page); 302 int (*stats)(char *page);
303 int irq_capable; 303 int irq_capable;
304 int can_boost; 304 int can_boost;
305 char *name; 305 char *name;
306 }; 306 };
307 307
308 static struct rcu_torture_ops *cur_ops; 308 static struct rcu_torture_ops *cur_ops;
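
The ops vector above is what lets one test loop drive the different RCU flavours interchangeably. A minimal sketch of how a reader pass might dispatch through cur_ops, using the definitions above; one_read_pass() is hypothetical and not part of this patch.

	static void one_read_pass(struct rcu_random_state *rrsp)
	{
		int idx;

		idx = cur_ops->readlock();	/* e.g. rcu_read_lock() for "rcu" */
		cur_ops->read_delay(rrsp);	/* flavour-specific reader delay */
		cur_ops->readunlock(idx);	/* matching unlock */
	}
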
309 309
310 /* 310 /*
311 * Definitions for rcu torture testing. 311 * Definitions for rcu torture testing.
312 */ 312 */
313 313
314 static int rcu_torture_read_lock(void) __acquires(RCU) 314 static int rcu_torture_read_lock(void) __acquires(RCU)
315 { 315 {
316 rcu_read_lock(); 316 rcu_read_lock();
317 return 0; 317 return 0;
318 } 318 }
319 319
320 static void rcu_read_delay(struct rcu_random_state *rrsp) 320 static void rcu_read_delay(struct rcu_random_state *rrsp)
321 { 321 {
322 const unsigned long shortdelay_us = 200; 322 const unsigned long shortdelay_us = 200;
323 const unsigned long longdelay_ms = 50; 323 const unsigned long longdelay_ms = 50;
324 324
325 /* We want a short delay sometimes to make a reader delay the grace 325 /* We want a short delay sometimes to make a reader delay the grace
326 * period, and we want a long delay occasionally to trigger 326 * period, and we want a long delay occasionally to trigger
327 * force_quiescent_state. */ 327 * force_quiescent_state. */
328 328
329 if (!(rcu_random(rrsp) % (nrealreaders * 2000 * longdelay_ms))) 329 if (!(rcu_random(rrsp) % (nrealreaders * 2000 * longdelay_ms)))
330 mdelay(longdelay_ms); 330 mdelay(longdelay_ms);
331 if (!(rcu_random(rrsp) % (nrealreaders * 2 * shortdelay_us))) 331 if (!(rcu_random(rrsp) % (nrealreaders * 2 * shortdelay_us)))
332 udelay(shortdelay_us); 332 udelay(shortdelay_us);
333 #ifdef CONFIG_PREEMPT 333 #ifdef CONFIG_PREEMPT
334 if (!preempt_count() && !(rcu_random(rrsp) % (nrealreaders * 20000))) 334 if (!preempt_count() && !(rcu_random(rrsp) % (nrealreaders * 20000)))
335 preempt_schedule(); /* No QS if preempt_disable() in effect */ 335 preempt_schedule(); /* No QS if preempt_disable() in effect */
336 #endif 336 #endif
337 } 337 }
338 338
339 static void rcu_torture_read_unlock(int idx) __releases(RCU) 339 static void rcu_torture_read_unlock(int idx) __releases(RCU)
340 { 340 {
341 rcu_read_unlock(); 341 rcu_read_unlock();
342 } 342 }
343 343
344 static int rcu_torture_completed(void) 344 static int rcu_torture_completed(void)
345 { 345 {
346 return rcu_batches_completed(); 346 return rcu_batches_completed();
347 } 347 }
348 348
349 static void 349 static void
350 rcu_torture_cb(struct rcu_head *p) 350 rcu_torture_cb(struct rcu_head *p)
351 { 351 {
352 int i; 352 int i;
353 struct rcu_torture *rp = container_of(p, struct rcu_torture, rtort_rcu); 353 struct rcu_torture *rp = container_of(p, struct rcu_torture, rtort_rcu);
354 354
355 if (fullstop != FULLSTOP_DONTSTOP) { 355 if (fullstop != FULLSTOP_DONTSTOP) {
356 /* Test is ending, just drop callbacks on the floor. */ 356 /* Test is ending, just drop callbacks on the floor. */
357 /* The next initialization will pick up the pieces. */ 357 /* The next initialization will pick up the pieces. */
358 return; 358 return;
359 } 359 }
360 i = rp->rtort_pipe_count; 360 i = rp->rtort_pipe_count;
361 if (i > RCU_TORTURE_PIPE_LEN) 361 if (i > RCU_TORTURE_PIPE_LEN)
362 i = RCU_TORTURE_PIPE_LEN; 362 i = RCU_TORTURE_PIPE_LEN;
363 atomic_inc(&rcu_torture_wcount[i]); 363 atomic_inc(&rcu_torture_wcount[i]);
364 if (++rp->rtort_pipe_count >= RCU_TORTURE_PIPE_LEN) { 364 if (++rp->rtort_pipe_count >= RCU_TORTURE_PIPE_LEN) {
365 rp->rtort_mbtest = 0; 365 rp->rtort_mbtest = 0;
366 rcu_torture_free(rp); 366 rcu_torture_free(rp);
367 } else 367 } else
368 cur_ops->deferred_free(rp); 368 cur_ops->deferred_free(rp);
369 } 369 }
370 370
371 static int rcu_no_completed(void) 371 static int rcu_no_completed(void)
372 { 372 {
373 return 0; 373 return 0;
374 } 374 }
375 375
376 static void rcu_torture_deferred_free(struct rcu_torture *p) 376 static void rcu_torture_deferred_free(struct rcu_torture *p)
377 { 377 {
378 call_rcu(&p->rtort_rcu, rcu_torture_cb); 378 call_rcu(&p->rtort_rcu, rcu_torture_cb);
379 } 379 }
380 380
381 static struct rcu_torture_ops rcu_ops = { 381 static struct rcu_torture_ops rcu_ops = {
382 .init = NULL, 382 .init = NULL,
383 .cleanup = NULL, 383 .cleanup = NULL,
384 .readlock = rcu_torture_read_lock, 384 .readlock = rcu_torture_read_lock,
385 .read_delay = rcu_read_delay, 385 .read_delay = rcu_read_delay,
386 .readunlock = rcu_torture_read_unlock, 386 .readunlock = rcu_torture_read_unlock,
387 .completed = rcu_torture_completed, 387 .completed = rcu_torture_completed,
388 .deferred_free = rcu_torture_deferred_free, 388 .deferred_free = rcu_torture_deferred_free,
389 .sync = synchronize_rcu, 389 .sync = synchronize_rcu,
390 .cb_barrier = rcu_barrier, 390 .cb_barrier = rcu_barrier,
391 .fqs = rcu_force_quiescent_state, 391 .fqs = rcu_force_quiescent_state,
392 .stats = NULL, 392 .stats = NULL,
393 .irq_capable = 1, 393 .irq_capable = 1,
394 .can_boost = rcu_can_boost(), 394 .can_boost = rcu_can_boost(),
395 .name = "rcu" 395 .name = "rcu"
396 }; 396 };
397 397
398 static void rcu_sync_torture_deferred_free(struct rcu_torture *p) 398 static void rcu_sync_torture_deferred_free(struct rcu_torture *p)
399 { 399 {
400 int i; 400 int i;
401 struct rcu_torture *rp; 401 struct rcu_torture *rp;
402 struct rcu_torture *rp1; 402 struct rcu_torture *rp1;
403 403
404 cur_ops->sync(); 404 cur_ops->sync();
405 list_add(&p->rtort_free, &rcu_torture_removed); 405 list_add(&p->rtort_free, &rcu_torture_removed);
406 list_for_each_entry_safe(rp, rp1, &rcu_torture_removed, rtort_free) { 406 list_for_each_entry_safe(rp, rp1, &rcu_torture_removed, rtort_free) {
407 i = rp->rtort_pipe_count; 407 i = rp->rtort_pipe_count;
408 if (i > RCU_TORTURE_PIPE_LEN) 408 if (i > RCU_TORTURE_PIPE_LEN)
409 i = RCU_TORTURE_PIPE_LEN; 409 i = RCU_TORTURE_PIPE_LEN;
410 atomic_inc(&rcu_torture_wcount[i]); 410 atomic_inc(&rcu_torture_wcount[i]);
411 if (++rp->rtort_pipe_count >= RCU_TORTURE_PIPE_LEN) { 411 if (++rp->rtort_pipe_count >= RCU_TORTURE_PIPE_LEN) {
412 rp->rtort_mbtest = 0; 412 rp->rtort_mbtest = 0;
413 list_del(&rp->rtort_free); 413 list_del(&rp->rtort_free);
414 rcu_torture_free(rp); 414 rcu_torture_free(rp);
415 } 415 }
416 } 416 }
417 } 417 }
418 418
419 static void rcu_sync_torture_init(void) 419 static void rcu_sync_torture_init(void)
420 { 420 {
421 INIT_LIST_HEAD(&rcu_torture_removed); 421 INIT_LIST_HEAD(&rcu_torture_removed);
422 } 422 }
423 423
424 static struct rcu_torture_ops rcu_sync_ops = { 424 static struct rcu_torture_ops rcu_sync_ops = {
425 .init = rcu_sync_torture_init, 425 .init = rcu_sync_torture_init,
426 .cleanup = NULL, 426 .cleanup = NULL,
427 .readlock = rcu_torture_read_lock, 427 .readlock = rcu_torture_read_lock,
428 .read_delay = rcu_read_delay, 428 .read_delay = rcu_read_delay,
429 .readunlock = rcu_torture_read_unlock, 429 .readunlock = rcu_torture_read_unlock,
430 .completed = rcu_torture_completed, 430 .completed = rcu_torture_completed,
431 .deferred_free = rcu_sync_torture_deferred_free, 431 .deferred_free = rcu_sync_torture_deferred_free,
432 .sync = synchronize_rcu, 432 .sync = synchronize_rcu,
433 .cb_barrier = NULL, 433 .cb_barrier = NULL,
434 .fqs = rcu_force_quiescent_state, 434 .fqs = rcu_force_quiescent_state,
435 .stats = NULL, 435 .stats = NULL,
436 .irq_capable = 1, 436 .irq_capable = 1,
437 .can_boost = rcu_can_boost(), 437 .can_boost = rcu_can_boost(),
438 .name = "rcu_sync" 438 .name = "rcu_sync"
439 }; 439 };
440 440
441 static struct rcu_torture_ops rcu_expedited_ops = { 441 static struct rcu_torture_ops rcu_expedited_ops = {
442 .init = rcu_sync_torture_init, 442 .init = rcu_sync_torture_init,
443 .cleanup = NULL, 443 .cleanup = NULL,
444 .readlock = rcu_torture_read_lock, 444 .readlock = rcu_torture_read_lock,
445 .read_delay = rcu_read_delay, /* just reuse rcu's version. */ 445 .read_delay = rcu_read_delay, /* just reuse rcu's version. */
446 .readunlock = rcu_torture_read_unlock, 446 .readunlock = rcu_torture_read_unlock,
447 .completed = rcu_no_completed, 447 .completed = rcu_no_completed,
448 .deferred_free = rcu_sync_torture_deferred_free, 448 .deferred_free = rcu_sync_torture_deferred_free,
449 .sync = synchronize_rcu_expedited, 449 .sync = synchronize_rcu_expedited,
450 .cb_barrier = NULL, 450 .cb_barrier = NULL,
451 .fqs = rcu_force_quiescent_state, 451 .fqs = rcu_force_quiescent_state,
452 .stats = NULL, 452 .stats = NULL,
453 .irq_capable = 1, 453 .irq_capable = 1,
454 .can_boost = rcu_can_boost(), 454 .can_boost = rcu_can_boost(),
455 .name = "rcu_expedited" 455 .name = "rcu_expedited"
456 }; 456 };
457 457
458 /* 458 /*
459 * Definitions for rcu_bh torture testing. 459 * Definitions for rcu_bh torture testing.
460 */ 460 */
461 461
462 static int rcu_bh_torture_read_lock(void) __acquires(RCU_BH) 462 static int rcu_bh_torture_read_lock(void) __acquires(RCU_BH)
463 { 463 {
464 rcu_read_lock_bh(); 464 rcu_read_lock_bh();
465 return 0; 465 return 0;
466 } 466 }
467 467
468 static void rcu_bh_torture_read_unlock(int idx) __releases(RCU_BH) 468 static void rcu_bh_torture_read_unlock(int idx) __releases(RCU_BH)
469 { 469 {
470 rcu_read_unlock_bh(); 470 rcu_read_unlock_bh();
471 } 471 }
472 472
473 static int rcu_bh_torture_completed(void) 473 static int rcu_bh_torture_completed(void)
474 { 474 {
475 return rcu_batches_completed_bh(); 475 return rcu_batches_completed_bh();
476 } 476 }
477 477
478 static void rcu_bh_torture_deferred_free(struct rcu_torture *p) 478 static void rcu_bh_torture_deferred_free(struct rcu_torture *p)
479 { 479 {
480 call_rcu_bh(&p->rtort_rcu, rcu_torture_cb); 480 call_rcu_bh(&p->rtort_rcu, rcu_torture_cb);
481 } 481 }
482 482
483 struct rcu_bh_torture_synchronize { 483 struct rcu_bh_torture_synchronize {
484 struct rcu_head head; 484 struct rcu_head head;
485 struct completion completion; 485 struct completion completion;
486 }; 486 };
487 487
488 static void rcu_bh_torture_wakeme_after_cb(struct rcu_head *head) 488 static void rcu_bh_torture_wakeme_after_cb(struct rcu_head *head)
489 { 489 {
490 struct rcu_bh_torture_synchronize *rcu; 490 struct rcu_bh_torture_synchronize *rcu;
491 491
492 rcu = container_of(head, struct rcu_bh_torture_synchronize, head); 492 rcu = container_of(head, struct rcu_bh_torture_synchronize, head);
493 complete(&rcu->completion); 493 complete(&rcu->completion);
494 } 494 }
495 495
496 static void rcu_bh_torture_synchronize(void) 496 static void rcu_bh_torture_synchronize(void)
497 { 497 {
498 struct rcu_bh_torture_synchronize rcu; 498 struct rcu_bh_torture_synchronize rcu;
499 499
500 init_rcu_head_on_stack(&rcu.head); 500 init_rcu_head_on_stack(&rcu.head);
501 init_completion(&rcu.completion); 501 init_completion(&rcu.completion);
502 call_rcu_bh(&rcu.head, rcu_bh_torture_wakeme_after_cb); 502 call_rcu_bh(&rcu.head, rcu_bh_torture_wakeme_after_cb);
503 wait_for_completion(&rcu.completion); 503 wait_for_completion(&rcu.completion);
504 destroy_rcu_head_on_stack(&rcu.head); 504 destroy_rcu_head_on_stack(&rcu.head);
505 } 505 }
506 506
507 static struct rcu_torture_ops rcu_bh_ops = { 507 static struct rcu_torture_ops rcu_bh_ops = {
508 .init = NULL, 508 .init = NULL,
509 .cleanup = NULL, 509 .cleanup = NULL,
510 .readlock = rcu_bh_torture_read_lock, 510 .readlock = rcu_bh_torture_read_lock,
511 .read_delay = rcu_read_delay, /* just reuse rcu's version. */ 511 .read_delay = rcu_read_delay, /* just reuse rcu's version. */
512 .readunlock = rcu_bh_torture_read_unlock, 512 .readunlock = rcu_bh_torture_read_unlock,
513 .completed = rcu_bh_torture_completed, 513 .completed = rcu_bh_torture_completed,
514 .deferred_free = rcu_bh_torture_deferred_free, 514 .deferred_free = rcu_bh_torture_deferred_free,
515 .sync = rcu_bh_torture_synchronize, 515 .sync = rcu_bh_torture_synchronize,
516 .cb_barrier = rcu_barrier_bh, 516 .cb_barrier = rcu_barrier_bh,
517 .fqs = rcu_bh_force_quiescent_state, 517 .fqs = rcu_bh_force_quiescent_state,
518 .stats = NULL, 518 .stats = NULL,
519 .irq_capable = 1, 519 .irq_capable = 1,
520 .name = "rcu_bh" 520 .name = "rcu_bh"
521 }; 521 };
522 522
523 static struct rcu_torture_ops rcu_bh_sync_ops = { 523 static struct rcu_torture_ops rcu_bh_sync_ops = {
524 .init = rcu_sync_torture_init, 524 .init = rcu_sync_torture_init,
525 .cleanup = NULL, 525 .cleanup = NULL,
526 .readlock = rcu_bh_torture_read_lock, 526 .readlock = rcu_bh_torture_read_lock,
527 .read_delay = rcu_read_delay, /* just reuse rcu's version. */ 527 .read_delay = rcu_read_delay, /* just reuse rcu's version. */
528 .readunlock = rcu_bh_torture_read_unlock, 528 .readunlock = rcu_bh_torture_read_unlock,
529 .completed = rcu_bh_torture_completed, 529 .completed = rcu_bh_torture_completed,
530 .deferred_free = rcu_sync_torture_deferred_free, 530 .deferred_free = rcu_sync_torture_deferred_free,
531 .sync = rcu_bh_torture_synchronize, 531 .sync = rcu_bh_torture_synchronize,
532 .cb_barrier = NULL, 532 .cb_barrier = NULL,
533 .fqs = rcu_bh_force_quiescent_state, 533 .fqs = rcu_bh_force_quiescent_state,
534 .stats = NULL, 534 .stats = NULL,
535 .irq_capable = 1, 535 .irq_capable = 1,
536 .name = "rcu_bh_sync" 536 .name = "rcu_bh_sync"
537 }; 537 };
538 538
539 /* 539 /*
540 * Definitions for srcu torture testing. 540 * Definitions for srcu torture testing.
541 */ 541 */
542 542
543 static struct srcu_struct srcu_ctl; 543 static struct srcu_struct srcu_ctl;
544 544
545 static void srcu_torture_init(void) 545 static void srcu_torture_init(void)
546 { 546 {
547 init_srcu_struct(&srcu_ctl); 547 init_srcu_struct(&srcu_ctl);
548 rcu_sync_torture_init(); 548 rcu_sync_torture_init();
549 } 549 }
550 550
551 static void srcu_torture_cleanup(void) 551 static void srcu_torture_cleanup(void)
552 { 552 {
553 synchronize_srcu(&srcu_ctl); 553 synchronize_srcu(&srcu_ctl);
554 cleanup_srcu_struct(&srcu_ctl); 554 cleanup_srcu_struct(&srcu_ctl);
555 } 555 }
556 556
557 static int srcu_torture_read_lock(void) __acquires(&srcu_ctl) 557 static int srcu_torture_read_lock(void) __acquires(&srcu_ctl)
558 { 558 {
559 return srcu_read_lock(&srcu_ctl); 559 return srcu_read_lock(&srcu_ctl);
560 } 560 }
561 561
562 static void srcu_read_delay(struct rcu_random_state *rrsp) 562 static void srcu_read_delay(struct rcu_random_state *rrsp)
563 { 563 {
564 long delay; 564 long delay;
565 const long uspertick = 1000000 / HZ; 565 const long uspertick = 1000000 / HZ;
566 const long longdelay = 10; 566 const long longdelay = 10;
567 567
568 /* We want there to be long-running readers, but not all the time. */ 568 /* We want there to be long-running readers, but not all the time. */
569 569
570 delay = rcu_random(rrsp) % (nrealreaders * 2 * longdelay * uspertick); 570 delay = rcu_random(rrsp) % (nrealreaders * 2 * longdelay * uspertick);
571 if (!delay) 571 if (!delay)
572 schedule_timeout_interruptible(longdelay); 572 schedule_timeout_interruptible(longdelay);
573 else 573 else
574 rcu_read_delay(rrsp); 574 rcu_read_delay(rrsp);
575 } 575 }
576 576
577 static void srcu_torture_read_unlock(int idx) __releases(&srcu_ctl) 577 static void srcu_torture_read_unlock(int idx) __releases(&srcu_ctl)
578 { 578 {
579 srcu_read_unlock(&srcu_ctl, idx); 579 srcu_read_unlock(&srcu_ctl, idx);
580 } 580 }
581 581
582 static int srcu_torture_completed(void) 582 static int srcu_torture_completed(void)
583 { 583 {
584 return srcu_batches_completed(&srcu_ctl); 584 return srcu_batches_completed(&srcu_ctl);
585 } 585 }
586 586
587 static void srcu_torture_synchronize(void) 587 static void srcu_torture_synchronize(void)
588 { 588 {
589 synchronize_srcu(&srcu_ctl); 589 synchronize_srcu(&srcu_ctl);
590 } 590 }
591 591
592 static int srcu_torture_stats(char *page) 592 static int srcu_torture_stats(char *page)
593 { 593 {
594 int cnt = 0; 594 int cnt = 0;
595 int cpu; 595 int cpu;
596 int idx = srcu_ctl.completed & 0x1; 596 int idx = srcu_ctl.completed & 0x1;
597 597
598 cnt += sprintf(&page[cnt], "%s%s per-CPU(idx=%d):", 598 cnt += sprintf(&page[cnt], "%s%s per-CPU(idx=%d):",
599 torture_type, TORTURE_FLAG, idx); 599 torture_type, TORTURE_FLAG, idx);
600 for_each_possible_cpu(cpu) { 600 for_each_possible_cpu(cpu) {
601 cnt += sprintf(&page[cnt], " %d(%d,%d)", cpu, 601 cnt += sprintf(&page[cnt], " %d(%d,%d)", cpu,
602 per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[!idx], 602 per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[!idx],
603 per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[idx]); 603 per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[idx]);
604 } 604 }
605 cnt += sprintf(&page[cnt], "\n"); 605 cnt += sprintf(&page[cnt], "\n");
606 return cnt; 606 return cnt;
607 } 607 }
608 608
609 static struct rcu_torture_ops srcu_ops = { 609 static struct rcu_torture_ops srcu_ops = {
610 .init = srcu_torture_init, 610 .init = srcu_torture_init,
611 .cleanup = srcu_torture_cleanup, 611 .cleanup = srcu_torture_cleanup,
612 .readlock = srcu_torture_read_lock, 612 .readlock = srcu_torture_read_lock,
613 .read_delay = srcu_read_delay, 613 .read_delay = srcu_read_delay,
614 .readunlock = srcu_torture_read_unlock, 614 .readunlock = srcu_torture_read_unlock,
615 .completed = srcu_torture_completed, 615 .completed = srcu_torture_completed,
616 .deferred_free = rcu_sync_torture_deferred_free, 616 .deferred_free = rcu_sync_torture_deferred_free,
617 .sync = srcu_torture_synchronize, 617 .sync = srcu_torture_synchronize,
618 .cb_barrier = NULL, 618 .cb_barrier = NULL,
619 .stats = srcu_torture_stats, 619 .stats = srcu_torture_stats,
620 .name = "srcu" 620 .name = "srcu"
621 }; 621 };
622 622
623 static void srcu_torture_synchronize_expedited(void) 623 static void srcu_torture_synchronize_expedited(void)
624 { 624 {
625 synchronize_srcu_expedited(&srcu_ctl); 625 synchronize_srcu_expedited(&srcu_ctl);
626 } 626 }
627 627
628 static struct rcu_torture_ops srcu_expedited_ops = { 628 static struct rcu_torture_ops srcu_expedited_ops = {
629 .init = srcu_torture_init, 629 .init = srcu_torture_init,
630 .cleanup = srcu_torture_cleanup, 630 .cleanup = srcu_torture_cleanup,
631 .readlock = srcu_torture_read_lock, 631 .readlock = srcu_torture_read_lock,
632 .read_delay = srcu_read_delay, 632 .read_delay = srcu_read_delay,
633 .readunlock = srcu_torture_read_unlock, 633 .readunlock = srcu_torture_read_unlock,
634 .completed = srcu_torture_completed, 634 .completed = srcu_torture_completed,
635 .deferred_free = rcu_sync_torture_deferred_free, 635 .deferred_free = rcu_sync_torture_deferred_free,
636 .sync = srcu_torture_synchronize_expedited, 636 .sync = srcu_torture_synchronize_expedited,
637 .cb_barrier = NULL, 637 .cb_barrier = NULL,
638 .stats = srcu_torture_stats, 638 .stats = srcu_torture_stats,
639 .name = "srcu_expedited" 639 .name = "srcu_expedited"
640 }; 640 };
641 641
642 /* 642 /*
643 * Definitions for sched torture testing. 643 * Definitions for sched torture testing.
644 */ 644 */
645 645
646 static int sched_torture_read_lock(void) 646 static int sched_torture_read_lock(void)
647 { 647 {
648 preempt_disable(); 648 preempt_disable();
649 return 0; 649 return 0;
650 } 650 }
651 651
652 static void sched_torture_read_unlock(int idx) 652 static void sched_torture_read_unlock(int idx)
653 { 653 {
654 preempt_enable(); 654 preempt_enable();
655 } 655 }
656 656
657 static void rcu_sched_torture_deferred_free(struct rcu_torture *p) 657 static void rcu_sched_torture_deferred_free(struct rcu_torture *p)
658 { 658 {
659 call_rcu_sched(&p->rtort_rcu, rcu_torture_cb); 659 call_rcu_sched(&p->rtort_rcu, rcu_torture_cb);
660 } 660 }
661 661
662 static void sched_torture_synchronize(void) 662 static void sched_torture_synchronize(void)
663 { 663 {
664 synchronize_sched(); 664 synchronize_sched();
665 } 665 }
666 666
667 static struct rcu_torture_ops sched_ops = { 667 static struct rcu_torture_ops sched_ops = {
668 .init = rcu_sync_torture_init, 668 .init = rcu_sync_torture_init,
669 .cleanup = NULL, 669 .cleanup = NULL,
670 .readlock = sched_torture_read_lock, 670 .readlock = sched_torture_read_lock,
671 .read_delay = rcu_read_delay, /* just reuse rcu's version. */ 671 .read_delay = rcu_read_delay, /* just reuse rcu's version. */
672 .readunlock = sched_torture_read_unlock, 672 .readunlock = sched_torture_read_unlock,
673 .completed = rcu_no_completed, 673 .completed = rcu_no_completed,
674 .deferred_free = rcu_sched_torture_deferred_free, 674 .deferred_free = rcu_sched_torture_deferred_free,
675 .sync = sched_torture_synchronize, 675 .sync = sched_torture_synchronize,
676 .cb_barrier = rcu_barrier_sched, 676 .cb_barrier = rcu_barrier_sched,
677 .fqs = rcu_sched_force_quiescent_state, 677 .fqs = rcu_sched_force_quiescent_state,
678 .stats = NULL, 678 .stats = NULL,
679 .irq_capable = 1, 679 .irq_capable = 1,
680 .name = "sched" 680 .name = "sched"
681 }; 681 };
682 682
683 static struct rcu_torture_ops sched_sync_ops = { 683 static struct rcu_torture_ops sched_sync_ops = {
684 .init = rcu_sync_torture_init, 684 .init = rcu_sync_torture_init,
685 .cleanup = NULL, 685 .cleanup = NULL,
686 .readlock = sched_torture_read_lock, 686 .readlock = sched_torture_read_lock,
687 .read_delay = rcu_read_delay, /* just reuse rcu's version. */ 687 .read_delay = rcu_read_delay, /* just reuse rcu's version. */
688 .readunlock = sched_torture_read_unlock, 688 .readunlock = sched_torture_read_unlock,
689 .completed = rcu_no_completed, 689 .completed = rcu_no_completed,
690 .deferred_free = rcu_sync_torture_deferred_free, 690 .deferred_free = rcu_sync_torture_deferred_free,
691 .sync = sched_torture_synchronize, 691 .sync = sched_torture_synchronize,
692 .cb_barrier = NULL, 692 .cb_barrier = NULL,
693 .fqs = rcu_sched_force_quiescent_state, 693 .fqs = rcu_sched_force_quiescent_state,
694 .stats = NULL, 694 .stats = NULL,
695 .name = "sched_sync" 695 .name = "sched_sync"
696 }; 696 };
697 697
698 static struct rcu_torture_ops sched_expedited_ops = { 698 static struct rcu_torture_ops sched_expedited_ops = {
699 .init = rcu_sync_torture_init, 699 .init = rcu_sync_torture_init,
700 .cleanup = NULL, 700 .cleanup = NULL,
701 .readlock = sched_torture_read_lock, 701 .readlock = sched_torture_read_lock,
702 .read_delay = rcu_read_delay, /* just reuse rcu's version. */ 702 .read_delay = rcu_read_delay, /* just reuse rcu's version. */
703 .readunlock = sched_torture_read_unlock, 703 .readunlock = sched_torture_read_unlock,
704 .completed = rcu_no_completed, 704 .completed = rcu_no_completed,
705 .deferred_free = rcu_sync_torture_deferred_free, 705 .deferred_free = rcu_sync_torture_deferred_free,
706 .sync = synchronize_sched_expedited, 706 .sync = synchronize_sched_expedited,
707 .cb_barrier = NULL, 707 .cb_barrier = NULL,
708 .fqs = rcu_sched_force_quiescent_state, 708 .fqs = rcu_sched_force_quiescent_state,
709 .stats = NULL, 709 .stats = NULL,
710 .irq_capable = 1, 710 .irq_capable = 1,
711 .name = "sched_expedited" 711 .name = "sched_expedited"
712 }; 712 };
713 713
714 /* 714 /*
715 * RCU torture priority-boost testing. Runs one real-time thread per 715 * RCU torture priority-boost testing. Runs one real-time thread per
716 * CPU for moderate bursts, repeatedly registering RCU callbacks and 716 * CPU for moderate bursts, repeatedly registering RCU callbacks and
717 * spinning waiting for them to be invoked. If a given callback takes 717 * spinning waiting for them to be invoked. If a given callback takes
718 * too long to be invoked, we assume that priority inversion has occurred. 718 * too long to be invoked, we assume that priority inversion has occurred.
719 */ 719 */
720 720
721 struct rcu_boost_inflight { 721 struct rcu_boost_inflight {
722 struct rcu_head rcu; 722 struct rcu_head rcu;
723 int inflight; 723 int inflight;
724 }; 724 };
725 725
726 static void rcu_torture_boost_cb(struct rcu_head *head) 726 static void rcu_torture_boost_cb(struct rcu_head *head)
727 { 727 {
728 struct rcu_boost_inflight *rbip = 728 struct rcu_boost_inflight *rbip =
729 container_of(head, struct rcu_boost_inflight, rcu); 729 container_of(head, struct rcu_boost_inflight, rcu);
730 730
731 smp_mb(); /* Ensure RCU-core accesses precede clearing ->inflight */ 731 smp_mb(); /* Ensure RCU-core accesses precede clearing ->inflight */
732 rbip->inflight = 0; 732 rbip->inflight = 0;
733 } 733 }
734 734
735 static int rcu_torture_boost(void *arg) 735 static int rcu_torture_boost(void *arg)
736 { 736 {
737 unsigned long call_rcu_time; 737 unsigned long call_rcu_time;
738 unsigned long endtime; 738 unsigned long endtime;
739 unsigned long oldstarttime; 739 unsigned long oldstarttime;
740 struct rcu_boost_inflight rbi = { .inflight = 0 }; 740 struct rcu_boost_inflight rbi = { .inflight = 0 };
741 struct sched_param sp; 741 struct sched_param sp;
742 742
743 VERBOSE_PRINTK_STRING("rcu_torture_boost started"); 743 VERBOSE_PRINTK_STRING("rcu_torture_boost started");
744 744
745 /* Set real-time priority. */ 745 /* Set real-time priority. */
746 sp.sched_priority = 1; 746 sp.sched_priority = 1;
747 if (sched_setscheduler(current, SCHED_FIFO, &sp) < 0) { 747 if (sched_setscheduler(current, SCHED_FIFO, &sp) < 0) {
748 VERBOSE_PRINTK_STRING("rcu_torture_boost RT prio failed!"); 748 VERBOSE_PRINTK_STRING("rcu_torture_boost RT prio failed!");
749 n_rcu_torture_boost_rterror++; 749 n_rcu_torture_boost_rterror++;
750 } 750 }
751 751
752 init_rcu_head_on_stack(&rbi.rcu); 752 init_rcu_head_on_stack(&rbi.rcu);
753 /* Each pass through the following loop does one boost-test cycle. */ 753 /* Each pass through the following loop does one boost-test cycle. */
754 do { 754 do {
755 /* Wait for the next test interval. */ 755 /* Wait for the next test interval. */
756 oldstarttime = boost_starttime; 756 oldstarttime = boost_starttime;
757 while (jiffies - oldstarttime > ULONG_MAX / 2) { 757 while (jiffies - oldstarttime > ULONG_MAX / 2) {
758 schedule_timeout_uninterruptible(1); 758 schedule_timeout_uninterruptible(1);
759 rcu_stutter_wait("rcu_torture_boost"); 759 rcu_stutter_wait("rcu_torture_boost");
760 if (kthread_should_stop() || 760 if (kthread_should_stop() ||
761 fullstop != FULLSTOP_DONTSTOP) 761 fullstop != FULLSTOP_DONTSTOP)
762 goto checkwait; 762 goto checkwait;
763 } 763 }
764 764
765 /* Do one boost-test interval. */ 765 /* Do one boost-test interval. */
766 endtime = oldstarttime + test_boost_duration * HZ; 766 endtime = oldstarttime + test_boost_duration * HZ;
767 call_rcu_time = jiffies; 767 call_rcu_time = jiffies;
768 while (jiffies - endtime > ULONG_MAX / 2) { 768 while (jiffies - endtime > ULONG_MAX / 2) {
769 /* If we don't have a callback in flight, post one. */ 769 /* If we don't have a callback in flight, post one. */
770 if (!rbi.inflight) { 770 if (!rbi.inflight) {
771 smp_mb(); /* RCU core before ->inflight = 1. */ 771 smp_mb(); /* RCU core before ->inflight = 1. */
772 rbi.inflight = 1; 772 rbi.inflight = 1;
773 call_rcu(&rbi.rcu, rcu_torture_boost_cb); 773 call_rcu(&rbi.rcu, rcu_torture_boost_cb);
774 if (jiffies - call_rcu_time > 774 if (jiffies - call_rcu_time >
775 test_boost_duration * HZ - HZ / 2) { 775 test_boost_duration * HZ - HZ / 2) {
776 VERBOSE_PRINTK_STRING("rcu_torture_boost boosting failed"); 776 VERBOSE_PRINTK_STRING("rcu_torture_boost boosting failed");
777 n_rcu_torture_boost_failure++; 777 n_rcu_torture_boost_failure++;
778 } 778 }
779 call_rcu_time = jiffies; 779 call_rcu_time = jiffies;
780 } 780 }
781 cond_resched(); 781 cond_resched();
782 rcu_stutter_wait("rcu_torture_boost"); 782 rcu_stutter_wait("rcu_torture_boost");
783 if (kthread_should_stop() || 783 if (kthread_should_stop() ||
784 fullstop != FULLSTOP_DONTSTOP) 784 fullstop != FULLSTOP_DONTSTOP)
785 goto checkwait; 785 goto checkwait;
786 } 786 }
787 787
788 /* 788 /*
789 * Set the start time of the next test interval. 789 * Set the start time of the next test interval.
790 * Yes, this is vulnerable to long delays, but such 790 * Yes, this is vulnerable to long delays, but such
791 * delays simply cause a false negative for the next 791 * delays simply cause a false negative for the next
792 * interval. Besides, we are running at RT priority, 792 * interval. Besides, we are running at RT priority,
793 * so delays should be relatively rare. 793 * so delays should be relatively rare.
794 */ 794 */
795 while (oldstarttime == boost_starttime) { 795 while (oldstarttime == boost_starttime) {
796 if (mutex_trylock(&boost_mutex)) { 796 if (mutex_trylock(&boost_mutex)) {
797 boost_starttime = jiffies + 797 boost_starttime = jiffies +
798 test_boost_interval * HZ; 798 test_boost_interval * HZ;
799 n_rcu_torture_boosts++; 799 n_rcu_torture_boosts++;
800 mutex_unlock(&boost_mutex); 800 mutex_unlock(&boost_mutex);
801 break; 801 break;
802 } 802 }
803 schedule_timeout_uninterruptible(1); 803 schedule_timeout_uninterruptible(1);
804 } 804 }
805 805
806 /* Go do the stutter. */ 806 /* Go do the stutter. */
807 checkwait: rcu_stutter_wait("rcu_torture_boost"); 807 checkwait: rcu_stutter_wait("rcu_torture_boost");
808 } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); 808 } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP);
809 809
810 /* Clean up and exit. */ 810 /* Clean up and exit. */
811 VERBOSE_PRINTK_STRING("rcu_torture_boost task stopping"); 811 VERBOSE_PRINTK_STRING("rcu_torture_boost task stopping");
812 destroy_rcu_head_on_stack(&rbi.rcu); 812 destroy_rcu_head_on_stack(&rbi.rcu);
813 rcutorture_shutdown_absorb("rcu_torture_boost"); 813 rcutorture_shutdown_absorb("rcu_torture_boost");
814 while (!kthread_should_stop() || rbi.inflight) 814 while (!kthread_should_stop() || rbi.inflight)
815 schedule_timeout_uninterruptible(1); 815 schedule_timeout_uninterruptible(1);
816 smp_mb(); /* order accesses to ->inflight before stack-frame death. */ 816 smp_mb(); /* order accesses to ->inflight before stack-frame death. */
817 return 0; 817 return 0;
818 } 818 }
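The unsigned comparisons such as "while (jiffies - oldstarttime > ULONG_MAX / 2)" above are open-coded, wraparound-tolerant tests for "has that time not arrived yet?". A roughly equivalent formulation with the stock jiffies helper, shown only for clarity (the file deliberately open-codes the test):

	#include <linux/jiffies.h>

	/* True while 'when' still lies in the future, even across a
	 * jiffies wraparound -- roughly the open-coded comparison above. */
	static inline bool deadline_not_reached(unsigned long when)
	{
		return time_before(jiffies, when);
	}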
819 819
820 /* 820 /*
821 * RCU torture force-quiescent-state kthread. Repeatedly induces 821 * RCU torture force-quiescent-state kthread. Repeatedly induces
822 * bursts of calls to force_quiescent_state(), increasing the probability 822 * bursts of calls to force_quiescent_state(), increasing the probability
823 * of occurrence of some important types of race conditions. 823 * of occurrence of some important types of race conditions.
824 */ 824 */
825 static int 825 static int
826 rcu_torture_fqs(void *arg) 826 rcu_torture_fqs(void *arg)
827 { 827 {
828 unsigned long fqs_resume_time; 828 unsigned long fqs_resume_time;
829 int fqs_burst_remaining; 829 int fqs_burst_remaining;
830 830
831 VERBOSE_PRINTK_STRING("rcu_torture_fqs task started"); 831 VERBOSE_PRINTK_STRING("rcu_torture_fqs task started");
832 do { 832 do {
833 fqs_resume_time = jiffies + fqs_stutter * HZ; 833 fqs_resume_time = jiffies + fqs_stutter * HZ;
834 while (jiffies - fqs_resume_time > LONG_MAX) { 834 while (jiffies - fqs_resume_time > LONG_MAX) {
835 schedule_timeout_interruptible(1); 835 schedule_timeout_interruptible(1);
836 } 836 }
837 fqs_burst_remaining = fqs_duration; 837 fqs_burst_remaining = fqs_duration;
838 while (fqs_burst_remaining > 0) { 838 while (fqs_burst_remaining > 0) {
839 cur_ops->fqs(); 839 cur_ops->fqs();
840 udelay(fqs_holdoff); 840 udelay(fqs_holdoff);
841 fqs_burst_remaining -= fqs_holdoff; 841 fqs_burst_remaining -= fqs_holdoff;
842 } 842 }
843 rcu_stutter_wait("rcu_torture_fqs"); 843 rcu_stutter_wait("rcu_torture_fqs");
844 } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); 844 } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP);
845 VERBOSE_PRINTK_STRING("rcu_torture_fqs task stopping"); 845 VERBOSE_PRINTK_STRING("rcu_torture_fqs task stopping");
846 rcutorture_shutdown_absorb("rcu_torture_fqs"); 846 rcutorture_shutdown_absorb("rcu_torture_fqs");
847 while (!kthread_should_stop()) 847 while (!kthread_should_stop())
848 schedule_timeout_uninterruptible(1); 848 schedule_timeout_uninterruptible(1);
849 return 0; 849 return 0;
850 } 850 }
851 851
852 /* 852 /*
853 * RCU torture writer kthread. Repeatedly substitutes a new structure 853 * RCU torture writer kthread. Repeatedly substitutes a new structure
854 * for that pointed to by rcu_torture_current, freeing the old structure 854 * for that pointed to by rcu_torture_current, freeing the old structure
855 * after a series of grace periods (the "pipeline"). 855 * after a series of grace periods (the "pipeline").
856 */ 856 */
857 static int 857 static int
858 rcu_torture_writer(void *arg) 858 rcu_torture_writer(void *arg)
859 { 859 {
860 int i; 860 int i;
861 long oldbatch = rcu_batches_completed(); 861 long oldbatch = rcu_batches_completed();
862 struct rcu_torture *rp; 862 struct rcu_torture *rp;
863 struct rcu_torture *old_rp; 863 struct rcu_torture *old_rp;
864 static DEFINE_RCU_RANDOM(rand); 864 static DEFINE_RCU_RANDOM(rand);
865 865
866 VERBOSE_PRINTK_STRING("rcu_torture_writer task started"); 866 VERBOSE_PRINTK_STRING("rcu_torture_writer task started");
867 set_user_nice(current, 19); 867 set_user_nice(current, 19);
868 868
869 do { 869 do {
870 schedule_timeout_uninterruptible(1); 870 schedule_timeout_uninterruptible(1);
871 rp = rcu_torture_alloc(); 871 rp = rcu_torture_alloc();
872 if (rp == NULL) 872 if (rp == NULL)
873 continue; 873 continue;
874 rp->rtort_pipe_count = 0; 874 rp->rtort_pipe_count = 0;
875 udelay(rcu_random(&rand) & 0x3ff); 875 udelay(rcu_random(&rand) & 0x3ff);
876 old_rp = rcu_dereference_check(rcu_torture_current, 876 old_rp = rcu_dereference_check(rcu_torture_current,
877 current == writer_task); 877 current == writer_task);
878 rp->rtort_mbtest = 1; 878 rp->rtort_mbtest = 1;
879 rcu_assign_pointer(rcu_torture_current, rp); 879 rcu_assign_pointer(rcu_torture_current, rp);
880 smp_wmb(); /* Mods to old_rp must follow rcu_assign_pointer() */ 880 smp_wmb(); /* Mods to old_rp must follow rcu_assign_pointer() */
881 if (old_rp) { 881 if (old_rp) {
882 i = old_rp->rtort_pipe_count; 882 i = old_rp->rtort_pipe_count;
883 if (i > RCU_TORTURE_PIPE_LEN) 883 if (i > RCU_TORTURE_PIPE_LEN)
884 i = RCU_TORTURE_PIPE_LEN; 884 i = RCU_TORTURE_PIPE_LEN;
885 atomic_inc(&rcu_torture_wcount[i]); 885 atomic_inc(&rcu_torture_wcount[i]);
886 old_rp->rtort_pipe_count++; 886 old_rp->rtort_pipe_count++;
887 cur_ops->deferred_free(old_rp); 887 cur_ops->deferred_free(old_rp);
888 } 888 }
889 rcutorture_record_progress(++rcu_torture_current_version); 889 rcutorture_record_progress(++rcu_torture_current_version);
890 oldbatch = cur_ops->completed(); 890 oldbatch = cur_ops->completed();
891 rcu_stutter_wait("rcu_torture_writer"); 891 rcu_stutter_wait("rcu_torture_writer");
892 } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); 892 } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP);
893 VERBOSE_PRINTK_STRING("rcu_torture_writer task stopping"); 893 VERBOSE_PRINTK_STRING("rcu_torture_writer task stopping");
894 rcutorture_shutdown_absorb("rcu_torture_writer"); 894 rcutorture_shutdown_absorb("rcu_torture_writer");
895 while (!kthread_should_stop()) 895 while (!kthread_should_stop())
896 schedule_timeout_uninterruptible(1); 896 schedule_timeout_uninterruptible(1);
897 return 0; 897 return 0;
898 } 898 }
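The writer exercises the standard RCU publish-then-defer-free sequence: allocate and fill the new element, rcu_assign_pointer() it into place, then hand the displaced element to cur_ops->deferred_free() (typically a call_rcu() wrapper) so readers still holding it can drain. Stripped of the torture bookkeeping, the pattern looks roughly like the following generic sketch (my_data, my_head, my_update and friends are made-up names, not symbols from this file):

	struct my_data {
		struct rcu_head rcu;
		int val;
	};

	static struct my_data __rcu *my_head;

	static void my_free_cb(struct rcu_head *head)
	{
		kfree(container_of(head, struct my_data, rcu));
	}

	/* Caller is assumed to be the sole updater. */
	static void my_update(struct my_data *newp)
	{
		struct my_data *oldp;

		oldp = rcu_dereference_protected(my_head, 1);
		rcu_assign_pointer(my_head, newp);	  /* publish the new element */
		if (oldp)
			call_rcu(&oldp->rcu, my_free_cb); /* free after a grace period */
	}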
899 899
900 /* 900 /*
901 * RCU torture fake writer kthread. Repeatedly calls sync, with a random 901 * RCU torture fake writer kthread. Repeatedly calls sync, with a random
902 * delay between calls. 902 * delay between calls.
903 */ 903 */
904 static int 904 static int
905 rcu_torture_fakewriter(void *arg) 905 rcu_torture_fakewriter(void *arg)
906 { 906 {
907 DEFINE_RCU_RANDOM(rand); 907 DEFINE_RCU_RANDOM(rand);
908 908
909 VERBOSE_PRINTK_STRING("rcu_torture_fakewriter task started"); 909 VERBOSE_PRINTK_STRING("rcu_torture_fakewriter task started");
910 set_user_nice(current, 19); 910 set_user_nice(current, 19);
911 911
912 do { 912 do {
913 schedule_timeout_uninterruptible(1 + rcu_random(&rand)%10); 913 schedule_timeout_uninterruptible(1 + rcu_random(&rand)%10);
914 udelay(rcu_random(&rand) & 0x3ff); 914 udelay(rcu_random(&rand) & 0x3ff);
915 cur_ops->sync(); 915 cur_ops->sync();
916 rcu_stutter_wait("rcu_torture_fakewriter"); 916 rcu_stutter_wait("rcu_torture_fakewriter");
917 } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); 917 } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP);
918 918
919 VERBOSE_PRINTK_STRING("rcu_torture_fakewriter task stopping"); 919 VERBOSE_PRINTK_STRING("rcu_torture_fakewriter task stopping");
920 rcutorture_shutdown_absorb("rcu_torture_fakewriter"); 920 rcutorture_shutdown_absorb("rcu_torture_fakewriter");
921 while (!kthread_should_stop()) 921 while (!kthread_should_stop())
922 schedule_timeout_uninterruptible(1); 922 schedule_timeout_uninterruptible(1);
923 return 0; 923 return 0;
924 } 924 }
925 925
926 /* 926 /*
927 * RCU torture reader from timer handler. Dereferences rcu_torture_current, 927 * RCU torture reader from timer handler. Dereferences rcu_torture_current,
928 * incrementing the corresponding element of the pipeline array. The 928 * incrementing the corresponding element of the pipeline array. The
929 * counter in the element should never be greater than 1, otherwise, the 929 * counter in the element should never be greater than 1, otherwise, the
930 * RCU implementation is broken. 930 * RCU implementation is broken.
931 */ 931 */
932 static void rcu_torture_timer(unsigned long unused) 932 static void rcu_torture_timer(unsigned long unused)
933 { 933 {
934 int idx; 934 int idx;
935 int completed; 935 int completed;
936 static DEFINE_RCU_RANDOM(rand); 936 static DEFINE_RCU_RANDOM(rand);
937 static DEFINE_SPINLOCK(rand_lock); 937 static DEFINE_SPINLOCK(rand_lock);
938 struct rcu_torture *p; 938 struct rcu_torture *p;
939 int pipe_count; 939 int pipe_count;
940 940
941 idx = cur_ops->readlock(); 941 idx = cur_ops->readlock();
942 completed = cur_ops->completed(); 942 completed = cur_ops->completed();
943 p = rcu_dereference_check(rcu_torture_current, 943 p = rcu_dereference_check(rcu_torture_current,
944 rcu_read_lock_held() ||
945 rcu_read_lock_bh_held() || 944 rcu_read_lock_bh_held() ||
946 rcu_read_lock_sched_held() || 945 rcu_read_lock_sched_held() ||
947 srcu_read_lock_held(&srcu_ctl)); 946 srcu_read_lock_held(&srcu_ctl));
948 if (p == NULL) { 947 if (p == NULL) {
949 /* Leave because rcu_torture_writer is not yet underway */ 948 /* Leave because rcu_torture_writer is not yet underway */
950 cur_ops->readunlock(idx); 949 cur_ops->readunlock(idx);
951 return; 950 return;
952 } 951 }
953 if (p->rtort_mbtest == 0) 952 if (p->rtort_mbtest == 0)
954 atomic_inc(&n_rcu_torture_mberror); 953 atomic_inc(&n_rcu_torture_mberror);
955 spin_lock(&rand_lock); 954 spin_lock(&rand_lock);
956 cur_ops->read_delay(&rand); 955 cur_ops->read_delay(&rand);
957 n_rcu_torture_timers++; 956 n_rcu_torture_timers++;
958 spin_unlock(&rand_lock); 957 spin_unlock(&rand_lock);
959 preempt_disable(); 958 preempt_disable();
960 pipe_count = p->rtort_pipe_count; 959 pipe_count = p->rtort_pipe_count;
961 if (pipe_count > RCU_TORTURE_PIPE_LEN) { 960 if (pipe_count > RCU_TORTURE_PIPE_LEN) {
962 /* Should not happen, but... */ 961 /* Should not happen, but... */
963 pipe_count = RCU_TORTURE_PIPE_LEN; 962 pipe_count = RCU_TORTURE_PIPE_LEN;
964 } 963 }
965 __this_cpu_inc(rcu_torture_count[pipe_count]); 964 __this_cpu_inc(rcu_torture_count[pipe_count]);
966 completed = cur_ops->completed() - completed; 965 completed = cur_ops->completed() - completed;
967 if (completed > RCU_TORTURE_PIPE_LEN) { 966 if (completed > RCU_TORTURE_PIPE_LEN) {
968 /* Should not happen, but... */ 967 /* Should not happen, but... */
969 completed = RCU_TORTURE_PIPE_LEN; 968 completed = RCU_TORTURE_PIPE_LEN;
970 } 969 }
971 __this_cpu_inc(rcu_torture_batch[completed]); 970 __this_cpu_inc(rcu_torture_batch[completed]);
972 preempt_enable(); 971 preempt_enable();
973 cur_ops->readunlock(idx); 972 cur_ops->readunlock(idx);
974 } 973 }
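This hunk is the point of the commit: since ca5ecddf, rcu_dereference_check() folds rcu_read_lock_held() into its lockdep condition itself, so repeating it in the caller was redundant and only the additional contexts (the bh, sched and SRCU read sides here) still need listing. In isolation the transformation looks like this (gp and my_lock are made-up names used purely for illustration):

	/* Before (redundant term):
	 *	p = rcu_dereference_check(gp,
	 *				  rcu_read_lock_held() ||
	 *				  lockdep_is_held(&my_lock));
	 *
	 * After:
	 *	p = rcu_dereference_check(gp,
	 *				  lockdep_is_held(&my_lock));
	 */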
975 974
976 /* 975 /*
977 * RCU torture reader kthread. Repeatedly dereferences rcu_torture_current, 976 * RCU torture reader kthread. Repeatedly dereferences rcu_torture_current,
978 * incrementing the corresponding element of the pipeline array. The 977 * incrementing the corresponding element of the pipeline array. The
979 * counter in the element should never be greater than 1, otherwise, the 978 * counter in the element should never be greater than 1, otherwise, the
980 * RCU implementation is broken. 979 * RCU implementation is broken.
981 */ 980 */
982 static int 981 static int
983 rcu_torture_reader(void *arg) 982 rcu_torture_reader(void *arg)
984 { 983 {
985 int completed; 984 int completed;
986 int idx; 985 int idx;
987 DEFINE_RCU_RANDOM(rand); 986 DEFINE_RCU_RANDOM(rand);
988 struct rcu_torture *p; 987 struct rcu_torture *p;
989 int pipe_count; 988 int pipe_count;
990 struct timer_list t; 989 struct timer_list t;
991 990
992 VERBOSE_PRINTK_STRING("rcu_torture_reader task started"); 991 VERBOSE_PRINTK_STRING("rcu_torture_reader task started");
993 set_user_nice(current, 19); 992 set_user_nice(current, 19);
994 if (irqreader && cur_ops->irq_capable) 993 if (irqreader && cur_ops->irq_capable)
995 setup_timer_on_stack(&t, rcu_torture_timer, 0); 994 setup_timer_on_stack(&t, rcu_torture_timer, 0);
996 995
997 do { 996 do {
998 if (irqreader && cur_ops->irq_capable) { 997 if (irqreader && cur_ops->irq_capable) {
999 if (!timer_pending(&t)) 998 if (!timer_pending(&t))
1000 mod_timer(&t, jiffies + 1); 999 mod_timer(&t, jiffies + 1);
1001 } 1000 }
1002 idx = cur_ops->readlock(); 1001 idx = cur_ops->readlock();
1003 completed = cur_ops->completed(); 1002 completed = cur_ops->completed();
1004 p = rcu_dereference_check(rcu_torture_current, 1003 p = rcu_dereference_check(rcu_torture_current,
1005 rcu_read_lock_held() ||
1006 rcu_read_lock_bh_held() || 1004 rcu_read_lock_bh_held() ||
1007 rcu_read_lock_sched_held() || 1005 rcu_read_lock_sched_held() ||
1008 srcu_read_lock_held(&srcu_ctl)); 1006 srcu_read_lock_held(&srcu_ctl));
1009 if (p == NULL) { 1007 if (p == NULL) {
1010 /* Wait for rcu_torture_writer to get underway */ 1008 /* Wait for rcu_torture_writer to get underway */
1011 cur_ops->readunlock(idx); 1009 cur_ops->readunlock(idx);
1012 schedule_timeout_interruptible(HZ); 1010 schedule_timeout_interruptible(HZ);
1013 continue; 1011 continue;
1014 } 1012 }
1015 if (p->rtort_mbtest == 0) 1013 if (p->rtort_mbtest == 0)
1016 atomic_inc(&n_rcu_torture_mberror); 1014 atomic_inc(&n_rcu_torture_mberror);
1017 cur_ops->read_delay(&rand); 1015 cur_ops->read_delay(&rand);
1018 preempt_disable(); 1016 preempt_disable();
1019 pipe_count = p->rtort_pipe_count; 1017 pipe_count = p->rtort_pipe_count;
1020 if (pipe_count > RCU_TORTURE_PIPE_LEN) { 1018 if (pipe_count > RCU_TORTURE_PIPE_LEN) {
1021 /* Should not happen, but... */ 1019 /* Should not happen, but... */
1022 pipe_count = RCU_TORTURE_PIPE_LEN; 1020 pipe_count = RCU_TORTURE_PIPE_LEN;
1023 } 1021 }
1024 __this_cpu_inc(rcu_torture_count[pipe_count]); 1022 __this_cpu_inc(rcu_torture_count[pipe_count]);
1025 completed = cur_ops->completed() - completed; 1023 completed = cur_ops->completed() - completed;
1026 if (completed > RCU_TORTURE_PIPE_LEN) { 1024 if (completed > RCU_TORTURE_PIPE_LEN) {
1027 /* Should not happen, but... */ 1025 /* Should not happen, but... */
1028 completed = RCU_TORTURE_PIPE_LEN; 1026 completed = RCU_TORTURE_PIPE_LEN;
1029 } 1027 }
1030 __this_cpu_inc(rcu_torture_batch[completed]); 1028 __this_cpu_inc(rcu_torture_batch[completed]);
1031 preempt_enable(); 1029 preempt_enable();
1032 cur_ops->readunlock(idx); 1030 cur_ops->readunlock(idx);
1033 schedule(); 1031 schedule();
1034 rcu_stutter_wait("rcu_torture_reader"); 1032 rcu_stutter_wait("rcu_torture_reader");
1035 } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); 1033 } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP);
1036 VERBOSE_PRINTK_STRING("rcu_torture_reader task stopping"); 1034 VERBOSE_PRINTK_STRING("rcu_torture_reader task stopping");
1037 rcutorture_shutdown_absorb("rcu_torture_reader"); 1035 rcutorture_shutdown_absorb("rcu_torture_reader");
1038 if (irqreader && cur_ops->irq_capable) 1036 if (irqreader && cur_ops->irq_capable)
1039 del_timer_sync(&t); 1037 del_timer_sync(&t);
1040 while (!kthread_should_stop()) 1038 while (!kthread_should_stop())
1041 schedule_timeout_uninterruptible(1); 1039 schedule_timeout_uninterruptible(1);
1042 return 0; 1040 return 0;
1043 } 1041 }
1044 1042
1045 /* 1043 /*
1046 * Create an RCU-torture statistics message in the specified buffer. 1044 * Create an RCU-torture statistics message in the specified buffer.
1047 */ 1045 */
1048 static int 1046 static int
1049 rcu_torture_printk(char *page) 1047 rcu_torture_printk(char *page)
1050 { 1048 {
1051 int cnt = 0; 1049 int cnt = 0;
1052 int cpu; 1050 int cpu;
1053 int i; 1051 int i;
1054 long pipesummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 }; 1052 long pipesummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 };
1055 long batchsummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 }; 1053 long batchsummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 };
1056 1054
1057 for_each_possible_cpu(cpu) { 1055 for_each_possible_cpu(cpu) {
1058 for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) { 1056 for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) {
1059 pipesummary[i] += per_cpu(rcu_torture_count, cpu)[i]; 1057 pipesummary[i] += per_cpu(rcu_torture_count, cpu)[i];
1060 batchsummary[i] += per_cpu(rcu_torture_batch, cpu)[i]; 1058 batchsummary[i] += per_cpu(rcu_torture_batch, cpu)[i];
1061 } 1059 }
1062 } 1060 }
1063 for (i = RCU_TORTURE_PIPE_LEN - 1; i >= 0; i--) { 1061 for (i = RCU_TORTURE_PIPE_LEN - 1; i >= 0; i--) {
1064 if (pipesummary[i] != 0) 1062 if (pipesummary[i] != 0)
1065 break; 1063 break;
1066 } 1064 }
1067 cnt += sprintf(&page[cnt], "%s%s ", torture_type, TORTURE_FLAG); 1065 cnt += sprintf(&page[cnt], "%s%s ", torture_type, TORTURE_FLAG);
1068 cnt += sprintf(&page[cnt], 1066 cnt += sprintf(&page[cnt],
1069 "rtc: %p ver: %lu tfle: %d rta: %d rtaf: %d rtf: %d " 1067 "rtc: %p ver: %lu tfle: %d rta: %d rtaf: %d rtf: %d "
1070 "rtmbe: %d rtbke: %ld rtbre: %ld " 1068 "rtmbe: %d rtbke: %ld rtbre: %ld "
1071 "rtbf: %ld rtb: %ld nt: %ld", 1069 "rtbf: %ld rtb: %ld nt: %ld",
1072 rcu_torture_current, 1070 rcu_torture_current,
1073 rcu_torture_current_version, 1071 rcu_torture_current_version,
1074 list_empty(&rcu_torture_freelist), 1072 list_empty(&rcu_torture_freelist),
1075 atomic_read(&n_rcu_torture_alloc), 1073 atomic_read(&n_rcu_torture_alloc),
1076 atomic_read(&n_rcu_torture_alloc_fail), 1074 atomic_read(&n_rcu_torture_alloc_fail),
1077 atomic_read(&n_rcu_torture_free), 1075 atomic_read(&n_rcu_torture_free),
1078 atomic_read(&n_rcu_torture_mberror), 1076 atomic_read(&n_rcu_torture_mberror),
1079 n_rcu_torture_boost_ktrerror, 1077 n_rcu_torture_boost_ktrerror,
1080 n_rcu_torture_boost_rterror, 1078 n_rcu_torture_boost_rterror,
1081 n_rcu_torture_boost_failure, 1079 n_rcu_torture_boost_failure,
1082 n_rcu_torture_boosts, 1080 n_rcu_torture_boosts,
1083 n_rcu_torture_timers); 1081 n_rcu_torture_timers);
1084 if (atomic_read(&n_rcu_torture_mberror) != 0 || 1082 if (atomic_read(&n_rcu_torture_mberror) != 0 ||
1085 n_rcu_torture_boost_ktrerror != 0 || 1083 n_rcu_torture_boost_ktrerror != 0 ||
1086 n_rcu_torture_boost_rterror != 0 || 1084 n_rcu_torture_boost_rterror != 0 ||
1087 n_rcu_torture_boost_failure != 0) 1085 n_rcu_torture_boost_failure != 0)
1088 cnt += sprintf(&page[cnt], " !!!"); 1086 cnt += sprintf(&page[cnt], " !!!");
1089 cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG); 1087 cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG);
1090 if (i > 1) { 1088 if (i > 1) {
1091 cnt += sprintf(&page[cnt], "!!! "); 1089 cnt += sprintf(&page[cnt], "!!! ");
1092 atomic_inc(&n_rcu_torture_error); 1090 atomic_inc(&n_rcu_torture_error);
1093 WARN_ON_ONCE(1); 1091 WARN_ON_ONCE(1);
1094 } 1092 }
1095 cnt += sprintf(&page[cnt], "Reader Pipe: "); 1093 cnt += sprintf(&page[cnt], "Reader Pipe: ");
1096 for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) 1094 for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++)
1097 cnt += sprintf(&page[cnt], " %ld", pipesummary[i]); 1095 cnt += sprintf(&page[cnt], " %ld", pipesummary[i]);
1098 cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG); 1096 cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG);
1099 cnt += sprintf(&page[cnt], "Reader Batch: "); 1097 cnt += sprintf(&page[cnt], "Reader Batch: ");
1100 for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) 1098 for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++)
1101 cnt += sprintf(&page[cnt], " %ld", batchsummary[i]); 1099 cnt += sprintf(&page[cnt], " %ld", batchsummary[i]);
1102 cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG); 1100 cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG);
1103 cnt += sprintf(&page[cnt], "Free-Block Circulation: "); 1101 cnt += sprintf(&page[cnt], "Free-Block Circulation: ");
1104 for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) { 1102 for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) {
1105 cnt += sprintf(&page[cnt], " %d", 1103 cnt += sprintf(&page[cnt], " %d",
1106 atomic_read(&rcu_torture_wcount[i])); 1104 atomic_read(&rcu_torture_wcount[i]));
1107 } 1105 }
1108 cnt += sprintf(&page[cnt], "\n"); 1106 cnt += sprintf(&page[cnt], "\n");
1109 if (cur_ops->stats) 1107 if (cur_ops->stats)
1110 cnt += cur_ops->stats(&page[cnt]); 1108 cnt += cur_ops->stats(&page[cnt]);
1111 return cnt; 1109 return cnt;
1112 } 1110 }
1113 1111
1114 /* 1112 /*
1115 * Print torture statistics. Caller must ensure that there is only 1113 * Print torture statistics. Caller must ensure that there is only
1116 * one call to this function at a given time!!! This is normally 1114 * one call to this function at a given time!!! This is normally
1117 * accomplished by relying on the module system to only have one copy 1115 * accomplished by relying on the module system to only have one copy
1118 * of the module loaded, and then by giving the rcu_torture_stats 1116 * of the module loaded, and then by giving the rcu_torture_stats
1119 * kthread full control (or the init/cleanup functions when rcu_torture_stats 1117 * kthread full control (or the init/cleanup functions when rcu_torture_stats
1120 * thread is not running). 1118 * thread is not running).
1121 */ 1119 */
1122 static void 1120 static void
1123 rcu_torture_stats_print(void) 1121 rcu_torture_stats_print(void)
1124 { 1122 {
1125 int cnt; 1123 int cnt;
1126 1124
1127 cnt = rcu_torture_printk(printk_buf); 1125 cnt = rcu_torture_printk(printk_buf);
1128 printk(KERN_ALERT "%s", printk_buf); 1126 printk(KERN_ALERT "%s", printk_buf);
1129 } 1127 }
1130 1128
1131 /* 1129 /*
1132 * Periodically prints torture statistics, if periodic statistics printing 1130 * Periodically prints torture statistics, if periodic statistics printing
1133 * was specified via the stat_interval module parameter. 1131 * was specified via the stat_interval module parameter.
1134 * 1132 *
1135 * No need to worry about fullstop here, since this one doesn't reference 1133 * No need to worry about fullstop here, since this one doesn't reference
1136 * volatile state or register callbacks. 1134 * volatile state or register callbacks.
1137 */ 1135 */
1138 static int 1136 static int
1139 rcu_torture_stats(void *arg) 1137 rcu_torture_stats(void *arg)
1140 { 1138 {
1141 VERBOSE_PRINTK_STRING("rcu_torture_stats task started"); 1139 VERBOSE_PRINTK_STRING("rcu_torture_stats task started");
1142 do { 1140 do {
1143 schedule_timeout_interruptible(stat_interval * HZ); 1141 schedule_timeout_interruptible(stat_interval * HZ);
1144 rcu_torture_stats_print(); 1142 rcu_torture_stats_print();
1145 rcutorture_shutdown_absorb("rcu_torture_stats"); 1143 rcutorture_shutdown_absorb("rcu_torture_stats");
1146 } while (!kthread_should_stop()); 1144 } while (!kthread_should_stop());
1147 VERBOSE_PRINTK_STRING("rcu_torture_stats task stopping"); 1145 VERBOSE_PRINTK_STRING("rcu_torture_stats task stopping");
1148 return 0; 1146 return 0;
1149 } 1147 }
1150 1148
1151 static int rcu_idle_cpu; /* Force all torture tasks off this CPU */ 1149 static int rcu_idle_cpu; /* Force all torture tasks off this CPU */
1152 1150
1153 /* Shuffle tasks such that we allow @rcu_idle_cpu to become idle. A special case 1151 /* Shuffle tasks such that we allow @rcu_idle_cpu to become idle. A special case
1154 * is when @rcu_idle_cpu = -1, when we allow the tasks to run on all CPUs. 1152 * is when @rcu_idle_cpu = -1, when we allow the tasks to run on all CPUs.
1155 */ 1153 */
1156 static void rcu_torture_shuffle_tasks(void) 1154 static void rcu_torture_shuffle_tasks(void)
1157 { 1155 {
1158 int i; 1156 int i;
1159 1157
1160 cpumask_setall(shuffle_tmp_mask); 1158 cpumask_setall(shuffle_tmp_mask);
1161 get_online_cpus(); 1159 get_online_cpus();
1162 1160
1163 /* No point in shuffling if there is only one online CPU (ex: UP) */ 1161 /* No point in shuffling if there is only one online CPU (ex: UP) */
1164 if (num_online_cpus() == 1) { 1162 if (num_online_cpus() == 1) {
1165 put_online_cpus(); 1163 put_online_cpus();
1166 return; 1164 return;
1167 } 1165 }
1168 1166
1169 if (rcu_idle_cpu != -1) 1167 if (rcu_idle_cpu != -1)
1170 cpumask_clear_cpu(rcu_idle_cpu, shuffle_tmp_mask); 1168 cpumask_clear_cpu(rcu_idle_cpu, shuffle_tmp_mask);
1171 1169
1172 set_cpus_allowed_ptr(current, shuffle_tmp_mask); 1170 set_cpus_allowed_ptr(current, shuffle_tmp_mask);
1173 1171
1174 if (reader_tasks) { 1172 if (reader_tasks) {
1175 for (i = 0; i < nrealreaders; i++) 1173 for (i = 0; i < nrealreaders; i++)
1176 if (reader_tasks[i]) 1174 if (reader_tasks[i])
1177 set_cpus_allowed_ptr(reader_tasks[i], 1175 set_cpus_allowed_ptr(reader_tasks[i],
1178 shuffle_tmp_mask); 1176 shuffle_tmp_mask);
1179 } 1177 }
1180 1178
1181 if (fakewriter_tasks) { 1179 if (fakewriter_tasks) {
1182 for (i = 0; i < nfakewriters; i++) 1180 for (i = 0; i < nfakewriters; i++)
1183 if (fakewriter_tasks[i]) 1181 if (fakewriter_tasks[i])
1184 set_cpus_allowed_ptr(fakewriter_tasks[i], 1182 set_cpus_allowed_ptr(fakewriter_tasks[i],
1185 shuffle_tmp_mask); 1183 shuffle_tmp_mask);
1186 } 1184 }
1187 1185
1188 if (writer_task) 1186 if (writer_task)
1189 set_cpus_allowed_ptr(writer_task, shuffle_tmp_mask); 1187 set_cpus_allowed_ptr(writer_task, shuffle_tmp_mask);
1190 1188
1191 if (stats_task) 1189 if (stats_task)
1192 set_cpus_allowed_ptr(stats_task, shuffle_tmp_mask); 1190 set_cpus_allowed_ptr(stats_task, shuffle_tmp_mask);
1193 1191
1194 if (rcu_idle_cpu == -1) 1192 if (rcu_idle_cpu == -1)
1195 rcu_idle_cpu = num_online_cpus() - 1; 1193 rcu_idle_cpu = num_online_cpus() - 1;
1196 else 1194 else
1197 rcu_idle_cpu--; 1195 rcu_idle_cpu--;
1198 1196
1199 put_online_cpus(); 1197 put_online_cpus();
1200 } 1198 }
1201 1199
1202 /* Shuffle tasks across CPUs, with the intent of allowing each CPU in the 1200 /* Shuffle tasks across CPUs, with the intent of allowing each CPU in the
1203 * system to become idle at a time and cut off its timer ticks. This is meant 1201 * system to become idle at a time and cut off its timer ticks. This is meant
1204 * to test the support for such tickless idle CPU in RCU. 1202 * to test the support for such tickless idle CPU in RCU.
1205 */ 1203 */
1206 static int 1204 static int
1207 rcu_torture_shuffle(void *arg) 1205 rcu_torture_shuffle(void *arg)
1208 { 1206 {
1209 VERBOSE_PRINTK_STRING("rcu_torture_shuffle task started"); 1207 VERBOSE_PRINTK_STRING("rcu_torture_shuffle task started");
1210 do { 1208 do {
1211 schedule_timeout_interruptible(shuffle_interval * HZ); 1209 schedule_timeout_interruptible(shuffle_interval * HZ);
1212 rcu_torture_shuffle_tasks(); 1210 rcu_torture_shuffle_tasks();
1213 rcutorture_shutdown_absorb("rcu_torture_shuffle"); 1211 rcutorture_shutdown_absorb("rcu_torture_shuffle");
1214 } while (!kthread_should_stop()); 1212 } while (!kthread_should_stop());
1215 VERBOSE_PRINTK_STRING("rcu_torture_shuffle task stopping"); 1213 VERBOSE_PRINTK_STRING("rcu_torture_shuffle task stopping");
1216 return 0; 1214 return 0;
1217 } 1215 }
1218 1216
1219 /* Cause the rcutorture test to "stutter", starting and stopping all 1217 /* Cause the rcutorture test to "stutter", starting and stopping all
1220 * threads periodically. 1218 * threads periodically.
1221 */ 1219 */
1222 static int 1220 static int
1223 rcu_torture_stutter(void *arg) 1221 rcu_torture_stutter(void *arg)
1224 { 1222 {
1225 VERBOSE_PRINTK_STRING("rcu_torture_stutter task started"); 1223 VERBOSE_PRINTK_STRING("rcu_torture_stutter task started");
1226 do { 1224 do {
1227 schedule_timeout_interruptible(stutter * HZ); 1225 schedule_timeout_interruptible(stutter * HZ);
1228 stutter_pause_test = 1; 1226 stutter_pause_test = 1;
1229 if (!kthread_should_stop()) 1227 if (!kthread_should_stop())
1230 schedule_timeout_interruptible(stutter * HZ); 1228 schedule_timeout_interruptible(stutter * HZ);
1231 stutter_pause_test = 0; 1229 stutter_pause_test = 0;
1232 rcutorture_shutdown_absorb("rcu_torture_stutter"); 1230 rcutorture_shutdown_absorb("rcu_torture_stutter");
1233 } while (!kthread_should_stop()); 1231 } while (!kthread_should_stop());
1234 VERBOSE_PRINTK_STRING("rcu_torture_stutter task stopping"); 1232 VERBOSE_PRINTK_STRING("rcu_torture_stutter task stopping");
1235 return 0; 1233 return 0;
1236 } 1234 }
1237 1235
1238 static inline void 1236 static inline void
1239 rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, char *tag) 1237 rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, char *tag)
1240 { 1238 {
1241 printk(KERN_ALERT "%s" TORTURE_FLAG 1239 printk(KERN_ALERT "%s" TORTURE_FLAG
1242 "--- %s: nreaders=%d nfakewriters=%d " 1240 "--- %s: nreaders=%d nfakewriters=%d "
1243 "stat_interval=%d verbose=%d test_no_idle_hz=%d " 1241 "stat_interval=%d verbose=%d test_no_idle_hz=%d "
1244 "shuffle_interval=%d stutter=%d irqreader=%d " 1242 "shuffle_interval=%d stutter=%d irqreader=%d "
1245 "fqs_duration=%d fqs_holdoff=%d fqs_stutter=%d " 1243 "fqs_duration=%d fqs_holdoff=%d fqs_stutter=%d "
1246 "test_boost=%d/%d test_boost_interval=%d " 1244 "test_boost=%d/%d test_boost_interval=%d "
1247 "test_boost_duration=%d\n", 1245 "test_boost_duration=%d\n",
1248 torture_type, tag, nrealreaders, nfakewriters, 1246 torture_type, tag, nrealreaders, nfakewriters,
1249 stat_interval, verbose, test_no_idle_hz, shuffle_interval, 1247 stat_interval, verbose, test_no_idle_hz, shuffle_interval,
1250 stutter, irqreader, fqs_duration, fqs_holdoff, fqs_stutter, 1248 stutter, irqreader, fqs_duration, fqs_holdoff, fqs_stutter,
1251 test_boost, cur_ops->can_boost, 1249 test_boost, cur_ops->can_boost,
1252 test_boost_interval, test_boost_duration); 1250 test_boost_interval, test_boost_duration);
1253 } 1251 }
1254 1252
1255 static struct notifier_block rcutorture_shutdown_nb = { 1253 static struct notifier_block rcutorture_shutdown_nb = {
1256 .notifier_call = rcutorture_shutdown_notify, 1254 .notifier_call = rcutorture_shutdown_notify,
1257 }; 1255 };
1258 1256
1259 static void rcutorture_booster_cleanup(int cpu) 1257 static void rcutorture_booster_cleanup(int cpu)
1260 { 1258 {
1261 struct task_struct *t; 1259 struct task_struct *t;
1262 1260
1263 if (boost_tasks[cpu] == NULL) 1261 if (boost_tasks[cpu] == NULL)
1264 return; 1262 return;
1265 mutex_lock(&boost_mutex); 1263 mutex_lock(&boost_mutex);
1266 VERBOSE_PRINTK_STRING("Stopping rcu_torture_boost task"); 1264 VERBOSE_PRINTK_STRING("Stopping rcu_torture_boost task");
1267 t = boost_tasks[cpu]; 1265 t = boost_tasks[cpu];
1268 boost_tasks[cpu] = NULL; 1266 boost_tasks[cpu] = NULL;
1269 mutex_unlock(&boost_mutex); 1267 mutex_unlock(&boost_mutex);
1270 1268
1271 /* This must be outside of the mutex, otherwise deadlock! */ 1269 /* This must be outside of the mutex, otherwise deadlock! */
1272 kthread_stop(t); 1270 kthread_stop(t);
1273 } 1271 }
1274 1272
1275 static int rcutorture_booster_init(int cpu) 1273 static int rcutorture_booster_init(int cpu)
1276 { 1274 {
1277 int retval; 1275 int retval;
1278 1276
1279 if (boost_tasks[cpu] != NULL) 1277 if (boost_tasks[cpu] != NULL)
1280 return 0; /* Already created, nothing more to do. */ 1278 return 0; /* Already created, nothing more to do. */
1281 1279
1282 /* Don't allow time recalculation while creating a new task. */ 1280 /* Don't allow time recalculation while creating a new task. */
1283 mutex_lock(&boost_mutex); 1281 mutex_lock(&boost_mutex);
1284 VERBOSE_PRINTK_STRING("Creating rcu_torture_boost task"); 1282 VERBOSE_PRINTK_STRING("Creating rcu_torture_boost task");
1285 boost_tasks[cpu] = kthread_create(rcu_torture_boost, NULL, 1283 boost_tasks[cpu] = kthread_create(rcu_torture_boost, NULL,
1286 "rcu_torture_boost"); 1284 "rcu_torture_boost");
1287 if (IS_ERR(boost_tasks[cpu])) { 1285 if (IS_ERR(boost_tasks[cpu])) {
1288 retval = PTR_ERR(boost_tasks[cpu]); 1286 retval = PTR_ERR(boost_tasks[cpu]);
1289 VERBOSE_PRINTK_STRING("rcu_torture_boost task create failed"); 1287 VERBOSE_PRINTK_STRING("rcu_torture_boost task create failed");
1290 n_rcu_torture_boost_ktrerror++; 1288 n_rcu_torture_boost_ktrerror++;
1291 boost_tasks[cpu] = NULL; 1289 boost_tasks[cpu] = NULL;
1292 mutex_unlock(&boost_mutex); 1290 mutex_unlock(&boost_mutex);
1293 return retval; 1291 return retval;
1294 } 1292 }
1295 kthread_bind(boost_tasks[cpu], cpu); 1293 kthread_bind(boost_tasks[cpu], cpu);
1296 wake_up_process(boost_tasks[cpu]); 1294 wake_up_process(boost_tasks[cpu]);
1297 mutex_unlock(&boost_mutex); 1295 mutex_unlock(&boost_mutex);
1298 return 0; 1296 return 0;
1299 } 1297 }
1300 1298
1301 static int rcutorture_cpu_notify(struct notifier_block *self, 1299 static int rcutorture_cpu_notify(struct notifier_block *self,
1302 unsigned long action, void *hcpu) 1300 unsigned long action, void *hcpu)
1303 { 1301 {
1304 long cpu = (long)hcpu; 1302 long cpu = (long)hcpu;
1305 1303
1306 switch (action) { 1304 switch (action) {
1307 case CPU_ONLINE: 1305 case CPU_ONLINE:
1308 case CPU_DOWN_FAILED: 1306 case CPU_DOWN_FAILED:
1309 (void)rcutorture_booster_init(cpu); 1307 (void)rcutorture_booster_init(cpu);
1310 break; 1308 break;
1311 case CPU_DOWN_PREPARE: 1309 case CPU_DOWN_PREPARE:
1312 rcutorture_booster_cleanup(cpu); 1310 rcutorture_booster_cleanup(cpu);
1313 break; 1311 break;
1314 default: 1312 default:
1315 break; 1313 break;
1316 } 1314 }
1317 return NOTIFY_OK; 1315 return NOTIFY_OK;
1318 } 1316 }
1319 1317
1320 static struct notifier_block rcutorture_cpu_nb = { 1318 static struct notifier_block rcutorture_cpu_nb = {
1321 .notifier_call = rcutorture_cpu_notify, 1319 .notifier_call = rcutorture_cpu_notify,
1322 }; 1320 };
1323 1321
1324 static void 1322 static void
1325 rcu_torture_cleanup(void) 1323 rcu_torture_cleanup(void)
1326 { 1324 {
1327 int i; 1325 int i;
1328 1326
1329 mutex_lock(&fullstop_mutex); 1327 mutex_lock(&fullstop_mutex);
1330 rcutorture_record_test_transition(); 1328 rcutorture_record_test_transition();
1331 if (fullstop == FULLSTOP_SHUTDOWN) { 1329 if (fullstop == FULLSTOP_SHUTDOWN) {
1332 printk(KERN_WARNING /* but going down anyway, so... */ 1330 printk(KERN_WARNING /* but going down anyway, so... */
1333 "Concurrent 'rmmod rcutorture' and shutdown illegal!\n"); 1331 "Concurrent 'rmmod rcutorture' and shutdown illegal!\n");
1334 mutex_unlock(&fullstop_mutex); 1332 mutex_unlock(&fullstop_mutex);
1335 schedule_timeout_uninterruptible(10); 1333 schedule_timeout_uninterruptible(10);
1336 if (cur_ops->cb_barrier != NULL) 1334 if (cur_ops->cb_barrier != NULL)
1337 cur_ops->cb_barrier(); 1335 cur_ops->cb_barrier();
1338 return; 1336 return;
1339 } 1337 }
1340 fullstop = FULLSTOP_RMMOD; 1338 fullstop = FULLSTOP_RMMOD;
1341 mutex_unlock(&fullstop_mutex); 1339 mutex_unlock(&fullstop_mutex);
1342 unregister_reboot_notifier(&rcutorture_shutdown_nb); 1340 unregister_reboot_notifier(&rcutorture_shutdown_nb);
1343 if (stutter_task) { 1341 if (stutter_task) {
1344 VERBOSE_PRINTK_STRING("Stopping rcu_torture_stutter task"); 1342 VERBOSE_PRINTK_STRING("Stopping rcu_torture_stutter task");
1345 kthread_stop(stutter_task); 1343 kthread_stop(stutter_task);
1346 } 1344 }
1347 stutter_task = NULL; 1345 stutter_task = NULL;
1348 if (shuffler_task) { 1346 if (shuffler_task) {
1349 VERBOSE_PRINTK_STRING("Stopping rcu_torture_shuffle task"); 1347 VERBOSE_PRINTK_STRING("Stopping rcu_torture_shuffle task");
1350 kthread_stop(shuffler_task); 1348 kthread_stop(shuffler_task);
1351 free_cpumask_var(shuffle_tmp_mask); 1349 free_cpumask_var(shuffle_tmp_mask);
1352 } 1350 }
1353 shuffler_task = NULL; 1351 shuffler_task = NULL;
1354 1352
1355 if (writer_task) { 1353 if (writer_task) {
1356 VERBOSE_PRINTK_STRING("Stopping rcu_torture_writer task"); 1354 VERBOSE_PRINTK_STRING("Stopping rcu_torture_writer task");
1357 kthread_stop(writer_task); 1355 kthread_stop(writer_task);
1358 } 1356 }
1359 writer_task = NULL; 1357 writer_task = NULL;
1360 1358
1361 if (reader_tasks) { 1359 if (reader_tasks) {
1362 for (i = 0; i < nrealreaders; i++) { 1360 for (i = 0; i < nrealreaders; i++) {
1363 if (reader_tasks[i]) { 1361 if (reader_tasks[i]) {
1364 VERBOSE_PRINTK_STRING( 1362 VERBOSE_PRINTK_STRING(
1365 "Stopping rcu_torture_reader task"); 1363 "Stopping rcu_torture_reader task");
1366 kthread_stop(reader_tasks[i]); 1364 kthread_stop(reader_tasks[i]);
1367 } 1365 }
1368 reader_tasks[i] = NULL; 1366 reader_tasks[i] = NULL;
1369 } 1367 }
1370 kfree(reader_tasks); 1368 kfree(reader_tasks);
1371 reader_tasks = NULL; 1369 reader_tasks = NULL;
1372 } 1370 }
1373 rcu_torture_current = NULL; 1371 rcu_torture_current = NULL;
1374 1372
1375 if (fakewriter_tasks) { 1373 if (fakewriter_tasks) {
1376 for (i = 0; i < nfakewriters; i++) { 1374 for (i = 0; i < nfakewriters; i++) {
1377 if (fakewriter_tasks[i]) { 1375 if (fakewriter_tasks[i]) {
1378 VERBOSE_PRINTK_STRING( 1376 VERBOSE_PRINTK_STRING(
1379 "Stopping rcu_torture_fakewriter task"); 1377 "Stopping rcu_torture_fakewriter task");
1380 kthread_stop(fakewriter_tasks[i]); 1378 kthread_stop(fakewriter_tasks[i]);
1381 } 1379 }
1382 fakewriter_tasks[i] = NULL; 1380 fakewriter_tasks[i] = NULL;
1383 } 1381 }
1384 kfree(fakewriter_tasks); 1382 kfree(fakewriter_tasks);
1385 fakewriter_tasks = NULL; 1383 fakewriter_tasks = NULL;
1386 } 1384 }
1387 1385
1388 if (stats_task) { 1386 if (stats_task) {
1389 VERBOSE_PRINTK_STRING("Stopping rcu_torture_stats task"); 1387 VERBOSE_PRINTK_STRING("Stopping rcu_torture_stats task");
1390 kthread_stop(stats_task); 1388 kthread_stop(stats_task);
1391 } 1389 }
1392 stats_task = NULL; 1390 stats_task = NULL;
1393 1391
1394 if (fqs_task) { 1392 if (fqs_task) {
1395 VERBOSE_PRINTK_STRING("Stopping rcu_torture_fqs task"); 1393 VERBOSE_PRINTK_STRING("Stopping rcu_torture_fqs task");
1396 kthread_stop(fqs_task); 1394 kthread_stop(fqs_task);
1397 } 1395 }
1398 fqs_task = NULL; 1396 fqs_task = NULL;
1399 if ((test_boost == 1 && cur_ops->can_boost) || 1397 if ((test_boost == 1 && cur_ops->can_boost) ||
1400 test_boost == 2) { 1398 test_boost == 2) {
1401 unregister_cpu_notifier(&rcutorture_cpu_nb); 1399 unregister_cpu_notifier(&rcutorture_cpu_nb);
1402 for_each_possible_cpu(i) 1400 for_each_possible_cpu(i)
1403 rcutorture_booster_cleanup(i); 1401 rcutorture_booster_cleanup(i);
1404 } 1402 }
1405 1403
1406 /* Wait for all RCU callbacks to fire. */ 1404 /* Wait for all RCU callbacks to fire. */
1407 1405
1408 if (cur_ops->cb_barrier != NULL) 1406 if (cur_ops->cb_barrier != NULL)
1409 cur_ops->cb_barrier(); 1407 cur_ops->cb_barrier();
1410 1408
1411 rcu_torture_stats_print(); /* -After- the stats thread is stopped! */ 1409 rcu_torture_stats_print(); /* -After- the stats thread is stopped! */
1412 1410
1413 if (cur_ops->cleanup) 1411 if (cur_ops->cleanup)
1414 cur_ops->cleanup(); 1412 cur_ops->cleanup();
1415 if (atomic_read(&n_rcu_torture_error)) 1413 if (atomic_read(&n_rcu_torture_error))
1416 rcu_torture_print_module_parms(cur_ops, "End of test: FAILURE"); 1414 rcu_torture_print_module_parms(cur_ops, "End of test: FAILURE");
1417 else 1415 else
1418 rcu_torture_print_module_parms(cur_ops, "End of test: SUCCESS"); 1416 rcu_torture_print_module_parms(cur_ops, "End of test: SUCCESS");
1419 } 1417 }
1420 1418
1421 static int __init 1419 static int __init
1422 rcu_torture_init(void) 1420 rcu_torture_init(void)
1423 { 1421 {
1424 int i; 1422 int i;
1425 int cpu; 1423 int cpu;
1426 int firsterr = 0; 1424 int firsterr = 0;
1427 static struct rcu_torture_ops *torture_ops[] = 1425 static struct rcu_torture_ops *torture_ops[] =
1428 { &rcu_ops, &rcu_sync_ops, &rcu_expedited_ops, 1426 { &rcu_ops, &rcu_sync_ops, &rcu_expedited_ops,
1429 &rcu_bh_ops, &rcu_bh_sync_ops, 1427 &rcu_bh_ops, &rcu_bh_sync_ops,
1430 &srcu_ops, &srcu_expedited_ops, 1428 &srcu_ops, &srcu_expedited_ops,
1431 &sched_ops, &sched_sync_ops, &sched_expedited_ops, }; 1429 &sched_ops, &sched_sync_ops, &sched_expedited_ops, };
1432 1430
1433 mutex_lock(&fullstop_mutex); 1431 mutex_lock(&fullstop_mutex);
1434 1432
1435 /* Process args and tell the world that the torturer is on the job. */ 1433 /* Process args and tell the world that the torturer is on the job. */
1436 for (i = 0; i < ARRAY_SIZE(torture_ops); i++) { 1434 for (i = 0; i < ARRAY_SIZE(torture_ops); i++) {
1437 cur_ops = torture_ops[i]; 1435 cur_ops = torture_ops[i];
1438 if (strcmp(torture_type, cur_ops->name) == 0) 1436 if (strcmp(torture_type, cur_ops->name) == 0)
1439 break; 1437 break;
1440 } 1438 }
1441 if (i == ARRAY_SIZE(torture_ops)) { 1439 if (i == ARRAY_SIZE(torture_ops)) {
1442 printk(KERN_ALERT "rcu-torture: invalid torture type: \"%s\"\n", 1440 printk(KERN_ALERT "rcu-torture: invalid torture type: \"%s\"\n",
1443 torture_type); 1441 torture_type);
1444 printk(KERN_ALERT "rcu-torture types:"); 1442 printk(KERN_ALERT "rcu-torture types:");
1445 for (i = 0; i < ARRAY_SIZE(torture_ops); i++) 1443 for (i = 0; i < ARRAY_SIZE(torture_ops); i++)
1446 printk(KERN_ALERT " %s", torture_ops[i]->name); 1444 printk(KERN_ALERT " %s", torture_ops[i]->name);
1447 printk(KERN_ALERT "\n"); 1445 printk(KERN_ALERT "\n");
1448 mutex_unlock(&fullstop_mutex); 1446 mutex_unlock(&fullstop_mutex);
1449 return -EINVAL; 1447 return -EINVAL;
1450 } 1448 }
1451 if (cur_ops->fqs == NULL && fqs_duration != 0) { 1449 if (cur_ops->fqs == NULL && fqs_duration != 0) {
1452 printk(KERN_ALERT "rcu-torture: ->fqs NULL and non-zero " 1450 printk(KERN_ALERT "rcu-torture: ->fqs NULL and non-zero "
1453 "fqs_duration, fqs disabled.\n"); 1451 "fqs_duration, fqs disabled.\n");
1454 fqs_duration = 0; 1452 fqs_duration = 0;
1455 } 1453 }
1456 if (cur_ops->init) 1454 if (cur_ops->init)
1457 cur_ops->init(); /* no "goto unwind" prior to this point!!! */ 1455 cur_ops->init(); /* no "goto unwind" prior to this point!!! */
1458 1456
1459 if (nreaders >= 0) 1457 if (nreaders >= 0)
1460 nrealreaders = nreaders; 1458 nrealreaders = nreaders;
1461 else 1459 else
1462 nrealreaders = 2 * num_online_cpus(); 1460 nrealreaders = 2 * num_online_cpus();
1463 rcu_torture_print_module_parms(cur_ops, "Start of test"); 1461 rcu_torture_print_module_parms(cur_ops, "Start of test");
1464 fullstop = FULLSTOP_DONTSTOP; 1462 fullstop = FULLSTOP_DONTSTOP;
1465 1463
1466 /* Set up the freelist. */ 1464 /* Set up the freelist. */
1467 1465
1468 INIT_LIST_HEAD(&rcu_torture_freelist); 1466 INIT_LIST_HEAD(&rcu_torture_freelist);
1469 for (i = 0; i < ARRAY_SIZE(rcu_tortures); i++) { 1467 for (i = 0; i < ARRAY_SIZE(rcu_tortures); i++) {
1470 rcu_tortures[i].rtort_mbtest = 0; 1468 rcu_tortures[i].rtort_mbtest = 0;
1471 list_add_tail(&rcu_tortures[i].rtort_free, 1469 list_add_tail(&rcu_tortures[i].rtort_free,
1472 &rcu_torture_freelist); 1470 &rcu_torture_freelist);
1473 } 1471 }
1474 1472
1475 /* Initialize the statistics so that each run gets its own numbers. */ 1473 /* Initialize the statistics so that each run gets its own numbers. */
1476 1474
1477 rcu_torture_current = NULL; 1475 rcu_torture_current = NULL;
1478 rcu_torture_current_version = 0; 1476 rcu_torture_current_version = 0;
1479 atomic_set(&n_rcu_torture_alloc, 0); 1477 atomic_set(&n_rcu_torture_alloc, 0);
1480 atomic_set(&n_rcu_torture_alloc_fail, 0); 1478 atomic_set(&n_rcu_torture_alloc_fail, 0);
1481 atomic_set(&n_rcu_torture_free, 0); 1479 atomic_set(&n_rcu_torture_free, 0);
1482 atomic_set(&n_rcu_torture_mberror, 0); 1480 atomic_set(&n_rcu_torture_mberror, 0);
1483 atomic_set(&n_rcu_torture_error, 0); 1481 atomic_set(&n_rcu_torture_error, 0);
1484 n_rcu_torture_boost_ktrerror = 0; 1482 n_rcu_torture_boost_ktrerror = 0;
1485 n_rcu_torture_boost_rterror = 0; 1483 n_rcu_torture_boost_rterror = 0;
1486 n_rcu_torture_boost_failure = 0; 1484 n_rcu_torture_boost_failure = 0;
1487 n_rcu_torture_boosts = 0; 1485 n_rcu_torture_boosts = 0;
1488 for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) 1486 for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++)
1489 atomic_set(&rcu_torture_wcount[i], 0); 1487 atomic_set(&rcu_torture_wcount[i], 0);
1490 for_each_possible_cpu(cpu) { 1488 for_each_possible_cpu(cpu) {
1491 for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) { 1489 for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) {
1492 per_cpu(rcu_torture_count, cpu)[i] = 0; 1490 per_cpu(rcu_torture_count, cpu)[i] = 0;
1493 per_cpu(rcu_torture_batch, cpu)[i] = 0; 1491 per_cpu(rcu_torture_batch, cpu)[i] = 0;
1494 } 1492 }
1495 } 1493 }
1496 1494
1497 /* Start up the kthreads. */ 1495 /* Start up the kthreads. */
1498 1496
1499 VERBOSE_PRINTK_STRING("Creating rcu_torture_writer task"); 1497 VERBOSE_PRINTK_STRING("Creating rcu_torture_writer task");
1500 writer_task = kthread_run(rcu_torture_writer, NULL, 1498 writer_task = kthread_run(rcu_torture_writer, NULL,
1501 "rcu_torture_writer"); 1499 "rcu_torture_writer");
1502 if (IS_ERR(writer_task)) { 1500 if (IS_ERR(writer_task)) {
1503 firsterr = PTR_ERR(writer_task); 1501 firsterr = PTR_ERR(writer_task);
1504 VERBOSE_PRINTK_ERRSTRING("Failed to create writer"); 1502 VERBOSE_PRINTK_ERRSTRING("Failed to create writer");
1505 writer_task = NULL; 1503 writer_task = NULL;
1506 goto unwind; 1504 goto unwind;
1507 } 1505 }
1508 fakewriter_tasks = kzalloc(nfakewriters * sizeof(fakewriter_tasks[0]), 1506 fakewriter_tasks = kzalloc(nfakewriters * sizeof(fakewriter_tasks[0]),
1509 GFP_KERNEL); 1507 GFP_KERNEL);
1510 if (fakewriter_tasks == NULL) { 1508 if (fakewriter_tasks == NULL) {
1511 VERBOSE_PRINTK_ERRSTRING("out of memory"); 1509 VERBOSE_PRINTK_ERRSTRING("out of memory");
1512 firsterr = -ENOMEM; 1510 firsterr = -ENOMEM;
1513 goto unwind; 1511 goto unwind;
1514 } 1512 }
1515 for (i = 0; i < nfakewriters; i++) { 1513 for (i = 0; i < nfakewriters; i++) {
1516 VERBOSE_PRINTK_STRING("Creating rcu_torture_fakewriter task"); 1514 VERBOSE_PRINTK_STRING("Creating rcu_torture_fakewriter task");
1517 fakewriter_tasks[i] = kthread_run(rcu_torture_fakewriter, NULL, 1515 fakewriter_tasks[i] = kthread_run(rcu_torture_fakewriter, NULL,
1518 "rcu_torture_fakewriter"); 1516 "rcu_torture_fakewriter");
1519 if (IS_ERR(fakewriter_tasks[i])) { 1517 if (IS_ERR(fakewriter_tasks[i])) {
1520 firsterr = PTR_ERR(fakewriter_tasks[i]); 1518 firsterr = PTR_ERR(fakewriter_tasks[i]);
1521 VERBOSE_PRINTK_ERRSTRING("Failed to create fakewriter"); 1519 VERBOSE_PRINTK_ERRSTRING("Failed to create fakewriter");
1522 fakewriter_tasks[i] = NULL; 1520 fakewriter_tasks[i] = NULL;
1523 goto unwind; 1521 goto unwind;
1524 } 1522 }
1525 } 1523 }
1526 reader_tasks = kzalloc(nrealreaders * sizeof(reader_tasks[0]), 1524 reader_tasks = kzalloc(nrealreaders * sizeof(reader_tasks[0]),
1527 GFP_KERNEL); 1525 GFP_KERNEL);
1528 if (reader_tasks == NULL) { 1526 if (reader_tasks == NULL) {
1529 VERBOSE_PRINTK_ERRSTRING("out of memory"); 1527 VERBOSE_PRINTK_ERRSTRING("out of memory");
1530 firsterr = -ENOMEM; 1528 firsterr = -ENOMEM;
1531 goto unwind; 1529 goto unwind;
1532 } 1530 }
1533 for (i = 0; i < nrealreaders; i++) { 1531 for (i = 0; i < nrealreaders; i++) {
1534 VERBOSE_PRINTK_STRING("Creating rcu_torture_reader task"); 1532 VERBOSE_PRINTK_STRING("Creating rcu_torture_reader task");
1535 reader_tasks[i] = kthread_run(rcu_torture_reader, NULL, 1533 reader_tasks[i] = kthread_run(rcu_torture_reader, NULL,
1536 "rcu_torture_reader"); 1534 "rcu_torture_reader");
1537 if (IS_ERR(reader_tasks[i])) { 1535 if (IS_ERR(reader_tasks[i])) {
1538 firsterr = PTR_ERR(reader_tasks[i]); 1536 firsterr = PTR_ERR(reader_tasks[i]);
1539 VERBOSE_PRINTK_ERRSTRING("Failed to create reader"); 1537 VERBOSE_PRINTK_ERRSTRING("Failed to create reader");
1540 reader_tasks[i] = NULL; 1538 reader_tasks[i] = NULL;
1541 goto unwind; 1539 goto unwind;
1542 } 1540 }
1543 } 1541 }
1544 if (stat_interval > 0) { 1542 if (stat_interval > 0) {
1545 VERBOSE_PRINTK_STRING("Creating rcu_torture_stats task"); 1543 VERBOSE_PRINTK_STRING("Creating rcu_torture_stats task");
1546 stats_task = kthread_run(rcu_torture_stats, NULL, 1544 stats_task = kthread_run(rcu_torture_stats, NULL,
1547 "rcu_torture_stats"); 1545 "rcu_torture_stats");
1548 if (IS_ERR(stats_task)) { 1546 if (IS_ERR(stats_task)) {
1549 firsterr = PTR_ERR(stats_task); 1547 firsterr = PTR_ERR(stats_task);
1550 VERBOSE_PRINTK_ERRSTRING("Failed to create stats"); 1548 VERBOSE_PRINTK_ERRSTRING("Failed to create stats");
1551 stats_task = NULL; 1549 stats_task = NULL;
1552 goto unwind; 1550 goto unwind;
1553 } 1551 }
1554 } 1552 }
1555 if (test_no_idle_hz) { 1553 if (test_no_idle_hz) {
1556 rcu_idle_cpu = num_online_cpus() - 1; 1554 rcu_idle_cpu = num_online_cpus() - 1;
1557 1555
1558 if (!alloc_cpumask_var(&shuffle_tmp_mask, GFP_KERNEL)) { 1556 if (!alloc_cpumask_var(&shuffle_tmp_mask, GFP_KERNEL)) {
1559 firsterr = -ENOMEM; 1557 firsterr = -ENOMEM;
1560 VERBOSE_PRINTK_ERRSTRING("Failed to alloc mask"); 1558 VERBOSE_PRINTK_ERRSTRING("Failed to alloc mask");
1561 goto unwind; 1559 goto unwind;
1562 } 1560 }
1563 1561
1564 /* Create the shuffler thread */ 1562 /* Create the shuffler thread */
1565 shuffler_task = kthread_run(rcu_torture_shuffle, NULL, 1563 shuffler_task = kthread_run(rcu_torture_shuffle, NULL,
1566 "rcu_torture_shuffle"); 1564 "rcu_torture_shuffle");
1567 if (IS_ERR(shuffler_task)) { 1565 if (IS_ERR(shuffler_task)) {
1568 free_cpumask_var(shuffle_tmp_mask); 1566 free_cpumask_var(shuffle_tmp_mask);
1569 firsterr = PTR_ERR(shuffler_task); 1567 firsterr = PTR_ERR(shuffler_task);
1570 VERBOSE_PRINTK_ERRSTRING("Failed to create shuffler"); 1568 VERBOSE_PRINTK_ERRSTRING("Failed to create shuffler");
1571 shuffler_task = NULL; 1569 shuffler_task = NULL;
1572 goto unwind; 1570 goto unwind;
1573 } 1571 }
1574 } 1572 }
1575 if (stutter < 0) 1573 if (stutter < 0)
1576 stutter = 0; 1574 stutter = 0;
1577 if (stutter) { 1575 if (stutter) {
1578 /* Create the stutter thread */ 1576 /* Create the stutter thread */
1579 stutter_task = kthread_run(rcu_torture_stutter, NULL, 1577 stutter_task = kthread_run(rcu_torture_stutter, NULL,
1580 "rcu_torture_stutter"); 1578 "rcu_torture_stutter");
1581 if (IS_ERR(stutter_task)) { 1579 if (IS_ERR(stutter_task)) {
1582 firsterr = PTR_ERR(stutter_task); 1580 firsterr = PTR_ERR(stutter_task);
1583 VERBOSE_PRINTK_ERRSTRING("Failed to create stutter"); 1581 VERBOSE_PRINTK_ERRSTRING("Failed to create stutter");
1584 stutter_task = NULL; 1582 stutter_task = NULL;
1585 goto unwind; 1583 goto unwind;
1586 } 1584 }
1587 } 1585 }
1588 if (fqs_duration < 0) 1586 if (fqs_duration < 0)
1589 fqs_duration = 0; 1587 fqs_duration = 0;
1590 if (fqs_duration) { 1588 if (fqs_duration) {
1591 		/* Create the fqs thread */ 1589 		/* Create the fqs thread */
1592 fqs_task = kthread_run(rcu_torture_fqs, NULL, 1590 fqs_task = kthread_run(rcu_torture_fqs, NULL,
1593 "rcu_torture_fqs"); 1591 "rcu_torture_fqs");
1594 if (IS_ERR(fqs_task)) { 1592 if (IS_ERR(fqs_task)) {
1595 firsterr = PTR_ERR(fqs_task); 1593 firsterr = PTR_ERR(fqs_task);
1596 VERBOSE_PRINTK_ERRSTRING("Failed to create fqs"); 1594 VERBOSE_PRINTK_ERRSTRING("Failed to create fqs");
1597 fqs_task = NULL; 1595 fqs_task = NULL;
1598 goto unwind; 1596 goto unwind;
1599 } 1597 }
1600 } 1598 }
1601 if (test_boost_interval < 1) 1599 if (test_boost_interval < 1)
1602 test_boost_interval = 1; 1600 test_boost_interval = 1;
1603 if (test_boost_duration < 2) 1601 if (test_boost_duration < 2)
1604 test_boost_duration = 2; 1602 test_boost_duration = 2;
1605 if ((test_boost == 1 && cur_ops->can_boost) || 1603 if ((test_boost == 1 && cur_ops->can_boost) ||
1606 test_boost == 2) { 1604 test_boost == 2) {
1607 int retval; 1605 int retval;
1608 1606
1609 boost_starttime = jiffies + test_boost_interval * HZ; 1607 boost_starttime = jiffies + test_boost_interval * HZ;
1610 register_cpu_notifier(&rcutorture_cpu_nb); 1608 register_cpu_notifier(&rcutorture_cpu_nb);
1611 for_each_possible_cpu(i) { 1609 for_each_possible_cpu(i) {
1612 if (cpu_is_offline(i)) 1610 if (cpu_is_offline(i))
1613 continue; /* Heuristic: CPU can go offline. */ 1611 continue; /* Heuristic: CPU can go offline. */
1614 retval = rcutorture_booster_init(i); 1612 retval = rcutorture_booster_init(i);
1615 if (retval < 0) { 1613 if (retval < 0) {
1616 firsterr = retval; 1614 firsterr = retval;
1617 goto unwind; 1615 goto unwind;
1618 } 1616 }
1619 } 1617 }
1620 } 1618 }
1621 register_reboot_notifier(&rcutorture_shutdown_nb); 1619 register_reboot_notifier(&rcutorture_shutdown_nb);
1622 rcutorture_record_test_transition(); 1620 rcutorture_record_test_transition();
1623 mutex_unlock(&fullstop_mutex); 1621 mutex_unlock(&fullstop_mutex);
1624 return 0; 1622 return 0;
1625 1623
1626 unwind: 1624 unwind:
1627 mutex_unlock(&fullstop_mutex); 1625 mutex_unlock(&fullstop_mutex);
1628 rcu_torture_cleanup(); 1626 rcu_torture_cleanup();
1629 return firsterr; 1627 return firsterr;
1630 } 1628 }
1631 1629
1632 module_init(rcu_torture_init); 1630 module_init(rcu_torture_init);
1633 module_exit(rcu_torture_cleanup); 1631 module_exit(rcu_torture_cleanup);
1634 1632
1 /* 1 /*
2 * kernel/sched.c 2 * kernel/sched.c
3 * 3 *
4 * Kernel scheduler and related syscalls 4 * Kernel scheduler and related syscalls
5 * 5 *
6 * Copyright (C) 1991-2002 Linus Torvalds 6 * Copyright (C) 1991-2002 Linus Torvalds
7 * 7 *
8 * 1996-12-23 Modified by Dave Grothe to fix bugs in semaphores and 8 * 1996-12-23 Modified by Dave Grothe to fix bugs in semaphores and
9 * make semaphores SMP safe 9 * make semaphores SMP safe
10 * 1998-11-19 Implemented schedule_timeout() and related stuff 10 * 1998-11-19 Implemented schedule_timeout() and related stuff
11 * by Andrea Arcangeli 11 * by Andrea Arcangeli
12 * 2002-01-04 New ultra-scalable O(1) scheduler by Ingo Molnar: 12 * 2002-01-04 New ultra-scalable O(1) scheduler by Ingo Molnar:
13 * hybrid priority-list and round-robin design with 13 * hybrid priority-list and round-robin design with
14 * an array-switch method of distributing timeslices 14 * an array-switch method of distributing timeslices
15 * and per-CPU runqueues. Cleanups and useful suggestions 15 * and per-CPU runqueues. Cleanups and useful suggestions
16 * by Davide Libenzi, preemptible kernel bits by Robert Love. 16 * by Davide Libenzi, preemptible kernel bits by Robert Love.
17 * 2003-09-03 Interactivity tuning by Con Kolivas. 17 * 2003-09-03 Interactivity tuning by Con Kolivas.
18 * 2004-04-02 Scheduler domains code by Nick Piggin 18 * 2004-04-02 Scheduler domains code by Nick Piggin
19 * 2007-04-15 Work begun on replacing all interactivity tuning with a 19 * 2007-04-15 Work begun on replacing all interactivity tuning with a
20 * fair scheduling design by Con Kolivas. 20 * fair scheduling design by Con Kolivas.
21 * 2007-05-05 Load balancing (smp-nice) and other improvements 21 * 2007-05-05 Load balancing (smp-nice) and other improvements
22 * by Peter Williams 22 * by Peter Williams
23 * 2007-05-06 Interactivity improvements to CFS by Mike Galbraith 23 * 2007-05-06 Interactivity improvements to CFS by Mike Galbraith
24 * 2007-07-01 Group scheduling enhancements by Srivatsa Vaddagiri 24 * 2007-07-01 Group scheduling enhancements by Srivatsa Vaddagiri
25 * 2007-11-29 RT balancing improvements by Steven Rostedt, Gregory Haskins, 25 * 2007-11-29 RT balancing improvements by Steven Rostedt, Gregory Haskins,
26 * Thomas Gleixner, Mike Kravetz 26 * Thomas Gleixner, Mike Kravetz
27 */ 27 */
28 28
29 #include <linux/mm.h> 29 #include <linux/mm.h>
30 #include <linux/module.h> 30 #include <linux/module.h>
31 #include <linux/nmi.h> 31 #include <linux/nmi.h>
32 #include <linux/init.h> 32 #include <linux/init.h>
33 #include <linux/uaccess.h> 33 #include <linux/uaccess.h>
34 #include <linux/highmem.h> 34 #include <linux/highmem.h>
35 #include <asm/mmu_context.h> 35 #include <asm/mmu_context.h>
36 #include <linux/interrupt.h> 36 #include <linux/interrupt.h>
37 #include <linux/capability.h> 37 #include <linux/capability.h>
38 #include <linux/completion.h> 38 #include <linux/completion.h>
39 #include <linux/kernel_stat.h> 39 #include <linux/kernel_stat.h>
40 #include <linux/debug_locks.h> 40 #include <linux/debug_locks.h>
41 #include <linux/perf_event.h> 41 #include <linux/perf_event.h>
42 #include <linux/security.h> 42 #include <linux/security.h>
43 #include <linux/notifier.h> 43 #include <linux/notifier.h>
44 #include <linux/profile.h> 44 #include <linux/profile.h>
45 #include <linux/freezer.h> 45 #include <linux/freezer.h>
46 #include <linux/vmalloc.h> 46 #include <linux/vmalloc.h>
47 #include <linux/blkdev.h> 47 #include <linux/blkdev.h>
48 #include <linux/delay.h> 48 #include <linux/delay.h>
49 #include <linux/pid_namespace.h> 49 #include <linux/pid_namespace.h>
50 #include <linux/smp.h> 50 #include <linux/smp.h>
51 #include <linux/threads.h> 51 #include <linux/threads.h>
52 #include <linux/timer.h> 52 #include <linux/timer.h>
53 #include <linux/rcupdate.h> 53 #include <linux/rcupdate.h>
54 #include <linux/cpu.h> 54 #include <linux/cpu.h>
55 #include <linux/cpuset.h> 55 #include <linux/cpuset.h>
56 #include <linux/percpu.h> 56 #include <linux/percpu.h>
57 #include <linux/proc_fs.h> 57 #include <linux/proc_fs.h>
58 #include <linux/seq_file.h> 58 #include <linux/seq_file.h>
59 #include <linux/stop_machine.h> 59 #include <linux/stop_machine.h>
60 #include <linux/sysctl.h> 60 #include <linux/sysctl.h>
61 #include <linux/syscalls.h> 61 #include <linux/syscalls.h>
62 #include <linux/times.h> 62 #include <linux/times.h>
63 #include <linux/tsacct_kern.h> 63 #include <linux/tsacct_kern.h>
64 #include <linux/kprobes.h> 64 #include <linux/kprobes.h>
65 #include <linux/delayacct.h> 65 #include <linux/delayacct.h>
66 #include <linux/unistd.h> 66 #include <linux/unistd.h>
67 #include <linux/pagemap.h> 67 #include <linux/pagemap.h>
68 #include <linux/hrtimer.h> 68 #include <linux/hrtimer.h>
69 #include <linux/tick.h> 69 #include <linux/tick.h>
70 #include <linux/debugfs.h> 70 #include <linux/debugfs.h>
71 #include <linux/ctype.h> 71 #include <linux/ctype.h>
72 #include <linux/ftrace.h> 72 #include <linux/ftrace.h>
73 #include <linux/slab.h> 73 #include <linux/slab.h>
74 74
75 #include <asm/tlb.h> 75 #include <asm/tlb.h>
76 #include <asm/irq_regs.h> 76 #include <asm/irq_regs.h>
77 #include <asm/mutex.h> 77 #include <asm/mutex.h>
78 78
79 #include "sched_cpupri.h" 79 #include "sched_cpupri.h"
80 #include "workqueue_sched.h" 80 #include "workqueue_sched.h"
81 #include "sched_autogroup.h" 81 #include "sched_autogroup.h"
82 82
83 #define CREATE_TRACE_POINTS 83 #define CREATE_TRACE_POINTS
84 #include <trace/events/sched.h> 84 #include <trace/events/sched.h>
85 85
86 /* 86 /*
87 * Convert user-nice values [ -20 ... 0 ... 19 ] 87 * Convert user-nice values [ -20 ... 0 ... 19 ]
88 * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ], 88 * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
89 * and back. 89 * and back.
90 */ 90 */
91 #define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20) 91 #define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20)
92 #define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20) 92 #define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20)
93 #define TASK_NICE(p) PRIO_TO_NICE((p)->static_prio) 93 #define TASK_NICE(p) PRIO_TO_NICE((p)->static_prio)
94 94
95 /* 95 /*
96 * 'User priority' is the nice value converted to something we 96 * 'User priority' is the nice value converted to something we
97 * can work with better when scaling various scheduler parameters, 97 * can work with better when scaling various scheduler parameters,
98 * it's a [ 0 ... 39 ] range. 98 * it's a [ 0 ... 39 ] range.
99 */ 99 */
100 #define USER_PRIO(p) ((p)-MAX_RT_PRIO) 100 #define USER_PRIO(p) ((p)-MAX_RT_PRIO)
101 #define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio) 101 #define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio)
102 #define MAX_USER_PRIO (USER_PRIO(MAX_PRIO)) 102 #define MAX_USER_PRIO (USER_PRIO(MAX_PRIO))
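As a quick worked example of these conversions (editorial aside, not part of the diff; the numbers assume the usual MAX_RT_PRIO == 100 and MAX_PRIO == 140 of this kernel series):

    /* Illustrative arithmetic only:
     *	NICE_TO_PRIO(0)   == 100 +   0 + 20 == 120  (default static_prio)
     *	NICE_TO_PRIO(-20) == 100 + -20 + 20 == 100  (highest non-RT priority)
     *	NICE_TO_PRIO(19)  == 100 +  19 + 20 == 139  (lowest priority)
     *	PRIO_TO_NICE(120) == 120 - 100 - 20 ==   0
     *	USER_PRIO(120)    == 120 - 100      ==  20, MAX_USER_PRIO == 140 - 100 == 40,
     * so user priorities span the advertised [ 0 ... 39 ] range.
     */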
103 103
104 /* 104 /*
105 * Helpers for converting nanosecond timing to jiffy resolution 105 * Helpers for converting nanosecond timing to jiffy resolution
106 */ 106 */
107 #define NS_TO_JIFFIES(TIME) ((unsigned long)(TIME) / (NSEC_PER_SEC / HZ)) 107 #define NS_TO_JIFFIES(TIME) ((unsigned long)(TIME) / (NSEC_PER_SEC / HZ))
108 108
109 #define NICE_0_LOAD SCHED_LOAD_SCALE 109 #define NICE_0_LOAD SCHED_LOAD_SCALE
110 #define NICE_0_SHIFT SCHED_LOAD_SHIFT 110 #define NICE_0_SHIFT SCHED_LOAD_SHIFT
111 111
112 /* 112 /*
113 * These are the 'tuning knobs' of the scheduler: 113 * These are the 'tuning knobs' of the scheduler:
114 * 114 *
115 * default timeslice is 100 msecs (used only for SCHED_RR tasks). 115 * default timeslice is 100 msecs (used only for SCHED_RR tasks).
116 * Timeslices get refilled after they expire. 116 * Timeslices get refilled after they expire.
117 */ 117 */
118 #define DEF_TIMESLICE (100 * HZ / 1000) 118 #define DEF_TIMESLICE (100 * HZ / 1000)
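For concreteness (illustrative arithmetic, not from the diff), the same 100 ms works out to a different jiffy count depending on HZ:

    /* Worked example: DEF_TIMESLICE in jiffies for common HZ values:
     *	HZ == 1000:  100 * 1000 / 1000 == 100 jiffies
     *	HZ ==  250:  100 *  250 / 1000 ==  25 jiffies
     *	HZ ==  100:  100 *  100 / 1000 ==  10 jiffies
     * all corresponding to the 100 ms SCHED_RR timeslice described above.
     */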
119 119
120 /* 120 /*
121 * single value that denotes runtime == period, i.e. unlimited time. 121 * single value that denotes runtime == period, i.e. unlimited time.
122 */ 122 */
123 #define RUNTIME_INF ((u64)~0ULL) 123 #define RUNTIME_INF ((u64)~0ULL)
124 124
125 static inline int rt_policy(int policy) 125 static inline int rt_policy(int policy)
126 { 126 {
127 if (unlikely(policy == SCHED_FIFO || policy == SCHED_RR)) 127 if (unlikely(policy == SCHED_FIFO || policy == SCHED_RR))
128 return 1; 128 return 1;
129 return 0; 129 return 0;
130 } 130 }
131 131
132 static inline int task_has_rt_policy(struct task_struct *p) 132 static inline int task_has_rt_policy(struct task_struct *p)
133 { 133 {
134 return rt_policy(p->policy); 134 return rt_policy(p->policy);
135 } 135 }
136 136
137 /* 137 /*
138 * This is the priority-queue data structure of the RT scheduling class: 138 * This is the priority-queue data structure of the RT scheduling class:
139 */ 139 */
140 struct rt_prio_array { 140 struct rt_prio_array {
141 DECLARE_BITMAP(bitmap, MAX_RT_PRIO+1); /* include 1 bit for delimiter */ 141 DECLARE_BITMAP(bitmap, MAX_RT_PRIO+1); /* include 1 bit for delimiter */
142 struct list_head queue[MAX_RT_PRIO]; 142 struct list_head queue[MAX_RT_PRIO];
143 }; 143 };
144 144
145 struct rt_bandwidth { 145 struct rt_bandwidth {
146 /* nests inside the rq lock: */ 146 /* nests inside the rq lock: */
147 raw_spinlock_t rt_runtime_lock; 147 raw_spinlock_t rt_runtime_lock;
148 ktime_t rt_period; 148 ktime_t rt_period;
149 u64 rt_runtime; 149 u64 rt_runtime;
150 struct hrtimer rt_period_timer; 150 struct hrtimer rt_period_timer;
151 }; 151 };
152 152
153 static struct rt_bandwidth def_rt_bandwidth; 153 static struct rt_bandwidth def_rt_bandwidth;
154 154
155 static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun); 155 static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun);
156 156
157 static enum hrtimer_restart sched_rt_period_timer(struct hrtimer *timer) 157 static enum hrtimer_restart sched_rt_period_timer(struct hrtimer *timer)
158 { 158 {
159 struct rt_bandwidth *rt_b = 159 struct rt_bandwidth *rt_b =
160 container_of(timer, struct rt_bandwidth, rt_period_timer); 160 container_of(timer, struct rt_bandwidth, rt_period_timer);
161 ktime_t now; 161 ktime_t now;
162 int overrun; 162 int overrun;
163 int idle = 0; 163 int idle = 0;
164 164
165 for (;;) { 165 for (;;) {
166 now = hrtimer_cb_get_time(timer); 166 now = hrtimer_cb_get_time(timer);
167 overrun = hrtimer_forward(timer, now, rt_b->rt_period); 167 overrun = hrtimer_forward(timer, now, rt_b->rt_period);
168 168
169 if (!overrun) 169 if (!overrun)
170 break; 170 break;
171 171
172 idle = do_sched_rt_period_timer(rt_b, overrun); 172 idle = do_sched_rt_period_timer(rt_b, overrun);
173 } 173 }
174 174
175 return idle ? HRTIMER_NORESTART : HRTIMER_RESTART; 175 return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
176 } 176 }
177 177
178 static 178 static
179 void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime) 179 void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime)
180 { 180 {
181 rt_b->rt_period = ns_to_ktime(period); 181 rt_b->rt_period = ns_to_ktime(period);
182 rt_b->rt_runtime = runtime; 182 rt_b->rt_runtime = runtime;
183 183
184 raw_spin_lock_init(&rt_b->rt_runtime_lock); 184 raw_spin_lock_init(&rt_b->rt_runtime_lock);
185 185
186 hrtimer_init(&rt_b->rt_period_timer, 186 hrtimer_init(&rt_b->rt_period_timer,
187 CLOCK_MONOTONIC, HRTIMER_MODE_REL); 187 CLOCK_MONOTONIC, HRTIMER_MODE_REL);
188 rt_b->rt_period_timer.function = sched_rt_period_timer; 188 rt_b->rt_period_timer.function = sched_rt_period_timer;
189 } 189 }
190 190
191 static inline int rt_bandwidth_enabled(void) 191 static inline int rt_bandwidth_enabled(void)
192 { 192 {
193 return sysctl_sched_rt_runtime >= 0; 193 return sysctl_sched_rt_runtime >= 0;
194 } 194 }
195 195
196 static void start_rt_bandwidth(struct rt_bandwidth *rt_b) 196 static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
197 { 197 {
198 ktime_t now; 198 ktime_t now;
199 199
200 if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF) 200 if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)
201 return; 201 return;
202 202
203 if (hrtimer_active(&rt_b->rt_period_timer)) 203 if (hrtimer_active(&rt_b->rt_period_timer))
204 return; 204 return;
205 205
206 raw_spin_lock(&rt_b->rt_runtime_lock); 206 raw_spin_lock(&rt_b->rt_runtime_lock);
207 for (;;) { 207 for (;;) {
208 unsigned long delta; 208 unsigned long delta;
209 ktime_t soft, hard; 209 ktime_t soft, hard;
210 210
211 if (hrtimer_active(&rt_b->rt_period_timer)) 211 if (hrtimer_active(&rt_b->rt_period_timer))
212 break; 212 break;
213 213
214 now = hrtimer_cb_get_time(&rt_b->rt_period_timer); 214 now = hrtimer_cb_get_time(&rt_b->rt_period_timer);
215 hrtimer_forward(&rt_b->rt_period_timer, now, rt_b->rt_period); 215 hrtimer_forward(&rt_b->rt_period_timer, now, rt_b->rt_period);
216 216
217 soft = hrtimer_get_softexpires(&rt_b->rt_period_timer); 217 soft = hrtimer_get_softexpires(&rt_b->rt_period_timer);
218 hard = hrtimer_get_expires(&rt_b->rt_period_timer); 218 hard = hrtimer_get_expires(&rt_b->rt_period_timer);
219 delta = ktime_to_ns(ktime_sub(hard, soft)); 219 delta = ktime_to_ns(ktime_sub(hard, soft));
220 __hrtimer_start_range_ns(&rt_b->rt_period_timer, soft, delta, 220 __hrtimer_start_range_ns(&rt_b->rt_period_timer, soft, delta,
221 HRTIMER_MODE_ABS_PINNED, 0); 221 HRTIMER_MODE_ABS_PINNED, 0);
222 } 222 }
223 raw_spin_unlock(&rt_b->rt_runtime_lock); 223 raw_spin_unlock(&rt_b->rt_runtime_lock);
224 } 224 }
225 225
226 #ifdef CONFIG_RT_GROUP_SCHED 226 #ifdef CONFIG_RT_GROUP_SCHED
227 static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b) 227 static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b)
228 { 228 {
229 hrtimer_cancel(&rt_b->rt_period_timer); 229 hrtimer_cancel(&rt_b->rt_period_timer);
230 } 230 }
231 #endif 231 #endif
232 232
233 /* 233 /*
234 * sched_domains_mutex serializes calls to init_sched_domains, 234 * sched_domains_mutex serializes calls to init_sched_domains,
235 * detach_destroy_domains and partition_sched_domains. 235 * detach_destroy_domains and partition_sched_domains.
236 */ 236 */
237 static DEFINE_MUTEX(sched_domains_mutex); 237 static DEFINE_MUTEX(sched_domains_mutex);
238 238
239 #ifdef CONFIG_CGROUP_SCHED 239 #ifdef CONFIG_CGROUP_SCHED
240 240
241 #include <linux/cgroup.h> 241 #include <linux/cgroup.h>
242 242
243 struct cfs_rq; 243 struct cfs_rq;
244 244
245 static LIST_HEAD(task_groups); 245 static LIST_HEAD(task_groups);
246 246
247 /* task group related information */ 247 /* task group related information */
248 struct task_group { 248 struct task_group {
249 struct cgroup_subsys_state css; 249 struct cgroup_subsys_state css;
250 250
251 #ifdef CONFIG_FAIR_GROUP_SCHED 251 #ifdef CONFIG_FAIR_GROUP_SCHED
252 /* schedulable entities of this group on each cpu */ 252 /* schedulable entities of this group on each cpu */
253 struct sched_entity **se; 253 struct sched_entity **se;
254 /* runqueue "owned" by this group on each cpu */ 254 /* runqueue "owned" by this group on each cpu */
255 struct cfs_rq **cfs_rq; 255 struct cfs_rq **cfs_rq;
256 unsigned long shares; 256 unsigned long shares;
257 257
258 atomic_t load_weight; 258 atomic_t load_weight;
259 #endif 259 #endif
260 260
261 #ifdef CONFIG_RT_GROUP_SCHED 261 #ifdef CONFIG_RT_GROUP_SCHED
262 struct sched_rt_entity **rt_se; 262 struct sched_rt_entity **rt_se;
263 struct rt_rq **rt_rq; 263 struct rt_rq **rt_rq;
264 264
265 struct rt_bandwidth rt_bandwidth; 265 struct rt_bandwidth rt_bandwidth;
266 #endif 266 #endif
267 267
268 struct rcu_head rcu; 268 struct rcu_head rcu;
269 struct list_head list; 269 struct list_head list;
270 270
271 struct task_group *parent; 271 struct task_group *parent;
272 struct list_head siblings; 272 struct list_head siblings;
273 struct list_head children; 273 struct list_head children;
274 274
275 #ifdef CONFIG_SCHED_AUTOGROUP 275 #ifdef CONFIG_SCHED_AUTOGROUP
276 struct autogroup *autogroup; 276 struct autogroup *autogroup;
277 #endif 277 #endif
278 }; 278 };
279 279
280 /* task_group_lock serializes the addition/removal of task groups */ 280 /* task_group_lock serializes the addition/removal of task groups */
281 static DEFINE_SPINLOCK(task_group_lock); 281 static DEFINE_SPINLOCK(task_group_lock);
282 282
283 #ifdef CONFIG_FAIR_GROUP_SCHED 283 #ifdef CONFIG_FAIR_GROUP_SCHED
284 284
285 # define ROOT_TASK_GROUP_LOAD NICE_0_LOAD 285 # define ROOT_TASK_GROUP_LOAD NICE_0_LOAD
286 286
287 /* 287 /*
288 * A weight of 0 or 1 can cause arithmetic problems. 288 * A weight of 0 or 1 can cause arithmetic problems.
289 * The weight of a cfs_rq is the sum of the weights of the entities 289 * The weight of a cfs_rq is the sum of the weights of the entities
290 * queued on it, so neither the weight of an entity nor the shares 290 * queued on it, so neither the weight of an entity nor the shares
291 * value of a task group should be too large. 291 * value of a task group should be too large.
292 * (The default weight is 1024 - so there's no practical 292 * (The default weight is 1024 - so there's no practical
293 * limitation from this.) 293 * limitation from this.)
294 */ 294 */
295 #define MIN_SHARES 2 295 #define MIN_SHARES 2
296 #define MAX_SHARES (1UL << (18 + SCHED_LOAD_RESOLUTION)) 296 #define MAX_SHARES (1UL << (18 + SCHED_LOAD_RESOLUTION))
297 297
298 static int root_task_group_load = ROOT_TASK_GROUP_LOAD; 298 static int root_task_group_load = ROOT_TASK_GROUP_LOAD;
299 #endif 299 #endif
300 300
301 /* Default task group. 301 /* Default task group.
302 * Every task in the system belongs to this group at bootup. 302 * Every task in the system belongs to this group at bootup.
303 */ 303 */
304 struct task_group root_task_group; 304 struct task_group root_task_group;
305 305
306 #endif /* CONFIG_CGROUP_SCHED */ 306 #endif /* CONFIG_CGROUP_SCHED */
307 307
308 /* CFS-related fields in a runqueue */ 308 /* CFS-related fields in a runqueue */
309 struct cfs_rq { 309 struct cfs_rq {
310 struct load_weight load; 310 struct load_weight load;
311 unsigned long nr_running; 311 unsigned long nr_running;
312 312
313 u64 exec_clock; 313 u64 exec_clock;
314 u64 min_vruntime; 314 u64 min_vruntime;
315 #ifndef CONFIG_64BIT 315 #ifndef CONFIG_64BIT
316 u64 min_vruntime_copy; 316 u64 min_vruntime_copy;
317 #endif 317 #endif
318 318
319 struct rb_root tasks_timeline; 319 struct rb_root tasks_timeline;
320 struct rb_node *rb_leftmost; 320 struct rb_node *rb_leftmost;
321 321
322 struct list_head tasks; 322 struct list_head tasks;
323 struct list_head *balance_iterator; 323 struct list_head *balance_iterator;
324 324
325 /* 325 /*
326 * 'curr' points to currently running entity on this cfs_rq. 326 * 'curr' points to currently running entity on this cfs_rq.
327 * It is set to NULL otherwise (i.e. when none are currently running). 327 * It is set to NULL otherwise (i.e. when none are currently running).
328 */ 328 */
329 struct sched_entity *curr, *next, *last, *skip; 329 struct sched_entity *curr, *next, *last, *skip;
330 330
331 #ifdef CONFIG_SCHED_DEBUG 331 #ifdef CONFIG_SCHED_DEBUG
332 unsigned int nr_spread_over; 332 unsigned int nr_spread_over;
333 #endif 333 #endif
334 334
335 #ifdef CONFIG_FAIR_GROUP_SCHED 335 #ifdef CONFIG_FAIR_GROUP_SCHED
336 struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */ 336 struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */
337 337
338 /* 338 /*
339 * leaf cfs_rqs are those that hold tasks (lowest schedulable entity in 339 * leaf cfs_rqs are those that hold tasks (lowest schedulable entity in
340 * a hierarchy). Non-leaf lrqs hold other higher schedulable entities 340 * a hierarchy). Non-leaf lrqs hold other higher schedulable entities
341 * (like users, containers etc.) 341 * (like users, containers etc.)
342 * 342 *
343 * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This 343 * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This
344 * list is used during load balance. 344 * list is used during load balance.
345 */ 345 */
346 int on_list; 346 int on_list;
347 struct list_head leaf_cfs_rq_list; 347 struct list_head leaf_cfs_rq_list;
348 struct task_group *tg; /* group that "owns" this runqueue */ 348 struct task_group *tg; /* group that "owns" this runqueue */
349 349
350 #ifdef CONFIG_SMP 350 #ifdef CONFIG_SMP
351 /* 351 /*
352 * the part of load.weight contributed by tasks 352 * the part of load.weight contributed by tasks
353 */ 353 */
354 unsigned long task_weight; 354 unsigned long task_weight;
355 355
356 /* 356 /*
357 * h_load = weight * f(tg) 357 * h_load = weight * f(tg)
358 * 358 *
359 * Where f(tg) is the recursive weight fraction assigned to 359 * Where f(tg) is the recursive weight fraction assigned to
360 * this group. 360 * this group.
361 */ 361 */
362 unsigned long h_load; 362 unsigned long h_load;
363 363
364 /* 364 /*
365 * Maintaining per-cpu shares distribution for group scheduling 365 * Maintaining per-cpu shares distribution for group scheduling
366 * 366 *
367 * load_stamp is the last time we updated the load average 367 * load_stamp is the last time we updated the load average
368 * load_last is the last time we updated the load average and saw load 368 * load_last is the last time we updated the load average and saw load
369 * load_unacc_exec_time is currently unaccounted execution time 369 * load_unacc_exec_time is currently unaccounted execution time
370 */ 370 */
371 u64 load_avg; 371 u64 load_avg;
372 u64 load_period; 372 u64 load_period;
373 u64 load_stamp, load_last, load_unacc_exec_time; 373 u64 load_stamp, load_last, load_unacc_exec_time;
374 374
375 unsigned long load_contribution; 375 unsigned long load_contribution;
376 #endif 376 #endif
377 #endif 377 #endif
378 }; 378 };
379 379
380 /* Real-Time classes' related field in a runqueue: */ 380 /* Real-Time classes' related field in a runqueue: */
381 struct rt_rq { 381 struct rt_rq {
382 struct rt_prio_array active; 382 struct rt_prio_array active;
383 unsigned long rt_nr_running; 383 unsigned long rt_nr_running;
384 #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED 384 #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
385 struct { 385 struct {
386 int curr; /* highest queued rt task prio */ 386 int curr; /* highest queued rt task prio */
387 #ifdef CONFIG_SMP 387 #ifdef CONFIG_SMP
388 int next; /* next highest */ 388 int next; /* next highest */
389 #endif 389 #endif
390 } highest_prio; 390 } highest_prio;
391 #endif 391 #endif
392 #ifdef CONFIG_SMP 392 #ifdef CONFIG_SMP
393 unsigned long rt_nr_migratory; 393 unsigned long rt_nr_migratory;
394 unsigned long rt_nr_total; 394 unsigned long rt_nr_total;
395 int overloaded; 395 int overloaded;
396 struct plist_head pushable_tasks; 396 struct plist_head pushable_tasks;
397 #endif 397 #endif
398 int rt_throttled; 398 int rt_throttled;
399 u64 rt_time; 399 u64 rt_time;
400 u64 rt_runtime; 400 u64 rt_runtime;
401 /* Nests inside the rq lock: */ 401 /* Nests inside the rq lock: */
402 raw_spinlock_t rt_runtime_lock; 402 raw_spinlock_t rt_runtime_lock;
403 403
404 #ifdef CONFIG_RT_GROUP_SCHED 404 #ifdef CONFIG_RT_GROUP_SCHED
405 unsigned long rt_nr_boosted; 405 unsigned long rt_nr_boosted;
406 406
407 struct rq *rq; 407 struct rq *rq;
408 struct list_head leaf_rt_rq_list; 408 struct list_head leaf_rt_rq_list;
409 struct task_group *tg; 409 struct task_group *tg;
410 #endif 410 #endif
411 }; 411 };
412 412
413 #ifdef CONFIG_SMP 413 #ifdef CONFIG_SMP
414 414
415 /* 415 /*
416 * We add the notion of a root-domain which will be used to define per-domain 416 * We add the notion of a root-domain which will be used to define per-domain
417 * variables. Each exclusive cpuset essentially defines an island domain by 417 * variables. Each exclusive cpuset essentially defines an island domain by
418 * fully partitioning the member cpus from any other cpuset. Whenever a new 418 * fully partitioning the member cpus from any other cpuset. Whenever a new
419 * exclusive cpuset is created, we also create and attach a new root-domain 419 * exclusive cpuset is created, we also create and attach a new root-domain
420 * object. 420 * object.
421 * 421 *
422 */ 422 */
423 struct root_domain { 423 struct root_domain {
424 atomic_t refcount; 424 atomic_t refcount;
425 struct rcu_head rcu; 425 struct rcu_head rcu;
426 cpumask_var_t span; 426 cpumask_var_t span;
427 cpumask_var_t online; 427 cpumask_var_t online;
428 428
429 /* 429 /*
430 * The "RT overload" flag: it gets set if a CPU has more than 430 * The "RT overload" flag: it gets set if a CPU has more than
431 * one runnable RT task. 431 * one runnable RT task.
432 */ 432 */
433 cpumask_var_t rto_mask; 433 cpumask_var_t rto_mask;
434 atomic_t rto_count; 434 atomic_t rto_count;
435 struct cpupri cpupri; 435 struct cpupri cpupri;
436 }; 436 };
437 437
438 /* 438 /*
439 * By default the system creates a single root-domain with all cpus as 439 * By default the system creates a single root-domain with all cpus as
440 * members (mimicking the global state we have today). 440 * members (mimicking the global state we have today).
441 */ 441 */
442 static struct root_domain def_root_domain; 442 static struct root_domain def_root_domain;
443 443
444 #endif /* CONFIG_SMP */ 444 #endif /* CONFIG_SMP */
445 445
446 /* 446 /*
447 * This is the main, per-CPU runqueue data structure. 447 * This is the main, per-CPU runqueue data structure.
448 * 448 *
449 * Locking rule: in code paths that lock multiple runqueues (such as 449 * Locking rule: in code paths that lock multiple runqueues (such as
450 * load balancing or thread migration), the lock acquire operations 450 * load balancing or thread migration), the lock acquire operations
451 * must be ordered by ascending &runqueue. 451 * must be ordered by ascending &runqueue.
452 */ 452 */
453 struct rq { 453 struct rq {
454 /* runqueue lock: */ 454 /* runqueue lock: */
455 raw_spinlock_t lock; 455 raw_spinlock_t lock;
456 456
457 /* 457 /*
458 * nr_running and cpu_load should be in the same cacheline because 458 * nr_running and cpu_load should be in the same cacheline because
459 * remote CPUs use both these fields when doing load calculation. 459 * remote CPUs use both these fields when doing load calculation.
460 */ 460 */
461 unsigned long nr_running; 461 unsigned long nr_running;
462 #define CPU_LOAD_IDX_MAX 5 462 #define CPU_LOAD_IDX_MAX 5
463 unsigned long cpu_load[CPU_LOAD_IDX_MAX]; 463 unsigned long cpu_load[CPU_LOAD_IDX_MAX];
464 unsigned long last_load_update_tick; 464 unsigned long last_load_update_tick;
465 #ifdef CONFIG_NO_HZ 465 #ifdef CONFIG_NO_HZ
466 u64 nohz_stamp; 466 u64 nohz_stamp;
467 unsigned char nohz_balance_kick; 467 unsigned char nohz_balance_kick;
468 #endif 468 #endif
469 int skip_clock_update; 469 int skip_clock_update;
470 470
471 /* capture load from *all* tasks on this cpu: */ 471 /* capture load from *all* tasks on this cpu: */
472 struct load_weight load; 472 struct load_weight load;
473 unsigned long nr_load_updates; 473 unsigned long nr_load_updates;
474 u64 nr_switches; 474 u64 nr_switches;
475 475
476 struct cfs_rq cfs; 476 struct cfs_rq cfs;
477 struct rt_rq rt; 477 struct rt_rq rt;
478 478
479 #ifdef CONFIG_FAIR_GROUP_SCHED 479 #ifdef CONFIG_FAIR_GROUP_SCHED
480 /* list of leaf cfs_rq on this cpu: */ 480 /* list of leaf cfs_rq on this cpu: */
481 struct list_head leaf_cfs_rq_list; 481 struct list_head leaf_cfs_rq_list;
482 #endif 482 #endif
483 #ifdef CONFIG_RT_GROUP_SCHED 483 #ifdef CONFIG_RT_GROUP_SCHED
484 struct list_head leaf_rt_rq_list; 484 struct list_head leaf_rt_rq_list;
485 #endif 485 #endif
486 486
487 /* 487 /*
488 * This is part of a global counter where only the total sum 488 * This is part of a global counter where only the total sum
489 * over all CPUs matters. A task can increase this counter on 489 * over all CPUs matters. A task can increase this counter on
490 * one CPU and if it got migrated afterwards it may decrease 490 * one CPU and if it got migrated afterwards it may decrease
491 * it on another CPU. Always updated under the runqueue lock: 491 * it on another CPU. Always updated under the runqueue lock:
492 */ 492 */
493 unsigned long nr_uninterruptible; 493 unsigned long nr_uninterruptible;
494 494
495 struct task_struct *curr, *idle, *stop; 495 struct task_struct *curr, *idle, *stop;
496 unsigned long next_balance; 496 unsigned long next_balance;
497 struct mm_struct *prev_mm; 497 struct mm_struct *prev_mm;
498 498
499 u64 clock; 499 u64 clock;
500 u64 clock_task; 500 u64 clock_task;
501 501
502 atomic_t nr_iowait; 502 atomic_t nr_iowait;
503 503
504 #ifdef CONFIG_SMP 504 #ifdef CONFIG_SMP
505 struct root_domain *rd; 505 struct root_domain *rd;
506 struct sched_domain *sd; 506 struct sched_domain *sd;
507 507
508 unsigned long cpu_power; 508 unsigned long cpu_power;
509 509
510 unsigned char idle_at_tick; 510 unsigned char idle_at_tick;
511 /* For active balancing */ 511 /* For active balancing */
512 int post_schedule; 512 int post_schedule;
513 int active_balance; 513 int active_balance;
514 int push_cpu; 514 int push_cpu;
515 struct cpu_stop_work active_balance_work; 515 struct cpu_stop_work active_balance_work;
516 /* cpu of this runqueue: */ 516 /* cpu of this runqueue: */
517 int cpu; 517 int cpu;
518 int online; 518 int online;
519 519
520 unsigned long avg_load_per_task; 520 unsigned long avg_load_per_task;
521 521
522 u64 rt_avg; 522 u64 rt_avg;
523 u64 age_stamp; 523 u64 age_stamp;
524 u64 idle_stamp; 524 u64 idle_stamp;
525 u64 avg_idle; 525 u64 avg_idle;
526 #endif 526 #endif
527 527
528 #ifdef CONFIG_IRQ_TIME_ACCOUNTING 528 #ifdef CONFIG_IRQ_TIME_ACCOUNTING
529 u64 prev_irq_time; 529 u64 prev_irq_time;
530 #endif 530 #endif
531 531
532 /* calc_load related fields */ 532 /* calc_load related fields */
533 unsigned long calc_load_update; 533 unsigned long calc_load_update;
534 long calc_load_active; 534 long calc_load_active;
535 535
536 #ifdef CONFIG_SCHED_HRTICK 536 #ifdef CONFIG_SCHED_HRTICK
537 #ifdef CONFIG_SMP 537 #ifdef CONFIG_SMP
538 int hrtick_csd_pending; 538 int hrtick_csd_pending;
539 struct call_single_data hrtick_csd; 539 struct call_single_data hrtick_csd;
540 #endif 540 #endif
541 struct hrtimer hrtick_timer; 541 struct hrtimer hrtick_timer;
542 #endif 542 #endif
543 543
544 #ifdef CONFIG_SCHEDSTATS 544 #ifdef CONFIG_SCHEDSTATS
545 /* latency stats */ 545 /* latency stats */
546 struct sched_info rq_sched_info; 546 struct sched_info rq_sched_info;
547 unsigned long long rq_cpu_time; 547 unsigned long long rq_cpu_time;
548 /* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */ 548 /* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */
549 549
550 /* sys_sched_yield() stats */ 550 /* sys_sched_yield() stats */
551 unsigned int yld_count; 551 unsigned int yld_count;
552 552
553 /* schedule() stats */ 553 /* schedule() stats */
554 unsigned int sched_switch; 554 unsigned int sched_switch;
555 unsigned int sched_count; 555 unsigned int sched_count;
556 unsigned int sched_goidle; 556 unsigned int sched_goidle;
557 557
558 /* try_to_wake_up() stats */ 558 /* try_to_wake_up() stats */
559 unsigned int ttwu_count; 559 unsigned int ttwu_count;
560 unsigned int ttwu_local; 560 unsigned int ttwu_local;
561 #endif 561 #endif
562 562
563 #ifdef CONFIG_SMP 563 #ifdef CONFIG_SMP
564 struct task_struct *wake_list; 564 struct task_struct *wake_list;
565 #endif 565 #endif
566 }; 566 };
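The ascending-&runqueue rule stated above struct rq can be made concrete with a small editorial sketch (the helper name lock_two_rqs is hypothetical and not part of this commit; the scheduler's own double_rq_lock() helper takes the same pointer-comparison approach):

    /* Hypothetical sketch: lock two runqueues in ascending address order so
     * that two CPUs locking the same pair cannot deadlock against each other. */
    static void lock_two_rqs(struct rq *rq1, struct rq *rq2)
    	__acquires(rq1->lock)
    	__acquires(rq2->lock)
    {
    	if (rq1 == rq2) {
    		raw_spin_lock(&rq1->lock);
    		__acquire(rq2->lock);	/* fake the second acquire for sparse */
    	} else if (rq1 < rq2) {
    		raw_spin_lock(&rq1->lock);
    		raw_spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING);
    	} else {
    		raw_spin_lock(&rq2->lock);
    		raw_spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING);
    	}
    }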
567 567
568 static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); 568 static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
569 569
570 570
571 static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags); 571 static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags);
572 572
573 static inline int cpu_of(struct rq *rq) 573 static inline int cpu_of(struct rq *rq)
574 { 574 {
575 #ifdef CONFIG_SMP 575 #ifdef CONFIG_SMP
576 return rq->cpu; 576 return rq->cpu;
577 #else 577 #else
578 return 0; 578 return 0;
579 #endif 579 #endif
580 } 580 }
581 581
582 #define rcu_dereference_check_sched_domain(p) \ 582 #define rcu_dereference_check_sched_domain(p) \
583 rcu_dereference_check((p), \ 583 rcu_dereference_check((p), \
584 rcu_read_lock_held() || \
585 lockdep_is_held(&sched_domains_mutex)) 584 lockdep_is_held(&sched_domains_mutex))
586 585
587 /* 586 /*
588 * The domain tree (rq->sd) is protected by RCU's quiescent state transition. 587 * The domain tree (rq->sd) is protected by RCU's quiescent state transition.
589 * See detach_destroy_domains: synchronize_sched for details. 588 * See detach_destroy_domains: synchronize_sched for details.
590 * 589 *
591 * The domain tree of any CPU may only be accessed from within 590 * The domain tree of any CPU may only be accessed from within
592 * preempt-disabled sections. 591 * preempt-disabled sections.
593 */ 592 */
594 #define for_each_domain(cpu, __sd) \ 593 #define for_each_domain(cpu, __sd) \
595 for (__sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); __sd; __sd = __sd->parent) 594 for (__sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); __sd; __sd = __sd->parent)
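A minimal usage sketch (the function count_domains is hypothetical, not from this diff) of why the explicit rcu_read_lock_held() term deleted above is no longer needed: rcu_dereference_check() already folds rcu_read_lock_held() into its condition, so an ordinary RCU read-side critical section satisfies the lockdep check, and holding sched_domains_mutex is the other accepted context.

    /* Hypothetical, illustrative only: walk a CPU's domain tree from an RCU
     * read-side critical section. Either rcu_read_lock() here or holding
     * sched_domains_mutex satisfies rcu_dereference_check_sched_domain(). */
    static int count_domains(int cpu)
    {
    	struct sched_domain *sd;
    	int levels = 0;

    	rcu_read_lock();
    	for_each_domain(cpu, sd)
    		levels++;
    	rcu_read_unlock();

    	return levels;
    }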
596 595
597 #define cpu_rq(cpu) (&per_cpu(runqueues, (cpu))) 596 #define cpu_rq(cpu) (&per_cpu(runqueues, (cpu)))
598 #define this_rq() (&__get_cpu_var(runqueues)) 597 #define this_rq() (&__get_cpu_var(runqueues))
599 #define task_rq(p) cpu_rq(task_cpu(p)) 598 #define task_rq(p) cpu_rq(task_cpu(p))
600 #define cpu_curr(cpu) (cpu_rq(cpu)->curr) 599 #define cpu_curr(cpu) (cpu_rq(cpu)->curr)
601 #define raw_rq() (&__raw_get_cpu_var(runqueues)) 600 #define raw_rq() (&__raw_get_cpu_var(runqueues))
602 601
603 #ifdef CONFIG_CGROUP_SCHED 602 #ifdef CONFIG_CGROUP_SCHED
604 603
605 /* 604 /*
606 * Return the group to which this task belongs. 605 * Return the group to which this task belongs.
607 * 606 *
608 * We use task_subsys_state_check() and extend the RCU verification with 607 * We use task_subsys_state_check() and extend the RCU verification with
609 * pi->lock and rq->lock because cpu_cgroup_attach() holds those locks for each 608 * pi->lock and rq->lock because cpu_cgroup_attach() holds those locks for each
610 * task it moves into the cgroup. Therefore by holding either of those locks, 609 * task it moves into the cgroup. Therefore by holding either of those locks,
611 * we pin the task to the current cgroup. 610 * we pin the task to the current cgroup.
612 */ 611 */
613 static inline struct task_group *task_group(struct task_struct *p) 612 static inline struct task_group *task_group(struct task_struct *p)
614 { 613 {
615 struct task_group *tg; 614 struct task_group *tg;
616 struct cgroup_subsys_state *css; 615 struct cgroup_subsys_state *css;
617 616
618 css = task_subsys_state_check(p, cpu_cgroup_subsys_id, 617 css = task_subsys_state_check(p, cpu_cgroup_subsys_id,
619 lockdep_is_held(&p->pi_lock) || 618 lockdep_is_held(&p->pi_lock) ||
620 lockdep_is_held(&task_rq(p)->lock)); 619 lockdep_is_held(&task_rq(p)->lock));
621 tg = container_of(css, struct task_group, css); 620 tg = container_of(css, struct task_group, css);
622 621
623 return autogroup_task_group(p, tg); 622 return autogroup_task_group(p, tg);
624 } 623 }
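For illustration of the pinning rule described above (a hypothetical snippet, not part of this diff; it assumes CONFIG_CGROUP_SCHED, as does the surrounding #ifdef block), holding p->pi_lock alone is enough to keep task_group(p) stable:

    /* Hypothetical, illustrative only: snapshot a task's group under pi_lock. */
    static struct task_group *task_group_snapshot(struct task_struct *p)
    {
    	struct task_group *tg;
    	unsigned long flags;

    	raw_spin_lock_irqsave(&p->pi_lock, flags);
    	tg = task_group(p);		/* stable while pi_lock is held */
    	raw_spin_unlock_irqrestore(&p->pi_lock, flags);

    	return tg;	/* only a snapshot once the lock is dropped */
    }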
625 624
626 /* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */ 625 /* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */
627 static inline void set_task_rq(struct task_struct *p, unsigned int cpu) 626 static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
628 { 627 {
629 #ifdef CONFIG_FAIR_GROUP_SCHED 628 #ifdef CONFIG_FAIR_GROUP_SCHED
630 p->se.cfs_rq = task_group(p)->cfs_rq[cpu]; 629 p->se.cfs_rq = task_group(p)->cfs_rq[cpu];
631 p->se.parent = task_group(p)->se[cpu]; 630 p->se.parent = task_group(p)->se[cpu];
632 #endif 631 #endif
633 632
634 #ifdef CONFIG_RT_GROUP_SCHED 633 #ifdef CONFIG_RT_GROUP_SCHED
635 p->rt.rt_rq = task_group(p)->rt_rq[cpu]; 634 p->rt.rt_rq = task_group(p)->rt_rq[cpu];
636 p->rt.parent = task_group(p)->rt_se[cpu]; 635 p->rt.parent = task_group(p)->rt_se[cpu];
637 #endif 636 #endif
638 } 637 }
639 638
640 #else /* CONFIG_CGROUP_SCHED */ 639 #else /* CONFIG_CGROUP_SCHED */
641 640
642 static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { } 641 static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { }
643 static inline struct task_group *task_group(struct task_struct *p) 642 static inline struct task_group *task_group(struct task_struct *p)
644 { 643 {
645 return NULL; 644 return NULL;
646 } 645 }
647 646
648 #endif /* CONFIG_CGROUP_SCHED */ 647 #endif /* CONFIG_CGROUP_SCHED */
649 648
650 static void update_rq_clock_task(struct rq *rq, s64 delta); 649 static void update_rq_clock_task(struct rq *rq, s64 delta);
651 650
652 static void update_rq_clock(struct rq *rq) 651 static void update_rq_clock(struct rq *rq)
653 { 652 {
654 s64 delta; 653 s64 delta;
655 654
656 if (rq->skip_clock_update > 0) 655 if (rq->skip_clock_update > 0)
657 return; 656 return;
658 657
659 delta = sched_clock_cpu(cpu_of(rq)) - rq->clock; 658 delta = sched_clock_cpu(cpu_of(rq)) - rq->clock;
660 rq->clock += delta; 659 rq->clock += delta;
661 update_rq_clock_task(rq, delta); 660 update_rq_clock_task(rq, delta);
662 } 661 }
663 662
664 /* 663 /*
665 * Tunables that become constants when CONFIG_SCHED_DEBUG is off: 664 * Tunables that become constants when CONFIG_SCHED_DEBUG is off:
666 */ 665 */
667 #ifdef CONFIG_SCHED_DEBUG 666 #ifdef CONFIG_SCHED_DEBUG
668 # define const_debug __read_mostly 667 # define const_debug __read_mostly
669 #else 668 #else
670 # define const_debug static const 669 # define const_debug static const
671 #endif 670 #endif
672 671
673 /** 672 /**
674 * runqueue_is_locked - Returns true if the current cpu runqueue is locked 673 * runqueue_is_locked - Returns true if the current cpu runqueue is locked
675 * @cpu: the processor in question. 674 * @cpu: the processor in question.
676 * 675 *
677 * This interface allows printk to be called with the runqueue lock 676 * This interface allows printk to be called with the runqueue lock
678 * held and know whether or not it is OK to wake up the klogd. 677 * held and know whether or not it is OK to wake up the klogd.
679 */ 678 */
680 int runqueue_is_locked(int cpu) 679 int runqueue_is_locked(int cpu)
681 { 680 {
682 return raw_spin_is_locked(&cpu_rq(cpu)->lock); 681 return raw_spin_is_locked(&cpu_rq(cpu)->lock);
683 } 682 }
684 683
685 /* 684 /*
686 * Debugging: various feature bits 685 * Debugging: various feature bits
687 */ 686 */
688 687
689 #define SCHED_FEAT(name, enabled) \ 688 #define SCHED_FEAT(name, enabled) \
690 __SCHED_FEAT_##name , 689 __SCHED_FEAT_##name ,
691 690
692 enum { 691 enum {
693 #include "sched_features.h" 692 #include "sched_features.h"
694 }; 693 };
695 694
696 #undef SCHED_FEAT 695 #undef SCHED_FEAT
697 696
698 #define SCHED_FEAT(name, enabled) \ 697 #define SCHED_FEAT(name, enabled) \
699 (1UL << __SCHED_FEAT_##name) * enabled | 698 (1UL << __SCHED_FEAT_##name) * enabled |
700 699
701 const_debug unsigned int sysctl_sched_features = 700 const_debug unsigned int sysctl_sched_features =
702 #include "sched_features.h" 701 #include "sched_features.h"
703 0; 702 0;
704 703
705 #undef SCHED_FEAT 704 #undef SCHED_FEAT
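The SCHED_FEAT trickery above is a classic X-macro: sched_features.h is expanded twice with different definitions of SCHED_FEAT(), once to build an enum of bit indices and once to build the default bitmask. A stand-alone sketch of the same pattern (all names below are invented for illustration; this is not kernel code):

    /* Hypothetical, self-contained version of the pattern used above:
     * one feature list, expanded twice with different FEATURE() macros. */
    #define FEATURE_LIST \
    	FEATURE(GENTLE_SLEEPERS, 1) \
    	FEATURE(START_DEBIT, 0)

    #define FEATURE(name, enabled) __FEAT_##name,
    enum { FEATURE_LIST __FEAT_NR };	/* bit indices 0, 1, ... */
    #undef FEATURE

    #define FEATURE(name, enabled) ((1UL << __FEAT_##name) * (enabled)) |
    static const unsigned long default_features = FEATURE_LIST 0;	/* == 0x1 here */
    #undef FEATURE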
706 705
707 #ifdef CONFIG_SCHED_DEBUG 706 #ifdef CONFIG_SCHED_DEBUG
708 #define SCHED_FEAT(name, enabled) \ 707 #define SCHED_FEAT(name, enabled) \
709 #name , 708 #name ,
710 709
711 static __read_mostly char *sched_feat_names[] = { 710 static __read_mostly char *sched_feat_names[] = {
712 #include "sched_features.h" 711 #include "sched_features.h"
713 NULL 712 NULL
714 }; 713 };
715 714
716 #undef SCHED_FEAT 715 #undef SCHED_FEAT
717 716
718 static int sched_feat_show(struct seq_file *m, void *v) 717 static int sched_feat_show(struct seq_file *m, void *v)
719 { 718 {
720 int i; 719 int i;
721 720
722 for (i = 0; sched_feat_names[i]; i++) { 721 for (i = 0; sched_feat_names[i]; i++) {
723 if (!(sysctl_sched_features & (1UL << i))) 722 if (!(sysctl_sched_features & (1UL << i)))
724 seq_puts(m, "NO_"); 723 seq_puts(m, "NO_");
725 seq_printf(m, "%s ", sched_feat_names[i]); 724 seq_printf(m, "%s ", sched_feat_names[i]);
726 } 725 }
727 seq_puts(m, "\n"); 726 seq_puts(m, "\n");
728 727
729 return 0; 728 return 0;
730 } 729 }
731 730
732 static ssize_t 731 static ssize_t
733 sched_feat_write(struct file *filp, const char __user *ubuf, 732 sched_feat_write(struct file *filp, const char __user *ubuf,
734 size_t cnt, loff_t *ppos) 733 size_t cnt, loff_t *ppos)
735 { 734 {
736 char buf[64]; 735 char buf[64];
737 char *cmp; 736 char *cmp;
738 int neg = 0; 737 int neg = 0;
739 int i; 738 int i;
740 739
741 if (cnt > 63) 740 if (cnt > 63)
742 cnt = 63; 741 cnt = 63;
743 742
744 if (copy_from_user(&buf, ubuf, cnt)) 743 if (copy_from_user(&buf, ubuf, cnt))
745 return -EFAULT; 744 return -EFAULT;
746 745
747 buf[cnt] = 0; 746 buf[cnt] = 0;
748 cmp = strstrip(buf); 747 cmp = strstrip(buf);
749 748
750 if (strncmp(cmp, "NO_", 3) == 0) { 749 if (strncmp(cmp, "NO_", 3) == 0) {
751 neg = 1; 750 neg = 1;
752 cmp += 3; 751 cmp += 3;
753 } 752 }
754 753
755 for (i = 0; sched_feat_names[i]; i++) { 754 for (i = 0; sched_feat_names[i]; i++) {
756 if (strcmp(cmp, sched_feat_names[i]) == 0) { 755 if (strcmp(cmp, sched_feat_names[i]) == 0) {
757 if (neg) 756 if (neg)
758 sysctl_sched_features &= ~(1UL << i); 757 sysctl_sched_features &= ~(1UL << i);
759 else 758 else
760 sysctl_sched_features |= (1UL << i); 759 sysctl_sched_features |= (1UL << i);
761 break; 760 break;
762 } 761 }
763 } 762 }
764 763
765 if (!sched_feat_names[i]) 764 if (!sched_feat_names[i])
766 return -EINVAL; 765 return -EINVAL;
767 766
768 *ppos += cnt; 767 *ppos += cnt;
769 768
770 return cnt; 769 return cnt;
771 } 770 }
772 771
773 static int sched_feat_open(struct inode *inode, struct file *filp) 772 static int sched_feat_open(struct inode *inode, struct file *filp)
774 { 773 {
775 return single_open(filp, sched_feat_show, NULL); 774 return single_open(filp, sched_feat_show, NULL);
776 } 775 }
777 776
778 static const struct file_operations sched_feat_fops = { 777 static const struct file_operations sched_feat_fops = {
779 .open = sched_feat_open, 778 .open = sched_feat_open,
780 .write = sched_feat_write, 779 .write = sched_feat_write,
781 .read = seq_read, 780 .read = seq_read,
782 .llseek = seq_lseek, 781 .llseek = seq_lseek,
783 .release = single_release, 782 .release = single_release,
784 }; 783 };
785 784
786 static __init int sched_init_debug(void) 785 static __init int sched_init_debug(void)
787 { 786 {
788 debugfs_create_file("sched_features", 0644, NULL, NULL, 787 debugfs_create_file("sched_features", 0644, NULL, NULL,
789 &sched_feat_fops); 788 &sched_feat_fops);
790 789
791 return 0; 790 return 0;
792 } 791 }
793 late_initcall(sched_init_debug); 792 late_initcall(sched_init_debug);
794 793
795 #endif 794 #endif
796 795
797 #define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x)) 796 #define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x))
798 797
799 /* 798 /*
800 * Number of tasks to iterate in a single balance run. 799 * Number of tasks to iterate in a single balance run.
801 * Limited because this is done with IRQs disabled. 800 * Limited because this is done with IRQs disabled.
802 */ 801 */
803 const_debug unsigned int sysctl_sched_nr_migrate = 32; 802 const_debug unsigned int sysctl_sched_nr_migrate = 32;
804 803
805 /* 804 /*
806 * period over which we average the RT time consumption, measured 805 * period over which we average the RT time consumption, measured
807 * in ms. 806 * in ms.
808 * 807 *
809 * default: 1s 808 * default: 1s
810 */ 809 */
811 const_debug unsigned int sysctl_sched_time_avg = MSEC_PER_SEC; 810 const_debug unsigned int sysctl_sched_time_avg = MSEC_PER_SEC;
812 811
813 /* 812 /*
814 * period over which we measure -rt task cpu usage in us. 813 * period over which we measure -rt task cpu usage in us.
815 * default: 1s 814 * default: 1s
816 */ 815 */
817 unsigned int sysctl_sched_rt_period = 1000000; 816 unsigned int sysctl_sched_rt_period = 1000000;
818 817
819 static __read_mostly int scheduler_running; 818 static __read_mostly int scheduler_running;
820 819
821 /* 820 /*
822 * part of the period that we allow rt tasks to run in us. 821 * part of the period that we allow rt tasks to run in us.
823 * default: 0.95s 822 * default: 0.95s
824 */ 823 */
825 int sysctl_sched_rt_runtime = 950000; 824 int sysctl_sched_rt_runtime = 950000;
826 825
827 static inline u64 global_rt_period(void) 826 static inline u64 global_rt_period(void)
828 { 827 {
829 return (u64)sysctl_sched_rt_period * NSEC_PER_USEC; 828 return (u64)sysctl_sched_rt_period * NSEC_PER_USEC;
830 } 829 }
831 830
832 static inline u64 global_rt_runtime(void) 831 static inline u64 global_rt_runtime(void)
833 { 832 {
834 if (sysctl_sched_rt_runtime < 0) 833 if (sysctl_sched_rt_runtime < 0)
835 return RUNTIME_INF; 834 return RUNTIME_INF;
836 835
837 return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC; 836 return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC;
838 } 837 }
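With the defaults above, the arithmetic works out as follows (illustrative only; NSEC_PER_USEC is 1000):

    /* Worked example of the default rt bandwidth settings:
     *	global_rt_period()  == 1000000 * 1000 ns == 1 s
     *	global_rt_runtime() ==  950000 * 1000 ns == 0.95 s
     * i.e. realtime tasks may consume at most 95% of every 1 s period,
     * and a negative sysctl_sched_rt_runtime yields RUNTIME_INF (no limit).
     */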
839 838
840 #ifndef prepare_arch_switch 839 #ifndef prepare_arch_switch
841 # define prepare_arch_switch(next) do { } while (0) 840 # define prepare_arch_switch(next) do { } while (0)
842 #endif 841 #endif
843 #ifndef finish_arch_switch 842 #ifndef finish_arch_switch
844 # define finish_arch_switch(prev) do { } while (0) 843 # define finish_arch_switch(prev) do { } while (0)
845 #endif 844 #endif
846 845
847 static inline int task_current(struct rq *rq, struct task_struct *p) 846 static inline int task_current(struct rq *rq, struct task_struct *p)
848 { 847 {
849 return rq->curr == p; 848 return rq->curr == p;
850 } 849 }
851 850
852 static inline int task_running(struct rq *rq, struct task_struct *p) 851 static inline int task_running(struct rq *rq, struct task_struct *p)
853 { 852 {
854 #ifdef CONFIG_SMP 853 #ifdef CONFIG_SMP
855 return p->on_cpu; 854 return p->on_cpu;
856 #else 855 #else
857 return task_current(rq, p); 856 return task_current(rq, p);
858 #endif 857 #endif
859 } 858 }
860 859
861 #ifndef __ARCH_WANT_UNLOCKED_CTXSW 860 #ifndef __ARCH_WANT_UNLOCKED_CTXSW
862 static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) 861 static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
863 { 862 {
864 #ifdef CONFIG_SMP 863 #ifdef CONFIG_SMP
865 /* 864 /*
866 * We can optimise this out completely for !SMP, because the 865 * We can optimise this out completely for !SMP, because the
867 * SMP rebalancing from interrupt is the only thing that cares 866 * SMP rebalancing from interrupt is the only thing that cares
868 * here. 867 * here.
869 */ 868 */
870 next->on_cpu = 1; 869 next->on_cpu = 1;
871 #endif 870 #endif
872 } 871 }
873 872
874 static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) 873 static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
875 { 874 {
876 #ifdef CONFIG_SMP 875 #ifdef CONFIG_SMP
877 /* 876 /*
878 * After ->on_cpu is cleared, the task can be moved to a different CPU. 877 * After ->on_cpu is cleared, the task can be moved to a different CPU.
879 * We must ensure this doesn't happen until the switch is completely 878 * We must ensure this doesn't happen until the switch is completely
880 * finished. 879 * finished.
881 */ 880 */
882 smp_wmb(); 881 smp_wmb();
883 prev->on_cpu = 0; 882 prev->on_cpu = 0;
884 #endif 883 #endif
885 #ifdef CONFIG_DEBUG_SPINLOCK 884 #ifdef CONFIG_DEBUG_SPINLOCK
886 /* this is a valid case when another task releases the spinlock */ 885 /* this is a valid case when another task releases the spinlock */
887 rq->lock.owner = current; 886 rq->lock.owner = current;
888 #endif 887 #endif
889 /* 888 /*
890 * If we are tracking spinlock dependencies then we have to 889 * If we are tracking spinlock dependencies then we have to
891 * fix up the runqueue lock - which gets 'carried over' from 890 * fix up the runqueue lock - which gets 'carried over' from
892 * prev into current: 891 * prev into current:
893 */ 892 */
894 spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_); 893 spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_);
895 894
896 raw_spin_unlock_irq(&rq->lock); 895 raw_spin_unlock_irq(&rq->lock);
897 } 896 }
898 897
899 #else /* __ARCH_WANT_UNLOCKED_CTXSW */ 898 #else /* __ARCH_WANT_UNLOCKED_CTXSW */
900 static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) 899 static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
901 { 900 {
902 #ifdef CONFIG_SMP 901 #ifdef CONFIG_SMP
903 /* 902 /*
904 * We can optimise this out completely for !SMP, because the 903 * We can optimise this out completely for !SMP, because the
905 * SMP rebalancing from interrupt is the only thing that cares 904 * SMP rebalancing from interrupt is the only thing that cares
906 * here. 905 * here.
907 */ 906 */
908 next->on_cpu = 1; 907 next->on_cpu = 1;
909 #endif 908 #endif
910 #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW 909 #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
911 raw_spin_unlock_irq(&rq->lock); 910 raw_spin_unlock_irq(&rq->lock);
912 #else 911 #else
913 raw_spin_unlock(&rq->lock); 912 raw_spin_unlock(&rq->lock);
914 #endif 913 #endif
915 } 914 }
916 915
917 static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) 916 static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
918 { 917 {
919 #ifdef CONFIG_SMP 918 #ifdef CONFIG_SMP
920 /* 919 /*
921 * After ->on_cpu is cleared, the task can be moved to a different CPU. 920 * After ->on_cpu is cleared, the task can be moved to a different CPU.
922 * We must ensure this doesn't happen until the switch is completely 921 * We must ensure this doesn't happen until the switch is completely
923 * finished. 922 * finished.
924 */ 923 */
925 smp_wmb(); 924 smp_wmb();
926 prev->on_cpu = 0; 925 prev->on_cpu = 0;
927 #endif 926 #endif
928 #ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW 927 #ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW
929 local_irq_enable(); 928 local_irq_enable();
930 #endif 929 #endif
931 } 930 }
932 #endif /* __ARCH_WANT_UNLOCKED_CTXSW */ 931 #endif /* __ARCH_WANT_UNLOCKED_CTXSW */
933 932
934 /* 933 /*
935 * __task_rq_lock - lock the rq @p resides on. 934 * __task_rq_lock - lock the rq @p resides on.
936 */ 935 */
937 static inline struct rq *__task_rq_lock(struct task_struct *p) 936 static inline struct rq *__task_rq_lock(struct task_struct *p)
938 __acquires(rq->lock) 937 __acquires(rq->lock)
939 { 938 {
940 struct rq *rq; 939 struct rq *rq;
941 940
942 lockdep_assert_held(&p->pi_lock); 941 lockdep_assert_held(&p->pi_lock);
943 942
944 for (;;) { 943 for (;;) {
945 rq = task_rq(p); 944 rq = task_rq(p);
946 raw_spin_lock(&rq->lock); 945 raw_spin_lock(&rq->lock);
947 if (likely(rq == task_rq(p))) 946 if (likely(rq == task_rq(p)))
948 return rq; 947 return rq;
949 raw_spin_unlock(&rq->lock); 948 raw_spin_unlock(&rq->lock);
950 } 949 }
951 } 950 }
952 951
953 /* 952 /*
954 * task_rq_lock - lock p->pi_lock and lock the rq @p resides on. 953 * task_rq_lock - lock p->pi_lock and lock the rq @p resides on.
955 */ 954 */
956 static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags) 955 static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
957 __acquires(p->pi_lock) 956 __acquires(p->pi_lock)
958 __acquires(rq->lock) 957 __acquires(rq->lock)
959 { 958 {
960 struct rq *rq; 959 struct rq *rq;
961 960
962 for (;;) { 961 for (;;) {
963 raw_spin_lock_irqsave(&p->pi_lock, *flags); 962 raw_spin_lock_irqsave(&p->pi_lock, *flags);
964 rq = task_rq(p); 963 rq = task_rq(p);
965 raw_spin_lock(&rq->lock); 964 raw_spin_lock(&rq->lock);
966 if (likely(rq == task_rq(p))) 965 if (likely(rq == task_rq(p)))
967 return rq; 966 return rq;
968 raw_spin_unlock(&rq->lock); 967 raw_spin_unlock(&rq->lock);
969 raw_spin_unlock_irqrestore(&p->pi_lock, *flags); 968 raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
970 } 969 }
971 } 970 }
972 971
973 static void __task_rq_unlock(struct rq *rq) 972 static void __task_rq_unlock(struct rq *rq)
974 __releases(rq->lock) 973 __releases(rq->lock)
975 { 974 {
976 raw_spin_unlock(&rq->lock); 975 raw_spin_unlock(&rq->lock);
977 } 976 }
978 977
979 static inline void 978 static inline void
980 task_rq_unlock(struct rq *rq, struct task_struct *p, unsigned long *flags) 979 task_rq_unlock(struct rq *rq, struct task_struct *p, unsigned long *flags)
981 __releases(rq->lock) 980 __releases(rq->lock)
982 __releases(p->pi_lock) 981 __releases(p->pi_lock)
983 { 982 {
984 raw_spin_unlock(&rq->lock); 983 raw_spin_unlock(&rq->lock);
985 raw_spin_unlock_irqrestore(&p->pi_lock, *flags); 984 raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
986 } 985 }
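Taken together, a caller pairs these lock/unlock helpers as in the hypothetical snippet below (poke_task is an invented name, illustrative only): between the two calls, p->pi_lock and the task's rq->lock are both held with interrupts disabled, so the task cannot change runqueues underneath the caller.

    /* Hypothetical usage sketch, not part of this diff. */
    static void poke_task(struct task_struct *p)
    {
    	unsigned long flags;
    	struct rq *rq;

    	rq = task_rq_lock(p, &flags);
    	update_rq_clock(rq);		/* safe: rq->lock is held */
    	/* ... inspect or requeue p here ... */
    	task_rq_unlock(rq, p, &flags);
    }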
987 986
988 /* 987 /*
989 * this_rq_lock - lock this runqueue and disable interrupts. 988 * this_rq_lock - lock this runqueue and disable interrupts.
990 */ 989 */
991 static struct rq *this_rq_lock(void) 990 static struct rq *this_rq_lock(void)
992 __acquires(rq->lock) 991 __acquires(rq->lock)
993 { 992 {
994 struct rq *rq; 993 struct rq *rq;
995 994
996 local_irq_disable(); 995 local_irq_disable();
997 rq = this_rq(); 996 rq = this_rq();
998 raw_spin_lock(&rq->lock); 997 raw_spin_lock(&rq->lock);
999 998
1000 return rq; 999 return rq;
1001 } 1000 }
1002 1001
1003 #ifdef CONFIG_SCHED_HRTICK 1002 #ifdef CONFIG_SCHED_HRTICK
1004 /* 1003 /*
1005 * Use HR-timers to deliver accurate preemption points. 1004 * Use HR-timers to deliver accurate preemption points.
1006 * 1005 *
1007 * It's all a bit involved since we cannot program an hrtimer while holding 1006 * It's all a bit involved since we cannot program an hrtimer while holding
1008 * the rq->lock. So what we do is store a state in rq->hrtick_* and ask for a 1007 * the rq->lock. So what we do is store a state in rq->hrtick_* and ask for a
1009 * reschedule event. 1008 * reschedule event.
1010 * 1009 *
1011 * When we get rescheduled we reprogram the hrtick_timer outside of the 1010 * When we get rescheduled we reprogram the hrtick_timer outside of the
1012 * rq->lock. 1011 * rq->lock.
1013 */ 1012 */
1014 1013
1015 /* 1014 /*
1016 * Use hrtick when: 1015 * Use hrtick when:
1017 * - enabled by features 1016 * - enabled by features
1018 * - hrtimer is actually high res 1017 * - hrtimer is actually high res
1019 */ 1018 */
1020 static inline int hrtick_enabled(struct rq *rq) 1019 static inline int hrtick_enabled(struct rq *rq)
1021 { 1020 {
1022 if (!sched_feat(HRTICK)) 1021 if (!sched_feat(HRTICK))
1023 return 0; 1022 return 0;
1024 if (!cpu_active(cpu_of(rq))) 1023 if (!cpu_active(cpu_of(rq)))
1025 return 0; 1024 return 0;
1026 return hrtimer_is_hres_active(&rq->hrtick_timer); 1025 return hrtimer_is_hres_active(&rq->hrtick_timer);
1027 } 1026 }
1028 1027
1029 static void hrtick_clear(struct rq *rq) 1028 static void hrtick_clear(struct rq *rq)
1030 { 1029 {
1031 if (hrtimer_active(&rq->hrtick_timer)) 1030 if (hrtimer_active(&rq->hrtick_timer))
1032 hrtimer_cancel(&rq->hrtick_timer); 1031 hrtimer_cancel(&rq->hrtick_timer);
1033 } 1032 }
1034 1033
1035 /* 1034 /*
1036 * High-resolution timer tick. 1035 * High-resolution timer tick.
1037 * Runs from hardirq context with interrupts disabled. 1036 * Runs from hardirq context with interrupts disabled.
1038 */ 1037 */
1039 static enum hrtimer_restart hrtick(struct hrtimer *timer) 1038 static enum hrtimer_restart hrtick(struct hrtimer *timer)
1040 { 1039 {
1041 struct rq *rq = container_of(timer, struct rq, hrtick_timer); 1040 struct rq *rq = container_of(timer, struct rq, hrtick_timer);
1042 1041
1043 WARN_ON_ONCE(cpu_of(rq) != smp_processor_id()); 1042 WARN_ON_ONCE(cpu_of(rq) != smp_processor_id());
1044 1043
1045 raw_spin_lock(&rq->lock); 1044 raw_spin_lock(&rq->lock);
1046 update_rq_clock(rq); 1045 update_rq_clock(rq);
1047 rq->curr->sched_class->task_tick(rq, rq->curr, 1); 1046 rq->curr->sched_class->task_tick(rq, rq->curr, 1);
1048 raw_spin_unlock(&rq->lock); 1047 raw_spin_unlock(&rq->lock);
1049 1048
1050 return HRTIMER_NORESTART; 1049 return HRTIMER_NORESTART;
1051 } 1050 }
1052 1051
1053 #ifdef CONFIG_SMP 1052 #ifdef CONFIG_SMP
1054 /* 1053 /*
1055 * called from hardirq (IPI) context 1054 * called from hardirq (IPI) context
1056 */ 1055 */
1057 static void __hrtick_start(void *arg) 1056 static void __hrtick_start(void *arg)
1058 { 1057 {
1059 struct rq *rq = arg; 1058 struct rq *rq = arg;
1060 1059
1061 raw_spin_lock(&rq->lock); 1060 raw_spin_lock(&rq->lock);
1062 hrtimer_restart(&rq->hrtick_timer); 1061 hrtimer_restart(&rq->hrtick_timer);
1063 rq->hrtick_csd_pending = 0; 1062 rq->hrtick_csd_pending = 0;
1064 raw_spin_unlock(&rq->lock); 1063 raw_spin_unlock(&rq->lock);
1065 } 1064 }
1066 1065
1067 /* 1066 /*
1068 * Called to set the hrtick timer state. 1067 * Called to set the hrtick timer state.
1069 * 1068 *
1070 * called with rq->lock held and irqs disabled 1069 * called with rq->lock held and irqs disabled
1071 */ 1070 */
1072 static void hrtick_start(struct rq *rq, u64 delay) 1071 static void hrtick_start(struct rq *rq, u64 delay)
1073 { 1072 {
1074 struct hrtimer *timer = &rq->hrtick_timer; 1073 struct hrtimer *timer = &rq->hrtick_timer;
1075 ktime_t time = ktime_add_ns(timer->base->get_time(), delay); 1074 ktime_t time = ktime_add_ns(timer->base->get_time(), delay);
1076 1075
1077 hrtimer_set_expires(timer, time); 1076 hrtimer_set_expires(timer, time);
1078 1077
1079 if (rq == this_rq()) { 1078 if (rq == this_rq()) {
1080 hrtimer_restart(timer); 1079 hrtimer_restart(timer);
1081 } else if (!rq->hrtick_csd_pending) { 1080 } else if (!rq->hrtick_csd_pending) {
1082 __smp_call_function_single(cpu_of(rq), &rq->hrtick_csd, 0); 1081 __smp_call_function_single(cpu_of(rq), &rq->hrtick_csd, 0);
1083 rq->hrtick_csd_pending = 1; 1082 rq->hrtick_csd_pending = 1;
1084 } 1083 }
1085 } 1084 }
1086 1085
1087 static int 1086 static int
1088 hotplug_hrtick(struct notifier_block *nfb, unsigned long action, void *hcpu) 1087 hotplug_hrtick(struct notifier_block *nfb, unsigned long action, void *hcpu)
1089 { 1088 {
1090 int cpu = (int)(long)hcpu; 1089 int cpu = (int)(long)hcpu;
1091 1090
1092 switch (action) { 1091 switch (action) {
1093 case CPU_UP_CANCELED: 1092 case CPU_UP_CANCELED:
1094 case CPU_UP_CANCELED_FROZEN: 1093 case CPU_UP_CANCELED_FROZEN:
1095 case CPU_DOWN_PREPARE: 1094 case CPU_DOWN_PREPARE:
1096 case CPU_DOWN_PREPARE_FROZEN: 1095 case CPU_DOWN_PREPARE_FROZEN:
1097 case CPU_DEAD: 1096 case CPU_DEAD:
1098 case CPU_DEAD_FROZEN: 1097 case CPU_DEAD_FROZEN:
1099 hrtick_clear(cpu_rq(cpu)); 1098 hrtick_clear(cpu_rq(cpu));
1100 return NOTIFY_OK; 1099 return NOTIFY_OK;
1101 } 1100 }
1102 1101
1103 return NOTIFY_DONE; 1102 return NOTIFY_DONE;
1104 } 1103 }
1105 1104
1106 static __init void init_hrtick(void) 1105 static __init void init_hrtick(void)
1107 { 1106 {
1108 hotcpu_notifier(hotplug_hrtick, 0); 1107 hotcpu_notifier(hotplug_hrtick, 0);
1109 } 1108 }
1110 #else 1109 #else
1111 /* 1110 /*
1112 * Called to set the hrtick timer state. 1111 * Called to set the hrtick timer state.
1113 * 1112 *
1114 * called with rq->lock held and irqs disabled 1113 * called with rq->lock held and irqs disabled
1115 */ 1114 */
1116 static void hrtick_start(struct rq *rq, u64 delay) 1115 static void hrtick_start(struct rq *rq, u64 delay)
1117 { 1116 {
1118 __hrtimer_start_range_ns(&rq->hrtick_timer, ns_to_ktime(delay), 0, 1117 __hrtimer_start_range_ns(&rq->hrtick_timer, ns_to_ktime(delay), 0,
1119 HRTIMER_MODE_REL_PINNED, 0); 1118 HRTIMER_MODE_REL_PINNED, 0);
1120 } 1119 }
1121 1120
1122 static inline void init_hrtick(void) 1121 static inline void init_hrtick(void)
1123 { 1122 {
1124 } 1123 }
1125 #endif /* CONFIG_SMP */ 1124 #endif /* CONFIG_SMP */
1126 1125
1127 static void init_rq_hrtick(struct rq *rq) 1126 static void init_rq_hrtick(struct rq *rq)
1128 { 1127 {
1129 #ifdef CONFIG_SMP 1128 #ifdef CONFIG_SMP
1130 rq->hrtick_csd_pending = 0; 1129 rq->hrtick_csd_pending = 0;
1131 1130
1132 rq->hrtick_csd.flags = 0; 1131 rq->hrtick_csd.flags = 0;
1133 rq->hrtick_csd.func = __hrtick_start; 1132 rq->hrtick_csd.func = __hrtick_start;
1134 rq->hrtick_csd.info = rq; 1133 rq->hrtick_csd.info = rq;
1135 #endif 1134 #endif
1136 1135
1137 hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); 1136 hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
1138 rq->hrtick_timer.function = hrtick; 1137 rq->hrtick_timer.function = hrtick;
1139 } 1138 }
1140 #else /* CONFIG_SCHED_HRTICK */ 1139 #else /* CONFIG_SCHED_HRTICK */
1141 static inline void hrtick_clear(struct rq *rq) 1140 static inline void hrtick_clear(struct rq *rq)
1142 { 1141 {
1143 } 1142 }
1144 1143
1145 static inline void init_rq_hrtick(struct rq *rq) 1144 static inline void init_rq_hrtick(struct rq *rq)
1146 { 1145 {
1147 } 1146 }
1148 1147
1149 static inline void init_hrtick(void) 1148 static inline void init_hrtick(void)
1150 { 1149 {
1151 } 1150 }
1152 #endif /* CONFIG_SCHED_HRTICK */ 1151 #endif /* CONFIG_SCHED_HRTICK */
1153 1152
1154 /* 1153 /*
1155 * resched_task - mark a task 'to be rescheduled now'. 1154 * resched_task - mark a task 'to be rescheduled now'.
1156 * 1155 *
1157 * On UP this means the setting of the need_resched flag, on SMP it 1156 * On UP this means the setting of the need_resched flag, on SMP it
1158 * might also involve a cross-CPU call to trigger the scheduler on 1157 * might also involve a cross-CPU call to trigger the scheduler on
1159 * the target CPU. 1158 * the target CPU.
1160 */ 1159 */
1161 #ifdef CONFIG_SMP 1160 #ifdef CONFIG_SMP
1162 1161
1163 #ifndef tsk_is_polling 1162 #ifndef tsk_is_polling
1164 #define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG) 1163 #define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG)
1165 #endif 1164 #endif
1166 1165
1167 static void resched_task(struct task_struct *p) 1166 static void resched_task(struct task_struct *p)
1168 { 1167 {
1169 int cpu; 1168 int cpu;
1170 1169
1171 assert_raw_spin_locked(&task_rq(p)->lock); 1170 assert_raw_spin_locked(&task_rq(p)->lock);
1172 1171
1173 if (test_tsk_need_resched(p)) 1172 if (test_tsk_need_resched(p))
1174 return; 1173 return;
1175 1174
1176 set_tsk_need_resched(p); 1175 set_tsk_need_resched(p);
1177 1176
1178 cpu = task_cpu(p); 1177 cpu = task_cpu(p);
1179 if (cpu == smp_processor_id()) 1178 if (cpu == smp_processor_id())
1180 return; 1179 return;
1181 1180
1182 /* NEED_RESCHED must be visible before we test polling */ 1181 /* NEED_RESCHED must be visible before we test polling */
1183 smp_mb(); 1182 smp_mb();
1184 if (!tsk_is_polling(p)) 1183 if (!tsk_is_polling(p))
1185 smp_send_reschedule(cpu); 1184 smp_send_reschedule(cpu);
1186 } 1185 }
1187 1186
1188 static void resched_cpu(int cpu) 1187 static void resched_cpu(int cpu)
1189 { 1188 {
1190 struct rq *rq = cpu_rq(cpu); 1189 struct rq *rq = cpu_rq(cpu);
1191 unsigned long flags; 1190 unsigned long flags;
1192 1191
1193 if (!raw_spin_trylock_irqsave(&rq->lock, flags)) 1192 if (!raw_spin_trylock_irqsave(&rq->lock, flags))
1194 return; 1193 return;
1195 resched_task(cpu_curr(cpu)); 1194 resched_task(cpu_curr(cpu));
1196 raw_spin_unlock_irqrestore(&rq->lock, flags); 1195 raw_spin_unlock_irqrestore(&rq->lock, flags);
1197 } 1196 }
1198 1197
1199 #ifdef CONFIG_NO_HZ 1198 #ifdef CONFIG_NO_HZ
1200 /* 1199 /*
1201 * In the semi idle case, use the nearest busy cpu for migrating timers 1200 * In the semi idle case, use the nearest busy cpu for migrating timers
1202 * from an idle cpu. This is good for power-savings. 1201 * from an idle cpu. This is good for power-savings.
1203 * 1202 *
1204 * We don't do a similar optimization for a completely idle system, as 1203 * We don't do a similar optimization for a completely idle system, as
1205 * selecting an idle cpu will add more delays to the timers than intended 1204 * selecting an idle cpu will add more delays to the timers than intended
1206 * (as that cpu's timer base may not be up to date wrt jiffies etc). 1205 * (as that cpu's timer base may not be up to date wrt jiffies etc).
1207 */ 1206 */
1208 int get_nohz_timer_target(void) 1207 int get_nohz_timer_target(void)
1209 { 1208 {
1210 int cpu = smp_processor_id(); 1209 int cpu = smp_processor_id();
1211 int i; 1210 int i;
1212 struct sched_domain *sd; 1211 struct sched_domain *sd;
1213 1212
1214 rcu_read_lock(); 1213 rcu_read_lock();
1215 for_each_domain(cpu, sd) { 1214 for_each_domain(cpu, sd) {
1216 for_each_cpu(i, sched_domain_span(sd)) { 1215 for_each_cpu(i, sched_domain_span(sd)) {
1217 if (!idle_cpu(i)) { 1216 if (!idle_cpu(i)) {
1218 cpu = i; 1217 cpu = i;
1219 goto unlock; 1218 goto unlock;
1220 } 1219 }
1221 } 1220 }
1222 } 1221 }
1223 unlock: 1222 unlock:
1224 rcu_read_unlock(); 1223 rcu_read_unlock();
1225 return cpu; 1224 return cpu;
1226 } 1225 }
1227 /* 1226 /*
1228 * When add_timer_on() enqueues a timer into the timer wheel of an 1227 * When add_timer_on() enqueues a timer into the timer wheel of an
1229 * idle CPU then this timer might expire before the next timer event 1228 * idle CPU then this timer might expire before the next timer event
1230 * which is scheduled to wake up that CPU. In case of a completely 1229 * which is scheduled to wake up that CPU. In case of a completely
1231 * idle system the next event might even be infinite time into the 1230 * idle system the next event might even be infinite time into the
1232 * future. wake_up_idle_cpu() ensures that the CPU is woken up and 1231 * future. wake_up_idle_cpu() ensures that the CPU is woken up and
1233 * leaves the inner idle loop so the newly added timer is taken into 1232 * leaves the inner idle loop so the newly added timer is taken into
1234 * account when the CPU goes back to idle and evaluates the timer 1233 * account when the CPU goes back to idle and evaluates the timer
1235 * wheel for the next timer event. 1234 * wheel for the next timer event.
1236 */ 1235 */
1237 void wake_up_idle_cpu(int cpu) 1236 void wake_up_idle_cpu(int cpu)
1238 { 1237 {
1239 struct rq *rq = cpu_rq(cpu); 1238 struct rq *rq = cpu_rq(cpu);
1240 1239
1241 if (cpu == smp_processor_id()) 1240 if (cpu == smp_processor_id())
1242 return; 1241 return;
1243 1242
1244 /* 1243 /*
1245 * This is safe, as this function is called with the timer 1244 * This is safe, as this function is called with the timer
1246 * wheel base lock of (cpu) held. When the CPU is on the way 1245 * wheel base lock of (cpu) held. When the CPU is on the way
1247 * to idle and has not yet set rq->curr to idle then it will 1246 * to idle and has not yet set rq->curr to idle then it will
1248 * be serialized on the timer wheel base lock and take the new 1247 * be serialized on the timer wheel base lock and take the new
1249 * timer into account automatically. 1248 * timer into account automatically.
1250 */ 1249 */
1251 if (rq->curr != rq->idle) 1250 if (rq->curr != rq->idle)
1252 return; 1251 return;
1253 1252
1254 /* 1253 /*
1255 * We can set TIF_RESCHED on the idle task of the other CPU 1254 * We can set TIF_RESCHED on the idle task of the other CPU
1256 * lockless. The worst case is that the other CPU runs the 1255 * lockless. The worst case is that the other CPU runs the
1257 * idle task through an additional NOOP schedule() 1256 * idle task through an additional NOOP schedule()
1258 */ 1257 */
1259 set_tsk_need_resched(rq->idle); 1258 set_tsk_need_resched(rq->idle);
1260 1259
1261 /* NEED_RESCHED must be visible before we test polling */ 1260 /* NEED_RESCHED must be visible before we test polling */
1262 smp_mb(); 1261 smp_mb();
1263 if (!tsk_is_polling(rq->idle)) 1262 if (!tsk_is_polling(rq->idle))
1264 smp_send_reschedule(cpu); 1263 smp_send_reschedule(cpu);
1265 } 1264 }
1266 1265
1267 #endif /* CONFIG_NO_HZ */ 1266 #endif /* CONFIG_NO_HZ */
1268 1267
1269 static u64 sched_avg_period(void) 1268 static u64 sched_avg_period(void)
1270 { 1269 {
1271 return (u64)sysctl_sched_time_avg * NSEC_PER_MSEC / 2; 1270 return (u64)sysctl_sched_time_avg * NSEC_PER_MSEC / 2;
1272 } 1271 }
1273 1272
1274 static void sched_avg_update(struct rq *rq) 1273 static void sched_avg_update(struct rq *rq)
1275 { 1274 {
1276 s64 period = sched_avg_period(); 1275 s64 period = sched_avg_period();
1277 1276
1278 while ((s64)(rq->clock - rq->age_stamp) > period) { 1277 while ((s64)(rq->clock - rq->age_stamp) > period) {
1279 /* 1278 /*
1280 * Inline assembly required to prevent the compiler 1279 * Inline assembly required to prevent the compiler
1281 * optimising this loop into a divmod call. 1280 * optimising this loop into a divmod call.
1282 * See __iter_div_u64_rem() for another example of this. 1281 * See __iter_div_u64_rem() for another example of this.
1283 */ 1282 */
1284 asm("" : "+rm" (rq->age_stamp)); 1283 asm("" : "+rm" (rq->age_stamp));
1285 rq->age_stamp += period; 1284 rq->age_stamp += period;
1286 rq->rt_avg /= 2; 1285 rq->rt_avg /= 2;
1287 } 1286 }
1288 } 1287 }
1289 1288
1290 static void sched_rt_avg_update(struct rq *rq, u64 rt_delta) 1289 static void sched_rt_avg_update(struct rq *rq, u64 rt_delta)
1291 { 1290 {
1292 rq->rt_avg += rt_delta; 1291 rq->rt_avg += rt_delta;
1293 sched_avg_update(rq); 1292 sched_avg_update(rq);
1294 } 1293 }
1295 1294
1296 #else /* !CONFIG_SMP */ 1295 #else /* !CONFIG_SMP */
1297 static void resched_task(struct task_struct *p) 1296 static void resched_task(struct task_struct *p)
1298 { 1297 {
1299 assert_raw_spin_locked(&task_rq(p)->lock); 1298 assert_raw_spin_locked(&task_rq(p)->lock);
1300 set_tsk_need_resched(p); 1299 set_tsk_need_resched(p);
1301 } 1300 }
1302 1301
1303 static void sched_rt_avg_update(struct rq *rq, u64 rt_delta) 1302 static void sched_rt_avg_update(struct rq *rq, u64 rt_delta)
1304 { 1303 {
1305 } 1304 }
1306 1305
1307 static void sched_avg_update(struct rq *rq) 1306 static void sched_avg_update(struct rq *rq)
1308 { 1307 {
1309 } 1308 }
1310 #endif /* CONFIG_SMP */ 1309 #endif /* CONFIG_SMP */
1311 1310
1312 #if BITS_PER_LONG == 32 1311 #if BITS_PER_LONG == 32
1313 # define WMULT_CONST (~0UL) 1312 # define WMULT_CONST (~0UL)
1314 #else 1313 #else
1315 # define WMULT_CONST (1UL << 32) 1314 # define WMULT_CONST (1UL << 32)
1316 #endif 1315 #endif
1317 1316
1318 #define WMULT_SHIFT 32 1317 #define WMULT_SHIFT 32
1319 1318
1320 /* 1319 /*
1321 * Shift right and round: 1320 * Shift right and round:
1322 */ 1321 */
1323 #define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y)) 1322 #define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y))
1324 1323
1325 /* 1324 /*
1326 * delta *= weight / lw 1325 * delta *= weight / lw
1327 */ 1326 */
1328 static unsigned long 1327 static unsigned long
1329 calc_delta_mine(unsigned long delta_exec, unsigned long weight, 1328 calc_delta_mine(unsigned long delta_exec, unsigned long weight,
1330 struct load_weight *lw) 1329 struct load_weight *lw)
1331 { 1330 {
1332 u64 tmp; 1331 u64 tmp;
1333 1332
1334 /* 1333 /*
1335 * weight can be less than 2^SCHED_LOAD_RESOLUTION for task group sched 1334 * weight can be less than 2^SCHED_LOAD_RESOLUTION for task group sched
1336 * entities since MIN_SHARES = 2. Treat weight as 1 if less than 1335 * entities since MIN_SHARES = 2. Treat weight as 1 if less than
1337 * 2^SCHED_LOAD_RESOLUTION. 1336 * 2^SCHED_LOAD_RESOLUTION.
1338 */ 1337 */
1339 if (likely(weight > (1UL << SCHED_LOAD_RESOLUTION))) 1338 if (likely(weight > (1UL << SCHED_LOAD_RESOLUTION)))
1340 tmp = (u64)delta_exec * scale_load_down(weight); 1339 tmp = (u64)delta_exec * scale_load_down(weight);
1341 else 1340 else
1342 tmp = (u64)delta_exec; 1341 tmp = (u64)delta_exec;
1343 1342
1344 if (!lw->inv_weight) { 1343 if (!lw->inv_weight) {
1345 unsigned long w = scale_load_down(lw->weight); 1344 unsigned long w = scale_load_down(lw->weight);
1346 1345
1347 if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST)) 1346 if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST))
1348 lw->inv_weight = 1; 1347 lw->inv_weight = 1;
1349 else if (unlikely(!w)) 1348 else if (unlikely(!w))
1350 lw->inv_weight = WMULT_CONST; 1349 lw->inv_weight = WMULT_CONST;
1351 else 1350 else
1352 lw->inv_weight = WMULT_CONST / w; 1351 lw->inv_weight = WMULT_CONST / w;
1353 } 1352 }
1354 1353
1355 /* 1354 /*
1356 * Check whether we'd overflow the 64-bit multiplication: 1355 * Check whether we'd overflow the 64-bit multiplication:
1357 */ 1356 */
1358 if (unlikely(tmp > WMULT_CONST)) 1357 if (unlikely(tmp > WMULT_CONST))
1359 tmp = SRR(SRR(tmp, WMULT_SHIFT/2) * lw->inv_weight, 1358 tmp = SRR(SRR(tmp, WMULT_SHIFT/2) * lw->inv_weight,
1360 WMULT_SHIFT/2); 1359 WMULT_SHIFT/2);
1361 else 1360 else
1362 tmp = SRR(tmp * lw->inv_weight, WMULT_SHIFT); 1361 tmp = SRR(tmp * lw->inv_weight, WMULT_SHIFT);
1363 1362
1364 return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX); 1363 return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX);
1365 } 1364 }
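
For reference, a standalone user-space sketch (not kernel code) of the fixed-point trick used by calc_delta_mine() above: delta_exec * weight / lw->weight is evaluated by multiplying with the precomputed inverse 2^32 / lw->weight and rounding with SRR(). The concrete numbers (4 ms of runtime, a runqueue holding three nice-0 tasks) are invented for illustration, and the SCHED_LOAD_RESOLUTION scaling and the overflow branch are ignored.

#include <stdio.h>
#include <stdint.h>

/*
 * Sketch of the calc_delta_mine() arithmetic: delta_exec * weight / lw_weight
 * computed as a multiplication by the precomputed inverse 2^32 / lw_weight,
 * rounded as SRR() does.
 */
int main(void)
{
        uint64_t delta_exec = 4000000;  /* 4 ms of runtime, in ns (example value) */
        uint64_t weight = 1024;         /* one nice-0 task */
        uint64_t lw_weight = 3072;      /* runqueue load: three nice-0 tasks */

        uint64_t inv = (1ULL << 32) / lw_weight;                /* lw->inv_weight */
        uint64_t tmp = delta_exec * weight;
        uint64_t scaled = (tmp * inv + (1ULL << 31)) >> 32;     /* SRR(tmp * inv, 32) */

        printf("exact : %llu\n", (unsigned long long)(tmp / lw_weight));
        printf("scaled: %llu\n", (unsigned long long)scaled);
        /* both print 1333333, i.e. roughly 1.33 ms */
        return 0;
}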
1366 1365
1367 static inline void update_load_add(struct load_weight *lw, unsigned long inc) 1366 static inline void update_load_add(struct load_weight *lw, unsigned long inc)
1368 { 1367 {
1369 lw->weight += inc; 1368 lw->weight += inc;
1370 lw->inv_weight = 0; 1369 lw->inv_weight = 0;
1371 } 1370 }
1372 1371
1373 static inline void update_load_sub(struct load_weight *lw, unsigned long dec) 1372 static inline void update_load_sub(struct load_weight *lw, unsigned long dec)
1374 { 1373 {
1375 lw->weight -= dec; 1374 lw->weight -= dec;
1376 lw->inv_weight = 0; 1375 lw->inv_weight = 0;
1377 } 1376 }
1378 1377
1379 static inline void update_load_set(struct load_weight *lw, unsigned long w) 1378 static inline void update_load_set(struct load_weight *lw, unsigned long w)
1380 { 1379 {
1381 lw->weight = w; 1380 lw->weight = w;
1382 lw->inv_weight = 0; 1381 lw->inv_weight = 0;
1383 } 1382 }
1384 1383
1385 /* 1384 /*
1386 * To aid in avoiding the subversion of "niceness" due to uneven distribution 1385 * To aid in avoiding the subversion of "niceness" due to uneven distribution
1387 * of tasks with abnormal "nice" values across CPUs, the contribution that 1386 * of tasks with abnormal "nice" values across CPUs, the contribution that
1388 * each task makes to its run queue's load is weighted according to its 1387 * each task makes to its run queue's load is weighted according to its
1389 * scheduling class and "nice" value. For SCHED_NORMAL tasks this is just a 1388 * scheduling class and "nice" value. For SCHED_NORMAL tasks this is just a
1390 * scaled version of the new time slice allocation that they receive on time 1389 * scaled version of the new time slice allocation that they receive on time
1391 * slice expiry etc. 1390 * slice expiry etc.
1392 */ 1391 */
1393 1392
1394 #define WEIGHT_IDLEPRIO 3 1393 #define WEIGHT_IDLEPRIO 3
1395 #define WMULT_IDLEPRIO 1431655765 1394 #define WMULT_IDLEPRIO 1431655765
1396 1395
1397 /* 1396 /*
1398 * Nice levels are multiplicative, with a gentle 10% change for every 1397 * Nice levels are multiplicative, with a gentle 10% change for every
1399 * nice level changed. I.e. when a CPU-bound task goes from nice 0 to 1398 * nice level changed. I.e. when a CPU-bound task goes from nice 0 to
1400 * nice 1, it will get ~10% less CPU time than another CPU-bound task 1399 * nice 1, it will get ~10% less CPU time than another CPU-bound task
1401 * that remained on nice 0. 1400 * that remained on nice 0.
1402 * 1401 *
1403 * The "10% effect" is relative and cumulative: from _any_ nice level, 1402 * The "10% effect" is relative and cumulative: from _any_ nice level,
1404 * if you go up 1 level, it's -10% CPU usage, if you go down 1 level 1403 * if you go up 1 level, it's -10% CPU usage, if you go down 1 level
1405 * it's +10% CPU usage. (to achieve that we use a multiplier of 1.25. 1404 * it's +10% CPU usage. (to achieve that we use a multiplier of 1.25.
1406 * If a task goes up by ~10% and another task goes down by ~10% then 1405 * If a task goes up by ~10% and another task goes down by ~10% then
1407 * the relative distance between them is ~25%.) 1406 * the relative distance between them is ~25%.)
1408 */ 1407 */
1409 static const int prio_to_weight[40] = { 1408 static const int prio_to_weight[40] = {
1410 /* -20 */ 88761, 71755, 56483, 46273, 36291, 1409 /* -20 */ 88761, 71755, 56483, 46273, 36291,
1411 /* -15 */ 29154, 23254, 18705, 14949, 11916, 1410 /* -15 */ 29154, 23254, 18705, 14949, 11916,
1412 /* -10 */ 9548, 7620, 6100, 4904, 3906, 1411 /* -10 */ 9548, 7620, 6100, 4904, 3906,
1413 /* -5 */ 3121, 2501, 1991, 1586, 1277, 1412 /* -5 */ 3121, 2501, 1991, 1586, 1277,
1414 /* 0 */ 1024, 820, 655, 526, 423, 1413 /* 0 */ 1024, 820, 655, 526, 423,
1415 /* 5 */ 335, 272, 215, 172, 137, 1414 /* 5 */ 335, 272, 215, 172, 137,
1416 /* 10 */ 110, 87, 70, 56, 45, 1415 /* 10 */ 110, 87, 70, 56, 45,
1417 /* 15 */ 36, 29, 23, 18, 15, 1416 /* 15 */ 36, 29, 23, 18, 15,
1418 }; 1417 };
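
To make the "10% effect" described above concrete, here is a standalone sketch (not kernel code) that computes the CPU shares of two CPU-bound tasks at nice 0 and nice 1 from the prio_to_weight[] entries 1024 and 820:

#include <stdio.h>

int main(void)
{
        double w_nice0 = 1024.0;        /* prio_to_weight[] entry for nice 0 */
        double w_nice1 = 820.0;         /* prio_to_weight[] entry for nice 1 */
        double total = w_nice0 + w_nice1;

        printf("nice 0 share: %.1f%%\n", 100.0 * w_nice0 / total);     /* ~55.5% */
        printf("nice 1 share: %.1f%%\n", 100.0 * w_nice1 / total);     /* ~44.5% */
        printf("weight ratio: %.2f\n", w_nice0 / w_nice1);             /* ~1.25 */
        return 0;
}

The ~1.25 ratio between adjacent weight entries is what produces the roughly 10-percentage-point shift in CPU time per nice level.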
1419 1418
1420 /* 1419 /*
1421 * Inverse (2^32/x) values of the prio_to_weight[] array, precalculated. 1420 * Inverse (2^32/x) values of the prio_to_weight[] array, precalculated.
1422 * 1421 *
1423 * In cases where the weight does not change often, we can use the 1422 * In cases where the weight does not change often, we can use the
1424 * precalculated inverse to speed up arithmetics by turning divisions 1423 * precalculated inverse to speed up arithmetics by turning divisions
1425 * into multiplications: 1424 * into multiplications:
1426 */ 1425 */
1427 static const u32 prio_to_wmult[40] = { 1426 static const u32 prio_to_wmult[40] = {
1428 /* -20 */ 48388, 59856, 76040, 92818, 118348, 1427 /* -20 */ 48388, 59856, 76040, 92818, 118348,
1429 /* -15 */ 147320, 184698, 229616, 287308, 360437, 1428 /* -15 */ 147320, 184698, 229616, 287308, 360437,
1430 /* -10 */ 449829, 563644, 704093, 875809, 1099582, 1429 /* -10 */ 449829, 563644, 704093, 875809, 1099582,
1431 /* -5 */ 1376151, 1717300, 2157191, 2708050, 3363326, 1430 /* -5 */ 1376151, 1717300, 2157191, 2708050, 3363326,
1432 /* 0 */ 4194304, 5237765, 6557202, 8165337, 10153587, 1431 /* 0 */ 4194304, 5237765, 6557202, 8165337, 10153587,
1433 /* 5 */ 12820798, 15790321, 19976592, 24970740, 31350126, 1432 /* 5 */ 12820798, 15790321, 19976592, 24970740, 31350126,
1434 /* 10 */ 39045157, 49367440, 61356676, 76695844, 95443717, 1433 /* 10 */ 39045157, 49367440, 61356676, 76695844, 95443717,
1435 /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153, 1434 /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153,
1436 }; 1435 };
1437 1436
1438 /* Time spent by the tasks of the cpu accounting group executing in ... */ 1437 /* Time spent by the tasks of the cpu accounting group executing in ... */
1439 enum cpuacct_stat_index { 1438 enum cpuacct_stat_index {
1440 CPUACCT_STAT_USER, /* ... user mode */ 1439 CPUACCT_STAT_USER, /* ... user mode */
1441 CPUACCT_STAT_SYSTEM, /* ... kernel mode */ 1440 CPUACCT_STAT_SYSTEM, /* ... kernel mode */
1442 1441
1443 CPUACCT_STAT_NSTATS, 1442 CPUACCT_STAT_NSTATS,
1444 }; 1443 };
1445 1444
1446 #ifdef CONFIG_CGROUP_CPUACCT 1445 #ifdef CONFIG_CGROUP_CPUACCT
1447 static void cpuacct_charge(struct task_struct *tsk, u64 cputime); 1446 static void cpuacct_charge(struct task_struct *tsk, u64 cputime);
1448 static void cpuacct_update_stats(struct task_struct *tsk, 1447 static void cpuacct_update_stats(struct task_struct *tsk,
1449 enum cpuacct_stat_index idx, cputime_t val); 1448 enum cpuacct_stat_index idx, cputime_t val);
1450 #else 1449 #else
1451 static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {} 1450 static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {}
1452 static inline void cpuacct_update_stats(struct task_struct *tsk, 1451 static inline void cpuacct_update_stats(struct task_struct *tsk,
1453 enum cpuacct_stat_index idx, cputime_t val) {} 1452 enum cpuacct_stat_index idx, cputime_t val) {}
1454 #endif 1453 #endif
1455 1454
1456 static inline void inc_cpu_load(struct rq *rq, unsigned long load) 1455 static inline void inc_cpu_load(struct rq *rq, unsigned long load)
1457 { 1456 {
1458 update_load_add(&rq->load, load); 1457 update_load_add(&rq->load, load);
1459 } 1458 }
1460 1459
1461 static inline void dec_cpu_load(struct rq *rq, unsigned long load) 1460 static inline void dec_cpu_load(struct rq *rq, unsigned long load)
1462 { 1461 {
1463 update_load_sub(&rq->load, load); 1462 update_load_sub(&rq->load, load);
1464 } 1463 }
1465 1464
1466 #if (defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)) || defined(CONFIG_RT_GROUP_SCHED) 1465 #if (defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)) || defined(CONFIG_RT_GROUP_SCHED)
1467 typedef int (*tg_visitor)(struct task_group *, void *); 1466 typedef int (*tg_visitor)(struct task_group *, void *);
1468 1467
1469 /* 1468 /*
1470 * Iterate the full tree, calling @down when first entering a node and @up when 1469 * Iterate the full tree, calling @down when first entering a node and @up when
1471 * leaving it for the final time. 1470 * leaving it for the final time.
1472 */ 1471 */
1473 static int walk_tg_tree(tg_visitor down, tg_visitor up, void *data) 1472 static int walk_tg_tree(tg_visitor down, tg_visitor up, void *data)
1474 { 1473 {
1475 struct task_group *parent, *child; 1474 struct task_group *parent, *child;
1476 int ret; 1475 int ret;
1477 1476
1478 rcu_read_lock(); 1477 rcu_read_lock();
1479 parent = &root_task_group; 1478 parent = &root_task_group;
1480 down: 1479 down:
1481 ret = (*down)(parent, data); 1480 ret = (*down)(parent, data);
1482 if (ret) 1481 if (ret)
1483 goto out_unlock; 1482 goto out_unlock;
1484 list_for_each_entry_rcu(child, &parent->children, siblings) { 1483 list_for_each_entry_rcu(child, &parent->children, siblings) {
1485 parent = child; 1484 parent = child;
1486 goto down; 1485 goto down;
1487 1486
1488 up: 1487 up:
1489 continue; 1488 continue;
1490 } 1489 }
1491 ret = (*up)(parent, data); 1490 ret = (*up)(parent, data);
1492 if (ret) 1491 if (ret)
1493 goto out_unlock; 1492 goto out_unlock;
1494 1493
1495 child = parent; 1494 child = parent;
1496 parent = parent->parent; 1495 parent = parent->parent;
1497 if (parent) 1496 if (parent)
1498 goto up; 1497 goto up;
1499 out_unlock: 1498 out_unlock:
1500 rcu_read_unlock(); 1499 rcu_read_unlock();
1501 1500
1502 return ret; 1501 return ret;
1503 } 1502 }
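
The goto-based traversal above is an iterative pre/post-order walk over the task_group tree. A recursive equivalent is sketched below for readability only (walk_tg_tree_sketch is a hypothetical name; the kernel version deliberately avoids recursion, and the rcu_read_lock()/rcu_read_unlock() pair from the original would wrap the top-level call):

/*
 * Recursive sketch equivalent to walk_tg_tree(): call @down when entering
 * a group and @up after all of its children have been visited; a non-zero
 * return value from either visitor aborts the walk and is propagated.
 */
static int walk_tg_tree_sketch(struct task_group *tg,
                               tg_visitor down, tg_visitor up, void *data)
{
        struct task_group *child;
        int ret;

        ret = (*down)(tg, data);
        if (ret)
                return ret;

        list_for_each_entry_rcu(child, &tg->children, siblings) {
                ret = walk_tg_tree_sketch(child, down, up, data);
                if (ret)
                        return ret;
        }

        return (*up)(tg, data);
}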
1504 1503
1505 static int tg_nop(struct task_group *tg, void *data) 1504 static int tg_nop(struct task_group *tg, void *data)
1506 { 1505 {
1507 return 0; 1506 return 0;
1508 } 1507 }
1509 #endif 1508 #endif
1510 1509
1511 #ifdef CONFIG_SMP 1510 #ifdef CONFIG_SMP
1512 /* Used instead of source_load when we know the type == 0 */ 1511 /* Used instead of source_load when we know the type == 0 */
1513 static unsigned long weighted_cpuload(const int cpu) 1512 static unsigned long weighted_cpuload(const int cpu)
1514 { 1513 {
1515 return cpu_rq(cpu)->load.weight; 1514 return cpu_rq(cpu)->load.weight;
1516 } 1515 }
1517 1516
1518 /* 1517 /*
1519 * Return a low guess at the load of a migration-source cpu weighted 1518 * Return a low guess at the load of a migration-source cpu weighted
1520 * according to the scheduling class and "nice" value. 1519 * according to the scheduling class and "nice" value.
1521 * 1520 *
1522 * We want to under-estimate the load of migration sources, to 1521 * We want to under-estimate the load of migration sources, to
1523 * balance conservatively. 1522 * balance conservatively.
1524 */ 1523 */
1525 static unsigned long source_load(int cpu, int type) 1524 static unsigned long source_load(int cpu, int type)
1526 { 1525 {
1527 struct rq *rq = cpu_rq(cpu); 1526 struct rq *rq = cpu_rq(cpu);
1528 unsigned long total = weighted_cpuload(cpu); 1527 unsigned long total = weighted_cpuload(cpu);
1529 1528
1530 if (type == 0 || !sched_feat(LB_BIAS)) 1529 if (type == 0 || !sched_feat(LB_BIAS))
1531 return total; 1530 return total;
1532 1531
1533 return min(rq->cpu_load[type-1], total); 1532 return min(rq->cpu_load[type-1], total);
1534 } 1533 }
1535 1534
1536 /* 1535 /*
1537 * Return a high guess at the load of a migration-target cpu weighted 1536 * Return a high guess at the load of a migration-target cpu weighted
1538 * according to the scheduling class and "nice" value. 1537 * according to the scheduling class and "nice" value.
1539 */ 1538 */
1540 static unsigned long target_load(int cpu, int type) 1539 static unsigned long target_load(int cpu, int type)
1541 { 1540 {
1542 struct rq *rq = cpu_rq(cpu); 1541 struct rq *rq = cpu_rq(cpu);
1543 unsigned long total = weighted_cpuload(cpu); 1542 unsigned long total = weighted_cpuload(cpu);
1544 1543
1545 if (type == 0 || !sched_feat(LB_BIAS)) 1544 if (type == 0 || !sched_feat(LB_BIAS))
1546 return total; 1545 return total;
1547 1546
1548 return max(rq->cpu_load[type-1], total); 1547 return max(rq->cpu_load[type-1], total);
1549 } 1548 }
1550 1549
1551 static unsigned long power_of(int cpu) 1550 static unsigned long power_of(int cpu)
1552 { 1551 {
1553 return cpu_rq(cpu)->cpu_power; 1552 return cpu_rq(cpu)->cpu_power;
1554 } 1553 }
1555 1554
1556 static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd); 1555 static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
1557 1556
1558 static unsigned long cpu_avg_load_per_task(int cpu) 1557 static unsigned long cpu_avg_load_per_task(int cpu)
1559 { 1558 {
1560 struct rq *rq = cpu_rq(cpu); 1559 struct rq *rq = cpu_rq(cpu);
1561 unsigned long nr_running = ACCESS_ONCE(rq->nr_running); 1560 unsigned long nr_running = ACCESS_ONCE(rq->nr_running);
1562 1561
1563 if (nr_running) 1562 if (nr_running)
1564 rq->avg_load_per_task = rq->load.weight / nr_running; 1563 rq->avg_load_per_task = rq->load.weight / nr_running;
1565 else 1564 else
1566 rq->avg_load_per_task = 0; 1565 rq->avg_load_per_task = 0;
1567 1566
1568 return rq->avg_load_per_task; 1567 return rq->avg_load_per_task;
1569 } 1568 }
1570 1569
1571 #ifdef CONFIG_FAIR_GROUP_SCHED 1570 #ifdef CONFIG_FAIR_GROUP_SCHED
1572 1571
1573 /* 1572 /*
1574 * Compute the cpu's hierarchical load factor for each task group. 1573 * Compute the cpu's hierarchical load factor for each task group.
1575 * This needs to be done in a top-down fashion because the load of a child 1574 * This needs to be done in a top-down fashion because the load of a child
1576 * group is a fraction of its parent's load. 1575 * group is a fraction of its parent's load.
1577 */ 1576 */
1578 static int tg_load_down(struct task_group *tg, void *data) 1577 static int tg_load_down(struct task_group *tg, void *data)
1579 { 1578 {
1580 unsigned long load; 1579 unsigned long load;
1581 long cpu = (long)data; 1580 long cpu = (long)data;
1582 1581
1583 if (!tg->parent) { 1582 if (!tg->parent) {
1584 load = cpu_rq(cpu)->load.weight; 1583 load = cpu_rq(cpu)->load.weight;
1585 } else { 1584 } else {
1586 load = tg->parent->cfs_rq[cpu]->h_load; 1585 load = tg->parent->cfs_rq[cpu]->h_load;
1587 load *= tg->se[cpu]->load.weight; 1586 load *= tg->se[cpu]->load.weight;
1588 load /= tg->parent->cfs_rq[cpu]->load.weight + 1; 1587 load /= tg->parent->cfs_rq[cpu]->load.weight + 1;
1589 } 1588 }
1590 1589
1591 tg->cfs_rq[cpu]->h_load = load; 1590 tg->cfs_rq[cpu]->h_load = load;
1592 1591
1593 return 0; 1592 return 0;
1594 } 1593 }
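
A small worked example of the formula above (numbers invented for illustration): if the parent's cfs_rq on this cpu has h_load = 3072 and load.weight = 3072, and the group's own se on this cpu has load.weight = 1024, then the group's h_load becomes 3072 * 1024 / (3072 + 1) ≈ 1023, i.e. roughly one third of the cpu's hierarchical load is attributed to this group; the "+ 1" merely guards against a zero parent weight.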
1595 1594
1596 static void update_h_load(long cpu) 1595 static void update_h_load(long cpu)
1597 { 1596 {
1598 walk_tg_tree(tg_load_down, tg_nop, (void *)cpu); 1597 walk_tg_tree(tg_load_down, tg_nop, (void *)cpu);
1599 } 1598 }
1600 1599
1601 #endif 1600 #endif
1602 1601
1603 #ifdef CONFIG_PREEMPT 1602 #ifdef CONFIG_PREEMPT
1604 1603
1605 static void double_rq_lock(struct rq *rq1, struct rq *rq2); 1604 static void double_rq_lock(struct rq *rq1, struct rq *rq2);
1606 1605
1607 /* 1606 /*
1608 * fair double_lock_balance: Safely acquires both rq->locks in a fair 1607 * fair double_lock_balance: Safely acquires both rq->locks in a fair
1609 * way at the expense of forcing extra atomic operations in all 1608 * way at the expense of forcing extra atomic operations in all
1610 * invocations. This assures that the double_lock is acquired using the 1609 * invocations. This assures that the double_lock is acquired using the
1611 * same underlying policy as the spinlock_t on this architecture, which 1610 * same underlying policy as the spinlock_t on this architecture, which
1612 * reduces latency compared to the unfair variant below. However, it 1611 * reduces latency compared to the unfair variant below. However, it
1613 * also adds more overhead and therefore may reduce throughput. 1612 * also adds more overhead and therefore may reduce throughput.
1614 */ 1613 */
1615 static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest) 1614 static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
1616 __releases(this_rq->lock) 1615 __releases(this_rq->lock)
1617 __acquires(busiest->lock) 1616 __acquires(busiest->lock)
1618 __acquires(this_rq->lock) 1617 __acquires(this_rq->lock)
1619 { 1618 {
1620 raw_spin_unlock(&this_rq->lock); 1619 raw_spin_unlock(&this_rq->lock);
1621 double_rq_lock(this_rq, busiest); 1620 double_rq_lock(this_rq, busiest);
1622 1621
1623 return 1; 1622 return 1;
1624 } 1623 }
1625 1624
1626 #else 1625 #else
1627 /* 1626 /*
1628 * Unfair double_lock_balance: Optimizes throughput at the expense of 1627 * Unfair double_lock_balance: Optimizes throughput at the expense of
1629 * latency by eliminating extra atomic operations when the locks are 1628 * latency by eliminating extra atomic operations when the locks are
1630 * already in proper order on entry. This favors lower cpu-ids and will 1629 * already in proper order on entry. This favors lower cpu-ids and will
1631 * grant the double lock to lower cpus over higher ids under contention, 1630 * grant the double lock to lower cpus over higher ids under contention,
1632 * regardless of entry order into the function. 1631 * regardless of entry order into the function.
1633 */ 1632 */
1634 static int _double_lock_balance(struct rq *this_rq, struct rq *busiest) 1633 static int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
1635 __releases(this_rq->lock) 1634 __releases(this_rq->lock)
1636 __acquires(busiest->lock) 1635 __acquires(busiest->lock)
1637 __acquires(this_rq->lock) 1636 __acquires(this_rq->lock)
1638 { 1637 {
1639 int ret = 0; 1638 int ret = 0;
1640 1639
1641 if (unlikely(!raw_spin_trylock(&busiest->lock))) { 1640 if (unlikely(!raw_spin_trylock(&busiest->lock))) {
1642 if (busiest < this_rq) { 1641 if (busiest < this_rq) {
1643 raw_spin_unlock(&this_rq->lock); 1642 raw_spin_unlock(&this_rq->lock);
1644 raw_spin_lock(&busiest->lock); 1643 raw_spin_lock(&busiest->lock);
1645 raw_spin_lock_nested(&this_rq->lock, 1644 raw_spin_lock_nested(&this_rq->lock,
1646 SINGLE_DEPTH_NESTING); 1645 SINGLE_DEPTH_NESTING);
1647 ret = 1; 1646 ret = 1;
1648 } else 1647 } else
1649 raw_spin_lock_nested(&busiest->lock, 1648 raw_spin_lock_nested(&busiest->lock,
1650 SINGLE_DEPTH_NESTING); 1649 SINGLE_DEPTH_NESTING);
1651 } 1650 }
1652 return ret; 1651 return ret;
1653 } 1652 }
1654 1653
1655 #endif /* CONFIG_PREEMPT */ 1654 #endif /* CONFIG_PREEMPT */
1656 1655
1657 /* 1656 /*
1658 * double_lock_balance - lock the busiest runqueue, this_rq is locked already. 1657 * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
1659 */ 1658 */
1660 static int double_lock_balance(struct rq *this_rq, struct rq *busiest) 1659 static int double_lock_balance(struct rq *this_rq, struct rq *busiest)
1661 { 1660 {
1662 if (unlikely(!irqs_disabled())) { 1661 if (unlikely(!irqs_disabled())) {
1663 /* printk() doesn't work well under rq->lock */ 1662 /* printk() doesn't work well under rq->lock */
1664 raw_spin_unlock(&this_rq->lock); 1663 raw_spin_unlock(&this_rq->lock);
1665 BUG_ON(1); 1664 BUG_ON(1);
1666 } 1665 }
1667 1666
1668 return _double_lock_balance(this_rq, busiest); 1667 return _double_lock_balance(this_rq, busiest);
1669 } 1668 }
1670 1669
1671 static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest) 1670 static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest)
1672 __releases(busiest->lock) 1671 __releases(busiest->lock)
1673 { 1672 {
1674 raw_spin_unlock(&busiest->lock); 1673 raw_spin_unlock(&busiest->lock);
1675 lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_); 1674 lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_);
1676 } 1675 }
1677 1676
1678 /* 1677 /*
1679 * double_rq_lock - safely lock two runqueues 1678 * double_rq_lock - safely lock two runqueues
1680 * 1679 *
1681 * Note this does not disable interrupts like task_rq_lock, 1680 * Note this does not disable interrupts like task_rq_lock,
1682 * you need to do so manually before calling. 1681 * you need to do so manually before calling.
1683 */ 1682 */
1684 static void double_rq_lock(struct rq *rq1, struct rq *rq2) 1683 static void double_rq_lock(struct rq *rq1, struct rq *rq2)
1685 __acquires(rq1->lock) 1684 __acquires(rq1->lock)
1686 __acquires(rq2->lock) 1685 __acquires(rq2->lock)
1687 { 1686 {
1688 BUG_ON(!irqs_disabled()); 1687 BUG_ON(!irqs_disabled());
1689 if (rq1 == rq2) { 1688 if (rq1 == rq2) {
1690 raw_spin_lock(&rq1->lock); 1689 raw_spin_lock(&rq1->lock);
1691 __acquire(rq2->lock); /* Fake it out ;) */ 1690 __acquire(rq2->lock); /* Fake it out ;) */
1692 } else { 1691 } else {
1693 if (rq1 < rq2) { 1692 if (rq1 < rq2) {
1694 raw_spin_lock(&rq1->lock); 1693 raw_spin_lock(&rq1->lock);
1695 raw_spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING); 1694 raw_spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING);
1696 } else { 1695 } else {
1697 raw_spin_lock(&rq2->lock); 1696 raw_spin_lock(&rq2->lock);
1698 raw_spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING); 1697 raw_spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING);
1699 } 1698 }
1700 } 1699 }
1701 } 1700 }
1702 1701
1703 /* 1702 /*
1704 * double_rq_unlock - safely unlock two runqueues 1703 * double_rq_unlock - safely unlock two runqueues
1705 * 1704 *
1706 * Note this does not restore interrupts like task_rq_unlock, 1705 * Note this does not restore interrupts like task_rq_unlock,
1707 * you need to do so manually after calling. 1706 * you need to do so manually after calling.
1708 */ 1707 */
1709 static void double_rq_unlock(struct rq *rq1, struct rq *rq2) 1708 static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
1710 __releases(rq1->lock) 1709 __releases(rq1->lock)
1711 __releases(rq2->lock) 1710 __releases(rq2->lock)
1712 { 1711 {
1713 raw_spin_unlock(&rq1->lock); 1712 raw_spin_unlock(&rq1->lock);
1714 if (rq1 != rq2) 1713 if (rq1 != rq2)
1715 raw_spin_unlock(&rq2->lock); 1714 raw_spin_unlock(&rq2->lock);
1716 else 1715 else
1717 __release(rq2->lock); 1716 __release(rq2->lock);
1718 } 1717 }
1719 1718
1720 #else /* CONFIG_SMP */ 1719 #else /* CONFIG_SMP */
1721 1720
1722 /* 1721 /*
1723 * double_rq_lock - safely lock two runqueues 1722 * double_rq_lock - safely lock two runqueues
1724 * 1723 *
1725 * Note this does not disable interrupts like task_rq_lock, 1724 * Note this does not disable interrupts like task_rq_lock,
1726 * you need to do so manually before calling. 1725 * you need to do so manually before calling.
1727 */ 1726 */
1728 static void double_rq_lock(struct rq *rq1, struct rq *rq2) 1727 static void double_rq_lock(struct rq *rq1, struct rq *rq2)
1729 __acquires(rq1->lock) 1728 __acquires(rq1->lock)
1730 __acquires(rq2->lock) 1729 __acquires(rq2->lock)
1731 { 1730 {
1732 BUG_ON(!irqs_disabled()); 1731 BUG_ON(!irqs_disabled());
1733 BUG_ON(rq1 != rq2); 1732 BUG_ON(rq1 != rq2);
1734 raw_spin_lock(&rq1->lock); 1733 raw_spin_lock(&rq1->lock);
1735 __acquire(rq2->lock); /* Fake it out ;) */ 1734 __acquire(rq2->lock); /* Fake it out ;) */
1736 } 1735 }
1737 1736
1738 /* 1737 /*
1739 * double_rq_unlock - safely unlock two runqueues 1738 * double_rq_unlock - safely unlock two runqueues
1740 * 1739 *
1741 * Note this does not restore interrupts like task_rq_unlock, 1740 * Note this does not restore interrupts like task_rq_unlock,
1742 * you need to do so manually after calling. 1741 * you need to do so manually after calling.
1743 */ 1742 */
1744 static void double_rq_unlock(struct rq *rq1, struct rq *rq2) 1743 static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
1745 __releases(rq1->lock) 1744 __releases(rq1->lock)
1746 __releases(rq2->lock) 1745 __releases(rq2->lock)
1747 { 1746 {
1748 BUG_ON(rq1 != rq2); 1747 BUG_ON(rq1 != rq2);
1749 raw_spin_unlock(&rq1->lock); 1748 raw_spin_unlock(&rq1->lock);
1750 __release(rq2->lock); 1749 __release(rq2->lock);
1751 } 1750 }
1752 1751
1753 #endif 1752 #endif
1754 1753
1755 static void calc_load_account_idle(struct rq *this_rq); 1754 static void calc_load_account_idle(struct rq *this_rq);
1756 static void update_sysctl(void); 1755 static void update_sysctl(void);
1757 static int get_update_sysctl_factor(void); 1756 static int get_update_sysctl_factor(void);
1758 static void update_cpu_load(struct rq *this_rq); 1757 static void update_cpu_load(struct rq *this_rq);
1759 1758
1760 static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) 1759 static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
1761 { 1760 {
1762 set_task_rq(p, cpu); 1761 set_task_rq(p, cpu);
1763 #ifdef CONFIG_SMP 1762 #ifdef CONFIG_SMP
1764 /* 1763 /*
1765 * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be 1764 * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be
1766 * successfully executed on another CPU. We must ensure that updates of 1765 * successfully executed on another CPU. We must ensure that updates of
1767 * per-task data have been completed by this moment. 1766 * per-task data have been completed by this moment.
1768 */ 1767 */
1769 smp_wmb(); 1768 smp_wmb();
1770 task_thread_info(p)->cpu = cpu; 1769 task_thread_info(p)->cpu = cpu;
1771 #endif 1770 #endif
1772 } 1771 }
1773 1772
1774 static const struct sched_class rt_sched_class; 1773 static const struct sched_class rt_sched_class;
1775 1774
1776 #define sched_class_highest (&stop_sched_class) 1775 #define sched_class_highest (&stop_sched_class)
1777 #define for_each_class(class) \ 1776 #define for_each_class(class) \
1778 for (class = sched_class_highest; class; class = class->next) 1777 for (class = sched_class_highest; class; class = class->next)
1779 1778
1780 #include "sched_stats.h" 1779 #include "sched_stats.h"
1781 1780
1782 static void inc_nr_running(struct rq *rq) 1781 static void inc_nr_running(struct rq *rq)
1783 { 1782 {
1784 rq->nr_running++; 1783 rq->nr_running++;
1785 } 1784 }
1786 1785
1787 static void dec_nr_running(struct rq *rq) 1786 static void dec_nr_running(struct rq *rq)
1788 { 1787 {
1789 rq->nr_running--; 1788 rq->nr_running--;
1790 } 1789 }
1791 1790
1792 static void set_load_weight(struct task_struct *p) 1791 static void set_load_weight(struct task_struct *p)
1793 { 1792 {
1794 int prio = p->static_prio - MAX_RT_PRIO; 1793 int prio = p->static_prio - MAX_RT_PRIO;
1795 struct load_weight *load = &p->se.load; 1794 struct load_weight *load = &p->se.load;
1796 1795
1797 /* 1796 /*
1798 * SCHED_IDLE tasks get minimal weight: 1797 * SCHED_IDLE tasks get minimal weight:
1799 */ 1798 */
1800 if (p->policy == SCHED_IDLE) { 1799 if (p->policy == SCHED_IDLE) {
1801 load->weight = scale_load(WEIGHT_IDLEPRIO); 1800 load->weight = scale_load(WEIGHT_IDLEPRIO);
1802 load->inv_weight = WMULT_IDLEPRIO; 1801 load->inv_weight = WMULT_IDLEPRIO;
1803 return; 1802 return;
1804 } 1803 }
1805 1804
1806 load->weight = scale_load(prio_to_weight[prio]); 1805 load->weight = scale_load(prio_to_weight[prio]);
1807 load->inv_weight = prio_to_wmult[prio]; 1806 load->inv_weight = prio_to_wmult[prio];
1808 } 1807 }
1809 1808
1810 static void enqueue_task(struct rq *rq, struct task_struct *p, int flags) 1809 static void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
1811 { 1810 {
1812 update_rq_clock(rq); 1811 update_rq_clock(rq);
1813 sched_info_queued(p); 1812 sched_info_queued(p);
1814 p->sched_class->enqueue_task(rq, p, flags); 1813 p->sched_class->enqueue_task(rq, p, flags);
1815 } 1814 }
1816 1815
1817 static void dequeue_task(struct rq *rq, struct task_struct *p, int flags) 1816 static void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
1818 { 1817 {
1819 update_rq_clock(rq); 1818 update_rq_clock(rq);
1820 sched_info_dequeued(p); 1819 sched_info_dequeued(p);
1821 p->sched_class->dequeue_task(rq, p, flags); 1820 p->sched_class->dequeue_task(rq, p, flags);
1822 } 1821 }
1823 1822
1824 /* 1823 /*
1825 * activate_task - move a task to the runqueue. 1824 * activate_task - move a task to the runqueue.
1826 */ 1825 */
1827 static void activate_task(struct rq *rq, struct task_struct *p, int flags) 1826 static void activate_task(struct rq *rq, struct task_struct *p, int flags)
1828 { 1827 {
1829 if (task_contributes_to_load(p)) 1828 if (task_contributes_to_load(p))
1830 rq->nr_uninterruptible--; 1829 rq->nr_uninterruptible--;
1831 1830
1832 enqueue_task(rq, p, flags); 1831 enqueue_task(rq, p, flags);
1833 inc_nr_running(rq); 1832 inc_nr_running(rq);
1834 } 1833 }
1835 1834
1836 /* 1835 /*
1837 * deactivate_task - remove a task from the runqueue. 1836 * deactivate_task - remove a task from the runqueue.
1838 */ 1837 */
1839 static void deactivate_task(struct rq *rq, struct task_struct *p, int flags) 1838 static void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
1840 { 1839 {
1841 if (task_contributes_to_load(p)) 1840 if (task_contributes_to_load(p))
1842 rq->nr_uninterruptible++; 1841 rq->nr_uninterruptible++;
1843 1842
1844 dequeue_task(rq, p, flags); 1843 dequeue_task(rq, p, flags);
1845 dec_nr_running(rq); 1844 dec_nr_running(rq);
1846 } 1845 }
1847 1846
1848 #ifdef CONFIG_IRQ_TIME_ACCOUNTING 1847 #ifdef CONFIG_IRQ_TIME_ACCOUNTING
1849 1848
1850 /* 1849 /*
1851 * There are no locks covering percpu hardirq/softirq time. 1850 * There are no locks covering percpu hardirq/softirq time.
1852 * They are only modified in account_system_vtime, on the corresponding CPU 1851 * They are only modified in account_system_vtime, on the corresponding CPU
1853 * with interrupts disabled. So, writes are safe. 1852 * with interrupts disabled. So, writes are safe.
1854 * They are read and saved off onto struct rq in update_rq_clock(). 1853 * They are read and saved off onto struct rq in update_rq_clock().
1855 * This may result in another CPU reading this CPU's irq time and can 1854 * This may result in another CPU reading this CPU's irq time and can
1856 * race with irq/account_system_vtime on this CPU. We would either get the old 1855 * race with irq/account_system_vtime on this CPU. We would either get the old
1857 * or the new value with a side effect of accounting a slice of irq time to the 1856 * or the new value with a side effect of accounting a slice of irq time to the
1858 * wrong task when an irq is in progress while we read rq->clock. That is a worthy 1857 * wrong task when an irq is in progress while we read rq->clock. That is a worthy
1859 * compromise in place of having locks on each irq in account_system_time. 1858 * compromise in place of having locks on each irq in account_system_time.
1860 */ 1859 */
1861 static DEFINE_PER_CPU(u64, cpu_hardirq_time); 1860 static DEFINE_PER_CPU(u64, cpu_hardirq_time);
1862 static DEFINE_PER_CPU(u64, cpu_softirq_time); 1861 static DEFINE_PER_CPU(u64, cpu_softirq_time);
1863 1862
1864 static DEFINE_PER_CPU(u64, irq_start_time); 1863 static DEFINE_PER_CPU(u64, irq_start_time);
1865 static int sched_clock_irqtime; 1864 static int sched_clock_irqtime;
1866 1865
1867 void enable_sched_clock_irqtime(void) 1866 void enable_sched_clock_irqtime(void)
1868 { 1867 {
1869 sched_clock_irqtime = 1; 1868 sched_clock_irqtime = 1;
1870 } 1869 }
1871 1870
1872 void disable_sched_clock_irqtime(void) 1871 void disable_sched_clock_irqtime(void)
1873 { 1872 {
1874 sched_clock_irqtime = 0; 1873 sched_clock_irqtime = 0;
1875 } 1874 }
1876 1875
1877 #ifndef CONFIG_64BIT 1876 #ifndef CONFIG_64BIT
1878 static DEFINE_PER_CPU(seqcount_t, irq_time_seq); 1877 static DEFINE_PER_CPU(seqcount_t, irq_time_seq);
1879 1878
1880 static inline void irq_time_write_begin(void) 1879 static inline void irq_time_write_begin(void)
1881 { 1880 {
1882 __this_cpu_inc(irq_time_seq.sequence); 1881 __this_cpu_inc(irq_time_seq.sequence);
1883 smp_wmb(); 1882 smp_wmb();
1884 } 1883 }
1885 1884
1886 static inline void irq_time_write_end(void) 1885 static inline void irq_time_write_end(void)
1887 { 1886 {
1888 smp_wmb(); 1887 smp_wmb();
1889 __this_cpu_inc(irq_time_seq.sequence); 1888 __this_cpu_inc(irq_time_seq.sequence);
1890 } 1889 }
1891 1890
1892 static inline u64 irq_time_read(int cpu) 1891 static inline u64 irq_time_read(int cpu)
1893 { 1892 {
1894 u64 irq_time; 1893 u64 irq_time;
1895 unsigned seq; 1894 unsigned seq;
1896 1895
1897 do { 1896 do {
1898 seq = read_seqcount_begin(&per_cpu(irq_time_seq, cpu)); 1897 seq = read_seqcount_begin(&per_cpu(irq_time_seq, cpu));
1899 irq_time = per_cpu(cpu_softirq_time, cpu) + 1898 irq_time = per_cpu(cpu_softirq_time, cpu) +
1900 per_cpu(cpu_hardirq_time, cpu); 1899 per_cpu(cpu_hardirq_time, cpu);
1901 } while (read_seqcount_retry(&per_cpu(irq_time_seq, cpu), seq)); 1900 } while (read_seqcount_retry(&per_cpu(irq_time_seq, cpu), seq));
1902 1901
1903 return irq_time; 1902 return irq_time;
1904 } 1903 }
1905 #else /* CONFIG_64BIT */ 1904 #else /* CONFIG_64BIT */
1906 static inline void irq_time_write_begin(void) 1905 static inline void irq_time_write_begin(void)
1907 { 1906 {
1908 } 1907 }
1909 1908
1910 static inline void irq_time_write_end(void) 1909 static inline void irq_time_write_end(void)
1911 { 1910 {
1912 } 1911 }
1913 1912
1914 static inline u64 irq_time_read(int cpu) 1913 static inline u64 irq_time_read(int cpu)
1915 { 1914 {
1916 return per_cpu(cpu_softirq_time, cpu) + per_cpu(cpu_hardirq_time, cpu); 1915 return per_cpu(cpu_softirq_time, cpu) + per_cpu(cpu_hardirq_time, cpu);
1917 } 1916 }
1918 #endif /* CONFIG_64BIT */ 1917 #endif /* CONFIG_64BIT */
1919 1918
1920 /* 1919 /*
1921 * Called before incrementing preempt_count on {soft,}irq_enter 1920 * Called before incrementing preempt_count on {soft,}irq_enter
1922 * and before decrementing preempt_count on {soft,}irq_exit. 1921 * and before decrementing preempt_count on {soft,}irq_exit.
1923 */ 1922 */
1924 void account_system_vtime(struct task_struct *curr) 1923 void account_system_vtime(struct task_struct *curr)
1925 { 1924 {
1926 unsigned long flags; 1925 unsigned long flags;
1927 s64 delta; 1926 s64 delta;
1928 int cpu; 1927 int cpu;
1929 1928
1930 if (!sched_clock_irqtime) 1929 if (!sched_clock_irqtime)
1931 return; 1930 return;
1932 1931
1933 local_irq_save(flags); 1932 local_irq_save(flags);
1934 1933
1935 cpu = smp_processor_id(); 1934 cpu = smp_processor_id();
1936 delta = sched_clock_cpu(cpu) - __this_cpu_read(irq_start_time); 1935 delta = sched_clock_cpu(cpu) - __this_cpu_read(irq_start_time);
1937 __this_cpu_add(irq_start_time, delta); 1936 __this_cpu_add(irq_start_time, delta);
1938 1937
1939 irq_time_write_begin(); 1938 irq_time_write_begin();
1940 /* 1939 /*
1941 * We do not account for softirq time from ksoftirqd here. 1940 * We do not account for softirq time from ksoftirqd here.
1942 * We want to continue accounting softirq time to the ksoftirqd thread 1941 * We want to continue accounting softirq time to the ksoftirqd thread
1943 * in that case, so as not to confuse the scheduler with a special task 1942 * in that case, so as not to confuse the scheduler with a special task
1944 * that does not consume any time but still wants to run. 1943 * that does not consume any time but still wants to run.
1945 */ 1944 */
1946 if (hardirq_count()) 1945 if (hardirq_count())
1947 __this_cpu_add(cpu_hardirq_time, delta); 1946 __this_cpu_add(cpu_hardirq_time, delta);
1948 else if (in_serving_softirq() && curr != this_cpu_ksoftirqd()) 1947 else if (in_serving_softirq() && curr != this_cpu_ksoftirqd())
1949 __this_cpu_add(cpu_softirq_time, delta); 1948 __this_cpu_add(cpu_softirq_time, delta);
1950 1949
1951 irq_time_write_end(); 1950 irq_time_write_end();
1952 local_irq_restore(flags); 1951 local_irq_restore(flags);
1953 } 1952 }
1954 EXPORT_SYMBOL_GPL(account_system_vtime); 1953 EXPORT_SYMBOL_GPL(account_system_vtime);
1955 1954
1956 static void update_rq_clock_task(struct rq *rq, s64 delta) 1955 static void update_rq_clock_task(struct rq *rq, s64 delta)
1957 { 1956 {
1958 s64 irq_delta; 1957 s64 irq_delta;
1959 1958
1960 irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time; 1959 irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time;
1961 1960
1962 /* 1961 /*
1963 * Since irq_time is only updated on {soft,}irq_exit, we might run into 1962 * Since irq_time is only updated on {soft,}irq_exit, we might run into
1964 * this case when a previous update_rq_clock() happened inside a 1963 * this case when a previous update_rq_clock() happened inside a
1965 * {soft,}irq region. 1964 * {soft,}irq region.
1966 * 1965 *
1967 * When this happens, we stop ->clock_task and only update the 1966 * When this happens, we stop ->clock_task and only update the
1968 * prev_irq_time stamp to account for the part that fit, so that a next 1967 * prev_irq_time stamp to account for the part that fit, so that a next
1969 * update will consume the rest. This ensures ->clock_task is 1968 * update will consume the rest. This ensures ->clock_task is
1970 * monotonic. 1969 * monotonic.
1971 * 1970 *
1972 * It does however cause some slight misattribution of {soft,}irq 1971 * It does however cause some slight misattribution of {soft,}irq
1973 * time, a more accurate solution would be to update the irq_time using 1972 * time, a more accurate solution would be to update the irq_time using
1974 * the current rq->clock timestamp, except that would require using 1973 * the current rq->clock timestamp, except that would require using
1975 * atomic ops. 1974 * atomic ops.
1976 */ 1975 */
1977 if (irq_delta > delta) 1976 if (irq_delta > delta)
1978 irq_delta = delta; 1977 irq_delta = delta;
1979 1978
1980 rq->prev_irq_time += irq_delta; 1979 rq->prev_irq_time += irq_delta;
1981 delta -= irq_delta; 1980 delta -= irq_delta;
1982 rq->clock_task += delta; 1981 rq->clock_task += delta;
1983 1982
1984 if (irq_delta && sched_feat(NONIRQ_POWER)) 1983 if (irq_delta && sched_feat(NONIRQ_POWER))
1985 sched_rt_avg_update(rq, irq_delta); 1984 sched_rt_avg_update(rq, irq_delta);
1986 } 1985 }
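
To see the clamping above in action (a hypothetical example with invented numbers): if 3 ms of wall-clock delta elapsed but 5 ms of not-yet-accounted irq time accumulated, irq_delta is clamped to 3 ms, ->clock_task does not advance at all for this update, and prev_irq_time only absorbs the 3 ms that fit; the remaining 2 ms of irq time is consumed by a later update, which keeps ->clock_task monotonic.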
1987 1986
1988 static int irqtime_account_hi_update(void) 1987 static int irqtime_account_hi_update(void)
1989 { 1988 {
1990 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; 1989 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
1991 unsigned long flags; 1990 unsigned long flags;
1992 u64 latest_ns; 1991 u64 latest_ns;
1993 int ret = 0; 1992 int ret = 0;
1994 1993
1995 local_irq_save(flags); 1994 local_irq_save(flags);
1996 latest_ns = this_cpu_read(cpu_hardirq_time); 1995 latest_ns = this_cpu_read(cpu_hardirq_time);
1997 if (cputime64_gt(nsecs_to_cputime64(latest_ns), cpustat->irq)) 1996 if (cputime64_gt(nsecs_to_cputime64(latest_ns), cpustat->irq))
1998 ret = 1; 1997 ret = 1;
1999 local_irq_restore(flags); 1998 local_irq_restore(flags);
2000 return ret; 1999 return ret;
2001 } 2000 }
2002 2001
2003 static int irqtime_account_si_update(void) 2002 static int irqtime_account_si_update(void)
2004 { 2003 {
2005 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; 2004 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
2006 unsigned long flags; 2005 unsigned long flags;
2007 u64 latest_ns; 2006 u64 latest_ns;
2008 int ret = 0; 2007 int ret = 0;
2009 2008
2010 local_irq_save(flags); 2009 local_irq_save(flags);
2011 latest_ns = this_cpu_read(cpu_softirq_time); 2010 latest_ns = this_cpu_read(cpu_softirq_time);
2012 if (cputime64_gt(nsecs_to_cputime64(latest_ns), cpustat->softirq)) 2011 if (cputime64_gt(nsecs_to_cputime64(latest_ns), cpustat->softirq))
2013 ret = 1; 2012 ret = 1;
2014 local_irq_restore(flags); 2013 local_irq_restore(flags);
2015 return ret; 2014 return ret;
2016 } 2015 }
2017 2016
2018 #else /* CONFIG_IRQ_TIME_ACCOUNTING */ 2017 #else /* CONFIG_IRQ_TIME_ACCOUNTING */
2019 2018
2020 #define sched_clock_irqtime (0) 2019 #define sched_clock_irqtime (0)
2021 2020
2022 static void update_rq_clock_task(struct rq *rq, s64 delta) 2021 static void update_rq_clock_task(struct rq *rq, s64 delta)
2023 { 2022 {
2024 rq->clock_task += delta; 2023 rq->clock_task += delta;
2025 } 2024 }
2026 2025
2027 #endif /* CONFIG_IRQ_TIME_ACCOUNTING */ 2026 #endif /* CONFIG_IRQ_TIME_ACCOUNTING */
2028 2027
2029 #include "sched_idletask.c" 2028 #include "sched_idletask.c"
2030 #include "sched_fair.c" 2029 #include "sched_fair.c"
2031 #include "sched_rt.c" 2030 #include "sched_rt.c"
2032 #include "sched_autogroup.c" 2031 #include "sched_autogroup.c"
2033 #include "sched_stoptask.c" 2032 #include "sched_stoptask.c"
2034 #ifdef CONFIG_SCHED_DEBUG 2033 #ifdef CONFIG_SCHED_DEBUG
2035 # include "sched_debug.c" 2034 # include "sched_debug.c"
2036 #endif 2035 #endif
2037 2036
2038 void sched_set_stop_task(int cpu, struct task_struct *stop) 2037 void sched_set_stop_task(int cpu, struct task_struct *stop)
2039 { 2038 {
2040 struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 }; 2039 struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };
2041 struct task_struct *old_stop = cpu_rq(cpu)->stop; 2040 struct task_struct *old_stop = cpu_rq(cpu)->stop;
2042 2041
2043 if (stop) { 2042 if (stop) {
2044 /* 2043 /*
2045 * Make it appear like a SCHED_FIFO task, it's something 2044 * Make it appear like a SCHED_FIFO task, it's something
2046 * userspace knows about and won't get confused about. 2045 * userspace knows about and won't get confused about.
2047 * 2046 *
2048 * Also, it will make PI more or less work without too 2047 * Also, it will make PI more or less work without too
2049 * much confusion -- but then, stop work should not 2048 * much confusion -- but then, stop work should not
2050 * rely on PI working anyway. 2049 * rely on PI working anyway.
2051 */ 2050 */
2052 sched_setscheduler_nocheck(stop, SCHED_FIFO, &param); 2051 sched_setscheduler_nocheck(stop, SCHED_FIFO, &param);
2053 2052
2054 stop->sched_class = &stop_sched_class; 2053 stop->sched_class = &stop_sched_class;
2055 } 2054 }
2056 2055
2057 cpu_rq(cpu)->stop = stop; 2056 cpu_rq(cpu)->stop = stop;
2058 2057
2059 if (old_stop) { 2058 if (old_stop) {
2060 /* 2059 /*
2061 * Reset it back to a normal scheduling class so that 2060 * Reset it back to a normal scheduling class so that
2062 * it can die in pieces. 2061 * it can die in pieces.
2063 */ 2062 */
2064 old_stop->sched_class = &rt_sched_class; 2063 old_stop->sched_class = &rt_sched_class;
2065 } 2064 }
2066 } 2065 }
2067 2066
2068 /* 2067 /*
2069 * __normal_prio - return the priority that is based on the static prio 2068 * __normal_prio - return the priority that is based on the static prio
2070 */ 2069 */
2071 static inline int __normal_prio(struct task_struct *p) 2070 static inline int __normal_prio(struct task_struct *p)
2072 { 2071 {
2073 return p->static_prio; 2072 return p->static_prio;
2074 } 2073 }
2075 2074
2076 /* 2075 /*
2077 * Calculate the expected normal priority: i.e. priority 2076 * Calculate the expected normal priority: i.e. priority
2078 * without taking RT-inheritance into account. Might be 2077 * without taking RT-inheritance into account. Might be
2079 * boosted by interactivity modifiers. Changes upon fork, 2078 * boosted by interactivity modifiers. Changes upon fork,
2080 * setprio syscalls, and whenever the interactivity 2079 * setprio syscalls, and whenever the interactivity
2081 * estimator recalculates. 2080 * estimator recalculates.
2082 */ 2081 */
2083 static inline int normal_prio(struct task_struct *p) 2082 static inline int normal_prio(struct task_struct *p)
2084 { 2083 {
2085 int prio; 2084 int prio;
2086 2085
2087 if (task_has_rt_policy(p)) 2086 if (task_has_rt_policy(p))
2088 prio = MAX_RT_PRIO-1 - p->rt_priority; 2087 prio = MAX_RT_PRIO-1 - p->rt_priority;
2089 else 2088 else
2090 prio = __normal_prio(p); 2089 prio = __normal_prio(p);
2091 return prio; 2090 return prio;
2092 } 2091 }
2093 2092
2094 /* 2093 /*
2095 * Calculate the current priority, i.e. the priority 2094 * Calculate the current priority, i.e. the priority
2096 * taken into account by the scheduler. This value might 2095 * taken into account by the scheduler. This value might
2097 * be boosted by RT tasks, or might be boosted by 2096 * be boosted by RT tasks, or might be boosted by
2098 * interactivity modifiers. Will be RT if the task got 2097 * interactivity modifiers. Will be RT if the task got
2099 * RT-boosted. If not then it returns p->normal_prio. 2098 * RT-boosted. If not then it returns p->normal_prio.
2100 */ 2099 */
2101 static int effective_prio(struct task_struct *p) 2100 static int effective_prio(struct task_struct *p)
2102 { 2101 {
2103 p->normal_prio = normal_prio(p); 2102 p->normal_prio = normal_prio(p);
2104 /* 2103 /*
2105 * If we are RT tasks or we were boosted to RT priority, 2104 * If we are RT tasks or we were boosted to RT priority,
2106 * keep the priority unchanged. Otherwise, update priority 2105 * keep the priority unchanged. Otherwise, update priority
2107 * to the normal priority: 2106 * to the normal priority:
2108 */ 2107 */
2109 if (!rt_prio(p->prio)) 2108 if (!rt_prio(p->prio))
2110 return p->normal_prio; 2109 return p->normal_prio;
2111 return p->prio; 2110 return p->prio;
2112 } 2111 }
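
The two helpers above fold both scheduling policies onto the kernel's single priority scale: RT tasks map rt_priority 1..99 down to 98..0 (a lower prio value means a more important task), while everything else keeps its nice-derived static_prio. A minimal userspace sketch of that arithmetic, assuming the customary constants (MAX_RT_PRIO of 100, nice levels landing at 120 + nice); struct fake_task and model_normal_prio() are illustrative stand-ins, not kernel API:

#include <stdio.h>

/* Constants assumed to mirror the usual kernel values. */
#define MAX_RT_PRIO             100
#define NICE_TO_PRIO(nice)      ((nice) + 120)

struct fake_task {
        int has_rt_policy;      /* models task_has_rt_policy(p) */
        int rt_priority;        /* 1..99, larger means more urgent */
        int static_prio;        /* 100..139 for SCHED_NORMAL tasks */
};

/* Mirrors normal_prio(): RT maps to 0..98, everything else keeps static_prio. */
static int model_normal_prio(const struct fake_task *p)
{
        if (p->has_rt_policy)
                return MAX_RT_PRIO - 1 - p->rt_priority;
        return p->static_prio;
}

int main(void)
{
        struct fake_task rt50   = { 1, 50, 0 };
        struct fake_task nice0  = { 0, 0, NICE_TO_PRIO(0) };
        struct fake_task nice19 = { 0, 0, NICE_TO_PRIO(19) };

        printf("SCHED_FIFO rt_priority 50 -> prio %d\n", model_normal_prio(&rt50));
        printf("nice 0                    -> prio %d\n", model_normal_prio(&nice0));
        printf("nice 19                   -> prio %d\n", model_normal_prio(&nice19));
        return 0;
}
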
2113 2112
2114 /** 2113 /**
2115 * task_curr - is this task currently executing on a CPU? 2114 * task_curr - is this task currently executing on a CPU?
2116 * @p: the task in question. 2115 * @p: the task in question.
2117 */ 2116 */
2118 inline int task_curr(const struct task_struct *p) 2117 inline int task_curr(const struct task_struct *p)
2119 { 2118 {
2120 return cpu_curr(task_cpu(p)) == p; 2119 return cpu_curr(task_cpu(p)) == p;
2121 } 2120 }
2122 2121
2123 static inline void check_class_changed(struct rq *rq, struct task_struct *p, 2122 static inline void check_class_changed(struct rq *rq, struct task_struct *p,
2124 const struct sched_class *prev_class, 2123 const struct sched_class *prev_class,
2125 int oldprio) 2124 int oldprio)
2126 { 2125 {
2127 if (prev_class != p->sched_class) { 2126 if (prev_class != p->sched_class) {
2128 if (prev_class->switched_from) 2127 if (prev_class->switched_from)
2129 prev_class->switched_from(rq, p); 2128 prev_class->switched_from(rq, p);
2130 p->sched_class->switched_to(rq, p); 2129 p->sched_class->switched_to(rq, p);
2131 } else if (oldprio != p->prio) 2130 } else if (oldprio != p->prio)
2132 p->sched_class->prio_changed(rq, p, oldprio); 2131 p->sched_class->prio_changed(rq, p, oldprio);
2133 } 2132 }
2134 2133
2135 static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) 2134 static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
2136 { 2135 {
2137 const struct sched_class *class; 2136 const struct sched_class *class;
2138 2137
2139 if (p->sched_class == rq->curr->sched_class) { 2138 if (p->sched_class == rq->curr->sched_class) {
2140 rq->curr->sched_class->check_preempt_curr(rq, p, flags); 2139 rq->curr->sched_class->check_preempt_curr(rq, p, flags);
2141 } else { 2140 } else {
2142 for_each_class(class) { 2141 for_each_class(class) {
2143 if (class == rq->curr->sched_class) 2142 if (class == rq->curr->sched_class)
2144 break; 2143 break;
2145 if (class == p->sched_class) { 2144 if (class == p->sched_class) {
2146 resched_task(rq->curr); 2145 resched_task(rq->curr);
2147 break; 2146 break;
2148 } 2147 }
2149 } 2148 }
2150 } 2149 }
2151 2150
2152 /* 2151 /*
2153 * A queue event has occurred, and we're going to schedule. In 2152 * A queue event has occurred, and we're going to schedule. In
2154 * this case, we can save a useless back to back clock update. 2153 * this case, we can save a useless back to back clock update.
2155 */ 2154 */
2156 if (rq->curr->on_rq && test_tsk_need_resched(rq->curr)) 2155 if (rq->curr->on_rq && test_tsk_need_resched(rq->curr))
2157 rq->skip_clock_update = 1; 2156 rq->skip_clock_update = 1;
2158 } 2157 }
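
When the waking task and the currently running task belong to different scheduling classes, the loop above settles preemption purely by class rank: for_each_class() walks from the highest class downward, and whichever of the two classes turns up first wins. A small standalone model of that rule, assuming the conventional ordering stop, rt, fair, idle; the string-based classes[] table and should_resched() helper are purely illustrative:

#include <stdio.h>
#include <string.h>

/* Assumed class ordering, highest first (models the for_each_class() walk). */
static const char *classes[] = { "stop", "rt", "fair", "idle" };

/* Returns 1 if a wakee of class @wakee should preempt a running @curr task. */
static int should_resched(const char *curr, const char *wakee)
{
        size_t i;

        for (i = 0; i < sizeof(classes) / sizeof(classes[0]); i++) {
                if (strcmp(classes[i], curr) == 0)
                        return 0;       /* curr's class ranks at least as high */
                if (strcmp(classes[i], wakee) == 0)
                        return 1;       /* wakee's class ranks higher: resched */
        }
        return 0;
}

int main(void)
{
        printf("rt wakes while fair runs -> %d\n", should_resched("fair", "rt"));
        printf("fair wakes while rt runs -> %d\n", should_resched("rt", "fair"));
        return 0;
}
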
2159 2158
2160 #ifdef CONFIG_SMP 2159 #ifdef CONFIG_SMP
2161 /* 2160 /*
2162 * Is this task likely cache-hot: 2161 * Is this task likely cache-hot:
2163 */ 2162 */
2164 static int 2163 static int
2165 task_hot(struct task_struct *p, u64 now, struct sched_domain *sd) 2164 task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
2166 { 2165 {
2167 s64 delta; 2166 s64 delta;
2168 2167
2169 if (p->sched_class != &fair_sched_class) 2168 if (p->sched_class != &fair_sched_class)
2170 return 0; 2169 return 0;
2171 2170
2172 if (unlikely(p->policy == SCHED_IDLE)) 2171 if (unlikely(p->policy == SCHED_IDLE))
2173 return 0; 2172 return 0;
2174 2173
2175 /* 2174 /*
2176 * Buddy candidates are cache hot: 2175 * Buddy candidates are cache hot:
2177 */ 2176 */
2178 if (sched_feat(CACHE_HOT_BUDDY) && this_rq()->nr_running && 2177 if (sched_feat(CACHE_HOT_BUDDY) && this_rq()->nr_running &&
2179 (&p->se == cfs_rq_of(&p->se)->next || 2178 (&p->se == cfs_rq_of(&p->se)->next ||
2180 &p->se == cfs_rq_of(&p->se)->last)) 2179 &p->se == cfs_rq_of(&p->se)->last))
2181 return 1; 2180 return 1;
2182 2181
2183 if (sysctl_sched_migration_cost == -1) 2182 if (sysctl_sched_migration_cost == -1)
2184 return 1; 2183 return 1;
2185 if (sysctl_sched_migration_cost == 0) 2184 if (sysctl_sched_migration_cost == 0)
2186 return 0; 2185 return 0;
2187 2186
2188 delta = now - p->se.exec_start; 2187 delta = now - p->se.exec_start;
2189 2188
2190 return delta < (s64)sysctl_sched_migration_cost; 2189 return delta < (s64)sysctl_sched_migration_cost;
2191 } 2190 }
2192 2191
2193 void set_task_cpu(struct task_struct *p, unsigned int new_cpu) 2192 void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
2194 { 2193 {
2195 #ifdef CONFIG_SCHED_DEBUG 2194 #ifdef CONFIG_SCHED_DEBUG
2196 /* 2195 /*
2197 * We should never call set_task_cpu() on a blocked task, 2196 * We should never call set_task_cpu() on a blocked task,
2198 * ttwu() will sort out the placement. 2197 * ttwu() will sort out the placement.
2199 */ 2198 */
2200 WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING && 2199 WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING &&
2201 !(task_thread_info(p)->preempt_count & PREEMPT_ACTIVE)); 2200 !(task_thread_info(p)->preempt_count & PREEMPT_ACTIVE));
2202 2201
2203 #ifdef CONFIG_LOCKDEP 2202 #ifdef CONFIG_LOCKDEP
2204 /* 2203 /*
2205 * The caller should hold either p->pi_lock or rq->lock, when changing 2204 * The caller should hold either p->pi_lock or rq->lock, when changing
2206 * a task's CPU. ->pi_lock for waking tasks, rq->lock for runnable tasks. 2205 * a task's CPU. ->pi_lock for waking tasks, rq->lock for runnable tasks.
2207 * 2206 *
2208 * sched_move_task() holds both and thus holding either pins the cgroup, 2207 * sched_move_task() holds both and thus holding either pins the cgroup,
2209 * see set_task_rq(). 2208 * see set_task_rq().
2210 * 2209 *
2211 * Furthermore, all task_rq users should acquire both locks, see 2210 * Furthermore, all task_rq users should acquire both locks, see
2212 * task_rq_lock(). 2211 * task_rq_lock().
2213 */ 2212 */
2214 WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) || 2213 WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) ||
2215 lockdep_is_held(&task_rq(p)->lock))); 2214 lockdep_is_held(&task_rq(p)->lock)));
2216 #endif 2215 #endif
2217 #endif 2216 #endif
2218 2217
2219 trace_sched_migrate_task(p, new_cpu); 2218 trace_sched_migrate_task(p, new_cpu);
2220 2219
2221 if (task_cpu(p) != new_cpu) { 2220 if (task_cpu(p) != new_cpu) {
2222 p->se.nr_migrations++; 2221 p->se.nr_migrations++;
2223 perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, 1, NULL, 0); 2222 perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, 1, NULL, 0);
2224 } 2223 }
2225 2224
2226 __set_task_cpu(p, new_cpu); 2225 __set_task_cpu(p, new_cpu);
2227 } 2226 }
2228 2227
2229 struct migration_arg { 2228 struct migration_arg {
2230 struct task_struct *task; 2229 struct task_struct *task;
2231 int dest_cpu; 2230 int dest_cpu;
2232 }; 2231 };
2233 2232
2234 static int migration_cpu_stop(void *data); 2233 static int migration_cpu_stop(void *data);
2235 2234
2236 /* 2235 /*
2237 * wait_task_inactive - wait for a thread to unschedule. 2236 * wait_task_inactive - wait for a thread to unschedule.
2238 * 2237 *
2239 * If @match_state is nonzero, it's the @p->state value just checked and 2238 * If @match_state is nonzero, it's the @p->state value just checked and
2240 * not expected to change. If it changes, i.e. @p might have woken up, 2239 * not expected to change. If it changes, i.e. @p might have woken up,
2241 * then return zero. When we succeed in waiting for @p to be off its CPU, 2240 * then return zero. When we succeed in waiting for @p to be off its CPU,
2242 * we return a positive number (its total switch count). If a second call 2241 * we return a positive number (its total switch count). If a second call
2243 * a short while later returns the same number, the caller can be sure that 2242 * a short while later returns the same number, the caller can be sure that
2244 * @p has remained unscheduled the whole time. 2243 * @p has remained unscheduled the whole time.
2245 * 2244 *
2246 * The caller must ensure that the task *will* unschedule sometime soon, 2245 * The caller must ensure that the task *will* unschedule sometime soon,
2247 * else this function might spin for a *long* time. This function can't 2246 * else this function might spin for a *long* time. This function can't
2248 * be called with interrupts off, or it may introduce deadlock with 2247 * be called with interrupts off, or it may introduce deadlock with
2249 * smp_call_function() if an IPI is sent by the same process we are 2248 * smp_call_function() if an IPI is sent by the same process we are
2250 * waiting to become inactive. 2249 * waiting to become inactive.
2251 */ 2250 */
2252 unsigned long wait_task_inactive(struct task_struct *p, long match_state) 2251 unsigned long wait_task_inactive(struct task_struct *p, long match_state)
2253 { 2252 {
2254 unsigned long flags; 2253 unsigned long flags;
2255 int running, on_rq; 2254 int running, on_rq;
2256 unsigned long ncsw; 2255 unsigned long ncsw;
2257 struct rq *rq; 2256 struct rq *rq;
2258 2257
2259 for (;;) { 2258 for (;;) {
2260 /* 2259 /*
2261 * We do the initial early heuristics without holding 2260 * We do the initial early heuristics without holding
2262 * any task-queue locks at all. We'll only try to get 2261 * any task-queue locks at all. We'll only try to get
2263 * the runqueue lock when things look like they will 2262 * the runqueue lock when things look like they will
2264 * work out! 2263 * work out!
2265 */ 2264 */
2266 rq = task_rq(p); 2265 rq = task_rq(p);
2267 2266
2268 /* 2267 /*
2269 * If the task is actively running on another CPU 2268 * If the task is actively running on another CPU
2270 * still, just relax and busy-wait without holding 2269 * still, just relax and busy-wait without holding
2271 * any locks. 2270 * any locks.
2272 * 2271 *
2273 * NOTE! Since we don't hold any locks, it's not 2272 * NOTE! Since we don't hold any locks, it's not
2274 * even sure that "rq" stays as the right runqueue! 2273 * even sure that "rq" stays as the right runqueue!
2275 * But we don't care, since "task_running()" will 2274 * But we don't care, since "task_running()" will
2276 * return false if the runqueue has changed and p 2275 * return false if the runqueue has changed and p
2277 * is actually now running somewhere else! 2276 * is actually now running somewhere else!
2278 */ 2277 */
2279 while (task_running(rq, p)) { 2278 while (task_running(rq, p)) {
2280 if (match_state && unlikely(p->state != match_state)) 2279 if (match_state && unlikely(p->state != match_state))
2281 return 0; 2280 return 0;
2282 cpu_relax(); 2281 cpu_relax();
2283 } 2282 }
2284 2283
2285 /* 2284 /*
2286 * Ok, time to look more closely! We need the rq 2285 * Ok, time to look more closely! We need the rq
2287 * lock now, to be *sure*. If we're wrong, we'll 2286 * lock now, to be *sure*. If we're wrong, we'll
2288 * just go back and repeat. 2287 * just go back and repeat.
2289 */ 2288 */
2290 rq = task_rq_lock(p, &flags); 2289 rq = task_rq_lock(p, &flags);
2291 trace_sched_wait_task(p); 2290 trace_sched_wait_task(p);
2292 running = task_running(rq, p); 2291 running = task_running(rq, p);
2293 on_rq = p->on_rq; 2292 on_rq = p->on_rq;
2294 ncsw = 0; 2293 ncsw = 0;
2295 if (!match_state || p->state == match_state) 2294 if (!match_state || p->state == match_state)
2296 ncsw = p->nvcsw | LONG_MIN; /* sets MSB */ 2295 ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
2297 task_rq_unlock(rq, p, &flags); 2296 task_rq_unlock(rq, p, &flags);
2298 2297
2299 /* 2298 /*
2300 * If it changed from the expected state, bail out now. 2299 * If it changed from the expected state, bail out now.
2301 */ 2300 */
2302 if (unlikely(!ncsw)) 2301 if (unlikely(!ncsw))
2303 break; 2302 break;
2304 2303
2305 /* 2304 /*
2306 * Was it really running after all now that we 2305 * Was it really running after all now that we
2307 * checked with the proper locks actually held? 2306 * checked with the proper locks actually held?
2308 * 2307 *
2309 * Oops. Go back and try again.. 2308 * Oops. Go back and try again..
2310 */ 2309 */
2311 if (unlikely(running)) { 2310 if (unlikely(running)) {
2312 cpu_relax(); 2311 cpu_relax();
2313 continue; 2312 continue;
2314 } 2313 }
2315 2314
2316 /* 2315 /*
2317 * It's not enough that it's not actively running, 2316 * It's not enough that it's not actively running,
2318 * it must be off the runqueue _entirely_, and not 2317 * it must be off the runqueue _entirely_, and not
2319 * preempted! 2318 * preempted!
2320 * 2319 *
2321 * So if it was still runnable (but just not actively 2320 * So if it was still runnable (but just not actively
2322 * running right now), it's preempted, and we should 2321 * running right now), it's preempted, and we should
2323 * yield - it could be a while. 2322 * yield - it could be a while.
2324 */ 2323 */
2325 if (unlikely(on_rq)) { 2324 if (unlikely(on_rq)) {
2326 ktime_t to = ktime_set(0, NSEC_PER_SEC/HZ); 2325 ktime_t to = ktime_set(0, NSEC_PER_SEC/HZ);
2327 2326
2328 set_current_state(TASK_UNINTERRUPTIBLE); 2327 set_current_state(TASK_UNINTERRUPTIBLE);
2329 schedule_hrtimeout(&to, HRTIMER_MODE_REL); 2328 schedule_hrtimeout(&to, HRTIMER_MODE_REL);
2330 continue; 2329 continue;
2331 } 2330 }
2332 2331
2333 /* 2332 /*
2334 * Ahh, all good. It wasn't running, and it wasn't 2333 * Ahh, all good. It wasn't running, and it wasn't
2335 * runnable, which means that it will never become 2334 * runnable, which means that it will never become
2336 * running in the future either. We're all done! 2335 * running in the future either. We're all done!
2337 */ 2336 */
2338 break; 2337 break;
2339 } 2338 }
2340 2339
2341 return ncsw; 2340 return ncsw;
2342 } 2341 }
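
The ncsw value above packs two answers into one return code: 0 means the task's state stopped matching, while a successful sample is the voluntary context-switch count with the sign bit forced on via LONG_MIN, so even a count of zero comes back non-zero and two successive calls can simply be compared for equality, as the comment block before the function describes. A self-contained sketch of just that encoding (encode_ncsw() is a made-up name for illustration):

#include <stdio.h>
#include <limits.h>

/* Models the return-value encoding of wait_task_inactive(). */
static unsigned long encode_ncsw(int state_matched, unsigned long nvcsw)
{
        if (!state_matched)
                return 0;               /* task changed state: report failure */
        return nvcsw | LONG_MIN;        /* force the MSB so a 0 count != failure */
}

int main(void)
{
        unsigned long first  = encode_ncsw(1, 0);
        unsigned long second = encode_ncsw(1, 0);

        /* Even zero voluntary switches yields a non-zero, comparable cookie. */
        printf("cookie %#lx, unchanged across calls: %d\n",
               first, first != 0 && first == second);
        /* A state mismatch is unambiguously 0. */
        printf("state mismatch -> %lu\n", encode_ncsw(0, 42));
        return 0;
}
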
2343 2342
2344 /*** 2343 /***
2345 * kick_process - kick a running thread to enter/exit the kernel 2344 * kick_process - kick a running thread to enter/exit the kernel
2346 * @p: the to-be-kicked thread 2345 * @p: the to-be-kicked thread
2347 * 2346 *
2348 * Cause a process which is running on another CPU to enter 2347 * Cause a process which is running on another CPU to enter
2349 * kernel-mode, without any delay. (to get signals handled.) 2348 * kernel-mode, without any delay. (to get signals handled.)
2350 * 2349 *
2351 * NOTE: this function doesn't have to take the runqueue lock, 2350 * NOTE: this function doesn't have to take the runqueue lock,
2352 * because all it wants to ensure is that the remote task enters 2351 * because all it wants to ensure is that the remote task enters
2353 * the kernel. If the IPI races and the task has been migrated 2352 * the kernel. If the IPI races and the task has been migrated
2354 * to another CPU then no harm is done and the purpose has been 2353 * to another CPU then no harm is done and the purpose has been
2355 * achieved as well. 2354 * achieved as well.
2356 */ 2355 */
2357 void kick_process(struct task_struct *p) 2356 void kick_process(struct task_struct *p)
2358 { 2357 {
2359 int cpu; 2358 int cpu;
2360 2359
2361 preempt_disable(); 2360 preempt_disable();
2362 cpu = task_cpu(p); 2361 cpu = task_cpu(p);
2363 if ((cpu != smp_processor_id()) && task_curr(p)) 2362 if ((cpu != smp_processor_id()) && task_curr(p))
2364 smp_send_reschedule(cpu); 2363 smp_send_reschedule(cpu);
2365 preempt_enable(); 2364 preempt_enable();
2366 } 2365 }
2367 EXPORT_SYMBOL_GPL(kick_process); 2366 EXPORT_SYMBOL_GPL(kick_process);
2368 #endif /* CONFIG_SMP */ 2367 #endif /* CONFIG_SMP */
2369 2368
2370 #ifdef CONFIG_SMP 2369 #ifdef CONFIG_SMP
2371 /* 2370 /*
2372 * ->cpus_allowed is protected by both rq->lock and p->pi_lock 2371 * ->cpus_allowed is protected by both rq->lock and p->pi_lock
2373 */ 2372 */
2374 static int select_fallback_rq(int cpu, struct task_struct *p) 2373 static int select_fallback_rq(int cpu, struct task_struct *p)
2375 { 2374 {
2376 int dest_cpu; 2375 int dest_cpu;
2377 const struct cpumask *nodemask = cpumask_of_node(cpu_to_node(cpu)); 2376 const struct cpumask *nodemask = cpumask_of_node(cpu_to_node(cpu));
2378 2377
2379 /* Look for allowed, online CPU in same node. */ 2378 /* Look for allowed, online CPU in same node. */
2380 for_each_cpu_and(dest_cpu, nodemask, cpu_active_mask) 2379 for_each_cpu_and(dest_cpu, nodemask, cpu_active_mask)
2381 if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed)) 2380 if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed))
2382 return dest_cpu; 2381 return dest_cpu;
2383 2382
2384 /* Any allowed, online CPU? */ 2383 /* Any allowed, online CPU? */
2385 dest_cpu = cpumask_any_and(&p->cpus_allowed, cpu_active_mask); 2384 dest_cpu = cpumask_any_and(&p->cpus_allowed, cpu_active_mask);
2386 if (dest_cpu < nr_cpu_ids) 2385 if (dest_cpu < nr_cpu_ids)
2387 return dest_cpu; 2386 return dest_cpu;
2388 2387
2389 /* No more Mr. Nice Guy. */ 2388 /* No more Mr. Nice Guy. */
2390 dest_cpu = cpuset_cpus_allowed_fallback(p); 2389 dest_cpu = cpuset_cpus_allowed_fallback(p);
2391 /* 2390 /*
2392 * Don't tell them about moving exiting tasks or 2391 * Don't tell them about moving exiting tasks or
2393 * kernel threads (both mm NULL), since they never 2392 * kernel threads (both mm NULL), since they never
2394 * leave the kernel. 2393 * leave the kernel.
2395 */ 2394 */
2396 if (p->mm && printk_ratelimit()) { 2395 if (p->mm && printk_ratelimit()) {
2397 printk(KERN_INFO "process %d (%s) no longer affine to cpu%d\n", 2396 printk(KERN_INFO "process %d (%s) no longer affine to cpu%d\n",
2398 task_pid_nr(p), p->comm, cpu); 2397 task_pid_nr(p), p->comm, cpu);
2399 } 2398 }
2400 2399
2401 return dest_cpu; 2400 return dest_cpu;
2402 } 2401 }
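
select_fallback_rq() widens its search in three steps: an allowed and active CPU in the task's current node, then any allowed and active CPU, and finally whatever cpuset_cpus_allowed_fallback() hands back. A compact userspace model of that ordering with plain 32-bit masks; the masks, node layout and pick_fallback() helper are invented for illustration:

#include <stdio.h>

/* Illustrative masks: each bit is a CPU. */
static unsigned int pick_fallback(unsigned int node_mask,
                                  unsigned int active_mask,
                                  unsigned int allowed_mask,
                                  unsigned int cpuset_fallback)
{
        unsigned int m;
        unsigned int cpu;

        /* 1) Look for an allowed, active CPU in the same node. */
        m = node_mask & active_mask & allowed_mask;
        /* 2) Otherwise any allowed, active CPU. */
        if (!m)
                m = active_mask & allowed_mask;
        if (m) {
                for (cpu = 0; cpu < 32; cpu++)
                        if (m & (1u << cpu))
                                return cpu;
        }
        /* 3) No more Mr. Nice Guy: take what the cpuset layer hands back. */
        return cpuset_fallback;
}

int main(void)
{
        /* Node holds CPUs 0-3, CPUs 2-5 are active, task is allowed on 4-7. */
        printf("fallback cpu = %u\n", pick_fallback(0x0f, 0x3c, 0xf0, 0));
        return 0;
}
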
2403 2402
2404 /* 2403 /*
2405 * The caller (fork, wakeup) owns p->pi_lock, ->cpus_allowed is stable. 2404 * The caller (fork, wakeup) owns p->pi_lock, ->cpus_allowed is stable.
2406 */ 2405 */
2407 static inline 2406 static inline
2408 int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags) 2407 int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags)
2409 { 2408 {
2410 int cpu = p->sched_class->select_task_rq(p, sd_flags, wake_flags); 2409 int cpu = p->sched_class->select_task_rq(p, sd_flags, wake_flags);
2411 2410
2412 /* 2411 /*
2413 * In order not to call set_task_cpu() on a blocking task we need 2412 * In order not to call set_task_cpu() on a blocking task we need
2414 * to rely on ttwu() to place the task on a valid ->cpus_allowed 2413 * to rely on ttwu() to place the task on a valid ->cpus_allowed
2415 * cpu. 2414 * cpu.
2416 * 2415 *
2417 * Since this is common to all placement strategies, this lives here. 2416 * Since this is common to all placement strategies, this lives here.
2418 * 2417 *
2419 * [ this allows ->select_task() to simply return task_cpu(p) and 2418 * [ this allows ->select_task() to simply return task_cpu(p) and
2420 * not worry about this generic constraint ] 2419 * not worry about this generic constraint ]
2421 */ 2420 */
2422 if (unlikely(!cpumask_test_cpu(cpu, &p->cpus_allowed) || 2421 if (unlikely(!cpumask_test_cpu(cpu, &p->cpus_allowed) ||
2423 !cpu_online(cpu))) 2422 !cpu_online(cpu)))
2424 cpu = select_fallback_rq(task_cpu(p), p); 2423 cpu = select_fallback_rq(task_cpu(p), p);
2425 2424
2426 return cpu; 2425 return cpu;
2427 } 2426 }
2428 2427
2429 static void update_avg(u64 *avg, u64 sample) 2428 static void update_avg(u64 *avg, u64 sample)
2430 { 2429 {
2431 s64 diff = sample - *avg; 2430 s64 diff = sample - *avg;
2432 *avg += diff >> 3; 2431 *avg += diff >> 3;
2433 } 2432 }
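
update_avg() is a fixed-point exponential moving average with weight 1/8: each sample pulls the average one eighth of the way toward itself. Further down, ttwu_do_wakeup() feeds it idle-time deltas (clamped to twice sysctl_sched_migration_cost) to maintain rq->avg_idle. A standalone run showing the convergence; only the arithmetic is copied from update_avg(), the harness around it is made up:

#include <stdio.h>
#include <stdint.h>

/* Same arithmetic as update_avg(): avg += (sample - avg) / 8. */
static void model_update_avg(uint64_t *avg, uint64_t sample)
{
        int64_t diff = sample - *avg;
        *avg += diff >> 3;
}

int main(void)
{
        uint64_t avg = 0;
        int i;

        /* Feed a constant 1,000,000 ns "idle delta"; the average chases it. */
        for (i = 1; i <= 32; i++) {
                model_update_avg(&avg, 1000000);
                if (i == 1 || i == 8 || i == 32)
                        printf("after %2d samples: avg = %llu\n",
                               i, (unsigned long long)avg);
        }
        return 0;
}
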
2434 #endif 2433 #endif
2435 2434
2436 static void 2435 static void
2437 ttwu_stat(struct task_struct *p, int cpu, int wake_flags) 2436 ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
2438 { 2437 {
2439 #ifdef CONFIG_SCHEDSTATS 2438 #ifdef CONFIG_SCHEDSTATS
2440 struct rq *rq = this_rq(); 2439 struct rq *rq = this_rq();
2441 2440
2442 #ifdef CONFIG_SMP 2441 #ifdef CONFIG_SMP
2443 int this_cpu = smp_processor_id(); 2442 int this_cpu = smp_processor_id();
2444 2443
2445 if (cpu == this_cpu) { 2444 if (cpu == this_cpu) {
2446 schedstat_inc(rq, ttwu_local); 2445 schedstat_inc(rq, ttwu_local);
2447 schedstat_inc(p, se.statistics.nr_wakeups_local); 2446 schedstat_inc(p, se.statistics.nr_wakeups_local);
2448 } else { 2447 } else {
2449 struct sched_domain *sd; 2448 struct sched_domain *sd;
2450 2449
2451 schedstat_inc(p, se.statistics.nr_wakeups_remote); 2450 schedstat_inc(p, se.statistics.nr_wakeups_remote);
2452 rcu_read_lock(); 2451 rcu_read_lock();
2453 for_each_domain(this_cpu, sd) { 2452 for_each_domain(this_cpu, sd) {
2454 if (cpumask_test_cpu(cpu, sched_domain_span(sd))) { 2453 if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
2455 schedstat_inc(sd, ttwu_wake_remote); 2454 schedstat_inc(sd, ttwu_wake_remote);
2456 break; 2455 break;
2457 } 2456 }
2458 } 2457 }
2459 rcu_read_unlock(); 2458 rcu_read_unlock();
2460 } 2459 }
2461 2460
2462 if (wake_flags & WF_MIGRATED) 2461 if (wake_flags & WF_MIGRATED)
2463 schedstat_inc(p, se.statistics.nr_wakeups_migrate); 2462 schedstat_inc(p, se.statistics.nr_wakeups_migrate);
2464 2463
2465 #endif /* CONFIG_SMP */ 2464 #endif /* CONFIG_SMP */
2466 2465
2467 schedstat_inc(rq, ttwu_count); 2466 schedstat_inc(rq, ttwu_count);
2468 schedstat_inc(p, se.statistics.nr_wakeups); 2467 schedstat_inc(p, se.statistics.nr_wakeups);
2469 2468
2470 if (wake_flags & WF_SYNC) 2469 if (wake_flags & WF_SYNC)
2471 schedstat_inc(p, se.statistics.nr_wakeups_sync); 2470 schedstat_inc(p, se.statistics.nr_wakeups_sync);
2472 2471
2473 #endif /* CONFIG_SCHEDSTATS */ 2472 #endif /* CONFIG_SCHEDSTATS */
2474 } 2473 }
2475 2474
2476 static void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags) 2475 static void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags)
2477 { 2476 {
2478 activate_task(rq, p, en_flags); 2477 activate_task(rq, p, en_flags);
2479 p->on_rq = 1; 2478 p->on_rq = 1;
2480 2479
2481 /* if a worker is waking up, notify workqueue */ 2480 /* if a worker is waking up, notify workqueue */
2482 if (p->flags & PF_WQ_WORKER) 2481 if (p->flags & PF_WQ_WORKER)
2483 wq_worker_waking_up(p, cpu_of(rq)); 2482 wq_worker_waking_up(p, cpu_of(rq));
2484 } 2483 }
2485 2484
2486 /* 2485 /*
2487 * Mark the task runnable and perform wakeup-preemption. 2486 * Mark the task runnable and perform wakeup-preemption.
2488 */ 2487 */
2489 static void 2488 static void
2490 ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags) 2489 ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
2491 { 2490 {
2492 trace_sched_wakeup(p, true); 2491 trace_sched_wakeup(p, true);
2493 check_preempt_curr(rq, p, wake_flags); 2492 check_preempt_curr(rq, p, wake_flags);
2494 2493
2495 p->state = TASK_RUNNING; 2494 p->state = TASK_RUNNING;
2496 #ifdef CONFIG_SMP 2495 #ifdef CONFIG_SMP
2497 if (p->sched_class->task_woken) 2496 if (p->sched_class->task_woken)
2498 p->sched_class->task_woken(rq, p); 2497 p->sched_class->task_woken(rq, p);
2499 2498
2500 if (unlikely(rq->idle_stamp)) { 2499 if (unlikely(rq->idle_stamp)) {
2501 u64 delta = rq->clock - rq->idle_stamp; 2500 u64 delta = rq->clock - rq->idle_stamp;
2502 u64 max = 2*sysctl_sched_migration_cost; 2501 u64 max = 2*sysctl_sched_migration_cost;
2503 2502
2504 if (delta > max) 2503 if (delta > max)
2505 rq->avg_idle = max; 2504 rq->avg_idle = max;
2506 else 2505 else
2507 update_avg(&rq->avg_idle, delta); 2506 update_avg(&rq->avg_idle, delta);
2508 rq->idle_stamp = 0; 2507 rq->idle_stamp = 0;
2509 } 2508 }
2510 #endif 2509 #endif
2511 } 2510 }
2512 2511
2513 static void 2512 static void
2514 ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags) 2513 ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags)
2515 { 2514 {
2516 #ifdef CONFIG_SMP 2515 #ifdef CONFIG_SMP
2517 if (p->sched_contributes_to_load) 2516 if (p->sched_contributes_to_load)
2518 rq->nr_uninterruptible--; 2517 rq->nr_uninterruptible--;
2519 #endif 2518 #endif
2520 2519
2521 ttwu_activate(rq, p, ENQUEUE_WAKEUP | ENQUEUE_WAKING); 2520 ttwu_activate(rq, p, ENQUEUE_WAKEUP | ENQUEUE_WAKING);
2522 ttwu_do_wakeup(rq, p, wake_flags); 2521 ttwu_do_wakeup(rq, p, wake_flags);
2523 } 2522 }
2524 2523
2525 /* 2524 /*
2526 * Called in case the task @p isn't fully descheduled from its runqueue, 2525 * Called in case the task @p isn't fully descheduled from its runqueue,
2527 * in this case we must do a remote wakeup. It's a 'light' wakeup though, 2526 * in this case we must do a remote wakeup. It's a 'light' wakeup though,
2528 * since all we need to do is flip p->state to TASK_RUNNING, as 2527 * since all we need to do is flip p->state to TASK_RUNNING, as
2529 * the task is still ->on_rq. 2528 * the task is still ->on_rq.
2530 */ 2529 */
2531 static int ttwu_remote(struct task_struct *p, int wake_flags) 2530 static int ttwu_remote(struct task_struct *p, int wake_flags)
2532 { 2531 {
2533 struct rq *rq; 2532 struct rq *rq;
2534 int ret = 0; 2533 int ret = 0;
2535 2534
2536 rq = __task_rq_lock(p); 2535 rq = __task_rq_lock(p);
2537 if (p->on_rq) { 2536 if (p->on_rq) {
2538 ttwu_do_wakeup(rq, p, wake_flags); 2537 ttwu_do_wakeup(rq, p, wake_flags);
2539 ret = 1; 2538 ret = 1;
2540 } 2539 }
2541 __task_rq_unlock(rq); 2540 __task_rq_unlock(rq);
2542 2541
2543 return ret; 2542 return ret;
2544 } 2543 }
2545 2544
2546 #ifdef CONFIG_SMP 2545 #ifdef CONFIG_SMP
2547 static void sched_ttwu_pending(void) 2546 static void sched_ttwu_pending(void)
2548 { 2547 {
2549 struct rq *rq = this_rq(); 2548 struct rq *rq = this_rq();
2550 struct task_struct *list = xchg(&rq->wake_list, NULL); 2549 struct task_struct *list = xchg(&rq->wake_list, NULL);
2551 2550
2552 if (!list) 2551 if (!list)
2553 return; 2552 return;
2554 2553
2555 raw_spin_lock(&rq->lock); 2554 raw_spin_lock(&rq->lock);
2556 2555
2557 while (list) { 2556 while (list) {
2558 struct task_struct *p = list; 2557 struct task_struct *p = list;
2559 list = list->wake_entry; 2558 list = list->wake_entry;
2560 ttwu_do_activate(rq, p, 0); 2559 ttwu_do_activate(rq, p, 0);
2561 } 2560 }
2562 2561
2563 raw_spin_unlock(&rq->lock); 2562 raw_spin_unlock(&rq->lock);
2564 } 2563 }
2565 2564
2566 void scheduler_ipi(void) 2565 void scheduler_ipi(void)
2567 { 2566 {
2568 sched_ttwu_pending(); 2567 sched_ttwu_pending();
2569 } 2568 }
2570 2569
2571 static void ttwu_queue_remote(struct task_struct *p, int cpu) 2570 static void ttwu_queue_remote(struct task_struct *p, int cpu)
2572 { 2571 {
2573 struct rq *rq = cpu_rq(cpu); 2572 struct rq *rq = cpu_rq(cpu);
2574 struct task_struct *next = rq->wake_list; 2573 struct task_struct *next = rq->wake_list;
2575 2574
2576 for (;;) { 2575 for (;;) {
2577 struct task_struct *old = next; 2576 struct task_struct *old = next;
2578 2577
2579 p->wake_entry = next; 2578 p->wake_entry = next;
2580 next = cmpxchg(&rq->wake_list, old, p); 2579 next = cmpxchg(&rq->wake_list, old, p);
2581 if (next == old) 2580 if (next == old)
2582 break; 2581 break;
2583 } 2582 }
2584 2583
2585 if (!next) 2584 if (!next)
2586 smp_send_reschedule(cpu); 2585 smp_send_reschedule(cpu);
2587 } 2586 }
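
The cmpxchg() loop above is a lock-free push onto the remote runqueue's wake_list (keep retrying until the head that was read is still the head being replaced), and only the waker that found the list empty sends the reschedule IPI; sched_ttwu_pending() later detaches the whole list with a single xchg(). A small userspace sketch of the same push-and-drain pattern using C11 atomics; the names wake_list, push() and drain() mirror the intent but are not kernel API:

#include <stdatomic.h>
#include <stddef.h>
#include <stdio.h>

struct node {
        int id;                 /* stand-in for the queued task */
        struct node *next;      /* models p->wake_entry */
};

static _Atomic(struct node *) wake_list;

/* Push like ttwu_queue_remote(); returns 1 if the list was empty (send IPI). */
static int push(struct node *n)
{
        struct node *old = atomic_load(&wake_list);

        do {
                n->next = old;
        } while (!atomic_compare_exchange_weak(&wake_list, &old, n));

        return old == NULL;
}

/* Drain like sched_ttwu_pending(): detach the whole list in one exchange. */
static struct node *drain(void)
{
        return atomic_exchange(&wake_list, (struct node *)NULL);
}

int main(void)
{
        struct node a = { 1, NULL }, b = { 2, NULL };
        struct node *p;

        printf("queue a: send IPI = %d\n", push(&a));   /* list was empty: 1 */
        printf("queue b: send IPI = %d\n", push(&b));   /* already pending: 0 */

        for (p = drain(); p; p = p->next)               /* LIFO, like the kernel list */
                printf("process queued task %d\n", p->id);
        return 0;
}

Build with -std=c11; in the kernel the push and the drain run on different CPUs, which is exactly what the atomic operations make safe.
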
2588 2587
2589 #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW 2588 #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
2590 static int ttwu_activate_remote(struct task_struct *p, int wake_flags) 2589 static int ttwu_activate_remote(struct task_struct *p, int wake_flags)
2591 { 2590 {
2592 struct rq *rq; 2591 struct rq *rq;
2593 int ret = 0; 2592 int ret = 0;
2594 2593
2595 rq = __task_rq_lock(p); 2594 rq = __task_rq_lock(p);
2596 if (p->on_cpu) { 2595 if (p->on_cpu) {
2597 ttwu_activate(rq, p, ENQUEUE_WAKEUP); 2596 ttwu_activate(rq, p, ENQUEUE_WAKEUP);
2598 ttwu_do_wakeup(rq, p, wake_flags); 2597 ttwu_do_wakeup(rq, p, wake_flags);
2599 ret = 1; 2598 ret = 1;
2600 } 2599 }
2601 __task_rq_unlock(rq); 2600 __task_rq_unlock(rq);
2602 2601
2603 return ret; 2602 return ret;
2604 2603
2605 } 2604 }
2606 #endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */ 2605 #endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */
2607 #endif /* CONFIG_SMP */ 2606 #endif /* CONFIG_SMP */
2608 2607
2609 static void ttwu_queue(struct task_struct *p, int cpu) 2608 static void ttwu_queue(struct task_struct *p, int cpu)
2610 { 2609 {
2611 struct rq *rq = cpu_rq(cpu); 2610 struct rq *rq = cpu_rq(cpu);
2612 2611
2613 #if defined(CONFIG_SMP) 2612 #if defined(CONFIG_SMP)
2614 if (sched_feat(TTWU_QUEUE) && cpu != smp_processor_id()) { 2613 if (sched_feat(TTWU_QUEUE) && cpu != smp_processor_id()) {
2615 sched_clock_cpu(cpu); /* sync clocks x-cpu */ 2614 sched_clock_cpu(cpu); /* sync clocks x-cpu */
2616 ttwu_queue_remote(p, cpu); 2615 ttwu_queue_remote(p, cpu);
2617 return; 2616 return;
2618 } 2617 }
2619 #endif 2618 #endif
2620 2619
2621 raw_spin_lock(&rq->lock); 2620 raw_spin_lock(&rq->lock);
2622 ttwu_do_activate(rq, p, 0); 2621 ttwu_do_activate(rq, p, 0);
2623 raw_spin_unlock(&rq->lock); 2622 raw_spin_unlock(&rq->lock);
2624 } 2623 }
2625 2624
2626 /** 2625 /**
2627 * try_to_wake_up - wake up a thread 2626 * try_to_wake_up - wake up a thread
2628 * @p: the thread to be awakened 2627 * @p: the thread to be awakened
2629 * @state: the mask of task states that can be woken 2628 * @state: the mask of task states that can be woken
2630 * @wake_flags: wake modifier flags (WF_*) 2629 * @wake_flags: wake modifier flags (WF_*)
2631 * 2630 *
2632 * Put it on the run-queue if it's not already there. The "current" 2631 * Put it on the run-queue if it's not already there. The "current"
2633 * thread is always on the run-queue (except when the actual 2632 * thread is always on the run-queue (except when the actual
2634 * re-schedule is in progress), and as such you're allowed to do 2633 * re-schedule is in progress), and as such you're allowed to do
2635 * the simpler "current->state = TASK_RUNNING" to mark yourself 2634 * the simpler "current->state = TASK_RUNNING" to mark yourself
2636 * runnable without the overhead of this. 2635 * runnable without the overhead of this.
2637 * 2636 *
2638 * Returns %true if @p was woken up, %false if it was already running 2637 * Returns %true if @p was woken up, %false if it was already running
2639 * or @state didn't match @p's state. 2638 * or @state didn't match @p's state.
2640 */ 2639 */
2641 static int 2640 static int
2642 try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) 2641 try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
2643 { 2642 {
2644 unsigned long flags; 2643 unsigned long flags;
2645 int cpu, success = 0; 2644 int cpu, success = 0;
2646 2645
2647 smp_wmb(); 2646 smp_wmb();
2648 raw_spin_lock_irqsave(&p->pi_lock, flags); 2647 raw_spin_lock_irqsave(&p->pi_lock, flags);
2649 if (!(p->state & state)) 2648 if (!(p->state & state))
2650 goto out; 2649 goto out;
2651 2650
2652 success = 1; /* we're going to change ->state */ 2651 success = 1; /* we're going to change ->state */
2653 cpu = task_cpu(p); 2652 cpu = task_cpu(p);
2654 2653
2655 if (p->on_rq && ttwu_remote(p, wake_flags)) 2654 if (p->on_rq && ttwu_remote(p, wake_flags))
2656 goto stat; 2655 goto stat;
2657 2656
2658 #ifdef CONFIG_SMP 2657 #ifdef CONFIG_SMP
2659 /* 2658 /*
2660 * If the owning (remote) cpu is still in the middle of schedule() with 2659 * If the owning (remote) cpu is still in the middle of schedule() with
2661 * this task as prev, wait until it's done referencing the task. 2660 * this task as prev, wait until it's done referencing the task.
2662 */ 2661 */
2663 while (p->on_cpu) { 2662 while (p->on_cpu) {
2664 #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW 2663 #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
2665 /* 2664 /*
2666 * In case the architecture enables interrupts in 2665 * In case the architecture enables interrupts in
2667 * context_switch(), we cannot busy wait, since that 2666 * context_switch(), we cannot busy wait, since that
2668 * would lead to deadlocks when an interrupt hits and 2667 * would lead to deadlocks when an interrupt hits and
2669 * tries to wake up @prev. So bail and do a complete 2668 * tries to wake up @prev. So bail and do a complete
2670 * remote wakeup. 2669 * remote wakeup.
2671 */ 2670 */
2672 if (ttwu_activate_remote(p, wake_flags)) 2671 if (ttwu_activate_remote(p, wake_flags))
2673 goto stat; 2672 goto stat;
2674 #else 2673 #else
2675 cpu_relax(); 2674 cpu_relax();
2676 #endif 2675 #endif
2677 } 2676 }
2678 /* 2677 /*
2679 * Pairs with the smp_wmb() in finish_lock_switch(). 2678 * Pairs with the smp_wmb() in finish_lock_switch().
2680 */ 2679 */
2681 smp_rmb(); 2680 smp_rmb();
2682 2681
2683 p->sched_contributes_to_load = !!task_contributes_to_load(p); 2682 p->sched_contributes_to_load = !!task_contributes_to_load(p);
2684 p->state = TASK_WAKING; 2683 p->state = TASK_WAKING;
2685 2684
2686 if (p->sched_class->task_waking) 2685 if (p->sched_class->task_waking)
2687 p->sched_class->task_waking(p); 2686 p->sched_class->task_waking(p);
2688 2687
2689 cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags); 2688 cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags);
2690 if (task_cpu(p) != cpu) { 2689 if (task_cpu(p) != cpu) {
2691 wake_flags |= WF_MIGRATED; 2690 wake_flags |= WF_MIGRATED;
2692 set_task_cpu(p, cpu); 2691 set_task_cpu(p, cpu);
2693 } 2692 }
2694 #endif /* CONFIG_SMP */ 2693 #endif /* CONFIG_SMP */
2695 2694
2696 ttwu_queue(p, cpu); 2695 ttwu_queue(p, cpu);
2697 stat: 2696 stat:
2698 ttwu_stat(p, cpu, wake_flags); 2697 ttwu_stat(p, cpu, wake_flags);
2699 out: 2698 out:
2700 raw_spin_unlock_irqrestore(&p->pi_lock, flags); 2699 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
2701 2700
2702 return success; 2701 return success;
2703 } 2702 }
2704 2703
2705 /** 2704 /**
2706 * try_to_wake_up_local - try to wake up a local task with rq lock held 2705 * try_to_wake_up_local - try to wake up a local task with rq lock held
2707 * @p: the thread to be awakened 2706 * @p: the thread to be awakened
2708 * 2707 *
2709 * Put @p on the run-queue if it's not already there. The caller must 2708 * Put @p on the run-queue if it's not already there. The caller must
2710 * ensure that this_rq() is locked, @p is bound to this_rq() and not 2709 * ensure that this_rq() is locked, @p is bound to this_rq() and not
2711 * the current task. 2710 * the current task.
2712 */ 2711 */
2713 static void try_to_wake_up_local(struct task_struct *p) 2712 static void try_to_wake_up_local(struct task_struct *p)
2714 { 2713 {
2715 struct rq *rq = task_rq(p); 2714 struct rq *rq = task_rq(p);
2716 2715
2717 BUG_ON(rq != this_rq()); 2716 BUG_ON(rq != this_rq());
2718 BUG_ON(p == current); 2717 BUG_ON(p == current);
2719 lockdep_assert_held(&rq->lock); 2718 lockdep_assert_held(&rq->lock);
2720 2719
2721 if (!raw_spin_trylock(&p->pi_lock)) { 2720 if (!raw_spin_trylock(&p->pi_lock)) {
2722 raw_spin_unlock(&rq->lock); 2721 raw_spin_unlock(&rq->lock);
2723 raw_spin_lock(&p->pi_lock); 2722 raw_spin_lock(&p->pi_lock);
2724 raw_spin_lock(&rq->lock); 2723 raw_spin_lock(&rq->lock);
2725 } 2724 }
2726 2725
2727 if (!(p->state & TASK_NORMAL)) 2726 if (!(p->state & TASK_NORMAL))
2728 goto out; 2727 goto out;
2729 2728
2730 if (!p->on_rq) 2729 if (!p->on_rq)
2731 ttwu_activate(rq, p, ENQUEUE_WAKEUP); 2730 ttwu_activate(rq, p, ENQUEUE_WAKEUP);
2732 2731
2733 ttwu_do_wakeup(rq, p, 0); 2732 ttwu_do_wakeup(rq, p, 0);
2734 ttwu_stat(p, smp_processor_id(), 0); 2733 ttwu_stat(p, smp_processor_id(), 0);
2735 out: 2734 out:
2736 raw_spin_unlock(&p->pi_lock); 2735 raw_spin_unlock(&p->pi_lock);
2737 } 2736 }
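
The trylock sequence at the top of try_to_wake_up_local() is the usual way to take a lock that is ordered before the one already held: try it opportunistically, and on contention drop the held rq->lock and retake both in the canonical p->pi_lock then rq->lock order, after which any state sampled earlier has to be re-checked (hence the p->state test that follows). A userspace sketch of the same dance with two pthread mutexes; the lock names are illustrative only:

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t pi_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t rq_lock = PTHREAD_MUTEX_INITIALIZER;

/* Called with rq_lock held; returns with both locks held. */
static void lock_pi_then_rq(void)
{
        if (pthread_mutex_trylock(&pi_lock)) {
                /* Contended: back off and retake in the canonical order. */
                pthread_mutex_unlock(&rq_lock);
                pthread_mutex_lock(&pi_lock);
                pthread_mutex_lock(&rq_lock);
                /* Anything sampled under the dropped lock must be re-checked. */
        }
}

int main(void)
{
        pthread_mutex_lock(&rq_lock);
        lock_pi_then_rq();
        printf("both locks held without violating lock order\n");
        pthread_mutex_unlock(&rq_lock);
        pthread_mutex_unlock(&pi_lock);
        return 0;
}

Compile with -pthread.
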
2738 2737
2739 /** 2738 /**
2740 * wake_up_process - Wake up a specific process 2739 * wake_up_process - Wake up a specific process
2741 * @p: The process to be woken up. 2740 * @p: The process to be woken up.
2742 * 2741 *
2743 * Attempt to wake up the nominated process and move it to the set of runnable 2742 * Attempt to wake up the nominated process and move it to the set of runnable
2744 * processes. Returns 1 if the process was woken up, 0 if it was already 2743 * processes. Returns 1 if the process was woken up, 0 if it was already
2745 * running. 2744 * running.
2746 * 2745 *
2747 * It may be assumed that this function implies a write memory barrier before 2746 * It may be assumed that this function implies a write memory barrier before
2748 * changing the task state if and only if any tasks are woken up. 2747 * changing the task state if and only if any tasks are woken up.
2749 */ 2748 */
2750 int wake_up_process(struct task_struct *p) 2749 int wake_up_process(struct task_struct *p)
2751 { 2750 {
2752 return try_to_wake_up(p, TASK_ALL, 0); 2751 return try_to_wake_up(p, TASK_ALL, 0);
2753 } 2752 }
2754 EXPORT_SYMBOL(wake_up_process); 2753 EXPORT_SYMBOL(wake_up_process);
2755 2754
2756 int wake_up_state(struct task_struct *p, unsigned int state) 2755 int wake_up_state(struct task_struct *p, unsigned int state)
2757 { 2756 {
2758 return try_to_wake_up(p, state, 0); 2757 return try_to_wake_up(p, state, 0);
2759 } 2758 }
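
try_to_wake_up() only acts when the sleeper's state intersects the caller's @state mask, which is why wake_up_process() can pass the broad TASK_ALL, wake_up_state() lets the caller narrow it, and a task already in TASK_RUNNING (state 0) never matches. A tiny model of that filter; the state values are assumed to mirror the usual kernel encoding:

#include <stdio.h>

/* Values assumed to mirror the usual kernel state bits. */
#define TASK_RUNNING            0
#define TASK_INTERRUPTIBLE      1
#define TASK_UNINTERRUPTIBLE    2
#define TASK_NORMAL             (TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE)

/* Models the "if (!(p->state & state)) goto out;" filter. */
static int would_wake(unsigned int task_state, unsigned int wake_mask)
{
        return (task_state & wake_mask) != 0;
}

int main(void)
{
        printf("interruptible sleeper, TASK_INTERRUPTIBLE mask   -> %d\n",
               would_wake(TASK_INTERRUPTIBLE, TASK_INTERRUPTIBLE));
        printf("uninterruptible sleeper, TASK_INTERRUPTIBLE mask -> %d\n",
               would_wake(TASK_UNINTERRUPTIBLE, TASK_INTERRUPTIBLE));
        printf("uninterruptible sleeper, TASK_NORMAL mask        -> %d\n",
               would_wake(TASK_UNINTERRUPTIBLE, TASK_NORMAL));
        printf("already running (state 0), any mask              -> %d\n",
               would_wake(TASK_RUNNING, TASK_NORMAL));
        return 0;
}
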
2760 2759
2761 /* 2760 /*
2762 * Perform scheduler related setup for a newly forked process p. 2761 * Perform scheduler related setup for a newly forked process p.
2763 * p is forked by current. 2762 * p is forked by current.
2764 * 2763 *
2765 * __sched_fork() is basic setup used by init_idle() too: 2764 * __sched_fork() is basic setup used by init_idle() too:
2766 */ 2765 */
2767 static void __sched_fork(struct task_struct *p) 2766 static void __sched_fork(struct task_struct *p)
2768 { 2767 {
2769 p->on_rq = 0; 2768 p->on_rq = 0;
2770 2769
2771 p->se.on_rq = 0; 2770 p->se.on_rq = 0;
2772 p->se.exec_start = 0; 2771 p->se.exec_start = 0;
2773 p->se.sum_exec_runtime = 0; 2772 p->se.sum_exec_runtime = 0;
2774 p->se.prev_sum_exec_runtime = 0; 2773 p->se.prev_sum_exec_runtime = 0;
2775 p->se.nr_migrations = 0; 2774 p->se.nr_migrations = 0;
2776 p->se.vruntime = 0; 2775 p->se.vruntime = 0;
2777 INIT_LIST_HEAD(&p->se.group_node); 2776 INIT_LIST_HEAD(&p->se.group_node);
2778 2777
2779 #ifdef CONFIG_SCHEDSTATS 2778 #ifdef CONFIG_SCHEDSTATS
2780 memset(&p->se.statistics, 0, sizeof(p->se.statistics)); 2779 memset(&p->se.statistics, 0, sizeof(p->se.statistics));
2781 #endif 2780 #endif
2782 2781
2783 INIT_LIST_HEAD(&p->rt.run_list); 2782 INIT_LIST_HEAD(&p->rt.run_list);
2784 2783
2785 #ifdef CONFIG_PREEMPT_NOTIFIERS 2784 #ifdef CONFIG_PREEMPT_NOTIFIERS
2786 INIT_HLIST_HEAD(&p->preempt_notifiers); 2785 INIT_HLIST_HEAD(&p->preempt_notifiers);
2787 #endif 2786 #endif
2788 } 2787 }
2789 2788
2790 /* 2789 /*
2791 * fork()/clone()-time setup: 2790 * fork()/clone()-time setup:
2792 */ 2791 */
2793 void sched_fork(struct task_struct *p) 2792 void sched_fork(struct task_struct *p)
2794 { 2793 {
2795 unsigned long flags; 2794 unsigned long flags;
2796 int cpu = get_cpu(); 2795 int cpu = get_cpu();
2797 2796
2798 __sched_fork(p); 2797 __sched_fork(p);
2799 /* 2798 /*
2800 * We mark the process as running here. This guarantees that 2799 * We mark the process as running here. This guarantees that
2801 * nobody will actually run it, and a signal or other external 2800 * nobody will actually run it, and a signal or other external
2802 * event cannot wake it up and insert it on the runqueue either. 2801 * event cannot wake it up and insert it on the runqueue either.
2803 */ 2802 */
2804 p->state = TASK_RUNNING; 2803 p->state = TASK_RUNNING;
2805 2804
2806 /* 2805 /*
2807 * Revert to default priority/policy on fork if requested. 2806 * Revert to default priority/policy on fork if requested.
2808 */ 2807 */
2809 if (unlikely(p->sched_reset_on_fork)) { 2808 if (unlikely(p->sched_reset_on_fork)) {
2810 if (p->policy == SCHED_FIFO || p->policy == SCHED_RR) { 2809 if (p->policy == SCHED_FIFO || p->policy == SCHED_RR) {
2811 p->policy = SCHED_NORMAL; 2810 p->policy = SCHED_NORMAL;
2812 p->normal_prio = p->static_prio; 2811 p->normal_prio = p->static_prio;
2813 } 2812 }
2814 2813
2815 if (PRIO_TO_NICE(p->static_prio) < 0) { 2814 if (PRIO_TO_NICE(p->static_prio) < 0) {
2816 p->static_prio = NICE_TO_PRIO(0); 2815 p->static_prio = NICE_TO_PRIO(0);
2817 p->normal_prio = p->static_prio; 2816 p->normal_prio = p->static_prio;
2818 set_load_weight(p); 2817 set_load_weight(p);
2819 } 2818 }
2820 2819
2821 /* 2820 /*
2822 * We don't need the reset flag anymore after the fork. It has 2821 * We don't need the reset flag anymore after the fork. It has
2823 * fulfilled its duty: 2822 * fulfilled its duty:
2824 */ 2823 */
2825 p->sched_reset_on_fork = 0; 2824 p->sched_reset_on_fork = 0;
2826 } 2825 }
2827 2826
2828 /* 2827 /*
2829 * Make sure we do not leak PI boosting priority to the child. 2828 * Make sure we do not leak PI boosting priority to the child.
2830 */ 2829 */
2831 p->prio = current->normal_prio; 2830 p->prio = current->normal_prio;
2832 2831
2833 if (!rt_prio(p->prio)) 2832 if (!rt_prio(p->prio))
2834 p->sched_class = &fair_sched_class; 2833 p->sched_class = &fair_sched_class;
2835 2834
2836 if (p->sched_class->task_fork) 2835 if (p->sched_class->task_fork)
2837 p->sched_class->task_fork(p); 2836 p->sched_class->task_fork(p);
2838 2837
2839 /* 2838 /*
2840 * The child is not yet in the pid-hash so no cgroup attach races, 2839 * The child is not yet in the pid-hash so no cgroup attach races,
2841 * and the cgroup is pinned to this child because cgroup_fork() 2840 * and the cgroup is pinned to this child because cgroup_fork()
2842 * is run before sched_fork(). 2841 * is run before sched_fork().
2843 * 2842 *
2844 * Silence PROVE_RCU. 2843 * Silence PROVE_RCU.
2845 */ 2844 */
2846 raw_spin_lock_irqsave(&p->pi_lock, flags); 2845 raw_spin_lock_irqsave(&p->pi_lock, flags);
2847 set_task_cpu(p, cpu); 2846 set_task_cpu(p, cpu);
2848 raw_spin_unlock_irqrestore(&p->pi_lock, flags); 2847 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
2849 2848
2850 #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) 2849 #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
2851 if (likely(sched_info_on())) 2850 if (likely(sched_info_on()))
2852 memset(&p->sched_info, 0, sizeof(p->sched_info)); 2851 memset(&p->sched_info, 0, sizeof(p->sched_info));
2853 #endif 2852 #endif
2854 #if defined(CONFIG_SMP) 2853 #if defined(CONFIG_SMP)
2855 p->on_cpu = 0; 2854 p->on_cpu = 0;
2856 #endif 2855 #endif
2857 #ifdef CONFIG_PREEMPT 2856 #ifdef CONFIG_PREEMPT
2858 /* Want to start with kernel preemption disabled. */ 2857 /* Want to start with kernel preemption disabled. */
2859 task_thread_info(p)->preempt_count = 1; 2858 task_thread_info(p)->preempt_count = 1;
2860 #endif 2859 #endif
2861 #ifdef CONFIG_SMP 2860 #ifdef CONFIG_SMP
2862 plist_node_init(&p->pushable_tasks, MAX_PRIO); 2861 plist_node_init(&p->pushable_tasks, MAX_PRIO);
2863 #endif 2862 #endif
2864 2863
2865 put_cpu(); 2864 put_cpu();
2866 } 2865 }
2867 2866
2868 /* 2867 /*
2869 * wake_up_new_task - wake up a newly created task for the first time. 2868 * wake_up_new_task - wake up a newly created task for the first time.
2870 * 2869 *
2871 * This function will do some initial scheduler statistics housekeeping 2870 * This function will do some initial scheduler statistics housekeeping
2872 * that must be done for every newly created context, then puts the task 2871 * that must be done for every newly created context, then puts the task
2873 * on the runqueue and wakes it. 2872 * on the runqueue and wakes it.
2874 */ 2873 */
2875 void wake_up_new_task(struct task_struct *p) 2874 void wake_up_new_task(struct task_struct *p)
2876 { 2875 {
2877 unsigned long flags; 2876 unsigned long flags;
2878 struct rq *rq; 2877 struct rq *rq;
2879 2878
2880 raw_spin_lock_irqsave(&p->pi_lock, flags); 2879 raw_spin_lock_irqsave(&p->pi_lock, flags);
2881 #ifdef CONFIG_SMP 2880 #ifdef CONFIG_SMP
2882 /* 2881 /*
2883 * Fork balancing, do it here and not earlier because: 2882 * Fork balancing, do it here and not earlier because:
2884 * - cpus_allowed can change in the fork path 2883 * - cpus_allowed can change in the fork path
2885 * - any previously selected cpu might disappear through hotplug 2884 * - any previously selected cpu might disappear through hotplug
2886 */ 2885 */
2887 set_task_cpu(p, select_task_rq(p, SD_BALANCE_FORK, 0)); 2886 set_task_cpu(p, select_task_rq(p, SD_BALANCE_FORK, 0));
2888 #endif 2887 #endif
2889 2888
2890 rq = __task_rq_lock(p); 2889 rq = __task_rq_lock(p);
2891 activate_task(rq, p, 0); 2890 activate_task(rq, p, 0);
2892 p->on_rq = 1; 2891 p->on_rq = 1;
2893 trace_sched_wakeup_new(p, true); 2892 trace_sched_wakeup_new(p, true);
2894 check_preempt_curr(rq, p, WF_FORK); 2893 check_preempt_curr(rq, p, WF_FORK);
2895 #ifdef CONFIG_SMP 2894 #ifdef CONFIG_SMP
2896 if (p->sched_class->task_woken) 2895 if (p->sched_class->task_woken)
2897 p->sched_class->task_woken(rq, p); 2896 p->sched_class->task_woken(rq, p);
2898 #endif 2897 #endif
2899 task_rq_unlock(rq, p, &flags); 2898 task_rq_unlock(rq, p, &flags);
2900 } 2899 }
2901 2900
2902 #ifdef CONFIG_PREEMPT_NOTIFIERS 2901 #ifdef CONFIG_PREEMPT_NOTIFIERS
2903 2902
2904 /** 2903 /**
2905 * preempt_notifier_register - tell me when current is being preempted & rescheduled 2904 * preempt_notifier_register - tell me when current is being preempted & rescheduled
2906 * @notifier: notifier struct to register 2905 * @notifier: notifier struct to register
2907 */ 2906 */
2908 void preempt_notifier_register(struct preempt_notifier *notifier) 2907 void preempt_notifier_register(struct preempt_notifier *notifier)
2909 { 2908 {
2910 hlist_add_head(&notifier->link, &current->preempt_notifiers); 2909 hlist_add_head(&notifier->link, &current->preempt_notifiers);
2911 } 2910 }
2912 EXPORT_SYMBOL_GPL(preempt_notifier_register); 2911 EXPORT_SYMBOL_GPL(preempt_notifier_register);
2913 2912
2914 /** 2913 /**
2915 * preempt_notifier_unregister - no longer interested in preemption notifications 2914 * preempt_notifier_unregister - no longer interested in preemption notifications
2916 * @notifier: notifier struct to unregister 2915 * @notifier: notifier struct to unregister
2917 * 2916 *
2918 * This is safe to call from within a preemption notifier. 2917 * This is safe to call from within a preemption notifier.
2919 */ 2918 */
2920 void preempt_notifier_unregister(struct preempt_notifier *notifier) 2919 void preempt_notifier_unregister(struct preempt_notifier *notifier)
2921 { 2920 {
2922 hlist_del(&notifier->link); 2921 hlist_del(&notifier->link);
2923 } 2922 }
2924 EXPORT_SYMBOL_GPL(preempt_notifier_unregister); 2923 EXPORT_SYMBOL_GPL(preempt_notifier_unregister);
2925 2924
2926 static void fire_sched_in_preempt_notifiers(struct task_struct *curr) 2925 static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
2927 { 2926 {
2928 struct preempt_notifier *notifier; 2927 struct preempt_notifier *notifier;
2929 struct hlist_node *node; 2928 struct hlist_node *node;
2930 2929
2931 hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link) 2930 hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link)
2932 notifier->ops->sched_in(notifier, raw_smp_processor_id()); 2931 notifier->ops->sched_in(notifier, raw_smp_processor_id());
2933 } 2932 }
2934 2933
2935 static void 2934 static void
2936 fire_sched_out_preempt_notifiers(struct task_struct *curr, 2935 fire_sched_out_preempt_notifiers(struct task_struct *curr,
2937 struct task_struct *next) 2936 struct task_struct *next)
2938 { 2937 {
2939 struct preempt_notifier *notifier; 2938 struct preempt_notifier *notifier;
2940 struct hlist_node *node; 2939 struct hlist_node *node;
2941 2940
2942 hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link) 2941 hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link)
2943 notifier->ops->sched_out(notifier, next); 2942 notifier->ops->sched_out(notifier, next);
2944 } 2943 }
2945 2944
2946 #else /* !CONFIG_PREEMPT_NOTIFIERS */ 2945 #else /* !CONFIG_PREEMPT_NOTIFIERS */
2947 2946
2948 static void fire_sched_in_preempt_notifiers(struct task_struct *curr) 2947 static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
2949 { 2948 {
2950 } 2949 }
2951 2950
2952 static void 2951 static void
2953 fire_sched_out_preempt_notifiers(struct task_struct *curr, 2952 fire_sched_out_preempt_notifiers(struct task_struct *curr,
2954 struct task_struct *next) 2953 struct task_struct *next)
2955 { 2954 {
2956 } 2955 }
2957 2956
2958 #endif /* CONFIG_PREEMPT_NOTIFIERS */ 2957 #endif /* CONFIG_PREEMPT_NOTIFIERS */
2959 2958
2960 /** 2959 /**
2961 * prepare_task_switch - prepare to switch tasks 2960 * prepare_task_switch - prepare to switch tasks
2962 * @rq: the runqueue preparing to switch 2961 * @rq: the runqueue preparing to switch
2963 * @prev: the current task that is being switched out 2962 * @prev: the current task that is being switched out
2964 * @next: the task we are going to switch to. 2963 * @next: the task we are going to switch to.
2965 * 2964 *
2966 * This is called with the rq lock held and interrupts off. It must 2965 * This is called with the rq lock held and interrupts off. It must
2967 * be paired with a subsequent finish_task_switch after the context 2966 * be paired with a subsequent finish_task_switch after the context
2968 * switch. 2967 * switch.
2969 * 2968 *
2970 * prepare_task_switch sets up locking and calls architecture specific 2969 * prepare_task_switch sets up locking and calls architecture specific
2971 * hooks. 2970 * hooks.
2972 */ 2971 */
2973 static inline void 2972 static inline void
2974 prepare_task_switch(struct rq *rq, struct task_struct *prev, 2973 prepare_task_switch(struct rq *rq, struct task_struct *prev,
2975 struct task_struct *next) 2974 struct task_struct *next)
2976 { 2975 {
2977 sched_info_switch(prev, next); 2976 sched_info_switch(prev, next);
2978 perf_event_task_sched_out(prev, next); 2977 perf_event_task_sched_out(prev, next);
2979 fire_sched_out_preempt_notifiers(prev, next); 2978 fire_sched_out_preempt_notifiers(prev, next);
2980 prepare_lock_switch(rq, next); 2979 prepare_lock_switch(rq, next);
2981 prepare_arch_switch(next); 2980 prepare_arch_switch(next);
2982 trace_sched_switch(prev, next); 2981 trace_sched_switch(prev, next);
2983 } 2982 }
2984 2983
2985 /** 2984 /**
2986 * finish_task_switch - clean up after a task-switch 2985 * finish_task_switch - clean up after a task-switch
2987 * @rq: runqueue associated with task-switch 2986 * @rq: runqueue associated with task-switch
2988 * @prev: the thread we just switched away from. 2987 * @prev: the thread we just switched away from.
2989 * 2988 *
2990 * finish_task_switch must be called after the context switch, paired 2989 * finish_task_switch must be called after the context switch, paired
2991 * with a prepare_task_switch call before the context switch. 2990 * with a prepare_task_switch call before the context switch.
2992 * finish_task_switch will reconcile locking set up by prepare_task_switch, 2991 * finish_task_switch will reconcile locking set up by prepare_task_switch,
2993 * and do any other architecture-specific cleanup actions. 2992 * and do any other architecture-specific cleanup actions.
2994 * 2993 *
2995 * Note that we may have delayed dropping an mm in context_switch(). If 2994 * Note that we may have delayed dropping an mm in context_switch(). If
2996 * so, we finish that here outside of the runqueue lock. (Doing it 2995 * so, we finish that here outside of the runqueue lock. (Doing it
2997 * with the lock held can cause deadlocks; see schedule() for 2996 * with the lock held can cause deadlocks; see schedule() for
2998 * details.) 2997 * details.)
2999 */ 2998 */
3000 static void finish_task_switch(struct rq *rq, struct task_struct *prev) 2999 static void finish_task_switch(struct rq *rq, struct task_struct *prev)
3001 __releases(rq->lock) 3000 __releases(rq->lock)
3002 { 3001 {
3003 struct mm_struct *mm = rq->prev_mm; 3002 struct mm_struct *mm = rq->prev_mm;
3004 long prev_state; 3003 long prev_state;
3005 3004
3006 rq->prev_mm = NULL; 3005 rq->prev_mm = NULL;
3007 3006
3008 /* 3007 /*
3009 * A task struct has one reference for the use as "current". 3008 * A task struct has one reference for the use as "current".
3010 * If a task dies, then it sets TASK_DEAD in tsk->state and calls 3009 * If a task dies, then it sets TASK_DEAD in tsk->state and calls
3011 * schedule one last time. The schedule call will never return, and 3010 * schedule one last time. The schedule call will never return, and
3012 * the scheduled task must drop that reference. 3011 * the scheduled task must drop that reference.
3013 * The test for TASK_DEAD must occur while the runqueue locks are 3012 * The test for TASK_DEAD must occur while the runqueue locks are
3014 * still held, otherwise prev could be scheduled on another cpu, die 3013 * still held, otherwise prev could be scheduled on another cpu, die
3015 * there before we look at prev->state, and then the reference would 3014 * there before we look at prev->state, and then the reference would
3016 * be dropped twice. 3015 * be dropped twice.
3017 * Manfred Spraul <manfred@colorfullife.com> 3016 * Manfred Spraul <manfred@colorfullife.com>
3018 */ 3017 */
3019 prev_state = prev->state; 3018 prev_state = prev->state;
3020 finish_arch_switch(prev); 3019 finish_arch_switch(prev);
3021 #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW 3020 #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
3022 local_irq_disable(); 3021 local_irq_disable();
3023 #endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */ 3022 #endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */
3024 perf_event_task_sched_in(current); 3023 perf_event_task_sched_in(current);
3025 #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW 3024 #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
3026 local_irq_enable(); 3025 local_irq_enable();
3027 #endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */ 3026 #endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */
3028 finish_lock_switch(rq, prev); 3027 finish_lock_switch(rq, prev);
3029 3028
3030 fire_sched_in_preempt_notifiers(current); 3029 fire_sched_in_preempt_notifiers(current);
3031 if (mm) 3030 if (mm)
3032 mmdrop(mm); 3031 mmdrop(mm);
3033 if (unlikely(prev_state == TASK_DEAD)) { 3032 if (unlikely(prev_state == TASK_DEAD)) {
3034 /* 3033 /*
3035 * Remove function-return probe instances associated with this 3034 * Remove function-return probe instances associated with this
3036 * task and put them back on the free list. 3035 * task and put them back on the free list.
3037 */ 3036 */
3038 kprobe_flush_task(prev); 3037 kprobe_flush_task(prev);
3039 put_task_struct(prev); 3038 put_task_struct(prev);
3040 } 3039 }
3041 } 3040 }
3042 3041
3043 #ifdef CONFIG_SMP 3042 #ifdef CONFIG_SMP
3044 3043
3045 /* assumes rq->lock is held */ 3044 /* assumes rq->lock is held */
3046 static inline void pre_schedule(struct rq *rq, struct task_struct *prev) 3045 static inline void pre_schedule(struct rq *rq, struct task_struct *prev)
3047 { 3046 {
3048 if (prev->sched_class->pre_schedule) 3047 if (prev->sched_class->pre_schedule)
3049 prev->sched_class->pre_schedule(rq, prev); 3048 prev->sched_class->pre_schedule(rq, prev);
3050 } 3049 }
3051 3050
3052 /* rq->lock is NOT held, but preemption is disabled */ 3051 /* rq->lock is NOT held, but preemption is disabled */
3053 static inline void post_schedule(struct rq *rq) 3052 static inline void post_schedule(struct rq *rq)
3054 { 3053 {
3055 if (rq->post_schedule) { 3054 if (rq->post_schedule) {
3056 unsigned long flags; 3055 unsigned long flags;
3057 3056
3058 raw_spin_lock_irqsave(&rq->lock, flags); 3057 raw_spin_lock_irqsave(&rq->lock, flags);
3059 if (rq->curr->sched_class->post_schedule) 3058 if (rq->curr->sched_class->post_schedule)
3060 rq->curr->sched_class->post_schedule(rq); 3059 rq->curr->sched_class->post_schedule(rq);
3061 raw_spin_unlock_irqrestore(&rq->lock, flags); 3060 raw_spin_unlock_irqrestore(&rq->lock, flags);
3062 3061
3063 rq->post_schedule = 0; 3062 rq->post_schedule = 0;
3064 } 3063 }
3065 } 3064 }
3066 3065
3067 #else 3066 #else
3068 3067
3069 static inline void pre_schedule(struct rq *rq, struct task_struct *p) 3068 static inline void pre_schedule(struct rq *rq, struct task_struct *p)
3070 { 3069 {
3071 } 3070 }
3072 3071
3073 static inline void post_schedule(struct rq *rq) 3072 static inline void post_schedule(struct rq *rq)
3074 { 3073 {
3075 } 3074 }
3076 3075
3077 #endif 3076 #endif
3078 3077
3079 /** 3078 /**
3080 * schedule_tail - first thing a freshly forked thread must call. 3079 * schedule_tail - first thing a freshly forked thread must call.
3081 * @prev: the thread we just switched away from. 3080 * @prev: the thread we just switched away from.
3082 */ 3081 */
3083 asmlinkage void schedule_tail(struct task_struct *prev) 3082 asmlinkage void schedule_tail(struct task_struct *prev)
3084 __releases(rq->lock) 3083 __releases(rq->lock)
3085 { 3084 {
3086 struct rq *rq = this_rq(); 3085 struct rq *rq = this_rq();
3087 3086
3088 finish_task_switch(rq, prev); 3087 finish_task_switch(rq, prev);
3089 3088
3090 /* 3089 /*
3091 * FIXME: do we need to worry about rq being invalidated by the 3090 * FIXME: do we need to worry about rq being invalidated by the
3092 * task_switch? 3091 * task_switch?
3093 */ 3092 */
3094 post_schedule(rq); 3093 post_schedule(rq);
3095 3094
3096 #ifdef __ARCH_WANT_UNLOCKED_CTXSW 3095 #ifdef __ARCH_WANT_UNLOCKED_CTXSW
3097 /* In this case, finish_task_switch does not reenable preemption */ 3096 /* In this case, finish_task_switch does not reenable preemption */
3098 preempt_enable(); 3097 preempt_enable();
3099 #endif 3098 #endif
3100 if (current->set_child_tid) 3099 if (current->set_child_tid)
3101 put_user(task_pid_vnr(current), current->set_child_tid); 3100 put_user(task_pid_vnr(current), current->set_child_tid);
3102 } 3101 }
3103 3102
3104 /* 3103 /*
3105 * context_switch - switch to the new MM and the new 3104 * context_switch - switch to the new MM and the new
3106 * thread's register state. 3105 * thread's register state.
3107 */ 3106 */
3108 static inline void 3107 static inline void
3109 context_switch(struct rq *rq, struct task_struct *prev, 3108 context_switch(struct rq *rq, struct task_struct *prev,
3110 struct task_struct *next) 3109 struct task_struct *next)
3111 { 3110 {
3112 struct mm_struct *mm, *oldmm; 3111 struct mm_struct *mm, *oldmm;
3113 3112
3114 prepare_task_switch(rq, prev, next); 3113 prepare_task_switch(rq, prev, next);
3115 3114
3116 mm = next->mm; 3115 mm = next->mm;
3117 oldmm = prev->active_mm; 3116 oldmm = prev->active_mm;
3118 /* 3117 /*
3119 * For paravirt, this is coupled with an exit in switch_to to 3118 * For paravirt, this is coupled with an exit in switch_to to
3120 * combine the page table reload and the switch backend into 3119 * combine the page table reload and the switch backend into
3121 * one hypercall. 3120 * one hypercall.
3122 */ 3121 */
3123 arch_start_context_switch(prev); 3122 arch_start_context_switch(prev);
3124 3123
3125 if (!mm) { 3124 if (!mm) {
3126 next->active_mm = oldmm; 3125 next->active_mm = oldmm;
3127 atomic_inc(&oldmm->mm_count); 3126 atomic_inc(&oldmm->mm_count);
3128 enter_lazy_tlb(oldmm, next); 3127 enter_lazy_tlb(oldmm, next);
3129 } else 3128 } else
3130 switch_mm(oldmm, mm, next); 3129 switch_mm(oldmm, mm, next);
3131 3130
3132 if (!prev->mm) { 3131 if (!prev->mm) {
3133 prev->active_mm = NULL; 3132 prev->active_mm = NULL;
3134 rq->prev_mm = oldmm; 3133 rq->prev_mm = oldmm;
3135 } 3134 }
3136 /* 3135 /*
3137 * Since the runqueue lock will be released by the next 3136 * Since the runqueue lock will be released by the next
3138 * task (which is an invalid locking op but in the case 3137 * task (which is an invalid locking op but in the case
3139 * of the scheduler it's an obvious special-case), we 3138 * of the scheduler it's an obvious special-case), we
3140 * do an early lockdep release here: 3139 * do an early lockdep release here:
3141 */ 3140 */
3142 #ifndef __ARCH_WANT_UNLOCKED_CTXSW 3141 #ifndef __ARCH_WANT_UNLOCKED_CTXSW
3143 spin_release(&rq->lock.dep_map, 1, _THIS_IP_); 3142 spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
3144 #endif 3143 #endif
3145 3144
3146 /* Here we just switch the register state and the stack. */ 3145 /* Here we just switch the register state and the stack. */
3147 switch_to(prev, next, prev); 3146 switch_to(prev, next, prev);
3148 3147
3149 barrier(); 3148 barrier();
3150 /* 3149 /*
3151 * this_rq must be evaluated again because prev may have moved 3150 * this_rq must be evaluated again because prev may have moved
3152 * CPUs since it called schedule(), thus the 'rq' on its stack 3151 * CPUs since it called schedule(), thus the 'rq' on its stack
3153 * frame will be invalid. 3152 * frame will be invalid.
3154 */ 3153 */
3155 finish_task_switch(this_rq(), prev); 3154 finish_task_switch(this_rq(), prev);
3156 } 3155 }
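To make the lazy-mm handling above concrete: a kernel thread has no mm of its own (next->mm == NULL), so it borrows the outgoing task's address space; next->active_mm is pointed at oldmm and mm_count is bumped instead of calling switch_mm(). When we later switch away from that kernel thread (prev->mm == NULL), the borrowed mm is parked in rq->prev_mm so that finish_task_switch() can drop the extra reference with mmdrop() once the switch has completed.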
3157 3156
3158 /* 3157 /*
3159 * nr_running, nr_uninterruptible and nr_context_switches: 3158 * nr_running, nr_uninterruptible and nr_context_switches:
3160 * 3159 *
3161 * externally visible scheduler statistics: current number of runnable 3160 * externally visible scheduler statistics: current number of runnable
3162 * threads, current number of uninterruptible-sleeping threads, total 3161 * threads, current number of uninterruptible-sleeping threads, total
3163 * number of context switches performed since bootup. 3162 * number of context switches performed since bootup.
3164 */ 3163 */
3165 unsigned long nr_running(void) 3164 unsigned long nr_running(void)
3166 { 3165 {
3167 unsigned long i, sum = 0; 3166 unsigned long i, sum = 0;
3168 3167
3169 for_each_online_cpu(i) 3168 for_each_online_cpu(i)
3170 sum += cpu_rq(i)->nr_running; 3169 sum += cpu_rq(i)->nr_running;
3171 3170
3172 return sum; 3171 return sum;
3173 } 3172 }
3174 3173
3175 unsigned long nr_uninterruptible(void) 3174 unsigned long nr_uninterruptible(void)
3176 { 3175 {
3177 unsigned long i, sum = 0; 3176 unsigned long i, sum = 0;
3178 3177
3179 for_each_possible_cpu(i) 3178 for_each_possible_cpu(i)
3180 sum += cpu_rq(i)->nr_uninterruptible; 3179 sum += cpu_rq(i)->nr_uninterruptible;
3181 3180
3182 /* 3181 /*
3183 * Since we read the counters lockless, it might be slightly 3182 * Since we read the counters lockless, it might be slightly
3184 * inaccurate. Do not allow it to go below zero though: 3183 * inaccurate. Do not allow it to go below zero though:
3185 */ 3184 */
3186 if (unlikely((long)sum < 0)) 3185 if (unlikely((long)sum < 0))
3187 sum = 0; 3186 sum = 0;
3188 3187
3189 return sum; 3188 return sum;
3190 } 3189 }
3191 3190
3192 unsigned long long nr_context_switches(void) 3191 unsigned long long nr_context_switches(void)
3193 { 3192 {
3194 int i; 3193 int i;
3195 unsigned long long sum = 0; 3194 unsigned long long sum = 0;
3196 3195
3197 for_each_possible_cpu(i) 3196 for_each_possible_cpu(i)
3198 sum += cpu_rq(i)->nr_switches; 3197 sum += cpu_rq(i)->nr_switches;
3199 3198
3200 return sum; 3199 return sum;
3201 } 3200 }
3202 3201
3203 unsigned long nr_iowait(void) 3202 unsigned long nr_iowait(void)
3204 { 3203 {
3205 unsigned long i, sum = 0; 3204 unsigned long i, sum = 0;
3206 3205
3207 for_each_possible_cpu(i) 3206 for_each_possible_cpu(i)
3208 sum += atomic_read(&cpu_rq(i)->nr_iowait); 3207 sum += atomic_read(&cpu_rq(i)->nr_iowait);
3209 3208
3210 return sum; 3209 return sum;
3211 } 3210 }
3212 3211
3213 unsigned long nr_iowait_cpu(int cpu) 3212 unsigned long nr_iowait_cpu(int cpu)
3214 { 3213 {
3215 struct rq *this = cpu_rq(cpu); 3214 struct rq *this = cpu_rq(cpu);
3216 return atomic_read(&this->nr_iowait); 3215 return atomic_read(&this->nr_iowait);
3217 } 3216 }
3218 3217
3219 unsigned long this_cpu_load(void) 3218 unsigned long this_cpu_load(void)
3220 { 3219 {
3221 struct rq *this = this_rq(); 3220 struct rq *this = this_rq();
3222 return this->cpu_load[0]; 3221 return this->cpu_load[0];
3223 } 3222 }
3224 3223
3225 3224
3226 /* Variables and functions for calc_load */ 3225 /* Variables and functions for calc_load */
3227 static atomic_long_t calc_load_tasks; 3226 static atomic_long_t calc_load_tasks;
3228 static unsigned long calc_load_update; 3227 static unsigned long calc_load_update;
3229 unsigned long avenrun[3]; 3228 unsigned long avenrun[3];
3230 EXPORT_SYMBOL(avenrun); 3229 EXPORT_SYMBOL(avenrun);
3231 3230
3232 static long calc_load_fold_active(struct rq *this_rq) 3231 static long calc_load_fold_active(struct rq *this_rq)
3233 { 3232 {
3234 long nr_active, delta = 0; 3233 long nr_active, delta = 0;
3235 3234
3236 nr_active = this_rq->nr_running; 3235 nr_active = this_rq->nr_running;
3237 nr_active += (long) this_rq->nr_uninterruptible; 3236 nr_active += (long) this_rq->nr_uninterruptible;
3238 3237
3239 if (nr_active != this_rq->calc_load_active) { 3238 if (nr_active != this_rq->calc_load_active) {
3240 delta = nr_active - this_rq->calc_load_active; 3239 delta = nr_active - this_rq->calc_load_active;
3241 this_rq->calc_load_active = nr_active; 3240 this_rq->calc_load_active = nr_active;
3242 } 3241 }
3243 3242
3244 return delta; 3243 return delta;
3245 } 3244 }
3246 3245
3247 static unsigned long 3246 static unsigned long
3248 calc_load(unsigned long load, unsigned long exp, unsigned long active) 3247 calc_load(unsigned long load, unsigned long exp, unsigned long active)
3249 { 3248 {
3250 load *= exp; 3249 load *= exp;
3251 load += active * (FIXED_1 - exp); 3250 load += active * (FIXED_1 - exp);
3252 load += 1UL << (FSHIFT - 1); 3251 load += 1UL << (FSHIFT - 1);
3253 return load >> FSHIFT; 3252 return load >> FSHIFT;
3254 } 3253 }
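As a quick sanity check on the fixed-point math (assuming FSHIFT == 11, FIXED_1 == 2048 and EXP_1 == 1884, the values this era's include/linux/sched.h defines), one LOAD_FREQ sample with a single runnable task moves an initially zero 1-minute average like this:

	/* avenrun[0] = 0, active = 1 * FIXED_1 = 2048 */
	load  = 0 * 1884;              /* 0 */
	load += 2048 * (2048 - 1884);  /* 335872 */
	load += 1UL << 10;             /* rounding: 336896 */
	load >>= 11;                   /* 164, i.e. 164/2048 ~ 0.08 */

so the reported 1-minute load climbs to roughly 0.08 after the first five-second interval.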
3255 3254
3256 #ifdef CONFIG_NO_HZ 3255 #ifdef CONFIG_NO_HZ
3257 /* 3256 /*
3258 * For NO_HZ we delay the active fold to the next LOAD_FREQ update. 3257 * For NO_HZ we delay the active fold to the next LOAD_FREQ update.
3259 * 3258 *
3260 * When making the ILB scale, we should try to pull this in as well. 3259 * When making the ILB scale, we should try to pull this in as well.
3261 */ 3260 */
3262 static atomic_long_t calc_load_tasks_idle; 3261 static atomic_long_t calc_load_tasks_idle;
3263 3262
3264 static void calc_load_account_idle(struct rq *this_rq) 3263 static void calc_load_account_idle(struct rq *this_rq)
3265 { 3264 {
3266 long delta; 3265 long delta;
3267 3266
3268 delta = calc_load_fold_active(this_rq); 3267 delta = calc_load_fold_active(this_rq);
3269 if (delta) 3268 if (delta)
3270 atomic_long_add(delta, &calc_load_tasks_idle); 3269 atomic_long_add(delta, &calc_load_tasks_idle);
3271 } 3270 }
3272 3271
3273 static long calc_load_fold_idle(void) 3272 static long calc_load_fold_idle(void)
3274 { 3273 {
3275 long delta = 0; 3274 long delta = 0;
3276 3275
3277 /* 3276 /*
3278 * It's got a race, but we don't care... 3277 * It's got a race, but we don't care...
3279 */ 3278 */
3280 if (atomic_long_read(&calc_load_tasks_idle)) 3279 if (atomic_long_read(&calc_load_tasks_idle))
3281 delta = atomic_long_xchg(&calc_load_tasks_idle, 0); 3280 delta = atomic_long_xchg(&calc_load_tasks_idle, 0);
3282 3281
3283 return delta; 3282 return delta;
3284 } 3283 }
3285 3284
3286 /** 3285 /**
3287 * fixed_power_int - compute: x^n, in O(log n) time 3286 * fixed_power_int - compute: x^n, in O(log n) time
3288 * 3287 *
3289 * @x: base of the power 3288 * @x: base of the power
3290 * @frac_bits: fractional bits of @x 3289 * @frac_bits: fractional bits of @x
3291 * @n: power to raise @x to. 3290 * @n: power to raise @x to.
3292 * 3291 *
3293 * By exploiting the relation between the definition of the natural power 3292 * By exploiting the relation between the definition of the natural power
3294 * function: x^n := x*x*...*x (x multiplied by itself n times), and 3293 * function: x^n := x*x*...*x (x multiplied by itself n times), and
3295 * the binary encoding of numbers used by computers: n := \Sum n_i * 2^i, 3294 * the binary encoding of numbers used by computers: n := \Sum n_i * 2^i,
3296 * (where: n_i \elem {0, 1}, the binary vector representing n), 3295 * (where: n_i \elem {0, 1}, the binary vector representing n),
3297 * we find: x^n := x^(\Sum n_i * 2^i) := \Prod x^(n_i * 2^i), which is 3296 * we find: x^n := x^(\Sum n_i * 2^i) := \Prod x^(n_i * 2^i), which is
3298 * of course trivially computable in O(log_2 n), the length of our binary 3297 * of course trivially computable in O(log_2 n), the length of our binary
3299 * vector. 3298 * vector.
3300 */ 3299 */
3301 static unsigned long 3300 static unsigned long
3302 fixed_power_int(unsigned long x, unsigned int frac_bits, unsigned int n) 3301 fixed_power_int(unsigned long x, unsigned int frac_bits, unsigned int n)
3303 { 3302 {
3304 unsigned long result = 1UL << frac_bits; 3303 unsigned long result = 1UL << frac_bits;
3305 3304
3306 if (n) for (;;) { 3305 if (n) for (;;) {
3307 if (n & 1) { 3306 if (n & 1) {
3308 result *= x; 3307 result *= x;
3309 result += 1UL << (frac_bits - 1); 3308 result += 1UL << (frac_bits - 1);
3310 result >>= frac_bits; 3309 result >>= frac_bits;
3311 } 3310 }
3312 n >>= 1; 3311 n >>= 1;
3313 if (!n) 3312 if (!n)
3314 break; 3313 break;
3315 x *= x; 3314 x *= x;
3316 x += 1UL << (frac_bits - 1); 3315 x += 1UL << (frac_bits - 1);
3317 x >>= frac_bits; 3316 x >>= frac_bits;
3318 } 3317 }
3319 3318
3320 return result; 3319 return result;
3321 } 3320 }
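To see the square-and-multiply loop at work, take x = 1024 (0.5 with 11 fractional bits, as used elsewhere in this file) raised to n = 3 (binary 11):

	/* result starts at 1UL << 11 = 2048 (1.0)             */
	/* bit 0 set: result = (2048*1024 + 1024) >> 11 = 1024 */
	/* square:    x      = (1024*1024 + 1024) >> 11 =  512 */
	/* bit 1 set: result = (1024*512  + 1024) >> 11 =  256 */

and 256/2048 == 0.125 == 0.5^3, reached with O(log n) multiplies rather than n - 1 of them.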
3322 3321
3323 /* 3322 /*
3324 * a1 = a0 * e + a * (1 - e) 3323 * a1 = a0 * e + a * (1 - e)
3325 * 3324 *
3326 * a2 = a1 * e + a * (1 - e) 3325 * a2 = a1 * e + a * (1 - e)
3327 * = (a0 * e + a * (1 - e)) * e + a * (1 - e) 3326 * = (a0 * e + a * (1 - e)) * e + a * (1 - e)
3328 * = a0 * e^2 + a * (1 - e) * (1 + e) 3327 * = a0 * e^2 + a * (1 - e) * (1 + e)
3329 * 3328 *
3330 * a3 = a2 * e + a * (1 - e) 3329 * a3 = a2 * e + a * (1 - e)
3331 * = (a0 * e^2 + a * (1 - e) * (1 + e)) * e + a * (1 - e) 3330 * = (a0 * e^2 + a * (1 - e) * (1 + e)) * e + a * (1 - e)
3332 * = a0 * e^3 + a * (1 - e) * (1 + e + e^2) 3331 * = a0 * e^3 + a * (1 - e) * (1 + e + e^2)
3333 * 3332 *
3334 * ... 3333 * ...
3335 * 3334 *
3336 * an = a0 * e^n + a * (1 - e) * (1 + e + ... + e^n-1) [1] 3335 * an = a0 * e^n + a * (1 - e) * (1 + e + ... + e^n-1) [1]
3337 * = a0 * e^n + a * (1 - e) * (1 - e^n)/(1 - e) 3336 * = a0 * e^n + a * (1 - e) * (1 - e^n)/(1 - e)
3338 * = a0 * e^n + a * (1 - e^n) 3337 * = a0 * e^n + a * (1 - e^n)
3339 * 3338 *
3340 * [1] application of the geometric series: 3339 * [1] application of the geometric series:
3341 * 3340 *
3342 * n 1 - x^(n+1) 3341 * n 1 - x^(n+1)
3343 * S_n := \Sum x^i = ------------- 3342 * S_n := \Sum x^i = -------------
3344 * i=0 1 - x 3343 * i=0 1 - x
3345 */ 3344 */
3346 static unsigned long 3345 static unsigned long
3347 calc_load_n(unsigned long load, unsigned long exp, 3346 calc_load_n(unsigned long load, unsigned long exp,
3348 unsigned long active, unsigned int n) 3347 unsigned long active, unsigned int n)
3349 { 3348 {
3350 3349
3351 return calc_load(load, fixed_power_int(exp, FSHIFT, n), active); 3350 return calc_load(load, fixed_power_int(exp, FSHIFT, n), active);
3352 } 3351 }
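Continuing the single-task example from calc_load() above: folding two missed periods in one go matches iterating calc_load() twice (up to rounding), because only the decay factor is raised to the power n:

	/* iterate twice, active held at 2048:                      */
	/*   a1 = (0*1884   + 2048*164 + 1024) >> 11 = 164          */
	/*   a2 = (164*1884 + 2048*164 + 1024) >> 11 = 315          */
	/* fold at once: fixed_power_int(1884, 11, 2) = 1733, so    */
	/*   calc_load_n(0, EXP_1, 2048, 2)                         */
	/*       = (0*1733 + 2048*(2048 - 1733) + 1024) >> 11 = 315 */

which is exactly the a_n = a_0 * e^n + a * (1 - e^n) identity derived in the comment.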
3353 3352
3354 /* 3353 /*
3355 * NO_HZ can leave us missing all per-cpu ticks calling 3354 * NO_HZ can leave us missing all per-cpu ticks calling
3356 * calc_load_account_active(), but since an idle CPU folds its delta into 3355 * calc_load_account_active(), but since an idle CPU folds its delta into
3357 * calc_load_tasks_idle per calc_load_account_idle(), all we need to do is fold 3356 * calc_load_tasks_idle per calc_load_account_idle(), all we need to do is fold
3358 * in the pending idle delta if our idle period crossed a load cycle boundary. 3357 * in the pending idle delta if our idle period crossed a load cycle boundary.
3359 * 3358 *
3360 * Once we've updated the global active value, we need to apply the exponential 3359 * Once we've updated the global active value, we need to apply the exponential
3361 * weights adjusted to the number of cycles missed. 3360 * weights adjusted to the number of cycles missed.
3362 */ 3361 */
3363 static void calc_global_nohz(unsigned long ticks) 3362 static void calc_global_nohz(unsigned long ticks)
3364 { 3363 {
3365 long delta, active, n; 3364 long delta, active, n;
3366 3365
3367 if (time_before(jiffies, calc_load_update)) 3366 if (time_before(jiffies, calc_load_update))
3368 return; 3367 return;
3369 3368
3370 /* 3369 /*
3371 * If we crossed a calc_load_update boundary, make sure to fold 3370 * If we crossed a calc_load_update boundary, make sure to fold
3372 * any pending idle changes; the respective CPUs might have 3371 * any pending idle changes; the respective CPUs might have
3373 * missed the tick-driven calc_load_account_active() update 3372 * missed the tick-driven calc_load_account_active() update
3374 * due to NO_HZ. 3373 * due to NO_HZ.
3375 */ 3374 */
3376 delta = calc_load_fold_idle(); 3375 delta = calc_load_fold_idle();
3377 if (delta) 3376 if (delta)
3378 atomic_long_add(delta, &calc_load_tasks); 3377 atomic_long_add(delta, &calc_load_tasks);
3379 3378
3380 /* 3379 /*
3381 * If we were idle for multiple load cycles, apply them. 3380 * If we were idle for multiple load cycles, apply them.
3382 */ 3381 */
3383 if (ticks >= LOAD_FREQ) { 3382 if (ticks >= LOAD_FREQ) {
3384 n = ticks / LOAD_FREQ; 3383 n = ticks / LOAD_FREQ;
3385 3384
3386 active = atomic_long_read(&calc_load_tasks); 3385 active = atomic_long_read(&calc_load_tasks);
3387 active = active > 0 ? active * FIXED_1 : 0; 3386 active = active > 0 ? active * FIXED_1 : 0;
3388 3387
3389 avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n); 3388 avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n);
3390 avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n); 3389 avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n);
3391 avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n); 3390 avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n);
3392 3391
3393 calc_load_update += n * LOAD_FREQ; 3392 calc_load_update += n * LOAD_FREQ;
3394 } 3393 }
3395 3394
3396 /* 3395 /*
3397 * It's possible the remainder of the above division also crosses 3396 * It's possible the remainder of the above division also crosses
3398 * a LOAD_FREQ period; the regular check in calc_global_load(), 3397 * a LOAD_FREQ period; the regular check in calc_global_load(),
3399 * which comes after this, will take care of that. 3398 * which comes after this, will take care of that.
3400 * 3399 *
3401 * Consider us being 11 ticks before a cycle completion, and us 3400 * Consider us being 11 ticks before a cycle completion, and us
3402 * sleeping for 4*LOAD_FREQ + 22 ticks, then the above code will 3401 * sleeping for 4*LOAD_FREQ + 22 ticks, then the above code will
3403 * age us 4 cycles, and the test in calc_global_load() will 3402 * age us 4 cycles, and the test in calc_global_load() will
3404 * pick up the final one. 3403 * pick up the final one.
3405 */ 3404 */
3406 } 3405 }
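Putting numbers on that last example, as the comment frames it (and assuming LOAD_FREQ is still defined as 5*HZ+1, call it L): waking 4*L + 22 ticks later when the next boundary was only 11 ticks away means five boundaries were actually crossed, but

	/* n = (4*L + 22) / L = 4          (integer division)       */
	/* avenrun[] aged by 4 cycles, calc_load_update += 4*L      */
	/* jiffies now sits 11 ticks past calc_load_update, beyond  */
	/* the 10-tick grace in calc_global_load(), so the fifth    */
	/* cycle is folded immediately by the regular path.         */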
3407 #else 3406 #else
3408 static void calc_load_account_idle(struct rq *this_rq) 3407 static void calc_load_account_idle(struct rq *this_rq)
3409 { 3408 {
3410 } 3409 }
3411 3410
3412 static inline long calc_load_fold_idle(void) 3411 static inline long calc_load_fold_idle(void)
3413 { 3412 {
3414 return 0; 3413 return 0;
3415 } 3414 }
3416 3415
3417 static void calc_global_nohz(unsigned long ticks) 3416 static void calc_global_nohz(unsigned long ticks)
3418 { 3417 {
3419 } 3418 }
3420 #endif 3419 #endif
3421 3420
3422 /** 3421 /**
3423 * get_avenrun - get the load average array 3422 * get_avenrun - get the load average array
3424 * @loads: pointer to dest load array 3423 * @loads: pointer to dest load array
3425 * @offset: offset to add 3424 * @offset: offset to add
3426 * @shift: shift count to shift the result left 3425 * @shift: shift count to shift the result left
3427 * 3426 *
3428 * These values are estimates at best, so no need for locking. 3427 * These values are estimates at best, so no need for locking.
3429 */ 3428 */
3430 void get_avenrun(unsigned long *loads, unsigned long offset, int shift) 3429 void get_avenrun(unsigned long *loads, unsigned long offset, int shift)
3431 { 3430 {
3432 loads[0] = (avenrun[0] + offset) << shift; 3431 loads[0] = (avenrun[0] + offset) << shift;
3433 loads[1] = (avenrun[1] + offset) << shift; 3432 loads[1] = (avenrun[1] + offset) << shift;
3434 loads[2] = (avenrun[2] + offset) << shift; 3433 loads[2] = (avenrun[2] + offset) << shift;
3435 } 3434 }
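The usual consumer of this helper, fs/proc/loadavg.c, passes (assuming this era's helpers) offset = FIXED_1/200 and shift = 0, then formats each entry with the LOAD_INT()/LOAD_FRAC() macros from include/linux/sched.h:

	/* LOAD_INT(x)  == (x) >> FSHIFT                             */
	/* LOAD_FRAC(x) == LOAD_INT(((x) & (FIXED_1 - 1)) * 100)     */
	/* e.g. avenrun[0] == 164:  164 + 2048/200 == 174            */
	/*      174 >> 11 == 0,  (174 * 100) >> 11 == 8              */
	/*      so /proc/loadavg prints "0.08"                       */

the FIXED_1/200 offset being a half-of-one-hundredth nudge so that truncation behaves like rounding.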
3436 3435
3437 /* 3436 /*
3438 * calc_load - update the avenrun load estimates 10 ticks after the 3437 * calc_load - update the avenrun load estimates 10 ticks after the
3439 * CPUs have updated calc_load_tasks. 3438 * CPUs have updated calc_load_tasks.
3440 */ 3439 */
3441 void calc_global_load(unsigned long ticks) 3440 void calc_global_load(unsigned long ticks)
3442 { 3441 {
3443 long active; 3442 long active;
3444 3443
3445 calc_global_nohz(ticks); 3444 calc_global_nohz(ticks);
3446 3445
3447 if (time_before(jiffies, calc_load_update + 10)) 3446 if (time_before(jiffies, calc_load_update + 10))
3448 return; 3447 return;
3449 3448
3450 active = atomic_long_read(&calc_load_tasks); 3449 active = atomic_long_read(&calc_load_tasks);
3451 active = active > 0 ? active * FIXED_1 : 0; 3450 active = active > 0 ? active * FIXED_1 : 0;
3452 3451
3453 avenrun[0] = calc_load(avenrun[0], EXP_1, active); 3452 avenrun[0] = calc_load(avenrun[0], EXP_1, active);
3454 avenrun[1] = calc_load(avenrun[1], EXP_5, active); 3453 avenrun[1] = calc_load(avenrun[1], EXP_5, active);
3455 avenrun[2] = calc_load(avenrun[2], EXP_15, active); 3454 avenrun[2] = calc_load(avenrun[2], EXP_15, active);
3456 3455
3457 calc_load_update += LOAD_FREQ; 3456 calc_load_update += LOAD_FREQ;
3458 } 3457 }
3459 3458
3460 /* 3459 /*
3461 * Called from update_cpu_load() to periodically update this CPU's 3460 * Called from update_cpu_load() to periodically update this CPU's
3462 * active count. 3461 * active count.
3463 */ 3462 */
3464 static void calc_load_account_active(struct rq *this_rq) 3463 static void calc_load_account_active(struct rq *this_rq)
3465 { 3464 {
3466 long delta; 3465 long delta;
3467 3466
3468 if (time_before(jiffies, this_rq->calc_load_update)) 3467 if (time_before(jiffies, this_rq->calc_load_update))
3469 return; 3468 return;
3470 3469
3471 delta = calc_load_fold_active(this_rq); 3470 delta = calc_load_fold_active(this_rq);
3472 delta += calc_load_fold_idle(); 3471 delta += calc_load_fold_idle();
3473 if (delta) 3472 if (delta)
3474 atomic_long_add(delta, &calc_load_tasks); 3473 atomic_long_add(delta, &calc_load_tasks);
3475 3474
3476 this_rq->calc_load_update += LOAD_FREQ; 3475 this_rq->calc_load_update += LOAD_FREQ;
3477 } 3476 }
3478 3477
3479 /* 3478 /*
3480 * The exact cpuload at various idx values, calculated at every tick would be 3479 * The exact cpuload at various idx values, calculated at every tick would be
3481 * load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load 3480 * load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load
3482 * 3481 *
3483 * If a cpu misses updates for n-1 ticks (as it was idle) and the update gets called 3482 * If a cpu misses updates for n-1 ticks (as it was idle) and the update gets called
3484 * on the nth tick, when the cpu may be busy, then we have: 3483 * on the nth tick, when the cpu may be busy, then we have:
3485 * load = ((2^idx - 1) / 2^idx)^(n-1) * load 3484 * load = ((2^idx - 1) / 2^idx)^(n-1) * load
3486 * load = ((2^idx - 1) / 2^idx) * load + 1 / 2^idx * cur_load 3485 * load = ((2^idx - 1) / 2^idx) * load + 1 / 2^idx * cur_load
3487 * 3486 *
3488 * decay_load_missed() below does efficient calculation of 3487 * decay_load_missed() below does efficient calculation of
3489 * load = ((2^idx - 1) / 2^idx)^(n-1) * load 3488 * load = ((2^idx - 1) / 2^idx)^(n-1) * load
3490 * avoiding 0..n-1 loop doing load = ((2^idx - 1) / 2^idx) * load 3489 * avoiding 0..n-1 loop doing load = ((2^idx - 1) / 2^idx) * load
3491 * 3490 *
3492 * The calculation is approximated on a 128 point scale. 3491 * The calculation is approximated on a 128 point scale.
3493 * degrade_zero_ticks is the number of ticks after which load at any 3492 * degrade_zero_ticks is the number of ticks after which load at any
3494 * particular idx is approximated to be zero. 3493 * particular idx is approximated to be zero.
3495 * degrade_factor is a precomputed table, a row for each load idx. 3494 * degrade_factor is a precomputed table, a row for each load idx.
3496 * Each column corresponds to degradation factor for a power of two ticks, 3495 * Each column corresponds to degradation factor for a power of two ticks,
3497 * based on 128 point scale. 3496 * based on 128 point scale.
3498 * Example: 3497 * Example:
3499 * row 2, col 3 (=12) says that the degradation at load idx 2 after 3498 * row 2, col 3 (=12) says that the degradation at load idx 2 after
3500 * 8 ticks is 12/128 (which is an approximation of the exact factor 3^8/4^8). 3499 * 8 ticks is 12/128 (which is an approximation of the exact factor 3^8/4^8).
3501 * 3500 *
3502 * With this power of 2 load factors, we can degrade the load n times 3501 * With this power of 2 load factors, we can degrade the load n times
3503 * by looking at 1 bits in n and doing as many mult/shift instead of 3502 * by looking at 1 bits in n and doing as many mult/shift instead of
3504 * n mult/shifts needed by the exact degradation. 3503 * n mult/shifts needed by the exact degradation.
3505 */ 3504 */
3506 #define DEGRADE_SHIFT 7 3505 #define DEGRADE_SHIFT 7
3507 static const unsigned char 3506 static const unsigned char
3508 degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128}; 3507 degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128};
3509 static const unsigned char 3508 static const unsigned char
3510 degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = { 3509 degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = {
3511 {0, 0, 0, 0, 0, 0, 0, 0}, 3510 {0, 0, 0, 0, 0, 0, 0, 0},
3512 {64, 32, 8, 0, 0, 0, 0, 0}, 3511 {64, 32, 8, 0, 0, 0, 0, 0},
3513 {96, 72, 40, 12, 1, 0, 0}, 3512 {96, 72, 40, 12, 1, 0, 0},
3514 {112, 98, 75, 43, 15, 1, 0}, 3513 {112, 98, 75, 43, 15, 1, 0},
3515 {120, 112, 98, 76, 45, 16, 2} }; 3514 {120, 112, 98, 76, 45, 16, 2} };
3516 3515
3517 /* 3516 /*
3518 * Update cpu_load for any missed ticks, due to tickless idle. The backlog 3517 * Update cpu_load for any missed ticks, due to tickless idle. The backlog
3519 * occurs when the CPU is idle, so we just decay the old load without 3518 * occurs when the CPU is idle, so we just decay the old load without
3520 * adding any new load. 3519 * adding any new load.
3521 */ 3520 */
3522 static unsigned long 3521 static unsigned long
3523 decay_load_missed(unsigned long load, unsigned long missed_updates, int idx) 3522 decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
3524 { 3523 {
3525 int j = 0; 3524 int j = 0;
3526 3525
3527 if (!missed_updates) 3526 if (!missed_updates)
3528 return load; 3527 return load;
3529 3528
3530 if (missed_updates >= degrade_zero_ticks[idx]) 3529 if (missed_updates >= degrade_zero_ticks[idx])
3531 return 0; 3530 return 0;
3532 3531
3533 if (idx == 1) 3532 if (idx == 1)
3534 return load >> missed_updates; 3533 return load >> missed_updates;
3535 3534
3536 while (missed_updates) { 3535 while (missed_updates) {
3537 if (missed_updates % 2) 3536 if (missed_updates % 2)
3538 load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT; 3537 load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT;
3539 3538
3540 missed_updates >>= 1; 3539 missed_updates >>= 1;
3541 j++; 3540 j++;
3542 } 3541 }
3543 return load; 3542 return load;
3544 } 3543 }
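Tying this back to the example in the comment above the table: a CPU at load index 2 that was idle for 8 ticks enters the loop with missed_updates == 8 (binary 1000), so only the 2^3 column fires:

	/* 8 < degrade_zero_ticks[2] == 32, and idx != 1              */
	/* j = 0, 1, 2: low bits clear, nothing happens               */
	/* j = 3: load = (load * degrade_factor[2][3]) >> 7           */
	/*             = load * 12/128, the advertised (3/4)^8 approx */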
3545 3544
3546 /* 3545 /*
3547 * Update rq->cpu_load[] statistics. This function is usually called every 3546 * Update rq->cpu_load[] statistics. This function is usually called every
3548 * scheduler tick (TICK_NSEC). With tickless idle this will not be called 3547 * scheduler tick (TICK_NSEC). With tickless idle this will not be called
3549 * every tick. We fix it up based on jiffies. 3548 * every tick. We fix it up based on jiffies.
3550 */ 3549 */
3551 static void update_cpu_load(struct rq *this_rq) 3550 static void update_cpu_load(struct rq *this_rq)
3552 { 3551 {
3553 unsigned long this_load = this_rq->load.weight; 3552 unsigned long this_load = this_rq->load.weight;
3554 unsigned long curr_jiffies = jiffies; 3553 unsigned long curr_jiffies = jiffies;
3555 unsigned long pending_updates; 3554 unsigned long pending_updates;
3556 int i, scale; 3555 int i, scale;
3557 3556
3558 this_rq->nr_load_updates++; 3557 this_rq->nr_load_updates++;
3559 3558
3560 /* Avoid repeated calls on same jiffy, when moving in and out of idle */ 3559 /* Avoid repeated calls on same jiffy, when moving in and out of idle */
3561 if (curr_jiffies == this_rq->last_load_update_tick) 3560 if (curr_jiffies == this_rq->last_load_update_tick)
3562 return; 3561 return;
3563 3562
3564 pending_updates = curr_jiffies - this_rq->last_load_update_tick; 3563 pending_updates = curr_jiffies - this_rq->last_load_update_tick;
3565 this_rq->last_load_update_tick = curr_jiffies; 3564 this_rq->last_load_update_tick = curr_jiffies;
3566 3565
3567 /* Update our load: */ 3566 /* Update our load: */
3568 this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */ 3567 this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */
3569 for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) { 3568 for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
3570 unsigned long old_load, new_load; 3569 unsigned long old_load, new_load;
3571 3570
3572 /* scale is effectively 1 << i now, and >> i divides by scale */ 3571 /* scale is effectively 1 << i now, and >> i divides by scale */
3573 3572
3574 old_load = this_rq->cpu_load[i]; 3573 old_load = this_rq->cpu_load[i];
3575 old_load = decay_load_missed(old_load, pending_updates - 1, i); 3574 old_load = decay_load_missed(old_load, pending_updates - 1, i);
3576 new_load = this_load; 3575 new_load = this_load;
3577 /* 3576 /*
3578 * Round up the averaging division if load is increasing. This 3577 * Round up the averaging division if load is increasing. This
3579 * prevents us from getting stuck on 9 if the load is 10, for 3578 * prevents us from getting stuck on 9 if the load is 10, for
3580 * example. 3579 * example.
3581 */ 3580 */
3582 if (new_load > old_load) 3581 if (new_load > old_load)
3583 new_load += scale - 1; 3582 new_load += scale - 1;
3584 3583
3585 this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i; 3584 this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i;
3586 } 3585 }
3587 3586
3588 sched_avg_update(this_rq); 3587 sched_avg_update(this_rq);
3589 } 3588 }
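The round-up in the loop matters precisely in the "stuck on 9" situation the comment mentions; with i == 1 (so scale == 2):

	/* old_load = 9, this_load = 10 (increasing)                 */
	/* without the bump: (9*1 + 10) >> 1 = 9, forever            */
	/* with new_load += scale - 1:  (9*1 + 11) >> 1 = 10         */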
3590 3589
3591 static void update_cpu_load_active(struct rq *this_rq) 3590 static void update_cpu_load_active(struct rq *this_rq)
3592 { 3591 {
3593 update_cpu_load(this_rq); 3592 update_cpu_load(this_rq);
3594 3593
3595 calc_load_account_active(this_rq); 3594 calc_load_account_active(this_rq);
3596 } 3595 }
3597 3596
3598 #ifdef CONFIG_SMP 3597 #ifdef CONFIG_SMP
3599 3598
3600 /* 3599 /*
3601 * sched_exec - execve() is a valuable balancing opportunity, because at 3600 * sched_exec - execve() is a valuable balancing opportunity, because at
3602 * this point the task has the smallest effective memory and cache footprint. 3601 * this point the task has the smallest effective memory and cache footprint.
3603 */ 3602 */
3604 void sched_exec(void) 3603 void sched_exec(void)
3605 { 3604 {
3606 struct task_struct *p = current; 3605 struct task_struct *p = current;
3607 unsigned long flags; 3606 unsigned long flags;
3608 int dest_cpu; 3607 int dest_cpu;
3609 3608
3610 raw_spin_lock_irqsave(&p->pi_lock, flags); 3609 raw_spin_lock_irqsave(&p->pi_lock, flags);
3611 dest_cpu = p->sched_class->select_task_rq(p, SD_BALANCE_EXEC, 0); 3610 dest_cpu = p->sched_class->select_task_rq(p, SD_BALANCE_EXEC, 0);
3612 if (dest_cpu == smp_processor_id()) 3611 if (dest_cpu == smp_processor_id())
3613 goto unlock; 3612 goto unlock;
3614 3613
3615 if (likely(cpu_active(dest_cpu))) { 3614 if (likely(cpu_active(dest_cpu))) {
3616 struct migration_arg arg = { p, dest_cpu }; 3615 struct migration_arg arg = { p, dest_cpu };
3617 3616
3618 raw_spin_unlock_irqrestore(&p->pi_lock, flags); 3617 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
3619 stop_one_cpu(task_cpu(p), migration_cpu_stop, &arg); 3618 stop_one_cpu(task_cpu(p), migration_cpu_stop, &arg);
3620 return; 3619 return;
3621 } 3620 }
3622 unlock: 3621 unlock:
3623 raw_spin_unlock_irqrestore(&p->pi_lock, flags); 3622 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
3624 } 3623 }
3625 3624
3626 #endif 3625 #endif
3627 3626
3628 DEFINE_PER_CPU(struct kernel_stat, kstat); 3627 DEFINE_PER_CPU(struct kernel_stat, kstat);
3629 3628
3630 EXPORT_PER_CPU_SYMBOL(kstat); 3629 EXPORT_PER_CPU_SYMBOL(kstat);
3631 3630
3632 /* 3631 /*
3633 * Return any ns on the sched_clock that have not yet been accounted in 3632 * Return any ns on the sched_clock that have not yet been accounted in
3634 * @p in case that task is currently running. 3633 * @p in case that task is currently running.
3635 * 3634 *
3636 * Called with task_rq_lock() held on @rq. 3635 * Called with task_rq_lock() held on @rq.
3637 */ 3636 */
3638 static u64 do_task_delta_exec(struct task_struct *p, struct rq *rq) 3637 static u64 do_task_delta_exec(struct task_struct *p, struct rq *rq)
3639 { 3638 {
3640 u64 ns = 0; 3639 u64 ns = 0;
3641 3640
3642 if (task_current(rq, p)) { 3641 if (task_current(rq, p)) {
3643 update_rq_clock(rq); 3642 update_rq_clock(rq);
3644 ns = rq->clock_task - p->se.exec_start; 3643 ns = rq->clock_task - p->se.exec_start;
3645 if ((s64)ns < 0) 3644 if ((s64)ns < 0)
3646 ns = 0; 3645 ns = 0;
3647 } 3646 }
3648 3647
3649 return ns; 3648 return ns;
3650 } 3649 }
3651 3650
3652 unsigned long long task_delta_exec(struct task_struct *p) 3651 unsigned long long task_delta_exec(struct task_struct *p)
3653 { 3652 {
3654 unsigned long flags; 3653 unsigned long flags;
3655 struct rq *rq; 3654 struct rq *rq;
3656 u64 ns = 0; 3655 u64 ns = 0;
3657 3656
3658 rq = task_rq_lock(p, &flags); 3657 rq = task_rq_lock(p, &flags);
3659 ns = do_task_delta_exec(p, rq); 3658 ns = do_task_delta_exec(p, rq);
3660 task_rq_unlock(rq, p, &flags); 3659 task_rq_unlock(rq, p, &flags);
3661 3660
3662 return ns; 3661 return ns;
3663 } 3662 }
3664 3663
3665 /* 3664 /*
3666 * Return accounted runtime for the task. 3665 * Return accounted runtime for the task.
3667 * In case the task is currently running, return the runtime plus current's 3666 * In case the task is currently running, return the runtime plus current's
3667 * pending runtime that has not been accounted yet. 3666 * pending runtime that has not been accounted yet.
3669 */ 3668 */
3670 unsigned long long task_sched_runtime(struct task_struct *p) 3669 unsigned long long task_sched_runtime(struct task_struct *p)
3671 { 3670 {
3672 unsigned long flags; 3671 unsigned long flags;
3673 struct rq *rq; 3672 struct rq *rq;
3674 u64 ns = 0; 3673 u64 ns = 0;
3675 3674
3676 rq = task_rq_lock(p, &flags); 3675 rq = task_rq_lock(p, &flags);
3677 ns = p->se.sum_exec_runtime + do_task_delta_exec(p, rq); 3676 ns = p->se.sum_exec_runtime + do_task_delta_exec(p, rq);
3678 task_rq_unlock(rq, p, &flags); 3677 task_rq_unlock(rq, p, &flags);
3679 3678
3680 return ns; 3679 return ns;
3681 } 3680 }
3682 3681
3683 /* 3682 /*
3684 * Return sum_exec_runtime for the thread group. 3683 * Return sum_exec_runtime for the thread group.
3685 * In case the task is currently running, return the sum plus current's 3684 * In case the task is currently running, return the sum plus current's
3685 * pending runtime that has not been accounted yet. 3684 * pending runtime that has not been accounted yet.
3687 * 3686 *
3688 * Note that the thread group might have other running tasks as well, 3687 * Note that the thread group might have other running tasks as well,
3689 * so the return value does not include other pending runtime that other 3688 * so the return value does not include other pending runtime that other
3690 * running tasks might have. 3689 * running tasks might have.
3691 */ 3690 */
3692 unsigned long long thread_group_sched_runtime(struct task_struct *p) 3691 unsigned long long thread_group_sched_runtime(struct task_struct *p)
3693 { 3692 {
3694 struct task_cputime totals; 3693 struct task_cputime totals;
3695 unsigned long flags; 3694 unsigned long flags;
3696 struct rq *rq; 3695 struct rq *rq;
3697 u64 ns; 3696 u64 ns;
3698 3697
3699 rq = task_rq_lock(p, &flags); 3698 rq = task_rq_lock(p, &flags);
3700 thread_group_cputime(p, &totals); 3699 thread_group_cputime(p, &totals);
3701 ns = totals.sum_exec_runtime + do_task_delta_exec(p, rq); 3700 ns = totals.sum_exec_runtime + do_task_delta_exec(p, rq);
3702 task_rq_unlock(rq, p, &flags); 3701 task_rq_unlock(rq, p, &flags);
3703 3702
3704 return ns; 3703 return ns;
3705 } 3704 }
3706 3705
3707 /* 3706 /*
3708 * Account user cpu time to a process. 3707 * Account user cpu time to a process.
3709 * @p: the process that the cpu time gets accounted to 3708 * @p: the process that the cpu time gets accounted to
3710 * @cputime: the cpu time spent in user space since the last update 3709 * @cputime: the cpu time spent in user space since the last update
3711 * @cputime_scaled: cputime scaled by cpu frequency 3710 * @cputime_scaled: cputime scaled by cpu frequency
3712 */ 3711 */
3713 void account_user_time(struct task_struct *p, cputime_t cputime, 3712 void account_user_time(struct task_struct *p, cputime_t cputime,
3714 cputime_t cputime_scaled) 3713 cputime_t cputime_scaled)
3715 { 3714 {
3716 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; 3715 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
3717 cputime64_t tmp; 3716 cputime64_t tmp;
3718 3717
3719 /* Add user time to process. */ 3718 /* Add user time to process. */
3720 p->utime = cputime_add(p->utime, cputime); 3719 p->utime = cputime_add(p->utime, cputime);
3721 p->utimescaled = cputime_add(p->utimescaled, cputime_scaled); 3720 p->utimescaled = cputime_add(p->utimescaled, cputime_scaled);
3722 account_group_user_time(p, cputime); 3721 account_group_user_time(p, cputime);
3723 3722
3724 /* Add user time to cpustat. */ 3723 /* Add user time to cpustat. */
3725 tmp = cputime_to_cputime64(cputime); 3724 tmp = cputime_to_cputime64(cputime);
3726 if (TASK_NICE(p) > 0) 3725 if (TASK_NICE(p) > 0)
3727 cpustat->nice = cputime64_add(cpustat->nice, tmp); 3726 cpustat->nice = cputime64_add(cpustat->nice, tmp);
3728 else 3727 else
3729 cpustat->user = cputime64_add(cpustat->user, tmp); 3728 cpustat->user = cputime64_add(cpustat->user, tmp);
3730 3729
3731 cpuacct_update_stats(p, CPUACCT_STAT_USER, cputime); 3730 cpuacct_update_stats(p, CPUACCT_STAT_USER, cputime);
3732 /* Account for user time used */ 3731 /* Account for user time used */
3733 acct_update_integrals(p); 3732 acct_update_integrals(p);
3734 } 3733 }
3735 3734
3736 /* 3735 /*
3737 * Account guest cpu time to a process. 3736 * Account guest cpu time to a process.
3738 * @p: the process that the cpu time gets accounted to 3737 * @p: the process that the cpu time gets accounted to
3739 * @cputime: the cpu time spent in virtual machine since the last update 3738 * @cputime: the cpu time spent in virtual machine since the last update
3740 * @cputime_scaled: cputime scaled by cpu frequency 3739 * @cputime_scaled: cputime scaled by cpu frequency
3741 */ 3740 */
3742 static void account_guest_time(struct task_struct *p, cputime_t cputime, 3741 static void account_guest_time(struct task_struct *p, cputime_t cputime,
3743 cputime_t cputime_scaled) 3742 cputime_t cputime_scaled)
3744 { 3743 {
3745 cputime64_t tmp; 3744 cputime64_t tmp;
3746 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; 3745 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
3747 3746
3748 tmp = cputime_to_cputime64(cputime); 3747 tmp = cputime_to_cputime64(cputime);
3749 3748
3750 /* Add guest time to process. */ 3749 /* Add guest time to process. */
3751 p->utime = cputime_add(p->utime, cputime); 3750 p->utime = cputime_add(p->utime, cputime);
3752 p->utimescaled = cputime_add(p->utimescaled, cputime_scaled); 3751 p->utimescaled = cputime_add(p->utimescaled, cputime_scaled);
3753 account_group_user_time(p, cputime); 3752 account_group_user_time(p, cputime);
3754 p->gtime = cputime_add(p->gtime, cputime); 3753 p->gtime = cputime_add(p->gtime, cputime);
3755 3754
3756 /* Add guest time to cpustat. */ 3755 /* Add guest time to cpustat. */
3757 if (TASK_NICE(p) > 0) { 3756 if (TASK_NICE(p) > 0) {
3758 cpustat->nice = cputime64_add(cpustat->nice, tmp); 3757 cpustat->nice = cputime64_add(cpustat->nice, tmp);
3759 cpustat->guest_nice = cputime64_add(cpustat->guest_nice, tmp); 3758 cpustat->guest_nice = cputime64_add(cpustat->guest_nice, tmp);
3760 } else { 3759 } else {
3761 cpustat->user = cputime64_add(cpustat->user, tmp); 3760 cpustat->user = cputime64_add(cpustat->user, tmp);
3762 cpustat->guest = cputime64_add(cpustat->guest, tmp); 3761 cpustat->guest = cputime64_add(cpustat->guest, tmp);
3763 } 3762 }
3764 } 3763 }
3765 3764
3766 /* 3765 /*
3767 * Account system cpu time to a process and desired cpustat field 3766 * Account system cpu time to a process and desired cpustat field
3768 * @p: the process that the cpu time gets accounted to 3767 * @p: the process that the cpu time gets accounted to
3769 * @cputime: the cpu time spent in kernel space since the last update 3768 * @cputime: the cpu time spent in kernel space since the last update
3770 * @cputime_scaled: cputime scaled by cpu frequency 3769 * @cputime_scaled: cputime scaled by cpu frequency
3771 * @target_cputime64: pointer to cpustat field that has to be updated 3770 * @target_cputime64: pointer to cpustat field that has to be updated
3772 */ 3771 */
3773 static inline 3772 static inline
3774 void __account_system_time(struct task_struct *p, cputime_t cputime, 3773 void __account_system_time(struct task_struct *p, cputime_t cputime,
3775 cputime_t cputime_scaled, cputime64_t *target_cputime64) 3774 cputime_t cputime_scaled, cputime64_t *target_cputime64)
3776 { 3775 {
3777 cputime64_t tmp = cputime_to_cputime64(cputime); 3776 cputime64_t tmp = cputime_to_cputime64(cputime);
3778 3777
3779 /* Add system time to process. */ 3778 /* Add system time to process. */
3780 p->stime = cputime_add(p->stime, cputime); 3779 p->stime = cputime_add(p->stime, cputime);
3781 p->stimescaled = cputime_add(p->stimescaled, cputime_scaled); 3780 p->stimescaled = cputime_add(p->stimescaled, cputime_scaled);
3782 account_group_system_time(p, cputime); 3781 account_group_system_time(p, cputime);
3783 3782
3784 /* Add system time to cpustat. */ 3783 /* Add system time to cpustat. */
3785 *target_cputime64 = cputime64_add(*target_cputime64, tmp); 3784 *target_cputime64 = cputime64_add(*target_cputime64, tmp);
3786 cpuacct_update_stats(p, CPUACCT_STAT_SYSTEM, cputime); 3785 cpuacct_update_stats(p, CPUACCT_STAT_SYSTEM, cputime);
3787 3786
3788 /* Account for system time used */ 3787 /* Account for system time used */
3789 acct_update_integrals(p); 3788 acct_update_integrals(p);
3790 } 3789 }
3791 3790
3792 /* 3791 /*
3793 * Account system cpu time to a process. 3792 * Account system cpu time to a process.
3794 * @p: the process that the cpu time gets accounted to 3793 * @p: the process that the cpu time gets accounted to
3795 * @hardirq_offset: the offset to subtract from hardirq_count() 3794 * @hardirq_offset: the offset to subtract from hardirq_count()
3796 * @cputime: the cpu time spent in kernel space since the last update 3795 * @cputime: the cpu time spent in kernel space since the last update
3797 * @cputime_scaled: cputime scaled by cpu frequency 3796 * @cputime_scaled: cputime scaled by cpu frequency
3798 */ 3797 */
3799 void account_system_time(struct task_struct *p, int hardirq_offset, 3798 void account_system_time(struct task_struct *p, int hardirq_offset,
3800 cputime_t cputime, cputime_t cputime_scaled) 3799 cputime_t cputime, cputime_t cputime_scaled)
3801 { 3800 {
3802 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; 3801 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
3803 cputime64_t *target_cputime64; 3802 cputime64_t *target_cputime64;
3804 3803
3805 if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) { 3804 if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) {
3806 account_guest_time(p, cputime, cputime_scaled); 3805 account_guest_time(p, cputime, cputime_scaled);
3807 return; 3806 return;
3808 } 3807 }
3809 3808
3810 if (hardirq_count() - hardirq_offset) 3809 if (hardirq_count() - hardirq_offset)
3811 target_cputime64 = &cpustat->irq; 3810 target_cputime64 = &cpustat->irq;
3812 else if (in_serving_softirq()) 3811 else if (in_serving_softirq())
3813 target_cputime64 = &cpustat->softirq; 3812 target_cputime64 = &cpustat->softirq;
3814 else 3813 else
3815 target_cputime64 = &cpustat->system; 3814 target_cputime64 = &cpustat->system;
3816 3815
3817 __account_system_time(p, cputime, cputime_scaled, target_cputime64); 3816 __account_system_time(p, cputime, cputime_scaled, target_cputime64);
3818 } 3817 }
3819 3818
3820 /* 3819 /*
3821 * Account for involuntary wait time. 3820 * Account for involuntary wait time.
3822 * @cputime: the cpu time spent in involuntary wait 3821 * @cputime: the cpu time spent in involuntary wait
3823 */ 3822 */
3824 void account_steal_time(cputime_t cputime) 3823 void account_steal_time(cputime_t cputime)
3825 { 3824 {
3826 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; 3825 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
3827 cputime64_t cputime64 = cputime_to_cputime64(cputime); 3826 cputime64_t cputime64 = cputime_to_cputime64(cputime);
3828 3827
3829 cpustat->steal = cputime64_add(cpustat->steal, cputime64); 3828 cpustat->steal = cputime64_add(cpustat->steal, cputime64);
3830 } 3829 }
3831 3830
3832 /* 3831 /*
3833 * Account for idle time. 3832 * Account for idle time.
3834 * @cputime: the cpu time spent in idle wait 3833 * @cputime: the cpu time spent in idle wait
3835 */ 3834 */
3836 void account_idle_time(cputime_t cputime) 3835 void account_idle_time(cputime_t cputime)
3837 { 3836 {
3838 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; 3837 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
3839 cputime64_t cputime64 = cputime_to_cputime64(cputime); 3838 cputime64_t cputime64 = cputime_to_cputime64(cputime);
3840 struct rq *rq = this_rq(); 3839 struct rq *rq = this_rq();
3841 3840
3842 if (atomic_read(&rq->nr_iowait) > 0) 3841 if (atomic_read(&rq->nr_iowait) > 0)
3843 cpustat->iowait = cputime64_add(cpustat->iowait, cputime64); 3842 cpustat->iowait = cputime64_add(cpustat->iowait, cputime64);
3844 else 3843 else
3845 cpustat->idle = cputime64_add(cpustat->idle, cputime64); 3844 cpustat->idle = cputime64_add(cpustat->idle, cputime64);
3846 } 3845 }
3847 3846
3848 #ifndef CONFIG_VIRT_CPU_ACCOUNTING 3847 #ifndef CONFIG_VIRT_CPU_ACCOUNTING
3849 3848
3850 #ifdef CONFIG_IRQ_TIME_ACCOUNTING 3849 #ifdef CONFIG_IRQ_TIME_ACCOUNTING
3851 /* 3850 /*
3852 * Account a tick to a process and cpustat 3851 * Account a tick to a process and cpustat
3853 * @p: the process that the cpu time gets accounted to 3852 * @p: the process that the cpu time gets accounted to
3854 * @user_tick: is the tick from userspace 3853 * @user_tick: is the tick from userspace
3855 * @rq: the pointer to rq 3854 * @rq: the pointer to rq
3856 * 3855 *
3857 * Tick demultiplexing follows the order 3856 * Tick demultiplexing follows the order
3858 * - pending hardirq update 3857 * - pending hardirq update
3859 * - pending softirq update 3858 * - pending softirq update
3860 * - user_time 3859 * - user_time
3861 * - idle_time 3860 * - idle_time
3862 * - system time 3861 * - system time
3863 * - check for guest_time 3862 * - check for guest_time
3864 * - else account as system_time 3863 * - else account as system_time
3865 * 3864 *
3866 * The check for hardirq is done for both system and user time, as there is 3865 * The check for hardirq is done for both system and user time, as there is
3867 * no timer going off while we are in hardirq, and hence we may never get an 3866 * no timer going off while we are in hardirq, and hence we may never get an
3868 * opportunity to update it solely in system time. 3867 * opportunity to update it solely in system time.
3869 * p->stime and friends are only updated on system time and not on irq or 3868 * p->stime and friends are only updated on system time and not on irq or
3870 * softirq time, as those no longer count in task exec_runtime. 3869 * softirq time, as those no longer count in task exec_runtime.
3871 */ 3870 */
3872 static void irqtime_account_process_tick(struct task_struct *p, int user_tick, 3871 static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
3873 struct rq *rq) 3872 struct rq *rq)
3874 { 3873 {
3875 cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy); 3874 cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
3876 cputime64_t tmp = cputime_to_cputime64(cputime_one_jiffy); 3875 cputime64_t tmp = cputime_to_cputime64(cputime_one_jiffy);
3877 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; 3876 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
3878 3877
3879 if (irqtime_account_hi_update()) { 3878 if (irqtime_account_hi_update()) {
3880 cpustat->irq = cputime64_add(cpustat->irq, tmp); 3879 cpustat->irq = cputime64_add(cpustat->irq, tmp);
3881 } else if (irqtime_account_si_update()) { 3880 } else if (irqtime_account_si_update()) {
3882 cpustat->softirq = cputime64_add(cpustat->softirq, tmp); 3881 cpustat->softirq = cputime64_add(cpustat->softirq, tmp);
3883 } else if (this_cpu_ksoftirqd() == p) { 3882 } else if (this_cpu_ksoftirqd() == p) {
3884 /* 3883 /*
3884 * ksoftirqd time does not get accounted in cpu_softirq_time. 3883 * ksoftirqd time does not get accounted in cpu_softirq_time.
3886 * So, we have to handle it separately here. 3885 * So, we have to handle it separately here.
3887 * Also, p->stime needs to be updated for ksoftirqd. 3886 * Also, p->stime needs to be updated for ksoftirqd.
3888 */ 3887 */
3889 __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled, 3888 __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled,
3890 &cpustat->softirq); 3889 &cpustat->softirq);
3891 } else if (user_tick) { 3890 } else if (user_tick) {
3892 account_user_time(p, cputime_one_jiffy, one_jiffy_scaled); 3891 account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);
3893 } else if (p == rq->idle) { 3892 } else if (p == rq->idle) {
3894 account_idle_time(cputime_one_jiffy); 3893 account_idle_time(cputime_one_jiffy);
3895 } else if (p->flags & PF_VCPU) { /* System time or guest time */ 3894 } else if (p->flags & PF_VCPU) { /* System time or guest time */
3896 account_guest_time(p, cputime_one_jiffy, one_jiffy_scaled); 3895 account_guest_time(p, cputime_one_jiffy, one_jiffy_scaled);
3897 } else { 3896 } else {
3898 __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled, 3897 __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled,
3899 &cpustat->system); 3898 &cpustat->system);
3900 } 3899 }
3901 } 3900 }
3902 3901
3903 static void irqtime_account_idle_ticks(int ticks) 3902 static void irqtime_account_idle_ticks(int ticks)
3904 { 3903 {
3905 int i; 3904 int i;
3906 struct rq *rq = this_rq(); 3905 struct rq *rq = this_rq();
3907 3906
3908 for (i = 0; i < ticks; i++) 3907 for (i = 0; i < ticks; i++)
3909 irqtime_account_process_tick(current, 0, rq); 3908 irqtime_account_process_tick(current, 0, rq);
3910 } 3909 }
3911 #else /* CONFIG_IRQ_TIME_ACCOUNTING */ 3910 #else /* CONFIG_IRQ_TIME_ACCOUNTING */
3912 static void irqtime_account_idle_ticks(int ticks) {} 3911 static void irqtime_account_idle_ticks(int ticks) {}
3913 static void irqtime_account_process_tick(struct task_struct *p, int user_tick, 3912 static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
3914 struct rq *rq) {} 3913 struct rq *rq) {}
3915 #endif /* CONFIG_IRQ_TIME_ACCOUNTING */ 3914 #endif /* CONFIG_IRQ_TIME_ACCOUNTING */
3916 3915
3917 /* 3916 /*
3918 * Account a single tick of cpu time. 3917 * Account a single tick of cpu time.
3919 * @p: the process that the cpu time gets accounted to 3918 * @p: the process that the cpu time gets accounted to
3920 * @user_tick: indicates if the tick is a user or a system tick 3919 * @user_tick: indicates if the tick is a user or a system tick
3921 */ 3920 */
3922 void account_process_tick(struct task_struct *p, int user_tick) 3921 void account_process_tick(struct task_struct *p, int user_tick)
3923 { 3922 {
3924 cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy); 3923 cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
3925 struct rq *rq = this_rq(); 3924 struct rq *rq = this_rq();
3926 3925
3927 if (sched_clock_irqtime) { 3926 if (sched_clock_irqtime) {
3928 irqtime_account_process_tick(p, user_tick, rq); 3927 irqtime_account_process_tick(p, user_tick, rq);
3929 return; 3928 return;
3930 } 3929 }
3931 3930
3932 if (user_tick) 3931 if (user_tick)
3933 account_user_time(p, cputime_one_jiffy, one_jiffy_scaled); 3932 account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);
3934 else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET)) 3933 else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET))
3935 account_system_time(p, HARDIRQ_OFFSET, cputime_one_jiffy, 3934 account_system_time(p, HARDIRQ_OFFSET, cputime_one_jiffy,
3936 one_jiffy_scaled); 3935 one_jiffy_scaled);
3937 else 3936 else
3938 account_idle_time(cputime_one_jiffy); 3937 account_idle_time(cputime_one_jiffy);
3939 } 3938 }
3940 3939
3941 /* 3940 /*
3942 * Account multiple ticks of steal time. 3941 * Account multiple ticks of steal time.
3944 * @ticks: number of stolen ticks 3943 * @ticks: number of stolen ticks
3945 */ 3944 */
3946 void account_steal_ticks(unsigned long ticks) 3945 void account_steal_ticks(unsigned long ticks)
3947 { 3946 {
3948 account_steal_time(jiffies_to_cputime(ticks)); 3947 account_steal_time(jiffies_to_cputime(ticks));
3949 } 3948 }
3950 3949
3951 /* 3950 /*
3952 * Account multiple ticks of idle time. 3951 * Account multiple ticks of idle time.
3952 * @ticks: number of ticks spent idle 3951 * @ticks: number of ticks spent idle
3954 */ 3953 */
3955 void account_idle_ticks(unsigned long ticks) 3954 void account_idle_ticks(unsigned long ticks)
3956 { 3955 {
3957 3956
3958 if (sched_clock_irqtime) { 3957 if (sched_clock_irqtime) {
3959 irqtime_account_idle_ticks(ticks); 3958 irqtime_account_idle_ticks(ticks);
3960 return; 3959 return;
3961 } 3960 }
3962 3961
3963 account_idle_time(jiffies_to_cputime(ticks)); 3962 account_idle_time(jiffies_to_cputime(ticks));
3964 } 3963 }
3965 3964
3966 #endif 3965 #endif
3967 3966
3968 /* 3967 /*
3969 * Use precise platform statistics if available: 3968 * Use precise platform statistics if available:
3970 */ 3969 */
3971 #ifdef CONFIG_VIRT_CPU_ACCOUNTING 3970 #ifdef CONFIG_VIRT_CPU_ACCOUNTING
3972 void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st) 3971 void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
3973 { 3972 {
3974 *ut = p->utime; 3973 *ut = p->utime;
3975 *st = p->stime; 3974 *st = p->stime;
3976 } 3975 }
3977 3976
3978 void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st) 3977 void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
3979 { 3978 {
3980 struct task_cputime cputime; 3979 struct task_cputime cputime;
3981 3980
3982 thread_group_cputime(p, &cputime); 3981 thread_group_cputime(p, &cputime);
3983 3982
3984 *ut = cputime.utime; 3983 *ut = cputime.utime;
3985 *st = cputime.stime; 3984 *st = cputime.stime;
3986 } 3985 }
3987 #else 3986 #else
3988 3987
3989 #ifndef nsecs_to_cputime 3988 #ifndef nsecs_to_cputime
3990 # define nsecs_to_cputime(__nsecs) nsecs_to_jiffies(__nsecs) 3989 # define nsecs_to_cputime(__nsecs) nsecs_to_jiffies(__nsecs)
3991 #endif 3990 #endif
3992 3991
3993 void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st) 3992 void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
3994 { 3993 {
3995 cputime_t rtime, utime = p->utime, total = cputime_add(utime, p->stime); 3994 cputime_t rtime, utime = p->utime, total = cputime_add(utime, p->stime);
3996 3995
3997 /* 3996 /*
3998 * Use CFS's precise accounting: 3997 * Use CFS's precise accounting:
3999 */ 3998 */
4000 rtime = nsecs_to_cputime(p->se.sum_exec_runtime); 3999 rtime = nsecs_to_cputime(p->se.sum_exec_runtime);
4001 4000
4002 if (total) { 4001 if (total) {
4003 u64 temp = rtime; 4002 u64 temp = rtime;
4004 4003
4005 temp *= utime; 4004 temp *= utime;
4006 do_div(temp, total); 4005 do_div(temp, total);
4007 utime = (cputime_t)temp; 4006 utime = (cputime_t)temp;
4008 } else 4007 } else
4009 utime = rtime; 4008 utime = rtime;
4010 4009
4011 /* 4010 /*
4012 * Compare with previous values, to keep monotonicity: 4011 * Compare with previous values, to keep monotonicity:
4013 */ 4012 */
4014 p->prev_utime = max(p->prev_utime, utime); 4013 p->prev_utime = max(p->prev_utime, utime);
4015 p->prev_stime = max(p->prev_stime, cputime_sub(rtime, p->prev_utime)); 4014 p->prev_stime = max(p->prev_stime, cputime_sub(rtime, p->prev_utime));
4016 4015
4017 *ut = p->prev_utime; 4016 *ut = p->prev_utime;
4018 *st = p->prev_stime; 4017 *st = p->prev_stime;
4019 } 4018 }
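For reference, the proportional split done by task_times() above can be checked with plain numbers. This is a minimal user-space sketch of the same arithmetic, using made-up tick counts instead of cputime_t; it is not kernel code:

#include <stdio.h>

int main(void)
{
	unsigned long long utime = 300, stime = 100;	/* tick-based samples */
	unsigned long long rtime = 500;			/* precise CFS runtime */
	unsigned long long total = utime + stime;
	unsigned long long prev_utime = 0, prev_stime = 0;
	unsigned long long scaled_utime;

	/* Split the precise runtime in the same user/system ratio as the ticks. */
	scaled_utime = total ? rtime * utime / total : rtime;

	/* Keep the reported values monotonic, as task_times() does. */
	if (scaled_utime > prev_utime)
		prev_utime = scaled_utime;
	if (rtime - prev_utime > prev_stime)
		prev_stime = rtime - prev_utime;

	printf("utime=%llu stime=%llu\n", prev_utime, prev_stime);	/* 375 125 */
	return 0;
}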
4020 4019
4021 /* 4020 /*
4022 * Must be called with siglock held. 4021 * Must be called with siglock held.
4023 */ 4022 */
4024 void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st) 4023 void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
4025 { 4024 {
4026 struct signal_struct *sig = p->signal; 4025 struct signal_struct *sig = p->signal;
4027 struct task_cputime cputime; 4026 struct task_cputime cputime;
4028 cputime_t rtime, utime, total; 4027 cputime_t rtime, utime, total;
4029 4028
4030 thread_group_cputime(p, &cputime); 4029 thread_group_cputime(p, &cputime);
4031 4030
4032 total = cputime_add(cputime.utime, cputime.stime); 4031 total = cputime_add(cputime.utime, cputime.stime);
4033 rtime = nsecs_to_cputime(cputime.sum_exec_runtime); 4032 rtime = nsecs_to_cputime(cputime.sum_exec_runtime);
4034 4033
4035 if (total) { 4034 if (total) {
4036 u64 temp = rtime; 4035 u64 temp = rtime;
4037 4036
4038 temp *= cputime.utime; 4037 temp *= cputime.utime;
4039 do_div(temp, total); 4038 do_div(temp, total);
4040 utime = (cputime_t)temp; 4039 utime = (cputime_t)temp;
4041 } else 4040 } else
4042 utime = rtime; 4041 utime = rtime;
4043 4042
4044 sig->prev_utime = max(sig->prev_utime, utime); 4043 sig->prev_utime = max(sig->prev_utime, utime);
4045 sig->prev_stime = max(sig->prev_stime, 4044 sig->prev_stime = max(sig->prev_stime,
4046 cputime_sub(rtime, sig->prev_utime)); 4045 cputime_sub(rtime, sig->prev_utime));
4047 4046
4048 *ut = sig->prev_utime; 4047 *ut = sig->prev_utime;
4049 *st = sig->prev_stime; 4048 *st = sig->prev_stime;
4050 } 4049 }
4051 #endif 4050 #endif
4052 4051
4053 /* 4052 /*
4054 * This function gets called by the timer code, with HZ frequency. 4053 * This function gets called by the timer code, with HZ frequency.
4055 * We call it with interrupts disabled. 4054 * We call it with interrupts disabled.
4056 */ 4055 */
4057 void scheduler_tick(void) 4056 void scheduler_tick(void)
4058 { 4057 {
4059 int cpu = smp_processor_id(); 4058 int cpu = smp_processor_id();
4060 struct rq *rq = cpu_rq(cpu); 4059 struct rq *rq = cpu_rq(cpu);
4061 struct task_struct *curr = rq->curr; 4060 struct task_struct *curr = rq->curr;
4062 4061
4063 sched_clock_tick(); 4062 sched_clock_tick();
4064 4063
4065 raw_spin_lock(&rq->lock); 4064 raw_spin_lock(&rq->lock);
4066 update_rq_clock(rq); 4065 update_rq_clock(rq);
4067 update_cpu_load_active(rq); 4066 update_cpu_load_active(rq);
4068 curr->sched_class->task_tick(rq, curr, 0); 4067 curr->sched_class->task_tick(rq, curr, 0);
4069 raw_spin_unlock(&rq->lock); 4068 raw_spin_unlock(&rq->lock);
4070 4069
4071 perf_event_task_tick(); 4070 perf_event_task_tick();
4072 4071
4073 #ifdef CONFIG_SMP 4072 #ifdef CONFIG_SMP
4074 rq->idle_at_tick = idle_cpu(cpu); 4073 rq->idle_at_tick = idle_cpu(cpu);
4075 trigger_load_balance(rq, cpu); 4074 trigger_load_balance(rq, cpu);
4076 #endif 4075 #endif
4077 } 4076 }
4078 4077
4079 notrace unsigned long get_parent_ip(unsigned long addr) 4078 notrace unsigned long get_parent_ip(unsigned long addr)
4080 { 4079 {
4081 if (in_lock_functions(addr)) { 4080 if (in_lock_functions(addr)) {
4082 addr = CALLER_ADDR2; 4081 addr = CALLER_ADDR2;
4083 if (in_lock_functions(addr)) 4082 if (in_lock_functions(addr))
4084 addr = CALLER_ADDR3; 4083 addr = CALLER_ADDR3;
4085 } 4084 }
4086 return addr; 4085 return addr;
4087 } 4086 }
4088 4087
4089 #if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \ 4088 #if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
4090 defined(CONFIG_PREEMPT_TRACER)) 4089 defined(CONFIG_PREEMPT_TRACER))
4091 4090
4092 void __kprobes add_preempt_count(int val) 4091 void __kprobes add_preempt_count(int val)
4093 { 4092 {
4094 #ifdef CONFIG_DEBUG_PREEMPT 4093 #ifdef CONFIG_DEBUG_PREEMPT
4095 /* 4094 /*
4096 * Underflow? 4095 * Underflow?
4097 */ 4096 */
4098 if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0))) 4097 if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0)))
4099 return; 4098 return;
4100 #endif 4099 #endif
4101 preempt_count() += val; 4100 preempt_count() += val;
4102 #ifdef CONFIG_DEBUG_PREEMPT 4101 #ifdef CONFIG_DEBUG_PREEMPT
4103 /* 4102 /*
4104 * Spinlock count overflowing soon? 4103 * Spinlock count overflowing soon?
4105 */ 4104 */
4106 DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >= 4105 DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >=
4107 PREEMPT_MASK - 10); 4106 PREEMPT_MASK - 10);
4108 #endif 4107 #endif
4109 if (preempt_count() == val) 4108 if (preempt_count() == val)
4110 trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); 4109 trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
4111 } 4110 }
4112 EXPORT_SYMBOL(add_preempt_count); 4111 EXPORT_SYMBOL(add_preempt_count);
4113 4112
4114 void __kprobes sub_preempt_count(int val) 4113 void __kprobes sub_preempt_count(int val)
4115 { 4114 {
4116 #ifdef CONFIG_DEBUG_PREEMPT 4115 #ifdef CONFIG_DEBUG_PREEMPT
4117 /* 4116 /*
4118 * Underflow? 4117 * Underflow?
4119 */ 4118 */
4120 if (DEBUG_LOCKS_WARN_ON(val > preempt_count())) 4119 if (DEBUG_LOCKS_WARN_ON(val > preempt_count()))
4121 return; 4120 return;
4122 /* 4121 /*
4123 * Is the spinlock portion underflowing? 4122 * Is the spinlock portion underflowing?
4124 */ 4123 */
4125 if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) && 4124 if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) &&
4126 !(preempt_count() & PREEMPT_MASK))) 4125 !(preempt_count() & PREEMPT_MASK)))
4127 return; 4126 return;
4128 #endif 4127 #endif
4129 4128
4130 if (preempt_count() == val) 4129 if (preempt_count() == val)
4131 trace_preempt_on(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); 4130 trace_preempt_on(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
4132 preempt_count() -= val; 4131 preempt_count() -= val;
4133 } 4132 }
4134 EXPORT_SYMBOL(sub_preempt_count); 4133 EXPORT_SYMBOL(sub_preempt_count);
4135 4134
4136 #endif 4135 #endif
4137 4136
4138 /* 4137 /*
4139 * Print scheduling while atomic bug: 4138 * Print scheduling while atomic bug:
4140 */ 4139 */
4141 static noinline void __schedule_bug(struct task_struct *prev) 4140 static noinline void __schedule_bug(struct task_struct *prev)
4142 { 4141 {
4143 struct pt_regs *regs = get_irq_regs(); 4142 struct pt_regs *regs = get_irq_regs();
4144 4143
4145 printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n", 4144 printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n",
4146 prev->comm, prev->pid, preempt_count()); 4145 prev->comm, prev->pid, preempt_count());
4147 4146
4148 debug_show_held_locks(prev); 4147 debug_show_held_locks(prev);
4149 print_modules(); 4148 print_modules();
4150 if (irqs_disabled()) 4149 if (irqs_disabled())
4151 print_irqtrace_events(prev); 4150 print_irqtrace_events(prev);
4152 4151
4153 if (regs) 4152 if (regs)
4154 show_regs(regs); 4153 show_regs(regs);
4155 else 4154 else
4156 dump_stack(); 4155 dump_stack();
4157 } 4156 }
4158 4157
4159 /* 4158 /*
4160 * Various schedule()-time debugging checks and statistics: 4159 * Various schedule()-time debugging checks and statistics:
4161 */ 4160 */
4162 static inline void schedule_debug(struct task_struct *prev) 4161 static inline void schedule_debug(struct task_struct *prev)
4163 { 4162 {
4164 /* 4163 /*
4165 * Test if we are atomic. Since do_exit() needs to call into 4164 * Test if we are atomic. Since do_exit() needs to call into
4166 * schedule() atomically, we ignore that path for now. 4165 * schedule() atomically, we ignore that path for now.
4167 * Otherwise, whine if we are scheduling when we should not be. 4166 * Otherwise, whine if we are scheduling when we should not be.
4168 */ 4167 */
4169 if (unlikely(in_atomic_preempt_off() && !prev->exit_state)) 4168 if (unlikely(in_atomic_preempt_off() && !prev->exit_state))
4170 __schedule_bug(prev); 4169 __schedule_bug(prev);
4171 4170
4172 profile_hit(SCHED_PROFILING, __builtin_return_address(0)); 4171 profile_hit(SCHED_PROFILING, __builtin_return_address(0));
4173 4172
4174 schedstat_inc(this_rq(), sched_count); 4173 schedstat_inc(this_rq(), sched_count);
4175 } 4174 }
4176 4175
4177 static void put_prev_task(struct rq *rq, struct task_struct *prev) 4176 static void put_prev_task(struct rq *rq, struct task_struct *prev)
4178 { 4177 {
4179 if (prev->on_rq || rq->skip_clock_update < 0) 4178 if (prev->on_rq || rq->skip_clock_update < 0)
4180 update_rq_clock(rq); 4179 update_rq_clock(rq);
4181 prev->sched_class->put_prev_task(rq, prev); 4180 prev->sched_class->put_prev_task(rq, prev);
4182 } 4181 }
4183 4182
4184 /* 4183 /*
4185 * Pick up the highest-prio task: 4184 * Pick up the highest-prio task:
4186 */ 4185 */
4187 static inline struct task_struct * 4186 static inline struct task_struct *
4188 pick_next_task(struct rq *rq) 4187 pick_next_task(struct rq *rq)
4189 { 4188 {
4190 const struct sched_class *class; 4189 const struct sched_class *class;
4191 struct task_struct *p; 4190 struct task_struct *p;
4192 4191
4193 /* 4192 /*
4194 * Optimization: we know that if all tasks are in 4193 * Optimization: we know that if all tasks are in
4195 * the fair class we can call that function directly: 4194 * the fair class we can call that function directly:
4196 */ 4195 */
4197 if (likely(rq->nr_running == rq->cfs.nr_running)) { 4196 if (likely(rq->nr_running == rq->cfs.nr_running)) {
4198 p = fair_sched_class.pick_next_task(rq); 4197 p = fair_sched_class.pick_next_task(rq);
4199 if (likely(p)) 4198 if (likely(p))
4200 return p; 4199 return p;
4201 } 4200 }
4202 4201
4203 for_each_class(class) { 4202 for_each_class(class) {
4204 p = class->pick_next_task(rq); 4203 p = class->pick_next_task(rq);
4205 if (p) 4204 if (p)
4206 return p; 4205 return p;
4207 } 4206 }
4208 4207
4209 BUG(); /* the idle class will always have a runnable task */ 4208 BUG(); /* the idle class will always have a runnable task */
4210 } 4209 }
4211 4210
4212 /* 4211 /*
4213 * schedule() is the main scheduler function. 4212 * schedule() is the main scheduler function.
4214 */ 4213 */
4215 asmlinkage void __sched schedule(void) 4214 asmlinkage void __sched schedule(void)
4216 { 4215 {
4217 struct task_struct *prev, *next; 4216 struct task_struct *prev, *next;
4218 unsigned long *switch_count; 4217 unsigned long *switch_count;
4219 struct rq *rq; 4218 struct rq *rq;
4220 int cpu; 4219 int cpu;
4221 4220
4222 need_resched: 4221 need_resched:
4223 preempt_disable(); 4222 preempt_disable();
4224 cpu = smp_processor_id(); 4223 cpu = smp_processor_id();
4225 rq = cpu_rq(cpu); 4224 rq = cpu_rq(cpu);
4226 rcu_note_context_switch(cpu); 4225 rcu_note_context_switch(cpu);
4227 prev = rq->curr; 4226 prev = rq->curr;
4228 4227
4229 schedule_debug(prev); 4228 schedule_debug(prev);
4230 4229
4231 if (sched_feat(HRTICK)) 4230 if (sched_feat(HRTICK))
4232 hrtick_clear(rq); 4231 hrtick_clear(rq);
4233 4232
4234 raw_spin_lock_irq(&rq->lock); 4233 raw_spin_lock_irq(&rq->lock);
4235 4234
4236 switch_count = &prev->nivcsw; 4235 switch_count = &prev->nivcsw;
4237 if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { 4236 if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
4238 if (unlikely(signal_pending_state(prev->state, prev))) { 4237 if (unlikely(signal_pending_state(prev->state, prev))) {
4239 prev->state = TASK_RUNNING; 4238 prev->state = TASK_RUNNING;
4240 } else { 4239 } else {
4241 deactivate_task(rq, prev, DEQUEUE_SLEEP); 4240 deactivate_task(rq, prev, DEQUEUE_SLEEP);
4242 prev->on_rq = 0; 4241 prev->on_rq = 0;
4243 4242
4244 /* 4243 /*
4245 * If a worker went to sleep, notify and ask workqueue 4244 * If a worker went to sleep, notify and ask workqueue
4246 * whether it wants to wake up a task to maintain 4245 * whether it wants to wake up a task to maintain
4247 * concurrency. 4246 * concurrency.
4248 */ 4247 */
4249 if (prev->flags & PF_WQ_WORKER) { 4248 if (prev->flags & PF_WQ_WORKER) {
4250 struct task_struct *to_wakeup; 4249 struct task_struct *to_wakeup;
4251 4250
4252 to_wakeup = wq_worker_sleeping(prev, cpu); 4251 to_wakeup = wq_worker_sleeping(prev, cpu);
4253 if (to_wakeup) 4252 if (to_wakeup)
4254 try_to_wake_up_local(to_wakeup); 4253 try_to_wake_up_local(to_wakeup);
4255 } 4254 }
4256 4255
4257 /* 4256 /*
4258 * If we are going to sleep and we have plugged IO 4257 * If we are going to sleep and we have plugged IO
4259 * queued, make sure to submit it to avoid deadlocks. 4258 * queued, make sure to submit it to avoid deadlocks.
4260 */ 4259 */
4261 if (blk_needs_flush_plug(prev)) { 4260 if (blk_needs_flush_plug(prev)) {
4262 raw_spin_unlock(&rq->lock); 4261 raw_spin_unlock(&rq->lock);
4263 blk_schedule_flush_plug(prev); 4262 blk_schedule_flush_plug(prev);
4264 raw_spin_lock(&rq->lock); 4263 raw_spin_lock(&rq->lock);
4265 } 4264 }
4266 } 4265 }
4267 switch_count = &prev->nvcsw; 4266 switch_count = &prev->nvcsw;
4268 } 4267 }
4269 4268
4270 pre_schedule(rq, prev); 4269 pre_schedule(rq, prev);
4271 4270
4272 if (unlikely(!rq->nr_running)) 4271 if (unlikely(!rq->nr_running))
4273 idle_balance(cpu, rq); 4272 idle_balance(cpu, rq);
4274 4273
4275 put_prev_task(rq, prev); 4274 put_prev_task(rq, prev);
4276 next = pick_next_task(rq); 4275 next = pick_next_task(rq);
4277 clear_tsk_need_resched(prev); 4276 clear_tsk_need_resched(prev);
4278 rq->skip_clock_update = 0; 4277 rq->skip_clock_update = 0;
4279 4278
4280 if (likely(prev != next)) { 4279 if (likely(prev != next)) {
4281 rq->nr_switches++; 4280 rq->nr_switches++;
4282 rq->curr = next; 4281 rq->curr = next;
4283 ++*switch_count; 4282 ++*switch_count;
4284 4283
4285 context_switch(rq, prev, next); /* unlocks the rq */ 4284 context_switch(rq, prev, next); /* unlocks the rq */
4286 /* 4285 /*
4288 * The context switch has flipped the stack from under us 4287 * The context switch has flipped the stack from under us
4288 * and restored the local variables which were saved when 4287 * and restored the local variables which were saved when
4289 * this task called schedule() in the past. prev == current 4288 * this task called schedule() in the past. prev == current
4290 * is still correct, but it can be moved to another cpu/rq. 4289 * is still correct, but it can be moved to another cpu/rq.
4291 */ 4290 */
4292 cpu = smp_processor_id(); 4291 cpu = smp_processor_id();
4293 rq = cpu_rq(cpu); 4292 rq = cpu_rq(cpu);
4294 } else 4293 } else
4295 raw_spin_unlock_irq(&rq->lock); 4294 raw_spin_unlock_irq(&rq->lock);
4296 4295
4297 post_schedule(rq); 4296 post_schedule(rq);
4298 4297
4299 preempt_enable_no_resched(); 4298 preempt_enable_no_resched();
4300 if (need_resched()) 4299 if (need_resched())
4301 goto need_resched; 4300 goto need_resched;
4302 } 4301 }
4303 EXPORT_SYMBOL(schedule); 4302 EXPORT_SYMBOL(schedule);
4304 4303
4305 #ifdef CONFIG_MUTEX_SPIN_ON_OWNER 4304 #ifdef CONFIG_MUTEX_SPIN_ON_OWNER
4306 4305
4307 static inline bool owner_running(struct mutex *lock, struct task_struct *owner) 4306 static inline bool owner_running(struct mutex *lock, struct task_struct *owner)
4308 { 4307 {
4309 bool ret = false; 4308 bool ret = false;
4310 4309
4311 rcu_read_lock(); 4310 rcu_read_lock();
4312 if (lock->owner != owner) 4311 if (lock->owner != owner)
4313 goto fail; 4312 goto fail;
4314 4313
4315 /* 4314 /*
4316 * Ensure we emit the owner->on_cpu dereference _after_ checking that 4315 * Ensure we emit the owner->on_cpu dereference _after_ checking that
4317 * lock->owner still matches owner. If that fails, owner might 4316 * lock->owner still matches owner. If that fails, owner might
4318 * point to free()d memory; if it still matches, the rcu_read_lock() 4317 * point to free()d memory; if it still matches, the rcu_read_lock()
4319 * ensures the memory stays valid. 4318 * ensures the memory stays valid.
4320 */ 4319 */
4321 barrier(); 4320 barrier();
4322 4321
4323 ret = owner->on_cpu; 4322 ret = owner->on_cpu;
4324 fail: 4323 fail:
4325 rcu_read_unlock(); 4324 rcu_read_unlock();
4326 4325
4327 return ret; 4326 return ret;
4328 } 4327 }
4329 4328
4330 /* 4329 /*
4331 * Look out! "owner" is an entirely speculative pointer 4330 * Look out! "owner" is an entirely speculative pointer
4332 * access and not reliable. 4331 * access and not reliable.
4333 */ 4332 */
4334 int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner) 4333 int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner)
4335 { 4334 {
4336 if (!sched_feat(OWNER_SPIN)) 4335 if (!sched_feat(OWNER_SPIN))
4337 return 0; 4336 return 0;
4338 4337
4339 while (owner_running(lock, owner)) { 4338 while (owner_running(lock, owner)) {
4340 if (need_resched()) 4339 if (need_resched())
4341 return 0; 4340 return 0;
4342 4341
4343 arch_mutex_cpu_relax(); 4342 arch_mutex_cpu_relax();
4344 } 4343 }
4345 4344
4346 /* 4345 /*
4347 * If the owner changed to another task, there is likely 4346 * If the owner changed to another task, there is likely
4348 * heavy contention; stop spinning. 4347 * heavy contention; stop spinning.
4349 */ 4348 */
4350 if (lock->owner) 4349 if (lock->owner)
4351 return 0; 4350 return 0;
4352 4351
4353 return 1; 4352 return 1;
4354 } 4353 }
4355 #endif 4354 #endif
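For context, mutex_spin_on_owner() is meant to be called from a mutex slow path that is spinning optimistically. The following is a simplified sketch of that calling pattern, assuming CONFIG_MUTEX_SPIN_ON_OWNER; try_acquire() is a placeholder for the real cmpxchg-based acquisition, and this is not the actual kernel/mutex.c slow path:

#include <linux/mutex.h>
#include <linux/sched.h>

/* Placeholder for the real cmpxchg-based acquisition attempt. */
static bool try_acquire(struct mutex *lock);

static bool optimistic_spin(struct mutex *lock)
{
	for (;;) {
		struct task_struct *owner;

		/* Snapshot the owner; NULL means the lock may already be free. */
		owner = ACCESS_ONCE(lock->owner);
		if (owner && !mutex_spin_on_owner(lock, owner))
			return false;		/* owner went to sleep: block instead */

		if (try_acquire(lock))		/* placeholder, see lead-in */
			return true;

		if (need_resched())
			return false;

		arch_mutex_cpu_relax();
	}
}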
4356 4355
4357 #ifdef CONFIG_PREEMPT 4356 #ifdef CONFIG_PREEMPT
4358 /* 4357 /*
4359 * This is the entry point to schedule() from in-kernel preemption 4358 * This is the entry point to schedule() from in-kernel preemption
4360 * off of preempt_enable. Kernel preemptions off of return-from-interrupt 4359 * off of preempt_enable. Kernel preemptions off of return-from-interrupt
4361 * occur there and call schedule() directly. 4360 * occur there and call schedule() directly.
4362 */ 4361 */
4363 asmlinkage void __sched notrace preempt_schedule(void) 4362 asmlinkage void __sched notrace preempt_schedule(void)
4364 { 4363 {
4365 struct thread_info *ti = current_thread_info(); 4364 struct thread_info *ti = current_thread_info();
4366 4365
4367 /* 4366 /*
4368 * If there is a non-zero preempt_count or interrupts are disabled, 4367 * If there is a non-zero preempt_count or interrupts are disabled,
4369 * we do not want to preempt the current task. Just return. 4368 * we do not want to preempt the current task. Just return.
4370 */ 4369 */
4371 if (likely(ti->preempt_count || irqs_disabled())) 4370 if (likely(ti->preempt_count || irqs_disabled()))
4372 return; 4371 return;
4373 4372
4374 do { 4373 do {
4375 add_preempt_count_notrace(PREEMPT_ACTIVE); 4374 add_preempt_count_notrace(PREEMPT_ACTIVE);
4376 schedule(); 4375 schedule();
4377 sub_preempt_count_notrace(PREEMPT_ACTIVE); 4376 sub_preempt_count_notrace(PREEMPT_ACTIVE);
4378 4377
4379 /* 4378 /*
4380 * Check again in case we missed a preemption opportunity 4379 * Check again in case we missed a preemption opportunity
4381 * between schedule and now. 4380 * between schedule and now.
4382 */ 4381 */
4383 barrier(); 4382 barrier();
4384 } while (need_resched()); 4383 } while (need_resched());
4385 } 4384 }
4386 EXPORT_SYMBOL(preempt_schedule); 4385 EXPORT_SYMBOL(preempt_schedule);
4387 4386
4388 /* 4387 /*
4389 * This is the entry point to schedule() from kernel preemption 4388 * This is the entry point to schedule() from kernel preemption
4390 * off of irq context. 4389 * off of irq context.
4391 * Note that this is called and returns with irqs disabled. This 4390 * Note that this is called and returns with irqs disabled. This
4392 * protects us against recursive calls from irq. 4391 * protects us against recursive calls from irq.
4393 */ 4392 */
4394 asmlinkage void __sched preempt_schedule_irq(void) 4393 asmlinkage void __sched preempt_schedule_irq(void)
4395 { 4394 {
4396 struct thread_info *ti = current_thread_info(); 4395 struct thread_info *ti = current_thread_info();
4397 4396
4398 /* Catch callers which need to be fixed */ 4397 /* Catch callers which need to be fixed */
4399 BUG_ON(ti->preempt_count || !irqs_disabled()); 4398 BUG_ON(ti->preempt_count || !irqs_disabled());
4400 4399
4401 do { 4400 do {
4402 add_preempt_count(PREEMPT_ACTIVE); 4401 add_preempt_count(PREEMPT_ACTIVE);
4403 local_irq_enable(); 4402 local_irq_enable();
4404 schedule(); 4403 schedule();
4405 local_irq_disable(); 4404 local_irq_disable();
4406 sub_preempt_count(PREEMPT_ACTIVE); 4405 sub_preempt_count(PREEMPT_ACTIVE);
4407 4406
4408 /* 4407 /*
4409 * Check again in case we missed a preemption opportunity 4408 * Check again in case we missed a preemption opportunity
4410 * between schedule and now. 4409 * between schedule and now.
4411 */ 4410 */
4412 barrier(); 4411 barrier();
4413 } while (need_resched()); 4412 } while (need_resched());
4414 } 4413 }
4415 4414
4416 #endif /* CONFIG_PREEMPT */ 4415 #endif /* CONFIG_PREEMPT */
4417 4416
4418 int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags, 4417 int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags,
4419 void *key) 4418 void *key)
4420 { 4419 {
4421 return try_to_wake_up(curr->private, mode, wake_flags); 4420 return try_to_wake_up(curr->private, mode, wake_flags);
4422 } 4421 }
4423 EXPORT_SYMBOL(default_wake_function); 4422 EXPORT_SYMBOL(default_wake_function);
4424 4423
4425 /* 4424 /*
4426 * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just 4425 * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just
4427 * wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve 4426 * wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve
4428 * number) then we wake all the non-exclusive tasks and one exclusive task. 4427 * number) then we wake all the non-exclusive tasks and one exclusive task.
4429 * 4428 *
4430 * There are circumstances in which we can try to wake a task which has already 4429 * There are circumstances in which we can try to wake a task which has already
4431 * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns 4430 * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns
4432 * zero in this (rare) case, and we handle it by continuing to scan the queue. 4431 * zero in this (rare) case, and we handle it by continuing to scan the queue.
4433 */ 4432 */
4434 static void __wake_up_common(wait_queue_head_t *q, unsigned int mode, 4433 static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
4435 int nr_exclusive, int wake_flags, void *key) 4434 int nr_exclusive, int wake_flags, void *key)
4436 { 4435 {
4437 wait_queue_t *curr, *next; 4436 wait_queue_t *curr, *next;
4438 4437
4439 list_for_each_entry_safe(curr, next, &q->task_list, task_list) { 4438 list_for_each_entry_safe(curr, next, &q->task_list, task_list) {
4440 unsigned flags = curr->flags; 4439 unsigned flags = curr->flags;
4441 4440
4442 if (curr->func(curr, mode, wake_flags, key) && 4441 if (curr->func(curr, mode, wake_flags, key) &&
4443 (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive) 4442 (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
4444 break; 4443 break;
4445 } 4444 }
4446 } 4445 }
4447 4446
4448 /** 4447 /**
4449 * __wake_up - wake up threads blocked on a waitqueue. 4448 * __wake_up - wake up threads blocked on a waitqueue.
4450 * @q: the waitqueue 4449 * @q: the waitqueue
4451 * @mode: which threads 4450 * @mode: which threads
4452 * @nr_exclusive: how many wake-one or wake-many threads to wake up 4451 * @nr_exclusive: how many wake-one or wake-many threads to wake up
4453 * @key: is directly passed to the wakeup function 4452 * @key: is directly passed to the wakeup function
4454 * 4453 *
4455 * It may be assumed that this function implies a write memory barrier before 4454 * It may be assumed that this function implies a write memory barrier before
4456 * changing the task state if and only if any tasks are woken up. 4455 * changing the task state if and only if any tasks are woken up.
4457 */ 4456 */
4458 void __wake_up(wait_queue_head_t *q, unsigned int mode, 4457 void __wake_up(wait_queue_head_t *q, unsigned int mode,
4459 int nr_exclusive, void *key) 4458 int nr_exclusive, void *key)
4460 { 4459 {
4461 unsigned long flags; 4460 unsigned long flags;
4462 4461
4463 spin_lock_irqsave(&q->lock, flags); 4462 spin_lock_irqsave(&q->lock, flags);
4464 __wake_up_common(q, mode, nr_exclusive, 0, key); 4463 __wake_up_common(q, mode, nr_exclusive, 0, key);
4465 spin_unlock_irqrestore(&q->lock, flags); 4464 spin_unlock_irqrestore(&q->lock, flags);
4466 } 4465 }
4467 EXPORT_SYMBOL(__wake_up); 4466 EXPORT_SYMBOL(__wake_up);
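To illustrate the nr_exclusive handling in __wake_up_common(): a waiter queued with prepare_to_wait_exclusive() has WQ_FLAG_EXCLUSIVE set, so a plain wake_up() (nr_exclusive == 1) wakes every non-exclusive waiter plus at most one exclusive waiter. A minimal sketch of that pattern follows; the my_waitq/resource_available names are illustrative and the locking around the condition is elided:

#include <linux/wait.h>
#include <linux/sched.h>

static DECLARE_WAIT_QUEUE_HEAD(my_waitq);
static int resource_available;		/* real code would protect this */

static void wait_for_resource(void)
{
	DEFINE_WAIT(wait);

	for (;;) {
		/* Queues us with WQ_FLAG_EXCLUSIVE set. */
		prepare_to_wait_exclusive(&my_waitq, &wait, TASK_UNINTERRUPTIBLE);
		if (resource_available)
			break;
		schedule();
	}
	finish_wait(&my_waitq, &wait);
}

static void release_resource(void)
{
	resource_available = 1;
	/* nr_exclusive == 1: all non-exclusive waiters plus one exclusive one. */
	wake_up(&my_waitq);
}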
4468 4467
4469 /* 4468 /*
4470 * Same as __wake_up but called with the spinlock in wait_queue_head_t held. 4469 * Same as __wake_up but called with the spinlock in wait_queue_head_t held.
4471 */ 4470 */
4472 void __wake_up_locked(wait_queue_head_t *q, unsigned int mode) 4471 void __wake_up_locked(wait_queue_head_t *q, unsigned int mode)
4473 { 4472 {
4474 __wake_up_common(q, mode, 1, 0, NULL); 4473 __wake_up_common(q, mode, 1, 0, NULL);
4475 } 4474 }
4476 EXPORT_SYMBOL_GPL(__wake_up_locked); 4475 EXPORT_SYMBOL_GPL(__wake_up_locked);
4477 4476
4478 void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key) 4477 void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key)
4479 { 4478 {
4480 __wake_up_common(q, mode, 1, 0, key); 4479 __wake_up_common(q, mode, 1, 0, key);
4481 } 4480 }
4482 EXPORT_SYMBOL_GPL(__wake_up_locked_key); 4481 EXPORT_SYMBOL_GPL(__wake_up_locked_key);
4483 4482
4484 /** 4483 /**
4485 * __wake_up_sync_key - wake up threads blocked on a waitqueue. 4484 * __wake_up_sync_key - wake up threads blocked on a waitqueue.
4486 * @q: the waitqueue 4485 * @q: the waitqueue
4487 * @mode: which threads 4486 * @mode: which threads
4488 * @nr_exclusive: how many wake-one or wake-many threads to wake up 4487 * @nr_exclusive: how many wake-one or wake-many threads to wake up
4489 * @key: opaque value to be passed to wakeup targets 4488 * @key: opaque value to be passed to wakeup targets
4490 * 4489 *
4491 * The sync wakeup differs in that the waker knows that it will schedule 4490 * The sync wakeup differs in that the waker knows that it will schedule
4492 * away soon, so while the target thread will be woken up, it will not 4491 * away soon, so while the target thread will be woken up, it will not
4493 * be migrated to another CPU - i.e. the two threads are 'synchronized' 4492 * be migrated to another CPU - i.e. the two threads are 'synchronized'
4494 * with each other. This can prevent needless bouncing between CPUs. 4493 * with each other. This can prevent needless bouncing between CPUs.
4495 * 4494 *
4496 * On UP it can prevent extra preemption. 4495 * On UP it can prevent extra preemption.
4497 * 4496 *
4498 * It may be assumed that this function implies a write memory barrier before 4497 * It may be assumed that this function implies a write memory barrier before
4499 * changing the task state if and only if any tasks are woken up. 4498 * changing the task state if and only if any tasks are woken up.
4500 */ 4499 */
4501 void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode, 4500 void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode,
4502 int nr_exclusive, void *key) 4501 int nr_exclusive, void *key)
4503 { 4502 {
4504 unsigned long flags; 4503 unsigned long flags;
4505 int wake_flags = WF_SYNC; 4504 int wake_flags = WF_SYNC;
4506 4505
4507 if (unlikely(!q)) 4506 if (unlikely(!q))
4508 return; 4507 return;
4509 4508
4510 if (unlikely(!nr_exclusive)) 4509 if (unlikely(!nr_exclusive))
4511 wake_flags = 0; 4510 wake_flags = 0;
4512 4511
4513 spin_lock_irqsave(&q->lock, flags); 4512 spin_lock_irqsave(&q->lock, flags);
4514 __wake_up_common(q, mode, nr_exclusive, wake_flags, key); 4513 __wake_up_common(q, mode, nr_exclusive, wake_flags, key);
4515 spin_unlock_irqrestore(&q->lock, flags); 4514 spin_unlock_irqrestore(&q->lock, flags);
4516 } 4515 }
4517 EXPORT_SYMBOL_GPL(__wake_up_sync_key); 4516 EXPORT_SYMBOL_GPL(__wake_up_sync_key);
4518 4517
4519 /* 4518 /*
4520 * __wake_up_sync - see __wake_up_sync_key() 4519 * __wake_up_sync - see __wake_up_sync_key()
4521 */ 4520 */
4522 void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive) 4521 void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
4523 { 4522 {
4524 __wake_up_sync_key(q, mode, nr_exclusive, NULL); 4523 __wake_up_sync_key(q, mode, nr_exclusive, NULL);
4525 } 4524 }
4526 EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */ 4525 EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */
4527 4526
4528 /** 4527 /**
4529 * complete: - signals a single thread waiting on this completion 4528 * complete: - signals a single thread waiting on this completion
4530 * @x: holds the state of this particular completion 4529 * @x: holds the state of this particular completion
4531 * 4530 *
4532 * This will wake up a single thread waiting on this completion. Threads will be 4531 * This will wake up a single thread waiting on this completion. Threads will be
4533 * awakened in the same order in which they were queued. 4532 * awakened in the same order in which they were queued.
4534 * 4533 *
4535 * See also complete_all(), wait_for_completion() and related routines. 4534 * See also complete_all(), wait_for_completion() and related routines.
4536 * 4535 *
4537 * It may be assumed that this function implies a write memory barrier before 4536 * It may be assumed that this function implies a write memory barrier before
4538 * changing the task state if and only if any tasks are woken up. 4537 * changing the task state if and only if any tasks are woken up.
4539 */ 4538 */
4540 void complete(struct completion *x) 4539 void complete(struct completion *x)
4541 { 4540 {
4542 unsigned long flags; 4541 unsigned long flags;
4543 4542
4544 spin_lock_irqsave(&x->wait.lock, flags); 4543 spin_lock_irqsave(&x->wait.lock, flags);
4545 x->done++; 4544 x->done++;
4546 __wake_up_common(&x->wait, TASK_NORMAL, 1, 0, NULL); 4545 __wake_up_common(&x->wait, TASK_NORMAL, 1, 0, NULL);
4547 spin_unlock_irqrestore(&x->wait.lock, flags); 4546 spin_unlock_irqrestore(&x->wait.lock, flags);
4548 } 4547 }
4549 EXPORT_SYMBOL(complete); 4548 EXPORT_SYMBOL(complete);
4550 4549
4551 /** 4550 /**
4552 * complete_all: - signals all threads waiting on this completion 4551 * complete_all: - signals all threads waiting on this completion
4553 * @x: holds the state of this particular completion 4552 * @x: holds the state of this particular completion
4554 * 4553 *
4555 * This will wake up all threads waiting on this particular completion event. 4554 * This will wake up all threads waiting on this particular completion event.
4556 * 4555 *
4557 * It may be assumed that this function implies a write memory barrier before 4556 * It may be assumed that this function implies a write memory barrier before
4558 * changing the task state if and only if any tasks are woken up. 4557 * changing the task state if and only if any tasks are woken up.
4559 */ 4558 */
4560 void complete_all(struct completion *x) 4559 void complete_all(struct completion *x)
4561 { 4560 {
4562 unsigned long flags; 4561 unsigned long flags;
4563 4562
4564 spin_lock_irqsave(&x->wait.lock, flags); 4563 spin_lock_irqsave(&x->wait.lock, flags);
4565 x->done += UINT_MAX/2; 4564 x->done += UINT_MAX/2;
4566 __wake_up_common(&x->wait, TASK_NORMAL, 0, 0, NULL); 4565 __wake_up_common(&x->wait, TASK_NORMAL, 0, 0, NULL);
4567 spin_unlock_irqrestore(&x->wait.lock, flags); 4566 spin_unlock_irqrestore(&x->wait.lock, flags);
4568 } 4567 }
4569 EXPORT_SYMBOL(complete_all); 4568 EXPORT_SYMBOL(complete_all);
4570 4569
4571 static inline long __sched 4570 static inline long __sched
4572 do_wait_for_common(struct completion *x, long timeout, int state) 4571 do_wait_for_common(struct completion *x, long timeout, int state)
4573 { 4572 {
4574 if (!x->done) { 4573 if (!x->done) {
4575 DECLARE_WAITQUEUE(wait, current); 4574 DECLARE_WAITQUEUE(wait, current);
4576 4575
4577 __add_wait_queue_tail_exclusive(&x->wait, &wait); 4576 __add_wait_queue_tail_exclusive(&x->wait, &wait);
4578 do { 4577 do {
4579 if (signal_pending_state(state, current)) { 4578 if (signal_pending_state(state, current)) {
4580 timeout = -ERESTARTSYS; 4579 timeout = -ERESTARTSYS;
4581 break; 4580 break;
4582 } 4581 }
4583 __set_current_state(state); 4582 __set_current_state(state);
4584 spin_unlock_irq(&x->wait.lock); 4583 spin_unlock_irq(&x->wait.lock);
4585 timeout = schedule_timeout(timeout); 4584 timeout = schedule_timeout(timeout);
4586 spin_lock_irq(&x->wait.lock); 4585 spin_lock_irq(&x->wait.lock);
4587 } while (!x->done && timeout); 4586 } while (!x->done && timeout);
4588 __remove_wait_queue(&x->wait, &wait); 4587 __remove_wait_queue(&x->wait, &wait);
4589 if (!x->done) 4588 if (!x->done)
4590 return timeout; 4589 return timeout;
4591 } 4590 }
4592 x->done--; 4591 x->done--;
4593 return timeout ?: 1; 4592 return timeout ?: 1;
4594 } 4593 }
4595 4594
4596 static long __sched 4595 static long __sched
4597 wait_for_common(struct completion *x, long timeout, int state) 4596 wait_for_common(struct completion *x, long timeout, int state)
4598 { 4597 {
4599 might_sleep(); 4598 might_sleep();
4600 4599
4601 spin_lock_irq(&x->wait.lock); 4600 spin_lock_irq(&x->wait.lock);
4602 timeout = do_wait_for_common(x, timeout, state); 4601 timeout = do_wait_for_common(x, timeout, state);
4603 spin_unlock_irq(&x->wait.lock); 4602 spin_unlock_irq(&x->wait.lock);
4604 return timeout; 4603 return timeout;
4605 } 4604 }
4606 4605
4607 /** 4606 /**
4608 * wait_for_completion: - waits for completion of a task 4607 * wait_for_completion: - waits for completion of a task
4609 * @x: holds the state of this particular completion 4608 * @x: holds the state of this particular completion
4610 * 4609 *
4611 * This waits to be signaled for completion of a specific task. It is NOT 4610 * This waits to be signaled for completion of a specific task. It is NOT
4612 * interruptible and there is no timeout. 4611 * interruptible and there is no timeout.
4613 * 4612 *
4614 * See also similar routines (e.g. wait_for_completion_timeout()) with timeout 4613 * See also similar routines (e.g. wait_for_completion_timeout()) with timeout
4615 * and interrupt capability. Also see complete(). 4614 * and interrupt capability. Also see complete().
4616 */ 4615 */
4617 void __sched wait_for_completion(struct completion *x) 4616 void __sched wait_for_completion(struct completion *x)
4618 { 4617 {
4619 wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE); 4618 wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE);
4620 } 4619 }
4621 EXPORT_SYMBOL(wait_for_completion); 4620 EXPORT_SYMBOL(wait_for_completion);
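The completion API above is typically used to wait for a one-off event, such as a helper thread finishing its setup. The following is a minimal, hypothetical module sketch built only on the interfaces declared above and in <linux/completion.h>; it is not taken from any in-tree driver:

#include <linux/module.h>
#include <linux/kthread.h>
#include <linux/completion.h>
#include <linux/delay.h>
#include <linux/err.h>

static DECLARE_COMPLETION(setup_done);

static int worker_fn(void *data)
{
	msleep(100);			/* stand-in for real initialisation work */
	complete(&setup_done);		/* wakes exactly one waiter */
	return 0;
}

static int __init completion_demo_init(void)
{
	struct task_struct *tsk;

	tsk = kthread_run(worker_fn, NULL, "completion-demo");
	if (IS_ERR(tsk))
		return PTR_ERR(tsk);

	/* Sleeps in TASK_UNINTERRUPTIBLE until worker_fn() calls complete(). */
	wait_for_completion(&setup_done);
	return 0;
}

static void __exit completion_demo_exit(void)
{
}

module_init(completion_demo_init);
module_exit(completion_demo_exit);
MODULE_LICENSE("GPL");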
4622 4621
4623 /** 4622 /**
4624 * wait_for_completion_timeout: - waits for completion of a task (w/timeout) 4623 * wait_for_completion_timeout: - waits for completion of a task (w/timeout)
4625 * @x: holds the state of this particular completion 4624 * @x: holds the state of this particular completion
4626 * @timeout: timeout value in jiffies 4625 * @timeout: timeout value in jiffies
4627 * 4626 *
4628 * This waits for either a completion of a specific task to be signaled or for a 4627 * This waits for either a completion of a specific task to be signaled or for a
4629 * specified timeout to expire. The timeout is in jiffies. It is not 4628 * specified timeout to expire. The timeout is in jiffies. It is not
4630 * interruptible. 4629 * interruptible.
4631 */ 4630 */
4632 unsigned long __sched 4631 unsigned long __sched
4633 wait_for_completion_timeout(struct completion *x, unsigned long timeout) 4632 wait_for_completion_timeout(struct completion *x, unsigned long timeout)
4634 { 4633 {
4635 return wait_for_common(x, timeout, TASK_UNINTERRUPTIBLE); 4634 return wait_for_common(x, timeout, TASK_UNINTERRUPTIBLE);
4636 } 4635 }
4637 EXPORT_SYMBOL(wait_for_completion_timeout); 4636 EXPORT_SYMBOL(wait_for_completion_timeout);
4638 4637
4639 /** 4638 /**
4640 * wait_for_completion_interruptible: - waits for completion of a task (w/intr) 4639 * wait_for_completion_interruptible: - waits for completion of a task (w/intr)
4641 * @x: holds the state of this particular completion 4640 * @x: holds the state of this particular completion
4642 * 4641 *
4643 * This waits for completion of a specific task to be signaled. It is 4642 * This waits for completion of a specific task to be signaled. It is
4644 * interruptible. 4643 * interruptible.
4645 */ 4644 */
4646 int __sched wait_for_completion_interruptible(struct completion *x) 4645 int __sched wait_for_completion_interruptible(struct completion *x)
4647 { 4646 {
4648 long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE); 4647 long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE);
4649 if (t == -ERESTARTSYS) 4648 if (t == -ERESTARTSYS)
4650 return t; 4649 return t;
4651 return 0; 4650 return 0;
4652 } 4651 }
4653 EXPORT_SYMBOL(wait_for_completion_interruptible); 4652 EXPORT_SYMBOL(wait_for_completion_interruptible);
4654 4653
4655 /** 4654 /**
4656 * wait_for_completion_interruptible_timeout: - waits for completion (w/(to,intr)) 4655 * wait_for_completion_interruptible_timeout: - waits for completion (w/(to,intr))
4657 * @x: holds the state of this particular completion 4656 * @x: holds the state of this particular completion
4658 * @timeout: timeout value in jiffies 4657 * @timeout: timeout value in jiffies
4659 * 4658 *
4660 * This waits for either a completion of a specific task to be signaled or for a 4659 * This waits for either a completion of a specific task to be signaled or for a
4661 * specified timeout to expire. It is interruptible. The timeout is in jiffies. 4660 * specified timeout to expire. It is interruptible. The timeout is in jiffies.
4662 */ 4661 */
4663 long __sched 4662 long __sched
4664 wait_for_completion_interruptible_timeout(struct completion *x, 4663 wait_for_completion_interruptible_timeout(struct completion *x,
4665 unsigned long timeout) 4664 unsigned long timeout)
4666 { 4665 {
4667 return wait_for_common(x, timeout, TASK_INTERRUPTIBLE); 4666 return wait_for_common(x, timeout, TASK_INTERRUPTIBLE);
4668 } 4667 }
4669 EXPORT_SYMBOL(wait_for_completion_interruptible_timeout); 4668 EXPORT_SYMBOL(wait_for_completion_interruptible_timeout);
4670 4669
4671 /** 4670 /**
4672 * wait_for_completion_killable: - waits for completion of a task (killable) 4671 * wait_for_completion_killable: - waits for completion of a task (killable)
4673 * @x: holds the state of this particular completion 4672 * @x: holds the state of this particular completion
4674 * 4673 *
4675 * This waits to be signaled for completion of a specific task. It can be 4674 * This waits to be signaled for completion of a specific task. It can be
4676 * interrupted by a kill signal. 4675 * interrupted by a kill signal.
4677 */ 4676 */
4678 int __sched wait_for_completion_killable(struct completion *x) 4677 int __sched wait_for_completion_killable(struct completion *x)
4679 { 4678 {
4680 long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_KILLABLE); 4679 long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_KILLABLE);
4681 if (t == -ERESTARTSYS) 4680 if (t == -ERESTARTSYS)
4682 return t; 4681 return t;
4683 return 0; 4682 return 0;
4684 } 4683 }
4685 EXPORT_SYMBOL(wait_for_completion_killable); 4684 EXPORT_SYMBOL(wait_for_completion_killable);
4686 4685
4687 /** 4686 /**
4688 * wait_for_completion_killable_timeout: - waits for completion of a task (w/(to,killable)) 4687 * wait_for_completion_killable_timeout: - waits for completion of a task (w/(to,killable))
4689 * @x: holds the state of this particular completion 4688 * @x: holds the state of this particular completion
4690 * @timeout: timeout value in jiffies 4689 * @timeout: timeout value in jiffies
4691 * 4690 *
4692 * This waits for either a completion of a specific task to be 4691 * This waits for either a completion of a specific task to be
4693 * signaled or for a specified timeout to expire. It can be 4692 * signaled or for a specified timeout to expire. It can be
4694 * interrupted by a kill signal. The timeout is in jiffies. 4693 * interrupted by a kill signal. The timeout is in jiffies.
4695 */ 4694 */
4696 long __sched 4695 long __sched
4697 wait_for_completion_killable_timeout(struct completion *x, 4696 wait_for_completion_killable_timeout(struct completion *x,
4698 unsigned long timeout) 4697 unsigned long timeout)
4699 { 4698 {
4700 return wait_for_common(x, timeout, TASK_KILLABLE); 4699 return wait_for_common(x, timeout, TASK_KILLABLE);
4701 } 4700 }
4702 EXPORT_SYMBOL(wait_for_completion_killable_timeout); 4701 EXPORT_SYMBOL(wait_for_completion_killable_timeout);
4703 4702
4704 /** 4703 /**
4705 * try_wait_for_completion - try to decrement a completion without blocking 4704 * try_wait_for_completion - try to decrement a completion without blocking
4706 * @x: completion structure 4705 * @x: completion structure
4707 * 4706 *
4708 * Returns: 0 if a decrement cannot be done without blocking 4707 * Returns: 0 if a decrement cannot be done without blocking
4709 * 1 if a decrement succeeded. 4708 * 1 if a decrement succeeded.
4710 * 4709 *
4711 * If a completion is being used as a counting completion, 4710 * If a completion is being used as a counting completion,
4712 * attempt to decrement the counter without blocking. This 4711 * attempt to decrement the counter without blocking. This
4713 * enables us to avoid waiting if the resource the completion 4712 * enables us to avoid waiting if the resource the completion
4714 * is protecting is not available. 4713 * is protecting is not available.
4715 */ 4714 */
4716 bool try_wait_for_completion(struct completion *x) 4715 bool try_wait_for_completion(struct completion *x)
4717 { 4716 {
4718 unsigned long flags; 4717 unsigned long flags;
4719 int ret = 1; 4718 int ret = 1;
4720 4719
4721 spin_lock_irqsave(&x->wait.lock, flags); 4720 spin_lock_irqsave(&x->wait.lock, flags);
4722 if (!x->done) 4721 if (!x->done)
4723 ret = 0; 4722 ret = 0;
4724 else 4723 else
4725 x->done--; 4724 x->done--;
4726 spin_unlock_irqrestore(&x->wait.lock, flags); 4725 spin_unlock_irqrestore(&x->wait.lock, flags);
4727 return ret; 4726 return ret;
4728 } 4727 }
4729 EXPORT_SYMBOL(try_wait_for_completion); 4728 EXPORT_SYMBOL(try_wait_for_completion);
4730 4729
4731 /** 4730 /**
4732 * completion_done - Test to see if a completion has any waiters 4731 * completion_done - Test to see if a completion has any waiters
4733 * @x: completion structure 4732 * @x: completion structure
4734 * 4733 *
4735 * Returns: 0 if there are waiters (wait_for_completion() in progress) 4734 * Returns: 0 if there are waiters (wait_for_completion() in progress)
4736 * 1 if there are no waiters. 4735 * 1 if there are no waiters.
4737 * 4736 *
4738 */ 4737 */
4739 bool completion_done(struct completion *x) 4738 bool completion_done(struct completion *x)
4740 { 4739 {
4741 unsigned long flags; 4740 unsigned long flags;
4742 int ret = 1; 4741 int ret = 1;
4743 4742
4744 spin_lock_irqsave(&x->wait.lock, flags); 4743 spin_lock_irqsave(&x->wait.lock, flags);
4745 if (!x->done) 4744 if (!x->done)
4746 ret = 0; 4745 ret = 0;
4747 spin_unlock_irqrestore(&x->wait.lock, flags); 4746 spin_unlock_irqrestore(&x->wait.lock, flags);
4748 return ret; 4747 return ret;
4749 } 4748 }
4750 EXPORT_SYMBOL(completion_done); 4749 EXPORT_SYMBOL(completion_done);
4751 4750
4752 static long __sched 4751 static long __sched
4753 sleep_on_common(wait_queue_head_t *q, int state, long timeout) 4752 sleep_on_common(wait_queue_head_t *q, int state, long timeout)
4754 { 4753 {
4755 unsigned long flags; 4754 unsigned long flags;
4756 wait_queue_t wait; 4755 wait_queue_t wait;
4757 4756
4758 init_waitqueue_entry(&wait, current); 4757 init_waitqueue_entry(&wait, current);
4759 4758
4760 __set_current_state(state); 4759 __set_current_state(state);
4761 4760
4762 spin_lock_irqsave(&q->lock, flags); 4761 spin_lock_irqsave(&q->lock, flags);
4763 __add_wait_queue(q, &wait); 4762 __add_wait_queue(q, &wait);
4764 spin_unlock(&q->lock); 4763 spin_unlock(&q->lock);
4765 timeout = schedule_timeout(timeout); 4764 timeout = schedule_timeout(timeout);
4766 spin_lock_irq(&q->lock); 4765 spin_lock_irq(&q->lock);
4767 __remove_wait_queue(q, &wait); 4766 __remove_wait_queue(q, &wait);
4768 spin_unlock_irqrestore(&q->lock, flags); 4767 spin_unlock_irqrestore(&q->lock, flags);
4769 4768
4770 return timeout; 4769 return timeout;
4771 } 4770 }
4772 4771
4773 void __sched interruptible_sleep_on(wait_queue_head_t *q) 4772 void __sched interruptible_sleep_on(wait_queue_head_t *q)
4774 { 4773 {
4775 sleep_on_common(q, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT); 4774 sleep_on_common(q, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
4776 } 4775 }
4777 EXPORT_SYMBOL(interruptible_sleep_on); 4776 EXPORT_SYMBOL(interruptible_sleep_on);
4778 4777
4779 long __sched 4778 long __sched
4780 interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout) 4779 interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout)
4781 { 4780 {
4782 return sleep_on_common(q, TASK_INTERRUPTIBLE, timeout); 4781 return sleep_on_common(q, TASK_INTERRUPTIBLE, timeout);
4783 } 4782 }
4784 EXPORT_SYMBOL(interruptible_sleep_on_timeout); 4783 EXPORT_SYMBOL(interruptible_sleep_on_timeout);
4785 4784
4786 void __sched sleep_on(wait_queue_head_t *q) 4785 void __sched sleep_on(wait_queue_head_t *q)
4787 { 4786 {
4788 sleep_on_common(q, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT); 4787 sleep_on_common(q, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
4789 } 4788 }
4790 EXPORT_SYMBOL(sleep_on); 4789 EXPORT_SYMBOL(sleep_on);
4791 4790
4792 long __sched sleep_on_timeout(wait_queue_head_t *q, long timeout) 4791 long __sched sleep_on_timeout(wait_queue_head_t *q, long timeout)
4793 { 4792 {
4794 return sleep_on_common(q, TASK_UNINTERRUPTIBLE, timeout); 4793 return sleep_on_common(q, TASK_UNINTERRUPTIBLE, timeout);
4795 } 4794 }
4796 EXPORT_SYMBOL(sleep_on_timeout); 4795 EXPORT_SYMBOL(sleep_on_timeout);
4797 4796
4798 #ifdef CONFIG_RT_MUTEXES 4797 #ifdef CONFIG_RT_MUTEXES
4799 4798
4800 /* 4799 /*
4801 * rt_mutex_setprio - set the current priority of a task 4800 * rt_mutex_setprio - set the current priority of a task
4802 * @p: task 4801 * @p: task
4803 * @prio: prio value (kernel-internal form) 4802 * @prio: prio value (kernel-internal form)
4804 * 4803 *
4805 * This function changes the 'effective' priority of a task. It does 4804 * This function changes the 'effective' priority of a task. It does
4806 * not touch ->normal_prio like __setscheduler(). 4805 * not touch ->normal_prio like __setscheduler().
4807 * 4806 *
4808 * Used by the rt_mutex code to implement priority inheritance logic. 4807 * Used by the rt_mutex code to implement priority inheritance logic.
4809 */ 4808 */
4810 void rt_mutex_setprio(struct task_struct *p, int prio) 4809 void rt_mutex_setprio(struct task_struct *p, int prio)
4811 { 4810 {
4812 int oldprio, on_rq, running; 4811 int oldprio, on_rq, running;
4813 struct rq *rq; 4812 struct rq *rq;
4814 const struct sched_class *prev_class; 4813 const struct sched_class *prev_class;
4815 4814
4816 BUG_ON(prio < 0 || prio > MAX_PRIO); 4815 BUG_ON(prio < 0 || prio > MAX_PRIO);
4817 4816
4818 rq = __task_rq_lock(p); 4817 rq = __task_rq_lock(p);
4819 4818
4820 trace_sched_pi_setprio(p, prio); 4819 trace_sched_pi_setprio(p, prio);
4821 oldprio = p->prio; 4820 oldprio = p->prio;
4822 prev_class = p->sched_class; 4821 prev_class = p->sched_class;
4823 on_rq = p->on_rq; 4822 on_rq = p->on_rq;
4824 running = task_current(rq, p); 4823 running = task_current(rq, p);
4825 if (on_rq) 4824 if (on_rq)
4826 dequeue_task(rq, p, 0); 4825 dequeue_task(rq, p, 0);
4827 if (running) 4826 if (running)
4828 p->sched_class->put_prev_task(rq, p); 4827 p->sched_class->put_prev_task(rq, p);
4829 4828
4830 if (rt_prio(prio)) 4829 if (rt_prio(prio))
4831 p->sched_class = &rt_sched_class; 4830 p->sched_class = &rt_sched_class;
4832 else 4831 else
4833 p->sched_class = &fair_sched_class; 4832 p->sched_class = &fair_sched_class;
4834 4833
4835 p->prio = prio; 4834 p->prio = prio;
4836 4835
4837 if (running) 4836 if (running)
4838 p->sched_class->set_curr_task(rq); 4837 p->sched_class->set_curr_task(rq);
4839 if (on_rq) 4838 if (on_rq)
4840 enqueue_task(rq, p, oldprio < prio ? ENQUEUE_HEAD : 0); 4839 enqueue_task(rq, p, oldprio < prio ? ENQUEUE_HEAD : 0);
4841 4840
4842 check_class_changed(rq, p, prev_class, oldprio); 4841 check_class_changed(rq, p, prev_class, oldprio);
4843 __task_rq_unlock(rq); 4842 __task_rq_unlock(rq);
4844 } 4843 }
4845 4844
4846 #endif 4845 #endif
4847 4846
4848 void set_user_nice(struct task_struct *p, long nice) 4847 void set_user_nice(struct task_struct *p, long nice)
4849 { 4848 {
4850 int old_prio, delta, on_rq; 4849 int old_prio, delta, on_rq;
4851 unsigned long flags; 4850 unsigned long flags;
4852 struct rq *rq; 4851 struct rq *rq;
4853 4852
4854 if (TASK_NICE(p) == nice || nice < -20 || nice > 19) 4853 if (TASK_NICE(p) == nice || nice < -20 || nice > 19)
4855 return; 4854 return;
4856 /* 4855 /*
4857 * We have to be careful, if called from sys_setpriority(), 4856 * We have to be careful, if called from sys_setpriority(),
4858 * the task might be in the middle of scheduling on another CPU. 4857 * the task might be in the middle of scheduling on another CPU.
4859 */ 4858 */
4860 rq = task_rq_lock(p, &flags); 4859 rq = task_rq_lock(p, &flags);
4861 /* 4860 /*
4862 * The RT priorities are set via sched_setscheduler(), but we still 4861 * The RT priorities are set via sched_setscheduler(), but we still
4863 * allow the 'normal' nice value to be set - but as expected 4862 * allow the 'normal' nice value to be set - but as expected
4864 * it won't have any effect on scheduling until the task is 4863 * it won't have any effect on scheduling until the task is
4865 * SCHED_FIFO/SCHED_RR: 4864 * SCHED_FIFO/SCHED_RR:
4866 */ 4865 */
4867 if (task_has_rt_policy(p)) { 4866 if (task_has_rt_policy(p)) {
4868 p->static_prio = NICE_TO_PRIO(nice); 4867 p->static_prio = NICE_TO_PRIO(nice);
4869 goto out_unlock; 4868 goto out_unlock;
4870 } 4869 }
4871 on_rq = p->on_rq; 4870 on_rq = p->on_rq;
4872 if (on_rq) 4871 if (on_rq)
4873 dequeue_task(rq, p, 0); 4872 dequeue_task(rq, p, 0);
4874 4873
4875 p->static_prio = NICE_TO_PRIO(nice); 4874 p->static_prio = NICE_TO_PRIO(nice);
4876 set_load_weight(p); 4875 set_load_weight(p);
4877 old_prio = p->prio; 4876 old_prio = p->prio;
4878 p->prio = effective_prio(p); 4877 p->prio = effective_prio(p);
4879 delta = p->prio - old_prio; 4878 delta = p->prio - old_prio;
4880 4879
4881 if (on_rq) { 4880 if (on_rq) {
4882 enqueue_task(rq, p, 0); 4881 enqueue_task(rq, p, 0);
4883 /* 4882 /*
4884 * If the task increased its priority or is running and 4883 * If the task increased its priority or is running and
4885 * lowered its priority, then reschedule its CPU: 4884 * lowered its priority, then reschedule its CPU:
4886 */ 4885 */
4887 if (delta < 0 || (delta > 0 && task_running(rq, p))) 4886 if (delta < 0 || (delta > 0 && task_running(rq, p)))
4888 resched_task(rq->curr); 4887 resched_task(rq->curr);
4889 } 4888 }
4890 out_unlock: 4889 out_unlock:
4891 task_rq_unlock(rq, p, &flags); 4890 task_rq_unlock(rq, p, &flags);
4892 } 4891 }
4893 EXPORT_SYMBOL(set_user_nice); 4892 EXPORT_SYMBOL(set_user_nice);
4894 4893
4895 /* 4894 /*
4896 * can_nice - check if a task can reduce its nice value 4895 * can_nice - check if a task can reduce its nice value
4897 * @p: task 4896 * @p: task
4898 * @nice: nice value 4897 * @nice: nice value
4899 */ 4898 */
4900 int can_nice(const struct task_struct *p, const int nice) 4899 int can_nice(const struct task_struct *p, const int nice)
4901 { 4900 {
4902 /* convert nice value [19,-20] to rlimit style value [1,40] */ 4901 /* convert nice value [19,-20] to rlimit style value [1,40] */
4903 int nice_rlim = 20 - nice; 4902 int nice_rlim = 20 - nice;
4904 4903
4905 return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) || 4904 return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) ||
4906 capable(CAP_SYS_NICE)); 4905 capable(CAP_SYS_NICE));
4907 } 4906 }
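The [19,-20] to [1,40] mapping used by can_nice() can be checked with ordinary integers: nice 19 maps to 1, nice -20 maps to 40, and a soft RLIMIT_NICE of 25 therefore allows nice values down to -5. A small user-space sketch of that arithmetic, with an assumed limit of 25:

#include <stdio.h>

int main(void)
{
	long rlimit_nice = 25;			/* assumed RLIMIT_NICE soft limit */
	int nice;

	for (nice = -20; nice <= 19; nice++) {
		int nice_rlim = 20 - nice;	/* same mapping as can_nice() */

		if (nice_rlim <= rlimit_nice)
			printf("nice %d allowed (rlimit style %d)\n",
			       nice, nice_rlim);
	}
	return 0;	/* prints nice -5 through 19 */
}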
4908 4907
4909 #ifdef __ARCH_WANT_SYS_NICE 4908 #ifdef __ARCH_WANT_SYS_NICE
4910 4909
4911 /* 4910 /*
4912 * sys_nice - change the priority of the current process. 4911 * sys_nice - change the priority of the current process.
4913 * @increment: priority increment 4912 * @increment: priority increment
4914 * 4913 *
4915 * sys_setpriority is a more generic, but much slower function that 4914 * sys_setpriority is a more generic, but much slower function that
4916 * does similar things. 4915 * does similar things.
4917 */ 4916 */
4918 SYSCALL_DEFINE1(nice, int, increment) 4917 SYSCALL_DEFINE1(nice, int, increment)
4919 { 4918 {
4920 long nice, retval; 4919 long nice, retval;
4921 4920
4922 /* 4921 /*
4923 * Setpriority might change our priority at the same moment. 4922 * Setpriority might change our priority at the same moment.
4924 * We don't have to worry. Conceptually one call occurs first 4923 * We don't have to worry. Conceptually one call occurs first
4925 * and we have a single winner. 4924 * and we have a single winner.
4926 */ 4925 */
4927 if (increment < -40) 4926 if (increment < -40)
4928 increment = -40; 4927 increment = -40;
4929 if (increment > 40) 4928 if (increment > 40)
4930 increment = 40; 4929 increment = 40;
4931 4930
4932 nice = TASK_NICE(current) + increment; 4931 nice = TASK_NICE(current) + increment;
4933 if (nice < -20) 4932 if (nice < -20)
4934 nice = -20; 4933 nice = -20;
4935 if (nice > 19) 4934 if (nice > 19)
4936 nice = 19; 4935 nice = 19;
4937 4936
4938 if (increment < 0 && !can_nice(current, nice)) 4937 if (increment < 0 && !can_nice(current, nice))
4939 return -EPERM; 4938 return -EPERM;
4940 4939
4941 retval = security_task_setnice(current, nice); 4940 retval = security_task_setnice(current, nice);
4942 if (retval) 4941 if (retval)
4943 return retval; 4942 return retval;
4944 4943
4945 set_user_nice(current, nice); 4944 set_user_nice(current, nice);
4946 return 0; 4945 return 0;
4947 } 4946 }
4948 4947
4949 #endif 4948 #endif
4950 4949
4951 /** 4950 /**
4952 * task_prio - return the priority value of a given task. 4951 * task_prio - return the priority value of a given task.
4953 * @p: the task in question. 4952 * @p: the task in question.
4954 * 4953 *
4955 * This is the priority value as seen by users in /proc. 4954 * This is the priority value as seen by users in /proc.
4956 * RT tasks are offset by -200. Normal tasks are centered 4955 * RT tasks are offset by -200. Normal tasks are centered
4957 * around 0, with values going from -16 to +15. 4956 * around 0, with values going from -16 to +15.
4958 */ 4957 */
4959 int task_prio(const struct task_struct *p) 4958 int task_prio(const struct task_struct *p)
4960 { 4959 {
4961 return p->prio - MAX_RT_PRIO; 4960 return p->prio - MAX_RT_PRIO;
4962 } 4961 }
4963 4962
4964 /** 4963 /**
4965 * task_nice - return the nice value of a given task. 4964 * task_nice - return the nice value of a given task.
4966 * @p: the task in question. 4965 * @p: the task in question.
4967 */ 4966 */
4968 int task_nice(const struct task_struct *p) 4967 int task_nice(const struct task_struct *p)
4969 { 4968 {
4970 return TASK_NICE(p); 4969 return TASK_NICE(p);
4971 } 4970 }
4972 EXPORT_SYMBOL(task_nice); 4971 EXPORT_SYMBOL(task_nice);
4973 4972
4974 /** 4973 /**
4975 * idle_cpu - is a given cpu idle currently? 4974 * idle_cpu - is a given cpu idle currently?
4976 * @cpu: the processor in question. 4975 * @cpu: the processor in question.
4977 */ 4976 */
4978 int idle_cpu(int cpu) 4977 int idle_cpu(int cpu)
4979 { 4978 {
4980 return cpu_curr(cpu) == cpu_rq(cpu)->idle; 4979 return cpu_curr(cpu) == cpu_rq(cpu)->idle;
4981 } 4980 }
4982 4981
4983 /** 4982 /**
4984 * idle_task - return the idle task for a given cpu. 4983 * idle_task - return the idle task for a given cpu.
4985 * @cpu: the processor in question. 4984 * @cpu: the processor in question.
4986 */ 4985 */
4987 struct task_struct *idle_task(int cpu) 4986 struct task_struct *idle_task(int cpu)
4988 { 4987 {
4989 return cpu_rq(cpu)->idle; 4988 return cpu_rq(cpu)->idle;
4990 } 4989 }
4991 4990
4992 /** 4991 /**
4993 * find_process_by_pid - find a process with a matching PID value. 4992 * find_process_by_pid - find a process with a matching PID value.
4994 * @pid: the pid in question. 4993 * @pid: the pid in question.
4995 */ 4994 */
4996 static struct task_struct *find_process_by_pid(pid_t pid) 4995 static struct task_struct *find_process_by_pid(pid_t pid)
4997 { 4996 {
4998 return pid ? find_task_by_vpid(pid) : current; 4997 return pid ? find_task_by_vpid(pid) : current;
4999 } 4998 }
5000 4999
5001 /* Actually do priority change: must hold rq lock. */ 5000 /* Actually do priority change: must hold rq lock. */
5002 static void 5001 static void
5003 __setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio) 5002 __setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio)
5004 { 5003 {
5005 p->policy = policy; 5004 p->policy = policy;
5006 p->rt_priority = prio; 5005 p->rt_priority = prio;
5007 p->normal_prio = normal_prio(p); 5006 p->normal_prio = normal_prio(p);
5008 /* we are holding p->pi_lock already */ 5007 /* we are holding p->pi_lock already */
5009 p->prio = rt_mutex_getprio(p); 5008 p->prio = rt_mutex_getprio(p);
5010 if (rt_prio(p->prio)) 5009 if (rt_prio(p->prio))
5011 p->sched_class = &rt_sched_class; 5010 p->sched_class = &rt_sched_class;
5012 else 5011 else
5013 p->sched_class = &fair_sched_class; 5012 p->sched_class = &fair_sched_class;
5014 set_load_weight(p); 5013 set_load_weight(p);
5015 } 5014 }
5016 5015
5017 /* 5016 /*
5018 * check whether the target process has a UID that matches the current process's 5017 * check whether the target process has a UID that matches the current process's
5019 */ 5018 */
5020 static bool check_same_owner(struct task_struct *p) 5019 static bool check_same_owner(struct task_struct *p)
5021 { 5020 {
5022 const struct cred *cred = current_cred(), *pcred; 5021 const struct cred *cred = current_cred(), *pcred;
5023 bool match; 5022 bool match;
5024 5023
5025 rcu_read_lock(); 5024 rcu_read_lock();
5026 pcred = __task_cred(p); 5025 pcred = __task_cred(p);
5027 if (cred->user->user_ns == pcred->user->user_ns) 5026 if (cred->user->user_ns == pcred->user->user_ns)
5028 match = (cred->euid == pcred->euid || 5027 match = (cred->euid == pcred->euid ||
5029 cred->euid == pcred->uid); 5028 cred->euid == pcred->uid);
5030 else 5029 else
5031 match = false; 5030 match = false;
5032 rcu_read_unlock(); 5031 rcu_read_unlock();
5033 return match; 5032 return match;
5034 } 5033 }
5035 5034
5036 static int __sched_setscheduler(struct task_struct *p, int policy, 5035 static int __sched_setscheduler(struct task_struct *p, int policy,
5037 const struct sched_param *param, bool user) 5036 const struct sched_param *param, bool user)
5038 { 5037 {
5039 int retval, oldprio, oldpolicy = -1, on_rq, running; 5038 int retval, oldprio, oldpolicy = -1, on_rq, running;
5040 unsigned long flags; 5039 unsigned long flags;
5041 const struct sched_class *prev_class; 5040 const struct sched_class *prev_class;
5042 struct rq *rq; 5041 struct rq *rq;
5043 int reset_on_fork; 5042 int reset_on_fork;
5044 5043
5045 /* may grab non-irq protected spin_locks */ 5044 /* may grab non-irq protected spin_locks */
5046 BUG_ON(in_interrupt()); 5045 BUG_ON(in_interrupt());
5047 recheck: 5046 recheck:
5048 /* double check policy once rq lock held */ 5047 /* double check policy once rq lock held */
5049 if (policy < 0) { 5048 if (policy < 0) {
5050 reset_on_fork = p->sched_reset_on_fork; 5049 reset_on_fork = p->sched_reset_on_fork;
5051 policy = oldpolicy = p->policy; 5050 policy = oldpolicy = p->policy;
5052 } else { 5051 } else {
5053 reset_on_fork = !!(policy & SCHED_RESET_ON_FORK); 5052 reset_on_fork = !!(policy & SCHED_RESET_ON_FORK);
5054 policy &= ~SCHED_RESET_ON_FORK; 5053 policy &= ~SCHED_RESET_ON_FORK;
5055 5054
5056 if (policy != SCHED_FIFO && policy != SCHED_RR && 5055 if (policy != SCHED_FIFO && policy != SCHED_RR &&
5057 policy != SCHED_NORMAL && policy != SCHED_BATCH && 5056 policy != SCHED_NORMAL && policy != SCHED_BATCH &&
5058 policy != SCHED_IDLE) 5057 policy != SCHED_IDLE)
5059 return -EINVAL; 5058 return -EINVAL;
5060 } 5059 }
5061 5060
5062 /* 5061 /*
5063 * Valid priorities for SCHED_FIFO and SCHED_RR are 5062 * Valid priorities for SCHED_FIFO and SCHED_RR are
5064 * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL, 5063 * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL,
5065 * SCHED_BATCH and SCHED_IDLE is 0. 5064 * SCHED_BATCH and SCHED_IDLE is 0.
5066 */ 5065 */
5067 if (param->sched_priority < 0 || 5066 if (param->sched_priority < 0 ||
5068 (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) || 5067 (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) ||
5069 (!p->mm && param->sched_priority > MAX_RT_PRIO-1)) 5068 (!p->mm && param->sched_priority > MAX_RT_PRIO-1))
5070 return -EINVAL; 5069 return -EINVAL;
5071 if (rt_policy(policy) != (param->sched_priority != 0)) 5070 if (rt_policy(policy) != (param->sched_priority != 0))
5072 return -EINVAL; 5071 return -EINVAL;
5073 5072
5074 /* 5073 /*
5075 * Allow unprivileged RT tasks to decrease priority: 5074 * Allow unprivileged RT tasks to decrease priority:
5076 */ 5075 */
5077 if (user && !capable(CAP_SYS_NICE)) { 5076 if (user && !capable(CAP_SYS_NICE)) {
5078 if (rt_policy(policy)) { 5077 if (rt_policy(policy)) {
5079 unsigned long rlim_rtprio = 5078 unsigned long rlim_rtprio =
5080 task_rlimit(p, RLIMIT_RTPRIO); 5079 task_rlimit(p, RLIMIT_RTPRIO);
5081 5080
5082 /* can't set/change the rt policy */ 5081 /* can't set/change the rt policy */
5083 if (policy != p->policy && !rlim_rtprio) 5082 if (policy != p->policy && !rlim_rtprio)
5084 return -EPERM; 5083 return -EPERM;
5085 5084
5086 /* can't increase priority */ 5085 /* can't increase priority */
5087 if (param->sched_priority > p->rt_priority && 5086 if (param->sched_priority > p->rt_priority &&
5088 param->sched_priority > rlim_rtprio) 5087 param->sched_priority > rlim_rtprio)
5089 return -EPERM; 5088 return -EPERM;
5090 } 5089 }
5091 5090
5092 /* 5091 /*
5093 * Treat SCHED_IDLE as nice 20. Only allow a switch to 5092 * Treat SCHED_IDLE as nice 20. Only allow a switch to
5094 * SCHED_NORMAL if the RLIMIT_NICE would normally permit it. 5093 * SCHED_NORMAL if the RLIMIT_NICE would normally permit it.
5095 */ 5094 */
5096 if (p->policy == SCHED_IDLE && policy != SCHED_IDLE) { 5095 if (p->policy == SCHED_IDLE && policy != SCHED_IDLE) {
5097 if (!can_nice(p, TASK_NICE(p))) 5096 if (!can_nice(p, TASK_NICE(p)))
5098 return -EPERM; 5097 return -EPERM;
5099 } 5098 }
5100 5099
5101 /* can't change other user's priorities */ 5100 /* can't change other user's priorities */
5102 if (!check_same_owner(p)) 5101 if (!check_same_owner(p))
5103 return -EPERM; 5102 return -EPERM;
5104 5103
5105 /* Normal users shall not reset the sched_reset_on_fork flag */ 5104 /* Normal users shall not reset the sched_reset_on_fork flag */
5106 if (p->sched_reset_on_fork && !reset_on_fork) 5105 if (p->sched_reset_on_fork && !reset_on_fork)
5107 return -EPERM; 5106 return -EPERM;
5108 } 5107 }
5109 5108
5110 if (user) { 5109 if (user) {
5111 retval = security_task_setscheduler(p); 5110 retval = security_task_setscheduler(p);
5112 if (retval) 5111 if (retval)
5113 return retval; 5112 return retval;
5114 } 5113 }
5115 5114
5116 /* 5115 /*
5117 * make sure no PI-waiters arrive (or leave) while we are 5116 * make sure no PI-waiters arrive (or leave) while we are
5118 * changing the priority of the task: 5117 * changing the priority of the task:
5119 * 5118 *
5120 * To be able to change p->policy safely, the appropriate 5119 * To be able to change p->policy safely, the appropriate
5121 * runqueue lock must be held. 5120 * runqueue lock must be held.
5122 */ 5121 */
5123 rq = task_rq_lock(p, &flags); 5122 rq = task_rq_lock(p, &flags);
5124 5123
5125 /* 5124 /*
5126 * Changing the policy of the stop threads is a very bad idea 5125 * Changing the policy of the stop threads is a very bad idea
5127 */ 5126 */
5128 if (p == rq->stop) { 5127 if (p == rq->stop) {
5129 task_rq_unlock(rq, p, &flags); 5128 task_rq_unlock(rq, p, &flags);
5130 return -EINVAL; 5129 return -EINVAL;
5131 } 5130 }
5132 5131
5133 /* 5132 /*
5134 * If not changing anything there's no need to proceed further: 5133 * If not changing anything there's no need to proceed further:
5135 */ 5134 */
5136 if (unlikely(policy == p->policy && (!rt_policy(policy) || 5135 if (unlikely(policy == p->policy && (!rt_policy(policy) ||
5137 param->sched_priority == p->rt_priority))) { 5136 param->sched_priority == p->rt_priority))) {
5138 5137
5139 __task_rq_unlock(rq); 5138 __task_rq_unlock(rq);
5140 raw_spin_unlock_irqrestore(&p->pi_lock, flags); 5139 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
5141 return 0; 5140 return 0;
5142 } 5141 }
5143 5142
5144 #ifdef CONFIG_RT_GROUP_SCHED 5143 #ifdef CONFIG_RT_GROUP_SCHED
5145 if (user) { 5144 if (user) {
5146 /* 5145 /*
5147 * Do not allow realtime tasks into groups that have no runtime 5146 * Do not allow realtime tasks into groups that have no runtime
5148 * assigned. 5147 * assigned.
5149 */ 5148 */
5150 if (rt_bandwidth_enabled() && rt_policy(policy) && 5149 if (rt_bandwidth_enabled() && rt_policy(policy) &&
5151 task_group(p)->rt_bandwidth.rt_runtime == 0 && 5150 task_group(p)->rt_bandwidth.rt_runtime == 0 &&
5152 !task_group_is_autogroup(task_group(p))) { 5151 !task_group_is_autogroup(task_group(p))) {
5153 task_rq_unlock(rq, p, &flags); 5152 task_rq_unlock(rq, p, &flags);
5154 return -EPERM; 5153 return -EPERM;
5155 } 5154 }
5156 } 5155 }
5157 #endif 5156 #endif
5158 5157
5159 /* recheck policy now with rq lock held */ 5158 /* recheck policy now with rq lock held */
5160 if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { 5159 if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
5161 policy = oldpolicy = -1; 5160 policy = oldpolicy = -1;
5162 task_rq_unlock(rq, p, &flags); 5161 task_rq_unlock(rq, p, &flags);
5163 goto recheck; 5162 goto recheck;
5164 } 5163 }
5165 on_rq = p->on_rq; 5164 on_rq = p->on_rq;
5166 running = task_current(rq, p); 5165 running = task_current(rq, p);
5167 if (on_rq) 5166 if (on_rq)
5168 deactivate_task(rq, p, 0); 5167 deactivate_task(rq, p, 0);
5169 if (running) 5168 if (running)
5170 p->sched_class->put_prev_task(rq, p); 5169 p->sched_class->put_prev_task(rq, p);
5171 5170
5172 p->sched_reset_on_fork = reset_on_fork; 5171 p->sched_reset_on_fork = reset_on_fork;
5173 5172
5174 oldprio = p->prio; 5173 oldprio = p->prio;
5175 prev_class = p->sched_class; 5174 prev_class = p->sched_class;
5176 __setscheduler(rq, p, policy, param->sched_priority); 5175 __setscheduler(rq, p, policy, param->sched_priority);
5177 5176
5178 if (running) 5177 if (running)
5179 p->sched_class->set_curr_task(rq); 5178 p->sched_class->set_curr_task(rq);
5180 if (on_rq) 5179 if (on_rq)
5181 activate_task(rq, p, 0); 5180 activate_task(rq, p, 0);
5182 5181
5183 check_class_changed(rq, p, prev_class, oldprio); 5182 check_class_changed(rq, p, prev_class, oldprio);
5184 task_rq_unlock(rq, p, &flags); 5183 task_rq_unlock(rq, p, &flags);
5185 5184
5186 rt_mutex_adjust_pi(p); 5185 rt_mutex_adjust_pi(p);
5187 5186
5188 return 0; 5187 return 0;
5189 } 5188 }
5190 5189
5191 /** 5190 /**
5192 * sched_setscheduler - change the scheduling policy and/or RT priority of a thread. 5191 * sched_setscheduler - change the scheduling policy and/or RT priority of a thread.
5193 * @p: the task in question. 5192 * @p: the task in question.
5194 * @policy: new policy. 5193 * @policy: new policy.
5195 * @param: structure containing the new RT priority. 5194 * @param: structure containing the new RT priority.
5196 * 5195 *
5197 * NOTE that the task may be already dead. 5196 * NOTE that the task may be already dead.
5198 */ 5197 */
5199 int sched_setscheduler(struct task_struct *p, int policy, 5198 int sched_setscheduler(struct task_struct *p, int policy,
5200 const struct sched_param *param) 5199 const struct sched_param *param)
5201 { 5200 {
5202 return __sched_setscheduler(p, policy, param, true); 5201 return __sched_setscheduler(p, policy, param, true);
5203 } 5202 }
5204 EXPORT_SYMBOL_GPL(sched_setscheduler); 5203 EXPORT_SYMBOL_GPL(sched_setscheduler);
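
/*
 * Illustrative kernel-style sketch (hypothetical caller, not from this
 * file): promoting a kernel thread to SCHED_FIFO from inside the
 * kernel. Permission checks against the calling context still apply
 * here; the _nocheck variant below exists for callers that must skip
 * them.
 */
static void make_thread_rt(struct task_struct *tsk)
{
	struct sched_param sp = { .sched_priority = MAX_RT_PRIO / 2 };

	if (sched_setscheduler(tsk, SCHED_FIFO, &sp))
		printk(KERN_WARNING "could not switch %s to SCHED_FIFO\n",
		       tsk->comm);
}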
5205 5204
5206 /** 5205 /**
5207 * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace. 5206 * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace.
5208 * @p: the task in question. 5207 * @p: the task in question.
5209 * @policy: new policy. 5208 * @policy: new policy.
5210 * @param: structure containing the new RT priority. 5209 * @param: structure containing the new RT priority.
5211 * 5210 *
5212 * Just like sched_setscheduler, only don't bother checking if the 5211 * Just like sched_setscheduler, only don't bother checking if the
5213 * current context has permission. For example, this is needed in 5212 * current context has permission. For example, this is needed in
5214 * stop_machine(): we create temporary high priority worker threads, 5213 * stop_machine(): we create temporary high priority worker threads,
5215 * but our caller might not have that capability. 5214 * but our caller might not have that capability.
5216 */ 5215 */
5217 int sched_setscheduler_nocheck(struct task_struct *p, int policy, 5216 int sched_setscheduler_nocheck(struct task_struct *p, int policy,
5218 const struct sched_param *param) 5217 const struct sched_param *param)
5219 { 5218 {
5220 return __sched_setscheduler(p, policy, param, false); 5219 return __sched_setscheduler(p, policy, param, false);
5221 } 5220 }
5222 5221
5223 static int 5222 static int
5224 do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) 5223 do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
5225 { 5224 {
5226 struct sched_param lparam; 5225 struct sched_param lparam;
5227 struct task_struct *p; 5226 struct task_struct *p;
5228 int retval; 5227 int retval;
5229 5228
5230 if (!param || pid < 0) 5229 if (!param || pid < 0)
5231 return -EINVAL; 5230 return -EINVAL;
5232 if (copy_from_user(&lparam, param, sizeof(struct sched_param))) 5231 if (copy_from_user(&lparam, param, sizeof(struct sched_param)))
5233 return -EFAULT; 5232 return -EFAULT;
5234 5233
5235 rcu_read_lock(); 5234 rcu_read_lock();
5236 retval = -ESRCH; 5235 retval = -ESRCH;
5237 p = find_process_by_pid(pid); 5236 p = find_process_by_pid(pid);
5238 if (p != NULL) 5237 if (p != NULL)
5239 retval = sched_setscheduler(p, policy, &lparam); 5238 retval = sched_setscheduler(p, policy, &lparam);
5240 rcu_read_unlock(); 5239 rcu_read_unlock();
5241 5240
5242 return retval; 5241 return retval;
5243 } 5242 }
5244 5243
5245 /** 5244 /**
5246 * sys_sched_setscheduler - set/change the scheduler policy and RT priority 5245 * sys_sched_setscheduler - set/change the scheduler policy and RT priority
5247 * @pid: the pid in question. 5246 * @pid: the pid in question.
5248 * @policy: new policy. 5247 * @policy: new policy.
5249 * @param: structure containing the new RT priority. 5248 * @param: structure containing the new RT priority.
5250 */ 5249 */
5251 SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy, 5250 SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy,
5252 struct sched_param __user *, param) 5251 struct sched_param __user *, param)
5253 { 5252 {
5254 /* negative values for policy are not valid */ 5253 /* negative values for policy are not valid */
5255 if (policy < 0) 5254 if (policy < 0)
5256 return -EINVAL; 5255 return -EINVAL;
5257 5256
5258 return do_sched_setscheduler(pid, policy, param); 5257 return do_sched_setscheduler(pid, policy, param);
5259 } 5258 }
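
/*
 * Illustrative userspace sketch: switching the calling process to
 * SCHED_FIFO through sched_setscheduler(2). RT policies require a
 * priority in 1..sched_get_priority_max(SCHED_FIFO) and, for
 * unprivileged callers, a sufficient RLIMIT_RTPRIO (see the checks in
 * __sched_setscheduler() above).
 */
#include <sched.h>
#include <stdio.h>

int main(void)
{
	struct sched_param sp = { .sched_priority = 10 };	/* example value */

	if (sched_setscheduler(0, SCHED_FIFO, &sp))
		perror("sched_setscheduler");
	else
		printf("now SCHED_FIFO, priority %d\n", sp.sched_priority);
	return 0;
}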
5260 5259
5261 /** 5260 /**
5262 * sys_sched_setparam - set/change the RT priority of a thread 5261 * sys_sched_setparam - set/change the RT priority of a thread
5263 * @pid: the pid in question. 5262 * @pid: the pid in question.
5264 * @param: structure containing the new RT priority. 5263 * @param: structure containing the new RT priority.
5265 */ 5264 */
5266 SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param) 5265 SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param)
5267 { 5266 {
5268 return do_sched_setscheduler(pid, -1, param); 5267 return do_sched_setscheduler(pid, -1, param);
5269 } 5268 }
5270 5269
5271 /** 5270 /**
5272 * sys_sched_getscheduler - get the policy (scheduling class) of a thread 5271 * sys_sched_getscheduler - get the policy (scheduling class) of a thread
5273 * @pid: the pid in question. 5272 * @pid: the pid in question.
5274 */ 5273 */
5275 SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid) 5274 SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid)
5276 { 5275 {
5277 struct task_struct *p; 5276 struct task_struct *p;
5278 int retval; 5277 int retval;
5279 5278
5280 if (pid < 0) 5279 if (pid < 0)
5281 return -EINVAL; 5280 return -EINVAL;
5282 5281
5283 retval = -ESRCH; 5282 retval = -ESRCH;
5284 rcu_read_lock(); 5283 rcu_read_lock();
5285 p = find_process_by_pid(pid); 5284 p = find_process_by_pid(pid);
5286 if (p) { 5285 if (p) {
5287 retval = security_task_getscheduler(p); 5286 retval = security_task_getscheduler(p);
5288 if (!retval) 5287 if (!retval)
5289 retval = p->policy 5288 retval = p->policy
5290 | (p->sched_reset_on_fork ? SCHED_RESET_ON_FORK : 0); 5289 | (p->sched_reset_on_fork ? SCHED_RESET_ON_FORK : 0);
5291 } 5290 }
5292 rcu_read_unlock(); 5291 rcu_read_unlock();
5293 return retval; 5292 return retval;
5294 } 5293 }
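
/*
 * Illustrative userspace sketch: reading the policy back. As shown
 * above, the returned value may have SCHED_RESET_ON_FORK ORed in, so
 * mask that bit off before comparing against SCHED_FIFO and friends.
 */
#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>

#ifndef SCHED_RESET_ON_FORK
#define SCHED_RESET_ON_FORK	0x40000000
#endif

int main(void)
{
	int ret = sched_getscheduler(0);

	if (ret < 0) {
		perror("sched_getscheduler");
		return 1;
	}
	printf("policy %d, reset-on-fork %d\n",
	       ret & ~SCHED_RESET_ON_FORK, !!(ret & SCHED_RESET_ON_FORK));
	return 0;
}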
5295 5294
5296 /** 5295 /**
5297 * sys_sched_getparam - get the RT priority of a thread 5296 * sys_sched_getparam - get the RT priority of a thread
5298 * @pid: the pid in question. 5297 * @pid: the pid in question.
5299 * @param: structure containing the RT priority. 5298 * @param: structure containing the RT priority.
5300 */ 5299 */
5301 SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param) 5300 SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)
5302 { 5301 {
5303 struct sched_param lp; 5302 struct sched_param lp;
5304 struct task_struct *p; 5303 struct task_struct *p;
5305 int retval; 5304 int retval;
5306 5305
5307 if (!param || pid < 0) 5306 if (!param || pid < 0)
5308 return -EINVAL; 5307 return -EINVAL;
5309 5308
5310 rcu_read_lock(); 5309 rcu_read_lock();
5311 p = find_process_by_pid(pid); 5310 p = find_process_by_pid(pid);
5312 retval = -ESRCH; 5311 retval = -ESRCH;
5313 if (!p) 5312 if (!p)
5314 goto out_unlock; 5313 goto out_unlock;
5315 5314
5316 retval = security_task_getscheduler(p); 5315 retval = security_task_getscheduler(p);
5317 if (retval) 5316 if (retval)
5318 goto out_unlock; 5317 goto out_unlock;
5319 5318
5320 lp.sched_priority = p->rt_priority; 5319 lp.sched_priority = p->rt_priority;
5321 rcu_read_unlock(); 5320 rcu_read_unlock();
5322 5321
5323 /* 5322 /*
5324 * This one might sleep, we cannot do it with a spinlock held ... 5323 * This one might sleep, we cannot do it with a spinlock held ...
5325 */ 5324 */
5326 retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0; 5325 retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0;
5327 5326
5328 return retval; 5327 return retval;
5329 5328
5330 out_unlock: 5329 out_unlock:
5331 rcu_read_unlock(); 5330 rcu_read_unlock();
5332 return retval; 5331 return retval;
5333 } 5332 }
5334 5333
5335 long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) 5334 long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
5336 { 5335 {
5337 cpumask_var_t cpus_allowed, new_mask; 5336 cpumask_var_t cpus_allowed, new_mask;
5338 struct task_struct *p; 5337 struct task_struct *p;
5339 int retval; 5338 int retval;
5340 5339
5341 get_online_cpus(); 5340 get_online_cpus();
5342 rcu_read_lock(); 5341 rcu_read_lock();
5343 5342
5344 p = find_process_by_pid(pid); 5343 p = find_process_by_pid(pid);
5345 if (!p) { 5344 if (!p) {
5346 rcu_read_unlock(); 5345 rcu_read_unlock();
5347 put_online_cpus(); 5346 put_online_cpus();
5348 return -ESRCH; 5347 return -ESRCH;
5349 } 5348 }
5350 5349
5351 /* Prevent p going away */ 5350 /* Prevent p going away */
5352 get_task_struct(p); 5351 get_task_struct(p);
5353 rcu_read_unlock(); 5352 rcu_read_unlock();
5354 5353
5355 if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) { 5354 if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) {
5356 retval = -ENOMEM; 5355 retval = -ENOMEM;
5357 goto out_put_task; 5356 goto out_put_task;
5358 } 5357 }
5359 if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) { 5358 if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) {
5360 retval = -ENOMEM; 5359 retval = -ENOMEM;
5361 goto out_free_cpus_allowed; 5360 goto out_free_cpus_allowed;
5362 } 5361 }
5363 retval = -EPERM; 5362 retval = -EPERM;
5364 if (!check_same_owner(p) && !task_ns_capable(p, CAP_SYS_NICE)) 5363 if (!check_same_owner(p) && !task_ns_capable(p, CAP_SYS_NICE))
5365 goto out_unlock; 5364 goto out_unlock;
5366 5365
5367 retval = security_task_setscheduler(p); 5366 retval = security_task_setscheduler(p);
5368 if (retval) 5367 if (retval)
5369 goto out_unlock; 5368 goto out_unlock;
5370 5369
5371 cpuset_cpus_allowed(p, cpus_allowed); 5370 cpuset_cpus_allowed(p, cpus_allowed);
5372 cpumask_and(new_mask, in_mask, cpus_allowed); 5371 cpumask_and(new_mask, in_mask, cpus_allowed);
5373 again: 5372 again:
5374 retval = set_cpus_allowed_ptr(p, new_mask); 5373 retval = set_cpus_allowed_ptr(p, new_mask);
5375 5374
5376 if (!retval) { 5375 if (!retval) {
5377 cpuset_cpus_allowed(p, cpus_allowed); 5376 cpuset_cpus_allowed(p, cpus_allowed);
5378 if (!cpumask_subset(new_mask, cpus_allowed)) { 5377 if (!cpumask_subset(new_mask, cpus_allowed)) {
5379 /* 5378 /*
5380 * We must have raced with a concurrent cpuset 5379 * We must have raced with a concurrent cpuset
5381 * update. Just reset the cpus_allowed to the 5380 * update. Just reset the cpus_allowed to the
5382 * cpuset's cpus_allowed 5381 * cpuset's cpus_allowed
5383 */ 5382 */
5384 cpumask_copy(new_mask, cpus_allowed); 5383 cpumask_copy(new_mask, cpus_allowed);
5385 goto again; 5384 goto again;
5386 } 5385 }
5387 } 5386 }
5388 out_unlock: 5387 out_unlock:
5389 free_cpumask_var(new_mask); 5388 free_cpumask_var(new_mask);
5390 out_free_cpus_allowed: 5389 out_free_cpus_allowed:
5391 free_cpumask_var(cpus_allowed); 5390 free_cpumask_var(cpus_allowed);
5392 out_put_task: 5391 out_put_task:
5393 put_task_struct(p); 5392 put_task_struct(p);
5394 put_online_cpus(); 5393 put_online_cpus();
5395 return retval; 5394 return retval;
5396 } 5395 }
5397 5396
5398 static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len, 5397 static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len,
5399 struct cpumask *new_mask) 5398 struct cpumask *new_mask)
5400 { 5399 {
5401 if (len < cpumask_size()) 5400 if (len < cpumask_size())
5402 cpumask_clear(new_mask); 5401 cpumask_clear(new_mask);
5403 else if (len > cpumask_size()) 5402 else if (len > cpumask_size())
5404 len = cpumask_size(); 5403 len = cpumask_size();
5405 5404
5406 return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0; 5405 return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0;
5407 } 5406 }
5408 5407
5409 /** 5408 /**
5410 * sys_sched_setaffinity - set the cpu affinity of a process 5409 * sys_sched_setaffinity - set the cpu affinity of a process
5411 * @pid: pid of the process 5410 * @pid: pid of the process
5412 * @len: length in bytes of the bitmask pointed to by user_mask_ptr 5411 * @len: length in bytes of the bitmask pointed to by user_mask_ptr
5413 * @user_mask_ptr: user-space pointer to the new cpu mask 5412 * @user_mask_ptr: user-space pointer to the new cpu mask
5414 */ 5413 */
5415 SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len, 5414 SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len,
5416 unsigned long __user *, user_mask_ptr) 5415 unsigned long __user *, user_mask_ptr)
5417 { 5416 {
5418 cpumask_var_t new_mask; 5417 cpumask_var_t new_mask;
5419 int retval; 5418 int retval;
5420 5419
5421 if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) 5420 if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
5422 return -ENOMEM; 5421 return -ENOMEM;
5423 5422
5424 retval = get_user_cpu_mask(user_mask_ptr, len, new_mask); 5423 retval = get_user_cpu_mask(user_mask_ptr, len, new_mask);
5425 if (retval == 0) 5424 if (retval == 0)
5426 retval = sched_setaffinity(pid, new_mask); 5425 retval = sched_setaffinity(pid, new_mask);
5427 free_cpumask_var(new_mask); 5426 free_cpumask_var(new_mask);
5428 return retval; 5427 return retval;
5429 } 5428 }
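
/*
 * Illustrative userspace sketch: pinning the calling thread to CPU 0
 * with sched_setaffinity(2). As in sched_setaffinity() above, the
 * kernel intersects the requested mask with the task's cpuset before
 * applying it.
 */
#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>

int main(void)
{
	cpu_set_t set;

	CPU_ZERO(&set);
	CPU_SET(0, &set);
	if (sched_setaffinity(0, sizeof(set), &set))
		perror("sched_setaffinity");
	return 0;
}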
5430 5429
5431 long sched_getaffinity(pid_t pid, struct cpumask *mask) 5430 long sched_getaffinity(pid_t pid, struct cpumask *mask)
5432 { 5431 {
5433 struct task_struct *p; 5432 struct task_struct *p;
5434 unsigned long flags; 5433 unsigned long flags;
5435 int retval; 5434 int retval;
5436 5435
5437 get_online_cpus(); 5436 get_online_cpus();
5438 rcu_read_lock(); 5437 rcu_read_lock();
5439 5438
5440 retval = -ESRCH; 5439 retval = -ESRCH;
5441 p = find_process_by_pid(pid); 5440 p = find_process_by_pid(pid);
5442 if (!p) 5441 if (!p)
5443 goto out_unlock; 5442 goto out_unlock;
5444 5443
5445 retval = security_task_getscheduler(p); 5444 retval = security_task_getscheduler(p);
5446 if (retval) 5445 if (retval)
5447 goto out_unlock; 5446 goto out_unlock;
5448 5447
5449 raw_spin_lock_irqsave(&p->pi_lock, flags); 5448 raw_spin_lock_irqsave(&p->pi_lock, flags);
5450 cpumask_and(mask, &p->cpus_allowed, cpu_online_mask); 5449 cpumask_and(mask, &p->cpus_allowed, cpu_online_mask);
5451 raw_spin_unlock_irqrestore(&p->pi_lock, flags); 5450 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
5452 5451
5453 out_unlock: 5452 out_unlock:
5454 rcu_read_unlock(); 5453 rcu_read_unlock();
5455 put_online_cpus(); 5454 put_online_cpus();
5456 5455
5457 return retval; 5456 return retval;
5458 } 5457 }
5459 5458
5460 /** 5459 /**
5461 * sys_sched_getaffinity - get the cpu affinity of a process 5460 * sys_sched_getaffinity - get the cpu affinity of a process
5462 * @pid: pid of the process 5461 * @pid: pid of the process
5463 * @len: length in bytes of the bitmask pointed to by user_mask_ptr 5462 * @len: length in bytes of the bitmask pointed to by user_mask_ptr
5464 * @user_mask_ptr: user-space pointer to hold the current cpu mask 5463 * @user_mask_ptr: user-space pointer to hold the current cpu mask
5465 */ 5464 */
5466 SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len, 5465 SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len,
5467 unsigned long __user *, user_mask_ptr) 5466 unsigned long __user *, user_mask_ptr)
5468 { 5467 {
5469 int ret; 5468 int ret;
5470 cpumask_var_t mask; 5469 cpumask_var_t mask;
5471 5470
5472 if ((len * BITS_PER_BYTE) < nr_cpu_ids) 5471 if ((len * BITS_PER_BYTE) < nr_cpu_ids)
5473 return -EINVAL; 5472 return -EINVAL;
5474 if (len & (sizeof(unsigned long)-1)) 5473 if (len & (sizeof(unsigned long)-1))
5475 return -EINVAL; 5474 return -EINVAL;
5476 5475
5477 if (!alloc_cpumask_var(&mask, GFP_KERNEL)) 5476 if (!alloc_cpumask_var(&mask, GFP_KERNEL))
5478 return -ENOMEM; 5477 return -ENOMEM;
5479 5478
5480 ret = sched_getaffinity(pid, mask); 5479 ret = sched_getaffinity(pid, mask);
5481 if (ret == 0) { 5480 if (ret == 0) {
5482 size_t retlen = min_t(size_t, len, cpumask_size()); 5481 size_t retlen = min_t(size_t, len, cpumask_size());
5483 5482
5484 if (copy_to_user(user_mask_ptr, mask, retlen)) 5483 if (copy_to_user(user_mask_ptr, mask, retlen))
5485 ret = -EFAULT; 5484 ret = -EFAULT;
5486 else 5485 else
5487 ret = retlen; 5486 ret = retlen;
5488 } 5487 }
5489 free_cpumask_var(mask); 5488 free_cpumask_var(mask);
5490 5489
5491 return ret; 5490 return ret;
5492 } 5491 }
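
/*
 * Illustrative userspace sketch: querying the affinity mask. Note that
 * the raw syscall above returns the number of mask bytes it copied,
 * while the glibc wrapper converts that into the usual 0/-1 result.
 */
#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>

int main(void)
{
	cpu_set_t set;

	if (sched_getaffinity(0, sizeof(set), &set)) {
		perror("sched_getaffinity");
		return 1;
	}
	printf("runnable on %d cpu(s)\n", CPU_COUNT(&set));
	return 0;
}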
5493 5492
5494 /** 5493 /**
5495 * sys_sched_yield - yield the current processor to other threads. 5494 * sys_sched_yield - yield the current processor to other threads.
5496 * 5495 *
5497 * This function yields the current CPU to other tasks. If there are no 5496 * This function yields the current CPU to other tasks. If there are no
5498 * other threads running on this CPU then this function will return. 5497 * other threads running on this CPU then this function will return.
5499 */ 5498 */
5500 SYSCALL_DEFINE0(sched_yield) 5499 SYSCALL_DEFINE0(sched_yield)
5501 { 5500 {
5502 struct rq *rq = this_rq_lock(); 5501 struct rq *rq = this_rq_lock();
5503 5502
5504 schedstat_inc(rq, yld_count); 5503 schedstat_inc(rq, yld_count);
5505 current->sched_class->yield_task(rq); 5504 current->sched_class->yield_task(rq);
5506 5505
5507 /* 5506 /*
5508 * Since we are going to call schedule() anyway, there's 5507 * Since we are going to call schedule() anyway, there's
5509 * no need to preempt or enable interrupts: 5508 * no need to preempt or enable interrupts:
5510 */ 5509 */
5511 __release(rq->lock); 5510 __release(rq->lock);
5512 spin_release(&rq->lock.dep_map, 1, _THIS_IP_); 5511 spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
5513 do_raw_spin_unlock(&rq->lock); 5512 do_raw_spin_unlock(&rq->lock);
5514 preempt_enable_no_resched(); 5513 preempt_enable_no_resched();
5515 5514
5516 schedule(); 5515 schedule();
5517 5516
5518 return 0; 5517 return 0;
5519 } 5518 }
5520 5519
5521 static inline int should_resched(void) 5520 static inline int should_resched(void)
5522 { 5521 {
5523 return need_resched() && !(preempt_count() & PREEMPT_ACTIVE); 5522 return need_resched() && !(preempt_count() & PREEMPT_ACTIVE);
5524 } 5523 }
5525 5524
5526 static void __cond_resched(void) 5525 static void __cond_resched(void)
5527 { 5526 {
5528 add_preempt_count(PREEMPT_ACTIVE); 5527 add_preempt_count(PREEMPT_ACTIVE);
5529 schedule(); 5528 schedule();
5530 sub_preempt_count(PREEMPT_ACTIVE); 5529 sub_preempt_count(PREEMPT_ACTIVE);
5531 } 5530 }
5532 5531
5533 int __sched _cond_resched(void) 5532 int __sched _cond_resched(void)
5534 { 5533 {
5535 if (should_resched()) { 5534 if (should_resched()) {
5536 __cond_resched(); 5535 __cond_resched();
5537 return 1; 5536 return 1;
5538 } 5537 }
5539 return 0; 5538 return 0;
5540 } 5539 }
5541 EXPORT_SYMBOL(_cond_resched); 5540 EXPORT_SYMBOL(_cond_resched);
5542 5541
5543 /* 5542 /*
5544 * __cond_resched_lock() - if a reschedule is pending, drop the given lock, 5543 * __cond_resched_lock() - if a reschedule is pending, drop the given lock,
5545 * call schedule, and on return reacquire the lock. 5544 * call schedule, and on return reacquire the lock.
5546 * 5545 *
5547 * This works OK both with and without CONFIG_PREEMPT. We do strange low-level 5546 * This works OK both with and without CONFIG_PREEMPT. We do strange low-level
5548 * operations here to prevent schedule() from being called twice (once via 5547 * operations here to prevent schedule() from being called twice (once via
5549 * spin_unlock(), once by hand). 5548 * spin_unlock(), once by hand).
5550 */ 5549 */
5551 int __cond_resched_lock(spinlock_t *lock) 5550 int __cond_resched_lock(spinlock_t *lock)
5552 { 5551 {
5553 int resched = should_resched(); 5552 int resched = should_resched();
5554 int ret = 0; 5553 int ret = 0;
5555 5554
5556 lockdep_assert_held(lock); 5555 lockdep_assert_held(lock);
5557 5556
5558 if (spin_needbreak(lock) || resched) { 5557 if (spin_needbreak(lock) || resched) {
5559 spin_unlock(lock); 5558 spin_unlock(lock);
5560 if (resched) 5559 if (resched)
5561 __cond_resched(); 5560 __cond_resched();
5562 else 5561 else
5563 cpu_relax(); 5562 cpu_relax();
5564 ret = 1; 5563 ret = 1;
5565 spin_lock(lock); 5564 spin_lock(lock);
5566 } 5565 }
5567 return ret; 5566 return ret;
5568 } 5567 }
5569 EXPORT_SYMBOL(__cond_resched_lock); 5568 EXPORT_SYMBOL(__cond_resched_lock);
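
/*
 * Illustrative kernel-style sketch (my_cache, my_item and
 * process_item() are hypothetical): the usual pattern is to call
 * cond_resched_lock() from a long loop that holds a spinlock, so the
 * lock is only dropped and schedule() only runs when a reschedule or a
 * lock break is actually pending.
 */
struct my_cache {
	spinlock_t lock;
	struct list_head items;
};

struct my_item {
	struct list_head node;
};

static void process_item(struct my_item *item);

static void drain_all_items(struct my_cache *cache)
{
	spin_lock(&cache->lock);
	while (!list_empty(&cache->items)) {
		struct my_item *item;

		item = list_first_entry(&cache->items, struct my_item, node);
		list_del(&item->node);
		process_item(item);
		/* may drop and retake cache->lock if a resched is due */
		cond_resched_lock(&cache->lock);
	}
	spin_unlock(&cache->lock);
}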
5570 5569
5571 int __sched __cond_resched_softirq(void) 5570 int __sched __cond_resched_softirq(void)
5572 { 5571 {
5573 BUG_ON(!in_softirq()); 5572 BUG_ON(!in_softirq());
5574 5573
5575 if (should_resched()) { 5574 if (should_resched()) {
5576 local_bh_enable(); 5575 local_bh_enable();
5577 __cond_resched(); 5576 __cond_resched();
5578 local_bh_disable(); 5577 local_bh_disable();
5579 return 1; 5578 return 1;
5580 } 5579 }
5581 return 0; 5580 return 0;
5582 } 5581 }
5583 EXPORT_SYMBOL(__cond_resched_softirq); 5582 EXPORT_SYMBOL(__cond_resched_softirq);
5584 5583
5585 /** 5584 /**
5586 * yield - yield the current processor to other threads. 5585 * yield - yield the current processor to other threads.
5587 * 5586 *
5588 * This is a shortcut for kernel-space yielding - it marks the 5587 * This is a shortcut for kernel-space yielding - it marks the
5589 * thread runnable and calls sys_sched_yield(). 5588 * thread runnable and calls sys_sched_yield().
5590 */ 5589 */
5591 void __sched yield(void) 5590 void __sched yield(void)
5592 { 5591 {
5593 set_current_state(TASK_RUNNING); 5592 set_current_state(TASK_RUNNING);
5594 sys_sched_yield(); 5593 sys_sched_yield();
5595 } 5594 }
5596 EXPORT_SYMBOL(yield); 5595 EXPORT_SYMBOL(yield);
5597 5596
5598 /** 5597 /**
5599 * yield_to - yield the current processor to another thread in 5598 * yield_to - yield the current processor to another thread in
5600 * your thread group, or accelerate that thread toward the 5599 * your thread group, or accelerate that thread toward the
5601 * processor it's on. 5600 * processor it's on.
5602 * @p: target task 5601 * @p: target task
5603 * @preempt: whether task preemption is allowed or not 5602 * @preempt: whether task preemption is allowed or not
5604 * 5603 *
5605 * It's the caller's job to ensure that the target task struct 5604 * It's the caller's job to ensure that the target task struct
5606 * can't go away on us before we can do any checks. 5605 * can't go away on us before we can do any checks.
5607 * 5606 *
5608 * Returns true if we indeed boosted the target task. 5607 * Returns true if we indeed boosted the target task.
5609 */ 5608 */
5610 bool __sched yield_to(struct task_struct *p, bool preempt) 5609 bool __sched yield_to(struct task_struct *p, bool preempt)
5611 { 5610 {
5612 struct task_struct *curr = current; 5611 struct task_struct *curr = current;
5613 struct rq *rq, *p_rq; 5612 struct rq *rq, *p_rq;
5614 unsigned long flags; 5613 unsigned long flags;
5615 bool yielded = 0; 5614 bool yielded = 0;
5616 5615
5617 local_irq_save(flags); 5616 local_irq_save(flags);
5618 rq = this_rq(); 5617 rq = this_rq();
5619 5618
5620 again: 5619 again:
5621 p_rq = task_rq(p); 5620 p_rq = task_rq(p);
5622 double_rq_lock(rq, p_rq); 5621 double_rq_lock(rq, p_rq);
5623 while (task_rq(p) != p_rq) { 5622 while (task_rq(p) != p_rq) {
5624 double_rq_unlock(rq, p_rq); 5623 double_rq_unlock(rq, p_rq);
5625 goto again; 5624 goto again;
5626 } 5625 }
5627 5626
5628 if (!curr->sched_class->yield_to_task) 5627 if (!curr->sched_class->yield_to_task)
5629 goto out; 5628 goto out;
5630 5629
5631 if (curr->sched_class != p->sched_class) 5630 if (curr->sched_class != p->sched_class)
5632 goto out; 5631 goto out;
5633 5632
5634 if (task_running(p_rq, p) || p->state) 5633 if (task_running(p_rq, p) || p->state)
5635 goto out; 5634 goto out;
5636 5635
5637 yielded = curr->sched_class->yield_to_task(rq, p, preempt); 5636 yielded = curr->sched_class->yield_to_task(rq, p, preempt);
5638 if (yielded) { 5637 if (yielded) {
5639 schedstat_inc(rq, yld_count); 5638 schedstat_inc(rq, yld_count);
5640 /* 5639 /*
5641 * Make p's CPU reschedule; pick_next_entity takes care of 5640 * Make p's CPU reschedule; pick_next_entity takes care of
5642 * fairness. 5641 * fairness.
5643 */ 5642 */
5644 if (preempt && rq != p_rq) 5643 if (preempt && rq != p_rq)
5645 resched_task(p_rq->curr); 5644 resched_task(p_rq->curr);
5646 } 5645 }
5647 5646
5648 out: 5647 out:
5649 double_rq_unlock(rq, p_rq); 5648 double_rq_unlock(rq, p_rq);
5650 local_irq_restore(flags); 5649 local_irq_restore(flags);
5651 5650
5652 if (yielded) 5651 if (yielded)
5653 schedule(); 5652 schedule();
5654 5653
5655 return yielded; 5654 return yielded;
5656 } 5655 }
5657 EXPORT_SYMBOL_GPL(yield_to); 5656 EXPORT_SYMBOL_GPL(yield_to);
5658 5657
5659 /* 5658 /*
5660 * This task is about to go to sleep on IO. Increment rq->nr_iowait so 5659 * This task is about to go to sleep on IO. Increment rq->nr_iowait so
5661 * that process accounting knows that this is a task in IO wait state. 5660 * that process accounting knows that this is a task in IO wait state.
5662 */ 5661 */
5663 void __sched io_schedule(void) 5662 void __sched io_schedule(void)
5664 { 5663 {
5665 struct rq *rq = raw_rq(); 5664 struct rq *rq = raw_rq();
5666 5665
5667 delayacct_blkio_start(); 5666 delayacct_blkio_start();
5668 atomic_inc(&rq->nr_iowait); 5667 atomic_inc(&rq->nr_iowait);
5669 blk_flush_plug(current); 5668 blk_flush_plug(current);
5670 current->in_iowait = 1; 5669 current->in_iowait = 1;
5671 schedule(); 5670 schedule();
5672 current->in_iowait = 0; 5671 current->in_iowait = 0;
5673 atomic_dec(&rq->nr_iowait); 5672 atomic_dec(&rq->nr_iowait);
5674 delayacct_blkio_end(); 5673 delayacct_blkio_end();
5675 } 5674 }
5676 EXPORT_SYMBOL(io_schedule); 5675 EXPORT_SYMBOL(io_schedule);
5677 5676
5678 long __sched io_schedule_timeout(long timeout) 5677 long __sched io_schedule_timeout(long timeout)
5679 { 5678 {
5680 struct rq *rq = raw_rq(); 5679 struct rq *rq = raw_rq();
5681 long ret; 5680 long ret;
5682 5681
5683 delayacct_blkio_start(); 5682 delayacct_blkio_start();
5684 atomic_inc(&rq->nr_iowait); 5683 atomic_inc(&rq->nr_iowait);
5685 blk_flush_plug(current); 5684 blk_flush_plug(current);
5686 current->in_iowait = 1; 5685 current->in_iowait = 1;
5687 ret = schedule_timeout(timeout); 5686 ret = schedule_timeout(timeout);
5688 current->in_iowait = 0; 5687 current->in_iowait = 0;
5689 atomic_dec(&rq->nr_iowait); 5688 atomic_dec(&rq->nr_iowait);
5690 delayacct_blkio_end(); 5689 delayacct_blkio_end();
5691 return ret; 5690 return ret;
5692 } 5691 }
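
/*
 * Illustrative kernel-style sketch (hypothetical helper, not from this
 * file): back off for up to 100ms with the sleep accounted as I/O
 * wait. As with schedule_timeout(), the caller sets the task state
 * before calling io_schedule_timeout().
 */
static long io_backoff(void)
{
	set_current_state(TASK_UNINTERRUPTIBLE);
	return io_schedule_timeout(msecs_to_jiffies(100));
}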
5693 5692
5694 /** 5693 /**
5695 * sys_sched_get_priority_max - return maximum RT priority. 5694 * sys_sched_get_priority_max - return maximum RT priority.
5696 * @policy: scheduling class. 5695 * @policy: scheduling class.
5697 * 5696 *
5698 * this syscall returns the maximum rt_priority that can be used 5697 * this syscall returns the maximum rt_priority that can be used
5699 * by a given scheduling class. 5698 * by a given scheduling class.
5700 */ 5699 */
5701 SYSCALL_DEFINE1(sched_get_priority_max, int, policy) 5700 SYSCALL_DEFINE1(sched_get_priority_max, int, policy)
5702 { 5701 {
5703 int ret = -EINVAL; 5702 int ret = -EINVAL;
5704 5703
5705 switch (policy) { 5704 switch (policy) {
5706 case SCHED_FIFO: 5705 case SCHED_FIFO:
5707 case SCHED_RR: 5706 case SCHED_RR:
5708 ret = MAX_USER_RT_PRIO-1; 5707 ret = MAX_USER_RT_PRIO-1;
5709 break; 5708 break;
5710 case SCHED_NORMAL: 5709 case SCHED_NORMAL:
5711 case SCHED_BATCH: 5710 case SCHED_BATCH:
5712 case SCHED_IDLE: 5711 case SCHED_IDLE:
5713 ret = 0; 5712 ret = 0;
5714 break; 5713 break;
5715 } 5714 }
5716 return ret; 5715 return ret;
5717 } 5716 }
5718 5717
5719 /** 5718 /**
5720 * sys_sched_get_priority_min - return minimum RT priority. 5719 * sys_sched_get_priority_min - return minimum RT priority.
5721 * @policy: scheduling class. 5720 * @policy: scheduling class.
5722 * 5721 *
5723 * this syscall returns the minimum rt_priority that can be used 5722 * this syscall returns the minimum rt_priority that can be used
5724 * by a given scheduling class. 5723 * by a given scheduling class.
5725 */ 5724 */
5726 SYSCALL_DEFINE1(sched_get_priority_min, int, policy) 5725 SYSCALL_DEFINE1(sched_get_priority_min, int, policy)
5727 { 5726 {
5728 int ret = -EINVAL; 5727 int ret = -EINVAL;
5729 5728
5730 switch (policy) { 5729 switch (policy) {
5731 case SCHED_FIFO: 5730 case SCHED_FIFO:
5732 case SCHED_RR: 5731 case SCHED_RR:
5733 ret = 1; 5732 ret = 1;
5734 break; 5733 break;
5735 case SCHED_NORMAL: 5734 case SCHED_NORMAL:
5736 case SCHED_BATCH: 5735 case SCHED_BATCH:
5737 case SCHED_IDLE: 5736 case SCHED_IDLE:
5738 ret = 0; 5737 ret = 0;
5739 } 5738 }
5740 return ret; 5739 return ret;
5741 } 5740 }
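
/*
 * Illustrative userspace sketch: the two syscalls above bound the
 * valid sched_priority range for a policy, e.g. 1..99 for SCHED_FIFO
 * and 0..0 for SCHED_NORMAL on a default build.
 */
#include <sched.h>
#include <stdio.h>

int main(void)
{
	printf("SCHED_FIFO priority range: %d..%d\n",
	       sched_get_priority_min(SCHED_FIFO),
	       sched_get_priority_max(SCHED_FIFO));
	return 0;
}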
5742 5741
5743 /** 5742 /**
5744 * sys_sched_rr_get_interval - return the default timeslice of a process. 5743 * sys_sched_rr_get_interval - return the default timeslice of a process.
5745 * @pid: pid of the process. 5744 * @pid: pid of the process.
5746 * @interval: userspace pointer to the timeslice value. 5745 * @interval: userspace pointer to the timeslice value.
5747 * 5746 *
5748 * this syscall writes the default timeslice value of a given process 5747 * this syscall writes the default timeslice value of a given process
5749 * into the user-space timespec buffer. A value of '0' means infinity. 5748 * into the user-space timespec buffer. A value of '0' means infinity.
5750 */ 5749 */
5751 SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid, 5750 SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
5752 struct timespec __user *, interval) 5751 struct timespec __user *, interval)
5753 { 5752 {
5754 struct task_struct *p; 5753 struct task_struct *p;
5755 unsigned int time_slice; 5754 unsigned int time_slice;
5756 unsigned long flags; 5755 unsigned long flags;
5757 struct rq *rq; 5756 struct rq *rq;
5758 int retval; 5757 int retval;
5759 struct timespec t; 5758 struct timespec t;
5760 5759
5761 if (pid < 0) 5760 if (pid < 0)
5762 return -EINVAL; 5761 return -EINVAL;
5763 5762
5764 retval = -ESRCH; 5763 retval = -ESRCH;
5765 rcu_read_lock(); 5764 rcu_read_lock();
5766 p = find_process_by_pid(pid); 5765 p = find_process_by_pid(pid);
5767 if (!p) 5766 if (!p)
5768 goto out_unlock; 5767 goto out_unlock;
5769 5768
5770 retval = security_task_getscheduler(p); 5769 retval = security_task_getscheduler(p);
5771 if (retval) 5770 if (retval)
5772 goto out_unlock; 5771 goto out_unlock;
5773 5772
5774 rq = task_rq_lock(p, &flags); 5773 rq = task_rq_lock(p, &flags);
5775 time_slice = p->sched_class->get_rr_interval(rq, p); 5774 time_slice = p->sched_class->get_rr_interval(rq, p);
5776 task_rq_unlock(rq, p, &flags); 5775 task_rq_unlock(rq, p, &flags);
5777 5776
5778 rcu_read_unlock(); 5777 rcu_read_unlock();
5779 jiffies_to_timespec(time_slice, &t); 5778 jiffies_to_timespec(time_slice, &t);
5780 retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0; 5779 retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
5781 return retval; 5780 return retval;
5782 5781
5783 out_unlock: 5782 out_unlock:
5784 rcu_read_unlock(); 5783 rcu_read_unlock();
5785 return retval; 5784 return retval;
5786 } 5785 }
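
/*
 * Illustrative userspace sketch: reading the timeslice of the calling
 * process. For non-SCHED_RR tasks the reported value comes from the
 * scheduling class' get_rr_interval() hook, as shown above.
 */
#include <sched.h>
#include <stdio.h>
#include <time.h>

int main(void)
{
	struct timespec ts;

	if (sched_rr_get_interval(0, &ts))
		perror("sched_rr_get_interval");
	else
		printf("timeslice: %ld.%09ld s\n",
		       (long)ts.tv_sec, ts.tv_nsec);
	return 0;
}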
5787 5786
5788 static const char stat_nam[] = TASK_STATE_TO_CHAR_STR; 5787 static const char stat_nam[] = TASK_STATE_TO_CHAR_STR;
5789 5788
5790 void sched_show_task(struct task_struct *p) 5789 void sched_show_task(struct task_struct *p)
5791 { 5790 {
5792 unsigned long free = 0; 5791 unsigned long free = 0;
5793 unsigned state; 5792 unsigned state;
5794 5793
5795 state = p->state ? __ffs(p->state) + 1 : 0; 5794 state = p->state ? __ffs(p->state) + 1 : 0;
5796 printk(KERN_INFO "%-15.15s %c", p->comm, 5795 printk(KERN_INFO "%-15.15s %c", p->comm,
5797 state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?'); 5796 state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?');
5798 #if BITS_PER_LONG == 32 5797 #if BITS_PER_LONG == 32
5799 if (state == TASK_RUNNING) 5798 if (state == TASK_RUNNING)
5800 printk(KERN_CONT " running "); 5799 printk(KERN_CONT " running ");
5801 else 5800 else
5802 printk(KERN_CONT " %08lx ", thread_saved_pc(p)); 5801 printk(KERN_CONT " %08lx ", thread_saved_pc(p));
5803 #else 5802 #else
5804 if (state == TASK_RUNNING) 5803 if (state == TASK_RUNNING)
5805 printk(KERN_CONT " running task "); 5804 printk(KERN_CONT " running task ");
5806 else 5805 else
5807 printk(KERN_CONT " %016lx ", thread_saved_pc(p)); 5806 printk(KERN_CONT " %016lx ", thread_saved_pc(p));
5808 #endif 5807 #endif
5809 #ifdef CONFIG_DEBUG_STACK_USAGE 5808 #ifdef CONFIG_DEBUG_STACK_USAGE
5810 free = stack_not_used(p); 5809 free = stack_not_used(p);
5811 #endif 5810 #endif
5812 printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free, 5811 printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free,
5813 task_pid_nr(p), task_pid_nr(p->real_parent), 5812 task_pid_nr(p), task_pid_nr(p->real_parent),
5814 (unsigned long)task_thread_info(p)->flags); 5813 (unsigned long)task_thread_info(p)->flags);
5815 5814
5816 show_stack(p, NULL); 5815 show_stack(p, NULL);
5817 } 5816 }
5818 5817
5819 void show_state_filter(unsigned long state_filter) 5818 void show_state_filter(unsigned long state_filter)
5820 { 5819 {
5821 struct task_struct *g, *p; 5820 struct task_struct *g, *p;
5822 5821
5823 #if BITS_PER_LONG == 32 5822 #if BITS_PER_LONG == 32
5824 printk(KERN_INFO 5823 printk(KERN_INFO
5825 " task PC stack pid father\n"); 5824 " task PC stack pid father\n");
5826 #else 5825 #else
5827 printk(KERN_INFO 5826 printk(KERN_INFO
5828 " task PC stack pid father\n"); 5827 " task PC stack pid father\n");
5829 #endif 5828 #endif
5830 read_lock(&tasklist_lock); 5829 read_lock(&tasklist_lock);
5831 do_each_thread(g, p) { 5830 do_each_thread(g, p) {
5832 /* 5831 /*
5833 * reset the NMI-timeout, listing all tasks on a slow 5832 * reset the NMI-timeout, listing all tasks on a slow
5834 * console might take a lot of time: 5833 * console might take a lot of time:
5835 */ 5834 */
5836 touch_nmi_watchdog(); 5835 touch_nmi_watchdog();
5837 if (!state_filter || (p->state & state_filter)) 5836 if (!state_filter || (p->state & state_filter))
5838 sched_show_task(p); 5837 sched_show_task(p);
5839 } while_each_thread(g, p); 5838 } while_each_thread(g, p);
5840 5839
5841 touch_all_softlockup_watchdogs(); 5840 touch_all_softlockup_watchdogs();
5842 5841
5843 #ifdef CONFIG_SCHED_DEBUG 5842 #ifdef CONFIG_SCHED_DEBUG
5844 sysrq_sched_debug_show(); 5843 sysrq_sched_debug_show();
5845 #endif 5844 #endif
5846 read_unlock(&tasklist_lock); 5845 read_unlock(&tasklist_lock);
5847 /* 5846 /*
5848 * Only show locks if all tasks are dumped: 5847 * Only show locks if all tasks are dumped:
5849 */ 5848 */
5850 if (!state_filter) 5849 if (!state_filter)
5851 debug_show_all_locks(); 5850 debug_show_all_locks();
5852 } 5851 }
5853 5852
5854 void __cpuinit init_idle_bootup_task(struct task_struct *idle) 5853 void __cpuinit init_idle_bootup_task(struct task_struct *idle)
5855 { 5854 {
5856 idle->sched_class = &idle_sched_class; 5855 idle->sched_class = &idle_sched_class;
5857 } 5856 }
5858 5857
5859 /** 5858 /**
5860 * init_idle - set up an idle thread for a given CPU 5859 * init_idle - set up an idle thread for a given CPU
5861 * @idle: task in question 5860 * @idle: task in question
5862 * @cpu: cpu the idle task belongs to 5861 * @cpu: cpu the idle task belongs to
5863 * 5862 *
5864 * NOTE: this function does not set the idle thread's NEED_RESCHED 5863 * NOTE: this function does not set the idle thread's NEED_RESCHED
5865 * flag, to make booting more robust. 5864 * flag, to make booting more robust.
5866 */ 5865 */
5867 void __cpuinit init_idle(struct task_struct *idle, int cpu) 5866 void __cpuinit init_idle(struct task_struct *idle, int cpu)
5868 { 5867 {
5869 struct rq *rq = cpu_rq(cpu); 5868 struct rq *rq = cpu_rq(cpu);
5870 unsigned long flags; 5869 unsigned long flags;
5871 5870
5872 raw_spin_lock_irqsave(&rq->lock, flags); 5871 raw_spin_lock_irqsave(&rq->lock, flags);
5873 5872
5874 __sched_fork(idle); 5873 __sched_fork(idle);
5875 idle->state = TASK_RUNNING; 5874 idle->state = TASK_RUNNING;
5876 idle->se.exec_start = sched_clock(); 5875 idle->se.exec_start = sched_clock();
5877 5876
5878 do_set_cpus_allowed(idle, cpumask_of(cpu)); 5877 do_set_cpus_allowed(idle, cpumask_of(cpu));
5879 /* 5878 /*
5880 * We're having a chicken and egg problem: even though we are 5879 * We're having a chicken and egg problem: even though we are
5881 * holding rq->lock, the cpu isn't yet set to this cpu so the 5880 * holding rq->lock, the cpu isn't yet set to this cpu so the
5882 * lockdep check in task_group() will fail. 5881 * lockdep check in task_group() will fail.
5883 * 5882 *
5884 * Similar case to sched_fork(). / Alternatively we could 5883 * Similar case to sched_fork(). / Alternatively we could
5885 * use task_rq_lock() here and obtain the other rq->lock. 5884 * use task_rq_lock() here and obtain the other rq->lock.
5886 * 5885 *
5887 * Silence PROVE_RCU 5886 * Silence PROVE_RCU
5888 */ 5887 */
5889 rcu_read_lock(); 5888 rcu_read_lock();
5890 __set_task_cpu(idle, cpu); 5889 __set_task_cpu(idle, cpu);
5891 rcu_read_unlock(); 5890 rcu_read_unlock();
5892 5891
5893 rq->curr = rq->idle = idle; 5892 rq->curr = rq->idle = idle;
5894 #if defined(CONFIG_SMP) 5893 #if defined(CONFIG_SMP)
5895 idle->on_cpu = 1; 5894 idle->on_cpu = 1;
5896 #endif 5895 #endif
5897 raw_spin_unlock_irqrestore(&rq->lock, flags); 5896 raw_spin_unlock_irqrestore(&rq->lock, flags);
5898 5897
5899 /* Set the preempt count _outside_ the spinlocks! */ 5898 /* Set the preempt count _outside_ the spinlocks! */
5900 task_thread_info(idle)->preempt_count = 0; 5899 task_thread_info(idle)->preempt_count = 0;
5901 5900
5902 /* 5901 /*
5903 * The idle tasks have their own, simple scheduling class: 5902 * The idle tasks have their own, simple scheduling class:
5904 */ 5903 */
5905 idle->sched_class = &idle_sched_class; 5904 idle->sched_class = &idle_sched_class;
5906 ftrace_graph_init_idle_task(idle, cpu); 5905 ftrace_graph_init_idle_task(idle, cpu);
5907 } 5906 }
5908 5907
5909 /* 5908 /*
5910 * In a system that switches off the HZ timer nohz_cpu_mask 5909 * In a system that switches off the HZ timer nohz_cpu_mask
5911 * indicates which cpus entered this state. This is used 5910 * indicates which cpus entered this state. This is used
5912 * in the rcu update to wait only for active cpus. For systems 5911 * in the rcu update to wait only for active cpus. For systems
5913 * which do not switch off the HZ timer nohz_cpu_mask should 5912 * which do not switch off the HZ timer nohz_cpu_mask should
5914 * always be CPU_BITS_NONE. 5913 * always be CPU_BITS_NONE.
5915 */ 5914 */
5916 cpumask_var_t nohz_cpu_mask; 5915 cpumask_var_t nohz_cpu_mask;
5917 5916
5918 /* 5917 /*
5919 * Increase the granularity value when there are more CPUs, 5918 * Increase the granularity value when there are more CPUs,
5920 * because with more CPUs the 'effective latency' as visible 5919 * because with more CPUs the 'effective latency' as visible
5921 * to users decreases. But the relationship is not linear, 5920 * to users decreases. But the relationship is not linear,
5922 * so pick a second-best guess by going with the log2 of the 5921 * so pick a second-best guess by going with the log2 of the
5923 * number of CPUs. 5922 * number of CPUs.
5924 * 5923 *
5925 * This idea comes from the SD scheduler of Con Kolivas: 5924 * This idea comes from the SD scheduler of Con Kolivas:
5926 */ 5925 */
5927 static int get_update_sysctl_factor(void) 5926 static int get_update_sysctl_factor(void)
5928 { 5927 {
5929 unsigned int cpus = min_t(int, num_online_cpus(), 8); 5928 unsigned int cpus = min_t(int, num_online_cpus(), 8);
5930 unsigned int factor; 5929 unsigned int factor;
5931 5930
5932 switch (sysctl_sched_tunable_scaling) { 5931 switch (sysctl_sched_tunable_scaling) {
5933 case SCHED_TUNABLESCALING_NONE: 5932 case SCHED_TUNABLESCALING_NONE:
5934 factor = 1; 5933 factor = 1;
5935 break; 5934 break;
5936 case SCHED_TUNABLESCALING_LINEAR: 5935 case SCHED_TUNABLESCALING_LINEAR:
5937 factor = cpus; 5936 factor = cpus;
5938 break; 5937 break;
5939 case SCHED_TUNABLESCALING_LOG: 5938 case SCHED_TUNABLESCALING_LOG:
5940 default: 5939 default:
5941 factor = 1 + ilog2(cpus); 5940 factor = 1 + ilog2(cpus);
5942 break; 5941 break;
5943 } 5942 }
5944 5943
5945 return factor; 5944 return factor;
5946 } 5945 }
5947 5946
5948 static void update_sysctl(void) 5947 static void update_sysctl(void)
5949 { 5948 {
5950 unsigned int factor = get_update_sysctl_factor(); 5949 unsigned int factor = get_update_sysctl_factor();
5951 5950
5952 #define SET_SYSCTL(name) \ 5951 #define SET_SYSCTL(name) \
5953 (sysctl_##name = (factor) * normalized_sysctl_##name) 5952 (sysctl_##name = (factor) * normalized_sysctl_##name)
5954 SET_SYSCTL(sched_min_granularity); 5953 SET_SYSCTL(sched_min_granularity);
5955 SET_SYSCTL(sched_latency); 5954 SET_SYSCTL(sched_latency);
5956 SET_SYSCTL(sched_wakeup_granularity); 5955 SET_SYSCTL(sched_wakeup_granularity);
5957 #undef SET_SYSCTL 5956 #undef SET_SYSCTL
5958 } 5957 }
5959 5958
5960 static inline void sched_init_granularity(void) 5959 static inline void sched_init_granularity(void)
5961 { 5960 {
5962 update_sysctl(); 5961 update_sysctl();
5963 } 5962 }
5964 5963
5965 #ifdef CONFIG_SMP 5964 #ifdef CONFIG_SMP
5966 void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) 5965 void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
5967 { 5966 {
5968 if (p->sched_class && p->sched_class->set_cpus_allowed) 5967 if (p->sched_class && p->sched_class->set_cpus_allowed)
5969 p->sched_class->set_cpus_allowed(p, new_mask); 5968 p->sched_class->set_cpus_allowed(p, new_mask);
5970 else { 5969 else {
5971 cpumask_copy(&p->cpus_allowed, new_mask); 5970 cpumask_copy(&p->cpus_allowed, new_mask);
5972 p->rt.nr_cpus_allowed = cpumask_weight(new_mask); 5971 p->rt.nr_cpus_allowed = cpumask_weight(new_mask);
5973 } 5972 }
5974 } 5973 }
5975 5974
5976 /* 5975 /*
5977 * This is how migration works: 5976 * This is how migration works:
5978 * 5977 *
5979 * 1) we invoke migration_cpu_stop() on the target CPU using 5978 * 1) we invoke migration_cpu_stop() on the target CPU using
5980 * stop_one_cpu(). 5979 * stop_one_cpu().
5981 * 2) stopper starts to run (implicitly forcing the migrated thread 5980 * 2) stopper starts to run (implicitly forcing the migrated thread
5982 * off the CPU) 5981 * off the CPU)
5983 * 3) it checks whether the migrated task is still in the wrong runqueue. 5982 * 3) it checks whether the migrated task is still in the wrong runqueue.
5984 * 4) if it's in the wrong runqueue then the migration thread removes 5983 * 4) if it's in the wrong runqueue then the migration thread removes
5985 * it and puts it into the right queue. 5984 * it and puts it into the right queue.
5986 * 5) stopper completes and stop_one_cpu() returns and the migration 5985 * 5) stopper completes and stop_one_cpu() returns and the migration
5987 * is done. 5986 * is done.
5988 */ 5987 */
5989 5988
5990 /* 5989 /*
5991 * Change a given task's CPU affinity. Migrate the thread to a 5990 * Change a given task's CPU affinity. Migrate the thread to a
5992 * proper CPU and schedule it away if the CPU it's executing on 5991 * proper CPU and schedule it away if the CPU it's executing on
5993 * is removed from the allowed bitmask. 5992 * is removed from the allowed bitmask.
5994 * 5993 *
5995 * NOTE: the caller must have a valid reference to the task, the 5994 * NOTE: the caller must have a valid reference to the task, the
5996 * task must not exit() & deallocate itself prematurely. The 5995 * task must not exit() & deallocate itself prematurely. The
5997 * call is not atomic; no spinlocks may be held. 5996 * call is not atomic; no spinlocks may be held.
5998 */ 5997 */
5999 int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) 5998 int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
6000 { 5999 {
6001 unsigned long flags; 6000 unsigned long flags;
6002 struct rq *rq; 6001 struct rq *rq;
6003 unsigned int dest_cpu; 6002 unsigned int dest_cpu;
6004 int ret = 0; 6003 int ret = 0;
6005 6004
6006 rq = task_rq_lock(p, &flags); 6005 rq = task_rq_lock(p, &flags);
6007 6006
6008 if (cpumask_equal(&p->cpus_allowed, new_mask)) 6007 if (cpumask_equal(&p->cpus_allowed, new_mask))
6009 goto out; 6008 goto out;
6010 6009
6011 if (!cpumask_intersects(new_mask, cpu_active_mask)) { 6010 if (!cpumask_intersects(new_mask, cpu_active_mask)) {
6012 ret = -EINVAL; 6011 ret = -EINVAL;
6013 goto out; 6012 goto out;
6014 } 6013 }
6015 6014
6016 if (unlikely((p->flags & PF_THREAD_BOUND) && p != current)) { 6015 if (unlikely((p->flags & PF_THREAD_BOUND) && p != current)) {
6017 ret = -EINVAL; 6016 ret = -EINVAL;
6018 goto out; 6017 goto out;
6019 } 6018 }
6020 6019
6021 do_set_cpus_allowed(p, new_mask); 6020 do_set_cpus_allowed(p, new_mask);
6022 6021
6023 /* Can the task run on the task's current CPU? If so, we're done */ 6022 /* Can the task run on the task's current CPU? If so, we're done */
6024 if (cpumask_test_cpu(task_cpu(p), new_mask)) 6023 if (cpumask_test_cpu(task_cpu(p), new_mask))
6025 goto out; 6024 goto out;
6026 6025
6027 dest_cpu = cpumask_any_and(cpu_active_mask, new_mask); 6026 dest_cpu = cpumask_any_and(cpu_active_mask, new_mask);
6028 if (p->on_rq) { 6027 if (p->on_rq) {
6029 struct migration_arg arg = { p, dest_cpu }; 6028 struct migration_arg arg = { p, dest_cpu };
6030 /* Need help from migration thread: drop lock and wait. */ 6029 /* Need help from migration thread: drop lock and wait. */
6031 task_rq_unlock(rq, p, &flags); 6030 task_rq_unlock(rq, p, &flags);
6032 stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg); 6031 stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
6033 tlb_migrate_finish(p->mm); 6032 tlb_migrate_finish(p->mm);
6034 return 0; 6033 return 0;
6035 } 6034 }
6036 out: 6035 out:
6037 task_rq_unlock(rq, p, &flags); 6036 task_rq_unlock(rq, p, &flags);
6038 6037
6039 return ret; 6038 return ret;
6040 } 6039 }
6041 EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr); 6040 EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);
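A minimal usage sketch of the interface above (not part of this diff): the helper name and the CPU numbers are purely illustrative, and the call can fail as described in the comments before set_cpus_allowed_ptr().

#include <linux/cpumask.h>
#include <linux/errno.h>
#include <linux/gfp.h>
#include <linux/sched.h>

/* Hypothetical helper: restrict @p to CPUs 0 and 1. */
static int pin_task_to_cpus_0_and_1(struct task_struct *p)
{
	cpumask_var_t mask;
	int ret;

	if (!alloc_cpumask_var(&mask, GFP_KERNEL))
		return -ENOMEM;

	cpumask_clear(mask);
	cpumask_set_cpu(0, mask);
	cpumask_set_cpu(1, mask);

	/* -EINVAL if no CPU in @mask is active, see the checks above. */
	ret = set_cpus_allowed_ptr(p, mask);

	free_cpumask_var(mask);
	return ret;
}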
6042 6041
6043 /* 6042 /*
6044 * Move (not current) task off this cpu, onto dest cpu. We're doing 6043 * Move (not current) task off this cpu, onto dest cpu. We're doing
6045 * this because either it can't run here any more (set_cpus_allowed() 6044 * this because either it can't run here any more (set_cpus_allowed()
6046 * away from this CPU, or CPU going down), or because we're 6045 * away from this CPU, or CPU going down), or because we're
6047 * attempting to rebalance this task on exec (sched_exec). 6046 * attempting to rebalance this task on exec (sched_exec).
6048 * 6047 *
6049 * So we race with normal scheduler movements, but that's OK, as long 6048 * So we race with normal scheduler movements, but that's OK, as long
6050 * as the task is no longer on this CPU. 6049 * as the task is no longer on this CPU.
6051 * 6050 *
6052 * Returns non-zero if task was successfully migrated. 6051 * Returns non-zero if task was successfully migrated.
6053 */ 6052 */
6054 static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) 6053 static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
6055 { 6054 {
6056 struct rq *rq_dest, *rq_src; 6055 struct rq *rq_dest, *rq_src;
6057 int ret = 0; 6056 int ret = 0;
6058 6057
6059 if (unlikely(!cpu_active(dest_cpu))) 6058 if (unlikely(!cpu_active(dest_cpu)))
6060 return ret; 6059 return ret;
6061 6060
6062 rq_src = cpu_rq(src_cpu); 6061 rq_src = cpu_rq(src_cpu);
6063 rq_dest = cpu_rq(dest_cpu); 6062 rq_dest = cpu_rq(dest_cpu);
6064 6063
6065 raw_spin_lock(&p->pi_lock); 6064 raw_spin_lock(&p->pi_lock);
6066 double_rq_lock(rq_src, rq_dest); 6065 double_rq_lock(rq_src, rq_dest);
6067 /* Already moved. */ 6066 /* Already moved. */
6068 if (task_cpu(p) != src_cpu) 6067 if (task_cpu(p) != src_cpu)
6069 goto done; 6068 goto done;
6070 /* Affinity changed (again). */ 6069 /* Affinity changed (again). */
6071 if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed)) 6070 if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed))
6072 goto fail; 6071 goto fail;
6073 6072
6074 /* 6073 /*
6075 * If we're not on a rq, the next wake-up will ensure we're 6074 * If we're not on a rq, the next wake-up will ensure we're
6076 * placed properly. 6075 * placed properly.
6077 */ 6076 */
6078 if (p->on_rq) { 6077 if (p->on_rq) {
6079 deactivate_task(rq_src, p, 0); 6078 deactivate_task(rq_src, p, 0);
6080 set_task_cpu(p, dest_cpu); 6079 set_task_cpu(p, dest_cpu);
6081 activate_task(rq_dest, p, 0); 6080 activate_task(rq_dest, p, 0);
6082 check_preempt_curr(rq_dest, p, 0); 6081 check_preempt_curr(rq_dest, p, 0);
6083 } 6082 }
6084 done: 6083 done:
6085 ret = 1; 6084 ret = 1;
6086 fail: 6085 fail:
6087 double_rq_unlock(rq_src, rq_dest); 6086 double_rq_unlock(rq_src, rq_dest);
6088 raw_spin_unlock(&p->pi_lock); 6087 raw_spin_unlock(&p->pi_lock);
6089 return ret; 6088 return ret;
6090 } 6089 }
6091 6090
6092 /* 6091 /*
6093 * migration_cpu_stop - this will be executed by a highprio stopper thread 6092 * migration_cpu_stop - this will be executed by a highprio stopper thread
6094 * and performs thread migration by bumping thread off CPU then 6093 * and performs thread migration by bumping thread off CPU then
6095 * 'pushing' onto another runqueue. 6094 * 'pushing' onto another runqueue.
6096 */ 6095 */
6097 static int migration_cpu_stop(void *data) 6096 static int migration_cpu_stop(void *data)
6098 { 6097 {
6099 struct migration_arg *arg = data; 6098 struct migration_arg *arg = data;
6100 6099
6101 /* 6100 /*
6102 * The original target cpu might have gone down and we might 6101 * The original target cpu might have gone down and we might
6103 * be on another cpu but it doesn't matter. 6102 * be on another cpu but it doesn't matter.
6104 */ 6103 */
6105 local_irq_disable(); 6104 local_irq_disable();
6106 __migrate_task(arg->task, raw_smp_processor_id(), arg->dest_cpu); 6105 __migrate_task(arg->task, raw_smp_processor_id(), arg->dest_cpu);
6107 local_irq_enable(); 6106 local_irq_enable();
6108 return 0; 6107 return 0;
6109 } 6108 }
6110 6109
6111 #ifdef CONFIG_HOTPLUG_CPU 6110 #ifdef CONFIG_HOTPLUG_CPU
6112 6111
6113 /* 6112 /*
6114 * Ensures that the idle task is using init_mm right before its cpu goes 6113 * Ensures that the idle task is using init_mm right before its cpu goes
6115 * offline. 6114 * offline.
6116 */ 6115 */
6117 void idle_task_exit(void) 6116 void idle_task_exit(void)
6118 { 6117 {
6119 struct mm_struct *mm = current->active_mm; 6118 struct mm_struct *mm = current->active_mm;
6120 6119
6121 BUG_ON(cpu_online(smp_processor_id())); 6120 BUG_ON(cpu_online(smp_processor_id()));
6122 6121
6123 if (mm != &init_mm) 6122 if (mm != &init_mm)
6124 switch_mm(mm, &init_mm, current); 6123 switch_mm(mm, &init_mm, current);
6125 mmdrop(mm); 6124 mmdrop(mm);
6126 } 6125 }
6127 6126
6128 /* 6127 /*
6129 * While a dead CPU has no uninterruptible tasks queued at this point, 6128 * While a dead CPU has no uninterruptible tasks queued at this point,
6130 * it might still have a nonzero ->nr_uninterruptible counter, because 6129 * it might still have a nonzero ->nr_uninterruptible counter, because
6131 * for performance reasons the counter is not strictly tracking tasks to 6130 * for performance reasons the counter is not strictly tracking tasks to
6132 * their home CPUs. So we just add the counter to another CPU's counter, 6131 * their home CPUs. So we just add the counter to another CPU's counter,
6133 * to keep the global sum constant after CPU-down: 6132 * to keep the global sum constant after CPU-down:
6134 */ 6133 */
6135 static void migrate_nr_uninterruptible(struct rq *rq_src) 6134 static void migrate_nr_uninterruptible(struct rq *rq_src)
6136 { 6135 {
6137 struct rq *rq_dest = cpu_rq(cpumask_any(cpu_active_mask)); 6136 struct rq *rq_dest = cpu_rq(cpumask_any(cpu_active_mask));
6138 6137
6139 rq_dest->nr_uninterruptible += rq_src->nr_uninterruptible; 6138 rq_dest->nr_uninterruptible += rq_src->nr_uninterruptible;
6140 rq_src->nr_uninterruptible = 0; 6139 rq_src->nr_uninterruptible = 0;
6141 } 6140 }
6142 6141
6143 /* 6142 /*
6144 * remove the tasks which were accounted by rq from calc_load_tasks. 6143 * remove the tasks which were accounted by rq from calc_load_tasks.
6145 */ 6144 */
6146 static void calc_global_load_remove(struct rq *rq) 6145 static void calc_global_load_remove(struct rq *rq)
6147 { 6146 {
6148 atomic_long_sub(rq->calc_load_active, &calc_load_tasks); 6147 atomic_long_sub(rq->calc_load_active, &calc_load_tasks);
6149 rq->calc_load_active = 0; 6148 rq->calc_load_active = 0;
6150 } 6149 }
6151 6150
6152 /* 6151 /*
6153 * Migrate all tasks from the rq, sleeping tasks will be migrated by 6152 * Migrate all tasks from the rq, sleeping tasks will be migrated by
6154 * try_to_wake_up()->select_task_rq(). 6153 * try_to_wake_up()->select_task_rq().
6155 * 6154 *
6156 * Called with rq->lock held even though we're in stop_machine() and 6155 * Called with rq->lock held even though we're in stop_machine() and
6157 * there's no concurrency possible, we hold the required locks anyway 6156 * there's no concurrency possible, we hold the required locks anyway
6158 * because of lock validation efforts. 6157 * because of lock validation efforts.
6159 */ 6158 */
6160 static void migrate_tasks(unsigned int dead_cpu) 6159 static void migrate_tasks(unsigned int dead_cpu)
6161 { 6160 {
6162 struct rq *rq = cpu_rq(dead_cpu); 6161 struct rq *rq = cpu_rq(dead_cpu);
6163 struct task_struct *next, *stop = rq->stop; 6162 struct task_struct *next, *stop = rq->stop;
6164 int dest_cpu; 6163 int dest_cpu;
6165 6164
6166 /* 6165 /*
6167 * Fudge the rq selection such that the below task selection loop 6166 * Fudge the rq selection such that the below task selection loop
6168 * doesn't get stuck on the currently eligible stop task. 6167 * doesn't get stuck on the currently eligible stop task.
6169 * 6168 *
6170 * We're currently inside stop_machine() and the rq is either stuck 6169 * We're currently inside stop_machine() and the rq is either stuck
6171 * in the stop_machine_cpu_stop() loop, or we're executing this code, 6170 * in the stop_machine_cpu_stop() loop, or we're executing this code,
6172 * either way we should never end up calling schedule() until we're 6171 * either way we should never end up calling schedule() until we're
6173 * done here. 6172 * done here.
6174 */ 6173 */
6175 rq->stop = NULL; 6174 rq->stop = NULL;
6176 6175
6177 for ( ; ; ) { 6176 for ( ; ; ) {
6178 /* 6177 /*
6179 * There's this thread running, bail when that's the only 6178 * There's this thread running, bail when that's the only
6180 * remaining thread. 6179 * remaining thread.
6181 */ 6180 */
6182 if (rq->nr_running == 1) 6181 if (rq->nr_running == 1)
6183 break; 6182 break;
6184 6183
6185 next = pick_next_task(rq); 6184 next = pick_next_task(rq);
6186 BUG_ON(!next); 6185 BUG_ON(!next);
6187 next->sched_class->put_prev_task(rq, next); 6186 next->sched_class->put_prev_task(rq, next);
6188 6187
6189 /* Find suitable destination for @next, with force if needed. */ 6188 /* Find suitable destination for @next, with force if needed. */
6190 dest_cpu = select_fallback_rq(dead_cpu, next); 6189 dest_cpu = select_fallback_rq(dead_cpu, next);
6191 raw_spin_unlock(&rq->lock); 6190 raw_spin_unlock(&rq->lock);
6192 6191
6193 __migrate_task(next, dead_cpu, dest_cpu); 6192 __migrate_task(next, dead_cpu, dest_cpu);
6194 6193
6195 raw_spin_lock(&rq->lock); 6194 raw_spin_lock(&rq->lock);
6196 } 6195 }
6197 6196
6198 rq->stop = stop; 6197 rq->stop = stop;
6199 } 6198 }
6200 6199
6201 #endif /* CONFIG_HOTPLUG_CPU */ 6200 #endif /* CONFIG_HOTPLUG_CPU */
6202 6201
6203 #if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL) 6202 #if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL)
6204 6203
6205 static struct ctl_table sd_ctl_dir[] = { 6204 static struct ctl_table sd_ctl_dir[] = {
6206 { 6205 {
6207 .procname = "sched_domain", 6206 .procname = "sched_domain",
6208 .mode = 0555, 6207 .mode = 0555,
6209 }, 6208 },
6210 {} 6209 {}
6211 }; 6210 };
6212 6211
6213 static struct ctl_table sd_ctl_root[] = { 6212 static struct ctl_table sd_ctl_root[] = {
6214 { 6213 {
6215 .procname = "kernel", 6214 .procname = "kernel",
6216 .mode = 0555, 6215 .mode = 0555,
6217 .child = sd_ctl_dir, 6216 .child = sd_ctl_dir,
6218 }, 6217 },
6219 {} 6218 {}
6220 }; 6219 };
6221 6220
6222 static struct ctl_table *sd_alloc_ctl_entry(int n) 6221 static struct ctl_table *sd_alloc_ctl_entry(int n)
6223 { 6222 {
6224 struct ctl_table *entry = 6223 struct ctl_table *entry =
6225 kcalloc(n, sizeof(struct ctl_table), GFP_KERNEL); 6224 kcalloc(n, sizeof(struct ctl_table), GFP_KERNEL);
6226 6225
6227 return entry; 6226 return entry;
6228 } 6227 }
6229 6228
6230 static void sd_free_ctl_entry(struct ctl_table **tablep) 6229 static void sd_free_ctl_entry(struct ctl_table **tablep)
6231 { 6230 {
6232 struct ctl_table *entry; 6231 struct ctl_table *entry;
6233 6232
6234 /* 6233 /*
6235 * In the intermediate directories, both the child directory and 6234 * In the intermediate directories, both the child directory and
6236 * procname are dynamically allocated and could fail but the mode 6235 * procname are dynamically allocated and could fail but the mode
6237 * will always be set. In the lowest directory the names are 6236 * will always be set. In the lowest directory the names are
6238 * static strings and all have proc handlers. 6237 * static strings and all have proc handlers.
6239 */ 6238 */
6240 for (entry = *tablep; entry->mode; entry++) { 6239 for (entry = *tablep; entry->mode; entry++) {
6241 if (entry->child) 6240 if (entry->child)
6242 sd_free_ctl_entry(&entry->child); 6241 sd_free_ctl_entry(&entry->child);
6243 if (entry->proc_handler == NULL) 6242 if (entry->proc_handler == NULL)
6244 kfree(entry->procname); 6243 kfree(entry->procname);
6245 } 6244 }
6246 6245
6247 kfree(*tablep); 6246 kfree(*tablep);
6248 *tablep = NULL; 6247 *tablep = NULL;
6249 } 6248 }
6250 6249
6251 static void 6250 static void
6252 set_table_entry(struct ctl_table *entry, 6251 set_table_entry(struct ctl_table *entry,
6253 const char *procname, void *data, int maxlen, 6252 const char *procname, void *data, int maxlen,
6254 mode_t mode, proc_handler *proc_handler) 6253 mode_t mode, proc_handler *proc_handler)
6255 { 6254 {
6256 entry->procname = procname; 6255 entry->procname = procname;
6257 entry->data = data; 6256 entry->data = data;
6258 entry->maxlen = maxlen; 6257 entry->maxlen = maxlen;
6259 entry->mode = mode; 6258 entry->mode = mode;
6260 entry->proc_handler = proc_handler; 6259 entry->proc_handler = proc_handler;
6261 } 6260 }
6262 6261
6263 static struct ctl_table * 6262 static struct ctl_table *
6264 sd_alloc_ctl_domain_table(struct sched_domain *sd) 6263 sd_alloc_ctl_domain_table(struct sched_domain *sd)
6265 { 6264 {
6266 struct ctl_table *table = sd_alloc_ctl_entry(13); 6265 struct ctl_table *table = sd_alloc_ctl_entry(13);
6267 6266
6268 if (table == NULL) 6267 if (table == NULL)
6269 return NULL; 6268 return NULL;
6270 6269
6271 set_table_entry(&table[0], "min_interval", &sd->min_interval, 6270 set_table_entry(&table[0], "min_interval", &sd->min_interval,
6272 sizeof(long), 0644, proc_doulongvec_minmax); 6271 sizeof(long), 0644, proc_doulongvec_minmax);
6273 set_table_entry(&table[1], "max_interval", &sd->max_interval, 6272 set_table_entry(&table[1], "max_interval", &sd->max_interval,
6274 sizeof(long), 0644, proc_doulongvec_minmax); 6273 sizeof(long), 0644, proc_doulongvec_minmax);
6275 set_table_entry(&table[2], "busy_idx", &sd->busy_idx, 6274 set_table_entry(&table[2], "busy_idx", &sd->busy_idx,
6276 sizeof(int), 0644, proc_dointvec_minmax); 6275 sizeof(int), 0644, proc_dointvec_minmax);
6277 set_table_entry(&table[3], "idle_idx", &sd->idle_idx, 6276 set_table_entry(&table[3], "idle_idx", &sd->idle_idx,
6278 sizeof(int), 0644, proc_dointvec_minmax); 6277 sizeof(int), 0644, proc_dointvec_minmax);
6279 set_table_entry(&table[4], "newidle_idx", &sd->newidle_idx, 6278 set_table_entry(&table[4], "newidle_idx", &sd->newidle_idx,
6280 sizeof(int), 0644, proc_dointvec_minmax); 6279 sizeof(int), 0644, proc_dointvec_minmax);
6281 set_table_entry(&table[5], "wake_idx", &sd->wake_idx, 6280 set_table_entry(&table[5], "wake_idx", &sd->wake_idx,
6282 sizeof(int), 0644, proc_dointvec_minmax); 6281 sizeof(int), 0644, proc_dointvec_minmax);
6283 set_table_entry(&table[6], "forkexec_idx", &sd->forkexec_idx, 6282 set_table_entry(&table[6], "forkexec_idx", &sd->forkexec_idx,
6284 sizeof(int), 0644, proc_dointvec_minmax); 6283 sizeof(int), 0644, proc_dointvec_minmax);
6285 set_table_entry(&table[7], "busy_factor", &sd->busy_factor, 6284 set_table_entry(&table[7], "busy_factor", &sd->busy_factor,
6286 sizeof(int), 0644, proc_dointvec_minmax); 6285 sizeof(int), 0644, proc_dointvec_minmax);
6287 set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct, 6286 set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct,
6288 sizeof(int), 0644, proc_dointvec_minmax); 6287 sizeof(int), 0644, proc_dointvec_minmax);
6289 set_table_entry(&table[9], "cache_nice_tries", 6288 set_table_entry(&table[9], "cache_nice_tries",
6290 &sd->cache_nice_tries, 6289 &sd->cache_nice_tries,
6291 sizeof(int), 0644, proc_dointvec_minmax); 6290 sizeof(int), 0644, proc_dointvec_minmax);
6292 set_table_entry(&table[10], "flags", &sd->flags, 6291 set_table_entry(&table[10], "flags", &sd->flags,
6293 sizeof(int), 0644, proc_dointvec_minmax); 6292 sizeof(int), 0644, proc_dointvec_minmax);
6294 set_table_entry(&table[11], "name", sd->name, 6293 set_table_entry(&table[11], "name", sd->name,
6295 CORENAME_MAX_SIZE, 0444, proc_dostring); 6294 CORENAME_MAX_SIZE, 0444, proc_dostring);
6296 /* &table[12] is terminator */ 6295 /* &table[12] is terminator */
6297 6296
6298 return table; 6297 return table;
6299 } 6298 }
6300 6299
6301 static ctl_table *sd_alloc_ctl_cpu_table(int cpu) 6300 static ctl_table *sd_alloc_ctl_cpu_table(int cpu)
6302 { 6301 {
6303 struct ctl_table *entry, *table; 6302 struct ctl_table *entry, *table;
6304 struct sched_domain *sd; 6303 struct sched_domain *sd;
6305 int domain_num = 0, i; 6304 int domain_num = 0, i;
6306 char buf[32]; 6305 char buf[32];
6307 6306
6308 for_each_domain(cpu, sd) 6307 for_each_domain(cpu, sd)
6309 domain_num++; 6308 domain_num++;
6310 entry = table = sd_alloc_ctl_entry(domain_num + 1); 6309 entry = table = sd_alloc_ctl_entry(domain_num + 1);
6311 if (table == NULL) 6310 if (table == NULL)
6312 return NULL; 6311 return NULL;
6313 6312
6314 i = 0; 6313 i = 0;
6315 for_each_domain(cpu, sd) { 6314 for_each_domain(cpu, sd) {
6316 snprintf(buf, 32, "domain%d", i); 6315 snprintf(buf, 32, "domain%d", i);
6317 entry->procname = kstrdup(buf, GFP_KERNEL); 6316 entry->procname = kstrdup(buf, GFP_KERNEL);
6318 entry->mode = 0555; 6317 entry->mode = 0555;
6319 entry->child = sd_alloc_ctl_domain_table(sd); 6318 entry->child = sd_alloc_ctl_domain_table(sd);
6320 entry++; 6319 entry++;
6321 i++; 6320 i++;
6322 } 6321 }
6323 return table; 6322 return table;
6324 } 6323 }
6325 6324
6326 static struct ctl_table_header *sd_sysctl_header; 6325 static struct ctl_table_header *sd_sysctl_header;
6327 static void register_sched_domain_sysctl(void) 6326 static void register_sched_domain_sysctl(void)
6328 { 6327 {
6329 int i, cpu_num = num_possible_cpus(); 6328 int i, cpu_num = num_possible_cpus();
6330 struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1); 6329 struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1);
6331 char buf[32]; 6330 char buf[32];
6332 6331
6333 WARN_ON(sd_ctl_dir[0].child); 6332 WARN_ON(sd_ctl_dir[0].child);
6334 sd_ctl_dir[0].child = entry; 6333 sd_ctl_dir[0].child = entry;
6335 6334
6336 if (entry == NULL) 6335 if (entry == NULL)
6337 return; 6336 return;
6338 6337
6339 for_each_possible_cpu(i) { 6338 for_each_possible_cpu(i) {
6340 snprintf(buf, 32, "cpu%d", i); 6339 snprintf(buf, 32, "cpu%d", i);
6341 entry->procname = kstrdup(buf, GFP_KERNEL); 6340 entry->procname = kstrdup(buf, GFP_KERNEL);
6342 entry->mode = 0555; 6341 entry->mode = 0555;
6343 entry->child = sd_alloc_ctl_cpu_table(i); 6342 entry->child = sd_alloc_ctl_cpu_table(i);
6344 entry++; 6343 entry++;
6345 } 6344 }
6346 6345
6347 WARN_ON(sd_sysctl_header); 6346 WARN_ON(sd_sysctl_header);
6348 sd_sysctl_header = register_sysctl_table(sd_ctl_root); 6347 sd_sysctl_header = register_sysctl_table(sd_ctl_root);
6349 } 6348 }
6350 6349
6351 /* may be called multiple times per register */ 6350 /* may be called multiple times per register */
6352 static void unregister_sched_domain_sysctl(void) 6351 static void unregister_sched_domain_sysctl(void)
6353 { 6352 {
6354 if (sd_sysctl_header) 6353 if (sd_sysctl_header)
6355 unregister_sysctl_table(sd_sysctl_header); 6354 unregister_sysctl_table(sd_sysctl_header);
6356 sd_sysctl_header = NULL; 6355 sd_sysctl_header = NULL;
6357 if (sd_ctl_dir[0].child) 6356 if (sd_ctl_dir[0].child)
6358 sd_free_ctl_entry(&sd_ctl_dir[0].child); 6357 sd_free_ctl_entry(&sd_ctl_dir[0].child);
6359 } 6358 }
6360 #else 6359 #else
6361 static void register_sched_domain_sysctl(void) 6360 static void register_sched_domain_sysctl(void)
6362 { 6361 {
6363 } 6362 }
6364 static void unregister_sched_domain_sysctl(void) 6363 static void unregister_sched_domain_sysctl(void)
6365 { 6364 {
6366 } 6365 }
6367 #endif 6366 #endif
6368 6367
6369 static void set_rq_online(struct rq *rq) 6368 static void set_rq_online(struct rq *rq)
6370 { 6369 {
6371 if (!rq->online) { 6370 if (!rq->online) {
6372 const struct sched_class *class; 6371 const struct sched_class *class;
6373 6372
6374 cpumask_set_cpu(rq->cpu, rq->rd->online); 6373 cpumask_set_cpu(rq->cpu, rq->rd->online);
6375 rq->online = 1; 6374 rq->online = 1;
6376 6375
6377 for_each_class(class) { 6376 for_each_class(class) {
6378 if (class->rq_online) 6377 if (class->rq_online)
6379 class->rq_online(rq); 6378 class->rq_online(rq);
6380 } 6379 }
6381 } 6380 }
6382 } 6381 }
6383 6382
6384 static void set_rq_offline(struct rq *rq) 6383 static void set_rq_offline(struct rq *rq)
6385 { 6384 {
6386 if (rq->online) { 6385 if (rq->online) {
6387 const struct sched_class *class; 6386 const struct sched_class *class;
6388 6387
6389 for_each_class(class) { 6388 for_each_class(class) {
6390 if (class->rq_offline) 6389 if (class->rq_offline)
6391 class->rq_offline(rq); 6390 class->rq_offline(rq);
6392 } 6391 }
6393 6392
6394 cpumask_clear_cpu(rq->cpu, rq->rd->online); 6393 cpumask_clear_cpu(rq->cpu, rq->rd->online);
6395 rq->online = 0; 6394 rq->online = 0;
6396 } 6395 }
6397 } 6396 }
6398 6397
6399 /* 6398 /*
6400 * migration_call - callback that gets triggered when a CPU is added. 6399 * migration_call - callback that gets triggered when a CPU is added.
6401 * Here we can start up the necessary migration thread for the new CPU. 6400 * Here we can start up the necessary migration thread for the new CPU.
6402 */ 6401 */
6403 static int __cpuinit 6402 static int __cpuinit
6404 migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) 6403 migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
6405 { 6404 {
6406 int cpu = (long)hcpu; 6405 int cpu = (long)hcpu;
6407 unsigned long flags; 6406 unsigned long flags;
6408 struct rq *rq = cpu_rq(cpu); 6407 struct rq *rq = cpu_rq(cpu);
6409 6408
6410 switch (action & ~CPU_TASKS_FROZEN) { 6409 switch (action & ~CPU_TASKS_FROZEN) {
6411 6410
6412 case CPU_UP_PREPARE: 6411 case CPU_UP_PREPARE:
6413 rq->calc_load_update = calc_load_update; 6412 rq->calc_load_update = calc_load_update;
6414 break; 6413 break;
6415 6414
6416 case CPU_ONLINE: 6415 case CPU_ONLINE:
6417 /* Update our root-domain */ 6416 /* Update our root-domain */
6418 raw_spin_lock_irqsave(&rq->lock, flags); 6417 raw_spin_lock_irqsave(&rq->lock, flags);
6419 if (rq->rd) { 6418 if (rq->rd) {
6420 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); 6419 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
6421 6420
6422 set_rq_online(rq); 6421 set_rq_online(rq);
6423 } 6422 }
6424 raw_spin_unlock_irqrestore(&rq->lock, flags); 6423 raw_spin_unlock_irqrestore(&rq->lock, flags);
6425 break; 6424 break;
6426 6425
6427 #ifdef CONFIG_HOTPLUG_CPU 6426 #ifdef CONFIG_HOTPLUG_CPU
6428 case CPU_DYING: 6427 case CPU_DYING:
6429 sched_ttwu_pending(); 6428 sched_ttwu_pending();
6430 /* Update our root-domain */ 6429 /* Update our root-domain */
6431 raw_spin_lock_irqsave(&rq->lock, flags); 6430 raw_spin_lock_irqsave(&rq->lock, flags);
6432 if (rq->rd) { 6431 if (rq->rd) {
6433 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); 6432 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
6434 set_rq_offline(rq); 6433 set_rq_offline(rq);
6435 } 6434 }
6436 migrate_tasks(cpu); 6435 migrate_tasks(cpu);
6437 BUG_ON(rq->nr_running != 1); /* the migration thread */ 6436 BUG_ON(rq->nr_running != 1); /* the migration thread */
6438 raw_spin_unlock_irqrestore(&rq->lock, flags); 6437 raw_spin_unlock_irqrestore(&rq->lock, flags);
6439 6438
6440 migrate_nr_uninterruptible(rq); 6439 migrate_nr_uninterruptible(rq);
6441 calc_global_load_remove(rq); 6440 calc_global_load_remove(rq);
6442 break; 6441 break;
6443 #endif 6442 #endif
6444 } 6443 }
6445 6444
6446 update_max_interval(); 6445 update_max_interval();
6447 6446
6448 return NOTIFY_OK; 6447 return NOTIFY_OK;
6449 } 6448 }
6450 6449
6451 /* 6450 /*
6452 * Register at high priority so that task migration (migrate_all_tasks) 6451 * Register at high priority so that task migration (migrate_all_tasks)
6453 * happens before everything else. This has to be lower priority than 6452 * happens before everything else. This has to be lower priority than
6454 * the notifier in the perf_event subsystem, though. 6453 * the notifier in the perf_event subsystem, though.
6455 */ 6454 */
6456 static struct notifier_block __cpuinitdata migration_notifier = { 6455 static struct notifier_block __cpuinitdata migration_notifier = {
6457 .notifier_call = migration_call, 6456 .notifier_call = migration_call,
6458 .priority = CPU_PRI_MIGRATION, 6457 .priority = CPU_PRI_MIGRATION,
6459 }; 6458 };
6460 6459
6461 static int __cpuinit sched_cpu_active(struct notifier_block *nfb, 6460 static int __cpuinit sched_cpu_active(struct notifier_block *nfb,
6462 unsigned long action, void *hcpu) 6461 unsigned long action, void *hcpu)
6463 { 6462 {
6464 switch (action & ~CPU_TASKS_FROZEN) { 6463 switch (action & ~CPU_TASKS_FROZEN) {
6465 case CPU_ONLINE: 6464 case CPU_ONLINE:
6466 case CPU_DOWN_FAILED: 6465 case CPU_DOWN_FAILED:
6467 set_cpu_active((long)hcpu, true); 6466 set_cpu_active((long)hcpu, true);
6468 return NOTIFY_OK; 6467 return NOTIFY_OK;
6469 default: 6468 default:
6470 return NOTIFY_DONE; 6469 return NOTIFY_DONE;
6471 } 6470 }
6472 } 6471 }
6473 6472
6474 static int __cpuinit sched_cpu_inactive(struct notifier_block *nfb, 6473 static int __cpuinit sched_cpu_inactive(struct notifier_block *nfb,
6475 unsigned long action, void *hcpu) 6474 unsigned long action, void *hcpu)
6476 { 6475 {
6477 switch (action & ~CPU_TASKS_FROZEN) { 6476 switch (action & ~CPU_TASKS_FROZEN) {
6478 case CPU_DOWN_PREPARE: 6477 case CPU_DOWN_PREPARE:
6479 set_cpu_active((long)hcpu, false); 6478 set_cpu_active((long)hcpu, false);
6480 return NOTIFY_OK; 6479 return NOTIFY_OK;
6481 default: 6480 default:
6482 return NOTIFY_DONE; 6481 return NOTIFY_DONE;
6483 } 6482 }
6484 } 6483 }
6485 6484
6486 static int __init migration_init(void) 6485 static int __init migration_init(void)
6487 { 6486 {
6488 void *cpu = (void *)(long)smp_processor_id(); 6487 void *cpu = (void *)(long)smp_processor_id();
6489 int err; 6488 int err;
6490 6489
6491 /* Initialize migration for the boot CPU */ 6490 /* Initialize migration for the boot CPU */
6492 err = migration_call(&migration_notifier, CPU_UP_PREPARE, cpu); 6491 err = migration_call(&migration_notifier, CPU_UP_PREPARE, cpu);
6493 BUG_ON(err == NOTIFY_BAD); 6492 BUG_ON(err == NOTIFY_BAD);
6494 migration_call(&migration_notifier, CPU_ONLINE, cpu); 6493 migration_call(&migration_notifier, CPU_ONLINE, cpu);
6495 register_cpu_notifier(&migration_notifier); 6494 register_cpu_notifier(&migration_notifier);
6496 6495
6497 /* Register cpu active notifiers */ 6496 /* Register cpu active notifiers */
6498 cpu_notifier(sched_cpu_active, CPU_PRI_SCHED_ACTIVE); 6497 cpu_notifier(sched_cpu_active, CPU_PRI_SCHED_ACTIVE);
6499 cpu_notifier(sched_cpu_inactive, CPU_PRI_SCHED_INACTIVE); 6498 cpu_notifier(sched_cpu_inactive, CPU_PRI_SCHED_INACTIVE);
6500 6499
6501 return 0; 6500 return 0;
6502 } 6501 }
6503 early_initcall(migration_init); 6502 early_initcall(migration_init);
6504 #endif 6503 #endif
6505 6504
6506 #ifdef CONFIG_SMP 6505 #ifdef CONFIG_SMP
6507 6506
6508 static cpumask_var_t sched_domains_tmpmask; /* sched_domains_mutex */ 6507 static cpumask_var_t sched_domains_tmpmask; /* sched_domains_mutex */
6509 6508
6510 #ifdef CONFIG_SCHED_DEBUG 6509 #ifdef CONFIG_SCHED_DEBUG
6511 6510
6512 static __read_mostly int sched_domain_debug_enabled; 6511 static __read_mostly int sched_domain_debug_enabled;
6513 6512
6514 static int __init sched_domain_debug_setup(char *str) 6513 static int __init sched_domain_debug_setup(char *str)
6515 { 6514 {
6516 sched_domain_debug_enabled = 1; 6515 sched_domain_debug_enabled = 1;
6517 6516
6518 return 0; 6517 return 0;
6519 } 6518 }
6520 early_param("sched_debug", sched_domain_debug_setup); 6519 early_param("sched_debug", sched_domain_debug_setup);
6521 6520
6522 static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, 6521 static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
6523 struct cpumask *groupmask) 6522 struct cpumask *groupmask)
6524 { 6523 {
6525 struct sched_group *group = sd->groups; 6524 struct sched_group *group = sd->groups;
6526 char str[256]; 6525 char str[256];
6527 6526
6528 cpulist_scnprintf(str, sizeof(str), sched_domain_span(sd)); 6527 cpulist_scnprintf(str, sizeof(str), sched_domain_span(sd));
6529 cpumask_clear(groupmask); 6528 cpumask_clear(groupmask);
6530 6529
6531 printk(KERN_DEBUG "%*s domain %d: ", level, "", level); 6530 printk(KERN_DEBUG "%*s domain %d: ", level, "", level);
6532 6531
6533 if (!(sd->flags & SD_LOAD_BALANCE)) { 6532 if (!(sd->flags & SD_LOAD_BALANCE)) {
6534 printk("does not load-balance\n"); 6533 printk("does not load-balance\n");
6535 if (sd->parent) 6534 if (sd->parent)
6536 printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain" 6535 printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain"
6537 " has parent"); 6536 " has parent");
6538 return -1; 6537 return -1;
6539 } 6538 }
6540 6539
6541 printk(KERN_CONT "span %s level %s\n", str, sd->name); 6540 printk(KERN_CONT "span %s level %s\n", str, sd->name);
6542 6541
6543 if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) { 6542 if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) {
6544 printk(KERN_ERR "ERROR: domain->span does not contain " 6543 printk(KERN_ERR "ERROR: domain->span does not contain "
6545 "CPU%d\n", cpu); 6544 "CPU%d\n", cpu);
6546 } 6545 }
6547 if (!cpumask_test_cpu(cpu, sched_group_cpus(group))) { 6546 if (!cpumask_test_cpu(cpu, sched_group_cpus(group))) {
6548 printk(KERN_ERR "ERROR: domain->groups does not contain" 6547 printk(KERN_ERR "ERROR: domain->groups does not contain"
6549 " CPU%d\n", cpu); 6548 " CPU%d\n", cpu);
6550 } 6549 }
6551 6550
6552 printk(KERN_DEBUG "%*s groups:", level + 1, ""); 6551 printk(KERN_DEBUG "%*s groups:", level + 1, "");
6553 do { 6552 do {
6554 if (!group) { 6553 if (!group) {
6555 printk("\n"); 6554 printk("\n");
6556 printk(KERN_ERR "ERROR: group is NULL\n"); 6555 printk(KERN_ERR "ERROR: group is NULL\n");
6557 break; 6556 break;
6558 } 6557 }
6559 6558
6560 if (!group->cpu_power) { 6559 if (!group->cpu_power) {
6561 printk(KERN_CONT "\n"); 6560 printk(KERN_CONT "\n");
6562 printk(KERN_ERR "ERROR: domain->cpu_power not " 6561 printk(KERN_ERR "ERROR: domain->cpu_power not "
6563 "set\n"); 6562 "set\n");
6564 break; 6563 break;
6565 } 6564 }
6566 6565
6567 if (!cpumask_weight(sched_group_cpus(group))) { 6566 if (!cpumask_weight(sched_group_cpus(group))) {
6568 printk(KERN_CONT "\n"); 6567 printk(KERN_CONT "\n");
6569 printk(KERN_ERR "ERROR: empty group\n"); 6568 printk(KERN_ERR "ERROR: empty group\n");
6570 break; 6569 break;
6571 } 6570 }
6572 6571
6573 if (cpumask_intersects(groupmask, sched_group_cpus(group))) { 6572 if (cpumask_intersects(groupmask, sched_group_cpus(group))) {
6574 printk(KERN_CONT "\n"); 6573 printk(KERN_CONT "\n");
6575 printk(KERN_ERR "ERROR: repeated CPUs\n"); 6574 printk(KERN_ERR "ERROR: repeated CPUs\n");
6576 break; 6575 break;
6577 } 6576 }
6578 6577
6579 cpumask_or(groupmask, groupmask, sched_group_cpus(group)); 6578 cpumask_or(groupmask, groupmask, sched_group_cpus(group));
6580 6579
6581 cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group)); 6580 cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group));
6582 6581
6583 printk(KERN_CONT " %s", str); 6582 printk(KERN_CONT " %s", str);
6584 if (group->cpu_power != SCHED_POWER_SCALE) { 6583 if (group->cpu_power != SCHED_POWER_SCALE) {
6585 printk(KERN_CONT " (cpu_power = %d)", 6584 printk(KERN_CONT " (cpu_power = %d)",
6586 group->cpu_power); 6585 group->cpu_power);
6587 } 6586 }
6588 6587
6589 group = group->next; 6588 group = group->next;
6590 } while (group != sd->groups); 6589 } while (group != sd->groups);
6591 printk(KERN_CONT "\n"); 6590 printk(KERN_CONT "\n");
6592 6591
6593 if (!cpumask_equal(sched_domain_span(sd), groupmask)) 6592 if (!cpumask_equal(sched_domain_span(sd), groupmask))
6594 printk(KERN_ERR "ERROR: groups don't span domain->span\n"); 6593 printk(KERN_ERR "ERROR: groups don't span domain->span\n");
6595 6594
6596 if (sd->parent && 6595 if (sd->parent &&
6597 !cpumask_subset(groupmask, sched_domain_span(sd->parent))) 6596 !cpumask_subset(groupmask, sched_domain_span(sd->parent)))
6598 printk(KERN_ERR "ERROR: parent span is not a superset " 6597 printk(KERN_ERR "ERROR: parent span is not a superset "
6599 "of domain->span\n"); 6598 "of domain->span\n");
6600 return 0; 6599 return 0;
6601 } 6600 }
6602 6601
6603 static void sched_domain_debug(struct sched_domain *sd, int cpu) 6602 static void sched_domain_debug(struct sched_domain *sd, int cpu)
6604 { 6603 {
6605 int level = 0; 6604 int level = 0;
6606 6605
6607 if (!sched_domain_debug_enabled) 6606 if (!sched_domain_debug_enabled)
6608 return; 6607 return;
6609 6608
6610 if (!sd) { 6609 if (!sd) {
6611 printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu); 6610 printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu);
6612 return; 6611 return;
6613 } 6612 }
6614 6613
6615 printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu); 6614 printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu);
6616 6615
6617 for (;;) { 6616 for (;;) {
6618 if (sched_domain_debug_one(sd, cpu, level, sched_domains_tmpmask)) 6617 if (sched_domain_debug_one(sd, cpu, level, sched_domains_tmpmask))
6619 break; 6618 break;
6620 level++; 6619 level++;
6621 sd = sd->parent; 6620 sd = sd->parent;
6622 if (!sd) 6621 if (!sd)
6623 break; 6622 break;
6624 } 6623 }
6625 } 6624 }
6626 #else /* !CONFIG_SCHED_DEBUG */ 6625 #else /* !CONFIG_SCHED_DEBUG */
6627 # define sched_domain_debug(sd, cpu) do { } while (0) 6626 # define sched_domain_debug(sd, cpu) do { } while (0)
6628 #endif /* CONFIG_SCHED_DEBUG */ 6627 #endif /* CONFIG_SCHED_DEBUG */
6629 6628
6630 static int sd_degenerate(struct sched_domain *sd) 6629 static int sd_degenerate(struct sched_domain *sd)
6631 { 6630 {
6632 if (cpumask_weight(sched_domain_span(sd)) == 1) 6631 if (cpumask_weight(sched_domain_span(sd)) == 1)
6633 return 1; 6632 return 1;
6634 6633
6635 /* Following flags need at least 2 groups */ 6634 /* Following flags need at least 2 groups */
6636 if (sd->flags & (SD_LOAD_BALANCE | 6635 if (sd->flags & (SD_LOAD_BALANCE |
6637 SD_BALANCE_NEWIDLE | 6636 SD_BALANCE_NEWIDLE |
6638 SD_BALANCE_FORK | 6637 SD_BALANCE_FORK |
6639 SD_BALANCE_EXEC | 6638 SD_BALANCE_EXEC |
6640 SD_SHARE_CPUPOWER | 6639 SD_SHARE_CPUPOWER |
6641 SD_SHARE_PKG_RESOURCES)) { 6640 SD_SHARE_PKG_RESOURCES)) {
6642 if (sd->groups != sd->groups->next) 6641 if (sd->groups != sd->groups->next)
6643 return 0; 6642 return 0;
6644 } 6643 }
6645 6644
6646 /* Following flags don't use groups */ 6645 /* Following flags don't use groups */
6647 if (sd->flags & (SD_WAKE_AFFINE)) 6646 if (sd->flags & (SD_WAKE_AFFINE))
6648 return 0; 6647 return 0;
6649 6648
6650 return 1; 6649 return 1;
6651 } 6650 }
6652 6651
6653 static int 6652 static int
6654 sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) 6653 sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
6655 { 6654 {
6656 unsigned long cflags = sd->flags, pflags = parent->flags; 6655 unsigned long cflags = sd->flags, pflags = parent->flags;
6657 6656
6658 if (sd_degenerate(parent)) 6657 if (sd_degenerate(parent))
6659 return 1; 6658 return 1;
6660 6659
6661 if (!cpumask_equal(sched_domain_span(sd), sched_domain_span(parent))) 6660 if (!cpumask_equal(sched_domain_span(sd), sched_domain_span(parent)))
6662 return 0; 6661 return 0;
6663 6662
6664 /* Flags needing groups don't count if only 1 group in parent */ 6663 /* Flags needing groups don't count if only 1 group in parent */
6665 if (parent->groups == parent->groups->next) { 6664 if (parent->groups == parent->groups->next) {
6666 pflags &= ~(SD_LOAD_BALANCE | 6665 pflags &= ~(SD_LOAD_BALANCE |
6667 SD_BALANCE_NEWIDLE | 6666 SD_BALANCE_NEWIDLE |
6668 SD_BALANCE_FORK | 6667 SD_BALANCE_FORK |
6669 SD_BALANCE_EXEC | 6668 SD_BALANCE_EXEC |
6670 SD_SHARE_CPUPOWER | 6669 SD_SHARE_CPUPOWER |
6671 SD_SHARE_PKG_RESOURCES); 6670 SD_SHARE_PKG_RESOURCES);
6672 if (nr_node_ids == 1) 6671 if (nr_node_ids == 1)
6673 pflags &= ~SD_SERIALIZE; 6672 pflags &= ~SD_SERIALIZE;
6674 } 6673 }
6675 if (~cflags & pflags) 6674 if (~cflags & pflags)
6676 return 0; 6675 return 0;
6677 6676
6678 return 1; 6677 return 1;
6679 } 6678 }
6680 6679
6681 static void free_rootdomain(struct rcu_head *rcu) 6680 static void free_rootdomain(struct rcu_head *rcu)
6682 { 6681 {
6683 struct root_domain *rd = container_of(rcu, struct root_domain, rcu); 6682 struct root_domain *rd = container_of(rcu, struct root_domain, rcu);
6684 6683
6685 cpupri_cleanup(&rd->cpupri); 6684 cpupri_cleanup(&rd->cpupri);
6686 free_cpumask_var(rd->rto_mask); 6685 free_cpumask_var(rd->rto_mask);
6687 free_cpumask_var(rd->online); 6686 free_cpumask_var(rd->online);
6688 free_cpumask_var(rd->span); 6687 free_cpumask_var(rd->span);
6689 kfree(rd); 6688 kfree(rd);
6690 } 6689 }
6691 6690
6692 static void rq_attach_root(struct rq *rq, struct root_domain *rd) 6691 static void rq_attach_root(struct rq *rq, struct root_domain *rd)
6693 { 6692 {
6694 struct root_domain *old_rd = NULL; 6693 struct root_domain *old_rd = NULL;
6695 unsigned long flags; 6694 unsigned long flags;
6696 6695
6697 raw_spin_lock_irqsave(&rq->lock, flags); 6696 raw_spin_lock_irqsave(&rq->lock, flags);
6698 6697
6699 if (rq->rd) { 6698 if (rq->rd) {
6700 old_rd = rq->rd; 6699 old_rd = rq->rd;
6701 6700
6702 if (cpumask_test_cpu(rq->cpu, old_rd->online)) 6701 if (cpumask_test_cpu(rq->cpu, old_rd->online))
6703 set_rq_offline(rq); 6702 set_rq_offline(rq);
6704 6703
6705 cpumask_clear_cpu(rq->cpu, old_rd->span); 6704 cpumask_clear_cpu(rq->cpu, old_rd->span);
6706 6705
6707 /* 6706 /*
6708 * If we don't want to free the old_rd yet then 6707 * If we don't want to free the old_rd yet then
6709 * set old_rd to NULL to skip the freeing later 6708 * set old_rd to NULL to skip the freeing later
6710 * in this function: 6709 * in this function:
6711 */ 6710 */
6712 if (!atomic_dec_and_test(&old_rd->refcount)) 6711 if (!atomic_dec_and_test(&old_rd->refcount))
6713 old_rd = NULL; 6712 old_rd = NULL;
6714 } 6713 }
6715 6714
6716 atomic_inc(&rd->refcount); 6715 atomic_inc(&rd->refcount);
6717 rq->rd = rd; 6716 rq->rd = rd;
6718 6717
6719 cpumask_set_cpu(rq->cpu, rd->span); 6718 cpumask_set_cpu(rq->cpu, rd->span);
6720 if (cpumask_test_cpu(rq->cpu, cpu_active_mask)) 6719 if (cpumask_test_cpu(rq->cpu, cpu_active_mask))
6721 set_rq_online(rq); 6720 set_rq_online(rq);
6722 6721
6723 raw_spin_unlock_irqrestore(&rq->lock, flags); 6722 raw_spin_unlock_irqrestore(&rq->lock, flags);
6724 6723
6725 if (old_rd) 6724 if (old_rd)
6726 call_rcu_sched(&old_rd->rcu, free_rootdomain); 6725 call_rcu_sched(&old_rd->rcu, free_rootdomain);
6727 } 6726 }
6728 6727
6729 static int init_rootdomain(struct root_domain *rd) 6728 static int init_rootdomain(struct root_domain *rd)
6730 { 6729 {
6731 memset(rd, 0, sizeof(*rd)); 6730 memset(rd, 0, sizeof(*rd));
6732 6731
6733 if (!alloc_cpumask_var(&rd->span, GFP_KERNEL)) 6732 if (!alloc_cpumask_var(&rd->span, GFP_KERNEL))
6734 goto out; 6733 goto out;
6735 if (!alloc_cpumask_var(&rd->online, GFP_KERNEL)) 6734 if (!alloc_cpumask_var(&rd->online, GFP_KERNEL))
6736 goto free_span; 6735 goto free_span;
6737 if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL)) 6736 if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL))
6738 goto free_online; 6737 goto free_online;
6739 6738
6740 if (cpupri_init(&rd->cpupri) != 0) 6739 if (cpupri_init(&rd->cpupri) != 0)
6741 goto free_rto_mask; 6740 goto free_rto_mask;
6742 return 0; 6741 return 0;
6743 6742
6744 free_rto_mask: 6743 free_rto_mask:
6745 free_cpumask_var(rd->rto_mask); 6744 free_cpumask_var(rd->rto_mask);
6746 free_online: 6745 free_online:
6747 free_cpumask_var(rd->online); 6746 free_cpumask_var(rd->online);
6748 free_span: 6747 free_span:
6749 free_cpumask_var(rd->span); 6748 free_cpumask_var(rd->span);
6750 out: 6749 out:
6751 return -ENOMEM; 6750 return -ENOMEM;
6752 } 6751 }
6753 6752
6754 static void init_defrootdomain(void) 6753 static void init_defrootdomain(void)
6755 { 6754 {
6756 init_rootdomain(&def_root_domain); 6755 init_rootdomain(&def_root_domain);
6757 6756
6758 atomic_set(&def_root_domain.refcount, 1); 6757 atomic_set(&def_root_domain.refcount, 1);
6759 } 6758 }
6760 6759
6761 static struct root_domain *alloc_rootdomain(void) 6760 static struct root_domain *alloc_rootdomain(void)
6762 { 6761 {
6763 struct root_domain *rd; 6762 struct root_domain *rd;
6764 6763
6765 rd = kmalloc(sizeof(*rd), GFP_KERNEL); 6764 rd = kmalloc(sizeof(*rd), GFP_KERNEL);
6766 if (!rd) 6765 if (!rd)
6767 return NULL; 6766 return NULL;
6768 6767
6769 if (init_rootdomain(rd) != 0) { 6768 if (init_rootdomain(rd) != 0) {
6770 kfree(rd); 6769 kfree(rd);
6771 return NULL; 6770 return NULL;
6772 } 6771 }
6773 6772
6774 return rd; 6773 return rd;
6775 } 6774 }
6776 6775
6777 static void free_sched_domain(struct rcu_head *rcu) 6776 static void free_sched_domain(struct rcu_head *rcu)
6778 { 6777 {
6779 struct sched_domain *sd = container_of(rcu, struct sched_domain, rcu); 6778 struct sched_domain *sd = container_of(rcu, struct sched_domain, rcu);
6780 if (atomic_dec_and_test(&sd->groups->ref)) 6779 if (atomic_dec_and_test(&sd->groups->ref))
6781 kfree(sd->groups); 6780 kfree(sd->groups);
6782 kfree(sd); 6781 kfree(sd);
6783 } 6782 }
6784 6783
6785 static void destroy_sched_domain(struct sched_domain *sd, int cpu) 6784 static void destroy_sched_domain(struct sched_domain *sd, int cpu)
6786 { 6785 {
6787 call_rcu(&sd->rcu, free_sched_domain); 6786 call_rcu(&sd->rcu, free_sched_domain);
6788 } 6787 }
6789 6788
6790 static void destroy_sched_domains(struct sched_domain *sd, int cpu) 6789 static void destroy_sched_domains(struct sched_domain *sd, int cpu)
6791 { 6790 {
6792 for (; sd; sd = sd->parent) 6791 for (; sd; sd = sd->parent)
6793 destroy_sched_domain(sd, cpu); 6792 destroy_sched_domain(sd, cpu);
6794 } 6793 }
6795 6794
6796 /* 6795 /*
6797 * Attach the domain 'sd' to 'cpu' as its base domain. Callers must 6796 * Attach the domain 'sd' to 'cpu' as its base domain. Callers must
6798 * hold the hotplug lock. 6797 * hold the hotplug lock.
6799 */ 6798 */
6800 static void 6799 static void
6801 cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) 6800 cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
6802 { 6801 {
6803 struct rq *rq = cpu_rq(cpu); 6802 struct rq *rq = cpu_rq(cpu);
6804 struct sched_domain *tmp; 6803 struct sched_domain *tmp;
6805 6804
6806 /* Remove the sched domains which do not contribute to scheduling. */ 6805 /* Remove the sched domains which do not contribute to scheduling. */
6807 for (tmp = sd; tmp; ) { 6806 for (tmp = sd; tmp; ) {
6808 struct sched_domain *parent = tmp->parent; 6807 struct sched_domain *parent = tmp->parent;
6809 if (!parent) 6808 if (!parent)
6810 break; 6809 break;
6811 6810
6812 if (sd_parent_degenerate(tmp, parent)) { 6811 if (sd_parent_degenerate(tmp, parent)) {
6813 tmp->parent = parent->parent; 6812 tmp->parent = parent->parent;
6814 if (parent->parent) 6813 if (parent->parent)
6815 parent->parent->child = tmp; 6814 parent->parent->child = tmp;
6816 destroy_sched_domain(parent, cpu); 6815 destroy_sched_domain(parent, cpu);
6817 } else 6816 } else
6818 tmp = tmp->parent; 6817 tmp = tmp->parent;
6819 } 6818 }
6820 6819
6821 if (sd && sd_degenerate(sd)) { 6820 if (sd && sd_degenerate(sd)) {
6822 tmp = sd; 6821 tmp = sd;
6823 sd = sd->parent; 6822 sd = sd->parent;
6824 destroy_sched_domain(tmp, cpu); 6823 destroy_sched_domain(tmp, cpu);
6825 if (sd) 6824 if (sd)
6826 sd->child = NULL; 6825 sd->child = NULL;
6827 } 6826 }
6828 6827
6829 sched_domain_debug(sd, cpu); 6828 sched_domain_debug(sd, cpu);
6830 6829
6831 rq_attach_root(rq, rd); 6830 rq_attach_root(rq, rd);
6832 tmp = rq->sd; 6831 tmp = rq->sd;
6833 rcu_assign_pointer(rq->sd, sd); 6832 rcu_assign_pointer(rq->sd, sd);
6834 destroy_sched_domains(tmp, cpu); 6833 destroy_sched_domains(tmp, cpu);
6835 } 6834 }
6836 6835
6837 /* cpus with isolated domains */ 6836 /* cpus with isolated domains */
6838 static cpumask_var_t cpu_isolated_map; 6837 static cpumask_var_t cpu_isolated_map;
6839 6838
6840 /* Setup the mask of cpus configured for isolated domains */ 6839 /* Setup the mask of cpus configured for isolated domains */
6841 static int __init isolated_cpu_setup(char *str) 6840 static int __init isolated_cpu_setup(char *str)
6842 { 6841 {
6843 alloc_bootmem_cpumask_var(&cpu_isolated_map); 6842 alloc_bootmem_cpumask_var(&cpu_isolated_map);
6844 cpulist_parse(str, cpu_isolated_map); 6843 cpulist_parse(str, cpu_isolated_map);
6845 return 1; 6844 return 1;
6846 } 6845 }
6847 6846
6848 __setup("isolcpus=", isolated_cpu_setup); 6847 __setup("isolcpus=", isolated_cpu_setup);
6849 6848
6850 #define SD_NODES_PER_DOMAIN 16 6849 #define SD_NODES_PER_DOMAIN 16
6851 6850
6852 #ifdef CONFIG_NUMA 6851 #ifdef CONFIG_NUMA
6853 6852
6854 /** 6853 /**
6855 * find_next_best_node - find the next node to include in a sched_domain 6854 * find_next_best_node - find the next node to include in a sched_domain
6856 * @node: node whose sched_domain we're building 6855 * @node: node whose sched_domain we're building
6857 * @used_nodes: nodes already in the sched_domain 6856 * @used_nodes: nodes already in the sched_domain
6858 * 6857 *
6859 * Find the next node to include in a given scheduling domain. Simply 6858 * Find the next node to include in a given scheduling domain. Simply
6860 * finds the closest node not already in the @used_nodes map. 6859 * finds the closest node not already in the @used_nodes map.
6861 * 6860 *
6862 * Should use nodemask_t. 6861 * Should use nodemask_t.
6863 */ 6862 */
6864 static int find_next_best_node(int node, nodemask_t *used_nodes) 6863 static int find_next_best_node(int node, nodemask_t *used_nodes)
6865 { 6864 {
6866 int i, n, val, min_val, best_node = -1; 6865 int i, n, val, min_val, best_node = -1;
6867 6866
6868 min_val = INT_MAX; 6867 min_val = INT_MAX;
6869 6868
6870 for (i = 0; i < nr_node_ids; i++) { 6869 for (i = 0; i < nr_node_ids; i++) {
6871 /* Start at @node */ 6870 /* Start at @node */
6872 n = (node + i) % nr_node_ids; 6871 n = (node + i) % nr_node_ids;
6873 6872
6874 if (!nr_cpus_node(n)) 6873 if (!nr_cpus_node(n))
6875 continue; 6874 continue;
6876 6875
6877 /* Skip already used nodes */ 6876 /* Skip already used nodes */
6878 if (node_isset(n, *used_nodes)) 6877 if (node_isset(n, *used_nodes))
6879 continue; 6878 continue;
6880 6879
6881 /* Simple min distance search */ 6880 /* Simple min distance search */
6882 val = node_distance(node, n); 6881 val = node_distance(node, n);
6883 6882
6884 if (val < min_val) { 6883 if (val < min_val) {
6885 min_val = val; 6884 min_val = val;
6886 best_node = n; 6885 best_node = n;
6887 } 6886 }
6888 } 6887 }
6889 6888
6890 if (best_node != -1) 6889 if (best_node != -1)
6891 node_set(best_node, *used_nodes); 6890 node_set(best_node, *used_nodes);
6892 return best_node; 6891 return best_node;
6893 } 6892 }
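For illustration only, a self-contained userspace rendering of the same greedy nearest-node search: the 4x4 distance table is made up and the nr_cpus_node() check is dropped.

#include <limits.h>
#include <stdbool.h>
#include <stdio.h>

#define NR_NODES 4

/* Hypothetical symmetric node-distance table, smaller means closer. */
static const int node_distance[NR_NODES][NR_NODES] = {
	{ 10, 20, 20, 40 },
	{ 20, 10, 40, 20 },
	{ 20, 40, 10, 20 },
	{ 40, 20, 20, 10 },
};

static int next_best_node(int node, bool used[NR_NODES])
{
	int i, n, min_val = INT_MAX, best = -1;

	for (i = 0; i < NR_NODES; i++) {
		n = (node + i) % NR_NODES;	/* start the scan at @node */
		if (used[n])
			continue;		/* skip already used nodes */
		if (node_distance[node][n] < min_val) {
			min_val = node_distance[node][n];
			best = n;
		}
	}
	if (best != -1)
		used[best] = true;
	return best;
}

int main(void)
{
	bool used[NR_NODES] = { [0] = true };	/* node 0 is the base node */
	int n;

	/* Expected order for node 0: 1, 2, 3 (nearest first, ties by scan order). */
	while ((n = next_best_node(0, used)) != -1)
		printf("next best node: %d\n", n);
	return 0;
}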
6894 6893
6895 /** 6894 /**
6896 * sched_domain_node_span - get a cpumask for a node's sched_domain 6895 * sched_domain_node_span - get a cpumask for a node's sched_domain
6897 * @node: node whose cpumask we're constructing 6896 * @node: node whose cpumask we're constructing
6898 * @span: resulting cpumask 6897 * @span: resulting cpumask
6899 * 6898 *
6900 * Given a node, construct a good cpumask for its sched_domain to span. It 6899 * Given a node, construct a good cpumask for its sched_domain to span. It
6901 * should be one that prevents unnecessary balancing, but also spreads tasks 6900 * should be one that prevents unnecessary balancing, but also spreads tasks
6902 * out optimally. 6901 * out optimally.
6903 */ 6902 */
6904 static void sched_domain_node_span(int node, struct cpumask *span) 6903 static void sched_domain_node_span(int node, struct cpumask *span)
6905 { 6904 {
6906 nodemask_t used_nodes; 6905 nodemask_t used_nodes;
6907 int i; 6906 int i;
6908 6907
6909 cpumask_clear(span); 6908 cpumask_clear(span);
6910 nodes_clear(used_nodes); 6909 nodes_clear(used_nodes);
6911 6910
6912 cpumask_or(span, span, cpumask_of_node(node)); 6911 cpumask_or(span, span, cpumask_of_node(node));
6913 node_set(node, used_nodes); 6912 node_set(node, used_nodes);
6914 6913
6915 for (i = 1; i < SD_NODES_PER_DOMAIN; i++) { 6914 for (i = 1; i < SD_NODES_PER_DOMAIN; i++) {
6916 int next_node = find_next_best_node(node, &used_nodes); 6915 int next_node = find_next_best_node(node, &used_nodes);
6917 if (next_node < 0) 6916 if (next_node < 0)
6918 break; 6917 break;
6919 cpumask_or(span, span, cpumask_of_node(next_node)); 6918 cpumask_or(span, span, cpumask_of_node(next_node));
6920 } 6919 }
6921 } 6920 }
6922 6921
6923 static const struct cpumask *cpu_node_mask(int cpu) 6922 static const struct cpumask *cpu_node_mask(int cpu)
6924 { 6923 {
6925 lockdep_assert_held(&sched_domains_mutex); 6924 lockdep_assert_held(&sched_domains_mutex);
6926 6925
6927 sched_domain_node_span(cpu_to_node(cpu), sched_domains_tmpmask); 6926 sched_domain_node_span(cpu_to_node(cpu), sched_domains_tmpmask);
6928 6927
6929 return sched_domains_tmpmask; 6928 return sched_domains_tmpmask;
6930 } 6929 }
6931 6930
6932 static const struct cpumask *cpu_allnodes_mask(int cpu) 6931 static const struct cpumask *cpu_allnodes_mask(int cpu)
6933 { 6932 {
6934 return cpu_possible_mask; 6933 return cpu_possible_mask;
6935 } 6934 }
6936 #endif /* CONFIG_NUMA */ 6935 #endif /* CONFIG_NUMA */
6937 6936
6938 static const struct cpumask *cpu_cpu_mask(int cpu) 6937 static const struct cpumask *cpu_cpu_mask(int cpu)
6939 { 6938 {
6940 return cpumask_of_node(cpu_to_node(cpu)); 6939 return cpumask_of_node(cpu_to_node(cpu));
6941 } 6940 }
6942 6941
6943 int sched_smt_power_savings = 0, sched_mc_power_savings = 0; 6942 int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
6944 6943
6945 struct sd_data { 6944 struct sd_data {
6946 struct sched_domain **__percpu sd; 6945 struct sched_domain **__percpu sd;
6947 struct sched_group **__percpu sg; 6946 struct sched_group **__percpu sg;
6948 }; 6947 };
6949 6948
6950 struct s_data { 6949 struct s_data {
6951 struct sched_domain ** __percpu sd; 6950 struct sched_domain ** __percpu sd;
6952 struct root_domain *rd; 6951 struct root_domain *rd;
6953 }; 6952 };
6954 6953
6955 enum s_alloc { 6954 enum s_alloc {
6956 sa_rootdomain, 6955 sa_rootdomain,
6957 sa_sd, 6956 sa_sd,
6958 sa_sd_storage, 6957 sa_sd_storage,
6959 sa_none, 6958 sa_none,
6960 }; 6959 };
6961 6960
6962 struct sched_domain_topology_level; 6961 struct sched_domain_topology_level;
6963 6962
6964 typedef struct sched_domain *(*sched_domain_init_f)(struct sched_domain_topology_level *tl, int cpu); 6963 typedef struct sched_domain *(*sched_domain_init_f)(struct sched_domain_topology_level *tl, int cpu);
6965 typedef const struct cpumask *(*sched_domain_mask_f)(int cpu); 6964 typedef const struct cpumask *(*sched_domain_mask_f)(int cpu);
6966 6965
6967 struct sched_domain_topology_level { 6966 struct sched_domain_topology_level {
6968 sched_domain_init_f init; 6967 sched_domain_init_f init;
6969 sched_domain_mask_f mask; 6968 sched_domain_mask_f mask;
6970 struct sd_data data; 6969 struct sd_data data;
6971 }; 6970 };
6972 6971
6973 /* 6972 /*
6974 * Assumes the sched_domain tree is fully constructed 6973 * Assumes the sched_domain tree is fully constructed
6975 */ 6974 */
6976 static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg) 6975 static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg)
6977 { 6976 {
6978 struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu); 6977 struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu);
6979 struct sched_domain *child = sd->child; 6978 struct sched_domain *child = sd->child;
6980 6979
6981 if (child) 6980 if (child)
6982 cpu = cpumask_first(sched_domain_span(child)); 6981 cpu = cpumask_first(sched_domain_span(child));
6983 6982
6984 if (sg) 6983 if (sg)
6985 *sg = *per_cpu_ptr(sdd->sg, cpu); 6984 *sg = *per_cpu_ptr(sdd->sg, cpu);
6986 6985
6987 return cpu; 6986 return cpu;
6988 } 6987 }
6989 6988
6990 /* 6989 /*
6991 * build_sched_groups takes the cpumask we wish to span, and a pointer 6990 * build_sched_groups takes the cpumask we wish to span, and a pointer
6992 * to a function which identifies what group (along with sched group) a CPU 6991 * to a function which identifies what group (along with sched group) a CPU
6993 * belongs to. The return value of group_fn must be >= 0 and < nr_cpu_ids 6992 * belongs to. The return value of group_fn must be >= 0 and < nr_cpu_ids
6994 * (due to the fact that we keep track of groups covered with a struct cpumask). 6993 * (due to the fact that we keep track of groups covered with a struct cpumask).
6995 * 6994 *
6996 * build_sched_groups will build a circular linked list of the groups 6995 * build_sched_groups will build a circular linked list of the groups
6997 * covered by the given span, and will set each group's ->cpumask correctly, 6996 * covered by the given span, and will set each group's ->cpumask correctly,
6998 * and ->cpu_power to 0. 6997 * and ->cpu_power to 0.
6999 */ 6998 */
7000 static void 6999 static void
7001 build_sched_groups(struct sched_domain *sd) 7000 build_sched_groups(struct sched_domain *sd)
7002 { 7001 {
7003 struct sched_group *first = NULL, *last = NULL; 7002 struct sched_group *first = NULL, *last = NULL;
7004 struct sd_data *sdd = sd->private; 7003 struct sd_data *sdd = sd->private;
7005 const struct cpumask *span = sched_domain_span(sd); 7004 const struct cpumask *span = sched_domain_span(sd);
7006 struct cpumask *covered; 7005 struct cpumask *covered;
7007 int i; 7006 int i;
7008 7007
7009 lockdep_assert_held(&sched_domains_mutex); 7008 lockdep_assert_held(&sched_domains_mutex);
7010 covered = sched_domains_tmpmask; 7009 covered = sched_domains_tmpmask;
7011 7010
7012 cpumask_clear(covered); 7011 cpumask_clear(covered);
7013 7012
7014 for_each_cpu(i, span) { 7013 for_each_cpu(i, span) {
7015 struct sched_group *sg; 7014 struct sched_group *sg;
7016 int group = get_group(i, sdd, &sg); 7015 int group = get_group(i, sdd, &sg);
7017 int j; 7016 int j;
7018 7017
7019 if (cpumask_test_cpu(i, covered)) 7018 if (cpumask_test_cpu(i, covered))
7020 continue; 7019 continue;
7021 7020
7022 cpumask_clear(sched_group_cpus(sg)); 7021 cpumask_clear(sched_group_cpus(sg));
7023 sg->cpu_power = 0; 7022 sg->cpu_power = 0;
7024 7023
7025 for_each_cpu(j, span) { 7024 for_each_cpu(j, span) {
7026 if (get_group(j, sdd, NULL) != group) 7025 if (get_group(j, sdd, NULL) != group)
7027 continue; 7026 continue;
7028 7027
7029 cpumask_set_cpu(j, covered); 7028 cpumask_set_cpu(j, covered);
7030 cpumask_set_cpu(j, sched_group_cpus(sg)); 7029 cpumask_set_cpu(j, sched_group_cpus(sg));
7031 } 7030 }
7032 7031
7033 if (!first) 7032 if (!first)
7034 first = sg; 7033 first = sg;
7035 if (last) 7034 if (last)
7036 last->next = sg; 7035 last->next = sg;
7037 last = sg; 7036 last = sg;
7038 } 7037 }
7039 last->next = first; 7038 last->next = first;
7040 } 7039 }
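The circular list built above is typically consumed with a do/while walk, as sched_domain_debug_one() earlier in this file does; a hypothetical helper makes the pattern explicit.

#include <linux/sched.h>

/* Sketch: visit every group exactly once; last->next points back to the first. */
static void for_each_group_in_domain(struct sched_domain *sd,
				     void (*fn)(struct sched_group *sg))
{
	struct sched_group *sg = sd->groups;

	do {
		fn(sg);
		sg = sg->next;
	} while (sg != sd->groups);
}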
7041 7040
7042 /* 7041 /*
7043 * Initialize sched groups cpu_power. 7042 * Initialize sched groups cpu_power.
7044 * 7043 *
7045 * cpu_power indicates the capacity of sched group, which is used while 7044 * cpu_power indicates the capacity of sched group, which is used while
7046 * distributing the load between different sched groups in a sched domain. 7045 * distributing the load between different sched groups in a sched domain.
7047 * Typically cpu_power for all the groups in a sched domain will be the same unless 7046 * Typically cpu_power for all the groups in a sched domain will be the same unless
7048 * there are asymmetries in the topology. If there are asymmetries, group 7047 * there are asymmetries in the topology. If there are asymmetries, group
7049 * having more cpu_power will pick up more load compared to the group having 7048 * having more cpu_power will pick up more load compared to the group having
7050 * less cpu_power. 7049 * less cpu_power.
7051 */ 7050 */
7052 static void init_sched_groups_power(int cpu, struct sched_domain *sd) 7051 static void init_sched_groups_power(int cpu, struct sched_domain *sd)
7053 { 7052 {
7054 WARN_ON(!sd || !sd->groups); 7053 WARN_ON(!sd || !sd->groups);
7055 7054
7056 if (cpu != group_first_cpu(sd->groups)) 7055 if (cpu != group_first_cpu(sd->groups))
7057 return; 7056 return;
7058 7057
7059 sd->groups->group_weight = cpumask_weight(sched_group_cpus(sd->groups)); 7058 sd->groups->group_weight = cpumask_weight(sched_group_cpus(sd->groups));
7060 7059
7061 update_group_power(sd, cpu); 7060 update_group_power(sd, cpu);
7062 } 7061 }
7063 7062
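To make the proportionality in the comment above concrete (a worked sketch, not code from this file): a group's intended share of the domain load scales with its cpu_power relative to the domain total, so with SCHED_POWER_SCALE == 1024 a group advertising 2048 should attract roughly twice the load of a sibling advertising 1024.

	/* sketch: proportional split of 'load' between groups of a domain */
	static unsigned long group_share(unsigned long load,
					 unsigned long group_power,
					 unsigned long domain_power)
	{
		return load * group_power / domain_power;
	}
	/* group_share(300, 2048, 3072) == 200, group_share(300, 1024, 3072) == 100 */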
7064 /* 7063 /*
7065 * Initializers for schedule domains 7064 * Initializers for schedule domains
7066 * Non-inlined to reduce accumulated stack pressure in build_sched_domains() 7065 * Non-inlined to reduce accumulated stack pressure in build_sched_domains()
7067 */ 7066 */
7068 7067
7069 #ifdef CONFIG_SCHED_DEBUG 7068 #ifdef CONFIG_SCHED_DEBUG
7070 # define SD_INIT_NAME(sd, type) sd->name = #type 7069 # define SD_INIT_NAME(sd, type) sd->name = #type
7071 #else 7070 #else
7072 # define SD_INIT_NAME(sd, type) do { } while (0) 7071 # define SD_INIT_NAME(sd, type) do { } while (0)
7073 #endif 7072 #endif
7074 7073
7075 #define SD_INIT_FUNC(type) \ 7074 #define SD_INIT_FUNC(type) \
7076 static noinline struct sched_domain * \ 7075 static noinline struct sched_domain * \
7077 sd_init_##type(struct sched_domain_topology_level *tl, int cpu) \ 7076 sd_init_##type(struct sched_domain_topology_level *tl, int cpu) \
7078 { \ 7077 { \
7079 struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu); \ 7078 struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu); \
7080 *sd = SD_##type##_INIT; \ 7079 *sd = SD_##type##_INIT; \
7081 SD_INIT_NAME(sd, type); \ 7080 SD_INIT_NAME(sd, type); \
7082 sd->private = &tl->data; \ 7081 sd->private = &tl->data; \
7083 return sd; \ 7082 return sd; \
7084 } 7083 }
7085 7084
7086 SD_INIT_FUNC(CPU) 7085 SD_INIT_FUNC(CPU)
7087 #ifdef CONFIG_NUMA 7086 #ifdef CONFIG_NUMA
7088 SD_INIT_FUNC(ALLNODES) 7087 SD_INIT_FUNC(ALLNODES)
7089 SD_INIT_FUNC(NODE) 7088 SD_INIT_FUNC(NODE)
7090 #endif 7089 #endif
7091 #ifdef CONFIG_SCHED_SMT 7090 #ifdef CONFIG_SCHED_SMT
7092 SD_INIT_FUNC(SIBLING) 7091 SD_INIT_FUNC(SIBLING)
7093 #endif 7092 #endif
7094 #ifdef CONFIG_SCHED_MC 7093 #ifdef CONFIG_SCHED_MC
7095 SD_INIT_FUNC(MC) 7094 SD_INIT_FUNC(MC)
7096 #endif 7095 #endif
7097 #ifdef CONFIG_SCHED_BOOK 7096 #ifdef CONFIG_SCHED_BOOK
7098 SD_INIT_FUNC(BOOK) 7097 SD_INIT_FUNC(BOOK)
7099 #endif 7098 #endif
7100 7099
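For readability, this is what the SD_INIT_FUNC(CPU) instantiation above expands to (with CONFIG_SCHED_DEBUG enabled, so SD_INIT_NAME() becomes an assignment of the stringified type):

	static noinline struct sched_domain *
	sd_init_CPU(struct sched_domain_topology_level *tl, int cpu)
	{
		struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu);

		*sd = SD_CPU_INIT;		/* per-level initializer template */
		sd->name = "CPU";		/* SD_INIT_NAME(sd, CPU) */
		sd->private = &tl->data;
		return sd;
	}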
7101 static int default_relax_domain_level = -1; 7100 static int default_relax_domain_level = -1;
7102 int sched_domain_level_max; 7101 int sched_domain_level_max;
7103 7102
7104 static int __init setup_relax_domain_level(char *str) 7103 static int __init setup_relax_domain_level(char *str)
7105 { 7104 {
7106 unsigned long val; 7105 unsigned long val;
7107 7106
7108 val = simple_strtoul(str, NULL, 0); 7107 val = simple_strtoul(str, NULL, 0);
7109 if (val < sched_domain_level_max) 7108 if (val < sched_domain_level_max)
7110 default_relax_domain_level = val; 7109 default_relax_domain_level = val;
7111 7110
7112 return 1; 7111 return 1;
7113 } 7112 }
7114 __setup("relax_domain_level=", setup_relax_domain_level); 7113 __setup("relax_domain_level=", setup_relax_domain_level);
7115 7114
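Usage note (an assumption about how the hook above is exercised, not part of this diff): the __setup() registration makes the default settable from the kernel command line, e.g. booting with

	relax_domain_level=1

while per-domain overrides arrive through the sched_domain_attr path handled by set_domain_attribute() below.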
7116 static void set_domain_attribute(struct sched_domain *sd, 7115 static void set_domain_attribute(struct sched_domain *sd,
7117 struct sched_domain_attr *attr) 7116 struct sched_domain_attr *attr)
7118 { 7117 {
7119 int request; 7118 int request;
7120 7119
7121 if (!attr || attr->relax_domain_level < 0) { 7120 if (!attr || attr->relax_domain_level < 0) {
7122 if (default_relax_domain_level < 0) 7121 if (default_relax_domain_level < 0)
7123 return; 7122 return;
7124 else 7123 else
7125 request = default_relax_domain_level; 7124 request = default_relax_domain_level;
7126 } else 7125 } else
7127 request = attr->relax_domain_level; 7126 request = attr->relax_domain_level;
7128 if (request < sd->level) { 7127 if (request < sd->level) {
7129 /* turn off idle balance on this domain */ 7128 /* turn off idle balance on this domain */
7130 sd->flags &= ~(SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE); 7129 sd->flags &= ~(SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE);
7131 } else { 7130 } else {
7132 /* turn on idle balance on this domain */ 7131 /* turn on idle balance on this domain */
7133 sd->flags |= (SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE); 7132 sd->flags |= (SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE);
7134 } 7133 }
7135 } 7134 }
7136 7135
7137 static void __sdt_free(const struct cpumask *cpu_map); 7136 static void __sdt_free(const struct cpumask *cpu_map);
7138 static int __sdt_alloc(const struct cpumask *cpu_map); 7137 static int __sdt_alloc(const struct cpumask *cpu_map);
7139 7138
7140 static void __free_domain_allocs(struct s_data *d, enum s_alloc what, 7139 static void __free_domain_allocs(struct s_data *d, enum s_alloc what,
7141 const struct cpumask *cpu_map) 7140 const struct cpumask *cpu_map)
7142 { 7141 {
7143 switch (what) { 7142 switch (what) {
7144 case sa_rootdomain: 7143 case sa_rootdomain:
7145 if (!atomic_read(&d->rd->refcount)) 7144 if (!atomic_read(&d->rd->refcount))
7146 free_rootdomain(&d->rd->rcu); /* fall through */ 7145 free_rootdomain(&d->rd->rcu); /* fall through */
7147 case sa_sd: 7146 case sa_sd:
7148 free_percpu(d->sd); /* fall through */ 7147 free_percpu(d->sd); /* fall through */
7149 case sa_sd_storage: 7148 case sa_sd_storage:
7150 __sdt_free(cpu_map); /* fall through */ 7149 __sdt_free(cpu_map); /* fall through */
7151 case sa_none: 7150 case sa_none:
7152 break; 7151 break;
7153 } 7152 }
7154 } 7153 }
7155 7154
7156 static enum s_alloc __visit_domain_allocation_hell(struct s_data *d, 7155 static enum s_alloc __visit_domain_allocation_hell(struct s_data *d,
7157 const struct cpumask *cpu_map) 7156 const struct cpumask *cpu_map)
7158 { 7157 {
7159 memset(d, 0, sizeof(*d)); 7158 memset(d, 0, sizeof(*d));
7160 7159
7161 if (__sdt_alloc(cpu_map)) 7160 if (__sdt_alloc(cpu_map))
7162 return sa_sd_storage; 7161 return sa_sd_storage;
7163 d->sd = alloc_percpu(struct sched_domain *); 7162 d->sd = alloc_percpu(struct sched_domain *);
7164 if (!d->sd) 7163 if (!d->sd)
7165 return sa_sd_storage; 7164 return sa_sd_storage;
7166 d->rd = alloc_rootdomain(); 7165 d->rd = alloc_rootdomain();
7167 if (!d->rd) 7166 if (!d->rd)
7168 return sa_sd; 7167 return sa_sd;
7169 return sa_rootdomain; 7168 return sa_rootdomain;
7170 } 7169 }
7171 7170
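The pair above uses a staged-allocation idiom: __visit_domain_allocation_hell() reports how far the allocations got, and __free_domain_allocs() switches on that stage with deliberate fall-through so teardown releases exactly what was set up, in reverse order. A minimal standalone sketch of the same idiom (hypothetical names, plain C, not kernel code):

	#include <stdlib.h>

	enum stage { st_none, st_a, st_b, st_all };

	struct ctx { void *a, *b; };

	static enum stage ctx_setup(struct ctx *c)
	{
		c->a = malloc(16);
		if (!c->a)
			return st_none;
		c->b = malloc(16);
		if (!c->b)
			return st_a;
		return st_all;
	}

	static void ctx_undo(struct ctx *c, enum stage reached)
	{
		switch (reached) {
		case st_all:
		case st_b:
			free(c->b);	/* fall through */
		case st_a:
			free(c->a);	/* fall through */
		case st_none:
			break;
		}
	}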
7172 /* 7171 /*
7173 * NULL the sd_data elements we've used to build the sched_domain and 7172 * NULL the sd_data elements we've used to build the sched_domain and
7174 * sched_group structure so that the subsequent __free_domain_allocs() 7173 * sched_group structure so that the subsequent __free_domain_allocs()
7175 * will not free the data we're using. 7174 * will not free the data we're using.
7176 */ 7175 */
7177 static void claim_allocations(int cpu, struct sched_domain *sd) 7176 static void claim_allocations(int cpu, struct sched_domain *sd)
7178 { 7177 {
7179 struct sd_data *sdd = sd->private; 7178 struct sd_data *sdd = sd->private;
7180 struct sched_group *sg = sd->groups; 7179 struct sched_group *sg = sd->groups;
7181 7180
7182 WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd); 7181 WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd);
7183 *per_cpu_ptr(sdd->sd, cpu) = NULL; 7182 *per_cpu_ptr(sdd->sd, cpu) = NULL;
7184 7183
7185 if (cpu == cpumask_first(sched_group_cpus(sg))) { 7184 if (cpu == cpumask_first(sched_group_cpus(sg))) {
7186 WARN_ON_ONCE(*per_cpu_ptr(sdd->sg, cpu) != sg); 7185 WARN_ON_ONCE(*per_cpu_ptr(sdd->sg, cpu) != sg);
7187 *per_cpu_ptr(sdd->sg, cpu) = NULL; 7186 *per_cpu_ptr(sdd->sg, cpu) = NULL;
7188 } 7187 }
7189 } 7188 }
7190 7189
7191 #ifdef CONFIG_SCHED_SMT 7190 #ifdef CONFIG_SCHED_SMT
7192 static const struct cpumask *cpu_smt_mask(int cpu) 7191 static const struct cpumask *cpu_smt_mask(int cpu)
7193 { 7192 {
7194 return topology_thread_cpumask(cpu); 7193 return topology_thread_cpumask(cpu);
7195 } 7194 }
7196 #endif 7195 #endif
7197 7196
7198 /* 7197 /*
7199 * Topology list, bottom-up. 7198 * Topology list, bottom-up.
7200 */ 7199 */
7201 static struct sched_domain_topology_level default_topology[] = { 7200 static struct sched_domain_topology_level default_topology[] = {
7202 #ifdef CONFIG_SCHED_SMT 7201 #ifdef CONFIG_SCHED_SMT
7203 { sd_init_SIBLING, cpu_smt_mask, }, 7202 { sd_init_SIBLING, cpu_smt_mask, },
7204 #endif 7203 #endif
7205 #ifdef CONFIG_SCHED_MC 7204 #ifdef CONFIG_SCHED_MC
7206 { sd_init_MC, cpu_coregroup_mask, }, 7205 { sd_init_MC, cpu_coregroup_mask, },
7207 #endif 7206 #endif
7208 #ifdef CONFIG_SCHED_BOOK 7207 #ifdef CONFIG_SCHED_BOOK
7209 { sd_init_BOOK, cpu_book_mask, }, 7208 { sd_init_BOOK, cpu_book_mask, },
7210 #endif 7209 #endif
7211 { sd_init_CPU, cpu_cpu_mask, }, 7210 { sd_init_CPU, cpu_cpu_mask, },
7212 #ifdef CONFIG_NUMA 7211 #ifdef CONFIG_NUMA
7213 { sd_init_NODE, cpu_node_mask, }, 7212 { sd_init_NODE, cpu_node_mask, },
7214 { sd_init_ALLNODES, cpu_allnodes_mask, }, 7213 { sd_init_ALLNODES, cpu_allnodes_mask, },
7215 #endif 7214 #endif
7216 { NULL, }, 7215 { NULL, },
7217 }; 7216 };
7218 7217
7219 static struct sched_domain_topology_level *sched_domain_topology = default_topology; 7218 static struct sched_domain_topology_level *sched_domain_topology = default_topology;
7220 7219
7221 static int __sdt_alloc(const struct cpumask *cpu_map) 7220 static int __sdt_alloc(const struct cpumask *cpu_map)
7222 { 7221 {
7223 struct sched_domain_topology_level *tl; 7222 struct sched_domain_topology_level *tl;
7224 int j; 7223 int j;
7225 7224
7226 for (tl = sched_domain_topology; tl->init; tl++) { 7225 for (tl = sched_domain_topology; tl->init; tl++) {
7227 struct sd_data *sdd = &tl->data; 7226 struct sd_data *sdd = &tl->data;
7228 7227
7229 sdd->sd = alloc_percpu(struct sched_domain *); 7228 sdd->sd = alloc_percpu(struct sched_domain *);
7230 if (!sdd->sd) 7229 if (!sdd->sd)
7231 return -ENOMEM; 7230 return -ENOMEM;
7232 7231
7233 sdd->sg = alloc_percpu(struct sched_group *); 7232 sdd->sg = alloc_percpu(struct sched_group *);
7234 if (!sdd->sg) 7233 if (!sdd->sg)
7235 return -ENOMEM; 7234 return -ENOMEM;
7236 7235
7237 for_each_cpu(j, cpu_map) { 7236 for_each_cpu(j, cpu_map) {
7238 struct sched_domain *sd; 7237 struct sched_domain *sd;
7239 struct sched_group *sg; 7238 struct sched_group *sg;
7240 7239
7241 sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(), 7240 sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(),
7242 GFP_KERNEL, cpu_to_node(j)); 7241 GFP_KERNEL, cpu_to_node(j));
7243 if (!sd) 7242 if (!sd)
7244 return -ENOMEM; 7243 return -ENOMEM;
7245 7244
7246 *per_cpu_ptr(sdd->sd, j) = sd; 7245 *per_cpu_ptr(sdd->sd, j) = sd;
7247 7246
7248 sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(), 7247 sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
7249 GFP_KERNEL, cpu_to_node(j)); 7248 GFP_KERNEL, cpu_to_node(j));
7250 if (!sg) 7249 if (!sg)
7251 return -ENOMEM; 7250 return -ENOMEM;
7252 7251
7253 *per_cpu_ptr(sdd->sg, j) = sg; 7252 *per_cpu_ptr(sdd->sg, j) = sg;
7254 } 7253 }
7255 } 7254 }
7256 7255
7257 return 0; 7256 return 0;
7258 } 7257 }
7259 7258
7260 static void __sdt_free(const struct cpumask *cpu_map) 7259 static void __sdt_free(const struct cpumask *cpu_map)
7261 { 7260 {
7262 struct sched_domain_topology_level *tl; 7261 struct sched_domain_topology_level *tl;
7263 int j; 7262 int j;
7264 7263
7265 for (tl = sched_domain_topology; tl->init; tl++) { 7264 for (tl = sched_domain_topology; tl->init; tl++) {
7266 struct sd_data *sdd = &tl->data; 7265 struct sd_data *sdd = &tl->data;
7267 7266
7268 for_each_cpu(j, cpu_map) { 7267 for_each_cpu(j, cpu_map) {
7269 kfree(*per_cpu_ptr(sdd->sd, j)); 7268 kfree(*per_cpu_ptr(sdd->sd, j));
7270 kfree(*per_cpu_ptr(sdd->sg, j)); 7269 kfree(*per_cpu_ptr(sdd->sg, j));
7271 } 7270 }
7272 free_percpu(sdd->sd); 7271 free_percpu(sdd->sd);
7273 free_percpu(sdd->sg); 7272 free_percpu(sdd->sg);
7274 } 7273 }
7275 } 7274 }
7276 7275
7277 struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl, 7276 struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
7278 struct s_data *d, const struct cpumask *cpu_map, 7277 struct s_data *d, const struct cpumask *cpu_map,
7279 struct sched_domain_attr *attr, struct sched_domain *child, 7278 struct sched_domain_attr *attr, struct sched_domain *child,
7280 int cpu) 7279 int cpu)
7281 { 7280 {
7282 struct sched_domain *sd = tl->init(tl, cpu); 7281 struct sched_domain *sd = tl->init(tl, cpu);
7283 if (!sd) 7282 if (!sd)
7284 return child; 7283 return child;
7285 7284
7286 set_domain_attribute(sd, attr); 7285 set_domain_attribute(sd, attr);
7287 cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu)); 7286 cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu));
7288 if (child) { 7287 if (child) {
7289 sd->level = child->level + 1; 7288 sd->level = child->level + 1;
7290 sched_domain_level_max = max(sched_domain_level_max, sd->level); 7289 sched_domain_level_max = max(sched_domain_level_max, sd->level);
7291 child->parent = sd; 7290 child->parent = sd;
7292 } 7291 }
7293 sd->child = child; 7292 sd->child = child;
7294 7293
7295 return sd; 7294 return sd;
7296 } 7295 }
7297 7296
7298 /* 7297 /*
7299 * Build sched domains for a given set of cpus and attach the sched domains 7298 * Build sched domains for a given set of cpus and attach the sched domains
7300 * to the individual cpus 7299 * to the individual cpus
7301 */ 7300 */
7302 static int build_sched_domains(const struct cpumask *cpu_map, 7301 static int build_sched_domains(const struct cpumask *cpu_map,
7303 struct sched_domain_attr *attr) 7302 struct sched_domain_attr *attr)
7304 { 7303 {
7305 enum s_alloc alloc_state = sa_none; 7304 enum s_alloc alloc_state = sa_none;
7306 struct sched_domain *sd; 7305 struct sched_domain *sd;
7307 struct s_data d; 7306 struct s_data d;
7308 int i, ret = -ENOMEM; 7307 int i, ret = -ENOMEM;
7309 7308
7310 alloc_state = __visit_domain_allocation_hell(&d, cpu_map); 7309 alloc_state = __visit_domain_allocation_hell(&d, cpu_map);
7311 if (alloc_state != sa_rootdomain) 7310 if (alloc_state != sa_rootdomain)
7312 goto error; 7311 goto error;
7313 7312
7314 /* Set up domains for cpus specified by the cpu_map. */ 7313 /* Set up domains for cpus specified by the cpu_map. */
7315 for_each_cpu(i, cpu_map) { 7314 for_each_cpu(i, cpu_map) {
7316 struct sched_domain_topology_level *tl; 7315 struct sched_domain_topology_level *tl;
7317 7316
7318 sd = NULL; 7317 sd = NULL;
7319 for (tl = sched_domain_topology; tl->init; tl++) 7318 for (tl = sched_domain_topology; tl->init; tl++)
7320 sd = build_sched_domain(tl, &d, cpu_map, attr, sd, i); 7319 sd = build_sched_domain(tl, &d, cpu_map, attr, sd, i);
7321 7320
7322 while (sd->child) 7321 while (sd->child)
7323 sd = sd->child; 7322 sd = sd->child;
7324 7323
7325 *per_cpu_ptr(d.sd, i) = sd; 7324 *per_cpu_ptr(d.sd, i) = sd;
7326 } 7325 }
7327 7326
7328 /* Build the groups for the domains */ 7327 /* Build the groups for the domains */
7329 for_each_cpu(i, cpu_map) { 7328 for_each_cpu(i, cpu_map) {
7330 for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) { 7329 for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
7331 sd->span_weight = cpumask_weight(sched_domain_span(sd)); 7330 sd->span_weight = cpumask_weight(sched_domain_span(sd));
7332 get_group(i, sd->private, &sd->groups); 7331 get_group(i, sd->private, &sd->groups);
7333 atomic_inc(&sd->groups->ref); 7332 atomic_inc(&sd->groups->ref);
7334 7333
7335 if (i != cpumask_first(sched_domain_span(sd))) 7334 if (i != cpumask_first(sched_domain_span(sd)))
7336 continue; 7335 continue;
7337 7336
7338 build_sched_groups(sd); 7337 build_sched_groups(sd);
7339 } 7338 }
7340 } 7339 }
7341 7340
7342 /* Calculate CPU power for physical packages and nodes */ 7341 /* Calculate CPU power for physical packages and nodes */
7343 for (i = nr_cpumask_bits-1; i >= 0; i--) { 7342 for (i = nr_cpumask_bits-1; i >= 0; i--) {
7344 if (!cpumask_test_cpu(i, cpu_map)) 7343 if (!cpumask_test_cpu(i, cpu_map))
7345 continue; 7344 continue;
7346 7345
7347 for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) { 7346 for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
7348 claim_allocations(i, sd); 7347 claim_allocations(i, sd);
7349 init_sched_groups_power(i, sd); 7348 init_sched_groups_power(i, sd);
7350 } 7349 }
7351 } 7350 }
7352 7351
7353 /* Attach the domains */ 7352 /* Attach the domains */
7354 rcu_read_lock(); 7353 rcu_read_lock();
7355 for_each_cpu(i, cpu_map) { 7354 for_each_cpu(i, cpu_map) {
7356 sd = *per_cpu_ptr(d.sd, i); 7355 sd = *per_cpu_ptr(d.sd, i);
7357 cpu_attach_domain(sd, d.rd, i); 7356 cpu_attach_domain(sd, d.rd, i);
7358 } 7357 }
7359 rcu_read_unlock(); 7358 rcu_read_unlock();
7360 7359
7361 ret = 0; 7360 ret = 0;
7362 error: 7361 error:
7363 __free_domain_allocs(&d, alloc_state, cpu_map); 7362 __free_domain_allocs(&d, alloc_state, cpu_map);
7364 return ret; 7363 return ret;
7365 } 7364 }
7366 7365
7367 static cpumask_var_t *doms_cur; /* current sched domains */ 7366 static cpumask_var_t *doms_cur; /* current sched domains */
7368 static int ndoms_cur; /* number of sched domains in 'doms_cur' */ 7367 static int ndoms_cur; /* number of sched domains in 'doms_cur' */
7369 static struct sched_domain_attr *dattr_cur; 7368 static struct sched_domain_attr *dattr_cur;
7370 /* attributes of custom domains in 'doms_cur' */ 7369 /* attributes of custom domains in 'doms_cur' */
7371 7370
7372 /* 7371 /*
7373 * Special case: If a kmalloc of a doms_cur partition (array of 7372 * Special case: If a kmalloc of a doms_cur partition (array of
7374 * cpumask) fails, then fallback to a single sched domain, 7373 * cpumask) fails, then fallback to a single sched domain,
7375 * as determined by the single cpumask fallback_doms. 7374 * as determined by the single cpumask fallback_doms.
7376 */ 7375 */
7377 static cpumask_var_t fallback_doms; 7376 static cpumask_var_t fallback_doms;
7378 7377
7379 /* 7378 /*
7380 * arch_update_cpu_topology lets virtualized architectures update the 7379 * arch_update_cpu_topology lets virtualized architectures update the
7381 * cpu core maps. It is supposed to return 1 if the topology changed 7380 * cpu core maps. It is supposed to return 1 if the topology changed
7382 * or 0 if it stayed the same. 7381 * or 0 if it stayed the same.
7383 */ 7382 */
7384 int __attribute__((weak)) arch_update_cpu_topology(void) 7383 int __attribute__((weak)) arch_update_cpu_topology(void)
7385 { 7384 {
7386 return 0; 7385 return 0;
7387 } 7386 }
7388 7387
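Since arch_update_cpu_topology() above is declared __attribute__((weak)), an architecture (or a virtualization layer) overrides it simply by providing a strong definition; a hedged sketch of such an override, with topology_changed and rebuild_core_maps() standing in for whatever state and helpers the architecture actually has:

	/* hypothetical arch override of the weak default above */
	int arch_update_cpu_topology(void)
	{
		int changed = topology_changed;	/* assumed arch-private flag */

		if (changed)
			rebuild_core_maps();	/* hypothetical helper */
		topology_changed = 0;
		return changed;			/* 1 if the cpu core maps changed */
	}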
7389 cpumask_var_t *alloc_sched_domains(unsigned int ndoms) 7388 cpumask_var_t *alloc_sched_domains(unsigned int ndoms)
7390 { 7389 {
7391 int i; 7390 int i;
7392 cpumask_var_t *doms; 7391 cpumask_var_t *doms;
7393 7392
7394 doms = kmalloc(sizeof(*doms) * ndoms, GFP_KERNEL); 7393 doms = kmalloc(sizeof(*doms) * ndoms, GFP_KERNEL);
7395 if (!doms) 7394 if (!doms)
7396 return NULL; 7395 return NULL;
7397 for (i = 0; i < ndoms; i++) { 7396 for (i = 0; i < ndoms; i++) {
7398 if (!alloc_cpumask_var(&doms[i], GFP_KERNEL)) { 7397 if (!alloc_cpumask_var(&doms[i], GFP_KERNEL)) {
7399 free_sched_domains(doms, i); 7398 free_sched_domains(doms, i);
7400 return NULL; 7399 return NULL;
7401 } 7400 }
7402 } 7401 }
7403 return doms; 7402 return doms;
7404 } 7403 }
7405 7404
7406 void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms) 7405 void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms)
7407 { 7406 {
7408 unsigned int i; 7407 unsigned int i;
7409 for (i = 0; i < ndoms; i++) 7408 for (i = 0; i < ndoms; i++)
7410 free_cpumask_var(doms[i]); 7409 free_cpumask_var(doms[i]);
7411 kfree(doms); 7410 kfree(doms);
7412 } 7411 }
7413 7412
7414 /* 7413 /*
7415 * Set up scheduler domains and groups. Callers must hold the hotplug lock. 7414 * Set up scheduler domains and groups. Callers must hold the hotplug lock.
7416 * For now this just excludes isolated cpus, but could be used to 7415 * For now this just excludes isolated cpus, but could be used to
7417 * exclude other special cases in the future. 7416 * exclude other special cases in the future.
7418 */ 7417 */
7419 static int init_sched_domains(const struct cpumask *cpu_map) 7418 static int init_sched_domains(const struct cpumask *cpu_map)
7420 { 7419 {
7421 int err; 7420 int err;
7422 7421
7423 arch_update_cpu_topology(); 7422 arch_update_cpu_topology();
7424 ndoms_cur = 1; 7423 ndoms_cur = 1;
7425 doms_cur = alloc_sched_domains(ndoms_cur); 7424 doms_cur = alloc_sched_domains(ndoms_cur);
7426 if (!doms_cur) 7425 if (!doms_cur)
7427 doms_cur = &fallback_doms; 7426 doms_cur = &fallback_doms;
7428 cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map); 7427 cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map);
7429 dattr_cur = NULL; 7428 dattr_cur = NULL;
7430 err = build_sched_domains(doms_cur[0], NULL); 7429 err = build_sched_domains(doms_cur[0], NULL);
7431 register_sched_domain_sysctl(); 7430 register_sched_domain_sysctl();
7432 7431
7433 return err; 7432 return err;
7434 } 7433 }
7435 7434
7436 /* 7435 /*
7437 * Detach sched domains from a group of cpus specified in cpu_map 7436 * Detach sched domains from a group of cpus specified in cpu_map
7438 * These cpus will now be attached to the NULL domain 7437 * These cpus will now be attached to the NULL domain
7439 */ 7438 */
7440 static void detach_destroy_domains(const struct cpumask *cpu_map) 7439 static void detach_destroy_domains(const struct cpumask *cpu_map)
7441 { 7440 {
7442 int i; 7441 int i;
7443 7442
7444 rcu_read_lock(); 7443 rcu_read_lock();
7445 for_each_cpu(i, cpu_map) 7444 for_each_cpu(i, cpu_map)
7446 cpu_attach_domain(NULL, &def_root_domain, i); 7445 cpu_attach_domain(NULL, &def_root_domain, i);
7447 rcu_read_unlock(); 7446 rcu_read_unlock();
7448 } 7447 }
7449 7448
7450 /* handle null as "default" */ 7449 /* handle null as "default" */
7451 static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur, 7450 static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,
7452 struct sched_domain_attr *new, int idx_new) 7451 struct sched_domain_attr *new, int idx_new)
7453 { 7452 {
7454 struct sched_domain_attr tmp; 7453 struct sched_domain_attr tmp;
7455 7454
7456 /* fast path */ 7455 /* fast path */
7457 if (!new && !cur) 7456 if (!new && !cur)
7458 return 1; 7457 return 1;
7459 7458
7460 tmp = SD_ATTR_INIT; 7459 tmp = SD_ATTR_INIT;
7461 return !memcmp(cur ? (cur + idx_cur) : &tmp, 7460 return !memcmp(cur ? (cur + idx_cur) : &tmp,
7462 new ? (new + idx_new) : &tmp, 7461 new ? (new + idx_new) : &tmp,
7463 sizeof(struct sched_domain_attr)); 7462 sizeof(struct sched_domain_attr));
7464 } 7463 }
7465 7464
7466 /* 7465 /*
7467 * Partition sched domains as specified by the 'ndoms_new' 7466 * Partition sched domains as specified by the 'ndoms_new'
7468 * cpumasks in the array doms_new[]. This compares 7467 * cpumasks in the array doms_new[]. This compares
7469 * doms_new[] to the current sched domain partitioning, doms_cur[]. 7468 * doms_new[] to the current sched domain partitioning, doms_cur[].
7470 * It destroys each deleted domain and builds each new domain. 7469 * It destroys each deleted domain and builds each new domain.
7471 * 7470 *
7472 * 'doms_new' is an array of cpumask_var_t's of length 'ndoms_new'. 7471 * 'doms_new' is an array of cpumask_var_t's of length 'ndoms_new'.
7473 * The masks don't intersect (don't overlap). We should set up one 7472 * The masks don't intersect (don't overlap). We should set up one
7474 * sched domain for each mask. CPUs not in any of the cpumasks will 7473 * sched domain for each mask. CPUs not in any of the cpumasks will
7475 * not be load balanced. If the same cpumask appears both in the 7474 * not be load balanced. If the same cpumask appears both in the
7476 * current 'doms_cur' domains and in the new 'doms_new', we can leave 7475 * current 'doms_cur' domains and in the new 'doms_new', we can leave
7477 * it as it is. 7476 * it as it is.
7478 * 7477 *
7479 * The passed in 'doms_new' should be allocated using 7478 * The passed in 'doms_new' should be allocated using
7480 * alloc_sched_domains. This routine takes ownership of it and will 7479 * alloc_sched_domains. This routine takes ownership of it and will
7481 * free_sched_domains it when done with it. If the caller failed the 7480 * free_sched_domains it when done with it. If the caller failed the
7482 * alloc call, then it can pass in doms_new == NULL && ndoms_new == 1, 7481 * alloc call, then it can pass in doms_new == NULL && ndoms_new == 1,
7483 * and partition_sched_domains() will fall back to the single partition 7482 * and partition_sched_domains() will fall back to the single partition
7484 * 'fallback_doms'; this also forces the domains to be rebuilt. 7483 * 'fallback_doms'; this also forces the domains to be rebuilt.
7485 * 7484 *
7486 * If doms_new == NULL it will be replaced with cpu_online_mask. 7485 * If doms_new == NULL it will be replaced with cpu_online_mask.
7487 * ndoms_new == 0 is a special case for destroying existing domains, 7486 * ndoms_new == 0 is a special case for destroying existing domains,
7488 * and it will not create the default domain. 7487 * and it will not create the default domain.
7489 * 7488 *
7490 * Call with hotplug lock held 7489 * Call with hotplug lock held
7491 */ 7490 */
7492 void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[], 7491 void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
7493 struct sched_domain_attr *dattr_new) 7492 struct sched_domain_attr *dattr_new)
7494 { 7493 {
7495 int i, j, n; 7494 int i, j, n;
7496 int new_topology; 7495 int new_topology;
7497 7496
7498 mutex_lock(&sched_domains_mutex); 7497 mutex_lock(&sched_domains_mutex);
7499 7498
7500 /* always unregister in case we don't destroy any domains */ 7499 /* always unregister in case we don't destroy any domains */
7501 unregister_sched_domain_sysctl(); 7500 unregister_sched_domain_sysctl();
7502 7501
7503 /* Let architecture update cpu core mappings. */ 7502 /* Let architecture update cpu core mappings. */
7504 new_topology = arch_update_cpu_topology(); 7503 new_topology = arch_update_cpu_topology();
7505 7504
7506 n = doms_new ? ndoms_new : 0; 7505 n = doms_new ? ndoms_new : 0;
7507 7506
7508 /* Destroy deleted domains */ 7507 /* Destroy deleted domains */
7509 for (i = 0; i < ndoms_cur; i++) { 7508 for (i = 0; i < ndoms_cur; i++) {
7510 for (j = 0; j < n && !new_topology; j++) { 7509 for (j = 0; j < n && !new_topology; j++) {
7511 if (cpumask_equal(doms_cur[i], doms_new[j]) 7510 if (cpumask_equal(doms_cur[i], doms_new[j])
7512 && dattrs_equal(dattr_cur, i, dattr_new, j)) 7511 && dattrs_equal(dattr_cur, i, dattr_new, j))
7513 goto match1; 7512 goto match1;
7514 } 7513 }
7515 /* no match - a current sched domain not in new doms_new[] */ 7514 /* no match - a current sched domain not in new doms_new[] */
7516 detach_destroy_domains(doms_cur[i]); 7515 detach_destroy_domains(doms_cur[i]);
7517 match1: 7516 match1:
7518 ; 7517 ;
7519 } 7518 }
7520 7519
7521 if (doms_new == NULL) { 7520 if (doms_new == NULL) {
7522 ndoms_cur = 0; 7521 ndoms_cur = 0;
7523 doms_new = &fallback_doms; 7522 doms_new = &fallback_doms;
7524 cpumask_andnot(doms_new[0], cpu_active_mask, cpu_isolated_map); 7523 cpumask_andnot(doms_new[0], cpu_active_mask, cpu_isolated_map);
7525 WARN_ON_ONCE(dattr_new); 7524 WARN_ON_ONCE(dattr_new);
7526 } 7525 }
7527 7526
7528 /* Build new domains */ 7527 /* Build new domains */
7529 for (i = 0; i < ndoms_new; i++) { 7528 for (i = 0; i < ndoms_new; i++) {
7530 for (j = 0; j < ndoms_cur && !new_topology; j++) { 7529 for (j = 0; j < ndoms_cur && !new_topology; j++) {
7531 if (cpumask_equal(doms_new[i], doms_cur[j]) 7530 if (cpumask_equal(doms_new[i], doms_cur[j])
7532 && dattrs_equal(dattr_new, i, dattr_cur, j)) 7531 && dattrs_equal(dattr_new, i, dattr_cur, j))
7533 goto match2; 7532 goto match2;
7534 } 7533 }
7535 /* no match - add a new doms_new */ 7534 /* no match - add a new doms_new */
7536 build_sched_domains(doms_new[i], dattr_new ? dattr_new + i : NULL); 7535 build_sched_domains(doms_new[i], dattr_new ? dattr_new + i : NULL);
7537 match2: 7536 match2:
7538 ; 7537 ;
7539 } 7538 }
7540 7539
7541 /* Remember the new sched domains */ 7540 /* Remember the new sched domains */
7542 if (doms_cur != &fallback_doms) 7541 if (doms_cur != &fallback_doms)
7543 free_sched_domains(doms_cur, ndoms_cur); 7542 free_sched_domains(doms_cur, ndoms_cur);
7544 kfree(dattr_cur); /* kfree(NULL) is safe */ 7543 kfree(dattr_cur); /* kfree(NULL) is safe */
7545 doms_cur = doms_new; 7544 doms_cur = doms_new;
7546 dattr_cur = dattr_new; 7545 dattr_cur = dattr_new;
7547 ndoms_cur = ndoms_new; 7546 ndoms_cur = ndoms_new;
7548 7547
7549 register_sched_domain_sysctl(); 7548 register_sched_domain_sysctl();
7550 7549
7551 mutex_unlock(&sched_domains_mutex); 7550 mutex_unlock(&sched_domains_mutex);
7552 } 7551 }
7553 7552
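To make the ownership rules in the block comment above concrete, here is a hedged sketch of a caller (hypothetical, loosely modelled on how cpusets drive this interface): allocate the array with alloc_sched_domains(), fill in the masks, and hand it over; partition_sched_domains() frees the previous generation itself, and NULL/1 is the documented fallback when the allocation fails.

	/* sketch: rebuild the domains as two disjoint partitions */
	static void rebuild_two_partitions(const struct cpumask *set_a,
					   const struct cpumask *set_b)
	{
		cpumask_var_t *doms = alloc_sched_domains(2);

		get_online_cpus();		/* "Call with hotplug lock held" */
		if (!doms) {
			/* fall back to the single 'fallback_doms' partition */
			partition_sched_domains(1, NULL, NULL);
		} else {
			cpumask_copy(doms[0], set_a);
			cpumask_copy(doms[1], set_b);
			/* ownership of 'doms' passes to the scheduler here */
			partition_sched_domains(2, doms, NULL);
		}
		put_online_cpus();
	}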
7554 #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) 7553 #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
7555 static void reinit_sched_domains(void) 7554 static void reinit_sched_domains(void)
7556 { 7555 {
7557 get_online_cpus(); 7556 get_online_cpus();
7558 7557
7559 /* Destroy domains first to force the rebuild */ 7558 /* Destroy domains first to force the rebuild */
7560 partition_sched_domains(0, NULL, NULL); 7559 partition_sched_domains(0, NULL, NULL);
7561 7560
7562 rebuild_sched_domains(); 7561 rebuild_sched_domains();
7563 put_online_cpus(); 7562 put_online_cpus();
7564 } 7563 }
7565 7564
7566 static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt) 7565 static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt)
7567 { 7566 {
7568 unsigned int level = 0; 7567 unsigned int level = 0;
7569 7568
7570 if (sscanf(buf, "%u", &level) != 1) 7569 if (sscanf(buf, "%u", &level) != 1)
7571 return -EINVAL; 7570 return -EINVAL;
7572 7571
7573 /* 7572 /*
7574 * level is always positive, so don't check for 7573 * level is always positive, so don't check for
7575 * level < POWERSAVINGS_BALANCE_NONE, which is 0. 7574 * level < POWERSAVINGS_BALANCE_NONE, which is 0.
7576 * What happens on a 0 or 1 byte write? 7575 * What happens on a 0 or 1 byte write?
7577 * Do we need to check count as well? 7576 * Do we need to check count as well?
7578 */ 7577 */
7579 7578
7580 if (level >= MAX_POWERSAVINGS_BALANCE_LEVELS) 7579 if (level >= MAX_POWERSAVINGS_BALANCE_LEVELS)
7581 return -EINVAL; 7580 return -EINVAL;
7582 7581
7583 if (smt) 7582 if (smt)
7584 sched_smt_power_savings = level; 7583 sched_smt_power_savings = level;
7585 else 7584 else
7586 sched_mc_power_savings = level; 7585 sched_mc_power_savings = level;
7587 7586
7588 reinit_sched_domains(); 7587 reinit_sched_domains();
7589 7588
7590 return count; 7589 return count;
7591 } 7590 }
7592 7591
7593 #ifdef CONFIG_SCHED_MC 7592 #ifdef CONFIG_SCHED_MC
7594 static ssize_t sched_mc_power_savings_show(struct sysdev_class *class, 7593 static ssize_t sched_mc_power_savings_show(struct sysdev_class *class,
7595 struct sysdev_class_attribute *attr, 7594 struct sysdev_class_attribute *attr,
7596 char *page) 7595 char *page)
7597 { 7596 {
7598 return sprintf(page, "%u\n", sched_mc_power_savings); 7597 return sprintf(page, "%u\n", sched_mc_power_savings);
7599 } 7598 }
7600 static ssize_t sched_mc_power_savings_store(struct sysdev_class *class, 7599 static ssize_t sched_mc_power_savings_store(struct sysdev_class *class,
7601 struct sysdev_class_attribute *attr, 7600 struct sysdev_class_attribute *attr,
7602 const char *buf, size_t count) 7601 const char *buf, size_t count)
7603 { 7602 {
7604 return sched_power_savings_store(buf, count, 0); 7603 return sched_power_savings_store(buf, count, 0);
7605 } 7604 }
7606 static SYSDEV_CLASS_ATTR(sched_mc_power_savings, 0644, 7605 static SYSDEV_CLASS_ATTR(sched_mc_power_savings, 0644,
7607 sched_mc_power_savings_show, 7606 sched_mc_power_savings_show,
7608 sched_mc_power_savings_store); 7607 sched_mc_power_savings_store);
7609 #endif 7608 #endif
7610 7609
7611 #ifdef CONFIG_SCHED_SMT 7610 #ifdef CONFIG_SCHED_SMT
7612 static ssize_t sched_smt_power_savings_show(struct sysdev_class *dev, 7611 static ssize_t sched_smt_power_savings_show(struct sysdev_class *dev,
7613 struct sysdev_class_attribute *attr, 7612 struct sysdev_class_attribute *attr,
7614 char *page) 7613 char *page)
7615 { 7614 {
7616 return sprintf(page, "%u\n", sched_smt_power_savings); 7615 return sprintf(page, "%u\n", sched_smt_power_savings);
7617 } 7616 }
7618 static ssize_t sched_smt_power_savings_store(struct sysdev_class *dev, 7617 static ssize_t sched_smt_power_savings_store(struct sysdev_class *dev,
7619 struct sysdev_class_attribute *attr, 7618 struct sysdev_class_attribute *attr,
7620 const char *buf, size_t count) 7619 const char *buf, size_t count)
7621 { 7620 {
7622 return sched_power_savings_store(buf, count, 1); 7621 return sched_power_savings_store(buf, count, 1);
7623 } 7622 }
7624 static SYSDEV_CLASS_ATTR(sched_smt_power_savings, 0644, 7623 static SYSDEV_CLASS_ATTR(sched_smt_power_savings, 0644,
7625 sched_smt_power_savings_show, 7624 sched_smt_power_savings_show,
7626 sched_smt_power_savings_store); 7625 sched_smt_power_savings_store);
7627 #endif 7626 #endif
7628 7627
7629 int __init sched_create_sysfs_power_savings_entries(struct sysdev_class *cls) 7628 int __init sched_create_sysfs_power_savings_entries(struct sysdev_class *cls)
7630 { 7629 {
7631 int err = 0; 7630 int err = 0;
7632 7631
7633 #ifdef CONFIG_SCHED_SMT 7632 #ifdef CONFIG_SCHED_SMT
7634 if (smt_capable()) 7633 if (smt_capable())
7635 err = sysfs_create_file(&cls->kset.kobj, 7634 err = sysfs_create_file(&cls->kset.kobj,
7636 &attr_sched_smt_power_savings.attr); 7635 &attr_sched_smt_power_savings.attr);
7637 #endif 7636 #endif
7638 #ifdef CONFIG_SCHED_MC 7637 #ifdef CONFIG_SCHED_MC
7639 if (!err && mc_capable()) 7638 if (!err && mc_capable())
7640 err = sysfs_create_file(&cls->kset.kobj, 7639 err = sysfs_create_file(&cls->kset.kobj,
7641 &attr_sched_mc_power_savings.attr); 7640 &attr_sched_mc_power_savings.attr);
7642 #endif 7641 #endif
7643 return err; 7642 return err;
7644 } 7643 }
7645 #endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */ 7644 #endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
7646 7645
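Usage note (an assumption about the resulting sysfs layout, not something this diff changes): the SYSDEV_CLASS_ATTR() definitions above surface the two knobs as files of the cpu sysdev class, so on kernels of this vintage a value in the range 0..MAX_POWERSAVINGS_BALANCE_LEVELS-1 can typically be read or written from userspace via paths such as /sys/devices/system/cpu/sched_mc_power_savings and /sys/devices/system/cpu/sched_smt_power_savings; each write lands in sched_power_savings_store() and triggers reinit_sched_domains().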
7647 /* 7646 /*
7648 * Update cpusets according to cpu_active mask. If cpusets are 7647 * Update cpusets according to cpu_active mask. If cpusets are
7649 * disabled, cpuset_update_active_cpus() becomes a simple wrapper 7648 * disabled, cpuset_update_active_cpus() becomes a simple wrapper
7650 * around partition_sched_domains(). 7649 * around partition_sched_domains().
7651 */ 7650 */
7652 static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action, 7651 static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action,
7653 void *hcpu) 7652 void *hcpu)
7654 { 7653 {
7655 switch (action & ~CPU_TASKS_FROZEN) { 7654 switch (action & ~CPU_TASKS_FROZEN) {
7656 case CPU_ONLINE: 7655 case CPU_ONLINE:
7657 case CPU_DOWN_FAILED: 7656 case CPU_DOWN_FAILED:
7658 cpuset_update_active_cpus(); 7657 cpuset_update_active_cpus();
7659 return NOTIFY_OK; 7658 return NOTIFY_OK;
7660 default: 7659 default:
7661 return NOTIFY_DONE; 7660 return NOTIFY_DONE;
7662 } 7661 }
7663 } 7662 }
7664 7663
7665 static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action, 7664 static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action,
7666 void *hcpu) 7665 void *hcpu)
7667 { 7666 {
7668 switch (action & ~CPU_TASKS_FROZEN) { 7667 switch (action & ~CPU_TASKS_FROZEN) {
7669 case CPU_DOWN_PREPARE: 7668 case CPU_DOWN_PREPARE:
7670 cpuset_update_active_cpus(); 7669 cpuset_update_active_cpus();
7671 return NOTIFY_OK; 7670 return NOTIFY_OK;
7672 default: 7671 default:
7673 return NOTIFY_DONE; 7672 return NOTIFY_DONE;
7674 } 7673 }
7675 } 7674 }
7676 7675
7677 static int update_runtime(struct notifier_block *nfb, 7676 static int update_runtime(struct notifier_block *nfb,
7678 unsigned long action, void *hcpu) 7677 unsigned long action, void *hcpu)
7679 { 7678 {
7680 int cpu = (int)(long)hcpu; 7679 int cpu = (int)(long)hcpu;
7681 7680
7682 switch (action) { 7681 switch (action) {
7683 case CPU_DOWN_PREPARE: 7682 case CPU_DOWN_PREPARE:
7684 case CPU_DOWN_PREPARE_FROZEN: 7683 case CPU_DOWN_PREPARE_FROZEN:
7685 disable_runtime(cpu_rq(cpu)); 7684 disable_runtime(cpu_rq(cpu));
7686 return NOTIFY_OK; 7685 return NOTIFY_OK;
7687 7686
7688 case CPU_DOWN_FAILED: 7687 case CPU_DOWN_FAILED:
7689 case CPU_DOWN_FAILED_FROZEN: 7688 case CPU_DOWN_FAILED_FROZEN:
7690 case CPU_ONLINE: 7689 case CPU_ONLINE:
7691 case CPU_ONLINE_FROZEN: 7690 case CPU_ONLINE_FROZEN:
7692 enable_runtime(cpu_rq(cpu)); 7691 enable_runtime(cpu_rq(cpu));
7693 return NOTIFY_OK; 7692 return NOTIFY_OK;
7694 7693
7695 default: 7694 default:
7696 return NOTIFY_DONE; 7695 return NOTIFY_DONE;
7697 } 7696 }
7698 } 7697 }
7699 7698
7700 void __init sched_init_smp(void) 7699 void __init sched_init_smp(void)
7701 { 7700 {
7702 cpumask_var_t non_isolated_cpus; 7701 cpumask_var_t non_isolated_cpus;
7703 7702
7704 alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL); 7703 alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL);
7705 alloc_cpumask_var(&fallback_doms, GFP_KERNEL); 7704 alloc_cpumask_var(&fallback_doms, GFP_KERNEL);
7706 7705
7707 get_online_cpus(); 7706 get_online_cpus();
7708 mutex_lock(&sched_domains_mutex); 7707 mutex_lock(&sched_domains_mutex);
7709 init_sched_domains(cpu_active_mask); 7708 init_sched_domains(cpu_active_mask);
7710 cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map); 7709 cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map);
7711 if (cpumask_empty(non_isolated_cpus)) 7710 if (cpumask_empty(non_isolated_cpus))
7712 cpumask_set_cpu(smp_processor_id(), non_isolated_cpus); 7711 cpumask_set_cpu(smp_processor_id(), non_isolated_cpus);
7713 mutex_unlock(&sched_domains_mutex); 7712 mutex_unlock(&sched_domains_mutex);
7714 put_online_cpus(); 7713 put_online_cpus();
7715 7714
7716 hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE); 7715 hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE);
7717 hotcpu_notifier(cpuset_cpu_inactive, CPU_PRI_CPUSET_INACTIVE); 7716 hotcpu_notifier(cpuset_cpu_inactive, CPU_PRI_CPUSET_INACTIVE);
7718 7717
7719 /* RT runtime code needs to handle some hotplug events */ 7718 /* RT runtime code needs to handle some hotplug events */
7720 hotcpu_notifier(update_runtime, 0); 7719 hotcpu_notifier(update_runtime, 0);
7721 7720
7722 init_hrtick(); 7721 init_hrtick();
7723 7722
7724 /* Move init over to a non-isolated CPU */ 7723 /* Move init over to a non-isolated CPU */
7725 if (set_cpus_allowed_ptr(current, non_isolated_cpus) < 0) 7724 if (set_cpus_allowed_ptr(current, non_isolated_cpus) < 0)
7726 BUG(); 7725 BUG();
7727 sched_init_granularity(); 7726 sched_init_granularity();
7728 free_cpumask_var(non_isolated_cpus); 7727 free_cpumask_var(non_isolated_cpus);
7729 7728
7730 init_sched_rt_class(); 7729 init_sched_rt_class();
7731 } 7730 }
7732 #else 7731 #else
7733 void __init sched_init_smp(void) 7732 void __init sched_init_smp(void)
7734 { 7733 {
7735 sched_init_granularity(); 7734 sched_init_granularity();
7736 } 7735 }
7737 #endif /* CONFIG_SMP */ 7736 #endif /* CONFIG_SMP */
7738 7737
7739 const_debug unsigned int sysctl_timer_migration = 1; 7738 const_debug unsigned int sysctl_timer_migration = 1;
7740 7739
7741 int in_sched_functions(unsigned long addr) 7740 int in_sched_functions(unsigned long addr)
7742 { 7741 {
7743 return in_lock_functions(addr) || 7742 return in_lock_functions(addr) ||
7744 (addr >= (unsigned long)__sched_text_start 7743 (addr >= (unsigned long)__sched_text_start
7745 && addr < (unsigned long)__sched_text_end); 7744 && addr < (unsigned long)__sched_text_end);
7746 } 7745 }
7747 7746
7748 static void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq) 7747 static void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq)
7749 { 7748 {
7750 cfs_rq->tasks_timeline = RB_ROOT; 7749 cfs_rq->tasks_timeline = RB_ROOT;
7751 INIT_LIST_HEAD(&cfs_rq->tasks); 7750 INIT_LIST_HEAD(&cfs_rq->tasks);
7752 #ifdef CONFIG_FAIR_GROUP_SCHED 7751 #ifdef CONFIG_FAIR_GROUP_SCHED
7753 cfs_rq->rq = rq; 7752 cfs_rq->rq = rq;
7754 /* allow initial update_cfs_load() to truncate */ 7753 /* allow initial update_cfs_load() to truncate */
7755 #ifdef CONFIG_SMP 7754 #ifdef CONFIG_SMP
7756 cfs_rq->load_stamp = 1; 7755 cfs_rq->load_stamp = 1;
7757 #endif 7756 #endif
7758 #endif 7757 #endif
7759 cfs_rq->min_vruntime = (u64)(-(1LL << 20)); 7758 cfs_rq->min_vruntime = (u64)(-(1LL << 20));
7760 } 7759 }
7761 7760
7762 static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq) 7761 static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
7763 { 7762 {
7764 struct rt_prio_array *array; 7763 struct rt_prio_array *array;
7765 int i; 7764 int i;
7766 7765
7767 array = &rt_rq->active; 7766 array = &rt_rq->active;
7768 for (i = 0; i < MAX_RT_PRIO; i++) { 7767 for (i = 0; i < MAX_RT_PRIO; i++) {
7769 INIT_LIST_HEAD(array->queue + i); 7768 INIT_LIST_HEAD(array->queue + i);
7770 __clear_bit(i, array->bitmap); 7769 __clear_bit(i, array->bitmap);
7771 } 7770 }
7772 /* delimiter for bitsearch: */ 7771 /* delimiter for bitsearch: */
7773 __set_bit(MAX_RT_PRIO, array->bitmap); 7772 __set_bit(MAX_RT_PRIO, array->bitmap);
7774 7773
7775 #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED 7774 #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
7776 rt_rq->highest_prio.curr = MAX_RT_PRIO; 7775 rt_rq->highest_prio.curr = MAX_RT_PRIO;
7777 #ifdef CONFIG_SMP 7776 #ifdef CONFIG_SMP
7778 rt_rq->highest_prio.next = MAX_RT_PRIO; 7777 rt_rq->highest_prio.next = MAX_RT_PRIO;
7779 #endif 7778 #endif
7780 #endif 7779 #endif
7781 #ifdef CONFIG_SMP 7780 #ifdef CONFIG_SMP
7782 rt_rq->rt_nr_migratory = 0; 7781 rt_rq->rt_nr_migratory = 0;
7783 rt_rq->overloaded = 0; 7782 rt_rq->overloaded = 0;
7784 plist_head_init_raw(&rt_rq->pushable_tasks, &rq->lock); 7783 plist_head_init_raw(&rt_rq->pushable_tasks, &rq->lock);
7785 #endif 7784 #endif
7786 7785
7787 rt_rq->rt_time = 0; 7786 rt_rq->rt_time = 0;
7788 rt_rq->rt_throttled = 0; 7787 rt_rq->rt_throttled = 0;
7789 rt_rq->rt_runtime = 0; 7788 rt_rq->rt_runtime = 0;
7790 raw_spin_lock_init(&rt_rq->rt_runtime_lock); 7789 raw_spin_lock_init(&rt_rq->rt_runtime_lock);
7791 7790
7792 #ifdef CONFIG_RT_GROUP_SCHED 7791 #ifdef CONFIG_RT_GROUP_SCHED
7793 rt_rq->rt_nr_boosted = 0; 7792 rt_rq->rt_nr_boosted = 0;
7794 rt_rq->rq = rq; 7793 rt_rq->rq = rq;
7795 #endif 7794 #endif
7796 } 7795 }
7797 7796
7798 #ifdef CONFIG_FAIR_GROUP_SCHED 7797 #ifdef CONFIG_FAIR_GROUP_SCHED
7799 static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, 7798 static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
7800 struct sched_entity *se, int cpu, 7799 struct sched_entity *se, int cpu,
7801 struct sched_entity *parent) 7800 struct sched_entity *parent)
7802 { 7801 {
7803 struct rq *rq = cpu_rq(cpu); 7802 struct rq *rq = cpu_rq(cpu);
7804 tg->cfs_rq[cpu] = cfs_rq; 7803 tg->cfs_rq[cpu] = cfs_rq;
7805 init_cfs_rq(cfs_rq, rq); 7804 init_cfs_rq(cfs_rq, rq);
7806 cfs_rq->tg = tg; 7805 cfs_rq->tg = tg;
7807 7806
7808 tg->se[cpu] = se; 7807 tg->se[cpu] = se;
7809 /* se could be NULL for root_task_group */ 7808 /* se could be NULL for root_task_group */
7810 if (!se) 7809 if (!se)
7811 return; 7810 return;
7812 7811
7813 if (!parent) 7812 if (!parent)
7814 se->cfs_rq = &rq->cfs; 7813 se->cfs_rq = &rq->cfs;
7815 else 7814 else
7816 se->cfs_rq = parent->my_q; 7815 se->cfs_rq = parent->my_q;
7817 7816
7818 se->my_q = cfs_rq; 7817 se->my_q = cfs_rq;
7819 update_load_set(&se->load, 0); 7818 update_load_set(&se->load, 0);
7820 se->parent = parent; 7819 se->parent = parent;
7821 } 7820 }
7822 #endif 7821 #endif
7823 7822
7824 #ifdef CONFIG_RT_GROUP_SCHED 7823 #ifdef CONFIG_RT_GROUP_SCHED
7825 static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq, 7824 static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
7826 struct sched_rt_entity *rt_se, int cpu, 7825 struct sched_rt_entity *rt_se, int cpu,
7827 struct sched_rt_entity *parent) 7826 struct sched_rt_entity *parent)
7828 { 7827 {
7829 struct rq *rq = cpu_rq(cpu); 7828 struct rq *rq = cpu_rq(cpu);
7830 7829
7831 tg->rt_rq[cpu] = rt_rq; 7830 tg->rt_rq[cpu] = rt_rq;
7832 init_rt_rq(rt_rq, rq); 7831 init_rt_rq(rt_rq, rq);
7833 rt_rq->tg = tg; 7832 rt_rq->tg = tg;
7834 rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime; 7833 rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime;
7835 7834
7836 tg->rt_se[cpu] = rt_se; 7835 tg->rt_se[cpu] = rt_se;
7837 if (!rt_se) 7836 if (!rt_se)
7838 return; 7837 return;
7839 7838
7840 if (!parent) 7839 if (!parent)
7841 rt_se->rt_rq = &rq->rt; 7840 rt_se->rt_rq = &rq->rt;
7842 else 7841 else
7843 rt_se->rt_rq = parent->my_q; 7842 rt_se->rt_rq = parent->my_q;
7844 7843
7845 rt_se->my_q = rt_rq; 7844 rt_se->my_q = rt_rq;
7846 rt_se->parent = parent; 7845 rt_se->parent = parent;
7847 INIT_LIST_HEAD(&rt_se->run_list); 7846 INIT_LIST_HEAD(&rt_se->run_list);
7848 } 7847 }
7849 #endif 7848 #endif
7850 7849
7851 void __init sched_init(void) 7850 void __init sched_init(void)
7852 { 7851 {
7853 int i, j; 7852 int i, j;
7854 unsigned long alloc_size = 0, ptr; 7853 unsigned long alloc_size = 0, ptr;
7855 7854
7856 #ifdef CONFIG_FAIR_GROUP_SCHED 7855 #ifdef CONFIG_FAIR_GROUP_SCHED
7857 alloc_size += 2 * nr_cpu_ids * sizeof(void **); 7856 alloc_size += 2 * nr_cpu_ids * sizeof(void **);
7858 #endif 7857 #endif
7859 #ifdef CONFIG_RT_GROUP_SCHED 7858 #ifdef CONFIG_RT_GROUP_SCHED
7860 alloc_size += 2 * nr_cpu_ids * sizeof(void **); 7859 alloc_size += 2 * nr_cpu_ids * sizeof(void **);
7861 #endif 7860 #endif
7862 #ifdef CONFIG_CPUMASK_OFFSTACK 7861 #ifdef CONFIG_CPUMASK_OFFSTACK
7863 alloc_size += num_possible_cpus() * cpumask_size(); 7862 alloc_size += num_possible_cpus() * cpumask_size();
7864 #endif 7863 #endif
7865 if (alloc_size) { 7864 if (alloc_size) {
7866 ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT); 7865 ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT);
7867 7866
7868 #ifdef CONFIG_FAIR_GROUP_SCHED 7867 #ifdef CONFIG_FAIR_GROUP_SCHED
7869 root_task_group.se = (struct sched_entity **)ptr; 7868 root_task_group.se = (struct sched_entity **)ptr;
7870 ptr += nr_cpu_ids * sizeof(void **); 7869 ptr += nr_cpu_ids * sizeof(void **);
7871 7870
7872 root_task_group.cfs_rq = (struct cfs_rq **)ptr; 7871 root_task_group.cfs_rq = (struct cfs_rq **)ptr;
7873 ptr += nr_cpu_ids * sizeof(void **); 7872 ptr += nr_cpu_ids * sizeof(void **);
7874 7873
7875 #endif /* CONFIG_FAIR_GROUP_SCHED */ 7874 #endif /* CONFIG_FAIR_GROUP_SCHED */
7876 #ifdef CONFIG_RT_GROUP_SCHED 7875 #ifdef CONFIG_RT_GROUP_SCHED
7877 root_task_group.rt_se = (struct sched_rt_entity **)ptr; 7876 root_task_group.rt_se = (struct sched_rt_entity **)ptr;
7878 ptr += nr_cpu_ids * sizeof(void **); 7877 ptr += nr_cpu_ids * sizeof(void **);
7879 7878
7880 root_task_group.rt_rq = (struct rt_rq **)ptr; 7879 root_task_group.rt_rq = (struct rt_rq **)ptr;
7881 ptr += nr_cpu_ids * sizeof(void **); 7880 ptr += nr_cpu_ids * sizeof(void **);
7882 7881
7883 #endif /* CONFIG_RT_GROUP_SCHED */ 7882 #endif /* CONFIG_RT_GROUP_SCHED */
7884 #ifdef CONFIG_CPUMASK_OFFSTACK 7883 #ifdef CONFIG_CPUMASK_OFFSTACK
7885 for_each_possible_cpu(i) { 7884 for_each_possible_cpu(i) {
7886 per_cpu(load_balance_tmpmask, i) = (void *)ptr; 7885 per_cpu(load_balance_tmpmask, i) = (void *)ptr;
7887 ptr += cpumask_size(); 7886 ptr += cpumask_size();
7888 } 7887 }
7889 #endif /* CONFIG_CPUMASK_OFFSTACK */ 7888 #endif /* CONFIG_CPUMASK_OFFSTACK */
7890 } 7889 }
7891 7890
7892 #ifdef CONFIG_SMP 7891 #ifdef CONFIG_SMP
7893 init_defrootdomain(); 7892 init_defrootdomain();
7894 #endif 7893 #endif
7895 7894
7896 init_rt_bandwidth(&def_rt_bandwidth, 7895 init_rt_bandwidth(&def_rt_bandwidth,
7897 global_rt_period(), global_rt_runtime()); 7896 global_rt_period(), global_rt_runtime());
7898 7897
7899 #ifdef CONFIG_RT_GROUP_SCHED 7898 #ifdef CONFIG_RT_GROUP_SCHED
7900 init_rt_bandwidth(&root_task_group.rt_bandwidth, 7899 init_rt_bandwidth(&root_task_group.rt_bandwidth,
7901 global_rt_period(), global_rt_runtime()); 7900 global_rt_period(), global_rt_runtime());
7902 #endif /* CONFIG_RT_GROUP_SCHED */ 7901 #endif /* CONFIG_RT_GROUP_SCHED */
7903 7902
7904 #ifdef CONFIG_CGROUP_SCHED 7903 #ifdef CONFIG_CGROUP_SCHED
7905 list_add(&root_task_group.list, &task_groups); 7904 list_add(&root_task_group.list, &task_groups);
7906 INIT_LIST_HEAD(&root_task_group.children); 7905 INIT_LIST_HEAD(&root_task_group.children);
7907 autogroup_init(&init_task); 7906 autogroup_init(&init_task);
7908 #endif /* CONFIG_CGROUP_SCHED */ 7907 #endif /* CONFIG_CGROUP_SCHED */
7909 7908
7910 for_each_possible_cpu(i) { 7909 for_each_possible_cpu(i) {
7911 struct rq *rq; 7910 struct rq *rq;
7912 7911
7913 rq = cpu_rq(i); 7912 rq = cpu_rq(i);
7914 raw_spin_lock_init(&rq->lock); 7913 raw_spin_lock_init(&rq->lock);
7915 rq->nr_running = 0; 7914 rq->nr_running = 0;
7916 rq->calc_load_active = 0; 7915 rq->calc_load_active = 0;
7917 rq->calc_load_update = jiffies + LOAD_FREQ; 7916 rq->calc_load_update = jiffies + LOAD_FREQ;
7918 init_cfs_rq(&rq->cfs, rq); 7917 init_cfs_rq(&rq->cfs, rq);
7919 init_rt_rq(&rq->rt, rq); 7918 init_rt_rq(&rq->rt, rq);
7920 #ifdef CONFIG_FAIR_GROUP_SCHED 7919 #ifdef CONFIG_FAIR_GROUP_SCHED
7921 root_task_group.shares = root_task_group_load; 7920 root_task_group.shares = root_task_group_load;
7922 INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); 7921 INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
7923 /* 7922 /*
7924 * How much cpu bandwidth does root_task_group get? 7923 * How much cpu bandwidth does root_task_group get?
7925 * 7924 *
7926 * In case of task-groups formed through the cgroup filesystem, it 7925 * In case of task-groups formed through the cgroup filesystem, it
7927 * gets 100% of the cpu resources in the system. This overall 7926 * gets 100% of the cpu resources in the system. This overall
7928 * system cpu resource is divided among the tasks of 7927 * system cpu resource is divided among the tasks of
7929 * root_task_group and its child task-groups in a fair manner, 7928 * root_task_group and its child task-groups in a fair manner,
7930 * based on each entity's (task or task-group's) weight 7929 * based on each entity's (task or task-group's) weight
7931 * (se->load.weight). 7930 * (se->load.weight).
7932 * 7931 *
7933 * In other words, if root_task_group has 10 tasks (of weight 7932 * In other words, if root_task_group has 10 tasks (of weight
7934 * 1024) and two child groups A0 and A1 (of weight 1024 each), 7933 * 1024) and two child groups A0 and A1 (of weight 1024 each),
7935 * then A0's share of the cpu resource is: 7934 * then A0's share of the cpu resource is:
7936 * 7935 *
7937 * A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33% 7936 * A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33%
7938 * 7937 *
7939 * We achieve this by letting root_task_group's tasks sit 7938 * We achieve this by letting root_task_group's tasks sit
7940 * directly in rq->cfs (i.e root_task_group->se[] = NULL). 7939 * directly in rq->cfs (i.e root_task_group->se[] = NULL).
7941 */ 7940 */
7942 init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL); 7941 init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL);
7943 #endif /* CONFIG_FAIR_GROUP_SCHED */ 7942 #endif /* CONFIG_FAIR_GROUP_SCHED */
7944 7943
7945 rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime; 7944 rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime;
7946 #ifdef CONFIG_RT_GROUP_SCHED 7945 #ifdef CONFIG_RT_GROUP_SCHED
7947 INIT_LIST_HEAD(&rq->leaf_rt_rq_list); 7946 INIT_LIST_HEAD(&rq->leaf_rt_rq_list);
7948 init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL); 7947 init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL);
7949 #endif 7948 #endif
7950 7949
7951 for (j = 0; j < CPU_LOAD_IDX_MAX; j++) 7950 for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
7952 rq->cpu_load[j] = 0; 7951 rq->cpu_load[j] = 0;
7953 7952
7954 rq->last_load_update_tick = jiffies; 7953 rq->last_load_update_tick = jiffies;
7955 7954
7956 #ifdef CONFIG_SMP 7955 #ifdef CONFIG_SMP
7957 rq->sd = NULL; 7956 rq->sd = NULL;
7958 rq->rd = NULL; 7957 rq->rd = NULL;
7959 rq->cpu_power = SCHED_POWER_SCALE; 7958 rq->cpu_power = SCHED_POWER_SCALE;
7960 rq->post_schedule = 0; 7959 rq->post_schedule = 0;
7961 rq->active_balance = 0; 7960 rq->active_balance = 0;
7962 rq->next_balance = jiffies; 7961 rq->next_balance = jiffies;
7963 rq->push_cpu = 0; 7962 rq->push_cpu = 0;
7964 rq->cpu = i; 7963 rq->cpu = i;
7965 rq->online = 0; 7964 rq->online = 0;
7966 rq->idle_stamp = 0; 7965 rq->idle_stamp = 0;
7967 rq->avg_idle = 2*sysctl_sched_migration_cost; 7966 rq->avg_idle = 2*sysctl_sched_migration_cost;
7968 rq_attach_root(rq, &def_root_domain); 7967 rq_attach_root(rq, &def_root_domain);
7969 #ifdef CONFIG_NO_HZ 7968 #ifdef CONFIG_NO_HZ
7970 rq->nohz_balance_kick = 0; 7969 rq->nohz_balance_kick = 0;
7971 init_sched_softirq_csd(&per_cpu(remote_sched_softirq_cb, i)); 7970 init_sched_softirq_csd(&per_cpu(remote_sched_softirq_cb, i));
7972 #endif 7971 #endif
7973 #endif 7972 #endif
7974 init_rq_hrtick(rq); 7973 init_rq_hrtick(rq);
7975 atomic_set(&rq->nr_iowait, 0); 7974 atomic_set(&rq->nr_iowait, 0);
7976 } 7975 }
7977 7976
7978 set_load_weight(&init_task); 7977 set_load_weight(&init_task);
7979 7978
7980 #ifdef CONFIG_PREEMPT_NOTIFIERS 7979 #ifdef CONFIG_PREEMPT_NOTIFIERS
7981 INIT_HLIST_HEAD(&init_task.preempt_notifiers); 7980 INIT_HLIST_HEAD(&init_task.preempt_notifiers);
7982 #endif 7981 #endif
7983 7982
7984 #ifdef CONFIG_SMP 7983 #ifdef CONFIG_SMP
7985 open_softirq(SCHED_SOFTIRQ, run_rebalance_domains); 7984 open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);
7986 #endif 7985 #endif
7987 7986
7988 #ifdef CONFIG_RT_MUTEXES 7987 #ifdef CONFIG_RT_MUTEXES
7989 plist_head_init_raw(&init_task.pi_waiters, &init_task.pi_lock); 7988 plist_head_init_raw(&init_task.pi_waiters, &init_task.pi_lock);
7990 #endif 7989 #endif
7991 7990
7992 /* 7991 /*
7993 * The boot idle thread does lazy MMU switching as well: 7992 * The boot idle thread does lazy MMU switching as well:
7994 */ 7993 */
7995 atomic_inc(&init_mm.mm_count); 7994 atomic_inc(&init_mm.mm_count);
7996 enter_lazy_tlb(&init_mm, current); 7995 enter_lazy_tlb(&init_mm, current);
7997 7996
7998 /* 7997 /*
7999 * Make us the idle thread. Technically, schedule() should not be 7998 * Make us the idle thread. Technically, schedule() should not be
8000 * called from this thread; however, somewhere below it might be, 7999 * called from this thread; however, somewhere below it might be,
8001 * but because we are the idle thread, we just pick up running again 8000 * but because we are the idle thread, we just pick up running again
8002 * when this runqueue becomes "idle". 8001 * when this runqueue becomes "idle".
8003 */ 8002 */
8004 init_idle(current, smp_processor_id()); 8003 init_idle(current, smp_processor_id());
8005 8004
8006 calc_load_update = jiffies + LOAD_FREQ; 8005 calc_load_update = jiffies + LOAD_FREQ;
8007 8006
8008 /* 8007 /*
8009 * During early bootup we pretend to be a normal task: 8008 * During early bootup we pretend to be a normal task:
8010 */ 8009 */
8011 current->sched_class = &fair_sched_class; 8010 current->sched_class = &fair_sched_class;
8012 8011
8013 /* Allocate the nohz_cpu_mask if CONFIG_CPUMASK_OFFSTACK */ 8012 /* Allocate the nohz_cpu_mask if CONFIG_CPUMASK_OFFSTACK */
8014 zalloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT); 8013 zalloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT);
8015 #ifdef CONFIG_SMP 8014 #ifdef CONFIG_SMP
8016 zalloc_cpumask_var(&sched_domains_tmpmask, GFP_NOWAIT); 8015 zalloc_cpumask_var(&sched_domains_tmpmask, GFP_NOWAIT);
8017 #ifdef CONFIG_NO_HZ 8016 #ifdef CONFIG_NO_HZ
8018 zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT); 8017 zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
8019 alloc_cpumask_var(&nohz.grp_idle_mask, GFP_NOWAIT); 8018 alloc_cpumask_var(&nohz.grp_idle_mask, GFP_NOWAIT);
8020 atomic_set(&nohz.load_balancer, nr_cpu_ids); 8019 atomic_set(&nohz.load_balancer, nr_cpu_ids);
8021 atomic_set(&nohz.first_pick_cpu, nr_cpu_ids); 8020 atomic_set(&nohz.first_pick_cpu, nr_cpu_ids);
8022 atomic_set(&nohz.second_pick_cpu, nr_cpu_ids); 8021 atomic_set(&nohz.second_pick_cpu, nr_cpu_ids);
8023 #endif 8022 #endif
8024 /* May be allocated at isolcpus cmdline parse time */ 8023 /* May be allocated at isolcpus cmdline parse time */
8025 if (cpu_isolated_map == NULL) 8024 if (cpu_isolated_map == NULL)
8026 zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT); 8025 zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);
8027 #endif /* SMP */ 8026 #endif /* SMP */
8028 8027
8029 scheduler_running = 1; 8028 scheduler_running = 1;
8030 } 8029 }
8031 8030
8032 #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP 8031 #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
8033 static inline int preempt_count_equals(int preempt_offset) 8032 static inline int preempt_count_equals(int preempt_offset)
8034 { 8033 {
8035 int nested = (preempt_count() & ~PREEMPT_ACTIVE) + rcu_preempt_depth(); 8034 int nested = (preempt_count() & ~PREEMPT_ACTIVE) + rcu_preempt_depth();
8036 8035
8037 return (nested == preempt_offset); 8036 return (nested == preempt_offset);
8038 } 8037 }
8039 8038
8040 void __might_sleep(const char *file, int line, int preempt_offset) 8039 void __might_sleep(const char *file, int line, int preempt_offset)
8041 { 8040 {
8042 #ifdef in_atomic 8041 #ifdef in_atomic
8043 static unsigned long prev_jiffy; /* ratelimiting */ 8042 static unsigned long prev_jiffy; /* ratelimiting */
8044 8043
8045 if ((preempt_count_equals(preempt_offset) && !irqs_disabled()) || 8044 if ((preempt_count_equals(preempt_offset) && !irqs_disabled()) ||
8046 system_state != SYSTEM_RUNNING || oops_in_progress) 8045 system_state != SYSTEM_RUNNING || oops_in_progress)
8047 return; 8046 return;
8048 if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) 8047 if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
8049 return; 8048 return;
8050 prev_jiffy = jiffies; 8049 prev_jiffy = jiffies;
8051 8050
8052 printk(KERN_ERR 8051 printk(KERN_ERR
8053 "BUG: sleeping function called from invalid context at %s:%d\n", 8052 "BUG: sleeping function called from invalid context at %s:%d\n",
8054 file, line); 8053 file, line);
8055 printk(KERN_ERR 8054 printk(KERN_ERR
8056 "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n", 8055 "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n",
8057 in_atomic(), irqs_disabled(), 8056 in_atomic(), irqs_disabled(),
8058 current->pid, current->comm); 8057 current->pid, current->comm);
8059 8058
8060 debug_show_held_locks(current); 8059 debug_show_held_locks(current);
8061 if (irqs_disabled()) 8060 if (irqs_disabled())
8062 print_irqtrace_events(current); 8061 print_irqtrace_events(current);
8063 dump_stack(); 8062 dump_stack();
8064 #endif 8063 #endif
8065 } 8064 }
8066 EXPORT_SYMBOL(__might_sleep); 8065 EXPORT_SYMBOL(__might_sleep);
8067 #endif 8066 #endif
8068 8067
8069 #ifdef CONFIG_MAGIC_SYSRQ 8068 #ifdef CONFIG_MAGIC_SYSRQ
8070 static void normalize_task(struct rq *rq, struct task_struct *p) 8069 static void normalize_task(struct rq *rq, struct task_struct *p)
8071 { 8070 {
8072 const struct sched_class *prev_class = p->sched_class; 8071 const struct sched_class *prev_class = p->sched_class;
8073 int old_prio = p->prio; 8072 int old_prio = p->prio;
8074 int on_rq; 8073 int on_rq;
8075 8074
8076 on_rq = p->on_rq; 8075 on_rq = p->on_rq;
8077 if (on_rq) 8076 if (on_rq)
8078 deactivate_task(rq, p, 0); 8077 deactivate_task(rq, p, 0);
8079 __setscheduler(rq, p, SCHED_NORMAL, 0); 8078 __setscheduler(rq, p, SCHED_NORMAL, 0);
8080 if (on_rq) { 8079 if (on_rq) {
8081 activate_task(rq, p, 0); 8080 activate_task(rq, p, 0);
8082 resched_task(rq->curr); 8081 resched_task(rq->curr);
8083 } 8082 }
8084 8083
8085 check_class_changed(rq, p, prev_class, old_prio); 8084 check_class_changed(rq, p, prev_class, old_prio);
8086 } 8085 }
8087 8086
8088 void normalize_rt_tasks(void) 8087 void normalize_rt_tasks(void)
8089 { 8088 {
8090 struct task_struct *g, *p; 8089 struct task_struct *g, *p;
8091 unsigned long flags; 8090 unsigned long flags;
8092 struct rq *rq; 8091 struct rq *rq;
8093 8092
8094 read_lock_irqsave(&tasklist_lock, flags); 8093 read_lock_irqsave(&tasklist_lock, flags);
8095 do_each_thread(g, p) { 8094 do_each_thread(g, p) {
8096 /* 8095 /*
8097 * Only normalize user tasks: 8096 * Only normalize user tasks:
8098 */ 8097 */
8099 if (!p->mm) 8098 if (!p->mm)
8100 continue; 8099 continue;
8101 8100
8102 p->se.exec_start = 0; 8101 p->se.exec_start = 0;
8103 #ifdef CONFIG_SCHEDSTATS 8102 #ifdef CONFIG_SCHEDSTATS
8104 p->se.statistics.wait_start = 0; 8103 p->se.statistics.wait_start = 0;
8105 p->se.statistics.sleep_start = 0; 8104 p->se.statistics.sleep_start = 0;
8106 p->se.statistics.block_start = 0; 8105 p->se.statistics.block_start = 0;
8107 #endif 8106 #endif
8108 8107
8109 if (!rt_task(p)) { 8108 if (!rt_task(p)) {
8110 /* 8109 /*
8111 * Renice negative nice level userspace 8110 * Renice negative nice level userspace
8112 * tasks back to 0: 8111 * tasks back to 0:
8113 */ 8112 */
8114 if (TASK_NICE(p) < 0 && p->mm) 8113 if (TASK_NICE(p) < 0 && p->mm)
8115 set_user_nice(p, 0); 8114 set_user_nice(p, 0);
8116 continue; 8115 continue;
8117 } 8116 }
8118 8117
8119 raw_spin_lock(&p->pi_lock); 8118 raw_spin_lock(&p->pi_lock);
8120 rq = __task_rq_lock(p); 8119 rq = __task_rq_lock(p);
8121 8120
8122 normalize_task(rq, p); 8121 normalize_task(rq, p);
8123 8122
8124 __task_rq_unlock(rq); 8123 __task_rq_unlock(rq);
8125 raw_spin_unlock(&p->pi_lock); 8124 raw_spin_unlock(&p->pi_lock);
8126 } while_each_thread(g, p); 8125 } while_each_thread(g, p);
8127 8126
8128 read_unlock_irqrestore(&tasklist_lock, flags); 8127 read_unlock_irqrestore(&tasklist_lock, flags);
8129 } 8128 }
8130 8129
8131 #endif /* CONFIG_MAGIC_SYSRQ */ 8130 #endif /* CONFIG_MAGIC_SYSRQ */
8132 8131
8133 #if defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) 8132 #if defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB)
8134 /* 8133 /*
8135 * These functions are only useful for IA64 MCA handling or kdb. 8134 * These functions are only useful for IA64 MCA handling or kdb.
8136 * 8135 *
8137 * They can only be called when the whole system has been 8136 * They can only be called when the whole system has been
8138 * stopped - every CPU needs to be quiescent, and no scheduling 8137 * stopped - every CPU needs to be quiescent, and no scheduling
8139 * activity can take place. Using them for anything else would 8138 * activity can take place. Using them for anything else would
8140 * be a serious bug, and as a result, they aren't even visible 8139 * be a serious bug, and as a result, they aren't even visible
8141 * under any other configuration. 8140 * under any other configuration.
8142 */ 8141 */
8143 8142
8144 /** 8143 /**
8145 * curr_task - return the current task for a given cpu. 8144 * curr_task - return the current task for a given cpu.
8146 * @cpu: the processor in question. 8145 * @cpu: the processor in question.
8147 * 8146 *
8148 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! 8147 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
8149 */ 8148 */
8150 struct task_struct *curr_task(int cpu) 8149 struct task_struct *curr_task(int cpu)
8151 { 8150 {
8152 return cpu_curr(cpu); 8151 return cpu_curr(cpu);
8153 } 8152 }
8154 8153
8155 #endif /* defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) */ 8154 #endif /* defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) */
8156 8155
8157 #ifdef CONFIG_IA64 8156 #ifdef CONFIG_IA64
8158 /** 8157 /**
8159 * set_curr_task - set the current task for a given cpu. 8158 * set_curr_task - set the current task for a given cpu.
8160 * @cpu: the processor in question. 8159 * @cpu: the processor in question.
8161 * @p: the task pointer to set. 8160 * @p: the task pointer to set.
8162 * 8161 *
8163 * Description: This function must only be used when non-maskable interrupts 8162 * Description: This function must only be used when non-maskable interrupts
8164 * are serviced on a separate stack. It allows the architecture to switch the 8163 * are serviced on a separate stack. It allows the architecture to switch the
8165 * notion of the current task on a cpu in a non-blocking manner. This function 8164 * notion of the current task on a cpu in a non-blocking manner. This function
8166 * must be called with all CPUs synchronized and interrupts disabled, and 8165 * must be called with all CPUs synchronized and interrupts disabled, and
8167 * the caller must save the original value of the current task (see 8166 * the caller must save the original value of the current task (see
8168 * curr_task() above) and restore that value before reenabling interrupts and 8167 * curr_task() above) and restore that value before reenabling interrupts and
8169 * re-starting the system. 8168 * re-starting the system.
8170 * 8169 *
8171 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! 8170 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
8172 */ 8171 */
8173 void set_curr_task(int cpu, struct task_struct *p) 8172 void set_curr_task(int cpu, struct task_struct *p)
8174 { 8173 {
8175 cpu_curr(cpu) = p; 8174 cpu_curr(cpu) = p;
8176 } 8175 }
8177 8176
8178 #endif 8177 #endif
8179 8178
8180 #ifdef CONFIG_FAIR_GROUP_SCHED 8179 #ifdef CONFIG_FAIR_GROUP_SCHED
8181 static void free_fair_sched_group(struct task_group *tg) 8180 static void free_fair_sched_group(struct task_group *tg)
8182 { 8181 {
8183 int i; 8182 int i;
8184 8183
8185 for_each_possible_cpu(i) { 8184 for_each_possible_cpu(i) {
8186 if (tg->cfs_rq) 8185 if (tg->cfs_rq)
8187 kfree(tg->cfs_rq[i]); 8186 kfree(tg->cfs_rq[i]);
8188 if (tg->se) 8187 if (tg->se)
8189 kfree(tg->se[i]); 8188 kfree(tg->se[i]);
8190 } 8189 }
8191 8190
8192 kfree(tg->cfs_rq); 8191 kfree(tg->cfs_rq);
8193 kfree(tg->se); 8192 kfree(tg->se);
8194 } 8193 }
8195 8194
8196 static 8195 static
8197 int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) 8196 int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
8198 { 8197 {
8199 struct cfs_rq *cfs_rq; 8198 struct cfs_rq *cfs_rq;
8200 struct sched_entity *se; 8199 struct sched_entity *se;
8201 int i; 8200 int i;
8202 8201
8203 tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL); 8202 tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL);
8204 if (!tg->cfs_rq) 8203 if (!tg->cfs_rq)
8205 goto err; 8204 goto err;
8206 tg->se = kzalloc(sizeof(se) * nr_cpu_ids, GFP_KERNEL); 8205 tg->se = kzalloc(sizeof(se) * nr_cpu_ids, GFP_KERNEL);
8207 if (!tg->se) 8206 if (!tg->se)
8208 goto err; 8207 goto err;
8209 8208
8210 tg->shares = NICE_0_LOAD; 8209 tg->shares = NICE_0_LOAD;
8211 8210
8212 for_each_possible_cpu(i) { 8211 for_each_possible_cpu(i) {
8213 cfs_rq = kzalloc_node(sizeof(struct cfs_rq), 8212 cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
8214 GFP_KERNEL, cpu_to_node(i)); 8213 GFP_KERNEL, cpu_to_node(i));
8215 if (!cfs_rq) 8214 if (!cfs_rq)
8216 goto err; 8215 goto err;
8217 8216
8218 se = kzalloc_node(sizeof(struct sched_entity), 8217 se = kzalloc_node(sizeof(struct sched_entity),
8219 GFP_KERNEL, cpu_to_node(i)); 8218 GFP_KERNEL, cpu_to_node(i));
8220 if (!se) 8219 if (!se)
8221 goto err_free_rq; 8220 goto err_free_rq;
8222 8221
8223 init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]); 8222 init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]);
8224 } 8223 }
8225 8224
8226 return 1; 8225 return 1;
8227 8226
8228 err_free_rq: 8227 err_free_rq:
8229 kfree(cfs_rq); 8228 kfree(cfs_rq);
8230 err: 8229 err:
8231 return 0; 8230 return 0;
8232 } 8231 }
8233 8232
8234 static inline void unregister_fair_sched_group(struct task_group *tg, int cpu) 8233 static inline void unregister_fair_sched_group(struct task_group *tg, int cpu)
8235 { 8234 {
8236 struct rq *rq = cpu_rq(cpu); 8235 struct rq *rq = cpu_rq(cpu);
8237 unsigned long flags; 8236 unsigned long flags;
8238 8237
8239 /* 8238 /*
8240 * Only empty task groups can be destroyed; so we can speculatively 8239 * Only empty task groups can be destroyed; so we can speculatively
8241 * check on_list without danger of it being re-added. 8240 * check on_list without danger of it being re-added.
8242 */ 8241 */
8243 if (!tg->cfs_rq[cpu]->on_list) 8242 if (!tg->cfs_rq[cpu]->on_list)
8244 return; 8243 return;
8245 8244
8246 raw_spin_lock_irqsave(&rq->lock, flags); 8245 raw_spin_lock_irqsave(&rq->lock, flags);
8247 list_del_leaf_cfs_rq(tg->cfs_rq[cpu]); 8246 list_del_leaf_cfs_rq(tg->cfs_rq[cpu]);
8248 raw_spin_unlock_irqrestore(&rq->lock, flags); 8247 raw_spin_unlock_irqrestore(&rq->lock, flags);
8249 } 8248 }
8250 #else /* !CONFIG_FAIR_GROUP_SCHED */ 8249 #else /* !CONFIG_FAIR_GROUP_SCHED */
8251 static inline void free_fair_sched_group(struct task_group *tg) 8250 static inline void free_fair_sched_group(struct task_group *tg)
8252 { 8251 {
8253 } 8252 }
8254 8253
8255 static inline 8254 static inline
8256 int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) 8255 int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
8257 { 8256 {
8258 return 1; 8257 return 1;
8259 } 8258 }
8260 8259
8261 static inline void unregister_fair_sched_group(struct task_group *tg, int cpu) 8260 static inline void unregister_fair_sched_group(struct task_group *tg, int cpu)
8262 { 8261 {
8263 } 8262 }
8264 #endif /* CONFIG_FAIR_GROUP_SCHED */ 8263 #endif /* CONFIG_FAIR_GROUP_SCHED */
8265 8264
8266 #ifdef CONFIG_RT_GROUP_SCHED 8265 #ifdef CONFIG_RT_GROUP_SCHED
8267 static void free_rt_sched_group(struct task_group *tg) 8266 static void free_rt_sched_group(struct task_group *tg)
8268 { 8267 {
8269 int i; 8268 int i;
8270 8269
8271 destroy_rt_bandwidth(&tg->rt_bandwidth); 8270 destroy_rt_bandwidth(&tg->rt_bandwidth);
8272 8271
8273 for_each_possible_cpu(i) { 8272 for_each_possible_cpu(i) {
8274 if (tg->rt_rq) 8273 if (tg->rt_rq)
8275 kfree(tg->rt_rq[i]); 8274 kfree(tg->rt_rq[i]);
8276 if (tg->rt_se) 8275 if (tg->rt_se)
8277 kfree(tg->rt_se[i]); 8276 kfree(tg->rt_se[i]);
8278 } 8277 }
8279 8278
8280 kfree(tg->rt_rq); 8279 kfree(tg->rt_rq);
8281 kfree(tg->rt_se); 8280 kfree(tg->rt_se);
8282 } 8281 }
8283 8282
8284 static 8283 static
8285 int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) 8284 int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
8286 { 8285 {
8287 struct rt_rq *rt_rq; 8286 struct rt_rq *rt_rq;
8288 struct sched_rt_entity *rt_se; 8287 struct sched_rt_entity *rt_se;
8289 int i; 8288 int i;
8290 8289
8291 tg->rt_rq = kzalloc(sizeof(rt_rq) * nr_cpu_ids, GFP_KERNEL); 8290 tg->rt_rq = kzalloc(sizeof(rt_rq) * nr_cpu_ids, GFP_KERNEL);
8292 if (!tg->rt_rq) 8291 if (!tg->rt_rq)
8293 goto err; 8292 goto err;
8294 tg->rt_se = kzalloc(sizeof(rt_se) * nr_cpu_ids, GFP_KERNEL); 8293 tg->rt_se = kzalloc(sizeof(rt_se) * nr_cpu_ids, GFP_KERNEL);
8295 if (!tg->rt_se) 8294 if (!tg->rt_se)
8296 goto err; 8295 goto err;
8297 8296
8298 init_rt_bandwidth(&tg->rt_bandwidth, 8297 init_rt_bandwidth(&tg->rt_bandwidth,
8299 ktime_to_ns(def_rt_bandwidth.rt_period), 0); 8298 ktime_to_ns(def_rt_bandwidth.rt_period), 0);
8300 8299
8301 for_each_possible_cpu(i) { 8300 for_each_possible_cpu(i) {
8302 rt_rq = kzalloc_node(sizeof(struct rt_rq), 8301 rt_rq = kzalloc_node(sizeof(struct rt_rq),
8303 GFP_KERNEL, cpu_to_node(i)); 8302 GFP_KERNEL, cpu_to_node(i));
8304 if (!rt_rq) 8303 if (!rt_rq)
8305 goto err; 8304 goto err;
8306 8305
8307 rt_se = kzalloc_node(sizeof(struct sched_rt_entity), 8306 rt_se = kzalloc_node(sizeof(struct sched_rt_entity),
8308 GFP_KERNEL, cpu_to_node(i)); 8307 GFP_KERNEL, cpu_to_node(i));
8309 if (!rt_se) 8308 if (!rt_se)
8310 goto err_free_rq; 8309 goto err_free_rq;
8311 8310
8312 init_tg_rt_entry(tg, rt_rq, rt_se, i, parent->rt_se[i]); 8311 init_tg_rt_entry(tg, rt_rq, rt_se, i, parent->rt_se[i]);
8313 } 8312 }
8314 8313
8315 return 1; 8314 return 1;
8316 8315
8317 err_free_rq: 8316 err_free_rq:
8318 kfree(rt_rq); 8317 kfree(rt_rq);
8319 err: 8318 err:
8320 return 0; 8319 return 0;
8321 } 8320 }
8322 #else /* !CONFIG_RT_GROUP_SCHED */ 8321 #else /* !CONFIG_RT_GROUP_SCHED */
8323 static inline void free_rt_sched_group(struct task_group *tg) 8322 static inline void free_rt_sched_group(struct task_group *tg)
8324 { 8323 {
8325 } 8324 }
8326 8325
8327 static inline 8326 static inline
8328 int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) 8327 int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
8329 { 8328 {
8330 return 1; 8329 return 1;
8331 } 8330 }
8332 #endif /* CONFIG_RT_GROUP_SCHED */ 8331 #endif /* CONFIG_RT_GROUP_SCHED */
8333 8332
8334 #ifdef CONFIG_CGROUP_SCHED 8333 #ifdef CONFIG_CGROUP_SCHED
8335 static void free_sched_group(struct task_group *tg) 8334 static void free_sched_group(struct task_group *tg)
8336 { 8335 {
8337 free_fair_sched_group(tg); 8336 free_fair_sched_group(tg);
8338 free_rt_sched_group(tg); 8337 free_rt_sched_group(tg);
8339 autogroup_free(tg); 8338 autogroup_free(tg);
8340 kfree(tg); 8339 kfree(tg);
8341 } 8340 }
8342 8341
8343 /* allocate runqueue etc for a new task group */ 8342 /* allocate runqueue etc for a new task group */
8344 struct task_group *sched_create_group(struct task_group *parent) 8343 struct task_group *sched_create_group(struct task_group *parent)
8345 { 8344 {
8346 struct task_group *tg; 8345 struct task_group *tg;
8347 unsigned long flags; 8346 unsigned long flags;
8348 8347
8349 tg = kzalloc(sizeof(*tg), GFP_KERNEL); 8348 tg = kzalloc(sizeof(*tg), GFP_KERNEL);
8350 if (!tg) 8349 if (!tg)
8351 return ERR_PTR(-ENOMEM); 8350 return ERR_PTR(-ENOMEM);
8352 8351
8353 if (!alloc_fair_sched_group(tg, parent)) 8352 if (!alloc_fair_sched_group(tg, parent))
8354 goto err; 8353 goto err;
8355 8354
8356 if (!alloc_rt_sched_group(tg, parent)) 8355 if (!alloc_rt_sched_group(tg, parent))
8357 goto err; 8356 goto err;
8358 8357
8359 spin_lock_irqsave(&task_group_lock, flags); 8358 spin_lock_irqsave(&task_group_lock, flags);
8360 list_add_rcu(&tg->list, &task_groups); 8359 list_add_rcu(&tg->list, &task_groups);
8361 8360
8362 WARN_ON(!parent); /* root should already exist */ 8361 WARN_ON(!parent); /* root should already exist */
8363 8362
8364 tg->parent = parent; 8363 tg->parent = parent;
8365 INIT_LIST_HEAD(&tg->children); 8364 INIT_LIST_HEAD(&tg->children);
8366 list_add_rcu(&tg->siblings, &parent->children); 8365 list_add_rcu(&tg->siblings, &parent->children);
8367 spin_unlock_irqrestore(&task_group_lock, flags); 8366 spin_unlock_irqrestore(&task_group_lock, flags);
8368 8367
8369 return tg; 8368 return tg;
8370 8369
8371 err: 8370 err:
8372 free_sched_group(tg); 8371 free_sched_group(tg);
8373 return ERR_PTR(-ENOMEM); 8372 return ERR_PTR(-ENOMEM);
8374 } 8373 }
8375 8374
8376 /* rcu callback to free various structures associated with a task group */ 8375 /* rcu callback to free various structures associated with a task group */
8377 static void free_sched_group_rcu(struct rcu_head *rhp) 8376 static void free_sched_group_rcu(struct rcu_head *rhp)
8378 { 8377 {
8379 /* now it should be safe to free those cfs_rqs */ 8378 /* now it should be safe to free those cfs_rqs */
8380 free_sched_group(container_of(rhp, struct task_group, rcu)); 8379 free_sched_group(container_of(rhp, struct task_group, rcu));
8381 } 8380 }
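The callback above only ever receives a pointer to the rcu_head embedded in the task_group, and container_of() recovers the enclosing structure from it. A minimal userspace sketch of that recovery trick, with invented struct names standing in for task_group and rcu_head:

#include <stddef.h>
#include <stdio.h>

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct callback_head {			/* plays the role of rcu_head here */
	void (*func)(struct callback_head *head);
};

struct group {				/* plays the role of task_group here */
	int id;
	struct callback_head cb;	/* embedded, like task_group.rcu */
};

static void free_group_cb(struct callback_head *head)
{
	/* Recover the enclosing structure from the embedded member. */
	struct group *g = container_of(head, struct group, cb);

	printf("freeing group %d\n", g->id);
}

int main(void)
{
	struct group g = { .id = 42, .cb = { .func = free_group_cb } };

	/* The grace-period machinery would invoke this later; call it directly. */
	g.cb.func(&g.cb);
	return 0;
}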
8382 8381
8383 /* Destroy runqueue etc associated with a task group */ 8382 /* Destroy runqueue etc associated with a task group */
8384 void sched_destroy_group(struct task_group *tg) 8383 void sched_destroy_group(struct task_group *tg)
8385 { 8384 {
8386 unsigned long flags; 8385 unsigned long flags;
8387 int i; 8386 int i;
8388 8387
8389 /* end participation in shares distribution */ 8388 /* end participation in shares distribution */
8390 for_each_possible_cpu(i) 8389 for_each_possible_cpu(i)
8391 unregister_fair_sched_group(tg, i); 8390 unregister_fair_sched_group(tg, i);
8392 8391
8393 spin_lock_irqsave(&task_group_lock, flags); 8392 spin_lock_irqsave(&task_group_lock, flags);
8394 list_del_rcu(&tg->list); 8393 list_del_rcu(&tg->list);
8395 list_del_rcu(&tg->siblings); 8394 list_del_rcu(&tg->siblings);
8396 spin_unlock_irqrestore(&task_group_lock, flags); 8395 spin_unlock_irqrestore(&task_group_lock, flags);
8397 8396
8398 /* wait for possible concurrent references to cfs_rqs to complete */ 8397 /* wait for possible concurrent references to cfs_rqs to complete */
8399 call_rcu(&tg->rcu, free_sched_group_rcu); 8398 call_rcu(&tg->rcu, free_sched_group_rcu);
8400 } 8399 }
8401 8400
8402 /* Change a task's runqueue when it moves between groups. 8401 /* Change a task's runqueue when it moves between groups.
8403 * The caller of this function should have put the task in its new group 8402 * The caller of this function should have put the task in its new group
8404 * by now. This function just updates tsk->se.cfs_rq and tsk->se.parent to 8403 * by now. This function just updates tsk->se.cfs_rq and tsk->se.parent to
8405 * reflect its new group. 8404 * reflect its new group.
8406 */ 8405 */
8407 void sched_move_task(struct task_struct *tsk) 8406 void sched_move_task(struct task_struct *tsk)
8408 { 8407 {
8409 int on_rq, running; 8408 int on_rq, running;
8410 unsigned long flags; 8409 unsigned long flags;
8411 struct rq *rq; 8410 struct rq *rq;
8412 8411
8413 rq = task_rq_lock(tsk, &flags); 8412 rq = task_rq_lock(tsk, &flags);
8414 8413
8415 running = task_current(rq, tsk); 8414 running = task_current(rq, tsk);
8416 on_rq = tsk->on_rq; 8415 on_rq = tsk->on_rq;
8417 8416
8418 if (on_rq) 8417 if (on_rq)
8419 dequeue_task(rq, tsk, 0); 8418 dequeue_task(rq, tsk, 0);
8420 if (unlikely(running)) 8419 if (unlikely(running))
8421 tsk->sched_class->put_prev_task(rq, tsk); 8420 tsk->sched_class->put_prev_task(rq, tsk);
8422 8421
8423 #ifdef CONFIG_FAIR_GROUP_SCHED 8422 #ifdef CONFIG_FAIR_GROUP_SCHED
8424 if (tsk->sched_class->task_move_group) 8423 if (tsk->sched_class->task_move_group)
8425 tsk->sched_class->task_move_group(tsk, on_rq); 8424 tsk->sched_class->task_move_group(tsk, on_rq);
8426 else 8425 else
8427 #endif 8426 #endif
8428 set_task_rq(tsk, task_cpu(tsk)); 8427 set_task_rq(tsk, task_cpu(tsk));
8429 8428
8430 if (unlikely(running)) 8429 if (unlikely(running))
8431 tsk->sched_class->set_curr_task(rq); 8430 tsk->sched_class->set_curr_task(rq);
8432 if (on_rq) 8431 if (on_rq)
8433 enqueue_task(rq, tsk, 0); 8432 enqueue_task(rq, tsk, 0);
8434 8433
8435 task_rq_unlock(rq, tsk, &flags); 8434 task_rq_unlock(rq, tsk, &flags);
8436 } 8435 }
8437 #endif /* CONFIG_CGROUP_SCHED */ 8436 #endif /* CONFIG_CGROUP_SCHED */
8438 8437
8439 #ifdef CONFIG_FAIR_GROUP_SCHED 8438 #ifdef CONFIG_FAIR_GROUP_SCHED
8440 static DEFINE_MUTEX(shares_mutex); 8439 static DEFINE_MUTEX(shares_mutex);
8441 8440
8442 int sched_group_set_shares(struct task_group *tg, unsigned long shares) 8441 int sched_group_set_shares(struct task_group *tg, unsigned long shares)
8443 { 8442 {
8444 int i; 8443 int i;
8445 unsigned long flags; 8444 unsigned long flags;
8446 8445
8447 /* 8446 /*
8448 * We can't change the weight of the root cgroup. 8447 * We can't change the weight of the root cgroup.
8449 */ 8448 */
8450 if (!tg->se[0]) 8449 if (!tg->se[0])
8451 return -EINVAL; 8450 return -EINVAL;
8452 8451
8453 if (shares < MIN_SHARES) 8452 if (shares < MIN_SHARES)
8454 shares = MIN_SHARES; 8453 shares = MIN_SHARES;
8455 else if (shares > MAX_SHARES) 8454 else if (shares > MAX_SHARES)
8456 shares = MAX_SHARES; 8455 shares = MAX_SHARES;
8457 8456
8458 mutex_lock(&shares_mutex); 8457 mutex_lock(&shares_mutex);
8459 if (tg->shares == shares) 8458 if (tg->shares == shares)
8460 goto done; 8459 goto done;
8461 8460
8462 tg->shares = shares; 8461 tg->shares = shares;
8463 for_each_possible_cpu(i) { 8462 for_each_possible_cpu(i) {
8464 struct rq *rq = cpu_rq(i); 8463 struct rq *rq = cpu_rq(i);
8465 struct sched_entity *se; 8464 struct sched_entity *se;
8466 8465
8467 se = tg->se[i]; 8466 se = tg->se[i];
8468 /* Propagate contribution to hierarchy */ 8467 /* Propagate contribution to hierarchy */
8469 raw_spin_lock_irqsave(&rq->lock, flags); 8468 raw_spin_lock_irqsave(&rq->lock, flags);
8470 for_each_sched_entity(se) 8469 for_each_sched_entity(se)
8471 update_cfs_shares(group_cfs_rq(se)); 8470 update_cfs_shares(group_cfs_rq(se));
8472 raw_spin_unlock_irqrestore(&rq->lock, flags); 8471 raw_spin_unlock_irqrestore(&rq->lock, flags);
8473 } 8472 }
8474 8473
8475 done: 8474 done:
8476 mutex_unlock(&shares_mutex); 8475 mutex_unlock(&shares_mutex);
8477 return 0; 8476 return 0;
8478 } 8477 }
8479 8478
8480 unsigned long sched_group_shares(struct task_group *tg) 8479 unsigned long sched_group_shares(struct task_group *tg)
8481 { 8480 {
8482 return tg->shares; 8481 return tg->shares;
8483 } 8482 }
8484 #endif 8483 #endif
8485 8484
8486 #ifdef CONFIG_RT_GROUP_SCHED 8485 #ifdef CONFIG_RT_GROUP_SCHED
8487 /* 8486 /*
8488 * Ensure that the real time constraints are schedulable. 8487 * Ensure that the real time constraints are schedulable.
8489 */ 8488 */
8490 static DEFINE_MUTEX(rt_constraints_mutex); 8489 static DEFINE_MUTEX(rt_constraints_mutex);
8491 8490
8492 static unsigned long to_ratio(u64 period, u64 runtime) 8491 static unsigned long to_ratio(u64 period, u64 runtime)
8493 { 8492 {
8494 if (runtime == RUNTIME_INF) 8493 if (runtime == RUNTIME_INF)
8495 return 1ULL << 20; 8494 return 1ULL << 20;
8496 8495
8497 return div64_u64(runtime << 20, period); 8496 return div64_u64(runtime << 20, period);
8498 } 8497 }
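to_ratio() turns runtime/period into a fixed-point fraction with 20 fractional bits so that bandwidths can be summed and compared with integer math; RUNTIME_INF maps to a full 1 << 20. A standalone sketch of the same arithmetic, using 950000/1000000 only as illustrative values (they merely mirror the usual rt_runtime/rt_period defaults):

#include <stdint.h>
#include <stdio.h>

/* Userspace analogue of to_ratio(): runtime/period in 1/2^20 units. */
static uint64_t ratio(uint64_t period_us, uint64_t runtime_us)
{
	return (runtime_us << 20) / period_us;
}

int main(void)
{
	/* Illustrative values: 950ms of RT runtime per 1s period. */
	uint64_t r = ratio(1000000, 950000);

	/* Prints 996147 out of 1048576, i.e. roughly 95% of the CPU. */
	printf("%llu of %d\n", (unsigned long long)r, 1 << 20);
	return 0;
}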
8499 8498
8500 /* Must be called with tasklist_lock held */ 8499 /* Must be called with tasklist_lock held */
8501 static inline int tg_has_rt_tasks(struct task_group *tg) 8500 static inline int tg_has_rt_tasks(struct task_group *tg)
8502 { 8501 {
8503 struct task_struct *g, *p; 8502 struct task_struct *g, *p;
8504 8503
8505 do_each_thread(g, p) { 8504 do_each_thread(g, p) {
8506 if (rt_task(p) && rt_rq_of_se(&p->rt)->tg == tg) 8505 if (rt_task(p) && rt_rq_of_se(&p->rt)->tg == tg)
8507 return 1; 8506 return 1;
8508 } while_each_thread(g, p); 8507 } while_each_thread(g, p);
8509 8508
8510 return 0; 8509 return 0;
8511 } 8510 }
8512 8511
8513 struct rt_schedulable_data { 8512 struct rt_schedulable_data {
8514 struct task_group *tg; 8513 struct task_group *tg;
8515 u64 rt_period; 8514 u64 rt_period;
8516 u64 rt_runtime; 8515 u64 rt_runtime;
8517 }; 8516 };
8518 8517
8519 static int tg_schedulable(struct task_group *tg, void *data) 8518 static int tg_schedulable(struct task_group *tg, void *data)
8520 { 8519 {
8521 struct rt_schedulable_data *d = data; 8520 struct rt_schedulable_data *d = data;
8522 struct task_group *child; 8521 struct task_group *child;
8523 unsigned long total, sum = 0; 8522 unsigned long total, sum = 0;
8524 u64 period, runtime; 8523 u64 period, runtime;
8525 8524
8526 period = ktime_to_ns(tg->rt_bandwidth.rt_period); 8525 period = ktime_to_ns(tg->rt_bandwidth.rt_period);
8527 runtime = tg->rt_bandwidth.rt_runtime; 8526 runtime = tg->rt_bandwidth.rt_runtime;
8528 8527
8529 if (tg == d->tg) { 8528 if (tg == d->tg) {
8530 period = d->rt_period; 8529 period = d->rt_period;
8531 runtime = d->rt_runtime; 8530 runtime = d->rt_runtime;
8532 } 8531 }
8533 8532
8534 /* 8533 /*
8535 * Cannot have more runtime than the period. 8534 * Cannot have more runtime than the period.
8536 */ 8535 */
8537 if (runtime > period && runtime != RUNTIME_INF) 8536 if (runtime > period && runtime != RUNTIME_INF)
8538 return -EINVAL; 8537 return -EINVAL;
8539 8538
8540 /* 8539 /*
8541 * Ensure we don't starve existing RT tasks. 8540 * Ensure we don't starve existing RT tasks.
8542 */ 8541 */
8543 if (rt_bandwidth_enabled() && !runtime && tg_has_rt_tasks(tg)) 8542 if (rt_bandwidth_enabled() && !runtime && tg_has_rt_tasks(tg))
8544 return -EBUSY; 8543 return -EBUSY;
8545 8544
8546 total = to_ratio(period, runtime); 8545 total = to_ratio(period, runtime);
8547 8546
8548 /* 8547 /*
8549 * Nobody can have more than the global setting allows. 8548 * Nobody can have more than the global setting allows.
8550 */ 8549 */
8551 if (total > to_ratio(global_rt_period(), global_rt_runtime())) 8550 if (total > to_ratio(global_rt_period(), global_rt_runtime()))
8552 return -EINVAL; 8551 return -EINVAL;
8553 8552
8554 /* 8553 /*
8555 * The sum of our children's runtime should not exceed our own. 8554 * The sum of our children's runtime should not exceed our own.
8556 */ 8555 */
8557 list_for_each_entry_rcu(child, &tg->children, siblings) { 8556 list_for_each_entry_rcu(child, &tg->children, siblings) {
8558 period = ktime_to_ns(child->rt_bandwidth.rt_period); 8557 period = ktime_to_ns(child->rt_bandwidth.rt_period);
8559 runtime = child->rt_bandwidth.rt_runtime; 8558 runtime = child->rt_bandwidth.rt_runtime;
8560 8559
8561 if (child == d->tg) { 8560 if (child == d->tg) {
8562 period = d->rt_period; 8561 period = d->rt_period;
8563 runtime = d->rt_runtime; 8562 runtime = d->rt_runtime;
8564 } 8563 }
8565 8564
8566 sum += to_ratio(period, runtime); 8565 sum += to_ratio(period, runtime);
8567 } 8566 }
8568 8567
8569 if (sum > total) 8568 if (sum > total)
8570 return -EINVAL; 8569 return -EINVAL;
8571 8570
8572 return 0; 8571 return 0;
8573 } 8572 }
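Taken together, the checks in tg_schedulable() form a hierarchical admission test: a group's own ratio must fit under the global limit, and its children's ratios must sum to no more than its own. A toy sketch of that final comparison with hypothetical ratio values, not real task_group state:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	/* Hypothetical bandwidths in 1/2^20 units (see the sketch above). */
	uint64_t parent = 996147;              /* ~95% */
	uint64_t child[] = { 524288, 262144 }; /* 50% + 25% */
	uint64_t sum = 0;

	for (size_t i = 0; i < sizeof(child) / sizeof(child[0]); i++)
		sum += child[i];

	/* Mirrors the final "sum > total" rejection in tg_schedulable(). */
	printf("%s\n", sum > parent ? "-EINVAL" : "schedulable");
	return 0;
}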
8574 8573
8575 static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime) 8574 static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
8576 { 8575 {
8577 struct rt_schedulable_data data = { 8576 struct rt_schedulable_data data = {
8578 .tg = tg, 8577 .tg = tg,
8579 .rt_period = period, 8578 .rt_period = period,
8580 .rt_runtime = runtime, 8579 .rt_runtime = runtime,
8581 }; 8580 };
8582 8581
8583 return walk_tg_tree(tg_schedulable, tg_nop, &data); 8582 return walk_tg_tree(tg_schedulable, tg_nop, &data);
8584 } 8583 }
8585 8584
8586 static int tg_set_bandwidth(struct task_group *tg, 8585 static int tg_set_bandwidth(struct task_group *tg,
8587 u64 rt_period, u64 rt_runtime) 8586 u64 rt_period, u64 rt_runtime)
8588 { 8587 {
8589 int i, err = 0; 8588 int i, err = 0;
8590 8589
8591 mutex_lock(&rt_constraints_mutex); 8590 mutex_lock(&rt_constraints_mutex);
8592 read_lock(&tasklist_lock); 8591 read_lock(&tasklist_lock);
8593 err = __rt_schedulable(tg, rt_period, rt_runtime); 8592 err = __rt_schedulable(tg, rt_period, rt_runtime);
8594 if (err) 8593 if (err)
8595 goto unlock; 8594 goto unlock;
8596 8595
8597 raw_spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock); 8596 raw_spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock);
8598 tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period); 8597 tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period);
8599 tg->rt_bandwidth.rt_runtime = rt_runtime; 8598 tg->rt_bandwidth.rt_runtime = rt_runtime;
8600 8599
8601 for_each_possible_cpu(i) { 8600 for_each_possible_cpu(i) {
8602 struct rt_rq *rt_rq = tg->rt_rq[i]; 8601 struct rt_rq *rt_rq = tg->rt_rq[i];
8603 8602
8604 raw_spin_lock(&rt_rq->rt_runtime_lock); 8603 raw_spin_lock(&rt_rq->rt_runtime_lock);
8605 rt_rq->rt_runtime = rt_runtime; 8604 rt_rq->rt_runtime = rt_runtime;
8606 raw_spin_unlock(&rt_rq->rt_runtime_lock); 8605 raw_spin_unlock(&rt_rq->rt_runtime_lock);
8607 } 8606 }
8608 raw_spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock); 8607 raw_spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock);
8609 unlock: 8608 unlock:
8610 read_unlock(&tasklist_lock); 8609 read_unlock(&tasklist_lock);
8611 mutex_unlock(&rt_constraints_mutex); 8610 mutex_unlock(&rt_constraints_mutex);
8612 8611
8613 return err; 8612 return err;
8614 } 8613 }
8615 8614
8616 int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us) 8615 int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us)
8617 { 8616 {
8618 u64 rt_runtime, rt_period; 8617 u64 rt_runtime, rt_period;
8619 8618
8620 rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period); 8619 rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period);
8621 rt_runtime = (u64)rt_runtime_us * NSEC_PER_USEC; 8620 rt_runtime = (u64)rt_runtime_us * NSEC_PER_USEC;
8622 if (rt_runtime_us < 0) 8621 if (rt_runtime_us < 0)
8623 rt_runtime = RUNTIME_INF; 8622 rt_runtime = RUNTIME_INF;
8624 8623
8625 return tg_set_bandwidth(tg, rt_period, rt_runtime); 8624 return tg_set_bandwidth(tg, rt_period, rt_runtime);
8626 } 8625 }
8627 8626
8628 long sched_group_rt_runtime(struct task_group *tg) 8627 long sched_group_rt_runtime(struct task_group *tg)
8629 { 8628 {
8630 u64 rt_runtime_us; 8629 u64 rt_runtime_us;
8631 8630
8632 if (tg->rt_bandwidth.rt_runtime == RUNTIME_INF) 8631 if (tg->rt_bandwidth.rt_runtime == RUNTIME_INF)
8633 return -1; 8632 return -1;
8634 8633
8635 rt_runtime_us = tg->rt_bandwidth.rt_runtime; 8634 rt_runtime_us = tg->rt_bandwidth.rt_runtime;
8636 do_div(rt_runtime_us, NSEC_PER_USEC); 8635 do_div(rt_runtime_us, NSEC_PER_USEC);
8637 return rt_runtime_us; 8636 return rt_runtime_us;
8638 } 8637 }
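The cgroup files speak microseconds with -1 meaning unlimited, while the core stores nanoseconds and RUNTIME_INF; the two helpers above convert between the representations. A hedged userspace sketch of that round trip (RUNTIME_INF is taken as ~0ULL here, assumed to mirror the kernel's definition):

#include <stdint.h>
#include <stdio.h>

#define NSEC_PER_USEC 1000ULL
#define RUNTIME_INF   (~0ULL)	/* assumed to mirror the kernel's definition */

static uint64_t us_to_runtime(long long us)
{
	return us < 0 ? RUNTIME_INF : (uint64_t)us * NSEC_PER_USEC;
}

static long long runtime_to_us(uint64_t runtime)
{
	return runtime == RUNTIME_INF ? -1 : (long long)(runtime / NSEC_PER_USEC);
}

int main(void)
{
	/* -1 at the cgroup-file level means "no limit". */
	printf("%lld\n", runtime_to_us(us_to_runtime(-1)));	/* -1 */
	printf("%lld\n", runtime_to_us(us_to_runtime(950000)));	/* 950000 */
	return 0;
}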
8639 8638
8640 int sched_group_set_rt_period(struct task_group *tg, long rt_period_us) 8639 int sched_group_set_rt_period(struct task_group *tg, long rt_period_us)
8641 { 8640 {
8642 u64 rt_runtime, rt_period; 8641 u64 rt_runtime, rt_period;
8643 8642
8644 rt_period = (u64)rt_period_us * NSEC_PER_USEC; 8643 rt_period = (u64)rt_period_us * NSEC_PER_USEC;
8645 rt_runtime = tg->rt_bandwidth.rt_runtime; 8644 rt_runtime = tg->rt_bandwidth.rt_runtime;
8646 8645
8647 if (rt_period == 0) 8646 if (rt_period == 0)
8648 return -EINVAL; 8647 return -EINVAL;
8649 8648
8650 return tg_set_bandwidth(tg, rt_period, rt_runtime); 8649 return tg_set_bandwidth(tg, rt_period, rt_runtime);
8651 } 8650 }
8652 8651
8653 long sched_group_rt_period(struct task_group *tg) 8652 long sched_group_rt_period(struct task_group *tg)
8654 { 8653 {
8655 u64 rt_period_us; 8654 u64 rt_period_us;
8656 8655
8657 rt_period_us = ktime_to_ns(tg->rt_bandwidth.rt_period); 8656 rt_period_us = ktime_to_ns(tg->rt_bandwidth.rt_period);
8658 do_div(rt_period_us, NSEC_PER_USEC); 8657 do_div(rt_period_us, NSEC_PER_USEC);
8659 return rt_period_us; 8658 return rt_period_us;
8660 } 8659 }
8661 8660
8662 static int sched_rt_global_constraints(void) 8661 static int sched_rt_global_constraints(void)
8663 { 8662 {
8664 u64 runtime, period; 8663 u64 runtime, period;
8665 int ret = 0; 8664 int ret = 0;
8666 8665
8667 if (sysctl_sched_rt_period <= 0) 8666 if (sysctl_sched_rt_period <= 0)
8668 return -EINVAL; 8667 return -EINVAL;
8669 8668
8670 runtime = global_rt_runtime(); 8669 runtime = global_rt_runtime();
8671 period = global_rt_period(); 8670 period = global_rt_period();
8672 8671
8673 /* 8672 /*
8674 * Sanity check on the sysctl variables. 8673 * Sanity check on the sysctl variables.
8675 */ 8674 */
8676 if (runtime > period && runtime != RUNTIME_INF) 8675 if (runtime > period && runtime != RUNTIME_INF)
8677 return -EINVAL; 8676 return -EINVAL;
8678 8677
8679 mutex_lock(&rt_constraints_mutex); 8678 mutex_lock(&rt_constraints_mutex);
8680 read_lock(&tasklist_lock); 8679 read_lock(&tasklist_lock);
8681 ret = __rt_schedulable(NULL, 0, 0); 8680 ret = __rt_schedulable(NULL, 0, 0);
8682 read_unlock(&tasklist_lock); 8681 read_unlock(&tasklist_lock);
8683 mutex_unlock(&rt_constraints_mutex); 8682 mutex_unlock(&rt_constraints_mutex);
8684 8683
8685 return ret; 8684 return ret;
8686 } 8685 }
8687 8686
8688 int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk) 8687 int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk)
8689 { 8688 {
8690 /* Don't accept realtime tasks when there is no way for them to run */ 8689 /* Don't accept realtime tasks when there is no way for them to run */
8691 if (rt_task(tsk) && tg->rt_bandwidth.rt_runtime == 0) 8690 if (rt_task(tsk) && tg->rt_bandwidth.rt_runtime == 0)
8692 return 0; 8691 return 0;
8693 8692
8694 return 1; 8693 return 1;
8695 } 8694 }
8696 8695
8697 #else /* !CONFIG_RT_GROUP_SCHED */ 8696 #else /* !CONFIG_RT_GROUP_SCHED */
8698 static int sched_rt_global_constraints(void) 8697 static int sched_rt_global_constraints(void)
8699 { 8698 {
8700 unsigned long flags; 8699 unsigned long flags;
8701 int i; 8700 int i;
8702 8701
8703 if (sysctl_sched_rt_period <= 0) 8702 if (sysctl_sched_rt_period <= 0)
8704 return -EINVAL; 8703 return -EINVAL;
8705 8704
8706 /* 8705 /*
8707 * There are always some RT tasks in the root group 8706 * There are always some RT tasks in the root group
8708 * -- migration, kstopmachine, etc. 8707 * -- migration, kstopmachine, etc.
8709 */ 8708 */
8710 if (sysctl_sched_rt_runtime == 0) 8709 if (sysctl_sched_rt_runtime == 0)
8711 return -EBUSY; 8710 return -EBUSY;
8712 8711
8713 raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags); 8712 raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags);
8714 for_each_possible_cpu(i) { 8713 for_each_possible_cpu(i) {
8715 struct rt_rq *rt_rq = &cpu_rq(i)->rt; 8714 struct rt_rq *rt_rq = &cpu_rq(i)->rt;
8716 8715
8717 raw_spin_lock(&rt_rq->rt_runtime_lock); 8716 raw_spin_lock(&rt_rq->rt_runtime_lock);
8718 rt_rq->rt_runtime = global_rt_runtime(); 8717 rt_rq->rt_runtime = global_rt_runtime();
8719 raw_spin_unlock(&rt_rq->rt_runtime_lock); 8718 raw_spin_unlock(&rt_rq->rt_runtime_lock);
8720 } 8719 }
8721 raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags); 8720 raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags);
8722 8721
8723 return 0; 8722 return 0;
8724 } 8723 }
8725 #endif /* CONFIG_RT_GROUP_SCHED */ 8724 #endif /* CONFIG_RT_GROUP_SCHED */
8726 8725
8727 int sched_rt_handler(struct ctl_table *table, int write, 8726 int sched_rt_handler(struct ctl_table *table, int write,
8728 void __user *buffer, size_t *lenp, 8727 void __user *buffer, size_t *lenp,
8729 loff_t *ppos) 8728 loff_t *ppos)
8730 { 8729 {
8731 int ret; 8730 int ret;
8732 int old_period, old_runtime; 8731 int old_period, old_runtime;
8733 static DEFINE_MUTEX(mutex); 8732 static DEFINE_MUTEX(mutex);
8734 8733
8735 mutex_lock(&mutex); 8734 mutex_lock(&mutex);
8736 old_period = sysctl_sched_rt_period; 8735 old_period = sysctl_sched_rt_period;
8737 old_runtime = sysctl_sched_rt_runtime; 8736 old_runtime = sysctl_sched_rt_runtime;
8738 8737
8739 ret = proc_dointvec(table, write, buffer, lenp, ppos); 8738 ret = proc_dointvec(table, write, buffer, lenp, ppos);
8740 8739
8741 if (!ret && write) { 8740 if (!ret && write) {
8742 ret = sched_rt_global_constraints(); 8741 ret = sched_rt_global_constraints();
8743 if (ret) { 8742 if (ret) {
8744 sysctl_sched_rt_period = old_period; 8743 sysctl_sched_rt_period = old_period;
8745 sysctl_sched_rt_runtime = old_runtime; 8744 sysctl_sched_rt_runtime = old_runtime;
8746 } else { 8745 } else {
8747 def_rt_bandwidth.rt_runtime = global_rt_runtime(); 8746 def_rt_bandwidth.rt_runtime = global_rt_runtime();
8748 def_rt_bandwidth.rt_period = 8747 def_rt_bandwidth.rt_period =
8749 ns_to_ktime(global_rt_period()); 8748 ns_to_ktime(global_rt_period());
8750 } 8749 }
8751 } 8750 }
8752 mutex_unlock(&mutex); 8751 mutex_unlock(&mutex);
8753 8752
8754 return ret; 8753 return ret;
8755 } 8754 }
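sched_rt_handler() snapshots the old sysctl values, lets proc_dointvec() store the new ones, and restores the snapshot if the combined settings fail validation. A generic sketch of that save/validate/rollback pattern (validate() below is a stand-in, not a kernel function):

#include <stdio.h>

static int period = 1000000;	/* illustrative stand-ins for the sysctls */
static int runtime = 950000;

static int validate(int p, int r)
{
	/* Stand-in constraint: runtime must fit inside the period. */
	return (p > 0 && r <= p) ? 0 : -1;
}

static int write_settings(int new_period, int new_runtime)
{
	int old_period = period, old_runtime = runtime;

	/* Tentatively apply, as proc_dointvec() does for the real sysctls. */
	period = new_period;
	runtime = new_runtime;

	if (validate(period, runtime)) {
		/* Roll back to the snapshot on failure. */
		period = old_period;
		runtime = old_runtime;
		return -1;
	}
	return 0;
}

int main(void)
{
	printf("%d\n", write_settings(1000000, 2000000));	/* rejected: -1 */
	printf("%d %d\n", period, runtime);			/* unchanged */
	return 0;
}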
8756 8755
8757 #ifdef CONFIG_CGROUP_SCHED 8756 #ifdef CONFIG_CGROUP_SCHED
8758 8757
8759 /* return corresponding task_group object of a cgroup */ 8758 /* return corresponding task_group object of a cgroup */
8760 static inline struct task_group *cgroup_tg(struct cgroup *cgrp) 8759 static inline struct task_group *cgroup_tg(struct cgroup *cgrp)
8761 { 8760 {
8762 return container_of(cgroup_subsys_state(cgrp, cpu_cgroup_subsys_id), 8761 return container_of(cgroup_subsys_state(cgrp, cpu_cgroup_subsys_id),
8763 struct task_group, css); 8762 struct task_group, css);
8764 } 8763 }
8765 8764
8766 static struct cgroup_subsys_state * 8765 static struct cgroup_subsys_state *
8767 cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp) 8766 cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp)
8768 { 8767 {
8769 struct task_group *tg, *parent; 8768 struct task_group *tg, *parent;
8770 8769
8771 if (!cgrp->parent) { 8770 if (!cgrp->parent) {
8772 /* This is early initialization for the top cgroup */ 8771 /* This is early initialization for the top cgroup */
8773 return &root_task_group.css; 8772 return &root_task_group.css;
8774 } 8773 }
8775 8774
8776 parent = cgroup_tg(cgrp->parent); 8775 parent = cgroup_tg(cgrp->parent);
8777 tg = sched_create_group(parent); 8776 tg = sched_create_group(parent);
8778 if (IS_ERR(tg)) 8777 if (IS_ERR(tg))
8779 return ERR_PTR(-ENOMEM); 8778 return ERR_PTR(-ENOMEM);
8780 8779
8781 return &tg->css; 8780 return &tg->css;
8782 } 8781 }
8783 8782
8784 static void 8783 static void
8785 cpu_cgroup_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp) 8784 cpu_cgroup_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
8786 { 8785 {
8787 struct task_group *tg = cgroup_tg(cgrp); 8786 struct task_group *tg = cgroup_tg(cgrp);
8788 8787
8789 sched_destroy_group(tg); 8788 sched_destroy_group(tg);
8790 } 8789 }
8791 8790
8792 static int 8791 static int
8793 cpu_cgroup_can_attach_task(struct cgroup *cgrp, struct task_struct *tsk) 8792 cpu_cgroup_can_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
8794 { 8793 {
8795 #ifdef CONFIG_RT_GROUP_SCHED 8794 #ifdef CONFIG_RT_GROUP_SCHED
8796 if (!sched_rt_can_attach(cgroup_tg(cgrp), tsk)) 8795 if (!sched_rt_can_attach(cgroup_tg(cgrp), tsk))
8797 return -EINVAL; 8796 return -EINVAL;
8798 #else 8797 #else
8799 /* We don't support RT-tasks being in separate groups */ 8798 /* We don't support RT-tasks being in separate groups */
8800 if (tsk->sched_class != &fair_sched_class) 8799 if (tsk->sched_class != &fair_sched_class)
8801 return -EINVAL; 8800 return -EINVAL;
8802 #endif 8801 #endif
8803 return 0; 8802 return 0;
8804 } 8803 }
8805 8804
8806 static void 8805 static void
8807 cpu_cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) 8806 cpu_cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
8808 { 8807 {
8809 sched_move_task(tsk); 8808 sched_move_task(tsk);
8810 } 8809 }
8811 8810
8812 static void 8811 static void
8813 cpu_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp, 8812 cpu_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp,
8814 struct cgroup *old_cgrp, struct task_struct *task) 8813 struct cgroup *old_cgrp, struct task_struct *task)
8815 { 8814 {
8816 /* 8815 /*
8817 * cgroup_exit() is called in the copy_process() failure path. 8816 * cgroup_exit() is called in the copy_process() failure path.
8818 * Ignore this case since the task hasn't run yet; this avoids 8817 * Ignore this case since the task hasn't run yet; this avoids
8819 * trying to poke a half-freed task state from generic code. 8818 * trying to poke a half-freed task state from generic code.
8820 */ 8819 */
8821 if (!(task->flags & PF_EXITING)) 8820 if (!(task->flags & PF_EXITING))
8822 return; 8821 return;
8823 8822
8824 sched_move_task(task); 8823 sched_move_task(task);
8825 } 8824 }
8826 8825
8827 #ifdef CONFIG_FAIR_GROUP_SCHED 8826 #ifdef CONFIG_FAIR_GROUP_SCHED
8828 static int cpu_shares_write_u64(struct cgroup *cgrp, struct cftype *cftype, 8827 static int cpu_shares_write_u64(struct cgroup *cgrp, struct cftype *cftype,
8829 u64 shareval) 8828 u64 shareval)
8830 { 8829 {
8831 return sched_group_set_shares(cgroup_tg(cgrp), scale_load(shareval)); 8830 return sched_group_set_shares(cgroup_tg(cgrp), scale_load(shareval));
8832 } 8831 }
8833 8832
8834 static u64 cpu_shares_read_u64(struct cgroup *cgrp, struct cftype *cft) 8833 static u64 cpu_shares_read_u64(struct cgroup *cgrp, struct cftype *cft)
8835 { 8834 {
8836 struct task_group *tg = cgroup_tg(cgrp); 8835 struct task_group *tg = cgroup_tg(cgrp);
8837 8836
8838 return (u64) scale_load_down(tg->shares); 8837 return (u64) scale_load_down(tg->shares);
8839 } 8838 }
8840 #endif /* CONFIG_FAIR_GROUP_SCHED */ 8839 #endif /* CONFIG_FAIR_GROUP_SCHED */
8841 8840
8842 #ifdef CONFIG_RT_GROUP_SCHED 8841 #ifdef CONFIG_RT_GROUP_SCHED
8843 static int cpu_rt_runtime_write(struct cgroup *cgrp, struct cftype *cft, 8842 static int cpu_rt_runtime_write(struct cgroup *cgrp, struct cftype *cft,
8844 s64 val) 8843 s64 val)
8845 { 8844 {
8846 return sched_group_set_rt_runtime(cgroup_tg(cgrp), val); 8845 return sched_group_set_rt_runtime(cgroup_tg(cgrp), val);
8847 } 8846 }
8848 8847
8849 static s64 cpu_rt_runtime_read(struct cgroup *cgrp, struct cftype *cft) 8848 static s64 cpu_rt_runtime_read(struct cgroup *cgrp, struct cftype *cft)
8850 { 8849 {
8851 return sched_group_rt_runtime(cgroup_tg(cgrp)); 8850 return sched_group_rt_runtime(cgroup_tg(cgrp));
8852 } 8851 }
8853 8852
8854 static int cpu_rt_period_write_uint(struct cgroup *cgrp, struct cftype *cftype, 8853 static int cpu_rt_period_write_uint(struct cgroup *cgrp, struct cftype *cftype,
8855 u64 rt_period_us) 8854 u64 rt_period_us)
8856 { 8855 {
8857 return sched_group_set_rt_period(cgroup_tg(cgrp), rt_period_us); 8856 return sched_group_set_rt_period(cgroup_tg(cgrp), rt_period_us);
8858 } 8857 }
8859 8858
8860 static u64 cpu_rt_period_read_uint(struct cgroup *cgrp, struct cftype *cft) 8859 static u64 cpu_rt_period_read_uint(struct cgroup *cgrp, struct cftype *cft)
8861 { 8860 {
8862 return sched_group_rt_period(cgroup_tg(cgrp)); 8861 return sched_group_rt_period(cgroup_tg(cgrp));
8863 } 8862 }
8864 #endif /* CONFIG_RT_GROUP_SCHED */ 8863 #endif /* CONFIG_RT_GROUP_SCHED */
8865 8864
8866 static struct cftype cpu_files[] = { 8865 static struct cftype cpu_files[] = {
8867 #ifdef CONFIG_FAIR_GROUP_SCHED 8866 #ifdef CONFIG_FAIR_GROUP_SCHED
8868 { 8867 {
8869 .name = "shares", 8868 .name = "shares",
8870 .read_u64 = cpu_shares_read_u64, 8869 .read_u64 = cpu_shares_read_u64,
8871 .write_u64 = cpu_shares_write_u64, 8870 .write_u64 = cpu_shares_write_u64,
8872 }, 8871 },
8873 #endif 8872 #endif
8874 #ifdef CONFIG_RT_GROUP_SCHED 8873 #ifdef CONFIG_RT_GROUP_SCHED
8875 { 8874 {
8876 .name = "rt_runtime_us", 8875 .name = "rt_runtime_us",
8877 .read_s64 = cpu_rt_runtime_read, 8876 .read_s64 = cpu_rt_runtime_read,
8878 .write_s64 = cpu_rt_runtime_write, 8877 .write_s64 = cpu_rt_runtime_write,
8879 }, 8878 },
8880 { 8879 {
8881 .name = "rt_period_us", 8880 .name = "rt_period_us",
8882 .read_u64 = cpu_rt_period_read_uint, 8881 .read_u64 = cpu_rt_period_read_uint,
8883 .write_u64 = cpu_rt_period_write_uint, 8882 .write_u64 = cpu_rt_period_write_uint,
8884 }, 8883 },
8885 #endif 8884 #endif
8886 }; 8885 };
8887 8886
8888 static int cpu_cgroup_populate(struct cgroup_subsys *ss, struct cgroup *cont) 8887 static int cpu_cgroup_populate(struct cgroup_subsys *ss, struct cgroup *cont)
8889 { 8888 {
8890 return cgroup_add_files(cont, ss, cpu_files, ARRAY_SIZE(cpu_files)); 8889 return cgroup_add_files(cont, ss, cpu_files, ARRAY_SIZE(cpu_files));
8891 } 8890 }
8892 8891
8893 struct cgroup_subsys cpu_cgroup_subsys = { 8892 struct cgroup_subsys cpu_cgroup_subsys = {
8894 .name = "cpu", 8893 .name = "cpu",
8895 .create = cpu_cgroup_create, 8894 .create = cpu_cgroup_create,
8896 .destroy = cpu_cgroup_destroy, 8895 .destroy = cpu_cgroup_destroy,
8897 .can_attach_task = cpu_cgroup_can_attach_task, 8896 .can_attach_task = cpu_cgroup_can_attach_task,
8898 .attach_task = cpu_cgroup_attach_task, 8897 .attach_task = cpu_cgroup_attach_task,
8899 .exit = cpu_cgroup_exit, 8898 .exit = cpu_cgroup_exit,
8900 .populate = cpu_cgroup_populate, 8899 .populate = cpu_cgroup_populate,
8901 .subsys_id = cpu_cgroup_subsys_id, 8900 .subsys_id = cpu_cgroup_subsys_id,
8902 .early_init = 1, 8901 .early_init = 1,
8903 }; 8902 };
8904 8903
8905 #endif /* CONFIG_CGROUP_SCHED */ 8904 #endif /* CONFIG_CGROUP_SCHED */
8906 8905
8907 #ifdef CONFIG_CGROUP_CPUACCT 8906 #ifdef CONFIG_CGROUP_CPUACCT
8908 8907
8909 /* 8908 /*
8910 * CPU accounting code for task groups. 8909 * CPU accounting code for task groups.
8911 * 8910 *
8912 * Based on the work by Paul Menage (menage@google.com) and Balbir Singh 8911 * Based on the work by Paul Menage (menage@google.com) and Balbir Singh
8913 * (balbir@in.ibm.com). 8912 * (balbir@in.ibm.com).
8914 */ 8913 */
8915 8914
8916 /* track cpu usage of a group of tasks and its child groups */ 8915 /* track cpu usage of a group of tasks and its child groups */
8917 struct cpuacct { 8916 struct cpuacct {
8918 struct cgroup_subsys_state css; 8917 struct cgroup_subsys_state css;
8919 /* cpuusage holds a pointer to a u64-type object on every cpu */ 8918 /* cpuusage holds a pointer to a u64-type object on every cpu */
8920 u64 __percpu *cpuusage; 8919 u64 __percpu *cpuusage;
8921 struct percpu_counter cpustat[CPUACCT_STAT_NSTATS]; 8920 struct percpu_counter cpustat[CPUACCT_STAT_NSTATS];
8922 struct cpuacct *parent; 8921 struct cpuacct *parent;
8923 }; 8922 };
8924 8923
8925 struct cgroup_subsys cpuacct_subsys; 8924 struct cgroup_subsys cpuacct_subsys;
8926 8925
8927 /* return cpu accounting group corresponding to this container */ 8926 /* return cpu accounting group corresponding to this container */
8928 static inline struct cpuacct *cgroup_ca(struct cgroup *cgrp) 8927 static inline struct cpuacct *cgroup_ca(struct cgroup *cgrp)
8929 { 8928 {
8930 return container_of(cgroup_subsys_state(cgrp, cpuacct_subsys_id), 8929 return container_of(cgroup_subsys_state(cgrp, cpuacct_subsys_id),
8931 struct cpuacct, css); 8930 struct cpuacct, css);
8932 } 8931 }
8933 8932
8934 /* return cpu accounting group to which this task belongs */ 8933 /* return cpu accounting group to which this task belongs */
8935 static inline struct cpuacct *task_ca(struct task_struct *tsk) 8934 static inline struct cpuacct *task_ca(struct task_struct *tsk)
8936 { 8935 {
8937 return container_of(task_subsys_state(tsk, cpuacct_subsys_id), 8936 return container_of(task_subsys_state(tsk, cpuacct_subsys_id),
8938 struct cpuacct, css); 8937 struct cpuacct, css);
8939 } 8938 }
8940 8939
8941 /* create a new cpu accounting group */ 8940 /* create a new cpu accounting group */
8942 static struct cgroup_subsys_state *cpuacct_create( 8941 static struct cgroup_subsys_state *cpuacct_create(
8943 struct cgroup_subsys *ss, struct cgroup *cgrp) 8942 struct cgroup_subsys *ss, struct cgroup *cgrp)
8944 { 8943 {
8945 struct cpuacct *ca = kzalloc(sizeof(*ca), GFP_KERNEL); 8944 struct cpuacct *ca = kzalloc(sizeof(*ca), GFP_KERNEL);
8946 int i; 8945 int i;
8947 8946
8948 if (!ca) 8947 if (!ca)
8949 goto out; 8948 goto out;
8950 8949
8951 ca->cpuusage = alloc_percpu(u64); 8950 ca->cpuusage = alloc_percpu(u64);
8952 if (!ca->cpuusage) 8951 if (!ca->cpuusage)
8953 goto out_free_ca; 8952 goto out_free_ca;
8954 8953
8955 for (i = 0; i < CPUACCT_STAT_NSTATS; i++) 8954 for (i = 0; i < CPUACCT_STAT_NSTATS; i++)
8956 if (percpu_counter_init(&ca->cpustat[i], 0)) 8955 if (percpu_counter_init(&ca->cpustat[i], 0))
8957 goto out_free_counters; 8956 goto out_free_counters;
8958 8957
8959 if (cgrp->parent) 8958 if (cgrp->parent)
8960 ca->parent = cgroup_ca(cgrp->parent); 8959 ca->parent = cgroup_ca(cgrp->parent);
8961 8960
8962 return &ca->css; 8961 return &ca->css;
8963 8962
8964 out_free_counters: 8963 out_free_counters:
8965 while (--i >= 0) 8964 while (--i >= 0)
8966 percpu_counter_destroy(&ca->cpustat[i]); 8965 percpu_counter_destroy(&ca->cpustat[i]);
8967 free_percpu(ca->cpuusage); 8966 free_percpu(ca->cpuusage);
8968 out_free_ca: 8967 out_free_ca:
8969 kfree(ca); 8968 kfree(ca);
8970 out: 8969 out:
8971 return ERR_PTR(-ENOMEM); 8970 return ERR_PTR(-ENOMEM);
8972 } 8971 }
8973 8972
8974 /* destroy an existing cpu accounting group */ 8973 /* destroy an existing cpu accounting group */
8975 static void 8974 static void
8976 cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp) 8975 cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
8977 { 8976 {
8978 struct cpuacct *ca = cgroup_ca(cgrp); 8977 struct cpuacct *ca = cgroup_ca(cgrp);
8979 int i; 8978 int i;
8980 8979
8981 for (i = 0; i < CPUACCT_STAT_NSTATS; i++) 8980 for (i = 0; i < CPUACCT_STAT_NSTATS; i++)
8982 percpu_counter_destroy(&ca->cpustat[i]); 8981 percpu_counter_destroy(&ca->cpustat[i]);
8983 free_percpu(ca->cpuusage); 8982 free_percpu(ca->cpuusage);
8984 kfree(ca); 8983 kfree(ca);
8985 } 8984 }
8986 8985
8987 static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu) 8986 static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu)
8988 { 8987 {
8989 u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); 8988 u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
8990 u64 data; 8989 u64 data;
8991 8990
8992 #ifndef CONFIG_64BIT 8991 #ifndef CONFIG_64BIT
8993 /* 8992 /*
8994 * Take rq->lock to make 64-bit read safe on 32-bit platforms. 8993 * Take rq->lock to make 64-bit read safe on 32-bit platforms.
8995 */ 8994 */
8996 raw_spin_lock_irq(&cpu_rq(cpu)->lock); 8995 raw_spin_lock_irq(&cpu_rq(cpu)->lock);
8997 data = *cpuusage; 8996 data = *cpuusage;
8998 raw_spin_unlock_irq(&cpu_rq(cpu)->lock); 8997 raw_spin_unlock_irq(&cpu_rq(cpu)->lock);
8999 #else 8998 #else
9000 data = *cpuusage; 8999 data = *cpuusage;
9001 #endif 9000 #endif
9002 9001
9003 return data; 9002 return data;
9004 } 9003 }
9005 9004
9006 static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val) 9005 static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val)
9007 { 9006 {
9008 u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); 9007 u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
9009 9008
9010 #ifndef CONFIG_64BIT 9009 #ifndef CONFIG_64BIT
9011 /* 9010 /*
9012 * Take rq->lock to make 64-bit write safe on 32-bit platforms. 9011 * Take rq->lock to make 64-bit write safe on 32-bit platforms.
9013 */ 9012 */
9014 raw_spin_lock_irq(&cpu_rq(cpu)->lock); 9013 raw_spin_lock_irq(&cpu_rq(cpu)->lock);
9015 *cpuusage = val; 9014 *cpuusage = val;
9016 raw_spin_unlock_irq(&cpu_rq(cpu)->lock); 9015 raw_spin_unlock_irq(&cpu_rq(cpu)->lock);
9017 #else 9016 #else
9018 *cpuusage = val; 9017 *cpuusage = val;
9019 #endif 9018 #endif
9020 } 9019 }
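On 32-bit machines a u64 cannot be loaded or stored in one instruction, which is why the two helpers above take rq->lock around the access. A rough userspace analogue of the same guard, with a pthread mutex standing in for the runqueue lock and an invented struct for the per-cpu slot:

#include <pthread.h>
#include <stdint.h>
#include <stdio.h>

/* Invented stand-in for one per-cpu usage slot plus its lock. */
struct usage_slot {
	pthread_mutex_t lock;
	uint64_t cpuusage;
};

static uint64_t slot_read(struct usage_slot *s)
{
	uint64_t val;

	/* Lock so a concurrent writer cannot leave us with a torn value. */
	pthread_mutex_lock(&s->lock);
	val = s->cpuusage;
	pthread_mutex_unlock(&s->lock);
	return val;
}

static void slot_write(struct usage_slot *s, uint64_t val)
{
	pthread_mutex_lock(&s->lock);
	s->cpuusage = val;
	pthread_mutex_unlock(&s->lock);
}

int main(void)
{
	struct usage_slot s = { PTHREAD_MUTEX_INITIALIZER, 0 };

	slot_write(&s, 123456789ULL);
	printf("%llu\n", (unsigned long long)slot_read(&s));
	return 0;
}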
9021 9020
9022 /* return total cpu usage (in nanoseconds) of a group */ 9021 /* return total cpu usage (in nanoseconds) of a group */
9023 static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft) 9022 static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft)
9024 { 9023 {
9025 struct cpuacct *ca = cgroup_ca(cgrp); 9024 struct cpuacct *ca = cgroup_ca(cgrp);
9026 u64 totalcpuusage = 0; 9025 u64 totalcpuusage = 0;
9027 int i; 9026 int i;
9028 9027
9029 for_each_present_cpu(i) 9028 for_each_present_cpu(i)
9030 totalcpuusage += cpuacct_cpuusage_read(ca, i); 9029 totalcpuusage += cpuacct_cpuusage_read(ca, i);
9031 9030
9032 return totalcpuusage; 9031 return totalcpuusage;
9033 } 9032 }
9034 9033
9035 static int cpuusage_write(struct cgroup *cgrp, struct cftype *cftype, 9034 static int cpuusage_write(struct cgroup *cgrp, struct cftype *cftype,
9036 u64 reset) 9035 u64 reset)
9037 { 9036 {
9038 struct cpuacct *ca = cgroup_ca(cgrp); 9037 struct cpuacct *ca = cgroup_ca(cgrp);
9039 int err = 0; 9038 int err = 0;
9040 int i; 9039 int i;
9041 9040
9042 if (reset) { 9041 if (reset) {
9043 err = -EINVAL; 9042 err = -EINVAL;
9044 goto out; 9043 goto out;
9045 } 9044 }
9046 9045
9047 for_each_present_cpu(i) 9046 for_each_present_cpu(i)
9048 cpuacct_cpuusage_write(ca, i, 0); 9047 cpuacct_cpuusage_write(ca, i, 0);
9049 9048
9050 out: 9049 out:
9051 return err; 9050 return err;
9052 } 9051 }
9053 9052
9054 static int cpuacct_percpu_seq_read(struct cgroup *cgroup, struct cftype *cft, 9053 static int cpuacct_percpu_seq_read(struct cgroup *cgroup, struct cftype *cft,
9055 struct seq_file *m) 9054 struct seq_file *m)
9056 { 9055 {
9057 struct cpuacct *ca = cgroup_ca(cgroup); 9056 struct cpuacct *ca = cgroup_ca(cgroup);
9058 u64 percpu; 9057 u64 percpu;
9059 int i; 9058 int i;
9060 9059
9061 for_each_present_cpu(i) { 9060 for_each_present_cpu(i) {
9062 percpu = cpuacct_cpuusage_read(ca, i); 9061 percpu = cpuacct_cpuusage_read(ca, i);
9063 seq_printf(m, "%llu ", (unsigned long long) percpu); 9062 seq_printf(m, "%llu ", (unsigned long long) percpu);
9064 } 9063 }
9065 seq_printf(m, "\n"); 9064 seq_printf(m, "\n");
9066 return 0; 9065 return 0;
9067 } 9066 }
9068 9067
9069 static const char *cpuacct_stat_desc[] = { 9068 static const char *cpuacct_stat_desc[] = {
9070 [CPUACCT_STAT_USER] = "user", 9069 [CPUACCT_STAT_USER] = "user",
9071 [CPUACCT_STAT_SYSTEM] = "system", 9070 [CPUACCT_STAT_SYSTEM] = "system",
9072 }; 9071 };
9073 9072
9074 static int cpuacct_stats_show(struct cgroup *cgrp, struct cftype *cft, 9073 static int cpuacct_stats_show(struct cgroup *cgrp, struct cftype *cft,
9075 struct cgroup_map_cb *cb) 9074 struct cgroup_map_cb *cb)
9076 { 9075 {
9077 struct cpuacct *ca = cgroup_ca(cgrp); 9076 struct cpuacct *ca = cgroup_ca(cgrp);
9078 int i; 9077 int i;
9079 9078
9080 for (i = 0; i < CPUACCT_STAT_NSTATS; i++) { 9079 for (i = 0; i < CPUACCT_STAT_NSTATS; i++) {
9081 s64 val = percpu_counter_read(&ca->cpustat[i]); 9080 s64 val = percpu_counter_read(&ca->cpustat[i]);
9082 val = cputime64_to_clock_t(val); 9081 val = cputime64_to_clock_t(val);
9083 cb->fill(cb, cpuacct_stat_desc[i], val); 9082 cb->fill(cb, cpuacct_stat_desc[i], val);
9084 } 9083 }
9085 return 0; 9084 return 0;
9086 } 9085 }
9087 9086
9088 static struct cftype files[] = { 9087 static struct cftype files[] = {
9089 { 9088 {
9090 .name = "usage", 9089 .name = "usage",
9091 .read_u64 = cpuusage_read, 9090 .read_u64 = cpuusage_read,
9092 .write_u64 = cpuusage_write, 9091 .write_u64 = cpuusage_write,
9093 }, 9092 },
9094 { 9093 {
9095 .name = "usage_percpu", 9094 .name = "usage_percpu",
9096 .read_seq_string = cpuacct_percpu_seq_read, 9095 .read_seq_string = cpuacct_percpu_seq_read,
9097 }, 9096 },
9098 { 9097 {
9099 .name = "stat", 9098 .name = "stat",
9100 .read_map = cpuacct_stats_show, 9099 .read_map = cpuacct_stats_show,
9101 }, 9100 },
9102 }; 9101 };
9103 9102
9104 static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp) 9103 static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp)
9105 { 9104 {
9106 return cgroup_add_files(cgrp, ss, files, ARRAY_SIZE(files)); 9105 return cgroup_add_files(cgrp, ss, files, ARRAY_SIZE(files));
9107 } 9106 }
9108 9107
9109 /* 9108 /*
9110 * charge this task's execution time to its accounting group. 9109 * charge this task's execution time to its accounting group.
9111 * 9110 *
9112 * called with rq->lock held. 9111 * called with rq->lock held.
9113 */ 9112 */
9114 static void cpuacct_charge(struct task_struct *tsk, u64 cputime) 9113 static void cpuacct_charge(struct task_struct *tsk, u64 cputime)
9115 { 9114 {
9116 struct cpuacct *ca; 9115 struct cpuacct *ca;
9117 int cpu; 9116 int cpu;
9118 9117
9119 if (unlikely(!cpuacct_subsys.active)) 9118 if (unlikely(!cpuacct_subsys.active))
9120 return; 9119 return;
9121 9120
9122 cpu = task_cpu(tsk); 9121 cpu = task_cpu(tsk);
9123 9122
9124 rcu_read_lock(); 9123 rcu_read_lock();
9125 9124
9126 ca = task_ca(tsk); 9125 ca = task_ca(tsk);
9127 9126
9128 for (; ca; ca = ca->parent) { 9127 for (; ca; ca = ca->parent) {
9129 u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); 9128 u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
9130 *cpuusage += cputime; 9129 *cpuusage += cputime;
9131 } 9130 }
9132 9131
9133 rcu_read_unlock(); 9132 rcu_read_unlock();
9134 } 9133 }
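
cpuacct_charge() above walks ca->parent so the cputime lands in the task's group and in every ancestor; usage reported for a parent cgroup therefore includes its children. A small, self-contained illustration of that hierarchical walk (the struct and values below are hypothetical, not the kernel's types):

#include <stdio.h>

struct acct {
        const char *name;
        unsigned long long usage;
        struct acct *parent;
};

static void charge(struct acct *ca, unsigned long long cputime)
{
        for (; ca; ca = ca->parent)     /* walk up to the root, charging each level */
                ca->usage += cputime;
}

int main(void)
{
        struct acct root  = { "root",  0, NULL  };
        struct acct child = { "child", 0, &root };

        charge(&child, 1000);           /* hypothetical cputime units */
        printf("%s=%llu %s=%llu\n", child.name, child.usage,
               root.name, root.usage);
        return 0;
}
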
9135 9134
9136 /* 9135 /*
9137 * When CONFIG_VIRT_CPU_ACCOUNTING is enabled one jiffy can be very large 9136 * When CONFIG_VIRT_CPU_ACCOUNTING is enabled one jiffy can be very large
9138 * in cputime_t units. As a result, cpuacct_update_stats calls 9137 * in cputime_t units. As a result, cpuacct_update_stats calls
9139 * percpu_counter_add with values large enough to always overflow the 9138 * percpu_counter_add with values large enough to always overflow the
9140 * per cpu batch limit causing bad SMP scalability. 9139 * per cpu batch limit causing bad SMP scalability.
9141 * 9140 *
9142 * To fix this we scale percpu_counter_batch by cputime_one_jiffy so we 9141 * To fix this we scale percpu_counter_batch by cputime_one_jiffy so we
9143 * batch the same amount of time with CONFIG_VIRT_CPU_ACCOUNTING disabled 9142 * batch the same amount of time with CONFIG_VIRT_CPU_ACCOUNTING disabled
9144 * and enabled. We cap it at INT_MAX which is the largest allowed batch value. 9143 * and enabled. We cap it at INT_MAX which is the largest allowed batch value.
9145 */ 9144 */
9146 #ifdef CONFIG_SMP 9145 #ifdef CONFIG_SMP
9147 #define CPUACCT_BATCH \ 9146 #define CPUACCT_BATCH \
9148 min_t(long, percpu_counter_batch * cputime_one_jiffy, INT_MAX) 9147 min_t(long, percpu_counter_batch * cputime_one_jiffy, INT_MAX)
9149 #else 9148 #else
9150 #define CPUACCT_BATCH 0 9149 #define CPUACCT_BATCH 0
9151 #endif 9150 #endif
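
The comment above argues that the batch must scale with cputime_one_jiffy and still stay below INT_MAX. A minimal sketch of that capping arithmetic, using hypothetical input values rather than the kernel's real percpu_counter_batch and cputime_one_jiffy:

#include <limits.h>
#include <stdio.h>

/* Hypothetical inputs for illustration only; the real values depend on the
 * kernel configuration (HZ, CONFIG_VIRT_CPU_ACCOUNTING, number of CPUs). */
static int capped_batch(long long percpu_counter_batch, long long cputime_one_jiffy)
{
        long long scaled = percpu_counter_batch * cputime_one_jiffy;

        /* Same idea as min_t(long, ..., INT_MAX): never exceed the largest
         * batch value the percpu counter API accepts. */
        return scaled < INT_MAX ? (int)scaled : INT_MAX;
}

int main(void)
{
        printf("small jiffy: batch = %d\n", capped_batch(32, 10000));
        printf("huge jiffy:  batch = %d\n", capped_batch(32, 1000000000));
        return 0;
}
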
9152 9151
9153 /* 9152 /*
9154 * Charge the system/user time to the task's accounting group. 9153 * Charge the system/user time to the task's accounting group.
9155 */ 9154 */
9156 static void cpuacct_update_stats(struct task_struct *tsk, 9155 static void cpuacct_update_stats(struct task_struct *tsk,
9157 enum cpuacct_stat_index idx, cputime_t val) 9156 enum cpuacct_stat_index idx, cputime_t val)
9158 { 9157 {
9159 struct cpuacct *ca; 9158 struct cpuacct *ca;
9160 int batch = CPUACCT_BATCH; 9159 int batch = CPUACCT_BATCH;
9161 9160
9162 if (unlikely(!cpuacct_subsys.active)) 9161 if (unlikely(!cpuacct_subsys.active))
9163 return; 9162 return;
9164 9163
9165 rcu_read_lock(); 9164 rcu_read_lock();
9166 ca = task_ca(tsk); 9165 ca = task_ca(tsk);
9167 9166
9168 do { 9167 do {
9169 __percpu_counter_add(&ca->cpustat[idx], val, batch); 9168 __percpu_counter_add(&ca->cpustat[idx], val, batch);
9170 ca = ca->parent; 9169 ca = ca->parent;
9171 } while (ca); 9170 } while (ca);
9172 rcu_read_unlock(); 9171 rcu_read_unlock();
9173 } 9172 }
9174 9173
9175 struct cgroup_subsys cpuacct_subsys = { 9174 struct cgroup_subsys cpuacct_subsys = {
9176 .name = "cpuacct", 9175 .name = "cpuacct",
9177 .create = cpuacct_create, 9176 .create = cpuacct_create,
9178 .destroy = cpuacct_destroy, 9177 .destroy = cpuacct_destroy,
9179 .populate = cpuacct_populate, 9178 .populate = cpuacct_populate,
9180 .subsys_id = cpuacct_subsys_id, 9179 .subsys_id = cpuacct_subsys_id,
9181 }; 9180 };
9182 #endif /* CONFIG_CGROUP_CPUACCT */ 9181 #endif /* CONFIG_CGROUP_CPUACCT */
9183 9182
9184 9183
net/mac80211/sta_info.c
1 /* 1 /*
2 * Copyright 2002-2005, Instant802 Networks, Inc. 2 * Copyright 2002-2005, Instant802 Networks, Inc.
3 * Copyright 2006-2007 Jiri Benc <jbenc@suse.cz> 3 * Copyright 2006-2007 Jiri Benc <jbenc@suse.cz>
4 * 4 *
5 * This program is free software; you can redistribute it and/or modify 5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License version 2 as 6 * it under the terms of the GNU General Public License version 2 as
7 * published by the Free Software Foundation. 7 * published by the Free Software Foundation.
8 */ 8 */
9 9
10 #include <linux/module.h> 10 #include <linux/module.h>
11 #include <linux/init.h> 11 #include <linux/init.h>
12 #include <linux/netdevice.h> 12 #include <linux/netdevice.h>
13 #include <linux/types.h> 13 #include <linux/types.h>
14 #include <linux/slab.h> 14 #include <linux/slab.h>
15 #include <linux/skbuff.h> 15 #include <linux/skbuff.h>
16 #include <linux/if_arp.h> 16 #include <linux/if_arp.h>
17 #include <linux/timer.h> 17 #include <linux/timer.h>
18 #include <linux/rtnetlink.h> 18 #include <linux/rtnetlink.h>
19 19
20 #include <net/mac80211.h> 20 #include <net/mac80211.h>
21 #include "ieee80211_i.h" 21 #include "ieee80211_i.h"
22 #include "driver-ops.h" 22 #include "driver-ops.h"
23 #include "rate.h" 23 #include "rate.h"
24 #include "sta_info.h" 24 #include "sta_info.h"
25 #include "debugfs_sta.h" 25 #include "debugfs_sta.h"
26 #include "mesh.h" 26 #include "mesh.h"
27 27
28 /** 28 /**
29 * DOC: STA information lifetime rules 29 * DOC: STA information lifetime rules
30 * 30 *
31 * STA info structures (&struct sta_info) are managed in a hash table 31 * STA info structures (&struct sta_info) are managed in a hash table
32 * for faster lookup and a list for iteration. They are managed using 32 * for faster lookup and a list for iteration. They are managed using
33 * RCU, i.e. access to the list and hash table is protected by RCU. 33 * RCU, i.e. access to the list and hash table is protected by RCU.
34 * 34 *
35 * Upon allocating a STA info structure with sta_info_alloc(), the caller 35 * Upon allocating a STA info structure with sta_info_alloc(), the caller
36 * owns that structure. It must then insert it into the hash table using 36 * owns that structure. It must then insert it into the hash table using
37 * either sta_info_insert() or sta_info_insert_rcu(); only in the latter 37 * either sta_info_insert() or sta_info_insert_rcu(); only in the latter
38 * case (which acquires an rcu read section but must not be called from 38 * case (which acquires an rcu read section but must not be called from
39 * within one) will the pointer still be valid after the call. Note that 39 * within one) will the pointer still be valid after the call. Note that
40 * the caller may not do much with the STA info before inserting it, in 40 * the caller may not do much with the STA info before inserting it, in
41 * particular, it may not start any mesh peer link management or add 41 * particular, it may not start any mesh peer link management or add
42 * encryption keys. 42 * encryption keys.
43 * 43 *
44 * When the insertion fails (sta_info_insert() returns non-zero), the 44 * When the insertion fails (sta_info_insert() returns non-zero), the
45 * structure will have been freed by sta_info_insert()! 45 * structure will have been freed by sta_info_insert()!
46 * 46 *
47 * Station entries are added by mac80211 when you establish a link with a 47 * Station entries are added by mac80211 when you establish a link with a
48 * peer. This means different things for the different type of interfaces 48 * peer. This means different things for the different type of interfaces
49 * we support. For a regular station this means we add the AP sta when we 49 * we support. For a regular station this means we add the AP sta when we
50 * receive an association response from the AP. For IBSS this occurs when 50 * receive an association response from the AP. For IBSS this occurs when
51 * we get to know about a peer on the same IBSS. For WDS we add the sta for 51 * we get to know about a peer on the same IBSS. For WDS we add the sta for
52 * the peer immediately upon device open. When using AP mode we add each 52 * the peer immediately upon device open. When using AP mode we add each
53 * station upon request from userspace through nl80211. 53 * station upon request from userspace through nl80211.
54 * 54 *
55 * In order to remove a STA info structure, various sta_info_destroy_*() 55 * In order to remove a STA info structure, various sta_info_destroy_*()
56 * calls are available. 56 * calls are available.
57 * 57 *
58 * There is no concept of ownership on a STA entry, each structure is 58 * There is no concept of ownership on a STA entry, each structure is
59 * owned by the global hash table/list until it is removed. All users of 59 * owned by the global hash table/list until it is removed. All users of
60 * the structure need to be RCU protected so that the structure won't be 60 * the structure need to be RCU protected so that the structure won't be
61 * freed before they are done using it. 61 * freed before they are done using it.
62 */ 62 */
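
A minimal sketch of the alloc/insert/lookup flow described by the lifetime rules above, assuming mac80211 context, with error handling trimmed and sdata/addr supplied by the caller:

/* Sketch only: mac80211 context assumed, error handling trimmed. */
struct sta_info *sta;
int err;

sta = sta_info_alloc(sdata, addr, GFP_KERNEL);
if (!sta)
        return -ENOMEM;

/* On error the structure has already been freed by sta_info_insert(). */
err = sta_info_insert(sta);
if (err)
        return err;

/* Readers look the entry up again under RCU; it stays valid only while
 * the read-side critical section is held. */
rcu_read_lock();
sta = sta_info_get(sdata, addr);
rcu_read_unlock();
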
63 63
64 /* Caller must hold local->sta_lock */ 64 /* Caller must hold local->sta_lock */
65 static int sta_info_hash_del(struct ieee80211_local *local, 65 static int sta_info_hash_del(struct ieee80211_local *local,
66 struct sta_info *sta) 66 struct sta_info *sta)
67 { 67 {
68 struct sta_info *s; 68 struct sta_info *s;
69 69
70 s = rcu_dereference_protected(local->sta_hash[STA_HASH(sta->sta.addr)], 70 s = rcu_dereference_protected(local->sta_hash[STA_HASH(sta->sta.addr)],
71 lockdep_is_held(&local->sta_lock)); 71 lockdep_is_held(&local->sta_lock));
72 if (!s) 72 if (!s)
73 return -ENOENT; 73 return -ENOENT;
74 if (s == sta) { 74 if (s == sta) {
75 rcu_assign_pointer(local->sta_hash[STA_HASH(sta->sta.addr)], 75 rcu_assign_pointer(local->sta_hash[STA_HASH(sta->sta.addr)],
76 s->hnext); 76 s->hnext);
77 return 0; 77 return 0;
78 } 78 }
79 79
80 while (rcu_access_pointer(s->hnext) && 80 while (rcu_access_pointer(s->hnext) &&
81 rcu_access_pointer(s->hnext) != sta) 81 rcu_access_pointer(s->hnext) != sta)
82 s = rcu_dereference_protected(s->hnext, 82 s = rcu_dereference_protected(s->hnext,
83 lockdep_is_held(&local->sta_lock)); 83 lockdep_is_held(&local->sta_lock));
84 if (rcu_access_pointer(s->hnext)) { 84 if (rcu_access_pointer(s->hnext)) {
85 rcu_assign_pointer(s->hnext, sta->hnext); 85 rcu_assign_pointer(s->hnext, sta->hnext);
86 return 0; 86 return 0;
87 } 87 }
88 88
89 return -ENOENT; 89 return -ENOENT;
90 } 90 }
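
sta_info_hash_del() runs entirely on the update side: it dereferences with rcu_dereference_protected() because sta_lock is held, peeks at pointer values with rcu_access_pointer(), and republishes with rcu_assign_pointer(). A hedged, generic sketch of that split (kernel context assumed; head, victim, p and my_lock are hypothetical names):

/* Sketch only (kernel context assumed). "head" is a hypothetical __rcu
 * pointer, "victim" the entry being unlinked, "my_lock" its update lock. */
spin_lock(&my_lock);

/* The update-side lock is held, so no RCU read-side section is needed. */
p = rcu_dereference_protected(head, lockdep_is_held(&my_lock));

/* Only the pointer value is compared here, hence rcu_access_pointer(). */
if (rcu_access_pointer(head) == victim)
        rcu_assign_pointer(head, p->next);

spin_unlock(&my_lock);
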
91 91
92 /* protected by RCU */ 92 /* protected by RCU */
93 struct sta_info *sta_info_get(struct ieee80211_sub_if_data *sdata, 93 struct sta_info *sta_info_get(struct ieee80211_sub_if_data *sdata,
94 const u8 *addr) 94 const u8 *addr)
95 { 95 {
96 struct ieee80211_local *local = sdata->local; 96 struct ieee80211_local *local = sdata->local;
97 struct sta_info *sta; 97 struct sta_info *sta;
98 98
99 sta = rcu_dereference_check(local->sta_hash[STA_HASH(addr)], 99 sta = rcu_dereference_check(local->sta_hash[STA_HASH(addr)],
100 rcu_read_lock_held() ||
101 lockdep_is_held(&local->sta_lock) || 100 lockdep_is_held(&local->sta_lock) ||
102 lockdep_is_held(&local->sta_mtx)); 101 lockdep_is_held(&local->sta_mtx));
103 while (sta) { 102 while (sta) {
104 if (sta->sdata == sdata && 103 if (sta->sdata == sdata &&
105 memcmp(sta->sta.addr, addr, ETH_ALEN) == 0) 104 memcmp(sta->sta.addr, addr, ETH_ALEN) == 0)
106 break; 105 break;
107 sta = rcu_dereference_check(sta->hnext, 106 sta = rcu_dereference_check(sta->hnext,
108 rcu_read_lock_held() ||
109 lockdep_is_held(&local->sta_lock) || 107 lockdep_is_held(&local->sta_lock) ||
110 lockdep_is_held(&local->sta_mtx)); 108 lockdep_is_held(&local->sta_mtx));
111 } 109 }
112 return sta; 110 return sta;
113 } 111 }
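
sta_info_get() above passes the lockdep lock checks as the condition to rcu_dereference_check(). A minimal sketch of that calling convention (kernel context assumed; my_table, my_lock, idx and struct item are hypothetical stand-ins):

/* Sketch only: names are hypothetical, kernel context assumed. */
struct item *it;

rcu_read_lock();
it = rcu_dereference_check(my_table[idx],
                           lockdep_is_held(&my_lock));
if (it)
        pr_debug("found %p\n", it);
rcu_read_unlock();
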
114 112
115 /* 113 /*
116 * Get sta info either from the specified interface 114 * Get sta info either from the specified interface
117 * or from one of its vlans 115 * or from one of its vlans
118 */ 116 */
119 struct sta_info *sta_info_get_bss(struct ieee80211_sub_if_data *sdata, 117 struct sta_info *sta_info_get_bss(struct ieee80211_sub_if_data *sdata,
120 const u8 *addr) 118 const u8 *addr)
121 { 119 {
122 struct ieee80211_local *local = sdata->local; 120 struct ieee80211_local *local = sdata->local;
123 struct sta_info *sta; 121 struct sta_info *sta;
124 122
125 sta = rcu_dereference_check(local->sta_hash[STA_HASH(addr)], 123 sta = rcu_dereference_check(local->sta_hash[STA_HASH(addr)],
126 rcu_read_lock_held() ||
127 lockdep_is_held(&local->sta_lock) || 124 lockdep_is_held(&local->sta_lock) ||
128 lockdep_is_held(&local->sta_mtx)); 125 lockdep_is_held(&local->sta_mtx));
129 while (sta) { 126 while (sta) {
130 if ((sta->sdata == sdata || 127 if ((sta->sdata == sdata ||
131 (sta->sdata->bss && sta->sdata->bss == sdata->bss)) && 128 (sta->sdata->bss && sta->sdata->bss == sdata->bss)) &&
132 memcmp(sta->sta.addr, addr, ETH_ALEN) == 0) 129 memcmp(sta->sta.addr, addr, ETH_ALEN) == 0)
133 break; 130 break;
134 sta = rcu_dereference_check(sta->hnext, 131 sta = rcu_dereference_check(sta->hnext,
135 rcu_read_lock_held() ||
136 lockdep_is_held(&local->sta_lock) || 132 lockdep_is_held(&local->sta_lock) ||
137 lockdep_is_held(&local->sta_mtx)); 133 lockdep_is_held(&local->sta_mtx));
138 } 134 }
139 return sta; 135 return sta;
140 } 136 }
141 137
142 struct sta_info *sta_info_get_by_idx(struct ieee80211_sub_if_data *sdata, 138 struct sta_info *sta_info_get_by_idx(struct ieee80211_sub_if_data *sdata,
143 int idx) 139 int idx)
144 { 140 {
145 struct ieee80211_local *local = sdata->local; 141 struct ieee80211_local *local = sdata->local;
146 struct sta_info *sta; 142 struct sta_info *sta;
147 int i = 0; 143 int i = 0;
148 144
149 list_for_each_entry_rcu(sta, &local->sta_list, list) { 145 list_for_each_entry_rcu(sta, &local->sta_list, list) {
150 if (sdata != sta->sdata) 146 if (sdata != sta->sdata)
151 continue; 147 continue;
152 if (i < idx) { 148 if (i < idx) {
153 ++i; 149 ++i;
154 continue; 150 continue;
155 } 151 }
156 return sta; 152 return sta;
157 } 153 }
158 154
159 return NULL; 155 return NULL;
160 } 156 }
161 157
162 /** 158 /**
163 * __sta_info_free - internal STA free helper 159 * __sta_info_free - internal STA free helper
164 * 160 *
165 * @local: pointer to the global information 161 * @local: pointer to the global information
166 * @sta: STA info to free 162 * @sta: STA info to free
167 * 163 *
168 * This function must undo everything done by sta_info_alloc() 164 * This function must undo everything done by sta_info_alloc()
169 * that may happen before sta_info_insert(). 165 * that may happen before sta_info_insert().
170 */ 166 */
171 static void __sta_info_free(struct ieee80211_local *local, 167 static void __sta_info_free(struct ieee80211_local *local,
172 struct sta_info *sta) 168 struct sta_info *sta)
173 { 169 {
174 if (sta->rate_ctrl) { 170 if (sta->rate_ctrl) {
175 rate_control_free_sta(sta); 171 rate_control_free_sta(sta);
176 rate_control_put(sta->rate_ctrl); 172 rate_control_put(sta->rate_ctrl);
177 } 173 }
178 174
179 #ifdef CONFIG_MAC80211_VERBOSE_DEBUG 175 #ifdef CONFIG_MAC80211_VERBOSE_DEBUG
180 wiphy_debug(local->hw.wiphy, "Destroyed STA %pM\n", sta->sta.addr); 176 wiphy_debug(local->hw.wiphy, "Destroyed STA %pM\n", sta->sta.addr);
181 #endif /* CONFIG_MAC80211_VERBOSE_DEBUG */ 177 #endif /* CONFIG_MAC80211_VERBOSE_DEBUG */
182 178
183 kfree(sta); 179 kfree(sta);
184 } 180 }
185 181
186 /* Caller must hold local->sta_lock */ 182 /* Caller must hold local->sta_lock */
187 static void sta_info_hash_add(struct ieee80211_local *local, 183 static void sta_info_hash_add(struct ieee80211_local *local,
188 struct sta_info *sta) 184 struct sta_info *sta)
189 { 185 {
190 sta->hnext = local->sta_hash[STA_HASH(sta->sta.addr)]; 186 sta->hnext = local->sta_hash[STA_HASH(sta->sta.addr)];
191 rcu_assign_pointer(local->sta_hash[STA_HASH(sta->sta.addr)], sta); 187 rcu_assign_pointer(local->sta_hash[STA_HASH(sta->sta.addr)], sta);
192 } 188 }
193 189
194 static void sta_unblock(struct work_struct *wk) 190 static void sta_unblock(struct work_struct *wk)
195 { 191 {
196 struct sta_info *sta; 192 struct sta_info *sta;
197 193
198 sta = container_of(wk, struct sta_info, drv_unblock_wk); 194 sta = container_of(wk, struct sta_info, drv_unblock_wk);
199 195
200 if (sta->dead) 196 if (sta->dead)
201 return; 197 return;
202 198
203 if (!test_sta_flags(sta, WLAN_STA_PS_STA)) 199 if (!test_sta_flags(sta, WLAN_STA_PS_STA))
204 ieee80211_sta_ps_deliver_wakeup(sta); 200 ieee80211_sta_ps_deliver_wakeup(sta);
205 else if (test_and_clear_sta_flags(sta, WLAN_STA_PSPOLL)) { 201 else if (test_and_clear_sta_flags(sta, WLAN_STA_PSPOLL)) {
206 clear_sta_flags(sta, WLAN_STA_PS_DRIVER); 202 clear_sta_flags(sta, WLAN_STA_PS_DRIVER);
207 ieee80211_sta_ps_deliver_poll_response(sta); 203 ieee80211_sta_ps_deliver_poll_response(sta);
208 } else 204 } else
209 clear_sta_flags(sta, WLAN_STA_PS_DRIVER); 205 clear_sta_flags(sta, WLAN_STA_PS_DRIVER);
210 } 206 }
211 207
212 static int sta_prepare_rate_control(struct ieee80211_local *local, 208 static int sta_prepare_rate_control(struct ieee80211_local *local,
213 struct sta_info *sta, gfp_t gfp) 209 struct sta_info *sta, gfp_t gfp)
214 { 210 {
215 if (local->hw.flags & IEEE80211_HW_HAS_RATE_CONTROL) 211 if (local->hw.flags & IEEE80211_HW_HAS_RATE_CONTROL)
216 return 0; 212 return 0;
217 213
218 sta->rate_ctrl = rate_control_get(local->rate_ctrl); 214 sta->rate_ctrl = rate_control_get(local->rate_ctrl);
219 sta->rate_ctrl_priv = rate_control_alloc_sta(sta->rate_ctrl, 215 sta->rate_ctrl_priv = rate_control_alloc_sta(sta->rate_ctrl,
220 &sta->sta, gfp); 216 &sta->sta, gfp);
221 if (!sta->rate_ctrl_priv) { 217 if (!sta->rate_ctrl_priv) {
222 rate_control_put(sta->rate_ctrl); 218 rate_control_put(sta->rate_ctrl);
223 return -ENOMEM; 219 return -ENOMEM;
224 } 220 }
225 221
226 return 0; 222 return 0;
227 } 223 }
228 224
229 struct sta_info *sta_info_alloc(struct ieee80211_sub_if_data *sdata, 225 struct sta_info *sta_info_alloc(struct ieee80211_sub_if_data *sdata,
230 u8 *addr, gfp_t gfp) 226 u8 *addr, gfp_t gfp)
231 { 227 {
232 struct ieee80211_local *local = sdata->local; 228 struct ieee80211_local *local = sdata->local;
233 struct sta_info *sta; 229 struct sta_info *sta;
234 struct timespec uptime; 230 struct timespec uptime;
235 int i; 231 int i;
236 232
237 sta = kzalloc(sizeof(*sta) + local->hw.sta_data_size, gfp); 233 sta = kzalloc(sizeof(*sta) + local->hw.sta_data_size, gfp);
238 if (!sta) 234 if (!sta)
239 return NULL; 235 return NULL;
240 236
241 spin_lock_init(&sta->lock); 237 spin_lock_init(&sta->lock);
242 spin_lock_init(&sta->flaglock); 238 spin_lock_init(&sta->flaglock);
243 INIT_WORK(&sta->drv_unblock_wk, sta_unblock); 239 INIT_WORK(&sta->drv_unblock_wk, sta_unblock);
244 INIT_WORK(&sta->ampdu_mlme.work, ieee80211_ba_session_work); 240 INIT_WORK(&sta->ampdu_mlme.work, ieee80211_ba_session_work);
245 mutex_init(&sta->ampdu_mlme.mtx); 241 mutex_init(&sta->ampdu_mlme.mtx);
246 242
247 memcpy(sta->sta.addr, addr, ETH_ALEN); 243 memcpy(sta->sta.addr, addr, ETH_ALEN);
248 sta->local = local; 244 sta->local = local;
249 sta->sdata = sdata; 245 sta->sdata = sdata;
250 sta->last_rx = jiffies; 246 sta->last_rx = jiffies;
251 247
252 do_posix_clock_monotonic_gettime(&uptime); 248 do_posix_clock_monotonic_gettime(&uptime);
253 sta->last_connected = uptime.tv_sec; 249 sta->last_connected = uptime.tv_sec;
254 ewma_init(&sta->avg_signal, 1024, 8); 250 ewma_init(&sta->avg_signal, 1024, 8);
255 251
256 if (sta_prepare_rate_control(local, sta, gfp)) { 252 if (sta_prepare_rate_control(local, sta, gfp)) {
257 kfree(sta); 253 kfree(sta);
258 return NULL; 254 return NULL;
259 } 255 }
260 256
261 for (i = 0; i < STA_TID_NUM; i++) { 257 for (i = 0; i < STA_TID_NUM; i++) {
262 /* 258 /*
263 * timer_to_tid must be initialized with identity mapping 259 * timer_to_tid must be initialized with identity mapping
264 * to enable session_timer's data differentiation. See 260 * to enable session_timer's data differentiation. See
265 * sta_rx_agg_session_timer_expired for usage. 261 * sta_rx_agg_session_timer_expired for usage.
266 */ 262 */
267 sta->timer_to_tid[i] = i; 263 sta->timer_to_tid[i] = i;
268 } 264 }
269 skb_queue_head_init(&sta->ps_tx_buf); 265 skb_queue_head_init(&sta->ps_tx_buf);
270 skb_queue_head_init(&sta->tx_filtered); 266 skb_queue_head_init(&sta->tx_filtered);
271 267
272 for (i = 0; i < NUM_RX_DATA_QUEUES; i++) 268 for (i = 0; i < NUM_RX_DATA_QUEUES; i++)
273 sta->last_seq_ctrl[i] = cpu_to_le16(USHRT_MAX); 269 sta->last_seq_ctrl[i] = cpu_to_le16(USHRT_MAX);
274 270
275 #ifdef CONFIG_MAC80211_VERBOSE_DEBUG 271 #ifdef CONFIG_MAC80211_VERBOSE_DEBUG
276 wiphy_debug(local->hw.wiphy, "Allocated STA %pM\n", sta->sta.addr); 272 wiphy_debug(local->hw.wiphy, "Allocated STA %pM\n", sta->sta.addr);
277 #endif /* CONFIG_MAC80211_VERBOSE_DEBUG */ 273 #endif /* CONFIG_MAC80211_VERBOSE_DEBUG */
278 274
279 #ifdef CONFIG_MAC80211_MESH 275 #ifdef CONFIG_MAC80211_MESH
280 sta->plink_state = NL80211_PLINK_LISTEN; 276 sta->plink_state = NL80211_PLINK_LISTEN;
281 init_timer(&sta->plink_timer); 277 init_timer(&sta->plink_timer);
282 #endif 278 #endif
283 279
284 return sta; 280 return sta;
285 } 281 }
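
sta_info_alloc() reserves local->hw.sta_data_size extra bytes behind the struct so the driver's per-station private data shares the allocation. A small, self-contained illustration of that trailing-area idiom (names and sizes below are hypothetical):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct example_sta {
        char addr[6];
        /* driver-private area follows the struct in the same allocation */
        char drv_priv[];
};

int main(void)
{
        size_t drv_size = 32;                      /* hypothetical per-driver size */
        struct example_sta *sta;

        sta = calloc(1, sizeof(*sta) + drv_size);  /* kzalloc-like: one zeroed block */
        if (!sta)
                return 1;

        memset(sta->drv_priv, 0xab, drv_size);     /* driver scribbles here */
        printf("struct %zu + priv %zu bytes in one block\n",
               sizeof(*sta), drv_size);
        free(sta);
        return 0;
}
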
286 282
287 static int sta_info_finish_insert(struct sta_info *sta, bool async) 283 static int sta_info_finish_insert(struct sta_info *sta, bool async)
288 { 284 {
289 struct ieee80211_local *local = sta->local; 285 struct ieee80211_local *local = sta->local;
290 struct ieee80211_sub_if_data *sdata = sta->sdata; 286 struct ieee80211_sub_if_data *sdata = sta->sdata;
291 struct station_info sinfo; 287 struct station_info sinfo;
292 unsigned long flags; 288 unsigned long flags;
293 int err = 0; 289 int err = 0;
294 290
295 lockdep_assert_held(&local->sta_mtx); 291 lockdep_assert_held(&local->sta_mtx);
296 292
297 /* notify driver */ 293 /* notify driver */
298 if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN) 294 if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN)
299 sdata = container_of(sdata->bss, 295 sdata = container_of(sdata->bss,
300 struct ieee80211_sub_if_data, 296 struct ieee80211_sub_if_data,
301 u.ap); 297 u.ap);
302 err = drv_sta_add(local, sdata, &sta->sta); 298 err = drv_sta_add(local, sdata, &sta->sta);
303 if (err) { 299 if (err) {
304 if (!async) 300 if (!async)
305 return err; 301 return err;
306 printk(KERN_DEBUG "%s: failed to add IBSS STA %pM to driver (%d)" 302 printk(KERN_DEBUG "%s: failed to add IBSS STA %pM to driver (%d)"
307 " - keeping it anyway.\n", 303 " - keeping it anyway.\n",
308 sdata->name, sta->sta.addr, err); 304 sdata->name, sta->sta.addr, err);
309 } else { 305 } else {
310 sta->uploaded = true; 306 sta->uploaded = true;
311 #ifdef CONFIG_MAC80211_VERBOSE_DEBUG 307 #ifdef CONFIG_MAC80211_VERBOSE_DEBUG
312 if (async) 308 if (async)
313 wiphy_debug(local->hw.wiphy, 309 wiphy_debug(local->hw.wiphy,
314 "Finished adding IBSS STA %pM\n", 310 "Finished adding IBSS STA %pM\n",
315 sta->sta.addr); 311 sta->sta.addr);
316 #endif 312 #endif
317 } 313 }
318 314
319 sdata = sta->sdata; 315 sdata = sta->sdata;
320 316
321 if (!async) { 317 if (!async) {
322 local->num_sta++; 318 local->num_sta++;
323 local->sta_generation++; 319 local->sta_generation++;
324 smp_mb(); 320 smp_mb();
325 321
326 /* make the station visible */ 322 /* make the station visible */
327 spin_lock_irqsave(&local->sta_lock, flags); 323 spin_lock_irqsave(&local->sta_lock, flags);
328 sta_info_hash_add(local, sta); 324 sta_info_hash_add(local, sta);
329 spin_unlock_irqrestore(&local->sta_lock, flags); 325 spin_unlock_irqrestore(&local->sta_lock, flags);
330 } 326 }
331 327
332 list_add(&sta->list, &local->sta_list); 328 list_add(&sta->list, &local->sta_list);
333 329
334 ieee80211_sta_debugfs_add(sta); 330 ieee80211_sta_debugfs_add(sta);
335 rate_control_add_sta_debugfs(sta); 331 rate_control_add_sta_debugfs(sta);
336 332
337 sinfo.filled = 0; 333 sinfo.filled = 0;
338 sinfo.generation = local->sta_generation; 334 sinfo.generation = local->sta_generation;
339 cfg80211_new_sta(sdata->dev, sta->sta.addr, &sinfo, GFP_KERNEL); 335 cfg80211_new_sta(sdata->dev, sta->sta.addr, &sinfo, GFP_KERNEL);
340 336
341 337
342 return 0; 338 return 0;
343 } 339 }
344 340
345 static void sta_info_finish_pending(struct ieee80211_local *local) 341 static void sta_info_finish_pending(struct ieee80211_local *local)
346 { 342 {
347 struct sta_info *sta; 343 struct sta_info *sta;
348 unsigned long flags; 344 unsigned long flags;
349 345
350 spin_lock_irqsave(&local->sta_lock, flags); 346 spin_lock_irqsave(&local->sta_lock, flags);
351 while (!list_empty(&local->sta_pending_list)) { 347 while (!list_empty(&local->sta_pending_list)) {
352 sta = list_first_entry(&local->sta_pending_list, 348 sta = list_first_entry(&local->sta_pending_list,
353 struct sta_info, list); 349 struct sta_info, list);
354 list_del(&sta->list); 350 list_del(&sta->list);
355 spin_unlock_irqrestore(&local->sta_lock, flags); 351 spin_unlock_irqrestore(&local->sta_lock, flags);
356 352
357 sta_info_finish_insert(sta, true); 353 sta_info_finish_insert(sta, true);
358 354
359 spin_lock_irqsave(&local->sta_lock, flags); 355 spin_lock_irqsave(&local->sta_lock, flags);
360 } 356 }
361 spin_unlock_irqrestore(&local->sta_lock, flags); 357 spin_unlock_irqrestore(&local->sta_lock, flags);
362 } 358 }
363 359
364 static void sta_info_finish_work(struct work_struct *work) 360 static void sta_info_finish_work(struct work_struct *work)
365 { 361 {
366 struct ieee80211_local *local = 362 struct ieee80211_local *local =
367 container_of(work, struct ieee80211_local, sta_finish_work); 363 container_of(work, struct ieee80211_local, sta_finish_work);
368 364
369 mutex_lock(&local->sta_mtx); 365 mutex_lock(&local->sta_mtx);
370 sta_info_finish_pending(local); 366 sta_info_finish_pending(local);
371 mutex_unlock(&local->sta_mtx); 367 mutex_unlock(&local->sta_mtx);
372 } 368 }
373 369
374 int sta_info_insert_rcu(struct sta_info *sta) __acquires(RCU) 370 int sta_info_insert_rcu(struct sta_info *sta) __acquires(RCU)
375 { 371 {
376 struct ieee80211_local *local = sta->local; 372 struct ieee80211_local *local = sta->local;
377 struct ieee80211_sub_if_data *sdata = sta->sdata; 373 struct ieee80211_sub_if_data *sdata = sta->sdata;
378 unsigned long flags; 374 unsigned long flags;
379 int err = 0; 375 int err = 0;
380 376
381 /* 377 /*
382 * Can't be a WARN_ON because it can be triggered through a race: 378 * Can't be a WARN_ON because it can be triggered through a race:
383 * something inserts a STA (on one CPU) without holding the RTNL 379 * something inserts a STA (on one CPU) without holding the RTNL
384 * and another CPU turns off the net device. 380 * and another CPU turns off the net device.
385 */ 381 */
386 if (unlikely(!ieee80211_sdata_running(sdata))) { 382 if (unlikely(!ieee80211_sdata_running(sdata))) {
387 err = -ENETDOWN; 383 err = -ENETDOWN;
388 rcu_read_lock(); 384 rcu_read_lock();
389 goto out_free; 385 goto out_free;
390 } 386 }
391 387
392 if (WARN_ON(compare_ether_addr(sta->sta.addr, sdata->vif.addr) == 0 || 388 if (WARN_ON(compare_ether_addr(sta->sta.addr, sdata->vif.addr) == 0 ||
393 is_multicast_ether_addr(sta->sta.addr))) { 389 is_multicast_ether_addr(sta->sta.addr))) {
394 err = -EINVAL; 390 err = -EINVAL;
395 rcu_read_lock(); 391 rcu_read_lock();
396 goto out_free; 392 goto out_free;
397 } 393 }
398 394
399 /* 395 /*
400 * In ad-hoc mode, we sometimes need to insert stations 396 * In ad-hoc mode, we sometimes need to insert stations
401 * from tasklet context from the RX path. To avoid races, 397 * from tasklet context from the RX path. To avoid races,
402 * always do so in that case -- see the comment below. 398 * always do so in that case -- see the comment below.
403 */ 399 */
404 if (sdata->vif.type == NL80211_IFTYPE_ADHOC) { 400 if (sdata->vif.type == NL80211_IFTYPE_ADHOC) {
405 spin_lock_irqsave(&local->sta_lock, flags); 401 spin_lock_irqsave(&local->sta_lock, flags);
406 /* check if STA exists already */ 402 /* check if STA exists already */
407 if (sta_info_get_bss(sdata, sta->sta.addr)) { 403 if (sta_info_get_bss(sdata, sta->sta.addr)) {
408 spin_unlock_irqrestore(&local->sta_lock, flags); 404 spin_unlock_irqrestore(&local->sta_lock, flags);
409 rcu_read_lock(); 405 rcu_read_lock();
410 err = -EEXIST; 406 err = -EEXIST;
411 goto out_free; 407 goto out_free;
412 } 408 }
413 409
414 local->num_sta++; 410 local->num_sta++;
415 local->sta_generation++; 411 local->sta_generation++;
416 smp_mb(); 412 smp_mb();
417 sta_info_hash_add(local, sta); 413 sta_info_hash_add(local, sta);
418 414
419 list_add_tail(&sta->list, &local->sta_pending_list); 415 list_add_tail(&sta->list, &local->sta_pending_list);
420 416
421 rcu_read_lock(); 417 rcu_read_lock();
422 spin_unlock_irqrestore(&local->sta_lock, flags); 418 spin_unlock_irqrestore(&local->sta_lock, flags);
423 419
424 #ifdef CONFIG_MAC80211_VERBOSE_DEBUG 420 #ifdef CONFIG_MAC80211_VERBOSE_DEBUG
425 wiphy_debug(local->hw.wiphy, "Added IBSS STA %pM\n", 421 wiphy_debug(local->hw.wiphy, "Added IBSS STA %pM\n",
426 sta->sta.addr); 422 sta->sta.addr);
427 #endif /* CONFIG_MAC80211_VERBOSE_DEBUG */ 423 #endif /* CONFIG_MAC80211_VERBOSE_DEBUG */
428 424
429 ieee80211_queue_work(&local->hw, &local->sta_finish_work); 425 ieee80211_queue_work(&local->hw, &local->sta_finish_work);
430 426
431 return 0; 427 return 0;
432 } 428 }
433 429
434 /* 430 /*
435 * On first glance, this will look racy, because the code 431 * On first glance, this will look racy, because the code
436 * below this point, which inserts a station with sleeping, 432 * below this point, which inserts a station with sleeping,
437 * unlocks the sta_lock between checking existence in the 433 * unlocks the sta_lock between checking existence in the
438 * hash table and inserting into it. 434 * hash table and inserting into it.
439 * 435 *
440 * However, it is not racy against itself because it keeps 436 * However, it is not racy against itself because it keeps
441 * the mutex locked. It still seems to race against the 437 * the mutex locked. It still seems to race against the
442 * above code that atomically inserts the station... That, 438 * above code that atomically inserts the station... That,
443 * however, is not true because the above code can only 439 * however, is not true because the above code can only
444 * be invoked for IBSS interfaces, and the below code will 440 * be invoked for IBSS interfaces, and the below code will
445 * not be -- and the two do not race against each other as 441 * not be -- and the two do not race against each other as
446 * the hash table also keys off the interface. 442 * the hash table also keys off the interface.
447 */ 443 */
448 444
449 might_sleep(); 445 might_sleep();
450 446
451 mutex_lock(&local->sta_mtx); 447 mutex_lock(&local->sta_mtx);
452 448
453 spin_lock_irqsave(&local->sta_lock, flags); 449 spin_lock_irqsave(&local->sta_lock, flags);
454 /* check if STA exists already */ 450 /* check if STA exists already */
455 if (sta_info_get_bss(sdata, sta->sta.addr)) { 451 if (sta_info_get_bss(sdata, sta->sta.addr)) {
456 spin_unlock_irqrestore(&local->sta_lock, flags); 452 spin_unlock_irqrestore(&local->sta_lock, flags);
457 mutex_unlock(&local->sta_mtx); 453 mutex_unlock(&local->sta_mtx);
458 rcu_read_lock(); 454 rcu_read_lock();
459 err = -EEXIST; 455 err = -EEXIST;
460 goto out_free; 456 goto out_free;
461 } 457 }
462 458
463 spin_unlock_irqrestore(&local->sta_lock, flags); 459 spin_unlock_irqrestore(&local->sta_lock, flags);
464 460
465 err = sta_info_finish_insert(sta, false); 461 err = sta_info_finish_insert(sta, false);
466 if (err) { 462 if (err) {
467 mutex_unlock(&local->sta_mtx); 463 mutex_unlock(&local->sta_mtx);
468 rcu_read_lock(); 464 rcu_read_lock();
469 goto out_free; 465 goto out_free;
470 } 466 }
471 467
472 #ifdef CONFIG_MAC80211_VERBOSE_DEBUG 468 #ifdef CONFIG_MAC80211_VERBOSE_DEBUG
473 wiphy_debug(local->hw.wiphy, "Inserted STA %pM\n", sta->sta.addr); 469 wiphy_debug(local->hw.wiphy, "Inserted STA %pM\n", sta->sta.addr);
474 #endif /* CONFIG_MAC80211_VERBOSE_DEBUG */ 470 #endif /* CONFIG_MAC80211_VERBOSE_DEBUG */
475 471
476 /* move reference to rcu-protected */ 472 /* move reference to rcu-protected */
477 rcu_read_lock(); 473 rcu_read_lock();
478 mutex_unlock(&local->sta_mtx); 474 mutex_unlock(&local->sta_mtx);
479 475
480 if (ieee80211_vif_is_mesh(&sdata->vif)) 476 if (ieee80211_vif_is_mesh(&sdata->vif))
481 mesh_accept_plinks_update(sdata); 477 mesh_accept_plinks_update(sdata);
482 478
483 return 0; 479 return 0;
484 out_free: 480 out_free:
485 BUG_ON(!err); 481 BUG_ON(!err);
486 __sta_info_free(local, sta); 482 __sta_info_free(local, sta);
487 return err; 483 return err;
488 } 484 }
489 485
490 int sta_info_insert(struct sta_info *sta) 486 int sta_info_insert(struct sta_info *sta)
491 { 487 {
492 int err = sta_info_insert_rcu(sta); 488 int err = sta_info_insert_rcu(sta);
493 489
494 rcu_read_unlock(); 490 rcu_read_unlock();
495 491
496 return err; 492 return err;
497 } 493 }
498 494
499 static inline void __bss_tim_set(struct ieee80211_if_ap *bss, u16 aid) 495 static inline void __bss_tim_set(struct ieee80211_if_ap *bss, u16 aid)
500 { 496 {
501 /* 497 /*
502 * This format has been mandated by the IEEE specifications, 498 * This format has been mandated by the IEEE specifications,
503 * so this line may not be changed to use the __set_bit() format. 499 * so this line may not be changed to use the __set_bit() format.
504 */ 500 */
505 bss->tim[aid / 8] |= (1 << (aid % 8)); 501 bss->tim[aid / 8] |= (1 << (aid % 8));
506 } 502 }
507 503
508 static inline void __bss_tim_clear(struct ieee80211_if_ap *bss, u16 aid) 504 static inline void __bss_tim_clear(struct ieee80211_if_ap *bss, u16 aid)
509 { 505 {
510 /* 506 /*
511 * This format has been mandated by the IEEE specifications, 507 * This format has been mandated by the IEEE specifications,
512 * so this line may not be changed to use the __clear_bit() format. 508 * so this line may not be changed to use the __clear_bit() format.
513 */ 509 */
514 bss->tim[aid / 8] &= ~(1 << (aid % 8)); 510 bss->tim[aid / 8] &= ~(1 << (aid % 8));
515 } 511 }
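
The two helpers above hard-code the TIM layout: AID n is bit n % 8 of byte n / 8 of the partial virtual bitmap. A small, self-contained example of the set and clear operations with a hypothetical AID:

#include <stdio.h>

int main(void)
{
        unsigned char tim[32] = { 0 };  /* illustration only, not the real bitmap size */
        unsigned int aid = 21;          /* hypothetical association ID */

        tim[aid / 8] |= (unsigned char)(1 << (aid % 8));   /* set: byte 2, bit 5 */
        printf("set:   tim[%u] = 0x%02x\n", aid / 8, tim[aid / 8]);

        tim[aid / 8] &= (unsigned char)~(1 << (aid % 8));  /* clear it again */
        printf("clear: tim[%u] = 0x%02x\n", aid / 8, tim[aid / 8]);
        return 0;
}
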
516 512
517 static void __sta_info_set_tim_bit(struct ieee80211_if_ap *bss, 513 static void __sta_info_set_tim_bit(struct ieee80211_if_ap *bss,
518 struct sta_info *sta) 514 struct sta_info *sta)
519 { 515 {
520 BUG_ON(!bss); 516 BUG_ON(!bss);
521 517
522 __bss_tim_set(bss, sta->sta.aid); 518 __bss_tim_set(bss, sta->sta.aid);
523 519
524 if (sta->local->ops->set_tim) { 520 if (sta->local->ops->set_tim) {
525 sta->local->tim_in_locked_section = true; 521 sta->local->tim_in_locked_section = true;
526 drv_set_tim(sta->local, &sta->sta, true); 522 drv_set_tim(sta->local, &sta->sta, true);
527 sta->local->tim_in_locked_section = false; 523 sta->local->tim_in_locked_section = false;
528 } 524 }
529 } 525 }
530 526
531 void sta_info_set_tim_bit(struct sta_info *sta) 527 void sta_info_set_tim_bit(struct sta_info *sta)
532 { 528 {
533 unsigned long flags; 529 unsigned long flags;
534 530
535 BUG_ON(!sta->sdata->bss); 531 BUG_ON(!sta->sdata->bss);
536 532
537 spin_lock_irqsave(&sta->local->sta_lock, flags); 533 spin_lock_irqsave(&sta->local->sta_lock, flags);
538 __sta_info_set_tim_bit(sta->sdata->bss, sta); 534 __sta_info_set_tim_bit(sta->sdata->bss, sta);
539 spin_unlock_irqrestore(&sta->local->sta_lock, flags); 535 spin_unlock_irqrestore(&sta->local->sta_lock, flags);
540 } 536 }
541 537
542 static void __sta_info_clear_tim_bit(struct ieee80211_if_ap *bss, 538 static void __sta_info_clear_tim_bit(struct ieee80211_if_ap *bss,
543 struct sta_info *sta) 539 struct sta_info *sta)
544 { 540 {
545 BUG_ON(!bss); 541 BUG_ON(!bss);
546 542
547 __bss_tim_clear(bss, sta->sta.aid); 543 __bss_tim_clear(bss, sta->sta.aid);
548 544
549 if (sta->local->ops->set_tim) { 545 if (sta->local->ops->set_tim) {
550 sta->local->tim_in_locked_section = true; 546 sta->local->tim_in_locked_section = true;
551 drv_set_tim(sta->local, &sta->sta, false); 547 drv_set_tim(sta->local, &sta->sta, false);
552 sta->local->tim_in_locked_section = false; 548 sta->local->tim_in_locked_section = false;
553 } 549 }
554 } 550 }
555 551
556 void sta_info_clear_tim_bit(struct sta_info *sta) 552 void sta_info_clear_tim_bit(struct sta_info *sta)
557 { 553 {
558 unsigned long flags; 554 unsigned long flags;
559 555
560 BUG_ON(!sta->sdata->bss); 556 BUG_ON(!sta->sdata->bss);
561 557
562 spin_lock_irqsave(&sta->local->sta_lock, flags); 558 spin_lock_irqsave(&sta->local->sta_lock, flags);
563 __sta_info_clear_tim_bit(sta->sdata->bss, sta); 559 __sta_info_clear_tim_bit(sta->sdata->bss, sta);
564 spin_unlock_irqrestore(&sta->local->sta_lock, flags); 560 spin_unlock_irqrestore(&sta->local->sta_lock, flags);
565 } 561 }
566 562
567 static int sta_info_buffer_expired(struct sta_info *sta, 563 static int sta_info_buffer_expired(struct sta_info *sta,
568 struct sk_buff *skb) 564 struct sk_buff *skb)
569 { 565 {
570 struct ieee80211_tx_info *info; 566 struct ieee80211_tx_info *info;
571 int timeout; 567 int timeout;
572 568
573 if (!skb) 569 if (!skb)
574 return 0; 570 return 0;
575 571
576 info = IEEE80211_SKB_CB(skb); 572 info = IEEE80211_SKB_CB(skb);
577 573
578 /* Timeout: (2 * listen_interval * beacon_int * 1024 / 1000000) sec */ 574 /* Timeout: (2 * listen_interval * beacon_int * 1024 / 1000000) sec */
579 timeout = (sta->listen_interval * 575 timeout = (sta->listen_interval *
580 sta->sdata->vif.bss_conf.beacon_int * 576 sta->sdata->vif.bss_conf.beacon_int *
581 32 / 15625) * HZ; 577 32 / 15625) * HZ;
582 if (timeout < STA_TX_BUFFER_EXPIRE) 578 if (timeout < STA_TX_BUFFER_EXPIRE)
583 timeout = STA_TX_BUFFER_EXPIRE; 579 timeout = STA_TX_BUFFER_EXPIRE;
584 return time_after(jiffies, info->control.jiffies + timeout); 580 return time_after(jiffies, info->control.jiffies + timeout);
585 } 581 }
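
The comment's (2 * listen_interval * beacon_int * 1024 / 1000000) seconds and the code's 32 / 15625 factor are the same quantity, since 2 * 1024 / 1000000 reduces to 32 / 15625. A quick check with hypothetical values:

#include <stdio.h>

int main(void)
{
        long listen_interval = 10;   /* hypothetical */
        long beacon_int = 100;       /* hypothetical, in TU of 1024 us */

        /* Two equivalent ways of writing the timeout in seconds. */
        double secs_a = 2.0 * listen_interval * beacon_int * 1024.0 / 1000000.0;
        double secs_b = (double)listen_interval * beacon_int * 32.0 / 15625.0;

        printf("%.6f s == %.6f s\n", secs_a, secs_b);
        return 0;
}
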
586 582
587 583
588 static bool sta_info_cleanup_expire_buffered(struct ieee80211_local *local, 584 static bool sta_info_cleanup_expire_buffered(struct ieee80211_local *local,
589 struct sta_info *sta) 585 struct sta_info *sta)
590 { 586 {
591 unsigned long flags; 587 unsigned long flags;
592 struct sk_buff *skb; 588 struct sk_buff *skb;
593 589
594 if (skb_queue_empty(&sta->ps_tx_buf)) 590 if (skb_queue_empty(&sta->ps_tx_buf))
595 return false; 591 return false;
596 592
597 for (;;) { 593 for (;;) {
598 spin_lock_irqsave(&sta->ps_tx_buf.lock, flags); 594 spin_lock_irqsave(&sta->ps_tx_buf.lock, flags);
599 skb = skb_peek(&sta->ps_tx_buf); 595 skb = skb_peek(&sta->ps_tx_buf);
600 if (sta_info_buffer_expired(sta, skb)) 596 if (sta_info_buffer_expired(sta, skb))
601 skb = __skb_dequeue(&sta->ps_tx_buf); 597 skb = __skb_dequeue(&sta->ps_tx_buf);
602 else 598 else
603 skb = NULL; 599 skb = NULL;
604 spin_unlock_irqrestore(&sta->ps_tx_buf.lock, flags); 600 spin_unlock_irqrestore(&sta->ps_tx_buf.lock, flags);
605 601
606 if (!skb) 602 if (!skb)
607 break; 603 break;
608 604
609 local->total_ps_buffered--; 605 local->total_ps_buffered--;
610 #ifdef CONFIG_MAC80211_VERBOSE_PS_DEBUG 606 #ifdef CONFIG_MAC80211_VERBOSE_PS_DEBUG
611 printk(KERN_DEBUG "Buffered frame expired (STA %pM)\n", 607 printk(KERN_DEBUG "Buffered frame expired (STA %pM)\n",
612 sta->sta.addr); 608 sta->sta.addr);
613 #endif 609 #endif
614 dev_kfree_skb(skb); 610 dev_kfree_skb(skb);
615 611
616 if (skb_queue_empty(&sta->ps_tx_buf) && 612 if (skb_queue_empty(&sta->ps_tx_buf) &&
617 !test_sta_flags(sta, WLAN_STA_PS_DRIVER_BUF)) 613 !test_sta_flags(sta, WLAN_STA_PS_DRIVER_BUF))
618 sta_info_clear_tim_bit(sta); 614 sta_info_clear_tim_bit(sta);
619 } 615 }
620 616
621 return true; 617 return true;
622 } 618 }
623 619
624 static int __must_check __sta_info_destroy(struct sta_info *sta) 620 static int __must_check __sta_info_destroy(struct sta_info *sta)
625 { 621 {
626 struct ieee80211_local *local; 622 struct ieee80211_local *local;
627 struct ieee80211_sub_if_data *sdata; 623 struct ieee80211_sub_if_data *sdata;
628 struct sk_buff *skb; 624 struct sk_buff *skb;
629 unsigned long flags; 625 unsigned long flags;
630 int ret, i; 626 int ret, i;
631 627
632 might_sleep(); 628 might_sleep();
633 629
634 if (!sta) 630 if (!sta)
635 return -ENOENT; 631 return -ENOENT;
636 632
637 local = sta->local; 633 local = sta->local;
638 sdata = sta->sdata; 634 sdata = sta->sdata;
639 635
640 /* 636 /*
641 * Before removing the station from the driver and 637 * Before removing the station from the driver and
642 * rate control, it might still start new aggregation 638 * rate control, it might still start new aggregation
643 * sessions -- block that to make sure the tear-down 639 * sessions -- block that to make sure the tear-down
644 * will be sufficient. 640 * will be sufficient.
645 */ 641 */
646 set_sta_flags(sta, WLAN_STA_BLOCK_BA); 642 set_sta_flags(sta, WLAN_STA_BLOCK_BA);
647 ieee80211_sta_tear_down_BA_sessions(sta, true); 643 ieee80211_sta_tear_down_BA_sessions(sta, true);
648 644
649 spin_lock_irqsave(&local->sta_lock, flags); 645 spin_lock_irqsave(&local->sta_lock, flags);
650 ret = sta_info_hash_del(local, sta); 646 ret = sta_info_hash_del(local, sta);
651 /* this might still be the pending list ... which is fine */ 647 /* this might still be the pending list ... which is fine */
652 if (!ret) 648 if (!ret)
653 list_del(&sta->list); 649 list_del(&sta->list);
654 spin_unlock_irqrestore(&local->sta_lock, flags); 650 spin_unlock_irqrestore(&local->sta_lock, flags);
655 if (ret) 651 if (ret)
656 return ret; 652 return ret;
657 653
658 mutex_lock(&local->key_mtx); 654 mutex_lock(&local->key_mtx);
659 for (i = 0; i < NUM_DEFAULT_KEYS; i++) 655 for (i = 0; i < NUM_DEFAULT_KEYS; i++)
660 __ieee80211_key_free(key_mtx_dereference(local, sta->gtk[i])); 656 __ieee80211_key_free(key_mtx_dereference(local, sta->gtk[i]));
661 if (sta->ptk) 657 if (sta->ptk)
662 __ieee80211_key_free(key_mtx_dereference(local, sta->ptk)); 658 __ieee80211_key_free(key_mtx_dereference(local, sta->ptk));
663 mutex_unlock(&local->key_mtx); 659 mutex_unlock(&local->key_mtx);
664 660
665 sta->dead = true; 661 sta->dead = true;
666 662
667 if (test_and_clear_sta_flags(sta, 663 if (test_and_clear_sta_flags(sta,
668 WLAN_STA_PS_STA | WLAN_STA_PS_DRIVER)) { 664 WLAN_STA_PS_STA | WLAN_STA_PS_DRIVER)) {
669 BUG_ON(!sdata->bss); 665 BUG_ON(!sdata->bss);
670 666
671 atomic_dec(&sdata->bss->num_sta_ps); 667 atomic_dec(&sdata->bss->num_sta_ps);
672 __sta_info_clear_tim_bit(sdata->bss, sta); 668 __sta_info_clear_tim_bit(sdata->bss, sta);
673 } 669 }
674 670
675 local->num_sta--; 671 local->num_sta--;
676 local->sta_generation++; 672 local->sta_generation++;
677 673
678 if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN) 674 if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN)
679 rcu_assign_pointer(sdata->u.vlan.sta, NULL); 675 rcu_assign_pointer(sdata->u.vlan.sta, NULL);
680 676
681 if (sta->uploaded) { 677 if (sta->uploaded) {
682 if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN) 678 if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN)
683 sdata = container_of(sdata->bss, 679 sdata = container_of(sdata->bss,
684 struct ieee80211_sub_if_data, 680 struct ieee80211_sub_if_data,
685 u.ap); 681 u.ap);
686 drv_sta_remove(local, sdata, &sta->sta); 682 drv_sta_remove(local, sdata, &sta->sta);
687 sdata = sta->sdata; 683 sdata = sta->sdata;
688 } 684 }
689 685
690 /* 686 /*
691 * At this point, after we wait for an RCU grace period, 687 * At this point, after we wait for an RCU grace period,
692 * neither mac80211 nor the driver can reference this 688 * neither mac80211 nor the driver can reference this
693 * sta struct any more except by still existing timers 689 * sta struct any more except by still existing timers
694 * associated with this station that we clean up below. 690 * associated with this station that we clean up below.
695 */ 691 */
696 synchronize_rcu(); 692 synchronize_rcu();
697 693
698 #ifdef CONFIG_MAC80211_MESH 694 #ifdef CONFIG_MAC80211_MESH
699 if (ieee80211_vif_is_mesh(&sdata->vif)) 695 if (ieee80211_vif_is_mesh(&sdata->vif))
700 mesh_accept_plinks_update(sdata); 696 mesh_accept_plinks_update(sdata);
701 #endif 697 #endif
702 698
703 #ifdef CONFIG_MAC80211_VERBOSE_DEBUG 699 #ifdef CONFIG_MAC80211_VERBOSE_DEBUG
704 wiphy_debug(local->hw.wiphy, "Removed STA %pM\n", sta->sta.addr); 700 wiphy_debug(local->hw.wiphy, "Removed STA %pM\n", sta->sta.addr);
705 #endif /* CONFIG_MAC80211_VERBOSE_DEBUG */ 701 #endif /* CONFIG_MAC80211_VERBOSE_DEBUG */
706 cancel_work_sync(&sta->drv_unblock_wk); 702 cancel_work_sync(&sta->drv_unblock_wk);
707 703
708 cfg80211_del_sta(sdata->dev, sta->sta.addr, GFP_KERNEL); 704 cfg80211_del_sta(sdata->dev, sta->sta.addr, GFP_KERNEL);
709 705
710 rate_control_remove_sta_debugfs(sta); 706 rate_control_remove_sta_debugfs(sta);
711 ieee80211_sta_debugfs_remove(sta); 707 ieee80211_sta_debugfs_remove(sta);
712 708
713 #ifdef CONFIG_MAC80211_MESH 709 #ifdef CONFIG_MAC80211_MESH
714 if (ieee80211_vif_is_mesh(&sta->sdata->vif)) { 710 if (ieee80211_vif_is_mesh(&sta->sdata->vif)) {
715 mesh_plink_deactivate(sta); 711 mesh_plink_deactivate(sta);
716 del_timer_sync(&sta->plink_timer); 712 del_timer_sync(&sta->plink_timer);
717 } 713 }
718 #endif 714 #endif
719 715
720 while ((skb = skb_dequeue(&sta->ps_tx_buf)) != NULL) { 716 while ((skb = skb_dequeue(&sta->ps_tx_buf)) != NULL) {
721 local->total_ps_buffered--; 717 local->total_ps_buffered--;
722 dev_kfree_skb_any(skb); 718 dev_kfree_skb_any(skb);
723 } 719 }
724 720
725 while ((skb = skb_dequeue(&sta->tx_filtered)) != NULL) 721 while ((skb = skb_dequeue(&sta->tx_filtered)) != NULL)
726 dev_kfree_skb_any(skb); 722 dev_kfree_skb_any(skb);
727 723
728 __sta_info_free(local, sta); 724 __sta_info_free(local, sta);
729 725
730 return 0; 726 return 0;
731 } 727 }
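
__sta_info_destroy() follows the RCU teardown ordering the comment above describes: make the entry unreachable, wait out a grace period with synchronize_rcu(), and only then free what readers might still have been using. A hedged, generic sketch of that ordering (kernel context assumed; obj and my_list_lock are hypothetical):

/* Sketch only (kernel context assumed): obj and my_list_lock are hypothetical. */
spin_lock(&my_list_lock);
list_del_rcu(&obj->list);      /* existing readers may still walk to obj */
spin_unlock(&my_list_lock);

synchronize_rcu();             /* wait for every pre-existing read-side section */

kfree(obj);                    /* no reader can reach obj any more */
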
732 728
733 int sta_info_destroy_addr(struct ieee80211_sub_if_data *sdata, const u8 *addr) 729 int sta_info_destroy_addr(struct ieee80211_sub_if_data *sdata, const u8 *addr)
734 { 730 {
735 struct sta_info *sta; 731 struct sta_info *sta;
736 int ret; 732 int ret;
737 733
738 mutex_lock(&sdata->local->sta_mtx); 734 mutex_lock(&sdata->local->sta_mtx);
739 sta = sta_info_get(sdata, addr); 735 sta = sta_info_get(sdata, addr);
740 ret = __sta_info_destroy(sta); 736 ret = __sta_info_destroy(sta);
741 mutex_unlock(&sdata->local->sta_mtx); 737 mutex_unlock(&sdata->local->sta_mtx);
742 738
743 return ret; 739 return ret;
744 } 740 }
745 741
746 int sta_info_destroy_addr_bss(struct ieee80211_sub_if_data *sdata, 742 int sta_info_destroy_addr_bss(struct ieee80211_sub_if_data *sdata,
747 const u8 *addr) 743 const u8 *addr)
748 { 744 {
749 struct sta_info *sta; 745 struct sta_info *sta;
750 int ret; 746 int ret;
751 747
752 mutex_lock(&sdata->local->sta_mtx); 748 mutex_lock(&sdata->local->sta_mtx);
753 sta = sta_info_get_bss(sdata, addr); 749 sta = sta_info_get_bss(sdata, addr);
754 ret = __sta_info_destroy(sta); 750 ret = __sta_info_destroy(sta);
755 mutex_unlock(&sdata->local->sta_mtx); 751 mutex_unlock(&sdata->local->sta_mtx);
756 752
757 return ret; 753 return ret;
758 } 754 }
759 755
760 static void sta_info_cleanup(unsigned long data) 756 static void sta_info_cleanup(unsigned long data)
761 { 757 {
762 struct ieee80211_local *local = (struct ieee80211_local *) data; 758 struct ieee80211_local *local = (struct ieee80211_local *) data;
763 struct sta_info *sta; 759 struct sta_info *sta;
764 bool timer_needed = false; 760 bool timer_needed = false;
765 761
766 rcu_read_lock(); 762 rcu_read_lock();
767 list_for_each_entry_rcu(sta, &local->sta_list, list) 763 list_for_each_entry_rcu(sta, &local->sta_list, list)
768 if (sta_info_cleanup_expire_buffered(local, sta)) 764 if (sta_info_cleanup_expire_buffered(local, sta))
769 timer_needed = true; 765 timer_needed = true;
770 rcu_read_unlock(); 766 rcu_read_unlock();
771 767
772 if (local->quiescing) 768 if (local->quiescing)
773 return; 769 return;
774 770
775 if (!timer_needed) 771 if (!timer_needed)
776 return; 772 return;
777 773
778 mod_timer(&local->sta_cleanup, 774 mod_timer(&local->sta_cleanup,
779 round_jiffies(jiffies + STA_INFO_CLEANUP_INTERVAL)); 775 round_jiffies(jiffies + STA_INFO_CLEANUP_INTERVAL));
780 } 776 }
781 777
782 void sta_info_init(struct ieee80211_local *local) 778 void sta_info_init(struct ieee80211_local *local)
783 { 779 {
784 spin_lock_init(&local->sta_lock); 780 spin_lock_init(&local->sta_lock);
785 mutex_init(&local->sta_mtx); 781 mutex_init(&local->sta_mtx);
786 INIT_LIST_HEAD(&local->sta_list); 782 INIT_LIST_HEAD(&local->sta_list);
787 INIT_LIST_HEAD(&local->sta_pending_list); 783 INIT_LIST_HEAD(&local->sta_pending_list);
788 INIT_WORK(&local->sta_finish_work, sta_info_finish_work); 784 INIT_WORK(&local->sta_finish_work, sta_info_finish_work);
789 785
790 setup_timer(&local->sta_cleanup, sta_info_cleanup, 786 setup_timer(&local->sta_cleanup, sta_info_cleanup,
791 (unsigned long)local); 787 (unsigned long)local);
792 } 788 }
793 789
794 void sta_info_stop(struct ieee80211_local *local) 790 void sta_info_stop(struct ieee80211_local *local)
795 { 791 {
796 del_timer(&local->sta_cleanup); 792 del_timer(&local->sta_cleanup);
797 sta_info_flush(local, NULL); 793 sta_info_flush(local, NULL);
798 } 794 }
799 795
800 /** 796 /**
801 * sta_info_flush - flush matching STA entries from the STA table 797 * sta_info_flush - flush matching STA entries from the STA table
802 * 798 *
803 * Returns the number of removed STA entries. 799 * Returns the number of removed STA entries.
804 * 800 *
805 * @local: local interface data 801 * @local: local interface data
806 * @sdata: matching rule for the net device (sta->dev) or %NULL to match all STAs 802 * @sdata: matching rule for the net device (sta->dev) or %NULL to match all STAs
807 */ 803 */
808 int sta_info_flush(struct ieee80211_local *local, 804 int sta_info_flush(struct ieee80211_local *local,
809 struct ieee80211_sub_if_data *sdata) 805 struct ieee80211_sub_if_data *sdata)
810 { 806 {
811 struct sta_info *sta, *tmp; 807 struct sta_info *sta, *tmp;
812 int ret = 0; 808 int ret = 0;
813 809
814 might_sleep(); 810 might_sleep();
815 811
816 mutex_lock(&local->sta_mtx); 812 mutex_lock(&local->sta_mtx);
817 813
818 sta_info_finish_pending(local); 814 sta_info_finish_pending(local);
819 815
820 list_for_each_entry_safe(sta, tmp, &local->sta_list, list) { 816 list_for_each_entry_safe(sta, tmp, &local->sta_list, list) {
821 if (!sdata || sdata == sta->sdata) 817 if (!sdata || sdata == sta->sdata)
822 WARN_ON(__sta_info_destroy(sta)); 818 WARN_ON(__sta_info_destroy(sta));
823 } 819 }
824 mutex_unlock(&local->sta_mtx); 820 mutex_unlock(&local->sta_mtx);
825 821
826 return ret; 822 return ret;
827 } 823 }
828 824
829 void ieee80211_sta_expire(struct ieee80211_sub_if_data *sdata, 825 void ieee80211_sta_expire(struct ieee80211_sub_if_data *sdata,
830 unsigned long exp_time) 826 unsigned long exp_time)
831 { 827 {
832 struct ieee80211_local *local = sdata->local; 828 struct ieee80211_local *local = sdata->local;
833 struct sta_info *sta, *tmp; 829 struct sta_info *sta, *tmp;
834 830
835 mutex_lock(&local->sta_mtx); 831 mutex_lock(&local->sta_mtx);
836 list_for_each_entry_safe(sta, tmp, &local->sta_list, list) 832 list_for_each_entry_safe(sta, tmp, &local->sta_list, list)
837 if (time_after(jiffies, sta->last_rx + exp_time)) { 833 if (time_after(jiffies, sta->last_rx + exp_time)) {
838 #ifdef CONFIG_MAC80211_IBSS_DEBUG 834 #ifdef CONFIG_MAC80211_IBSS_DEBUG
839 printk(KERN_DEBUG "%s: expiring inactive STA %pM\n", 835 printk(KERN_DEBUG "%s: expiring inactive STA %pM\n",
840 sdata->name, sta->sta.addr); 836 sdata->name, sta->sta.addr);
841 #endif 837 #endif
842 WARN_ON(__sta_info_destroy(sta)); 838 WARN_ON(__sta_info_destroy(sta));
843 } 839 }
844 mutex_unlock(&local->sta_mtx); 840 mutex_unlock(&local->sta_mtx);
845 } 841 }
846 842
847 struct ieee80211_sta *ieee80211_find_sta_by_ifaddr(struct ieee80211_hw *hw, 843 struct ieee80211_sta *ieee80211_find_sta_by_ifaddr(struct ieee80211_hw *hw,
848 const u8 *addr, 844 const u8 *addr,
849 const u8 *localaddr) 845 const u8 *localaddr)
850 { 846 {
851 struct sta_info *sta, *nxt; 847 struct sta_info *sta, *nxt;
852 848
853 /* 849 /*
854 * Just return a random station if localaddr is NULL 850 * Just return a random station if localaddr is NULL
855 * ... first in list. 851 * ... first in list.
856 */ 852 */
857 for_each_sta_info(hw_to_local(hw), addr, sta, nxt) { 853 for_each_sta_info(hw_to_local(hw), addr, sta, nxt) {
858 if (localaddr && 854 if (localaddr &&
859 compare_ether_addr(sta->sdata->vif.addr, localaddr) != 0) 855 compare_ether_addr(sta->sdata->vif.addr, localaddr) != 0)
860 continue; 856 continue;
861 if (!sta->uploaded) 857 if (!sta->uploaded)
862 return NULL; 858 return NULL;
863 return &sta->sta; 859 return &sta->sta;
864 } 860 }
865 861
866 return NULL; 862 return NULL;
867 } 863 }
868 EXPORT_SYMBOL_GPL(ieee80211_find_sta_by_ifaddr); 864 EXPORT_SYMBOL_GPL(ieee80211_find_sta_by_ifaddr);
869 865
870 struct ieee80211_sta *ieee80211_find_sta(struct ieee80211_vif *vif, 866 struct ieee80211_sta *ieee80211_find_sta(struct ieee80211_vif *vif,
871 const u8 *addr) 867 const u8 *addr)
872 { 868 {
873 struct sta_info *sta; 869 struct sta_info *sta;
874 870
875 if (!vif) 871 if (!vif)
876 return NULL; 872 return NULL;
877 873
878 sta = sta_info_get_bss(vif_to_sdata(vif), addr); 874 sta = sta_info_get_bss(vif_to_sdata(vif), addr);
879 if (!sta) 875 if (!sta)
880 return NULL; 876 return NULL;
881 877
882 if (!sta->uploaded) 878 if (!sta->uploaded)
883 return NULL; 879 return NULL;
884 880
885 return &sta->sta; 881 return &sta->sta;
886 } 882 }
887 EXPORT_SYMBOL(ieee80211_find_sta); 883 EXPORT_SYMBOL(ieee80211_find_sta);
888 884
889 static void clear_sta_ps_flags(void *_sta) 885 static void clear_sta_ps_flags(void *_sta)
890 { 886 {
891 struct sta_info *sta = _sta; 887 struct sta_info *sta = _sta;
892 888
893 clear_sta_flags(sta, WLAN_STA_PS_DRIVER | WLAN_STA_PS_STA); 889 clear_sta_flags(sta, WLAN_STA_PS_DRIVER | WLAN_STA_PS_STA);
894 } 890 }
895 891
896 /* powersave support code */ 892 /* powersave support code */
897 void ieee80211_sta_ps_deliver_wakeup(struct sta_info *sta) 893 void ieee80211_sta_ps_deliver_wakeup(struct sta_info *sta)
898 { 894 {
899 struct ieee80211_sub_if_data *sdata = sta->sdata; 895 struct ieee80211_sub_if_data *sdata = sta->sdata;
900 struct ieee80211_local *local = sdata->local; 896 struct ieee80211_local *local = sdata->local;
901 int sent, buffered; 897 int sent, buffered;
902 898
903 clear_sta_flags(sta, WLAN_STA_PS_DRIVER_BUF); 899 clear_sta_flags(sta, WLAN_STA_PS_DRIVER_BUF);
904 if (!(local->hw.flags & IEEE80211_HW_AP_LINK_PS)) 900 if (!(local->hw.flags & IEEE80211_HW_AP_LINK_PS))
905 drv_sta_notify(local, sdata, STA_NOTIFY_AWAKE, &sta->sta); 901 drv_sta_notify(local, sdata, STA_NOTIFY_AWAKE, &sta->sta);
906 902
907 if (!skb_queue_empty(&sta->ps_tx_buf)) 903 if (!skb_queue_empty(&sta->ps_tx_buf))
908 sta_info_clear_tim_bit(sta); 904 sta_info_clear_tim_bit(sta);
909 905
910 /* Send all buffered frames to the station */ 906 /* Send all buffered frames to the station */
911 sent = ieee80211_add_pending_skbs(local, &sta->tx_filtered); 907 sent = ieee80211_add_pending_skbs(local, &sta->tx_filtered);
912 buffered = ieee80211_add_pending_skbs_fn(local, &sta->ps_tx_buf, 908 buffered = ieee80211_add_pending_skbs_fn(local, &sta->ps_tx_buf,
913 clear_sta_ps_flags, sta); 909 clear_sta_ps_flags, sta);
914 sent += buffered; 910 sent += buffered;
915 local->total_ps_buffered -= buffered; 911 local->total_ps_buffered -= buffered;
916 912
917 #ifdef CONFIG_MAC80211_VERBOSE_PS_DEBUG 913 #ifdef CONFIG_MAC80211_VERBOSE_PS_DEBUG
918 printk(KERN_DEBUG "%s: STA %pM aid %d sending %d filtered/%d PS frames " 914 printk(KERN_DEBUG "%s: STA %pM aid %d sending %d filtered/%d PS frames "
919 "since STA not sleeping anymore\n", sdata->name, 915 "since STA not sleeping anymore\n", sdata->name,
920 sta->sta.addr, sta->sta.aid, sent - buffered, buffered); 916 sta->sta.addr, sta->sta.aid, sent - buffered, buffered);
921 #endif /* CONFIG_MAC80211_VERBOSE_PS_DEBUG */ 917 #endif /* CONFIG_MAC80211_VERBOSE_PS_DEBUG */
922 } 918 }
923 919
924 void ieee80211_sta_ps_deliver_poll_response(struct sta_info *sta) 920 void ieee80211_sta_ps_deliver_poll_response(struct sta_info *sta)
925 { 921 {
926 struct ieee80211_sub_if_data *sdata = sta->sdata; 922 struct ieee80211_sub_if_data *sdata = sta->sdata;
927 struct ieee80211_local *local = sdata->local; 923 struct ieee80211_local *local = sdata->local;
928 struct sk_buff *skb; 924 struct sk_buff *skb;
929 int no_pending_pkts; 925 int no_pending_pkts;
930 926
931 skb = skb_dequeue(&sta->tx_filtered); 927 skb = skb_dequeue(&sta->tx_filtered);
932 if (!skb) { 928 if (!skb) {
933 skb = skb_dequeue(&sta->ps_tx_buf); 929 skb = skb_dequeue(&sta->ps_tx_buf);
934 if (skb) 930 if (skb)
935 local->total_ps_buffered--; 931 local->total_ps_buffered--;
936 } 932 }
937 no_pending_pkts = skb_queue_empty(&sta->tx_filtered) && 933 no_pending_pkts = skb_queue_empty(&sta->tx_filtered) &&
938 skb_queue_empty(&sta->ps_tx_buf); 934 skb_queue_empty(&sta->ps_tx_buf);
939 935
940 if (skb) { 936 if (skb) {
941 struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); 937 struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
942 struct ieee80211_hdr *hdr = 938 struct ieee80211_hdr *hdr =
943 (struct ieee80211_hdr *) skb->data; 939 (struct ieee80211_hdr *) skb->data;
944 940
945 /* 941 /*
946 * Tell TX path to send this frame even though the STA may 942 * Tell TX path to send this frame even though the STA may
947 * still remain in PS mode after this frame exchange. 943 * still remain in PS mode after this frame exchange.
948 */ 944 */
949 info->flags |= IEEE80211_TX_CTL_PSPOLL_RESPONSE; 945 info->flags |= IEEE80211_TX_CTL_PSPOLL_RESPONSE;
950 946
951 #ifdef CONFIG_MAC80211_VERBOSE_PS_DEBUG 947 #ifdef CONFIG_MAC80211_VERBOSE_PS_DEBUG
952 printk(KERN_DEBUG "STA %pM aid %d: PS Poll (entries after %d)\n", 948 printk(KERN_DEBUG "STA %pM aid %d: PS Poll (entries after %d)\n",
953 sta->sta.addr, sta->sta.aid, 949 sta->sta.addr, sta->sta.aid,
954 skb_queue_len(&sta->ps_tx_buf)); 950 skb_queue_len(&sta->ps_tx_buf));
955 #endif /* CONFIG_MAC80211_VERBOSE_PS_DEBUG */ 951 #endif /* CONFIG_MAC80211_VERBOSE_PS_DEBUG */
956 952
957 /* Use MoreData flag to indicate whether there are more 953 /* Use MoreData flag to indicate whether there are more
958 * buffered frames for this STA */ 954 * buffered frames for this STA */
959 if (no_pending_pkts) 955 if (no_pending_pkts)
960 hdr->frame_control &= cpu_to_le16(~IEEE80211_FCTL_MOREDATA); 956 hdr->frame_control &= cpu_to_le16(~IEEE80211_FCTL_MOREDATA);
961 else 957 else
962 hdr->frame_control |= cpu_to_le16(IEEE80211_FCTL_MOREDATA); 958 hdr->frame_control |= cpu_to_le16(IEEE80211_FCTL_MOREDATA);
963 959
964 ieee80211_add_pending_skb(local, skb); 960 ieee80211_add_pending_skb(local, skb);
965 961
966 if (no_pending_pkts) 962 if (no_pending_pkts)
967 sta_info_clear_tim_bit(sta); 963 sta_info_clear_tim_bit(sta);
968 #ifdef CONFIG_MAC80211_VERBOSE_PS_DEBUG 964 #ifdef CONFIG_MAC80211_VERBOSE_PS_DEBUG
969 } else { 965 } else {
970 /* 966 /*
971 * FIXME: This can be the result of a race condition between 967 * FIXME: This can be the result of a race condition between
972 * us expiring a frame and the station polling for it. 968 * us expiring a frame and the station polling for it.
973 * Should we send it a null-func frame indicating we 969 * Should we send it a null-func frame indicating we
974 * have nothing buffered for it? 970 * have nothing buffered for it?
975 */ 971 */
976 printk(KERN_DEBUG "%s: STA %pM sent PS Poll even " 972 printk(KERN_DEBUG "%s: STA %pM sent PS Poll even "
977 "though there are no buffered frames for it\n", 973 "though there are no buffered frames for it\n",
978 sdata->name, sta->sta.addr); 974 sdata->name, sta->sta.addr);
979 #endif /* CONFIG_MAC80211_VERBOSE_PS_DEBUG */ 975 #endif /* CONFIG_MAC80211_VERBOSE_PS_DEBUG */
980 } 976 }
981 } 977 }
982 978
983 void ieee80211_sta_block_awake(struct ieee80211_hw *hw, 979 void ieee80211_sta_block_awake(struct ieee80211_hw *hw,
984 struct ieee80211_sta *pubsta, bool block) 980 struct ieee80211_sta *pubsta, bool block)
985 { 981 {
986 struct sta_info *sta = container_of(pubsta, struct sta_info, sta); 982 struct sta_info *sta = container_of(pubsta, struct sta_info, sta);
987 983
988 trace_api_sta_block_awake(sta->local, pubsta, block); 984 trace_api_sta_block_awake(sta->local, pubsta, block);
989 985
990 if (block) 986 if (block)
991 set_sta_flags(sta, WLAN_STA_PS_DRIVER); 987 set_sta_flags(sta, WLAN_STA_PS_DRIVER);
992 else if (test_sta_flags(sta, WLAN_STA_PS_DRIVER)) 988 else if (test_sta_flags(sta, WLAN_STA_PS_DRIVER))
993 ieee80211_queue_work(hw, &sta->drv_unblock_wk); 989 ieee80211_queue_work(hw, &sta->drv_unblock_wk);
994 } 990 }
995 EXPORT_SYMBOL(ieee80211_sta_block_awake); 991 EXPORT_SYMBOL(ieee80211_sta_block_awake);
996 992
997 void ieee80211_sta_set_tim(struct ieee80211_sta *pubsta) 993 void ieee80211_sta_set_tim(struct ieee80211_sta *pubsta)
998 { 994 {
999 struct sta_info *sta = container_of(pubsta, struct sta_info, sta); 995 struct sta_info *sta = container_of(pubsta, struct sta_info, sta);
1000 996
1001 set_sta_flags(sta, WLAN_STA_PS_DRIVER_BUF); 997 set_sta_flags(sta, WLAN_STA_PS_DRIVER_BUF);
1002 sta_info_set_tim_bit(sta); 998 sta_info_set_tim_bit(sta);
1003 } 999 }
1004 EXPORT_SYMBOL(ieee80211_sta_set_tim); 1000 EXPORT_SYMBOL(ieee80211_sta_set_tim);
1005 1001
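For reference, a minimal usage sketch for the station lookup helper exported above. The driver function and its logging are hypothetical and not part of this patch; the point is that ieee80211_find_sta() walks RCU-protected station state, so the lookup and any use of the returned pointer are kept inside an RCU read-side critical section.

#include <linux/kernel.h>
#include <linux/rcupdate.h>
#include <net/mac80211.h>

/* Hypothetical driver helper: look up a station and log it.  The returned
 * pointer is only valid while rcu_read_lock() is held. */
static void example_log_sta(struct ieee80211_vif *vif, const u8 *addr)
{
	struct ieee80211_sta *sta;

	rcu_read_lock();
	sta = ieee80211_find_sta(vif, addr);
	if (sta)
		printk(KERN_DEBUG "found STA %pM (aid %d)\n",
		       sta->addr, sta->aid);
	rcu_read_unlock();
}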
net/netlabel/netlabel_domainhash.c
1 /* 1 /*
2 * NetLabel Domain Hash Table 2 * NetLabel Domain Hash Table
3 * 3 *
4 * This file manages the domain hash table that NetLabel uses to determine 4 * This file manages the domain hash table that NetLabel uses to determine
5 * which network labeling protocol to use for a given domain. The NetLabel 5 * which network labeling protocol to use for a given domain. The NetLabel
6 * system manages static and dynamic label mappings for network protocols such 6 * system manages static and dynamic label mappings for network protocols such
7 * as CIPSO and RIPSO. 7 * as CIPSO and RIPSO.
8 * 8 *
9 * Author: Paul Moore <paul.moore@hp.com> 9 * Author: Paul Moore <paul.moore@hp.com>
10 * 10 *
11 */ 11 */
12 12
13 /* 13 /*
14 * (c) Copyright Hewlett-Packard Development Company, L.P., 2006, 2008 14 * (c) Copyright Hewlett-Packard Development Company, L.P., 2006, 2008
15 * 15 *
16 * This program is free software; you can redistribute it and/or modify 16 * This program is free software; you can redistribute it and/or modify
17 * it under the terms of the GNU General Public License as published by 17 * it under the terms of the GNU General Public License as published by
18 * the Free Software Foundation; either version 2 of the License, or 18 * the Free Software Foundation; either version 2 of the License, or
19 * (at your option) any later version. 19 * (at your option) any later version.
20 * 20 *
21 * This program is distributed in the hope that it will be useful, 21 * This program is distributed in the hope that it will be useful,
22 * but WITHOUT ANY WARRANTY; without even the implied warranty of 22 * but WITHOUT ANY WARRANTY; without even the implied warranty of
23 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See 23 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
24 * the GNU General Public License for more details. 24 * the GNU General Public License for more details.
25 * 25 *
26 * You should have received a copy of the GNU General Public License 26 * You should have received a copy of the GNU General Public License
27 * along with this program; if not, write to the Free Software 27 * along with this program; if not, write to the Free Software
28 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 28 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
29 * 29 *
30 */ 30 */
31 31
32 #include <linux/types.h> 32 #include <linux/types.h>
33 #include <linux/rculist.h> 33 #include <linux/rculist.h>
34 #include <linux/skbuff.h> 34 #include <linux/skbuff.h>
35 #include <linux/spinlock.h> 35 #include <linux/spinlock.h>
36 #include <linux/string.h> 36 #include <linux/string.h>
37 #include <linux/audit.h> 37 #include <linux/audit.h>
38 #include <linux/slab.h> 38 #include <linux/slab.h>
39 #include <net/netlabel.h> 39 #include <net/netlabel.h>
40 #include <net/cipso_ipv4.h> 40 #include <net/cipso_ipv4.h>
41 #include <asm/bug.h> 41 #include <asm/bug.h>
42 42
43 #include "netlabel_mgmt.h" 43 #include "netlabel_mgmt.h"
44 #include "netlabel_addrlist.h" 44 #include "netlabel_addrlist.h"
45 #include "netlabel_domainhash.h" 45 #include "netlabel_domainhash.h"
46 #include "netlabel_user.h" 46 #include "netlabel_user.h"
47 47
48 struct netlbl_domhsh_tbl { 48 struct netlbl_domhsh_tbl {
49 struct list_head *tbl; 49 struct list_head *tbl;
50 u32 size; 50 u32 size;
51 }; 51 };
52 52
53 /* Domain hash table */ 53 /* Domain hash table */
54 /* updates should be so rare that having one spinlock for the entire hash table 54 /* updates should be so rare that having one spinlock for the entire hash table
55 * should be okay */ 55 * should be okay */
56 static DEFINE_SPINLOCK(netlbl_domhsh_lock); 56 static DEFINE_SPINLOCK(netlbl_domhsh_lock);
57 #define netlbl_domhsh_rcu_deref(p) \ 57 #define netlbl_domhsh_rcu_deref(p) \
58 rcu_dereference_check(p, rcu_read_lock_held() || \ 58 rcu_dereference_check(p, lockdep_is_held(&netlbl_domhsh_lock))
59 lockdep_is_held(&netlbl_domhsh_lock))
60 static struct netlbl_domhsh_tbl *netlbl_domhsh = NULL; 59 static struct netlbl_domhsh_tbl *netlbl_domhsh = NULL;
61 static struct netlbl_dom_map *netlbl_domhsh_def = NULL; 60 static struct netlbl_dom_map *netlbl_domhsh_def = NULL;
62 61
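To make the simplified check concrete, here is a stand-alone sketch of the same pattern using hypothetical names (example_lock, example_ptr, example_deref); it is illustration only, not part of the patch. The condition passed to rcu_dereference_check() now names only the update-side lock, and the same simplification is applied to netlbl_unlhsh_rcu_deref() in netlabel_unlabeled.c further down in this diff.

#include <linux/rcupdate.h>
#include <linux/spinlock.h>

struct example_tbl;				/* opaque payload type */

static DEFINE_SPINLOCK(example_lock);		/* update-side lock */
static struct example_tbl __rcu *example_ptr;	/* RCU-protected pointer */

/* Readers run under rcu_read_lock(), writers hold example_lock; only the
 * lock has to be named in the lockdep condition. */
#define example_deref(p) \
	rcu_dereference_check(p, lockdep_is_held(&example_lock))

/* Update side: publish a new table while holding the spinlock, mirroring
 * the spin_lock()/rcu_assign_pointer() pattern used by netlbl_domhsh_init()
 * below. */
static void example_publish(struct example_tbl *new_tbl)
{
	spin_lock(&example_lock);
	rcu_assign_pointer(example_ptr, new_tbl);
	spin_unlock(&example_lock);
}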
63 /* 62 /*
64 * Domain Hash Table Helper Functions 63 * Domain Hash Table Helper Functions
65 */ 64 */
66 65
67 /** 66 /**
68 * netlbl_domhsh_free_entry - Frees a domain hash table entry 67 * netlbl_domhsh_free_entry - Frees a domain hash table entry
69 * @entry: the entry's RCU field 68 * @entry: the entry's RCU field
70 * 69 *
71 * Description: 70 * Description:
72 * This function is designed to be used as a callback to the call_rcu() 71 * This function is designed to be used as a callback to the call_rcu()
73 * function so that the memory allocated to a hash table entry can be released 72 * function so that the memory allocated to a hash table entry can be released
74 * safely. 73 * safely.
75 * 74 *
76 */ 75 */
77 static void netlbl_domhsh_free_entry(struct rcu_head *entry) 76 static void netlbl_domhsh_free_entry(struct rcu_head *entry)
78 { 77 {
79 struct netlbl_dom_map *ptr; 78 struct netlbl_dom_map *ptr;
80 struct netlbl_af4list *iter4; 79 struct netlbl_af4list *iter4;
81 struct netlbl_af4list *tmp4; 80 struct netlbl_af4list *tmp4;
82 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) 81 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
83 struct netlbl_af6list *iter6; 82 struct netlbl_af6list *iter6;
84 struct netlbl_af6list *tmp6; 83 struct netlbl_af6list *tmp6;
85 #endif /* IPv6 */ 84 #endif /* IPv6 */
86 85
87 ptr = container_of(entry, struct netlbl_dom_map, rcu); 86 ptr = container_of(entry, struct netlbl_dom_map, rcu);
88 if (ptr->type == NETLBL_NLTYPE_ADDRSELECT) { 87 if (ptr->type == NETLBL_NLTYPE_ADDRSELECT) {
89 netlbl_af4list_foreach_safe(iter4, tmp4, 88 netlbl_af4list_foreach_safe(iter4, tmp4,
90 &ptr->type_def.addrsel->list4) { 89 &ptr->type_def.addrsel->list4) {
91 netlbl_af4list_remove_entry(iter4); 90 netlbl_af4list_remove_entry(iter4);
92 kfree(netlbl_domhsh_addr4_entry(iter4)); 91 kfree(netlbl_domhsh_addr4_entry(iter4));
93 } 92 }
94 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) 93 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
95 netlbl_af6list_foreach_safe(iter6, tmp6, 94 netlbl_af6list_foreach_safe(iter6, tmp6,
96 &ptr->type_def.addrsel->list6) { 95 &ptr->type_def.addrsel->list6) {
97 netlbl_af6list_remove_entry(iter6); 96 netlbl_af6list_remove_entry(iter6);
98 kfree(netlbl_domhsh_addr6_entry(iter6)); 97 kfree(netlbl_domhsh_addr6_entry(iter6));
99 } 98 }
100 #endif /* IPv6 */ 99 #endif /* IPv6 */
101 } 100 }
102 kfree(ptr->domain); 101 kfree(ptr->domain);
103 kfree(ptr); 102 kfree(ptr);
104 } 103 }
105 104
106 /** 105 /**
107 * netlbl_domhsh_hash - Hashing function for the domain hash table 106 * netlbl_domhsh_hash - Hashing function for the domain hash table
108 * @domain: the domain name to hash 107 * @domain: the domain name to hash
109 * 108 *
110 * Description: 109 * Description:
111 * This is the hashing function for the domain hash table, it returns the 110 * This is the hashing function for the domain hash table, it returns the
112 * correct bucket number for the domain. The caller is responsible for 111 * correct bucket number for the domain. The caller is responsible for
113 * ensuring that the hash table is protected with either a RCU read lock or the 112 * ensuring that the hash table is protected with either a RCU read lock or the
114 * hash table lock. 113 * hash table lock.
115 * 114 *
116 */ 115 */
117 static u32 netlbl_domhsh_hash(const char *key) 116 static u32 netlbl_domhsh_hash(const char *key)
118 { 117 {
119 u32 iter; 118 u32 iter;
120 u32 val; 119 u32 val;
121 u32 len; 120 u32 len;
122 121
123 /* This is taken (with slight modification) from 122 /* This is taken (with slight modification) from
124 * security/selinux/ss/symtab.c:symhash() */ 123 * security/selinux/ss/symtab.c:symhash() */
125 124
126 for (iter = 0, val = 0, len = strlen(key); iter < len; iter++) 125 for (iter = 0, val = 0, len = strlen(key); iter < len; iter++)
127 val = (val << 4 | (val >> (8 * sizeof(u32) - 4))) ^ key[iter]; 126 val = (val << 4 | (val >> (8 * sizeof(u32) - 4))) ^ key[iter];
128 return val & (netlbl_domhsh_rcu_deref(netlbl_domhsh)->size - 1); 127 return val & (netlbl_domhsh_rcu_deref(netlbl_domhsh)->size - 1);
129 } 128 }
130 129
131 /** 130 /**
132 * netlbl_domhsh_search - Search for a domain entry 131 * netlbl_domhsh_search - Search for a domain entry
133 * @domain: the domain 132 * @domain: the domain
134 * 133 *
135 * Description: 134 * Description:
136 * Searches the domain hash table and returns a pointer to the hash table 135 * Searches the domain hash table and returns a pointer to the hash table
137 * entry if found, otherwise NULL is returned. The caller is responsible for 136 * entry if found, otherwise NULL is returned. The caller is responsible for
138 * ensuring that the hash table is protected with either a RCU read lock or the 137 * ensuring that the hash table is protected with either a RCU read lock or the
139 * hash table lock. 138 * hash table lock.
140 * 139 *
141 */ 140 */
142 static struct netlbl_dom_map *netlbl_domhsh_search(const char *domain) 141 static struct netlbl_dom_map *netlbl_domhsh_search(const char *domain)
143 { 142 {
144 u32 bkt; 143 u32 bkt;
145 struct list_head *bkt_list; 144 struct list_head *bkt_list;
146 struct netlbl_dom_map *iter; 145 struct netlbl_dom_map *iter;
147 146
148 if (domain != NULL) { 147 if (domain != NULL) {
149 bkt = netlbl_domhsh_hash(domain); 148 bkt = netlbl_domhsh_hash(domain);
150 bkt_list = &netlbl_domhsh_rcu_deref(netlbl_domhsh)->tbl[bkt]; 149 bkt_list = &netlbl_domhsh_rcu_deref(netlbl_domhsh)->tbl[bkt];
151 list_for_each_entry_rcu(iter, bkt_list, list) 150 list_for_each_entry_rcu(iter, bkt_list, list)
152 if (iter->valid && strcmp(iter->domain, domain) == 0) 151 if (iter->valid && strcmp(iter->domain, domain) == 0)
153 return iter; 152 return iter;
154 } 153 }
155 154
156 return NULL; 155 return NULL;
157 } 156 }
158 157
159 /** 158 /**
160 * netlbl_domhsh_search_def - Search for a domain entry 159 * netlbl_domhsh_search_def - Search for a domain entry
161 * @domain: the domain 160 * @domain: the domain
162 * @def: return default if no match is found 161 * @def: return default if no match is found
163 * 162 *
164 * Description: 163 * Description:
165 * Searches the domain hash table and returns a pointer to the hash table 164 * Searches the domain hash table and returns a pointer to the hash table
166 * entry if an exact match is found, if an exact match is not present in the 165 * entry if an exact match is found, if an exact match is not present in the
167 * hash table then the default entry is returned if valid otherwise NULL is 166 * hash table then the default entry is returned if valid otherwise NULL is
168 * returned. The caller is responsible for ensuring that the hash table is 167 * returned. The caller is responsible for ensuring that the hash table is
169 * protected with either a RCU read lock or the hash table lock. 168 * protected with either a RCU read lock or the hash table lock.
170 * 169 *
171 */ 170 */
172 static struct netlbl_dom_map *netlbl_domhsh_search_def(const char *domain) 171 static struct netlbl_dom_map *netlbl_domhsh_search_def(const char *domain)
173 { 172 {
174 struct netlbl_dom_map *entry; 173 struct netlbl_dom_map *entry;
175 174
176 entry = netlbl_domhsh_search(domain); 175 entry = netlbl_domhsh_search(domain);
177 if (entry == NULL) { 176 if (entry == NULL) {
178 entry = netlbl_domhsh_rcu_deref(netlbl_domhsh_def); 177 entry = netlbl_domhsh_rcu_deref(netlbl_domhsh_def);
179 if (entry != NULL && !entry->valid) 178 if (entry != NULL && !entry->valid)
180 entry = NULL; 179 entry = NULL;
181 } 180 }
182 181
183 return entry; 182 return entry;
184 } 183 }
185 184
186 /** 185 /**
187 * netlbl_domhsh_audit_add - Generate an audit entry for an add event 186 * netlbl_domhsh_audit_add - Generate an audit entry for an add event
188 * @entry: the entry being added 187 * @entry: the entry being added
189 * @addr4: the IPv4 address information 188 * @addr4: the IPv4 address information
190 * @addr6: the IPv6 address information 189 * @addr6: the IPv6 address information
191 * @result: the result code 190 * @result: the result code
192 * @audit_info: NetLabel audit information 191 * @audit_info: NetLabel audit information
193 * 192 *
194 * Description: 193 * Description:
195 * Generate an audit record for adding a new NetLabel/LSM mapping entry with 194 * Generate an audit record for adding a new NetLabel/LSM mapping entry with
196 * the given information. Caller is responsible for holding the necessary 195 * the given information. Caller is responsible for holding the necessary
197 * locks. 196 * locks.
198 * 197 *
199 */ 198 */
200 static void netlbl_domhsh_audit_add(struct netlbl_dom_map *entry, 199 static void netlbl_domhsh_audit_add(struct netlbl_dom_map *entry,
201 struct netlbl_af4list *addr4, 200 struct netlbl_af4list *addr4,
202 struct netlbl_af6list *addr6, 201 struct netlbl_af6list *addr6,
203 int result, 202 int result,
204 struct netlbl_audit *audit_info) 203 struct netlbl_audit *audit_info)
205 { 204 {
206 struct audit_buffer *audit_buf; 205 struct audit_buffer *audit_buf;
207 struct cipso_v4_doi *cipsov4 = NULL; 206 struct cipso_v4_doi *cipsov4 = NULL;
208 u32 type; 207 u32 type;
209 208
210 audit_buf = netlbl_audit_start_common(AUDIT_MAC_MAP_ADD, audit_info); 209 audit_buf = netlbl_audit_start_common(AUDIT_MAC_MAP_ADD, audit_info);
211 if (audit_buf != NULL) { 210 if (audit_buf != NULL) {
212 audit_log_format(audit_buf, " nlbl_domain=%s", 211 audit_log_format(audit_buf, " nlbl_domain=%s",
213 entry->domain ? entry->domain : "(default)"); 212 entry->domain ? entry->domain : "(default)");
214 if (addr4 != NULL) { 213 if (addr4 != NULL) {
215 struct netlbl_domaddr4_map *map4; 214 struct netlbl_domaddr4_map *map4;
216 map4 = netlbl_domhsh_addr4_entry(addr4); 215 map4 = netlbl_domhsh_addr4_entry(addr4);
217 type = map4->type; 216 type = map4->type;
218 cipsov4 = map4->type_def.cipsov4; 217 cipsov4 = map4->type_def.cipsov4;
219 netlbl_af4list_audit_addr(audit_buf, 0, NULL, 218 netlbl_af4list_audit_addr(audit_buf, 0, NULL,
220 addr4->addr, addr4->mask); 219 addr4->addr, addr4->mask);
221 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) 220 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
222 } else if (addr6 != NULL) { 221 } else if (addr6 != NULL) {
223 struct netlbl_domaddr6_map *map6; 222 struct netlbl_domaddr6_map *map6;
224 map6 = netlbl_domhsh_addr6_entry(addr6); 223 map6 = netlbl_domhsh_addr6_entry(addr6);
225 type = map6->type; 224 type = map6->type;
226 netlbl_af6list_audit_addr(audit_buf, 0, NULL, 225 netlbl_af6list_audit_addr(audit_buf, 0, NULL,
227 &addr6->addr, &addr6->mask); 226 &addr6->addr, &addr6->mask);
228 #endif /* IPv6 */ 227 #endif /* IPv6 */
229 } else { 228 } else {
230 type = entry->type; 229 type = entry->type;
231 cipsov4 = entry->type_def.cipsov4; 230 cipsov4 = entry->type_def.cipsov4;
232 } 231 }
233 switch (type) { 232 switch (type) {
234 case NETLBL_NLTYPE_UNLABELED: 233 case NETLBL_NLTYPE_UNLABELED:
235 audit_log_format(audit_buf, " nlbl_protocol=unlbl"); 234 audit_log_format(audit_buf, " nlbl_protocol=unlbl");
236 break; 235 break;
237 case NETLBL_NLTYPE_CIPSOV4: 236 case NETLBL_NLTYPE_CIPSOV4:
238 BUG_ON(cipsov4 == NULL); 237 BUG_ON(cipsov4 == NULL);
239 audit_log_format(audit_buf, 238 audit_log_format(audit_buf,
240 " nlbl_protocol=cipsov4 cipso_doi=%u", 239 " nlbl_protocol=cipsov4 cipso_doi=%u",
241 cipsov4->doi); 240 cipsov4->doi);
242 break; 241 break;
243 } 242 }
244 audit_log_format(audit_buf, " res=%u", result == 0 ? 1 : 0); 243 audit_log_format(audit_buf, " res=%u", result == 0 ? 1 : 0);
245 audit_log_end(audit_buf); 244 audit_log_end(audit_buf);
246 } 245 }
247 } 246 }
248 247
249 /* 248 /*
250 * Domain Hash Table Functions 249 * Domain Hash Table Functions
251 */ 250 */
252 251
253 /** 252 /**
254 * netlbl_domhsh_init - Init for the domain hash 253 * netlbl_domhsh_init - Init for the domain hash
255 * @size: the number of bits to use for the hash buckets 254 * @size: the number of bits to use for the hash buckets
256 * 255 *
257 * Description: 256 * Description:
258 * Initializes the domain hash table, should be called only by 257 * Initializes the domain hash table, should be called only by
259 * netlbl_user_init() during initialization. Returns zero on success, non-zero 258 * netlbl_user_init() during initialization. Returns zero on success, non-zero
260 * values on error. 259 * values on error.
261 * 260 *
262 */ 261 */
263 int __init netlbl_domhsh_init(u32 size) 262 int __init netlbl_domhsh_init(u32 size)
264 { 263 {
265 u32 iter; 264 u32 iter;
266 struct netlbl_domhsh_tbl *hsh_tbl; 265 struct netlbl_domhsh_tbl *hsh_tbl;
267 266
268 if (size == 0) 267 if (size == 0)
269 return -EINVAL; 268 return -EINVAL;
270 269
271 hsh_tbl = kmalloc(sizeof(*hsh_tbl), GFP_KERNEL); 270 hsh_tbl = kmalloc(sizeof(*hsh_tbl), GFP_KERNEL);
272 if (hsh_tbl == NULL) 271 if (hsh_tbl == NULL)
273 return -ENOMEM; 272 return -ENOMEM;
274 hsh_tbl->size = 1 << size; 273 hsh_tbl->size = 1 << size;
275 hsh_tbl->tbl = kcalloc(hsh_tbl->size, 274 hsh_tbl->tbl = kcalloc(hsh_tbl->size,
276 sizeof(struct list_head), 275 sizeof(struct list_head),
277 GFP_KERNEL); 276 GFP_KERNEL);
278 if (hsh_tbl->tbl == NULL) { 277 if (hsh_tbl->tbl == NULL) {
279 kfree(hsh_tbl); 278 kfree(hsh_tbl);
280 return -ENOMEM; 279 return -ENOMEM;
281 } 280 }
282 for (iter = 0; iter < hsh_tbl->size; iter++) 281 for (iter = 0; iter < hsh_tbl->size; iter++)
283 INIT_LIST_HEAD(&hsh_tbl->tbl[iter]); 282 INIT_LIST_HEAD(&hsh_tbl->tbl[iter]);
284 283
285 spin_lock(&netlbl_domhsh_lock); 284 spin_lock(&netlbl_domhsh_lock);
286 rcu_assign_pointer(netlbl_domhsh, hsh_tbl); 285 rcu_assign_pointer(netlbl_domhsh, hsh_tbl);
287 spin_unlock(&netlbl_domhsh_lock); 286 spin_unlock(&netlbl_domhsh_lock);
288 287
289 return 0; 288 return 0;
290 } 289 }
291 290
292 /** 291 /**
293 * netlbl_domhsh_add - Adds an entry to the domain hash table 292 * netlbl_domhsh_add - Adds an entry to the domain hash table
294 * @entry: the entry to add 293 * @entry: the entry to add
295 * @audit_info: NetLabel audit information 294 * @audit_info: NetLabel audit information
296 * 295 *
297 * Description: 296 * Description:
298 * Adds a new entry to the domain hash table and handles any updates to the 297 * Adds a new entry to the domain hash table and handles any updates to the
299 * lower level protocol handler (i.e. CIPSO). Returns zero on success, 298 * lower level protocol handler (i.e. CIPSO). Returns zero on success,
300 * negative on failure. 299 * negative on failure.
301 * 300 *
302 */ 301 */
303 int netlbl_domhsh_add(struct netlbl_dom_map *entry, 302 int netlbl_domhsh_add(struct netlbl_dom_map *entry,
304 struct netlbl_audit *audit_info) 303 struct netlbl_audit *audit_info)
305 { 304 {
306 int ret_val = 0; 305 int ret_val = 0;
307 struct netlbl_dom_map *entry_old; 306 struct netlbl_dom_map *entry_old;
308 struct netlbl_af4list *iter4; 307 struct netlbl_af4list *iter4;
309 struct netlbl_af4list *tmp4; 308 struct netlbl_af4list *tmp4;
310 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) 309 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
311 struct netlbl_af6list *iter6; 310 struct netlbl_af6list *iter6;
312 struct netlbl_af6list *tmp6; 311 struct netlbl_af6list *tmp6;
313 #endif /* IPv6 */ 312 #endif /* IPv6 */
314 313
315 /* XXX - we can remove this RCU read lock as the spinlock protects the 314 /* XXX - we can remove this RCU read lock as the spinlock protects the
316 * entire function, but before we do we need to fixup the 315 * entire function, but before we do we need to fixup the
317 * netlbl_af[4,6]list RCU functions to do "the right thing" with 316 * netlbl_af[4,6]list RCU functions to do "the right thing" with
318 * respect to rcu_dereference() when only a spinlock is held. */ 317 * respect to rcu_dereference() when only a spinlock is held. */
319 rcu_read_lock(); 318 rcu_read_lock();
320 spin_lock(&netlbl_domhsh_lock); 319 spin_lock(&netlbl_domhsh_lock);
321 if (entry->domain != NULL) 320 if (entry->domain != NULL)
322 entry_old = netlbl_domhsh_search(entry->domain); 321 entry_old = netlbl_domhsh_search(entry->domain);
323 else 322 else
324 entry_old = netlbl_domhsh_search_def(entry->domain); 323 entry_old = netlbl_domhsh_search_def(entry->domain);
325 if (entry_old == NULL) { 324 if (entry_old == NULL) {
326 entry->valid = 1; 325 entry->valid = 1;
327 326
328 if (entry->domain != NULL) { 327 if (entry->domain != NULL) {
329 u32 bkt = netlbl_domhsh_hash(entry->domain); 328 u32 bkt = netlbl_domhsh_hash(entry->domain);
330 list_add_tail_rcu(&entry->list, 329 list_add_tail_rcu(&entry->list,
331 &rcu_dereference(netlbl_domhsh)->tbl[bkt]); 330 &rcu_dereference(netlbl_domhsh)->tbl[bkt]);
332 } else { 331 } else {
333 INIT_LIST_HEAD(&entry->list); 332 INIT_LIST_HEAD(&entry->list);
334 rcu_assign_pointer(netlbl_domhsh_def, entry); 333 rcu_assign_pointer(netlbl_domhsh_def, entry);
335 } 334 }
336 335
337 if (entry->type == NETLBL_NLTYPE_ADDRSELECT) { 336 if (entry->type == NETLBL_NLTYPE_ADDRSELECT) {
338 netlbl_af4list_foreach_rcu(iter4, 337 netlbl_af4list_foreach_rcu(iter4,
339 &entry->type_def.addrsel->list4) 338 &entry->type_def.addrsel->list4)
340 netlbl_domhsh_audit_add(entry, iter4, NULL, 339 netlbl_domhsh_audit_add(entry, iter4, NULL,
341 ret_val, audit_info); 340 ret_val, audit_info);
342 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) 341 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
343 netlbl_af6list_foreach_rcu(iter6, 342 netlbl_af6list_foreach_rcu(iter6,
344 &entry->type_def.addrsel->list6) 343 &entry->type_def.addrsel->list6)
345 netlbl_domhsh_audit_add(entry, NULL, iter6, 344 netlbl_domhsh_audit_add(entry, NULL, iter6,
346 ret_val, audit_info); 345 ret_val, audit_info);
347 #endif /* IPv6 */ 346 #endif /* IPv6 */
348 } else 347 } else
349 netlbl_domhsh_audit_add(entry, NULL, NULL, 348 netlbl_domhsh_audit_add(entry, NULL, NULL,
350 ret_val, audit_info); 349 ret_val, audit_info);
351 } else if (entry_old->type == NETLBL_NLTYPE_ADDRSELECT && 350 } else if (entry_old->type == NETLBL_NLTYPE_ADDRSELECT &&
352 entry->type == NETLBL_NLTYPE_ADDRSELECT) { 351 entry->type == NETLBL_NLTYPE_ADDRSELECT) {
353 struct list_head *old_list4; 352 struct list_head *old_list4;
354 struct list_head *old_list6; 353 struct list_head *old_list6;
355 354
356 old_list4 = &entry_old->type_def.addrsel->list4; 355 old_list4 = &entry_old->type_def.addrsel->list4;
357 old_list6 = &entry_old->type_def.addrsel->list6; 356 old_list6 = &entry_old->type_def.addrsel->list6;
358 357
359 /* we only allow the addition of address selectors if all of 358 /* we only allow the addition of address selectors if all of
360 * the selectors do not exist in the existing domain map */ 359 * the selectors do not exist in the existing domain map */
361 netlbl_af4list_foreach_rcu(iter4, 360 netlbl_af4list_foreach_rcu(iter4,
362 &entry->type_def.addrsel->list4) 361 &entry->type_def.addrsel->list4)
363 if (netlbl_af4list_search_exact(iter4->addr, 362 if (netlbl_af4list_search_exact(iter4->addr,
364 iter4->mask, 363 iter4->mask,
365 old_list4)) { 364 old_list4)) {
366 ret_val = -EEXIST; 365 ret_val = -EEXIST;
367 goto add_return; 366 goto add_return;
368 } 367 }
369 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) 368 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
370 netlbl_af6list_foreach_rcu(iter6, 369 netlbl_af6list_foreach_rcu(iter6,
371 &entry->type_def.addrsel->list6) 370 &entry->type_def.addrsel->list6)
372 if (netlbl_af6list_search_exact(&iter6->addr, 371 if (netlbl_af6list_search_exact(&iter6->addr,
373 &iter6->mask, 372 &iter6->mask,
374 old_list6)) { 373 old_list6)) {
375 ret_val = -EEXIST; 374 ret_val = -EEXIST;
376 goto add_return; 375 goto add_return;
377 } 376 }
378 #endif /* IPv6 */ 377 #endif /* IPv6 */
379 378
380 netlbl_af4list_foreach_safe(iter4, tmp4, 379 netlbl_af4list_foreach_safe(iter4, tmp4,
381 &entry->type_def.addrsel->list4) { 380 &entry->type_def.addrsel->list4) {
382 netlbl_af4list_remove_entry(iter4); 381 netlbl_af4list_remove_entry(iter4);
383 iter4->valid = 1; 382 iter4->valid = 1;
384 ret_val = netlbl_af4list_add(iter4, old_list4); 383 ret_val = netlbl_af4list_add(iter4, old_list4);
385 netlbl_domhsh_audit_add(entry_old, iter4, NULL, 384 netlbl_domhsh_audit_add(entry_old, iter4, NULL,
386 ret_val, audit_info); 385 ret_val, audit_info);
387 if (ret_val != 0) 386 if (ret_val != 0)
388 goto add_return; 387 goto add_return;
389 } 388 }
390 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) 389 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
391 netlbl_af6list_foreach_safe(iter6, tmp6, 390 netlbl_af6list_foreach_safe(iter6, tmp6,
392 &entry->type_def.addrsel->list6) { 391 &entry->type_def.addrsel->list6) {
393 netlbl_af6list_remove_entry(iter6); 392 netlbl_af6list_remove_entry(iter6);
394 iter6->valid = 1; 393 iter6->valid = 1;
395 ret_val = netlbl_af6list_add(iter6, old_list6); 394 ret_val = netlbl_af6list_add(iter6, old_list6);
396 netlbl_domhsh_audit_add(entry_old, NULL, iter6, 395 netlbl_domhsh_audit_add(entry_old, NULL, iter6,
397 ret_val, audit_info); 396 ret_val, audit_info);
398 if (ret_val != 0) 397 if (ret_val != 0)
399 goto add_return; 398 goto add_return;
400 } 399 }
401 #endif /* IPv6 */ 400 #endif /* IPv6 */
402 } else 401 } else
403 ret_val = -EINVAL; 402 ret_val = -EINVAL;
404 403
405 add_return: 404 add_return:
406 spin_unlock(&netlbl_domhsh_lock); 405 spin_unlock(&netlbl_domhsh_lock);
407 rcu_read_unlock(); 406 rcu_read_unlock();
408 return ret_val; 407 return ret_val;
409 } 408 }
410 409
411 /** 410 /**
412 * netlbl_domhsh_add_default - Adds the default entry to the domain hash table 411 * netlbl_domhsh_add_default - Adds the default entry to the domain hash table
413 * @entry: the entry to add 412 * @entry: the entry to add
414 * @audit_info: NetLabel audit information 413 * @audit_info: NetLabel audit information
415 * 414 *
416 * Description: 415 * Description:
417 * Adds a new default entry to the domain hash table and handles any updates 416 * Adds a new default entry to the domain hash table and handles any updates
418 * to the lower level protocol handler (i.e. CIPSO). Returns zero on success, 417 * to the lower level protocol handler (i.e. CIPSO). Returns zero on success,
419 * negative on failure. 418 * negative on failure.
420 * 419 *
421 */ 420 */
422 int netlbl_domhsh_add_default(struct netlbl_dom_map *entry, 421 int netlbl_domhsh_add_default(struct netlbl_dom_map *entry,
423 struct netlbl_audit *audit_info) 422 struct netlbl_audit *audit_info)
424 { 423 {
425 return netlbl_domhsh_add(entry, audit_info); 424 return netlbl_domhsh_add(entry, audit_info);
426 } 425 }
427 426
428 /** 427 /**
429 * netlbl_domhsh_remove_entry - Removes a given entry from the domain table 428 * netlbl_domhsh_remove_entry - Removes a given entry from the domain table
430 * @entry: the entry to remove 429 * @entry: the entry to remove
431 * @audit_info: NetLabel audit information 430 * @audit_info: NetLabel audit information
432 * 431 *
433 * Description: 432 * Description:
434 * Removes an entry from the domain hash table and handles any updates to the 433 * Removes an entry from the domain hash table and handles any updates to the
435 * lower level protocol handler (i.e. CIPSO). Caller is responsible for 434 * lower level protocol handler (i.e. CIPSO). Caller is responsible for
436 * ensuring that the RCU read lock is held. Returns zero on success, negative 435 * ensuring that the RCU read lock is held. Returns zero on success, negative
437 * on failure. 436 * on failure.
438 * 437 *
439 */ 438 */
440 int netlbl_domhsh_remove_entry(struct netlbl_dom_map *entry, 439 int netlbl_domhsh_remove_entry(struct netlbl_dom_map *entry,
441 struct netlbl_audit *audit_info) 440 struct netlbl_audit *audit_info)
442 { 441 {
443 int ret_val = 0; 442 int ret_val = 0;
444 struct audit_buffer *audit_buf; 443 struct audit_buffer *audit_buf;
445 444
446 if (entry == NULL) 445 if (entry == NULL)
447 return -ENOENT; 446 return -ENOENT;
448 447
449 spin_lock(&netlbl_domhsh_lock); 448 spin_lock(&netlbl_domhsh_lock);
450 if (entry->valid) { 449 if (entry->valid) {
451 entry->valid = 0; 450 entry->valid = 0;
452 if (entry != rcu_dereference(netlbl_domhsh_def)) 451 if (entry != rcu_dereference(netlbl_domhsh_def))
453 list_del_rcu(&entry->list); 452 list_del_rcu(&entry->list);
454 else 453 else
455 rcu_assign_pointer(netlbl_domhsh_def, NULL); 454 rcu_assign_pointer(netlbl_domhsh_def, NULL);
456 } else 455 } else
457 ret_val = -ENOENT; 456 ret_val = -ENOENT;
458 spin_unlock(&netlbl_domhsh_lock); 457 spin_unlock(&netlbl_domhsh_lock);
459 458
460 audit_buf = netlbl_audit_start_common(AUDIT_MAC_MAP_DEL, audit_info); 459 audit_buf = netlbl_audit_start_common(AUDIT_MAC_MAP_DEL, audit_info);
461 if (audit_buf != NULL) { 460 if (audit_buf != NULL) {
462 audit_log_format(audit_buf, 461 audit_log_format(audit_buf,
463 " nlbl_domain=%s res=%u", 462 " nlbl_domain=%s res=%u",
464 entry->domain ? entry->domain : "(default)", 463 entry->domain ? entry->domain : "(default)",
465 ret_val == 0 ? 1 : 0); 464 ret_val == 0 ? 1 : 0);
466 audit_log_end(audit_buf); 465 audit_log_end(audit_buf);
467 } 466 }
468 467
469 if (ret_val == 0) { 468 if (ret_val == 0) {
470 struct netlbl_af4list *iter4; 469 struct netlbl_af4list *iter4;
471 struct netlbl_domaddr4_map *map4; 470 struct netlbl_domaddr4_map *map4;
472 471
473 switch (entry->type) { 472 switch (entry->type) {
474 case NETLBL_NLTYPE_ADDRSELECT: 473 case NETLBL_NLTYPE_ADDRSELECT:
475 netlbl_af4list_foreach_rcu(iter4, 474 netlbl_af4list_foreach_rcu(iter4,
476 &entry->type_def.addrsel->list4) { 475 &entry->type_def.addrsel->list4) {
477 map4 = netlbl_domhsh_addr4_entry(iter4); 476 map4 = netlbl_domhsh_addr4_entry(iter4);
478 cipso_v4_doi_putdef(map4->type_def.cipsov4); 477 cipso_v4_doi_putdef(map4->type_def.cipsov4);
479 } 478 }
480 /* no need to check the IPv6 list since we currently 479 /* no need to check the IPv6 list since we currently
481 * support only unlabeled protocols for IPv6 */ 480 * support only unlabeled protocols for IPv6 */
482 break; 481 break;
483 case NETLBL_NLTYPE_CIPSOV4: 482 case NETLBL_NLTYPE_CIPSOV4:
484 cipso_v4_doi_putdef(entry->type_def.cipsov4); 483 cipso_v4_doi_putdef(entry->type_def.cipsov4);
485 break; 484 break;
486 } 485 }
487 call_rcu(&entry->rcu, netlbl_domhsh_free_entry); 486 call_rcu(&entry->rcu, netlbl_domhsh_free_entry);
488 } 487 }
489 488
490 return ret_val; 489 return ret_val;
491 } 490 }
492 491
493 /** 492 /**
494 * netlbl_domhsh_remove_af4 - Removes an address selector entry 493 * netlbl_domhsh_remove_af4 - Removes an address selector entry
495 * @domain: the domain 494 * @domain: the domain
496 * @addr: IPv4 address 495 * @addr: IPv4 address
497 * @mask: IPv4 address mask 496 * @mask: IPv4 address mask
498 * @audit_info: NetLabel audit information 497 * @audit_info: NetLabel audit information
499 * 498 *
500 * Description: 499 * Description:
501 * Removes an individual address selector from a domain mapping and potentially 500 * Removes an individual address selector from a domain mapping and potentially
502 * the entire mapping if it is empty. Returns zero on success, negative values 501 * the entire mapping if it is empty. Returns zero on success, negative values
503 * on failure. 502 * on failure.
504 * 503 *
505 */ 504 */
506 int netlbl_domhsh_remove_af4(const char *domain, 505 int netlbl_domhsh_remove_af4(const char *domain,
507 const struct in_addr *addr, 506 const struct in_addr *addr,
508 const struct in_addr *mask, 507 const struct in_addr *mask,
509 struct netlbl_audit *audit_info) 508 struct netlbl_audit *audit_info)
510 { 509 {
511 struct netlbl_dom_map *entry_map; 510 struct netlbl_dom_map *entry_map;
512 struct netlbl_af4list *entry_addr; 511 struct netlbl_af4list *entry_addr;
513 struct netlbl_af4list *iter4; 512 struct netlbl_af4list *iter4;
514 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) 513 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
515 struct netlbl_af6list *iter6; 514 struct netlbl_af6list *iter6;
516 #endif /* IPv6 */ 515 #endif /* IPv6 */
517 struct netlbl_domaddr4_map *entry; 516 struct netlbl_domaddr4_map *entry;
518 517
519 rcu_read_lock(); 518 rcu_read_lock();
520 519
521 if (domain) 520 if (domain)
522 entry_map = netlbl_domhsh_search(domain); 521 entry_map = netlbl_domhsh_search(domain);
523 else 522 else
524 entry_map = netlbl_domhsh_search_def(domain); 523 entry_map = netlbl_domhsh_search_def(domain);
525 if (entry_map == NULL || entry_map->type != NETLBL_NLTYPE_ADDRSELECT) 524 if (entry_map == NULL || entry_map->type != NETLBL_NLTYPE_ADDRSELECT)
526 goto remove_af4_failure; 525 goto remove_af4_failure;
527 526
528 spin_lock(&netlbl_domhsh_lock); 527 spin_lock(&netlbl_domhsh_lock);
529 entry_addr = netlbl_af4list_remove(addr->s_addr, mask->s_addr, 528 entry_addr = netlbl_af4list_remove(addr->s_addr, mask->s_addr,
530 &entry_map->type_def.addrsel->list4); 529 &entry_map->type_def.addrsel->list4);
531 spin_unlock(&netlbl_domhsh_lock); 530 spin_unlock(&netlbl_domhsh_lock);
532 531
533 if (entry_addr == NULL) 532 if (entry_addr == NULL)
534 goto remove_af4_failure; 533 goto remove_af4_failure;
535 netlbl_af4list_foreach_rcu(iter4, &entry_map->type_def.addrsel->list4) 534 netlbl_af4list_foreach_rcu(iter4, &entry_map->type_def.addrsel->list4)
536 goto remove_af4_single_addr; 535 goto remove_af4_single_addr;
537 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) 536 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
538 netlbl_af6list_foreach_rcu(iter6, &entry_map->type_def.addrsel->list6) 537 netlbl_af6list_foreach_rcu(iter6, &entry_map->type_def.addrsel->list6)
539 goto remove_af4_single_addr; 538 goto remove_af4_single_addr;
540 #endif /* IPv6 */ 539 #endif /* IPv6 */
541 /* the domain mapping is empty so remove it from the mapping table */ 540 /* the domain mapping is empty so remove it from the mapping table */
542 netlbl_domhsh_remove_entry(entry_map, audit_info); 541 netlbl_domhsh_remove_entry(entry_map, audit_info);
543 542
544 remove_af4_single_addr: 543 remove_af4_single_addr:
545 rcu_read_unlock(); 544 rcu_read_unlock();
546 /* yick, we can't use call_rcu here because we don't have a rcu head 545 /* yick, we can't use call_rcu here because we don't have a rcu head
547 * pointer but hopefully this should be a rare case so the pause 546 * pointer but hopefully this should be a rare case so the pause
548 * shouldn't be a problem */ 547 * shouldn't be a problem */
549 synchronize_rcu(); 548 synchronize_rcu();
550 entry = netlbl_domhsh_addr4_entry(entry_addr); 549 entry = netlbl_domhsh_addr4_entry(entry_addr);
551 cipso_v4_doi_putdef(entry->type_def.cipsov4); 550 cipso_v4_doi_putdef(entry->type_def.cipsov4);
552 kfree(entry); 551 kfree(entry);
553 return 0; 552 return 0;
554 553
555 remove_af4_failure: 554 remove_af4_failure:
556 rcu_read_unlock(); 555 rcu_read_unlock();
557 return -ENOENT; 556 return -ENOENT;
558 } 557 }
559 558
560 /** 559 /**
561 * netlbl_domhsh_remove - Removes an entry from the domain hash table 560 * netlbl_domhsh_remove - Removes an entry from the domain hash table
562 * @domain: the domain to remove 561 * @domain: the domain to remove
563 * @audit_info: NetLabel audit information 562 * @audit_info: NetLabel audit information
564 * 563 *
565 * Description: 564 * Description:
566 * Removes an entry from the domain hash table and handles any updates to the 565 * Removes an entry from the domain hash table and handles any updates to the
567 * lower level protocol handler (i.e. CIPSO). Returns zero on success, 566 * lower level protocol handler (i.e. CIPSO). Returns zero on success,
568 * negative on failure. 567 * negative on failure.
569 * 568 *
570 */ 569 */
571 int netlbl_domhsh_remove(const char *domain, struct netlbl_audit *audit_info) 570 int netlbl_domhsh_remove(const char *domain, struct netlbl_audit *audit_info)
572 { 571 {
573 int ret_val; 572 int ret_val;
574 struct netlbl_dom_map *entry; 573 struct netlbl_dom_map *entry;
575 574
576 rcu_read_lock(); 575 rcu_read_lock();
577 if (domain) 576 if (domain)
578 entry = netlbl_domhsh_search(domain); 577 entry = netlbl_domhsh_search(domain);
579 else 578 else
580 entry = netlbl_domhsh_search_def(domain); 579 entry = netlbl_domhsh_search_def(domain);
581 ret_val = netlbl_domhsh_remove_entry(entry, audit_info); 580 ret_val = netlbl_domhsh_remove_entry(entry, audit_info);
582 rcu_read_unlock(); 581 rcu_read_unlock();
583 582
584 return ret_val; 583 return ret_val;
585 } 584 }
586 585
587 /** 586 /**
588 * netlbl_domhsh_remove_default - Removes the default entry from the table 587 * netlbl_domhsh_remove_default - Removes the default entry from the table
589 * @audit_info: NetLabel audit information 588 * @audit_info: NetLabel audit information
590 * 589 *
591 * Description: 590 * Description:
592 * Removes/resets the default entry for the domain hash table and handles any 591 * Removes/resets the default entry for the domain hash table and handles any
593 * updates to the lower level protocol handler (i.e. CIPSO). Returns zero on 592 * updates to the lower level protocol handler (i.e. CIPSO). Returns zero on
594 * success, non-zero on failure. 593 * success, non-zero on failure.
595 * 594 *
596 */ 595 */
597 int netlbl_domhsh_remove_default(struct netlbl_audit *audit_info) 596 int netlbl_domhsh_remove_default(struct netlbl_audit *audit_info)
598 { 597 {
599 return netlbl_domhsh_remove(NULL, audit_info); 598 return netlbl_domhsh_remove(NULL, audit_info);
600 } 599 }
601 600
602 /** 601 /**
603 * netlbl_domhsh_getentry - Get an entry from the domain hash table 602 * netlbl_domhsh_getentry - Get an entry from the domain hash table
604 * @domain: the domain name to search for 603 * @domain: the domain name to search for
605 * 604 *
606 * Description: 605 * Description:
607 * Look through the domain hash table searching for an entry to match @domain, 606 * Look through the domain hash table searching for an entry to match @domain,
608 * return a pointer to a copy of the entry or NULL. The caller is responsible 607 * return a pointer to a copy of the entry or NULL. The caller is responsible
609 * for ensuring that rcu_read_[un]lock() is called. 608 * for ensuring that rcu_read_[un]lock() is called.
610 * 609 *
611 */ 610 */
612 struct netlbl_dom_map *netlbl_domhsh_getentry(const char *domain) 611 struct netlbl_dom_map *netlbl_domhsh_getentry(const char *domain)
613 { 612 {
614 return netlbl_domhsh_search_def(domain); 613 return netlbl_domhsh_search_def(domain);
615 } 614 }
616 615
617 /** 616 /**
618 * netlbl_domhsh_getentry_af4 - Get an entry from the domain hash table 617 * netlbl_domhsh_getentry_af4 - Get an entry from the domain hash table
619 * @domain: the domain name to search for 618 * @domain: the domain name to search for
620 * @addr: the IP address to search for 619 * @addr: the IP address to search for
621 * 620 *
622 * Description: 621 * Description:
623 * Look through the domain hash table searching for an entry to match @domain 622 * Look through the domain hash table searching for an entry to match @domain
624 * and @addr, return a pointer to a copy of the entry or NULL. The caller is 623 * and @addr, return a pointer to a copy of the entry or NULL. The caller is
625 * responsible for ensuring that rcu_read_[un]lock() is called. 624 * responsible for ensuring that rcu_read_[un]lock() is called.
626 * 625 *
627 */ 626 */
628 struct netlbl_domaddr4_map *netlbl_domhsh_getentry_af4(const char *domain, 627 struct netlbl_domaddr4_map *netlbl_domhsh_getentry_af4(const char *domain,
629 __be32 addr) 628 __be32 addr)
630 { 629 {
631 struct netlbl_dom_map *dom_iter; 630 struct netlbl_dom_map *dom_iter;
632 struct netlbl_af4list *addr_iter; 631 struct netlbl_af4list *addr_iter;
633 632
634 dom_iter = netlbl_domhsh_search_def(domain); 633 dom_iter = netlbl_domhsh_search_def(domain);
635 if (dom_iter == NULL) 634 if (dom_iter == NULL)
636 return NULL; 635 return NULL;
637 if (dom_iter->type != NETLBL_NLTYPE_ADDRSELECT) 636 if (dom_iter->type != NETLBL_NLTYPE_ADDRSELECT)
638 return NULL; 637 return NULL;
639 638
640 addr_iter = netlbl_af4list_search(addr, 639 addr_iter = netlbl_af4list_search(addr,
641 &dom_iter->type_def.addrsel->list4); 640 &dom_iter->type_def.addrsel->list4);
642 if (addr_iter == NULL) 641 if (addr_iter == NULL)
643 return NULL; 642 return NULL;
644 643
645 return netlbl_domhsh_addr4_entry(addr_iter); 644 return netlbl_domhsh_addr4_entry(addr_iter);
646 } 645 }
647 646
648 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) 647 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
649 /** 648 /**
650 * netlbl_domhsh_getentry_af6 - Get an entry from the domain hash table 649 * netlbl_domhsh_getentry_af6 - Get an entry from the domain hash table
651 * @domain: the domain name to search for 650 * @domain: the domain name to search for
652 * @addr: the IP address to search for 651 * @addr: the IP address to search for
653 * 652 *
654 * Description: 653 * Description:
655 * Look through the domain hash table searching for an entry to match @domain 654 * Look through the domain hash table searching for an entry to match @domain
656 * and @addr, return a pointer to a copy of the entry or NULL. The caller is 655 * and @addr, return a pointer to a copy of the entry or NULL. The caller is
657 * responsible for ensuring that rcu_read_[un]lock() is called. 656 * responsible for ensuring that rcu_read_[un]lock() is called.
658 * 657 *
659 */ 658 */
660 struct netlbl_domaddr6_map *netlbl_domhsh_getentry_af6(const char *domain, 659 struct netlbl_domaddr6_map *netlbl_domhsh_getentry_af6(const char *domain,
661 const struct in6_addr *addr) 660 const struct in6_addr *addr)
662 { 661 {
663 struct netlbl_dom_map *dom_iter; 662 struct netlbl_dom_map *dom_iter;
664 struct netlbl_af6list *addr_iter; 663 struct netlbl_af6list *addr_iter;
665 664
666 dom_iter = netlbl_domhsh_search_def(domain); 665 dom_iter = netlbl_domhsh_search_def(domain);
667 if (dom_iter == NULL) 666 if (dom_iter == NULL)
668 return NULL; 667 return NULL;
669 if (dom_iter->type != NETLBL_NLTYPE_ADDRSELECT) 668 if (dom_iter->type != NETLBL_NLTYPE_ADDRSELECT)
670 return NULL; 669 return NULL;
671 670
672 addr_iter = netlbl_af6list_search(addr, 671 addr_iter = netlbl_af6list_search(addr,
673 &dom_iter->type_def.addrsel->list6); 672 &dom_iter->type_def.addrsel->list6);
674 if (addr_iter == NULL) 673 if (addr_iter == NULL)
675 return NULL; 674 return NULL;
676 675
677 return netlbl_domhsh_addr6_entry(addr_iter); 676 return netlbl_domhsh_addr6_entry(addr_iter);
678 } 677 }
679 #endif /* IPv6 */ 678 #endif /* IPv6 */
680 679
681 /** 680 /**
682 * netlbl_domhsh_walk - Iterate through the domain mapping hash table 681 * netlbl_domhsh_walk - Iterate through the domain mapping hash table
683 * @skip_bkt: the number of buckets to skip at the start 682 * @skip_bkt: the number of buckets to skip at the start
684 * @skip_chain: the number of entries to skip in the first iterated bucket 683 * @skip_chain: the number of entries to skip in the first iterated bucket
685 * @callback: callback for each entry 684 * @callback: callback for each entry
686 * @cb_arg: argument for the callback function 685 * @cb_arg: argument for the callback function
687 * 686 *
688 * Description: 687 * Description:
689 * Iterate over the domain mapping hash table, skipping the first @skip_bkt 688 * Iterate over the domain mapping hash table, skipping the first @skip_bkt
690 * buckets and @skip_chain entries. For each entry in the table call 689 * buckets and @skip_chain entries. For each entry in the table call
691 * @callback, if @callback returns a negative value stop 'walking' through the 690 * @callback, if @callback returns a negative value stop 'walking' through the
692 * table and return. Updates the values in @skip_bkt and @skip_chain on 691 * table and return. Updates the values in @skip_bkt and @skip_chain on
693 * return. Returns zero on success, negative values on failure. 692 * return. Returns zero on success, negative values on failure.
694 * 693 *
695 */ 694 */
696 int netlbl_domhsh_walk(u32 *skip_bkt, 695 int netlbl_domhsh_walk(u32 *skip_bkt,
697 u32 *skip_chain, 696 u32 *skip_chain,
698 int (*callback) (struct netlbl_dom_map *entry, void *arg), 697 int (*callback) (struct netlbl_dom_map *entry, void *arg),
699 void *cb_arg) 698 void *cb_arg)
700 { 699 {
701 int ret_val = -ENOENT; 700 int ret_val = -ENOENT;
702 u32 iter_bkt; 701 u32 iter_bkt;
703 struct list_head *iter_list; 702 struct list_head *iter_list;
704 struct netlbl_dom_map *iter_entry; 703 struct netlbl_dom_map *iter_entry;
705 u32 chain_cnt = 0; 704 u32 chain_cnt = 0;
706 705
707 rcu_read_lock(); 706 rcu_read_lock();
708 for (iter_bkt = *skip_bkt; 707 for (iter_bkt = *skip_bkt;
709 iter_bkt < rcu_dereference(netlbl_domhsh)->size; 708 iter_bkt < rcu_dereference(netlbl_domhsh)->size;
710 iter_bkt++, chain_cnt = 0) { 709 iter_bkt++, chain_cnt = 0) {
711 iter_list = &rcu_dereference(netlbl_domhsh)->tbl[iter_bkt]; 710 iter_list = &rcu_dereference(netlbl_domhsh)->tbl[iter_bkt];
712 list_for_each_entry_rcu(iter_entry, iter_list, list) 711 list_for_each_entry_rcu(iter_entry, iter_list, list)
713 if (iter_entry->valid) { 712 if (iter_entry->valid) {
714 if (chain_cnt++ < *skip_chain) 713 if (chain_cnt++ < *skip_chain)
715 continue; 714 continue;
716 ret_val = callback(iter_entry, cb_arg); 715 ret_val = callback(iter_entry, cb_arg);
717 if (ret_val < 0) { 716 if (ret_val < 0) {
718 chain_cnt--; 717 chain_cnt--;
719 goto walk_return; 718 goto walk_return;
720 } 719 }
721 } 720 }
722 } 721 }
723 722
724 walk_return: 723 walk_return:
725 rcu_read_unlock(); 724 rcu_read_unlock();
726 *skip_bkt = iter_bkt; 725 *skip_bkt = iter_bkt;
727 *skip_chain = chain_cnt; 726 *skip_chain = chain_cnt;
728 return ret_val; 727 return ret_val;
729 } 728 }
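A short usage sketch for the walk interface above; the callback and the caller are hypothetical and not part of this patch. The @skip_bkt/@skip_chain pair acts as a resumable cursor, a negative callback return stops the walk, and no extra locking is needed because netlbl_domhsh_walk() takes rcu_read_lock() itself.

#include <linux/kernel.h>
#include "netlabel_domainhash.h"

/* Hypothetical callback: log every valid mapping; a negative return value
 * here would stop the walk early. */
static int example_domhsh_cb(struct netlbl_dom_map *entry, void *arg)
{
	printk(KERN_DEBUG "netlabel: domain=%s\n",
	       entry->domain ? entry->domain : "(default)");
	return 0;
}

/* Hypothetical caller: walk the whole table from the beginning. */
static void example_domhsh_dump(void)
{
	u32 skip_bkt = 0;
	u32 skip_chain = 0;

	netlbl_domhsh_walk(&skip_bkt, &skip_chain, example_domhsh_cb, NULL);
}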
730 729
net/netlabel/netlabel_unlabeled.c
1 /* 1 /*
2 * NetLabel Unlabeled Support 2 * NetLabel Unlabeled Support
3 * 3 *
4 * This file defines functions for dealing with unlabeled packets for the 4 * This file defines functions for dealing with unlabeled packets for the
5 * NetLabel system. The NetLabel system manages static and dynamic label 5 * NetLabel system. The NetLabel system manages static and dynamic label
6 * mappings for network protocols such as CIPSO and RIPSO. 6 * mappings for network protocols such as CIPSO and RIPSO.
7 * 7 *
8 * Author: Paul Moore <paul.moore@hp.com> 8 * Author: Paul Moore <paul.moore@hp.com>
9 * 9 *
10 */ 10 */
11 11
12 /* 12 /*
13 * (c) Copyright Hewlett-Packard Development Company, L.P., 2006 - 2008 13 * (c) Copyright Hewlett-Packard Development Company, L.P., 2006 - 2008
14 * 14 *
15 * This program is free software; you can redistribute it and/or modify 15 * This program is free software; you can redistribute it and/or modify
16 * it under the terms of the GNU General Public License as published by 16 * it under the terms of the GNU General Public License as published by
17 * the Free Software Foundation; either version 2 of the License, or 17 * the Free Software Foundation; either version 2 of the License, or
18 * (at your option) any later version. 18 * (at your option) any later version.
19 * 19 *
20 * This program is distributed in the hope that it will be useful, 20 * This program is distributed in the hope that it will be useful,
21 * but WITHOUT ANY WARRANTY; without even the implied warranty of 21 * but WITHOUT ANY WARRANTY; without even the implied warranty of
22 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See 22 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
23 * the GNU General Public License for more details. 23 * the GNU General Public License for more details.
24 * 24 *
25 * You should have received a copy of the GNU General Public License 25 * You should have received a copy of the GNU General Public License
26 * along with this program; if not, write to the Free Software 26 * along with this program; if not, write to the Free Software
27 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 27 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
28 * 28 *
29 */ 29 */
30 30
31 #include <linux/types.h> 31 #include <linux/types.h>
32 #include <linux/rcupdate.h> 32 #include <linux/rcupdate.h>
33 #include <linux/list.h> 33 #include <linux/list.h>
34 #include <linux/spinlock.h> 34 #include <linux/spinlock.h>
35 #include <linux/socket.h> 35 #include <linux/socket.h>
36 #include <linux/string.h> 36 #include <linux/string.h>
37 #include <linux/skbuff.h> 37 #include <linux/skbuff.h>
38 #include <linux/audit.h> 38 #include <linux/audit.h>
39 #include <linux/in.h> 39 #include <linux/in.h>
40 #include <linux/in6.h> 40 #include <linux/in6.h>
41 #include <linux/ip.h> 41 #include <linux/ip.h>
42 #include <linux/ipv6.h> 42 #include <linux/ipv6.h>
43 #include <linux/notifier.h> 43 #include <linux/notifier.h>
44 #include <linux/netdevice.h> 44 #include <linux/netdevice.h>
45 #include <linux/security.h> 45 #include <linux/security.h>
46 #include <linux/slab.h> 46 #include <linux/slab.h>
47 #include <net/sock.h> 47 #include <net/sock.h>
48 #include <net/netlink.h> 48 #include <net/netlink.h>
49 #include <net/genetlink.h> 49 #include <net/genetlink.h>
50 #include <net/ip.h> 50 #include <net/ip.h>
51 #include <net/ipv6.h> 51 #include <net/ipv6.h>
52 #include <net/net_namespace.h> 52 #include <net/net_namespace.h>
53 #include <net/netlabel.h> 53 #include <net/netlabel.h>
54 #include <asm/bug.h> 54 #include <asm/bug.h>
55 #include <asm/atomic.h> 55 #include <asm/atomic.h>
56 56
57 #include "netlabel_user.h" 57 #include "netlabel_user.h"
58 #include "netlabel_addrlist.h" 58 #include "netlabel_addrlist.h"
59 #include "netlabel_domainhash.h" 59 #include "netlabel_domainhash.h"
60 #include "netlabel_unlabeled.h" 60 #include "netlabel_unlabeled.h"
61 #include "netlabel_mgmt.h" 61 #include "netlabel_mgmt.h"
62 62
63 /* NOTE: at present we always use init's network namespace since we don't 63 /* NOTE: at present we always use init's network namespace since we don't
64 * presently support different namespaces even though the majority of 64 * presently support different namespaces even though the majority of
65 * the functions in this file are "namespace safe" */ 65 * the functions in this file are "namespace safe" */
66 66
67 /* The unlabeled connection hash table which we use to map network interfaces 67 /* The unlabeled connection hash table which we use to map network interfaces
68 * and addresses of unlabeled packets to a user specified secid value for the 68 * and addresses of unlabeled packets to a user specified secid value for the
69 * LSM. The hash table is used to lookup the network interface entry 69 * LSM. The hash table is used to lookup the network interface entry
70 * (struct netlbl_unlhsh_iface) and then the interface entry is used to 70 * (struct netlbl_unlhsh_iface) and then the interface entry is used to
71 * lookup an IP address match from an ordered list. If a network interface 71 * lookup an IP address match from an ordered list. If a network interface
72 * match can not be found in the hash table then the default entry 72 * match can not be found in the hash table then the default entry
73 * (netlbl_unlhsh_def) is used. The IP address entry list 73 * (netlbl_unlhsh_def) is used. The IP address entry list
74 * (struct netlbl_unlhsh_addr) is ordered such that the entries with a 74 * (struct netlbl_unlhsh_addr) is ordered such that the entries with a
75 * larger netmask come first. 75 * larger netmask come first.
76 */ 76 */
77 struct netlbl_unlhsh_tbl { 77 struct netlbl_unlhsh_tbl {
78 struct list_head *tbl; 78 struct list_head *tbl;
79 u32 size; 79 u32 size;
80 }; 80 };
81 #define netlbl_unlhsh_addr4_entry(iter) \ 81 #define netlbl_unlhsh_addr4_entry(iter) \
82 container_of(iter, struct netlbl_unlhsh_addr4, list) 82 container_of(iter, struct netlbl_unlhsh_addr4, list)
83 struct netlbl_unlhsh_addr4 { 83 struct netlbl_unlhsh_addr4 {
84 u32 secid; 84 u32 secid;
85 85
86 struct netlbl_af4list list; 86 struct netlbl_af4list list;
87 struct rcu_head rcu; 87 struct rcu_head rcu;
88 }; 88 };
89 #define netlbl_unlhsh_addr6_entry(iter) \ 89 #define netlbl_unlhsh_addr6_entry(iter) \
90 container_of(iter, struct netlbl_unlhsh_addr6, list) 90 container_of(iter, struct netlbl_unlhsh_addr6, list)
91 struct netlbl_unlhsh_addr6 { 91 struct netlbl_unlhsh_addr6 {
92 u32 secid; 92 u32 secid;
93 93
94 struct netlbl_af6list list; 94 struct netlbl_af6list list;
95 struct rcu_head rcu; 95 struct rcu_head rcu;
96 }; 96 };
97 struct netlbl_unlhsh_iface { 97 struct netlbl_unlhsh_iface {
98 int ifindex; 98 int ifindex;
99 struct list_head addr4_list; 99 struct list_head addr4_list;
100 struct list_head addr6_list; 100 struct list_head addr6_list;
101 101
102 u32 valid; 102 u32 valid;
103 struct list_head list; 103 struct list_head list;
104 struct rcu_head rcu; 104 struct rcu_head rcu;
105 }; 105 };
106 106
107 /* Argument struct for netlbl_unlhsh_walk() */ 107 /* Argument struct for netlbl_unlhsh_walk() */
108 struct netlbl_unlhsh_walk_arg { 108 struct netlbl_unlhsh_walk_arg {
109 struct netlink_callback *nl_cb; 109 struct netlink_callback *nl_cb;
110 struct sk_buff *skb; 110 struct sk_buff *skb;
111 u32 seq; 111 u32 seq;
112 }; 112 };
113 113
114 /* Unlabeled connection hash table */ 114 /* Unlabeled connection hash table */
115 /* updates should be so rare that having one spinlock for the entire 115 /* updates should be so rare that having one spinlock for the entire
116 * hash table should be okay */ 116 * hash table should be okay */
117 static DEFINE_SPINLOCK(netlbl_unlhsh_lock); 117 static DEFINE_SPINLOCK(netlbl_unlhsh_lock);
118 #define netlbl_unlhsh_rcu_deref(p) \ 118 #define netlbl_unlhsh_rcu_deref(p) \
119 rcu_dereference_check(p, rcu_read_lock_held() || \ 119 rcu_dereference_check(p, lockdep_is_held(&netlbl_unlhsh_lock))
120 lockdep_is_held(&netlbl_unlhsh_lock))
121 static struct netlbl_unlhsh_tbl *netlbl_unlhsh = NULL; 120 static struct netlbl_unlhsh_tbl *netlbl_unlhsh = NULL;
122 static struct netlbl_unlhsh_iface *netlbl_unlhsh_def = NULL; 121 static struct netlbl_unlhsh_iface *netlbl_unlhsh_def = NULL;
123 122
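The netlbl_unlhsh_rcu_deref() hunk above is the only functional change in this file. Below is a hedged, self-contained sketch of how a pointer guarded by this kind of checked dereference is typically read and updated; every identifier (my_data, my_lock, my_ptr, my_deref, my_read, my_publish) is illustrative and not part of this patch.

#include <linux/rcupdate.h>
#include <linux/spinlock.h>
#include <linux/slab.h>

struct my_data {
	int val;
	struct rcu_head rcu;
};

static DEFINE_SPINLOCK(my_lock);
static struct my_data *my_ptr;

/* update-side lock is the only explicit condition, matching the new
 * form of netlbl_unlhsh_rcu_deref() above */
#define my_deref(p) \
	rcu_dereference_check(p, lockdep_is_held(&my_lock))

static int my_read(void)
{
	struct my_data *p;
	int val = 0;

	rcu_read_lock();			/* reader side */
	p = my_deref(my_ptr);
	if (p != NULL)
		val = p->val;
	rcu_read_unlock();
	return val;
}

static void my_publish(struct my_data *new)
{
	struct my_data *old;

	spin_lock(&my_lock);			/* writer side */
	old = my_deref(my_ptr);
	rcu_assign_pointer(my_ptr, new);
	spin_unlock(&my_lock);
	if (old != NULL)
		kfree_rcu(old, rcu);		/* free after a grace period */
}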
124 /* Accept unlabeled packets flag */ 123 /* Accept unlabeled packets flag */
125 static u8 netlabel_unlabel_acceptflg = 0; 124 static u8 netlabel_unlabel_acceptflg = 0;
126 125
127 /* NetLabel Generic NETLINK unlabeled family */ 126 /* NetLabel Generic NETLINK unlabeled family */
128 static struct genl_family netlbl_unlabel_gnl_family = { 127 static struct genl_family netlbl_unlabel_gnl_family = {
129 .id = GENL_ID_GENERATE, 128 .id = GENL_ID_GENERATE,
130 .hdrsize = 0, 129 .hdrsize = 0,
131 .name = NETLBL_NLTYPE_UNLABELED_NAME, 130 .name = NETLBL_NLTYPE_UNLABELED_NAME,
132 .version = NETLBL_PROTO_VERSION, 131 .version = NETLBL_PROTO_VERSION,
133 .maxattr = NLBL_UNLABEL_A_MAX, 132 .maxattr = NLBL_UNLABEL_A_MAX,
134 }; 133 };
135 134
136 /* NetLabel Netlink attribute policy */ 135 /* NetLabel Netlink attribute policy */
137 static const struct nla_policy netlbl_unlabel_genl_policy[NLBL_UNLABEL_A_MAX + 1] = { 136 static const struct nla_policy netlbl_unlabel_genl_policy[NLBL_UNLABEL_A_MAX + 1] = {
138 [NLBL_UNLABEL_A_ACPTFLG] = { .type = NLA_U8 }, 137 [NLBL_UNLABEL_A_ACPTFLG] = { .type = NLA_U8 },
139 [NLBL_UNLABEL_A_IPV6ADDR] = { .type = NLA_BINARY, 138 [NLBL_UNLABEL_A_IPV6ADDR] = { .type = NLA_BINARY,
140 .len = sizeof(struct in6_addr) }, 139 .len = sizeof(struct in6_addr) },
141 [NLBL_UNLABEL_A_IPV6MASK] = { .type = NLA_BINARY, 140 [NLBL_UNLABEL_A_IPV6MASK] = { .type = NLA_BINARY,
142 .len = sizeof(struct in6_addr) }, 141 .len = sizeof(struct in6_addr) },
143 [NLBL_UNLABEL_A_IPV4ADDR] = { .type = NLA_BINARY, 142 [NLBL_UNLABEL_A_IPV4ADDR] = { .type = NLA_BINARY,
144 .len = sizeof(struct in_addr) }, 143 .len = sizeof(struct in_addr) },
145 [NLBL_UNLABEL_A_IPV4MASK] = { .type = NLA_BINARY, 144 [NLBL_UNLABEL_A_IPV4MASK] = { .type = NLA_BINARY,
146 .len = sizeof(struct in_addr) }, 145 .len = sizeof(struct in_addr) },
147 [NLBL_UNLABEL_A_IFACE] = { .type = NLA_NUL_STRING, 146 [NLBL_UNLABEL_A_IFACE] = { .type = NLA_NUL_STRING,
148 .len = IFNAMSIZ - 1 }, 147 .len = IFNAMSIZ - 1 },
149 [NLBL_UNLABEL_A_SECCTX] = { .type = NLA_BINARY } 148 [NLBL_UNLABEL_A_SECCTX] = { .type = NLA_BINARY }
150 }; 149 };
151 150
152 /* 151 /*
153 * Unlabeled Connection Hash Table Functions 152 * Unlabeled Connection Hash Table Functions
154 */ 153 */
155 154
156 /** 155 /**
157 * netlbl_unlhsh_free_iface - Frees an interface entry from the hash table 156 * netlbl_unlhsh_free_iface - Frees an interface entry from the hash table
158 * @entry: the entry's RCU field 157 * @entry: the entry's RCU field
159 * 158 *
160 * Description: 159 * Description:
161 * This function is designed to be used as a callback to the call_rcu() 160 * This function is designed to be used as a callback to the call_rcu()
162 * function so that memory allocated to a hash table interface entry can be 161 * function so that memory allocated to a hash table interface entry can be
163 * released safely. It is important to note that this function does not free 162 * released safely. It is important to note that this function does not free
164 * the IPv4 and IPv6 address lists contained as part of an interface entry. It 163 * the IPv4 and IPv6 address lists contained as part of an interface entry. It
165 * is up to the rest of the code to make sure an interface entry is only freed 164 * is up to the rest of the code to make sure an interface entry is only freed
166 * once its address lists are empty. 165 * once its address lists are empty.
167 * 166 *
168 */ 167 */
169 static void netlbl_unlhsh_free_iface(struct rcu_head *entry) 168 static void netlbl_unlhsh_free_iface(struct rcu_head *entry)
170 { 169 {
171 struct netlbl_unlhsh_iface *iface; 170 struct netlbl_unlhsh_iface *iface;
172 struct netlbl_af4list *iter4; 171 struct netlbl_af4list *iter4;
173 struct netlbl_af4list *tmp4; 172 struct netlbl_af4list *tmp4;
174 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) 173 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
175 struct netlbl_af6list *iter6; 174 struct netlbl_af6list *iter6;
176 struct netlbl_af6list *tmp6; 175 struct netlbl_af6list *tmp6;
177 #endif /* IPv6 */ 176 #endif /* IPv6 */
178 177
179 iface = container_of(entry, struct netlbl_unlhsh_iface, rcu); 178 iface = container_of(entry, struct netlbl_unlhsh_iface, rcu);
180 179
181 /* no need for locks here since we are the only one with access to this 180 /* no need for locks here since we are the only one with access to this
182 * structure */ 181 * structure */
183 182
184 netlbl_af4list_foreach_safe(iter4, tmp4, &iface->addr4_list) { 183 netlbl_af4list_foreach_safe(iter4, tmp4, &iface->addr4_list) {
185 netlbl_af4list_remove_entry(iter4); 184 netlbl_af4list_remove_entry(iter4);
186 kfree(netlbl_unlhsh_addr4_entry(iter4)); 185 kfree(netlbl_unlhsh_addr4_entry(iter4));
187 } 186 }
188 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) 187 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
189 netlbl_af6list_foreach_safe(iter6, tmp6, &iface->addr6_list) { 188 netlbl_af6list_foreach_safe(iter6, tmp6, &iface->addr6_list) {
190 netlbl_af6list_remove_entry(iter6); 189 netlbl_af6list_remove_entry(iter6);
191 kfree(netlbl_unlhsh_addr6_entry(iter6)); 190 kfree(netlbl_unlhsh_addr6_entry(iter6));
192 } 191 }
193 #endif /* IPv6 */ 192 #endif /* IPv6 */
194 kfree(iface); 193 kfree(iface);
195 } 194 }
196 195
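netlbl_unlhsh_free_iface() follows the standard call_rcu() callback shape: recover the enclosing object with container_of() and free it once the grace period has elapsed. A minimal illustrative sketch of that shape; example_node, example_free, and example_retire are made-up names, not from this file.

#include <linux/kernel.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>

struct example_node {
	int payload;
	struct rcu_head rcu;
};

/* runs after a grace period; recovers the object from its rcu_head */
static void example_free(struct rcu_head *entry)
{
	struct example_node *node;

	node = container_of(entry, struct example_node, rcu);
	kfree(node);
}

/* the caller unlinks the node from all RCU-visible structures first,
 * then defers the actual free */
static void example_retire(struct example_node *node)
{
	call_rcu(&node->rcu, example_free);
}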
197 /** 196 /**
198 * netlbl_unlhsh_hash - Hashing function for the hash table 197 * netlbl_unlhsh_hash - Hashing function for the hash table
199 * @ifindex: the network interface/device to hash 198 * @ifindex: the network interface/device to hash
200 * 199 *
201 * Description: 200 * Description:
202 * This is the hashing function for the unlabeled hash table, it returns the 201 * This is the hashing function for the unlabeled hash table, it returns the
203 * bucket number for the given device/interface. The caller is responsible for 202 * bucket number for the given device/interface. The caller is responsible for
204 * ensuring that the hash table is protected with either a RCU read lock or 203 * ensuring that the hash table is protected with either a RCU read lock or
205 * the hash table lock. 204 * the hash table lock.
206 * 205 *
207 */ 206 */
208 static u32 netlbl_unlhsh_hash(int ifindex) 207 static u32 netlbl_unlhsh_hash(int ifindex)
209 { 208 {
210 return ifindex & (netlbl_unlhsh_rcu_deref(netlbl_unlhsh)->size - 1); 209 return ifindex & (netlbl_unlhsh_rcu_deref(netlbl_unlhsh)->size - 1);
211 } 210 }
212 211
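netlbl_unlhsh_hash() folds the interface index into a bucket with a bitmask rather than a modulo, which is only equivalent when the table size is a power of two; that sizing is an assumption here, since the table allocation is not shown in this hunk. A small stand-alone userspace check of the equivalence:

#include <assert.h>
#include <stdio.h>

int main(void)
{
	unsigned int size = 32;		/* hypothetical power-of-two bucket count */
	unsigned int ifindex;

	/* for a power-of-two size, masking equals a modulo reduction */
	for (ifindex = 1; ifindex <= 1000; ifindex++)
		assert((ifindex & (size - 1)) == (ifindex % size));

	printf("ifindex 70 maps to bucket %u\n", 70 & (size - 1));	/* prints 6 */
	return 0;
}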
213 /** 212 /**
214 * netlbl_unlhsh_search_iface - Search for a matching interface entry 213 * netlbl_unlhsh_search_iface - Search for a matching interface entry
215 * @ifindex: the network interface 214 * @ifindex: the network interface
216 * 215 *
217 * Description: 216 * Description:
218 * Searches the unlabeled connection hash table and returns a pointer to the 217 * Searches the unlabeled connection hash table and returns a pointer to the
219 * interface entry which matches @ifindex, otherwise NULL is returned. The 218 * interface entry which matches @ifindex, otherwise NULL is returned. The
220 * caller is responsible for ensuring that the hash table is protected with 219 * caller is responsible for ensuring that the hash table is protected with
221 * either a RCU read lock or the hash table lock. 220 * either a RCU read lock or the hash table lock.
222 * 221 *
223 */ 222 */
224 static struct netlbl_unlhsh_iface *netlbl_unlhsh_search_iface(int ifindex) 223 static struct netlbl_unlhsh_iface *netlbl_unlhsh_search_iface(int ifindex)
225 { 224 {
226 u32 bkt; 225 u32 bkt;
227 struct list_head *bkt_list; 226 struct list_head *bkt_list;
228 struct netlbl_unlhsh_iface *iter; 227 struct netlbl_unlhsh_iface *iter;
229 228
230 bkt = netlbl_unlhsh_hash(ifindex); 229 bkt = netlbl_unlhsh_hash(ifindex);
231 bkt_list = &netlbl_unlhsh_rcu_deref(netlbl_unlhsh)->tbl[bkt]; 230 bkt_list = &netlbl_unlhsh_rcu_deref(netlbl_unlhsh)->tbl[bkt];
232 list_for_each_entry_rcu(iter, bkt_list, list) 231 list_for_each_entry_rcu(iter, bkt_list, list)
233 if (iter->valid && iter->ifindex == ifindex) 232 if (iter->valid && iter->ifindex == ifindex)
234 return iter; 233 return iter;
235 234
236 return NULL; 235 return NULL;
237 } 236 }
238 237
239 /** 238 /**
240 * netlbl_unlhsh_add_addr4 - Add a new IPv4 address entry to the hash table 239 * netlbl_unlhsh_add_addr4 - Add a new IPv4 address entry to the hash table
241 * @iface: the associated interface entry 240 * @iface: the associated interface entry
242 * @addr: IPv4 address in network byte order 241 * @addr: IPv4 address in network byte order
243 * @mask: IPv4 address mask in network byte order 242 * @mask: IPv4 address mask in network byte order
244 * @secid: LSM secid value for entry 243 * @secid: LSM secid value for entry
245 * 244 *
246 * Description: 245 * Description:
247 * Add a new address entry into the unlabeled connection hash table using the 246 * Add a new address entry into the unlabeled connection hash table using the
248 * interface entry specified by @iface. On success zero is returned, otherwise 247 * interface entry specified by @iface. On success zero is returned, otherwise
249 * a negative value is returned. 248 * a negative value is returned.
250 * 249 *
251 */ 250 */
252 static int netlbl_unlhsh_add_addr4(struct netlbl_unlhsh_iface *iface, 251 static int netlbl_unlhsh_add_addr4(struct netlbl_unlhsh_iface *iface,
253 const struct in_addr *addr, 252 const struct in_addr *addr,
254 const struct in_addr *mask, 253 const struct in_addr *mask,
255 u32 secid) 254 u32 secid)
256 { 255 {
257 int ret_val; 256 int ret_val;
258 struct netlbl_unlhsh_addr4 *entry; 257 struct netlbl_unlhsh_addr4 *entry;
259 258
260 entry = kzalloc(sizeof(*entry), GFP_ATOMIC); 259 entry = kzalloc(sizeof(*entry), GFP_ATOMIC);
261 if (entry == NULL) 260 if (entry == NULL)
262 return -ENOMEM; 261 return -ENOMEM;
263 262
264 entry->list.addr = addr->s_addr & mask->s_addr; 263 entry->list.addr = addr->s_addr & mask->s_addr;
265 entry->list.mask = mask->s_addr; 264 entry->list.mask = mask->s_addr;
266 entry->list.valid = 1; 265 entry->list.valid = 1;
267 entry->secid = secid; 266 entry->secid = secid;
268 267
269 spin_lock(&netlbl_unlhsh_lock); 268 spin_lock(&netlbl_unlhsh_lock);
270 ret_val = netlbl_af4list_add(&entry->list, &iface->addr4_list); 269 ret_val = netlbl_af4list_add(&entry->list, &iface->addr4_list);
271 spin_unlock(&netlbl_unlhsh_lock); 270 spin_unlock(&netlbl_unlhsh_lock);
272 271
273 if (ret_val != 0) 272 if (ret_val != 0)
274 kfree(entry); 273 kfree(entry);
275 return ret_val; 274 return ret_val;
276 } 275 }
277 276
278 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) 277 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
279 /** 278 /**
280 * netlbl_unlhsh_add_addr6 - Add a new IPv6 address entry to the hash table 279 * netlbl_unlhsh_add_addr6 - Add a new IPv6 address entry to the hash table
281 * @iface: the associated interface entry 280 * @iface: the associated interface entry
282 * @addr: IPv6 address in network byte order 281 * @addr: IPv6 address in network byte order
283 * @mask: IPv6 address mask in network byte order 282 * @mask: IPv6 address mask in network byte order
284 * @secid: LSM secid value for entry 283 * @secid: LSM secid value for entry
285 * 284 *
286 * Description: 285 * Description:
287 * Add a new address entry into the unlabeled connection hash table using the 286 * Add a new address entry into the unlabeled connection hash table using the
288 * interface entry specified by @iface. On success zero is returned, otherwise 287 * interface entry specified by @iface. On success zero is returned, otherwise
289 * a negative value is returned. 288 * a negative value is returned.
290 * 289 *
291 */ 290 */
292 static int netlbl_unlhsh_add_addr6(struct netlbl_unlhsh_iface *iface, 291 static int netlbl_unlhsh_add_addr6(struct netlbl_unlhsh_iface *iface,
293 const struct in6_addr *addr, 292 const struct in6_addr *addr,
294 const struct in6_addr *mask, 293 const struct in6_addr *mask,
295 u32 secid) 294 u32 secid)
296 { 295 {
297 int ret_val; 296 int ret_val;
298 struct netlbl_unlhsh_addr6 *entry; 297 struct netlbl_unlhsh_addr6 *entry;
299 298
300 entry = kzalloc(sizeof(*entry), GFP_ATOMIC); 299 entry = kzalloc(sizeof(*entry), GFP_ATOMIC);
301 if (entry == NULL) 300 if (entry == NULL)
302 return -ENOMEM; 301 return -ENOMEM;
303 302
304 ipv6_addr_copy(&entry->list.addr, addr); 303 ipv6_addr_copy(&entry->list.addr, addr);
305 entry->list.addr.s6_addr32[0] &= mask->s6_addr32[0]; 304 entry->list.addr.s6_addr32[0] &= mask->s6_addr32[0];
306 entry->list.addr.s6_addr32[1] &= mask->s6_addr32[1]; 305 entry->list.addr.s6_addr32[1] &= mask->s6_addr32[1];
307 entry->list.addr.s6_addr32[2] &= mask->s6_addr32[2]; 306 entry->list.addr.s6_addr32[2] &= mask->s6_addr32[2];
308 entry->list.addr.s6_addr32[3] &= mask->s6_addr32[3]; 307 entry->list.addr.s6_addr32[3] &= mask->s6_addr32[3];
309 ipv6_addr_copy(&entry->list.mask, mask); 308 ipv6_addr_copy(&entry->list.mask, mask);
310 entry->list.valid = 1; 309 entry->list.valid = 1;
311 entry->secid = secid; 310 entry->secid = secid;
312 311
313 spin_lock(&netlbl_unlhsh_lock); 312 spin_lock(&netlbl_unlhsh_lock);
314 ret_val = netlbl_af6list_add(&entry->list, &iface->addr6_list); 313 ret_val = netlbl_af6list_add(&entry->list, &iface->addr6_list);
315 spin_unlock(&netlbl_unlhsh_lock); 314 spin_unlock(&netlbl_unlhsh_lock);
316 315
317 if (ret_val != 0) 316 if (ret_val != 0)
318 kfree(entry); 317 kfree(entry);
319 return 0; 318 return 0;
320 } 319 }
321 #endif /* IPv6 */ 320 #endif /* IPv6 */
322 321
323 /** 322 /**
324 * netlbl_unlhsh_add_iface - Adds a new interface entry to the hash table 323 * netlbl_unlhsh_add_iface - Adds a new interface entry to the hash table
325 * @ifindex: network interface 324 * @ifindex: network interface
326 * 325 *
327 * Description: 326 * Description:
328 * Add a new, empty, interface entry into the unlabeled connection hash table. 327 * Add a new, empty, interface entry into the unlabeled connection hash table.
329 * On success a pointer to the new interface entry is returned, on failure NULL 328 * On success a pointer to the new interface entry is returned, on failure NULL
330 * is returned. 329 * is returned.
331 * 330 *
332 */ 331 */
333 static struct netlbl_unlhsh_iface *netlbl_unlhsh_add_iface(int ifindex) 332 static struct netlbl_unlhsh_iface *netlbl_unlhsh_add_iface(int ifindex)
334 { 333 {
335 u32 bkt; 334 u32 bkt;
336 struct netlbl_unlhsh_iface *iface; 335 struct netlbl_unlhsh_iface *iface;
337 336
338 iface = kzalloc(sizeof(*iface), GFP_ATOMIC); 337 iface = kzalloc(sizeof(*iface), GFP_ATOMIC);
339 if (iface == NULL) 338 if (iface == NULL)
340 return NULL; 339 return NULL;
341 340
342 iface->ifindex = ifindex; 341 iface->ifindex = ifindex;
343 INIT_LIST_HEAD(&iface->addr4_list); 342 INIT_LIST_HEAD(&iface->addr4_list);
344 INIT_LIST_HEAD(&iface->addr6_list); 343 INIT_LIST_HEAD(&iface->addr6_list);
345 iface->valid = 1; 344 iface->valid = 1;
346 345
347 spin_lock(&netlbl_unlhsh_lock); 346 spin_lock(&netlbl_unlhsh_lock);
348 if (ifindex > 0) { 347 if (ifindex > 0) {
349 bkt = netlbl_unlhsh_hash(ifindex); 348 bkt = netlbl_unlhsh_hash(ifindex);
350 if (netlbl_unlhsh_search_iface(ifindex) != NULL) 349 if (netlbl_unlhsh_search_iface(ifindex) != NULL)
351 goto add_iface_failure; 350 goto add_iface_failure;
352 list_add_tail_rcu(&iface->list, 351 list_add_tail_rcu(&iface->list,
353 &netlbl_unlhsh_rcu_deref(netlbl_unlhsh)->tbl[bkt]); 352 &netlbl_unlhsh_rcu_deref(netlbl_unlhsh)->tbl[bkt]);
354 } else { 353 } else {
355 INIT_LIST_HEAD(&iface->list); 354 INIT_LIST_HEAD(&iface->list);
356 if (netlbl_unlhsh_rcu_deref(netlbl_unlhsh_def) != NULL) 355 if (netlbl_unlhsh_rcu_deref(netlbl_unlhsh_def) != NULL)
357 goto add_iface_failure; 356 goto add_iface_failure;
358 rcu_assign_pointer(netlbl_unlhsh_def, iface); 357 rcu_assign_pointer(netlbl_unlhsh_def, iface);
359 } 358 }
360 spin_unlock(&netlbl_unlhsh_lock); 359 spin_unlock(&netlbl_unlhsh_lock);
361 360
362 return iface; 361 return iface;
363 362
364 add_iface_failure: 363 add_iface_failure:
365 spin_unlock(&netlbl_unlhsh_lock); 364 spin_unlock(&netlbl_unlhsh_lock);
366 kfree(iface); 365 kfree(iface);
367 return NULL; 366 return NULL;
368 } 367 }
369 368
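netlbl_unlhsh_add_iface() initializes the new entry completely before taking netlbl_unlhsh_lock, re-checks for a duplicate under the lock, and only then publishes it with list_add_tail_rcu() (or rcu_assign_pointer() for the default entry). A hedged sketch of the same search/add pairing; example_iface, example_bucket, example_lock and the helpers are illustrative names only.

#include <linux/list.h>
#include <linux/rculist.h>
#include <linux/rcupdate.h>
#include <linux/spinlock.h>
#include <linux/slab.h>

struct example_iface {
	int ifindex;
	struct list_head list;
};

static LIST_HEAD(example_bucket);
static DEFINE_SPINLOCK(example_lock);

/* reader-style walk; also safe for a writer holding example_lock */
static struct example_iface *example_search(int ifindex)
{
	struct example_iface *iter;

	list_for_each_entry_rcu(iter, &example_bucket, list)
		if (iter->ifindex == ifindex)
			return iter;
	return NULL;
}

/* writer: fully initialize, then publish under the update-side lock */
static struct example_iface *example_add(int ifindex)
{
	struct example_iface *iface;

	iface = kzalloc(sizeof(*iface), GFP_ATOMIC);
	if (iface == NULL)
		return NULL;
	iface->ifindex = ifindex;

	spin_lock(&example_lock);
	if (example_search(ifindex) != NULL) {
		spin_unlock(&example_lock);
		kfree(iface);
		return NULL;
	}
	list_add_tail_rcu(&iface->list, &example_bucket);
	spin_unlock(&example_lock);

	return iface;
}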
370 /** 369 /**
371 * netlbl_unlhsh_add - Adds a new entry to the unlabeled connection hash table 370 * netlbl_unlhsh_add - Adds a new entry to the unlabeled connection hash table
372 * @net: network namespace 371 * @net: network namespace
373 * @dev_name: interface name 372 * @dev_name: interface name
374 * @addr: IP address in network byte order 373 * @addr: IP address in network byte order
375 * @mask: address mask in network byte order 374 * @mask: address mask in network byte order
376 * @addr_len: length of address/mask (4 for IPv4, 16 for IPv6) 375 * @addr_len: length of address/mask (4 for IPv4, 16 for IPv6)
377 * @secid: LSM secid value for the entry 376 * @secid: LSM secid value for the entry
378 * @audit_info: NetLabel audit information 377 * @audit_info: NetLabel audit information
379 * 378 *
380 * Description: 379 * Description:
381 * Adds a new entry to the unlabeled connection hash table. Returns zero on 380 * Adds a new entry to the unlabeled connection hash table. Returns zero on
382 * success, negative values on failure. 381 * success, negative values on failure.
383 * 382 *
384 */ 383 */
385 int netlbl_unlhsh_add(struct net *net, 384 int netlbl_unlhsh_add(struct net *net,
386 const char *dev_name, 385 const char *dev_name,
387 const void *addr, 386 const void *addr,
388 const void *mask, 387 const void *mask,
389 u32 addr_len, 388 u32 addr_len,
390 u32 secid, 389 u32 secid,
391 struct netlbl_audit *audit_info) 390 struct netlbl_audit *audit_info)
392 { 391 {
393 int ret_val; 392 int ret_val;
394 int ifindex; 393 int ifindex;
395 struct net_device *dev; 394 struct net_device *dev;
396 struct netlbl_unlhsh_iface *iface; 395 struct netlbl_unlhsh_iface *iface;
397 struct audit_buffer *audit_buf = NULL; 396 struct audit_buffer *audit_buf = NULL;
398 char *secctx = NULL; 397 char *secctx = NULL;
399 u32 secctx_len; 398 u32 secctx_len;
400 399
401 if (addr_len != sizeof(struct in_addr) && 400 if (addr_len != sizeof(struct in_addr) &&
402 addr_len != sizeof(struct in6_addr)) 401 addr_len != sizeof(struct in6_addr))
403 return -EINVAL; 402 return -EINVAL;
404 403
405 rcu_read_lock(); 404 rcu_read_lock();
406 if (dev_name != NULL) { 405 if (dev_name != NULL) {
407 dev = dev_get_by_name_rcu(net, dev_name); 406 dev = dev_get_by_name_rcu(net, dev_name);
408 if (dev == NULL) { 407 if (dev == NULL) {
409 ret_val = -ENODEV; 408 ret_val = -ENODEV;
410 goto unlhsh_add_return; 409 goto unlhsh_add_return;
411 } 410 }
412 ifindex = dev->ifindex; 411 ifindex = dev->ifindex;
413 iface = netlbl_unlhsh_search_iface(ifindex); 412 iface = netlbl_unlhsh_search_iface(ifindex);
414 } else { 413 } else {
415 ifindex = 0; 414 ifindex = 0;
416 iface = rcu_dereference(netlbl_unlhsh_def); 415 iface = rcu_dereference(netlbl_unlhsh_def);
417 } 416 }
418 if (iface == NULL) { 417 if (iface == NULL) {
419 iface = netlbl_unlhsh_add_iface(ifindex); 418 iface = netlbl_unlhsh_add_iface(ifindex);
420 if (iface == NULL) { 419 if (iface == NULL) {
421 ret_val = -ENOMEM; 420 ret_val = -ENOMEM;
422 goto unlhsh_add_return; 421 goto unlhsh_add_return;
423 } 422 }
424 } 423 }
425 audit_buf = netlbl_audit_start_common(AUDIT_MAC_UNLBL_STCADD, 424 audit_buf = netlbl_audit_start_common(AUDIT_MAC_UNLBL_STCADD,
426 audit_info); 425 audit_info);
427 switch (addr_len) { 426 switch (addr_len) {
428 case sizeof(struct in_addr): { 427 case sizeof(struct in_addr): {
429 struct in_addr *addr4, *mask4; 428 struct in_addr *addr4, *mask4;
430 429
431 addr4 = (struct in_addr *)addr; 430 addr4 = (struct in_addr *)addr;
432 mask4 = (struct in_addr *)mask; 431 mask4 = (struct in_addr *)mask;
433 ret_val = netlbl_unlhsh_add_addr4(iface, addr4, mask4, secid); 432 ret_val = netlbl_unlhsh_add_addr4(iface, addr4, mask4, secid);
434 if (audit_buf != NULL) 433 if (audit_buf != NULL)
435 netlbl_af4list_audit_addr(audit_buf, 1, 434 netlbl_af4list_audit_addr(audit_buf, 1,
436 dev_name, 435 dev_name,
437 addr4->s_addr, 436 addr4->s_addr,
438 mask4->s_addr); 437 mask4->s_addr);
439 break; 438 break;
440 } 439 }
441 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) 440 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
442 case sizeof(struct in6_addr): { 441 case sizeof(struct in6_addr): {
443 struct in6_addr *addr6, *mask6; 442 struct in6_addr *addr6, *mask6;
444 443
445 addr6 = (struct in6_addr *)addr; 444 addr6 = (struct in6_addr *)addr;
446 mask6 = (struct in6_addr *)mask; 445 mask6 = (struct in6_addr *)mask;
447 ret_val = netlbl_unlhsh_add_addr6(iface, addr6, mask6, secid); 446 ret_val = netlbl_unlhsh_add_addr6(iface, addr6, mask6, secid);
448 if (audit_buf != NULL) 447 if (audit_buf != NULL)
449 netlbl_af6list_audit_addr(audit_buf, 1, 448 netlbl_af6list_audit_addr(audit_buf, 1,
450 dev_name, 449 dev_name,
451 addr6, mask6); 450 addr6, mask6);
452 break; 451 break;
453 } 452 }
454 #endif /* IPv6 */ 453 #endif /* IPv6 */
455 default: 454 default:
456 ret_val = -EINVAL; 455 ret_val = -EINVAL;
457 } 456 }
458 if (ret_val == 0) 457 if (ret_val == 0)
459 atomic_inc(&netlabel_mgmt_protocount); 458 atomic_inc(&netlabel_mgmt_protocount);
460 459
461 unlhsh_add_return: 460 unlhsh_add_return:
462 rcu_read_unlock(); 461 rcu_read_unlock();
463 if (audit_buf != NULL) { 462 if (audit_buf != NULL) {
464 if (security_secid_to_secctx(secid, 463 if (security_secid_to_secctx(secid,
465 &secctx, 464 &secctx,
466 &secctx_len) == 0) { 465 &secctx_len) == 0) {
467 audit_log_format(audit_buf, " sec_obj=%s", secctx); 466 audit_log_format(audit_buf, " sec_obj=%s", secctx);
468 security_release_secctx(secctx, secctx_len); 467 security_release_secctx(secctx, secctx_len);
469 } 468 }
470 audit_log_format(audit_buf, " res=%u", ret_val == 0 ? 1 : 0); 469 audit_log_format(audit_buf, " res=%u", ret_val == 0 ? 1 : 0);
471 audit_log_end(audit_buf); 470 audit_log_end(audit_buf);
472 } 471 }
473 return ret_val; 472 return ret_val;
474 } 473 }
475 474
476 /** 475 /**
477 * netlbl_unlhsh_remove_addr4 - Remove an IPv4 address entry 476 * netlbl_unlhsh_remove_addr4 - Remove an IPv4 address entry
478 * @net: network namespace 477 * @net: network namespace
479 * @iface: interface entry 478 * @iface: interface entry
480 * @addr: IP address 479 * @addr: IP address
481 * @mask: IP address mask 480 * @mask: IP address mask
482 * @audit_info: NetLabel audit information 481 * @audit_info: NetLabel audit information
483 * 482 *
484 * Description: 483 * Description:
485 * Remove an IP address entry from the unlabeled connection hash table. 484 * Remove an IP address entry from the unlabeled connection hash table.
486 * Returns zero on success, negative values on failure. 485 * Returns zero on success, negative values on failure.
487 * 486 *
488 */ 487 */
489 static int netlbl_unlhsh_remove_addr4(struct net *net, 488 static int netlbl_unlhsh_remove_addr4(struct net *net,
490 struct netlbl_unlhsh_iface *iface, 489 struct netlbl_unlhsh_iface *iface,
491 const struct in_addr *addr, 490 const struct in_addr *addr,
492 const struct in_addr *mask, 491 const struct in_addr *mask,
493 struct netlbl_audit *audit_info) 492 struct netlbl_audit *audit_info)
494 { 493 {
495 struct netlbl_af4list *list_entry; 494 struct netlbl_af4list *list_entry;
496 struct netlbl_unlhsh_addr4 *entry; 495 struct netlbl_unlhsh_addr4 *entry;
497 struct audit_buffer *audit_buf; 496 struct audit_buffer *audit_buf;
498 struct net_device *dev; 497 struct net_device *dev;
499 char *secctx; 498 char *secctx;
500 u32 secctx_len; 499 u32 secctx_len;
501 500
502 spin_lock(&netlbl_unlhsh_lock); 501 spin_lock(&netlbl_unlhsh_lock);
503 list_entry = netlbl_af4list_remove(addr->s_addr, mask->s_addr, 502 list_entry = netlbl_af4list_remove(addr->s_addr, mask->s_addr,
504 &iface->addr4_list); 503 &iface->addr4_list);
505 spin_unlock(&netlbl_unlhsh_lock); 504 spin_unlock(&netlbl_unlhsh_lock);
506 if (list_entry != NULL) 505 if (list_entry != NULL)
507 entry = netlbl_unlhsh_addr4_entry(list_entry); 506 entry = netlbl_unlhsh_addr4_entry(list_entry);
508 else 507 else
509 entry = NULL; 508 entry = NULL;
510 509
511 audit_buf = netlbl_audit_start_common(AUDIT_MAC_UNLBL_STCDEL, 510 audit_buf = netlbl_audit_start_common(AUDIT_MAC_UNLBL_STCDEL,
512 audit_info); 511 audit_info);
513 if (audit_buf != NULL) { 512 if (audit_buf != NULL) {
514 dev = dev_get_by_index(net, iface->ifindex); 513 dev = dev_get_by_index(net, iface->ifindex);
515 netlbl_af4list_audit_addr(audit_buf, 1, 514 netlbl_af4list_audit_addr(audit_buf, 1,
516 (dev != NULL ? dev->name : NULL), 515 (dev != NULL ? dev->name : NULL),
517 addr->s_addr, mask->s_addr); 516 addr->s_addr, mask->s_addr);
518 if (dev != NULL) 517 if (dev != NULL)
519 dev_put(dev); 518 dev_put(dev);
520 if (entry != NULL && 519 if (entry != NULL &&
521 security_secid_to_secctx(entry->secid, 520 security_secid_to_secctx(entry->secid,
522 &secctx, &secctx_len) == 0) { 521 &secctx, &secctx_len) == 0) {
523 audit_log_format(audit_buf, " sec_obj=%s", secctx); 522 audit_log_format(audit_buf, " sec_obj=%s", secctx);
524 security_release_secctx(secctx, secctx_len); 523 security_release_secctx(secctx, secctx_len);
525 } 524 }
526 audit_log_format(audit_buf, " res=%u", entry != NULL ? 1 : 0); 525 audit_log_format(audit_buf, " res=%u", entry != NULL ? 1 : 0);
527 audit_log_end(audit_buf); 526 audit_log_end(audit_buf);
528 } 527 }
529 528
530 if (entry == NULL) 529 if (entry == NULL)
531 return -ENOENT; 530 return -ENOENT;
532 531
533 kfree_rcu(entry, rcu); 532 kfree_rcu(entry, rcu);
534 return 0; 533 return 0;
535 } 534 }
536 535
537 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) 536 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
538 /** 537 /**
539 * netlbl_unlhsh_remove_addr6 - Remove an IPv6 address entry 538 * netlbl_unlhsh_remove_addr6 - Remove an IPv6 address entry
540 * @net: network namespace 539 * @net: network namespace
541 * @iface: interface entry 540 * @iface: interface entry
542 * @addr: IP address 541 * @addr: IP address
543 * @mask: IP address mask 542 * @mask: IP address mask
544 * @audit_info: NetLabel audit information 543 * @audit_info: NetLabel audit information
545 * 544 *
546 * Description: 545 * Description:
547 * Remove an IP address entry from the unlabeled connection hash table. 546 * Remove an IP address entry from the unlabeled connection hash table.
548 * Returns zero on success, negative values on failure. 547 * Returns zero on success, negative values on failure.
549 * 548 *
550 */ 549 */
551 static int netlbl_unlhsh_remove_addr6(struct net *net, 550 static int netlbl_unlhsh_remove_addr6(struct net *net,
552 struct netlbl_unlhsh_iface *iface, 551 struct netlbl_unlhsh_iface *iface,
553 const struct in6_addr *addr, 552 const struct in6_addr *addr,
554 const struct in6_addr *mask, 553 const struct in6_addr *mask,
555 struct netlbl_audit *audit_info) 554 struct netlbl_audit *audit_info)
556 { 555 {
557 struct netlbl_af6list *list_entry; 556 struct netlbl_af6list *list_entry;
558 struct netlbl_unlhsh_addr6 *entry; 557 struct netlbl_unlhsh_addr6 *entry;
559 struct audit_buffer *audit_buf; 558 struct audit_buffer *audit_buf;
560 struct net_device *dev; 559 struct net_device *dev;
561 char *secctx; 560 char *secctx;
562 u32 secctx_len; 561 u32 secctx_len;
563 562
564 spin_lock(&netlbl_unlhsh_lock); 563 spin_lock(&netlbl_unlhsh_lock);
565 list_entry = netlbl_af6list_remove(addr, mask, &iface->addr6_list); 564 list_entry = netlbl_af6list_remove(addr, mask, &iface->addr6_list);
566 spin_unlock(&netlbl_unlhsh_lock); 565 spin_unlock(&netlbl_unlhsh_lock);
567 if (list_entry != NULL) 566 if (list_entry != NULL)
568 entry = netlbl_unlhsh_addr6_entry(list_entry); 567 entry = netlbl_unlhsh_addr6_entry(list_entry);
569 else 568 else
570 entry = NULL; 569 entry = NULL;
571 570
572 audit_buf = netlbl_audit_start_common(AUDIT_MAC_UNLBL_STCDEL, 571 audit_buf = netlbl_audit_start_common(AUDIT_MAC_UNLBL_STCDEL,
573 audit_info); 572 audit_info);
574 if (audit_buf != NULL) { 573 if (audit_buf != NULL) {
575 dev = dev_get_by_index(net, iface->ifindex); 574 dev = dev_get_by_index(net, iface->ifindex);
576 netlbl_af6list_audit_addr(audit_buf, 1, 575 netlbl_af6list_audit_addr(audit_buf, 1,
577 (dev != NULL ? dev->name : NULL), 576 (dev != NULL ? dev->name : NULL),
578 addr, mask); 577 addr, mask);
579 if (dev != NULL) 578 if (dev != NULL)
580 dev_put(dev); 579 dev_put(dev);
581 if (entry != NULL && 580 if (entry != NULL &&
582 security_secid_to_secctx(entry->secid, 581 security_secid_to_secctx(entry->secid,
583 &secctx, &secctx_len) == 0) { 582 &secctx, &secctx_len) == 0) {
584 audit_log_format(audit_buf, " sec_obj=%s", secctx); 583 audit_log_format(audit_buf, " sec_obj=%s", secctx);
585 security_release_secctx(secctx, secctx_len); 584 security_release_secctx(secctx, secctx_len);
586 } 585 }
587 audit_log_format(audit_buf, " res=%u", entry != NULL ? 1 : 0); 586 audit_log_format(audit_buf, " res=%u", entry != NULL ? 1 : 0);
588 audit_log_end(audit_buf); 587 audit_log_end(audit_buf);
589 } 588 }
590 589
591 if (entry == NULL) 590 if (entry == NULL)
592 return -ENOENT; 591 return -ENOENT;
593 592
594 kfree_rcu(entry, rcu); 593 kfree_rcu(entry, rcu);
595 return 0; 594 return 0;
596 } 595 }
597 #endif /* IPv6 */ 596 #endif /* IPv6 */
598 597
599 /** 598 /**
600 * netlbl_unlhsh_condremove_iface - Remove an interface entry 599 * netlbl_unlhsh_condremove_iface - Remove an interface entry
601 * @iface: the interface entry 600 * @iface: the interface entry
602 * 601 *
603 * Description: 602 * Description:
604 * Remove an interface entry from the unlabeled connection hash table if it is 603 * Remove an interface entry from the unlabeled connection hash table if it is
605 * empty. An interface entry is considered to be empty if there are no 604 * empty. An interface entry is considered to be empty if there are no
606 * address entries assigned to it. 605 * address entries assigned to it.
607 * 606 *
608 */ 607 */
609 static void netlbl_unlhsh_condremove_iface(struct netlbl_unlhsh_iface *iface) 608 static void netlbl_unlhsh_condremove_iface(struct netlbl_unlhsh_iface *iface)
610 { 609 {
611 struct netlbl_af4list *iter4; 610 struct netlbl_af4list *iter4;
612 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) 611 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
613 struct netlbl_af6list *iter6; 612 struct netlbl_af6list *iter6;
614 #endif /* IPv6 */ 613 #endif /* IPv6 */
615 614
616 spin_lock(&netlbl_unlhsh_lock); 615 spin_lock(&netlbl_unlhsh_lock);
617 netlbl_af4list_foreach_rcu(iter4, &iface->addr4_list) 616 netlbl_af4list_foreach_rcu(iter4, &iface->addr4_list)
618 goto unlhsh_condremove_failure; 617 goto unlhsh_condremove_failure;
619 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) 618 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
620 netlbl_af6list_foreach_rcu(iter6, &iface->addr6_list) 619 netlbl_af6list_foreach_rcu(iter6, &iface->addr6_list)
621 goto unlhsh_condremove_failure; 620 goto unlhsh_condremove_failure;
622 #endif /* IPv6 */ 621 #endif /* IPv6 */
623 iface->valid = 0; 622 iface->valid = 0;
624 if (iface->ifindex > 0) 623 if (iface->ifindex > 0)
625 list_del_rcu(&iface->list); 624 list_del_rcu(&iface->list);
626 else 625 else
627 rcu_assign_pointer(netlbl_unlhsh_def, NULL); 626 rcu_assign_pointer(netlbl_unlhsh_def, NULL);
628 spin_unlock(&netlbl_unlhsh_lock); 627 spin_unlock(&netlbl_unlhsh_lock);
629 628
630 call_rcu(&iface->rcu, netlbl_unlhsh_free_iface); 629 call_rcu(&iface->rcu, netlbl_unlhsh_free_iface);
631 return; 630 return;
632 631
633 unlhsh_condremove_failure: 632 unlhsh_condremove_failure:
634 spin_unlock(&netlbl_unlhsh_lock); 633 spin_unlock(&netlbl_unlhsh_lock);
635 } 634 }
636 635
637 /** 636 /**
638 * netlbl_unlhsh_remove - Remove an entry from the unlabeled hash table 637 * netlbl_unlhsh_remove - Remove an entry from the unlabeled hash table
639 * @net: network namespace 638 * @net: network namespace
640 * @dev_name: interface name 639 * @dev_name: interface name
641 * @addr: IP address in network byte order 640 * @addr: IP address in network byte order
642 * @mask: address mask in network byte order 641 * @mask: address mask in network byte order
643 * @addr_len: length of address/mask (4 for IPv4, 16 for IPv6) 642 * @addr_len: length of address/mask (4 for IPv4, 16 for IPv6)
644 * @audit_info: NetLabel audit information 643 * @audit_info: NetLabel audit information
645 * 644 *
646 * Description: 645 * Description:
647 * Removes an existing entry from the unlabeled connection hash table. 646 * Removes an existing entry from the unlabeled connection hash table.
648 * Returns zero on success, negative values on failure. 647 * Returns zero on success, negative values on failure.
649 * 648 *
650 */ 649 */
651 int netlbl_unlhsh_remove(struct net *net, 650 int netlbl_unlhsh_remove(struct net *net,
652 const char *dev_name, 651 const char *dev_name,
653 const void *addr, 652 const void *addr,
654 const void *mask, 653 const void *mask,
655 u32 addr_len, 654 u32 addr_len,
656 struct netlbl_audit *audit_info) 655 struct netlbl_audit *audit_info)
657 { 656 {
658 int ret_val; 657 int ret_val;
659 struct net_device *dev; 658 struct net_device *dev;
660 struct netlbl_unlhsh_iface *iface; 659 struct netlbl_unlhsh_iface *iface;
661 660
662 if (addr_len != sizeof(struct in_addr) && 661 if (addr_len != sizeof(struct in_addr) &&
663 addr_len != sizeof(struct in6_addr)) 662 addr_len != sizeof(struct in6_addr))
664 return -EINVAL; 663 return -EINVAL;
665 664
666 rcu_read_lock(); 665 rcu_read_lock();
667 if (dev_name != NULL) { 666 if (dev_name != NULL) {
668 dev = dev_get_by_name_rcu(net, dev_name); 667 dev = dev_get_by_name_rcu(net, dev_name);
669 if (dev == NULL) { 668 if (dev == NULL) {
670 ret_val = -ENODEV; 669 ret_val = -ENODEV;
671 goto unlhsh_remove_return; 670 goto unlhsh_remove_return;
672 } 671 }
673 iface = netlbl_unlhsh_search_iface(dev->ifindex); 672 iface = netlbl_unlhsh_search_iface(dev->ifindex);
674 } else 673 } else
675 iface = rcu_dereference(netlbl_unlhsh_def); 674 iface = rcu_dereference(netlbl_unlhsh_def);
676 if (iface == NULL) { 675 if (iface == NULL) {
677 ret_val = -ENOENT; 676 ret_val = -ENOENT;
678 goto unlhsh_remove_return; 677 goto unlhsh_remove_return;
679 } 678 }
680 switch (addr_len) { 679 switch (addr_len) {
681 case sizeof(struct in_addr): 680 case sizeof(struct in_addr):
682 ret_val = netlbl_unlhsh_remove_addr4(net, 681 ret_val = netlbl_unlhsh_remove_addr4(net,
683 iface, addr, mask, 682 iface, addr, mask,
684 audit_info); 683 audit_info);
685 break; 684 break;
686 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) 685 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
687 case sizeof(struct in6_addr): 686 case sizeof(struct in6_addr):
688 ret_val = netlbl_unlhsh_remove_addr6(net, 687 ret_val = netlbl_unlhsh_remove_addr6(net,
689 iface, addr, mask, 688 iface, addr, mask,
690 audit_info); 689 audit_info);
691 break; 690 break;
692 #endif /* IPv6 */ 691 #endif /* IPv6 */
693 default: 692 default:
694 ret_val = -EINVAL; 693 ret_val = -EINVAL;
695 } 694 }
696 if (ret_val == 0) { 695 if (ret_val == 0) {
697 netlbl_unlhsh_condremove_iface(iface); 696 netlbl_unlhsh_condremove_iface(iface);
698 atomic_dec(&netlabel_mgmt_protocount); 697 atomic_dec(&netlabel_mgmt_protocount);
699 } 698 }
700 699
701 unlhsh_remove_return: 700 unlhsh_remove_return:
702 rcu_read_unlock(); 701 rcu_read_unlock();
703 return ret_val; 702 return ret_val;
704 } 703 }
705 704
706 /* 705 /*
707 * General Helper Functions 706 * General Helper Functions
708 */ 707 */
709 708
710 /** 709 /**
711 * netlbl_unlhsh_netdev_handler - Network device notification handler 710 * netlbl_unlhsh_netdev_handler - Network device notification handler
712 * @this: notifier block 711 * @this: notifier block
713 * @event: the event 712 * @event: the event
714 * @ptr: the network device (cast to void) 713 * @ptr: the network device (cast to void)
715 * 714 *
716 * Description: 715 * Description:
717 * Handle network device events, although at present all we care about is a 716 * Handle network device events, although at present all we care about is a
718 * network device going away. In the case of a device going away we clear any 717 * network device going away. In the case of a device going away we clear any
719 * related entries from the unlabeled connection hash table. 718 * related entries from the unlabeled connection hash table.
720 * 719 *
721 */ 720 */
722 static int netlbl_unlhsh_netdev_handler(struct notifier_block *this, 721 static int netlbl_unlhsh_netdev_handler(struct notifier_block *this,
723 unsigned long event, 722 unsigned long event,
724 void *ptr) 723 void *ptr)
725 { 724 {
726 struct net_device *dev = ptr; 725 struct net_device *dev = ptr;
727 struct netlbl_unlhsh_iface *iface = NULL; 726 struct netlbl_unlhsh_iface *iface = NULL;
728 727
729 if (!net_eq(dev_net(dev), &init_net)) 728 if (!net_eq(dev_net(dev), &init_net))
730 return NOTIFY_DONE; 729 return NOTIFY_DONE;
731 730
732 /* XXX - should this be a check for NETDEV_DOWN or _UNREGISTER? */ 731 /* XXX - should this be a check for NETDEV_DOWN or _UNREGISTER? */
733 if (event == NETDEV_DOWN) { 732 if (event == NETDEV_DOWN) {
734 spin_lock(&netlbl_unlhsh_lock); 733 spin_lock(&netlbl_unlhsh_lock);
735 iface = netlbl_unlhsh_search_iface(dev->ifindex); 734 iface = netlbl_unlhsh_search_iface(dev->ifindex);
736 if (iface != NULL && iface->valid) { 735 if (iface != NULL && iface->valid) {
737 iface->valid = 0; 736 iface->valid = 0;
738 list_del_rcu(&iface->list); 737 list_del_rcu(&iface->list);
739 } else 738 } else
740 iface = NULL; 739 iface = NULL;
741 spin_unlock(&netlbl_unlhsh_lock); 740 spin_unlock(&netlbl_unlhsh_lock);
742 } 741 }
743 742
744 if (iface != NULL) 743 if (iface != NULL)
745 call_rcu(&iface->rcu, netlbl_unlhsh_free_iface); 744 call_rcu(&iface->rcu, netlbl_unlhsh_free_iface);
746 745
747 return NOTIFY_DONE; 746 return NOTIFY_DONE;
748 } 747 }
749 748
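netlbl_unlhsh_netdev_handler() is presumably hooked into the netdevice notifier chain via register_netdevice_notifier() elsewhere in this file; the registration is not visible in this hunk. A minimal illustrative notifier of the same shape; the example_* names and the printed message are made up.

#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/netdevice.h>
#include <linux/notifier.h>

static int example_netdev_event(struct notifier_block *this,
				unsigned long event, void *ptr)
{
	/* this kernel passes the net_device directly, as in the handler above */
	struct net_device *dev = ptr;

	if (event == NETDEV_DOWN)
		pr_info("%s: device going down\n", dev->name);
	return NOTIFY_DONE;
}

static struct notifier_block example_netdev_notifier = {
	.notifier_call = example_netdev_event,
};

/* registration, e.g. from a subsystem init path */
static int __init example_init(void)
{
	return register_netdevice_notifier(&example_netdev_notifier);
}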
750 /** 749 /**
751 * netlbl_unlabel_acceptflg_set - Set the unlabeled accept flag 750 * netlbl_unlabel_acceptflg_set - Set the unlabeled accept flag
752 * @value: desired value 751 * @value: desired value
753 * @audit_info: NetLabel audit information 752 * @audit_info: NetLabel audit information
754 * 753 *
755 * Description: 754 * Description:
756 * Set the value of the unlabeled accept flag to @value. 755 * Set the value of the unlabeled accept flag to @value.
757 * 756 *
758 */ 757 */
759 static void netlbl_unlabel_acceptflg_set(u8 value, 758 static void netlbl_unlabel_acceptflg_set(u8 value,
760 struct netlbl_audit *audit_info) 759 struct netlbl_audit *audit_info)
761 { 760 {
762 struct audit_buffer *audit_buf; 761 struct audit_buffer *audit_buf;
763 u8 old_val; 762 u8 old_val;
764 763
765 old_val = netlabel_unlabel_acceptflg; 764 old_val = netlabel_unlabel_acceptflg;
766 netlabel_unlabel_acceptflg = value; 765 netlabel_unlabel_acceptflg = value;
767 audit_buf = netlbl_audit_start_common(AUDIT_MAC_UNLBL_ALLOW, 766 audit_buf = netlbl_audit_start_common(AUDIT_MAC_UNLBL_ALLOW,
768 audit_info); 767 audit_info);
769 if (audit_buf != NULL) { 768 if (audit_buf != NULL) {
770 audit_log_format(audit_buf, 769 audit_log_format(audit_buf,
771 " unlbl_accept=%u old=%u", value, old_val); 770 " unlbl_accept=%u old=%u", value, old_val);
772 audit_log_end(audit_buf); 771 audit_log_end(audit_buf);
773 } 772 }
774 } 773 }
775 774
776 /** 775 /**
777 * netlbl_unlabel_addrinfo_get - Get the IPv4/6 address information 776 * netlbl_unlabel_addrinfo_get - Get the IPv4/6 address information
778 * @info: the Generic NETLINK info block 777 * @info: the Generic NETLINK info block
779 * @addr: the IP address 778 * @addr: the IP address
780 * @mask: the IP address mask 779 * @mask: the IP address mask
781 * @len: the address length 780 * @len: the address length
782 * 781 *
783 * Description: 782 * Description:
784 * Examine the Generic NETLINK message and extract the IP address information. 783 * Examine the Generic NETLINK message and extract the IP address information.
785 * Returns zero on success, negative values on failure. 784 * Returns zero on success, negative values on failure.
786 * 785 *
787 */ 786 */
788 static int netlbl_unlabel_addrinfo_get(struct genl_info *info, 787 static int netlbl_unlabel_addrinfo_get(struct genl_info *info,
789 void **addr, 788 void **addr,
790 void **mask, 789 void **mask,
791 u32 *len) 790 u32 *len)
792 { 791 {
793 u32 addr_len; 792 u32 addr_len;
794 793
795 if (info->attrs[NLBL_UNLABEL_A_IPV4ADDR]) { 794 if (info->attrs[NLBL_UNLABEL_A_IPV4ADDR]) {
796 addr_len = nla_len(info->attrs[NLBL_UNLABEL_A_IPV4ADDR]); 795 addr_len = nla_len(info->attrs[NLBL_UNLABEL_A_IPV4ADDR]);
797 if (addr_len != sizeof(struct in_addr) && 796 if (addr_len != sizeof(struct in_addr) &&
798 addr_len != nla_len(info->attrs[NLBL_UNLABEL_A_IPV4MASK])) 797 addr_len != nla_len(info->attrs[NLBL_UNLABEL_A_IPV4MASK]))
799 return -EINVAL; 798 return -EINVAL;
800 *len = addr_len; 799 *len = addr_len;
801 *addr = nla_data(info->attrs[NLBL_UNLABEL_A_IPV4ADDR]); 800 *addr = nla_data(info->attrs[NLBL_UNLABEL_A_IPV4ADDR]);
802 *mask = nla_data(info->attrs[NLBL_UNLABEL_A_IPV4MASK]); 801 *mask = nla_data(info->attrs[NLBL_UNLABEL_A_IPV4MASK]);
803 return 0; 802 return 0;
804 } else if (info->attrs[NLBL_UNLABEL_A_IPV6ADDR]) { 803 } else if (info->attrs[NLBL_UNLABEL_A_IPV6ADDR]) {
805 addr_len = nla_len(info->attrs[NLBL_UNLABEL_A_IPV6ADDR]); 804 addr_len = nla_len(info->attrs[NLBL_UNLABEL_A_IPV6ADDR]);
806 if (addr_len != sizeof(struct in6_addr) && 805 if (addr_len != sizeof(struct in6_addr) &&
807 addr_len != nla_len(info->attrs[NLBL_UNLABEL_A_IPV6MASK])) 806 addr_len != nla_len(info->attrs[NLBL_UNLABEL_A_IPV6MASK]))
808 return -EINVAL; 807 return -EINVAL;
809 *len = addr_len; 808 *len = addr_len;
810 *addr = nla_data(info->attrs[NLBL_UNLABEL_A_IPV6ADDR]); 809 *addr = nla_data(info->attrs[NLBL_UNLABEL_A_IPV6ADDR]);
811 *mask = nla_data(info->attrs[NLBL_UNLABEL_A_IPV6MASK]); 810 *mask = nla_data(info->attrs[NLBL_UNLABEL_A_IPV6MASK]);
812 return 0; 811 return 0;
813 } 812 }
814 813
815 return -EINVAL; 814 return -EINVAL;
816 } 815 }
817 816
818 /* 817 /*
819 * NetLabel Command Handlers 818 * NetLabel Command Handlers
820 */ 819 */
821 820
822 /** 821 /**
823 * netlbl_unlabel_accept - Handle an ACCEPT message 822 * netlbl_unlabel_accept - Handle an ACCEPT message
824 * @skb: the NETLINK buffer 823 * @skb: the NETLINK buffer
825 * @info: the Generic NETLINK info block 824 * @info: the Generic NETLINK info block
826 * 825 *
827 * Description: 826 * Description:
828 * Process a user generated ACCEPT message and set the accept flag accordingly. 827 * Process a user generated ACCEPT message and set the accept flag accordingly.
829 * Returns zero on success, negative values on failure. 828 * Returns zero on success, negative values on failure.
830 * 829 *
831 */ 830 */
832 static int netlbl_unlabel_accept(struct sk_buff *skb, struct genl_info *info) 831 static int netlbl_unlabel_accept(struct sk_buff *skb, struct genl_info *info)
833 { 832 {
834 u8 value; 833 u8 value;
835 struct netlbl_audit audit_info; 834 struct netlbl_audit audit_info;
836 835
837 if (info->attrs[NLBL_UNLABEL_A_ACPTFLG]) { 836 if (info->attrs[NLBL_UNLABEL_A_ACPTFLG]) {
838 value = nla_get_u8(info->attrs[NLBL_UNLABEL_A_ACPTFLG]); 837 value = nla_get_u8(info->attrs[NLBL_UNLABEL_A_ACPTFLG]);
839 if (value == 1 || value == 0) { 838 if (value == 1 || value == 0) {
840 netlbl_netlink_auditinfo(skb, &audit_info); 839 netlbl_netlink_auditinfo(skb, &audit_info);
841 netlbl_unlabel_acceptflg_set(value, &audit_info); 840 netlbl_unlabel_acceptflg_set(value, &audit_info);
842 return 0; 841 return 0;
843 } 842 }
844 } 843 }
845 844
846 return -EINVAL; 845 return -EINVAL;
847 } 846 }
848 847
849 /** 848 /**
850 * netlbl_unlabel_list - Handle a LIST message 849 * netlbl_unlabel_list - Handle a LIST message
851 * @skb: the NETLINK buffer 850 * @skb: the NETLINK buffer
852 * @info: the Generic NETLINK info block 851 * @info: the Generic NETLINK info block
853 * 852 *
854 * Description: 853 * Description:
855 * Process a user generated LIST message and respond with the current status. 854 * Process a user generated LIST message and respond with the current status.
856 * Returns zero on success, negative values on failure. 855 * Returns zero on success, negative values on failure.
857 * 856 *
858 */ 857 */
859 static int netlbl_unlabel_list(struct sk_buff *skb, struct genl_info *info) 858 static int netlbl_unlabel_list(struct sk_buff *skb, struct genl_info *info)
860 { 859 {
861 int ret_val = -EINVAL; 860 int ret_val = -EINVAL;
862 struct sk_buff *ans_skb; 861 struct sk_buff *ans_skb;
863 void *data; 862 void *data;
864 863
865 ans_skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); 864 ans_skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
866 if (ans_skb == NULL) 865 if (ans_skb == NULL)
867 goto list_failure; 866 goto list_failure;
868 data = genlmsg_put_reply(ans_skb, info, &netlbl_unlabel_gnl_family, 867 data = genlmsg_put_reply(ans_skb, info, &netlbl_unlabel_gnl_family,
869 0, NLBL_UNLABEL_C_LIST); 868 0, NLBL_UNLABEL_C_LIST);
870 if (data == NULL) { 869 if (data == NULL) {
871 ret_val = -ENOMEM; 870 ret_val = -ENOMEM;
872 goto list_failure; 871 goto list_failure;
873 } 872 }
874 873
875 ret_val = nla_put_u8(ans_skb, 874 ret_val = nla_put_u8(ans_skb,
876 NLBL_UNLABEL_A_ACPTFLG, 875 NLBL_UNLABEL_A_ACPTFLG,
877 netlabel_unlabel_acceptflg); 876 netlabel_unlabel_acceptflg);
878 if (ret_val != 0) 877 if (ret_val != 0)
879 goto list_failure; 878 goto list_failure;
880 879
881 genlmsg_end(ans_skb, data); 880 genlmsg_end(ans_skb, data);
882 return genlmsg_reply(ans_skb, info); 881 return genlmsg_reply(ans_skb, info);
883 882
884 list_failure: 883 list_failure:
885 kfree_skb(ans_skb); 884 kfree_skb(ans_skb);
886 return ret_val; 885 return ret_val;
887 } 886 }
888 887
889 /** 888 /**
890 * netlbl_unlabel_staticadd - Handle a STATICADD message 889 * netlbl_unlabel_staticadd - Handle a STATICADD message
891 * @skb: the NETLINK buffer 890 * @skb: the NETLINK buffer
892 * @info: the Generic NETLINK info block 891 * @info: the Generic NETLINK info block
893 * 892 *
894 * Description: 893 * Description:
895 * Process a user generated STATICADD message and add a new unlabeled 894 * Process a user generated STATICADD message and add a new unlabeled
896 * connection entry to the hash table. Returns zero on success, negative 895 * connection entry to the hash table. Returns zero on success, negative
897 * values on failure. 896 * values on failure.
898 * 897 *
899 */ 898 */
900 static int netlbl_unlabel_staticadd(struct sk_buff *skb, 899 static int netlbl_unlabel_staticadd(struct sk_buff *skb,
901 struct genl_info *info) 900 struct genl_info *info)
902 { 901 {
903 int ret_val; 902 int ret_val;
904 char *dev_name; 903 char *dev_name;
905 void *addr; 904 void *addr;
906 void *mask; 905 void *mask;
907 u32 addr_len; 906 u32 addr_len;
908 u32 secid; 907 u32 secid;
909 struct netlbl_audit audit_info; 908 struct netlbl_audit audit_info;
910 909
911 /* Don't allow users to add both IPv4 and IPv6 addresses for a 910 /* Don't allow users to add both IPv4 and IPv6 addresses for a
912 * single entry. However, allow users to create two entries, one each 911 * single entry. However, allow users to create two entries, one each
913 * for IPv4 and IPv6, with the same LSM security context which should 912 * for IPv4 and IPv6, with the same LSM security context which should
914 * achieve the same result. */ 913 * achieve the same result. */
915 if (!info->attrs[NLBL_UNLABEL_A_SECCTX] || 914 if (!info->attrs[NLBL_UNLABEL_A_SECCTX] ||
916 !info->attrs[NLBL_UNLABEL_A_IFACE] || 915 !info->attrs[NLBL_UNLABEL_A_IFACE] ||
917 !((!info->attrs[NLBL_UNLABEL_A_IPV4ADDR] || 916 !((!info->attrs[NLBL_UNLABEL_A_IPV4ADDR] ||
918 !info->attrs[NLBL_UNLABEL_A_IPV4MASK]) ^ 917 !info->attrs[NLBL_UNLABEL_A_IPV4MASK]) ^
919 (!info->attrs[NLBL_UNLABEL_A_IPV6ADDR] || 918 (!info->attrs[NLBL_UNLABEL_A_IPV6ADDR] ||
920 !info->attrs[NLBL_UNLABEL_A_IPV6MASK]))) 919 !info->attrs[NLBL_UNLABEL_A_IPV6MASK])))
921 return -EINVAL; 920 return -EINVAL;
922 921
923 netlbl_netlink_auditinfo(skb, &audit_info); 922 netlbl_netlink_auditinfo(skb, &audit_info);
924 923
925 ret_val = netlbl_unlabel_addrinfo_get(info, &addr, &mask, &addr_len); 924 ret_val = netlbl_unlabel_addrinfo_get(info, &addr, &mask, &addr_len);
926 if (ret_val != 0) 925 if (ret_val != 0)
927 return ret_val; 926 return ret_val;
928 dev_name = nla_data(info->attrs[NLBL_UNLABEL_A_IFACE]); 927 dev_name = nla_data(info->attrs[NLBL_UNLABEL_A_IFACE]);
929 ret_val = security_secctx_to_secid( 928 ret_val = security_secctx_to_secid(
930 nla_data(info->attrs[NLBL_UNLABEL_A_SECCTX]), 929 nla_data(info->attrs[NLBL_UNLABEL_A_SECCTX]),
931 nla_len(info->attrs[NLBL_UNLABEL_A_SECCTX]), 930 nla_len(info->attrs[NLBL_UNLABEL_A_SECCTX]),
932 &secid); 931 &secid);
933 if (ret_val != 0) 932 if (ret_val != 0)
934 return ret_val; 933 return ret_val;
935 934
936 return netlbl_unlhsh_add(&init_net, 935 return netlbl_unlhsh_add(&init_net,
937 dev_name, addr, mask, addr_len, secid, 936 dev_name, addr, mask, addr_len, secid,
938 &audit_info); 937 &audit_info);
939 } 938 }
940 939
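The attribute guard at the top of netlbl_unlabel_staticadd() packs its "only one address family per entry" rule into a single XOR expression. A small stand-alone model of that expression covering the four clean cases; addrs_valid and the flag names are illustrative only.

#include <stdbool.h>
#include <stdio.h>

/* Models the XOR condition above: true when exactly one of the two
 * "family is incompletely specified" predicates holds, which lets
 * exactly one fully specified family through in the cases below. */
static bool addrs_valid(bool a4, bool m4, bool a6, bool m6)
{
	return (!a4 || !m4) ^ (!a6 || !m6);
}

int main(void)
{
	printf("v4 only : %d\n", addrs_valid(true, true, false, false));	/* 1 */
	printf("v6 only : %d\n", addrs_valid(false, false, true, true));	/* 1 */
	printf("both    : %d\n", addrs_valid(true, true, true, true));		/* 0 */
	printf("neither : %d\n", addrs_valid(false, false, false, false));	/* 0 */
	return 0;
}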
941 /** 940 /**
942 * netlbl_unlabel_staticadddef - Handle a STATICADDDEF message 941 * netlbl_unlabel_staticadddef - Handle a STATICADDDEF message
943 * @skb: the NETLINK buffer 942 * @skb: the NETLINK buffer
944 * @info: the Generic NETLINK info block 943 * @info: the Generic NETLINK info block
945 * 944 *
946 * Description: 945 * Description:
947 * Process a user generated STATICADDDEF message and add a new default 946 * Process a user generated STATICADDDEF message and add a new default
948 * unlabeled connection entry. Returns zero on success, negative values on 947 * unlabeled connection entry. Returns zero on success, negative values on
949 * failure. 948 * failure.
950 * 949 *
951 */ 950 */
952 static int netlbl_unlabel_staticadddef(struct sk_buff *skb, 951 static int netlbl_unlabel_staticadddef(struct sk_buff *skb,
953 struct genl_info *info) 952 struct genl_info *info)
954 { 953 {
955 int ret_val; 954 int ret_val;
956 void *addr; 955 void *addr;
957 void *mask; 956 void *mask;
958 u32 addr_len; 957 u32 addr_len;
959 u32 secid; 958 u32 secid;
960 struct netlbl_audit audit_info; 959 struct netlbl_audit audit_info;
961 960
962 /* Don't allow users to add both IPv4 and IPv6 addresses for a 961 /* Don't allow users to add both IPv4 and IPv6 addresses for a
963 * single entry. However, allow users to create two entries, one each 962 * single entry. However, allow users to create two entries, one each
964 * for IPv4 and IPv6, with the same LSM security context which should 963 * for IPv4 and IPv6, with the same LSM security context which should
965 * achieve the same result. */ 964 * achieve the same result. */
966 if (!info->attrs[NLBL_UNLABEL_A_SECCTX] || 965 if (!info->attrs[NLBL_UNLABEL_A_SECCTX] ||
967 !((!info->attrs[NLBL_UNLABEL_A_IPV4ADDR] || 966 !((!info->attrs[NLBL_UNLABEL_A_IPV4ADDR] ||
968 !info->attrs[NLBL_UNLABEL_A_IPV4MASK]) ^ 967 !info->attrs[NLBL_UNLABEL_A_IPV4MASK]) ^
969 (!info->attrs[NLBL_UNLABEL_A_IPV6ADDR] || 968 (!info->attrs[NLBL_UNLABEL_A_IPV6ADDR] ||
970 !info->attrs[NLBL_UNLABEL_A_IPV6MASK]))) 969 !info->attrs[NLBL_UNLABEL_A_IPV6MASK])))
971 return -EINVAL; 970 return -EINVAL;
972 971
973 netlbl_netlink_auditinfo(skb, &audit_info); 972 netlbl_netlink_auditinfo(skb, &audit_info);
974 973
975 ret_val = netlbl_unlabel_addrinfo_get(info, &addr, &mask, &addr_len); 974 ret_val = netlbl_unlabel_addrinfo_get(info, &addr, &mask, &addr_len);
976 if (ret_val != 0) 975 if (ret_val != 0)
977 return ret_val; 976 return ret_val;
978 ret_val = security_secctx_to_secid( 977 ret_val = security_secctx_to_secid(
979 nla_data(info->attrs[NLBL_UNLABEL_A_SECCTX]), 978 nla_data(info->attrs[NLBL_UNLABEL_A_SECCTX]),
980 nla_len(info->attrs[NLBL_UNLABEL_A_SECCTX]), 979 nla_len(info->attrs[NLBL_UNLABEL_A_SECCTX]),
981 &secid); 980 &secid);
982 if (ret_val != 0) 981 if (ret_val != 0)
983 return ret_val; 982 return ret_val;
984 983
985 return netlbl_unlhsh_add(&init_net, 984 return netlbl_unlhsh_add(&init_net,
986 NULL, addr, mask, addr_len, secid, 985 NULL, addr, mask, addr_len, secid,
987 &audit_info); 986 &audit_info);
988 } 987 }
989 988
990 /** 989 /**
991 * netlbl_unlabel_staticremove - Handle a STATICREMOVE message 990 * netlbl_unlabel_staticremove - Handle a STATICREMOVE message
992 * @skb: the NETLINK buffer 991 * @skb: the NETLINK buffer
993 * @info: the Generic NETLINK info block 992 * @info: the Generic NETLINK info block
994 * 993 *
995 * Description: 994 * Description:
996 * Process a user generated STATICREMOVE message and remove the specified 995 * Process a user generated STATICREMOVE message and remove the specified
997 * unlabeled connection entry. Returns zero on success, negative values on 996 * unlabeled connection entry. Returns zero on success, negative values on
998 * failure. 997 * failure.
999 * 998 *
1000 */ 999 */
1001 static int netlbl_unlabel_staticremove(struct sk_buff *skb, 1000 static int netlbl_unlabel_staticremove(struct sk_buff *skb,
1002 struct genl_info *info) 1001 struct genl_info *info)
1003 { 1002 {
1004 int ret_val; 1003 int ret_val;
1005 char *dev_name; 1004 char *dev_name;
1006 void *addr; 1005 void *addr;
1007 void *mask; 1006 void *mask;
1008 u32 addr_len; 1007 u32 addr_len;
1009 struct netlbl_audit audit_info; 1008 struct netlbl_audit audit_info;
1010 1009
1011 /* See the note in netlbl_unlabel_staticadd() about not allowing both 1010 /* See the note in netlbl_unlabel_staticadd() about not allowing both
1012 * IPv4 and IPv6 in the same entry. */ 1011 * IPv4 and IPv6 in the same entry. */
1013 if (!info->attrs[NLBL_UNLABEL_A_IFACE] || 1012 if (!info->attrs[NLBL_UNLABEL_A_IFACE] ||
1014 !((!info->attrs[NLBL_UNLABEL_A_IPV4ADDR] || 1013 !((!info->attrs[NLBL_UNLABEL_A_IPV4ADDR] ||
1015 !info->attrs[NLBL_UNLABEL_A_IPV4MASK]) ^ 1014 !info->attrs[NLBL_UNLABEL_A_IPV4MASK]) ^
1016 (!info->attrs[NLBL_UNLABEL_A_IPV6ADDR] || 1015 (!info->attrs[NLBL_UNLABEL_A_IPV6ADDR] ||
1017 !info->attrs[NLBL_UNLABEL_A_IPV6MASK]))) 1016 !info->attrs[NLBL_UNLABEL_A_IPV6MASK])))
1018 return -EINVAL; 1017 return -EINVAL;
1019 1018
1020 netlbl_netlink_auditinfo(skb, &audit_info); 1019 netlbl_netlink_auditinfo(skb, &audit_info);
1021 1020
1022 ret_val = netlbl_unlabel_addrinfo_get(info, &addr, &mask, &addr_len); 1021 ret_val = netlbl_unlabel_addrinfo_get(info, &addr, &mask, &addr_len);
1023 if (ret_val != 0) 1022 if (ret_val != 0)
1024 return ret_val; 1023 return ret_val;
1025 dev_name = nla_data(info->attrs[NLBL_UNLABEL_A_IFACE]); 1024 dev_name = nla_data(info->attrs[NLBL_UNLABEL_A_IFACE]);
1026 1025
1027 return netlbl_unlhsh_remove(&init_net, 1026 return netlbl_unlhsh_remove(&init_net,
1028 dev_name, addr, mask, addr_len, 1027 dev_name, addr, mask, addr_len,
1029 &audit_info); 1028 &audit_info);
1030 } 1029 }
1031 1030
1032 /** 1031 /**
1033 * netlbl_unlabel_staticremovedef - Handle a STATICREMOVEDEF message 1032 * netlbl_unlabel_staticremovedef - Handle a STATICREMOVEDEF message
1034 * @skb: the NETLINK buffer 1033 * @skb: the NETLINK buffer
1035 * @info: the Generic NETLINK info block 1034 * @info: the Generic NETLINK info block
1036 * 1035 *
1037 * Description: 1036 * Description:
1038 * Process a user generated STATICREMOVEDEF message and remove the default 1037 * Process a user generated STATICREMOVEDEF message and remove the default
1039 * unlabeled connection entry. Returns zero on success, negative values on 1038 * unlabeled connection entry. Returns zero on success, negative values on
1040 * failure. 1039 * failure.
1041 * 1040 *
1042 */ 1041 */
1043 static int netlbl_unlabel_staticremovedef(struct sk_buff *skb, 1042 static int netlbl_unlabel_staticremovedef(struct sk_buff *skb,
1044 struct genl_info *info) 1043 struct genl_info *info)
1045 { 1044 {
1046 int ret_val; 1045 int ret_val;
1047 void *addr; 1046 void *addr;
1048 void *mask; 1047 void *mask;
1049 u32 addr_len; 1048 u32 addr_len;
1050 struct netlbl_audit audit_info; 1049 struct netlbl_audit audit_info;
1051 1050
1052 /* See the note in netlbl_unlabel_staticadd() about not allowing both 1051 /* See the note in netlbl_unlabel_staticadd() about not allowing both
1053 * IPv4 and IPv6 in the same entry. */ 1052 * IPv4 and IPv6 in the same entry. */
1054 if (!((!info->attrs[NLBL_UNLABEL_A_IPV4ADDR] || 1053 if (!((!info->attrs[NLBL_UNLABEL_A_IPV4ADDR] ||
1055 !info->attrs[NLBL_UNLABEL_A_IPV4MASK]) ^ 1054 !info->attrs[NLBL_UNLABEL_A_IPV4MASK]) ^
1056 (!info->attrs[NLBL_UNLABEL_A_IPV6ADDR] || 1055 (!info->attrs[NLBL_UNLABEL_A_IPV6ADDR] ||
1057 !info->attrs[NLBL_UNLABEL_A_IPV6MASK]))) 1056 !info->attrs[NLBL_UNLABEL_A_IPV6MASK])))
1058 return -EINVAL; 1057 return -EINVAL;
1059 1058
1060 netlbl_netlink_auditinfo(skb, &audit_info); 1059 netlbl_netlink_auditinfo(skb, &audit_info);
1061 1060
1062 ret_val = netlbl_unlabel_addrinfo_get(info, &addr, &mask, &addr_len); 1061 ret_val = netlbl_unlabel_addrinfo_get(info, &addr, &mask, &addr_len);
1063 if (ret_val != 0) 1062 if (ret_val != 0)
1064 return ret_val; 1063 return ret_val;
1065 1064
1066 return netlbl_unlhsh_remove(&init_net, 1065 return netlbl_unlhsh_remove(&init_net,
1067 NULL, addr, mask, addr_len, 1066 NULL, addr, mask, addr_len,
1068 &audit_info); 1067 &audit_info);
1069 } 1068 }
1070 1069
1071 1070
1072 /** 1071 /**
1073 * netlbl_unlabel_staticlist_gen - Generate messages for STATICLIST[DEF] 1072 * netlbl_unlabel_staticlist_gen - Generate messages for STATICLIST[DEF]
1074 * @cmd: command/message 1073 * @cmd: command/message
1075 * @iface: the interface entry 1074 * @iface: the interface entry
1076 * @addr4: the IPv4 address entry 1075 * @addr4: the IPv4 address entry
1077 * @addr6: the IPv6 address entry 1076 * @addr6: the IPv6 address entry
1078 * @arg: the netlbl_unlhsh_walk_arg structure 1077 * @arg: the netlbl_unlhsh_walk_arg structure
1079 * 1078 *
1080 * Description: 1079 * Description:
1081 * This function is designed to be used to generate a response for a 1080 * This function is designed to be used to generate a response for a
 1082 * STATICLIST or STATICLISTDEF message. When called, either @addr4 or @addr6 1081 * STATICLIST or STATICLISTDEF message. When called, either @addr4 or @addr6
 1083 * can be specified (not both); the unspecified entry should be set to 1082 * can be specified (not both); the unspecified entry should be set to
1084 * NULL by the caller. Returns the size of the message on success, negative 1083 * NULL by the caller. Returns the size of the message on success, negative
1085 * values on failure. 1084 * values on failure.
1086 * 1085 *
1087 */ 1086 */
1088 static int netlbl_unlabel_staticlist_gen(u32 cmd, 1087 static int netlbl_unlabel_staticlist_gen(u32 cmd,
1089 const struct netlbl_unlhsh_iface *iface, 1088 const struct netlbl_unlhsh_iface *iface,
1090 const struct netlbl_unlhsh_addr4 *addr4, 1089 const struct netlbl_unlhsh_addr4 *addr4,
1091 const struct netlbl_unlhsh_addr6 *addr6, 1090 const struct netlbl_unlhsh_addr6 *addr6,
1092 void *arg) 1091 void *arg)
1093 { 1092 {
1094 int ret_val = -ENOMEM; 1093 int ret_val = -ENOMEM;
1095 struct netlbl_unlhsh_walk_arg *cb_arg = arg; 1094 struct netlbl_unlhsh_walk_arg *cb_arg = arg;
1096 struct net_device *dev; 1095 struct net_device *dev;
1097 void *data; 1096 void *data;
1098 u32 secid; 1097 u32 secid;
1099 char *secctx; 1098 char *secctx;
1100 u32 secctx_len; 1099 u32 secctx_len;
1101 1100
1102 data = genlmsg_put(cb_arg->skb, NETLINK_CB(cb_arg->nl_cb->skb).pid, 1101 data = genlmsg_put(cb_arg->skb, NETLINK_CB(cb_arg->nl_cb->skb).pid,
1103 cb_arg->seq, &netlbl_unlabel_gnl_family, 1102 cb_arg->seq, &netlbl_unlabel_gnl_family,
1104 NLM_F_MULTI, cmd); 1103 NLM_F_MULTI, cmd);
1105 if (data == NULL) 1104 if (data == NULL)
1106 goto list_cb_failure; 1105 goto list_cb_failure;
1107 1106
1108 if (iface->ifindex > 0) { 1107 if (iface->ifindex > 0) {
1109 dev = dev_get_by_index(&init_net, iface->ifindex); 1108 dev = dev_get_by_index(&init_net, iface->ifindex);
1110 if (!dev) { 1109 if (!dev) {
1111 ret_val = -ENODEV; 1110 ret_val = -ENODEV;
1112 goto list_cb_failure; 1111 goto list_cb_failure;
1113 } 1112 }
1114 ret_val = nla_put_string(cb_arg->skb, 1113 ret_val = nla_put_string(cb_arg->skb,
1115 NLBL_UNLABEL_A_IFACE, dev->name); 1114 NLBL_UNLABEL_A_IFACE, dev->name);
1116 dev_put(dev); 1115 dev_put(dev);
1117 if (ret_val != 0) 1116 if (ret_val != 0)
1118 goto list_cb_failure; 1117 goto list_cb_failure;
1119 } 1118 }
1120 1119
1121 if (addr4) { 1120 if (addr4) {
1122 struct in_addr addr_struct; 1121 struct in_addr addr_struct;
1123 1122
1124 addr_struct.s_addr = addr4->list.addr; 1123 addr_struct.s_addr = addr4->list.addr;
1125 ret_val = nla_put(cb_arg->skb, 1124 ret_val = nla_put(cb_arg->skb,
1126 NLBL_UNLABEL_A_IPV4ADDR, 1125 NLBL_UNLABEL_A_IPV4ADDR,
1127 sizeof(struct in_addr), 1126 sizeof(struct in_addr),
1128 &addr_struct); 1127 &addr_struct);
1129 if (ret_val != 0) 1128 if (ret_val != 0)
1130 goto list_cb_failure; 1129 goto list_cb_failure;
1131 1130
1132 addr_struct.s_addr = addr4->list.mask; 1131 addr_struct.s_addr = addr4->list.mask;
1133 ret_val = nla_put(cb_arg->skb, 1132 ret_val = nla_put(cb_arg->skb,
1134 NLBL_UNLABEL_A_IPV4MASK, 1133 NLBL_UNLABEL_A_IPV4MASK,
1135 sizeof(struct in_addr), 1134 sizeof(struct in_addr),
1136 &addr_struct); 1135 &addr_struct);
1137 if (ret_val != 0) 1136 if (ret_val != 0)
1138 goto list_cb_failure; 1137 goto list_cb_failure;
1139 1138
1140 secid = addr4->secid; 1139 secid = addr4->secid;
1141 } else { 1140 } else {
1142 ret_val = nla_put(cb_arg->skb, 1141 ret_val = nla_put(cb_arg->skb,
1143 NLBL_UNLABEL_A_IPV6ADDR, 1142 NLBL_UNLABEL_A_IPV6ADDR,
1144 sizeof(struct in6_addr), 1143 sizeof(struct in6_addr),
1145 &addr6->list.addr); 1144 &addr6->list.addr);
1146 if (ret_val != 0) 1145 if (ret_val != 0)
1147 goto list_cb_failure; 1146 goto list_cb_failure;
1148 1147
1149 ret_val = nla_put(cb_arg->skb, 1148 ret_val = nla_put(cb_arg->skb,
1150 NLBL_UNLABEL_A_IPV6MASK, 1149 NLBL_UNLABEL_A_IPV6MASK,
1151 sizeof(struct in6_addr), 1150 sizeof(struct in6_addr),
1152 &addr6->list.mask); 1151 &addr6->list.mask);
1153 if (ret_val != 0) 1152 if (ret_val != 0)
1154 goto list_cb_failure; 1153 goto list_cb_failure;
1155 1154
1156 secid = addr6->secid; 1155 secid = addr6->secid;
1157 } 1156 }
1158 1157
1159 ret_val = security_secid_to_secctx(secid, &secctx, &secctx_len); 1158 ret_val = security_secid_to_secctx(secid, &secctx, &secctx_len);
1160 if (ret_val != 0) 1159 if (ret_val != 0)
1161 goto list_cb_failure; 1160 goto list_cb_failure;
1162 ret_val = nla_put(cb_arg->skb, 1161 ret_val = nla_put(cb_arg->skb,
1163 NLBL_UNLABEL_A_SECCTX, 1162 NLBL_UNLABEL_A_SECCTX,
1164 secctx_len, 1163 secctx_len,
1165 secctx); 1164 secctx);
1166 security_release_secctx(secctx, secctx_len); 1165 security_release_secctx(secctx, secctx_len);
1167 if (ret_val != 0) 1166 if (ret_val != 0)
1168 goto list_cb_failure; 1167 goto list_cb_failure;
1169 1168
1170 cb_arg->seq++; 1169 cb_arg->seq++;
1171 return genlmsg_end(cb_arg->skb, data); 1170 return genlmsg_end(cb_arg->skb, data);
1172 1171
1173 list_cb_failure: 1172 list_cb_failure:
1174 genlmsg_cancel(cb_arg->skb, data); 1173 genlmsg_cancel(cb_arg->skb, data);
1175 return ret_val; 1174 return ret_val;
1176 } 1175 }
1177 1176
1178 /** 1177 /**
1179 * netlbl_unlabel_staticlist - Handle a STATICLIST message 1178 * netlbl_unlabel_staticlist - Handle a STATICLIST message
1180 * @skb: the NETLINK buffer 1179 * @skb: the NETLINK buffer
1181 * @cb: the NETLINK callback 1180 * @cb: the NETLINK callback
1182 * 1181 *
1183 * Description: 1182 * Description:
1184 * Process a user generated STATICLIST message and dump the unlabeled 1183 * Process a user generated STATICLIST message and dump the unlabeled
1185 * connection hash table in a form suitable for use in a kernel generated 1184 * connection hash table in a form suitable for use in a kernel generated
1186 * STATICLIST message. Returns the length of @skb. 1185 * STATICLIST message. Returns the length of @skb.
1187 * 1186 *
1188 */ 1187 */
1189 static int netlbl_unlabel_staticlist(struct sk_buff *skb, 1188 static int netlbl_unlabel_staticlist(struct sk_buff *skb,
1190 struct netlink_callback *cb) 1189 struct netlink_callback *cb)
1191 { 1190 {
1192 struct netlbl_unlhsh_walk_arg cb_arg; 1191 struct netlbl_unlhsh_walk_arg cb_arg;
1193 u32 skip_bkt = cb->args[0]; 1192 u32 skip_bkt = cb->args[0];
1194 u32 skip_chain = cb->args[1]; 1193 u32 skip_chain = cb->args[1];
1195 u32 skip_addr4 = cb->args[2]; 1194 u32 skip_addr4 = cb->args[2];
1196 u32 skip_addr6 = cb->args[3]; 1195 u32 skip_addr6 = cb->args[3];
1197 u32 iter_bkt; 1196 u32 iter_bkt;
1198 u32 iter_chain = 0, iter_addr4 = 0, iter_addr6 = 0; 1197 u32 iter_chain = 0, iter_addr4 = 0, iter_addr6 = 0;
1199 struct netlbl_unlhsh_iface *iface; 1198 struct netlbl_unlhsh_iface *iface;
1200 struct list_head *iter_list; 1199 struct list_head *iter_list;
1201 struct netlbl_af4list *addr4; 1200 struct netlbl_af4list *addr4;
1202 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) 1201 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
1203 struct netlbl_af6list *addr6; 1202 struct netlbl_af6list *addr6;
1204 #endif 1203 #endif
1205 1204
1206 cb_arg.nl_cb = cb; 1205 cb_arg.nl_cb = cb;
1207 cb_arg.skb = skb; 1206 cb_arg.skb = skb;
1208 cb_arg.seq = cb->nlh->nlmsg_seq; 1207 cb_arg.seq = cb->nlh->nlmsg_seq;
1209 1208
1210 rcu_read_lock(); 1209 rcu_read_lock();
1211 for (iter_bkt = skip_bkt; 1210 for (iter_bkt = skip_bkt;
1212 iter_bkt < rcu_dereference(netlbl_unlhsh)->size; 1211 iter_bkt < rcu_dereference(netlbl_unlhsh)->size;
1213 iter_bkt++, iter_chain = 0, iter_addr4 = 0, iter_addr6 = 0) { 1212 iter_bkt++, iter_chain = 0, iter_addr4 = 0, iter_addr6 = 0) {
1214 iter_list = &rcu_dereference(netlbl_unlhsh)->tbl[iter_bkt]; 1213 iter_list = &rcu_dereference(netlbl_unlhsh)->tbl[iter_bkt];
1215 list_for_each_entry_rcu(iface, iter_list, list) { 1214 list_for_each_entry_rcu(iface, iter_list, list) {
1216 if (!iface->valid || 1215 if (!iface->valid ||
1217 iter_chain++ < skip_chain) 1216 iter_chain++ < skip_chain)
1218 continue; 1217 continue;
1219 netlbl_af4list_foreach_rcu(addr4, 1218 netlbl_af4list_foreach_rcu(addr4,
1220 &iface->addr4_list) { 1219 &iface->addr4_list) {
1221 if (iter_addr4++ < skip_addr4) 1220 if (iter_addr4++ < skip_addr4)
1222 continue; 1221 continue;
1223 if (netlbl_unlabel_staticlist_gen( 1222 if (netlbl_unlabel_staticlist_gen(
1224 NLBL_UNLABEL_C_STATICLIST, 1223 NLBL_UNLABEL_C_STATICLIST,
1225 iface, 1224 iface,
1226 netlbl_unlhsh_addr4_entry(addr4), 1225 netlbl_unlhsh_addr4_entry(addr4),
1227 NULL, 1226 NULL,
1228 &cb_arg) < 0) { 1227 &cb_arg) < 0) {
1229 iter_addr4--; 1228 iter_addr4--;
1230 iter_chain--; 1229 iter_chain--;
1231 goto unlabel_staticlist_return; 1230 goto unlabel_staticlist_return;
1232 } 1231 }
1233 } 1232 }
1234 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) 1233 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
1235 netlbl_af6list_foreach_rcu(addr6, 1234 netlbl_af6list_foreach_rcu(addr6,
1236 &iface->addr6_list) { 1235 &iface->addr6_list) {
1237 if (iter_addr6++ < skip_addr6) 1236 if (iter_addr6++ < skip_addr6)
1238 continue; 1237 continue;
1239 if (netlbl_unlabel_staticlist_gen( 1238 if (netlbl_unlabel_staticlist_gen(
1240 NLBL_UNLABEL_C_STATICLIST, 1239 NLBL_UNLABEL_C_STATICLIST,
1241 iface, 1240 iface,
1242 NULL, 1241 NULL,
1243 netlbl_unlhsh_addr6_entry(addr6), 1242 netlbl_unlhsh_addr6_entry(addr6),
1244 &cb_arg) < 0) { 1243 &cb_arg) < 0) {
1245 iter_addr6--; 1244 iter_addr6--;
1246 iter_chain--; 1245 iter_chain--;
1247 goto unlabel_staticlist_return; 1246 goto unlabel_staticlist_return;
1248 } 1247 }
1249 } 1248 }
1250 #endif /* IPv6 */ 1249 #endif /* IPv6 */
1251 } 1250 }
1252 } 1251 }
1253 1252
1254 unlabel_staticlist_return: 1253 unlabel_staticlist_return:
1255 rcu_read_unlock(); 1254 rcu_read_unlock();
1256 cb->args[0] = skip_bkt; 1255 cb->args[0] = skip_bkt;
1257 cb->args[1] = skip_chain; 1256 cb->args[1] = skip_chain;
1258 cb->args[2] = skip_addr4; 1257 cb->args[2] = skip_addr4;
1259 cb->args[3] = skip_addr6; 1258 cb->args[3] = skip_addr6;
1260 return skb->len; 1259 return skb->len;
1261 } 1260 }
1262 1261
1263 /** 1262 /**
1264 * netlbl_unlabel_staticlistdef - Handle a STATICLISTDEF message 1263 * netlbl_unlabel_staticlistdef - Handle a STATICLISTDEF message
1265 * @skb: the NETLINK buffer 1264 * @skb: the NETLINK buffer
1266 * @cb: the NETLINK callback 1265 * @cb: the NETLINK callback
1267 * 1266 *
1268 * Description: 1267 * Description:
1269 * Process a user generated STATICLISTDEF message and dump the default 1268 * Process a user generated STATICLISTDEF message and dump the default
1270 * unlabeled connection entry in a form suitable for use in a kernel generated 1269 * unlabeled connection entry in a form suitable for use in a kernel generated
1271 * STATICLISTDEF message. Returns the length of @skb. 1270 * STATICLISTDEF message. Returns the length of @skb.
1272 * 1271 *
1273 */ 1272 */
1274 static int netlbl_unlabel_staticlistdef(struct sk_buff *skb, 1273 static int netlbl_unlabel_staticlistdef(struct sk_buff *skb,
1275 struct netlink_callback *cb) 1274 struct netlink_callback *cb)
1276 { 1275 {
1277 struct netlbl_unlhsh_walk_arg cb_arg; 1276 struct netlbl_unlhsh_walk_arg cb_arg;
1278 struct netlbl_unlhsh_iface *iface; 1277 struct netlbl_unlhsh_iface *iface;
1279 u32 skip_addr4 = cb->args[0]; 1278 u32 skip_addr4 = cb->args[0];
1280 u32 skip_addr6 = cb->args[1]; 1279 u32 skip_addr6 = cb->args[1];
1281 u32 iter_addr4 = 0; 1280 u32 iter_addr4 = 0;
1282 struct netlbl_af4list *addr4; 1281 struct netlbl_af4list *addr4;
1283 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) 1282 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
1284 u32 iter_addr6 = 0; 1283 u32 iter_addr6 = 0;
1285 struct netlbl_af6list *addr6; 1284 struct netlbl_af6list *addr6;
1286 #endif 1285 #endif
1287 1286
1288 cb_arg.nl_cb = cb; 1287 cb_arg.nl_cb = cb;
1289 cb_arg.skb = skb; 1288 cb_arg.skb = skb;
1290 cb_arg.seq = cb->nlh->nlmsg_seq; 1289 cb_arg.seq = cb->nlh->nlmsg_seq;
1291 1290
1292 rcu_read_lock(); 1291 rcu_read_lock();
1293 iface = rcu_dereference(netlbl_unlhsh_def); 1292 iface = rcu_dereference(netlbl_unlhsh_def);
1294 if (iface == NULL || !iface->valid) 1293 if (iface == NULL || !iface->valid)
1295 goto unlabel_staticlistdef_return; 1294 goto unlabel_staticlistdef_return;
1296 1295
1297 netlbl_af4list_foreach_rcu(addr4, &iface->addr4_list) { 1296 netlbl_af4list_foreach_rcu(addr4, &iface->addr4_list) {
1298 if (iter_addr4++ < skip_addr4) 1297 if (iter_addr4++ < skip_addr4)
1299 continue; 1298 continue;
1300 if (netlbl_unlabel_staticlist_gen(NLBL_UNLABEL_C_STATICLISTDEF, 1299 if (netlbl_unlabel_staticlist_gen(NLBL_UNLABEL_C_STATICLISTDEF,
1301 iface, 1300 iface,
1302 netlbl_unlhsh_addr4_entry(addr4), 1301 netlbl_unlhsh_addr4_entry(addr4),
1303 NULL, 1302 NULL,
1304 &cb_arg) < 0) { 1303 &cb_arg) < 0) {
1305 iter_addr4--; 1304 iter_addr4--;
1306 goto unlabel_staticlistdef_return; 1305 goto unlabel_staticlistdef_return;
1307 } 1306 }
1308 } 1307 }
1309 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) 1308 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
1310 netlbl_af6list_foreach_rcu(addr6, &iface->addr6_list) { 1309 netlbl_af6list_foreach_rcu(addr6, &iface->addr6_list) {
1311 if (iter_addr6++ < skip_addr6) 1310 if (iter_addr6++ < skip_addr6)
1312 continue; 1311 continue;
1313 if (netlbl_unlabel_staticlist_gen(NLBL_UNLABEL_C_STATICLISTDEF, 1312 if (netlbl_unlabel_staticlist_gen(NLBL_UNLABEL_C_STATICLISTDEF,
1314 iface, 1313 iface,
1315 NULL, 1314 NULL,
1316 netlbl_unlhsh_addr6_entry(addr6), 1315 netlbl_unlhsh_addr6_entry(addr6),
1317 &cb_arg) < 0) { 1316 &cb_arg) < 0) {
1318 iter_addr6--; 1317 iter_addr6--;
1319 goto unlabel_staticlistdef_return; 1318 goto unlabel_staticlistdef_return;
1320 } 1319 }
1321 } 1320 }
1322 #endif /* IPv6 */ 1321 #endif /* IPv6 */
1323 1322
1324 unlabel_staticlistdef_return: 1323 unlabel_staticlistdef_return:
1325 rcu_read_unlock(); 1324 rcu_read_unlock();
1326 cb->args[0] = skip_addr4; 1325 cb->args[0] = skip_addr4;
1327 cb->args[1] = skip_addr6; 1326 cb->args[1] = skip_addr6;
1328 return skb->len; 1327 return skb->len;
1329 } 1328 }
1330 1329
1331 /* 1330 /*
1332 * NetLabel Generic NETLINK Command Definitions 1331 * NetLabel Generic NETLINK Command Definitions
1333 */ 1332 */
1334 1333
1335 static struct genl_ops netlbl_unlabel_genl_ops[] = { 1334 static struct genl_ops netlbl_unlabel_genl_ops[] = {
1336 { 1335 {
1337 .cmd = NLBL_UNLABEL_C_STATICADD, 1336 .cmd = NLBL_UNLABEL_C_STATICADD,
1338 .flags = GENL_ADMIN_PERM, 1337 .flags = GENL_ADMIN_PERM,
1339 .policy = netlbl_unlabel_genl_policy, 1338 .policy = netlbl_unlabel_genl_policy,
1340 .doit = netlbl_unlabel_staticadd, 1339 .doit = netlbl_unlabel_staticadd,
1341 .dumpit = NULL, 1340 .dumpit = NULL,
1342 }, 1341 },
1343 { 1342 {
1344 .cmd = NLBL_UNLABEL_C_STATICREMOVE, 1343 .cmd = NLBL_UNLABEL_C_STATICREMOVE,
1345 .flags = GENL_ADMIN_PERM, 1344 .flags = GENL_ADMIN_PERM,
1346 .policy = netlbl_unlabel_genl_policy, 1345 .policy = netlbl_unlabel_genl_policy,
1347 .doit = netlbl_unlabel_staticremove, 1346 .doit = netlbl_unlabel_staticremove,
1348 .dumpit = NULL, 1347 .dumpit = NULL,
1349 }, 1348 },
1350 { 1349 {
1351 .cmd = NLBL_UNLABEL_C_STATICLIST, 1350 .cmd = NLBL_UNLABEL_C_STATICLIST,
1352 .flags = 0, 1351 .flags = 0,
1353 .policy = netlbl_unlabel_genl_policy, 1352 .policy = netlbl_unlabel_genl_policy,
1354 .doit = NULL, 1353 .doit = NULL,
1355 .dumpit = netlbl_unlabel_staticlist, 1354 .dumpit = netlbl_unlabel_staticlist,
1356 }, 1355 },
1357 { 1356 {
1358 .cmd = NLBL_UNLABEL_C_STATICADDDEF, 1357 .cmd = NLBL_UNLABEL_C_STATICADDDEF,
1359 .flags = GENL_ADMIN_PERM, 1358 .flags = GENL_ADMIN_PERM,
1360 .policy = netlbl_unlabel_genl_policy, 1359 .policy = netlbl_unlabel_genl_policy,
1361 .doit = netlbl_unlabel_staticadddef, 1360 .doit = netlbl_unlabel_staticadddef,
1362 .dumpit = NULL, 1361 .dumpit = NULL,
1363 }, 1362 },
1364 { 1363 {
1365 .cmd = NLBL_UNLABEL_C_STATICREMOVEDEF, 1364 .cmd = NLBL_UNLABEL_C_STATICREMOVEDEF,
1366 .flags = GENL_ADMIN_PERM, 1365 .flags = GENL_ADMIN_PERM,
1367 .policy = netlbl_unlabel_genl_policy, 1366 .policy = netlbl_unlabel_genl_policy,
1368 .doit = netlbl_unlabel_staticremovedef, 1367 .doit = netlbl_unlabel_staticremovedef,
1369 .dumpit = NULL, 1368 .dumpit = NULL,
1370 }, 1369 },
1371 { 1370 {
1372 .cmd = NLBL_UNLABEL_C_STATICLISTDEF, 1371 .cmd = NLBL_UNLABEL_C_STATICLISTDEF,
1373 .flags = 0, 1372 .flags = 0,
1374 .policy = netlbl_unlabel_genl_policy, 1373 .policy = netlbl_unlabel_genl_policy,
1375 .doit = NULL, 1374 .doit = NULL,
1376 .dumpit = netlbl_unlabel_staticlistdef, 1375 .dumpit = netlbl_unlabel_staticlistdef,
1377 }, 1376 },
1378 { 1377 {
1379 .cmd = NLBL_UNLABEL_C_ACCEPT, 1378 .cmd = NLBL_UNLABEL_C_ACCEPT,
1380 .flags = GENL_ADMIN_PERM, 1379 .flags = GENL_ADMIN_PERM,
1381 .policy = netlbl_unlabel_genl_policy, 1380 .policy = netlbl_unlabel_genl_policy,
1382 .doit = netlbl_unlabel_accept, 1381 .doit = netlbl_unlabel_accept,
1383 .dumpit = NULL, 1382 .dumpit = NULL,
1384 }, 1383 },
1385 { 1384 {
1386 .cmd = NLBL_UNLABEL_C_LIST, 1385 .cmd = NLBL_UNLABEL_C_LIST,
1387 .flags = 0, 1386 .flags = 0,
1388 .policy = netlbl_unlabel_genl_policy, 1387 .policy = netlbl_unlabel_genl_policy,
1389 .doit = netlbl_unlabel_list, 1388 .doit = netlbl_unlabel_list,
1390 .dumpit = NULL, 1389 .dumpit = NULL,
1391 }, 1390 },
1392 }; 1391 };
1393 1392
1394 /* 1393 /*
1395 * NetLabel Generic NETLINK Protocol Functions 1394 * NetLabel Generic NETLINK Protocol Functions
1396 */ 1395 */
1397 1396
1398 /** 1397 /**
1399 * netlbl_unlabel_genl_init - Register the Unlabeled NetLabel component 1398 * netlbl_unlabel_genl_init - Register the Unlabeled NetLabel component
1400 * 1399 *
1401 * Description: 1400 * Description:
1402 * Register the unlabeled packet NetLabel component with the Generic NETLINK 1401 * Register the unlabeled packet NetLabel component with the Generic NETLINK
1403 * mechanism. Returns zero on success, negative values on failure. 1402 * mechanism. Returns zero on success, negative values on failure.
1404 * 1403 *
1405 */ 1404 */
1406 int __init netlbl_unlabel_genl_init(void) 1405 int __init netlbl_unlabel_genl_init(void)
1407 { 1406 {
1408 return genl_register_family_with_ops(&netlbl_unlabel_gnl_family, 1407 return genl_register_family_with_ops(&netlbl_unlabel_gnl_family,
1409 netlbl_unlabel_genl_ops, ARRAY_SIZE(netlbl_unlabel_genl_ops)); 1408 netlbl_unlabel_genl_ops, ARRAY_SIZE(netlbl_unlabel_genl_ops));
1410 } 1409 }
1411 1410
1412 /* 1411 /*
1413 * NetLabel KAPI Hooks 1412 * NetLabel KAPI Hooks
1414 */ 1413 */
1415 1414
1416 static struct notifier_block netlbl_unlhsh_netdev_notifier = { 1415 static struct notifier_block netlbl_unlhsh_netdev_notifier = {
1417 .notifier_call = netlbl_unlhsh_netdev_handler, 1416 .notifier_call = netlbl_unlhsh_netdev_handler,
1418 }; 1417 };
1419 1418
1420 /** 1419 /**
1421 * netlbl_unlabel_init - Initialize the unlabeled connection hash table 1420 * netlbl_unlabel_init - Initialize the unlabeled connection hash table
1422 * @size: the number of bits to use for the hash buckets 1421 * @size: the number of bits to use for the hash buckets
1423 * 1422 *
1424 * Description: 1423 * Description:
1425 * Initializes the unlabeled connection hash table and registers a network 1424 * Initializes the unlabeled connection hash table and registers a network
1426 * device notification handler. This function should only be called by the 1425 * device notification handler. This function should only be called by the
1427 * NetLabel subsystem itself during initialization. Returns zero on success, 1426 * NetLabel subsystem itself during initialization. Returns zero on success,
1428 * non-zero values on error. 1427 * non-zero values on error.
1429 * 1428 *
1430 */ 1429 */
1431 int __init netlbl_unlabel_init(u32 size) 1430 int __init netlbl_unlabel_init(u32 size)
1432 { 1431 {
1433 u32 iter; 1432 u32 iter;
1434 struct netlbl_unlhsh_tbl *hsh_tbl; 1433 struct netlbl_unlhsh_tbl *hsh_tbl;
1435 1434
1436 if (size == 0) 1435 if (size == 0)
1437 return -EINVAL; 1436 return -EINVAL;
1438 1437
1439 hsh_tbl = kmalloc(sizeof(*hsh_tbl), GFP_KERNEL); 1438 hsh_tbl = kmalloc(sizeof(*hsh_tbl), GFP_KERNEL);
1440 if (hsh_tbl == NULL) 1439 if (hsh_tbl == NULL)
1441 return -ENOMEM; 1440 return -ENOMEM;
1442 hsh_tbl->size = 1 << size; 1441 hsh_tbl->size = 1 << size;
1443 hsh_tbl->tbl = kcalloc(hsh_tbl->size, 1442 hsh_tbl->tbl = kcalloc(hsh_tbl->size,
1444 sizeof(struct list_head), 1443 sizeof(struct list_head),
1445 GFP_KERNEL); 1444 GFP_KERNEL);
1446 if (hsh_tbl->tbl == NULL) { 1445 if (hsh_tbl->tbl == NULL) {
1447 kfree(hsh_tbl); 1446 kfree(hsh_tbl);
1448 return -ENOMEM; 1447 return -ENOMEM;
1449 } 1448 }
1450 for (iter = 0; iter < hsh_tbl->size; iter++) 1449 for (iter = 0; iter < hsh_tbl->size; iter++)
1451 INIT_LIST_HEAD(&hsh_tbl->tbl[iter]); 1450 INIT_LIST_HEAD(&hsh_tbl->tbl[iter]);
1452 1451
1453 rcu_read_lock(); 1452 rcu_read_lock();
1454 spin_lock(&netlbl_unlhsh_lock); 1453 spin_lock(&netlbl_unlhsh_lock);
1455 rcu_assign_pointer(netlbl_unlhsh, hsh_tbl); 1454 rcu_assign_pointer(netlbl_unlhsh, hsh_tbl);
1456 spin_unlock(&netlbl_unlhsh_lock); 1455 spin_unlock(&netlbl_unlhsh_lock);
1457 rcu_read_unlock(); 1456 rcu_read_unlock();
1458 1457
1459 register_netdevice_notifier(&netlbl_unlhsh_netdev_notifier); 1458 register_netdevice_notifier(&netlbl_unlhsh_netdev_notifier);
1460 1459
1461 return 0; 1460 return 0;
1462 } 1461 }
1463 1462
1464 /** 1463 /**
 1465 * netlbl_unlabel_getattr - Get the security attributes for an unlabeled packet 1464 * netlbl_unlabel_getattr - Get the security attributes for an unlabeled packet
1466 * @skb: the packet 1465 * @skb: the packet
1467 * @family: protocol family 1466 * @family: protocol family
1468 * @secattr: the security attributes 1467 * @secattr: the security attributes
1469 * 1468 *
1470 * Description: 1469 * Description:
 1471 * Determine the security attributes, if any, for an unlabeled packet and return 1470 * Determine the security attributes, if any, for an unlabeled packet and return
1472 * them in @secattr. Returns zero on success and negative values on failure. 1471 * them in @secattr. Returns zero on success and negative values on failure.
1473 * 1472 *
1474 */ 1473 */
1475 int netlbl_unlabel_getattr(const struct sk_buff *skb, 1474 int netlbl_unlabel_getattr(const struct sk_buff *skb,
1476 u16 family, 1475 u16 family,
1477 struct netlbl_lsm_secattr *secattr) 1476 struct netlbl_lsm_secattr *secattr)
1478 { 1477 {
1479 struct netlbl_unlhsh_iface *iface; 1478 struct netlbl_unlhsh_iface *iface;
1480 1479
1481 rcu_read_lock(); 1480 rcu_read_lock();
1482 iface = netlbl_unlhsh_search_iface(skb->skb_iif); 1481 iface = netlbl_unlhsh_search_iface(skb->skb_iif);
1483 if (iface == NULL) 1482 if (iface == NULL)
1484 iface = rcu_dereference(netlbl_unlhsh_def); 1483 iface = rcu_dereference(netlbl_unlhsh_def);
1485 if (iface == NULL || !iface->valid) 1484 if (iface == NULL || !iface->valid)
1486 goto unlabel_getattr_nolabel; 1485 goto unlabel_getattr_nolabel;
1487 switch (family) { 1486 switch (family) {
1488 case PF_INET: { 1487 case PF_INET: {
1489 struct iphdr *hdr4; 1488 struct iphdr *hdr4;
1490 struct netlbl_af4list *addr4; 1489 struct netlbl_af4list *addr4;
1491 1490
1492 hdr4 = ip_hdr(skb); 1491 hdr4 = ip_hdr(skb);
1493 addr4 = netlbl_af4list_search(hdr4->saddr, 1492 addr4 = netlbl_af4list_search(hdr4->saddr,
1494 &iface->addr4_list); 1493 &iface->addr4_list);
1495 if (addr4 == NULL) 1494 if (addr4 == NULL)
1496 goto unlabel_getattr_nolabel; 1495 goto unlabel_getattr_nolabel;
1497 secattr->attr.secid = netlbl_unlhsh_addr4_entry(addr4)->secid; 1496 secattr->attr.secid = netlbl_unlhsh_addr4_entry(addr4)->secid;
1498 break; 1497 break;
1499 } 1498 }
1500 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) 1499 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
1501 case PF_INET6: { 1500 case PF_INET6: {
1502 struct ipv6hdr *hdr6; 1501 struct ipv6hdr *hdr6;
1503 struct netlbl_af6list *addr6; 1502 struct netlbl_af6list *addr6;
1504 1503
1505 hdr6 = ipv6_hdr(skb); 1504 hdr6 = ipv6_hdr(skb);
1506 addr6 = netlbl_af6list_search(&hdr6->saddr, 1505 addr6 = netlbl_af6list_search(&hdr6->saddr,
1507 &iface->addr6_list); 1506 &iface->addr6_list);
1508 if (addr6 == NULL) 1507 if (addr6 == NULL)
1509 goto unlabel_getattr_nolabel; 1508 goto unlabel_getattr_nolabel;
1510 secattr->attr.secid = netlbl_unlhsh_addr6_entry(addr6)->secid; 1509 secattr->attr.secid = netlbl_unlhsh_addr6_entry(addr6)->secid;
1511 break; 1510 break;
1512 } 1511 }
1513 #endif /* IPv6 */ 1512 #endif /* IPv6 */
1514 default: 1513 default:
1515 goto unlabel_getattr_nolabel; 1514 goto unlabel_getattr_nolabel;
1516 } 1515 }
1517 rcu_read_unlock(); 1516 rcu_read_unlock();
1518 1517
1519 secattr->flags |= NETLBL_SECATTR_SECID; 1518 secattr->flags |= NETLBL_SECATTR_SECID;
1520 secattr->type = NETLBL_NLTYPE_UNLABELED; 1519 secattr->type = NETLBL_NLTYPE_UNLABELED;
1521 return 0; 1520 return 0;
1522 1521
1523 unlabel_getattr_nolabel: 1522 unlabel_getattr_nolabel:
1524 rcu_read_unlock(); 1523 rcu_read_unlock();
1525 if (netlabel_unlabel_acceptflg == 0) 1524 if (netlabel_unlabel_acceptflg == 0)
1526 return -ENOMSG; 1525 return -ENOMSG;
1527 secattr->type = NETLBL_NLTYPE_UNLABELED; 1526 secattr->type = NETLBL_NLTYPE_UNLABELED;
1528 return 0; 1527 return 0;
1529 } 1528 }
1530 1529
1531 /** 1530 /**
1532 * netlbl_unlabel_defconf - Set the default config to allow unlabeled packets 1531 * netlbl_unlabel_defconf - Set the default config to allow unlabeled packets
1533 * 1532 *
1534 * Description: 1533 * Description:
1535 * Set the default NetLabel configuration to allow incoming unlabeled packets 1534 * Set the default NetLabel configuration to allow incoming unlabeled packets
1536 * and to send unlabeled network traffic by default. 1535 * and to send unlabeled network traffic by default.
1537 * 1536 *
1538 */ 1537 */
1539 int __init netlbl_unlabel_defconf(void) 1538 int __init netlbl_unlabel_defconf(void)
1540 { 1539 {
1541 int ret_val; 1540 int ret_val;
1542 struct netlbl_dom_map *entry; 1541 struct netlbl_dom_map *entry;
1543 struct netlbl_audit audit_info; 1542 struct netlbl_audit audit_info;
1544 1543
1545 /* Only the kernel is allowed to call this function and the only time 1544 /* Only the kernel is allowed to call this function and the only time
1546 * it is called is at bootup before the audit subsystem is reporting 1545 * it is called is at bootup before the audit subsystem is reporting
 1547 * messages, so don't worry too much about these values. */ 1546 * messages, so don't worry too much about these values. */
1548 security_task_getsecid(current, &audit_info.secid); 1547 security_task_getsecid(current, &audit_info.secid);
1549 audit_info.loginuid = 0; 1548 audit_info.loginuid = 0;
1550 audit_info.sessionid = 0; 1549 audit_info.sessionid = 0;
1551 1550
1552 entry = kzalloc(sizeof(*entry), GFP_KERNEL); 1551 entry = kzalloc(sizeof(*entry), GFP_KERNEL);
1553 if (entry == NULL) 1552 if (entry == NULL)
1554 return -ENOMEM; 1553 return -ENOMEM;
1555 entry->type = NETLBL_NLTYPE_UNLABELED; 1554 entry->type = NETLBL_NLTYPE_UNLABELED;
1556 ret_val = netlbl_domhsh_add_default(entry, &audit_info); 1555 ret_val = netlbl_domhsh_add_default(entry, &audit_info);
1557 if (ret_val != 0) 1556 if (ret_val != 0)
1558 return ret_val; 1557 return ret_val;
1559 1558
1560 netlbl_unlabel_acceptflg_set(1, &audit_info); 1559 netlbl_unlabel_acceptflg_set(1, &audit_info);
1561 1560
1562 return 0; 1561 return 0;
1563 } 1562 }
1564 1563
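
As context for the RCU usage shown above (none of which is changed by this commit): netlbl_unlabel_init() publishes the hash table with rcu_assign_pointer() while holding netlbl_unlhsh_lock, and readers such as netlbl_unlabel_staticlist() fetch it with rcu_dereference() inside rcu_read_lock()/rcu_read_unlock(). A minimal sketch of that publish/lookup pattern follows; the names (my_tbl, my_tbl_publish, my_tbl_size) are hypothetical and only stand in for the NetLabel structures.

#include <linux/types.h>
#include <linux/rcupdate.h>
#include <linux/spinlock.h>
#include <linux/slab.h>

/* Hypothetical RCU-published table; not the NetLabel code itself. */
struct my_tbl {
        u32 size;
};

static struct my_tbl __rcu *my_tbl_ptr;
static DEFINE_SPINLOCK(my_tbl_lock);

/* Replacement of an already-published table is omitted for brevity. */
static int my_tbl_publish(u32 size)
{
        struct my_tbl *tbl = kmalloc(sizeof(*tbl), GFP_KERNEL);

        if (tbl == NULL)
                return -ENOMEM;
        tbl->size = size;

        spin_lock(&my_tbl_lock);                /* serialise updaters only */
        rcu_assign_pointer(my_tbl_ptr, tbl);    /* publish to readers */
        spin_unlock(&my_tbl_lock);
        return 0;
}

static u32 my_tbl_size(void)
{
        struct my_tbl *tbl;
        u32 size = 0;

        rcu_read_lock();                        /* read-side critical section */
        tbl = rcu_dereference(my_tbl_ptr);
        if (tbl != NULL)
                size = tbl->size;
        rcu_read_unlock();
        return size;
}

The spinlock only serialises updaters; readers rely entirely on the RCU read-side critical section, which is exactly the condition rcu_dereference_check() now verifies on its own.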
security/keys/keyring.c
1 /* Keyring handling 1 /* Keyring handling
2 * 2 *
3 * Copyright (C) 2004-2005, 2008 Red Hat, Inc. All Rights Reserved. 3 * Copyright (C) 2004-2005, 2008 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com) 4 * Written by David Howells (dhowells@redhat.com)
5 * 5 *
6 * This program is free software; you can redistribute it and/or 6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License 7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version 8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version. 9 * 2 of the License, or (at your option) any later version.
10 */ 10 */
11 11
12 #include <linux/module.h> 12 #include <linux/module.h>
13 #include <linux/init.h> 13 #include <linux/init.h>
14 #include <linux/sched.h> 14 #include <linux/sched.h>
15 #include <linux/slab.h> 15 #include <linux/slab.h>
16 #include <linux/security.h> 16 #include <linux/security.h>
17 #include <linux/seq_file.h> 17 #include <linux/seq_file.h>
18 #include <linux/err.h> 18 #include <linux/err.h>
19 #include <keys/keyring-type.h> 19 #include <keys/keyring-type.h>
20 #include <linux/uaccess.h> 20 #include <linux/uaccess.h>
21 #include "internal.h" 21 #include "internal.h"
22 22
23 #define rcu_dereference_locked_keyring(keyring) \ 23 #define rcu_dereference_locked_keyring(keyring) \
24 (rcu_dereference_protected( \ 24 (rcu_dereference_protected( \
25 (keyring)->payload.subscriptions, \ 25 (keyring)->payload.subscriptions, \
26 rwsem_is_locked((struct rw_semaphore *)&(keyring)->sem))) 26 rwsem_is_locked((struct rw_semaphore *)&(keyring)->sem)))
27 27
28 #define KEY_LINK_FIXQUOTA 1UL 28 #define KEY_LINK_FIXQUOTA 1UL
29 29
30 /* 30 /*
31 * When plumbing the depths of the key tree, this sets a hard limit 31 * When plumbing the depths of the key tree, this sets a hard limit
 32 * on how deep we're willing to go. 32 * on how deep we're willing to go.
33 */ 33 */
34 #define KEYRING_SEARCH_MAX_DEPTH 6 34 #define KEYRING_SEARCH_MAX_DEPTH 6
35 35
36 /* 36 /*
37 * We keep all named keyrings in a hash to speed looking them up. 37 * We keep all named keyrings in a hash to speed looking them up.
38 */ 38 */
39 #define KEYRING_NAME_HASH_SIZE (1 << 5) 39 #define KEYRING_NAME_HASH_SIZE (1 << 5)
40 40
41 static struct list_head keyring_name_hash[KEYRING_NAME_HASH_SIZE]; 41 static struct list_head keyring_name_hash[KEYRING_NAME_HASH_SIZE];
42 static DEFINE_RWLOCK(keyring_name_lock); 42 static DEFINE_RWLOCK(keyring_name_lock);
43 43
44 static inline unsigned keyring_hash(const char *desc) 44 static inline unsigned keyring_hash(const char *desc)
45 { 45 {
46 unsigned bucket = 0; 46 unsigned bucket = 0;
47 47
48 for (; *desc; desc++) 48 for (; *desc; desc++)
49 bucket += (unsigned char)*desc; 49 bucket += (unsigned char)*desc;
50 50
51 return bucket & (KEYRING_NAME_HASH_SIZE - 1); 51 return bucket & (KEYRING_NAME_HASH_SIZE - 1);
52 } 52 }
53 53
54 /* 54 /*
55 * The keyring key type definition. Keyrings are simply keys of this type and 55 * The keyring key type definition. Keyrings are simply keys of this type and
56 * can be treated as ordinary keys in addition to having their own special 56 * can be treated as ordinary keys in addition to having their own special
57 * operations. 57 * operations.
58 */ 58 */
59 static int keyring_instantiate(struct key *keyring, 59 static int keyring_instantiate(struct key *keyring,
60 const void *data, size_t datalen); 60 const void *data, size_t datalen);
61 static int keyring_match(const struct key *keyring, const void *criterion); 61 static int keyring_match(const struct key *keyring, const void *criterion);
62 static void keyring_revoke(struct key *keyring); 62 static void keyring_revoke(struct key *keyring);
63 static void keyring_destroy(struct key *keyring); 63 static void keyring_destroy(struct key *keyring);
64 static void keyring_describe(const struct key *keyring, struct seq_file *m); 64 static void keyring_describe(const struct key *keyring, struct seq_file *m);
65 static long keyring_read(const struct key *keyring, 65 static long keyring_read(const struct key *keyring,
66 char __user *buffer, size_t buflen); 66 char __user *buffer, size_t buflen);
67 67
68 struct key_type key_type_keyring = { 68 struct key_type key_type_keyring = {
69 .name = "keyring", 69 .name = "keyring",
70 .def_datalen = sizeof(struct keyring_list), 70 .def_datalen = sizeof(struct keyring_list),
71 .instantiate = keyring_instantiate, 71 .instantiate = keyring_instantiate,
72 .match = keyring_match, 72 .match = keyring_match,
73 .revoke = keyring_revoke, 73 .revoke = keyring_revoke,
74 .destroy = keyring_destroy, 74 .destroy = keyring_destroy,
75 .describe = keyring_describe, 75 .describe = keyring_describe,
76 .read = keyring_read, 76 .read = keyring_read,
77 }; 77 };
78 EXPORT_SYMBOL(key_type_keyring); 78 EXPORT_SYMBOL(key_type_keyring);
79 79
80 /* 80 /*
81 * Semaphore to serialise link/link calls to prevent two link calls in parallel 81 * Semaphore to serialise link/link calls to prevent two link calls in parallel
82 * introducing a cycle. 82 * introducing a cycle.
83 */ 83 */
84 static DECLARE_RWSEM(keyring_serialise_link_sem); 84 static DECLARE_RWSEM(keyring_serialise_link_sem);
85 85
86 /* 86 /*
87 * Publish the name of a keyring so that it can be found by name (if it has 87 * Publish the name of a keyring so that it can be found by name (if it has
88 * one). 88 * one).
89 */ 89 */
90 static void keyring_publish_name(struct key *keyring) 90 static void keyring_publish_name(struct key *keyring)
91 { 91 {
92 int bucket; 92 int bucket;
93 93
94 if (keyring->description) { 94 if (keyring->description) {
95 bucket = keyring_hash(keyring->description); 95 bucket = keyring_hash(keyring->description);
96 96
97 write_lock(&keyring_name_lock); 97 write_lock(&keyring_name_lock);
98 98
99 if (!keyring_name_hash[bucket].next) 99 if (!keyring_name_hash[bucket].next)
100 INIT_LIST_HEAD(&keyring_name_hash[bucket]); 100 INIT_LIST_HEAD(&keyring_name_hash[bucket]);
101 101
102 list_add_tail(&keyring->type_data.link, 102 list_add_tail(&keyring->type_data.link,
103 &keyring_name_hash[bucket]); 103 &keyring_name_hash[bucket]);
104 104
105 write_unlock(&keyring_name_lock); 105 write_unlock(&keyring_name_lock);
106 } 106 }
107 } 107 }
108 108
109 /* 109 /*
110 * Initialise a keyring. 110 * Initialise a keyring.
111 * 111 *
112 * Returns 0 on success, -EINVAL if given any data. 112 * Returns 0 on success, -EINVAL if given any data.
113 */ 113 */
114 static int keyring_instantiate(struct key *keyring, 114 static int keyring_instantiate(struct key *keyring,
115 const void *data, size_t datalen) 115 const void *data, size_t datalen)
116 { 116 {
117 int ret; 117 int ret;
118 118
119 ret = -EINVAL; 119 ret = -EINVAL;
120 if (datalen == 0) { 120 if (datalen == 0) {
121 /* make the keyring available by name if it has one */ 121 /* make the keyring available by name if it has one */
122 keyring_publish_name(keyring); 122 keyring_publish_name(keyring);
123 ret = 0; 123 ret = 0;
124 } 124 }
125 125
126 return ret; 126 return ret;
127 } 127 }
128 128
129 /* 129 /*
130 * Match keyrings on their name 130 * Match keyrings on their name
131 */ 131 */
132 static int keyring_match(const struct key *keyring, const void *description) 132 static int keyring_match(const struct key *keyring, const void *description)
133 { 133 {
134 return keyring->description && 134 return keyring->description &&
135 strcmp(keyring->description, description) == 0; 135 strcmp(keyring->description, description) == 0;
136 } 136 }
137 137
138 /* 138 /*
139 * Clean up a keyring when it is destroyed. Unpublish its name if it had one 139 * Clean up a keyring when it is destroyed. Unpublish its name if it had one
140 * and dispose of its data. 140 * and dispose of its data.
141 */ 141 */
142 static void keyring_destroy(struct key *keyring) 142 static void keyring_destroy(struct key *keyring)
143 { 143 {
144 struct keyring_list *klist; 144 struct keyring_list *klist;
145 int loop; 145 int loop;
146 146
147 if (keyring->description) { 147 if (keyring->description) {
148 write_lock(&keyring_name_lock); 148 write_lock(&keyring_name_lock);
149 149
150 if (keyring->type_data.link.next != NULL && 150 if (keyring->type_data.link.next != NULL &&
151 !list_empty(&keyring->type_data.link)) 151 !list_empty(&keyring->type_data.link))
152 list_del(&keyring->type_data.link); 152 list_del(&keyring->type_data.link);
153 153
154 write_unlock(&keyring_name_lock); 154 write_unlock(&keyring_name_lock);
155 } 155 }
156 156
157 klist = rcu_dereference_check(keyring->payload.subscriptions, 157 klist = rcu_dereference_check(keyring->payload.subscriptions,
158 rcu_read_lock_held() ||
159 atomic_read(&keyring->usage) == 0); 158 atomic_read(&keyring->usage) == 0);
160 if (klist) { 159 if (klist) {
161 for (loop = klist->nkeys - 1; loop >= 0; loop--) 160 for (loop = klist->nkeys - 1; loop >= 0; loop--)
162 key_put(klist->keys[loop]); 161 key_put(klist->keys[loop]);
163 kfree(klist); 162 kfree(klist);
164 } 163 }
165 } 164 }
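
The hunk above is the only functional change in this file: the rcu_read_lock_held() term is dropped from the rcu_dereference_check() condition because, since commit ca5ecddf, rcu_dereference_check() includes that check itself. A hedged before/after sketch of the pattern, using hypothetical names (my_obj, my_payload, my_obj_payload) rather than the key code:

#include <linux/rcupdate.h>
#include <linux/atomic.h>

struct my_payload;

struct my_obj {
        atomic_t usage;
        struct my_payload __rcu *payload;
};

static struct my_payload *my_obj_payload(struct my_obj *obj)
{
        /*
         * Before ca5ecddf the caller spelled out the read-lock check:
         *
         *      rcu_dereference_check(obj->payload,
         *                            rcu_read_lock_held() ||
         *                            atomic_read(&obj->usage) == 0);
         *
         * rcu_dereference_check() now tests rcu_read_lock_held() on its
         * own, so only the extra "last reference is gone" condition is
         * passed in, the same shape as the keyring_destroy() change above.
         */
        return rcu_dereference_check(obj->payload,
                                     atomic_read(&obj->usage) == 0);
}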
166 165
167 /* 166 /*
168 * Describe a keyring for /proc. 167 * Describe a keyring for /proc.
169 */ 168 */
170 static void keyring_describe(const struct key *keyring, struct seq_file *m) 169 static void keyring_describe(const struct key *keyring, struct seq_file *m)
171 { 170 {
172 struct keyring_list *klist; 171 struct keyring_list *klist;
173 172
174 if (keyring->description) 173 if (keyring->description)
175 seq_puts(m, keyring->description); 174 seq_puts(m, keyring->description);
176 else 175 else
177 seq_puts(m, "[anon]"); 176 seq_puts(m, "[anon]");
178 177
179 if (key_is_instantiated(keyring)) { 178 if (key_is_instantiated(keyring)) {
180 rcu_read_lock(); 179 rcu_read_lock();
181 klist = rcu_dereference(keyring->payload.subscriptions); 180 klist = rcu_dereference(keyring->payload.subscriptions);
182 if (klist) 181 if (klist)
183 seq_printf(m, ": %u/%u", klist->nkeys, klist->maxkeys); 182 seq_printf(m, ": %u/%u", klist->nkeys, klist->maxkeys);
184 else 183 else
185 seq_puts(m, ": empty"); 184 seq_puts(m, ": empty");
186 rcu_read_unlock(); 185 rcu_read_unlock();
187 } 186 }
188 } 187 }
189 188
190 /* 189 /*
191 * Read a list of key IDs from the keyring's contents in binary form 190 * Read a list of key IDs from the keyring's contents in binary form
192 * 191 *
193 * The keyring's semaphore is read-locked by the caller. 192 * The keyring's semaphore is read-locked by the caller.
194 */ 193 */
195 static long keyring_read(const struct key *keyring, 194 static long keyring_read(const struct key *keyring,
196 char __user *buffer, size_t buflen) 195 char __user *buffer, size_t buflen)
197 { 196 {
198 struct keyring_list *klist; 197 struct keyring_list *klist;
199 struct key *key; 198 struct key *key;
200 size_t qty, tmp; 199 size_t qty, tmp;
201 int loop, ret; 200 int loop, ret;
202 201
203 ret = 0; 202 ret = 0;
204 klist = rcu_dereference_locked_keyring(keyring); 203 klist = rcu_dereference_locked_keyring(keyring);
205 if (klist) { 204 if (klist) {
206 /* calculate how much data we could return */ 205 /* calculate how much data we could return */
207 qty = klist->nkeys * sizeof(key_serial_t); 206 qty = klist->nkeys * sizeof(key_serial_t);
208 207
209 if (buffer && buflen > 0) { 208 if (buffer && buflen > 0) {
210 if (buflen > qty) 209 if (buflen > qty)
211 buflen = qty; 210 buflen = qty;
212 211
213 /* copy the IDs of the subscribed keys into the 212 /* copy the IDs of the subscribed keys into the
214 * buffer */ 213 * buffer */
215 ret = -EFAULT; 214 ret = -EFAULT;
216 215
217 for (loop = 0; loop < klist->nkeys; loop++) { 216 for (loop = 0; loop < klist->nkeys; loop++) {
218 key = klist->keys[loop]; 217 key = klist->keys[loop];
219 218
220 tmp = sizeof(key_serial_t); 219 tmp = sizeof(key_serial_t);
221 if (tmp > buflen) 220 if (tmp > buflen)
222 tmp = buflen; 221 tmp = buflen;
223 222
224 if (copy_to_user(buffer, 223 if (copy_to_user(buffer,
225 &key->serial, 224 &key->serial,
226 tmp) != 0) 225 tmp) != 0)
227 goto error; 226 goto error;
228 227
229 buflen -= tmp; 228 buflen -= tmp;
230 if (buflen == 0) 229 if (buflen == 0)
231 break; 230 break;
232 buffer += tmp; 231 buffer += tmp;
233 } 232 }
234 } 233 }
235 234
236 ret = qty; 235 ret = qty;
237 } 236 }
238 237
239 error: 238 error:
240 return ret; 239 return ret;
241 } 240 }
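
keyring_read() runs with the keyring semaphore held, so it goes through the rcu_dereference_locked_keyring() wrapper defined near the top of this file, which builds on rcu_dereference_protected(): no RCU read-side critical section is required, and lockdep only verifies the stated lock condition. A minimal sketch of that protected-dereference pattern, with hypothetical names (my_cfg, my_cfg_sem, my_cfg_locked):

#include <linux/rcupdate.h>
#include <linux/rwsem.h>

struct my_cfg;

static DECLARE_RWSEM(my_cfg_sem);
static struct my_cfg __rcu *my_cfg_ptr;

/* Caller must hold my_cfg_sem (read or write); no rcu_read_lock() needed. */
static struct my_cfg *my_cfg_locked(void)
{
        return rcu_dereference_protected(my_cfg_ptr,
                                         rwsem_is_locked(&my_cfg_sem));
}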
242 241
243 /* 242 /*
244 * Allocate a keyring and link into the destination keyring. 243 * Allocate a keyring and link into the destination keyring.
245 */ 244 */
246 struct key *keyring_alloc(const char *description, uid_t uid, gid_t gid, 245 struct key *keyring_alloc(const char *description, uid_t uid, gid_t gid,
247 const struct cred *cred, unsigned long flags, 246 const struct cred *cred, unsigned long flags,
248 struct key *dest) 247 struct key *dest)
249 { 248 {
250 struct key *keyring; 249 struct key *keyring;
251 int ret; 250 int ret;
252 251
253 keyring = key_alloc(&key_type_keyring, description, 252 keyring = key_alloc(&key_type_keyring, description,
254 uid, gid, cred, 253 uid, gid, cred,
255 (KEY_POS_ALL & ~KEY_POS_SETATTR) | KEY_USR_ALL, 254 (KEY_POS_ALL & ~KEY_POS_SETATTR) | KEY_USR_ALL,
256 flags); 255 flags);
257 256
258 if (!IS_ERR(keyring)) { 257 if (!IS_ERR(keyring)) {
259 ret = key_instantiate_and_link(keyring, NULL, 0, dest, NULL); 258 ret = key_instantiate_and_link(keyring, NULL, 0, dest, NULL);
260 if (ret < 0) { 259 if (ret < 0) {
261 key_put(keyring); 260 key_put(keyring);
262 keyring = ERR_PTR(ret); 261 keyring = ERR_PTR(ret);
263 } 262 }
264 } 263 }
265 264
266 return keyring; 265 return keyring;
267 } 266 }
268 267
269 /** 268 /**
270 * keyring_search_aux - Search a keyring tree for a key matching some criteria 269 * keyring_search_aux - Search a keyring tree for a key matching some criteria
271 * @keyring_ref: A pointer to the keyring with possession indicator. 270 * @keyring_ref: A pointer to the keyring with possession indicator.
272 * @cred: The credentials to use for permissions checks. 271 * @cred: The credentials to use for permissions checks.
273 * @type: The type of key to search for. 272 * @type: The type of key to search for.
274 * @description: Parameter for @match. 273 * @description: Parameter for @match.
275 * @match: Function to rule on whether or not a key is the one required. 274 * @match: Function to rule on whether or not a key is the one required.
276 * @no_state_check: Don't check if a matching key is bad 275 * @no_state_check: Don't check if a matching key is bad
277 * 276 *
278 * Search the supplied keyring tree for a key that matches the criteria given. 277 * Search the supplied keyring tree for a key that matches the criteria given.
279 * The root keyring and any linked keyrings must grant Search permission to the 278 * The root keyring and any linked keyrings must grant Search permission to the
280 * caller to be searchable and keys can only be found if they too grant Search 279 * caller to be searchable and keys can only be found if they too grant Search
281 * to the caller. The possession flag on the root keyring pointer controls use 280 * to the caller. The possession flag on the root keyring pointer controls use
282 * of the possessor bits in permissions checking of the entire tree. In 281 * of the possessor bits in permissions checking of the entire tree. In
283 * addition, the LSM gets to forbid keyring searches and key matches. 282 * addition, the LSM gets to forbid keyring searches and key matches.
284 * 283 *
285 * The search is performed as a breadth-then-depth search up to the prescribed 284 * The search is performed as a breadth-then-depth search up to the prescribed
286 * limit (KEYRING_SEARCH_MAX_DEPTH). 285 * limit (KEYRING_SEARCH_MAX_DEPTH).
287 * 286 *
288 * Keys are matched to the type provided and are then filtered by the match 287 * Keys are matched to the type provided and are then filtered by the match
289 * function, which is given the description to use in any way it sees fit. The 288 * function, which is given the description to use in any way it sees fit. The
 290 * match function may use any attributes of a key that it wishes to 289 * match function may use any attributes of a key that it wishes to
291 * determine the match. Normally the match function from the key type would be 290 * determine the match. Normally the match function from the key type would be
292 * used. 291 * used.
293 * 292 *
294 * RCU is used to prevent the keyring key lists from disappearing without the 293 * RCU is used to prevent the keyring key lists from disappearing without the
295 * need to take lots of locks. 294 * need to take lots of locks.
296 * 295 *
297 * Returns a pointer to the found key and increments the key usage count if 296 * Returns a pointer to the found key and increments the key usage count if
298 * successful; -EAGAIN if no matching keys were found, or if expired or revoked 297 * successful; -EAGAIN if no matching keys were found, or if expired or revoked
299 * keys were found; -ENOKEY if only negative keys were found; -ENOTDIR if the 298 * keys were found; -ENOKEY if only negative keys were found; -ENOTDIR if the
300 * specified keyring wasn't a keyring. 299 * specified keyring wasn't a keyring.
301 * 300 *
302 * In the case of a successful return, the possession attribute from 301 * In the case of a successful return, the possession attribute from
303 * @keyring_ref is propagated to the returned key reference. 302 * @keyring_ref is propagated to the returned key reference.
304 */ 303 */
305 key_ref_t keyring_search_aux(key_ref_t keyring_ref, 304 key_ref_t keyring_search_aux(key_ref_t keyring_ref,
306 const struct cred *cred, 305 const struct cred *cred,
307 struct key_type *type, 306 struct key_type *type,
308 const void *description, 307 const void *description,
309 key_match_func_t match, 308 key_match_func_t match,
310 bool no_state_check) 309 bool no_state_check)
311 { 310 {
312 struct { 311 struct {
313 struct keyring_list *keylist; 312 struct keyring_list *keylist;
314 int kix; 313 int kix;
315 } stack[KEYRING_SEARCH_MAX_DEPTH]; 314 } stack[KEYRING_SEARCH_MAX_DEPTH];
316 315
317 struct keyring_list *keylist; 316 struct keyring_list *keylist;
318 struct timespec now; 317 struct timespec now;
319 unsigned long possessed, kflags; 318 unsigned long possessed, kflags;
320 struct key *keyring, *key; 319 struct key *keyring, *key;
321 key_ref_t key_ref; 320 key_ref_t key_ref;
322 long err; 321 long err;
323 int sp, kix; 322 int sp, kix;
324 323
325 keyring = key_ref_to_ptr(keyring_ref); 324 keyring = key_ref_to_ptr(keyring_ref);
326 possessed = is_key_possessed(keyring_ref); 325 possessed = is_key_possessed(keyring_ref);
327 key_check(keyring); 326 key_check(keyring);
328 327
329 /* top keyring must have search permission to begin the search */ 328 /* top keyring must have search permission to begin the search */
330 err = key_task_permission(keyring_ref, cred, KEY_SEARCH); 329 err = key_task_permission(keyring_ref, cred, KEY_SEARCH);
331 if (err < 0) { 330 if (err < 0) {
332 key_ref = ERR_PTR(err); 331 key_ref = ERR_PTR(err);
333 goto error; 332 goto error;
334 } 333 }
335 334
336 key_ref = ERR_PTR(-ENOTDIR); 335 key_ref = ERR_PTR(-ENOTDIR);
337 if (keyring->type != &key_type_keyring) 336 if (keyring->type != &key_type_keyring)
338 goto error; 337 goto error;
339 338
340 rcu_read_lock(); 339 rcu_read_lock();
341 340
342 now = current_kernel_time(); 341 now = current_kernel_time();
343 err = -EAGAIN; 342 err = -EAGAIN;
344 sp = 0; 343 sp = 0;
345 344
346 /* firstly we should check to see if this top-level keyring is what we 345 /* firstly we should check to see if this top-level keyring is what we
347 * are looking for */ 346 * are looking for */
348 key_ref = ERR_PTR(-EAGAIN); 347 key_ref = ERR_PTR(-EAGAIN);
349 kflags = keyring->flags; 348 kflags = keyring->flags;
350 if (keyring->type == type && match(keyring, description)) { 349 if (keyring->type == type && match(keyring, description)) {
351 key = keyring; 350 key = keyring;
352 if (no_state_check) 351 if (no_state_check)
353 goto found; 352 goto found;
354 353
355 /* check it isn't negative and hasn't expired or been 354 /* check it isn't negative and hasn't expired or been
356 * revoked */ 355 * revoked */
357 if (kflags & (1 << KEY_FLAG_REVOKED)) 356 if (kflags & (1 << KEY_FLAG_REVOKED))
358 goto error_2; 357 goto error_2;
359 if (key->expiry && now.tv_sec >= key->expiry) 358 if (key->expiry && now.tv_sec >= key->expiry)
360 goto error_2; 359 goto error_2;
361 key_ref = ERR_PTR(key->type_data.reject_error); 360 key_ref = ERR_PTR(key->type_data.reject_error);
362 if (kflags & (1 << KEY_FLAG_NEGATIVE)) 361 if (kflags & (1 << KEY_FLAG_NEGATIVE))
363 goto error_2; 362 goto error_2;
364 goto found; 363 goto found;
365 } 364 }
366 365
367 /* otherwise, the top keyring must not be revoked, expired, or 366 /* otherwise, the top keyring must not be revoked, expired, or
368 * negatively instantiated if we are to search it */ 367 * negatively instantiated if we are to search it */
369 key_ref = ERR_PTR(-EAGAIN); 368 key_ref = ERR_PTR(-EAGAIN);
370 if (kflags & ((1 << KEY_FLAG_REVOKED) | (1 << KEY_FLAG_NEGATIVE)) || 369 if (kflags & ((1 << KEY_FLAG_REVOKED) | (1 << KEY_FLAG_NEGATIVE)) ||
371 (keyring->expiry && now.tv_sec >= keyring->expiry)) 370 (keyring->expiry && now.tv_sec >= keyring->expiry))
372 goto error_2; 371 goto error_2;
373 372
374 /* start processing a new keyring */ 373 /* start processing a new keyring */
375 descend: 374 descend:
376 if (test_bit(KEY_FLAG_REVOKED, &keyring->flags)) 375 if (test_bit(KEY_FLAG_REVOKED, &keyring->flags))
377 goto not_this_keyring; 376 goto not_this_keyring;
378 377
379 keylist = rcu_dereference(keyring->payload.subscriptions); 378 keylist = rcu_dereference(keyring->payload.subscriptions);
380 if (!keylist) 379 if (!keylist)
381 goto not_this_keyring; 380 goto not_this_keyring;
382 381
383 /* iterate through the keys in this keyring first */ 382 /* iterate through the keys in this keyring first */
384 for (kix = 0; kix < keylist->nkeys; kix++) { 383 for (kix = 0; kix < keylist->nkeys; kix++) {
385 key = keylist->keys[kix]; 384 key = keylist->keys[kix];
386 kflags = key->flags; 385 kflags = key->flags;
387 386
388 /* ignore keys not of this type */ 387 /* ignore keys not of this type */
389 if (key->type != type) 388 if (key->type != type)
390 continue; 389 continue;
391 390
392 /* skip revoked keys and expired keys */ 391 /* skip revoked keys and expired keys */
393 if (!no_state_check) { 392 if (!no_state_check) {
394 if (kflags & (1 << KEY_FLAG_REVOKED)) 393 if (kflags & (1 << KEY_FLAG_REVOKED))
395 continue; 394 continue;
396 395
397 if (key->expiry && now.tv_sec >= key->expiry) 396 if (key->expiry && now.tv_sec >= key->expiry)
398 continue; 397 continue;
399 } 398 }
400 399
401 /* keys that don't match */ 400 /* keys that don't match */
402 if (!match(key, description)) 401 if (!match(key, description))
403 continue; 402 continue;
404 403
405 /* key must have search permissions */ 404 /* key must have search permissions */
406 if (key_task_permission(make_key_ref(key, possessed), 405 if (key_task_permission(make_key_ref(key, possessed),
407 cred, KEY_SEARCH) < 0) 406 cred, KEY_SEARCH) < 0)
408 continue; 407 continue;
409 408
410 if (no_state_check) 409 if (no_state_check)
411 goto found; 410 goto found;
412 411
413 /* we set a different error code if we pass a negative key */ 412 /* we set a different error code if we pass a negative key */
414 if (kflags & (1 << KEY_FLAG_NEGATIVE)) { 413 if (kflags & (1 << KEY_FLAG_NEGATIVE)) {
415 err = key->type_data.reject_error; 414 err = key->type_data.reject_error;
416 continue; 415 continue;
417 } 416 }
418 417
419 goto found; 418 goto found;
420 } 419 }
421 420
422 /* search through the keyrings nested in this one */ 421 /* search through the keyrings nested in this one */
423 kix = 0; 422 kix = 0;
424 ascend: 423 ascend:
425 for (; kix < keylist->nkeys; kix++) { 424 for (; kix < keylist->nkeys; kix++) {
426 key = keylist->keys[kix]; 425 key = keylist->keys[kix];
427 if (key->type != &key_type_keyring) 426 if (key->type != &key_type_keyring)
428 continue; 427 continue;
429 428
430 /* recursively search nested keyrings 429 /* recursively search nested keyrings
431 * - only search keyrings for which we have search permission 430 * - only search keyrings for which we have search permission
432 */ 431 */
433 if (sp >= KEYRING_SEARCH_MAX_DEPTH) 432 if (sp >= KEYRING_SEARCH_MAX_DEPTH)
434 continue; 433 continue;
435 434
436 if (key_task_permission(make_key_ref(key, possessed), 435 if (key_task_permission(make_key_ref(key, possessed),
437 cred, KEY_SEARCH) < 0) 436 cred, KEY_SEARCH) < 0)
438 continue; 437 continue;
439 438
440 /* stack the current position */ 439 /* stack the current position */
441 stack[sp].keylist = keylist; 440 stack[sp].keylist = keylist;
442 stack[sp].kix = kix; 441 stack[sp].kix = kix;
443 sp++; 442 sp++;
444 443
445 /* begin again with the new keyring */ 444 /* begin again with the new keyring */
446 keyring = key; 445 keyring = key;
447 goto descend; 446 goto descend;
448 } 447 }
449 448
450 /* the keyring we're looking at was disqualified or didn't contain a 449 /* the keyring we're looking at was disqualified or didn't contain a
451 * matching key */ 450 * matching key */
452 not_this_keyring: 451 not_this_keyring:
453 if (sp > 0) { 452 if (sp > 0) {
454 /* resume the processing of a keyring higher up in the tree */ 453 /* resume the processing of a keyring higher up in the tree */
455 sp--; 454 sp--;
456 keylist = stack[sp].keylist; 455 keylist = stack[sp].keylist;
457 kix = stack[sp].kix + 1; 456 kix = stack[sp].kix + 1;
458 goto ascend; 457 goto ascend;
459 } 458 }
460 459
461 key_ref = ERR_PTR(err); 460 key_ref = ERR_PTR(err);
462 goto error_2; 461 goto error_2;
463 462
464 /* we found a viable match */ 463 /* we found a viable match */
465 found: 464 found:
466 atomic_inc(&key->usage); 465 atomic_inc(&key->usage);
467 key_check(key); 466 key_check(key);
468 key_ref = make_key_ref(key, possessed); 467 key_ref = make_key_ref(key, possessed);
469 error_2: 468 error_2:
470 rcu_read_unlock(); 469 rcu_read_unlock();
471 error: 470 error:
472 return key_ref; 471 return key_ref;
473 } 472 }
474 473
475 /** 474 /**
476 * keyring_search - Search the supplied keyring tree for a matching key 475 * keyring_search - Search the supplied keyring tree for a matching key
477 * @keyring: The root of the keyring tree to be searched. 476 * @keyring: The root of the keyring tree to be searched.
478 * @type: The type of key we want to find. 477 * @type: The type of key we want to find.
479 * @description: The name of the key we want to find. 478 * @description: The name of the key we want to find.
480 * 479 *
481 * As keyring_search_aux() above, but using the current task's credentials and 480 * As keyring_search_aux() above, but using the current task's credentials and
482 * type's default matching function. 481 * type's default matching function.
483 */ 482 */
484 key_ref_t keyring_search(key_ref_t keyring, 483 key_ref_t keyring_search(key_ref_t keyring,
485 struct key_type *type, 484 struct key_type *type,
486 const char *description) 485 const char *description)
487 { 486 {
488 if (!type->match) 487 if (!type->match)
489 return ERR_PTR(-ENOKEY); 488 return ERR_PTR(-ENOKEY);
490 489
491 return keyring_search_aux(keyring, current->cred, 490 return keyring_search_aux(keyring, current->cred,
492 type, description, type->match, false); 491 type, description, type->match, false);
493 } 492 }
494 EXPORT_SYMBOL(keyring_search); 493 EXPORT_SYMBOL(keyring_search);
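
For reference, a minimal sketch of how an in-kernel caller might use the exported keyring_search() interface; the helper name example_lookup, the "start" keyring parameter and the choice of key_type_user are illustrative assumptions and not part of this file.

/* Illustrative sketch only - not part of keyring.c. */
#include <linux/err.h>
#include <linux/kernel.h>
#include <linux/key.h>
#include <keys/user-type.h>	/* key_type_user, assumed for convenience */

static int example_lookup(struct key *start, const char *desc)
{
	key_ref_t kref;
	struct key *key;

	/* treat the starting keyring as possessed by the caller */
	kref = keyring_search(make_key_ref(start, 1), &key_type_user, desc);
	if (IS_ERR(kref))
		return PTR_ERR(kref);		/* typically -ENOKEY */

	key = key_ref_to_ptr(kref);		/* usage count was already bumped */
	pr_info("found key %d\n", key_serial(key));
	key_ref_put(kref);			/* drop the reference when done */
	return 0;
}

The possession bit passed to make_key_ref() is propagated into the permission checks performed during the search, which is why the sketch marks the starting keyring as possessed.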
495 494
496 /* 495 /*
497 * Search the given keyring only (no recursion). 496 * Search the given keyring only (no recursion).
498 * 497 *
499 * The caller must guarantee that the keyring is a keyring and that search 498 * The caller must guarantee that the keyring is a keyring and that search
500 * permission has been granted on it, as no check is made here. 499 * permission has been granted on it, as no check is made here.
501 * 500 *
502 * RCU is used to make it unnecessary to lock the keyring key list here. 501 * RCU is used to make it unnecessary to lock the keyring key list here.
503 * 502 *
504 * Returns a pointer to the found key with usage count incremented if 503 * Returns a pointer to the found key with usage count incremented if
505 * successful and returns -ENOKEY if not found. Revoked keys and keys not 504 * successful and returns -ENOKEY if not found. Revoked keys and keys not
506 * providing the requested permission are skipped over. 505 * providing the requested permission are skipped over.
507 * 506 *
508 * If successful, the possession indicator is propagated from the keyring ref 507 * If successful, the possession indicator is propagated from the keyring ref
509 * to the returned key reference. 508 * to the returned key reference.
510 */ 509 */
511 key_ref_t __keyring_search_one(key_ref_t keyring_ref, 510 key_ref_t __keyring_search_one(key_ref_t keyring_ref,
512 const struct key_type *ktype, 511 const struct key_type *ktype,
513 const char *description, 512 const char *description,
514 key_perm_t perm) 513 key_perm_t perm)
515 { 514 {
516 struct keyring_list *klist; 515 struct keyring_list *klist;
517 unsigned long possessed; 516 unsigned long possessed;
518 struct key *keyring, *key; 517 struct key *keyring, *key;
519 int loop; 518 int loop;
520 519
521 keyring = key_ref_to_ptr(keyring_ref); 520 keyring = key_ref_to_ptr(keyring_ref);
522 possessed = is_key_possessed(keyring_ref); 521 possessed = is_key_possessed(keyring_ref);
523 522
524 rcu_read_lock(); 523 rcu_read_lock();
525 524
526 klist = rcu_dereference(keyring->payload.subscriptions); 525 klist = rcu_dereference(keyring->payload.subscriptions);
527 if (klist) { 526 if (klist) {
528 for (loop = 0; loop < klist->nkeys; loop++) { 527 for (loop = 0; loop < klist->nkeys; loop++) {
529 key = klist->keys[loop]; 528 key = klist->keys[loop];
530 529
531 if (key->type == ktype && 530 if (key->type == ktype &&
532 (!key->type->match || 531 (!key->type->match ||
533 key->type->match(key, description)) && 532 key->type->match(key, description)) &&
534 key_permission(make_key_ref(key, possessed), 533 key_permission(make_key_ref(key, possessed),
535 perm) == 0 && 534 perm) == 0 &&
536 !test_bit(KEY_FLAG_REVOKED, &key->flags) 535 !test_bit(KEY_FLAG_REVOKED, &key->flags)
537 ) 536 )
538 goto found; 537 goto found;
539 } 538 }
540 } 539 }
541 540
542 rcu_read_unlock(); 541 rcu_read_unlock();
543 return ERR_PTR(-ENOKEY); 542 return ERR_PTR(-ENOKEY);
544 543
545 found: 544 found:
546 atomic_inc(&key->usage); 545 atomic_inc(&key->usage);
547 rcu_read_unlock(); 546 rcu_read_unlock();
548 return make_key_ref(key, possessed); 547 return make_key_ref(key, possessed);
549 } 548 }
550 549
551 /* 550 /*
552 * Find a keyring with the specified name. 551 * Find a keyring with the specified name.
553 * 552 *
554 * All named keyrings in the current user namespace are searched, provided they 553 * All named keyrings in the current user namespace are searched, provided they
555 * grant Search permission directly to the caller (unless this check is 554 * grant Search permission directly to the caller (unless this check is
556 * skipped). Keyrings whose usage counts have reached zero or that have been 555 * skipped). Keyrings whose usage counts have reached zero or that have been
557 * revoked are skipped. 556 * revoked are skipped.
558 * 557 *
559 * Returns a pointer to the keyring with the keyring's refcount having been 558 * Returns a pointer to the keyring with the keyring's refcount having been
560 * incremented on success. -ENOKEY is returned if a key could not be found. 559 * incremented on success. -ENOKEY is returned if a key could not be found.
561 */ 560 */
562 struct key *find_keyring_by_name(const char *name, bool skip_perm_check) 561 struct key *find_keyring_by_name(const char *name, bool skip_perm_check)
563 { 562 {
564 struct key *keyring; 563 struct key *keyring;
565 int bucket; 564 int bucket;
566 565
567 if (!name) 566 if (!name)
568 return ERR_PTR(-EINVAL); 567 return ERR_PTR(-EINVAL);
569 568
570 bucket = keyring_hash(name); 569 bucket = keyring_hash(name);
571 570
572 read_lock(&keyring_name_lock); 571 read_lock(&keyring_name_lock);
573 572
574 if (keyring_name_hash[bucket].next) { 573 if (keyring_name_hash[bucket].next) {
575 /* search this hash bucket for a keyring with a matching name 574 /* search this hash bucket for a keyring with a matching name
576 * that's readable and that hasn't been revoked */ 575 * that's readable and that hasn't been revoked */
577 list_for_each_entry(keyring, 576 list_for_each_entry(keyring,
578 &keyring_name_hash[bucket], 577 &keyring_name_hash[bucket],
579 type_data.link 578 type_data.link
580 ) { 579 ) {
581 if (keyring->user->user_ns != current_user_ns()) 580 if (keyring->user->user_ns != current_user_ns())
582 continue; 581 continue;
583 582
584 if (test_bit(KEY_FLAG_REVOKED, &keyring->flags)) 583 if (test_bit(KEY_FLAG_REVOKED, &keyring->flags))
585 continue; 584 continue;
586 585
587 if (strcmp(keyring->description, name) != 0) 586 if (strcmp(keyring->description, name) != 0)
588 continue; 587 continue;
589 588
590 if (!skip_perm_check && 589 if (!skip_perm_check &&
591 key_permission(make_key_ref(keyring, 0), 590 key_permission(make_key_ref(keyring, 0),
592 KEY_SEARCH) < 0) 591 KEY_SEARCH) < 0)
593 continue; 592 continue;
594 593
595 /* we've got a match but we might end up racing with 594 /* we've got a match but we might end up racing with
596 * key_cleanup() if the keyring is currently 'dead' 595 * key_cleanup() if the keyring is currently 'dead'
597 * (ie. it has a zero usage count) */ 596 * (ie. it has a zero usage count) */
598 if (!atomic_inc_not_zero(&keyring->usage)) 597 if (!atomic_inc_not_zero(&keyring->usage))
599 continue; 598 continue;
600 goto out; 599 goto out;
601 } 600 }
602 } 601 }
603 602
604 keyring = ERR_PTR(-ENOKEY); 603 keyring = ERR_PTR(-ENOKEY);
605 out: 604 out:
606 read_unlock(&keyring_name_lock); 605 read_unlock(&keyring_name_lock);
607 return keyring; 606 return keyring;
608 } 607 }
609 608
610 /* 609 /*
611 * See if a cycle will be created by inserting acyclic tree B in acyclic 610 * See if a cycle will be created by inserting acyclic tree B in acyclic
612 * tree A at the topmost level (ie: as a direct child of A). 611 * tree A at the topmost level (ie: as a direct child of A).
613 * 612 *
614 * Since we are adding B to A at the top level, checking for cycles should just 613 * Since we are adding B to A at the top level, checking for cycles should just
615 * be a matter of seeing if node A is somewhere in tree B. 614 * be a matter of seeing if node A is somewhere in tree B.
616 */ 615 */
617 static int keyring_detect_cycle(struct key *A, struct key *B) 616 static int keyring_detect_cycle(struct key *A, struct key *B)
618 { 617 {
619 struct { 618 struct {
620 struct keyring_list *keylist; 619 struct keyring_list *keylist;
621 int kix; 620 int kix;
622 } stack[KEYRING_SEARCH_MAX_DEPTH]; 621 } stack[KEYRING_SEARCH_MAX_DEPTH];
623 622
624 struct keyring_list *keylist; 623 struct keyring_list *keylist;
625 struct key *subtree, *key; 624 struct key *subtree, *key;
626 int sp, kix, ret; 625 int sp, kix, ret;
627 626
628 rcu_read_lock(); 627 rcu_read_lock();
629 628
630 ret = -EDEADLK; 629 ret = -EDEADLK;
631 if (A == B) 630 if (A == B)
632 goto cycle_detected; 631 goto cycle_detected;
633 632
634 subtree = B; 633 subtree = B;
635 sp = 0; 634 sp = 0;
636 635
637 /* start processing a new keyring */ 636 /* start processing a new keyring */
638 descend: 637 descend:
639 if (test_bit(KEY_FLAG_REVOKED, &subtree->flags)) 638 if (test_bit(KEY_FLAG_REVOKED, &subtree->flags))
640 goto not_this_keyring; 639 goto not_this_keyring;
641 640
642 keylist = rcu_dereference(subtree->payload.subscriptions); 641 keylist = rcu_dereference(subtree->payload.subscriptions);
643 if (!keylist) 642 if (!keylist)
644 goto not_this_keyring; 643 goto not_this_keyring;
645 kix = 0; 644 kix = 0;
646 645
647 ascend: 646 ascend:
648 /* iterate through the remaining keys in this keyring */ 647 /* iterate through the remaining keys in this keyring */
649 for (; kix < keylist->nkeys; kix++) { 648 for (; kix < keylist->nkeys; kix++) {
650 key = keylist->keys[kix]; 649 key = keylist->keys[kix];
651 650
652 if (key == A) 651 if (key == A)
653 goto cycle_detected; 652 goto cycle_detected;
654 653
655 /* recursively check nested keyrings */ 654 /* recursively check nested keyrings */
656 if (key->type == &key_type_keyring) { 655 if (key->type == &key_type_keyring) {
657 if (sp >= KEYRING_SEARCH_MAX_DEPTH) 656 if (sp >= KEYRING_SEARCH_MAX_DEPTH)
658 goto too_deep; 657 goto too_deep;
659 658
660 /* stack the current position */ 659 /* stack the current position */
661 stack[sp].keylist = keylist; 660 stack[sp].keylist = keylist;
662 stack[sp].kix = kix; 661 stack[sp].kix = kix;
663 sp++; 662 sp++;
664 663
665 /* begin again with the new keyring */ 664 /* begin again with the new keyring */
666 subtree = key; 665 subtree = key;
667 goto descend; 666 goto descend;
668 } 667 }
669 } 668 }
670 669
671 /* the keyring we're looking at was disqualified or didn't contain a 670 /* the keyring we're looking at was disqualified or didn't contain a
672 * matching key */ 671 * matching key */
673 not_this_keyring: 672 not_this_keyring:
674 if (sp > 0) { 673 if (sp > 0) {
675 /* resume the checking of a keyring higher up in the tree */ 674 /* resume the checking of a keyring higher up in the tree */
676 sp--; 675 sp--;
677 keylist = stack[sp].keylist; 676 keylist = stack[sp].keylist;
678 kix = stack[sp].kix + 1; 677 kix = stack[sp].kix + 1;
679 goto ascend; 678 goto ascend;
680 } 679 }
681 680
682 ret = 0; /* no cycles detected */ 681 ret = 0; /* no cycles detected */
683 682
684 error: 683 error:
685 rcu_read_unlock(); 684 rcu_read_unlock();
686 return ret; 685 return ret;
687 686
688 too_deep: 687 too_deep:
689 ret = -ELOOP; 688 ret = -ELOOP;
690 goto error; 689 goto error;
691 690
692 cycle_detected: 691 cycle_detected:
693 ret = -EDEADLK; 692 ret = -EDEADLK;
694 goto error; 693 goto error;
695 } 694 }
696 695
697 /* 696 /*
698 * Dispose of a keyring list after the RCU grace period, freeing the unlinked 697 * Dispose of a keyring list after the RCU grace period, freeing the unlinked
699 * key 698 * key
700 */ 699 */
701 static void keyring_unlink_rcu_disposal(struct rcu_head *rcu) 700 static void keyring_unlink_rcu_disposal(struct rcu_head *rcu)
702 { 701 {
703 struct keyring_list *klist = 702 struct keyring_list *klist =
704 container_of(rcu, struct keyring_list, rcu); 703 container_of(rcu, struct keyring_list, rcu);
705 704
706 if (klist->delkey != USHRT_MAX) 705 if (klist->delkey != USHRT_MAX)
707 key_put(klist->keys[klist->delkey]); 706 key_put(klist->keys[klist->delkey]);
708 kfree(klist); 707 kfree(klist);
709 } 708 }
710 709
711 /* 710 /*
712 * Preallocate memory so that a key can be linked into a keyring. 711 * Preallocate memory so that a key can be linked into a keyring.
713 */ 712 */
714 int __key_link_begin(struct key *keyring, const struct key_type *type, 713 int __key_link_begin(struct key *keyring, const struct key_type *type,
715 const char *description, unsigned long *_prealloc) 714 const char *description, unsigned long *_prealloc)
716 __acquires(&keyring->sem) 715 __acquires(&keyring->sem)
717 { 716 {
718 struct keyring_list *klist, *nklist; 717 struct keyring_list *klist, *nklist;
719 unsigned long prealloc; 718 unsigned long prealloc;
720 unsigned max; 719 unsigned max;
721 size_t size; 720 size_t size;
722 int loop, ret; 721 int loop, ret;
723 722
724 kenter("%d,%s,%s,", key_serial(keyring), type->name, description); 723 kenter("%d,%s,%s,", key_serial(keyring), type->name, description);
725 724
726 if (keyring->type != &key_type_keyring) 725 if (keyring->type != &key_type_keyring)
727 return -ENOTDIR; 726 return -ENOTDIR;
728 727
729 down_write(&keyring->sem); 728 down_write(&keyring->sem);
730 729
731 ret = -EKEYREVOKED; 730 ret = -EKEYREVOKED;
732 if (test_bit(KEY_FLAG_REVOKED, &keyring->flags)) 731 if (test_bit(KEY_FLAG_REVOKED, &keyring->flags))
733 goto error_krsem; 732 goto error_krsem;
734 733
735 /* serialise link/link calls to prevent parallel calls causing a cycle 734 /* serialise link/link calls to prevent parallel calls causing a cycle
736 * when linking two keyrings in opposite orders */ 735 * when linking two keyrings in opposite orders */
737 if (type == &key_type_keyring) 736 if (type == &key_type_keyring)
738 down_write(&keyring_serialise_link_sem); 737 down_write(&keyring_serialise_link_sem);
739 738
740 klist = rcu_dereference_locked_keyring(keyring); 739 klist = rcu_dereference_locked_keyring(keyring);
741 740
742 /* see if there's a matching key we can displace */ 741 /* see if there's a matching key we can displace */
743 if (klist && klist->nkeys > 0) { 742 if (klist && klist->nkeys > 0) {
744 for (loop = klist->nkeys - 1; loop >= 0; loop--) { 743 for (loop = klist->nkeys - 1; loop >= 0; loop--) {
745 if (klist->keys[loop]->type == type && 744 if (klist->keys[loop]->type == type &&
746 strcmp(klist->keys[loop]->description, 745 strcmp(klist->keys[loop]->description,
747 description) == 0 746 description) == 0
748 ) { 747 ) {
749 /* found a match - we'll replace this one with 748 /* found a match - we'll replace this one with
750 * the new key */ 749 * the new key */
751 size = sizeof(struct key *) * klist->maxkeys; 750 size = sizeof(struct key *) * klist->maxkeys;
752 size += sizeof(*klist); 751 size += sizeof(*klist);
753 BUG_ON(size > PAGE_SIZE); 752 BUG_ON(size > PAGE_SIZE);
754 753
755 ret = -ENOMEM; 754 ret = -ENOMEM;
756 nklist = kmemdup(klist, size, GFP_KERNEL); 755 nklist = kmemdup(klist, size, GFP_KERNEL);
757 if (!nklist) 756 if (!nklist)
758 goto error_sem; 757 goto error_sem;
759 758
760 /* note replacement slot */ 759 /* note replacement slot */
761 klist->delkey = nklist->delkey = loop; 760 klist->delkey = nklist->delkey = loop;
762 prealloc = (unsigned long)nklist; 761 prealloc = (unsigned long)nklist;
763 goto done; 762 goto done;
764 } 763 }
765 } 764 }
766 } 765 }
767 766
768 /* check that we aren't going to overrun the user's quota */ 767 /* check that we aren't going to overrun the user's quota */
769 ret = key_payload_reserve(keyring, 768 ret = key_payload_reserve(keyring,
770 keyring->datalen + KEYQUOTA_LINK_BYTES); 769 keyring->datalen + KEYQUOTA_LINK_BYTES);
771 if (ret < 0) 770 if (ret < 0)
772 goto error_sem; 771 goto error_sem;
773 772
774 if (klist && klist->nkeys < klist->maxkeys) { 773 if (klist && klist->nkeys < klist->maxkeys) {
775 /* there's sufficient slack space to append directly */ 774 /* there's sufficient slack space to append directly */
776 nklist = NULL; 775 nklist = NULL;
777 prealloc = KEY_LINK_FIXQUOTA; 776 prealloc = KEY_LINK_FIXQUOTA;
778 } else { 777 } else {
779 /* grow the key list */ 778 /* grow the key list */
780 max = 4; 779 max = 4;
781 if (klist) 780 if (klist)
782 max += klist->maxkeys; 781 max += klist->maxkeys;
783 782
784 ret = -ENFILE; 783 ret = -ENFILE;
785 if (max > USHRT_MAX - 1) 784 if (max > USHRT_MAX - 1)
786 goto error_quota; 785 goto error_quota;
787 size = sizeof(*klist) + sizeof(struct key *) * max; 786 size = sizeof(*klist) + sizeof(struct key *) * max;
788 if (size > PAGE_SIZE) 787 if (size > PAGE_SIZE)
789 goto error_quota; 788 goto error_quota;
790 789
791 ret = -ENOMEM; 790 ret = -ENOMEM;
792 nklist = kmalloc(size, GFP_KERNEL); 791 nklist = kmalloc(size, GFP_KERNEL);
793 if (!nklist) 792 if (!nklist)
794 goto error_quota; 793 goto error_quota;
795 794
796 nklist->maxkeys = max; 795 nklist->maxkeys = max;
797 if (klist) { 796 if (klist) {
798 memcpy(nklist->keys, klist->keys, 797 memcpy(nklist->keys, klist->keys,
799 sizeof(struct key *) * klist->nkeys); 798 sizeof(struct key *) * klist->nkeys);
800 nklist->delkey = klist->nkeys; 799 nklist->delkey = klist->nkeys;
801 nklist->nkeys = klist->nkeys + 1; 800 nklist->nkeys = klist->nkeys + 1;
802 klist->delkey = USHRT_MAX; 801 klist->delkey = USHRT_MAX;
803 } else { 802 } else {
804 nklist->nkeys = 1; 803 nklist->nkeys = 1;
805 nklist->delkey = 0; 804 nklist->delkey = 0;
806 } 805 }
807 806
808 /* add the key into the new space */ 807 /* add the key into the new space */
809 nklist->keys[nklist->delkey] = NULL; 808 nklist->keys[nklist->delkey] = NULL;
810 } 809 }
811 810
812 prealloc = (unsigned long)nklist | KEY_LINK_FIXQUOTA; 811 prealloc = (unsigned long)nklist | KEY_LINK_FIXQUOTA;
813 done: 812 done:
814 *_prealloc = prealloc; 813 *_prealloc = prealloc;
815 kleave(" = 0"); 814 kleave(" = 0");
816 return 0; 815 return 0;
817 816
818 error_quota: 817 error_quota:
819 /* undo the quota changes */ 818 /* undo the quota changes */
820 key_payload_reserve(keyring, 819 key_payload_reserve(keyring,
821 keyring->datalen - KEYQUOTA_LINK_BYTES); 820 keyring->datalen - KEYQUOTA_LINK_BYTES);
822 error_sem: 821 error_sem:
823 if (type == &key_type_keyring) 822 if (type == &key_type_keyring)
824 up_write(&keyring_serialise_link_sem); 823 up_write(&keyring_serialise_link_sem);
825 error_krsem: 824 error_krsem:
826 up_write(&keyring->sem); 825 up_write(&keyring->sem);
827 kleave(" = %d", ret); 826 kleave(" = %d", ret);
828 return ret; 827 return ret;
829 } 828 }
830 829
831 /* 830 /*
832 * Check already instantiated keys aren't going to be a problem. 831 * Check already instantiated keys aren't going to be a problem.
833 * 832 *
834 * The caller must have called __key_link_begin(). Don't need to call this for 833 * The caller must have called __key_link_begin(). Don't need to call this for
835 * keys that were created since __key_link_begin() was called. 834 * keys that were created since __key_link_begin() was called.
836 */ 835 */
837 int __key_link_check_live_key(struct key *keyring, struct key *key) 836 int __key_link_check_live_key(struct key *keyring, struct key *key)
838 { 837 {
839 if (key->type == &key_type_keyring) 838 if (key->type == &key_type_keyring)
840 /* check that we aren't going to create a cycle by linking one 839 /* check that we aren't going to create a cycle by linking one
841 * keyring to another */ 840 * keyring to another */
842 return keyring_detect_cycle(keyring, key); 841 return keyring_detect_cycle(keyring, key);
843 return 0; 842 return 0;
844 } 843 }
845 844
846 /* 845 /*
847 * Link a key into a keyring. 846 * Link a key into a keyring.
848 * 847 *
849 * Must be called with __key_link_begin() having been called. Discards any 848 * Must be called with __key_link_begin() having been called. Discards any
850 * already extant link to a matching key if there is one, so that each keyring 849 * already extant link to a matching key if there is one, so that each keyring
851 * holds at most one link to any given key of a particular type+description 850 * holds at most one link to any given key of a particular type+description
852 * combination. 851 * combination.
853 */ 852 */
854 void __key_link(struct key *keyring, struct key *key, 853 void __key_link(struct key *keyring, struct key *key,
855 unsigned long *_prealloc) 854 unsigned long *_prealloc)
856 { 855 {
857 struct keyring_list *klist, *nklist; 856 struct keyring_list *klist, *nklist;
858 857
859 nklist = (struct keyring_list *)(*_prealloc & ~KEY_LINK_FIXQUOTA); 858 nklist = (struct keyring_list *)(*_prealloc & ~KEY_LINK_FIXQUOTA);
860 *_prealloc = 0; 859 *_prealloc = 0;
861 860
862 kenter("%d,%d,%p", keyring->serial, key->serial, nklist); 861 kenter("%d,%d,%p", keyring->serial, key->serial, nklist);
863 862
864 klist = rcu_dereference_protected(keyring->payload.subscriptions, 863 klist = rcu_dereference_protected(keyring->payload.subscriptions,
865 rwsem_is_locked(&keyring->sem)); 864 rwsem_is_locked(&keyring->sem));
866 865
867 atomic_inc(&key->usage); 866 atomic_inc(&key->usage);
868 867
869 /* there's a matching key we can displace or an empty slot in a newly 868 /* there's a matching key we can displace or an empty slot in a newly
870 * allocated list we can fill */ 869 * allocated list we can fill */
871 if (nklist) { 870 if (nklist) {
872 kdebug("replace %hu/%hu/%hu", 871 kdebug("replace %hu/%hu/%hu",
873 nklist->delkey, nklist->nkeys, nklist->maxkeys); 872 nklist->delkey, nklist->nkeys, nklist->maxkeys);
874 873
875 nklist->keys[nklist->delkey] = key; 874 nklist->keys[nklist->delkey] = key;
876 875
877 rcu_assign_pointer(keyring->payload.subscriptions, nklist); 876 rcu_assign_pointer(keyring->payload.subscriptions, nklist);
878 877
879 /* dispose of the old keyring list and, if there was one, the 878 /* dispose of the old keyring list and, if there was one, the
880 * displaced key */ 879 * displaced key */
881 if (klist) { 880 if (klist) {
882 kdebug("dispose %hu/%hu/%hu", 881 kdebug("dispose %hu/%hu/%hu",
883 klist->delkey, klist->nkeys, klist->maxkeys); 882 klist->delkey, klist->nkeys, klist->maxkeys);
884 call_rcu(&klist->rcu, keyring_unlink_rcu_disposal); 883 call_rcu(&klist->rcu, keyring_unlink_rcu_disposal);
885 } 884 }
886 } else { 885 } else {
887 /* there's sufficient slack space to append directly */ 886 /* there's sufficient slack space to append directly */
888 klist->keys[klist->nkeys] = key; 887 klist->keys[klist->nkeys] = key;
889 smp_wmb(); 888 smp_wmb();
890 klist->nkeys++; 889 klist->nkeys++;
891 } 890 }
892 } 891 }
893 892
894 /* 893 /*
895 * Finish linking a key into a keyring. 894 * Finish linking a key into a keyring.
896 * 895 *
897 * Must be called with __key_link_begin() having been called. 896 * Must be called with __key_link_begin() having been called.
898 */ 897 */
899 void __key_link_end(struct key *keyring, struct key_type *type, 898 void __key_link_end(struct key *keyring, struct key_type *type,
900 unsigned long prealloc) 899 unsigned long prealloc)
901 __releases(&keyring->sem) 900 __releases(&keyring->sem)
902 { 901 {
903 BUG_ON(type == NULL); 902 BUG_ON(type == NULL);
904 BUG_ON(type->name == NULL); 903 BUG_ON(type->name == NULL);
905 kenter("%d,%s,%lx", keyring->serial, type->name, prealloc); 904 kenter("%d,%s,%lx", keyring->serial, type->name, prealloc);
906 905
907 if (type == &key_type_keyring) 906 if (type == &key_type_keyring)
908 up_write(&keyring_serialise_link_sem); 907 up_write(&keyring_serialise_link_sem);
909 908
910 if (prealloc) { 909 if (prealloc) {
911 if (prealloc & KEY_LINK_FIXQUOTA) 910 if (prealloc & KEY_LINK_FIXQUOTA)
912 key_payload_reserve(keyring, 911 key_payload_reserve(keyring,
913 keyring->datalen - 912 keyring->datalen -
914 KEYQUOTA_LINK_BYTES); 913 KEYQUOTA_LINK_BYTES);
915 kfree((struct keyring_list *)(prealloc & ~KEY_LINK_FIXQUOTA)); 914 kfree((struct keyring_list *)(prealloc & ~KEY_LINK_FIXQUOTA));
916 } 915 }
917 up_write(&keyring->sem); 916 up_write(&keyring->sem);
918 } 917 }
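
Taken together, __key_link_begin(), __key_link_check_live_key(), __key_link() and __key_link_end() form a begin/check/link/end protocol. The sketch below only illustrates the expected calling order from inside the keys subsystem and mirrors what key_link() immediately below does; the helper name example_link is an assumption.

/* Illustrative sketch of the link protocol; key_link() below is the
 * canonical user of these helpers. */
static int example_link(struct key *keyring, struct key *key)
{
	unsigned long prealloc;
	int ret;

	ret = __key_link_begin(keyring, key->type, key->description, &prealloc);
	if (ret < 0)
		return ret;		/* revoked keyring, quota or OOM */

	ret = __key_link_check_live_key(keyring, key);
	if (ret == 0)
		__key_link(keyring, key, &prealloc);

	/* always called, whether or not the link was made; unlocks and
	 * discards any unused preallocation */
	__key_link_end(keyring, key->type, prealloc);
	return ret;
}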
919 918
920 /** 919 /**
921 * key_link - Link a key to a keyring 920 * key_link - Link a key to a keyring
922 * @keyring: The keyring to make the link in. 921 * @keyring: The keyring to make the link in.
923 * @key: The key to link to. 922 * @key: The key to link to.
924 * 923 *
925 * Make a link in a keyring to a key, such that the keyring holds a reference 924 * Make a link in a keyring to a key, such that the keyring holds a reference
926 * on that key and the key can potentially be found by searching that keyring. 925 * on that key and the key can potentially be found by searching that keyring.
927 * 926 *
928 * This function will write-lock the keyring's semaphore and will consume some 927 * This function will write-lock the keyring's semaphore and will consume some
929 * of the user's key data quota to hold the link. 928 * of the user's key data quota to hold the link.
930 * 929 *
931 * Returns 0 if successful, -ENOTDIR if the keyring isn't a keyring, 930 * Returns 0 if successful, -ENOTDIR if the keyring isn't a keyring,
932 * -EKEYREVOKED if the keyring has been revoked, -ENFILE if the keyring is 931 * -EKEYREVOKED if the keyring has been revoked, -ENFILE if the keyring is
933 * full, -EDQUOT if there is insufficient key data quota remaining to add 932 * full, -EDQUOT if there is insufficient key data quota remaining to add
934 * another link or -ENOMEM if there's insufficient memory. 933 * another link or -ENOMEM if there's insufficient memory.
935 * 934 *
936 * It is assumed that the caller has checked that it is permitted for a link to 935 * It is assumed that the caller has checked that it is permitted for a link to
937 * be made (the keyring should have Write permission and the key Link 936 * be made (the keyring should have Write permission and the key Link
938 * permission). 937 * permission).
939 */ 938 */
940 int key_link(struct key *keyring, struct key *key) 939 int key_link(struct key *keyring, struct key *key)
941 { 940 {
942 unsigned long prealloc; 941 unsigned long prealloc;
943 int ret; 942 int ret;
944 943
945 key_check(keyring); 944 key_check(keyring);
946 key_check(key); 945 key_check(key);
947 946
948 ret = __key_link_begin(keyring, key->type, key->description, &prealloc); 947 ret = __key_link_begin(keyring, key->type, key->description, &prealloc);
949 if (ret == 0) { 948 if (ret == 0) {
950 ret = __key_link_check_live_key(keyring, key); 949 ret = __key_link_check_live_key(keyring, key);
951 if (ret == 0) 950 if (ret == 0)
952 __key_link(keyring, key, &prealloc); 951 __key_link(keyring, key, &prealloc);
953 __key_link_end(keyring, key->type, prealloc); 952 __key_link_end(keyring, key->type, prealloc);
954 } 953 }
955 954
956 return ret; 955 return ret;
957 } 956 }
958 EXPORT_SYMBOL(key_link); 957 EXPORT_SYMBOL(key_link);
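
As a usage note, a hedged sketch of how a caller holding references to a destination keyring (with Write permission) and a key (with Link permission) might invoke key_link(); the helper name and the pr_warn() reporting are illustrative assumptions.

/* Illustrative sketch only. */
#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/key.h>

static int example_add_to_keyring(struct key *dest_keyring, struct key *key)
{
	int ret = key_link(dest_keyring, key);

	if (ret == -EDEADLK || ret == -ELOOP)
		pr_warn("linking would nest keyrings illegally: %d\n", ret);
	else if (ret < 0)
		pr_warn("key_link failed: %d\n", ret);
	return ret;
}

The -EDEADLK and -ELOOP cases come from keyring_detect_cycle() above when the key being linked is itself a keyring.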
959 958
960 /** 959 /**
961 * key_unlink - Unlink the first link to a key from a keyring. 960 * key_unlink - Unlink the first link to a key from a keyring.
962 * @keyring: The keyring to remove the link from. 961 * @keyring: The keyring to remove the link from.
963 * @key: The key the link is to. 962 * @key: The key the link is to.
964 * 963 *
965 * Remove a link from a keyring to a key. 964 * Remove a link from a keyring to a key.
966 * 965 *
967 * This function will write-lock the keyring's semaphore. 966 * This function will write-lock the keyring's semaphore.
968 * 967 *
969 * Returns 0 if successful, -ENOTDIR if the keyring isn't a keyring, -ENOENT if 968 * Returns 0 if successful, -ENOTDIR if the keyring isn't a keyring, -ENOENT if
970 * the key isn't linked to by the keyring or -ENOMEM if there's insufficient 969 * the key isn't linked to by the keyring or -ENOMEM if there's insufficient
971 * memory. 970 * memory.
972 * 971 *
973 * It is assumed that the caller has checked that it is permitted for a link to 972 * It is assumed that the caller has checked that it is permitted for a link to
974 * be removed (the keyring should have Write permission; no permissions are 973 * be removed (the keyring should have Write permission; no permissions are
975 * required on the key). 974 * required on the key).
976 */ 975 */
977 int key_unlink(struct key *keyring, struct key *key) 976 int key_unlink(struct key *keyring, struct key *key)
978 { 977 {
979 struct keyring_list *klist, *nklist; 978 struct keyring_list *klist, *nklist;
980 int loop, ret; 979 int loop, ret;
981 980
982 key_check(keyring); 981 key_check(keyring);
983 key_check(key); 982 key_check(key);
984 983
985 ret = -ENOTDIR; 984 ret = -ENOTDIR;
986 if (keyring->type != &key_type_keyring) 985 if (keyring->type != &key_type_keyring)
987 goto error; 986 goto error;
988 987
989 down_write(&keyring->sem); 988 down_write(&keyring->sem);
990 989
991 klist = rcu_dereference_locked_keyring(keyring); 990 klist = rcu_dereference_locked_keyring(keyring);
992 if (klist) { 991 if (klist) {
993 /* search the keyring for the key */ 992 /* search the keyring for the key */
994 for (loop = 0; loop < klist->nkeys; loop++) 993 for (loop = 0; loop < klist->nkeys; loop++)
995 if (klist->keys[loop] == key) 994 if (klist->keys[loop] == key)
996 goto key_is_present; 995 goto key_is_present;
997 } 996 }
998 997
999 up_write(&keyring->sem); 998 up_write(&keyring->sem);
1000 ret = -ENOENT; 999 ret = -ENOENT;
1001 goto error; 1000 goto error;
1002 1001
1003 key_is_present: 1002 key_is_present:
1004 /* we need to copy the key list for RCU purposes */ 1003 /* we need to copy the key list for RCU purposes */
1005 nklist = kmalloc(sizeof(*klist) + 1004 nklist = kmalloc(sizeof(*klist) +
1006 sizeof(struct key *) * klist->maxkeys, 1005 sizeof(struct key *) * klist->maxkeys,
1007 GFP_KERNEL); 1006 GFP_KERNEL);
1008 if (!nklist) 1007 if (!nklist)
1009 goto nomem; 1008 goto nomem;
1010 nklist->maxkeys = klist->maxkeys; 1009 nklist->maxkeys = klist->maxkeys;
1011 nklist->nkeys = klist->nkeys - 1; 1010 nklist->nkeys = klist->nkeys - 1;
1012 1011
1013 if (loop > 0) 1012 if (loop > 0)
1014 memcpy(&nklist->keys[0], 1013 memcpy(&nklist->keys[0],
1015 &klist->keys[0], 1014 &klist->keys[0],
1016 loop * sizeof(struct key *)); 1015 loop * sizeof(struct key *));
1017 1016
1018 if (loop < nklist->nkeys) 1017 if (loop < nklist->nkeys)
1019 memcpy(&nklist->keys[loop], 1018 memcpy(&nklist->keys[loop],
1020 &klist->keys[loop + 1], 1019 &klist->keys[loop + 1],
1021 (nklist->nkeys - loop) * sizeof(struct key *)); 1020 (nklist->nkeys - loop) * sizeof(struct key *));
1022 1021
1023 /* adjust the user's quota */ 1022 /* adjust the user's quota */
1024 key_payload_reserve(keyring, 1023 key_payload_reserve(keyring,
1025 keyring->datalen - KEYQUOTA_LINK_BYTES); 1024 keyring->datalen - KEYQUOTA_LINK_BYTES);
1026 1025
1027 rcu_assign_pointer(keyring->payload.subscriptions, nklist); 1026 rcu_assign_pointer(keyring->payload.subscriptions, nklist);
1028 1027
1029 up_write(&keyring->sem); 1028 up_write(&keyring->sem);
1030 1029
1031 /* schedule for later cleanup */ 1030 /* schedule for later cleanup */
1032 klist->delkey = loop; 1031 klist->delkey = loop;
1033 call_rcu(&klist->rcu, keyring_unlink_rcu_disposal); 1032 call_rcu(&klist->rcu, keyring_unlink_rcu_disposal);
1034 1033
1035 ret = 0; 1034 ret = 0;
1036 1035
1037 error: 1036 error:
1038 return ret; 1037 return ret;
1039 nomem: 1038 nomem:
1040 ret = -ENOMEM; 1039 ret = -ENOMEM;
1041 up_write(&keyring->sem); 1040 up_write(&keyring->sem);
1042 goto error; 1041 goto error;
1043 } 1042 }
1044 EXPORT_SYMBOL(key_unlink); 1043 EXPORT_SYMBOL(key_unlink);
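
Similarly, a minimal, assumption-laden sketch of the unlink side; only the first link to the key is removed, and -ENOENT simply means there was no such link to begin with.

/* Illustrative sketch only. */
#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/key.h>

static void example_remove_from_keyring(struct key *keyring, struct key *key)
{
	int ret = key_unlink(keyring, key);

	if (ret && ret != -ENOENT)
		pr_warn("key_unlink failed: %d\n", ret);
}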
1045 1044
1046 /* 1045 /*
1047 * Dispose of a keyring list after the RCU grace period, releasing the keys it 1046 * Dispose of a keyring list after the RCU grace period, releasing the keys it
1048 * links to. 1047 * links to.
1049 */ 1048 */
1050 static void keyring_clear_rcu_disposal(struct rcu_head *rcu) 1049 static void keyring_clear_rcu_disposal(struct rcu_head *rcu)
1051 { 1050 {
1052 struct keyring_list *klist; 1051 struct keyring_list *klist;
1053 int loop; 1052 int loop;
1054 1053
1055 klist = container_of(rcu, struct keyring_list, rcu); 1054 klist = container_of(rcu, struct keyring_list, rcu);
1056 1055
1057 for (loop = klist->nkeys - 1; loop >= 0; loop--) 1056 for (loop = klist->nkeys - 1; loop >= 0; loop--)
1058 key_put(klist->keys[loop]); 1057 key_put(klist->keys[loop]);
1059 1058
1060 kfree(klist); 1059 kfree(klist);
1061 } 1060 }
1062 1061
1063 /** 1062 /**
1064 * keyring_clear - Clear a keyring 1063 * keyring_clear - Clear a keyring
1065 * @keyring: The keyring to clear. 1064 * @keyring: The keyring to clear.
1066 * 1065 *
1067 * Clear the contents of the specified keyring. 1066 * Clear the contents of the specified keyring.
1068 * 1067 *
1069 * Returns 0 if successful or -ENOTDIR if the keyring isn't a keyring. 1068 * Returns 0 if successful or -ENOTDIR if the keyring isn't a keyring.
1070 */ 1069 */
1071 int keyring_clear(struct key *keyring) 1070 int keyring_clear(struct key *keyring)
1072 { 1071 {
1073 struct keyring_list *klist; 1072 struct keyring_list *klist;
1074 int ret; 1073 int ret;
1075 1074
1076 ret = -ENOTDIR; 1075 ret = -ENOTDIR;
1077 if (keyring->type == &key_type_keyring) { 1076 if (keyring->type == &key_type_keyring) {
1078 /* detach the pointer block with the locks held */ 1077 /* detach the pointer block with the locks held */
1079 down_write(&keyring->sem); 1078 down_write(&keyring->sem);
1080 1079
1081 klist = rcu_dereference_locked_keyring(keyring); 1080 klist = rcu_dereference_locked_keyring(keyring);
1082 if (klist) { 1081 if (klist) {
1083 /* adjust the quota */ 1082 /* adjust the quota */
1084 key_payload_reserve(keyring, 1083 key_payload_reserve(keyring,
1085 sizeof(struct keyring_list)); 1084 sizeof(struct keyring_list));
1086 1085
1087 rcu_assign_pointer(keyring->payload.subscriptions, 1086 rcu_assign_pointer(keyring->payload.subscriptions,
1088 NULL); 1087 NULL);
1089 } 1088 }
1090 1089
1091 up_write(&keyring->sem); 1090 up_write(&keyring->sem);
1092 1091
1093 /* free the keys after the locks have been dropped */ 1092 /* free the keys after the locks have been dropped */
1094 if (klist) 1093 if (klist)
1095 call_rcu(&klist->rcu, keyring_clear_rcu_disposal); 1094 call_rcu(&klist->rcu, keyring_clear_rcu_disposal);
1096 1095
1097 ret = 0; 1096 ret = 0;
1098 } 1097 }
1099 1098
1100 return ret; 1099 return ret;
1101 } 1100 }
1102 EXPORT_SYMBOL(keyring_clear); 1101 EXPORT_SYMBOL(keyring_clear);
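
Finally, a hedged sketch of clearing a keyring; as the code above shows, the detached key list (and the keys it pinned) is only released after the RCU grace period via keyring_clear_rcu_disposal(). The helper name is an assumption.

/* Illustrative sketch only. */
#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/key.h>

static int example_empty_keyring(struct key *keyring)
{
	int ret = keyring_clear(keyring);

	if (ret == -ENOTDIR)
		pr_warn("key %d is not a keyring\n", key_serial(keyring));
	return ret;
}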
1103 1102
1104 /* 1103 /*
1105 * Dispose of the links from a revoked keyring. 1104 * Dispose of the links from a revoked keyring.
1106 * 1105 *
1107 * This is called with the key sem write-locked. 1106 * This is called with the key sem write-locked.
1108 */ 1107 */
1109 static void keyring_revoke(struct key *keyring) 1108 static void keyring_revoke(struct key *keyring)
1110 { 1109 {
1111 struct keyring_list *klist; 1110 struct keyring_list *klist;
1112 1111
1113 klist = rcu_dereference_locked_keyring(keyring); 1112 klist = rcu_dereference_locked_keyring(keyring);
1114 1113
1115 /* adjust the quota */ 1114 /* adjust the quota */
1116 key_payload_reserve(keyring, 0); 1115 key_payload_reserve(keyring, 0);
1117 1116
1118 if (klist) { 1117 if (klist) {
1119 rcu_assign_pointer(keyring->payload.subscriptions, NULL); 1118 rcu_assign_pointer(keyring->payload.subscriptions, NULL);
1120 call_rcu(&klist->rcu, keyring_clear_rcu_disposal); 1119 call_rcu(&klist->rcu, keyring_clear_rcu_disposal);
1121 } 1120 }
1122 } 1121 }
1123 1122
1124 /* 1123 /*
1125 * Determine whether a key is dead. 1124 * Determine whether a key is dead.
1126 */ 1125 */
1127 static bool key_is_dead(struct key *key, time_t limit) 1126 static bool key_is_dead(struct key *key, time_t limit)
1128 { 1127 {
1129 return test_bit(KEY_FLAG_DEAD, &key->flags) || 1128 return test_bit(KEY_FLAG_DEAD, &key->flags) ||
1130 (key->expiry > 0 && key->expiry <= limit); 1129 (key->expiry > 0 && key->expiry <= limit);
1131 } 1130 }
1132 1131
1133 /* 1132 /*
1134 * Collect garbage from the contents of a keyring, replacing the old list with 1133 * Collect garbage from the contents of a keyring, replacing the old list with
1135 * a new one with the pointers all shuffled down. 1134 * a new one with the pointers all shuffled down.
1136 * 1135 *
1137 * Dead keys are classed as ones that are flagged as being dead or are revoked, 1136 * Dead keys are classed as ones that are flagged as being dead or are revoked,
1138 * expired or negative keys that were revoked or expired before the specified 1137 * expired or negative keys that were revoked or expired before the specified
1139 * limit. 1138 * limit.
1140 */ 1139 */
1141 void keyring_gc(struct key *keyring, time_t limit) 1140 void keyring_gc(struct key *keyring, time_t limit)
1142 { 1141 {
1143 struct keyring_list *klist, *new; 1142 struct keyring_list *klist, *new;
1144 struct key *key; 1143 struct key *key;
1145 int loop, keep, max; 1144 int loop, keep, max;
1146 1145
1147 kenter("{%x,%s}", key_serial(keyring), keyring->description); 1146 kenter("{%x,%s}", key_serial(keyring), keyring->description);
1148 1147
1149 down_write(&keyring->sem); 1148 down_write(&keyring->sem);
1150 1149
1151 klist = rcu_dereference_locked_keyring(keyring); 1150 klist = rcu_dereference_locked_keyring(keyring);
1152 if (!klist) 1151 if (!klist)
1153 goto no_klist; 1152 goto no_klist;
1154 1153
1155 /* work out how many subscriptions we're keeping */ 1154 /* work out how many subscriptions we're keeping */
1156 keep = 0; 1155 keep = 0;
1157 for (loop = klist->nkeys - 1; loop >= 0; loop--) 1156 for (loop = klist->nkeys - 1; loop >= 0; loop--)
1158 if (!key_is_dead(klist->keys[loop], limit)) 1157 if (!key_is_dead(klist->keys[loop], limit))
1159 keep++; 1158 keep++;
1160 1159
1161 if (keep == klist->nkeys) 1160 if (keep == klist->nkeys)
1162 goto just_return; 1161 goto just_return;
1163 1162
1164 /* allocate a new keyring payload */ 1163 /* allocate a new keyring payload */
1165 max = roundup(keep, 4); 1164 max = roundup(keep, 4);
1166 new = kmalloc(sizeof(struct keyring_list) + max * sizeof(struct key *), 1165 new = kmalloc(sizeof(struct keyring_list) + max * sizeof(struct key *),
1167 GFP_KERNEL); 1166 GFP_KERNEL);
1168 if (!new) 1167 if (!new)
1169 goto nomem; 1168 goto nomem;
1170 new->maxkeys = max; 1169 new->maxkeys = max;
1171 new->nkeys = 0; 1170 new->nkeys = 0;
1172 new->delkey = 0; 1171 new->delkey = 0;
1173 1172
1174 /* install the live keys 1173 /* install the live keys
1175 * - must take care as expired keys may be updated back to life 1174 * - must take care as expired keys may be updated back to life
1176 */ 1175 */
1177 keep = 0; 1176 keep = 0;
1178 for (loop = klist->nkeys - 1; loop >= 0; loop--) { 1177 for (loop = klist->nkeys - 1; loop >= 0; loop--) {
1179 key = klist->keys[loop]; 1178 key = klist->keys[loop];
1180 if (!key_is_dead(key, limit)) { 1179 if (!key_is_dead(key, limit)) {
1181 if (keep >= max) 1180 if (keep >= max)
1182 goto discard_new; 1181 goto discard_new;
1183 new->keys[keep++] = key_get(key); 1182 new->keys[keep++] = key_get(key);
1184 } 1183 }
1185 } 1184 }
1186 new->nkeys = keep; 1185 new->nkeys = keep;
1187 1186
1188 /* adjust the quota */ 1187 /* adjust the quota */
1189 key_payload_reserve(keyring, 1188 key_payload_reserve(keyring,
1190 sizeof(struct keyring_list) + 1189 sizeof(struct keyring_list) +
1191 KEYQUOTA_LINK_BYTES * keep); 1190 KEYQUOTA_LINK_BYTES * keep);
1192 1191
1193 if (keep == 0) { 1192 if (keep == 0) {
1194 rcu_assign_pointer(keyring->payload.subscriptions, NULL); 1193 rcu_assign_pointer(keyring->payload.subscriptions, NULL);
1195 kfree(new); 1194 kfree(new);
1196 } else { 1195 } else {
1197 rcu_assign_pointer(keyring->payload.subscriptions, new); 1196 rcu_assign_pointer(keyring->payload.subscriptions, new);
1198 } 1197 }
1199 1198
1200 up_write(&keyring->sem); 1199 up_write(&keyring->sem);
1201 1200
1202 call_rcu(&klist->rcu, keyring_clear_rcu_disposal); 1201 call_rcu(&klist->rcu, keyring_clear_rcu_disposal);
1203 kleave(" [yes]"); 1202 kleave(" [yes]");
1204 return; 1203 return;
1205 1204
1206 discard_new: 1205 discard_new:
1207 new->nkeys = keep; 1206 new->nkeys = keep;
1208 keyring_clear_rcu_disposal(&new->rcu); 1207 keyring_clear_rcu_disposal(&new->rcu);
1209 up_write(&keyring->sem); 1208 up_write(&keyring->sem);
1210 kleave(" [discard]"); 1209 kleave(" [discard]");
1211 return; 1210 return;
1212 1211
1213 just_return: 1212 just_return:
1214 up_write(&keyring->sem); 1213 up_write(&keyring->sem);
1215 kleave(" [no dead]"); 1214 kleave(" [no dead]");
1216 return; 1215 return;
1217 1216
1218 no_klist: 1217 no_klist:
1219 up_write(&keyring->sem); 1218 up_write(&keyring->sem);
1220 kleave(" [no_klist]"); 1219 kleave(" [no_klist]");
1221 return; 1220 return;
1222 1221
1223 nomem: 1222 nomem:
1224 up_write(&keyring->sem); 1223 up_write(&keyring->sem);
1225 kleave(" [oom]"); 1224 kleave(" [oom]");
1226 } 1225 }
1227 1226