Commit d8bf4ca9ca9576548628344c9725edd3786e90b1
Committed by: Jiri Kosina
1 parent: eb032b9837
Exists in: master and in 6 other branches
rcu: treewide: Do not use rcu_read_lock_held when calling rcu_dereference_check
Since ca5ecddf (rcu: define __rcu address space modifier for sparse), rcu_dereference_check() uses rcu_read_lock_held() as part of its condition automatically, so callers do not have to supply it as well.

Signed-off-by: Michal Hocko <mhocko@suse.cz>
Acked-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Signed-off-by: Jiri Kosina <jkosina@suse.cz>
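To make the pattern behind this treewide cleanup concrete, here is a minimal caller-side sketch. It is not taken from the patch; struct example, example_ptr, and example_lock are hypothetical names. The point it illustrates: because rcu_dereference_check() already folds rcu_read_lock_held() into its lockdep condition, a caller only passes whatever extra conditions apply to its own locking.

#include <linux/rcupdate.h>
#include <linux/lockdep.h>
#include <linux/spinlock.h>

struct example {				/* hypothetical RCU-protected object */
	int value;
};

static struct example __rcu *example_ptr;	/* hypothetical RCU pointer */
static DEFINE_SPINLOCK(example_lock);		/* hypothetical update-side lock */

static int example_read_value(void)
{
	struct example *e;
	int val = 0;

	rcu_read_lock();
	/*
	 * Before ca5ecddf, callers typically spelled the RCU condition out:
	 *	rcu_dereference_check(example_ptr,
	 *			      rcu_read_lock_held() ||
	 *			      lockdep_is_held(&example_lock));
	 * Now rcu_read_lock_held() is checked inside the macro itself,
	 * so only the extra lockdep condition remains.
	 */
	e = rcu_dereference_check(example_ptr,
				  lockdep_is_held(&example_lock));
	if (e)
		val = e->value;
	rcu_read_unlock();

	return val;
}

The hunks below apply exactly this simplification to the real call sites.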
Showing 14 changed files with 6 additions and 27 deletions
- include/linux/cgroup.h
- include/linux/cred.h
- include/linux/fdtable.h
- include/linux/rtnetlink.h
- include/net/sock.h
- kernel/cgroup.c
- kernel/exit.c
- kernel/pid.c
- kernel/rcutorture.c
- kernel/sched.c
- net/mac80211/sta_info.c
- net/netlabel/netlabel_domainhash.c
- net/netlabel/netlabel_unlabeled.c
- security/keys/keyring.c
include/linux/cgroup.h
@@ -535,10 +535,9 @@
 /*
  * function to get the cgroup_subsys_state which allows for extra
  * rcu_dereference_check() conditions, such as locks used during the
  * cgroup_subsys::attach() methods.
  */
 #define task_subsys_state_check(task, subsys_id, __c)			\
 	rcu_dereference_check(task->cgroups->subsys[subsys_id],	\
-			      rcu_read_lock_held() ||			\
 			      lockdep_is_held(&task->alloc_lock) ||	\
 			      cgroup_lock_is_held() || (__c))
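A hedged usage sketch of why this removal is safe for plain RCU readers (assumed, not part of the commit; example_task_in_cgroup() is a made-up helper): a caller holding only rcu_read_lock() still satisfies task_subsys_state_check(), because rcu_dereference_check() now checks rcu_read_lock_held() internally.

#include <linux/cgroup.h>
#include <linux/rcupdate.h>
#include <linux/sched.h>

/* Returns true if @task currently belongs to @cgrp for @subsys_id. */
static bool example_task_in_cgroup(struct task_struct *task,
				   struct cgroup *cgrp, int subsys_id)
{
	bool ret;

	rcu_read_lock();
	/*
	 * task_cgroup() -> task_subsys_state() -> task_subsys_state_check();
	 * the RCU read lock alone keeps lockdep quiet after this change.
	 */
	ret = task_cgroup(task, subsys_id) == cgrp;
	rcu_read_unlock();

	return ret;
}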
include/linux/cred.h
1 | /* Credentials management - see Documentation/security/credentials.txt | 1 | /* Credentials management - see Documentation/security/credentials.txt |
2 | * | 2 | * |
3 | * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved. | 3 | * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved. |
4 | * Written by David Howells (dhowells@redhat.com) | 4 | * Written by David Howells (dhowells@redhat.com) |
5 | * | 5 | * |
6 | * This program is free software; you can redistribute it and/or | 6 | * This program is free software; you can redistribute it and/or |
7 | * modify it under the terms of the GNU General Public Licence | 7 | * modify it under the terms of the GNU General Public Licence |
8 | * as published by the Free Software Foundation; either version | 8 | * as published by the Free Software Foundation; either version |
9 | * 2 of the Licence, or (at your option) any later version. | 9 | * 2 of the Licence, or (at your option) any later version. |
10 | */ | 10 | */ |
11 | 11 | ||
12 | #ifndef _LINUX_CRED_H | 12 | #ifndef _LINUX_CRED_H |
13 | #define _LINUX_CRED_H | 13 | #define _LINUX_CRED_H |
14 | 14 | ||
15 | #include <linux/capability.h> | 15 | #include <linux/capability.h> |
16 | #include <linux/init.h> | 16 | #include <linux/init.h> |
17 | #include <linux/key.h> | 17 | #include <linux/key.h> |
18 | #include <linux/selinux.h> | 18 | #include <linux/selinux.h> |
19 | #include <asm/atomic.h> | 19 | #include <asm/atomic.h> |
20 | 20 | ||
21 | struct user_struct; | 21 | struct user_struct; |
22 | struct cred; | 22 | struct cred; |
23 | struct inode; | 23 | struct inode; |
24 | 24 | ||
25 | /* | 25 | /* |
26 | * COW Supplementary groups list | 26 | * COW Supplementary groups list |
27 | */ | 27 | */ |
28 | #define NGROUPS_SMALL 32 | 28 | #define NGROUPS_SMALL 32 |
29 | #define NGROUPS_PER_BLOCK ((unsigned int)(PAGE_SIZE / sizeof(gid_t))) | 29 | #define NGROUPS_PER_BLOCK ((unsigned int)(PAGE_SIZE / sizeof(gid_t))) |
30 | 30 | ||
31 | struct group_info { | 31 | struct group_info { |
32 | atomic_t usage; | 32 | atomic_t usage; |
33 | int ngroups; | 33 | int ngroups; |
34 | int nblocks; | 34 | int nblocks; |
35 | gid_t small_block[NGROUPS_SMALL]; | 35 | gid_t small_block[NGROUPS_SMALL]; |
36 | gid_t *blocks[0]; | 36 | gid_t *blocks[0]; |
37 | }; | 37 | }; |
38 | 38 | ||
39 | /** | 39 | /** |
40 | * get_group_info - Get a reference to a group info structure | 40 | * get_group_info - Get a reference to a group info structure |
41 | * @group_info: The group info to reference | 41 | * @group_info: The group info to reference |
42 | * | 42 | * |
43 | * This gets a reference to a set of supplementary groups. | 43 | * This gets a reference to a set of supplementary groups. |
44 | * | 44 | * |
45 | * If the caller is accessing a task's credentials, they must hold the RCU read | 45 | * If the caller is accessing a task's credentials, they must hold the RCU read |
46 | * lock when reading. | 46 | * lock when reading. |
47 | */ | 47 | */ |
48 | static inline struct group_info *get_group_info(struct group_info *gi) | 48 | static inline struct group_info *get_group_info(struct group_info *gi) |
49 | { | 49 | { |
50 | atomic_inc(&gi->usage); | 50 | atomic_inc(&gi->usage); |
51 | return gi; | 51 | return gi; |
52 | } | 52 | } |
53 | 53 | ||
54 | /** | 54 | /** |
55 | * put_group_info - Release a reference to a group info structure | 55 | * put_group_info - Release a reference to a group info structure |
56 | * @group_info: The group info to release | 56 | * @group_info: The group info to release |
57 | */ | 57 | */ |
58 | #define put_group_info(group_info) \ | 58 | #define put_group_info(group_info) \ |
59 | do { \ | 59 | do { \ |
60 | if (atomic_dec_and_test(&(group_info)->usage)) \ | 60 | if (atomic_dec_and_test(&(group_info)->usage)) \ |
61 | groups_free(group_info); \ | 61 | groups_free(group_info); \ |
62 | } while (0) | 62 | } while (0) |
63 | 63 | ||
64 | extern struct group_info *groups_alloc(int); | 64 | extern struct group_info *groups_alloc(int); |
65 | extern struct group_info init_groups; | 65 | extern struct group_info init_groups; |
66 | extern void groups_free(struct group_info *); | 66 | extern void groups_free(struct group_info *); |
67 | extern int set_current_groups(struct group_info *); | 67 | extern int set_current_groups(struct group_info *); |
68 | extern int set_groups(struct cred *, struct group_info *); | 68 | extern int set_groups(struct cred *, struct group_info *); |
69 | extern int groups_search(const struct group_info *, gid_t); | 69 | extern int groups_search(const struct group_info *, gid_t); |
70 | 70 | ||
71 | /* access the groups "array" with this macro */ | 71 | /* access the groups "array" with this macro */ |
72 | #define GROUP_AT(gi, i) \ | 72 | #define GROUP_AT(gi, i) \ |
73 | ((gi)->blocks[(i) / NGROUPS_PER_BLOCK][(i) % NGROUPS_PER_BLOCK]) | 73 | ((gi)->blocks[(i) / NGROUPS_PER_BLOCK][(i) % NGROUPS_PER_BLOCK]) |
74 | 74 | ||
75 | extern int in_group_p(gid_t); | 75 | extern int in_group_p(gid_t); |
76 | extern int in_egroup_p(gid_t); | 76 | extern int in_egroup_p(gid_t); |
77 | 77 | ||
78 | /* | 78 | /* |
79 | * The common credentials for a thread group | 79 | * The common credentials for a thread group |
80 | * - shared by CLONE_THREAD | 80 | * - shared by CLONE_THREAD |
81 | */ | 81 | */ |
82 | #ifdef CONFIG_KEYS | 82 | #ifdef CONFIG_KEYS |
83 | struct thread_group_cred { | 83 | struct thread_group_cred { |
84 | atomic_t usage; | 84 | atomic_t usage; |
85 | pid_t tgid; /* thread group process ID */ | 85 | pid_t tgid; /* thread group process ID */ |
86 | spinlock_t lock; | 86 | spinlock_t lock; |
87 | struct key __rcu *session_keyring; /* keyring inherited over fork */ | 87 | struct key __rcu *session_keyring; /* keyring inherited over fork */ |
88 | struct key *process_keyring; /* keyring private to this process */ | 88 | struct key *process_keyring; /* keyring private to this process */ |
89 | struct rcu_head rcu; /* RCU deletion hook */ | 89 | struct rcu_head rcu; /* RCU deletion hook */ |
90 | }; | 90 | }; |
91 | #endif | 91 | #endif |
92 | 92 | ||
93 | /* | 93 | /* |
94 | * The security context of a task | 94 | * The security context of a task |
95 | * | 95 | * |
96 | * The parts of the context break down into two categories: | 96 | * The parts of the context break down into two categories: |
97 | * | 97 | * |
98 | * (1) The objective context of a task. These parts are used when some other | 98 | * (1) The objective context of a task. These parts are used when some other |
99 | * task is attempting to affect this one. | 99 | * task is attempting to affect this one. |
100 | * | 100 | * |
101 | * (2) The subjective context. These details are used when the task is acting | 101 | * (2) The subjective context. These details are used when the task is acting |
102 | * upon another object, be that a file, a task, a key or whatever. | 102 | * upon another object, be that a file, a task, a key or whatever. |
103 | * | 103 | * |
104 | * Note that some members of this structure belong to both categories - the | 104 | * Note that some members of this structure belong to both categories - the |
105 | * LSM security pointer for instance. | 105 | * LSM security pointer for instance. |
106 | * | 106 | * |
107 | * A task has two security pointers. task->real_cred points to the objective | 107 | * A task has two security pointers. task->real_cred points to the objective |
108 | * context that defines that task's actual details. The objective part of this | 108 | * context that defines that task's actual details. The objective part of this |
109 | * context is used whenever that task is acted upon. | 109 | * context is used whenever that task is acted upon. |
110 | * | 110 | * |
111 | * task->cred points to the subjective context that defines the details of how | 111 | * task->cred points to the subjective context that defines the details of how |
112 | * that task is going to act upon another object. This may be overridden | 112 | * that task is going to act upon another object. This may be overridden |
113 | * temporarily to point to another security context, but normally points to the | 113 | * temporarily to point to another security context, but normally points to the |
114 | * same context as task->real_cred. | 114 | * same context as task->real_cred. |
115 | */ | 115 | */ |
116 | struct cred { | 116 | struct cred { |
117 | atomic_t usage; | 117 | atomic_t usage; |
118 | #ifdef CONFIG_DEBUG_CREDENTIALS | 118 | #ifdef CONFIG_DEBUG_CREDENTIALS |
119 | atomic_t subscribers; /* number of processes subscribed */ | 119 | atomic_t subscribers; /* number of processes subscribed */ |
120 | void *put_addr; | 120 | void *put_addr; |
121 | unsigned magic; | 121 | unsigned magic; |
122 | #define CRED_MAGIC 0x43736564 | 122 | #define CRED_MAGIC 0x43736564 |
123 | #define CRED_MAGIC_DEAD 0x44656144 | 123 | #define CRED_MAGIC_DEAD 0x44656144 |
124 | #endif | 124 | #endif |
125 | uid_t uid; /* real UID of the task */ | 125 | uid_t uid; /* real UID of the task */ |
126 | gid_t gid; /* real GID of the task */ | 126 | gid_t gid; /* real GID of the task */ |
127 | uid_t suid; /* saved UID of the task */ | 127 | uid_t suid; /* saved UID of the task */ |
128 | gid_t sgid; /* saved GID of the task */ | 128 | gid_t sgid; /* saved GID of the task */ |
129 | uid_t euid; /* effective UID of the task */ | 129 | uid_t euid; /* effective UID of the task */ |
130 | gid_t egid; /* effective GID of the task */ | 130 | gid_t egid; /* effective GID of the task */ |
131 | uid_t fsuid; /* UID for VFS ops */ | 131 | uid_t fsuid; /* UID for VFS ops */ |
132 | gid_t fsgid; /* GID for VFS ops */ | 132 | gid_t fsgid; /* GID for VFS ops */ |
133 | unsigned securebits; /* SUID-less security management */ | 133 | unsigned securebits; /* SUID-less security management */ |
134 | kernel_cap_t cap_inheritable; /* caps our children can inherit */ | 134 | kernel_cap_t cap_inheritable; /* caps our children can inherit */ |
135 | kernel_cap_t cap_permitted; /* caps we're permitted */ | 135 | kernel_cap_t cap_permitted; /* caps we're permitted */ |
136 | kernel_cap_t cap_effective; /* caps we can actually use */ | 136 | kernel_cap_t cap_effective; /* caps we can actually use */ |
137 | kernel_cap_t cap_bset; /* capability bounding set */ | 137 | kernel_cap_t cap_bset; /* capability bounding set */ |
138 | #ifdef CONFIG_KEYS | 138 | #ifdef CONFIG_KEYS |
139 | unsigned char jit_keyring; /* default keyring to attach requested | 139 | unsigned char jit_keyring; /* default keyring to attach requested |
140 | * keys to */ | 140 | * keys to */ |
141 | struct key *thread_keyring; /* keyring private to this thread */ | 141 | struct key *thread_keyring; /* keyring private to this thread */ |
142 | struct key *request_key_auth; /* assumed request_key authority */ | 142 | struct key *request_key_auth; /* assumed request_key authority */ |
143 | struct thread_group_cred *tgcred; /* thread-group shared credentials */ | 143 | struct thread_group_cred *tgcred; /* thread-group shared credentials */ |
144 | #endif | 144 | #endif |
145 | #ifdef CONFIG_SECURITY | 145 | #ifdef CONFIG_SECURITY |
146 | void *security; /* subjective LSM security */ | 146 | void *security; /* subjective LSM security */ |
147 | #endif | 147 | #endif |
148 | struct user_struct *user; /* real user ID subscription */ | 148 | struct user_struct *user; /* real user ID subscription */ |
149 | struct user_namespace *user_ns; /* cached user->user_ns */ | 149 | struct user_namespace *user_ns; /* cached user->user_ns */ |
150 | struct group_info *group_info; /* supplementary groups for euid/fsgid */ | 150 | struct group_info *group_info; /* supplementary groups for euid/fsgid */ |
151 | struct rcu_head rcu; /* RCU deletion hook */ | 151 | struct rcu_head rcu; /* RCU deletion hook */ |
152 | }; | 152 | }; |
153 | 153 | ||
154 | extern void __put_cred(struct cred *); | 154 | extern void __put_cred(struct cred *); |
155 | extern void exit_creds(struct task_struct *); | 155 | extern void exit_creds(struct task_struct *); |
156 | extern int copy_creds(struct task_struct *, unsigned long); | 156 | extern int copy_creds(struct task_struct *, unsigned long); |
157 | extern const struct cred *get_task_cred(struct task_struct *); | 157 | extern const struct cred *get_task_cred(struct task_struct *); |
158 | extern struct cred *cred_alloc_blank(void); | 158 | extern struct cred *cred_alloc_blank(void); |
159 | extern struct cred *prepare_creds(void); | 159 | extern struct cred *prepare_creds(void); |
160 | extern struct cred *prepare_exec_creds(void); | 160 | extern struct cred *prepare_exec_creds(void); |
161 | extern int commit_creds(struct cred *); | 161 | extern int commit_creds(struct cred *); |
162 | extern void abort_creds(struct cred *); | 162 | extern void abort_creds(struct cred *); |
163 | extern const struct cred *override_creds(const struct cred *); | 163 | extern const struct cred *override_creds(const struct cred *); |
164 | extern void revert_creds(const struct cred *); | 164 | extern void revert_creds(const struct cred *); |
165 | extern struct cred *prepare_kernel_cred(struct task_struct *); | 165 | extern struct cred *prepare_kernel_cred(struct task_struct *); |
166 | extern int change_create_files_as(struct cred *, struct inode *); | 166 | extern int change_create_files_as(struct cred *, struct inode *); |
167 | extern int set_security_override(struct cred *, u32); | 167 | extern int set_security_override(struct cred *, u32); |
168 | extern int set_security_override_from_ctx(struct cred *, const char *); | 168 | extern int set_security_override_from_ctx(struct cred *, const char *); |
169 | extern int set_create_files_as(struct cred *, struct inode *); | 169 | extern int set_create_files_as(struct cred *, struct inode *); |
170 | extern void __init cred_init(void); | 170 | extern void __init cred_init(void); |
171 | 171 | ||
172 | /* | 172 | /* |
173 | * check for validity of credentials | 173 | * check for validity of credentials |
174 | */ | 174 | */ |
175 | #ifdef CONFIG_DEBUG_CREDENTIALS | 175 | #ifdef CONFIG_DEBUG_CREDENTIALS |
176 | extern void __invalid_creds(const struct cred *, const char *, unsigned); | 176 | extern void __invalid_creds(const struct cred *, const char *, unsigned); |
177 | extern void __validate_process_creds(struct task_struct *, | 177 | extern void __validate_process_creds(struct task_struct *, |
178 | const char *, unsigned); | 178 | const char *, unsigned); |
179 | 179 | ||
180 | extern bool creds_are_invalid(const struct cred *cred); | 180 | extern bool creds_are_invalid(const struct cred *cred); |
181 | 181 | ||
182 | static inline void __validate_creds(const struct cred *cred, | 182 | static inline void __validate_creds(const struct cred *cred, |
183 | const char *file, unsigned line) | 183 | const char *file, unsigned line) |
184 | { | 184 | { |
185 | if (unlikely(creds_are_invalid(cred))) | 185 | if (unlikely(creds_are_invalid(cred))) |
186 | __invalid_creds(cred, file, line); | 186 | __invalid_creds(cred, file, line); |
187 | } | 187 | } |
188 | 188 | ||
189 | #define validate_creds(cred) \ | 189 | #define validate_creds(cred) \ |
190 | do { \ | 190 | do { \ |
191 | __validate_creds((cred), __FILE__, __LINE__); \ | 191 | __validate_creds((cred), __FILE__, __LINE__); \ |
192 | } while(0) | 192 | } while(0) |
193 | 193 | ||
194 | #define validate_process_creds() \ | 194 | #define validate_process_creds() \ |
195 | do { \ | 195 | do { \ |
196 | __validate_process_creds(current, __FILE__, __LINE__); \ | 196 | __validate_process_creds(current, __FILE__, __LINE__); \ |
197 | } while(0) | 197 | } while(0) |
198 | 198 | ||
199 | extern void validate_creds_for_do_exit(struct task_struct *); | 199 | extern void validate_creds_for_do_exit(struct task_struct *); |
200 | #else | 200 | #else |
201 | static inline void validate_creds(const struct cred *cred) | 201 | static inline void validate_creds(const struct cred *cred) |
202 | { | 202 | { |
203 | } | 203 | } |
204 | static inline void validate_creds_for_do_exit(struct task_struct *tsk) | 204 | static inline void validate_creds_for_do_exit(struct task_struct *tsk) |
205 | { | 205 | { |
206 | } | 206 | } |
207 | static inline void validate_process_creds(void) | 207 | static inline void validate_process_creds(void) |
208 | { | 208 | { |
209 | } | 209 | } |
210 | #endif | 210 | #endif |
211 | 211 | ||
212 | /** | 212 | /** |
213 | * get_new_cred - Get a reference on a new set of credentials | 213 | * get_new_cred - Get a reference on a new set of credentials |
214 | * @cred: The new credentials to reference | 214 | * @cred: The new credentials to reference |
215 | * | 215 | * |
216 | * Get a reference on the specified set of new credentials. The caller must | 216 | * Get a reference on the specified set of new credentials. The caller must |
217 | * release the reference. | 217 | * release the reference. |
218 | */ | 218 | */ |
219 | static inline struct cred *get_new_cred(struct cred *cred) | 219 | static inline struct cred *get_new_cred(struct cred *cred) |
220 | { | 220 | { |
221 | atomic_inc(&cred->usage); | 221 | atomic_inc(&cred->usage); |
222 | return cred; | 222 | return cred; |
223 | } | 223 | } |
224 | 224 | ||
225 | /** | 225 | /** |
226 | * get_cred - Get a reference on a set of credentials | 226 | * get_cred - Get a reference on a set of credentials |
227 | * @cred: The credentials to reference | 227 | * @cred: The credentials to reference |
228 | * | 228 | * |
229 | * Get a reference on the specified set of credentials. The caller must | 229 | * Get a reference on the specified set of credentials. The caller must |
230 | * release the reference. | 230 | * release the reference. |
231 | * | 231 | * |
232 | * This is used to deal with a committed set of credentials. Although the | 232 | * This is used to deal with a committed set of credentials. Although the |
233 | * pointer is const, this will temporarily discard the const and increment the | 233 | * pointer is const, this will temporarily discard the const and increment the |
234 | * usage count. The purpose of this is to attempt to catch at compile time the | 234 | * usage count. The purpose of this is to attempt to catch at compile time the |
235 | * accidental alteration of a set of credentials that should be considered | 235 | * accidental alteration of a set of credentials that should be considered |
236 | * immutable. | 236 | * immutable. |
237 | */ | 237 | */ |
238 | static inline const struct cred *get_cred(const struct cred *cred) | 238 | static inline const struct cred *get_cred(const struct cred *cred) |
239 | { | 239 | { |
240 | struct cred *nonconst_cred = (struct cred *) cred; | 240 | struct cred *nonconst_cred = (struct cred *) cred; |
241 | validate_creds(cred); | 241 | validate_creds(cred); |
242 | return get_new_cred(nonconst_cred); | 242 | return get_new_cred(nonconst_cred); |
243 | } | 243 | } |
244 | 244 | ||
245 | /** | 245 | /** |
246 | * put_cred - Release a reference to a set of credentials | 246 | * put_cred - Release a reference to a set of credentials |
247 | * @cred: The credentials to release | 247 | * @cred: The credentials to release |
248 | * | 248 | * |
249 | * Release a reference to a set of credentials, deleting them when the last ref | 249 | * Release a reference to a set of credentials, deleting them when the last ref |
250 | * is released. | 250 | * is released. |
251 | * | 251 | * |
252 | * This takes a const pointer to a set of credentials because the credentials | 252 | * This takes a const pointer to a set of credentials because the credentials |
253 | * on task_struct are attached by const pointers to prevent accidental | 253 | * on task_struct are attached by const pointers to prevent accidental |
254 | * alteration of otherwise immutable credential sets. | 254 | * alteration of otherwise immutable credential sets. |
255 | */ | 255 | */ |
256 | static inline void put_cred(const struct cred *_cred) | 256 | static inline void put_cred(const struct cred *_cred) |
257 | { | 257 | { |
258 | struct cred *cred = (struct cred *) _cred; | 258 | struct cred *cred = (struct cred *) _cred; |
259 | 259 | ||
260 | validate_creds(cred); | 260 | validate_creds(cred); |
261 | if (atomic_dec_and_test(&(cred)->usage)) | 261 | if (atomic_dec_and_test(&(cred)->usage)) |
262 | __put_cred(cred); | 262 | __put_cred(cred); |
263 | } | 263 | } |
264 | 264 | ||
265 | /** | 265 | /** |
266 | * current_cred - Access the current task's subjective credentials | 266 | * current_cred - Access the current task's subjective credentials |
267 | * | 267 | * |
268 | * Access the subjective credentials of the current task. | 268 | * Access the subjective credentials of the current task. |
269 | */ | 269 | */ |
270 | #define current_cred() \ | 270 | #define current_cred() \ |
271 | (current->cred) | 271 | (current->cred) |
272 | 272 | ||
273 | /** | 273 | /** |
274 | * __task_cred - Access a task's objective credentials | 274 | * __task_cred - Access a task's objective credentials |
275 | * @task: The task to query | 275 | * @task: The task to query |
276 | * | 276 | * |
277 | * Access the objective credentials of a task. The caller must hold the RCU | 277 | * Access the objective credentials of a task. The caller must hold the RCU |
278 | * readlock or the task must be dead and unable to change its own credentials. | 278 | * readlock or the task must be dead and unable to change its own credentials. |
279 | * | 279 | * |
280 | * The result of this function should not be passed directly to get_cred(); | 280 | * The result of this function should not be passed directly to get_cred(); |
281 | * rather get_task_cred() should be used instead. | 281 | * rather get_task_cred() should be used instead. |
282 | */ | 282 | */ |
283 | #define __task_cred(task) \ | 283 | #define __task_cred(task) \ |
284 | ({ \ | 284 | ({ \ |
285 | const struct task_struct *__t = (task); \ | 285 | const struct task_struct *__t = (task); \ |
286 | rcu_dereference_check(__t->real_cred, \ | 286 | rcu_dereference_check(__t->real_cred, \ |
287 | rcu_read_lock_held() || \ | ||
288 | task_is_dead(__t)); \ | 287 | task_is_dead(__t)); \ |
289 | }) | 288 | }) |
290 | 289 | ||
291 | /** | 290 | /** |
292 | * get_current_cred - Get the current task's subjective credentials | 291 | * get_current_cred - Get the current task's subjective credentials |
293 | * | 292 | * |
294 | * Get the subjective credentials of the current task, pinning them so that | 293 | * Get the subjective credentials of the current task, pinning them so that |
295 | * they can't go away. Accessing the current task's credentials directly is | 294 | * they can't go away. Accessing the current task's credentials directly is |
296 | * not permitted. | 295 | * not permitted. |
297 | */ | 296 | */ |
298 | #define get_current_cred() \ | 297 | #define get_current_cred() \ |
299 | (get_cred(current_cred())) | 298 | (get_cred(current_cred())) |
300 | 299 | ||
301 | /** | 300 | /** |
302 | * get_current_user - Get the current task's user_struct | 301 | * get_current_user - Get the current task's user_struct |
303 | * | 302 | * |
304 | * Get the user record of the current task, pinning it so that it can't go | 303 | * Get the user record of the current task, pinning it so that it can't go |
305 | * away. | 304 | * away. |
306 | */ | 305 | */ |
307 | #define get_current_user() \ | 306 | #define get_current_user() \ |
308 | ({ \ | 307 | ({ \ |
309 | struct user_struct *__u; \ | 308 | struct user_struct *__u; \ |
310 | struct cred *__cred; \ | 309 | struct cred *__cred; \ |
311 | __cred = (struct cred *) current_cred(); \ | 310 | __cred = (struct cred *) current_cred(); \ |
312 | __u = get_uid(__cred->user); \ | 311 | __u = get_uid(__cred->user); \ |
313 | __u; \ | 312 | __u; \ |
314 | }) | 313 | }) |
315 | 314 | ||
316 | /** | 315 | /** |
317 | * get_current_groups - Get the current task's supplementary group list | 316 | * get_current_groups - Get the current task's supplementary group list |
318 | * | 317 | * |
319 | * Get the supplementary group list of the current task, pinning it so that it | 318 | * Get the supplementary group list of the current task, pinning it so that it |
320 | * can't go away. | 319 | * can't go away. |
321 | */ | 320 | */ |
322 | #define get_current_groups() \ | 321 | #define get_current_groups() \ |
323 | ({ \ | 322 | ({ \ |
324 | struct group_info *__groups; \ | 323 | struct group_info *__groups; \ |
325 | struct cred *__cred; \ | 324 | struct cred *__cred; \ |
326 | __cred = (struct cred *) current_cred(); \ | 325 | __cred = (struct cred *) current_cred(); \ |
327 | __groups = get_group_info(__cred->group_info); \ | 326 | __groups = get_group_info(__cred->group_info); \ |
328 | __groups; \ | 327 | __groups; \ |
329 | }) | 328 | }) |
330 | 329 | ||
331 | #define task_cred_xxx(task, xxx) \ | 330 | #define task_cred_xxx(task, xxx) \ |
332 | ({ \ | 331 | ({ \ |
333 | __typeof__(((struct cred *)NULL)->xxx) ___val; \ | 332 | __typeof__(((struct cred *)NULL)->xxx) ___val; \ |
334 | rcu_read_lock(); \ | 333 | rcu_read_lock(); \ |
335 | ___val = __task_cred((task))->xxx; \ | 334 | ___val = __task_cred((task))->xxx; \ |
336 | rcu_read_unlock(); \ | 335 | rcu_read_unlock(); \ |
337 | ___val; \ | 336 | ___val; \ |
338 | }) | 337 | }) |
339 | 338 | ||
340 | #define task_uid(task) (task_cred_xxx((task), uid)) | 339 | #define task_uid(task) (task_cred_xxx((task), uid)) |
341 | #define task_euid(task) (task_cred_xxx((task), euid)) | 340 | #define task_euid(task) (task_cred_xxx((task), euid)) |
342 | 341 | ||
343 | #define current_cred_xxx(xxx) \ | 342 | #define current_cred_xxx(xxx) \ |
344 | ({ \ | 343 | ({ \ |
345 | current->cred->xxx; \ | 344 | current->cred->xxx; \ |
346 | }) | 345 | }) |
347 | 346 | ||
348 | #define current_uid() (current_cred_xxx(uid)) | 347 | #define current_uid() (current_cred_xxx(uid)) |
349 | #define current_gid() (current_cred_xxx(gid)) | 348 | #define current_gid() (current_cred_xxx(gid)) |
350 | #define current_euid() (current_cred_xxx(euid)) | 349 | #define current_euid() (current_cred_xxx(euid)) |
351 | #define current_egid() (current_cred_xxx(egid)) | 350 | #define current_egid() (current_cred_xxx(egid)) |
352 | #define current_suid() (current_cred_xxx(suid)) | 351 | #define current_suid() (current_cred_xxx(suid)) |
353 | #define current_sgid() (current_cred_xxx(sgid)) | 352 | #define current_sgid() (current_cred_xxx(sgid)) |
354 | #define current_fsuid() (current_cred_xxx(fsuid)) | 353 | #define current_fsuid() (current_cred_xxx(fsuid)) |
355 | #define current_fsgid() (current_cred_xxx(fsgid)) | 354 | #define current_fsgid() (current_cred_xxx(fsgid)) |
356 | #define current_cap() (current_cred_xxx(cap_effective)) | 355 | #define current_cap() (current_cred_xxx(cap_effective)) |
357 | #define current_user() (current_cred_xxx(user)) | 356 | #define current_user() (current_cred_xxx(user)) |
358 | #define current_security() (current_cred_xxx(security)) | 357 | #define current_security() (current_cred_xxx(security)) |
359 | 358 | ||
360 | #ifdef CONFIG_USER_NS | 359 | #ifdef CONFIG_USER_NS |
361 | #define current_user_ns() (current_cred_xxx(user_ns)) | 360 | #define current_user_ns() (current_cred_xxx(user_ns)) |
362 | #else | 361 | #else |
363 | extern struct user_namespace init_user_ns; | 362 | extern struct user_namespace init_user_ns; |
364 | #define current_user_ns() (&init_user_ns) | 363 | #define current_user_ns() (&init_user_ns) |
365 | #endif | 364 | #endif |
366 | 365 | ||
367 | 366 | ||
368 | #define current_uid_gid(_uid, _gid) \ | 367 | #define current_uid_gid(_uid, _gid) \ |
369 | do { \ | 368 | do { \ |
370 | const struct cred *__cred; \ | 369 | const struct cred *__cred; \ |
371 | __cred = current_cred(); \ | 370 | __cred = current_cred(); \ |
372 | *(_uid) = __cred->uid; \ | 371 | *(_uid) = __cred->uid; \ |
373 | *(_gid) = __cred->gid; \ | 372 | *(_gid) = __cred->gid; \ |
374 | } while(0) | 373 | } while(0) |
375 | 374 | ||
376 | #define current_euid_egid(_euid, _egid) \ | 375 | #define current_euid_egid(_euid, _egid) \ |
377 | do { \ | 376 | do { \ |
378 | const struct cred *__cred; \ | 377 | const struct cred *__cred; \ |
379 | __cred = current_cred(); \ | 378 | __cred = current_cred(); \ |
380 | *(_euid) = __cred->euid; \ | 379 | *(_euid) = __cred->euid; \ |
381 | *(_egid) = __cred->egid; \ | 380 | *(_egid) = __cred->egid; \ |
382 | } while(0) | 381 | } while(0) |
383 | 382 | ||
384 | #define current_fsuid_fsgid(_fsuid, _fsgid) \ | 383 | #define current_fsuid_fsgid(_fsuid, _fsgid) \ |
385 | do { \ | 384 | do { \ |
386 | const struct cred *__cred; \ | 385 | const struct cred *__cred; \ |
387 | __cred = current_cred(); \ | 386 | __cred = current_cred(); \ |
388 | *(_fsuid) = __cred->fsuid; \ | 387 | *(_fsuid) = __cred->fsuid; \ |
389 | *(_fsgid) = __cred->fsgid; \ | 388 | *(_fsgid) = __cred->fsgid; \ |
390 | } while(0) | 389 | } while(0) |
391 | 390 | ||
392 | #endif /* _LINUX_CRED_H */ | 391 | #endif /* _LINUX_CRED_H */ |
393 | 392 |
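For illustration (not part of the commit): the __task_cred() hunk above is the cred.h payload of this patch. Since ca5ecddf, rcu_dereference_check() already ORs rcu_read_lock_held() into its condition, so the macro only needs to spell out the extra task_is_dead() case. A minimal sketch of a caller under the documented locking rule might look like the following; report_task_uid() is a hypothetical helper used only for this example, not a kernel API.

/*
 * Minimal sketch, not from this commit: report_task_uid() is a
 * hypothetical helper illustrating the __task_cred() locking rule
 * documented above.
 */
#include <linux/cred.h>
#include <linux/rcupdate.h>
#include <linux/sched.h>

static uid_t report_task_uid(struct task_struct *task)
{
	uid_t uid;

	rcu_read_lock();		/* satisfies the rcu_read_lock_held() check
					 * now implied by rcu_dereference_check() */
	uid = __task_cred(task)->uid;	/* objective credentials, RCU-protected */
	rcu_read_unlock();

	return uid;
}

If the credentials must outlive the RCU read-side section, get_task_cred() (declared above) takes a reference instead, which is what the __task_cred() comment recommends over passing the result to get_cred().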
include/linux/fdtable.h
1 | /* | 1 | /* |
2 | * descriptor table internals; you almost certainly want file.h instead. | 2 | * descriptor table internals; you almost certainly want file.h instead. |
3 | */ | 3 | */ |
4 | 4 | ||
5 | #ifndef __LINUX_FDTABLE_H | 5 | #ifndef __LINUX_FDTABLE_H |
6 | #define __LINUX_FDTABLE_H | 6 | #define __LINUX_FDTABLE_H |
7 | 7 | ||
8 | #include <linux/posix_types.h> | 8 | #include <linux/posix_types.h> |
9 | #include <linux/compiler.h> | 9 | #include <linux/compiler.h> |
10 | #include <linux/spinlock.h> | 10 | #include <linux/spinlock.h> |
11 | #include <linux/rcupdate.h> | 11 | #include <linux/rcupdate.h> |
12 | #include <linux/types.h> | 12 | #include <linux/types.h> |
13 | #include <linux/init.h> | 13 | #include <linux/init.h> |
14 | #include <linux/fs.h> | 14 | #include <linux/fs.h> |
15 | 15 | ||
16 | #include <asm/atomic.h> | 16 | #include <asm/atomic.h> |
17 | 17 | ||
18 | /* | 18 | /* |
19 | * The default fd array needs to be at least BITS_PER_LONG, | 19 | * The default fd array needs to be at least BITS_PER_LONG, |
20 | * as this is the granularity returned by copy_fdset(). | 20 | * as this is the granularity returned by copy_fdset(). |
21 | */ | 21 | */ |
22 | #define NR_OPEN_DEFAULT BITS_PER_LONG | 22 | #define NR_OPEN_DEFAULT BITS_PER_LONG |
23 | 23 | ||
24 | /* | 24 | /* |
25 | * The embedded_fd_set is a small fd_set, | 25 | * The embedded_fd_set is a small fd_set, |
26 | * suitable for most tasks (which open <= BITS_PER_LONG files) | 26 | * suitable for most tasks (which open <= BITS_PER_LONG files) |
27 | */ | 27 | */ |
28 | struct embedded_fd_set { | 28 | struct embedded_fd_set { |
29 | unsigned long fds_bits[1]; | 29 | unsigned long fds_bits[1]; |
30 | }; | 30 | }; |
31 | 31 | ||
32 | struct fdtable { | 32 | struct fdtable { |
33 | unsigned int max_fds; | 33 | unsigned int max_fds; |
34 | struct file __rcu **fd; /* current fd array */ | 34 | struct file __rcu **fd; /* current fd array */ |
35 | fd_set *close_on_exec; | 35 | fd_set *close_on_exec; |
36 | fd_set *open_fds; | 36 | fd_set *open_fds; |
37 | struct rcu_head rcu; | 37 | struct rcu_head rcu; |
38 | struct fdtable *next; | 38 | struct fdtable *next; |
39 | }; | 39 | }; |
40 | 40 | ||
41 | /* | 41 | /* |
42 | * Open file table structure | 42 | * Open file table structure |
43 | */ | 43 | */ |
44 | struct files_struct { | 44 | struct files_struct { |
45 | /* | 45 | /* |
46 | * read mostly part | 46 | * read mostly part |
47 | */ | 47 | */ |
48 | atomic_t count; | 48 | atomic_t count; |
49 | struct fdtable __rcu *fdt; | 49 | struct fdtable __rcu *fdt; |
50 | struct fdtable fdtab; | 50 | struct fdtable fdtab; |
51 | /* | 51 | /* |
52 | * written part on a separate cache line in SMP | 52 | * written part on a separate cache line in SMP |
53 | */ | 53 | */ |
54 | spinlock_t file_lock ____cacheline_aligned_in_smp; | 54 | spinlock_t file_lock ____cacheline_aligned_in_smp; |
55 | int next_fd; | 55 | int next_fd; |
56 | struct embedded_fd_set close_on_exec_init; | 56 | struct embedded_fd_set close_on_exec_init; |
57 | struct embedded_fd_set open_fds_init; | 57 | struct embedded_fd_set open_fds_init; |
58 | struct file __rcu * fd_array[NR_OPEN_DEFAULT]; | 58 | struct file __rcu * fd_array[NR_OPEN_DEFAULT]; |
59 | }; | 59 | }; |
60 | 60 | ||
61 | #define rcu_dereference_check_fdtable(files, fdtfd) \ | 61 | #define rcu_dereference_check_fdtable(files, fdtfd) \ |
62 | (rcu_dereference_check((fdtfd), \ | 62 | (rcu_dereference_check((fdtfd), \ |
63 | rcu_read_lock_held() || \ | ||
64 | lockdep_is_held(&(files)->file_lock) || \ | 63 | lockdep_is_held(&(files)->file_lock) || \ |
65 | atomic_read(&(files)->count) == 1 || \ | 64 | atomic_read(&(files)->count) == 1 || \ |
66 | rcu_my_thread_group_empty())) | 65 | rcu_my_thread_group_empty())) |
67 | 66 | ||
68 | #define files_fdtable(files) \ | 67 | #define files_fdtable(files) \ |
69 | (rcu_dereference_check_fdtable((files), (files)->fdt)) | 68 | (rcu_dereference_check_fdtable((files), (files)->fdt)) |
70 | 69 | ||
71 | struct file_operations; | 70 | struct file_operations; |
72 | struct vfsmount; | 71 | struct vfsmount; |
73 | struct dentry; | 72 | struct dentry; |
74 | 73 | ||
75 | extern int expand_files(struct files_struct *, int nr); | 74 | extern int expand_files(struct files_struct *, int nr); |
76 | extern void free_fdtable_rcu(struct rcu_head *rcu); | 75 | extern void free_fdtable_rcu(struct rcu_head *rcu); |
77 | extern void __init files_defer_init(void); | 76 | extern void __init files_defer_init(void); |
78 | 77 | ||
79 | static inline void free_fdtable(struct fdtable *fdt) | 78 | static inline void free_fdtable(struct fdtable *fdt) |
80 | { | 79 | { |
81 | call_rcu(&fdt->rcu, free_fdtable_rcu); | 80 | call_rcu(&fdt->rcu, free_fdtable_rcu); |
82 | } | 81 | } |
83 | 82 | ||
84 | static inline struct file * fcheck_files(struct files_struct *files, unsigned int fd) | 83 | static inline struct file * fcheck_files(struct files_struct *files, unsigned int fd) |
85 | { | 84 | { |
86 | struct file * file = NULL; | 85 | struct file * file = NULL; |
87 | struct fdtable *fdt = files_fdtable(files); | 86 | struct fdtable *fdt = files_fdtable(files); |
88 | 87 | ||
89 | if (fd < fdt->max_fds) | 88 | if (fd < fdt->max_fds) |
90 | file = rcu_dereference_check_fdtable(files, fdt->fd[fd]); | 89 | file = rcu_dereference_check_fdtable(files, fdt->fd[fd]); |
91 | return file; | 90 | return file; |
92 | } | 91 | } |
93 | 92 | ||
94 | /* | 93 | /* |
95 | * Check whether the specified fd has an open file. | 94 | * Check whether the specified fd has an open file. |
96 | */ | 95 | */ |
97 | #define fcheck(fd) fcheck_files(current->files, fd) | 96 | #define fcheck(fd) fcheck_files(current->files, fd) |
98 | 97 | ||
99 | struct task_struct; | 98 | struct task_struct; |
100 | 99 | ||
101 | struct files_struct *get_files_struct(struct task_struct *); | 100 | struct files_struct *get_files_struct(struct task_struct *); |
102 | void put_files_struct(struct files_struct *fs); | 101 | void put_files_struct(struct files_struct *fs); |
103 | void reset_files_struct(struct files_struct *); | 102 | void reset_files_struct(struct files_struct *); |
104 | int unshare_files(struct files_struct **); | 103 | int unshare_files(struct files_struct **); |
105 | struct files_struct *dup_fd(struct files_struct *, int *); | 104 | struct files_struct *dup_fd(struct files_struct *, int *); |
106 | 105 | ||
107 | extern struct kmem_cache *files_cachep; | 106 | extern struct kmem_cache *files_cachep; |
108 | 107 | ||
109 | #endif /* __LINUX_FDTABLE_H */ | 108 | #endif /* __LINUX_FDTABLE_H */ |
110 | 109 |
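For illustration (not part of the commit): a reader of the fd table either holds the RCU read lock or satisfies one of the other conditions listed in rcu_dereference_check_fdtable(); with rcu_read_lock_held() now implied by rcu_dereference_check() itself, only the file_lock, single-user and thread-group-empty cases remain spelled out. A minimal sketch of the RCU path follows; fd_has_file() is a hypothetical helper, not part of this patch.

/*
 * Minimal sketch, not from this commit: fd_has_file() is a hypothetical
 * helper showing the RCU read-side path through fcheck_files() and
 * rcu_dereference_check_fdtable().
 */
#include <linux/fdtable.h>
#include <linux/rcupdate.h>

static bool fd_has_file(struct files_struct *files, unsigned int fd)
{
	struct file *file;

	rcu_read_lock();			/* implicit rcu_read_lock_held() in the check */
	file = fcheck_files(files, fd);		/* NULL if fd is closed or out of range */
	rcu_read_unlock();

	return file != NULL;
}

Updates to the table are made under files->file_lock, which is why lockdep_is_held(&files->file_lock) stays in the condition.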
include/linux/rtnetlink.h
1 | #ifndef __LINUX_RTNETLINK_H | 1 | #ifndef __LINUX_RTNETLINK_H |
2 | #define __LINUX_RTNETLINK_H | 2 | #define __LINUX_RTNETLINK_H |
3 | 3 | ||
4 | #include <linux/types.h> | 4 | #include <linux/types.h> |
5 | #include <linux/netlink.h> | 5 | #include <linux/netlink.h> |
6 | #include <linux/if_link.h> | 6 | #include <linux/if_link.h> |
7 | #include <linux/if_addr.h> | 7 | #include <linux/if_addr.h> |
8 | #include <linux/neighbour.h> | 8 | #include <linux/neighbour.h> |
9 | 9 | ||
10 | /* rtnetlink families. Values up to 127 are reserved for real address | 10 | /* rtnetlink families. Values up to 127 are reserved for real address |
11 | * families, values above 128 may be used arbitrarily. | 11 | * families, values above 128 may be used arbitrarily. |
12 | */ | 12 | */ |
13 | #define RTNL_FAMILY_IPMR 128 | 13 | #define RTNL_FAMILY_IPMR 128 |
14 | #define RTNL_FAMILY_IP6MR 129 | 14 | #define RTNL_FAMILY_IP6MR 129 |
15 | #define RTNL_FAMILY_MAX 129 | 15 | #define RTNL_FAMILY_MAX 129 |
16 | 16 | ||
17 | /**** | 17 | /**** |
18 | * Routing/neighbour discovery messages. | 18 | * Routing/neighbour discovery messages. |
19 | ****/ | 19 | ****/ |
20 | 20 | ||
21 | /* Types of messages */ | 21 | /* Types of messages */ |
22 | 22 | ||
23 | enum { | 23 | enum { |
24 | RTM_BASE = 16, | 24 | RTM_BASE = 16, |
25 | #define RTM_BASE RTM_BASE | 25 | #define RTM_BASE RTM_BASE |
26 | 26 | ||
27 | RTM_NEWLINK = 16, | 27 | RTM_NEWLINK = 16, |
28 | #define RTM_NEWLINK RTM_NEWLINK | 28 | #define RTM_NEWLINK RTM_NEWLINK |
29 | RTM_DELLINK, | 29 | RTM_DELLINK, |
30 | #define RTM_DELLINK RTM_DELLINK | 30 | #define RTM_DELLINK RTM_DELLINK |
31 | RTM_GETLINK, | 31 | RTM_GETLINK, |
32 | #define RTM_GETLINK RTM_GETLINK | 32 | #define RTM_GETLINK RTM_GETLINK |
33 | RTM_SETLINK, | 33 | RTM_SETLINK, |
34 | #define RTM_SETLINK RTM_SETLINK | 34 | #define RTM_SETLINK RTM_SETLINK |
35 | 35 | ||
36 | RTM_NEWADDR = 20, | 36 | RTM_NEWADDR = 20, |
37 | #define RTM_NEWADDR RTM_NEWADDR | 37 | #define RTM_NEWADDR RTM_NEWADDR |
38 | RTM_DELADDR, | 38 | RTM_DELADDR, |
39 | #define RTM_DELADDR RTM_DELADDR | 39 | #define RTM_DELADDR RTM_DELADDR |
40 | RTM_GETADDR, | 40 | RTM_GETADDR, |
41 | #define RTM_GETADDR RTM_GETADDR | 41 | #define RTM_GETADDR RTM_GETADDR |
42 | 42 | ||
43 | RTM_NEWROUTE = 24, | 43 | RTM_NEWROUTE = 24, |
44 | #define RTM_NEWROUTE RTM_NEWROUTE | 44 | #define RTM_NEWROUTE RTM_NEWROUTE |
45 | RTM_DELROUTE, | 45 | RTM_DELROUTE, |
46 | #define RTM_DELROUTE RTM_DELROUTE | 46 | #define RTM_DELROUTE RTM_DELROUTE |
47 | RTM_GETROUTE, | 47 | RTM_GETROUTE, |
48 | #define RTM_GETROUTE RTM_GETROUTE | 48 | #define RTM_GETROUTE RTM_GETROUTE |
49 | 49 | ||
50 | RTM_NEWNEIGH = 28, | 50 | RTM_NEWNEIGH = 28, |
51 | #define RTM_NEWNEIGH RTM_NEWNEIGH | 51 | #define RTM_NEWNEIGH RTM_NEWNEIGH |
52 | RTM_DELNEIGH, | 52 | RTM_DELNEIGH, |
53 | #define RTM_DELNEIGH RTM_DELNEIGH | 53 | #define RTM_DELNEIGH RTM_DELNEIGH |
54 | RTM_GETNEIGH, | 54 | RTM_GETNEIGH, |
55 | #define RTM_GETNEIGH RTM_GETNEIGH | 55 | #define RTM_GETNEIGH RTM_GETNEIGH |
56 | 56 | ||
57 | RTM_NEWRULE = 32, | 57 | RTM_NEWRULE = 32, |
58 | #define RTM_NEWRULE RTM_NEWRULE | 58 | #define RTM_NEWRULE RTM_NEWRULE |
59 | RTM_DELRULE, | 59 | RTM_DELRULE, |
60 | #define RTM_DELRULE RTM_DELRULE | 60 | #define RTM_DELRULE RTM_DELRULE |
61 | RTM_GETRULE, | 61 | RTM_GETRULE, |
62 | #define RTM_GETRULE RTM_GETRULE | 62 | #define RTM_GETRULE RTM_GETRULE |
63 | 63 | ||
64 | RTM_NEWQDISC = 36, | 64 | RTM_NEWQDISC = 36, |
65 | #define RTM_NEWQDISC RTM_NEWQDISC | 65 | #define RTM_NEWQDISC RTM_NEWQDISC |
66 | RTM_DELQDISC, | 66 | RTM_DELQDISC, |
67 | #define RTM_DELQDISC RTM_DELQDISC | 67 | #define RTM_DELQDISC RTM_DELQDISC |
68 | RTM_GETQDISC, | 68 | RTM_GETQDISC, |
69 | #define RTM_GETQDISC RTM_GETQDISC | 69 | #define RTM_GETQDISC RTM_GETQDISC |
70 | 70 | ||
71 | RTM_NEWTCLASS = 40, | 71 | RTM_NEWTCLASS = 40, |
72 | #define RTM_NEWTCLASS RTM_NEWTCLASS | 72 | #define RTM_NEWTCLASS RTM_NEWTCLASS |
73 | RTM_DELTCLASS, | 73 | RTM_DELTCLASS, |
74 | #define RTM_DELTCLASS RTM_DELTCLASS | 74 | #define RTM_DELTCLASS RTM_DELTCLASS |
75 | RTM_GETTCLASS, | 75 | RTM_GETTCLASS, |
76 | #define RTM_GETTCLASS RTM_GETTCLASS | 76 | #define RTM_GETTCLASS RTM_GETTCLASS |
77 | 77 | ||
78 | RTM_NEWTFILTER = 44, | 78 | RTM_NEWTFILTER = 44, |
79 | #define RTM_NEWTFILTER RTM_NEWTFILTER | 79 | #define RTM_NEWTFILTER RTM_NEWTFILTER |
80 | RTM_DELTFILTER, | 80 | RTM_DELTFILTER, |
81 | #define RTM_DELTFILTER RTM_DELTFILTER | 81 | #define RTM_DELTFILTER RTM_DELTFILTER |
82 | RTM_GETTFILTER, | 82 | RTM_GETTFILTER, |
83 | #define RTM_GETTFILTER RTM_GETTFILTER | 83 | #define RTM_GETTFILTER RTM_GETTFILTER |
84 | 84 | ||
85 | RTM_NEWACTION = 48, | 85 | RTM_NEWACTION = 48, |
86 | #define RTM_NEWACTION RTM_NEWACTION | 86 | #define RTM_NEWACTION RTM_NEWACTION |
87 | RTM_DELACTION, | 87 | RTM_DELACTION, |
88 | #define RTM_DELACTION RTM_DELACTION | 88 | #define RTM_DELACTION RTM_DELACTION |
89 | RTM_GETACTION, | 89 | RTM_GETACTION, |
90 | #define RTM_GETACTION RTM_GETACTION | 90 | #define RTM_GETACTION RTM_GETACTION |
91 | 91 | ||
92 | RTM_NEWPREFIX = 52, | 92 | RTM_NEWPREFIX = 52, |
93 | #define RTM_NEWPREFIX RTM_NEWPREFIX | 93 | #define RTM_NEWPREFIX RTM_NEWPREFIX |
94 | 94 | ||
95 | RTM_GETMULTICAST = 58, | 95 | RTM_GETMULTICAST = 58, |
96 | #define RTM_GETMULTICAST RTM_GETMULTICAST | 96 | #define RTM_GETMULTICAST RTM_GETMULTICAST |
97 | 97 | ||
98 | RTM_GETANYCAST = 62, | 98 | RTM_GETANYCAST = 62, |
99 | #define RTM_GETANYCAST RTM_GETANYCAST | 99 | #define RTM_GETANYCAST RTM_GETANYCAST |
100 | 100 | ||
101 | RTM_NEWNEIGHTBL = 64, | 101 | RTM_NEWNEIGHTBL = 64, |
102 | #define RTM_NEWNEIGHTBL RTM_NEWNEIGHTBL | 102 | #define RTM_NEWNEIGHTBL RTM_NEWNEIGHTBL |
103 | RTM_GETNEIGHTBL = 66, | 103 | RTM_GETNEIGHTBL = 66, |
104 | #define RTM_GETNEIGHTBL RTM_GETNEIGHTBL | 104 | #define RTM_GETNEIGHTBL RTM_GETNEIGHTBL |
105 | RTM_SETNEIGHTBL, | 105 | RTM_SETNEIGHTBL, |
106 | #define RTM_SETNEIGHTBL RTM_SETNEIGHTBL | 106 | #define RTM_SETNEIGHTBL RTM_SETNEIGHTBL |
107 | 107 | ||
108 | RTM_NEWNDUSEROPT = 68, | 108 | RTM_NEWNDUSEROPT = 68, |
109 | #define RTM_NEWNDUSEROPT RTM_NEWNDUSEROPT | 109 | #define RTM_NEWNDUSEROPT RTM_NEWNDUSEROPT |
110 | 110 | ||
111 | RTM_NEWADDRLABEL = 72, | 111 | RTM_NEWADDRLABEL = 72, |
112 | #define RTM_NEWADDRLABEL RTM_NEWADDRLABEL | 112 | #define RTM_NEWADDRLABEL RTM_NEWADDRLABEL |
113 | RTM_DELADDRLABEL, | 113 | RTM_DELADDRLABEL, |
114 | #define RTM_DELADDRLABEL RTM_DELADDRLABEL | 114 | #define RTM_DELADDRLABEL RTM_DELADDRLABEL |
115 | RTM_GETADDRLABEL, | 115 | RTM_GETADDRLABEL, |
116 | #define RTM_GETADDRLABEL RTM_GETADDRLABEL | 116 | #define RTM_GETADDRLABEL RTM_GETADDRLABEL |
117 | 117 | ||
118 | RTM_GETDCB = 78, | 118 | RTM_GETDCB = 78, |
119 | #define RTM_GETDCB RTM_GETDCB | 119 | #define RTM_GETDCB RTM_GETDCB |
120 | RTM_SETDCB, | 120 | RTM_SETDCB, |
121 | #define RTM_SETDCB RTM_SETDCB | 121 | #define RTM_SETDCB RTM_SETDCB |
122 | 122 | ||
123 | __RTM_MAX, | 123 | __RTM_MAX, |
124 | #define RTM_MAX (((__RTM_MAX + 3) & ~3) - 1) | 124 | #define RTM_MAX (((__RTM_MAX + 3) & ~3) - 1) |
125 | }; | 125 | }; |
126 | 126 | ||
127 | #define RTM_NR_MSGTYPES (RTM_MAX + 1 - RTM_BASE) | 127 | #define RTM_NR_MSGTYPES (RTM_MAX + 1 - RTM_BASE) |
128 | #define RTM_NR_FAMILIES (RTM_NR_MSGTYPES >> 2) | 128 | #define RTM_NR_FAMILIES (RTM_NR_MSGTYPES >> 2) |
129 | #define RTM_FAM(cmd) (((cmd) - RTM_BASE) >> 2) | 129 | #define RTM_FAM(cmd) (((cmd) - RTM_BASE) >> 2) |
130 | 130 | ||
131 | /* | 131 | /* |
132 | Generic structure for encapsulation of optional route information. | 132 | Generic structure for encapsulation of optional route information. |
133 | It is reminiscent of sockaddr, but with sa_family replaced | 133 | It is reminiscent of sockaddr, but with sa_family replaced |
134 | with attribute type. | 134 | with attribute type. |
135 | */ | 135 | */ |
136 | 136 | ||
137 | struct rtattr { | 137 | struct rtattr { |
138 | unsigned short rta_len; | 138 | unsigned short rta_len; |
139 | unsigned short rta_type; | 139 | unsigned short rta_type; |
140 | }; | 140 | }; |
141 | 141 | ||
142 | /* Macros to handle rtattributes */ | 142 | /* Macros to handle rtattributes */ |
143 | 143 | ||
144 | #define RTA_ALIGNTO 4 | 144 | #define RTA_ALIGNTO 4 |
145 | #define RTA_ALIGN(len) ( ((len)+RTA_ALIGNTO-1) & ~(RTA_ALIGNTO-1) ) | 145 | #define RTA_ALIGN(len) ( ((len)+RTA_ALIGNTO-1) & ~(RTA_ALIGNTO-1) ) |
146 | #define RTA_OK(rta,len) ((len) >= (int)sizeof(struct rtattr) && \ | 146 | #define RTA_OK(rta,len) ((len) >= (int)sizeof(struct rtattr) && \ |
147 | (rta)->rta_len >= sizeof(struct rtattr) && \ | 147 | (rta)->rta_len >= sizeof(struct rtattr) && \ |
148 | (rta)->rta_len <= (len)) | 148 | (rta)->rta_len <= (len)) |
149 | #define RTA_NEXT(rta,attrlen) ((attrlen) -= RTA_ALIGN((rta)->rta_len), \ | 149 | #define RTA_NEXT(rta,attrlen) ((attrlen) -= RTA_ALIGN((rta)->rta_len), \ |
150 | (struct rtattr*)(((char*)(rta)) + RTA_ALIGN((rta)->rta_len))) | 150 | (struct rtattr*)(((char*)(rta)) + RTA_ALIGN((rta)->rta_len))) |
151 | #define RTA_LENGTH(len) (RTA_ALIGN(sizeof(struct rtattr)) + (len)) | 151 | #define RTA_LENGTH(len) (RTA_ALIGN(sizeof(struct rtattr)) + (len)) |
152 | #define RTA_SPACE(len) RTA_ALIGN(RTA_LENGTH(len)) | 152 | #define RTA_SPACE(len) RTA_ALIGN(RTA_LENGTH(len)) |
153 | #define RTA_DATA(rta) ((void*)(((char*)(rta)) + RTA_LENGTH(0))) | 153 | #define RTA_DATA(rta) ((void*)(((char*)(rta)) + RTA_LENGTH(0))) |
154 | #define RTA_PAYLOAD(rta) ((int)((rta)->rta_len) - RTA_LENGTH(0)) | 154 | #define RTA_PAYLOAD(rta) ((int)((rta)->rta_len) - RTA_LENGTH(0)) |
155 | 155 | ||
156 | 156 | ||
157 | 157 | ||
158 | 158 | ||
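As a usage note (illustrative only, unrelated to the RCU change in this commit): the RTA_* macros above are the standard way to walk the attribute stream that follows a struct rtmsg in an rtnetlink message. A minimal userspace-style sketch, assuming a received RTM_NEWROUTE message whose netlink header is nlh; dump_route_oif() is a hypothetical example function.

/*
 * Minimal sketch of rtattr iteration with the macros defined above;
 * dump_route_oif() is a hypothetical example, not kernel code.
 */
#include <stdio.h>
#include <sys/socket.h>
#include <linux/netlink.h>
#include <linux/rtnetlink.h>

static void dump_route_oif(struct nlmsghdr *nlh)
{
	struct rtmsg *rtm = NLMSG_DATA(nlh);	/* fixed header after nlmsghdr */
	struct rtattr *rta = RTM_RTA(rtm);	/* first attribute */
	int attrlen = RTM_PAYLOAD(nlh);		/* bytes of attribute data */

	for (; RTA_OK(rta, attrlen); rta = RTA_NEXT(rta, attrlen)) {
		if (rta->rta_type == RTA_OIF)	/* output interface index */
			printf("output ifindex %d\n",
			       *(int *)RTA_DATA(rta));
	}
}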
159 | /****************************************************************************** | 159 | /****************************************************************************** |
160 | * Definitions used in routing table administration. | 160 | * Definitions used in routing table administration. |
161 | ****/ | 161 | ****/ |
162 | 162 | ||
163 | struct rtmsg { | 163 | struct rtmsg { |
164 | unsigned char rtm_family; | 164 | unsigned char rtm_family; |
165 | unsigned char rtm_dst_len; | 165 | unsigned char rtm_dst_len; |
166 | unsigned char rtm_src_len; | 166 | unsigned char rtm_src_len; |
167 | unsigned char rtm_tos; | 167 | unsigned char rtm_tos; |
168 | 168 | ||
169 | unsigned char rtm_table; /* Routing table id */ | 169 | unsigned char rtm_table; /* Routing table id */ |
170 | unsigned char rtm_protocol; /* Routing protocol; see below */ | 170 | unsigned char rtm_protocol; /* Routing protocol; see below */ |
171 | unsigned char rtm_scope; /* See below */ | 171 | unsigned char rtm_scope; /* See below */ |
172 | unsigned char rtm_type; /* See below */ | 172 | unsigned char rtm_type; /* See below */ |
173 | 173 | ||
174 | unsigned rtm_flags; | 174 | unsigned rtm_flags; |
175 | }; | 175 | }; |
176 | 176 | ||
177 | /* rtm_type */ | 177 | /* rtm_type */ |
178 | 178 | ||
179 | enum { | 179 | enum { |
180 | RTN_UNSPEC, | 180 | RTN_UNSPEC, |
181 | RTN_UNICAST, /* Gateway or direct route */ | 181 | RTN_UNICAST, /* Gateway or direct route */ |
182 | RTN_LOCAL, /* Accept locally */ | 182 | RTN_LOCAL, /* Accept locally */ |
183 | RTN_BROADCAST, /* Accept locally as broadcast, | 183 | RTN_BROADCAST, /* Accept locally as broadcast, |
184 | send as broadcast */ | 184 | send as broadcast */ |
185 | RTN_ANYCAST, /* Accept locally as broadcast, | 185 | RTN_ANYCAST, /* Accept locally as broadcast, |
186 | but send as unicast */ | 186 | but send as unicast */ |
187 | RTN_MULTICAST, /* Multicast route */ | 187 | RTN_MULTICAST, /* Multicast route */ |
188 | RTN_BLACKHOLE, /* Drop */ | 188 | RTN_BLACKHOLE, /* Drop */ |
189 | RTN_UNREACHABLE, /* Destination is unreachable */ | 189 | RTN_UNREACHABLE, /* Destination is unreachable */ |
190 | RTN_PROHIBIT, /* Administratively prohibited */ | 190 | RTN_PROHIBIT, /* Administratively prohibited */ |
191 | RTN_THROW, /* Not in this table */ | 191 | RTN_THROW, /* Not in this table */ |
192 | RTN_NAT, /* Translate this address */ | 192 | RTN_NAT, /* Translate this address */ |
193 | RTN_XRESOLVE, /* Use external resolver */ | 193 | RTN_XRESOLVE, /* Use external resolver */ |
194 | __RTN_MAX | 194 | __RTN_MAX |
195 | }; | 195 | }; |
196 | 196 | ||
197 | #define RTN_MAX (__RTN_MAX - 1) | 197 | #define RTN_MAX (__RTN_MAX - 1) |
198 | 198 | ||
199 | 199 | ||
200 | /* rtm_protocol */ | 200 | /* rtm_protocol */ |
201 | 201 | ||
202 | #define RTPROT_UNSPEC 0 | 202 | #define RTPROT_UNSPEC 0 |
203 | #define RTPROT_REDIRECT 1 /* Route installed by ICMP redirects; | 203 | #define RTPROT_REDIRECT 1 /* Route installed by ICMP redirects; |
204 | not used by current IPv4 */ | 204 | not used by current IPv4 */ |
205 | #define RTPROT_KERNEL 2 /* Route installed by kernel */ | 205 | #define RTPROT_KERNEL 2 /* Route installed by kernel */ |
206 | #define RTPROT_BOOT 3 /* Route installed during boot */ | 206 | #define RTPROT_BOOT 3 /* Route installed during boot */ |
207 | #define RTPROT_STATIC 4 /* Route installed by administrator */ | 207 | #define RTPROT_STATIC 4 /* Route installed by administrator */ |
208 | 208 | ||
209 | /* Values of protocol >= RTPROT_STATIC are not interpreted by kernel; | 209 | /* Values of protocol >= RTPROT_STATIC are not interpreted by kernel; |
210 | they are just passed from user and back as is. | 210 | they are just passed from user and back as is. |
211 | It will be used by hypothetical multiple routing daemons. | 211 | It will be used by hypothetical multiple routing daemons. |
212 | Note that protocol values should be standardized in order to | 212 | Note that protocol values should be standardized in order to |
213 | avoid conflicts. | 213 | avoid conflicts. |
214 | */ | 214 | */ |
215 | 215 | ||
216 | #define RTPROT_GATED 8 /* Apparently, GateD */ | 216 | #define RTPROT_GATED 8 /* Apparently, GateD */ |
217 | #define RTPROT_RA 9 /* RDISC/ND router advertisements */ | 217 | #define RTPROT_RA 9 /* RDISC/ND router advertisements */ |
218 | #define RTPROT_MRT 10 /* Merit MRT */ | 218 | #define RTPROT_MRT 10 /* Merit MRT */ |
219 | #define RTPROT_ZEBRA 11 /* Zebra */ | 219 | #define RTPROT_ZEBRA 11 /* Zebra */ |
220 | #define RTPROT_BIRD 12 /* BIRD */ | 220 | #define RTPROT_BIRD 12 /* BIRD */ |
221 | #define RTPROT_DNROUTED 13 /* DECnet routing daemon */ | 221 | #define RTPROT_DNROUTED 13 /* DECnet routing daemon */ |
222 | #define RTPROT_XORP 14 /* XORP */ | 222 | #define RTPROT_XORP 14 /* XORP */ |
223 | #define RTPROT_NTK 15 /* Netsukuku */ | 223 | #define RTPROT_NTK 15 /* Netsukuku */ |
224 | #define RTPROT_DHCP 16 /* DHCP client */ | 224 | #define RTPROT_DHCP 16 /* DHCP client */ |
225 | 225 | ||
226 | /* rtm_scope | 226 | /* rtm_scope |
227 | 227 | ||
228 | Really it is not scope, but sort of distance to the destination. | 228 | Really it is not scope, but sort of distance to the destination. |
229 | NOWHERE are reserved for not existing destinations, HOST is our | 229 | NOWHERE are reserved for not existing destinations, HOST is our |
230 | local addresses, LINK are destinations, located on directly attached | 230 | local addresses, LINK are destinations, located on directly attached |
231 | link and UNIVERSE is everywhere in the Universe. | 231 | link and UNIVERSE is everywhere in the Universe. |
232 | 232 | ||
233 | Intermediate values are also possible f.e. interior routes | 233 | Intermediate values are also possible f.e. interior routes |
234 | could be assigned a value between UNIVERSE and LINK. | 234 | could be assigned a value between UNIVERSE and LINK. |
235 | */ | 235 | */ |
236 | 236 | ||
237 | enum rt_scope_t { | 237 | enum rt_scope_t { |
238 | RT_SCOPE_UNIVERSE=0, | 238 | RT_SCOPE_UNIVERSE=0, |
239 | /* User defined values */ | 239 | /* User defined values */ |
240 | RT_SCOPE_SITE=200, | 240 | RT_SCOPE_SITE=200, |
241 | RT_SCOPE_LINK=253, | 241 | RT_SCOPE_LINK=253, |
242 | RT_SCOPE_HOST=254, | 242 | RT_SCOPE_HOST=254, |
243 | RT_SCOPE_NOWHERE=255 | 243 | RT_SCOPE_NOWHERE=255 |
244 | }; | 244 | }; |
245 | 245 | ||
246 | /* rtm_flags */ | 246 | /* rtm_flags */ |
247 | 247 | ||
248 | #define RTM_F_NOTIFY 0x100 /* Notify user of route change */ | 248 | #define RTM_F_NOTIFY 0x100 /* Notify user of route change */ |
249 | #define RTM_F_CLONED 0x200 /* This route is cloned */ | 249 | #define RTM_F_CLONED 0x200 /* This route is cloned */ |
250 | #define RTM_F_EQUALIZE 0x400 /* Multipath equalizer: NI */ | 250 | #define RTM_F_EQUALIZE 0x400 /* Multipath equalizer: NI */ |
251 | #define RTM_F_PREFIX 0x800 /* Prefix addresses */ | 251 | #define RTM_F_PREFIX 0x800 /* Prefix addresses */ |
252 | 252 | ||
253 | /* Reserved table identifiers */ | 253 | /* Reserved table identifiers */ |
254 | 254 | ||
255 | enum rt_class_t { | 255 | enum rt_class_t { |
256 | RT_TABLE_UNSPEC=0, | 256 | RT_TABLE_UNSPEC=0, |
257 | /* User defined values */ | 257 | /* User defined values */ |
258 | RT_TABLE_COMPAT=252, | 258 | RT_TABLE_COMPAT=252, |
259 | RT_TABLE_DEFAULT=253, | 259 | RT_TABLE_DEFAULT=253, |
260 | RT_TABLE_MAIN=254, | 260 | RT_TABLE_MAIN=254, |
261 | RT_TABLE_LOCAL=255, | 261 | RT_TABLE_LOCAL=255, |
262 | RT_TABLE_MAX=0xFFFFFFFF | 262 | RT_TABLE_MAX=0xFFFFFFFF |
263 | }; | 263 | }; |
264 | 264 | ||
265 | 265 | ||
266 | /* Routing message attributes */ | 266 | /* Routing message attributes */ |
267 | 267 | ||
268 | enum rtattr_type_t { | 268 | enum rtattr_type_t { |
269 | RTA_UNSPEC, | 269 | RTA_UNSPEC, |
270 | RTA_DST, | 270 | RTA_DST, |
271 | RTA_SRC, | 271 | RTA_SRC, |
272 | RTA_IIF, | 272 | RTA_IIF, |
273 | RTA_OIF, | 273 | RTA_OIF, |
274 | RTA_GATEWAY, | 274 | RTA_GATEWAY, |
275 | RTA_PRIORITY, | 275 | RTA_PRIORITY, |
276 | RTA_PREFSRC, | 276 | RTA_PREFSRC, |
277 | RTA_METRICS, | 277 | RTA_METRICS, |
278 | RTA_MULTIPATH, | 278 | RTA_MULTIPATH, |
279 | RTA_PROTOINFO, /* no longer used */ | 279 | RTA_PROTOINFO, /* no longer used */ |
280 | RTA_FLOW, | 280 | RTA_FLOW, |
281 | RTA_CACHEINFO, | 281 | RTA_CACHEINFO, |
282 | RTA_SESSION, /* no longer used */ | 282 | RTA_SESSION, /* no longer used */ |
283 | RTA_MP_ALGO, /* no longer used */ | 283 | RTA_MP_ALGO, /* no longer used */ |
284 | RTA_TABLE, | 284 | RTA_TABLE, |
285 | RTA_MARK, | 285 | RTA_MARK, |
286 | __RTA_MAX | 286 | __RTA_MAX |
287 | }; | 287 | }; |
288 | 288 | ||
289 | #define RTA_MAX (__RTA_MAX - 1) | 289 | #define RTA_MAX (__RTA_MAX - 1) |
290 | 290 | ||
291 | #define RTM_RTA(r) ((struct rtattr*)(((char*)(r)) + NLMSG_ALIGN(sizeof(struct rtmsg)))) | 291 | #define RTM_RTA(r) ((struct rtattr*)(((char*)(r)) + NLMSG_ALIGN(sizeof(struct rtmsg)))) |
292 | #define RTM_PAYLOAD(n) NLMSG_PAYLOAD(n,sizeof(struct rtmsg)) | 292 | #define RTM_PAYLOAD(n) NLMSG_PAYLOAD(n,sizeof(struct rtmsg)) |
293 | 293 | ||
294 | /* RTM_MULTIPATH --- array of struct rtnexthop. | 294 | /* RTM_MULTIPATH --- array of struct rtnexthop. |
295 | * | 295 | * |
296 | * "struct rtnexthop" describes all necessary nexthop information, | 296 | * "struct rtnexthop" describes all necessary nexthop information, |
297 | * i.e. parameters of path to a destination via this nexthop. | 297 | * i.e. parameters of path to a destination via this nexthop. |
298 | * | 298 | * |
299 | * At the moment it is impossible to set different prefsrc, mtu, window | 299 | * At the moment it is impossible to set different prefsrc, mtu, window |
300 | * and rtt for different paths from multipath. | 300 | * and rtt for different paths from multipath. |
301 | */ | 301 | */ |
302 | 302 | ||
303 | struct rtnexthop { | 303 | struct rtnexthop { |
304 | unsigned short rtnh_len; | 304 | unsigned short rtnh_len; |
305 | unsigned char rtnh_flags; | 305 | unsigned char rtnh_flags; |
306 | unsigned char rtnh_hops; | 306 | unsigned char rtnh_hops; |
307 | int rtnh_ifindex; | 307 | int rtnh_ifindex; |
308 | }; | 308 | }; |
309 | 309 | ||
310 | /* rtnh_flags */ | 310 | /* rtnh_flags */ |
311 | 311 | ||
312 | #define RTNH_F_DEAD 1 /* Nexthop is dead (used by multipath) */ | 312 | #define RTNH_F_DEAD 1 /* Nexthop is dead (used by multipath) */ |
313 | #define RTNH_F_PERVASIVE 2 /* Do recursive gateway lookup */ | 313 | #define RTNH_F_PERVASIVE 2 /* Do recursive gateway lookup */ |
314 | #define RTNH_F_ONLINK 4 /* Gateway is forced on link */ | 314 | #define RTNH_F_ONLINK 4 /* Gateway is forced on link */ |
315 | 315 | ||
316 | /* Macros to handle hexthops */ | 316 | /* Macros to handle hexthops */ |
317 | 317 | ||
318 | #define RTNH_ALIGNTO 4 | 318 | #define RTNH_ALIGNTO 4 |
319 | #define RTNH_ALIGN(len) ( ((len)+RTNH_ALIGNTO-1) & ~(RTNH_ALIGNTO-1) ) | 319 | #define RTNH_ALIGN(len) ( ((len)+RTNH_ALIGNTO-1) & ~(RTNH_ALIGNTO-1) ) |
320 | #define RTNH_OK(rtnh,len) ((rtnh)->rtnh_len >= sizeof(struct rtnexthop) && \ | 320 | #define RTNH_OK(rtnh,len) ((rtnh)->rtnh_len >= sizeof(struct rtnexthop) && \ |
321 | ((int)(rtnh)->rtnh_len) <= (len)) | 321 | ((int)(rtnh)->rtnh_len) <= (len)) |
322 | #define RTNH_NEXT(rtnh) ((struct rtnexthop*)(((char*)(rtnh)) + RTNH_ALIGN((rtnh)->rtnh_len))) | 322 | #define RTNH_NEXT(rtnh) ((struct rtnexthop*)(((char*)(rtnh)) + RTNH_ALIGN((rtnh)->rtnh_len))) |
323 | #define RTNH_LENGTH(len) (RTNH_ALIGN(sizeof(struct rtnexthop)) + (len)) | 323 | #define RTNH_LENGTH(len) (RTNH_ALIGN(sizeof(struct rtnexthop)) + (len)) |
324 | #define RTNH_SPACE(len) RTNH_ALIGN(RTNH_LENGTH(len)) | 324 | #define RTNH_SPACE(len) RTNH_ALIGN(RTNH_LENGTH(len)) |
325 | #define RTNH_DATA(rtnh) ((struct rtattr*)(((char*)(rtnh)) + RTNH_LENGTH(0))) | 325 | #define RTNH_DATA(rtnh) ((struct rtattr*)(((char*)(rtnh)) + RTNH_LENGTH(0))) |
326 | 326 | ||
327 | /* RTM_CACHEINFO */ | 327 | /* RTM_CACHEINFO */ |
328 | 328 | ||
329 | struct rta_cacheinfo { | 329 | struct rta_cacheinfo { |
330 | __u32 rta_clntref; | 330 | __u32 rta_clntref; |
331 | __u32 rta_lastuse; | 331 | __u32 rta_lastuse; |
332 | __s32 rta_expires; | 332 | __s32 rta_expires; |
333 | __u32 rta_error; | 333 | __u32 rta_error; |
334 | __u32 rta_used; | 334 | __u32 rta_used; |
335 | 335 | ||
336 | #define RTNETLINK_HAVE_PEERINFO 1 | 336 | #define RTNETLINK_HAVE_PEERINFO 1 |
337 | __u32 rta_id; | 337 | __u32 rta_id; |
338 | __u32 rta_ts; | 338 | __u32 rta_ts; |
339 | __u32 rta_tsage; | 339 | __u32 rta_tsage; |
340 | }; | 340 | }; |
341 | 341 | ||
342 | /* RTM_METRICS --- array of struct rtattr with types of RTAX_* */ | 342 | /* RTM_METRICS --- array of struct rtattr with types of RTAX_* */ |
343 | 343 | ||
344 | enum { | 344 | enum { |
345 | RTAX_UNSPEC, | 345 | RTAX_UNSPEC, |
346 | #define RTAX_UNSPEC RTAX_UNSPEC | 346 | #define RTAX_UNSPEC RTAX_UNSPEC |
347 | RTAX_LOCK, | 347 | RTAX_LOCK, |
348 | #define RTAX_LOCK RTAX_LOCK | 348 | #define RTAX_LOCK RTAX_LOCK |
349 | RTAX_MTU, | 349 | RTAX_MTU, |
350 | #define RTAX_MTU RTAX_MTU | 350 | #define RTAX_MTU RTAX_MTU |
351 | RTAX_WINDOW, | 351 | RTAX_WINDOW, |
352 | #define RTAX_WINDOW RTAX_WINDOW | 352 | #define RTAX_WINDOW RTAX_WINDOW |
353 | RTAX_RTT, | 353 | RTAX_RTT, |
354 | #define RTAX_RTT RTAX_RTT | 354 | #define RTAX_RTT RTAX_RTT |
355 | RTAX_RTTVAR, | 355 | RTAX_RTTVAR, |
356 | #define RTAX_RTTVAR RTAX_RTTVAR | 356 | #define RTAX_RTTVAR RTAX_RTTVAR |
357 | RTAX_SSTHRESH, | 357 | RTAX_SSTHRESH, |
358 | #define RTAX_SSTHRESH RTAX_SSTHRESH | 358 | #define RTAX_SSTHRESH RTAX_SSTHRESH |
359 | RTAX_CWND, | 359 | RTAX_CWND, |
360 | #define RTAX_CWND RTAX_CWND | 360 | #define RTAX_CWND RTAX_CWND |
361 | RTAX_ADVMSS, | 361 | RTAX_ADVMSS, |
362 | #define RTAX_ADVMSS RTAX_ADVMSS | 362 | #define RTAX_ADVMSS RTAX_ADVMSS |
363 | RTAX_REORDERING, | 363 | RTAX_REORDERING, |
364 | #define RTAX_REORDERING RTAX_REORDERING | 364 | #define RTAX_REORDERING RTAX_REORDERING |
365 | RTAX_HOPLIMIT, | 365 | RTAX_HOPLIMIT, |
366 | #define RTAX_HOPLIMIT RTAX_HOPLIMIT | 366 | #define RTAX_HOPLIMIT RTAX_HOPLIMIT |
367 | RTAX_INITCWND, | 367 | RTAX_INITCWND, |
368 | #define RTAX_INITCWND RTAX_INITCWND | 368 | #define RTAX_INITCWND RTAX_INITCWND |
369 | RTAX_FEATURES, | 369 | RTAX_FEATURES, |
370 | #define RTAX_FEATURES RTAX_FEATURES | 370 | #define RTAX_FEATURES RTAX_FEATURES |
371 | RTAX_RTO_MIN, | 371 | RTAX_RTO_MIN, |
372 | #define RTAX_RTO_MIN RTAX_RTO_MIN | 372 | #define RTAX_RTO_MIN RTAX_RTO_MIN |
373 | RTAX_INITRWND, | 373 | RTAX_INITRWND, |
374 | #define RTAX_INITRWND RTAX_INITRWND | 374 | #define RTAX_INITRWND RTAX_INITRWND |
375 | __RTAX_MAX | 375 | __RTAX_MAX |
376 | }; | 376 | }; |
377 | 377 | ||
378 | #define RTAX_MAX (__RTAX_MAX - 1) | 378 | #define RTAX_MAX (__RTAX_MAX - 1) |
379 | 379 | ||
380 | #define RTAX_FEATURE_ECN 0x00000001 | 380 | #define RTAX_FEATURE_ECN 0x00000001 |
381 | #define RTAX_FEATURE_SACK 0x00000002 | 381 | #define RTAX_FEATURE_SACK 0x00000002 |
382 | #define RTAX_FEATURE_TIMESTAMP 0x00000004 | 382 | #define RTAX_FEATURE_TIMESTAMP 0x00000004 |
383 | #define RTAX_FEATURE_ALLFRAG 0x00000008 | 383 | #define RTAX_FEATURE_ALLFRAG 0x00000008 |
384 | 384 | ||
385 | struct rta_session { | 385 | struct rta_session { |
386 | __u8 proto; | 386 | __u8 proto; |
387 | __u8 pad1; | 387 | __u8 pad1; |
388 | __u16 pad2; | 388 | __u16 pad2; |
389 | 389 | ||
390 | union { | 390 | union { |
391 | struct { | 391 | struct { |
392 | __u16 sport; | 392 | __u16 sport; |
393 | __u16 dport; | 393 | __u16 dport; |
394 | } ports; | 394 | } ports; |
395 | 395 | ||
396 | struct { | 396 | struct { |
397 | __u8 type; | 397 | __u8 type; |
398 | __u8 code; | 398 | __u8 code; |
399 | __u16 ident; | 399 | __u16 ident; |
400 | } icmpt; | 400 | } icmpt; |
401 | 401 | ||
402 | __u32 spi; | 402 | __u32 spi; |
403 | } u; | 403 | } u; |
404 | }; | 404 | }; |
405 | 405 | ||
406 | /**** | 406 | /**** |
407 | * General form of address family dependent message. | 407 | * General form of address family dependent message. |
408 | ****/ | 408 | ****/ |
409 | 409 | ||
410 | struct rtgenmsg { | 410 | struct rtgenmsg { |
411 | unsigned char rtgen_family; | 411 | unsigned char rtgen_family; |
412 | }; | 412 | }; |
413 | 413 | ||
414 | /***************************************************************** | 414 | /***************************************************************** |
415 | * Link layer specific messages. | 415 | * Link layer specific messages. |
416 | ****/ | 416 | ****/ |
417 | 417 | ||
418 | /* struct ifinfomsg | 418 | /* struct ifinfomsg |
419 | * passes link level specific information, not dependent | 419 | * passes link level specific information, not dependent |
420 | * on network protocol. | 420 | * on network protocol. |
421 | */ | 421 | */ |
422 | 422 | ||
423 | struct ifinfomsg { | 423 | struct ifinfomsg { |
424 | unsigned char ifi_family; | 424 | unsigned char ifi_family; |
425 | unsigned char __ifi_pad; | 425 | unsigned char __ifi_pad; |
426 | unsigned short ifi_type; /* ARPHRD_* */ | 426 | unsigned short ifi_type; /* ARPHRD_* */ |
427 | int ifi_index; /* Link index */ | 427 | int ifi_index; /* Link index */ |
428 | unsigned ifi_flags; /* IFF_* flags */ | 428 | unsigned ifi_flags; /* IFF_* flags */ |
429 | unsigned ifi_change; /* IFF_* change mask */ | 429 | unsigned ifi_change; /* IFF_* change mask */ |
430 | }; | 430 | }; |
431 | 431 | ||
432 | /******************************************************************** | 432 | /******************************************************************** |
433 | * prefix information | 433 | * prefix information |
434 | ****/ | 434 | ****/ |
435 | 435 | ||
436 | struct prefixmsg { | 436 | struct prefixmsg { |
437 | unsigned char prefix_family; | 437 | unsigned char prefix_family; |
438 | unsigned char prefix_pad1; | 438 | unsigned char prefix_pad1; |
439 | unsigned short prefix_pad2; | 439 | unsigned short prefix_pad2; |
440 | int prefix_ifindex; | 440 | int prefix_ifindex; |
441 | unsigned char prefix_type; | 441 | unsigned char prefix_type; |
442 | unsigned char prefix_len; | 442 | unsigned char prefix_len; |
443 | unsigned char prefix_flags; | 443 | unsigned char prefix_flags; |
444 | unsigned char prefix_pad3; | 444 | unsigned char prefix_pad3; |
445 | }; | 445 | }; |
446 | 446 | ||
447 | enum | 447 | enum |
448 | { | 448 | { |
449 | PREFIX_UNSPEC, | 449 | PREFIX_UNSPEC, |
450 | PREFIX_ADDRESS, | 450 | PREFIX_ADDRESS, |
451 | PREFIX_CACHEINFO, | 451 | PREFIX_CACHEINFO, |
452 | __PREFIX_MAX | 452 | __PREFIX_MAX |
453 | }; | 453 | }; |
454 | 454 | ||
455 | #define PREFIX_MAX (__PREFIX_MAX - 1) | 455 | #define PREFIX_MAX (__PREFIX_MAX - 1) |
456 | 456 | ||
457 | struct prefix_cacheinfo { | 457 | struct prefix_cacheinfo { |
458 | __u32 preferred_time; | 458 | __u32 preferred_time; |
459 | __u32 valid_time; | 459 | __u32 valid_time; |
460 | }; | 460 | }; |
461 | 461 | ||
462 | 462 | ||
463 | /***************************************************************** | 463 | /***************************************************************** |
464 | * Traffic control messages. | 464 | * Traffic control messages. |
465 | ****/ | 465 | ****/ |
466 | 466 | ||
467 | struct tcmsg { | 467 | struct tcmsg { |
468 | unsigned char tcm_family; | 468 | unsigned char tcm_family; |
469 | unsigned char tcm__pad1; | 469 | unsigned char tcm__pad1; |
470 | unsigned short tcm__pad2; | 470 | unsigned short tcm__pad2; |
471 | int tcm_ifindex; | 471 | int tcm_ifindex; |
472 | __u32 tcm_handle; | 472 | __u32 tcm_handle; |
473 | __u32 tcm_parent; | 473 | __u32 tcm_parent; |
474 | __u32 tcm_info; | 474 | __u32 tcm_info; |
475 | }; | 475 | }; |
476 | 476 | ||
477 | enum { | 477 | enum { |
478 | TCA_UNSPEC, | 478 | TCA_UNSPEC, |
479 | TCA_KIND, | 479 | TCA_KIND, |
480 | TCA_OPTIONS, | 480 | TCA_OPTIONS, |
481 | TCA_STATS, | 481 | TCA_STATS, |
482 | TCA_XSTATS, | 482 | TCA_XSTATS, |
483 | TCA_RATE, | 483 | TCA_RATE, |
484 | TCA_FCNT, | 484 | TCA_FCNT, |
485 | TCA_STATS2, | 485 | TCA_STATS2, |
486 | TCA_STAB, | 486 | TCA_STAB, |
487 | __TCA_MAX | 487 | __TCA_MAX |
488 | }; | 488 | }; |
489 | 489 | ||
490 | #define TCA_MAX (__TCA_MAX - 1) | 490 | #define TCA_MAX (__TCA_MAX - 1) |
491 | 491 | ||
492 | #define TCA_RTA(r) ((struct rtattr*)(((char*)(r)) + NLMSG_ALIGN(sizeof(struct tcmsg)))) | 492 | #define TCA_RTA(r) ((struct rtattr*)(((char*)(r)) + NLMSG_ALIGN(sizeof(struct tcmsg)))) |
493 | #define TCA_PAYLOAD(n) NLMSG_PAYLOAD(n,sizeof(struct tcmsg)) | 493 | #define TCA_PAYLOAD(n) NLMSG_PAYLOAD(n,sizeof(struct tcmsg)) |
494 | 494 | ||
495 | /******************************************************************** | 495 | /******************************************************************** |
496 | * Neighbor Discovery userland options | 496 | * Neighbor Discovery userland options |
497 | ****/ | 497 | ****/ |
498 | 498 | ||
499 | struct nduseroptmsg { | 499 | struct nduseroptmsg { |
500 | unsigned char nduseropt_family; | 500 | unsigned char nduseropt_family; |
501 | unsigned char nduseropt_pad1; | 501 | unsigned char nduseropt_pad1; |
502 | unsigned short nduseropt_opts_len; /* Total length of options */ | 502 | unsigned short nduseropt_opts_len; /* Total length of options */ |
503 | int nduseropt_ifindex; | 503 | int nduseropt_ifindex; |
504 | __u8 nduseropt_icmp_type; | 504 | __u8 nduseropt_icmp_type; |
505 | __u8 nduseropt_icmp_code; | 505 | __u8 nduseropt_icmp_code; |
506 | unsigned short nduseropt_pad2; | 506 | unsigned short nduseropt_pad2; |
507 | unsigned int nduseropt_pad3; | 507 | unsigned int nduseropt_pad3; |
508 | /* Followed by one or more ND options */ | 508 | /* Followed by one or more ND options */ |
509 | }; | 509 | }; |
510 | 510 | ||
511 | enum { | 511 | enum { |
512 | NDUSEROPT_UNSPEC, | 512 | NDUSEROPT_UNSPEC, |
513 | NDUSEROPT_SRCADDR, | 513 | NDUSEROPT_SRCADDR, |
514 | __NDUSEROPT_MAX | 514 | __NDUSEROPT_MAX |
515 | }; | 515 | }; |
516 | 516 | ||
517 | #define NDUSEROPT_MAX (__NDUSEROPT_MAX - 1) | 517 | #define NDUSEROPT_MAX (__NDUSEROPT_MAX - 1) |
518 | 518 | ||
519 | #ifndef __KERNEL__ | 519 | #ifndef __KERNEL__ |
520 | /* RTnetlink multicast groups - backwards compatibility for userspace */ | 520 | /* RTnetlink multicast groups - backwards compatibility for userspace */ |
521 | #define RTMGRP_LINK 1 | 521 | #define RTMGRP_LINK 1 |
522 | #define RTMGRP_NOTIFY 2 | 522 | #define RTMGRP_NOTIFY 2 |
523 | #define RTMGRP_NEIGH 4 | 523 | #define RTMGRP_NEIGH 4 |
524 | #define RTMGRP_TC 8 | 524 | #define RTMGRP_TC 8 |
525 | 525 | ||
526 | #define RTMGRP_IPV4_IFADDR 0x10 | 526 | #define RTMGRP_IPV4_IFADDR 0x10 |
527 | #define RTMGRP_IPV4_MROUTE 0x20 | 527 | #define RTMGRP_IPV4_MROUTE 0x20 |
528 | #define RTMGRP_IPV4_ROUTE 0x40 | 528 | #define RTMGRP_IPV4_ROUTE 0x40 |
529 | #define RTMGRP_IPV4_RULE 0x80 | 529 | #define RTMGRP_IPV4_RULE 0x80 |
530 | 530 | ||
531 | #define RTMGRP_IPV6_IFADDR 0x100 | 531 | #define RTMGRP_IPV6_IFADDR 0x100 |
532 | #define RTMGRP_IPV6_MROUTE 0x200 | 532 | #define RTMGRP_IPV6_MROUTE 0x200 |
533 | #define RTMGRP_IPV6_ROUTE 0x400 | 533 | #define RTMGRP_IPV6_ROUTE 0x400 |
534 | #define RTMGRP_IPV6_IFINFO 0x800 | 534 | #define RTMGRP_IPV6_IFINFO 0x800 |
535 | 535 | ||
536 | #define RTMGRP_DECnet_IFADDR 0x1000 | 536 | #define RTMGRP_DECnet_IFADDR 0x1000 |
537 | #define RTMGRP_DECnet_ROUTE 0x4000 | 537 | #define RTMGRP_DECnet_ROUTE 0x4000 |
538 | 538 | ||
539 | #define RTMGRP_IPV6_PREFIX 0x20000 | 539 | #define RTMGRP_IPV6_PREFIX 0x20000 |
540 | #endif | 540 | #endif |
541 | 541 | ||
542 | /* RTnetlink multicast groups */ | 542 | /* RTnetlink multicast groups */ |
543 | enum rtnetlink_groups { | 543 | enum rtnetlink_groups { |
544 | RTNLGRP_NONE, | 544 | RTNLGRP_NONE, |
545 | #define RTNLGRP_NONE RTNLGRP_NONE | 545 | #define RTNLGRP_NONE RTNLGRP_NONE |
546 | RTNLGRP_LINK, | 546 | RTNLGRP_LINK, |
547 | #define RTNLGRP_LINK RTNLGRP_LINK | 547 | #define RTNLGRP_LINK RTNLGRP_LINK |
548 | RTNLGRP_NOTIFY, | 548 | RTNLGRP_NOTIFY, |
549 | #define RTNLGRP_NOTIFY RTNLGRP_NOTIFY | 549 | #define RTNLGRP_NOTIFY RTNLGRP_NOTIFY |
550 | RTNLGRP_NEIGH, | 550 | RTNLGRP_NEIGH, |
551 | #define RTNLGRP_NEIGH RTNLGRP_NEIGH | 551 | #define RTNLGRP_NEIGH RTNLGRP_NEIGH |
552 | RTNLGRP_TC, | 552 | RTNLGRP_TC, |
553 | #define RTNLGRP_TC RTNLGRP_TC | 553 | #define RTNLGRP_TC RTNLGRP_TC |
554 | RTNLGRP_IPV4_IFADDR, | 554 | RTNLGRP_IPV4_IFADDR, |
555 | #define RTNLGRP_IPV4_IFADDR RTNLGRP_IPV4_IFADDR | 555 | #define RTNLGRP_IPV4_IFADDR RTNLGRP_IPV4_IFADDR |
556 | RTNLGRP_IPV4_MROUTE, | 556 | RTNLGRP_IPV4_MROUTE, |
557 | #define RTNLGRP_IPV4_MROUTE RTNLGRP_IPV4_MROUTE | 557 | #define RTNLGRP_IPV4_MROUTE RTNLGRP_IPV4_MROUTE |
558 | RTNLGRP_IPV4_ROUTE, | 558 | RTNLGRP_IPV4_ROUTE, |
559 | #define RTNLGRP_IPV4_ROUTE RTNLGRP_IPV4_ROUTE | 559 | #define RTNLGRP_IPV4_ROUTE RTNLGRP_IPV4_ROUTE |
560 | RTNLGRP_IPV4_RULE, | 560 | RTNLGRP_IPV4_RULE, |
561 | #define RTNLGRP_IPV4_RULE RTNLGRP_IPV4_RULE | 561 | #define RTNLGRP_IPV4_RULE RTNLGRP_IPV4_RULE |
562 | RTNLGRP_IPV6_IFADDR, | 562 | RTNLGRP_IPV6_IFADDR, |
563 | #define RTNLGRP_IPV6_IFADDR RTNLGRP_IPV6_IFADDR | 563 | #define RTNLGRP_IPV6_IFADDR RTNLGRP_IPV6_IFADDR |
564 | RTNLGRP_IPV6_MROUTE, | 564 | RTNLGRP_IPV6_MROUTE, |
565 | #define RTNLGRP_IPV6_MROUTE RTNLGRP_IPV6_MROUTE | 565 | #define RTNLGRP_IPV6_MROUTE RTNLGRP_IPV6_MROUTE |
566 | RTNLGRP_IPV6_ROUTE, | 566 | RTNLGRP_IPV6_ROUTE, |
567 | #define RTNLGRP_IPV6_ROUTE RTNLGRP_IPV6_ROUTE | 567 | #define RTNLGRP_IPV6_ROUTE RTNLGRP_IPV6_ROUTE |
568 | RTNLGRP_IPV6_IFINFO, | 568 | RTNLGRP_IPV6_IFINFO, |
569 | #define RTNLGRP_IPV6_IFINFO RTNLGRP_IPV6_IFINFO | 569 | #define RTNLGRP_IPV6_IFINFO RTNLGRP_IPV6_IFINFO |
570 | RTNLGRP_DECnet_IFADDR, | 570 | RTNLGRP_DECnet_IFADDR, |
571 | #define RTNLGRP_DECnet_IFADDR RTNLGRP_DECnet_IFADDR | 571 | #define RTNLGRP_DECnet_IFADDR RTNLGRP_DECnet_IFADDR |
572 | RTNLGRP_NOP2, | 572 | RTNLGRP_NOP2, |
573 | RTNLGRP_DECnet_ROUTE, | 573 | RTNLGRP_DECnet_ROUTE, |
574 | #define RTNLGRP_DECnet_ROUTE RTNLGRP_DECnet_ROUTE | 574 | #define RTNLGRP_DECnet_ROUTE RTNLGRP_DECnet_ROUTE |
575 | RTNLGRP_DECnet_RULE, | 575 | RTNLGRP_DECnet_RULE, |
576 | #define RTNLGRP_DECnet_RULE RTNLGRP_DECnet_RULE | 576 | #define RTNLGRP_DECnet_RULE RTNLGRP_DECnet_RULE |
577 | RTNLGRP_NOP4, | 577 | RTNLGRP_NOP4, |
578 | RTNLGRP_IPV6_PREFIX, | 578 | RTNLGRP_IPV6_PREFIX, |
579 | #define RTNLGRP_IPV6_PREFIX RTNLGRP_IPV6_PREFIX | 579 | #define RTNLGRP_IPV6_PREFIX RTNLGRP_IPV6_PREFIX |
580 | RTNLGRP_IPV6_RULE, | 580 | RTNLGRP_IPV6_RULE, |
581 | #define RTNLGRP_IPV6_RULE RTNLGRP_IPV6_RULE | 581 | #define RTNLGRP_IPV6_RULE RTNLGRP_IPV6_RULE |
582 | RTNLGRP_ND_USEROPT, | 582 | RTNLGRP_ND_USEROPT, |
583 | #define RTNLGRP_ND_USEROPT RTNLGRP_ND_USEROPT | 583 | #define RTNLGRP_ND_USEROPT RTNLGRP_ND_USEROPT |
584 | RTNLGRP_PHONET_IFADDR, | 584 | RTNLGRP_PHONET_IFADDR, |
585 | #define RTNLGRP_PHONET_IFADDR RTNLGRP_PHONET_IFADDR | 585 | #define RTNLGRP_PHONET_IFADDR RTNLGRP_PHONET_IFADDR |
586 | RTNLGRP_PHONET_ROUTE, | 586 | RTNLGRP_PHONET_ROUTE, |
587 | #define RTNLGRP_PHONET_ROUTE RTNLGRP_PHONET_ROUTE | 587 | #define RTNLGRP_PHONET_ROUTE RTNLGRP_PHONET_ROUTE |
588 | __RTNLGRP_MAX | 588 | __RTNLGRP_MAX |
589 | }; | 589 | }; |
590 | #define RTNLGRP_MAX (__RTNLGRP_MAX - 1) | 590 | #define RTNLGRP_MAX (__RTNLGRP_MAX - 1) |
591 | 591 | ||
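The legacy RTMGRP_* defines above are a 32-bit bitmask passed in sockaddr_nl.nl_groups at bind() time, while the RTNLGRP_* enum values are group numbers joined via NETLINK_ADD_MEMBERSHIP. A hedged userspace sketch of both styles follows; example_join_link_group() is illustrative only, and SOL_NETLINK is defined locally because older libc headers may not provide it.

#include <string.h>
#include <sys/socket.h>
#include <linux/netlink.h>
#include <linux/rtnetlink.h>

#ifndef SOL_NETLINK
#define SOL_NETLINK 270			/* not in older libc headers */
#endif

/* Illustrative only: subscribe to link notifications both ways. */
static int example_join_link_group(int fd)
{
	struct sockaddr_nl snl;
	unsigned int grp = RTNLGRP_LINK;

	memset(&snl, 0, sizeof(snl));
	snl.nl_family = AF_NETLINK;
	snl.nl_groups = RTMGRP_LINK;	/* legacy bitmask, bits 0..31 only */
	if (bind(fd, (struct sockaddr *)&snl, sizeof(snl)) < 0)
		return -1;
	/* modern equivalent; also works for groups beyond bit 31 */
	return setsockopt(fd, SOL_NETLINK, NETLINK_ADD_MEMBERSHIP,
			  &grp, sizeof(grp));
}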
592 | /* TC action piece */ | 592 | /* TC action piece */ |
593 | struct tcamsg { | 593 | struct tcamsg { |
594 | unsigned char tca_family; | 594 | unsigned char tca_family; |
595 | unsigned char tca__pad1; | 595 | unsigned char tca__pad1; |
596 | unsigned short tca__pad2; | 596 | unsigned short tca__pad2; |
597 | }; | 597 | }; |
598 | #define TA_RTA(r) ((struct rtattr*)(((char*)(r)) + NLMSG_ALIGN(sizeof(struct tcamsg)))) | 598 | #define TA_RTA(r) ((struct rtattr*)(((char*)(r)) + NLMSG_ALIGN(sizeof(struct tcamsg)))) |
599 | #define TA_PAYLOAD(n) NLMSG_PAYLOAD(n,sizeof(struct tcamsg)) | 599 | #define TA_PAYLOAD(n) NLMSG_PAYLOAD(n,sizeof(struct tcamsg)) |
600 | #define TCA_ACT_TAB 1 /* attr type must be >=1 */ | 600 | #define TCA_ACT_TAB 1 /* attr type must be >=1 */ |
601 | #define TCAA_MAX 1 | 601 | #define TCAA_MAX 1 |
602 | 602 | ||
603 | /* End of information exported to user level */ | 603 | /* End of information exported to user level */ |
604 | 604 | ||
605 | #ifdef __KERNEL__ | 605 | #ifdef __KERNEL__ |
606 | 606 | ||
607 | #include <linux/mutex.h> | 607 | #include <linux/mutex.h> |
608 | #include <linux/netdevice.h> | 608 | #include <linux/netdevice.h> |
609 | 609 | ||
610 | static __inline__ int rtattr_strcmp(const struct rtattr *rta, const char *str) | 610 | static __inline__ int rtattr_strcmp(const struct rtattr *rta, const char *str) |
611 | { | 611 | { |
612 | int len = strlen(str) + 1; | 612 | int len = strlen(str) + 1; |
613 | return len > rta->rta_len || memcmp(RTA_DATA(rta), str, len); | 613 | return len > rta->rta_len || memcmp(RTA_DATA(rta), str, len); |
614 | } | 614 | } |
615 | 615 | ||
616 | extern int rtnetlink_send(struct sk_buff *skb, struct net *net, u32 pid, u32 group, int echo); | 616 | extern int rtnetlink_send(struct sk_buff *skb, struct net *net, u32 pid, u32 group, int echo); |
617 | extern int rtnl_unicast(struct sk_buff *skb, struct net *net, u32 pid); | 617 | extern int rtnl_unicast(struct sk_buff *skb, struct net *net, u32 pid); |
618 | extern void rtnl_notify(struct sk_buff *skb, struct net *net, u32 pid, | 618 | extern void rtnl_notify(struct sk_buff *skb, struct net *net, u32 pid, |
619 | u32 group, struct nlmsghdr *nlh, gfp_t flags); | 619 | u32 group, struct nlmsghdr *nlh, gfp_t flags); |
620 | extern void rtnl_set_sk_err(struct net *net, u32 group, int error); | 620 | extern void rtnl_set_sk_err(struct net *net, u32 group, int error); |
621 | extern int rtnetlink_put_metrics(struct sk_buff *skb, u32 *metrics); | 621 | extern int rtnetlink_put_metrics(struct sk_buff *skb, u32 *metrics); |
622 | extern int rtnl_put_cacheinfo(struct sk_buff *skb, struct dst_entry *dst, | 622 | extern int rtnl_put_cacheinfo(struct sk_buff *skb, struct dst_entry *dst, |
623 | u32 id, u32 ts, u32 tsage, long expires, | 623 | u32 id, u32 ts, u32 tsage, long expires, |
624 | u32 error); | 624 | u32 error); |
625 | 625 | ||
626 | extern void __rta_fill(struct sk_buff *skb, int attrtype, int attrlen, const void *data); | 626 | extern void __rta_fill(struct sk_buff *skb, int attrtype, int attrlen, const void *data); |
627 | 627 | ||
628 | #define RTA_PUT(skb, attrtype, attrlen, data) \ | 628 | #define RTA_PUT(skb, attrtype, attrlen, data) \ |
629 | ({ if (unlikely(skb_tailroom(skb) < (int)RTA_SPACE(attrlen))) \ | 629 | ({ if (unlikely(skb_tailroom(skb) < (int)RTA_SPACE(attrlen))) \ |
630 | goto rtattr_failure; \ | 630 | goto rtattr_failure; \ |
631 | __rta_fill(skb, attrtype, attrlen, data); }) | 631 | __rta_fill(skb, attrtype, attrlen, data); }) |
632 | 632 | ||
633 | #define RTA_APPEND(skb, attrlen, data) \ | 633 | #define RTA_APPEND(skb, attrlen, data) \ |
634 | ({ if (unlikely(skb_tailroom(skb) < (int)(attrlen))) \ | 634 | ({ if (unlikely(skb_tailroom(skb) < (int)(attrlen))) \ |
635 | goto rtattr_failure; \ | 635 | goto rtattr_failure; \ |
636 | memcpy(skb_put(skb, attrlen), data, attrlen); }) | 636 | memcpy(skb_put(skb, attrlen), data, attrlen); }) |
637 | 637 | ||
638 | #define RTA_PUT_NOHDR(skb, attrlen, data) \ | 638 | #define RTA_PUT_NOHDR(skb, attrlen, data) \ |
639 | ({ RTA_APPEND(skb, RTA_ALIGN(attrlen), data); \ | 639 | ({ RTA_APPEND(skb, RTA_ALIGN(attrlen), data); \ |
640 | memset(skb_tail_pointer(skb) - (RTA_ALIGN(attrlen) - attrlen), 0, \ | 640 | memset(skb_tail_pointer(skb) - (RTA_ALIGN(attrlen) - attrlen), 0, \ |
641 | RTA_ALIGN(attrlen) - attrlen); }) | 641 | RTA_ALIGN(attrlen) - attrlen); }) |
642 | 642 | ||
643 | #define RTA_PUT_U8(skb, attrtype, value) \ | 643 | #define RTA_PUT_U8(skb, attrtype, value) \ |
644 | ({ u8 _tmp = (value); \ | 644 | ({ u8 _tmp = (value); \ |
645 | RTA_PUT(skb, attrtype, sizeof(u8), &_tmp); }) | 645 | RTA_PUT(skb, attrtype, sizeof(u8), &_tmp); }) |
646 | 646 | ||
647 | #define RTA_PUT_U16(skb, attrtype, value) \ | 647 | #define RTA_PUT_U16(skb, attrtype, value) \ |
648 | ({ u16 _tmp = (value); \ | 648 | ({ u16 _tmp = (value); \ |
649 | RTA_PUT(skb, attrtype, sizeof(u16), &_tmp); }) | 649 | RTA_PUT(skb, attrtype, sizeof(u16), &_tmp); }) |
650 | 650 | ||
651 | #define RTA_PUT_U32(skb, attrtype, value) \ | 651 | #define RTA_PUT_U32(skb, attrtype, value) \ |
652 | ({ u32 _tmp = (value); \ | 652 | ({ u32 _tmp = (value); \ |
653 | RTA_PUT(skb, attrtype, sizeof(u32), &_tmp); }) | 653 | RTA_PUT(skb, attrtype, sizeof(u32), &_tmp); }) |
654 | 654 | ||
655 | #define RTA_PUT_U64(skb, attrtype, value) \ | 655 | #define RTA_PUT_U64(skb, attrtype, value) \ |
656 | ({ u64 _tmp = (value); \ | 656 | ({ u64 _tmp = (value); \ |
657 | RTA_PUT(skb, attrtype, sizeof(u64), &_tmp); }) | 657 | RTA_PUT(skb, attrtype, sizeof(u64), &_tmp); }) |
658 | 658 | ||
659 | #define RTA_PUT_SECS(skb, attrtype, value) \ | 659 | #define RTA_PUT_SECS(skb, attrtype, value) \ |
660 | RTA_PUT_U64(skb, attrtype, (value) / HZ) | 660 | RTA_PUT_U64(skb, attrtype, (value) / HZ) |
661 | 661 | ||
662 | #define RTA_PUT_MSECS(skb, attrtype, value) \ | 662 | #define RTA_PUT_MSECS(skb, attrtype, value) \ |
663 | RTA_PUT_U64(skb, attrtype, jiffies_to_msecs(value)) | 663 | RTA_PUT_U64(skb, attrtype, jiffies_to_msecs(value)) |
664 | 664 | ||
665 | #define RTA_PUT_STRING(skb, attrtype, value) \ | 665 | #define RTA_PUT_STRING(skb, attrtype, value) \ |
666 | RTA_PUT(skb, attrtype, strlen(value) + 1, value) | 666 | RTA_PUT(skb, attrtype, strlen(value) + 1, value) |
667 | 667 | ||
668 | #define RTA_PUT_FLAG(skb, attrtype) \ | 668 | #define RTA_PUT_FLAG(skb, attrtype) \ |
669 | RTA_PUT(skb, attrtype, 0, NULL); | 669 | RTA_PUT(skb, attrtype, 0, NULL); |
670 | 670 | ||
671 | #define RTA_NEST(skb, type) \ | 671 | #define RTA_NEST(skb, type) \ |
672 | ({ struct rtattr *__start = (struct rtattr *)skb_tail_pointer(skb); \ | 672 | ({ struct rtattr *__start = (struct rtattr *)skb_tail_pointer(skb); \ |
673 | RTA_PUT(skb, type, 0, NULL); \ | 673 | RTA_PUT(skb, type, 0, NULL); \ |
674 | __start; }) | 674 | __start; }) |
675 | 675 | ||
676 | #define RTA_NEST_END(skb, start) \ | 676 | #define RTA_NEST_END(skb, start) \ |
677 | ({ (start)->rta_len = skb_tail_pointer(skb) - (unsigned char *)(start); \ | 677 | ({ (start)->rta_len = skb_tail_pointer(skb) - (unsigned char *)(start); \ |
678 | (skb)->len; }) | 678 | (skb)->len; }) |
679 | 679 | ||
680 | #define RTA_NEST_COMPAT(skb, type, attrlen, data) \ | 680 | #define RTA_NEST_COMPAT(skb, type, attrlen, data) \ |
681 | ({ struct rtattr *__start = (struct rtattr *)skb_tail_pointer(skb); \ | 681 | ({ struct rtattr *__start = (struct rtattr *)skb_tail_pointer(skb); \ |
682 | RTA_PUT(skb, type, attrlen, data); \ | 682 | RTA_PUT(skb, type, attrlen, data); \ |
683 | RTA_NEST(skb, type); \ | 683 | RTA_NEST(skb, type); \ |
684 | __start; }) | 684 | __start; }) |
685 | 685 | ||
686 | #define RTA_NEST_COMPAT_END(skb, start) \ | 686 | #define RTA_NEST_COMPAT_END(skb, start) \ |
687 | ({ struct rtattr *__nest = (void *)(start) + NLMSG_ALIGN((start)->rta_len); \ | 687 | ({ struct rtattr *__nest = (void *)(start) + NLMSG_ALIGN((start)->rta_len); \ |
688 | (start)->rta_len = skb_tail_pointer(skb) - (unsigned char *)(start); \ | 688 | (start)->rta_len = skb_tail_pointer(skb) - (unsigned char *)(start); \ |
689 | RTA_NEST_END(skb, __nest); \ | 689 | RTA_NEST_END(skb, __nest); \ |
690 | (skb)->len; }) | 690 | (skb)->len; }) |
691 | 691 | ||
692 | #define RTA_NEST_CANCEL(skb, start) \ | 692 | #define RTA_NEST_CANCEL(skb, start) \ |
693 | ({ if (start) \ | 693 | ({ if (start) \ |
694 | skb_trim(skb, (unsigned char *) (start) - (skb)->data); \ | 694 | skb_trim(skb, (unsigned char *) (start) - (skb)->data); \ |
695 | -1; }) | 695 | -1; }) |
696 | 696 | ||
697 | #define RTA_GET_U8(rta) \ | 697 | #define RTA_GET_U8(rta) \ |
698 | ({ if (!rta || RTA_PAYLOAD(rta) < sizeof(u8)) \ | 698 | ({ if (!rta || RTA_PAYLOAD(rta) < sizeof(u8)) \ |
699 | goto rtattr_failure; \ | 699 | goto rtattr_failure; \ |
700 | *(u8 *) RTA_DATA(rta); }) | 700 | *(u8 *) RTA_DATA(rta); }) |
701 | 701 | ||
702 | #define RTA_GET_U16(rta) \ | 702 | #define RTA_GET_U16(rta) \ |
703 | ({ if (!rta || RTA_PAYLOAD(rta) < sizeof(u16)) \ | 703 | ({ if (!rta || RTA_PAYLOAD(rta) < sizeof(u16)) \ |
704 | goto rtattr_failure; \ | 704 | goto rtattr_failure; \ |
705 | *(u16 *) RTA_DATA(rta); }) | 705 | *(u16 *) RTA_DATA(rta); }) |
706 | 706 | ||
707 | #define RTA_GET_U32(rta) \ | 707 | #define RTA_GET_U32(rta) \ |
708 | ({ if (!rta || RTA_PAYLOAD(rta) < sizeof(u32)) \ | 708 | ({ if (!rta || RTA_PAYLOAD(rta) < sizeof(u32)) \ |
709 | goto rtattr_failure; \ | 709 | goto rtattr_failure; \ |
710 | *(u32 *) RTA_DATA(rta); }) | 710 | *(u32 *) RTA_DATA(rta); }) |
711 | 711 | ||
712 | #define RTA_GET_U64(rta) \ | 712 | #define RTA_GET_U64(rta) \ |
713 | ({ u64 _tmp; \ | 713 | ({ u64 _tmp; \ |
714 | if (!rta || RTA_PAYLOAD(rta) < sizeof(u64)) \ | 714 | if (!rta || RTA_PAYLOAD(rta) < sizeof(u64)) \ |
715 | goto rtattr_failure; \ | 715 | goto rtattr_failure; \ |
716 | memcpy(&_tmp, RTA_DATA(rta), sizeof(_tmp)); \ | 716 | memcpy(&_tmp, RTA_DATA(rta), sizeof(_tmp)); \ |
717 | _tmp; }) | 717 | _tmp; }) |
718 | 718 | ||
719 | #define RTA_GET_FLAG(rta) (!!(rta)) | 719 | #define RTA_GET_FLAG(rta) (!!(rta)) |
720 | 720 | ||
721 | #define RTA_GET_SECS(rta) ((unsigned long) RTA_GET_U64(rta) * HZ) | 721 | #define RTA_GET_SECS(rta) ((unsigned long) RTA_GET_U64(rta) * HZ) |
722 | #define RTA_GET_MSECS(rta) (msecs_to_jiffies((unsigned long) RTA_GET_U64(rta))) | 722 | #define RTA_GET_MSECS(rta) (msecs_to_jiffies((unsigned long) RTA_GET_U64(rta))) |
723 | 723 | ||
724 | static inline struct rtattr * | 724 | static inline struct rtattr * |
725 | __rta_reserve(struct sk_buff *skb, int attrtype, int attrlen) | 725 | __rta_reserve(struct sk_buff *skb, int attrtype, int attrlen) |
726 | { | 726 | { |
727 | struct rtattr *rta; | 727 | struct rtattr *rta; |
728 | int size = RTA_LENGTH(attrlen); | 728 | int size = RTA_LENGTH(attrlen); |
729 | 729 | ||
730 | rta = (struct rtattr*)skb_put(skb, RTA_ALIGN(size)); | 730 | rta = (struct rtattr*)skb_put(skb, RTA_ALIGN(size)); |
731 | rta->rta_type = attrtype; | 731 | rta->rta_type = attrtype; |
732 | rta->rta_len = size; | 732 | rta->rta_len = size; |
733 | memset(RTA_DATA(rta) + attrlen, 0, RTA_ALIGN(size) - size); | 733 | memset(RTA_DATA(rta) + attrlen, 0, RTA_ALIGN(size) - size); |
734 | return rta; | 734 | return rta; |
735 | } | 735 | } |
736 | 736 | ||
737 | #define __RTA_PUT(skb, attrtype, attrlen) \ | 737 | #define __RTA_PUT(skb, attrtype, attrlen) \ |
738 | ({ if (unlikely(skb_tailroom(skb) < (int)RTA_SPACE(attrlen))) \ | 738 | ({ if (unlikely(skb_tailroom(skb) < (int)RTA_SPACE(attrlen))) \ |
739 | goto rtattr_failure; \ | 739 | goto rtattr_failure; \ |
740 | __rta_reserve(skb, attrtype, attrlen); }) | 740 | __rta_reserve(skb, attrtype, attrlen); }) |
741 | 741 | ||
742 | extern void rtmsg_ifinfo(int type, struct net_device *dev, unsigned change); | 742 | extern void rtmsg_ifinfo(int type, struct net_device *dev, unsigned change); |
743 | 743 | ||
744 | /* RTNL is used as a global lock for all changes to network configuration */ | 744 | /* RTNL is used as a global lock for all changes to network configuration */ |
745 | extern void rtnl_lock(void); | 745 | extern void rtnl_lock(void); |
746 | extern void rtnl_unlock(void); | 746 | extern void rtnl_unlock(void); |
747 | extern int rtnl_trylock(void); | 747 | extern int rtnl_trylock(void); |
748 | extern int rtnl_is_locked(void); | 748 | extern int rtnl_is_locked(void); |
749 | #ifdef CONFIG_PROVE_LOCKING | 749 | #ifdef CONFIG_PROVE_LOCKING |
750 | extern int lockdep_rtnl_is_held(void); | 750 | extern int lockdep_rtnl_is_held(void); |
751 | #endif /* #ifdef CONFIG_PROVE_LOCKING */ | 751 | #endif /* #ifdef CONFIG_PROVE_LOCKING */ |
752 | 752 | ||
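To make the "RTNL as a global lock" comment above concrete, here is a hedged kernel-side sketch of the usual pattern for a configuration change. example_set_mtu() is hypothetical and not part of this header; __dev_get_by_name() and dev_set_mtu() are existing helpers that expect RTNL to be held by the caller.

/* Hypothetical helper: change an interface's MTU under RTNL. */
static int example_set_mtu(struct net *net, const char *name, int new_mtu)
{
	struct net_device *dev;
	int err = -ENODEV;

	rtnl_lock();
	dev = __dev_get_by_name(net, name);	/* RTNL protects the lookup */
	if (dev)
		err = dev_set_mtu(dev, new_mtu);
	rtnl_unlock();
	return err;
}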
753 | /** | 753 | /** |
754 | * rcu_dereference_rtnl - rcu_dereference with debug checking | 754 | * rcu_dereference_rtnl - rcu_dereference with debug checking |
755 | * @p: The pointer to read, prior to dereferencing | 755 | * @p: The pointer to read, prior to dereferencing |
756 | * | 756 | * |
757 | * Do an rcu_dereference(p), but check caller either holds rcu_read_lock() | 757 | * Do an rcu_dereference(p), but check caller either holds rcu_read_lock() |
758 | * or RTNL. Note : Please prefer rtnl_dereference() or rcu_dereference() | 758 | * or RTNL. Note : Please prefer rtnl_dereference() or rcu_dereference() |
759 | */ | 759 | */ |
760 | #define rcu_dereference_rtnl(p) \ | 760 | #define rcu_dereference_rtnl(p) \ |
761 | rcu_dereference_check(p, rcu_read_lock_held() || \ | 761 | rcu_dereference_check(p, lockdep_rtnl_is_held()) |
762 | lockdep_rtnl_is_held()) | ||
763 | 762 | ||
764 | /** | 763 | /** |
765 | * rtnl_dereference - fetch RCU pointer when updates are prevented by RTNL | 764 | * rtnl_dereference - fetch RCU pointer when updates are prevented by RTNL |
766 | * @p: The pointer to read, prior to dereferencing | 765 | * @p: The pointer to read, prior to dereferencing |
767 | * | 766 | * |
768 | * Return the value of the specified RCU-protected pointer, but omit | 767 | * Return the value of the specified RCU-protected pointer, but omit |
769 | * both the smp_read_barrier_depends() and the ACCESS_ONCE(), because | 768 | * both the smp_read_barrier_depends() and the ACCESS_ONCE(), because |
770 | * caller holds RTNL. | 769 | * caller holds RTNL. |
771 | */ | 770 | */ |
772 | #define rtnl_dereference(p) \ | 771 | #define rtnl_dereference(p) \ |
773 | rcu_dereference_protected(p, lockdep_rtnl_is_held()) | 772 | rcu_dereference_protected(p, lockdep_rtnl_is_held()) |
774 | 773 | ||
775 | static inline struct netdev_queue *dev_ingress_queue(struct net_device *dev) | 774 | static inline struct netdev_queue *dev_ingress_queue(struct net_device *dev) |
776 | { | 775 | { |
777 | return rtnl_dereference(dev->ingress_queue); | 776 | return rtnl_dereference(dev->ingress_queue); |
778 | } | 777 | } |
779 | 778 | ||
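dev_ingress_queue() above is the RTNL-writer flavour; a reader-side sketch follows (example_has_ingress() is illustrative, not an in-tree helper). Either rcu_read_lock() or RTNL satisfies rcu_dereference_rtnl()'s lockdep condition, and as the change above shows, rcu_read_lock_held() no longer has to be spelled out because rcu_dereference_check() already folds it into the condition.

/* Sketch only: either rcu_read_lock() or RTNL satisfies the check. */
static bool example_has_ingress(struct net_device *dev)
{
	bool ret;

	rcu_read_lock();
	ret = rcu_dereference_rtnl(dev->ingress_queue) != NULL;
	rcu_read_unlock();
	return ret;
}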
780 | extern struct netdev_queue *dev_ingress_queue_create(struct net_device *dev); | 779 | extern struct netdev_queue *dev_ingress_queue_create(struct net_device *dev); |
781 | 780 | ||
782 | extern void rtnetlink_init(void); | 781 | extern void rtnetlink_init(void); |
783 | extern void __rtnl_unlock(void); | 782 | extern void __rtnl_unlock(void); |
784 | 783 | ||
785 | #define ASSERT_RTNL() do { \ | 784 | #define ASSERT_RTNL() do { \ |
786 | if (unlikely(!rtnl_is_locked())) { \ | 785 | if (unlikely(!rtnl_is_locked())) { \ |
787 | printk(KERN_ERR "RTNL: assertion failed at %s (%d)\n", \ | 786 | printk(KERN_ERR "RTNL: assertion failed at %s (%d)\n", \ |
788 | __FILE__, __LINE__); \ | 787 | __FILE__, __LINE__); \ |
789 | dump_stack(); \ | 788 | dump_stack(); \ |
790 | } \ | 789 | } \ |
791 | } while(0) | 790 | } while(0) |
792 | 791 | ||
793 | static inline u32 rtm_get_table(struct rtattr **rta, u8 table) | 792 | static inline u32 rtm_get_table(struct rtattr **rta, u8 table) |
794 | { | 793 | { |
795 | return RTA_GET_U32(rta[RTA_TABLE-1]); | 794 | return RTA_GET_U32(rta[RTA_TABLE-1]); |
796 | rtattr_failure: | 795 | rtattr_failure: |
797 | return table; | 796 | return table; |
798 | } | 797 | } |
799 | 798 | ||
800 | #endif /* __KERNEL__ */ | 799 | #endif /* __KERNEL__ */ |
801 | 800 | ||
802 | 801 | ||
803 | #endif /* __LINUX_RTNETLINK_H */ | 802 | #endif /* __LINUX_RTNETLINK_H */ |
804 | 803 |
include/net/sock.h
1 | /* | 1 | /* |
2 | * INET An implementation of the TCP/IP protocol suite for the LINUX | 2 | * INET An implementation of the TCP/IP protocol suite for the LINUX |
3 | * operating system. INET is implemented using the BSD Socket | 3 | * operating system. INET is implemented using the BSD Socket |
4 | * interface as the means of communication with the user level. | 4 | * interface as the means of communication with the user level. |
5 | * | 5 | * |
6 | * Definitions for the AF_INET socket handler. | 6 | * Definitions for the AF_INET socket handler. |
7 | * | 7 | * |
8 | * Version: @(#)sock.h 1.0.4 05/13/93 | 8 | * Version: @(#)sock.h 1.0.4 05/13/93 |
9 | * | 9 | * |
10 | * Authors: Ross Biro | 10 | * Authors: Ross Biro |
11 | * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> | 11 | * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> |
12 | * Corey Minyard <wf-rch!minyard@relay.EU.net> | 12 | * Corey Minyard <wf-rch!minyard@relay.EU.net> |
13 | * Florian La Roche <flla@stud.uni-sb.de> | 13 | * Florian La Roche <flla@stud.uni-sb.de> |
14 | * | 14 | * |
15 | * Fixes: | 15 | * Fixes: |
16 | * Alan Cox : Volatiles in skbuff pointers. See | 16 | * Alan Cox : Volatiles in skbuff pointers. See |
17 | * skbuff comments. May be overdone, | 17 | * skbuff comments. May be overdone, |
18 | * better to prove they can be removed | 18 | * better to prove they can be removed |
19 | * than the reverse. | 19 | * than the reverse. |
20 | * Alan Cox : Added a zapped field for tcp to note | 20 | * Alan Cox : Added a zapped field for tcp to note |
21 | * a socket is reset and must stay shut up | 21 | * a socket is reset and must stay shut up |
22 | * Alan Cox : New fields for options | 22 | * Alan Cox : New fields for options |
23 | * Pauline Middelink : identd support | 23 | * Pauline Middelink : identd support |
24 | * Alan Cox : Eliminate low level recv/recvfrom | 24 | * Alan Cox : Eliminate low level recv/recvfrom |
25 | * David S. Miller : New socket lookup architecture. | 25 | * David S. Miller : New socket lookup architecture. |
26 | * Steve Whitehouse: Default routines for sock_ops | 26 | * Steve Whitehouse: Default routines for sock_ops |
27 | * Arnaldo C. Melo : removed net_pinfo, tp_pinfo and made | 27 | * Arnaldo C. Melo : removed net_pinfo, tp_pinfo and made |
28 | * protinfo be just a void pointer, as the | 28 | * protinfo be just a void pointer, as the |
29 | * protocol specific parts were moved to | 29 | * protocol specific parts were moved to |
30 | * respective headers and ipv4/v6, etc now | 30 | * respective headers and ipv4/v6, etc now |
31 | * use private slabcaches for its socks | 31 | * use private slabcaches for its socks |
32 | * Pedro Hortas : New flags field for socket options | 32 | * Pedro Hortas : New flags field for socket options |
33 | * | 33 | * |
34 | * | 34 | * |
35 | * This program is free software; you can redistribute it and/or | 35 | * This program is free software; you can redistribute it and/or |
36 | * modify it under the terms of the GNU General Public License | 36 | * modify it under the terms of the GNU General Public License |
37 | * as published by the Free Software Foundation; either version | 37 | * as published by the Free Software Foundation; either version |
38 | * 2 of the License, or (at your option) any later version. | 38 | * 2 of the License, or (at your option) any later version. |
39 | */ | 39 | */ |
40 | #ifndef _SOCK_H | 40 | #ifndef _SOCK_H |
41 | #define _SOCK_H | 41 | #define _SOCK_H |
42 | 42 | ||
43 | #include <linux/kernel.h> | 43 | #include <linux/kernel.h> |
44 | #include <linux/list.h> | 44 | #include <linux/list.h> |
45 | #include <linux/list_nulls.h> | 45 | #include <linux/list_nulls.h> |
46 | #include <linux/timer.h> | 46 | #include <linux/timer.h> |
47 | #include <linux/cache.h> | 47 | #include <linux/cache.h> |
48 | #include <linux/module.h> | 48 | #include <linux/module.h> |
49 | #include <linux/lockdep.h> | 49 | #include <linux/lockdep.h> |
50 | #include <linux/netdevice.h> | 50 | #include <linux/netdevice.h> |
51 | #include <linux/skbuff.h> /* struct sk_buff */ | 51 | #include <linux/skbuff.h> /* struct sk_buff */ |
52 | #include <linux/mm.h> | 52 | #include <linux/mm.h> |
53 | #include <linux/security.h> | 53 | #include <linux/security.h> |
54 | #include <linux/slab.h> | 54 | #include <linux/slab.h> |
55 | #include <linux/uaccess.h> | 55 | #include <linux/uaccess.h> |
56 | 56 | ||
57 | #include <linux/filter.h> | 57 | #include <linux/filter.h> |
58 | #include <linux/rculist_nulls.h> | 58 | #include <linux/rculist_nulls.h> |
59 | #include <linux/poll.h> | 59 | #include <linux/poll.h> |
60 | 60 | ||
61 | #include <linux/atomic.h> | 61 | #include <linux/atomic.h> |
62 | #include <net/dst.h> | 62 | #include <net/dst.h> |
63 | #include <net/checksum.h> | 63 | #include <net/checksum.h> |
64 | 64 | ||
65 | /* | 65 | /* |
66 | * This structure really needs to be cleaned up. | 66 | * This structure really needs to be cleaned up. |
67 | * Most of it is for TCP, and not used by any of | 67 | * Most of it is for TCP, and not used by any of |
68 | * the other protocols. | 68 | * the other protocols. |
69 | */ | 69 | */ |
70 | 70 | ||
71 | /* Define this to get the SOCK_DBG debugging facility. */ | 71 | /* Define this to get the SOCK_DBG debugging facility. */ |
72 | #define SOCK_DEBUGGING | 72 | #define SOCK_DEBUGGING |
73 | #ifdef SOCK_DEBUGGING | 73 | #ifdef SOCK_DEBUGGING |
74 | #define SOCK_DEBUG(sk, msg...) do { if ((sk) && sock_flag((sk), SOCK_DBG)) \ | 74 | #define SOCK_DEBUG(sk, msg...) do { if ((sk) && sock_flag((sk), SOCK_DBG)) \ |
75 | printk(KERN_DEBUG msg); } while (0) | 75 | printk(KERN_DEBUG msg); } while (0) |
76 | #else | 76 | #else |
77 | /* Validate arguments and do nothing */ | 77 | /* Validate arguments and do nothing */ |
78 | static inline void __attribute__ ((format (printf, 2, 3))) | 78 | static inline void __attribute__ ((format (printf, 2, 3))) |
79 | SOCK_DEBUG(struct sock *sk, const char *msg, ...) | 79 | SOCK_DEBUG(struct sock *sk, const char *msg, ...) |
80 | { | 80 | { |
81 | } | 81 | } |
82 | #endif | 82 | #endif |
83 | 83 | ||
84 | /* This is the per-socket lock. The spinlock provides a synchronization | 84 | /* This is the per-socket lock. The spinlock provides a synchronization |
85 | * between user contexts and software interrupt processing, whereas the | 85 | * between user contexts and software interrupt processing, whereas the |
86 | * mini-semaphore synchronizes multiple users amongst themselves. | 86 | * mini-semaphore synchronizes multiple users amongst themselves. |
87 | */ | 87 | */ |
88 | typedef struct { | 88 | typedef struct { |
89 | spinlock_t slock; | 89 | spinlock_t slock; |
90 | int owned; | 90 | int owned; |
91 | wait_queue_head_t wq; | 91 | wait_queue_head_t wq; |
92 | /* | 92 | /* |
93 | * We express the mutex-alike socket_lock semantics | 93 | * We express the mutex-alike socket_lock semantics |
94 | * to the lock validator by explicitly managing | 94 | * to the lock validator by explicitly managing |
95 | * the slock as a lock variant (in addition to | 95 | * the slock as a lock variant (in addition to |
96 | * the slock itself): | 96 | * the slock itself): |
97 | */ | 97 | */ |
98 | #ifdef CONFIG_DEBUG_LOCK_ALLOC | 98 | #ifdef CONFIG_DEBUG_LOCK_ALLOC |
99 | struct lockdep_map dep_map; | 99 | struct lockdep_map dep_map; |
100 | #endif | 100 | #endif |
101 | } socket_lock_t; | 101 | } socket_lock_t; |
102 | 102 | ||
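The comment above describes the two halves of the per-socket lock. A minimal sketch of the common usage, assuming a hypothetical example_update_sock(): process context takes the sleeping, mutex-like side via lock_sock()/release_sock(), while softirq paths contend only on the slock via bh_lock_sock()/bh_unlock_sock().

/* Illustrative only: serialize a state change against softirq input. */
static void example_update_sock(struct sock *sk)
{
	lock_sock(sk);		/* may sleep; marks the socket "owned" */
	/* ... modify socket state that the data path also reads ... */
	release_sock(sk);	/* also processes any backlog queued meanwhile */
}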
103 | struct sock; | 103 | struct sock; |
104 | struct proto; | 104 | struct proto; |
105 | struct net; | 105 | struct net; |
106 | 106 | ||
107 | /** | 107 | /** |
108 | * struct sock_common - minimal network layer representation of sockets | 108 | * struct sock_common - minimal network layer representation of sockets |
109 | * @skc_daddr: Foreign IPv4 addr | 109 | * @skc_daddr: Foreign IPv4 addr |
110 | * @skc_rcv_saddr: Bound local IPv4 addr | 110 | * @skc_rcv_saddr: Bound local IPv4 addr |
111 | * @skc_hash: hash value used with various protocol lookup tables | 111 | * @skc_hash: hash value used with various protocol lookup tables |
112 | * @skc_u16hashes: two u16 hash values used by UDP lookup tables | 112 | * @skc_u16hashes: two u16 hash values used by UDP lookup tables |
113 | * @skc_family: network address family | 113 | * @skc_family: network address family |
114 | * @skc_state: Connection state | 114 | * @skc_state: Connection state |
115 | * @skc_reuse: %SO_REUSEADDR setting | 115 | * @skc_reuse: %SO_REUSEADDR setting |
116 | * @skc_bound_dev_if: bound device index if != 0 | 116 | * @skc_bound_dev_if: bound device index if != 0 |
117 | * @skc_bind_node: bind hash linkage for various protocol lookup tables | 117 | * @skc_bind_node: bind hash linkage for various protocol lookup tables |
118 | * @skc_portaddr_node: second hash linkage for UDP/UDP-Lite protocol | 118 | * @skc_portaddr_node: second hash linkage for UDP/UDP-Lite protocol |
119 | * @skc_prot: protocol handlers inside a network family | 119 | * @skc_prot: protocol handlers inside a network family |
120 | * @skc_net: reference to the network namespace of this socket | 120 | * @skc_net: reference to the network namespace of this socket |
121 | * @skc_node: main hash linkage for various protocol lookup tables | 121 | * @skc_node: main hash linkage for various protocol lookup tables |
122 | * @skc_nulls_node: main hash linkage for TCP/UDP/UDP-Lite protocol | 122 | * @skc_nulls_node: main hash linkage for TCP/UDP/UDP-Lite protocol |
123 | * @skc_tx_queue_mapping: tx queue number for this connection | 123 | * @skc_tx_queue_mapping: tx queue number for this connection |
124 | * @skc_refcnt: reference count | 124 | * @skc_refcnt: reference count |
125 | * | 125 | * |
126 | * This is the minimal network layer representation of sockets, the header | 126 | * This is the minimal network layer representation of sockets, the header |
127 | * for struct sock and struct inet_timewait_sock. | 127 | * for struct sock and struct inet_timewait_sock. |
128 | */ | 128 | */ |
129 | struct sock_common { | 129 | struct sock_common { |
130 | /* skc_daddr and skc_rcv_saddr must be grouped : | 130 | /* skc_daddr and skc_rcv_saddr must be grouped : |
131 | * cf INET_MATCH() and INET_TW_MATCH() | 131 | * cf INET_MATCH() and INET_TW_MATCH() |
132 | */ | 132 | */ |
133 | __be32 skc_daddr; | 133 | __be32 skc_daddr; |
134 | __be32 skc_rcv_saddr; | 134 | __be32 skc_rcv_saddr; |
135 | 135 | ||
136 | union { | 136 | union { |
137 | unsigned int skc_hash; | 137 | unsigned int skc_hash; |
138 | __u16 skc_u16hashes[2]; | 138 | __u16 skc_u16hashes[2]; |
139 | }; | 139 | }; |
140 | unsigned short skc_family; | 140 | unsigned short skc_family; |
141 | volatile unsigned char skc_state; | 141 | volatile unsigned char skc_state; |
142 | unsigned char skc_reuse; | 142 | unsigned char skc_reuse; |
143 | int skc_bound_dev_if; | 143 | int skc_bound_dev_if; |
144 | union { | 144 | union { |
145 | struct hlist_node skc_bind_node; | 145 | struct hlist_node skc_bind_node; |
146 | struct hlist_nulls_node skc_portaddr_node; | 146 | struct hlist_nulls_node skc_portaddr_node; |
147 | }; | 147 | }; |
148 | struct proto *skc_prot; | 148 | struct proto *skc_prot; |
149 | #ifdef CONFIG_NET_NS | 149 | #ifdef CONFIG_NET_NS |
150 | struct net *skc_net; | 150 | struct net *skc_net; |
151 | #endif | 151 | #endif |
152 | /* | 152 | /* |
153 | * fields between dontcopy_begin/dontcopy_end | 153 | * fields between dontcopy_begin/dontcopy_end |
154 | * are not copied in sock_copy() | 154 | * are not copied in sock_copy() |
155 | */ | 155 | */ |
156 | /* private: */ | 156 | /* private: */ |
157 | int skc_dontcopy_begin[0]; | 157 | int skc_dontcopy_begin[0]; |
158 | /* public: */ | 158 | /* public: */ |
159 | union { | 159 | union { |
160 | struct hlist_node skc_node; | 160 | struct hlist_node skc_node; |
161 | struct hlist_nulls_node skc_nulls_node; | 161 | struct hlist_nulls_node skc_nulls_node; |
162 | }; | 162 | }; |
163 | int skc_tx_queue_mapping; | 163 | int skc_tx_queue_mapping; |
164 | atomic_t skc_refcnt; | 164 | atomic_t skc_refcnt; |
165 | /* private: */ | 165 | /* private: */ |
166 | int skc_dontcopy_end[0]; | 166 | int skc_dontcopy_end[0]; |
167 | /* public: */ | 167 | /* public: */ |
168 | }; | 168 | }; |
169 | 169 | ||
170 | /** | 170 | /** |
171 | * struct sock - network layer representation of sockets | 171 | * struct sock - network layer representation of sockets |
172 | * @__sk_common: shared layout with inet_timewait_sock | 172 | * @__sk_common: shared layout with inet_timewait_sock |
173 | * @sk_shutdown: mask of %SEND_SHUTDOWN and/or %RCV_SHUTDOWN | 173 | * @sk_shutdown: mask of %SEND_SHUTDOWN and/or %RCV_SHUTDOWN |
174 | * @sk_userlocks: %SO_SNDBUF and %SO_RCVBUF settings | 174 | * @sk_userlocks: %SO_SNDBUF and %SO_RCVBUF settings |
175 | * @sk_lock: synchronizer | 175 | * @sk_lock: synchronizer |
176 | * @sk_rcvbuf: size of receive buffer in bytes | 176 | * @sk_rcvbuf: size of receive buffer in bytes |
177 | * @sk_wq: sock wait queue and async head | 177 | * @sk_wq: sock wait queue and async head |
178 | * @sk_dst_cache: destination cache | 178 | * @sk_dst_cache: destination cache |
179 | * @sk_dst_lock: destination cache lock | 179 | * @sk_dst_lock: destination cache lock |
180 | * @sk_policy: flow policy | 180 | * @sk_policy: flow policy |
181 | * @sk_receive_queue: incoming packets | 181 | * @sk_receive_queue: incoming packets |
182 | * @sk_wmem_alloc: transmit queue bytes committed | 182 | * @sk_wmem_alloc: transmit queue bytes committed |
183 | * @sk_write_queue: Packet sending queue | 183 | * @sk_write_queue: Packet sending queue |
184 | * @sk_async_wait_queue: DMA copied packets | 184 | * @sk_async_wait_queue: DMA copied packets |
185 | * @sk_omem_alloc: "o" is "option" or "other" | 185 | * @sk_omem_alloc: "o" is "option" or "other" |
186 | * @sk_wmem_queued: persistent queue size | 186 | * @sk_wmem_queued: persistent queue size |
187 | * @sk_forward_alloc: space allocated forward | 187 | * @sk_forward_alloc: space allocated forward |
188 | * @sk_allocation: allocation mode | 188 | * @sk_allocation: allocation mode |
189 | * @sk_sndbuf: size of send buffer in bytes | 189 | * @sk_sndbuf: size of send buffer in bytes |
190 | * @sk_flags: %SO_LINGER (l_onoff), %SO_BROADCAST, %SO_KEEPALIVE, | 190 | * @sk_flags: %SO_LINGER (l_onoff), %SO_BROADCAST, %SO_KEEPALIVE, |
191 | * %SO_OOBINLINE settings, %SO_TIMESTAMPING settings | 191 | * %SO_OOBINLINE settings, %SO_TIMESTAMPING settings |
192 | * @sk_no_check: %SO_NO_CHECK setting, whether or not checkup packets | 192 | * @sk_no_check: %SO_NO_CHECK setting, whether or not checkup packets |
193 | * @sk_route_caps: route capabilities (e.g. %NETIF_F_TSO) | 193 | * @sk_route_caps: route capabilities (e.g. %NETIF_F_TSO) |
194 | * @sk_route_nocaps: forbidden route capabilities (e.g NETIF_F_GSO_MASK) | 194 | * @sk_route_nocaps: forbidden route capabilities (e.g NETIF_F_GSO_MASK) |
195 | * @sk_gso_type: GSO type (e.g. %SKB_GSO_TCPV4) | 195 | * @sk_gso_type: GSO type (e.g. %SKB_GSO_TCPV4) |
196 | * @sk_gso_max_size: Maximum GSO segment size to build | 196 | * @sk_gso_max_size: Maximum GSO segment size to build |
197 | * @sk_lingertime: %SO_LINGER l_linger setting | 197 | * @sk_lingertime: %SO_LINGER l_linger setting |
198 | * @sk_backlog: always used with the per-socket spinlock held | 198 | * @sk_backlog: always used with the per-socket spinlock held |
199 | * @sk_callback_lock: used with the callbacks in the end of this struct | 199 | * @sk_callback_lock: used with the callbacks in the end of this struct |
200 | * @sk_error_queue: rarely used | 200 | * @sk_error_queue: rarely used |
201 | * @sk_prot_creator: sk_prot of original sock creator (see ipv6_setsockopt, | 201 | * @sk_prot_creator: sk_prot of original sock creator (see ipv6_setsockopt, |
202 | * IPV6_ADDRFORM for instance) | 202 | * IPV6_ADDRFORM for instance) |
203 | * @sk_err: last error | 203 | * @sk_err: last error |
204 | * @sk_err_soft: errors that don't cause failure but are the cause of a | 204 | * @sk_err_soft: errors that don't cause failure but are the cause of a |
205 | * persistent failure not just 'timed out' | 205 | * persistent failure not just 'timed out' |
206 | * @sk_drops: raw/udp drops counter | 206 | * @sk_drops: raw/udp drops counter |
207 | * @sk_ack_backlog: current listen backlog | 207 | * @sk_ack_backlog: current listen backlog |
208 | * @sk_max_ack_backlog: listen backlog set in listen() | 208 | * @sk_max_ack_backlog: listen backlog set in listen() |
209 | * @sk_priority: %SO_PRIORITY setting | 209 | * @sk_priority: %SO_PRIORITY setting |
210 | * @sk_type: socket type (%SOCK_STREAM, etc) | 210 | * @sk_type: socket type (%SOCK_STREAM, etc) |
211 | * @sk_protocol: which protocol this socket belongs in this network family | 211 | * @sk_protocol: which protocol this socket belongs in this network family |
212 | * @sk_peer_pid: &struct pid for this socket's peer | 212 | * @sk_peer_pid: &struct pid for this socket's peer |
213 | * @sk_peer_cred: %SO_PEERCRED setting | 213 | * @sk_peer_cred: %SO_PEERCRED setting |
214 | * @sk_rcvlowat: %SO_RCVLOWAT setting | 214 | * @sk_rcvlowat: %SO_RCVLOWAT setting |
215 | * @sk_rcvtimeo: %SO_RCVTIMEO setting | 215 | * @sk_rcvtimeo: %SO_RCVTIMEO setting |
216 | * @sk_sndtimeo: %SO_SNDTIMEO setting | 216 | * @sk_sndtimeo: %SO_SNDTIMEO setting |
217 | * @sk_rxhash: flow hash received from netif layer | 217 | * @sk_rxhash: flow hash received from netif layer |
218 | * @sk_filter: socket filtering instructions | 218 | * @sk_filter: socket filtering instructions |
219 | * @sk_protinfo: private area, net family specific, when not using slab | 219 | * @sk_protinfo: private area, net family specific, when not using slab |
220 | * @sk_timer: sock cleanup timer | 220 | * @sk_timer: sock cleanup timer |
221 | * @sk_stamp: time stamp of last packet received | 221 | * @sk_stamp: time stamp of last packet received |
222 | * @sk_socket: Identd and reporting IO signals | 222 | * @sk_socket: Identd and reporting IO signals |
223 | * @sk_user_data: RPC layer private data | 223 | * @sk_user_data: RPC layer private data |
224 | * @sk_sndmsg_page: cached page for sendmsg | 224 | * @sk_sndmsg_page: cached page for sendmsg |
225 | * @sk_sndmsg_off: cached offset for sendmsg | 225 | * @sk_sndmsg_off: cached offset for sendmsg |
226 | * @sk_send_head: front of stuff to transmit | 226 | * @sk_send_head: front of stuff to transmit |
227 | * @sk_security: used by security modules | 227 | * @sk_security: used by security modules |
228 | * @sk_mark: generic packet mark | 228 | * @sk_mark: generic packet mark |
229 | * @sk_classid: this socket's cgroup classid | 229 | * @sk_classid: this socket's cgroup classid |
230 | * @sk_write_pending: a write to stream socket waits to start | 230 | * @sk_write_pending: a write to stream socket waits to start |
231 | * @sk_state_change: callback to indicate change in the state of the sock | 231 | * @sk_state_change: callback to indicate change in the state of the sock |
232 | * @sk_data_ready: callback to indicate there is data to be processed | 232 | * @sk_data_ready: callback to indicate there is data to be processed |
233 | * @sk_write_space: callback to indicate there is bf sending space available | 233 | * @sk_write_space: callback to indicate there is bf sending space available |
234 | * @sk_error_report: callback to indicate errors (e.g. %MSG_ERRQUEUE) | 234 | * @sk_error_report: callback to indicate errors (e.g. %MSG_ERRQUEUE) |
235 | * @sk_backlog_rcv: callback to process the backlog | 235 | * @sk_backlog_rcv: callback to process the backlog |
236 | * @sk_destruct: called at sock freeing time, i.e. when all refcnt == 0 | 236 | * @sk_destruct: called at sock freeing time, i.e. when all refcnt == 0 |
237 | */ | 237 | */ |
238 | struct sock { | 238 | struct sock { |
239 | /* | 239 | /* |
240 | * Now struct inet_timewait_sock also uses sock_common, so please just | 240 | * Now struct inet_timewait_sock also uses sock_common, so please just |
241 | * don't add nothing before this first member (__sk_common) --acme | 241 | * don't add nothing before this first member (__sk_common) --acme |
242 | */ | 242 | */ |
243 | struct sock_common __sk_common; | 243 | struct sock_common __sk_common; |
244 | #define sk_node __sk_common.skc_node | 244 | #define sk_node __sk_common.skc_node |
245 | #define sk_nulls_node __sk_common.skc_nulls_node | 245 | #define sk_nulls_node __sk_common.skc_nulls_node |
246 | #define sk_refcnt __sk_common.skc_refcnt | 246 | #define sk_refcnt __sk_common.skc_refcnt |
247 | #define sk_tx_queue_mapping __sk_common.skc_tx_queue_mapping | 247 | #define sk_tx_queue_mapping __sk_common.skc_tx_queue_mapping |
248 | 248 | ||
249 | #define sk_dontcopy_begin __sk_common.skc_dontcopy_begin | 249 | #define sk_dontcopy_begin __sk_common.skc_dontcopy_begin |
250 | #define sk_dontcopy_end __sk_common.skc_dontcopy_end | 250 | #define sk_dontcopy_end __sk_common.skc_dontcopy_end |
251 | #define sk_hash __sk_common.skc_hash | 251 | #define sk_hash __sk_common.skc_hash |
252 | #define sk_family __sk_common.skc_family | 252 | #define sk_family __sk_common.skc_family |
253 | #define sk_state __sk_common.skc_state | 253 | #define sk_state __sk_common.skc_state |
254 | #define sk_reuse __sk_common.skc_reuse | 254 | #define sk_reuse __sk_common.skc_reuse |
255 | #define sk_bound_dev_if __sk_common.skc_bound_dev_if | 255 | #define sk_bound_dev_if __sk_common.skc_bound_dev_if |
256 | #define sk_bind_node __sk_common.skc_bind_node | 256 | #define sk_bind_node __sk_common.skc_bind_node |
257 | #define sk_prot __sk_common.skc_prot | 257 | #define sk_prot __sk_common.skc_prot |
258 | #define sk_net __sk_common.skc_net | 258 | #define sk_net __sk_common.skc_net |
259 | socket_lock_t sk_lock; | 259 | socket_lock_t sk_lock; |
260 | struct sk_buff_head sk_receive_queue; | 260 | struct sk_buff_head sk_receive_queue; |
261 | /* | 261 | /* |
262 | * The backlog queue is special, it is always used with | 262 | * The backlog queue is special, it is always used with |
263 | * the per-socket spinlock held and requires low latency | 263 | * the per-socket spinlock held and requires low latency |
264 | * access. Therefore we special case it's implementation. | 264 | * access. Therefore we special case it's implementation. |
265 | * Note : rmem_alloc is in this structure to fill a hole | 265 | * Note : rmem_alloc is in this structure to fill a hole |
266 | * on 64bit arches, not because its logically part of | 266 | * on 64bit arches, not because its logically part of |
267 | * backlog. | 267 | * backlog. |
268 | */ | 268 | */ |
269 | struct { | 269 | struct { |
270 | atomic_t rmem_alloc; | 270 | atomic_t rmem_alloc; |
271 | int len; | 271 | int len; |
272 | struct sk_buff *head; | 272 | struct sk_buff *head; |
273 | struct sk_buff *tail; | 273 | struct sk_buff *tail; |
274 | } sk_backlog; | 274 | } sk_backlog; |
275 | #define sk_rmem_alloc sk_backlog.rmem_alloc | 275 | #define sk_rmem_alloc sk_backlog.rmem_alloc |
276 | int sk_forward_alloc; | 276 | int sk_forward_alloc; |
277 | #ifdef CONFIG_RPS | 277 | #ifdef CONFIG_RPS |
278 | __u32 sk_rxhash; | 278 | __u32 sk_rxhash; |
279 | #endif | 279 | #endif |
280 | atomic_t sk_drops; | 280 | atomic_t sk_drops; |
281 | int sk_rcvbuf; | 281 | int sk_rcvbuf; |
282 | 282 | ||
283 | struct sk_filter __rcu *sk_filter; | 283 | struct sk_filter __rcu *sk_filter; |
284 | struct socket_wq __rcu *sk_wq; | 284 | struct socket_wq __rcu *sk_wq; |
285 | 285 | ||
286 | #ifdef CONFIG_NET_DMA | 286 | #ifdef CONFIG_NET_DMA |
287 | struct sk_buff_head sk_async_wait_queue; | 287 | struct sk_buff_head sk_async_wait_queue; |
288 | #endif | 288 | #endif |
289 | 289 | ||
290 | #ifdef CONFIG_XFRM | 290 | #ifdef CONFIG_XFRM |
291 | struct xfrm_policy *sk_policy[2]; | 291 | struct xfrm_policy *sk_policy[2]; |
292 | #endif | 292 | #endif |
293 | unsigned long sk_flags; | 293 | unsigned long sk_flags; |
294 | struct dst_entry *sk_dst_cache; | 294 | struct dst_entry *sk_dst_cache; |
295 | spinlock_t sk_dst_lock; | 295 | spinlock_t sk_dst_lock; |
296 | atomic_t sk_wmem_alloc; | 296 | atomic_t sk_wmem_alloc; |
297 | atomic_t sk_omem_alloc; | 297 | atomic_t sk_omem_alloc; |
298 | int sk_sndbuf; | 298 | int sk_sndbuf; |
299 | struct sk_buff_head sk_write_queue; | 299 | struct sk_buff_head sk_write_queue; |
300 | kmemcheck_bitfield_begin(flags); | 300 | kmemcheck_bitfield_begin(flags); |
301 | unsigned int sk_shutdown : 2, | 301 | unsigned int sk_shutdown : 2, |
302 | sk_no_check : 2, | 302 | sk_no_check : 2, |
303 | sk_userlocks : 4, | 303 | sk_userlocks : 4, |
304 | sk_protocol : 8, | 304 | sk_protocol : 8, |
305 | sk_type : 16; | 305 | sk_type : 16; |
306 | kmemcheck_bitfield_end(flags); | 306 | kmemcheck_bitfield_end(flags); |
307 | int sk_wmem_queued; | 307 | int sk_wmem_queued; |
308 | gfp_t sk_allocation; | 308 | gfp_t sk_allocation; |
309 | int sk_route_caps; | 309 | int sk_route_caps; |
310 | int sk_route_nocaps; | 310 | int sk_route_nocaps; |
311 | int sk_gso_type; | 311 | int sk_gso_type; |
312 | unsigned int sk_gso_max_size; | 312 | unsigned int sk_gso_max_size; |
313 | int sk_rcvlowat; | 313 | int sk_rcvlowat; |
314 | unsigned long sk_lingertime; | 314 | unsigned long sk_lingertime; |
315 | struct sk_buff_head sk_error_queue; | 315 | struct sk_buff_head sk_error_queue; |
316 | struct proto *sk_prot_creator; | 316 | struct proto *sk_prot_creator; |
317 | rwlock_t sk_callback_lock; | 317 | rwlock_t sk_callback_lock; |
318 | int sk_err, | 318 | int sk_err, |
319 | sk_err_soft; | 319 | sk_err_soft; |
320 | unsigned short sk_ack_backlog; | 320 | unsigned short sk_ack_backlog; |
321 | unsigned short sk_max_ack_backlog; | 321 | unsigned short sk_max_ack_backlog; |
322 | __u32 sk_priority; | 322 | __u32 sk_priority; |
323 | struct pid *sk_peer_pid; | 323 | struct pid *sk_peer_pid; |
324 | const struct cred *sk_peer_cred; | 324 | const struct cred *sk_peer_cred; |
325 | long sk_rcvtimeo; | 325 | long sk_rcvtimeo; |
326 | long sk_sndtimeo; | 326 | long sk_sndtimeo; |
327 | void *sk_protinfo; | 327 | void *sk_protinfo; |
328 | struct timer_list sk_timer; | 328 | struct timer_list sk_timer; |
329 | ktime_t sk_stamp; | 329 | ktime_t sk_stamp; |
330 | struct socket *sk_socket; | 330 | struct socket *sk_socket; |
331 | void *sk_user_data; | 331 | void *sk_user_data; |
332 | struct page *sk_sndmsg_page; | 332 | struct page *sk_sndmsg_page; |
333 | struct sk_buff *sk_send_head; | 333 | struct sk_buff *sk_send_head; |
334 | __u32 sk_sndmsg_off; | 334 | __u32 sk_sndmsg_off; |
335 | int sk_write_pending; | 335 | int sk_write_pending; |
336 | #ifdef CONFIG_SECURITY | 336 | #ifdef CONFIG_SECURITY |
337 | void *sk_security; | 337 | void *sk_security; |
338 | #endif | 338 | #endif |
339 | __u32 sk_mark; | 339 | __u32 sk_mark; |
340 | u32 sk_classid; | 340 | u32 sk_classid; |
341 | void (*sk_state_change)(struct sock *sk); | 341 | void (*sk_state_change)(struct sock *sk); |
342 | void (*sk_data_ready)(struct sock *sk, int bytes); | 342 | void (*sk_data_ready)(struct sock *sk, int bytes); |
343 | void (*sk_write_space)(struct sock *sk); | 343 | void (*sk_write_space)(struct sock *sk); |
344 | void (*sk_error_report)(struct sock *sk); | 344 | void (*sk_error_report)(struct sock *sk); |
345 | int (*sk_backlog_rcv)(struct sock *sk, | 345 | int (*sk_backlog_rcv)(struct sock *sk, |
346 | struct sk_buff *skb); | 346 | struct sk_buff *skb); |
347 | void (*sk_destruct)(struct sock *sk); | 347 | void (*sk_destruct)(struct sock *sk); |
348 | }; | 348 | }; |
349 | 349 | ||
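The sk_backlog comment inside the struct is easiest to see in the canonical softirq receive pattern. A hedged sketch follows; example_rcv() is hypothetical, but sock_owned_by_user(), sk_add_backlog() and sk_backlog_rcv are the existing pieces it glues together.

/* Sketch of the usual softirq-side receive path. */
static int example_rcv(struct sock *sk, struct sk_buff *skb)
{
	int rc = 0;

	bh_lock_sock(sk);
	if (!sock_owned_by_user(sk))
		rc = sk->sk_backlog_rcv(sk, skb);	/* deliver directly */
	else if (sk_add_backlog(sk, skb)) {		/* queue for the owner */
		bh_unlock_sock(sk);
		kfree_skb(skb);				/* backlog limit hit */
		return -ENOBUFS;
	}
	bh_unlock_sock(sk);
	return rc;
}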
350 | /* | 350 | /* |
351 | * Hashed lists helper routines | 351 | * Hashed lists helper routines |
352 | */ | 352 | */ |
353 | static inline struct sock *sk_entry(const struct hlist_node *node) | 353 | static inline struct sock *sk_entry(const struct hlist_node *node) |
354 | { | 354 | { |
355 | return hlist_entry(node, struct sock, sk_node); | 355 | return hlist_entry(node, struct sock, sk_node); |
356 | } | 356 | } |
357 | 357 | ||
358 | static inline struct sock *__sk_head(const struct hlist_head *head) | 358 | static inline struct sock *__sk_head(const struct hlist_head *head) |
359 | { | 359 | { |
360 | return hlist_entry(head->first, struct sock, sk_node); | 360 | return hlist_entry(head->first, struct sock, sk_node); |
361 | } | 361 | } |
362 | 362 | ||
363 | static inline struct sock *sk_head(const struct hlist_head *head) | 363 | static inline struct sock *sk_head(const struct hlist_head *head) |
364 | { | 364 | { |
365 | return hlist_empty(head) ? NULL : __sk_head(head); | 365 | return hlist_empty(head) ? NULL : __sk_head(head); |
366 | } | 366 | } |
367 | 367 | ||
368 | static inline struct sock *__sk_nulls_head(const struct hlist_nulls_head *head) | 368 | static inline struct sock *__sk_nulls_head(const struct hlist_nulls_head *head) |
369 | { | 369 | { |
370 | return hlist_nulls_entry(head->first, struct sock, sk_nulls_node); | 370 | return hlist_nulls_entry(head->first, struct sock, sk_nulls_node); |
371 | } | 371 | } |
372 | 372 | ||
373 | static inline struct sock *sk_nulls_head(const struct hlist_nulls_head *head) | 373 | static inline struct sock *sk_nulls_head(const struct hlist_nulls_head *head) |
374 | { | 374 | { |
375 | return hlist_nulls_empty(head) ? NULL : __sk_nulls_head(head); | 375 | return hlist_nulls_empty(head) ? NULL : __sk_nulls_head(head); |
376 | } | 376 | } |
377 | 377 | ||
378 | static inline struct sock *sk_next(const struct sock *sk) | 378 | static inline struct sock *sk_next(const struct sock *sk) |
379 | { | 379 | { |
380 | return sk->sk_node.next ? | 380 | return sk->sk_node.next ? |
381 | hlist_entry(sk->sk_node.next, struct sock, sk_node) : NULL; | 381 | hlist_entry(sk->sk_node.next, struct sock, sk_node) : NULL; |
382 | } | 382 | } |
383 | 383 | ||
384 | static inline struct sock *sk_nulls_next(const struct sock *sk) | 384 | static inline struct sock *sk_nulls_next(const struct sock *sk) |
385 | { | 385 | { |
386 | return (!is_a_nulls(sk->sk_nulls_node.next)) ? | 386 | return (!is_a_nulls(sk->sk_nulls_node.next)) ? |
387 | hlist_nulls_entry(sk->sk_nulls_node.next, | 387 | hlist_nulls_entry(sk->sk_nulls_node.next, |
388 | struct sock, sk_nulls_node) : | 388 | struct sock, sk_nulls_node) : |
389 | NULL; | 389 | NULL; |
390 | } | 390 | } |
391 | 391 | ||
392 | static inline int sk_unhashed(const struct sock *sk) | 392 | static inline int sk_unhashed(const struct sock *sk) |
393 | { | 393 | { |
394 | return hlist_unhashed(&sk->sk_node); | 394 | return hlist_unhashed(&sk->sk_node); |
395 | } | 395 | } |
396 | 396 | ||
397 | static inline int sk_hashed(const struct sock *sk) | 397 | static inline int sk_hashed(const struct sock *sk) |
398 | { | 398 | { |
399 | return !sk_unhashed(sk); | 399 | return !sk_unhashed(sk); |
400 | } | 400 | } |
401 | 401 | ||
402 | static __inline__ void sk_node_init(struct hlist_node *node) | 402 | static __inline__ void sk_node_init(struct hlist_node *node) |
403 | { | 403 | { |
404 | node->pprev = NULL; | 404 | node->pprev = NULL; |
405 | } | 405 | } |
406 | 406 | ||
407 | static __inline__ void sk_nulls_node_init(struct hlist_nulls_node *node) | 407 | static __inline__ void sk_nulls_node_init(struct hlist_nulls_node *node) |
408 | { | 408 | { |
409 | node->pprev = NULL; | 409 | node->pprev = NULL; |
410 | } | 410 | } |
411 | 411 | ||
412 | static __inline__ void __sk_del_node(struct sock *sk) | 412 | static __inline__ void __sk_del_node(struct sock *sk) |
413 | { | 413 | { |
414 | __hlist_del(&sk->sk_node); | 414 | __hlist_del(&sk->sk_node); |
415 | } | 415 | } |
416 | 416 | ||
417 | /* NB: equivalent to hlist_del_init_rcu */ | 417 | /* NB: equivalent to hlist_del_init_rcu */ |
418 | static __inline__ int __sk_del_node_init(struct sock *sk) | 418 | static __inline__ int __sk_del_node_init(struct sock *sk) |
419 | { | 419 | { |
420 | if (sk_hashed(sk)) { | 420 | if (sk_hashed(sk)) { |
421 | __sk_del_node(sk); | 421 | __sk_del_node(sk); |
422 | sk_node_init(&sk->sk_node); | 422 | sk_node_init(&sk->sk_node); |
423 | return 1; | 423 | return 1; |
424 | } | 424 | } |
425 | return 0; | 425 | return 0; |
426 | } | 426 | } |
427 | 427 | ||
428 | /* Grab socket reference count. This operation is valid only | 428 | /* Grab socket reference count. This operation is valid only |
429 | when sk is ALREADY grabbed, e.g. it was found in a hash table | 429 | when sk is ALREADY grabbed, e.g. it was found in a hash table |
430 | or a list and the lookup was made under a lock preventing hash table | 430 | or a list and the lookup was made under a lock preventing hash table |
431 | modifications. | 431 | modifications. |
432 | */ | 432 | */ |
433 | 433 | ||
434 | static inline void sock_hold(struct sock *sk) | 434 | static inline void sock_hold(struct sock *sk) |
435 | { | 435 | { |
436 | atomic_inc(&sk->sk_refcnt); | 436 | atomic_inc(&sk->sk_refcnt); |
437 | } | 437 | } |
438 | 438 | ||
439 | /* Ungrab the socket in a context where the socket refcnt | 439 | /* Ungrab the socket in a context where the socket refcnt |
440 | cannot hit zero, e.g. this is true in the context of any socketcall. | 440 | cannot hit zero, e.g. this is true in the context of any socketcall. |
441 | */ | 441 | */ |
442 | static inline void __sock_put(struct sock *sk) | 442 | static inline void __sock_put(struct sock *sk) |
443 | { | 443 | { |
444 | atomic_dec(&sk->sk_refcnt); | 444 | atomic_dec(&sk->sk_refcnt); |
445 | } | 445 | } |
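The two comments above describe the usual pairing: a socket found in a protocol hash table under the bucket lock is pinned with sock_hold() before the lock is dropped, and the caller later balances that with sock_put() (or __sock_put() where the refcount provably cannot reach zero). A minimal sketch of that pattern follows; the bucket, its spinlock, the port match and the example_* names are invented for illustration and are not part of this header.

#include <net/sock.h>
#include <net/inet_sock.h>

/* Hypothetical lookup: pin the socket while the bucket lock is held. */
static struct sock *example_lookup(struct hlist_head *bucket,
                                   spinlock_t *bucket_lock, __be16 port)
{
        struct sock *sk;
        struct hlist_node *node;

        spin_lock(bucket_lock);
        sk_for_each(sk, node, bucket) {
                if (inet_sk(sk)->inet_num == ntohs(port)) {
                        sock_hold(sk);          /* pin before dropping the lock */
                        spin_unlock(bucket_lock);
                        return sk;
                }
        }
        spin_unlock(bucket_lock);
        return NULL;
}

static void example_user(struct hlist_head *bucket, spinlock_t *bucket_lock,
                         __be16 port)
{
        struct sock *sk = example_lookup(bucket, bucket_lock, port);

        if (sk) {
                /* ... use sk ... */
                sock_put(sk);           /* balances the sock_hold() above */
        }
}

sock_put() itself is defined further down in this header.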
446 | 446 | ||
447 | static __inline__ int sk_del_node_init(struct sock *sk) | 447 | static __inline__ int sk_del_node_init(struct sock *sk) |
448 | { | 448 | { |
449 | int rc = __sk_del_node_init(sk); | 449 | int rc = __sk_del_node_init(sk); |
450 | 450 | ||
451 | if (rc) { | 451 | if (rc) { |
452 | /* paranoid for a while -acme */ | 452 | /* paranoid for a while -acme */ |
453 | WARN_ON(atomic_read(&sk->sk_refcnt) == 1); | 453 | WARN_ON(atomic_read(&sk->sk_refcnt) == 1); |
454 | __sock_put(sk); | 454 | __sock_put(sk); |
455 | } | 455 | } |
456 | return rc; | 456 | return rc; |
457 | } | 457 | } |
458 | #define sk_del_node_init_rcu(sk) sk_del_node_init(sk) | 458 | #define sk_del_node_init_rcu(sk) sk_del_node_init(sk) |
459 | 459 | ||
460 | static __inline__ int __sk_nulls_del_node_init_rcu(struct sock *sk) | 460 | static __inline__ int __sk_nulls_del_node_init_rcu(struct sock *sk) |
461 | { | 461 | { |
462 | if (sk_hashed(sk)) { | 462 | if (sk_hashed(sk)) { |
463 | hlist_nulls_del_init_rcu(&sk->sk_nulls_node); | 463 | hlist_nulls_del_init_rcu(&sk->sk_nulls_node); |
464 | return 1; | 464 | return 1; |
465 | } | 465 | } |
466 | return 0; | 466 | return 0; |
467 | } | 467 | } |
468 | 468 | ||
469 | static __inline__ int sk_nulls_del_node_init_rcu(struct sock *sk) | 469 | static __inline__ int sk_nulls_del_node_init_rcu(struct sock *sk) |
470 | { | 470 | { |
471 | int rc = __sk_nulls_del_node_init_rcu(sk); | 471 | int rc = __sk_nulls_del_node_init_rcu(sk); |
472 | 472 | ||
473 | if (rc) { | 473 | if (rc) { |
474 | /* paranoid for a while -acme */ | 474 | /* paranoid for a while -acme */ |
475 | WARN_ON(atomic_read(&sk->sk_refcnt) == 1); | 475 | WARN_ON(atomic_read(&sk->sk_refcnt) == 1); |
476 | __sock_put(sk); | 476 | __sock_put(sk); |
477 | } | 477 | } |
478 | return rc; | 478 | return rc; |
479 | } | 479 | } |
480 | 480 | ||
481 | static __inline__ void __sk_add_node(struct sock *sk, struct hlist_head *list) | 481 | static __inline__ void __sk_add_node(struct sock *sk, struct hlist_head *list) |
482 | { | 482 | { |
483 | hlist_add_head(&sk->sk_node, list); | 483 | hlist_add_head(&sk->sk_node, list); |
484 | } | 484 | } |
485 | 485 | ||
486 | static __inline__ void sk_add_node(struct sock *sk, struct hlist_head *list) | 486 | static __inline__ void sk_add_node(struct sock *sk, struct hlist_head *list) |
487 | { | 487 | { |
488 | sock_hold(sk); | 488 | sock_hold(sk); |
489 | __sk_add_node(sk, list); | 489 | __sk_add_node(sk, list); |
490 | } | 490 | } |
491 | 491 | ||
492 | static __inline__ void sk_add_node_rcu(struct sock *sk, struct hlist_head *list) | 492 | static __inline__ void sk_add_node_rcu(struct sock *sk, struct hlist_head *list) |
493 | { | 493 | { |
494 | sock_hold(sk); | 494 | sock_hold(sk); |
495 | hlist_add_head_rcu(&sk->sk_node, list); | 495 | hlist_add_head_rcu(&sk->sk_node, list); |
496 | } | 496 | } |
497 | 497 | ||
498 | static __inline__ void __sk_nulls_add_node_rcu(struct sock *sk, struct hlist_nulls_head *list) | 498 | static __inline__ void __sk_nulls_add_node_rcu(struct sock *sk, struct hlist_nulls_head *list) |
499 | { | 499 | { |
500 | hlist_nulls_add_head_rcu(&sk->sk_nulls_node, list); | 500 | hlist_nulls_add_head_rcu(&sk->sk_nulls_node, list); |
501 | } | 501 | } |
502 | 502 | ||
503 | static __inline__ void sk_nulls_add_node_rcu(struct sock *sk, struct hlist_nulls_head *list) | 503 | static __inline__ void sk_nulls_add_node_rcu(struct sock *sk, struct hlist_nulls_head *list) |
504 | { | 504 | { |
505 | sock_hold(sk); | 505 | sock_hold(sk); |
506 | __sk_nulls_add_node_rcu(sk, list); | 506 | __sk_nulls_add_node_rcu(sk, list); |
507 | } | 507 | } |
508 | 508 | ||
509 | static __inline__ void __sk_del_bind_node(struct sock *sk) | 509 | static __inline__ void __sk_del_bind_node(struct sock *sk) |
510 | { | 510 | { |
511 | __hlist_del(&sk->sk_bind_node); | 511 | __hlist_del(&sk->sk_bind_node); |
512 | } | 512 | } |
513 | 513 | ||
514 | static __inline__ void sk_add_bind_node(struct sock *sk, | 514 | static __inline__ void sk_add_bind_node(struct sock *sk, |
515 | struct hlist_head *list) | 515 | struct hlist_head *list) |
516 | { | 516 | { |
517 | hlist_add_head(&sk->sk_bind_node, list); | 517 | hlist_add_head(&sk->sk_bind_node, list); |
518 | } | 518 | } |
519 | 519 | ||
520 | #define sk_for_each(__sk, node, list) \ | 520 | #define sk_for_each(__sk, node, list) \ |
521 | hlist_for_each_entry(__sk, node, list, sk_node) | 521 | hlist_for_each_entry(__sk, node, list, sk_node) |
522 | #define sk_for_each_rcu(__sk, node, list) \ | 522 | #define sk_for_each_rcu(__sk, node, list) \ |
523 | hlist_for_each_entry_rcu(__sk, node, list, sk_node) | 523 | hlist_for_each_entry_rcu(__sk, node, list, sk_node) |
524 | #define sk_nulls_for_each(__sk, node, list) \ | 524 | #define sk_nulls_for_each(__sk, node, list) \ |
525 | hlist_nulls_for_each_entry(__sk, node, list, sk_nulls_node) | 525 | hlist_nulls_for_each_entry(__sk, node, list, sk_nulls_node) |
526 | #define sk_nulls_for_each_rcu(__sk, node, list) \ | 526 | #define sk_nulls_for_each_rcu(__sk, node, list) \ |
527 | hlist_nulls_for_each_entry_rcu(__sk, node, list, sk_nulls_node) | 527 | hlist_nulls_for_each_entry_rcu(__sk, node, list, sk_nulls_node) |
528 | #define sk_for_each_from(__sk, node) \ | 528 | #define sk_for_each_from(__sk, node) \ |
529 | if (__sk && ({ node = &(__sk)->sk_node; 1; })) \ | 529 | if (__sk && ({ node = &(__sk)->sk_node; 1; })) \ |
530 | hlist_for_each_entry_from(__sk, node, sk_node) | 530 | hlist_for_each_entry_from(__sk, node, sk_node) |
531 | #define sk_nulls_for_each_from(__sk, node) \ | 531 | #define sk_nulls_for_each_from(__sk, node) \ |
532 | if (__sk && ({ node = &(__sk)->sk_nulls_node; 1; })) \ | 532 | if (__sk && ({ node = &(__sk)->sk_nulls_node; 1; })) \ |
533 | hlist_nulls_for_each_entry_from(__sk, node, sk_nulls_node) | 533 | hlist_nulls_for_each_entry_from(__sk, node, sk_nulls_node) |
534 | #define sk_for_each_safe(__sk, node, tmp, list) \ | 534 | #define sk_for_each_safe(__sk, node, tmp, list) \ |
535 | hlist_for_each_entry_safe(__sk, node, tmp, list, sk_node) | 535 | hlist_for_each_entry_safe(__sk, node, tmp, list, sk_node) |
536 | #define sk_for_each_bound(__sk, node, list) \ | 536 | #define sk_for_each_bound(__sk, node, list) \ |
537 | hlist_for_each_entry(__sk, node, list, sk_bind_node) | 537 | hlist_for_each_entry(__sk, node, list, sk_bind_node) |
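The _rcu and nulls variants of these iterators pair with RCU-protected lookups. A hedged sketch of a lockless lookup over an hlist_nulls chain is shown below; the chain pointer, the hash value and the example_* name are assumptions, not part of this header.

#include <net/sock.h>

/* Illustrative lockless lookup: the caller must hold rcu_read_lock(). */
static struct sock *example_nulls_lookup(struct hlist_nulls_head *chain,
                                         unsigned int hash)
{
        struct sock *sk;
        const struct hlist_nulls_node *node;

        sk_nulls_for_each_rcu(sk, node, chain) {
                if (sk->sk_hash == hash &&
                    atomic_inc_not_zero(&sk->sk_refcnt))
                        return sk;      /* reference taken; caller does sock_put() */
        }
        /*
         * Real users also compare get_nulls_value(node) against the slot
         * they started from and restart the walk on a mismatch, since a
         * socket can move to another chain during the lockless walk.
         */
        return NULL;
}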
538 | 538 | ||
539 | /* Sock flags */ | 539 | /* Sock flags */ |
540 | enum sock_flags { | 540 | enum sock_flags { |
541 | SOCK_DEAD, | 541 | SOCK_DEAD, |
542 | SOCK_DONE, | 542 | SOCK_DONE, |
543 | SOCK_URGINLINE, | 543 | SOCK_URGINLINE, |
544 | SOCK_KEEPOPEN, | 544 | SOCK_KEEPOPEN, |
545 | SOCK_LINGER, | 545 | SOCK_LINGER, |
546 | SOCK_DESTROY, | 546 | SOCK_DESTROY, |
547 | SOCK_BROADCAST, | 547 | SOCK_BROADCAST, |
548 | SOCK_TIMESTAMP, | 548 | SOCK_TIMESTAMP, |
549 | SOCK_ZAPPED, | 549 | SOCK_ZAPPED, |
550 | SOCK_USE_WRITE_QUEUE, /* whether to call sk->sk_write_space in sock_wfree */ | 550 | SOCK_USE_WRITE_QUEUE, /* whether to call sk->sk_write_space in sock_wfree */ |
551 | SOCK_DBG, /* %SO_DEBUG setting */ | 551 | SOCK_DBG, /* %SO_DEBUG setting */ |
552 | SOCK_RCVTSTAMP, /* %SO_TIMESTAMP setting */ | 552 | SOCK_RCVTSTAMP, /* %SO_TIMESTAMP setting */ |
553 | SOCK_RCVTSTAMPNS, /* %SO_TIMESTAMPNS setting */ | 553 | SOCK_RCVTSTAMPNS, /* %SO_TIMESTAMPNS setting */ |
554 | SOCK_LOCALROUTE, /* route locally only, %SO_DONTROUTE setting */ | 554 | SOCK_LOCALROUTE, /* route locally only, %SO_DONTROUTE setting */ |
555 | SOCK_QUEUE_SHRUNK, /* write queue has been shrunk recently */ | 555 | SOCK_QUEUE_SHRUNK, /* write queue has been shrunk recently */ |
556 | SOCK_TIMESTAMPING_TX_HARDWARE, /* %SOF_TIMESTAMPING_TX_HARDWARE */ | 556 | SOCK_TIMESTAMPING_TX_HARDWARE, /* %SOF_TIMESTAMPING_TX_HARDWARE */ |
557 | SOCK_TIMESTAMPING_TX_SOFTWARE, /* %SOF_TIMESTAMPING_TX_SOFTWARE */ | 557 | SOCK_TIMESTAMPING_TX_SOFTWARE, /* %SOF_TIMESTAMPING_TX_SOFTWARE */ |
558 | SOCK_TIMESTAMPING_RX_HARDWARE, /* %SOF_TIMESTAMPING_RX_HARDWARE */ | 558 | SOCK_TIMESTAMPING_RX_HARDWARE, /* %SOF_TIMESTAMPING_RX_HARDWARE */ |
559 | SOCK_TIMESTAMPING_RX_SOFTWARE, /* %SOF_TIMESTAMPING_RX_SOFTWARE */ | 559 | SOCK_TIMESTAMPING_RX_SOFTWARE, /* %SOF_TIMESTAMPING_RX_SOFTWARE */ |
560 | SOCK_TIMESTAMPING_SOFTWARE, /* %SOF_TIMESTAMPING_SOFTWARE */ | 560 | SOCK_TIMESTAMPING_SOFTWARE, /* %SOF_TIMESTAMPING_SOFTWARE */ |
561 | SOCK_TIMESTAMPING_RAW_HARDWARE, /* %SOF_TIMESTAMPING_RAW_HARDWARE */ | 561 | SOCK_TIMESTAMPING_RAW_HARDWARE, /* %SOF_TIMESTAMPING_RAW_HARDWARE */ |
562 | SOCK_TIMESTAMPING_SYS_HARDWARE, /* %SOF_TIMESTAMPING_SYS_HARDWARE */ | 562 | SOCK_TIMESTAMPING_SYS_HARDWARE, /* %SOF_TIMESTAMPING_SYS_HARDWARE */ |
563 | SOCK_FASYNC, /* fasync() active */ | 563 | SOCK_FASYNC, /* fasync() active */ |
564 | SOCK_RXQ_OVFL, | 564 | SOCK_RXQ_OVFL, |
565 | }; | 565 | }; |
566 | 566 | ||
567 | static inline void sock_copy_flags(struct sock *nsk, struct sock *osk) | 567 | static inline void sock_copy_flags(struct sock *nsk, struct sock *osk) |
568 | { | 568 | { |
569 | nsk->sk_flags = osk->sk_flags; | 569 | nsk->sk_flags = osk->sk_flags; |
570 | } | 570 | } |
571 | 571 | ||
572 | static inline void sock_set_flag(struct sock *sk, enum sock_flags flag) | 572 | static inline void sock_set_flag(struct sock *sk, enum sock_flags flag) |
573 | { | 573 | { |
574 | __set_bit(flag, &sk->sk_flags); | 574 | __set_bit(flag, &sk->sk_flags); |
575 | } | 575 | } |
576 | 576 | ||
577 | static inline void sock_reset_flag(struct sock *sk, enum sock_flags flag) | 577 | static inline void sock_reset_flag(struct sock *sk, enum sock_flags flag) |
578 | { | 578 | { |
579 | __clear_bit(flag, &sk->sk_flags); | 579 | __clear_bit(flag, &sk->sk_flags); |
580 | } | 580 | } |
581 | 581 | ||
582 | static inline int sock_flag(struct sock *sk, enum sock_flags flag) | 582 | static inline int sock_flag(struct sock *sk, enum sock_flags flag) |
583 | { | 583 | { |
584 | return test_bit(flag, &sk->sk_flags); | 584 | return test_bit(flag, &sk->sk_flags); |
585 | } | 585 | } |
586 | 586 | ||
587 | static inline void sk_acceptq_removed(struct sock *sk) | 587 | static inline void sk_acceptq_removed(struct sock *sk) |
588 | { | 588 | { |
589 | sk->sk_ack_backlog--; | 589 | sk->sk_ack_backlog--; |
590 | } | 590 | } |
591 | 591 | ||
592 | static inline void sk_acceptq_added(struct sock *sk) | 592 | static inline void sk_acceptq_added(struct sock *sk) |
593 | { | 593 | { |
594 | sk->sk_ack_backlog++; | 594 | sk->sk_ack_backlog++; |
595 | } | 595 | } |
596 | 596 | ||
597 | static inline int sk_acceptq_is_full(struct sock *sk) | 597 | static inline int sk_acceptq_is_full(struct sock *sk) |
598 | { | 598 | { |
599 | return sk->sk_ack_backlog > sk->sk_max_ack_backlog; | 599 | return sk->sk_ack_backlog > sk->sk_max_ack_backlog; |
600 | } | 600 | } |
601 | 601 | ||
602 | /* | 602 | /* |
603 | * Compute minimal free write space needed to queue new packets. | 603 | * Compute minimal free write space needed to queue new packets. |
604 | */ | 604 | */ |
605 | static inline int sk_stream_min_wspace(struct sock *sk) | 605 | static inline int sk_stream_min_wspace(struct sock *sk) |
606 | { | 606 | { |
607 | return sk->sk_wmem_queued >> 1; | 607 | return sk->sk_wmem_queued >> 1; |
608 | } | 608 | } |
609 | 609 | ||
610 | static inline int sk_stream_wspace(struct sock *sk) | 610 | static inline int sk_stream_wspace(struct sock *sk) |
611 | { | 611 | { |
612 | return sk->sk_sndbuf - sk->sk_wmem_queued; | 612 | return sk->sk_sndbuf - sk->sk_wmem_queued; |
613 | } | 613 | } |
614 | 614 | ||
615 | extern void sk_stream_write_space(struct sock *sk); | 615 | extern void sk_stream_write_space(struct sock *sk); |
616 | 616 | ||
617 | static inline int sk_stream_memory_free(struct sock *sk) | 617 | static inline int sk_stream_memory_free(struct sock *sk) |
618 | { | 618 | { |
619 | return sk->sk_wmem_queued < sk->sk_sndbuf; | 619 | return sk->sk_wmem_queued < sk->sk_sndbuf; |
620 | } | 620 | } |
621 | 621 | ||
622 | /* OOB backlog add */ | 622 | /* OOB backlog add */ |
623 | static inline void __sk_add_backlog(struct sock *sk, struct sk_buff *skb) | 623 | static inline void __sk_add_backlog(struct sock *sk, struct sk_buff *skb) |
624 | { | 624 | { |
625 | /* make sure the skb dst is refcounted: we are about to leave the rcu lock */ | 625 | /* make sure the skb dst is refcounted: we are about to leave the rcu lock */ |
626 | skb_dst_force(skb); | 626 | skb_dst_force(skb); |
627 | 627 | ||
628 | if (!sk->sk_backlog.tail) | 628 | if (!sk->sk_backlog.tail) |
629 | sk->sk_backlog.head = skb; | 629 | sk->sk_backlog.head = skb; |
630 | else | 630 | else |
631 | sk->sk_backlog.tail->next = skb; | 631 | sk->sk_backlog.tail->next = skb; |
632 | 632 | ||
633 | sk->sk_backlog.tail = skb; | 633 | sk->sk_backlog.tail = skb; |
634 | skb->next = NULL; | 634 | skb->next = NULL; |
635 | } | 635 | } |
636 | 636 | ||
637 | /* | 637 | /* |
638 | * Take into account size of receive queue and backlog queue | 638 | * Take into account size of receive queue and backlog queue |
639 | */ | 639 | */ |
640 | static inline bool sk_rcvqueues_full(const struct sock *sk, const struct sk_buff *skb) | 640 | static inline bool sk_rcvqueues_full(const struct sock *sk, const struct sk_buff *skb) |
641 | { | 641 | { |
642 | unsigned int qsize = sk->sk_backlog.len + atomic_read(&sk->sk_rmem_alloc); | 642 | unsigned int qsize = sk->sk_backlog.len + atomic_read(&sk->sk_rmem_alloc); |
643 | 643 | ||
644 | return qsize + skb->truesize > sk->sk_rcvbuf; | 644 | return qsize + skb->truesize > sk->sk_rcvbuf; |
645 | } | 645 | } |
646 | 646 | ||
647 | /* The per-socket spinlock must be held here. */ | 647 | /* The per-socket spinlock must be held here. */ |
648 | static inline __must_check int sk_add_backlog(struct sock *sk, struct sk_buff *skb) | 648 | static inline __must_check int sk_add_backlog(struct sock *sk, struct sk_buff *skb) |
649 | { | 649 | { |
650 | if (sk_rcvqueues_full(sk, skb)) | 650 | if (sk_rcvqueues_full(sk, skb)) |
651 | return -ENOBUFS; | 651 | return -ENOBUFS; |
652 | 652 | ||
653 | __sk_add_backlog(sk, skb); | 653 | __sk_add_backlog(sk, skb); |
654 | sk->sk_backlog.len += skb->truesize; | 654 | sk->sk_backlog.len += skb->truesize; |
655 | return 0; | 655 | return 0; |
656 | } | 656 | } |
657 | 657 | ||
658 | static inline int sk_backlog_rcv(struct sock *sk, struct sk_buff *skb) | 658 | static inline int sk_backlog_rcv(struct sock *sk, struct sk_buff *skb) |
659 | { | 659 | { |
660 | return sk->sk_backlog_rcv(sk, skb); | 660 | return sk->sk_backlog_rcv(sk, skb); |
661 | } | 661 | } |
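sk_add_backlog() and sk_backlog_rcv() are the two halves of the usual receive-path split: a packet arriving in BH context is processed directly when no user context owns the socket, and queued on the backlog otherwise (release_sock() later replays the backlog through sk_backlog_rcv()). A rough sketch of that pattern, with the handler name invented for illustration; bh_lock_sock() and sock_owned_by_user() are defined further down in this header.

#include <net/sock.h>

static int example_protocol_rcv(struct sock *sk, struct sk_buff *skb)
{
        int rc = 0;

        bh_lock_sock(sk);
        if (!sock_owned_by_user(sk)) {
                rc = sk_backlog_rcv(sk, skb);   /* process immediately */
        } else if (sk_add_backlog(sk, skb)) {
                /* backlog limit exceeded: drop the packet */
                bh_unlock_sock(sk);
                kfree_skb(skb);
                return -ENOBUFS;
        }
        bh_unlock_sock(sk);
        return rc;
}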
662 | 662 | ||
663 | static inline void sock_rps_record_flow(const struct sock *sk) | 663 | static inline void sock_rps_record_flow(const struct sock *sk) |
664 | { | 664 | { |
665 | #ifdef CONFIG_RPS | 665 | #ifdef CONFIG_RPS |
666 | struct rps_sock_flow_table *sock_flow_table; | 666 | struct rps_sock_flow_table *sock_flow_table; |
667 | 667 | ||
668 | rcu_read_lock(); | 668 | rcu_read_lock(); |
669 | sock_flow_table = rcu_dereference(rps_sock_flow_table); | 669 | sock_flow_table = rcu_dereference(rps_sock_flow_table); |
670 | rps_record_sock_flow(sock_flow_table, sk->sk_rxhash); | 670 | rps_record_sock_flow(sock_flow_table, sk->sk_rxhash); |
671 | rcu_read_unlock(); | 671 | rcu_read_unlock(); |
672 | #endif | 672 | #endif |
673 | } | 673 | } |
674 | 674 | ||
675 | static inline void sock_rps_reset_flow(const struct sock *sk) | 675 | static inline void sock_rps_reset_flow(const struct sock *sk) |
676 | { | 676 | { |
677 | #ifdef CONFIG_RPS | 677 | #ifdef CONFIG_RPS |
678 | struct rps_sock_flow_table *sock_flow_table; | 678 | struct rps_sock_flow_table *sock_flow_table; |
679 | 679 | ||
680 | rcu_read_lock(); | 680 | rcu_read_lock(); |
681 | sock_flow_table = rcu_dereference(rps_sock_flow_table); | 681 | sock_flow_table = rcu_dereference(rps_sock_flow_table); |
682 | rps_reset_sock_flow(sock_flow_table, sk->sk_rxhash); | 682 | rps_reset_sock_flow(sock_flow_table, sk->sk_rxhash); |
683 | rcu_read_unlock(); | 683 | rcu_read_unlock(); |
684 | #endif | 684 | #endif |
685 | } | 685 | } |
686 | 686 | ||
687 | static inline void sock_rps_save_rxhash(struct sock *sk, u32 rxhash) | 687 | static inline void sock_rps_save_rxhash(struct sock *sk, u32 rxhash) |
688 | { | 688 | { |
689 | #ifdef CONFIG_RPS | 689 | #ifdef CONFIG_RPS |
690 | if (unlikely(sk->sk_rxhash != rxhash)) { | 690 | if (unlikely(sk->sk_rxhash != rxhash)) { |
691 | sock_rps_reset_flow(sk); | 691 | sock_rps_reset_flow(sk); |
692 | sk->sk_rxhash = rxhash; | 692 | sk->sk_rxhash = rxhash; |
693 | } | 693 | } |
694 | #endif | 694 | #endif |
695 | } | 695 | } |
696 | 696 | ||
697 | #define sk_wait_event(__sk, __timeo, __condition) \ | 697 | #define sk_wait_event(__sk, __timeo, __condition) \ |
698 | ({ int __rc; \ | 698 | ({ int __rc; \ |
699 | release_sock(__sk); \ | 699 | release_sock(__sk); \ |
700 | __rc = __condition; \ | 700 | __rc = __condition; \ |
701 | if (!__rc) { \ | 701 | if (!__rc) { \ |
702 | *(__timeo) = schedule_timeout(*(__timeo)); \ | 702 | *(__timeo) = schedule_timeout(*(__timeo)); \ |
703 | } \ | 703 | } \ |
704 | lock_sock(__sk); \ | 704 | lock_sock(__sk); \ |
705 | __rc = __condition; \ | 705 | __rc = __condition; \ |
706 | __rc; \ | 706 | __rc; \ |
707 | }) | 707 | }) |
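sk_wait_event() drops the socket lock around schedule_timeout() and re-evaluates the condition once the lock is re-taken, so callers only need to park themselves on the socket's wait queue first. A minimal sketch of the usual calling pattern; the function name and the wake-up condition are illustrative, and sk_sleep() is defined further down in this header.

#include <linux/wait.h>
#include <net/sock.h>

/* The caller holds the socket lock (lock_sock()) and passes its timeout. */
static int example_wait_for_data(struct sock *sk, long *timeo)
{
        DEFINE_WAIT(wait);
        int rc;

        prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
        rc = sk_wait_event(sk, timeo,
                           !skb_queue_empty(&sk->sk_receive_queue));
        finish_wait(sk_sleep(sk), &wait);
        return rc;      /* non-zero once the condition became true */
}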
708 | 708 | ||
709 | extern int sk_stream_wait_connect(struct sock *sk, long *timeo_p); | 709 | extern int sk_stream_wait_connect(struct sock *sk, long *timeo_p); |
710 | extern int sk_stream_wait_memory(struct sock *sk, long *timeo_p); | 710 | extern int sk_stream_wait_memory(struct sock *sk, long *timeo_p); |
711 | extern void sk_stream_wait_close(struct sock *sk, long timeo_p); | 711 | extern void sk_stream_wait_close(struct sock *sk, long timeo_p); |
712 | extern int sk_stream_error(struct sock *sk, int flags, int err); | 712 | extern int sk_stream_error(struct sock *sk, int flags, int err); |
713 | extern void sk_stream_kill_queues(struct sock *sk); | 713 | extern void sk_stream_kill_queues(struct sock *sk); |
714 | 714 | ||
715 | extern int sk_wait_data(struct sock *sk, long *timeo); | 715 | extern int sk_wait_data(struct sock *sk, long *timeo); |
716 | 716 | ||
717 | struct request_sock_ops; | 717 | struct request_sock_ops; |
718 | struct timewait_sock_ops; | 718 | struct timewait_sock_ops; |
719 | struct inet_hashinfo; | 719 | struct inet_hashinfo; |
720 | struct raw_hashinfo; | 720 | struct raw_hashinfo; |
721 | 721 | ||
722 | /* Networking protocol blocks we attach to sockets. | 722 | /* Networking protocol blocks we attach to sockets. |
723 | * socket layer -> transport layer interface | 723 | * socket layer -> transport layer interface |
724 | * transport -> network interface is defined by struct inet_proto | 724 | * transport -> network interface is defined by struct inet_proto |
725 | */ | 725 | */ |
726 | struct proto { | 726 | struct proto { |
727 | void (*close)(struct sock *sk, | 727 | void (*close)(struct sock *sk, |
728 | long timeout); | 728 | long timeout); |
729 | int (*connect)(struct sock *sk, | 729 | int (*connect)(struct sock *sk, |
730 | struct sockaddr *uaddr, | 730 | struct sockaddr *uaddr, |
731 | int addr_len); | 731 | int addr_len); |
732 | int (*disconnect)(struct sock *sk, int flags); | 732 | int (*disconnect)(struct sock *sk, int flags); |
733 | 733 | ||
734 | struct sock * (*accept) (struct sock *sk, int flags, int *err); | 734 | struct sock * (*accept) (struct sock *sk, int flags, int *err); |
735 | 735 | ||
736 | int (*ioctl)(struct sock *sk, int cmd, | 736 | int (*ioctl)(struct sock *sk, int cmd, |
737 | unsigned long arg); | 737 | unsigned long arg); |
738 | int (*init)(struct sock *sk); | 738 | int (*init)(struct sock *sk); |
739 | void (*destroy)(struct sock *sk); | 739 | void (*destroy)(struct sock *sk); |
740 | void (*shutdown)(struct sock *sk, int how); | 740 | void (*shutdown)(struct sock *sk, int how); |
741 | int (*setsockopt)(struct sock *sk, int level, | 741 | int (*setsockopt)(struct sock *sk, int level, |
742 | int optname, char __user *optval, | 742 | int optname, char __user *optval, |
743 | unsigned int optlen); | 743 | unsigned int optlen); |
744 | int (*getsockopt)(struct sock *sk, int level, | 744 | int (*getsockopt)(struct sock *sk, int level, |
745 | int optname, char __user *optval, | 745 | int optname, char __user *optval, |
746 | int __user *option); | 746 | int __user *option); |
747 | #ifdef CONFIG_COMPAT | 747 | #ifdef CONFIG_COMPAT |
748 | int (*compat_setsockopt)(struct sock *sk, | 748 | int (*compat_setsockopt)(struct sock *sk, |
749 | int level, | 749 | int level, |
750 | int optname, char __user *optval, | 750 | int optname, char __user *optval, |
751 | unsigned int optlen); | 751 | unsigned int optlen); |
752 | int (*compat_getsockopt)(struct sock *sk, | 752 | int (*compat_getsockopt)(struct sock *sk, |
753 | int level, | 753 | int level, |
754 | int optname, char __user *optval, | 754 | int optname, char __user *optval, |
755 | int __user *option); | 755 | int __user *option); |
756 | int (*compat_ioctl)(struct sock *sk, | 756 | int (*compat_ioctl)(struct sock *sk, |
757 | unsigned int cmd, unsigned long arg); | 757 | unsigned int cmd, unsigned long arg); |
758 | #endif | 758 | #endif |
759 | int (*sendmsg)(struct kiocb *iocb, struct sock *sk, | 759 | int (*sendmsg)(struct kiocb *iocb, struct sock *sk, |
760 | struct msghdr *msg, size_t len); | 760 | struct msghdr *msg, size_t len); |
761 | int (*recvmsg)(struct kiocb *iocb, struct sock *sk, | 761 | int (*recvmsg)(struct kiocb *iocb, struct sock *sk, |
762 | struct msghdr *msg, | 762 | struct msghdr *msg, |
763 | size_t len, int noblock, int flags, | 763 | size_t len, int noblock, int flags, |
764 | int *addr_len); | 764 | int *addr_len); |
765 | int (*sendpage)(struct sock *sk, struct page *page, | 765 | int (*sendpage)(struct sock *sk, struct page *page, |
766 | int offset, size_t size, int flags); | 766 | int offset, size_t size, int flags); |
767 | int (*bind)(struct sock *sk, | 767 | int (*bind)(struct sock *sk, |
768 | struct sockaddr *uaddr, int addr_len); | 768 | struct sockaddr *uaddr, int addr_len); |
769 | 769 | ||
770 | int (*backlog_rcv) (struct sock *sk, | 770 | int (*backlog_rcv) (struct sock *sk, |
771 | struct sk_buff *skb); | 771 | struct sk_buff *skb); |
772 | 772 | ||
773 | /* Keeping track of sk's, looking them up, and port selection methods. */ | 773 | /* Keeping track of sk's, looking them up, and port selection methods. */ |
774 | void (*hash)(struct sock *sk); | 774 | void (*hash)(struct sock *sk); |
775 | void (*unhash)(struct sock *sk); | 775 | void (*unhash)(struct sock *sk); |
776 | void (*rehash)(struct sock *sk); | 776 | void (*rehash)(struct sock *sk); |
777 | int (*get_port)(struct sock *sk, unsigned short snum); | 777 | int (*get_port)(struct sock *sk, unsigned short snum); |
778 | void (*clear_sk)(struct sock *sk, int size); | 778 | void (*clear_sk)(struct sock *sk, int size); |
779 | 779 | ||
780 | /* Keeping track of sockets in use */ | 780 | /* Keeping track of sockets in use */ |
781 | #ifdef CONFIG_PROC_FS | 781 | #ifdef CONFIG_PROC_FS |
782 | unsigned int inuse_idx; | 782 | unsigned int inuse_idx; |
783 | #endif | 783 | #endif |
784 | 784 | ||
785 | /* Memory pressure */ | 785 | /* Memory pressure */ |
786 | void (*enter_memory_pressure)(struct sock *sk); | 786 | void (*enter_memory_pressure)(struct sock *sk); |
787 | atomic_long_t *memory_allocated; /* Current allocated memory. */ | 787 | atomic_long_t *memory_allocated; /* Current allocated memory. */ |
788 | struct percpu_counter *sockets_allocated; /* Current number of sockets. */ | 788 | struct percpu_counter *sockets_allocated; /* Current number of sockets. */ |
789 | /* | 789 | /* |
790 | * Pressure flag: try to collapse. | 790 | * Pressure flag: try to collapse. |
791 | * Technical note: it is used by multiple contexts non-atomically. | 791 | * Technical note: it is used by multiple contexts non-atomically. |
792 | * All of __sk_mem_schedule() is of this nature: accounting | 792 | * All of __sk_mem_schedule() is of this nature: accounting |
793 | * is strict, actions are advisory and have some latency. | 793 | * is strict, actions are advisory and have some latency. |
794 | */ | 794 | */ |
795 | int *memory_pressure; | 795 | int *memory_pressure; |
796 | long *sysctl_mem; | 796 | long *sysctl_mem; |
797 | int *sysctl_wmem; | 797 | int *sysctl_wmem; |
798 | int *sysctl_rmem; | 798 | int *sysctl_rmem; |
799 | int max_header; | 799 | int max_header; |
800 | bool no_autobind; | 800 | bool no_autobind; |
801 | 801 | ||
802 | struct kmem_cache *slab; | 802 | struct kmem_cache *slab; |
803 | unsigned int obj_size; | 803 | unsigned int obj_size; |
804 | int slab_flags; | 804 | int slab_flags; |
805 | 805 | ||
806 | struct percpu_counter *orphan_count; | 806 | struct percpu_counter *orphan_count; |
807 | 807 | ||
808 | struct request_sock_ops *rsk_prot; | 808 | struct request_sock_ops *rsk_prot; |
809 | struct timewait_sock_ops *twsk_prot; | 809 | struct timewait_sock_ops *twsk_prot; |
810 | 810 | ||
811 | union { | 811 | union { |
812 | struct inet_hashinfo *hashinfo; | 812 | struct inet_hashinfo *hashinfo; |
813 | struct udp_table *udp_table; | 813 | struct udp_table *udp_table; |
814 | struct raw_hashinfo *raw_hash; | 814 | struct raw_hashinfo *raw_hash; |
815 | } h; | 815 | } h; |
816 | 816 | ||
817 | struct module *owner; | 817 | struct module *owner; |
818 | 818 | ||
819 | char name[32]; | 819 | char name[32]; |
820 | 820 | ||
821 | struct list_head node; | 821 | struct list_head node; |
822 | #ifdef SOCK_REFCNT_DEBUG | 822 | #ifdef SOCK_REFCNT_DEBUG |
823 | atomic_t socks; | 823 | atomic_t socks; |
824 | #endif | 824 | #endif |
825 | }; | 825 | }; |
826 | 826 | ||
827 | extern int proto_register(struct proto *prot, int alloc_slab); | 827 | extern int proto_register(struct proto *prot, int alloc_slab); |
828 | extern void proto_unregister(struct proto *prot); | 828 | extern void proto_unregister(struct proto *prot); |
829 | 829 | ||
830 | #ifdef SOCK_REFCNT_DEBUG | 830 | #ifdef SOCK_REFCNT_DEBUG |
831 | static inline void sk_refcnt_debug_inc(struct sock *sk) | 831 | static inline void sk_refcnt_debug_inc(struct sock *sk) |
832 | { | 832 | { |
833 | atomic_inc(&sk->sk_prot->socks); | 833 | atomic_inc(&sk->sk_prot->socks); |
834 | } | 834 | } |
835 | 835 | ||
836 | static inline void sk_refcnt_debug_dec(struct sock *sk) | 836 | static inline void sk_refcnt_debug_dec(struct sock *sk) |
837 | { | 837 | { |
838 | atomic_dec(&sk->sk_prot->socks); | 838 | atomic_dec(&sk->sk_prot->socks); |
839 | printk(KERN_DEBUG "%s socket %p released, %d are still alive\n", | 839 | printk(KERN_DEBUG "%s socket %p released, %d are still alive\n", |
840 | sk->sk_prot->name, sk, atomic_read(&sk->sk_prot->socks)); | 840 | sk->sk_prot->name, sk, atomic_read(&sk->sk_prot->socks)); |
841 | } | 841 | } |
842 | 842 | ||
843 | static inline void sk_refcnt_debug_release(const struct sock *sk) | 843 | static inline void sk_refcnt_debug_release(const struct sock *sk) |
844 | { | 844 | { |
845 | if (atomic_read(&sk->sk_refcnt) != 1) | 845 | if (atomic_read(&sk->sk_refcnt) != 1) |
846 | printk(KERN_DEBUG "Destruction of the %s socket %p delayed, refcnt=%d\n", | 846 | printk(KERN_DEBUG "Destruction of the %s socket %p delayed, refcnt=%d\n", |
847 | sk->sk_prot->name, sk, atomic_read(&sk->sk_refcnt)); | 847 | sk->sk_prot->name, sk, atomic_read(&sk->sk_refcnt)); |
848 | } | 848 | } |
849 | #else /* SOCK_REFCNT_DEBUG */ | 849 | #else /* SOCK_REFCNT_DEBUG */ |
850 | #define sk_refcnt_debug_inc(sk) do { } while (0) | 850 | #define sk_refcnt_debug_inc(sk) do { } while (0) |
851 | #define sk_refcnt_debug_dec(sk) do { } while (0) | 851 | #define sk_refcnt_debug_dec(sk) do { } while (0) |
852 | #define sk_refcnt_debug_release(sk) do { } while (0) | 852 | #define sk_refcnt_debug_release(sk) do { } while (0) |
853 | #endif /* SOCK_REFCNT_DEBUG */ | 853 | #endif /* SOCK_REFCNT_DEBUG */ |
854 | 854 | ||
855 | 855 | ||
856 | #ifdef CONFIG_PROC_FS | 856 | #ifdef CONFIG_PROC_FS |
857 | /* Called with local bh disabled */ | 857 | /* Called with local bh disabled */ |
858 | extern void sock_prot_inuse_add(struct net *net, struct proto *prot, int inc); | 858 | extern void sock_prot_inuse_add(struct net *net, struct proto *prot, int inc); |
859 | extern int sock_prot_inuse_get(struct net *net, struct proto *proto); | 859 | extern int sock_prot_inuse_get(struct net *net, struct proto *proto); |
860 | #else | 860 | #else |
861 | static inline void sock_prot_inuse_add(struct net *net, struct proto *prot, | 861 | static inline void sock_prot_inuse_add(struct net *net, struct proto *prot, |
862 | int inc) | 862 | int inc) |
863 | { | 863 | { |
864 | } | 864 | } |
865 | #endif | 865 | #endif |
866 | 866 | ||
867 | 867 | ||
868 | /* With per-bucket locks this operation is not atomic, so | 868 | /* With per-bucket locks this operation is not atomic, so |
869 | * this version is no worse. | 869 | * this version is no worse. |
870 | */ | 870 | */ |
871 | static inline void __sk_prot_rehash(struct sock *sk) | 871 | static inline void __sk_prot_rehash(struct sock *sk) |
872 | { | 872 | { |
873 | sk->sk_prot->unhash(sk); | 873 | sk->sk_prot->unhash(sk); |
874 | sk->sk_prot->hash(sk); | 874 | sk->sk_prot->hash(sk); |
875 | } | 875 | } |
876 | 876 | ||
877 | void sk_prot_clear_portaddr_nulls(struct sock *sk, int size); | 877 | void sk_prot_clear_portaddr_nulls(struct sock *sk, int size); |
878 | 878 | ||
879 | /* About 10 seconds */ | 879 | /* About 10 seconds */ |
880 | #define SOCK_DESTROY_TIME (10*HZ) | 880 | #define SOCK_DESTROY_TIME (10*HZ) |
881 | 881 | ||
882 | /* Sockets 0-1023 can't be bound to unless you are superuser */ | 882 | /* Sockets 0-1023 can't be bound to unless you are superuser */ |
883 | #define PROT_SOCK 1024 | 883 | #define PROT_SOCK 1024 |
884 | 884 | ||
885 | #define SHUTDOWN_MASK 3 | 885 | #define SHUTDOWN_MASK 3 |
886 | #define RCV_SHUTDOWN 1 | 886 | #define RCV_SHUTDOWN 1 |
887 | #define SEND_SHUTDOWN 2 | 887 | #define SEND_SHUTDOWN 2 |
888 | 888 | ||
889 | #define SOCK_SNDBUF_LOCK 1 | 889 | #define SOCK_SNDBUF_LOCK 1 |
890 | #define SOCK_RCVBUF_LOCK 2 | 890 | #define SOCK_RCVBUF_LOCK 2 |
891 | #define SOCK_BINDADDR_LOCK 4 | 891 | #define SOCK_BINDADDR_LOCK 4 |
892 | #define SOCK_BINDPORT_LOCK 8 | 892 | #define SOCK_BINDPORT_LOCK 8 |
893 | 893 | ||
894 | /* sock_iocb: used to kick off async processing of socket ios */ | 894 | /* sock_iocb: used to kick off async processing of socket ios */ |
895 | struct sock_iocb { | 895 | struct sock_iocb { |
896 | struct list_head list; | 896 | struct list_head list; |
897 | 897 | ||
898 | int flags; | 898 | int flags; |
899 | int size; | 899 | int size; |
900 | struct socket *sock; | 900 | struct socket *sock; |
901 | struct sock *sk; | 901 | struct sock *sk; |
902 | struct scm_cookie *scm; | 902 | struct scm_cookie *scm; |
903 | struct msghdr *msg, async_msg; | 903 | struct msghdr *msg, async_msg; |
904 | struct kiocb *kiocb; | 904 | struct kiocb *kiocb; |
905 | }; | 905 | }; |
906 | 906 | ||
907 | static inline struct sock_iocb *kiocb_to_siocb(struct kiocb *iocb) | 907 | static inline struct sock_iocb *kiocb_to_siocb(struct kiocb *iocb) |
908 | { | 908 | { |
909 | return (struct sock_iocb *)iocb->private; | 909 | return (struct sock_iocb *)iocb->private; |
910 | } | 910 | } |
911 | 911 | ||
912 | static inline struct kiocb *siocb_to_kiocb(struct sock_iocb *si) | 912 | static inline struct kiocb *siocb_to_kiocb(struct sock_iocb *si) |
913 | { | 913 | { |
914 | return si->kiocb; | 914 | return si->kiocb; |
915 | } | 915 | } |
916 | 916 | ||
917 | struct socket_alloc { | 917 | struct socket_alloc { |
918 | struct socket socket; | 918 | struct socket socket; |
919 | struct inode vfs_inode; | 919 | struct inode vfs_inode; |
920 | }; | 920 | }; |
921 | 921 | ||
922 | static inline struct socket *SOCKET_I(struct inode *inode) | 922 | static inline struct socket *SOCKET_I(struct inode *inode) |
923 | { | 923 | { |
924 | return &container_of(inode, struct socket_alloc, vfs_inode)->socket; | 924 | return &container_of(inode, struct socket_alloc, vfs_inode)->socket; |
925 | } | 925 | } |
926 | 926 | ||
927 | static inline struct inode *SOCK_INODE(struct socket *socket) | 927 | static inline struct inode *SOCK_INODE(struct socket *socket) |
928 | { | 928 | { |
929 | return &container_of(socket, struct socket_alloc, socket)->vfs_inode; | 929 | return &container_of(socket, struct socket_alloc, socket)->vfs_inode; |
930 | } | 930 | } |
931 | 931 | ||
932 | /* | 932 | /* |
933 | * Functions for memory accounting | 933 | * Functions for memory accounting |
934 | */ | 934 | */ |
935 | extern int __sk_mem_schedule(struct sock *sk, int size, int kind); | 935 | extern int __sk_mem_schedule(struct sock *sk, int size, int kind); |
936 | extern void __sk_mem_reclaim(struct sock *sk); | 936 | extern void __sk_mem_reclaim(struct sock *sk); |
937 | 937 | ||
938 | #define SK_MEM_QUANTUM ((int)PAGE_SIZE) | 938 | #define SK_MEM_QUANTUM ((int)PAGE_SIZE) |
939 | #define SK_MEM_QUANTUM_SHIFT ilog2(SK_MEM_QUANTUM) | 939 | #define SK_MEM_QUANTUM_SHIFT ilog2(SK_MEM_QUANTUM) |
940 | #define SK_MEM_SEND 0 | 940 | #define SK_MEM_SEND 0 |
941 | #define SK_MEM_RECV 1 | 941 | #define SK_MEM_RECV 1 |
942 | 942 | ||
943 | static inline int sk_mem_pages(int amt) | 943 | static inline int sk_mem_pages(int amt) |
944 | { | 944 | { |
945 | return (amt + SK_MEM_QUANTUM - 1) >> SK_MEM_QUANTUM_SHIFT; | 945 | return (amt + SK_MEM_QUANTUM - 1) >> SK_MEM_QUANTUM_SHIFT; |
946 | } | 946 | } |
947 | 947 | ||
948 | static inline int sk_has_account(struct sock *sk) | 948 | static inline int sk_has_account(struct sock *sk) |
949 | { | 949 | { |
950 | /* return true if protocol supports memory accounting */ | 950 | /* return true if protocol supports memory accounting */ |
951 | return !!sk->sk_prot->memory_allocated; | 951 | return !!sk->sk_prot->memory_allocated; |
952 | } | 952 | } |
953 | 953 | ||
954 | static inline int sk_wmem_schedule(struct sock *sk, int size) | 954 | static inline int sk_wmem_schedule(struct sock *sk, int size) |
955 | { | 955 | { |
956 | if (!sk_has_account(sk)) | 956 | if (!sk_has_account(sk)) |
957 | return 1; | 957 | return 1; |
958 | return size <= sk->sk_forward_alloc || | 958 | return size <= sk->sk_forward_alloc || |
959 | __sk_mem_schedule(sk, size, SK_MEM_SEND); | 959 | __sk_mem_schedule(sk, size, SK_MEM_SEND); |
960 | } | 960 | } |
961 | 961 | ||
962 | static inline int sk_rmem_schedule(struct sock *sk, int size) | 962 | static inline int sk_rmem_schedule(struct sock *sk, int size) |
963 | { | 963 | { |
964 | if (!sk_has_account(sk)) | 964 | if (!sk_has_account(sk)) |
965 | return 1; | 965 | return 1; |
966 | return size <= sk->sk_forward_alloc || | 966 | return size <= sk->sk_forward_alloc || |
967 | __sk_mem_schedule(sk, size, SK_MEM_RECV); | 967 | __sk_mem_schedule(sk, size, SK_MEM_RECV); |
968 | } | 968 | } |
969 | 969 | ||
970 | static inline void sk_mem_reclaim(struct sock *sk) | 970 | static inline void sk_mem_reclaim(struct sock *sk) |
971 | { | 971 | { |
972 | if (!sk_has_account(sk)) | 972 | if (!sk_has_account(sk)) |
973 | return; | 973 | return; |
974 | if (sk->sk_forward_alloc >= SK_MEM_QUANTUM) | 974 | if (sk->sk_forward_alloc >= SK_MEM_QUANTUM) |
975 | __sk_mem_reclaim(sk); | 975 | __sk_mem_reclaim(sk); |
976 | } | 976 | } |
977 | 977 | ||
978 | static inline void sk_mem_reclaim_partial(struct sock *sk) | 978 | static inline void sk_mem_reclaim_partial(struct sock *sk) |
979 | { | 979 | { |
980 | if (!sk_has_account(sk)) | 980 | if (!sk_has_account(sk)) |
981 | return; | 981 | return; |
982 | if (sk->sk_forward_alloc > SK_MEM_QUANTUM) | 982 | if (sk->sk_forward_alloc > SK_MEM_QUANTUM) |
983 | __sk_mem_reclaim(sk); | 983 | __sk_mem_reclaim(sk); |
984 | } | 984 | } |
985 | 985 | ||
986 | static inline void sk_mem_charge(struct sock *sk, int size) | 986 | static inline void sk_mem_charge(struct sock *sk, int size) |
987 | { | 987 | { |
988 | if (!sk_has_account(sk)) | 988 | if (!sk_has_account(sk)) |
989 | return; | 989 | return; |
990 | sk->sk_forward_alloc -= size; | 990 | sk->sk_forward_alloc -= size; |
991 | } | 991 | } |
992 | 992 | ||
993 | static inline void sk_mem_uncharge(struct sock *sk, int size) | 993 | static inline void sk_mem_uncharge(struct sock *sk, int size) |
994 | { | 994 | { |
995 | if (!sk_has_account(sk)) | 995 | if (!sk_has_account(sk)) |
996 | return; | 996 | return; |
997 | sk->sk_forward_alloc += size; | 997 | sk->sk_forward_alloc += size; |
998 | } | 998 | } |
999 | 999 | ||
1000 | static inline void sk_wmem_free_skb(struct sock *sk, struct sk_buff *skb) | 1000 | static inline void sk_wmem_free_skb(struct sock *sk, struct sk_buff *skb) |
1001 | { | 1001 | { |
1002 | sock_set_flag(sk, SOCK_QUEUE_SHRUNK); | 1002 | sock_set_flag(sk, SOCK_QUEUE_SHRUNK); |
1003 | sk->sk_wmem_queued -= skb->truesize; | 1003 | sk->sk_wmem_queued -= skb->truesize; |
1004 | sk_mem_uncharge(sk, skb->truesize); | 1004 | sk_mem_uncharge(sk, skb->truesize); |
1005 | __kfree_skb(skb); | 1005 | __kfree_skb(skb); |
1006 | } | 1006 | } |
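Together these helpers implement per-socket memory accounting on top of the protocol-wide memory_allocated counter: the *_schedule() helpers reserve forward-alloc quota in SK_MEM_QUANTUM chunks, charge/uncharge track per-skb usage, and reclaim returns surplus quota. A simplified receive-side sketch, loosely modeled on sock_queue_rcv_skb(); the function name is invented, and skb_set_owner_r(), defined further down in this header, performs the sk_mem_charge() call.

#include <net/sock.h>

static int example_queue_rcv(struct sock *sk, struct sk_buff *skb)
{
        /* Reserve accounting quota for this skb; this is a no-op when the
         * protocol does not account memory (sk_has_account() is false). */
        if (!sk_rmem_schedule(sk, skb->truesize))
                return -ENOBUFS;

        skb_set_owner_r(skb, sk);       /* charges sk and sets sock_rfree */
        skb_queue_tail(&sk->sk_receive_queue, skb);

        if (!sock_flag(sk, SOCK_DEAD))
                sk->sk_data_ready(sk, skb->len);
        return 0;
}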
1007 | 1007 | ||
1008 | /* Used by processes to "lock" a socket state, so that | 1008 | /* Used by processes to "lock" a socket state, so that |
1009 | * interrupts and bottom half handlers won't change it | 1009 | * interrupts and bottom half handlers won't change it |
1010 | * from under us. It essentially blocks any incoming | 1010 | * from under us. It essentially blocks any incoming |
1011 | * packets, so that we won't get any new data or any | 1011 | * packets, so that we won't get any new data or any |
1012 | * packets that change the state of the socket. | 1012 | * packets that change the state of the socket. |
1013 | * | 1013 | * |
1014 | * While locked, BH processing will add new packets to | 1014 | * While locked, BH processing will add new packets to |
1015 | * the backlog queue. This queue is processed by the | 1015 | * the backlog queue. This queue is processed by the |
1016 | * owner of the socket lock right before it is released. | 1016 | * owner of the socket lock right before it is released. |
1017 | * | 1017 | * |
1018 | * Since ~2.3.5 it has also been an exclusive sleep lock serializing | 1018 | * Since ~2.3.5 it has also been an exclusive sleep lock serializing |
1019 | * accesses from user process context. | 1019 | * accesses from user process context. |
1020 | */ | 1020 | */ |
1021 | #define sock_owned_by_user(sk) ((sk)->sk_lock.owned) | 1021 | #define sock_owned_by_user(sk) ((sk)->sk_lock.owned) |
1022 | 1022 | ||
1023 | /* | 1023 | /* |
1024 | * Macro so as to not evaluate some arguments when | 1024 | * Macro so as to not evaluate some arguments when |
1025 | * lockdep is not enabled. | 1025 | * lockdep is not enabled. |
1026 | * | 1026 | * |
1027 | * Mark both the sk_lock and the sk_lock.slock as a | 1027 | * Mark both the sk_lock and the sk_lock.slock as a |
1028 | * per-address-family lock class. | 1028 | * per-address-family lock class. |
1029 | */ | 1029 | */ |
1030 | #define sock_lock_init_class_and_name(sk, sname, skey, name, key) \ | 1030 | #define sock_lock_init_class_and_name(sk, sname, skey, name, key) \ |
1031 | do { \ | 1031 | do { \ |
1032 | sk->sk_lock.owned = 0; \ | 1032 | sk->sk_lock.owned = 0; \ |
1033 | init_waitqueue_head(&sk->sk_lock.wq); \ | 1033 | init_waitqueue_head(&sk->sk_lock.wq); \ |
1034 | spin_lock_init(&(sk)->sk_lock.slock); \ | 1034 | spin_lock_init(&(sk)->sk_lock.slock); \ |
1035 | debug_check_no_locks_freed((void *)&(sk)->sk_lock, \ | 1035 | debug_check_no_locks_freed((void *)&(sk)->sk_lock, \ |
1036 | sizeof((sk)->sk_lock)); \ | 1036 | sizeof((sk)->sk_lock)); \ |
1037 | lockdep_set_class_and_name(&(sk)->sk_lock.slock, \ | 1037 | lockdep_set_class_and_name(&(sk)->sk_lock.slock, \ |
1038 | (skey), (sname)); \ | 1038 | (skey), (sname)); \ |
1039 | lockdep_init_map(&(sk)->sk_lock.dep_map, (name), (key), 0); \ | 1039 | lockdep_init_map(&(sk)->sk_lock.dep_map, (name), (key), 0); \ |
1040 | } while (0) | 1040 | } while (0) |
1041 | 1041 | ||
1042 | extern void lock_sock_nested(struct sock *sk, int subclass); | 1042 | extern void lock_sock_nested(struct sock *sk, int subclass); |
1043 | 1043 | ||
1044 | static inline void lock_sock(struct sock *sk) | 1044 | static inline void lock_sock(struct sock *sk) |
1045 | { | 1045 | { |
1046 | lock_sock_nested(sk, 0); | 1046 | lock_sock_nested(sk, 0); |
1047 | } | 1047 | } |
1048 | 1048 | ||
1049 | extern void release_sock(struct sock *sk); | 1049 | extern void release_sock(struct sock *sk); |
1050 | 1050 | ||
1051 | /* BH context may only use the following locking interface. */ | 1051 | /* BH context may only use the following locking interface. */ |
1052 | #define bh_lock_sock(__sk) spin_lock(&((__sk)->sk_lock.slock)) | 1052 | #define bh_lock_sock(__sk) spin_lock(&((__sk)->sk_lock.slock)) |
1053 | #define bh_lock_sock_nested(__sk) \ | 1053 | #define bh_lock_sock_nested(__sk) \ |
1054 | spin_lock_nested(&((__sk)->sk_lock.slock), \ | 1054 | spin_lock_nested(&((__sk)->sk_lock.slock), \ |
1055 | SINGLE_DEPTH_NESTING) | 1055 | SINGLE_DEPTH_NESTING) |
1056 | #define bh_unlock_sock(__sk) spin_unlock(&((__sk)->sk_lock.slock)) | 1056 | #define bh_unlock_sock(__sk) spin_unlock(&((__sk)->sk_lock.slock)) |
1057 | 1057 | ||
1058 | extern bool lock_sock_fast(struct sock *sk); | 1058 | extern bool lock_sock_fast(struct sock *sk); |
1059 | /** | 1059 | /** |
1060 | * unlock_sock_fast - complement of lock_sock_fast | 1060 | * unlock_sock_fast - complement of lock_sock_fast |
1061 | * @sk: socket | 1061 | * @sk: socket |
1062 | * @slow: slow mode | 1062 | * @slow: slow mode |
1063 | * | 1063 | * |
1064 | * fast unlock socket for user context. | 1064 | * fast unlock socket for user context. |
1065 | * If slow mode is on, we call regular release_sock() | 1065 | * If slow mode is on, we call regular release_sock() |
1066 | */ | 1066 | */ |
1067 | static inline void unlock_sock_fast(struct sock *sk, bool slow) | 1067 | static inline void unlock_sock_fast(struct sock *sk, bool slow) |
1068 | { | 1068 | { |
1069 | if (slow) | 1069 | if (slow) |
1070 | release_sock(sk); | 1070 | release_sock(sk); |
1071 | else | 1071 | else |
1072 | spin_unlock_bh(&sk->sk_lock.slock); | 1072 | spin_unlock_bh(&sk->sk_lock.slock); |
1073 | } | 1073 | } |
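lock_sock_fast() returns true when it had to fall back to the full socket lock, and that bool must be handed back to unlock_sock_fast() so the matching unlock path (release_sock() versus spin_unlock_bh()) is taken. A minimal usage sketch with an invented function name:

#include <net/sock.h>

static void example_touch_state(struct sock *sk)
{
        bool slow = lock_sock_fast(sk);

        /* ... modify state that both process and BH context touch ... */

        unlock_sock_fast(sk, slow);     /* release_sock() or spin_unlock_bh() */
}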
1074 | 1074 | ||
1075 | 1075 | ||
1076 | extern struct sock *sk_alloc(struct net *net, int family, | 1076 | extern struct sock *sk_alloc(struct net *net, int family, |
1077 | gfp_t priority, | 1077 | gfp_t priority, |
1078 | struct proto *prot); | 1078 | struct proto *prot); |
1079 | extern void sk_free(struct sock *sk); | 1079 | extern void sk_free(struct sock *sk); |
1080 | extern void sk_release_kernel(struct sock *sk); | 1080 | extern void sk_release_kernel(struct sock *sk); |
1081 | extern struct sock *sk_clone(const struct sock *sk, | 1081 | extern struct sock *sk_clone(const struct sock *sk, |
1082 | const gfp_t priority); | 1082 | const gfp_t priority); |
1083 | 1083 | ||
1084 | extern struct sk_buff *sock_wmalloc(struct sock *sk, | 1084 | extern struct sk_buff *sock_wmalloc(struct sock *sk, |
1085 | unsigned long size, int force, | 1085 | unsigned long size, int force, |
1086 | gfp_t priority); | 1086 | gfp_t priority); |
1087 | extern struct sk_buff *sock_rmalloc(struct sock *sk, | 1087 | extern struct sk_buff *sock_rmalloc(struct sock *sk, |
1088 | unsigned long size, int force, | 1088 | unsigned long size, int force, |
1089 | gfp_t priority); | 1089 | gfp_t priority); |
1090 | extern void sock_wfree(struct sk_buff *skb); | 1090 | extern void sock_wfree(struct sk_buff *skb); |
1091 | extern void sock_rfree(struct sk_buff *skb); | 1091 | extern void sock_rfree(struct sk_buff *skb); |
1092 | 1092 | ||
1093 | extern int sock_setsockopt(struct socket *sock, int level, | 1093 | extern int sock_setsockopt(struct socket *sock, int level, |
1094 | int op, char __user *optval, | 1094 | int op, char __user *optval, |
1095 | unsigned int optlen); | 1095 | unsigned int optlen); |
1096 | 1096 | ||
1097 | extern int sock_getsockopt(struct socket *sock, int level, | 1097 | extern int sock_getsockopt(struct socket *sock, int level, |
1098 | int op, char __user *optval, | 1098 | int op, char __user *optval, |
1099 | int __user *optlen); | 1099 | int __user *optlen); |
1100 | extern struct sk_buff *sock_alloc_send_skb(struct sock *sk, | 1100 | extern struct sk_buff *sock_alloc_send_skb(struct sock *sk, |
1101 | unsigned long size, | 1101 | unsigned long size, |
1102 | int noblock, | 1102 | int noblock, |
1103 | int *errcode); | 1103 | int *errcode); |
1104 | extern struct sk_buff *sock_alloc_send_pskb(struct sock *sk, | 1104 | extern struct sk_buff *sock_alloc_send_pskb(struct sock *sk, |
1105 | unsigned long header_len, | 1105 | unsigned long header_len, |
1106 | unsigned long data_len, | 1106 | unsigned long data_len, |
1107 | int noblock, | 1107 | int noblock, |
1108 | int *errcode); | 1108 | int *errcode); |
1109 | extern void *sock_kmalloc(struct sock *sk, int size, | 1109 | extern void *sock_kmalloc(struct sock *sk, int size, |
1110 | gfp_t priority); | 1110 | gfp_t priority); |
1111 | extern void sock_kfree_s(struct sock *sk, void *mem, int size); | 1111 | extern void sock_kfree_s(struct sock *sk, void *mem, int size); |
1112 | extern void sk_send_sigurg(struct sock *sk); | 1112 | extern void sk_send_sigurg(struct sock *sk); |
1113 | 1113 | ||
1114 | #ifdef CONFIG_CGROUPS | 1114 | #ifdef CONFIG_CGROUPS |
1115 | extern void sock_update_classid(struct sock *sk); | 1115 | extern void sock_update_classid(struct sock *sk); |
1116 | #else | 1116 | #else |
1117 | static inline void sock_update_classid(struct sock *sk) | 1117 | static inline void sock_update_classid(struct sock *sk) |
1118 | { | 1118 | { |
1119 | } | 1119 | } |
1120 | #endif | 1120 | #endif |
1121 | 1121 | ||
1122 | /* | 1122 | /* |
1123 | * Functions to fill in entries in struct proto_ops when a protocol | 1123 | * Functions to fill in entries in struct proto_ops when a protocol |
1124 | * does not implement a particular function. | 1124 | * does not implement a particular function. |
1125 | */ | 1125 | */ |
1126 | extern int sock_no_bind(struct socket *, | 1126 | extern int sock_no_bind(struct socket *, |
1127 | struct sockaddr *, int); | 1127 | struct sockaddr *, int); |
1128 | extern int sock_no_connect(struct socket *, | 1128 | extern int sock_no_connect(struct socket *, |
1129 | struct sockaddr *, int, int); | 1129 | struct sockaddr *, int, int); |
1130 | extern int sock_no_socketpair(struct socket *, | 1130 | extern int sock_no_socketpair(struct socket *, |
1131 | struct socket *); | 1131 | struct socket *); |
1132 | extern int sock_no_accept(struct socket *, | 1132 | extern int sock_no_accept(struct socket *, |
1133 | struct socket *, int); | 1133 | struct socket *, int); |
1134 | extern int sock_no_getname(struct socket *, | 1134 | extern int sock_no_getname(struct socket *, |
1135 | struct sockaddr *, int *, int); | 1135 | struct sockaddr *, int *, int); |
1136 | extern unsigned int sock_no_poll(struct file *, struct socket *, | 1136 | extern unsigned int sock_no_poll(struct file *, struct socket *, |
1137 | struct poll_table_struct *); | 1137 | struct poll_table_struct *); |
1138 | extern int sock_no_ioctl(struct socket *, unsigned int, | 1138 | extern int sock_no_ioctl(struct socket *, unsigned int, |
1139 | unsigned long); | 1139 | unsigned long); |
1140 | extern int sock_no_listen(struct socket *, int); | 1140 | extern int sock_no_listen(struct socket *, int); |
1141 | extern int sock_no_shutdown(struct socket *, int); | 1141 | extern int sock_no_shutdown(struct socket *, int); |
1142 | extern int sock_no_getsockopt(struct socket *, int , int, | 1142 | extern int sock_no_getsockopt(struct socket *, int , int, |
1143 | char __user *, int __user *); | 1143 | char __user *, int __user *); |
1144 | extern int sock_no_setsockopt(struct socket *, int, int, | 1144 | extern int sock_no_setsockopt(struct socket *, int, int, |
1145 | char __user *, unsigned int); | 1145 | char __user *, unsigned int); |
1146 | extern int sock_no_sendmsg(struct kiocb *, struct socket *, | 1146 | extern int sock_no_sendmsg(struct kiocb *, struct socket *, |
1147 | struct msghdr *, size_t); | 1147 | struct msghdr *, size_t); |
1148 | extern int sock_no_recvmsg(struct kiocb *, struct socket *, | 1148 | extern int sock_no_recvmsg(struct kiocb *, struct socket *, |
1149 | struct msghdr *, size_t, int); | 1149 | struct msghdr *, size_t, int); |
1150 | extern int sock_no_mmap(struct file *file, | 1150 | extern int sock_no_mmap(struct file *file, |
1151 | struct socket *sock, | 1151 | struct socket *sock, |
1152 | struct vm_area_struct *vma); | 1152 | struct vm_area_struct *vma); |
1153 | extern ssize_t sock_no_sendpage(struct socket *sock, | 1153 | extern ssize_t sock_no_sendpage(struct socket *sock, |
1154 | struct page *page, | 1154 | struct page *page, |
1155 | int offset, size_t size, | 1155 | int offset, size_t size, |
1156 | int flags); | 1156 | int flags); |
1157 | 1157 | ||
1158 | /* | 1158 | /* |
1159 | * Functions to fill in entries in struct proto_ops when a protocol | 1159 | * Functions to fill in entries in struct proto_ops when a protocol |
1160 | * uses the inet style. | 1160 | * uses the inet style. |
1161 | */ | 1161 | */ |
1162 | extern int sock_common_getsockopt(struct socket *sock, int level, int optname, | 1162 | extern int sock_common_getsockopt(struct socket *sock, int level, int optname, |
1163 | char __user *optval, int __user *optlen); | 1163 | char __user *optval, int __user *optlen); |
1164 | extern int sock_common_recvmsg(struct kiocb *iocb, struct socket *sock, | 1164 | extern int sock_common_recvmsg(struct kiocb *iocb, struct socket *sock, |
1165 | struct msghdr *msg, size_t size, int flags); | 1165 | struct msghdr *msg, size_t size, int flags); |
1166 | extern int sock_common_setsockopt(struct socket *sock, int level, int optname, | 1166 | extern int sock_common_setsockopt(struct socket *sock, int level, int optname, |
1167 | char __user *optval, unsigned int optlen); | 1167 | char __user *optval, unsigned int optlen); |
1168 | extern int compat_sock_common_getsockopt(struct socket *sock, int level, | 1168 | extern int compat_sock_common_getsockopt(struct socket *sock, int level, |
1169 | int optname, char __user *optval, int __user *optlen); | 1169 | int optname, char __user *optval, int __user *optlen); |
1170 | extern int compat_sock_common_setsockopt(struct socket *sock, int level, | 1170 | extern int compat_sock_common_setsockopt(struct socket *sock, int level, |
1171 | int optname, char __user *optval, unsigned int optlen); | 1171 | int optname, char __user *optval, unsigned int optlen); |
1172 | 1172 | ||
1173 | extern void sk_common_release(struct sock *sk); | 1173 | extern void sk_common_release(struct sock *sk); |
1174 | 1174 | ||
1175 | /* | 1175 | /* |
1176 | * Default socket callbacks and setup code | 1176 | * Default socket callbacks and setup code |
1177 | */ | 1177 | */ |
1178 | 1178 | ||
1179 | /* Initialise core socket variables */ | 1179 | /* Initialise core socket variables */ |
1180 | extern void sock_init_data(struct socket *sock, struct sock *sk); | 1180 | extern void sock_init_data(struct socket *sock, struct sock *sk); |
1181 | 1181 | ||
1182 | extern void sk_filter_release_rcu(struct rcu_head *rcu); | 1182 | extern void sk_filter_release_rcu(struct rcu_head *rcu); |
1183 | 1183 | ||
1184 | /** | 1184 | /** |
1185 | * sk_filter_release - release a socket filter | 1185 | * sk_filter_release - release a socket filter |
1186 | * @fp: filter to remove | 1186 | * @fp: filter to remove |
1187 | * | 1187 | * |
1188 | * Remove a filter from a socket and release its resources. | 1188 | * Remove a filter from a socket and release its resources. |
1189 | */ | 1189 | */ |
1190 | 1190 | ||
1191 | static inline void sk_filter_release(struct sk_filter *fp) | 1191 | static inline void sk_filter_release(struct sk_filter *fp) |
1192 | { | 1192 | { |
1193 | if (atomic_dec_and_test(&fp->refcnt)) | 1193 | if (atomic_dec_and_test(&fp->refcnt)) |
1194 | call_rcu(&fp->rcu, sk_filter_release_rcu); | 1194 | call_rcu(&fp->rcu, sk_filter_release_rcu); |
1195 | } | 1195 | } |
1196 | 1196 | ||
1197 | static inline void sk_filter_uncharge(struct sock *sk, struct sk_filter *fp) | 1197 | static inline void sk_filter_uncharge(struct sock *sk, struct sk_filter *fp) |
1198 | { | 1198 | { |
1199 | unsigned int size = sk_filter_len(fp); | 1199 | unsigned int size = sk_filter_len(fp); |
1200 | 1200 | ||
1201 | atomic_sub(size, &sk->sk_omem_alloc); | 1201 | atomic_sub(size, &sk->sk_omem_alloc); |
1202 | sk_filter_release(fp); | 1202 | sk_filter_release(fp); |
1203 | } | 1203 | } |
1204 | 1204 | ||
1205 | static inline void sk_filter_charge(struct sock *sk, struct sk_filter *fp) | 1205 | static inline void sk_filter_charge(struct sock *sk, struct sk_filter *fp) |
1206 | { | 1206 | { |
1207 | atomic_inc(&fp->refcnt); | 1207 | atomic_inc(&fp->refcnt); |
1208 | atomic_add(sk_filter_len(fp), &sk->sk_omem_alloc); | 1208 | atomic_add(sk_filter_len(fp), &sk->sk_omem_alloc); |
1209 | } | 1209 | } |
1210 | 1210 | ||
1211 | /* | 1211 | /* |
1212 | * Socket reference counting postulates. | 1212 | * Socket reference counting postulates. |
1213 | * | 1213 | * |
1214 | * * Each user of socket SHOULD hold a reference count. | 1214 | * * Each user of socket SHOULD hold a reference count. |
1215 | * * Each access point to a socket (a hash table bucket, a reference from a list, | 1215 | * * Each access point to a socket (a hash table bucket, a reference from a list, |
1216 | * a running timer, an skb in flight) MUST hold a reference count. | 1216 | * a running timer, an skb in flight) MUST hold a reference count. |
1217 | * * When reference count hits 0, it means it will never increase back. | 1217 | * * When reference count hits 0, it means it will never increase back. |
1218 | * * When reference count hits 0, it means that no references from | 1218 | * * When reference count hits 0, it means that no references from |
1219 | * outside exist to this socket and current process on current CPU | 1219 | * outside exist to this socket and current process on current CPU |
1220 | * is last user and may/should destroy this socket. | 1220 | * is last user and may/should destroy this socket. |
1221 | * * sk_free is called from any context: process, BH, IRQ. When | 1221 | * * sk_free is called from any context: process, BH, IRQ. When |
1222 | * it is called, socket has no references from outside -> sk_free | 1222 | * it is called, socket has no references from outside -> sk_free |
1223 | * may release descendant resources allocated by the socket, but | 1223 | * may release descendant resources allocated by the socket, but |
1224 | * to the time when it is called, socket is NOT referenced by any | 1224 | * to the time when it is called, socket is NOT referenced by any |
1225 | * hash tables, lists etc. | 1225 | * hash tables, lists etc. |
1226 | * * Packets, delivered from outside (from network or from another process) | 1226 | * * Packets, delivered from outside (from network or from another process) |
1227 | * and enqueued on receive/error queues SHOULD NOT grab reference count, | 1227 | * and enqueued on receive/error queues SHOULD NOT grab reference count, |
1228 | * while they sit in a queue. Otherwise packets would leak into a hole when | 1228 | * while they sit in a queue. Otherwise packets would leak into a hole when |
1229 | * the socket is looked up by one CPU and unhashed by another CPU. | 1229 | * the socket is looked up by one CPU and unhashed by another CPU. |
1230 | * This is true for udp/raw, netlink (leak to receive and error queues) and tcp | 1230 | * This is true for udp/raw, netlink (leak to receive and error queues) and tcp |
1231 | * (leak to backlog). The packet socket does all its processing inside | 1231 | * (leak to backlog). The packet socket does all its processing inside |
1232 | * BR_NETPROTO_LOCK, so it does not have this race condition. UNIX sockets | 1232 | * BR_NETPROTO_LOCK, so it does not have this race condition. UNIX sockets |
1233 | * use a separate SMP lock, so they are prone to it as well. | 1233 | * use a separate SMP lock, so they are prone to it as well. |
1234 | */ | 1234 | */ |
1235 | 1235 | ||
1236 | /* Ungrab socket and destroy it, if it was the last reference. */ | 1236 | /* Ungrab socket and destroy it, if it was the last reference. */ |
1237 | static inline void sock_put(struct sock *sk) | 1237 | static inline void sock_put(struct sock *sk) |
1238 | { | 1238 | { |
1239 | if (atomic_dec_and_test(&sk->sk_refcnt)) | 1239 | if (atomic_dec_and_test(&sk->sk_refcnt)) |
1240 | sk_free(sk); | 1240 | sk_free(sk); |
1241 | } | 1241 | } |
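The refcounting postulates above are exercised through sock_hold()/sock_put(). A minimal sketch of a caller that becomes a new "access point" to a socket (not part of this diff; example_use() is a made-up name, and the real consumer would do the sock_put() when it is finished):

static void example_use(struct sock *sk)
{
	sock_hold(sk);		/* new user of sk: take our own reference      */
	/* ... hand sk to a timer, a list or an skb in flight ...             */
	sock_put(sk);		/* drop it; the last put ends up in sk_free()  */
}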
1242 | 1242 | ||
1243 | extern int sk_receive_skb(struct sock *sk, struct sk_buff *skb, | 1243 | extern int sk_receive_skb(struct sock *sk, struct sk_buff *skb, |
1244 | const int nested); | 1244 | const int nested); |
1245 | 1245 | ||
1246 | static inline void sk_tx_queue_set(struct sock *sk, int tx_queue) | 1246 | static inline void sk_tx_queue_set(struct sock *sk, int tx_queue) |
1247 | { | 1247 | { |
1248 | sk->sk_tx_queue_mapping = tx_queue; | 1248 | sk->sk_tx_queue_mapping = tx_queue; |
1249 | } | 1249 | } |
1250 | 1250 | ||
1251 | static inline void sk_tx_queue_clear(struct sock *sk) | 1251 | static inline void sk_tx_queue_clear(struct sock *sk) |
1252 | { | 1252 | { |
1253 | sk->sk_tx_queue_mapping = -1; | 1253 | sk->sk_tx_queue_mapping = -1; |
1254 | } | 1254 | } |
1255 | 1255 | ||
1256 | static inline int sk_tx_queue_get(const struct sock *sk) | 1256 | static inline int sk_tx_queue_get(const struct sock *sk) |
1257 | { | 1257 | { |
1258 | return sk ? sk->sk_tx_queue_mapping : -1; | 1258 | return sk ? sk->sk_tx_queue_mapping : -1; |
1259 | } | 1259 | } |
1260 | 1260 | ||
1261 | static inline void sk_set_socket(struct sock *sk, struct socket *sock) | 1261 | static inline void sk_set_socket(struct sock *sk, struct socket *sock) |
1262 | { | 1262 | { |
1263 | sk_tx_queue_clear(sk); | 1263 | sk_tx_queue_clear(sk); |
1264 | sk->sk_socket = sock; | 1264 | sk->sk_socket = sock; |
1265 | } | 1265 | } |
1266 | 1266 | ||
1267 | static inline wait_queue_head_t *sk_sleep(struct sock *sk) | 1267 | static inline wait_queue_head_t *sk_sleep(struct sock *sk) |
1268 | { | 1268 | { |
1269 | BUILD_BUG_ON(offsetof(struct socket_wq, wait) != 0); | 1269 | BUILD_BUG_ON(offsetof(struct socket_wq, wait) != 0); |
1270 | return &rcu_dereference_raw(sk->sk_wq)->wait; | 1270 | return &rcu_dereference_raw(sk->sk_wq)->wait; |
1271 | } | 1271 | } |
1272 | /* Detach socket from process context. | 1272 | /* Detach socket from process context. |
1273 | * Announce socket dead, detach it from wait queue and inode. | 1273 | * Announce socket dead, detach it from wait queue and inode. |
1274 | * Note that parent inode held reference count on this struct sock, | 1274 | * Note that parent inode held reference count on this struct sock, |
1275 | * we do not release it in this function, because protocol | 1275 | * we do not release it in this function, because protocol |
1276 | * probably wants some additional cleanups or even continuing | 1276 | * probably wants some additional cleanups or even continuing |
1277 | * to work with this socket (TCP). | 1277 | * to work with this socket (TCP). |
1278 | */ | 1278 | */ |
1279 | static inline void sock_orphan(struct sock *sk) | 1279 | static inline void sock_orphan(struct sock *sk) |
1280 | { | 1280 | { |
1281 | write_lock_bh(&sk->sk_callback_lock); | 1281 | write_lock_bh(&sk->sk_callback_lock); |
1282 | sock_set_flag(sk, SOCK_DEAD); | 1282 | sock_set_flag(sk, SOCK_DEAD); |
1283 | sk_set_socket(sk, NULL); | 1283 | sk_set_socket(sk, NULL); |
1284 | sk->sk_wq = NULL; | 1284 | sk->sk_wq = NULL; |
1285 | write_unlock_bh(&sk->sk_callback_lock); | 1285 | write_unlock_bh(&sk->sk_callback_lock); |
1286 | } | 1286 | } |
1287 | 1287 | ||
1288 | static inline void sock_graft(struct sock *sk, struct socket *parent) | 1288 | static inline void sock_graft(struct sock *sk, struct socket *parent) |
1289 | { | 1289 | { |
1290 | write_lock_bh(&sk->sk_callback_lock); | 1290 | write_lock_bh(&sk->sk_callback_lock); |
1291 | sk->sk_wq = parent->wq; | 1291 | sk->sk_wq = parent->wq; |
1292 | parent->sk = sk; | 1292 | parent->sk = sk; |
1293 | sk_set_socket(sk, parent); | 1293 | sk_set_socket(sk, parent); |
1294 | security_sock_graft(sk, parent); | 1294 | security_sock_graft(sk, parent); |
1295 | write_unlock_bh(&sk->sk_callback_lock); | 1295 | write_unlock_bh(&sk->sk_callback_lock); |
1296 | } | 1296 | } |
1297 | 1297 | ||
1298 | extern int sock_i_uid(struct sock *sk); | 1298 | extern int sock_i_uid(struct sock *sk); |
1299 | extern unsigned long sock_i_ino(struct sock *sk); | 1299 | extern unsigned long sock_i_ino(struct sock *sk); |
1300 | 1300 | ||
1301 | static inline struct dst_entry * | 1301 | static inline struct dst_entry * |
1302 | __sk_dst_get(struct sock *sk) | 1302 | __sk_dst_get(struct sock *sk) |
1303 | { | 1303 | { |
1304 | return rcu_dereference_check(sk->sk_dst_cache, rcu_read_lock_held() || | 1304 | return rcu_dereference_check(sk->sk_dst_cache, sock_owned_by_user(sk) || |
1305 | sock_owned_by_user(sk) || | ||
1306 | lockdep_is_held(&sk->sk_lock.slock)); | 1305 | lockdep_is_held(&sk->sk_lock.slock)); |
1307 | } | 1306 | } |
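This hunk is the include/net/sock.h part of the treewide cleanup: the explicit rcu_read_lock_held() in the __sk_dst_get() condition is redundant because, since ca5ecddf, rcu_dereference_check() folds that check in by itself. Roughly (a simplified sketch of the rcupdate.h definition of this era, not part of this diff):

#define rcu_dereference_check(p, c) \
	__rcu_dereference_check((p), rcu_read_lock_held() || (c), __rcu)

so callers only need to supply the conditions rcu_dereference_check() cannot infer on its own, here sock_owned_by_user(sk) and lockdep_is_held(&sk->sk_lock.slock).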
1308 | 1307 | ||
1309 | static inline struct dst_entry * | 1308 | static inline struct dst_entry * |
1310 | sk_dst_get(struct sock *sk) | 1309 | sk_dst_get(struct sock *sk) |
1311 | { | 1310 | { |
1312 | struct dst_entry *dst; | 1311 | struct dst_entry *dst; |
1313 | 1312 | ||
1314 | rcu_read_lock(); | 1313 | rcu_read_lock(); |
1315 | dst = rcu_dereference(sk->sk_dst_cache); | 1314 | dst = rcu_dereference(sk->sk_dst_cache); |
1316 | if (dst) | 1315 | if (dst) |
1317 | dst_hold(dst); | 1316 | dst_hold(dst); |
1318 | rcu_read_unlock(); | 1317 | rcu_read_unlock(); |
1319 | return dst; | 1318 | return dst; |
1320 | } | 1319 | } |
1321 | 1320 | ||
1322 | extern void sk_reset_txq(struct sock *sk); | 1321 | extern void sk_reset_txq(struct sock *sk); |
1323 | 1322 | ||
1324 | static inline void dst_negative_advice(struct sock *sk) | 1323 | static inline void dst_negative_advice(struct sock *sk) |
1325 | { | 1324 | { |
1326 | struct dst_entry *ndst, *dst = __sk_dst_get(sk); | 1325 | struct dst_entry *ndst, *dst = __sk_dst_get(sk); |
1327 | 1326 | ||
1328 | if (dst && dst->ops->negative_advice) { | 1327 | if (dst && dst->ops->negative_advice) { |
1329 | ndst = dst->ops->negative_advice(dst); | 1328 | ndst = dst->ops->negative_advice(dst); |
1330 | 1329 | ||
1331 | if (ndst != dst) { | 1330 | if (ndst != dst) { |
1332 | rcu_assign_pointer(sk->sk_dst_cache, ndst); | 1331 | rcu_assign_pointer(sk->sk_dst_cache, ndst); |
1333 | sk_reset_txq(sk); | 1332 | sk_reset_txq(sk); |
1334 | } | 1333 | } |
1335 | } | 1334 | } |
1336 | } | 1335 | } |
1337 | 1336 | ||
1338 | static inline void | 1337 | static inline void |
1339 | __sk_dst_set(struct sock *sk, struct dst_entry *dst) | 1338 | __sk_dst_set(struct sock *sk, struct dst_entry *dst) |
1340 | { | 1339 | { |
1341 | struct dst_entry *old_dst; | 1340 | struct dst_entry *old_dst; |
1342 | 1341 | ||
1343 | sk_tx_queue_clear(sk); | 1342 | sk_tx_queue_clear(sk); |
1344 | /* | 1343 | /* |
1345 | * This can be called while sk is owned by the caller only, | 1344 | * This can be called while sk is owned by the caller only, |
1346 | * with no state that can be checked in a rcu_dereference_check() cond | 1345 | * with no state that can be checked in a rcu_dereference_check() cond |
1347 | */ | 1346 | */ |
1348 | old_dst = rcu_dereference_raw(sk->sk_dst_cache); | 1347 | old_dst = rcu_dereference_raw(sk->sk_dst_cache); |
1349 | rcu_assign_pointer(sk->sk_dst_cache, dst); | 1348 | rcu_assign_pointer(sk->sk_dst_cache, dst); |
1350 | dst_release(old_dst); | 1349 | dst_release(old_dst); |
1351 | } | 1350 | } |
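On the update side, __sk_dst_set() publishes the new dst with rcu_assign_pointer() and releases the old one; rcu_dereference_raw() is used to fetch the old pointer because, as the comment notes, the caller's exclusive ownership of the socket is not visible to lockdep. As a hedged sketch only (the function name is made up), the same update with a lockdep-checkable condition would normally use rcu_dereference_protected() instead:

static inline void example_sk_dst_set_locked(struct sock *sk, struct dst_entry *dst)
{
	struct dst_entry *old_dst;

	sk_tx_queue_clear(sk);
	old_dst = rcu_dereference_protected(sk->sk_dst_cache,
					    lockdep_is_held(&sk->sk_dst_lock));
	rcu_assign_pointer(sk->sk_dst_cache, dst);
	dst_release(old_dst);
}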
1352 | 1351 | ||
1353 | static inline void | 1352 | static inline void |
1354 | sk_dst_set(struct sock *sk, struct dst_entry *dst) | 1353 | sk_dst_set(struct sock *sk, struct dst_entry *dst) |
1355 | { | 1354 | { |
1356 | spin_lock(&sk->sk_dst_lock); | 1355 | spin_lock(&sk->sk_dst_lock); |
1357 | __sk_dst_set(sk, dst); | 1356 | __sk_dst_set(sk, dst); |
1358 | spin_unlock(&sk->sk_dst_lock); | 1357 | spin_unlock(&sk->sk_dst_lock); |
1359 | } | 1358 | } |
1360 | 1359 | ||
1361 | static inline void | 1360 | static inline void |
1362 | __sk_dst_reset(struct sock *sk) | 1361 | __sk_dst_reset(struct sock *sk) |
1363 | { | 1362 | { |
1364 | __sk_dst_set(sk, NULL); | 1363 | __sk_dst_set(sk, NULL); |
1365 | } | 1364 | } |
1366 | 1365 | ||
1367 | static inline void | 1366 | static inline void |
1368 | sk_dst_reset(struct sock *sk) | 1367 | sk_dst_reset(struct sock *sk) |
1369 | { | 1368 | { |
1370 | spin_lock(&sk->sk_dst_lock); | 1369 | spin_lock(&sk->sk_dst_lock); |
1371 | __sk_dst_reset(sk); | 1370 | __sk_dst_reset(sk); |
1372 | spin_unlock(&sk->sk_dst_lock); | 1371 | spin_unlock(&sk->sk_dst_lock); |
1373 | } | 1372 | } |
1374 | 1373 | ||
1375 | extern struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie); | 1374 | extern struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie); |
1376 | 1375 | ||
1377 | extern struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie); | 1376 | extern struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie); |
1378 | 1377 | ||
1379 | static inline int sk_can_gso(const struct sock *sk) | 1378 | static inline int sk_can_gso(const struct sock *sk) |
1380 | { | 1379 | { |
1381 | return net_gso_ok(sk->sk_route_caps, sk->sk_gso_type); | 1380 | return net_gso_ok(sk->sk_route_caps, sk->sk_gso_type); |
1382 | } | 1381 | } |
1383 | 1382 | ||
1384 | extern void sk_setup_caps(struct sock *sk, struct dst_entry *dst); | 1383 | extern void sk_setup_caps(struct sock *sk, struct dst_entry *dst); |
1385 | 1384 | ||
1386 | static inline void sk_nocaps_add(struct sock *sk, int flags) | 1385 | static inline void sk_nocaps_add(struct sock *sk, int flags) |
1387 | { | 1386 | { |
1388 | sk->sk_route_nocaps |= flags; | 1387 | sk->sk_route_nocaps |= flags; |
1389 | sk->sk_route_caps &= ~flags; | 1388 | sk->sk_route_caps &= ~flags; |
1390 | } | 1389 | } |
1391 | 1390 | ||
1392 | static inline int skb_do_copy_data_nocache(struct sock *sk, struct sk_buff *skb, | 1391 | static inline int skb_do_copy_data_nocache(struct sock *sk, struct sk_buff *skb, |
1393 | char __user *from, char *to, | 1392 | char __user *from, char *to, |
1394 | int copy, int offset) | 1393 | int copy, int offset) |
1395 | { | 1394 | { |
1396 | if (skb->ip_summed == CHECKSUM_NONE) { | 1395 | if (skb->ip_summed == CHECKSUM_NONE) { |
1397 | int err = 0; | 1396 | int err = 0; |
1398 | __wsum csum = csum_and_copy_from_user(from, to, copy, 0, &err); | 1397 | __wsum csum = csum_and_copy_from_user(from, to, copy, 0, &err); |
1399 | if (err) | 1398 | if (err) |
1400 | return err; | 1399 | return err; |
1401 | skb->csum = csum_block_add(skb->csum, csum, offset); | 1400 | skb->csum = csum_block_add(skb->csum, csum, offset); |
1402 | } else if (sk->sk_route_caps & NETIF_F_NOCACHE_COPY) { | 1401 | } else if (sk->sk_route_caps & NETIF_F_NOCACHE_COPY) { |
1403 | if (!access_ok(VERIFY_READ, from, copy) || | 1402 | if (!access_ok(VERIFY_READ, from, copy) || |
1404 | __copy_from_user_nocache(to, from, copy)) | 1403 | __copy_from_user_nocache(to, from, copy)) |
1405 | return -EFAULT; | 1404 | return -EFAULT; |
1406 | } else if (copy_from_user(to, from, copy)) | 1405 | } else if (copy_from_user(to, from, copy)) |
1407 | return -EFAULT; | 1406 | return -EFAULT; |
1408 | 1407 | ||
1409 | return 0; | 1408 | return 0; |
1410 | } | 1409 | } |
1411 | 1410 | ||
1412 | static inline int skb_add_data_nocache(struct sock *sk, struct sk_buff *skb, | 1411 | static inline int skb_add_data_nocache(struct sock *sk, struct sk_buff *skb, |
1413 | char __user *from, int copy) | 1412 | char __user *from, int copy) |
1414 | { | 1413 | { |
1415 | int err, offset = skb->len; | 1414 | int err, offset = skb->len; |
1416 | 1415 | ||
1417 | err = skb_do_copy_data_nocache(sk, skb, from, skb_put(skb, copy), | 1416 | err = skb_do_copy_data_nocache(sk, skb, from, skb_put(skb, copy), |
1418 | copy, offset); | 1417 | copy, offset); |
1419 | if (err) | 1418 | if (err) |
1420 | __skb_trim(skb, offset); | 1419 | __skb_trim(skb, offset); |
1421 | 1420 | ||
1422 | return err; | 1421 | return err; |
1423 | } | 1422 | } |
1424 | 1423 | ||
1425 | static inline int skb_copy_to_page_nocache(struct sock *sk, char __user *from, | 1424 | static inline int skb_copy_to_page_nocache(struct sock *sk, char __user *from, |
1426 | struct sk_buff *skb, | 1425 | struct sk_buff *skb, |
1427 | struct page *page, | 1426 | struct page *page, |
1428 | int off, int copy) | 1427 | int off, int copy) |
1429 | { | 1428 | { |
1430 | int err; | 1429 | int err; |
1431 | 1430 | ||
1432 | err = skb_do_copy_data_nocache(sk, skb, from, page_address(page) + off, | 1431 | err = skb_do_copy_data_nocache(sk, skb, from, page_address(page) + off, |
1433 | copy, skb->len); | 1432 | copy, skb->len); |
1434 | if (err) | 1433 | if (err) |
1435 | return err; | 1434 | return err; |
1436 | 1435 | ||
1437 | skb->len += copy; | 1436 | skb->len += copy; |
1438 | skb->data_len += copy; | 1437 | skb->data_len += copy; |
1439 | skb->truesize += copy; | 1438 | skb->truesize += copy; |
1440 | sk->sk_wmem_queued += copy; | 1439 | sk->sk_wmem_queued += copy; |
1441 | sk_mem_charge(sk, copy); | 1440 | sk_mem_charge(sk, copy); |
1442 | return 0; | 1441 | return 0; |
1443 | } | 1442 | } |
1444 | 1443 | ||
1445 | static inline int skb_copy_to_page(struct sock *sk, char __user *from, | 1444 | static inline int skb_copy_to_page(struct sock *sk, char __user *from, |
1446 | struct sk_buff *skb, struct page *page, | 1445 | struct sk_buff *skb, struct page *page, |
1447 | int off, int copy) | 1446 | int off, int copy) |
1448 | { | 1447 | { |
1449 | if (skb->ip_summed == CHECKSUM_NONE) { | 1448 | if (skb->ip_summed == CHECKSUM_NONE) { |
1450 | int err = 0; | 1449 | int err = 0; |
1451 | __wsum csum = csum_and_copy_from_user(from, | 1450 | __wsum csum = csum_and_copy_from_user(from, |
1452 | page_address(page) + off, | 1451 | page_address(page) + off, |
1453 | copy, 0, &err); | 1452 | copy, 0, &err); |
1454 | if (err) | 1453 | if (err) |
1455 | return err; | 1454 | return err; |
1456 | skb->csum = csum_block_add(skb->csum, csum, skb->len); | 1455 | skb->csum = csum_block_add(skb->csum, csum, skb->len); |
1457 | } else if (copy_from_user(page_address(page) + off, from, copy)) | 1456 | } else if (copy_from_user(page_address(page) + off, from, copy)) |
1458 | return -EFAULT; | 1457 | return -EFAULT; |
1459 | 1458 | ||
1460 | skb->len += copy; | 1459 | skb->len += copy; |
1461 | skb->data_len += copy; | 1460 | skb->data_len += copy; |
1462 | skb->truesize += copy; | 1461 | skb->truesize += copy; |
1463 | sk->sk_wmem_queued += copy; | 1462 | sk->sk_wmem_queued += copy; |
1464 | sk_mem_charge(sk, copy); | 1463 | sk_mem_charge(sk, copy); |
1465 | return 0; | 1464 | return 0; |
1466 | } | 1465 | } |
1467 | 1466 | ||
1468 | /** | 1467 | /** |
1469 | * sk_wmem_alloc_get - returns write allocations | 1468 | * sk_wmem_alloc_get - returns write allocations |
1470 | * @sk: socket | 1469 | * @sk: socket |
1471 | * | 1470 | * |
1472 | * Returns sk_wmem_alloc minus initial offset of one | 1471 | * Returns sk_wmem_alloc minus initial offset of one |
1473 | */ | 1472 | */ |
1474 | static inline int sk_wmem_alloc_get(const struct sock *sk) | 1473 | static inline int sk_wmem_alloc_get(const struct sock *sk) |
1475 | { | 1474 | { |
1476 | return atomic_read(&sk->sk_wmem_alloc) - 1; | 1475 | return atomic_read(&sk->sk_wmem_alloc) - 1; |
1477 | } | 1476 | } |
1478 | 1477 | ||
1479 | /** | 1478 | /** |
1480 | * sk_rmem_alloc_get - returns read allocations | 1479 | * sk_rmem_alloc_get - returns read allocations |
1481 | * @sk: socket | 1480 | * @sk: socket |
1482 | * | 1481 | * |
1483 | * Returns sk_rmem_alloc | 1482 | * Returns sk_rmem_alloc |
1484 | */ | 1483 | */ |
1485 | static inline int sk_rmem_alloc_get(const struct sock *sk) | 1484 | static inline int sk_rmem_alloc_get(const struct sock *sk) |
1486 | { | 1485 | { |
1487 | return atomic_read(&sk->sk_rmem_alloc); | 1486 | return atomic_read(&sk->sk_rmem_alloc); |
1488 | } | 1487 | } |
1489 | 1488 | ||
1490 | /** | 1489 | /** |
1491 | * sk_has_allocations - check if allocations are outstanding | 1490 | * sk_has_allocations - check if allocations are outstanding |
1492 | * @sk: socket | 1491 | * @sk: socket |
1493 | * | 1492 | * |
1494 | * Returns true if socket has write or read allocations | 1493 | * Returns true if socket has write or read allocations |
1495 | */ | 1494 | */ |
1496 | static inline int sk_has_allocations(const struct sock *sk) | 1495 | static inline int sk_has_allocations(const struct sock *sk) |
1497 | { | 1496 | { |
1498 | return sk_wmem_alloc_get(sk) || sk_rmem_alloc_get(sk); | 1497 | return sk_wmem_alloc_get(sk) || sk_rmem_alloc_get(sk); |
1499 | } | 1498 | } |
1500 | 1499 | ||
1501 | /** | 1500 | /** |
1502 | * wq_has_sleeper - check if there are any waiting processes | 1501 | * wq_has_sleeper - check if there are any waiting processes |
1503 | * @wq: struct socket_wq | 1502 | * @wq: struct socket_wq |
1504 | * | 1503 | * |
1505 | * Returns true if socket_wq has waiting processes | 1504 | * Returns true if socket_wq has waiting processes |
1506 | * | 1505 | * |
1507 | * The purpose of the wq_has_sleeper and sock_poll_wait is to wrap the memory | 1506 | * The purpose of the wq_has_sleeper and sock_poll_wait is to wrap the memory |
1508 | * barrier call. They were added due to the race found within the tcp code. | 1507 | * barrier call. They were added due to the race found within the tcp code. |
1509 | * | 1508 | * |
1510 | * Consider following tcp code paths: | 1509 | * Consider following tcp code paths: |
1511 | * | 1510 | * |
1512 | * CPU1 CPU2 | 1511 | * CPU1 CPU2 |
1513 | * | 1512 | * |
1514 | * sys_select receive packet | 1513 | * sys_select receive packet |
1515 | * ... ... | 1514 | * ... ... |
1516 | * __add_wait_queue update tp->rcv_nxt | 1515 | * __add_wait_queue update tp->rcv_nxt |
1517 | * ... ... | 1516 | * ... ... |
1518 | * tp->rcv_nxt check sock_def_readable | 1517 | * tp->rcv_nxt check sock_def_readable |
1519 | * ... { | 1518 | * ... { |
1520 | * schedule rcu_read_lock(); | 1519 | * schedule rcu_read_lock(); |
1521 | * wq = rcu_dereference(sk->sk_wq); | 1520 | * wq = rcu_dereference(sk->sk_wq); |
1522 | * if (wq && waitqueue_active(&wq->wait)) | 1521 | * if (wq && waitqueue_active(&wq->wait)) |
1523 | * wake_up_interruptible(&wq->wait) | 1522 | * wake_up_interruptible(&wq->wait) |
1524 | * ... | 1523 | * ... |
1525 | * } | 1524 | * } |
1526 | * | 1525 | * |
1527 | * The race for tcp fires when the __add_wait_queue changes done by CPU1 stay | 1526 | * The race for tcp fires when the __add_wait_queue changes done by CPU1 stay |
1528 | * in its cache, and so does the tp->rcv_nxt update on CPU2 side. The CPU1 | 1527 | * in its cache, and so does the tp->rcv_nxt update on CPU2 side. The CPU1 |
1529 | * could then end up calling schedule and sleep forever if there are no more | 1528 | * could then end up calling schedule and sleep forever if there are no more |
1530 | * data on the socket. | 1529 | * data on the socket. |
1531 | * | 1530 | * |
1532 | */ | 1531 | */ |
1533 | static inline bool wq_has_sleeper(struct socket_wq *wq) | 1532 | static inline bool wq_has_sleeper(struct socket_wq *wq) |
1534 | { | 1533 | { |
1535 | 1534 | ||
1536 | /* | 1535 | /* |
1537 | * We need to be sure we are in sync with the | 1536 | * We need to be sure we are in sync with the |
1538 | * add_wait_queue modifications to the wait queue. | 1537 | * add_wait_queue modifications to the wait queue. |
1539 | * | 1538 | * |
1540 | * This memory barrier is paired in the sock_poll_wait. | 1539 | * This memory barrier is paired in the sock_poll_wait. |
1541 | */ | 1540 | */ |
1542 | smp_mb(); | 1541 | smp_mb(); |
1543 | return wq && waitqueue_active(&wq->wait); | 1542 | return wq && waitqueue_active(&wq->wait); |
1544 | } | 1543 | } |
1545 | 1544 | ||
1546 | /** | 1545 | /** |
1547 | * sock_poll_wait - place memory barrier behind the poll_wait call. | 1546 | * sock_poll_wait - place memory barrier behind the poll_wait call. |
1548 | * @filp: file | 1547 | * @filp: file |
1549 | * @wait_address: socket wait queue | 1548 | * @wait_address: socket wait queue |
1550 | * @p: poll_table | 1549 | * @p: poll_table |
1551 | * | 1550 | * |
1552 | * See the comments in the wq_has_sleeper function. | 1551 | * See the comments in the wq_has_sleeper function. |
1553 | */ | 1552 | */ |
1554 | static inline void sock_poll_wait(struct file *filp, | 1553 | static inline void sock_poll_wait(struct file *filp, |
1555 | wait_queue_head_t *wait_address, poll_table *p) | 1554 | wait_queue_head_t *wait_address, poll_table *p) |
1556 | { | 1555 | { |
1557 | if (p && wait_address) { | 1556 | if (p && wait_address) { |
1558 | poll_wait(filp, wait_address, p); | 1557 | poll_wait(filp, wait_address, p); |
1559 | /* | 1558 | /* |
1560 | * We need to be sure we are in sync with the | 1559 | * We need to be sure we are in sync with the |
1561 | * socket flags modification. | 1560 | * socket flags modification. |
1562 | * | 1561 | * |
1563 | * This memory barrier is paired in the wq_has_sleeper. | 1562 | * This memory barrier is paired in the wq_has_sleeper. |
1564 | */ | 1563 | */ |
1565 | smp_mb(); | 1564 | smp_mb(); |
1566 | } | 1565 | } |
1567 | } | 1566 | } |
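The smp_mb() issued by sock_poll_wait() pairs with the one in wq_has_sleeper(); the other half of that pairing lives in the sk_*_ready callbacks. A simplified sketch modelled on sock_def_readable() in net/core/sock.c (not part of this diff, and trimmed of the async-wakeup details):

static void example_def_readable(struct sock *sk, int len)
{
	struct socket_wq *wq;

	rcu_read_lock();
	wq = rcu_dereference(sk->sk_wq);
	if (wq_has_sleeper(wq))		/* smp_mb() here pairs with sock_poll_wait() */
		wake_up_interruptible_sync_poll(&wq->wait,
						POLLIN | POLLRDNORM);
	rcu_read_unlock();
}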
1568 | 1567 | ||
1569 | /* | 1568 | /* |
1570 | * Queue a received datagram if it will fit. Stream and sequenced | 1569 | * Queue a received datagram if it will fit. Stream and sequenced |
1571 | * protocols can't normally use this as they need to fit buffers in | 1570 | * protocols can't normally use this as they need to fit buffers in |
1572 | * and play with them. | 1571 | * and play with them. |
1573 | * | 1572 | * |
1574 | * Inlined as it's very short and called for pretty much every | 1573 | * Inlined as it's very short and called for pretty much every |
1575 | * packet ever received. | 1574 | * packet ever received. |
1576 | */ | 1575 | */ |
1577 | 1576 | ||
1578 | static inline void skb_set_owner_w(struct sk_buff *skb, struct sock *sk) | 1577 | static inline void skb_set_owner_w(struct sk_buff *skb, struct sock *sk) |
1579 | { | 1578 | { |
1580 | skb_orphan(skb); | 1579 | skb_orphan(skb); |
1581 | skb->sk = sk; | 1580 | skb->sk = sk; |
1582 | skb->destructor = sock_wfree; | 1581 | skb->destructor = sock_wfree; |
1583 | /* | 1582 | /* |
1584 | * We used to take a refcount on sk, but following operation | 1583 | * We used to take a refcount on sk, but following operation |
1585 | * is enough to guarantee sk_free() wont free this sock until | 1584 | * is enough to guarantee sk_free() wont free this sock until |
1586 | * all in-flight packets are completed | 1585 | * all in-flight packets are completed |
1587 | */ | 1586 | */ |
1588 | atomic_add(skb->truesize, &sk->sk_wmem_alloc); | 1587 | atomic_add(skb->truesize, &sk->sk_wmem_alloc); |
1589 | } | 1588 | } |
1590 | 1589 | ||
1591 | static inline void skb_set_owner_r(struct sk_buff *skb, struct sock *sk) | 1590 | static inline void skb_set_owner_r(struct sk_buff *skb, struct sock *sk) |
1592 | { | 1591 | { |
1593 | skb_orphan(skb); | 1592 | skb_orphan(skb); |
1594 | skb->sk = sk; | 1593 | skb->sk = sk; |
1595 | skb->destructor = sock_rfree; | 1594 | skb->destructor = sock_rfree; |
1596 | atomic_add(skb->truesize, &sk->sk_rmem_alloc); | 1595 | atomic_add(skb->truesize, &sk->sk_rmem_alloc); |
1597 | sk_mem_charge(sk, skb->truesize); | 1596 | sk_mem_charge(sk, skb->truesize); |
1598 | } | 1597 | } |
1599 | 1598 | ||
1600 | extern void sk_reset_timer(struct sock *sk, struct timer_list* timer, | 1599 | extern void sk_reset_timer(struct sock *sk, struct timer_list* timer, |
1601 | unsigned long expires); | 1600 | unsigned long expires); |
1602 | 1601 | ||
1603 | extern void sk_stop_timer(struct sock *sk, struct timer_list* timer); | 1602 | extern void sk_stop_timer(struct sock *sk, struct timer_list* timer); |
1604 | 1603 | ||
1605 | extern int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb); | 1604 | extern int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb); |
1606 | 1605 | ||
1607 | extern int sock_queue_err_skb(struct sock *sk, struct sk_buff *skb); | 1606 | extern int sock_queue_err_skb(struct sock *sk, struct sk_buff *skb); |
1608 | 1607 | ||
1609 | /* | 1608 | /* |
1610 | * Recover an error report and clear atomically | 1609 | * Recover an error report and clear atomically |
1611 | */ | 1610 | */ |
1612 | 1611 | ||
1613 | static inline int sock_error(struct sock *sk) | 1612 | static inline int sock_error(struct sock *sk) |
1614 | { | 1613 | { |
1615 | int err; | 1614 | int err; |
1616 | if (likely(!sk->sk_err)) | 1615 | if (likely(!sk->sk_err)) |
1617 | return 0; | 1616 | return 0; |
1618 | err = xchg(&sk->sk_err, 0); | 1617 | err = xchg(&sk->sk_err, 0); |
1619 | return -err; | 1618 | return -err; |
1620 | } | 1619 | } |
1621 | 1620 | ||
1622 | static inline unsigned long sock_wspace(struct sock *sk) | 1621 | static inline unsigned long sock_wspace(struct sock *sk) |
1623 | { | 1622 | { |
1624 | int amt = 0; | 1623 | int amt = 0; |
1625 | 1624 | ||
1626 | if (!(sk->sk_shutdown & SEND_SHUTDOWN)) { | 1625 | if (!(sk->sk_shutdown & SEND_SHUTDOWN)) { |
1627 | amt = sk->sk_sndbuf - atomic_read(&sk->sk_wmem_alloc); | 1626 | amt = sk->sk_sndbuf - atomic_read(&sk->sk_wmem_alloc); |
1628 | if (amt < 0) | 1627 | if (amt < 0) |
1629 | amt = 0; | 1628 | amt = 0; |
1630 | } | 1629 | } |
1631 | return amt; | 1630 | return amt; |
1632 | } | 1631 | } |
1633 | 1632 | ||
1634 | static inline void sk_wake_async(struct sock *sk, int how, int band) | 1633 | static inline void sk_wake_async(struct sock *sk, int how, int band) |
1635 | { | 1634 | { |
1636 | if (sock_flag(sk, SOCK_FASYNC)) | 1635 | if (sock_flag(sk, SOCK_FASYNC)) |
1637 | sock_wake_async(sk->sk_socket, how, band); | 1636 | sock_wake_async(sk->sk_socket, how, band); |
1638 | } | 1637 | } |
1639 | 1638 | ||
1640 | #define SOCK_MIN_SNDBUF 2048 | 1639 | #define SOCK_MIN_SNDBUF 2048 |
1641 | /* | 1640 | /* |
1642 | * Since sk_rmem_alloc sums skb->truesize, even a small frame might need | 1641 | * Since sk_rmem_alloc sums skb->truesize, even a small frame might need |
1643 | * sizeof(sk_buff) + MTU + padding, unless net driver perform copybreak | 1642 | * sizeof(sk_buff) + MTU + padding, unless net driver perform copybreak |
1644 | */ | 1643 | */ |
1645 | #define SOCK_MIN_RCVBUF (2048 + sizeof(struct sk_buff)) | 1644 | #define SOCK_MIN_RCVBUF (2048 + sizeof(struct sk_buff)) |
1646 | 1645 | ||
1647 | static inline void sk_stream_moderate_sndbuf(struct sock *sk) | 1646 | static inline void sk_stream_moderate_sndbuf(struct sock *sk) |
1648 | { | 1647 | { |
1649 | if (!(sk->sk_userlocks & SOCK_SNDBUF_LOCK)) { | 1648 | if (!(sk->sk_userlocks & SOCK_SNDBUF_LOCK)) { |
1650 | sk->sk_sndbuf = min(sk->sk_sndbuf, sk->sk_wmem_queued >> 1); | 1649 | sk->sk_sndbuf = min(sk->sk_sndbuf, sk->sk_wmem_queued >> 1); |
1651 | sk->sk_sndbuf = max(sk->sk_sndbuf, SOCK_MIN_SNDBUF); | 1650 | sk->sk_sndbuf = max(sk->sk_sndbuf, SOCK_MIN_SNDBUF); |
1652 | } | 1651 | } |
1653 | } | 1652 | } |
1654 | 1653 | ||
1655 | struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp); | 1654 | struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp); |
1656 | 1655 | ||
1657 | static inline struct page *sk_stream_alloc_page(struct sock *sk) | 1656 | static inline struct page *sk_stream_alloc_page(struct sock *sk) |
1658 | { | 1657 | { |
1659 | struct page *page = NULL; | 1658 | struct page *page = NULL; |
1660 | 1659 | ||
1661 | page = alloc_pages(sk->sk_allocation, 0); | 1660 | page = alloc_pages(sk->sk_allocation, 0); |
1662 | if (!page) { | 1661 | if (!page) { |
1663 | sk->sk_prot->enter_memory_pressure(sk); | 1662 | sk->sk_prot->enter_memory_pressure(sk); |
1664 | sk_stream_moderate_sndbuf(sk); | 1663 | sk_stream_moderate_sndbuf(sk); |
1665 | } | 1664 | } |
1666 | return page; | 1665 | return page; |
1667 | } | 1666 | } |
1668 | 1667 | ||
1669 | /* | 1668 | /* |
1670 | * Default write policy as shown to user space via poll/select/SIGIO | 1669 | * Default write policy as shown to user space via poll/select/SIGIO |
1671 | */ | 1670 | */ |
1672 | static inline int sock_writeable(const struct sock *sk) | 1671 | static inline int sock_writeable(const struct sock *sk) |
1673 | { | 1672 | { |
1674 | return atomic_read(&sk->sk_wmem_alloc) < (sk->sk_sndbuf >> 1); | 1673 | return atomic_read(&sk->sk_wmem_alloc) < (sk->sk_sndbuf >> 1); |
1675 | } | 1674 | } |
1676 | 1675 | ||
1677 | static inline gfp_t gfp_any(void) | 1676 | static inline gfp_t gfp_any(void) |
1678 | { | 1677 | { |
1679 | return in_softirq() ? GFP_ATOMIC : GFP_KERNEL; | 1678 | return in_softirq() ? GFP_ATOMIC : GFP_KERNEL; |
1680 | } | 1679 | } |
1681 | 1680 | ||
1682 | static inline long sock_rcvtimeo(const struct sock *sk, int noblock) | 1681 | static inline long sock_rcvtimeo(const struct sock *sk, int noblock) |
1683 | { | 1682 | { |
1684 | return noblock ? 0 : sk->sk_rcvtimeo; | 1683 | return noblock ? 0 : sk->sk_rcvtimeo; |
1685 | } | 1684 | } |
1686 | 1685 | ||
1687 | static inline long sock_sndtimeo(const struct sock *sk, int noblock) | 1686 | static inline long sock_sndtimeo(const struct sock *sk, int noblock) |
1688 | { | 1687 | { |
1689 | return noblock ? 0 : sk->sk_sndtimeo; | 1688 | return noblock ? 0 : sk->sk_sndtimeo; |
1690 | } | 1689 | } |
1691 | 1690 | ||
1692 | static inline int sock_rcvlowat(const struct sock *sk, int waitall, int len) | 1691 | static inline int sock_rcvlowat(const struct sock *sk, int waitall, int len) |
1693 | { | 1692 | { |
1694 | return (waitall ? len : min_t(int, sk->sk_rcvlowat, len)) ? : 1; | 1693 | return (waitall ? len : min_t(int, sk->sk_rcvlowat, len)) ? : 1; |
1695 | } | 1694 | } |
1696 | 1695 | ||
1697 | /* Alas, with timeout socket operations are not restartable. | 1696 | /* Alas, with timeout socket operations are not restartable. |
1698 | * Compare this to poll(). | 1697 | * Compare this to poll(). |
1699 | */ | 1698 | */ |
1700 | static inline int sock_intr_errno(long timeo) | 1699 | static inline int sock_intr_errno(long timeo) |
1701 | { | 1700 | { |
1702 | return timeo == MAX_SCHEDULE_TIMEOUT ? -ERESTARTSYS : -EINTR; | 1701 | return timeo == MAX_SCHEDULE_TIMEOUT ? -ERESTARTSYS : -EINTR; |
1703 | } | 1702 | } |
1704 | 1703 | ||
1705 | extern void __sock_recv_timestamp(struct msghdr *msg, struct sock *sk, | 1704 | extern void __sock_recv_timestamp(struct msghdr *msg, struct sock *sk, |
1706 | struct sk_buff *skb); | 1705 | struct sk_buff *skb); |
1707 | 1706 | ||
1708 | static __inline__ void | 1707 | static __inline__ void |
1709 | sock_recv_timestamp(struct msghdr *msg, struct sock *sk, struct sk_buff *skb) | 1708 | sock_recv_timestamp(struct msghdr *msg, struct sock *sk, struct sk_buff *skb) |
1710 | { | 1709 | { |
1711 | ktime_t kt = skb->tstamp; | 1710 | ktime_t kt = skb->tstamp; |
1712 | struct skb_shared_hwtstamps *hwtstamps = skb_hwtstamps(skb); | 1711 | struct skb_shared_hwtstamps *hwtstamps = skb_hwtstamps(skb); |
1713 | 1712 | ||
1714 | /* | 1713 | /* |
1715 | * generate control messages if | 1714 | * generate control messages if |
1716 | * - receive time stamping in software requested (SOCK_RCVTSTAMP | 1715 | * - receive time stamping in software requested (SOCK_RCVTSTAMP |
1717 | * or SOCK_TIMESTAMPING_RX_SOFTWARE) | 1716 | * or SOCK_TIMESTAMPING_RX_SOFTWARE) |
1718 | * - software time stamp available and wanted | 1717 | * - software time stamp available and wanted |
1719 | * (SOCK_TIMESTAMPING_SOFTWARE) | 1718 | * (SOCK_TIMESTAMPING_SOFTWARE) |
1720 | * - hardware time stamps available and wanted | 1719 | * - hardware time stamps available and wanted |
1721 | * (SOCK_TIMESTAMPING_SYS_HARDWARE or | 1720 | * (SOCK_TIMESTAMPING_SYS_HARDWARE or |
1722 | * SOCK_TIMESTAMPING_RAW_HARDWARE) | 1721 | * SOCK_TIMESTAMPING_RAW_HARDWARE) |
1723 | */ | 1722 | */ |
1724 | if (sock_flag(sk, SOCK_RCVTSTAMP) || | 1723 | if (sock_flag(sk, SOCK_RCVTSTAMP) || |
1725 | sock_flag(sk, SOCK_TIMESTAMPING_RX_SOFTWARE) || | 1724 | sock_flag(sk, SOCK_TIMESTAMPING_RX_SOFTWARE) || |
1726 | (kt.tv64 && sock_flag(sk, SOCK_TIMESTAMPING_SOFTWARE)) || | 1725 | (kt.tv64 && sock_flag(sk, SOCK_TIMESTAMPING_SOFTWARE)) || |
1727 | (hwtstamps->hwtstamp.tv64 && | 1726 | (hwtstamps->hwtstamp.tv64 && |
1728 | sock_flag(sk, SOCK_TIMESTAMPING_RAW_HARDWARE)) || | 1727 | sock_flag(sk, SOCK_TIMESTAMPING_RAW_HARDWARE)) || |
1729 | (hwtstamps->syststamp.tv64 && | 1728 | (hwtstamps->syststamp.tv64 && |
1730 | sock_flag(sk, SOCK_TIMESTAMPING_SYS_HARDWARE))) | 1729 | sock_flag(sk, SOCK_TIMESTAMPING_SYS_HARDWARE))) |
1731 | __sock_recv_timestamp(msg, sk, skb); | 1730 | __sock_recv_timestamp(msg, sk, skb); |
1732 | else | 1731 | else |
1733 | sk->sk_stamp = kt; | 1732 | sk->sk_stamp = kt; |
1734 | } | 1733 | } |
1735 | 1734 | ||
1736 | extern void __sock_recv_ts_and_drops(struct msghdr *msg, struct sock *sk, | 1735 | extern void __sock_recv_ts_and_drops(struct msghdr *msg, struct sock *sk, |
1737 | struct sk_buff *skb); | 1736 | struct sk_buff *skb); |
1738 | 1737 | ||
1739 | static inline void sock_recv_ts_and_drops(struct msghdr *msg, struct sock *sk, | 1738 | static inline void sock_recv_ts_and_drops(struct msghdr *msg, struct sock *sk, |
1740 | struct sk_buff *skb) | 1739 | struct sk_buff *skb) |
1741 | { | 1740 | { |
1742 | #define FLAGS_TS_OR_DROPS ((1UL << SOCK_RXQ_OVFL) | \ | 1741 | #define FLAGS_TS_OR_DROPS ((1UL << SOCK_RXQ_OVFL) | \ |
1743 | (1UL << SOCK_RCVTSTAMP) | \ | 1742 | (1UL << SOCK_RCVTSTAMP) | \ |
1744 | (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE) | \ | 1743 | (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE) | \ |
1745 | (1UL << SOCK_TIMESTAMPING_SOFTWARE) | \ | 1744 | (1UL << SOCK_TIMESTAMPING_SOFTWARE) | \ |
1746 | (1UL << SOCK_TIMESTAMPING_RAW_HARDWARE) | \ | 1745 | (1UL << SOCK_TIMESTAMPING_RAW_HARDWARE) | \ |
1747 | (1UL << SOCK_TIMESTAMPING_SYS_HARDWARE)) | 1746 | (1UL << SOCK_TIMESTAMPING_SYS_HARDWARE)) |
1748 | 1747 | ||
1749 | if (sk->sk_flags & FLAGS_TS_OR_DROPS) | 1748 | if (sk->sk_flags & FLAGS_TS_OR_DROPS) |
1750 | __sock_recv_ts_and_drops(msg, sk, skb); | 1749 | __sock_recv_ts_and_drops(msg, sk, skb); |
1751 | else | 1750 | else |
1752 | sk->sk_stamp = skb->tstamp; | 1751 | sk->sk_stamp = skb->tstamp; |
1753 | } | 1752 | } |
1754 | 1753 | ||
1755 | /** | 1754 | /** |
1756 | * sock_tx_timestamp - checks whether the outgoing packet is to be time stamped | 1755 | * sock_tx_timestamp - checks whether the outgoing packet is to be time stamped |
1757 | * @sk: socket sending this packet | 1756 | * @sk: socket sending this packet |
1758 | * @tx_flags: filled with instructions for time stamping | 1757 | * @tx_flags: filled with instructions for time stamping |
1759 | * | 1758 | * |
1760 | * Currently only depends on SOCK_TIMESTAMPING* flags. Returns error code if | 1759 | * Currently only depends on SOCK_TIMESTAMPING* flags. Returns error code if |
1761 | * parameters are invalid. | 1760 | * parameters are invalid. |
1762 | */ | 1761 | */ |
1763 | extern int sock_tx_timestamp(struct sock *sk, __u8 *tx_flags); | 1762 | extern int sock_tx_timestamp(struct sock *sk, __u8 *tx_flags); |
1764 | 1763 | ||
1765 | /** | 1764 | /** |
1766 | * sk_eat_skb - Release a skb if it is no longer needed | 1765 | * sk_eat_skb - Release a skb if it is no longer needed |
1767 | * @sk: socket to eat this skb from | 1766 | * @sk: socket to eat this skb from |
1768 | * @skb: socket buffer to eat | 1767 | * @skb: socket buffer to eat |
1769 | * @copied_early: flag indicating whether DMA operations copied this data early | 1768 | * @copied_early: flag indicating whether DMA operations copied this data early |
1770 | * | 1769 | * |
1771 | * This routine must be called with interrupts disabled or with the socket | 1770 | * This routine must be called with interrupts disabled or with the socket |
1772 | * locked so that the sk_buff queue operation is ok. | 1771 | * locked so that the sk_buff queue operation is ok. |
1773 | */ | 1772 | */ |
1774 | #ifdef CONFIG_NET_DMA | 1773 | #ifdef CONFIG_NET_DMA |
1775 | static inline void sk_eat_skb(struct sock *sk, struct sk_buff *skb, int copied_early) | 1774 | static inline void sk_eat_skb(struct sock *sk, struct sk_buff *skb, int copied_early) |
1776 | { | 1775 | { |
1777 | __skb_unlink(skb, &sk->sk_receive_queue); | 1776 | __skb_unlink(skb, &sk->sk_receive_queue); |
1778 | if (!copied_early) | 1777 | if (!copied_early) |
1779 | __kfree_skb(skb); | 1778 | __kfree_skb(skb); |
1780 | else | 1779 | else |
1781 | __skb_queue_tail(&sk->sk_async_wait_queue, skb); | 1780 | __skb_queue_tail(&sk->sk_async_wait_queue, skb); |
1782 | } | 1781 | } |
1783 | #else | 1782 | #else |
1784 | static inline void sk_eat_skb(struct sock *sk, struct sk_buff *skb, int copied_early) | 1783 | static inline void sk_eat_skb(struct sock *sk, struct sk_buff *skb, int copied_early) |
1785 | { | 1784 | { |
1786 | __skb_unlink(skb, &sk->sk_receive_queue); | 1785 | __skb_unlink(skb, &sk->sk_receive_queue); |
1787 | __kfree_skb(skb); | 1786 | __kfree_skb(skb); |
1788 | } | 1787 | } |
1789 | #endif | 1788 | #endif |
1790 | 1789 | ||
1791 | static inline | 1790 | static inline |
1792 | struct net *sock_net(const struct sock *sk) | 1791 | struct net *sock_net(const struct sock *sk) |
1793 | { | 1792 | { |
1794 | return read_pnet(&sk->sk_net); | 1793 | return read_pnet(&sk->sk_net); |
1795 | } | 1794 | } |
1796 | 1795 | ||
1797 | static inline | 1796 | static inline |
1798 | void sock_net_set(struct sock *sk, struct net *net) | 1797 | void sock_net_set(struct sock *sk, struct net *net) |
1799 | { | 1798 | { |
1800 | write_pnet(&sk->sk_net, net); | 1799 | write_pnet(&sk->sk_net, net); |
1801 | } | 1800 | } |
1802 | 1801 | ||
1803 | /* | 1802 | /* |
1804 | * Kernel sockets, f.e. rtnl or icmp_socket, are a part of a namespace. | 1803 | * Kernel sockets, f.e. rtnl or icmp_socket, are a part of a namespace. |
1805 | * They should not hold a reference to a namespace in order to allow | 1804 | * They should not hold a reference to a namespace in order to allow |
1806 | * to stop it. | 1805 | * to stop it. |
1807 | * Sockets after sk_change_net should be released using sk_release_kernel | 1806 | * Sockets after sk_change_net should be released using sk_release_kernel |
1808 | */ | 1807 | */ |
1809 | static inline void sk_change_net(struct sock *sk, struct net *net) | 1808 | static inline void sk_change_net(struct sock *sk, struct net *net) |
1810 | { | 1809 | { |
1811 | put_net(sock_net(sk)); | 1810 | put_net(sock_net(sk)); |
1812 | sock_net_set(sk, hold_net(net)); | 1811 | sock_net_set(sk, hold_net(net)); |
1813 | } | 1812 | } |
1814 | 1813 | ||
1815 | static inline struct sock *skb_steal_sock(struct sk_buff *skb) | 1814 | static inline struct sock *skb_steal_sock(struct sk_buff *skb) |
1816 | { | 1815 | { |
1817 | if (unlikely(skb->sk)) { | 1816 | if (unlikely(skb->sk)) { |
1818 | struct sock *sk = skb->sk; | 1817 | struct sock *sk = skb->sk; |
1819 | 1818 | ||
1820 | skb->destructor = NULL; | 1819 | skb->destructor = NULL; |
1821 | skb->sk = NULL; | 1820 | skb->sk = NULL; |
1822 | return sk; | 1821 | return sk; |
1823 | } | 1822 | } |
1824 | return NULL; | 1823 | return NULL; |
1825 | } | 1824 | } |
1826 | 1825 | ||
1827 | extern void sock_enable_timestamp(struct sock *sk, int flag); | 1826 | extern void sock_enable_timestamp(struct sock *sk, int flag); |
1828 | extern int sock_get_timestamp(struct sock *, struct timeval __user *); | 1827 | extern int sock_get_timestamp(struct sock *, struct timeval __user *); |
1829 | extern int sock_get_timestampns(struct sock *, struct timespec __user *); | 1828 | extern int sock_get_timestampns(struct sock *, struct timespec __user *); |
1830 | 1829 | ||
1831 | /* | 1830 | /* |
1832 | * Enable debug/info messages | 1831 | * Enable debug/info messages |
1833 | */ | 1832 | */ |
1834 | extern int net_msg_warn; | 1833 | extern int net_msg_warn; |
1835 | #define NETDEBUG(fmt, args...) \ | 1834 | #define NETDEBUG(fmt, args...) \ |
1836 | do { if (net_msg_warn) printk(fmt,##args); } while (0) | 1835 | do { if (net_msg_warn) printk(fmt,##args); } while (0) |
1837 | 1836 | ||
1838 | #define LIMIT_NETDEBUG(fmt, args...) \ | 1837 | #define LIMIT_NETDEBUG(fmt, args...) \ |
1839 | do { if (net_msg_warn && net_ratelimit()) printk(fmt,##args); } while(0) | 1838 | do { if (net_msg_warn && net_ratelimit()) printk(fmt,##args); } while(0) |
1840 | 1839 | ||
1841 | extern __u32 sysctl_wmem_max; | 1840 | extern __u32 sysctl_wmem_max; |
1842 | extern __u32 sysctl_rmem_max; | 1841 | extern __u32 sysctl_rmem_max; |
1843 | 1842 | ||
1844 | extern void sk_init(void); | 1843 | extern void sk_init(void); |
1845 | 1844 | ||
1846 | extern int sysctl_optmem_max; | 1845 | extern int sysctl_optmem_max; |
1847 | 1846 | ||
1848 | extern __u32 sysctl_wmem_default; | 1847 | extern __u32 sysctl_wmem_default; |
1849 | extern __u32 sysctl_rmem_default; | 1848 | extern __u32 sysctl_rmem_default; |
1850 | 1849 | ||
1851 | #endif /* _SOCK_H */ | 1850 | #endif /* _SOCK_H */ |
1852 | 1851 |
kernel/cgroup.c
1 | /* | 1 | /* |
2 | * Generic process-grouping system. | 2 | * Generic process-grouping system. |
3 | * | 3 | * |
4 | * Based originally on the cpuset system, extracted by Paul Menage | 4 | * Based originally on the cpuset system, extracted by Paul Menage |
5 | * Copyright (C) 2006 Google, Inc | 5 | * Copyright (C) 2006 Google, Inc |
6 | * | 6 | * |
7 | * Notifications support | 7 | * Notifications support |
8 | * Copyright (C) 2009 Nokia Corporation | 8 | * Copyright (C) 2009 Nokia Corporation |
9 | * Author: Kirill A. Shutemov | 9 | * Author: Kirill A. Shutemov |
10 | * | 10 | * |
11 | * Copyright notices from the original cpuset code: | 11 | * Copyright notices from the original cpuset code: |
12 | * -------------------------------------------------- | 12 | * -------------------------------------------------- |
13 | * Copyright (C) 2003 BULL SA. | 13 | * Copyright (C) 2003 BULL SA. |
14 | * Copyright (C) 2004-2006 Silicon Graphics, Inc. | 14 | * Copyright (C) 2004-2006 Silicon Graphics, Inc. |
15 | * | 15 | * |
16 | * Portions derived from Patrick Mochel's sysfs code. | 16 | * Portions derived from Patrick Mochel's sysfs code. |
17 | * sysfs is Copyright (c) 2001-3 Patrick Mochel | 17 | * sysfs is Copyright (c) 2001-3 Patrick Mochel |
18 | * | 18 | * |
19 | * 2003-10-10 Written by Simon Derr. | 19 | * 2003-10-10 Written by Simon Derr. |
20 | * 2003-10-22 Updates by Stephen Hemminger. | 20 | * 2003-10-22 Updates by Stephen Hemminger. |
21 | * 2004 May-July Rework by Paul Jackson. | 21 | * 2004 May-July Rework by Paul Jackson. |
22 | * --------------------------------------------------- | 22 | * --------------------------------------------------- |
23 | * | 23 | * |
24 | * This file is subject to the terms and conditions of the GNU General Public | 24 | * This file is subject to the terms and conditions of the GNU General Public |
25 | * License. See the file COPYING in the main directory of the Linux | 25 | * License. See the file COPYING in the main directory of the Linux |
26 | * distribution for more details. | 26 | * distribution for more details. |
27 | */ | 27 | */ |
28 | 28 | ||
29 | #include <linux/cgroup.h> | 29 | #include <linux/cgroup.h> |
30 | #include <linux/ctype.h> | 30 | #include <linux/ctype.h> |
31 | #include <linux/errno.h> | 31 | #include <linux/errno.h> |
32 | #include <linux/fs.h> | 32 | #include <linux/fs.h> |
33 | #include <linux/kernel.h> | 33 | #include <linux/kernel.h> |
34 | #include <linux/list.h> | 34 | #include <linux/list.h> |
35 | #include <linux/mm.h> | 35 | #include <linux/mm.h> |
36 | #include <linux/mutex.h> | 36 | #include <linux/mutex.h> |
37 | #include <linux/mount.h> | 37 | #include <linux/mount.h> |
38 | #include <linux/pagemap.h> | 38 | #include <linux/pagemap.h> |
39 | #include <linux/proc_fs.h> | 39 | #include <linux/proc_fs.h> |
40 | #include <linux/rcupdate.h> | 40 | #include <linux/rcupdate.h> |
41 | #include <linux/sched.h> | 41 | #include <linux/sched.h> |
42 | #include <linux/backing-dev.h> | 42 | #include <linux/backing-dev.h> |
43 | #include <linux/seq_file.h> | 43 | #include <linux/seq_file.h> |
44 | #include <linux/slab.h> | 44 | #include <linux/slab.h> |
45 | #include <linux/magic.h> | 45 | #include <linux/magic.h> |
46 | #include <linux/spinlock.h> | 46 | #include <linux/spinlock.h> |
47 | #include <linux/string.h> | 47 | #include <linux/string.h> |
48 | #include <linux/sort.h> | 48 | #include <linux/sort.h> |
49 | #include <linux/kmod.h> | 49 | #include <linux/kmod.h> |
50 | #include <linux/module.h> | 50 | #include <linux/module.h> |
51 | #include <linux/delayacct.h> | 51 | #include <linux/delayacct.h> |
52 | #include <linux/cgroupstats.h> | 52 | #include <linux/cgroupstats.h> |
53 | #include <linux/hash.h> | 53 | #include <linux/hash.h> |
54 | #include <linux/namei.h> | 54 | #include <linux/namei.h> |
55 | #include <linux/pid_namespace.h> | 55 | #include <linux/pid_namespace.h> |
56 | #include <linux/idr.h> | 56 | #include <linux/idr.h> |
57 | #include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */ | 57 | #include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */ |
58 | #include <linux/eventfd.h> | 58 | #include <linux/eventfd.h> |
59 | #include <linux/poll.h> | 59 | #include <linux/poll.h> |
60 | #include <linux/flex_array.h> /* used in cgroup_attach_proc */ | 60 | #include <linux/flex_array.h> /* used in cgroup_attach_proc */ |
61 | 61 | ||
62 | #include <asm/atomic.h> | 62 | #include <asm/atomic.h> |
63 | 63 | ||
64 | static DEFINE_MUTEX(cgroup_mutex); | 64 | static DEFINE_MUTEX(cgroup_mutex); |
65 | 65 | ||
66 | /* | 66 | /* |
67 | * Generate an array of cgroup subsystem pointers. At boot time, this is | 67 | * Generate an array of cgroup subsystem pointers. At boot time, this is |
68 | * populated up to CGROUP_BUILTIN_SUBSYS_COUNT, and modular subsystems are | 68 | * populated up to CGROUP_BUILTIN_SUBSYS_COUNT, and modular subsystems are |
69 | * registered after that. The mutable section of this array is protected by | 69 | * registered after that. The mutable section of this array is protected by |
70 | * cgroup_mutex. | 70 | * cgroup_mutex. |
71 | */ | 71 | */ |
72 | #define SUBSYS(_x) &_x ## _subsys, | 72 | #define SUBSYS(_x) &_x ## _subsys, |
73 | static struct cgroup_subsys *subsys[CGROUP_SUBSYS_COUNT] = { | 73 | static struct cgroup_subsys *subsys[CGROUP_SUBSYS_COUNT] = { |
74 | #include <linux/cgroup_subsys.h> | 74 | #include <linux/cgroup_subsys.h> |
75 | }; | 75 | }; |
76 | 76 | ||
77 | #define MAX_CGROUP_ROOT_NAMELEN 64 | 77 | #define MAX_CGROUP_ROOT_NAMELEN 64 |
78 | 78 | ||
79 | /* | 79 | /* |
80 | * A cgroupfs_root represents the root of a cgroup hierarchy, | 80 | * A cgroupfs_root represents the root of a cgroup hierarchy, |
81 | * and may be associated with a superblock to form an active | 81 | * and may be associated with a superblock to form an active |
82 | * hierarchy | 82 | * hierarchy |
83 | */ | 83 | */ |
84 | struct cgroupfs_root { | 84 | struct cgroupfs_root { |
85 | struct super_block *sb; | 85 | struct super_block *sb; |
86 | 86 | ||
87 | /* | 87 | /* |
88 | * The bitmask of subsystems intended to be attached to this | 88 | * The bitmask of subsystems intended to be attached to this |
89 | * hierarchy | 89 | * hierarchy |
90 | */ | 90 | */ |
91 | unsigned long subsys_bits; | 91 | unsigned long subsys_bits; |
92 | 92 | ||
93 | /* Unique id for this hierarchy. */ | 93 | /* Unique id for this hierarchy. */ |
94 | int hierarchy_id; | 94 | int hierarchy_id; |
95 | 95 | ||
96 | /* The bitmask of subsystems currently attached to this hierarchy */ | 96 | /* The bitmask of subsystems currently attached to this hierarchy */ |
97 | unsigned long actual_subsys_bits; | 97 | unsigned long actual_subsys_bits; |
98 | 98 | ||
99 | /* A list running through the attached subsystems */ | 99 | /* A list running through the attached subsystems */ |
100 | struct list_head subsys_list; | 100 | struct list_head subsys_list; |
101 | 101 | ||
102 | /* The root cgroup for this hierarchy */ | 102 | /* The root cgroup for this hierarchy */ |
103 | struct cgroup top_cgroup; | 103 | struct cgroup top_cgroup; |
104 | 104 | ||
105 | /* Tracks how many cgroups are currently defined in hierarchy.*/ | 105 | /* Tracks how many cgroups are currently defined in hierarchy.*/ |
106 | int number_of_cgroups; | 106 | int number_of_cgroups; |
107 | 107 | ||
108 | /* A list running through the active hierarchies */ | 108 | /* A list running through the active hierarchies */ |
109 | struct list_head root_list; | 109 | struct list_head root_list; |
110 | 110 | ||
111 | /* Hierarchy-specific flags */ | 111 | /* Hierarchy-specific flags */ |
112 | unsigned long flags; | 112 | unsigned long flags; |
113 | 113 | ||
114 | /* The path to use for release notifications. */ | 114 | /* The path to use for release notifications. */ |
115 | char release_agent_path[PATH_MAX]; | 115 | char release_agent_path[PATH_MAX]; |
116 | 116 | ||
117 | /* The name for this hierarchy - may be empty */ | 117 | /* The name for this hierarchy - may be empty */ |
118 | char name[MAX_CGROUP_ROOT_NAMELEN]; | 118 | char name[MAX_CGROUP_ROOT_NAMELEN]; |
119 | }; | 119 | }; |
120 | 120 | ||
121 | /* | 121 | /* |
122 | * The "rootnode" hierarchy is the "dummy hierarchy", reserved for the | 122 | * The "rootnode" hierarchy is the "dummy hierarchy", reserved for the |
123 | * subsystems that are otherwise unattached - it never has more than a | 123 | * subsystems that are otherwise unattached - it never has more than a |
124 | * single cgroup, and all tasks are part of that cgroup. | 124 | * single cgroup, and all tasks are part of that cgroup. |
125 | */ | 125 | */ |
126 | static struct cgroupfs_root rootnode; | 126 | static struct cgroupfs_root rootnode; |
127 | 127 | ||
128 | /* | 128 | /* |
129 | * CSS ID -- ID per subsys's Cgroup Subsys State(CSS). used only when | 129 | * CSS ID -- ID per subsys's Cgroup Subsys State(CSS). used only when |
130 | * cgroup_subsys->use_id != 0. | 130 | * cgroup_subsys->use_id != 0. |
131 | */ | 131 | */ |
132 | #define CSS_ID_MAX (65535) | 132 | #define CSS_ID_MAX (65535) |
133 | struct css_id { | 133 | struct css_id { |
134 | /* | 134 | /* |
135 | * The css to which this ID points. This pointer is set to valid value | 135 | * The css to which this ID points. This pointer is set to valid value |
136 | * after cgroup is populated. If cgroup is removed, this will be NULL. | 136 | * after cgroup is populated. If cgroup is removed, this will be NULL. |
137 | * This pointer is expected to be RCU-safe because destroy() | 137 | * This pointer is expected to be RCU-safe because destroy() |
138 | * is called after synchronize_rcu(). But for safe use, css_is_removed() | 138 | * is called after synchronize_rcu(). But for safe use, css_is_removed() |
139 | * css_tryget() should be used for avoiding race. | 139 | * css_tryget() should be used for avoiding race. |
140 | */ | 140 | */ |
141 | struct cgroup_subsys_state __rcu *css; | 141 | struct cgroup_subsys_state __rcu *css; |
142 | /* | 142 | /* |
143 | * ID of this css. | 143 | * ID of this css. |
144 | */ | 144 | */ |
145 | unsigned short id; | 145 | unsigned short id; |
146 | /* | 146 | /* |
147 | * Depth in hierarchy which this ID belongs to. | 147 | * Depth in hierarchy which this ID belongs to. |
148 | */ | 148 | */ |
149 | unsigned short depth; | 149 | unsigned short depth; |
150 | /* | 150 | /* |
151 | * ID is freed by RCU. (and lookup routine is RCU safe.) | 151 | * ID is freed by RCU. (and lookup routine is RCU safe.) |
152 | */ | 152 | */ |
153 | struct rcu_head rcu_head; | 153 | struct rcu_head rcu_head; |
154 | /* | 154 | /* |
155 | * Hierarchy of CSS ID belongs to. | 155 | * Hierarchy of CSS ID belongs to. |
156 | */ | 156 | */ |
157 | unsigned short stack[0]; /* Array of Length (depth+1) */ | 157 | unsigned short stack[0]; /* Array of Length (depth+1) */ |
158 | }; | 158 | }; |
159 | 159 | ||
160 | /* | 160 | /* |
161 | * cgroup_event represents events which userspace want to receive. | 161 | * cgroup_event represents events which userspace want to receive. |
162 | */ | 162 | */ |
163 | struct cgroup_event { | 163 | struct cgroup_event { |
164 | /* | 164 | /* |
165 | * Cgroup which the event belongs to. | 165 | * Cgroup which the event belongs to. |
166 | */ | 166 | */ |
167 | struct cgroup *cgrp; | 167 | struct cgroup *cgrp; |
168 | /* | 168 | /* |
169 | * Control file which the event associated. | 169 | * Control file which the event associated. |
170 | */ | 170 | */ |
171 | struct cftype *cft; | 171 | struct cftype *cft; |
172 | /* | 172 | /* |
173 | * eventfd to signal userspace about the event. | 173 | * eventfd to signal userspace about the event. |
174 | */ | 174 | */ |
175 | struct eventfd_ctx *eventfd; | 175 | struct eventfd_ctx *eventfd; |
176 | /* | 176 | /* |
177 | * Each of these stored in a list by the cgroup. | 177 | * Each of these stored in a list by the cgroup. |
178 | */ | 178 | */ |
179 | struct list_head list; | 179 | struct list_head list; |
180 | /* | 180 | /* |
181 | * All fields below needed to unregister event when | 181 | * All fields below needed to unregister event when |
182 | * userspace closes eventfd. | 182 | * userspace closes eventfd. |
183 | */ | 183 | */ |
184 | poll_table pt; | 184 | poll_table pt; |
185 | wait_queue_head_t *wqh; | 185 | wait_queue_head_t *wqh; |
186 | wait_queue_t wait; | 186 | wait_queue_t wait; |
187 | struct work_struct remove; | 187 | struct work_struct remove; |
188 | }; | 188 | }; |
189 | 189 | ||
190 | /* The list of hierarchy roots */ | 190 | /* The list of hierarchy roots */ |
191 | 191 | ||
192 | static LIST_HEAD(roots); | 192 | static LIST_HEAD(roots); |
193 | static int root_count; | 193 | static int root_count; |
194 | 194 | ||
195 | static DEFINE_IDA(hierarchy_ida); | 195 | static DEFINE_IDA(hierarchy_ida); |
196 | static int next_hierarchy_id; | 196 | static int next_hierarchy_id; |
197 | static DEFINE_SPINLOCK(hierarchy_id_lock); | 197 | static DEFINE_SPINLOCK(hierarchy_id_lock); |
198 | 198 | ||
199 | /* dummytop is a shorthand for the dummy hierarchy's top cgroup */ | 199 | /* dummytop is a shorthand for the dummy hierarchy's top cgroup */ |
200 | #define dummytop (&rootnode.top_cgroup) | 200 | #define dummytop (&rootnode.top_cgroup) |
201 | 201 | ||
202 | /* This flag indicates whether tasks in the fork and exit paths should | 202 | /* This flag indicates whether tasks in the fork and exit paths should |
203 | * check for fork/exit handlers to call. This avoids us having to do | 203 | * check for fork/exit handlers to call. This avoids us having to do |
204 | * extra work in the fork/exit path if none of the subsystems need to | 204 | * extra work in the fork/exit path if none of the subsystems need to |
205 | * be called. | 205 | * be called. |
206 | */ | 206 | */ |
207 | static int need_forkexit_callback __read_mostly; | 207 | static int need_forkexit_callback __read_mostly; |
208 | 208 | ||
209 | #ifdef CONFIG_PROVE_LOCKING | 209 | #ifdef CONFIG_PROVE_LOCKING |
210 | int cgroup_lock_is_held(void) | 210 | int cgroup_lock_is_held(void) |
211 | { | 211 | { |
212 | return lockdep_is_held(&cgroup_mutex); | 212 | return lockdep_is_held(&cgroup_mutex); |
213 | } | 213 | } |
214 | #else /* #ifdef CONFIG_PROVE_LOCKING */ | 214 | #else /* #ifdef CONFIG_PROVE_LOCKING */ |
215 | int cgroup_lock_is_held(void) | 215 | int cgroup_lock_is_held(void) |
216 | { | 216 | { |
217 | return mutex_is_locked(&cgroup_mutex); | 217 | return mutex_is_locked(&cgroup_mutex); |
218 | } | 218 | } |
219 | #endif /* #else #ifdef CONFIG_PROVE_LOCKING */ | 219 | #endif /* #else #ifdef CONFIG_PROVE_LOCKING */ |
220 | 220 | ||
221 | EXPORT_SYMBOL_GPL(cgroup_lock_is_held); | 221 | EXPORT_SYMBOL_GPL(cgroup_lock_is_held); |
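cgroup_lock_is_held() exists mainly to feed rcu_dereference_check() conditions such as task_subsys_state_check() in include/linux/cgroup.h, which this same patch trims in the same way as __sk_dst_get() above. Roughly, the post-patch form looks like the sketch below (not the exact hunk): the implicit rcu_read_lock_held() no longer needs to be spelled out.

#define task_subsys_state_check(task, subsys_id, __c)			\
	rcu_dereference_check((task)->cgroups->subsys[(subsys_id)],	\
			      lockdep_is_held(&(task)->alloc_lock) ||	\
			      cgroup_lock_is_held() || (__c))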
222 | 222 | ||
223 | /* convenient tests for these bits */ | 223 | /* convenient tests for these bits */ |
224 | inline int cgroup_is_removed(const struct cgroup *cgrp) | 224 | inline int cgroup_is_removed(const struct cgroup *cgrp) |
225 | { | 225 | { |
226 | return test_bit(CGRP_REMOVED, &cgrp->flags); | 226 | return test_bit(CGRP_REMOVED, &cgrp->flags); |
227 | } | 227 | } |
228 | 228 | ||
229 | /* bits in struct cgroupfs_root flags field */ | 229 | /* bits in struct cgroupfs_root flags field */ |
230 | enum { | 230 | enum { |
231 | ROOT_NOPREFIX, /* mounted subsystems have no named prefix */ | 231 | ROOT_NOPREFIX, /* mounted subsystems have no named prefix */ |
232 | }; | 232 | }; |
233 | 233 | ||
234 | static int cgroup_is_releasable(const struct cgroup *cgrp) | 234 | static int cgroup_is_releasable(const struct cgroup *cgrp) |
235 | { | 235 | { |
236 | const int bits = | 236 | const int bits = |
237 | (1 << CGRP_RELEASABLE) | | 237 | (1 << CGRP_RELEASABLE) | |
238 | (1 << CGRP_NOTIFY_ON_RELEASE); | 238 | (1 << CGRP_NOTIFY_ON_RELEASE); |
239 | return (cgrp->flags & bits) == bits; | 239 | return (cgrp->flags & bits) == bits; |
240 | } | 240 | } |
241 | 241 | ||
242 | static int notify_on_release(const struct cgroup *cgrp) | 242 | static int notify_on_release(const struct cgroup *cgrp) |
243 | { | 243 | { |
244 | return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); | 244 | return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); |
245 | } | 245 | } |
246 | 246 | ||
247 | static int clone_children(const struct cgroup *cgrp) | 247 | static int clone_children(const struct cgroup *cgrp) |
248 | { | 248 | { |
249 | return test_bit(CGRP_CLONE_CHILDREN, &cgrp->flags); | 249 | return test_bit(CGRP_CLONE_CHILDREN, &cgrp->flags); |
250 | } | 250 | } |
251 | 251 | ||
252 | /* | 252 | /* |
253 | * for_each_subsys() allows you to iterate on each subsystem attached to | 253 | * for_each_subsys() allows you to iterate on each subsystem attached to |
254 | * an active hierarchy | 254 | * an active hierarchy |
255 | */ | 255 | */ |
256 | #define for_each_subsys(_root, _ss) \ | 256 | #define for_each_subsys(_root, _ss) \ |
257 | list_for_each_entry(_ss, &_root->subsys_list, sibling) | 257 | list_for_each_entry(_ss, &_root->subsys_list, sibling) |
258 | 258 | ||
259 | /* for_each_active_root() allows you to iterate across the active hierarchies */ | 259 | /* for_each_active_root() allows you to iterate across the active hierarchies */ |
260 | #define for_each_active_root(_root) \ | 260 | #define for_each_active_root(_root) \ |
261 | list_for_each_entry(_root, &roots, root_list) | 261 | list_for_each_entry(_root, &roots, root_list) |
262 | 262 | ||
263 | /* the list of cgroups eligible for automatic release. Protected by | 263 | /* the list of cgroups eligible for automatic release. Protected by |
264 | * release_list_lock */ | 264 | * release_list_lock */ |
265 | static LIST_HEAD(release_list); | 265 | static LIST_HEAD(release_list); |
266 | static DEFINE_SPINLOCK(release_list_lock); | 266 | static DEFINE_SPINLOCK(release_list_lock); |
267 | static void cgroup_release_agent(struct work_struct *work); | 267 | static void cgroup_release_agent(struct work_struct *work); |
268 | static DECLARE_WORK(release_agent_work, cgroup_release_agent); | 268 | static DECLARE_WORK(release_agent_work, cgroup_release_agent); |
269 | static void check_for_release(struct cgroup *cgrp); | 269 | static void check_for_release(struct cgroup *cgrp); |
270 | 270 | ||
271 | /* Link structure for associating css_set objects with cgroups */ | 271 | /* Link structure for associating css_set objects with cgroups */ |
272 | struct cg_cgroup_link { | 272 | struct cg_cgroup_link { |
273 | /* | 273 | /* |
274 | * List running through cg_cgroup_links associated with a | 274 | * List running through cg_cgroup_links associated with a |
275 | * cgroup, anchored on cgroup->css_sets | 275 | * cgroup, anchored on cgroup->css_sets |
276 | */ | 276 | */ |
277 | struct list_head cgrp_link_list; | 277 | struct list_head cgrp_link_list; |
278 | struct cgroup *cgrp; | 278 | struct cgroup *cgrp; |
279 | /* | 279 | /* |
280 | * List running through cg_cgroup_links pointing at a | 280 | * List running through cg_cgroup_links pointing at a |
281 | * single css_set object, anchored on css_set->cg_links | 281 | * single css_set object, anchored on css_set->cg_links |
282 | */ | 282 | */ |
283 | struct list_head cg_link_list; | 283 | struct list_head cg_link_list; |
284 | struct css_set *cg; | 284 | struct css_set *cg; |
285 | }; | 285 | }; |
286 | 286 | ||
287 | /* The default css_set - used by init and its children prior to any | 287 | /* The default css_set - used by init and its children prior to any |
288 | * hierarchies being mounted. It contains a pointer to the root state | 288 | * hierarchies being mounted. It contains a pointer to the root state |
289 | * for each subsystem. Also used to anchor the list of css_sets. Not | 289 | * for each subsystem. Also used to anchor the list of css_sets. Not |
290 | * reference-counted, to improve performance when child cgroups | 290 | * reference-counted, to improve performance when child cgroups |
291 | * haven't been created. | 291 | * haven't been created. |
292 | */ | 292 | */ |
293 | 293 | ||
294 | static struct css_set init_css_set; | 294 | static struct css_set init_css_set; |
295 | static struct cg_cgroup_link init_css_set_link; | 295 | static struct cg_cgroup_link init_css_set_link; |
296 | 296 | ||
297 | static int cgroup_init_idr(struct cgroup_subsys *ss, | 297 | static int cgroup_init_idr(struct cgroup_subsys *ss, |
298 | struct cgroup_subsys_state *css); | 298 | struct cgroup_subsys_state *css); |
299 | 299 | ||
300 | /* css_set_lock protects the list of css_set objects, and the | 300 | /* css_set_lock protects the list of css_set objects, and the |
301 | * chain of tasks off each css_set. Nests outside task->alloc_lock | 301 | * chain of tasks off each css_set. Nests outside task->alloc_lock |
302 | * due to cgroup_iter_start() */ | 302 | * due to cgroup_iter_start() */ |
303 | static DEFINE_RWLOCK(css_set_lock); | 303 | static DEFINE_RWLOCK(css_set_lock); |
304 | static int css_set_count; | 304 | static int css_set_count; |
305 | 305 | ||
306 | /* | 306 | /* |
307 | * hash table for cgroup groups. This improves the performance of finding | 307 | * hash table for cgroup groups. This improves the performance of finding |
308 | * an existing css_set. This hash doesn't (currently) take into | 308 | * an existing css_set. This hash doesn't (currently) take into |
309 | * account cgroups in empty hierarchies. | 309 | * account cgroups in empty hierarchies. |
310 | */ | 310 | */ |
311 | #define CSS_SET_HASH_BITS 7 | 311 | #define CSS_SET_HASH_BITS 7 |
312 | #define CSS_SET_TABLE_SIZE (1 << CSS_SET_HASH_BITS) | 312 | #define CSS_SET_TABLE_SIZE (1 << CSS_SET_HASH_BITS) |
313 | static struct hlist_head css_set_table[CSS_SET_TABLE_SIZE]; | 313 | static struct hlist_head css_set_table[CSS_SET_TABLE_SIZE]; |
314 | 314 | ||
315 | static struct hlist_head *css_set_hash(struct cgroup_subsys_state *css[]) | 315 | static struct hlist_head *css_set_hash(struct cgroup_subsys_state *css[]) |
316 | { | 316 | { |
317 | int i; | 317 | int i; |
318 | int index; | 318 | int index; |
319 | unsigned long tmp = 0UL; | 319 | unsigned long tmp = 0UL; |
320 | 320 | ||
321 | for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) | 321 | for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) |
322 | tmp += (unsigned long)css[i]; | 322 | tmp += (unsigned long)css[i]; |
323 | tmp = (tmp >> 16) ^ tmp; | 323 | tmp = (tmp >> 16) ^ tmp; |
324 | 324 | ||
325 | index = hash_long(tmp, CSS_SET_HASH_BITS); | 325 | index = hash_long(tmp, CSS_SET_HASH_BITS); |
326 | 326 | ||
327 | return &css_set_table[index]; | 327 | return &css_set_table[index]; |
328 | } | 328 | } |
329 | 329 | ||
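For illustration only, a userspace sketch of the same pointer-folding hash; the helper name is invented and the final hash_long() step is replaced by a simple mask.

#include <stdio.h>

#define EX_HASH_BITS	7
#define EX_TABLE_SIZE	(1 << EX_HASH_BITS)

/* Fold an array of pointers into a bucket index, as css_set_hash() does. */
static unsigned int example_hash(void *ptrs[], int n)
{
	unsigned long tmp = 0UL;
	int i;

	for (i = 0; i < n; i++)
		tmp += (unsigned long)ptrs[i];
	tmp = (tmp >> 16) ^ tmp;

	return (unsigned int)(tmp & (EX_TABLE_SIZE - 1));	/* stand-in for hash_long() */
}

int main(void)
{
	int a, b, c;
	void *ptrs[] = { &a, &b, &c };

	printf("bucket %u of %d\n", example_hash(ptrs, 3), EX_TABLE_SIZE);
	return 0;
}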
330 | /* We don't maintain the lists running through each css_set to its | 330 | /* We don't maintain the lists running through each css_set to its |
331 | * task until after the first call to cgroup_iter_start(). This | 331 | * task until after the first call to cgroup_iter_start(). This |
332 | * reduces the fork()/exit() overhead for people who have cgroups | 332 | * reduces the fork()/exit() overhead for people who have cgroups |
333 | * compiled into their kernel but not actually in use */ | 333 | * compiled into their kernel but not actually in use */ |
334 | static int use_task_css_set_links __read_mostly; | 334 | static int use_task_css_set_links __read_mostly; |
335 | 335 | ||
336 | static void __put_css_set(struct css_set *cg, int taskexit) | 336 | static void __put_css_set(struct css_set *cg, int taskexit) |
337 | { | 337 | { |
338 | struct cg_cgroup_link *link; | 338 | struct cg_cgroup_link *link; |
339 | struct cg_cgroup_link *saved_link; | 339 | struct cg_cgroup_link *saved_link; |
340 | /* | 340 | /* |
341 | * Ensure that the refcount doesn't hit zero while any readers | 341 | * Ensure that the refcount doesn't hit zero while any readers |
342 | * can see it. Similar to atomic_dec_and_lock(), but for an | 342 | * can see it. Similar to atomic_dec_and_lock(), but for an |
343 | * rwlock | 343 | * rwlock |
344 | */ | 344 | */ |
345 | if (atomic_add_unless(&cg->refcount, -1, 1)) | 345 | if (atomic_add_unless(&cg->refcount, -1, 1)) |
346 | return; | 346 | return; |
347 | write_lock(&css_set_lock); | 347 | write_lock(&css_set_lock); |
348 | if (!atomic_dec_and_test(&cg->refcount)) { | 348 | if (!atomic_dec_and_test(&cg->refcount)) { |
349 | write_unlock(&css_set_lock); | 349 | write_unlock(&css_set_lock); |
350 | return; | 350 | return; |
351 | } | 351 | } |
352 | 352 | ||
353 | /* This css_set is dead. unlink it and release cgroup refcounts */ | 353 | /* This css_set is dead. unlink it and release cgroup refcounts */ |
354 | hlist_del(&cg->hlist); | 354 | hlist_del(&cg->hlist); |
355 | css_set_count--; | 355 | css_set_count--; |
356 | 356 | ||
357 | list_for_each_entry_safe(link, saved_link, &cg->cg_links, | 357 | list_for_each_entry_safe(link, saved_link, &cg->cg_links, |
358 | cg_link_list) { | 358 | cg_link_list) { |
359 | struct cgroup *cgrp = link->cgrp; | 359 | struct cgroup *cgrp = link->cgrp; |
360 | list_del(&link->cg_link_list); | 360 | list_del(&link->cg_link_list); |
361 | list_del(&link->cgrp_link_list); | 361 | list_del(&link->cgrp_link_list); |
362 | if (atomic_dec_and_test(&cgrp->count) && | 362 | if (atomic_dec_and_test(&cgrp->count) && |
363 | notify_on_release(cgrp)) { | 363 | notify_on_release(cgrp)) { |
364 | if (taskexit) | 364 | if (taskexit) |
365 | set_bit(CGRP_RELEASABLE, &cgrp->flags); | 365 | set_bit(CGRP_RELEASABLE, &cgrp->flags); |
366 | check_for_release(cgrp); | 366 | check_for_release(cgrp); |
367 | } | 367 | } |
368 | 368 | ||
369 | kfree(link); | 369 | kfree(link); |
370 | } | 370 | } |
371 | 371 | ||
372 | write_unlock(&css_set_lock); | 372 | write_unlock(&css_set_lock); |
373 | kfree_rcu(cg, rcu_head); | 373 | kfree_rcu(cg, rcu_head); |
374 | } | 374 | } |
375 | 375 | ||
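The shape of __put_css_set() (drop the reference lock-free unless it might reach zero, otherwise take the writer lock before the final decrement) is a general pattern. Below is a hedged userspace analogue using C11 atomics and a pthread rwlock; all names are invented for illustration.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdlib.h>
#include <pthread.h>

static pthread_rwlock_t table_lock = PTHREAD_RWLOCK_INITIALIZER;

struct obj {
	atomic_int refcount;
	/* ... fields reachable from a table protected by table_lock ... */
};

/* Decrement *v unless it is currently 1; return true if we decremented. */
static bool dec_unless_one(atomic_int *v)
{
	int old = atomic_load(v);

	while (old != 1)
		if (atomic_compare_exchange_weak(v, &old, old - 1))
			return true;
	return false;
}

static void put_obj(struct obj *o)
{
	/* Fast path: the count stays positive, no lock needed. */
	if (dec_unless_one(&o->refcount))
		return;
	/* Slow path: keep readers out while the count drops to zero. */
	pthread_rwlock_wrlock(&table_lock);
	if (atomic_fetch_sub(&o->refcount, 1) != 1) {
		pthread_rwlock_unlock(&table_lock);
		return;
	}
	/* ... unlink o from the shared table here ... */
	pthread_rwlock_unlock(&table_lock);
	free(o);
}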
376 | /* | 376 | /* |
377 | * refcounted get/put for css_set objects | 377 | * refcounted get/put for css_set objects |
378 | */ | 378 | */ |
379 | static inline void get_css_set(struct css_set *cg) | 379 | static inline void get_css_set(struct css_set *cg) |
380 | { | 380 | { |
381 | atomic_inc(&cg->refcount); | 381 | atomic_inc(&cg->refcount); |
382 | } | 382 | } |
383 | 383 | ||
384 | static inline void put_css_set(struct css_set *cg) | 384 | static inline void put_css_set(struct css_set *cg) |
385 | { | 385 | { |
386 | __put_css_set(cg, 0); | 386 | __put_css_set(cg, 0); |
387 | } | 387 | } |
388 | 388 | ||
389 | static inline void put_css_set_taskexit(struct css_set *cg) | 389 | static inline void put_css_set_taskexit(struct css_set *cg) |
390 | { | 390 | { |
391 | __put_css_set(cg, 1); | 391 | __put_css_set(cg, 1); |
392 | } | 392 | } |
393 | 393 | ||
394 | /* | 394 | /* |
395 | * compare_css_sets - helper function for find_existing_css_set(). | 395 | * compare_css_sets - helper function for find_existing_css_set(). |
396 | * @cg: candidate css_set being tested | 396 | * @cg: candidate css_set being tested |
397 | * @old_cg: existing css_set for a task | 397 | * @old_cg: existing css_set for a task |
398 | * @new_cgrp: cgroup that's being entered by the task | 398 | * @new_cgrp: cgroup that's being entered by the task |
399 | * @template: desired set of css pointers in css_set (pre-calculated) | 399 | * @template: desired set of css pointers in css_set (pre-calculated) |
400 | * | 400 | * |
401 | * Returns true if "cg" matches "old_cg" except for the hierarchy | 401 | * Returns true if "cg" matches "old_cg" except for the hierarchy |
402 | * which "new_cgrp" belongs to, for which it should match "new_cgrp". | 402 | * which "new_cgrp" belongs to, for which it should match "new_cgrp". |
403 | */ | 403 | */ |
404 | static bool compare_css_sets(struct css_set *cg, | 404 | static bool compare_css_sets(struct css_set *cg, |
405 | struct css_set *old_cg, | 405 | struct css_set *old_cg, |
406 | struct cgroup *new_cgrp, | 406 | struct cgroup *new_cgrp, |
407 | struct cgroup_subsys_state *template[]) | 407 | struct cgroup_subsys_state *template[]) |
408 | { | 408 | { |
409 | struct list_head *l1, *l2; | 409 | struct list_head *l1, *l2; |
410 | 410 | ||
411 | if (memcmp(template, cg->subsys, sizeof(cg->subsys))) { | 411 | if (memcmp(template, cg->subsys, sizeof(cg->subsys))) { |
412 | /* Not all subsystems matched */ | 412 | /* Not all subsystems matched */ |
413 | return false; | 413 | return false; |
414 | } | 414 | } |
415 | 415 | ||
416 | /* | 416 | /* |
417 | * Compare cgroup pointers in order to distinguish between | 417 | * Compare cgroup pointers in order to distinguish between |
418 | * different cgroups in hierarchies with no subsystems. We | 418 | * different cgroups in hierarchies with no subsystems. We |
419 | * could get by with just this check alone (and skip the | 419 | * could get by with just this check alone (and skip the |
420 | * memcmp above) but on most setups the memcmp check will | 420 | * memcmp above) but on most setups the memcmp check will |
421 | * avoid the need for this more expensive check on almost all | 421 | * avoid the need for this more expensive check on almost all |
422 | * candidates. | 422 | * candidates. |
423 | */ | 423 | */ |
424 | 424 | ||
425 | l1 = &cg->cg_links; | 425 | l1 = &cg->cg_links; |
426 | l2 = &old_cg->cg_links; | 426 | l2 = &old_cg->cg_links; |
427 | while (1) { | 427 | while (1) { |
428 | struct cg_cgroup_link *cgl1, *cgl2; | 428 | struct cg_cgroup_link *cgl1, *cgl2; |
429 | struct cgroup *cg1, *cg2; | 429 | struct cgroup *cg1, *cg2; |
430 | 430 | ||
431 | l1 = l1->next; | 431 | l1 = l1->next; |
432 | l2 = l2->next; | 432 | l2 = l2->next; |
433 | /* See if we reached the end - both lists are equal length. */ | 433 | /* See if we reached the end - both lists are equal length. */ |
434 | if (l1 == &cg->cg_links) { | 434 | if (l1 == &cg->cg_links) { |
435 | BUG_ON(l2 != &old_cg->cg_links); | 435 | BUG_ON(l2 != &old_cg->cg_links); |
436 | break; | 436 | break; |
437 | } else { | 437 | } else { |
438 | BUG_ON(l2 == &old_cg->cg_links); | 438 | BUG_ON(l2 == &old_cg->cg_links); |
439 | } | 439 | } |
440 | /* Locate the cgroups associated with these links. */ | 440 | /* Locate the cgroups associated with these links. */ |
441 | cgl1 = list_entry(l1, struct cg_cgroup_link, cg_link_list); | 441 | cgl1 = list_entry(l1, struct cg_cgroup_link, cg_link_list); |
442 | cgl2 = list_entry(l2, struct cg_cgroup_link, cg_link_list); | 442 | cgl2 = list_entry(l2, struct cg_cgroup_link, cg_link_list); |
443 | cg1 = cgl1->cgrp; | 443 | cg1 = cgl1->cgrp; |
444 | cg2 = cgl2->cgrp; | 444 | cg2 = cgl2->cgrp; |
445 | /* Hierarchies should be linked in the same order. */ | 445 | /* Hierarchies should be linked in the same order. */ |
446 | BUG_ON(cg1->root != cg2->root); | 446 | BUG_ON(cg1->root != cg2->root); |
447 | 447 | ||
448 | /* | 448 | /* |
449 | * If this hierarchy is the hierarchy of the cgroup | 449 | * If this hierarchy is the hierarchy of the cgroup |
450 | * that's changing, then we need to check that this | 450 | * that's changing, then we need to check that this |
451 | * css_set points to the new cgroup; if it's any other | 451 | * css_set points to the new cgroup; if it's any other |
452 | * hierarchy, then this css_set should point to the | 452 | * hierarchy, then this css_set should point to the |
453 | * same cgroup as the old css_set. | 453 | * same cgroup as the old css_set. |
454 | */ | 454 | */ |
455 | if (cg1->root == new_cgrp->root) { | 455 | if (cg1->root == new_cgrp->root) { |
456 | if (cg1 != new_cgrp) | 456 | if (cg1 != new_cgrp) |
457 | return false; | 457 | return false; |
458 | } else { | 458 | } else { |
459 | if (cg1 != cg2) | 459 | if (cg1 != cg2) |
460 | return false; | 460 | return false; |
461 | } | 461 | } |
462 | } | 462 | } |
463 | return true; | 463 | return true; |
464 | } | 464 | } |
465 | 465 | ||
466 | /* | 466 | /* |
467 | * find_existing_css_set() is a helper for | 467 | * find_existing_css_set() is a helper for |
468 | * find_css_set(), and checks to see whether an existing | 468 | * find_css_set(), and checks to see whether an existing |
469 | * css_set is suitable. | 469 | * css_set is suitable. |
470 | * | 470 | * |
471 | * oldcg: the cgroup group that we're using before the cgroup | 471 | * oldcg: the cgroup group that we're using before the cgroup |
472 | * transition | 472 | * transition |
473 | * | 473 | * |
474 | * cgrp: the cgroup that we're moving into | 474 | * cgrp: the cgroup that we're moving into |
475 | * | 475 | * |
476 | * template: location in which to build the desired set of subsystem | 476 | * template: location in which to build the desired set of subsystem |
477 | * state objects for the new cgroup group | 477 | * state objects for the new cgroup group |
478 | */ | 478 | */ |
479 | static struct css_set *find_existing_css_set( | 479 | static struct css_set *find_existing_css_set( |
480 | struct css_set *oldcg, | 480 | struct css_set *oldcg, |
481 | struct cgroup *cgrp, | 481 | struct cgroup *cgrp, |
482 | struct cgroup_subsys_state *template[]) | 482 | struct cgroup_subsys_state *template[]) |
483 | { | 483 | { |
484 | int i; | 484 | int i; |
485 | struct cgroupfs_root *root = cgrp->root; | 485 | struct cgroupfs_root *root = cgrp->root; |
486 | struct hlist_head *hhead; | 486 | struct hlist_head *hhead; |
487 | struct hlist_node *node; | 487 | struct hlist_node *node; |
488 | struct css_set *cg; | 488 | struct css_set *cg; |
489 | 489 | ||
490 | /* | 490 | /* |
491 | * Build the set of subsystem state objects that we want to see in the | 491 | * Build the set of subsystem state objects that we want to see in the |
492 | * new css_set. While subsystems can change globally, the entries here | 492 | * new css_set. While subsystems can change globally, the entries here |
493 | * won't change, so no need for locking. | 493 | * won't change, so no need for locking. |
494 | */ | 494 | */ |
495 | for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { | 495 | for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { |
496 | if (root->subsys_bits & (1UL << i)) { | 496 | if (root->subsys_bits & (1UL << i)) { |
497 | /* Subsystem is in this hierarchy. So we want | 497 | /* Subsystem is in this hierarchy. So we want |
498 | * the subsystem state from the new | 498 | * the subsystem state from the new |
499 | * cgroup */ | 499 | * cgroup */ |
500 | template[i] = cgrp->subsys[i]; | 500 | template[i] = cgrp->subsys[i]; |
501 | } else { | 501 | } else { |
502 | /* Subsystem is not in this hierarchy, so we | 502 | /* Subsystem is not in this hierarchy, so we |
503 | * don't want to change the subsystem state */ | 503 | * don't want to change the subsystem state */ |
504 | template[i] = oldcg->subsys[i]; | 504 | template[i] = oldcg->subsys[i]; |
505 | } | 505 | } |
506 | } | 506 | } |
507 | 507 | ||
508 | hhead = css_set_hash(template); | 508 | hhead = css_set_hash(template); |
509 | hlist_for_each_entry(cg, node, hhead, hlist) { | 509 | hlist_for_each_entry(cg, node, hhead, hlist) { |
510 | if (!compare_css_sets(cg, oldcg, cgrp, template)) | 510 | if (!compare_css_sets(cg, oldcg, cgrp, template)) |
511 | continue; | 511 | continue; |
512 | 512 | ||
513 | /* This css_set matches what we need */ | 513 | /* This css_set matches what we need */ |
514 | return cg; | 514 | return cg; |
515 | } | 515 | } |
516 | 516 | ||
517 | /* No existing cgroup group matched */ | 517 | /* No existing cgroup group matched */ |
518 | return NULL; | 518 | return NULL; |
519 | } | 519 | } |
520 | 520 | ||
521 | static void free_cg_links(struct list_head *tmp) | 521 | static void free_cg_links(struct list_head *tmp) |
522 | { | 522 | { |
523 | struct cg_cgroup_link *link; | 523 | struct cg_cgroup_link *link; |
524 | struct cg_cgroup_link *saved_link; | 524 | struct cg_cgroup_link *saved_link; |
525 | 525 | ||
526 | list_for_each_entry_safe(link, saved_link, tmp, cgrp_link_list) { | 526 | list_for_each_entry_safe(link, saved_link, tmp, cgrp_link_list) { |
527 | list_del(&link->cgrp_link_list); | 527 | list_del(&link->cgrp_link_list); |
528 | kfree(link); | 528 | kfree(link); |
529 | } | 529 | } |
530 | } | 530 | } |
531 | 531 | ||
532 | /* | 532 | /* |
533 | * allocate_cg_links() allocates "count" cg_cgroup_link structures | 533 | * allocate_cg_links() allocates "count" cg_cgroup_link structures |
534 | * and chains them on tmp through their cgrp_link_list fields. Returns 0 on | 534 | * and chains them on tmp through their cgrp_link_list fields. Returns 0 on |
535 | * success or a negative error | 535 | * success or a negative error |
536 | */ | 536 | */ |
537 | static int allocate_cg_links(int count, struct list_head *tmp) | 537 | static int allocate_cg_links(int count, struct list_head *tmp) |
538 | { | 538 | { |
539 | struct cg_cgroup_link *link; | 539 | struct cg_cgroup_link *link; |
540 | int i; | 540 | int i; |
541 | INIT_LIST_HEAD(tmp); | 541 | INIT_LIST_HEAD(tmp); |
542 | for (i = 0; i < count; i++) { | 542 | for (i = 0; i < count; i++) { |
543 | link = kmalloc(sizeof(*link), GFP_KERNEL); | 543 | link = kmalloc(sizeof(*link), GFP_KERNEL); |
544 | if (!link) { | 544 | if (!link) { |
545 | free_cg_links(tmp); | 545 | free_cg_links(tmp); |
546 | return -ENOMEM; | 546 | return -ENOMEM; |
547 | } | 547 | } |
548 | list_add(&link->cgrp_link_list, tmp); | 548 | list_add(&link->cgrp_link_list, tmp); |
549 | } | 549 | } |
550 | return 0; | 550 | return 0; |
551 | } | 551 | } |
552 | 552 | ||
553 | /** | 553 | /** |
554 | * link_css_set - a helper function to link a css_set to a cgroup | 554 | * link_css_set - a helper function to link a css_set to a cgroup |
555 | * @tmp_cg_links: cg_cgroup_link objects allocated by allocate_cg_links() | 555 | * @tmp_cg_links: cg_cgroup_link objects allocated by allocate_cg_links() |
556 | * @cg: the css_set to be linked | 556 | * @cg: the css_set to be linked |
557 | * @cgrp: the destination cgroup | 557 | * @cgrp: the destination cgroup |
558 | */ | 558 | */ |
559 | static void link_css_set(struct list_head *tmp_cg_links, | 559 | static void link_css_set(struct list_head *tmp_cg_links, |
560 | struct css_set *cg, struct cgroup *cgrp) | 560 | struct css_set *cg, struct cgroup *cgrp) |
561 | { | 561 | { |
562 | struct cg_cgroup_link *link; | 562 | struct cg_cgroup_link *link; |
563 | 563 | ||
564 | BUG_ON(list_empty(tmp_cg_links)); | 564 | BUG_ON(list_empty(tmp_cg_links)); |
565 | link = list_first_entry(tmp_cg_links, struct cg_cgroup_link, | 565 | link = list_first_entry(tmp_cg_links, struct cg_cgroup_link, |
566 | cgrp_link_list); | 566 | cgrp_link_list); |
567 | link->cg = cg; | 567 | link->cg = cg; |
568 | link->cgrp = cgrp; | 568 | link->cgrp = cgrp; |
569 | atomic_inc(&cgrp->count); | 569 | atomic_inc(&cgrp->count); |
570 | list_move(&link->cgrp_link_list, &cgrp->css_sets); | 570 | list_move(&link->cgrp_link_list, &cgrp->css_sets); |
571 | /* | 571 | /* |
572 | * Always add links to the tail of the list so that the list | 572 | * Always add links to the tail of the list so that the list |
573 | * is sorted by order of hierarchy creation | 573 | * is sorted by order of hierarchy creation |
574 | */ | 574 | */ |
575 | list_add_tail(&link->cg_link_list, &cg->cg_links); | 575 | list_add_tail(&link->cg_link_list, &cg->cg_links); |
576 | } | 576 | } |
577 | 577 | ||
578 | /* | 578 | /* |
579 | * find_css_set() takes an existing cgroup group and a | 579 | * find_css_set() takes an existing cgroup group and a |
580 | * cgroup object, and returns a css_set object that's | 580 | * cgroup object, and returns a css_set object that's |
581 | * equivalent to the old group, but with the given cgroup | 581 | * equivalent to the old group, but with the given cgroup |
582 | * substituted into the appropriate hierarchy. Must be called with | 582 | * substituted into the appropriate hierarchy. Must be called with |
583 | * cgroup_mutex held | 583 | * cgroup_mutex held |
584 | */ | 584 | */ |
585 | static struct css_set *find_css_set( | 585 | static struct css_set *find_css_set( |
586 | struct css_set *oldcg, struct cgroup *cgrp) | 586 | struct css_set *oldcg, struct cgroup *cgrp) |
587 | { | 587 | { |
588 | struct css_set *res; | 588 | struct css_set *res; |
589 | struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT]; | 589 | struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT]; |
590 | 590 | ||
591 | struct list_head tmp_cg_links; | 591 | struct list_head tmp_cg_links; |
592 | 592 | ||
593 | struct hlist_head *hhead; | 593 | struct hlist_head *hhead; |
594 | struct cg_cgroup_link *link; | 594 | struct cg_cgroup_link *link; |
595 | 595 | ||
596 | /* First see if we already have a cgroup group that matches | 596 | /* First see if we already have a cgroup group that matches |
597 | * the desired set */ | 597 | * the desired set */ |
598 | read_lock(&css_set_lock); | 598 | read_lock(&css_set_lock); |
599 | res = find_existing_css_set(oldcg, cgrp, template); | 599 | res = find_existing_css_set(oldcg, cgrp, template); |
600 | if (res) | 600 | if (res) |
601 | get_css_set(res); | 601 | get_css_set(res); |
602 | read_unlock(&css_set_lock); | 602 | read_unlock(&css_set_lock); |
603 | 603 | ||
604 | if (res) | 604 | if (res) |
605 | return res; | 605 | return res; |
606 | 606 | ||
607 | res = kmalloc(sizeof(*res), GFP_KERNEL); | 607 | res = kmalloc(sizeof(*res), GFP_KERNEL); |
608 | if (!res) | 608 | if (!res) |
609 | return NULL; | 609 | return NULL; |
610 | 610 | ||
611 | /* Allocate all the cg_cgroup_link objects that we'll need */ | 611 | /* Allocate all the cg_cgroup_link objects that we'll need */ |
612 | if (allocate_cg_links(root_count, &tmp_cg_links) < 0) { | 612 | if (allocate_cg_links(root_count, &tmp_cg_links) < 0) { |
613 | kfree(res); | 613 | kfree(res); |
614 | return NULL; | 614 | return NULL; |
615 | } | 615 | } |
616 | 616 | ||
617 | atomic_set(&res->refcount, 1); | 617 | atomic_set(&res->refcount, 1); |
618 | INIT_LIST_HEAD(&res->cg_links); | 618 | INIT_LIST_HEAD(&res->cg_links); |
619 | INIT_LIST_HEAD(&res->tasks); | 619 | INIT_LIST_HEAD(&res->tasks); |
620 | INIT_HLIST_NODE(&res->hlist); | 620 | INIT_HLIST_NODE(&res->hlist); |
621 | 621 | ||
622 | /* Copy the set of subsystem state objects generated in | 622 | /* Copy the set of subsystem state objects generated in |
623 | * find_existing_css_set() */ | 623 | * find_existing_css_set() */ |
624 | memcpy(res->subsys, template, sizeof(res->subsys)); | 624 | memcpy(res->subsys, template, sizeof(res->subsys)); |
625 | 625 | ||
626 | write_lock(&css_set_lock); | 626 | write_lock(&css_set_lock); |
627 | /* Add reference counts and links from the new css_set. */ | 627 | /* Add reference counts and links from the new css_set. */ |
628 | list_for_each_entry(link, &oldcg->cg_links, cg_link_list) { | 628 | list_for_each_entry(link, &oldcg->cg_links, cg_link_list) { |
629 | struct cgroup *c = link->cgrp; | 629 | struct cgroup *c = link->cgrp; |
630 | if (c->root == cgrp->root) | 630 | if (c->root == cgrp->root) |
631 | c = cgrp; | 631 | c = cgrp; |
632 | link_css_set(&tmp_cg_links, res, c); | 632 | link_css_set(&tmp_cg_links, res, c); |
633 | } | 633 | } |
634 | 634 | ||
635 | BUG_ON(!list_empty(&tmp_cg_links)); | 635 | BUG_ON(!list_empty(&tmp_cg_links)); |
636 | 636 | ||
637 | css_set_count++; | 637 | css_set_count++; |
638 | 638 | ||
639 | /* Add this cgroup group to the hash table */ | 639 | /* Add this cgroup group to the hash table */ |
640 | hhead = css_set_hash(res->subsys); | 640 | hhead = css_set_hash(res->subsys); |
641 | hlist_add_head(&res->hlist, hhead); | 641 | hlist_add_head(&res->hlist, hhead); |
642 | 642 | ||
643 | write_unlock(&css_set_lock); | 643 | write_unlock(&css_set_lock); |
644 | 644 | ||
645 | return res; | 645 | return res; |
646 | } | 646 | } |
647 | 647 | ||
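A hedged sketch of a caller (the wrapper name is hypothetical): cgroup_mutex must already be held as required above, and the reference returned by find_css_set() is eventually dropped with put_css_set().

static int example_prepare_move(struct css_set *oldcg, struct cgroup *cgrp,
				struct css_set **newcgp)
{
	struct css_set *newcg;

	/* caller holds cgroup_mutex, as find_css_set() requires */
	newcg = find_css_set(oldcg, cgrp);
	if (!newcg)
		return -ENOMEM;

	*newcgp = newcg;	/* carries one reference; drop with put_css_set() */
	return 0;
}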
648 | /* | 648 | /* |
649 | * Return the cgroup for "task" from the given hierarchy. Must be | 649 | * Return the cgroup for "task" from the given hierarchy. Must be |
650 | * called with cgroup_mutex held. | 650 | * called with cgroup_mutex held. |
651 | */ | 651 | */ |
652 | static struct cgroup *task_cgroup_from_root(struct task_struct *task, | 652 | static struct cgroup *task_cgroup_from_root(struct task_struct *task, |
653 | struct cgroupfs_root *root) | 653 | struct cgroupfs_root *root) |
654 | { | 654 | { |
655 | struct css_set *css; | 655 | struct css_set *css; |
656 | struct cgroup *res = NULL; | 656 | struct cgroup *res = NULL; |
657 | 657 | ||
658 | BUG_ON(!mutex_is_locked(&cgroup_mutex)); | 658 | BUG_ON(!mutex_is_locked(&cgroup_mutex)); |
659 | read_lock(&css_set_lock); | 659 | read_lock(&css_set_lock); |
660 | /* | 660 | /* |
661 | * No need to lock the task - since we hold cgroup_mutex the | 661 | * No need to lock the task - since we hold cgroup_mutex the |
662 | * task can't change groups, so the only thing that can happen | 662 | * task can't change groups, so the only thing that can happen |
663 | * is that it exits and its css is set back to init_css_set. | 663 | * is that it exits and its css is set back to init_css_set. |
664 | */ | 664 | */ |
665 | css = task->cgroups; | 665 | css = task->cgroups; |
666 | if (css == &init_css_set) { | 666 | if (css == &init_css_set) { |
667 | res = &root->top_cgroup; | 667 | res = &root->top_cgroup; |
668 | } else { | 668 | } else { |
669 | struct cg_cgroup_link *link; | 669 | struct cg_cgroup_link *link; |
670 | list_for_each_entry(link, &css->cg_links, cg_link_list) { | 670 | list_for_each_entry(link, &css->cg_links, cg_link_list) { |
671 | struct cgroup *c = link->cgrp; | 671 | struct cgroup *c = link->cgrp; |
672 | if (c->root == root) { | 672 | if (c->root == root) { |
673 | res = c; | 673 | res = c; |
674 | break; | 674 | break; |
675 | } | 675 | } |
676 | } | 676 | } |
677 | } | 677 | } |
678 | read_unlock(&css_set_lock); | 678 | read_unlock(&css_set_lock); |
679 | BUG_ON(!res); | 679 | BUG_ON(!res); |
680 | return res; | 680 | return res; |
681 | } | 681 | } |
682 | 682 | ||
683 | /* | 683 | /* |
684 | * There is one global cgroup mutex. We also require taking | 684 | * There is one global cgroup mutex. We also require taking |
685 | * task_lock() when dereferencing a task's cgroup subsys pointers. | 685 | * task_lock() when dereferencing a task's cgroup subsys pointers. |
686 | * See "The task_lock() exception", at the end of this comment. | 686 | * See "The task_lock() exception", at the end of this comment. |
687 | * | 687 | * |
688 | * A task must hold cgroup_mutex to modify cgroups. | 688 | * A task must hold cgroup_mutex to modify cgroups. |
689 | * | 689 | * |
690 | * Any task can increment and decrement the count field without a lock. | 690 | * Any task can increment and decrement the count field without a lock. |
691 | * So in general, code holding cgroup_mutex can't rely on the count | 691 | * So in general, code holding cgroup_mutex can't rely on the count |
692 | * field not changing. However, if the count goes to zero, then only | 692 | * field not changing. However, if the count goes to zero, then only |
693 | * cgroup_attach_task() can increment it again. Because a count of zero | 693 | * cgroup_attach_task() can increment it again. Because a count of zero |
694 | * means that no tasks are currently attached, therefore there is no | 694 | * means that no tasks are currently attached, therefore there is no |
695 | * way a task attached to that cgroup can fork (the other way to | 695 | * way a task attached to that cgroup can fork (the other way to |
696 | * increment the count). So code holding cgroup_mutex can safely | 696 | * increment the count). So code holding cgroup_mutex can safely |
697 | * assume that if the count is zero, it will stay zero. Similarly, if | 697 | * assume that if the count is zero, it will stay zero. Similarly, if |
698 | * a task holds cgroup_mutex on a cgroup with zero count, it | 698 | * a task holds cgroup_mutex on a cgroup with zero count, it |
699 | * knows that the cgroup won't be removed, as cgroup_rmdir() | 699 | * knows that the cgroup won't be removed, as cgroup_rmdir() |
700 | * needs that mutex. | 700 | * needs that mutex. |
701 | * | 701 | * |
702 | * The fork and exit callbacks, cgroup_fork() and cgroup_exit(), don't | 702 | * The fork and exit callbacks, cgroup_fork() and cgroup_exit(), don't |
703 | * (usually) take cgroup_mutex. These are the two most performance | 703 | * (usually) take cgroup_mutex. These are the two most performance |
704 | * critical pieces of code here. The exception occurs on cgroup_exit(), | 704 | * critical pieces of code here. The exception occurs on cgroup_exit(), |
705 | * when a task in a notify_on_release cgroup exits. Then cgroup_mutex | 705 | * when a task in a notify_on_release cgroup exits. Then cgroup_mutex |
706 | * is taken, and if the cgroup count is zero, a usermode call made | 706 | * is taken, and if the cgroup count is zero, a usermode call made |
707 | * to the release agent with the name of the cgroup (path relative to | 707 | * to the release agent with the name of the cgroup (path relative to |
708 | * the root of cgroup file system) as the argument. | 708 | * the root of cgroup file system) as the argument. |
709 | * | 709 | * |
710 | * A cgroup can only be deleted if both its 'count' of using tasks | 710 | * A cgroup can only be deleted if both its 'count' of using tasks |
711 | * is zero, and its list of 'children' cgroups is empty. Since all | 711 | * is zero, and its list of 'children' cgroups is empty. Since all |
712 | * tasks in the system use _some_ cgroup, and since there is always at | 712 | * tasks in the system use _some_ cgroup, and since there is always at |
713 | * least one task in the system (init, pid == 1), therefore, top_cgroup | 713 | * least one task in the system (init, pid == 1), therefore, top_cgroup |
714 | * always has child cgroups and/or using tasks. So we don't | 714 | * always has child cgroups and/or using tasks. So we don't |
715 | * need a special hack to ensure that top_cgroup cannot be deleted. | 715 | * need a special hack to ensure that top_cgroup cannot be deleted. |
716 | * | 716 | * |
717 | * The task_lock() exception | 717 | * The task_lock() exception |
718 | * | 718 | * |
719 | * The need for this exception arises from the action of | 719 | * The need for this exception arises from the action of |
720 | * cgroup_attach_task(), which overwrites one task's cgroup pointer with | 720 | * cgroup_attach_task(), which overwrites one task's cgroup pointer with |
721 | * another. It does so using cgroup_mutex, however there are | 721 | * another. It does so using cgroup_mutex, however there are |
722 | * several performance critical places that need to reference | 722 | * several performance critical places that need to reference |
723 | * task->cgroup without the expense of grabbing a system global | 723 | * task->cgroup without the expense of grabbing a system global |
724 | * mutex. Therefore except as noted below, when dereferencing or, as | 724 | * mutex. Therefore except as noted below, when dereferencing or, as |
725 | * in cgroup_attach_task(), modifying a task's cgroup pointer, we use | 725 | * in cgroup_attach_task(), modifying a task's cgroup pointer, we use |
726 | * task_lock(), which acts on a spinlock (task->alloc_lock) already in | 726 | * task_lock(), which acts on a spinlock (task->alloc_lock) already in |
727 | * the task_struct routinely used for such matters. | 727 | * the task_struct routinely used for such matters. |
728 | * | 728 | * |
729 | * P.S. One more locking exception. RCU is used to guard the | 729 | * P.S. One more locking exception. RCU is used to guard the |
730 | * update of a task's cgroup pointer by cgroup_attach_task() | 730 | * update of a task's cgroup pointer by cgroup_attach_task() |
731 | */ | 731 | */ |
732 | 732 | ||
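As a hedged illustration of the task_lock() exception described above (the helper is invented, not part of this file): a reader that needs a stable css_set reference can pin it under task_lock() before the lock is dropped.

static struct css_set *example_pin_task_css_set(struct task_struct *tsk)
{
	struct css_set *cg;

	task_lock(tsk);		/* tsk->alloc_lock */
	cg = tsk->cgroups;
	get_css_set(cg);	/* pin it before dropping the lock */
	task_unlock(tsk);

	return cg;		/* caller drops it with put_css_set() */
}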
733 | /** | 733 | /** |
734 | * cgroup_lock - lock out any changes to cgroup structures | 734 | * cgroup_lock - lock out any changes to cgroup structures |
735 | * | 735 | * |
736 | */ | 736 | */ |
737 | void cgroup_lock(void) | 737 | void cgroup_lock(void) |
738 | { | 738 | { |
739 | mutex_lock(&cgroup_mutex); | 739 | mutex_lock(&cgroup_mutex); |
740 | } | 740 | } |
741 | EXPORT_SYMBOL_GPL(cgroup_lock); | 741 | EXPORT_SYMBOL_GPL(cgroup_lock); |
742 | 742 | ||
743 | /** | 743 | /** |
744 | * cgroup_unlock - release lock on cgroup changes | 744 | * cgroup_unlock - release lock on cgroup changes |
745 | * | 745 | * |
746 | * Undo the lock taken in a previous cgroup_lock() call. | 746 | * Undo the lock taken in a previous cgroup_lock() call. |
747 | */ | 747 | */ |
748 | void cgroup_unlock(void) | 748 | void cgroup_unlock(void) |
749 | { | 749 | { |
750 | mutex_unlock(&cgroup_mutex); | 750 | mutex_unlock(&cgroup_mutex); |
751 | } | 751 | } |
752 | EXPORT_SYMBOL_GPL(cgroup_unlock); | 752 | EXPORT_SYMBOL_GPL(cgroup_unlock); |
753 | 753 | ||
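A minimal usage sketch (illustrative only) bracketing a lookup that, per its comment, requires cgroup_mutex:

static void example_with_cgroup_lock(struct task_struct *tsk,
				     struct cgroupfs_root *root)
{
	struct cgroup *cgrp;

	cgroup_lock();			/* takes cgroup_mutex */
	cgrp = task_cgroup_from_root(tsk, root);
	(void)cgrp;			/* use cgrp while the mutex is still held */
	cgroup_unlock();
}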
754 | /* | 754 | /* |
755 | * A couple of forward declarations required, due to cyclic reference loop: | 755 | * A couple of forward declarations required, due to cyclic reference loop: |
756 | * cgroup_mkdir -> cgroup_create -> cgroup_populate_dir -> | 756 | * cgroup_mkdir -> cgroup_create -> cgroup_populate_dir -> |
757 | * cgroup_add_file -> cgroup_create_file -> cgroup_dir_inode_operations | 757 | * cgroup_add_file -> cgroup_create_file -> cgroup_dir_inode_operations |
758 | * -> cgroup_mkdir. | 758 | * -> cgroup_mkdir. |
759 | */ | 759 | */ |
760 | 760 | ||
761 | static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, int mode); | 761 | static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, int mode); |
762 | static struct dentry *cgroup_lookup(struct inode *, struct dentry *, struct nameidata *); | 762 | static struct dentry *cgroup_lookup(struct inode *, struct dentry *, struct nameidata *); |
763 | static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry); | 763 | static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry); |
764 | static int cgroup_populate_dir(struct cgroup *cgrp); | 764 | static int cgroup_populate_dir(struct cgroup *cgrp); |
765 | static const struct inode_operations cgroup_dir_inode_operations; | 765 | static const struct inode_operations cgroup_dir_inode_operations; |
766 | static const struct file_operations proc_cgroupstats_operations; | 766 | static const struct file_operations proc_cgroupstats_operations; |
767 | 767 | ||
768 | static struct backing_dev_info cgroup_backing_dev_info = { | 768 | static struct backing_dev_info cgroup_backing_dev_info = { |
769 | .name = "cgroup", | 769 | .name = "cgroup", |
770 | .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK, | 770 | .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK, |
771 | }; | 771 | }; |
772 | 772 | ||
773 | static int alloc_css_id(struct cgroup_subsys *ss, | 773 | static int alloc_css_id(struct cgroup_subsys *ss, |
774 | struct cgroup *parent, struct cgroup *child); | 774 | struct cgroup *parent, struct cgroup *child); |
775 | 775 | ||
776 | static struct inode *cgroup_new_inode(mode_t mode, struct super_block *sb) | 776 | static struct inode *cgroup_new_inode(mode_t mode, struct super_block *sb) |
777 | { | 777 | { |
778 | struct inode *inode = new_inode(sb); | 778 | struct inode *inode = new_inode(sb); |
779 | 779 | ||
780 | if (inode) { | 780 | if (inode) { |
781 | inode->i_ino = get_next_ino(); | 781 | inode->i_ino = get_next_ino(); |
782 | inode->i_mode = mode; | 782 | inode->i_mode = mode; |
783 | inode->i_uid = current_fsuid(); | 783 | inode->i_uid = current_fsuid(); |
784 | inode->i_gid = current_fsgid(); | 784 | inode->i_gid = current_fsgid(); |
785 | inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; | 785 | inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; |
786 | inode->i_mapping->backing_dev_info = &cgroup_backing_dev_info; | 786 | inode->i_mapping->backing_dev_info = &cgroup_backing_dev_info; |
787 | } | 787 | } |
788 | return inode; | 788 | return inode; |
789 | } | 789 | } |
790 | 790 | ||
791 | /* | 791 | /* |
792 | * Call subsys's pre_destroy handler. | 792 | * Call subsys's pre_destroy handler. |
793 | * This is called before css refcnt check. | 793 | * This is called before css refcnt check. |
794 | */ | 794 | */ |
795 | static int cgroup_call_pre_destroy(struct cgroup *cgrp) | 795 | static int cgroup_call_pre_destroy(struct cgroup *cgrp) |
796 | { | 796 | { |
797 | struct cgroup_subsys *ss; | 797 | struct cgroup_subsys *ss; |
798 | int ret = 0; | 798 | int ret = 0; |
799 | 799 | ||
800 | for_each_subsys(cgrp->root, ss) | 800 | for_each_subsys(cgrp->root, ss) |
801 | if (ss->pre_destroy) { | 801 | if (ss->pre_destroy) { |
802 | ret = ss->pre_destroy(ss, cgrp); | 802 | ret = ss->pre_destroy(ss, cgrp); |
803 | if (ret) | 803 | if (ret) |
804 | break; | 804 | break; |
805 | } | 805 | } |
806 | 806 | ||
807 | return ret; | 807 | return ret; |
808 | } | 808 | } |
809 | 809 | ||
810 | static void cgroup_diput(struct dentry *dentry, struct inode *inode) | 810 | static void cgroup_diput(struct dentry *dentry, struct inode *inode) |
811 | { | 811 | { |
812 | /* is dentry a directory? If so, kfree() the associated cgroup */ | 812 | /* is dentry a directory? If so, kfree() the associated cgroup */ |
813 | if (S_ISDIR(inode->i_mode)) { | 813 | if (S_ISDIR(inode->i_mode)) { |
814 | struct cgroup *cgrp = dentry->d_fsdata; | 814 | struct cgroup *cgrp = dentry->d_fsdata; |
815 | struct cgroup_subsys *ss; | 815 | struct cgroup_subsys *ss; |
816 | BUG_ON(!(cgroup_is_removed(cgrp))); | 816 | BUG_ON(!(cgroup_is_removed(cgrp))); |
817 | /* It's possible for external users to be holding css | 817 | /* It's possible for external users to be holding css |
818 | * reference counts on a cgroup; css_put() needs to | 818 | * reference counts on a cgroup; css_put() needs to |
819 | * be able to access the cgroup after decrementing | 819 | * be able to access the cgroup after decrementing |
820 | * the reference count in order to know if it needs to | 820 | * the reference count in order to know if it needs to |
821 | * queue the cgroup to be handled by the release | 821 | * queue the cgroup to be handled by the release |
822 | * agent */ | 822 | * agent */ |
823 | synchronize_rcu(); | 823 | synchronize_rcu(); |
824 | 824 | ||
825 | mutex_lock(&cgroup_mutex); | 825 | mutex_lock(&cgroup_mutex); |
826 | /* | 826 | /* |
827 | * Release the subsystem state objects. | 827 | * Release the subsystem state objects. |
828 | */ | 828 | */ |
829 | for_each_subsys(cgrp->root, ss) | 829 | for_each_subsys(cgrp->root, ss) |
830 | ss->destroy(ss, cgrp); | 830 | ss->destroy(ss, cgrp); |
831 | 831 | ||
832 | cgrp->root->number_of_cgroups--; | 832 | cgrp->root->number_of_cgroups--; |
833 | mutex_unlock(&cgroup_mutex); | 833 | mutex_unlock(&cgroup_mutex); |
834 | 834 | ||
835 | /* | 835 | /* |
836 | * Drop the active superblock reference that we took when we | 836 | * Drop the active superblock reference that we took when we |
837 | * created the cgroup | 837 | * created the cgroup |
838 | */ | 838 | */ |
839 | deactivate_super(cgrp->root->sb); | 839 | deactivate_super(cgrp->root->sb); |
840 | 840 | ||
841 | /* | 841 | /* |
842 | * if we're getting rid of the cgroup, refcount should ensure | 842 | * if we're getting rid of the cgroup, refcount should ensure |
843 | * that there are no pidlists left. | 843 | * that there are no pidlists left. |
844 | */ | 844 | */ |
845 | BUG_ON(!list_empty(&cgrp->pidlists)); | 845 | BUG_ON(!list_empty(&cgrp->pidlists)); |
846 | 846 | ||
847 | kfree_rcu(cgrp, rcu_head); | 847 | kfree_rcu(cgrp, rcu_head); |
848 | } | 848 | } |
849 | iput(inode); | 849 | iput(inode); |
850 | } | 850 | } |
851 | 851 | ||
852 | static int cgroup_delete(const struct dentry *d) | 852 | static int cgroup_delete(const struct dentry *d) |
853 | { | 853 | { |
854 | return 1; | 854 | return 1; |
855 | } | 855 | } |
856 | 856 | ||
857 | static void remove_dir(struct dentry *d) | 857 | static void remove_dir(struct dentry *d) |
858 | { | 858 | { |
859 | struct dentry *parent = dget(d->d_parent); | 859 | struct dentry *parent = dget(d->d_parent); |
860 | 860 | ||
861 | d_delete(d); | 861 | d_delete(d); |
862 | simple_rmdir(parent->d_inode, d); | 862 | simple_rmdir(parent->d_inode, d); |
863 | dput(parent); | 863 | dput(parent); |
864 | } | 864 | } |
865 | 865 | ||
866 | static void cgroup_clear_directory(struct dentry *dentry) | 866 | static void cgroup_clear_directory(struct dentry *dentry) |
867 | { | 867 | { |
868 | struct list_head *node; | 868 | struct list_head *node; |
869 | 869 | ||
870 | BUG_ON(!mutex_is_locked(&dentry->d_inode->i_mutex)); | 870 | BUG_ON(!mutex_is_locked(&dentry->d_inode->i_mutex)); |
871 | spin_lock(&dentry->d_lock); | 871 | spin_lock(&dentry->d_lock); |
872 | node = dentry->d_subdirs.next; | 872 | node = dentry->d_subdirs.next; |
873 | while (node != &dentry->d_subdirs) { | 873 | while (node != &dentry->d_subdirs) { |
874 | struct dentry *d = list_entry(node, struct dentry, d_u.d_child); | 874 | struct dentry *d = list_entry(node, struct dentry, d_u.d_child); |
875 | 875 | ||
876 | spin_lock_nested(&d->d_lock, DENTRY_D_LOCK_NESTED); | 876 | spin_lock_nested(&d->d_lock, DENTRY_D_LOCK_NESTED); |
877 | list_del_init(node); | 877 | list_del_init(node); |
878 | if (d->d_inode) { | 878 | if (d->d_inode) { |
879 | /* This should never be called on a cgroup | 879 | /* This should never be called on a cgroup |
880 | * directory with child cgroups */ | 880 | * directory with child cgroups */ |
881 | BUG_ON(d->d_inode->i_mode & S_IFDIR); | 881 | BUG_ON(d->d_inode->i_mode & S_IFDIR); |
882 | dget_dlock(d); | 882 | dget_dlock(d); |
883 | spin_unlock(&d->d_lock); | 883 | spin_unlock(&d->d_lock); |
884 | spin_unlock(&dentry->d_lock); | 884 | spin_unlock(&dentry->d_lock); |
885 | d_delete(d); | 885 | d_delete(d); |
886 | simple_unlink(dentry->d_inode, d); | 886 | simple_unlink(dentry->d_inode, d); |
887 | dput(d); | 887 | dput(d); |
888 | spin_lock(&dentry->d_lock); | 888 | spin_lock(&dentry->d_lock); |
889 | } else | 889 | } else |
890 | spin_unlock(&d->d_lock); | 890 | spin_unlock(&d->d_lock); |
891 | node = dentry->d_subdirs.next; | 891 | node = dentry->d_subdirs.next; |
892 | } | 892 | } |
893 | spin_unlock(&dentry->d_lock); | 893 | spin_unlock(&dentry->d_lock); |
894 | } | 894 | } |
895 | 895 | ||
896 | /* | 896 | /* |
897 | * NOTE : the dentry must have been dget()'ed | 897 | * NOTE : the dentry must have been dget()'ed |
898 | */ | 898 | */ |
899 | static void cgroup_d_remove_dir(struct dentry *dentry) | 899 | static void cgroup_d_remove_dir(struct dentry *dentry) |
900 | { | 900 | { |
901 | struct dentry *parent; | 901 | struct dentry *parent; |
902 | 902 | ||
903 | cgroup_clear_directory(dentry); | 903 | cgroup_clear_directory(dentry); |
904 | 904 | ||
905 | parent = dentry->d_parent; | 905 | parent = dentry->d_parent; |
906 | spin_lock(&parent->d_lock); | 906 | spin_lock(&parent->d_lock); |
907 | spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED); | 907 | spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED); |
908 | list_del_init(&dentry->d_u.d_child); | 908 | list_del_init(&dentry->d_u.d_child); |
909 | spin_unlock(&dentry->d_lock); | 909 | spin_unlock(&dentry->d_lock); |
910 | spin_unlock(&parent->d_lock); | 910 | spin_unlock(&parent->d_lock); |
911 | remove_dir(dentry); | 911 | remove_dir(dentry); |
912 | } | 912 | } |
913 | 913 | ||
914 | /* | 914 | /* |
915 | * A queue for waiters to do rmdir() on a cgroup. A task will sleep when | 915 | * A queue for waiters to do rmdir() on a cgroup. A task will sleep when |
916 | * cgroup->count == 0 && list_empty(&cgroup->children) && subsys has some | 916 | * cgroup->count == 0 && list_empty(&cgroup->children) && subsys has some |
917 | * reference to css->refcnt. In general, this refcnt is expected to go down | 917 | * reference to css->refcnt. In general, this refcnt is expected to go down |
918 | * to zero soon. | 918 | * to zero soon. |
919 | * | 919 | * |
920 | * CGRP_WAIT_ON_RMDIR flag is set under cgroup's inode->i_mutex; | 920 | * CGRP_WAIT_ON_RMDIR flag is set under cgroup's inode->i_mutex; |
921 | */ | 921 | */ |
922 | DECLARE_WAIT_QUEUE_HEAD(cgroup_rmdir_waitq); | 922 | DECLARE_WAIT_QUEUE_HEAD(cgroup_rmdir_waitq); |
923 | 923 | ||
924 | static void cgroup_wakeup_rmdir_waiter(struct cgroup *cgrp) | 924 | static void cgroup_wakeup_rmdir_waiter(struct cgroup *cgrp) |
925 | { | 925 | { |
926 | if (unlikely(test_and_clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags))) | 926 | if (unlikely(test_and_clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags))) |
927 | wake_up_all(&cgroup_rmdir_waitq); | 927 | wake_up_all(&cgroup_rmdir_waitq); |
928 | } | 928 | } |
929 | 929 | ||
930 | void cgroup_exclude_rmdir(struct cgroup_subsys_state *css) | 930 | void cgroup_exclude_rmdir(struct cgroup_subsys_state *css) |
931 | { | 931 | { |
932 | css_get(css); | 932 | css_get(css); |
933 | } | 933 | } |
934 | 934 | ||
935 | void cgroup_release_and_wakeup_rmdir(struct cgroup_subsys_state *css) | 935 | void cgroup_release_and_wakeup_rmdir(struct cgroup_subsys_state *css) |
936 | { | 936 | { |
937 | cgroup_wakeup_rmdir_waiter(css->cgroup); | 937 | cgroup_wakeup_rmdir_waiter(css->cgroup); |
938 | css_put(css); | 938 | css_put(css); |
939 | } | 939 | } |
940 | 940 | ||
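A hedged sketch of how the pair above is used together (the wrapper is hypothetical): take the css reference before work that must not race with cgroup removal, then release it and wake any rmdir() waiter.

static void example_pin_against_rmdir(struct cgroup_subsys_state *css)
{
	cgroup_exclude_rmdir(css);		/* css_get() */
	/* ... work that must not race with cgroup removal ... */
	cgroup_release_and_wakeup_rmdir(css);	/* wake waiters, then css_put() */
}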
941 | /* | 941 | /* |
942 | * Call with cgroup_mutex held. Drops reference counts on modules, including | 942 | * Call with cgroup_mutex held. Drops reference counts on modules, including |
943 | * any duplicate ones that parse_cgroupfs_options took. If this function | 943 | * any duplicate ones that parse_cgroupfs_options took. If this function |
944 | * returns an error, no reference counts are touched. | 944 | * returns an error, no reference counts are touched. |
945 | */ | 945 | */ |
946 | static int rebind_subsystems(struct cgroupfs_root *root, | 946 | static int rebind_subsystems(struct cgroupfs_root *root, |
947 | unsigned long final_bits) | 947 | unsigned long final_bits) |
948 | { | 948 | { |
949 | unsigned long added_bits, removed_bits; | 949 | unsigned long added_bits, removed_bits; |
950 | struct cgroup *cgrp = &root->top_cgroup; | 950 | struct cgroup *cgrp = &root->top_cgroup; |
951 | int i; | 951 | int i; |
952 | 952 | ||
953 | BUG_ON(!mutex_is_locked(&cgroup_mutex)); | 953 | BUG_ON(!mutex_is_locked(&cgroup_mutex)); |
954 | 954 | ||
955 | removed_bits = root->actual_subsys_bits & ~final_bits; | 955 | removed_bits = root->actual_subsys_bits & ~final_bits; |
956 | added_bits = final_bits & ~root->actual_subsys_bits; | 956 | added_bits = final_bits & ~root->actual_subsys_bits; |
957 | /* Check that any added subsystems are currently free */ | 957 | /* Check that any added subsystems are currently free */ |
958 | for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { | 958 | for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { |
959 | unsigned long bit = 1UL << i; | 959 | unsigned long bit = 1UL << i; |
960 | struct cgroup_subsys *ss = subsys[i]; | 960 | struct cgroup_subsys *ss = subsys[i]; |
961 | if (!(bit & added_bits)) | 961 | if (!(bit & added_bits)) |
962 | continue; | 962 | continue; |
963 | /* | 963 | /* |
964 | * Nobody should tell us to do a subsys that doesn't exist: | 964 | * Nobody should tell us to do a subsys that doesn't exist: |
965 | * parse_cgroupfs_options should catch that case and refcounts | 965 | * parse_cgroupfs_options should catch that case and refcounts |
966 | * ensure that subsystems won't disappear once selected. | 966 | * ensure that subsystems won't disappear once selected. |
967 | */ | 967 | */ |
968 | BUG_ON(ss == NULL); | 968 | BUG_ON(ss == NULL); |
969 | if (ss->root != &rootnode) { | 969 | if (ss->root != &rootnode) { |
970 | /* Subsystem isn't free */ | 970 | /* Subsystem isn't free */ |
971 | return -EBUSY; | 971 | return -EBUSY; |
972 | } | 972 | } |
973 | } | 973 | } |
974 | 974 | ||
975 | /* Currently we don't handle adding/removing subsystems when | 975 | /* Currently we don't handle adding/removing subsystems when |
976 | * any child cgroups exist. This is theoretically supportable | 976 | * any child cgroups exist. This is theoretically supportable |
977 | * but involves complex error handling, so it's being left until | 977 | * but involves complex error handling, so it's being left until |
978 | * later */ | 978 | * later */ |
979 | if (root->number_of_cgroups > 1) | 979 | if (root->number_of_cgroups > 1) |
980 | return -EBUSY; | 980 | return -EBUSY; |
981 | 981 | ||
982 | /* Process each subsystem */ | 982 | /* Process each subsystem */ |
983 | for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { | 983 | for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { |
984 | struct cgroup_subsys *ss = subsys[i]; | 984 | struct cgroup_subsys *ss = subsys[i]; |
985 | unsigned long bit = 1UL << i; | 985 | unsigned long bit = 1UL << i; |
986 | if (bit & added_bits) { | 986 | if (bit & added_bits) { |
987 | /* We're binding this subsystem to this hierarchy */ | 987 | /* We're binding this subsystem to this hierarchy */ |
988 | BUG_ON(ss == NULL); | 988 | BUG_ON(ss == NULL); |
989 | BUG_ON(cgrp->subsys[i]); | 989 | BUG_ON(cgrp->subsys[i]); |
990 | BUG_ON(!dummytop->subsys[i]); | 990 | BUG_ON(!dummytop->subsys[i]); |
991 | BUG_ON(dummytop->subsys[i]->cgroup != dummytop); | 991 | BUG_ON(dummytop->subsys[i]->cgroup != dummytop); |
992 | mutex_lock(&ss->hierarchy_mutex); | 992 | mutex_lock(&ss->hierarchy_mutex); |
993 | cgrp->subsys[i] = dummytop->subsys[i]; | 993 | cgrp->subsys[i] = dummytop->subsys[i]; |
994 | cgrp->subsys[i]->cgroup = cgrp; | 994 | cgrp->subsys[i]->cgroup = cgrp; |
995 | list_move(&ss->sibling, &root->subsys_list); | 995 | list_move(&ss->sibling, &root->subsys_list); |
996 | ss->root = root; | 996 | ss->root = root; |
997 | if (ss->bind) | 997 | if (ss->bind) |
998 | ss->bind(ss, cgrp); | 998 | ss->bind(ss, cgrp); |
999 | mutex_unlock(&ss->hierarchy_mutex); | 999 | mutex_unlock(&ss->hierarchy_mutex); |
1000 | /* refcount was already taken, and we're keeping it */ | 1000 | /* refcount was already taken, and we're keeping it */ |
1001 | } else if (bit & removed_bits) { | 1001 | } else if (bit & removed_bits) { |
1002 | /* We're removing this subsystem */ | 1002 | /* We're removing this subsystem */ |
1003 | BUG_ON(ss == NULL); | 1003 | BUG_ON(ss == NULL); |
1004 | BUG_ON(cgrp->subsys[i] != dummytop->subsys[i]); | 1004 | BUG_ON(cgrp->subsys[i] != dummytop->subsys[i]); |
1005 | BUG_ON(cgrp->subsys[i]->cgroup != cgrp); | 1005 | BUG_ON(cgrp->subsys[i]->cgroup != cgrp); |
1006 | mutex_lock(&ss->hierarchy_mutex); | 1006 | mutex_lock(&ss->hierarchy_mutex); |
1007 | if (ss->bind) | 1007 | if (ss->bind) |
1008 | ss->bind(ss, dummytop); | 1008 | ss->bind(ss, dummytop); |
1009 | dummytop->subsys[i]->cgroup = dummytop; | 1009 | dummytop->subsys[i]->cgroup = dummytop; |
1010 | cgrp->subsys[i] = NULL; | 1010 | cgrp->subsys[i] = NULL; |
1011 | subsys[i]->root = &rootnode; | 1011 | subsys[i]->root = &rootnode; |
1012 | list_move(&ss->sibling, &rootnode.subsys_list); | 1012 | list_move(&ss->sibling, &rootnode.subsys_list); |
1013 | mutex_unlock(&ss->hierarchy_mutex); | 1013 | mutex_unlock(&ss->hierarchy_mutex); |
1014 | /* subsystem is now free - drop reference on module */ | 1014 | /* subsystem is now free - drop reference on module */ |
1015 | module_put(ss->module); | 1015 | module_put(ss->module); |
1016 | } else if (bit & final_bits) { | 1016 | } else if (bit & final_bits) { |
1017 | /* Subsystem state should already exist */ | 1017 | /* Subsystem state should already exist */ |
1018 | BUG_ON(ss == NULL); | 1018 | BUG_ON(ss == NULL); |
1019 | BUG_ON(!cgrp->subsys[i]); | 1019 | BUG_ON(!cgrp->subsys[i]); |
1020 | /* | 1020 | /* |
1021 | * a refcount was taken, but we already had one, so | 1021 | * a refcount was taken, but we already had one, so |
1022 | * drop the extra reference. | 1022 | * drop the extra reference. |
1023 | */ | 1023 | */ |
1024 | module_put(ss->module); | 1024 | module_put(ss->module); |
1025 | #ifdef CONFIG_MODULE_UNLOAD | 1025 | #ifdef CONFIG_MODULE_UNLOAD |
1026 | BUG_ON(ss->module && !module_refcount(ss->module)); | 1026 | BUG_ON(ss->module && !module_refcount(ss->module)); |
1027 | #endif | 1027 | #endif |
1028 | } else { | 1028 | } else { |
1029 | /* Subsystem state shouldn't exist */ | 1029 | /* Subsystem state shouldn't exist */ |
1030 | BUG_ON(cgrp->subsys[i]); | 1030 | BUG_ON(cgrp->subsys[i]); |
1031 | } | 1031 | } |
1032 | } | 1032 | } |
1033 | root->subsys_bits = root->actual_subsys_bits = final_bits; | 1033 | root->subsys_bits = root->actual_subsys_bits = final_bits; |
1034 | synchronize_rcu(); | 1034 | synchronize_rcu(); |
1035 | 1035 | ||
1036 | return 0; | 1036 | return 0; |
1037 | } | 1037 | } |
1038 | 1038 | ||
1039 | static int cgroup_show_options(struct seq_file *seq, struct vfsmount *vfs) | 1039 | static int cgroup_show_options(struct seq_file *seq, struct vfsmount *vfs) |
1040 | { | 1040 | { |
1041 | struct cgroupfs_root *root = vfs->mnt_sb->s_fs_info; | 1041 | struct cgroupfs_root *root = vfs->mnt_sb->s_fs_info; |
1042 | struct cgroup_subsys *ss; | 1042 | struct cgroup_subsys *ss; |
1043 | 1043 | ||
1044 | mutex_lock(&cgroup_mutex); | 1044 | mutex_lock(&cgroup_mutex); |
1045 | for_each_subsys(root, ss) | 1045 | for_each_subsys(root, ss) |
1046 | seq_printf(seq, ",%s", ss->name); | 1046 | seq_printf(seq, ",%s", ss->name); |
1047 | if (test_bit(ROOT_NOPREFIX, &root->flags)) | 1047 | if (test_bit(ROOT_NOPREFIX, &root->flags)) |
1048 | seq_puts(seq, ",noprefix"); | 1048 | seq_puts(seq, ",noprefix"); |
1049 | if (strlen(root->release_agent_path)) | 1049 | if (strlen(root->release_agent_path)) |
1050 | seq_printf(seq, ",release_agent=%s", root->release_agent_path); | 1050 | seq_printf(seq, ",release_agent=%s", root->release_agent_path); |
1051 | if (clone_children(&root->top_cgroup)) | 1051 | if (clone_children(&root->top_cgroup)) |
1052 | seq_puts(seq, ",clone_children"); | 1052 | seq_puts(seq, ",clone_children"); |
1053 | if (strlen(root->name)) | 1053 | if (strlen(root->name)) |
1054 | seq_printf(seq, ",name=%s", root->name); | 1054 | seq_printf(seq, ",name=%s", root->name); |
1055 | mutex_unlock(&cgroup_mutex); | 1055 | mutex_unlock(&cgroup_mutex); |
1056 | return 0; | 1056 | return 0; |
1057 | } | 1057 | } |
1058 | 1058 | ||
1059 | struct cgroup_sb_opts { | 1059 | struct cgroup_sb_opts { |
1060 | unsigned long subsys_bits; | 1060 | unsigned long subsys_bits; |
1061 | unsigned long flags; | 1061 | unsigned long flags; |
1062 | char *release_agent; | 1062 | char *release_agent; |
1063 | bool clone_children; | 1063 | bool clone_children; |
1064 | char *name; | 1064 | char *name; |
1065 | /* User explicitly requested empty subsystem */ | 1065 | /* User explicitly requested empty subsystem */ |
1066 | bool none; | 1066 | bool none; |
1067 | 1067 | ||
1068 | struct cgroupfs_root *new_root; | 1068 | struct cgroupfs_root *new_root; |
1069 | 1069 | ||
1070 | }; | 1070 | }; |
1071 | 1071 | ||
1072 | /* | 1072 | /* |
1073 | * Convert a hierarchy specifier into a bitmask of subsystems and flags. Call | 1073 | * Convert a hierarchy specifier into a bitmask of subsystems and flags. Call |
1074 | * with cgroup_mutex held to protect the subsys[] array. This function takes | 1074 | * with cgroup_mutex held to protect the subsys[] array. This function takes |
1075 | * refcounts on subsystems to be used, unless it returns error, in which case | 1075 | * refcounts on subsystems to be used, unless it returns error, in which case |
1076 | * no refcounts are taken. | 1076 | * no refcounts are taken. |
1077 | */ | 1077 | */ |
1078 | static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) | 1078 | static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) |
1079 | { | 1079 | { |
1080 | char *token, *o = data; | 1080 | char *token, *o = data; |
1081 | bool all_ss = false, one_ss = false; | 1081 | bool all_ss = false, one_ss = false; |
1082 | unsigned long mask = (unsigned long)-1; | 1082 | unsigned long mask = (unsigned long)-1; |
1083 | int i; | 1083 | int i; |
1084 | bool module_pin_failed = false; | 1084 | bool module_pin_failed = false; |
1085 | 1085 | ||
1086 | BUG_ON(!mutex_is_locked(&cgroup_mutex)); | 1086 | BUG_ON(!mutex_is_locked(&cgroup_mutex)); |
1087 | 1087 | ||
1088 | #ifdef CONFIG_CPUSETS | 1088 | #ifdef CONFIG_CPUSETS |
1089 | mask = ~(1UL << cpuset_subsys_id); | 1089 | mask = ~(1UL << cpuset_subsys_id); |
1090 | #endif | 1090 | #endif |
1091 | 1091 | ||
1092 | memset(opts, 0, sizeof(*opts)); | 1092 | memset(opts, 0, sizeof(*opts)); |
1093 | 1093 | ||
1094 | while ((token = strsep(&o, ",")) != NULL) { | 1094 | while ((token = strsep(&o, ",")) != NULL) { |
1095 | if (!*token) | 1095 | if (!*token) |
1096 | return -EINVAL; | 1096 | return -EINVAL; |
1097 | if (!strcmp(token, "none")) { | 1097 | if (!strcmp(token, "none")) { |
1098 | /* Explicitly have no subsystems */ | 1098 | /* Explicitly have no subsystems */ |
1099 | opts->none = true; | 1099 | opts->none = true; |
1100 | continue; | 1100 | continue; |
1101 | } | 1101 | } |
1102 | if (!strcmp(token, "all")) { | 1102 | if (!strcmp(token, "all")) { |
1103 | /* Mutually exclusive option 'all' + subsystem name */ | 1103 | /* Mutually exclusive option 'all' + subsystem name */ |
1104 | if (one_ss) | 1104 | if (one_ss) |
1105 | return -EINVAL; | 1105 | return -EINVAL; |
1106 | all_ss = true; | 1106 | all_ss = true; |
1107 | continue; | 1107 | continue; |
1108 | } | 1108 | } |
1109 | if (!strcmp(token, "noprefix")) { | 1109 | if (!strcmp(token, "noprefix")) { |
1110 | set_bit(ROOT_NOPREFIX, &opts->flags); | 1110 | set_bit(ROOT_NOPREFIX, &opts->flags); |
1111 | continue; | 1111 | continue; |
1112 | } | 1112 | } |
1113 | if (!strcmp(token, "clone_children")) { | 1113 | if (!strcmp(token, "clone_children")) { |
1114 | opts->clone_children = true; | 1114 | opts->clone_children = true; |
1115 | continue; | 1115 | continue; |
1116 | } | 1116 | } |
1117 | if (!strncmp(token, "release_agent=", 14)) { | 1117 | if (!strncmp(token, "release_agent=", 14)) { |
1118 | /* Specifying two release agents is forbidden */ | 1118 | /* Specifying two release agents is forbidden */ |
1119 | if (opts->release_agent) | 1119 | if (opts->release_agent) |
1120 | return -EINVAL; | 1120 | return -EINVAL; |
1121 | opts->release_agent = | 1121 | opts->release_agent = |
1122 | kstrndup(token + 14, PATH_MAX - 1, GFP_KERNEL); | 1122 | kstrndup(token + 14, PATH_MAX - 1, GFP_KERNEL); |
1123 | if (!opts->release_agent) | 1123 | if (!opts->release_agent) |
1124 | return -ENOMEM; | 1124 | return -ENOMEM; |
1125 | continue; | 1125 | continue; |
1126 | } | 1126 | } |
1127 | if (!strncmp(token, "name=", 5)) { | 1127 | if (!strncmp(token, "name=", 5)) { |
1128 | const char *name = token + 5; | 1128 | const char *name = token + 5; |
1129 | /* Can't specify an empty name */ | 1129 | /* Can't specify an empty name */ |
1130 | if (!strlen(name)) | 1130 | if (!strlen(name)) |
1131 | return -EINVAL; | 1131 | return -EINVAL; |
1132 | /* Must match [\w.-]+ */ | 1132 | /* Must match [\w.-]+ */ |
1133 | for (i = 0; i < strlen(name); i++) { | 1133 | for (i = 0; i < strlen(name); i++) { |
1134 | char c = name[i]; | 1134 | char c = name[i]; |
1135 | if (isalnum(c)) | 1135 | if (isalnum(c)) |
1136 | continue; | 1136 | continue; |
1137 | if ((c == '.') || (c == '-') || (c == '_')) | 1137 | if ((c == '.') || (c == '-') || (c == '_')) |
1138 | continue; | 1138 | continue; |
1139 | return -EINVAL; | 1139 | return -EINVAL; |
1140 | } | 1140 | } |
1141 | /* Specifying two names is forbidden */ | 1141 | /* Specifying two names is forbidden */ |
1142 | if (opts->name) | 1142 | if (opts->name) |
1143 | return -EINVAL; | 1143 | return -EINVAL; |
1144 | opts->name = kstrndup(name, | 1144 | opts->name = kstrndup(name, |
1145 | MAX_CGROUP_ROOT_NAMELEN - 1, | 1145 | MAX_CGROUP_ROOT_NAMELEN - 1, |
1146 | GFP_KERNEL); | 1146 | GFP_KERNEL); |
1147 | if (!opts->name) | 1147 | if (!opts->name) |
1148 | return -ENOMEM; | 1148 | return -ENOMEM; |
1149 | 1149 | ||
1150 | continue; | 1150 | continue; |
1151 | } | 1151 | } |
1152 | 1152 | ||
1153 | for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { | 1153 | for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { |
1154 | struct cgroup_subsys *ss = subsys[i]; | 1154 | struct cgroup_subsys *ss = subsys[i]; |
1155 | if (ss == NULL) | 1155 | if (ss == NULL) |
1156 | continue; | 1156 | continue; |
1157 | if (strcmp(token, ss->name)) | 1157 | if (strcmp(token, ss->name)) |
1158 | continue; | 1158 | continue; |
1159 | if (ss->disabled) | 1159 | if (ss->disabled) |
1160 | continue; | 1160 | continue; |
1161 | 1161 | ||
1162 | /* Mutually exclusive option 'all' + subsystem name */ | 1162 | /* Mutually exclusive option 'all' + subsystem name */ |
1163 | if (all_ss) | 1163 | if (all_ss) |
1164 | return -EINVAL; | 1164 | return -EINVAL; |
1165 | set_bit(i, &opts->subsys_bits); | 1165 | set_bit(i, &opts->subsys_bits); |
1166 | one_ss = true; | 1166 | one_ss = true; |
1167 | 1167 | ||
1168 | break; | 1168 | break; |
1169 | } | 1169 | } |
1170 | if (i == CGROUP_SUBSYS_COUNT) | 1170 | if (i == CGROUP_SUBSYS_COUNT) |
1171 | return -ENOENT; | 1171 | return -ENOENT; |
1172 | } | 1172 | } |
1173 | 1173 | ||
1174 | /* | 1174 | /* |
1175 | * If the 'all' option was specified select all the subsystems, | 1175 | * If the 'all' option was specified select all the subsystems, |
1176 | * otherwise if 'all', 'none' and subsystem name options were not | 1176 | * otherwise if 'all', 'none' and subsystem name options were not |
1177 | * specified, let's default to 'all' | 1177 | * specified, let's default to 'all' |
1178 | */ | 1178 | */ |
1179 | if (all_ss || (!all_ss && !one_ss && !opts->none)) { | 1179 | if (all_ss || (!all_ss && !one_ss && !opts->none)) { |
1180 | for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { | 1180 | for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { |
1181 | struct cgroup_subsys *ss = subsys[i]; | 1181 | struct cgroup_subsys *ss = subsys[i]; |
1182 | if (ss == NULL) | 1182 | if (ss == NULL) |
1183 | continue; | 1183 | continue; |
1184 | if (ss->disabled) | 1184 | if (ss->disabled) |
1185 | continue; | 1185 | continue; |
1186 | set_bit(i, &opts->subsys_bits); | 1186 | set_bit(i, &opts->subsys_bits); |
1187 | } | 1187 | } |
1188 | } | 1188 | } |
1189 | 1189 | ||
1190 | /* Consistency checks */ | 1190 | /* Consistency checks */ |
1191 | 1191 | ||
1192 | /* | 1192 | /* |
1193 | * Option noprefix was introduced just for backward compatibility | 1193 | * Option noprefix was introduced just for backward compatibility |
1194 | * with the old cpuset, so we allow noprefix only if mounting just | 1194 | * with the old cpuset, so we allow noprefix only if mounting just |
1195 | * the cpuset subsystem. | 1195 | * the cpuset subsystem. |
1196 | */ | 1196 | */ |
1197 | if (test_bit(ROOT_NOPREFIX, &opts->flags) && | 1197 | if (test_bit(ROOT_NOPREFIX, &opts->flags) && |
1198 | (opts->subsys_bits & mask)) | 1198 | (opts->subsys_bits & mask)) |
1199 | return -EINVAL; | 1199 | return -EINVAL; |
1200 | 1200 | ||
1201 | 1201 | ||
1202 | /* Can't specify "none" and some subsystems */ | 1202 | /* Can't specify "none" and some subsystems */ |
1203 | if (opts->subsys_bits && opts->none) | 1203 | if (opts->subsys_bits && opts->none) |
1204 | return -EINVAL; | 1204 | return -EINVAL; |
1205 | 1205 | ||
1206 | /* | 1206 | /* |
1207 | * We either have to specify by name or by subsystems. (So all | 1207 | * We either have to specify by name or by subsystems. (So all |
1208 | * empty hierarchies must have a name). | 1208 | * empty hierarchies must have a name). |
1209 | */ | 1209 | */ |
1210 | if (!opts->subsys_bits && !opts->name) | 1210 | if (!opts->subsys_bits && !opts->name) |
1211 | return -EINVAL; | 1211 | return -EINVAL; |
1212 | 1212 | ||
1213 | /* | 1213 | /* |
1214 | * Grab references on all the modules we'll need, so the subsystems | 1214 | * Grab references on all the modules we'll need, so the subsystems |
1215 | * don't dance around before rebind_subsystems attaches them. This may | 1215 | * don't dance around before rebind_subsystems attaches them. This may |
1216 | * take duplicate reference counts on a subsystem that's already used, | 1216 | * take duplicate reference counts on a subsystem that's already used, |
1217 | * but rebind_subsystems handles this case. | 1217 | * but rebind_subsystems handles this case. |
1218 | */ | 1218 | */ |
1219 | for (i = CGROUP_BUILTIN_SUBSYS_COUNT; i < CGROUP_SUBSYS_COUNT; i++) { | 1219 | for (i = CGROUP_BUILTIN_SUBSYS_COUNT; i < CGROUP_SUBSYS_COUNT; i++) { |
1220 | unsigned long bit = 1UL << i; | 1220 | unsigned long bit = 1UL << i; |
1221 | 1221 | ||
1222 | if (!(bit & opts->subsys_bits)) | 1222 | if (!(bit & opts->subsys_bits)) |
1223 | continue; | 1223 | continue; |
1224 | if (!try_module_get(subsys[i]->module)) { | 1224 | if (!try_module_get(subsys[i]->module)) { |
1225 | module_pin_failed = true; | 1225 | module_pin_failed = true; |
1226 | break; | 1226 | break; |
1227 | } | 1227 | } |
1228 | } | 1228 | } |
1229 | if (module_pin_failed) { | 1229 | if (module_pin_failed) { |
1230 | /* | 1230 | /* |
1231 | * oops, one of the modules was going away. this means that we | 1231 | * oops, one of the modules was going away. this means that we |
1232 | * raced with a module_delete call, and to the user this is | 1232 | * raced with a module_delete call, and to the user this is |
1233 | * essentially a "subsystem doesn't exist" case. | 1233 | * essentially a "subsystem doesn't exist" case. |
1234 | */ | 1234 | */ |
1235 | for (i--; i >= CGROUP_BUILTIN_SUBSYS_COUNT; i--) { | 1235 | for (i--; i >= CGROUP_BUILTIN_SUBSYS_COUNT; i--) { |
1236 | /* drop refcounts only on the ones we took */ | 1236 | /* drop refcounts only on the ones we took */ |
1237 | unsigned long bit = 1UL << i; | 1237 | unsigned long bit = 1UL << i; |
1238 | 1238 | ||
1239 | if (!(bit & opts->subsys_bits)) | 1239 | if (!(bit & opts->subsys_bits)) |
1240 | continue; | 1240 | continue; |
1241 | module_put(subsys[i]->module); | 1241 | module_put(subsys[i]->module); |
1242 | } | 1242 | } |
1243 | return -ENOENT; | 1243 | return -ENOENT; |
1244 | } | 1244 | } |
1245 | 1245 | ||
1246 | return 0; | 1246 | return 0; |
1247 | } | 1247 | } |
1248 | 1248 | ||
1249 | static void drop_parsed_module_refcounts(unsigned long subsys_bits) | 1249 | static void drop_parsed_module_refcounts(unsigned long subsys_bits) |
1250 | { | 1250 | { |
1251 | int i; | 1251 | int i; |
1252 | for (i = CGROUP_BUILTIN_SUBSYS_COUNT; i < CGROUP_SUBSYS_COUNT; i++) { | 1252 | for (i = CGROUP_BUILTIN_SUBSYS_COUNT; i < CGROUP_SUBSYS_COUNT; i++) { |
1253 | unsigned long bit = 1UL << i; | 1253 | unsigned long bit = 1UL << i; |
1254 | 1254 | ||
1255 | if (!(bit & subsys_bits)) | 1255 | if (!(bit & subsys_bits)) |
1256 | continue; | 1256 | continue; |
1257 | module_put(subsys[i]->module); | 1257 | module_put(subsys[i]->module); |
1258 | } | 1258 | } |
1259 | } | 1259 | } |
1260 | 1260 | ||
1261 | static int cgroup_remount(struct super_block *sb, int *flags, char *data) | 1261 | static int cgroup_remount(struct super_block *sb, int *flags, char *data) |
1262 | { | 1262 | { |
1263 | int ret = 0; | 1263 | int ret = 0; |
1264 | struct cgroupfs_root *root = sb->s_fs_info; | 1264 | struct cgroupfs_root *root = sb->s_fs_info; |
1265 | struct cgroup *cgrp = &root->top_cgroup; | 1265 | struct cgroup *cgrp = &root->top_cgroup; |
1266 | struct cgroup_sb_opts opts; | 1266 | struct cgroup_sb_opts opts; |
1267 | 1267 | ||
1268 | mutex_lock(&cgrp->dentry->d_inode->i_mutex); | 1268 | mutex_lock(&cgrp->dentry->d_inode->i_mutex); |
1269 | mutex_lock(&cgroup_mutex); | 1269 | mutex_lock(&cgroup_mutex); |
1270 | 1270 | ||
1271 | /* See what subsystems are wanted */ | 1271 | /* See what subsystems are wanted */ |
1272 | ret = parse_cgroupfs_options(data, &opts); | 1272 | ret = parse_cgroupfs_options(data, &opts); |
1273 | if (ret) | 1273 | if (ret) |
1274 | goto out_unlock; | 1274 | goto out_unlock; |
1275 | 1275 | ||
1276 | /* Don't allow flags or name to change at remount */ | 1276 | /* Don't allow flags or name to change at remount */ |
1277 | if (opts.flags != root->flags || | 1277 | if (opts.flags != root->flags || |
1278 | (opts.name && strcmp(opts.name, root->name))) { | 1278 | (opts.name && strcmp(opts.name, root->name))) { |
1279 | ret = -EINVAL; | 1279 | ret = -EINVAL; |
1280 | drop_parsed_module_refcounts(opts.subsys_bits); | 1280 | drop_parsed_module_refcounts(opts.subsys_bits); |
1281 | goto out_unlock; | 1281 | goto out_unlock; |
1282 | } | 1282 | } |
1283 | 1283 | ||
1284 | ret = rebind_subsystems(root, opts.subsys_bits); | 1284 | ret = rebind_subsystems(root, opts.subsys_bits); |
1285 | if (ret) { | 1285 | if (ret) { |
1286 | drop_parsed_module_refcounts(opts.subsys_bits); | 1286 | drop_parsed_module_refcounts(opts.subsys_bits); |
1287 | goto out_unlock; | 1287 | goto out_unlock; |
1288 | } | 1288 | } |
1289 | 1289 | ||
1290 | /* (re)populate subsystem files */ | 1290 | /* (re)populate subsystem files */ |
1291 | cgroup_populate_dir(cgrp); | 1291 | cgroup_populate_dir(cgrp); |
1292 | 1292 | ||
1293 | if (opts.release_agent) | 1293 | if (opts.release_agent) |
1294 | strcpy(root->release_agent_path, opts.release_agent); | 1294 | strcpy(root->release_agent_path, opts.release_agent); |
1295 | out_unlock: | 1295 | out_unlock: |
1296 | kfree(opts.release_agent); | 1296 | kfree(opts.release_agent); |
1297 | kfree(opts.name); | 1297 | kfree(opts.name); |
1298 | mutex_unlock(&cgroup_mutex); | 1298 | mutex_unlock(&cgroup_mutex); |
1299 | mutex_unlock(&cgrp->dentry->d_inode->i_mutex); | 1299 | mutex_unlock(&cgrp->dentry->d_inode->i_mutex); |
1300 | return ret; | 1300 | return ret; |
1301 | } | 1301 | } |
1302 | 1302 | ||
1303 | static const struct super_operations cgroup_ops = { | 1303 | static const struct super_operations cgroup_ops = { |
1304 | .statfs = simple_statfs, | 1304 | .statfs = simple_statfs, |
1305 | .drop_inode = generic_delete_inode, | 1305 | .drop_inode = generic_delete_inode, |
1306 | .show_options = cgroup_show_options, | 1306 | .show_options = cgroup_show_options, |
1307 | .remount_fs = cgroup_remount, | 1307 | .remount_fs = cgroup_remount, |
1308 | }; | 1308 | }; |
1309 | 1309 | ||
1310 | static void init_cgroup_housekeeping(struct cgroup *cgrp) | 1310 | static void init_cgroup_housekeeping(struct cgroup *cgrp) |
1311 | { | 1311 | { |
1312 | INIT_LIST_HEAD(&cgrp->sibling); | 1312 | INIT_LIST_HEAD(&cgrp->sibling); |
1313 | INIT_LIST_HEAD(&cgrp->children); | 1313 | INIT_LIST_HEAD(&cgrp->children); |
1314 | INIT_LIST_HEAD(&cgrp->css_sets); | 1314 | INIT_LIST_HEAD(&cgrp->css_sets); |
1315 | INIT_LIST_HEAD(&cgrp->release_list); | 1315 | INIT_LIST_HEAD(&cgrp->release_list); |
1316 | INIT_LIST_HEAD(&cgrp->pidlists); | 1316 | INIT_LIST_HEAD(&cgrp->pidlists); |
1317 | mutex_init(&cgrp->pidlist_mutex); | 1317 | mutex_init(&cgrp->pidlist_mutex); |
1318 | INIT_LIST_HEAD(&cgrp->event_list); | 1318 | INIT_LIST_HEAD(&cgrp->event_list); |
1319 | spin_lock_init(&cgrp->event_list_lock); | 1319 | spin_lock_init(&cgrp->event_list_lock); |
1320 | } | 1320 | } |
1321 | 1321 | ||
1322 | static void init_cgroup_root(struct cgroupfs_root *root) | 1322 | static void init_cgroup_root(struct cgroupfs_root *root) |
1323 | { | 1323 | { |
1324 | struct cgroup *cgrp = &root->top_cgroup; | 1324 | struct cgroup *cgrp = &root->top_cgroup; |
1325 | INIT_LIST_HEAD(&root->subsys_list); | 1325 | INIT_LIST_HEAD(&root->subsys_list); |
1326 | INIT_LIST_HEAD(&root->root_list); | 1326 | INIT_LIST_HEAD(&root->root_list); |
1327 | root->number_of_cgroups = 1; | 1327 | root->number_of_cgroups = 1; |
1328 | cgrp->root = root; | 1328 | cgrp->root = root; |
1329 | cgrp->top_cgroup = cgrp; | 1329 | cgrp->top_cgroup = cgrp; |
1330 | init_cgroup_housekeeping(cgrp); | 1330 | init_cgroup_housekeeping(cgrp); |
1331 | } | 1331 | } |
1332 | 1332 | ||
1333 | static bool init_root_id(struct cgroupfs_root *root) | 1333 | static bool init_root_id(struct cgroupfs_root *root) |
1334 | { | 1334 | { |
1335 | int ret = 0; | 1335 | int ret = 0; |
1336 | 1336 | ||
1337 | do { | 1337 | do { |
1338 | if (!ida_pre_get(&hierarchy_ida, GFP_KERNEL)) | 1338 | if (!ida_pre_get(&hierarchy_ida, GFP_KERNEL)) |
1339 | return false; | 1339 | return false; |
1340 | spin_lock(&hierarchy_id_lock); | 1340 | spin_lock(&hierarchy_id_lock); |
1341 | /* Try to allocate the next unused ID */ | 1341 | /* Try to allocate the next unused ID */ |
1342 | ret = ida_get_new_above(&hierarchy_ida, next_hierarchy_id, | 1342 | ret = ida_get_new_above(&hierarchy_ida, next_hierarchy_id, |
1343 | &root->hierarchy_id); | 1343 | &root->hierarchy_id); |
1344 | if (ret == -ENOSPC) | 1344 | if (ret == -ENOSPC) |
1345 | /* Try again starting from 0 */ | 1345 | /* Try again starting from 0 */ |
1346 | ret = ida_get_new(&hierarchy_ida, &root->hierarchy_id); | 1346 | ret = ida_get_new(&hierarchy_ida, &root->hierarchy_id); |
1347 | if (!ret) { | 1347 | if (!ret) { |
1348 | next_hierarchy_id = root->hierarchy_id + 1; | 1348 | next_hierarchy_id = root->hierarchy_id + 1; |
1349 | } else if (ret != -EAGAIN) { | 1349 | } else if (ret != -EAGAIN) { |
1350 | /* Can only get here if the 31-bit IDR is full ... */ | 1350 | /* Can only get here if the 31-bit IDR is full ... */ |
1351 | BUG_ON(ret); | 1351 | BUG_ON(ret); |
1352 | } | 1352 | } |
1353 | spin_unlock(&hierarchy_id_lock); | 1353 | spin_unlock(&hierarchy_id_lock); |
1354 | } while (ret); | 1354 | } while (ret); |
1355 | return true; | 1355 | return true; |
1356 | } | 1356 | } |
1357 | 1357 | ||
1358 | static int cgroup_test_super(struct super_block *sb, void *data) | 1358 | static int cgroup_test_super(struct super_block *sb, void *data) |
1359 | { | 1359 | { |
1360 | struct cgroup_sb_opts *opts = data; | 1360 | struct cgroup_sb_opts *opts = data; |
1361 | struct cgroupfs_root *root = sb->s_fs_info; | 1361 | struct cgroupfs_root *root = sb->s_fs_info; |
1362 | 1362 | ||
1363 | /* If we asked for a name then it must match */ | 1363 | /* If we asked for a name then it must match */ |
1364 | if (opts->name && strcmp(opts->name, root->name)) | 1364 | if (opts->name && strcmp(opts->name, root->name)) |
1365 | return 0; | 1365 | return 0; |
1366 | 1366 | ||
1367 | /* | 1367 | /* |
1368 | * If we asked for subsystems (or explicitly for no | 1368 | * If we asked for subsystems (or explicitly for no |
1369 | * subsystems) then they must match | 1369 | * subsystems) then they must match |
1370 | */ | 1370 | */ |
1371 | if ((opts->subsys_bits || opts->none) | 1371 | if ((opts->subsys_bits || opts->none) |
1372 | && (opts->subsys_bits != root->subsys_bits)) | 1372 | && (opts->subsys_bits != root->subsys_bits)) |
1373 | return 0; | 1373 | return 0; |
1374 | 1374 | ||
1375 | return 1; | 1375 | return 1; |
1376 | } | 1376 | } |
1377 | 1377 | ||
1378 | static struct cgroupfs_root *cgroup_root_from_opts(struct cgroup_sb_opts *opts) | 1378 | static struct cgroupfs_root *cgroup_root_from_opts(struct cgroup_sb_opts *opts) |
1379 | { | 1379 | { |
1380 | struct cgroupfs_root *root; | 1380 | struct cgroupfs_root *root; |
1381 | 1381 | ||
1382 | if (!opts->subsys_bits && !opts->none) | 1382 | if (!opts->subsys_bits && !opts->none) |
1383 | return NULL; | 1383 | return NULL; |
1384 | 1384 | ||
1385 | root = kzalloc(sizeof(*root), GFP_KERNEL); | 1385 | root = kzalloc(sizeof(*root), GFP_KERNEL); |
1386 | if (!root) | 1386 | if (!root) |
1387 | return ERR_PTR(-ENOMEM); | 1387 | return ERR_PTR(-ENOMEM); |
1388 | 1388 | ||
1389 | if (!init_root_id(root)) { | 1389 | if (!init_root_id(root)) { |
1390 | kfree(root); | 1390 | kfree(root); |
1391 | return ERR_PTR(-ENOMEM); | 1391 | return ERR_PTR(-ENOMEM); |
1392 | } | 1392 | } |
1393 | init_cgroup_root(root); | 1393 | init_cgroup_root(root); |
1394 | 1394 | ||
1395 | root->subsys_bits = opts->subsys_bits; | 1395 | root->subsys_bits = opts->subsys_bits; |
1396 | root->flags = opts->flags; | 1396 | root->flags = opts->flags; |
1397 | if (opts->release_agent) | 1397 | if (opts->release_agent) |
1398 | strcpy(root->release_agent_path, opts->release_agent); | 1398 | strcpy(root->release_agent_path, opts->release_agent); |
1399 | if (opts->name) | 1399 | if (opts->name) |
1400 | strcpy(root->name, opts->name); | 1400 | strcpy(root->name, opts->name); |
1401 | if (opts->clone_children) | 1401 | if (opts->clone_children) |
1402 | set_bit(CGRP_CLONE_CHILDREN, &root->top_cgroup.flags); | 1402 | set_bit(CGRP_CLONE_CHILDREN, &root->top_cgroup.flags); |
1403 | return root; | 1403 | return root; |
1404 | } | 1404 | } |
1405 | 1405 | ||
1406 | static void cgroup_drop_root(struct cgroupfs_root *root) | 1406 | static void cgroup_drop_root(struct cgroupfs_root *root) |
1407 | { | 1407 | { |
1408 | if (!root) | 1408 | if (!root) |
1409 | return; | 1409 | return; |
1410 | 1410 | ||
1411 | BUG_ON(!root->hierarchy_id); | 1411 | BUG_ON(!root->hierarchy_id); |
1412 | spin_lock(&hierarchy_id_lock); | 1412 | spin_lock(&hierarchy_id_lock); |
1413 | ida_remove(&hierarchy_ida, root->hierarchy_id); | 1413 | ida_remove(&hierarchy_ida, root->hierarchy_id); |
1414 | spin_unlock(&hierarchy_id_lock); | 1414 | spin_unlock(&hierarchy_id_lock); |
1415 | kfree(root); | 1415 | kfree(root); |
1416 | } | 1416 | } |
1417 | 1417 | ||
1418 | static int cgroup_set_super(struct super_block *sb, void *data) | 1418 | static int cgroup_set_super(struct super_block *sb, void *data) |
1419 | { | 1419 | { |
1420 | int ret; | 1420 | int ret; |
1421 | struct cgroup_sb_opts *opts = data; | 1421 | struct cgroup_sb_opts *opts = data; |
1422 | 1422 | ||
1423 | /* If we don't have a new root, we can't set up a new sb */ | 1423 | /* If we don't have a new root, we can't set up a new sb */ |
1424 | if (!opts->new_root) | 1424 | if (!opts->new_root) |
1425 | return -EINVAL; | 1425 | return -EINVAL; |
1426 | 1426 | ||
1427 | BUG_ON(!opts->subsys_bits && !opts->none); | 1427 | BUG_ON(!opts->subsys_bits && !opts->none); |
1428 | 1428 | ||
1429 | ret = set_anon_super(sb, NULL); | 1429 | ret = set_anon_super(sb, NULL); |
1430 | if (ret) | 1430 | if (ret) |
1431 | return ret; | 1431 | return ret; |
1432 | 1432 | ||
1433 | sb->s_fs_info = opts->new_root; | 1433 | sb->s_fs_info = opts->new_root; |
1434 | opts->new_root->sb = sb; | 1434 | opts->new_root->sb = sb; |
1435 | 1435 | ||
1436 | sb->s_blocksize = PAGE_CACHE_SIZE; | 1436 | sb->s_blocksize = PAGE_CACHE_SIZE; |
1437 | sb->s_blocksize_bits = PAGE_CACHE_SHIFT; | 1437 | sb->s_blocksize_bits = PAGE_CACHE_SHIFT; |
1438 | sb->s_magic = CGROUP_SUPER_MAGIC; | 1438 | sb->s_magic = CGROUP_SUPER_MAGIC; |
1439 | sb->s_op = &cgroup_ops; | 1439 | sb->s_op = &cgroup_ops; |
1440 | 1440 | ||
1441 | return 0; | 1441 | return 0; |
1442 | } | 1442 | } |
1443 | 1443 | ||
1444 | static int cgroup_get_rootdir(struct super_block *sb) | 1444 | static int cgroup_get_rootdir(struct super_block *sb) |
1445 | { | 1445 | { |
1446 | static const struct dentry_operations cgroup_dops = { | 1446 | static const struct dentry_operations cgroup_dops = { |
1447 | .d_iput = cgroup_diput, | 1447 | .d_iput = cgroup_diput, |
1448 | .d_delete = cgroup_delete, | 1448 | .d_delete = cgroup_delete, |
1449 | }; | 1449 | }; |
1450 | 1450 | ||
1451 | struct inode *inode = | 1451 | struct inode *inode = |
1452 | cgroup_new_inode(S_IFDIR | S_IRUGO | S_IXUGO | S_IWUSR, sb); | 1452 | cgroup_new_inode(S_IFDIR | S_IRUGO | S_IXUGO | S_IWUSR, sb); |
1453 | struct dentry *dentry; | 1453 | struct dentry *dentry; |
1454 | 1454 | ||
1455 | if (!inode) | 1455 | if (!inode) |
1456 | return -ENOMEM; | 1456 | return -ENOMEM; |
1457 | 1457 | ||
1458 | inode->i_fop = &simple_dir_operations; | 1458 | inode->i_fop = &simple_dir_operations; |
1459 | inode->i_op = &cgroup_dir_inode_operations; | 1459 | inode->i_op = &cgroup_dir_inode_operations; |
1460 | /* directories start off with i_nlink == 2 (for "." entry) */ | 1460 | /* directories start off with i_nlink == 2 (for "." entry) */ |
1461 | inc_nlink(inode); | 1461 | inc_nlink(inode); |
1462 | dentry = d_alloc_root(inode); | 1462 | dentry = d_alloc_root(inode); |
1463 | if (!dentry) { | 1463 | if (!dentry) { |
1464 | iput(inode); | 1464 | iput(inode); |
1465 | return -ENOMEM; | 1465 | return -ENOMEM; |
1466 | } | 1466 | } |
1467 | sb->s_root = dentry; | 1467 | sb->s_root = dentry; |
1468 | /* for everything else we want ->d_op set */ | 1468 | /* for everything else we want ->d_op set */ |
1469 | sb->s_d_op = &cgroup_dops; | 1469 | sb->s_d_op = &cgroup_dops; |
1470 | return 0; | 1470 | return 0; |
1471 | } | 1471 | } |
1472 | 1472 | ||
1473 | static struct dentry *cgroup_mount(struct file_system_type *fs_type, | 1473 | static struct dentry *cgroup_mount(struct file_system_type *fs_type, |
1474 | int flags, const char *unused_dev_name, | 1474 | int flags, const char *unused_dev_name, |
1475 | void *data) | 1475 | void *data) |
1476 | { | 1476 | { |
1477 | struct cgroup_sb_opts opts; | 1477 | struct cgroup_sb_opts opts; |
1478 | struct cgroupfs_root *root; | 1478 | struct cgroupfs_root *root; |
1479 | int ret = 0; | 1479 | int ret = 0; |
1480 | struct super_block *sb; | 1480 | struct super_block *sb; |
1481 | struct cgroupfs_root *new_root; | 1481 | struct cgroupfs_root *new_root; |
1482 | 1482 | ||
1483 | /* First find the desired set of subsystems */ | 1483 | /* First find the desired set of subsystems */ |
1484 | mutex_lock(&cgroup_mutex); | 1484 | mutex_lock(&cgroup_mutex); |
1485 | ret = parse_cgroupfs_options(data, &opts); | 1485 | ret = parse_cgroupfs_options(data, &opts); |
1486 | mutex_unlock(&cgroup_mutex); | 1486 | mutex_unlock(&cgroup_mutex); |
1487 | if (ret) | 1487 | if (ret) |
1488 | goto out_err; | 1488 | goto out_err; |
1489 | 1489 | ||
1490 | /* | 1490 | /* |
1491 | * Allocate a new cgroup root. We may not need it if we're | 1491 | * Allocate a new cgroup root. We may not need it if we're |
1492 | * reusing an existing hierarchy. | 1492 | * reusing an existing hierarchy. |
1493 | */ | 1493 | */ |
1494 | new_root = cgroup_root_from_opts(&opts); | 1494 | new_root = cgroup_root_from_opts(&opts); |
1495 | if (IS_ERR(new_root)) { | 1495 | if (IS_ERR(new_root)) { |
1496 | ret = PTR_ERR(new_root); | 1496 | ret = PTR_ERR(new_root); |
1497 | goto drop_modules; | 1497 | goto drop_modules; |
1498 | } | 1498 | } |
1499 | opts.new_root = new_root; | 1499 | opts.new_root = new_root; |
1500 | 1500 | ||
1501 | /* Locate an existing or new sb for this hierarchy */ | 1501 | /* Locate an existing or new sb for this hierarchy */ |
1502 | sb = sget(fs_type, cgroup_test_super, cgroup_set_super, &opts); | 1502 | sb = sget(fs_type, cgroup_test_super, cgroup_set_super, &opts); |
1503 | if (IS_ERR(sb)) { | 1503 | if (IS_ERR(sb)) { |
1504 | ret = PTR_ERR(sb); | 1504 | ret = PTR_ERR(sb); |
1505 | cgroup_drop_root(opts.new_root); | 1505 | cgroup_drop_root(opts.new_root); |
1506 | goto drop_modules; | 1506 | goto drop_modules; |
1507 | } | 1507 | } |
1508 | 1508 | ||
1509 | root = sb->s_fs_info; | 1509 | root = sb->s_fs_info; |
1510 | BUG_ON(!root); | 1510 | BUG_ON(!root); |
1511 | if (root == opts.new_root) { | 1511 | if (root == opts.new_root) { |
1512 | /* We used the new root structure, so this is a new hierarchy */ | 1512 | /* We used the new root structure, so this is a new hierarchy */ |
1513 | struct list_head tmp_cg_links; | 1513 | struct list_head tmp_cg_links; |
1514 | struct cgroup *root_cgrp = &root->top_cgroup; | 1514 | struct cgroup *root_cgrp = &root->top_cgroup; |
1515 | struct inode *inode; | 1515 | struct inode *inode; |
1516 | struct cgroupfs_root *existing_root; | 1516 | struct cgroupfs_root *existing_root; |
1517 | int i; | 1517 | int i; |
1518 | 1518 | ||
1519 | BUG_ON(sb->s_root != NULL); | 1519 | BUG_ON(sb->s_root != NULL); |
1520 | 1520 | ||
1521 | ret = cgroup_get_rootdir(sb); | 1521 | ret = cgroup_get_rootdir(sb); |
1522 | if (ret) | 1522 | if (ret) |
1523 | goto drop_new_super; | 1523 | goto drop_new_super; |
1524 | inode = sb->s_root->d_inode; | 1524 | inode = sb->s_root->d_inode; |
1525 | 1525 | ||
1526 | mutex_lock(&inode->i_mutex); | 1526 | mutex_lock(&inode->i_mutex); |
1527 | mutex_lock(&cgroup_mutex); | 1527 | mutex_lock(&cgroup_mutex); |
1528 | 1528 | ||
1529 | if (strlen(root->name)) { | 1529 | if (strlen(root->name)) { |
1530 | /* Check for name clashes with existing mounts */ | 1530 | /* Check for name clashes with existing mounts */ |
1531 | for_each_active_root(existing_root) { | 1531 | for_each_active_root(existing_root) { |
1532 | if (!strcmp(existing_root->name, root->name)) { | 1532 | if (!strcmp(existing_root->name, root->name)) { |
1533 | ret = -EBUSY; | 1533 | ret = -EBUSY; |
1534 | mutex_unlock(&cgroup_mutex); | 1534 | mutex_unlock(&cgroup_mutex); |
1535 | mutex_unlock(&inode->i_mutex); | 1535 | mutex_unlock(&inode->i_mutex); |
1536 | goto drop_new_super; | 1536 | goto drop_new_super; |
1537 | } | 1537 | } |
1538 | } | 1538 | } |
1539 | } | 1539 | } |
1540 | 1540 | ||
1541 | /* | 1541 | /* |
1542 | * We're accessing css_set_count without locking | 1542 | * We're accessing css_set_count without locking |
1543 | * css_set_lock here, but that's OK - it can only be | 1543 | * css_set_lock here, but that's OK - it can only be |
1544 | * increased by someone holding cgroup_lock, and | 1544 | * increased by someone holding cgroup_lock, and |
1545 | * that's us. The worst that can happen is that we | 1545 | * that's us. The worst that can happen is that we |
1546 | * have some link structures left over | 1546 | * have some link structures left over |
1547 | */ | 1547 | */ |
1548 | ret = allocate_cg_links(css_set_count, &tmp_cg_links); | 1548 | ret = allocate_cg_links(css_set_count, &tmp_cg_links); |
1549 | if (ret) { | 1549 | if (ret) { |
1550 | mutex_unlock(&cgroup_mutex); | 1550 | mutex_unlock(&cgroup_mutex); |
1551 | mutex_unlock(&inode->i_mutex); | 1551 | mutex_unlock(&inode->i_mutex); |
1552 | goto drop_new_super; | 1552 | goto drop_new_super; |
1553 | } | 1553 | } |
1554 | 1554 | ||
1555 | ret = rebind_subsystems(root, root->subsys_bits); | 1555 | ret = rebind_subsystems(root, root->subsys_bits); |
1556 | if (ret == -EBUSY) { | 1556 | if (ret == -EBUSY) { |
1557 | mutex_unlock(&cgroup_mutex); | 1557 | mutex_unlock(&cgroup_mutex); |
1558 | mutex_unlock(&inode->i_mutex); | 1558 | mutex_unlock(&inode->i_mutex); |
1559 | free_cg_links(&tmp_cg_links); | 1559 | free_cg_links(&tmp_cg_links); |
1560 | goto drop_new_super; | 1560 | goto drop_new_super; |
1561 | } | 1561 | } |
1562 | /* | 1562 | /* |
1563 | * There must be no failure case after here, since rebinding | 1563 | * There must be no failure case after here, since rebinding |
1564 | * takes care of subsystems' refcounts, which are explicitly | 1564 | * takes care of subsystems' refcounts, which are explicitly |
1565 | * dropped in the failure exit path. | 1565 | * dropped in the failure exit path. |
1566 | */ | 1566 | */ |
1567 | 1567 | ||
1568 | /* EBUSY should be the only error here */ | 1568 | /* EBUSY should be the only error here */ |
1569 | BUG_ON(ret); | 1569 | BUG_ON(ret); |
1570 | 1570 | ||
1571 | list_add(&root->root_list, &roots); | 1571 | list_add(&root->root_list, &roots); |
1572 | root_count++; | 1572 | root_count++; |
1573 | 1573 | ||
1574 | sb->s_root->d_fsdata = root_cgrp; | 1574 | sb->s_root->d_fsdata = root_cgrp; |
1575 | root->top_cgroup.dentry = sb->s_root; | 1575 | root->top_cgroup.dentry = sb->s_root; |
1576 | 1576 | ||
1577 | /* Link the top cgroup in this hierarchy into all | 1577 | /* Link the top cgroup in this hierarchy into all |
1578 | * the css_set objects */ | 1578 | * the css_set objects */ |
1579 | write_lock(&css_set_lock); | 1579 | write_lock(&css_set_lock); |
1580 | for (i = 0; i < CSS_SET_TABLE_SIZE; i++) { | 1580 | for (i = 0; i < CSS_SET_TABLE_SIZE; i++) { |
1581 | struct hlist_head *hhead = &css_set_table[i]; | 1581 | struct hlist_head *hhead = &css_set_table[i]; |
1582 | struct hlist_node *node; | 1582 | struct hlist_node *node; |
1583 | struct css_set *cg; | 1583 | struct css_set *cg; |
1584 | 1584 | ||
1585 | hlist_for_each_entry(cg, node, hhead, hlist) | 1585 | hlist_for_each_entry(cg, node, hhead, hlist) |
1586 | link_css_set(&tmp_cg_links, cg, root_cgrp); | 1586 | link_css_set(&tmp_cg_links, cg, root_cgrp); |
1587 | } | 1587 | } |
1588 | write_unlock(&css_set_lock); | 1588 | write_unlock(&css_set_lock); |
1589 | 1589 | ||
1590 | free_cg_links(&tmp_cg_links); | 1590 | free_cg_links(&tmp_cg_links); |
1591 | 1591 | ||
1592 | BUG_ON(!list_empty(&root_cgrp->sibling)); | 1592 | BUG_ON(!list_empty(&root_cgrp->sibling)); |
1593 | BUG_ON(!list_empty(&root_cgrp->children)); | 1593 | BUG_ON(!list_empty(&root_cgrp->children)); |
1594 | BUG_ON(root->number_of_cgroups != 1); | 1594 | BUG_ON(root->number_of_cgroups != 1); |
1595 | 1595 | ||
1596 | cgroup_populate_dir(root_cgrp); | 1596 | cgroup_populate_dir(root_cgrp); |
1597 | mutex_unlock(&cgroup_mutex); | 1597 | mutex_unlock(&cgroup_mutex); |
1598 | mutex_unlock(&inode->i_mutex); | 1598 | mutex_unlock(&inode->i_mutex); |
1599 | } else { | 1599 | } else { |
1600 | /* | 1600 | /* |
1601 | * We re-used an existing hierarchy - the new root (if | 1601 | * We re-used an existing hierarchy - the new root (if |
1602 | * any) is not needed | 1602 | * any) is not needed |
1603 | */ | 1603 | */ |
1604 | cgroup_drop_root(opts.new_root); | 1604 | cgroup_drop_root(opts.new_root); |
1605 | /* no subsys rebinding, so refcounts don't change */ | 1605 | /* no subsys rebinding, so refcounts don't change */ |
1606 | drop_parsed_module_refcounts(opts.subsys_bits); | 1606 | drop_parsed_module_refcounts(opts.subsys_bits); |
1607 | } | 1607 | } |
1608 | 1608 | ||
1609 | kfree(opts.release_agent); | 1609 | kfree(opts.release_agent); |
1610 | kfree(opts.name); | 1610 | kfree(opts.name); |
1611 | return dget(sb->s_root); | 1611 | return dget(sb->s_root); |
1612 | 1612 | ||
1613 | drop_new_super: | 1613 | drop_new_super: |
1614 | deactivate_locked_super(sb); | 1614 | deactivate_locked_super(sb); |
1615 | drop_modules: | 1615 | drop_modules: |
1616 | drop_parsed_module_refcounts(opts.subsys_bits); | 1616 | drop_parsed_module_refcounts(opts.subsys_bits); |
1617 | out_err: | 1617 | out_err: |
1618 | kfree(opts.release_agent); | 1618 | kfree(opts.release_agent); |
1619 | kfree(opts.name); | 1619 | kfree(opts.name); |
1620 | return ERR_PTR(ret); | 1620 | return ERR_PTR(ret); |
1621 | } | 1621 | } |
1622 | 1622 | ||
1623 | static void cgroup_kill_sb(struct super_block *sb) { | 1623 | static void cgroup_kill_sb(struct super_block *sb) { |
1624 | struct cgroupfs_root *root = sb->s_fs_info; | 1624 | struct cgroupfs_root *root = sb->s_fs_info; |
1625 | struct cgroup *cgrp = &root->top_cgroup; | 1625 | struct cgroup *cgrp = &root->top_cgroup; |
1626 | int ret; | 1626 | int ret; |
1627 | struct cg_cgroup_link *link; | 1627 | struct cg_cgroup_link *link; |
1628 | struct cg_cgroup_link *saved_link; | 1628 | struct cg_cgroup_link *saved_link; |
1629 | 1629 | ||
1630 | BUG_ON(!root); | 1630 | BUG_ON(!root); |
1631 | 1631 | ||
1632 | BUG_ON(root->number_of_cgroups != 1); | 1632 | BUG_ON(root->number_of_cgroups != 1); |
1633 | BUG_ON(!list_empty(&cgrp->children)); | 1633 | BUG_ON(!list_empty(&cgrp->children)); |
1634 | BUG_ON(!list_empty(&cgrp->sibling)); | 1634 | BUG_ON(!list_empty(&cgrp->sibling)); |
1635 | 1635 | ||
1636 | mutex_lock(&cgroup_mutex); | 1636 | mutex_lock(&cgroup_mutex); |
1637 | 1637 | ||
1638 | /* Rebind all subsystems back to the default hierarchy */ | 1638 | /* Rebind all subsystems back to the default hierarchy */ |
1639 | ret = rebind_subsystems(root, 0); | 1639 | ret = rebind_subsystems(root, 0); |
1640 | /* Shouldn't be able to fail ... */ | 1640 | /* Shouldn't be able to fail ... */ |
1641 | BUG_ON(ret); | 1641 | BUG_ON(ret); |
1642 | 1642 | ||
1643 | /* | 1643 | /* |
1644 | * Release all the links from css_sets to this hierarchy's | 1644 | * Release all the links from css_sets to this hierarchy's |
1645 | * root cgroup | 1645 | * root cgroup |
1646 | */ | 1646 | */ |
1647 | write_lock(&css_set_lock); | 1647 | write_lock(&css_set_lock); |
1648 | 1648 | ||
1649 | list_for_each_entry_safe(link, saved_link, &cgrp->css_sets, | 1649 | list_for_each_entry_safe(link, saved_link, &cgrp->css_sets, |
1650 | cgrp_link_list) { | 1650 | cgrp_link_list) { |
1651 | list_del(&link->cg_link_list); | 1651 | list_del(&link->cg_link_list); |
1652 | list_del(&link->cgrp_link_list); | 1652 | list_del(&link->cgrp_link_list); |
1653 | kfree(link); | 1653 | kfree(link); |
1654 | } | 1654 | } |
1655 | write_unlock(&css_set_lock); | 1655 | write_unlock(&css_set_lock); |
1656 | 1656 | ||
1657 | if (!list_empty(&root->root_list)) { | 1657 | if (!list_empty(&root->root_list)) { |
1658 | list_del(&root->root_list); | 1658 | list_del(&root->root_list); |
1659 | root_count--; | 1659 | root_count--; |
1660 | } | 1660 | } |
1661 | 1661 | ||
1662 | mutex_unlock(&cgroup_mutex); | 1662 | mutex_unlock(&cgroup_mutex); |
1663 | 1663 | ||
1664 | kill_litter_super(sb); | 1664 | kill_litter_super(sb); |
1665 | cgroup_drop_root(root); | 1665 | cgroup_drop_root(root); |
1666 | } | 1666 | } |
1667 | 1667 | ||
1668 | static struct file_system_type cgroup_fs_type = { | 1668 | static struct file_system_type cgroup_fs_type = { |
1669 | .name = "cgroup", | 1669 | .name = "cgroup", |
1670 | .mount = cgroup_mount, | 1670 | .mount = cgroup_mount, |
1671 | .kill_sb = cgroup_kill_sb, | 1671 | .kill_sb = cgroup_kill_sb, |
1672 | }; | 1672 | }; |
1673 | 1673 | ||
1674 | static struct kobject *cgroup_kobj; | 1674 | static struct kobject *cgroup_kobj; |
1675 | 1675 | ||
1676 | static inline struct cgroup *__d_cgrp(struct dentry *dentry) | 1676 | static inline struct cgroup *__d_cgrp(struct dentry *dentry) |
1677 | { | 1677 | { |
1678 | return dentry->d_fsdata; | 1678 | return dentry->d_fsdata; |
1679 | } | 1679 | } |
1680 | 1680 | ||
1681 | static inline struct cftype *__d_cft(struct dentry *dentry) | 1681 | static inline struct cftype *__d_cft(struct dentry *dentry) |
1682 | { | 1682 | { |
1683 | return dentry->d_fsdata; | 1683 | return dentry->d_fsdata; |
1684 | } | 1684 | } |
1685 | 1685 | ||
1686 | /** | 1686 | /** |
1687 | * cgroup_path - generate the path of a cgroup | 1687 | * cgroup_path - generate the path of a cgroup |
1688 | * @cgrp: the cgroup in question | 1688 | * @cgrp: the cgroup in question |
1689 | * @buf: the buffer to write the path into | 1689 | * @buf: the buffer to write the path into |
1690 | * @buflen: the length of the buffer | 1690 | * @buflen: the length of the buffer |
1691 | * | 1691 | * |
1692 | * Called with cgroup_mutex held or else with an RCU-protected cgroup | 1692 | * Called with cgroup_mutex held or else with an RCU-protected cgroup |
1693 | * reference. Writes path of cgroup into buf. Returns 0 on success, | 1693 | * reference. Writes path of cgroup into buf. Returns 0 on success, |
1694 | * -errno on error. | 1694 | * -errno on error. |
1695 | */ | 1695 | */ |
1696 | int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen) | 1696 | int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen) |
1697 | { | 1697 | { |
1698 | char *start; | 1698 | char *start; |
1699 | struct dentry *dentry = rcu_dereference_check(cgrp->dentry, | 1699 | struct dentry *dentry = rcu_dereference_check(cgrp->dentry, |
1700 | rcu_read_lock_held() || | ||
1701 | cgroup_lock_is_held()); | 1700 | cgroup_lock_is_held()); |
1702 | 1701 | ||
1703 | if (!dentry || cgrp == dummytop) { | 1702 | if (!dentry || cgrp == dummytop) { |
1704 | /* | 1703 | /* |
1705 | * Inactive subsystems have no dentry for their root | 1704 | * Inactive subsystems have no dentry for their root |
1706 | * cgroup | 1705 | * cgroup |
1707 | */ | 1706 | */ |
1708 | strcpy(buf, "/"); | 1707 | strcpy(buf, "/"); |
1709 | return 0; | 1708 | return 0; |
1710 | } | 1709 | } |
1711 | 1710 | ||
1712 | start = buf + buflen; | 1711 | start = buf + buflen; |
1713 | 1712 | ||
1714 | *--start = '\0'; | 1713 | *--start = '\0'; |
1715 | for (;;) { | 1714 | for (;;) { |
1716 | int len = dentry->d_name.len; | 1715 | int len = dentry->d_name.len; |
1717 | 1716 | ||
1718 | if ((start -= len) < buf) | 1717 | if ((start -= len) < buf) |
1719 | return -ENAMETOOLONG; | 1718 | return -ENAMETOOLONG; |
1720 | memcpy(start, dentry->d_name.name, len); | 1719 | memcpy(start, dentry->d_name.name, len); |
1721 | cgrp = cgrp->parent; | 1720 | cgrp = cgrp->parent; |
1722 | if (!cgrp) | 1721 | if (!cgrp) |
1723 | break; | 1722 | break; |
1724 | 1723 | ||
1725 | dentry = rcu_dereference_check(cgrp->dentry, | 1724 | dentry = rcu_dereference_check(cgrp->dentry, |
1726 | rcu_read_lock_held() || | ||
1727 | cgroup_lock_is_held()); | 1725 | cgroup_lock_is_held()); |
1728 | if (!cgrp->parent) | 1726 | if (!cgrp->parent) |
1729 | continue; | 1727 | continue; |
1730 | if (--start < buf) | 1728 | if (--start < buf) |
1731 | return -ENAMETOOLONG; | 1729 | return -ENAMETOOLONG; |
1732 | *start = '/'; | 1730 | *start = '/'; |
1733 | } | 1731 | } |
1734 | memmove(buf, start, buf + buflen - start); | 1732 | memmove(buf, start, buf + buflen - start); |
1735 | return 0; | 1733 | return 0; |
1736 | } | 1734 | } |
1737 | EXPORT_SYMBOL_GPL(cgroup_path); | 1735 | EXPORT_SYMBOL_GPL(cgroup_path); |
1738 | 1736 | ||
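For reference, the only functional change in cgroup_path() above is the shorter rcu_dereference_check() condition: the explicit rcu_read_lock_held() test is dropped because rcu_dereference_check() already ORs that check into whatever condition the caller supplies. A minimal sketch of the call pattern, lifted from the hunk above (the two forms side by side, not a standalone example):

	/*
	 * Old form -- the caller OR'd in rcu_read_lock_held() by hand:
	 *
	 *	dentry = rcu_dereference_check(cgrp->dentry,
	 *				       rcu_read_lock_held() ||
	 *				       cgroup_lock_is_held());
	 */

	/* New form -- the RCU read-side check is implied, so only the
	 * cgroup-specific lockdep condition needs to be passed. */
	dentry = rcu_dereference_check(cgrp->dentry,
				       cgroup_lock_is_held());

The second rcu_dereference_check() call inside the loop of cgroup_path() is simplified in exactly the same way.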
1739 | /* | 1737 | /* |
1740 | * cgroup_task_migrate - move a task from one cgroup to another. | 1738 | * cgroup_task_migrate - move a task from one cgroup to another. |
1741 | * | 1739 | * |
1742 | * 'guarantee' is set if the caller promises that a new css_set for the task | 1740 | * 'guarantee' is set if the caller promises that a new css_set for the task |
1743 | * will already exist. If not set, this function might sleep, and can fail with | 1741 | * will already exist. If not set, this function might sleep, and can fail with |
1744 | * -ENOMEM. Otherwise, it can only fail with -ESRCH. | 1742 | * -ENOMEM. Otherwise, it can only fail with -ESRCH. |
1745 | */ | 1743 | */ |
1746 | static int cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp, | 1744 | static int cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp, |
1747 | struct task_struct *tsk, bool guarantee) | 1745 | struct task_struct *tsk, bool guarantee) |
1748 | { | 1746 | { |
1749 | struct css_set *oldcg; | 1747 | struct css_set *oldcg; |
1750 | struct css_set *newcg; | 1748 | struct css_set *newcg; |
1751 | 1749 | ||
1752 | /* | 1750 | /* |
1753 | * get old css_set. we need to take task_lock and refcount it, because | 1751 | * get old css_set. we need to take task_lock and refcount it, because |
1754 | * an exiting task can change its css_set to init_css_set and drop its | 1752 | * an exiting task can change its css_set to init_css_set and drop its |
1755 | * old one without taking cgroup_mutex. | 1753 | * old one without taking cgroup_mutex. |
1756 | */ | 1754 | */ |
1757 | task_lock(tsk); | 1755 | task_lock(tsk); |
1758 | oldcg = tsk->cgroups; | 1756 | oldcg = tsk->cgroups; |
1759 | get_css_set(oldcg); | 1757 | get_css_set(oldcg); |
1760 | task_unlock(tsk); | 1758 | task_unlock(tsk); |
1761 | 1759 | ||
1762 | /* locate or allocate a new css_set for this task. */ | 1760 | /* locate or allocate a new css_set for this task. */ |
1763 | if (guarantee) { | 1761 | if (guarantee) { |
1764 | /* we know the css_set we want already exists. */ | 1762 | /* we know the css_set we want already exists. */ |
1765 | struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT]; | 1763 | struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT]; |
1766 | read_lock(&css_set_lock); | 1764 | read_lock(&css_set_lock); |
1767 | newcg = find_existing_css_set(oldcg, cgrp, template); | 1765 | newcg = find_existing_css_set(oldcg, cgrp, template); |
1768 | BUG_ON(!newcg); | 1766 | BUG_ON(!newcg); |
1769 | get_css_set(newcg); | 1767 | get_css_set(newcg); |
1770 | read_unlock(&css_set_lock); | 1768 | read_unlock(&css_set_lock); |
1771 | } else { | 1769 | } else { |
1772 | might_sleep(); | 1770 | might_sleep(); |
1773 | /* find_css_set will give us newcg already referenced. */ | 1771 | /* find_css_set will give us newcg already referenced. */ |
1774 | newcg = find_css_set(oldcg, cgrp); | 1772 | newcg = find_css_set(oldcg, cgrp); |
1775 | if (!newcg) { | 1773 | if (!newcg) { |
1776 | put_css_set(oldcg); | 1774 | put_css_set(oldcg); |
1777 | return -ENOMEM; | 1775 | return -ENOMEM; |
1778 | } | 1776 | } |
1779 | } | 1777 | } |
1780 | put_css_set(oldcg); | 1778 | put_css_set(oldcg); |
1781 | 1779 | ||
1782 | /* if PF_EXITING is set, the tsk->cgroups pointer is no longer safe. */ | 1780 | /* if PF_EXITING is set, the tsk->cgroups pointer is no longer safe. */ |
1783 | task_lock(tsk); | 1781 | task_lock(tsk); |
1784 | if (tsk->flags & PF_EXITING) { | 1782 | if (tsk->flags & PF_EXITING) { |
1785 | task_unlock(tsk); | 1783 | task_unlock(tsk); |
1786 | put_css_set(newcg); | 1784 | put_css_set(newcg); |
1787 | return -ESRCH; | 1785 | return -ESRCH; |
1788 | } | 1786 | } |
1789 | rcu_assign_pointer(tsk->cgroups, newcg); | 1787 | rcu_assign_pointer(tsk->cgroups, newcg); |
1790 | task_unlock(tsk); | 1788 | task_unlock(tsk); |
1791 | 1789 | ||
1792 | /* Update the css_set linked lists if we're using them */ | 1790 | /* Update the css_set linked lists if we're using them */ |
1793 | write_lock(&css_set_lock); | 1791 | write_lock(&css_set_lock); |
1794 | if (!list_empty(&tsk->cg_list)) | 1792 | if (!list_empty(&tsk->cg_list)) |
1795 | list_move(&tsk->cg_list, &newcg->tasks); | 1793 | list_move(&tsk->cg_list, &newcg->tasks); |
1796 | write_unlock(&css_set_lock); | 1794 | write_unlock(&css_set_lock); |
1797 | 1795 | ||
1798 | /* | 1796 | /* |
1799 | * We just gained a reference on oldcg by taking it from the task. As | 1797 | * We just gained a reference on oldcg by taking it from the task. As |
1800 | * trading it for newcg is protected by cgroup_mutex, we're safe to drop | 1798 | * trading it for newcg is protected by cgroup_mutex, we're safe to drop |
1801 | * it here; it will be freed under RCU. | 1799 | * it here; it will be freed under RCU. |
1802 | */ | 1800 | */ |
1803 | put_css_set(oldcg); | 1801 | put_css_set(oldcg); |
1804 | 1802 | ||
1805 | set_bit(CGRP_RELEASABLE, &oldcgrp->flags); | 1803 | set_bit(CGRP_RELEASABLE, &oldcgrp->flags); |
1806 | return 0; | 1804 | return 0; |
1807 | } | 1805 | } |
1808 | 1806 | ||
1809 | /** | 1807 | /** |
1810 | * cgroup_attach_task - attach task 'tsk' to cgroup 'cgrp' | 1808 | * cgroup_attach_task - attach task 'tsk' to cgroup 'cgrp' |
1811 | * @cgrp: the cgroup the task is attaching to | 1809 | * @cgrp: the cgroup the task is attaching to |
1812 | * @tsk: the task to be attached | 1810 | * @tsk: the task to be attached |
1813 | * | 1811 | * |
1814 | * Call holding cgroup_mutex. May take task_lock of | 1812 | * Call holding cgroup_mutex. May take task_lock of |
1815 | * the task 'tsk' during call. | 1813 | * the task 'tsk' during call. |
1816 | */ | 1814 | */ |
1817 | int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) | 1815 | int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) |
1818 | { | 1816 | { |
1819 | int retval; | 1817 | int retval; |
1820 | struct cgroup_subsys *ss, *failed_ss = NULL; | 1818 | struct cgroup_subsys *ss, *failed_ss = NULL; |
1821 | struct cgroup *oldcgrp; | 1819 | struct cgroup *oldcgrp; |
1822 | struct cgroupfs_root *root = cgrp->root; | 1820 | struct cgroupfs_root *root = cgrp->root; |
1823 | 1821 | ||
1824 | /* Nothing to do if the task is already in that cgroup */ | 1822 | /* Nothing to do if the task is already in that cgroup */ |
1825 | oldcgrp = task_cgroup_from_root(tsk, root); | 1823 | oldcgrp = task_cgroup_from_root(tsk, root); |
1826 | if (cgrp == oldcgrp) | 1824 | if (cgrp == oldcgrp) |
1827 | return 0; | 1825 | return 0; |
1828 | 1826 | ||
1829 | for_each_subsys(root, ss) { | 1827 | for_each_subsys(root, ss) { |
1830 | if (ss->can_attach) { | 1828 | if (ss->can_attach) { |
1831 | retval = ss->can_attach(ss, cgrp, tsk); | 1829 | retval = ss->can_attach(ss, cgrp, tsk); |
1832 | if (retval) { | 1830 | if (retval) { |
1833 | /* | 1831 | /* |
1834 | * Remember on which subsystem the can_attach() | 1832 | * Remember on which subsystem the can_attach() |
1835 | * failed, so that we only call cancel_attach() | 1833 | * failed, so that we only call cancel_attach() |
1836 | * against the subsystems whose can_attach() | 1834 | * against the subsystems whose can_attach() |
1837 | * succeeded. (See below) | 1835 | * succeeded. (See below) |
1838 | */ | 1836 | */ |
1839 | failed_ss = ss; | 1837 | failed_ss = ss; |
1840 | goto out; | 1838 | goto out; |
1841 | } | 1839 | } |
1842 | } | 1840 | } |
1843 | if (ss->can_attach_task) { | 1841 | if (ss->can_attach_task) { |
1844 | retval = ss->can_attach_task(cgrp, tsk); | 1842 | retval = ss->can_attach_task(cgrp, tsk); |
1845 | if (retval) { | 1843 | if (retval) { |
1846 | failed_ss = ss; | 1844 | failed_ss = ss; |
1847 | goto out; | 1845 | goto out; |
1848 | } | 1846 | } |
1849 | } | 1847 | } |
1850 | } | 1848 | } |
1851 | 1849 | ||
1852 | retval = cgroup_task_migrate(cgrp, oldcgrp, tsk, false); | 1850 | retval = cgroup_task_migrate(cgrp, oldcgrp, tsk, false); |
1853 | if (retval) | 1851 | if (retval) |
1854 | goto out; | 1852 | goto out; |
1855 | 1853 | ||
1856 | for_each_subsys(root, ss) { | 1854 | for_each_subsys(root, ss) { |
1857 | if (ss->pre_attach) | 1855 | if (ss->pre_attach) |
1858 | ss->pre_attach(cgrp); | 1856 | ss->pre_attach(cgrp); |
1859 | if (ss->attach_task) | 1857 | if (ss->attach_task) |
1860 | ss->attach_task(cgrp, tsk); | 1858 | ss->attach_task(cgrp, tsk); |
1861 | if (ss->attach) | 1859 | if (ss->attach) |
1862 | ss->attach(ss, cgrp, oldcgrp, tsk); | 1860 | ss->attach(ss, cgrp, oldcgrp, tsk); |
1863 | } | 1861 | } |
1864 | 1862 | ||
1865 | synchronize_rcu(); | 1863 | synchronize_rcu(); |
1866 | 1864 | ||
1867 | /* | 1865 | /* |
1868 | * wake up rmdir() waiter. the rmdir should fail since the cgroup | 1866 | * wake up rmdir() waiter. the rmdir should fail since the cgroup |
1869 | * is no longer empty. | 1867 | * is no longer empty. |
1870 | */ | 1868 | */ |
1871 | cgroup_wakeup_rmdir_waiter(cgrp); | 1869 | cgroup_wakeup_rmdir_waiter(cgrp); |
1872 | out: | 1870 | out: |
1873 | if (retval) { | 1871 | if (retval) { |
1874 | for_each_subsys(root, ss) { | 1872 | for_each_subsys(root, ss) { |
1875 | if (ss == failed_ss) | 1873 | if (ss == failed_ss) |
1876 | /* | 1874 | /* |
1877 | * This subsystem was the one that failed the | 1875 | * This subsystem was the one that failed the |
1878 | * can_attach() check earlier, so we don't need | 1876 | * can_attach() check earlier, so we don't need |
1879 | * to call cancel_attach() against it or any | 1877 | * to call cancel_attach() against it or any |
1880 | * remaining subsystems. | 1878 | * remaining subsystems. |
1881 | */ | 1879 | */ |
1882 | break; | 1880 | break; |
1883 | if (ss->cancel_attach) | 1881 | if (ss->cancel_attach) |
1884 | ss->cancel_attach(ss, cgrp, tsk); | 1882 | ss->cancel_attach(ss, cgrp, tsk); |
1885 | } | 1883 | } |
1886 | } | 1884 | } |
1887 | return retval; | 1885 | return retval; |
1888 | } | 1886 | } |
1889 | 1887 | ||
1890 | /** | 1888 | /** |
1891 | * cgroup_attach_task_all - attach task 'tsk' to all cgroups of task 'from' | 1889 | * cgroup_attach_task_all - attach task 'tsk' to all cgroups of task 'from' |
1892 | * @from: attach to all cgroups of a given task | 1890 | * @from: attach to all cgroups of a given task |
1893 | * @tsk: the task to be attached | 1891 | * @tsk: the task to be attached |
1894 | */ | 1892 | */ |
1895 | int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk) | 1893 | int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk) |
1896 | { | 1894 | { |
1897 | struct cgroupfs_root *root; | 1895 | struct cgroupfs_root *root; |
1898 | int retval = 0; | 1896 | int retval = 0; |
1899 | 1897 | ||
1900 | cgroup_lock(); | 1898 | cgroup_lock(); |
1901 | for_each_active_root(root) { | 1899 | for_each_active_root(root) { |
1902 | struct cgroup *from_cg = task_cgroup_from_root(from, root); | 1900 | struct cgroup *from_cg = task_cgroup_from_root(from, root); |
1903 | 1901 | ||
1904 | retval = cgroup_attach_task(from_cg, tsk); | 1902 | retval = cgroup_attach_task(from_cg, tsk); |
1905 | if (retval) | 1903 | if (retval) |
1906 | break; | 1904 | break; |
1907 | } | 1905 | } |
1908 | cgroup_unlock(); | 1906 | cgroup_unlock(); |
1909 | 1907 | ||
1910 | return retval; | 1908 | return retval; |
1911 | } | 1909 | } |
1912 | EXPORT_SYMBOL_GPL(cgroup_attach_task_all); | 1910 | EXPORT_SYMBOL_GPL(cgroup_attach_task_all); |
1913 | 1911 | ||
1914 | /* | 1912 | /* |
1915 | * cgroup_attach_proc works in two stages, the first of which prefetches all | 1913 | * cgroup_attach_proc works in two stages, the first of which prefetches all |
1916 | * new css_sets needed (to make sure we have enough memory before committing | 1914 | * new css_sets needed (to make sure we have enough memory before committing |
1917 | * to the move) and stores them in a list of entries of the following type. | 1915 | * to the move) and stores them in a list of entries of the following type. |
1918 | * TODO: possible optimization: use css_set->rcu_head for chaining instead | 1916 | * TODO: possible optimization: use css_set->rcu_head for chaining instead |
1919 | */ | 1917 | */ |
1920 | struct cg_list_entry { | 1918 | struct cg_list_entry { |
1921 | struct css_set *cg; | 1919 | struct css_set *cg; |
1922 | struct list_head links; | 1920 | struct list_head links; |
1923 | }; | 1921 | }; |
1924 | 1922 | ||
1925 | static bool css_set_check_fetched(struct cgroup *cgrp, | 1923 | static bool css_set_check_fetched(struct cgroup *cgrp, |
1926 | struct task_struct *tsk, struct css_set *cg, | 1924 | struct task_struct *tsk, struct css_set *cg, |
1927 | struct list_head *newcg_list) | 1925 | struct list_head *newcg_list) |
1928 | { | 1926 | { |
1929 | struct css_set *newcg; | 1927 | struct css_set *newcg; |
1930 | struct cg_list_entry *cg_entry; | 1928 | struct cg_list_entry *cg_entry; |
1931 | struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT]; | 1929 | struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT]; |
1932 | 1930 | ||
1933 | read_lock(&css_set_lock); | 1931 | read_lock(&css_set_lock); |
1934 | newcg = find_existing_css_set(cg, cgrp, template); | 1932 | newcg = find_existing_css_set(cg, cgrp, template); |
1935 | if (newcg) | 1933 | if (newcg) |
1936 | get_css_set(newcg); | 1934 | get_css_set(newcg); |
1937 | read_unlock(&css_set_lock); | 1935 | read_unlock(&css_set_lock); |
1938 | 1936 | ||
1939 | /* doesn't exist at all? */ | 1937 | /* doesn't exist at all? */ |
1940 | if (!newcg) | 1938 | if (!newcg) |
1941 | return false; | 1939 | return false; |
1942 | /* see if it's already in the list */ | 1940 | /* see if it's already in the list */ |
1943 | list_for_each_entry(cg_entry, newcg_list, links) { | 1941 | list_for_each_entry(cg_entry, newcg_list, links) { |
1944 | if (cg_entry->cg == newcg) { | 1942 | if (cg_entry->cg == newcg) { |
1945 | put_css_set(newcg); | 1943 | put_css_set(newcg); |
1946 | return true; | 1944 | return true; |
1947 | } | 1945 | } |
1948 | } | 1946 | } |
1949 | 1947 | ||
1950 | /* not found */ | 1948 | /* not found */ |
1951 | put_css_set(newcg); | 1949 | put_css_set(newcg); |
1952 | return false; | 1950 | return false; |
1953 | } | 1951 | } |
1954 | 1952 | ||
1955 | /* | 1953 | /* |
1956 | * Find the new css_set and store it in the list in preparation for moving the | 1954 | * Find the new css_set and store it in the list in preparation for moving the |
1957 | * given task to the given cgroup. Returns 0 or -ENOMEM. | 1955 | * given task to the given cgroup. Returns 0 or -ENOMEM. |
1958 | */ | 1956 | */ |
1959 | static int css_set_prefetch(struct cgroup *cgrp, struct css_set *cg, | 1957 | static int css_set_prefetch(struct cgroup *cgrp, struct css_set *cg, |
1960 | struct list_head *newcg_list) | 1958 | struct list_head *newcg_list) |
1961 | { | 1959 | { |
1962 | struct css_set *newcg; | 1960 | struct css_set *newcg; |
1963 | struct cg_list_entry *cg_entry; | 1961 | struct cg_list_entry *cg_entry; |
1964 | 1962 | ||
1965 | /* ensure a new css_set will exist for this thread */ | 1963 | /* ensure a new css_set will exist for this thread */ |
1966 | newcg = find_css_set(cg, cgrp); | 1964 | newcg = find_css_set(cg, cgrp); |
1967 | if (!newcg) | 1965 | if (!newcg) |
1968 | return -ENOMEM; | 1966 | return -ENOMEM; |
1969 | /* add it to the list */ | 1967 | /* add it to the list */ |
1970 | cg_entry = kmalloc(sizeof(struct cg_list_entry), GFP_KERNEL); | 1968 | cg_entry = kmalloc(sizeof(struct cg_list_entry), GFP_KERNEL); |
1971 | if (!cg_entry) { | 1969 | if (!cg_entry) { |
1972 | put_css_set(newcg); | 1970 | put_css_set(newcg); |
1973 | return -ENOMEM; | 1971 | return -ENOMEM; |
1974 | } | 1972 | } |
1975 | cg_entry->cg = newcg; | 1973 | cg_entry->cg = newcg; |
1976 | list_add(&cg_entry->links, newcg_list); | 1974 | list_add(&cg_entry->links, newcg_list); |
1977 | return 0; | 1975 | return 0; |
1978 | } | 1976 | } |
1979 | 1977 | ||
1980 | /** | 1978 | /** |
1981 | * cgroup_attach_proc - attach all threads in a threadgroup to a cgroup | 1979 | * cgroup_attach_proc - attach all threads in a threadgroup to a cgroup |
1982 | * @cgrp: the cgroup to attach to | 1980 | * @cgrp: the cgroup to attach to |
1983 | * @leader: the threadgroup leader task_struct of the group to be attached | 1981 | * @leader: the threadgroup leader task_struct of the group to be attached |
1984 | * | 1982 | * |
1985 | * Call holding cgroup_mutex and the threadgroup_fork_lock of the leader. Will | 1983 | * Call holding cgroup_mutex and the threadgroup_fork_lock of the leader. Will |
1986 | * take task_lock of each thread in leader's threadgroup individually in turn. | 1984 | * take task_lock of each thread in leader's threadgroup individually in turn. |
1987 | */ | 1985 | */ |
1988 | int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader) | 1986 | int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader) |
1989 | { | 1987 | { |
1990 | int retval, i, group_size; | 1988 | int retval, i, group_size; |
1991 | struct cgroup_subsys *ss, *failed_ss = NULL; | 1989 | struct cgroup_subsys *ss, *failed_ss = NULL; |
1992 | bool cancel_failed_ss = false; | 1990 | bool cancel_failed_ss = false; |
1993 | /* guaranteed to be initialized later, but the compiler needs this */ | 1991 | /* guaranteed to be initialized later, but the compiler needs this */ |
1994 | struct cgroup *oldcgrp = NULL; | 1992 | struct cgroup *oldcgrp = NULL; |
1995 | struct css_set *oldcg; | 1993 | struct css_set *oldcg; |
1996 | struct cgroupfs_root *root = cgrp->root; | 1994 | struct cgroupfs_root *root = cgrp->root; |
1997 | /* threadgroup list cursor and array */ | 1995 | /* threadgroup list cursor and array */ |
1998 | struct task_struct *tsk; | 1996 | struct task_struct *tsk; |
1999 | struct flex_array *group; | 1997 | struct flex_array *group; |
2000 | /* | 1998 | /* |
2001 | * we need to make sure we have css_sets for all the tasks we're | 1999 | * we need to make sure we have css_sets for all the tasks we're |
2002 | * going to move -before- we actually start moving them, so that in | 2000 | * going to move -before- we actually start moving them, so that in |
2003 | * case we get an ENOMEM we can bail out before making any changes. | 2001 | * case we get an ENOMEM we can bail out before making any changes. |
2004 | */ | 2002 | */ |
2005 | struct list_head newcg_list; | 2003 | struct list_head newcg_list; |
2006 | struct cg_list_entry *cg_entry, *temp_nobe; | 2004 | struct cg_list_entry *cg_entry, *temp_nobe; |
2007 | 2005 | ||
2008 | /* | 2006 | /* |
2009 | * step 0: in order to do expensive, possibly blocking operations for | 2007 | * step 0: in order to do expensive, possibly blocking operations for |
2010 | * every thread, we cannot iterate the thread group list, since it needs | 2008 | * every thread, we cannot iterate the thread group list, since it needs |
2011 | * rcu or tasklist locked. instead, build an array of all threads in the | 2009 | * rcu or tasklist locked. instead, build an array of all threads in the |
2012 | * group - threadgroup_fork_lock prevents new threads from appearing, | 2010 | * group - threadgroup_fork_lock prevents new threads from appearing, |
2013 | * and if threads exit, this will just be an over-estimate. | 2011 | * and if threads exit, this will just be an over-estimate. |
2014 | */ | 2012 | */ |
2015 | group_size = get_nr_threads(leader); | 2013 | group_size = get_nr_threads(leader); |
2016 | /* flex_array supports very large thread-groups better than kmalloc. */ | 2014 | /* flex_array supports very large thread-groups better than kmalloc. */ |
2017 | group = flex_array_alloc(sizeof(struct task_struct *), group_size, | 2015 | group = flex_array_alloc(sizeof(struct task_struct *), group_size, |
2018 | GFP_KERNEL); | 2016 | GFP_KERNEL); |
2019 | if (!group) | 2017 | if (!group) |
2020 | return -ENOMEM; | 2018 | return -ENOMEM; |
2021 | /* pre-allocate to guarantee space while iterating in rcu read-side. */ | 2019 | /* pre-allocate to guarantee space while iterating in rcu read-side. */ |
2022 | retval = flex_array_prealloc(group, 0, group_size - 1, GFP_KERNEL); | 2020 | retval = flex_array_prealloc(group, 0, group_size - 1, GFP_KERNEL); |
2023 | if (retval) | 2021 | if (retval) |
2024 | goto out_free_group_list; | 2022 | goto out_free_group_list; |
2025 | 2023 | ||
2026 | /* prevent changes to the threadgroup list while we take a snapshot. */ | 2024 | /* prevent changes to the threadgroup list while we take a snapshot. */ |
2027 | rcu_read_lock(); | 2025 | rcu_read_lock(); |
2028 | if (!thread_group_leader(leader)) { | 2026 | if (!thread_group_leader(leader)) { |
2029 | /* | 2027 | /* |
2030 | * a race with de_thread from another thread's exec() may strip | 2028 | * a race with de_thread from another thread's exec() may strip |
2031 | * us of our leadership, making while_each_thread unsafe to use | 2029 | * us of our leadership, making while_each_thread unsafe to use |
2032 | * on this task. if this happens, there is no choice but to | 2030 | * on this task. if this happens, there is no choice but to |
2033 | * throw this task away and try again (from cgroup_procs_write); | 2031 | * throw this task away and try again (from cgroup_procs_write); |
2034 | * this is "double-double-toil-and-trouble-check locking". | 2032 | * this is "double-double-toil-and-trouble-check locking". |
2035 | */ | 2033 | */ |
2036 | rcu_read_unlock(); | 2034 | rcu_read_unlock(); |
2037 | retval = -EAGAIN; | 2035 | retval = -EAGAIN; |
2038 | goto out_free_group_list; | 2036 | goto out_free_group_list; |
2039 | } | 2037 | } |
2040 | /* take a reference on each task in the group to go in the array. */ | 2038 | /* take a reference on each task in the group to go in the array. */ |
2041 | tsk = leader; | 2039 | tsk = leader; |
2042 | i = 0; | 2040 | i = 0; |
2043 | do { | 2041 | do { |
2044 | /* as per above, nr_threads may decrease, but not increase. */ | 2042 | /* as per above, nr_threads may decrease, but not increase. */ |
2045 | BUG_ON(i >= group_size); | 2043 | BUG_ON(i >= group_size); |
2046 | get_task_struct(tsk); | 2044 | get_task_struct(tsk); |
2047 | /* | 2045 | /* |
2048 | * saying GFP_ATOMIC has no effect here because we did prealloc | 2046 | * saying GFP_ATOMIC has no effect here because we did prealloc |
2049 | * earlier, but it's good form to communicate our expectations. | 2047 | * earlier, but it's good form to communicate our expectations. |
2050 | */ | 2048 | */ |
2051 | retval = flex_array_put_ptr(group, i, tsk, GFP_ATOMIC); | 2049 | retval = flex_array_put_ptr(group, i, tsk, GFP_ATOMIC); |
2052 | BUG_ON(retval != 0); | 2050 | BUG_ON(retval != 0); |
2053 | i++; | 2051 | i++; |
2054 | } while_each_thread(leader, tsk); | 2052 | } while_each_thread(leader, tsk); |
2055 | /* remember the number of threads in the array for later. */ | 2053 | /* remember the number of threads in the array for later. */ |
2056 | group_size = i; | 2054 | group_size = i; |
2057 | rcu_read_unlock(); | 2055 | rcu_read_unlock(); |
2058 | 2056 | ||
2059 | /* | 2057 | /* |
2060 | * step 1: check that we can legitimately attach to the cgroup. | 2058 | * step 1: check that we can legitimately attach to the cgroup. |
2061 | */ | 2059 | */ |
2062 | for_each_subsys(root, ss) { | 2060 | for_each_subsys(root, ss) { |
2063 | if (ss->can_attach) { | 2061 | if (ss->can_attach) { |
2064 | retval = ss->can_attach(ss, cgrp, leader); | 2062 | retval = ss->can_attach(ss, cgrp, leader); |
2065 | if (retval) { | 2063 | if (retval) { |
2066 | failed_ss = ss; | 2064 | failed_ss = ss; |
2067 | goto out_cancel_attach; | 2065 | goto out_cancel_attach; |
2068 | } | 2066 | } |
2069 | } | 2067 | } |
2070 | /* a callback to be run on every thread in the threadgroup. */ | 2068 | /* a callback to be run on every thread in the threadgroup. */ |
2071 | if (ss->can_attach_task) { | 2069 | if (ss->can_attach_task) { |
2072 | /* run on each task in the threadgroup. */ | 2070 | /* run on each task in the threadgroup. */ |
2073 | for (i = 0; i < group_size; i++) { | 2071 | for (i = 0; i < group_size; i++) { |
2074 | tsk = flex_array_get_ptr(group, i); | 2072 | tsk = flex_array_get_ptr(group, i); |
2075 | retval = ss->can_attach_task(cgrp, tsk); | 2073 | retval = ss->can_attach_task(cgrp, tsk); |
2076 | if (retval) { | 2074 | if (retval) { |
2077 | failed_ss = ss; | 2075 | failed_ss = ss; |
2078 | cancel_failed_ss = true; | 2076 | cancel_failed_ss = true; |
2079 | goto out_cancel_attach; | 2077 | goto out_cancel_attach; |
2080 | } | 2078 | } |
2081 | } | 2079 | } |
2082 | } | 2080 | } |
2083 | } | 2081 | } |
2084 | 2082 | ||
2085 | /* | 2083 | /* |
2086 | * step 2: make sure css_sets exist for all threads to be migrated. | 2084 | * step 2: make sure css_sets exist for all threads to be migrated. |
2087 | * we use find_css_set, which allocates a new one if necessary. | 2085 | * we use find_css_set, which allocates a new one if necessary. |
2088 | */ | 2086 | */ |
2089 | INIT_LIST_HEAD(&newcg_list); | 2087 | INIT_LIST_HEAD(&newcg_list); |
2090 | for (i = 0; i < group_size; i++) { | 2088 | for (i = 0; i < group_size; i++) { |
2091 | tsk = flex_array_get_ptr(group, i); | 2089 | tsk = flex_array_get_ptr(group, i); |
2092 | /* nothing to do if this task is already in the cgroup */ | 2090 | /* nothing to do if this task is already in the cgroup */ |
2093 | oldcgrp = task_cgroup_from_root(tsk, root); | 2091 | oldcgrp = task_cgroup_from_root(tsk, root); |
2094 | if (cgrp == oldcgrp) | 2092 | if (cgrp == oldcgrp) |
2095 | continue; | 2093 | continue; |
2096 | /* get old css_set pointer */ | 2094 | /* get old css_set pointer */ |
2097 | task_lock(tsk); | 2095 | task_lock(tsk); |
2098 | if (tsk->flags & PF_EXITING) { | 2096 | if (tsk->flags & PF_EXITING) { |
2099 | /* ignore this task if it's going away */ | 2097 | /* ignore this task if it's going away */ |
2100 | task_unlock(tsk); | 2098 | task_unlock(tsk); |
2101 | continue; | 2099 | continue; |
2102 | } | 2100 | } |
2103 | oldcg = tsk->cgroups; | 2101 | oldcg = tsk->cgroups; |
2104 | get_css_set(oldcg); | 2102 | get_css_set(oldcg); |
2105 | task_unlock(tsk); | 2103 | task_unlock(tsk); |
2106 | /* see if the new one for us is already in the list? */ | 2104 | /* see if the new one for us is already in the list? */ |
2107 | if (css_set_check_fetched(cgrp, tsk, oldcg, &newcg_list)) { | 2105 | if (css_set_check_fetched(cgrp, tsk, oldcg, &newcg_list)) { |
2108 | /* was already there, nothing to do. */ | 2106 | /* was already there, nothing to do. */ |
2109 | put_css_set(oldcg); | 2107 | put_css_set(oldcg); |
2110 | } else { | 2108 | } else { |
2111 | /* we don't already have it. get new one. */ | 2109 | /* we don't already have it. get new one. */ |
2112 | retval = css_set_prefetch(cgrp, oldcg, &newcg_list); | 2110 | retval = css_set_prefetch(cgrp, oldcg, &newcg_list); |
2113 | put_css_set(oldcg); | 2111 | put_css_set(oldcg); |
2114 | if (retval) | 2112 | if (retval) |
2115 | goto out_list_teardown; | 2113 | goto out_list_teardown; |
2116 | } | 2114 | } |
2117 | } | 2115 | } |
2118 | 2116 | ||
2119 | /* | 2117 | /* |
2120 | * step 3: now that we're guaranteed success wrt the css_sets, proceed | 2118 | * step 3: now that we're guaranteed success wrt the css_sets, proceed |
2121 | * to move all tasks to the new cgroup, calling ss->attach_task for each | 2119 | * to move all tasks to the new cgroup, calling ss->attach_task for each |
2122 | * one along the way. there are no failure cases after here, so this is | 2120 | * one along the way. there are no failure cases after here, so this is |
2123 | * the commit point. | 2121 | * the commit point. |
2124 | */ | 2122 | */ |
2125 | for_each_subsys(root, ss) { | 2123 | for_each_subsys(root, ss) { |
2126 | if (ss->pre_attach) | 2124 | if (ss->pre_attach) |
2127 | ss->pre_attach(cgrp); | 2125 | ss->pre_attach(cgrp); |
2128 | } | 2126 | } |
2129 | for (i = 0; i < group_size; i++) { | 2127 | for (i = 0; i < group_size; i++) { |
2130 | tsk = flex_array_get_ptr(group, i); | 2128 | tsk = flex_array_get_ptr(group, i); |
2131 | /* leave current thread as it is if it's already there */ | 2129 | /* leave current thread as it is if it's already there */ |
2132 | oldcgrp = task_cgroup_from_root(tsk, root); | 2130 | oldcgrp = task_cgroup_from_root(tsk, root); |
2133 | if (cgrp == oldcgrp) | 2131 | if (cgrp == oldcgrp) |
2134 | continue; | 2132 | continue; |
2135 | /* attach each task to each subsystem */ | 2133 | /* attach each task to each subsystem */ |
2136 | for_each_subsys(root, ss) { | 2134 | for_each_subsys(root, ss) { |
2137 | if (ss->attach_task) | 2135 | if (ss->attach_task) |
2138 | ss->attach_task(cgrp, tsk); | 2136 | ss->attach_task(cgrp, tsk); |
2139 | } | 2137 | } |
2140 | /* if the thread is PF_EXITING, it can just get skipped. */ | 2138 | /* if the thread is PF_EXITING, it can just get skipped. */ |
2141 | retval = cgroup_task_migrate(cgrp, oldcgrp, tsk, true); | 2139 | retval = cgroup_task_migrate(cgrp, oldcgrp, tsk, true); |
2142 | BUG_ON(retval != 0 && retval != -ESRCH); | 2140 | BUG_ON(retval != 0 && retval != -ESRCH); |
2143 | } | 2141 | } |
2144 | /* nothing is sensitive to fork() after this point. */ | 2142 | /* nothing is sensitive to fork() after this point. */ |
2145 | 2143 | ||
2146 | /* | 2144 | /* |
2147 | * step 4: do expensive, non-thread-specific subsystem callbacks. | 2145 | * step 4: do expensive, non-thread-specific subsystem callbacks. |
2148 | * TODO: if ever a subsystem needs to know the oldcgrp for each task | 2146 | * TODO: if ever a subsystem needs to know the oldcgrp for each task |
2149 | * being moved, this call will need to be reworked to communicate that. | 2147 | * being moved, this call will need to be reworked to communicate that. |
2150 | */ | 2148 | */ |
2151 | for_each_subsys(root, ss) { | 2149 | for_each_subsys(root, ss) { |
2152 | if (ss->attach) | 2150 | if (ss->attach) |
2153 | ss->attach(ss, cgrp, oldcgrp, leader); | 2151 | ss->attach(ss, cgrp, oldcgrp, leader); |
2154 | } | 2152 | } |
2155 | 2153 | ||
2156 | /* | 2154 | /* |
2157 | * step 5: success! and cleanup | 2155 | * step 5: success! and cleanup |
2158 | */ | 2156 | */ |
2159 | synchronize_rcu(); | 2157 | synchronize_rcu(); |
2160 | cgroup_wakeup_rmdir_waiter(cgrp); | 2158 | cgroup_wakeup_rmdir_waiter(cgrp); |
2161 | retval = 0; | 2159 | retval = 0; |
2162 | out_list_teardown: | 2160 | out_list_teardown: |
2163 | /* clean up the list of prefetched css_sets. */ | 2161 | /* clean up the list of prefetched css_sets. */ |
2164 | list_for_each_entry_safe(cg_entry, temp_nobe, &newcg_list, links) { | 2162 | list_for_each_entry_safe(cg_entry, temp_nobe, &newcg_list, links) { |
2165 | list_del(&cg_entry->links); | 2163 | list_del(&cg_entry->links); |
2166 | put_css_set(cg_entry->cg); | 2164 | put_css_set(cg_entry->cg); |
2167 | kfree(cg_entry); | 2165 | kfree(cg_entry); |
2168 | } | 2166 | } |
2169 | out_cancel_attach: | 2167 | out_cancel_attach: |
2170 | /* same deal as in cgroup_attach_task */ | 2168 | /* same deal as in cgroup_attach_task */ |
2171 | if (retval) { | 2169 | if (retval) { |
2172 | for_each_subsys(root, ss) { | 2170 | for_each_subsys(root, ss) { |
2173 | if (ss == failed_ss) { | 2171 | if (ss == failed_ss) { |
2174 | if (cancel_failed_ss && ss->cancel_attach) | 2172 | if (cancel_failed_ss && ss->cancel_attach) |
2175 | ss->cancel_attach(ss, cgrp, leader); | 2173 | ss->cancel_attach(ss, cgrp, leader); |
2176 | break; | 2174 | break; |
2177 | } | 2175 | } |
2178 | if (ss->cancel_attach) | 2176 | if (ss->cancel_attach) |
2179 | ss->cancel_attach(ss, cgrp, leader); | 2177 | ss->cancel_attach(ss, cgrp, leader); |
2180 | } | 2178 | } |
2181 | } | 2179 | } |
2182 | /* clean up the array of referenced threads in the group. */ | 2180 | /* clean up the array of referenced threads in the group. */ |
2183 | for (i = 0; i < group_size; i++) { | 2181 | for (i = 0; i < group_size; i++) { |
2184 | tsk = flex_array_get_ptr(group, i); | 2182 | tsk = flex_array_get_ptr(group, i); |
2185 | put_task_struct(tsk); | 2183 | put_task_struct(tsk); |
2186 | } | 2184 | } |
2187 | out_free_group_list: | 2185 | out_free_group_list: |
2188 | flex_array_free(group); | 2186 | flex_array_free(group); |
2189 | return retval; | 2187 | return retval; |
2190 | } | 2188 | } |
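The step 0 comment above is the key to the whole function: take a quick snapshot of the thread group while the RCU read lock is held, then do the expensive per-thread work outside of it. A hedged userspace sketch of that snapshot-then-process shape follows; a plain pthread mutex stands in for rcu_read_lock()/rcu_read_unlock() and an array stands in for the thread list, so nothing here is kernel API:

    #include <pthread.h>
    #include <stdio.h>
    #include <stdlib.h>

    /* A toy shared "thread group": an array guarded by a lock. Only the cheap
     * copy happens while the lock is held. */
    static pthread_mutex_t group_lock = PTHREAD_MUTEX_INITIALIZER;
    static int group[] = { 11, 22, 33, 44 };
    static int group_size = 4;

    int main(void)
    {
        /* Pre-allocate outside the lock; like get_nr_threads(), the size read
         * here may only be an over-estimate by the time the snapshot is taken. */
        int capacity = group_size;
        int *snapshot = malloc(capacity * sizeof(*snapshot));
        int n = 0;

        if (!snapshot)
            return 1;

        /* Step 0: take the snapshot quickly under the lock ... */
        pthread_mutex_lock(&group_lock);
        for (int i = 0; i < group_size && n < capacity; i++)
            snapshot[n++] = group[i];
        pthread_mutex_unlock(&group_lock);

        /* ... then do the expensive, possibly blocking work outside it. */
        for (int i = 0; i < n; i++)
            printf("processing member %d\n", snapshot[i]);

        free(snapshot);
        return 0;
    }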
2191 | 2189 | ||
2192 | /* | 2190 | /* |
2193 | * Find the task_struct of the task to attach by vpid and pass it along to the | 2191 | * Find the task_struct of the task to attach by vpid and pass it along to the |
2194 | * function to attach either it or all tasks in its threadgroup. Will take | 2192 | * function to attach either it or all tasks in its threadgroup. Will take |
2195 | * cgroup_mutex; may take task_lock of task. | 2193 | * cgroup_mutex; may take task_lock of task. |
2196 | */ | 2194 | */ |
2197 | static int attach_task_by_pid(struct cgroup *cgrp, u64 pid, bool threadgroup) | 2195 | static int attach_task_by_pid(struct cgroup *cgrp, u64 pid, bool threadgroup) |
2198 | { | 2196 | { |
2199 | struct task_struct *tsk; | 2197 | struct task_struct *tsk; |
2200 | const struct cred *cred = current_cred(), *tcred; | 2198 | const struct cred *cred = current_cred(), *tcred; |
2201 | int ret; | 2199 | int ret; |
2202 | 2200 | ||
2203 | if (!cgroup_lock_live_group(cgrp)) | 2201 | if (!cgroup_lock_live_group(cgrp)) |
2204 | return -ENODEV; | 2202 | return -ENODEV; |
2205 | 2203 | ||
2206 | if (pid) { | 2204 | if (pid) { |
2207 | rcu_read_lock(); | 2205 | rcu_read_lock(); |
2208 | tsk = find_task_by_vpid(pid); | 2206 | tsk = find_task_by_vpid(pid); |
2209 | if (!tsk) { | 2207 | if (!tsk) { |
2210 | rcu_read_unlock(); | 2208 | rcu_read_unlock(); |
2211 | cgroup_unlock(); | 2209 | cgroup_unlock(); |
2212 | return -ESRCH; | 2210 | return -ESRCH; |
2213 | } | 2211 | } |
2214 | if (threadgroup) { | 2212 | if (threadgroup) { |
2215 | /* | 2213 | /* |
2216 | * RCU protects this access, since tsk was found in the | 2214 | * RCU protects this access, since tsk was found in the |
2217 | * tid map. a race with de_thread may cause group_leader | 2215 | * tid map. a race with de_thread may cause group_leader |
2218 | * to stop being the leader, but cgroup_attach_proc will | 2216 | * to stop being the leader, but cgroup_attach_proc will |
2219 | * detect it later. | 2217 | * detect it later. |
2220 | */ | 2218 | */ |
2221 | tsk = tsk->group_leader; | 2219 | tsk = tsk->group_leader; |
2222 | } else if (tsk->flags & PF_EXITING) { | 2220 | } else if (tsk->flags & PF_EXITING) { |
2223 | /* optimization for the single-task-only case */ | 2221 | /* optimization for the single-task-only case */ |
2224 | rcu_read_unlock(); | 2222 | rcu_read_unlock(); |
2225 | cgroup_unlock(); | 2223 | cgroup_unlock(); |
2226 | return -ESRCH; | 2224 | return -ESRCH; |
2227 | } | 2225 | } |
2228 | 2226 | ||
2229 | /* | 2227 | /* |
2230 | * even if we're attaching all tasks in the thread group, we | 2228 | * even if we're attaching all tasks in the thread group, we |
2231 | * only need to check permissions on one of them. | 2229 | * only need to check permissions on one of them. |
2232 | */ | 2230 | */ |
2233 | tcred = __task_cred(tsk); | 2231 | tcred = __task_cred(tsk); |
2234 | if (cred->euid && | 2232 | if (cred->euid && |
2235 | cred->euid != tcred->uid && | 2233 | cred->euid != tcred->uid && |
2236 | cred->euid != tcred->suid) { | 2234 | cred->euid != tcred->suid) { |
2237 | rcu_read_unlock(); | 2235 | rcu_read_unlock(); |
2238 | cgroup_unlock(); | 2236 | cgroup_unlock(); |
2239 | return -EACCES; | 2237 | return -EACCES; |
2240 | } | 2238 | } |
2241 | get_task_struct(tsk); | 2239 | get_task_struct(tsk); |
2242 | rcu_read_unlock(); | 2240 | rcu_read_unlock(); |
2243 | } else { | 2241 | } else { |
2244 | if (threadgroup) | 2242 | if (threadgroup) |
2245 | tsk = current->group_leader; | 2243 | tsk = current->group_leader; |
2246 | else | 2244 | else |
2247 | tsk = current; | 2245 | tsk = current; |
2248 | get_task_struct(tsk); | 2246 | get_task_struct(tsk); |
2249 | } | 2247 | } |
2250 | 2248 | ||
2251 | if (threadgroup) { | 2249 | if (threadgroup) { |
2252 | threadgroup_fork_write_lock(tsk); | 2250 | threadgroup_fork_write_lock(tsk); |
2253 | ret = cgroup_attach_proc(cgrp, tsk); | 2251 | ret = cgroup_attach_proc(cgrp, tsk); |
2254 | threadgroup_fork_write_unlock(tsk); | 2252 | threadgroup_fork_write_unlock(tsk); |
2255 | } else { | 2253 | } else { |
2256 | ret = cgroup_attach_task(cgrp, tsk); | 2254 | ret = cgroup_attach_task(cgrp, tsk); |
2257 | } | 2255 | } |
2258 | put_task_struct(tsk); | 2256 | put_task_struct(tsk); |
2259 | cgroup_unlock(); | 2257 | cgroup_unlock(); |
2260 | return ret; | 2258 | return ret; |
2261 | } | 2259 | } |
2262 | 2260 | ||
2263 | static int cgroup_tasks_write(struct cgroup *cgrp, struct cftype *cft, u64 pid) | 2261 | static int cgroup_tasks_write(struct cgroup *cgrp, struct cftype *cft, u64 pid) |
2264 | { | 2262 | { |
2265 | return attach_task_by_pid(cgrp, pid, false); | 2263 | return attach_task_by_pid(cgrp, pid, false); |
2266 | } | 2264 | } |
2267 | 2265 | ||
2268 | static int cgroup_procs_write(struct cgroup *cgrp, struct cftype *cft, u64 tgid) | 2266 | static int cgroup_procs_write(struct cgroup *cgrp, struct cftype *cft, u64 tgid) |
2269 | { | 2267 | { |
2270 | int ret; | 2268 | int ret; |
2271 | do { | 2269 | do { |
2272 | /* | 2270 | /* |
2273 | * attach_proc fails with -EAGAIN if threadgroup leadership | 2271 | * attach_proc fails with -EAGAIN if threadgroup leadership |
2274 | * changes in the middle of the operation, in which case we need | 2272 | * changes in the middle of the operation, in which case we need |
2275 | * to find the task_struct for the new leader and start over. | 2273 | * to find the task_struct for the new leader and start over. |
2276 | */ | 2274 | */ |
2277 | ret = attach_task_by_pid(cgrp, tgid, true); | 2275 | ret = attach_task_by_pid(cgrp, tgid, true); |
2278 | } while (ret == -EAGAIN); | 2276 | } while (ret == -EAGAIN); |
2279 | return ret; | 2277 | return ret; |
2280 | } | 2278 | } |
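cgroup_procs_write simply retries whenever attach_task_by_pid reports that threadgroup leadership changed mid-operation. A small sketch of that retry-until-stable loop in userspace C; try_attach and its simulated transient failures are invented for the example:

    #include <errno.h>
    #include <stdio.h>

    /* Pretend the first two attempts race with a leadership change. */
    static int try_attach(void)
    {
        static int attempts;
        return (++attempts < 3) ? -EAGAIN : 0;
    }

    int main(void)
    {
        int ret;

        do {
            ret = try_attach();
        } while (ret == -EAGAIN);   /* start over until no race is detected */

        printf("attach result: %d\n", ret);
        return ret ? 1 : 0;
    }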
2281 | 2279 | ||
2282 | /** | 2280 | /** |
2283 | * cgroup_lock_live_group - take cgroup_mutex and check that cgrp is alive. | 2281 | * cgroup_lock_live_group - take cgroup_mutex and check that cgrp is alive. |
2284 | * @cgrp: the cgroup to be checked for liveness | 2282 | * @cgrp: the cgroup to be checked for liveness |
2285 | * | 2283 | * |
2286 | * On success, returns true; the lock should be later released with | 2284 | * On success, returns true; the lock should be later released with |
2287 | * cgroup_unlock(). On failure returns false with no lock held. | 2285 | * cgroup_unlock(). On failure returns false with no lock held. |
2288 | */ | 2286 | */ |
2289 | bool cgroup_lock_live_group(struct cgroup *cgrp) | 2287 | bool cgroup_lock_live_group(struct cgroup *cgrp) |
2290 | { | 2288 | { |
2291 | mutex_lock(&cgroup_mutex); | 2289 | mutex_lock(&cgroup_mutex); |
2292 | if (cgroup_is_removed(cgrp)) { | 2290 | if (cgroup_is_removed(cgrp)) { |
2293 | mutex_unlock(&cgroup_mutex); | 2291 | mutex_unlock(&cgroup_mutex); |
2294 | return false; | 2292 | return false; |
2295 | } | 2293 | } |
2296 | return true; | 2294 | return true; |
2297 | } | 2295 | } |
2298 | EXPORT_SYMBOL_GPL(cgroup_lock_live_group); | 2296 | EXPORT_SYMBOL_GPL(cgroup_lock_live_group); |
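The contract documented for cgroup_lock_live_group is worth spelling out: the caller either ends up holding the mutex on a still-live object, or holds nothing at all and never has to unlock. A sketch of the same contract with pthreads; the object type and its removed flag are made up for illustration:

    #include <pthread.h>
    #include <stdbool.h>
    #include <stdio.h>

    struct obj {
        pthread_mutex_t lock;
        bool removed;
    };

    /* Returns true with obj->lock held, or false with no lock held. */
    static bool lock_live(struct obj *o)
    {
        pthread_mutex_lock(&o->lock);
        if (o->removed) {
            pthread_mutex_unlock(&o->lock);
            return false;
        }
        return true;
    }

    int main(void)
    {
        struct obj o = { PTHREAD_MUTEX_INITIALIZER, false };

        if (lock_live(&o)) {
            printf("object is live, doing work under the lock\n");
            pthread_mutex_unlock(&o.lock);
        }
        return 0;
    }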
2299 | 2297 | ||
2300 | static int cgroup_release_agent_write(struct cgroup *cgrp, struct cftype *cft, | 2298 | static int cgroup_release_agent_write(struct cgroup *cgrp, struct cftype *cft, |
2301 | const char *buffer) | 2299 | const char *buffer) |
2302 | { | 2300 | { |
2303 | BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX); | 2301 | BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX); |
2304 | if (strlen(buffer) >= PATH_MAX) | 2302 | if (strlen(buffer) >= PATH_MAX) |
2305 | return -EINVAL; | 2303 | return -EINVAL; |
2306 | if (!cgroup_lock_live_group(cgrp)) | 2304 | if (!cgroup_lock_live_group(cgrp)) |
2307 | return -ENODEV; | 2305 | return -ENODEV; |
2308 | strcpy(cgrp->root->release_agent_path, buffer); | 2306 | strcpy(cgrp->root->release_agent_path, buffer); |
2309 | cgroup_unlock(); | 2307 | cgroup_unlock(); |
2310 | return 0; | 2308 | return 0; |
2311 | } | 2309 | } |
2312 | 2310 | ||
2313 | static int cgroup_release_agent_show(struct cgroup *cgrp, struct cftype *cft, | 2311 | static int cgroup_release_agent_show(struct cgroup *cgrp, struct cftype *cft, |
2314 | struct seq_file *seq) | 2312 | struct seq_file *seq) |
2315 | { | 2313 | { |
2316 | if (!cgroup_lock_live_group(cgrp)) | 2314 | if (!cgroup_lock_live_group(cgrp)) |
2317 | return -ENODEV; | 2315 | return -ENODEV; |
2318 | seq_puts(seq, cgrp->root->release_agent_path); | 2316 | seq_puts(seq, cgrp->root->release_agent_path); |
2319 | seq_putc(seq, '\n'); | 2317 | seq_putc(seq, '\n'); |
2320 | cgroup_unlock(); | 2318 | cgroup_unlock(); |
2321 | return 0; | 2319 | return 0; |
2322 | } | 2320 | } |
2323 | 2321 | ||
2324 | /* A buffer size big enough for numbers or short strings */ | 2322 | /* A buffer size big enough for numbers or short strings */ |
2325 | #define CGROUP_LOCAL_BUFFER_SIZE 64 | 2323 | #define CGROUP_LOCAL_BUFFER_SIZE 64 |
2326 | 2324 | ||
2327 | static ssize_t cgroup_write_X64(struct cgroup *cgrp, struct cftype *cft, | 2325 | static ssize_t cgroup_write_X64(struct cgroup *cgrp, struct cftype *cft, |
2328 | struct file *file, | 2326 | struct file *file, |
2329 | const char __user *userbuf, | 2327 | const char __user *userbuf, |
2330 | size_t nbytes, loff_t *unused_ppos) | 2328 | size_t nbytes, loff_t *unused_ppos) |
2331 | { | 2329 | { |
2332 | char buffer[CGROUP_LOCAL_BUFFER_SIZE]; | 2330 | char buffer[CGROUP_LOCAL_BUFFER_SIZE]; |
2333 | int retval = 0; | 2331 | int retval = 0; |
2334 | char *end; | 2332 | char *end; |
2335 | 2333 | ||
2336 | if (!nbytes) | 2334 | if (!nbytes) |
2337 | return -EINVAL; | 2335 | return -EINVAL; |
2338 | if (nbytes >= sizeof(buffer)) | 2336 | if (nbytes >= sizeof(buffer)) |
2339 | return -E2BIG; | 2337 | return -E2BIG; |
2340 | if (copy_from_user(buffer, userbuf, nbytes)) | 2338 | if (copy_from_user(buffer, userbuf, nbytes)) |
2341 | return -EFAULT; | 2339 | return -EFAULT; |
2342 | 2340 | ||
2343 | buffer[nbytes] = 0; /* nul-terminate */ | 2341 | buffer[nbytes] = 0; /* nul-terminate */ |
2344 | if (cft->write_u64) { | 2342 | if (cft->write_u64) { |
2345 | u64 val = simple_strtoull(strstrip(buffer), &end, 0); | 2343 | u64 val = simple_strtoull(strstrip(buffer), &end, 0); |
2346 | if (*end) | 2344 | if (*end) |
2347 | return -EINVAL; | 2345 | return -EINVAL; |
2348 | retval = cft->write_u64(cgrp, cft, val); | 2346 | retval = cft->write_u64(cgrp, cft, val); |
2349 | } else { | 2347 | } else { |
2350 | s64 val = simple_strtoll(strstrip(buffer), &end, 0); | 2348 | s64 val = simple_strtoll(strstrip(buffer), &end, 0); |
2351 | if (*end) | 2349 | if (*end) |
2352 | return -EINVAL; | 2350 | return -EINVAL; |
2353 | retval = cft->write_s64(cgrp, cft, val); | 2351 | retval = cft->write_s64(cgrp, cft, val); |
2354 | } | 2352 | } |
2355 | if (!retval) | 2353 | if (!retval) |
2356 | retval = nbytes; | 2354 | retval = nbytes; |
2357 | return retval; | 2355 | return retval; |
2358 | } | 2356 | } |
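cgroup_write_X64 rejects any trailing characters by checking *end after simple_strtoull/simple_strtoll. The equivalent strictness in userspace with the standard strtoull looks roughly like this; parse_u64_strict and the sample inputs are illustrative:

    #include <errno.h>
    #include <stdio.h>
    #include <stdlib.h>

    static int parse_u64_strict(const char *s, unsigned long long *out)
    {
        char *end;

        errno = 0;
        *out = strtoull(s, &end, 0);
        if (errno || end == s || *end != '\0')
            return -EINVAL;    /* no digits at all, overflow, or trailing garbage */
        return 0;
    }

    int main(void)
    {
        unsigned long long v;

        printf("\"42\"  -> %d\n", parse_u64_strict("42", &v));   /* 0, v == 42 */
        printf("\"42x\" -> %d\n", parse_u64_strict("42x", &v));  /* -EINVAL */
        return 0;
    }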
2359 | 2357 | ||
2360 | static ssize_t cgroup_write_string(struct cgroup *cgrp, struct cftype *cft, | 2358 | static ssize_t cgroup_write_string(struct cgroup *cgrp, struct cftype *cft, |
2361 | struct file *file, | 2359 | struct file *file, |
2362 | const char __user *userbuf, | 2360 | const char __user *userbuf, |
2363 | size_t nbytes, loff_t *unused_ppos) | 2361 | size_t nbytes, loff_t *unused_ppos) |
2364 | { | 2362 | { |
2365 | char local_buffer[CGROUP_LOCAL_BUFFER_SIZE]; | 2363 | char local_buffer[CGROUP_LOCAL_BUFFER_SIZE]; |
2366 | int retval = 0; | 2364 | int retval = 0; |
2367 | size_t max_bytes = cft->max_write_len; | 2365 | size_t max_bytes = cft->max_write_len; |
2368 | char *buffer = local_buffer; | 2366 | char *buffer = local_buffer; |
2369 | 2367 | ||
2370 | if (!max_bytes) | 2368 | if (!max_bytes) |
2371 | max_bytes = sizeof(local_buffer) - 1; | 2369 | max_bytes = sizeof(local_buffer) - 1; |
2372 | if (nbytes >= max_bytes) | 2370 | if (nbytes >= max_bytes) |
2373 | return -E2BIG; | 2371 | return -E2BIG; |
2374 | /* Allocate a dynamic buffer if we need one */ | 2372 | /* Allocate a dynamic buffer if we need one */ |
2375 | if (nbytes >= sizeof(local_buffer)) { | 2373 | if (nbytes >= sizeof(local_buffer)) { |
2376 | buffer = kmalloc(nbytes + 1, GFP_KERNEL); | 2374 | buffer = kmalloc(nbytes + 1, GFP_KERNEL); |
2377 | if (buffer == NULL) | 2375 | if (buffer == NULL) |
2378 | return -ENOMEM; | 2376 | return -ENOMEM; |
2379 | } | 2377 | } |
2380 | if (nbytes && copy_from_user(buffer, userbuf, nbytes)) { | 2378 | if (nbytes && copy_from_user(buffer, userbuf, nbytes)) { |
2381 | retval = -EFAULT; | 2379 | retval = -EFAULT; |
2382 | goto out; | 2380 | goto out; |
2383 | } | 2381 | } |
2384 | 2382 | ||
2385 | buffer[nbytes] = 0; /* nul-terminate */ | 2383 | buffer[nbytes] = 0; /* nul-terminate */ |
2386 | retval = cft->write_string(cgrp, cft, strstrip(buffer)); | 2384 | retval = cft->write_string(cgrp, cft, strstrip(buffer)); |
2387 | if (!retval) | 2385 | if (!retval) |
2388 | retval = nbytes; | 2386 | retval = nbytes; |
2389 | out: | 2387 | out: |
2390 | if (buffer != local_buffer) | 2388 | if (buffer != local_buffer) |
2391 | kfree(buffer); | 2389 | kfree(buffer); |
2392 | return retval; | 2390 | return retval; |
2393 | } | 2391 | } |
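cgroup_write_string starts with a small on-stack buffer and only falls back to kmalloc when the write is larger than CGROUP_LOCAL_BUFFER_SIZE, freeing the buffer only if it was actually heap-allocated. A userspace sketch of that small-buffer-first pattern; handle_write is invented and memcpy stands in for copy_from_user:

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    #define LOCAL_BUFFER_SIZE 64

    static int handle_write(const char *userbuf, size_t nbytes)
    {
        char local_buffer[LOCAL_BUFFER_SIZE];
        char *buffer = local_buffer;

        /* Fall back to the heap only when the stack buffer is too small. */
        if (nbytes >= sizeof(local_buffer)) {
            buffer = malloc(nbytes + 1);
            if (!buffer)
                return -1;
        }
        memcpy(buffer, userbuf, nbytes);   /* stands in for copy_from_user */
        buffer[nbytes] = '\0';

        printf("got %zu bytes: %s\n", nbytes, buffer);

        if (buffer != local_buffer)
            free(buffer);
        return 0;
    }

    int main(void)
    {
        return handle_write("hello", 5) ? 1 : 0;
    }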
2394 | 2392 | ||
2395 | static ssize_t cgroup_file_write(struct file *file, const char __user *buf, | 2393 | static ssize_t cgroup_file_write(struct file *file, const char __user *buf, |
2396 | size_t nbytes, loff_t *ppos) | 2394 | size_t nbytes, loff_t *ppos) |
2397 | { | 2395 | { |
2398 | struct cftype *cft = __d_cft(file->f_dentry); | 2396 | struct cftype *cft = __d_cft(file->f_dentry); |
2399 | struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent); | 2397 | struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent); |
2400 | 2398 | ||
2401 | if (cgroup_is_removed(cgrp)) | 2399 | if (cgroup_is_removed(cgrp)) |
2402 | return -ENODEV; | 2400 | return -ENODEV; |
2403 | if (cft->write) | 2401 | if (cft->write) |
2404 | return cft->write(cgrp, cft, file, buf, nbytes, ppos); | 2402 | return cft->write(cgrp, cft, file, buf, nbytes, ppos); |
2405 | if (cft->write_u64 || cft->write_s64) | 2403 | if (cft->write_u64 || cft->write_s64) |
2406 | return cgroup_write_X64(cgrp, cft, file, buf, nbytes, ppos); | 2404 | return cgroup_write_X64(cgrp, cft, file, buf, nbytes, ppos); |
2407 | if (cft->write_string) | 2405 | if (cft->write_string) |
2408 | return cgroup_write_string(cgrp, cft, file, buf, nbytes, ppos); | 2406 | return cgroup_write_string(cgrp, cft, file, buf, nbytes, ppos); |
2409 | if (cft->trigger) { | 2407 | if (cft->trigger) { |
2410 | int ret = cft->trigger(cgrp, (unsigned int)cft->private); | 2408 | int ret = cft->trigger(cgrp, (unsigned int)cft->private); |
2411 | return ret ? ret : nbytes; | 2409 | return ret ? ret : nbytes; |
2412 | } | 2410 | } |
2413 | return -EINVAL; | 2411 | return -EINVAL; |
2414 | } | 2412 | } |
2415 | 2413 | ||
2416 | static ssize_t cgroup_read_u64(struct cgroup *cgrp, struct cftype *cft, | 2414 | static ssize_t cgroup_read_u64(struct cgroup *cgrp, struct cftype *cft, |
2417 | struct file *file, | 2415 | struct file *file, |
2418 | char __user *buf, size_t nbytes, | 2416 | char __user *buf, size_t nbytes, |
2419 | loff_t *ppos) | 2417 | loff_t *ppos) |
2420 | { | 2418 | { |
2421 | char tmp[CGROUP_LOCAL_BUFFER_SIZE]; | 2419 | char tmp[CGROUP_LOCAL_BUFFER_SIZE]; |
2422 | u64 val = cft->read_u64(cgrp, cft); | 2420 | u64 val = cft->read_u64(cgrp, cft); |
2423 | int len = sprintf(tmp, "%llu\n", (unsigned long long) val); | 2421 | int len = sprintf(tmp, "%llu\n", (unsigned long long) val); |
2424 | 2422 | ||
2425 | return simple_read_from_buffer(buf, nbytes, ppos, tmp, len); | 2423 | return simple_read_from_buffer(buf, nbytes, ppos, tmp, len); |
2426 | } | 2424 | } |
2427 | 2425 | ||
2428 | static ssize_t cgroup_read_s64(struct cgroup *cgrp, struct cftype *cft, | 2426 | static ssize_t cgroup_read_s64(struct cgroup *cgrp, struct cftype *cft, |
2429 | struct file *file, | 2427 | struct file *file, |
2430 | char __user *buf, size_t nbytes, | 2428 | char __user *buf, size_t nbytes, |
2431 | loff_t *ppos) | 2429 | loff_t *ppos) |
2432 | { | 2430 | { |
2433 | char tmp[CGROUP_LOCAL_BUFFER_SIZE]; | 2431 | char tmp[CGROUP_LOCAL_BUFFER_SIZE]; |
2434 | s64 val = cft->read_s64(cgrp, cft); | 2432 | s64 val = cft->read_s64(cgrp, cft); |
2435 | int len = sprintf(tmp, "%lld\n", (long long) val); | 2433 | int len = sprintf(tmp, "%lld\n", (long long) val); |
2436 | 2434 | ||
2437 | return simple_read_from_buffer(buf, nbytes, ppos, tmp, len); | 2435 | return simple_read_from_buffer(buf, nbytes, ppos, tmp, len); |
2438 | } | 2436 | } |
2439 | 2437 | ||
2440 | static ssize_t cgroup_file_read(struct file *file, char __user *buf, | 2438 | static ssize_t cgroup_file_read(struct file *file, char __user *buf, |
2441 | size_t nbytes, loff_t *ppos) | 2439 | size_t nbytes, loff_t *ppos) |
2442 | { | 2440 | { |
2443 | struct cftype *cft = __d_cft(file->f_dentry); | 2441 | struct cftype *cft = __d_cft(file->f_dentry); |
2444 | struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent); | 2442 | struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent); |
2445 | 2443 | ||
2446 | if (cgroup_is_removed(cgrp)) | 2444 | if (cgroup_is_removed(cgrp)) |
2447 | return -ENODEV; | 2445 | return -ENODEV; |
2448 | 2446 | ||
2449 | if (cft->read) | 2447 | if (cft->read) |
2450 | return cft->read(cgrp, cft, file, buf, nbytes, ppos); | 2448 | return cft->read(cgrp, cft, file, buf, nbytes, ppos); |
2451 | if (cft->read_u64) | 2449 | if (cft->read_u64) |
2452 | return cgroup_read_u64(cgrp, cft, file, buf, nbytes, ppos); | 2450 | return cgroup_read_u64(cgrp, cft, file, buf, nbytes, ppos); |
2453 | if (cft->read_s64) | 2451 | if (cft->read_s64) |
2454 | return cgroup_read_s64(cgrp, cft, file, buf, nbytes, ppos); | 2452 | return cgroup_read_s64(cgrp, cft, file, buf, nbytes, ppos); |
2455 | return -EINVAL; | 2453 | return -EINVAL; |
2456 | } | 2454 | } |
2457 | 2455 | ||
2458 | /* | 2456 | /* |
2459 | * seqfile ops/methods for returning structured data. Currently just | 2457 | * seqfile ops/methods for returning structured data. Currently just |
2460 | * supports string->u64 maps, but can be extended in future. | 2458 | * supports string->u64 maps, but can be extended in future. |
2461 | */ | 2459 | */ |
2462 | 2460 | ||
2463 | struct cgroup_seqfile_state { | 2461 | struct cgroup_seqfile_state { |
2464 | struct cftype *cft; | 2462 | struct cftype *cft; |
2465 | struct cgroup *cgroup; | 2463 | struct cgroup *cgroup; |
2466 | }; | 2464 | }; |
2467 | 2465 | ||
2468 | static int cgroup_map_add(struct cgroup_map_cb *cb, const char *key, u64 value) | 2466 | static int cgroup_map_add(struct cgroup_map_cb *cb, const char *key, u64 value) |
2469 | { | 2467 | { |
2470 | struct seq_file *sf = cb->state; | 2468 | struct seq_file *sf = cb->state; |
2471 | return seq_printf(sf, "%s %llu\n", key, (unsigned long long)value); | 2469 | return seq_printf(sf, "%s %llu\n", key, (unsigned long long)value); |
2472 | } | 2470 | } |
2473 | 2471 | ||
2474 | static int cgroup_seqfile_show(struct seq_file *m, void *arg) | 2472 | static int cgroup_seqfile_show(struct seq_file *m, void *arg) |
2475 | { | 2473 | { |
2476 | struct cgroup_seqfile_state *state = m->private; | 2474 | struct cgroup_seqfile_state *state = m->private; |
2477 | struct cftype *cft = state->cft; | 2475 | struct cftype *cft = state->cft; |
2478 | if (cft->read_map) { | 2476 | if (cft->read_map) { |
2479 | struct cgroup_map_cb cb = { | 2477 | struct cgroup_map_cb cb = { |
2480 | .fill = cgroup_map_add, | 2478 | .fill = cgroup_map_add, |
2481 | .state = m, | 2479 | .state = m, |
2482 | }; | 2480 | }; |
2483 | return cft->read_map(state->cgroup, cft, &cb); | 2481 | return cft->read_map(state->cgroup, cft, &cb); |
2484 | } | 2482 | } |
2485 | return cft->read_seq_string(state->cgroup, cft, m); | 2483 | return cft->read_seq_string(state->cgroup, cft, m); |
2486 | } | 2484 | } |
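The seqfile glue stores a tiny state object (cftype plus cgroup) per open file and dispatches to whichever read callback the control file provides. Stripped of the seq_file machinery, the pattern is just "state struct plus function pointer"; a hedged userspace sketch, with struct state and the two show callbacks invented for the example:

    #include <stdio.h>

    struct state {
        int value;
        int (*show)(const struct state *st, FILE *out);  /* per-file callback */
    };

    static int show_decimal(const struct state *st, FILE *out)
    {
        return fprintf(out, "%d\n", st->value) < 0 ? -1 : 0;
    }

    static int show_hex(const struct state *st, FILE *out)
    {
        return fprintf(out, "0x%x\n", st->value) < 0 ? -1 : 0;
    }

    /* Mirror of cgroup_seqfile_show: dispatch through the stored callback. */
    static int do_show(const struct state *st, FILE *out)
    {
        return st->show(st, out);
    }

    int main(void)
    {
        struct state a = { 42, show_decimal };
        struct state b = { 42, show_hex };

        do_show(&a, stdout);
        do_show(&b, stdout);
        return 0;
    }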
2487 | 2485 | ||
2488 | static int cgroup_seqfile_release(struct inode *inode, struct file *file) | 2486 | static int cgroup_seqfile_release(struct inode *inode, struct file *file) |
2489 | { | 2487 | { |
2490 | struct seq_file *seq = file->private_data; | 2488 | struct seq_file *seq = file->private_data; |
2491 | kfree(seq->private); | 2489 | kfree(seq->private); |
2492 | return single_release(inode, file); | 2490 | return single_release(inode, file); |
2493 | } | 2491 | } |
2494 | 2492 | ||
2495 | static const struct file_operations cgroup_seqfile_operations = { | 2493 | static const struct file_operations cgroup_seqfile_operations = { |
2496 | .read = seq_read, | 2494 | .read = seq_read, |
2497 | .write = cgroup_file_write, | 2495 | .write = cgroup_file_write, |
2498 | .llseek = seq_lseek, | 2496 | .llseek = seq_lseek, |
2499 | .release = cgroup_seqfile_release, | 2497 | .release = cgroup_seqfile_release, |
2500 | }; | 2498 | }; |
2501 | 2499 | ||
2502 | static int cgroup_file_open(struct inode *inode, struct file *file) | 2500 | static int cgroup_file_open(struct inode *inode, struct file *file) |
2503 | { | 2501 | { |
2504 | int err; | 2502 | int err; |
2505 | struct cftype *cft; | 2503 | struct cftype *cft; |
2506 | 2504 | ||
2507 | err = generic_file_open(inode, file); | 2505 | err = generic_file_open(inode, file); |
2508 | if (err) | 2506 | if (err) |
2509 | return err; | 2507 | return err; |
2510 | cft = __d_cft(file->f_dentry); | 2508 | cft = __d_cft(file->f_dentry); |
2511 | 2509 | ||
2512 | if (cft->read_map || cft->read_seq_string) { | 2510 | if (cft->read_map || cft->read_seq_string) { |
2513 | struct cgroup_seqfile_state *state = | 2511 | struct cgroup_seqfile_state *state = |
2514 | kzalloc(sizeof(*state), GFP_USER); | 2512 | kzalloc(sizeof(*state), GFP_USER); |
2515 | if (!state) | 2513 | if (!state) |
2516 | return -ENOMEM; | 2514 | return -ENOMEM; |
2517 | state->cft = cft; | 2515 | state->cft = cft; |
2518 | state->cgroup = __d_cgrp(file->f_dentry->d_parent); | 2516 | state->cgroup = __d_cgrp(file->f_dentry->d_parent); |
2519 | file->f_op = &cgroup_seqfile_operations; | 2517 | file->f_op = &cgroup_seqfile_operations; |
2520 | err = single_open(file, cgroup_seqfile_show, state); | 2518 | err = single_open(file, cgroup_seqfile_show, state); |
2521 | if (err < 0) | 2519 | if (err < 0) |
2522 | kfree(state); | 2520 | kfree(state); |
2523 | } else if (cft->open) | 2521 | } else if (cft->open) |
2524 | err = cft->open(inode, file); | 2522 | err = cft->open(inode, file); |
2525 | else | 2523 | else |
2526 | err = 0; | 2524 | err = 0; |
2527 | 2525 | ||
2528 | return err; | 2526 | return err; |
2529 | } | 2527 | } |
2530 | 2528 | ||
2531 | static int cgroup_file_release(struct inode *inode, struct file *file) | 2529 | static int cgroup_file_release(struct inode *inode, struct file *file) |
2532 | { | 2530 | { |
2533 | struct cftype *cft = __d_cft(file->f_dentry); | 2531 | struct cftype *cft = __d_cft(file->f_dentry); |
2534 | if (cft->release) | 2532 | if (cft->release) |
2535 | return cft->release(inode, file); | 2533 | return cft->release(inode, file); |
2536 | return 0; | 2534 | return 0; |
2537 | } | 2535 | } |
2538 | 2536 | ||
2539 | /* | 2537 | /* |
2540 | * cgroup_rename - Only allow simple rename of directories in place. | 2538 | * cgroup_rename - Only allow simple rename of directories in place. |
2541 | */ | 2539 | */ |
2542 | static int cgroup_rename(struct inode *old_dir, struct dentry *old_dentry, | 2540 | static int cgroup_rename(struct inode *old_dir, struct dentry *old_dentry, |
2543 | struct inode *new_dir, struct dentry *new_dentry) | 2541 | struct inode *new_dir, struct dentry *new_dentry) |
2544 | { | 2542 | { |
2545 | if (!S_ISDIR(old_dentry->d_inode->i_mode)) | 2543 | if (!S_ISDIR(old_dentry->d_inode->i_mode)) |
2546 | return -ENOTDIR; | 2544 | return -ENOTDIR; |
2547 | if (new_dentry->d_inode) | 2545 | if (new_dentry->d_inode) |
2548 | return -EEXIST; | 2546 | return -EEXIST; |
2549 | if (old_dir != new_dir) | 2547 | if (old_dir != new_dir) |
2550 | return -EIO; | 2548 | return -EIO; |
2551 | return simple_rename(old_dir, old_dentry, new_dir, new_dentry); | 2549 | return simple_rename(old_dir, old_dentry, new_dir, new_dentry); |
2552 | } | 2550 | } |
2553 | 2551 | ||
2554 | static const struct file_operations cgroup_file_operations = { | 2552 | static const struct file_operations cgroup_file_operations = { |
2555 | .read = cgroup_file_read, | 2553 | .read = cgroup_file_read, |
2556 | .write = cgroup_file_write, | 2554 | .write = cgroup_file_write, |
2557 | .llseek = generic_file_llseek, | 2555 | .llseek = generic_file_llseek, |
2558 | .open = cgroup_file_open, | 2556 | .open = cgroup_file_open, |
2559 | .release = cgroup_file_release, | 2557 | .release = cgroup_file_release, |
2560 | }; | 2558 | }; |
2561 | 2559 | ||
2562 | static const struct inode_operations cgroup_dir_inode_operations = { | 2560 | static const struct inode_operations cgroup_dir_inode_operations = { |
2563 | .lookup = cgroup_lookup, | 2561 | .lookup = cgroup_lookup, |
2564 | .mkdir = cgroup_mkdir, | 2562 | .mkdir = cgroup_mkdir, |
2565 | .rmdir = cgroup_rmdir, | 2563 | .rmdir = cgroup_rmdir, |
2566 | .rename = cgroup_rename, | 2564 | .rename = cgroup_rename, |
2567 | }; | 2565 | }; |
2568 | 2566 | ||
2569 | static struct dentry *cgroup_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd) | 2567 | static struct dentry *cgroup_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd) |
2570 | { | 2568 | { |
2571 | if (dentry->d_name.len > NAME_MAX) | 2569 | if (dentry->d_name.len > NAME_MAX) |
2572 | return ERR_PTR(-ENAMETOOLONG); | 2570 | return ERR_PTR(-ENAMETOOLONG); |
2573 | d_add(dentry, NULL); | 2571 | d_add(dentry, NULL); |
2574 | return NULL; | 2572 | return NULL; |
2575 | } | 2573 | } |
2576 | 2574 | ||
2577 | /* | 2575 | /* |
2578 | * Check if a file is a control file | 2576 | * Check if a file is a control file |
2579 | */ | 2577 | */ |
2580 | static inline struct cftype *__file_cft(struct file *file) | 2578 | static inline struct cftype *__file_cft(struct file *file) |
2581 | { | 2579 | { |
2582 | if (file->f_dentry->d_inode->i_fop != &cgroup_file_operations) | 2580 | if (file->f_dentry->d_inode->i_fop != &cgroup_file_operations) |
2583 | return ERR_PTR(-EINVAL); | 2581 | return ERR_PTR(-EINVAL); |
2584 | return __d_cft(file->f_dentry); | 2582 | return __d_cft(file->f_dentry); |
2585 | } | 2583 | } |
2586 | 2584 | ||
2587 | static int cgroup_create_file(struct dentry *dentry, mode_t mode, | 2585 | static int cgroup_create_file(struct dentry *dentry, mode_t mode, |
2588 | struct super_block *sb) | 2586 | struct super_block *sb) |
2589 | { | 2587 | { |
2590 | struct inode *inode; | 2588 | struct inode *inode; |
2591 | 2589 | ||
2592 | if (!dentry) | 2590 | if (!dentry) |
2593 | return -ENOENT; | 2591 | return -ENOENT; |
2594 | if (dentry->d_inode) | 2592 | if (dentry->d_inode) |
2595 | return -EEXIST; | 2593 | return -EEXIST; |
2596 | 2594 | ||
2597 | inode = cgroup_new_inode(mode, sb); | 2595 | inode = cgroup_new_inode(mode, sb); |
2598 | if (!inode) | 2596 | if (!inode) |
2599 | return -ENOMEM; | 2597 | return -ENOMEM; |
2600 | 2598 | ||
2601 | if (S_ISDIR(mode)) { | 2599 | if (S_ISDIR(mode)) { |
2602 | inode->i_op = &cgroup_dir_inode_operations; | 2600 | inode->i_op = &cgroup_dir_inode_operations; |
2603 | inode->i_fop = &simple_dir_operations; | 2601 | inode->i_fop = &simple_dir_operations; |
2604 | 2602 | ||
2605 | /* start off with i_nlink == 2 (for "." entry) */ | 2603 | /* start off with i_nlink == 2 (for "." entry) */ |
2606 | inc_nlink(inode); | 2604 | inc_nlink(inode); |
2607 | 2605 | ||
2608 | /* start with the directory inode held, so that we can | 2606 | /* start with the directory inode held, so that we can |
2609 | * populate it without racing with another mkdir */ | 2607 | * populate it without racing with another mkdir */ |
2610 | mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD); | 2608 | mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD); |
2611 | } else if (S_ISREG(mode)) { | 2609 | } else if (S_ISREG(mode)) { |
2612 | inode->i_size = 0; | 2610 | inode->i_size = 0; |
2613 | inode->i_fop = &cgroup_file_operations; | 2611 | inode->i_fop = &cgroup_file_operations; |
2614 | } | 2612 | } |
2615 | d_instantiate(dentry, inode); | 2613 | d_instantiate(dentry, inode); |
2616 | dget(dentry); /* Extra count - pin the dentry in core */ | 2614 | dget(dentry); /* Extra count - pin the dentry in core */ |
2617 | return 0; | 2615 | return 0; |
2618 | } | 2616 | } |
2619 | 2617 | ||
2620 | /* | 2618 | /* |
2621 | * cgroup_create_dir - create a directory for an object. | 2619 | * cgroup_create_dir - create a directory for an object. |
2622 | * @cgrp: the cgroup we create the directory for. It must have a valid | 2620 | * @cgrp: the cgroup we create the directory for. It must have a valid |
2623 | * ->parent field. And we are going to fill its ->dentry field. | 2621 | * ->parent field. And we are going to fill its ->dentry field. |
2624 | * @dentry: dentry of the new cgroup | 2622 | * @dentry: dentry of the new cgroup |
2625 | * @mode: mode to set on new directory. | 2623 | * @mode: mode to set on new directory. |
2626 | */ | 2624 | */ |
2627 | static int cgroup_create_dir(struct cgroup *cgrp, struct dentry *dentry, | 2625 | static int cgroup_create_dir(struct cgroup *cgrp, struct dentry *dentry, |
2628 | mode_t mode) | 2626 | mode_t mode) |
2629 | { | 2627 | { |
2630 | struct dentry *parent; | 2628 | struct dentry *parent; |
2631 | int error = 0; | 2629 | int error = 0; |
2632 | 2630 | ||
2633 | parent = cgrp->parent->dentry; | 2631 | parent = cgrp->parent->dentry; |
2634 | error = cgroup_create_file(dentry, S_IFDIR | mode, cgrp->root->sb); | 2632 | error = cgroup_create_file(dentry, S_IFDIR | mode, cgrp->root->sb); |
2635 | if (!error) { | 2633 | if (!error) { |
2636 | dentry->d_fsdata = cgrp; | 2634 | dentry->d_fsdata = cgrp; |
2637 | inc_nlink(parent->d_inode); | 2635 | inc_nlink(parent->d_inode); |
2638 | rcu_assign_pointer(cgrp->dentry, dentry); | 2636 | rcu_assign_pointer(cgrp->dentry, dentry); |
2639 | dget(dentry); | 2637 | dget(dentry); |
2640 | } | 2638 | } |
2641 | dput(dentry); | 2639 | dput(dentry); |
2642 | 2640 | ||
2643 | return error; | 2641 | return error; |
2644 | } | 2642 | } |
2645 | 2643 | ||
2646 | /** | 2644 | /** |
2647 | * cgroup_file_mode - deduce file mode of a control file | 2645 | * cgroup_file_mode - deduce file mode of a control file |
2648 | * @cft: the control file in question | 2646 | * @cft: the control file in question |
2649 | * | 2647 | * |
2650 | * returns cft->mode if ->mode is not 0 | 2648 | * returns cft->mode if ->mode is not 0 |
2651 | * returns S_IRUGO|S_IWUSR if it has both a read and a write handler | 2649 | * returns S_IRUGO|S_IWUSR if it has both a read and a write handler |
2652 | * returns S_IRUGO if it has only a read handler | 2650 | * returns S_IRUGO if it has only a read handler |
2653 | * returns S_IWUSR if it has only a write handler | 2651 | * returns S_IWUSR if it has only a write handler |
2654 | */ | 2652 | */ |
2655 | static mode_t cgroup_file_mode(const struct cftype *cft) | 2653 | static mode_t cgroup_file_mode(const struct cftype *cft) |
2656 | { | 2654 | { |
2657 | mode_t mode = 0; | 2655 | mode_t mode = 0; |
2658 | 2656 | ||
2659 | if (cft->mode) | 2657 | if (cft->mode) |
2660 | return cft->mode; | 2658 | return cft->mode; |
2661 | 2659 | ||
2662 | if (cft->read || cft->read_u64 || cft->read_s64 || | 2660 | if (cft->read || cft->read_u64 || cft->read_s64 || |
2663 | cft->read_map || cft->read_seq_string) | 2661 | cft->read_map || cft->read_seq_string) |
2664 | mode |= S_IRUGO; | 2662 | mode |= S_IRUGO; |
2665 | 2663 | ||
2666 | if (cft->write || cft->write_u64 || cft->write_s64 || | 2664 | if (cft->write || cft->write_u64 || cft->write_s64 || |
2667 | cft->write_string || cft->trigger) | 2665 | cft->write_string || cft->trigger) |
2668 | mode |= S_IWUSR; | 2666 | mode |= S_IWUSR; |
2669 | 2667 | ||
2670 | return mode; | 2668 | return mode; |
2671 | } | 2669 | } |
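cgroup_file_mode derives permissions purely from which handlers a cftype provides. A condensed userspace version of that deduction, with a stand-in handler struct and octal constants in place of S_IRUGO/S_IWUSR (names here are illustrative):

    #include <stdio.h>
    #include <sys/types.h>

    #define MODE_IRUGO 0444    /* world-readable, like S_IRUGO */
    #define MODE_IWUSR 0200    /* owner-writable, like S_IWUSR */

    struct ctl_file {
        int (*read)(void);     /* illustrative handler slots */
        int (*write)(void);
        mode_t mode;           /* explicit override; 0 means "deduce" */
    };

    static mode_t file_mode(const struct ctl_file *cft)
    {
        mode_t mode = 0;

        if (cft->mode)
            return cft->mode;
        if (cft->read)
            mode |= MODE_IRUGO;
        if (cft->write)
            mode |= MODE_IWUSR;
        return mode;
    }

    static int dummy_read(void)  { return 0; }
    static int dummy_write(void) { return 0; }

    int main(void)
    {
        struct ctl_file ro = { .read = dummy_read };
        struct ctl_file rw = { .read = dummy_read, .write = dummy_write };

        printf("read-only:  %o\n", (unsigned)file_mode(&ro));   /* 444 */
        printf("read-write: %o\n", (unsigned)file_mode(&rw));   /* 644 */
        return 0;
    }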
2672 | 2670 | ||
2673 | int cgroup_add_file(struct cgroup *cgrp, | 2671 | int cgroup_add_file(struct cgroup *cgrp, |
2674 | struct cgroup_subsys *subsys, | 2672 | struct cgroup_subsys *subsys, |
2675 | const struct cftype *cft) | 2673 | const struct cftype *cft) |
2676 | { | 2674 | { |
2677 | struct dentry *dir = cgrp->dentry; | 2675 | struct dentry *dir = cgrp->dentry; |
2678 | struct dentry *dentry; | 2676 | struct dentry *dentry; |
2679 | int error; | 2677 | int error; |
2680 | mode_t mode; | 2678 | mode_t mode; |
2681 | 2679 | ||
2682 | char name[MAX_CGROUP_TYPE_NAMELEN + MAX_CFTYPE_NAME + 2] = { 0 }; | 2680 | char name[MAX_CGROUP_TYPE_NAMELEN + MAX_CFTYPE_NAME + 2] = { 0 }; |
2683 | if (subsys && !test_bit(ROOT_NOPREFIX, &cgrp->root->flags)) { | 2681 | if (subsys && !test_bit(ROOT_NOPREFIX, &cgrp->root->flags)) { |
2684 | strcpy(name, subsys->name); | 2682 | strcpy(name, subsys->name); |
2685 | strcat(name, "."); | 2683 | strcat(name, "."); |
2686 | } | 2684 | } |
2687 | strcat(name, cft->name); | 2685 | strcat(name, cft->name); |
2688 | BUG_ON(!mutex_is_locked(&dir->d_inode->i_mutex)); | 2686 | BUG_ON(!mutex_is_locked(&dir->d_inode->i_mutex)); |
2689 | dentry = lookup_one_len(name, dir, strlen(name)); | 2687 | dentry = lookup_one_len(name, dir, strlen(name)); |
2690 | if (!IS_ERR(dentry)) { | 2688 | if (!IS_ERR(dentry)) { |
2691 | mode = cgroup_file_mode(cft); | 2689 | mode = cgroup_file_mode(cft); |
2692 | error = cgroup_create_file(dentry, mode | S_IFREG, | 2690 | error = cgroup_create_file(dentry, mode | S_IFREG, |
2693 | cgrp->root->sb); | 2691 | cgrp->root->sb); |
2694 | if (!error) | 2692 | if (!error) |
2695 | dentry->d_fsdata = (void *)cft; | 2693 | dentry->d_fsdata = (void *)cft; |
2696 | dput(dentry); | 2694 | dput(dentry); |
2697 | } else | 2695 | } else |
2698 | error = PTR_ERR(dentry); | 2696 | error = PTR_ERR(dentry); |
2699 | return error; | 2697 | return error; |
2700 | } | 2698 | } |
2701 | EXPORT_SYMBOL_GPL(cgroup_add_file); | 2699 | EXPORT_SYMBOL_GPL(cgroup_add_file); |
2702 | 2700 | ||
2703 | int cgroup_add_files(struct cgroup *cgrp, | 2701 | int cgroup_add_files(struct cgroup *cgrp, |
2704 | struct cgroup_subsys *subsys, | 2702 | struct cgroup_subsys *subsys, |
2705 | const struct cftype cft[], | 2703 | const struct cftype cft[], |
2706 | int count) | 2704 | int count) |
2707 | { | 2705 | { |
2708 | int i, err; | 2706 | int i, err; |
2709 | for (i = 0; i < count; i++) { | 2707 | for (i = 0; i < count; i++) { |
2710 | err = cgroup_add_file(cgrp, subsys, &cft[i]); | 2708 | err = cgroup_add_file(cgrp, subsys, &cft[i]); |
2711 | if (err) | 2709 | if (err) |
2712 | return err; | 2710 | return err; |
2713 | } | 2711 | } |
2714 | return 0; | 2712 | return 0; |
2715 | } | 2713 | } |
2716 | EXPORT_SYMBOL_GPL(cgroup_add_files); | 2714 | EXPORT_SYMBOL_GPL(cgroup_add_files); |
2717 | 2715 | ||
2718 | /** | 2716 | /** |
2719 | * cgroup_task_count - count the number of tasks in a cgroup. | 2717 | * cgroup_task_count - count the number of tasks in a cgroup. |
2720 | * @cgrp: the cgroup in question | 2718 | * @cgrp: the cgroup in question |
2721 | * | 2719 | * |
2722 | * Return the number of tasks in the cgroup. | 2720 | * Return the number of tasks in the cgroup. |
2723 | */ | 2721 | */ |
2724 | int cgroup_task_count(const struct cgroup *cgrp) | 2722 | int cgroup_task_count(const struct cgroup *cgrp) |
2725 | { | 2723 | { |
2726 | int count = 0; | 2724 | int count = 0; |
2727 | struct cg_cgroup_link *link; | 2725 | struct cg_cgroup_link *link; |
2728 | 2726 | ||
2729 | read_lock(&css_set_lock); | 2727 | read_lock(&css_set_lock); |
2730 | list_for_each_entry(link, &cgrp->css_sets, cgrp_link_list) { | 2728 | list_for_each_entry(link, &cgrp->css_sets, cgrp_link_list) { |
2731 | count += atomic_read(&link->cg->refcount); | 2729 | count += atomic_read(&link->cg->refcount); |
2732 | } | 2730 | } |
2733 | read_unlock(&css_set_lock); | 2731 | read_unlock(&css_set_lock); |
2734 | return count; | 2732 | return count; |
2735 | } | 2733 | } |
2736 | 2734 | ||
2737 | /* | 2735 | /* |
2738 | * Advance a list_head iterator. The iterator should be positioned at | 2736 | * Advance a list_head iterator. The iterator should be positioned at |
2739 | * the start of a css_set | 2737 | * the start of a css_set |
2740 | */ | 2738 | */ |
2741 | static void cgroup_advance_iter(struct cgroup *cgrp, | 2739 | static void cgroup_advance_iter(struct cgroup *cgrp, |
2742 | struct cgroup_iter *it) | 2740 | struct cgroup_iter *it) |
2743 | { | 2741 | { |
2744 | struct list_head *l = it->cg_link; | 2742 | struct list_head *l = it->cg_link; |
2745 | struct cg_cgroup_link *link; | 2743 | struct cg_cgroup_link *link; |
2746 | struct css_set *cg; | 2744 | struct css_set *cg; |
2747 | 2745 | ||
2748 | /* Advance to the next non-empty css_set */ | 2746 | /* Advance to the next non-empty css_set */ |
2749 | do { | 2747 | do { |
2750 | l = l->next; | 2748 | l = l->next; |
2751 | if (l == &cgrp->css_sets) { | 2749 | if (l == &cgrp->css_sets) { |
2752 | it->cg_link = NULL; | 2750 | it->cg_link = NULL; |
2753 | return; | 2751 | return; |
2754 | } | 2752 | } |
2755 | link = list_entry(l, struct cg_cgroup_link, cgrp_link_list); | 2753 | link = list_entry(l, struct cg_cgroup_link, cgrp_link_list); |
2756 | cg = link->cg; | 2754 | cg = link->cg; |
2757 | } while (list_empty(&cg->tasks)); | 2755 | } while (list_empty(&cg->tasks)); |
2758 | it->cg_link = l; | 2756 | it->cg_link = l; |
2759 | it->task = cg->tasks.next; | 2757 | it->task = cg->tasks.next; |
2760 | } | 2758 | } |
2761 | 2759 | ||
2762 | /* | 2760 | /* |
2763 | * To reduce the fork() overhead for systems that are not actually | 2761 | * To reduce the fork() overhead for systems that are not actually |
2764 | * using their cgroups capability, we don't maintain the lists running | 2762 | * using their cgroups capability, we don't maintain the lists running |
2765 | * through each css_set to its tasks until we see the list actually | 2763 | * through each css_set to its tasks until we see the list actually |
2766 | * used - in other words after the first call to cgroup_iter_start(). | 2764 | * used - in other words after the first call to cgroup_iter_start(). |
2767 | * | 2765 | * |
2768 | * The tasklist_lock is not held here, as do_each_thread() and | 2766 | * The tasklist_lock is not held here, as do_each_thread() and |
2769 | * while_each_thread() are protected by RCU. | 2767 | * while_each_thread() are protected by RCU. |
2770 | */ | 2768 | */ |
2771 | static void cgroup_enable_task_cg_lists(void) | 2769 | static void cgroup_enable_task_cg_lists(void) |
2772 | { | 2770 | { |
2773 | struct task_struct *p, *g; | 2771 | struct task_struct *p, *g; |
2774 | write_lock(&css_set_lock); | 2772 | write_lock(&css_set_lock); |
2775 | use_task_css_set_links = 1; | 2773 | use_task_css_set_links = 1; |
2776 | do_each_thread(g, p) { | 2774 | do_each_thread(g, p) { |
2777 | task_lock(p); | 2775 | task_lock(p); |
2778 | /* | 2776 | /* |
2779 | * We should check if the process is exiting, otherwise | 2777 | * We should check if the process is exiting, otherwise |
2780 | * it will race with cgroup_exit() in that the list | 2778 | * it will race with cgroup_exit() in that the list |
2781 | * entry won't be deleted though the process has exited. | 2779 | * entry won't be deleted though the process has exited. |
2782 | */ | 2780 | */ |
2783 | if (!(p->flags & PF_EXITING) && list_empty(&p->cg_list)) | 2781 | if (!(p->flags & PF_EXITING) && list_empty(&p->cg_list)) |
2784 | list_add(&p->cg_list, &p->cgroups->tasks); | 2782 | list_add(&p->cg_list, &p->cgroups->tasks); |
2785 | task_unlock(p); | 2783 | task_unlock(p); |
2786 | } while_each_thread(g, p); | 2784 | } while_each_thread(g, p); |
2787 | write_unlock(&css_set_lock); | 2785 | write_unlock(&css_set_lock); |
2788 | } | 2786 | } |
2789 | 2787 | ||
2790 | void cgroup_iter_start(struct cgroup *cgrp, struct cgroup_iter *it) | 2788 | void cgroup_iter_start(struct cgroup *cgrp, struct cgroup_iter *it) |
2791 | { | 2789 | { |
2792 | /* | 2790 | /* |
2793 | * The first time anyone tries to iterate across a cgroup, | 2791 | * The first time anyone tries to iterate across a cgroup, |
2794 | * we need to enable the list linking each css_set to its | 2792 | * we need to enable the list linking each css_set to its |
2795 | * tasks, and fix up all existing tasks. | 2793 | * tasks, and fix up all existing tasks. |
2796 | */ | 2794 | */ |
2797 | if (!use_task_css_set_links) | 2795 | if (!use_task_css_set_links) |
2798 | cgroup_enable_task_cg_lists(); | 2796 | cgroup_enable_task_cg_lists(); |
2799 | 2797 | ||
2800 | read_lock(&css_set_lock); | 2798 | read_lock(&css_set_lock); |
2801 | it->cg_link = &cgrp->css_sets; | 2799 | it->cg_link = &cgrp->css_sets; |
2802 | cgroup_advance_iter(cgrp, it); | 2800 | cgroup_advance_iter(cgrp, it); |
2803 | } | 2801 | } |
2804 | 2802 | ||
2805 | struct task_struct *cgroup_iter_next(struct cgroup *cgrp, | 2803 | struct task_struct *cgroup_iter_next(struct cgroup *cgrp, |
2806 | struct cgroup_iter *it) | 2804 | struct cgroup_iter *it) |
2807 | { | 2805 | { |
2808 | struct task_struct *res; | 2806 | struct task_struct *res; |
2809 | struct list_head *l = it->task; | 2807 | struct list_head *l = it->task; |
2810 | struct cg_cgroup_link *link; | 2808 | struct cg_cgroup_link *link; |
2811 | 2809 | ||
2812 | /* If the iterator cg is NULL, we have no tasks */ | 2810 | /* If the iterator cg is NULL, we have no tasks */ |
2813 | if (!it->cg_link) | 2811 | if (!it->cg_link) |
2814 | return NULL; | 2812 | return NULL; |
2815 | res = list_entry(l, struct task_struct, cg_list); | 2813 | res = list_entry(l, struct task_struct, cg_list); |
2816 | /* Advance iterator to find next entry */ | 2814 | /* Advance iterator to find next entry */ |
2817 | l = l->next; | 2815 | l = l->next; |
2818 | link = list_entry(it->cg_link, struct cg_cgroup_link, cgrp_link_list); | 2816 | link = list_entry(it->cg_link, struct cg_cgroup_link, cgrp_link_list); |
2819 | if (l == &link->cg->tasks) { | 2817 | if (l == &link->cg->tasks) { |
2820 | /* We reached the end of this task list - move on to | 2818 | /* We reached the end of this task list - move on to |
2821 | * the next cg_cgroup_link */ | 2819 | * the next cg_cgroup_link */ |
2822 | cgroup_advance_iter(cgrp, it); | 2820 | cgroup_advance_iter(cgrp, it); |
2823 | } else { | 2821 | } else { |
2824 | it->task = l; | 2822 | it->task = l; |
2825 | } | 2823 | } |
2826 | return res; | 2824 | return res; |
2827 | } | 2825 | } |
2828 | 2826 | ||
2829 | void cgroup_iter_end(struct cgroup *cgrp, struct cgroup_iter *it) | 2827 | void cgroup_iter_end(struct cgroup *cgrp, struct cgroup_iter *it) |
2830 | { | 2828 | { |
2831 | read_unlock(&css_set_lock); | 2829 | read_unlock(&css_set_lock); |
2832 | } | 2830 | } |
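
For orientation, a typical in-kernel consumer of this iterator brackets its loop with cgroup_iter_start()/cgroup_iter_end(), exactly as pidlist_array_load() and cgroupstats_build() do further down in this file. A minimal sketch (the counting helper itself is hypothetical, not part of this file):

/* Hypothetical helper: count runnable tasks in @cgrp via the iterator API. */
static int count_running_tasks(struct cgroup *cgrp)
{
	struct cgroup_iter it;
	struct task_struct *tsk;
	int nr_running = 0;

	cgroup_iter_start(cgrp, &it);	/* read-locks css_set_lock */
	while ((tsk = cgroup_iter_next(cgrp, &it))) {
		if (tsk->state == TASK_RUNNING)
			nr_running++;
	}
	cgroup_iter_end(cgrp, &it);	/* drops css_set_lock */
	return nr_running;
}
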
2833 | 2831 | ||
2834 | static inline int started_after_time(struct task_struct *t1, | 2832 | static inline int started_after_time(struct task_struct *t1, |
2835 | struct timespec *time, | 2833 | struct timespec *time, |
2836 | struct task_struct *t2) | 2834 | struct task_struct *t2) |
2837 | { | 2835 | { |
2838 | int start_diff = timespec_compare(&t1->start_time, time); | 2836 | int start_diff = timespec_compare(&t1->start_time, time); |
2839 | if (start_diff > 0) { | 2837 | if (start_diff > 0) { |
2840 | return 1; | 2838 | return 1; |
2841 | } else if (start_diff < 0) { | 2839 | } else if (start_diff < 0) { |
2842 | return 0; | 2840 | return 0; |
2843 | } else { | 2841 | } else { |
2844 | /* | 2842 | /* |
2845 | * Arbitrarily, if two processes started at the same | 2843 | * Arbitrarily, if two processes started at the same |
2846 | * time, we'll say that the lower pointer value | 2844 | * time, we'll say that the lower pointer value |
2847 | * started first. Note that t2 may have exited by now | 2845 | * started first. Note that t2 may have exited by now |
2848 | * so this may not be a valid pointer any longer, but | 2846 | * so this may not be a valid pointer any longer, but |
2849 | * that's fine - it still serves to distinguish | 2847 | * that's fine - it still serves to distinguish |
2850 | * between two tasks started (effectively) simultaneously. | 2848 | * between two tasks started (effectively) simultaneously. |
2851 | */ | 2849 | */ |
2852 | return t1 > t2; | 2850 | return t1 > t2; |
2853 | } | 2851 | } |
2854 | } | 2852 | } |
2855 | 2853 | ||
2856 | /* | 2854 | /* |
2857 | * This function is a callback from heap_insert() and is used to order | 2855 | * This function is a callback from heap_insert() and is used to order |
2858 | * the heap. | 2856 | * the heap. |
2859 | * In this case we order the heap in descending task start time. | 2857 | * In this case we order the heap in descending task start time. |
2860 | */ | 2858 | */ |
2861 | static inline int started_after(void *p1, void *p2) | 2859 | static inline int started_after(void *p1, void *p2) |
2862 | { | 2860 | { |
2863 | struct task_struct *t1 = p1; | 2861 | struct task_struct *t1 = p1; |
2864 | struct task_struct *t2 = p2; | 2862 | struct task_struct *t2 = p2; |
2865 | return started_after_time(t1, &t2->start_time, t2); | 2863 | return started_after_time(t1, &t2->start_time, t2); |
2866 | } | 2864 | } |
2867 | 2865 | ||
2868 | /** | 2866 | /** |
2869 | * cgroup_scan_tasks - iterate through all the tasks in a cgroup | 2867 | * cgroup_scan_tasks - iterate through all the tasks in a cgroup |
2870 | * @scan: struct cgroup_scanner containing arguments for the scan | 2868 | * @scan: struct cgroup_scanner containing arguments for the scan |
2871 | * | 2869 | * |
2872 | * Arguments include pointers to callback functions test_task() and | 2870 | * Arguments include pointers to callback functions test_task() and |
2873 | * process_task(). | 2871 | * process_task(). |
2874 | * Iterate through all the tasks in a cgroup, calling test_task() for each, | 2872 | * Iterate through all the tasks in a cgroup, calling test_task() for each, |
2875 | * and if it returns true, call process_task() for it also. | 2873 | * and if it returns true, call process_task() for it also. |
2876 | * The test_task pointer may be NULL, meaning always true (select all tasks). | 2874 | * The test_task pointer may be NULL, meaning always true (select all tasks). |
2877 | * Effectively duplicates cgroup_iter_{start,next,end}() | 2875 | * Effectively duplicates cgroup_iter_{start,next,end}() |
2878 | * but does not lock css_set_lock for the call to process_task(). | 2876 | * but does not lock css_set_lock for the call to process_task(). |
2879 | * The struct cgroup_scanner may be embedded in any structure of the caller's | 2877 | * The struct cgroup_scanner may be embedded in any structure of the caller's |
2880 | * creation. | 2878 | * creation. |
2881 | * It is guaranteed that process_task() will act on every task that | 2879 | * It is guaranteed that process_task() will act on every task that |
2882 | * is a member of the cgroup for the duration of this call. This | 2880 | * is a member of the cgroup for the duration of this call. This |
2883 | * function may or may not call process_task() for tasks that exit | 2881 | * function may or may not call process_task() for tasks that exit |
2884 | * or move to a different cgroup during the call, or are forked or | 2882 | * or move to a different cgroup during the call, or are forked or |
2885 | * move into the cgroup during the call. | 2883 | * move into the cgroup during the call. |
2886 | * | 2884 | * |
2887 | * Note that test_task() may be called with locks held, and may in some | 2885 | * Note that test_task() may be called with locks held, and may in some |
2888 | * situations be called multiple times for the same task, so it should | 2886 | * situations be called multiple times for the same task, so it should |
2889 | * be cheap. | 2887 | * be cheap. |
2890 | * If the heap pointer in the struct cgroup_scanner is non-NULL, a heap has been | 2888 | * If the heap pointer in the struct cgroup_scanner is non-NULL, a heap has been |
2891 | * pre-allocated and will be used for heap operations (and its "gt" member will | 2889 | * pre-allocated and will be used for heap operations (and its "gt" member will |
2892 | * be overwritten), else a temporary heap will be used (allocation of which | 2890 | * be overwritten), else a temporary heap will be used (allocation of which |
2893 | * may cause this function to fail). | 2891 | * may cause this function to fail). |
2894 | */ | 2892 | */ |
2895 | int cgroup_scan_tasks(struct cgroup_scanner *scan) | 2893 | int cgroup_scan_tasks(struct cgroup_scanner *scan) |
2896 | { | 2894 | { |
2897 | int retval, i; | 2895 | int retval, i; |
2898 | struct cgroup_iter it; | 2896 | struct cgroup_iter it; |
2899 | struct task_struct *p, *dropped; | 2897 | struct task_struct *p, *dropped; |
2900 | /* Never dereference latest_task, since it's not refcounted */ | 2898 | /* Never dereference latest_task, since it's not refcounted */ |
2901 | struct task_struct *latest_task = NULL; | 2899 | struct task_struct *latest_task = NULL; |
2902 | struct ptr_heap tmp_heap; | 2900 | struct ptr_heap tmp_heap; |
2903 | struct ptr_heap *heap; | 2901 | struct ptr_heap *heap; |
2904 | struct timespec latest_time = { 0, 0 }; | 2902 | struct timespec latest_time = { 0, 0 }; |
2905 | 2903 | ||
2906 | if (scan->heap) { | 2904 | if (scan->heap) { |
2907 | /* The caller supplied our heap and pre-allocated its memory */ | 2905 | /* The caller supplied our heap and pre-allocated its memory */ |
2908 | heap = scan->heap; | 2906 | heap = scan->heap; |
2909 | heap->gt = &started_after; | 2907 | heap->gt = &started_after; |
2910 | } else { | 2908 | } else { |
2911 | /* We need to allocate our own heap memory */ | 2909 | /* We need to allocate our own heap memory */ |
2912 | heap = &tmp_heap; | 2910 | heap = &tmp_heap; |
2913 | retval = heap_init(heap, PAGE_SIZE, GFP_KERNEL, &started_after); | 2911 | retval = heap_init(heap, PAGE_SIZE, GFP_KERNEL, &started_after); |
2914 | if (retval) | 2912 | if (retval) |
2915 | /* cannot allocate the heap */ | 2913 | /* cannot allocate the heap */ |
2916 | return retval; | 2914 | return retval; |
2917 | } | 2915 | } |
2918 | 2916 | ||
2919 | again: | 2917 | again: |
2920 | /* | 2918 | /* |
2921 | * Scan tasks in the cgroup, using the scanner's "test_task" callback | 2919 | * Scan tasks in the cgroup, using the scanner's "test_task" callback |
2922 | * to determine which are of interest, and using the scanner's | 2920 | * to determine which are of interest, and using the scanner's |
2923 | * "process_task" callback to process any of them that need an update. | 2921 | * "process_task" callback to process any of them that need an update. |
2924 | * Since we don't want to hold any locks during the task updates, | 2922 | * Since we don't want to hold any locks during the task updates, |
2925 | * gather tasks to be processed in a heap structure. | 2923 | * gather tasks to be processed in a heap structure. |
2926 | * The heap is sorted by descending task start time. | 2924 | * The heap is sorted by descending task start time. |
2927 | * If the statically-sized heap fills up, we overflow tasks that | 2925 | * If the statically-sized heap fills up, we overflow tasks that |
2928 | * started later, and in future iterations only consider tasks that | 2926 | * started later, and in future iterations only consider tasks that |
2929 | * started after the latest task in the previous pass. This | 2927 | * started after the latest task in the previous pass. This |
2930 | * guarantees forward progress and that we don't miss any tasks. | 2928 | * guarantees forward progress and that we don't miss any tasks. |
2931 | */ | 2929 | */ |
2932 | heap->size = 0; | 2930 | heap->size = 0; |
2933 | cgroup_iter_start(scan->cg, &it); | 2931 | cgroup_iter_start(scan->cg, &it); |
2934 | while ((p = cgroup_iter_next(scan->cg, &it))) { | 2932 | while ((p = cgroup_iter_next(scan->cg, &it))) { |
2935 | /* | 2933 | /* |
2936 | * Only affect tasks that qualify per the caller's callback, | 2934 | * Only affect tasks that qualify per the caller's callback, |
2937 | * if he provided one | 2935 | * if he provided one |
2938 | */ | 2936 | */ |
2939 | if (scan->test_task && !scan->test_task(p, scan)) | 2937 | if (scan->test_task && !scan->test_task(p, scan)) |
2940 | continue; | 2938 | continue; |
2941 | /* | 2939 | /* |
2942 | * Only process tasks that started after the last task | 2940 | * Only process tasks that started after the last task |
2943 | * we processed | 2941 | * we processed |
2944 | */ | 2942 | */ |
2945 | if (!started_after_time(p, &latest_time, latest_task)) | 2943 | if (!started_after_time(p, &latest_time, latest_task)) |
2946 | continue; | 2944 | continue; |
2947 | dropped = heap_insert(heap, p); | 2945 | dropped = heap_insert(heap, p); |
2948 | if (dropped == NULL) { | 2946 | if (dropped == NULL) { |
2949 | /* | 2947 | /* |
2950 | * The new task was inserted; the heap wasn't | 2948 | * The new task was inserted; the heap wasn't |
2951 | * previously full | 2949 | * previously full |
2952 | */ | 2950 | */ |
2953 | get_task_struct(p); | 2951 | get_task_struct(p); |
2954 | } else if (dropped != p) { | 2952 | } else if (dropped != p) { |
2955 | /* | 2953 | /* |
2956 | * The new task was inserted, and pushed out a | 2954 | * The new task was inserted, and pushed out a |
2957 | * different task | 2955 | * different task |
2958 | */ | 2956 | */ |
2959 | get_task_struct(p); | 2957 | get_task_struct(p); |
2960 | put_task_struct(dropped); | 2958 | put_task_struct(dropped); |
2961 | } | 2959 | } |
2962 | /* | 2960 | /* |
2963 | * Else the new task was newer than anything already in | 2961 | * Else the new task was newer than anything already in |
2964 | * the heap and wasn't inserted | 2962 | * the heap and wasn't inserted |
2965 | */ | 2963 | */ |
2966 | } | 2964 | } |
2967 | cgroup_iter_end(scan->cg, &it); | 2965 | cgroup_iter_end(scan->cg, &it); |
2968 | 2966 | ||
2969 | if (heap->size) { | 2967 | if (heap->size) { |
2970 | for (i = 0; i < heap->size; i++) { | 2968 | for (i = 0; i < heap->size; i++) { |
2971 | struct task_struct *q = heap->ptrs[i]; | 2969 | struct task_struct *q = heap->ptrs[i]; |
2972 | if (i == 0) { | 2970 | if (i == 0) { |
2973 | latest_time = q->start_time; | 2971 | latest_time = q->start_time; |
2974 | latest_task = q; | 2972 | latest_task = q; |
2975 | } | 2973 | } |
2976 | /* Process the task per the caller's callback */ | 2974 | /* Process the task per the caller's callback */ |
2977 | scan->process_task(q, scan); | 2975 | scan->process_task(q, scan); |
2978 | put_task_struct(q); | 2976 | put_task_struct(q); |
2979 | } | 2977 | } |
2980 | /* | 2978 | /* |
2981 | * If we had to process any tasks at all, scan again | 2979 | * If we had to process any tasks at all, scan again |
2982 | * in case some of them were in the middle of forking | 2980 | * in case some of them were in the middle of forking |
2983 | * children that didn't get processed. | 2981 | * children that didn't get processed. |
2984 | * Not the most efficient way to do it, but it avoids | 2982 | * Not the most efficient way to do it, but it avoids |
2985 | * having to take callback_mutex in the fork path | 2983 | * having to take callback_mutex in the fork path |
2986 | */ | 2984 | */ |
2987 | goto again; | 2985 | goto again; |
2988 | } | 2986 | } |
2989 | if (heap == &tmp_heap) | 2987 | if (heap == &tmp_heap) |
2990 | heap_free(&tmp_heap); | 2988 | heap_free(&tmp_heap); |
2991 | return 0; | 2989 | return 0; |
2992 | } | 2990 | } |
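
Tying the kernel-doc above together: a caller fills in a struct cgroup_scanner and hands it to cgroup_scan_tasks(). The callbacks below are hypothetical illustrations; only the field names (cg, test_task, process_task, heap) come from the scanner as used above, and passing heap = NULL makes cgroup_scan_tasks() allocate a temporary heap itself.

/* Hypothetical scanner: lower the priority of every user task in @cgrp. */
static int skip_kthreads(struct task_struct *p, struct cgroup_scanner *scan)
{
	return !(p->flags & PF_KTHREAD);	/* select only user tasks */
}

static void renice_task(struct task_struct *p, struct cgroup_scanner *scan)
{
	set_user_nice(p, 10);	/* runs without css_set_lock held */
}

static int renice_cgroup(struct cgroup *cgrp)
{
	struct cgroup_scanner scan = {
		.cg		= cgrp,
		.test_task	= skip_kthreads,
		.process_task	= renice_task,
		.heap		= NULL,	/* let cgroup_scan_tasks() allocate one */
	};

	return cgroup_scan_tasks(&scan);
}
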
2993 | 2991 | ||
2994 | /* | 2992 | /* |
2995 | * Stuff for reading the 'tasks'/'procs' files. | 2993 | * Stuff for reading the 'tasks'/'procs' files. |
2996 | * | 2994 | * |
2997 | * Reading this file can return large amounts of data if a cgroup has | 2995 | * Reading this file can return large amounts of data if a cgroup has |
2998 | * *lots* of attached tasks. So it may need several calls to read(), | 2996 | * *lots* of attached tasks. So it may need several calls to read(), |
2999 | * but we cannot guarantee that the information we produce is correct | 2997 | * but we cannot guarantee that the information we produce is correct |
3000 | * unless we produce it entirely atomically. | 2998 | * unless we produce it entirely atomically. |
3001 | * | 2999 | * |
3002 | */ | 3000 | */ |
3003 | 3001 | ||
3004 | /* | 3002 | /* |
3005 | * The following two functions "fix" the issue where there are more pids | 3003 | * The following two functions "fix" the issue where there are more pids |
3006 | * than kmalloc will give memory for; in such cases, we use vmalloc/vfree. | 3004 | * than kmalloc will give memory for; in such cases, we use vmalloc/vfree. |
3007 | * TODO: replace with a kernel-wide solution to this problem | 3005 | * TODO: replace with a kernel-wide solution to this problem |
3008 | */ | 3006 | */ |
3009 | #define PIDLIST_TOO_LARGE(c) ((c) * sizeof(pid_t) > (PAGE_SIZE * 2)) | 3007 | #define PIDLIST_TOO_LARGE(c) ((c) * sizeof(pid_t) > (PAGE_SIZE * 2)) |
3010 | static void *pidlist_allocate(int count) | 3008 | static void *pidlist_allocate(int count) |
3011 | { | 3009 | { |
3012 | if (PIDLIST_TOO_LARGE(count)) | 3010 | if (PIDLIST_TOO_LARGE(count)) |
3013 | return vmalloc(count * sizeof(pid_t)); | 3011 | return vmalloc(count * sizeof(pid_t)); |
3014 | else | 3012 | else |
3015 | return kmalloc(count * sizeof(pid_t), GFP_KERNEL); | 3013 | return kmalloc(count * sizeof(pid_t), GFP_KERNEL); |
3016 | } | 3014 | } |
3017 | static void pidlist_free(void *p) | 3015 | static void pidlist_free(void *p) |
3018 | { | 3016 | { |
3019 | if (is_vmalloc_addr(p)) | 3017 | if (is_vmalloc_addr(p)) |
3020 | vfree(p); | 3018 | vfree(p); |
3021 | else | 3019 | else |
3022 | kfree(p); | 3020 | kfree(p); |
3023 | } | 3021 | } |
3024 | static void *pidlist_resize(void *p, int newcount) | 3022 | static void *pidlist_resize(void *p, int newcount) |
3025 | { | 3023 | { |
3026 | void *newlist; | 3024 | void *newlist; |
3027 | /* note: if new alloc fails, old p will still be valid either way */ | 3025 | /* note: if new alloc fails, old p will still be valid either way */ |
3028 | if (is_vmalloc_addr(p)) { | 3026 | if (is_vmalloc_addr(p)) { |
3029 | newlist = vmalloc(newcount * sizeof(pid_t)); | 3027 | newlist = vmalloc(newcount * sizeof(pid_t)); |
3030 | if (!newlist) | 3028 | if (!newlist) |
3031 | return NULL; | 3029 | return NULL; |
3032 | memcpy(newlist, p, newcount * sizeof(pid_t)); | 3030 | memcpy(newlist, p, newcount * sizeof(pid_t)); |
3033 | vfree(p); | 3031 | vfree(p); |
3034 | } else { | 3032 | } else { |
3035 | newlist = krealloc(p, newcount * sizeof(pid_t), GFP_KERNEL); | 3033 | newlist = krealloc(p, newcount * sizeof(pid_t), GFP_KERNEL); |
3036 | } | 3034 | } |
3037 | return newlist; | 3035 | return newlist; |
3038 | } | 3036 | } |
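
As a worked example of the threshold above: on a typical configuration with 4 KiB pages and a 4-byte pid_t, PIDLIST_TOO_LARGE(count) reduces to count * 4 > 8192, so pidlist_allocate() switches from kmalloc() to vmalloc() once a list would hold more than 2048 pids, and pidlist_free()/pidlist_resize() use is_vmalloc_addr() to stay consistent with whichever allocator was chosen.
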
3039 | 3037 | ||
3040 | /* | 3038 | /* |
3041 | * pidlist_uniq - given a kmalloc()ed list, strip out all duplicate entries | 3039 | * pidlist_uniq - given a kmalloc()ed list, strip out all duplicate entries |
3042 | * If the new stripped list is sufficiently smaller and there's enough memory | 3040 | * If the new stripped list is sufficiently smaller and there's enough memory |
3043 | * to allocate a new buffer, will let go of the unneeded memory. Returns the | 3041 | * to allocate a new buffer, will let go of the unneeded memory. Returns the |
3044 | * number of unique elements. | 3042 | * number of unique elements. |
3045 | */ | 3043 | */ |
3046 | /* is the size difference enough that we should re-allocate the array? */ | 3044 | /* is the size difference enough that we should re-allocate the array? */ |
3047 | #define PIDLIST_REALLOC_DIFFERENCE(old, new) ((old) - PAGE_SIZE >= (new)) | 3045 | #define PIDLIST_REALLOC_DIFFERENCE(old, new) ((old) - PAGE_SIZE >= (new)) |
3048 | static int pidlist_uniq(pid_t **p, int length) | 3046 | static int pidlist_uniq(pid_t **p, int length) |
3049 | { | 3047 | { |
3050 | int src, dest = 1; | 3048 | int src, dest = 1; |
3051 | pid_t *list = *p; | 3049 | pid_t *list = *p; |
3052 | pid_t *newlist; | 3050 | pid_t *newlist; |
3053 | 3051 | ||
3054 | /* | 3052 | /* |
3055 | * we presume the 0th element is unique, so i starts at 1. trivial | 3053 | * we presume the 0th element is unique, so i starts at 1. trivial |
3056 | * edge cases first; no work needs to be done for either | 3054 | * edge cases first; no work needs to be done for either |
3057 | */ | 3055 | */ |
3058 | if (length == 0 || length == 1) | 3056 | if (length == 0 || length == 1) |
3059 | return length; | 3057 | return length; |
3060 | /* src and dest walk down the list; dest counts unique elements */ | 3058 | /* src and dest walk down the list; dest counts unique elements */ |
3061 | for (src = 1; src < length; src++) { | 3059 | for (src = 1; src < length; src++) { |
3062 | /* find next unique element */ | 3060 | /* find next unique element */ |
3063 | while (list[src] == list[src-1]) { | 3061 | while (list[src] == list[src-1]) { |
3064 | src++; | 3062 | src++; |
3065 | if (src == length) | 3063 | if (src == length) |
3066 | goto after; | 3064 | goto after; |
3067 | } | 3065 | } |
3068 | /* dest always points to where the next unique element goes */ | 3066 | /* dest always points to where the next unique element goes */ |
3069 | list[dest] = list[src]; | 3067 | list[dest] = list[src]; |
3070 | dest++; | 3068 | dest++; |
3071 | } | 3069 | } |
3072 | after: | 3070 | after: |
3073 | /* | 3071 | /* |
3074 | * if the length difference is large enough, we want to allocate a | 3072 | * if the length difference is large enough, we want to allocate a |
3075 | * smaller buffer to save memory. if this fails due to out of memory, | 3073 | * smaller buffer to save memory. if this fails due to out of memory, |
3076 | * we'll just stay with what we've got. | 3074 | * we'll just stay with what we've got. |
3077 | */ | 3075 | */ |
3078 | if (PIDLIST_REALLOC_DIFFERENCE(length, dest)) { | 3076 | if (PIDLIST_REALLOC_DIFFERENCE(length, dest)) { |
3079 | newlist = pidlist_resize(list, dest); | 3077 | newlist = pidlist_resize(list, dest); |
3080 | if (newlist) | 3078 | if (newlist) |
3081 | *p = newlist; | 3079 | *p = newlist; |
3082 | } | 3080 | } |
3083 | return dest; | 3081 | return dest; |
3084 | } | 3082 | } |
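
The src/dest walk above is the classic in-place de-duplication of a sorted array. A self-contained userspace illustration of the same technique (plain C, not kernel code, equivalent behaviour on sorted input):

#include <stdio.h>

/* Same idea as pidlist_uniq(): src scans, dest receives unique elements. */
static int uniq(int *list, int length)
{
	int src, dest = 1;

	if (length < 2)
		return length;
	for (src = 1; src < length; src++) {
		if (list[src] == list[src - 1])
			continue;		/* duplicate of the previous element */
		list[dest++] = list[src];	/* dest is the next free unique slot */
	}
	return dest;
}

int main(void)
{
	int pids[] = { 3, 3, 7, 7, 7, 12, 40, 40 };
	int i, n = uniq(pids, 8);

	for (i = 0; i < n; i++)
		printf("%d\n", pids[i]);	/* prints 3, 7, 12, 40 */
	return 0;
}
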
3085 | 3083 | ||
3086 | static int cmppid(const void *a, const void *b) | 3084 | static int cmppid(const void *a, const void *b) |
3087 | { | 3085 | { |
3088 | return *(pid_t *)a - *(pid_t *)b; | 3086 | return *(pid_t *)a - *(pid_t *)b; |
3089 | } | 3087 | } |
3090 | 3088 | ||
3091 | /* | 3089 | /* |
3092 | * find the appropriate pidlist for our purpose (given procs vs tasks) | 3090 | * find the appropriate pidlist for our purpose (given procs vs tasks) |
3093 | * returns with the lock on that pidlist already held, and takes care | 3091 | * returns with the lock on that pidlist already held, and takes care |
3094 | * of the use count, or returns NULL with no locks held if we're out of | 3092 | * of the use count, or returns NULL with no locks held if we're out of |
3095 | * memory. | 3093 | * memory. |
3096 | */ | 3094 | */ |
3097 | static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp, | 3095 | static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp, |
3098 | enum cgroup_filetype type) | 3096 | enum cgroup_filetype type) |
3099 | { | 3097 | { |
3100 | struct cgroup_pidlist *l; | 3098 | struct cgroup_pidlist *l; |
3101 | /* don't need task_nsproxy() if we're looking at ourself */ | 3099 | /* don't need task_nsproxy() if we're looking at ourself */ |
3102 | struct pid_namespace *ns = current->nsproxy->pid_ns; | 3100 | struct pid_namespace *ns = current->nsproxy->pid_ns; |
3103 | 3101 | ||
3104 | /* | 3102 | /* |
3105 | * We can't drop the pidlist_mutex before taking the l->mutex in case | 3103 | * We can't drop the pidlist_mutex before taking the l->mutex in case |
3106 | * the last ref-holder is trying to remove l from the list at the same | 3104 | * the last ref-holder is trying to remove l from the list at the same |
3107 | * time. Holding the pidlist_mutex precludes somebody taking whichever | 3105 | * time. Holding the pidlist_mutex precludes somebody taking whichever |
3108 | * list we find out from under us - compare cgroup_release_pid_array(). | 3106 | * list we find out from under us - compare cgroup_release_pid_array(). |
3109 | */ | 3107 | */ |
3110 | mutex_lock(&cgrp->pidlist_mutex); | 3108 | mutex_lock(&cgrp->pidlist_mutex); |
3111 | list_for_each_entry(l, &cgrp->pidlists, links) { | 3109 | list_for_each_entry(l, &cgrp->pidlists, links) { |
3112 | if (l->key.type == type && l->key.ns == ns) { | 3110 | if (l->key.type == type && l->key.ns == ns) { |
3113 | /* make sure l doesn't vanish out from under us */ | 3111 | /* make sure l doesn't vanish out from under us */ |
3114 | down_write(&l->mutex); | 3112 | down_write(&l->mutex); |
3115 | mutex_unlock(&cgrp->pidlist_mutex); | 3113 | mutex_unlock(&cgrp->pidlist_mutex); |
3116 | return l; | 3114 | return l; |
3117 | } | 3115 | } |
3118 | } | 3116 | } |
3119 | /* entry not found; create a new one */ | 3117 | /* entry not found; create a new one */ |
3120 | l = kmalloc(sizeof(struct cgroup_pidlist), GFP_KERNEL); | 3118 | l = kmalloc(sizeof(struct cgroup_pidlist), GFP_KERNEL); |
3121 | if (!l) { | 3119 | if (!l) { |
3122 | mutex_unlock(&cgrp->pidlist_mutex); | 3120 | mutex_unlock(&cgrp->pidlist_mutex); |
3123 | return l; | 3121 | return l; |
3124 | } | 3122 | } |
3125 | init_rwsem(&l->mutex); | 3123 | init_rwsem(&l->mutex); |
3126 | down_write(&l->mutex); | 3124 | down_write(&l->mutex); |
3127 | l->key.type = type; | 3125 | l->key.type = type; |
3128 | l->key.ns = get_pid_ns(ns); | 3126 | l->key.ns = get_pid_ns(ns); |
3129 | l->use_count = 0; /* don't increment here */ | 3127 | l->use_count = 0; /* don't increment here */ |
3130 | l->list = NULL; | 3128 | l->list = NULL; |
3131 | l->owner = cgrp; | 3129 | l->owner = cgrp; |
3132 | list_add(&l->links, &cgrp->pidlists); | 3130 | list_add(&l->links, &cgrp->pidlists); |
3133 | mutex_unlock(&cgrp->pidlist_mutex); | 3131 | mutex_unlock(&cgrp->pidlist_mutex); |
3134 | return l; | 3132 | return l; |
3135 | } | 3133 | } |
3136 | 3134 | ||
3137 | /* | 3135 | /* |
3138 | * Load a cgroup's pidarray with either procs' tgids or tasks' pids | 3136 | * Load a cgroup's pidarray with either procs' tgids or tasks' pids |
3139 | */ | 3137 | */ |
3140 | static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type, | 3138 | static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type, |
3141 | struct cgroup_pidlist **lp) | 3139 | struct cgroup_pidlist **lp) |
3142 | { | 3140 | { |
3143 | pid_t *array; | 3141 | pid_t *array; |
3144 | int length; | 3142 | int length; |
3145 | int pid, n = 0; /* used for populating the array */ | 3143 | int pid, n = 0; /* used for populating the array */ |
3146 | struct cgroup_iter it; | 3144 | struct cgroup_iter it; |
3147 | struct task_struct *tsk; | 3145 | struct task_struct *tsk; |
3148 | struct cgroup_pidlist *l; | 3146 | struct cgroup_pidlist *l; |
3149 | 3147 | ||
3150 | /* | 3148 | /* |
3151 | * If cgroup gets more users after we read count, we won't have | 3149 | * If cgroup gets more users after we read count, we won't have |
3152 | * enough space - tough. This race is indistinguishable to the | 3150 | * enough space - tough. This race is indistinguishable to the |
3153 | * caller from the case that the additional cgroup users didn't | 3151 | * caller from the case that the additional cgroup users didn't |
3154 | * show up until sometime later on. | 3152 | * show up until sometime later on. |
3155 | */ | 3153 | */ |
3156 | length = cgroup_task_count(cgrp); | 3154 | length = cgroup_task_count(cgrp); |
3157 | array = pidlist_allocate(length); | 3155 | array = pidlist_allocate(length); |
3158 | if (!array) | 3156 | if (!array) |
3159 | return -ENOMEM; | 3157 | return -ENOMEM; |
3160 | /* now, populate the array */ | 3158 | /* now, populate the array */ |
3161 | cgroup_iter_start(cgrp, &it); | 3159 | cgroup_iter_start(cgrp, &it); |
3162 | while ((tsk = cgroup_iter_next(cgrp, &it))) { | 3160 | while ((tsk = cgroup_iter_next(cgrp, &it))) { |
3163 | if (unlikely(n == length)) | 3161 | if (unlikely(n == length)) |
3164 | break; | 3162 | break; |
3165 | /* get tgid or pid for procs or tasks file respectively */ | 3163 | /* get tgid or pid for procs or tasks file respectively */ |
3166 | if (type == CGROUP_FILE_PROCS) | 3164 | if (type == CGROUP_FILE_PROCS) |
3167 | pid = task_tgid_vnr(tsk); | 3165 | pid = task_tgid_vnr(tsk); |
3168 | else | 3166 | else |
3169 | pid = task_pid_vnr(tsk); | 3167 | pid = task_pid_vnr(tsk); |
3170 | if (pid > 0) /* make sure to only use valid results */ | 3168 | if (pid > 0) /* make sure to only use valid results */ |
3171 | array[n++] = pid; | 3169 | array[n++] = pid; |
3172 | } | 3170 | } |
3173 | cgroup_iter_end(cgrp, &it); | 3171 | cgroup_iter_end(cgrp, &it); |
3174 | length = n; | 3172 | length = n; |
3175 | /* now sort & (if procs) strip out duplicates */ | 3173 | /* now sort & (if procs) strip out duplicates */ |
3176 | sort(array, length, sizeof(pid_t), cmppid, NULL); | 3174 | sort(array, length, sizeof(pid_t), cmppid, NULL); |
3177 | if (type == CGROUP_FILE_PROCS) | 3175 | if (type == CGROUP_FILE_PROCS) |
3178 | length = pidlist_uniq(&array, length); | 3176 | length = pidlist_uniq(&array, length); |
3179 | l = cgroup_pidlist_find(cgrp, type); | 3177 | l = cgroup_pidlist_find(cgrp, type); |
3180 | if (!l) { | 3178 | if (!l) { |
3181 | pidlist_free(array); | 3179 | pidlist_free(array); |
3182 | return -ENOMEM; | 3180 | return -ENOMEM; |
3183 | } | 3181 | } |
3184 | /* store array, freeing old if necessary - lock already held */ | 3182 | /* store array, freeing old if necessary - lock already held */ |
3185 | pidlist_free(l->list); | 3183 | pidlist_free(l->list); |
3186 | l->list = array; | 3184 | l->list = array; |
3187 | l->length = length; | 3185 | l->length = length; |
3188 | l->use_count++; | 3186 | l->use_count++; |
3189 | up_write(&l->mutex); | 3187 | up_write(&l->mutex); |
3190 | *lp = l; | 3188 | *lp = l; |
3191 | return 0; | 3189 | return 0; |
3192 | } | 3190 | } |
3193 | 3191 | ||
3194 | /** | 3192 | /** |
3195 | * cgroupstats_build - build and fill cgroupstats | 3193 | * cgroupstats_build - build and fill cgroupstats |
3196 | * @stats: cgroupstats to fill information into | 3194 | * @stats: cgroupstats to fill information into |
3197 | * @dentry: A dentry entry belonging to the cgroup for which stats have | 3195 | * @dentry: A dentry entry belonging to the cgroup for which stats have |
3198 | * been requested. | 3196 | * been requested. |
3199 | * | 3197 | * |
3200 | * Build and fill cgroupstats so that taskstats can export it to user | 3198 | * Build and fill cgroupstats so that taskstats can export it to user |
3201 | * space. | 3199 | * space. |
3202 | */ | 3200 | */ |
3203 | int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry) | 3201 | int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry) |
3204 | { | 3202 | { |
3205 | int ret = -EINVAL; | 3203 | int ret = -EINVAL; |
3206 | struct cgroup *cgrp; | 3204 | struct cgroup *cgrp; |
3207 | struct cgroup_iter it; | 3205 | struct cgroup_iter it; |
3208 | struct task_struct *tsk; | 3206 | struct task_struct *tsk; |
3209 | 3207 | ||
3210 | /* | 3208 | /* |
3211 | * Validate dentry by checking the superblock operations, | 3209 | * Validate dentry by checking the superblock operations, |
3212 | * and make sure it's a directory. | 3210 | * and make sure it's a directory. |
3213 | */ | 3211 | */ |
3214 | if (dentry->d_sb->s_op != &cgroup_ops || | 3212 | if (dentry->d_sb->s_op != &cgroup_ops || |
3215 | !S_ISDIR(dentry->d_inode->i_mode)) | 3213 | !S_ISDIR(dentry->d_inode->i_mode)) |
3216 | goto err; | 3214 | goto err; |
3217 | 3215 | ||
3218 | ret = 0; | 3216 | ret = 0; |
3219 | cgrp = dentry->d_fsdata; | 3217 | cgrp = dentry->d_fsdata; |
3220 | 3218 | ||
3221 | cgroup_iter_start(cgrp, &it); | 3219 | cgroup_iter_start(cgrp, &it); |
3222 | while ((tsk = cgroup_iter_next(cgrp, &it))) { | 3220 | while ((tsk = cgroup_iter_next(cgrp, &it))) { |
3223 | switch (tsk->state) { | 3221 | switch (tsk->state) { |
3224 | case TASK_RUNNING: | 3222 | case TASK_RUNNING: |
3225 | stats->nr_running++; | 3223 | stats->nr_running++; |
3226 | break; | 3224 | break; |
3227 | case TASK_INTERRUPTIBLE: | 3225 | case TASK_INTERRUPTIBLE: |
3228 | stats->nr_sleeping++; | 3226 | stats->nr_sleeping++; |
3229 | break; | 3227 | break; |
3230 | case TASK_UNINTERRUPTIBLE: | 3228 | case TASK_UNINTERRUPTIBLE: |
3231 | stats->nr_uninterruptible++; | 3229 | stats->nr_uninterruptible++; |
3232 | break; | 3230 | break; |
3233 | case TASK_STOPPED: | 3231 | case TASK_STOPPED: |
3234 | stats->nr_stopped++; | 3232 | stats->nr_stopped++; |
3235 | break; | 3233 | break; |
3236 | default: | 3234 | default: |
3237 | if (delayacct_is_task_waiting_on_io(tsk)) | 3235 | if (delayacct_is_task_waiting_on_io(tsk)) |
3238 | stats->nr_io_wait++; | 3236 | stats->nr_io_wait++; |
3239 | break; | 3237 | break; |
3240 | } | 3238 | } |
3241 | } | 3239 | } |
3242 | cgroup_iter_end(cgrp, &it); | 3240 | cgroup_iter_end(cgrp, &it); |
3243 | 3241 | ||
3244 | err: | 3242 | err: |
3245 | return ret; | 3243 | return ret; |
3246 | } | 3244 | } |
3247 | 3245 | ||
3248 | 3246 | ||
3249 | /* | 3247 | /* |
3250 | * seq_file methods for the tasks/procs files. The seq_file position is the | 3248 | * seq_file methods for the tasks/procs files. The seq_file position is the |
3251 | * next pid to display; the seq_file iterator is a pointer to the pid | 3249 | * next pid to display; the seq_file iterator is a pointer to the pid |
3252 | * in the cgroup->l->list array. | 3250 | * in the cgroup->l->list array. |
3253 | */ | 3251 | */ |
3254 | 3252 | ||
3255 | static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos) | 3253 | static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos) |
3256 | { | 3254 | { |
3257 | /* | 3255 | /* |
3258 | * Initially we receive a position value that corresponds to | 3256 | * Initially we receive a position value that corresponds to |
3259 | * one more than the last pid shown (or 0 on the first call or | 3257 | * one more than the last pid shown (or 0 on the first call or |
3260 | * after a seek to the start). Use a binary-search to find the | 3258 | * after a seek to the start). Use a binary-search to find the |
3261 | * next pid to display, if any | 3259 | * next pid to display, if any |
3262 | */ | 3260 | */ |
3263 | struct cgroup_pidlist *l = s->private; | 3261 | struct cgroup_pidlist *l = s->private; |
3264 | int index = 0, pid = *pos; | 3262 | int index = 0, pid = *pos; |
3265 | int *iter; | 3263 | int *iter; |
3266 | 3264 | ||
3267 | down_read(&l->mutex); | 3265 | down_read(&l->mutex); |
3268 | if (pid) { | 3266 | if (pid) { |
3269 | int end = l->length; | 3267 | int end = l->length; |
3270 | 3268 | ||
3271 | while (index < end) { | 3269 | while (index < end) { |
3272 | int mid = (index + end) / 2; | 3270 | int mid = (index + end) / 2; |
3273 | if (l->list[mid] == pid) { | 3271 | if (l->list[mid] == pid) { |
3274 | index = mid; | 3272 | index = mid; |
3275 | break; | 3273 | break; |
3276 | } else if (l->list[mid] <= pid) | 3274 | } else if (l->list[mid] <= pid) |
3277 | index = mid + 1; | 3275 | index = mid + 1; |
3278 | else | 3276 | else |
3279 | end = mid; | 3277 | end = mid; |
3280 | } | 3278 | } |
3281 | } | 3279 | } |
3282 | /* If we're off the end of the array, we're done */ | 3280 | /* If we're off the end of the array, we're done */ |
3283 | if (index >= l->length) | 3281 | if (index >= l->length) |
3284 | return NULL; | 3282 | return NULL; |
3285 | /* Update the abstract position to be the actual pid that we found */ | 3283 | /* Update the abstract position to be the actual pid that we found */ |
3286 | iter = l->list + index; | 3284 | iter = l->list + index; |
3287 | *pos = *iter; | 3285 | *pos = *iter; |
3288 | return iter; | 3286 | return iter; |
3289 | } | 3287 | } |
3290 | 3288 | ||
3291 | static void cgroup_pidlist_stop(struct seq_file *s, void *v) | 3289 | static void cgroup_pidlist_stop(struct seq_file *s, void *v) |
3292 | { | 3290 | { |
3293 | struct cgroup_pidlist *l = s->private; | 3291 | struct cgroup_pidlist *l = s->private; |
3294 | up_read(&l->mutex); | 3292 | up_read(&l->mutex); |
3295 | } | 3293 | } |
3296 | 3294 | ||
3297 | static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos) | 3295 | static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos) |
3298 | { | 3296 | { |
3299 | struct cgroup_pidlist *l = s->private; | 3297 | struct cgroup_pidlist *l = s->private; |
3300 | pid_t *p = v; | 3298 | pid_t *p = v; |
3301 | pid_t *end = l->list + l->length; | 3299 | pid_t *end = l->list + l->length; |
3302 | /* | 3300 | /* |
3303 | * Advance to the next pid in the array. If this goes off the | 3301 | * Advance to the next pid in the array. If this goes off the |
3304 | * end, we're done | 3302 | * end, we're done |
3305 | */ | 3303 | */ |
3306 | p++; | 3304 | p++; |
3307 | if (p >= end) { | 3305 | if (p >= end) { |
3308 | return NULL; | 3306 | return NULL; |
3309 | } else { | 3307 | } else { |
3310 | *pos = *p; | 3308 | *pos = *p; |
3311 | return p; | 3309 | return p; |
3312 | } | 3310 | } |
3313 | } | 3311 | } |
3314 | 3312 | ||
3315 | static int cgroup_pidlist_show(struct seq_file *s, void *v) | 3313 | static int cgroup_pidlist_show(struct seq_file *s, void *v) |
3316 | { | 3314 | { |
3317 | return seq_printf(s, "%d\n", *(int *)v); | 3315 | return seq_printf(s, "%d\n", *(int *)v); |
3318 | } | 3316 | } |
3319 | 3317 | ||
3320 | /* | 3318 | /* |
3321 | * seq_operations functions for iterating on pidlists through seq_file - | 3319 | * seq_operations functions for iterating on pidlists through seq_file - |
3322 | * independent of whether it's tasks or procs | 3320 | * independent of whether it's tasks or procs |
3323 | */ | 3321 | */ |
3324 | static const struct seq_operations cgroup_pidlist_seq_operations = { | 3322 | static const struct seq_operations cgroup_pidlist_seq_operations = { |
3325 | .start = cgroup_pidlist_start, | 3323 | .start = cgroup_pidlist_start, |
3326 | .stop = cgroup_pidlist_stop, | 3324 | .stop = cgroup_pidlist_stop, |
3327 | .next = cgroup_pidlist_next, | 3325 | .next = cgroup_pidlist_next, |
3328 | .show = cgroup_pidlist_show, | 3326 | .show = cgroup_pidlist_show, |
3329 | }; | 3327 | }; |
3330 | 3328 | ||
3331 | static void cgroup_release_pid_array(struct cgroup_pidlist *l) | 3329 | static void cgroup_release_pid_array(struct cgroup_pidlist *l) |
3332 | { | 3330 | { |
3333 | /* | 3331 | /* |
3334 | * the case where we're the last user of this particular pidlist will | 3332 | * the case where we're the last user of this particular pidlist will |
3335 | * have us remove it from the cgroup's list, which entails taking the | 3333 | * have us remove it from the cgroup's list, which entails taking the |
3336 | * mutex. since in pidlist_find the pidlist->lock depends on cgroup-> | 3334 | * mutex. since in pidlist_find the pidlist->lock depends on cgroup-> |
3337 | * pidlist_mutex, we have to take pidlist_mutex first. | 3335 | * pidlist_mutex, we have to take pidlist_mutex first. |
3338 | */ | 3336 | */ |
3339 | mutex_lock(&l->owner->pidlist_mutex); | 3337 | mutex_lock(&l->owner->pidlist_mutex); |
3340 | down_write(&l->mutex); | 3338 | down_write(&l->mutex); |
3341 | BUG_ON(!l->use_count); | 3339 | BUG_ON(!l->use_count); |
3342 | if (!--l->use_count) { | 3340 | if (!--l->use_count) { |
3343 | /* we're the last user if refcount is 0; remove and free */ | 3341 | /* we're the last user if refcount is 0; remove and free */ |
3344 | list_del(&l->links); | 3342 | list_del(&l->links); |
3345 | mutex_unlock(&l->owner->pidlist_mutex); | 3343 | mutex_unlock(&l->owner->pidlist_mutex); |
3346 | pidlist_free(l->list); | 3344 | pidlist_free(l->list); |
3347 | put_pid_ns(l->key.ns); | 3345 | put_pid_ns(l->key.ns); |
3348 | up_write(&l->mutex); | 3346 | up_write(&l->mutex); |
3349 | kfree(l); | 3347 | kfree(l); |
3350 | return; | 3348 | return; |
3351 | } | 3349 | } |
3352 | mutex_unlock(&l->owner->pidlist_mutex); | 3350 | mutex_unlock(&l->owner->pidlist_mutex); |
3353 | up_write(&l->mutex); | 3351 | up_write(&l->mutex); |
3354 | } | 3352 | } |
3355 | 3353 | ||
3356 | static int cgroup_pidlist_release(struct inode *inode, struct file *file) | 3354 | static int cgroup_pidlist_release(struct inode *inode, struct file *file) |
3357 | { | 3355 | { |
3358 | struct cgroup_pidlist *l; | 3356 | struct cgroup_pidlist *l; |
3359 | if (!(file->f_mode & FMODE_READ)) | 3357 | if (!(file->f_mode & FMODE_READ)) |
3360 | return 0; | 3358 | return 0; |
3361 | /* | 3359 | /* |
3362 | * the seq_file will only be initialized if the file was opened for | 3360 | * the seq_file will only be initialized if the file was opened for |
3363 | * reading; hence we check if it's not null only in that case. | 3361 | * reading; hence we check if it's not null only in that case. |
3364 | */ | 3362 | */ |
3365 | l = ((struct seq_file *)file->private_data)->private; | 3363 | l = ((struct seq_file *)file->private_data)->private; |
3366 | cgroup_release_pid_array(l); | 3364 | cgroup_release_pid_array(l); |
3367 | return seq_release(inode, file); | 3365 | return seq_release(inode, file); |
3368 | } | 3366 | } |
3369 | 3367 | ||
3370 | static const struct file_operations cgroup_pidlist_operations = { | 3368 | static const struct file_operations cgroup_pidlist_operations = { |
3371 | .read = seq_read, | 3369 | .read = seq_read, |
3372 | .llseek = seq_lseek, | 3370 | .llseek = seq_lseek, |
3373 | .write = cgroup_file_write, | 3371 | .write = cgroup_file_write, |
3374 | .release = cgroup_pidlist_release, | 3372 | .release = cgroup_pidlist_release, |
3375 | }; | 3373 | }; |
3376 | 3374 | ||
3377 | /* | 3375 | /* |
3378 | * The following functions handle opens on a file that displays a pidlist | 3376 | * The following functions handle opens on a file that displays a pidlist |
3379 | * (tasks or procs). Prepare an array of the process/thread IDs of whoever's | 3377 | * (tasks or procs). Prepare an array of the process/thread IDs of whoever's |
3380 | * in the cgroup. | 3378 | * in the cgroup. |
3381 | */ | 3379 | */ |
3382 | /* helper function for the two below it */ | 3380 | /* helper function for the two below it */ |
3383 | static int cgroup_pidlist_open(struct file *file, enum cgroup_filetype type) | 3381 | static int cgroup_pidlist_open(struct file *file, enum cgroup_filetype type) |
3384 | { | 3382 | { |
3385 | struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent); | 3383 | struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent); |
3386 | struct cgroup_pidlist *l; | 3384 | struct cgroup_pidlist *l; |
3387 | int retval; | 3385 | int retval; |
3388 | 3386 | ||
3389 | /* Nothing to do for write-only files */ | 3387 | /* Nothing to do for write-only files */ |
3390 | if (!(file->f_mode & FMODE_READ)) | 3388 | if (!(file->f_mode & FMODE_READ)) |
3391 | return 0; | 3389 | return 0; |
3392 | 3390 | ||
3393 | /* have the array populated */ | 3391 | /* have the array populated */ |
3394 | retval = pidlist_array_load(cgrp, type, &l); | 3392 | retval = pidlist_array_load(cgrp, type, &l); |
3395 | if (retval) | 3393 | if (retval) |
3396 | return retval; | 3394 | return retval; |
3397 | /* configure file information */ | 3395 | /* configure file information */ |
3398 | file->f_op = &cgroup_pidlist_operations; | 3396 | file->f_op = &cgroup_pidlist_operations; |
3399 | 3397 | ||
3400 | retval = seq_open(file, &cgroup_pidlist_seq_operations); | 3398 | retval = seq_open(file, &cgroup_pidlist_seq_operations); |
3401 | if (retval) { | 3399 | if (retval) { |
3402 | cgroup_release_pid_array(l); | 3400 | cgroup_release_pid_array(l); |
3403 | return retval; | 3401 | return retval; |
3404 | } | 3402 | } |
3405 | ((struct seq_file *)file->private_data)->private = l; | 3403 | ((struct seq_file *)file->private_data)->private = l; |
3406 | return 0; | 3404 | return 0; |
3407 | } | 3405 | } |
3408 | static int cgroup_tasks_open(struct inode *unused, struct file *file) | 3406 | static int cgroup_tasks_open(struct inode *unused, struct file *file) |
3409 | { | 3407 | { |
3410 | return cgroup_pidlist_open(file, CGROUP_FILE_TASKS); | 3408 | return cgroup_pidlist_open(file, CGROUP_FILE_TASKS); |
3411 | } | 3409 | } |
3412 | static int cgroup_procs_open(struct inode *unused, struct file *file) | 3410 | static int cgroup_procs_open(struct inode *unused, struct file *file) |
3413 | { | 3411 | { |
3414 | return cgroup_pidlist_open(file, CGROUP_FILE_PROCS); | 3412 | return cgroup_pidlist_open(file, CGROUP_FILE_PROCS); |
3415 | } | 3413 | } |
3416 | 3414 | ||
3417 | static u64 cgroup_read_notify_on_release(struct cgroup *cgrp, | 3415 | static u64 cgroup_read_notify_on_release(struct cgroup *cgrp, |
3418 | struct cftype *cft) | 3416 | struct cftype *cft) |
3419 | { | 3417 | { |
3420 | return notify_on_release(cgrp); | 3418 | return notify_on_release(cgrp); |
3421 | } | 3419 | } |
3422 | 3420 | ||
3423 | static int cgroup_write_notify_on_release(struct cgroup *cgrp, | 3421 | static int cgroup_write_notify_on_release(struct cgroup *cgrp, |
3424 | struct cftype *cft, | 3422 | struct cftype *cft, |
3425 | u64 val) | 3423 | u64 val) |
3426 | { | 3424 | { |
3427 | clear_bit(CGRP_RELEASABLE, &cgrp->flags); | 3425 | clear_bit(CGRP_RELEASABLE, &cgrp->flags); |
3428 | if (val) | 3426 | if (val) |
3429 | set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); | 3427 | set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); |
3430 | else | 3428 | else |
3431 | clear_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); | 3429 | clear_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); |
3432 | return 0; | 3430 | return 0; |
3433 | } | 3431 | } |
3434 | 3432 | ||
3435 | /* | 3433 | /* |
3436 | * Unregister event and free resources. | 3434 | * Unregister event and free resources. |
3437 | * | 3435 | * |
3438 | * Gets called from workqueue. | 3436 | * Gets called from workqueue. |
3439 | */ | 3437 | */ |
3440 | static void cgroup_event_remove(struct work_struct *work) | 3438 | static void cgroup_event_remove(struct work_struct *work) |
3441 | { | 3439 | { |
3442 | struct cgroup_event *event = container_of(work, struct cgroup_event, | 3440 | struct cgroup_event *event = container_of(work, struct cgroup_event, |
3443 | remove); | 3441 | remove); |
3444 | struct cgroup *cgrp = event->cgrp; | 3442 | struct cgroup *cgrp = event->cgrp; |
3445 | 3443 | ||
3446 | event->cft->unregister_event(cgrp, event->cft, event->eventfd); | 3444 | event->cft->unregister_event(cgrp, event->cft, event->eventfd); |
3447 | 3445 | ||
3448 | eventfd_ctx_put(event->eventfd); | 3446 | eventfd_ctx_put(event->eventfd); |
3449 | kfree(event); | 3447 | kfree(event); |
3450 | dput(cgrp->dentry); | 3448 | dput(cgrp->dentry); |
3451 | } | 3449 | } |
3452 | 3450 | ||
3453 | /* | 3451 | /* |
3454 | * Gets called on POLLHUP on eventfd when user closes it. | 3452 | * Gets called on POLLHUP on eventfd when user closes it. |
3455 | * | 3453 | * |
3456 | * Called with wqh->lock held and interrupts disabled. | 3454 | * Called with wqh->lock held and interrupts disabled. |
3457 | */ | 3455 | */ |
3458 | static int cgroup_event_wake(wait_queue_t *wait, unsigned mode, | 3456 | static int cgroup_event_wake(wait_queue_t *wait, unsigned mode, |
3459 | int sync, void *key) | 3457 | int sync, void *key) |
3460 | { | 3458 | { |
3461 | struct cgroup_event *event = container_of(wait, | 3459 | struct cgroup_event *event = container_of(wait, |
3462 | struct cgroup_event, wait); | 3460 | struct cgroup_event, wait); |
3463 | struct cgroup *cgrp = event->cgrp; | 3461 | struct cgroup *cgrp = event->cgrp; |
3464 | unsigned long flags = (unsigned long)key; | 3462 | unsigned long flags = (unsigned long)key; |
3465 | 3463 | ||
3466 | if (flags & POLLHUP) { | 3464 | if (flags & POLLHUP) { |
3467 | __remove_wait_queue(event->wqh, &event->wait); | 3465 | __remove_wait_queue(event->wqh, &event->wait); |
3468 | spin_lock(&cgrp->event_list_lock); | 3466 | spin_lock(&cgrp->event_list_lock); |
3469 | list_del(&event->list); | 3467 | list_del(&event->list); |
3470 | spin_unlock(&cgrp->event_list_lock); | 3468 | spin_unlock(&cgrp->event_list_lock); |
3471 | /* | 3469 | /* |
3472 | * We are in atomic context, but cgroup_event_remove() may | 3470 | * We are in atomic context, but cgroup_event_remove() may |
3473 | * sleep, so we have to call it in workqueue. | 3471 | * sleep, so we have to call it in workqueue. |
3474 | */ | 3472 | */ |
3475 | schedule_work(&event->remove); | 3473 | schedule_work(&event->remove); |
3476 | } | 3474 | } |
3477 | 3475 | ||
3478 | return 0; | 3476 | return 0; |
3479 | } | 3477 | } |
3480 | 3478 | ||
3481 | static void cgroup_event_ptable_queue_proc(struct file *file, | 3479 | static void cgroup_event_ptable_queue_proc(struct file *file, |
3482 | wait_queue_head_t *wqh, poll_table *pt) | 3480 | wait_queue_head_t *wqh, poll_table *pt) |
3483 | { | 3481 | { |
3484 | struct cgroup_event *event = container_of(pt, | 3482 | struct cgroup_event *event = container_of(pt, |
3485 | struct cgroup_event, pt); | 3483 | struct cgroup_event, pt); |
3486 | 3484 | ||
3487 | event->wqh = wqh; | 3485 | event->wqh = wqh; |
3488 | add_wait_queue(wqh, &event->wait); | 3486 | add_wait_queue(wqh, &event->wait); |
3489 | } | 3487 | } |
3490 | 3488 | ||
3491 | /* | 3489 | /* |
3492 | * Parse input and register new cgroup event handler. | 3490 | * Parse input and register new cgroup event handler. |
3493 | * | 3491 | * |
3494 | * Input must be in format '<event_fd> <control_fd> <args>'. | 3492 | * Input must be in format '<event_fd> <control_fd> <args>'. |
3495 | * Interpretation of args is defined by control file implementation. | 3493 | * Interpretation of args is defined by control file implementation. |
3496 | */ | 3494 | */ |
3497 | static int cgroup_write_event_control(struct cgroup *cgrp, struct cftype *cft, | 3495 | static int cgroup_write_event_control(struct cgroup *cgrp, struct cftype *cft, |
3498 | const char *buffer) | 3496 | const char *buffer) |
3499 | { | 3497 | { |
3500 | struct cgroup_event *event = NULL; | 3498 | struct cgroup_event *event = NULL; |
3501 | unsigned int efd, cfd; | 3499 | unsigned int efd, cfd; |
3502 | struct file *efile = NULL; | 3500 | struct file *efile = NULL; |
3503 | struct file *cfile = NULL; | 3501 | struct file *cfile = NULL; |
3504 | char *endp; | 3502 | char *endp; |
3505 | int ret; | 3503 | int ret; |
3506 | 3504 | ||
3507 | efd = simple_strtoul(buffer, &endp, 10); | 3505 | efd = simple_strtoul(buffer, &endp, 10); |
3508 | if (*endp != ' ') | 3506 | if (*endp != ' ') |
3509 | return -EINVAL; | 3507 | return -EINVAL; |
3510 | buffer = endp + 1; | 3508 | buffer = endp + 1; |
3511 | 3509 | ||
3512 | cfd = simple_strtoul(buffer, &endp, 10); | 3510 | cfd = simple_strtoul(buffer, &endp, 10); |
3513 | if ((*endp != ' ') && (*endp != '\0')) | 3511 | if ((*endp != ' ') && (*endp != '\0')) |
3514 | return -EINVAL; | 3512 | return -EINVAL; |
3515 | buffer = endp + 1; | 3513 | buffer = endp + 1; |
3516 | 3514 | ||
3517 | event = kzalloc(sizeof(*event), GFP_KERNEL); | 3515 | event = kzalloc(sizeof(*event), GFP_KERNEL); |
3518 | if (!event) | 3516 | if (!event) |
3519 | return -ENOMEM; | 3517 | return -ENOMEM; |
3520 | event->cgrp = cgrp; | 3518 | event->cgrp = cgrp; |
3521 | INIT_LIST_HEAD(&event->list); | 3519 | INIT_LIST_HEAD(&event->list); |
3522 | init_poll_funcptr(&event->pt, cgroup_event_ptable_queue_proc); | 3520 | init_poll_funcptr(&event->pt, cgroup_event_ptable_queue_proc); |
3523 | init_waitqueue_func_entry(&event->wait, cgroup_event_wake); | 3521 | init_waitqueue_func_entry(&event->wait, cgroup_event_wake); |
3524 | INIT_WORK(&event->remove, cgroup_event_remove); | 3522 | INIT_WORK(&event->remove, cgroup_event_remove); |
3525 | 3523 | ||
3526 | efile = eventfd_fget(efd); | 3524 | efile = eventfd_fget(efd); |
3527 | if (IS_ERR(efile)) { | 3525 | if (IS_ERR(efile)) { |
3528 | ret = PTR_ERR(efile); | 3526 | ret = PTR_ERR(efile); |
3529 | goto fail; | 3527 | goto fail; |
3530 | } | 3528 | } |
3531 | 3529 | ||
3532 | event->eventfd = eventfd_ctx_fileget(efile); | 3530 | event->eventfd = eventfd_ctx_fileget(efile); |
3533 | if (IS_ERR(event->eventfd)) { | 3531 | if (IS_ERR(event->eventfd)) { |
3534 | ret = PTR_ERR(event->eventfd); | 3532 | ret = PTR_ERR(event->eventfd); |
3535 | goto fail; | 3533 | goto fail; |
3536 | } | 3534 | } |
3537 | 3535 | ||
3538 | cfile = fget(cfd); | 3536 | cfile = fget(cfd); |
3539 | if (!cfile) { | 3537 | if (!cfile) { |
3540 | ret = -EBADF; | 3538 | ret = -EBADF; |
3541 | goto fail; | 3539 | goto fail; |
3542 | } | 3540 | } |
3543 | 3541 | ||
3544 | /* the process needs read permission on the control file */ | 3542 | /* the process needs read permission on the control file */ |
3545 | ret = file_permission(cfile, MAY_READ); | 3543 | ret = file_permission(cfile, MAY_READ); |
3546 | if (ret < 0) | 3544 | if (ret < 0) |
3547 | goto fail; | 3545 | goto fail; |
3548 | 3546 | ||
3549 | event->cft = __file_cft(cfile); | 3547 | event->cft = __file_cft(cfile); |
3550 | if (IS_ERR(event->cft)) { | 3548 | if (IS_ERR(event->cft)) { |
3551 | ret = PTR_ERR(event->cft); | 3549 | ret = PTR_ERR(event->cft); |
3552 | goto fail; | 3550 | goto fail; |
3553 | } | 3551 | } |
3554 | 3552 | ||
3555 | if (!event->cft->register_event || !event->cft->unregister_event) { | 3553 | if (!event->cft->register_event || !event->cft->unregister_event) { |
3556 | ret = -EINVAL; | 3554 | ret = -EINVAL; |
3557 | goto fail; | 3555 | goto fail; |
3558 | } | 3556 | } |
3559 | 3557 | ||
3560 | ret = event->cft->register_event(cgrp, event->cft, | 3558 | ret = event->cft->register_event(cgrp, event->cft, |
3561 | event->eventfd, buffer); | 3559 | event->eventfd, buffer); |
3562 | if (ret) | 3560 | if (ret) |
3563 | goto fail; | 3561 | goto fail; |
3564 | 3562 | ||
3565 | if (efile->f_op->poll(efile, &event->pt) & POLLHUP) { | 3563 | if (efile->f_op->poll(efile, &event->pt) & POLLHUP) { |
3566 | event->cft->unregister_event(cgrp, event->cft, event->eventfd); | 3564 | event->cft->unregister_event(cgrp, event->cft, event->eventfd); |
3567 | ret = 0; | 3565 | ret = 0; |
3568 | goto fail; | 3566 | goto fail; |
3569 | } | 3567 | } |
3570 | 3568 | ||
3571 | /* | 3569 | /* |
3572 | * Events should be removed after rmdir of cgroup directory, but before | 3570 | * Events should be removed after rmdir of cgroup directory, but before |
3573 | * destroying subsystem state objects. Let's take a reference to the cgroup | 3571 | * destroying subsystem state objects. Let's take a reference to the cgroup |
3574 | * directory dentry to do that. | 3572 | * directory dentry to do that. |
3575 | */ | 3573 | */ |
3576 | dget(cgrp->dentry); | 3574 | dget(cgrp->dentry); |
3577 | 3575 | ||
3578 | spin_lock(&cgrp->event_list_lock); | 3576 | spin_lock(&cgrp->event_list_lock); |
3579 | list_add(&event->list, &cgrp->event_list); | 3577 | list_add(&event->list, &cgrp->event_list); |
3580 | spin_unlock(&cgrp->event_list_lock); | 3578 | spin_unlock(&cgrp->event_list_lock); |
3581 | 3579 | ||
3582 | fput(cfile); | 3580 | fput(cfile); |
3583 | fput(efile); | 3581 | fput(efile); |
3584 | 3582 | ||
3585 | return 0; | 3583 | return 0; |
3586 | 3584 | ||
3587 | fail: | 3585 | fail: |
3588 | if (cfile) | 3586 | if (cfile) |
3589 | fput(cfile); | 3587 | fput(cfile); |
3590 | 3588 | ||
3591 | if (event && event->eventfd && !IS_ERR(event->eventfd)) | 3589 | if (event && event->eventfd && !IS_ERR(event->eventfd)) |
3592 | eventfd_ctx_put(event->eventfd); | 3590 | eventfd_ctx_put(event->eventfd); |
3593 | 3591 | ||
3594 | if (!IS_ERR_OR_NULL(efile)) | 3592 | if (!IS_ERR_OR_NULL(efile)) |
3595 | fput(efile); | 3593 | fput(efile); |
3596 | 3594 | ||
3597 | kfree(event); | 3595 | kfree(event); |
3598 | 3596 | ||
3599 | return ret; | 3597 | return ret; |
3600 | } | 3598 | } |
3601 | 3599 | ||
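For orientation, the string parsed by the handler above is "<event fd> <control fd> <args>", written from userspace into the cgroup.event_control file declared in the files[] array further down. The sketch below is illustration only and not part of this commit; the memory-controller threshold interface, the /sys/fs/cgroup/memory mount point and the group name are assumptions, and error handling is omitted for brevity.

/*
 * Hypothetical userspace sketch: register an eventfd notification on a
 * memory usage threshold by writing "efd cfd threshold", the format
 * parsed by cgroup_write_event_control(), into cgroup.event_control.
 */
#include <sys/eventfd.h>
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	int efd = eventfd(0, 0);	/* fd the kernel will signal */
	int cfd = open("/sys/fs/cgroup/memory/mygrp/memory.usage_in_bytes",
		       O_RDONLY);	/* control file in the same cgroup (assumed path) */
	int ecfd = open("/sys/fs/cgroup/memory/mygrp/cgroup.event_control",
			O_WRONLY);
	char buf[64];
	uint64_t cnt;

	/* "<event fd> <control fd> <subsystem-specific args>" */
	snprintf(buf, sizeof(buf), "%d %d %llu", efd, cfd, 4096ULL * 1024);
	write(ecfd, buf, strlen(buf));

	read(efd, &cnt, sizeof(cnt));	/* blocks until the threshold is crossed */
	printf("threshold crossed %llu time(s)\n", (unsigned long long)cnt);
	return 0;
}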
3602 | static u64 cgroup_clone_children_read(struct cgroup *cgrp, | 3600 | static u64 cgroup_clone_children_read(struct cgroup *cgrp, |
3603 | struct cftype *cft) | 3601 | struct cftype *cft) |
3604 | { | 3602 | { |
3605 | return clone_children(cgrp); | 3603 | return clone_children(cgrp); |
3606 | } | 3604 | } |
3607 | 3605 | ||
3608 | static int cgroup_clone_children_write(struct cgroup *cgrp, | 3606 | static int cgroup_clone_children_write(struct cgroup *cgrp, |
3609 | struct cftype *cft, | 3607 | struct cftype *cft, |
3610 | u64 val) | 3608 | u64 val) |
3611 | { | 3609 | { |
3612 | if (val) | 3610 | if (val) |
3613 | set_bit(CGRP_CLONE_CHILDREN, &cgrp->flags); | 3611 | set_bit(CGRP_CLONE_CHILDREN, &cgrp->flags); |
3614 | else | 3612 | else |
3615 | clear_bit(CGRP_CLONE_CHILDREN, &cgrp->flags); | 3613 | clear_bit(CGRP_CLONE_CHILDREN, &cgrp->flags); |
3616 | return 0; | 3614 | return 0; |
3617 | } | 3615 | } |
3618 | 3616 | ||
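Any non-zero value written to cgroup.clone_children sets CGRP_CLONE_CHILDREN and zero clears it; as a hedged illustration (the mount point and group name are assumed, not taken from this commit), the userspace side is simply:

#include <fcntl.h>
#include <unistd.h>

int main(void)
{
	/* Assumed path, for illustration only. */
	int fd = open("/sys/fs/cgroup/cpuset/mygrp/cgroup.clone_children", O_WRONLY);

	if (fd < 0)
		return 1;
	write(fd, "1", 1);	/* dispatched to cgroup_clone_children_write() with val == 1 */
	close(fd);
	return 0;
}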
3619 | /* | 3617 | /* |
3620 | * for the common functions, 'private' gives the type of file | 3618 | * for the common functions, 'private' gives the type of file |
3621 | */ | 3619 | */ |
3622 | /* for hysterical raisins, we can't put this on the older files */ | 3620 | /* for hysterical raisins, we can't put this on the older files */ |
3623 | #define CGROUP_FILE_GENERIC_PREFIX "cgroup." | 3621 | #define CGROUP_FILE_GENERIC_PREFIX "cgroup." |
3624 | static struct cftype files[] = { | 3622 | static struct cftype files[] = { |
3625 | { | 3623 | { |
3626 | .name = "tasks", | 3624 | .name = "tasks", |
3627 | .open = cgroup_tasks_open, | 3625 | .open = cgroup_tasks_open, |
3628 | .write_u64 = cgroup_tasks_write, | 3626 | .write_u64 = cgroup_tasks_write, |
3629 | .release = cgroup_pidlist_release, | 3627 | .release = cgroup_pidlist_release, |
3630 | .mode = S_IRUGO | S_IWUSR, | 3628 | .mode = S_IRUGO | S_IWUSR, |
3631 | }, | 3629 | }, |
3632 | { | 3630 | { |
3633 | .name = CGROUP_FILE_GENERIC_PREFIX "procs", | 3631 | .name = CGROUP_FILE_GENERIC_PREFIX "procs", |
3634 | .open = cgroup_procs_open, | 3632 | .open = cgroup_procs_open, |
3635 | .write_u64 = cgroup_procs_write, | 3633 | .write_u64 = cgroup_procs_write, |
3636 | .release = cgroup_pidlist_release, | 3634 | .release = cgroup_pidlist_release, |
3637 | .mode = S_IRUGO | S_IWUSR, | 3635 | .mode = S_IRUGO | S_IWUSR, |
3638 | }, | 3636 | }, |
3639 | { | 3637 | { |
3640 | .name = "notify_on_release", | 3638 | .name = "notify_on_release", |
3641 | .read_u64 = cgroup_read_notify_on_release, | 3639 | .read_u64 = cgroup_read_notify_on_release, |
3642 | .write_u64 = cgroup_write_notify_on_release, | 3640 | .write_u64 = cgroup_write_notify_on_release, |
3643 | }, | 3641 | }, |
3644 | { | 3642 | { |
3645 | .name = CGROUP_FILE_GENERIC_PREFIX "event_control", | 3643 | .name = CGROUP_FILE_GENERIC_PREFIX "event_control", |
3646 | .write_string = cgroup_write_event_control, | 3644 | .write_string = cgroup_write_event_control, |
3647 | .mode = S_IWUGO, | 3645 | .mode = S_IWUGO, |
3648 | }, | 3646 | }, |
3649 | { | 3647 | { |
3650 | .name = "cgroup.clone_children", | 3648 | .name = "cgroup.clone_children", |
3651 | .read_u64 = cgroup_clone_children_read, | 3649 | .read_u64 = cgroup_clone_children_read, |
3652 | .write_u64 = cgroup_clone_children_write, | 3650 | .write_u64 = cgroup_clone_children_write, |
3653 | }, | 3651 | }, |
3654 | }; | 3652 | }; |
3655 | 3653 | ||
3656 | static struct cftype cft_release_agent = { | 3654 | static struct cftype cft_release_agent = { |
3657 | .name = "release_agent", | 3655 | .name = "release_agent", |
3658 | .read_seq_string = cgroup_release_agent_show, | 3656 | .read_seq_string = cgroup_release_agent_show, |
3659 | .write_string = cgroup_release_agent_write, | 3657 | .write_string = cgroup_release_agent_write, |
3660 | .max_write_len = PATH_MAX, | 3658 | .max_write_len = PATH_MAX, |
3661 | }; | 3659 | }; |
3662 | 3660 | ||
3663 | static int cgroup_populate_dir(struct cgroup *cgrp) | 3661 | static int cgroup_populate_dir(struct cgroup *cgrp) |
3664 | { | 3662 | { |
3665 | int err; | 3663 | int err; |
3666 | struct cgroup_subsys *ss; | 3664 | struct cgroup_subsys *ss; |
3667 | 3665 | ||
3668 | /* First clear out any existing files */ | 3666 | /* First clear out any existing files */ |
3669 | cgroup_clear_directory(cgrp->dentry); | 3667 | cgroup_clear_directory(cgrp->dentry); |
3670 | 3668 | ||
3671 | err = cgroup_add_files(cgrp, NULL, files, ARRAY_SIZE(files)); | 3669 | err = cgroup_add_files(cgrp, NULL, files, ARRAY_SIZE(files)); |
3672 | if (err < 0) | 3670 | if (err < 0) |
3673 | return err; | 3671 | return err; |
3674 | 3672 | ||
3675 | if (cgrp == cgrp->top_cgroup) { | 3673 | if (cgrp == cgrp->top_cgroup) { |
3676 | if ((err = cgroup_add_file(cgrp, NULL, &cft_release_agent)) < 0) | 3674 | if ((err = cgroup_add_file(cgrp, NULL, &cft_release_agent)) < 0) |
3677 | return err; | 3675 | return err; |
3678 | } | 3676 | } |
3679 | 3677 | ||
3680 | for_each_subsys(cgrp->root, ss) { | 3678 | for_each_subsys(cgrp->root, ss) { |
3681 | if (ss->populate && (err = ss->populate(ss, cgrp)) < 0) | 3679 | if (ss->populate && (err = ss->populate(ss, cgrp)) < 0) |
3682 | return err; | 3680 | return err; |
3683 | } | 3681 | } |
3684 | /* This cgroup is ready now */ | 3682 | /* This cgroup is ready now */ |
3685 | for_each_subsys(cgrp->root, ss) { | 3683 | for_each_subsys(cgrp->root, ss) { |
3686 | struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; | 3684 | struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; |
3687 | /* | 3685 | /* |
3688 | * Update id->css pointer and make this css visible from | 3686 | * Update id->css pointer and make this css visible from |
3689 | * CSS ID functions. This pointer will be dereferenced | 3687 | * CSS ID functions. This pointer will be dereferenced |
3690 | * from RCU-read-side without locks. | 3688 | * from RCU-read-side without locks. |
3691 | */ | 3689 | */ |
3692 | if (css->id) | 3690 | if (css->id) |
3693 | rcu_assign_pointer(css->id->css, css); | 3691 | rcu_assign_pointer(css->id->css, css); |
3694 | } | 3692 | } |
3695 | 3693 | ||
3696 | return 0; | 3694 | return 0; |
3697 | } | 3695 | } |
3698 | 3696 | ||
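For comparison with the core's own cgroup_add_files() call above, here is a hedged sketch of what the ss->populate() hook invoked from that loop typically looks like in a subsystem. example_populate, example_files and example_weight_read are hypothetical names, and the callback would be wired up through the subsystem's .populate field.

/* Sketch only: a subsystem adds its own control files from ->populate(). */
#include <linux/cgroup.h>
#include <linux/kernel.h>

static u64 example_weight_read(struct cgroup *cgrp, struct cftype *cft)
{
	return 100;	/* would normally come from the subsystem's per-cgroup state */
}

static struct cftype example_files[] = {
	{
		.name = "weight",	/* shows up as "<subsys>.weight" unless mounted with noprefix */
		.read_u64 = example_weight_read,
	},
};

static int example_populate(struct cgroup_subsys *ss, struct cgroup *cgrp)
{
	/* Same helper the core uses above, but with a non-NULL subsys. */
	return cgroup_add_files(cgrp, ss, example_files, ARRAY_SIZE(example_files));
}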
3699 | static void init_cgroup_css(struct cgroup_subsys_state *css, | 3697 | static void init_cgroup_css(struct cgroup_subsys_state *css, |
3700 | struct cgroup_subsys *ss, | 3698 | struct cgroup_subsys *ss, |
3701 | struct cgroup *cgrp) | 3699 | struct cgroup *cgrp) |
3702 | { | 3700 | { |
3703 | css->cgroup = cgrp; | 3701 | css->cgroup = cgrp; |
3704 | atomic_set(&css->refcnt, 1); | 3702 | atomic_set(&css->refcnt, 1); |
3705 | css->flags = 0; | 3703 | css->flags = 0; |
3706 | css->id = NULL; | 3704 | css->id = NULL; |
3707 | if (cgrp == dummytop) | 3705 | if (cgrp == dummytop) |
3708 | set_bit(CSS_ROOT, &css->flags); | 3706 | set_bit(CSS_ROOT, &css->flags); |
3709 | BUG_ON(cgrp->subsys[ss->subsys_id]); | 3707 | BUG_ON(cgrp->subsys[ss->subsys_id]); |
3710 | cgrp->subsys[ss->subsys_id] = css; | 3708 | cgrp->subsys[ss->subsys_id] = css; |
3711 | } | 3709 | } |
3712 | 3710 | ||
3713 | static void cgroup_lock_hierarchy(struct cgroupfs_root *root) | 3711 | static void cgroup_lock_hierarchy(struct cgroupfs_root *root) |
3714 | { | 3712 | { |
3715 | /* We need to take each hierarchy_mutex in a consistent order */ | 3713 | /* We need to take each hierarchy_mutex in a consistent order */ |
3716 | int i; | 3714 | int i; |
3717 | 3715 | ||
3718 | /* | 3716 | /* |
3719 | * No worry about a race with rebind_subsystems that might mess up the | 3717 | * No worry about a race with rebind_subsystems that might mess up the |
3720 | * locking order, since both parties are under cgroup_mutex. | 3718 | * locking order, since both parties are under cgroup_mutex. |
3721 | */ | 3719 | */ |
3722 | for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { | 3720 | for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { |
3723 | struct cgroup_subsys *ss = subsys[i]; | 3721 | struct cgroup_subsys *ss = subsys[i]; |
3724 | if (ss == NULL) | 3722 | if (ss == NULL) |
3725 | continue; | 3723 | continue; |
3726 | if (ss->root == root) | 3724 | if (ss->root == root) |
3727 | mutex_lock(&ss->hierarchy_mutex); | 3725 | mutex_lock(&ss->hierarchy_mutex); |
3728 | } | 3726 | } |
3729 | } | 3727 | } |
3730 | 3728 | ||
3731 | static void cgroup_unlock_hierarchy(struct cgroupfs_root *root) | 3729 | static void cgroup_unlock_hierarchy(struct cgroupfs_root *root) |
3732 | { | 3730 | { |
3733 | int i; | 3731 | int i; |
3734 | 3732 | ||
3735 | for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { | 3733 | for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { |
3736 | struct cgroup_subsys *ss = subsys[i]; | 3734 | struct cgroup_subsys *ss = subsys[i]; |
3737 | if (ss == NULL) | 3735 | if (ss == NULL) |
3738 | continue; | 3736 | continue; |
3739 | if (ss->root == root) | 3737 | if (ss->root == root) |
3740 | mutex_unlock(&ss->hierarchy_mutex); | 3738 | mutex_unlock(&ss->hierarchy_mutex); |
3741 | } | 3739 | } |
3742 | } | 3740 | } |
3743 | 3741 | ||
3744 | /* | 3742 | /* |
3745 | * cgroup_create - create a cgroup | 3743 | * cgroup_create - create a cgroup |
3746 | * @parent: cgroup that will be parent of the new cgroup | 3744 | * @parent: cgroup that will be parent of the new cgroup |
3747 | * @dentry: dentry of the new cgroup | 3745 | * @dentry: dentry of the new cgroup |
3748 | * @mode: mode to set on new inode | 3746 | * @mode: mode to set on new inode |
3749 | * | 3747 | * |
3750 | * Must be called with the mutex on the parent inode held | 3748 | * Must be called with the mutex on the parent inode held |
3751 | */ | 3749 | */ |
3752 | static long cgroup_create(struct cgroup *parent, struct dentry *dentry, | 3750 | static long cgroup_create(struct cgroup *parent, struct dentry *dentry, |
3753 | mode_t mode) | 3751 | mode_t mode) |
3754 | { | 3752 | { |
3755 | struct cgroup *cgrp; | 3753 | struct cgroup *cgrp; |
3756 | struct cgroupfs_root *root = parent->root; | 3754 | struct cgroupfs_root *root = parent->root; |
3757 | int err = 0; | 3755 | int err = 0; |
3758 | struct cgroup_subsys *ss; | 3756 | struct cgroup_subsys *ss; |
3759 | struct super_block *sb = root->sb; | 3757 | struct super_block *sb = root->sb; |
3760 | 3758 | ||
3761 | cgrp = kzalloc(sizeof(*cgrp), GFP_KERNEL); | 3759 | cgrp = kzalloc(sizeof(*cgrp), GFP_KERNEL); |
3762 | if (!cgrp) | 3760 | if (!cgrp) |
3763 | return -ENOMEM; | 3761 | return -ENOMEM; |
3764 | 3762 | ||
3765 | /* Grab a reference on the superblock so the hierarchy doesn't | 3763 | /* Grab a reference on the superblock so the hierarchy doesn't |
3766 | * get deleted on unmount if there are child cgroups. This | 3764 | * get deleted on unmount if there are child cgroups. This |
3767 | * can be done outside cgroup_mutex, since the sb can't | 3765 | * can be done outside cgroup_mutex, since the sb can't |
3768 | * disappear while someone has an open control file on the | 3766 | * disappear while someone has an open control file on the |
3769 | * fs */ | 3767 | * fs */ |
3770 | atomic_inc(&sb->s_active); | 3768 | atomic_inc(&sb->s_active); |
3771 | 3769 | ||
3772 | mutex_lock(&cgroup_mutex); | 3770 | mutex_lock(&cgroup_mutex); |
3773 | 3771 | ||
3774 | init_cgroup_housekeeping(cgrp); | 3772 | init_cgroup_housekeeping(cgrp); |
3775 | 3773 | ||
3776 | cgrp->parent = parent; | 3774 | cgrp->parent = parent; |
3777 | cgrp->root = parent->root; | 3775 | cgrp->root = parent->root; |
3778 | cgrp->top_cgroup = parent->top_cgroup; | 3776 | cgrp->top_cgroup = parent->top_cgroup; |
3779 | 3777 | ||
3780 | if (notify_on_release(parent)) | 3778 | if (notify_on_release(parent)) |
3781 | set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); | 3779 | set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); |
3782 | 3780 | ||
3783 | if (clone_children(parent)) | 3781 | if (clone_children(parent)) |
3784 | set_bit(CGRP_CLONE_CHILDREN, &cgrp->flags); | 3782 | set_bit(CGRP_CLONE_CHILDREN, &cgrp->flags); |
3785 | 3783 | ||
3786 | for_each_subsys(root, ss) { | 3784 | for_each_subsys(root, ss) { |
3787 | struct cgroup_subsys_state *css = ss->create(ss, cgrp); | 3785 | struct cgroup_subsys_state *css = ss->create(ss, cgrp); |
3788 | 3786 | ||
3789 | if (IS_ERR(css)) { | 3787 | if (IS_ERR(css)) { |
3790 | err = PTR_ERR(css); | 3788 | err = PTR_ERR(css); |
3791 | goto err_destroy; | 3789 | goto err_destroy; |
3792 | } | 3790 | } |
3793 | init_cgroup_css(css, ss, cgrp); | 3791 | init_cgroup_css(css, ss, cgrp); |
3794 | if (ss->use_id) { | 3792 | if (ss->use_id) { |
3795 | err = alloc_css_id(ss, parent, cgrp); | 3793 | err = alloc_css_id(ss, parent, cgrp); |
3796 | if (err) | 3794 | if (err) |
3797 | goto err_destroy; | 3795 | goto err_destroy; |
3798 | } | 3796 | } |
3799 | /* At error, ->destroy() callback has to free assigned ID. */ | 3797 | /* At error, ->destroy() callback has to free assigned ID. */ |
3800 | if (clone_children(parent) && ss->post_clone) | 3798 | if (clone_children(parent) && ss->post_clone) |
3801 | ss->post_clone(ss, cgrp); | 3799 | ss->post_clone(ss, cgrp); |
3802 | } | 3800 | } |
3803 | 3801 | ||
3804 | cgroup_lock_hierarchy(root); | 3802 | cgroup_lock_hierarchy(root); |
3805 | list_add(&cgrp->sibling, &cgrp->parent->children); | 3803 | list_add(&cgrp->sibling, &cgrp->parent->children); |
3806 | cgroup_unlock_hierarchy(root); | 3804 | cgroup_unlock_hierarchy(root); |
3807 | root->number_of_cgroups++; | 3805 | root->number_of_cgroups++; |
3808 | 3806 | ||
3809 | err = cgroup_create_dir(cgrp, dentry, mode); | 3807 | err = cgroup_create_dir(cgrp, dentry, mode); |
3810 | if (err < 0) | 3808 | if (err < 0) |
3811 | goto err_remove; | 3809 | goto err_remove; |
3812 | 3810 | ||
3813 | /* The cgroup directory was pre-locked for us */ | 3811 | /* The cgroup directory was pre-locked for us */ |
3814 | BUG_ON(!mutex_is_locked(&cgrp->dentry->d_inode->i_mutex)); | 3812 | BUG_ON(!mutex_is_locked(&cgrp->dentry->d_inode->i_mutex)); |
3815 | 3813 | ||
3816 | err = cgroup_populate_dir(cgrp); | 3814 | err = cgroup_populate_dir(cgrp); |
3817 | /* If err < 0, we have a half-filled directory - oh well ;) */ | 3815 | /* If err < 0, we have a half-filled directory - oh well ;) */ |
3818 | 3816 | ||
3819 | mutex_unlock(&cgroup_mutex); | 3817 | mutex_unlock(&cgroup_mutex); |
3820 | mutex_unlock(&cgrp->dentry->d_inode->i_mutex); | 3818 | mutex_unlock(&cgrp->dentry->d_inode->i_mutex); |
3821 | 3819 | ||
3822 | return 0; | 3820 | return 0; |
3823 | 3821 | ||
3824 | err_remove: | 3822 | err_remove: |
3825 | 3823 | ||
3826 | cgroup_lock_hierarchy(root); | 3824 | cgroup_lock_hierarchy(root); |
3827 | list_del(&cgrp->sibling); | 3825 | list_del(&cgrp->sibling); |
3828 | cgroup_unlock_hierarchy(root); | 3826 | cgroup_unlock_hierarchy(root); |
3829 | root->number_of_cgroups--; | 3827 | root->number_of_cgroups--; |
3830 | 3828 | ||
3831 | err_destroy: | 3829 | err_destroy: |
3832 | 3830 | ||
3833 | for_each_subsys(root, ss) { | 3831 | for_each_subsys(root, ss) { |
3834 | if (cgrp->subsys[ss->subsys_id]) | 3832 | if (cgrp->subsys[ss->subsys_id]) |
3835 | ss->destroy(ss, cgrp); | 3833 | ss->destroy(ss, cgrp); |
3836 | } | 3834 | } |
3837 | 3835 | ||
3838 | mutex_unlock(&cgroup_mutex); | 3836 | mutex_unlock(&cgroup_mutex); |
3839 | 3837 | ||
3840 | /* Release the reference count that we took on the superblock */ | 3838 | /* Release the reference count that we took on the superblock */ |
3841 | deactivate_super(sb); | 3839 | deactivate_super(sb); |
3842 | 3840 | ||
3843 | kfree(cgrp); | 3841 | kfree(cgrp); |
3844 | return err; | 3842 | return err; |
3845 | } | 3843 | } |
3846 | 3844 | ||
3847 | static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, int mode) | 3845 | static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, int mode) |
3848 | { | 3846 | { |
3849 | struct cgroup *c_parent = dentry->d_parent->d_fsdata; | 3847 | struct cgroup *c_parent = dentry->d_parent->d_fsdata; |
3850 | 3848 | ||
3851 | /* the vfs holds inode->i_mutex already */ | 3849 | /* the vfs holds inode->i_mutex already */ |
3852 | return cgroup_create(c_parent, dentry, mode | S_IFDIR); | 3850 | return cgroup_create(c_parent, dentry, mode | S_IFDIR); |
3853 | } | 3851 | } |
3854 | 3852 | ||
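From userspace, cgroup_mkdir() above and cgroup_rmdir() further down are reached through ordinary mkdir(2) and rmdir(2) calls on the mounted hierarchy; a minimal sketch, with the mount point and group name assumed for illustration:

#include <sys/stat.h>
#include <sys/types.h>
#include <unistd.h>

int main(void)
{
	/* Assumed mount point, for illustration only. */
	mkdir("/sys/fs/cgroup/cpu/mygrp", 0755);	/* routed by the VFS to cgroup_mkdir() */
	/* ... move tasks in, do work ... */
	rmdir("/sys/fs/cgroup/cpu/mygrp");		/* routed by the VFS to cgroup_rmdir() */
	return 0;
}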
3855 | static int cgroup_has_css_refs(struct cgroup *cgrp) | 3853 | static int cgroup_has_css_refs(struct cgroup *cgrp) |
3856 | { | 3854 | { |
3857 | /* Check the reference count on each subsystem. Since we | 3855 | /* Check the reference count on each subsystem. Since we |
3858 | * already established that there are no tasks in the | 3856 | * already established that there are no tasks in the |
3859 | * cgroup, if the css refcount is also 1, then there should | 3857 | * cgroup, if the css refcount is also 1, then there should |
3860 | * be no outstanding references, so the subsystem is safe to | 3858 | * be no outstanding references, so the subsystem is safe to |
3861 | * destroy. We scan across all subsystems rather than using | 3859 | * destroy. We scan across all subsystems rather than using |
3862 | * the per-hierarchy linked list of mounted subsystems since | 3860 | * the per-hierarchy linked list of mounted subsystems since |
3863 | * we can be called via check_for_release() with no | 3861 | * we can be called via check_for_release() with no |
3864 | * synchronization other than RCU, and the subsystem linked | 3862 | * synchronization other than RCU, and the subsystem linked |
3865 | * list isn't RCU-safe */ | 3863 | * list isn't RCU-safe */ |
3866 | int i; | 3864 | int i; |
3867 | /* | 3865 | /* |
3868 | * We won't need to lock the subsys array, because the subsystems | 3866 | * We won't need to lock the subsys array, because the subsystems |
3869 | * we're concerned about aren't going anywhere since our cgroup root | 3867 | * we're concerned about aren't going anywhere since our cgroup root |
3870 | * has a reference on them. | 3868 | * has a reference on them. |
3871 | */ | 3869 | */ |
3872 | for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { | 3870 | for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { |
3873 | struct cgroup_subsys *ss = subsys[i]; | 3871 | struct cgroup_subsys *ss = subsys[i]; |
3874 | struct cgroup_subsys_state *css; | 3872 | struct cgroup_subsys_state *css; |
3875 | /* Skip subsystems not present or not in this hierarchy */ | 3873 | /* Skip subsystems not present or not in this hierarchy */ |
3876 | if (ss == NULL || ss->root != cgrp->root) | 3874 | if (ss == NULL || ss->root != cgrp->root) |
3877 | continue; | 3875 | continue; |
3878 | css = cgrp->subsys[ss->subsys_id]; | 3876 | css = cgrp->subsys[ss->subsys_id]; |
3879 | /* When called from check_for_release() it's possible | 3877 | /* When called from check_for_release() it's possible |
3880 | * that by this point the cgroup has been removed | 3878 | * that by this point the cgroup has been removed |
3881 | * and the css deleted. But a false-positive doesn't | 3879 | * and the css deleted. But a false-positive doesn't |
3882 | * matter, since it can only happen if the cgroup | 3880 | * matter, since it can only happen if the cgroup |
3883 | * has been deleted and hence no longer needs the | 3881 | * has been deleted and hence no longer needs the |
3884 | * release agent to be called anyway. */ | 3882 | * release agent to be called anyway. */ |
3885 | if (css && (atomic_read(&css->refcnt) > 1)) | 3883 | if (css && (atomic_read(&css->refcnt) > 1)) |
3886 | return 1; | 3884 | return 1; |
3887 | } | 3885 | } |
3888 | return 0; | 3886 | return 0; |
3889 | } | 3887 | } |
3890 | 3888 | ||
3891 | /* | 3889 | /* |
3892 | * Atomically mark all (or else none) of the cgroup's CSS objects as | 3890 | * Atomically mark all (or else none) of the cgroup's CSS objects as |
3893 | * CSS_REMOVED. Return true on success, or false if the cgroup has | 3891 | * CSS_REMOVED. Return true on success, or false if the cgroup has |
3894 | * busy subsystems. Call with cgroup_mutex held | 3892 | * busy subsystems. Call with cgroup_mutex held |
3895 | */ | 3893 | */ |
3896 | 3894 | ||
3897 | static int cgroup_clear_css_refs(struct cgroup *cgrp) | 3895 | static int cgroup_clear_css_refs(struct cgroup *cgrp) |
3898 | { | 3896 | { |
3899 | struct cgroup_subsys *ss; | 3897 | struct cgroup_subsys *ss; |
3900 | unsigned long flags; | 3898 | unsigned long flags; |
3901 | bool failed = false; | 3899 | bool failed = false; |
3902 | local_irq_save(flags); | 3900 | local_irq_save(flags); |
3903 | for_each_subsys(cgrp->root, ss) { | 3901 | for_each_subsys(cgrp->root, ss) { |
3904 | struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; | 3902 | struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; |
3905 | int refcnt; | 3903 | int refcnt; |
3906 | while (1) { | 3904 | while (1) { |
3907 | /* We can only remove a CSS with a refcnt==1 */ | 3905 | /* We can only remove a CSS with a refcnt==1 */ |
3908 | refcnt = atomic_read(&css->refcnt); | 3906 | refcnt = atomic_read(&css->refcnt); |
3909 | if (refcnt > 1) { | 3907 | if (refcnt > 1) { |
3910 | failed = true; | 3908 | failed = true; |
3911 | goto done; | 3909 | goto done; |
3912 | } | 3910 | } |
3913 | BUG_ON(!refcnt); | 3911 | BUG_ON(!refcnt); |
3914 | /* | 3912 | /* |
3915 | * Drop the refcnt to 0 while we check other | 3913 | * Drop the refcnt to 0 while we check other |
3916 | * subsystems. This will cause any racing | 3914 | * subsystems. This will cause any racing |
3917 | * css_tryget() to spin until we set the | 3915 | * css_tryget() to spin until we set the |
3918 | * CSS_REMOVED bits or abort | 3916 | * CSS_REMOVED bits or abort |
3919 | */ | 3917 | */ |
3920 | if (atomic_cmpxchg(&css->refcnt, refcnt, 0) == refcnt) | 3918 | if (atomic_cmpxchg(&css->refcnt, refcnt, 0) == refcnt) |
3921 | break; | 3919 | break; |
3922 | cpu_relax(); | 3920 | cpu_relax(); |
3923 | } | 3921 | } |
3924 | } | 3922 | } |
3925 | done: | 3923 | done: |
3926 | for_each_subsys(cgrp->root, ss) { | 3924 | for_each_subsys(cgrp->root, ss) { |
3927 | struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; | 3925 | struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; |
3928 | if (failed) { | 3926 | if (failed) { |
3929 | /* | 3927 | /* |
3930 | * Restore old refcnt if we previously managed | 3928 | * Restore old refcnt if we previously managed |
3931 | * to clear it from 1 to 0 | 3929 | * to clear it from 1 to 0 |
3932 | */ | 3930 | */ |
3933 | if (!atomic_read(&css->refcnt)) | 3931 | if (!atomic_read(&css->refcnt)) |
3934 | atomic_set(&css->refcnt, 1); | 3932 | atomic_set(&css->refcnt, 1); |
3935 | } else { | 3933 | } else { |
3936 | /* Commit the fact that the CSS is removed */ | 3934 | /* Commit the fact that the CSS is removed */ |
3937 | set_bit(CSS_REMOVED, &css->flags); | 3935 | set_bit(CSS_REMOVED, &css->flags); |
3938 | } | 3936 | } |
3939 | } | 3937 | } |
3940 | local_irq_restore(flags); | 3938 | local_irq_restore(flags); |
3941 | return !failed; | 3939 | return !failed; |
3942 | } | 3940 | } |
3943 | 3941 | ||
3944 | static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry) | 3942 | static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry) |
3945 | { | 3943 | { |
3946 | struct cgroup *cgrp = dentry->d_fsdata; | 3944 | struct cgroup *cgrp = dentry->d_fsdata; |
3947 | struct dentry *d; | 3945 | struct dentry *d; |
3948 | struct cgroup *parent; | 3946 | struct cgroup *parent; |
3949 | DEFINE_WAIT(wait); | 3947 | DEFINE_WAIT(wait); |
3950 | struct cgroup_event *event, *tmp; | 3948 | struct cgroup_event *event, *tmp; |
3951 | int ret; | 3949 | int ret; |
3952 | 3950 | ||
3953 | /* the vfs holds both inode->i_mutex already */ | 3951 | /* the vfs holds both inode->i_mutex already */ |
3954 | again: | 3952 | again: |
3955 | mutex_lock(&cgroup_mutex); | 3953 | mutex_lock(&cgroup_mutex); |
3956 | if (atomic_read(&cgrp->count) != 0) { | 3954 | if (atomic_read(&cgrp->count) != 0) { |
3957 | mutex_unlock(&cgroup_mutex); | 3955 | mutex_unlock(&cgroup_mutex); |
3958 | return -EBUSY; | 3956 | return -EBUSY; |
3959 | } | 3957 | } |
3960 | if (!list_empty(&cgrp->children)) { | 3958 | if (!list_empty(&cgrp->children)) { |
3961 | mutex_unlock(&cgroup_mutex); | 3959 | mutex_unlock(&cgroup_mutex); |
3962 | return -EBUSY; | 3960 | return -EBUSY; |
3963 | } | 3961 | } |
3964 | mutex_unlock(&cgroup_mutex); | 3962 | mutex_unlock(&cgroup_mutex); |
3965 | 3963 | ||
3966 | /* | 3964 | /* |
3967 | * In general, subsystem has no css->refcnt after pre_destroy(). But | 3965 | * In general, subsystem has no css->refcnt after pre_destroy(). But |
3968 | * in racy cases, subsystem may have to get css->refcnt after | 3966 | * in racy cases, subsystem may have to get css->refcnt after |
3969 | * pre_destroy() and it makes rmdir return with -EBUSY. This sometimes | 3967 | * pre_destroy() and it makes rmdir return with -EBUSY. This sometimes |
3970 | * makes rmdir return -EBUSY too often. To avoid that, we use a waitqueue | 3968 | * makes rmdir return -EBUSY too often. To avoid that, we use a waitqueue |
3971 | * for cgroup's rmdir. CGRP_WAIT_ON_RMDIR is for synchronizing rmdir | 3969 | * for cgroup's rmdir. CGRP_WAIT_ON_RMDIR is for synchronizing rmdir |
3972 | * and subsystem's reference count handling. Please see css_get/put | 3970 | * and subsystem's reference count handling. Please see css_get/put |
3973 | * and css_tryget() and cgroup_wakeup_rmdir_waiter() implementation. | 3971 | * and css_tryget() and cgroup_wakeup_rmdir_waiter() implementation. |
3974 | */ | 3972 | */ |
3975 | set_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags); | 3973 | set_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags); |
3976 | 3974 | ||
3977 | /* | 3975 | /* |
3978 | * Call pre_destroy handlers of subsys. Notify subsystems | 3976 | * Call pre_destroy handlers of subsys. Notify subsystems |
3979 | * that an rmdir() request has arrived. | 3977 | * that an rmdir() request has arrived. |
3980 | */ | 3978 | */ |
3981 | ret = cgroup_call_pre_destroy(cgrp); | 3979 | ret = cgroup_call_pre_destroy(cgrp); |
3982 | if (ret) { | 3980 | if (ret) { |
3983 | clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags); | 3981 | clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags); |
3984 | return ret; | 3982 | return ret; |
3985 | } | 3983 | } |
3986 | 3984 | ||
3987 | mutex_lock(&cgroup_mutex); | 3985 | mutex_lock(&cgroup_mutex); |
3988 | parent = cgrp->parent; | 3986 | parent = cgrp->parent; |
3989 | if (atomic_read(&cgrp->count) || !list_empty(&cgrp->children)) { | 3987 | if (atomic_read(&cgrp->count) || !list_empty(&cgrp->children)) { |
3990 | clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags); | 3988 | clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags); |
3991 | mutex_unlock(&cgroup_mutex); | 3989 | mutex_unlock(&cgroup_mutex); |
3992 | return -EBUSY; | 3990 | return -EBUSY; |
3993 | } | 3991 | } |
3994 | prepare_to_wait(&cgroup_rmdir_waitq, &wait, TASK_INTERRUPTIBLE); | 3992 | prepare_to_wait(&cgroup_rmdir_waitq, &wait, TASK_INTERRUPTIBLE); |
3995 | if (!cgroup_clear_css_refs(cgrp)) { | 3993 | if (!cgroup_clear_css_refs(cgrp)) { |
3996 | mutex_unlock(&cgroup_mutex); | 3994 | mutex_unlock(&cgroup_mutex); |
3997 | /* | 3995 | /* |
3998 | * Because someone may call cgroup_wakeup_rmdir_waiter() before | 3996 | * Because someone may call cgroup_wakeup_rmdir_waiter() before |
3999 | * prepare_to_wait(), we need to check this flag. | 3997 | * prepare_to_wait(), we need to check this flag. |
4000 | */ | 3998 | */ |
4001 | if (test_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags)) | 3999 | if (test_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags)) |
4002 | schedule(); | 4000 | schedule(); |
4003 | finish_wait(&cgroup_rmdir_waitq, &wait); | 4001 | finish_wait(&cgroup_rmdir_waitq, &wait); |
4004 | clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags); | 4002 | clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags); |
4005 | if (signal_pending(current)) | 4003 | if (signal_pending(current)) |
4006 | return -EINTR; | 4004 | return -EINTR; |
4007 | goto again; | 4005 | goto again; |
4008 | } | 4006 | } |
4009 | /* No css_tryget() can succeed after this point. */ | 4007 | /* No css_tryget() can succeed after this point. */ |
4010 | finish_wait(&cgroup_rmdir_waitq, &wait); | 4008 | finish_wait(&cgroup_rmdir_waitq, &wait); |
4011 | clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags); | 4009 | clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags); |
4012 | 4010 | ||
4013 | spin_lock(&release_list_lock); | 4011 | spin_lock(&release_list_lock); |
4014 | set_bit(CGRP_REMOVED, &cgrp->flags); | 4012 | set_bit(CGRP_REMOVED, &cgrp->flags); |
4015 | if (!list_empty(&cgrp->release_list)) | 4013 | if (!list_empty(&cgrp->release_list)) |
4016 | list_del_init(&cgrp->release_list); | 4014 | list_del_init(&cgrp->release_list); |
4017 | spin_unlock(&release_list_lock); | 4015 | spin_unlock(&release_list_lock); |
4018 | 4016 | ||
4019 | cgroup_lock_hierarchy(cgrp->root); | 4017 | cgroup_lock_hierarchy(cgrp->root); |
4020 | /* delete this cgroup from parent->children */ | 4018 | /* delete this cgroup from parent->children */ |
4021 | list_del_init(&cgrp->sibling); | 4019 | list_del_init(&cgrp->sibling); |
4022 | cgroup_unlock_hierarchy(cgrp->root); | 4020 | cgroup_unlock_hierarchy(cgrp->root); |
4023 | 4021 | ||
4024 | d = dget(cgrp->dentry); | 4022 | d = dget(cgrp->dentry); |
4025 | 4023 | ||
4026 | cgroup_d_remove_dir(d); | 4024 | cgroup_d_remove_dir(d); |
4027 | dput(d); | 4025 | dput(d); |
4028 | 4026 | ||
4029 | set_bit(CGRP_RELEASABLE, &parent->flags); | 4027 | set_bit(CGRP_RELEASABLE, &parent->flags); |
4030 | check_for_release(parent); | 4028 | check_for_release(parent); |
4031 | 4029 | ||
4032 | /* | 4030 | /* |
4033 | * Unregister events and notify userspace. | 4031 | * Unregister events and notify userspace. |
4034 | * Notify userspace about cgroup removal only after rmdir of the cgroup | 4032 | * Notify userspace about cgroup removal only after rmdir of the cgroup |
4035 | * directory to avoid a race between userspace and kernelspace. | 4033 | * directory to avoid a race between userspace and kernelspace. |
4036 | */ | 4034 | */ |
4037 | spin_lock(&cgrp->event_list_lock); | 4035 | spin_lock(&cgrp->event_list_lock); |
4038 | list_for_each_entry_safe(event, tmp, &cgrp->event_list, list) { | 4036 | list_for_each_entry_safe(event, tmp, &cgrp->event_list, list) { |
4039 | list_del(&event->list); | 4037 | list_del(&event->list); |
4040 | remove_wait_queue(event->wqh, &event->wait); | 4038 | remove_wait_queue(event->wqh, &event->wait); |
4041 | eventfd_signal(event->eventfd, 1); | 4039 | eventfd_signal(event->eventfd, 1); |
4042 | schedule_work(&event->remove); | 4040 | schedule_work(&event->remove); |
4043 | } | 4041 | } |
4044 | spin_unlock(&cgrp->event_list_lock); | 4042 | spin_unlock(&cgrp->event_list_lock); |
4045 | 4043 | ||
4046 | mutex_unlock(&cgroup_mutex); | 4044 | mutex_unlock(&cgroup_mutex); |
4047 | return 0; | 4045 | return 0; |
4048 | } | 4046 | } |
4049 | 4047 | ||
4050 | static void __init cgroup_init_subsys(struct cgroup_subsys *ss) | 4048 | static void __init cgroup_init_subsys(struct cgroup_subsys *ss) |
4051 | { | 4049 | { |
4052 | struct cgroup_subsys_state *css; | 4050 | struct cgroup_subsys_state *css; |
4053 | 4051 | ||
4054 | printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name); | 4052 | printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name); |
4055 | 4053 | ||
4056 | /* Create the top cgroup state for this subsystem */ | 4054 | /* Create the top cgroup state for this subsystem */ |
4057 | list_add(&ss->sibling, &rootnode.subsys_list); | 4055 | list_add(&ss->sibling, &rootnode.subsys_list); |
4058 | ss->root = &rootnode; | 4056 | ss->root = &rootnode; |
4059 | css = ss->create(ss, dummytop); | 4057 | css = ss->create(ss, dummytop); |
4060 | /* We don't handle early failures gracefully */ | 4058 | /* We don't handle early failures gracefully */ |
4061 | BUG_ON(IS_ERR(css)); | 4059 | BUG_ON(IS_ERR(css)); |
4062 | init_cgroup_css(css, ss, dummytop); | 4060 | init_cgroup_css(css, ss, dummytop); |
4063 | 4061 | ||
4064 | /* Update the init_css_set to contain a subsys | 4062 | /* Update the init_css_set to contain a subsys |
4065 | * pointer to this state - since the subsystem is | 4063 | * pointer to this state - since the subsystem is |
4066 | * newly registered, all tasks and hence the | 4064 | * newly registered, all tasks and hence the |
4067 | * init_css_set is in the subsystem's top cgroup. */ | 4065 | * init_css_set is in the subsystem's top cgroup. */ |
4068 | init_css_set.subsys[ss->subsys_id] = dummytop->subsys[ss->subsys_id]; | 4066 | init_css_set.subsys[ss->subsys_id] = dummytop->subsys[ss->subsys_id]; |
4069 | 4067 | ||
4070 | need_forkexit_callback |= ss->fork || ss->exit; | 4068 | need_forkexit_callback |= ss->fork || ss->exit; |
4071 | 4069 | ||
4072 | /* At system boot, before all subsystems have been | 4070 | /* At system boot, before all subsystems have been |
4073 | * registered, no tasks have been forked, so we don't | 4071 | * registered, no tasks have been forked, so we don't |
4074 | * need to invoke fork callbacks here. */ | 4072 | * need to invoke fork callbacks here. */ |
4075 | BUG_ON(!list_empty(&init_task.tasks)); | 4073 | BUG_ON(!list_empty(&init_task.tasks)); |
4076 | 4074 | ||
4077 | mutex_init(&ss->hierarchy_mutex); | 4075 | mutex_init(&ss->hierarchy_mutex); |
4078 | lockdep_set_class(&ss->hierarchy_mutex, &ss->subsys_key); | 4076 | lockdep_set_class(&ss->hierarchy_mutex, &ss->subsys_key); |
4079 | ss->active = 1; | 4077 | ss->active = 1; |
4080 | 4078 | ||
4081 | /* this function shouldn't be used with modular subsystems, since they | 4079 | /* this function shouldn't be used with modular subsystems, since they |
4082 | * need to register a subsys_id, among other things */ | 4080 | * need to register a subsys_id, among other things */ |
4083 | BUG_ON(ss->module); | 4081 | BUG_ON(ss->module); |
4084 | } | 4082 | } |
4085 | 4083 | ||
4086 | /** | 4084 | /** |
4087 | * cgroup_load_subsys: load and register a modular subsystem at runtime | 4085 | * cgroup_load_subsys: load and register a modular subsystem at runtime |
4088 | * @ss: the subsystem to load | 4086 | * @ss: the subsystem to load |
4089 | * | 4087 | * |
4090 | * This function should be called in a modular subsystem's initcall. If the | 4088 | * This function should be called in a modular subsystem's initcall. If the |
4091 | * subsystem is built as a module, it will be assigned a new subsys_id and set | 4089 | * subsystem is built as a module, it will be assigned a new subsys_id and set |
4092 | * up for use. If the subsystem is built-in anyway, work is delegated to the | 4090 | * up for use. If the subsystem is built-in anyway, work is delegated to the |
4093 | * simpler cgroup_init_subsys. | 4091 | * simpler cgroup_init_subsys. |
4094 | */ | 4092 | */ |
4095 | int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss) | 4093 | int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss) |
4096 | { | 4094 | { |
4097 | int i; | 4095 | int i; |
4098 | struct cgroup_subsys_state *css; | 4096 | struct cgroup_subsys_state *css; |
4099 | 4097 | ||
4100 | /* check name and function validity */ | 4098 | /* check name and function validity */ |
4101 | if (ss->name == NULL || strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN || | 4099 | if (ss->name == NULL || strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN || |
4102 | ss->create == NULL || ss->destroy == NULL) | 4100 | ss->create == NULL || ss->destroy == NULL) |
4103 | return -EINVAL; | 4101 | return -EINVAL; |
4104 | 4102 | ||
4105 | /* | 4103 | /* |
4106 | * we don't support callbacks in modular subsystems. this check is | 4104 | * we don't support callbacks in modular subsystems. this check is |
4107 | * before the ss->module check for consistency; a subsystem that could | 4105 | * before the ss->module check for consistency; a subsystem that could |
4108 | * be a module should still have no callbacks even if the user isn't | 4106 | * be a module should still have no callbacks even if the user isn't |
4109 | * compiling it as one. | 4107 | * compiling it as one. |
4110 | */ | 4108 | */ |
4111 | if (ss->fork || ss->exit) | 4109 | if (ss->fork || ss->exit) |
4112 | return -EINVAL; | 4110 | return -EINVAL; |
4113 | 4111 | ||
4114 | /* | 4112 | /* |
4115 | * an optionally modular subsystem is built-in: we want to do nothing, | 4113 | * an optionally modular subsystem is built-in: we want to do nothing, |
4116 | * since cgroup_init_subsys will have already taken care of it. | 4114 | * since cgroup_init_subsys will have already taken care of it. |
4117 | */ | 4115 | */ |
4118 | if (ss->module == NULL) { | 4116 | if (ss->module == NULL) { |
4119 | /* a few sanity checks */ | 4117 | /* a few sanity checks */ |
4120 | BUG_ON(ss->subsys_id >= CGROUP_BUILTIN_SUBSYS_COUNT); | 4118 | BUG_ON(ss->subsys_id >= CGROUP_BUILTIN_SUBSYS_COUNT); |
4121 | BUG_ON(subsys[ss->subsys_id] != ss); | 4119 | BUG_ON(subsys[ss->subsys_id] != ss); |
4122 | return 0; | 4120 | return 0; |
4123 | } | 4121 | } |
4124 | 4122 | ||
4125 | /* | 4123 | /* |
4126 | * need to register a subsys id before anything else - for example, | 4124 | * need to register a subsys id before anything else - for example, |
4127 | * init_cgroup_css needs it. | 4125 | * init_cgroup_css needs it. |
4128 | */ | 4126 | */ |
4129 | mutex_lock(&cgroup_mutex); | 4127 | mutex_lock(&cgroup_mutex); |
4130 | /* find the first empty slot in the array */ | 4128 | /* find the first empty slot in the array */ |
4131 | for (i = CGROUP_BUILTIN_SUBSYS_COUNT; i < CGROUP_SUBSYS_COUNT; i++) { | 4129 | for (i = CGROUP_BUILTIN_SUBSYS_COUNT; i < CGROUP_SUBSYS_COUNT; i++) { |
4132 | if (subsys[i] == NULL) | 4130 | if (subsys[i] == NULL) |
4133 | break; | 4131 | break; |
4134 | } | 4132 | } |
4135 | if (i == CGROUP_SUBSYS_COUNT) { | 4133 | if (i == CGROUP_SUBSYS_COUNT) { |
4136 | /* maximum number of subsystems already registered! */ | 4134 | /* maximum number of subsystems already registered! */ |
4137 | mutex_unlock(&cgroup_mutex); | 4135 | mutex_unlock(&cgroup_mutex); |
4138 | return -EBUSY; | 4136 | return -EBUSY; |
4139 | } | 4137 | } |
4140 | /* assign ourselves the subsys_id */ | 4138 | /* assign ourselves the subsys_id */ |
4141 | ss->subsys_id = i; | 4139 | ss->subsys_id = i; |
4142 | subsys[i] = ss; | 4140 | subsys[i] = ss; |
4143 | 4141 | ||
4144 | /* | 4142 | /* |
4145 | * no ss->create seems to need anything important in the ss struct, so | 4143 | * no ss->create seems to need anything important in the ss struct, so |
4146 | * this can happen first (i.e. before the rootnode attachment). | 4144 | * this can happen first (i.e. before the rootnode attachment). |
4147 | */ | 4145 | */ |
4148 | css = ss->create(ss, dummytop); | 4146 | css = ss->create(ss, dummytop); |
4149 | if (IS_ERR(css)) { | 4147 | if (IS_ERR(css)) { |
4150 | /* failure case - need to deassign the subsys[] slot. */ | 4148 | /* failure case - need to deassign the subsys[] slot. */ |
4151 | subsys[i] = NULL; | 4149 | subsys[i] = NULL; |
4152 | mutex_unlock(&cgroup_mutex); | 4150 | mutex_unlock(&cgroup_mutex); |
4153 | return PTR_ERR(css); | 4151 | return PTR_ERR(css); |
4154 | } | 4152 | } |
4155 | 4153 | ||
4156 | list_add(&ss->sibling, &rootnode.subsys_list); | 4154 | list_add(&ss->sibling, &rootnode.subsys_list); |
4157 | ss->root = &rootnode; | 4155 | ss->root = &rootnode; |
4158 | 4156 | ||
4159 | /* our new subsystem will be attached to the dummy hierarchy. */ | 4157 | /* our new subsystem will be attached to the dummy hierarchy. */ |
4160 | init_cgroup_css(css, ss, dummytop); | 4158 | init_cgroup_css(css, ss, dummytop); |
4161 | /* init_idr must be after init_cgroup_css because it sets css->id. */ | 4159 | /* init_idr must be after init_cgroup_css because it sets css->id. */ |
4162 | if (ss->use_id) { | 4160 | if (ss->use_id) { |
4163 | int ret = cgroup_init_idr(ss, css); | 4161 | int ret = cgroup_init_idr(ss, css); |
4164 | if (ret) { | 4162 | if (ret) { |
4165 | dummytop->subsys[ss->subsys_id] = NULL; | 4163 | dummytop->subsys[ss->subsys_id] = NULL; |
4166 | ss->destroy(ss, dummytop); | 4164 | ss->destroy(ss, dummytop); |
4167 | subsys[i] = NULL; | 4165 | subsys[i] = NULL; |
4168 | mutex_unlock(&cgroup_mutex); | 4166 | mutex_unlock(&cgroup_mutex); |
4169 | return ret; | 4167 | return ret; |
4170 | } | 4168 | } |
4171 | } | 4169 | } |
4172 | 4170 | ||
4173 | /* | 4171 | /* |
4174 | * Now we need to entangle the css into the existing css_sets. unlike | 4172 | * Now we need to entangle the css into the existing css_sets. unlike |
4175 | * in cgroup_init_subsys, there are now multiple css_sets, so each one | 4173 | * in cgroup_init_subsys, there are now multiple css_sets, so each one |
4176 | * will need a new pointer to it; done by iterating the css_set_table. | 4174 | * will need a new pointer to it; done by iterating the css_set_table. |
4177 | * furthermore, modifying the existing css_sets will corrupt the hash | 4175 | * furthermore, modifying the existing css_sets will corrupt the hash |
4178 | * table state, so each changed css_set will need its hash recomputed. | 4176 | * table state, so each changed css_set will need its hash recomputed. |
4179 | * this is all done under the css_set_lock. | 4177 | * this is all done under the css_set_lock. |
4180 | */ | 4178 | */ |
4181 | write_lock(&css_set_lock); | 4179 | write_lock(&css_set_lock); |
4182 | for (i = 0; i < CSS_SET_TABLE_SIZE; i++) { | 4180 | for (i = 0; i < CSS_SET_TABLE_SIZE; i++) { |
4183 | struct css_set *cg; | 4181 | struct css_set *cg; |
4184 | struct hlist_node *node, *tmp; | 4182 | struct hlist_node *node, *tmp; |
4185 | struct hlist_head *bucket = &css_set_table[i], *new_bucket; | 4183 | struct hlist_head *bucket = &css_set_table[i], *new_bucket; |
4186 | 4184 | ||
4187 | hlist_for_each_entry_safe(cg, node, tmp, bucket, hlist) { | 4185 | hlist_for_each_entry_safe(cg, node, tmp, bucket, hlist) { |
4188 | /* skip entries that we already rehashed */ | 4186 | /* skip entries that we already rehashed */ |
4189 | if (cg->subsys[ss->subsys_id]) | 4187 | if (cg->subsys[ss->subsys_id]) |
4190 | continue; | 4188 | continue; |
4191 | /* remove existing entry */ | 4189 | /* remove existing entry */ |
4192 | hlist_del(&cg->hlist); | 4190 | hlist_del(&cg->hlist); |
4193 | /* set new value */ | 4191 | /* set new value */ |
4194 | cg->subsys[ss->subsys_id] = css; | 4192 | cg->subsys[ss->subsys_id] = css; |
4195 | /* recompute hash and restore entry */ | 4193 | /* recompute hash and restore entry */ |
4196 | new_bucket = css_set_hash(cg->subsys); | 4194 | new_bucket = css_set_hash(cg->subsys); |
4197 | hlist_add_head(&cg->hlist, new_bucket); | 4195 | hlist_add_head(&cg->hlist, new_bucket); |
4198 | } | 4196 | } |
4199 | } | 4197 | } |
4200 | write_unlock(&css_set_lock); | 4198 | write_unlock(&css_set_lock); |
4201 | 4199 | ||
4202 | mutex_init(&ss->hierarchy_mutex); | 4200 | mutex_init(&ss->hierarchy_mutex); |
4203 | lockdep_set_class(&ss->hierarchy_mutex, &ss->subsys_key); | 4201 | lockdep_set_class(&ss->hierarchy_mutex, &ss->subsys_key); |
4204 | ss->active = 1; | 4202 | ss->active = 1; |
4205 | 4203 | ||
4206 | /* success! */ | 4204 | /* success! */ |
4207 | mutex_unlock(&cgroup_mutex); | 4205 | mutex_unlock(&cgroup_mutex); |
4208 | return 0; | 4206 | return 0; |
4209 | } | 4207 | } |
4210 | EXPORT_SYMBOL_GPL(cgroup_load_subsys); | 4208 | EXPORT_SYMBOL_GPL(cgroup_load_subsys); |
4211 | 4209 | ||
4212 | /** | 4210 | /** |
4213 | * cgroup_unload_subsys: unload a modular subsystem | 4211 | * cgroup_unload_subsys: unload a modular subsystem |
4214 | * @ss: the subsystem to unload | 4212 | * @ss: the subsystem to unload |
4215 | * | 4213 | * |
4216 | * This function should be called in a modular subsystem's exitcall. When this | 4214 | * This function should be called in a modular subsystem's exitcall. When this |
4217 | * function is invoked, the refcount on the subsystem's module will be 0, so | 4215 | * function is invoked, the refcount on the subsystem's module will be 0, so |
4218 | * the subsystem will not be attached to any hierarchy. | 4216 | * the subsystem will not be attached to any hierarchy. |
4219 | */ | 4217 | */ |
4220 | void cgroup_unload_subsys(struct cgroup_subsys *ss) | 4218 | void cgroup_unload_subsys(struct cgroup_subsys *ss) |
4221 | { | 4219 | { |
4222 | struct cg_cgroup_link *link; | 4220 | struct cg_cgroup_link *link; |
4223 | struct hlist_head *hhead; | 4221 | struct hlist_head *hhead; |
4224 | 4222 | ||
4225 | BUG_ON(ss->module == NULL); | 4223 | BUG_ON(ss->module == NULL); |
4226 | 4224 | ||
4227 | /* | 4225 | /* |
4228 | * we shouldn't be called if the subsystem is in use, and the use of | 4226 | * we shouldn't be called if the subsystem is in use, and the use of |
4229 | * try_module_get in parse_cgroupfs_options should ensure that it | 4227 | * try_module_get in parse_cgroupfs_options should ensure that it |
4230 | * doesn't start being used while we're killing it off. | 4228 | * doesn't start being used while we're killing it off. |
4231 | */ | 4229 | */ |
4232 | BUG_ON(ss->root != &rootnode); | 4230 | BUG_ON(ss->root != &rootnode); |
4233 | 4231 | ||
4234 | mutex_lock(&cgroup_mutex); | 4232 | mutex_lock(&cgroup_mutex); |
4235 | /* deassign the subsys_id */ | 4233 | /* deassign the subsys_id */ |
4236 | BUG_ON(ss->subsys_id < CGROUP_BUILTIN_SUBSYS_COUNT); | 4234 | BUG_ON(ss->subsys_id < CGROUP_BUILTIN_SUBSYS_COUNT); |
4237 | subsys[ss->subsys_id] = NULL; | 4235 | subsys[ss->subsys_id] = NULL; |
4238 | 4236 | ||
4239 | /* remove subsystem from rootnode's list of subsystems */ | 4237 | /* remove subsystem from rootnode's list of subsystems */ |
4240 | list_del_init(&ss->sibling); | 4238 | list_del_init(&ss->sibling); |
4241 | 4239 | ||
4242 | /* | 4240 | /* |
4243 | * disentangle the css from all css_sets attached to the dummytop. as | 4241 | * disentangle the css from all css_sets attached to the dummytop. as |
4244 | * in loading, we need to pay our respects to the hashtable gods. | 4242 | * in loading, we need to pay our respects to the hashtable gods. |
4245 | */ | 4243 | */ |
4246 | write_lock(&css_set_lock); | 4244 | write_lock(&css_set_lock); |
4247 | list_for_each_entry(link, &dummytop->css_sets, cgrp_link_list) { | 4245 | list_for_each_entry(link, &dummytop->css_sets, cgrp_link_list) { |
4248 | struct css_set *cg = link->cg; | 4246 | struct css_set *cg = link->cg; |
4249 | 4247 | ||
4250 | hlist_del(&cg->hlist); | 4248 | hlist_del(&cg->hlist); |
4251 | BUG_ON(!cg->subsys[ss->subsys_id]); | 4249 | BUG_ON(!cg->subsys[ss->subsys_id]); |
4252 | cg->subsys[ss->subsys_id] = NULL; | 4250 | cg->subsys[ss->subsys_id] = NULL; |
4253 | hhead = css_set_hash(cg->subsys); | 4251 | hhead = css_set_hash(cg->subsys); |
4254 | hlist_add_head(&cg->hlist, hhead); | 4252 | hlist_add_head(&cg->hlist, hhead); |
4255 | } | 4253 | } |
4256 | write_unlock(&css_set_lock); | 4254 | write_unlock(&css_set_lock); |
4257 | 4255 | ||
4258 | /* | 4256 | /* |
4259 | * remove subsystem's css from the dummytop and free it - need to free | 4257 | * remove subsystem's css from the dummytop and free it - need to free |
4260 | * before marking as null because ss->destroy needs the cgrp->subsys | 4258 | * before marking as null because ss->destroy needs the cgrp->subsys |
4261 | * pointer to find their state. note that this also takes care of | 4259 | * pointer to find their state. note that this also takes care of |
4262 | * freeing the css_id. | 4260 | * freeing the css_id. |
4263 | */ | 4261 | */ |
4264 | ss->destroy(ss, dummytop); | 4262 | ss->destroy(ss, dummytop); |
4265 | dummytop->subsys[ss->subsys_id] = NULL; | 4263 | dummytop->subsys[ss->subsys_id] = NULL; |
4266 | 4264 | ||
4267 | mutex_unlock(&cgroup_mutex); | 4265 | mutex_unlock(&cgroup_mutex); |
4268 | } | 4266 | } |
4269 | EXPORT_SYMBOL_GPL(cgroup_unload_subsys); | 4267 | EXPORT_SYMBOL_GPL(cgroup_unload_subsys); |
4270 | 4268 | ||
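To tie the two entry points together, here is a hedged sketch of the module-side usage the kernel-doc comments above describe. example_subsys and its callbacks are hypothetical; a real subsystem keeps richer per-cgroup state (typically a structure embedding the css, accessed with container_of), and, as enforced in cgroup_load_subsys(), it must not provide fork or exit callbacks.

/* Sketch only: a modular cgroup subsystem loading and unloading itself. */
#include <linux/module.h>
#include <linux/cgroup.h>
#include <linux/slab.h>
#include <linux/err.h>

static struct cgroup_subsys_state *example_create(struct cgroup_subsys *ss,
						  struct cgroup *cgrp)
{
	struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL);

	if (!css)
		return ERR_PTR(-ENOMEM);
	return css;
}

static void example_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
{
	/* The core stored our css in cgrp->subsys[] after ->create(). */
	kfree(cgrp->subsys[ss->subsys_id]);
}

static struct cgroup_subsys example_subsys = {
	.name		= "example",
	.create		= example_create,
	.destroy	= example_destroy,
	.module		= THIS_MODULE,
	/* no .fork/.exit: rejected for modular subsystems, see above */
};

static int __init example_cgroup_init(void)
{
	return cgroup_load_subsys(&example_subsys);	/* assigns a dynamic subsys_id */
}

static void __exit example_cgroup_exit(void)
{
	cgroup_unload_subsys(&example_subsys);
}

module_init(example_cgroup_init);
module_exit(example_cgroup_exit);
MODULE_LICENSE("GPL");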
4271 | /** | 4269 | /** |
4272 | * cgroup_init_early - cgroup initialization at system boot | 4270 | * cgroup_init_early - cgroup initialization at system boot |
4273 | * | 4271 | * |
4274 | * Initialize cgroups at system boot, and initialize any | 4272 | * Initialize cgroups at system boot, and initialize any |
4275 | * subsystems that request early init. | 4273 | * subsystems that request early init. |
4276 | */ | 4274 | */ |
4277 | int __init cgroup_init_early(void) | 4275 | int __init cgroup_init_early(void) |
4278 | { | 4276 | { |
4279 | int i; | 4277 | int i; |
4280 | atomic_set(&init_css_set.refcount, 1); | 4278 | atomic_set(&init_css_set.refcount, 1); |
4281 | INIT_LIST_HEAD(&init_css_set.cg_links); | 4279 | INIT_LIST_HEAD(&init_css_set.cg_links); |
4282 | INIT_LIST_HEAD(&init_css_set.tasks); | 4280 | INIT_LIST_HEAD(&init_css_set.tasks); |
4283 | INIT_HLIST_NODE(&init_css_set.hlist); | 4281 | INIT_HLIST_NODE(&init_css_set.hlist); |
4284 | css_set_count = 1; | 4282 | css_set_count = 1; |
4285 | init_cgroup_root(&rootnode); | 4283 | init_cgroup_root(&rootnode); |
4286 | root_count = 1; | 4284 | root_count = 1; |
4287 | init_task.cgroups = &init_css_set; | 4285 | init_task.cgroups = &init_css_set; |
4288 | 4286 | ||
4289 | init_css_set_link.cg = &init_css_set; | 4287 | init_css_set_link.cg = &init_css_set; |
4290 | init_css_set_link.cgrp = dummytop; | 4288 | init_css_set_link.cgrp = dummytop; |
4291 | list_add(&init_css_set_link.cgrp_link_list, | 4289 | list_add(&init_css_set_link.cgrp_link_list, |
4292 | &rootnode.top_cgroup.css_sets); | 4290 | &rootnode.top_cgroup.css_sets); |
4293 | list_add(&init_css_set_link.cg_link_list, | 4291 | list_add(&init_css_set_link.cg_link_list, |
4294 | &init_css_set.cg_links); | 4292 | &init_css_set.cg_links); |
4295 | 4293 | ||
4296 | for (i = 0; i < CSS_SET_TABLE_SIZE; i++) | 4294 | for (i = 0; i < CSS_SET_TABLE_SIZE; i++) |
4297 | INIT_HLIST_HEAD(&css_set_table[i]); | 4295 | INIT_HLIST_HEAD(&css_set_table[i]); |
4298 | 4296 | ||
4299 | /* at bootup time, we don't worry about modular subsystems */ | 4297 | /* at bootup time, we don't worry about modular subsystems */ |
4300 | for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) { | 4298 | for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) { |
4301 | struct cgroup_subsys *ss = subsys[i]; | 4299 | struct cgroup_subsys *ss = subsys[i]; |
4302 | 4300 | ||
4303 | BUG_ON(!ss->name); | 4301 | BUG_ON(!ss->name); |
4304 | BUG_ON(strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN); | 4302 | BUG_ON(strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN); |
4305 | BUG_ON(!ss->create); | 4303 | BUG_ON(!ss->create); |
4306 | BUG_ON(!ss->destroy); | 4304 | BUG_ON(!ss->destroy); |
4307 | if (ss->subsys_id != i) { | 4305 | if (ss->subsys_id != i) { |
4308 | printk(KERN_ERR "cgroup: Subsys %s id == %d\n", | 4306 | printk(KERN_ERR "cgroup: Subsys %s id == %d\n", |
4309 | ss->name, ss->subsys_id); | 4307 | ss->name, ss->subsys_id); |
4310 | BUG(); | 4308 | BUG(); |
4311 | } | 4309 | } |
4312 | 4310 | ||
4313 | if (ss->early_init) | 4311 | if (ss->early_init) |
4314 | cgroup_init_subsys(ss); | 4312 | cgroup_init_subsys(ss); |
4315 | } | 4313 | } |
4316 | return 0; | 4314 | return 0; |
4317 | } | 4315 | } |
4318 | 4316 | ||
4319 | /** | 4317 | /** |
4320 | * cgroup_init - cgroup initialization | 4318 | * cgroup_init - cgroup initialization |
4321 | * | 4319 | * |
4322 | * Register cgroup filesystem and /proc file, and initialize | 4320 | * Register cgroup filesystem and /proc file, and initialize |
4323 | * any subsystems that didn't request early init. | 4321 | * any subsystems that didn't request early init. |
4324 | */ | 4322 | */ |
4325 | int __init cgroup_init(void) | 4323 | int __init cgroup_init(void) |
4326 | { | 4324 | { |
4327 | int err; | 4325 | int err; |
4328 | int i; | 4326 | int i; |
4329 | struct hlist_head *hhead; | 4327 | struct hlist_head *hhead; |
4330 | 4328 | ||
4331 | err = bdi_init(&cgroup_backing_dev_info); | 4329 | err = bdi_init(&cgroup_backing_dev_info); |
4332 | if (err) | 4330 | if (err) |
4333 | return err; | 4331 | return err; |
4334 | 4332 | ||
4335 | /* at bootup time, we don't worry about modular subsystems */ | 4333 | /* at bootup time, we don't worry about modular subsystems */ |
4336 | for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) { | 4334 | for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) { |
4337 | struct cgroup_subsys *ss = subsys[i]; | 4335 | struct cgroup_subsys *ss = subsys[i]; |
4338 | if (!ss->early_init) | 4336 | if (!ss->early_init) |
4339 | cgroup_init_subsys(ss); | 4337 | cgroup_init_subsys(ss); |
4340 | if (ss->use_id) | 4338 | if (ss->use_id) |
4341 | cgroup_init_idr(ss, init_css_set.subsys[ss->subsys_id]); | 4339 | cgroup_init_idr(ss, init_css_set.subsys[ss->subsys_id]); |
4342 | } | 4340 | } |
4343 | 4341 | ||
4344 | /* Add init_css_set to the hash table */ | 4342 | /* Add init_css_set to the hash table */ |
4345 | hhead = css_set_hash(init_css_set.subsys); | 4343 | hhead = css_set_hash(init_css_set.subsys); |
4346 | hlist_add_head(&init_css_set.hlist, hhead); | 4344 | hlist_add_head(&init_css_set.hlist, hhead); |
4347 | BUG_ON(!init_root_id(&rootnode)); | 4345 | BUG_ON(!init_root_id(&rootnode)); |
4348 | 4346 | ||
4349 | cgroup_kobj = kobject_create_and_add("cgroup", fs_kobj); | 4347 | cgroup_kobj = kobject_create_and_add("cgroup", fs_kobj); |
4350 | if (!cgroup_kobj) { | 4348 | if (!cgroup_kobj) { |
4351 | err = -ENOMEM; | 4349 | err = -ENOMEM; |
4352 | goto out; | 4350 | goto out; |
4353 | } | 4351 | } |
4354 | 4352 | ||
4355 | err = register_filesystem(&cgroup_fs_type); | 4353 | err = register_filesystem(&cgroup_fs_type); |
4356 | if (err < 0) { | 4354 | if (err < 0) { |
4357 | kobject_put(cgroup_kobj); | 4355 | kobject_put(cgroup_kobj); |
4358 | goto out; | 4356 | goto out; |
4359 | } | 4357 | } |
4360 | 4358 | ||
4361 | proc_create("cgroups", 0, NULL, &proc_cgroupstats_operations); | 4359 | proc_create("cgroups", 0, NULL, &proc_cgroupstats_operations); |
4362 | 4360 | ||
4363 | out: | 4361 | out: |
4364 | if (err) | 4362 | if (err) |
4365 | bdi_destroy(&cgroup_backing_dev_info); | 4363 | bdi_destroy(&cgroup_backing_dev_info); |
4366 | 4364 | ||
4367 | return err; | 4365 | return err; |
4368 | } | 4366 | } |
4369 | 4367 | ||
4370 | /* | 4368 | /* |
4371 | * proc_cgroup_show() | 4369 | * proc_cgroup_show() |
4372 | * - Print task's cgroup paths into seq_file, one line for each hierarchy | 4370 | * - Print task's cgroup paths into seq_file, one line for each hierarchy |
4373 | * - Used for /proc/<pid>/cgroup. | 4371 | * - Used for /proc/<pid>/cgroup. |
4374 | * - No need to task_lock(tsk) on this tsk->cgroup reference, as it | 4372 | * - No need to task_lock(tsk) on this tsk->cgroup reference, as it |
4375 | * doesn't really matter if tsk->cgroup changes after we read it, | 4373 | * doesn't really matter if tsk->cgroup changes after we read it, |
4376 | * and we take cgroup_mutex, keeping cgroup_attach_task() from changing it | 4374 | * and we take cgroup_mutex, keeping cgroup_attach_task() from changing it |
4377 | * anyway. No need to check that tsk->cgroup != NULL, thanks to | 4375 | * anyway. No need to check that tsk->cgroup != NULL, thanks to |
4378 | * the_top_cgroup_hack in cgroup_exit(), which sets an exiting task's | 4376 | * the_top_cgroup_hack in cgroup_exit(), which sets an exiting task's |
4379 | * cgroup to top_cgroup. | 4377 | * cgroup to top_cgroup. |
4380 | */ | 4378 | */ |
4381 | 4379 | ||
4382 | /* TODO: Use a proper seq_file iterator */ | 4380 | /* TODO: Use a proper seq_file iterator */ |
4383 | static int proc_cgroup_show(struct seq_file *m, void *v) | 4381 | static int proc_cgroup_show(struct seq_file *m, void *v) |
4384 | { | 4382 | { |
4385 | struct pid *pid; | 4383 | struct pid *pid; |
4386 | struct task_struct *tsk; | 4384 | struct task_struct *tsk; |
4387 | char *buf; | 4385 | char *buf; |
4388 | int retval; | 4386 | int retval; |
4389 | struct cgroupfs_root *root; | 4387 | struct cgroupfs_root *root; |
4390 | 4388 | ||
4391 | retval = -ENOMEM; | 4389 | retval = -ENOMEM; |
4392 | buf = kmalloc(PAGE_SIZE, GFP_KERNEL); | 4390 | buf = kmalloc(PAGE_SIZE, GFP_KERNEL); |
4393 | if (!buf) | 4391 | if (!buf) |
4394 | goto out; | 4392 | goto out; |
4395 | 4393 | ||
4396 | retval = -ESRCH; | 4394 | retval = -ESRCH; |
4397 | pid = m->private; | 4395 | pid = m->private; |
4398 | tsk = get_pid_task(pid, PIDTYPE_PID); | 4396 | tsk = get_pid_task(pid, PIDTYPE_PID); |
4399 | if (!tsk) | 4397 | if (!tsk) |
4400 | goto out_free; | 4398 | goto out_free; |
4401 | 4399 | ||
4402 | retval = 0; | 4400 | retval = 0; |
4403 | 4401 | ||
4404 | mutex_lock(&cgroup_mutex); | 4402 | mutex_lock(&cgroup_mutex); |
4405 | 4403 | ||
4406 | for_each_active_root(root) { | 4404 | for_each_active_root(root) { |
4407 | struct cgroup_subsys *ss; | 4405 | struct cgroup_subsys *ss; |
4408 | struct cgroup *cgrp; | 4406 | struct cgroup *cgrp; |
4409 | int count = 0; | 4407 | int count = 0; |
4410 | 4408 | ||
4411 | seq_printf(m, "%d:", root->hierarchy_id); | 4409 | seq_printf(m, "%d:", root->hierarchy_id); |
4412 | for_each_subsys(root, ss) | 4410 | for_each_subsys(root, ss) |
4413 | seq_printf(m, "%s%s", count++ ? "," : "", ss->name); | 4411 | seq_printf(m, "%s%s", count++ ? "," : "", ss->name); |
4414 | if (strlen(root->name)) | 4412 | if (strlen(root->name)) |
4415 | seq_printf(m, "%sname=%s", count ? "," : "", | 4413 | seq_printf(m, "%sname=%s", count ? "," : "", |
4416 | root->name); | 4414 | root->name); |
4417 | seq_putc(m, ':'); | 4415 | seq_putc(m, ':'); |
4418 | cgrp = task_cgroup_from_root(tsk, root); | 4416 | cgrp = task_cgroup_from_root(tsk, root); |
4419 | retval = cgroup_path(cgrp, buf, PAGE_SIZE); | 4417 | retval = cgroup_path(cgrp, buf, PAGE_SIZE); |
4420 | if (retval < 0) | 4418 | if (retval < 0) |
4421 | goto out_unlock; | 4419 | goto out_unlock; |
4422 | seq_puts(m, buf); | 4420 | seq_puts(m, buf); |
4423 | seq_putc(m, '\n'); | 4421 | seq_putc(m, '\n'); |
4424 | } | 4422 | } |
4425 | 4423 | ||
4426 | out_unlock: | 4424 | out_unlock: |
4427 | mutex_unlock(&cgroup_mutex); | 4425 | mutex_unlock(&cgroup_mutex); |
4428 | put_task_struct(tsk); | 4426 | put_task_struct(tsk); |
4429 | out_free: | 4427 | out_free: |
4430 | kfree(buf); | 4428 | kfree(buf); |
4431 | out: | 4429 | out: |
4432 | return retval; | 4430 | return retval; |
4433 | } | 4431 | } |
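
For reference, each pass through the loop above emits one line of /proc/<pid>/cgroup in the form <hierarchy-id>:<comma-separated subsystems>[,name=<root name>]:<cgroup path>. With purely illustrative values (not taken from this patch), a read might return:

    2:cpu,cpuacct:/daemons
    3:cpuset:/
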
4434 | 4432 | ||
4435 | static int cgroup_open(struct inode *inode, struct file *file) | 4433 | static int cgroup_open(struct inode *inode, struct file *file) |
4436 | { | 4434 | { |
4437 | struct pid *pid = PROC_I(inode)->pid; | 4435 | struct pid *pid = PROC_I(inode)->pid; |
4438 | return single_open(file, proc_cgroup_show, pid); | 4436 | return single_open(file, proc_cgroup_show, pid); |
4439 | } | 4437 | } |
4440 | 4438 | ||
4441 | const struct file_operations proc_cgroup_operations = { | 4439 | const struct file_operations proc_cgroup_operations = { |
4442 | .open = cgroup_open, | 4440 | .open = cgroup_open, |
4443 | .read = seq_read, | 4441 | .read = seq_read, |
4444 | .llseek = seq_lseek, | 4442 | .llseek = seq_lseek, |
4445 | .release = single_release, | 4443 | .release = single_release, |
4446 | }; | 4444 | }; |
4447 | 4445 | ||
4448 | /* Display information about each subsystem and each hierarchy */ | 4446 | /* Display information about each subsystem and each hierarchy */ |
4449 | static int proc_cgroupstats_show(struct seq_file *m, void *v) | 4447 | static int proc_cgroupstats_show(struct seq_file *m, void *v) |
4450 | { | 4448 | { |
4451 | int i; | 4449 | int i; |
4452 | 4450 | ||
4453 | seq_puts(m, "#subsys_name\thierarchy\tnum_cgroups\tenabled\n"); | 4451 | seq_puts(m, "#subsys_name\thierarchy\tnum_cgroups\tenabled\n"); |
4454 | /* | 4452 | /* |
4455 | * ideally we don't want subsystems moving around while we do this. | 4453 | * ideally we don't want subsystems moving around while we do this. |
4456 | * cgroup_mutex is also necessary to guarantee an atomic snapshot of | 4454 | * cgroup_mutex is also necessary to guarantee an atomic snapshot of |
4457 | * subsys/hierarchy state. | 4455 | * subsys/hierarchy state. |
4458 | */ | 4456 | */ |
4459 | mutex_lock(&cgroup_mutex); | 4457 | mutex_lock(&cgroup_mutex); |
4460 | for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { | 4458 | for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { |
4461 | struct cgroup_subsys *ss = subsys[i]; | 4459 | struct cgroup_subsys *ss = subsys[i]; |
4462 | if (ss == NULL) | 4460 | if (ss == NULL) |
4463 | continue; | 4461 | continue; |
4464 | seq_printf(m, "%s\t%d\t%d\t%d\n", | 4462 | seq_printf(m, "%s\t%d\t%d\t%d\n", |
4465 | ss->name, ss->root->hierarchy_id, | 4463 | ss->name, ss->root->hierarchy_id, |
4466 | ss->root->number_of_cgroups, !ss->disabled); | 4464 | ss->root->number_of_cgroups, !ss->disabled); |
4467 | } | 4465 | } |
4468 | mutex_unlock(&cgroup_mutex); | 4466 | mutex_unlock(&cgroup_mutex); |
4469 | return 0; | 4467 | return 0; |
4470 | } | 4468 | } |
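
proc_cgroupstats_show() backs the /proc/cgroups file registered in cgroup_init(); its columns are tab-separated and follow the header printed above. With illustrative values only, the output might look like:

    #subsys_name    hierarchy    num_cgroups    enabled
    cpuset          2            1              1
    debug           0            1              1
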
4471 | 4469 | ||
4472 | static int cgroupstats_open(struct inode *inode, struct file *file) | 4470 | static int cgroupstats_open(struct inode *inode, struct file *file) |
4473 | { | 4471 | { |
4474 | return single_open(file, proc_cgroupstats_show, NULL); | 4472 | return single_open(file, proc_cgroupstats_show, NULL); |
4475 | } | 4473 | } |
4476 | 4474 | ||
4477 | static const struct file_operations proc_cgroupstats_operations = { | 4475 | static const struct file_operations proc_cgroupstats_operations = { |
4478 | .open = cgroupstats_open, | 4476 | .open = cgroupstats_open, |
4479 | .read = seq_read, | 4477 | .read = seq_read, |
4480 | .llseek = seq_lseek, | 4478 | .llseek = seq_lseek, |
4481 | .release = single_release, | 4479 | .release = single_release, |
4482 | }; | 4480 | }; |
4483 | 4481 | ||
4484 | /** | 4482 | /** |
4485 | * cgroup_fork - attach newly forked task to its parent's cgroup. | 4483 | * cgroup_fork - attach newly forked task to its parent's cgroup. |
4486 | * @child: pointer to task_struct of forking parent process. | 4484 | * @child: pointer to task_struct of forking parent process. |
4487 | * | 4485 | * |
4488 | * Description: A task inherits its parent's cgroup at fork(). | 4486 | * Description: A task inherits its parent's cgroup at fork(). |
4489 | * | 4487 | * |
4490 | * A pointer to the shared css_set was automatically copied in | 4488 | * A pointer to the shared css_set was automatically copied in |
4491 | * fork.c by dup_task_struct(). However, we ignore that copy, since | 4489 | * fork.c by dup_task_struct(). However, we ignore that copy, since |
4492 | * it was not made under the protection of RCU or cgroup_mutex, so | 4490 | * it was not made under the protection of RCU or cgroup_mutex, so |
4493 | * might no longer be a valid cgroup pointer. cgroup_attach_task() might | 4491 | * might no longer be a valid cgroup pointer. cgroup_attach_task() might |
4494 | * have already changed current->cgroups, allowing the previously | 4492 | * have already changed current->cgroups, allowing the previously |
4495 | * referenced cgroup group to be removed and freed. | 4493 | * referenced cgroup group to be removed and freed. |
4496 | * | 4494 | * |
4497 | * At the point that cgroup_fork() is called, 'current' is the parent | 4495 | * At the point that cgroup_fork() is called, 'current' is the parent |
4498 | * task, and the passed argument 'child' points to the child task. | 4496 | * task, and the passed argument 'child' points to the child task. |
4499 | */ | 4497 | */ |
4500 | void cgroup_fork(struct task_struct *child) | 4498 | void cgroup_fork(struct task_struct *child) |
4501 | { | 4499 | { |
4502 | task_lock(current); | 4500 | task_lock(current); |
4503 | child->cgroups = current->cgroups; | 4501 | child->cgroups = current->cgroups; |
4504 | get_css_set(child->cgroups); | 4502 | get_css_set(child->cgroups); |
4505 | task_unlock(current); | 4503 | task_unlock(current); |
4506 | INIT_LIST_HEAD(&child->cg_list); | 4504 | INIT_LIST_HEAD(&child->cg_list); |
4507 | } | 4505 | } |
4508 | 4506 | ||
4509 | /** | 4507 | /** |
4510 | * cgroup_fork_callbacks - run fork callbacks | 4508 | * cgroup_fork_callbacks - run fork callbacks |
4511 | * @child: the new task | 4509 | * @child: the new task |
4512 | * | 4510 | * |
4513 | * Called on a new task very soon before adding it to the | 4511 | * Called on a new task very soon before adding it to the |
4514 | * tasklist. No need to take any locks since no-one can | 4512 | * tasklist. No need to take any locks since no-one can |
4515 | * be operating on this task. | 4513 | * be operating on this task. |
4516 | */ | 4514 | */ |
4517 | void cgroup_fork_callbacks(struct task_struct *child) | 4515 | void cgroup_fork_callbacks(struct task_struct *child) |
4518 | { | 4516 | { |
4519 | if (need_forkexit_callback) { | 4517 | if (need_forkexit_callback) { |
4520 | int i; | 4518 | int i; |
4521 | /* | 4519 | /* |
4522 | * forkexit callbacks are only supported for builtin | 4520 | * forkexit callbacks are only supported for builtin |
4523 | * subsystems, and the builtin section of the subsys array is | 4521 | * subsystems, and the builtin section of the subsys array is |
4524 | * immutable, so we don't need to lock the subsys array here. | 4522 | * immutable, so we don't need to lock the subsys array here. |
4525 | */ | 4523 | */ |
4526 | for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) { | 4524 | for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) { |
4527 | struct cgroup_subsys *ss = subsys[i]; | 4525 | struct cgroup_subsys *ss = subsys[i]; |
4528 | if (ss->fork) | 4526 | if (ss->fork) |
4529 | ss->fork(ss, child); | 4527 | ss->fork(ss, child); |
4530 | } | 4528 | } |
4531 | } | 4529 | } |
4532 | } | 4530 | } |
4533 | 4531 | ||
4534 | /** | 4532 | /** |
4535 | * cgroup_post_fork - called on a new task after adding it to the task list | 4533 | * cgroup_post_fork - called on a new task after adding it to the task list |
4536 | * @child: the task in question | 4534 | * @child: the task in question |
4537 | * | 4535 | * |
4538 | * Adds the task to the list running through its css_set if necessary. | 4536 | * Adds the task to the list running through its css_set if necessary. |
4539 | * Has to be after the task is visible on the task list in case we race | 4537 | * Has to be after the task is visible on the task list in case we race |
4540 | * with the first call to cgroup_iter_start() - to guarantee that the | 4538 | * with the first call to cgroup_iter_start() - to guarantee that the |
4541 | * new task ends up on its list. | 4539 | * new task ends up on its list. |
4542 | */ | 4540 | */ |
4543 | void cgroup_post_fork(struct task_struct *child) | 4541 | void cgroup_post_fork(struct task_struct *child) |
4544 | { | 4542 | { |
4545 | if (use_task_css_set_links) { | 4543 | if (use_task_css_set_links) { |
4546 | write_lock(&css_set_lock); | 4544 | write_lock(&css_set_lock); |
4547 | task_lock(child); | 4545 | task_lock(child); |
4548 | if (list_empty(&child->cg_list)) | 4546 | if (list_empty(&child->cg_list)) |
4549 | list_add(&child->cg_list, &child->cgroups->tasks); | 4547 | list_add(&child->cg_list, &child->cgroups->tasks); |
4550 | task_unlock(child); | 4548 | task_unlock(child); |
4551 | write_unlock(&css_set_lock); | 4549 | write_unlock(&css_set_lock); |
4552 | } | 4550 | } |
4553 | } | 4551 | } |
4554 | /** | 4552 | /** |
4555 | * cgroup_exit - detach cgroup from exiting task | 4553 | * cgroup_exit - detach cgroup from exiting task |
4556 | * @tsk: pointer to task_struct of exiting process | 4554 | * @tsk: pointer to task_struct of exiting process |
4557 | * @run_callbacks: run exit callbacks? | 4555 | * @run_callbacks: run exit callbacks? |
4558 | * | 4556 | * |
4559 | * Description: Detach cgroup from @tsk and release it. | 4557 | * Description: Detach cgroup from @tsk and release it. |
4560 | * | 4558 | * |
4561 | * Note that cgroups marked notify_on_release force every task in | 4559 | * Note that cgroups marked notify_on_release force every task in |
4562 | * them to take the global cgroup_mutex mutex when exiting. | 4560 | * them to take the global cgroup_mutex mutex when exiting. |
4563 | * This could impact scaling on very large systems. Be reluctant to | 4561 | * This could impact scaling on very large systems. Be reluctant to |
4564 | * use notify_on_release cgroups where very high task exit scaling | 4562 | * use notify_on_release cgroups where very high task exit scaling |
4565 | * is required on large systems. | 4563 | * is required on large systems. |
4566 | * | 4564 | * |
4567 | * the_top_cgroup_hack: | 4565 | * the_top_cgroup_hack: |
4568 | * | 4566 | * |
4569 | * Set the exiting task's cgroup to the root cgroup (top_cgroup). | 4567 | * Set the exiting task's cgroup to the root cgroup (top_cgroup). |
4570 | * | 4568 | * |
4571 | * We call cgroup_exit() while the task is still competent to | 4569 | * We call cgroup_exit() while the task is still competent to |
4572 | * handle notify_on_release(), then leave the task attached to the | 4570 | * handle notify_on_release(), then leave the task attached to the |
4573 | * root cgroup in each hierarchy for the remainder of its exit. | 4571 | * root cgroup in each hierarchy for the remainder of its exit. |
4574 | * | 4572 | * |
4575 | * To do this properly, we would increment the reference count on | 4573 | * To do this properly, we would increment the reference count on |
4576 | * top_cgroup, and near the very end of the kernel/exit.c do_exit() | 4574 | * top_cgroup, and near the very end of the kernel/exit.c do_exit() |
4577 | * code we would add a second cgroup function call, to drop that | 4575 | * code we would add a second cgroup function call, to drop that |
4578 | * reference. This would just create an unnecessary hot spot on | 4576 | * reference. This would just create an unnecessary hot spot on |
4579 | * the top_cgroup reference count, to no avail. | 4577 | * the top_cgroup reference count, to no avail. |
4580 | * | 4578 | * |
4581 | * Normally, holding a reference to a cgroup without bumping its | 4579 | * Normally, holding a reference to a cgroup without bumping its |
4582 | * count is unsafe. The cgroup could go away, or someone could | 4580 | * count is unsafe. The cgroup could go away, or someone could |
4583 | * attach us to a different cgroup, decrementing the count on | 4581 | * attach us to a different cgroup, decrementing the count on |
4584 | * the first cgroup that we never incremented. But in this case, | 4582 | * the first cgroup that we never incremented. But in this case, |
4585 | * top_cgroup isn't going away, and either task has PF_EXITING set, | 4583 | * top_cgroup isn't going away, and either task has PF_EXITING set, |
4586 | * which wards off any cgroup_attach_task() attempts, or task is a failed | 4584 | * which wards off any cgroup_attach_task() attempts, or task is a failed |
4587 | * fork, never visible to cgroup_attach_task. | 4585 | * fork, never visible to cgroup_attach_task. |
4588 | */ | 4586 | */ |
4589 | void cgroup_exit(struct task_struct *tsk, int run_callbacks) | 4587 | void cgroup_exit(struct task_struct *tsk, int run_callbacks) |
4590 | { | 4588 | { |
4591 | struct css_set *cg; | 4589 | struct css_set *cg; |
4592 | int i; | 4590 | int i; |
4593 | 4591 | ||
4594 | /* | 4592 | /* |
4595 | * Unlink from the css_set task list if necessary. | 4593 | * Unlink from the css_set task list if necessary. |
4596 | * Optimistically check cg_list before taking | 4594 | * Optimistically check cg_list before taking |
4597 | * css_set_lock | 4595 | * css_set_lock |
4598 | */ | 4596 | */ |
4599 | if (!list_empty(&tsk->cg_list)) { | 4597 | if (!list_empty(&tsk->cg_list)) { |
4600 | write_lock(&css_set_lock); | 4598 | write_lock(&css_set_lock); |
4601 | if (!list_empty(&tsk->cg_list)) | 4599 | if (!list_empty(&tsk->cg_list)) |
4602 | list_del_init(&tsk->cg_list); | 4600 | list_del_init(&tsk->cg_list); |
4603 | write_unlock(&css_set_lock); | 4601 | write_unlock(&css_set_lock); |
4604 | } | 4602 | } |
4605 | 4603 | ||
4606 | /* Reassign the task to the init_css_set. */ | 4604 | /* Reassign the task to the init_css_set. */ |
4607 | task_lock(tsk); | 4605 | task_lock(tsk); |
4608 | cg = tsk->cgroups; | 4606 | cg = tsk->cgroups; |
4609 | tsk->cgroups = &init_css_set; | 4607 | tsk->cgroups = &init_css_set; |
4610 | 4608 | ||
4611 | if (run_callbacks && need_forkexit_callback) { | 4609 | if (run_callbacks && need_forkexit_callback) { |
4612 | /* | 4610 | /* |
4613 | * modular subsystems can't use callbacks, so no need to lock | 4611 | * modular subsystems can't use callbacks, so no need to lock |
4614 | * the subsys array | 4612 | * the subsys array |
4615 | */ | 4613 | */ |
4616 | for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) { | 4614 | for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) { |
4617 | struct cgroup_subsys *ss = subsys[i]; | 4615 | struct cgroup_subsys *ss = subsys[i]; |
4618 | if (ss->exit) { | 4616 | if (ss->exit) { |
4619 | struct cgroup *old_cgrp = | 4617 | struct cgroup *old_cgrp = |
4620 | rcu_dereference_raw(cg->subsys[i])->cgroup; | 4618 | rcu_dereference_raw(cg->subsys[i])->cgroup; |
4621 | struct cgroup *cgrp = task_cgroup(tsk, i); | 4619 | struct cgroup *cgrp = task_cgroup(tsk, i); |
4622 | ss->exit(ss, cgrp, old_cgrp, tsk); | 4620 | ss->exit(ss, cgrp, old_cgrp, tsk); |
4623 | } | 4621 | } |
4624 | } | 4622 | } |
4625 | } | 4623 | } |
4626 | task_unlock(tsk); | 4624 | task_unlock(tsk); |
4627 | 4625 | ||
4628 | if (cg) | 4626 | if (cg) |
4629 | put_css_set_taskexit(cg); | 4627 | put_css_set_taskexit(cg); |
4630 | } | 4628 | } |
4631 | 4629 | ||
4632 | /** | 4630 | /** |
4633 | * cgroup_is_descendant - see if @cgrp is a descendant of @task's cgrp | 4631 | * cgroup_is_descendant - see if @cgrp is a descendant of @task's cgrp |
4634 | * @cgrp: the cgroup in question | 4632 | * @cgrp: the cgroup in question |
4635 | * @task: the task in question | 4633 | * @task: the task in question |
4636 | * | 4634 | * |
4637 | * See if @cgrp is a descendant of @task's cgroup in the appropriate | 4635 | * See if @cgrp is a descendant of @task's cgroup in the appropriate |
4638 | * hierarchy. | 4636 | * hierarchy. |
4639 | * | 4637 | * |
4640 | * If we are sending in dummytop, then presumably we are creating | 4638 | * If we are sending in dummytop, then presumably we are creating |
4641 | * the top cgroup in the subsystem. | 4639 | * the top cgroup in the subsystem. |
4642 | * | 4640 | * |
4643 | * Called only by the ns (nsproxy) cgroup. | 4641 | * Called only by the ns (nsproxy) cgroup. |
4644 | */ | 4642 | */ |
4645 | int cgroup_is_descendant(const struct cgroup *cgrp, struct task_struct *task) | 4643 | int cgroup_is_descendant(const struct cgroup *cgrp, struct task_struct *task) |
4646 | { | 4644 | { |
4647 | int ret; | 4645 | int ret; |
4648 | struct cgroup *target; | 4646 | struct cgroup *target; |
4649 | 4647 | ||
4650 | if (cgrp == dummytop) | 4648 | if (cgrp == dummytop) |
4651 | return 1; | 4649 | return 1; |
4652 | 4650 | ||
4653 | target = task_cgroup_from_root(task, cgrp->root); | 4651 | target = task_cgroup_from_root(task, cgrp->root); |
4654 | while (cgrp != target && cgrp!= cgrp->top_cgroup) | 4652 | while (cgrp != target && cgrp!= cgrp->top_cgroup) |
4655 | cgrp = cgrp->parent; | 4653 | cgrp = cgrp->parent; |
4656 | ret = (cgrp == target); | 4654 | ret = (cgrp == target); |
4657 | return ret; | 4655 | return ret; |
4658 | } | 4656 | } |
4659 | 4657 | ||
4660 | static void check_for_release(struct cgroup *cgrp) | 4658 | static void check_for_release(struct cgroup *cgrp) |
4661 | { | 4659 | { |
4662 | /* All of these checks rely on RCU to keep the cgroup | 4660 | /* All of these checks rely on RCU to keep the cgroup |
4663 | * structure alive */ | 4661 | * structure alive */ |
4664 | if (cgroup_is_releasable(cgrp) && !atomic_read(&cgrp->count) | 4662 | if (cgroup_is_releasable(cgrp) && !atomic_read(&cgrp->count) |
4665 | && list_empty(&cgrp->children) && !cgroup_has_css_refs(cgrp)) { | 4663 | && list_empty(&cgrp->children) && !cgroup_has_css_refs(cgrp)) { |
4666 | * Control Group is currently removable. If it's not | 4664 | * Control Group is currently removable. If it's not |
4667 | * already queued for a userspace notification, queue | 4665 | * already queued for a userspace notification, queue |
4668 | * it now */ | 4666 | * it now */ |
4669 | int need_schedule_work = 0; | 4667 | int need_schedule_work = 0; |
4670 | spin_lock(&release_list_lock); | 4668 | spin_lock(&release_list_lock); |
4671 | if (!cgroup_is_removed(cgrp) && | 4669 | if (!cgroup_is_removed(cgrp) && |
4672 | list_empty(&cgrp->release_list)) { | 4670 | list_empty(&cgrp->release_list)) { |
4673 | list_add(&cgrp->release_list, &release_list); | 4671 | list_add(&cgrp->release_list, &release_list); |
4674 | need_schedule_work = 1; | 4672 | need_schedule_work = 1; |
4675 | } | 4673 | } |
4676 | spin_unlock(&release_list_lock); | 4674 | spin_unlock(&release_list_lock); |
4677 | if (need_schedule_work) | 4675 | if (need_schedule_work) |
4678 | schedule_work(&release_agent_work); | 4676 | schedule_work(&release_agent_work); |
4679 | } | 4677 | } |
4680 | } | 4678 | } |
4681 | 4679 | ||
4682 | /* Caller must verify that the css is not for root cgroup */ | 4680 | /* Caller must verify that the css is not for root cgroup */ |
4683 | void __css_put(struct cgroup_subsys_state *css, int count) | 4681 | void __css_put(struct cgroup_subsys_state *css, int count) |
4684 | { | 4682 | { |
4685 | struct cgroup *cgrp = css->cgroup; | 4683 | struct cgroup *cgrp = css->cgroup; |
4686 | int val; | 4684 | int val; |
4687 | rcu_read_lock(); | 4685 | rcu_read_lock(); |
4688 | val = atomic_sub_return(count, &css->refcnt); | 4686 | val = atomic_sub_return(count, &css->refcnt); |
4689 | if (val == 1) { | 4687 | if (val == 1) { |
4690 | if (notify_on_release(cgrp)) { | 4688 | if (notify_on_release(cgrp)) { |
4691 | set_bit(CGRP_RELEASABLE, &cgrp->flags); | 4689 | set_bit(CGRP_RELEASABLE, &cgrp->flags); |
4692 | check_for_release(cgrp); | 4690 | check_for_release(cgrp); |
4693 | } | 4691 | } |
4694 | cgroup_wakeup_rmdir_waiter(cgrp); | 4692 | cgroup_wakeup_rmdir_waiter(cgrp); |
4695 | } | 4693 | } |
4696 | rcu_read_unlock(); | 4694 | rcu_read_unlock(); |
4697 | WARN_ON_ONCE(val < 1); | 4695 | WARN_ON_ONCE(val < 1); |
4698 | } | 4696 | } |
4699 | EXPORT_SYMBOL_GPL(__css_put); | 4697 | EXPORT_SYMBOL_GPL(__css_put); |
4700 | 4698 | ||
4701 | /* | 4699 | /* |
4702 | * Notify userspace when a cgroup is released, by running the | 4700 | * Notify userspace when a cgroup is released, by running the |
4703 | * configured release agent with the name of the cgroup (path | 4701 | * configured release agent with the name of the cgroup (path |
4704 | * relative to the root of cgroup file system) as the argument. | 4702 | * relative to the root of cgroup file system) as the argument. |
4705 | * | 4703 | * |
4706 | * Most likely, this user command will try to rmdir this cgroup. | 4704 | * Most likely, this user command will try to rmdir this cgroup. |
4707 | * | 4705 | * |
4708 | * This races with the possibility that some other task will be | 4706 | * This races with the possibility that some other task will be |
4709 | * attached to this cgroup before it is removed, or that some other | 4707 | * attached to this cgroup before it is removed, or that some other |
4710 | * user task will 'mkdir' a child cgroup of this cgroup. That's ok. | 4708 | * user task will 'mkdir' a child cgroup of this cgroup. That's ok. |
4711 | * The presumed 'rmdir' will fail quietly if this cgroup is no longer | 4709 | * The presumed 'rmdir' will fail quietly if this cgroup is no longer |
4712 | * unused, and this cgroup will be reprieved from its death sentence, | 4710 | * unused, and this cgroup will be reprieved from its death sentence, |
4713 | * to continue to serve a useful existence. Next time it's released, | 4711 | * to continue to serve a useful existence. Next time it's released, |
4714 | * we will get notified again, if it still has 'notify_on_release' set. | 4712 | * we will get notified again, if it still has 'notify_on_release' set. |
4715 | * | 4713 | * |
4716 | * The final arg to call_usermodehelper() is UMH_WAIT_EXEC, which | 4714 | * The final arg to call_usermodehelper() is UMH_WAIT_EXEC, which |
4717 | * means only wait until the task is successfully execve()'d. The | 4715 | * means only wait until the task is successfully execve()'d. The |
4718 | * separate release agent task is forked by call_usermodehelper(), | 4716 | * separate release agent task is forked by call_usermodehelper(), |
4719 | * then control in this thread returns here, without waiting for the | 4717 | * then control in this thread returns here, without waiting for the |
4720 | * release agent task. We don't bother to wait because the caller of | 4718 | * release agent task. We don't bother to wait because the caller of |
4721 | * this routine has no use for the exit status of the release agent | 4719 | * this routine has no use for the exit status of the release agent |
4722 | * task, so no sense holding our caller up for that. | 4720 | * task, so no sense holding our caller up for that. |
4723 | */ | 4721 | */ |
4724 | static void cgroup_release_agent(struct work_struct *work) | 4722 | static void cgroup_release_agent(struct work_struct *work) |
4725 | { | 4723 | { |
4726 | BUG_ON(work != &release_agent_work); | 4724 | BUG_ON(work != &release_agent_work); |
4727 | mutex_lock(&cgroup_mutex); | 4725 | mutex_lock(&cgroup_mutex); |
4728 | spin_lock(&release_list_lock); | 4726 | spin_lock(&release_list_lock); |
4729 | while (!list_empty(&release_list)) { | 4727 | while (!list_empty(&release_list)) { |
4730 | char *argv[3], *envp[3]; | 4728 | char *argv[3], *envp[3]; |
4731 | int i; | 4729 | int i; |
4732 | char *pathbuf = NULL, *agentbuf = NULL; | 4730 | char *pathbuf = NULL, *agentbuf = NULL; |
4733 | struct cgroup *cgrp = list_entry(release_list.next, | 4731 | struct cgroup *cgrp = list_entry(release_list.next, |
4734 | struct cgroup, | 4732 | struct cgroup, |
4735 | release_list); | 4733 | release_list); |
4736 | list_del_init(&cgrp->release_list); | 4734 | list_del_init(&cgrp->release_list); |
4737 | spin_unlock(&release_list_lock); | 4735 | spin_unlock(&release_list_lock); |
4738 | pathbuf = kmalloc(PAGE_SIZE, GFP_KERNEL); | 4736 | pathbuf = kmalloc(PAGE_SIZE, GFP_KERNEL); |
4739 | if (!pathbuf) | 4737 | if (!pathbuf) |
4740 | goto continue_free; | 4738 | goto continue_free; |
4741 | if (cgroup_path(cgrp, pathbuf, PAGE_SIZE) < 0) | 4739 | if (cgroup_path(cgrp, pathbuf, PAGE_SIZE) < 0) |
4742 | goto continue_free; | 4740 | goto continue_free; |
4743 | agentbuf = kstrdup(cgrp->root->release_agent_path, GFP_KERNEL); | 4741 | agentbuf = kstrdup(cgrp->root->release_agent_path, GFP_KERNEL); |
4744 | if (!agentbuf) | 4742 | if (!agentbuf) |
4745 | goto continue_free; | 4743 | goto continue_free; |
4746 | 4744 | ||
4747 | i = 0; | 4745 | i = 0; |
4748 | argv[i++] = agentbuf; | 4746 | argv[i++] = agentbuf; |
4749 | argv[i++] = pathbuf; | 4747 | argv[i++] = pathbuf; |
4750 | argv[i] = NULL; | 4748 | argv[i] = NULL; |
4751 | 4749 | ||
4752 | i = 0; | 4750 | i = 0; |
4753 | /* minimal command environment */ | 4751 | /* minimal command environment */ |
4754 | envp[i++] = "HOME=/"; | 4752 | envp[i++] = "HOME=/"; |
4755 | envp[i++] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin"; | 4753 | envp[i++] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin"; |
4756 | envp[i] = NULL; | 4754 | envp[i] = NULL; |
4757 | 4755 | ||
4758 | /* Drop the lock while we invoke the usermode helper, | 4756 | /* Drop the lock while we invoke the usermode helper, |
4759 | * since the exec could involve hitting disk and hence | 4757 | * since the exec could involve hitting disk and hence |
4760 | * be a slow process */ | 4758 | * be a slow process */ |
4761 | mutex_unlock(&cgroup_mutex); | 4759 | mutex_unlock(&cgroup_mutex); |
4762 | call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC); | 4760 | call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC); |
4763 | mutex_lock(&cgroup_mutex); | 4761 | mutex_lock(&cgroup_mutex); |
4764 | continue_free: | 4762 | continue_free: |
4765 | kfree(pathbuf); | 4763 | kfree(pathbuf); |
4766 | kfree(agentbuf); | 4764 | kfree(agentbuf); |
4767 | spin_lock(&release_list_lock); | 4765 | spin_lock(&release_list_lock); |
4768 | } | 4766 | } |
4769 | spin_unlock(&release_list_lock); | 4767 | spin_unlock(&release_list_lock); |
4770 | mutex_unlock(&cgroup_mutex); | 4768 | mutex_unlock(&cgroup_mutex); |
4771 | } | 4769 | } |
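
To make the usermode-helper invocation above concrete, this is roughly what the loop builds before dropping cgroup_mutex; the agent path and cgroup path are illustrative stand-ins for cgrp->root->release_agent_path and the cgroup_path() result:

    char *argv[] = { "/sbin/my_release_agent",      /* release_agent_path */
                     "/jobs/batch1",                /* path of the released cgroup */
                     NULL };
    char *envp[] = { "HOME=/", "PATH=/sbin:/bin:/usr/sbin:/usr/bin", NULL };

    call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC);

Because of UMH_WAIT_EXEC, the kernel only waits for the execve() to succeed, not for the agent to finish.
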
4772 | 4770 | ||
4773 | static int __init cgroup_disable(char *str) | 4771 | static int __init cgroup_disable(char *str) |
4774 | { | 4772 | { |
4775 | int i; | 4773 | int i; |
4776 | char *token; | 4774 | char *token; |
4777 | 4775 | ||
4778 | while ((token = strsep(&str, ",")) != NULL) { | 4776 | while ((token = strsep(&str, ",")) != NULL) { |
4779 | if (!*token) | 4777 | if (!*token) |
4780 | continue; | 4778 | continue; |
4781 | /* | 4779 | /* |
4782 | * cgroup_disable, being at boot time, can't know about module | 4780 | * cgroup_disable, being at boot time, can't know about module |
4783 | * subsystems, so we don't worry about them. | 4781 | * subsystems, so we don't worry about them. |
4784 | */ | 4782 | */ |
4785 | for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) { | 4783 | for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) { |
4786 | struct cgroup_subsys *ss = subsys[i]; | 4784 | struct cgroup_subsys *ss = subsys[i]; |
4787 | 4785 | ||
4788 | if (!strcmp(token, ss->name)) { | 4786 | if (!strcmp(token, ss->name)) { |
4789 | ss->disabled = 1; | 4787 | ss->disabled = 1; |
4790 | printk(KERN_INFO "Disabling %s control group" | 4788 | printk(KERN_INFO "Disabling %s control group" |
4791 | " subsystem\n", ss->name); | 4789 | " subsystem\n", ss->name); |
4792 | break; | 4790 | break; |
4793 | } | 4791 | } |
4794 | } | 4792 | } |
4795 | } | 4793 | } |
4796 | return 1; | 4794 | return 1; |
4797 | } | 4795 | } |
4798 | __setup("cgroup_disable=", cgroup_disable); | 4796 | __setup("cgroup_disable=", cgroup_disable); |
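
The parser above takes a comma-separated list of built-in subsystem names (matched against ss->name), so booting with, for example, cgroup_disable=cpuset,debug would set ss->disabled for those two subsystems at boot; the names actually available depend on the kernel configuration.
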
4799 | 4797 | ||
4800 | /* | 4798 | /* |
4801 | * Functions for CSS ID. | 4799 | * Functions for CSS ID. |
4802 | */ | 4800 | */ |
4803 | 4801 | ||
4804 | /* | 4802 | /* |
4805 | * To get an ID other than 0, this should be called when !cgroup_is_removed(). | 4803 | * To get an ID other than 0, this should be called when !cgroup_is_removed(). |
4806 | */ | 4804 | */ |
4807 | unsigned short css_id(struct cgroup_subsys_state *css) | 4805 | unsigned short css_id(struct cgroup_subsys_state *css) |
4808 | { | 4806 | { |
4809 | struct css_id *cssid; | 4807 | struct css_id *cssid; |
4810 | 4808 | ||
4811 | /* | 4809 | /* |
4812 | * This css_id() can return a correct value when someone has a refcnt | 4810 | * This css_id() can return a correct value when someone has a refcnt |
4813 | * on this or this is under rcu_read_lock(). Once css->id is allocated, | 4811 | * on this or this is under rcu_read_lock(). Once css->id is allocated, |
4814 | * it's unchanged until freed. | 4812 | * it's unchanged until freed. |
4815 | */ | 4813 | */ |
4816 | cssid = rcu_dereference_check(css->id, | 4814 | cssid = rcu_dereference_check(css->id, atomic_read(&css->refcnt)); |
4817 | rcu_read_lock_held() || atomic_read(&css->refcnt)); | ||
4818 | 4815 | ||
4819 | if (cssid) | 4816 | if (cssid) |
4820 | return cssid->id; | 4817 | return cssid->id; |
4821 | return 0; | 4818 | return 0; |
4822 | } | 4819 | } |
4823 | EXPORT_SYMBOL_GPL(css_id); | 4820 | EXPORT_SYMBOL_GPL(css_id); |
4824 | 4821 | ||
4825 | unsigned short css_depth(struct cgroup_subsys_state *css) | 4822 | unsigned short css_depth(struct cgroup_subsys_state *css) |
4826 | { | 4823 | { |
4827 | struct css_id *cssid; | 4824 | struct css_id *cssid; |
4828 | 4825 | ||
4829 | cssid = rcu_dereference_check(css->id, | 4826 | cssid = rcu_dereference_check(css->id, atomic_read(&css->refcnt)); |
4830 | rcu_read_lock_held() || atomic_read(&css->refcnt)); | ||
4831 | 4827 | ||
4832 | if (cssid) | 4828 | if (cssid) |
4833 | return cssid->depth; | 4829 | return cssid->depth; |
4834 | return 0; | 4830 | return 0; |
4835 | } | 4831 | } |
4836 | EXPORT_SYMBOL_GPL(css_depth); | 4832 | EXPORT_SYMBOL_GPL(css_depth); |
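
The functional change visible in this part of the diff is the rcu_dereference_check() condition in css_id() and css_depth() above: the explicit rcu_read_lock_held() term is dropped and only the refcount test remains, since holding the RCU read lock by itself already satisfies rcu_dereference_check(). A minimal before/after sketch of the pattern, mirroring the two hunks above:

    /* before: the caller spelled out the read-side check itself */
    cssid = rcu_dereference_check(css->id,
            rcu_read_lock_held() || atomic_read(&css->refcnt));

    /* after: only the caller-specific condition is passed */
    cssid = rcu_dereference_check(css->id, atomic_read(&css->refcnt));
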
4837 | 4833 | ||
4838 | /** | 4834 | /** |
4839 | * css_is_ancestor - test "root" css is an ancestor of "child" | 4835 | * css_is_ancestor - test "root" css is an ancestor of "child" |
4840 | * @child: the css to be tested. | 4836 | * @child: the css to be tested. |
4841 | * @root: the css supposed to be an ancestor of the child. | 4837 | * @root: the css supposed to be an ancestor of the child. |
4842 | * | 4838 | * |
4843 | * Returns true if "root" is an ancestor of "child" in its hierarchy. Because | 4839 | * Returns true if "root" is an ancestor of "child" in its hierarchy. Because |
4844 | * this function reads css->id, it uses rcu_dereference() and rcu_read_lock(). | 4840 | * this function reads css->id, it uses rcu_dereference() and rcu_read_lock(). |
4845 | * But, considering usual usage, the csses should be valid objects after the test. | 4841 | * But, considering usual usage, the csses should be valid objects after the test. |
4846 | * Assuming that the caller will do some action to the child if this returns | 4842 | * Assuming that the caller will do some action to the child if this returns |
4847 | * true, the caller must take "child"'s reference count. | 4843 | * true, the caller must take "child"'s reference count. |
4848 | * If "child" is valid object and this returns true, "root" is valid, too. | 4844 | * If "child" is valid object and this returns true, "root" is valid, too. |
4849 | */ | 4845 | */ |
4850 | 4846 | ||
4851 | bool css_is_ancestor(struct cgroup_subsys_state *child, | 4847 | bool css_is_ancestor(struct cgroup_subsys_state *child, |
4852 | const struct cgroup_subsys_state *root) | 4848 | const struct cgroup_subsys_state *root) |
4853 | { | 4849 | { |
4854 | struct css_id *child_id; | 4850 | struct css_id *child_id; |
4855 | struct css_id *root_id; | 4851 | struct css_id *root_id; |
4856 | bool ret = true; | 4852 | bool ret = true; |
4857 | 4853 | ||
4858 | rcu_read_lock(); | 4854 | rcu_read_lock(); |
4859 | child_id = rcu_dereference(child->id); | 4855 | child_id = rcu_dereference(child->id); |
4860 | root_id = rcu_dereference(root->id); | 4856 | root_id = rcu_dereference(root->id); |
4861 | if (!child_id | 4857 | if (!child_id |
4862 | || !root_id | 4858 | || !root_id |
4863 | || (child_id->depth < root_id->depth) | 4859 | || (child_id->depth < root_id->depth) |
4864 | || (child_id->stack[root_id->depth] != root_id->id)) | 4860 | || (child_id->stack[root_id->depth] != root_id->id)) |
4865 | ret = false; | 4861 | ret = false; |
4866 | rcu_read_unlock(); | 4862 | rcu_read_unlock(); |
4867 | return ret; | 4863 | return ret; |
4868 | } | 4864 | } |
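
A toy sketch of why the test above is cheap, under the assumption (visible in alloc_css_id() below) that every css_id records the id of each ancestor indexed by depth; the struct and helper here are illustrative, not kernel code:

    struct toy_css_id {
            unsigned short id;
            unsigned short depth;
            unsigned short stack[4];        /* ancestor ids, root at index 0 */
    };

    static bool toy_is_ancestor(const struct toy_css_id *child,
                                const struct toy_css_id *root)
    {
            /* root appears in child's ancestor stack at root's own depth */
            return child->depth >= root->depth &&
                   child->stack[root->depth] == root->id;
    }
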
4869 | 4865 | ||
4870 | void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css) | 4866 | void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css) |
4871 | { | 4867 | { |
4872 | struct css_id *id = css->id; | 4868 | struct css_id *id = css->id; |
4873 | /* When this is called before css_id initialization, id can be NULL */ | 4869 | /* When this is called before css_id initialization, id can be NULL */ |
4874 | if (!id) | 4870 | if (!id) |
4875 | return; | 4871 | return; |
4876 | 4872 | ||
4877 | BUG_ON(!ss->use_id); | 4873 | BUG_ON(!ss->use_id); |
4878 | 4874 | ||
4879 | rcu_assign_pointer(id->css, NULL); | 4875 | rcu_assign_pointer(id->css, NULL); |
4880 | rcu_assign_pointer(css->id, NULL); | 4876 | rcu_assign_pointer(css->id, NULL); |
4881 | spin_lock(&ss->id_lock); | 4877 | spin_lock(&ss->id_lock); |
4882 | idr_remove(&ss->idr, id->id); | 4878 | idr_remove(&ss->idr, id->id); |
4883 | spin_unlock(&ss->id_lock); | 4879 | spin_unlock(&ss->id_lock); |
4884 | kfree_rcu(id, rcu_head); | 4880 | kfree_rcu(id, rcu_head); |
4885 | } | 4881 | } |
4886 | EXPORT_SYMBOL_GPL(free_css_id); | 4882 | EXPORT_SYMBOL_GPL(free_css_id); |
4887 | 4883 | ||
4888 | /* | 4884 | /* |
4889 | * This is called by init or create(). Then, calls to this function are | 4885 | * This is called by init or create(). Then, calls to this function are |
4890 | * always serialized (By cgroup_mutex() at create()). | 4886 | * always serialized (By cgroup_mutex() at create()). |
4891 | */ | 4887 | */ |
4892 | 4888 | ||
4893 | static struct css_id *get_new_cssid(struct cgroup_subsys *ss, int depth) | 4889 | static struct css_id *get_new_cssid(struct cgroup_subsys *ss, int depth) |
4894 | { | 4890 | { |
4895 | struct css_id *newid; | 4891 | struct css_id *newid; |
4896 | int myid, error, size; | 4892 | int myid, error, size; |
4897 | 4893 | ||
4898 | BUG_ON(!ss->use_id); | 4894 | BUG_ON(!ss->use_id); |
4899 | 4895 | ||
4900 | size = sizeof(*newid) + sizeof(unsigned short) * (depth + 1); | 4896 | size = sizeof(*newid) + sizeof(unsigned short) * (depth + 1); |
4901 | newid = kzalloc(size, GFP_KERNEL); | 4897 | newid = kzalloc(size, GFP_KERNEL); |
4902 | if (!newid) | 4898 | if (!newid) |
4903 | return ERR_PTR(-ENOMEM); | 4899 | return ERR_PTR(-ENOMEM); |
4904 | /* get id */ | 4900 | /* get id */ |
4905 | if (unlikely(!idr_pre_get(&ss->idr, GFP_KERNEL))) { | 4901 | if (unlikely(!idr_pre_get(&ss->idr, GFP_KERNEL))) { |
4906 | error = -ENOMEM; | 4902 | error = -ENOMEM; |
4907 | goto err_out; | 4903 | goto err_out; |
4908 | } | 4904 | } |
4909 | spin_lock(&ss->id_lock); | 4905 | spin_lock(&ss->id_lock); |
4910 | /* Don't use 0. allocates an ID of 1-65535 */ | 4906 | /* Don't use 0. allocates an ID of 1-65535 */ |
4911 | error = idr_get_new_above(&ss->idr, newid, 1, &myid); | 4907 | error = idr_get_new_above(&ss->idr, newid, 1, &myid); |
4912 | spin_unlock(&ss->id_lock); | 4908 | spin_unlock(&ss->id_lock); |
4913 | 4909 | ||
4914 | /* Returns error when there are no free spaces for new ID.*/ | 4910 | /* Returns error when there are no free spaces for new ID.*/ |
4915 | if (error) { | 4911 | if (error) { |
4916 | error = -ENOSPC; | 4912 | error = -ENOSPC; |
4917 | goto err_out; | 4913 | goto err_out; |
4918 | } | 4914 | } |
4919 | if (myid > CSS_ID_MAX) | 4915 | if (myid > CSS_ID_MAX) |
4920 | goto remove_idr; | 4916 | goto remove_idr; |
4921 | 4917 | ||
4922 | newid->id = myid; | 4918 | newid->id = myid; |
4923 | newid->depth = depth; | 4919 | newid->depth = depth; |
4924 | return newid; | 4920 | return newid; |
4925 | remove_idr: | 4921 | remove_idr: |
4926 | error = -ENOSPC; | 4922 | error = -ENOSPC; |
4927 | spin_lock(&ss->id_lock); | 4923 | spin_lock(&ss->id_lock); |
4928 | idr_remove(&ss->idr, myid); | 4924 | idr_remove(&ss->idr, myid); |
4929 | spin_unlock(&ss->id_lock); | 4925 | spin_unlock(&ss->id_lock); |
4930 | err_out: | 4926 | err_out: |
4931 | kfree(newid); | 4927 | kfree(newid); |
4932 | return ERR_PTR(error); | 4928 | return ERR_PTR(error); |
4933 | 4929 | ||
4934 | } | 4930 | } |
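
get_new_cssid() uses the two-step idr API of this kernel generation: idr_pre_get() preallocates while sleeping is still allowed, then idr_get_new_above() hands out the id under the spinlock. A hedged stand-alone sketch of that pattern, with an illustrative helper name:

    /* Illustrative only: allocate a non-zero id for @ptr in @idr. */
    static int toy_alloc_id(struct idr *idr, spinlock_t *lock,
                            void *ptr, int *id)
    {
            int err;

            if (!idr_pre_get(idr, GFP_KERNEL))        /* may sleep */
                    return -ENOMEM;

            spin_lock(lock);
            err = idr_get_new_above(idr, ptr, 1, id); /* ids start at 1 */
            spin_unlock(lock);

            return err;
    }
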
4935 | 4931 | ||
4936 | static int __init_or_module cgroup_init_idr(struct cgroup_subsys *ss, | 4932 | static int __init_or_module cgroup_init_idr(struct cgroup_subsys *ss, |
4937 | struct cgroup_subsys_state *rootcss) | 4933 | struct cgroup_subsys_state *rootcss) |
4938 | { | 4934 | { |
4939 | struct css_id *newid; | 4935 | struct css_id *newid; |
4940 | 4936 | ||
4941 | spin_lock_init(&ss->id_lock); | 4937 | spin_lock_init(&ss->id_lock); |
4942 | idr_init(&ss->idr); | 4938 | idr_init(&ss->idr); |
4943 | 4939 | ||
4944 | newid = get_new_cssid(ss, 0); | 4940 | newid = get_new_cssid(ss, 0); |
4945 | if (IS_ERR(newid)) | 4941 | if (IS_ERR(newid)) |
4946 | return PTR_ERR(newid); | 4942 | return PTR_ERR(newid); |
4947 | 4943 | ||
4948 | newid->stack[0] = newid->id; | 4944 | newid->stack[0] = newid->id; |
4949 | newid->css = rootcss; | 4945 | newid->css = rootcss; |
4950 | rootcss->id = newid; | 4946 | rootcss->id = newid; |
4951 | return 0; | 4947 | return 0; |
4952 | } | 4948 | } |
4953 | 4949 | ||
4954 | static int alloc_css_id(struct cgroup_subsys *ss, struct cgroup *parent, | 4950 | static int alloc_css_id(struct cgroup_subsys *ss, struct cgroup *parent, |
4955 | struct cgroup *child) | 4951 | struct cgroup *child) |
4956 | { | 4952 | { |
4957 | int subsys_id, i, depth = 0; | 4953 | int subsys_id, i, depth = 0; |
4958 | struct cgroup_subsys_state *parent_css, *child_css; | 4954 | struct cgroup_subsys_state *parent_css, *child_css; |
4959 | struct css_id *child_id, *parent_id; | 4955 | struct css_id *child_id, *parent_id; |
4960 | 4956 | ||
4961 | subsys_id = ss->subsys_id; | 4957 | subsys_id = ss->subsys_id; |
4962 | parent_css = parent->subsys[subsys_id]; | 4958 | parent_css = parent->subsys[subsys_id]; |
4963 | child_css = child->subsys[subsys_id]; | 4959 | child_css = child->subsys[subsys_id]; |
4964 | parent_id = parent_css->id; | 4960 | parent_id = parent_css->id; |
4965 | depth = parent_id->depth + 1; | 4961 | depth = parent_id->depth + 1; |
4966 | 4962 | ||
4967 | child_id = get_new_cssid(ss, depth); | 4963 | child_id = get_new_cssid(ss, depth); |
4968 | if (IS_ERR(child_id)) | 4964 | if (IS_ERR(child_id)) |
4969 | return PTR_ERR(child_id); | 4965 | return PTR_ERR(child_id); |
4970 | 4966 | ||
4971 | for (i = 0; i < depth; i++) | 4967 | for (i = 0; i < depth; i++) |
4972 | child_id->stack[i] = parent_id->stack[i]; | 4968 | child_id->stack[i] = parent_id->stack[i]; |
4973 | child_id->stack[depth] = child_id->id; | 4969 | child_id->stack[depth] = child_id->id; |
4974 | /* | 4970 | /* |
4975 | * child_id->css pointer will be set after this cgroup is available | 4971 | * child_id->css pointer will be set after this cgroup is available |
4976 | * see cgroup_populate_dir() | 4972 | * see cgroup_populate_dir() |
4977 | */ | 4973 | */ |
4978 | rcu_assign_pointer(child_css->id, child_id); | 4974 | rcu_assign_pointer(child_css->id, child_id); |
4979 | 4975 | ||
4980 | return 0; | 4976 | return 0; |
4981 | } | 4977 | } |
4982 | 4978 | ||
4983 | /** | 4979 | /** |
4984 | * css_lookup - lookup css by id | 4980 | * css_lookup - lookup css by id |
4985 | * @ss: cgroup subsys to be looked into. | 4981 | * @ss: cgroup subsys to be looked into. |
4986 | * @id: the id | 4982 | * @id: the id |
4987 | * | 4983 | * |
4988 | * Returns pointer to cgroup_subsys_state if there is valid one with id. | 4984 | * Returns pointer to cgroup_subsys_state if there is valid one with id. |
4989 | * NULL if not. Should be called under rcu_read_lock() | 4985 | * NULL if not. Should be called under rcu_read_lock() |
4990 | */ | 4986 | */ |
4991 | struct cgroup_subsys_state *css_lookup(struct cgroup_subsys *ss, int id) | 4987 | struct cgroup_subsys_state *css_lookup(struct cgroup_subsys *ss, int id) |
4992 | { | 4988 | { |
4993 | struct css_id *cssid = NULL; | 4989 | struct css_id *cssid = NULL; |
4994 | 4990 | ||
4995 | BUG_ON(!ss->use_id); | 4991 | BUG_ON(!ss->use_id); |
4996 | cssid = idr_find(&ss->idr, id); | 4992 | cssid = idr_find(&ss->idr, id); |
4997 | 4993 | ||
4998 | if (unlikely(!cssid)) | 4994 | if (unlikely(!cssid)) |
4999 | return NULL; | 4995 | return NULL; |
5000 | 4996 | ||
5001 | return rcu_dereference(cssid->css); | 4997 | return rcu_dereference(cssid->css); |
5002 | } | 4998 | } |
5003 | EXPORT_SYMBOL_GPL(css_lookup); | 4999 | EXPORT_SYMBOL_GPL(css_lookup); |
5004 | 5000 | ||
5005 | /** | 5001 | /** |
5006 | * css_get_next - lookup next cgroup under specified hierarchy. | 5002 | * css_get_next - lookup next cgroup under specified hierarchy. |
5007 | * @ss: pointer to subsystem | 5003 | * @ss: pointer to subsystem |
5008 | * @id: current position of iteration. | 5004 | * @id: current position of iteration. |
5009 | * @root: pointer to css. search tree under this. | 5005 | * @root: pointer to css. search tree under this. |
5010 | * @foundid: position of found object. | 5006 | * @foundid: position of found object. |
5011 | * | 5007 | * |
5012 | * Search next css under the specified hierarchy of rootid. Calling under | 5008 | * Search next css under the specified hierarchy of rootid. Calling under |
5013 | * rcu_read_lock() is necessary. Returns NULL if it reaches the end. | 5009 | * rcu_read_lock() is necessary. Returns NULL if it reaches the end. |
5014 | */ | 5010 | */ |
5015 | struct cgroup_subsys_state * | 5011 | struct cgroup_subsys_state * |
5016 | css_get_next(struct cgroup_subsys *ss, int id, | 5012 | css_get_next(struct cgroup_subsys *ss, int id, |
5017 | struct cgroup_subsys_state *root, int *foundid) | 5013 | struct cgroup_subsys_state *root, int *foundid) |
5018 | { | 5014 | { |
5019 | struct cgroup_subsys_state *ret = NULL; | 5015 | struct cgroup_subsys_state *ret = NULL; |
5020 | struct css_id *tmp; | 5016 | struct css_id *tmp; |
5021 | int tmpid; | 5017 | int tmpid; |
5022 | int rootid = css_id(root); | 5018 | int rootid = css_id(root); |
5023 | int depth = css_depth(root); | 5019 | int depth = css_depth(root); |
5024 | 5020 | ||
5025 | if (!rootid) | 5021 | if (!rootid) |
5026 | return NULL; | 5022 | return NULL; |
5027 | 5023 | ||
5028 | BUG_ON(!ss->use_id); | 5024 | BUG_ON(!ss->use_id); |
5029 | /* fill start point for scan */ | 5025 | /* fill start point for scan */ |
5030 | tmpid = id; | 5026 | tmpid = id; |
5031 | while (1) { | 5027 | while (1) { |
5032 | /* | 5028 | /* |
5033 | * scan next entry from bitmap(tree), tmpid is updated after | 5029 | * scan next entry from bitmap(tree), tmpid is updated after |
5034 | * idr_get_next(). | 5030 | * idr_get_next(). |
5035 | */ | 5031 | */ |
5036 | spin_lock(&ss->id_lock); | 5032 | spin_lock(&ss->id_lock); |
5037 | tmp = idr_get_next(&ss->idr, &tmpid); | 5033 | tmp = idr_get_next(&ss->idr, &tmpid); |
5038 | spin_unlock(&ss->id_lock); | 5034 | spin_unlock(&ss->id_lock); |
5039 | 5035 | ||
5040 | if (!tmp) | 5036 | if (!tmp) |
5041 | break; | 5037 | break; |
5042 | if (tmp->depth >= depth && tmp->stack[depth] == rootid) { | 5038 | if (tmp->depth >= depth && tmp->stack[depth] == rootid) { |
5043 | ret = rcu_dereference(tmp->css); | 5039 | ret = rcu_dereference(tmp->css); |
5044 | if (ret) { | 5040 | if (ret) { |
5045 | *foundid = tmpid; | 5041 | *foundid = tmpid; |
5046 | break; | 5042 | break; |
5047 | } | 5043 | } |
5048 | } | 5044 | } |
5049 | /* continue to scan from next id */ | 5045 | /* continue to scan from next id */ |
5050 | tmpid = tmpid + 1; | 5046 | tmpid = tmpid + 1; |
5051 | } | 5047 | } |
5052 | return ret; | 5048 | return ret; |
5053 | } | 5049 | } |
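
A hedged usage sketch for css_get_next(): callers restart the scan just past each returned id and keep the whole walk inside an RCU read-side critical section, as the comment above requires (ss, root and the loop body are placeholders):

    struct cgroup_subsys_state *pos;
    int found, next = 1;

    rcu_read_lock();
    while ((pos = css_get_next(ss, next, root, &found)) != NULL) {
            /* pos is only guaranteed valid inside this RCU section */
            next = found + 1;
    }
    rcu_read_unlock();
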
5054 | 5050 | ||
5055 | /* | 5051 | /* |
5056 | * get corresponding css from file open on cgroupfs directory | 5052 | * get corresponding css from file open on cgroupfs directory |
5057 | */ | 5053 | */ |
5058 | struct cgroup_subsys_state *cgroup_css_from_dir(struct file *f, int id) | 5054 | struct cgroup_subsys_state *cgroup_css_from_dir(struct file *f, int id) |
5059 | { | 5055 | { |
5060 | struct cgroup *cgrp; | 5056 | struct cgroup *cgrp; |
5061 | struct inode *inode; | 5057 | struct inode *inode; |
5062 | struct cgroup_subsys_state *css; | 5058 | struct cgroup_subsys_state *css; |
5063 | 5059 | ||
5064 | inode = f->f_dentry->d_inode; | 5060 | inode = f->f_dentry->d_inode; |
5065 | /* check in cgroup filesystem dir */ | 5061 | /* check in cgroup filesystem dir */ |
5066 | if (inode->i_op != &cgroup_dir_inode_operations) | 5062 | if (inode->i_op != &cgroup_dir_inode_operations) |
5067 | return ERR_PTR(-EBADF); | 5063 | return ERR_PTR(-EBADF); |
5068 | 5064 | ||
5069 | if (id < 0 || id >= CGROUP_SUBSYS_COUNT) | 5065 | if (id < 0 || id >= CGROUP_SUBSYS_COUNT) |
5070 | return ERR_PTR(-EINVAL); | 5066 | return ERR_PTR(-EINVAL); |
5071 | 5067 | ||
5072 | /* get cgroup */ | 5068 | /* get cgroup */ |
5073 | cgrp = __d_cgrp(f->f_dentry); | 5069 | cgrp = __d_cgrp(f->f_dentry); |
5074 | css = cgrp->subsys[id]; | 5070 | css = cgrp->subsys[id]; |
5075 | return css ? css : ERR_PTR(-ENOENT); | 5071 | return css ? css : ERR_PTR(-ENOENT); |
5076 | } | 5072 | } |
5077 | 5073 | ||
5078 | #ifdef CONFIG_CGROUP_DEBUG | 5074 | #ifdef CONFIG_CGROUP_DEBUG |
5079 | static struct cgroup_subsys_state *debug_create(struct cgroup_subsys *ss, | 5075 | static struct cgroup_subsys_state *debug_create(struct cgroup_subsys *ss, |
5080 | struct cgroup *cont) | 5076 | struct cgroup *cont) |
5081 | { | 5077 | { |
5082 | struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL); | 5078 | struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL); |
5083 | 5079 | ||
5084 | if (!css) | 5080 | if (!css) |
5085 | return ERR_PTR(-ENOMEM); | 5081 | return ERR_PTR(-ENOMEM); |
5086 | 5082 | ||
5087 | return css; | 5083 | return css; |
5088 | } | 5084 | } |
5089 | 5085 | ||
5090 | static void debug_destroy(struct cgroup_subsys *ss, struct cgroup *cont) | 5086 | static void debug_destroy(struct cgroup_subsys *ss, struct cgroup *cont) |
5091 | { | 5087 | { |
5092 | kfree(cont->subsys[debug_subsys_id]); | 5088 | kfree(cont->subsys[debug_subsys_id]); |
5093 | } | 5089 | } |
5094 | 5090 | ||
5095 | static u64 cgroup_refcount_read(struct cgroup *cont, struct cftype *cft) | 5091 | static u64 cgroup_refcount_read(struct cgroup *cont, struct cftype *cft) |
5096 | { | 5092 | { |
5097 | return atomic_read(&cont->count); | 5093 | return atomic_read(&cont->count); |
5098 | } | 5094 | } |
5099 | 5095 | ||
5100 | static u64 debug_taskcount_read(struct cgroup *cont, struct cftype *cft) | 5096 | static u64 debug_taskcount_read(struct cgroup *cont, struct cftype *cft) |
5101 | { | 5097 | { |
5102 | return cgroup_task_count(cont); | 5098 | return cgroup_task_count(cont); |
5103 | } | 5099 | } |
5104 | 5100 | ||
5105 | static u64 current_css_set_read(struct cgroup *cont, struct cftype *cft) | 5101 | static u64 current_css_set_read(struct cgroup *cont, struct cftype *cft) |
5106 | { | 5102 | { |
5107 | return (u64)(unsigned long)current->cgroups; | 5103 | return (u64)(unsigned long)current->cgroups; |
5108 | } | 5104 | } |
5109 | 5105 | ||
5110 | static u64 current_css_set_refcount_read(struct cgroup *cont, | 5106 | static u64 current_css_set_refcount_read(struct cgroup *cont, |
5111 | struct cftype *cft) | 5107 | struct cftype *cft) |
5112 | { | 5108 | { |
5113 | u64 count; | 5109 | u64 count; |
5114 | 5110 | ||
5115 | rcu_read_lock(); | 5111 | rcu_read_lock(); |
5116 | count = atomic_read(¤t->cgroups->refcount); | 5112 | count = atomic_read(¤t->cgroups->refcount); |
5117 | rcu_read_unlock(); | 5113 | rcu_read_unlock(); |
5118 | return count; | 5114 | return count; |
5119 | } | 5115 | } |
5120 | 5116 | ||
5121 | static int current_css_set_cg_links_read(struct cgroup *cont, | 5117 | static int current_css_set_cg_links_read(struct cgroup *cont, |
5122 | struct cftype *cft, | 5118 | struct cftype *cft, |
5123 | struct seq_file *seq) | 5119 | struct seq_file *seq) |
5124 | { | 5120 | { |
5125 | struct cg_cgroup_link *link; | 5121 | struct cg_cgroup_link *link; |
5126 | struct css_set *cg; | 5122 | struct css_set *cg; |
5127 | 5123 | ||
5128 | read_lock(&css_set_lock); | 5124 | read_lock(&css_set_lock); |
5129 | rcu_read_lock(); | 5125 | rcu_read_lock(); |
5130 | cg = rcu_dereference(current->cgroups); | 5126 | cg = rcu_dereference(current->cgroups); |
5131 | list_for_each_entry(link, &cg->cg_links, cg_link_list) { | 5127 | list_for_each_entry(link, &cg->cg_links, cg_link_list) { |
5132 | struct cgroup *c = link->cgrp; | 5128 | struct cgroup *c = link->cgrp; |
5133 | const char *name; | 5129 | const char *name; |
5134 | 5130 | ||
5135 | if (c->dentry) | 5131 | if (c->dentry) |
5136 | name = c->dentry->d_name.name; | 5132 | name = c->dentry->d_name.name; |
5137 | else | 5133 | else |
5138 | name = "?"; | 5134 | name = "?"; |
5139 | seq_printf(seq, "Root %d group %s\n", | 5135 | seq_printf(seq, "Root %d group %s\n", |
5140 | c->root->hierarchy_id, name); | 5136 | c->root->hierarchy_id, name); |
5141 | } | 5137 | } |
5142 | rcu_read_unlock(); | 5138 | rcu_read_unlock(); |
5143 | read_unlock(&css_set_lock); | 5139 | read_unlock(&css_set_lock); |
5144 | return 0; | 5140 | return 0; |
5145 | } | 5141 | } |
5146 | 5142 | ||
5147 | #define MAX_TASKS_SHOWN_PER_CSS 25 | 5143 | #define MAX_TASKS_SHOWN_PER_CSS 25 |
5148 | static int cgroup_css_links_read(struct cgroup *cont, | 5144 | static int cgroup_css_links_read(struct cgroup *cont, |
5149 | struct cftype *cft, | 5145 | struct cftype *cft, |
5150 | struct seq_file *seq) | 5146 | struct seq_file *seq) |
5151 | { | 5147 | { |
5152 | struct cg_cgroup_link *link; | 5148 | struct cg_cgroup_link *link; |
5153 | 5149 | ||
5154 | read_lock(&css_set_lock); | 5150 | read_lock(&css_set_lock); |
5155 | list_for_each_entry(link, &cont->css_sets, cgrp_link_list) { | 5151 | list_for_each_entry(link, &cont->css_sets, cgrp_link_list) { |
5156 | struct css_set *cg = link->cg; | 5152 | struct css_set *cg = link->cg; |
5157 | struct task_struct *task; | 5153 | struct task_struct *task; |
5158 | int count = 0; | 5154 | int count = 0; |
5159 | seq_printf(seq, "css_set %p\n", cg); | 5155 | seq_printf(seq, "css_set %p\n", cg); |
5160 | list_for_each_entry(task, &cg->tasks, cg_list) { | 5156 | list_for_each_entry(task, &cg->tasks, cg_list) { |
5161 | if (count++ > MAX_TASKS_SHOWN_PER_CSS) { | 5157 | if (count++ > MAX_TASKS_SHOWN_PER_CSS) { |
5162 | seq_puts(seq, " ...\n"); | 5158 | seq_puts(seq, " ...\n"); |
5163 | break; | 5159 | break; |
5164 | } else { | 5160 | } else { |
5165 | seq_printf(seq, " task %d\n", | 5161 | seq_printf(seq, " task %d\n", |
5166 | task_pid_vnr(task)); | 5162 | task_pid_vnr(task)); |
5167 | } | 5163 | } |
5168 | } | 5164 | } |
5169 | } | 5165 | } |
5170 | read_unlock(&css_set_lock); | 5166 | read_unlock(&css_set_lock); |
5171 | return 0; | 5167 | return 0; |
5172 | } | 5168 | } |
5173 | 5169 | ||
5174 | static u64 releasable_read(struct cgroup *cgrp, struct cftype *cft) | 5170 | static u64 releasable_read(struct cgroup *cgrp, struct cftype *cft) |
5175 | { | 5171 | { |
5176 | return test_bit(CGRP_RELEASABLE, &cgrp->flags); | 5172 | return test_bit(CGRP_RELEASABLE, &cgrp->flags); |
5177 | } | 5173 | } |
5178 | 5174 | ||
5179 | static struct cftype debug_files[] = { | 5175 | static struct cftype debug_files[] = { |
5180 | { | 5176 | { |
5181 | .name = "cgroup_refcount", | 5177 | .name = "cgroup_refcount", |
5182 | .read_u64 = cgroup_refcount_read, | 5178 | .read_u64 = cgroup_refcount_read, |
5183 | }, | 5179 | }, |
5184 | { | 5180 | { |
5185 | .name = "taskcount", | 5181 | .name = "taskcount", |
5186 | .read_u64 = debug_taskcount_read, | 5182 | .read_u64 = debug_taskcount_read, |
5187 | }, | 5183 | }, |
5188 | 5184 | ||
5189 | { | 5185 | { |
5190 | .name = "current_css_set", | 5186 | .name = "current_css_set", |
5191 | .read_u64 = current_css_set_read, | 5187 | .read_u64 = current_css_set_read, |
5192 | }, | 5188 | }, |
5193 | 5189 | ||
5194 | { | 5190 | { |
5195 | .name = "current_css_set_refcount", | 5191 | .name = "current_css_set_refcount", |
5196 | .read_u64 = current_css_set_refcount_read, | 5192 | .read_u64 = current_css_set_refcount_read, |
5197 | }, | 5193 | }, |
5198 | 5194 | ||
5199 | { | 5195 | { |
5200 | .name = "current_css_set_cg_links", | 5196 | .name = "current_css_set_cg_links", |
5201 | .read_seq_string = current_css_set_cg_links_read, | 5197 | .read_seq_string = current_css_set_cg_links_read, |
5202 | }, | 5198 | }, |
5203 | 5199 | ||
5204 | { | 5200 | { |
5205 | .name = "cgroup_css_links", | 5201 | .name = "cgroup_css_links", |
5206 | .read_seq_string = cgroup_css_links_read, | 5202 | .read_seq_string = cgroup_css_links_read, |
5207 | }, | 5203 | }, |
5208 | 5204 | ||
5209 | { | 5205 | { |
5210 | .name = "releasable", | 5206 | .name = "releasable", |
5211 | .read_u64 = releasable_read, | 5207 | .read_u64 = releasable_read, |
5212 | }, | 5208 | }, |
5213 | }; | 5209 | }; |
5214 | 5210 | ||
5215 | static int debug_populate(struct cgroup_subsys *ss, struct cgroup *cont) | 5211 | static int debug_populate(struct cgroup_subsys *ss, struct cgroup *cont) |
5216 | { | 5212 | { |
5217 | return cgroup_add_files(cont, ss, debug_files, | 5213 | return cgroup_add_files(cont, ss, debug_files, |
5218 | ARRAY_SIZE(debug_files)); | 5214 | ARRAY_SIZE(debug_files)); |
5219 | } | 5215 | } |
5220 | 5216 | ||
5221 | struct cgroup_subsys debug_subsys = { | 5217 | struct cgroup_subsys debug_subsys = { |
5222 | .name = "debug", | 5218 | .name = "debug", |
5223 | .create = debug_create, | 5219 | .create = debug_create, |
5224 | .destroy = debug_destroy, | 5220 | .destroy = debug_destroy, |
5225 | .populate = debug_populate, | 5221 | .populate = debug_populate, |
5226 | .subsys_id = debug_subsys_id, | 5222 | .subsys_id = debug_subsys_id, |
5227 | }; | 5223 | }; |
5228 | #endif /* CONFIG_CGROUP_DEBUG */ | 5224 | #endif /* CONFIG_CGROUP_DEBUG */ |
5229 | 5225 |
kernel/exit.c
1 | /* | 1 | /* |
2 | * linux/kernel/exit.c | 2 | * linux/kernel/exit.c |
3 | * | 3 | * |
4 | * Copyright (C) 1991, 1992 Linus Torvalds | 4 | * Copyright (C) 1991, 1992 Linus Torvalds |
5 | */ | 5 | */ |
6 | 6 | ||
7 | #include <linux/mm.h> | 7 | #include <linux/mm.h> |
8 | #include <linux/slab.h> | 8 | #include <linux/slab.h> |
9 | #include <linux/interrupt.h> | 9 | #include <linux/interrupt.h> |
10 | #include <linux/module.h> | 10 | #include <linux/module.h> |
11 | #include <linux/capability.h> | 11 | #include <linux/capability.h> |
12 | #include <linux/completion.h> | 12 | #include <linux/completion.h> |
13 | #include <linux/personality.h> | 13 | #include <linux/personality.h> |
14 | #include <linux/tty.h> | 14 | #include <linux/tty.h> |
15 | #include <linux/iocontext.h> | 15 | #include <linux/iocontext.h> |
16 | #include <linux/key.h> | 16 | #include <linux/key.h> |
17 | #include <linux/security.h> | 17 | #include <linux/security.h> |
18 | #include <linux/cpu.h> | 18 | #include <linux/cpu.h> |
19 | #include <linux/acct.h> | 19 | #include <linux/acct.h> |
20 | #include <linux/tsacct_kern.h> | 20 | #include <linux/tsacct_kern.h> |
21 | #include <linux/file.h> | 21 | #include <linux/file.h> |
22 | #include <linux/fdtable.h> | 22 | #include <linux/fdtable.h> |
23 | #include <linux/binfmts.h> | 23 | #include <linux/binfmts.h> |
24 | #include <linux/nsproxy.h> | 24 | #include <linux/nsproxy.h> |
25 | #include <linux/pid_namespace.h> | 25 | #include <linux/pid_namespace.h> |
26 | #include <linux/ptrace.h> | 26 | #include <linux/ptrace.h> |
27 | #include <linux/profile.h> | 27 | #include <linux/profile.h> |
28 | #include <linux/mount.h> | 28 | #include <linux/mount.h> |
29 | #include <linux/proc_fs.h> | 29 | #include <linux/proc_fs.h> |
30 | #include <linux/kthread.h> | 30 | #include <linux/kthread.h> |
31 | #include <linux/mempolicy.h> | 31 | #include <linux/mempolicy.h> |
32 | #include <linux/taskstats_kern.h> | 32 | #include <linux/taskstats_kern.h> |
33 | #include <linux/delayacct.h> | 33 | #include <linux/delayacct.h> |
34 | #include <linux/freezer.h> | 34 | #include <linux/freezer.h> |
35 | #include <linux/cgroup.h> | 35 | #include <linux/cgroup.h> |
36 | #include <linux/syscalls.h> | 36 | #include <linux/syscalls.h> |
37 | #include <linux/signal.h> | 37 | #include <linux/signal.h> |
38 | #include <linux/posix-timers.h> | 38 | #include <linux/posix-timers.h> |
39 | #include <linux/cn_proc.h> | 39 | #include <linux/cn_proc.h> |
40 | #include <linux/mutex.h> | 40 | #include <linux/mutex.h> |
41 | #include <linux/futex.h> | 41 | #include <linux/futex.h> |
42 | #include <linux/pipe_fs_i.h> | 42 | #include <linux/pipe_fs_i.h> |
43 | #include <linux/audit.h> /* for audit_free() */ | 43 | #include <linux/audit.h> /* for audit_free() */ |
44 | #include <linux/resource.h> | 44 | #include <linux/resource.h> |
45 | #include <linux/blkdev.h> | 45 | #include <linux/blkdev.h> |
46 | #include <linux/task_io_accounting_ops.h> | 46 | #include <linux/task_io_accounting_ops.h> |
47 | #include <linux/tracehook.h> | 47 | #include <linux/tracehook.h> |
48 | #include <linux/fs_struct.h> | 48 | #include <linux/fs_struct.h> |
49 | #include <linux/init_task.h> | 49 | #include <linux/init_task.h> |
50 | #include <linux/perf_event.h> | 50 | #include <linux/perf_event.h> |
51 | #include <trace/events/sched.h> | 51 | #include <trace/events/sched.h> |
52 | #include <linux/hw_breakpoint.h> | 52 | #include <linux/hw_breakpoint.h> |
53 | #include <linux/oom.h> | 53 | #include <linux/oom.h> |
54 | 54 | ||
55 | #include <asm/uaccess.h> | 55 | #include <asm/uaccess.h> |
56 | #include <asm/unistd.h> | 56 | #include <asm/unistd.h> |
57 | #include <asm/pgtable.h> | 57 | #include <asm/pgtable.h> |
58 | #include <asm/mmu_context.h> | 58 | #include <asm/mmu_context.h> |
59 | 59 | ||
60 | static void exit_mm(struct task_struct * tsk); | 60 | static void exit_mm(struct task_struct * tsk); |
61 | 61 | ||
62 | static void __unhash_process(struct task_struct *p, bool group_dead) | 62 | static void __unhash_process(struct task_struct *p, bool group_dead) |
63 | { | 63 | { |
64 | nr_threads--; | 64 | nr_threads--; |
65 | detach_pid(p, PIDTYPE_PID); | 65 | detach_pid(p, PIDTYPE_PID); |
66 | if (group_dead) { | 66 | if (group_dead) { |
67 | detach_pid(p, PIDTYPE_PGID); | 67 | detach_pid(p, PIDTYPE_PGID); |
68 | detach_pid(p, PIDTYPE_SID); | 68 | detach_pid(p, PIDTYPE_SID); |
69 | 69 | ||
70 | list_del_rcu(&p->tasks); | 70 | list_del_rcu(&p->tasks); |
71 | list_del_init(&p->sibling); | 71 | list_del_init(&p->sibling); |
72 | __this_cpu_dec(process_counts); | 72 | __this_cpu_dec(process_counts); |
73 | } | 73 | } |
74 | list_del_rcu(&p->thread_group); | 74 | list_del_rcu(&p->thread_group); |
75 | } | 75 | } |
76 | 76 | ||
77 | /* | 77 | /* |
78 | * This function expects the tasklist_lock write-locked. | 78 | * This function expects the tasklist_lock write-locked. |
79 | */ | 79 | */ |
80 | static void __exit_signal(struct task_struct *tsk) | 80 | static void __exit_signal(struct task_struct *tsk) |
81 | { | 81 | { |
82 | struct signal_struct *sig = tsk->signal; | 82 | struct signal_struct *sig = tsk->signal; |
83 | bool group_dead = thread_group_leader(tsk); | 83 | bool group_dead = thread_group_leader(tsk); |
84 | struct sighand_struct *sighand; | 84 | struct sighand_struct *sighand; |
85 | struct tty_struct *uninitialized_var(tty); | 85 | struct tty_struct *uninitialized_var(tty); |
86 | 86 | ||
87 | sighand = rcu_dereference_check(tsk->sighand, | 87 | sighand = rcu_dereference_check(tsk->sighand, |
88 | rcu_read_lock_held() || | ||
89 | lockdep_tasklist_lock_is_held()); | 88 | lockdep_tasklist_lock_is_held()); |
90 | spin_lock(&sighand->siglock); | 89 | spin_lock(&sighand->siglock); |
91 | 90 | ||
92 | posix_cpu_timers_exit(tsk); | 91 | posix_cpu_timers_exit(tsk); |
93 | if (group_dead) { | 92 | if (group_dead) { |
94 | posix_cpu_timers_exit_group(tsk); | 93 | posix_cpu_timers_exit_group(tsk); |
95 | tty = sig->tty; | 94 | tty = sig->tty; |
96 | sig->tty = NULL; | 95 | sig->tty = NULL; |
97 | } else { | 96 | } else { |
98 | /* | 97 | /* |
99 | * This can only happen if the caller is de_thread(). | 98 | * This can only happen if the caller is de_thread(). |
100 | * FIXME: this is a temporary hack; we should teach | 99 | * FIXME: this is a temporary hack; we should teach |
101 | * posix-cpu-timers to handle this case correctly. | 100 | * posix-cpu-timers to handle this case correctly. |
102 | */ | 101 | */ |
103 | if (unlikely(has_group_leader_pid(tsk))) | 102 | if (unlikely(has_group_leader_pid(tsk))) |
104 | posix_cpu_timers_exit_group(tsk); | 103 | posix_cpu_timers_exit_group(tsk); |
105 | 104 | ||
106 | /* | 105 | /* |
107 | * If there is any task waiting for the group exit | 106 | * If there is any task waiting for the group exit |
108 | * then notify it: | 107 | * then notify it: |
109 | */ | 108 | */ |
110 | if (sig->notify_count > 0 && !--sig->notify_count) | 109 | if (sig->notify_count > 0 && !--sig->notify_count) |
111 | wake_up_process(sig->group_exit_task); | 110 | wake_up_process(sig->group_exit_task); |
112 | 111 | ||
113 | if (tsk == sig->curr_target) | 112 | if (tsk == sig->curr_target) |
114 | sig->curr_target = next_thread(tsk); | 113 | sig->curr_target = next_thread(tsk); |
115 | /* | 114 | /* |
116 | * Accumulate here the counters for all threads but the | 115 | * Accumulate here the counters for all threads but the |
117 | * group leader as they die, so they can be added into | 116 | * group leader as they die, so they can be added into |
118 | * the process-wide totals when those are taken. | 117 | * the process-wide totals when those are taken. |
119 | * The group leader stays around as a zombie as long | 118 | * The group leader stays around as a zombie as long |
120 | * as there are other threads. When it gets reaped, | 119 | * as there are other threads. When it gets reaped, |
121 | * the exit.c code will add its counts into these totals. | 120 | * the exit.c code will add its counts into these totals. |
122 | * We won't ever get here for the group leader, since it | 121 | * We won't ever get here for the group leader, since it |
123 | * will have been the last reference on the signal_struct. | 122 | * will have been the last reference on the signal_struct. |
124 | */ | 123 | */ |
125 | sig->utime = cputime_add(sig->utime, tsk->utime); | 124 | sig->utime = cputime_add(sig->utime, tsk->utime); |
126 | sig->stime = cputime_add(sig->stime, tsk->stime); | 125 | sig->stime = cputime_add(sig->stime, tsk->stime); |
127 | sig->gtime = cputime_add(sig->gtime, tsk->gtime); | 126 | sig->gtime = cputime_add(sig->gtime, tsk->gtime); |
128 | sig->min_flt += tsk->min_flt; | 127 | sig->min_flt += tsk->min_flt; |
129 | sig->maj_flt += tsk->maj_flt; | 128 | sig->maj_flt += tsk->maj_flt; |
130 | sig->nvcsw += tsk->nvcsw; | 129 | sig->nvcsw += tsk->nvcsw; |
131 | sig->nivcsw += tsk->nivcsw; | 130 | sig->nivcsw += tsk->nivcsw; |
132 | sig->inblock += task_io_get_inblock(tsk); | 131 | sig->inblock += task_io_get_inblock(tsk); |
133 | sig->oublock += task_io_get_oublock(tsk); | 132 | sig->oublock += task_io_get_oublock(tsk); |
134 | task_io_accounting_add(&sig->ioac, &tsk->ioac); | 133 | task_io_accounting_add(&sig->ioac, &tsk->ioac); |
135 | sig->sum_sched_runtime += tsk->se.sum_exec_runtime; | 134 | sig->sum_sched_runtime += tsk->se.sum_exec_runtime; |
136 | } | 135 | } |
137 | 136 | ||
138 | sig->nr_threads--; | 137 | sig->nr_threads--; |
139 | __unhash_process(tsk, group_dead); | 138 | __unhash_process(tsk, group_dead); |
140 | 139 | ||
141 | /* | 140 | /* |
142 | * Do this under ->siglock, we can race with another thread | 141 | * Do this under ->siglock, we can race with another thread |
143 | * doing sigqueue_free() if we have SIGQUEUE_PREALLOC signals. | 142 | * doing sigqueue_free() if we have SIGQUEUE_PREALLOC signals. |
144 | */ | 143 | */ |
145 | flush_sigqueue(&tsk->pending); | 144 | flush_sigqueue(&tsk->pending); |
146 | tsk->sighand = NULL; | 145 | tsk->sighand = NULL; |
147 | spin_unlock(&sighand->siglock); | 146 | spin_unlock(&sighand->siglock); |
148 | 147 | ||
149 | __cleanup_sighand(sighand); | 148 | __cleanup_sighand(sighand); |
150 | clear_tsk_thread_flag(tsk,TIF_SIGPENDING); | 149 | clear_tsk_thread_flag(tsk,TIF_SIGPENDING); |
151 | if (group_dead) { | 150 | if (group_dead) { |
152 | flush_sigqueue(&sig->shared_pending); | 151 | flush_sigqueue(&sig->shared_pending); |
153 | tty_kref_put(tty); | 152 | tty_kref_put(tty); |
154 | } | 153 | } |
155 | } | 154 | } |
156 | 155 | ||
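For context on the hunk in __exit_signal() above: rcu_dereference_check() already includes the RCU read-side check in its lockdep condition, so a caller only has to name the locks that are specific to it. A minimal, illustrative sketch of the caller-side change (not an additional hunk of this diff):

	/* before: the RCU read-side condition was spelled out by the caller */
	sighand = rcu_dereference_check(tsk->sighand,
					rcu_read_lock_held() ||
					lockdep_tasklist_lock_is_held());

	/* after: rcu_read_lock_held() is implied by rcu_dereference_check() */
	sighand = rcu_dereference_check(tsk->sighand,
					lockdep_tasklist_lock_is_held());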
157 | static void delayed_put_task_struct(struct rcu_head *rhp) | 156 | static void delayed_put_task_struct(struct rcu_head *rhp) |
158 | { | 157 | { |
159 | struct task_struct *tsk = container_of(rhp, struct task_struct, rcu); | 158 | struct task_struct *tsk = container_of(rhp, struct task_struct, rcu); |
160 | 159 | ||
161 | perf_event_delayed_put(tsk); | 160 | perf_event_delayed_put(tsk); |
162 | trace_sched_process_free(tsk); | 161 | trace_sched_process_free(tsk); |
163 | put_task_struct(tsk); | 162 | put_task_struct(tsk); |
164 | } | 163 | } |
165 | 164 | ||
166 | 165 | ||
167 | void release_task(struct task_struct * p) | 166 | void release_task(struct task_struct * p) |
168 | { | 167 | { |
169 | struct task_struct *leader; | 168 | struct task_struct *leader; |
170 | int zap_leader; | 169 | int zap_leader; |
171 | repeat: | 170 | repeat: |
172 | tracehook_prepare_release_task(p); | 171 | tracehook_prepare_release_task(p); |
173 | /* don't need to get the RCU readlock here - the process is dead and | 172 | /* don't need to get the RCU readlock here - the process is dead and |
174 | * can't be modifying its own credentials. But shut RCU-lockdep up */ | 173 | * can't be modifying its own credentials. But shut RCU-lockdep up */ |
175 | rcu_read_lock(); | 174 | rcu_read_lock(); |
176 | atomic_dec(&__task_cred(p)->user->processes); | 175 | atomic_dec(&__task_cred(p)->user->processes); |
177 | rcu_read_unlock(); | 176 | rcu_read_unlock(); |
178 | 177 | ||
179 | proc_flush_task(p); | 178 | proc_flush_task(p); |
180 | 179 | ||
181 | write_lock_irq(&tasklist_lock); | 180 | write_lock_irq(&tasklist_lock); |
182 | tracehook_finish_release_task(p); | 181 | tracehook_finish_release_task(p); |
183 | __exit_signal(p); | 182 | __exit_signal(p); |
184 | 183 | ||
185 | /* | 184 | /* |
186 | * If we are the last non-leader member of the thread | 185 | * If we are the last non-leader member of the thread |
187 | * group, and the leader is zombie, then notify the | 186 | * group, and the leader is zombie, then notify the |
188 | * group leader's parent process. (if it wants notification.) | 187 | * group leader's parent process. (if it wants notification.) |
189 | */ | 188 | */ |
190 | zap_leader = 0; | 189 | zap_leader = 0; |
191 | leader = p->group_leader; | 190 | leader = p->group_leader; |
192 | if (leader != p && thread_group_empty(leader) && leader->exit_state == EXIT_ZOMBIE) { | 191 | if (leader != p && thread_group_empty(leader) && leader->exit_state == EXIT_ZOMBIE) { |
193 | BUG_ON(task_detached(leader)); | 192 | BUG_ON(task_detached(leader)); |
194 | do_notify_parent(leader, leader->exit_signal); | 193 | do_notify_parent(leader, leader->exit_signal); |
195 | /* | 194 | /* |
196 | * If we were the last child thread and the leader has | 195 | * If we were the last child thread and the leader has |
197 | * exited already, and the leader's parent ignores SIGCHLD, | 196 | * exited already, and the leader's parent ignores SIGCHLD, |
198 | * then we are the one who should release the leader. | 197 | * then we are the one who should release the leader. |
199 | * | 198 | * |
200 | * do_notify_parent() will have marked it self-reaping in | 199 | * do_notify_parent() will have marked it self-reaping in |
201 | * that case. | 200 | * that case. |
202 | */ | 201 | */ |
203 | zap_leader = task_detached(leader); | 202 | zap_leader = task_detached(leader); |
204 | 203 | ||
205 | /* | 204 | /* |
206 | * This maintains the invariant that release_task() | 205 | * This maintains the invariant that release_task() |
207 | * only runs on a task in EXIT_DEAD, just for sanity. | 206 | * only runs on a task in EXIT_DEAD, just for sanity. |
208 | */ | 207 | */ |
209 | if (zap_leader) | 208 | if (zap_leader) |
210 | leader->exit_state = EXIT_DEAD; | 209 | leader->exit_state = EXIT_DEAD; |
211 | } | 210 | } |
212 | 211 | ||
213 | write_unlock_irq(&tasklist_lock); | 212 | write_unlock_irq(&tasklist_lock); |
214 | release_thread(p); | 213 | release_thread(p); |
215 | call_rcu(&p->rcu, delayed_put_task_struct); | 214 | call_rcu(&p->rcu, delayed_put_task_struct); |
216 | 215 | ||
217 | p = leader; | 216 | p = leader; |
218 | if (unlikely(zap_leader)) | 217 | if (unlikely(zap_leader)) |
219 | goto repeat; | 218 | goto repeat; |
220 | } | 219 | } |
221 | 220 | ||
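The rcu_read_lock()/rcu_read_unlock() pair around __task_cred() in release_task() above exists only to keep RCU-lockdep quiet, since the task is already dead; for a task that may still be live, the same pattern is the usual way to read its credentials. A hedged sketch under that assumption (the helper name is hypothetical and not part of this commit):

	/* Hypothetical helper: read p's per-user process count under RCU. */
	static unsigned int task_user_processes_sketch(struct task_struct *p)
	{
		unsigned int nr;

		rcu_read_lock();
		/* __task_cred() is rcu_dereference_check()-based, so the
		 * RCU read-side critical section satisfies lockdep here. */
		nr = atomic_read(&__task_cred(p)->user->processes);
		rcu_read_unlock();

		return nr;
	}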
222 | /* | 221 | /* |
223 | * This checks not only the pgrp, but falls back on the pid if no | 222 | * This checks not only the pgrp, but falls back on the pid if no |
224 | * satisfactory pgrp is found. I dunno - gdb doesn't work correctly | 223 | * satisfactory pgrp is found. I dunno - gdb doesn't work correctly |
225 | * without this... | 224 | * without this... |
226 | * | 225 | * |
227 | * The caller must hold rcu lock or the tasklist lock. | 226 | * The caller must hold rcu lock or the tasklist lock. |
228 | */ | 227 | */ |
229 | struct pid *session_of_pgrp(struct pid *pgrp) | 228 | struct pid *session_of_pgrp(struct pid *pgrp) |
230 | { | 229 | { |
231 | struct task_struct *p; | 230 | struct task_struct *p; |
232 | struct pid *sid = NULL; | 231 | struct pid *sid = NULL; |
233 | 232 | ||
234 | p = pid_task(pgrp, PIDTYPE_PGID); | 233 | p = pid_task(pgrp, PIDTYPE_PGID); |
235 | if (p == NULL) | 234 | if (p == NULL) |
236 | p = pid_task(pgrp, PIDTYPE_PID); | 235 | p = pid_task(pgrp, PIDTYPE_PID); |
237 | if (p != NULL) | 236 | if (p != NULL) |
238 | sid = task_session(p); | 237 | sid = task_session(p); |
239 | 238 | ||
240 | return sid; | 239 | return sid; |
241 | } | 240 | } |
242 | 241 | ||
243 | /* | 242 | /* |
244 | * Determine if a process group is "orphaned", according to the POSIX | 243 | * Determine if a process group is "orphaned", according to the POSIX |
245 | * definition in 2.2.2.52. Orphaned process groups are not to be affected | 244 | * definition in 2.2.2.52. Orphaned process groups are not to be affected |
246 | * by terminal-generated stop signals. Newly orphaned process groups are | 245 | * by terminal-generated stop signals. Newly orphaned process groups are |
247 | * to receive a SIGHUP and a SIGCONT. | 246 | * to receive a SIGHUP and a SIGCONT. |
248 | * | 247 | * |
249 | * "I ask you, have you ever known what it is to be an orphan?" | 248 | * "I ask you, have you ever known what it is to be an orphan?" |
250 | */ | 249 | */ |
251 | static int will_become_orphaned_pgrp(struct pid *pgrp, struct task_struct *ignored_task) | 250 | static int will_become_orphaned_pgrp(struct pid *pgrp, struct task_struct *ignored_task) |
252 | { | 251 | { |
253 | struct task_struct *p; | 252 | struct task_struct *p; |
254 | 253 | ||
255 | do_each_pid_task(pgrp, PIDTYPE_PGID, p) { | 254 | do_each_pid_task(pgrp, PIDTYPE_PGID, p) { |
256 | if ((p == ignored_task) || | 255 | if ((p == ignored_task) || |
257 | (p->exit_state && thread_group_empty(p)) || | 256 | (p->exit_state && thread_group_empty(p)) || |
258 | is_global_init(p->real_parent)) | 257 | is_global_init(p->real_parent)) |
259 | continue; | 258 | continue; |
260 | 259 | ||
261 | if (task_pgrp(p->real_parent) != pgrp && | 260 | if (task_pgrp(p->real_parent) != pgrp && |
262 | task_session(p->real_parent) == task_session(p)) | 261 | task_session(p->real_parent) == task_session(p)) |
263 | return 0; | 262 | return 0; |
264 | } while_each_pid_task(pgrp, PIDTYPE_PGID, p); | 263 | } while_each_pid_task(pgrp, PIDTYPE_PGID, p); |
265 | 264 | ||
266 | return 1; | 265 | return 1; |
267 | } | 266 | } |
268 | 267 | ||
269 | int is_current_pgrp_orphaned(void) | 268 | int is_current_pgrp_orphaned(void) |
270 | { | 269 | { |
271 | int retval; | 270 | int retval; |
272 | 271 | ||
273 | read_lock(&tasklist_lock); | 272 | read_lock(&tasklist_lock); |
274 | retval = will_become_orphaned_pgrp(task_pgrp(current), NULL); | 273 | retval = will_become_orphaned_pgrp(task_pgrp(current), NULL); |
275 | read_unlock(&tasklist_lock); | 274 | read_unlock(&tasklist_lock); |
276 | 275 | ||
277 | return retval; | 276 | return retval; |
278 | } | 277 | } |
279 | 278 | ||
280 | static int has_stopped_jobs(struct pid *pgrp) | 279 | static int has_stopped_jobs(struct pid *pgrp) |
281 | { | 280 | { |
282 | int retval = 0; | 281 | int retval = 0; |
283 | struct task_struct *p; | 282 | struct task_struct *p; |
284 | 283 | ||
285 | do_each_pid_task(pgrp, PIDTYPE_PGID, p) { | 284 | do_each_pid_task(pgrp, PIDTYPE_PGID, p) { |
286 | if (!task_is_stopped(p)) | 285 | if (!task_is_stopped(p)) |
287 | continue; | 286 | continue; |
288 | retval = 1; | 287 | retval = 1; |
289 | break; | 288 | break; |
290 | } while_each_pid_task(pgrp, PIDTYPE_PGID, p); | 289 | } while_each_pid_task(pgrp, PIDTYPE_PGID, p); |
291 | return retval; | 290 | return retval; |
292 | } | 291 | } |
293 | 292 | ||
294 | /* | 293 | /* |
295 | * Check to see if any process groups have become orphaned as | 294 | * Check to see if any process groups have become orphaned as |
296 | * a result of our exiting, and if they have any stopped jobs, | 295 | * a result of our exiting, and if they have any stopped jobs, |
297 | * send them a SIGHUP and then a SIGCONT. (POSIX 3.2.2.2) | 296 | * send them a SIGHUP and then a SIGCONT. (POSIX 3.2.2.2) |
298 | */ | 297 | */ |
299 | static void | 298 | static void |
300 | kill_orphaned_pgrp(struct task_struct *tsk, struct task_struct *parent) | 299 | kill_orphaned_pgrp(struct task_struct *tsk, struct task_struct *parent) |
301 | { | 300 | { |
302 | struct pid *pgrp = task_pgrp(tsk); | 301 | struct pid *pgrp = task_pgrp(tsk); |
303 | struct task_struct *ignored_task = tsk; | 302 | struct task_struct *ignored_task = tsk; |
304 | 303 | ||
305 | if (!parent) | 304 | if (!parent) |
306 | /* exit: our father is in a different pgrp than | 305 | /* exit: our father is in a different pgrp than |
307 | * we are and we were the only connection outside. | 306 | * we are and we were the only connection outside. |
308 | */ | 307 | */ |
309 | parent = tsk->real_parent; | 308 | parent = tsk->real_parent; |
310 | else | 309 | else |
311 | /* reparent: our child is in a different pgrp than | 310 | /* reparent: our child is in a different pgrp than |
312 | * we are, and it was the only connection outside. | 311 | * we are, and it was the only connection outside. |
313 | */ | 312 | */ |
314 | ignored_task = NULL; | 313 | ignored_task = NULL; |
315 | 314 | ||
316 | if (task_pgrp(parent) != pgrp && | 315 | if (task_pgrp(parent) != pgrp && |
317 | task_session(parent) == task_session(tsk) && | 316 | task_session(parent) == task_session(tsk) && |
318 | will_become_orphaned_pgrp(pgrp, ignored_task) && | 317 | will_become_orphaned_pgrp(pgrp, ignored_task) && |
319 | has_stopped_jobs(pgrp)) { | 318 | has_stopped_jobs(pgrp)) { |
320 | __kill_pgrp_info(SIGHUP, SEND_SIG_PRIV, pgrp); | 319 | __kill_pgrp_info(SIGHUP, SEND_SIG_PRIV, pgrp); |
321 | __kill_pgrp_info(SIGCONT, SEND_SIG_PRIV, pgrp); | 320 | __kill_pgrp_info(SIGCONT, SEND_SIG_PRIV, pgrp); |
322 | } | 321 | } |
323 | } | 322 | } |
324 | 323 | ||
325 | /** | 324 | /** |
326 | * reparent_to_kthreadd - Reparent the calling kernel thread to kthreadd | 325 | * reparent_to_kthreadd - Reparent the calling kernel thread to kthreadd |
327 | * | 326 | * |
328 | * If a kernel thread is launched as a result of a system call, or if | 327 | * If a kernel thread is launched as a result of a system call, or if |
329 | * it ever exits, it should generally reparent itself to kthreadd so it | 328 | * it ever exits, it should generally reparent itself to kthreadd so it |
330 | * isn't in the way of other processes and is correctly cleaned up on exit. | 329 | * isn't in the way of other processes and is correctly cleaned up on exit. |
331 | * | 330 | * |
332 | * The various task state such as scheduling policy and priority may have | 331 | * The various task state such as scheduling policy and priority may have |
333 | * been inherited from a user process, so we reset them to sane values here. | 332 | * been inherited from a user process, so we reset them to sane values here. |
334 | * | 333 | * |
335 | * NOTE that reparent_to_kthreadd() gives the caller full capabilities. | 334 | * NOTE that reparent_to_kthreadd() gives the caller full capabilities. |
336 | */ | 335 | */ |
337 | static void reparent_to_kthreadd(void) | 336 | static void reparent_to_kthreadd(void) |
338 | { | 337 | { |
339 | write_lock_irq(&tasklist_lock); | 338 | write_lock_irq(&tasklist_lock); |
340 | 339 | ||
341 | ptrace_unlink(current); | 340 | ptrace_unlink(current); |
342 | /* Reparent to init */ | 341 | /* Reparent to init */ |
343 | current->real_parent = current->parent = kthreadd_task; | 342 | current->real_parent = current->parent = kthreadd_task; |
344 | list_move_tail(&current->sibling, &current->real_parent->children); | 343 | list_move_tail(&current->sibling, &current->real_parent->children); |
345 | 344 | ||
346 | /* Set the exit signal to SIGCHLD so we signal init on exit */ | 345 | /* Set the exit signal to SIGCHLD so we signal init on exit */ |
347 | current->exit_signal = SIGCHLD; | 346 | current->exit_signal = SIGCHLD; |
348 | 347 | ||
349 | if (task_nice(current) < 0) | 348 | if (task_nice(current) < 0) |
350 | set_user_nice(current, 0); | 349 | set_user_nice(current, 0); |
351 | /* cpus_allowed? */ | 350 | /* cpus_allowed? */ |
352 | /* rt_priority? */ | 351 | /* rt_priority? */ |
353 | /* signals? */ | 352 | /* signals? */ |
354 | memcpy(current->signal->rlim, init_task.signal->rlim, | 353 | memcpy(current->signal->rlim, init_task.signal->rlim, |
355 | sizeof(current->signal->rlim)); | 354 | sizeof(current->signal->rlim)); |
356 | 355 | ||
357 | atomic_inc(&init_cred.usage); | 356 | atomic_inc(&init_cred.usage); |
358 | commit_creds(&init_cred); | 357 | commit_creds(&init_cred); |
359 | write_unlock_irq(&tasklist_lock); | 358 | write_unlock_irq(&tasklist_lock); |
360 | } | 359 | } |
361 | 360 | ||
362 | void __set_special_pids(struct pid *pid) | 361 | void __set_special_pids(struct pid *pid) |
363 | { | 362 | { |
364 | struct task_struct *curr = current->group_leader; | 363 | struct task_struct *curr = current->group_leader; |
365 | 364 | ||
366 | if (task_session(curr) != pid) | 365 | if (task_session(curr) != pid) |
367 | change_pid(curr, PIDTYPE_SID, pid); | 366 | change_pid(curr, PIDTYPE_SID, pid); |
368 | 367 | ||
369 | if (task_pgrp(curr) != pid) | 368 | if (task_pgrp(curr) != pid) |
370 | change_pid(curr, PIDTYPE_PGID, pid); | 369 | change_pid(curr, PIDTYPE_PGID, pid); |
371 | } | 370 | } |
372 | 371 | ||
373 | static void set_special_pids(struct pid *pid) | 372 | static void set_special_pids(struct pid *pid) |
374 | { | 373 | { |
375 | write_lock_irq(&tasklist_lock); | 374 | write_lock_irq(&tasklist_lock); |
376 | __set_special_pids(pid); | 375 | __set_special_pids(pid); |
377 | write_unlock_irq(&tasklist_lock); | 376 | write_unlock_irq(&tasklist_lock); |
378 | } | 377 | } |
379 | 378 | ||
380 | /* | 379 | /* |
381 | * Let kernel threads use this to say that they allow a certain signal. | 380 | * Let kernel threads use this to say that they allow a certain signal. |
382 | * Must not be used if kthread was cloned with CLONE_SIGHAND. | 381 | * Must not be used if kthread was cloned with CLONE_SIGHAND. |
383 | */ | 382 | */ |
384 | int allow_signal(int sig) | 383 | int allow_signal(int sig) |
385 | { | 384 | { |
386 | if (!valid_signal(sig) || sig < 1) | 385 | if (!valid_signal(sig) || sig < 1) |
387 | return -EINVAL; | 386 | return -EINVAL; |
388 | 387 | ||
389 | spin_lock_irq(&current->sighand->siglock); | 388 | spin_lock_irq(&current->sighand->siglock); |
390 | /* This is only needed for daemonize()'ed kthreads */ | 389 | /* This is only needed for daemonize()'ed kthreads */ |
391 | sigdelset(&current->blocked, sig); | 390 | sigdelset(&current->blocked, sig); |
392 | /* | 391 | /* |
393 | * Kernel threads handle their own signals. Let the signal code | 392 | * Kernel threads handle their own signals. Let the signal code |
394 | * know it'll be handled, so that they don't get converted to | 393 | * know it'll be handled, so that they don't get converted to |
395 | * SIGKILL or just silently dropped. | 394 | * SIGKILL or just silently dropped. |
396 | */ | 395 | */ |
397 | current->sighand->action[(sig)-1].sa.sa_handler = (void __user *)2; | 396 | current->sighand->action[(sig)-1].sa.sa_handler = (void __user *)2; |
398 | recalc_sigpending(); | 397 | recalc_sigpending(); |
399 | spin_unlock_irq(&current->sighand->siglock); | 398 | spin_unlock_irq(&current->sighand->siglock); |
400 | return 0; | 399 | return 0; |
401 | } | 400 | } |
402 | 401 | ||
403 | EXPORT_SYMBOL(allow_signal); | 402 | EXPORT_SYMBOL(allow_signal); |
404 | 403 | ||
405 | int disallow_signal(int sig) | 404 | int disallow_signal(int sig) |
406 | { | 405 | { |
407 | if (!valid_signal(sig) || sig < 1) | 406 | if (!valid_signal(sig) || sig < 1) |
408 | return -EINVAL; | 407 | return -EINVAL; |
409 | 408 | ||
410 | spin_lock_irq(&current->sighand->siglock); | 409 | spin_lock_irq(&current->sighand->siglock); |
411 | current->sighand->action[(sig)-1].sa.sa_handler = SIG_IGN; | 410 | current->sighand->action[(sig)-1].sa.sa_handler = SIG_IGN; |
412 | recalc_sigpending(); | 411 | recalc_sigpending(); |
413 | spin_unlock_irq(&current->sighand->siglock); | 412 | spin_unlock_irq(&current->sighand->siglock); |
414 | return 0; | 413 | return 0; |
415 | } | 414 | } |
416 | 415 | ||
417 | EXPORT_SYMBOL(disallow_signal); | 416 | EXPORT_SYMBOL(disallow_signal); |
418 | 417 | ||
419 | /* | 418 | /* |
420 | * Put all the gunge required to become a kernel thread without | 419 | * Put all the gunge required to become a kernel thread without |
421 | * attached user resources in one place where it belongs. | 420 | * attached user resources in one place where it belongs. |
422 | */ | 421 | */ |
423 | 422 | ||
424 | void daemonize(const char *name, ...) | 423 | void daemonize(const char *name, ...) |
425 | { | 424 | { |
426 | va_list args; | 425 | va_list args; |
427 | sigset_t blocked; | 426 | sigset_t blocked; |
428 | 427 | ||
429 | va_start(args, name); | 428 | va_start(args, name); |
430 | vsnprintf(current->comm, sizeof(current->comm), name, args); | 429 | vsnprintf(current->comm, sizeof(current->comm), name, args); |
431 | va_end(args); | 430 | va_end(args); |
432 | 431 | ||
433 | /* | 432 | /* |
434 | * If we were started as result of loading a module, close all of the | 433 | * If we were started as result of loading a module, close all of the |
435 | * user space pages. We don't need them, and if we didn't close them | 434 | * user space pages. We don't need them, and if we didn't close them |
436 | * they would be locked into memory. | 435 | * they would be locked into memory. |
437 | */ | 436 | */ |
438 | exit_mm(current); | 437 | exit_mm(current); |
439 | /* | 438 | /* |
440 | * We don't want to have TIF_FREEZE set if the system-wide hibernation | 439 | * We don't want to have TIF_FREEZE set if the system-wide hibernation |
441 | * or suspend transition begins right now. | 440 | * or suspend transition begins right now. |
442 | */ | 441 | */ |
443 | current->flags |= (PF_NOFREEZE | PF_KTHREAD); | 442 | current->flags |= (PF_NOFREEZE | PF_KTHREAD); |
444 | 443 | ||
445 | if (current->nsproxy != &init_nsproxy) { | 444 | if (current->nsproxy != &init_nsproxy) { |
446 | get_nsproxy(&init_nsproxy); | 445 | get_nsproxy(&init_nsproxy); |
447 | switch_task_namespaces(current, &init_nsproxy); | 446 | switch_task_namespaces(current, &init_nsproxy); |
448 | } | 447 | } |
449 | set_special_pids(&init_struct_pid); | 448 | set_special_pids(&init_struct_pid); |
450 | proc_clear_tty(current); | 449 | proc_clear_tty(current); |
451 | 450 | ||
452 | /* Block and flush all signals */ | 451 | /* Block and flush all signals */ |
453 | sigfillset(&blocked); | 452 | sigfillset(&blocked); |
454 | sigprocmask(SIG_BLOCK, &blocked, NULL); | 453 | sigprocmask(SIG_BLOCK, &blocked, NULL); |
455 | flush_signals(current); | 454 | flush_signals(current); |
456 | 455 | ||
457 | /* Become as one with the init task */ | 456 | /* Become as one with the init task */ |
458 | 457 | ||
459 | daemonize_fs_struct(); | 458 | daemonize_fs_struct(); |
460 | exit_files(current); | 459 | exit_files(current); |
461 | current->files = init_task.files; | 460 | current->files = init_task.files; |
462 | atomic_inc(&current->files->count); | 461 | atomic_inc(&current->files->count); |
463 | 462 | ||
464 | reparent_to_kthreadd(); | 463 | reparent_to_kthreadd(); |
465 | } | 464 | } |
466 | 465 | ||
467 | EXPORT_SYMBOL(daemonize); | 466 | EXPORT_SYMBOL(daemonize); |
468 | 467 | ||
469 | static void close_files(struct files_struct * files) | 468 | static void close_files(struct files_struct * files) |
470 | { | 469 | { |
471 | int i, j; | 470 | int i, j; |
472 | struct fdtable *fdt; | 471 | struct fdtable *fdt; |
473 | 472 | ||
474 | j = 0; | 473 | j = 0; |
475 | 474 | ||
476 | /* | 475 | /* |
477 | * It is safe to dereference the fd table without RCU or | 476 | * It is safe to dereference the fd table without RCU or |
478 | * ->file_lock because this is the last reference to the | 477 | * ->file_lock because this is the last reference to the |
479 | * files structure. But use RCU to shut RCU-lockdep up. | 478 | * files structure. But use RCU to shut RCU-lockdep up. |
480 | */ | 479 | */ |
481 | rcu_read_lock(); | 480 | rcu_read_lock(); |
482 | fdt = files_fdtable(files); | 481 | fdt = files_fdtable(files); |
483 | rcu_read_unlock(); | 482 | rcu_read_unlock(); |
484 | for (;;) { | 483 | for (;;) { |
485 | unsigned long set; | 484 | unsigned long set; |
486 | i = j * __NFDBITS; | 485 | i = j * __NFDBITS; |
487 | if (i >= fdt->max_fds) | 486 | if (i >= fdt->max_fds) |
488 | break; | 487 | break; |
489 | set = fdt->open_fds->fds_bits[j++]; | 488 | set = fdt->open_fds->fds_bits[j++]; |
490 | while (set) { | 489 | while (set) { |
491 | if (set & 1) { | 490 | if (set & 1) { |
492 | struct file * file = xchg(&fdt->fd[i], NULL); | 491 | struct file * file = xchg(&fdt->fd[i], NULL); |
493 | if (file) { | 492 | if (file) { |
494 | filp_close(file, files); | 493 | filp_close(file, files); |
495 | cond_resched(); | 494 | cond_resched(); |
496 | } | 495 | } |
497 | } | 496 | } |
498 | i++; | 497 | i++; |
499 | set >>= 1; | 498 | set >>= 1; |
500 | } | 499 | } |
501 | } | 500 | } |
502 | } | 501 | } |
503 | 502 | ||
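close_files() above may skip the RCU read lock only because it holds the last reference to the files_struct; any caller that does not must read the fdtable under rcu_read_lock() (or ->file_lock). A small sketch under that assumption (the helper name is hypothetical and not part of this commit):

	/* Hypothetical helper: report whether fd is currently open in files. */
	static bool fd_is_open_sketch(struct files_struct *files, unsigned int fd)
	{
		struct fdtable *fdt;
		bool open = false;

		rcu_read_lock();
		fdt = files_fdtable(files);	/* rcu_dereference_check_fdtable() inside */
		if (fd < fdt->max_fds)
			open = FD_ISSET(fd, fdt->open_fds);
		rcu_read_unlock();

		return open;
	}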
504 | struct files_struct *get_files_struct(struct task_struct *task) | 503 | struct files_struct *get_files_struct(struct task_struct *task) |
505 | { | 504 | { |
506 | struct files_struct *files; | 505 | struct files_struct *files; |
507 | 506 | ||
508 | task_lock(task); | 507 | task_lock(task); |
509 | files = task->files; | 508 | files = task->files; |
510 | if (files) | 509 | if (files) |
511 | atomic_inc(&files->count); | 510 | atomic_inc(&files->count); |
512 | task_unlock(task); | 511 | task_unlock(task); |
513 | 512 | ||
514 | return files; | 513 | return files; |
515 | } | 514 | } |
516 | 515 | ||
517 | void put_files_struct(struct files_struct *files) | 516 | void put_files_struct(struct files_struct *files) |
518 | { | 517 | { |
519 | struct fdtable *fdt; | 518 | struct fdtable *fdt; |
520 | 519 | ||
521 | if (atomic_dec_and_test(&files->count)) { | 520 | if (atomic_dec_and_test(&files->count)) { |
522 | close_files(files); | 521 | close_files(files); |
523 | /* | 522 | /* |
524 | * Free the fd and fdset arrays if we expanded them. | 523 | * Free the fd and fdset arrays if we expanded them. |
525 | * If the fdtable was embedded, pass files for freeing | 524 | * If the fdtable was embedded, pass files for freeing |
526 | * at the end of the RCU grace period. Otherwise, | 525 | * at the end of the RCU grace period. Otherwise, |
527 | * you can free files immediately. | 526 | * you can free files immediately. |
528 | */ | 527 | */ |
529 | rcu_read_lock(); | 528 | rcu_read_lock(); |
530 | fdt = files_fdtable(files); | 529 | fdt = files_fdtable(files); |
531 | if (fdt != &files->fdtab) | 530 | if (fdt != &files->fdtab) |
532 | kmem_cache_free(files_cachep, files); | 531 | kmem_cache_free(files_cachep, files); |
533 | free_fdtable(fdt); | 532 | free_fdtable(fdt); |
534 | rcu_read_unlock(); | 533 | rcu_read_unlock(); |
535 | } | 534 | } |
536 | } | 535 | } |
537 | 536 | ||
538 | void reset_files_struct(struct files_struct *files) | 537 | void reset_files_struct(struct files_struct *files) |
539 | { | 538 | { |
540 | struct task_struct *tsk = current; | 539 | struct task_struct *tsk = current; |
541 | struct files_struct *old; | 540 | struct files_struct *old; |
542 | 541 | ||
543 | old = tsk->files; | 542 | old = tsk->files; |
544 | task_lock(tsk); | 543 | task_lock(tsk); |
545 | tsk->files = files; | 544 | tsk->files = files; |
546 | task_unlock(tsk); | 545 | task_unlock(tsk); |
547 | put_files_struct(old); | 546 | put_files_struct(old); |
548 | } | 547 | } |
549 | 548 | ||
550 | void exit_files(struct task_struct *tsk) | 549 | void exit_files(struct task_struct *tsk) |
551 | { | 550 | { |
552 | struct files_struct * files = tsk->files; | 551 | struct files_struct * files = tsk->files; |
553 | 552 | ||
554 | if (files) { | 553 | if (files) { |
555 | task_lock(tsk); | 554 | task_lock(tsk); |
556 | tsk->files = NULL; | 555 | tsk->files = NULL; |
557 | task_unlock(tsk); | 556 | task_unlock(tsk); |
558 | put_files_struct(files); | 557 | put_files_struct(files); |
559 | } | 558 | } |
560 | } | 559 | } |
561 | 560 | ||
562 | #ifdef CONFIG_MM_OWNER | 561 | #ifdef CONFIG_MM_OWNER |
563 | /* | 562 | /* |
564 | * Task p is exiting and it owned mm, let's find a new owner for it | 563 | * Task p is exiting and it owned mm, let's find a new owner for it |
565 | */ | 564 | */ |
566 | static inline int | 565 | static inline int |
567 | mm_need_new_owner(struct mm_struct *mm, struct task_struct *p) | 566 | mm_need_new_owner(struct mm_struct *mm, struct task_struct *p) |
568 | { | 567 | { |
569 | /* | 568 | /* |
570 | * If there are other users of the mm and the owner (us) is exiting | 569 | * If there are other users of the mm and the owner (us) is exiting |
571 | * we need to find a new owner to take on the responsibility. | 570 | * we need to find a new owner to take on the responsibility. |
572 | */ | 571 | */ |
573 | if (atomic_read(&mm->mm_users) <= 1) | 572 | if (atomic_read(&mm->mm_users) <= 1) |
574 | return 0; | 573 | return 0; |
575 | if (mm->owner != p) | 574 | if (mm->owner != p) |
576 | return 0; | 575 | return 0; |
577 | return 1; | 576 | return 1; |
578 | } | 577 | } |
579 | 578 | ||
580 | void mm_update_next_owner(struct mm_struct *mm) | 579 | void mm_update_next_owner(struct mm_struct *mm) |
581 | { | 580 | { |
582 | struct task_struct *c, *g, *p = current; | 581 | struct task_struct *c, *g, *p = current; |
583 | 582 | ||
584 | retry: | 583 | retry: |
585 | if (!mm_need_new_owner(mm, p)) | 584 | if (!mm_need_new_owner(mm, p)) |
586 | return; | 585 | return; |
587 | 586 | ||
588 | read_lock(&tasklist_lock); | 587 | read_lock(&tasklist_lock); |
589 | /* | 588 | /* |
590 | * Search in the children | 589 | * Search in the children |
591 | */ | 590 | */ |
592 | list_for_each_entry(c, &p->children, sibling) { | 591 | list_for_each_entry(c, &p->children, sibling) { |
593 | if (c->mm == mm) | 592 | if (c->mm == mm) |
594 | goto assign_new_owner; | 593 | goto assign_new_owner; |
595 | } | 594 | } |
596 | 595 | ||
597 | /* | 596 | /* |
598 | * Search in the siblings | 597 | * Search in the siblings |
599 | */ | 598 | */ |
600 | list_for_each_entry(c, &p->real_parent->children, sibling) { | 599 | list_for_each_entry(c, &p->real_parent->children, sibling) { |
601 | if (c->mm == mm) | 600 | if (c->mm == mm) |
602 | goto assign_new_owner; | 601 | goto assign_new_owner; |
603 | } | 602 | } |
604 | 603 | ||
605 | /* | 604 | /* |
606 | * Search through everything else. We should not get | 605 | * Search through everything else. We should not get |
607 | * here often | 606 | * here often |
608 | */ | 607 | */ |
609 | do_each_thread(g, c) { | 608 | do_each_thread(g, c) { |
610 | if (c->mm == mm) | 609 | if (c->mm == mm) |
611 | goto assign_new_owner; | 610 | goto assign_new_owner; |
612 | } while_each_thread(g, c); | 611 | } while_each_thread(g, c); |
613 | 612 | ||
614 | read_unlock(&tasklist_lock); | 613 | read_unlock(&tasklist_lock); |
615 | /* | 614 | /* |
616 | * We found no owner, yet mm_users > 1: this implies that we are | 615 | * We found no owner, yet mm_users > 1: this implies that we are |
617 | * most likely racing with swapoff (try_to_unuse()) or /proc or | 616 | * most likely racing with swapoff (try_to_unuse()) or /proc or |
618 | * ptrace or page migration (get_task_mm()). Mark owner as NULL. | 617 | * ptrace or page migration (get_task_mm()). Mark owner as NULL. |
619 | */ | 618 | */ |
620 | mm->owner = NULL; | 619 | mm->owner = NULL; |
621 | return; | 620 | return; |
622 | 621 | ||
623 | assign_new_owner: | 622 | assign_new_owner: |
624 | BUG_ON(c == p); | 623 | BUG_ON(c == p); |
625 | get_task_struct(c); | 624 | get_task_struct(c); |
626 | /* | 625 | /* |
627 | * The task_lock protects c->mm from changing. | 626 | * The task_lock protects c->mm from changing. |
628 | * We always want mm->owner->mm == mm | 627 | * We always want mm->owner->mm == mm |
629 | */ | 628 | */ |
630 | task_lock(c); | 629 | task_lock(c); |
631 | /* | 630 | /* |
632 | * Delay read_unlock() till we have the task_lock() | 631 | * Delay read_unlock() till we have the task_lock() |
633 | * to ensure that c does not slip away underneath us | 632 | * to ensure that c does not slip away underneath us |
634 | */ | 633 | */ |
635 | read_unlock(&tasklist_lock); | 634 | read_unlock(&tasklist_lock); |
636 | if (c->mm != mm) { | 635 | if (c->mm != mm) { |
637 | task_unlock(c); | 636 | task_unlock(c); |
638 | put_task_struct(c); | 637 | put_task_struct(c); |
639 | goto retry; | 638 | goto retry; |
640 | } | 639 | } |
641 | mm->owner = c; | 640 | mm->owner = c; |
642 | task_unlock(c); | 641 | task_unlock(c); |
643 | put_task_struct(c); | 642 | put_task_struct(c); |
644 | } | 643 | } |
645 | #endif /* CONFIG_MM_OWNER */ | 644 | #endif /* CONFIG_MM_OWNER */ |
646 | 645 | ||
647 | /* | 646 | /* |
648 | * Turn us into a lazy TLB process if we | 647 | * Turn us into a lazy TLB process if we |
649 | * aren't already.. | 648 | * aren't already.. |
650 | */ | 649 | */ |
651 | static void exit_mm(struct task_struct * tsk) | 650 | static void exit_mm(struct task_struct * tsk) |
652 | { | 651 | { |
653 | struct mm_struct *mm = tsk->mm; | 652 | struct mm_struct *mm = tsk->mm; |
654 | struct core_state *core_state; | 653 | struct core_state *core_state; |
655 | 654 | ||
656 | mm_release(tsk, mm); | 655 | mm_release(tsk, mm); |
657 | if (!mm) | 656 | if (!mm) |
658 | return; | 657 | return; |
659 | /* | 658 | /* |
660 | * Serialize with any possible pending coredump. | 659 | * Serialize with any possible pending coredump. |
661 | * We must hold mmap_sem around checking core_state | 660 | * We must hold mmap_sem around checking core_state |
662 | * and clearing tsk->mm. The core-inducing thread | 661 | * and clearing tsk->mm. The core-inducing thread |
663 | * will increment ->nr_threads for each thread in the | 662 | * will increment ->nr_threads for each thread in the |
664 | * group with ->mm != NULL. | 663 | * group with ->mm != NULL. |
665 | */ | 664 | */ |
666 | down_read(&mm->mmap_sem); | 665 | down_read(&mm->mmap_sem); |
667 | core_state = mm->core_state; | 666 | core_state = mm->core_state; |
668 | if (core_state) { | 667 | if (core_state) { |
669 | struct core_thread self; | 668 | struct core_thread self; |
670 | up_read(&mm->mmap_sem); | 669 | up_read(&mm->mmap_sem); |
671 | 670 | ||
672 | self.task = tsk; | 671 | self.task = tsk; |
673 | self.next = xchg(&core_state->dumper.next, &self); | 672 | self.next = xchg(&core_state->dumper.next, &self); |
674 | /* | 673 | /* |
675 | * Implies mb(), the result of xchg() must be visible | 674 | * Implies mb(), the result of xchg() must be visible |
676 | * to core_state->dumper. | 675 | * to core_state->dumper. |
677 | */ | 676 | */ |
678 | if (atomic_dec_and_test(&core_state->nr_threads)) | 677 | if (atomic_dec_and_test(&core_state->nr_threads)) |
679 | complete(&core_state->startup); | 678 | complete(&core_state->startup); |
680 | 679 | ||
681 | for (;;) { | 680 | for (;;) { |
682 | set_task_state(tsk, TASK_UNINTERRUPTIBLE); | 681 | set_task_state(tsk, TASK_UNINTERRUPTIBLE); |
683 | if (!self.task) /* see coredump_finish() */ | 682 | if (!self.task) /* see coredump_finish() */ |
684 | break; | 683 | break; |
685 | schedule(); | 684 | schedule(); |
686 | } | 685 | } |
687 | __set_task_state(tsk, TASK_RUNNING); | 686 | __set_task_state(tsk, TASK_RUNNING); |
688 | down_read(&mm->mmap_sem); | 687 | down_read(&mm->mmap_sem); |
689 | } | 688 | } |
690 | atomic_inc(&mm->mm_count); | 689 | atomic_inc(&mm->mm_count); |
691 | BUG_ON(mm != tsk->active_mm); | 690 | BUG_ON(mm != tsk->active_mm); |
692 | /* more a memory barrier than a real lock */ | 691 | /* more a memory barrier than a real lock */ |
693 | task_lock(tsk); | 692 | task_lock(tsk); |
694 | tsk->mm = NULL; | 693 | tsk->mm = NULL; |
695 | up_read(&mm->mmap_sem); | 694 | up_read(&mm->mmap_sem); |
696 | enter_lazy_tlb(mm, current); | 695 | enter_lazy_tlb(mm, current); |
697 | /* We don't want this task to be frozen prematurely */ | 696 | /* We don't want this task to be frozen prematurely */ |
698 | clear_freeze_flag(tsk); | 697 | clear_freeze_flag(tsk); |
699 | if (tsk->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) | 698 | if (tsk->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) |
700 | atomic_dec(&mm->oom_disable_count); | 699 | atomic_dec(&mm->oom_disable_count); |
701 | task_unlock(tsk); | 700 | task_unlock(tsk); |
702 | mm_update_next_owner(mm); | 701 | mm_update_next_owner(mm); |
703 | mmput(mm); | 702 | mmput(mm); |
704 | } | 703 | } |
705 | 704 | ||
706 | /* | 705 | /* |
707 | * When we die, we re-parent all our children. | 706 | * When we die, we re-parent all our children. |
708 | * Try to give them to another thread in our thread | 707 | * Try to give them to another thread in our thread |
709 | * group, and if no such member exists, give it to | 708 | * group, and if no such member exists, give it to |
710 | * the child reaper process (ie "init") in our pid | 709 | * the child reaper process (ie "init") in our pid |
711 | * space. | 710 | * space. |
712 | */ | 711 | */ |
713 | static struct task_struct *find_new_reaper(struct task_struct *father) | 712 | static struct task_struct *find_new_reaper(struct task_struct *father) |
714 | __releases(&tasklist_lock) | 713 | __releases(&tasklist_lock) |
715 | __acquires(&tasklist_lock) | 714 | __acquires(&tasklist_lock) |
716 | { | 715 | { |
717 | struct pid_namespace *pid_ns = task_active_pid_ns(father); | 716 | struct pid_namespace *pid_ns = task_active_pid_ns(father); |
718 | struct task_struct *thread; | 717 | struct task_struct *thread; |
719 | 718 | ||
720 | thread = father; | 719 | thread = father; |
721 | while_each_thread(father, thread) { | 720 | while_each_thread(father, thread) { |
722 | if (thread->flags & PF_EXITING) | 721 | if (thread->flags & PF_EXITING) |
723 | continue; | 722 | continue; |
724 | if (unlikely(pid_ns->child_reaper == father)) | 723 | if (unlikely(pid_ns->child_reaper == father)) |
725 | pid_ns->child_reaper = thread; | 724 | pid_ns->child_reaper = thread; |
726 | return thread; | 725 | return thread; |
727 | } | 726 | } |
728 | 727 | ||
729 | if (unlikely(pid_ns->child_reaper == father)) { | 728 | if (unlikely(pid_ns->child_reaper == father)) { |
730 | write_unlock_irq(&tasklist_lock); | 729 | write_unlock_irq(&tasklist_lock); |
731 | if (unlikely(pid_ns == &init_pid_ns)) | 730 | if (unlikely(pid_ns == &init_pid_ns)) |
732 | panic("Attempted to kill init!"); | 731 | panic("Attempted to kill init!"); |
733 | 732 | ||
734 | zap_pid_ns_processes(pid_ns); | 733 | zap_pid_ns_processes(pid_ns); |
735 | write_lock_irq(&tasklist_lock); | 734 | write_lock_irq(&tasklist_lock); |
736 | /* | 735 | /* |
737 | * We can not clear ->child_reaper or leave it alone. | 736 | * We can not clear ->child_reaper or leave it alone. |
738 | * There may be stealth EXIT_DEAD tasks on ->children, | 737 | * There may be stealth EXIT_DEAD tasks on ->children, |
739 | * forget_original_parent() must move them somewhere. | 738 | * forget_original_parent() must move them somewhere. |
740 | */ | 739 | */ |
741 | pid_ns->child_reaper = init_pid_ns.child_reaper; | 740 | pid_ns->child_reaper = init_pid_ns.child_reaper; |
742 | } | 741 | } |
743 | 742 | ||
744 | return pid_ns->child_reaper; | 743 | return pid_ns->child_reaper; |
745 | } | 744 | } |
746 | 745 | ||
747 | /* | 746 | /* |
748 | * Any that need to be release_task'd are put on the @dead list. | 747 | * Any that need to be release_task'd are put on the @dead list. |
749 | */ | 748 | */ |
750 | static void reparent_leader(struct task_struct *father, struct task_struct *p, | 749 | static void reparent_leader(struct task_struct *father, struct task_struct *p, |
751 | struct list_head *dead) | 750 | struct list_head *dead) |
752 | { | 751 | { |
753 | list_move_tail(&p->sibling, &p->real_parent->children); | 752 | list_move_tail(&p->sibling, &p->real_parent->children); |
754 | 753 | ||
755 | if (task_detached(p)) | 754 | if (task_detached(p)) |
756 | return; | 755 | return; |
757 | /* | 756 | /* |
758 | * If this is a threaded reparent there is no need to | 757 | * If this is a threaded reparent there is no need to |
759 | * notify anyone anything has happened. | 758 | * notify anyone anything has happened. |
760 | */ | 759 | */ |
761 | if (same_thread_group(p->real_parent, father)) | 760 | if (same_thread_group(p->real_parent, father)) |
762 | return; | 761 | return; |
763 | 762 | ||
764 | /* We don't want people slaying init. */ | 763 | /* We don't want people slaying init. */ |
765 | p->exit_signal = SIGCHLD; | 764 | p->exit_signal = SIGCHLD; |
766 | 765 | ||
767 | /* If it has exited notify the new parent about this child's death. */ | 766 | /* If it has exited notify the new parent about this child's death. */ |
768 | if (!task_ptrace(p) && | 767 | if (!task_ptrace(p) && |
769 | p->exit_state == EXIT_ZOMBIE && thread_group_empty(p)) { | 768 | p->exit_state == EXIT_ZOMBIE && thread_group_empty(p)) { |
770 | do_notify_parent(p, p->exit_signal); | 769 | do_notify_parent(p, p->exit_signal); |
771 | if (task_detached(p)) { | 770 | if (task_detached(p)) { |
772 | p->exit_state = EXIT_DEAD; | 771 | p->exit_state = EXIT_DEAD; |
773 | list_move_tail(&p->sibling, dead); | 772 | list_move_tail(&p->sibling, dead); |
774 | } | 773 | } |
775 | } | 774 | } |
776 | 775 | ||
777 | kill_orphaned_pgrp(p, father); | 776 | kill_orphaned_pgrp(p, father); |
778 | } | 777 | } |
779 | 778 | ||
780 | static void forget_original_parent(struct task_struct *father) | 779 | static void forget_original_parent(struct task_struct *father) |
781 | { | 780 | { |
782 | struct task_struct *p, *n, *reaper; | 781 | struct task_struct *p, *n, *reaper; |
783 | LIST_HEAD(dead_children); | 782 | LIST_HEAD(dead_children); |
784 | 783 | ||
785 | write_lock_irq(&tasklist_lock); | 784 | write_lock_irq(&tasklist_lock); |
786 | /* | 785 | /* |
787 | * Note that exit_ptrace() and find_new_reaper() might | 786 | * Note that exit_ptrace() and find_new_reaper() might |
788 | * drop tasklist_lock and reacquire it. | 787 | * drop tasklist_lock and reacquire it. |
789 | */ | 788 | */ |
790 | exit_ptrace(father); | 789 | exit_ptrace(father); |
791 | reaper = find_new_reaper(father); | 790 | reaper = find_new_reaper(father); |
792 | 791 | ||
793 | list_for_each_entry_safe(p, n, &father->children, sibling) { | 792 | list_for_each_entry_safe(p, n, &father->children, sibling) { |
794 | struct task_struct *t = p; | 793 | struct task_struct *t = p; |
795 | do { | 794 | do { |
796 | t->real_parent = reaper; | 795 | t->real_parent = reaper; |
797 | if (t->parent == father) { | 796 | if (t->parent == father) { |
798 | BUG_ON(task_ptrace(t)); | 797 | BUG_ON(task_ptrace(t)); |
799 | t->parent = t->real_parent; | 798 | t->parent = t->real_parent; |
800 | } | 799 | } |
801 | if (t->pdeath_signal) | 800 | if (t->pdeath_signal) |
802 | group_send_sig_info(t->pdeath_signal, | 801 | group_send_sig_info(t->pdeath_signal, |
803 | SEND_SIG_NOINFO, t); | 802 | SEND_SIG_NOINFO, t); |
804 | } while_each_thread(p, t); | 803 | } while_each_thread(p, t); |
805 | reparent_leader(father, p, &dead_children); | 804 | reparent_leader(father, p, &dead_children); |
806 | } | 805 | } |
807 | write_unlock_irq(&tasklist_lock); | 806 | write_unlock_irq(&tasklist_lock); |
808 | 807 | ||
809 | BUG_ON(!list_empty(&father->children)); | 808 | BUG_ON(!list_empty(&father->children)); |
810 | 809 | ||
811 | list_for_each_entry_safe(p, n, &dead_children, sibling) { | 810 | list_for_each_entry_safe(p, n, &dead_children, sibling) { |
812 | list_del_init(&p->sibling); | 811 | list_del_init(&p->sibling); |
813 | release_task(p); | 812 | release_task(p); |
814 | } | 813 | } |
815 | } | 814 | } |
816 | 815 | ||
817 | /* | 816 | /* |
818 | * Send signals to all our closest relatives so that they know | 817 | * Send signals to all our closest relatives so that they know |
819 | * to properly mourn us.. | 818 | * to properly mourn us.. |
820 | */ | 819 | */ |
821 | static void exit_notify(struct task_struct *tsk, int group_dead) | 820 | static void exit_notify(struct task_struct *tsk, int group_dead) |
822 | { | 821 | { |
823 | int signal; | 822 | int signal; |
824 | void *cookie; | 823 | void *cookie; |
825 | 824 | ||
826 | /* | 825 | /* |
827 | * This does two things: | 826 | * This does two things: |
828 | * | 827 | * |
829 | * A. Make init inherit all the child processes | 828 | * A. Make init inherit all the child processes |
830 | * B. Check to see if any process groups have become orphaned | 829 | * B. Check to see if any process groups have become orphaned |
831 | * as a result of our exiting, and if they have any stopped | 830 | * as a result of our exiting, and if they have any stopped |
832 | * jobs, send them a SIGHUP and then a SIGCONT. (POSIX 3.2.2.2) | 831 | * jobs, send them a SIGHUP and then a SIGCONT. (POSIX 3.2.2.2) |
833 | */ | 832 | */ |
834 | forget_original_parent(tsk); | 833 | forget_original_parent(tsk); |
835 | exit_task_namespaces(tsk); | 834 | exit_task_namespaces(tsk); |
836 | 835 | ||
837 | write_lock_irq(&tasklist_lock); | 836 | write_lock_irq(&tasklist_lock); |
838 | if (group_dead) | 837 | if (group_dead) |
839 | kill_orphaned_pgrp(tsk->group_leader, NULL); | 838 | kill_orphaned_pgrp(tsk->group_leader, NULL); |
840 | 839 | ||
841 | /* Let father know we died | 840 | /* Let father know we died |
842 | * | 841 | * |
843 | * Thread signals are configurable, but you aren't going to use | 842 | * Thread signals are configurable, but you aren't going to use |
844 | * that to send signals to arbitrary processes. | 843 | * that to send signals to arbitrary processes. |
845 | * That stops right now. | 844 | * That stops right now. |
846 | * | 845 | * |
847 | * If the parent exec id doesn't match the exec id we saved | 846 | * If the parent exec id doesn't match the exec id we saved |
848 | * when we started then we know the parent has changed security | 847 | * when we started then we know the parent has changed security |
849 | * domain. | 848 | * domain. |
850 | * | 849 | * |
851 | * If our self_exec id doesn't match our parent_exec_id then | 850 | * If our self_exec id doesn't match our parent_exec_id then |
852 | * we have changed execution domain as these two values started | 851 | * we have changed execution domain as these two values started |
853 | * the same after a fork. | 852 | * the same after a fork. |
854 | */ | 853 | */ |
855 | if (tsk->exit_signal != SIGCHLD && !task_detached(tsk) && | 854 | if (tsk->exit_signal != SIGCHLD && !task_detached(tsk) && |
856 | (tsk->parent_exec_id != tsk->real_parent->self_exec_id || | 855 | (tsk->parent_exec_id != tsk->real_parent->self_exec_id || |
857 | tsk->self_exec_id != tsk->parent_exec_id)) | 856 | tsk->self_exec_id != tsk->parent_exec_id)) |
858 | tsk->exit_signal = SIGCHLD; | 857 | tsk->exit_signal = SIGCHLD; |
859 | 858 | ||
860 | signal = tracehook_notify_death(tsk, &cookie, group_dead); | 859 | signal = tracehook_notify_death(tsk, &cookie, group_dead); |
861 | if (signal >= 0) | 860 | if (signal >= 0) |
862 | signal = do_notify_parent(tsk, signal); | 861 | signal = do_notify_parent(tsk, signal); |
863 | 862 | ||
864 | tsk->exit_state = signal == DEATH_REAP ? EXIT_DEAD : EXIT_ZOMBIE; | 863 | tsk->exit_state = signal == DEATH_REAP ? EXIT_DEAD : EXIT_ZOMBIE; |
865 | 864 | ||
866 | /* mt-exec, de_thread() is waiting for group leader */ | 865 | /* mt-exec, de_thread() is waiting for group leader */ |
867 | if (unlikely(tsk->signal->notify_count < 0)) | 866 | if (unlikely(tsk->signal->notify_count < 0)) |
868 | wake_up_process(tsk->signal->group_exit_task); | 867 | wake_up_process(tsk->signal->group_exit_task); |
869 | write_unlock_irq(&tasklist_lock); | 868 | write_unlock_irq(&tasklist_lock); |
870 | 869 | ||
871 | tracehook_report_death(tsk, signal, cookie, group_dead); | 870 | tracehook_report_death(tsk, signal, cookie, group_dead); |
872 | 871 | ||
873 | /* If the process is dead, release it - nobody will wait for it */ | 872 | /* If the process is dead, release it - nobody will wait for it */ |
874 | if (signal == DEATH_REAP) | 873 | if (signal == DEATH_REAP) |
875 | release_task(tsk); | 874 | release_task(tsk); |
876 | } | 875 | } |
877 | 876 | ||
878 | #ifdef CONFIG_DEBUG_STACK_USAGE | 877 | #ifdef CONFIG_DEBUG_STACK_USAGE |
879 | static void check_stack_usage(void) | 878 | static void check_stack_usage(void) |
880 | { | 879 | { |
881 | static DEFINE_SPINLOCK(low_water_lock); | 880 | static DEFINE_SPINLOCK(low_water_lock); |
882 | static int lowest_to_date = THREAD_SIZE; | 881 | static int lowest_to_date = THREAD_SIZE; |
883 | unsigned long free; | 882 | unsigned long free; |
884 | 883 | ||
885 | free = stack_not_used(current); | 884 | free = stack_not_used(current); |
886 | 885 | ||
887 | if (free >= lowest_to_date) | 886 | if (free >= lowest_to_date) |
888 | return; | 887 | return; |
889 | 888 | ||
890 | spin_lock(&low_water_lock); | 889 | spin_lock(&low_water_lock); |
891 | if (free < lowest_to_date) { | 890 | if (free < lowest_to_date) { |
892 | printk(KERN_WARNING "%s used greatest stack depth: %lu bytes " | 891 | printk(KERN_WARNING "%s used greatest stack depth: %lu bytes " |
893 | "left\n", | 892 | "left\n", |
894 | current->comm, free); | 893 | current->comm, free); |
895 | lowest_to_date = free; | 894 | lowest_to_date = free; |
896 | } | 895 | } |
897 | spin_unlock(&low_water_lock); | 896 | spin_unlock(&low_water_lock); |
898 | } | 897 | } |
899 | #else | 898 | #else |
900 | static inline void check_stack_usage(void) {} | 899 | static inline void check_stack_usage(void) {} |
901 | #endif | 900 | #endif |
902 | 901 | ||
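check_stack_usage() above is a small instance of a common pattern: test the racy fast path without the lock, then retest under the lock before updating the shared minimum. The same shape in plain C with pthreads, purely as an illustration (the unlocked first read is a heuristic, exactly as in the kernel version):

        #include <limits.h>
        #include <pthread.h>
        #include <stdio.h>

        static pthread_mutex_t low_water_lock = PTHREAD_MUTEX_INITIALIZER;
        static long lowest_to_date = LONG_MAX;

        static void record_low_water(long free_bytes)
        {
                if (free_bytes >= lowest_to_date)       /* unlocked fast path: not a new minimum */
                        return;

                pthread_mutex_lock(&low_water_lock);
                if (free_bytes < lowest_to_date) {      /* recheck: another thread may have won */
                        printf("new low-water mark: %ld bytes left\n", free_bytes);
                        lowest_to_date = free_bytes;
                }
                pthread_mutex_unlock(&low_water_lock);
        }

        int main(void)
        {
                record_low_water(8192);
                record_low_water(4096);
                record_low_water(6000);                 /* not lower: fast path returns */
                return 0;
        }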
903 | NORET_TYPE void do_exit(long code) | 902 | NORET_TYPE void do_exit(long code) |
904 | { | 903 | { |
905 | struct task_struct *tsk = current; | 904 | struct task_struct *tsk = current; |
906 | int group_dead; | 905 | int group_dead; |
907 | 906 | ||
908 | profile_task_exit(tsk); | 907 | profile_task_exit(tsk); |
909 | 908 | ||
910 | WARN_ON(atomic_read(&tsk->fs_excl)); | 909 | WARN_ON(atomic_read(&tsk->fs_excl)); |
911 | WARN_ON(blk_needs_flush_plug(tsk)); | 910 | WARN_ON(blk_needs_flush_plug(tsk)); |
912 | 911 | ||
913 | if (unlikely(in_interrupt())) | 912 | if (unlikely(in_interrupt())) |
914 | panic("Aiee, killing interrupt handler!"); | 913 | panic("Aiee, killing interrupt handler!"); |
915 | if (unlikely(!tsk->pid)) | 914 | if (unlikely(!tsk->pid)) |
916 | panic("Attempted to kill the idle task!"); | 915 | panic("Attempted to kill the idle task!"); |
917 | 916 | ||
918 | /* | 917 | /* |
919 | * If do_exit is called because this process oopsed, it's possible | 918 | * If do_exit is called because this process oopsed, it's possible |
920 | * that get_fs() was left as KERNEL_DS, so reset it to USER_DS before | 919 | * that get_fs() was left as KERNEL_DS, so reset it to USER_DS before |
921 | * continuing. Amongst other possible reasons, this is to prevent | 920 | * continuing. Amongst other possible reasons, this is to prevent |
922 | * mm_release()->clear_child_tid() from writing to a user-controlled | 921 | * mm_release()->clear_child_tid() from writing to a user-controlled |
923 | * kernel address. | 922 | * kernel address. |
924 | */ | 923 | */ |
925 | set_fs(USER_DS); | 924 | set_fs(USER_DS); |
926 | 925 | ||
927 | tracehook_report_exit(&code); | 926 | tracehook_report_exit(&code); |
928 | 927 | ||
929 | validate_creds_for_do_exit(tsk); | 928 | validate_creds_for_do_exit(tsk); |
930 | 929 | ||
931 | /* | 930 | /* |
932 | * We're taking recursive faults here in do_exit. Safest is to just | 931 | * We're taking recursive faults here in do_exit. Safest is to just |
933 | * leave this task alone and wait for reboot. | 932 | * leave this task alone and wait for reboot. |
934 | */ | 933 | */ |
935 | if (unlikely(tsk->flags & PF_EXITING)) { | 934 | if (unlikely(tsk->flags & PF_EXITING)) { |
936 | printk(KERN_ALERT | 935 | printk(KERN_ALERT |
937 | "Fixing recursive fault but reboot is needed!\n"); | 936 | "Fixing recursive fault but reboot is needed!\n"); |
938 | /* | 937 | /* |
939 | * We can do this unlocked here. The futex code uses | 938 | * We can do this unlocked here. The futex code uses |
940 | * this flag just to verify whether the pi state | 939 | * this flag just to verify whether the pi state |
941 | * cleanup has been done or not. In the worst case it | 940 | * cleanup has been done or not. In the worst case it |
942 | * loops once more. We pretend that the cleanup was | 941 | * loops once more. We pretend that the cleanup was |
943 | * done as there is no way to return. Either the | 942 | * done as there is no way to return. Either the |
944 | * OWNER_DIED bit is set by now or we push the blocked | 943 | * OWNER_DIED bit is set by now or we push the blocked |
945 | * task into the wait-forever nirvana as well. | 944 | * task into the wait-forever nirvana as well. |
946 | */ | 945 | */ |
947 | tsk->flags |= PF_EXITPIDONE; | 946 | tsk->flags |= PF_EXITPIDONE; |
948 | set_current_state(TASK_UNINTERRUPTIBLE); | 947 | set_current_state(TASK_UNINTERRUPTIBLE); |
949 | schedule(); | 948 | schedule(); |
950 | } | 949 | } |
951 | 950 | ||
952 | exit_irq_thread(); | 951 | exit_irq_thread(); |
953 | 952 | ||
954 | exit_signals(tsk); /* sets PF_EXITING */ | 953 | exit_signals(tsk); /* sets PF_EXITING */ |
955 | /* | 954 | /* |
956 | * tsk->flags are checked in the futex code to protect against | 955 | * tsk->flags are checked in the futex code to protect against |
957 | * an exiting task cleaning up the robust pi futexes. | 956 | * an exiting task cleaning up the robust pi futexes. |
958 | */ | 957 | */ |
959 | smp_mb(); | 958 | smp_mb(); |
960 | raw_spin_unlock_wait(&tsk->pi_lock); | 959 | raw_spin_unlock_wait(&tsk->pi_lock); |
961 | 960 | ||
962 | if (unlikely(in_atomic())) | 961 | if (unlikely(in_atomic())) |
963 | printk(KERN_INFO "note: %s[%d] exited with preempt_count %d\n", | 962 | printk(KERN_INFO "note: %s[%d] exited with preempt_count %d\n", |
964 | current->comm, task_pid_nr(current), | 963 | current->comm, task_pid_nr(current), |
965 | preempt_count()); | 964 | preempt_count()); |
966 | 965 | ||
967 | acct_update_integrals(tsk); | 966 | acct_update_integrals(tsk); |
968 | /* sync mm's RSS info before statistics gathering */ | 967 | /* sync mm's RSS info before statistics gathering */ |
969 | if (tsk->mm) | 968 | if (tsk->mm) |
970 | sync_mm_rss(tsk, tsk->mm); | 969 | sync_mm_rss(tsk, tsk->mm); |
971 | group_dead = atomic_dec_and_test(&tsk->signal->live); | 970 | group_dead = atomic_dec_and_test(&tsk->signal->live); |
972 | if (group_dead) { | 971 | if (group_dead) { |
973 | hrtimer_cancel(&tsk->signal->real_timer); | 972 | hrtimer_cancel(&tsk->signal->real_timer); |
974 | exit_itimers(tsk->signal); | 973 | exit_itimers(tsk->signal); |
975 | if (tsk->mm) | 974 | if (tsk->mm) |
976 | setmax_mm_hiwater_rss(&tsk->signal->maxrss, tsk->mm); | 975 | setmax_mm_hiwater_rss(&tsk->signal->maxrss, tsk->mm); |
977 | } | 976 | } |
978 | acct_collect(code, group_dead); | 977 | acct_collect(code, group_dead); |
979 | if (group_dead) | 978 | if (group_dead) |
980 | tty_audit_exit(); | 979 | tty_audit_exit(); |
981 | if (unlikely(tsk->audit_context)) | 980 | if (unlikely(tsk->audit_context)) |
982 | audit_free(tsk); | 981 | audit_free(tsk); |
983 | 982 | ||
984 | tsk->exit_code = code; | 983 | tsk->exit_code = code; |
985 | taskstats_exit(tsk, group_dead); | 984 | taskstats_exit(tsk, group_dead); |
986 | 985 | ||
987 | exit_mm(tsk); | 986 | exit_mm(tsk); |
988 | 987 | ||
989 | if (group_dead) | 988 | if (group_dead) |
990 | acct_process(); | 989 | acct_process(); |
991 | trace_sched_process_exit(tsk); | 990 | trace_sched_process_exit(tsk); |
992 | 991 | ||
993 | exit_sem(tsk); | 992 | exit_sem(tsk); |
994 | exit_files(tsk); | 993 | exit_files(tsk); |
995 | exit_fs(tsk); | 994 | exit_fs(tsk); |
996 | check_stack_usage(); | 995 | check_stack_usage(); |
997 | exit_thread(); | 996 | exit_thread(); |
998 | 997 | ||
999 | /* | 998 | /* |
1000 | * Flush inherited counters to the parent - before the parent | 999 | * Flush inherited counters to the parent - before the parent |
1001 | * gets woken up by child-exit notifications. | 1000 | * gets woken up by child-exit notifications. |
1002 | * | 1001 | * |
1003 | * because of cgroup mode, must be called before cgroup_exit() | 1002 | * because of cgroup mode, must be called before cgroup_exit() |
1004 | */ | 1003 | */ |
1005 | perf_event_exit_task(tsk); | 1004 | perf_event_exit_task(tsk); |
1006 | 1005 | ||
1007 | cgroup_exit(tsk, 1); | 1006 | cgroup_exit(tsk, 1); |
1008 | 1007 | ||
1009 | if (group_dead) | 1008 | if (group_dead) |
1010 | disassociate_ctty(1); | 1009 | disassociate_ctty(1); |
1011 | 1010 | ||
1012 | module_put(task_thread_info(tsk)->exec_domain->module); | 1011 | module_put(task_thread_info(tsk)->exec_domain->module); |
1013 | 1012 | ||
1014 | proc_exit_connector(tsk); | 1013 | proc_exit_connector(tsk); |
1015 | 1014 | ||
1016 | /* | 1015 | /* |
1017 | * FIXME: do that only when needed, using sched_exit tracepoint | 1016 | * FIXME: do that only when needed, using sched_exit tracepoint |
1018 | */ | 1017 | */ |
1019 | ptrace_put_breakpoints(tsk); | 1018 | ptrace_put_breakpoints(tsk); |
1020 | 1019 | ||
1021 | exit_notify(tsk, group_dead); | 1020 | exit_notify(tsk, group_dead); |
1022 | #ifdef CONFIG_NUMA | 1021 | #ifdef CONFIG_NUMA |
1023 | task_lock(tsk); | 1022 | task_lock(tsk); |
1024 | mpol_put(tsk->mempolicy); | 1023 | mpol_put(tsk->mempolicy); |
1025 | tsk->mempolicy = NULL; | 1024 | tsk->mempolicy = NULL; |
1026 | task_unlock(tsk); | 1025 | task_unlock(tsk); |
1027 | #endif | 1026 | #endif |
1028 | #ifdef CONFIG_FUTEX | 1027 | #ifdef CONFIG_FUTEX |
1029 | if (unlikely(current->pi_state_cache)) | 1028 | if (unlikely(current->pi_state_cache)) |
1030 | kfree(current->pi_state_cache); | 1029 | kfree(current->pi_state_cache); |
1031 | #endif | 1030 | #endif |
1032 | /* | 1031 | /* |
1033 | * Make sure we are holding no locks: | 1032 | * Make sure we are holding no locks: |
1034 | */ | 1033 | */ |
1035 | debug_check_no_locks_held(tsk); | 1034 | debug_check_no_locks_held(tsk); |
1036 | /* | 1035 | /* |
1037 | * We can do this unlocked here. The futex code uses this flag | 1036 | * We can do this unlocked here. The futex code uses this flag |
1038 | * just to verify whether the pi state cleanup has been done | 1037 | * just to verify whether the pi state cleanup has been done |
1039 | * or not. In the worst case it loops once more. | 1038 | * or not. In the worst case it loops once more. |
1040 | */ | 1039 | */ |
1041 | tsk->flags |= PF_EXITPIDONE; | 1040 | tsk->flags |= PF_EXITPIDONE; |
1042 | 1041 | ||
1043 | if (tsk->io_context) | 1042 | if (tsk->io_context) |
1044 | exit_io_context(tsk); | 1043 | exit_io_context(tsk); |
1045 | 1044 | ||
1046 | if (tsk->splice_pipe) | 1045 | if (tsk->splice_pipe) |
1047 | __free_pipe_info(tsk->splice_pipe); | 1046 | __free_pipe_info(tsk->splice_pipe); |
1048 | 1047 | ||
1049 | validate_creds_for_do_exit(tsk); | 1048 | validate_creds_for_do_exit(tsk); |
1050 | 1049 | ||
1051 | preempt_disable(); | 1050 | preempt_disable(); |
1052 | exit_rcu(); | 1051 | exit_rcu(); |
1053 | /* causes final put_task_struct in finish_task_switch(). */ | 1052 | /* causes final put_task_struct in finish_task_switch(). */ |
1054 | tsk->state = TASK_DEAD; | 1053 | tsk->state = TASK_DEAD; |
1055 | schedule(); | 1054 | schedule(); |
1056 | BUG(); | 1055 | BUG(); |
1057 | /* Avoid "noreturn function does return". */ | 1056 | /* Avoid "noreturn function does return". */ |
1058 | for (;;) | 1057 | for (;;) |
1059 | cpu_relax(); /* For when BUG is null */ | 1058 | cpu_relax(); /* For when BUG is null */ |
1060 | } | 1059 | } |
1061 | 1060 | ||
1062 | EXPORT_SYMBOL_GPL(do_exit); | 1061 | EXPORT_SYMBOL_GPL(do_exit); |
1063 | 1062 | ||
1064 | NORET_TYPE void complete_and_exit(struct completion *comp, long code) | 1063 | NORET_TYPE void complete_and_exit(struct completion *comp, long code) |
1065 | { | 1064 | { |
1066 | if (comp) | 1065 | if (comp) |
1067 | complete(comp); | 1066 | complete(comp); |
1068 | 1067 | ||
1069 | do_exit(code); | 1068 | do_exit(code); |
1070 | } | 1069 | } |
1071 | 1070 | ||
1072 | EXPORT_SYMBOL(complete_and_exit); | 1071 | EXPORT_SYMBOL(complete_and_exit); |
1073 | 1072 | ||
1074 | SYSCALL_DEFINE1(exit, int, error_code) | 1073 | SYSCALL_DEFINE1(exit, int, error_code) |
1075 | { | 1074 | { |
1076 | do_exit((error_code&0xff)<<8); | 1075 | do_exit((error_code&0xff)<<8); |
1077 | } | 1076 | } |
1078 | 1077 | ||
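The (error_code & 0xff) << 8 above is why only the low byte of exit()'s argument reaches the parent, parked in bits 8-15 of the wait status. A quick userspace check (values chosen arbitrarily):

        #include <stdio.h>
        #include <stdlib.h>
        #include <sys/wait.h>
        #include <unistd.h>

        int main(void)
        {
                pid_t pid = fork();
                int status;

                if (pid == 0)
                        exit(300);      /* 300 & 0xff == 44: only the low byte is kept */

                waitpid(pid, &status, 0);
                if (WIFEXITED(status))
                        printf("raw status 0x%x, WEXITSTATUS %d\n",
                               status, WEXITSTATUS(status));    /* prints 0x2c00, 44 */
                return 0;
        }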
1079 | /* | 1078 | /* |
1080 | * Take down every thread in the group. This is called by fatal signals | 1079 | * Take down every thread in the group. This is called by fatal signals |
1081 | * as well as by sys_exit_group (below). | 1080 | * as well as by sys_exit_group (below). |
1082 | */ | 1081 | */ |
1083 | NORET_TYPE void | 1082 | NORET_TYPE void |
1084 | do_group_exit(int exit_code) | 1083 | do_group_exit(int exit_code) |
1085 | { | 1084 | { |
1086 | struct signal_struct *sig = current->signal; | 1085 | struct signal_struct *sig = current->signal; |
1087 | 1086 | ||
1088 | BUG_ON(exit_code & 0x80); /* core dumps don't get here */ | 1087 | BUG_ON(exit_code & 0x80); /* core dumps don't get here */ |
1089 | 1088 | ||
1090 | if (signal_group_exit(sig)) | 1089 | if (signal_group_exit(sig)) |
1091 | exit_code = sig->group_exit_code; | 1090 | exit_code = sig->group_exit_code; |
1092 | else if (!thread_group_empty(current)) { | 1091 | else if (!thread_group_empty(current)) { |
1093 | struct sighand_struct *const sighand = current->sighand; | 1092 | struct sighand_struct *const sighand = current->sighand; |
1094 | spin_lock_irq(&sighand->siglock); | 1093 | spin_lock_irq(&sighand->siglock); |
1095 | if (signal_group_exit(sig)) | 1094 | if (signal_group_exit(sig)) |
1096 | /* Another thread got here before we took the lock. */ | 1095 | /* Another thread got here before we took the lock. */ |
1097 | exit_code = sig->group_exit_code; | 1096 | exit_code = sig->group_exit_code; |
1098 | else { | 1097 | else { |
1099 | sig->group_exit_code = exit_code; | 1098 | sig->group_exit_code = exit_code; |
1100 | sig->flags = SIGNAL_GROUP_EXIT; | 1099 | sig->flags = SIGNAL_GROUP_EXIT; |
1101 | zap_other_threads(current); | 1100 | zap_other_threads(current); |
1102 | } | 1101 | } |
1103 | spin_unlock_irq(&sighand->siglock); | 1102 | spin_unlock_irq(&sighand->siglock); |
1104 | } | 1103 | } |
1105 | 1104 | ||
1106 | do_exit(exit_code); | 1105 | do_exit(exit_code); |
1107 | /* NOTREACHED */ | 1106 | /* NOTREACHED */ |
1108 | } | 1107 | } |
1109 | 1108 | ||
1110 | /* | 1109 | /* |
1111 | * this kills every thread in the thread group. Note that any externally | 1110 | * this kills every thread in the thread group. Note that any externally |
1112 | * wait4()-ing process will get the correct exit code - even if this | 1111 | * wait4()-ing process will get the correct exit code - even if this |
1113 | * thread is not the thread group leader. | 1112 | * thread is not the thread group leader. |
1114 | */ | 1113 | */ |
1115 | SYSCALL_DEFINE1(exit_group, int, error_code) | 1114 | SYSCALL_DEFINE1(exit_group, int, error_code) |
1116 | { | 1115 | { |
1117 | do_group_exit((error_code & 0xff) << 8); | 1116 | do_group_exit((error_code & 0xff) << 8); |
1118 | /* NOTREACHED */ | 1117 | /* NOTREACHED */ |
1119 | return 0; | 1118 | return 0; |
1120 | } | 1119 | } |
1121 | 1120 | ||
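This group-exit path is also why calling exit() from any thread ends the whole process: glibc's exit() lands in sys_exit_group(), which zaps the other threads, whereas pthread_exit() terminates only the caller. A small demonstration (thread and function names are illustrative):

        #include <pthread.h>
        #include <stdio.h>
        #include <stdlib.h>
        #include <unistd.h>

        static void *worker(void *arg)
        {
                sleep(1);
                printf("worker calling exit(3)\n");
                exit(3);                /* whole process exits: do_group_exit() */
                return NULL;
        }

        int main(void)
        {
                pthread_t t;

                pthread_create(&t, NULL, worker, NULL);
                pause();                /* never returns: killed when the worker exits */
                printf("not reached\n");
                return 0;
        }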
1122 | struct wait_opts { | 1121 | struct wait_opts { |
1123 | enum pid_type wo_type; | 1122 | enum pid_type wo_type; |
1124 | int wo_flags; | 1123 | int wo_flags; |
1125 | struct pid *wo_pid; | 1124 | struct pid *wo_pid; |
1126 | 1125 | ||
1127 | struct siginfo __user *wo_info; | 1126 | struct siginfo __user *wo_info; |
1128 | int __user *wo_stat; | 1127 | int __user *wo_stat; |
1129 | struct rusage __user *wo_rusage; | 1128 | struct rusage __user *wo_rusage; |
1130 | 1129 | ||
1131 | wait_queue_t child_wait; | 1130 | wait_queue_t child_wait; |
1132 | int notask_error; | 1131 | int notask_error; |
1133 | }; | 1132 | }; |
1134 | 1133 | ||
1135 | static inline | 1134 | static inline |
1136 | struct pid *task_pid_type(struct task_struct *task, enum pid_type type) | 1135 | struct pid *task_pid_type(struct task_struct *task, enum pid_type type) |
1137 | { | 1136 | { |
1138 | if (type != PIDTYPE_PID) | 1137 | if (type != PIDTYPE_PID) |
1139 | task = task->group_leader; | 1138 | task = task->group_leader; |
1140 | return task->pids[type].pid; | 1139 | return task->pids[type].pid; |
1141 | } | 1140 | } |
1142 | 1141 | ||
1143 | static int eligible_pid(struct wait_opts *wo, struct task_struct *p) | 1142 | static int eligible_pid(struct wait_opts *wo, struct task_struct *p) |
1144 | { | 1143 | { |
1145 | return wo->wo_type == PIDTYPE_MAX || | 1144 | return wo->wo_type == PIDTYPE_MAX || |
1146 | task_pid_type(p, wo->wo_type) == wo->wo_pid; | 1145 | task_pid_type(p, wo->wo_type) == wo->wo_pid; |
1147 | } | 1146 | } |
1148 | 1147 | ||
1149 | static int eligible_child(struct wait_opts *wo, struct task_struct *p) | 1148 | static int eligible_child(struct wait_opts *wo, struct task_struct *p) |
1150 | { | 1149 | { |
1151 | if (!eligible_pid(wo, p)) | 1150 | if (!eligible_pid(wo, p)) |
1152 | return 0; | 1151 | return 0; |
1153 | /* Wait for all children (clone and not) if __WALL is set; | 1152 | /* Wait for all children (clone and not) if __WALL is set; |
1154 | * otherwise, wait for clone children *only* if __WCLONE is | 1153 | * otherwise, wait for clone children *only* if __WCLONE is |
1155 | * set; otherwise, wait for non-clone children *only*. (Note: | 1154 | * set; otherwise, wait for non-clone children *only*. (Note: |
1156 | * A "clone" child here is one that reports to its parent | 1155 | * A "clone" child here is one that reports to its parent |
1157 | * using a signal other than SIGCHLD.) */ | 1156 | * using a signal other than SIGCHLD.) */ |
1158 | if (((p->exit_signal != SIGCHLD) ^ !!(wo->wo_flags & __WCLONE)) | 1157 | if (((p->exit_signal != SIGCHLD) ^ !!(wo->wo_flags & __WCLONE)) |
1159 | && !(wo->wo_flags & __WALL)) | 1158 | && !(wo->wo_flags & __WALL)) |
1160 | return 0; | 1159 | return 0; |
1161 | 1160 | ||
1162 | return 1; | 1161 | return 1; |
1163 | } | 1162 | } |
1164 | 1163 | ||
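The XOR in eligible_child() packs a three-way policy into one line. Restated as a standalone predicate it reads more plainly; the flag bits below are local stand-ins, not the real __WCLONE/__WALL values:

        #include <signal.h>
        #include <stdio.h>

        enum { DEMO_WCLONE = 1, DEMO_WALL = 2 };

        static int demo_eligible(int exit_signal, int wo_flags)
        {
                if (wo_flags & DEMO_WALL)
                        return 1;                               /* __WALL: take both kinds */
                if (wo_flags & DEMO_WCLONE)
                        return exit_signal != SIGCHLD;          /* clone children only */
                return exit_signal == SIGCHLD;                  /* ordinary children only */
        }

        int main(void)
        {
                printf("%d %d %d %d\n",
                       demo_eligible(SIGCHLD, 0),               /* 1 */
                       demo_eligible(SIGUSR1, 0),               /* 0 */
                       demo_eligible(SIGUSR1, DEMO_WCLONE),     /* 1 */
                       demo_eligible(SIGCHLD, DEMO_WALL));      /* 1 */
                return 0;
        }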
1165 | static int wait_noreap_copyout(struct wait_opts *wo, struct task_struct *p, | 1164 | static int wait_noreap_copyout(struct wait_opts *wo, struct task_struct *p, |
1166 | pid_t pid, uid_t uid, int why, int status) | 1165 | pid_t pid, uid_t uid, int why, int status) |
1167 | { | 1166 | { |
1168 | struct siginfo __user *infop; | 1167 | struct siginfo __user *infop; |
1169 | int retval = wo->wo_rusage | 1168 | int retval = wo->wo_rusage |
1170 | ? getrusage(p, RUSAGE_BOTH, wo->wo_rusage) : 0; | 1169 | ? getrusage(p, RUSAGE_BOTH, wo->wo_rusage) : 0; |
1171 | 1170 | ||
1172 | put_task_struct(p); | 1171 | put_task_struct(p); |
1173 | infop = wo->wo_info; | 1172 | infop = wo->wo_info; |
1174 | if (infop) { | 1173 | if (infop) { |
1175 | if (!retval) | 1174 | if (!retval) |
1176 | retval = put_user(SIGCHLD, &infop->si_signo); | 1175 | retval = put_user(SIGCHLD, &infop->si_signo); |
1177 | if (!retval) | 1176 | if (!retval) |
1178 | retval = put_user(0, &infop->si_errno); | 1177 | retval = put_user(0, &infop->si_errno); |
1179 | if (!retval) | 1178 | if (!retval) |
1180 | retval = put_user((short)why, &infop->si_code); | 1179 | retval = put_user((short)why, &infop->si_code); |
1181 | if (!retval) | 1180 | if (!retval) |
1182 | retval = put_user(pid, &infop->si_pid); | 1181 | retval = put_user(pid, &infop->si_pid); |
1183 | if (!retval) | 1182 | if (!retval) |
1184 | retval = put_user(uid, &infop->si_uid); | 1183 | retval = put_user(uid, &infop->si_uid); |
1185 | if (!retval) | 1184 | if (!retval) |
1186 | retval = put_user(status, &infop->si_status); | 1185 | retval = put_user(status, &infop->si_status); |
1187 | } | 1186 | } |
1188 | if (!retval) | 1187 | if (!retval) |
1189 | retval = pid; | 1188 | retval = pid; |
1190 | return retval; | 1189 | return retval; |
1191 | } | 1190 | } |
1192 | 1191 | ||
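The noreap path above is reachable from userspace through waitid() with WNOWAIT: the first call reports the child but leaves it reapable, so a later wait still succeeds. A minimal sketch:

        #define _XOPEN_SOURCE 600
        #include <stdio.h>
        #include <sys/wait.h>
        #include <unistd.h>

        int main(void)
        {
                siginfo_t info;
                pid_t pid = fork();

                if (pid == 0)
                        _exit(5);

                /* peek: the child stays reapable afterwards */
                waitid(P_PID, pid, &info, WEXITED | WNOWAIT);
                printf("peek: pid %d, si_status %d\n", (int)info.si_pid, info.si_status);

                /* now actually reap it */
                waitid(P_PID, pid, &info, WEXITED);
                printf("reap: pid %d, si_status %d\n", (int)info.si_pid, info.si_status);
                return 0;
        }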
1193 | /* | 1192 | /* |
1194 | * Handle sys_wait4 work for one task in state EXIT_ZOMBIE. We hold | 1193 | * Handle sys_wait4 work for one task in state EXIT_ZOMBIE. We hold |
1195 | * read_lock(&tasklist_lock) on entry. If we return zero, we still hold | 1194 | * read_lock(&tasklist_lock) on entry. If we return zero, we still hold |
1196 | * the lock and this task is uninteresting. If we return nonzero, we have | 1195 | * the lock and this task is uninteresting. If we return nonzero, we have |
1197 | * released the lock and the system call should return. | 1196 | * released the lock and the system call should return. |
1198 | */ | 1197 | */ |
1199 | static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) | 1198 | static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) |
1200 | { | 1199 | { |
1201 | unsigned long state; | 1200 | unsigned long state; |
1202 | int retval, status, traced; | 1201 | int retval, status, traced; |
1203 | pid_t pid = task_pid_vnr(p); | 1202 | pid_t pid = task_pid_vnr(p); |
1204 | uid_t uid = __task_cred(p)->uid; | 1203 | uid_t uid = __task_cred(p)->uid; |
1205 | struct siginfo __user *infop; | 1204 | struct siginfo __user *infop; |
1206 | 1205 | ||
1207 | if (!likely(wo->wo_flags & WEXITED)) | 1206 | if (!likely(wo->wo_flags & WEXITED)) |
1208 | return 0; | 1207 | return 0; |
1209 | 1208 | ||
1210 | if (unlikely(wo->wo_flags & WNOWAIT)) { | 1209 | if (unlikely(wo->wo_flags & WNOWAIT)) { |
1211 | int exit_code = p->exit_code; | 1210 | int exit_code = p->exit_code; |
1212 | int why; | 1211 | int why; |
1213 | 1212 | ||
1214 | get_task_struct(p); | 1213 | get_task_struct(p); |
1215 | read_unlock(&tasklist_lock); | 1214 | read_unlock(&tasklist_lock); |
1216 | if ((exit_code & 0x7f) == 0) { | 1215 | if ((exit_code & 0x7f) == 0) { |
1217 | why = CLD_EXITED; | 1216 | why = CLD_EXITED; |
1218 | status = exit_code >> 8; | 1217 | status = exit_code >> 8; |
1219 | } else { | 1218 | } else { |
1220 | why = (exit_code & 0x80) ? CLD_DUMPED : CLD_KILLED; | 1219 | why = (exit_code & 0x80) ? CLD_DUMPED : CLD_KILLED; |
1221 | status = exit_code & 0x7f; | 1220 | status = exit_code & 0x7f; |
1222 | } | 1221 | } |
1223 | return wait_noreap_copyout(wo, p, pid, uid, why, status); | 1222 | return wait_noreap_copyout(wo, p, pid, uid, why, status); |
1224 | } | 1223 | } |
1225 | 1224 | ||
1226 | /* | 1225 | /* |
1227 | * Try to move the task's state to DEAD | 1226 | * Try to move the task's state to DEAD |
1228 | * only one thread is allowed to do this: | 1227 | * only one thread is allowed to do this: |
1229 | */ | 1228 | */ |
1230 | state = xchg(&p->exit_state, EXIT_DEAD); | 1229 | state = xchg(&p->exit_state, EXIT_DEAD); |
1231 | if (state != EXIT_ZOMBIE) { | 1230 | if (state != EXIT_ZOMBIE) { |
1232 | BUG_ON(state != EXIT_DEAD); | 1231 | BUG_ON(state != EXIT_DEAD); |
1233 | return 0; | 1232 | return 0; |
1234 | } | 1233 | } |
1235 | 1234 | ||
1236 | traced = ptrace_reparented(p); | 1235 | traced = ptrace_reparented(p); |
1237 | /* | 1236 | /* |
1238 | * It can be ptraced but not reparented, check | 1237 | * It can be ptraced but not reparented, check |
1239 | * !task_detached() to filter out sub-threads. | 1238 | * !task_detached() to filter out sub-threads. |
1240 | */ | 1239 | */ |
1241 | if (likely(!traced) && likely(!task_detached(p))) { | 1240 | if (likely(!traced) && likely(!task_detached(p))) { |
1242 | struct signal_struct *psig; | 1241 | struct signal_struct *psig; |
1243 | struct signal_struct *sig; | 1242 | struct signal_struct *sig; |
1244 | unsigned long maxrss; | 1243 | unsigned long maxrss; |
1245 | cputime_t tgutime, tgstime; | 1244 | cputime_t tgutime, tgstime; |
1246 | 1245 | ||
1247 | /* | 1246 | /* |
1248 | * The resource counters for the group leader are in its | 1247 | * The resource counters for the group leader are in its |
1249 | * own task_struct. Those for dead threads in the group | 1248 | * own task_struct. Those for dead threads in the group |
1250 | * are in its signal_struct, as are those for the child | 1249 | * are in its signal_struct, as are those for the child |
1251 | * processes it has previously reaped. All these | 1250 | * processes it has previously reaped. All these |
1252 | * accumulate in the parent's signal_struct c* fields. | 1251 | * accumulate in the parent's signal_struct c* fields. |
1253 | * | 1252 | * |
1254 | * We don't bother to take a lock here to protect these | 1253 | * We don't bother to take a lock here to protect these |
1255 | * p->signal fields, because they are only touched by | 1254 | * p->signal fields, because they are only touched by |
1256 | * __exit_signal, which runs with tasklist_lock | 1255 | * __exit_signal, which runs with tasklist_lock |
1257 | * write-locked anyway, and so is excluded here. We do | 1256 | * write-locked anyway, and so is excluded here. We do |
1258 | * need to protect the access to parent->signal fields, | 1257 | * need to protect the access to parent->signal fields, |
1259 | * as other threads in the parent group can be right | 1258 | * as other threads in the parent group can be right |
1260 | * here reaping other children at the same time. | 1259 | * here reaping other children at the same time. |
1261 | * | 1260 | * |
1262 | * We use thread_group_times() to get times for the thread | 1261 | * We use thread_group_times() to get times for the thread |
1263 | * group, which consolidates times for all threads in the | 1262 | * group, which consolidates times for all threads in the |
1264 | * group including the group leader. | 1263 | * group including the group leader. |
1265 | */ | 1264 | */ |
1266 | thread_group_times(p, &tgutime, &tgstime); | 1265 | thread_group_times(p, &tgutime, &tgstime); |
1267 | spin_lock_irq(&p->real_parent->sighand->siglock); | 1266 | spin_lock_irq(&p->real_parent->sighand->siglock); |
1268 | psig = p->real_parent->signal; | 1267 | psig = p->real_parent->signal; |
1269 | sig = p->signal; | 1268 | sig = p->signal; |
1270 | psig->cutime = | 1269 | psig->cutime = |
1271 | cputime_add(psig->cutime, | 1270 | cputime_add(psig->cutime, |
1272 | cputime_add(tgutime, | 1271 | cputime_add(tgutime, |
1273 | sig->cutime)); | 1272 | sig->cutime)); |
1274 | psig->cstime = | 1273 | psig->cstime = |
1275 | cputime_add(psig->cstime, | 1274 | cputime_add(psig->cstime, |
1276 | cputime_add(tgstime, | 1275 | cputime_add(tgstime, |
1277 | sig->cstime)); | 1276 | sig->cstime)); |
1278 | psig->cgtime = | 1277 | psig->cgtime = |
1279 | cputime_add(psig->cgtime, | 1278 | cputime_add(psig->cgtime, |
1280 | cputime_add(p->gtime, | 1279 | cputime_add(p->gtime, |
1281 | cputime_add(sig->gtime, | 1280 | cputime_add(sig->gtime, |
1282 | sig->cgtime))); | 1281 | sig->cgtime))); |
1283 | psig->cmin_flt += | 1282 | psig->cmin_flt += |
1284 | p->min_flt + sig->min_flt + sig->cmin_flt; | 1283 | p->min_flt + sig->min_flt + sig->cmin_flt; |
1285 | psig->cmaj_flt += | 1284 | psig->cmaj_flt += |
1286 | p->maj_flt + sig->maj_flt + sig->cmaj_flt; | 1285 | p->maj_flt + sig->maj_flt + sig->cmaj_flt; |
1287 | psig->cnvcsw += | 1286 | psig->cnvcsw += |
1288 | p->nvcsw + sig->nvcsw + sig->cnvcsw; | 1287 | p->nvcsw + sig->nvcsw + sig->cnvcsw; |
1289 | psig->cnivcsw += | 1288 | psig->cnivcsw += |
1290 | p->nivcsw + sig->nivcsw + sig->cnivcsw; | 1289 | p->nivcsw + sig->nivcsw + sig->cnivcsw; |
1291 | psig->cinblock += | 1290 | psig->cinblock += |
1292 | task_io_get_inblock(p) + | 1291 | task_io_get_inblock(p) + |
1293 | sig->inblock + sig->cinblock; | 1292 | sig->inblock + sig->cinblock; |
1294 | psig->coublock += | 1293 | psig->coublock += |
1295 | task_io_get_oublock(p) + | 1294 | task_io_get_oublock(p) + |
1296 | sig->oublock + sig->coublock; | 1295 | sig->oublock + sig->coublock; |
1297 | maxrss = max(sig->maxrss, sig->cmaxrss); | 1296 | maxrss = max(sig->maxrss, sig->cmaxrss); |
1298 | if (psig->cmaxrss < maxrss) | 1297 | if (psig->cmaxrss < maxrss) |
1299 | psig->cmaxrss = maxrss; | 1298 | psig->cmaxrss = maxrss; |
1300 | task_io_accounting_add(&psig->ioac, &p->ioac); | 1299 | task_io_accounting_add(&psig->ioac, &p->ioac); |
1301 | task_io_accounting_add(&psig->ioac, &sig->ioac); | 1300 | task_io_accounting_add(&psig->ioac, &sig->ioac); |
1302 | spin_unlock_irq(&p->real_parent->sighand->siglock); | 1301 | spin_unlock_irq(&p->real_parent->sighand->siglock); |
1303 | } | 1302 | } |
1304 | 1303 | ||
1305 | /* | 1304 | /* |
1306 | * Now we are sure this task is interesting, and no other | 1305 | * Now we are sure this task is interesting, and no other |
1307 | * thread can reap it because we set its state to EXIT_DEAD. | 1306 | * thread can reap it because we set its state to EXIT_DEAD. |
1308 | */ | 1307 | */ |
1309 | read_unlock(&tasklist_lock); | 1308 | read_unlock(&tasklist_lock); |
1310 | 1309 | ||
1311 | retval = wo->wo_rusage | 1310 | retval = wo->wo_rusage |
1312 | ? getrusage(p, RUSAGE_BOTH, wo->wo_rusage) : 0; | 1311 | ? getrusage(p, RUSAGE_BOTH, wo->wo_rusage) : 0; |
1313 | status = (p->signal->flags & SIGNAL_GROUP_EXIT) | 1312 | status = (p->signal->flags & SIGNAL_GROUP_EXIT) |
1314 | ? p->signal->group_exit_code : p->exit_code; | 1313 | ? p->signal->group_exit_code : p->exit_code; |
1315 | if (!retval && wo->wo_stat) | 1314 | if (!retval && wo->wo_stat) |
1316 | retval = put_user(status, wo->wo_stat); | 1315 | retval = put_user(status, wo->wo_stat); |
1317 | 1316 | ||
1318 | infop = wo->wo_info; | 1317 | infop = wo->wo_info; |
1319 | if (!retval && infop) | 1318 | if (!retval && infop) |
1320 | retval = put_user(SIGCHLD, &infop->si_signo); | 1319 | retval = put_user(SIGCHLD, &infop->si_signo); |
1321 | if (!retval && infop) | 1320 | if (!retval && infop) |
1322 | retval = put_user(0, &infop->si_errno); | 1321 | retval = put_user(0, &infop->si_errno); |
1323 | if (!retval && infop) { | 1322 | if (!retval && infop) { |
1324 | int why; | 1323 | int why; |
1325 | 1324 | ||
1326 | if ((status & 0x7f) == 0) { | 1325 | if ((status & 0x7f) == 0) { |
1327 | why = CLD_EXITED; | 1326 | why = CLD_EXITED; |
1328 | status >>= 8; | 1327 | status >>= 8; |
1329 | } else { | 1328 | } else { |
1330 | why = (status & 0x80) ? CLD_DUMPED : CLD_KILLED; | 1329 | why = (status & 0x80) ? CLD_DUMPED : CLD_KILLED; |
1331 | status &= 0x7f; | 1330 | status &= 0x7f; |
1332 | } | 1331 | } |
1333 | retval = put_user((short)why, &infop->si_code); | 1332 | retval = put_user((short)why, &infop->si_code); |
1334 | if (!retval) | 1333 | if (!retval) |
1335 | retval = put_user(status, &infop->si_status); | 1334 | retval = put_user(status, &infop->si_status); |
1336 | } | 1335 | } |
1337 | if (!retval && infop) | 1336 | if (!retval && infop) |
1338 | retval = put_user(pid, &infop->si_pid); | 1337 | retval = put_user(pid, &infop->si_pid); |
1339 | if (!retval && infop) | 1338 | if (!retval && infop) |
1340 | retval = put_user(uid, &infop->si_uid); | 1339 | retval = put_user(uid, &infop->si_uid); |
1341 | if (!retval) | 1340 | if (!retval) |
1342 | retval = pid; | 1341 | retval = pid; |
1343 | 1342 | ||
1344 | if (traced) { | 1343 | if (traced) { |
1345 | write_lock_irq(&tasklist_lock); | 1344 | write_lock_irq(&tasklist_lock); |
1346 | /* We dropped tasklist, ptracer could die and untrace */ | 1345 | /* We dropped tasklist, ptracer could die and untrace */ |
1347 | ptrace_unlink(p); | 1346 | ptrace_unlink(p); |
1348 | /* | 1347 | /* |
1349 | * If this is not a detached task, notify the parent. | 1348 | * If this is not a detached task, notify the parent. |
1350 | * If it's still not detached after that, don't release | 1349 | * If it's still not detached after that, don't release |
1351 | * it now. | 1350 | * it now. |
1352 | */ | 1351 | */ |
1353 | if (!task_detached(p)) { | 1352 | if (!task_detached(p)) { |
1354 | do_notify_parent(p, p->exit_signal); | 1353 | do_notify_parent(p, p->exit_signal); |
1355 | if (!task_detached(p)) { | 1354 | if (!task_detached(p)) { |
1356 | p->exit_state = EXIT_ZOMBIE; | 1355 | p->exit_state = EXIT_ZOMBIE; |
1357 | p = NULL; | 1356 | p = NULL; |
1358 | } | 1357 | } |
1359 | } | 1358 | } |
1360 | write_unlock_irq(&tasklist_lock); | 1359 | write_unlock_irq(&tasklist_lock); |
1361 | } | 1360 | } |
1362 | if (p != NULL) | 1361 | if (p != NULL) |
1363 | release_task(p); | 1362 | release_task(p); |
1364 | 1363 | ||
1365 | return retval; | 1364 | return retval; |
1366 | } | 1365 | } |
1367 | 1366 | ||
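The same exit_code split appears twice in this function (once in the WNOWAIT branch, once when filling the siginfo): bit 7 marks a core dump, the low seven bits carry the killing signal, and a clean exit keeps its status in bits 8-15. Pulled out as a plain helper for readability (demo_decode is illustrative, not a kernel function):

        #include <signal.h>
        #include <stdio.h>

        static void demo_decode(int exit_code, int *why, int *status)
        {
                if ((exit_code & 0x7f) == 0) {                  /* normal exit */
                        *why = CLD_EXITED;
                        *status = exit_code >> 8;
                } else {                                        /* killed by a signal */
                        *why = (exit_code & 0x80) ? CLD_DUMPED : CLD_KILLED;
                        *status = exit_code & 0x7f;
                }
        }

        int main(void)
        {
                int why, status;

                demo_decode(5 << 8, &why, &status);
                printf("exit(5): why=%d status=%d\n", why, status);      /* CLD_EXITED, 5 */

                demo_decode(SIGSEGV | 0x80, &why, &status);
                printf("SIGSEGV+core: why=%d status=%d\n", why, status); /* CLD_DUMPED, 11 */
                return 0;
        }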
1368 | static int *task_stopped_code(struct task_struct *p, bool ptrace) | 1367 | static int *task_stopped_code(struct task_struct *p, bool ptrace) |
1369 | { | 1368 | { |
1370 | if (ptrace) { | 1369 | if (ptrace) { |
1371 | if (task_is_stopped_or_traced(p)) | 1370 | if (task_is_stopped_or_traced(p)) |
1372 | return &p->exit_code; | 1371 | return &p->exit_code; |
1373 | } else { | 1372 | } else { |
1374 | if (p->signal->flags & SIGNAL_STOP_STOPPED) | 1373 | if (p->signal->flags & SIGNAL_STOP_STOPPED) |
1375 | return &p->signal->group_exit_code; | 1374 | return &p->signal->group_exit_code; |
1376 | } | 1375 | } |
1377 | return NULL; | 1376 | return NULL; |
1378 | } | 1377 | } |
1379 | 1378 | ||
1380 | /** | 1379 | /** |
1381 | * wait_task_stopped - Wait for %TASK_STOPPED or %TASK_TRACED | 1380 | * wait_task_stopped - Wait for %TASK_STOPPED or %TASK_TRACED |
1382 | * @wo: wait options | 1381 | * @wo: wait options |
1383 | * @ptrace: is the wait for ptrace | 1382 | * @ptrace: is the wait for ptrace |
1384 | * @p: task to wait for | 1383 | * @p: task to wait for |
1385 | * | 1384 | * |
1386 | * Handle sys_wait4() work for %p in state %TASK_STOPPED or %TASK_TRACED. | 1385 | * Handle sys_wait4() work for %p in state %TASK_STOPPED or %TASK_TRACED. |
1387 | * | 1386 | * |
1388 | * CONTEXT: | 1387 | * CONTEXT: |
1389 | * read_lock(&tasklist_lock), which is released if return value is | 1388 | * read_lock(&tasklist_lock), which is released if return value is |
1390 | * non-zero. Also, grabs and releases @p->sighand->siglock. | 1389 | * non-zero. Also, grabs and releases @p->sighand->siglock. |
1391 | * | 1390 | * |
1392 | * RETURNS: | 1391 | * RETURNS: |
1393 | * 0 if wait condition didn't exist and search for other wait conditions | 1392 | * 0 if wait condition didn't exist and search for other wait conditions |
1394 | * should continue. Non-zero return, -errno on failure and @p's pid on | 1393 | * should continue. Non-zero return, -errno on failure and @p's pid on |
1395 | * success, implies that tasklist_lock is released and wait condition | 1394 | * success, implies that tasklist_lock is released and wait condition |
1396 | * search should terminate. | 1395 | * search should terminate. |
1397 | */ | 1396 | */ |
1398 | static int wait_task_stopped(struct wait_opts *wo, | 1397 | static int wait_task_stopped(struct wait_opts *wo, |
1399 | int ptrace, struct task_struct *p) | 1398 | int ptrace, struct task_struct *p) |
1400 | { | 1399 | { |
1401 | struct siginfo __user *infop; | 1400 | struct siginfo __user *infop; |
1402 | int retval, exit_code, *p_code, why; | 1401 | int retval, exit_code, *p_code, why; |
1403 | uid_t uid = 0; /* unneeded, required by compiler */ | 1402 | uid_t uid = 0; /* unneeded, required by compiler */ |
1404 | pid_t pid; | 1403 | pid_t pid; |
1405 | 1404 | ||
1406 | /* | 1405 | /* |
1407 | * Traditionally we see ptrace'd stopped tasks regardless of options. | 1406 | * Traditionally we see ptrace'd stopped tasks regardless of options. |
1408 | */ | 1407 | */ |
1409 | if (!ptrace && !(wo->wo_flags & WUNTRACED)) | 1408 | if (!ptrace && !(wo->wo_flags & WUNTRACED)) |
1410 | return 0; | 1409 | return 0; |
1411 | 1410 | ||
1412 | if (!task_stopped_code(p, ptrace)) | 1411 | if (!task_stopped_code(p, ptrace)) |
1413 | return 0; | 1412 | return 0; |
1414 | 1413 | ||
1415 | exit_code = 0; | 1414 | exit_code = 0; |
1416 | spin_lock_irq(&p->sighand->siglock); | 1415 | spin_lock_irq(&p->sighand->siglock); |
1417 | 1416 | ||
1418 | p_code = task_stopped_code(p, ptrace); | 1417 | p_code = task_stopped_code(p, ptrace); |
1419 | if (unlikely(!p_code)) | 1418 | if (unlikely(!p_code)) |
1420 | goto unlock_sig; | 1419 | goto unlock_sig; |
1421 | 1420 | ||
1422 | exit_code = *p_code; | 1421 | exit_code = *p_code; |
1423 | if (!exit_code) | 1422 | if (!exit_code) |
1424 | goto unlock_sig; | 1423 | goto unlock_sig; |
1425 | 1424 | ||
1426 | if (!unlikely(wo->wo_flags & WNOWAIT)) | 1425 | if (!unlikely(wo->wo_flags & WNOWAIT)) |
1427 | *p_code = 0; | 1426 | *p_code = 0; |
1428 | 1427 | ||
1429 | uid = task_uid(p); | 1428 | uid = task_uid(p); |
1430 | unlock_sig: | 1429 | unlock_sig: |
1431 | spin_unlock_irq(&p->sighand->siglock); | 1430 | spin_unlock_irq(&p->sighand->siglock); |
1432 | if (!exit_code) | 1431 | if (!exit_code) |
1433 | return 0; | 1432 | return 0; |
1434 | 1433 | ||
1435 | /* | 1434 | /* |
1436 | * Now we are pretty sure this task is interesting. | 1435 | * Now we are pretty sure this task is interesting. |
1437 | * Make sure it doesn't get reaped out from under us while we | 1436 | * Make sure it doesn't get reaped out from under us while we |
1438 | * give up the lock and then examine it below. We don't want to | 1437 | * give up the lock and then examine it below. We don't want to |
1439 | * keep holding onto the tasklist_lock while we call getrusage and | 1438 | * keep holding onto the tasklist_lock while we call getrusage and |
1440 | * possibly take page faults for user memory. | 1439 | * possibly take page faults for user memory. |
1441 | */ | 1440 | */ |
1442 | get_task_struct(p); | 1441 | get_task_struct(p); |
1443 | pid = task_pid_vnr(p); | 1442 | pid = task_pid_vnr(p); |
1444 | why = ptrace ? CLD_TRAPPED : CLD_STOPPED; | 1443 | why = ptrace ? CLD_TRAPPED : CLD_STOPPED; |
1445 | read_unlock(&tasklist_lock); | 1444 | read_unlock(&tasklist_lock); |
1446 | 1445 | ||
1447 | if (unlikely(wo->wo_flags & WNOWAIT)) | 1446 | if (unlikely(wo->wo_flags & WNOWAIT)) |
1448 | return wait_noreap_copyout(wo, p, pid, uid, why, exit_code); | 1447 | return wait_noreap_copyout(wo, p, pid, uid, why, exit_code); |
1449 | 1448 | ||
1450 | retval = wo->wo_rusage | 1449 | retval = wo->wo_rusage |
1451 | ? getrusage(p, RUSAGE_BOTH, wo->wo_rusage) : 0; | 1450 | ? getrusage(p, RUSAGE_BOTH, wo->wo_rusage) : 0; |
1452 | if (!retval && wo->wo_stat) | 1451 | if (!retval && wo->wo_stat) |
1453 | retval = put_user((exit_code << 8) | 0x7f, wo->wo_stat); | 1452 | retval = put_user((exit_code << 8) | 0x7f, wo->wo_stat); |
1454 | 1453 | ||
1455 | infop = wo->wo_info; | 1454 | infop = wo->wo_info; |
1456 | if (!retval && infop) | 1455 | if (!retval && infop) |
1457 | retval = put_user(SIGCHLD, &infop->si_signo); | 1456 | retval = put_user(SIGCHLD, &infop->si_signo); |
1458 | if (!retval && infop) | 1457 | if (!retval && infop) |
1459 | retval = put_user(0, &infop->si_errno); | 1458 | retval = put_user(0, &infop->si_errno); |
1460 | if (!retval && infop) | 1459 | if (!retval && infop) |
1461 | retval = put_user((short)why, &infop->si_code); | 1460 | retval = put_user((short)why, &infop->si_code); |
1462 | if (!retval && infop) | 1461 | if (!retval && infop) |
1463 | retval = put_user(exit_code, &infop->si_status); | 1462 | retval = put_user(exit_code, &infop->si_status); |
1464 | if (!retval && infop) | 1463 | if (!retval && infop) |
1465 | retval = put_user(pid, &infop->si_pid); | 1464 | retval = put_user(pid, &infop->si_pid); |
1466 | if (!retval && infop) | 1465 | if (!retval && infop) |
1467 | retval = put_user(uid, &infop->si_uid); | 1466 | retval = put_user(uid, &infop->si_uid); |
1468 | if (!retval) | 1467 | if (!retval) |
1469 | retval = pid; | 1468 | retval = pid; |
1470 | put_task_struct(p); | 1469 | put_task_struct(p); |
1471 | 1470 | ||
1472 | BUG_ON(!retval); | 1471 | BUG_ON(!retval); |
1473 | return retval; | 1472 | return retval; |
1474 | } | 1473 | } |
1475 | 1474 | ||
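From userspace, wait_task_stopped() is what WUNTRACED reaches, and the (exit_code << 8) | 0x7f status it builds is what WIFSTOPPED()/WSTOPSIG() take apart. A small sketch (the 0x137f value in the comment assumes SIGSTOP == 19, as on x86):

        #include <signal.h>
        #include <stdio.h>
        #include <sys/wait.h>
        #include <unistd.h>

        int main(void)
        {
                pid_t pid = fork();
                int status;

                if (pid == 0) {
                        raise(SIGSTOP);         /* child stops itself */
                        _exit(0);
                }

                waitpid(pid, &status, WUNTRACED);
                if (WIFSTOPPED(status))
                        printf("stopped by signal %d, raw status 0x%x\n",
                               WSTOPSIG(status), status);       /* 19, 0x137f on x86 */

                kill(pid, SIGCONT);             /* let it run to _exit(0) */
                waitpid(pid, &status, 0);
                return 0;
        }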
1476 | /* | 1475 | /* |
1477 | * Handle do_wait work for one task in a live, non-stopped state. | 1476 | * Handle do_wait work for one task in a live, non-stopped state. |
1478 | * read_lock(&tasklist_lock) on entry. If we return zero, we still hold | 1477 | * read_lock(&tasklist_lock) on entry. If we return zero, we still hold |
1479 | * the lock and this task is uninteresting. If we return nonzero, we have | 1478 | * the lock and this task is uninteresting. If we return nonzero, we have |
1480 | * released the lock and the system call should return. | 1479 | * released the lock and the system call should return. |
1481 | */ | 1480 | */ |
1482 | static int wait_task_continued(struct wait_opts *wo, struct task_struct *p) | 1481 | static int wait_task_continued(struct wait_opts *wo, struct task_struct *p) |
1483 | { | 1482 | { |
1484 | int retval; | 1483 | int retval; |
1485 | pid_t pid; | 1484 | pid_t pid; |
1486 | uid_t uid; | 1485 | uid_t uid; |
1487 | 1486 | ||
1488 | if (!unlikely(wo->wo_flags & WCONTINUED)) | 1487 | if (!unlikely(wo->wo_flags & WCONTINUED)) |
1489 | return 0; | 1488 | return 0; |
1490 | 1489 | ||
1491 | if (!(p->signal->flags & SIGNAL_STOP_CONTINUED)) | 1490 | if (!(p->signal->flags & SIGNAL_STOP_CONTINUED)) |
1492 | return 0; | 1491 | return 0; |
1493 | 1492 | ||
1494 | spin_lock_irq(&p->sighand->siglock); | 1493 | spin_lock_irq(&p->sighand->siglock); |
1495 | /* Re-check with the lock held. */ | 1494 | /* Re-check with the lock held. */ |
1496 | if (!(p->signal->flags & SIGNAL_STOP_CONTINUED)) { | 1495 | if (!(p->signal->flags & SIGNAL_STOP_CONTINUED)) { |
1497 | spin_unlock_irq(&p->sighand->siglock); | 1496 | spin_unlock_irq(&p->sighand->siglock); |
1498 | return 0; | 1497 | return 0; |
1499 | } | 1498 | } |
1500 | if (!unlikely(wo->wo_flags & WNOWAIT)) | 1499 | if (!unlikely(wo->wo_flags & WNOWAIT)) |
1501 | p->signal->flags &= ~SIGNAL_STOP_CONTINUED; | 1500 | p->signal->flags &= ~SIGNAL_STOP_CONTINUED; |
1502 | uid = task_uid(p); | 1501 | uid = task_uid(p); |
1503 | spin_unlock_irq(&p->sighand->siglock); | 1502 | spin_unlock_irq(&p->sighand->siglock); |
1504 | 1503 | ||
1505 | pid = task_pid_vnr(p); | 1504 | pid = task_pid_vnr(p); |
1506 | get_task_struct(p); | 1505 | get_task_struct(p); |
1507 | read_unlock(&tasklist_lock); | 1506 | read_unlock(&tasklist_lock); |
1508 | 1507 | ||
1509 | if (!wo->wo_info) { | 1508 | if (!wo->wo_info) { |
1510 | retval = wo->wo_rusage | 1509 | retval = wo->wo_rusage |
1511 | ? getrusage(p, RUSAGE_BOTH, wo->wo_rusage) : 0; | 1510 | ? getrusage(p, RUSAGE_BOTH, wo->wo_rusage) : 0; |
1512 | put_task_struct(p); | 1511 | put_task_struct(p); |
1513 | if (!retval && wo->wo_stat) | 1512 | if (!retval && wo->wo_stat) |
1514 | retval = put_user(0xffff, wo->wo_stat); | 1513 | retval = put_user(0xffff, wo->wo_stat); |
1515 | if (!retval) | 1514 | if (!retval) |
1516 | retval = pid; | 1515 | retval = pid; |
1517 | } else { | 1516 | } else { |
1518 | retval = wait_noreap_copyout(wo, p, pid, uid, | 1517 | retval = wait_noreap_copyout(wo, p, pid, uid, |
1519 | CLD_CONTINUED, SIGCONT); | 1518 | CLD_CONTINUED, SIGCONT); |
1520 | BUG_ON(retval == 0); | 1519 | BUG_ON(retval == 0); |
1521 | } | 1520 | } |
1522 | 1521 | ||
1523 | return retval; | 1522 | return retval; |
1524 | } | 1523 | } |
1525 | 1524 | ||
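wait_task_continued() is the matching case for WCONTINUED: the 0xffff status written above is exactly what WIFCONTINUED() tests. Continuing the previous sketch:

        #define _XOPEN_SOURCE 700
        #include <signal.h>
        #include <stdio.h>
        #include <sys/wait.h>
        #include <unistd.h>

        int main(void)
        {
                pid_t pid = fork();
                int status;

                if (pid == 0) {
                        raise(SIGSTOP);                 /* stop ourselves */
                        sleep(1);
                        _exit(0);
                }

                waitpid(pid, &status, WUNTRACED);       /* observe the stop first */
                kill(pid, SIGCONT);
                waitpid(pid, &status, WCONTINUED);
                if (WIFCONTINUED(status))
                        printf("child continued, raw status 0x%x\n", status);   /* 0xffff */

                waitpid(pid, &status, 0);               /* reap the final exit */
                return 0;
        }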
1526 | /* | 1525 | /* |
1527 | * Consider @p for a wait by @parent. | 1526 | * Consider @p for a wait by @parent. |
1528 | * | 1527 | * |
1529 | * -ECHILD should be in ->notask_error before the first call. | 1528 | * -ECHILD should be in ->notask_error before the first call. |
1530 | * Returns nonzero for a final return, when we have unlocked tasklist_lock. | 1529 | * Returns nonzero for a final return, when we have unlocked tasklist_lock. |
1531 | * Returns zero if the search for a child should continue; | 1530 | * Returns zero if the search for a child should continue; |
1532 | * then ->notask_error is 0 if @p is an eligible child, | 1531 | * then ->notask_error is 0 if @p is an eligible child, |
1533 | * or another error from security_task_wait(), or still -ECHILD. | 1532 | * or another error from security_task_wait(), or still -ECHILD. |
1534 | */ | 1533 | */ |
1535 | static int wait_consider_task(struct wait_opts *wo, int ptrace, | 1534 | static int wait_consider_task(struct wait_opts *wo, int ptrace, |
1536 | struct task_struct *p) | 1535 | struct task_struct *p) |
1537 | { | 1536 | { |
1538 | int ret = eligible_child(wo, p); | 1537 | int ret = eligible_child(wo, p); |
1539 | if (!ret) | 1538 | if (!ret) |
1540 | return ret; | 1539 | return ret; |
1541 | 1540 | ||
1542 | ret = security_task_wait(p); | 1541 | ret = security_task_wait(p); |
1543 | if (unlikely(ret < 0)) { | 1542 | if (unlikely(ret < 0)) { |
1544 | /* | 1543 | /* |
1545 | * If we have not yet seen any eligible child, | 1544 | * If we have not yet seen any eligible child, |
1546 | * then let this error code replace -ECHILD. | 1545 | * then let this error code replace -ECHILD. |
1547 | * A permission error will give the user a clue | 1546 | * A permission error will give the user a clue |
1548 | * to look for security policy problems, rather | 1547 | * to look for security policy problems, rather |
1549 | * than for mysterious wait bugs. | 1548 | * than for mysterious wait bugs. |
1550 | */ | 1549 | */ |
1551 | if (wo->notask_error) | 1550 | if (wo->notask_error) |
1552 | wo->notask_error = ret; | 1551 | wo->notask_error = ret; |
1553 | return 0; | 1552 | return 0; |
1554 | } | 1553 | } |
1555 | 1554 | ||
1556 | /* dead body doesn't have much to contribute */ | 1555 | /* dead body doesn't have much to contribute */ |
1557 | if (p->exit_state == EXIT_DEAD) | 1556 | if (p->exit_state == EXIT_DEAD) |
1558 | return 0; | 1557 | return 0; |
1559 | 1558 | ||
1560 | /* slay zombie? */ | 1559 | /* slay zombie? */ |
1561 | if (p->exit_state == EXIT_ZOMBIE) { | 1560 | if (p->exit_state == EXIT_ZOMBIE) { |
1562 | /* | 1561 | /* |
1563 | * A zombie ptracee is only visible to its ptracer. | 1562 | * A zombie ptracee is only visible to its ptracer. |
1564 | * Notification and reaping will be cascaded to the real | 1563 | * Notification and reaping will be cascaded to the real |
1565 | * parent when the ptracer detaches. | 1564 | * parent when the ptracer detaches. |
1566 | */ | 1565 | */ |
1567 | if (likely(!ptrace) && unlikely(task_ptrace(p))) { | 1566 | if (likely(!ptrace) && unlikely(task_ptrace(p))) { |
1568 | /* it will become visible, clear notask_error */ | 1567 | /* it will become visible, clear notask_error */ |
1569 | wo->notask_error = 0; | 1568 | wo->notask_error = 0; |
1570 | return 0; | 1569 | return 0; |
1571 | } | 1570 | } |
1572 | 1571 | ||
1573 | /* we don't reap group leaders with subthreads */ | 1572 | /* we don't reap group leaders with subthreads */ |
1574 | if (!delay_group_leader(p)) | 1573 | if (!delay_group_leader(p)) |
1575 | return wait_task_zombie(wo, p); | 1574 | return wait_task_zombie(wo, p); |
1576 | 1575 | ||
1577 | /* | 1576 | /* |
1578 | * Allow access to stopped/continued state via zombie by | 1577 | * Allow access to stopped/continued state via zombie by |
1579 | * falling through. Clearing of notask_error is complex. | 1578 | * falling through. Clearing of notask_error is complex. |
1580 | * | 1579 | * |
1581 | * When !@ptrace: | 1580 | * When !@ptrace: |
1582 | * | 1581 | * |
1583 | * If WEXITED is set, notask_error should naturally be | 1582 | * If WEXITED is set, notask_error should naturally be |
1584 | * cleared. If not, a subset of WSTOPPED|WCONTINUED is set, | 1583 | * cleared. If not, a subset of WSTOPPED|WCONTINUED is set, |
1585 | * so, if there are live subthreads, there are events to | 1584 | * so, if there are live subthreads, there are events to |
1586 | * wait for. If all subthreads are dead, it's still safe | 1585 | * wait for. If all subthreads are dead, it's still safe |
1587 | * to clear - this function will be called again in a finite | 1586 | * to clear - this function will be called again in a finite |
1588 | * amount of time once all the subthreads are released and | 1587 | * amount of time once all the subthreads are released and |
1589 | * will then return without clearing. | 1588 | * will then return without clearing. |
1590 | * | 1589 | * |
1591 | * When @ptrace: | 1590 | * When @ptrace: |
1592 | * | 1591 | * |
1593 | * Stopped state is per-task and thus can't change once the | 1592 | * Stopped state is per-task and thus can't change once the |
1594 | * target task dies. Only continued and exited can happen. | 1593 | * target task dies. Only continued and exited can happen. |
1595 | * Clear notask_error if WCONTINUED | WEXITED. | 1594 | * Clear notask_error if WCONTINUED | WEXITED. |
1596 | */ | 1595 | */ |
1597 | if (likely(!ptrace) || (wo->wo_flags & (WCONTINUED | WEXITED))) | 1596 | if (likely(!ptrace) || (wo->wo_flags & (WCONTINUED | WEXITED))) |
1598 | wo->notask_error = 0; | 1597 | wo->notask_error = 0; |
1599 | } else { | 1598 | } else { |
1600 | /* | 1599 | /* |
1601 | * If @p is ptraced by a task in its real parent's group, | 1600 | * If @p is ptraced by a task in its real parent's group, |
1602 | * hide group stop/continued state when looking at @p as | 1601 | * hide group stop/continued state when looking at @p as |
1603 | * the real parent; otherwise, a single stop can be | 1602 | * the real parent; otherwise, a single stop can be |
1604 | * reported twice as group and ptrace stops. | 1603 | * reported twice as group and ptrace stops. |
1605 | * | 1604 | * |
1606 | * If a ptracer wants to distinguish the two events for its | 1605 | * If a ptracer wants to distinguish the two events for its |
1607 | * own children, it should create a separate process which | 1606 | * own children, it should create a separate process which |
1608 | * takes the role of real parent. | 1607 | * takes the role of real parent. |
1609 | */ | 1608 | */ |
1610 | if (likely(!ptrace) && task_ptrace(p) && | 1609 | if (likely(!ptrace) && task_ptrace(p) && |
1611 | same_thread_group(p->parent, p->real_parent)) | 1610 | same_thread_group(p->parent, p->real_parent)) |
1612 | return 0; | 1611 | return 0; |
1613 | 1612 | ||
1614 | /* | 1613 | /* |
1615 | * @p is alive and it's gonna stop, continue or exit, so | 1614 | * @p is alive and it's gonna stop, continue or exit, so |
1616 | * there always is something to wait for. | 1615 | * there always is something to wait for. |
1617 | */ | 1616 | */ |
1618 | wo->notask_error = 0; | 1617 | wo->notask_error = 0; |
1619 | } | 1618 | } |
1620 | 1619 | ||
1621 | /* | 1620 | /* |
1622 | * Wait for stopped. Depending on @ptrace, different stopped state | 1621 | * Wait for stopped. Depending on @ptrace, different stopped state |
1623 | * is used and the two don't interact with each other. | 1622 | * is used and the two don't interact with each other. |
1624 | */ | 1623 | */ |
1625 | ret = wait_task_stopped(wo, ptrace, p); | 1624 | ret = wait_task_stopped(wo, ptrace, p); |
1626 | if (ret) | 1625 | if (ret) |
1627 | return ret; | 1626 | return ret; |
1628 | 1627 | ||
1629 | /* | 1628 | /* |
1630 | * Wait for continued. There's only one continued state and the | 1629 | * Wait for continued. There's only one continued state and the |
1631 | * ptracer can consume it which can confuse the real parent. Don't | 1630 | * ptracer can consume it which can confuse the real parent. Don't |
1632 | * use WCONTINUED from ptracer. You don't need or want it. | 1631 | * use WCONTINUED from ptracer. You don't need or want it. |
1633 | */ | 1632 | */ |
1634 | return wait_task_continued(wo, p); | 1633 | return wait_task_continued(wo, p); |
1635 | } | 1634 | } |
1636 | 1635 | ||
1637 | /* | 1636 | /* |
1638 | * Do the work of do_wait() for one thread in the group, @tsk. | 1637 | * Do the work of do_wait() for one thread in the group, @tsk. |
1639 | * | 1638 | * |
1640 | * -ECHILD should be in ->notask_error before the first call. | 1639 | * -ECHILD should be in ->notask_error before the first call. |
1641 | * Returns nonzero for a final return, when we have unlocked tasklist_lock. | 1640 | * Returns nonzero for a final return, when we have unlocked tasklist_lock. |
1642 | * Returns zero if the search for a child should continue; then | 1641 | * Returns zero if the search for a child should continue; then |
1643 | * ->notask_error is 0 if there were any eligible children, | 1642 | * ->notask_error is 0 if there were any eligible children, |
1644 | * or another error from security_task_wait(), or still -ECHILD. | 1643 | * or another error from security_task_wait(), or still -ECHILD. |
1645 | */ | 1644 | */ |
1646 | static int do_wait_thread(struct wait_opts *wo, struct task_struct *tsk) | 1645 | static int do_wait_thread(struct wait_opts *wo, struct task_struct *tsk) |
1647 | { | 1646 | { |
1648 | struct task_struct *p; | 1647 | struct task_struct *p; |
1649 | 1648 | ||
1650 | list_for_each_entry(p, &tsk->children, sibling) { | 1649 | list_for_each_entry(p, &tsk->children, sibling) { |
1651 | int ret = wait_consider_task(wo, 0, p); | 1650 | int ret = wait_consider_task(wo, 0, p); |
1652 | if (ret) | 1651 | if (ret) |
1653 | return ret; | 1652 | return ret; |
1654 | } | 1653 | } |
1655 | 1654 | ||
1656 | return 0; | 1655 | return 0; |
1657 | } | 1656 | } |
1658 | 1657 | ||
1659 | static int ptrace_do_wait(struct wait_opts *wo, struct task_struct *tsk) | 1658 | static int ptrace_do_wait(struct wait_opts *wo, struct task_struct *tsk) |
1660 | { | 1659 | { |
1661 | struct task_struct *p; | 1660 | struct task_struct *p; |
1662 | 1661 | ||
1663 | list_for_each_entry(p, &tsk->ptraced, ptrace_entry) { | 1662 | list_for_each_entry(p, &tsk->ptraced, ptrace_entry) { |
1664 | int ret = wait_consider_task(wo, 1, p); | 1663 | int ret = wait_consider_task(wo, 1, p); |
1665 | if (ret) | 1664 | if (ret) |
1666 | return ret; | 1665 | return ret; |
1667 | } | 1666 | } |
1668 | 1667 | ||
1669 | return 0; | 1668 | return 0; |
1670 | } | 1669 | } |
1671 | 1670 | ||
1672 | static int child_wait_callback(wait_queue_t *wait, unsigned mode, | 1671 | static int child_wait_callback(wait_queue_t *wait, unsigned mode, |
1673 | int sync, void *key) | 1672 | int sync, void *key) |
1674 | { | 1673 | { |
1675 | struct wait_opts *wo = container_of(wait, struct wait_opts, | 1674 | struct wait_opts *wo = container_of(wait, struct wait_opts, |
1676 | child_wait); | 1675 | child_wait); |
1677 | struct task_struct *p = key; | 1676 | struct task_struct *p = key; |
1678 | 1677 | ||
1679 | if (!eligible_pid(wo, p)) | 1678 | if (!eligible_pid(wo, p)) |
1680 | return 0; | 1679 | return 0; |
1681 | 1680 | ||
1682 | if ((wo->wo_flags & __WNOTHREAD) && wait->private != p->parent) | 1681 | if ((wo->wo_flags & __WNOTHREAD) && wait->private != p->parent) |
1683 | return 0; | 1682 | return 0; |
1684 | 1683 | ||
1685 | return default_wake_function(wait, mode, sync, key); | 1684 | return default_wake_function(wait, mode, sync, key); |
1686 | } | 1685 | } |
1687 | 1686 | ||
1688 | void __wake_up_parent(struct task_struct *p, struct task_struct *parent) | 1687 | void __wake_up_parent(struct task_struct *p, struct task_struct *parent) |
1689 | { | 1688 | { |
1690 | __wake_up_sync_key(&parent->signal->wait_chldexit, | 1689 | __wake_up_sync_key(&parent->signal->wait_chldexit, |
1691 | TASK_INTERRUPTIBLE, 1, p); | 1690 | TASK_INTERRUPTIBLE, 1, p); |
1692 | } | 1691 | } |
1693 | 1692 | ||
1694 | static long do_wait(struct wait_opts *wo) | 1693 | static long do_wait(struct wait_opts *wo) |
1695 | { | 1694 | { |
1696 | struct task_struct *tsk; | 1695 | struct task_struct *tsk; |
1697 | int retval; | 1696 | int retval; |
1698 | 1697 | ||
1699 | trace_sched_process_wait(wo->wo_pid); | 1698 | trace_sched_process_wait(wo->wo_pid); |
1700 | 1699 | ||
1701 | init_waitqueue_func_entry(&wo->child_wait, child_wait_callback); | 1700 | init_waitqueue_func_entry(&wo->child_wait, child_wait_callback); |
1702 | wo->child_wait.private = current; | 1701 | wo->child_wait.private = current; |
1703 | add_wait_queue(¤t->signal->wait_chldexit, &wo->child_wait); | 1702 | add_wait_queue(¤t->signal->wait_chldexit, &wo->child_wait); |
1704 | repeat: | 1703 | repeat: |
1705 | /* | 1704 | /* |
1706 | * If there is nothing that can match our criteria, just get out. | 1705 | * If there is nothing that can match our criteria, just get out. |
1707 | * We will clear ->notask_error to zero if we see any child that | 1706 | * We will clear ->notask_error to zero if we see any child that |
1708 | * might later match our criteria, even if we are not able to reap | 1707 | * might later match our criteria, even if we are not able to reap |
1709 | * it yet. | 1708 | * it yet. |
1710 | */ | 1709 | */ |
1711 | wo->notask_error = -ECHILD; | 1710 | wo->notask_error = -ECHILD; |
1712 | if ((wo->wo_type < PIDTYPE_MAX) && | 1711 | if ((wo->wo_type < PIDTYPE_MAX) && |
1713 | (!wo->wo_pid || hlist_empty(&wo->wo_pid->tasks[wo->wo_type]))) | 1712 | (!wo->wo_pid || hlist_empty(&wo->wo_pid->tasks[wo->wo_type]))) |
1714 | goto notask; | 1713 | goto notask; |
1715 | 1714 | ||
1716 | set_current_state(TASK_INTERRUPTIBLE); | 1715 | set_current_state(TASK_INTERRUPTIBLE); |
1717 | read_lock(&tasklist_lock); | 1716 | read_lock(&tasklist_lock); |
1718 | tsk = current; | 1717 | tsk = current; |
1719 | do { | 1718 | do { |
1720 | retval = do_wait_thread(wo, tsk); | 1719 | retval = do_wait_thread(wo, tsk); |
1721 | if (retval) | 1720 | if (retval) |
1722 | goto end; | 1721 | goto end; |
1723 | 1722 | ||
1724 | retval = ptrace_do_wait(wo, tsk); | 1723 | retval = ptrace_do_wait(wo, tsk); |
1725 | if (retval) | 1724 | if (retval) |
1726 | goto end; | 1725 | goto end; |
1727 | 1726 | ||
1728 | if (wo->wo_flags & __WNOTHREAD) | 1727 | if (wo->wo_flags & __WNOTHREAD) |
1729 | break; | 1728 | break; |
1730 | } while_each_thread(current, tsk); | 1729 | } while_each_thread(current, tsk); |
1731 | read_unlock(&tasklist_lock); | 1730 | read_unlock(&tasklist_lock); |
1732 | 1731 | ||
1733 | notask: | 1732 | notask: |
1734 | retval = wo->notask_error; | 1733 | retval = wo->notask_error; |
1735 | if (!retval && !(wo->wo_flags & WNOHANG)) { | 1734 | if (!retval && !(wo->wo_flags & WNOHANG)) { |
1736 | retval = -ERESTARTSYS; | 1735 | retval = -ERESTARTSYS; |
1737 | if (!signal_pending(current)) { | 1736 | if (!signal_pending(current)) { |
1738 | schedule(); | 1737 | schedule(); |
1739 | goto repeat; | 1738 | goto repeat; |
1740 | } | 1739 | } |
1741 | } | 1740 | } |
1742 | end: | 1741 | end: |
1743 | __set_current_state(TASK_RUNNING); | 1742 | __set_current_state(TASK_RUNNING); |
1744 | remove_wait_queue(&current->signal->wait_chldexit, &wo->child_wait); | 1743 | remove_wait_queue(&current->signal->wait_chldexit, &wo->child_wait); |
1745 | return retval; | 1744 | return retval; |
1746 | } | 1745 | } |
1747 | 1746 | ||
1748 | SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *, | 1747 | SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *, |
1749 | infop, int, options, struct rusage __user *, ru) | 1748 | infop, int, options, struct rusage __user *, ru) |
1750 | { | 1749 | { |
1751 | struct wait_opts wo; | 1750 | struct wait_opts wo; |
1752 | struct pid *pid = NULL; | 1751 | struct pid *pid = NULL; |
1753 | enum pid_type type; | 1752 | enum pid_type type; |
1754 | long ret; | 1753 | long ret; |
1755 | 1754 | ||
1756 | if (options & ~(WNOHANG|WNOWAIT|WEXITED|WSTOPPED|WCONTINUED)) | 1755 | if (options & ~(WNOHANG|WNOWAIT|WEXITED|WSTOPPED|WCONTINUED)) |
1757 | return -EINVAL; | 1756 | return -EINVAL; |
1758 | if (!(options & (WEXITED|WSTOPPED|WCONTINUED))) | 1757 | if (!(options & (WEXITED|WSTOPPED|WCONTINUED))) |
1759 | return -EINVAL; | 1758 | return -EINVAL; |
1760 | 1759 | ||
1761 | switch (which) { | 1760 | switch (which) { |
1762 | case P_ALL: | 1761 | case P_ALL: |
1763 | type = PIDTYPE_MAX; | 1762 | type = PIDTYPE_MAX; |
1764 | break; | 1763 | break; |
1765 | case P_PID: | 1764 | case P_PID: |
1766 | type = PIDTYPE_PID; | 1765 | type = PIDTYPE_PID; |
1767 | if (upid <= 0) | 1766 | if (upid <= 0) |
1768 | return -EINVAL; | 1767 | return -EINVAL; |
1769 | break; | 1768 | break; |
1770 | case P_PGID: | 1769 | case P_PGID: |
1771 | type = PIDTYPE_PGID; | 1770 | type = PIDTYPE_PGID; |
1772 | if (upid <= 0) | 1771 | if (upid <= 0) |
1773 | return -EINVAL; | 1772 | return -EINVAL; |
1774 | break; | 1773 | break; |
1775 | default: | 1774 | default: |
1776 | return -EINVAL; | 1775 | return -EINVAL; |
1777 | } | 1776 | } |
1778 | 1777 | ||
1779 | if (type < PIDTYPE_MAX) | 1778 | if (type < PIDTYPE_MAX) |
1780 | pid = find_get_pid(upid); | 1779 | pid = find_get_pid(upid); |
1781 | 1780 | ||
1782 | wo.wo_type = type; | 1781 | wo.wo_type = type; |
1783 | wo.wo_pid = pid; | 1782 | wo.wo_pid = pid; |
1784 | wo.wo_flags = options; | 1783 | wo.wo_flags = options; |
1785 | wo.wo_info = infop; | 1784 | wo.wo_info = infop; |
1786 | wo.wo_stat = NULL; | 1785 | wo.wo_stat = NULL; |
1787 | wo.wo_rusage = ru; | 1786 | wo.wo_rusage = ru; |
1788 | ret = do_wait(&wo); | 1787 | ret = do_wait(&wo); |
1789 | 1788 | ||
1790 | if (ret > 0) { | 1789 | if (ret > 0) { |
1791 | ret = 0; | 1790 | ret = 0; |
1792 | } else if (infop) { | 1791 | } else if (infop) { |
1793 | /* | 1792 | /* |
1794 | * For a WNOHANG return, clear out all the fields | 1793 | * For a WNOHANG return, clear out all the fields |
1795 | * we would set so the user can easily tell the | 1794 | * we would set so the user can easily tell the |
1796 | * difference. | 1795 | * difference. |
1797 | */ | 1796 | */ |
1798 | if (!ret) | 1797 | if (!ret) |
1799 | ret = put_user(0, &infop->si_signo); | 1798 | ret = put_user(0, &infop->si_signo); |
1800 | if (!ret) | 1799 | if (!ret) |
1801 | ret = put_user(0, &infop->si_errno); | 1800 | ret = put_user(0, &infop->si_errno); |
1802 | if (!ret) | 1801 | if (!ret) |
1803 | ret = put_user(0, &infop->si_code); | 1802 | ret = put_user(0, &infop->si_code); |
1804 | if (!ret) | 1803 | if (!ret) |
1805 | ret = put_user(0, &infop->si_pid); | 1804 | ret = put_user(0, &infop->si_pid); |
1806 | if (!ret) | 1805 | if (!ret) |
1807 | ret = put_user(0, &infop->si_uid); | 1806 | ret = put_user(0, &infop->si_uid); |
1808 | if (!ret) | 1807 | if (!ret) |
1809 | ret = put_user(0, &infop->si_status); | 1808 | ret = put_user(0, &infop->si_status); |
1810 | } | 1809 | } |
1811 | 1810 | ||
1812 | put_pid(pid); | 1811 | put_pid(pid); |
1813 | 1812 | ||
1814 | /* avoid REGPARM breakage on x86: */ | 1813 | /* avoid REGPARM breakage on x86: */ |
1815 | asmlinkage_protect(5, ret, which, upid, infop, options, ru); | 1814 | asmlinkage_protect(5, ret, which, upid, infop, options, ru); |
1816 | return ret; | 1815 | return ret; |
1817 | } | 1816 | } |
1818 | 1817 | ||
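The WNOHANG branch above zeroes the si_* fields precisely so userspace can tell "no child ready" apart from a real event. A minimal userspace sketch, not part of this diff (the fork/sleep arrangement is purely illustrative):

        #include <signal.h>
        #include <stdio.h>
        #include <sys/wait.h>
        #include <unistd.h>

        int main(void)
        {
                siginfo_t info = { 0 };

                if (fork() == 0) {              /* child: stay alive so nothing is reapable yet */
                        sleep(30);
                        _exit(0);
                }

                /* With WNOHANG and nothing to reap, the kernel returns 0 and clears si_pid */
                if (waitid(P_ALL, 0, &info, WEXITED | WNOHANG) == 0 && info.si_pid == 0)
                        printf("no child has changed state yet\n");

                return 0;
        }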
1819 | SYSCALL_DEFINE4(wait4, pid_t, upid, int __user *, stat_addr, | 1818 | SYSCALL_DEFINE4(wait4, pid_t, upid, int __user *, stat_addr, |
1820 | int, options, struct rusage __user *, ru) | 1819 | int, options, struct rusage __user *, ru) |
1821 | { | 1820 | { |
1822 | struct wait_opts wo; | 1821 | struct wait_opts wo; |
1823 | struct pid *pid = NULL; | 1822 | struct pid *pid = NULL; |
1824 | enum pid_type type; | 1823 | enum pid_type type; |
1825 | long ret; | 1824 | long ret; |
1826 | 1825 | ||
1827 | if (options & ~(WNOHANG|WUNTRACED|WCONTINUED| | 1826 | if (options & ~(WNOHANG|WUNTRACED|WCONTINUED| |
1828 | __WNOTHREAD|__WCLONE|__WALL)) | 1827 | __WNOTHREAD|__WCLONE|__WALL)) |
1829 | return -EINVAL; | 1828 | return -EINVAL; |
1830 | 1829 | ||
1831 | if (upid == -1) | 1830 | if (upid == -1) |
1832 | type = PIDTYPE_MAX; | 1831 | type = PIDTYPE_MAX; |
1833 | else if (upid < 0) { | 1832 | else if (upid < 0) { |
1834 | type = PIDTYPE_PGID; | 1833 | type = PIDTYPE_PGID; |
1835 | pid = find_get_pid(-upid); | 1834 | pid = find_get_pid(-upid); |
1836 | } else if (upid == 0) { | 1835 | } else if (upid == 0) { |
1837 | type = PIDTYPE_PGID; | 1836 | type = PIDTYPE_PGID; |
1838 | pid = get_task_pid(current, PIDTYPE_PGID); | 1837 | pid = get_task_pid(current, PIDTYPE_PGID); |
1839 | } else /* upid > 0 */ { | 1838 | } else /* upid > 0 */ { |
1840 | type = PIDTYPE_PID; | 1839 | type = PIDTYPE_PID; |
1841 | pid = find_get_pid(upid); | 1840 | pid = find_get_pid(upid); |
1842 | } | 1841 | } |
1843 | 1842 | ||
1844 | wo.wo_type = type; | 1843 | wo.wo_type = type; |
1845 | wo.wo_pid = pid; | 1844 | wo.wo_pid = pid; |
1846 | wo.wo_flags = options | WEXITED; | 1845 | wo.wo_flags = options | WEXITED; |
1847 | wo.wo_info = NULL; | 1846 | wo.wo_info = NULL; |
1848 | wo.wo_stat = stat_addr; | 1847 | wo.wo_stat = stat_addr; |
1849 | wo.wo_rusage = ru; | 1848 | wo.wo_rusage = ru; |
1850 | ret = do_wait(&wo); | 1849 | ret = do_wait(&wo); |
1851 | put_pid(pid); | 1850 | put_pid(pid); |
1852 | 1851 | ||
1853 | /* avoid REGPARM breakage on x86: */ | 1852 | /* avoid REGPARM breakage on x86: */ |
1854 | asmlinkage_protect(4, ret, upid, stat_addr, options, ru); | 1853 | asmlinkage_protect(4, ret, upid, stat_addr, options, ru); |
1855 | return ret; | 1854 | return ret; |
1856 | } | 1855 | } |
1857 | 1856 | ||
1858 | #ifdef __ARCH_WANT_SYS_WAITPID | 1857 | #ifdef __ARCH_WANT_SYS_WAITPID |
1859 | 1858 | ||
1860 | /* | 1859 | /* |
1861 | * sys_waitpid() remains for compatibility. waitpid() should be | 1860 | * sys_waitpid() remains for compatibility. waitpid() should be |
1862 | * implemented by calling sys_wait4() from libc.a. | 1861 | * implemented by calling sys_wait4() from libc.a. |
1863 | */ | 1862 | */ |
1864 | SYSCALL_DEFINE3(waitpid, pid_t, pid, int __user *, stat_addr, int, options) | 1863 | SYSCALL_DEFINE3(waitpid, pid_t, pid, int __user *, stat_addr, int, options) |
1865 | { | 1864 | { |
1866 | return sys_wait4(pid, stat_addr, options, NULL); | 1865 | return sys_wait4(pid, stat_addr, options, NULL); |
1867 | } | 1866 | } |
1868 | 1867 | ||
1869 | #endif | 1868 | #endif |
1870 | 1869 |
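As the comment before sys_waitpid() notes, the compatibility path lives in libc. A userspace aside, not part of the diff, showing the equivalence it relies on (glibc exposes wait4() directly; waitpid() is the same call with a NULL rusage pointer):

        #include <stdio.h>
        #include <sys/resource.h>
        #include <sys/wait.h>
        #include <unistd.h>

        int main(void)
        {
                int status;
                pid_t pid = fork();

                if (pid == 0)
                        _exit(42);

                /* waitpid(pid, &status, 0) would observe exactly the same result */
                if (wait4(pid, &status, 0, NULL) == pid && WIFEXITED(status))
                        printf("child exited with %d\n", WEXITSTATUS(status));

                return 0;
        }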
kernel/pid.c
1 | /* | 1 | /* |
2 | * Generic pidhash and scalable, time-bounded PID allocator | 2 | * Generic pidhash and scalable, time-bounded PID allocator |
3 | * | 3 | * |
4 | * (C) 2002-2003 William Irwin, IBM | 4 | * (C) 2002-2003 William Irwin, IBM |
5 | * (C) 2004 William Irwin, Oracle | 5 | * (C) 2004 William Irwin, Oracle |
6 | * (C) 2002-2004 Ingo Molnar, Red Hat | 6 | * (C) 2002-2004 Ingo Molnar, Red Hat |
7 | * | 7 | * |
8 | * pid-structures are backing objects for tasks sharing a given ID to chain | 8 | * pid-structures are backing objects for tasks sharing a given ID to chain |
9 | * against. There is very little to them aside from hashing them and | 9 | * against. There is very little to them aside from hashing them and |
10 | * parking tasks using given ID's on a list. | 10 | * parking tasks using given ID's on a list. |
11 | * | 11 | * |
12 | * The hash is always changed with the tasklist_lock write-acquired, | 12 | * The hash is always changed with the tasklist_lock write-acquired, |
13 | * and the hash is only accessed with the tasklist_lock at least | 13 | * and the hash is only accessed with the tasklist_lock at least |
14 | * read-acquired, so there's no additional SMP locking needed here. | 14 | * read-acquired, so there's no additional SMP locking needed here. |
15 | * | 15 | * |
16 | * We have a list of bitmap pages, which bitmaps represent the PID space. | 16 | * We have a list of bitmap pages, which bitmaps represent the PID space. |
17 | * Allocating and freeing PIDs is completely lockless. The worst-case | 17 | * Allocating and freeing PIDs is completely lockless. The worst-case |
18 | * allocation scenario when all but one out of 1 million PIDs possible are | 18 | * allocation scenario when all but one out of 1 million PIDs possible are |
19 | * allocated already: the scanning of 32 list entries and at most PAGE_SIZE | 19 | * allocated already: the scanning of 32 list entries and at most PAGE_SIZE |
20 | * bytes. The typical fastpath is a single successful setbit. Freeing is O(1). | 20 | * bytes. The typical fastpath is a single successful setbit. Freeing is O(1). |
21 | * | 21 | * |
22 | * Pid namespaces: | 22 | * Pid namespaces: |
23 | * (C) 2007 Pavel Emelyanov <xemul@openvz.org>, OpenVZ, SWsoft Inc. | 23 | * (C) 2007 Pavel Emelyanov <xemul@openvz.org>, OpenVZ, SWsoft Inc. |
24 | * (C) 2007 Sukadev Bhattiprolu <sukadev@us.ibm.com>, IBM | 24 | * (C) 2007 Sukadev Bhattiprolu <sukadev@us.ibm.com>, IBM |
25 | * Many thanks to Oleg Nesterov for comments and help | 25 | * Many thanks to Oleg Nesterov for comments and help |
26 | * | 26 | * |
27 | */ | 27 | */ |
28 | 28 | ||
29 | #include <linux/mm.h> | 29 | #include <linux/mm.h> |
30 | #include <linux/module.h> | 30 | #include <linux/module.h> |
31 | #include <linux/slab.h> | 31 | #include <linux/slab.h> |
32 | #include <linux/init.h> | 32 | #include <linux/init.h> |
33 | #include <linux/rculist.h> | 33 | #include <linux/rculist.h> |
34 | #include <linux/bootmem.h> | 34 | #include <linux/bootmem.h> |
35 | #include <linux/hash.h> | 35 | #include <linux/hash.h> |
36 | #include <linux/pid_namespace.h> | 36 | #include <linux/pid_namespace.h> |
37 | #include <linux/init_task.h> | 37 | #include <linux/init_task.h> |
38 | #include <linux/syscalls.h> | 38 | #include <linux/syscalls.h> |
39 | 39 | ||
40 | #define pid_hashfn(nr, ns) \ | 40 | #define pid_hashfn(nr, ns) \ |
41 | hash_long((unsigned long)nr + (unsigned long)ns, pidhash_shift) | 41 | hash_long((unsigned long)nr + (unsigned long)ns, pidhash_shift) |
42 | static struct hlist_head *pid_hash; | 42 | static struct hlist_head *pid_hash; |
43 | static unsigned int pidhash_shift = 4; | 43 | static unsigned int pidhash_shift = 4; |
44 | struct pid init_struct_pid = INIT_STRUCT_PID; | 44 | struct pid init_struct_pid = INIT_STRUCT_PID; |
45 | 45 | ||
46 | int pid_max = PID_MAX_DEFAULT; | 46 | int pid_max = PID_MAX_DEFAULT; |
47 | 47 | ||
48 | #define RESERVED_PIDS 300 | 48 | #define RESERVED_PIDS 300 |
49 | 49 | ||
50 | int pid_max_min = RESERVED_PIDS + 1; | 50 | int pid_max_min = RESERVED_PIDS + 1; |
51 | int pid_max_max = PID_MAX_LIMIT; | 51 | int pid_max_max = PID_MAX_LIMIT; |
52 | 52 | ||
53 | #define BITS_PER_PAGE (PAGE_SIZE*8) | 53 | #define BITS_PER_PAGE (PAGE_SIZE*8) |
54 | #define BITS_PER_PAGE_MASK (BITS_PER_PAGE-1) | 54 | #define BITS_PER_PAGE_MASK (BITS_PER_PAGE-1) |
55 | 55 | ||
56 | static inline int mk_pid(struct pid_namespace *pid_ns, | 56 | static inline int mk_pid(struct pid_namespace *pid_ns, |
57 | struct pidmap *map, int off) | 57 | struct pidmap *map, int off) |
58 | { | 58 | { |
59 | return (map - pid_ns->pidmap)*BITS_PER_PAGE + off; | 59 | return (map - pid_ns->pidmap)*BITS_PER_PAGE + off; |
60 | } | 60 | } |
61 | 61 | ||
62 | #define find_next_offset(map, off) \ | 62 | #define find_next_offset(map, off) \ |
63 | find_next_zero_bit((map)->page, BITS_PER_PAGE, off) | 63 | find_next_zero_bit((map)->page, BITS_PER_PAGE, off) |
64 | 64 | ||
65 | /* | 65 | /* |
66 | * PID-map pages start out as NULL, they get allocated upon | 66 | * PID-map pages start out as NULL, they get allocated upon |
67 | * first use and are never deallocated. This way a low pid_max | 67 | * first use and are never deallocated. This way a low pid_max |
68 | * value does not cause lots of bitmaps to be allocated, but | 68 | * value does not cause lots of bitmaps to be allocated, but |
69 | * the scheme scales to up to 4 million PIDs, runtime. | 69 | * the scheme scales to up to 4 million PIDs, runtime. |
70 | */ | 70 | */ |
71 | struct pid_namespace init_pid_ns = { | 71 | struct pid_namespace init_pid_ns = { |
72 | .kref = { | 72 | .kref = { |
73 | .refcount = ATOMIC_INIT(2), | 73 | .refcount = ATOMIC_INIT(2), |
74 | }, | 74 | }, |
75 | .pidmap = { | 75 | .pidmap = { |
76 | [ 0 ... PIDMAP_ENTRIES-1] = { ATOMIC_INIT(BITS_PER_PAGE), NULL } | 76 | [ 0 ... PIDMAP_ENTRIES-1] = { ATOMIC_INIT(BITS_PER_PAGE), NULL } |
77 | }, | 77 | }, |
78 | .last_pid = 0, | 78 | .last_pid = 0, |
79 | .level = 0, | 79 | .level = 0, |
80 | .child_reaper = &init_task, | 80 | .child_reaper = &init_task, |
81 | }; | 81 | }; |
82 | EXPORT_SYMBOL_GPL(init_pid_ns); | 82 | EXPORT_SYMBOL_GPL(init_pid_ns); |
83 | 83 | ||
84 | int is_container_init(struct task_struct *tsk) | 84 | int is_container_init(struct task_struct *tsk) |
85 | { | 85 | { |
86 | int ret = 0; | 86 | int ret = 0; |
87 | struct pid *pid; | 87 | struct pid *pid; |
88 | 88 | ||
89 | rcu_read_lock(); | 89 | rcu_read_lock(); |
90 | pid = task_pid(tsk); | 90 | pid = task_pid(tsk); |
91 | if (pid != NULL && pid->numbers[pid->level].nr == 1) | 91 | if (pid != NULL && pid->numbers[pid->level].nr == 1) |
92 | ret = 1; | 92 | ret = 1; |
93 | rcu_read_unlock(); | 93 | rcu_read_unlock(); |
94 | 94 | ||
95 | return ret; | 95 | return ret; |
96 | } | 96 | } |
97 | EXPORT_SYMBOL(is_container_init); | 97 | EXPORT_SYMBOL(is_container_init); |
98 | 98 | ||
99 | /* | 99 | /* |
100 | * Note: disable interrupts while the pidmap_lock is held as an | 100 | * Note: disable interrupts while the pidmap_lock is held as an |
101 | * interrupt might come in and do read_lock(&tasklist_lock). | 101 | * interrupt might come in and do read_lock(&tasklist_lock). |
102 | * | 102 | * |
103 | * If we don't disable interrupts there is a nasty deadlock between | 103 | * If we don't disable interrupts there is a nasty deadlock between |
104 | * detach_pid()->free_pid() and another cpu that does | 104 | * detach_pid()->free_pid() and another cpu that does |
105 | * spin_lock(&pidmap_lock) followed by an interrupt routine that does | 105 | * spin_lock(&pidmap_lock) followed by an interrupt routine that does |
106 | * read_lock(&tasklist_lock); | 106 | * read_lock(&tasklist_lock); |
107 | * | 107 | * |
108 | * After we clean up the tasklist_lock and know there are no | 108 | * After we clean up the tasklist_lock and know there are no |
109 | * irq handlers that take it we can leave the interrupts enabled. | 109 | * irq handlers that take it we can leave the interrupts enabled. |
110 | * For now it is easier to be safe than to prove it can't happen. | 110 | * For now it is easier to be safe than to prove it can't happen. |
111 | */ | 111 | */ |
112 | 112 | ||
113 | static __cacheline_aligned_in_smp DEFINE_SPINLOCK(pidmap_lock); | 113 | static __cacheline_aligned_in_smp DEFINE_SPINLOCK(pidmap_lock); |
114 | 114 | ||
115 | static void free_pidmap(struct upid *upid) | 115 | static void free_pidmap(struct upid *upid) |
116 | { | 116 | { |
117 | int nr = upid->nr; | 117 | int nr = upid->nr; |
118 | struct pidmap *map = upid->ns->pidmap + nr / BITS_PER_PAGE; | 118 | struct pidmap *map = upid->ns->pidmap + nr / BITS_PER_PAGE; |
119 | int offset = nr & BITS_PER_PAGE_MASK; | 119 | int offset = nr & BITS_PER_PAGE_MASK; |
120 | 120 | ||
121 | clear_bit(offset, map->page); | 121 | clear_bit(offset, map->page); |
122 | atomic_inc(&map->nr_free); | 122 | atomic_inc(&map->nr_free); |
123 | } | 123 | } |
124 | 124 | ||
125 | /* | 125 | /* |
126 | * If we started walking pids at 'base', is 'a' seen before 'b'? | 126 | * If we started walking pids at 'base', is 'a' seen before 'b'? |
127 | */ | 127 | */ |
128 | static int pid_before(int base, int a, int b) | 128 | static int pid_before(int base, int a, int b) |
129 | { | 129 | { |
130 | /* | 130 | /* |
131 | * This is the same as saying | 131 | * This is the same as saying |
132 | * | 132 | * |
133 | * (a - base + MAXUINT) % MAXUINT < (b - base + MAXUINT) % MAXUINT | 133 | * (a - base + MAXUINT) % MAXUINT < (b - base + MAXUINT) % MAXUINT |
134 | * and that mapping orders 'a' and 'b' with respect to 'base'. | 134 | * and that mapping orders 'a' and 'b' with respect to 'base'. |
135 | */ | 135 | */ |
136 | return (unsigned)(a - base) < (unsigned)(b - base); | 136 | return (unsigned)(a - base) < (unsigned)(b - base); |
137 | } | 137 | } |
138 | 138 | ||
139 | /* | 139 | /* |
140 | * We might be racing with someone else trying to set pid_ns->last_pid. | 140 | * We might be racing with someone else trying to set pid_ns->last_pid. |
141 | * We want the winner to have the "later" value, because if the | 141 | * We want the winner to have the "later" value, because if the |
142 | * "earlier" value prevails, then a pid may get reused immediately. | 142 | * "earlier" value prevails, then a pid may get reused immediately. |
143 | * | 143 | * |
144 | * Since pids rollover, it is not sufficient to just pick the bigger | 144 | * Since pids rollover, it is not sufficient to just pick the bigger |
145 | * value. We have to consider where we started counting from. | 145 | * value. We have to consider where we started counting from. |
146 | * | 146 | * |
147 | * 'base' is the value of pid_ns->last_pid that we observed when | 147 | * 'base' is the value of pid_ns->last_pid that we observed when |
148 | * we started looking for a pid. | 148 | * we started looking for a pid. |
149 | * | 149 | * |
150 | * 'pid' is the pid that we eventually found. | 150 | * 'pid' is the pid that we eventually found. |
151 | */ | 151 | */ |
152 | static void set_last_pid(struct pid_namespace *pid_ns, int base, int pid) | 152 | static void set_last_pid(struct pid_namespace *pid_ns, int base, int pid) |
153 | { | 153 | { |
154 | int prev; | 154 | int prev; |
155 | int last_write = base; | 155 | int last_write = base; |
156 | do { | 156 | do { |
157 | prev = last_write; | 157 | prev = last_write; |
158 | last_write = cmpxchg(&pid_ns->last_pid, prev, pid); | 158 | last_write = cmpxchg(&pid_ns->last_pid, prev, pid); |
159 | } while ((prev != last_write) && (pid_before(base, last_write, pid))); | 159 | } while ((prev != last_write) && (pid_before(base, last_write, pid))); |
160 | } | 160 | } |
161 | 161 | ||
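The arithmetic in pid_before() is the whole trick behind the lock-free update above: subtracting 'base' and comparing as unsigned orders pids on a circle that starts where the walk began, so the "later value wins" rule in set_last_pid() survives a rollover. A standalone plain-C sketch, with illustrative numbers only (after a rollover the allocator restarts near RESERVED_PIDS, hence the small second value):

        #include <stdio.h>

        /* Same expression as pid_before() above, lifted out for demonstration */
        static int pid_before(int base, int a, int b)
        {
                return (unsigned)(a - base) < (unsigned)(b - base);
        }

        int main(void)
        {
                /* base just below the rollover: 300 was handed out *after* 32767 */
                printf("%d\n", pid_before(32760, 32767, 300));  /* 1: 32767 is seen first */
                printf("%d\n", pid_before(32760, 300, 32767));  /* 0: 300 is the later value */
                return 0;
        }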
162 | static int alloc_pidmap(struct pid_namespace *pid_ns) | 162 | static int alloc_pidmap(struct pid_namespace *pid_ns) |
163 | { | 163 | { |
164 | int i, offset, max_scan, pid, last = pid_ns->last_pid; | 164 | int i, offset, max_scan, pid, last = pid_ns->last_pid; |
165 | struct pidmap *map; | 165 | struct pidmap *map; |
166 | 166 | ||
167 | pid = last + 1; | 167 | pid = last + 1; |
168 | if (pid >= pid_max) | 168 | if (pid >= pid_max) |
169 | pid = RESERVED_PIDS; | 169 | pid = RESERVED_PIDS; |
170 | offset = pid & BITS_PER_PAGE_MASK; | 170 | offset = pid & BITS_PER_PAGE_MASK; |
171 | map = &pid_ns->pidmap[pid/BITS_PER_PAGE]; | 171 | map = &pid_ns->pidmap[pid/BITS_PER_PAGE]; |
172 | /* | 172 | /* |
173 | * If last_pid points into the middle of the map->page we | 173 | * If last_pid points into the middle of the map->page we |
174 | * want to scan this bitmap block twice, the second time | 174 | * want to scan this bitmap block twice, the second time |
175 | * we start with offset == 0 (or RESERVED_PIDS). | 175 | * we start with offset == 0 (or RESERVED_PIDS). |
176 | */ | 176 | */ |
177 | max_scan = DIV_ROUND_UP(pid_max, BITS_PER_PAGE) - !offset; | 177 | max_scan = DIV_ROUND_UP(pid_max, BITS_PER_PAGE) - !offset; |
178 | for (i = 0; i <= max_scan; ++i) { | 178 | for (i = 0; i <= max_scan; ++i) { |
179 | if (unlikely(!map->page)) { | 179 | if (unlikely(!map->page)) { |
180 | void *page = kzalloc(PAGE_SIZE, GFP_KERNEL); | 180 | void *page = kzalloc(PAGE_SIZE, GFP_KERNEL); |
181 | /* | 181 | /* |
182 | * Free the page if someone raced with us | 182 | * Free the page if someone raced with us |
183 | * installing it: | 183 | * installing it: |
184 | */ | 184 | */ |
185 | spin_lock_irq(&pidmap_lock); | 185 | spin_lock_irq(&pidmap_lock); |
186 | if (!map->page) { | 186 | if (!map->page) { |
187 | map->page = page; | 187 | map->page = page; |
188 | page = NULL; | 188 | page = NULL; |
189 | } | 189 | } |
190 | spin_unlock_irq(&pidmap_lock); | 190 | spin_unlock_irq(&pidmap_lock); |
191 | kfree(page); | 191 | kfree(page); |
192 | if (unlikely(!map->page)) | 192 | if (unlikely(!map->page)) |
193 | break; | 193 | break; |
194 | } | 194 | } |
195 | if (likely(atomic_read(&map->nr_free))) { | 195 | if (likely(atomic_read(&map->nr_free))) { |
196 | do { | 196 | do { |
197 | if (!test_and_set_bit(offset, map->page)) { | 197 | if (!test_and_set_bit(offset, map->page)) { |
198 | atomic_dec(&map->nr_free); | 198 | atomic_dec(&map->nr_free); |
199 | set_last_pid(pid_ns, last, pid); | 199 | set_last_pid(pid_ns, last, pid); |
200 | return pid; | 200 | return pid; |
201 | } | 201 | } |
202 | offset = find_next_offset(map, offset); | 202 | offset = find_next_offset(map, offset); |
203 | pid = mk_pid(pid_ns, map, offset); | 203 | pid = mk_pid(pid_ns, map, offset); |
204 | } while (offset < BITS_PER_PAGE && pid < pid_max); | 204 | } while (offset < BITS_PER_PAGE && pid < pid_max); |
205 | } | 205 | } |
206 | if (map < &pid_ns->pidmap[(pid_max-1)/BITS_PER_PAGE]) { | 206 | if (map < &pid_ns->pidmap[(pid_max-1)/BITS_PER_PAGE]) { |
207 | ++map; | 207 | ++map; |
208 | offset = 0; | 208 | offset = 0; |
209 | } else { | 209 | } else { |
210 | map = &pid_ns->pidmap[0]; | 210 | map = &pid_ns->pidmap[0]; |
211 | offset = RESERVED_PIDS; | 211 | offset = RESERVED_PIDS; |
212 | if (unlikely(last == offset)) | 212 | if (unlikely(last == offset)) |
213 | break; | 213 | break; |
214 | } | 214 | } |
215 | pid = mk_pid(pid_ns, map, offset); | 215 | pid = mk_pid(pid_ns, map, offset); |
216 | } | 216 | } |
217 | return -1; | 217 | return -1; |
218 | } | 218 | } |
219 | 219 | ||
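alloc_pidmap() above scans per-page bitmaps, allocates the pages lazily, and wraps back to RESERVED_PIDS at most once. A much-simplified userspace sketch of just the scan-and-wrap shape -- a flat array instead of pidmap pages, no locking, and made-up constants (PID_MAX_SKETCH, RESERVED_SKETCH):

        #include <stdbool.h>
        #include <stdio.h>

        #define PID_MAX_SKETCH  64      /* stand-in for pid_max */
        #define RESERVED_SKETCH 8       /* stand-in for RESERVED_PIDS */

        static bool used[PID_MAX_SKETCH];

        /* Scan forward from last+1 for a free slot, wrapping to the reserved
         * boundary at most once -- the same overall shape as alloc_pidmap(). */
        static int alloc_pid_sketch(int last)
        {
                int pid = last + 1;
                bool wrapped = false;

                for (;;) {
                        if (pid >= PID_MAX_SKETCH) {
                                if (wrapped)
                                        return -1;      /* pid space exhausted */
                                wrapped = true;
                                pid = RESERVED_SKETCH;
                        }
                        if (!used[pid]) {
                                used[pid] = true;
                                return pid;
                        }
                        pid++;
                }
        }

        int main(void)
        {
                printf("%d\n", alloc_pid_sketch(60));   /* 61 */
                printf("%d\n", alloc_pid_sketch(63));   /* wraps: 8 */
                return 0;
        }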
220 | int next_pidmap(struct pid_namespace *pid_ns, unsigned int last) | 220 | int next_pidmap(struct pid_namespace *pid_ns, unsigned int last) |
221 | { | 221 | { |
222 | int offset; | 222 | int offset; |
223 | struct pidmap *map, *end; | 223 | struct pidmap *map, *end; |
224 | 224 | ||
225 | if (last >= PID_MAX_LIMIT) | 225 | if (last >= PID_MAX_LIMIT) |
226 | return -1; | 226 | return -1; |
227 | 227 | ||
228 | offset = (last + 1) & BITS_PER_PAGE_MASK; | 228 | offset = (last + 1) & BITS_PER_PAGE_MASK; |
229 | map = &pid_ns->pidmap[(last + 1)/BITS_PER_PAGE]; | 229 | map = &pid_ns->pidmap[(last + 1)/BITS_PER_PAGE]; |
230 | end = &pid_ns->pidmap[PIDMAP_ENTRIES]; | 230 | end = &pid_ns->pidmap[PIDMAP_ENTRIES]; |
231 | for (; map < end; map++, offset = 0) { | 231 | for (; map < end; map++, offset = 0) { |
232 | if (unlikely(!map->page)) | 232 | if (unlikely(!map->page)) |
233 | continue; | 233 | continue; |
234 | offset = find_next_bit((map)->page, BITS_PER_PAGE, offset); | 234 | offset = find_next_bit((map)->page, BITS_PER_PAGE, offset); |
235 | if (offset < BITS_PER_PAGE) | 235 | if (offset < BITS_PER_PAGE) |
236 | return mk_pid(pid_ns, map, offset); | 236 | return mk_pid(pid_ns, map, offset); |
237 | } | 237 | } |
238 | return -1; | 238 | return -1; |
239 | } | 239 | } |
240 | 240 | ||
241 | void put_pid(struct pid *pid) | 241 | void put_pid(struct pid *pid) |
242 | { | 242 | { |
243 | struct pid_namespace *ns; | 243 | struct pid_namespace *ns; |
244 | 244 | ||
245 | if (!pid) | 245 | if (!pid) |
246 | return; | 246 | return; |
247 | 247 | ||
248 | ns = pid->numbers[pid->level].ns; | 248 | ns = pid->numbers[pid->level].ns; |
249 | if ((atomic_read(&pid->count) == 1) || | 249 | if ((atomic_read(&pid->count) == 1) || |
250 | atomic_dec_and_test(&pid->count)) { | 250 | atomic_dec_and_test(&pid->count)) { |
251 | kmem_cache_free(ns->pid_cachep, pid); | 251 | kmem_cache_free(ns->pid_cachep, pid); |
252 | put_pid_ns(ns); | 252 | put_pid_ns(ns); |
253 | } | 253 | } |
254 | } | 254 | } |
255 | EXPORT_SYMBOL_GPL(put_pid); | 255 | EXPORT_SYMBOL_GPL(put_pid); |
256 | 256 | ||
257 | static void delayed_put_pid(struct rcu_head *rhp) | 257 | static void delayed_put_pid(struct rcu_head *rhp) |
258 | { | 258 | { |
259 | struct pid *pid = container_of(rhp, struct pid, rcu); | 259 | struct pid *pid = container_of(rhp, struct pid, rcu); |
260 | put_pid(pid); | 260 | put_pid(pid); |
261 | } | 261 | } |
262 | 262 | ||
263 | void free_pid(struct pid *pid) | 263 | void free_pid(struct pid *pid) |
264 | { | 264 | { |
265 | /* We can be called with write_lock_irq(&tasklist_lock) held */ | 265 | /* We can be called with write_lock_irq(&tasklist_lock) held */ |
266 | int i; | 266 | int i; |
267 | unsigned long flags; | 267 | unsigned long flags; |
268 | 268 | ||
269 | spin_lock_irqsave(&pidmap_lock, flags); | 269 | spin_lock_irqsave(&pidmap_lock, flags); |
270 | for (i = 0; i <= pid->level; i++) | 270 | for (i = 0; i <= pid->level; i++) |
271 | hlist_del_rcu(&pid->numbers[i].pid_chain); | 271 | hlist_del_rcu(&pid->numbers[i].pid_chain); |
272 | spin_unlock_irqrestore(&pidmap_lock, flags); | 272 | spin_unlock_irqrestore(&pidmap_lock, flags); |
273 | 273 | ||
274 | for (i = 0; i <= pid->level; i++) | 274 | for (i = 0; i <= pid->level; i++) |
275 | free_pidmap(pid->numbers + i); | 275 | free_pidmap(pid->numbers + i); |
276 | 276 | ||
277 | call_rcu(&pid->rcu, delayed_put_pid); | 277 | call_rcu(&pid->rcu, delayed_put_pid); |
278 | } | 278 | } |
279 | 279 | ||
280 | struct pid *alloc_pid(struct pid_namespace *ns) | 280 | struct pid *alloc_pid(struct pid_namespace *ns) |
281 | { | 281 | { |
282 | struct pid *pid; | 282 | struct pid *pid; |
283 | enum pid_type type; | 283 | enum pid_type type; |
284 | int i, nr; | 284 | int i, nr; |
285 | struct pid_namespace *tmp; | 285 | struct pid_namespace *tmp; |
286 | struct upid *upid; | 286 | struct upid *upid; |
287 | 287 | ||
288 | pid = kmem_cache_alloc(ns->pid_cachep, GFP_KERNEL); | 288 | pid = kmem_cache_alloc(ns->pid_cachep, GFP_KERNEL); |
289 | if (!pid) | 289 | if (!pid) |
290 | goto out; | 290 | goto out; |
291 | 291 | ||
292 | tmp = ns; | 292 | tmp = ns; |
293 | for (i = ns->level; i >= 0; i--) { | 293 | for (i = ns->level; i >= 0; i--) { |
294 | nr = alloc_pidmap(tmp); | 294 | nr = alloc_pidmap(tmp); |
295 | if (nr < 0) | 295 | if (nr < 0) |
296 | goto out_free; | 296 | goto out_free; |
297 | 297 | ||
298 | pid->numbers[i].nr = nr; | 298 | pid->numbers[i].nr = nr; |
299 | pid->numbers[i].ns = tmp; | 299 | pid->numbers[i].ns = tmp; |
300 | tmp = tmp->parent; | 300 | tmp = tmp->parent; |
301 | } | 301 | } |
302 | 302 | ||
303 | get_pid_ns(ns); | 303 | get_pid_ns(ns); |
304 | pid->level = ns->level; | 304 | pid->level = ns->level; |
305 | atomic_set(&pid->count, 1); | 305 | atomic_set(&pid->count, 1); |
306 | for (type = 0; type < PIDTYPE_MAX; ++type) | 306 | for (type = 0; type < PIDTYPE_MAX; ++type) |
307 | INIT_HLIST_HEAD(&pid->tasks[type]); | 307 | INIT_HLIST_HEAD(&pid->tasks[type]); |
308 | 308 | ||
309 | upid = pid->numbers + ns->level; | 309 | upid = pid->numbers + ns->level; |
310 | spin_lock_irq(&pidmap_lock); | 310 | spin_lock_irq(&pidmap_lock); |
311 | for ( ; upid >= pid->numbers; --upid) | 311 | for ( ; upid >= pid->numbers; --upid) |
312 | hlist_add_head_rcu(&upid->pid_chain, | 312 | hlist_add_head_rcu(&upid->pid_chain, |
313 | &pid_hash[pid_hashfn(upid->nr, upid->ns)]); | 313 | &pid_hash[pid_hashfn(upid->nr, upid->ns)]); |
314 | spin_unlock_irq(&pidmap_lock); | 314 | spin_unlock_irq(&pidmap_lock); |
315 | 315 | ||
316 | out: | 316 | out: |
317 | return pid; | 317 | return pid; |
318 | 318 | ||
319 | out_free: | 319 | out_free: |
320 | while (++i <= ns->level) | 320 | while (++i <= ns->level) |
321 | free_pidmap(pid->numbers + i); | 321 | free_pidmap(pid->numbers + i); |
322 | 322 | ||
323 | kmem_cache_free(ns->pid_cachep, pid); | 323 | kmem_cache_free(ns->pid_cachep, pid); |
324 | pid = NULL; | 324 | pid = NULL; |
325 | goto out; | 325 | goto out; |
326 | } | 326 | } |
327 | 327 | ||
328 | struct pid *find_pid_ns(int nr, struct pid_namespace *ns) | 328 | struct pid *find_pid_ns(int nr, struct pid_namespace *ns) |
329 | { | 329 | { |
330 | struct hlist_node *elem; | 330 | struct hlist_node *elem; |
331 | struct upid *pnr; | 331 | struct upid *pnr; |
332 | 332 | ||
333 | hlist_for_each_entry_rcu(pnr, elem, | 333 | hlist_for_each_entry_rcu(pnr, elem, |
334 | &pid_hash[pid_hashfn(nr, ns)], pid_chain) | 334 | &pid_hash[pid_hashfn(nr, ns)], pid_chain) |
335 | if (pnr->nr == nr && pnr->ns == ns) | 335 | if (pnr->nr == nr && pnr->ns == ns) |
336 | return container_of(pnr, struct pid, | 336 | return container_of(pnr, struct pid, |
337 | numbers[ns->level]); | 337 | numbers[ns->level]); |
338 | 338 | ||
339 | return NULL; | 339 | return NULL; |
340 | } | 340 | } |
341 | EXPORT_SYMBOL_GPL(find_pid_ns); | 341 | EXPORT_SYMBOL_GPL(find_pid_ns); |
342 | 342 | ||
343 | struct pid *find_vpid(int nr) | 343 | struct pid *find_vpid(int nr) |
344 | { | 344 | { |
345 | return find_pid_ns(nr, current->nsproxy->pid_ns); | 345 | return find_pid_ns(nr, current->nsproxy->pid_ns); |
346 | } | 346 | } |
347 | EXPORT_SYMBOL_GPL(find_vpid); | 347 | EXPORT_SYMBOL_GPL(find_vpid); |
348 | 348 | ||
349 | /* | 349 | /* |
350 | * attach_pid() must be called with the tasklist_lock write-held. | 350 | * attach_pid() must be called with the tasklist_lock write-held. |
351 | */ | 351 | */ |
352 | void attach_pid(struct task_struct *task, enum pid_type type, | 352 | void attach_pid(struct task_struct *task, enum pid_type type, |
353 | struct pid *pid) | 353 | struct pid *pid) |
354 | { | 354 | { |
355 | struct pid_link *link; | 355 | struct pid_link *link; |
356 | 356 | ||
357 | link = &task->pids[type]; | 357 | link = &task->pids[type]; |
358 | link->pid = pid; | 358 | link->pid = pid; |
359 | hlist_add_head_rcu(&link->node, &pid->tasks[type]); | 359 | hlist_add_head_rcu(&link->node, &pid->tasks[type]); |
360 | } | 360 | } |
361 | 361 | ||
362 | static void __change_pid(struct task_struct *task, enum pid_type type, | 362 | static void __change_pid(struct task_struct *task, enum pid_type type, |
363 | struct pid *new) | 363 | struct pid *new) |
364 | { | 364 | { |
365 | struct pid_link *link; | 365 | struct pid_link *link; |
366 | struct pid *pid; | 366 | struct pid *pid; |
367 | int tmp; | 367 | int tmp; |
368 | 368 | ||
369 | link = &task->pids[type]; | 369 | link = &task->pids[type]; |
370 | pid = link->pid; | 370 | pid = link->pid; |
371 | 371 | ||
372 | hlist_del_rcu(&link->node); | 372 | hlist_del_rcu(&link->node); |
373 | link->pid = new; | 373 | link->pid = new; |
374 | 374 | ||
375 | for (tmp = PIDTYPE_MAX; --tmp >= 0; ) | 375 | for (tmp = PIDTYPE_MAX; --tmp >= 0; ) |
376 | if (!hlist_empty(&pid->tasks[tmp])) | 376 | if (!hlist_empty(&pid->tasks[tmp])) |
377 | return; | 377 | return; |
378 | 378 | ||
379 | free_pid(pid); | 379 | free_pid(pid); |
380 | } | 380 | } |
381 | 381 | ||
382 | void detach_pid(struct task_struct *task, enum pid_type type) | 382 | void detach_pid(struct task_struct *task, enum pid_type type) |
383 | { | 383 | { |
384 | __change_pid(task, type, NULL); | 384 | __change_pid(task, type, NULL); |
385 | } | 385 | } |
386 | 386 | ||
387 | void change_pid(struct task_struct *task, enum pid_type type, | 387 | void change_pid(struct task_struct *task, enum pid_type type, |
388 | struct pid *pid) | 388 | struct pid *pid) |
389 | { | 389 | { |
390 | __change_pid(task, type, pid); | 390 | __change_pid(task, type, pid); |
391 | attach_pid(task, type, pid); | 391 | attach_pid(task, type, pid); |
392 | } | 392 | } |
393 | 393 | ||
394 | /* transfer_pid is an optimization of attach_pid(new), detach_pid(old) */ | 394 | /* transfer_pid is an optimization of attach_pid(new), detach_pid(old) */ |
395 | void transfer_pid(struct task_struct *old, struct task_struct *new, | 395 | void transfer_pid(struct task_struct *old, struct task_struct *new, |
396 | enum pid_type type) | 396 | enum pid_type type) |
397 | { | 397 | { |
398 | new->pids[type].pid = old->pids[type].pid; | 398 | new->pids[type].pid = old->pids[type].pid; |
399 | hlist_replace_rcu(&old->pids[type].node, &new->pids[type].node); | 399 | hlist_replace_rcu(&old->pids[type].node, &new->pids[type].node); |
400 | } | 400 | } |
401 | 401 | ||
402 | struct task_struct *pid_task(struct pid *pid, enum pid_type type) | 402 | struct task_struct *pid_task(struct pid *pid, enum pid_type type) |
403 | { | 403 | { |
404 | struct task_struct *result = NULL; | 404 | struct task_struct *result = NULL; |
405 | if (pid) { | 405 | if (pid) { |
406 | struct hlist_node *first; | 406 | struct hlist_node *first; |
407 | first = rcu_dereference_check(hlist_first_rcu(&pid->tasks[type]), | 407 | first = rcu_dereference_check(hlist_first_rcu(&pid->tasks[type]), |
408 | rcu_read_lock_held() || | ||
409 | lockdep_tasklist_lock_is_held()); | 408 | lockdep_tasklist_lock_is_held()); |
410 | if (first) | 409 | if (first) |
411 | result = hlist_entry(first, struct task_struct, pids[(type)].node); | 410 | result = hlist_entry(first, struct task_struct, pids[(type)].node); |
412 | } | 411 | } |
413 | return result; | 412 | return result; |
414 | } | 413 | } |
415 | EXPORT_SYMBOL(pid_task); | 414 | EXPORT_SYMBOL(pid_task); |
416 | 415 | ||
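This hunk is the commit's pattern in miniature: rcu_dereference_check() now ORs rcu_read_lock_held() into its condition itself, so pid_task() only has to name the additional tasklist_lock case. A hedged sketch of a caller that satisfies the implicit check -- the helper name is invented for illustration; pid_task(), task_tgid_nr() and rcu_read_lock() are the real APIs:

        #include <linux/pid.h>
        #include <linux/rcupdate.h>
        #include <linux/sched.h>

        /* Illustrative only: an RCU read-side critical section is enough for the
         * rcu_dereference_check() inside pid_task(); callers holding tasklist_lock
         * are covered by the explicit lockdep_tasklist_lock_is_held() condition. */
        static pid_t example_pid_to_tgid(struct pid *pid)
        {
                struct task_struct *tsk;
                pid_t nr = 0;

                rcu_read_lock();
                tsk = pid_task(pid, PIDTYPE_PID);
                if (tsk)
                        nr = task_tgid_nr(tsk);
                rcu_read_unlock();

                return nr;
        }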
417 | /* | 416 | /* |
418 | * Must be called under rcu_read_lock(). | 417 | * Must be called under rcu_read_lock(). |
419 | */ | 418 | */ |
420 | struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns) | 419 | struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns) |
421 | { | 420 | { |
422 | rcu_lockdep_assert(rcu_read_lock_held()); | 421 | rcu_lockdep_assert(rcu_read_lock_held()); |
423 | return pid_task(find_pid_ns(nr, ns), PIDTYPE_PID); | 422 | return pid_task(find_pid_ns(nr, ns), PIDTYPE_PID); |
424 | } | 423 | } |
425 | 424 | ||
426 | struct task_struct *find_task_by_vpid(pid_t vnr) | 425 | struct task_struct *find_task_by_vpid(pid_t vnr) |
427 | { | 426 | { |
428 | return find_task_by_pid_ns(vnr, current->nsproxy->pid_ns); | 427 | return find_task_by_pid_ns(vnr, current->nsproxy->pid_ns); |
429 | } | 428 | } |
430 | 429 | ||
431 | struct pid *get_task_pid(struct task_struct *task, enum pid_type type) | 430 | struct pid *get_task_pid(struct task_struct *task, enum pid_type type) |
432 | { | 431 | { |
433 | struct pid *pid; | 432 | struct pid *pid; |
434 | rcu_read_lock(); | 433 | rcu_read_lock(); |
435 | if (type != PIDTYPE_PID) | 434 | if (type != PIDTYPE_PID) |
436 | task = task->group_leader; | 435 | task = task->group_leader; |
437 | pid = get_pid(task->pids[type].pid); | 436 | pid = get_pid(task->pids[type].pid); |
438 | rcu_read_unlock(); | 437 | rcu_read_unlock(); |
439 | return pid; | 438 | return pid; |
440 | } | 439 | } |
441 | EXPORT_SYMBOL_GPL(get_task_pid); | 440 | EXPORT_SYMBOL_GPL(get_task_pid); |
442 | 441 | ||
443 | struct task_struct *get_pid_task(struct pid *pid, enum pid_type type) | 442 | struct task_struct *get_pid_task(struct pid *pid, enum pid_type type) |
444 | { | 443 | { |
445 | struct task_struct *result; | 444 | struct task_struct *result; |
446 | rcu_read_lock(); | 445 | rcu_read_lock(); |
447 | result = pid_task(pid, type); | 446 | result = pid_task(pid, type); |
448 | if (result) | 447 | if (result) |
449 | get_task_struct(result); | 448 | get_task_struct(result); |
450 | rcu_read_unlock(); | 449 | rcu_read_unlock(); |
451 | return result; | 450 | return result; |
452 | } | 451 | } |
453 | EXPORT_SYMBOL_GPL(get_pid_task); | 452 | EXPORT_SYMBOL_GPL(get_pid_task); |
454 | 453 | ||
455 | struct pid *find_get_pid(pid_t nr) | 454 | struct pid *find_get_pid(pid_t nr) |
456 | { | 455 | { |
457 | struct pid *pid; | 456 | struct pid *pid; |
458 | 457 | ||
459 | rcu_read_lock(); | 458 | rcu_read_lock(); |
460 | pid = get_pid(find_vpid(nr)); | 459 | pid = get_pid(find_vpid(nr)); |
461 | rcu_read_unlock(); | 460 | rcu_read_unlock(); |
462 | 461 | ||
463 | return pid; | 462 | return pid; |
464 | } | 463 | } |
465 | EXPORT_SYMBOL_GPL(find_get_pid); | 464 | EXPORT_SYMBOL_GPL(find_get_pid); |
466 | 465 | ||
467 | pid_t pid_nr_ns(struct pid *pid, struct pid_namespace *ns) | 466 | pid_t pid_nr_ns(struct pid *pid, struct pid_namespace *ns) |
468 | { | 467 | { |
469 | struct upid *upid; | 468 | struct upid *upid; |
470 | pid_t nr = 0; | 469 | pid_t nr = 0; |
471 | 470 | ||
472 | if (pid && ns->level <= pid->level) { | 471 | if (pid && ns->level <= pid->level) { |
473 | upid = &pid->numbers[ns->level]; | 472 | upid = &pid->numbers[ns->level]; |
474 | if (upid->ns == ns) | 473 | if (upid->ns == ns) |
475 | nr = upid->nr; | 474 | nr = upid->nr; |
476 | } | 475 | } |
477 | return nr; | 476 | return nr; |
478 | } | 477 | } |
479 | 478 | ||
480 | pid_t pid_vnr(struct pid *pid) | 479 | pid_t pid_vnr(struct pid *pid) |
481 | { | 480 | { |
482 | return pid_nr_ns(pid, current->nsproxy->pid_ns); | 481 | return pid_nr_ns(pid, current->nsproxy->pid_ns); |
483 | } | 482 | } |
484 | EXPORT_SYMBOL_GPL(pid_vnr); | 483 | EXPORT_SYMBOL_GPL(pid_vnr); |
485 | 484 | ||
486 | pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type, | 485 | pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type, |
487 | struct pid_namespace *ns) | 486 | struct pid_namespace *ns) |
488 | { | 487 | { |
489 | pid_t nr = 0; | 488 | pid_t nr = 0; |
490 | 489 | ||
491 | rcu_read_lock(); | 490 | rcu_read_lock(); |
492 | if (!ns) | 491 | if (!ns) |
493 | ns = current->nsproxy->pid_ns; | 492 | ns = current->nsproxy->pid_ns; |
494 | if (likely(pid_alive(task))) { | 493 | if (likely(pid_alive(task))) { |
495 | if (type != PIDTYPE_PID) | 494 | if (type != PIDTYPE_PID) |
496 | task = task->group_leader; | 495 | task = task->group_leader; |
497 | nr = pid_nr_ns(task->pids[type].pid, ns); | 496 | nr = pid_nr_ns(task->pids[type].pid, ns); |
498 | } | 497 | } |
499 | rcu_read_unlock(); | 498 | rcu_read_unlock(); |
500 | 499 | ||
501 | return nr; | 500 | return nr; |
502 | } | 501 | } |
503 | EXPORT_SYMBOL(__task_pid_nr_ns); | 502 | EXPORT_SYMBOL(__task_pid_nr_ns); |
504 | 503 | ||
505 | pid_t task_tgid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns) | 504 | pid_t task_tgid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns) |
506 | { | 505 | { |
507 | return pid_nr_ns(task_tgid(tsk), ns); | 506 | return pid_nr_ns(task_tgid(tsk), ns); |
508 | } | 507 | } |
509 | EXPORT_SYMBOL(task_tgid_nr_ns); | 508 | EXPORT_SYMBOL(task_tgid_nr_ns); |
510 | 509 | ||
511 | struct pid_namespace *task_active_pid_ns(struct task_struct *tsk) | 510 | struct pid_namespace *task_active_pid_ns(struct task_struct *tsk) |
512 | { | 511 | { |
513 | return ns_of_pid(task_pid(tsk)); | 512 | return ns_of_pid(task_pid(tsk)); |
514 | } | 513 | } |
515 | EXPORT_SYMBOL_GPL(task_active_pid_ns); | 514 | EXPORT_SYMBOL_GPL(task_active_pid_ns); |
516 | 515 | ||
517 | /* | 516 | /* |
518 | * Used by proc to find the first pid that is greater than or equal to nr. | 517 | * Used by proc to find the first pid that is greater than or equal to nr. |
519 | * | 518 | * |
520 | * If there is a pid at nr this function is exactly the same as find_pid_ns. | 519 | * If there is a pid at nr this function is exactly the same as find_pid_ns. |
521 | */ | 520 | */ |
522 | struct pid *find_ge_pid(int nr, struct pid_namespace *ns) | 521 | struct pid *find_ge_pid(int nr, struct pid_namespace *ns) |
523 | { | 522 | { |
524 | struct pid *pid; | 523 | struct pid *pid; |
525 | 524 | ||
526 | do { | 525 | do { |
527 | pid = find_pid_ns(nr, ns); | 526 | pid = find_pid_ns(nr, ns); |
528 | if (pid) | 527 | if (pid) |
529 | break; | 528 | break; |
530 | nr = next_pidmap(ns, nr); | 529 | nr = next_pidmap(ns, nr); |
531 | } while (nr > 0); | 530 | } while (nr > 0); |
532 | 531 | ||
533 | return pid; | 532 | return pid; |
534 | } | 533 | } |
535 | 534 | ||
536 | /* | 535 | /* |
537 | * The pid hash table is scaled according to the amount of memory in the | 536 | * The pid hash table is scaled according to the amount of memory in the |
538 | * machine. From a minimum of 16 slots up to 4096 slots at one gigabyte or | 537 | * machine. From a minimum of 16 slots up to 4096 slots at one gigabyte or |
539 | * more. | 538 | * more. |
540 | */ | 539 | */ |
541 | void __init pidhash_init(void) | 540 | void __init pidhash_init(void) |
542 | { | 541 | { |
543 | int i, pidhash_size; | 542 | int i, pidhash_size; |
544 | 543 | ||
545 | pid_hash = alloc_large_system_hash("PID", sizeof(*pid_hash), 0, 18, | 544 | pid_hash = alloc_large_system_hash("PID", sizeof(*pid_hash), 0, 18, |
546 | HASH_EARLY | HASH_SMALL, | 545 | HASH_EARLY | HASH_SMALL, |
547 | &pidhash_shift, NULL, 4096); | 546 | &pidhash_shift, NULL, 4096); |
548 | pidhash_size = 1 << pidhash_shift; | 547 | pidhash_size = 1 << pidhash_shift; |
549 | 548 | ||
550 | for (i = 0; i < pidhash_size; i++) | 549 | for (i = 0; i < pidhash_size; i++) |
551 | INIT_HLIST_HEAD(&pid_hash[i]); | 550 | INIT_HLIST_HEAD(&pid_hash[i]); |
552 | } | 551 | } |
553 | 552 | ||
554 | void __init pidmap_init(void) | 553 | void __init pidmap_init(void) |
555 | { | 554 | { |
556 | /* bump default and minimum pid_max based on number of cpus */ | 555 | /* bump default and minimum pid_max based on number of cpus */ |
557 | pid_max = min(pid_max_max, max_t(int, pid_max, | 556 | pid_max = min(pid_max_max, max_t(int, pid_max, |
558 | PIDS_PER_CPU_DEFAULT * num_possible_cpus())); | 557 | PIDS_PER_CPU_DEFAULT * num_possible_cpus())); |
559 | pid_max_min = max_t(int, pid_max_min, | 558 | pid_max_min = max_t(int, pid_max_min, |
560 | PIDS_PER_CPU_MIN * num_possible_cpus()); | 559 | PIDS_PER_CPU_MIN * num_possible_cpus()); |
561 | pr_info("pid_max: default: %u minimum: %u\n", pid_max, pid_max_min); | 560 | pr_info("pid_max: default: %u minimum: %u\n", pid_max, pid_max_min); |
562 | 561 | ||
563 | init_pid_ns.pidmap[0].page = kzalloc(PAGE_SIZE, GFP_KERNEL); | 562 | init_pid_ns.pidmap[0].page = kzalloc(PAGE_SIZE, GFP_KERNEL); |
564 | /* Reserve PID 0. We never call free_pidmap(0) */ | 563 | /* Reserve PID 0. We never call free_pidmap(0) */ |
565 | set_bit(0, init_pid_ns.pidmap[0].page); | 564 | set_bit(0, init_pid_ns.pidmap[0].page); |
566 | atomic_dec(&init_pid_ns.pidmap[0].nr_free); | 565 | atomic_dec(&init_pid_ns.pidmap[0].nr_free); |
567 | 566 | ||
568 | init_pid_ns.pid_cachep = KMEM_CACHE(pid, | 567 | init_pid_ns.pid_cachep = KMEM_CACHE(pid, |
569 | SLAB_HWCACHE_ALIGN | SLAB_PANIC); | 568 | SLAB_HWCACHE_ALIGN | SLAB_PANIC); |
570 | } | 569 | } |
571 | 570 |
kernel/rcutorture.c
1 | /* | 1 | /* |
2 | * Read-Copy Update module-based torture test facility | 2 | * Read-Copy Update module-based torture test facility |
3 | * | 3 | * |
4 | * This program is free software; you can redistribute it and/or modify | 4 | * This program is free software; you can redistribute it and/or modify |
5 | * it under the terms of the GNU General Public License as published by | 5 | * it under the terms of the GNU General Public License as published by |
6 | * the Free Software Foundation; either version 2 of the License, or | 6 | * the Free Software Foundation; either version 2 of the License, or |
7 | * (at your option) any later version. | 7 | * (at your option) any later version. |
8 | * | 8 | * |
9 | * This program is distributed in the hope that it will be useful, | 9 | * This program is distributed in the hope that it will be useful, |
10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | 10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
12 | * GNU General Public License for more details. | 12 | * GNU General Public License for more details. |
13 | * | 13 | * |
14 | * You should have received a copy of the GNU General Public License | 14 | * You should have received a copy of the GNU General Public License |
15 | * along with this program; if not, write to the Free Software | 15 | * along with this program; if not, write to the Free Software |
16 | * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. | 16 | * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. |
17 | * | 17 | * |
18 | * Copyright (C) IBM Corporation, 2005, 2006 | 18 | * Copyright (C) IBM Corporation, 2005, 2006 |
19 | * | 19 | * |
20 | * Authors: Paul E. McKenney <paulmck@us.ibm.com> | 20 | * Authors: Paul E. McKenney <paulmck@us.ibm.com> |
21 | * Josh Triplett <josh@freedesktop.org> | 21 | * Josh Triplett <josh@freedesktop.org> |
22 | * | 22 | * |
23 | * See also: Documentation/RCU/torture.txt | 23 | * See also: Documentation/RCU/torture.txt |
24 | */ | 24 | */ |
25 | #include <linux/types.h> | 25 | #include <linux/types.h> |
26 | #include <linux/kernel.h> | 26 | #include <linux/kernel.h> |
27 | #include <linux/init.h> | 27 | #include <linux/init.h> |
28 | #include <linux/module.h> | 28 | #include <linux/module.h> |
29 | #include <linux/kthread.h> | 29 | #include <linux/kthread.h> |
30 | #include <linux/err.h> | 30 | #include <linux/err.h> |
31 | #include <linux/spinlock.h> | 31 | #include <linux/spinlock.h> |
32 | #include <linux/smp.h> | 32 | #include <linux/smp.h> |
33 | #include <linux/rcupdate.h> | 33 | #include <linux/rcupdate.h> |
34 | #include <linux/interrupt.h> | 34 | #include <linux/interrupt.h> |
35 | #include <linux/sched.h> | 35 | #include <linux/sched.h> |
36 | #include <asm/atomic.h> | 36 | #include <asm/atomic.h> |
37 | #include <linux/bitops.h> | 37 | #include <linux/bitops.h> |
38 | #include <linux/completion.h> | 38 | #include <linux/completion.h> |
39 | #include <linux/moduleparam.h> | 39 | #include <linux/moduleparam.h> |
40 | #include <linux/percpu.h> | 40 | #include <linux/percpu.h> |
41 | #include <linux/notifier.h> | 41 | #include <linux/notifier.h> |
42 | #include <linux/reboot.h> | 42 | #include <linux/reboot.h> |
43 | #include <linux/freezer.h> | 43 | #include <linux/freezer.h> |
44 | #include <linux/cpu.h> | 44 | #include <linux/cpu.h> |
45 | #include <linux/delay.h> | 45 | #include <linux/delay.h> |
46 | #include <linux/stat.h> | 46 | #include <linux/stat.h> |
47 | #include <linux/srcu.h> | 47 | #include <linux/srcu.h> |
48 | #include <linux/slab.h> | 48 | #include <linux/slab.h> |
49 | #include <asm/byteorder.h> | 49 | #include <asm/byteorder.h> |
50 | 50 | ||
51 | MODULE_LICENSE("GPL"); | 51 | MODULE_LICENSE("GPL"); |
52 | MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and " | 52 | MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and " |
53 | "Josh Triplett <josh@freedesktop.org>"); | 53 | "Josh Triplett <josh@freedesktop.org>"); |
54 | 54 | ||
55 | static int nreaders = -1; /* # reader threads, defaults to 2*ncpus */ | 55 | static int nreaders = -1; /* # reader threads, defaults to 2*ncpus */ |
56 | static int nfakewriters = 4; /* # fake writer threads */ | 56 | static int nfakewriters = 4; /* # fake writer threads */ |
57 | static int stat_interval; /* Interval between stats, in seconds. */ | 57 | static int stat_interval; /* Interval between stats, in seconds. */ |
58 | /* Defaults to "only at end of test". */ | 58 | /* Defaults to "only at end of test". */ |
59 | static int verbose; /* Print more debug info. */ | 59 | static int verbose; /* Print more debug info. */ |
60 | static int test_no_idle_hz; /* Test RCU's support for tickless idle CPUs. */ | 60 | static int test_no_idle_hz; /* Test RCU's support for tickless idle CPUs. */ |
61 | static int shuffle_interval = 3; /* Interval between shuffles (in sec)*/ | 61 | static int shuffle_interval = 3; /* Interval between shuffles (in sec)*/ |
62 | static int stutter = 5; /* Start/stop testing interval (in sec) */ | 62 | static int stutter = 5; /* Start/stop testing interval (in sec) */ |
63 | static int irqreader = 1; /* RCU readers from irq (timers). */ | 63 | static int irqreader = 1; /* RCU readers from irq (timers). */ |
64 | static int fqs_duration = 0; /* Duration of bursts (us), 0 to disable. */ | 64 | static int fqs_duration = 0; /* Duration of bursts (us), 0 to disable. */ |
65 | static int fqs_holdoff = 0; /* Hold time within burst (us). */ | 65 | static int fqs_holdoff = 0; /* Hold time within burst (us). */ |
66 | static int fqs_stutter = 3; /* Wait time between bursts (s). */ | 66 | static int fqs_stutter = 3; /* Wait time between bursts (s). */ |
67 | static int test_boost = 1; /* Test RCU prio boost: 0=no, 1=maybe, 2=yes. */ | 67 | static int test_boost = 1; /* Test RCU prio boost: 0=no, 1=maybe, 2=yes. */ |
68 | static int test_boost_interval = 7; /* Interval between boost tests, seconds. */ | 68 | static int test_boost_interval = 7; /* Interval between boost tests, seconds. */ |
69 | static int test_boost_duration = 4; /* Duration of each boost test, seconds. */ | 69 | static int test_boost_duration = 4; /* Duration of each boost test, seconds. */ |
70 | static char *torture_type = "rcu"; /* What RCU implementation to torture. */ | 70 | static char *torture_type = "rcu"; /* What RCU implementation to torture. */ |
71 | 71 | ||
72 | module_param(nreaders, int, 0444); | 72 | module_param(nreaders, int, 0444); |
73 | MODULE_PARM_DESC(nreaders, "Number of RCU reader threads"); | 73 | MODULE_PARM_DESC(nreaders, "Number of RCU reader threads"); |
74 | module_param(nfakewriters, int, 0444); | 74 | module_param(nfakewriters, int, 0444); |
75 | MODULE_PARM_DESC(nfakewriters, "Number of RCU fake writer threads"); | 75 | MODULE_PARM_DESC(nfakewriters, "Number of RCU fake writer threads"); |
76 | module_param(stat_interval, int, 0444); | 76 | module_param(stat_interval, int, 0444); |
77 | MODULE_PARM_DESC(stat_interval, "Number of seconds between stats printk()s"); | 77 | MODULE_PARM_DESC(stat_interval, "Number of seconds between stats printk()s"); |
78 | module_param(verbose, bool, 0444); | 78 | module_param(verbose, bool, 0444); |
79 | MODULE_PARM_DESC(verbose, "Enable verbose debugging printk()s"); | 79 | MODULE_PARM_DESC(verbose, "Enable verbose debugging printk()s"); |
80 | module_param(test_no_idle_hz, bool, 0444); | 80 | module_param(test_no_idle_hz, bool, 0444); |
81 | MODULE_PARM_DESC(test_no_idle_hz, "Test support for tickless idle CPUs"); | 81 | MODULE_PARM_DESC(test_no_idle_hz, "Test support for tickless idle CPUs"); |
82 | module_param(shuffle_interval, int, 0444); | 82 | module_param(shuffle_interval, int, 0444); |
83 | MODULE_PARM_DESC(shuffle_interval, "Number of seconds between shuffles"); | 83 | MODULE_PARM_DESC(shuffle_interval, "Number of seconds between shuffles"); |
84 | module_param(stutter, int, 0444); | 84 | module_param(stutter, int, 0444); |
85 | MODULE_PARM_DESC(stutter, "Number of seconds to run/halt test"); | 85 | MODULE_PARM_DESC(stutter, "Number of seconds to run/halt test"); |
86 | module_param(irqreader, int, 0444); | 86 | module_param(irqreader, int, 0444); |
87 | MODULE_PARM_DESC(irqreader, "Allow RCU readers from irq handlers"); | 87 | MODULE_PARM_DESC(irqreader, "Allow RCU readers from irq handlers"); |
88 | module_param(fqs_duration, int, 0444); | 88 | module_param(fqs_duration, int, 0444); |
89 | MODULE_PARM_DESC(fqs_duration, "Duration of fqs bursts (us)"); | 89 | MODULE_PARM_DESC(fqs_duration, "Duration of fqs bursts (us)"); |
90 | module_param(fqs_holdoff, int, 0444); | 90 | module_param(fqs_holdoff, int, 0444); |
91 | MODULE_PARM_DESC(fqs_holdoff, "Holdoff time within fqs bursts (us)"); | 91 | MODULE_PARM_DESC(fqs_holdoff, "Holdoff time within fqs bursts (us)"); |
92 | module_param(fqs_stutter, int, 0444); | 92 | module_param(fqs_stutter, int, 0444); |
93 | MODULE_PARM_DESC(fqs_stutter, "Wait time between fqs bursts (s)"); | 93 | MODULE_PARM_DESC(fqs_stutter, "Wait time between fqs bursts (s)"); |
94 | module_param(test_boost, int, 0444); | 94 | module_param(test_boost, int, 0444); |
95 | MODULE_PARM_DESC(test_boost, "Test RCU prio boost: 0=no, 1=maybe, 2=yes."); | 95 | MODULE_PARM_DESC(test_boost, "Test RCU prio boost: 0=no, 1=maybe, 2=yes."); |
96 | module_param(test_boost_interval, int, 0444); | 96 | module_param(test_boost_interval, int, 0444); |
97 | MODULE_PARM_DESC(test_boost_interval, "Interval between boost tests, seconds."); | 97 | MODULE_PARM_DESC(test_boost_interval, "Interval between boost tests, seconds."); |
98 | module_param(test_boost_duration, int, 0444); | 98 | module_param(test_boost_duration, int, 0444); |
99 | MODULE_PARM_DESC(test_boost_duration, "Duration of each boost test, seconds."); | 99 | MODULE_PARM_DESC(test_boost_duration, "Duration of each boost test, seconds."); |
100 | module_param(torture_type, charp, 0444); | 100 | module_param(torture_type, charp, 0444); |
101 | MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, rcu_bh, srcu)"); | 101 | MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, rcu_bh, srcu)"); |
102 | 102 | ||
103 | #define TORTURE_FLAG "-torture:" | 103 | #define TORTURE_FLAG "-torture:" |
104 | #define PRINTK_STRING(s) \ | 104 | #define PRINTK_STRING(s) \ |
105 | do { printk(KERN_ALERT "%s" TORTURE_FLAG s "\n", torture_type); } while (0) | 105 | do { printk(KERN_ALERT "%s" TORTURE_FLAG s "\n", torture_type); } while (0) |
106 | #define VERBOSE_PRINTK_STRING(s) \ | 106 | #define VERBOSE_PRINTK_STRING(s) \ |
107 | do { if (verbose) printk(KERN_ALERT "%s" TORTURE_FLAG s "\n", torture_type); } while (0) | 107 | do { if (verbose) printk(KERN_ALERT "%s" TORTURE_FLAG s "\n", torture_type); } while (0) |
108 | #define VERBOSE_PRINTK_ERRSTRING(s) \ | 108 | #define VERBOSE_PRINTK_ERRSTRING(s) \ |
109 | do { if (verbose) printk(KERN_ALERT "%s" TORTURE_FLAG "!!! " s "\n", torture_type); } while (0) | 109 | do { if (verbose) printk(KERN_ALERT "%s" TORTURE_FLAG "!!! " s "\n", torture_type); } while (0) |
110 | 110 | ||
111 | static char printk_buf[4096]; | 111 | static char printk_buf[4096]; |
112 | 112 | ||
113 | static int nrealreaders; | 113 | static int nrealreaders; |
114 | static struct task_struct *writer_task; | 114 | static struct task_struct *writer_task; |
115 | static struct task_struct **fakewriter_tasks; | 115 | static struct task_struct **fakewriter_tasks; |
116 | static struct task_struct **reader_tasks; | 116 | static struct task_struct **reader_tasks; |
117 | static struct task_struct *stats_task; | 117 | static struct task_struct *stats_task; |
118 | static struct task_struct *shuffler_task; | 118 | static struct task_struct *shuffler_task; |
119 | static struct task_struct *stutter_task; | 119 | static struct task_struct *stutter_task; |
120 | static struct task_struct *fqs_task; | 120 | static struct task_struct *fqs_task; |
121 | static struct task_struct *boost_tasks[NR_CPUS]; | 121 | static struct task_struct *boost_tasks[NR_CPUS]; |
122 | 122 | ||
123 | #define RCU_TORTURE_PIPE_LEN 10 | 123 | #define RCU_TORTURE_PIPE_LEN 10 |
124 | 124 | ||
125 | struct rcu_torture { | 125 | struct rcu_torture { |
126 | struct rcu_head rtort_rcu; | 126 | struct rcu_head rtort_rcu; |
127 | int rtort_pipe_count; | 127 | int rtort_pipe_count; |
128 | struct list_head rtort_free; | 128 | struct list_head rtort_free; |
129 | int rtort_mbtest; | 129 | int rtort_mbtest; |
130 | }; | 130 | }; |
131 | 131 | ||
132 | static LIST_HEAD(rcu_torture_freelist); | 132 | static LIST_HEAD(rcu_torture_freelist); |
133 | static struct rcu_torture __rcu *rcu_torture_current; | 133 | static struct rcu_torture __rcu *rcu_torture_current; |
134 | static unsigned long rcu_torture_current_version; | 134 | static unsigned long rcu_torture_current_version; |
135 | static struct rcu_torture rcu_tortures[10 * RCU_TORTURE_PIPE_LEN]; | 135 | static struct rcu_torture rcu_tortures[10 * RCU_TORTURE_PIPE_LEN]; |
136 | static DEFINE_SPINLOCK(rcu_torture_lock); | 136 | static DEFINE_SPINLOCK(rcu_torture_lock); |
137 | static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], rcu_torture_count) = | 137 | static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], rcu_torture_count) = |
138 | { 0 }; | 138 | { 0 }; |
139 | static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], rcu_torture_batch) = | 139 | static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], rcu_torture_batch) = |
140 | { 0 }; | 140 | { 0 }; |
141 | static atomic_t rcu_torture_wcount[RCU_TORTURE_PIPE_LEN + 1]; | 141 | static atomic_t rcu_torture_wcount[RCU_TORTURE_PIPE_LEN + 1]; |
142 | static atomic_t n_rcu_torture_alloc; | 142 | static atomic_t n_rcu_torture_alloc; |
143 | static atomic_t n_rcu_torture_alloc_fail; | 143 | static atomic_t n_rcu_torture_alloc_fail; |
144 | static atomic_t n_rcu_torture_free; | 144 | static atomic_t n_rcu_torture_free; |
145 | static atomic_t n_rcu_torture_mberror; | 145 | static atomic_t n_rcu_torture_mberror; |
146 | static atomic_t n_rcu_torture_error; | 146 | static atomic_t n_rcu_torture_error; |
147 | static long n_rcu_torture_boost_ktrerror; | 147 | static long n_rcu_torture_boost_ktrerror; |
148 | static long n_rcu_torture_boost_rterror; | 148 | static long n_rcu_torture_boost_rterror; |
149 | static long n_rcu_torture_boost_failure; | 149 | static long n_rcu_torture_boost_failure; |
150 | static long n_rcu_torture_boosts; | 150 | static long n_rcu_torture_boosts; |
151 | static long n_rcu_torture_timers; | 151 | static long n_rcu_torture_timers; |
152 | static struct list_head rcu_torture_removed; | 152 | static struct list_head rcu_torture_removed; |
153 | static cpumask_var_t shuffle_tmp_mask; | 153 | static cpumask_var_t shuffle_tmp_mask; |
154 | 154 | ||
155 | static int stutter_pause_test; | 155 | static int stutter_pause_test; |
156 | 156 | ||
157 | #if defined(MODULE) || defined(CONFIG_RCU_TORTURE_TEST_RUNNABLE) | 157 | #if defined(MODULE) || defined(CONFIG_RCU_TORTURE_TEST_RUNNABLE) |
158 | #define RCUTORTURE_RUNNABLE_INIT 1 | 158 | #define RCUTORTURE_RUNNABLE_INIT 1 |
159 | #else | 159 | #else |
160 | #define RCUTORTURE_RUNNABLE_INIT 0 | 160 | #define RCUTORTURE_RUNNABLE_INIT 0 |
161 | #endif | 161 | #endif |
162 | int rcutorture_runnable = RCUTORTURE_RUNNABLE_INIT; | 162 | int rcutorture_runnable = RCUTORTURE_RUNNABLE_INIT; |
163 | 163 | ||
164 | #if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU) | 164 | #if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU) |
165 | #define rcu_can_boost() 1 | 165 | #define rcu_can_boost() 1 |
166 | #else /* #if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU) */ | 166 | #else /* #if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU) */ |
167 | #define rcu_can_boost() 0 | 167 | #define rcu_can_boost() 0 |
168 | #endif /* #else #if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU) */ | 168 | #endif /* #else #if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU) */ |
169 | 169 | ||
170 | static unsigned long boost_starttime; /* jiffies of next boost test start. */ | 170 | static unsigned long boost_starttime; /* jiffies of next boost test start. */ |
171 | DEFINE_MUTEX(boost_mutex); /* protect setting boost_starttime */ | 171 | DEFINE_MUTEX(boost_mutex); /* protect setting boost_starttime */ |
172 | /* and boost task create/destroy. */ | 172 | /* and boost task create/destroy. */ |
173 | 173 | ||
174 | /* Mediate rmmod and system shutdown. Concurrent rmmod & shutdown illegal! */ | 174 | /* Mediate rmmod and system shutdown. Concurrent rmmod & shutdown illegal! */ |
175 | 175 | ||
176 | #define FULLSTOP_DONTSTOP 0 /* Normal operation. */ | 176 | #define FULLSTOP_DONTSTOP 0 /* Normal operation. */ |
177 | #define FULLSTOP_SHUTDOWN 1 /* System shutdown with rcutorture running. */ | 177 | #define FULLSTOP_SHUTDOWN 1 /* System shutdown with rcutorture running. */ |
178 | #define FULLSTOP_RMMOD 2 /* Normal rmmod of rcutorture. */ | 178 | #define FULLSTOP_RMMOD 2 /* Normal rmmod of rcutorture. */ |
179 | static int fullstop = FULLSTOP_RMMOD; | 179 | static int fullstop = FULLSTOP_RMMOD; |
180 | /* | 180 | /* |
181 | * Protect fullstop transitions and spawning of kthreads. | 181 | * Protect fullstop transitions and spawning of kthreads. |
182 | */ | 182 | */ |
183 | static DEFINE_MUTEX(fullstop_mutex); | 183 | static DEFINE_MUTEX(fullstop_mutex); |
184 | 184 | ||
185 | /* | 185 | /* |
186 | * Detect and respond to a system shutdown. | 186 | * Detect and respond to a system shutdown. |
187 | */ | 187 | */ |
188 | static int | 188 | static int |
189 | rcutorture_shutdown_notify(struct notifier_block *unused1, | 189 | rcutorture_shutdown_notify(struct notifier_block *unused1, |
190 | unsigned long unused2, void *unused3) | 190 | unsigned long unused2, void *unused3) |
191 | { | 191 | { |
192 | mutex_lock(&fullstop_mutex); | 192 | mutex_lock(&fullstop_mutex); |
193 | if (fullstop == FULLSTOP_DONTSTOP) | 193 | if (fullstop == FULLSTOP_DONTSTOP) |
194 | fullstop = FULLSTOP_SHUTDOWN; | 194 | fullstop = FULLSTOP_SHUTDOWN; |
195 | else | 195 | else |
196 | printk(KERN_WARNING /* but going down anyway, so... */ | 196 | printk(KERN_WARNING /* but going down anyway, so... */ |
197 | "Concurrent 'rmmod rcutorture' and shutdown illegal!\n"); | 197 | "Concurrent 'rmmod rcutorture' and shutdown illegal!\n"); |
198 | mutex_unlock(&fullstop_mutex); | 198 | mutex_unlock(&fullstop_mutex); |
199 | return NOTIFY_DONE; | 199 | return NOTIFY_DONE; |
200 | } | 200 | } |
201 | 201 | ||
202 | /* | 202 | /* |
203 | * Absorb kthreads into a kernel function that won't return, so that | 203 | * Absorb kthreads into a kernel function that won't return, so that |
204 | * they won't ever access module text or data again. | 204 | * they won't ever access module text or data again. |
205 | */ | 205 | */ |
206 | static void rcutorture_shutdown_absorb(char *title) | 206 | static void rcutorture_shutdown_absorb(char *title) |
207 | { | 207 | { |
208 | if (ACCESS_ONCE(fullstop) == FULLSTOP_SHUTDOWN) { | 208 | if (ACCESS_ONCE(fullstop) == FULLSTOP_SHUTDOWN) { |
209 | printk(KERN_NOTICE | 209 | printk(KERN_NOTICE |
210 | "rcutorture thread %s parking due to system shutdown\n", | 210 | "rcutorture thread %s parking due to system shutdown\n", |
211 | title); | 211 | title); |
212 | schedule_timeout_uninterruptible(MAX_SCHEDULE_TIMEOUT); | 212 | schedule_timeout_uninterruptible(MAX_SCHEDULE_TIMEOUT); |
213 | } | 213 | } |
214 | } | 214 | } |
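The fullstop machinery above is a small cooperative-stop protocol: a mutex-protected state is advanced once by whichever path notices the stop first, and every kthread that observes the new state parks itself rather than touching module text or data again. A minimal userspace sketch of the same idea, with hypothetical names (request_stop(), worker()) and a thread that simply returns instead of sleeping forever:

#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

#define STOP_DONTSTOP 0
#define STOP_SHUTDOWN 1

static int stop_state = STOP_DONTSTOP;
static pthread_mutex_t stop_mutex = PTHREAD_MUTEX_INITIALIZER;

/* Whichever caller gets here first advances the state, exactly once. */
static void request_stop(void)
{
	pthread_mutex_lock(&stop_mutex);
	if (stop_state == STOP_DONTSTOP)
		stop_state = STOP_SHUTDOWN;
	pthread_mutex_unlock(&stop_mutex);
}

/* Worker polls the state and parks (here: returns) once it has changed. */
static void *worker(void *arg)
{
	(void)arg;
	for (;;) {
		int st;

		usleep(1000);		/* one unit of pretend work */
		pthread_mutex_lock(&stop_mutex);
		st = stop_state;
		pthread_mutex_unlock(&stop_mutex);
		if (st == STOP_SHUTDOWN) {
			printf("worker parking due to stop request\n");
			return NULL;
		}
	}
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, worker, NULL);
	usleep(10000);
	request_stop();
	pthread_join(t, NULL);
	return 0;
}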
215 | 215 | ||
216 | /* | 216 | /* |
217 | * Allocate an element from the rcu_tortures pool. | 217 | * Allocate an element from the rcu_tortures pool. |
218 | */ | 218 | */ |
219 | static struct rcu_torture * | 219 | static struct rcu_torture * |
220 | rcu_torture_alloc(void) | 220 | rcu_torture_alloc(void) |
221 | { | 221 | { |
222 | struct list_head *p; | 222 | struct list_head *p; |
223 | 223 | ||
224 | spin_lock_bh(&rcu_torture_lock); | 224 | spin_lock_bh(&rcu_torture_lock); |
225 | if (list_empty(&rcu_torture_freelist)) { | 225 | if (list_empty(&rcu_torture_freelist)) { |
226 | atomic_inc(&n_rcu_torture_alloc_fail); | 226 | atomic_inc(&n_rcu_torture_alloc_fail); |
227 | spin_unlock_bh(&rcu_torture_lock); | 227 | spin_unlock_bh(&rcu_torture_lock); |
228 | return NULL; | 228 | return NULL; |
229 | } | 229 | } |
230 | atomic_inc(&n_rcu_torture_alloc); | 230 | atomic_inc(&n_rcu_torture_alloc); |
231 | p = rcu_torture_freelist.next; | 231 | p = rcu_torture_freelist.next; |
232 | list_del_init(p); | 232 | list_del_init(p); |
233 | spin_unlock_bh(&rcu_torture_lock); | 233 | spin_unlock_bh(&rcu_torture_lock); |
234 | return container_of(p, struct rcu_torture, rtort_free); | 234 | return container_of(p, struct rcu_torture, rtort_free); |
235 | } | 235 | } |
236 | 236 | ||
237 | /* | 237 | /* |
238 | * Free an element to the rcu_tortures pool. | 238 | * Free an element to the rcu_tortures pool. |
239 | */ | 239 | */ |
240 | static void | 240 | static void |
241 | rcu_torture_free(struct rcu_torture *p) | 241 | rcu_torture_free(struct rcu_torture *p) |
242 | { | 242 | { |
243 | atomic_inc(&n_rcu_torture_free); | 243 | atomic_inc(&n_rcu_torture_free); |
244 | spin_lock_bh(&rcu_torture_lock); | 244 | spin_lock_bh(&rcu_torture_lock); |
245 | list_add_tail(&p->rtort_free, &rcu_torture_freelist); | 245 | list_add_tail(&p->rtort_free, &rcu_torture_freelist); |
246 | spin_unlock_bh(&rcu_torture_lock); | 246 | spin_unlock_bh(&rcu_torture_lock); |
247 | } | 247 | } |
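rcu_torture_alloc() and rcu_torture_free() implement a fixed pool fronted by a lock-protected free list: allocation pops the first list entry and converts it back to its containing object, freeing pushes the entry back. A minimal userspace sketch of that free-list pattern, with a mutex standing in for the BH-disabling spinlock and hypothetical names (pool_get()/pool_put()):

#include <pthread.h>
#include <stddef.h>
#include <stdio.h>

struct item {
	int payload;
	struct item *next_free;		/* embedded free-list linkage */
};

static struct item pool[16];
static struct item *free_head;
static pthread_mutex_t pool_lock = PTHREAD_MUTEX_INITIALIZER;

static void pool_init(void)
{
	for (size_t i = 0; i < sizeof(pool) / sizeof(pool[0]); i++) {
		pool[i].next_free = free_head;
		free_head = &pool[i];
	}
}

/* Pop one element, or return NULL if the pool is exhausted. */
static struct item *pool_get(void)
{
	struct item *p;

	pthread_mutex_lock(&pool_lock);
	p = free_head;
	if (p)
		free_head = p->next_free;
	pthread_mutex_unlock(&pool_lock);
	return p;
}

/* Push an element back onto the free list. */
static void pool_put(struct item *p)
{
	pthread_mutex_lock(&pool_lock);
	p->next_free = free_head;
	free_head = p;
	pthread_mutex_unlock(&pool_lock);
}

int main(void)
{
	pool_init();
	struct item *a = pool_get();
	a->payload = 42;
	printf("got %d\n", a->payload);
	pool_put(a);
	return 0;
}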
248 | 248 | ||
249 | struct rcu_random_state { | 249 | struct rcu_random_state { |
250 | unsigned long rrs_state; | 250 | unsigned long rrs_state; |
251 | long rrs_count; | 251 | long rrs_count; |
252 | }; | 252 | }; |
253 | 253 | ||
254 | #define RCU_RANDOM_MULT 39916801 /* prime */ | 254 | #define RCU_RANDOM_MULT 39916801 /* prime */ |
255 | #define RCU_RANDOM_ADD 479001701 /* prime */ | 255 | #define RCU_RANDOM_ADD 479001701 /* prime */ |
256 | #define RCU_RANDOM_REFRESH 10000 | 256 | #define RCU_RANDOM_REFRESH 10000 |
257 | 257 | ||
258 | #define DEFINE_RCU_RANDOM(name) struct rcu_random_state name = { 0, 0 } | 258 | #define DEFINE_RCU_RANDOM(name) struct rcu_random_state name = { 0, 0 } |
259 | 259 | ||
260 | /* | 260 | /* |
261 | * Crude but fast random-number generator. Uses a linear congruential | 261 | * Crude but fast random-number generator. Uses a linear congruential |
262 | * generator, with occasional help from cpu_clock(). | 262 | * generator, with occasional help from cpu_clock(). |
263 | */ | 263 | */ |
264 | static unsigned long | 264 | static unsigned long |
265 | rcu_random(struct rcu_random_state *rrsp) | 265 | rcu_random(struct rcu_random_state *rrsp) |
266 | { | 266 | { |
267 | if (--rrsp->rrs_count < 0) { | 267 | if (--rrsp->rrs_count < 0) { |
268 | rrsp->rrs_state += (unsigned long)local_clock(); | 268 | rrsp->rrs_state += (unsigned long)local_clock(); |
269 | rrsp->rrs_count = RCU_RANDOM_REFRESH; | 269 | rrsp->rrs_count = RCU_RANDOM_REFRESH; |
270 | } | 270 | } |
271 | rrsp->rrs_state = rrsp->rrs_state * RCU_RANDOM_MULT + RCU_RANDOM_ADD; | 271 | rrsp->rrs_state = rrsp->rrs_state * RCU_RANDOM_MULT + RCU_RANDOM_ADD; |
272 | return swahw32(rrsp->rrs_state); | 272 | return swahw32(rrsp->rrs_state); |
273 | } | 273 | } |
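rcu_random() is a plain linear congruential generator, periodically reseeded from a fast clock so that concurrent users drift apart. A standalone sketch of the same recurrence with the constants shown above; clock() replaces local_clock() and the kernel's final halfword swap is omitted for simplicity:

#include <stdio.h>
#include <time.h>

#define RANDOM_MULT	39916801	/* prime, as above */
#define RANDOM_ADD	479001701	/* prime, as above */
#define RANDOM_REFRESH	10000

struct random_state {
	unsigned long state;
	long count;
};

static unsigned long next_random(struct random_state *rs)
{
	if (--rs->count < 0) {
		/* Occasionally mix in a clock reading so streams diverge. */
		rs->state += (unsigned long)clock();
		rs->count = RANDOM_REFRESH;
	}
	rs->state = rs->state * RANDOM_MULT + RANDOM_ADD;
	return rs->state;
}

int main(void)
{
	struct random_state rs = { 0, 0 };

	for (int i = 0; i < 5; i++)
		printf("%lu\n", next_random(&rs) % 100);
	return 0;
}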
274 | 274 | ||
275 | static void | 275 | static void |
276 | rcu_stutter_wait(char *title) | 276 | rcu_stutter_wait(char *title) |
277 | { | 277 | { |
278 | while (stutter_pause_test || !rcutorture_runnable) { | 278 | while (stutter_pause_test || !rcutorture_runnable) { |
279 | if (rcutorture_runnable) | 279 | if (rcutorture_runnable) |
280 | schedule_timeout_interruptible(1); | 280 | schedule_timeout_interruptible(1); |
281 | else | 281 | else |
282 | schedule_timeout_interruptible(round_jiffies_relative(HZ)); | 282 | schedule_timeout_interruptible(round_jiffies_relative(HZ)); |
283 | rcutorture_shutdown_absorb(title); | 283 | rcutorture_shutdown_absorb(title); |
284 | } | 284 | } |
285 | } | 285 | } |
286 | 286 | ||
287 | /* | 287 | /* |
288 | * Operations vector for selecting different types of tests. | 288 | * Operations vector for selecting different types of tests. |
289 | */ | 289 | */ |
290 | 290 | ||
291 | struct rcu_torture_ops { | 291 | struct rcu_torture_ops { |
292 | void (*init)(void); | 292 | void (*init)(void); |
293 | void (*cleanup)(void); | 293 | void (*cleanup)(void); |
294 | int (*readlock)(void); | 294 | int (*readlock)(void); |
295 | void (*read_delay)(struct rcu_random_state *rrsp); | 295 | void (*read_delay)(struct rcu_random_state *rrsp); |
296 | void (*readunlock)(int idx); | 296 | void (*readunlock)(int idx); |
297 | int (*completed)(void); | 297 | int (*completed)(void); |
298 | void (*deferred_free)(struct rcu_torture *p); | 298 | void (*deferred_free)(struct rcu_torture *p); |
299 | void (*sync)(void); | 299 | void (*sync)(void); |
300 | void (*cb_barrier)(void); | 300 | void (*cb_barrier)(void); |
301 | void (*fqs)(void); | 301 | void (*fqs)(void); |
302 | int (*stats)(char *page); | 302 | int (*stats)(char *page); |
303 | int irq_capable; | 303 | int irq_capable; |
304 | int can_boost; | 304 | int can_boost; |
305 | char *name; | 305 | char *name; |
306 | }; | 306 | }; |
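struct rcu_torture_ops is an operations vector: each RCU flavor supplies a table of function pointers, the module selects one table by matching the torture_type string against ->name, and everything else calls through cur_ops without caring which flavor is under test. A minimal userspace sketch of that select-and-dispatch pattern, with hypothetical flavors and helpers:

#include <stdio.h>
#include <string.h>

struct demo_ops {
	int (*readlock)(void);
	void (*readunlock)(int idx);
	const char *name;
};

static int flavor_a_lock(void)       { printf("A: lock\n"); return 0; }
static void flavor_a_unlock(int idx) { printf("A: unlock %d\n", idx); }
static int flavor_b_lock(void)       { printf("B: lock\n"); return 1; }
static void flavor_b_unlock(int idx) { printf("B: unlock %d\n", idx); }

static struct demo_ops flavor_a = { flavor_a_lock, flavor_a_unlock, "flavor_a" };
static struct demo_ops flavor_b = { flavor_b_lock, flavor_b_unlock, "flavor_b" };

static struct demo_ops *cur_ops;

/* Pick an ops table by name, the way torture_type selects a flavor. */
static int select_ops(const char *type)
{
	struct demo_ops *tables[] = { &flavor_a, &flavor_b, NULL };

	for (int i = 0; tables[i]; i++) {
		if (strcmp(type, tables[i]->name) == 0) {
			cur_ops = tables[i];
			return 0;
		}
	}
	return -1;
}

int main(void)
{
	if (select_ops("flavor_b"))
		return 1;
	int idx = cur_ops->readlock();	/* all callers go through cur_ops */
	cur_ops->readunlock(idx);
	return 0;
}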
307 | 307 | ||
308 | static struct rcu_torture_ops *cur_ops; | 308 | static struct rcu_torture_ops *cur_ops; |
309 | 309 | ||
310 | /* | 310 | /* |
311 | * Definitions for rcu torture testing. | 311 | * Definitions for rcu torture testing. |
312 | */ | 312 | */ |
313 | 313 | ||
314 | static int rcu_torture_read_lock(void) __acquires(RCU) | 314 | static int rcu_torture_read_lock(void) __acquires(RCU) |
315 | { | 315 | { |
316 | rcu_read_lock(); | 316 | rcu_read_lock(); |
317 | return 0; | 317 | return 0; |
318 | } | 318 | } |
319 | 319 | ||
320 | static void rcu_read_delay(struct rcu_random_state *rrsp) | 320 | static void rcu_read_delay(struct rcu_random_state *rrsp) |
321 | { | 321 | { |
322 | const unsigned long shortdelay_us = 200; | 322 | const unsigned long shortdelay_us = 200; |
323 | const unsigned long longdelay_ms = 50; | 323 | const unsigned long longdelay_ms = 50; |
324 | 324 | ||
325 | /* We want a short delay sometimes to make a reader delay the grace | 325 | /* We want a short delay sometimes to make a reader delay the grace |
326 | * period, and we want a long delay occasionally to trigger | 326 | * period, and we want a long delay occasionally to trigger |
327 | * force_quiescent_state. */ | 327 | * force_quiescent_state. */ |
328 | 328 | ||
329 | if (!(rcu_random(rrsp) % (nrealreaders * 2000 * longdelay_ms))) | 329 | if (!(rcu_random(rrsp) % (nrealreaders * 2000 * longdelay_ms))) |
330 | mdelay(longdelay_ms); | 330 | mdelay(longdelay_ms); |
331 | if (!(rcu_random(rrsp) % (nrealreaders * 2 * shortdelay_us))) | 331 | if (!(rcu_random(rrsp) % (nrealreaders * 2 * shortdelay_us))) |
332 | udelay(shortdelay_us); | 332 | udelay(shortdelay_us); |
333 | #ifdef CONFIG_PREEMPT | 333 | #ifdef CONFIG_PREEMPT |
334 | if (!preempt_count() && !(rcu_random(rrsp) % (nrealreaders * 20000))) | 334 | if (!preempt_count() && !(rcu_random(rrsp) % (nrealreaders * 20000))) |
335 | preempt_schedule(); /* No QS if preempt_disable() in effect */ | 335 | preempt_schedule(); /* No QS if preempt_disable() in effect */ |
336 | #endif | 336 | #endif |
337 | } | 337 | } |
338 | 338 | ||
339 | static void rcu_torture_read_unlock(int idx) __releases(RCU) | 339 | static void rcu_torture_read_unlock(int idx) __releases(RCU) |
340 | { | 340 | { |
341 | rcu_read_unlock(); | 341 | rcu_read_unlock(); |
342 | } | 342 | } |
343 | 343 | ||
344 | static int rcu_torture_completed(void) | 344 | static int rcu_torture_completed(void) |
345 | { | 345 | { |
346 | return rcu_batches_completed(); | 346 | return rcu_batches_completed(); |
347 | } | 347 | } |
348 | 348 | ||
349 | static void | 349 | static void |
350 | rcu_torture_cb(struct rcu_head *p) | 350 | rcu_torture_cb(struct rcu_head *p) |
351 | { | 351 | { |
352 | int i; | 352 | int i; |
353 | struct rcu_torture *rp = container_of(p, struct rcu_torture, rtort_rcu); | 353 | struct rcu_torture *rp = container_of(p, struct rcu_torture, rtort_rcu); |
354 | 354 | ||
355 | if (fullstop != FULLSTOP_DONTSTOP) { | 355 | if (fullstop != FULLSTOP_DONTSTOP) { |
356 | /* Test is ending, just drop callbacks on the floor. */ | 356 | /* Test is ending, just drop callbacks on the floor. */ |
357 | /* The next initialization will pick up the pieces. */ | 357 | /* The next initialization will pick up the pieces. */ |
358 | return; | 358 | return; |
359 | } | 359 | } |
360 | i = rp->rtort_pipe_count; | 360 | i = rp->rtort_pipe_count; |
361 | if (i > RCU_TORTURE_PIPE_LEN) | 361 | if (i > RCU_TORTURE_PIPE_LEN) |
362 | i = RCU_TORTURE_PIPE_LEN; | 362 | i = RCU_TORTURE_PIPE_LEN; |
363 | atomic_inc(&rcu_torture_wcount[i]); | 363 | atomic_inc(&rcu_torture_wcount[i]); |
364 | if (++rp->rtort_pipe_count >= RCU_TORTURE_PIPE_LEN) { | 364 | if (++rp->rtort_pipe_count >= RCU_TORTURE_PIPE_LEN) { |
365 | rp->rtort_mbtest = 0; | 365 | rp->rtort_mbtest = 0; |
366 | rcu_torture_free(rp); | 366 | rcu_torture_free(rp); |
367 | } else | 367 | } else |
368 | cur_ops->deferred_free(rp); | 368 | cur_ops->deferred_free(rp); |
369 | } | 369 | } |
370 | 370 | ||
371 | static int rcu_no_completed(void) | 371 | static int rcu_no_completed(void) |
372 | { | 372 | { |
373 | return 0; | 373 | return 0; |
374 | } | 374 | } |
375 | 375 | ||
376 | static void rcu_torture_deferred_free(struct rcu_torture *p) | 376 | static void rcu_torture_deferred_free(struct rcu_torture *p) |
377 | { | 377 | { |
378 | call_rcu(&p->rtort_rcu, rcu_torture_cb); | 378 | call_rcu(&p->rtort_rcu, rcu_torture_cb); |
379 | } | 379 | } |
380 | 380 | ||
381 | static struct rcu_torture_ops rcu_ops = { | 381 | static struct rcu_torture_ops rcu_ops = { |
382 | .init = NULL, | 382 | .init = NULL, |
383 | .cleanup = NULL, | 383 | .cleanup = NULL, |
384 | .readlock = rcu_torture_read_lock, | 384 | .readlock = rcu_torture_read_lock, |
385 | .read_delay = rcu_read_delay, | 385 | .read_delay = rcu_read_delay, |
386 | .readunlock = rcu_torture_read_unlock, | 386 | .readunlock = rcu_torture_read_unlock, |
387 | .completed = rcu_torture_completed, | 387 | .completed = rcu_torture_completed, |
388 | .deferred_free = rcu_torture_deferred_free, | 388 | .deferred_free = rcu_torture_deferred_free, |
389 | .sync = synchronize_rcu, | 389 | .sync = synchronize_rcu, |
390 | .cb_barrier = rcu_barrier, | 390 | .cb_barrier = rcu_barrier, |
391 | .fqs = rcu_force_quiescent_state, | 391 | .fqs = rcu_force_quiescent_state, |
392 | .stats = NULL, | 392 | .stats = NULL, |
393 | .irq_capable = 1, | 393 | .irq_capable = 1, |
394 | .can_boost = rcu_can_boost(), | 394 | .can_boost = rcu_can_boost(), |
395 | .name = "rcu" | 395 | .name = "rcu" |
396 | }; | 396 | }; |
397 | 397 | ||
398 | static void rcu_sync_torture_deferred_free(struct rcu_torture *p) | 398 | static void rcu_sync_torture_deferred_free(struct rcu_torture *p) |
399 | { | 399 | { |
400 | int i; | 400 | int i; |
401 | struct rcu_torture *rp; | 401 | struct rcu_torture *rp; |
402 | struct rcu_torture *rp1; | 402 | struct rcu_torture *rp1; |
403 | 403 | ||
404 | cur_ops->sync(); | 404 | cur_ops->sync(); |
405 | list_add(&p->rtort_free, &rcu_torture_removed); | 405 | list_add(&p->rtort_free, &rcu_torture_removed); |
406 | list_for_each_entry_safe(rp, rp1, &rcu_torture_removed, rtort_free) { | 406 | list_for_each_entry_safe(rp, rp1, &rcu_torture_removed, rtort_free) { |
407 | i = rp->rtort_pipe_count; | 407 | i = rp->rtort_pipe_count; |
408 | if (i > RCU_TORTURE_PIPE_LEN) | 408 | if (i > RCU_TORTURE_PIPE_LEN) |
409 | i = RCU_TORTURE_PIPE_LEN; | 409 | i = RCU_TORTURE_PIPE_LEN; |
410 | atomic_inc(&rcu_torture_wcount[i]); | 410 | atomic_inc(&rcu_torture_wcount[i]); |
411 | if (++rp->rtort_pipe_count >= RCU_TORTURE_PIPE_LEN) { | 411 | if (++rp->rtort_pipe_count >= RCU_TORTURE_PIPE_LEN) { |
412 | rp->rtort_mbtest = 0; | 412 | rp->rtort_mbtest = 0; |
413 | list_del(&rp->rtort_free); | 413 | list_del(&rp->rtort_free); |
414 | rcu_torture_free(rp); | 414 | rcu_torture_free(rp); |
415 | } | 415 | } |
416 | } | 416 | } |
417 | } | 417 | } |
418 | 418 | ||
419 | static void rcu_sync_torture_init(void) | 419 | static void rcu_sync_torture_init(void) |
420 | { | 420 | { |
421 | INIT_LIST_HEAD(&rcu_torture_removed); | 421 | INIT_LIST_HEAD(&rcu_torture_removed); |
422 | } | 422 | } |
423 | 423 | ||
424 | static struct rcu_torture_ops rcu_sync_ops = { | 424 | static struct rcu_torture_ops rcu_sync_ops = { |
425 | .init = rcu_sync_torture_init, | 425 | .init = rcu_sync_torture_init, |
426 | .cleanup = NULL, | 426 | .cleanup = NULL, |
427 | .readlock = rcu_torture_read_lock, | 427 | .readlock = rcu_torture_read_lock, |
428 | .read_delay = rcu_read_delay, | 428 | .read_delay = rcu_read_delay, |
429 | .readunlock = rcu_torture_read_unlock, | 429 | .readunlock = rcu_torture_read_unlock, |
430 | .completed = rcu_torture_completed, | 430 | .completed = rcu_torture_completed, |
431 | .deferred_free = rcu_sync_torture_deferred_free, | 431 | .deferred_free = rcu_sync_torture_deferred_free, |
432 | .sync = synchronize_rcu, | 432 | .sync = synchronize_rcu, |
433 | .cb_barrier = NULL, | 433 | .cb_barrier = NULL, |
434 | .fqs = rcu_force_quiescent_state, | 434 | .fqs = rcu_force_quiescent_state, |
435 | .stats = NULL, | 435 | .stats = NULL, |
436 | .irq_capable = 1, | 436 | .irq_capable = 1, |
437 | .can_boost = rcu_can_boost(), | 437 | .can_boost = rcu_can_boost(), |
438 | .name = "rcu_sync" | 438 | .name = "rcu_sync" |
439 | }; | 439 | }; |
440 | 440 | ||
441 | static struct rcu_torture_ops rcu_expedited_ops = { | 441 | static struct rcu_torture_ops rcu_expedited_ops = { |
442 | .init = rcu_sync_torture_init, | 442 | .init = rcu_sync_torture_init, |
443 | .cleanup = NULL, | 443 | .cleanup = NULL, |
444 | .readlock = rcu_torture_read_lock, | 444 | .readlock = rcu_torture_read_lock, |
445 | .read_delay = rcu_read_delay, /* just reuse rcu's version. */ | 445 | .read_delay = rcu_read_delay, /* just reuse rcu's version. */ |
446 | .readunlock = rcu_torture_read_unlock, | 446 | .readunlock = rcu_torture_read_unlock, |
447 | .completed = rcu_no_completed, | 447 | .completed = rcu_no_completed, |
448 | .deferred_free = rcu_sync_torture_deferred_free, | 448 | .deferred_free = rcu_sync_torture_deferred_free, |
449 | .sync = synchronize_rcu_expedited, | 449 | .sync = synchronize_rcu_expedited, |
450 | .cb_barrier = NULL, | 450 | .cb_barrier = NULL, |
451 | .fqs = rcu_force_quiescent_state, | 451 | .fqs = rcu_force_quiescent_state, |
452 | .stats = NULL, | 452 | .stats = NULL, |
453 | .irq_capable = 1, | 453 | .irq_capable = 1, |
454 | .can_boost = rcu_can_boost(), | 454 | .can_boost = rcu_can_boost(), |
455 | .name = "rcu_expedited" | 455 | .name = "rcu_expedited" |
456 | }; | 456 | }; |
457 | 457 | ||
458 | /* | 458 | /* |
459 | * Definitions for rcu_bh torture testing. | 459 | * Definitions for rcu_bh torture testing. |
460 | */ | 460 | */ |
461 | 461 | ||
462 | static int rcu_bh_torture_read_lock(void) __acquires(RCU_BH) | 462 | static int rcu_bh_torture_read_lock(void) __acquires(RCU_BH) |
463 | { | 463 | { |
464 | rcu_read_lock_bh(); | 464 | rcu_read_lock_bh(); |
465 | return 0; | 465 | return 0; |
466 | } | 466 | } |
467 | 467 | ||
468 | static void rcu_bh_torture_read_unlock(int idx) __releases(RCU_BH) | 468 | static void rcu_bh_torture_read_unlock(int idx) __releases(RCU_BH) |
469 | { | 469 | { |
470 | rcu_read_unlock_bh(); | 470 | rcu_read_unlock_bh(); |
471 | } | 471 | } |
472 | 472 | ||
473 | static int rcu_bh_torture_completed(void) | 473 | static int rcu_bh_torture_completed(void) |
474 | { | 474 | { |
475 | return rcu_batches_completed_bh(); | 475 | return rcu_batches_completed_bh(); |
476 | } | 476 | } |
477 | 477 | ||
478 | static void rcu_bh_torture_deferred_free(struct rcu_torture *p) | 478 | static void rcu_bh_torture_deferred_free(struct rcu_torture *p) |
479 | { | 479 | { |
480 | call_rcu_bh(&p->rtort_rcu, rcu_torture_cb); | 480 | call_rcu_bh(&p->rtort_rcu, rcu_torture_cb); |
481 | } | 481 | } |
482 | 482 | ||
483 | struct rcu_bh_torture_synchronize { | 483 | struct rcu_bh_torture_synchronize { |
484 | struct rcu_head head; | 484 | struct rcu_head head; |
485 | struct completion completion; | 485 | struct completion completion; |
486 | }; | 486 | }; |
487 | 487 | ||
488 | static void rcu_bh_torture_wakeme_after_cb(struct rcu_head *head) | 488 | static void rcu_bh_torture_wakeme_after_cb(struct rcu_head *head) |
489 | { | 489 | { |
490 | struct rcu_bh_torture_synchronize *rcu; | 490 | struct rcu_bh_torture_synchronize *rcu; |
491 | 491 | ||
492 | rcu = container_of(head, struct rcu_bh_torture_synchronize, head); | 492 | rcu = container_of(head, struct rcu_bh_torture_synchronize, head); |
493 | complete(&rcu->completion); | 493 | complete(&rcu->completion); |
494 | } | 494 | } |
495 | 495 | ||
496 | static void rcu_bh_torture_synchronize(void) | 496 | static void rcu_bh_torture_synchronize(void) |
497 | { | 497 | { |
498 | struct rcu_bh_torture_synchronize rcu; | 498 | struct rcu_bh_torture_synchronize rcu; |
499 | 499 | ||
500 | init_rcu_head_on_stack(&rcu.head); | 500 | init_rcu_head_on_stack(&rcu.head); |
501 | init_completion(&rcu.completion); | 501 | init_completion(&rcu.completion); |
502 | call_rcu_bh(&rcu.head, rcu_bh_torture_wakeme_after_cb); | 502 | call_rcu_bh(&rcu.head, rcu_bh_torture_wakeme_after_cb); |
503 | wait_for_completion(&rcu.completion); | 503 | wait_for_completion(&rcu.completion); |
504 | destroy_rcu_head_on_stack(&rcu.head); | 504 | destroy_rcu_head_on_stack(&rcu.head); |
505 | } | 505 | } |
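rcu_bh_torture_synchronize() builds a synchronous wait out of an asynchronous callback API: it queues a callback whose only job is to complete an on-stack completion, then blocks until that callback has run. A userspace sketch of the same wait-for-my-callback pattern, with a worker thread standing in for the callback machinery and hypothetical names throughout:

#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

struct sync_waiter {
	pthread_mutex_t lock;
	pthread_cond_t cond;
	int done;
};

/* The "callback": its only job is to mark the waiter complete. */
static void wakeme_after_cb(struct sync_waiter *w)
{
	pthread_mutex_lock(&w->lock);
	w->done = 1;
	pthread_cond_signal(&w->cond);
	pthread_mutex_unlock(&w->lock);
}

/* Stand-in for the asynchronous side: invoke the callback a bit later. */
static void *async_side(void *arg)
{
	usleep(10000);			/* pretend a grace period elapses */
	wakeme_after_cb(arg);
	return NULL;
}

/* Synchronous wrapper: queue the callback, then block until it has run. */
static void fake_synchronize(void)
{
	struct sync_waiter w;
	pthread_t t;

	pthread_mutex_init(&w.lock, NULL);
	pthread_cond_init(&w.cond, NULL);
	w.done = 0;
	pthread_create(&t, NULL, async_side, &w);
	pthread_mutex_lock(&w.lock);
	while (!w.done)
		pthread_cond_wait(&w.cond, &w.lock);
	pthread_mutex_unlock(&w.lock);
	pthread_join(t, NULL);
	pthread_cond_destroy(&w.cond);
	pthread_mutex_destroy(&w.lock);
}

int main(void)
{
	fake_synchronize();
	printf("callback ran, synchronize returned\n");
	return 0;
}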
506 | 506 | ||
507 | static struct rcu_torture_ops rcu_bh_ops = { | 507 | static struct rcu_torture_ops rcu_bh_ops = { |
508 | .init = NULL, | 508 | .init = NULL, |
509 | .cleanup = NULL, | 509 | .cleanup = NULL, |
510 | .readlock = rcu_bh_torture_read_lock, | 510 | .readlock = rcu_bh_torture_read_lock, |
511 | .read_delay = rcu_read_delay, /* just reuse rcu's version. */ | 511 | .read_delay = rcu_read_delay, /* just reuse rcu's version. */ |
512 | .readunlock = rcu_bh_torture_read_unlock, | 512 | .readunlock = rcu_bh_torture_read_unlock, |
513 | .completed = rcu_bh_torture_completed, | 513 | .completed = rcu_bh_torture_completed, |
514 | .deferred_free = rcu_bh_torture_deferred_free, | 514 | .deferred_free = rcu_bh_torture_deferred_free, |
515 | .sync = rcu_bh_torture_synchronize, | 515 | .sync = rcu_bh_torture_synchronize, |
516 | .cb_barrier = rcu_barrier_bh, | 516 | .cb_barrier = rcu_barrier_bh, |
517 | .fqs = rcu_bh_force_quiescent_state, | 517 | .fqs = rcu_bh_force_quiescent_state, |
518 | .stats = NULL, | 518 | .stats = NULL, |
519 | .irq_capable = 1, | 519 | .irq_capable = 1, |
520 | .name = "rcu_bh" | 520 | .name = "rcu_bh" |
521 | }; | 521 | }; |
522 | 522 | ||
523 | static struct rcu_torture_ops rcu_bh_sync_ops = { | 523 | static struct rcu_torture_ops rcu_bh_sync_ops = { |
524 | .init = rcu_sync_torture_init, | 524 | .init = rcu_sync_torture_init, |
525 | .cleanup = NULL, | 525 | .cleanup = NULL, |
526 | .readlock = rcu_bh_torture_read_lock, | 526 | .readlock = rcu_bh_torture_read_lock, |
527 | .read_delay = rcu_read_delay, /* just reuse rcu's version. */ | 527 | .read_delay = rcu_read_delay, /* just reuse rcu's version. */ |
528 | .readunlock = rcu_bh_torture_read_unlock, | 528 | .readunlock = rcu_bh_torture_read_unlock, |
529 | .completed = rcu_bh_torture_completed, | 529 | .completed = rcu_bh_torture_completed, |
530 | .deferred_free = rcu_sync_torture_deferred_free, | 530 | .deferred_free = rcu_sync_torture_deferred_free, |
531 | .sync = rcu_bh_torture_synchronize, | 531 | .sync = rcu_bh_torture_synchronize, |
532 | .cb_barrier = NULL, | 532 | .cb_barrier = NULL, |
533 | .fqs = rcu_bh_force_quiescent_state, | 533 | .fqs = rcu_bh_force_quiescent_state, |
534 | .stats = NULL, | 534 | .stats = NULL, |
535 | .irq_capable = 1, | 535 | .irq_capable = 1, |
536 | .name = "rcu_bh_sync" | 536 | .name = "rcu_bh_sync" |
537 | }; | 537 | }; |
538 | 538 | ||
539 | /* | 539 | /* |
540 | * Definitions for srcu torture testing. | 540 | * Definitions for srcu torture testing. |
541 | */ | 541 | */ |
542 | 542 | ||
543 | static struct srcu_struct srcu_ctl; | 543 | static struct srcu_struct srcu_ctl; |
544 | 544 | ||
545 | static void srcu_torture_init(void) | 545 | static void srcu_torture_init(void) |
546 | { | 546 | { |
547 | init_srcu_struct(&srcu_ctl); | 547 | init_srcu_struct(&srcu_ctl); |
548 | rcu_sync_torture_init(); | 548 | rcu_sync_torture_init(); |
549 | } | 549 | } |
550 | 550 | ||
551 | static void srcu_torture_cleanup(void) | 551 | static void srcu_torture_cleanup(void) |
552 | { | 552 | { |
553 | synchronize_srcu(&srcu_ctl); | 553 | synchronize_srcu(&srcu_ctl); |
554 | cleanup_srcu_struct(&srcu_ctl); | 554 | cleanup_srcu_struct(&srcu_ctl); |
555 | } | 555 | } |
556 | 556 | ||
557 | static int srcu_torture_read_lock(void) __acquires(&srcu_ctl) | 557 | static int srcu_torture_read_lock(void) __acquires(&srcu_ctl) |
558 | { | 558 | { |
559 | return srcu_read_lock(&srcu_ctl); | 559 | return srcu_read_lock(&srcu_ctl); |
560 | } | 560 | } |
561 | 561 | ||
562 | static void srcu_read_delay(struct rcu_random_state *rrsp) | 562 | static void srcu_read_delay(struct rcu_random_state *rrsp) |
563 | { | 563 | { |
564 | long delay; | 564 | long delay; |
565 | const long uspertick = 1000000 / HZ; | 565 | const long uspertick = 1000000 / HZ; |
566 | const long longdelay = 10; | 566 | const long longdelay = 10; |
567 | 567 | ||
568 | /* We want there to be long-running readers, but not all the time. */ | 568 | /* We want there to be long-running readers, but not all the time. */ |
569 | 569 | ||
570 | delay = rcu_random(rrsp) % (nrealreaders * 2 * longdelay * uspertick); | 570 | delay = rcu_random(rrsp) % (nrealreaders * 2 * longdelay * uspertick); |
571 | if (!delay) | 571 | if (!delay) |
572 | schedule_timeout_interruptible(longdelay); | 572 | schedule_timeout_interruptible(longdelay); |
573 | else | 573 | else |
574 | rcu_read_delay(rrsp); | 574 | rcu_read_delay(rrsp); |
575 | } | 575 | } |
576 | 576 | ||
577 | static void srcu_torture_read_unlock(int idx) __releases(&srcu_ctl) | 577 | static void srcu_torture_read_unlock(int idx) __releases(&srcu_ctl) |
578 | { | 578 | { |
579 | srcu_read_unlock(&srcu_ctl, idx); | 579 | srcu_read_unlock(&srcu_ctl, idx); |
580 | } | 580 | } |
581 | 581 | ||
582 | static int srcu_torture_completed(void) | 582 | static int srcu_torture_completed(void) |
583 | { | 583 | { |
584 | return srcu_batches_completed(&srcu_ctl); | 584 | return srcu_batches_completed(&srcu_ctl); |
585 | } | 585 | } |
586 | 586 | ||
587 | static void srcu_torture_synchronize(void) | 587 | static void srcu_torture_synchronize(void) |
588 | { | 588 | { |
589 | synchronize_srcu(&srcu_ctl); | 589 | synchronize_srcu(&srcu_ctl); |
590 | } | 590 | } |
591 | 591 | ||
592 | static int srcu_torture_stats(char *page) | 592 | static int srcu_torture_stats(char *page) |
593 | { | 593 | { |
594 | int cnt = 0; | 594 | int cnt = 0; |
595 | int cpu; | 595 | int cpu; |
596 | int idx = srcu_ctl.completed & 0x1; | 596 | int idx = srcu_ctl.completed & 0x1; |
597 | 597 | ||
598 | cnt += sprintf(&page[cnt], "%s%s per-CPU(idx=%d):", | 598 | cnt += sprintf(&page[cnt], "%s%s per-CPU(idx=%d):", |
599 | torture_type, TORTURE_FLAG, idx); | 599 | torture_type, TORTURE_FLAG, idx); |
600 | for_each_possible_cpu(cpu) { | 600 | for_each_possible_cpu(cpu) { |
601 | cnt += sprintf(&page[cnt], " %d(%d,%d)", cpu, | 601 | cnt += sprintf(&page[cnt], " %d(%d,%d)", cpu, |
602 | per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[!idx], | 602 | per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[!idx], |
603 | per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[idx]); | 603 | per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[idx]); |
604 | } | 604 | } |
605 | cnt += sprintf(&page[cnt], "\n"); | 605 | cnt += sprintf(&page[cnt], "\n"); |
606 | return cnt; | 606 | return cnt; |
607 | } | 607 | } |
608 | 608 | ||
609 | static struct rcu_torture_ops srcu_ops = { | 609 | static struct rcu_torture_ops srcu_ops = { |
610 | .init = srcu_torture_init, | 610 | .init = srcu_torture_init, |
611 | .cleanup = srcu_torture_cleanup, | 611 | .cleanup = srcu_torture_cleanup, |
612 | .readlock = srcu_torture_read_lock, | 612 | .readlock = srcu_torture_read_lock, |
613 | .read_delay = srcu_read_delay, | 613 | .read_delay = srcu_read_delay, |
614 | .readunlock = srcu_torture_read_unlock, | 614 | .readunlock = srcu_torture_read_unlock, |
615 | .completed = srcu_torture_completed, | 615 | .completed = srcu_torture_completed, |
616 | .deferred_free = rcu_sync_torture_deferred_free, | 616 | .deferred_free = rcu_sync_torture_deferred_free, |
617 | .sync = srcu_torture_synchronize, | 617 | .sync = srcu_torture_synchronize, |
618 | .cb_barrier = NULL, | 618 | .cb_barrier = NULL, |
619 | .stats = srcu_torture_stats, | 619 | .stats = srcu_torture_stats, |
620 | .name = "srcu" | 620 | .name = "srcu" |
621 | }; | 621 | }; |
622 | 622 | ||
623 | static void srcu_torture_synchronize_expedited(void) | 623 | static void srcu_torture_synchronize_expedited(void) |
624 | { | 624 | { |
625 | synchronize_srcu_expedited(&srcu_ctl); | 625 | synchronize_srcu_expedited(&srcu_ctl); |
626 | } | 626 | } |
627 | 627 | ||
628 | static struct rcu_torture_ops srcu_expedited_ops = { | 628 | static struct rcu_torture_ops srcu_expedited_ops = { |
629 | .init = srcu_torture_init, | 629 | .init = srcu_torture_init, |
630 | .cleanup = srcu_torture_cleanup, | 630 | .cleanup = srcu_torture_cleanup, |
631 | .readlock = srcu_torture_read_lock, | 631 | .readlock = srcu_torture_read_lock, |
632 | .read_delay = srcu_read_delay, | 632 | .read_delay = srcu_read_delay, |
633 | .readunlock = srcu_torture_read_unlock, | 633 | .readunlock = srcu_torture_read_unlock, |
634 | .completed = srcu_torture_completed, | 634 | .completed = srcu_torture_completed, |
635 | .deferred_free = rcu_sync_torture_deferred_free, | 635 | .deferred_free = rcu_sync_torture_deferred_free, |
636 | .sync = srcu_torture_synchronize_expedited, | 636 | .sync = srcu_torture_synchronize_expedited, |
637 | .cb_barrier = NULL, | 637 | .cb_barrier = NULL, |
638 | .stats = srcu_torture_stats, | 638 | .stats = srcu_torture_stats, |
639 | .name = "srcu_expedited" | 639 | .name = "srcu_expedited" |
640 | }; | 640 | }; |
641 | 641 | ||
642 | /* | 642 | /* |
643 | * Definitions for sched torture testing. | 643 | * Definitions for sched torture testing. |
644 | */ | 644 | */ |
645 | 645 | ||
646 | static int sched_torture_read_lock(void) | 646 | static int sched_torture_read_lock(void) |
647 | { | 647 | { |
648 | preempt_disable(); | 648 | preempt_disable(); |
649 | return 0; | 649 | return 0; |
650 | } | 650 | } |
651 | 651 | ||
652 | static void sched_torture_read_unlock(int idx) | 652 | static void sched_torture_read_unlock(int idx) |
653 | { | 653 | { |
654 | preempt_enable(); | 654 | preempt_enable(); |
655 | } | 655 | } |
656 | 656 | ||
657 | static void rcu_sched_torture_deferred_free(struct rcu_torture *p) | 657 | static void rcu_sched_torture_deferred_free(struct rcu_torture *p) |
658 | { | 658 | { |
659 | call_rcu_sched(&p->rtort_rcu, rcu_torture_cb); | 659 | call_rcu_sched(&p->rtort_rcu, rcu_torture_cb); |
660 | } | 660 | } |
661 | 661 | ||
662 | static void sched_torture_synchronize(void) | 662 | static void sched_torture_synchronize(void) |
663 | { | 663 | { |
664 | synchronize_sched(); | 664 | synchronize_sched(); |
665 | } | 665 | } |
666 | 666 | ||
667 | static struct rcu_torture_ops sched_ops = { | 667 | static struct rcu_torture_ops sched_ops = { |
668 | .init = rcu_sync_torture_init, | 668 | .init = rcu_sync_torture_init, |
669 | .cleanup = NULL, | 669 | .cleanup = NULL, |
670 | .readlock = sched_torture_read_lock, | 670 | .readlock = sched_torture_read_lock, |
671 | .read_delay = rcu_read_delay, /* just reuse rcu's version. */ | 671 | .read_delay = rcu_read_delay, /* just reuse rcu's version. */ |
672 | .readunlock = sched_torture_read_unlock, | 672 | .readunlock = sched_torture_read_unlock, |
673 | .completed = rcu_no_completed, | 673 | .completed = rcu_no_completed, |
674 | .deferred_free = rcu_sched_torture_deferred_free, | 674 | .deferred_free = rcu_sched_torture_deferred_free, |
675 | .sync = sched_torture_synchronize, | 675 | .sync = sched_torture_synchronize, |
676 | .cb_barrier = rcu_barrier_sched, | 676 | .cb_barrier = rcu_barrier_sched, |
677 | .fqs = rcu_sched_force_quiescent_state, | 677 | .fqs = rcu_sched_force_quiescent_state, |
678 | .stats = NULL, | 678 | .stats = NULL, |
679 | .irq_capable = 1, | 679 | .irq_capable = 1, |
680 | .name = "sched" | 680 | .name = "sched" |
681 | }; | 681 | }; |
682 | 682 | ||
683 | static struct rcu_torture_ops sched_sync_ops = { | 683 | static struct rcu_torture_ops sched_sync_ops = { |
684 | .init = rcu_sync_torture_init, | 684 | .init = rcu_sync_torture_init, |
685 | .cleanup = NULL, | 685 | .cleanup = NULL, |
686 | .readlock = sched_torture_read_lock, | 686 | .readlock = sched_torture_read_lock, |
687 | .read_delay = rcu_read_delay, /* just reuse rcu's version. */ | 687 | .read_delay = rcu_read_delay, /* just reuse rcu's version. */ |
688 | .readunlock = sched_torture_read_unlock, | 688 | .readunlock = sched_torture_read_unlock, |
689 | .completed = rcu_no_completed, | 689 | .completed = rcu_no_completed, |
690 | .deferred_free = rcu_sync_torture_deferred_free, | 690 | .deferred_free = rcu_sync_torture_deferred_free, |
691 | .sync = sched_torture_synchronize, | 691 | .sync = sched_torture_synchronize, |
692 | .cb_barrier = NULL, | 692 | .cb_barrier = NULL, |
693 | .fqs = rcu_sched_force_quiescent_state, | 693 | .fqs = rcu_sched_force_quiescent_state, |
694 | .stats = NULL, | 694 | .stats = NULL, |
695 | .name = "sched_sync" | 695 | .name = "sched_sync" |
696 | }; | 696 | }; |
697 | 697 | ||
698 | static struct rcu_torture_ops sched_expedited_ops = { | 698 | static struct rcu_torture_ops sched_expedited_ops = { |
699 | .init = rcu_sync_torture_init, | 699 | .init = rcu_sync_torture_init, |
700 | .cleanup = NULL, | 700 | .cleanup = NULL, |
701 | .readlock = sched_torture_read_lock, | 701 | .readlock = sched_torture_read_lock, |
702 | .read_delay = rcu_read_delay, /* just reuse rcu's version. */ | 702 | .read_delay = rcu_read_delay, /* just reuse rcu's version. */ |
703 | .readunlock = sched_torture_read_unlock, | 703 | .readunlock = sched_torture_read_unlock, |
704 | .completed = rcu_no_completed, | 704 | .completed = rcu_no_completed, |
705 | .deferred_free = rcu_sync_torture_deferred_free, | 705 | .deferred_free = rcu_sync_torture_deferred_free, |
706 | .sync = synchronize_sched_expedited, | 706 | .sync = synchronize_sched_expedited, |
707 | .cb_barrier = NULL, | 707 | .cb_barrier = NULL, |
708 | .fqs = rcu_sched_force_quiescent_state, | 708 | .fqs = rcu_sched_force_quiescent_state, |
709 | .stats = NULL, | 709 | .stats = NULL, |
710 | .irq_capable = 1, | 710 | .irq_capable = 1, |
711 | .name = "sched_expedited" | 711 | .name = "sched_expedited" |
712 | }; | 712 | }; |
713 | 713 | ||
714 | /* | 714 | /* |
715 | * RCU torture priority-boost testing. Runs one real-time thread per | 715 | * RCU torture priority-boost testing. Runs one real-time thread per |
716 | * CPU for moderate bursts, repeatedly registering RCU callbacks and | 716 | * CPU for moderate bursts, repeatedly registering RCU callbacks and |
717 | * spinning waiting for them to be invoked. If a given callback takes | 717 | * spinning waiting for them to be invoked. If a given callback takes |
718 | * too long to be invoked, we assume that priority inversion has occurred. | 718 | * too long to be invoked, we assume that priority inversion has occurred. |
719 | */ | 719 | */ |
720 | 720 | ||
721 | struct rcu_boost_inflight { | 721 | struct rcu_boost_inflight { |
722 | struct rcu_head rcu; | 722 | struct rcu_head rcu; |
723 | int inflight; | 723 | int inflight; |
724 | }; | 724 | }; |
725 | 725 | ||
726 | static void rcu_torture_boost_cb(struct rcu_head *head) | 726 | static void rcu_torture_boost_cb(struct rcu_head *head) |
727 | { | 727 | { |
728 | struct rcu_boost_inflight *rbip = | 728 | struct rcu_boost_inflight *rbip = |
729 | container_of(head, struct rcu_boost_inflight, rcu); | 729 | container_of(head, struct rcu_boost_inflight, rcu); |
730 | 730 | ||
731 | smp_mb(); /* Ensure RCU-core accesses precede clearing ->inflight */ | 731 | smp_mb(); /* Ensure RCU-core accesses precede clearing ->inflight */ |
732 | rbip->inflight = 0; | 732 | rbip->inflight = 0; |
733 | } | 733 | } |
734 | 734 | ||
735 | static int rcu_torture_boost(void *arg) | 735 | static int rcu_torture_boost(void *arg) |
736 | { | 736 | { |
737 | unsigned long call_rcu_time; | 737 | unsigned long call_rcu_time; |
738 | unsigned long endtime; | 738 | unsigned long endtime; |
739 | unsigned long oldstarttime; | 739 | unsigned long oldstarttime; |
740 | struct rcu_boost_inflight rbi = { .inflight = 0 }; | 740 | struct rcu_boost_inflight rbi = { .inflight = 0 }; |
741 | struct sched_param sp; | 741 | struct sched_param sp; |
742 | 742 | ||
743 | VERBOSE_PRINTK_STRING("rcu_torture_boost started"); | 743 | VERBOSE_PRINTK_STRING("rcu_torture_boost started"); |
744 | 744 | ||
745 | /* Set real-time priority. */ | 745 | /* Set real-time priority. */ |
746 | sp.sched_priority = 1; | 746 | sp.sched_priority = 1; |
747 | if (sched_setscheduler(current, SCHED_FIFO, &sp) < 0) { | 747 | if (sched_setscheduler(current, SCHED_FIFO, &sp) < 0) { |
748 | VERBOSE_PRINTK_STRING("rcu_torture_boost RT prio failed!"); | 748 | VERBOSE_PRINTK_STRING("rcu_torture_boost RT prio failed!"); |
749 | n_rcu_torture_boost_rterror++; | 749 | n_rcu_torture_boost_rterror++; |
750 | } | 750 | } |
751 | 751 | ||
752 | init_rcu_head_on_stack(&rbi.rcu); | 752 | init_rcu_head_on_stack(&rbi.rcu); |
753 | /* Each pass through the following loop does one boost-test cycle. */ | 753 | /* Each pass through the following loop does one boost-test cycle. */ |
754 | do { | 754 | do { |
755 | /* Wait for the next test interval. */ | 755 | /* Wait for the next test interval. */ |
756 | oldstarttime = boost_starttime; | 756 | oldstarttime = boost_starttime; |
757 | while (jiffies - oldstarttime > ULONG_MAX / 2) { | 757 | while (jiffies - oldstarttime > ULONG_MAX / 2) { |
758 | schedule_timeout_uninterruptible(1); | 758 | schedule_timeout_uninterruptible(1); |
759 | rcu_stutter_wait("rcu_torture_boost"); | 759 | rcu_stutter_wait("rcu_torture_boost"); |
760 | if (kthread_should_stop() || | 760 | if (kthread_should_stop() || |
761 | fullstop != FULLSTOP_DONTSTOP) | 761 | fullstop != FULLSTOP_DONTSTOP) |
762 | goto checkwait; | 762 | goto checkwait; |
763 | } | 763 | } |
764 | 764 | ||
765 | /* Do one boost-test interval. */ | 765 | /* Do one boost-test interval. */ |
766 | endtime = oldstarttime + test_boost_duration * HZ; | 766 | endtime = oldstarttime + test_boost_duration * HZ; |
767 | call_rcu_time = jiffies; | 767 | call_rcu_time = jiffies; |
768 | while (jiffies - endtime > ULONG_MAX / 2) { | 768 | while (jiffies - endtime > ULONG_MAX / 2) { |
769 | /* If we don't have a callback in flight, post one. */ | 769 | /* If we don't have a callback in flight, post one. */ |
770 | if (!rbi.inflight) { | 770 | if (!rbi.inflight) { |
771 | smp_mb(); /* RCU core before ->inflight = 1. */ | 771 | smp_mb(); /* RCU core before ->inflight = 1. */ |
772 | rbi.inflight = 1; | 772 | rbi.inflight = 1; |
773 | call_rcu(&rbi.rcu, rcu_torture_boost_cb); | 773 | call_rcu(&rbi.rcu, rcu_torture_boost_cb); |
774 | if (jiffies - call_rcu_time > | 774 | if (jiffies - call_rcu_time > |
775 | test_boost_duration * HZ - HZ / 2) { | 775 | test_boost_duration * HZ - HZ / 2) { |
776 | VERBOSE_PRINTK_STRING("rcu_torture_boost boosting failed"); | 776 | VERBOSE_PRINTK_STRING("rcu_torture_boost boosting failed"); |
777 | n_rcu_torture_boost_failure++; | 777 | n_rcu_torture_boost_failure++; |
778 | } | 778 | } |
779 | call_rcu_time = jiffies; | 779 | call_rcu_time = jiffies; |
780 | } | 780 | } |
781 | cond_resched(); | 781 | cond_resched(); |
782 | rcu_stutter_wait("rcu_torture_boost"); | 782 | rcu_stutter_wait("rcu_torture_boost"); |
783 | if (kthread_should_stop() || | 783 | if (kthread_should_stop() || |
784 | fullstop != FULLSTOP_DONTSTOP) | 784 | fullstop != FULLSTOP_DONTSTOP) |
785 | goto checkwait; | 785 | goto checkwait; |
786 | } | 786 | } |
787 | 787 | ||
788 | /* | 788 | /* |
789 | * Set the start time of the next test interval. | 789 | * Set the start time of the next test interval. |
790 | * Yes, this is vulnerable to long delays, but such | 790 | * Yes, this is vulnerable to long delays, but such |
791 | * delays simply cause a false negative for the next | 791 | * delays simply cause a false negative for the next |
792 | * interval. Besides, we are running at RT priority, | 792 | * interval. Besides, we are running at RT priority, |
793 | * so delays should be relatively rare. | 793 | * so delays should be relatively rare. |
794 | */ | 794 | */ |
795 | while (oldstarttime == boost_starttime) { | 795 | while (oldstarttime == boost_starttime) { |
796 | if (mutex_trylock(&boost_mutex)) { | 796 | if (mutex_trylock(&boost_mutex)) { |
797 | boost_starttime = jiffies + | 797 | boost_starttime = jiffies + |
798 | test_boost_interval * HZ; | 798 | test_boost_interval * HZ; |
799 | n_rcu_torture_boosts++; | 799 | n_rcu_torture_boosts++; |
800 | mutex_unlock(&boost_mutex); | 800 | mutex_unlock(&boost_mutex); |
801 | break; | 801 | break; |
802 | } | 802 | } |
803 | schedule_timeout_uninterruptible(1); | 803 | schedule_timeout_uninterruptible(1); |
804 | } | 804 | } |
805 | 805 | ||
806 | /* Go do the stutter. */ | 806 | /* Go do the stutter. */ |
807 | checkwait: rcu_stutter_wait("rcu_torture_boost"); | 807 | checkwait: rcu_stutter_wait("rcu_torture_boost"); |
808 | } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); | 808 | } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); |
809 | 809 | ||
810 | /* Clean up and exit. */ | 810 | /* Clean up and exit. */ |
811 | VERBOSE_PRINTK_STRING("rcu_torture_boost task stopping"); | 811 | VERBOSE_PRINTK_STRING("rcu_torture_boost task stopping"); |
812 | destroy_rcu_head_on_stack(&rbi.rcu); | 812 | destroy_rcu_head_on_stack(&rbi.rcu); |
813 | rcutorture_shutdown_absorb("rcu_torture_boost"); | 813 | rcutorture_shutdown_absorb("rcu_torture_boost"); |
814 | while (!kthread_should_stop() || rbi.inflight) | 814 | while (!kthread_should_stop() || rbi.inflight) |
815 | schedule_timeout_uninterruptible(1); | 815 | schedule_timeout_uninterruptible(1); |
816 | smp_mb(); /* order accesses to ->inflight before stack-frame death. */ | 816 | smp_mb(); /* order accesses to ->inflight before stack-frame death. */ |
817 | return 0; | 817 | return 0; |
818 | } | 818 | } |
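The boost test's core measurement is the single ->inflight flag: a callback clears it, and if the flag is still set once a deadline has passed, a failure is counted. The kernel check above is keyed off jiffies and the configured boost duration; the sketch below simplifies that to one deadline, with a deliberately slow worker thread playing the delayed callback and all names hypothetical:

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>
#include <time.h>
#include <unistd.h>

static atomic_int inflight;

/* The "callback", deliberately delayed so the deadline check trips. */
static void *slow_callback(void *arg)
{
	(void)arg;
	usleep(200 * 1000);
	atomic_store(&inflight, 0);	/* callback finally ran */
	return NULL;
}

static double seconds_since(const struct timespec *start)
{
	struct timespec now;

	clock_gettime(CLOCK_MONOTONIC, &now);
	return (now.tv_sec - start->tv_sec) +
	       (now.tv_nsec - start->tv_nsec) / 1e9;
}

int main(void)
{
	const double deadline = 0.1;	/* allowed callback latency, seconds */
	struct timespec posted;
	pthread_t t;
	int failures = 0;

	atomic_store(&inflight, 1);	/* "post" one callback */
	clock_gettime(CLOCK_MONOTONIC, &posted);
	pthread_create(&t, NULL, slow_callback, NULL);

	while (atomic_load(&inflight)) {
		if (seconds_since(&posted) > deadline) {
			failures++;	/* analogous role to the boost-failure counter */
			break;
		}
		usleep(1000);
	}
	pthread_join(t, NULL);
	printf("deadline misses: %d\n", failures);
	return 0;
}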
819 | 819 | ||
820 | /* | 820 | /* |
821 | * RCU torture force-quiescent-state kthread. Repeatedly induces | 821 | * RCU torture force-quiescent-state kthread. Repeatedly induces |
822 | * bursts of calls to force_quiescent_state(), increasing the probability | 822 | * bursts of calls to force_quiescent_state(), increasing the probability |
823 | * of occurrence of some important types of race conditions. | 823 | * of occurrence of some important types of race conditions. |
824 | */ | 824 | */ |
825 | static int | 825 | static int |
826 | rcu_torture_fqs(void *arg) | 826 | rcu_torture_fqs(void *arg) |
827 | { | 827 | { |
828 | unsigned long fqs_resume_time; | 828 | unsigned long fqs_resume_time; |
829 | int fqs_burst_remaining; | 829 | int fqs_burst_remaining; |
830 | 830 | ||
831 | VERBOSE_PRINTK_STRING("rcu_torture_fqs task started"); | 831 | VERBOSE_PRINTK_STRING("rcu_torture_fqs task started"); |
832 | do { | 832 | do { |
833 | fqs_resume_time = jiffies + fqs_stutter * HZ; | 833 | fqs_resume_time = jiffies + fqs_stutter * HZ; |
834 | while (jiffies - fqs_resume_time > LONG_MAX) { | 834 | while (jiffies - fqs_resume_time > LONG_MAX) { |
835 | schedule_timeout_interruptible(1); | 835 | schedule_timeout_interruptible(1); |
836 | } | 836 | } |
837 | fqs_burst_remaining = fqs_duration; | 837 | fqs_burst_remaining = fqs_duration; |
838 | while (fqs_burst_remaining > 0) { | 838 | while (fqs_burst_remaining > 0) { |
839 | cur_ops->fqs(); | 839 | cur_ops->fqs(); |
840 | udelay(fqs_holdoff); | 840 | udelay(fqs_holdoff); |
841 | fqs_burst_remaining -= fqs_holdoff; | 841 | fqs_burst_remaining -= fqs_holdoff; |
842 | } | 842 | } |
843 | rcu_stutter_wait("rcu_torture_fqs"); | 843 | rcu_stutter_wait("rcu_torture_fqs"); |
844 | } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); | 844 | } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); |
845 | VERBOSE_PRINTK_STRING("rcu_torture_fqs task stopping"); | 845 | VERBOSE_PRINTK_STRING("rcu_torture_fqs task stopping"); |
846 | rcutorture_shutdown_absorb("rcu_torture_fqs"); | 846 | rcutorture_shutdown_absorb("rcu_torture_fqs"); |
847 | while (!kthread_should_stop()) | 847 | while (!kthread_should_stop()) |
848 | schedule_timeout_uninterruptible(1); | 848 | schedule_timeout_uninterruptible(1); |
849 | return 0; | 849 | return 0; |
850 | } | 850 | } |
851 | 851 | ||
852 | /* | 852 | /* |
853 | * RCU torture writer kthread. Repeatedly substitutes a new structure | 853 | * RCU torture writer kthread. Repeatedly substitutes a new structure |
854 | * for that pointed to by rcu_torture_current, freeing the old structure | 854 | * for that pointed to by rcu_torture_current, freeing the old structure |
855 | * after a series of grace periods (the "pipeline"). | 855 | * after a series of grace periods (the "pipeline"). |
856 | */ | 856 | */ |
857 | static int | 857 | static int |
858 | rcu_torture_writer(void *arg) | 858 | rcu_torture_writer(void *arg) |
859 | { | 859 | { |
860 | int i; | 860 | int i; |
861 | long oldbatch = rcu_batches_completed(); | 861 | long oldbatch = rcu_batches_completed(); |
862 | struct rcu_torture *rp; | 862 | struct rcu_torture *rp; |
863 | struct rcu_torture *old_rp; | 863 | struct rcu_torture *old_rp; |
864 | static DEFINE_RCU_RANDOM(rand); | 864 | static DEFINE_RCU_RANDOM(rand); |
865 | 865 | ||
866 | VERBOSE_PRINTK_STRING("rcu_torture_writer task started"); | 866 | VERBOSE_PRINTK_STRING("rcu_torture_writer task started"); |
867 | set_user_nice(current, 19); | 867 | set_user_nice(current, 19); |
868 | 868 | ||
869 | do { | 869 | do { |
870 | schedule_timeout_uninterruptible(1); | 870 | schedule_timeout_uninterruptible(1); |
871 | rp = rcu_torture_alloc(); | 871 | rp = rcu_torture_alloc(); |
872 | if (rp == NULL) | 872 | if (rp == NULL) |
873 | continue; | 873 | continue; |
874 | rp->rtort_pipe_count = 0; | 874 | rp->rtort_pipe_count = 0; |
875 | udelay(rcu_random(&rand) & 0x3ff); | 875 | udelay(rcu_random(&rand) & 0x3ff); |
876 | old_rp = rcu_dereference_check(rcu_torture_current, | 876 | old_rp = rcu_dereference_check(rcu_torture_current, |
877 | current == writer_task); | 877 | current == writer_task); |
878 | rp->rtort_mbtest = 1; | 878 | rp->rtort_mbtest = 1; |
879 | rcu_assign_pointer(rcu_torture_current, rp); | 879 | rcu_assign_pointer(rcu_torture_current, rp); |
880 | smp_wmb(); /* Mods to old_rp must follow rcu_assign_pointer() */ | 880 | smp_wmb(); /* Mods to old_rp must follow rcu_assign_pointer() */ |
881 | if (old_rp) { | 881 | if (old_rp) { |
882 | i = old_rp->rtort_pipe_count; | 882 | i = old_rp->rtort_pipe_count; |
883 | if (i > RCU_TORTURE_PIPE_LEN) | 883 | if (i > RCU_TORTURE_PIPE_LEN) |
884 | i = RCU_TORTURE_PIPE_LEN; | 884 | i = RCU_TORTURE_PIPE_LEN; |
885 | atomic_inc(&rcu_torture_wcount[i]); | 885 | atomic_inc(&rcu_torture_wcount[i]); |
886 | old_rp->rtort_pipe_count++; | 886 | old_rp->rtort_pipe_count++; |
887 | cur_ops->deferred_free(old_rp); | 887 | cur_ops->deferred_free(old_rp); |
888 | } | 888 | } |
889 | rcutorture_record_progress(++rcu_torture_current_version); | 889 | rcutorture_record_progress(++rcu_torture_current_version); |
890 | oldbatch = cur_ops->completed(); | 890 | oldbatch = cur_ops->completed(); |
891 | rcu_stutter_wait("rcu_torture_writer"); | 891 | rcu_stutter_wait("rcu_torture_writer"); |
892 | } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); | 892 | } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); |
893 | VERBOSE_PRINTK_STRING("rcu_torture_writer task stopping"); | 893 | VERBOSE_PRINTK_STRING("rcu_torture_writer task stopping"); |
894 | rcutorture_shutdown_absorb("rcu_torture_writer"); | 894 | rcutorture_shutdown_absorb("rcu_torture_writer"); |
895 | while (!kthread_should_stop()) | 895 | while (!kthread_should_stop()) |
896 | schedule_timeout_uninterruptible(1); | 896 | schedule_timeout_uninterruptible(1); |
897 | return 0; | 897 | return 0; |
898 | } | 898 | } |
899 | 899 | ||
900 | /* | 900 | /* |
901 | * RCU torture fake writer kthread. Repeatedly calls sync, with a random | 901 | * RCU torture fake writer kthread. Repeatedly calls sync, with a random |
902 | * delay between calls. | 902 | * delay between calls. |
903 | */ | 903 | */ |
904 | static int | 904 | static int |
905 | rcu_torture_fakewriter(void *arg) | 905 | rcu_torture_fakewriter(void *arg) |
906 | { | 906 | { |
907 | DEFINE_RCU_RANDOM(rand); | 907 | DEFINE_RCU_RANDOM(rand); |
908 | 908 | ||
909 | VERBOSE_PRINTK_STRING("rcu_torture_fakewriter task started"); | 909 | VERBOSE_PRINTK_STRING("rcu_torture_fakewriter task started"); |
910 | set_user_nice(current, 19); | 910 | set_user_nice(current, 19); |
911 | 911 | ||
912 | do { | 912 | do { |
913 | schedule_timeout_uninterruptible(1 + rcu_random(&rand)%10); | 913 | schedule_timeout_uninterruptible(1 + rcu_random(&rand)%10); |
914 | udelay(rcu_random(&rand) & 0x3ff); | 914 | udelay(rcu_random(&rand) & 0x3ff); |
915 | cur_ops->sync(); | 915 | cur_ops->sync(); |
916 | rcu_stutter_wait("rcu_torture_fakewriter"); | 916 | rcu_stutter_wait("rcu_torture_fakewriter"); |
917 | } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); | 917 | } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); |
918 | 918 | ||
919 | VERBOSE_PRINTK_STRING("rcu_torture_fakewriter task stopping"); | 919 | VERBOSE_PRINTK_STRING("rcu_torture_fakewriter task stopping"); |
920 | rcutorture_shutdown_absorb("rcu_torture_fakewriter"); | 920 | rcutorture_shutdown_absorb("rcu_torture_fakewriter"); |
921 | while (!kthread_should_stop()) | 921 | while (!kthread_should_stop()) |
922 | schedule_timeout_uninterruptible(1); | 922 | schedule_timeout_uninterruptible(1); |
923 | return 0; | 923 | return 0; |
924 | } | 924 | } |
925 | 925 | ||
926 | /* | 926 | /* |
927 | * RCU torture reader from timer handler. Dereferences rcu_torture_current, | 927 | * RCU torture reader from timer handler. Dereferences rcu_torture_current, |
928 | * incrementing the corresponding element of the pipeline array. The | 928 | * incrementing the corresponding element of the pipeline array. The |
929 | * counter in the element should never be greater than 1, otherwise, the | 929 | * counter in the element should never be greater than 1, otherwise, the |
930 | * RCU implementation is broken. | 930 | * RCU implementation is broken. |
931 | */ | 931 | */ |
932 | static void rcu_torture_timer(unsigned long unused) | 932 | static void rcu_torture_timer(unsigned long unused) |
933 | { | 933 | { |
934 | int idx; | 934 | int idx; |
935 | int completed; | 935 | int completed; |
936 | static DEFINE_RCU_RANDOM(rand); | 936 | static DEFINE_RCU_RANDOM(rand); |
937 | static DEFINE_SPINLOCK(rand_lock); | 937 | static DEFINE_SPINLOCK(rand_lock); |
938 | struct rcu_torture *p; | 938 | struct rcu_torture *p; |
939 | int pipe_count; | 939 | int pipe_count; |
940 | 940 | ||
941 | idx = cur_ops->readlock(); | 941 | idx = cur_ops->readlock(); |
942 | completed = cur_ops->completed(); | 942 | completed = cur_ops->completed(); |
943 | p = rcu_dereference_check(rcu_torture_current, | 943 | p = rcu_dereference_check(rcu_torture_current, |
944 | rcu_read_lock_held() || | ||
945 | rcu_read_lock_bh_held() || | 944 | rcu_read_lock_bh_held() || |
946 | rcu_read_lock_sched_held() || | 945 | rcu_read_lock_sched_held() || |
947 | srcu_read_lock_held(&srcu_ctl)); | 946 | srcu_read_lock_held(&srcu_ctl)); |
948 | if (p == NULL) { | 947 | if (p == NULL) { |
949 | /* Leave because rcu_torture_writer is not yet underway */ | 948 | /* Leave because rcu_torture_writer is not yet underway */ |
950 | cur_ops->readunlock(idx); | 949 | cur_ops->readunlock(idx); |
951 | return; | 950 | return; |
952 | } | 951 | } |
953 | if (p->rtort_mbtest == 0) | 952 | if (p->rtort_mbtest == 0) |
954 | atomic_inc(&n_rcu_torture_mberror); | 953 | atomic_inc(&n_rcu_torture_mberror); |
955 | spin_lock(&rand_lock); | 954 | spin_lock(&rand_lock); |
956 | cur_ops->read_delay(&rand); | 955 | cur_ops->read_delay(&rand); |
957 | n_rcu_torture_timers++; | 956 | n_rcu_torture_timers++; |
958 | spin_unlock(&rand_lock); | 957 | spin_unlock(&rand_lock); |
959 | preempt_disable(); | 958 | preempt_disable(); |
960 | pipe_count = p->rtort_pipe_count; | 959 | pipe_count = p->rtort_pipe_count; |
961 | if (pipe_count > RCU_TORTURE_PIPE_LEN) { | 960 | if (pipe_count > RCU_TORTURE_PIPE_LEN) { |
962 | /* Should not happen, but... */ | 961 | /* Should not happen, but... */ |
963 | pipe_count = RCU_TORTURE_PIPE_LEN; | 962 | pipe_count = RCU_TORTURE_PIPE_LEN; |
964 | } | 963 | } |
965 | __this_cpu_inc(rcu_torture_count[pipe_count]); | 964 | __this_cpu_inc(rcu_torture_count[pipe_count]); |
966 | completed = cur_ops->completed() - completed; | 965 | completed = cur_ops->completed() - completed; |
967 | if (completed > RCU_TORTURE_PIPE_LEN) { | 966 | if (completed > RCU_TORTURE_PIPE_LEN) { |
968 | /* Should not happen, but... */ | 967 | /* Should not happen, but... */ |
969 | completed = RCU_TORTURE_PIPE_LEN; | 968 | completed = RCU_TORTURE_PIPE_LEN; |
970 | } | 969 | } |
971 | __this_cpu_inc(rcu_torture_batch[completed]); | 970 | __this_cpu_inc(rcu_torture_batch[completed]); |
972 | preempt_enable(); | 971 | preempt_enable(); |
973 | cur_ops->readunlock(idx); | 972 | cur_ops->readunlock(idx); |
974 | } | 973 | } |
975 | 974 | ||
976 | /* | 975 | /* |
977 | * RCU torture reader kthread. Repeatedly dereferences rcu_torture_current, | 976 | * RCU torture reader kthread. Repeatedly dereferences rcu_torture_current, |
978 | * incrementing the corresponding element of the pipeline array. The | 977 | * incrementing the corresponding element of the pipeline array. The |
979 | * counter in the element should never be greater than 1, otherwise, the | 978 | * counter in the element should never be greater than 1, otherwise, the |
980 | * RCU implementation is broken. | 979 | * RCU implementation is broken. |
981 | */ | 980 | */ |
982 | static int | 981 | static int |
983 | rcu_torture_reader(void *arg) | 982 | rcu_torture_reader(void *arg) |
984 | { | 983 | { |
985 | int completed; | 984 | int completed; |
986 | int idx; | 985 | int idx; |
987 | DEFINE_RCU_RANDOM(rand); | 986 | DEFINE_RCU_RANDOM(rand); |
988 | struct rcu_torture *p; | 987 | struct rcu_torture *p; |
989 | int pipe_count; | 988 | int pipe_count; |
990 | struct timer_list t; | 989 | struct timer_list t; |
991 | 990 | ||
992 | VERBOSE_PRINTK_STRING("rcu_torture_reader task started"); | 991 | VERBOSE_PRINTK_STRING("rcu_torture_reader task started"); |
993 | set_user_nice(current, 19); | 992 | set_user_nice(current, 19); |
994 | if (irqreader && cur_ops->irq_capable) | 993 | if (irqreader && cur_ops->irq_capable) |
995 | setup_timer_on_stack(&t, rcu_torture_timer, 0); | 994 | setup_timer_on_stack(&t, rcu_torture_timer, 0); |
996 | 995 | ||
997 | do { | 996 | do { |
998 | if (irqreader && cur_ops->irq_capable) { | 997 | if (irqreader && cur_ops->irq_capable) { |
999 | if (!timer_pending(&t)) | 998 | if (!timer_pending(&t)) |
1000 | mod_timer(&t, jiffies + 1); | 999 | mod_timer(&t, jiffies + 1); |
1001 | } | 1000 | } |
1002 | idx = cur_ops->readlock(); | 1001 | idx = cur_ops->readlock(); |
1003 | completed = cur_ops->completed(); | 1002 | completed = cur_ops->completed(); |
1004 | p = rcu_dereference_check(rcu_torture_current, | 1003 | p = rcu_dereference_check(rcu_torture_current, |
1005 | rcu_read_lock_held() || | ||
1006 | rcu_read_lock_bh_held() || | 1004 | rcu_read_lock_bh_held() || |
1007 | rcu_read_lock_sched_held() || | 1005 | rcu_read_lock_sched_held() || |
1008 | srcu_read_lock_held(&srcu_ctl)); | 1006 | srcu_read_lock_held(&srcu_ctl)); |
1009 | if (p == NULL) { | 1007 | if (p == NULL) { |
1010 | /* Wait for rcu_torture_writer to get underway */ | 1008 | /* Wait for rcu_torture_writer to get underway */ |
1011 | cur_ops->readunlock(idx); | 1009 | cur_ops->readunlock(idx); |
1012 | schedule_timeout_interruptible(HZ); | 1010 | schedule_timeout_interruptible(HZ); |
1013 | continue; | 1011 | continue; |
1014 | } | 1012 | } |
1015 | if (p->rtort_mbtest == 0) | 1013 | if (p->rtort_mbtest == 0) |
1016 | atomic_inc(&n_rcu_torture_mberror); | 1014 | atomic_inc(&n_rcu_torture_mberror); |
1017 | cur_ops->read_delay(&rand); | 1015 | cur_ops->read_delay(&rand); |
1018 | preempt_disable(); | 1016 | preempt_disable(); |
1019 | pipe_count = p->rtort_pipe_count; | 1017 | pipe_count = p->rtort_pipe_count; |
1020 | if (pipe_count > RCU_TORTURE_PIPE_LEN) { | 1018 | if (pipe_count > RCU_TORTURE_PIPE_LEN) { |
1021 | /* Should not happen, but... */ | 1019 | /* Should not happen, but... */ |
1022 | pipe_count = RCU_TORTURE_PIPE_LEN; | 1020 | pipe_count = RCU_TORTURE_PIPE_LEN; |
1023 | } | 1021 | } |
1024 | __this_cpu_inc(rcu_torture_count[pipe_count]); | 1022 | __this_cpu_inc(rcu_torture_count[pipe_count]); |
1025 | completed = cur_ops->completed() - completed; | 1023 | completed = cur_ops->completed() - completed; |
1026 | if (completed > RCU_TORTURE_PIPE_LEN) { | 1024 | if (completed > RCU_TORTURE_PIPE_LEN) { |
1027 | /* Should not happen, but... */ | 1025 | /* Should not happen, but... */ |
1028 | completed = RCU_TORTURE_PIPE_LEN; | 1026 | completed = RCU_TORTURE_PIPE_LEN; |
1029 | } | 1027 | } |
1030 | __this_cpu_inc(rcu_torture_batch[completed]); | 1028 | __this_cpu_inc(rcu_torture_batch[completed]); |
1031 | preempt_enable(); | 1029 | preempt_enable(); |
1032 | cur_ops->readunlock(idx); | 1030 | cur_ops->readunlock(idx); |
1033 | schedule(); | 1031 | schedule(); |
1034 | rcu_stutter_wait("rcu_torture_reader"); | 1032 | rcu_stutter_wait("rcu_torture_reader"); |
1035 | } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); | 1033 | } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); |
1036 | VERBOSE_PRINTK_STRING("rcu_torture_reader task stopping"); | 1034 | VERBOSE_PRINTK_STRING("rcu_torture_reader task stopping"); |
1037 | rcutorture_shutdown_absorb("rcu_torture_reader"); | 1035 | rcutorture_shutdown_absorb("rcu_torture_reader"); |
1038 | if (irqreader && cur_ops->irq_capable) | 1036 | if (irqreader && cur_ops->irq_capable) |
1039 | del_timer_sync(&t); | 1037 | del_timer_sync(&t); |
1040 | while (!kthread_should_stop()) | 1038 | while (!kthread_should_stop()) |
1041 | schedule_timeout_uninterruptible(1); | 1039 | schedule_timeout_uninterruptible(1); |
1042 | return 0; | 1040 | return 0; |
1043 | } | 1041 | } |
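rcu_torture_reader() drives the timer-handler reader by arming an on-stack timer once per loop pass and tearing it down with del_timer_sync() before the stack frame disappears. A minimal sketch of that on-stack timer pattern, assuming kernel context; my_handler() and my_kthread() are placeholder names:

        /* Sketch only: my_handler() and my_kthread() are placeholders. */
        static void my_handler(unsigned long unused)
        {
                /* runs in softirq context, like rcu_torture_timer() above */
        }

        static int my_kthread(void *arg)
        {
                struct timer_list t;

                setup_timer_on_stack(&t, my_handler, 0);
                while (!kthread_should_stop()) {
                        if (!timer_pending(&t))
                                mod_timer(&t, jiffies + 1);     /* fire on the next tick */
                        schedule_timeout_interruptible(1);
                }
                del_timer_sync(&t);     /* must finish before &t goes out of scope */
                return 0;
        }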
1044 | 1042 | ||
1045 | /* | 1043 | /* |
1046 | * Create an RCU-torture statistics message in the specified buffer. | 1044 | * Create an RCU-torture statistics message in the specified buffer. |
1047 | */ | 1045 | */ |
1048 | static int | 1046 | static int |
1049 | rcu_torture_printk(char *page) | 1047 | rcu_torture_printk(char *page) |
1050 | { | 1048 | { |
1051 | int cnt = 0; | 1049 | int cnt = 0; |
1052 | int cpu; | 1050 | int cpu; |
1053 | int i; | 1051 | int i; |
1054 | long pipesummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 }; | 1052 | long pipesummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 }; |
1055 | long batchsummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 }; | 1053 | long batchsummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 }; |
1056 | 1054 | ||
1057 | for_each_possible_cpu(cpu) { | 1055 | for_each_possible_cpu(cpu) { |
1058 | for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) { | 1056 | for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) { |
1059 | pipesummary[i] += per_cpu(rcu_torture_count, cpu)[i]; | 1057 | pipesummary[i] += per_cpu(rcu_torture_count, cpu)[i]; |
1060 | batchsummary[i] += per_cpu(rcu_torture_batch, cpu)[i]; | 1058 | batchsummary[i] += per_cpu(rcu_torture_batch, cpu)[i]; |
1061 | } | 1059 | } |
1062 | } | 1060 | } |
1063 | for (i = RCU_TORTURE_PIPE_LEN - 1; i >= 0; i--) { | 1061 | for (i = RCU_TORTURE_PIPE_LEN - 1; i >= 0; i--) { |
1064 | if (pipesummary[i] != 0) | 1062 | if (pipesummary[i] != 0) |
1065 | break; | 1063 | break; |
1066 | } | 1064 | } |
1067 | cnt += sprintf(&page[cnt], "%s%s ", torture_type, TORTURE_FLAG); | 1065 | cnt += sprintf(&page[cnt], "%s%s ", torture_type, TORTURE_FLAG); |
1068 | cnt += sprintf(&page[cnt], | 1066 | cnt += sprintf(&page[cnt], |
1069 | "rtc: %p ver: %lu tfle: %d rta: %d rtaf: %d rtf: %d " | 1067 | "rtc: %p ver: %lu tfle: %d rta: %d rtaf: %d rtf: %d " |
1070 | "rtmbe: %d rtbke: %ld rtbre: %ld " | 1068 | "rtmbe: %d rtbke: %ld rtbre: %ld " |
1071 | "rtbf: %ld rtb: %ld nt: %ld", | 1069 | "rtbf: %ld rtb: %ld nt: %ld", |
1072 | rcu_torture_current, | 1070 | rcu_torture_current, |
1073 | rcu_torture_current_version, | 1071 | rcu_torture_current_version, |
1074 | list_empty(&rcu_torture_freelist), | 1072 | list_empty(&rcu_torture_freelist), |
1075 | atomic_read(&n_rcu_torture_alloc), | 1073 | atomic_read(&n_rcu_torture_alloc), |
1076 | atomic_read(&n_rcu_torture_alloc_fail), | 1074 | atomic_read(&n_rcu_torture_alloc_fail), |
1077 | atomic_read(&n_rcu_torture_free), | 1075 | atomic_read(&n_rcu_torture_free), |
1078 | atomic_read(&n_rcu_torture_mberror), | 1076 | atomic_read(&n_rcu_torture_mberror), |
1079 | n_rcu_torture_boost_ktrerror, | 1077 | n_rcu_torture_boost_ktrerror, |
1080 | n_rcu_torture_boost_rterror, | 1078 | n_rcu_torture_boost_rterror, |
1081 | n_rcu_torture_boost_failure, | 1079 | n_rcu_torture_boost_failure, |
1082 | n_rcu_torture_boosts, | 1080 | n_rcu_torture_boosts, |
1083 | n_rcu_torture_timers); | 1081 | n_rcu_torture_timers); |
1084 | if (atomic_read(&n_rcu_torture_mberror) != 0 || | 1082 | if (atomic_read(&n_rcu_torture_mberror) != 0 || |
1085 | n_rcu_torture_boost_ktrerror != 0 || | 1083 | n_rcu_torture_boost_ktrerror != 0 || |
1086 | n_rcu_torture_boost_rterror != 0 || | 1084 | n_rcu_torture_boost_rterror != 0 || |
1087 | n_rcu_torture_boost_failure != 0) | 1085 | n_rcu_torture_boost_failure != 0) |
1088 | cnt += sprintf(&page[cnt], " !!!"); | 1086 | cnt += sprintf(&page[cnt], " !!!"); |
1089 | cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG); | 1087 | cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG); |
1090 | if (i > 1) { | 1088 | if (i > 1) { |
1091 | cnt += sprintf(&page[cnt], "!!! "); | 1089 | cnt += sprintf(&page[cnt], "!!! "); |
1092 | atomic_inc(&n_rcu_torture_error); | 1090 | atomic_inc(&n_rcu_torture_error); |
1093 | WARN_ON_ONCE(1); | 1091 | WARN_ON_ONCE(1); |
1094 | } | 1092 | } |
1095 | cnt += sprintf(&page[cnt], "Reader Pipe: "); | 1093 | cnt += sprintf(&page[cnt], "Reader Pipe: "); |
1096 | for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) | 1094 | for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) |
1097 | cnt += sprintf(&page[cnt], " %ld", pipesummary[i]); | 1095 | cnt += sprintf(&page[cnt], " %ld", pipesummary[i]); |
1098 | cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG); | 1096 | cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG); |
1099 | cnt += sprintf(&page[cnt], "Reader Batch: "); | 1097 | cnt += sprintf(&page[cnt], "Reader Batch: "); |
1100 | for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) | 1098 | for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) |
1101 | cnt += sprintf(&page[cnt], " %ld", batchsummary[i]); | 1099 | cnt += sprintf(&page[cnt], " %ld", batchsummary[i]); |
1102 | cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG); | 1100 | cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG); |
1103 | cnt += sprintf(&page[cnt], "Free-Block Circulation: "); | 1101 | cnt += sprintf(&page[cnt], "Free-Block Circulation: "); |
1104 | for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) { | 1102 | for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) { |
1105 | cnt += sprintf(&page[cnt], " %d", | 1103 | cnt += sprintf(&page[cnt], " %d", |
1106 | atomic_read(&rcu_torture_wcount[i])); | 1104 | atomic_read(&rcu_torture_wcount[i])); |
1107 | } | 1105 | } |
1108 | cnt += sprintf(&page[cnt], "\n"); | 1106 | cnt += sprintf(&page[cnt], "\n"); |
1109 | if (cur_ops->stats) | 1107 | if (cur_ops->stats) |
1110 | cnt += cur_ops->stats(&page[cnt]); | 1108 | cnt += cur_ops->stats(&page[cnt]); |
1111 | return cnt; | 1109 | return cnt; |
1112 | } | 1110 | } |
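The readers above bump per-CPU histograms with __this_cpu_inc() while preemption is disabled, and rcu_torture_printk() folds those per-CPU arrays into single summaries by walking every possible CPU. A reduced sketch of that update/sum split, assuming kernel context and a hypothetical my_hist counter:

        /* Hypothetical per-CPU histogram, updated and summed the same way as
         * rcu_torture_count/rcu_torture_batch above. */
        static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], my_hist);

        static void my_hist_bump(int bucket)
        {
                __this_cpu_inc(my_hist[bucket]);        /* caller has preemption disabled */
        }

        static long my_hist_sum(int bucket)
        {
                long sum = 0;
                int cpu;

                for_each_possible_cpu(cpu)
                        sum += per_cpu(my_hist, cpu)[bucket];   /* racy, fine for statistics */
                return sum;
        }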
1113 | 1111 | ||
1114 | /* | 1112 | /* |
1115 | * Print torture statistics. Caller must ensure that there is only | 1113 | * Print torture statistics. Caller must ensure that there is only |
1116 | * one call to this function at a given time!!! This is normally | 1114 | * one call to this function at a given time!!! This is normally |
1117 | * accomplished by relying on the module system to only have one copy | 1115 | * accomplished by relying on the module system to only have one copy |
1118 | * of the module loaded, and then by giving the rcu_torture_stats | 1116 | * of the module loaded, and then by giving the rcu_torture_stats |
1119 | * kthread full control (or the init/cleanup functions when rcu_torture_stats | 1117 | * kthread full control (or the init/cleanup functions when rcu_torture_stats |
1120 | * thread is not running). | 1118 | * thread is not running). |
1121 | */ | 1119 | */ |
1122 | static void | 1120 | static void |
1123 | rcu_torture_stats_print(void) | 1121 | rcu_torture_stats_print(void) |
1124 | { | 1122 | { |
1125 | int cnt; | 1123 | int cnt; |
1126 | 1124 | ||
1127 | cnt = rcu_torture_printk(printk_buf); | 1125 | cnt = rcu_torture_printk(printk_buf); |
1128 | printk(KERN_ALERT "%s", printk_buf); | 1126 | printk(KERN_ALERT "%s", printk_buf); |
1129 | } | 1127 | } |
1130 | 1128 | ||
1131 | /* | 1129 | /* |
1132 | * Periodically prints torture statistics, if periodic statistics printing | 1130 | * Periodically prints torture statistics, if periodic statistics printing |
1133 | * was specified via the stat_interval module parameter. | 1131 | * was specified via the stat_interval module parameter. |
1134 | * | 1132 | * |
1135 | * No need to worry about fullstop here, since this one doesn't reference | 1133 | * No need to worry about fullstop here, since this one doesn't reference |
1136 | * volatile state or register callbacks. | 1134 | * volatile state or register callbacks. |
1137 | */ | 1135 | */ |
1138 | static int | 1136 | static int |
1139 | rcu_torture_stats(void *arg) | 1137 | rcu_torture_stats(void *arg) |
1140 | { | 1138 | { |
1141 | VERBOSE_PRINTK_STRING("rcu_torture_stats task started"); | 1139 | VERBOSE_PRINTK_STRING("rcu_torture_stats task started"); |
1142 | do { | 1140 | do { |
1143 | schedule_timeout_interruptible(stat_interval * HZ); | 1141 | schedule_timeout_interruptible(stat_interval * HZ); |
1144 | rcu_torture_stats_print(); | 1142 | rcu_torture_stats_print(); |
1145 | rcutorture_shutdown_absorb("rcu_torture_stats"); | 1143 | rcutorture_shutdown_absorb("rcu_torture_stats"); |
1146 | } while (!kthread_should_stop()); | 1144 | } while (!kthread_should_stop()); |
1147 | VERBOSE_PRINTK_STRING("rcu_torture_stats task stopping"); | 1145 | VERBOSE_PRINTK_STRING("rcu_torture_stats task stopping"); |
1148 | return 0; | 1146 | return 0; |
1149 | } | 1147 | } |
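rcu_torture_stats(), together with rcu_torture_shuffle() and rcu_torture_stutter() below, follows one periodic-kthread idiom: sleep for the configured interval, do one unit of work, absorb any pending shutdown, then re-check kthread_should_stop(). Its bare skeleton, assuming kernel context and placeholder do_work()/interval names:

        /* Skeleton of the periodic kthread idiom; do_work() and interval
         * are placeholders. */
        static int my_periodic_kthread(void *arg)
        {
                do {
                        schedule_timeout_interruptible(interval * HZ);
                        do_work();
                } while (!kthread_should_stop());
                return 0;
        }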
1150 | 1148 | ||
1151 | static int rcu_idle_cpu; /* Force all torture tasks off this CPU */ | 1149 | static int rcu_idle_cpu; /* Force all torture tasks off this CPU */ |
1152 | 1150 | ||
1153 | /* Shuffle tasks such that we allow @rcu_idle_cpu to become idle. A special case | 1151 | /* Shuffle tasks such that we allow @rcu_idle_cpu to become idle. A special case |
1154 | * is when @rcu_idle_cpu = -1, when we allow the tasks to run on all CPUs. | 1152 | * is when @rcu_idle_cpu = -1, when we allow the tasks to run on all CPUs. |
1155 | */ | 1153 | */ |
1156 | static void rcu_torture_shuffle_tasks(void) | 1154 | static void rcu_torture_shuffle_tasks(void) |
1157 | { | 1155 | { |
1158 | int i; | 1156 | int i; |
1159 | 1157 | ||
1160 | cpumask_setall(shuffle_tmp_mask); | 1158 | cpumask_setall(shuffle_tmp_mask); |
1161 | get_online_cpus(); | 1159 | get_online_cpus(); |
1162 | 1160 | ||
1163 | /* No point in shuffling if there is only one online CPU (ex: UP) */ | 1161 | /* No point in shuffling if there is only one online CPU (ex: UP) */ |
1164 | if (num_online_cpus() == 1) { | 1162 | if (num_online_cpus() == 1) { |
1165 | put_online_cpus(); | 1163 | put_online_cpus(); |
1166 | return; | 1164 | return; |
1167 | } | 1165 | } |
1168 | 1166 | ||
1169 | if (rcu_idle_cpu != -1) | 1167 | if (rcu_idle_cpu != -1) |
1170 | cpumask_clear_cpu(rcu_idle_cpu, shuffle_tmp_mask); | 1168 | cpumask_clear_cpu(rcu_idle_cpu, shuffle_tmp_mask); |
1171 | 1169 | ||
1172 | set_cpus_allowed_ptr(current, shuffle_tmp_mask); | 1170 | set_cpus_allowed_ptr(current, shuffle_tmp_mask); |
1173 | 1171 | ||
1174 | if (reader_tasks) { | 1172 | if (reader_tasks) { |
1175 | for (i = 0; i < nrealreaders; i++) | 1173 | for (i = 0; i < nrealreaders; i++) |
1176 | if (reader_tasks[i]) | 1174 | if (reader_tasks[i]) |
1177 | set_cpus_allowed_ptr(reader_tasks[i], | 1175 | set_cpus_allowed_ptr(reader_tasks[i], |
1178 | shuffle_tmp_mask); | 1176 | shuffle_tmp_mask); |
1179 | } | 1177 | } |
1180 | 1178 | ||
1181 | if (fakewriter_tasks) { | 1179 | if (fakewriter_tasks) { |
1182 | for (i = 0; i < nfakewriters; i++) | 1180 | for (i = 0; i < nfakewriters; i++) |
1183 | if (fakewriter_tasks[i]) | 1181 | if (fakewriter_tasks[i]) |
1184 | set_cpus_allowed_ptr(fakewriter_tasks[i], | 1182 | set_cpus_allowed_ptr(fakewriter_tasks[i], |
1185 | shuffle_tmp_mask); | 1183 | shuffle_tmp_mask); |
1186 | } | 1184 | } |
1187 | 1185 | ||
1188 | if (writer_task) | 1186 | if (writer_task) |
1189 | set_cpus_allowed_ptr(writer_task, shuffle_tmp_mask); | 1187 | set_cpus_allowed_ptr(writer_task, shuffle_tmp_mask); |
1190 | 1188 | ||
1191 | if (stats_task) | 1189 | if (stats_task) |
1192 | set_cpus_allowed_ptr(stats_task, shuffle_tmp_mask); | 1190 | set_cpus_allowed_ptr(stats_task, shuffle_tmp_mask); |
1193 | 1191 | ||
1194 | if (rcu_idle_cpu == -1) | 1192 | if (rcu_idle_cpu == -1) |
1195 | rcu_idle_cpu = num_online_cpus() - 1; | 1193 | rcu_idle_cpu = num_online_cpus() - 1; |
1196 | else | 1194 | else |
1197 | rcu_idle_cpu--; | 1195 | rcu_idle_cpu--; |
1198 | 1196 | ||
1199 | put_online_cpus(); | 1197 | put_online_cpus(); |
1200 | } | 1198 | } |
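rcu_torture_shuffle_tasks() above builds a scratch cpumask that excludes the current idle-candidate CPU and re-pins every torture task to it, all between get_online_cpus() and put_online_cpus() so the online set cannot change underneath it. The core of that affinity move, with the per-task bookkeeping dropped (kernel context assumed; mask comes from alloc_cpumask_var(), victim_cpu and some_task are placeholders):

        get_online_cpus();
        cpumask_setall(mask);
        if (victim_cpu != -1)
                cpumask_clear_cpu(victim_cpu, mask);    /* leave this CPU with no torture work */
        set_cpus_allowed_ptr(some_task, mask);
        put_online_cpus();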
1201 | 1199 | ||
1202 | /* Shuffle tasks across CPUs, with the intent of allowing each CPU in the | 1200 | /* Shuffle tasks across CPUs, with the intent of allowing each CPU in the |
1203 | * system to become idle at a time and cut off its timer ticks. This is meant | 1201 | * system to become idle at a time and cut off its timer ticks. This is meant |
1204 | * to test the support for such tickless idle CPU in RCU. | 1202 | * to test the support for such tickless idle CPU in RCU. |
1205 | */ | 1203 | */ |
1206 | static int | 1204 | static int |
1207 | rcu_torture_shuffle(void *arg) | 1205 | rcu_torture_shuffle(void *arg) |
1208 | { | 1206 | { |
1209 | VERBOSE_PRINTK_STRING("rcu_torture_shuffle task started"); | 1207 | VERBOSE_PRINTK_STRING("rcu_torture_shuffle task started"); |
1210 | do { | 1208 | do { |
1211 | schedule_timeout_interruptible(shuffle_interval * HZ); | 1209 | schedule_timeout_interruptible(shuffle_interval * HZ); |
1212 | rcu_torture_shuffle_tasks(); | 1210 | rcu_torture_shuffle_tasks(); |
1213 | rcutorture_shutdown_absorb("rcu_torture_shuffle"); | 1211 | rcutorture_shutdown_absorb("rcu_torture_shuffle"); |
1214 | } while (!kthread_should_stop()); | 1212 | } while (!kthread_should_stop()); |
1215 | VERBOSE_PRINTK_STRING("rcu_torture_shuffle task stopping"); | 1213 | VERBOSE_PRINTK_STRING("rcu_torture_shuffle task stopping"); |
1216 | return 0; | 1214 | return 0; |
1217 | } | 1215 | } |
1218 | 1216 | ||
1219 | /* Cause the rcutorture test to "stutter", starting and stopping all | 1217 | /* Cause the rcutorture test to "stutter", starting and stopping all |
1220 | * threads periodically. | 1218 | * threads periodically. |
1221 | */ | 1219 | */ |
1222 | static int | 1220 | static int |
1223 | rcu_torture_stutter(void *arg) | 1221 | rcu_torture_stutter(void *arg) |
1224 | { | 1222 | { |
1225 | VERBOSE_PRINTK_STRING("rcu_torture_stutter task started"); | 1223 | VERBOSE_PRINTK_STRING("rcu_torture_stutter task started"); |
1226 | do { | 1224 | do { |
1227 | schedule_timeout_interruptible(stutter * HZ); | 1225 | schedule_timeout_interruptible(stutter * HZ); |
1228 | stutter_pause_test = 1; | 1226 | stutter_pause_test = 1; |
1229 | if (!kthread_should_stop()) | 1227 | if (!kthread_should_stop()) |
1230 | schedule_timeout_interruptible(stutter * HZ); | 1228 | schedule_timeout_interruptible(stutter * HZ); |
1231 | stutter_pause_test = 0; | 1229 | stutter_pause_test = 0; |
1232 | rcutorture_shutdown_absorb("rcu_torture_stutter"); | 1230 | rcutorture_shutdown_absorb("rcu_torture_stutter"); |
1233 | } while (!kthread_should_stop()); | 1231 | } while (!kthread_should_stop()); |
1234 | VERBOSE_PRINTK_STRING("rcu_torture_stutter task stopping"); | 1232 | VERBOSE_PRINTK_STRING("rcu_torture_stutter task stopping"); |
1235 | return 0; | 1233 | return 0; |
1236 | } | 1234 | } |
1237 | 1235 | ||
1238 | static inline void | 1236 | static inline void |
1239 | rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, char *tag) | 1237 | rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, char *tag) |
1240 | { | 1238 | { |
1241 | printk(KERN_ALERT "%s" TORTURE_FLAG | 1239 | printk(KERN_ALERT "%s" TORTURE_FLAG |
1242 | "--- %s: nreaders=%d nfakewriters=%d " | 1240 | "--- %s: nreaders=%d nfakewriters=%d " |
1243 | "stat_interval=%d verbose=%d test_no_idle_hz=%d " | 1241 | "stat_interval=%d verbose=%d test_no_idle_hz=%d " |
1244 | "shuffle_interval=%d stutter=%d irqreader=%d " | 1242 | "shuffle_interval=%d stutter=%d irqreader=%d " |
1245 | "fqs_duration=%d fqs_holdoff=%d fqs_stutter=%d " | 1243 | "fqs_duration=%d fqs_holdoff=%d fqs_stutter=%d " |
1246 | "test_boost=%d/%d test_boost_interval=%d " | 1244 | "test_boost=%d/%d test_boost_interval=%d " |
1247 | "test_boost_duration=%d\n", | 1245 | "test_boost_duration=%d\n", |
1248 | torture_type, tag, nrealreaders, nfakewriters, | 1246 | torture_type, tag, nrealreaders, nfakewriters, |
1249 | stat_interval, verbose, test_no_idle_hz, shuffle_interval, | 1247 | stat_interval, verbose, test_no_idle_hz, shuffle_interval, |
1250 | stutter, irqreader, fqs_duration, fqs_holdoff, fqs_stutter, | 1248 | stutter, irqreader, fqs_duration, fqs_holdoff, fqs_stutter, |
1251 | test_boost, cur_ops->can_boost, | 1249 | test_boost, cur_ops->can_boost, |
1252 | test_boost_interval, test_boost_duration); | 1250 | test_boost_interval, test_boost_duration); |
1253 | } | 1251 | } |
1254 | 1252 | ||
1255 | static struct notifier_block rcutorture_shutdown_nb = { | 1253 | static struct notifier_block rcutorture_shutdown_nb = { |
1256 | .notifier_call = rcutorture_shutdown_notify, | 1254 | .notifier_call = rcutorture_shutdown_notify, |
1257 | }; | 1255 | }; |
1258 | 1256 | ||
1259 | static void rcutorture_booster_cleanup(int cpu) | 1257 | static void rcutorture_booster_cleanup(int cpu) |
1260 | { | 1258 | { |
1261 | struct task_struct *t; | 1259 | struct task_struct *t; |
1262 | 1260 | ||
1263 | if (boost_tasks[cpu] == NULL) | 1261 | if (boost_tasks[cpu] == NULL) |
1264 | return; | 1262 | return; |
1265 | mutex_lock(&boost_mutex); | 1263 | mutex_lock(&boost_mutex); |
1266 | VERBOSE_PRINTK_STRING("Stopping rcu_torture_boost task"); | 1264 | VERBOSE_PRINTK_STRING("Stopping rcu_torture_boost task"); |
1267 | t = boost_tasks[cpu]; | 1265 | t = boost_tasks[cpu]; |
1268 | boost_tasks[cpu] = NULL; | 1266 | boost_tasks[cpu] = NULL; |
1269 | mutex_unlock(&boost_mutex); | 1267 | mutex_unlock(&boost_mutex); |
1270 | 1268 | ||
1271 | /* This must be outside of the mutex, otherwise deadlock! */ | 1269 | /* This must be outside of the mutex, otherwise deadlock! */ |
1272 | kthread_stop(t); | 1270 | kthread_stop(t); |
1273 | } | 1271 | } |
1274 | 1272 | ||
1275 | static int rcutorture_booster_init(int cpu) | 1273 | static int rcutorture_booster_init(int cpu) |
1276 | { | 1274 | { |
1277 | int retval; | 1275 | int retval; |
1278 | 1276 | ||
1279 | if (boost_tasks[cpu] != NULL) | 1277 | if (boost_tasks[cpu] != NULL) |
1280 | return 0; /* Already created, nothing more to do. */ | 1278 | return 0; /* Already created, nothing more to do. */ |
1281 | 1279 | ||
1282 | /* Don't allow time recalculation while creating a new task. */ | 1280 | /* Don't allow time recalculation while creating a new task. */ |
1283 | mutex_lock(&boost_mutex); | 1281 | mutex_lock(&boost_mutex); |
1284 | VERBOSE_PRINTK_STRING("Creating rcu_torture_boost task"); | 1282 | VERBOSE_PRINTK_STRING("Creating rcu_torture_boost task"); |
1285 | boost_tasks[cpu] = kthread_create(rcu_torture_boost, NULL, | 1283 | boost_tasks[cpu] = kthread_create(rcu_torture_boost, NULL, |
1286 | "rcu_torture_boost"); | 1284 | "rcu_torture_boost"); |
1287 | if (IS_ERR(boost_tasks[cpu])) { | 1285 | if (IS_ERR(boost_tasks[cpu])) { |
1288 | retval = PTR_ERR(boost_tasks[cpu]); | 1286 | retval = PTR_ERR(boost_tasks[cpu]); |
1289 | VERBOSE_PRINTK_STRING("rcu_torture_boost task create failed"); | 1287 | VERBOSE_PRINTK_STRING("rcu_torture_boost task create failed"); |
1290 | n_rcu_torture_boost_ktrerror++; | 1288 | n_rcu_torture_boost_ktrerror++; |
1291 | boost_tasks[cpu] = NULL; | 1289 | boost_tasks[cpu] = NULL; |
1292 | mutex_unlock(&boost_mutex); | 1290 | mutex_unlock(&boost_mutex); |
1293 | return retval; | 1291 | return retval; |
1294 | } | 1292 | } |
1295 | kthread_bind(boost_tasks[cpu], cpu); | 1293 | kthread_bind(boost_tasks[cpu], cpu); |
1296 | wake_up_process(boost_tasks[cpu]); | 1294 | wake_up_process(boost_tasks[cpu]); |
1297 | mutex_unlock(&boost_mutex); | 1295 | mutex_unlock(&boost_mutex); |
1298 | return 0; | 1296 | return 0; |
1299 | } | 1297 | } |
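rcutorture_booster_init() uses the three-step kthread_create()/kthread_bind()/wake_up_process() sequence rather than kthread_run() so the booster is pinned to its CPU before it first runs, and the matching cleanup stops the thread only after dropping boost_mutex (see the deadlock note above). The bound-kthread creation pattern in isolation (kernel context assumed; my_threadfn and cpu are placeholders):

        struct task_struct *t;

        t = kthread_create(my_threadfn, NULL, "my_bound_kthread");
        if (IS_ERR(t))
                return PTR_ERR(t);
        kthread_bind(t, cpu);           /* pin before the first wakeup */
        wake_up_process(t);             /* only now does the thread start running */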
1300 | 1298 | ||
1301 | static int rcutorture_cpu_notify(struct notifier_block *self, | 1299 | static int rcutorture_cpu_notify(struct notifier_block *self, |
1302 | unsigned long action, void *hcpu) | 1300 | unsigned long action, void *hcpu) |
1303 | { | 1301 | { |
1304 | long cpu = (long)hcpu; | 1302 | long cpu = (long)hcpu; |
1305 | 1303 | ||
1306 | switch (action) { | 1304 | switch (action) { |
1307 | case CPU_ONLINE: | 1305 | case CPU_ONLINE: |
1308 | case CPU_DOWN_FAILED: | 1306 | case CPU_DOWN_FAILED: |
1309 | (void)rcutorture_booster_init(cpu); | 1307 | (void)rcutorture_booster_init(cpu); |
1310 | break; | 1308 | break; |
1311 | case CPU_DOWN_PREPARE: | 1309 | case CPU_DOWN_PREPARE: |
1312 | rcutorture_booster_cleanup(cpu); | 1310 | rcutorture_booster_cleanup(cpu); |
1313 | break; | 1311 | break; |
1314 | default: | 1312 | default: |
1315 | break; | 1313 | break; |
1316 | } | 1314 | } |
1317 | return NOTIFY_OK; | 1315 | return NOTIFY_OK; |
1318 | } | 1316 | } |
1319 | 1317 | ||
1320 | static struct notifier_block rcutorture_cpu_nb = { | 1318 | static struct notifier_block rcutorture_cpu_nb = { |
1321 | .notifier_call = rcutorture_cpu_notify, | 1319 | .notifier_call = rcutorture_cpu_notify, |
1322 | }; | 1320 | }; |
1323 | 1321 | ||
1324 | static void | 1322 | static void |
1325 | rcu_torture_cleanup(void) | 1323 | rcu_torture_cleanup(void) |
1326 | { | 1324 | { |
1327 | int i; | 1325 | int i; |
1328 | 1326 | ||
1329 | mutex_lock(&fullstop_mutex); | 1327 | mutex_lock(&fullstop_mutex); |
1330 | rcutorture_record_test_transition(); | 1328 | rcutorture_record_test_transition(); |
1331 | if (fullstop == FULLSTOP_SHUTDOWN) { | 1329 | if (fullstop == FULLSTOP_SHUTDOWN) { |
1332 | printk(KERN_WARNING /* but going down anyway, so... */ | 1330 | printk(KERN_WARNING /* but going down anyway, so... */ |
1333 | "Concurrent 'rmmod rcutorture' and shutdown illegal!\n"); | 1331 | "Concurrent 'rmmod rcutorture' and shutdown illegal!\n"); |
1334 | mutex_unlock(&fullstop_mutex); | 1332 | mutex_unlock(&fullstop_mutex); |
1335 | schedule_timeout_uninterruptible(10); | 1333 | schedule_timeout_uninterruptible(10); |
1336 | if (cur_ops->cb_barrier != NULL) | 1334 | if (cur_ops->cb_barrier != NULL) |
1337 | cur_ops->cb_barrier(); | 1335 | cur_ops->cb_barrier(); |
1338 | return; | 1336 | return; |
1339 | } | 1337 | } |
1340 | fullstop = FULLSTOP_RMMOD; | 1338 | fullstop = FULLSTOP_RMMOD; |
1341 | mutex_unlock(&fullstop_mutex); | 1339 | mutex_unlock(&fullstop_mutex); |
1342 | unregister_reboot_notifier(&rcutorture_shutdown_nb); | 1340 | unregister_reboot_notifier(&rcutorture_shutdown_nb); |
1343 | if (stutter_task) { | 1341 | if (stutter_task) { |
1344 | VERBOSE_PRINTK_STRING("Stopping rcu_torture_stutter task"); | 1342 | VERBOSE_PRINTK_STRING("Stopping rcu_torture_stutter task"); |
1345 | kthread_stop(stutter_task); | 1343 | kthread_stop(stutter_task); |
1346 | } | 1344 | } |
1347 | stutter_task = NULL; | 1345 | stutter_task = NULL; |
1348 | if (shuffler_task) { | 1346 | if (shuffler_task) { |
1349 | VERBOSE_PRINTK_STRING("Stopping rcu_torture_shuffle task"); | 1347 | VERBOSE_PRINTK_STRING("Stopping rcu_torture_shuffle task"); |
1350 | kthread_stop(shuffler_task); | 1348 | kthread_stop(shuffler_task); |
1351 | free_cpumask_var(shuffle_tmp_mask); | 1349 | free_cpumask_var(shuffle_tmp_mask); |
1352 | } | 1350 | } |
1353 | shuffler_task = NULL; | 1351 | shuffler_task = NULL; |
1354 | 1352 | ||
1355 | if (writer_task) { | 1353 | if (writer_task) { |
1356 | VERBOSE_PRINTK_STRING("Stopping rcu_torture_writer task"); | 1354 | VERBOSE_PRINTK_STRING("Stopping rcu_torture_writer task"); |
1357 | kthread_stop(writer_task); | 1355 | kthread_stop(writer_task); |
1358 | } | 1356 | } |
1359 | writer_task = NULL; | 1357 | writer_task = NULL; |
1360 | 1358 | ||
1361 | if (reader_tasks) { | 1359 | if (reader_tasks) { |
1362 | for (i = 0; i < nrealreaders; i++) { | 1360 | for (i = 0; i < nrealreaders; i++) { |
1363 | if (reader_tasks[i]) { | 1361 | if (reader_tasks[i]) { |
1364 | VERBOSE_PRINTK_STRING( | 1362 | VERBOSE_PRINTK_STRING( |
1365 | "Stopping rcu_torture_reader task"); | 1363 | "Stopping rcu_torture_reader task"); |
1366 | kthread_stop(reader_tasks[i]); | 1364 | kthread_stop(reader_tasks[i]); |
1367 | } | 1365 | } |
1368 | reader_tasks[i] = NULL; | 1366 | reader_tasks[i] = NULL; |
1369 | } | 1367 | } |
1370 | kfree(reader_tasks); | 1368 | kfree(reader_tasks); |
1371 | reader_tasks = NULL; | 1369 | reader_tasks = NULL; |
1372 | } | 1370 | } |
1373 | rcu_torture_current = NULL; | 1371 | rcu_torture_current = NULL; |
1374 | 1372 | ||
1375 | if (fakewriter_tasks) { | 1373 | if (fakewriter_tasks) { |
1376 | for (i = 0; i < nfakewriters; i++) { | 1374 | for (i = 0; i < nfakewriters; i++) { |
1377 | if (fakewriter_tasks[i]) { | 1375 | if (fakewriter_tasks[i]) { |
1378 | VERBOSE_PRINTK_STRING( | 1376 | VERBOSE_PRINTK_STRING( |
1379 | "Stopping rcu_torture_fakewriter task"); | 1377 | "Stopping rcu_torture_fakewriter task"); |
1380 | kthread_stop(fakewriter_tasks[i]); | 1378 | kthread_stop(fakewriter_tasks[i]); |
1381 | } | 1379 | } |
1382 | fakewriter_tasks[i] = NULL; | 1380 | fakewriter_tasks[i] = NULL; |
1383 | } | 1381 | } |
1384 | kfree(fakewriter_tasks); | 1382 | kfree(fakewriter_tasks); |
1385 | fakewriter_tasks = NULL; | 1383 | fakewriter_tasks = NULL; |
1386 | } | 1384 | } |
1387 | 1385 | ||
1388 | if (stats_task) { | 1386 | if (stats_task) { |
1389 | VERBOSE_PRINTK_STRING("Stopping rcu_torture_stats task"); | 1387 | VERBOSE_PRINTK_STRING("Stopping rcu_torture_stats task"); |
1390 | kthread_stop(stats_task); | 1388 | kthread_stop(stats_task); |
1391 | } | 1389 | } |
1392 | stats_task = NULL; | 1390 | stats_task = NULL; |
1393 | 1391 | ||
1394 | if (fqs_task) { | 1392 | if (fqs_task) { |
1395 | VERBOSE_PRINTK_STRING("Stopping rcu_torture_fqs task"); | 1393 | VERBOSE_PRINTK_STRING("Stopping rcu_torture_fqs task"); |
1396 | kthread_stop(fqs_task); | 1394 | kthread_stop(fqs_task); |
1397 | } | 1395 | } |
1398 | fqs_task = NULL; | 1396 | fqs_task = NULL; |
1399 | if ((test_boost == 1 && cur_ops->can_boost) || | 1397 | if ((test_boost == 1 && cur_ops->can_boost) || |
1400 | test_boost == 2) { | 1398 | test_boost == 2) { |
1401 | unregister_cpu_notifier(&rcutorture_cpu_nb); | 1399 | unregister_cpu_notifier(&rcutorture_cpu_nb); |
1402 | for_each_possible_cpu(i) | 1400 | for_each_possible_cpu(i) |
1403 | rcutorture_booster_cleanup(i); | 1401 | rcutorture_booster_cleanup(i); |
1404 | } | 1402 | } |
1405 | 1403 | ||
1406 | /* Wait for all RCU callbacks to fire. */ | 1404 | /* Wait for all RCU callbacks to fire. */ |
1407 | 1405 | ||
1408 | if (cur_ops->cb_barrier != NULL) | 1406 | if (cur_ops->cb_barrier != NULL) |
1409 | cur_ops->cb_barrier(); | 1407 | cur_ops->cb_barrier(); |
1410 | 1408 | ||
1411 | rcu_torture_stats_print(); /* -After- the stats thread is stopped! */ | 1409 | rcu_torture_stats_print(); /* -After- the stats thread is stopped! */ |
1412 | 1410 | ||
1413 | if (cur_ops->cleanup) | 1411 | if (cur_ops->cleanup) |
1414 | cur_ops->cleanup(); | 1412 | cur_ops->cleanup(); |
1415 | if (atomic_read(&n_rcu_torture_error)) | 1413 | if (atomic_read(&n_rcu_torture_error)) |
1416 | rcu_torture_print_module_parms(cur_ops, "End of test: FAILURE"); | 1414 | rcu_torture_print_module_parms(cur_ops, "End of test: FAILURE"); |
1417 | else | 1415 | else |
1418 | rcu_torture_print_module_parms(cur_ops, "End of test: SUCCESS"); | 1416 | rcu_torture_print_module_parms(cur_ops, "End of test: SUCCESS"); |
1419 | } | 1417 | } |
1420 | 1418 | ||
1421 | static int __init | 1419 | static int __init |
1422 | rcu_torture_init(void) | 1420 | rcu_torture_init(void) |
1423 | { | 1421 | { |
1424 | int i; | 1422 | int i; |
1425 | int cpu; | 1423 | int cpu; |
1426 | int firsterr = 0; | 1424 | int firsterr = 0; |
1427 | static struct rcu_torture_ops *torture_ops[] = | 1425 | static struct rcu_torture_ops *torture_ops[] = |
1428 | { &rcu_ops, &rcu_sync_ops, &rcu_expedited_ops, | 1426 | { &rcu_ops, &rcu_sync_ops, &rcu_expedited_ops, |
1429 | &rcu_bh_ops, &rcu_bh_sync_ops, | 1427 | &rcu_bh_ops, &rcu_bh_sync_ops, |
1430 | &srcu_ops, &srcu_expedited_ops, | 1428 | &srcu_ops, &srcu_expedited_ops, |
1431 | &sched_ops, &sched_sync_ops, &sched_expedited_ops, }; | 1429 | &sched_ops, &sched_sync_ops, &sched_expedited_ops, }; |
1432 | 1430 | ||
1433 | mutex_lock(&fullstop_mutex); | 1431 | mutex_lock(&fullstop_mutex); |
1434 | 1432 | ||
1435 | /* Process args and tell the world that the torturer is on the job. */ | 1433 | /* Process args and tell the world that the torturer is on the job. */ |
1436 | for (i = 0; i < ARRAY_SIZE(torture_ops); i++) { | 1434 | for (i = 0; i < ARRAY_SIZE(torture_ops); i++) { |
1437 | cur_ops = torture_ops[i]; | 1435 | cur_ops = torture_ops[i]; |
1438 | if (strcmp(torture_type, cur_ops->name) == 0) | 1436 | if (strcmp(torture_type, cur_ops->name) == 0) |
1439 | break; | 1437 | break; |
1440 | } | 1438 | } |
1441 | if (i == ARRAY_SIZE(torture_ops)) { | 1439 | if (i == ARRAY_SIZE(torture_ops)) { |
1442 | printk(KERN_ALERT "rcu-torture: invalid torture type: \"%s\"\n", | 1440 | printk(KERN_ALERT "rcu-torture: invalid torture type: \"%s\"\n", |
1443 | torture_type); | 1441 | torture_type); |
1444 | printk(KERN_ALERT "rcu-torture types:"); | 1442 | printk(KERN_ALERT "rcu-torture types:"); |
1445 | for (i = 0; i < ARRAY_SIZE(torture_ops); i++) | 1443 | for (i = 0; i < ARRAY_SIZE(torture_ops); i++) |
1446 | printk(KERN_ALERT " %s", torture_ops[i]->name); | 1444 | printk(KERN_ALERT " %s", torture_ops[i]->name); |
1447 | printk(KERN_ALERT "\n"); | 1445 | printk(KERN_ALERT "\n"); |
1448 | mutex_unlock(&fullstop_mutex); | 1446 | mutex_unlock(&fullstop_mutex); |
1449 | return -EINVAL; | 1447 | return -EINVAL; |
1450 | } | 1448 | } |
1451 | if (cur_ops->fqs == NULL && fqs_duration != 0) { | 1449 | if (cur_ops->fqs == NULL && fqs_duration != 0) { |
1452 | printk(KERN_ALERT "rcu-torture: ->fqs NULL and non-zero " | 1450 | printk(KERN_ALERT "rcu-torture: ->fqs NULL and non-zero " |
1453 | "fqs_duration, fqs disabled.\n"); | 1451 | "fqs_duration, fqs disabled.\n"); |
1454 | fqs_duration = 0; | 1452 | fqs_duration = 0; |
1455 | } | 1453 | } |
1456 | if (cur_ops->init) | 1454 | if (cur_ops->init) |
1457 | cur_ops->init(); /* no "goto unwind" prior to this point!!! */ | 1455 | cur_ops->init(); /* no "goto unwind" prior to this point!!! */ |
1458 | 1456 | ||
1459 | if (nreaders >= 0) | 1457 | if (nreaders >= 0) |
1460 | nrealreaders = nreaders; | 1458 | nrealreaders = nreaders; |
1461 | else | 1459 | else |
1462 | nrealreaders = 2 * num_online_cpus(); | 1460 | nrealreaders = 2 * num_online_cpus(); |
1463 | rcu_torture_print_module_parms(cur_ops, "Start of test"); | 1461 | rcu_torture_print_module_parms(cur_ops, "Start of test"); |
1464 | fullstop = FULLSTOP_DONTSTOP; | 1462 | fullstop = FULLSTOP_DONTSTOP; |
1465 | 1463 | ||
1466 | /* Set up the freelist. */ | 1464 | /* Set up the freelist. */ |
1467 | 1465 | ||
1468 | INIT_LIST_HEAD(&rcu_torture_freelist); | 1466 | INIT_LIST_HEAD(&rcu_torture_freelist); |
1469 | for (i = 0; i < ARRAY_SIZE(rcu_tortures); i++) { | 1467 | for (i = 0; i < ARRAY_SIZE(rcu_tortures); i++) { |
1470 | rcu_tortures[i].rtort_mbtest = 0; | 1468 | rcu_tortures[i].rtort_mbtest = 0; |
1471 | list_add_tail(&rcu_tortures[i].rtort_free, | 1469 | list_add_tail(&rcu_tortures[i].rtort_free, |
1472 | &rcu_torture_freelist); | 1470 | &rcu_torture_freelist); |
1473 | } | 1471 | } |
1474 | 1472 | ||
1475 | /* Initialize the statistics so that each run gets its own numbers. */ | 1473 | /* Initialize the statistics so that each run gets its own numbers. */ |
1476 | 1474 | ||
1477 | rcu_torture_current = NULL; | 1475 | rcu_torture_current = NULL; |
1478 | rcu_torture_current_version = 0; | 1476 | rcu_torture_current_version = 0; |
1479 | atomic_set(&n_rcu_torture_alloc, 0); | 1477 | atomic_set(&n_rcu_torture_alloc, 0); |
1480 | atomic_set(&n_rcu_torture_alloc_fail, 0); | 1478 | atomic_set(&n_rcu_torture_alloc_fail, 0); |
1481 | atomic_set(&n_rcu_torture_free, 0); | 1479 | atomic_set(&n_rcu_torture_free, 0); |
1482 | atomic_set(&n_rcu_torture_mberror, 0); | 1480 | atomic_set(&n_rcu_torture_mberror, 0); |
1483 | atomic_set(&n_rcu_torture_error, 0); | 1481 | atomic_set(&n_rcu_torture_error, 0); |
1484 | n_rcu_torture_boost_ktrerror = 0; | 1482 | n_rcu_torture_boost_ktrerror = 0; |
1485 | n_rcu_torture_boost_rterror = 0; | 1483 | n_rcu_torture_boost_rterror = 0; |
1486 | n_rcu_torture_boost_failure = 0; | 1484 | n_rcu_torture_boost_failure = 0; |
1487 | n_rcu_torture_boosts = 0; | 1485 | n_rcu_torture_boosts = 0; |
1488 | for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) | 1486 | for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) |
1489 | atomic_set(&rcu_torture_wcount[i], 0); | 1487 | atomic_set(&rcu_torture_wcount[i], 0); |
1490 | for_each_possible_cpu(cpu) { | 1488 | for_each_possible_cpu(cpu) { |
1491 | for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) { | 1489 | for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) { |
1492 | per_cpu(rcu_torture_count, cpu)[i] = 0; | 1490 | per_cpu(rcu_torture_count, cpu)[i] = 0; |
1493 | per_cpu(rcu_torture_batch, cpu)[i] = 0; | 1491 | per_cpu(rcu_torture_batch, cpu)[i] = 0; |
1494 | } | 1492 | } |
1495 | } | 1493 | } |
1496 | 1494 | ||
1497 | /* Start up the kthreads. */ | 1495 | /* Start up the kthreads. */ |
1498 | 1496 | ||
1499 | VERBOSE_PRINTK_STRING("Creating rcu_torture_writer task"); | 1497 | VERBOSE_PRINTK_STRING("Creating rcu_torture_writer task"); |
1500 | writer_task = kthread_run(rcu_torture_writer, NULL, | 1498 | writer_task = kthread_run(rcu_torture_writer, NULL, |
1501 | "rcu_torture_writer"); | 1499 | "rcu_torture_writer"); |
1502 | if (IS_ERR(writer_task)) { | 1500 | if (IS_ERR(writer_task)) { |
1503 | firsterr = PTR_ERR(writer_task); | 1501 | firsterr = PTR_ERR(writer_task); |
1504 | VERBOSE_PRINTK_ERRSTRING("Failed to create writer"); | 1502 | VERBOSE_PRINTK_ERRSTRING("Failed to create writer"); |
1505 | writer_task = NULL; | 1503 | writer_task = NULL; |
1506 | goto unwind; | 1504 | goto unwind; |
1507 | } | 1505 | } |
1508 | fakewriter_tasks = kzalloc(nfakewriters * sizeof(fakewriter_tasks[0]), | 1506 | fakewriter_tasks = kzalloc(nfakewriters * sizeof(fakewriter_tasks[0]), |
1509 | GFP_KERNEL); | 1507 | GFP_KERNEL); |
1510 | if (fakewriter_tasks == NULL) { | 1508 | if (fakewriter_tasks == NULL) { |
1511 | VERBOSE_PRINTK_ERRSTRING("out of memory"); | 1509 | VERBOSE_PRINTK_ERRSTRING("out of memory"); |
1512 | firsterr = -ENOMEM; | 1510 | firsterr = -ENOMEM; |
1513 | goto unwind; | 1511 | goto unwind; |
1514 | } | 1512 | } |
1515 | for (i = 0; i < nfakewriters; i++) { | 1513 | for (i = 0; i < nfakewriters; i++) { |
1516 | VERBOSE_PRINTK_STRING("Creating rcu_torture_fakewriter task"); | 1514 | VERBOSE_PRINTK_STRING("Creating rcu_torture_fakewriter task"); |
1517 | fakewriter_tasks[i] = kthread_run(rcu_torture_fakewriter, NULL, | 1515 | fakewriter_tasks[i] = kthread_run(rcu_torture_fakewriter, NULL, |
1518 | "rcu_torture_fakewriter"); | 1516 | "rcu_torture_fakewriter"); |
1519 | if (IS_ERR(fakewriter_tasks[i])) { | 1517 | if (IS_ERR(fakewriter_tasks[i])) { |
1520 | firsterr = PTR_ERR(fakewriter_tasks[i]); | 1518 | firsterr = PTR_ERR(fakewriter_tasks[i]); |
1521 | VERBOSE_PRINTK_ERRSTRING("Failed to create fakewriter"); | 1519 | VERBOSE_PRINTK_ERRSTRING("Failed to create fakewriter"); |
1522 | fakewriter_tasks[i] = NULL; | 1520 | fakewriter_tasks[i] = NULL; |
1523 | goto unwind; | 1521 | goto unwind; |
1524 | } | 1522 | } |
1525 | } | 1523 | } |
1526 | reader_tasks = kzalloc(nrealreaders * sizeof(reader_tasks[0]), | 1524 | reader_tasks = kzalloc(nrealreaders * sizeof(reader_tasks[0]), |
1527 | GFP_KERNEL); | 1525 | GFP_KERNEL); |
1528 | if (reader_tasks == NULL) { | 1526 | if (reader_tasks == NULL) { |
1529 | VERBOSE_PRINTK_ERRSTRING("out of memory"); | 1527 | VERBOSE_PRINTK_ERRSTRING("out of memory"); |
1530 | firsterr = -ENOMEM; | 1528 | firsterr = -ENOMEM; |
1531 | goto unwind; | 1529 | goto unwind; |
1532 | } | 1530 | } |
1533 | for (i = 0; i < nrealreaders; i++) { | 1531 | for (i = 0; i < nrealreaders; i++) { |
1534 | VERBOSE_PRINTK_STRING("Creating rcu_torture_reader task"); | 1532 | VERBOSE_PRINTK_STRING("Creating rcu_torture_reader task"); |
1535 | reader_tasks[i] = kthread_run(rcu_torture_reader, NULL, | 1533 | reader_tasks[i] = kthread_run(rcu_torture_reader, NULL, |
1536 | "rcu_torture_reader"); | 1534 | "rcu_torture_reader"); |
1537 | if (IS_ERR(reader_tasks[i])) { | 1535 | if (IS_ERR(reader_tasks[i])) { |
1538 | firsterr = PTR_ERR(reader_tasks[i]); | 1536 | firsterr = PTR_ERR(reader_tasks[i]); |
1539 | VERBOSE_PRINTK_ERRSTRING("Failed to create reader"); | 1537 | VERBOSE_PRINTK_ERRSTRING("Failed to create reader"); |
1540 | reader_tasks[i] = NULL; | 1538 | reader_tasks[i] = NULL; |
1541 | goto unwind; | 1539 | goto unwind; |
1542 | } | 1540 | } |
1543 | } | 1541 | } |
1544 | if (stat_interval > 0) { | 1542 | if (stat_interval > 0) { |
1545 | VERBOSE_PRINTK_STRING("Creating rcu_torture_stats task"); | 1543 | VERBOSE_PRINTK_STRING("Creating rcu_torture_stats task"); |
1546 | stats_task = kthread_run(rcu_torture_stats, NULL, | 1544 | stats_task = kthread_run(rcu_torture_stats, NULL, |
1547 | "rcu_torture_stats"); | 1545 | "rcu_torture_stats"); |
1548 | if (IS_ERR(stats_task)) { | 1546 | if (IS_ERR(stats_task)) { |
1549 | firsterr = PTR_ERR(stats_task); | 1547 | firsterr = PTR_ERR(stats_task); |
1550 | VERBOSE_PRINTK_ERRSTRING("Failed to create stats"); | 1548 | VERBOSE_PRINTK_ERRSTRING("Failed to create stats"); |
1551 | stats_task = NULL; | 1549 | stats_task = NULL; |
1552 | goto unwind; | 1550 | goto unwind; |
1553 | } | 1551 | } |
1554 | } | 1552 | } |
1555 | if (test_no_idle_hz) { | 1553 | if (test_no_idle_hz) { |
1556 | rcu_idle_cpu = num_online_cpus() - 1; | 1554 | rcu_idle_cpu = num_online_cpus() - 1; |
1557 | 1555 | ||
1558 | if (!alloc_cpumask_var(&shuffle_tmp_mask, GFP_KERNEL)) { | 1556 | if (!alloc_cpumask_var(&shuffle_tmp_mask, GFP_KERNEL)) { |
1559 | firsterr = -ENOMEM; | 1557 | firsterr = -ENOMEM; |
1560 | VERBOSE_PRINTK_ERRSTRING("Failed to alloc mask"); | 1558 | VERBOSE_PRINTK_ERRSTRING("Failed to alloc mask"); |
1561 | goto unwind; | 1559 | goto unwind; |
1562 | } | 1560 | } |
1563 | 1561 | ||
1564 | /* Create the shuffler thread */ | 1562 | /* Create the shuffler thread */ |
1565 | shuffler_task = kthread_run(rcu_torture_shuffle, NULL, | 1563 | shuffler_task = kthread_run(rcu_torture_shuffle, NULL, |
1566 | "rcu_torture_shuffle"); | 1564 | "rcu_torture_shuffle"); |
1567 | if (IS_ERR(shuffler_task)) { | 1565 | if (IS_ERR(shuffler_task)) { |
1568 | free_cpumask_var(shuffle_tmp_mask); | 1566 | free_cpumask_var(shuffle_tmp_mask); |
1569 | firsterr = PTR_ERR(shuffler_task); | 1567 | firsterr = PTR_ERR(shuffler_task); |
1570 | VERBOSE_PRINTK_ERRSTRING("Failed to create shuffler"); | 1568 | VERBOSE_PRINTK_ERRSTRING("Failed to create shuffler"); |
1571 | shuffler_task = NULL; | 1569 | shuffler_task = NULL; |
1572 | goto unwind; | 1570 | goto unwind; |
1573 | } | 1571 | } |
1574 | } | 1572 | } |
1575 | if (stutter < 0) | 1573 | if (stutter < 0) |
1576 | stutter = 0; | 1574 | stutter = 0; |
1577 | if (stutter) { | 1575 | if (stutter) { |
1578 | /* Create the stutter thread */ | 1576 | /* Create the stutter thread */ |
1579 | stutter_task = kthread_run(rcu_torture_stutter, NULL, | 1577 | stutter_task = kthread_run(rcu_torture_stutter, NULL, |
1580 | "rcu_torture_stutter"); | 1578 | "rcu_torture_stutter"); |
1581 | if (IS_ERR(stutter_task)) { | 1579 | if (IS_ERR(stutter_task)) { |
1582 | firsterr = PTR_ERR(stutter_task); | 1580 | firsterr = PTR_ERR(stutter_task); |
1583 | VERBOSE_PRINTK_ERRSTRING("Failed to create stutter"); | 1581 | VERBOSE_PRINTK_ERRSTRING("Failed to create stutter"); |
1584 | stutter_task = NULL; | 1582 | stutter_task = NULL; |
1585 | goto unwind; | 1583 | goto unwind; |
1586 | } | 1584 | } |
1587 | } | 1585 | } |
1588 | if (fqs_duration < 0) | 1586 | if (fqs_duration < 0) |
1589 | fqs_duration = 0; | 1587 | fqs_duration = 0; |
1590 | if (fqs_duration) { | 1588 | if (fqs_duration) { |
1591 | /* Create the fqs thread */ | 1589 | /* Create the fqs thread */ |
1592 | fqs_task = kthread_run(rcu_torture_fqs, NULL, | 1590 | fqs_task = kthread_run(rcu_torture_fqs, NULL, |
1593 | "rcu_torture_fqs"); | 1591 | "rcu_torture_fqs"); |
1594 | if (IS_ERR(fqs_task)) { | 1592 | if (IS_ERR(fqs_task)) { |
1595 | firsterr = PTR_ERR(fqs_task); | 1593 | firsterr = PTR_ERR(fqs_task); |
1596 | VERBOSE_PRINTK_ERRSTRING("Failed to create fqs"); | 1594 | VERBOSE_PRINTK_ERRSTRING("Failed to create fqs"); |
1597 | fqs_task = NULL; | 1595 | fqs_task = NULL; |
1598 | goto unwind; | 1596 | goto unwind; |
1599 | } | 1597 | } |
1600 | } | 1598 | } |
1601 | if (test_boost_interval < 1) | 1599 | if (test_boost_interval < 1) |
1602 | test_boost_interval = 1; | 1600 | test_boost_interval = 1; |
1603 | if (test_boost_duration < 2) | 1601 | if (test_boost_duration < 2) |
1604 | test_boost_duration = 2; | 1602 | test_boost_duration = 2; |
1605 | if ((test_boost == 1 && cur_ops->can_boost) || | 1603 | if ((test_boost == 1 && cur_ops->can_boost) || |
1606 | test_boost == 2) { | 1604 | test_boost == 2) { |
1607 | int retval; | 1605 | int retval; |
1608 | 1606 | ||
1609 | boost_starttime = jiffies + test_boost_interval * HZ; | 1607 | boost_starttime = jiffies + test_boost_interval * HZ; |
1610 | register_cpu_notifier(&rcutorture_cpu_nb); | 1608 | register_cpu_notifier(&rcutorture_cpu_nb); |
1611 | for_each_possible_cpu(i) { | 1609 | for_each_possible_cpu(i) { |
1612 | if (cpu_is_offline(i)) | 1610 | if (cpu_is_offline(i)) |
1613 | continue; /* Heuristic: CPU can go offline. */ | 1611 | continue; /* Heuristic: CPU can go offline. */ |
1614 | retval = rcutorture_booster_init(i); | 1612 | retval = rcutorture_booster_init(i); |
1615 | if (retval < 0) { | 1613 | if (retval < 0) { |
1616 | firsterr = retval; | 1614 | firsterr = retval; |
1617 | goto unwind; | 1615 | goto unwind; |
1618 | } | 1616 | } |
1619 | } | 1617 | } |
1620 | } | 1618 | } |
1621 | register_reboot_notifier(&rcutorture_shutdown_nb); | 1619 | register_reboot_notifier(&rcutorture_shutdown_nb); |
1622 | rcutorture_record_test_transition(); | 1620 | rcutorture_record_test_transition(); |
1623 | mutex_unlock(&fullstop_mutex); | 1621 | mutex_unlock(&fullstop_mutex); |
1624 | return 0; | 1622 | return 0; |
1625 | 1623 | ||
1626 | unwind: | 1624 | unwind: |
1627 | mutex_unlock(&fullstop_mutex); | 1625 | mutex_unlock(&fullstop_mutex); |
1628 | rcu_torture_cleanup(); | 1626 | rcu_torture_cleanup(); |
1629 | return firsterr; | 1627 | return firsterr; |
1630 | } | 1628 | } |
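rcu_torture_init() funnels every failure through the single unwind: label: record the first error, clear the pointer that failed so rcu_torture_cleanup() skips it, then let the cleanup path tear down whatever was already started. That init-with-unwind shape, with the fullstop_mutex handling omitted (kernel context assumed; my_threadfn, my_task and my_cleanup() are placeholders):

        static struct task_struct *my_task;

        static int __init my_init(void)
        {
                int firsterr = 0;

                my_task = kthread_run(my_threadfn, NULL, "my_task");
                if (IS_ERR(my_task)) {
                        firsterr = PTR_ERR(my_task);
                        my_task = NULL;         /* so cleanup does not stop a bogus pointer */
                        goto unwind;
                }
                return 0;

        unwind:
                my_cleanup();                   /* tears down only what was created */
                return firsterr;
        }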
1631 | 1629 | ||
1632 | module_init(rcu_torture_init); | 1630 | module_init(rcu_torture_init); |
1633 | module_exit(rcu_torture_cleanup); | 1631 | module_exit(rcu_torture_cleanup); |
1634 | 1632 |
kernel/sched.c
1 | /* | 1 | /* |
2 | * kernel/sched.c | 2 | * kernel/sched.c |
3 | * | 3 | * |
4 | * Kernel scheduler and related syscalls | 4 | * Kernel scheduler and related syscalls |
5 | * | 5 | * |
6 | * Copyright (C) 1991-2002 Linus Torvalds | 6 | * Copyright (C) 1991-2002 Linus Torvalds |
7 | * | 7 | * |
8 | * 1996-12-23 Modified by Dave Grothe to fix bugs in semaphores and | 8 | * 1996-12-23 Modified by Dave Grothe to fix bugs in semaphores and |
9 | * make semaphores SMP safe | 9 | * make semaphores SMP safe |
10 | * 1998-11-19 Implemented schedule_timeout() and related stuff | 10 | * 1998-11-19 Implemented schedule_timeout() and related stuff |
11 | * by Andrea Arcangeli | 11 | * by Andrea Arcangeli |
12 | * 2002-01-04 New ultra-scalable O(1) scheduler by Ingo Molnar: | 12 | * 2002-01-04 New ultra-scalable O(1) scheduler by Ingo Molnar: |
13 | * hybrid priority-list and round-robin design with | 13 | * hybrid priority-list and round-robin design with |
14 | * an array-switch method of distributing timeslices | 14 | * an array-switch method of distributing timeslices |
15 | * and per-CPU runqueues. Cleanups and useful suggestions | 15 | * and per-CPU runqueues. Cleanups and useful suggestions |
16 | * by Davide Libenzi, preemptible kernel bits by Robert Love. | 16 | * by Davide Libenzi, preemptible kernel bits by Robert Love. |
17 | * 2003-09-03 Interactivity tuning by Con Kolivas. | 17 | * 2003-09-03 Interactivity tuning by Con Kolivas. |
18 | * 2004-04-02 Scheduler domains code by Nick Piggin | 18 | * 2004-04-02 Scheduler domains code by Nick Piggin |
19 | * 2007-04-15 Work begun on replacing all interactivity tuning with a | 19 | * 2007-04-15 Work begun on replacing all interactivity tuning with a |
20 | * fair scheduling design by Con Kolivas. | 20 | * fair scheduling design by Con Kolivas. |
21 | * 2007-05-05 Load balancing (smp-nice) and other improvements | 21 | * 2007-05-05 Load balancing (smp-nice) and other improvements |
22 | * by Peter Williams | 22 | * by Peter Williams |
23 | * 2007-05-06 Interactivity improvements to CFS by Mike Galbraith | 23 | * 2007-05-06 Interactivity improvements to CFS by Mike Galbraith |
24 | * 2007-07-01 Group scheduling enhancements by Srivatsa Vaddagiri | 24 | * 2007-07-01 Group scheduling enhancements by Srivatsa Vaddagiri |
25 | * 2007-11-29 RT balancing improvements by Steven Rostedt, Gregory Haskins, | 25 | * 2007-11-29 RT balancing improvements by Steven Rostedt, Gregory Haskins, |
26 | * Thomas Gleixner, Mike Kravetz | 26 | * Thomas Gleixner, Mike Kravetz |
27 | */ | 27 | */ |
28 | 28 | ||
29 | #include <linux/mm.h> | 29 | #include <linux/mm.h> |
30 | #include <linux/module.h> | 30 | #include <linux/module.h> |
31 | #include <linux/nmi.h> | 31 | #include <linux/nmi.h> |
32 | #include <linux/init.h> | 32 | #include <linux/init.h> |
33 | #include <linux/uaccess.h> | 33 | #include <linux/uaccess.h> |
34 | #include <linux/highmem.h> | 34 | #include <linux/highmem.h> |
35 | #include <asm/mmu_context.h> | 35 | #include <asm/mmu_context.h> |
36 | #include <linux/interrupt.h> | 36 | #include <linux/interrupt.h> |
37 | #include <linux/capability.h> | 37 | #include <linux/capability.h> |
38 | #include <linux/completion.h> | 38 | #include <linux/completion.h> |
39 | #include <linux/kernel_stat.h> | 39 | #include <linux/kernel_stat.h> |
40 | #include <linux/debug_locks.h> | 40 | #include <linux/debug_locks.h> |
41 | #include <linux/perf_event.h> | 41 | #include <linux/perf_event.h> |
42 | #include <linux/security.h> | 42 | #include <linux/security.h> |
43 | #include <linux/notifier.h> | 43 | #include <linux/notifier.h> |
44 | #include <linux/profile.h> | 44 | #include <linux/profile.h> |
45 | #include <linux/freezer.h> | 45 | #include <linux/freezer.h> |
46 | #include <linux/vmalloc.h> | 46 | #include <linux/vmalloc.h> |
47 | #include <linux/blkdev.h> | 47 | #include <linux/blkdev.h> |
48 | #include <linux/delay.h> | 48 | #include <linux/delay.h> |
49 | #include <linux/pid_namespace.h> | 49 | #include <linux/pid_namespace.h> |
50 | #include <linux/smp.h> | 50 | #include <linux/smp.h> |
51 | #include <linux/threads.h> | 51 | #include <linux/threads.h> |
52 | #include <linux/timer.h> | 52 | #include <linux/timer.h> |
53 | #include <linux/rcupdate.h> | 53 | #include <linux/rcupdate.h> |
54 | #include <linux/cpu.h> | 54 | #include <linux/cpu.h> |
55 | #include <linux/cpuset.h> | 55 | #include <linux/cpuset.h> |
56 | #include <linux/percpu.h> | 56 | #include <linux/percpu.h> |
57 | #include <linux/proc_fs.h> | 57 | #include <linux/proc_fs.h> |
58 | #include <linux/seq_file.h> | 58 | #include <linux/seq_file.h> |
59 | #include <linux/stop_machine.h> | 59 | #include <linux/stop_machine.h> |
60 | #include <linux/sysctl.h> | 60 | #include <linux/sysctl.h> |
61 | #include <linux/syscalls.h> | 61 | #include <linux/syscalls.h> |
62 | #include <linux/times.h> | 62 | #include <linux/times.h> |
63 | #include <linux/tsacct_kern.h> | 63 | #include <linux/tsacct_kern.h> |
64 | #include <linux/kprobes.h> | 64 | #include <linux/kprobes.h> |
65 | #include <linux/delayacct.h> | 65 | #include <linux/delayacct.h> |
66 | #include <linux/unistd.h> | 66 | #include <linux/unistd.h> |
67 | #include <linux/pagemap.h> | 67 | #include <linux/pagemap.h> |
68 | #include <linux/hrtimer.h> | 68 | #include <linux/hrtimer.h> |
69 | #include <linux/tick.h> | 69 | #include <linux/tick.h> |
70 | #include <linux/debugfs.h> | 70 | #include <linux/debugfs.h> |
71 | #include <linux/ctype.h> | 71 | #include <linux/ctype.h> |
72 | #include <linux/ftrace.h> | 72 | #include <linux/ftrace.h> |
73 | #include <linux/slab.h> | 73 | #include <linux/slab.h> |
74 | 74 | ||
75 | #include <asm/tlb.h> | 75 | #include <asm/tlb.h> |
76 | #include <asm/irq_regs.h> | 76 | #include <asm/irq_regs.h> |
77 | #include <asm/mutex.h> | 77 | #include <asm/mutex.h> |
78 | 78 | ||
79 | #include "sched_cpupri.h" | 79 | #include "sched_cpupri.h" |
80 | #include "workqueue_sched.h" | 80 | #include "workqueue_sched.h" |
81 | #include "sched_autogroup.h" | 81 | #include "sched_autogroup.h" |
82 | 82 | ||
83 | #define CREATE_TRACE_POINTS | 83 | #define CREATE_TRACE_POINTS |
84 | #include <trace/events/sched.h> | 84 | #include <trace/events/sched.h> |
85 | 85 | ||
86 | /* | 86 | /* |
87 | * Convert user-nice values [ -20 ... 0 ... 19 ] | 87 | * Convert user-nice values [ -20 ... 0 ... 19 ] |
88 | * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ], | 88 | * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ], |
89 | * and back. | 89 | * and back. |
90 | */ | 90 | */ |
91 | #define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20) | 91 | #define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20) |
92 | #define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20) | 92 | #define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20) |
93 | #define TASK_NICE(p) PRIO_TO_NICE((p)->static_prio) | 93 | #define TASK_NICE(p) PRIO_TO_NICE((p)->static_prio) |
94 | 94 | ||
95 | /* | 95 | /* |
96 | * 'User priority' is the nice value converted to something we | 96 | * 'User priority' is the nice value converted to something we |
97 | * can work with better when scaling various scheduler parameters, | 97 | * can work with better when scaling various scheduler parameters, |
98 | * it's a [ 0 ... 39 ] range. | 98 | * it's a [ 0 ... 39 ] range. |
99 | */ | 99 | */ |
100 | #define USER_PRIO(p) ((p)-MAX_RT_PRIO) | 100 | #define USER_PRIO(p) ((p)-MAX_RT_PRIO) |
101 | #define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio) | 101 | #define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio) |
102 | #define MAX_USER_PRIO (USER_PRIO(MAX_PRIO)) | 102 | #define MAX_USER_PRIO (USER_PRIO(MAX_PRIO)) |
103 | 103 | ||
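The macros above map the user-visible nice range [-20, 19] onto static priorities and then onto the 0..39 user-priority scale. A standalone arithmetic check, assuming the stock MAX_RT_PRIO value of 100 (so MAX_PRIO is 140):

        /* Userspace check of the nice <-> priority arithmetic; MAX_RT_PRIO is
         * assumed to be 100, as in the stock kernel configuration. */
        #include <stdio.h>

        #define MAX_RT_PRIO             100
        #define MAX_PRIO                (MAX_RT_PRIO + 40)

        #define NICE_TO_PRIO(nice)      (MAX_RT_PRIO + (nice) + 20)
        #define PRIO_TO_NICE(prio)      ((prio) - MAX_RT_PRIO - 20)
        #define USER_PRIO(p)            ((p) - MAX_RT_PRIO)

        int main(void)
        {
                int nice;

                for (nice = -20; nice <= 19; nice++)
                        printf("nice %3d -> prio %3d -> user prio %2d -> nice %3d\n",
                               nice, NICE_TO_PRIO(nice),
                               USER_PRIO(NICE_TO_PRIO(nice)),
                               PRIO_TO_NICE(NICE_TO_PRIO(nice)));
                return 0;       /* nice -20 maps to prio 100/uprio 0, nice 19 to prio 139/uprio 39 */
        }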
104 | /* | 104 | /* |
105 | * Helpers for converting nanosecond timing to jiffy resolution | 105 | * Helpers for converting nanosecond timing to jiffy resolution |
106 | */ | 106 | */ |
107 | #define NS_TO_JIFFIES(TIME) ((unsigned long)(TIME) / (NSEC_PER_SEC / HZ)) | 107 | #define NS_TO_JIFFIES(TIME) ((unsigned long)(TIME) / (NSEC_PER_SEC / HZ)) |
108 | 108 | ||
109 | #define NICE_0_LOAD SCHED_LOAD_SCALE | 109 | #define NICE_0_LOAD SCHED_LOAD_SCALE |
110 | #define NICE_0_SHIFT SCHED_LOAD_SHIFT | 110 | #define NICE_0_SHIFT SCHED_LOAD_SHIFT |
111 | 111 | ||
112 | /* | 112 | /* |
113 | * These are the 'tuning knobs' of the scheduler: | 113 | * These are the 'tuning knobs' of the scheduler: |
114 | * | 114 | * |
115 | * default timeslice is 100 msecs (used only for SCHED_RR tasks). | 115 | * default timeslice is 100 msecs (used only for SCHED_RR tasks). |
116 | * Timeslices get refilled after they expire. | 116 | * Timeslices get refilled after they expire. |
117 | */ | 117 | */ |
118 | #define DEF_TIMESLICE (100 * HZ / 1000) | 118 | #define DEF_TIMESLICE (100 * HZ / 1000) |
119 | 119 | ||
120 | /* | 120 | /* |
121 | * single value that denotes runtime == period, ie unlimited time. | 121 | * single value that denotes runtime == period, ie unlimited time. |
122 | */ | 122 | */ |
123 | #define RUNTIME_INF ((u64)~0ULL) | 123 | #define RUNTIME_INF ((u64)~0ULL) |
124 | 124 | ||
125 | static inline int rt_policy(int policy) | 125 | static inline int rt_policy(int policy) |
126 | { | 126 | { |
127 | if (unlikely(policy == SCHED_FIFO || policy == SCHED_RR)) | 127 | if (unlikely(policy == SCHED_FIFO || policy == SCHED_RR)) |
128 | return 1; | 128 | return 1; |
129 | return 0; | 129 | return 0; |
130 | } | 130 | } |
131 | 131 | ||
132 | static inline int task_has_rt_policy(struct task_struct *p) | 132 | static inline int task_has_rt_policy(struct task_struct *p) |
133 | { | 133 | { |
134 | return rt_policy(p->policy); | 134 | return rt_policy(p->policy); |
135 | } | 135 | } |
136 | 136 | ||
137 | /* | 137 | /* |
138 | * This is the priority-queue data structure of the RT scheduling class: | 138 | * This is the priority-queue data structure of the RT scheduling class: |
139 | */ | 139 | */ |
140 | struct rt_prio_array { | 140 | struct rt_prio_array { |
141 | DECLARE_BITMAP(bitmap, MAX_RT_PRIO+1); /* include 1 bit for delimiter */ | 141 | DECLARE_BITMAP(bitmap, MAX_RT_PRIO+1); /* include 1 bit for delimiter */ |
142 | struct list_head queue[MAX_RT_PRIO]; | 142 | struct list_head queue[MAX_RT_PRIO]; |
143 | }; | 143 | }; |
144 | 144 | ||
145 | struct rt_bandwidth { | 145 | struct rt_bandwidth { |
146 | /* nests inside the rq lock: */ | 146 | /* nests inside the rq lock: */ |
147 | raw_spinlock_t rt_runtime_lock; | 147 | raw_spinlock_t rt_runtime_lock; |
148 | ktime_t rt_period; | 148 | ktime_t rt_period; |
149 | u64 rt_runtime; | 149 | u64 rt_runtime; |
150 | struct hrtimer rt_period_timer; | 150 | struct hrtimer rt_period_timer; |
151 | }; | 151 | }; |
152 | 152 | ||
153 | static struct rt_bandwidth def_rt_bandwidth; | 153 | static struct rt_bandwidth def_rt_bandwidth; |
154 | 154 | ||
155 | static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun); | 155 | static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun); |
156 | 156 | ||
157 | static enum hrtimer_restart sched_rt_period_timer(struct hrtimer *timer) | 157 | static enum hrtimer_restart sched_rt_period_timer(struct hrtimer *timer) |
158 | { | 158 | { |
159 | struct rt_bandwidth *rt_b = | 159 | struct rt_bandwidth *rt_b = |
160 | container_of(timer, struct rt_bandwidth, rt_period_timer); | 160 | container_of(timer, struct rt_bandwidth, rt_period_timer); |
161 | ktime_t now; | 161 | ktime_t now; |
162 | int overrun; | 162 | int overrun; |
163 | int idle = 0; | 163 | int idle = 0; |
164 | 164 | ||
165 | for (;;) { | 165 | for (;;) { |
166 | now = hrtimer_cb_get_time(timer); | 166 | now = hrtimer_cb_get_time(timer); |
167 | overrun = hrtimer_forward(timer, now, rt_b->rt_period); | 167 | overrun = hrtimer_forward(timer, now, rt_b->rt_period); |
168 | 168 | ||
169 | if (!overrun) | 169 | if (!overrun) |
170 | break; | 170 | break; |
171 | 171 | ||
172 | idle = do_sched_rt_period_timer(rt_b, overrun); | 172 | idle = do_sched_rt_period_timer(rt_b, overrun); |
173 | } | 173 | } |
174 | 174 | ||
175 | return idle ? HRTIMER_NORESTART : HRTIMER_RESTART; | 175 | return idle ? HRTIMER_NORESTART : HRTIMER_RESTART; |
176 | } | 176 | } |
177 | 177 | ||
178 | static | 178 | static |
179 | void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime) | 179 | void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime) |
180 | { | 180 | { |
181 | rt_b->rt_period = ns_to_ktime(period); | 181 | rt_b->rt_period = ns_to_ktime(period); |
182 | rt_b->rt_runtime = runtime; | 182 | rt_b->rt_runtime = runtime; |
183 | 183 | ||
184 | raw_spin_lock_init(&rt_b->rt_runtime_lock); | 184 | raw_spin_lock_init(&rt_b->rt_runtime_lock); |
185 | 185 | ||
186 | hrtimer_init(&rt_b->rt_period_timer, | 186 | hrtimer_init(&rt_b->rt_period_timer, |
187 | CLOCK_MONOTONIC, HRTIMER_MODE_REL); | 187 | CLOCK_MONOTONIC, HRTIMER_MODE_REL); |
188 | rt_b->rt_period_timer.function = sched_rt_period_timer; | 188 | rt_b->rt_period_timer.function = sched_rt_period_timer; |
189 | } | 189 | } |
190 | 190 | ||
191 | static inline int rt_bandwidth_enabled(void) | 191 | static inline int rt_bandwidth_enabled(void) |
192 | { | 192 | { |
193 | return sysctl_sched_rt_runtime >= 0; | 193 | return sysctl_sched_rt_runtime >= 0; |
194 | } | 194 | } |
195 | 195 | ||
196 | static void start_rt_bandwidth(struct rt_bandwidth *rt_b) | 196 | static void start_rt_bandwidth(struct rt_bandwidth *rt_b) |
197 | { | 197 | { |
198 | ktime_t now; | 198 | ktime_t now; |
199 | 199 | ||
200 | if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF) | 200 | if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF) |
201 | return; | 201 | return; |
202 | 202 | ||
203 | if (hrtimer_active(&rt_b->rt_period_timer)) | 203 | if (hrtimer_active(&rt_b->rt_period_timer)) |
204 | return; | 204 | return; |
205 | 205 | ||
206 | raw_spin_lock(&rt_b->rt_runtime_lock); | 206 | raw_spin_lock(&rt_b->rt_runtime_lock); |
207 | for (;;) { | 207 | for (;;) { |
208 | unsigned long delta; | 208 | unsigned long delta; |
209 | ktime_t soft, hard; | 209 | ktime_t soft, hard; |
210 | 210 | ||
211 | if (hrtimer_active(&rt_b->rt_period_timer)) | 211 | if (hrtimer_active(&rt_b->rt_period_timer)) |
212 | break; | 212 | break; |
213 | 213 | ||
214 | now = hrtimer_cb_get_time(&rt_b->rt_period_timer); | 214 | now = hrtimer_cb_get_time(&rt_b->rt_period_timer); |
215 | hrtimer_forward(&rt_b->rt_period_timer, now, rt_b->rt_period); | 215 | hrtimer_forward(&rt_b->rt_period_timer, now, rt_b->rt_period); |
216 | 216 | ||
217 | soft = hrtimer_get_softexpires(&rt_b->rt_period_timer); | 217 | soft = hrtimer_get_softexpires(&rt_b->rt_period_timer); |
218 | hard = hrtimer_get_expires(&rt_b->rt_period_timer); | 218 | hard = hrtimer_get_expires(&rt_b->rt_period_timer); |
219 | delta = ktime_to_ns(ktime_sub(hard, soft)); | 219 | delta = ktime_to_ns(ktime_sub(hard, soft)); |
220 | __hrtimer_start_range_ns(&rt_b->rt_period_timer, soft, delta, | 220 | __hrtimer_start_range_ns(&rt_b->rt_period_timer, soft, delta, |
221 | HRTIMER_MODE_ABS_PINNED, 0); | 221 | HRTIMER_MODE_ABS_PINNED, 0); |
222 | } | 222 | } |
223 | raw_spin_unlock(&rt_b->rt_runtime_lock); | 223 | raw_spin_unlock(&rt_b->rt_runtime_lock); |
224 | } | 224 | } |
225 | 225 | ||
226 | #ifdef CONFIG_RT_GROUP_SCHED | 226 | #ifdef CONFIG_RT_GROUP_SCHED |
227 | static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b) | 227 | static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b) |
228 | { | 228 | { |
229 | hrtimer_cancel(&rt_b->rt_period_timer); | 229 | hrtimer_cancel(&rt_b->rt_period_timer); |
230 | } | 230 | } |
231 | #endif | 231 | #endif |
232 | 232 | ||
233 | /* | 233 | /* |
234 | * sched_domains_mutex serializes calls to init_sched_domains, | 234 | * sched_domains_mutex serializes calls to init_sched_domains, |
235 | * detach_destroy_domains and partition_sched_domains. | 235 | * detach_destroy_domains and partition_sched_domains. |
236 | */ | 236 | */ |
237 | static DEFINE_MUTEX(sched_domains_mutex); | 237 | static DEFINE_MUTEX(sched_domains_mutex); |
238 | 238 | ||
239 | #ifdef CONFIG_CGROUP_SCHED | 239 | #ifdef CONFIG_CGROUP_SCHED |
240 | 240 | ||
241 | #include <linux/cgroup.h> | 241 | #include <linux/cgroup.h> |
242 | 242 | ||
243 | struct cfs_rq; | 243 | struct cfs_rq; |
244 | 244 | ||
245 | static LIST_HEAD(task_groups); | 245 | static LIST_HEAD(task_groups); |
246 | 246 | ||
247 | /* task group related information */ | 247 | /* task group related information */ |
248 | struct task_group { | 248 | struct task_group { |
249 | struct cgroup_subsys_state css; | 249 | struct cgroup_subsys_state css; |
250 | 250 | ||
251 | #ifdef CONFIG_FAIR_GROUP_SCHED | 251 | #ifdef CONFIG_FAIR_GROUP_SCHED |
252 | /* schedulable entities of this group on each cpu */ | 252 | /* schedulable entities of this group on each cpu */ |
253 | struct sched_entity **se; | 253 | struct sched_entity **se; |
254 | /* runqueue "owned" by this group on each cpu */ | 254 | /* runqueue "owned" by this group on each cpu */ |
255 | struct cfs_rq **cfs_rq; | 255 | struct cfs_rq **cfs_rq; |
256 | unsigned long shares; | 256 | unsigned long shares; |
257 | 257 | ||
258 | atomic_t load_weight; | 258 | atomic_t load_weight; |
259 | #endif | 259 | #endif |
260 | 260 | ||
261 | #ifdef CONFIG_RT_GROUP_SCHED | 261 | #ifdef CONFIG_RT_GROUP_SCHED |
262 | struct sched_rt_entity **rt_se; | 262 | struct sched_rt_entity **rt_se; |
263 | struct rt_rq **rt_rq; | 263 | struct rt_rq **rt_rq; |
264 | 264 | ||
265 | struct rt_bandwidth rt_bandwidth; | 265 | struct rt_bandwidth rt_bandwidth; |
266 | #endif | 266 | #endif |
267 | 267 | ||
268 | struct rcu_head rcu; | 268 | struct rcu_head rcu; |
269 | struct list_head list; | 269 | struct list_head list; |
270 | 270 | ||
271 | struct task_group *parent; | 271 | struct task_group *parent; |
272 | struct list_head siblings; | 272 | struct list_head siblings; |
273 | struct list_head children; | 273 | struct list_head children; |
274 | 274 | ||
275 | #ifdef CONFIG_SCHED_AUTOGROUP | 275 | #ifdef CONFIG_SCHED_AUTOGROUP |
276 | struct autogroup *autogroup; | 276 | struct autogroup *autogroup; |
277 | #endif | 277 | #endif |
278 | }; | 278 | }; |
279 | 279 | ||
280 | /* task_group_lock serializes the addition/removal of task groups */ | 280 | /* task_group_lock serializes the addition/removal of task groups */ |
281 | static DEFINE_SPINLOCK(task_group_lock); | 281 | static DEFINE_SPINLOCK(task_group_lock); |
282 | 282 | ||
283 | #ifdef CONFIG_FAIR_GROUP_SCHED | 283 | #ifdef CONFIG_FAIR_GROUP_SCHED |
284 | 284 | ||
285 | # define ROOT_TASK_GROUP_LOAD NICE_0_LOAD | 285 | # define ROOT_TASK_GROUP_LOAD NICE_0_LOAD |
286 | 286 | ||
287 | /* | 287 | /* |
288 | * A weight of 0 or 1 can cause arithmetic problems. | 288 | * A weight of 0 or 1 can cause arithmetic problems. |
289 | * The weight of a cfs_rq is the sum of the weights of the entities | 289 | * The weight of a cfs_rq is the sum of the weights of the entities |
290 | * queued on it, so the weight of an entity should not be too large, | 290 | * queued on it, so the weight of an entity should not be too large, |
291 | * and neither should the shares value of a task group. | 291 | * and neither should the shares value of a task group. |
292 | * (The default weight is 1024 - so there's no practical | 292 | * (The default weight is 1024 - so there's no practical |
293 | * limitation from this.) | 293 | * limitation from this.) |
294 | */ | 294 | */ |
295 | #define MIN_SHARES 2 | 295 | #define MIN_SHARES 2 |
296 | #define MAX_SHARES (1UL << (18 + SCHED_LOAD_RESOLUTION)) | 296 | #define MAX_SHARES (1UL << (18 + SCHED_LOAD_RESOLUTION)) |
297 | 297 | ||
298 | static int root_task_group_load = ROOT_TASK_GROUP_LOAD; | 298 | static int root_task_group_load = ROOT_TASK_GROUP_LOAD; |
299 | #endif | 299 | #endif |
300 | 300 | ||
301 | /* Default task group. | 301 | /* Default task group. |
302 | * Every task in the system belongs to this group at bootup. | 302 | * Every task in the system belongs to this group at bootup. |
303 | */ | 303 | */ |
304 | struct task_group root_task_group; | 304 | struct task_group root_task_group; |
305 | 305 | ||
306 | #endif /* CONFIG_CGROUP_SCHED */ | 306 | #endif /* CONFIG_CGROUP_SCHED */ |
307 | 307 | ||
308 | /* CFS-related fields in a runqueue */ | 308 | /* CFS-related fields in a runqueue */ |
309 | struct cfs_rq { | 309 | struct cfs_rq { |
310 | struct load_weight load; | 310 | struct load_weight load; |
311 | unsigned long nr_running; | 311 | unsigned long nr_running; |
312 | 312 | ||
313 | u64 exec_clock; | 313 | u64 exec_clock; |
314 | u64 min_vruntime; | 314 | u64 min_vruntime; |
315 | #ifndef CONFIG_64BIT | 315 | #ifndef CONFIG_64BIT |
316 | u64 min_vruntime_copy; | 316 | u64 min_vruntime_copy; |
317 | #endif | 317 | #endif |
318 | 318 | ||
319 | struct rb_root tasks_timeline; | 319 | struct rb_root tasks_timeline; |
320 | struct rb_node *rb_leftmost; | 320 | struct rb_node *rb_leftmost; |
321 | 321 | ||
322 | struct list_head tasks; | 322 | struct list_head tasks; |
323 | struct list_head *balance_iterator; | 323 | struct list_head *balance_iterator; |
324 | 324 | ||
325 | /* | 325 | /* |
326 | * 'curr' points to currently running entity on this cfs_rq. | 326 | * 'curr' points to currently running entity on this cfs_rq. |
327 | * It is set to NULL otherwise (i.e when none are currently running). | 327 | * It is set to NULL otherwise (i.e when none are currently running). |
328 | */ | 328 | */ |
329 | struct sched_entity *curr, *next, *last, *skip; | 329 | struct sched_entity *curr, *next, *last, *skip; |
330 | 330 | ||
331 | #ifdef CONFIG_SCHED_DEBUG | 331 | #ifdef CONFIG_SCHED_DEBUG |
332 | unsigned int nr_spread_over; | 332 | unsigned int nr_spread_over; |
333 | #endif | 333 | #endif |
334 | 334 | ||
335 | #ifdef CONFIG_FAIR_GROUP_SCHED | 335 | #ifdef CONFIG_FAIR_GROUP_SCHED |
336 | struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */ | 336 | struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */ |
337 | 337 | ||
338 | /* | 338 | /* |
339 | * leaf cfs_rqs are those that hold tasks (lowest schedulable entity in | 339 | * leaf cfs_rqs are those that hold tasks (lowest schedulable entity in |
340 | * a hierarchy). Non-leaf lrqs hold other higher schedulable entities | 340 | * a hierarchy). Non-leaf lrqs hold other higher schedulable entities |
341 | * (like users, containers etc.) | 341 | * (like users, containers etc.) |
342 | * | 342 | * |
343 | * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This | 343 | * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This |
344 | * list is used during load balance. | 344 | * list is used during load balance. |
345 | */ | 345 | */ |
346 | int on_list; | 346 | int on_list; |
347 | struct list_head leaf_cfs_rq_list; | 347 | struct list_head leaf_cfs_rq_list; |
348 | struct task_group *tg; /* group that "owns" this runqueue */ | 348 | struct task_group *tg; /* group that "owns" this runqueue */ |
349 | 349 | ||
350 | #ifdef CONFIG_SMP | 350 | #ifdef CONFIG_SMP |
351 | /* | 351 | /* |
352 | * the part of load.weight contributed by tasks | 352 | * the part of load.weight contributed by tasks |
353 | */ | 353 | */ |
354 | unsigned long task_weight; | 354 | unsigned long task_weight; |
355 | 355 | ||
356 | /* | 356 | /* |
357 | * h_load = weight * f(tg) | 357 | * h_load = weight * f(tg) |
358 | * | 358 | * |
359 | * Where f(tg) is the recursive weight fraction assigned to | 359 | * Where f(tg) is the recursive weight fraction assigned to |
360 | * this group. | 360 | * this group. |
361 | */ | 361 | */ |
362 | unsigned long h_load; | 362 | unsigned long h_load; |
363 | 363 | ||
364 | /* | 364 | /* |
365 | * Maintaining per-cpu shares distribution for group scheduling | 365 | * Maintaining per-cpu shares distribution for group scheduling |
366 | * | 366 | * |
367 | * load_stamp is the last time we updated the load average | 367 | * load_stamp is the last time we updated the load average |
368 | * load_last is the last time we updated the load average and saw load | 368 | * load_last is the last time we updated the load average and saw load |
369 | * load_unacc_exec_time is currently unaccounted execution time | 369 | * load_unacc_exec_time is currently unaccounted execution time |
370 | */ | 370 | */ |
371 | u64 load_avg; | 371 | u64 load_avg; |
372 | u64 load_period; | 372 | u64 load_period; |
373 | u64 load_stamp, load_last, load_unacc_exec_time; | 373 | u64 load_stamp, load_last, load_unacc_exec_time; |
374 | 374 | ||
375 | unsigned long load_contribution; | 375 | unsigned long load_contribution; |
376 | #endif | 376 | #endif |
377 | #endif | 377 | #endif |
378 | }; | 378 | }; |
379 | 379 | ||
380 | /* Real-Time classes' related field in a runqueue: */ | 380 | /* Real-Time classes' related field in a runqueue: */ |
381 | struct rt_rq { | 381 | struct rt_rq { |
382 | struct rt_prio_array active; | 382 | struct rt_prio_array active; |
383 | unsigned long rt_nr_running; | 383 | unsigned long rt_nr_running; |
384 | #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED | 384 | #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED |
385 | struct { | 385 | struct { |
386 | int curr; /* highest queued rt task prio */ | 386 | int curr; /* highest queued rt task prio */ |
387 | #ifdef CONFIG_SMP | 387 | #ifdef CONFIG_SMP |
388 | int next; /* next highest */ | 388 | int next; /* next highest */ |
389 | #endif | 389 | #endif |
390 | } highest_prio; | 390 | } highest_prio; |
391 | #endif | 391 | #endif |
392 | #ifdef CONFIG_SMP | 392 | #ifdef CONFIG_SMP |
393 | unsigned long rt_nr_migratory; | 393 | unsigned long rt_nr_migratory; |
394 | unsigned long rt_nr_total; | 394 | unsigned long rt_nr_total; |
395 | int overloaded; | 395 | int overloaded; |
396 | struct plist_head pushable_tasks; | 396 | struct plist_head pushable_tasks; |
397 | #endif | 397 | #endif |
398 | int rt_throttled; | 398 | int rt_throttled; |
399 | u64 rt_time; | 399 | u64 rt_time; |
400 | u64 rt_runtime; | 400 | u64 rt_runtime; |
401 | /* Nests inside the rq lock: */ | 401 | /* Nests inside the rq lock: */ |
402 | raw_spinlock_t rt_runtime_lock; | 402 | raw_spinlock_t rt_runtime_lock; |
403 | 403 | ||
404 | #ifdef CONFIG_RT_GROUP_SCHED | 404 | #ifdef CONFIG_RT_GROUP_SCHED |
405 | unsigned long rt_nr_boosted; | 405 | unsigned long rt_nr_boosted; |
406 | 406 | ||
407 | struct rq *rq; | 407 | struct rq *rq; |
408 | struct list_head leaf_rt_rq_list; | 408 | struct list_head leaf_rt_rq_list; |
409 | struct task_group *tg; | 409 | struct task_group *tg; |
410 | #endif | 410 | #endif |
411 | }; | 411 | }; |
412 | 412 | ||
413 | #ifdef CONFIG_SMP | 413 | #ifdef CONFIG_SMP |
414 | 414 | ||
415 | /* | 415 | /* |
416 | * We add the notion of a root-domain which will be used to define per-domain | 416 | * We add the notion of a root-domain which will be used to define per-domain |
417 | * variables. Each exclusive cpuset essentially defines an island domain by | 417 | * variables. Each exclusive cpuset essentially defines an island domain by |
418 | * fully partitioning the member cpus from any other cpuset. Whenever a new | 418 | * fully partitioning the member cpus from any other cpuset. Whenever a new |
419 | * exclusive cpuset is created, we also create and attach a new root-domain | 419 | * exclusive cpuset is created, we also create and attach a new root-domain |
420 | * object. | 420 | * object. |
421 | * | 421 | * |
422 | */ | 422 | */ |
423 | struct root_domain { | 423 | struct root_domain { |
424 | atomic_t refcount; | 424 | atomic_t refcount; |
425 | struct rcu_head rcu; | 425 | struct rcu_head rcu; |
426 | cpumask_var_t span; | 426 | cpumask_var_t span; |
427 | cpumask_var_t online; | 427 | cpumask_var_t online; |
428 | 428 | ||
429 | /* | 429 | /* |
430 | * The "RT overload" flag: it gets set if a CPU has more than | 430 | * The "RT overload" flag: it gets set if a CPU has more than |
431 | * one runnable RT task. | 431 | * one runnable RT task. |
432 | */ | 432 | */ |
433 | cpumask_var_t rto_mask; | 433 | cpumask_var_t rto_mask; |
434 | atomic_t rto_count; | 434 | atomic_t rto_count; |
435 | struct cpupri cpupri; | 435 | struct cpupri cpupri; |
436 | }; | 436 | }; |
437 | 437 | ||
438 | /* | 438 | /* |
439 | * By default the system creates a single root-domain with all cpus as | 439 | * By default the system creates a single root-domain with all cpus as |
440 | * members (mimicking the global state we have today). | 440 | * members (mimicking the global state we have today). |
441 | */ | 441 | */ |
442 | static struct root_domain def_root_domain; | 442 | static struct root_domain def_root_domain; |
443 | 443 | ||
444 | #endif /* CONFIG_SMP */ | 444 | #endif /* CONFIG_SMP */ |
445 | 445 | ||
446 | /* | 446 | /* |
447 | * This is the main, per-CPU runqueue data structure. | 447 | * This is the main, per-CPU runqueue data structure. |
448 | * | 448 | * |
449 | * Locking rule: in those places that want to lock multiple runqueues | 449 | * Locking rule: in those places that want to lock multiple runqueues |
450 | * (such as the load balancing or the thread migration code), the lock | 450 | * (such as the load balancing or the thread migration code), the lock |
451 | * acquire operations must be ordered by ascending &runqueue address. | 451 | * acquire operations must be ordered by ascending &runqueue address. |
452 | */ | 452 | */ |
453 | struct rq { | 453 | struct rq { |
454 | /* runqueue lock: */ | 454 | /* runqueue lock: */ |
455 | raw_spinlock_t lock; | 455 | raw_spinlock_t lock; |
456 | 456 | ||
457 | /* | 457 | /* |
458 | * nr_running and cpu_load should be in the same cacheline because | 458 | * nr_running and cpu_load should be in the same cacheline because |
459 | * remote CPUs use both these fields when doing load calculation. | 459 | * remote CPUs use both these fields when doing load calculation. |
460 | */ | 460 | */ |
461 | unsigned long nr_running; | 461 | unsigned long nr_running; |
462 | #define CPU_LOAD_IDX_MAX 5 | 462 | #define CPU_LOAD_IDX_MAX 5 |
463 | unsigned long cpu_load[CPU_LOAD_IDX_MAX]; | 463 | unsigned long cpu_load[CPU_LOAD_IDX_MAX]; |
464 | unsigned long last_load_update_tick; | 464 | unsigned long last_load_update_tick; |
465 | #ifdef CONFIG_NO_HZ | 465 | #ifdef CONFIG_NO_HZ |
466 | u64 nohz_stamp; | 466 | u64 nohz_stamp; |
467 | unsigned char nohz_balance_kick; | 467 | unsigned char nohz_balance_kick; |
468 | #endif | 468 | #endif |
469 | int skip_clock_update; | 469 | int skip_clock_update; |
470 | 470 | ||
471 | /* capture load from *all* tasks on this cpu: */ | 471 | /* capture load from *all* tasks on this cpu: */ |
472 | struct load_weight load; | 472 | struct load_weight load; |
473 | unsigned long nr_load_updates; | 473 | unsigned long nr_load_updates; |
474 | u64 nr_switches; | 474 | u64 nr_switches; |
475 | 475 | ||
476 | struct cfs_rq cfs; | 476 | struct cfs_rq cfs; |
477 | struct rt_rq rt; | 477 | struct rt_rq rt; |
478 | 478 | ||
479 | #ifdef CONFIG_FAIR_GROUP_SCHED | 479 | #ifdef CONFIG_FAIR_GROUP_SCHED |
480 | /* list of leaf cfs_rq on this cpu: */ | 480 | /* list of leaf cfs_rq on this cpu: */ |
481 | struct list_head leaf_cfs_rq_list; | 481 | struct list_head leaf_cfs_rq_list; |
482 | #endif | 482 | #endif |
483 | #ifdef CONFIG_RT_GROUP_SCHED | 483 | #ifdef CONFIG_RT_GROUP_SCHED |
484 | struct list_head leaf_rt_rq_list; | 484 | struct list_head leaf_rt_rq_list; |
485 | #endif | 485 | #endif |
486 | 486 | ||
487 | /* | 487 | /* |
488 | * This is part of a global counter where only the total sum | 488 | * This is part of a global counter where only the total sum |
489 | * over all CPUs matters. A task can increase this counter on | 489 | * over all CPUs matters. A task can increase this counter on |
490 | * one CPU and if it got migrated afterwards it may decrease | 490 | * one CPU and if it got migrated afterwards it may decrease |
491 | * it on another CPU. Always updated under the runqueue lock: | 491 | * it on another CPU. Always updated under the runqueue lock: |
492 | */ | 492 | */ |
493 | unsigned long nr_uninterruptible; | 493 | unsigned long nr_uninterruptible; |
494 | 494 | ||
495 | struct task_struct *curr, *idle, *stop; | 495 | struct task_struct *curr, *idle, *stop; |
496 | unsigned long next_balance; | 496 | unsigned long next_balance; |
497 | struct mm_struct *prev_mm; | 497 | struct mm_struct *prev_mm; |
498 | 498 | ||
499 | u64 clock; | 499 | u64 clock; |
500 | u64 clock_task; | 500 | u64 clock_task; |
501 | 501 | ||
502 | atomic_t nr_iowait; | 502 | atomic_t nr_iowait; |
503 | 503 | ||
504 | #ifdef CONFIG_SMP | 504 | #ifdef CONFIG_SMP |
505 | struct root_domain *rd; | 505 | struct root_domain *rd; |
506 | struct sched_domain *sd; | 506 | struct sched_domain *sd; |
507 | 507 | ||
508 | unsigned long cpu_power; | 508 | unsigned long cpu_power; |
509 | 509 | ||
510 | unsigned char idle_at_tick; | 510 | unsigned char idle_at_tick; |
511 | /* For active balancing */ | 511 | /* For active balancing */ |
512 | int post_schedule; | 512 | int post_schedule; |
513 | int active_balance; | 513 | int active_balance; |
514 | int push_cpu; | 514 | int push_cpu; |
515 | struct cpu_stop_work active_balance_work; | 515 | struct cpu_stop_work active_balance_work; |
516 | /* cpu of this runqueue: */ | 516 | /* cpu of this runqueue: */ |
517 | int cpu; | 517 | int cpu; |
518 | int online; | 518 | int online; |
519 | 519 | ||
520 | unsigned long avg_load_per_task; | 520 | unsigned long avg_load_per_task; |
521 | 521 | ||
522 | u64 rt_avg; | 522 | u64 rt_avg; |
523 | u64 age_stamp; | 523 | u64 age_stamp; |
524 | u64 idle_stamp; | 524 | u64 idle_stamp; |
525 | u64 avg_idle; | 525 | u64 avg_idle; |
526 | #endif | 526 | #endif |
527 | 527 | ||
528 | #ifdef CONFIG_IRQ_TIME_ACCOUNTING | 528 | #ifdef CONFIG_IRQ_TIME_ACCOUNTING |
529 | u64 prev_irq_time; | 529 | u64 prev_irq_time; |
530 | #endif | 530 | #endif |
531 | 531 | ||
532 | /* calc_load related fields */ | 532 | /* calc_load related fields */ |
533 | unsigned long calc_load_update; | 533 | unsigned long calc_load_update; |
534 | long calc_load_active; | 534 | long calc_load_active; |
535 | 535 | ||
536 | #ifdef CONFIG_SCHED_HRTICK | 536 | #ifdef CONFIG_SCHED_HRTICK |
537 | #ifdef CONFIG_SMP | 537 | #ifdef CONFIG_SMP |
538 | int hrtick_csd_pending; | 538 | int hrtick_csd_pending; |
539 | struct call_single_data hrtick_csd; | 539 | struct call_single_data hrtick_csd; |
540 | #endif | 540 | #endif |
541 | struct hrtimer hrtick_timer; | 541 | struct hrtimer hrtick_timer; |
542 | #endif | 542 | #endif |
543 | 543 | ||
544 | #ifdef CONFIG_SCHEDSTATS | 544 | #ifdef CONFIG_SCHEDSTATS |
545 | /* latency stats */ | 545 | /* latency stats */ |
546 | struct sched_info rq_sched_info; | 546 | struct sched_info rq_sched_info; |
547 | unsigned long long rq_cpu_time; | 547 | unsigned long long rq_cpu_time; |
548 | /* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */ | 548 | /* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */ |
549 | 549 | ||
550 | /* sys_sched_yield() stats */ | 550 | /* sys_sched_yield() stats */ |
551 | unsigned int yld_count; | 551 | unsigned int yld_count; |
552 | 552 | ||
553 | /* schedule() stats */ | 553 | /* schedule() stats */ |
554 | unsigned int sched_switch; | 554 | unsigned int sched_switch; |
555 | unsigned int sched_count; | 555 | unsigned int sched_count; |
556 | unsigned int sched_goidle; | 556 | unsigned int sched_goidle; |
557 | 557 | ||
558 | /* try_to_wake_up() stats */ | 558 | /* try_to_wake_up() stats */ |
559 | unsigned int ttwu_count; | 559 | unsigned int ttwu_count; |
560 | unsigned int ttwu_local; | 560 | unsigned int ttwu_local; |
561 | #endif | 561 | #endif |
562 | 562 | ||
563 | #ifdef CONFIG_SMP | 563 | #ifdef CONFIG_SMP |
564 | struct task_struct *wake_list; | 564 | struct task_struct *wake_list; |
565 | #endif | 565 | #endif |
566 | }; | 566 | }; |
567 | 567 | ||
568 | static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); | 568 | static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); |
569 | 569 | ||
570 | 570 | ||
571 | static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags); | 571 | static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags); |
572 | 572 | ||
573 | static inline int cpu_of(struct rq *rq) | 573 | static inline int cpu_of(struct rq *rq) |
574 | { | 574 | { |
575 | #ifdef CONFIG_SMP | 575 | #ifdef CONFIG_SMP |
576 | return rq->cpu; | 576 | return rq->cpu; |
577 | #else | 577 | #else |
578 | return 0; | 578 | return 0; |
579 | #endif | 579 | #endif |
580 | } | 580 | } |
581 | 581 | ||
582 | #define rcu_dereference_check_sched_domain(p) \ | 582 | #define rcu_dereference_check_sched_domain(p) \ |
583 | rcu_dereference_check((p), \ | 583 | rcu_dereference_check((p), \ |
584 | rcu_read_lock_held() || \ | ||
585 | lockdep_is_held(&sched_domains_mutex)) | 584 | lockdep_is_held(&sched_domains_mutex)) |
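This macro is the sched.c instance of the cleanup in this commit: the explicit rcu_read_lock_held() term removed on the line above is already folded into rcu_dereference_check()'s condition, so a caller now passes only the lockdep expression for the extra lock that may also protect the pointer. The fragment below is a minimal sketch of that calling convention in a hypothetical module; the foo/global_foo/foo_lock names are invented for illustration and this is not a stand-alone program.

	/* Hypothetical example of the simplified convention, sketch only. */
	struct foo {
		int val;
	};

	static struct foo __rcu *global_foo;
	static DEFINE_SPINLOCK(foo_lock);

	static int read_foo_val(void)
	{
		struct foo *f;

		/* No explicit rcu_read_lock_held() term: the macro supplies it. */
		f = rcu_dereference_check(global_foo, lockdep_is_held(&foo_lock));
		return f ? f->val : -1;
	}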
586 | 585 | ||
587 | /* | 586 | /* |
588 | * The domain tree (rq->sd) is protected by RCU's quiescent state transition. | 587 | * The domain tree (rq->sd) is protected by RCU's quiescent state transition. |
589 | * See detach_destroy_domains: synchronize_sched for details. | 588 | * See detach_destroy_domains: synchronize_sched for details. |
590 | * | 589 | * |
591 | * The domain tree of any CPU may only be accessed from within | 590 | * The domain tree of any CPU may only be accessed from within |
592 | * preempt-disabled sections. | 591 | * preempt-disabled sections. |
593 | */ | 592 | */ |
594 | #define for_each_domain(cpu, __sd) \ | 593 | #define for_each_domain(cpu, __sd) \ |
595 | for (__sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); __sd; __sd = __sd->parent) | 594 | for (__sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); __sd; __sd = __sd->parent) |
596 | 595 | ||
597 | #define cpu_rq(cpu) (&per_cpu(runqueues, (cpu))) | 596 | #define cpu_rq(cpu) (&per_cpu(runqueues, (cpu))) |
598 | #define this_rq() (&__get_cpu_var(runqueues)) | 597 | #define this_rq() (&__get_cpu_var(runqueues)) |
599 | #define task_rq(p) cpu_rq(task_cpu(p)) | 598 | #define task_rq(p) cpu_rq(task_cpu(p)) |
600 | #define cpu_curr(cpu) (cpu_rq(cpu)->curr) | 599 | #define cpu_curr(cpu) (cpu_rq(cpu)->curr) |
601 | #define raw_rq() (&__raw_get_cpu_var(runqueues)) | 600 | #define raw_rq() (&__raw_get_cpu_var(runqueues)) |
602 | 601 | ||
603 | #ifdef CONFIG_CGROUP_SCHED | 602 | #ifdef CONFIG_CGROUP_SCHED |
604 | 603 | ||
605 | /* | 604 | /* |
606 | * Return the group to which this task belongs. | 605 | * Return the group to which this task belongs. |
607 | * | 606 | * |
608 | * We use task_subsys_state_check() and extend the RCU verification with | 607 | * We use task_subsys_state_check() and extend the RCU verification with |
609 | * pi->lock and rq->lock because cpu_cgroup_attach() holds those locks for each | 608 | * pi->lock and rq->lock because cpu_cgroup_attach() holds those locks for each |
610 | * task it moves into the cgroup. Therefore by holding either of those locks, | 609 | * task it moves into the cgroup. Therefore by holding either of those locks, |
611 | * we pin the task to the current cgroup. | 610 | * we pin the task to the current cgroup. |
612 | */ | 611 | */ |
613 | static inline struct task_group *task_group(struct task_struct *p) | 612 | static inline struct task_group *task_group(struct task_struct *p) |
614 | { | 613 | { |
615 | struct task_group *tg; | 614 | struct task_group *tg; |
616 | struct cgroup_subsys_state *css; | 615 | struct cgroup_subsys_state *css; |
617 | 616 | ||
618 | css = task_subsys_state_check(p, cpu_cgroup_subsys_id, | 617 | css = task_subsys_state_check(p, cpu_cgroup_subsys_id, |
619 | lockdep_is_held(&p->pi_lock) || | 618 | lockdep_is_held(&p->pi_lock) || |
620 | lockdep_is_held(&task_rq(p)->lock)); | 619 | lockdep_is_held(&task_rq(p)->lock)); |
621 | tg = container_of(css, struct task_group, css); | 620 | tg = container_of(css, struct task_group, css); |
622 | 621 | ||
623 | return autogroup_task_group(p, tg); | 622 | return autogroup_task_group(p, tg); |
624 | } | 623 | } |
625 | 624 | ||
626 | /* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */ | 625 | /* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */ |
627 | static inline void set_task_rq(struct task_struct *p, unsigned int cpu) | 626 | static inline void set_task_rq(struct task_struct *p, unsigned int cpu) |
628 | { | 627 | { |
629 | #ifdef CONFIG_FAIR_GROUP_SCHED | 628 | #ifdef CONFIG_FAIR_GROUP_SCHED |
630 | p->se.cfs_rq = task_group(p)->cfs_rq[cpu]; | 629 | p->se.cfs_rq = task_group(p)->cfs_rq[cpu]; |
631 | p->se.parent = task_group(p)->se[cpu]; | 630 | p->se.parent = task_group(p)->se[cpu]; |
632 | #endif | 631 | #endif |
633 | 632 | ||
634 | #ifdef CONFIG_RT_GROUP_SCHED | 633 | #ifdef CONFIG_RT_GROUP_SCHED |
635 | p->rt.rt_rq = task_group(p)->rt_rq[cpu]; | 634 | p->rt.rt_rq = task_group(p)->rt_rq[cpu]; |
636 | p->rt.parent = task_group(p)->rt_se[cpu]; | 635 | p->rt.parent = task_group(p)->rt_se[cpu]; |
637 | #endif | 636 | #endif |
638 | } | 637 | } |
639 | 638 | ||
640 | #else /* CONFIG_CGROUP_SCHED */ | 639 | #else /* CONFIG_CGROUP_SCHED */ |
641 | 640 | ||
642 | static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { } | 641 | static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { } |
643 | static inline struct task_group *task_group(struct task_struct *p) | 642 | static inline struct task_group *task_group(struct task_struct *p) |
644 | { | 643 | { |
645 | return NULL; | 644 | return NULL; |
646 | } | 645 | } |
647 | 646 | ||
648 | #endif /* CONFIG_CGROUP_SCHED */ | 647 | #endif /* CONFIG_CGROUP_SCHED */ |
649 | 648 | ||
650 | static void update_rq_clock_task(struct rq *rq, s64 delta); | 649 | static void update_rq_clock_task(struct rq *rq, s64 delta); |
651 | 650 | ||
652 | static void update_rq_clock(struct rq *rq) | 651 | static void update_rq_clock(struct rq *rq) |
653 | { | 652 | { |
654 | s64 delta; | 653 | s64 delta; |
655 | 654 | ||
656 | if (rq->skip_clock_update > 0) | 655 | if (rq->skip_clock_update > 0) |
657 | return; | 656 | return; |
658 | 657 | ||
659 | delta = sched_clock_cpu(cpu_of(rq)) - rq->clock; | 658 | delta = sched_clock_cpu(cpu_of(rq)) - rq->clock; |
660 | rq->clock += delta; | 659 | rq->clock += delta; |
661 | update_rq_clock_task(rq, delta); | 660 | update_rq_clock_task(rq, delta); |
662 | } | 661 | } |
663 | 662 | ||
664 | /* | 663 | /* |
665 | * Tunables that become constants when CONFIG_SCHED_DEBUG is off: | 664 | * Tunables that become constants when CONFIG_SCHED_DEBUG is off: |
666 | */ | 665 | */ |
667 | #ifdef CONFIG_SCHED_DEBUG | 666 | #ifdef CONFIG_SCHED_DEBUG |
668 | # define const_debug __read_mostly | 667 | # define const_debug __read_mostly |
669 | #else | 668 | #else |
670 | # define const_debug static const | 669 | # define const_debug static const |
671 | #endif | 670 | #endif |
672 | 671 | ||
673 | /** | 672 | /** |
674 | * runqueue_is_locked - Returns true if the current cpu runqueue is locked | 673 | * runqueue_is_locked - Returns true if the current cpu runqueue is locked |
675 | * @cpu: the processor in question. | 674 | * @cpu: the processor in question. |
676 | * | 675 | * |
677 | * This interface allows printk to be called with the runqueue lock | 676 | * This interface allows printk to be called with the runqueue lock |
678 | * held and know whether or not it is OK to wake up the klogd. | 677 | * held and know whether or not it is OK to wake up the klogd. |
679 | */ | 678 | */ |
680 | int runqueue_is_locked(int cpu) | 679 | int runqueue_is_locked(int cpu) |
681 | { | 680 | { |
682 | return raw_spin_is_locked(&cpu_rq(cpu)->lock); | 681 | return raw_spin_is_locked(&cpu_rq(cpu)->lock); |
683 | } | 682 | } |
684 | 683 | ||
685 | /* | 684 | /* |
686 | * Debugging: various feature bits | 685 | * Debugging: various feature bits |
687 | */ | 686 | */ |
688 | 687 | ||
689 | #define SCHED_FEAT(name, enabled) \ | 688 | #define SCHED_FEAT(name, enabled) \ |
690 | __SCHED_FEAT_##name , | 689 | __SCHED_FEAT_##name , |
691 | 690 | ||
692 | enum { | 691 | enum { |
693 | #include "sched_features.h" | 692 | #include "sched_features.h" |
694 | }; | 693 | }; |
695 | 694 | ||
696 | #undef SCHED_FEAT | 695 | #undef SCHED_FEAT |
697 | 696 | ||
698 | #define SCHED_FEAT(name, enabled) \ | 697 | #define SCHED_FEAT(name, enabled) \ |
699 | (1UL << __SCHED_FEAT_##name) * enabled | | 698 | (1UL << __SCHED_FEAT_##name) * enabled | |
700 | 699 | ||
701 | const_debug unsigned int sysctl_sched_features = | 700 | const_debug unsigned int sysctl_sched_features = |
702 | #include "sched_features.h" | 701 | #include "sched_features.h" |
703 | 0; | 702 | 0; |
704 | 703 | ||
705 | #undef SCHED_FEAT | 704 | #undef SCHED_FEAT |
706 | 705 | ||
707 | #ifdef CONFIG_SCHED_DEBUG | 706 | #ifdef CONFIG_SCHED_DEBUG |
708 | #define SCHED_FEAT(name, enabled) \ | 707 | #define SCHED_FEAT(name, enabled) \ |
709 | #name , | 708 | #name , |
710 | 709 | ||
711 | static __read_mostly char *sched_feat_names[] = { | 710 | static __read_mostly char *sched_feat_names[] = { |
712 | #include "sched_features.h" | 711 | #include "sched_features.h" |
713 | NULL | 712 | NULL |
714 | }; | 713 | }; |
715 | 714 | ||
716 | #undef SCHED_FEAT | 715 | #undef SCHED_FEAT |
717 | 716 | ||
718 | static int sched_feat_show(struct seq_file *m, void *v) | 717 | static int sched_feat_show(struct seq_file *m, void *v) |
719 | { | 718 | { |
720 | int i; | 719 | int i; |
721 | 720 | ||
722 | for (i = 0; sched_feat_names[i]; i++) { | 721 | for (i = 0; sched_feat_names[i]; i++) { |
723 | if (!(sysctl_sched_features & (1UL << i))) | 722 | if (!(sysctl_sched_features & (1UL << i))) |
724 | seq_puts(m, "NO_"); | 723 | seq_puts(m, "NO_"); |
725 | seq_printf(m, "%s ", sched_feat_names[i]); | 724 | seq_printf(m, "%s ", sched_feat_names[i]); |
726 | } | 725 | } |
727 | seq_puts(m, "\n"); | 726 | seq_puts(m, "\n"); |
728 | 727 | ||
729 | return 0; | 728 | return 0; |
730 | } | 729 | } |
731 | 730 | ||
732 | static ssize_t | 731 | static ssize_t |
733 | sched_feat_write(struct file *filp, const char __user *ubuf, | 732 | sched_feat_write(struct file *filp, const char __user *ubuf, |
734 | size_t cnt, loff_t *ppos) | 733 | size_t cnt, loff_t *ppos) |
735 | { | 734 | { |
736 | char buf[64]; | 735 | char buf[64]; |
737 | char *cmp; | 736 | char *cmp; |
738 | int neg = 0; | 737 | int neg = 0; |
739 | int i; | 738 | int i; |
740 | 739 | ||
741 | if (cnt > 63) | 740 | if (cnt > 63) |
742 | cnt = 63; | 741 | cnt = 63; |
743 | 742 | ||
744 | if (copy_from_user(&buf, ubuf, cnt)) | 743 | if (copy_from_user(&buf, ubuf, cnt)) |
745 | return -EFAULT; | 744 | return -EFAULT; |
746 | 745 | ||
747 | buf[cnt] = 0; | 746 | buf[cnt] = 0; |
748 | cmp = strstrip(buf); | 747 | cmp = strstrip(buf); |
749 | 748 | ||
750 | if (strncmp(cmp, "NO_", 3) == 0) { | 749 | if (strncmp(cmp, "NO_", 3) == 0) { |
751 | neg = 1; | 750 | neg = 1; |
752 | cmp += 3; | 751 | cmp += 3; |
753 | } | 752 | } |
754 | 753 | ||
755 | for (i = 0; sched_feat_names[i]; i++) { | 754 | for (i = 0; sched_feat_names[i]; i++) { |
756 | if (strcmp(cmp, sched_feat_names[i]) == 0) { | 755 | if (strcmp(cmp, sched_feat_names[i]) == 0) { |
757 | if (neg) | 756 | if (neg) |
758 | sysctl_sched_features &= ~(1UL << i); | 757 | sysctl_sched_features &= ~(1UL << i); |
759 | else | 758 | else |
760 | sysctl_sched_features |= (1UL << i); | 759 | sysctl_sched_features |= (1UL << i); |
761 | break; | 760 | break; |
762 | } | 761 | } |
763 | } | 762 | } |
764 | 763 | ||
765 | if (!sched_feat_names[i]) | 764 | if (!sched_feat_names[i]) |
766 | return -EINVAL; | 765 | return -EINVAL; |
767 | 766 | ||
768 | *ppos += cnt; | 767 | *ppos += cnt; |
769 | 768 | ||
770 | return cnt; | 769 | return cnt; |
771 | } | 770 | } |
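The write handler above toggles one feature bit per write: a bare feature name sets the bit, the same name with a NO_ prefix clears it, and an unknown name yields -EINVAL. The user-space sketch below reproduces just that parsing (the two feature names are invented for the example; the real table is generated from sched_features.h). At run time the same strings are what gets written to the sched_features debugfs file created by sched_init_debug() further down, typically under /sys/kernel/debug when debugfs is mounted there.

	#include <stdio.h>
	#include <string.h>

	/* Invented feature names; the kernel builds its table from sched_features.h. */
	static const char *feat_names[] = { "HRTICK", "OWNER_SPIN", NULL };
	static unsigned int features = 1U << 0;		/* HRTICK enabled */

	static int feat_write(const char *cmp)
	{
		int neg = 0, i;

		if (strncmp(cmp, "NO_", 3) == 0) {
			neg = 1;
			cmp += 3;
		}
		for (i = 0; feat_names[i]; i++) {
			if (strcmp(cmp, feat_names[i]) == 0) {
				if (neg)
					features &= ~(1U << i);
				else
					features |= 1U << i;
				return 0;
			}
		}
		return -1;	/* mirrors -EINVAL for an unknown name */
	}

	int main(void)
	{
		feat_write("NO_HRTICK");	/* clears bit 0 */
		feat_write("OWNER_SPIN");	/* sets bit 1 */
		printf("features = %#x\n", features);	/* prints 0x2 */
		return 0;
	}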
772 | 771 | ||
773 | static int sched_feat_open(struct inode *inode, struct file *filp) | 772 | static int sched_feat_open(struct inode *inode, struct file *filp) |
774 | { | 773 | { |
775 | return single_open(filp, sched_feat_show, NULL); | 774 | return single_open(filp, sched_feat_show, NULL); |
776 | } | 775 | } |
777 | 776 | ||
778 | static const struct file_operations sched_feat_fops = { | 777 | static const struct file_operations sched_feat_fops = { |
779 | .open = sched_feat_open, | 778 | .open = sched_feat_open, |
780 | .write = sched_feat_write, | 779 | .write = sched_feat_write, |
781 | .read = seq_read, | 780 | .read = seq_read, |
782 | .llseek = seq_lseek, | 781 | .llseek = seq_lseek, |
783 | .release = single_release, | 782 | .release = single_release, |
784 | }; | 783 | }; |
785 | 784 | ||
786 | static __init int sched_init_debug(void) | 785 | static __init int sched_init_debug(void) |
787 | { | 786 | { |
788 | debugfs_create_file("sched_features", 0644, NULL, NULL, | 787 | debugfs_create_file("sched_features", 0644, NULL, NULL, |
789 | &sched_feat_fops); | 788 | &sched_feat_fops); |
790 | 789 | ||
791 | return 0; | 790 | return 0; |
792 | } | 791 | } |
793 | late_initcall(sched_init_debug); | 792 | late_initcall(sched_init_debug); |
794 | 793 | ||
795 | #endif | 794 | #endif |
796 | 795 | ||
797 | #define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x)) | 796 | #define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x)) |
798 | 797 | ||
799 | /* | 798 | /* |
800 | * Number of tasks to iterate in a single balance run. | 799 | * Number of tasks to iterate in a single balance run. |
801 | * Limited because this is done with IRQs disabled. | 800 | * Limited because this is done with IRQs disabled. |
802 | */ | 801 | */ |
803 | const_debug unsigned int sysctl_sched_nr_migrate = 32; | 802 | const_debug unsigned int sysctl_sched_nr_migrate = 32; |
804 | 803 | ||
805 | /* | 804 | /* |
806 | * period over which we average the RT time consumption, measured | 805 | * period over which we average the RT time consumption, measured |
807 | * in ms. | 806 | * in ms. |
808 | * | 807 | * |
809 | * default: 1s | 808 | * default: 1s |
810 | */ | 809 | */ |
811 | const_debug unsigned int sysctl_sched_time_avg = MSEC_PER_SEC; | 810 | const_debug unsigned int sysctl_sched_time_avg = MSEC_PER_SEC; |
812 | 811 | ||
813 | /* | 812 | /* |
814 | * period over which we measure -rt task cpu usage in us. | 813 | * period over which we measure -rt task cpu usage in us. |
815 | * default: 1s | 814 | * default: 1s |
816 | */ | 815 | */ |
817 | unsigned int sysctl_sched_rt_period = 1000000; | 816 | unsigned int sysctl_sched_rt_period = 1000000; |
818 | 817 | ||
819 | static __read_mostly int scheduler_running; | 818 | static __read_mostly int scheduler_running; |
820 | 819 | ||
821 | /* | 820 | /* |
822 | * part of the period that we allow rt tasks to run in us. | 821 | * part of the period that we allow rt tasks to run in us. |
823 | * default: 0.95s | 822 | * default: 0.95s |
824 | */ | 823 | */ |
825 | int sysctl_sched_rt_runtime = 950000; | 824 | int sysctl_sched_rt_runtime = 950000; |
826 | 825 | ||
827 | static inline u64 global_rt_period(void) | 826 | static inline u64 global_rt_period(void) |
828 | { | 827 | { |
829 | return (u64)sysctl_sched_rt_period * NSEC_PER_USEC; | 828 | return (u64)sysctl_sched_rt_period * NSEC_PER_USEC; |
830 | } | 829 | } |
831 | 830 | ||
832 | static inline u64 global_rt_runtime(void) | 831 | static inline u64 global_rt_runtime(void) |
833 | { | 832 | { |
834 | if (sysctl_sched_rt_runtime < 0) | 833 | if (sysctl_sched_rt_runtime < 0) |
835 | return RUNTIME_INF; | 834 | return RUNTIME_INF; |
836 | 835 | ||
837 | return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC; | 836 | return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC; |
838 | } | 837 | } |
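With the defaults above (sysctl_sched_rt_period = 1000000 us, sysctl_sched_rt_runtime = 950000 us) these helpers return 1,000,000,000 ns and 950,000,000 ns, so realtime tasks may consume at most 95% of every one-second period; a negative runtime maps to RUNTIME_INF instead. A small stand-alone check of that arithmetic, with the constants copied in for illustration:

	#include <stdio.h>
	#include <stdint.h>

	#define NSEC_PER_USEC	1000ULL
	#define RUNTIME_INF	((uint64_t)~0ULL)

	int main(void)
	{
		int sysctl_sched_rt_period = 1000000;	/* us, default 1s */
		int sysctl_sched_rt_runtime = 950000;	/* us, default 0.95s */

		uint64_t period = (uint64_t)sysctl_sched_rt_period * NSEC_PER_USEC;
		uint64_t runtime = sysctl_sched_rt_runtime < 0 ? RUNTIME_INF :
				   (uint64_t)sysctl_sched_rt_runtime * NSEC_PER_USEC;

		printf("period  = %llu ns\n", (unsigned long long)period);	/* 1000000000 */
		printf("runtime = %llu ns\n", (unsigned long long)runtime);	/* 950000000, i.e. 95% of the period */
		return 0;
	}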
839 | 838 | ||
840 | #ifndef prepare_arch_switch | 839 | #ifndef prepare_arch_switch |
841 | # define prepare_arch_switch(next) do { } while (0) | 840 | # define prepare_arch_switch(next) do { } while (0) |
842 | #endif | 841 | #endif |
843 | #ifndef finish_arch_switch | 842 | #ifndef finish_arch_switch |
844 | # define finish_arch_switch(prev) do { } while (0) | 843 | # define finish_arch_switch(prev) do { } while (0) |
845 | #endif | 844 | #endif |
846 | 845 | ||
847 | static inline int task_current(struct rq *rq, struct task_struct *p) | 846 | static inline int task_current(struct rq *rq, struct task_struct *p) |
848 | { | 847 | { |
849 | return rq->curr == p; | 848 | return rq->curr == p; |
850 | } | 849 | } |
851 | 850 | ||
852 | static inline int task_running(struct rq *rq, struct task_struct *p) | 851 | static inline int task_running(struct rq *rq, struct task_struct *p) |
853 | { | 852 | { |
854 | #ifdef CONFIG_SMP | 853 | #ifdef CONFIG_SMP |
855 | return p->on_cpu; | 854 | return p->on_cpu; |
856 | #else | 855 | #else |
857 | return task_current(rq, p); | 856 | return task_current(rq, p); |
858 | #endif | 857 | #endif |
859 | } | 858 | } |
860 | 859 | ||
861 | #ifndef __ARCH_WANT_UNLOCKED_CTXSW | 860 | #ifndef __ARCH_WANT_UNLOCKED_CTXSW |
862 | static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) | 861 | static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) |
863 | { | 862 | { |
864 | #ifdef CONFIG_SMP | 863 | #ifdef CONFIG_SMP |
865 | /* | 864 | /* |
866 | * We can optimise this out completely for !SMP, because the | 865 | * We can optimise this out completely for !SMP, because the |
867 | * SMP rebalancing from interrupt is the only thing that cares | 866 | * SMP rebalancing from interrupt is the only thing that cares |
868 | * here. | 867 | * here. |
869 | */ | 868 | */ |
870 | next->on_cpu = 1; | 869 | next->on_cpu = 1; |
871 | #endif | 870 | #endif |
872 | } | 871 | } |
873 | 872 | ||
874 | static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) | 873 | static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) |
875 | { | 874 | { |
876 | #ifdef CONFIG_SMP | 875 | #ifdef CONFIG_SMP |
877 | /* | 876 | /* |
878 | * After ->on_cpu is cleared, the task can be moved to a different CPU. | 877 | * After ->on_cpu is cleared, the task can be moved to a different CPU. |
879 | * We must ensure this doesn't happen until the switch is completely | 878 | * We must ensure this doesn't happen until the switch is completely |
880 | * finished. | 879 | * finished. |
881 | */ | 880 | */ |
882 | smp_wmb(); | 881 | smp_wmb(); |
883 | prev->on_cpu = 0; | 882 | prev->on_cpu = 0; |
884 | #endif | 883 | #endif |
885 | #ifdef CONFIG_DEBUG_SPINLOCK | 884 | #ifdef CONFIG_DEBUG_SPINLOCK |
886 | /* this is a valid case when another task releases the spinlock */ | 885 | /* this is a valid case when another task releases the spinlock */ |
887 | rq->lock.owner = current; | 886 | rq->lock.owner = current; |
888 | #endif | 887 | #endif |
889 | /* | 888 | /* |
890 | * If we are tracking spinlock dependencies then we have to | 889 | * If we are tracking spinlock dependencies then we have to |
891 | * fix up the runqueue lock - which gets 'carried over' from | 890 | * fix up the runqueue lock - which gets 'carried over' from |
892 | * prev into current: | 891 | * prev into current: |
893 | */ | 892 | */ |
894 | spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_); | 893 | spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_); |
895 | 894 | ||
896 | raw_spin_unlock_irq(&rq->lock); | 895 | raw_spin_unlock_irq(&rq->lock); |
897 | } | 896 | } |
898 | 897 | ||
899 | #else /* __ARCH_WANT_UNLOCKED_CTXSW */ | 898 | #else /* __ARCH_WANT_UNLOCKED_CTXSW */ |
900 | static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) | 899 | static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) |
901 | { | 900 | { |
902 | #ifdef CONFIG_SMP | 901 | #ifdef CONFIG_SMP |
903 | /* | 902 | /* |
904 | * We can optimise this out completely for !SMP, because the | 903 | * We can optimise this out completely for !SMP, because the |
905 | * SMP rebalancing from interrupt is the only thing that cares | 904 | * SMP rebalancing from interrupt is the only thing that cares |
906 | * here. | 905 | * here. |
907 | */ | 906 | */ |
908 | next->on_cpu = 1; | 907 | next->on_cpu = 1; |
909 | #endif | 908 | #endif |
910 | #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW | 909 | #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW |
911 | raw_spin_unlock_irq(&rq->lock); | 910 | raw_spin_unlock_irq(&rq->lock); |
912 | #else | 911 | #else |
913 | raw_spin_unlock(&rq->lock); | 912 | raw_spin_unlock(&rq->lock); |
914 | #endif | 913 | #endif |
915 | } | 914 | } |
916 | 915 | ||
917 | static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) | 916 | static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) |
918 | { | 917 | { |
919 | #ifdef CONFIG_SMP | 918 | #ifdef CONFIG_SMP |
920 | /* | 919 | /* |
921 | * After ->on_cpu is cleared, the task can be moved to a different CPU. | 920 | * After ->on_cpu is cleared, the task can be moved to a different CPU. |
922 | * We must ensure this doesn't happen until the switch is completely | 921 | * We must ensure this doesn't happen until the switch is completely |
923 | * finished. | 922 | * finished. |
924 | */ | 923 | */ |
925 | smp_wmb(); | 924 | smp_wmb(); |
926 | prev->on_cpu = 0; | 925 | prev->on_cpu = 0; |
927 | #endif | 926 | #endif |
928 | #ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW | 927 | #ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW |
929 | local_irq_enable(); | 928 | local_irq_enable(); |
930 | #endif | 929 | #endif |
931 | } | 930 | } |
932 | #endif /* __ARCH_WANT_UNLOCKED_CTXSW */ | 931 | #endif /* __ARCH_WANT_UNLOCKED_CTXSW */ |
933 | 932 | ||
934 | /* | 933 | /* |
935 | * __task_rq_lock - lock the rq @p resides on. | 934 | * __task_rq_lock - lock the rq @p resides on. |
936 | */ | 935 | */ |
937 | static inline struct rq *__task_rq_lock(struct task_struct *p) | 936 | static inline struct rq *__task_rq_lock(struct task_struct *p) |
938 | __acquires(rq->lock) | 937 | __acquires(rq->lock) |
939 | { | 938 | { |
940 | struct rq *rq; | 939 | struct rq *rq; |
941 | 940 | ||
942 | lockdep_assert_held(&p->pi_lock); | 941 | lockdep_assert_held(&p->pi_lock); |
943 | 942 | ||
944 | for (;;) { | 943 | for (;;) { |
945 | rq = task_rq(p); | 944 | rq = task_rq(p); |
946 | raw_spin_lock(&rq->lock); | 945 | raw_spin_lock(&rq->lock); |
947 | if (likely(rq == task_rq(p))) | 946 | if (likely(rq == task_rq(p))) |
948 | return rq; | 947 | return rq; |
949 | raw_spin_unlock(&rq->lock); | 948 | raw_spin_unlock(&rq->lock); |
950 | } | 949 | } |
951 | } | 950 | } |
952 | 951 | ||
953 | /* | 952 | /* |
954 | * task_rq_lock - lock p->pi_lock and lock the rq @p resides on. | 953 | * task_rq_lock - lock p->pi_lock and lock the rq @p resides on. |
955 | */ | 954 | */ |
956 | static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags) | 955 | static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags) |
957 | __acquires(p->pi_lock) | 956 | __acquires(p->pi_lock) |
958 | __acquires(rq->lock) | 957 | __acquires(rq->lock) |
959 | { | 958 | { |
960 | struct rq *rq; | 959 | struct rq *rq; |
961 | 960 | ||
962 | for (;;) { | 961 | for (;;) { |
963 | raw_spin_lock_irqsave(&p->pi_lock, *flags); | 962 | raw_spin_lock_irqsave(&p->pi_lock, *flags); |
964 | rq = task_rq(p); | 963 | rq = task_rq(p); |
965 | raw_spin_lock(&rq->lock); | 964 | raw_spin_lock(&rq->lock); |
966 | if (likely(rq == task_rq(p))) | 965 | if (likely(rq == task_rq(p))) |
967 | return rq; | 966 | return rq; |
968 | raw_spin_unlock(&rq->lock); | 967 | raw_spin_unlock(&rq->lock); |
969 | raw_spin_unlock_irqrestore(&p->pi_lock, *flags); | 968 | raw_spin_unlock_irqrestore(&p->pi_lock, *flags); |
970 | } | 969 | } |
971 | } | 970 | } |
972 | 971 | ||
973 | static void __task_rq_unlock(struct rq *rq) | 972 | static void __task_rq_unlock(struct rq *rq) |
974 | __releases(rq->lock) | 973 | __releases(rq->lock) |
975 | { | 974 | { |
976 | raw_spin_unlock(&rq->lock); | 975 | raw_spin_unlock(&rq->lock); |
977 | } | 976 | } |
978 | 977 | ||
979 | static inline void | 978 | static inline void |
980 | task_rq_unlock(struct rq *rq, struct task_struct *p, unsigned long *flags) | 979 | task_rq_unlock(struct rq *rq, struct task_struct *p, unsigned long *flags) |
981 | __releases(rq->lock) | 980 | __releases(rq->lock) |
982 | __releases(p->pi_lock) | 981 | __releases(p->pi_lock) |
983 | { | 982 | { |
984 | raw_spin_unlock(&rq->lock); | 983 | raw_spin_unlock(&rq->lock); |
985 | raw_spin_unlock_irqrestore(&p->pi_lock, *flags); | 984 | raw_spin_unlock_irqrestore(&p->pi_lock, *flags); |
986 | } | 985 | } |
987 | 986 | ||
988 | /* | 987 | /* |
989 | * this_rq_lock - lock this runqueue and disable interrupts. | 988 | * this_rq_lock - lock this runqueue and disable interrupts. |
990 | */ | 989 | */ |
991 | static struct rq *this_rq_lock(void) | 990 | static struct rq *this_rq_lock(void) |
992 | __acquires(rq->lock) | 991 | __acquires(rq->lock) |
993 | { | 992 | { |
994 | struct rq *rq; | 993 | struct rq *rq; |
995 | 994 | ||
996 | local_irq_disable(); | 995 | local_irq_disable(); |
997 | rq = this_rq(); | 996 | rq = this_rq(); |
998 | raw_spin_lock(&rq->lock); | 997 | raw_spin_lock(&rq->lock); |
999 | 998 | ||
1000 | return rq; | 999 | return rq; |
1001 | } | 1000 | } |
1002 | 1001 | ||
1003 | #ifdef CONFIG_SCHED_HRTICK | 1002 | #ifdef CONFIG_SCHED_HRTICK |
1004 | /* | 1003 | /* |
1005 | * Use HR-timers to deliver accurate preemption points. | 1004 | * Use HR-timers to deliver accurate preemption points. |
1006 | * | 1005 | * |
1007 | * It's all a bit involved since we cannot program an hrt while holding the | 1006 | * It's all a bit involved since we cannot program an hrt while holding the |
1008 | * rq->lock. So what we do is store a state in rq->hrtick_* and ask for a | 1007 | * rq->lock. So what we do is store a state in rq->hrtick_* and ask for a |
1009 | * reschedule event. | 1008 | * reschedule event. |
1010 | * | 1009 | * |
1011 | * When we get rescheduled we reprogram the hrtick_timer outside of the | 1010 | * When we get rescheduled we reprogram the hrtick_timer outside of the |
1012 | * rq->lock. | 1011 | * rq->lock. |
1013 | */ | 1012 | */ |
1014 | 1013 | ||
1015 | /* | 1014 | /* |
1016 | * Use hrtick when: | 1015 | * Use hrtick when: |
1017 | * - enabled by features | 1016 | * - enabled by features |
1018 | * - hrtimer is actually high res | 1017 | * - hrtimer is actually high res |
1019 | */ | 1018 | */ |
1020 | static inline int hrtick_enabled(struct rq *rq) | 1019 | static inline int hrtick_enabled(struct rq *rq) |
1021 | { | 1020 | { |
1022 | if (!sched_feat(HRTICK)) | 1021 | if (!sched_feat(HRTICK)) |
1023 | return 0; | 1022 | return 0; |
1024 | if (!cpu_active(cpu_of(rq))) | 1023 | if (!cpu_active(cpu_of(rq))) |
1025 | return 0; | 1024 | return 0; |
1026 | return hrtimer_is_hres_active(&rq->hrtick_timer); | 1025 | return hrtimer_is_hres_active(&rq->hrtick_timer); |
1027 | } | 1026 | } |
1028 | 1027 | ||
1029 | static void hrtick_clear(struct rq *rq) | 1028 | static void hrtick_clear(struct rq *rq) |
1030 | { | 1029 | { |
1031 | if (hrtimer_active(&rq->hrtick_timer)) | 1030 | if (hrtimer_active(&rq->hrtick_timer)) |
1032 | hrtimer_cancel(&rq->hrtick_timer); | 1031 | hrtimer_cancel(&rq->hrtick_timer); |
1033 | } | 1032 | } |
1034 | 1033 | ||
1035 | /* | 1034 | /* |
1036 | * High-resolution timer tick. | 1035 | * High-resolution timer tick. |
1037 | * Runs from hardirq context with interrupts disabled. | 1036 | * Runs from hardirq context with interrupts disabled. |
1038 | */ | 1037 | */ |
1039 | static enum hrtimer_restart hrtick(struct hrtimer *timer) | 1038 | static enum hrtimer_restart hrtick(struct hrtimer *timer) |
1040 | { | 1039 | { |
1041 | struct rq *rq = container_of(timer, struct rq, hrtick_timer); | 1040 | struct rq *rq = container_of(timer, struct rq, hrtick_timer); |
1042 | 1041 | ||
1043 | WARN_ON_ONCE(cpu_of(rq) != smp_processor_id()); | 1042 | WARN_ON_ONCE(cpu_of(rq) != smp_processor_id()); |
1044 | 1043 | ||
1045 | raw_spin_lock(&rq->lock); | 1044 | raw_spin_lock(&rq->lock); |
1046 | update_rq_clock(rq); | 1045 | update_rq_clock(rq); |
1047 | rq->curr->sched_class->task_tick(rq, rq->curr, 1); | 1046 | rq->curr->sched_class->task_tick(rq, rq->curr, 1); |
1048 | raw_spin_unlock(&rq->lock); | 1047 | raw_spin_unlock(&rq->lock); |
1049 | 1048 | ||
1050 | return HRTIMER_NORESTART; | 1049 | return HRTIMER_NORESTART; |
1051 | } | 1050 | } |
1052 | 1051 | ||
1053 | #ifdef CONFIG_SMP | 1052 | #ifdef CONFIG_SMP |
1054 | /* | 1053 | /* |
1055 | * called from hardirq (IPI) context | 1054 | * called from hardirq (IPI) context |
1056 | */ | 1055 | */ |
1057 | static void __hrtick_start(void *arg) | 1056 | static void __hrtick_start(void *arg) |
1058 | { | 1057 | { |
1059 | struct rq *rq = arg; | 1058 | struct rq *rq = arg; |
1060 | 1059 | ||
1061 | raw_spin_lock(&rq->lock); | 1060 | raw_spin_lock(&rq->lock); |
1062 | hrtimer_restart(&rq->hrtick_timer); | 1061 | hrtimer_restart(&rq->hrtick_timer); |
1063 | rq->hrtick_csd_pending = 0; | 1062 | rq->hrtick_csd_pending = 0; |
1064 | raw_spin_unlock(&rq->lock); | 1063 | raw_spin_unlock(&rq->lock); |
1065 | } | 1064 | } |
1066 | 1065 | ||
1067 | /* | 1066 | /* |
1068 | * Called to set the hrtick timer state. | 1067 | * Called to set the hrtick timer state. |
1069 | * | 1068 | * |
1070 | * called with rq->lock held and irqs disabled | 1069 | * called with rq->lock held and irqs disabled |
1071 | */ | 1070 | */ |
1072 | static void hrtick_start(struct rq *rq, u64 delay) | 1071 | static void hrtick_start(struct rq *rq, u64 delay) |
1073 | { | 1072 | { |
1074 | struct hrtimer *timer = &rq->hrtick_timer; | 1073 | struct hrtimer *timer = &rq->hrtick_timer; |
1075 | ktime_t time = ktime_add_ns(timer->base->get_time(), delay); | 1074 | ktime_t time = ktime_add_ns(timer->base->get_time(), delay); |
1076 | 1075 | ||
1077 | hrtimer_set_expires(timer, time); | 1076 | hrtimer_set_expires(timer, time); |
1078 | 1077 | ||
1079 | if (rq == this_rq()) { | 1078 | if (rq == this_rq()) { |
1080 | hrtimer_restart(timer); | 1079 | hrtimer_restart(timer); |
1081 | } else if (!rq->hrtick_csd_pending) { | 1080 | } else if (!rq->hrtick_csd_pending) { |
1082 | __smp_call_function_single(cpu_of(rq), &rq->hrtick_csd, 0); | 1081 | __smp_call_function_single(cpu_of(rq), &rq->hrtick_csd, 0); |
1083 | rq->hrtick_csd_pending = 1; | 1082 | rq->hrtick_csd_pending = 1; |
1084 | } | 1083 | } |
1085 | } | 1084 | } |
1086 | 1085 | ||
1087 | static int | 1086 | static int |
1088 | hotplug_hrtick(struct notifier_block *nfb, unsigned long action, void *hcpu) | 1087 | hotplug_hrtick(struct notifier_block *nfb, unsigned long action, void *hcpu) |
1089 | { | 1088 | { |
1090 | int cpu = (int)(long)hcpu; | 1089 | int cpu = (int)(long)hcpu; |
1091 | 1090 | ||
1092 | switch (action) { | 1091 | switch (action) { |
1093 | case CPU_UP_CANCELED: | 1092 | case CPU_UP_CANCELED: |
1094 | case CPU_UP_CANCELED_FROZEN: | 1093 | case CPU_UP_CANCELED_FROZEN: |
1095 | case CPU_DOWN_PREPARE: | 1094 | case CPU_DOWN_PREPARE: |
1096 | case CPU_DOWN_PREPARE_FROZEN: | 1095 | case CPU_DOWN_PREPARE_FROZEN: |
1097 | case CPU_DEAD: | 1096 | case CPU_DEAD: |
1098 | case CPU_DEAD_FROZEN: | 1097 | case CPU_DEAD_FROZEN: |
1099 | hrtick_clear(cpu_rq(cpu)); | 1098 | hrtick_clear(cpu_rq(cpu)); |
1100 | return NOTIFY_OK; | 1099 | return NOTIFY_OK; |
1101 | } | 1100 | } |
1102 | 1101 | ||
1103 | return NOTIFY_DONE; | 1102 | return NOTIFY_DONE; |
1104 | } | 1103 | } |
1105 | 1104 | ||
1106 | static __init void init_hrtick(void) | 1105 | static __init void init_hrtick(void) |
1107 | { | 1106 | { |
1108 | hotcpu_notifier(hotplug_hrtick, 0); | 1107 | hotcpu_notifier(hotplug_hrtick, 0); |
1109 | } | 1108 | } |
1110 | #else | 1109 | #else |
1111 | /* | 1110 | /* |
1112 | * Called to set the hrtick timer state. | 1111 | * Called to set the hrtick timer state. |
1113 | * | 1112 | * |
1114 | * called with rq->lock held and irqs disabled | 1113 | * called with rq->lock held and irqs disabled |
1115 | */ | 1114 | */ |
1116 | static void hrtick_start(struct rq *rq, u64 delay) | 1115 | static void hrtick_start(struct rq *rq, u64 delay) |
1117 | { | 1116 | { |
1118 | __hrtimer_start_range_ns(&rq->hrtick_timer, ns_to_ktime(delay), 0, | 1117 | __hrtimer_start_range_ns(&rq->hrtick_timer, ns_to_ktime(delay), 0, |
1119 | HRTIMER_MODE_REL_PINNED, 0); | 1118 | HRTIMER_MODE_REL_PINNED, 0); |
1120 | } | 1119 | } |
1121 | 1120 | ||
1122 | static inline void init_hrtick(void) | 1121 | static inline void init_hrtick(void) |
1123 | { | 1122 | { |
1124 | } | 1123 | } |
1125 | #endif /* CONFIG_SMP */ | 1124 | #endif /* CONFIG_SMP */ |
1126 | 1125 | ||
1127 | static void init_rq_hrtick(struct rq *rq) | 1126 | static void init_rq_hrtick(struct rq *rq) |
1128 | { | 1127 | { |
1129 | #ifdef CONFIG_SMP | 1128 | #ifdef CONFIG_SMP |
1130 | rq->hrtick_csd_pending = 0; | 1129 | rq->hrtick_csd_pending = 0; |
1131 | 1130 | ||
1132 | rq->hrtick_csd.flags = 0; | 1131 | rq->hrtick_csd.flags = 0; |
1133 | rq->hrtick_csd.func = __hrtick_start; | 1132 | rq->hrtick_csd.func = __hrtick_start; |
1134 | rq->hrtick_csd.info = rq; | 1133 | rq->hrtick_csd.info = rq; |
1135 | #endif | 1134 | #endif |
1136 | 1135 | ||
1137 | hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); | 1136 | hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); |
1138 | rq->hrtick_timer.function = hrtick; | 1137 | rq->hrtick_timer.function = hrtick; |
1139 | } | 1138 | } |
1140 | #else /* CONFIG_SCHED_HRTICK */ | 1139 | #else /* CONFIG_SCHED_HRTICK */ |
1141 | static inline void hrtick_clear(struct rq *rq) | 1140 | static inline void hrtick_clear(struct rq *rq) |
1142 | { | 1141 | { |
1143 | } | 1142 | } |
1144 | 1143 | ||
1145 | static inline void init_rq_hrtick(struct rq *rq) | 1144 | static inline void init_rq_hrtick(struct rq *rq) |
1146 | { | 1145 | { |
1147 | } | 1146 | } |
1148 | 1147 | ||
1149 | static inline void init_hrtick(void) | 1148 | static inline void init_hrtick(void) |
1150 | { | 1149 | { |
1151 | } | 1150 | } |
1152 | #endif /* CONFIG_SCHED_HRTICK */ | 1151 | #endif /* CONFIG_SCHED_HRTICK */ |
1153 | 1152 | ||
1154 | /* | 1153 | /* |
1155 | * resched_task - mark a task 'to be rescheduled now'. | 1154 | * resched_task - mark a task 'to be rescheduled now'. |
1156 | * | 1155 | * |
1157 | * On UP this means the setting of the need_resched flag, on SMP it | 1156 | * On UP this means the setting of the need_resched flag, on SMP it |
1158 | * might also involve a cross-CPU call to trigger the scheduler on | 1157 | * might also involve a cross-CPU call to trigger the scheduler on |
1159 | * the target CPU. | 1158 | * the target CPU. |
1160 | */ | 1159 | */ |
1161 | #ifdef CONFIG_SMP | 1160 | #ifdef CONFIG_SMP |
1162 | 1161 | ||
1163 | #ifndef tsk_is_polling | 1162 | #ifndef tsk_is_polling |
1164 | #define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG) | 1163 | #define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG) |
1165 | #endif | 1164 | #endif |
1166 | 1165 | ||
1167 | static void resched_task(struct task_struct *p) | 1166 | static void resched_task(struct task_struct *p) |
1168 | { | 1167 | { |
1169 | int cpu; | 1168 | int cpu; |
1170 | 1169 | ||
1171 | assert_raw_spin_locked(&task_rq(p)->lock); | 1170 | assert_raw_spin_locked(&task_rq(p)->lock); |
1172 | 1171 | ||
1173 | if (test_tsk_need_resched(p)) | 1172 | if (test_tsk_need_resched(p)) |
1174 | return; | 1173 | return; |
1175 | 1174 | ||
1176 | set_tsk_need_resched(p); | 1175 | set_tsk_need_resched(p); |
1177 | 1176 | ||
1178 | cpu = task_cpu(p); | 1177 | cpu = task_cpu(p); |
1179 | if (cpu == smp_processor_id()) | 1178 | if (cpu == smp_processor_id()) |
1180 | return; | 1179 | return; |
1181 | 1180 | ||
1182 | /* NEED_RESCHED must be visible before we test polling */ | 1181 | /* NEED_RESCHED must be visible before we test polling */ |
1183 | smp_mb(); | 1182 | smp_mb(); |
1184 | if (!tsk_is_polling(p)) | 1183 | if (!tsk_is_polling(p)) |
1185 | smp_send_reschedule(cpu); | 1184 | smp_send_reschedule(cpu); |
1186 | } | 1185 | } |
1187 | 1186 | ||
1188 | static void resched_cpu(int cpu) | 1187 | static void resched_cpu(int cpu) |
1189 | { | 1188 | { |
1190 | struct rq *rq = cpu_rq(cpu); | 1189 | struct rq *rq = cpu_rq(cpu); |
1191 | unsigned long flags; | 1190 | unsigned long flags; |
1192 | 1191 | ||
1193 | if (!raw_spin_trylock_irqsave(&rq->lock, flags)) | 1192 | if (!raw_spin_trylock_irqsave(&rq->lock, flags)) |
1194 | return; | 1193 | return; |
1195 | resched_task(cpu_curr(cpu)); | 1194 | resched_task(cpu_curr(cpu)); |
1196 | raw_spin_unlock_irqrestore(&rq->lock, flags); | 1195 | raw_spin_unlock_irqrestore(&rq->lock, flags); |
1197 | } | 1196 | } |
1198 | 1197 | ||
1199 | #ifdef CONFIG_NO_HZ | 1198 | #ifdef CONFIG_NO_HZ |
1200 | /* | 1199 | /* |
1201 | * In the semi idle case, use the nearest busy cpu for migrating timers | 1200 | * In the semi idle case, use the nearest busy cpu for migrating timers |
1202 | * from an idle cpu. This is good for power-savings. | 1201 | * from an idle cpu. This is good for power-savings. |
1203 | * | 1202 | * |
1204 | * We don't do a similar optimization for a completely idle system, as | 1203 | * We don't do a similar optimization for a completely idle system, as |
1205 | * selecting an idle cpu will add more delays to the timers than intended | 1204 | * selecting an idle cpu will add more delays to the timers than intended |
1206 | * (as that cpu's timer base may not be up to date w.r.t. jiffies etc.). | 1205 | * (as that cpu's timer base may not be up to date w.r.t. jiffies etc.). |
1207 | */ | 1206 | */ |
1208 | int get_nohz_timer_target(void) | 1207 | int get_nohz_timer_target(void) |
1209 | { | 1208 | { |
1210 | int cpu = smp_processor_id(); | 1209 | int cpu = smp_processor_id(); |
1211 | int i; | 1210 | int i; |
1212 | struct sched_domain *sd; | 1211 | struct sched_domain *sd; |
1213 | 1212 | ||
1214 | rcu_read_lock(); | 1213 | rcu_read_lock(); |
1215 | for_each_domain(cpu, sd) { | 1214 | for_each_domain(cpu, sd) { |
1216 | for_each_cpu(i, sched_domain_span(sd)) { | 1215 | for_each_cpu(i, sched_domain_span(sd)) { |
1217 | if (!idle_cpu(i)) { | 1216 | if (!idle_cpu(i)) { |
1218 | cpu = i; | 1217 | cpu = i; |
1219 | goto unlock; | 1218 | goto unlock; |
1220 | } | 1219 | } |
1221 | } | 1220 | } |
1222 | } | 1221 | } |
1223 | unlock: | 1222 | unlock: |
1224 | rcu_read_unlock(); | 1223 | rcu_read_unlock(); |
1225 | return cpu; | 1224 | return cpu; |
1226 | } | 1225 | } |
1227 | /* | 1226 | /* |
1228 | * When add_timer_on() enqueues a timer into the timer wheel of an | 1227 | * When add_timer_on() enqueues a timer into the timer wheel of an |
1229 | * idle CPU then this timer might expire before the next timer event | 1228 | * idle CPU then this timer might expire before the next timer event |
1230 | * which is scheduled to wake up that CPU. In case of a completely | 1229 | * which is scheduled to wake up that CPU. In case of a completely |
1231 | * idle system the next event might even be infinite time into the | 1230 | * idle system the next event might even be infinite time into the |
1232 | * future. wake_up_idle_cpu() ensures that the CPU is woken up and | 1231 | * future. wake_up_idle_cpu() ensures that the CPU is woken up and |
1233 | * leaves the inner idle loop so the newly added timer is taken into | 1232 | * leaves the inner idle loop so the newly added timer is taken into |
1234 | * account when the CPU goes back to idle and evaluates the timer | 1233 | * account when the CPU goes back to idle and evaluates the timer |
1235 | * wheel for the next timer event. | 1234 | * wheel for the next timer event. |
1236 | */ | 1235 | */ |
1237 | void wake_up_idle_cpu(int cpu) | 1236 | void wake_up_idle_cpu(int cpu) |
1238 | { | 1237 | { |
1239 | struct rq *rq = cpu_rq(cpu); | 1238 | struct rq *rq = cpu_rq(cpu); |
1240 | 1239 | ||
1241 | if (cpu == smp_processor_id()) | 1240 | if (cpu == smp_processor_id()) |
1242 | return; | 1241 | return; |
1243 | 1242 | ||
1244 | /* | 1243 | /* |
1245 | * This is safe, as this function is called with the timer | 1244 | * This is safe, as this function is called with the timer |
1246 | * wheel base lock of (cpu) held. When the CPU is on the way | 1245 | * wheel base lock of (cpu) held. When the CPU is on the way |
1247 | * to idle and has not yet set rq->curr to idle then it will | 1246 | * to idle and has not yet set rq->curr to idle then it will |
1248 | * be serialized on the timer wheel base lock and take the new | 1247 | * be serialized on the timer wheel base lock and take the new |
1249 | * timer into account automatically. | 1248 | * timer into account automatically. |
1250 | */ | 1249 | */ |
1251 | if (rq->curr != rq->idle) | 1250 | if (rq->curr != rq->idle) |
1252 | return; | 1251 | return; |
1253 | 1252 | ||
1254 | /* | 1253 | /* |
1255 | * We can set TIF_RESCHED on the idle task of the other CPU | 1254 | * We can set TIF_RESCHED on the idle task of the other CPU |
1256 | * lockless. The worst case is that the other CPU runs the | 1255 | * lockless. The worst case is that the other CPU runs the |
1257 | * idle task through an additional NOOP schedule() | 1256 | * idle task through an additional NOOP schedule() |
1258 | */ | 1257 | */ |
1259 | set_tsk_need_resched(rq->idle); | 1258 | set_tsk_need_resched(rq->idle); |
1260 | 1259 | ||
1261 | /* NEED_RESCHED must be visible before we test polling */ | 1260 | /* NEED_RESCHED must be visible before we test polling */ |
1262 | smp_mb(); | 1261 | smp_mb(); |
1263 | if (!tsk_is_polling(rq->idle)) | 1262 | if (!tsk_is_polling(rq->idle)) |
1264 | smp_send_reschedule(cpu); | 1263 | smp_send_reschedule(cpu); |
1265 | } | 1264 | } |
1266 | 1265 | ||
1267 | #endif /* CONFIG_NO_HZ */ | 1266 | #endif /* CONFIG_NO_HZ */ |
1268 | 1267 | ||
1269 | static u64 sched_avg_period(void) | 1268 | static u64 sched_avg_period(void) |
1270 | { | 1269 | { |
1271 | return (u64)sysctl_sched_time_avg * NSEC_PER_MSEC / 2; | 1270 | return (u64)sysctl_sched_time_avg * NSEC_PER_MSEC / 2; |
1272 | } | 1271 | } |
1273 | 1272 | ||
1274 | static void sched_avg_update(struct rq *rq) | 1273 | static void sched_avg_update(struct rq *rq) |
1275 | { | 1274 | { |
1276 | s64 period = sched_avg_period(); | 1275 | s64 period = sched_avg_period(); |
1277 | 1276 | ||
1278 | while ((s64)(rq->clock - rq->age_stamp) > period) { | 1277 | while ((s64)(rq->clock - rq->age_stamp) > period) { |
1279 | /* | 1278 | /* |
1280 | * Inline assembly required to prevent the compiler | 1279 | * Inline assembly required to prevent the compiler |
1281 | * optimising this loop into a divmod call. | 1280 | * optimising this loop into a divmod call. |
1282 | * See __iter_div_u64_rem() for another example of this. | 1281 | * See __iter_div_u64_rem() for another example of this. |
1283 | */ | 1282 | */ |
1284 | asm("" : "+rm" (rq->age_stamp)); | 1283 | asm("" : "+rm" (rq->age_stamp)); |
1285 | rq->age_stamp += period; | 1284 | rq->age_stamp += period; |
1286 | rq->rt_avg /= 2; | 1285 | rq->rt_avg /= 2; |
1287 | } | 1286 | } |
1288 | } | 1287 | } |
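The loop above ages the rt_avg accumulator geometrically: for every full sched_avg_period() that has elapsed past age_stamp, the stamp is advanced and the accumulator halved, and the empty asm("") exists only to keep the compiler from collapsing the loop into a division. A minimal user-space sketch of the same decay, with made-up ex_* names and without the compiler barrier:

    #include <stdint.h>

    /* Halve the accumulator once per full period elapsed since *age. */
    static void ex_decay(uint64_t *avg, uint64_t *age, uint64_t now, uint64_t period)
    {
            while ((int64_t)(now - *age) > (int64_t)period) {
                    *age += period;
                    *avg /= 2;
            }
    }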
1289 | 1288 | ||
1290 | static void sched_rt_avg_update(struct rq *rq, u64 rt_delta) | 1289 | static void sched_rt_avg_update(struct rq *rq, u64 rt_delta) |
1291 | { | 1290 | { |
1292 | rq->rt_avg += rt_delta; | 1291 | rq->rt_avg += rt_delta; |
1293 | sched_avg_update(rq); | 1292 | sched_avg_update(rq); |
1294 | } | 1293 | } |
1295 | 1294 | ||
1296 | #else /* !CONFIG_SMP */ | 1295 | #else /* !CONFIG_SMP */ |
1297 | static void resched_task(struct task_struct *p) | 1296 | static void resched_task(struct task_struct *p) |
1298 | { | 1297 | { |
1299 | assert_raw_spin_locked(&task_rq(p)->lock); | 1298 | assert_raw_spin_locked(&task_rq(p)->lock); |
1300 | set_tsk_need_resched(p); | 1299 | set_tsk_need_resched(p); |
1301 | } | 1300 | } |
1302 | 1301 | ||
1303 | static void sched_rt_avg_update(struct rq *rq, u64 rt_delta) | 1302 | static void sched_rt_avg_update(struct rq *rq, u64 rt_delta) |
1304 | { | 1303 | { |
1305 | } | 1304 | } |
1306 | 1305 | ||
1307 | static void sched_avg_update(struct rq *rq) | 1306 | static void sched_avg_update(struct rq *rq) |
1308 | { | 1307 | { |
1309 | } | 1308 | } |
1310 | #endif /* CONFIG_SMP */ | 1309 | #endif /* CONFIG_SMP */ |
1311 | 1310 | ||
1312 | #if BITS_PER_LONG == 32 | 1311 | #if BITS_PER_LONG == 32 |
1313 | # define WMULT_CONST (~0UL) | 1312 | # define WMULT_CONST (~0UL) |
1314 | #else | 1313 | #else |
1315 | # define WMULT_CONST (1UL << 32) | 1314 | # define WMULT_CONST (1UL << 32) |
1316 | #endif | 1315 | #endif |
1317 | 1316 | ||
1318 | #define WMULT_SHIFT 32 | 1317 | #define WMULT_SHIFT 32 |
1319 | 1318 | ||
1320 | /* | 1319 | /* |
1321 | * Shift right and round: | 1320 | * Shift right and round: |
1322 | */ | 1321 | */ |
1323 | #define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y)) | 1322 | #define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y)) |
1324 | 1323 | ||
1325 | /* | 1324 | /* |
1326 | * delta *= weight / lw | 1325 | * delta *= weight / lw |
1327 | */ | 1326 | */ |
1328 | static unsigned long | 1327 | static unsigned long |
1329 | calc_delta_mine(unsigned long delta_exec, unsigned long weight, | 1328 | calc_delta_mine(unsigned long delta_exec, unsigned long weight, |
1330 | struct load_weight *lw) | 1329 | struct load_weight *lw) |
1331 | { | 1330 | { |
1332 | u64 tmp; | 1331 | u64 tmp; |
1333 | 1332 | ||
1334 | /* | 1333 | /* |
1335 | * weight can be less than 2^SCHED_LOAD_RESOLUTION for task group sched | 1334 | * weight can be less than 2^SCHED_LOAD_RESOLUTION for task group sched |
1336 | * entities since MIN_SHARES = 2. Treat weight as 1 if less than | 1335 | * entities since MIN_SHARES = 2. Treat weight as 1 if less than |
1337 | * 2^SCHED_LOAD_RESOLUTION. | 1336 | * 2^SCHED_LOAD_RESOLUTION. |
1338 | */ | 1337 | */ |
1339 | if (likely(weight > (1UL << SCHED_LOAD_RESOLUTION))) | 1338 | if (likely(weight > (1UL << SCHED_LOAD_RESOLUTION))) |
1340 | tmp = (u64)delta_exec * scale_load_down(weight); | 1339 | tmp = (u64)delta_exec * scale_load_down(weight); |
1341 | else | 1340 | else |
1342 | tmp = (u64)delta_exec; | 1341 | tmp = (u64)delta_exec; |
1343 | 1342 | ||
1344 | if (!lw->inv_weight) { | 1343 | if (!lw->inv_weight) { |
1345 | unsigned long w = scale_load_down(lw->weight); | 1344 | unsigned long w = scale_load_down(lw->weight); |
1346 | 1345 | ||
1347 | if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST)) | 1346 | if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST)) |
1348 | lw->inv_weight = 1; | 1347 | lw->inv_weight = 1; |
1349 | else if (unlikely(!w)) | 1348 | else if (unlikely(!w)) |
1350 | lw->inv_weight = WMULT_CONST; | 1349 | lw->inv_weight = WMULT_CONST; |
1351 | else | 1350 | else |
1352 | lw->inv_weight = WMULT_CONST / w; | 1351 | lw->inv_weight = WMULT_CONST / w; |
1353 | } | 1352 | } |
1354 | 1353 | ||
1355 | /* | 1354 | /* |
1356 | * Check whether we'd overflow the 64-bit multiplication: | 1355 | * Check whether we'd overflow the 64-bit multiplication: |
1357 | */ | 1356 | */ |
1358 | if (unlikely(tmp > WMULT_CONST)) | 1357 | if (unlikely(tmp > WMULT_CONST)) |
1359 | tmp = SRR(SRR(tmp, WMULT_SHIFT/2) * lw->inv_weight, | 1358 | tmp = SRR(SRR(tmp, WMULT_SHIFT/2) * lw->inv_weight, |
1360 | WMULT_SHIFT/2); | 1359 | WMULT_SHIFT/2); |
1361 | else | 1360 | else |
1362 | tmp = SRR(tmp * lw->inv_weight, WMULT_SHIFT); | 1361 | tmp = SRR(tmp * lw->inv_weight, WMULT_SHIFT); |
1363 | 1362 | ||
1364 | return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX); | 1363 | return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX); |
1365 | } | 1364 | } |
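A minimal, self-contained illustration of the fixed-point trick calc_delta_mine() uses: instead of dividing by the queue weight on every call, divide 2^32 by it once and replace later divisions with a multiply plus a rounded right shift (the SRR macro above). The ex_* names and the standalone main() are made up for the example, and unlike the kernel code it does no overflow checking:

    #include <stdint.h>
    #include <stdio.h>

    #define EX_SRR(x, y) (((x) + (1ULL << ((y) - 1))) >> (y))

    /* delta * weight / lw_weight, via a precalculated inverse of lw_weight. */
    static unsigned long ex_calc_delta(unsigned long delta, unsigned long weight,
                                       unsigned long lw_weight)
    {
            uint64_t inv = (1ULL << 32) / lw_weight;  /* computed once per weight change */

            return (unsigned long)EX_SRR((uint64_t)delta * weight * inv, 32);
    }

    int main(void)
    {
            /* A nice-0 entity (weight 1024) on a queue of total weight 2048
             * receives about half of the raw delta: prints 500000. */
            printf("%lu\n", ex_calc_delta(1000000, 1024, 2048));
            return 0;
    }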
1366 | 1365 | ||
1367 | static inline void update_load_add(struct load_weight *lw, unsigned long inc) | 1366 | static inline void update_load_add(struct load_weight *lw, unsigned long inc) |
1368 | { | 1367 | { |
1369 | lw->weight += inc; | 1368 | lw->weight += inc; |
1370 | lw->inv_weight = 0; | 1369 | lw->inv_weight = 0; |
1371 | } | 1370 | } |
1372 | 1371 | ||
1373 | static inline void update_load_sub(struct load_weight *lw, unsigned long dec) | 1372 | static inline void update_load_sub(struct load_weight *lw, unsigned long dec) |
1374 | { | 1373 | { |
1375 | lw->weight -= dec; | 1374 | lw->weight -= dec; |
1376 | lw->inv_weight = 0; | 1375 | lw->inv_weight = 0; |
1377 | } | 1376 | } |
1378 | 1377 | ||
1379 | static inline void update_load_set(struct load_weight *lw, unsigned long w) | 1378 | static inline void update_load_set(struct load_weight *lw, unsigned long w) |
1380 | { | 1379 | { |
1381 | lw->weight = w; | 1380 | lw->weight = w; |
1382 | lw->inv_weight = 0; | 1381 | lw->inv_weight = 0; |
1383 | } | 1382 | } |
1384 | 1383 | ||
1385 | /* | 1384 | /* |
1386 | * To aid in avoiding the subversion of "niceness" due to uneven distribution | 1385 | * To aid in avoiding the subversion of "niceness" due to uneven distribution |
1387 | * of tasks with abnormal "nice" values across CPUs, the contribution that | 1386 | * of tasks with abnormal "nice" values across CPUs, the contribution that |
1388 | * each task makes to its run queue's load is weighted according to its | 1387 | * each task makes to its run queue's load is weighted according to its |
1389 | * scheduling class and "nice" value. For SCHED_NORMAL tasks this is just a | 1388 | * scheduling class and "nice" value. For SCHED_NORMAL tasks this is just a |
1390 | * scaled version of the new time slice allocation that they receive on time | 1389 | * scaled version of the new time slice allocation that they receive on time |
1391 | * slice expiry etc. | 1390 | * slice expiry etc. |
1392 | */ | 1391 | */ |
1393 | 1392 | ||
1394 | #define WEIGHT_IDLEPRIO 3 | 1393 | #define WEIGHT_IDLEPRIO 3 |
1395 | #define WMULT_IDLEPRIO 1431655765 | 1394 | #define WMULT_IDLEPRIO 1431655765 |
1396 | 1395 | ||
1397 | /* | 1396 | /* |
1398 | * Nice levels are multiplicative, with a gentle 10% change for every | 1397 | * Nice levels are multiplicative, with a gentle 10% change for every |
1399 | * nice level changed. I.e. when a CPU-bound task goes from nice 0 to | 1398 | * nice level changed. I.e. when a CPU-bound task goes from nice 0 to |
1400 | * nice 1, it will get ~10% less CPU time than another CPU-bound task | 1399 | * nice 1, it will get ~10% less CPU time than another CPU-bound task |
1401 | * that remained on nice 0. | 1400 | * that remained on nice 0. |
1402 | * | 1401 | * |
1403 | * The "10% effect" is relative and cumulative: from _any_ nice level, | 1402 | * The "10% effect" is relative and cumulative: from _any_ nice level, |
1404 | * if you go up 1 level, it's -10% CPU usage, if you go down 1 level | 1403 | * if you go up 1 level, it's -10% CPU usage, if you go down 1 level |
1405 | * it's +10% CPU usage. (to achieve that we use a multiplier of 1.25. | 1404 | * it's +10% CPU usage. (to achieve that we use a multiplier of 1.25. |
1406 | * If a task goes up by ~10% and another task goes down by ~10% then | 1405 | * If a task goes up by ~10% and another task goes down by ~10% then |
1407 | * the relative distance between them is ~25%.) | 1406 | * the relative distance between them is ~25%.) |
1408 | */ | 1407 | */ |
1409 | static const int prio_to_weight[40] = { | 1408 | static const int prio_to_weight[40] = { |
1410 | /* -20 */ 88761, 71755, 56483, 46273, 36291, | 1409 | /* -20 */ 88761, 71755, 56483, 46273, 36291, |
1411 | /* -15 */ 29154, 23254, 18705, 14949, 11916, | 1410 | /* -15 */ 29154, 23254, 18705, 14949, 11916, |
1412 | /* -10 */ 9548, 7620, 6100, 4904, 3906, | 1411 | /* -10 */ 9548, 7620, 6100, 4904, 3906, |
1413 | /* -5 */ 3121, 2501, 1991, 1586, 1277, | 1412 | /* -5 */ 3121, 2501, 1991, 1586, 1277, |
1414 | /* 0 */ 1024, 820, 655, 526, 423, | 1413 | /* 0 */ 1024, 820, 655, 526, 423, |
1415 | /* 5 */ 335, 272, 215, 172, 137, | 1414 | /* 5 */ 335, 272, 215, 172, 137, |
1416 | /* 10 */ 110, 87, 70, 56, 45, | 1415 | /* 10 */ 110, 87, 70, 56, 45, |
1417 | /* 15 */ 36, 29, 23, 18, 15, | 1416 | /* 15 */ 36, 29, 23, 18, 15, |
1418 | }; | 1417 | }; |
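As a quick check of the "10% effect" described above: adjacent table entries differ by roughly the 1.25 multiplier (1024 / 820 ≈ 1.25). If two CPU-bound tasks share one CPU and one of them moves from nice 0 to nice 1, its share drops from 50% to 820 / (1024 + 820) ≈ 44.5% (about 10% less CPU time), the other task's share rises to ≈ 55.5% (about 10% more), and the relative distance between the two ends up at 55.5 / 44.5 ≈ 1.25, the ~25% mentioned in the comment.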
1419 | 1418 | ||
1420 | /* | 1419 | /* |
1421 | * Inverse (2^32/x) values of the prio_to_weight[] array, precalculated. | 1420 | * Inverse (2^32/x) values of the prio_to_weight[] array, precalculated. |
1422 | * | 1421 | * |
1423 | * In cases where the weight does not change often, we can use the | 1422 | * In cases where the weight does not change often, we can use the |
1424 | * precalculated inverse to speed up arithmetic by turning divisions | 1423 | * precalculated inverse to speed up arithmetic by turning divisions |
1425 | * into multiplications: | 1424 | * into multiplications: |
1426 | */ | 1425 | */ |
1427 | static const u32 prio_to_wmult[40] = { | 1426 | static const u32 prio_to_wmult[40] = { |
1428 | /* -20 */ 48388, 59856, 76040, 92818, 118348, | 1427 | /* -20 */ 48388, 59856, 76040, 92818, 118348, |
1429 | /* -15 */ 147320, 184698, 229616, 287308, 360437, | 1428 | /* -15 */ 147320, 184698, 229616, 287308, 360437, |
1430 | /* -10 */ 449829, 563644, 704093, 875809, 1099582, | 1429 | /* -10 */ 449829, 563644, 704093, 875809, 1099582, |
1431 | /* -5 */ 1376151, 1717300, 2157191, 2708050, 3363326, | 1430 | /* -5 */ 1376151, 1717300, 2157191, 2708050, 3363326, |
1432 | /* 0 */ 4194304, 5237765, 6557202, 8165337, 10153587, | 1431 | /* 0 */ 4194304, 5237765, 6557202, 8165337, 10153587, |
1433 | /* 5 */ 12820798, 15790321, 19976592, 24970740, 31350126, | 1432 | /* 5 */ 12820798, 15790321, 19976592, 24970740, 31350126, |
1434 | /* 10 */ 39045157, 49367440, 61356676, 76695844, 95443717, | 1433 | /* 10 */ 39045157, 49367440, 61356676, 76695844, 95443717, |
1435 | /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153, | 1434 | /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153, |
1436 | }; | 1435 | }; |
1437 | 1436 | ||
1438 | /* Time spent by the tasks of the cpu accounting group executing in ... */ | 1437 | /* Time spent by the tasks of the cpu accounting group executing in ... */ |
1439 | enum cpuacct_stat_index { | 1438 | enum cpuacct_stat_index { |
1440 | CPUACCT_STAT_USER, /* ... user mode */ | 1439 | CPUACCT_STAT_USER, /* ... user mode */ |
1441 | CPUACCT_STAT_SYSTEM, /* ... kernel mode */ | 1440 | CPUACCT_STAT_SYSTEM, /* ... kernel mode */ |
1442 | 1441 | ||
1443 | CPUACCT_STAT_NSTATS, | 1442 | CPUACCT_STAT_NSTATS, |
1444 | }; | 1443 | }; |
1445 | 1444 | ||
1446 | #ifdef CONFIG_CGROUP_CPUACCT | 1445 | #ifdef CONFIG_CGROUP_CPUACCT |
1447 | static void cpuacct_charge(struct task_struct *tsk, u64 cputime); | 1446 | static void cpuacct_charge(struct task_struct *tsk, u64 cputime); |
1448 | static void cpuacct_update_stats(struct task_struct *tsk, | 1447 | static void cpuacct_update_stats(struct task_struct *tsk, |
1449 | enum cpuacct_stat_index idx, cputime_t val); | 1448 | enum cpuacct_stat_index idx, cputime_t val); |
1450 | #else | 1449 | #else |
1451 | static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {} | 1450 | static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {} |
1452 | static inline void cpuacct_update_stats(struct task_struct *tsk, | 1451 | static inline void cpuacct_update_stats(struct task_struct *tsk, |
1453 | enum cpuacct_stat_index idx, cputime_t val) {} | 1452 | enum cpuacct_stat_index idx, cputime_t val) {} |
1454 | #endif | 1453 | #endif |
1455 | 1454 | ||
1456 | static inline void inc_cpu_load(struct rq *rq, unsigned long load) | 1455 | static inline void inc_cpu_load(struct rq *rq, unsigned long load) |
1457 | { | 1456 | { |
1458 | update_load_add(&rq->load, load); | 1457 | update_load_add(&rq->load, load); |
1459 | } | 1458 | } |
1460 | 1459 | ||
1461 | static inline void dec_cpu_load(struct rq *rq, unsigned long load) | 1460 | static inline void dec_cpu_load(struct rq *rq, unsigned long load) |
1462 | { | 1461 | { |
1463 | update_load_sub(&rq->load, load); | 1462 | update_load_sub(&rq->load, load); |
1464 | } | 1463 | } |
1465 | 1464 | ||
1466 | #if (defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)) || defined(CONFIG_RT_GROUP_SCHED) | 1465 | #if (defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)) || defined(CONFIG_RT_GROUP_SCHED) |
1467 | typedef int (*tg_visitor)(struct task_group *, void *); | 1466 | typedef int (*tg_visitor)(struct task_group *, void *); |
1468 | 1467 | ||
1469 | /* | 1468 | /* |
1470 | * Iterate the full tree, calling @down when first entering a node and @up when | 1469 | * Iterate the full tree, calling @down when first entering a node and @up when |
1471 | * leaving it for the final time. | 1470 | * leaving it for the final time. |
1472 | */ | 1471 | */ |
1473 | static int walk_tg_tree(tg_visitor down, tg_visitor up, void *data) | 1472 | static int walk_tg_tree(tg_visitor down, tg_visitor up, void *data) |
1474 | { | 1473 | { |
1475 | struct task_group *parent, *child; | 1474 | struct task_group *parent, *child; |
1476 | int ret; | 1475 | int ret; |
1477 | 1476 | ||
1478 | rcu_read_lock(); | 1477 | rcu_read_lock(); |
1479 | parent = &root_task_group; | 1478 | parent = &root_task_group; |
1480 | down: | 1479 | down: |
1481 | ret = (*down)(parent, data); | 1480 | ret = (*down)(parent, data); |
1482 | if (ret) | 1481 | if (ret) |
1483 | goto out_unlock; | 1482 | goto out_unlock; |
1484 | list_for_each_entry_rcu(child, &parent->children, siblings) { | 1483 | list_for_each_entry_rcu(child, &parent->children, siblings) { |
1485 | parent = child; | 1484 | parent = child; |
1486 | goto down; | 1485 | goto down; |
1487 | 1486 | ||
1488 | up: | 1487 | up: |
1489 | continue; | 1488 | continue; |
1490 | } | 1489 | } |
1491 | ret = (*up)(parent, data); | 1490 | ret = (*up)(parent, data); |
1492 | if (ret) | 1491 | if (ret) |
1493 | goto out_unlock; | 1492 | goto out_unlock; |
1494 | 1493 | ||
1495 | child = parent; | 1494 | child = parent; |
1496 | parent = parent->parent; | 1495 | parent = parent->parent; |
1497 | if (parent) | 1496 | if (parent) |
1498 | goto up; | 1497 | goto up; |
1499 | out_unlock: | 1498 | out_unlock: |
1500 | rcu_read_unlock(); | 1499 | rcu_read_unlock(); |
1501 | 1500 | ||
1502 | return ret; | 1501 | return ret; |
1503 | } | 1502 | } |
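The goto-based walk above is an iterative pre/post-order traversal: @down runs when a group is first entered, @up when it is left for the final time, and a non-zero return value aborts the walk. A recursive user-space equivalent (illustrative only, with made-up ex_* types; the kernel flattens the recursion, presumably so stack usage does not grow with the depth of the group hierarchy):

    struct ex_group {
            struct ex_group **children;     /* array of nr_children pointers */
            int nr_children;
    };

    typedef int (*ex_visitor)(struct ex_group *g, void *data);

    static int ex_walk(struct ex_group *g, ex_visitor down, ex_visitor up, void *data)
    {
            int i, ret;

            ret = down(g, data);            /* first entry into this node */
            if (ret)
                    return ret;
            for (i = 0; i < g->nr_children; i++) {
                    ret = ex_walk(g->children[i], down, up, data);
                    if (ret)
                            return ret;
            }
            return up(g, data);             /* leaving it for the final time */
    }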
1504 | 1503 | ||
1505 | static int tg_nop(struct task_group *tg, void *data) | 1504 | static int tg_nop(struct task_group *tg, void *data) |
1506 | { | 1505 | { |
1507 | return 0; | 1506 | return 0; |
1508 | } | 1507 | } |
1509 | #endif | 1508 | #endif |
1510 | 1509 | ||
1511 | #ifdef CONFIG_SMP | 1510 | #ifdef CONFIG_SMP |
1512 | /* Used instead of source_load when we know the type == 0 */ | 1511 | /* Used instead of source_load when we know the type == 0 */ |
1513 | static unsigned long weighted_cpuload(const int cpu) | 1512 | static unsigned long weighted_cpuload(const int cpu) |
1514 | { | 1513 | { |
1515 | return cpu_rq(cpu)->load.weight; | 1514 | return cpu_rq(cpu)->load.weight; |
1516 | } | 1515 | } |
1517 | 1516 | ||
1518 | /* | 1517 | /* |
1519 | * Return a low guess at the load of a migration-source cpu weighted | 1518 | * Return a low guess at the load of a migration-source cpu weighted |
1520 | * according to the scheduling class and "nice" value. | 1519 | * according to the scheduling class and "nice" value. |
1521 | * | 1520 | * |
1522 | * We want to under-estimate the load of migration sources, to | 1521 | * We want to under-estimate the load of migration sources, to |
1523 | * balance conservatively. | 1522 | * balance conservatively. |
1524 | */ | 1523 | */ |
1525 | static unsigned long source_load(int cpu, int type) | 1524 | static unsigned long source_load(int cpu, int type) |
1526 | { | 1525 | { |
1527 | struct rq *rq = cpu_rq(cpu); | 1526 | struct rq *rq = cpu_rq(cpu); |
1528 | unsigned long total = weighted_cpuload(cpu); | 1527 | unsigned long total = weighted_cpuload(cpu); |
1529 | 1528 | ||
1530 | if (type == 0 || !sched_feat(LB_BIAS)) | 1529 | if (type == 0 || !sched_feat(LB_BIAS)) |
1531 | return total; | 1530 | return total; |
1532 | 1531 | ||
1533 | return min(rq->cpu_load[type-1], total); | 1532 | return min(rq->cpu_load[type-1], total); |
1534 | } | 1533 | } |
1535 | 1534 | ||
1536 | /* | 1535 | /* |
1537 | * Return a high guess at the load of a migration-target cpu weighted | 1536 | * Return a high guess at the load of a migration-target cpu weighted |
1538 | * according to the scheduling class and "nice" value. | 1537 | * according to the scheduling class and "nice" value. |
1539 | */ | 1538 | */ |
1540 | static unsigned long target_load(int cpu, int type) | 1539 | static unsigned long target_load(int cpu, int type) |
1541 | { | 1540 | { |
1542 | struct rq *rq = cpu_rq(cpu); | 1541 | struct rq *rq = cpu_rq(cpu); |
1543 | unsigned long total = weighted_cpuload(cpu); | 1542 | unsigned long total = weighted_cpuload(cpu); |
1544 | 1543 | ||
1545 | if (type == 0 || !sched_feat(LB_BIAS)) | 1544 | if (type == 0 || !sched_feat(LB_BIAS)) |
1546 | return total; | 1545 | return total; |
1547 | 1546 | ||
1548 | return max(rq->cpu_load[type-1], total); | 1547 | return max(rq->cpu_load[type-1], total); |
1549 | } | 1548 | } |
1550 | 1549 | ||
1551 | static unsigned long power_of(int cpu) | 1550 | static unsigned long power_of(int cpu) |
1552 | { | 1551 | { |
1553 | return cpu_rq(cpu)->cpu_power; | 1552 | return cpu_rq(cpu)->cpu_power; |
1554 | } | 1553 | } |
1555 | 1554 | ||
1556 | static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd); | 1555 | static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd); |
1557 | 1556 | ||
1558 | static unsigned long cpu_avg_load_per_task(int cpu) | 1557 | static unsigned long cpu_avg_load_per_task(int cpu) |
1559 | { | 1558 | { |
1560 | struct rq *rq = cpu_rq(cpu); | 1559 | struct rq *rq = cpu_rq(cpu); |
1561 | unsigned long nr_running = ACCESS_ONCE(rq->nr_running); | 1560 | unsigned long nr_running = ACCESS_ONCE(rq->nr_running); |
1562 | 1561 | ||
1563 | if (nr_running) | 1562 | if (nr_running) |
1564 | rq->avg_load_per_task = rq->load.weight / nr_running; | 1563 | rq->avg_load_per_task = rq->load.weight / nr_running; |
1565 | else | 1564 | else |
1566 | rq->avg_load_per_task = 0; | 1565 | rq->avg_load_per_task = 0; |
1567 | 1566 | ||
1568 | return rq->avg_load_per_task; | 1567 | return rq->avg_load_per_task; |
1569 | } | 1568 | } |
1570 | 1569 | ||
1571 | #ifdef CONFIG_FAIR_GROUP_SCHED | 1570 | #ifdef CONFIG_FAIR_GROUP_SCHED |
1572 | 1571 | ||
1573 | /* | 1572 | /* |
1574 | * Compute the cpu's hierarchical load factor for each task group. | 1573 | * Compute the cpu's hierarchical load factor for each task group. |
1575 | * This needs to be done in a top-down fashion because the load of a child | 1574 | * This needs to be done in a top-down fashion because the load of a child |
1576 | * group is a fraction of its parent's load. | 1575 | * group is a fraction of its parent's load. |
1577 | */ | 1576 | */ |
1578 | static int tg_load_down(struct task_group *tg, void *data) | 1577 | static int tg_load_down(struct task_group *tg, void *data) |
1579 | { | 1578 | { |
1580 | unsigned long load; | 1579 | unsigned long load; |
1581 | long cpu = (long)data; | 1580 | long cpu = (long)data; |
1582 | 1581 | ||
1583 | if (!tg->parent) { | 1582 | if (!tg->parent) { |
1584 | load = cpu_rq(cpu)->load.weight; | 1583 | load = cpu_rq(cpu)->load.weight; |
1585 | } else { | 1584 | } else { |
1586 | load = tg->parent->cfs_rq[cpu]->h_load; | 1585 | load = tg->parent->cfs_rq[cpu]->h_load; |
1587 | load *= tg->se[cpu]->load.weight; | 1586 | load *= tg->se[cpu]->load.weight; |
1588 | load /= tg->parent->cfs_rq[cpu]->load.weight + 1; | 1587 | load /= tg->parent->cfs_rq[cpu]->load.weight + 1; |
1589 | } | 1588 | } |
1590 | 1589 | ||
1591 | tg->cfs_rq[cpu]->h_load = load; | 1590 | tg->cfs_rq[cpu]->h_load = load; |
1592 | 1591 | ||
1593 | return 0; | 1592 | return 0; |
1594 | } | 1593 | } |
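Concretely, for a non-root group the code above computes h_load = parent_h_load * se_weight / (parent_cfs_weight + 1). For example, if the parent's h_load on this cpu is 2048 and this group's entity weight is 1024 out of a parent cfs_rq weight of 4096, the group's hierarchical load comes out as 2048 * 1024 / 4097 ≈ 511; the +1 in the divisor presumably just avoids a division by zero when the parent's queue is empty.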
1595 | 1594 | ||
1596 | static void update_h_load(long cpu) | 1595 | static void update_h_load(long cpu) |
1597 | { | 1596 | { |
1598 | walk_tg_tree(tg_load_down, tg_nop, (void *)cpu); | 1597 | walk_tg_tree(tg_load_down, tg_nop, (void *)cpu); |
1599 | } | 1598 | } |
1600 | 1599 | ||
1601 | #endif | 1600 | #endif |
1602 | 1601 | ||
1603 | #ifdef CONFIG_PREEMPT | 1602 | #ifdef CONFIG_PREEMPT |
1604 | 1603 | ||
1605 | static void double_rq_lock(struct rq *rq1, struct rq *rq2); | 1604 | static void double_rq_lock(struct rq *rq1, struct rq *rq2); |
1606 | 1605 | ||
1607 | /* | 1606 | /* |
1608 | * fair double_lock_balance: Safely acquires both rq->locks in a fair | 1607 | * fair double_lock_balance: Safely acquires both rq->locks in a fair |
1609 | * way at the expense of forcing extra atomic operations in all | 1608 | * way at the expense of forcing extra atomic operations in all |
1610 | * invocations. This assures that the double_lock is acquired using the | 1609 | * invocations. This assures that the double_lock is acquired using the |
1611 | * same underlying policy as the spinlock_t on this architecture, which | 1610 | * same underlying policy as the spinlock_t on this architecture, which |
1612 | * reduces latency compared to the unfair variant below. However, it | 1611 | * reduces latency compared to the unfair variant below. However, it |
1613 | * also adds more overhead and therefore may reduce throughput. | 1612 | * also adds more overhead and therefore may reduce throughput. |
1614 | */ | 1613 | */ |
1615 | static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest) | 1614 | static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest) |
1616 | __releases(this_rq->lock) | 1615 | __releases(this_rq->lock) |
1617 | __acquires(busiest->lock) | 1616 | __acquires(busiest->lock) |
1618 | __acquires(this_rq->lock) | 1617 | __acquires(this_rq->lock) |
1619 | { | 1618 | { |
1620 | raw_spin_unlock(&this_rq->lock); | 1619 | raw_spin_unlock(&this_rq->lock); |
1621 | double_rq_lock(this_rq, busiest); | 1620 | double_rq_lock(this_rq, busiest); |
1622 | 1621 | ||
1623 | return 1; | 1622 | return 1; |
1624 | } | 1623 | } |
1625 | 1624 | ||
1626 | #else | 1625 | #else |
1627 | /* | 1626 | /* |
1628 | * Unfair double_lock_balance: Optimizes throughput at the expense of | 1627 | * Unfair double_lock_balance: Optimizes throughput at the expense of |
1629 | * latency by eliminating extra atomic operations when the locks are | 1628 | * latency by eliminating extra atomic operations when the locks are |
1630 | * already in proper order on entry. This favors lower cpu-ids and will | 1629 | * already in proper order on entry. This favors lower cpu-ids and will |
1631 | * grant the double lock to lower cpus over higher ids under contention, | 1630 | * grant the double lock to lower cpus over higher ids under contention, |
1632 | * regardless of entry order into the function. | 1631 | * regardless of entry order into the function. |
1633 | */ | 1632 | */ |
1634 | static int _double_lock_balance(struct rq *this_rq, struct rq *busiest) | 1633 | static int _double_lock_balance(struct rq *this_rq, struct rq *busiest) |
1635 | __releases(this_rq->lock) | 1634 | __releases(this_rq->lock) |
1636 | __acquires(busiest->lock) | 1635 | __acquires(busiest->lock) |
1637 | __acquires(this_rq->lock) | 1636 | __acquires(this_rq->lock) |
1638 | { | 1637 | { |
1639 | int ret = 0; | 1638 | int ret = 0; |
1640 | 1639 | ||
1641 | if (unlikely(!raw_spin_trylock(&busiest->lock))) { | 1640 | if (unlikely(!raw_spin_trylock(&busiest->lock))) { |
1642 | if (busiest < this_rq) { | 1641 | if (busiest < this_rq) { |
1643 | raw_spin_unlock(&this_rq->lock); | 1642 | raw_spin_unlock(&this_rq->lock); |
1644 | raw_spin_lock(&busiest->lock); | 1643 | raw_spin_lock(&busiest->lock); |
1645 | raw_spin_lock_nested(&this_rq->lock, | 1644 | raw_spin_lock_nested(&this_rq->lock, |
1646 | SINGLE_DEPTH_NESTING); | 1645 | SINGLE_DEPTH_NESTING); |
1647 | ret = 1; | 1646 | ret = 1; |
1648 | } else | 1647 | } else |
1649 | raw_spin_lock_nested(&busiest->lock, | 1648 | raw_spin_lock_nested(&busiest->lock, |
1650 | SINGLE_DEPTH_NESTING); | 1649 | SINGLE_DEPTH_NESTING); |
1651 | } | 1650 | } |
1652 | return ret; | 1651 | return ret; |
1653 | } | 1652 | } |
1654 | 1653 | ||
1655 | #endif /* CONFIG_PREEMPT */ | 1654 | #endif /* CONFIG_PREEMPT */ |
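Both variants rely on the same deadlock-avoidance rule: whenever two runqueue locks have to be held at once, the lock at the lower address is taken first, so two CPUs locking the same pair can never end up waiting on each other (ABBA). A generic user-space sketch of that ordering discipline with pthreads (illustrative only, made-up ex_* names; the pointer comparison mirrors the rq comparison used above):

    #include <pthread.h>

    /* Acquire two mutexes in a globally consistent (address) order. */
    static void ex_lock_pair(pthread_mutex_t *a, pthread_mutex_t *b)
    {
            if (a == b) {
                    pthread_mutex_lock(a);
                    return;
            }
            if (a < b) {
                    pthread_mutex_lock(a);
                    pthread_mutex_lock(b);
            } else {
                    pthread_mutex_lock(b);
                    pthread_mutex_lock(a);
            }
    }

    static void ex_unlock_pair(pthread_mutex_t *a, pthread_mutex_t *b)
    {
            pthread_mutex_unlock(a);
            if (a != b)
                    pthread_mutex_unlock(b);
    }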
1656 | 1655 | ||
1657 | /* | 1656 | /* |
1658 | * double_lock_balance - lock the busiest runqueue, this_rq is locked already. | 1657 | * double_lock_balance - lock the busiest runqueue, this_rq is locked already. |
1659 | */ | 1658 | */ |
1660 | static int double_lock_balance(struct rq *this_rq, struct rq *busiest) | 1659 | static int double_lock_balance(struct rq *this_rq, struct rq *busiest) |
1661 | { | 1660 | { |
1662 | if (unlikely(!irqs_disabled())) { | 1661 | if (unlikely(!irqs_disabled())) { |
1663 | /* printk() doesn't work well under rq->lock */ | 1662 | /* printk() doesn't work well under rq->lock */ |
1664 | raw_spin_unlock(&this_rq->lock); | 1663 | raw_spin_unlock(&this_rq->lock); |
1665 | BUG_ON(1); | 1664 | BUG_ON(1); |
1666 | } | 1665 | } |
1667 | 1666 | ||
1668 | return _double_lock_balance(this_rq, busiest); | 1667 | return _double_lock_balance(this_rq, busiest); |
1669 | } | 1668 | } |
1670 | 1669 | ||
1671 | static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest) | 1670 | static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest) |
1672 | __releases(busiest->lock) | 1671 | __releases(busiest->lock) |
1673 | { | 1672 | { |
1674 | raw_spin_unlock(&busiest->lock); | 1673 | raw_spin_unlock(&busiest->lock); |
1675 | lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_); | 1674 | lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_); |
1676 | } | 1675 | } |
1677 | 1676 | ||
1678 | /* | 1677 | /* |
1679 | * double_rq_lock - safely lock two runqueues | 1678 | * double_rq_lock - safely lock two runqueues |
1680 | * | 1679 | * |
1681 | * Note this does not disable interrupts like task_rq_lock; | 1680 | * Note this does not disable interrupts like task_rq_lock; |
1682 | * you need to do so manually before calling. | 1681 | * you need to do so manually before calling. |
1683 | */ | 1682 | */ |
1684 | static void double_rq_lock(struct rq *rq1, struct rq *rq2) | 1683 | static void double_rq_lock(struct rq *rq1, struct rq *rq2) |
1685 | __acquires(rq1->lock) | 1684 | __acquires(rq1->lock) |
1686 | __acquires(rq2->lock) | 1685 | __acquires(rq2->lock) |
1687 | { | 1686 | { |
1688 | BUG_ON(!irqs_disabled()); | 1687 | BUG_ON(!irqs_disabled()); |
1689 | if (rq1 == rq2) { | 1688 | if (rq1 == rq2) { |
1690 | raw_spin_lock(&rq1->lock); | 1689 | raw_spin_lock(&rq1->lock); |
1691 | __acquire(rq2->lock); /* Fake it out ;) */ | 1690 | __acquire(rq2->lock); /* Fake it out ;) */ |
1692 | } else { | 1691 | } else { |
1693 | if (rq1 < rq2) { | 1692 | if (rq1 < rq2) { |
1694 | raw_spin_lock(&rq1->lock); | 1693 | raw_spin_lock(&rq1->lock); |
1695 | raw_spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING); | 1694 | raw_spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING); |
1696 | } else { | 1695 | } else { |
1697 | raw_spin_lock(&rq2->lock); | 1696 | raw_spin_lock(&rq2->lock); |
1698 | raw_spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING); | 1697 | raw_spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING); |
1699 | } | 1698 | } |
1700 | } | 1699 | } |
1701 | } | 1700 | } |
1702 | 1701 | ||
1703 | /* | 1702 | /* |
1704 | * double_rq_unlock - safely unlock two runqueues | 1703 | * double_rq_unlock - safely unlock two runqueues |
1705 | * | 1704 | * |
1706 | * Note this does not restore interrupts like task_rq_unlock; | 1705 | * Note this does not restore interrupts like task_rq_unlock; |
1707 | * you need to do so manually after calling. | 1706 | * you need to do so manually after calling. |
1708 | */ | 1707 | */ |
1709 | static void double_rq_unlock(struct rq *rq1, struct rq *rq2) | 1708 | static void double_rq_unlock(struct rq *rq1, struct rq *rq2) |
1710 | __releases(rq1->lock) | 1709 | __releases(rq1->lock) |
1711 | __releases(rq2->lock) | 1710 | __releases(rq2->lock) |
1712 | { | 1711 | { |
1713 | raw_spin_unlock(&rq1->lock); | 1712 | raw_spin_unlock(&rq1->lock); |
1714 | if (rq1 != rq2) | 1713 | if (rq1 != rq2) |
1715 | raw_spin_unlock(&rq2->lock); | 1714 | raw_spin_unlock(&rq2->lock); |
1716 | else | 1715 | else |
1717 | __release(rq2->lock); | 1716 | __release(rq2->lock); |
1718 | } | 1717 | } |
1719 | 1718 | ||
1720 | #else /* CONFIG_SMP */ | 1719 | #else /* CONFIG_SMP */ |
1721 | 1720 | ||
1722 | /* | 1721 | /* |
1723 | * double_rq_lock - safely lock two runqueues | 1722 | * double_rq_lock - safely lock two runqueues |
1724 | * | 1723 | * |
1725 | * Note this does not disable interrupts like task_rq_lock; | 1724 | * Note this does not disable interrupts like task_rq_lock; |
1726 | * you need to do so manually before calling. | 1725 | * you need to do so manually before calling. |
1727 | */ | 1726 | */ |
1728 | static void double_rq_lock(struct rq *rq1, struct rq *rq2) | 1727 | static void double_rq_lock(struct rq *rq1, struct rq *rq2) |
1729 | __acquires(rq1->lock) | 1728 | __acquires(rq1->lock) |
1730 | __acquires(rq2->lock) | 1729 | __acquires(rq2->lock) |
1731 | { | 1730 | { |
1732 | BUG_ON(!irqs_disabled()); | 1731 | BUG_ON(!irqs_disabled()); |
1733 | BUG_ON(rq1 != rq2); | 1732 | BUG_ON(rq1 != rq2); |
1734 | raw_spin_lock(&rq1->lock); | 1733 | raw_spin_lock(&rq1->lock); |
1735 | __acquire(rq2->lock); /* Fake it out ;) */ | 1734 | __acquire(rq2->lock); /* Fake it out ;) */ |
1736 | } | 1735 | } |
1737 | 1736 | ||
1738 | /* | 1737 | /* |
1739 | * double_rq_unlock - safely unlock two runqueues | 1738 | * double_rq_unlock - safely unlock two runqueues |
1740 | * | 1739 | * |
1741 | * Note this does not restore interrupts like task_rq_unlock; | 1740 | * Note this does not restore interrupts like task_rq_unlock; |
1742 | * you need to do so manually after calling. | 1741 | * you need to do so manually after calling. |
1743 | */ | 1742 | */ |
1744 | static void double_rq_unlock(struct rq *rq1, struct rq *rq2) | 1743 | static void double_rq_unlock(struct rq *rq1, struct rq *rq2) |
1745 | __releases(rq1->lock) | 1744 | __releases(rq1->lock) |
1746 | __releases(rq2->lock) | 1745 | __releases(rq2->lock) |
1747 | { | 1746 | { |
1748 | BUG_ON(rq1 != rq2); | 1747 | BUG_ON(rq1 != rq2); |
1749 | raw_spin_unlock(&rq1->lock); | 1748 | raw_spin_unlock(&rq1->lock); |
1750 | __release(rq2->lock); | 1749 | __release(rq2->lock); |
1751 | } | 1750 | } |
1752 | 1751 | ||
1753 | #endif | 1752 | #endif |
1754 | 1753 | ||
1755 | static void calc_load_account_idle(struct rq *this_rq); | 1754 | static void calc_load_account_idle(struct rq *this_rq); |
1756 | static void update_sysctl(void); | 1755 | static void update_sysctl(void); |
1757 | static int get_update_sysctl_factor(void); | 1756 | static int get_update_sysctl_factor(void); |
1758 | static void update_cpu_load(struct rq *this_rq); | 1757 | static void update_cpu_load(struct rq *this_rq); |
1759 | 1758 | ||
1760 | static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) | 1759 | static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) |
1761 | { | 1760 | { |
1762 | set_task_rq(p, cpu); | 1761 | set_task_rq(p, cpu); |
1763 | #ifdef CONFIG_SMP | 1762 | #ifdef CONFIG_SMP |
1764 | /* | 1763 | /* |
1765 | * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be | 1764 | * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be |
1766 | * successfully executed on another CPU. We must ensure that updates of | 1765 | * successfully executed on another CPU. We must ensure that updates of |
1767 | * per-task data have been completed by this moment. | 1766 | * per-task data have been completed by this moment. |
1768 | */ | 1767 | */ |
1769 | smp_wmb(); | 1768 | smp_wmb(); |
1770 | task_thread_info(p)->cpu = cpu; | 1769 | task_thread_info(p)->cpu = cpu; |
1771 | #endif | 1770 | #endif |
1772 | } | 1771 | } |
1773 | 1772 | ||
1774 | static const struct sched_class rt_sched_class; | 1773 | static const struct sched_class rt_sched_class; |
1775 | 1774 | ||
1776 | #define sched_class_highest (&stop_sched_class) | 1775 | #define sched_class_highest (&stop_sched_class) |
1777 | #define for_each_class(class) \ | 1776 | #define for_each_class(class) \ |
1778 | for (class = sched_class_highest; class; class = class->next) | 1777 | for (class = sched_class_highest; class; class = class->next) |
1779 | 1778 | ||
1780 | #include "sched_stats.h" | 1779 | #include "sched_stats.h" |
1781 | 1780 | ||
1782 | static void inc_nr_running(struct rq *rq) | 1781 | static void inc_nr_running(struct rq *rq) |
1783 | { | 1782 | { |
1784 | rq->nr_running++; | 1783 | rq->nr_running++; |
1785 | } | 1784 | } |
1786 | 1785 | ||
1787 | static void dec_nr_running(struct rq *rq) | 1786 | static void dec_nr_running(struct rq *rq) |
1788 | { | 1787 | { |
1789 | rq->nr_running--; | 1788 | rq->nr_running--; |
1790 | } | 1789 | } |
1791 | 1790 | ||
1792 | static void set_load_weight(struct task_struct *p) | 1791 | static void set_load_weight(struct task_struct *p) |
1793 | { | 1792 | { |
1794 | int prio = p->static_prio - MAX_RT_PRIO; | 1793 | int prio = p->static_prio - MAX_RT_PRIO; |
1795 | struct load_weight *load = &p->se.load; | 1794 | struct load_weight *load = &p->se.load; |
1796 | 1795 | ||
1797 | /* | 1796 | /* |
1798 | * SCHED_IDLE tasks get minimal weight: | 1797 | * SCHED_IDLE tasks get minimal weight: |
1799 | */ | 1798 | */ |
1800 | if (p->policy == SCHED_IDLE) { | 1799 | if (p->policy == SCHED_IDLE) { |
1801 | load->weight = scale_load(WEIGHT_IDLEPRIO); | 1800 | load->weight = scale_load(WEIGHT_IDLEPRIO); |
1802 | load->inv_weight = WMULT_IDLEPRIO; | 1801 | load->inv_weight = WMULT_IDLEPRIO; |
1803 | return; | 1802 | return; |
1804 | } | 1803 | } |
1805 | 1804 | ||
1806 | load->weight = scale_load(prio_to_weight[prio]); | 1805 | load->weight = scale_load(prio_to_weight[prio]); |
1807 | load->inv_weight = prio_to_wmult[prio]; | 1806 | load->inv_weight = prio_to_wmult[prio]; |
1808 | } | 1807 | } |
1809 | 1808 | ||
1810 | static void enqueue_task(struct rq *rq, struct task_struct *p, int flags) | 1809 | static void enqueue_task(struct rq *rq, struct task_struct *p, int flags) |
1811 | { | 1810 | { |
1812 | update_rq_clock(rq); | 1811 | update_rq_clock(rq); |
1813 | sched_info_queued(p); | 1812 | sched_info_queued(p); |
1814 | p->sched_class->enqueue_task(rq, p, flags); | 1813 | p->sched_class->enqueue_task(rq, p, flags); |
1815 | } | 1814 | } |
1816 | 1815 | ||
1817 | static void dequeue_task(struct rq *rq, struct task_struct *p, int flags) | 1816 | static void dequeue_task(struct rq *rq, struct task_struct *p, int flags) |
1818 | { | 1817 | { |
1819 | update_rq_clock(rq); | 1818 | update_rq_clock(rq); |
1820 | sched_info_dequeued(p); | 1819 | sched_info_dequeued(p); |
1821 | p->sched_class->dequeue_task(rq, p, flags); | 1820 | p->sched_class->dequeue_task(rq, p, flags); |
1822 | } | 1821 | } |
1823 | 1822 | ||
1824 | /* | 1823 | /* |
1825 | * activate_task - move a task to the runqueue. | 1824 | * activate_task - move a task to the runqueue. |
1826 | */ | 1825 | */ |
1827 | static void activate_task(struct rq *rq, struct task_struct *p, int flags) | 1826 | static void activate_task(struct rq *rq, struct task_struct *p, int flags) |
1828 | { | 1827 | { |
1829 | if (task_contributes_to_load(p)) | 1828 | if (task_contributes_to_load(p)) |
1830 | rq->nr_uninterruptible--; | 1829 | rq->nr_uninterruptible--; |
1831 | 1830 | ||
1832 | enqueue_task(rq, p, flags); | 1831 | enqueue_task(rq, p, flags); |
1833 | inc_nr_running(rq); | 1832 | inc_nr_running(rq); |
1834 | } | 1833 | } |
1835 | 1834 | ||
1836 | /* | 1835 | /* |
1837 | * deactivate_task - remove a task from the runqueue. | 1836 | * deactivate_task - remove a task from the runqueue. |
1838 | */ | 1837 | */ |
1839 | static void deactivate_task(struct rq *rq, struct task_struct *p, int flags) | 1838 | static void deactivate_task(struct rq *rq, struct task_struct *p, int flags) |
1840 | { | 1839 | { |
1841 | if (task_contributes_to_load(p)) | 1840 | if (task_contributes_to_load(p)) |
1842 | rq->nr_uninterruptible++; | 1841 | rq->nr_uninterruptible++; |
1843 | 1842 | ||
1844 | dequeue_task(rq, p, flags); | 1843 | dequeue_task(rq, p, flags); |
1845 | dec_nr_running(rq); | 1844 | dec_nr_running(rq); |
1846 | } | 1845 | } |
1847 | 1846 | ||
1848 | #ifdef CONFIG_IRQ_TIME_ACCOUNTING | 1847 | #ifdef CONFIG_IRQ_TIME_ACCOUNTING |
1849 | 1848 | ||
1850 | /* | 1849 | /* |
1851 | * There are no locks covering percpu hardirq/softirq time. | 1850 | * There are no locks covering percpu hardirq/softirq time. |
1852 | * They are only modified in account_system_vtime, on corresponding CPU | 1851 | * They are only modified in account_system_vtime, on corresponding CPU |
1853 | * with interrupts disabled. So, writes are safe. | 1852 | * with interrupts disabled. So, writes are safe. |
1854 | * They are read and saved off onto struct rq in update_rq_clock(). | 1853 | * They are read and saved off onto struct rq in update_rq_clock(). |
1855 | * This may result in another CPU reading this CPU's irq time and racing | 1854 | * This may result in another CPU reading this CPU's irq time and racing |
1856 | * with irq/account_system_vtime on this CPU. We would either get the old or | 1855 | * with irq/account_system_vtime on this CPU. We would either get the old or |
1857 | * the new value, with a side effect of accounting a slice of irq time to the | 1856 | * the new value, with a side effect of accounting a slice of irq time to the |
1858 | * wrong task when an irq is in progress while we read rq->clock. That is a | 1857 | * wrong task when an irq is in progress while we read rq->clock. That is a |
1859 | * worthy compromise in place of having locks on each irq in account_system_time. | 1858 | * worthy compromise in place of having locks on each irq in account_system_time. |
1860 | */ | 1859 | */ |
1861 | static DEFINE_PER_CPU(u64, cpu_hardirq_time); | 1860 | static DEFINE_PER_CPU(u64, cpu_hardirq_time); |
1862 | static DEFINE_PER_CPU(u64, cpu_softirq_time); | 1861 | static DEFINE_PER_CPU(u64, cpu_softirq_time); |
1863 | 1862 | ||
1864 | static DEFINE_PER_CPU(u64, irq_start_time); | 1863 | static DEFINE_PER_CPU(u64, irq_start_time); |
1865 | static int sched_clock_irqtime; | 1864 | static int sched_clock_irqtime; |
1866 | 1865 | ||
1867 | void enable_sched_clock_irqtime(void) | 1866 | void enable_sched_clock_irqtime(void) |
1868 | { | 1867 | { |
1869 | sched_clock_irqtime = 1; | 1868 | sched_clock_irqtime = 1; |
1870 | } | 1869 | } |
1871 | 1870 | ||
1872 | void disable_sched_clock_irqtime(void) | 1871 | void disable_sched_clock_irqtime(void) |
1873 | { | 1872 | { |
1874 | sched_clock_irqtime = 0; | 1873 | sched_clock_irqtime = 0; |
1875 | } | 1874 | } |
1876 | 1875 | ||
1877 | #ifndef CONFIG_64BIT | 1876 | #ifndef CONFIG_64BIT |
1878 | static DEFINE_PER_CPU(seqcount_t, irq_time_seq); | 1877 | static DEFINE_PER_CPU(seqcount_t, irq_time_seq); |
1879 | 1878 | ||
1880 | static inline void irq_time_write_begin(void) | 1879 | static inline void irq_time_write_begin(void) |
1881 | { | 1880 | { |
1882 | __this_cpu_inc(irq_time_seq.sequence); | 1881 | __this_cpu_inc(irq_time_seq.sequence); |
1883 | smp_wmb(); | 1882 | smp_wmb(); |
1884 | } | 1883 | } |
1885 | 1884 | ||
1886 | static inline void irq_time_write_end(void) | 1885 | static inline void irq_time_write_end(void) |
1887 | { | 1886 | { |
1888 | smp_wmb(); | 1887 | smp_wmb(); |
1889 | __this_cpu_inc(irq_time_seq.sequence); | 1888 | __this_cpu_inc(irq_time_seq.sequence); |
1890 | } | 1889 | } |
1891 | 1890 | ||
1892 | static inline u64 irq_time_read(int cpu) | 1891 | static inline u64 irq_time_read(int cpu) |
1893 | { | 1892 | { |
1894 | u64 irq_time; | 1893 | u64 irq_time; |
1895 | unsigned seq; | 1894 | unsigned seq; |
1896 | 1895 | ||
1897 | do { | 1896 | do { |
1898 | seq = read_seqcount_begin(&per_cpu(irq_time_seq, cpu)); | 1897 | seq = read_seqcount_begin(&per_cpu(irq_time_seq, cpu)); |
1899 | irq_time = per_cpu(cpu_softirq_time, cpu) + | 1898 | irq_time = per_cpu(cpu_softirq_time, cpu) + |
1900 | per_cpu(cpu_hardirq_time, cpu); | 1899 | per_cpu(cpu_hardirq_time, cpu); |
1901 | } while (read_seqcount_retry(&per_cpu(irq_time_seq, cpu), seq)); | 1900 | } while (read_seqcount_retry(&per_cpu(irq_time_seq, cpu), seq)); |
1902 | 1901 | ||
1903 | return irq_time; | 1902 | return irq_time; |
1904 | } | 1903 | } |
1905 | #else /* CONFIG_64BIT */ | 1904 | #else /* CONFIG_64BIT */ |
1906 | static inline void irq_time_write_begin(void) | 1905 | static inline void irq_time_write_begin(void) |
1907 | { | 1906 | { |
1908 | } | 1907 | } |
1909 | 1908 | ||
1910 | static inline void irq_time_write_end(void) | 1909 | static inline void irq_time_write_end(void) |
1911 | { | 1910 | { |
1912 | } | 1911 | } |
1913 | 1912 | ||
1914 | static inline u64 irq_time_read(int cpu) | 1913 | static inline u64 irq_time_read(int cpu) |
1915 | { | 1914 | { |
1916 | return per_cpu(cpu_softirq_time, cpu) + per_cpu(cpu_hardirq_time, cpu); | 1915 | return per_cpu(cpu_softirq_time, cpu) + per_cpu(cpu_hardirq_time, cpu); |
1917 | } | 1916 | } |
1918 | #endif /* CONFIG_64BIT */ | 1917 | #endif /* CONFIG_64BIT */ |
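On 32-bit kernels a 64-bit irq time value cannot be loaded in one instruction, so the reader above retries whenever the sequence count is odd (a write is in flight) or has changed across the read. A simplified user-space sketch of the same pattern using C11 atomics, with the 64-bit value split into two 32-bit halves; the ex_* names are made up and the memory ordering follows the usual seqlock-with-fences recipe rather than the kernel's smp_wmb()/seqcount helpers:

    #include <stdatomic.h>
    #include <stdint.h>

    static _Atomic uint32_t ex_seq, ex_lo, ex_hi;

    static void ex_write(uint64_t v)        /* single writer, like one CPU */
    {
            atomic_fetch_add_explicit(&ex_seq, 1, memory_order_relaxed);  /* odd: write in flight */
            atomic_thread_fence(memory_order_release);
            atomic_store_explicit(&ex_lo, (uint32_t)v, memory_order_relaxed);
            atomic_store_explicit(&ex_hi, (uint32_t)(v >> 32), memory_order_relaxed);
            atomic_thread_fence(memory_order_release);
            atomic_fetch_add_explicit(&ex_seq, 1, memory_order_relaxed);  /* even: done */
    }

    static uint64_t ex_read(void)
    {
            uint32_t s, lo, hi;

            do {
                    s  = atomic_load_explicit(&ex_seq, memory_order_acquire);
                    lo = atomic_load_explicit(&ex_lo, memory_order_relaxed);
                    hi = atomic_load_explicit(&ex_hi, memory_order_relaxed);
                    atomic_thread_fence(memory_order_acquire);
            } while ((s & 1) || s != atomic_load_explicit(&ex_seq, memory_order_relaxed));

            return ((uint64_t)hi << 32) | lo;
    }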
1919 | 1918 | ||
1920 | /* | 1919 | /* |
1921 | * Called before incrementing preempt_count on {soft,}irq_enter | 1920 | * Called before incrementing preempt_count on {soft,}irq_enter |
1922 | * and before decrementing preempt_count on {soft,}irq_exit. | 1921 | * and before decrementing preempt_count on {soft,}irq_exit. |
1923 | */ | 1922 | */ |
1924 | void account_system_vtime(struct task_struct *curr) | 1923 | void account_system_vtime(struct task_struct *curr) |
1925 | { | 1924 | { |
1926 | unsigned long flags; | 1925 | unsigned long flags; |
1927 | s64 delta; | 1926 | s64 delta; |
1928 | int cpu; | 1927 | int cpu; |
1929 | 1928 | ||
1930 | if (!sched_clock_irqtime) | 1929 | if (!sched_clock_irqtime) |
1931 | return; | 1930 | return; |
1932 | 1931 | ||
1933 | local_irq_save(flags); | 1932 | local_irq_save(flags); |
1934 | 1933 | ||
1935 | cpu = smp_processor_id(); | 1934 | cpu = smp_processor_id(); |
1936 | delta = sched_clock_cpu(cpu) - __this_cpu_read(irq_start_time); | 1935 | delta = sched_clock_cpu(cpu) - __this_cpu_read(irq_start_time); |
1937 | __this_cpu_add(irq_start_time, delta); | 1936 | __this_cpu_add(irq_start_time, delta); |
1938 | 1937 | ||
1939 | irq_time_write_begin(); | 1938 | irq_time_write_begin(); |
1940 | /* | 1939 | /* |
1941 | * We do not account for softirq time from ksoftirqd here. | 1940 | * We do not account for softirq time from ksoftirqd here. |
1942 | * We want to continue accounting softirq time to the ksoftirqd thread | 1941 | * We want to continue accounting softirq time to the ksoftirqd thread |
1943 | * in that case, so as not to confuse the scheduler with a special task | 1942 | * in that case, so as not to confuse the scheduler with a special task |
1944 | * that does not consume any time but still wants to run. | 1943 | * that does not consume any time but still wants to run. |
1945 | */ | 1944 | */ |
1946 | if (hardirq_count()) | 1945 | if (hardirq_count()) |
1947 | __this_cpu_add(cpu_hardirq_time, delta); | 1946 | __this_cpu_add(cpu_hardirq_time, delta); |
1948 | else if (in_serving_softirq() && curr != this_cpu_ksoftirqd()) | 1947 | else if (in_serving_softirq() && curr != this_cpu_ksoftirqd()) |
1949 | __this_cpu_add(cpu_softirq_time, delta); | 1948 | __this_cpu_add(cpu_softirq_time, delta); |
1950 | 1949 | ||
1951 | irq_time_write_end(); | 1950 | irq_time_write_end(); |
1952 | local_irq_restore(flags); | 1951 | local_irq_restore(flags); |
1953 | } | 1952 | } |
1954 | EXPORT_SYMBOL_GPL(account_system_vtime); | 1953 | EXPORT_SYMBOL_GPL(account_system_vtime); |
1955 | 1954 | ||
1956 | static void update_rq_clock_task(struct rq *rq, s64 delta) | 1955 | static void update_rq_clock_task(struct rq *rq, s64 delta) |
1957 | { | 1956 | { |
1958 | s64 irq_delta; | 1957 | s64 irq_delta; |
1959 | 1958 | ||
1960 | irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time; | 1959 | irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time; |
1961 | 1960 | ||
1962 | /* | 1961 | /* |
1963 | * Since irq_time is only updated on {soft,}irq_exit, we might run into | 1962 | * Since irq_time is only updated on {soft,}irq_exit, we might run into |
1964 | * this case when a previous update_rq_clock() happened inside a | 1963 | * this case when a previous update_rq_clock() happened inside a |
1965 | * {soft,}irq region. | 1964 | * {soft,}irq region. |
1966 | * | 1965 | * |
1967 | * When this happens, we stop ->clock_task and only update the | 1966 | * When this happens, we stop ->clock_task and only update the |
1968 | * prev_irq_time stamp to account for the part that fit, so that a next | 1967 | * prev_irq_time stamp to account for the part that fit, so that a next |
1969 | * update will consume the rest. This ensures ->clock_task is | 1968 | * update will consume the rest. This ensures ->clock_task is |
1970 | * monotonic. | 1969 | * monotonic. |
1971 | * | 1970 | * |
1972 | * It does, however, cause some slight misattribution of {soft,}irq | 1971 | * It does, however, cause some slight misattribution of {soft,}irq |
1973 | * time; a more accurate solution would be to update the irq_time using | 1972 | * time; a more accurate solution would be to update the irq_time using |
1974 | * the current rq->clock timestamp, except that would require using | 1973 | * the current rq->clock timestamp, except that would require using |
1975 | * atomic ops. | 1974 | * atomic ops. |
1976 | */ | 1975 | */ |
1977 | if (irq_delta > delta) | 1976 | if (irq_delta > delta) |
1978 | irq_delta = delta; | 1977 | irq_delta = delta; |
1979 | 1978 | ||
1980 | rq->prev_irq_time += irq_delta; | 1979 | rq->prev_irq_time += irq_delta; |
1981 | delta -= irq_delta; | 1980 | delta -= irq_delta; |
1982 | rq->clock_task += delta; | 1981 | rq->clock_task += delta; |
1983 | 1982 | ||
1984 | if (irq_delta && sched_feat(NONIRQ_POWER)) | 1983 | if (irq_delta && sched_feat(NONIRQ_POWER)) |
1985 | sched_rt_avg_update(rq, irq_delta); | 1984 | sched_rt_avg_update(rq, irq_delta); |
1986 | } | 1985 | } |
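The clamp above is easiest to see with concrete numbers. Here is a small, self-contained C sketch of the same arithmetic (the values are invented and this is not kernel code): clock_task only advances by the non-irq part of delta, and any irq time that does not fit is deferred to the next update, keeping clock_task monotonic.

        #include <stdio.h>
        #include <stdint.h>

        /* Toy walk-through of the irq_delta clamp in update_rq_clock_task().
         * All numbers are made up. */
        int main(void)
        {
                int64_t clock_task = 0, prev_irq_time = 0, irq_time = 0;
                int64_t deltas[]     = { 100, 100 };    /* rq->clock advances (ns) */
                int64_t irq_growth[] = { 150,  20 };    /* irq_time advances (ns)  */

                for (int i = 0; i < 2; i++) {
                        int64_t delta = deltas[i];
                        int64_t irq_delta;

                        irq_time += irq_growth[i];
                        irq_delta = irq_time - prev_irq_time;

                        if (irq_delta > delta)          /* more irq time than fits: */
                                irq_delta = delta;      /* defer the rest           */

                        prev_irq_time += irq_delta;
                        delta -= irq_delta;
                        clock_task += delta;

                        printf("step %d: clock_task=%lld, deferred irq=%lld\n", i + 1,
                               (long long)clock_task,
                               (long long)(irq_time - prev_irq_time));
                }
                return 0;
        }

In the first step 150 ns of irq time arrived but only 100 ns of wall delta, so clock_task does not move and 50 ns is carried over; the second step consumes that remainder.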
1987 | 1986 | ||
1988 | static int irqtime_account_hi_update(void) | 1987 | static int irqtime_account_hi_update(void) |
1989 | { | 1988 | { |
1990 | struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; | 1989 | struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; |
1991 | unsigned long flags; | 1990 | unsigned long flags; |
1992 | u64 latest_ns; | 1991 | u64 latest_ns; |
1993 | int ret = 0; | 1992 | int ret = 0; |
1994 | 1993 | ||
1995 | local_irq_save(flags); | 1994 | local_irq_save(flags); |
1996 | latest_ns = this_cpu_read(cpu_hardirq_time); | 1995 | latest_ns = this_cpu_read(cpu_hardirq_time); |
1997 | if (cputime64_gt(nsecs_to_cputime64(latest_ns), cpustat->irq)) | 1996 | if (cputime64_gt(nsecs_to_cputime64(latest_ns), cpustat->irq)) |
1998 | ret = 1; | 1997 | ret = 1; |
1999 | local_irq_restore(flags); | 1998 | local_irq_restore(flags); |
2000 | return ret; | 1999 | return ret; |
2001 | } | 2000 | } |
2002 | 2001 | ||
2003 | static int irqtime_account_si_update(void) | 2002 | static int irqtime_account_si_update(void) |
2004 | { | 2003 | { |
2005 | struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; | 2004 | struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; |
2006 | unsigned long flags; | 2005 | unsigned long flags; |
2007 | u64 latest_ns; | 2006 | u64 latest_ns; |
2008 | int ret = 0; | 2007 | int ret = 0; |
2009 | 2008 | ||
2010 | local_irq_save(flags); | 2009 | local_irq_save(flags); |
2011 | latest_ns = this_cpu_read(cpu_softirq_time); | 2010 | latest_ns = this_cpu_read(cpu_softirq_time); |
2012 | if (cputime64_gt(nsecs_to_cputime64(latest_ns), cpustat->softirq)) | 2011 | if (cputime64_gt(nsecs_to_cputime64(latest_ns), cpustat->softirq)) |
2013 | ret = 1; | 2012 | ret = 1; |
2014 | local_irq_restore(flags); | 2013 | local_irq_restore(flags); |
2015 | return ret; | 2014 | return ret; |
2016 | } | 2015 | } |
2017 | 2016 | ||
2018 | #else /* CONFIG_IRQ_TIME_ACCOUNTING */ | 2017 | #else /* CONFIG_IRQ_TIME_ACCOUNTING */ |
2019 | 2018 | ||
2020 | #define sched_clock_irqtime (0) | 2019 | #define sched_clock_irqtime (0) |
2021 | 2020 | ||
2022 | static void update_rq_clock_task(struct rq *rq, s64 delta) | 2021 | static void update_rq_clock_task(struct rq *rq, s64 delta) |
2023 | { | 2022 | { |
2024 | rq->clock_task += delta; | 2023 | rq->clock_task += delta; |
2025 | } | 2024 | } |
2026 | 2025 | ||
2027 | #endif /* CONFIG_IRQ_TIME_ACCOUNTING */ | 2026 | #endif /* CONFIG_IRQ_TIME_ACCOUNTING */ |
2028 | 2027 | ||
2029 | #include "sched_idletask.c" | 2028 | #include "sched_idletask.c" |
2030 | #include "sched_fair.c" | 2029 | #include "sched_fair.c" |
2031 | #include "sched_rt.c" | 2030 | #include "sched_rt.c" |
2032 | #include "sched_autogroup.c" | 2031 | #include "sched_autogroup.c" |
2033 | #include "sched_stoptask.c" | 2032 | #include "sched_stoptask.c" |
2034 | #ifdef CONFIG_SCHED_DEBUG | 2033 | #ifdef CONFIG_SCHED_DEBUG |
2035 | # include "sched_debug.c" | 2034 | # include "sched_debug.c" |
2036 | #endif | 2035 | #endif |
2037 | 2036 | ||
2038 | void sched_set_stop_task(int cpu, struct task_struct *stop) | 2037 | void sched_set_stop_task(int cpu, struct task_struct *stop) |
2039 | { | 2038 | { |
2040 | struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 }; | 2039 | struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 }; |
2041 | struct task_struct *old_stop = cpu_rq(cpu)->stop; | 2040 | struct task_struct *old_stop = cpu_rq(cpu)->stop; |
2042 | 2041 | ||
2043 | if (stop) { | 2042 | if (stop) { |
2044 | /* | 2043 | /* |
2045 | * Make it appear like a SCHED_FIFO task; it's something | 2044 | * Make it appear like a SCHED_FIFO task; it's something |
2046 | * userspace knows about and won't get confused about. | 2045 | * userspace knows about and won't get confused about. |
2047 | * | 2046 | * |
2048 | * Also, it will make PI more or less work without too | 2047 | * Also, it will make PI more or less work without too |
2049 | * much confusion -- but then, stop work should not | 2048 | * much confusion -- but then, stop work should not |
2050 | * rely on PI working anyway. | 2049 | * rely on PI working anyway. |
2051 | */ | 2050 | */ |
2052 | sched_setscheduler_nocheck(stop, SCHED_FIFO, ¶m); | 2051 | sched_setscheduler_nocheck(stop, SCHED_FIFO, ¶m); |
2053 | 2052 | ||
2054 | stop->sched_class = &stop_sched_class; | 2053 | stop->sched_class = &stop_sched_class; |
2055 | } | 2054 | } |
2056 | 2055 | ||
2057 | cpu_rq(cpu)->stop = stop; | 2056 | cpu_rq(cpu)->stop = stop; |
2058 | 2057 | ||
2059 | if (old_stop) { | 2058 | if (old_stop) { |
2060 | /* | 2059 | /* |
2061 | * Reset it back to a normal scheduling class so that | 2060 | * Reset it back to a normal scheduling class so that |
2062 | * it can die in pieces. | 2061 | * it can die in pieces. |
2063 | */ | 2062 | */ |
2064 | old_stop->sched_class = &rt_sched_class; | 2063 | old_stop->sched_class = &rt_sched_class; |
2065 | } | 2064 | } |
2066 | } | 2065 | } |
2067 | 2066 | ||
2068 | /* | 2067 | /* |
2069 | * __normal_prio - return the priority that is based on the static prio | 2068 | * __normal_prio - return the priority that is based on the static prio |
2070 | */ | 2069 | */ |
2071 | static inline int __normal_prio(struct task_struct *p) | 2070 | static inline int __normal_prio(struct task_struct *p) |
2072 | { | 2071 | { |
2073 | return p->static_prio; | 2072 | return p->static_prio; |
2074 | } | 2073 | } |
2075 | 2074 | ||
2076 | /* | 2075 | /* |
2077 | * Calculate the expected normal priority: i.e. priority | 2076 | * Calculate the expected normal priority: i.e. priority |
2078 | * without taking RT-inheritance into account. Might be | 2077 | * without taking RT-inheritance into account. Might be |
2079 | * boosted by interactivity modifiers. Changes upon fork, | 2078 | * boosted by interactivity modifiers. Changes upon fork, |
2080 | * setprio syscalls, and whenever the interactivity | 2079 | * setprio syscalls, and whenever the interactivity |
2081 | * estimator recalculates. | 2080 | * estimator recalculates. |
2082 | */ | 2081 | */ |
2083 | static inline int normal_prio(struct task_struct *p) | 2082 | static inline int normal_prio(struct task_struct *p) |
2084 | { | 2083 | { |
2085 | int prio; | 2084 | int prio; |
2086 | 2085 | ||
2087 | if (task_has_rt_policy(p)) | 2086 | if (task_has_rt_policy(p)) |
2088 | prio = MAX_RT_PRIO-1 - p->rt_priority; | 2087 | prio = MAX_RT_PRIO-1 - p->rt_priority; |
2089 | else | 2088 | else |
2090 | prio = __normal_prio(p); | 2089 | prio = __normal_prio(p); |
2091 | return prio; | 2090 | return prio; |
2092 | } | 2091 | } |
2093 | 2092 | ||
2094 | /* | 2093 | /* |
2095 | * Calculate the current priority, i.e. the priority | 2094 | * Calculate the current priority, i.e. the priority |
2096 | * taken into account by the scheduler. This value might | 2095 | * taken into account by the scheduler. This value might |
2097 | * be boosted by RT tasks, or might be boosted by | 2096 | * be boosted by RT tasks, or might be boosted by |
2098 | * interactivity modifiers. Will be RT if the task got | 2097 | * interactivity modifiers. Will be RT if the task got |
2099 | * RT-boosted. If not then it returns p->normal_prio. | 2098 | * RT-boosted. If not then it returns p->normal_prio. |
2100 | */ | 2099 | */ |
2101 | static int effective_prio(struct task_struct *p) | 2100 | static int effective_prio(struct task_struct *p) |
2102 | { | 2101 | { |
2103 | p->normal_prio = normal_prio(p); | 2102 | p->normal_prio = normal_prio(p); |
2104 | /* | 2103 | /* |
2105 | * If we are RT tasks or we were boosted to RT priority, | 2104 | * If we are RT tasks or we were boosted to RT priority, |
2106 | * keep the priority unchanged. Otherwise, update priority | 2105 | * keep the priority unchanged. Otherwise, update priority |
2107 | * to the normal priority: | 2106 | * to the normal priority: |
2108 | */ | 2107 | */ |
2109 | if (!rt_prio(p->prio)) | 2108 | if (!rt_prio(p->prio)) |
2110 | return p->normal_prio; | 2109 | return p->normal_prio; |
2111 | return p->prio; | 2110 | return p->prio; |
2112 | } | 2111 | } |
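As a concrete illustration of the mapping used by normal_prio() above, here is a stand-alone sketch. It assumes the usual MAX_RT_PRIO of 100 and uses invented task parameters; it is not the kernel's code, just the same arithmetic.

        #include <stdio.h>

        /* How normal_prio() folds rt_priority and static_prio onto one scale,
         * assuming MAX_RT_PRIO == 100: RT priorities 1..99 map to 98..0
         * (lower is better); non-RT tasks keep their static_prio (120 = nice 0). */
        #define MAX_RT_PRIO 100

        static int normal_prio_of(int has_rt_policy, int rt_priority, int static_prio)
        {
                if (has_rt_policy)
                        return MAX_RT_PRIO - 1 - rt_priority;
                return static_prio;
        }

        int main(void)
        {
                printf("SCHED_FIFO rt_priority 99 -> prio %d\n", normal_prio_of(1, 99, 0));
                printf("SCHED_FIFO rt_priority 1  -> prio %d\n", normal_prio_of(1, 1, 0));
                printf("SCHED_NORMAL, nice 0      -> prio %d\n", normal_prio_of(0, 0, 120));
                return 0;
        }

effective_prio() then simply keeps p->prio if the task is (or was boosted to) RT, and otherwise returns this normal priority.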
2113 | 2112 | ||
2114 | /** | 2113 | /** |
2115 | * task_curr - is this task currently executing on a CPU? | 2114 | * task_curr - is this task currently executing on a CPU? |
2116 | * @p: the task in question. | 2115 | * @p: the task in question. |
2117 | */ | 2116 | */ |
2118 | inline int task_curr(const struct task_struct *p) | 2117 | inline int task_curr(const struct task_struct *p) |
2119 | { | 2118 | { |
2120 | return cpu_curr(task_cpu(p)) == p; | 2119 | return cpu_curr(task_cpu(p)) == p; |
2121 | } | 2120 | } |
2122 | 2121 | ||
2123 | static inline void check_class_changed(struct rq *rq, struct task_struct *p, | 2122 | static inline void check_class_changed(struct rq *rq, struct task_struct *p, |
2124 | const struct sched_class *prev_class, | 2123 | const struct sched_class *prev_class, |
2125 | int oldprio) | 2124 | int oldprio) |
2126 | { | 2125 | { |
2127 | if (prev_class != p->sched_class) { | 2126 | if (prev_class != p->sched_class) { |
2128 | if (prev_class->switched_from) | 2127 | if (prev_class->switched_from) |
2129 | prev_class->switched_from(rq, p); | 2128 | prev_class->switched_from(rq, p); |
2130 | p->sched_class->switched_to(rq, p); | 2129 | p->sched_class->switched_to(rq, p); |
2131 | } else if (oldprio != p->prio) | 2130 | } else if (oldprio != p->prio) |
2132 | p->sched_class->prio_changed(rq, p, oldprio); | 2131 | p->sched_class->prio_changed(rq, p, oldprio); |
2133 | } | 2132 | } |
2134 | 2133 | ||
2135 | static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) | 2134 | static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) |
2136 | { | 2135 | { |
2137 | const struct sched_class *class; | 2136 | const struct sched_class *class; |
2138 | 2137 | ||
2139 | if (p->sched_class == rq->curr->sched_class) { | 2138 | if (p->sched_class == rq->curr->sched_class) { |
2140 | rq->curr->sched_class->check_preempt_curr(rq, p, flags); | 2139 | rq->curr->sched_class->check_preempt_curr(rq, p, flags); |
2141 | } else { | 2140 | } else { |
2142 | for_each_class(class) { | 2141 | for_each_class(class) { |
2143 | if (class == rq->curr->sched_class) | 2142 | if (class == rq->curr->sched_class) |
2144 | break; | 2143 | break; |
2145 | if (class == p->sched_class) { | 2144 | if (class == p->sched_class) { |
2146 | resched_task(rq->curr); | 2145 | resched_task(rq->curr); |
2147 | break; | 2146 | break; |
2148 | } | 2147 | } |
2149 | } | 2148 | } |
2150 | } | 2149 | } |
2151 | 2150 | ||
2152 | /* | 2151 | /* |
2153 | * A queue event has occurred, and we're going to schedule. In | 2152 | * A queue event has occurred, and we're going to schedule. In |
2154 | * this case, we can save a useless back to back clock update. | 2153 | * this case, we can save a useless back to back clock update. |
2155 | */ | 2154 | */ |
2156 | if (rq->curr->on_rq && test_tsk_need_resched(rq->curr)) | 2155 | if (rq->curr->on_rq && test_tsk_need_resched(rq->curr)) |
2157 | rq->skip_clock_update = 1; | 2156 | rq->skip_clock_update = 1; |
2158 | } | 2157 | } |
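The cross-class branch above relies on for_each_class() walking the scheduling classes from highest to lowest priority: whichever of the two tasks' classes is reached first wins. A minimal user-space model of that ordering check follows; the class list (stop, rt, fair, idle) mirrors the includes in this file but is only illustrative.

        #include <stdio.h>
        #include <string.h>

        /* Model of the cross-class test in check_preempt_curr(). */
        static const char * const classes[] = { "stop", "rt", "fair", "idle" };

        static int higher_class_preempts(const char *woken, const char *running)
        {
                for (size_t i = 0; i < sizeof(classes) / sizeof(classes[0]); i++) {
                        if (!strcmp(classes[i], running))
                                return 0;       /* running task's class seen first */
                        if (!strcmp(classes[i], woken))
                                return 1;       /* woken task's class is higher    */
                }
                return 0;
        }

        int main(void)
        {
                printf("rt wakes while fair runs:  %d\n",
                       higher_class_preempts("rt", "fair"));    /* 1: resched curr */
                printf("fair wakes while rt runs:  %d\n",
                       higher_class_preempts("fair", "rt"));    /* 0: no preempt   */
                return 0;
        }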
2159 | 2158 | ||
2160 | #ifdef CONFIG_SMP | 2159 | #ifdef CONFIG_SMP |
2161 | /* | 2160 | /* |
2162 | * Is this task likely cache-hot: | 2161 | * Is this task likely cache-hot: |
2163 | */ | 2162 | */ |
2164 | static int | 2163 | static int |
2165 | task_hot(struct task_struct *p, u64 now, struct sched_domain *sd) | 2164 | task_hot(struct task_struct *p, u64 now, struct sched_domain *sd) |
2166 | { | 2165 | { |
2167 | s64 delta; | 2166 | s64 delta; |
2168 | 2167 | ||
2169 | if (p->sched_class != &fair_sched_class) | 2168 | if (p->sched_class != &fair_sched_class) |
2170 | return 0; | 2169 | return 0; |
2171 | 2170 | ||
2172 | if (unlikely(p->policy == SCHED_IDLE)) | 2171 | if (unlikely(p->policy == SCHED_IDLE)) |
2173 | return 0; | 2172 | return 0; |
2174 | 2173 | ||
2175 | /* | 2174 | /* |
2176 | * Buddy candidates are cache hot: | 2175 | * Buddy candidates are cache hot: |
2177 | */ | 2176 | */ |
2178 | if (sched_feat(CACHE_HOT_BUDDY) && this_rq()->nr_running && | 2177 | if (sched_feat(CACHE_HOT_BUDDY) && this_rq()->nr_running && |
2179 | (&p->se == cfs_rq_of(&p->se)->next || | 2178 | (&p->se == cfs_rq_of(&p->se)->next || |
2180 | &p->se == cfs_rq_of(&p->se)->last)) | 2179 | &p->se == cfs_rq_of(&p->se)->last)) |
2181 | return 1; | 2180 | return 1; |
2182 | 2181 | ||
2183 | if (sysctl_sched_migration_cost == -1) | 2182 | if (sysctl_sched_migration_cost == -1) |
2184 | return 1; | 2183 | return 1; |
2185 | if (sysctl_sched_migration_cost == 0) | 2184 | if (sysctl_sched_migration_cost == 0) |
2186 | return 0; | 2185 | return 0; |
2187 | 2186 | ||
2188 | delta = now - p->se.exec_start; | 2187 | delta = now - p->se.exec_start; |
2189 | 2188 | ||
2190 | return delta < (s64)sysctl_sched_migration_cost; | 2189 | return delta < (s64)sysctl_sched_migration_cost; |
2191 | } | 2190 | } |
2192 | 2191 | ||
2193 | void set_task_cpu(struct task_struct *p, unsigned int new_cpu) | 2192 | void set_task_cpu(struct task_struct *p, unsigned int new_cpu) |
2194 | { | 2193 | { |
2195 | #ifdef CONFIG_SCHED_DEBUG | 2194 | #ifdef CONFIG_SCHED_DEBUG |
2196 | /* | 2195 | /* |
2197 | * We should never call set_task_cpu() on a blocked task, | 2196 | * We should never call set_task_cpu() on a blocked task, |
2198 | * ttwu() will sort out the placement. | 2197 | * ttwu() will sort out the placement. |
2199 | */ | 2198 | */ |
2200 | WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING && | 2199 | WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING && |
2201 | !(task_thread_info(p)->preempt_count & PREEMPT_ACTIVE)); | 2200 | !(task_thread_info(p)->preempt_count & PREEMPT_ACTIVE)); |
2202 | 2201 | ||
2203 | #ifdef CONFIG_LOCKDEP | 2202 | #ifdef CONFIG_LOCKDEP |
2204 | /* | 2203 | /* |
2205 | * The caller should hold either p->pi_lock or rq->lock, when changing | 2204 | * The caller should hold either p->pi_lock or rq->lock, when changing |
2206 | * a task's CPU. ->pi_lock for waking tasks, rq->lock for runnable tasks. | 2205 | * a task's CPU. ->pi_lock for waking tasks, rq->lock for runnable tasks. |
2207 | * | 2206 | * |
2208 | * sched_move_task() holds both and thus holding either pins the cgroup, | 2207 | * sched_move_task() holds both and thus holding either pins the cgroup, |
2209 | * see set_task_rq(). | 2208 | * see set_task_rq(). |
2210 | * | 2209 | * |
2211 | * Furthermore, all task_rq users should acquire both locks, see | 2210 | * Furthermore, all task_rq users should acquire both locks, see |
2212 | * task_rq_lock(). | 2211 | * task_rq_lock(). |
2213 | */ | 2212 | */ |
2214 | WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) || | 2213 | WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) || |
2215 | lockdep_is_held(&task_rq(p)->lock))); | 2214 | lockdep_is_held(&task_rq(p)->lock))); |
2216 | #endif | 2215 | #endif |
2217 | #endif | 2216 | #endif |
2218 | 2217 | ||
2219 | trace_sched_migrate_task(p, new_cpu); | 2218 | trace_sched_migrate_task(p, new_cpu); |
2220 | 2219 | ||
2221 | if (task_cpu(p) != new_cpu) { | 2220 | if (task_cpu(p) != new_cpu) { |
2222 | p->se.nr_migrations++; | 2221 | p->se.nr_migrations++; |
2223 | perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, 1, NULL, 0); | 2222 | perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, 1, NULL, 0); |
2224 | } | 2223 | } |
2225 | 2224 | ||
2226 | __set_task_cpu(p, new_cpu); | 2225 | __set_task_cpu(p, new_cpu); |
2227 | } | 2226 | } |
2228 | 2227 | ||
2229 | struct migration_arg { | 2228 | struct migration_arg { |
2230 | struct task_struct *task; | 2229 | struct task_struct *task; |
2231 | int dest_cpu; | 2230 | int dest_cpu; |
2232 | }; | 2231 | }; |
2233 | 2232 | ||
2234 | static int migration_cpu_stop(void *data); | 2233 | static int migration_cpu_stop(void *data); |
2235 | 2234 | ||
2236 | /* | 2235 | /* |
2237 | * wait_task_inactive - wait for a thread to unschedule. | 2236 | * wait_task_inactive - wait for a thread to unschedule. |
2238 | * | 2237 | * |
2239 | * If @match_state is nonzero, it's the @p->state value just checked and | 2238 | * If @match_state is nonzero, it's the @p->state value just checked and |
2240 | * not expected to change. If it changes, i.e. @p might have woken up, | 2239 | * not expected to change. If it changes, i.e. @p might have woken up, |
2241 | * then return zero. When we succeed in waiting for @p to be off its CPU, | 2240 | * then return zero. When we succeed in waiting for @p to be off its CPU, |
2242 | * we return a positive number (its total switch count). If a second call | 2241 | * we return a positive number (its total switch count). If a second call |
2243 | * a short while later returns the same number, the caller can be sure that | 2242 | * a short while later returns the same number, the caller can be sure that |
2244 | * @p has remained unscheduled the whole time. | 2243 | * @p has remained unscheduled the whole time. |
2245 | * | 2244 | * |
2246 | * The caller must ensure that the task *will* unschedule sometime soon, | 2245 | * The caller must ensure that the task *will* unschedule sometime soon, |
2247 | * else this function might spin for a *long* time. This function can't | 2246 | * else this function might spin for a *long* time. This function can't |
2248 | * be called with interrupts off, or it may introduce deadlock with | 2247 | * be called with interrupts off, or it may introduce deadlock with |
2249 | * smp_call_function() if an IPI is sent by the same process we are | 2248 | * smp_call_function() if an IPI is sent by the same process we are |
2250 | * waiting to become inactive. | 2249 | * waiting to become inactive. |
2251 | */ | 2250 | */ |
2252 | unsigned long wait_task_inactive(struct task_struct *p, long match_state) | 2251 | unsigned long wait_task_inactive(struct task_struct *p, long match_state) |
2253 | { | 2252 | { |
2254 | unsigned long flags; | 2253 | unsigned long flags; |
2255 | int running, on_rq; | 2254 | int running, on_rq; |
2256 | unsigned long ncsw; | 2255 | unsigned long ncsw; |
2257 | struct rq *rq; | 2256 | struct rq *rq; |
2258 | 2257 | ||
2259 | for (;;) { | 2258 | for (;;) { |
2260 | /* | 2259 | /* |
2261 | * We do the initial early heuristics without holding | 2260 | * We do the initial early heuristics without holding |
2262 | * any task-queue locks at all. We'll only try to get | 2261 | * any task-queue locks at all. We'll only try to get |
2263 | * the runqueue lock when things look like they will | 2262 | * the runqueue lock when things look like they will |
2264 | * work out! | 2263 | * work out! |
2265 | */ | 2264 | */ |
2266 | rq = task_rq(p); | 2265 | rq = task_rq(p); |
2267 | 2266 | ||
2268 | /* | 2267 | /* |
2269 | * If the task is actively running on another CPU | 2268 | * If the task is actively running on another CPU |
2270 | * still, just relax and busy-wait without holding | 2269 | * still, just relax and busy-wait without holding |
2271 | * any locks. | 2270 | * any locks. |
2272 | * | 2271 | * |
2273 | * NOTE! Since we don't hold any locks, it's not | 2272 | * NOTE! Since we don't hold any locks, it's not |
2274 | * even sure that "rq" stays as the right runqueue! | 2273 | * even sure that "rq" stays as the right runqueue! |
2275 | * But we don't care, since "task_running()" will | 2274 | * But we don't care, since "task_running()" will |
2276 | * return false if the runqueue has changed and p | 2275 | * return false if the runqueue has changed and p |
2277 | * is actually now running somewhere else! | 2276 | * is actually now running somewhere else! |
2278 | */ | 2277 | */ |
2279 | while (task_running(rq, p)) { | 2278 | while (task_running(rq, p)) { |
2280 | if (match_state && unlikely(p->state != match_state)) | 2279 | if (match_state && unlikely(p->state != match_state)) |
2281 | return 0; | 2280 | return 0; |
2282 | cpu_relax(); | 2281 | cpu_relax(); |
2283 | } | 2282 | } |
2284 | 2283 | ||
2285 | /* | 2284 | /* |
2286 | * Ok, time to look more closely! We need the rq | 2285 | * Ok, time to look more closely! We need the rq |
2287 | * lock now, to be *sure*. If we're wrong, we'll | 2286 | * lock now, to be *sure*. If we're wrong, we'll |
2288 | * just go back and repeat. | 2287 | * just go back and repeat. |
2289 | */ | 2288 | */ |
2290 | rq = task_rq_lock(p, &flags); | 2289 | rq = task_rq_lock(p, &flags); |
2291 | trace_sched_wait_task(p); | 2290 | trace_sched_wait_task(p); |
2292 | running = task_running(rq, p); | 2291 | running = task_running(rq, p); |
2293 | on_rq = p->on_rq; | 2292 | on_rq = p->on_rq; |
2294 | ncsw = 0; | 2293 | ncsw = 0; |
2295 | if (!match_state || p->state == match_state) | 2294 | if (!match_state || p->state == match_state) |
2296 | ncsw = p->nvcsw | LONG_MIN; /* sets MSB */ | 2295 | ncsw = p->nvcsw | LONG_MIN; /* sets MSB */ |
2297 | task_rq_unlock(rq, p, &flags); | 2296 | task_rq_unlock(rq, p, &flags); |
2298 | 2297 | ||
2299 | /* | 2298 | /* |
2300 | * If it changed from the expected state, bail out now. | 2299 | * If it changed from the expected state, bail out now. |
2301 | */ | 2300 | */ |
2302 | if (unlikely(!ncsw)) | 2301 | if (unlikely(!ncsw)) |
2303 | break; | 2302 | break; |
2304 | 2303 | ||
2305 | /* | 2304 | /* |
2306 | * Was it really running after all now that we | 2305 | * Was it really running after all now that we |
2307 | * checked with the proper locks actually held? | 2306 | * checked with the proper locks actually held? |
2308 | * | 2307 | * |
2309 | * Oops. Go back and try again.. | 2308 | * Oops. Go back and try again.. |
2310 | */ | 2309 | */ |
2311 | if (unlikely(running)) { | 2310 | if (unlikely(running)) { |
2312 | cpu_relax(); | 2311 | cpu_relax(); |
2313 | continue; | 2312 | continue; |
2314 | } | 2313 | } |
2315 | 2314 | ||
2316 | /* | 2315 | /* |
2317 | * It's not enough that it's not actively running, | 2316 | * It's not enough that it's not actively running, |
2318 | * it must be off the runqueue _entirely_, and not | 2317 | * it must be off the runqueue _entirely_, and not |
2319 | * preempted! | 2318 | * preempted! |
2320 | * | 2319 | * |
2321 | * So if it was still runnable (but just not actively | 2320 | * So if it was still runnable (but just not actively |
2322 | * running right now), it's preempted, and we should | 2321 | * running right now), it's preempted, and we should |
2323 | * yield - it could be a while. | 2322 | * yield - it could be a while. |
2324 | */ | 2323 | */ |
2325 | if (unlikely(on_rq)) { | 2324 | if (unlikely(on_rq)) { |
2326 | ktime_t to = ktime_set(0, NSEC_PER_SEC/HZ); | 2325 | ktime_t to = ktime_set(0, NSEC_PER_SEC/HZ); |
2327 | 2326 | ||
2328 | set_current_state(TASK_UNINTERRUPTIBLE); | 2327 | set_current_state(TASK_UNINTERRUPTIBLE); |
2329 | schedule_hrtimeout(&to, HRTIMER_MODE_REL); | 2328 | schedule_hrtimeout(&to, HRTIMER_MODE_REL); |
2330 | continue; | 2329 | continue; |
2331 | } | 2330 | } |
2332 | 2331 | ||
2333 | /* | 2332 | /* |
2334 | * Ahh, all good. It wasn't running, and it wasn't | 2333 | * Ahh, all good. It wasn't running, and it wasn't |
2335 | * runnable, which means that it will never become | 2334 | * runnable, which means that it will never become |
2336 | * running in the future either. We're all done! | 2335 | * running in the future either. We're all done! |
2337 | */ | 2336 | */ |
2338 | break; | 2337 | break; |
2339 | } | 2338 | } |
2340 | 2339 | ||
2341 | return ncsw; | 2340 | return ncsw; |
2342 | } | 2341 | } |
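The ncsw = p->nvcsw | LONG_MIN line above is a small trick worth spelling out: OR-ing in the sign bit guarantees a nonzero return even when the switch count happens to be zero, so zero can be reserved for "the state changed, bail out", while two calls that observe the same count still compare equal. A tiny sketch with invented values:

        #include <stdio.h>
        #include <limits.h>

        int main(void)
        {
                unsigned long nvcsw = 0;                        /* even a zero count... */
                unsigned long ncsw  = nvcsw | LONG_MIN;         /* ...becomes nonzero   */
                unsigned long later = 0UL | LONG_MIN;           /* same count later     */

                printf("ncsw = %#lx, nonzero = %d, unchanged = %d\n",
                       ncsw, ncsw != 0, ncsw == later);
                return 0;
        }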
2343 | 2342 | ||
2344 | /*** | 2343 | /*** |
2345 | * kick_process - kick a running thread to enter/exit the kernel | 2344 | * kick_process - kick a running thread to enter/exit the kernel |
2346 | * @p: the to-be-kicked thread | 2345 | * @p: the to-be-kicked thread |
2347 | * | 2346 | * |
2348 | * Cause a process which is running on another CPU to enter | 2347 | * Cause a process which is running on another CPU to enter |
2349 | * kernel-mode, without any delay. (to get signals handled.) | 2348 | * kernel-mode, without any delay. (to get signals handled.) |
2350 | * | 2349 | * |
2351 | * NOTE: this function doesn't have to take the runqueue lock, | 2350 | * NOTE: this function doesn't have to take the runqueue lock, |
2352 | * because all it wants to ensure is that the remote task enters | 2351 | * because all it wants to ensure is that the remote task enters |
2353 | * the kernel. If the IPI races and the task has been migrated | 2352 | * the kernel. If the IPI races and the task has been migrated |
2354 | * to another CPU then no harm is done and the purpose has been | 2353 | * to another CPU then no harm is done and the purpose has been |
2355 | * achieved as well. | 2354 | * achieved as well. |
2356 | */ | 2355 | */ |
2357 | void kick_process(struct task_struct *p) | 2356 | void kick_process(struct task_struct *p) |
2358 | { | 2357 | { |
2359 | int cpu; | 2358 | int cpu; |
2360 | 2359 | ||
2361 | preempt_disable(); | 2360 | preempt_disable(); |
2362 | cpu = task_cpu(p); | 2361 | cpu = task_cpu(p); |
2363 | if ((cpu != smp_processor_id()) && task_curr(p)) | 2362 | if ((cpu != smp_processor_id()) && task_curr(p)) |
2364 | smp_send_reschedule(cpu); | 2363 | smp_send_reschedule(cpu); |
2365 | preempt_enable(); | 2364 | preempt_enable(); |
2366 | } | 2365 | } |
2367 | EXPORT_SYMBOL_GPL(kick_process); | 2366 | EXPORT_SYMBOL_GPL(kick_process); |
2368 | #endif /* CONFIG_SMP */ | 2367 | #endif /* CONFIG_SMP */ |
2369 | 2368 | ||
2370 | #ifdef CONFIG_SMP | 2369 | #ifdef CONFIG_SMP |
2371 | /* | 2370 | /* |
2372 | * ->cpus_allowed is protected by both rq->lock and p->pi_lock | 2371 | * ->cpus_allowed is protected by both rq->lock and p->pi_lock |
2373 | */ | 2372 | */ |
2374 | static int select_fallback_rq(int cpu, struct task_struct *p) | 2373 | static int select_fallback_rq(int cpu, struct task_struct *p) |
2375 | { | 2374 | { |
2376 | int dest_cpu; | 2375 | int dest_cpu; |
2377 | const struct cpumask *nodemask = cpumask_of_node(cpu_to_node(cpu)); | 2376 | const struct cpumask *nodemask = cpumask_of_node(cpu_to_node(cpu)); |
2378 | 2377 | ||
2379 | /* Look for allowed, online CPU in same node. */ | 2378 | /* Look for allowed, online CPU in same node. */ |
2380 | for_each_cpu_and(dest_cpu, nodemask, cpu_active_mask) | 2379 | for_each_cpu_and(dest_cpu, nodemask, cpu_active_mask) |
2381 | if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed)) | 2380 | if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed)) |
2382 | return dest_cpu; | 2381 | return dest_cpu; |
2383 | 2382 | ||
2384 | /* Any allowed, online CPU? */ | 2383 | /* Any allowed, online CPU? */ |
2385 | dest_cpu = cpumask_any_and(&p->cpus_allowed, cpu_active_mask); | 2384 | dest_cpu = cpumask_any_and(&p->cpus_allowed, cpu_active_mask); |
2386 | if (dest_cpu < nr_cpu_ids) | 2385 | if (dest_cpu < nr_cpu_ids) |
2387 | return dest_cpu; | 2386 | return dest_cpu; |
2388 | 2387 | ||
2389 | /* No more Mr. Nice Guy. */ | 2388 | /* No more Mr. Nice Guy. */ |
2390 | dest_cpu = cpuset_cpus_allowed_fallback(p); | 2389 | dest_cpu = cpuset_cpus_allowed_fallback(p); |
2391 | /* | 2390 | /* |
2392 | * Don't tell them about moving exiting tasks or | 2391 | * Don't tell them about moving exiting tasks or |
2393 | * kernel threads (both mm NULL), since they never | 2392 | * kernel threads (both mm NULL), since they never |
2394 | * leave the kernel. | 2393 | * leave the kernel. |
2395 | */ | 2394 | */ |
2396 | if (p->mm && printk_ratelimit()) { | 2395 | if (p->mm && printk_ratelimit()) { |
2397 | printk(KERN_INFO "process %d (%s) no longer affine to cpu%d\n", | 2396 | printk(KERN_INFO "process %d (%s) no longer affine to cpu%d\n", |
2398 | task_pid_nr(p), p->comm, cpu); | 2397 | task_pid_nr(p), p->comm, cpu); |
2399 | } | 2398 | } |
2400 | 2399 | ||
2401 | return dest_cpu; | 2400 | return dest_cpu; |
2402 | } | 2401 | } |
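A toy model of the fallback cascade above, outside the kernel: prefer an allowed, online CPU in the same node, then any allowed online CPU, and otherwise fall back further (cpuset_cpus_allowed_fallback() in the real code). The CPU/node layout and masks below are entirely made up.

        #include <stdio.h>

        #define NR_CPUS 4

        int main(void)
        {
                int online[NR_CPUS]  = { 1, 1, 1, 0 };  /* cpu3 went offline       */
                int allowed[NR_CPUS] = { 0, 0, 1, 1 };  /* task may use cpu2, cpu3 */
                int node_of[NR_CPUS] = { 0, 0, 1, 1 };  /* cpu2, cpu3 share node 1 */
                int node = 1;                           /* node of the lost cpu    */
                int dest = -1;

                for (int cpu = 0; cpu < NR_CPUS; cpu++) {       /* same-node pass */
                        if (node_of[cpu] == node && online[cpu] && allowed[cpu]) {
                                dest = cpu;
                                break;
                        }
                }
                for (int cpu = 0; dest < 0 && cpu < NR_CPUS; cpu++) {   /* any-CPU pass */
                        if (online[cpu] && allowed[cpu]) {
                                dest = cpu;
                                break;
                        }
                }

                printf("fallback cpu: %d\n", dest);     /* 2 in this layout */
                return 0;
        }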
2403 | 2402 | ||
2404 | /* | 2403 | /* |
2405 | * The caller (fork, wakeup) owns p->pi_lock, ->cpus_allowed is stable. | 2404 | * The caller (fork, wakeup) owns p->pi_lock, ->cpus_allowed is stable. |
2406 | */ | 2405 | */ |
2407 | static inline | 2406 | static inline |
2408 | int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags) | 2407 | int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags) |
2409 | { | 2408 | { |
2410 | int cpu = p->sched_class->select_task_rq(p, sd_flags, wake_flags); | 2409 | int cpu = p->sched_class->select_task_rq(p, sd_flags, wake_flags); |
2411 | 2410 | ||
2412 | /* | 2411 | /* |
2413 | * In order not to call set_task_cpu() on a blocking task we need | 2412 | * In order not to call set_task_cpu() on a blocking task we need |
2414 | * to rely on ttwu() to place the task on a valid ->cpus_allowed | 2413 | * to rely on ttwu() to place the task on a valid ->cpus_allowed |
2415 | * cpu. | 2414 | * cpu. |
2416 | * | 2415 | * |
2417 | * Since this is common to all placement strategies, this lives here. | 2416 | * Since this is common to all placement strategies, this lives here. |
2418 | * | 2417 | * |
2419 | * [ this allows ->select_task() to simply return task_cpu(p) and | 2418 | * [ this allows ->select_task() to simply return task_cpu(p) and |
2420 | * not worry about this generic constraint ] | 2419 | * not worry about this generic constraint ] |
2421 | */ | 2420 | */ |
2422 | if (unlikely(!cpumask_test_cpu(cpu, &p->cpus_allowed) || | 2421 | if (unlikely(!cpumask_test_cpu(cpu, &p->cpus_allowed) || |
2423 | !cpu_online(cpu))) | 2422 | !cpu_online(cpu))) |
2424 | cpu = select_fallback_rq(task_cpu(p), p); | 2423 | cpu = select_fallback_rq(task_cpu(p), p); |
2425 | 2424 | ||
2426 | return cpu; | 2425 | return cpu; |
2427 | } | 2426 | } |
2428 | 2427 | ||
2429 | static void update_avg(u64 *avg, u64 sample) | 2428 | static void update_avg(u64 *avg, u64 sample) |
2430 | { | 2429 | { |
2431 | s64 diff = sample - *avg; | 2430 | s64 diff = sample - *avg; |
2432 | *avg += diff >> 3; | 2431 | *avg += diff >> 3; |
2433 | } | 2432 | } |
2434 | #endif | 2433 | #endif |
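update_avg() above is an exponential moving average with weight 1/8, i.e. avg += (sample - avg) / 8; it is what smooths rq->avg_idle in ttwu_do_wakeup() further down. A stand-alone numeric sketch (values invented, not kernel code):

        #include <stdio.h>
        #include <stdint.h>

        /* Same arithmetic as update_avg(): move 1/8 of the way toward each new
         * sample. Like the kernel, this relies on arithmetic right shift for the
         * (occasional) negative diff. */
        static void update_avg(uint64_t *avg, uint64_t sample)
        {
                int64_t diff = (int64_t)(sample - *avg);

                *avg += diff >> 3;
        }

        int main(void)
        {
                uint64_t avg_idle = 1000;       /* pretend ns */

                for (int i = 0; i < 4; i++) {
                        update_avg(&avg_idle, 2000);
                        printf("after sample %d: avg_idle = %llu\n",
                               i + 1, (unsigned long long)avg_idle);
                }
                return 0;
        }

With a constant 2000 ns sample the average walks 1000 -> 1125 -> 1234 -> 1329 -> 1412, converging geometrically toward the sample.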
2435 | 2434 | ||
2436 | static void | 2435 | static void |
2437 | ttwu_stat(struct task_struct *p, int cpu, int wake_flags) | 2436 | ttwu_stat(struct task_struct *p, int cpu, int wake_flags) |
2438 | { | 2437 | { |
2439 | #ifdef CONFIG_SCHEDSTATS | 2438 | #ifdef CONFIG_SCHEDSTATS |
2440 | struct rq *rq = this_rq(); | 2439 | struct rq *rq = this_rq(); |
2441 | 2440 | ||
2442 | #ifdef CONFIG_SMP | 2441 | #ifdef CONFIG_SMP |
2443 | int this_cpu = smp_processor_id(); | 2442 | int this_cpu = smp_processor_id(); |
2444 | 2443 | ||
2445 | if (cpu == this_cpu) { | 2444 | if (cpu == this_cpu) { |
2446 | schedstat_inc(rq, ttwu_local); | 2445 | schedstat_inc(rq, ttwu_local); |
2447 | schedstat_inc(p, se.statistics.nr_wakeups_local); | 2446 | schedstat_inc(p, se.statistics.nr_wakeups_local); |
2448 | } else { | 2447 | } else { |
2449 | struct sched_domain *sd; | 2448 | struct sched_domain *sd; |
2450 | 2449 | ||
2451 | schedstat_inc(p, se.statistics.nr_wakeups_remote); | 2450 | schedstat_inc(p, se.statistics.nr_wakeups_remote); |
2452 | rcu_read_lock(); | 2451 | rcu_read_lock(); |
2453 | for_each_domain(this_cpu, sd) { | 2452 | for_each_domain(this_cpu, sd) { |
2454 | if (cpumask_test_cpu(cpu, sched_domain_span(sd))) { | 2453 | if (cpumask_test_cpu(cpu, sched_domain_span(sd))) { |
2455 | schedstat_inc(sd, ttwu_wake_remote); | 2454 | schedstat_inc(sd, ttwu_wake_remote); |
2456 | break; | 2455 | break; |
2457 | } | 2456 | } |
2458 | } | 2457 | } |
2459 | rcu_read_unlock(); | 2458 | rcu_read_unlock(); |
2460 | } | 2459 | } |
2461 | 2460 | ||
2462 | if (wake_flags & WF_MIGRATED) | 2461 | if (wake_flags & WF_MIGRATED) |
2463 | schedstat_inc(p, se.statistics.nr_wakeups_migrate); | 2462 | schedstat_inc(p, se.statistics.nr_wakeups_migrate); |
2464 | 2463 | ||
2465 | #endif /* CONFIG_SMP */ | 2464 | #endif /* CONFIG_SMP */ |
2466 | 2465 | ||
2467 | schedstat_inc(rq, ttwu_count); | 2466 | schedstat_inc(rq, ttwu_count); |
2468 | schedstat_inc(p, se.statistics.nr_wakeups); | 2467 | schedstat_inc(p, se.statistics.nr_wakeups); |
2469 | 2468 | ||
2470 | if (wake_flags & WF_SYNC) | 2469 | if (wake_flags & WF_SYNC) |
2471 | schedstat_inc(p, se.statistics.nr_wakeups_sync); | 2470 | schedstat_inc(p, se.statistics.nr_wakeups_sync); |
2472 | 2471 | ||
2473 | #endif /* CONFIG_SCHEDSTATS */ | 2472 | #endif /* CONFIG_SCHEDSTATS */ |
2474 | } | 2473 | } |
2475 | 2474 | ||
2476 | static void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags) | 2475 | static void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags) |
2477 | { | 2476 | { |
2478 | activate_task(rq, p, en_flags); | 2477 | activate_task(rq, p, en_flags); |
2479 | p->on_rq = 1; | 2478 | p->on_rq = 1; |
2480 | 2479 | ||
2481 | /* if a worker is waking up, notify workqueue */ | 2480 | /* if a worker is waking up, notify workqueue */ |
2482 | if (p->flags & PF_WQ_WORKER) | 2481 | if (p->flags & PF_WQ_WORKER) |
2483 | wq_worker_waking_up(p, cpu_of(rq)); | 2482 | wq_worker_waking_up(p, cpu_of(rq)); |
2484 | } | 2483 | } |
2485 | 2484 | ||
2486 | /* | 2485 | /* |
2487 | * Mark the task runnable and perform wakeup-preemption. | 2486 | * Mark the task runnable and perform wakeup-preemption. |
2488 | */ | 2487 | */ |
2489 | static void | 2488 | static void |
2490 | ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags) | 2489 | ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags) |
2491 | { | 2490 | { |
2492 | trace_sched_wakeup(p, true); | 2491 | trace_sched_wakeup(p, true); |
2493 | check_preempt_curr(rq, p, wake_flags); | 2492 | check_preempt_curr(rq, p, wake_flags); |
2494 | 2493 | ||
2495 | p->state = TASK_RUNNING; | 2494 | p->state = TASK_RUNNING; |
2496 | #ifdef CONFIG_SMP | 2495 | #ifdef CONFIG_SMP |
2497 | if (p->sched_class->task_woken) | 2496 | if (p->sched_class->task_woken) |
2498 | p->sched_class->task_woken(rq, p); | 2497 | p->sched_class->task_woken(rq, p); |
2499 | 2498 | ||
2500 | if (unlikely(rq->idle_stamp)) { | 2499 | if (unlikely(rq->idle_stamp)) { |
2501 | u64 delta = rq->clock - rq->idle_stamp; | 2500 | u64 delta = rq->clock - rq->idle_stamp; |
2502 | u64 max = 2*sysctl_sched_migration_cost; | 2501 | u64 max = 2*sysctl_sched_migration_cost; |
2503 | 2502 | ||
2504 | if (delta > max) | 2503 | if (delta > max) |
2505 | rq->avg_idle = max; | 2504 | rq->avg_idle = max; |
2506 | else | 2505 | else |
2507 | update_avg(&rq->avg_idle, delta); | 2506 | update_avg(&rq->avg_idle, delta); |
2508 | rq->idle_stamp = 0; | 2507 | rq->idle_stamp = 0; |
2509 | } | 2508 | } |
2510 | #endif | 2509 | #endif |
2511 | } | 2510 | } |
2512 | 2511 | ||
2513 | static void | 2512 | static void |
2514 | ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags) | 2513 | ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags) |
2515 | { | 2514 | { |
2516 | #ifdef CONFIG_SMP | 2515 | #ifdef CONFIG_SMP |
2517 | if (p->sched_contributes_to_load) | 2516 | if (p->sched_contributes_to_load) |
2518 | rq->nr_uninterruptible--; | 2517 | rq->nr_uninterruptible--; |
2519 | #endif | 2518 | #endif |
2520 | 2519 | ||
2521 | ttwu_activate(rq, p, ENQUEUE_WAKEUP | ENQUEUE_WAKING); | 2520 | ttwu_activate(rq, p, ENQUEUE_WAKEUP | ENQUEUE_WAKING); |
2522 | ttwu_do_wakeup(rq, p, wake_flags); | 2521 | ttwu_do_wakeup(rq, p, wake_flags); |
2523 | } | 2522 | } |
2524 | 2523 | ||
2525 | /* | 2524 | /* |
2526 | * Called in case the task @p isn't fully descheduled from its runqueue; | 2525 | * Called in case the task @p isn't fully descheduled from its runqueue; |
2527 | * in this case we must do a remote wakeup. It's a 'light' wakeup though, | 2526 | * in this case we must do a remote wakeup. It's a 'light' wakeup though, |
2528 | * since all we need to do is flip p->state to TASK_RUNNING, since | 2527 | * since all we need to do is flip p->state to TASK_RUNNING, since |
2529 | * the task is still ->on_rq. | 2528 | * the task is still ->on_rq. |
2530 | */ | 2529 | */ |
2531 | static int ttwu_remote(struct task_struct *p, int wake_flags) | 2530 | static int ttwu_remote(struct task_struct *p, int wake_flags) |
2532 | { | 2531 | { |
2533 | struct rq *rq; | 2532 | struct rq *rq; |
2534 | int ret = 0; | 2533 | int ret = 0; |
2535 | 2534 | ||
2536 | rq = __task_rq_lock(p); | 2535 | rq = __task_rq_lock(p); |
2537 | if (p->on_rq) { | 2536 | if (p->on_rq) { |
2538 | ttwu_do_wakeup(rq, p, wake_flags); | 2537 | ttwu_do_wakeup(rq, p, wake_flags); |
2539 | ret = 1; | 2538 | ret = 1; |
2540 | } | 2539 | } |
2541 | __task_rq_unlock(rq); | 2540 | __task_rq_unlock(rq); |
2542 | 2541 | ||
2543 | return ret; | 2542 | return ret; |
2544 | } | 2543 | } |
2545 | 2544 | ||
2546 | #ifdef CONFIG_SMP | 2545 | #ifdef CONFIG_SMP |
2547 | static void sched_ttwu_pending(void) | 2546 | static void sched_ttwu_pending(void) |
2548 | { | 2547 | { |
2549 | struct rq *rq = this_rq(); | 2548 | struct rq *rq = this_rq(); |
2550 | struct task_struct *list = xchg(&rq->wake_list, NULL); | 2549 | struct task_struct *list = xchg(&rq->wake_list, NULL); |
2551 | 2550 | ||
2552 | if (!list) | 2551 | if (!list) |
2553 | return; | 2552 | return; |
2554 | 2553 | ||
2555 | raw_spin_lock(&rq->lock); | 2554 | raw_spin_lock(&rq->lock); |
2556 | 2555 | ||
2557 | while (list) { | 2556 | while (list) { |
2558 | struct task_struct *p = list; | 2557 | struct task_struct *p = list; |
2559 | list = list->wake_entry; | 2558 | list = list->wake_entry; |
2560 | ttwu_do_activate(rq, p, 0); | 2559 | ttwu_do_activate(rq, p, 0); |
2561 | } | 2560 | } |
2562 | 2561 | ||
2563 | raw_spin_unlock(&rq->lock); | 2562 | raw_spin_unlock(&rq->lock); |
2564 | } | 2563 | } |
2565 | 2564 | ||
2566 | void scheduler_ipi(void) | 2565 | void scheduler_ipi(void) |
2567 | { | 2566 | { |
2568 | sched_ttwu_pending(); | 2567 | sched_ttwu_pending(); |
2569 | } | 2568 | } |
2570 | 2569 | ||
2571 | static void ttwu_queue_remote(struct task_struct *p, int cpu) | 2570 | static void ttwu_queue_remote(struct task_struct *p, int cpu) |
2572 | { | 2571 | { |
2573 | struct rq *rq = cpu_rq(cpu); | 2572 | struct rq *rq = cpu_rq(cpu); |
2574 | struct task_struct *next = rq->wake_list; | 2573 | struct task_struct *next = rq->wake_list; |
2575 | 2574 | ||
2576 | for (;;) { | 2575 | for (;;) { |
2577 | struct task_struct *old = next; | 2576 | struct task_struct *old = next; |
2578 | 2577 | ||
2579 | p->wake_entry = next; | 2578 | p->wake_entry = next; |
2580 | next = cmpxchg(&rq->wake_list, old, p); | 2579 | next = cmpxchg(&rq->wake_list, old, p); |
2581 | if (next == old) | 2580 | if (next == old) |
2582 | break; | 2581 | break; |
2583 | } | 2582 | } |
2584 | 2583 | ||
2585 | if (!next) | 2584 | if (!next) |
2586 | smp_send_reschedule(cpu); | 2585 | smp_send_reschedule(cpu); |
2587 | } | 2586 | } |
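ttwu_queue_remote() pushes the task onto a lock-free singly linked list and only sends the reschedule IPI when the list was previously empty; the CPU that drains the list in sched_ttwu_pending() handles everything queued since. Below is a minimal user-space sketch of that push, using C11 atomics rather than the kernel's cmpxchg(); the names are illustrative, not the kernel's.

        #include <stdatomic.h>
        #include <stdbool.h>
        #include <stddef.h>
        #include <stdio.h>

        struct wake_node {
                struct wake_node *next;
        };

        /* Push n in front of *head; return true if the list was empty before,
         * i.e. when the caller should kick the remote CPU. */
        static bool wake_list_push(_Atomic(struct wake_node *) *head,
                                   struct wake_node *n)
        {
                struct wake_node *old = atomic_load(head);

                do {
                        n->next = old;          /* link in front of current head */
                } while (!atomic_compare_exchange_weak(head, &old, n));

                return old == NULL;             /* empty before: send the IPI */
        }

        int main(void)
        {
                _Atomic(struct wake_node *) wake_list = NULL;
                struct wake_node a = { 0 }, b = { 0 };

                printf("first push kicks cpu:  %d\n", wake_list_push(&wake_list, &a));
                printf("second push kicks cpu: %d\n", wake_list_push(&wake_list, &b));
                return 0;
        }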
2588 | 2587 | ||
2589 | #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW | 2588 | #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW |
2590 | static int ttwu_activate_remote(struct task_struct *p, int wake_flags) | 2589 | static int ttwu_activate_remote(struct task_struct *p, int wake_flags) |
2591 | { | 2590 | { |
2592 | struct rq *rq; | 2591 | struct rq *rq; |
2593 | int ret = 0; | 2592 | int ret = 0; |
2594 | 2593 | ||
2595 | rq = __task_rq_lock(p); | 2594 | rq = __task_rq_lock(p); |
2596 | if (p->on_cpu) { | 2595 | if (p->on_cpu) { |
2597 | ttwu_activate(rq, p, ENQUEUE_WAKEUP); | 2596 | ttwu_activate(rq, p, ENQUEUE_WAKEUP); |
2598 | ttwu_do_wakeup(rq, p, wake_flags); | 2597 | ttwu_do_wakeup(rq, p, wake_flags); |
2599 | ret = 1; | 2598 | ret = 1; |
2600 | } | 2599 | } |
2601 | __task_rq_unlock(rq); | 2600 | __task_rq_unlock(rq); |
2602 | 2601 | ||
2603 | return ret; | 2602 | return ret; |
2604 | 2603 | ||
2605 | } | 2604 | } |
2606 | #endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */ | 2605 | #endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */ |
2607 | #endif /* CONFIG_SMP */ | 2606 | #endif /* CONFIG_SMP */ |
2608 | 2607 | ||
2609 | static void ttwu_queue(struct task_struct *p, int cpu) | 2608 | static void ttwu_queue(struct task_struct *p, int cpu) |
2610 | { | 2609 | { |
2611 | struct rq *rq = cpu_rq(cpu); | 2610 | struct rq *rq = cpu_rq(cpu); |
2612 | 2611 | ||
2613 | #if defined(CONFIG_SMP) | 2612 | #if defined(CONFIG_SMP) |
2614 | if (sched_feat(TTWU_QUEUE) && cpu != smp_processor_id()) { | 2613 | if (sched_feat(TTWU_QUEUE) && cpu != smp_processor_id()) { |
2615 | sched_clock_cpu(cpu); /* sync clocks x-cpu */ | 2614 | sched_clock_cpu(cpu); /* sync clocks x-cpu */ |
2616 | ttwu_queue_remote(p, cpu); | 2615 | ttwu_queue_remote(p, cpu); |
2617 | return; | 2616 | return; |
2618 | } | 2617 | } |
2619 | #endif | 2618 | #endif |
2620 | 2619 | ||
2621 | raw_spin_lock(&rq->lock); | 2620 | raw_spin_lock(&rq->lock); |
2622 | ttwu_do_activate(rq, p, 0); | 2621 | ttwu_do_activate(rq, p, 0); |
2623 | raw_spin_unlock(&rq->lock); | 2622 | raw_spin_unlock(&rq->lock); |
2624 | } | 2623 | } |
2625 | 2624 | ||
2626 | /** | 2625 | /** |
2627 | * try_to_wake_up - wake up a thread | 2626 | * try_to_wake_up - wake up a thread |
2628 | * @p: the thread to be awakened | 2627 | * @p: the thread to be awakened |
2629 | * @state: the mask of task states that can be woken | 2628 | * @state: the mask of task states that can be woken |
2630 | * @wake_flags: wake modifier flags (WF_*) | 2629 | * @wake_flags: wake modifier flags (WF_*) |
2631 | * | 2630 | * |
2632 | * Put it on the run-queue if it's not already there. The "current" | 2631 | * Put it on the run-queue if it's not already there. The "current" |
2633 | * thread is always on the run-queue (except when the actual | 2632 | * thread is always on the run-queue (except when the actual |
2634 | * re-schedule is in progress), and as such you're allowed to do | 2633 | * re-schedule is in progress), and as such you're allowed to do |
2635 | * the simpler "current->state = TASK_RUNNING" to mark yourself | 2634 | * the simpler "current->state = TASK_RUNNING" to mark yourself |
2636 | * runnable without the overhead of this. | 2635 | * runnable without the overhead of this. |
2637 | * | 2636 | * |
2638 | * Returns %true if @p was woken up, %false if it was already running | 2637 | * Returns %true if @p was woken up, %false if it was already running |
2639 | * or @state didn't match @p's state. | 2638 | * or @state didn't match @p's state. |
2640 | */ | 2639 | */ |
2641 | static int | 2640 | static int |
2642 | try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) | 2641 | try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) |
2643 | { | 2642 | { |
2644 | unsigned long flags; | 2643 | unsigned long flags; |
2645 | int cpu, success = 0; | 2644 | int cpu, success = 0; |
2646 | 2645 | ||
2647 | smp_wmb(); | 2646 | smp_wmb(); |
2648 | raw_spin_lock_irqsave(&p->pi_lock, flags); | 2647 | raw_spin_lock_irqsave(&p->pi_lock, flags); |
2649 | if (!(p->state & state)) | 2648 | if (!(p->state & state)) |
2650 | goto out; | 2649 | goto out; |
2651 | 2650 | ||
2652 | success = 1; /* we're going to change ->state */ | 2651 | success = 1; /* we're going to change ->state */ |
2653 | cpu = task_cpu(p); | 2652 | cpu = task_cpu(p); |
2654 | 2653 | ||
2655 | if (p->on_rq && ttwu_remote(p, wake_flags)) | 2654 | if (p->on_rq && ttwu_remote(p, wake_flags)) |
2656 | goto stat; | 2655 | goto stat; |
2657 | 2656 | ||
2658 | #ifdef CONFIG_SMP | 2657 | #ifdef CONFIG_SMP |
2659 | /* | 2658 | /* |
2660 | * If the owning (remote) cpu is still in the middle of schedule() with | 2659 | * If the owning (remote) cpu is still in the middle of schedule() with |
2661 | * this task as prev, wait until it's done referencing the task. | 2660 | * this task as prev, wait until it's done referencing the task. |
2662 | */ | 2661 | */ |
2663 | while (p->on_cpu) { | 2662 | while (p->on_cpu) { |
2664 | #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW | 2663 | #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW |
2665 | /* | 2664 | /* |
2666 | * In case the architecture enables interrupts in | 2665 | * In case the architecture enables interrupts in |
2667 | * context_switch(), we cannot busy wait, since that | 2666 | * context_switch(), we cannot busy wait, since that |
2668 | * would lead to deadlocks when an interrupt hits and | 2667 | * would lead to deadlocks when an interrupt hits and |
2669 | * tries to wake up @prev. So bail and do a complete | 2668 | * tries to wake up @prev. So bail and do a complete |
2670 | * remote wakeup. | 2669 | * remote wakeup. |
2671 | */ | 2670 | */ |
2672 | if (ttwu_activate_remote(p, wake_flags)) | 2671 | if (ttwu_activate_remote(p, wake_flags)) |
2673 | goto stat; | 2672 | goto stat; |
2674 | #else | 2673 | #else |
2675 | cpu_relax(); | 2674 | cpu_relax(); |
2676 | #endif | 2675 | #endif |
2677 | } | 2676 | } |
2678 | /* | 2677 | /* |
2679 | * Pairs with the smp_wmb() in finish_lock_switch(). | 2678 | * Pairs with the smp_wmb() in finish_lock_switch(). |
2680 | */ | 2679 | */ |
2681 | smp_rmb(); | 2680 | smp_rmb(); |
2682 | 2681 | ||
2683 | p->sched_contributes_to_load = !!task_contributes_to_load(p); | 2682 | p->sched_contributes_to_load = !!task_contributes_to_load(p); |
2684 | p->state = TASK_WAKING; | 2683 | p->state = TASK_WAKING; |
2685 | 2684 | ||
2686 | if (p->sched_class->task_waking) | 2685 | if (p->sched_class->task_waking) |
2687 | p->sched_class->task_waking(p); | 2686 | p->sched_class->task_waking(p); |
2688 | 2687 | ||
2689 | cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags); | 2688 | cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags); |
2690 | if (task_cpu(p) != cpu) { | 2689 | if (task_cpu(p) != cpu) { |
2691 | wake_flags |= WF_MIGRATED; | 2690 | wake_flags |= WF_MIGRATED; |
2692 | set_task_cpu(p, cpu); | 2691 | set_task_cpu(p, cpu); |
2693 | } | 2692 | } |
2694 | #endif /* CONFIG_SMP */ | 2693 | #endif /* CONFIG_SMP */ |
2695 | 2694 | ||
2696 | ttwu_queue(p, cpu); | 2695 | ttwu_queue(p, cpu); |
2697 | stat: | 2696 | stat: |
2698 | ttwu_stat(p, cpu, wake_flags); | 2697 | ttwu_stat(p, cpu, wake_flags); |
2699 | out: | 2698 | out: |
2700 | raw_spin_unlock_irqrestore(&p->pi_lock, flags); | 2699 | raw_spin_unlock_irqrestore(&p->pi_lock, flags); |
2701 | 2700 | ||
2702 | return success; | 2701 | return success; |
2703 | } | 2702 | } |
2704 | 2703 | ||
2705 | /** | 2704 | /** |
2706 | * try_to_wake_up_local - try to wake up a local task with rq lock held | 2705 | * try_to_wake_up_local - try to wake up a local task with rq lock held |
2707 | * @p: the thread to be awakened | 2706 | * @p: the thread to be awakened |
2708 | * | 2707 | * |
2709 | * Put @p on the run-queue if it's not already there. The caller must | 2708 | * Put @p on the run-queue if it's not already there. The caller must |
2710 | * ensure that this_rq() is locked, @p is bound to this_rq() and not | 2709 | * ensure that this_rq() is locked, @p is bound to this_rq() and not |
2711 | * the current task. | 2710 | * the current task. |
2712 | */ | 2711 | */ |
2713 | static void try_to_wake_up_local(struct task_struct *p) | 2712 | static void try_to_wake_up_local(struct task_struct *p) |
2714 | { | 2713 | { |
2715 | struct rq *rq = task_rq(p); | 2714 | struct rq *rq = task_rq(p); |
2716 | 2715 | ||
2717 | BUG_ON(rq != this_rq()); | 2716 | BUG_ON(rq != this_rq()); |
2718 | BUG_ON(p == current); | 2717 | BUG_ON(p == current); |
2719 | lockdep_assert_held(&rq->lock); | 2718 | lockdep_assert_held(&rq->lock); |
2720 | 2719 | ||
2721 | if (!raw_spin_trylock(&p->pi_lock)) { | 2720 | if (!raw_spin_trylock(&p->pi_lock)) { |
2722 | raw_spin_unlock(&rq->lock); | 2721 | raw_spin_unlock(&rq->lock); |
2723 | raw_spin_lock(&p->pi_lock); | 2722 | raw_spin_lock(&p->pi_lock); |
2724 | raw_spin_lock(&rq->lock); | 2723 | raw_spin_lock(&rq->lock); |
2725 | } | 2724 | } |
2726 | 2725 | ||
2727 | if (!(p->state & TASK_NORMAL)) | 2726 | if (!(p->state & TASK_NORMAL)) |
2728 | goto out; | 2727 | goto out; |
2729 | 2728 | ||
2730 | if (!p->on_rq) | 2729 | if (!p->on_rq) |
2731 | ttwu_activate(rq, p, ENQUEUE_WAKEUP); | 2730 | ttwu_activate(rq, p, ENQUEUE_WAKEUP); |
2732 | 2731 | ||
2733 | ttwu_do_wakeup(rq, p, 0); | 2732 | ttwu_do_wakeup(rq, p, 0); |
2734 | ttwu_stat(p, smp_processor_id(), 0); | 2733 | ttwu_stat(p, smp_processor_id(), 0); |
2735 | out: | 2734 | out: |
2736 | raw_spin_unlock(&p->pi_lock); | 2735 | raw_spin_unlock(&p->pi_lock); |
2737 | } | 2736 | } |
2738 | 2737 | ||
2739 | /** | 2738 | /** |
2740 | * wake_up_process - Wake up a specific process | 2739 | * wake_up_process - Wake up a specific process |
2741 | * @p: The process to be woken up. | 2740 | * @p: The process to be woken up. |
2742 | * | 2741 | * |
2743 | * Attempt to wake up the nominated process and move it to the set of runnable | 2742 | * Attempt to wake up the nominated process and move it to the set of runnable |
2744 | * processes. Returns 1 if the process was woken up, 0 if it was already | 2743 | * processes. Returns 1 if the process was woken up, 0 if it was already |
2745 | * running. | 2744 | * running. |
2746 | * | 2745 | * |
2747 | * It may be assumed that this function implies a write memory barrier before | 2746 | * It may be assumed that this function implies a write memory barrier before |
2748 | * changing the task state if and only if any tasks are woken up. | 2747 | * changing the task state if and only if any tasks are woken up. |
2749 | */ | 2748 | */ |
2750 | int wake_up_process(struct task_struct *p) | 2749 | int wake_up_process(struct task_struct *p) |
2751 | { | 2750 | { |
2752 | return try_to_wake_up(p, TASK_ALL, 0); | 2751 | return try_to_wake_up(p, TASK_ALL, 0); |
2753 | } | 2752 | } |
2754 | EXPORT_SYMBOL(wake_up_process); | 2753 | EXPORT_SYMBOL(wake_up_process); |
2755 | 2754 | ||
2756 | int wake_up_state(struct task_struct *p, unsigned int state) | 2755 | int wake_up_state(struct task_struct *p, unsigned int state) |
2757 | { | 2756 | { |
2758 | return try_to_wake_up(p, state, 0); | 2757 | return try_to_wake_up(p, state, 0); |
2759 | } | 2758 | } |
2760 | 2759 | ||
2761 | /* | 2760 | /* |
2762 | * Perform scheduler related setup for a newly forked process p. | 2761 | * Perform scheduler related setup for a newly forked process p. |
2763 | * p is forked by current. | 2762 | * p is forked by current. |
2764 | * | 2763 | * |
2765 | * __sched_fork() is basic setup used by init_idle() too: | 2764 | * __sched_fork() is basic setup used by init_idle() too: |
2766 | */ | 2765 | */ |
2767 | static void __sched_fork(struct task_struct *p) | 2766 | static void __sched_fork(struct task_struct *p) |
2768 | { | 2767 | { |
2769 | p->on_rq = 0; | 2768 | p->on_rq = 0; |
2770 | 2769 | ||
2771 | p->se.on_rq = 0; | 2770 | p->se.on_rq = 0; |
2772 | p->se.exec_start = 0; | 2771 | p->se.exec_start = 0; |
2773 | p->se.sum_exec_runtime = 0; | 2772 | p->se.sum_exec_runtime = 0; |
2774 | p->se.prev_sum_exec_runtime = 0; | 2773 | p->se.prev_sum_exec_runtime = 0; |
2775 | p->se.nr_migrations = 0; | 2774 | p->se.nr_migrations = 0; |
2776 | p->se.vruntime = 0; | 2775 | p->se.vruntime = 0; |
2777 | INIT_LIST_HEAD(&p->se.group_node); | 2776 | INIT_LIST_HEAD(&p->se.group_node); |
2778 | 2777 | ||
2779 | #ifdef CONFIG_SCHEDSTATS | 2778 | #ifdef CONFIG_SCHEDSTATS |
2780 | memset(&p->se.statistics, 0, sizeof(p->se.statistics)); | 2779 | memset(&p->se.statistics, 0, sizeof(p->se.statistics)); |
2781 | #endif | 2780 | #endif |
2782 | 2781 | ||
2783 | INIT_LIST_HEAD(&p->rt.run_list); | 2782 | INIT_LIST_HEAD(&p->rt.run_list); |
2784 | 2783 | ||
2785 | #ifdef CONFIG_PREEMPT_NOTIFIERS | 2784 | #ifdef CONFIG_PREEMPT_NOTIFIERS |
2786 | INIT_HLIST_HEAD(&p->preempt_notifiers); | 2785 | INIT_HLIST_HEAD(&p->preempt_notifiers); |
2787 | #endif | 2786 | #endif |
2788 | } | 2787 | } |
2789 | 2788 | ||
2790 | /* | 2789 | /* |
2791 | * fork()/clone()-time setup: | 2790 | * fork()/clone()-time setup: |
2792 | */ | 2791 | */ |
2793 | void sched_fork(struct task_struct *p) | 2792 | void sched_fork(struct task_struct *p) |
2794 | { | 2793 | { |
2795 | unsigned long flags; | 2794 | unsigned long flags; |
2796 | int cpu = get_cpu(); | 2795 | int cpu = get_cpu(); |
2797 | 2796 | ||
2798 | __sched_fork(p); | 2797 | __sched_fork(p); |
2799 | /* | 2798 | /* |
2800 | * We mark the process as running here. This guarantees that | 2799 | * We mark the process as running here. This guarantees that |
2801 | * nobody will actually run it, and a signal or other external | 2800 | * nobody will actually run it, and a signal or other external |
2802 | * event cannot wake it up and insert it on the runqueue either. | 2801 | * event cannot wake it up and insert it on the runqueue either. |
2803 | */ | 2802 | */ |
2804 | p->state = TASK_RUNNING; | 2803 | p->state = TASK_RUNNING; |
2805 | 2804 | ||
2806 | /* | 2805 | /* |
2807 | * Revert to default priority/policy on fork if requested. | 2806 | * Revert to default priority/policy on fork if requested. |
2808 | */ | 2807 | */ |
2809 | if (unlikely(p->sched_reset_on_fork)) { | 2808 | if (unlikely(p->sched_reset_on_fork)) { |
2810 | if (p->policy == SCHED_FIFO || p->policy == SCHED_RR) { | 2809 | if (p->policy == SCHED_FIFO || p->policy == SCHED_RR) { |
2811 | p->policy = SCHED_NORMAL; | 2810 | p->policy = SCHED_NORMAL; |
2812 | p->normal_prio = p->static_prio; | 2811 | p->normal_prio = p->static_prio; |
2813 | } | 2812 | } |
2814 | 2813 | ||
2815 | if (PRIO_TO_NICE(p->static_prio) < 0) { | 2814 | if (PRIO_TO_NICE(p->static_prio) < 0) { |
2816 | p->static_prio = NICE_TO_PRIO(0); | 2815 | p->static_prio = NICE_TO_PRIO(0); |
2817 | p->normal_prio = p->static_prio; | 2816 | p->normal_prio = p->static_prio; |
2818 | set_load_weight(p); | 2817 | set_load_weight(p); |
2819 | } | 2818 | } |
2820 | 2819 | ||
2821 | /* | 2820 | /* |
2822 | * We don't need the reset flag anymore after the fork. It has | 2821 | * We don't need the reset flag anymore after the fork. It has |
2823 | * fulfilled its duty: | 2822 | * fulfilled its duty: |
2824 | */ | 2823 | */ |
2825 | p->sched_reset_on_fork = 0; | 2824 | p->sched_reset_on_fork = 0; |
2826 | } | 2825 | } |
2827 | 2826 | ||
2828 | /* | 2827 | /* |
2829 | * Make sure we do not leak PI boosting priority to the child. | 2828 | * Make sure we do not leak PI boosting priority to the child. |
2830 | */ | 2829 | */ |
2831 | p->prio = current->normal_prio; | 2830 | p->prio = current->normal_prio; |
2832 | 2831 | ||
2833 | if (!rt_prio(p->prio)) | 2832 | if (!rt_prio(p->prio)) |
2834 | p->sched_class = &fair_sched_class; | 2833 | p->sched_class = &fair_sched_class; |
2835 | 2834 | ||
2836 | if (p->sched_class->task_fork) | 2835 | if (p->sched_class->task_fork) |
2837 | p->sched_class->task_fork(p); | 2836 | p->sched_class->task_fork(p); |
2838 | 2837 | ||
2839 | /* | 2838 | /* |
2840 | * The child is not yet in the pid-hash so no cgroup attach races, | 2839 | * The child is not yet in the pid-hash so no cgroup attach races, |
2841 | * and the cgroup is pinned to this child because cgroup_fork() | 2840 | * and the cgroup is pinned to this child because cgroup_fork() |
2842 | * is run before sched_fork(). | 2841 | * is run before sched_fork(). |
2843 | * | 2842 | * |
2844 | * Silence PROVE_RCU. | 2843 | * Silence PROVE_RCU. |
2845 | */ | 2844 | */ |
2846 | raw_spin_lock_irqsave(&p->pi_lock, flags); | 2845 | raw_spin_lock_irqsave(&p->pi_lock, flags); |
2847 | set_task_cpu(p, cpu); | 2846 | set_task_cpu(p, cpu); |
2848 | raw_spin_unlock_irqrestore(&p->pi_lock, flags); | 2847 | raw_spin_unlock_irqrestore(&p->pi_lock, flags); |
2849 | 2848 | ||
2850 | #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) | 2849 | #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) |
2851 | if (likely(sched_info_on())) | 2850 | if (likely(sched_info_on())) |
2852 | memset(&p->sched_info, 0, sizeof(p->sched_info)); | 2851 | memset(&p->sched_info, 0, sizeof(p->sched_info)); |
2853 | #endif | 2852 | #endif |
2854 | #if defined(CONFIG_SMP) | 2853 | #if defined(CONFIG_SMP) |
2855 | p->on_cpu = 0; | 2854 | p->on_cpu = 0; |
2856 | #endif | 2855 | #endif |
2857 | #ifdef CONFIG_PREEMPT | 2856 | #ifdef CONFIG_PREEMPT |
2858 | /* Want to start with kernel preemption disabled. */ | 2857 | /* Want to start with kernel preemption disabled. */ |
2859 | task_thread_info(p)->preempt_count = 1; | 2858 | task_thread_info(p)->preempt_count = 1; |
2860 | #endif | 2859 | #endif |
2861 | #ifdef CONFIG_SMP | 2860 | #ifdef CONFIG_SMP |
2862 | plist_node_init(&p->pushable_tasks, MAX_PRIO); | 2861 | plist_node_init(&p->pushable_tasks, MAX_PRIO); |
2863 | #endif | 2862 | #endif |
2864 | 2863 | ||
2865 | put_cpu(); | 2864 | put_cpu(); |
2866 | } | 2865 | } |
2867 | 2866 | ||
2868 | /* | 2867 | /* |
2869 | * wake_up_new_task - wake up a newly created task for the first time. | 2868 | * wake_up_new_task - wake up a newly created task for the first time. |
2870 | * | 2869 | * |
2871 | * This function will do some initial scheduler statistics housekeeping | 2870 | * This function will do some initial scheduler statistics housekeeping |
2872 | * that must be done for every newly created context, then puts the task | 2871 | * that must be done for every newly created context, then puts the task |
2873 | * on the runqueue and wakes it. | 2872 | * on the runqueue and wakes it. |
2874 | */ | 2873 | */ |
2875 | void wake_up_new_task(struct task_struct *p) | 2874 | void wake_up_new_task(struct task_struct *p) |
2876 | { | 2875 | { |
2877 | unsigned long flags; | 2876 | unsigned long flags; |
2878 | struct rq *rq; | 2877 | struct rq *rq; |
2879 | 2878 | ||
2880 | raw_spin_lock_irqsave(&p->pi_lock, flags); | 2879 | raw_spin_lock_irqsave(&p->pi_lock, flags); |
2881 | #ifdef CONFIG_SMP | 2880 | #ifdef CONFIG_SMP |
2882 | /* | 2881 | /* |
2883 | * Fork balancing, do it here and not earlier because: | 2882 | * Fork balancing, do it here and not earlier because: |
2884 | * - cpus_allowed can change in the fork path | 2883 | * - cpus_allowed can change in the fork path |
2885 | * - any previously selected cpu might disappear through hotplug | 2884 | * - any previously selected cpu might disappear through hotplug |
2886 | */ | 2885 | */ |
2887 | set_task_cpu(p, select_task_rq(p, SD_BALANCE_FORK, 0)); | 2886 | set_task_cpu(p, select_task_rq(p, SD_BALANCE_FORK, 0)); |
2888 | #endif | 2887 | #endif |
2889 | 2888 | ||
2890 | rq = __task_rq_lock(p); | 2889 | rq = __task_rq_lock(p); |
2891 | activate_task(rq, p, 0); | 2890 | activate_task(rq, p, 0); |
2892 | p->on_rq = 1; | 2891 | p->on_rq = 1; |
2893 | trace_sched_wakeup_new(p, true); | 2892 | trace_sched_wakeup_new(p, true); |
2894 | check_preempt_curr(rq, p, WF_FORK); | 2893 | check_preempt_curr(rq, p, WF_FORK); |
2895 | #ifdef CONFIG_SMP | 2894 | #ifdef CONFIG_SMP |
2896 | if (p->sched_class->task_woken) | 2895 | if (p->sched_class->task_woken) |
2897 | p->sched_class->task_woken(rq, p); | 2896 | p->sched_class->task_woken(rq, p); |
2898 | #endif | 2897 | #endif |
2899 | task_rq_unlock(rq, p, &flags); | 2898 | task_rq_unlock(rq, p, &flags); |
2900 | } | 2899 | } |
2901 | 2900 | ||
2902 | #ifdef CONFIG_PREEMPT_NOTIFIERS | 2901 | #ifdef CONFIG_PREEMPT_NOTIFIERS |
2903 | 2902 | ||
2904 | /** | 2903 | /** |
2905 | * preempt_notifier_register - tell me when current is being preempted & rescheduled | 2904 | * preempt_notifier_register - tell me when current is being preempted & rescheduled |
2906 | * @notifier: notifier struct to register | 2905 | * @notifier: notifier struct to register |
2907 | */ | 2906 | */ |
2908 | void preempt_notifier_register(struct preempt_notifier *notifier) | 2907 | void preempt_notifier_register(struct preempt_notifier *notifier) |
2909 | { | 2908 | { |
2910 | hlist_add_head(&notifier->link, &current->preempt_notifiers); | 2909 | hlist_add_head(&notifier->link, &current->preempt_notifiers); |
2911 | } | 2910 | } |
2912 | EXPORT_SYMBOL_GPL(preempt_notifier_register); | 2911 | EXPORT_SYMBOL_GPL(preempt_notifier_register); |
2913 | 2912 | ||
2914 | /** | 2913 | /** |
2915 | * preempt_notifier_unregister - no longer interested in preemption notifications | 2914 | * preempt_notifier_unregister - no longer interested in preemption notifications |
2916 | * @notifier: notifier struct to unregister | 2915 | * @notifier: notifier struct to unregister |
2917 | * | 2916 | * |
2918 | * This is safe to call from within a preemption notifier. | 2917 | * This is safe to call from within a preemption notifier. |
2919 | */ | 2918 | */ |
2920 | void preempt_notifier_unregister(struct preempt_notifier *notifier) | 2919 | void preempt_notifier_unregister(struct preempt_notifier *notifier) |
2921 | { | 2920 | { |
2922 | hlist_del(&notifier->link); | 2921 | hlist_del(&notifier->link); |
2923 | } | 2922 | } |
2924 | EXPORT_SYMBOL_GPL(preempt_notifier_unregister); | 2923 | EXPORT_SYMBOL_GPL(preempt_notifier_unregister); |
2925 | 2924 | ||
2926 | static void fire_sched_in_preempt_notifiers(struct task_struct *curr) | 2925 | static void fire_sched_in_preempt_notifiers(struct task_struct *curr) |
2927 | { | 2926 | { |
2928 | struct preempt_notifier *notifier; | 2927 | struct preempt_notifier *notifier; |
2929 | struct hlist_node *node; | 2928 | struct hlist_node *node; |
2930 | 2929 | ||
2931 | hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link) | 2930 | hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link) |
2932 | notifier->ops->sched_in(notifier, raw_smp_processor_id()); | 2931 | notifier->ops->sched_in(notifier, raw_smp_processor_id()); |
2933 | } | 2932 | } |
2934 | 2933 | ||
2935 | static void | 2934 | static void |
2936 | fire_sched_out_preempt_notifiers(struct task_struct *curr, | 2935 | fire_sched_out_preempt_notifiers(struct task_struct *curr, |
2937 | struct task_struct *next) | 2936 | struct task_struct *next) |
2938 | { | 2937 | { |
2939 | struct preempt_notifier *notifier; | 2938 | struct preempt_notifier *notifier; |
2940 | struct hlist_node *node; | 2939 | struct hlist_node *node; |
2941 | 2940 | ||
2942 | hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link) | 2941 | hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link) |
2943 | notifier->ops->sched_out(notifier, next); | 2942 | notifier->ops->sched_out(notifier, next); |
2944 | } | 2943 | } |
2945 | 2944 | ||
2946 | #else /* !CONFIG_PREEMPT_NOTIFIERS */ | 2945 | #else /* !CONFIG_PREEMPT_NOTIFIERS */ |
2947 | 2946 | ||
2948 | static void fire_sched_in_preempt_notifiers(struct task_struct *curr) | 2947 | static void fire_sched_in_preempt_notifiers(struct task_struct *curr) |
2949 | { | 2948 | { |
2950 | } | 2949 | } |
2951 | 2950 | ||
2952 | static void | 2951 | static void |
2953 | fire_sched_out_preempt_notifiers(struct task_struct *curr, | 2952 | fire_sched_out_preempt_notifiers(struct task_struct *curr, |
2954 | struct task_struct *next) | 2953 | struct task_struct *next) |
2955 | { | 2954 | { |
2956 | } | 2955 | } |
2957 | 2956 | ||
2958 | #endif /* CONFIG_PREEMPT_NOTIFIERS */ | 2957 | #endif /* CONFIG_PREEMPT_NOTIFIERS */ |
2959 | 2958 | ||
2960 | /** | 2959 | /** |
2961 | * prepare_task_switch - prepare to switch tasks | 2960 | * prepare_task_switch - prepare to switch tasks |
2962 | * @rq: the runqueue preparing to switch | 2961 | * @rq: the runqueue preparing to switch |
2963 | * @prev: the current task that is being switched out | 2962 | * @prev: the current task that is being switched out |
2964 | * @next: the task we are going to switch to. | 2963 | * @next: the task we are going to switch to. |
2965 | * | 2964 | * |
2966 | * This is called with the rq lock held and interrupts off. It must | 2965 | * This is called with the rq lock held and interrupts off. It must |
2967 | * be paired with a subsequent finish_task_switch after the context | 2966 | * be paired with a subsequent finish_task_switch after the context |
2968 | * switch. | 2967 | * switch. |
2969 | * | 2968 | * |
2970 | * prepare_task_switch sets up locking and calls architecture specific | 2969 | * prepare_task_switch sets up locking and calls architecture specific |
2971 | * hooks. | 2970 | * hooks. |
2972 | */ | 2971 | */ |
2973 | static inline void | 2972 | static inline void |
2974 | prepare_task_switch(struct rq *rq, struct task_struct *prev, | 2973 | prepare_task_switch(struct rq *rq, struct task_struct *prev, |
2975 | struct task_struct *next) | 2974 | struct task_struct *next) |
2976 | { | 2975 | { |
2977 | sched_info_switch(prev, next); | 2976 | sched_info_switch(prev, next); |
2978 | perf_event_task_sched_out(prev, next); | 2977 | perf_event_task_sched_out(prev, next); |
2979 | fire_sched_out_preempt_notifiers(prev, next); | 2978 | fire_sched_out_preempt_notifiers(prev, next); |
2980 | prepare_lock_switch(rq, next); | 2979 | prepare_lock_switch(rq, next); |
2981 | prepare_arch_switch(next); | 2980 | prepare_arch_switch(next); |
2982 | trace_sched_switch(prev, next); | 2981 | trace_sched_switch(prev, next); |
2983 | } | 2982 | } |
2984 | 2983 | ||
2985 | /** | 2984 | /** |
2986 | * finish_task_switch - clean up after a task-switch | 2985 | * finish_task_switch - clean up after a task-switch |
2987 | * @rq: runqueue associated with task-switch | 2986 | * @rq: runqueue associated with task-switch |
2988 | * @prev: the thread we just switched away from. | 2987 | * @prev: the thread we just switched away from. |
2989 | * | 2988 | * |
2990 | * finish_task_switch must be called after the context switch, paired | 2989 | * finish_task_switch must be called after the context switch, paired |
2991 | * with a prepare_task_switch call before the context switch. | 2990 | * with a prepare_task_switch call before the context switch. |
2992 | * finish_task_switch will reconcile locking set up by prepare_task_switch, | 2991 | * finish_task_switch will reconcile locking set up by prepare_task_switch, |
2993 | * and do any other architecture-specific cleanup actions. | 2992 | * and do any other architecture-specific cleanup actions. |
2994 | * | 2993 | * |
2995 | * Note that we may have delayed dropping an mm in context_switch(). If | 2994 | * Note that we may have delayed dropping an mm in context_switch(). If |
2996 | * so, we finish that here outside of the runqueue lock. (Doing it | 2995 | * so, we finish that here outside of the runqueue lock. (Doing it |
2997 | * with the lock held can cause deadlocks; see schedule() for | 2996 | * with the lock held can cause deadlocks; see schedule() for |
2998 | * details.) | 2997 | * details.) |
2999 | */ | 2998 | */ |
3000 | static void finish_task_switch(struct rq *rq, struct task_struct *prev) | 2999 | static void finish_task_switch(struct rq *rq, struct task_struct *prev) |
3001 | __releases(rq->lock) | 3000 | __releases(rq->lock) |
3002 | { | 3001 | { |
3003 | struct mm_struct *mm = rq->prev_mm; | 3002 | struct mm_struct *mm = rq->prev_mm; |
3004 | long prev_state; | 3003 | long prev_state; |
3005 | 3004 | ||
3006 | rq->prev_mm = NULL; | 3005 | rq->prev_mm = NULL; |
3007 | 3006 | ||
3008 | /* | 3007 | /* |
3009 | * A task struct has one reference for its use as "current". | 3008 | * A task struct has one reference for its use as "current". |
3010 | * If a task dies, then it sets TASK_DEAD in tsk->state and calls | 3009 | * If a task dies, then it sets TASK_DEAD in tsk->state and calls |
3011 | * schedule one last time. The schedule call will never return, and | 3010 | * schedule one last time. The schedule call will never return, and |
3012 | * the scheduled task must drop that reference. | 3011 | * the scheduled task must drop that reference. |
3013 | * The test for TASK_DEAD must occur while the runqueue locks are | 3012 | * The test for TASK_DEAD must occur while the runqueue locks are |
3014 | * still held, otherwise prev could be scheduled on another cpu, die | 3013 | * still held, otherwise prev could be scheduled on another cpu, die |
3015 | * there before we look at prev->state, and then the reference would | 3014 | * there before we look at prev->state, and then the reference would |
3016 | * be dropped twice. | 3015 | * be dropped twice. |
3017 | * Manfred Spraul <manfred@colorfullife.com> | 3016 | * Manfred Spraul <manfred@colorfullife.com> |
3018 | */ | 3017 | */ |
3019 | prev_state = prev->state; | 3018 | prev_state = prev->state; |
3020 | finish_arch_switch(prev); | 3019 | finish_arch_switch(prev); |
3021 | #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW | 3020 | #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW |
3022 | local_irq_disable(); | 3021 | local_irq_disable(); |
3023 | #endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */ | 3022 | #endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */ |
3024 | perf_event_task_sched_in(current); | 3023 | perf_event_task_sched_in(current); |
3025 | #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW | 3024 | #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW |
3026 | local_irq_enable(); | 3025 | local_irq_enable(); |
3027 | #endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */ | 3026 | #endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */ |
3028 | finish_lock_switch(rq, prev); | 3027 | finish_lock_switch(rq, prev); |
3029 | 3028 | ||
3030 | fire_sched_in_preempt_notifiers(current); | 3029 | fire_sched_in_preempt_notifiers(current); |
3031 | if (mm) | 3030 | if (mm) |
3032 | mmdrop(mm); | 3031 | mmdrop(mm); |
3033 | if (unlikely(prev_state == TASK_DEAD)) { | 3032 | if (unlikely(prev_state == TASK_DEAD)) { |
3034 | /* | 3033 | /* |
3035 | * Remove function-return probe instances associated with this | 3034 | * Remove function-return probe instances associated with this |
3036 | * task and put them back on the free list. | 3035 | * task and put them back on the free list. |
3037 | */ | 3036 | */ |
3038 | kprobe_flush_task(prev); | 3037 | kprobe_flush_task(prev); |
3039 | put_task_struct(prev); | 3038 | put_task_struct(prev); |
3040 | } | 3039 | } |
3041 | } | 3040 | } |
3042 | 3041 | ||
3043 | #ifdef CONFIG_SMP | 3042 | #ifdef CONFIG_SMP |
3044 | 3043 | ||
3045 | /* assumes rq->lock is held */ | 3044 | /* assumes rq->lock is held */ |
3046 | static inline void pre_schedule(struct rq *rq, struct task_struct *prev) | 3045 | static inline void pre_schedule(struct rq *rq, struct task_struct *prev) |
3047 | { | 3046 | { |
3048 | if (prev->sched_class->pre_schedule) | 3047 | if (prev->sched_class->pre_schedule) |
3049 | prev->sched_class->pre_schedule(rq, prev); | 3048 | prev->sched_class->pre_schedule(rq, prev); |
3050 | } | 3049 | } |
3051 | 3050 | ||
3052 | /* rq->lock is NOT held, but preemption is disabled */ | 3051 | /* rq->lock is NOT held, but preemption is disabled */ |
3053 | static inline void post_schedule(struct rq *rq) | 3052 | static inline void post_schedule(struct rq *rq) |
3054 | { | 3053 | { |
3055 | if (rq->post_schedule) { | 3054 | if (rq->post_schedule) { |
3056 | unsigned long flags; | 3055 | unsigned long flags; |
3057 | 3056 | ||
3058 | raw_spin_lock_irqsave(&rq->lock, flags); | 3057 | raw_spin_lock_irqsave(&rq->lock, flags); |
3059 | if (rq->curr->sched_class->post_schedule) | 3058 | if (rq->curr->sched_class->post_schedule) |
3060 | rq->curr->sched_class->post_schedule(rq); | 3059 | rq->curr->sched_class->post_schedule(rq); |
3061 | raw_spin_unlock_irqrestore(&rq->lock, flags); | 3060 | raw_spin_unlock_irqrestore(&rq->lock, flags); |
3062 | 3061 | ||
3063 | rq->post_schedule = 0; | 3062 | rq->post_schedule = 0; |
3064 | } | 3063 | } |
3065 | } | 3064 | } |
3066 | 3065 | ||
3067 | #else | 3066 | #else |
3068 | 3067 | ||
3069 | static inline void pre_schedule(struct rq *rq, struct task_struct *p) | 3068 | static inline void pre_schedule(struct rq *rq, struct task_struct *p) |
3070 | { | 3069 | { |
3071 | } | 3070 | } |
3072 | 3071 | ||
3073 | static inline void post_schedule(struct rq *rq) | 3072 | static inline void post_schedule(struct rq *rq) |
3074 | { | 3073 | { |
3075 | } | 3074 | } |
3076 | 3075 | ||
3077 | #endif | 3076 | #endif |
3078 | 3077 | ||
3079 | /** | 3078 | /** |
3080 | * schedule_tail - first thing a freshly forked thread must call. | 3079 | * schedule_tail - first thing a freshly forked thread must call. |
3081 | * @prev: the thread we just switched away from. | 3080 | * @prev: the thread we just switched away from. |
3082 | */ | 3081 | */ |
3083 | asmlinkage void schedule_tail(struct task_struct *prev) | 3082 | asmlinkage void schedule_tail(struct task_struct *prev) |
3084 | __releases(rq->lock) | 3083 | __releases(rq->lock) |
3085 | { | 3084 | { |
3086 | struct rq *rq = this_rq(); | 3085 | struct rq *rq = this_rq(); |
3087 | 3086 | ||
3088 | finish_task_switch(rq, prev); | 3087 | finish_task_switch(rq, prev); |
3089 | 3088 | ||
3090 | /* | 3089 | /* |
3091 | * FIXME: do we need to worry about rq being invalidated by the | 3090 | * FIXME: do we need to worry about rq being invalidated by the |
3092 | * task_switch? | 3091 | * task_switch? |
3093 | */ | 3092 | */ |
3094 | post_schedule(rq); | 3093 | post_schedule(rq); |
3095 | 3094 | ||
3096 | #ifdef __ARCH_WANT_UNLOCKED_CTXSW | 3095 | #ifdef __ARCH_WANT_UNLOCKED_CTXSW |
3097 | /* In this case, finish_task_switch does not reenable preemption */ | 3096 | /* In this case, finish_task_switch does not reenable preemption */ |
3098 | preempt_enable(); | 3097 | preempt_enable(); |
3099 | #endif | 3098 | #endif |
3100 | if (current->set_child_tid) | 3099 | if (current->set_child_tid) |
3101 | put_user(task_pid_vnr(current), current->set_child_tid); | 3100 | put_user(task_pid_vnr(current), current->set_child_tid); |
3102 | } | 3101 | } |
3103 | 3102 | ||
3104 | /* | 3103 | /* |
3105 | * context_switch - switch to the new MM and the new | 3104 | * context_switch - switch to the new MM and the new |
3106 | * thread's register state. | 3105 | * thread's register state. |
3107 | */ | 3106 | */ |
3108 | static inline void | 3107 | static inline void |
3109 | context_switch(struct rq *rq, struct task_struct *prev, | 3108 | context_switch(struct rq *rq, struct task_struct *prev, |
3110 | struct task_struct *next) | 3109 | struct task_struct *next) |
3111 | { | 3110 | { |
3112 | struct mm_struct *mm, *oldmm; | 3111 | struct mm_struct *mm, *oldmm; |
3113 | 3112 | ||
3114 | prepare_task_switch(rq, prev, next); | 3113 | prepare_task_switch(rq, prev, next); |
3115 | 3114 | ||
3116 | mm = next->mm; | 3115 | mm = next->mm; |
3117 | oldmm = prev->active_mm; | 3116 | oldmm = prev->active_mm; |
3118 | /* | 3117 | /* |
3119 | * For paravirt, this is coupled with an exit in switch_to to | 3118 | * For paravirt, this is coupled with an exit in switch_to to |
3120 | * combine the page table reload and the switch backend into | 3119 | * combine the page table reload and the switch backend into |
3121 | * one hypercall. | 3120 | * one hypercall. |
3122 | */ | 3121 | */ |
3123 | arch_start_context_switch(prev); | 3122 | arch_start_context_switch(prev); |
3124 | 3123 | ||
3125 | if (!mm) { | 3124 | if (!mm) { |
3126 | next->active_mm = oldmm; | 3125 | next->active_mm = oldmm; |
3127 | atomic_inc(&oldmm->mm_count); | 3126 | atomic_inc(&oldmm->mm_count); |
3128 | enter_lazy_tlb(oldmm, next); | 3127 | enter_lazy_tlb(oldmm, next); |
3129 | } else | 3128 | } else |
3130 | switch_mm(oldmm, mm, next); | 3129 | switch_mm(oldmm, mm, next); |
3131 | 3130 | ||
3132 | if (!prev->mm) { | 3131 | if (!prev->mm) { |
3133 | prev->active_mm = NULL; | 3132 | prev->active_mm = NULL; |
3134 | rq->prev_mm = oldmm; | 3133 | rq->prev_mm = oldmm; |
3135 | } | 3134 | } |
3136 | /* | 3135 | /* |
3137 | * The runqueue lock will be released by the next | 3136 | * The runqueue lock will be released by the next |
3138 | * task (which is an invalid locking op but in the case | 3137 | * task (which is an invalid locking op but in the case |
3139 | * of the scheduler it's an obvious special-case), so we | 3138 | * of the scheduler it's an obvious special-case), so we |
3140 | * do an early lockdep release here: | 3139 | * do an early lockdep release here: |
3141 | */ | 3140 | */ |
3142 | #ifndef __ARCH_WANT_UNLOCKED_CTXSW | 3141 | #ifndef __ARCH_WANT_UNLOCKED_CTXSW |
3143 | spin_release(&rq->lock.dep_map, 1, _THIS_IP_); | 3142 | spin_release(&rq->lock.dep_map, 1, _THIS_IP_); |
3144 | #endif | 3143 | #endif |
3145 | 3144 | ||
3146 | /* Here we just switch the register state and the stack. */ | 3145 | /* Here we just switch the register state and the stack. */ |
3147 | switch_to(prev, next, prev); | 3146 | switch_to(prev, next, prev); |
3148 | 3147 | ||
3149 | barrier(); | 3148 | barrier(); |
3150 | /* | 3149 | /* |
3151 | * this_rq must be evaluated again because prev may have moved | 3150 | * this_rq must be evaluated again because prev may have moved |
3152 | * CPUs since it called schedule(), thus the 'rq' on its stack | 3151 | * CPUs since it called schedule(), thus the 'rq' on its stack |
3153 | * frame will be invalid. | 3152 | * frame will be invalid. |
3154 | */ | 3153 | */ |
3155 | finish_task_switch(this_rq(), prev); | 3154 | finish_task_switch(this_rq(), prev); |
3156 | } | 3155 | } |
3157 | 3156 | ||
3158 | /* | 3157 | /* |
3159 | * nr_running, nr_uninterruptible and nr_context_switches: | 3158 | * nr_running, nr_uninterruptible and nr_context_switches: |
3160 | * | 3159 | * |
3161 | * externally visible scheduler statistics: current number of runnable | 3160 | * externally visible scheduler statistics: current number of runnable |
3162 | * threads, current number of uninterruptible-sleeping threads, total | 3161 | * threads, current number of uninterruptible-sleeping threads, total |
3163 | * number of context switches performed since bootup. | 3162 | * number of context switches performed since bootup. |
3164 | */ | 3163 | */ |
3165 | unsigned long nr_running(void) | 3164 | unsigned long nr_running(void) |
3166 | { | 3165 | { |
3167 | unsigned long i, sum = 0; | 3166 | unsigned long i, sum = 0; |
3168 | 3167 | ||
3169 | for_each_online_cpu(i) | 3168 | for_each_online_cpu(i) |
3170 | sum += cpu_rq(i)->nr_running; | 3169 | sum += cpu_rq(i)->nr_running; |
3171 | 3170 | ||
3172 | return sum; | 3171 | return sum; |
3173 | } | 3172 | } |
3174 | 3173 | ||
3175 | unsigned long nr_uninterruptible(void) | 3174 | unsigned long nr_uninterruptible(void) |
3176 | { | 3175 | { |
3177 | unsigned long i, sum = 0; | 3176 | unsigned long i, sum = 0; |
3178 | 3177 | ||
3179 | for_each_possible_cpu(i) | 3178 | for_each_possible_cpu(i) |
3180 | sum += cpu_rq(i)->nr_uninterruptible; | 3179 | sum += cpu_rq(i)->nr_uninterruptible; |
3181 | 3180 | ||
3182 | /* | 3181 | /* |
3183 | * Since we read the counters lockless, it might be slightly | 3182 | * Since we read the counters lockless, it might be slightly |
3184 | * inaccurate. Do not allow it to go below zero though: | 3183 | * inaccurate. Do not allow it to go below zero though: |
3185 | */ | 3184 | */ |
3186 | if (unlikely((long)sum < 0)) | 3185 | if (unlikely((long)sum < 0)) |
3187 | sum = 0; | 3186 | sum = 0; |
3188 | 3187 | ||
3189 | return sum; | 3188 | return sum; |
3190 | } | 3189 | } |
3191 | 3190 | ||
3192 | unsigned long long nr_context_switches(void) | 3191 | unsigned long long nr_context_switches(void) |
3193 | { | 3192 | { |
3194 | int i; | 3193 | int i; |
3195 | unsigned long long sum = 0; | 3194 | unsigned long long sum = 0; |
3196 | 3195 | ||
3197 | for_each_possible_cpu(i) | 3196 | for_each_possible_cpu(i) |
3198 | sum += cpu_rq(i)->nr_switches; | 3197 | sum += cpu_rq(i)->nr_switches; |
3199 | 3198 | ||
3200 | return sum; | 3199 | return sum; |
3201 | } | 3200 | } |
3202 | 3201 | ||
3203 | unsigned long nr_iowait(void) | 3202 | unsigned long nr_iowait(void) |
3204 | { | 3203 | { |
3205 | unsigned long i, sum = 0; | 3204 | unsigned long i, sum = 0; |
3206 | 3205 | ||
3207 | for_each_possible_cpu(i) | 3206 | for_each_possible_cpu(i) |
3208 | sum += atomic_read(&cpu_rq(i)->nr_iowait); | 3207 | sum += atomic_read(&cpu_rq(i)->nr_iowait); |
3209 | 3208 | ||
3210 | return sum; | 3209 | return sum; |
3211 | } | 3210 | } |
3212 | 3211 | ||
3213 | unsigned long nr_iowait_cpu(int cpu) | 3212 | unsigned long nr_iowait_cpu(int cpu) |
3214 | { | 3213 | { |
3215 | struct rq *this = cpu_rq(cpu); | 3214 | struct rq *this = cpu_rq(cpu); |
3216 | return atomic_read(&this->nr_iowait); | 3215 | return atomic_read(&this->nr_iowait); |
3217 | } | 3216 | } |
3218 | 3217 | ||
3219 | unsigned long this_cpu_load(void) | 3218 | unsigned long this_cpu_load(void) |
3220 | { | 3219 | { |
3221 | struct rq *this = this_rq(); | 3220 | struct rq *this = this_rq(); |
3222 | return this->cpu_load[0]; | 3221 | return this->cpu_load[0]; |
3223 | } | 3222 | } |
3224 | 3223 | ||
3225 | 3224 | ||
3226 | /* Variables and functions for calc_load */ | 3225 | /* Variables and functions for calc_load */ |
3227 | static atomic_long_t calc_load_tasks; | 3226 | static atomic_long_t calc_load_tasks; |
3228 | static unsigned long calc_load_update; | 3227 | static unsigned long calc_load_update; |
3229 | unsigned long avenrun[3]; | 3228 | unsigned long avenrun[3]; |
3230 | EXPORT_SYMBOL(avenrun); | 3229 | EXPORT_SYMBOL(avenrun); |
3231 | 3230 | ||
3232 | static long calc_load_fold_active(struct rq *this_rq) | 3231 | static long calc_load_fold_active(struct rq *this_rq) |
3233 | { | 3232 | { |
3234 | long nr_active, delta = 0; | 3233 | long nr_active, delta = 0; |
3235 | 3234 | ||
3236 | nr_active = this_rq->nr_running; | 3235 | nr_active = this_rq->nr_running; |
3237 | nr_active += (long) this_rq->nr_uninterruptible; | 3236 | nr_active += (long) this_rq->nr_uninterruptible; |
3238 | 3237 | ||
3239 | if (nr_active != this_rq->calc_load_active) { | 3238 | if (nr_active != this_rq->calc_load_active) { |
3240 | delta = nr_active - this_rq->calc_load_active; | 3239 | delta = nr_active - this_rq->calc_load_active; |
3241 | this_rq->calc_load_active = nr_active; | 3240 | this_rq->calc_load_active = nr_active; |
3242 | } | 3241 | } |
3243 | 3242 | ||
3244 | return delta; | 3243 | return delta; |
3245 | } | 3244 | } |
3246 | 3245 | ||
3247 | static unsigned long | 3246 | static unsigned long |
3248 | calc_load(unsigned long load, unsigned long exp, unsigned long active) | 3247 | calc_load(unsigned long load, unsigned long exp, unsigned long active) |
3249 | { | 3248 | { |
3250 | load *= exp; | 3249 | load *= exp; |
3251 | load += active * (FIXED_1 - exp); | 3250 | load += active * (FIXED_1 - exp); |
3252 | load += 1UL << (FSHIFT - 1); | 3251 | load += 1UL << (FSHIFT - 1); |
3253 | return load >> FSHIFT; | 3252 | return load >> FSHIFT; |
3254 | } | 3253 | } |
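
As an aside, a minimal userspace sketch of this fixed-point averaging step may help; the FSHIFT/FIXED_1/EXP_1 values below are assumed to match the usual definitions in include/linux/sched.h and are copied here only so the example stands alone:

    #include <stdio.h>

    /* Assumed fixed-point constants, as usually defined in include/linux/sched.h. */
    #define FSHIFT   11
    #define FIXED_1  (1 << FSHIFT)    /* 1.0 in fixed point */
    #define EXP_1    1884             /* ~1/exp(5s/1min) in fixed point */

    /* Same rounding EMA step as calc_load() above. */
    static unsigned long sample_calc_load(unsigned long load, unsigned long exp,
                                          unsigned long active)
    {
            load *= exp;
            load += active * (FIXED_1 - exp);
            load += 1UL << (FSHIFT - 1);
            return load >> FSHIFT;
    }

    int main(void)
    {
            /* One LOAD_FREQ interval: previous 1-min average of 0.50, 3 runnable tasks. */
            unsigned long avg = FIXED_1 / 2;
            unsigned long active = 3 * FIXED_1;

            avg = sample_calc_load(avg, EXP_1, active);
            printf("new 1-min average: %lu.%02lu\n",
                   avg >> FSHIFT, ((avg & (FIXED_1 - 1)) * 100) >> FSHIFT);
            return 0;
    }

With these inputs the average moves from 0.50 toward 3 and prints roughly 0.70, the gradual pull toward the current active count that the EXP_* constants encode.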
3255 | 3254 | ||
3256 | #ifdef CONFIG_NO_HZ | 3255 | #ifdef CONFIG_NO_HZ |
3257 | /* | 3256 | /* |
3258 | * For NO_HZ we delay the active fold to the next LOAD_FREQ update. | 3257 | * For NO_HZ we delay the active fold to the next LOAD_FREQ update. |
3259 | * | 3258 | * |
3260 | * When making the ILB scale, we should try to pull this in as well. | 3259 | * When making the ILB scale, we should try to pull this in as well. |
3261 | */ | 3260 | */ |
3262 | static atomic_long_t calc_load_tasks_idle; | 3261 | static atomic_long_t calc_load_tasks_idle; |
3263 | 3262 | ||
3264 | static void calc_load_account_idle(struct rq *this_rq) | 3263 | static void calc_load_account_idle(struct rq *this_rq) |
3265 | { | 3264 | { |
3266 | long delta; | 3265 | long delta; |
3267 | 3266 | ||
3268 | delta = calc_load_fold_active(this_rq); | 3267 | delta = calc_load_fold_active(this_rq); |
3269 | if (delta) | 3268 | if (delta) |
3270 | atomic_long_add(delta, &calc_load_tasks_idle); | 3269 | atomic_long_add(delta, &calc_load_tasks_idle); |
3271 | } | 3270 | } |
3272 | 3271 | ||
3273 | static long calc_load_fold_idle(void) | 3272 | static long calc_load_fold_idle(void) |
3274 | { | 3273 | { |
3275 | long delta = 0; | 3274 | long delta = 0; |
3276 | 3275 | ||
3277 | /* | 3276 | /* |
3278 | * It's got a race, we don't care... | 3277 | * It's got a race, we don't care... |
3279 | */ | 3278 | */ |
3280 | if (atomic_long_read(&calc_load_tasks_idle)) | 3279 | if (atomic_long_read(&calc_load_tasks_idle)) |
3281 | delta = atomic_long_xchg(&calc_load_tasks_idle, 0); | 3280 | delta = atomic_long_xchg(&calc_load_tasks_idle, 0); |
3282 | 3281 | ||
3283 | return delta; | 3282 | return delta; |
3284 | } | 3283 | } |
3285 | 3284 | ||
3286 | /** | 3285 | /** |
3287 | * fixed_power_int - compute: x^n, in O(log n) time | 3286 | * fixed_power_int - compute: x^n, in O(log n) time |
3288 | * | 3287 | * |
3289 | * @x: base of the power | 3288 | * @x: base of the power |
3290 | * @frac_bits: fractional bits of @x | 3289 | * @frac_bits: fractional bits of @x |
3291 | * @n: power to raise @x to. | 3290 | * @n: power to raise @x to. |
3292 | * | 3291 | * |
3293 | * By exploiting the relation between the definition of the natural power | 3292 | * By exploiting the relation between the definition of the natural power |
3294 | * function: x^n := x*x*...*x (x multiplied by itself n times), and | 3293 | * function: x^n := x*x*...*x (x multiplied by itself n times), and |
3295 | * the binary encoding of numbers used by computers: n := \Sum n_i * 2^i, | 3294 | * the binary encoding of numbers used by computers: n := \Sum n_i * 2^i, |
3296 | * (where: n_i \elem {0, 1}, the binary vector representing n), | 3295 | * (where: n_i \elem {0, 1}, the binary vector representing n), |
3297 | * we find: x^n := x^(\Sum n_i * 2^i) := \Prod x^(n_i * 2^i), which is | 3296 | * we find: x^n := x^(\Sum n_i * 2^i) := \Prod x^(n_i * 2^i), which is |
3298 | * of course trivially computable in O(log_2 n), the length of our binary | 3297 | * of course trivially computable in O(log_2 n), the length of our binary |
3299 | * vector. | 3298 | * vector. |
3300 | */ | 3299 | */ |
3301 | static unsigned long | 3300 | static unsigned long |
3302 | fixed_power_int(unsigned long x, unsigned int frac_bits, unsigned int n) | 3301 | fixed_power_int(unsigned long x, unsigned int frac_bits, unsigned int n) |
3303 | { | 3302 | { |
3304 | unsigned long result = 1UL << frac_bits; | 3303 | unsigned long result = 1UL << frac_bits; |
3305 | 3304 | ||
3306 | if (n) for (;;) { | 3305 | if (n) for (;;) { |
3307 | if (n & 1) { | 3306 | if (n & 1) { |
3308 | result *= x; | 3307 | result *= x; |
3309 | result += 1UL << (frac_bits - 1); | 3308 | result += 1UL << (frac_bits - 1); |
3310 | result >>= frac_bits; | 3309 | result >>= frac_bits; |
3311 | } | 3310 | } |
3312 | n >>= 1; | 3311 | n >>= 1; |
3313 | if (!n) | 3312 | if (!n) |
3314 | break; | 3313 | break; |
3315 | x *= x; | 3314 | x *= x; |
3316 | x += 1UL << (frac_bits - 1); | 3315 | x += 1UL << (frac_bits - 1); |
3317 | x >>= frac_bits; | 3316 | x >>= frac_bits; |
3318 | } | 3317 | } |
3319 | 3318 | ||
3320 | return result; | 3319 | return result; |
3321 | } | 3320 | } |
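
To make the binary decomposition concrete, here is a small standalone check; the 11 fractional bits and the 0.5 input are assumptions chosen for illustration, not anything this file defines:

    #include <stdio.h>

    /* Same O(log n) fixed-point power loop as fixed_power_int() above. */
    static unsigned long
    sample_fixed_power_int(unsigned long x, unsigned int frac_bits, unsigned int n)
    {
            unsigned long result = 1UL << frac_bits;

            if (n) for (;;) {
                    if (n & 1) {
                            result *= x;
                            result += 1UL << (frac_bits - 1);
                            result >>= frac_bits;
                    }
                    n >>= 1;
                    if (!n)
                            break;
                    x *= x;
                    x += 1UL << (frac_bits - 1);
                    x >>= frac_bits;
            }

            return result;
    }

    int main(void)
    {
            unsigned int frac_bits = 11;                    /* matches FSHIFT */
            unsigned long half = 1UL << (frac_bits - 1);    /* 0.5 in fixed point */

            /* 0.5^10 = 1/1024, i.e. about 2 when scaled by 2^11 = 2048. */
            printf("0.5^10 ~= %lu / %lu\n",
                   sample_fixed_power_int(half, frac_bits, 10), 1UL << frac_bits);
            return 0;
    }

Only the set bits of n (here 2 and 8) fold a factor into the result, so the whole thing costs five fixed-point multiplications instead of the nine a naive loop would need.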
3322 | 3321 | ||
3323 | /* | 3322 | /* |
3324 | * a1 = a0 * e + a * (1 - e) | 3323 | * a1 = a0 * e + a * (1 - e) |
3325 | * | 3324 | * |
3326 | * a2 = a1 * e + a * (1 - e) | 3325 | * a2 = a1 * e + a * (1 - e) |
3327 | * = (a0 * e + a * (1 - e)) * e + a * (1 - e) | 3326 | * = (a0 * e + a * (1 - e)) * e + a * (1 - e) |
3328 | * = a0 * e^2 + a * (1 - e) * (1 + e) | 3327 | * = a0 * e^2 + a * (1 - e) * (1 + e) |
3329 | * | 3328 | * |
3330 | * a3 = a2 * e + a * (1 - e) | 3329 | * a3 = a2 * e + a * (1 - e) |
3331 | * = (a0 * e^2 + a * (1 - e) * (1 + e)) * e + a * (1 - e) | 3330 | * = (a0 * e^2 + a * (1 - e) * (1 + e)) * e + a * (1 - e) |
3332 | * = a0 * e^3 + a * (1 - e) * (1 + e + e^2) | 3331 | * = a0 * e^3 + a * (1 - e) * (1 + e + e^2) |
3333 | * | 3332 | * |
3334 | * ... | 3333 | * ... |
3335 | * | 3334 | * |
3336 | * an = a0 * e^n + a * (1 - e) * (1 + e + ... + e^n-1) [1] | 3335 | * an = a0 * e^n + a * (1 - e) * (1 + e + ... + e^n-1) [1] |
3337 | * = a0 * e^n + a * (1 - e) * (1 - e^n)/(1 - e) | 3336 | * = a0 * e^n + a * (1 - e) * (1 - e^n)/(1 - e) |
3338 | * = a0 * e^n + a * (1 - e^n) | 3337 | * = a0 * e^n + a * (1 - e^n) |
3339 | * | 3338 | * |
3340 | * [1] application of the geometric series: | 3339 | * [1] application of the geometric series: |
3341 | * | 3340 | * |
3342 | * n 1 - x^(n+1) | 3341 | * n 1 - x^(n+1) |
3343 | * S_n := \Sum x^i = ------------- | 3342 | * S_n := \Sum x^i = ------------- |
3344 | * i=0 1 - x | 3343 | * i=0 1 - x |
3345 | */ | 3344 | */ |
3346 | static unsigned long | 3345 | static unsigned long |
3347 | calc_load_n(unsigned long load, unsigned long exp, | 3346 | calc_load_n(unsigned long load, unsigned long exp, |
3348 | unsigned long active, unsigned int n) | 3347 | unsigned long active, unsigned int n) |
3349 | { | 3348 | { |
3350 | 3349 | ||
3351 | return calc_load(load, fixed_power_int(exp, FSHIFT, n), active); | 3350 | return calc_load(load, fixed_power_int(exp, FSHIFT, n), active); |
3352 | } | 3351 | } |
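
To see what this closed form buys, take e = EXP_1/FIXED_1 (about 0.92 with the usual constants), a starting average a0 = 0.5 and a constant active count a = 3; these numbers are purely illustrative. Five missed cycles then give, per the derivation above:

    a5 = a0 * e^5 + a * (1 - e^5)
       ~= 0.5 * 0.66 + 3 * 0.34
       ~= 1.35

which matches applying calc_load() five times in a row, but costs a single fixed_power_int() call instead of five passes.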
3353 | 3352 | ||
3354 | /* | 3353 | /* |
3355 | * NO_HZ can leave us missing all per-cpu ticks calling | 3354 | * NO_HZ can leave us missing all per-cpu ticks calling |
3356 | * calc_load_account_active(), but since an idle CPU folds its delta into | 3355 | * calc_load_account_active(), but since an idle CPU folds its delta into |
3357 | * calc_load_tasks_idle per calc_load_account_idle(), all we need to do is fold | 3356 | * calc_load_tasks_idle per calc_load_account_idle(), all we need to do is fold |
3358 | * in the pending idle delta if our idle period crossed a load cycle boundary. | 3357 | * in the pending idle delta if our idle period crossed a load cycle boundary. |
3359 | * | 3358 | * |
3360 | * Once we've updated the global active value, we need to apply the exponential | 3359 | * Once we've updated the global active value, we need to apply the exponential |
3361 | * weights adjusted to the number of cycles missed. | 3360 | * weights adjusted to the number of cycles missed. |
3362 | */ | 3361 | */ |
3363 | static void calc_global_nohz(unsigned long ticks) | 3362 | static void calc_global_nohz(unsigned long ticks) |
3364 | { | 3363 | { |
3365 | long delta, active, n; | 3364 | long delta, active, n; |
3366 | 3365 | ||
3367 | if (time_before(jiffies, calc_load_update)) | 3366 | if (time_before(jiffies, calc_load_update)) |
3368 | return; | 3367 | return; |
3369 | 3368 | ||
3370 | /* | 3369 | /* |
3371 | * If we crossed a calc_load_update boundary, make sure to fold | 3370 | * If we crossed a calc_load_update boundary, make sure to fold |
3372 | * any pending idle changes, the respective CPUs might have | 3371 | * any pending idle changes, the respective CPUs might have |
3373 | * missed the tick driven calc_load_account_active() update | 3372 | * missed the tick driven calc_load_account_active() update |
3374 | * due to NO_HZ. | 3373 | * due to NO_HZ. |
3375 | */ | 3374 | */ |
3376 | delta = calc_load_fold_idle(); | 3375 | delta = calc_load_fold_idle(); |
3377 | if (delta) | 3376 | if (delta) |
3378 | atomic_long_add(delta, &calc_load_tasks); | 3377 | atomic_long_add(delta, &calc_load_tasks); |
3379 | 3378 | ||
3380 | /* | 3379 | /* |
3381 | * If we were idle for multiple load cycles, apply them. | 3380 | * If we were idle for multiple load cycles, apply them. |
3382 | */ | 3381 | */ |
3383 | if (ticks >= LOAD_FREQ) { | 3382 | if (ticks >= LOAD_FREQ) { |
3384 | n = ticks / LOAD_FREQ; | 3383 | n = ticks / LOAD_FREQ; |
3385 | 3384 | ||
3386 | active = atomic_long_read(&calc_load_tasks); | 3385 | active = atomic_long_read(&calc_load_tasks); |
3387 | active = active > 0 ? active * FIXED_1 : 0; | 3386 | active = active > 0 ? active * FIXED_1 : 0; |
3388 | 3387 | ||
3389 | avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n); | 3388 | avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n); |
3390 | avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n); | 3389 | avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n); |
3391 | avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n); | 3390 | avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n); |
3392 | 3391 | ||
3393 | calc_load_update += n * LOAD_FREQ; | 3392 | calc_load_update += n * LOAD_FREQ; |
3394 | } | 3393 | } |
3395 | 3394 | ||
3396 | /* | 3395 | /* |
3397 | * It's possible the remainder of the above division also crosses | 3396 | * It's possible the remainder of the above division also crosses |
3398 | * a LOAD_FREQ period, the regular check in calc_global_load() | 3397 | * a LOAD_FREQ period, the regular check in calc_global_load() |
3399 | * which comes after this will take care of that. | 3398 | * which comes after this will take care of that. |
3400 | * | 3399 | * |
3401 | * Consider us being 11 ticks before a cycle completion, and us | 3400 | * Consider us being 11 ticks before a cycle completion, and us |
3402 | * sleeping for 4*LOAD_FREQ + 22 ticks, then the above code will | 3401 | * sleeping for 4*LOAD_FREQ + 22 ticks, then the above code will |
3403 | * age us 4 cycles, and the test in calc_global_load() will | 3402 | * age us 4 cycles, and the test in calc_global_load() will |
3404 | * pick up the final one. | 3403 | * pick up the final one. |
3405 | */ | 3404 | */ |
3406 | } | 3405 | } |
3407 | #else | 3406 | #else |
3408 | static void calc_load_account_idle(struct rq *this_rq) | 3407 | static void calc_load_account_idle(struct rq *this_rq) |
3409 | { | 3408 | { |
3410 | } | 3409 | } |
3411 | 3410 | ||
3412 | static inline long calc_load_fold_idle(void) | 3411 | static inline long calc_load_fold_idle(void) |
3413 | { | 3412 | { |
3414 | return 0; | 3413 | return 0; |
3415 | } | 3414 | } |
3416 | 3415 | ||
3417 | static void calc_global_nohz(unsigned long ticks) | 3416 | static void calc_global_nohz(unsigned long ticks) |
3418 | { | 3417 | { |
3419 | } | 3418 | } |
3420 | #endif | 3419 | #endif |
3421 | 3420 | ||
3422 | /** | 3421 | /** |
3423 | * get_avenrun - get the load average array | 3422 | * get_avenrun - get the load average array |
3424 | * @loads: pointer to dest load array | 3423 | * @loads: pointer to dest load array |
3425 | * @offset: offset to add | 3424 | * @offset: offset to add |
3426 | * @shift: shift count to shift the result left | 3425 | * @shift: shift count to shift the result left |
3427 | * | 3426 | * |
3428 | * These values are estimates at best, so no need for locking. | 3427 | * These values are estimates at best, so no need for locking. |
3429 | */ | 3428 | */ |
3430 | void get_avenrun(unsigned long *loads, unsigned long offset, int shift) | 3429 | void get_avenrun(unsigned long *loads, unsigned long offset, int shift) |
3431 | { | 3430 | { |
3432 | loads[0] = (avenrun[0] + offset) << shift; | 3431 | loads[0] = (avenrun[0] + offset) << shift; |
3433 | loads[1] = (avenrun[1] + offset) << shift; | 3432 | loads[1] = (avenrun[1] + offset) << shift; |
3434 | loads[2] = (avenrun[2] + offset) << shift; | 3433 | loads[2] = (avenrun[2] + offset) << shift; |
3435 | } | 3434 | } |
3436 | 3435 | ||
3437 | /* | 3436 | /* |
3438 | * calc_global_load - update the avenrun load estimates 10 ticks after the | 3437 | * calc_global_load - update the avenrun load estimates 10 ticks after the |
3439 | * CPUs have updated calc_load_tasks. | 3438 | * CPUs have updated calc_load_tasks. |
3440 | */ | 3439 | */ |
3441 | void calc_global_load(unsigned long ticks) | 3440 | void calc_global_load(unsigned long ticks) |
3442 | { | 3441 | { |
3443 | long active; | 3442 | long active; |
3444 | 3443 | ||
3445 | calc_global_nohz(ticks); | 3444 | calc_global_nohz(ticks); |
3446 | 3445 | ||
3447 | if (time_before(jiffies, calc_load_update + 10)) | 3446 | if (time_before(jiffies, calc_load_update + 10)) |
3448 | return; | 3447 | return; |
3449 | 3448 | ||
3450 | active = atomic_long_read(&calc_load_tasks); | 3449 | active = atomic_long_read(&calc_load_tasks); |
3451 | active = active > 0 ? active * FIXED_1 : 0; | 3450 | active = active > 0 ? active * FIXED_1 : 0; |
3452 | 3451 | ||
3453 | avenrun[0] = calc_load(avenrun[0], EXP_1, active); | 3452 | avenrun[0] = calc_load(avenrun[0], EXP_1, active); |
3454 | avenrun[1] = calc_load(avenrun[1], EXP_5, active); | 3453 | avenrun[1] = calc_load(avenrun[1], EXP_5, active); |
3455 | avenrun[2] = calc_load(avenrun[2], EXP_15, active); | 3454 | avenrun[2] = calc_load(avenrun[2], EXP_15, active); |
3456 | 3455 | ||
3457 | calc_load_update += LOAD_FREQ; | 3456 | calc_load_update += LOAD_FREQ; |
3458 | } | 3457 | } |
3459 | 3458 | ||
3460 | /* | 3459 | /* |
3461 | * Called from update_cpu_load() to periodically update this CPU's | 3460 | * Called from update_cpu_load() to periodically update this CPU's |
3462 | * active count. | 3461 | * active count. |
3463 | */ | 3462 | */ |
3464 | static void calc_load_account_active(struct rq *this_rq) | 3463 | static void calc_load_account_active(struct rq *this_rq) |
3465 | { | 3464 | { |
3466 | long delta; | 3465 | long delta; |
3467 | 3466 | ||
3468 | if (time_before(jiffies, this_rq->calc_load_update)) | 3467 | if (time_before(jiffies, this_rq->calc_load_update)) |
3469 | return; | 3468 | return; |
3470 | 3469 | ||
3471 | delta = calc_load_fold_active(this_rq); | 3470 | delta = calc_load_fold_active(this_rq); |
3472 | delta += calc_load_fold_idle(); | 3471 | delta += calc_load_fold_idle(); |
3473 | if (delta) | 3472 | if (delta) |
3474 | atomic_long_add(delta, &calc_load_tasks); | 3473 | atomic_long_add(delta, &calc_load_tasks); |
3475 | 3474 | ||
3476 | this_rq->calc_load_update += LOAD_FREQ; | 3475 | this_rq->calc_load_update += LOAD_FREQ; |
3477 | } | 3476 | } |
3478 | 3477 | ||
3479 | /* | 3478 | /* |
3480 | * The exact cpuload at various idx values, calculated at every tick would be | 3479 | * The exact cpuload at various idx values, calculated at every tick would be |
3481 | * load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load | 3480 | * load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load |
3482 | * | 3481 | * |
3483 | * If a cpu misses updates for n-1 ticks (as it was idle) and the update gets | 3482 | * If a cpu misses updates for n-1 ticks (as it was idle) and the update gets |
3484 | * called on the nth tick when the cpu may be busy, then we have: | 3483 | * called on the nth tick when the cpu may be busy, then we have: |
3485 | * load = ((2^idx - 1) / 2^idx)^(n-1) * load | 3484 | * load = ((2^idx - 1) / 2^idx)^(n-1) * load |
3486 | * load = ((2^idx - 1) / 2^idx) * load + 1 / 2^idx * cur_load | 3485 | * load = ((2^idx - 1) / 2^idx) * load + 1 / 2^idx * cur_load |
3487 | * | 3486 | * |
3488 | * decay_load_missed() below does efficient calculation of | 3487 | * decay_load_missed() below does efficient calculation of |
3489 | * load = ((2^idx - 1) / 2^idx)^(n-1) * load | 3488 | * load = ((2^idx - 1) / 2^idx)^(n-1) * load |
3490 | * avoiding 0..n-1 loop doing load = ((2^idx - 1) / 2^idx) * load | 3489 | * avoiding 0..n-1 loop doing load = ((2^idx - 1) / 2^idx) * load |
3491 | * | 3490 | * |
3492 | * The calculation is approximated on a 128 point scale. | 3491 | * The calculation is approximated on a 128 point scale. |
3493 | * degrade_zero_ticks is the number of ticks after which load at any | 3492 | * degrade_zero_ticks is the number of ticks after which load at any |
3494 | * particular idx is approximated to be zero. | 3493 | * particular idx is approximated to be zero. |
3495 | * degrade_factor is a precomputed table, a row for each load idx. | 3494 | * degrade_factor is a precomputed table, a row for each load idx. |
3496 | * Each column corresponds to degradation factor for a power of two ticks, | 3495 | * Each column corresponds to degradation factor for a power of two ticks, |
3497 | * based on 128 point scale. | 3496 | * based on 128 point scale. |
3498 | * Example: | 3497 | * Example: |
3499 | * row 2, col 3 (=12) says that the degradation at load idx 2 after | 3498 | * row 2, col 3 (=12) says that the degradation at load idx 2 after |
3500 | * 8 ticks is 12/128 (which is an approximation of exact factor 3^8/4^8). | 3499 | * 8 ticks is 12/128 (which is an approximation of exact factor 3^8/4^8). |
3501 | * | 3500 | * |
3502 | * With this power of 2 load factors, we can degrade the load n times | 3501 | * With this power of 2 load factors, we can degrade the load n times |
3503 | * by looking at 1 bits in n and doing as many mult/shift instead of | 3502 | * by looking at 1 bits in n and doing as many mult/shift instead of |
3504 | * n mult/shifts needed by the exact degradation. | 3503 | * n mult/shifts needed by the exact degradation. |
3505 | */ | 3504 | */ |
3506 | #define DEGRADE_SHIFT 7 | 3505 | #define DEGRADE_SHIFT 7 |
3507 | static const unsigned char | 3506 | static const unsigned char |
3508 | degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128}; | 3507 | degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128}; |
3509 | static const unsigned char | 3508 | static const unsigned char |
3510 | degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = { | 3509 | degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = { |
3511 | {0, 0, 0, 0, 0, 0, 0, 0}, | 3510 | {0, 0, 0, 0, 0, 0, 0, 0}, |
3512 | {64, 32, 8, 0, 0, 0, 0, 0}, | 3511 | {64, 32, 8, 0, 0, 0, 0, 0}, |
3513 | {96, 72, 40, 12, 1, 0, 0}, | 3512 | {96, 72, 40, 12, 1, 0, 0}, |
3514 | {112, 98, 75, 43, 15, 1, 0}, | 3513 | {112, 98, 75, 43, 15, 1, 0}, |
3515 | {120, 112, 98, 76, 45, 16, 2} }; | 3514 | {120, 112, 98, 76, 45, 16, 2} }; |
3516 | 3515 | ||
3517 | /* | 3516 | /* |
3518 | * Update cpu_load for any missed ticks due to tickless idle. The backlog | 3517 | * Update cpu_load for any missed ticks due to tickless idle. The backlog |
3519 | * builds up while the CPU is idle, so we just decay the old load without | 3518 | * builds up while the CPU is idle, so we just decay the old load without |
3520 | * adding any new load. | 3519 | * adding any new load. |
3521 | */ | 3520 | */ |
3522 | static unsigned long | 3521 | static unsigned long |
3523 | decay_load_missed(unsigned long load, unsigned long missed_updates, int idx) | 3522 | decay_load_missed(unsigned long load, unsigned long missed_updates, int idx) |
3524 | { | 3523 | { |
3525 | int j = 0; | 3524 | int j = 0; |
3526 | 3525 | ||
3527 | if (!missed_updates) | 3526 | if (!missed_updates) |
3528 | return load; | 3527 | return load; |
3529 | 3528 | ||
3530 | if (missed_updates >= degrade_zero_ticks[idx]) | 3529 | if (missed_updates >= degrade_zero_ticks[idx]) |
3531 | return 0; | 3530 | return 0; |
3532 | 3531 | ||
3533 | if (idx == 1) | 3532 | if (idx == 1) |
3534 | return load >> missed_updates; | 3533 | return load >> missed_updates; |
3535 | 3534 | ||
3536 | while (missed_updates) { | 3535 | while (missed_updates) { |
3537 | if (missed_updates % 2) | 3536 | if (missed_updates % 2) |
3538 | load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT; | 3537 | load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT; |
3539 | 3538 | ||
3540 | missed_updates >>= 1; | 3539 | missed_updates >>= 1; |
3541 | j++; | 3540 | j++; |
3542 | } | 3541 | } |
3543 | return load; | 3542 | return load; |
3544 | } | 3543 | } |
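
As a rough cross-check of the table and the bit-walking loop above (the 1000-unit load and the exact-factor comparison are illustrative assumptions, not something the kernel computes), decaying a load at idx 2 across 8 missed ticks should come out close to (3/4)^8 of the original:

    #include <stdio.h>

    #define SAMPLE_DEGRADE_SHIFT    7
    #define SAMPLE_CPU_LOAD_IDX_MAX 5

    /* Tables copied from above; missing trailing entries default to 0. */
    static const unsigned char sample_degrade_zero_ticks[SAMPLE_CPU_LOAD_IDX_MAX] =
            {0, 8, 32, 64, 128};
    static const unsigned char
    sample_degrade_factor[SAMPLE_CPU_LOAD_IDX_MAX][SAMPLE_DEGRADE_SHIFT + 1] = {
            {0, 0, 0, 0, 0, 0, 0, 0},
            {64, 32, 8, 0, 0, 0, 0, 0},
            {96, 72, 40, 12, 1, 0, 0},
            {112, 98, 75, 43, 15, 1, 0},
            {120, 112, 98, 76, 45, 16, 2} };

    /* Same bit-walking decay as decay_load_missed() above. */
    static unsigned long
    sample_decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
    {
            int j = 0;

            if (!missed_updates)
                    return load;
            if (missed_updates >= sample_degrade_zero_ticks[idx])
                    return 0;
            if (idx == 1)
                    return load >> missed_updates;

            while (missed_updates) {
                    if (missed_updates % 2)
                            load = (load * sample_degrade_factor[idx][j]) >> SAMPLE_DEGRADE_SHIFT;
                    missed_updates >>= 1;
                    j++;
            }
            return load;
    }

    int main(void)
    {
            unsigned long load = 1000;

            /* 8 missed ticks at idx 2: only bit 3 is set, so one multiply by 12/128. */
            printf("table: %lu  exact (3/4)^8: %lu\n",
                   sample_decay_load_missed(load, 8, 2), load * 6561 / 65536);
            return 0;
    }

The table path lands on 93 while the exact factor gives 100, which is the 128-point-scale rounding that the comment above the tables describes.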
3545 | 3544 | ||
3546 | /* | 3545 | /* |
3547 | * Update rq->cpu_load[] statistics. This function is usually called every | 3546 | * Update rq->cpu_load[] statistics. This function is usually called every |
3548 | * scheduler tick (TICK_NSEC). With tickless idle this will not be called | 3547 | * scheduler tick (TICK_NSEC). With tickless idle this will not be called |
3549 | * every tick. We fix it up based on jiffies. | 3548 | * every tick. We fix it up based on jiffies. |
3550 | */ | 3549 | */ |
3551 | static void update_cpu_load(struct rq *this_rq) | 3550 | static void update_cpu_load(struct rq *this_rq) |
3552 | { | 3551 | { |
3553 | unsigned long this_load = this_rq->load.weight; | 3552 | unsigned long this_load = this_rq->load.weight; |
3554 | unsigned long curr_jiffies = jiffies; | 3553 | unsigned long curr_jiffies = jiffies; |
3555 | unsigned long pending_updates; | 3554 | unsigned long pending_updates; |
3556 | int i, scale; | 3555 | int i, scale; |
3557 | 3556 | ||
3558 | this_rq->nr_load_updates++; | 3557 | this_rq->nr_load_updates++; |
3559 | 3558 | ||
3560 | /* Avoid repeated calls on same jiffy, when moving in and out of idle */ | 3559 | /* Avoid repeated calls on same jiffy, when moving in and out of idle */ |
3561 | if (curr_jiffies == this_rq->last_load_update_tick) | 3560 | if (curr_jiffies == this_rq->last_load_update_tick) |
3562 | return; | 3561 | return; |
3563 | 3562 | ||
3564 | pending_updates = curr_jiffies - this_rq->last_load_update_tick; | 3563 | pending_updates = curr_jiffies - this_rq->last_load_update_tick; |
3565 | this_rq->last_load_update_tick = curr_jiffies; | 3564 | this_rq->last_load_update_tick = curr_jiffies; |
3566 | 3565 | ||
3567 | /* Update our load: */ | 3566 | /* Update our load: */ |
3568 | this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */ | 3567 | this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */ |
3569 | for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) { | 3568 | for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) { |
3570 | unsigned long old_load, new_load; | 3569 | unsigned long old_load, new_load; |
3571 | 3570 | ||
3572 | /* scale is effectively 1 << i now, and >> i divides by scale */ | 3571 | /* scale is effectively 1 << i now, and >> i divides by scale */ |
3573 | 3572 | ||
3574 | old_load = this_rq->cpu_load[i]; | 3573 | old_load = this_rq->cpu_load[i]; |
3575 | old_load = decay_load_missed(old_load, pending_updates - 1, i); | 3574 | old_load = decay_load_missed(old_load, pending_updates - 1, i); |
3576 | new_load = this_load; | 3575 | new_load = this_load; |
3577 | /* | 3576 | /* |
3578 | * Round up the averaging division if load is increasing. This | 3577 | * Round up the averaging division if load is increasing. This |
3579 | * prevents us from getting stuck on 9 if the load is 10, for | 3578 | * prevents us from getting stuck on 9 if the load is 10, for |
3580 | * example. | 3579 | * example. |
3581 | */ | 3580 | */ |
3582 | if (new_load > old_load) | 3581 | if (new_load > old_load) |
3583 | new_load += scale - 1; | 3582 | new_load += scale - 1; |
3584 | 3583 | ||
3585 | this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i; | 3584 | this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i; |
3586 | } | 3585 | } |
3587 | 3586 | ||
3588 | sched_avg_update(this_rq); | 3587 | sched_avg_update(this_rq); |
3589 | } | 3588 | } |
3590 | 3589 | ||
3591 | static void update_cpu_load_active(struct rq *this_rq) | 3590 | static void update_cpu_load_active(struct rq *this_rq) |
3592 | { | 3591 | { |
3593 | update_cpu_load(this_rq); | 3592 | update_cpu_load(this_rq); |
3594 | 3593 | ||
3595 | calc_load_account_active(this_rq); | 3594 | calc_load_account_active(this_rq); |
3596 | } | 3595 | } |
3597 | 3596 | ||
3598 | #ifdef CONFIG_SMP | 3597 | #ifdef CONFIG_SMP |
3599 | 3598 | ||
3600 | /* | 3599 | /* |
3601 | * sched_exec - execve() is a valuable balancing opportunity, because at | 3600 | * sched_exec - execve() is a valuable balancing opportunity, because at |
3602 | * this point the task has the smallest effective memory and cache footprint. | 3601 | * this point the task has the smallest effective memory and cache footprint. |
3603 | */ | 3602 | */ |
3604 | void sched_exec(void) | 3603 | void sched_exec(void) |
3605 | { | 3604 | { |
3606 | struct task_struct *p = current; | 3605 | struct task_struct *p = current; |
3607 | unsigned long flags; | 3606 | unsigned long flags; |
3608 | int dest_cpu; | 3607 | int dest_cpu; |
3609 | 3608 | ||
3610 | raw_spin_lock_irqsave(&p->pi_lock, flags); | 3609 | raw_spin_lock_irqsave(&p->pi_lock, flags); |
3611 | dest_cpu = p->sched_class->select_task_rq(p, SD_BALANCE_EXEC, 0); | 3610 | dest_cpu = p->sched_class->select_task_rq(p, SD_BALANCE_EXEC, 0); |
3612 | if (dest_cpu == smp_processor_id()) | 3611 | if (dest_cpu == smp_processor_id()) |
3613 | goto unlock; | 3612 | goto unlock; |
3614 | 3613 | ||
3615 | if (likely(cpu_active(dest_cpu))) { | 3614 | if (likely(cpu_active(dest_cpu))) { |
3616 | struct migration_arg arg = { p, dest_cpu }; | 3615 | struct migration_arg arg = { p, dest_cpu }; |
3617 | 3616 | ||
3618 | raw_spin_unlock_irqrestore(&p->pi_lock, flags); | 3617 | raw_spin_unlock_irqrestore(&p->pi_lock, flags); |
3619 | stop_one_cpu(task_cpu(p), migration_cpu_stop, &arg); | 3618 | stop_one_cpu(task_cpu(p), migration_cpu_stop, &arg); |
3620 | return; | 3619 | return; |
3621 | } | 3620 | } |
3622 | unlock: | 3621 | unlock: |
3623 | raw_spin_unlock_irqrestore(&p->pi_lock, flags); | 3622 | raw_spin_unlock_irqrestore(&p->pi_lock, flags); |
3624 | } | 3623 | } |
3625 | 3624 | ||
3626 | #endif | 3625 | #endif |
3627 | 3626 | ||
3628 | DEFINE_PER_CPU(struct kernel_stat, kstat); | 3627 | DEFINE_PER_CPU(struct kernel_stat, kstat); |
3629 | 3628 | ||
3630 | EXPORT_PER_CPU_SYMBOL(kstat); | 3629 | EXPORT_PER_CPU_SYMBOL(kstat); |
3631 | 3630 | ||
3632 | /* | 3631 | /* |
3633 | * Return any ns on the sched_clock that have not yet been accounted in | 3632 | * Return any ns on the sched_clock that have not yet been accounted in |
3634 | * @p in case that task is currently running. | 3633 | * @p in case that task is currently running. |
3635 | * | 3634 | * |
3636 | * Called with task_rq_lock() held on @rq. | 3635 | * Called with task_rq_lock() held on @rq. |
3637 | */ | 3636 | */ |
3638 | static u64 do_task_delta_exec(struct task_struct *p, struct rq *rq) | 3637 | static u64 do_task_delta_exec(struct task_struct *p, struct rq *rq) |
3639 | { | 3638 | { |
3640 | u64 ns = 0; | 3639 | u64 ns = 0; |
3641 | 3640 | ||
3642 | if (task_current(rq, p)) { | 3641 | if (task_current(rq, p)) { |
3643 | update_rq_clock(rq); | 3642 | update_rq_clock(rq); |
3644 | ns = rq->clock_task - p->se.exec_start; | 3643 | ns = rq->clock_task - p->se.exec_start; |
3645 | if ((s64)ns < 0) | 3644 | if ((s64)ns < 0) |
3646 | ns = 0; | 3645 | ns = 0; |
3647 | } | 3646 | } |
3648 | 3647 | ||
3649 | return ns; | 3648 | return ns; |
3650 | } | 3649 | } |
3651 | 3650 | ||
3652 | unsigned long long task_delta_exec(struct task_struct *p) | 3651 | unsigned long long task_delta_exec(struct task_struct *p) |
3653 | { | 3652 | { |
3654 | unsigned long flags; | 3653 | unsigned long flags; |
3655 | struct rq *rq; | 3654 | struct rq *rq; |
3656 | u64 ns = 0; | 3655 | u64 ns = 0; |
3657 | 3656 | ||
3658 | rq = task_rq_lock(p, &flags); | 3657 | rq = task_rq_lock(p, &flags); |
3659 | ns = do_task_delta_exec(p, rq); | 3658 | ns = do_task_delta_exec(p, rq); |
3660 | task_rq_unlock(rq, p, &flags); | 3659 | task_rq_unlock(rq, p, &flags); |
3661 | 3660 | ||
3662 | return ns; | 3661 | return ns; |
3663 | } | 3662 | } |
3664 | 3663 | ||
3665 | /* | 3664 | /* |
3666 | * Return accounted runtime for the task. | 3665 | * Return accounted runtime for the task. |
3667 | * In case the task is currently running, return the runtime plus current's | 3666 | * In case the task is currently running, return the runtime plus current's |
3668 | * pending runtime that has not been accounted yet. | 3667 | * pending runtime that has not been accounted yet. |
3669 | */ | 3668 | */ |
3670 | unsigned long long task_sched_runtime(struct task_struct *p) | 3669 | unsigned long long task_sched_runtime(struct task_struct *p) |
3671 | { | 3670 | { |
3672 | unsigned long flags; | 3671 | unsigned long flags; |
3673 | struct rq *rq; | 3672 | struct rq *rq; |
3674 | u64 ns = 0; | 3673 | u64 ns = 0; |
3675 | 3674 | ||
3676 | rq = task_rq_lock(p, &flags); | 3675 | rq = task_rq_lock(p, &flags); |
3677 | ns = p->se.sum_exec_runtime + do_task_delta_exec(p, rq); | 3676 | ns = p->se.sum_exec_runtime + do_task_delta_exec(p, rq); |
3678 | task_rq_unlock(rq, p, &flags); | 3677 | task_rq_unlock(rq, p, &flags); |
3679 | 3678 | ||
3680 | return ns; | 3679 | return ns; |
3681 | } | 3680 | } |
3682 | 3681 | ||
3683 | /* | 3682 | /* |
3684 | * Return sum_exec_runtime for the thread group. | 3683 | * Return sum_exec_runtime for the thread group. |
3685 | * In case the task is currently running, return the sum plus current's | 3684 | * In case the task is currently running, return the sum plus current's |
3686 | * pending runtime that has not been accounted yet. | 3685 | * pending runtime that has not been accounted yet. |
3687 | * | 3686 | * |
3688 | * Note that the thread group might have other running tasks as well, | 3687 | * Note that the thread group might have other running tasks as well, |
3689 | * so the return value does not include any pending runtime that other | 3688 | * so the return value does not include any pending runtime that other |
3690 | * running tasks might have. | 3689 | * running tasks might have. |
3691 | */ | 3690 | */ |
3692 | unsigned long long thread_group_sched_runtime(struct task_struct *p) | 3691 | unsigned long long thread_group_sched_runtime(struct task_struct *p) |
3693 | { | 3692 | { |
3694 | struct task_cputime totals; | 3693 | struct task_cputime totals; |
3695 | unsigned long flags; | 3694 | unsigned long flags; |
3696 | struct rq *rq; | 3695 | struct rq *rq; |
3697 | u64 ns; | 3696 | u64 ns; |
3698 | 3697 | ||
3699 | rq = task_rq_lock(p, &flags); | 3698 | rq = task_rq_lock(p, &flags); |
3700 | thread_group_cputime(p, &totals); | 3699 | thread_group_cputime(p, &totals); |
3701 | ns = totals.sum_exec_runtime + do_task_delta_exec(p, rq); | 3700 | ns = totals.sum_exec_runtime + do_task_delta_exec(p, rq); |
3702 | task_rq_unlock(rq, p, &flags); | 3701 | task_rq_unlock(rq, p, &flags); |
3703 | 3702 | ||
3704 | return ns; | 3703 | return ns; |
3705 | } | 3704 | } |
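A minimal sketch, for illustration only, of how an in-kernel caller might use the two accessors above. The demo_* helpers are hypothetical; only task_sched_runtime() and thread_group_sched_runtime() come from the code shown here.

#include <linux/sched.h>	/* struct task_struct and the accessors above */

/* Hypothetical caller: read CPU time including the not-yet-accounted delta. */
static u64 demo_task_cputime_ns(struct task_struct *tsk)
{
	/* Per-task runtime, plus whatever has accrued since the last tick. */
	return task_sched_runtime(tsk);
}

static u64 demo_group_cputime_ns(struct task_struct *tsk)
{
	/* Whole thread group; pending runtime of other running threads is not added. */
	return thread_group_sched_runtime(tsk);
}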
3706 | 3705 | ||
3707 | /* | 3706 | /* |
3708 | * Account user cpu time to a process. | 3707 | * Account user cpu time to a process. |
3709 | * @p: the process that the cpu time gets accounted to | 3708 | * @p: the process that the cpu time gets accounted to |
3710 | * @cputime: the cpu time spent in user space since the last update | 3709 | * @cputime: the cpu time spent in user space since the last update |
3711 | * @cputime_scaled: cputime scaled by cpu frequency | 3710 | * @cputime_scaled: cputime scaled by cpu frequency |
3712 | */ | 3711 | */ |
3713 | void account_user_time(struct task_struct *p, cputime_t cputime, | 3712 | void account_user_time(struct task_struct *p, cputime_t cputime, |
3714 | cputime_t cputime_scaled) | 3713 | cputime_t cputime_scaled) |
3715 | { | 3714 | { |
3716 | struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; | 3715 | struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; |
3717 | cputime64_t tmp; | 3716 | cputime64_t tmp; |
3718 | 3717 | ||
3719 | /* Add user time to process. */ | 3718 | /* Add user time to process. */ |
3720 | p->utime = cputime_add(p->utime, cputime); | 3719 | p->utime = cputime_add(p->utime, cputime); |
3721 | p->utimescaled = cputime_add(p->utimescaled, cputime_scaled); | 3720 | p->utimescaled = cputime_add(p->utimescaled, cputime_scaled); |
3722 | account_group_user_time(p, cputime); | 3721 | account_group_user_time(p, cputime); |
3723 | 3722 | ||
3724 | /* Add user time to cpustat. */ | 3723 | /* Add user time to cpustat. */ |
3725 | tmp = cputime_to_cputime64(cputime); | 3724 | tmp = cputime_to_cputime64(cputime); |
3726 | if (TASK_NICE(p) > 0) | 3725 | if (TASK_NICE(p) > 0) |
3727 | cpustat->nice = cputime64_add(cpustat->nice, tmp); | 3726 | cpustat->nice = cputime64_add(cpustat->nice, tmp); |
3728 | else | 3727 | else |
3729 | cpustat->user = cputime64_add(cpustat->user, tmp); | 3728 | cpustat->user = cputime64_add(cpustat->user, tmp); |
3730 | 3729 | ||
3731 | cpuacct_update_stats(p, CPUACCT_STAT_USER, cputime); | 3730 | cpuacct_update_stats(p, CPUACCT_STAT_USER, cputime); |
3732 | /* Account for user time used */ | 3731 | /* Account for user time used */ |
3733 | acct_update_integrals(p); | 3732 | acct_update_integrals(p); |
3734 | } | 3733 | } |
3735 | 3734 | ||
3736 | /* | 3735 | /* |
3737 | * Account guest cpu time to a process. | 3736 | * Account guest cpu time to a process. |
3738 | * @p: the process that the cpu time gets accounted to | 3737 | * @p: the process that the cpu time gets accounted to |
3739 | * @cputime: the cpu time spent in virtual machine since the last update | 3738 | * @cputime: the cpu time spent in virtual machine since the last update |
3740 | * @cputime_scaled: cputime scaled by cpu frequency | 3739 | * @cputime_scaled: cputime scaled by cpu frequency |
3741 | */ | 3740 | */ |
3742 | static void account_guest_time(struct task_struct *p, cputime_t cputime, | 3741 | static void account_guest_time(struct task_struct *p, cputime_t cputime, |
3743 | cputime_t cputime_scaled) | 3742 | cputime_t cputime_scaled) |
3744 | { | 3743 | { |
3745 | cputime64_t tmp; | 3744 | cputime64_t tmp; |
3746 | struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; | 3745 | struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; |
3747 | 3746 | ||
3748 | tmp = cputime_to_cputime64(cputime); | 3747 | tmp = cputime_to_cputime64(cputime); |
3749 | 3748 | ||
3750 | /* Add guest time to process. */ | 3749 | /* Add guest time to process. */ |
3751 | p->utime = cputime_add(p->utime, cputime); | 3750 | p->utime = cputime_add(p->utime, cputime); |
3752 | p->utimescaled = cputime_add(p->utimescaled, cputime_scaled); | 3751 | p->utimescaled = cputime_add(p->utimescaled, cputime_scaled); |
3753 | account_group_user_time(p, cputime); | 3752 | account_group_user_time(p, cputime); |
3754 | p->gtime = cputime_add(p->gtime, cputime); | 3753 | p->gtime = cputime_add(p->gtime, cputime); |
3755 | 3754 | ||
3756 | /* Add guest time to cpustat. */ | 3755 | /* Add guest time to cpustat. */ |
3757 | if (TASK_NICE(p) > 0) { | 3756 | if (TASK_NICE(p) > 0) { |
3758 | cpustat->nice = cputime64_add(cpustat->nice, tmp); | 3757 | cpustat->nice = cputime64_add(cpustat->nice, tmp); |
3759 | cpustat->guest_nice = cputime64_add(cpustat->guest_nice, tmp); | 3758 | cpustat->guest_nice = cputime64_add(cpustat->guest_nice, tmp); |
3760 | } else { | 3759 | } else { |
3761 | cpustat->user = cputime64_add(cpustat->user, tmp); | 3760 | cpustat->user = cputime64_add(cpustat->user, tmp); |
3762 | cpustat->guest = cputime64_add(cpustat->guest, tmp); | 3761 | cpustat->guest = cputime64_add(cpustat->guest, tmp); |
3763 | } | 3762 | } |
3764 | } | 3763 | } |
3765 | 3764 | ||
3766 | /* | 3765 | /* |
3767 | * Account system cpu time to a process and desired cpustat field | 3766 | * Account system cpu time to a process and desired cpustat field |
3768 | * @p: the process that the cpu time gets accounted to | 3767 | * @p: the process that the cpu time gets accounted to |
3769 | * @cputime: the cpu time spent in kernel space since the last update | 3768 | * @cputime: the cpu time spent in kernel space since the last update |
3770 | * @cputime_scaled: cputime scaled by cpu frequency | 3769 | * @cputime_scaled: cputime scaled by cpu frequency |
3771 | * @target_cputime64: pointer to cpustat field that has to be updated | 3770 | * @target_cputime64: pointer to cpustat field that has to be updated |
3772 | */ | 3771 | */ |
3773 | static inline | 3772 | static inline |
3774 | void __account_system_time(struct task_struct *p, cputime_t cputime, | 3773 | void __account_system_time(struct task_struct *p, cputime_t cputime, |
3775 | cputime_t cputime_scaled, cputime64_t *target_cputime64) | 3774 | cputime_t cputime_scaled, cputime64_t *target_cputime64) |
3776 | { | 3775 | { |
3777 | cputime64_t tmp = cputime_to_cputime64(cputime); | 3776 | cputime64_t tmp = cputime_to_cputime64(cputime); |
3778 | 3777 | ||
3779 | /* Add system time to process. */ | 3778 | /* Add system time to process. */ |
3780 | p->stime = cputime_add(p->stime, cputime); | 3779 | p->stime = cputime_add(p->stime, cputime); |
3781 | p->stimescaled = cputime_add(p->stimescaled, cputime_scaled); | 3780 | p->stimescaled = cputime_add(p->stimescaled, cputime_scaled); |
3782 | account_group_system_time(p, cputime); | 3781 | account_group_system_time(p, cputime); |
3783 | 3782 | ||
3784 | /* Add system time to cpustat. */ | 3783 | /* Add system time to cpustat. */ |
3785 | *target_cputime64 = cputime64_add(*target_cputime64, tmp); | 3784 | *target_cputime64 = cputime64_add(*target_cputime64, tmp); |
3786 | cpuacct_update_stats(p, CPUACCT_STAT_SYSTEM, cputime); | 3785 | cpuacct_update_stats(p, CPUACCT_STAT_SYSTEM, cputime); |
3787 | 3786 | ||
3788 | /* Account for system time used */ | 3787 | /* Account for system time used */ |
3789 | acct_update_integrals(p); | 3788 | acct_update_integrals(p); |
3790 | } | 3789 | } |
3791 | 3790 | ||
3792 | /* | 3791 | /* |
3793 | * Account system cpu time to a process. | 3792 | * Account system cpu time to a process. |
3794 | * @p: the process that the cpu time gets accounted to | 3793 | * @p: the process that the cpu time gets accounted to |
3795 | * @hardirq_offset: the offset to subtract from hardirq_count() | 3794 | * @hardirq_offset: the offset to subtract from hardirq_count() |
3796 | * @cputime: the cpu time spent in kernel space since the last update | 3795 | * @cputime: the cpu time spent in kernel space since the last update |
3797 | * @cputime_scaled: cputime scaled by cpu frequency | 3796 | * @cputime_scaled: cputime scaled by cpu frequency |
3798 | */ | 3797 | */ |
3799 | void account_system_time(struct task_struct *p, int hardirq_offset, | 3798 | void account_system_time(struct task_struct *p, int hardirq_offset, |
3800 | cputime_t cputime, cputime_t cputime_scaled) | 3799 | cputime_t cputime, cputime_t cputime_scaled) |
3801 | { | 3800 | { |
3802 | struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; | 3801 | struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; |
3803 | cputime64_t *target_cputime64; | 3802 | cputime64_t *target_cputime64; |
3804 | 3803 | ||
3805 | if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) { | 3804 | if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) { |
3806 | account_guest_time(p, cputime, cputime_scaled); | 3805 | account_guest_time(p, cputime, cputime_scaled); |
3807 | return; | 3806 | return; |
3808 | } | 3807 | } |
3809 | 3808 | ||
3810 | if (hardirq_count() - hardirq_offset) | 3809 | if (hardirq_count() - hardirq_offset) |
3811 | target_cputime64 = &cpustat->irq; | 3810 | target_cputime64 = &cpustat->irq; |
3812 | else if (in_serving_softirq()) | 3811 | else if (in_serving_softirq()) |
3813 | target_cputime64 = &cpustat->softirq; | 3812 | target_cputime64 = &cpustat->softirq; |
3814 | else | 3813 | else |
3815 | target_cputime64 = &cpustat->system; | 3814 | target_cputime64 = &cpustat->system; |
3816 | 3815 | ||
3817 | __account_system_time(p, cputime, cputime_scaled, target_cputime64); | 3816 | __account_system_time(p, cputime, cputime_scaled, target_cputime64); |
3818 | } | 3817 | } |
3819 | 3818 | ||
3820 | /* | 3819 | /* |
3821 | * Account for involuntary wait time. | 3820 | * Account for involuntary wait time. |
3822 | * @cputime: the cpu time spent in involuntary wait | 3821 | * @cputime: the cpu time spent in involuntary wait |
3823 | */ | 3822 | */ |
3824 | void account_steal_time(cputime_t cputime) | 3823 | void account_steal_time(cputime_t cputime) |
3825 | { | 3824 | { |
3826 | struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; | 3825 | struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; |
3827 | cputime64_t cputime64 = cputime_to_cputime64(cputime); | 3826 | cputime64_t cputime64 = cputime_to_cputime64(cputime); |
3828 | 3827 | ||
3829 | cpustat->steal = cputime64_add(cpustat->steal, cputime64); | 3828 | cpustat->steal = cputime64_add(cpustat->steal, cputime64); |
3830 | } | 3829 | } |
3831 | 3830 | ||
3832 | /* | 3831 | /* |
3833 | * Account for idle time. | 3832 | * Account for idle time. |
3834 | * @cputime: the cpu time spent in idle wait | 3833 | * @cputime: the cpu time spent in idle wait |
3835 | */ | 3834 | */ |
3836 | void account_idle_time(cputime_t cputime) | 3835 | void account_idle_time(cputime_t cputime) |
3837 | { | 3836 | { |
3838 | struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; | 3837 | struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; |
3839 | cputime64_t cputime64 = cputime_to_cputime64(cputime); | 3838 | cputime64_t cputime64 = cputime_to_cputime64(cputime); |
3840 | struct rq *rq = this_rq(); | 3839 | struct rq *rq = this_rq(); |
3841 | 3840 | ||
3842 | if (atomic_read(&rq->nr_iowait) > 0) | 3841 | if (atomic_read(&rq->nr_iowait) > 0) |
3843 | cpustat->iowait = cputime64_add(cpustat->iowait, cputime64); | 3842 | cpustat->iowait = cputime64_add(cpustat->iowait, cputime64); |
3844 | else | 3843 | else |
3845 | cpustat->idle = cputime64_add(cpustat->idle, cputime64); | 3844 | cpustat->idle = cputime64_add(cpustat->idle, cputime64); |
3846 | } | 3845 | } |
3847 | 3846 | ||
3848 | #ifndef CONFIG_VIRT_CPU_ACCOUNTING | 3847 | #ifndef CONFIG_VIRT_CPU_ACCOUNTING |
3849 | 3848 | ||
3850 | #ifdef CONFIG_IRQ_TIME_ACCOUNTING | 3849 | #ifdef CONFIG_IRQ_TIME_ACCOUNTING |
3851 | /* | 3850 | /* |
3852 | * Account a tick to a process and cpustat | 3851 | * Account a tick to a process and cpustat |
3853 | * @p: the process that the cpu time gets accounted to | 3852 | * @p: the process that the cpu time gets accounted to |
3854 | * @user_tick: is the tick from userspace | 3853 | * @user_tick: is the tick from userspace |
3855 | * @rq: the pointer to rq | 3854 | * @rq: the pointer to rq |
3856 | * | 3855 | * |
3857 | * Tick demultiplexing follows the order | 3856 | * Tick demultiplexing follows the order |
3858 | * - pending hardirq update | 3857 | * - pending hardirq update |
3859 | * - pending softirq update | 3858 | * - pending softirq update |
3860 | * - user_time | 3859 | * - user_time |
3861 | * - idle_time | 3860 | * - idle_time |
3862 | * - system time | 3861 | * - system time |
3863 | * - check for guest_time | 3862 | * - check for guest_time |
3864 | * - else account as system_time | 3863 | * - else account as system_time |
3865 | * | 3864 | * |
3866 | * The check for hardirq is done for both system and user time, as there is | 3865 | * The check for hardirq is done for both system and user time, as there is |
3867 | * no timer going off while we are on hardirq and hence we may never get an | 3866 | * no timer going off while we are on hardirq and hence we may never get an |
3868 | * opportunity to update it solely in system time. | 3867 | * opportunity to update it solely in system time. |
3869 | * p->stime and friends are only updated on system time and not on irq or | 3868 | * p->stime and friends are only updated on system time and not on irq or |
3870 | * softirq time, as those do not count in task exec_runtime any more. | 3869 | * softirq time, as those do not count in task exec_runtime any more. |
3871 | */ | 3870 | */ |
3872 | static void irqtime_account_process_tick(struct task_struct *p, int user_tick, | 3871 | static void irqtime_account_process_tick(struct task_struct *p, int user_tick, |
3873 | struct rq *rq) | 3872 | struct rq *rq) |
3874 | { | 3873 | { |
3875 | cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy); | 3874 | cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy); |
3876 | cputime64_t tmp = cputime_to_cputime64(cputime_one_jiffy); | 3875 | cputime64_t tmp = cputime_to_cputime64(cputime_one_jiffy); |
3877 | struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; | 3876 | struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; |
3878 | 3877 | ||
3879 | if (irqtime_account_hi_update()) { | 3878 | if (irqtime_account_hi_update()) { |
3880 | cpustat->irq = cputime64_add(cpustat->irq, tmp); | 3879 | cpustat->irq = cputime64_add(cpustat->irq, tmp); |
3881 | } else if (irqtime_account_si_update()) { | 3880 | } else if (irqtime_account_si_update()) { |
3882 | cpustat->softirq = cputime64_add(cpustat->softirq, tmp); | 3881 | cpustat->softirq = cputime64_add(cpustat->softirq, tmp); |
3883 | } else if (this_cpu_ksoftirqd() == p) { | 3882 | } else if (this_cpu_ksoftirqd() == p) { |
3884 | /* | 3883 | /* |
3885 | * ksoftirqd time does not get accounted in cpu_softirq_time. | 3884 | * ksoftirqd time does not get accounted in cpu_softirq_time. |
3886 | * So, we have to handle it separately here. | 3885 | * So, we have to handle it separately here. |
3887 | * Also, p->stime needs to be updated for ksoftirqd. | 3886 | * Also, p->stime needs to be updated for ksoftirqd. |
3888 | */ | 3887 | */ |
3889 | __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled, | 3888 | __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled, |
3890 | &cpustat->softirq); | 3889 | &cpustat->softirq); |
3891 | } else if (user_tick) { | 3890 | } else if (user_tick) { |
3892 | account_user_time(p, cputime_one_jiffy, one_jiffy_scaled); | 3891 | account_user_time(p, cputime_one_jiffy, one_jiffy_scaled); |
3893 | } else if (p == rq->idle) { | 3892 | } else if (p == rq->idle) { |
3894 | account_idle_time(cputime_one_jiffy); | 3893 | account_idle_time(cputime_one_jiffy); |
3895 | } else if (p->flags & PF_VCPU) { /* System time or guest time */ | 3894 | } else if (p->flags & PF_VCPU) { /* System time or guest time */ |
3896 | account_guest_time(p, cputime_one_jiffy, one_jiffy_scaled); | 3895 | account_guest_time(p, cputime_one_jiffy, one_jiffy_scaled); |
3897 | } else { | 3896 | } else { |
3898 | __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled, | 3897 | __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled, |
3899 | &cpustat->system); | 3898 | &cpustat->system); |
3900 | } | 3899 | } |
3901 | } | 3900 | } |
3902 | 3901 | ||
3903 | static void irqtime_account_idle_ticks(int ticks) | 3902 | static void irqtime_account_idle_ticks(int ticks) |
3904 | { | 3903 | { |
3905 | int i; | 3904 | int i; |
3906 | struct rq *rq = this_rq(); | 3905 | struct rq *rq = this_rq(); |
3907 | 3906 | ||
3908 | for (i = 0; i < ticks; i++) | 3907 | for (i = 0; i < ticks; i++) |
3909 | irqtime_account_process_tick(current, 0, rq); | 3908 | irqtime_account_process_tick(current, 0, rq); |
3910 | } | 3909 | } |
3911 | #else /* CONFIG_IRQ_TIME_ACCOUNTING */ | 3910 | #else /* CONFIG_IRQ_TIME_ACCOUNTING */ |
3912 | static void irqtime_account_idle_ticks(int ticks) {} | 3911 | static void irqtime_account_idle_ticks(int ticks) {} |
3913 | static void irqtime_account_process_tick(struct task_struct *p, int user_tick, | 3912 | static void irqtime_account_process_tick(struct task_struct *p, int user_tick, |
3914 | struct rq *rq) {} | 3913 | struct rq *rq) {} |
3915 | #endif /* CONFIG_IRQ_TIME_ACCOUNTING */ | 3914 | #endif /* CONFIG_IRQ_TIME_ACCOUNTING */ |
3916 | 3915 | ||
3917 | /* | 3916 | /* |
3918 | * Account a single tick of cpu time. | 3917 | * Account a single tick of cpu time. |
3919 | * @p: the process that the cpu time gets accounted to | 3918 | * @p: the process that the cpu time gets accounted to |
3920 | * @user_tick: indicates if the tick is a user or a system tick | 3919 | * @user_tick: indicates if the tick is a user or a system tick |
3921 | */ | 3920 | */ |
3922 | void account_process_tick(struct task_struct *p, int user_tick) | 3921 | void account_process_tick(struct task_struct *p, int user_tick) |
3923 | { | 3922 | { |
3924 | cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy); | 3923 | cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy); |
3925 | struct rq *rq = this_rq(); | 3924 | struct rq *rq = this_rq(); |
3926 | 3925 | ||
3927 | if (sched_clock_irqtime) { | 3926 | if (sched_clock_irqtime) { |
3928 | irqtime_account_process_tick(p, user_tick, rq); | 3927 | irqtime_account_process_tick(p, user_tick, rq); |
3929 | return; | 3928 | return; |
3930 | } | 3929 | } |
3931 | 3930 | ||
3932 | if (user_tick) | 3931 | if (user_tick) |
3933 | account_user_time(p, cputime_one_jiffy, one_jiffy_scaled); | 3932 | account_user_time(p, cputime_one_jiffy, one_jiffy_scaled); |
3934 | else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET)) | 3933 | else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET)) |
3935 | account_system_time(p, HARDIRQ_OFFSET, cputime_one_jiffy, | 3934 | account_system_time(p, HARDIRQ_OFFSET, cputime_one_jiffy, |
3936 | one_jiffy_scaled); | 3935 | one_jiffy_scaled); |
3937 | else | 3936 | else |
3938 | account_idle_time(cputime_one_jiffy); | 3937 | account_idle_time(cputime_one_jiffy); |
3939 | } | 3938 | } |
3940 | 3939 | ||
3941 | /* | 3940 | /* |
3942 | * Account multiple ticks of steal time. | 3941 | * Account multiple ticks of steal time. |
3943 | * @p: the process from which the cpu time has been stolen | 3942 | * @p: the process from which the cpu time has been stolen |
3944 | * @ticks: number of stolen ticks | 3943 | * @ticks: number of stolen ticks |
3945 | */ | 3944 | */ |
3946 | void account_steal_ticks(unsigned long ticks) | 3945 | void account_steal_ticks(unsigned long ticks) |
3947 | { | 3946 | { |
3948 | account_steal_time(jiffies_to_cputime(ticks)); | 3947 | account_steal_time(jiffies_to_cputime(ticks)); |
3949 | } | 3948 | } |
3950 | 3949 | ||
3951 | /* | 3950 | /* |
3952 | * Account multiple ticks of idle time. | 3951 | * Account multiple ticks of idle time. |
3953 | * @ticks: number of idle ticks | 3952 | * @ticks: number of idle ticks |
3954 | */ | 3953 | */ |
3955 | void account_idle_ticks(unsigned long ticks) | 3954 | void account_idle_ticks(unsigned long ticks) |
3956 | { | 3955 | { |
3957 | 3956 | ||
3958 | if (sched_clock_irqtime) { | 3957 | if (sched_clock_irqtime) { |
3959 | irqtime_account_idle_ticks(ticks); | 3958 | irqtime_account_idle_ticks(ticks); |
3960 | return; | 3959 | return; |
3961 | } | 3960 | } |
3962 | 3961 | ||
3963 | account_idle_time(jiffies_to_cputime(ticks)); | 3962 | account_idle_time(jiffies_to_cputime(ticks)); |
3964 | } | 3963 | } |
3965 | 3964 | ||
3966 | #endif | 3965 | #endif |
3967 | 3966 | ||
3968 | /* | 3967 | /* |
3969 | * Use precise platform statistics if available: | 3968 | * Use precise platform statistics if available: |
3970 | */ | 3969 | */ |
3971 | #ifdef CONFIG_VIRT_CPU_ACCOUNTING | 3970 | #ifdef CONFIG_VIRT_CPU_ACCOUNTING |
3972 | void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st) | 3971 | void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st) |
3973 | { | 3972 | { |
3974 | *ut = p->utime; | 3973 | *ut = p->utime; |
3975 | *st = p->stime; | 3974 | *st = p->stime; |
3976 | } | 3975 | } |
3977 | 3976 | ||
3978 | void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st) | 3977 | void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st) |
3979 | { | 3978 | { |
3980 | struct task_cputime cputime; | 3979 | struct task_cputime cputime; |
3981 | 3980 | ||
3982 | thread_group_cputime(p, &cputime); | 3981 | thread_group_cputime(p, &cputime); |
3983 | 3982 | ||
3984 | *ut = cputime.utime; | 3983 | *ut = cputime.utime; |
3985 | *st = cputime.stime; | 3984 | *st = cputime.stime; |
3986 | } | 3985 | } |
3987 | #else | 3986 | #else |
3988 | 3987 | ||
3989 | #ifndef nsecs_to_cputime | 3988 | #ifndef nsecs_to_cputime |
3990 | # define nsecs_to_cputime(__nsecs) nsecs_to_jiffies(__nsecs) | 3989 | # define nsecs_to_cputime(__nsecs) nsecs_to_jiffies(__nsecs) |
3991 | #endif | 3990 | #endif |
3992 | 3991 | ||
3993 | void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st) | 3992 | void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st) |
3994 | { | 3993 | { |
3995 | cputime_t rtime, utime = p->utime, total = cputime_add(utime, p->stime); | 3994 | cputime_t rtime, utime = p->utime, total = cputime_add(utime, p->stime); |
3996 | 3995 | ||
3997 | /* | 3996 | /* |
3998 | * Use CFS's precise accounting: | 3997 | * Use CFS's precise accounting: |
3999 | */ | 3998 | */ |
4000 | rtime = nsecs_to_cputime(p->se.sum_exec_runtime); | 3999 | rtime = nsecs_to_cputime(p->se.sum_exec_runtime); |
4001 | 4000 | ||
4002 | if (total) { | 4001 | if (total) { |
4003 | u64 temp = rtime; | 4002 | u64 temp = rtime; |
4004 | 4003 | ||
4005 | temp *= utime; | 4004 | temp *= utime; |
4006 | do_div(temp, total); | 4005 | do_div(temp, total); |
4007 | utime = (cputime_t)temp; | 4006 | utime = (cputime_t)temp; |
4008 | } else | 4007 | } else |
4009 | utime = rtime; | 4008 | utime = rtime; |
4010 | 4009 | ||
4011 | /* | 4010 | /* |
4012 | * Compare with previous values, to keep monotonicity: | 4011 | * Compare with previous values, to keep monotonicity: |
4013 | */ | 4012 | */ |
4014 | p->prev_utime = max(p->prev_utime, utime); | 4013 | p->prev_utime = max(p->prev_utime, utime); |
4015 | p->prev_stime = max(p->prev_stime, cputime_sub(rtime, p->prev_utime)); | 4014 | p->prev_stime = max(p->prev_stime, cputime_sub(rtime, p->prev_utime)); |
4016 | 4015 | ||
4017 | *ut = p->prev_utime; | 4016 | *ut = p->prev_utime; |
4018 | *st = p->prev_stime; | 4017 | *st = p->prev_stime; |
4019 | } | 4018 | } |
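A worked example with invented numbers may help here: if the sampled values are utime = 30 ticks and stime = 10 ticks while the precise CFS runtime converts to rtime = 50 ticks, then total = 40, temp = 50 * 30 = 1500, and do_div(temp, 40) yields utime = 37; prev_stime then becomes at least 50 - 37 = 13. The 50 precise ticks are thus split 37/13, preserving the sampled 3:1 ratio up to integer truncation, and the max() against the previous values keeps both reported numbers monotonic.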
4020 | 4019 | ||
4021 | /* | 4020 | /* |
4022 | * Must be called with siglock held. | 4021 | * Must be called with siglock held. |
4023 | */ | 4022 | */ |
4024 | void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st) | 4023 | void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st) |
4025 | { | 4024 | { |
4026 | struct signal_struct *sig = p->signal; | 4025 | struct signal_struct *sig = p->signal; |
4027 | struct task_cputime cputime; | 4026 | struct task_cputime cputime; |
4028 | cputime_t rtime, utime, total; | 4027 | cputime_t rtime, utime, total; |
4029 | 4028 | ||
4030 | thread_group_cputime(p, &cputime); | 4029 | thread_group_cputime(p, &cputime); |
4031 | 4030 | ||
4032 | total = cputime_add(cputime.utime, cputime.stime); | 4031 | total = cputime_add(cputime.utime, cputime.stime); |
4033 | rtime = nsecs_to_cputime(cputime.sum_exec_runtime); | 4032 | rtime = nsecs_to_cputime(cputime.sum_exec_runtime); |
4034 | 4033 | ||
4035 | if (total) { | 4034 | if (total) { |
4036 | u64 temp = rtime; | 4035 | u64 temp = rtime; |
4037 | 4036 | ||
4038 | temp *= cputime.utime; | 4037 | temp *= cputime.utime; |
4039 | do_div(temp, total); | 4038 | do_div(temp, total); |
4040 | utime = (cputime_t)temp; | 4039 | utime = (cputime_t)temp; |
4041 | } else | 4040 | } else |
4042 | utime = rtime; | 4041 | utime = rtime; |
4043 | 4042 | ||
4044 | sig->prev_utime = max(sig->prev_utime, utime); | 4043 | sig->prev_utime = max(sig->prev_utime, utime); |
4045 | sig->prev_stime = max(sig->prev_stime, | 4044 | sig->prev_stime = max(sig->prev_stime, |
4046 | cputime_sub(rtime, sig->prev_utime)); | 4045 | cputime_sub(rtime, sig->prev_utime)); |
4047 | 4046 | ||
4048 | *ut = sig->prev_utime; | 4047 | *ut = sig->prev_utime; |
4049 | *st = sig->prev_stime; | 4048 | *st = sig->prev_stime; |
4050 | } | 4049 | } |
4051 | #endif | 4050 | #endif |
4052 | 4051 | ||
4053 | /* | 4052 | /* |
4054 | * This function gets called by the timer code, with HZ frequency. | 4053 | * This function gets called by the timer code, with HZ frequency. |
4055 | * We call it with interrupts disabled. | 4054 | * We call it with interrupts disabled. |
4056 | */ | 4055 | */ |
4057 | void scheduler_tick(void) | 4056 | void scheduler_tick(void) |
4058 | { | 4057 | { |
4059 | int cpu = smp_processor_id(); | 4058 | int cpu = smp_processor_id(); |
4060 | struct rq *rq = cpu_rq(cpu); | 4059 | struct rq *rq = cpu_rq(cpu); |
4061 | struct task_struct *curr = rq->curr; | 4060 | struct task_struct *curr = rq->curr; |
4062 | 4061 | ||
4063 | sched_clock_tick(); | 4062 | sched_clock_tick(); |
4064 | 4063 | ||
4065 | raw_spin_lock(&rq->lock); | 4064 | raw_spin_lock(&rq->lock); |
4066 | update_rq_clock(rq); | 4065 | update_rq_clock(rq); |
4067 | update_cpu_load_active(rq); | 4066 | update_cpu_load_active(rq); |
4068 | curr->sched_class->task_tick(rq, curr, 0); | 4067 | curr->sched_class->task_tick(rq, curr, 0); |
4069 | raw_spin_unlock(&rq->lock); | 4068 | raw_spin_unlock(&rq->lock); |
4070 | 4069 | ||
4071 | perf_event_task_tick(); | 4070 | perf_event_task_tick(); |
4072 | 4071 | ||
4073 | #ifdef CONFIG_SMP | 4072 | #ifdef CONFIG_SMP |
4074 | rq->idle_at_tick = idle_cpu(cpu); | 4073 | rq->idle_at_tick = idle_cpu(cpu); |
4075 | trigger_load_balance(rq, cpu); | 4074 | trigger_load_balance(rq, cpu); |
4076 | #endif | 4075 | #endif |
4077 | } | 4076 | } |
4078 | 4077 | ||
4079 | notrace unsigned long get_parent_ip(unsigned long addr) | 4078 | notrace unsigned long get_parent_ip(unsigned long addr) |
4080 | { | 4079 | { |
4081 | if (in_lock_functions(addr)) { | 4080 | if (in_lock_functions(addr)) { |
4082 | addr = CALLER_ADDR2; | 4081 | addr = CALLER_ADDR2; |
4083 | if (in_lock_functions(addr)) | 4082 | if (in_lock_functions(addr)) |
4084 | addr = CALLER_ADDR3; | 4083 | addr = CALLER_ADDR3; |
4085 | } | 4084 | } |
4086 | return addr; | 4085 | return addr; |
4087 | } | 4086 | } |
4088 | 4087 | ||
4089 | #if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \ | 4088 | #if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \ |
4090 | defined(CONFIG_PREEMPT_TRACER)) | 4089 | defined(CONFIG_PREEMPT_TRACER)) |
4091 | 4090 | ||
4092 | void __kprobes add_preempt_count(int val) | 4091 | void __kprobes add_preempt_count(int val) |
4093 | { | 4092 | { |
4094 | #ifdef CONFIG_DEBUG_PREEMPT | 4093 | #ifdef CONFIG_DEBUG_PREEMPT |
4095 | /* | 4094 | /* |
4096 | * Underflow? | 4095 | * Underflow? |
4097 | */ | 4096 | */ |
4098 | if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0))) | 4097 | if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0))) |
4099 | return; | 4098 | return; |
4100 | #endif | 4099 | #endif |
4101 | preempt_count() += val; | 4100 | preempt_count() += val; |
4102 | #ifdef CONFIG_DEBUG_PREEMPT | 4101 | #ifdef CONFIG_DEBUG_PREEMPT |
4103 | /* | 4102 | /* |
4104 | * Spinlock count overflowing soon? | 4103 | * Spinlock count overflowing soon? |
4105 | */ | 4104 | */ |
4106 | DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >= | 4105 | DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >= |
4107 | PREEMPT_MASK - 10); | 4106 | PREEMPT_MASK - 10); |
4108 | #endif | 4107 | #endif |
4109 | if (preempt_count() == val) | 4108 | if (preempt_count() == val) |
4110 | trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); | 4109 | trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); |
4111 | } | 4110 | } |
4112 | EXPORT_SYMBOL(add_preempt_count); | 4111 | EXPORT_SYMBOL(add_preempt_count); |
4113 | 4112 | ||
4114 | void __kprobes sub_preempt_count(int val) | 4113 | void __kprobes sub_preempt_count(int val) |
4115 | { | 4114 | { |
4116 | #ifdef CONFIG_DEBUG_PREEMPT | 4115 | #ifdef CONFIG_DEBUG_PREEMPT |
4117 | /* | 4116 | /* |
4118 | * Underflow? | 4117 | * Underflow? |
4119 | */ | 4118 | */ |
4120 | if (DEBUG_LOCKS_WARN_ON(val > preempt_count())) | 4119 | if (DEBUG_LOCKS_WARN_ON(val > preempt_count())) |
4121 | return; | 4120 | return; |
4122 | /* | 4121 | /* |
4123 | * Is the spinlock portion underflowing? | 4122 | * Is the spinlock portion underflowing? |
4124 | */ | 4123 | */ |
4125 | if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) && | 4124 | if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) && |
4126 | !(preempt_count() & PREEMPT_MASK))) | 4125 | !(preempt_count() & PREEMPT_MASK))) |
4127 | return; | 4126 | return; |
4128 | #endif | 4127 | #endif |
4129 | 4128 | ||
4130 | if (preempt_count() == val) | 4129 | if (preempt_count() == val) |
4131 | trace_preempt_on(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); | 4130 | trace_preempt_on(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); |
4132 | preempt_count() -= val; | 4131 | preempt_count() -= val; |
4133 | } | 4132 | } |
4134 | EXPORT_SYMBOL(sub_preempt_count); | 4133 | EXPORT_SYMBOL(sub_preempt_count); |
4135 | 4134 | ||
4136 | #endif | 4135 | #endif |
4137 | 4136 | ||
4138 | /* | 4137 | /* |
4139 | * Print scheduling while atomic bug: | 4138 | * Print scheduling while atomic bug: |
4140 | */ | 4139 | */ |
4141 | static noinline void __schedule_bug(struct task_struct *prev) | 4140 | static noinline void __schedule_bug(struct task_struct *prev) |
4142 | { | 4141 | { |
4143 | struct pt_regs *regs = get_irq_regs(); | 4142 | struct pt_regs *regs = get_irq_regs(); |
4144 | 4143 | ||
4145 | printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n", | 4144 | printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n", |
4146 | prev->comm, prev->pid, preempt_count()); | 4145 | prev->comm, prev->pid, preempt_count()); |
4147 | 4146 | ||
4148 | debug_show_held_locks(prev); | 4147 | debug_show_held_locks(prev); |
4149 | print_modules(); | 4148 | print_modules(); |
4150 | if (irqs_disabled()) | 4149 | if (irqs_disabled()) |
4151 | print_irqtrace_events(prev); | 4150 | print_irqtrace_events(prev); |
4152 | 4151 | ||
4153 | if (regs) | 4152 | if (regs) |
4154 | show_regs(regs); | 4153 | show_regs(regs); |
4155 | else | 4154 | else |
4156 | dump_stack(); | 4155 | dump_stack(); |
4157 | } | 4156 | } |
4158 | 4157 | ||
4159 | /* | 4158 | /* |
4160 | * Various schedule()-time debugging checks and statistics: | 4159 | * Various schedule()-time debugging checks and statistics: |
4161 | */ | 4160 | */ |
4162 | static inline void schedule_debug(struct task_struct *prev) | 4161 | static inline void schedule_debug(struct task_struct *prev) |
4163 | { | 4162 | { |
4164 | /* | 4163 | /* |
4165 | * Test if we are atomic. Since do_exit() needs to call into | 4164 | * Test if we are atomic. Since do_exit() needs to call into |
4166 | * schedule() atomically, we ignore that path for now. | 4165 | * schedule() atomically, we ignore that path for now. |
4167 | * Otherwise, whine if we are scheduling when we should not be. | 4166 | * Otherwise, whine if we are scheduling when we should not be. |
4168 | */ | 4167 | */ |
4169 | if (unlikely(in_atomic_preempt_off() && !prev->exit_state)) | 4168 | if (unlikely(in_atomic_preempt_off() && !prev->exit_state)) |
4170 | __schedule_bug(prev); | 4169 | __schedule_bug(prev); |
4171 | 4170 | ||
4172 | profile_hit(SCHED_PROFILING, __builtin_return_address(0)); | 4171 | profile_hit(SCHED_PROFILING, __builtin_return_address(0)); |
4173 | 4172 | ||
4174 | schedstat_inc(this_rq(), sched_count); | 4173 | schedstat_inc(this_rq(), sched_count); |
4175 | } | 4174 | } |
4176 | 4175 | ||
4177 | static void put_prev_task(struct rq *rq, struct task_struct *prev) | 4176 | static void put_prev_task(struct rq *rq, struct task_struct *prev) |
4178 | { | 4177 | { |
4179 | if (prev->on_rq || rq->skip_clock_update < 0) | 4178 | if (prev->on_rq || rq->skip_clock_update < 0) |
4180 | update_rq_clock(rq); | 4179 | update_rq_clock(rq); |
4181 | prev->sched_class->put_prev_task(rq, prev); | 4180 | prev->sched_class->put_prev_task(rq, prev); |
4182 | } | 4181 | } |
4183 | 4182 | ||
4184 | /* | 4183 | /* |
4185 | * Pick up the highest-prio task: | 4184 | * Pick up the highest-prio task: |
4186 | */ | 4185 | */ |
4187 | static inline struct task_struct * | 4186 | static inline struct task_struct * |
4188 | pick_next_task(struct rq *rq) | 4187 | pick_next_task(struct rq *rq) |
4189 | { | 4188 | { |
4190 | const struct sched_class *class; | 4189 | const struct sched_class *class; |
4191 | struct task_struct *p; | 4190 | struct task_struct *p; |
4192 | 4191 | ||
4193 | /* | 4192 | /* |
4194 | * Optimization: we know that if all tasks are in | 4193 | * Optimization: we know that if all tasks are in |
4195 | * the fair class we can call that function directly: | 4194 | * the fair class we can call that function directly: |
4196 | */ | 4195 | */ |
4197 | if (likely(rq->nr_running == rq->cfs.nr_running)) { | 4196 | if (likely(rq->nr_running == rq->cfs.nr_running)) { |
4198 | p = fair_sched_class.pick_next_task(rq); | 4197 | p = fair_sched_class.pick_next_task(rq); |
4199 | if (likely(p)) | 4198 | if (likely(p)) |
4200 | return p; | 4199 | return p; |
4201 | } | 4200 | } |
4202 | 4201 | ||
4203 | for_each_class(class) { | 4202 | for_each_class(class) { |
4204 | p = class->pick_next_task(rq); | 4203 | p = class->pick_next_task(rq); |
4205 | if (p) | 4204 | if (p) |
4206 | return p; | 4205 | return p; |
4207 | } | 4206 | } |
4208 | 4207 | ||
4209 | BUG(); /* the idle class will always have a runnable task */ | 4208 | BUG(); /* the idle class will always have a runnable task */ |
4210 | } | 4209 | } |
4211 | 4210 | ||
4212 | /* | 4211 | /* |
4213 | * schedule() is the main scheduler function. | 4212 | * schedule() is the main scheduler function. |
4214 | */ | 4213 | */ |
4215 | asmlinkage void __sched schedule(void) | 4214 | asmlinkage void __sched schedule(void) |
4216 | { | 4215 | { |
4217 | struct task_struct *prev, *next; | 4216 | struct task_struct *prev, *next; |
4218 | unsigned long *switch_count; | 4217 | unsigned long *switch_count; |
4219 | struct rq *rq; | 4218 | struct rq *rq; |
4220 | int cpu; | 4219 | int cpu; |
4221 | 4220 | ||
4222 | need_resched: | 4221 | need_resched: |
4223 | preempt_disable(); | 4222 | preempt_disable(); |
4224 | cpu = smp_processor_id(); | 4223 | cpu = smp_processor_id(); |
4225 | rq = cpu_rq(cpu); | 4224 | rq = cpu_rq(cpu); |
4226 | rcu_note_context_switch(cpu); | 4225 | rcu_note_context_switch(cpu); |
4227 | prev = rq->curr; | 4226 | prev = rq->curr; |
4228 | 4227 | ||
4229 | schedule_debug(prev); | 4228 | schedule_debug(prev); |
4230 | 4229 | ||
4231 | if (sched_feat(HRTICK)) | 4230 | if (sched_feat(HRTICK)) |
4232 | hrtick_clear(rq); | 4231 | hrtick_clear(rq); |
4233 | 4232 | ||
4234 | raw_spin_lock_irq(&rq->lock); | 4233 | raw_spin_lock_irq(&rq->lock); |
4235 | 4234 | ||
4236 | switch_count = &prev->nivcsw; | 4235 | switch_count = &prev->nivcsw; |
4237 | if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { | 4236 | if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { |
4238 | if (unlikely(signal_pending_state(prev->state, prev))) { | 4237 | if (unlikely(signal_pending_state(prev->state, prev))) { |
4239 | prev->state = TASK_RUNNING; | 4238 | prev->state = TASK_RUNNING; |
4240 | } else { | 4239 | } else { |
4241 | deactivate_task(rq, prev, DEQUEUE_SLEEP); | 4240 | deactivate_task(rq, prev, DEQUEUE_SLEEP); |
4242 | prev->on_rq = 0; | 4241 | prev->on_rq = 0; |
4243 | 4242 | ||
4244 | /* | 4243 | /* |
4245 | * If a worker went to sleep, notify and ask workqueue | 4244 | * If a worker went to sleep, notify and ask workqueue |
4246 | * whether it wants to wake up a task to maintain | 4245 | * whether it wants to wake up a task to maintain |
4247 | * concurrency. | 4246 | * concurrency. |
4248 | */ | 4247 | */ |
4249 | if (prev->flags & PF_WQ_WORKER) { | 4248 | if (prev->flags & PF_WQ_WORKER) { |
4250 | struct task_struct *to_wakeup; | 4249 | struct task_struct *to_wakeup; |
4251 | 4250 | ||
4252 | to_wakeup = wq_worker_sleeping(prev, cpu); | 4251 | to_wakeup = wq_worker_sleeping(prev, cpu); |
4253 | if (to_wakeup) | 4252 | if (to_wakeup) |
4254 | try_to_wake_up_local(to_wakeup); | 4253 | try_to_wake_up_local(to_wakeup); |
4255 | } | 4254 | } |
4256 | 4255 | ||
4257 | /* | 4256 | /* |
4258 | * If we are going to sleep and we have plugged IO | 4257 | * If we are going to sleep and we have plugged IO |
4259 | * queued, make sure to submit it to avoid deadlocks. | 4258 | * queued, make sure to submit it to avoid deadlocks. |
4260 | */ | 4259 | */ |
4261 | if (blk_needs_flush_plug(prev)) { | 4260 | if (blk_needs_flush_plug(prev)) { |
4262 | raw_spin_unlock(&rq->lock); | 4261 | raw_spin_unlock(&rq->lock); |
4263 | blk_schedule_flush_plug(prev); | 4262 | blk_schedule_flush_plug(prev); |
4264 | raw_spin_lock(&rq->lock); | 4263 | raw_spin_lock(&rq->lock); |
4265 | } | 4264 | } |
4266 | } | 4265 | } |
4267 | switch_count = &prev->nvcsw; | 4266 | switch_count = &prev->nvcsw; |
4268 | } | 4267 | } |
4269 | 4268 | ||
4270 | pre_schedule(rq, prev); | 4269 | pre_schedule(rq, prev); |
4271 | 4270 | ||
4272 | if (unlikely(!rq->nr_running)) | 4271 | if (unlikely(!rq->nr_running)) |
4273 | idle_balance(cpu, rq); | 4272 | idle_balance(cpu, rq); |
4274 | 4273 | ||
4275 | put_prev_task(rq, prev); | 4274 | put_prev_task(rq, prev); |
4276 | next = pick_next_task(rq); | 4275 | next = pick_next_task(rq); |
4277 | clear_tsk_need_resched(prev); | 4276 | clear_tsk_need_resched(prev); |
4278 | rq->skip_clock_update = 0; | 4277 | rq->skip_clock_update = 0; |
4279 | 4278 | ||
4280 | if (likely(prev != next)) { | 4279 | if (likely(prev != next)) { |
4281 | rq->nr_switches++; | 4280 | rq->nr_switches++; |
4282 | rq->curr = next; | 4281 | rq->curr = next; |
4283 | ++*switch_count; | 4282 | ++*switch_count; |
4284 | 4283 | ||
4285 | context_switch(rq, prev, next); /* unlocks the rq */ | 4284 | context_switch(rq, prev, next); /* unlocks the rq */ |
4286 | /* | 4285 | /* |
4287 | * The context switch has flipped the stack from under us | 4286 | * The context switch has flipped the stack from under us |
4288 | * and restored the local variables which were saved when | 4287 | * and restored the local variables which were saved when |
4289 | * this task called schedule() in the past. prev == current | 4288 | * this task called schedule() in the past. prev == current |
4290 | * is still correct, but it can be moved to another cpu/rq. | 4289 | * is still correct, but it can be moved to another cpu/rq. |
4291 | */ | 4290 | */ |
4292 | cpu = smp_processor_id(); | 4291 | cpu = smp_processor_id(); |
4293 | rq = cpu_rq(cpu); | 4292 | rq = cpu_rq(cpu); |
4294 | } else | 4293 | } else |
4295 | raw_spin_unlock_irq(&rq->lock); | 4294 | raw_spin_unlock_irq(&rq->lock); |
4296 | 4295 | ||
4297 | post_schedule(rq); | 4296 | post_schedule(rq); |
4298 | 4297 | ||
4299 | preempt_enable_no_resched(); | 4298 | preempt_enable_no_resched(); |
4300 | if (need_resched()) | 4299 | if (need_resched()) |
4301 | goto need_resched; | 4300 | goto need_resched; |
4302 | } | 4301 | } |
4303 | EXPORT_SYMBOL(schedule); | 4302 | EXPORT_SYMBOL(schedule); |
4304 | 4303 | ||
4305 | #ifdef CONFIG_MUTEX_SPIN_ON_OWNER | 4304 | #ifdef CONFIG_MUTEX_SPIN_ON_OWNER |
4306 | 4305 | ||
4307 | static inline bool owner_running(struct mutex *lock, struct task_struct *owner) | 4306 | static inline bool owner_running(struct mutex *lock, struct task_struct *owner) |
4308 | { | 4307 | { |
4309 | bool ret = false; | 4308 | bool ret = false; |
4310 | 4309 | ||
4311 | rcu_read_lock(); | 4310 | rcu_read_lock(); |
4312 | if (lock->owner != owner) | 4311 | if (lock->owner != owner) |
4313 | goto fail; | 4312 | goto fail; |
4314 | 4313 | ||
4315 | /* | 4314 | /* |
4316 | * Ensure we emit the owner->on_cpu dereference _after_ checking that | 4315 | * Ensure we emit the owner->on_cpu dereference _after_ checking that |
4317 | * lock->owner still matches owner. If that fails, owner might | 4316 | * lock->owner still matches owner. If that fails, owner might |
4318 | * point to free()d memory; if it still matches, the rcu_read_lock() | 4317 | * point to free()d memory; if it still matches, the rcu_read_lock() |
4319 | * ensures the memory stays valid. | 4318 | * ensures the memory stays valid. |
4320 | */ | 4319 | */ |
4321 | barrier(); | 4320 | barrier(); |
4322 | 4321 | ||
4323 | ret = owner->on_cpu; | 4322 | ret = owner->on_cpu; |
4324 | fail: | 4323 | fail: |
4325 | rcu_read_unlock(); | 4324 | rcu_read_unlock(); |
4326 | 4325 | ||
4327 | return ret; | 4326 | return ret; |
4328 | } | 4327 | } |
4329 | 4328 | ||
4330 | /* | 4329 | /* |
4331 | * Look out! "owner" is an entirely speculative pointer | 4330 | * Look out! "owner" is an entirely speculative pointer |
4332 | * access and not reliable. | 4331 | * access and not reliable. |
4333 | */ | 4332 | */ |
4334 | int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner) | 4333 | int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner) |
4335 | { | 4334 | { |
4336 | if (!sched_feat(OWNER_SPIN)) | 4335 | if (!sched_feat(OWNER_SPIN)) |
4337 | return 0; | 4336 | return 0; |
4338 | 4337 | ||
4339 | while (owner_running(lock, owner)) { | 4338 | while (owner_running(lock, owner)) { |
4340 | if (need_resched()) | 4339 | if (need_resched()) |
4341 | return 0; | 4340 | return 0; |
4342 | 4341 | ||
4343 | arch_mutex_cpu_relax(); | 4342 | arch_mutex_cpu_relax(); |
4344 | } | 4343 | } |
4345 | 4344 | ||
4346 | /* | 4345 | /* |
4347 | * If the owner changed to another task there is likely | 4346 | * If the owner changed to another task there is likely |
4348 | * heavy contention, stop spinning. | 4347 | * heavy contention, stop spinning. |
4349 | */ | 4348 | */ |
4350 | if (lock->owner) | 4349 | if (lock->owner) |
4351 | return 0; | 4350 | return 0; |
4352 | 4351 | ||
4353 | return 1; | 4352 | return 1; |
4354 | } | 4353 | } |
4355 | #endif | 4354 | #endif |
4356 | 4355 | ||
4357 | #ifdef CONFIG_PREEMPT | 4356 | #ifdef CONFIG_PREEMPT |
4358 | /* | 4357 | /* |
4359 | * this is the entry point to schedule() from in-kernel preemption | 4358 | * this is the entry point to schedule() from in-kernel preemption |
4360 | * off of preempt_enable. Kernel preemptions off return from interrupt | 4359 | * off of preempt_enable. Kernel preemptions off return from interrupt |
4361 | * occur there and call schedule directly. | 4360 | * occur there and call schedule directly. |
4362 | */ | 4361 | */ |
4363 | asmlinkage void __sched notrace preempt_schedule(void) | 4362 | asmlinkage void __sched notrace preempt_schedule(void) |
4364 | { | 4363 | { |
4365 | struct thread_info *ti = current_thread_info(); | 4364 | struct thread_info *ti = current_thread_info(); |
4366 | 4365 | ||
4367 | /* | 4366 | /* |
4368 | * If there is a non-zero preempt_count or interrupts are disabled, | 4367 | * If there is a non-zero preempt_count or interrupts are disabled, |
4369 | * we do not want to preempt the current task. Just return.. | 4368 | * we do not want to preempt the current task. Just return.. |
4370 | */ | 4369 | */ |
4371 | if (likely(ti->preempt_count || irqs_disabled())) | 4370 | if (likely(ti->preempt_count || irqs_disabled())) |
4372 | return; | 4371 | return; |
4373 | 4372 | ||
4374 | do { | 4373 | do { |
4375 | add_preempt_count_notrace(PREEMPT_ACTIVE); | 4374 | add_preempt_count_notrace(PREEMPT_ACTIVE); |
4376 | schedule(); | 4375 | schedule(); |
4377 | sub_preempt_count_notrace(PREEMPT_ACTIVE); | 4376 | sub_preempt_count_notrace(PREEMPT_ACTIVE); |
4378 | 4377 | ||
4379 | /* | 4378 | /* |
4380 | * Check again in case we missed a preemption opportunity | 4379 | * Check again in case we missed a preemption opportunity |
4381 | * between schedule and now. | 4380 | * between schedule and now. |
4382 | */ | 4381 | */ |
4383 | barrier(); | 4382 | barrier(); |
4384 | } while (need_resched()); | 4383 | } while (need_resched()); |
4385 | } | 4384 | } |
4386 | EXPORT_SYMBOL(preempt_schedule); | 4385 | EXPORT_SYMBOL(preempt_schedule); |
4387 | 4386 | ||
4388 | /* | 4387 | /* |
4389 | * this is the entry point to schedule() from kernel preemption | 4388 | * this is the entry point to schedule() from kernel preemption |
4390 | * off of irq context. | 4389 | * off of irq context. |
4391 | * Note that this is called and returns with irqs disabled. This will | 4390 | * Note that this is called and returns with irqs disabled. This will |
4392 | * protect us against recursive calling from irq. | 4391 | * protect us against recursive calling from irq. |
4393 | */ | 4392 | */ |
4394 | asmlinkage void __sched preempt_schedule_irq(void) | 4393 | asmlinkage void __sched preempt_schedule_irq(void) |
4395 | { | 4394 | { |
4396 | struct thread_info *ti = current_thread_info(); | 4395 | struct thread_info *ti = current_thread_info(); |
4397 | 4396 | ||
4398 | /* Catch callers which need to be fixed */ | 4397 | /* Catch callers which need to be fixed */ |
4399 | BUG_ON(ti->preempt_count || !irqs_disabled()); | 4398 | BUG_ON(ti->preempt_count || !irqs_disabled()); |
4400 | 4399 | ||
4401 | do { | 4400 | do { |
4402 | add_preempt_count(PREEMPT_ACTIVE); | 4401 | add_preempt_count(PREEMPT_ACTIVE); |
4403 | local_irq_enable(); | 4402 | local_irq_enable(); |
4404 | schedule(); | 4403 | schedule(); |
4405 | local_irq_disable(); | 4404 | local_irq_disable(); |
4406 | sub_preempt_count(PREEMPT_ACTIVE); | 4405 | sub_preempt_count(PREEMPT_ACTIVE); |
4407 | 4406 | ||
4408 | /* | 4407 | /* |
4409 | * Check again in case we missed a preemption opportunity | 4408 | * Check again in case we missed a preemption opportunity |
4410 | * between schedule and now. | 4409 | * between schedule and now. |
4411 | */ | 4410 | */ |
4412 | barrier(); | 4411 | barrier(); |
4413 | } while (need_resched()); | 4412 | } while (need_resched()); |
4414 | } | 4413 | } |
4415 | 4414 | ||
4416 | #endif /* CONFIG_PREEMPT */ | 4415 | #endif /* CONFIG_PREEMPT */ |
4417 | 4416 | ||
4418 | int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags, | 4417 | int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags, |
4419 | void *key) | 4418 | void *key) |
4420 | { | 4419 | { |
4421 | return try_to_wake_up(curr->private, mode, wake_flags); | 4420 | return try_to_wake_up(curr->private, mode, wake_flags); |
4422 | } | 4421 | } |
4423 | EXPORT_SYMBOL(default_wake_function); | 4422 | EXPORT_SYMBOL(default_wake_function); |
4424 | 4423 | ||
4425 | /* | 4424 | /* |
4426 | * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just | 4425 | * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just |
4427 | * wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve | 4426 | * wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve |
4428 | * number) then we wake all the non-exclusive tasks and one exclusive task. | 4427 | * number) then we wake all the non-exclusive tasks and one exclusive task. |
4429 | * | 4428 | * |
4430 | * There are circumstances in which we can try to wake a task which has already | 4429 | * There are circumstances in which we can try to wake a task which has already |
4431 | * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns | 4430 | * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns |
4432 | * zero in this (rare) case, and we handle it by continuing to scan the queue. | 4431 | * zero in this (rare) case, and we handle it by continuing to scan the queue. |
4433 | */ | 4432 | */ |
4434 | static void __wake_up_common(wait_queue_head_t *q, unsigned int mode, | 4433 | static void __wake_up_common(wait_queue_head_t *q, unsigned int mode, |
4435 | int nr_exclusive, int wake_flags, void *key) | 4434 | int nr_exclusive, int wake_flags, void *key) |
4436 | { | 4435 | { |
4437 | wait_queue_t *curr, *next; | 4436 | wait_queue_t *curr, *next; |
4438 | 4437 | ||
4439 | list_for_each_entry_safe(curr, next, &q->task_list, task_list) { | 4438 | list_for_each_entry_safe(curr, next, &q->task_list, task_list) { |
4440 | unsigned flags = curr->flags; | 4439 | unsigned flags = curr->flags; |
4441 | 4440 | ||
4442 | if (curr->func(curr, mode, wake_flags, key) && | 4441 | if (curr->func(curr, mode, wake_flags, key) && |
4443 | (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive) | 4442 | (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive) |
4444 | break; | 4443 | break; |
4445 | } | 4444 | } |
4446 | } | 4445 | } |
4447 | 4446 | ||
4448 | /** | 4447 | /** |
4449 | * __wake_up - wake up threads blocked on a waitqueue. | 4448 | * __wake_up - wake up threads blocked on a waitqueue. |
4450 | * @q: the waitqueue | 4449 | * @q: the waitqueue |
4451 | * @mode: which threads | 4450 | * @mode: which threads |
4452 | * @nr_exclusive: how many wake-one or wake-many threads to wake up | 4451 | * @nr_exclusive: how many wake-one or wake-many threads to wake up |
4453 | * @key: is directly passed to the wakeup function | 4452 | * @key: is directly passed to the wakeup function |
4454 | * | 4453 | * |
4455 | * It may be assumed that this function implies a write memory barrier before | 4454 | * It may be assumed that this function implies a write memory barrier before |
4456 | * changing the task state if and only if any tasks are woken up. | 4455 | * changing the task state if and only if any tasks are woken up. |
4457 | */ | 4456 | */ |
4458 | void __wake_up(wait_queue_head_t *q, unsigned int mode, | 4457 | void __wake_up(wait_queue_head_t *q, unsigned int mode, |
4459 | int nr_exclusive, void *key) | 4458 | int nr_exclusive, void *key) |
4460 | { | 4459 | { |
4461 | unsigned long flags; | 4460 | unsigned long flags; |
4462 | 4461 | ||
4463 | spin_lock_irqsave(&q->lock, flags); | 4462 | spin_lock_irqsave(&q->lock, flags); |
4464 | __wake_up_common(q, mode, nr_exclusive, 0, key); | 4463 | __wake_up_common(q, mode, nr_exclusive, 0, key); |
4465 | spin_unlock_irqrestore(&q->lock, flags); | 4464 | spin_unlock_irqrestore(&q->lock, flags); |
4466 | } | 4465 | } |
4467 | EXPORT_SYMBOL(__wake_up); | 4466 | EXPORT_SYMBOL(__wake_up); |
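A rough usage sketch of the waitqueue path exported above, assuming the usual wake_up() wrapper from <linux/wait.h> (which boils down to __wake_up(q, TASK_NORMAL, 1, NULL)); the demo_* names are invented for illustration.

#include <linux/wait.h>
#include <linux/sched.h>

static DECLARE_WAIT_QUEUE_HEAD(demo_wq);
static int demo_ready;

static int demo_sleeper(void)
{
	/* Sleep until demo_ready is set; returns -ERESTARTSYS if interrupted. */
	return wait_event_interruptible(demo_wq, demo_ready);
}

static void demo_waker(void)
{
	demo_ready = 1;
	wake_up(&demo_wq);	/* ends up in __wake_up() above */
}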
4468 | 4467 | ||
4469 | /* | 4468 | /* |
4470 | * Same as __wake_up but called with the spinlock in wait_queue_head_t held. | 4469 | * Same as __wake_up but called with the spinlock in wait_queue_head_t held. |
4471 | */ | 4470 | */ |
4472 | void __wake_up_locked(wait_queue_head_t *q, unsigned int mode) | 4471 | void __wake_up_locked(wait_queue_head_t *q, unsigned int mode) |
4473 | { | 4472 | { |
4474 | __wake_up_common(q, mode, 1, 0, NULL); | 4473 | __wake_up_common(q, mode, 1, 0, NULL); |
4475 | } | 4474 | } |
4476 | EXPORT_SYMBOL_GPL(__wake_up_locked); | 4475 | EXPORT_SYMBOL_GPL(__wake_up_locked); |
4477 | 4476 | ||
4478 | void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key) | 4477 | void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key) |
4479 | { | 4478 | { |
4480 | __wake_up_common(q, mode, 1, 0, key); | 4479 | __wake_up_common(q, mode, 1, 0, key); |
4481 | } | 4480 | } |
4482 | EXPORT_SYMBOL_GPL(__wake_up_locked_key); | 4481 | EXPORT_SYMBOL_GPL(__wake_up_locked_key); |
4483 | 4482 | ||
4484 | /** | 4483 | /** |
4485 | * __wake_up_sync_key - wake up threads blocked on a waitqueue. | 4484 | * __wake_up_sync_key - wake up threads blocked on a waitqueue. |
4486 | * @q: the waitqueue | 4485 | * @q: the waitqueue |
4487 | * @mode: which threads | 4486 | * @mode: which threads |
4488 | * @nr_exclusive: how many wake-one or wake-many threads to wake up | 4487 | * @nr_exclusive: how many wake-one or wake-many threads to wake up |
4489 | * @key: opaque value to be passed to wakeup targets | 4488 | * @key: opaque value to be passed to wakeup targets |
4490 | * | 4489 | * |
4491 | * The sync wakeup differs in that the waker knows that it will schedule | 4490 | * The sync wakeup differs in that the waker knows that it will schedule |
4492 | * away soon, so while the target thread will be woken up, it will not | 4491 | * away soon, so while the target thread will be woken up, it will not |
4493 | * be migrated to another CPU - ie. the two threads are 'synchronized' | 4492 | * be migrated to another CPU - ie. the two threads are 'synchronized' |
4494 | * with each other. This can prevent needless bouncing between CPUs. | 4493 | * with each other. This can prevent needless bouncing between CPUs. |
4495 | * | 4494 | * |
4496 | * On UP it can prevent extra preemption. | 4495 | * On UP it can prevent extra preemption. |
4497 | * | 4496 | * |
4498 | * It may be assumed that this function implies a write memory barrier before | 4497 | * It may be assumed that this function implies a write memory barrier before |
4499 | * changing the task state if and only if any tasks are woken up. | 4498 | * changing the task state if and only if any tasks are woken up. |
4500 | */ | 4499 | */ |
4501 | void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode, | 4500 | void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode, |
4502 | int nr_exclusive, void *key) | 4501 | int nr_exclusive, void *key) |
4503 | { | 4502 | { |
4504 | unsigned long flags; | 4503 | unsigned long flags; |
4505 | int wake_flags = WF_SYNC; | 4504 | int wake_flags = WF_SYNC; |
4506 | 4505 | ||
4507 | if (unlikely(!q)) | 4506 | if (unlikely(!q)) |
4508 | return; | 4507 | return; |
4509 | 4508 | ||
4510 | if (unlikely(!nr_exclusive)) | 4509 | if (unlikely(!nr_exclusive)) |
4511 | wake_flags = 0; | 4510 | wake_flags = 0; |
4512 | 4511 | ||
4513 | spin_lock_irqsave(&q->lock, flags); | 4512 | spin_lock_irqsave(&q->lock, flags); |
4514 | __wake_up_common(q, mode, nr_exclusive, wake_flags, key); | 4513 | __wake_up_common(q, mode, nr_exclusive, wake_flags, key); |
4515 | spin_unlock_irqrestore(&q->lock, flags); | 4514 | spin_unlock_irqrestore(&q->lock, flags); |
4516 | } | 4515 | } |
4517 | EXPORT_SYMBOL_GPL(__wake_up_sync_key); | 4516 | EXPORT_SYMBOL_GPL(__wake_up_sync_key); |
4518 | 4517 | ||
4519 | /* | 4518 | /* |
4520 | * __wake_up_sync - see __wake_up_sync_key() | 4519 | * __wake_up_sync - see __wake_up_sync_key() |
4521 | */ | 4520 | */ |
4522 | void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive) | 4521 | void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive) |
4523 | { | 4522 | { |
4524 | __wake_up_sync_key(q, mode, nr_exclusive, NULL); | 4523 | __wake_up_sync_key(q, mode, nr_exclusive, NULL); |
4525 | } | 4524 | } |
4526 | EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */ | 4525 | EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */ |
4527 | 4526 | ||
4528 | /** | 4527 | /** |
4529 | * complete: - signals a single thread waiting on this completion | 4528 | * complete: - signals a single thread waiting on this completion |
4530 | * @x: holds the state of this particular completion | 4529 | * @x: holds the state of this particular completion |
4531 | * | 4530 | * |
4532 | * This will wake up a single thread waiting on this completion. Threads will be | 4531 | * This will wake up a single thread waiting on this completion. Threads will be |
4533 | * awakened in the same order in which they were queued. | 4532 | * awakened in the same order in which they were queued. |
4534 | * | 4533 | * |
4535 | * See also complete_all(), wait_for_completion() and related routines. | 4534 | * See also complete_all(), wait_for_completion() and related routines. |
4536 | * | 4535 | * |
4537 | * It may be assumed that this function implies a write memory barrier before | 4536 | * It may be assumed that this function implies a write memory barrier before |
4538 | * changing the task state if and only if any tasks are woken up. | 4537 | * changing the task state if and only if any tasks are woken up. |
4539 | */ | 4538 | */ |
4540 | void complete(struct completion *x) | 4539 | void complete(struct completion *x) |
4541 | { | 4540 | { |
4542 | unsigned long flags; | 4541 | unsigned long flags; |
4543 | 4542 | ||
4544 | spin_lock_irqsave(&x->wait.lock, flags); | 4543 | spin_lock_irqsave(&x->wait.lock, flags); |
4545 | x->done++; | 4544 | x->done++; |
4546 | __wake_up_common(&x->wait, TASK_NORMAL, 1, 0, NULL); | 4545 | __wake_up_common(&x->wait, TASK_NORMAL, 1, 0, NULL); |
4547 | spin_unlock_irqrestore(&x->wait.lock, flags); | 4546 | spin_unlock_irqrestore(&x->wait.lock, flags); |
4548 | } | 4547 | } |
4549 | EXPORT_SYMBOL(complete); | 4548 | EXPORT_SYMBOL(complete); |
4550 | 4549 | ||
4551 | /** | 4550 | /** |
4552 | * complete_all: - signals all threads waiting on this completion | 4551 | * complete_all: - signals all threads waiting on this completion |
4553 | * @x: holds the state of this particular completion | 4552 | * @x: holds the state of this particular completion |
4554 | * | 4553 | * |
4555 | * This will wake up all threads waiting on this particular completion event. | 4554 | * This will wake up all threads waiting on this particular completion event. |
4556 | * | 4555 | * |
4557 | * It may be assumed that this function implies a write memory barrier before | 4556 | * It may be assumed that this function implies a write memory barrier before |
4558 | * changing the task state if and only if any tasks are woken up. | 4557 | * changing the task state if and only if any tasks are woken up. |
4559 | */ | 4558 | */ |
4560 | void complete_all(struct completion *x) | 4559 | void complete_all(struct completion *x) |
4561 | { | 4560 | { |
4562 | unsigned long flags; | 4561 | unsigned long flags; |
4563 | 4562 | ||
4564 | spin_lock_irqsave(&x->wait.lock, flags); | 4563 | spin_lock_irqsave(&x->wait.lock, flags); |
4565 | x->done += UINT_MAX/2; | 4564 | x->done += UINT_MAX/2; |
4566 | __wake_up_common(&x->wait, TASK_NORMAL, 0, 0, NULL); | 4565 | __wake_up_common(&x->wait, TASK_NORMAL, 0, 0, NULL); |
4567 | spin_unlock_irqrestore(&x->wait.lock, flags); | 4566 | spin_unlock_irqrestore(&x->wait.lock, flags); |
4568 | } | 4567 | } |
4569 | EXPORT_SYMBOL(complete_all); | 4568 | EXPORT_SYMBOL(complete_all); |
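A minimal sketch of the complete_all() pattern, with an invented setup_done completion acting as a one-shot gate: every thread already blocked in wait_for_completion() is released at once, and because ->done is bumped by UINT_MAX/2, later waiters fall straight through as well.

#include <linux/completion.h>

static DECLARE_COMPLETION(setup_done);

static int worker_fn(void *unused)
{
	wait_for_completion(&setup_done);	/* park until setup has finished */
	/* ... do the real work ... */
	return 0;
}

static void setup_finished(void)
{
	complete_all(&setup_done);	/* release current and future waiters */
}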
4570 | 4569 | ||
4571 | static inline long __sched | 4570 | static inline long __sched |
4572 | do_wait_for_common(struct completion *x, long timeout, int state) | 4571 | do_wait_for_common(struct completion *x, long timeout, int state) |
4573 | { | 4572 | { |
4574 | if (!x->done) { | 4573 | if (!x->done) { |
4575 | DECLARE_WAITQUEUE(wait, current); | 4574 | DECLARE_WAITQUEUE(wait, current); |
4576 | 4575 | ||
4577 | __add_wait_queue_tail_exclusive(&x->wait, &wait); | 4576 | __add_wait_queue_tail_exclusive(&x->wait, &wait); |
4578 | do { | 4577 | do { |
4579 | if (signal_pending_state(state, current)) { | 4578 | if (signal_pending_state(state, current)) { |
4580 | timeout = -ERESTARTSYS; | 4579 | timeout = -ERESTARTSYS; |
4581 | break; | 4580 | break; |
4582 | } | 4581 | } |
4583 | __set_current_state(state); | 4582 | __set_current_state(state); |
4584 | spin_unlock_irq(&x->wait.lock); | 4583 | spin_unlock_irq(&x->wait.lock); |
4585 | timeout = schedule_timeout(timeout); | 4584 | timeout = schedule_timeout(timeout); |
4586 | spin_lock_irq(&x->wait.lock); | 4585 | spin_lock_irq(&x->wait.lock); |
4587 | } while (!x->done && timeout); | 4586 | } while (!x->done && timeout); |
4588 | __remove_wait_queue(&x->wait, &wait); | 4587 | __remove_wait_queue(&x->wait, &wait); |
4589 | if (!x->done) | 4588 | if (!x->done) |
4590 | return timeout; | 4589 | return timeout; |
4591 | } | 4590 | } |
4592 | x->done--; | 4591 | x->done--; |
4593 | return timeout ?: 1; | 4592 | return timeout ?: 1; |
4594 | } | 4593 | } |
4595 | 4594 | ||
4596 | static long __sched | 4595 | static long __sched |
4597 | wait_for_common(struct completion *x, long timeout, int state) | 4596 | wait_for_common(struct completion *x, long timeout, int state) |
4598 | { | 4597 | { |
4599 | might_sleep(); | 4598 | might_sleep(); |
4600 | 4599 | ||
4601 | spin_lock_irq(&x->wait.lock); | 4600 | spin_lock_irq(&x->wait.lock); |
4602 | timeout = do_wait_for_common(x, timeout, state); | 4601 | timeout = do_wait_for_common(x, timeout, state); |
4603 | spin_unlock_irq(&x->wait.lock); | 4602 | spin_unlock_irq(&x->wait.lock); |
4604 | return timeout; | 4603 | return timeout; |
4605 | } | 4604 | } |
4606 | 4605 | ||
4607 | /** | 4606 | /** |
4608 | * wait_for_completion: - waits for completion of a task | 4607 | * wait_for_completion: - waits for completion of a task |
4609 | * @x: holds the state of this particular completion | 4608 | * @x: holds the state of this particular completion |
4610 | * | 4609 | * |
4611 | * This waits to be signaled for completion of a specific task. It is NOT | 4610 | * This waits to be signaled for completion of a specific task. It is NOT |
4612 | * interruptible and there is no timeout. | 4611 | * interruptible and there is no timeout. |
4613 | * | 4612 | * |
4614 | * See also similar routines (i.e. wait_for_completion_timeout()) with timeout | 4613 | * See also similar routines (i.e. wait_for_completion_timeout()) with timeout |
4615 | * and interrupt capability. Also see complete(). | 4614 | * and interrupt capability. Also see complete(). |
4616 | */ | 4615 | */ |
4617 | void __sched wait_for_completion(struct completion *x) | 4616 | void __sched wait_for_completion(struct completion *x) |
4618 | { | 4617 | { |
4619 | wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE); | 4618 | wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE); |
4620 | } | 4619 | } |
4621 | EXPORT_SYMBOL(wait_for_completion); | 4620 | EXPORT_SYMBOL(wait_for_completion); |
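A minimal producer/consumer sketch (the completion, IRQ handler and helper names are made up for the example): the waiter blocks uninterruptibly until complete() is called, here from an interrupt handler signalling that a transfer finished.

#include <linux/completion.h>
#include <linux/interrupt.h>

static struct completion xfer_done;

static irqreturn_t xfer_irq(int irq, void *dev_id)
{
	complete(&xfer_done);			/* wake exactly one waiter */
	return IRQ_HANDLED;
}

static void start_and_wait(void)
{
	init_completion(&xfer_done);
	/* ... start the hardware transfer that will raise xfer_irq ... */
	wait_for_completion(&xfer_done);	/* sleeps in TASK_UNINTERRUPTIBLE */
}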
4622 | 4621 | ||
4623 | /** | 4622 | /** |
4624 | * wait_for_completion_timeout: - waits for completion of a task (w/timeout) | 4623 | * wait_for_completion_timeout: - waits for completion of a task (w/timeout) |
4625 | * @x: holds the state of this particular completion | 4624 | * @x: holds the state of this particular completion |
4626 | * @timeout: timeout value in jiffies | 4625 | * @timeout: timeout value in jiffies |
4627 | * | 4626 | * |
4628 | * This waits for either a completion of a specific task to be signaled or for a | 4627 | * This waits for either a completion of a specific task to be signaled or for a |
4629 | * specified timeout to expire. The timeout is in jiffies. It is not | 4628 | * specified timeout to expire. The timeout is in jiffies. It is not |
4630 | * interruptible. | 4629 | * interruptible. |
4631 | */ | 4630 | */ |
4632 | unsigned long __sched | 4631 | unsigned long __sched |
4633 | wait_for_completion_timeout(struct completion *x, unsigned long timeout) | 4632 | wait_for_completion_timeout(struct completion *x, unsigned long timeout) |
4634 | { | 4633 | { |
4635 | return wait_for_common(x, timeout, TASK_UNINTERRUPTIBLE); | 4634 | return wait_for_common(x, timeout, TASK_UNINTERRUPTIBLE); |
4636 | } | 4635 | } |
4637 | EXPORT_SYMBOL(wait_for_completion_timeout); | 4636 | EXPORT_SYMBOL(wait_for_completion_timeout); |
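The return value encodes the outcome: 0 means the timeout expired first, a positive value is the number of jiffies that were left when the completion arrived. A small sketch, with the 500 ms budget chosen arbitrarily:

#include <linux/completion.h>
#include <linux/errno.h>
#include <linux/jiffies.h>

static int wait_for_reply(struct completion *reply)
{
	unsigned long left;

	left = wait_for_completion_timeout(reply, msecs_to_jiffies(500));
	if (!left)
		return -ETIMEDOUT;	/* no completion within the budget */
	return 0;			/* completed with 'left' jiffies to spare */
}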
4638 | 4637 | ||
4639 | /** | 4638 | /** |
4640 | * wait_for_completion_interruptible: - waits for completion of a task (w/intr) | 4639 | * wait_for_completion_interruptible: - waits for completion of a task (w/intr) |
4641 | * @x: holds the state of this particular completion | 4640 | * @x: holds the state of this particular completion |
4642 | * | 4641 | * |
4643 | * This waits for completion of a specific task to be signaled. It is | 4642 | * This waits for completion of a specific task to be signaled. It is |
4644 | * interruptible. | 4643 | * interruptible. |
4645 | */ | 4644 | */ |
4646 | int __sched wait_for_completion_interruptible(struct completion *x) | 4645 | int __sched wait_for_completion_interruptible(struct completion *x) |
4647 | { | 4646 | { |
4648 | long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE); | 4647 | long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE); |
4649 | if (t == -ERESTARTSYS) | 4648 | if (t == -ERESTARTSYS) |
4650 | return t; | 4649 | return t; |
4651 | return 0; | 4650 | return 0; |
4652 | } | 4651 | } |
4653 | EXPORT_SYMBOL(wait_for_completion_interruptible); | 4652 | EXPORT_SYMBOL(wait_for_completion_interruptible); |
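Here 0 means the completion arrived and -ERESTARTSYS means a signal interrupted the sleep; callers typically just propagate the error so the syscall can be restarted. A short sketch:

#include <linux/completion.h>

static int wait_until_ready(struct completion *ready)
{
	int ret = wait_for_completion_interruptible(ready);

	if (ret)		/* -ERESTARTSYS: a signal arrived first */
		return ret;
	return 0;
}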
4654 | 4653 | ||
4655 | /** | 4654 | /** |
4656 | * wait_for_completion_interruptible_timeout: - waits for completion (w/(to,intr)) | 4655 | * wait_for_completion_interruptible_timeout: - waits for completion (w/(to,intr)) |
4657 | * @x: holds the state of this particular completion | 4656 | * @x: holds the state of this particular completion |
4658 | * @timeout: timeout value in jiffies | 4657 | * @timeout: timeout value in jiffies |
4659 | * | 4658 | * |
4660 | * This waits for either a completion of a specific task to be signaled or for a | 4659 | * This waits for either a completion of a specific task to be signaled or for a |
4661 | * specified timeout to expire. It is interruptible. The timeout is in jiffies. | 4660 | * specified timeout to expire. It is interruptible. The timeout is in jiffies. |
4662 | */ | 4661 | */ |
4663 | long __sched | 4662 | long __sched |
4664 | wait_for_completion_interruptible_timeout(struct completion *x, | 4663 | wait_for_completion_interruptible_timeout(struct completion *x, |
4665 | unsigned long timeout) | 4664 | unsigned long timeout) |
4666 | { | 4665 | { |
4667 | return wait_for_common(x, timeout, TASK_INTERRUPTIBLE); | 4666 | return wait_for_common(x, timeout, TASK_INTERRUPTIBLE); |
4668 | } | 4667 | } |
4669 | EXPORT_SYMBOL(wait_for_completion_interruptible_timeout); | 4668 | EXPORT_SYMBOL(wait_for_completion_interruptible_timeout); |
4670 | 4669 | ||
4671 | /** | 4670 | /** |
4672 | * wait_for_completion_killable: - waits for completion of a task (killable) | 4671 | * wait_for_completion_killable: - waits for completion of a task (killable) |
4673 | * @x: holds the state of this particular completion | 4672 | * @x: holds the state of this particular completion |
4674 | * | 4673 | * |
4675 | * This waits to be signaled for completion of a specific task. It can be | 4674 | * This waits to be signaled for completion of a specific task. It can be |
4676 | * interrupted by a kill signal. | 4675 | * interrupted by a kill signal. |
4677 | */ | 4676 | */ |
4678 | int __sched wait_for_completion_killable(struct completion *x) | 4677 | int __sched wait_for_completion_killable(struct completion *x) |
4679 | { | 4678 | { |
4680 | long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_KILLABLE); | 4679 | long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_KILLABLE); |
4681 | if (t == -ERESTARTSYS) | 4680 | if (t == -ERESTARTSYS) |
4682 | return t; | 4681 | return t; |
4683 | return 0; | 4682 | return 0; |
4684 | } | 4683 | } |
4685 | EXPORT_SYMBOL(wait_for_completion_killable); | 4684 | EXPORT_SYMBOL(wait_for_completion_killable); |
4686 | 4685 | ||
4687 | /** | 4686 | /** |
4688 | * wait_for_completion_killable_timeout: - waits for completion of a task (w/(to,killable)) | 4687 | * wait_for_completion_killable_timeout: - waits for completion of a task (w/(to,killable)) |
4689 | * @x: holds the state of this particular completion | 4688 | * @x: holds the state of this particular completion |
4690 | * @timeout: timeout value in jiffies | 4689 | * @timeout: timeout value in jiffies |
4691 | * | 4690 | * |
4692 | * This waits for either a completion of a specific task to be | 4691 | * This waits for either a completion of a specific task to be |
4693 | * signaled or for a specified timeout to expire. It can be | 4692 | * signaled or for a specified timeout to expire. It can be |
4694 | * interrupted by a kill signal. The timeout is in jiffies. | 4693 | * interrupted by a kill signal. The timeout is in jiffies. |
4695 | */ | 4694 | */ |
4696 | long __sched | 4695 | long __sched |
4697 | wait_for_completion_killable_timeout(struct completion *x, | 4696 | wait_for_completion_killable_timeout(struct completion *x, |
4698 | unsigned long timeout) | 4697 | unsigned long timeout) |
4699 | { | 4698 | { |
4700 | return wait_for_common(x, timeout, TASK_KILLABLE); | 4699 | return wait_for_common(x, timeout, TASK_KILLABLE); |
4701 | } | 4700 | } |
4702 | EXPORT_SYMBOL(wait_for_completion_killable_timeout); | 4701 | EXPORT_SYMBOL(wait_for_completion_killable_timeout); |
4703 | 4702 | ||
4704 | /** | 4703 | /** |
4705 | * try_wait_for_completion - try to decrement a completion without blocking | 4704 | * try_wait_for_completion - try to decrement a completion without blocking |
4706 | * @x: completion structure | 4705 | * @x: completion structure |
4707 | * | 4706 | * |
4708 | * Returns: 0 if a decrement cannot be done without blocking | 4707 | * Returns: 0 if a decrement cannot be done without blocking |
4709 | * 1 if a decrement succeeded. | 4708 | * 1 if a decrement succeeded. |
4710 | * | 4709 | * |
4711 | * If a completion is being used as a counting completion, | 4710 | * If a completion is being used as a counting completion, |
4712 | * attempt to decrement the counter without blocking. This | 4711 | * attempt to decrement the counter without blocking. This |
4713 | * enables us to avoid waiting if the resource the completion | 4712 | * enables us to avoid waiting if the resource the completion |
4714 | * is protecting is not available. | 4713 | * is protecting is not available. |
4715 | */ | 4714 | */ |
4716 | bool try_wait_for_completion(struct completion *x) | 4715 | bool try_wait_for_completion(struct completion *x) |
4717 | { | 4716 | { |
4718 | unsigned long flags; | 4717 | unsigned long flags; |
4719 | int ret = 1; | 4718 | int ret = 1; |
4720 | 4719 | ||
4721 | spin_lock_irqsave(&x->wait.lock, flags); | 4720 | spin_lock_irqsave(&x->wait.lock, flags); |
4722 | if (!x->done) | 4721 | if (!x->done) |
4723 | ret = 0; | 4722 | ret = 0; |
4724 | else | 4723 | else |
4725 | x->done--; | 4724 | x->done--; |
4726 | spin_unlock_irqrestore(&x->wait.lock, flags); | 4725 | spin_unlock_irqrestore(&x->wait.lock, flags); |
4727 | return ret; | 4726 | return ret; |
4728 | } | 4727 | } |
4729 | EXPORT_SYMBOL(try_wait_for_completion); | 4728 | EXPORT_SYMBOL(try_wait_for_completion); |
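A sketch of the counting use mentioned above, with an invented token pool where ->done plays the role of the number of free tokens; the fast path grabs one without sleeping and simply reports failure if none is available:

#include <linux/completion.h>

static DECLARE_COMPLETION(tokens);	/* ->done == number of free tokens */

static bool try_get_token(void)
{
	return try_wait_for_completion(&tokens);	/* true if one was taken */
}

static void put_token(void)
{
	complete(&tokens);	/* hand a token back, waking one waiter if any */
}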
4730 | 4729 | ||
4731 | /** | 4730 | /** |
4732 | * completion_done - Test to see if a completion has any waiters | 4731 | * completion_done - Test to see if a completion has any waiters |
4733 | * @x: completion structure | 4732 | * @x: completion structure |
4734 | * | 4733 | * |
4735 | * Returns: 0 if there are waiters (wait_for_completion() in progress) | 4734 | * Returns: 0 if there are waiters (wait_for_completion() in progress) |
4736 | * 1 if there are no waiters. | 4735 | * 1 if there are no waiters. |
4737 | * | 4736 | * |
4738 | */ | 4737 | */ |
4739 | bool completion_done(struct completion *x) | 4738 | bool completion_done(struct completion *x) |
4740 | { | 4739 | { |
4741 | unsigned long flags; | 4740 | unsigned long flags; |
4742 | int ret = 1; | 4741 | int ret = 1; |
4743 | 4742 | ||
4744 | spin_lock_irqsave(&x->wait.lock, flags); | 4743 | spin_lock_irqsave(&x->wait.lock, flags); |
4745 | if (!x->done) | 4744 | if (!x->done) |
4746 | ret = 0; | 4745 | ret = 0; |
4747 | spin_unlock_irqrestore(&x->wait.lock, flags); | 4746 | spin_unlock_irqrestore(&x->wait.lock, flags); |
4748 | return ret; | 4747 | return ret; |
4749 | } | 4748 | } |
4750 | EXPORT_SYMBOL(completion_done); | 4749 | EXPORT_SYMBOL(completion_done); |
4751 | 4750 | ||
4752 | static long __sched | 4751 | static long __sched |
4753 | sleep_on_common(wait_queue_head_t *q, int state, long timeout) | 4752 | sleep_on_common(wait_queue_head_t *q, int state, long timeout) |
4754 | { | 4753 | { |
4755 | unsigned long flags; | 4754 | unsigned long flags; |
4756 | wait_queue_t wait; | 4755 | wait_queue_t wait; |
4757 | 4756 | ||
4758 | init_waitqueue_entry(&wait, current); | 4757 | init_waitqueue_entry(&wait, current); |
4759 | 4758 | ||
4760 | __set_current_state(state); | 4759 | __set_current_state(state); |
4761 | 4760 | ||
4762 | spin_lock_irqsave(&q->lock, flags); | 4761 | spin_lock_irqsave(&q->lock, flags); |
4763 | __add_wait_queue(q, &wait); | 4762 | __add_wait_queue(q, &wait); |
4764 | spin_unlock(&q->lock); | 4763 | spin_unlock(&q->lock); |
4765 | timeout = schedule_timeout(timeout); | 4764 | timeout = schedule_timeout(timeout); |
4766 | spin_lock_irq(&q->lock); | 4765 | spin_lock_irq(&q->lock); |
4767 | __remove_wait_queue(q, &wait); | 4766 | __remove_wait_queue(q, &wait); |
4768 | spin_unlock_irqrestore(&q->lock, flags); | 4767 | spin_unlock_irqrestore(&q->lock, flags); |
4769 | 4768 | ||
4770 | return timeout; | 4769 | return timeout; |
4771 | } | 4770 | } |
4772 | 4771 | ||
4773 | void __sched interruptible_sleep_on(wait_queue_head_t *q) | 4772 | void __sched interruptible_sleep_on(wait_queue_head_t *q) |
4774 | { | 4773 | { |
4775 | sleep_on_common(q, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT); | 4774 | sleep_on_common(q, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT); |
4776 | } | 4775 | } |
4777 | EXPORT_SYMBOL(interruptible_sleep_on); | 4776 | EXPORT_SYMBOL(interruptible_sleep_on); |
4778 | 4777 | ||
4779 | long __sched | 4778 | long __sched |
4780 | interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout) | 4779 | interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout) |
4781 | { | 4780 | { |
4782 | return sleep_on_common(q, TASK_INTERRUPTIBLE, timeout); | 4781 | return sleep_on_common(q, TASK_INTERRUPTIBLE, timeout); |
4783 | } | 4782 | } |
4784 | EXPORT_SYMBOL(interruptible_sleep_on_timeout); | 4783 | EXPORT_SYMBOL(interruptible_sleep_on_timeout); |
4785 | 4784 | ||
4786 | void __sched sleep_on(wait_queue_head_t *q) | 4785 | void __sched sleep_on(wait_queue_head_t *q) |
4787 | { | 4786 | { |
4788 | sleep_on_common(q, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT); | 4787 | sleep_on_common(q, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT); |
4789 | } | 4788 | } |
4790 | EXPORT_SYMBOL(sleep_on); | 4789 | EXPORT_SYMBOL(sleep_on); |
4791 | 4790 | ||
4792 | long __sched sleep_on_timeout(wait_queue_head_t *q, long timeout) | 4791 | long __sched sleep_on_timeout(wait_queue_head_t *q, long timeout) |
4793 | { | 4792 | { |
4794 | return sleep_on_common(q, TASK_UNINTERRUPTIBLE, timeout); | 4793 | return sleep_on_common(q, TASK_UNINTERRUPTIBLE, timeout); |
4795 | } | 4794 | } |
4796 | EXPORT_SYMBOL(sleep_on_timeout); | 4795 | EXPORT_SYMBOL(sleep_on_timeout); |
4797 | 4796 | ||
4798 | #ifdef CONFIG_RT_MUTEXES | 4797 | #ifdef CONFIG_RT_MUTEXES |
4799 | 4798 | ||
4800 | /* | 4799 | /* |
4801 | * rt_mutex_setprio - set the current priority of a task | 4800 | * rt_mutex_setprio - set the current priority of a task |
4802 | * @p: task | 4801 | * @p: task |
4803 | * @prio: prio value (kernel-internal form) | 4802 | * @prio: prio value (kernel-internal form) |
4804 | * | 4803 | * |
4805 | * This function changes the 'effective' priority of a task. It does | 4804 | * This function changes the 'effective' priority of a task. It does |
4806 | * not touch ->normal_prio like __setscheduler(). | 4805 | * not touch ->normal_prio like __setscheduler(). |
4807 | * | 4806 | * |
4808 | * Used by the rt_mutex code to implement priority inheritance logic. | 4807 | * Used by the rt_mutex code to implement priority inheritance logic. |
4809 | */ | 4808 | */ |
4810 | void rt_mutex_setprio(struct task_struct *p, int prio) | 4809 | void rt_mutex_setprio(struct task_struct *p, int prio) |
4811 | { | 4810 | { |
4812 | int oldprio, on_rq, running; | 4811 | int oldprio, on_rq, running; |
4813 | struct rq *rq; | 4812 | struct rq *rq; |
4814 | const struct sched_class *prev_class; | 4813 | const struct sched_class *prev_class; |
4815 | 4814 | ||
4816 | BUG_ON(prio < 0 || prio > MAX_PRIO); | 4815 | BUG_ON(prio < 0 || prio > MAX_PRIO); |
4817 | 4816 | ||
4818 | rq = __task_rq_lock(p); | 4817 | rq = __task_rq_lock(p); |
4819 | 4818 | ||
4820 | trace_sched_pi_setprio(p, prio); | 4819 | trace_sched_pi_setprio(p, prio); |
4821 | oldprio = p->prio; | 4820 | oldprio = p->prio; |
4822 | prev_class = p->sched_class; | 4821 | prev_class = p->sched_class; |
4823 | on_rq = p->on_rq; | 4822 | on_rq = p->on_rq; |
4824 | running = task_current(rq, p); | 4823 | running = task_current(rq, p); |
4825 | if (on_rq) | 4824 | if (on_rq) |
4826 | dequeue_task(rq, p, 0); | 4825 | dequeue_task(rq, p, 0); |
4827 | if (running) | 4826 | if (running) |
4828 | p->sched_class->put_prev_task(rq, p); | 4827 | p->sched_class->put_prev_task(rq, p); |
4829 | 4828 | ||
4830 | if (rt_prio(prio)) | 4829 | if (rt_prio(prio)) |
4831 | p->sched_class = &rt_sched_class; | 4830 | p->sched_class = &rt_sched_class; |
4832 | else | 4831 | else |
4833 | p->sched_class = &fair_sched_class; | 4832 | p->sched_class = &fair_sched_class; |
4834 | 4833 | ||
4835 | p->prio = prio; | 4834 | p->prio = prio; |
4836 | 4835 | ||
4837 | if (running) | 4836 | if (running) |
4838 | p->sched_class->set_curr_task(rq); | 4837 | p->sched_class->set_curr_task(rq); |
4839 | if (on_rq) | 4838 | if (on_rq) |
4840 | enqueue_task(rq, p, oldprio < prio ? ENQUEUE_HEAD : 0); | 4839 | enqueue_task(rq, p, oldprio < prio ? ENQUEUE_HEAD : 0); |
4841 | 4840 | ||
4842 | check_class_changed(rq, p, prev_class, oldprio); | 4841 | check_class_changed(rq, p, prev_class, oldprio); |
4843 | __task_rq_unlock(rq); | 4842 | __task_rq_unlock(rq); |
4844 | } | 4843 | } |
4845 | 4844 | ||
4846 | #endif | 4845 | #endif |
4847 | 4846 | ||
4848 | void set_user_nice(struct task_struct *p, long nice) | 4847 | void set_user_nice(struct task_struct *p, long nice) |
4849 | { | 4848 | { |
4850 | int old_prio, delta, on_rq; | 4849 | int old_prio, delta, on_rq; |
4851 | unsigned long flags; | 4850 | unsigned long flags; |
4852 | struct rq *rq; | 4851 | struct rq *rq; |
4853 | 4852 | ||
4854 | if (TASK_NICE(p) == nice || nice < -20 || nice > 19) | 4853 | if (TASK_NICE(p) == nice || nice < -20 || nice > 19) |
4855 | return; | 4854 | return; |
4856 | /* | 4855 | /* |
4857 | * We have to be careful, if called from sys_setpriority(), | 4856 | * We have to be careful, if called from sys_setpriority(), |
4858 | * the task might be in the middle of scheduling on another CPU. | 4857 | * the task might be in the middle of scheduling on another CPU. |
4859 | */ | 4858 | */ |
4860 | rq = task_rq_lock(p, &flags); | 4859 | rq = task_rq_lock(p, &flags); |
4861 | /* | 4860 | /* |
4862 | * The RT priorities are set via sched_setscheduler(), but we still | 4861 | * The RT priorities are set via sched_setscheduler(), but we still |
4863 | * allow the 'normal' nice value to be set - but as expected | 4862 | * allow the 'normal' nice value to be set - but as expected |
4864 | * it won't have any effect on scheduling until the task is | 4863 | * it won't have any effect on scheduling until the task is |
4865 | * SCHED_FIFO/SCHED_RR: | 4864 | * SCHED_FIFO/SCHED_RR: |
4866 | */ | 4865 | */ |
4867 | if (task_has_rt_policy(p)) { | 4866 | if (task_has_rt_policy(p)) { |
4868 | p->static_prio = NICE_TO_PRIO(nice); | 4867 | p->static_prio = NICE_TO_PRIO(nice); |
4869 | goto out_unlock; | 4868 | goto out_unlock; |
4870 | } | 4869 | } |
4871 | on_rq = p->on_rq; | 4870 | on_rq = p->on_rq; |
4872 | if (on_rq) | 4871 | if (on_rq) |
4873 | dequeue_task(rq, p, 0); | 4872 | dequeue_task(rq, p, 0); |
4874 | 4873 | ||
4875 | p->static_prio = NICE_TO_PRIO(nice); | 4874 | p->static_prio = NICE_TO_PRIO(nice); |
4876 | set_load_weight(p); | 4875 | set_load_weight(p); |
4877 | old_prio = p->prio; | 4876 | old_prio = p->prio; |
4878 | p->prio = effective_prio(p); | 4877 | p->prio = effective_prio(p); |
4879 | delta = p->prio - old_prio; | 4878 | delta = p->prio - old_prio; |
4880 | 4879 | ||
4881 | if (on_rq) { | 4880 | if (on_rq) { |
4882 | enqueue_task(rq, p, 0); | 4881 | enqueue_task(rq, p, 0); |
4883 | /* | 4882 | /* |
4884 | * If the task increased its priority or is running and | 4883 | * If the task increased its priority or is running and |
4885 | * lowered its priority, then reschedule its CPU: | 4884 | * lowered its priority, then reschedule its CPU: |
4886 | */ | 4885 | */ |
4887 | if (delta < 0 || (delta > 0 && task_running(rq, p))) | 4886 | if (delta < 0 || (delta > 0 && task_running(rq, p))) |
4888 | resched_task(rq->curr); | 4887 | resched_task(rq->curr); |
4889 | } | 4888 | } |
4890 | out_unlock: | 4889 | out_unlock: |
4891 | task_rq_unlock(rq, p, &flags); | 4890 | task_rq_unlock(rq, p, &flags); |
4892 | } | 4891 | } |
4893 | EXPORT_SYMBOL(set_user_nice); | 4892 | EXPORT_SYMBOL(set_user_nice); |
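In-kernel callers mostly use this on their own kernel threads; a minimal sketch (the thread function is invented) of a background worker lowering itself to nice 10 so it yields to interactive work:

#include <linux/jiffies.h>
#include <linux/kthread.h>
#include <linux/sched.h>

static int scrub_thread(void *unused)
{
	set_user_nice(current, 10);	/* run below the default priority */

	while (!kthread_should_stop()) {
		/* ... one round of low-priority background work ... */
		schedule_timeout_interruptible(HZ);
	}
	return 0;
}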
4894 | 4893 | ||
4895 | /* | 4894 | /* |
4896 | * can_nice - check if a task can reduce its nice value | 4895 | * can_nice - check if a task can reduce its nice value |
4897 | * @p: task | 4896 | * @p: task |
4898 | * @nice: nice value | 4897 | * @nice: nice value |
4899 | */ | 4898 | */ |
4900 | int can_nice(const struct task_struct *p, const int nice) | 4899 | int can_nice(const struct task_struct *p, const int nice) |
4901 | { | 4900 | { |
4902 | /* convert nice value [19,-20] to rlimit style value [1,40] */ | 4901 | /* convert nice value [19,-20] to rlimit style value [1,40] */ |
4903 | int nice_rlim = 20 - nice; | 4902 | int nice_rlim = 20 - nice; |
4904 | 4903 | ||
4905 | return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) || | 4904 | return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) || |
4906 | capable(CAP_SYS_NICE)); | 4905 | capable(CAP_SYS_NICE)); |
4907 | } | 4906 | } |
4908 | 4907 | ||
4909 | #ifdef __ARCH_WANT_SYS_NICE | 4908 | #ifdef __ARCH_WANT_SYS_NICE |
4910 | 4909 | ||
4911 | /* | 4910 | /* |
4912 | * sys_nice - change the priority of the current process. | 4911 | * sys_nice - change the priority of the current process. |
4913 | * @increment: priority increment | 4912 | * @increment: priority increment |
4914 | * | 4913 | * |
4915 | * sys_setpriority is a more generic, but much slower function that | 4914 | * sys_setpriority is a more generic, but much slower function that |
4916 | * does similar things. | 4915 | * does similar things. |
4917 | */ | 4916 | */ |
4918 | SYSCALL_DEFINE1(nice, int, increment) | 4917 | SYSCALL_DEFINE1(nice, int, increment) |
4919 | { | 4918 | { |
4920 | long nice, retval; | 4919 | long nice, retval; |
4921 | 4920 | ||
4922 | /* | 4921 | /* |
4923 | * Setpriority might change our priority at the same moment. | 4922 | * Setpriority might change our priority at the same moment. |
4924 | * We don't have to worry. Conceptually one call occurs first | 4923 | * We don't have to worry. Conceptually one call occurs first |
4925 | * and we have a single winner. | 4924 | * and we have a single winner. |
4926 | */ | 4925 | */ |
4927 | if (increment < -40) | 4926 | if (increment < -40) |
4928 | increment = -40; | 4927 | increment = -40; |
4929 | if (increment > 40) | 4928 | if (increment > 40) |
4930 | increment = 40; | 4929 | increment = 40; |
4931 | 4930 | ||
4932 | nice = TASK_NICE(current) + increment; | 4931 | nice = TASK_NICE(current) + increment; |
4933 | if (nice < -20) | 4932 | if (nice < -20) |
4934 | nice = -20; | 4933 | nice = -20; |
4935 | if (nice > 19) | 4934 | if (nice > 19) |
4936 | nice = 19; | 4935 | nice = 19; |
4937 | 4936 | ||
4938 | if (increment < 0 && !can_nice(current, nice)) | 4937 | if (increment < 0 && !can_nice(current, nice)) |
4939 | return -EPERM; | 4938 | return -EPERM; |
4940 | 4939 | ||
4941 | retval = security_task_setnice(current, nice); | 4940 | retval = security_task_setnice(current, nice); |
4942 | if (retval) | 4941 | if (retval) |
4943 | return retval; | 4942 | return retval; |
4944 | 4943 | ||
4945 | set_user_nice(current, nice); | 4944 | set_user_nice(current, nice); |
4946 | return 0; | 4945 | return 0; |
4947 | } | 4946 | } |
4948 | 4947 | ||
4949 | #endif | 4948 | #endif |
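From userspace the same path is reached through the nice() library call; a hedged example of the usual error handling, where -1 only indicates failure if errno was set:

#include <errno.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int newnice;

	errno = 0;
	newnice = nice(5);		/* ask to move 5 nice levels down */
	if (newnice == -1 && errno != 0) {
		perror("nice");
		return 1;
	}
	printf("new nice value: %d\n", newnice);
	return 0;
}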
4950 | 4949 | ||
4951 | /** | 4950 | /** |
4952 | * task_prio - return the priority value of a given task. | 4951 | * task_prio - return the priority value of a given task. |
4953 | * @p: the task in question. | 4952 | * @p: the task in question. |
4954 | * | 4953 | * |
4955 | * This is the priority value as seen by users in /proc. | 4954 | * This is the priority value as seen by users in /proc. |
4956 | * The value is p->prio - MAX_RT_PRIO, so RT tasks end up negative | 4955 | * The value is p->prio - MAX_RT_PRIO, so RT tasks end up negative |
4957 | * and normal tasks map to 20 + nice, i.e. the range 0 to 39. | 4956 | * and normal tasks map to 20 + nice, i.e. the range 0 to 39. |
4958 | */ | 4957 | */ |
4959 | int task_prio(const struct task_struct *p) | 4958 | int task_prio(const struct task_struct *p) |
4960 | { | 4959 | { |
4961 | return p->prio - MAX_RT_PRIO; | 4960 | return p->prio - MAX_RT_PRIO; |
4962 | } | 4961 | } |
4963 | 4962 | ||
4964 | /** | 4963 | /** |
4965 | * task_nice - return the nice value of a given task. | 4964 | * task_nice - return the nice value of a given task. |
4966 | * @p: the task in question. | 4965 | * @p: the task in question. |
4967 | */ | 4966 | */ |
4968 | int task_nice(const struct task_struct *p) | 4967 | int task_nice(const struct task_struct *p) |
4969 | { | 4968 | { |
4970 | return TASK_NICE(p); | 4969 | return TASK_NICE(p); |
4971 | } | 4970 | } |
4972 | EXPORT_SYMBOL(task_nice); | 4971 | EXPORT_SYMBOL(task_nice); |
4973 | 4972 | ||
4974 | /** | 4973 | /** |
4975 | * idle_cpu - is a given cpu idle currently? | 4974 | * idle_cpu - is a given cpu idle currently? |
4976 | * @cpu: the processor in question. | 4975 | * @cpu: the processor in question. |
4977 | */ | 4976 | */ |
4978 | int idle_cpu(int cpu) | 4977 | int idle_cpu(int cpu) |
4979 | { | 4978 | { |
4980 | return cpu_curr(cpu) == cpu_rq(cpu)->idle; | 4979 | return cpu_curr(cpu) == cpu_rq(cpu)->idle; |
4981 | } | 4980 | } |
4982 | 4981 | ||
4983 | /** | 4982 | /** |
4984 | * idle_task - return the idle task for a given cpu. | 4983 | * idle_task - return the idle task for a given cpu. |
4985 | * @cpu: the processor in question. | 4984 | * @cpu: the processor in question. |
4986 | */ | 4985 | */ |
4987 | struct task_struct *idle_task(int cpu) | 4986 | struct task_struct *idle_task(int cpu) |
4988 | { | 4987 | { |
4989 | return cpu_rq(cpu)->idle; | 4988 | return cpu_rq(cpu)->idle; |
4990 | } | 4989 | } |
4991 | 4990 | ||
4992 | /** | 4991 | /** |
4993 | * find_process_by_pid - find a process with a matching PID value. | 4992 | * find_process_by_pid - find a process with a matching PID value. |
4994 | * @pid: the pid in question. | 4993 | * @pid: the pid in question. |
4995 | */ | 4994 | */ |
4996 | static struct task_struct *find_process_by_pid(pid_t pid) | 4995 | static struct task_struct *find_process_by_pid(pid_t pid) |
4997 | { | 4996 | { |
4998 | return pid ? find_task_by_vpid(pid) : current; | 4997 | return pid ? find_task_by_vpid(pid) : current; |
4999 | } | 4998 | } |
5000 | 4999 | ||
5001 | /* Actually do priority change: must hold rq lock. */ | 5000 | /* Actually do priority change: must hold rq lock. */ |
5002 | static void | 5001 | static void |
5003 | __setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio) | 5002 | __setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio) |
5004 | { | 5003 | { |
5005 | p->policy = policy; | 5004 | p->policy = policy; |
5006 | p->rt_priority = prio; | 5005 | p->rt_priority = prio; |
5007 | p->normal_prio = normal_prio(p); | 5006 | p->normal_prio = normal_prio(p); |
5008 | /* we are holding p->pi_lock already */ | 5007 | /* we are holding p->pi_lock already */ |
5009 | p->prio = rt_mutex_getprio(p); | 5008 | p->prio = rt_mutex_getprio(p); |
5010 | if (rt_prio(p->prio)) | 5009 | if (rt_prio(p->prio)) |
5011 | p->sched_class = &rt_sched_class; | 5010 | p->sched_class = &rt_sched_class; |
5012 | else | 5011 | else |
5013 | p->sched_class = &fair_sched_class; | 5012 | p->sched_class = &fair_sched_class; |
5014 | set_load_weight(p); | 5013 | set_load_weight(p); |
5015 | } | 5014 | } |
5016 | 5015 | ||
5017 | /* | 5016 | /* |
5018 | * check the target process has a UID that matches the current process's | 5017 | * check the target process has a UID that matches the current process's |
5019 | */ | 5018 | */ |
5020 | static bool check_same_owner(struct task_struct *p) | 5019 | static bool check_same_owner(struct task_struct *p) |
5021 | { | 5020 | { |
5022 | const struct cred *cred = current_cred(), *pcred; | 5021 | const struct cred *cred = current_cred(), *pcred; |
5023 | bool match; | 5022 | bool match; |
5024 | 5023 | ||
5025 | rcu_read_lock(); | 5024 | rcu_read_lock(); |
5026 | pcred = __task_cred(p); | 5025 | pcred = __task_cred(p); |
5027 | if (cred->user->user_ns == pcred->user->user_ns) | 5026 | if (cred->user->user_ns == pcred->user->user_ns) |
5028 | match = (cred->euid == pcred->euid || | 5027 | match = (cred->euid == pcred->euid || |
5029 | cred->euid == pcred->uid); | 5028 | cred->euid == pcred->uid); |
5030 | else | 5029 | else |
5031 | match = false; | 5030 | match = false; |
5032 | rcu_read_unlock(); | 5031 | rcu_read_unlock(); |
5033 | return match; | 5032 | return match; |
5034 | } | 5033 | } |
5035 | 5034 | ||
5036 | static int __sched_setscheduler(struct task_struct *p, int policy, | 5035 | static int __sched_setscheduler(struct task_struct *p, int policy, |
5037 | const struct sched_param *param, bool user) | 5036 | const struct sched_param *param, bool user) |
5038 | { | 5037 | { |
5039 | int retval, oldprio, oldpolicy = -1, on_rq, running; | 5038 | int retval, oldprio, oldpolicy = -1, on_rq, running; |
5040 | unsigned long flags; | 5039 | unsigned long flags; |
5041 | const struct sched_class *prev_class; | 5040 | const struct sched_class *prev_class; |
5042 | struct rq *rq; | 5041 | struct rq *rq; |
5043 | int reset_on_fork; | 5042 | int reset_on_fork; |
5044 | 5043 | ||
5045 | /* may grab non-irq protected spin_locks */ | 5044 | /* may grab non-irq protected spin_locks */ |
5046 | BUG_ON(in_interrupt()); | 5045 | BUG_ON(in_interrupt()); |
5047 | recheck: | 5046 | recheck: |
5048 | /* double check policy once rq lock held */ | 5047 | /* double check policy once rq lock held */ |
5049 | if (policy < 0) { | 5048 | if (policy < 0) { |
5050 | reset_on_fork = p->sched_reset_on_fork; | 5049 | reset_on_fork = p->sched_reset_on_fork; |
5051 | policy = oldpolicy = p->policy; | 5050 | policy = oldpolicy = p->policy; |
5052 | } else { | 5051 | } else { |
5053 | reset_on_fork = !!(policy & SCHED_RESET_ON_FORK); | 5052 | reset_on_fork = !!(policy & SCHED_RESET_ON_FORK); |
5054 | policy &= ~SCHED_RESET_ON_FORK; | 5053 | policy &= ~SCHED_RESET_ON_FORK; |
5055 | 5054 | ||
5056 | if (policy != SCHED_FIFO && policy != SCHED_RR && | 5055 | if (policy != SCHED_FIFO && policy != SCHED_RR && |
5057 | policy != SCHED_NORMAL && policy != SCHED_BATCH && | 5056 | policy != SCHED_NORMAL && policy != SCHED_BATCH && |
5058 | policy != SCHED_IDLE) | 5057 | policy != SCHED_IDLE) |
5059 | return -EINVAL; | 5058 | return -EINVAL; |
5060 | } | 5059 | } |
5061 | 5060 | ||
5062 | /* | 5061 | /* |
5063 | * Valid priorities for SCHED_FIFO and SCHED_RR are | 5062 | * Valid priorities for SCHED_FIFO and SCHED_RR are |
5064 | * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL, | 5063 | * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL, |
5065 | * SCHED_BATCH and SCHED_IDLE is 0. | 5064 | * SCHED_BATCH and SCHED_IDLE is 0. |
5066 | */ | 5065 | */ |
5067 | if (param->sched_priority < 0 || | 5066 | if (param->sched_priority < 0 || |
5068 | (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) || | 5067 | (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) || |
5069 | (!p->mm && param->sched_priority > MAX_RT_PRIO-1)) | 5068 | (!p->mm && param->sched_priority > MAX_RT_PRIO-1)) |
5070 | return -EINVAL; | 5069 | return -EINVAL; |
5071 | if (rt_policy(policy) != (param->sched_priority != 0)) | 5070 | if (rt_policy(policy) != (param->sched_priority != 0)) |
5072 | return -EINVAL; | 5071 | return -EINVAL; |
5073 | 5072 | ||
5074 | /* | 5073 | /* |
5075 | * Allow unprivileged RT tasks to decrease priority: | 5074 | * Allow unprivileged RT tasks to decrease priority: |
5076 | */ | 5075 | */ |
5077 | if (user && !capable(CAP_SYS_NICE)) { | 5076 | if (user && !capable(CAP_SYS_NICE)) { |
5078 | if (rt_policy(policy)) { | 5077 | if (rt_policy(policy)) { |
5079 | unsigned long rlim_rtprio = | 5078 | unsigned long rlim_rtprio = |
5080 | task_rlimit(p, RLIMIT_RTPRIO); | 5079 | task_rlimit(p, RLIMIT_RTPRIO); |
5081 | 5080 | ||
5082 | /* can't set/change the rt policy */ | 5081 | /* can't set/change the rt policy */ |
5083 | if (policy != p->policy && !rlim_rtprio) | 5082 | if (policy != p->policy && !rlim_rtprio) |
5084 | return -EPERM; | 5083 | return -EPERM; |
5085 | 5084 | ||
5086 | /* can't increase priority */ | 5085 | /* can't increase priority */ |
5087 | if (param->sched_priority > p->rt_priority && | 5086 | if (param->sched_priority > p->rt_priority && |
5088 | param->sched_priority > rlim_rtprio) | 5087 | param->sched_priority > rlim_rtprio) |
5089 | return -EPERM; | 5088 | return -EPERM; |
5090 | } | 5089 | } |
5091 | 5090 | ||
5092 | /* | 5091 | /* |
5093 | * Treat SCHED_IDLE as nice 20. Only allow a switch to | 5092 | * Treat SCHED_IDLE as nice 20. Only allow a switch to |
5094 | * SCHED_NORMAL if the RLIMIT_NICE would normally permit it. | 5093 | * SCHED_NORMAL if the RLIMIT_NICE would normally permit it. |
5095 | */ | 5094 | */ |
5096 | if (p->policy == SCHED_IDLE && policy != SCHED_IDLE) { | 5095 | if (p->policy == SCHED_IDLE && policy != SCHED_IDLE) { |
5097 | if (!can_nice(p, TASK_NICE(p))) | 5096 | if (!can_nice(p, TASK_NICE(p))) |
5098 | return -EPERM; | 5097 | return -EPERM; |
5099 | } | 5098 | } |
5100 | 5099 | ||
5101 | /* can't change other user's priorities */ | 5100 | /* can't change other user's priorities */ |
5102 | if (!check_same_owner(p)) | 5101 | if (!check_same_owner(p)) |
5103 | return -EPERM; | 5102 | return -EPERM; |
5104 | 5103 | ||
5105 | /* Normal users shall not reset the sched_reset_on_fork flag */ | 5104 | /* Normal users shall not reset the sched_reset_on_fork flag */ |
5106 | if (p->sched_reset_on_fork && !reset_on_fork) | 5105 | if (p->sched_reset_on_fork && !reset_on_fork) |
5107 | return -EPERM; | 5106 | return -EPERM; |
5108 | } | 5107 | } |
5109 | 5108 | ||
5110 | if (user) { | 5109 | if (user) { |
5111 | retval = security_task_setscheduler(p); | 5110 | retval = security_task_setscheduler(p); |
5112 | if (retval) | 5111 | if (retval) |
5113 | return retval; | 5112 | return retval; |
5114 | } | 5113 | } |
5115 | 5114 | ||
5116 | /* | 5115 | /* |
5117 | * make sure no PI-waiters arrive (or leave) while we are | 5116 | * make sure no PI-waiters arrive (or leave) while we are |
5118 | * changing the priority of the task: | 5117 | * changing the priority of the task: |
5119 | * | 5118 | * |
5120 | * To be able to change p->policy safely, the appropriate | 5119 | * To be able to change p->policy safely, the appropriate |
5121 | * runqueue lock must be held. | 5120 | * runqueue lock must be held. |
5122 | */ | 5121 | */ |
5123 | rq = task_rq_lock(p, &flags); | 5122 | rq = task_rq_lock(p, &flags); |
5124 | 5123 | ||
5125 | /* | 5124 | /* |
5126 | * Changing the policy of the stop threads is a very bad idea | 5125 | * Changing the policy of the stop threads is a very bad idea |
5127 | */ | 5126 | */ |
5128 | if (p == rq->stop) { | 5127 | if (p == rq->stop) { |
5129 | task_rq_unlock(rq, p, &flags); | 5128 | task_rq_unlock(rq, p, &flags); |
5130 | return -EINVAL; | 5129 | return -EINVAL; |
5131 | } | 5130 | } |
5132 | 5131 | ||
5133 | /* | 5132 | /* |
5134 | * If not changing anything there's no need to proceed further: | 5133 | * If not changing anything there's no need to proceed further: |
5135 | */ | 5134 | */ |
5136 | if (unlikely(policy == p->policy && (!rt_policy(policy) || | 5135 | if (unlikely(policy == p->policy && (!rt_policy(policy) || |
5137 | param->sched_priority == p->rt_priority))) { | 5136 | param->sched_priority == p->rt_priority))) { |
5138 | 5137 | ||
5139 | __task_rq_unlock(rq); | 5138 | __task_rq_unlock(rq); |
5140 | raw_spin_unlock_irqrestore(&p->pi_lock, flags); | 5139 | raw_spin_unlock_irqrestore(&p->pi_lock, flags); |
5141 | return 0; | 5140 | return 0; |
5142 | } | 5141 | } |
5143 | 5142 | ||
5144 | #ifdef CONFIG_RT_GROUP_SCHED | 5143 | #ifdef CONFIG_RT_GROUP_SCHED |
5145 | if (user) { | 5144 | if (user) { |
5146 | /* | 5145 | /* |
5147 | * Do not allow realtime tasks into groups that have no runtime | 5146 | * Do not allow realtime tasks into groups that have no runtime |
5148 | * assigned. | 5147 | * assigned. |
5149 | */ | 5148 | */ |
5150 | if (rt_bandwidth_enabled() && rt_policy(policy) && | 5149 | if (rt_bandwidth_enabled() && rt_policy(policy) && |
5151 | task_group(p)->rt_bandwidth.rt_runtime == 0 && | 5150 | task_group(p)->rt_bandwidth.rt_runtime == 0 && |
5152 | !task_group_is_autogroup(task_group(p))) { | 5151 | !task_group_is_autogroup(task_group(p))) { |
5153 | task_rq_unlock(rq, p, &flags); | 5152 | task_rq_unlock(rq, p, &flags); |
5154 | return -EPERM; | 5153 | return -EPERM; |
5155 | } | 5154 | } |
5156 | } | 5155 | } |
5157 | #endif | 5156 | #endif |
5158 | 5157 | ||
5159 | /* recheck policy now with rq lock held */ | 5158 | /* recheck policy now with rq lock held */ |
5160 | if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { | 5159 | if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { |
5161 | policy = oldpolicy = -1; | 5160 | policy = oldpolicy = -1; |
5162 | task_rq_unlock(rq, p, &flags); | 5161 | task_rq_unlock(rq, p, &flags); |
5163 | goto recheck; | 5162 | goto recheck; |
5164 | } | 5163 | } |
5165 | on_rq = p->on_rq; | 5164 | on_rq = p->on_rq; |
5166 | running = task_current(rq, p); | 5165 | running = task_current(rq, p); |
5167 | if (on_rq) | 5166 | if (on_rq) |
5168 | deactivate_task(rq, p, 0); | 5167 | deactivate_task(rq, p, 0); |
5169 | if (running) | 5168 | if (running) |
5170 | p->sched_class->put_prev_task(rq, p); | 5169 | p->sched_class->put_prev_task(rq, p); |
5171 | 5170 | ||
5172 | p->sched_reset_on_fork = reset_on_fork; | 5171 | p->sched_reset_on_fork = reset_on_fork; |
5173 | 5172 | ||
5174 | oldprio = p->prio; | 5173 | oldprio = p->prio; |
5175 | prev_class = p->sched_class; | 5174 | prev_class = p->sched_class; |
5176 | __setscheduler(rq, p, policy, param->sched_priority); | 5175 | __setscheduler(rq, p, policy, param->sched_priority); |
5177 | 5176 | ||
5178 | if (running) | 5177 | if (running) |
5179 | p->sched_class->set_curr_task(rq); | 5178 | p->sched_class->set_curr_task(rq); |
5180 | if (on_rq) | 5179 | if (on_rq) |
5181 | activate_task(rq, p, 0); | 5180 | activate_task(rq, p, 0); |
5182 | 5181 | ||
5183 | check_class_changed(rq, p, prev_class, oldprio); | 5182 | check_class_changed(rq, p, prev_class, oldprio); |
5184 | task_rq_unlock(rq, p, &flags); | 5183 | task_rq_unlock(rq, p, &flags); |
5185 | 5184 | ||
5186 | rt_mutex_adjust_pi(p); | 5185 | rt_mutex_adjust_pi(p); |
5187 | 5186 | ||
5188 | return 0; | 5187 | return 0; |
5189 | } | 5188 | } |
5190 | 5189 | ||
5191 | /** | 5190 | /** |
5192 | * sched_setscheduler - change the scheduling policy and/or RT priority of a thread. | 5191 | * sched_setscheduler - change the scheduling policy and/or RT priority of a thread. |
5193 | * @p: the task in question. | 5192 | * @p: the task in question. |
5194 | * @policy: new policy. | 5193 | * @policy: new policy. |
5195 | * @param: structure containing the new RT priority. | 5194 | * @param: structure containing the new RT priority. |
5196 | * | 5195 | * |
5197 | * NOTE that the task may already be dead. | 5196 | * NOTE that the task may already be dead. |
5198 | */ | 5197 | */ |
5199 | int sched_setscheduler(struct task_struct *p, int policy, | 5198 | int sched_setscheduler(struct task_struct *p, int policy, |
5200 | const struct sched_param *param) | 5199 | const struct sched_param *param) |
5201 | { | 5200 | { |
5202 | return __sched_setscheduler(p, policy, param, true); | 5201 | return __sched_setscheduler(p, policy, param, true); |
5203 | } | 5202 | } |
5204 | EXPORT_SYMBOL_GPL(sched_setscheduler); | 5203 | EXPORT_SYMBOL_GPL(sched_setscheduler); |
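A typical in-kernel caller promotes one of its own worker threads to a real-time policy; a sketch assuming the caller already holds a reference on the task and that priority 50 is an acceptable choice:

#include <linux/sched.h>

static int make_rt_worker(struct task_struct *tsk)
{
	struct sched_param param = { .sched_priority = 50 };

	/* move the worker to SCHED_FIFO at a mid-range RT priority */
	return sched_setscheduler(tsk, SCHED_FIFO, &param);
}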
5205 | 5204 | ||
5206 | /** | 5205 | /** |
5207 | * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace. | 5206 | * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace. |
5208 | * @p: the task in question. | 5207 | * @p: the task in question. |
5209 | * @policy: new policy. | 5208 | * @policy: new policy. |
5210 | * @param: structure containing the new RT priority. | 5209 | * @param: structure containing the new RT priority. |
5211 | * | 5210 | * |
5212 | * Just like sched_setscheduler, only don't bother checking if the | 5211 | * Just like sched_setscheduler, only don't bother checking if the |
5213 | * current context has permission. For example, this is needed in | 5212 | * current context has permission. For example, this is needed in |
5214 | * stop_machine(): we create temporary high priority worker threads, | 5213 | * stop_machine(): we create temporary high priority worker threads, |
5215 | * but our caller might not have that capability. | 5214 | * but our caller might not have that capability. |
5216 | */ | 5215 | */ |
5217 | int sched_setscheduler_nocheck(struct task_struct *p, int policy, | 5216 | int sched_setscheduler_nocheck(struct task_struct *p, int policy, |
5218 | const struct sched_param *param) | 5217 | const struct sched_param *param) |
5219 | { | 5218 | { |
5220 | return __sched_setscheduler(p, policy, param, false); | 5219 | return __sched_setscheduler(p, policy, param, false); |
5221 | } | 5220 | } |
5222 | 5221 | ||
5223 | static int | 5222 | static int |
5224 | do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) | 5223 | do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) |
5225 | { | 5224 | { |
5226 | struct sched_param lparam; | 5225 | struct sched_param lparam; |
5227 | struct task_struct *p; | 5226 | struct task_struct *p; |
5228 | int retval; | 5227 | int retval; |
5229 | 5228 | ||
5230 | if (!param || pid < 0) | 5229 | if (!param || pid < 0) |
5231 | return -EINVAL; | 5230 | return -EINVAL; |
5232 | if (copy_from_user(&lparam, param, sizeof(struct sched_param))) | 5231 | if (copy_from_user(&lparam, param, sizeof(struct sched_param))) |
5233 | return -EFAULT; | 5232 | return -EFAULT; |
5234 | 5233 | ||
5235 | rcu_read_lock(); | 5234 | rcu_read_lock(); |
5236 | retval = -ESRCH; | 5235 | retval = -ESRCH; |
5237 | p = find_process_by_pid(pid); | 5236 | p = find_process_by_pid(pid); |
5238 | if (p != NULL) | 5237 | if (p != NULL) |
5239 | retval = sched_setscheduler(p, policy, &lparam); | 5238 | retval = sched_setscheduler(p, policy, &lparam); |
5240 | rcu_read_unlock(); | 5239 | rcu_read_unlock(); |
5241 | 5240 | ||
5242 | return retval; | 5241 | return retval; |
5243 | } | 5242 | } |
5244 | 5243 | ||
5245 | /** | 5244 | /** |
5246 | * sys_sched_setscheduler - set/change the scheduler policy and RT priority | 5245 | * sys_sched_setscheduler - set/change the scheduler policy and RT priority |
5247 | * @pid: the pid in question. | 5246 | * @pid: the pid in question. |
5248 | * @policy: new policy. | 5247 | * @policy: new policy. |
5249 | * @param: structure containing the new RT priority. | 5248 | * @param: structure containing the new RT priority. |
5250 | */ | 5249 | */ |
5251 | SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy, | 5250 | SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy, |
5252 | struct sched_param __user *, param) | 5251 | struct sched_param __user *, param) |
5253 | { | 5252 | { |
5254 | /* negative values for policy are not valid */ | 5253 | /* negative values for policy are not valid */ |
5255 | if (policy < 0) | 5254 | if (policy < 0) |
5256 | return -EINVAL; | 5255 | return -EINVAL; |
5257 | 5256 | ||
5258 | return do_sched_setscheduler(pid, policy, param); | 5257 | return do_sched_setscheduler(pid, policy, param); |
5259 | } | 5258 | } |
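The matching userspace call goes through the C library wrapper of the same name; a minimal example that switches the calling process to SCHED_FIFO, using sched_get_priority_min() so the priority stays portable:

#include <sched.h>
#include <stdio.h>

int main(void)
{
	struct sched_param sp = {
		.sched_priority = sched_get_priority_min(SCHED_FIFO),
	};

	if (sched_setscheduler(0, SCHED_FIFO, &sp) == -1) {
		perror("sched_setscheduler");	/* usually EPERM without CAP_SYS_NICE */
		return 1;
	}
	return 0;
}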
5260 | 5259 | ||
5261 | /** | 5260 | /** |
5262 | * sys_sched_setparam - set/change the RT priority of a thread | 5261 | * sys_sched_setparam - set/change the RT priority of a thread |
5263 | * @pid: the pid in question. | 5262 | * @pid: the pid in question. |
5264 | * @param: structure containing the new RT priority. | 5263 | * @param: structure containing the new RT priority. |
5265 | */ | 5264 | */ |
5266 | SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param) | 5265 | SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param) |
5267 | { | 5266 | { |
5268 | return do_sched_setscheduler(pid, -1, param); | 5267 | return do_sched_setscheduler(pid, -1, param); |
5269 | } | 5268 | } |
5270 | 5269 | ||
5271 | /** | 5270 | /** |
5272 | * sys_sched_getscheduler - get the policy (scheduling class) of a thread | 5271 | * sys_sched_getscheduler - get the policy (scheduling class) of a thread |
5273 | * @pid: the pid in question. | 5272 | * @pid: the pid in question. |
5274 | */ | 5273 | */ |
5275 | SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid) | 5274 | SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid) |
5276 | { | 5275 | { |
5277 | struct task_struct *p; | 5276 | struct task_struct *p; |
5278 | int retval; | 5277 | int retval; |
5279 | 5278 | ||
5280 | if (pid < 0) | 5279 | if (pid < 0) |
5281 | return -EINVAL; | 5280 | return -EINVAL; |
5282 | 5281 | ||
5283 | retval = -ESRCH; | 5282 | retval = -ESRCH; |
5284 | rcu_read_lock(); | 5283 | rcu_read_lock(); |
5285 | p = find_process_by_pid(pid); | 5284 | p = find_process_by_pid(pid); |
5286 | if (p) { | 5285 | if (p) { |
5287 | retval = security_task_getscheduler(p); | 5286 | retval = security_task_getscheduler(p); |
5288 | if (!retval) | 5287 | if (!retval) |
5289 | retval = p->policy | 5288 | retval = p->policy |
5290 | | (p->sched_reset_on_fork ? SCHED_RESET_ON_FORK : 0); | 5289 | | (p->sched_reset_on_fork ? SCHED_RESET_ON_FORK : 0); |
5291 | } | 5290 | } |
5292 | rcu_read_unlock(); | 5291 | rcu_read_unlock(); |
5293 | return retval; | 5292 | return retval; |
5294 | } | 5293 | } |
5295 | 5294 | ||
5296 | /** | 5295 | /** |
5297 | * sys_sched_getparam - get the RT priority of a thread | 5296 | * sys_sched_getparam - get the RT priority of a thread |
5298 | * @pid: the pid in question. | 5297 | * @pid: the pid in question. |
5299 | * @param: structure containing the RT priority. | 5298 | * @param: structure containing the RT priority. |
5300 | */ | 5299 | */ |
5301 | SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param) | 5300 | SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param) |
5302 | { | 5301 | { |
5303 | struct sched_param lp; | 5302 | struct sched_param lp; |
5304 | struct task_struct *p; | 5303 | struct task_struct *p; |
5305 | int retval; | 5304 | int retval; |
5306 | 5305 | ||
5307 | if (!param || pid < 0) | 5306 | if (!param || pid < 0) |
5308 | return -EINVAL; | 5307 | return -EINVAL; |
5309 | 5308 | ||
5310 | rcu_read_lock(); | 5309 | rcu_read_lock(); |
5311 | p = find_process_by_pid(pid); | 5310 | p = find_process_by_pid(pid); |
5312 | retval = -ESRCH; | 5311 | retval = -ESRCH; |
5313 | if (!p) | 5312 | if (!p) |
5314 | goto out_unlock; | 5313 | goto out_unlock; |
5315 | 5314 | ||
5316 | retval = security_task_getscheduler(p); | 5315 | retval = security_task_getscheduler(p); |
5317 | if (retval) | 5316 | if (retval) |
5318 | goto out_unlock; | 5317 | goto out_unlock; |
5319 | 5318 | ||
5320 | lp.sched_priority = p->rt_priority; | 5319 | lp.sched_priority = p->rt_priority; |
5321 | rcu_read_unlock(); | 5320 | rcu_read_unlock(); |
5322 | 5321 | ||
5323 | /* | 5322 | /* |
5324 | * This one might sleep, we cannot do it with a spinlock held ... | 5323 | * This one might sleep, we cannot do it with a spinlock held ... |
5325 | */ | 5324 | */ |
5326 | retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0; | 5325 | retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0; |
5327 | 5326 | ||
5328 | return retval; | 5327 | return retval; |
5329 | 5328 | ||
5330 | out_unlock: | 5329 | out_unlock: |
5331 | rcu_read_unlock(); | 5330 | rcu_read_unlock(); |
5332 | return retval; | 5331 | return retval; |
5333 | } | 5332 | } |
5334 | 5333 | ||
5335 | long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) | 5334 | long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) |
5336 | { | 5335 | { |
5337 | cpumask_var_t cpus_allowed, new_mask; | 5336 | cpumask_var_t cpus_allowed, new_mask; |
5338 | struct task_struct *p; | 5337 | struct task_struct *p; |
5339 | int retval; | 5338 | int retval; |
5340 | 5339 | ||
5341 | get_online_cpus(); | 5340 | get_online_cpus(); |
5342 | rcu_read_lock(); | 5341 | rcu_read_lock(); |
5343 | 5342 | ||
5344 | p = find_process_by_pid(pid); | 5343 | p = find_process_by_pid(pid); |
5345 | if (!p) { | 5344 | if (!p) { |
5346 | rcu_read_unlock(); | 5345 | rcu_read_unlock(); |
5347 | put_online_cpus(); | 5346 | put_online_cpus(); |
5348 | return -ESRCH; | 5347 | return -ESRCH; |
5349 | } | 5348 | } |
5350 | 5349 | ||
5351 | /* Prevent p going away */ | 5350 | /* Prevent p going away */ |
5352 | get_task_struct(p); | 5351 | get_task_struct(p); |
5353 | rcu_read_unlock(); | 5352 | rcu_read_unlock(); |
5354 | 5353 | ||
5355 | if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) { | 5354 | if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) { |
5356 | retval = -ENOMEM; | 5355 | retval = -ENOMEM; |
5357 | goto out_put_task; | 5356 | goto out_put_task; |
5358 | } | 5357 | } |
5359 | if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) { | 5358 | if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) { |
5360 | retval = -ENOMEM; | 5359 | retval = -ENOMEM; |
5361 | goto out_free_cpus_allowed; | 5360 | goto out_free_cpus_allowed; |
5362 | } | 5361 | } |
5363 | retval = -EPERM; | 5362 | retval = -EPERM; |
5364 | if (!check_same_owner(p) && !task_ns_capable(p, CAP_SYS_NICE)) | 5363 | if (!check_same_owner(p) && !task_ns_capable(p, CAP_SYS_NICE)) |
5365 | goto out_unlock; | 5364 | goto out_unlock; |
5366 | 5365 | ||
5367 | retval = security_task_setscheduler(p); | 5366 | retval = security_task_setscheduler(p); |
5368 | if (retval) | 5367 | if (retval) |
5369 | goto out_unlock; | 5368 | goto out_unlock; |
5370 | 5369 | ||
5371 | cpuset_cpus_allowed(p, cpus_allowed); | 5370 | cpuset_cpus_allowed(p, cpus_allowed); |
5372 | cpumask_and(new_mask, in_mask, cpus_allowed); | 5371 | cpumask_and(new_mask, in_mask, cpus_allowed); |
5373 | again: | 5372 | again: |
5374 | retval = set_cpus_allowed_ptr(p, new_mask); | 5373 | retval = set_cpus_allowed_ptr(p, new_mask); |
5375 | 5374 | ||
5376 | if (!retval) { | 5375 | if (!retval) { |
5377 | cpuset_cpus_allowed(p, cpus_allowed); | 5376 | cpuset_cpus_allowed(p, cpus_allowed); |
5378 | if (!cpumask_subset(new_mask, cpus_allowed)) { | 5377 | if (!cpumask_subset(new_mask, cpus_allowed)) { |
5379 | /* | 5378 | /* |
5380 | * We must have raced with a concurrent cpuset | 5379 | * We must have raced with a concurrent cpuset |
5381 | * update. Just reset the cpus_allowed to the | 5380 | * update. Just reset the cpus_allowed to the |
5382 | * cpuset's cpus_allowed | 5381 | * cpuset's cpus_allowed |
5383 | */ | 5382 | */ |
5384 | cpumask_copy(new_mask, cpus_allowed); | 5383 | cpumask_copy(new_mask, cpus_allowed); |
5385 | goto again; | 5384 | goto again; |
5386 | } | 5385 | } |
5387 | } | 5386 | } |
5388 | out_unlock: | 5387 | out_unlock: |
5389 | free_cpumask_var(new_mask); | 5388 | free_cpumask_var(new_mask); |
5390 | out_free_cpus_allowed: | 5389 | out_free_cpus_allowed: |
5391 | free_cpumask_var(cpus_allowed); | 5390 | free_cpumask_var(cpus_allowed); |
5392 | out_put_task: | 5391 | out_put_task: |
5393 | put_task_struct(p); | 5392 | put_task_struct(p); |
5394 | put_online_cpus(); | 5393 | put_online_cpus(); |
5395 | return retval; | 5394 | return retval; |
5396 | } | 5395 | } |
5397 | 5396 | ||
5398 | static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len, | 5397 | static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len, |
5399 | struct cpumask *new_mask) | 5398 | struct cpumask *new_mask) |
5400 | { | 5399 | { |
5401 | if (len < cpumask_size()) | 5400 | if (len < cpumask_size()) |
5402 | cpumask_clear(new_mask); | 5401 | cpumask_clear(new_mask); |
5403 | else if (len > cpumask_size()) | 5402 | else if (len > cpumask_size()) |
5404 | len = cpumask_size(); | 5403 | len = cpumask_size(); |
5405 | 5404 | ||
5406 | return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0; | 5405 | return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0; |
5407 | } | 5406 | } |
5408 | 5407 | ||
5409 | /** | 5408 | /** |
5410 | * sys_sched_setaffinity - set the cpu affinity of a process | 5409 | * sys_sched_setaffinity - set the cpu affinity of a process |
5411 | * @pid: pid of the process | 5410 | * @pid: pid of the process |
5412 | * @len: length in bytes of the bitmask pointed to by user_mask_ptr | 5411 | * @len: length in bytes of the bitmask pointed to by user_mask_ptr |
5413 | * @user_mask_ptr: user-space pointer to the new cpu mask | 5412 | * @user_mask_ptr: user-space pointer to the new cpu mask |
5414 | */ | 5413 | */ |
5415 | SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len, | 5414 | SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len, |
5416 | unsigned long __user *, user_mask_ptr) | 5415 | unsigned long __user *, user_mask_ptr) |
5417 | { | 5416 | { |
5418 | cpumask_var_t new_mask; | 5417 | cpumask_var_t new_mask; |
5419 | int retval; | 5418 | int retval; |
5420 | 5419 | ||
5421 | if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) | 5420 | if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) |
5422 | return -ENOMEM; | 5421 | return -ENOMEM; |
5423 | 5422 | ||
5424 | retval = get_user_cpu_mask(user_mask_ptr, len, new_mask); | 5423 | retval = get_user_cpu_mask(user_mask_ptr, len, new_mask); |
5425 | if (retval == 0) | 5424 | if (retval == 0) |
5426 | retval = sched_setaffinity(pid, new_mask); | 5425 | retval = sched_setaffinity(pid, new_mask); |
5427 | free_cpumask_var(new_mask); | 5426 | free_cpumask_var(new_mask); |
5428 | return retval; | 5427 | return retval; |
5429 | } | 5428 | } |
5430 | 5429 | ||
5431 | long sched_getaffinity(pid_t pid, struct cpumask *mask) | 5430 | long sched_getaffinity(pid_t pid, struct cpumask *mask) |
5432 | { | 5431 | { |
5433 | struct task_struct *p; | 5432 | struct task_struct *p; |
5434 | unsigned long flags; | 5433 | unsigned long flags; |
5435 | int retval; | 5434 | int retval; |
5436 | 5435 | ||
5437 | get_online_cpus(); | 5436 | get_online_cpus(); |
5438 | rcu_read_lock(); | 5437 | rcu_read_lock(); |
5439 | 5438 | ||
5440 | retval = -ESRCH; | 5439 | retval = -ESRCH; |
5441 | p = find_process_by_pid(pid); | 5440 | p = find_process_by_pid(pid); |
5442 | if (!p) | 5441 | if (!p) |
5443 | goto out_unlock; | 5442 | goto out_unlock; |
5444 | 5443 | ||
5445 | retval = security_task_getscheduler(p); | 5444 | retval = security_task_getscheduler(p); |
5446 | if (retval) | 5445 | if (retval) |
5447 | goto out_unlock; | 5446 | goto out_unlock; |
5448 | 5447 | ||
5449 | raw_spin_lock_irqsave(&p->pi_lock, flags); | 5448 | raw_spin_lock_irqsave(&p->pi_lock, flags); |
5450 | cpumask_and(mask, &p->cpus_allowed, cpu_online_mask); | 5449 | cpumask_and(mask, &p->cpus_allowed, cpu_online_mask); |
5451 | raw_spin_unlock_irqrestore(&p->pi_lock, flags); | 5450 | raw_spin_unlock_irqrestore(&p->pi_lock, flags); |
5452 | 5451 | ||
5453 | out_unlock: | 5452 | out_unlock: |
5454 | rcu_read_unlock(); | 5453 | rcu_read_unlock(); |
5455 | put_online_cpus(); | 5454 | put_online_cpus(); |
5456 | 5455 | ||
5457 | return retval; | 5456 | return retval; |
5458 | } | 5457 | } |
5459 | 5458 | ||
5460 | /** | 5459 | /** |
5461 | * sys_sched_getaffinity - get the cpu affinity of a process | 5460 | * sys_sched_getaffinity - get the cpu affinity of a process |
5462 | * @pid: pid of the process | 5461 | * @pid: pid of the process |
5463 | * @len: length in bytes of the bitmask pointed to by user_mask_ptr | 5462 | * @len: length in bytes of the bitmask pointed to by user_mask_ptr |
5464 | * @user_mask_ptr: user-space pointer to hold the current cpu mask | 5463 | * @user_mask_ptr: user-space pointer to hold the current cpu mask |
5465 | */ | 5464 | */ |
5466 | SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len, | 5465 | SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len, |
5467 | unsigned long __user *, user_mask_ptr) | 5466 | unsigned long __user *, user_mask_ptr) |
5468 | { | 5467 | { |
5469 | int ret; | 5468 | int ret; |
5470 | cpumask_var_t mask; | 5469 | cpumask_var_t mask; |
5471 | 5470 | ||
5472 | if ((len * BITS_PER_BYTE) < nr_cpu_ids) | 5471 | if ((len * BITS_PER_BYTE) < nr_cpu_ids) |
5473 | return -EINVAL; | 5472 | return -EINVAL; |
5474 | if (len & (sizeof(unsigned long)-1)) | 5473 | if (len & (sizeof(unsigned long)-1)) |
5475 | return -EINVAL; | 5474 | return -EINVAL; |
5476 | 5475 | ||
5477 | if (!alloc_cpumask_var(&mask, GFP_KERNEL)) | 5476 | if (!alloc_cpumask_var(&mask, GFP_KERNEL)) |
5478 | return -ENOMEM; | 5477 | return -ENOMEM; |
5479 | 5478 | ||
5480 | ret = sched_getaffinity(pid, mask); | 5479 | ret = sched_getaffinity(pid, mask); |
5481 | if (ret == 0) { | 5480 | if (ret == 0) { |
5482 | size_t retlen = min_t(size_t, len, cpumask_size()); | 5481 | size_t retlen = min_t(size_t, len, cpumask_size()); |
5483 | 5482 | ||
5484 | if (copy_to_user(user_mask_ptr, mask, retlen)) | 5483 | if (copy_to_user(user_mask_ptr, mask, retlen)) |
5485 | ret = -EFAULT; | 5484 | ret = -EFAULT; |
5486 | else | 5485 | else |
5487 | ret = retlen; | 5486 | ret = retlen; |
5488 | } | 5487 | } |
5489 | free_cpumask_var(mask); | 5488 | free_cpumask_var(mask); |
5490 | 5489 | ||
5491 | return ret; | 5490 | return ret; |
5492 | } | 5491 | } |
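For reference, a minimal userspace sketch (not part of this patch) exercising the two affinity syscalls above through their glibc wrappers. The pid argument 0 means the calling thread and the cpu_set_t macros come from <sched.h>; note that the raw sys_sched_getaffinity() above returns the number of bytes copied, while the glibc wrapper maps success to 0.

#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>

int main(void)
{
        cpu_set_t set;

        CPU_ZERO(&set);
        CPU_SET(0, &set);               /* ask for CPU 0 only */
        if (sched_setaffinity(0, sizeof(set), &set))
                perror("sched_setaffinity");

        CPU_ZERO(&set);
        if (sched_getaffinity(0, sizeof(set), &set))
                perror("sched_getaffinity");
        printf("allowed CPUs: %d\n", CPU_COUNT(&set));
        return 0;
}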
5493 | 5492 | ||
5494 | /** | 5493 | /** |
5495 | * sys_sched_yield - yield the current processor to other threads. | 5494 | * sys_sched_yield - yield the current processor to other threads. |
5496 | * | 5495 | * |
5497 | * This function yields the current CPU to other tasks. If there are no | 5496 | * This function yields the current CPU to other tasks. If there are no |
5498 | * other threads running on this CPU then this function will return. | 5497 | * other threads running on this CPU then this function will return. |
5499 | */ | 5498 | */ |
5500 | SYSCALL_DEFINE0(sched_yield) | 5499 | SYSCALL_DEFINE0(sched_yield) |
5501 | { | 5500 | { |
5502 | struct rq *rq = this_rq_lock(); | 5501 | struct rq *rq = this_rq_lock(); |
5503 | 5502 | ||
5504 | schedstat_inc(rq, yld_count); | 5503 | schedstat_inc(rq, yld_count); |
5505 | current->sched_class->yield_task(rq); | 5504 | current->sched_class->yield_task(rq); |
5506 | 5505 | ||
5507 | /* | 5506 | /* |
5508 | * Since we are going to call schedule() anyway, there's | 5507 | * Since we are going to call schedule() anyway, there's |
5509 | * no need to preempt or enable interrupts: | 5508 | * no need to preempt or enable interrupts: |
5510 | */ | 5509 | */ |
5511 | __release(rq->lock); | 5510 | __release(rq->lock); |
5512 | spin_release(&rq->lock.dep_map, 1, _THIS_IP_); | 5511 | spin_release(&rq->lock.dep_map, 1, _THIS_IP_); |
5513 | do_raw_spin_unlock(&rq->lock); | 5512 | do_raw_spin_unlock(&rq->lock); |
5514 | preempt_enable_no_resched(); | 5513 | preempt_enable_no_resched(); |
5515 | 5514 | ||
5516 | schedule(); | 5515 | schedule(); |
5517 | 5516 | ||
5518 | return 0; | 5517 | return 0; |
5519 | } | 5518 | } |
5520 | 5519 | ||
5521 | static inline int should_resched(void) | 5520 | static inline int should_resched(void) |
5522 | { | 5521 | { |
5523 | return need_resched() && !(preempt_count() & PREEMPT_ACTIVE); | 5522 | return need_resched() && !(preempt_count() & PREEMPT_ACTIVE); |
5524 | } | 5523 | } |
5525 | 5524 | ||
5526 | static void __cond_resched(void) | 5525 | static void __cond_resched(void) |
5527 | { | 5526 | { |
5528 | add_preempt_count(PREEMPT_ACTIVE); | 5527 | add_preempt_count(PREEMPT_ACTIVE); |
5529 | schedule(); | 5528 | schedule(); |
5530 | sub_preempt_count(PREEMPT_ACTIVE); | 5529 | sub_preempt_count(PREEMPT_ACTIVE); |
5531 | } | 5530 | } |
5532 | 5531 | ||
5533 | int __sched _cond_resched(void) | 5532 | int __sched _cond_resched(void) |
5534 | { | 5533 | { |
5535 | if (should_resched()) { | 5534 | if (should_resched()) { |
5536 | __cond_resched(); | 5535 | __cond_resched(); |
5537 | return 1; | 5536 | return 1; |
5538 | } | 5537 | } |
5539 | return 0; | 5538 | return 0; |
5540 | } | 5539 | } |
5541 | EXPORT_SYMBOL(_cond_resched); | 5540 | EXPORT_SYMBOL(_cond_resched); |
5542 | 5541 | ||
5543 | /* | 5542 | /* |
5544 | * __cond_resched_lock() - if a reschedule is pending, drop the given lock, | 5543 | * __cond_resched_lock() - if a reschedule is pending, drop the given lock, |
5545 | * call schedule, and on return reacquire the lock. | 5544 | * call schedule, and on return reacquire the lock. |
5546 | * | 5545 | * |
5547 | * This works OK both with and without CONFIG_PREEMPT. We do strange low-level | 5546 | * This works OK both with and without CONFIG_PREEMPT. We do strange low-level |
5548 | * operations here to prevent schedule() from being called twice (once via | 5547 | * operations here to prevent schedule() from being called twice (once via |
5549 | * spin_unlock(), once by hand). | 5548 | * spin_unlock(), once by hand). |
5550 | */ | 5549 | */ |
5551 | int __cond_resched_lock(spinlock_t *lock) | 5550 | int __cond_resched_lock(spinlock_t *lock) |
5552 | { | 5551 | { |
5553 | int resched = should_resched(); | 5552 | int resched = should_resched(); |
5554 | int ret = 0; | 5553 | int ret = 0; |
5555 | 5554 | ||
5556 | lockdep_assert_held(lock); | 5555 | lockdep_assert_held(lock); |
5557 | 5556 | ||
5558 | if (spin_needbreak(lock) || resched) { | 5557 | if (spin_needbreak(lock) || resched) { |
5559 | spin_unlock(lock); | 5558 | spin_unlock(lock); |
5560 | if (resched) | 5559 | if (resched) |
5561 | __cond_resched(); | 5560 | __cond_resched(); |
5562 | else | 5561 | else |
5563 | cpu_relax(); | 5562 | cpu_relax(); |
5564 | ret = 1; | 5563 | ret = 1; |
5565 | spin_lock(lock); | 5564 | spin_lock(lock); |
5566 | } | 5565 | } |
5567 | return ret; | 5566 | return ret; |
5568 | } | 5567 | } |
5569 | EXPORT_SYMBOL(__cond_resched_lock); | 5568 | EXPORT_SYMBOL(__cond_resched_lock); |
5570 | 5569 | ||
5571 | int __sched __cond_resched_softirq(void) | 5570 | int __sched __cond_resched_softirq(void) |
5572 | { | 5571 | { |
5573 | BUG_ON(!in_softirq()); | 5572 | BUG_ON(!in_softirq()); |
5574 | 5573 | ||
5575 | if (should_resched()) { | 5574 | if (should_resched()) { |
5576 | local_bh_enable(); | 5575 | local_bh_enable(); |
5577 | __cond_resched(); | 5576 | __cond_resched(); |
5578 | local_bh_disable(); | 5577 | local_bh_disable(); |
5579 | return 1; | 5578 | return 1; |
5580 | } | 5579 | } |
5581 | return 0; | 5580 | return 0; |
5582 | } | 5581 | } |
5583 | EXPORT_SYMBOL(__cond_resched_softirq); | 5582 | EXPORT_SYMBOL(__cond_resched_softirq); |
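As a usage note, a minimal sketch of a hypothetical long-running kernel loop built on the helpers above; cond_resched() is the usual wrapper that ends up in _cond_resched(), and clear_highpage() is only an example workload, not something this patch touches.

#include <linux/sched.h>
#include <linux/highmem.h>

/* Hypothetical helper: zero a large batch of pages without hogging the CPU. */
static void zero_many_pages(struct page **pages, unsigned long nr)
{
        unsigned long i;

        for (i = 0; i < nr; i++) {
                clear_highpage(pages[i]);
                /* Voluntarily yield if a reschedule is pending. */
                cond_resched();
        }
}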
5584 | 5583 | ||
5585 | /** | 5584 | /** |
5586 | * yield - yield the current processor to other threads. | 5585 | * yield - yield the current processor to other threads. |
5587 | * | 5586 | * |
5588 | * This is a shortcut for kernel-space yielding - it marks the | 5587 | * This is a shortcut for kernel-space yielding - it marks the |
5589 | * thread runnable and calls sys_sched_yield(). | 5588 | * thread runnable and calls sys_sched_yield(). |
5590 | */ | 5589 | */ |
5591 | void __sched yield(void) | 5590 | void __sched yield(void) |
5592 | { | 5591 | { |
5593 | set_current_state(TASK_RUNNING); | 5592 | set_current_state(TASK_RUNNING); |
5594 | sys_sched_yield(); | 5593 | sys_sched_yield(); |
5595 | } | 5594 | } |
5596 | EXPORT_SYMBOL(yield); | 5595 | EXPORT_SYMBOL(yield); |
5597 | 5596 | ||
5598 | /** | 5597 | /** |
5599 | * yield_to - yield the current processor to another thread in | 5598 | * yield_to - yield the current processor to another thread in |
5600 | * your thread group, or accelerate that thread toward the | 5599 | * your thread group, or accelerate that thread toward the |
5601 | * processor it's on. | 5600 | * processor it's on. |
5602 | * @p: target task | 5601 | * @p: target task |
5603 | * @preempt: whether task preemption is allowed or not | 5602 | * @preempt: whether task preemption is allowed or not |
5604 | * | 5603 | * |
5605 | * It's the caller's job to ensure that the target task struct | 5604 | * It's the caller's job to ensure that the target task struct |
5606 | * can't go away on us before we can do any checks. | 5605 | * can't go away on us before we can do any checks. |
5607 | * | 5606 | * |
5608 | * Returns true if we indeed boosted the target task. | 5607 | * Returns true if we indeed boosted the target task. |
5609 | */ | 5608 | */ |
5610 | bool __sched yield_to(struct task_struct *p, bool preempt) | 5609 | bool __sched yield_to(struct task_struct *p, bool preempt) |
5611 | { | 5610 | { |
5612 | struct task_struct *curr = current; | 5611 | struct task_struct *curr = current; |
5613 | struct rq *rq, *p_rq; | 5612 | struct rq *rq, *p_rq; |
5614 | unsigned long flags; | 5613 | unsigned long flags; |
5615 | bool yielded = 0; | 5614 | bool yielded = 0; |
5616 | 5615 | ||
5617 | local_irq_save(flags); | 5616 | local_irq_save(flags); |
5618 | rq = this_rq(); | 5617 | rq = this_rq(); |
5619 | 5618 | ||
5620 | again: | 5619 | again: |
5621 | p_rq = task_rq(p); | 5620 | p_rq = task_rq(p); |
5622 | double_rq_lock(rq, p_rq); | 5621 | double_rq_lock(rq, p_rq); |
5623 | while (task_rq(p) != p_rq) { | 5622 | while (task_rq(p) != p_rq) { |
5624 | double_rq_unlock(rq, p_rq); | 5623 | double_rq_unlock(rq, p_rq); |
5625 | goto again; | 5624 | goto again; |
5626 | } | 5625 | } |
5627 | 5626 | ||
5628 | if (!curr->sched_class->yield_to_task) | 5627 | if (!curr->sched_class->yield_to_task) |
5629 | goto out; | 5628 | goto out; |
5630 | 5629 | ||
5631 | if (curr->sched_class != p->sched_class) | 5630 | if (curr->sched_class != p->sched_class) |
5632 | goto out; | 5631 | goto out; |
5633 | 5632 | ||
5634 | if (task_running(p_rq, p) || p->state) | 5633 | if (task_running(p_rq, p) || p->state) |
5635 | goto out; | 5634 | goto out; |
5636 | 5635 | ||
5637 | yielded = curr->sched_class->yield_to_task(rq, p, preempt); | 5636 | yielded = curr->sched_class->yield_to_task(rq, p, preempt); |
5638 | if (yielded) { | 5637 | if (yielded) { |
5639 | schedstat_inc(rq, yld_count); | 5638 | schedstat_inc(rq, yld_count); |
5640 | /* | 5639 | /* |
5641 | * Make p's CPU reschedule; pick_next_entity takes care of | 5640 | * Make p's CPU reschedule; pick_next_entity takes care of |
5642 | * fairness. | 5641 | * fairness. |
5643 | */ | 5642 | */ |
5644 | if (preempt && rq != p_rq) | 5643 | if (preempt && rq != p_rq) |
5645 | resched_task(p_rq->curr); | 5644 | resched_task(p_rq->curr); |
5646 | } | 5645 | } |
5647 | 5646 | ||
5648 | out: | 5647 | out: |
5649 | double_rq_unlock(rq, p_rq); | 5648 | double_rq_unlock(rq, p_rq); |
5650 | local_irq_restore(flags); | 5649 | local_irq_restore(flags); |
5651 | 5650 | ||
5652 | if (yielded) | 5651 | if (yielded) |
5653 | schedule(); | 5652 | schedule(); |
5654 | 5653 | ||
5655 | return yielded; | 5654 | return yielded; |
5656 | } | 5655 | } |
5657 | EXPORT_SYMBOL_GPL(yield_to); | 5656 | EXPORT_SYMBOL_GPL(yield_to); |
5658 | 5657 | ||
5659 | /* | 5658 | /* |
5660 | * This task is about to go to sleep on IO. Increment rq->nr_iowait so | 5659 | * This task is about to go to sleep on IO. Increment rq->nr_iowait so |
5661 | * that process accounting knows that this is a task in IO wait state. | 5660 | * that process accounting knows that this is a task in IO wait state. |
5662 | */ | 5661 | */ |
5663 | void __sched io_schedule(void) | 5662 | void __sched io_schedule(void) |
5664 | { | 5663 | { |
5665 | struct rq *rq = raw_rq(); | 5664 | struct rq *rq = raw_rq(); |
5666 | 5665 | ||
5667 | delayacct_blkio_start(); | 5666 | delayacct_blkio_start(); |
5668 | atomic_inc(&rq->nr_iowait); | 5667 | atomic_inc(&rq->nr_iowait); |
5669 | blk_flush_plug(current); | 5668 | blk_flush_plug(current); |
5670 | current->in_iowait = 1; | 5669 | current->in_iowait = 1; |
5671 | schedule(); | 5670 | schedule(); |
5672 | current->in_iowait = 0; | 5671 | current->in_iowait = 0; |
5673 | atomic_dec(&rq->nr_iowait); | 5672 | atomic_dec(&rq->nr_iowait); |
5674 | delayacct_blkio_end(); | 5673 | delayacct_blkio_end(); |
5675 | } | 5674 | } |
5676 | EXPORT_SYMBOL(io_schedule); | 5675 | EXPORT_SYMBOL(io_schedule); |
5677 | 5676 | ||
5678 | long __sched io_schedule_timeout(long timeout) | 5677 | long __sched io_schedule_timeout(long timeout) |
5679 | { | 5678 | { |
5680 | struct rq *rq = raw_rq(); | 5679 | struct rq *rq = raw_rq(); |
5681 | long ret; | 5680 | long ret; |
5682 | 5681 | ||
5683 | delayacct_blkio_start(); | 5682 | delayacct_blkio_start(); |
5684 | atomic_inc(&rq->nr_iowait); | 5683 | atomic_inc(&rq->nr_iowait); |
5685 | blk_flush_plug(current); | 5684 | blk_flush_plug(current); |
5686 | current->in_iowait = 1; | 5685 | current->in_iowait = 1; |
5687 | ret = schedule_timeout(timeout); | 5686 | ret = schedule_timeout(timeout); |
5688 | current->in_iowait = 0; | 5687 | current->in_iowait = 0; |
5689 | atomic_dec(&rq->nr_iowait); | 5688 | atomic_dec(&rq->nr_iowait); |
5690 | delayacct_blkio_end(); | 5689 | delayacct_blkio_end(); |
5691 | return ret; | 5690 | return ret; |
5692 | } | 5691 | } |
5693 | 5692 | ||
5694 | /** | 5693 | /** |
5695 | * sys_sched_get_priority_max - return maximum RT priority. | 5694 | * sys_sched_get_priority_max - return maximum RT priority. |
5696 | * @policy: scheduling class. | 5695 | * @policy: scheduling class. |
5697 | * | 5696 | * |
5698 | * this syscall returns the maximum rt_priority that can be used | 5697 | * this syscall returns the maximum rt_priority that can be used |
5699 | * by a given scheduling class. | 5698 | * by a given scheduling class. |
5700 | */ | 5699 | */ |
5701 | SYSCALL_DEFINE1(sched_get_priority_max, int, policy) | 5700 | SYSCALL_DEFINE1(sched_get_priority_max, int, policy) |
5702 | { | 5701 | { |
5703 | int ret = -EINVAL; | 5702 | int ret = -EINVAL; |
5704 | 5703 | ||
5705 | switch (policy) { | 5704 | switch (policy) { |
5706 | case SCHED_FIFO: | 5705 | case SCHED_FIFO: |
5707 | case SCHED_RR: | 5706 | case SCHED_RR: |
5708 | ret = MAX_USER_RT_PRIO-1; | 5707 | ret = MAX_USER_RT_PRIO-1; |
5709 | break; | 5708 | break; |
5710 | case SCHED_NORMAL: | 5709 | case SCHED_NORMAL: |
5711 | case SCHED_BATCH: | 5710 | case SCHED_BATCH: |
5712 | case SCHED_IDLE: | 5711 | case SCHED_IDLE: |
5713 | ret = 0; | 5712 | ret = 0; |
5714 | break; | 5713 | break; |
5715 | } | 5714 | } |
5716 | return ret; | 5715 | return ret; |
5717 | } | 5716 | } |
5718 | 5717 | ||
5719 | /** | 5718 | /** |
5720 | * sys_sched_get_priority_min - return minimum RT priority. | 5719 | * sys_sched_get_priority_min - return minimum RT priority. |
5721 | * @policy: scheduling class. | 5720 | * @policy: scheduling class. |
5722 | * | 5721 | * |
5723 | * this syscall returns the minimum rt_priority that can be used | 5722 | * this syscall returns the minimum rt_priority that can be used |
5724 | * by a given scheduling class. | 5723 | * by a given scheduling class. |
5725 | */ | 5724 | */ |
5726 | SYSCALL_DEFINE1(sched_get_priority_min, int, policy) | 5725 | SYSCALL_DEFINE1(sched_get_priority_min, int, policy) |
5727 | { | 5726 | { |
5728 | int ret = -EINVAL; | 5727 | int ret = -EINVAL; |
5729 | 5728 | ||
5730 | switch (policy) { | 5729 | switch (policy) { |
5731 | case SCHED_FIFO: | 5730 | case SCHED_FIFO: |
5732 | case SCHED_RR: | 5731 | case SCHED_RR: |
5733 | ret = 1; | 5732 | ret = 1; |
5734 | break; | 5733 | break; |
5735 | case SCHED_NORMAL: | 5734 | case SCHED_NORMAL: |
5736 | case SCHED_BATCH: | 5735 | case SCHED_BATCH: |
5737 | case SCHED_IDLE: | 5736 | case SCHED_IDLE: |
5738 | ret = 0; | 5737 | ret = 0; |
5739 | } | 5738 | } |
5740 | return ret; | 5739 | return ret; |
5741 | } | 5740 | } |
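For reference, a userspace sketch (not part of this patch) querying the ranges implemented above via the glibc wrappers; with the default MAX_USER_RT_PRIO this prints 1..99 for SCHED_FIFO.

#include <sched.h>
#include <stdio.h>

int main(void)
{
        int lo = sched_get_priority_min(SCHED_FIFO);
        int hi = sched_get_priority_max(SCHED_FIFO);

        printf("SCHED_FIFO priority range: %d..%d\n", lo, hi);
        return 0;
}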
5742 | 5741 | ||
5743 | /** | 5742 | /** |
5744 | * sys_sched_rr_get_interval - return the default timeslice of a process. | 5743 | * sys_sched_rr_get_interval - return the default timeslice of a process. |
5745 | * @pid: pid of the process. | 5744 | * @pid: pid of the process. |
5746 | * @interval: userspace pointer to the timeslice value. | 5745 | * @interval: userspace pointer to the timeslice value. |
5747 | * | 5746 | * |
5748 | * this syscall writes the default timeslice value of a given process | 5747 | * this syscall writes the default timeslice value of a given process |
5749 | * into the user-space timespec buffer. A value of '0' means infinity. | 5748 | * into the user-space timespec buffer. A value of '0' means infinity. |
5750 | */ | 5749 | */ |
5751 | SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid, | 5750 | SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid, |
5752 | struct timespec __user *, interval) | 5751 | struct timespec __user *, interval) |
5753 | { | 5752 | { |
5754 | struct task_struct *p; | 5753 | struct task_struct *p; |
5755 | unsigned int time_slice; | 5754 | unsigned int time_slice; |
5756 | unsigned long flags; | 5755 | unsigned long flags; |
5757 | struct rq *rq; | 5756 | struct rq *rq; |
5758 | int retval; | 5757 | int retval; |
5759 | struct timespec t; | 5758 | struct timespec t; |
5760 | 5759 | ||
5761 | if (pid < 0) | 5760 | if (pid < 0) |
5762 | return -EINVAL; | 5761 | return -EINVAL; |
5763 | 5762 | ||
5764 | retval = -ESRCH; | 5763 | retval = -ESRCH; |
5765 | rcu_read_lock(); | 5764 | rcu_read_lock(); |
5766 | p = find_process_by_pid(pid); | 5765 | p = find_process_by_pid(pid); |
5767 | if (!p) | 5766 | if (!p) |
5768 | goto out_unlock; | 5767 | goto out_unlock; |
5769 | 5768 | ||
5770 | retval = security_task_getscheduler(p); | 5769 | retval = security_task_getscheduler(p); |
5771 | if (retval) | 5770 | if (retval) |
5772 | goto out_unlock; | 5771 | goto out_unlock; |
5773 | 5772 | ||
5774 | rq = task_rq_lock(p, &flags); | 5773 | rq = task_rq_lock(p, &flags); |
5775 | time_slice = p->sched_class->get_rr_interval(rq, p); | 5774 | time_slice = p->sched_class->get_rr_interval(rq, p); |
5776 | task_rq_unlock(rq, p, &flags); | 5775 | task_rq_unlock(rq, p, &flags); |
5777 | 5776 | ||
5778 | rcu_read_unlock(); | 5777 | rcu_read_unlock(); |
5779 | jiffies_to_timespec(time_slice, &t); | 5778 | jiffies_to_timespec(time_slice, &t); |
5780 | retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0; | 5779 | retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0; |
5781 | return retval; | 5780 | return retval; |
5782 | 5781 | ||
5783 | out_unlock: | 5782 | out_unlock: |
5784 | rcu_read_unlock(); | 5783 | rcu_read_unlock(); |
5785 | return retval; | 5784 | return retval; |
5786 | } | 5785 | } |
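Similarly, a small userspace sketch (not part of this patch) reading back the timeslice computed above for the calling process (pid 0):

#include <sched.h>
#include <stdio.h>
#include <time.h>

int main(void)
{
        struct timespec ts;

        if (sched_rr_get_interval(0, &ts) == 0)
                printf("timeslice: %ld.%09ld s\n",
                       (long)ts.tv_sec, ts.tv_nsec);
        else
                perror("sched_rr_get_interval");
        return 0;
}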
5787 | 5786 | ||
5788 | static const char stat_nam[] = TASK_STATE_TO_CHAR_STR; | 5787 | static const char stat_nam[] = TASK_STATE_TO_CHAR_STR; |
5789 | 5788 | ||
5790 | void sched_show_task(struct task_struct *p) | 5789 | void sched_show_task(struct task_struct *p) |
5791 | { | 5790 | { |
5792 | unsigned long free = 0; | 5791 | unsigned long free = 0; |
5793 | unsigned state; | 5792 | unsigned state; |
5794 | 5793 | ||
5795 | state = p->state ? __ffs(p->state) + 1 : 0; | 5794 | state = p->state ? __ffs(p->state) + 1 : 0; |
5796 | printk(KERN_INFO "%-15.15s %c", p->comm, | 5795 | printk(KERN_INFO "%-15.15s %c", p->comm, |
5797 | state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?'); | 5796 | state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?'); |
5798 | #if BITS_PER_LONG == 32 | 5797 | #if BITS_PER_LONG == 32 |
5799 | if (state == TASK_RUNNING) | 5798 | if (state == TASK_RUNNING) |
5800 | printk(KERN_CONT " running "); | 5799 | printk(KERN_CONT " running "); |
5801 | else | 5800 | else |
5802 | printk(KERN_CONT " %08lx ", thread_saved_pc(p)); | 5801 | printk(KERN_CONT " %08lx ", thread_saved_pc(p)); |
5803 | #else | 5802 | #else |
5804 | if (state == TASK_RUNNING) | 5803 | if (state == TASK_RUNNING) |
5805 | printk(KERN_CONT " running task "); | 5804 | printk(KERN_CONT " running task "); |
5806 | else | 5805 | else |
5807 | printk(KERN_CONT " %016lx ", thread_saved_pc(p)); | 5806 | printk(KERN_CONT " %016lx ", thread_saved_pc(p)); |
5808 | #endif | 5807 | #endif |
5809 | #ifdef CONFIG_DEBUG_STACK_USAGE | 5808 | #ifdef CONFIG_DEBUG_STACK_USAGE |
5810 | free = stack_not_used(p); | 5809 | free = stack_not_used(p); |
5811 | #endif | 5810 | #endif |
5812 | printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free, | 5811 | printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free, |
5813 | task_pid_nr(p), task_pid_nr(p->real_parent), | 5812 | task_pid_nr(p), task_pid_nr(p->real_parent), |
5814 | (unsigned long)task_thread_info(p)->flags); | 5813 | (unsigned long)task_thread_info(p)->flags); |
5815 | 5814 | ||
5816 | show_stack(p, NULL); | 5815 | show_stack(p, NULL); |
5817 | } | 5816 | } |
5818 | 5817 | ||
5819 | void show_state_filter(unsigned long state_filter) | 5818 | void show_state_filter(unsigned long state_filter) |
5820 | { | 5819 | { |
5821 | struct task_struct *g, *p; | 5820 | struct task_struct *g, *p; |
5822 | 5821 | ||
5823 | #if BITS_PER_LONG == 32 | 5822 | #if BITS_PER_LONG == 32 |
5824 | printk(KERN_INFO | 5823 | printk(KERN_INFO |
5825 | " task PC stack pid father\n"); | 5824 | " task PC stack pid father\n"); |
5826 | #else | 5825 | #else |
5827 | printk(KERN_INFO | 5826 | printk(KERN_INFO |
5828 | " task PC stack pid father\n"); | 5827 | " task PC stack pid father\n"); |
5829 | #endif | 5828 | #endif |
5830 | read_lock(&tasklist_lock); | 5829 | read_lock(&tasklist_lock); |
5831 | do_each_thread(g, p) { | 5830 | do_each_thread(g, p) { |
5832 | /* | 5831 | /* |
5833 | * reset the NMI-timeout; listing all tasks on a slow | 5832 | * reset the NMI-timeout; listing all tasks on a slow |

5834 | * console might take a lot of time: | 5833 | * console might take a lot of time: |
5835 | */ | 5834 | */ |
5836 | touch_nmi_watchdog(); | 5835 | touch_nmi_watchdog(); |
5837 | if (!state_filter || (p->state & state_filter)) | 5836 | if (!state_filter || (p->state & state_filter)) |
5838 | sched_show_task(p); | 5837 | sched_show_task(p); |
5839 | } while_each_thread(g, p); | 5838 | } while_each_thread(g, p); |
5840 | 5839 | ||
5841 | touch_all_softlockup_watchdogs(); | 5840 | touch_all_softlockup_watchdogs(); |
5842 | 5841 | ||
5843 | #ifdef CONFIG_SCHED_DEBUG | 5842 | #ifdef CONFIG_SCHED_DEBUG |
5844 | sysrq_sched_debug_show(); | 5843 | sysrq_sched_debug_show(); |
5845 | #endif | 5844 | #endif |
5846 | read_unlock(&tasklist_lock); | 5845 | read_unlock(&tasklist_lock); |
5847 | /* | 5846 | /* |
5848 | * Only show locks if all tasks are dumped: | 5847 | * Only show locks if all tasks are dumped: |
5849 | */ | 5848 | */ |
5850 | if (!state_filter) | 5849 | if (!state_filter) |
5851 | debug_show_all_locks(); | 5850 | debug_show_all_locks(); |
5852 | } | 5851 | } |
5853 | 5852 | ||
5854 | void __cpuinit init_idle_bootup_task(struct task_struct *idle) | 5853 | void __cpuinit init_idle_bootup_task(struct task_struct *idle) |
5855 | { | 5854 | { |
5856 | idle->sched_class = &idle_sched_class; | 5855 | idle->sched_class = &idle_sched_class; |
5857 | } | 5856 | } |
5858 | 5857 | ||
5859 | /** | 5858 | /** |
5860 | * init_idle - set up an idle thread for a given CPU | 5859 | * init_idle - set up an idle thread for a given CPU |
5861 | * @idle: task in question | 5860 | * @idle: task in question |
5862 | * @cpu: cpu the idle task belongs to | 5861 | * @cpu: cpu the idle task belongs to |
5863 | * | 5862 | * |
5864 | * NOTE: this function does not set the idle thread's NEED_RESCHED | 5863 | * NOTE: this function does not set the idle thread's NEED_RESCHED |
5865 | * flag, to make booting more robust. | 5864 | * flag, to make booting more robust. |
5866 | */ | 5865 | */ |
5867 | void __cpuinit init_idle(struct task_struct *idle, int cpu) | 5866 | void __cpuinit init_idle(struct task_struct *idle, int cpu) |
5868 | { | 5867 | { |
5869 | struct rq *rq = cpu_rq(cpu); | 5868 | struct rq *rq = cpu_rq(cpu); |
5870 | unsigned long flags; | 5869 | unsigned long flags; |
5871 | 5870 | ||
5872 | raw_spin_lock_irqsave(&rq->lock, flags); | 5871 | raw_spin_lock_irqsave(&rq->lock, flags); |
5873 | 5872 | ||
5874 | __sched_fork(idle); | 5873 | __sched_fork(idle); |
5875 | idle->state = TASK_RUNNING; | 5874 | idle->state = TASK_RUNNING; |
5876 | idle->se.exec_start = sched_clock(); | 5875 | idle->se.exec_start = sched_clock(); |
5877 | 5876 | ||
5878 | do_set_cpus_allowed(idle, cpumask_of(cpu)); | 5877 | do_set_cpus_allowed(idle, cpumask_of(cpu)); |
5879 | /* | 5878 | /* |
5880 | * We're having a chicken and egg problem: even though we are | 5879 | * We're having a chicken and egg problem: even though we are |
5881 | * holding rq->lock, the cpu isn't yet set to this cpu, so the | 5880 | * holding rq->lock, the cpu isn't yet set to this cpu, so the |
5882 | * lockdep check in task_group() will fail. | 5881 | * lockdep check in task_group() will fail. |
5883 | * | 5882 | * |
5884 | * Similar case to sched_fork(). / Alternatively we could | 5883 | * Similar case to sched_fork(). / Alternatively we could |
5885 | * use task_rq_lock() here and obtain the other rq->lock. | 5884 | * use task_rq_lock() here and obtain the other rq->lock. |
5886 | * | 5885 | * |
5887 | * Silence PROVE_RCU | 5886 | * Silence PROVE_RCU |
5888 | */ | 5887 | */ |
5889 | rcu_read_lock(); | 5888 | rcu_read_lock(); |
5890 | __set_task_cpu(idle, cpu); | 5889 | __set_task_cpu(idle, cpu); |
5891 | rcu_read_unlock(); | 5890 | rcu_read_unlock(); |
5892 | 5891 | ||
5893 | rq->curr = rq->idle = idle; | 5892 | rq->curr = rq->idle = idle; |
5894 | #if defined(CONFIG_SMP) | 5893 | #if defined(CONFIG_SMP) |
5895 | idle->on_cpu = 1; | 5894 | idle->on_cpu = 1; |
5896 | #endif | 5895 | #endif |
5897 | raw_spin_unlock_irqrestore(&rq->lock, flags); | 5896 | raw_spin_unlock_irqrestore(&rq->lock, flags); |
5898 | 5897 | ||
5899 | /* Set the preempt count _outside_ the spinlocks! */ | 5898 | /* Set the preempt count _outside_ the spinlocks! */ |
5900 | task_thread_info(idle)->preempt_count = 0; | 5899 | task_thread_info(idle)->preempt_count = 0; |
5901 | 5900 | ||
5902 | /* | 5901 | /* |
5903 | * The idle tasks have their own, simple scheduling class: | 5902 | * The idle tasks have their own, simple scheduling class: |
5904 | */ | 5903 | */ |
5905 | idle->sched_class = &idle_sched_class; | 5904 | idle->sched_class = &idle_sched_class; |
5906 | ftrace_graph_init_idle_task(idle, cpu); | 5905 | ftrace_graph_init_idle_task(idle, cpu); |
5907 | } | 5906 | } |
5908 | 5907 | ||
5909 | /* | 5908 | /* |
5910 | * In a system that switches off the HZ timer nohz_cpu_mask | 5909 | * In a system that switches off the HZ timer nohz_cpu_mask |
5911 | * indicates which cpus entered this state. This is used | 5910 | * indicates which cpus entered this state. This is used |
5912 | * in the rcu update to wait only for active cpus. For systems | 5911 | * in the rcu update to wait only for active cpus. For systems |
5913 | * which do not switch off the HZ timer nohz_cpu_mask should | 5912 | * which do not switch off the HZ timer nohz_cpu_mask should |
5914 | * always be CPU_BITS_NONE. | 5913 | * always be CPU_BITS_NONE. |
5915 | */ | 5914 | */ |
5916 | cpumask_var_t nohz_cpu_mask; | 5915 | cpumask_var_t nohz_cpu_mask; |
5917 | 5916 | ||
5918 | /* | 5917 | /* |
5919 | * Increase the granularity value when there are more CPUs, | 5918 | * Increase the granularity value when there are more CPUs, |
5920 | * because with more CPUs the 'effective latency' as visible | 5919 | * because with more CPUs the 'effective latency' as visible |
5921 | * to users decreases. But the relationship is not linear, | 5920 | * to users decreases. But the relationship is not linear, |
5922 | * so pick a second-best guess by going with the log2 of the | 5921 | * so pick a second-best guess by going with the log2 of the |
5923 | * number of CPUs. | 5922 | * number of CPUs. |
5924 | * | 5923 | * |
5925 | * This idea comes from the SD scheduler of Con Kolivas: | 5924 | * This idea comes from the SD scheduler of Con Kolivas: |
5926 | */ | 5925 | */ |
5927 | static int get_update_sysctl_factor(void) | 5926 | static int get_update_sysctl_factor(void) |
5928 | { | 5927 | { |
5929 | unsigned int cpus = min_t(int, num_online_cpus(), 8); | 5928 | unsigned int cpus = min_t(int, num_online_cpus(), 8); |
5930 | unsigned int factor; | 5929 | unsigned int factor; |
5931 | 5930 | ||
5932 | switch (sysctl_sched_tunable_scaling) { | 5931 | switch (sysctl_sched_tunable_scaling) { |
5933 | case SCHED_TUNABLESCALING_NONE: | 5932 | case SCHED_TUNABLESCALING_NONE: |
5934 | factor = 1; | 5933 | factor = 1; |
5935 | break; | 5934 | break; |
5936 | case SCHED_TUNABLESCALING_LINEAR: | 5935 | case SCHED_TUNABLESCALING_LINEAR: |
5937 | factor = cpus; | 5936 | factor = cpus; |
5938 | break; | 5937 | break; |
5939 | case SCHED_TUNABLESCALING_LOG: | 5938 | case SCHED_TUNABLESCALING_LOG: |
5940 | default: | 5939 | default: |
5941 | factor = 1 + ilog2(cpus); | 5940 | factor = 1 + ilog2(cpus); |
5942 | break; | 5941 | break; |
5943 | } | 5942 | } |
5944 | 5943 | ||
5945 | return factor; | 5944 | return factor; |
5946 | } | 5945 | } |
5947 | 5946 | ||
5948 | static void update_sysctl(void) | 5947 | static void update_sysctl(void) |
5949 | { | 5948 | { |
5950 | unsigned int factor = get_update_sysctl_factor(); | 5949 | unsigned int factor = get_update_sysctl_factor(); |
5951 | 5950 | ||
5952 | #define SET_SYSCTL(name) \ | 5951 | #define SET_SYSCTL(name) \ |
5953 | (sysctl_##name = (factor) * normalized_sysctl_##name) | 5952 | (sysctl_##name = (factor) * normalized_sysctl_##name) |
5954 | SET_SYSCTL(sched_min_granularity); | 5953 | SET_SYSCTL(sched_min_granularity); |
5955 | SET_SYSCTL(sched_latency); | 5954 | SET_SYSCTL(sched_latency); |
5956 | SET_SYSCTL(sched_wakeup_granularity); | 5955 | SET_SYSCTL(sched_wakeup_granularity); |
5957 | #undef SET_SYSCTL | 5956 | #undef SET_SYSCTL |
5958 | } | 5957 | } |
5959 | 5958 | ||
5960 | static inline void sched_init_granularity(void) | 5959 | static inline void sched_init_granularity(void) |
5961 | { | 5960 | { |
5962 | update_sysctl(); | 5961 | update_sysctl(); |
5963 | } | 5962 | } |
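To make the scaling concrete, a standalone sketch (userspace, assuming the default SCHED_TUNABLESCALING_LOG case) that mirrors the factor computation above; each sysctl_sched_* value then becomes factor times its normalized_sysctl_* counterpart.

#include <stdio.h>

/* Mirror the SCHED_TUNABLESCALING_LOG branch: clamp to 8 CPUs, then 1 + ilog2. */
static unsigned int log_factor(unsigned int online_cpus)
{
        unsigned int cpus = online_cpus < 8 ? online_cpus : 8;
        unsigned int log2 = 0;

        while (cpus >>= 1)
                log2++;
        return 1 + log2;
}

int main(void)
{
        unsigned int n;

        /* 1 CPU -> 1, 2 -> 2, 4 -> 3, 8 or more -> 4 */
        for (n = 1; n <= 16; n *= 2)
                printf("%2u CPUs -> factor %u\n", n, log_factor(n));
        return 0;
}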
5964 | 5963 | ||
5965 | #ifdef CONFIG_SMP | 5964 | #ifdef CONFIG_SMP |
5966 | void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) | 5965 | void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) |
5967 | { | 5966 | { |
5968 | if (p->sched_class && p->sched_class->set_cpus_allowed) | 5967 | if (p->sched_class && p->sched_class->set_cpus_allowed) |
5969 | p->sched_class->set_cpus_allowed(p, new_mask); | 5968 | p->sched_class->set_cpus_allowed(p, new_mask); |
5970 | else { | 5969 | else { |
5971 | cpumask_copy(&p->cpus_allowed, new_mask); | 5970 | cpumask_copy(&p->cpus_allowed, new_mask); |
5972 | p->rt.nr_cpus_allowed = cpumask_weight(new_mask); | 5971 | p->rt.nr_cpus_allowed = cpumask_weight(new_mask); |
5973 | } | 5972 | } |
5974 | } | 5973 | } |
5975 | 5974 | ||
5976 | /* | 5975 | /* |
5977 | * This is how migration works: | 5976 | * This is how migration works: |
5978 | * | 5977 | * |
5979 | * 1) we invoke migration_cpu_stop() on the target CPU using | 5978 | * 1) we invoke migration_cpu_stop() on the target CPU using |
5980 | * stop_one_cpu(). | 5979 | * stop_one_cpu(). |
5981 | * 2) stopper starts to run (implicitly forcing the migrated thread | 5980 | * 2) stopper starts to run (implicitly forcing the migrated thread |
5982 | * off the CPU) | 5981 | * off the CPU) |
5983 | * 3) it checks whether the migrated task is still in the wrong runqueue. | 5982 | * 3) it checks whether the migrated task is still in the wrong runqueue. |
5984 | * 4) if it's in the wrong runqueue then the migration thread removes | 5983 | * 4) if it's in the wrong runqueue then the migration thread removes |
5985 | * it and puts it into the right queue. | 5984 | * it and puts it into the right queue. |
5986 | * 5) stopper completes and stop_one_cpu() returns and the migration | 5985 | * 5) stopper completes and stop_one_cpu() returns and the migration |
5987 | * is done. | 5986 | * is done. |
5988 | */ | 5987 | */ |
5989 | 5988 | ||
5990 | /* | 5989 | /* |
5991 | * Change a given task's CPU affinity. Migrate the thread to a | 5990 | * Change a given task's CPU affinity. Migrate the thread to a |
5992 | * proper CPU and schedule it away if the CPU it's executing on | 5991 | * proper CPU and schedule it away if the CPU it's executing on |
5993 | * is removed from the allowed bitmask. | 5992 | * is removed from the allowed bitmask. |
5994 | * | 5993 | * |
5995 | * NOTE: the caller must have a valid reference to the task; the | 5994 | * NOTE: the caller must have a valid reference to the task; the |
5996 | * task must not exit() & deallocate itself prematurely. The | 5995 | * task must not exit() & deallocate itself prematurely. The |
5997 | * call is not atomic; no spinlocks may be held. | 5996 | * call is not atomic; no spinlocks may be held. |
5998 | */ | 5997 | */ |
5999 | int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) | 5998 | int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) |
6000 | { | 5999 | { |
6001 | unsigned long flags; | 6000 | unsigned long flags; |
6002 | struct rq *rq; | 6001 | struct rq *rq; |
6003 | unsigned int dest_cpu; | 6002 | unsigned int dest_cpu; |
6004 | int ret = 0; | 6003 | int ret = 0; |
6005 | 6004 | ||
6006 | rq = task_rq_lock(p, &flags); | 6005 | rq = task_rq_lock(p, &flags); |
6007 | 6006 | ||
6008 | if (cpumask_equal(&p->cpus_allowed, new_mask)) | 6007 | if (cpumask_equal(&p->cpus_allowed, new_mask)) |
6009 | goto out; | 6008 | goto out; |
6010 | 6009 | ||
6011 | if (!cpumask_intersects(new_mask, cpu_active_mask)) { | 6010 | if (!cpumask_intersects(new_mask, cpu_active_mask)) { |
6012 | ret = -EINVAL; | 6011 | ret = -EINVAL; |
6013 | goto out; | 6012 | goto out; |
6014 | } | 6013 | } |
6015 | 6014 | ||
6016 | if (unlikely((p->flags & PF_THREAD_BOUND) && p != current)) { | 6015 | if (unlikely((p->flags & PF_THREAD_BOUND) && p != current)) { |
6017 | ret = -EINVAL; | 6016 | ret = -EINVAL; |
6018 | goto out; | 6017 | goto out; |
6019 | } | 6018 | } |
6020 | 6019 | ||
6021 | do_set_cpus_allowed(p, new_mask); | 6020 | do_set_cpus_allowed(p, new_mask); |
6022 | 6021 | ||
6023 | /* Can the task run on the task's current CPU? If so, we're done */ | 6022 | /* Can the task run on the task's current CPU? If so, we're done */ |
6024 | if (cpumask_test_cpu(task_cpu(p), new_mask)) | 6023 | if (cpumask_test_cpu(task_cpu(p), new_mask)) |
6025 | goto out; | 6024 | goto out; |
6026 | 6025 | ||
6027 | dest_cpu = cpumask_any_and(cpu_active_mask, new_mask); | 6026 | dest_cpu = cpumask_any_and(cpu_active_mask, new_mask); |
6028 | if (p->on_rq) { | 6027 | if (p->on_rq) { |
6029 | struct migration_arg arg = { p, dest_cpu }; | 6028 | struct migration_arg arg = { p, dest_cpu }; |
6030 | /* Need help from migration thread: drop lock and wait. */ | 6029 | /* Need help from migration thread: drop lock and wait. */ |
6031 | task_rq_unlock(rq, p, &flags); | 6030 | task_rq_unlock(rq, p, &flags); |
6032 | stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg); | 6031 | stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg); |
6033 | tlb_migrate_finish(p->mm); | 6032 | tlb_migrate_finish(p->mm); |
6034 | return 0; | 6033 | return 0; |
6035 | } | 6034 | } |
6036 | out: | 6035 | out: |
6037 | task_rq_unlock(rq, p, &flags); | 6036 | task_rq_unlock(rq, p, &flags); |
6038 | 6037 | ||
6039 | return ret; | 6038 | return ret; |
6040 | } | 6039 | } |
6041 | EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr); | 6040 | EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr); |
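A minimal sketch of a hypothetical in-kernel caller (not from this patch): a kthread pinning itself to a single CPU with set_cpus_allowed_ptr(); note that the PF_THREAD_BOUND check above does not reject the p == current case.

#include <linux/kthread.h>
#include <linux/cpumask.h>
#include <linux/sched.h>
#include <linux/jiffies.h>
#include <linux/kernel.h>

/* Hypothetical worker: the CPU number is passed in by whoever creates the thread. */
static int pinned_worker(void *data)
{
        int cpu = (long)data;

        if (set_cpus_allowed_ptr(current, cpumask_of(cpu)))
                pr_warn("pinned_worker: cannot bind to CPU %d\n", cpu);

        while (!kthread_should_stop()) {
                /* ... per-CPU work would go here ... */
                schedule_timeout_interruptible(HZ);
        }
        return 0;
}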
6042 | 6041 | ||
6043 | /* | 6042 | /* |
6044 | * Move (not current) task off this cpu, onto dest cpu. We're doing | 6043 | * Move (not current) task off this cpu, onto dest cpu. We're doing |
6045 | * this because either it can't run here any more (set_cpus_allowed() | 6044 | * this because either it can't run here any more (set_cpus_allowed() |
6046 | * away from this CPU, or CPU going down), or because we're | 6045 | * away from this CPU, or CPU going down), or because we're |
6047 | * attempting to rebalance this task on exec (sched_exec). | 6046 | * attempting to rebalance this task on exec (sched_exec). |
6048 | * | 6047 | * |
6049 | * So we race with normal scheduler movements, but that's OK, as long | 6048 | * So we race with normal scheduler movements, but that's OK, as long |
6050 | * as the task is no longer on this CPU. | 6049 | * as the task is no longer on this CPU. |
6051 | * | 6050 | * |
6052 | * Returns non-zero if task was successfully migrated. | 6051 | * Returns non-zero if task was successfully migrated. |
6053 | */ | 6052 | */ |
6054 | static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) | 6053 | static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) |
6055 | { | 6054 | { |
6056 | struct rq *rq_dest, *rq_src; | 6055 | struct rq *rq_dest, *rq_src; |
6057 | int ret = 0; | 6056 | int ret = 0; |
6058 | 6057 | ||
6059 | if (unlikely(!cpu_active(dest_cpu))) | 6058 | if (unlikely(!cpu_active(dest_cpu))) |
6060 | return ret; | 6059 | return ret; |
6061 | 6060 | ||
6062 | rq_src = cpu_rq(src_cpu); | 6061 | rq_src = cpu_rq(src_cpu); |
6063 | rq_dest = cpu_rq(dest_cpu); | 6062 | rq_dest = cpu_rq(dest_cpu); |
6064 | 6063 | ||
6065 | raw_spin_lock(&p->pi_lock); | 6064 | raw_spin_lock(&p->pi_lock); |
6066 | double_rq_lock(rq_src, rq_dest); | 6065 | double_rq_lock(rq_src, rq_dest); |
6067 | /* Already moved. */ | 6066 | /* Already moved. */ |
6068 | if (task_cpu(p) != src_cpu) | 6067 | if (task_cpu(p) != src_cpu) |
6069 | goto done; | 6068 | goto done; |
6070 | /* Affinity changed (again). */ | 6069 | /* Affinity changed (again). */ |
6071 | if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed)) | 6070 | if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed)) |
6072 | goto fail; | 6071 | goto fail; |
6073 | 6072 | ||
6074 | /* | 6073 | /* |
6075 | * If we're not on a rq, the next wake-up will ensure we're | 6074 | * If we're not on a rq, the next wake-up will ensure we're |
6076 | * placed properly. | 6075 | * placed properly. |
6077 | */ | 6076 | */ |
6078 | if (p->on_rq) { | 6077 | if (p->on_rq) { |
6079 | deactivate_task(rq_src, p, 0); | 6078 | deactivate_task(rq_src, p, 0); |
6080 | set_task_cpu(p, dest_cpu); | 6079 | set_task_cpu(p, dest_cpu); |
6081 | activate_task(rq_dest, p, 0); | 6080 | activate_task(rq_dest, p, 0); |
6082 | check_preempt_curr(rq_dest, p, 0); | 6081 | check_preempt_curr(rq_dest, p, 0); |
6083 | } | 6082 | } |
6084 | done: | 6083 | done: |
6085 | ret = 1; | 6084 | ret = 1; |
6086 | fail: | 6085 | fail: |
6087 | double_rq_unlock(rq_src, rq_dest); | 6086 | double_rq_unlock(rq_src, rq_dest); |
6088 | raw_spin_unlock(&p->pi_lock); | 6087 | raw_spin_unlock(&p->pi_lock); |
6089 | return ret; | 6088 | return ret; |
6090 | } | 6089 | } |
6091 | 6090 | ||
6092 | /* | 6091 | /* |
6093 | * migration_cpu_stop - this will be executed by a highprio stopper thread | 6092 | * migration_cpu_stop - this will be executed by a highprio stopper thread |
6094 | * and performs thread migration by bumping thread off CPU then | 6093 | * and performs thread migration by bumping thread off CPU then |
6095 | * 'pushing' onto another runqueue. | 6094 | * 'pushing' onto another runqueue. |
6096 | */ | 6095 | */ |
6097 | static int migration_cpu_stop(void *data) | 6096 | static int migration_cpu_stop(void *data) |
6098 | { | 6097 | { |
6099 | struct migration_arg *arg = data; | 6098 | struct migration_arg *arg = data; |
6100 | 6099 | ||
6101 | /* | 6100 | /* |
6102 | * The original target cpu might have gone down and we might | 6101 | * The original target cpu might have gone down and we might |
6103 | * be on another cpu but it doesn't matter. | 6102 | * be on another cpu but it doesn't matter. |
6104 | */ | 6103 | */ |
6105 | local_irq_disable(); | 6104 | local_irq_disable(); |
6106 | __migrate_task(arg->task, raw_smp_processor_id(), arg->dest_cpu); | 6105 | __migrate_task(arg->task, raw_smp_processor_id(), arg->dest_cpu); |
6107 | local_irq_enable(); | 6106 | local_irq_enable(); |
6108 | return 0; | 6107 | return 0; |
6109 | } | 6108 | } |
6110 | 6109 | ||
6111 | #ifdef CONFIG_HOTPLUG_CPU | 6110 | #ifdef CONFIG_HOTPLUG_CPU |
6112 | 6111 | ||
6113 | /* | 6112 | /* |
6114 | * Ensures that the idle task is using init_mm right before its cpu goes | 6113 | * Ensures that the idle task is using init_mm right before its cpu goes |
6115 | * offline. | 6114 | * offline. |
6116 | */ | 6115 | */ |
6117 | void idle_task_exit(void) | 6116 | void idle_task_exit(void) |
6118 | { | 6117 | { |
6119 | struct mm_struct *mm = current->active_mm; | 6118 | struct mm_struct *mm = current->active_mm; |
6120 | 6119 | ||
6121 | BUG_ON(cpu_online(smp_processor_id())); | 6120 | BUG_ON(cpu_online(smp_processor_id())); |
6122 | 6121 | ||
6123 | if (mm != &init_mm) | 6122 | if (mm != &init_mm) |
6124 | switch_mm(mm, &init_mm, current); | 6123 | switch_mm(mm, &init_mm, current); |
6125 | mmdrop(mm); | 6124 | mmdrop(mm); |
6126 | } | 6125 | } |
6127 | 6126 | ||
6128 | /* | 6127 | /* |
6129 | * While a dead CPU has no uninterruptible tasks queued at this point, | 6128 | * While a dead CPU has no uninterruptible tasks queued at this point, |
6130 | * it might still have a nonzero ->nr_uninterruptible counter, because | 6129 | * it might still have a nonzero ->nr_uninterruptible counter, because |
6131 | * for performance reasons the counter is not strictly tracking tasks to | 6130 | * for performance reasons the counter is not strictly tracking tasks to |
6132 | * their home CPUs. So we just add the counter to another CPU's counter, | 6131 | * their home CPUs. So we just add the counter to another CPU's counter, |
6133 | * to keep the global sum constant after CPU-down: | 6132 | * to keep the global sum constant after CPU-down: |
6134 | */ | 6133 | */ |
6135 | static void migrate_nr_uninterruptible(struct rq *rq_src) | 6134 | static void migrate_nr_uninterruptible(struct rq *rq_src) |
6136 | { | 6135 | { |
6137 | struct rq *rq_dest = cpu_rq(cpumask_any(cpu_active_mask)); | 6136 | struct rq *rq_dest = cpu_rq(cpumask_any(cpu_active_mask)); |
6138 | 6137 | ||
6139 | rq_dest->nr_uninterruptible += rq_src->nr_uninterruptible; | 6138 | rq_dest->nr_uninterruptible += rq_src->nr_uninterruptible; |
6140 | rq_src->nr_uninterruptible = 0; | 6139 | rq_src->nr_uninterruptible = 0; |
6141 | } | 6140 | } |
6142 | 6141 | ||
6143 | /* | 6142 | /* |
6144 | * remove the tasks which were accounted by rq from calc_load_tasks. | 6143 | * remove the tasks which were accounted by rq from calc_load_tasks. |
6145 | */ | 6144 | */ |
6146 | static void calc_global_load_remove(struct rq *rq) | 6145 | static void calc_global_load_remove(struct rq *rq) |
6147 | { | 6146 | { |
6148 | atomic_long_sub(rq->calc_load_active, &calc_load_tasks); | 6147 | atomic_long_sub(rq->calc_load_active, &calc_load_tasks); |
6149 | rq->calc_load_active = 0; | 6148 | rq->calc_load_active = 0; |
6150 | } | 6149 | } |
6151 | 6150 | ||
6152 | /* | 6151 | /* |
6153 | * Migrate all tasks from the rq; sleeping tasks will be migrated by | 6152 | * Migrate all tasks from the rq; sleeping tasks will be migrated by |
6154 | * try_to_wake_up()->select_task_rq(). | 6153 | * try_to_wake_up()->select_task_rq(). |
6155 | * | 6154 | * |
6156 | * Called with rq->lock held even though we're in stop_machine() and | 6155 | * Called with rq->lock held even though we're in stop_machine() and |
6157 | * there's no concurrency possible; we hold the required locks anyway | 6156 | * there's no concurrency possible; we hold the required locks anyway |
6158 | * because of lock validation efforts. | 6157 | * because of lock validation efforts. |
6159 | */ | 6158 | */ |
6160 | static void migrate_tasks(unsigned int dead_cpu) | 6159 | static void migrate_tasks(unsigned int dead_cpu) |
6161 | { | 6160 | { |
6162 | struct rq *rq = cpu_rq(dead_cpu); | 6161 | struct rq *rq = cpu_rq(dead_cpu); |
6163 | struct task_struct *next, *stop = rq->stop; | 6162 | struct task_struct *next, *stop = rq->stop; |
6164 | int dest_cpu; | 6163 | int dest_cpu; |
6165 | 6164 | ||
6166 | /* | 6165 | /* |
6167 | * Fudge the rq selection such that the below task selection loop | 6166 | * Fudge the rq selection such that the below task selection loop |
6168 | * doesn't get stuck on the currently eligible stop task. | 6167 | * doesn't get stuck on the currently eligible stop task. |
6169 | * | 6168 | * |
6170 | * We're currently inside stop_machine() and the rq is either stuck | 6169 | * We're currently inside stop_machine() and the rq is either stuck |
6171 | * in the stop_machine_cpu_stop() loop, or we're executing this code, | 6170 | * in the stop_machine_cpu_stop() loop, or we're executing this code, |
6172 | * either way we should never end up calling schedule() until we're | 6171 | * either way we should never end up calling schedule() until we're |
6173 | * done here. | 6172 | * done here. |
6174 | */ | 6173 | */ |
6175 | rq->stop = NULL; | 6174 | rq->stop = NULL; |
6176 | 6175 | ||
6177 | for ( ; ; ) { | 6176 | for ( ; ; ) { |
6178 | /* | 6177 | /* |
6179 | * There's this thread running, bail when that's the only | 6178 | * There's this thread running, bail when that's the only |
6180 | * remaining thread. | 6179 | * remaining thread. |
6181 | */ | 6180 | */ |
6182 | if (rq->nr_running == 1) | 6181 | if (rq->nr_running == 1) |
6183 | break; | 6182 | break; |
6184 | 6183 | ||
6185 | next = pick_next_task(rq); | 6184 | next = pick_next_task(rq); |
6186 | BUG_ON(!next); | 6185 | BUG_ON(!next); |
6187 | next->sched_class->put_prev_task(rq, next); | 6186 | next->sched_class->put_prev_task(rq, next); |
6188 | 6187 | ||
6189 | /* Find suitable destination for @next, with force if needed. */ | 6188 | /* Find suitable destination for @next, with force if needed. */ |
6190 | dest_cpu = select_fallback_rq(dead_cpu, next); | 6189 | dest_cpu = select_fallback_rq(dead_cpu, next); |
6191 | raw_spin_unlock(&rq->lock); | 6190 | raw_spin_unlock(&rq->lock); |
6192 | 6191 | ||
6193 | __migrate_task(next, dead_cpu, dest_cpu); | 6192 | __migrate_task(next, dead_cpu, dest_cpu); |
6194 | 6193 | ||
6195 | raw_spin_lock(&rq->lock); | 6194 | raw_spin_lock(&rq->lock); |
6196 | } | 6195 | } |
6197 | 6196 | ||
6198 | rq->stop = stop; | 6197 | rq->stop = stop; |
6199 | } | 6198 | } |
6200 | 6199 | ||
6201 | #endif /* CONFIG_HOTPLUG_CPU */ | 6200 | #endif /* CONFIG_HOTPLUG_CPU */ |
6202 | 6201 | ||
6203 | #if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL) | 6202 | #if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL) |
6204 | 6203 | ||
6205 | static struct ctl_table sd_ctl_dir[] = { | 6204 | static struct ctl_table sd_ctl_dir[] = { |
6206 | { | 6205 | { |
6207 | .procname = "sched_domain", | 6206 | .procname = "sched_domain", |
6208 | .mode = 0555, | 6207 | .mode = 0555, |
6209 | }, | 6208 | }, |
6210 | {} | 6209 | {} |
6211 | }; | 6210 | }; |
6212 | 6211 | ||
6213 | static struct ctl_table sd_ctl_root[] = { | 6212 | static struct ctl_table sd_ctl_root[] = { |
6214 | { | 6213 | { |
6215 | .procname = "kernel", | 6214 | .procname = "kernel", |
6216 | .mode = 0555, | 6215 | .mode = 0555, |
6217 | .child = sd_ctl_dir, | 6216 | .child = sd_ctl_dir, |
6218 | }, | 6217 | }, |
6219 | {} | 6218 | {} |
6220 | }; | 6219 | }; |
6221 | 6220 | ||
6222 | static struct ctl_table *sd_alloc_ctl_entry(int n) | 6221 | static struct ctl_table *sd_alloc_ctl_entry(int n) |
6223 | { | 6222 | { |
6224 | struct ctl_table *entry = | 6223 | struct ctl_table *entry = |
6225 | kcalloc(n, sizeof(struct ctl_table), GFP_KERNEL); | 6224 | kcalloc(n, sizeof(struct ctl_table), GFP_KERNEL); |
6226 | 6225 | ||
6227 | return entry; | 6226 | return entry; |
6228 | } | 6227 | } |
6229 | 6228 | ||
6230 | static void sd_free_ctl_entry(struct ctl_table **tablep) | 6229 | static void sd_free_ctl_entry(struct ctl_table **tablep) |
6231 | { | 6230 | { |
6232 | struct ctl_table *entry; | 6231 | struct ctl_table *entry; |
6233 | 6232 | ||
6234 | /* | 6233 | /* |
6235 | * In the intermediate directories, both the child directory and | 6234 | * In the intermediate directories, both the child directory and |
6236 | * procname are dynamically allocated and could fail but the mode | 6235 | * procname are dynamically allocated and could fail but the mode |
6237 | * will always be set. In the lowest directory the names are | 6236 | * will always be set. In the lowest directory the names are |
6238 | * static strings and all have proc handlers. | 6237 | * static strings and all have proc handlers. |
6239 | */ | 6238 | */ |
6240 | for (entry = *tablep; entry->mode; entry++) { | 6239 | for (entry = *tablep; entry->mode; entry++) { |
6241 | if (entry->child) | 6240 | if (entry->child) |
6242 | sd_free_ctl_entry(&entry->child); | 6241 | sd_free_ctl_entry(&entry->child); |
6243 | if (entry->proc_handler == NULL) | 6242 | if (entry->proc_handler == NULL) |
6244 | kfree(entry->procname); | 6243 | kfree(entry->procname); |
6245 | } | 6244 | } |
6246 | 6245 | ||
6247 | kfree(*tablep); | 6246 | kfree(*tablep); |
6248 | *tablep = NULL; | 6247 | *tablep = NULL; |
6249 | } | 6248 | } |
6250 | 6249 | ||
6251 | static void | 6250 | static void |
6252 | set_table_entry(struct ctl_table *entry, | 6251 | set_table_entry(struct ctl_table *entry, |
6253 | const char *procname, void *data, int maxlen, | 6252 | const char *procname, void *data, int maxlen, |
6254 | mode_t mode, proc_handler *proc_handler) | 6253 | mode_t mode, proc_handler *proc_handler) |
6255 | { | 6254 | { |
6256 | entry->procname = procname; | 6255 | entry->procname = procname; |
6257 | entry->data = data; | 6256 | entry->data = data; |
6258 | entry->maxlen = maxlen; | 6257 | entry->maxlen = maxlen; |
6259 | entry->mode = mode; | 6258 | entry->mode = mode; |
6260 | entry->proc_handler = proc_handler; | 6259 | entry->proc_handler = proc_handler; |
6261 | } | 6260 | } |
6262 | 6261 | ||
6263 | static struct ctl_table * | 6262 | static struct ctl_table * |
6264 | sd_alloc_ctl_domain_table(struct sched_domain *sd) | 6263 | sd_alloc_ctl_domain_table(struct sched_domain *sd) |
6265 | { | 6264 | { |
6266 | struct ctl_table *table = sd_alloc_ctl_entry(13); | 6265 | struct ctl_table *table = sd_alloc_ctl_entry(13); |
6267 | 6266 | ||
6268 | if (table == NULL) | 6267 | if (table == NULL) |
6269 | return NULL; | 6268 | return NULL; |
6270 | 6269 | ||
6271 | set_table_entry(&table[0], "min_interval", &sd->min_interval, | 6270 | set_table_entry(&table[0], "min_interval", &sd->min_interval, |
6272 | sizeof(long), 0644, proc_doulongvec_minmax); | 6271 | sizeof(long), 0644, proc_doulongvec_minmax); |
6273 | set_table_entry(&table[1], "max_interval", &sd->max_interval, | 6272 | set_table_entry(&table[1], "max_interval", &sd->max_interval, |
6274 | sizeof(long), 0644, proc_doulongvec_minmax); | 6273 | sizeof(long), 0644, proc_doulongvec_minmax); |
6275 | set_table_entry(&table[2], "busy_idx", &sd->busy_idx, | 6274 | set_table_entry(&table[2], "busy_idx", &sd->busy_idx, |
6276 | sizeof(int), 0644, proc_dointvec_minmax); | 6275 | sizeof(int), 0644, proc_dointvec_minmax); |
6277 | set_table_entry(&table[3], "idle_idx", &sd->idle_idx, | 6276 | set_table_entry(&table[3], "idle_idx", &sd->idle_idx, |
6278 | sizeof(int), 0644, proc_dointvec_minmax); | 6277 | sizeof(int), 0644, proc_dointvec_minmax); |
6279 | set_table_entry(&table[4], "newidle_idx", &sd->newidle_idx, | 6278 | set_table_entry(&table[4], "newidle_idx", &sd->newidle_idx, |
6280 | sizeof(int), 0644, proc_dointvec_minmax); | 6279 | sizeof(int), 0644, proc_dointvec_minmax); |
6281 | set_table_entry(&table[5], "wake_idx", &sd->wake_idx, | 6280 | set_table_entry(&table[5], "wake_idx", &sd->wake_idx, |
6282 | sizeof(int), 0644, proc_dointvec_minmax); | 6281 | sizeof(int), 0644, proc_dointvec_minmax); |
6283 | set_table_entry(&table[6], "forkexec_idx", &sd->forkexec_idx, | 6282 | set_table_entry(&table[6], "forkexec_idx", &sd->forkexec_idx, |
6284 | sizeof(int), 0644, proc_dointvec_minmax); | 6283 | sizeof(int), 0644, proc_dointvec_minmax); |
6285 | set_table_entry(&table[7], "busy_factor", &sd->busy_factor, | 6284 | set_table_entry(&table[7], "busy_factor", &sd->busy_factor, |
6286 | sizeof(int), 0644, proc_dointvec_minmax); | 6285 | sizeof(int), 0644, proc_dointvec_minmax); |
6287 | set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct, | 6286 | set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct, |
6288 | sizeof(int), 0644, proc_dointvec_minmax); | 6287 | sizeof(int), 0644, proc_dointvec_minmax); |
6289 | set_table_entry(&table[9], "cache_nice_tries", | 6288 | set_table_entry(&table[9], "cache_nice_tries", |
6290 | &sd->cache_nice_tries, | 6289 | &sd->cache_nice_tries, |
6291 | sizeof(int), 0644, proc_dointvec_minmax); | 6290 | sizeof(int), 0644, proc_dointvec_minmax); |
6292 | set_table_entry(&table[10], "flags", &sd->flags, | 6291 | set_table_entry(&table[10], "flags", &sd->flags, |
6293 | sizeof(int), 0644, proc_dointvec_minmax); | 6292 | sizeof(int), 0644, proc_dointvec_minmax); |
6294 | set_table_entry(&table[11], "name", sd->name, | 6293 | set_table_entry(&table[11], "name", sd->name, |
6295 | CORENAME_MAX_SIZE, 0444, proc_dostring); | 6294 | CORENAME_MAX_SIZE, 0444, proc_dostring); |
6296 | /* &table[12] is terminator */ | 6295 | /* &table[12] is terminator */ |
6297 | 6296 | ||
6298 | return table; | 6297 | return table; |
6299 | } | 6298 | } |
6300 | 6299 | ||
6301 | static ctl_table *sd_alloc_ctl_cpu_table(int cpu) | 6300 | static ctl_table *sd_alloc_ctl_cpu_table(int cpu) |
6302 | { | 6301 | { |
6303 | struct ctl_table *entry, *table; | 6302 | struct ctl_table *entry, *table; |
6304 | struct sched_domain *sd; | 6303 | struct sched_domain *sd; |
6305 | int domain_num = 0, i; | 6304 | int domain_num = 0, i; |
6306 | char buf[32]; | 6305 | char buf[32]; |
6307 | 6306 | ||
6308 | for_each_domain(cpu, sd) | 6307 | for_each_domain(cpu, sd) |
6309 | domain_num++; | 6308 | domain_num++; |
6310 | entry = table = sd_alloc_ctl_entry(domain_num + 1); | 6309 | entry = table = sd_alloc_ctl_entry(domain_num + 1); |
6311 | if (table == NULL) | 6310 | if (table == NULL) |
6312 | return NULL; | 6311 | return NULL; |
6313 | 6312 | ||
6314 | i = 0; | 6313 | i = 0; |
6315 | for_each_domain(cpu, sd) { | 6314 | for_each_domain(cpu, sd) { |
6316 | snprintf(buf, 32, "domain%d", i); | 6315 | snprintf(buf, 32, "domain%d", i); |
6317 | entry->procname = kstrdup(buf, GFP_KERNEL); | 6316 | entry->procname = kstrdup(buf, GFP_KERNEL); |
6318 | entry->mode = 0555; | 6317 | entry->mode = 0555; |
6319 | entry->child = sd_alloc_ctl_domain_table(sd); | 6318 | entry->child = sd_alloc_ctl_domain_table(sd); |
6320 | entry++; | 6319 | entry++; |
6321 | i++; | 6320 | i++; |
6322 | } | 6321 | } |
6323 | return table; | 6322 | return table; |
6324 | } | 6323 | } |
6325 | 6324 | ||
6326 | static struct ctl_table_header *sd_sysctl_header; | 6325 | static struct ctl_table_header *sd_sysctl_header; |
6327 | static void register_sched_domain_sysctl(void) | 6326 | static void register_sched_domain_sysctl(void) |
6328 | { | 6327 | { |
6329 | int i, cpu_num = num_possible_cpus(); | 6328 | int i, cpu_num = num_possible_cpus(); |
6330 | struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1); | 6329 | struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1); |
6331 | char buf[32]; | 6330 | char buf[32]; |
6332 | 6331 | ||
6333 | WARN_ON(sd_ctl_dir[0].child); | 6332 | WARN_ON(sd_ctl_dir[0].child); |
6334 | sd_ctl_dir[0].child = entry; | 6333 | sd_ctl_dir[0].child = entry; |
6335 | 6334 | ||
6336 | if (entry == NULL) | 6335 | if (entry == NULL) |
6337 | return; | 6336 | return; |
6338 | 6337 | ||
6339 | for_each_possible_cpu(i) { | 6338 | for_each_possible_cpu(i) { |
6340 | snprintf(buf, 32, "cpu%d", i); | 6339 | snprintf(buf, 32, "cpu%d", i); |
6341 | entry->procname = kstrdup(buf, GFP_KERNEL); | 6340 | entry->procname = kstrdup(buf, GFP_KERNEL); |
6342 | entry->mode = 0555; | 6341 | entry->mode = 0555; |
6343 | entry->child = sd_alloc_ctl_cpu_table(i); | 6342 | entry->child = sd_alloc_ctl_cpu_table(i); |
6344 | entry++; | 6343 | entry++; |
6345 | } | 6344 | } |
6346 | 6345 | ||
6347 | WARN_ON(sd_sysctl_header); | 6346 | WARN_ON(sd_sysctl_header); |
6348 | sd_sysctl_header = register_sysctl_table(sd_ctl_root); | 6347 | sd_sysctl_header = register_sysctl_table(sd_ctl_root); |
6349 | } | 6348 | } |
6350 | 6349 | ||
6351 | /* may be called multiple times per register */ | 6350 | /* may be called multiple times per register */ |
6352 | static void unregister_sched_domain_sysctl(void) | 6351 | static void unregister_sched_domain_sysctl(void) |
6353 | { | 6352 | { |
6354 | if (sd_sysctl_header) | 6353 | if (sd_sysctl_header) |
6355 | unregister_sysctl_table(sd_sysctl_header); | 6354 | unregister_sysctl_table(sd_sysctl_header); |
6356 | sd_sysctl_header = NULL; | 6355 | sd_sysctl_header = NULL; |
6357 | if (sd_ctl_dir[0].child) | 6356 | if (sd_ctl_dir[0].child) |
6358 | sd_free_ctl_entry(&sd_ctl_dir[0].child); | 6357 | sd_free_ctl_entry(&sd_ctl_dir[0].child); |
6359 | } | 6358 | } |
6360 | #else | 6359 | #else |
6361 | static void register_sched_domain_sysctl(void) | 6360 | static void register_sched_domain_sysctl(void) |
6362 | { | 6361 | { |
6363 | } | 6362 | } |
6364 | static void unregister_sched_domain_sysctl(void) | 6363 | static void unregister_sched_domain_sysctl(void) |
6365 | { | 6364 | { |
6366 | } | 6365 | } |
6367 | #endif | 6366 | #endif |
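The sysctl code above follows the ordinary ctl_table conventions: directory nodes such as cpuN/ and domainM/ get mode 0555 and a ->child table, leaf nodes point ->data at the value and name a proc_handler, and every table ends with a zeroed terminator entry. A minimal sketch of the same two-level pattern, with invented demo_* names that are not part of the scheduler:

#include <linux/sysctl.h>
#include <linux/init.h>
#include <linux/errno.h>

static int demo_val;

static struct ctl_table demo_leaf[] = {
	{
		.procname	= "demo_val",
		.data		= &demo_val,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{ }	/* terminator, like &table[12] above */
};

static struct ctl_table demo_dir[] = {
	{
		.procname	= "demo",	/* appears as /proc/sys/demo/ */
		.mode		= 0555,
		.child		= demo_leaf,
	},
	{ }
};

static struct ctl_table_header *demo_header;

static int __init demo_sysctl_init(void)
{
	demo_header = register_sysctl_table(demo_dir);
	return demo_header ? 0 : -ENOMEM;
}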
6368 | 6367 | ||
6369 | static void set_rq_online(struct rq *rq) | 6368 | static void set_rq_online(struct rq *rq) |
6370 | { | 6369 | { |
6371 | if (!rq->online) { | 6370 | if (!rq->online) { |
6372 | const struct sched_class *class; | 6371 | const struct sched_class *class; |
6373 | 6372 | ||
6374 | cpumask_set_cpu(rq->cpu, rq->rd->online); | 6373 | cpumask_set_cpu(rq->cpu, rq->rd->online); |
6375 | rq->online = 1; | 6374 | rq->online = 1; |
6376 | 6375 | ||
6377 | for_each_class(class) { | 6376 | for_each_class(class) { |
6378 | if (class->rq_online) | 6377 | if (class->rq_online) |
6379 | class->rq_online(rq); | 6378 | class->rq_online(rq); |
6380 | } | 6379 | } |
6381 | } | 6380 | } |
6382 | } | 6381 | } |
6383 | 6382 | ||
6384 | static void set_rq_offline(struct rq *rq) | 6383 | static void set_rq_offline(struct rq *rq) |
6385 | { | 6384 | { |
6386 | if (rq->online) { | 6385 | if (rq->online) { |
6387 | const struct sched_class *class; | 6386 | const struct sched_class *class; |
6388 | 6387 | ||
6389 | for_each_class(class) { | 6388 | for_each_class(class) { |
6390 | if (class->rq_offline) | 6389 | if (class->rq_offline) |
6391 | class->rq_offline(rq); | 6390 | class->rq_offline(rq); |
6392 | } | 6391 | } |
6393 | 6392 | ||
6394 | cpumask_clear_cpu(rq->cpu, rq->rd->online); | 6393 | cpumask_clear_cpu(rq->cpu, rq->rd->online); |
6395 | rq->online = 0; | 6394 | rq->online = 0; |
6396 | } | 6395 | } |
6397 | } | 6396 | } |
6398 | 6397 | ||
6399 | /* | 6398 | /* |
6400 | * migration_call - callback that gets triggered when a CPU is added. | 6399 | * migration_call - callback that gets triggered when a CPU is added. |
6401 | * Here we can start up the necessary migration thread for the new CPU. | 6400 | * Here we can start up the necessary migration thread for the new CPU. |
6402 | */ | 6401 | */ |
6403 | static int __cpuinit | 6402 | static int __cpuinit |
6404 | migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) | 6403 | migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) |
6405 | { | 6404 | { |
6406 | int cpu = (long)hcpu; | 6405 | int cpu = (long)hcpu; |
6407 | unsigned long flags; | 6406 | unsigned long flags; |
6408 | struct rq *rq = cpu_rq(cpu); | 6407 | struct rq *rq = cpu_rq(cpu); |
6409 | 6408 | ||
6410 | switch (action & ~CPU_TASKS_FROZEN) { | 6409 | switch (action & ~CPU_TASKS_FROZEN) { |
6411 | 6410 | ||
6412 | case CPU_UP_PREPARE: | 6411 | case CPU_UP_PREPARE: |
6413 | rq->calc_load_update = calc_load_update; | 6412 | rq->calc_load_update = calc_load_update; |
6414 | break; | 6413 | break; |
6415 | 6414 | ||
6416 | case CPU_ONLINE: | 6415 | case CPU_ONLINE: |
6417 | /* Update our root-domain */ | 6416 | /* Update our root-domain */ |
6418 | raw_spin_lock_irqsave(&rq->lock, flags); | 6417 | raw_spin_lock_irqsave(&rq->lock, flags); |
6419 | if (rq->rd) { | 6418 | if (rq->rd) { |
6420 | BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); | 6419 | BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); |
6421 | 6420 | ||
6422 | set_rq_online(rq); | 6421 | set_rq_online(rq); |
6423 | } | 6422 | } |
6424 | raw_spin_unlock_irqrestore(&rq->lock, flags); | 6423 | raw_spin_unlock_irqrestore(&rq->lock, flags); |
6425 | break; | 6424 | break; |
6426 | 6425 | ||
6427 | #ifdef CONFIG_HOTPLUG_CPU | 6426 | #ifdef CONFIG_HOTPLUG_CPU |
6428 | case CPU_DYING: | 6427 | case CPU_DYING: |
6429 | sched_ttwu_pending(); | 6428 | sched_ttwu_pending(); |
6430 | /* Update our root-domain */ | 6429 | /* Update our root-domain */ |
6431 | raw_spin_lock_irqsave(&rq->lock, flags); | 6430 | raw_spin_lock_irqsave(&rq->lock, flags); |
6432 | if (rq->rd) { | 6431 | if (rq->rd) { |
6433 | BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); | 6432 | BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); |
6434 | set_rq_offline(rq); | 6433 | set_rq_offline(rq); |
6435 | } | 6434 | } |
6436 | migrate_tasks(cpu); | 6435 | migrate_tasks(cpu); |
6437 | BUG_ON(rq->nr_running != 1); /* the migration thread */ | 6436 | BUG_ON(rq->nr_running != 1); /* the migration thread */ |
6438 | raw_spin_unlock_irqrestore(&rq->lock, flags); | 6437 | raw_spin_unlock_irqrestore(&rq->lock, flags); |
6439 | 6438 | ||
6440 | migrate_nr_uninterruptible(rq); | 6439 | migrate_nr_uninterruptible(rq); |
6441 | calc_global_load_remove(rq); | 6440 | calc_global_load_remove(rq); |
6442 | break; | 6441 | break; |
6443 | #endif | 6442 | #endif |
6444 | } | 6443 | } |
6445 | 6444 | ||
6446 | update_max_interval(); | 6445 | update_max_interval(); |
6447 | 6446 | ||
6448 | return NOTIFY_OK; | 6447 | return NOTIFY_OK; |
6449 | } | 6448 | } |
6450 | 6449 | ||
6451 | /* | 6450 | /* |
6452 | * Register at high priority so that task migration (migrate_all_tasks) | 6451 | * Register at high priority so that task migration (migrate_all_tasks) |
6453 | * happens before everything else. This has to be lower priority than | 6452 | * happens before everything else. This has to be lower priority than |
6454 | * the notifier in the perf_event subsystem, though. | 6453 | * the notifier in the perf_event subsystem, though. |
6455 | */ | 6454 | */ |
6456 | static struct notifier_block __cpuinitdata migration_notifier = { | 6455 | static struct notifier_block __cpuinitdata migration_notifier = { |
6457 | .notifier_call = migration_call, | 6456 | .notifier_call = migration_call, |
6458 | .priority = CPU_PRI_MIGRATION, | 6457 | .priority = CPU_PRI_MIGRATION, |
6459 | }; | 6458 | }; |
6460 | 6459 | ||
6461 | static int __cpuinit sched_cpu_active(struct notifier_block *nfb, | 6460 | static int __cpuinit sched_cpu_active(struct notifier_block *nfb, |
6462 | unsigned long action, void *hcpu) | 6461 | unsigned long action, void *hcpu) |
6463 | { | 6462 | { |
6464 | switch (action & ~CPU_TASKS_FROZEN) { | 6463 | switch (action & ~CPU_TASKS_FROZEN) { |
6465 | case CPU_ONLINE: | 6464 | case CPU_ONLINE: |
6466 | case CPU_DOWN_FAILED: | 6465 | case CPU_DOWN_FAILED: |
6467 | set_cpu_active((long)hcpu, true); | 6466 | set_cpu_active((long)hcpu, true); |
6468 | return NOTIFY_OK; | 6467 | return NOTIFY_OK; |
6469 | default: | 6468 | default: |
6470 | return NOTIFY_DONE; | 6469 | return NOTIFY_DONE; |
6471 | } | 6470 | } |
6472 | } | 6471 | } |
6473 | 6472 | ||
6474 | static int __cpuinit sched_cpu_inactive(struct notifier_block *nfb, | 6473 | static int __cpuinit sched_cpu_inactive(struct notifier_block *nfb, |
6475 | unsigned long action, void *hcpu) | 6474 | unsigned long action, void *hcpu) |
6476 | { | 6475 | { |
6477 | switch (action & ~CPU_TASKS_FROZEN) { | 6476 | switch (action & ~CPU_TASKS_FROZEN) { |
6478 | case CPU_DOWN_PREPARE: | 6477 | case CPU_DOWN_PREPARE: |
6479 | set_cpu_active((long)hcpu, false); | 6478 | set_cpu_active((long)hcpu, false); |
6480 | return NOTIFY_OK; | 6479 | return NOTIFY_OK; |
6481 | default: | 6480 | default: |
6482 | return NOTIFY_DONE; | 6481 | return NOTIFY_DONE; |
6483 | } | 6482 | } |
6484 | } | 6483 | } |
6485 | 6484 | ||
6486 | static int __init migration_init(void) | 6485 | static int __init migration_init(void) |
6487 | { | 6486 | { |
6488 | void *cpu = (void *)(long)smp_processor_id(); | 6487 | void *cpu = (void *)(long)smp_processor_id(); |
6489 | int err; | 6488 | int err; |
6490 | 6489 | ||
6491 | /* Initialize migration for the boot CPU */ | 6490 | /* Initialize migration for the boot CPU */ |
6492 | err = migration_call(&migration_notifier, CPU_UP_PREPARE, cpu); | 6491 | err = migration_call(&migration_notifier, CPU_UP_PREPARE, cpu); |
6493 | BUG_ON(err == NOTIFY_BAD); | 6492 | BUG_ON(err == NOTIFY_BAD); |
6494 | migration_call(&migration_notifier, CPU_ONLINE, cpu); | 6493 | migration_call(&migration_notifier, CPU_ONLINE, cpu); |
6495 | register_cpu_notifier(&migration_notifier); | 6494 | register_cpu_notifier(&migration_notifier); |
6496 | 6495 | ||
6497 | /* Register cpu active notifiers */ | 6496 | /* Register cpu active notifiers */ |
6498 | cpu_notifier(sched_cpu_active, CPU_PRI_SCHED_ACTIVE); | 6497 | cpu_notifier(sched_cpu_active, CPU_PRI_SCHED_ACTIVE); |
6499 | cpu_notifier(sched_cpu_inactive, CPU_PRI_SCHED_INACTIVE); | 6498 | cpu_notifier(sched_cpu_inactive, CPU_PRI_SCHED_INACTIVE); |
6500 | 6499 | ||
6501 | return 0; | 6500 | return 0; |
6502 | } | 6501 | } |
6503 | early_initcall(migration_init); | 6502 | early_initcall(migration_init); |
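migration_init() pushes the boot CPU through the callback by hand, since that CPU came up before the notifier could be registered, and only then registers the notifier for future hotplug events; the .priority field orders it ahead of ordinary callbacks. For comparison, a minimal hotplug notifier of this style looks roughly like the sketch below; the demo_* names are invented:

#include <linux/cpu.h>
#include <linux/notifier.h>
#include <linux/kernel.h>
#include <linux/init.h>

static int __cpuinit demo_cpu_callback(struct notifier_block *nfb,
				       unsigned long action, void *hcpu)
{
	long cpu = (long)hcpu;

	switch (action & ~CPU_TASKS_FROZEN) {
	case CPU_UP_PREPARE:
		/* allocate per-cpu state before the CPU starts running */
		break;
	case CPU_ONLINE:
		pr_info("demo: cpu%ld is online\n", cpu);
		break;
	case CPU_DOWN_PREPARE:
		/* quiesce users of the CPU before it goes away */
		break;
	}
	return NOTIFY_OK;
}

static struct notifier_block __cpuinitdata demo_cpu_notifier = {
	.notifier_call	= demo_cpu_callback,
	/* default priority: runs after the CPU_PRI_MIGRATION notifier above */
};

static int __init demo_hotplug_init(void)
{
	register_cpu_notifier(&demo_cpu_notifier);
	return 0;
}
early_initcall(demo_hotplug_init);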
6504 | #endif | 6503 | #endif |
6505 | 6504 | ||
6506 | #ifdef CONFIG_SMP | 6505 | #ifdef CONFIG_SMP |
6507 | 6506 | ||
6508 | static cpumask_var_t sched_domains_tmpmask; /* sched_domains_mutex */ | 6507 | static cpumask_var_t sched_domains_tmpmask; /* sched_domains_mutex */ |
6509 | 6508 | ||
6510 | #ifdef CONFIG_SCHED_DEBUG | 6509 | #ifdef CONFIG_SCHED_DEBUG |
6511 | 6510 | ||
6512 | static __read_mostly int sched_domain_debug_enabled; | 6511 | static __read_mostly int sched_domain_debug_enabled; |
6513 | 6512 | ||
6514 | static int __init sched_domain_debug_setup(char *str) | 6513 | static int __init sched_domain_debug_setup(char *str) |
6515 | { | 6514 | { |
6516 | sched_domain_debug_enabled = 1; | 6515 | sched_domain_debug_enabled = 1; |
6517 | 6516 | ||
6518 | return 0; | 6517 | return 0; |
6519 | } | 6518 | } |
6520 | early_param("sched_debug", sched_domain_debug_setup); | 6519 | early_param("sched_debug", sched_domain_debug_setup); |
6521 | 6520 | ||
6522 | static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, | 6521 | static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, |
6523 | struct cpumask *groupmask) | 6522 | struct cpumask *groupmask) |
6524 | { | 6523 | { |
6525 | struct sched_group *group = sd->groups; | 6524 | struct sched_group *group = sd->groups; |
6526 | char str[256]; | 6525 | char str[256]; |
6527 | 6526 | ||
6528 | cpulist_scnprintf(str, sizeof(str), sched_domain_span(sd)); | 6527 | cpulist_scnprintf(str, sizeof(str), sched_domain_span(sd)); |
6529 | cpumask_clear(groupmask); | 6528 | cpumask_clear(groupmask); |
6530 | 6529 | ||
6531 | printk(KERN_DEBUG "%*s domain %d: ", level, "", level); | 6530 | printk(KERN_DEBUG "%*s domain %d: ", level, "", level); |
6532 | 6531 | ||
6533 | if (!(sd->flags & SD_LOAD_BALANCE)) { | 6532 | if (!(sd->flags & SD_LOAD_BALANCE)) { |
6534 | printk("does not load-balance\n"); | 6533 | printk("does not load-balance\n"); |
6535 | if (sd->parent) | 6534 | if (sd->parent) |
6536 | printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain" | 6535 | printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain" |
6537 | " has parent"); | 6536 | " has parent"); |
6538 | return -1; | 6537 | return -1; |
6539 | } | 6538 | } |
6540 | 6539 | ||
6541 | printk(KERN_CONT "span %s level %s\n", str, sd->name); | 6540 | printk(KERN_CONT "span %s level %s\n", str, sd->name); |
6542 | 6541 | ||
6543 | if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) { | 6542 | if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) { |
6544 | printk(KERN_ERR "ERROR: domain->span does not contain " | 6543 | printk(KERN_ERR "ERROR: domain->span does not contain " |
6545 | "CPU%d\n", cpu); | 6544 | "CPU%d\n", cpu); |
6546 | } | 6545 | } |
6547 | if (!cpumask_test_cpu(cpu, sched_group_cpus(group))) { | 6546 | if (!cpumask_test_cpu(cpu, sched_group_cpus(group))) { |
6548 | printk(KERN_ERR "ERROR: domain->groups does not contain" | 6547 | printk(KERN_ERR "ERROR: domain->groups does not contain" |
6549 | " CPU%d\n", cpu); | 6548 | " CPU%d\n", cpu); |
6550 | } | 6549 | } |
6551 | 6550 | ||
6552 | printk(KERN_DEBUG "%*s groups:", level + 1, ""); | 6551 | printk(KERN_DEBUG "%*s groups:", level + 1, ""); |
6553 | do { | 6552 | do { |
6554 | if (!group) { | 6553 | if (!group) { |
6555 | printk("\n"); | 6554 | printk("\n"); |
6556 | printk(KERN_ERR "ERROR: group is NULL\n"); | 6555 | printk(KERN_ERR "ERROR: group is NULL\n"); |
6557 | break; | 6556 | break; |
6558 | } | 6557 | } |
6559 | 6558 | ||
6560 | if (!group->cpu_power) { | 6559 | if (!group->cpu_power) { |
6561 | printk(KERN_CONT "\n"); | 6560 | printk(KERN_CONT "\n"); |
6562 | printk(KERN_ERR "ERROR: domain->cpu_power not " | 6561 | printk(KERN_ERR "ERROR: domain->cpu_power not " |
6563 | "set\n"); | 6562 | "set\n"); |
6564 | break; | 6563 | break; |
6565 | } | 6564 | } |
6566 | 6565 | ||
6567 | if (!cpumask_weight(sched_group_cpus(group))) { | 6566 | if (!cpumask_weight(sched_group_cpus(group))) { |
6568 | printk(KERN_CONT "\n"); | 6567 | printk(KERN_CONT "\n"); |
6569 | printk(KERN_ERR "ERROR: empty group\n"); | 6568 | printk(KERN_ERR "ERROR: empty group\n"); |
6570 | break; | 6569 | break; |
6571 | } | 6570 | } |
6572 | 6571 | ||
6573 | if (cpumask_intersects(groupmask, sched_group_cpus(group))) { | 6572 | if (cpumask_intersects(groupmask, sched_group_cpus(group))) { |
6574 | printk(KERN_CONT "\n"); | 6573 | printk(KERN_CONT "\n"); |
6575 | printk(KERN_ERR "ERROR: repeated CPUs\n"); | 6574 | printk(KERN_ERR "ERROR: repeated CPUs\n"); |
6576 | break; | 6575 | break; |
6577 | } | 6576 | } |
6578 | 6577 | ||
6579 | cpumask_or(groupmask, groupmask, sched_group_cpus(group)); | 6578 | cpumask_or(groupmask, groupmask, sched_group_cpus(group)); |
6580 | 6579 | ||
6581 | cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group)); | 6580 | cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group)); |
6582 | 6581 | ||
6583 | printk(KERN_CONT " %s", str); | 6582 | printk(KERN_CONT " %s", str); |
6584 | if (group->cpu_power != SCHED_POWER_SCALE) { | 6583 | if (group->cpu_power != SCHED_POWER_SCALE) { |
6585 | printk(KERN_CONT " (cpu_power = %d)", | 6584 | printk(KERN_CONT " (cpu_power = %d)", |
6586 | group->cpu_power); | 6585 | group->cpu_power); |
6587 | } | 6586 | } |
6588 | 6587 | ||
6589 | group = group->next; | 6588 | group = group->next; |
6590 | } while (group != sd->groups); | 6589 | } while (group != sd->groups); |
6591 | printk(KERN_CONT "\n"); | 6590 | printk(KERN_CONT "\n"); |
6592 | 6591 | ||
6593 | if (!cpumask_equal(sched_domain_span(sd), groupmask)) | 6592 | if (!cpumask_equal(sched_domain_span(sd), groupmask)) |
6594 | printk(KERN_ERR "ERROR: groups don't span domain->span\n"); | 6593 | printk(KERN_ERR "ERROR: groups don't span domain->span\n"); |
6595 | 6594 | ||
6596 | if (sd->parent && | 6595 | if (sd->parent && |
6597 | !cpumask_subset(groupmask, sched_domain_span(sd->parent))) | 6596 | !cpumask_subset(groupmask, sched_domain_span(sd->parent))) |
6598 | printk(KERN_ERR "ERROR: parent span is not a superset " | 6597 | printk(KERN_ERR "ERROR: parent span is not a superset " |
6599 | "of domain->span\n"); | 6598 | "of domain->span\n"); |
6600 | return 0; | 6599 | return 0; |
6601 | } | 6600 | } |
6602 | 6601 | ||
6603 | static void sched_domain_debug(struct sched_domain *sd, int cpu) | 6602 | static void sched_domain_debug(struct sched_domain *sd, int cpu) |
6604 | { | 6603 | { |
6605 | int level = 0; | 6604 | int level = 0; |
6606 | 6605 | ||
6607 | if (!sched_domain_debug_enabled) | 6606 | if (!sched_domain_debug_enabled) |
6608 | return; | 6607 | return; |
6609 | 6608 | ||
6610 | if (!sd) { | 6609 | if (!sd) { |
6611 | printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu); | 6610 | printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu); |
6612 | return; | 6611 | return; |
6613 | } | 6612 | } |
6614 | 6613 | ||
6615 | printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu); | 6614 | printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu); |
6616 | 6615 | ||
6617 | for (;;) { | 6616 | for (;;) { |
6618 | if (sched_domain_debug_one(sd, cpu, level, sched_domains_tmpmask)) | 6617 | if (sched_domain_debug_one(sd, cpu, level, sched_domains_tmpmask)) |
6619 | break; | 6618 | break; |
6620 | level++; | 6619 | level++; |
6621 | sd = sd->parent; | 6620 | sd = sd->parent; |
6622 | if (!sd) | 6621 | if (!sd) |
6623 | break; | 6622 | break; |
6624 | } | 6623 | } |
6625 | } | 6624 | } |
6626 | #else /* !CONFIG_SCHED_DEBUG */ | 6625 | #else /* !CONFIG_SCHED_DEBUG */ |
6627 | # define sched_domain_debug(sd, cpu) do { } while (0) | 6626 | # define sched_domain_debug(sd, cpu) do { } while (0) |
6628 | #endif /* CONFIG_SCHED_DEBUG */ | 6627 | #endif /* CONFIG_SCHED_DEBUG */ |
6629 | 6628 | ||
6630 | static int sd_degenerate(struct sched_domain *sd) | 6629 | static int sd_degenerate(struct sched_domain *sd) |
6631 | { | 6630 | { |
6632 | if (cpumask_weight(sched_domain_span(sd)) == 1) | 6631 | if (cpumask_weight(sched_domain_span(sd)) == 1) |
6633 | return 1; | 6632 | return 1; |
6634 | 6633 | ||
6635 | /* Following flags need at least 2 groups */ | 6634 | /* Following flags need at least 2 groups */ |
6636 | if (sd->flags & (SD_LOAD_BALANCE | | 6635 | if (sd->flags & (SD_LOAD_BALANCE | |
6637 | SD_BALANCE_NEWIDLE | | 6636 | SD_BALANCE_NEWIDLE | |
6638 | SD_BALANCE_FORK | | 6637 | SD_BALANCE_FORK | |
6639 | SD_BALANCE_EXEC | | 6638 | SD_BALANCE_EXEC | |
6640 | SD_SHARE_CPUPOWER | | 6639 | SD_SHARE_CPUPOWER | |
6641 | SD_SHARE_PKG_RESOURCES)) { | 6640 | SD_SHARE_PKG_RESOURCES)) { |
6642 | if (sd->groups != sd->groups->next) | 6641 | if (sd->groups != sd->groups->next) |
6643 | return 0; | 6642 | return 0; |
6644 | } | 6643 | } |
6645 | 6644 | ||
6646 | /* Following flags don't use groups */ | 6645 | /* Following flags don't use groups */ |
6647 | if (sd->flags & (SD_WAKE_AFFINE)) | 6646 | if (sd->flags & (SD_WAKE_AFFINE)) |
6648 | return 0; | 6647 | return 0; |
6649 | 6648 | ||
6650 | return 1; | 6649 | return 1; |
6651 | } | 6650 | } |
6652 | 6651 | ||
6653 | static int | 6652 | static int |
6654 | sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) | 6653 | sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) |
6655 | { | 6654 | { |
6656 | unsigned long cflags = sd->flags, pflags = parent->flags; | 6655 | unsigned long cflags = sd->flags, pflags = parent->flags; |
6657 | 6656 | ||
6658 | if (sd_degenerate(parent)) | 6657 | if (sd_degenerate(parent)) |
6659 | return 1; | 6658 | return 1; |
6660 | 6659 | ||
6661 | if (!cpumask_equal(sched_domain_span(sd), sched_domain_span(parent))) | 6660 | if (!cpumask_equal(sched_domain_span(sd), sched_domain_span(parent))) |
6662 | return 0; | 6661 | return 0; |
6663 | 6662 | ||
6664 | /* Flags needing groups don't count if only 1 group in parent */ | 6663 | /* Flags needing groups don't count if only 1 group in parent */ |
6665 | if (parent->groups == parent->groups->next) { | 6664 | if (parent->groups == parent->groups->next) { |
6666 | pflags &= ~(SD_LOAD_BALANCE | | 6665 | pflags &= ~(SD_LOAD_BALANCE | |
6667 | SD_BALANCE_NEWIDLE | | 6666 | SD_BALANCE_NEWIDLE | |
6668 | SD_BALANCE_FORK | | 6667 | SD_BALANCE_FORK | |
6669 | SD_BALANCE_EXEC | | 6668 | SD_BALANCE_EXEC | |
6670 | SD_SHARE_CPUPOWER | | 6669 | SD_SHARE_CPUPOWER | |
6671 | SD_SHARE_PKG_RESOURCES); | 6670 | SD_SHARE_PKG_RESOURCES); |
6672 | if (nr_node_ids == 1) | 6671 | if (nr_node_ids == 1) |
6673 | pflags &= ~SD_SERIALIZE; | 6672 | pflags &= ~SD_SERIALIZE; |
6674 | } | 6673 | } |
6675 | if (~cflags & pflags) | 6674 | if (~cflags & pflags) |
6676 | return 0; | 6675 | return 0; |
6677 | 6676 | ||
6678 | return 1; | 6677 | return 1; |
6679 | } | 6678 | } |
6680 | 6679 | ||
6681 | static void free_rootdomain(struct rcu_head *rcu) | 6680 | static void free_rootdomain(struct rcu_head *rcu) |
6682 | { | 6681 | { |
6683 | struct root_domain *rd = container_of(rcu, struct root_domain, rcu); | 6682 | struct root_domain *rd = container_of(rcu, struct root_domain, rcu); |
6684 | 6683 | ||
6685 | cpupri_cleanup(&rd->cpupri); | 6684 | cpupri_cleanup(&rd->cpupri); |
6686 | free_cpumask_var(rd->rto_mask); | 6685 | free_cpumask_var(rd->rto_mask); |
6687 | free_cpumask_var(rd->online); | 6686 | free_cpumask_var(rd->online); |
6688 | free_cpumask_var(rd->span); | 6687 | free_cpumask_var(rd->span); |
6689 | kfree(rd); | 6688 | kfree(rd); |
6690 | } | 6689 | } |
6691 | 6690 | ||
6692 | static void rq_attach_root(struct rq *rq, struct root_domain *rd) | 6691 | static void rq_attach_root(struct rq *rq, struct root_domain *rd) |
6693 | { | 6692 | { |
6694 | struct root_domain *old_rd = NULL; | 6693 | struct root_domain *old_rd = NULL; |
6695 | unsigned long flags; | 6694 | unsigned long flags; |
6696 | 6695 | ||
6697 | raw_spin_lock_irqsave(&rq->lock, flags); | 6696 | raw_spin_lock_irqsave(&rq->lock, flags); |
6698 | 6697 | ||
6699 | if (rq->rd) { | 6698 | if (rq->rd) { |
6700 | old_rd = rq->rd; | 6699 | old_rd = rq->rd; |
6701 | 6700 | ||
6702 | if (cpumask_test_cpu(rq->cpu, old_rd->online)) | 6701 | if (cpumask_test_cpu(rq->cpu, old_rd->online)) |
6703 | set_rq_offline(rq); | 6702 | set_rq_offline(rq); |
6704 | 6703 | ||
6705 | cpumask_clear_cpu(rq->cpu, old_rd->span); | 6704 | cpumask_clear_cpu(rq->cpu, old_rd->span); |
6706 | 6705 | ||
6707 | /* | 6706 | /* |
6708 | * If we don't want to free the old_rd yet then | 6707 | * If we don't want to free the old_rd yet then |
6709 | * set old_rd to NULL to skip the freeing later | 6708 | * set old_rd to NULL to skip the freeing later |
6710 | * in this function: | 6709 | * in this function: |
6711 | */ | 6710 | */ |
6712 | if (!atomic_dec_and_test(&old_rd->refcount)) | 6711 | if (!atomic_dec_and_test(&old_rd->refcount)) |
6713 | old_rd = NULL; | 6712 | old_rd = NULL; |
6714 | } | 6713 | } |
6715 | 6714 | ||
6716 | atomic_inc(&rd->refcount); | 6715 | atomic_inc(&rd->refcount); |
6717 | rq->rd = rd; | 6716 | rq->rd = rd; |
6718 | 6717 | ||
6719 | cpumask_set_cpu(rq->cpu, rd->span); | 6718 | cpumask_set_cpu(rq->cpu, rd->span); |
6720 | if (cpumask_test_cpu(rq->cpu, cpu_active_mask)) | 6719 | if (cpumask_test_cpu(rq->cpu, cpu_active_mask)) |
6721 | set_rq_online(rq); | 6720 | set_rq_online(rq); |
6722 | 6721 | ||
6723 | raw_spin_unlock_irqrestore(&rq->lock, flags); | 6722 | raw_spin_unlock_irqrestore(&rq->lock, flags); |
6724 | 6723 | ||
6725 | if (old_rd) | 6724 | if (old_rd) |
6726 | call_rcu_sched(&old_rd->rcu, free_rootdomain); | 6725 | call_rcu_sched(&old_rd->rcu, free_rootdomain); |
6727 | } | 6726 | } |
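When the old root_domain's refcount drops to zero, rq_attach_root() does not free it immediately; it hands the embedded rcu_head to call_rcu_sched(), and free_rootdomain() later recovers the enclosing structure with container_of(). The general shape of that deferred-free pattern, with an invented demo_obj type:

#include <linux/rcupdate.h>
#include <linux/slab.h>

struct demo_obj {
	int data;
	struct rcu_head rcu;	/* embedded callback head */
};

static void demo_obj_free(struct rcu_head *rcu)
{
	/* recover the enclosing object, as free_rootdomain() does above */
	struct demo_obj *obj = container_of(rcu, struct demo_obj, rcu);

	kfree(obj);
}

static void demo_obj_release(struct demo_obj *obj)
{
	/* defer the kfree() until a sched-RCU grace period has elapsed */
	call_rcu_sched(&obj->rcu, demo_obj_free);
}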
6728 | 6727 | ||
6729 | static int init_rootdomain(struct root_domain *rd) | 6728 | static int init_rootdomain(struct root_domain *rd) |
6730 | { | 6729 | { |
6731 | memset(rd, 0, sizeof(*rd)); | 6730 | memset(rd, 0, sizeof(*rd)); |
6732 | 6731 | ||
6733 | if (!alloc_cpumask_var(&rd->span, GFP_KERNEL)) | 6732 | if (!alloc_cpumask_var(&rd->span, GFP_KERNEL)) |
6734 | goto out; | 6733 | goto out; |
6735 | if (!alloc_cpumask_var(&rd->online, GFP_KERNEL)) | 6734 | if (!alloc_cpumask_var(&rd->online, GFP_KERNEL)) |
6736 | goto free_span; | 6735 | goto free_span; |
6737 | if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL)) | 6736 | if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL)) |
6738 | goto free_online; | 6737 | goto free_online; |
6739 | 6738 | ||
6740 | if (cpupri_init(&rd->cpupri) != 0) | 6739 | if (cpupri_init(&rd->cpupri) != 0) |
6741 | goto free_rto_mask; | 6740 | goto free_rto_mask; |
6742 | return 0; | 6741 | return 0; |
6743 | 6742 | ||
6744 | free_rto_mask: | 6743 | free_rto_mask: |
6745 | free_cpumask_var(rd->rto_mask); | 6744 | free_cpumask_var(rd->rto_mask); |
6746 | free_online: | 6745 | free_online: |
6747 | free_cpumask_var(rd->online); | 6746 | free_cpumask_var(rd->online); |
6748 | free_span: | 6747 | free_span: |
6749 | free_cpumask_var(rd->span); | 6748 | free_cpumask_var(rd->span); |
6750 | out: | 6749 | out: |
6751 | return -ENOMEM; | 6750 | return -ENOMEM; |
6752 | } | 6751 | } |
6753 | 6752 | ||
6754 | static void init_defrootdomain(void) | 6753 | static void init_defrootdomain(void) |
6755 | { | 6754 | { |
6756 | init_rootdomain(&def_root_domain); | 6755 | init_rootdomain(&def_root_domain); |
6757 | 6756 | ||
6758 | atomic_set(&def_root_domain.refcount, 1); | 6757 | atomic_set(&def_root_domain.refcount, 1); |
6759 | } | 6758 | } |
6760 | 6759 | ||
6761 | static struct root_domain *alloc_rootdomain(void) | 6760 | static struct root_domain *alloc_rootdomain(void) |
6762 | { | 6761 | { |
6763 | struct root_domain *rd; | 6762 | struct root_domain *rd; |
6764 | 6763 | ||
6765 | rd = kmalloc(sizeof(*rd), GFP_KERNEL); | 6764 | rd = kmalloc(sizeof(*rd), GFP_KERNEL); |
6766 | if (!rd) | 6765 | if (!rd) |
6767 | return NULL; | 6766 | return NULL; |
6768 | 6767 | ||
6769 | if (init_rootdomain(rd) != 0) { | 6768 | if (init_rootdomain(rd) != 0) { |
6770 | kfree(rd); | 6769 | kfree(rd); |
6771 | return NULL; | 6770 | return NULL; |
6772 | } | 6771 | } |
6773 | 6772 | ||
6774 | return rd; | 6773 | return rd; |
6775 | } | 6774 | } |
6776 | 6775 | ||
6777 | static void free_sched_domain(struct rcu_head *rcu) | 6776 | static void free_sched_domain(struct rcu_head *rcu) |
6778 | { | 6777 | { |
6779 | struct sched_domain *sd = container_of(rcu, struct sched_domain, rcu); | 6778 | struct sched_domain *sd = container_of(rcu, struct sched_domain, rcu); |
6780 | if (atomic_dec_and_test(&sd->groups->ref)) | 6779 | if (atomic_dec_and_test(&sd->groups->ref)) |
6781 | kfree(sd->groups); | 6780 | kfree(sd->groups); |
6782 | kfree(sd); | 6781 | kfree(sd); |
6783 | } | 6782 | } |
6784 | 6783 | ||
6785 | static void destroy_sched_domain(struct sched_domain *sd, int cpu) | 6784 | static void destroy_sched_domain(struct sched_domain *sd, int cpu) |
6786 | { | 6785 | { |
6787 | call_rcu(&sd->rcu, free_sched_domain); | 6786 | call_rcu(&sd->rcu, free_sched_domain); |
6788 | } | 6787 | } |
6789 | 6788 | ||
6790 | static void destroy_sched_domains(struct sched_domain *sd, int cpu) | 6789 | static void destroy_sched_domains(struct sched_domain *sd, int cpu) |
6791 | { | 6790 | { |
6792 | for (; sd; sd = sd->parent) | 6791 | for (; sd; sd = sd->parent) |
6793 | destroy_sched_domain(sd, cpu); | 6792 | destroy_sched_domain(sd, cpu); |
6794 | } | 6793 | } |
6795 | 6794 | ||
6796 | /* | 6795 | /* |
6797 | * Attach the domain 'sd' to 'cpu' as its base domain. Callers must | 6796 | * Attach the domain 'sd' to 'cpu' as its base domain. Callers must |
6798 | * hold the hotplug lock. | 6797 | * hold the hotplug lock. |
6799 | */ | 6798 | */ |
6800 | static void | 6799 | static void |
6801 | cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) | 6800 | cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) |
6802 | { | 6801 | { |
6803 | struct rq *rq = cpu_rq(cpu); | 6802 | struct rq *rq = cpu_rq(cpu); |
6804 | struct sched_domain *tmp; | 6803 | struct sched_domain *tmp; |
6805 | 6804 | ||
6806 | /* Remove the sched domains which do not contribute to scheduling. */ | 6805 | /* Remove the sched domains which do not contribute to scheduling. */ |
6807 | for (tmp = sd; tmp; ) { | 6806 | for (tmp = sd; tmp; ) { |
6808 | struct sched_domain *parent = tmp->parent; | 6807 | struct sched_domain *parent = tmp->parent; |
6809 | if (!parent) | 6808 | if (!parent) |
6810 | break; | 6809 | break; |
6811 | 6810 | ||
6812 | if (sd_parent_degenerate(tmp, parent)) { | 6811 | if (sd_parent_degenerate(tmp, parent)) { |
6813 | tmp->parent = parent->parent; | 6812 | tmp->parent = parent->parent; |
6814 | if (parent->parent) | 6813 | if (parent->parent) |
6815 | parent->parent->child = tmp; | 6814 | parent->parent->child = tmp; |
6816 | destroy_sched_domain(parent, cpu); | 6815 | destroy_sched_domain(parent, cpu); |
6817 | } else | 6816 | } else |
6818 | tmp = tmp->parent; | 6817 | tmp = tmp->parent; |
6819 | } | 6818 | } |
6820 | 6819 | ||
6821 | if (sd && sd_degenerate(sd)) { | 6820 | if (sd && sd_degenerate(sd)) { |
6822 | tmp = sd; | 6821 | tmp = sd; |
6823 | sd = sd->parent; | 6822 | sd = sd->parent; |
6824 | destroy_sched_domain(tmp, cpu); | 6823 | destroy_sched_domain(tmp, cpu); |
6825 | if (sd) | 6824 | if (sd) |
6826 | sd->child = NULL; | 6825 | sd->child = NULL; |
6827 | } | 6826 | } |
6828 | 6827 | ||
6829 | sched_domain_debug(sd, cpu); | 6828 | sched_domain_debug(sd, cpu); |
6830 | 6829 | ||
6831 | rq_attach_root(rq, rd); | 6830 | rq_attach_root(rq, rd); |
6832 | tmp = rq->sd; | 6831 | tmp = rq->sd; |
6833 | rcu_assign_pointer(rq->sd, sd); | 6832 | rcu_assign_pointer(rq->sd, sd); |
6834 | destroy_sched_domains(tmp, cpu); | 6833 | destroy_sched_domains(tmp, cpu); |
6835 | } | 6834 | } |
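cpu_attach_domain() publishes the new domain hierarchy with rcu_assign_pointer(rq->sd, sd) before queueing the old tree for RCU-deferred destruction, so lock-free readers of rq->sd see either the old tree or the new one, never freed memory. A generic sketch of that publish/read pairing, using invented demo_* names; the rcu_dereference_check() condition is only an example of allowing update-side callers that hold a lock:

#include <linux/rcupdate.h>
#include <linux/spinlock.h>

struct demo_cfg {
	int value;
};

static struct demo_cfg __rcu *demo_cfg_ptr;
static DEFINE_SPINLOCK(demo_lock);

static void demo_update(struct demo_cfg *new_cfg)
{
	spin_lock(&demo_lock);
	rcu_assign_pointer(demo_cfg_ptr, new_cfg);	/* publish after init */
	spin_unlock(&demo_lock);
}

static int demo_read(void)
{
	struct demo_cfg *cfg;
	int val = 0;

	rcu_read_lock();
	/* legal under rcu_read_lock() or with demo_lock held */
	cfg = rcu_dereference_check(demo_cfg_ptr, lockdep_is_held(&demo_lock));
	if (cfg)
		val = cfg->value;
	rcu_read_unlock();

	return val;
}

Freeing the pointer replaced in demo_update() would then go through call_rcu() or synchronize_rcu(), much as destroy_sched_domain() uses call_rcu() above.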
6836 | 6835 | ||
6837 | /* cpus with isolated domains */ | 6836 | /* cpus with isolated domains */ |
6838 | static cpumask_var_t cpu_isolated_map; | 6837 | static cpumask_var_t cpu_isolated_map; |
6839 | 6838 | ||
6840 | /* Setup the mask of cpus configured for isolated domains */ | 6839 | /* Setup the mask of cpus configured for isolated domains */ |
6841 | static int __init isolated_cpu_setup(char *str) | 6840 | static int __init isolated_cpu_setup(char *str) |
6842 | { | 6841 | { |
6843 | alloc_bootmem_cpumask_var(&cpu_isolated_map); | 6842 | alloc_bootmem_cpumask_var(&cpu_isolated_map); |
6844 | cpulist_parse(str, cpu_isolated_map); | 6843 | cpulist_parse(str, cpu_isolated_map); |
6845 | return 1; | 6844 | return 1; |
6846 | } | 6845 | } |
6847 | 6846 | ||
6848 | __setup("isolcpus=", isolated_cpu_setup); | 6847 | __setup("isolcpus=", isolated_cpu_setup); |
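isolated_cpu_setup() hands the raw isolcpus= boot argument straight to cpulist_parse(), which accepts the usual comma-separated list/range syntax (for example isolcpus=1,3-5). A small sketch of the same parsing step outside early boot, with an invented demo_mask; the real code uses alloc_bootmem_cpumask_var() because it runs before the slab allocator is up:

#include <linux/cpumask.h>
#include <linux/gfp.h>
#include <linux/errno.h>

static cpumask_var_t demo_mask;

static int demo_parse_cpulist(const char *str)
{
	if (!alloc_cpumask_var(&demo_mask, GFP_KERNEL))
		return -ENOMEM;

	/* "1,3-5" sets CPUs 1, 3, 4 and 5 in demo_mask */
	return cpulist_parse(str, demo_mask);
}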
6849 | 6848 | ||
6850 | #define SD_NODES_PER_DOMAIN 16 | 6849 | #define SD_NODES_PER_DOMAIN 16 |
6851 | 6850 | ||
6852 | #ifdef CONFIG_NUMA | 6851 | #ifdef CONFIG_NUMA |
6853 | 6852 | ||
6854 | /** | 6853 | /** |
6855 | * find_next_best_node - find the next node to include in a sched_domain | 6854 | * find_next_best_node - find the next node to include in a sched_domain |
6856 | * @node: node whose sched_domain we're building | 6855 | * @node: node whose sched_domain we're building |
6857 | * @used_nodes: nodes already in the sched_domain | 6856 | * @used_nodes: nodes already in the sched_domain |
6858 | * | 6857 | * |
6859 | * Find the next node to include in a given scheduling domain. Simply | 6858 | * Find the next node to include in a given scheduling domain. Simply |
6860 | * finds the closest node not already in the @used_nodes map. | 6859 | * finds the closest node not already in the @used_nodes map. |
6861 | * | 6860 | * |
6862 | * Should use nodemask_t. | 6861 | * Should use nodemask_t. |
6863 | */ | 6862 | */ |
6864 | static int find_next_best_node(int node, nodemask_t *used_nodes) | 6863 | static int find_next_best_node(int node, nodemask_t *used_nodes) |
6865 | { | 6864 | { |
6866 | int i, n, val, min_val, best_node = -1; | 6865 | int i, n, val, min_val, best_node = -1; |
6867 | 6866 | ||
6868 | min_val = INT_MAX; | 6867 | min_val = INT_MAX; |
6869 | 6868 | ||
6870 | for (i = 0; i < nr_node_ids; i++) { | 6869 | for (i = 0; i < nr_node_ids; i++) { |
6871 | /* Start at @node */ | 6870 | /* Start at @node */ |
6872 | n = (node + i) % nr_node_ids; | 6871 | n = (node + i) % nr_node_ids; |
6873 | 6872 | ||
6874 | if (!nr_cpus_node(n)) | 6873 | if (!nr_cpus_node(n)) |
6875 | continue; | 6874 | continue; |
6876 | 6875 | ||
6877 | /* Skip already used nodes */ | 6876 | /* Skip already used nodes */ |
6878 | if (node_isset(n, *used_nodes)) | 6877 | if (node_isset(n, *used_nodes)) |
6879 | continue; | 6878 | continue; |
6880 | 6879 | ||
6881 | /* Simple min distance search */ | 6880 | /* Simple min distance search */ |
6882 | val = node_distance(node, n); | 6881 | val = node_distance(node, n); |
6883 | 6882 | ||
6884 | if (val < min_val) { | 6883 | if (val < min_val) { |
6885 | min_val = val; | 6884 | min_val = val; |
6886 | best_node = n; | 6885 | best_node = n; |
6887 | } | 6886 | } |
6888 | } | 6887 | } |
6889 | 6888 | ||
6890 | if (best_node != -1) | 6889 | if (best_node != -1) |
6891 | node_set(best_node, *used_nodes); | 6890 | node_set(best_node, *used_nodes); |
6892 | return best_node; | 6891 | return best_node; |
6893 | } | 6892 | } |
6894 | 6893 | ||
6895 | /** | 6894 | /** |
6896 | * sched_domain_node_span - get a cpumask for a node's sched_domain | 6895 | * sched_domain_node_span - get a cpumask for a node's sched_domain |
6897 | * @node: node whose cpumask we're constructing | 6896 | * @node: node whose cpumask we're constructing |
6898 | * @span: resulting cpumask | 6897 | * @span: resulting cpumask |
6899 | * | 6898 | * |
6900 | * Given a node, construct a good cpumask for its sched_domain to span. It | 6899 | * Given a node, construct a good cpumask for its sched_domain to span. It |
6901 | * should be one that prevents unnecessary balancing, but also spreads tasks | 6900 | * should be one that prevents unnecessary balancing, but also spreads tasks |
6902 | * out optimally. | 6901 | * out optimally. |
6903 | */ | 6902 | */ |
6904 | static void sched_domain_node_span(int node, struct cpumask *span) | 6903 | static void sched_domain_node_span(int node, struct cpumask *span) |
6905 | { | 6904 | { |
6906 | nodemask_t used_nodes; | 6905 | nodemask_t used_nodes; |
6907 | int i; | 6906 | int i; |
6908 | 6907 | ||
6909 | cpumask_clear(span); | 6908 | cpumask_clear(span); |
6910 | nodes_clear(used_nodes); | 6909 | nodes_clear(used_nodes); |
6911 | 6910 | ||
6912 | cpumask_or(span, span, cpumask_of_node(node)); | 6911 | cpumask_or(span, span, cpumask_of_node(node)); |
6913 | node_set(node, used_nodes); | 6912 | node_set(node, used_nodes); |
6914 | 6913 | ||
6915 | for (i = 1; i < SD_NODES_PER_DOMAIN; i++) { | 6914 | for (i = 1; i < SD_NODES_PER_DOMAIN; i++) { |
6916 | int next_node = find_next_best_node(node, &used_nodes); | 6915 | int next_node = find_next_best_node(node, &used_nodes); |
6917 | if (next_node < 0) | 6916 | if (next_node < 0) |
6918 | break; | 6917 | break; |
6919 | cpumask_or(span, span, cpumask_of_node(next_node)); | 6918 | cpumask_or(span, span, cpumask_of_node(next_node)); |
6920 | } | 6919 | } |
6921 | } | 6920 | } |
6922 | 6921 | ||
6923 | static const struct cpumask *cpu_node_mask(int cpu) | 6922 | static const struct cpumask *cpu_node_mask(int cpu) |
6924 | { | 6923 | { |
6925 | lockdep_assert_held(&sched_domains_mutex); | 6924 | lockdep_assert_held(&sched_domains_mutex); |
6926 | 6925 | ||
6927 | sched_domain_node_span(cpu_to_node(cpu), sched_domains_tmpmask); | 6926 | sched_domain_node_span(cpu_to_node(cpu), sched_domains_tmpmask); |
6928 | 6927 | ||
6929 | return sched_domains_tmpmask; | 6928 | return sched_domains_tmpmask; |
6930 | } | 6929 | } |
6931 | 6930 | ||
6932 | static const struct cpumask *cpu_allnodes_mask(int cpu) | 6931 | static const struct cpumask *cpu_allnodes_mask(int cpu) |
6933 | { | 6932 | { |
6934 | return cpu_possible_mask; | 6933 | return cpu_possible_mask; |
6935 | } | 6934 | } |
6936 | #endif /* CONFIG_NUMA */ | 6935 | #endif /* CONFIG_NUMA */ |
6937 | 6936 | ||
6938 | static const struct cpumask *cpu_cpu_mask(int cpu) | 6937 | static const struct cpumask *cpu_cpu_mask(int cpu) |
6939 | { | 6938 | { |
6940 | return cpumask_of_node(cpu_to_node(cpu)); | 6939 | return cpumask_of_node(cpu_to_node(cpu)); |
6941 | } | 6940 | } |
6942 | 6941 | ||
6943 | int sched_smt_power_savings = 0, sched_mc_power_savings = 0; | 6942 | int sched_smt_power_savings = 0, sched_mc_power_savings = 0; |
6944 | 6943 | ||
6945 | struct sd_data { | 6944 | struct sd_data { |
6946 | struct sched_domain **__percpu sd; | 6945 | struct sched_domain **__percpu sd; |
6947 | struct sched_group **__percpu sg; | 6946 | struct sched_group **__percpu sg; |
6948 | }; | 6947 | }; |
6949 | 6948 | ||
6950 | struct s_data { | 6949 | struct s_data { |
6951 | struct sched_domain ** __percpu sd; | 6950 | struct sched_domain ** __percpu sd; |
6952 | struct root_domain *rd; | 6951 | struct root_domain *rd; |
6953 | }; | 6952 | }; |
6954 | 6953 | ||
6955 | enum s_alloc { | 6954 | enum s_alloc { |
6956 | sa_rootdomain, | 6955 | sa_rootdomain, |
6957 | sa_sd, | 6956 | sa_sd, |
6958 | sa_sd_storage, | 6957 | sa_sd_storage, |
6959 | sa_none, | 6958 | sa_none, |
6960 | }; | 6959 | }; |
6961 | 6960 | ||
6962 | struct sched_domain_topology_level; | 6961 | struct sched_domain_topology_level; |
6963 | 6962 | ||
6964 | typedef struct sched_domain *(*sched_domain_init_f)(struct sched_domain_topology_level *tl, int cpu); | 6963 | typedef struct sched_domain *(*sched_domain_init_f)(struct sched_domain_topology_level *tl, int cpu); |
6965 | typedef const struct cpumask *(*sched_domain_mask_f)(int cpu); | 6964 | typedef const struct cpumask *(*sched_domain_mask_f)(int cpu); |
6966 | 6965 | ||
6967 | struct sched_domain_topology_level { | 6966 | struct sched_domain_topology_level { |
6968 | sched_domain_init_f init; | 6967 | sched_domain_init_f init; |
6969 | sched_domain_mask_f mask; | 6968 | sched_domain_mask_f mask; |
6970 | struct sd_data data; | 6969 | struct sd_data data; |
6971 | }; | 6970 | }; |
6972 | 6971 | ||
6973 | /* | 6972 | /* |
6974 | * Assumes the sched_domain tree is fully constructed | 6973 | * Assumes the sched_domain tree is fully constructed |
6975 | */ | 6974 | */ |
6976 | static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg) | 6975 | static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg) |
6977 | { | 6976 | { |
6978 | struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu); | 6977 | struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu); |
6979 | struct sched_domain *child = sd->child; | 6978 | struct sched_domain *child = sd->child; |
6980 | 6979 | ||
6981 | if (child) | 6980 | if (child) |
6982 | cpu = cpumask_first(sched_domain_span(child)); | 6981 | cpu = cpumask_first(sched_domain_span(child)); |
6983 | 6982 | ||
6984 | if (sg) | 6983 | if (sg) |
6985 | *sg = *per_cpu_ptr(sdd->sg, cpu); | 6984 | *sg = *per_cpu_ptr(sdd->sg, cpu); |
6986 | 6985 | ||
6987 | return cpu; | 6986 | return cpu; |
6988 | } | 6987 | } |
6989 | 6988 | ||
6990 | /* | 6989 | /* |
6991 | * build_sched_groups takes the cpumask we wish to span, and a pointer | 6990 | * build_sched_groups takes the cpumask we wish to span, and a pointer |
6992 | * to a function which identifies what group (along with sched group) a CPU | 6991 | * to a function which identifies what group (along with sched group) a CPU |
6993 | * belongs to. The return value of group_fn must be >= 0 and < nr_cpu_ids | 6992 | * belongs to. The return value of group_fn must be >= 0 and < nr_cpu_ids |
6994 | * (due to the fact that we keep track of groups covered with a struct cpumask). | 6993 | * (due to the fact that we keep track of groups covered with a struct cpumask). |
6995 | * | 6994 | * |
6996 | * build_sched_groups will build a circular linked list of the groups | 6995 | * build_sched_groups will build a circular linked list of the groups |
6997 | * covered by the given span, and will set each group's ->cpumask correctly, | 6996 | * covered by the given span, and will set each group's ->cpumask correctly, |
6998 | * and ->cpu_power to 0. | 6997 | * and ->cpu_power to 0. |
6999 | */ | 6998 | */ |
7000 | static void | 6999 | static void |
7001 | build_sched_groups(struct sched_domain *sd) | 7000 | build_sched_groups(struct sched_domain *sd) |
7002 | { | 7001 | { |
7003 | struct sched_group *first = NULL, *last = NULL; | 7002 | struct sched_group *first = NULL, *last = NULL; |
7004 | struct sd_data *sdd = sd->private; | 7003 | struct sd_data *sdd = sd->private; |
7005 | const struct cpumask *span = sched_domain_span(sd); | 7004 | const struct cpumask *span = sched_domain_span(sd); |
7006 | struct cpumask *covered; | 7005 | struct cpumask *covered; |
7007 | int i; | 7006 | int i; |
7008 | 7007 | ||
7009 | lockdep_assert_held(&sched_domains_mutex); | 7008 | lockdep_assert_held(&sched_domains_mutex); |
7010 | covered = sched_domains_tmpmask; | 7009 | covered = sched_domains_tmpmask; |
7011 | 7010 | ||
7012 | cpumask_clear(covered); | 7011 | cpumask_clear(covered); |
7013 | 7012 | ||
7014 | for_each_cpu(i, span) { | 7013 | for_each_cpu(i, span) { |
7015 | struct sched_group *sg; | 7014 | struct sched_group *sg; |
7016 | int group = get_group(i, sdd, &sg); | 7015 | int group = get_group(i, sdd, &sg); |
7017 | int j; | 7016 | int j; |
7018 | 7017 | ||
7019 | if (cpumask_test_cpu(i, covered)) | 7018 | if (cpumask_test_cpu(i, covered)) |
7020 | continue; | 7019 | continue; |
7021 | 7020 | ||
7022 | cpumask_clear(sched_group_cpus(sg)); | 7021 | cpumask_clear(sched_group_cpus(sg)); |
7023 | sg->cpu_power = 0; | 7022 | sg->cpu_power = 0; |
7024 | 7023 | ||
7025 | for_each_cpu(j, span) { | 7024 | for_each_cpu(j, span) { |
7026 | if (get_group(j, sdd, NULL) != group) | 7025 | if (get_group(j, sdd, NULL) != group) |
7027 | continue; | 7026 | continue; |
7028 | 7027 | ||
7029 | cpumask_set_cpu(j, covered); | 7028 | cpumask_set_cpu(j, covered); |
7030 | cpumask_set_cpu(j, sched_group_cpus(sg)); | 7029 | cpumask_set_cpu(j, sched_group_cpus(sg)); |
7031 | } | 7030 | } |
7032 | 7031 | ||
7033 | if (!first) | 7032 | if (!first) |
7034 | first = sg; | 7033 | first = sg; |
7035 | if (last) | 7034 | if (last) |
7036 | last->next = sg; | 7035 | last->next = sg; |
7037 | last = sg; | 7036 | last = sg; |
7038 | } | 7037 | } |
7039 | last->next = first; | 7038 | last->next = first; |
7040 | } | 7039 | } |
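build_sched_groups() closes the list with last->next = first, so the groups of a domain form a circular singly linked list; consumers walk it with a do/while that stops once it wraps back to sd->groups, exactly as sched_domain_debug_one() does above. A stripped-down sketch of that traversal, with a hypothetical visit() callback:

/* Sketch only: visit every group on the circular list rooted at sd->groups. */
static void demo_for_each_group(struct sched_domain *sd,
				void (*visit)(struct sched_group *sg))
{
	struct sched_group *sg = sd->groups;

	do {
		visit(sg);
		sg = sg->next;
	} while (sg != sd->groups);	/* stop after one full lap */
}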
7041 | 7040 | ||
7042 | /* | 7041 | /* |
7043 | * Initialize sched groups cpu_power. | 7042 | * Initialize sched groups cpu_power. |
7044 | * | 7043 | * |
7045 | * cpu_power indicates the capacity of sched group, which is used while | 7044 | * cpu_power indicates the capacity of sched group, which is used while |
7046 | * distributing the load between different sched groups in a sched domain. | 7045 | * distributing the load between different sched groups in a sched domain. |
7047 | * Typically cpu_power for all the groups in a sched domain will be the same unless | 7046 | * Typically cpu_power for all the groups in a sched domain will be the same unless |
7048 | * there are asymmetries in the topology. If there are asymmetries, group | 7047 | * there are asymmetries in the topology. If there are asymmetries, group |
7049 | * having more cpu_power will pick up more load compared to the group having | 7048 | * having more cpu_power will pick up more load compared to the group having |
7050 | * less cpu_power. | 7049 | * less cpu_power. |
7051 | */ | 7050 | */ |
7052 | static void init_sched_groups_power(int cpu, struct sched_domain *sd) | 7051 | static void init_sched_groups_power(int cpu, struct sched_domain *sd) |
7053 | { | 7052 | { |
7054 | WARN_ON(!sd || !sd->groups); | 7053 | WARN_ON(!sd || !sd->groups); |
7055 | 7054 | ||
7056 | if (cpu != group_first_cpu(sd->groups)) | 7055 | if (cpu != group_first_cpu(sd->groups)) |
7057 | return; | 7056 | return; |
7058 | 7057 | ||
7059 | sd->groups->group_weight = cpumask_weight(sched_group_cpus(sd->groups)); | 7058 | sd->groups->group_weight = cpumask_weight(sched_group_cpus(sd->groups)); |
7060 | 7059 | ||
7061 | update_group_power(sd, cpu); | 7060 | update_group_power(sd, cpu); |
7062 | } | 7061 | } |
7063 | 7062 | ||
7064 | /* | 7063 | /* |
7065 | * Initializers for schedule domains | 7064 | * Initializers for schedule domains |
7066 | * Non-inlined to reduce accumulated stack pressure in build_sched_domains() | 7065 | * Non-inlined to reduce accumulated stack pressure in build_sched_domains() |
7067 | */ | 7066 | */ |
7068 | 7067 | ||
7069 | #ifdef CONFIG_SCHED_DEBUG | 7068 | #ifdef CONFIG_SCHED_DEBUG |
7070 | # define SD_INIT_NAME(sd, type) sd->name = #type | 7069 | # define SD_INIT_NAME(sd, type) sd->name = #type |
7071 | #else | 7070 | #else |
7072 | # define SD_INIT_NAME(sd, type) do { } while (0) | 7071 | # define SD_INIT_NAME(sd, type) do { } while (0) |
7073 | #endif | 7072 | #endif |
7074 | 7073 | ||
7075 | #define SD_INIT_FUNC(type) \ | 7074 | #define SD_INIT_FUNC(type) \ |
7076 | static noinline struct sched_domain * \ | 7075 | static noinline struct sched_domain * \ |
7077 | sd_init_##type(struct sched_domain_topology_level *tl, int cpu) \ | 7076 | sd_init_##type(struct sched_domain_topology_level *tl, int cpu) \ |
7078 | { \ | 7077 | { \ |
7079 | struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu); \ | 7078 | struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu); \ |
7080 | *sd = SD_##type##_INIT; \ | 7079 | *sd = SD_##type##_INIT; \ |
7081 | SD_INIT_NAME(sd, type); \ | 7080 | SD_INIT_NAME(sd, type); \ |
7082 | sd->private = &tl->data; \ | 7081 | sd->private = &tl->data; \ |
7083 | return sd; \ | 7082 | return sd; \ |
7084 | } | 7083 | } |
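Since SD_INIT_NAME() stringifies its type argument, the SD_INIT_FUNC(CPU) instantiation just below expands (with CONFIG_SCHED_DEBUG enabled) to roughly the following; SD_CPU_INIT is the per-level template supplied by the topology headers:

static noinline struct sched_domain *
sd_init_CPU(struct sched_domain_topology_level *tl, int cpu)
{
	struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu);

	*sd = SD_CPU_INIT;		/* flag/parameter template for this level */
	sd->name = "CPU";		/* from SD_INIT_NAME(sd, CPU) */
	sd->private = &tl->data;
	return sd;
}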
7085 | 7084 | ||
7086 | SD_INIT_FUNC(CPU) | 7085 | SD_INIT_FUNC(CPU) |
7087 | #ifdef CONFIG_NUMA | 7086 | #ifdef CONFIG_NUMA |
7088 | SD_INIT_FUNC(ALLNODES) | 7087 | SD_INIT_FUNC(ALLNODES) |
7089 | SD_INIT_FUNC(NODE) | 7088 | SD_INIT_FUNC(NODE) |
7090 | #endif | 7089 | #endif |
7091 | #ifdef CONFIG_SCHED_SMT | 7090 | #ifdef CONFIG_SCHED_SMT |
7092 | SD_INIT_FUNC(SIBLING) | 7091 | SD_INIT_FUNC(SIBLING) |
7093 | #endif | 7092 | #endif |
7094 | #ifdef CONFIG_SCHED_MC | 7093 | #ifdef CONFIG_SCHED_MC |
7095 | SD_INIT_FUNC(MC) | 7094 | SD_INIT_FUNC(MC) |
7096 | #endif | 7095 | #endif |
7097 | #ifdef CONFIG_SCHED_BOOK | 7096 | #ifdef CONFIG_SCHED_BOOK |
7098 | SD_INIT_FUNC(BOOK) | 7097 | SD_INIT_FUNC(BOOK) |
7099 | #endif | 7098 | #endif |
7100 | 7099 | ||
7101 | static int default_relax_domain_level = -1; | 7100 | static int default_relax_domain_level = -1; |
7102 | int sched_domain_level_max; | 7101 | int sched_domain_level_max; |
7103 | 7102 | ||
7104 | static int __init setup_relax_domain_level(char *str) | 7103 | static int __init setup_relax_domain_level(char *str) |
7105 | { | 7104 | { |
7106 | unsigned long val; | 7105 | unsigned long val; |
7107 | 7106 | ||
7108 | val = simple_strtoul(str, NULL, 0); | 7107 | val = simple_strtoul(str, NULL, 0); |
7109 | if (val < sched_domain_level_max) | 7108 | if (val < sched_domain_level_max) |
7110 | default_relax_domain_level = val; | 7109 | default_relax_domain_level = val; |
7111 | 7110 | ||
7112 | return 1; | 7111 | return 1; |
7113 | } | 7112 | } |
7114 | __setup("relax_domain_level=", setup_relax_domain_level); | 7113 | __setup("relax_domain_level=", setup_relax_domain_level); |
7115 | 7114 | ||
7116 | static void set_domain_attribute(struct sched_domain *sd, | 7115 | static void set_domain_attribute(struct sched_domain *sd, |
7117 | struct sched_domain_attr *attr) | 7116 | struct sched_domain_attr *attr) |
7118 | { | 7117 | { |
7119 | int request; | 7118 | int request; |
7120 | 7119 | ||
7121 | if (!attr || attr->relax_domain_level < 0) { | 7120 | if (!attr || attr->relax_domain_level < 0) { |
7122 | if (default_relax_domain_level < 0) | 7121 | if (default_relax_domain_level < 0) |
7123 | return; | 7122 | return; |
7124 | else | 7123 | else |
7125 | request = default_relax_domain_level; | 7124 | request = default_relax_domain_level; |
7126 | } else | 7125 | } else |
7127 | request = attr->relax_domain_level; | 7126 | request = attr->relax_domain_level; |
7128 | if (request < sd->level) { | 7127 | if (request < sd->level) { |
7129 | /* turn off idle balance on this domain */ | 7128 | /* turn off idle balance on this domain */ |
7130 | sd->flags &= ~(SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE); | 7129 | sd->flags &= ~(SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE); |
7131 | } else { | 7130 | } else { |
7132 | /* turn on idle balance on this domain */ | 7131 | /* turn on idle balance on this domain */ |
7133 | sd->flags |= (SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE); | 7132 | sd->flags |= (SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE); |
7134 | } | 7133 | } |
7135 | } | 7134 | } |
7136 | 7135 | ||
7137 | static void __sdt_free(const struct cpumask *cpu_map); | 7136 | static void __sdt_free(const struct cpumask *cpu_map); |
7138 | static int __sdt_alloc(const struct cpumask *cpu_map); | 7137 | static int __sdt_alloc(const struct cpumask *cpu_map); |
7139 | 7138 | ||
7140 | static void __free_domain_allocs(struct s_data *d, enum s_alloc what, | 7139 | static void __free_domain_allocs(struct s_data *d, enum s_alloc what, |
7141 | const struct cpumask *cpu_map) | 7140 | const struct cpumask *cpu_map) |
7142 | { | 7141 | { |
7143 | switch (what) { | 7142 | switch (what) { |
7144 | case sa_rootdomain: | 7143 | case sa_rootdomain: |
7145 | if (!atomic_read(&d->rd->refcount)) | 7144 | if (!atomic_read(&d->rd->refcount)) |
7146 | free_rootdomain(&d->rd->rcu); /* fall through */ | 7145 | free_rootdomain(&d->rd->rcu); /* fall through */ |
7147 | case sa_sd: | 7146 | case sa_sd: |
7148 | free_percpu(d->sd); /* fall through */ | 7147 | free_percpu(d->sd); /* fall through */ |
7149 | case sa_sd_storage: | 7148 | case sa_sd_storage: |
7150 | __sdt_free(cpu_map); /* fall through */ | 7149 | __sdt_free(cpu_map); /* fall through */ |
7151 | case sa_none: | 7150 | case sa_none: |
7152 | break; | 7151 | break; |
7153 | } | 7152 | } |
7154 | } | 7153 | } |
7155 | 7154 | ||
7156 | static enum s_alloc __visit_domain_allocation_hell(struct s_data *d, | 7155 | static enum s_alloc __visit_domain_allocation_hell(struct s_data *d, |
7157 | const struct cpumask *cpu_map) | 7156 | const struct cpumask *cpu_map) |
7158 | { | 7157 | { |
7159 | memset(d, 0, sizeof(*d)); | 7158 | memset(d, 0, sizeof(*d)); |
7160 | 7159 | ||
7161 | if (__sdt_alloc(cpu_map)) | 7160 | if (__sdt_alloc(cpu_map)) |
7162 | return sa_sd_storage; | 7161 | return sa_sd_storage; |
7163 | d->sd = alloc_percpu(struct sched_domain *); | 7162 | d->sd = alloc_percpu(struct sched_domain *); |
7164 | if (!d->sd) | 7163 | if (!d->sd) |
7165 | return sa_sd_storage; | 7164 | return sa_sd_storage; |
7166 | d->rd = alloc_rootdomain(); | 7165 | d->rd = alloc_rootdomain(); |
7167 | if (!d->rd) | 7166 | if (!d->rd) |
7168 | return sa_sd; | 7167 | return sa_sd; |
7169 | return sa_rootdomain; | 7168 | return sa_rootdomain; |
7170 | } | 7169 | } |
7171 | 7170 | ||
7172 | /* | 7171 | /* |
7173 | * NULL the sd_data elements we've used to build the sched_domain and | 7172 | * NULL the sd_data elements we've used to build the sched_domain and |
7174 | * sched_group structure so that the subsequent __free_domain_allocs() | 7173 | * sched_group structure so that the subsequent __free_domain_allocs() |
7175 | * will not free the data we're using. | 7174 | * will not free the data we're using. |
7176 | */ | 7175 | */ |
7177 | static void claim_allocations(int cpu, struct sched_domain *sd) | 7176 | static void claim_allocations(int cpu, struct sched_domain *sd) |
7178 | { | 7177 | { |
7179 | struct sd_data *sdd = sd->private; | 7178 | struct sd_data *sdd = sd->private; |
7180 | struct sched_group *sg = sd->groups; | 7179 | struct sched_group *sg = sd->groups; |
7181 | 7180 | ||
7182 | WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd); | 7181 | WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd); |
7183 | *per_cpu_ptr(sdd->sd, cpu) = NULL; | 7182 | *per_cpu_ptr(sdd->sd, cpu) = NULL; |
7184 | 7183 | ||
7185 | if (cpu == cpumask_first(sched_group_cpus(sg))) { | 7184 | if (cpu == cpumask_first(sched_group_cpus(sg))) { |
7186 | WARN_ON_ONCE(*per_cpu_ptr(sdd->sg, cpu) != sg); | 7185 | WARN_ON_ONCE(*per_cpu_ptr(sdd->sg, cpu) != sg); |
7187 | *per_cpu_ptr(sdd->sg, cpu) = NULL; | 7186 | *per_cpu_ptr(sdd->sg, cpu) = NULL; |
7188 | } | 7187 | } |
7189 | } | 7188 | } |
7190 | 7189 | ||
7191 | #ifdef CONFIG_SCHED_SMT | 7190 | #ifdef CONFIG_SCHED_SMT |
7192 | static const struct cpumask *cpu_smt_mask(int cpu) | 7191 | static const struct cpumask *cpu_smt_mask(int cpu) |
7193 | { | 7192 | { |
7194 | return topology_thread_cpumask(cpu); | 7193 | return topology_thread_cpumask(cpu); |
7195 | } | 7194 | } |
7196 | #endif | 7195 | #endif |
7197 | 7196 | ||
7198 | /* | 7197 | /* |
7199 | * Topology list, bottom-up. | 7198 | * Topology list, bottom-up. |
7200 | */ | 7199 | */ |
7201 | static struct sched_domain_topology_level default_topology[] = { | 7200 | static struct sched_domain_topology_level default_topology[] = { |
7202 | #ifdef CONFIG_SCHED_SMT | 7201 | #ifdef CONFIG_SCHED_SMT |
7203 | { sd_init_SIBLING, cpu_smt_mask, }, | 7202 | { sd_init_SIBLING, cpu_smt_mask, }, |
7204 | #endif | 7203 | #endif |
7205 | #ifdef CONFIG_SCHED_MC | 7204 | #ifdef CONFIG_SCHED_MC |
7206 | { sd_init_MC, cpu_coregroup_mask, }, | 7205 | { sd_init_MC, cpu_coregroup_mask, }, |
7207 | #endif | 7206 | #endif |
7208 | #ifdef CONFIG_SCHED_BOOK | 7207 | #ifdef CONFIG_SCHED_BOOK |
7209 | { sd_init_BOOK, cpu_book_mask, }, | 7208 | { sd_init_BOOK, cpu_book_mask, }, |
7210 | #endif | 7209 | #endif |
7211 | { sd_init_CPU, cpu_cpu_mask, }, | 7210 | { sd_init_CPU, cpu_cpu_mask, }, |
7212 | #ifdef CONFIG_NUMA | 7211 | #ifdef CONFIG_NUMA |
7213 | { sd_init_NODE, cpu_node_mask, }, | 7212 | { sd_init_NODE, cpu_node_mask, }, |
7214 | { sd_init_ALLNODES, cpu_allnodes_mask, }, | 7213 | { sd_init_ALLNODES, cpu_allnodes_mask, }, |
7215 | #endif | 7214 | #endif |
7216 | { NULL, }, | 7215 | { NULL, }, |
7217 | }; | 7216 | }; |
7218 | 7217 | ||
7219 | static struct sched_domain_topology_level *sched_domain_topology = default_topology; | 7218 | static struct sched_domain_topology_level *sched_domain_topology = default_topology; |
7220 | 7219 | ||
7221 | static int __sdt_alloc(const struct cpumask *cpu_map) | 7220 | static int __sdt_alloc(const struct cpumask *cpu_map) |
7222 | { | 7221 | { |
7223 | struct sched_domain_topology_level *tl; | 7222 | struct sched_domain_topology_level *tl; |
7224 | int j; | 7223 | int j; |
7225 | 7224 | ||
7226 | for (tl = sched_domain_topology; tl->init; tl++) { | 7225 | for (tl = sched_domain_topology; tl->init; tl++) { |
7227 | struct sd_data *sdd = &tl->data; | 7226 | struct sd_data *sdd = &tl->data; |
7228 | 7227 | ||
7229 | sdd->sd = alloc_percpu(struct sched_domain *); | 7228 | sdd->sd = alloc_percpu(struct sched_domain *); |
7230 | if (!sdd->sd) | 7229 | if (!sdd->sd) |
7231 | return -ENOMEM; | 7230 | return -ENOMEM; |
7232 | 7231 | ||
7233 | sdd->sg = alloc_percpu(struct sched_group *); | 7232 | sdd->sg = alloc_percpu(struct sched_group *); |
7234 | if (!sdd->sg) | 7233 | if (!sdd->sg) |
7235 | return -ENOMEM; | 7234 | return -ENOMEM; |
7236 | 7235 | ||
7237 | for_each_cpu(j, cpu_map) { | 7236 | for_each_cpu(j, cpu_map) { |
7238 | struct sched_domain *sd; | 7237 | struct sched_domain *sd; |
7239 | struct sched_group *sg; | 7238 | struct sched_group *sg; |
7240 | 7239 | ||
7241 | sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(), | 7240 | sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(), |
7242 | GFP_KERNEL, cpu_to_node(j)); | 7241 | GFP_KERNEL, cpu_to_node(j)); |
7243 | if (!sd) | 7242 | if (!sd) |
7244 | return -ENOMEM; | 7243 | return -ENOMEM; |
7245 | 7244 | ||
7246 | *per_cpu_ptr(sdd->sd, j) = sd; | 7245 | *per_cpu_ptr(sdd->sd, j) = sd; |
7247 | 7246 | ||
7248 | sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(), | 7247 | sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(), |
7249 | GFP_KERNEL, cpu_to_node(j)); | 7248 | GFP_KERNEL, cpu_to_node(j)); |
7250 | if (!sg) | 7249 | if (!sg) |
7251 | return -ENOMEM; | 7250 | return -ENOMEM; |
7252 | 7251 | ||
7253 | *per_cpu_ptr(sdd->sg, j) = sg; | 7252 | *per_cpu_ptr(sdd->sg, j) = sg; |
7254 | } | 7253 | } |
7255 | } | 7254 | } |
7256 | 7255 | ||
7257 | return 0; | 7256 | return 0; |
7258 | } | 7257 | } |
7259 | 7258 | ||
7260 | static void __sdt_free(const struct cpumask *cpu_map) | 7259 | static void __sdt_free(const struct cpumask *cpu_map) |
7261 | { | 7260 | { |
7262 | struct sched_domain_topology_level *tl; | 7261 | struct sched_domain_topology_level *tl; |
7263 | int j; | 7262 | int j; |
7264 | 7263 | ||
7265 | for (tl = sched_domain_topology; tl->init; tl++) { | 7264 | for (tl = sched_domain_topology; tl->init; tl++) { |
7266 | struct sd_data *sdd = &tl->data; | 7265 | struct sd_data *sdd = &tl->data; |
7267 | 7266 | ||
7268 | for_each_cpu(j, cpu_map) { | 7267 | for_each_cpu(j, cpu_map) { |
7269 | kfree(*per_cpu_ptr(sdd->sd, j)); | 7268 | kfree(*per_cpu_ptr(sdd->sd, j)); |
7270 | kfree(*per_cpu_ptr(sdd->sg, j)); | 7269 | kfree(*per_cpu_ptr(sdd->sg, j)); |
7271 | } | 7270 | } |
7272 | free_percpu(sdd->sd); | 7271 | free_percpu(sdd->sd); |
7273 | free_percpu(sdd->sg); | 7272 | free_percpu(sdd->sg); |
7274 | } | 7273 | } |
7275 | } | 7274 | } |
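__sdt_alloc() and __sdt_free() above use the common percpu-pointer idiom: alloc_percpu() provides one pointer-sized slot per possible CPU, *per_cpu_ptr(p, cpu) names the slot for a given CPU, and the per-CPU objects themselves come from kzalloc_node() so they land on the right NUMA node. A minimal, self-contained sketch of the same idiom follows; struct item, example_alloc() and example_free() are made-up names.

/* Illustrative percpu-pointer idiom, mirroring sdd->sd above. */
struct item { int val; };

static struct item **example;	/* sparse __percpu annotation omitted for brevity */

static int example_alloc(void)
{
	int cpu;

	example = alloc_percpu(struct item *);
	if (!example)
		return -ENOMEM;

	for_each_possible_cpu(cpu) {
		struct item *it = kzalloc_node(sizeof(*it), GFP_KERNEL,
					       cpu_to_node(cpu));
		if (!it)
			return -ENOMEM;	/* caller unwinds via example_free() */
		*per_cpu_ptr(example, cpu) = it;
	}
	return 0;
}

static void example_free(void)
{
	int cpu;

	for_each_possible_cpu(cpu)
		kfree(*per_cpu_ptr(example, cpu));
	free_percpu(example);
}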
7276 | 7275 | ||
7277 | struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl, | 7276 | struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl, |
7278 | struct s_data *d, const struct cpumask *cpu_map, | 7277 | struct s_data *d, const struct cpumask *cpu_map, |
7279 | struct sched_domain_attr *attr, struct sched_domain *child, | 7278 | struct sched_domain_attr *attr, struct sched_domain *child, |
7280 | int cpu) | 7279 | int cpu) |
7281 | { | 7280 | { |
7282 | struct sched_domain *sd = tl->init(tl, cpu); | 7281 | struct sched_domain *sd = tl->init(tl, cpu); |
7283 | if (!sd) | 7282 | if (!sd) |
7284 | return child; | 7283 | return child; |
7285 | 7284 | ||
7286 | set_domain_attribute(sd, attr); | 7285 | set_domain_attribute(sd, attr); |
7287 | cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu)); | 7286 | cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu)); |
7288 | if (child) { | 7287 | if (child) { |
7289 | sd->level = child->level + 1; | 7288 | sd->level = child->level + 1; |
7290 | sched_domain_level_max = max(sched_domain_level_max, sd->level); | 7289 | sched_domain_level_max = max(sched_domain_level_max, sd->level); |
7291 | child->parent = sd; | 7290 | child->parent = sd; |
7292 | } | 7291 | } |
7293 | sd->child = child; | 7292 | sd->child = child; |
7294 | 7293 | ||
7295 | return sd; | 7294 | return sd; |
7296 | } | 7295 | } |
7297 | 7296 | ||
7298 | /* | 7297 | /* |
7299 | * Build sched domains for a given set of cpus and attach the sched domains | 7298 | * Build sched domains for a given set of cpus and attach the sched domains |
7300 | * to the individual cpus | 7299 | * to the individual cpus |
7301 | */ | 7300 | */ |
7302 | static int build_sched_domains(const struct cpumask *cpu_map, | 7301 | static int build_sched_domains(const struct cpumask *cpu_map, |
7303 | struct sched_domain_attr *attr) | 7302 | struct sched_domain_attr *attr) |
7304 | { | 7303 | { |
7305 | enum s_alloc alloc_state = sa_none; | 7304 | enum s_alloc alloc_state = sa_none; |
7306 | struct sched_domain *sd; | 7305 | struct sched_domain *sd; |
7307 | struct s_data d; | 7306 | struct s_data d; |
7308 | int i, ret = -ENOMEM; | 7307 | int i, ret = -ENOMEM; |
7309 | 7308 | ||
7310 | alloc_state = __visit_domain_allocation_hell(&d, cpu_map); | 7309 | alloc_state = __visit_domain_allocation_hell(&d, cpu_map); |
7311 | if (alloc_state != sa_rootdomain) | 7310 | if (alloc_state != sa_rootdomain) |
7312 | goto error; | 7311 | goto error; |
7313 | 7312 | ||
7314 | /* Set up domains for cpus specified by the cpu_map. */ | 7313 | /* Set up domains for cpus specified by the cpu_map. */ |
7315 | for_each_cpu(i, cpu_map) { | 7314 | for_each_cpu(i, cpu_map) { |
7316 | struct sched_domain_topology_level *tl; | 7315 | struct sched_domain_topology_level *tl; |
7317 | 7316 | ||
7318 | sd = NULL; | 7317 | sd = NULL; |
7319 | for (tl = sched_domain_topology; tl->init; tl++) | 7318 | for (tl = sched_domain_topology; tl->init; tl++) |
7320 | sd = build_sched_domain(tl, &d, cpu_map, attr, sd, i); | 7319 | sd = build_sched_domain(tl, &d, cpu_map, attr, sd, i); |
7321 | 7320 | ||
7322 | while (sd->child) | 7321 | while (sd->child) |
7323 | sd = sd->child; | 7322 | sd = sd->child; |
7324 | 7323 | ||
7325 | *per_cpu_ptr(d.sd, i) = sd; | 7324 | *per_cpu_ptr(d.sd, i) = sd; |
7326 | } | 7325 | } |
7327 | 7326 | ||
7328 | /* Build the groups for the domains */ | 7327 | /* Build the groups for the domains */ |
7329 | for_each_cpu(i, cpu_map) { | 7328 | for_each_cpu(i, cpu_map) { |
7330 | for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) { | 7329 | for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) { |
7331 | sd->span_weight = cpumask_weight(sched_domain_span(sd)); | 7330 | sd->span_weight = cpumask_weight(sched_domain_span(sd)); |
7332 | get_group(i, sd->private, &sd->groups); | 7331 | get_group(i, sd->private, &sd->groups); |
7333 | atomic_inc(&sd->groups->ref); | 7332 | atomic_inc(&sd->groups->ref); |
7334 | 7333 | ||
7335 | if (i != cpumask_first(sched_domain_span(sd))) | 7334 | if (i != cpumask_first(sched_domain_span(sd))) |
7336 | continue; | 7335 | continue; |
7337 | 7336 | ||
7338 | build_sched_groups(sd); | 7337 | build_sched_groups(sd); |
7339 | } | 7338 | } |
7340 | } | 7339 | } |
7341 | 7340 | ||
7342 | /* Calculate CPU power for physical packages and nodes */ | 7341 | /* Calculate CPU power for physical packages and nodes */ |
7343 | for (i = nr_cpumask_bits-1; i >= 0; i--) { | 7342 | for (i = nr_cpumask_bits-1; i >= 0; i--) { |
7344 | if (!cpumask_test_cpu(i, cpu_map)) | 7343 | if (!cpumask_test_cpu(i, cpu_map)) |
7345 | continue; | 7344 | continue; |
7346 | 7345 | ||
7347 | for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) { | 7346 | for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) { |
7348 | claim_allocations(i, sd); | 7347 | claim_allocations(i, sd); |
7349 | init_sched_groups_power(i, sd); | 7348 | init_sched_groups_power(i, sd); |
7350 | } | 7349 | } |
7351 | } | 7350 | } |
7352 | 7351 | ||
7353 | /* Attach the domains */ | 7352 | /* Attach the domains */ |
7354 | rcu_read_lock(); | 7353 | rcu_read_lock(); |
7355 | for_each_cpu(i, cpu_map) { | 7354 | for_each_cpu(i, cpu_map) { |
7356 | sd = *per_cpu_ptr(d.sd, i); | 7355 | sd = *per_cpu_ptr(d.sd, i); |
7357 | cpu_attach_domain(sd, d.rd, i); | 7356 | cpu_attach_domain(sd, d.rd, i); |
7358 | } | 7357 | } |
7359 | rcu_read_unlock(); | 7358 | rcu_read_unlock(); |
7360 | 7359 | ||
7361 | ret = 0; | 7360 | ret = 0; |
7362 | error: | 7361 | error: |
7363 | __free_domain_allocs(&d, alloc_state, cpu_map); | 7362 | __free_domain_allocs(&d, alloc_state, cpu_map); |
7364 | return ret; | 7363 | return ret; |
7365 | } | 7364 | } |
7366 | 7365 | ||
7367 | static cpumask_var_t *doms_cur; /* current sched domains */ | 7366 | static cpumask_var_t *doms_cur; /* current sched domains */ |
7368 | static int ndoms_cur; /* number of sched domains in 'doms_cur' */ | 7367 | static int ndoms_cur; /* number of sched domains in 'doms_cur' */ |
7369 | static struct sched_domain_attr *dattr_cur; | 7368 | static struct sched_domain_attr *dattr_cur; |
7370 | /* attributes of custom domains in 'doms_cur' */ | 7369 | /* attributes of custom domains in 'doms_cur' */ |
7371 | 7370 | ||
7372 | /* | 7371 | /* |
7373 | * Special case: If a kmalloc of a doms_cur partition (array of | 7372 | * Special case: If a kmalloc of a doms_cur partition (array of |
7374 | * cpumask) fails, then fall back to a single sched domain, | 7373 | * cpumask) fails, then fall back to a single sched domain, |
7375 | * as determined by the single cpumask fallback_doms. | 7374 | * as determined by the single cpumask fallback_doms. |
7376 | */ | 7375 | */ |
7377 | static cpumask_var_t fallback_doms; | 7376 | static cpumask_var_t fallback_doms; |
7378 | 7377 | ||
7379 | /* | 7378 | /* |
7380 | * arch_update_cpu_topology lets virtualized architectures update the | 7379 | * arch_update_cpu_topology lets virtualized architectures update the |
7381 | * cpu core maps. It is supposed to return 1 if the topology changed | 7380 | * cpu core maps. It is supposed to return 1 if the topology changed |
7382 | * or 0 if it stayed the same. | 7381 | * or 0 if it stayed the same. |
7383 | */ | 7382 | */ |
7384 | int __attribute__((weak)) arch_update_cpu_topology(void) | 7383 | int __attribute__((weak)) arch_update_cpu_topology(void) |
7385 | { | 7384 | { |
7386 | return 0; | 7385 | return 0; |
7387 | } | 7386 | } |
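arch_update_cpu_topology() is a weak symbol, so the generic version above simply reports "no change"; an architecture whose core maps can move underneath it (for example under a hypervisor) supplies a strong definition instead, and a non-zero return makes partition_sched_domains() rebuild every domain even when the cpumasks look identical. A minimal sketch of such an override, assuming a hypothetical topology_changed flag maintained by arch code:

/* Illustrative arch-side strong definition; 'topology_changed' is a
 * hypothetical flag set elsewhere when the hypervisor reports a new
 * core layout. */
int arch_update_cpu_topology(void)
{
	if (!topology_changed)
		return 0;	/* keep the cached sched domains */

	topology_changed = 0;
	return 1;		/* force a full rebuild of all domains */
}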
7388 | 7387 | ||
7389 | cpumask_var_t *alloc_sched_domains(unsigned int ndoms) | 7388 | cpumask_var_t *alloc_sched_domains(unsigned int ndoms) |
7390 | { | 7389 | { |
7391 | int i; | 7390 | int i; |
7392 | cpumask_var_t *doms; | 7391 | cpumask_var_t *doms; |
7393 | 7392 | ||
7394 | doms = kmalloc(sizeof(*doms) * ndoms, GFP_KERNEL); | 7393 | doms = kmalloc(sizeof(*doms) * ndoms, GFP_KERNEL); |
7395 | if (!doms) | 7394 | if (!doms) |
7396 | return NULL; | 7395 | return NULL; |
7397 | for (i = 0; i < ndoms; i++) { | 7396 | for (i = 0; i < ndoms; i++) { |
7398 | if (!alloc_cpumask_var(&doms[i], GFP_KERNEL)) { | 7397 | if (!alloc_cpumask_var(&doms[i], GFP_KERNEL)) { |
7399 | free_sched_domains(doms, i); | 7398 | free_sched_domains(doms, i); |
7400 | return NULL; | 7399 | return NULL; |
7401 | } | 7400 | } |
7402 | } | 7401 | } |
7403 | return doms; | 7402 | return doms; |
7404 | } | 7403 | } |
7405 | 7404 | ||
7406 | void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms) | 7405 | void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms) |
7407 | { | 7406 | { |
7408 | unsigned int i; | 7407 | unsigned int i; |
7409 | for (i = 0; i < ndoms; i++) | 7408 | for (i = 0; i < ndoms; i++) |
7410 | free_cpumask_var(doms[i]); | 7409 | free_cpumask_var(doms[i]); |
7411 | kfree(doms); | 7410 | kfree(doms); |
7412 | } | 7411 | } |
7413 | 7412 | ||
7414 | /* | 7413 | /* |
7415 | * Set up scheduler domains and groups. Callers must hold the hotplug lock. | 7414 | * Set up scheduler domains and groups. Callers must hold the hotplug lock. |
7416 | * For now this just excludes isolated cpus, but could be used to | 7415 | * For now this just excludes isolated cpus, but could be used to |
7417 | * exclude other special cases in the future. | 7416 | * exclude other special cases in the future. |
7418 | */ | 7417 | */ |
7419 | static int init_sched_domains(const struct cpumask *cpu_map) | 7418 | static int init_sched_domains(const struct cpumask *cpu_map) |
7420 | { | 7419 | { |
7421 | int err; | 7420 | int err; |
7422 | 7421 | ||
7423 | arch_update_cpu_topology(); | 7422 | arch_update_cpu_topology(); |
7424 | ndoms_cur = 1; | 7423 | ndoms_cur = 1; |
7425 | doms_cur = alloc_sched_domains(ndoms_cur); | 7424 | doms_cur = alloc_sched_domains(ndoms_cur); |
7426 | if (!doms_cur) | 7425 | if (!doms_cur) |
7427 | doms_cur = &fallback_doms; | 7426 | doms_cur = &fallback_doms; |
7428 | cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map); | 7427 | cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map); |
7429 | dattr_cur = NULL; | 7428 | dattr_cur = NULL; |
7430 | err = build_sched_domains(doms_cur[0], NULL); | 7429 | err = build_sched_domains(doms_cur[0], NULL); |
7431 | register_sched_domain_sysctl(); | 7430 | register_sched_domain_sysctl(); |
7432 | 7431 | ||
7433 | return err; | 7432 | return err; |
7434 | } | 7433 | } |
7435 | 7434 | ||
7436 | /* | 7435 | /* |
7437 | * Detach sched domains from a group of cpus specified in cpu_map. | 7436 | * Detach sched domains from a group of cpus specified in cpu_map. |
7438 | * These cpus will now be attached to the NULL domain. | 7437 | * These cpus will now be attached to the NULL domain. |
7439 | */ | 7438 | */ |
7440 | static void detach_destroy_domains(const struct cpumask *cpu_map) | 7439 | static void detach_destroy_domains(const struct cpumask *cpu_map) |
7441 | { | 7440 | { |
7442 | int i; | 7441 | int i; |
7443 | 7442 | ||
7444 | rcu_read_lock(); | 7443 | rcu_read_lock(); |
7445 | for_each_cpu(i, cpu_map) | 7444 | for_each_cpu(i, cpu_map) |
7446 | cpu_attach_domain(NULL, &def_root_domain, i); | 7445 | cpu_attach_domain(NULL, &def_root_domain, i); |
7447 | rcu_read_unlock(); | 7446 | rcu_read_unlock(); |
7448 | } | 7447 | } |
7449 | 7448 | ||
7450 | /* handle null as "default" */ | 7449 | /* handle null as "default" */ |
7451 | static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur, | 7450 | static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur, |
7452 | struct sched_domain_attr *new, int idx_new) | 7451 | struct sched_domain_attr *new, int idx_new) |
7453 | { | 7452 | { |
7454 | struct sched_domain_attr tmp; | 7453 | struct sched_domain_attr tmp; |
7455 | 7454 | ||
7456 | /* fast path */ | 7455 | /* fast path */ |
7457 | if (!new && !cur) | 7456 | if (!new && !cur) |
7458 | return 1; | 7457 | return 1; |
7459 | 7458 | ||
7460 | tmp = SD_ATTR_INIT; | 7459 | tmp = SD_ATTR_INIT; |
7461 | return !memcmp(cur ? (cur + idx_cur) : &tmp, | 7460 | return !memcmp(cur ? (cur + idx_cur) : &tmp, |
7462 | new ? (new + idx_new) : &tmp, | 7461 | new ? (new + idx_new) : &tmp, |
7463 | sizeof(struct sched_domain_attr)); | 7462 | sizeof(struct sched_domain_attr)); |
7464 | } | 7463 | } |
7465 | 7464 | ||
7466 | /* | 7465 | /* |
7467 | * Partition sched domains as specified by the 'ndoms_new' | 7466 | * Partition sched domains as specified by the 'ndoms_new' |
7468 | * cpumasks in the array doms_new[] of cpumasks. This compares | 7467 | * cpumasks in the array doms_new[] of cpumasks. This compares |
7469 | * doms_new[] to the current sched domain partitioning, doms_cur[]. | 7468 | * doms_new[] to the current sched domain partitioning, doms_cur[]. |
7470 | * It destroys each deleted domain and builds each new domain. | 7469 | * It destroys each deleted domain and builds each new domain. |
7471 | * | 7470 | * |
7472 | * 'doms_new' is an array of cpumask_var_t's of length 'ndoms_new'. | 7471 | * 'doms_new' is an array of cpumask_var_t's of length 'ndoms_new'. |
7473 | * The masks don't intersect (don't overlap). We should set up one | 7472 | * The masks don't intersect (don't overlap). We should set up one |
7474 | * sched domain for each mask. CPUs not in any of the cpumasks will | 7473 | * sched domain for each mask. CPUs not in any of the cpumasks will |
7475 | * not be load balanced. If the same cpumask appears both in the | 7474 | * not be load balanced. If the same cpumask appears both in the |
7476 | * current 'doms_cur' domains and in the new 'doms_new', we can leave | 7475 | * current 'doms_cur' domains and in the new 'doms_new', we can leave |
7477 | * it as it is. | 7476 | * it as it is. |
7478 | * | 7477 | * |
7479 | * The passed in 'doms_new' should be allocated using | 7478 | * The passed in 'doms_new' should be allocated using |
7480 | * alloc_sched_domains. This routine takes ownership of it and will | 7479 | * alloc_sched_domains. This routine takes ownership of it and will |
7481 | * free_sched_domains it when done with it. If the caller failed the | 7480 | * free_sched_domains it when done with it. If the caller failed the |
7482 | * alloc call, then it can pass in doms_new == NULL && ndoms_new == 1, | 7481 | * alloc call, then it can pass in doms_new == NULL && ndoms_new == 1, |
7483 | * and partition_sched_domains() will fall back to the single partition | 7482 | * and partition_sched_domains() will fall back to the single partition |
7484 | * 'fallback_doms'; this also forces the domains to be rebuilt. | 7483 | * 'fallback_doms'; this also forces the domains to be rebuilt. |
7485 | * | 7484 | * |
7486 | * If doms_new == NULL it will be replaced with cpu_online_mask. | 7485 | * If doms_new == NULL it will be replaced with cpu_online_mask. |
7487 | * ndoms_new == 0 is a special case for destroying existing domains, | 7486 | * ndoms_new == 0 is a special case for destroying existing domains, |
7488 | * and it will not create the default domain. | 7487 | * and it will not create the default domain. |
7489 | * | 7488 | * |
7490 | * Call with hotplug lock held | 7489 | * Call with hotplug lock held |
7491 | */ | 7490 | */ |
7492 | void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[], | 7491 | void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[], |
7493 | struct sched_domain_attr *dattr_new) | 7492 | struct sched_domain_attr *dattr_new) |
7494 | { | 7493 | { |
7495 | int i, j, n; | 7494 | int i, j, n; |
7496 | int new_topology; | 7495 | int new_topology; |
7497 | 7496 | ||
7498 | mutex_lock(&sched_domains_mutex); | 7497 | mutex_lock(&sched_domains_mutex); |
7499 | 7498 | ||
7500 | /* always unregister in case we don't destroy any domains */ | 7499 | /* always unregister in case we don't destroy any domains */ |
7501 | unregister_sched_domain_sysctl(); | 7500 | unregister_sched_domain_sysctl(); |
7502 | 7501 | ||
7503 | /* Let architecture update cpu core mappings. */ | 7502 | /* Let architecture update cpu core mappings. */ |
7504 | new_topology = arch_update_cpu_topology(); | 7503 | new_topology = arch_update_cpu_topology(); |
7505 | 7504 | ||
7506 | n = doms_new ? ndoms_new : 0; | 7505 | n = doms_new ? ndoms_new : 0; |
7507 | 7506 | ||
7508 | /* Destroy deleted domains */ | 7507 | /* Destroy deleted domains */ |
7509 | for (i = 0; i < ndoms_cur; i++) { | 7508 | for (i = 0; i < ndoms_cur; i++) { |
7510 | for (j = 0; j < n && !new_topology; j++) { | 7509 | for (j = 0; j < n && !new_topology; j++) { |
7511 | if (cpumask_equal(doms_cur[i], doms_new[j]) | 7510 | if (cpumask_equal(doms_cur[i], doms_new[j]) |
7512 | && dattrs_equal(dattr_cur, i, dattr_new, j)) | 7511 | && dattrs_equal(dattr_cur, i, dattr_new, j)) |
7513 | goto match1; | 7512 | goto match1; |
7514 | } | 7513 | } |
7515 | /* no match - a current sched domain not in new doms_new[] */ | 7514 | /* no match - a current sched domain not in new doms_new[] */ |
7516 | detach_destroy_domains(doms_cur[i]); | 7515 | detach_destroy_domains(doms_cur[i]); |
7517 | match1: | 7516 | match1: |
7518 | ; | 7517 | ; |
7519 | } | 7518 | } |
7520 | 7519 | ||
7521 | if (doms_new == NULL) { | 7520 | if (doms_new == NULL) { |
7522 | ndoms_cur = 0; | 7521 | ndoms_cur = 0; |
7523 | doms_new = &fallback_doms; | 7522 | doms_new = &fallback_doms; |
7524 | cpumask_andnot(doms_new[0], cpu_active_mask, cpu_isolated_map); | 7523 | cpumask_andnot(doms_new[0], cpu_active_mask, cpu_isolated_map); |
7525 | WARN_ON_ONCE(dattr_new); | 7524 | WARN_ON_ONCE(dattr_new); |
7526 | } | 7525 | } |
7527 | 7526 | ||
7528 | /* Build new domains */ | 7527 | /* Build new domains */ |
7529 | for (i = 0; i < ndoms_new; i++) { | 7528 | for (i = 0; i < ndoms_new; i++) { |
7530 | for (j = 0; j < ndoms_cur && !new_topology; j++) { | 7529 | for (j = 0; j < ndoms_cur && !new_topology; j++) { |
7531 | if (cpumask_equal(doms_new[i], doms_cur[j]) | 7530 | if (cpumask_equal(doms_new[i], doms_cur[j]) |
7532 | && dattrs_equal(dattr_new, i, dattr_cur, j)) | 7531 | && dattrs_equal(dattr_new, i, dattr_cur, j)) |
7533 | goto match2; | 7532 | goto match2; |
7534 | } | 7533 | } |
7535 | /* no match - add a new doms_new */ | 7534 | /* no match - add a new doms_new */ |
7536 | build_sched_domains(doms_new[i], dattr_new ? dattr_new + i : NULL); | 7535 | build_sched_domains(doms_new[i], dattr_new ? dattr_new + i : NULL); |
7537 | match2: | 7536 | match2: |
7538 | ; | 7537 | ; |
7539 | } | 7538 | } |
7540 | 7539 | ||
7541 | /* Remember the new sched domains */ | 7540 | /* Remember the new sched domains */ |
7542 | if (doms_cur != &fallback_doms) | 7541 | if (doms_cur != &fallback_doms) |
7543 | free_sched_domains(doms_cur, ndoms_cur); | 7542 | free_sched_domains(doms_cur, ndoms_cur); |
7544 | kfree(dattr_cur); /* kfree(NULL) is safe */ | 7543 | kfree(dattr_cur); /* kfree(NULL) is safe */ |
7545 | doms_cur = doms_new; | 7544 | doms_cur = doms_new; |
7546 | dattr_cur = dattr_new; | 7545 | dattr_cur = dattr_new; |
7547 | ndoms_cur = ndoms_new; | 7546 | ndoms_cur = ndoms_new; |
7548 | 7547 | ||
7549 | register_sched_domain_sysctl(); | 7548 | register_sched_domain_sysctl(); |
7550 | 7549 | ||
7551 | mutex_unlock(&sched_domains_mutex); | 7550 | mutex_unlock(&sched_domains_mutex); |
7552 | } | 7551 | } |
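Putting the rules from the comment block above together: a caller allocates its partition array with alloc_sched_domains(), fills in non-overlapping masks, and hands the array over while holding the hotplug lock; after the call the scheduler owns the array, so the caller must not free it. The sketch below is illustrative only — example_repartition() and the particular masks are made up.

/* Illustrative caller, mirroring the rules documented above. */
static void example_repartition(void)
{
	cpumask_var_t *doms;

	doms = alloc_sched_domains(2);
	if (!doms) {
		/* allocation failed: request the single fallback partition */
		get_online_cpus();
		partition_sched_domains(1, NULL, NULL);
		put_online_cpus();
		return;
	}

	cpumask_and(doms[0], cpumask_of(0), cpu_active_mask);	/* first balance domain */
	cpumask_andnot(doms[1], cpu_active_mask, doms[0]);	/* everything else */

	get_online_cpus();			/* "Call with hotplug lock held" */
	partition_sched_domains(2, doms, NULL);	/* takes ownership of 'doms' */
	put_online_cpus();
	/* no free_sched_domains() here: the scheduler now owns 'doms' */
}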
7553 | 7552 | ||
7554 | #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) | 7553 | #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) |
7555 | static void reinit_sched_domains(void) | 7554 | static void reinit_sched_domains(void) |
7556 | { | 7555 | { |
7557 | get_online_cpus(); | 7556 | get_online_cpus(); |
7558 | 7557 | ||
7559 | /* Destroy domains first to force the rebuild */ | 7558 | /* Destroy domains first to force the rebuild */ |
7560 | partition_sched_domains(0, NULL, NULL); | 7559 | partition_sched_domains(0, NULL, NULL); |
7561 | 7560 | ||
7562 | rebuild_sched_domains(); | 7561 | rebuild_sched_domains(); |
7563 | put_online_cpus(); | 7562 | put_online_cpus(); |
7564 | } | 7563 | } |
7565 | 7564 | ||
7566 | static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt) | 7565 | static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt) |
7567 | { | 7566 | { |
7568 | unsigned int level = 0; | 7567 | unsigned int level = 0; |
7569 | 7568 | ||
7570 | if (sscanf(buf, "%u", &level) != 1) | 7569 | if (sscanf(buf, "%u", &level) != 1) |
7571 | return -EINVAL; | 7570 | return -EINVAL; |
7572 | 7571 | ||
7573 | /* | 7572 | /* |
7574 | * level cannot be negative, so don't check for | 7573 | * level cannot be negative, so don't check for |
7575 | * level < POWERSAVINGS_BALANCE_NONE, which is 0. | 7574 | * level < POWERSAVINGS_BALANCE_NONE, which is 0. |
7576 | * What happens on a 0 or 1 byte write? Do we | 7575 | * What happens on a 0 or 1 byte write? Do we |
7577 | * need to check count as well? | 7576 | * need to check count as well? |
7578 | */ | 7577 | */ |
7579 | 7578 | ||
7580 | if (level >= MAX_POWERSAVINGS_BALANCE_LEVELS) | 7579 | if (level >= MAX_POWERSAVINGS_BALANCE_LEVELS) |
7581 | return -EINVAL; | 7580 | return -EINVAL; |
7582 | 7581 | ||
7583 | if (smt) | 7582 | if (smt) |
7584 | sched_smt_power_savings = level; | 7583 | sched_smt_power_savings = level; |
7585 | else | 7584 | else |
7586 | sched_mc_power_savings = level; | 7585 | sched_mc_power_savings = level; |
7587 | 7586 | ||
7588 | reinit_sched_domains(); | 7587 | reinit_sched_domains(); |
7589 | 7588 | ||
7590 | return count; | 7589 | return count; |
7591 | } | 7590 | } |
7592 | 7591 | ||
7593 | #ifdef CONFIG_SCHED_MC | 7592 | #ifdef CONFIG_SCHED_MC |
7594 | static ssize_t sched_mc_power_savings_show(struct sysdev_class *class, | 7593 | static ssize_t sched_mc_power_savings_show(struct sysdev_class *class, |
7595 | struct sysdev_class_attribute *attr, | 7594 | struct sysdev_class_attribute *attr, |
7596 | char *page) | 7595 | char *page) |
7597 | { | 7596 | { |
7598 | return sprintf(page, "%u\n", sched_mc_power_savings); | 7597 | return sprintf(page, "%u\n", sched_mc_power_savings); |
7599 | } | 7598 | } |
7600 | static ssize_t sched_mc_power_savings_store(struct sysdev_class *class, | 7599 | static ssize_t sched_mc_power_savings_store(struct sysdev_class *class, |
7601 | struct sysdev_class_attribute *attr, | 7600 | struct sysdev_class_attribute *attr, |
7602 | const char *buf, size_t count) | 7601 | const char *buf, size_t count) |
7603 | { | 7602 | { |
7604 | return sched_power_savings_store(buf, count, 0); | 7603 | return sched_power_savings_store(buf, count, 0); |
7605 | } | 7604 | } |
7606 | static SYSDEV_CLASS_ATTR(sched_mc_power_savings, 0644, | 7605 | static SYSDEV_CLASS_ATTR(sched_mc_power_savings, 0644, |
7607 | sched_mc_power_savings_show, | 7606 | sched_mc_power_savings_show, |
7608 | sched_mc_power_savings_store); | 7607 | sched_mc_power_savings_store); |
7609 | #endif | 7608 | #endif |
7610 | 7609 | ||
7611 | #ifdef CONFIG_SCHED_SMT | 7610 | #ifdef CONFIG_SCHED_SMT |
7612 | static ssize_t sched_smt_power_savings_show(struct sysdev_class *dev, | 7611 | static ssize_t sched_smt_power_savings_show(struct sysdev_class *dev, |
7613 | struct sysdev_class_attribute *attr, | 7612 | struct sysdev_class_attribute *attr, |
7614 | char *page) | 7613 | char *page) |
7615 | { | 7614 | { |
7616 | return sprintf(page, "%u\n", sched_smt_power_savings); | 7615 | return sprintf(page, "%u\n", sched_smt_power_savings); |
7617 | } | 7616 | } |
7618 | static ssize_t sched_smt_power_savings_store(struct sysdev_class *dev, | 7617 | static ssize_t sched_smt_power_savings_store(struct sysdev_class *dev, |
7619 | struct sysdev_class_attribute *attr, | 7618 | struct sysdev_class_attribute *attr, |
7620 | const char *buf, size_t count) | 7619 | const char *buf, size_t count) |
7621 | { | 7620 | { |
7622 | return sched_power_savings_store(buf, count, 1); | 7621 | return sched_power_savings_store(buf, count, 1); |
7623 | } | 7622 | } |
7624 | static SYSDEV_CLASS_ATTR(sched_smt_power_savings, 0644, | 7623 | static SYSDEV_CLASS_ATTR(sched_smt_power_savings, 0644, |
7625 | sched_smt_power_savings_show, | 7624 | sched_smt_power_savings_show, |
7626 | sched_smt_power_savings_store); | 7625 | sched_smt_power_savings_store); |
7627 | #endif | 7626 | #endif |
7628 | 7627 | ||
7629 | int __init sched_create_sysfs_power_savings_entries(struct sysdev_class *cls) | 7628 | int __init sched_create_sysfs_power_savings_entries(struct sysdev_class *cls) |
7630 | { | 7629 | { |
7631 | int err = 0; | 7630 | int err = 0; |
7632 | 7631 | ||
7633 | #ifdef CONFIG_SCHED_SMT | 7632 | #ifdef CONFIG_SCHED_SMT |
7634 | if (smt_capable()) | 7633 | if (smt_capable()) |
7635 | err = sysfs_create_file(&cls->kset.kobj, | 7634 | err = sysfs_create_file(&cls->kset.kobj, |
7636 | &attr_sched_smt_power_savings.attr); | 7635 | &attr_sched_smt_power_savings.attr); |
7637 | #endif | 7636 | #endif |
7638 | #ifdef CONFIG_SCHED_MC | 7637 | #ifdef CONFIG_SCHED_MC |
7639 | if (!err && mc_capable()) | 7638 | if (!err && mc_capable()) |
7640 | err = sysfs_create_file(&cls->kset.kobj, | 7639 | err = sysfs_create_file(&cls->kset.kobj, |
7641 | &attr_sched_mc_power_savings.attr); | 7640 | &attr_sched_mc_power_savings.attr); |
7642 | #endif | 7641 | #endif |
7643 | return err; | 7642 | return err; |
7644 | } | 7643 | } |
7645 | #endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */ | 7644 | #endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */ |
7646 | 7645 | ||
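The SYSDEV_CLASS_ATTR definitions above surface as read/write files on the cpu sysdev class; a write goes through sched_power_savings_store(), which validates the level and reruns reinit_sched_domains(). A hedged userspace sketch follows; the /sys/devices/system/cpu/ path is the conventional location for this class and is an assumption of the example, not something stated in this file.

/* Illustrative userspace use of sched_mc_power_savings; path is assumed. */
#include <stdio.h>

int main(void)
{
	const char *path = "/sys/devices/system/cpu/sched_mc_power_savings";
	FILE *f = fopen(path, "r+");
	unsigned int level;

	if (!f)
		return 1;
	if (fscanf(f, "%u", &level) == 1)
		printf("current level: %u\n", level);	/* _show() path */
	rewind(f);
	fprintf(f, "1\n");	/* _store() path: set level 1, rebuild domains */
	fclose(f);
	return 0;
}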
7647 | /* | 7646 | /* |
7648 | * Update cpusets according to cpu_active mask. If cpusets are | 7647 | * Update cpusets according to cpu_active mask. If cpusets are |
7649 | * disabled, cpuset_update_active_cpus() becomes a simple wrapper | 7648 | * disabled, cpuset_update_active_cpus() becomes a simple wrapper |
7650 | * around partition_sched_domains(). | 7649 | * around partition_sched_domains(). |
7651 | */ | 7650 | */ |
7652 | static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action, | 7651 | static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action, |
7653 | void *hcpu) | 7652 | void *hcpu) |
7654 | { | 7653 | { |
7655 | switch (action & ~CPU_TASKS_FROZEN) { | 7654 | switch (action & ~CPU_TASKS_FROZEN) { |
7656 | case CPU_ONLINE: | 7655 | case CPU_ONLINE: |
7657 | case CPU_DOWN_FAILED: | 7656 | case CPU_DOWN_FAILED: |
7658 | cpuset_update_active_cpus(); | 7657 | cpuset_update_active_cpus(); |
7659 | return NOTIFY_OK; | 7658 | return NOTIFY_OK; |
7660 | default: | 7659 | default: |
7661 | return NOTIFY_DONE; | 7660 | return NOTIFY_DONE; |
7662 | } | 7661 | } |
7663 | } | 7662 | } |
7664 | 7663 | ||
7665 | static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action, | 7664 | static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action, |
7666 | void *hcpu) | 7665 | void *hcpu) |
7667 | { | 7666 | { |
7668 | switch (action & ~CPU_TASKS_FROZEN) { | 7667 | switch (action & ~CPU_TASKS_FROZEN) { |
7669 | case CPU_DOWN_PREPARE: | 7668 | case CPU_DOWN_PREPARE: |
7670 | cpuset_update_active_cpus(); | 7669 | cpuset_update_active_cpus(); |
7671 | return NOTIFY_OK; | 7670 | return NOTIFY_OK; |
7672 | default: | 7671 | default: |
7673 | return NOTIFY_DONE; | 7672 | return NOTIFY_DONE; |
7674 | } | 7673 | } |
7675 | } | 7674 | } |
7676 | 7675 | ||
7677 | static int update_runtime(struct notifier_block *nfb, | 7676 | static int update_runtime(struct notifier_block *nfb, |
7678 | unsigned long action, void *hcpu) | 7677 | unsigned long action, void *hcpu) |
7679 | { | 7678 | { |
7680 | int cpu = (int)(long)hcpu; | 7679 | int cpu = (int)(long)hcpu; |
7681 | 7680 | ||
7682 | switch (action) { | 7681 | switch (action) { |
7683 | case CPU_DOWN_PREPARE: | 7682 | case CPU_DOWN_PREPARE: |
7684 | case CPU_DOWN_PREPARE_FROZEN: | 7683 | case CPU_DOWN_PREPARE_FROZEN: |
7685 | disable_runtime(cpu_rq(cpu)); | 7684 | disable_runtime(cpu_rq(cpu)); |
7686 | return NOTIFY_OK; | 7685 | return NOTIFY_OK; |
7687 | 7686 | ||
7688 | case CPU_DOWN_FAILED: | 7687 | case CPU_DOWN_FAILED: |
7689 | case CPU_DOWN_FAILED_FROZEN: | 7688 | case CPU_DOWN_FAILED_FROZEN: |
7690 | case CPU_ONLINE: | 7689 | case CPU_ONLINE: |
7691 | case CPU_ONLINE_FROZEN: | 7690 | case CPU_ONLINE_FROZEN: |
7692 | enable_runtime(cpu_rq(cpu)); | 7691 | enable_runtime(cpu_rq(cpu)); |
7693 | return NOTIFY_OK; | 7692 | return NOTIFY_OK; |
7694 | 7693 | ||
7695 | default: | 7694 | default: |
7696 | return NOTIFY_DONE; | 7695 | return NOTIFY_DONE; |
7697 | } | 7696 | } |
7698 | } | 7697 | } |
7699 | 7698 | ||
7700 | void __init sched_init_smp(void) | 7699 | void __init sched_init_smp(void) |
7701 | { | 7700 | { |
7702 | cpumask_var_t non_isolated_cpus; | 7701 | cpumask_var_t non_isolated_cpus; |
7703 | 7702 | ||
7704 | alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL); | 7703 | alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL); |
7705 | alloc_cpumask_var(&fallback_doms, GFP_KERNEL); | 7704 | alloc_cpumask_var(&fallback_doms, GFP_KERNEL); |
7706 | 7705 | ||
7707 | get_online_cpus(); | 7706 | get_online_cpus(); |
7708 | mutex_lock(&sched_domains_mutex); | 7707 | mutex_lock(&sched_domains_mutex); |
7709 | init_sched_domains(cpu_active_mask); | 7708 | init_sched_domains(cpu_active_mask); |
7710 | cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map); | 7709 | cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map); |
7711 | if (cpumask_empty(non_isolated_cpus)) | 7710 | if (cpumask_empty(non_isolated_cpus)) |
7712 | cpumask_set_cpu(smp_processor_id(), non_isolated_cpus); | 7711 | cpumask_set_cpu(smp_processor_id(), non_isolated_cpus); |
7713 | mutex_unlock(&sched_domains_mutex); | 7712 | mutex_unlock(&sched_domains_mutex); |
7714 | put_online_cpus(); | 7713 | put_online_cpus(); |
7715 | 7714 | ||
7716 | hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE); | 7715 | hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE); |
7717 | hotcpu_notifier(cpuset_cpu_inactive, CPU_PRI_CPUSET_INACTIVE); | 7716 | hotcpu_notifier(cpuset_cpu_inactive, CPU_PRI_CPUSET_INACTIVE); |
7718 | 7717 | ||
7719 | /* RT runtime code needs to handle some hotplug events */ | 7718 | /* RT runtime code needs to handle some hotplug events */ |
7720 | hotcpu_notifier(update_runtime, 0); | 7719 | hotcpu_notifier(update_runtime, 0); |
7721 | 7720 | ||
7722 | init_hrtick(); | 7721 | init_hrtick(); |
7723 | 7722 | ||
7724 | /* Move init over to a non-isolated CPU */ | 7723 | /* Move init over to a non-isolated CPU */ |
7725 | if (set_cpus_allowed_ptr(current, non_isolated_cpus) < 0) | 7724 | if (set_cpus_allowed_ptr(current, non_isolated_cpus) < 0) |
7726 | BUG(); | 7725 | BUG(); |
7727 | sched_init_granularity(); | 7726 | sched_init_granularity(); |
7728 | free_cpumask_var(non_isolated_cpus); | 7727 | free_cpumask_var(non_isolated_cpus); |
7729 | 7728 | ||
7730 | init_sched_rt_class(); | 7729 | init_sched_rt_class(); |
7731 | } | 7730 | } |
7732 | #else | 7731 | #else |
7733 | void __init sched_init_smp(void) | 7732 | void __init sched_init_smp(void) |
7734 | { | 7733 | { |
7735 | sched_init_granularity(); | 7734 | sched_init_granularity(); |
7736 | } | 7735 | } |
7737 | #endif /* CONFIG_SMP */ | 7736 | #endif /* CONFIG_SMP */ |
7738 | 7737 | ||
7739 | const_debug unsigned int sysctl_timer_migration = 1; | 7738 | const_debug unsigned int sysctl_timer_migration = 1; |
7740 | 7739 | ||
7741 | int in_sched_functions(unsigned long addr) | 7740 | int in_sched_functions(unsigned long addr) |
7742 | { | 7741 | { |
7743 | return in_lock_functions(addr) || | 7742 | return in_lock_functions(addr) || |
7744 | (addr >= (unsigned long)__sched_text_start | 7743 | (addr >= (unsigned long)__sched_text_start |
7745 | && addr < (unsigned long)__sched_text_end); | 7744 | && addr < (unsigned long)__sched_text_end); |
7746 | } | 7745 | } |
7747 | 7746 | ||
7748 | static void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq) | 7747 | static void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq) |
7749 | { | 7748 | { |
7750 | cfs_rq->tasks_timeline = RB_ROOT; | 7749 | cfs_rq->tasks_timeline = RB_ROOT; |
7751 | INIT_LIST_HEAD(&cfs_rq->tasks); | 7750 | INIT_LIST_HEAD(&cfs_rq->tasks); |
7752 | #ifdef CONFIG_FAIR_GROUP_SCHED | 7751 | #ifdef CONFIG_FAIR_GROUP_SCHED |
7753 | cfs_rq->rq = rq; | 7752 | cfs_rq->rq = rq; |
7754 | /* allow initial update_cfs_load() to truncate */ | 7753 | /* allow initial update_cfs_load() to truncate */ |
7755 | #ifdef CONFIG_SMP | 7754 | #ifdef CONFIG_SMP |
7756 | cfs_rq->load_stamp = 1; | 7755 | cfs_rq->load_stamp = 1; |
7757 | #endif | 7756 | #endif |
7758 | #endif | 7757 | #endif |
7759 | cfs_rq->min_vruntime = (u64)(-(1LL << 20)); | 7758 | cfs_rq->min_vruntime = (u64)(-(1LL << 20)); |
7760 | } | 7759 | } |
7761 | 7760 | ||
7762 | static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq) | 7761 | static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq) |
7763 | { | 7762 | { |
7764 | struct rt_prio_array *array; | 7763 | struct rt_prio_array *array; |
7765 | int i; | 7764 | int i; |
7766 | 7765 | ||
7767 | array = &rt_rq->active; | 7766 | array = &rt_rq->active; |
7768 | for (i = 0; i < MAX_RT_PRIO; i++) { | 7767 | for (i = 0; i < MAX_RT_PRIO; i++) { |
7769 | INIT_LIST_HEAD(array->queue + i); | 7768 | INIT_LIST_HEAD(array->queue + i); |
7770 | __clear_bit(i, array->bitmap); | 7769 | __clear_bit(i, array->bitmap); |
7771 | } | 7770 | } |
7772 | /* delimiter for bitsearch: */ | 7771 | /* delimiter for bitsearch: */ |
7773 | __set_bit(MAX_RT_PRIO, array->bitmap); | 7772 | __set_bit(MAX_RT_PRIO, array->bitmap); |
7774 | 7773 | ||
7775 | #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED | 7774 | #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED |
7776 | rt_rq->highest_prio.curr = MAX_RT_PRIO; | 7775 | rt_rq->highest_prio.curr = MAX_RT_PRIO; |
7777 | #ifdef CONFIG_SMP | 7776 | #ifdef CONFIG_SMP |
7778 | rt_rq->highest_prio.next = MAX_RT_PRIO; | 7777 | rt_rq->highest_prio.next = MAX_RT_PRIO; |
7779 | #endif | 7778 | #endif |
7780 | #endif | 7779 | #endif |
7781 | #ifdef CONFIG_SMP | 7780 | #ifdef CONFIG_SMP |
7782 | rt_rq->rt_nr_migratory = 0; | 7781 | rt_rq->rt_nr_migratory = 0; |
7783 | rt_rq->overloaded = 0; | 7782 | rt_rq->overloaded = 0; |
7784 | plist_head_init_raw(&rt_rq->pushable_tasks, &rq->lock); | 7783 | plist_head_init_raw(&rt_rq->pushable_tasks, &rq->lock); |
7785 | #endif | 7784 | #endif |
7786 | 7785 | ||
7787 | rt_rq->rt_time = 0; | 7786 | rt_rq->rt_time = 0; |
7788 | rt_rq->rt_throttled = 0; | 7787 | rt_rq->rt_throttled = 0; |
7789 | rt_rq->rt_runtime = 0; | 7788 | rt_rq->rt_runtime = 0; |
7790 | raw_spin_lock_init(&rt_rq->rt_runtime_lock); | 7789 | raw_spin_lock_init(&rt_rq->rt_runtime_lock); |
7791 | 7790 | ||
7792 | #ifdef CONFIG_RT_GROUP_SCHED | 7791 | #ifdef CONFIG_RT_GROUP_SCHED |
7793 | rt_rq->rt_nr_boosted = 0; | 7792 | rt_rq->rt_nr_boosted = 0; |
7794 | rt_rq->rq = rq; | 7793 | rt_rq->rq = rq; |
7795 | #endif | 7794 | #endif |
7796 | } | 7795 | } |
7797 | 7796 | ||
7798 | #ifdef CONFIG_FAIR_GROUP_SCHED | 7797 | #ifdef CONFIG_FAIR_GROUP_SCHED |
7799 | static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, | 7798 | static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, |
7800 | struct sched_entity *se, int cpu, | 7799 | struct sched_entity *se, int cpu, |
7801 | struct sched_entity *parent) | 7800 | struct sched_entity *parent) |
7802 | { | 7801 | { |
7803 | struct rq *rq = cpu_rq(cpu); | 7802 | struct rq *rq = cpu_rq(cpu); |
7804 | tg->cfs_rq[cpu] = cfs_rq; | 7803 | tg->cfs_rq[cpu] = cfs_rq; |
7805 | init_cfs_rq(cfs_rq, rq); | 7804 | init_cfs_rq(cfs_rq, rq); |
7806 | cfs_rq->tg = tg; | 7805 | cfs_rq->tg = tg; |
7807 | 7806 | ||
7808 | tg->se[cpu] = se; | 7807 | tg->se[cpu] = se; |
7809 | /* se could be NULL for root_task_group */ | 7808 | /* se could be NULL for root_task_group */ |
7810 | if (!se) | 7809 | if (!se) |
7811 | return; | 7810 | return; |
7812 | 7811 | ||
7813 | if (!parent) | 7812 | if (!parent) |
7814 | se->cfs_rq = &rq->cfs; | 7813 | se->cfs_rq = &rq->cfs; |
7815 | else | 7814 | else |
7816 | se->cfs_rq = parent->my_q; | 7815 | se->cfs_rq = parent->my_q; |
7817 | 7816 | ||
7818 | se->my_q = cfs_rq; | 7817 | se->my_q = cfs_rq; |
7819 | update_load_set(&se->load, 0); | 7818 | update_load_set(&se->load, 0); |
7820 | se->parent = parent; | 7819 | se->parent = parent; |
7821 | } | 7820 | } |
7822 | #endif | 7821 | #endif |
7823 | 7822 | ||
7824 | #ifdef CONFIG_RT_GROUP_SCHED | 7823 | #ifdef CONFIG_RT_GROUP_SCHED |
7825 | static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq, | 7824 | static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq, |
7826 | struct sched_rt_entity *rt_se, int cpu, | 7825 | struct sched_rt_entity *rt_se, int cpu, |
7827 | struct sched_rt_entity *parent) | 7826 | struct sched_rt_entity *parent) |
7828 | { | 7827 | { |
7829 | struct rq *rq = cpu_rq(cpu); | 7828 | struct rq *rq = cpu_rq(cpu); |
7830 | 7829 | ||
7831 | tg->rt_rq[cpu] = rt_rq; | 7830 | tg->rt_rq[cpu] = rt_rq; |
7832 | init_rt_rq(rt_rq, rq); | 7831 | init_rt_rq(rt_rq, rq); |
7833 | rt_rq->tg = tg; | 7832 | rt_rq->tg = tg; |
7834 | rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime; | 7833 | rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime; |
7835 | 7834 | ||
7836 | tg->rt_se[cpu] = rt_se; | 7835 | tg->rt_se[cpu] = rt_se; |
7837 | if (!rt_se) | 7836 | if (!rt_se) |
7838 | return; | 7837 | return; |
7839 | 7838 | ||
7840 | if (!parent) | 7839 | if (!parent) |
7841 | rt_se->rt_rq = &rq->rt; | 7840 | rt_se->rt_rq = &rq->rt; |
7842 | else | 7841 | else |
7843 | rt_se->rt_rq = parent->my_q; | 7842 | rt_se->rt_rq = parent->my_q; |
7844 | 7843 | ||
7845 | rt_se->my_q = rt_rq; | 7844 | rt_se->my_q = rt_rq; |
7846 | rt_se->parent = parent; | 7845 | rt_se->parent = parent; |
7847 | INIT_LIST_HEAD(&rt_se->run_list); | 7846 | INIT_LIST_HEAD(&rt_se->run_list); |
7848 | } | 7847 | } |
7849 | #endif | 7848 | #endif |
7850 | 7849 | ||
7851 | void __init sched_init(void) | 7850 | void __init sched_init(void) |
7852 | { | 7851 | { |
7853 | int i, j; | 7852 | int i, j; |
7854 | unsigned long alloc_size = 0, ptr; | 7853 | unsigned long alloc_size = 0, ptr; |
7855 | 7854 | ||
7856 | #ifdef CONFIG_FAIR_GROUP_SCHED | 7855 | #ifdef CONFIG_FAIR_GROUP_SCHED |
7857 | alloc_size += 2 * nr_cpu_ids * sizeof(void **); | 7856 | alloc_size += 2 * nr_cpu_ids * sizeof(void **); |
7858 | #endif | 7857 | #endif |
7859 | #ifdef CONFIG_RT_GROUP_SCHED | 7858 | #ifdef CONFIG_RT_GROUP_SCHED |
7860 | alloc_size += 2 * nr_cpu_ids * sizeof(void **); | 7859 | alloc_size += 2 * nr_cpu_ids * sizeof(void **); |
7861 | #endif | 7860 | #endif |
7862 | #ifdef CONFIG_CPUMASK_OFFSTACK | 7861 | #ifdef CONFIG_CPUMASK_OFFSTACK |
7863 | alloc_size += num_possible_cpus() * cpumask_size(); | 7862 | alloc_size += num_possible_cpus() * cpumask_size(); |
7864 | #endif | 7863 | #endif |
7865 | if (alloc_size) { | 7864 | if (alloc_size) { |
7866 | ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT); | 7865 | ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT); |
7867 | 7866 | ||
7868 | #ifdef CONFIG_FAIR_GROUP_SCHED | 7867 | #ifdef CONFIG_FAIR_GROUP_SCHED |
7869 | root_task_group.se = (struct sched_entity **)ptr; | 7868 | root_task_group.se = (struct sched_entity **)ptr; |
7870 | ptr += nr_cpu_ids * sizeof(void **); | 7869 | ptr += nr_cpu_ids * sizeof(void **); |
7871 | 7870 | ||
7872 | root_task_group.cfs_rq = (struct cfs_rq **)ptr; | 7871 | root_task_group.cfs_rq = (struct cfs_rq **)ptr; |
7873 | ptr += nr_cpu_ids * sizeof(void **); | 7872 | ptr += nr_cpu_ids * sizeof(void **); |
7874 | 7873 | ||
7875 | #endif /* CONFIG_FAIR_GROUP_SCHED */ | 7874 | #endif /* CONFIG_FAIR_GROUP_SCHED */ |
7876 | #ifdef CONFIG_RT_GROUP_SCHED | 7875 | #ifdef CONFIG_RT_GROUP_SCHED |
7877 | root_task_group.rt_se = (struct sched_rt_entity **)ptr; | 7876 | root_task_group.rt_se = (struct sched_rt_entity **)ptr; |
7878 | ptr += nr_cpu_ids * sizeof(void **); | 7877 | ptr += nr_cpu_ids * sizeof(void **); |
7879 | 7878 | ||
7880 | root_task_group.rt_rq = (struct rt_rq **)ptr; | 7879 | root_task_group.rt_rq = (struct rt_rq **)ptr; |
7881 | ptr += nr_cpu_ids * sizeof(void **); | 7880 | ptr += nr_cpu_ids * sizeof(void **); |
7882 | 7881 | ||
7883 | #endif /* CONFIG_RT_GROUP_SCHED */ | 7882 | #endif /* CONFIG_RT_GROUP_SCHED */ |
7884 | #ifdef CONFIG_CPUMASK_OFFSTACK | 7883 | #ifdef CONFIG_CPUMASK_OFFSTACK |
7885 | for_each_possible_cpu(i) { | 7884 | for_each_possible_cpu(i) { |
7886 | per_cpu(load_balance_tmpmask, i) = (void *)ptr; | 7885 | per_cpu(load_balance_tmpmask, i) = (void *)ptr; |
7887 | ptr += cpumask_size(); | 7886 | ptr += cpumask_size(); |
7888 | } | 7887 | } |
7889 | #endif /* CONFIG_CPUMASK_OFFSTACK */ | 7888 | #endif /* CONFIG_CPUMASK_OFFSTACK */ |
7890 | } | 7889 | } |
7891 | 7890 | ||
7892 | #ifdef CONFIG_SMP | 7891 | #ifdef CONFIG_SMP |
7893 | init_defrootdomain(); | 7892 | init_defrootdomain(); |
7894 | #endif | 7893 | #endif |
7895 | 7894 | ||
7896 | init_rt_bandwidth(&def_rt_bandwidth, | 7895 | init_rt_bandwidth(&def_rt_bandwidth, |
7897 | global_rt_period(), global_rt_runtime()); | 7896 | global_rt_period(), global_rt_runtime()); |
7898 | 7897 | ||
7899 | #ifdef CONFIG_RT_GROUP_SCHED | 7898 | #ifdef CONFIG_RT_GROUP_SCHED |
7900 | init_rt_bandwidth(&root_task_group.rt_bandwidth, | 7899 | init_rt_bandwidth(&root_task_group.rt_bandwidth, |
7901 | global_rt_period(), global_rt_runtime()); | 7900 | global_rt_period(), global_rt_runtime()); |
7902 | #endif /* CONFIG_RT_GROUP_SCHED */ | 7901 | #endif /* CONFIG_RT_GROUP_SCHED */ |
7903 | 7902 | ||
7904 | #ifdef CONFIG_CGROUP_SCHED | 7903 | #ifdef CONFIG_CGROUP_SCHED |
7905 | list_add(&root_task_group.list, &task_groups); | 7904 | list_add(&root_task_group.list, &task_groups); |
7906 | INIT_LIST_HEAD(&root_task_group.children); | 7905 | INIT_LIST_HEAD(&root_task_group.children); |
7907 | autogroup_init(&init_task); | 7906 | autogroup_init(&init_task); |
7908 | #endif /* CONFIG_CGROUP_SCHED */ | 7907 | #endif /* CONFIG_CGROUP_SCHED */ |
7909 | 7908 | ||
7910 | for_each_possible_cpu(i) { | 7909 | for_each_possible_cpu(i) { |
7911 | struct rq *rq; | 7910 | struct rq *rq; |
7912 | 7911 | ||
7913 | rq = cpu_rq(i); | 7912 | rq = cpu_rq(i); |
7914 | raw_spin_lock_init(&rq->lock); | 7913 | raw_spin_lock_init(&rq->lock); |
7915 | rq->nr_running = 0; | 7914 | rq->nr_running = 0; |
7916 | rq->calc_load_active = 0; | 7915 | rq->calc_load_active = 0; |
7917 | rq->calc_load_update = jiffies + LOAD_FREQ; | 7916 | rq->calc_load_update = jiffies + LOAD_FREQ; |
7918 | init_cfs_rq(&rq->cfs, rq); | 7917 | init_cfs_rq(&rq->cfs, rq); |
7919 | init_rt_rq(&rq->rt, rq); | 7918 | init_rt_rq(&rq->rt, rq); |
7920 | #ifdef CONFIG_FAIR_GROUP_SCHED | 7919 | #ifdef CONFIG_FAIR_GROUP_SCHED |
7921 | root_task_group.shares = root_task_group_load; | 7920 | root_task_group.shares = root_task_group_load; |
7922 | INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); | 7921 | INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); |
7923 | /* | 7922 | /* |
7924 | * How much cpu bandwidth does root_task_group get? | 7923 | * How much cpu bandwidth does root_task_group get? |
7925 | * | 7924 | * |
7926 | * In case of task-groups formed through the cgroup filesystem, it | 7925 | * In case of task-groups formed through the cgroup filesystem, it |
7927 | * gets 100% of the cpu resources in the system. This overall | 7926 | * gets 100% of the cpu resources in the system. This overall |
7928 | * system cpu resource is divided among the tasks of | 7927 | * system cpu resource is divided among the tasks of |
7929 | * root_task_group and its child task-groups in a fair manner, | 7928 | * root_task_group and its child task-groups in a fair manner, |
7930 | * based on each entity's (task or task-group's) weight | 7929 | * based on each entity's (task or task-group's) weight |
7931 | * (se->load.weight). | 7930 | * (se->load.weight). |
7932 | * | 7931 | * |
7933 | * In other words, if root_task_group has 10 tasks of weight | 7932 | * In other words, if root_task_group has 10 tasks of weight |
7934 | * 1024 and two child groups A0 and A1 (of weight 1024 each), | 7933 | * 1024 and two child groups A0 and A1 (of weight 1024 each), |
7935 | * then A0's share of the cpu resource is: | 7934 | * then A0's share of the cpu resource is: |
7936 | * | 7935 | * |
7937 | * A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33% | 7936 | * A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33% |
7938 | * | 7937 | * |
7939 | * We achieve this by letting root_task_group's tasks sit | 7938 | * We achieve this by letting root_task_group's tasks sit |
7940 | * directly in rq->cfs (i.e. root_task_group->se[] = NULL). | 7939 | * directly in rq->cfs (i.e. root_task_group->se[] = NULL). |
7941 | */ | 7940 | */ |
7942 | init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL); | 7941 | init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL); |
7943 | #endif /* CONFIG_FAIR_GROUP_SCHED */ | 7942 | #endif /* CONFIG_FAIR_GROUP_SCHED */ |
7944 | 7943 | ||
7945 | rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime; | 7944 | rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime; |
7946 | #ifdef CONFIG_RT_GROUP_SCHED | 7945 | #ifdef CONFIG_RT_GROUP_SCHED |
7947 | INIT_LIST_HEAD(&rq->leaf_rt_rq_list); | 7946 | INIT_LIST_HEAD(&rq->leaf_rt_rq_list); |
7948 | init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL); | 7947 | init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL); |
7949 | #endif | 7948 | #endif |
7950 | 7949 | ||
7951 | for (j = 0; j < CPU_LOAD_IDX_MAX; j++) | 7950 | for (j = 0; j < CPU_LOAD_IDX_MAX; j++) |
7952 | rq->cpu_load[j] = 0; | 7951 | rq->cpu_load[j] = 0; |
7953 | 7952 | ||
7954 | rq->last_load_update_tick = jiffies; | 7953 | rq->last_load_update_tick = jiffies; |
7955 | 7954 | ||
7956 | #ifdef CONFIG_SMP | 7955 | #ifdef CONFIG_SMP |
7957 | rq->sd = NULL; | 7956 | rq->sd = NULL; |
7958 | rq->rd = NULL; | 7957 | rq->rd = NULL; |
7959 | rq->cpu_power = SCHED_POWER_SCALE; | 7958 | rq->cpu_power = SCHED_POWER_SCALE; |
7960 | rq->post_schedule = 0; | 7959 | rq->post_schedule = 0; |
7961 | rq->active_balance = 0; | 7960 | rq->active_balance = 0; |
7962 | rq->next_balance = jiffies; | 7961 | rq->next_balance = jiffies; |
7963 | rq->push_cpu = 0; | 7962 | rq->push_cpu = 0; |
7964 | rq->cpu = i; | 7963 | rq->cpu = i; |
7965 | rq->online = 0; | 7964 | rq->online = 0; |
7966 | rq->idle_stamp = 0; | 7965 | rq->idle_stamp = 0; |
7967 | rq->avg_idle = 2*sysctl_sched_migration_cost; | 7966 | rq->avg_idle = 2*sysctl_sched_migration_cost; |
7968 | rq_attach_root(rq, &def_root_domain); | 7967 | rq_attach_root(rq, &def_root_domain); |
7969 | #ifdef CONFIG_NO_HZ | 7968 | #ifdef CONFIG_NO_HZ |
7970 | rq->nohz_balance_kick = 0; | 7969 | rq->nohz_balance_kick = 0; |
7971 | init_sched_softirq_csd(&per_cpu(remote_sched_softirq_cb, i)); | 7970 | init_sched_softirq_csd(&per_cpu(remote_sched_softirq_cb, i)); |
7972 | #endif | 7971 | #endif |
7973 | #endif | 7972 | #endif |
7974 | init_rq_hrtick(rq); | 7973 | init_rq_hrtick(rq); |
7975 | atomic_set(&rq->nr_iowait, 0); | 7974 | atomic_set(&rq->nr_iowait, 0); |
7976 | } | 7975 | } |
7977 | 7976 | ||
7978 | set_load_weight(&init_task); | 7977 | set_load_weight(&init_task); |
7979 | 7978 | ||
7980 | #ifdef CONFIG_PREEMPT_NOTIFIERS | 7979 | #ifdef CONFIG_PREEMPT_NOTIFIERS |
7981 | INIT_HLIST_HEAD(&init_task.preempt_notifiers); | 7980 | INIT_HLIST_HEAD(&init_task.preempt_notifiers); |
7982 | #endif | 7981 | #endif |
7983 | 7982 | ||
7984 | #ifdef CONFIG_SMP | 7983 | #ifdef CONFIG_SMP |
7985 | open_softirq(SCHED_SOFTIRQ, run_rebalance_domains); | 7984 | open_softirq(SCHED_SOFTIRQ, run_rebalance_domains); |
7986 | #endif | 7985 | #endif |
7987 | 7986 | ||
7988 | #ifdef CONFIG_RT_MUTEXES | 7987 | #ifdef CONFIG_RT_MUTEXES |
7989 | plist_head_init_raw(&init_task.pi_waiters, &init_task.pi_lock); | 7988 | plist_head_init_raw(&init_task.pi_waiters, &init_task.pi_lock); |
7990 | #endif | 7989 | #endif |
7991 | 7990 | ||
7992 | /* | 7991 | /* |
7993 | * The boot idle thread does lazy MMU switching as well: | 7992 | * The boot idle thread does lazy MMU switching as well: |
7994 | */ | 7993 | */ |
7995 | atomic_inc(&init_mm.mm_count); | 7994 | atomic_inc(&init_mm.mm_count); |
7996 | enter_lazy_tlb(&init_mm, current); | 7995 | enter_lazy_tlb(&init_mm, current); |
7997 | 7996 | ||
7998 | /* | 7997 | /* |
7999 | * Make us the idle thread. Technically, schedule() should not be | 7998 | * Make us the idle thread. Technically, schedule() should not be |
8000 | * called from this thread, however somewhere below it might be, | 7999 | * called from this thread, however somewhere below it might be, |
8001 | * but because we are the idle thread, we just pick up running again | 8000 | * but because we are the idle thread, we just pick up running again |
8002 | * when this runqueue becomes "idle". | 8001 | * when this runqueue becomes "idle". |
8003 | */ | 8002 | */ |
8004 | init_idle(current, smp_processor_id()); | 8003 | init_idle(current, smp_processor_id()); |
8005 | 8004 | ||
8006 | calc_load_update = jiffies + LOAD_FREQ; | 8005 | calc_load_update = jiffies + LOAD_FREQ; |
8007 | 8006 | ||
8008 | /* | 8007 | /* |
8009 | * During early bootup we pretend to be a normal task: | 8008 | * During early bootup we pretend to be a normal task: |
8010 | */ | 8009 | */ |
8011 | current->sched_class = &fair_sched_class; | 8010 | current->sched_class = &fair_sched_class; |
8012 | 8011 | ||
8013 | /* Allocate the nohz_cpu_mask if CONFIG_CPUMASK_OFFSTACK */ | 8012 | /* Allocate the nohz_cpu_mask if CONFIG_CPUMASK_OFFSTACK */ |
8014 | zalloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT); | 8013 | zalloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT); |
8015 | #ifdef CONFIG_SMP | 8014 | #ifdef CONFIG_SMP |
8016 | zalloc_cpumask_var(&sched_domains_tmpmask, GFP_NOWAIT); | 8015 | zalloc_cpumask_var(&sched_domains_tmpmask, GFP_NOWAIT); |
8017 | #ifdef CONFIG_NO_HZ | 8016 | #ifdef CONFIG_NO_HZ |
8018 | zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT); | 8017 | zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT); |
8019 | alloc_cpumask_var(&nohz.grp_idle_mask, GFP_NOWAIT); | 8018 | alloc_cpumask_var(&nohz.grp_idle_mask, GFP_NOWAIT); |
8020 | atomic_set(&nohz.load_balancer, nr_cpu_ids); | 8019 | atomic_set(&nohz.load_balancer, nr_cpu_ids); |
8021 | atomic_set(&nohz.first_pick_cpu, nr_cpu_ids); | 8020 | atomic_set(&nohz.first_pick_cpu, nr_cpu_ids); |
8022 | atomic_set(&nohz.second_pick_cpu, nr_cpu_ids); | 8021 | atomic_set(&nohz.second_pick_cpu, nr_cpu_ids); |
8023 | #endif | 8022 | #endif |
8024 | /* May be allocated at isolcpus cmdline parse time */ | 8023 | /* May be allocated at isolcpus cmdline parse time */ |
8025 | if (cpu_isolated_map == NULL) | 8024 | if (cpu_isolated_map == NULL) |
8026 | zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT); | 8025 | zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT); |
8027 | #endif /* SMP */ | 8026 | #endif /* SMP */ |
8028 | 8027 | ||
8029 | scheduler_running = 1; | 8028 | scheduler_running = 1; |
8030 | } | 8029 | } |
8031 | 8030 | ||
8032 | #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP | 8031 | #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP |
8033 | static inline int preempt_count_equals(int preempt_offset) | 8032 | static inline int preempt_count_equals(int preempt_offset) |
8034 | { | 8033 | { |
8035 | int nested = (preempt_count() & ~PREEMPT_ACTIVE) + rcu_preempt_depth(); | 8034 | int nested = (preempt_count() & ~PREEMPT_ACTIVE) + rcu_preempt_depth(); |
8036 | 8035 | ||
8037 | return (nested == preempt_offset); | 8036 | return (nested == preempt_offset); |
8038 | } | 8037 | } |
8039 | 8038 | ||
8040 | void __might_sleep(const char *file, int line, int preempt_offset) | 8039 | void __might_sleep(const char *file, int line, int preempt_offset) |
8041 | { | 8040 | { |
8042 | #ifdef in_atomic | 8041 | #ifdef in_atomic |
8043 | static unsigned long prev_jiffy; /* ratelimiting */ | 8042 | static unsigned long prev_jiffy; /* ratelimiting */ |
8044 | 8043 | ||
8045 | if ((preempt_count_equals(preempt_offset) && !irqs_disabled()) || | 8044 | if ((preempt_count_equals(preempt_offset) && !irqs_disabled()) || |
8046 | system_state != SYSTEM_RUNNING || oops_in_progress) | 8045 | system_state != SYSTEM_RUNNING || oops_in_progress) |
8047 | return; | 8046 | return; |
8048 | if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) | 8047 | if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) |
8049 | return; | 8048 | return; |
8050 | prev_jiffy = jiffies; | 8049 | prev_jiffy = jiffies; |
8051 | 8050 | ||
8052 | printk(KERN_ERR | 8051 | printk(KERN_ERR |
8053 | "BUG: sleeping function called from invalid context at %s:%d\n", | 8052 | "BUG: sleeping function called from invalid context at %s:%d\n", |
8054 | file, line); | 8053 | file, line); |
8055 | printk(KERN_ERR | 8054 | printk(KERN_ERR |
8056 | "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n", | 8055 | "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n", |
8057 | in_atomic(), irqs_disabled(), | 8056 | in_atomic(), irqs_disabled(), |
8058 | current->pid, current->comm); | 8057 | current->pid, current->comm); |
8059 | 8058 | ||
8060 | debug_show_held_locks(current); | 8059 | debug_show_held_locks(current); |
8061 | if (irqs_disabled()) | 8060 | if (irqs_disabled()) |
8062 | print_irqtrace_events(current); | 8061 | print_irqtrace_events(current); |
8063 | dump_stack(); | 8062 | dump_stack(); |
8064 | #endif | 8063 | #endif |
8065 | } | 8064 | } |
8066 | EXPORT_SYMBOL(__might_sleep); | 8065 | EXPORT_SYMBOL(__might_sleep); |
8067 | #endif | 8066 | #endif |
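__might_sleep() above is the backend of the might_sleep() annotation: with CONFIG_DEBUG_SPINLOCK_SLEEP enabled it compares the current preempt count (plus the RCU read-side depth) against the expected offset and, at most once per second, prints the "sleeping function called from invalid context" report together with the held locks and a stack dump. A minimal sketch of the usual annotation pattern follows; struct example_dev and example_wait_for_device() are hypothetical, only might_sleep() itself is the real entry point.

/* Illustrative annotation pattern; the device and its wait queue are made up. */
struct example_dev {
	wait_queue_head_t	wq;
	int			ready;
};

static int example_wait_for_device(struct example_dev *dev)
{
	might_sleep();	/* flags atomic callers via __might_sleep() above */

	return wait_event_interruptible(dev->wq, dev->ready);
}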
8068 | 8067 | ||
8069 | #ifdef CONFIG_MAGIC_SYSRQ | 8068 | #ifdef CONFIG_MAGIC_SYSRQ |
8070 | static void normalize_task(struct rq *rq, struct task_struct *p) | 8069 | static void normalize_task(struct rq *rq, struct task_struct *p) |
8071 | { | 8070 | { |
8072 | const struct sched_class *prev_class = p->sched_class; | 8071 | const struct sched_class *prev_class = p->sched_class; |
8073 | int old_prio = p->prio; | 8072 | int old_prio = p->prio; |
8074 | int on_rq; | 8073 | int on_rq; |
8075 | 8074 | ||
8076 | on_rq = p->on_rq; | 8075 | on_rq = p->on_rq; |
8077 | if (on_rq) | 8076 | if (on_rq) |
8078 | deactivate_task(rq, p, 0); | 8077 | deactivate_task(rq, p, 0); |
8079 | __setscheduler(rq, p, SCHED_NORMAL, 0); | 8078 | __setscheduler(rq, p, SCHED_NORMAL, 0); |
8080 | if (on_rq) { | 8079 | if (on_rq) { |
8081 | activate_task(rq, p, 0); | 8080 | activate_task(rq, p, 0); |
8082 | resched_task(rq->curr); | 8081 | resched_task(rq->curr); |
8083 | } | 8082 | } |
8084 | 8083 | ||
8085 | check_class_changed(rq, p, prev_class, old_prio); | 8084 | check_class_changed(rq, p, prev_class, old_prio); |
8086 | } | 8085 | } |
8087 | 8086 | ||
8088 | void normalize_rt_tasks(void) | 8087 | void normalize_rt_tasks(void) |
8089 | { | 8088 | { |
8090 | struct task_struct *g, *p; | 8089 | struct task_struct *g, *p; |
8091 | unsigned long flags; | 8090 | unsigned long flags; |
8092 | struct rq *rq; | 8091 | struct rq *rq; |
8093 | 8092 | ||
8094 | read_lock_irqsave(&tasklist_lock, flags); | 8093 | read_lock_irqsave(&tasklist_lock, flags); |
8095 | do_each_thread(g, p) { | 8094 | do_each_thread(g, p) { |
8096 | /* | 8095 | /* |
8097 | * Only normalize user tasks: | 8096 | * Only normalize user tasks: |
8098 | */ | 8097 | */ |
8099 | if (!p->mm) | 8098 | if (!p->mm) |
8100 | continue; | 8099 | continue; |
8101 | 8100 | ||
8102 | p->se.exec_start = 0; | 8101 | p->se.exec_start = 0; |
8103 | #ifdef CONFIG_SCHEDSTATS | 8102 | #ifdef CONFIG_SCHEDSTATS |
8104 | p->se.statistics.wait_start = 0; | 8103 | p->se.statistics.wait_start = 0; |
8105 | p->se.statistics.sleep_start = 0; | 8104 | p->se.statistics.sleep_start = 0; |
8106 | p->se.statistics.block_start = 0; | 8105 | p->se.statistics.block_start = 0; |
8107 | #endif | 8106 | #endif |
8108 | 8107 | ||
8109 | if (!rt_task(p)) { | 8108 | if (!rt_task(p)) { |
8110 | /* | 8109 | /* |
8111 | * Renice negative nice level userspace | 8110 | * Renice negative nice level userspace |
8112 | * tasks back to 0: | 8111 | * tasks back to 0: |
8113 | */ | 8112 | */ |
8114 | if (TASK_NICE(p) < 0 && p->mm) | 8113 | if (TASK_NICE(p) < 0 && p->mm) |
8115 | set_user_nice(p, 0); | 8114 | set_user_nice(p, 0); |
8116 | continue; | 8115 | continue; |
8117 | } | 8116 | } |
8118 | 8117 | ||
8119 | raw_spin_lock(&p->pi_lock); | 8118 | raw_spin_lock(&p->pi_lock); |
8120 | rq = __task_rq_lock(p); | 8119 | rq = __task_rq_lock(p); |
8121 | 8120 | ||
8122 | normalize_task(rq, p); | 8121 | normalize_task(rq, p); |
8123 | 8122 | ||
8124 | __task_rq_unlock(rq); | 8123 | __task_rq_unlock(rq); |
8125 | raw_spin_unlock(&p->pi_lock); | 8124 | raw_spin_unlock(&p->pi_lock); |
8126 | } while_each_thread(g, p); | 8125 | } while_each_thread(g, p); |
8127 | 8126 | ||
8128 | read_unlock_irqrestore(&tasklist_lock, flags); | 8127 | read_unlock_irqrestore(&tasklist_lock, flags); |
8129 | } | 8128 | } |
8130 | 8129 | ||
8131 | #endif /* CONFIG_MAGIC_SYSRQ */ | 8130 | #endif /* CONFIG_MAGIC_SYSRQ */ |
8132 | 8131 | ||
8133 | #if defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) | 8132 | #if defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) |
8134 | /* | 8133 | /* |
8135 | * These functions are only useful for the IA64 MCA handling, or kdb. | 8134 | * These functions are only useful for the IA64 MCA handling, or kdb. |
8136 | * | 8135 | * |
8137 | * They can only be called when the whole system has been | 8136 | * They can only be called when the whole system has been |
8138 | * stopped - every CPU needs to be quiescent, and no scheduling | 8137 | * stopped - every CPU needs to be quiescent, and no scheduling |
8139 | * activity can take place. Using them for anything else would | 8138 | * activity can take place. Using them for anything else would |
8140 | * be a serious bug, and as a result, they aren't even visible | 8139 | * be a serious bug, and as a result, they aren't even visible |
8141 | * under any other configuration. | 8140 | * under any other configuration. |
8142 | */ | 8141 | */ |
8143 | 8142 | ||
8144 | /** | 8143 | /** |
8145 | * curr_task - return the current task for a given cpu. | 8144 | * curr_task - return the current task for a given cpu. |
8146 | * @cpu: the processor in question. | 8145 | * @cpu: the processor in question. |
8147 | * | 8146 | * |
8148 | * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! | 8147 | * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! |
8149 | */ | 8148 | */ |
8150 | struct task_struct *curr_task(int cpu) | 8149 | struct task_struct *curr_task(int cpu) |
8151 | { | 8150 | { |
8152 | return cpu_curr(cpu); | 8151 | return cpu_curr(cpu); |
8153 | } | 8152 | } |
8154 | 8153 | ||
8155 | #endif /* defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) */ | 8154 | #endif /* defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) */ |
8156 | 8155 | ||
8157 | #ifdef CONFIG_IA64 | 8156 | #ifdef CONFIG_IA64 |
8158 | /** | 8157 | /** |
8159 | * set_curr_task - set the current task for a given cpu. | 8158 | * set_curr_task - set the current task for a given cpu. |
8160 | * @cpu: the processor in question. | 8159 | * @cpu: the processor in question. |
8161 | * @p: the task pointer to set. | 8160 | * @p: the task pointer to set. |
8162 | * | 8161 | * |
8163 | * Description: This function must only be used when non-maskable interrupts | 8162 | * Description: This function must only be used when non-maskable interrupts |
8164 | * are serviced on a separate stack. It allows the architecture to switch the | 8163 | * are serviced on a separate stack. It allows the architecture to switch the |
8165 | * notion of the current task on a cpu in a non-blocking manner. This function | 8164 | * notion of the current task on a cpu in a non-blocking manner. This function |
8168 | * must be called with all CPUs synchronized, and interrupts disabled, and | 8165 | * must be called with all CPUs synchronized, and interrupts disabled, and |
8167 | * the caller must save the original value of the current task (see | 8166 | * the caller must save the original value of the current task (see |
8168 | * curr_task() above) and restore that value before reenabling interrupts and | 8167 | * curr_task() above) and restore that value before reenabling interrupts and |
8169 | * re-starting the system. | 8168 | * re-starting the system. |
8170 | * | 8169 | * |
8171 | * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! | 8170 | * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! |
8172 | */ | 8171 | */ |
8173 | void set_curr_task(int cpu, struct task_struct *p) | 8172 | void set_curr_task(int cpu, struct task_struct *p) |
8174 | { | 8173 | { |
8175 | cpu_curr(cpu) = p; | 8174 | cpu_curr(cpu) = p; |
8176 | } | 8175 | } |
8177 | 8176 | ||
8178 | #endif | 8177 | #endif |
8179 | 8178 | ||
8180 | #ifdef CONFIG_FAIR_GROUP_SCHED | 8179 | #ifdef CONFIG_FAIR_GROUP_SCHED |
8181 | static void free_fair_sched_group(struct task_group *tg) | 8180 | static void free_fair_sched_group(struct task_group *tg) |
8182 | { | 8181 | { |
8183 | int i; | 8182 | int i; |
8184 | 8183 | ||
8185 | for_each_possible_cpu(i) { | 8184 | for_each_possible_cpu(i) { |
8186 | if (tg->cfs_rq) | 8185 | if (tg->cfs_rq) |
8187 | kfree(tg->cfs_rq[i]); | 8186 | kfree(tg->cfs_rq[i]); |
8188 | if (tg->se) | 8187 | if (tg->se) |
8189 | kfree(tg->se[i]); | 8188 | kfree(tg->se[i]); |
8190 | } | 8189 | } |
8191 | 8190 | ||
8192 | kfree(tg->cfs_rq); | 8191 | kfree(tg->cfs_rq); |
8193 | kfree(tg->se); | 8192 | kfree(tg->se); |
8194 | } | 8193 | } |
8195 | 8194 | ||
8196 | static | 8195 | static |
8197 | int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) | 8196 | int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) |
8198 | { | 8197 | { |
8199 | struct cfs_rq *cfs_rq; | 8198 | struct cfs_rq *cfs_rq; |
8200 | struct sched_entity *se; | 8199 | struct sched_entity *se; |
8201 | int i; | 8200 | int i; |
8202 | 8201 | ||
8203 | tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL); | 8202 | tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL); |
8204 | if (!tg->cfs_rq) | 8203 | if (!tg->cfs_rq) |
8205 | goto err; | 8204 | goto err; |
8206 | tg->se = kzalloc(sizeof(se) * nr_cpu_ids, GFP_KERNEL); | 8205 | tg->se = kzalloc(sizeof(se) * nr_cpu_ids, GFP_KERNEL); |
8207 | if (!tg->se) | 8206 | if (!tg->se) |
8208 | goto err; | 8207 | goto err; |
8209 | 8208 | ||
8210 | tg->shares = NICE_0_LOAD; | 8209 | tg->shares = NICE_0_LOAD; |
8211 | 8210 | ||
8212 | for_each_possible_cpu(i) { | 8211 | for_each_possible_cpu(i) { |
8213 | cfs_rq = kzalloc_node(sizeof(struct cfs_rq), | 8212 | cfs_rq = kzalloc_node(sizeof(struct cfs_rq), |
8214 | GFP_KERNEL, cpu_to_node(i)); | 8213 | GFP_KERNEL, cpu_to_node(i)); |
8215 | if (!cfs_rq) | 8214 | if (!cfs_rq) |
8216 | goto err; | 8215 | goto err; |
8217 | 8216 | ||
8218 | se = kzalloc_node(sizeof(struct sched_entity), | 8217 | se = kzalloc_node(sizeof(struct sched_entity), |
8219 | GFP_KERNEL, cpu_to_node(i)); | 8218 | GFP_KERNEL, cpu_to_node(i)); |
8220 | if (!se) | 8219 | if (!se) |
8221 | goto err_free_rq; | 8220 | goto err_free_rq; |
8222 | 8221 | ||
8223 | init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]); | 8222 | init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]); |
8224 | } | 8223 | } |
8225 | 8224 | ||
8226 | return 1; | 8225 | return 1; |
8227 | 8226 | ||
8228 | err_free_rq: | 8227 | err_free_rq: |
8229 | kfree(cfs_rq); | 8228 | kfree(cfs_rq); |
8230 | err: | 8229 | err: |
8231 | return 0; | 8230 | return 0; |
8232 | } | 8231 | } |
8233 | 8232 | ||
8234 | static inline void unregister_fair_sched_group(struct task_group *tg, int cpu) | 8233 | static inline void unregister_fair_sched_group(struct task_group *tg, int cpu) |
8235 | { | 8234 | { |
8236 | struct rq *rq = cpu_rq(cpu); | 8235 | struct rq *rq = cpu_rq(cpu); |
8237 | unsigned long flags; | 8236 | unsigned long flags; |
8238 | 8237 | ||
8239 | /* | 8238 | /* |
8240 | * Only empty task groups can be destroyed; so we can speculatively | 8239 | * Only empty task groups can be destroyed; so we can speculatively |
8241 | * check on_list without danger of it being re-added. | 8240 | * check on_list without danger of it being re-added. |
8242 | */ | 8241 | */ |
8243 | if (!tg->cfs_rq[cpu]->on_list) | 8242 | if (!tg->cfs_rq[cpu]->on_list) |
8244 | return; | 8243 | return; |
8245 | 8244 | ||
8246 | raw_spin_lock_irqsave(&rq->lock, flags); | 8245 | raw_spin_lock_irqsave(&rq->lock, flags); |
8247 | list_del_leaf_cfs_rq(tg->cfs_rq[cpu]); | 8246 | list_del_leaf_cfs_rq(tg->cfs_rq[cpu]); |
8248 | raw_spin_unlock_irqrestore(&rq->lock, flags); | 8247 | raw_spin_unlock_irqrestore(&rq->lock, flags); |
8249 | } | 8248 | } |
8250 | #else /* !CONFIG_FAIR_GROUP_SCHED */ | 8249 | #else /* !CONFIG_FAIR_GROUP_SCHED */ |
8251 | static inline void free_fair_sched_group(struct task_group *tg) | 8250 | static inline void free_fair_sched_group(struct task_group *tg) |
8252 | { | 8251 | { |
8253 | } | 8252 | } |
8254 | 8253 | ||
8255 | static inline | 8254 | static inline |
8256 | int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) | 8255 | int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) |
8257 | { | 8256 | { |
8258 | return 1; | 8257 | return 1; |
8259 | } | 8258 | } |
8260 | 8259 | ||
8261 | static inline void unregister_fair_sched_group(struct task_group *tg, int cpu) | 8260 | static inline void unregister_fair_sched_group(struct task_group *tg, int cpu) |
8262 | { | 8261 | { |
8263 | } | 8262 | } |
8264 | #endif /* CONFIG_FAIR_GROUP_SCHED */ | 8263 | #endif /* CONFIG_FAIR_GROUP_SCHED */ |
8265 | 8264 | ||
8266 | #ifdef CONFIG_RT_GROUP_SCHED | 8265 | #ifdef CONFIG_RT_GROUP_SCHED |
8267 | static void free_rt_sched_group(struct task_group *tg) | 8266 | static void free_rt_sched_group(struct task_group *tg) |
8268 | { | 8267 | { |
8269 | int i; | 8268 | int i; |
8270 | 8269 | ||
8271 | destroy_rt_bandwidth(&tg->rt_bandwidth); | 8270 | destroy_rt_bandwidth(&tg->rt_bandwidth); |
8272 | 8271 | ||
8273 | for_each_possible_cpu(i) { | 8272 | for_each_possible_cpu(i) { |
8274 | if (tg->rt_rq) | 8273 | if (tg->rt_rq) |
8275 | kfree(tg->rt_rq[i]); | 8274 | kfree(tg->rt_rq[i]); |
8276 | if (tg->rt_se) | 8275 | if (tg->rt_se) |
8277 | kfree(tg->rt_se[i]); | 8276 | kfree(tg->rt_se[i]); |
8278 | } | 8277 | } |
8279 | 8278 | ||
8280 | kfree(tg->rt_rq); | 8279 | kfree(tg->rt_rq); |
8281 | kfree(tg->rt_se); | 8280 | kfree(tg->rt_se); |
8282 | } | 8281 | } |
8283 | 8282 | ||
8284 | static | 8283 | static |
8285 | int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) | 8284 | int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) |
8286 | { | 8285 | { |
8287 | struct rt_rq *rt_rq; | 8286 | struct rt_rq *rt_rq; |
8288 | struct sched_rt_entity *rt_se; | 8287 | struct sched_rt_entity *rt_se; |
8289 | int i; | 8288 | int i; |
8290 | 8289 | ||
8291 | tg->rt_rq = kzalloc(sizeof(rt_rq) * nr_cpu_ids, GFP_KERNEL); | 8290 | tg->rt_rq = kzalloc(sizeof(rt_rq) * nr_cpu_ids, GFP_KERNEL); |
8292 | if (!tg->rt_rq) | 8291 | if (!tg->rt_rq) |
8293 | goto err; | 8292 | goto err; |
8294 | tg->rt_se = kzalloc(sizeof(rt_se) * nr_cpu_ids, GFP_KERNEL); | 8293 | tg->rt_se = kzalloc(sizeof(rt_se) * nr_cpu_ids, GFP_KERNEL); |
8295 | if (!tg->rt_se) | 8294 | if (!tg->rt_se) |
8296 | goto err; | 8295 | goto err; |
8297 | 8296 | ||
8298 | init_rt_bandwidth(&tg->rt_bandwidth, | 8297 | init_rt_bandwidth(&tg->rt_bandwidth, |
8299 | ktime_to_ns(def_rt_bandwidth.rt_period), 0); | 8298 | ktime_to_ns(def_rt_bandwidth.rt_period), 0); |
8300 | 8299 | ||
8301 | for_each_possible_cpu(i) { | 8300 | for_each_possible_cpu(i) { |
8302 | rt_rq = kzalloc_node(sizeof(struct rt_rq), | 8301 | rt_rq = kzalloc_node(sizeof(struct rt_rq), |
8303 | GFP_KERNEL, cpu_to_node(i)); | 8302 | GFP_KERNEL, cpu_to_node(i)); |
8304 | if (!rt_rq) | 8303 | if (!rt_rq) |
8305 | goto err; | 8304 | goto err; |
8306 | 8305 | ||
8307 | rt_se = kzalloc_node(sizeof(struct sched_rt_entity), | 8306 | rt_se = kzalloc_node(sizeof(struct sched_rt_entity), |
8308 | GFP_KERNEL, cpu_to_node(i)); | 8307 | GFP_KERNEL, cpu_to_node(i)); |
8309 | if (!rt_se) | 8308 | if (!rt_se) |
8310 | goto err_free_rq; | 8309 | goto err_free_rq; |
8311 | 8310 | ||
8312 | init_tg_rt_entry(tg, rt_rq, rt_se, i, parent->rt_se[i]); | 8311 | init_tg_rt_entry(tg, rt_rq, rt_se, i, parent->rt_se[i]); |
8313 | } | 8312 | } |
8314 | 8313 | ||
8315 | return 1; | 8314 | return 1; |
8316 | 8315 | ||
8317 | err_free_rq: | 8316 | err_free_rq: |
8318 | kfree(rt_rq); | 8317 | kfree(rt_rq); |
8319 | err: | 8318 | err: |
8320 | return 0; | 8319 | return 0; |
8321 | } | 8320 | } |
8322 | #else /* !CONFIG_RT_GROUP_SCHED */ | 8321 | #else /* !CONFIG_RT_GROUP_SCHED */ |
8323 | static inline void free_rt_sched_group(struct task_group *tg) | 8322 | static inline void free_rt_sched_group(struct task_group *tg) |
8324 | { | 8323 | { |
8325 | } | 8324 | } |
8326 | 8325 | ||
8327 | static inline | 8326 | static inline |
8328 | int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) | 8327 | int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) |
8329 | { | 8328 | { |
8330 | return 1; | 8329 | return 1; |
8331 | } | 8330 | } |
8332 | #endif /* CONFIG_RT_GROUP_SCHED */ | 8331 | #endif /* CONFIG_RT_GROUP_SCHED */ |
8333 | 8332 | ||
8334 | #ifdef CONFIG_CGROUP_SCHED | 8333 | #ifdef CONFIG_CGROUP_SCHED |
8335 | static void free_sched_group(struct task_group *tg) | 8334 | static void free_sched_group(struct task_group *tg) |
8336 | { | 8335 | { |
8337 | free_fair_sched_group(tg); | 8336 | free_fair_sched_group(tg); |
8338 | free_rt_sched_group(tg); | 8337 | free_rt_sched_group(tg); |
8339 | autogroup_free(tg); | 8338 | autogroup_free(tg); |
8340 | kfree(tg); | 8339 | kfree(tg); |
8341 | } | 8340 | } |
8342 | 8341 | ||
8343 | /* allocate runqueue etc for a new task group */ | 8342 | /* allocate runqueue etc for a new task group */ |
8344 | struct task_group *sched_create_group(struct task_group *parent) | 8343 | struct task_group *sched_create_group(struct task_group *parent) |
8345 | { | 8344 | { |
8346 | struct task_group *tg; | 8345 | struct task_group *tg; |
8347 | unsigned long flags; | 8346 | unsigned long flags; |
8348 | 8347 | ||
8349 | tg = kzalloc(sizeof(*tg), GFP_KERNEL); | 8348 | tg = kzalloc(sizeof(*tg), GFP_KERNEL); |
8350 | if (!tg) | 8349 | if (!tg) |
8351 | return ERR_PTR(-ENOMEM); | 8350 | return ERR_PTR(-ENOMEM); |
8352 | 8351 | ||
8353 | if (!alloc_fair_sched_group(tg, parent)) | 8352 | if (!alloc_fair_sched_group(tg, parent)) |
8354 | goto err; | 8353 | goto err; |
8355 | 8354 | ||
8356 | if (!alloc_rt_sched_group(tg, parent)) | 8355 | if (!alloc_rt_sched_group(tg, parent)) |
8357 | goto err; | 8356 | goto err; |
8358 | 8357 | ||
8359 | spin_lock_irqsave(&task_group_lock, flags); | 8358 | spin_lock_irqsave(&task_group_lock, flags); |
8360 | list_add_rcu(&tg->list, &task_groups); | 8359 | list_add_rcu(&tg->list, &task_groups); |
8361 | 8360 | ||
8362 | WARN_ON(!parent); /* root should already exist */ | 8361 | WARN_ON(!parent); /* root should already exist */ |
8363 | 8362 | ||
8364 | tg->parent = parent; | 8363 | tg->parent = parent; |
8365 | INIT_LIST_HEAD(&tg->children); | 8364 | INIT_LIST_HEAD(&tg->children); |
8366 | list_add_rcu(&tg->siblings, &parent->children); | 8365 | list_add_rcu(&tg->siblings, &parent->children); |
8367 | spin_unlock_irqrestore(&task_group_lock, flags); | 8366 | spin_unlock_irqrestore(&task_group_lock, flags); |
8368 | 8367 | ||
8369 | return tg; | 8368 | return tg; |
8370 | 8369 | ||
8371 | err: | 8370 | err: |
8372 | free_sched_group(tg); | 8371 | free_sched_group(tg); |
8373 | return ERR_PTR(-ENOMEM); | 8372 | return ERR_PTR(-ENOMEM); |
8374 | } | 8373 | } |
8375 | 8374 | ||
8376 | /* rcu callback to free various structures associated with a task group */ | 8375 | /* rcu callback to free various structures associated with a task group */ |
8377 | static void free_sched_group_rcu(struct rcu_head *rhp) | 8376 | static void free_sched_group_rcu(struct rcu_head *rhp) |
8378 | { | 8377 | { |
8379 | /* now it should be safe to free those cfs_rqs */ | 8378 | /* now it should be safe to free those cfs_rqs */ |
8380 | free_sched_group(container_of(rhp, struct task_group, rcu)); | 8379 | free_sched_group(container_of(rhp, struct task_group, rcu)); |
8381 | } | 8380 | } |
8382 | 8381 | ||
8383 | /* Destroy runqueue etc associated with a task group */ | 8382 | /* Destroy runqueue etc associated with a task group */ |
8384 | void sched_destroy_group(struct task_group *tg) | 8383 | void sched_destroy_group(struct task_group *tg) |
8385 | { | 8384 | { |
8386 | unsigned long flags; | 8385 | unsigned long flags; |
8387 | int i; | 8386 | int i; |
8388 | 8387 | ||
8389 | /* end participation in shares distribution */ | 8388 | /* end participation in shares distribution */ |
8390 | for_each_possible_cpu(i) | 8389 | for_each_possible_cpu(i) |
8391 | unregister_fair_sched_group(tg, i); | 8390 | unregister_fair_sched_group(tg, i); |
8392 | 8391 | ||
8393 | spin_lock_irqsave(&task_group_lock, flags); | 8392 | spin_lock_irqsave(&task_group_lock, flags); |
8394 | list_del_rcu(&tg->list); | 8393 | list_del_rcu(&tg->list); |
8395 | list_del_rcu(&tg->siblings); | 8394 | list_del_rcu(&tg->siblings); |
8396 | spin_unlock_irqrestore(&task_group_lock, flags); | 8395 | spin_unlock_irqrestore(&task_group_lock, flags); |
8397 | 8396 | ||
8398 | /* wait for possible concurrent references to cfs_rqs to complete */ | 8397 | /* wait for possible concurrent references to cfs_rqs to complete */ |
8399 | call_rcu(&tg->rcu, free_sched_group_rcu); | 8398 | call_rcu(&tg->rcu, free_sched_group_rcu); |
8400 | } | 8399 | } |
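The list_del_rcu()/call_rcu() pairing in sched_destroy_group() is the standard RCU deferred-free idiom: unlink the object while holding the update-side lock, then let the callback free it only after every pre-existing reader has left its read-side critical section. A generic sketch with placeholder names (struct foo, foo_lock and the containing list are assumptions for illustration, not kernel code):

	struct foo {
		struct list_head list;
		struct rcu_head rcu;
	};

	static void foo_free_rcu(struct rcu_head *rhp)
	{
		kfree(container_of(rhp, struct foo, rcu)); /* grace period elapsed */
	}

	static void foo_destroy(struct foo *f)
	{
		spin_lock(&foo_lock);
		list_del_rcu(&f->list);           /* readers may still see f */
		spin_unlock(&foo_lock);
		call_rcu(&f->rcu, foo_free_rcu);  /* freed after all readers finish */
	}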
8401 | 8400 | ||
8402 | /* change task's runqueue when it moves between groups. | 8401 | /* change task's runqueue when it moves between groups. |
8403 | * The caller of this function should have put the task in its new group | 8402 | * The caller of this function should have put the task in its new group |
8404 | * by now. This function just updates tsk->se.cfs_rq and tsk->se.parent to | 8403 | * by now. This function just updates tsk->se.cfs_rq and tsk->se.parent to |
8405 | * reflect its new group. | 8404 | * reflect its new group. |
8406 | */ | 8405 | */ |
8407 | void sched_move_task(struct task_struct *tsk) | 8406 | void sched_move_task(struct task_struct *tsk) |
8408 | { | 8407 | { |
8409 | int on_rq, running; | 8408 | int on_rq, running; |
8410 | unsigned long flags; | 8409 | unsigned long flags; |
8411 | struct rq *rq; | 8410 | struct rq *rq; |
8412 | 8411 | ||
8413 | rq = task_rq_lock(tsk, &flags); | 8412 | rq = task_rq_lock(tsk, &flags); |
8414 | 8413 | ||
8415 | running = task_current(rq, tsk); | 8414 | running = task_current(rq, tsk); |
8416 | on_rq = tsk->on_rq; | 8415 | on_rq = tsk->on_rq; |
8417 | 8416 | ||
8418 | if (on_rq) | 8417 | if (on_rq) |
8419 | dequeue_task(rq, tsk, 0); | 8418 | dequeue_task(rq, tsk, 0); |
8420 | if (unlikely(running)) | 8419 | if (unlikely(running)) |
8421 | tsk->sched_class->put_prev_task(rq, tsk); | 8420 | tsk->sched_class->put_prev_task(rq, tsk); |
8422 | 8421 | ||
8423 | #ifdef CONFIG_FAIR_GROUP_SCHED | 8422 | #ifdef CONFIG_FAIR_GROUP_SCHED |
8424 | if (tsk->sched_class->task_move_group) | 8423 | if (tsk->sched_class->task_move_group) |
8425 | tsk->sched_class->task_move_group(tsk, on_rq); | 8424 | tsk->sched_class->task_move_group(tsk, on_rq); |
8426 | else | 8425 | else |
8427 | #endif | 8426 | #endif |
8428 | set_task_rq(tsk, task_cpu(tsk)); | 8427 | set_task_rq(tsk, task_cpu(tsk)); |
8429 | 8428 | ||
8430 | if (unlikely(running)) | 8429 | if (unlikely(running)) |
8431 | tsk->sched_class->set_curr_task(rq); | 8430 | tsk->sched_class->set_curr_task(rq); |
8432 | if (on_rq) | 8431 | if (on_rq) |
8433 | enqueue_task(rq, tsk, 0); | 8432 | enqueue_task(rq, tsk, 0); |
8434 | 8433 | ||
8435 | task_rq_unlock(rq, tsk, &flags); | 8434 | task_rq_unlock(rq, tsk, &flags); |
8436 | } | 8435 | } |
8437 | #endif /* CONFIG_CGROUP_SCHED */ | 8436 | #endif /* CONFIG_CGROUP_SCHED */ |
8438 | 8437 | ||
8439 | #ifdef CONFIG_FAIR_GROUP_SCHED | 8438 | #ifdef CONFIG_FAIR_GROUP_SCHED |
8440 | static DEFINE_MUTEX(shares_mutex); | 8439 | static DEFINE_MUTEX(shares_mutex); |
8441 | 8440 | ||
8442 | int sched_group_set_shares(struct task_group *tg, unsigned long shares) | 8441 | int sched_group_set_shares(struct task_group *tg, unsigned long shares) |
8443 | { | 8442 | { |
8444 | int i; | 8443 | int i; |
8445 | unsigned long flags; | 8444 | unsigned long flags; |
8446 | 8445 | ||
8447 | /* | 8446 | /* |
8448 | * We can't change the weight of the root cgroup. | 8447 | * We can't change the weight of the root cgroup. |
8449 | */ | 8448 | */ |
8450 | if (!tg->se[0]) | 8449 | if (!tg->se[0]) |
8451 | return -EINVAL; | 8450 | return -EINVAL; |
8452 | 8451 | ||
8453 | if (shares < MIN_SHARES) | 8452 | if (shares < MIN_SHARES) |
8454 | shares = MIN_SHARES; | 8453 | shares = MIN_SHARES; |
8455 | else if (shares > MAX_SHARES) | 8454 | else if (shares > MAX_SHARES) |
8456 | shares = MAX_SHARES; | 8455 | shares = MAX_SHARES; |
8457 | 8456 | ||
8458 | mutex_lock(&shares_mutex); | 8457 | mutex_lock(&shares_mutex); |
8459 | if (tg->shares == shares) | 8458 | if (tg->shares == shares) |
8460 | goto done; | 8459 | goto done; |
8461 | 8460 | ||
8462 | tg->shares = shares; | 8461 | tg->shares = shares; |
8463 | for_each_possible_cpu(i) { | 8462 | for_each_possible_cpu(i) { |
8464 | struct rq *rq = cpu_rq(i); | 8463 | struct rq *rq = cpu_rq(i); |
8465 | struct sched_entity *se; | 8464 | struct sched_entity *se; |
8466 | 8465 | ||
8467 | se = tg->se[i]; | 8466 | se = tg->se[i]; |
8468 | /* Propagate contribution to hierarchy */ | 8467 | /* Propagate contribution to hierarchy */ |
8469 | raw_spin_lock_irqsave(&rq->lock, flags); | 8468 | raw_spin_lock_irqsave(&rq->lock, flags); |
8470 | for_each_sched_entity(se) | 8469 | for_each_sched_entity(se) |
8471 | update_cfs_shares(group_cfs_rq(se)); | 8470 | update_cfs_shares(group_cfs_rq(se)); |
8472 | raw_spin_unlock_irqrestore(&rq->lock, flags); | 8471 | raw_spin_unlock_irqrestore(&rq->lock, flags); |
8473 | } | 8472 | } |
8474 | 8473 | ||
8475 | done: | 8474 | done: |
8476 | mutex_unlock(&shares_mutex); | 8475 | mutex_unlock(&shares_mutex); |
8477 | return 0; | 8476 | return 0; |
8478 | } | 8477 | } |
8479 | 8478 | ||
8480 | unsigned long sched_group_shares(struct task_group *tg) | 8479 | unsigned long sched_group_shares(struct task_group *tg) |
8481 | { | 8480 | { |
8482 | return tg->shares; | 8481 | return tg->shares; |
8483 | } | 8482 | } |
8484 | #endif | 8483 | #endif |
8485 | 8484 | ||
8486 | #ifdef CONFIG_RT_GROUP_SCHED | 8485 | #ifdef CONFIG_RT_GROUP_SCHED |
8487 | /* | 8486 | /* |
8488 | * Ensure that the real time constraints are schedulable. | 8487 | * Ensure that the real time constraints are schedulable. |
8489 | */ | 8488 | */ |
8490 | static DEFINE_MUTEX(rt_constraints_mutex); | 8489 | static DEFINE_MUTEX(rt_constraints_mutex); |
8491 | 8490 | ||
8492 | static unsigned long to_ratio(u64 period, u64 runtime) | 8491 | static unsigned long to_ratio(u64 period, u64 runtime) |
8493 | { | 8492 | { |
8494 | if (runtime == RUNTIME_INF) | 8493 | if (runtime == RUNTIME_INF) |
8495 | return 1ULL << 20; | 8494 | return 1ULL << 20; |
8496 | 8495 | ||
8497 | return div64_u64(runtime << 20, period); | 8496 | return div64_u64(runtime << 20, period); |
8498 | } | 8497 | } |
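to_ratio() expresses runtime/period as a 20-bit fixed-point fraction, so an unconstrained group (RUNTIME_INF) maps to the full 1 << 20 = 1048576. As a worked example with the default global limits (rt_period = 1 s and rt_runtime = 0.95 s, i.e. 1000000000 ns and 950000000 ns by the time they reach this function): to_ratio(1000000000, 950000000) = (950000000 << 20) / 1000000000 = 996147, which is 0.95 * 2^20 rounded down.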
8499 | 8498 | ||
8500 | /* Must be called with tasklist_lock held */ | 8499 | /* Must be called with tasklist_lock held */ |
8501 | static inline int tg_has_rt_tasks(struct task_group *tg) | 8500 | static inline int tg_has_rt_tasks(struct task_group *tg) |
8502 | { | 8501 | { |
8503 | struct task_struct *g, *p; | 8502 | struct task_struct *g, *p; |
8504 | 8503 | ||
8505 | do_each_thread(g, p) { | 8504 | do_each_thread(g, p) { |
8506 | if (rt_task(p) && rt_rq_of_se(&p->rt)->tg == tg) | 8505 | if (rt_task(p) && rt_rq_of_se(&p->rt)->tg == tg) |
8507 | return 1; | 8506 | return 1; |
8508 | } while_each_thread(g, p); | 8507 | } while_each_thread(g, p); |
8509 | 8508 | ||
8510 | return 0; | 8509 | return 0; |
8511 | } | 8510 | } |
8512 | 8511 | ||
8513 | struct rt_schedulable_data { | 8512 | struct rt_schedulable_data { |
8514 | struct task_group *tg; | 8513 | struct task_group *tg; |
8515 | u64 rt_period; | 8514 | u64 rt_period; |
8516 | u64 rt_runtime; | 8515 | u64 rt_runtime; |
8517 | }; | 8516 | }; |
8518 | 8517 | ||
8519 | static int tg_schedulable(struct task_group *tg, void *data) | 8518 | static int tg_schedulable(struct task_group *tg, void *data) |
8520 | { | 8519 | { |
8521 | struct rt_schedulable_data *d = data; | 8520 | struct rt_schedulable_data *d = data; |
8522 | struct task_group *child; | 8521 | struct task_group *child; |
8523 | unsigned long total, sum = 0; | 8522 | unsigned long total, sum = 0; |
8524 | u64 period, runtime; | 8523 | u64 period, runtime; |
8525 | 8524 | ||
8526 | period = ktime_to_ns(tg->rt_bandwidth.rt_period); | 8525 | period = ktime_to_ns(tg->rt_bandwidth.rt_period); |
8527 | runtime = tg->rt_bandwidth.rt_runtime; | 8526 | runtime = tg->rt_bandwidth.rt_runtime; |
8528 | 8527 | ||
8529 | if (tg == d->tg) { | 8528 | if (tg == d->tg) { |
8530 | period = d->rt_period; | 8529 | period = d->rt_period; |
8531 | runtime = d->rt_runtime; | 8530 | runtime = d->rt_runtime; |
8532 | } | 8531 | } |
8533 | 8532 | ||
8534 | /* | 8533 | /* |
8535 | * Cannot have more runtime than the period. | 8534 | * Cannot have more runtime than the period. |
8536 | */ | 8535 | */ |
8537 | if (runtime > period && runtime != RUNTIME_INF) | 8536 | if (runtime > period && runtime != RUNTIME_INF) |
8538 | return -EINVAL; | 8537 | return -EINVAL; |
8539 | 8538 | ||
8540 | /* | 8539 | /* |
8541 | * Ensure we don't starve existing RT tasks. | 8540 | * Ensure we don't starve existing RT tasks. |
8542 | */ | 8541 | */ |
8543 | if (rt_bandwidth_enabled() && !runtime && tg_has_rt_tasks(tg)) | 8542 | if (rt_bandwidth_enabled() && !runtime && tg_has_rt_tasks(tg)) |
8544 | return -EBUSY; | 8543 | return -EBUSY; |
8545 | 8544 | ||
8546 | total = to_ratio(period, runtime); | 8545 | total = to_ratio(period, runtime); |
8547 | 8546 | ||
8548 | /* | 8547 | /* |
8549 | * Nobody can have more than the global setting allows. | 8548 | * Nobody can have more than the global setting allows. |
8550 | */ | 8549 | */ |
8551 | if (total > to_ratio(global_rt_period(), global_rt_runtime())) | 8550 | if (total > to_ratio(global_rt_period(), global_rt_runtime())) |
8552 | return -EINVAL; | 8551 | return -EINVAL; |
8553 | 8552 | ||
8554 | /* | 8553 | /* |
8555 | * The sum of our children's runtime should not exceed our own. | 8554 | * The sum of our children's runtime should not exceed our own. |
8556 | */ | 8555 | */ |
8557 | list_for_each_entry_rcu(child, &tg->children, siblings) { | 8556 | list_for_each_entry_rcu(child, &tg->children, siblings) { |
8558 | period = ktime_to_ns(child->rt_bandwidth.rt_period); | 8557 | period = ktime_to_ns(child->rt_bandwidth.rt_period); |
8559 | runtime = child->rt_bandwidth.rt_runtime; | 8558 | runtime = child->rt_bandwidth.rt_runtime; |
8560 | 8559 | ||
8561 | if (child == d->tg) { | 8560 | if (child == d->tg) { |
8562 | period = d->rt_period; | 8561 | period = d->rt_period; |
8563 | runtime = d->rt_runtime; | 8562 | runtime = d->rt_runtime; |
8564 | } | 8563 | } |
8565 | 8564 | ||
8566 | sum += to_ratio(period, runtime); | 8565 | sum += to_ratio(period, runtime); |
8567 | } | 8566 | } |
8568 | 8567 | ||
8569 | if (sum > total) | 8568 | if (sum > total) |
8570 | return -EINVAL; | 8569 | return -EINVAL; |
8571 | 8570 | ||
8572 | return 0; | 8571 | return 0; |
8573 | } | 8572 | } |
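Concretely, tg_schedulable() refuses any configuration in which the children's combined bandwidth would exceed the parent's own. For example, with the parent left at the global default (ratio 0.95 * 2^20 = 996147), setting one child group to 60 ms of runtime per 100 ms period (about 0.60 * 2^20) while a sibling already has 40 ms per 100 ms (about 0.40 * 2^20) makes the children sum to roughly 2^20, which exceeds 996147, so the write fails with -EINVAL. The walk_tg_tree() call below applies the same test at every level of the group hierarchy.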
8574 | 8573 | ||
8575 | static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime) | 8574 | static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime) |
8576 | { | 8575 | { |
8577 | struct rt_schedulable_data data = { | 8576 | struct rt_schedulable_data data = { |
8578 | .tg = tg, | 8577 | .tg = tg, |
8579 | .rt_period = period, | 8578 | .rt_period = period, |
8580 | .rt_runtime = runtime, | 8579 | .rt_runtime = runtime, |
8581 | }; | 8580 | }; |
8582 | 8581 | ||
8583 | return walk_tg_tree(tg_schedulable, tg_nop, &data); | 8582 | return walk_tg_tree(tg_schedulable, tg_nop, &data); |
8584 | } | 8583 | } |
8585 | 8584 | ||
8586 | static int tg_set_bandwidth(struct task_group *tg, | 8585 | static int tg_set_bandwidth(struct task_group *tg, |
8587 | u64 rt_period, u64 rt_runtime) | 8586 | u64 rt_period, u64 rt_runtime) |
8588 | { | 8587 | { |
8589 | int i, err = 0; | 8588 | int i, err = 0; |
8590 | 8589 | ||
8591 | mutex_lock(&rt_constraints_mutex); | 8590 | mutex_lock(&rt_constraints_mutex); |
8592 | read_lock(&tasklist_lock); | 8591 | read_lock(&tasklist_lock); |
8593 | err = __rt_schedulable(tg, rt_period, rt_runtime); | 8592 | err = __rt_schedulable(tg, rt_period, rt_runtime); |
8594 | if (err) | 8593 | if (err) |
8595 | goto unlock; | 8594 | goto unlock; |
8596 | 8595 | ||
8597 | raw_spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock); | 8596 | raw_spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock); |
8598 | tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period); | 8597 | tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period); |
8599 | tg->rt_bandwidth.rt_runtime = rt_runtime; | 8598 | tg->rt_bandwidth.rt_runtime = rt_runtime; |
8600 | 8599 | ||
8601 | for_each_possible_cpu(i) { | 8600 | for_each_possible_cpu(i) { |
8602 | struct rt_rq *rt_rq = tg->rt_rq[i]; | 8601 | struct rt_rq *rt_rq = tg->rt_rq[i]; |
8603 | 8602 | ||
8604 | raw_spin_lock(&rt_rq->rt_runtime_lock); | 8603 | raw_spin_lock(&rt_rq->rt_runtime_lock); |
8605 | rt_rq->rt_runtime = rt_runtime; | 8604 | rt_rq->rt_runtime = rt_runtime; |
8606 | raw_spin_unlock(&rt_rq->rt_runtime_lock); | 8605 | raw_spin_unlock(&rt_rq->rt_runtime_lock); |
8607 | } | 8606 | } |
8608 | raw_spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock); | 8607 | raw_spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock); |
8609 | unlock: | 8608 | unlock: |
8610 | read_unlock(&tasklist_lock); | 8609 | read_unlock(&tasklist_lock); |
8611 | mutex_unlock(&rt_constraints_mutex); | 8610 | mutex_unlock(&rt_constraints_mutex); |
8612 | 8611 | ||
8613 | return err; | 8612 | return err; |
8614 | } | 8613 | } |
8615 | 8614 | ||
8616 | int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us) | 8615 | int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us) |
8617 | { | 8616 | { |
8618 | u64 rt_runtime, rt_period; | 8617 | u64 rt_runtime, rt_period; |
8619 | 8618 | ||
8620 | rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period); | 8619 | rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period); |
8621 | rt_runtime = (u64)rt_runtime_us * NSEC_PER_USEC; | 8620 | rt_runtime = (u64)rt_runtime_us * NSEC_PER_USEC; |
8622 | if (rt_runtime_us < 0) | 8621 | if (rt_runtime_us < 0) |
8623 | rt_runtime = RUNTIME_INF; | 8622 | rt_runtime = RUNTIME_INF; |
8624 | 8623 | ||
8625 | return tg_set_bandwidth(tg, rt_period, rt_runtime); | 8624 | return tg_set_bandwidth(tg, rt_period, rt_runtime); |
8626 | } | 8625 | } |
8627 | 8626 | ||
8628 | long sched_group_rt_runtime(struct task_group *tg) | 8627 | long sched_group_rt_runtime(struct task_group *tg) |
8629 | { | 8628 | { |
8630 | u64 rt_runtime_us; | 8629 | u64 rt_runtime_us; |
8631 | 8630 | ||
8632 | if (tg->rt_bandwidth.rt_runtime == RUNTIME_INF) | 8631 | if (tg->rt_bandwidth.rt_runtime == RUNTIME_INF) |
8633 | return -1; | 8632 | return -1; |
8634 | 8633 | ||
8635 | rt_runtime_us = tg->rt_bandwidth.rt_runtime; | 8634 | rt_runtime_us = tg->rt_bandwidth.rt_runtime; |
8636 | do_div(rt_runtime_us, NSEC_PER_USEC); | 8635 | do_div(rt_runtime_us, NSEC_PER_USEC); |
8637 | return rt_runtime_us; | 8636 | return rt_runtime_us; |
8638 | } | 8637 | } |
8639 | 8638 | ||
8640 | int sched_group_set_rt_period(struct task_group *tg, long rt_period_us) | 8639 | int sched_group_set_rt_period(struct task_group *tg, long rt_period_us) |
8641 | { | 8640 | { |
8642 | u64 rt_runtime, rt_period; | 8641 | u64 rt_runtime, rt_period; |
8643 | 8642 | ||
8644 | rt_period = (u64)rt_period_us * NSEC_PER_USEC; | 8643 | rt_period = (u64)rt_period_us * NSEC_PER_USEC; |
8645 | rt_runtime = tg->rt_bandwidth.rt_runtime; | 8644 | rt_runtime = tg->rt_bandwidth.rt_runtime; |
8646 | 8645 | ||
8647 | if (rt_period == 0) | 8646 | if (rt_period == 0) |
8648 | return -EINVAL; | 8647 | return -EINVAL; |
8649 | 8648 | ||
8650 | return tg_set_bandwidth(tg, rt_period, rt_runtime); | 8649 | return tg_set_bandwidth(tg, rt_period, rt_runtime); |
8651 | } | 8650 | } |
8652 | 8651 | ||
8653 | long sched_group_rt_period(struct task_group *tg) | 8652 | long sched_group_rt_period(struct task_group *tg) |
8654 | { | 8653 | { |
8655 | u64 rt_period_us; | 8654 | u64 rt_period_us; |
8656 | 8655 | ||
8657 | rt_period_us = ktime_to_ns(tg->rt_bandwidth.rt_period); | 8656 | rt_period_us = ktime_to_ns(tg->rt_bandwidth.rt_period); |
8658 | do_div(rt_period_us, NSEC_PER_USEC); | 8657 | do_div(rt_period_us, NSEC_PER_USEC); |
8659 | return rt_period_us; | 8658 | return rt_period_us; |
8660 | } | 8659 | } |
8661 | 8660 | ||
8662 | static int sched_rt_global_constraints(void) | 8661 | static int sched_rt_global_constraints(void) |
8663 | { | 8662 | { |
8664 | u64 runtime, period; | 8663 | u64 runtime, period; |
8665 | int ret = 0; | 8664 | int ret = 0; |
8666 | 8665 | ||
8667 | if (sysctl_sched_rt_period <= 0) | 8666 | if (sysctl_sched_rt_period <= 0) |
8668 | return -EINVAL; | 8667 | return -EINVAL; |
8669 | 8668 | ||
8670 | runtime = global_rt_runtime(); | 8669 | runtime = global_rt_runtime(); |
8671 | period = global_rt_period(); | 8670 | period = global_rt_period(); |
8672 | 8671 | ||
8673 | /* | 8672 | /* |
8674 | * Sanity check on the sysctl variables. | 8673 | * Sanity check on the sysctl variables. |
8675 | */ | 8674 | */ |
8676 | if (runtime > period && runtime != RUNTIME_INF) | 8675 | if (runtime > period && runtime != RUNTIME_INF) |
8677 | return -EINVAL; | 8676 | return -EINVAL; |
8678 | 8677 | ||
8679 | mutex_lock(&rt_constraints_mutex); | 8678 | mutex_lock(&rt_constraints_mutex); |
8680 | read_lock(&tasklist_lock); | 8679 | read_lock(&tasklist_lock); |
8681 | ret = __rt_schedulable(NULL, 0, 0); | 8680 | ret = __rt_schedulable(NULL, 0, 0); |
8682 | read_unlock(&tasklist_lock); | 8681 | read_unlock(&tasklist_lock); |
8683 | mutex_unlock(&rt_constraints_mutex); | 8682 | mutex_unlock(&rt_constraints_mutex); |
8684 | 8683 | ||
8685 | return ret; | 8684 | return ret; |
8686 | } | 8685 | } |
8687 | 8686 | ||
8688 | int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk) | 8687 | int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk) |
8689 | { | 8688 | { |
8690 | /* Don't accept realtime tasks when there is no way for them to run */ | 8689 | /* Don't accept realtime tasks when there is no way for them to run */ |
8691 | if (rt_task(tsk) && tg->rt_bandwidth.rt_runtime == 0) | 8690 | if (rt_task(tsk) && tg->rt_bandwidth.rt_runtime == 0) |
8692 | return 0; | 8691 | return 0; |
8693 | 8692 | ||
8694 | return 1; | 8693 | return 1; |
8695 | } | 8694 | } |
8696 | 8695 | ||
8697 | #else /* !CONFIG_RT_GROUP_SCHED */ | 8696 | #else /* !CONFIG_RT_GROUP_SCHED */ |
8698 | static int sched_rt_global_constraints(void) | 8697 | static int sched_rt_global_constraints(void) |
8699 | { | 8698 | { |
8700 | unsigned long flags; | 8699 | unsigned long flags; |
8701 | int i; | 8700 | int i; |
8702 | 8701 | ||
8703 | if (sysctl_sched_rt_period <= 0) | 8702 | if (sysctl_sched_rt_period <= 0) |
8704 | return -EINVAL; | 8703 | return -EINVAL; |
8705 | 8704 | ||
8706 | /* | 8705 | /* |
8707 | * There are always some RT tasks in the root group | 8706 | * There are always some RT tasks in the root group |
8708 | * -- migration, kstopmachine etc. | 8707 | * -- migration, kstopmachine etc. |
8709 | */ | 8708 | */ |
8710 | if (sysctl_sched_rt_runtime == 0) | 8709 | if (sysctl_sched_rt_runtime == 0) |
8711 | return -EBUSY; | 8710 | return -EBUSY; |
8712 | 8711 | ||
8713 | raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags); | 8712 | raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags); |
8714 | for_each_possible_cpu(i) { | 8713 | for_each_possible_cpu(i) { |
8715 | struct rt_rq *rt_rq = &cpu_rq(i)->rt; | 8714 | struct rt_rq *rt_rq = &cpu_rq(i)->rt; |
8716 | 8715 | ||
8717 | raw_spin_lock(&rt_rq->rt_runtime_lock); | 8716 | raw_spin_lock(&rt_rq->rt_runtime_lock); |
8718 | rt_rq->rt_runtime = global_rt_runtime(); | 8717 | rt_rq->rt_runtime = global_rt_runtime(); |
8719 | raw_spin_unlock(&rt_rq->rt_runtime_lock); | 8718 | raw_spin_unlock(&rt_rq->rt_runtime_lock); |
8720 | } | 8719 | } |
8721 | raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags); | 8720 | raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags); |
8722 | 8721 | ||
8723 | return 0; | 8722 | return 0; |
8724 | } | 8723 | } |
8725 | #endif /* CONFIG_RT_GROUP_SCHED */ | 8724 | #endif /* CONFIG_RT_GROUP_SCHED */ |
8726 | 8725 | ||
8727 | int sched_rt_handler(struct ctl_table *table, int write, | 8726 | int sched_rt_handler(struct ctl_table *table, int write, |
8728 | void __user *buffer, size_t *lenp, | 8727 | void __user *buffer, size_t *lenp, |
8729 | loff_t *ppos) | 8728 | loff_t *ppos) |
8730 | { | 8729 | { |
8731 | int ret; | 8730 | int ret; |
8732 | int old_period, old_runtime; | 8731 | int old_period, old_runtime; |
8733 | static DEFINE_MUTEX(mutex); | 8732 | static DEFINE_MUTEX(mutex); |
8734 | 8733 | ||
8735 | mutex_lock(&mutex); | 8734 | mutex_lock(&mutex); |
8736 | old_period = sysctl_sched_rt_period; | 8735 | old_period = sysctl_sched_rt_period; |
8737 | old_runtime = sysctl_sched_rt_runtime; | 8736 | old_runtime = sysctl_sched_rt_runtime; |
8738 | 8737 | ||
8739 | ret = proc_dointvec(table, write, buffer, lenp, ppos); | 8738 | ret = proc_dointvec(table, write, buffer, lenp, ppos); |
8740 | 8739 | ||
8741 | if (!ret && write) { | 8740 | if (!ret && write) { |
8742 | ret = sched_rt_global_constraints(); | 8741 | ret = sched_rt_global_constraints(); |
8743 | if (ret) { | 8742 | if (ret) { |
8744 | sysctl_sched_rt_period = old_period; | 8743 | sysctl_sched_rt_period = old_period; |
8745 | sysctl_sched_rt_runtime = old_runtime; | 8744 | sysctl_sched_rt_runtime = old_runtime; |
8746 | } else { | 8745 | } else { |
8747 | def_rt_bandwidth.rt_runtime = global_rt_runtime(); | 8746 | def_rt_bandwidth.rt_runtime = global_rt_runtime(); |
8748 | def_rt_bandwidth.rt_period = | 8747 | def_rt_bandwidth.rt_period = |
8749 | ns_to_ktime(global_rt_period()); | 8748 | ns_to_ktime(global_rt_period()); |
8750 | } | 8749 | } |
8751 | } | 8750 | } |
8752 | mutex_unlock(&mutex); | 8751 | mutex_unlock(&mutex); |
8753 | 8752 | ||
8754 | return ret; | 8753 | return ret; |
8755 | } | 8754 | } |
8756 | 8755 | ||
8757 | #ifdef CONFIG_CGROUP_SCHED | 8756 | #ifdef CONFIG_CGROUP_SCHED |
8758 | 8757 | ||
8759 | /* return corresponding task_group object of a cgroup */ | 8758 | /* return corresponding task_group object of a cgroup */ |
8760 | static inline struct task_group *cgroup_tg(struct cgroup *cgrp) | 8759 | static inline struct task_group *cgroup_tg(struct cgroup *cgrp) |
8761 | { | 8760 | { |
8762 | return container_of(cgroup_subsys_state(cgrp, cpu_cgroup_subsys_id), | 8761 | return container_of(cgroup_subsys_state(cgrp, cpu_cgroup_subsys_id), |
8763 | struct task_group, css); | 8762 | struct task_group, css); |
8764 | } | 8763 | } |
8765 | 8764 | ||
8766 | static struct cgroup_subsys_state * | 8765 | static struct cgroup_subsys_state * |
8767 | cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp) | 8766 | cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp) |
8768 | { | 8767 | { |
8769 | struct task_group *tg, *parent; | 8768 | struct task_group *tg, *parent; |
8770 | 8769 | ||
8771 | if (!cgrp->parent) { | 8770 | if (!cgrp->parent) { |
8772 | /* This is early initialization for the top cgroup */ | 8771 | /* This is early initialization for the top cgroup */ |
8773 | return &root_task_group.css; | 8772 | return &root_task_group.css; |
8774 | } | 8773 | } |
8775 | 8774 | ||
8776 | parent = cgroup_tg(cgrp->parent); | 8775 | parent = cgroup_tg(cgrp->parent); |
8777 | tg = sched_create_group(parent); | 8776 | tg = sched_create_group(parent); |
8778 | if (IS_ERR(tg)) | 8777 | if (IS_ERR(tg)) |
8779 | return ERR_PTR(-ENOMEM); | 8778 | return ERR_PTR(-ENOMEM); |
8780 | 8779 | ||
8781 | return &tg->css; | 8780 | return &tg->css; |
8782 | } | 8781 | } |
8783 | 8782 | ||
8784 | static void | 8783 | static void |
8785 | cpu_cgroup_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp) | 8784 | cpu_cgroup_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp) |
8786 | { | 8785 | { |
8787 | struct task_group *tg = cgroup_tg(cgrp); | 8786 | struct task_group *tg = cgroup_tg(cgrp); |
8788 | 8787 | ||
8789 | sched_destroy_group(tg); | 8788 | sched_destroy_group(tg); |
8790 | } | 8789 | } |
8791 | 8790 | ||
8792 | static int | 8791 | static int |
8793 | cpu_cgroup_can_attach_task(struct cgroup *cgrp, struct task_struct *tsk) | 8792 | cpu_cgroup_can_attach_task(struct cgroup *cgrp, struct task_struct *tsk) |
8794 | { | 8793 | { |
8795 | #ifdef CONFIG_RT_GROUP_SCHED | 8794 | #ifdef CONFIG_RT_GROUP_SCHED |
8796 | if (!sched_rt_can_attach(cgroup_tg(cgrp), tsk)) | 8795 | if (!sched_rt_can_attach(cgroup_tg(cgrp), tsk)) |
8797 | return -EINVAL; | 8796 | return -EINVAL; |
8798 | #else | 8797 | #else |
8799 | /* We don't support RT-tasks being in separate groups */ | 8798 | /* We don't support RT-tasks being in separate groups */ |
8800 | if (tsk->sched_class != &fair_sched_class) | 8799 | if (tsk->sched_class != &fair_sched_class) |
8801 | return -EINVAL; | 8800 | return -EINVAL; |
8802 | #endif | 8801 | #endif |
8803 | return 0; | 8802 | return 0; |
8804 | } | 8803 | } |
8805 | 8804 | ||
8806 | static void | 8805 | static void |
8807 | cpu_cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) | 8806 | cpu_cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) |
8808 | { | 8807 | { |
8809 | sched_move_task(tsk); | 8808 | sched_move_task(tsk); |
8810 | } | 8809 | } |
8811 | 8810 | ||
8812 | static void | 8811 | static void |
8813 | cpu_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp, | 8812 | cpu_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp, |
8814 | struct cgroup *old_cgrp, struct task_struct *task) | 8813 | struct cgroup *old_cgrp, struct task_struct *task) |
8815 | { | 8814 | { |
8816 | /* | 8815 | /* |
8817 | * cgroup_exit() is called in the copy_process() failure path. | 8816 | * cgroup_exit() is called in the copy_process() failure path. |
8818 | * Ignore this case since the task hasn't run yet; this avoids | 8817 | * Ignore this case since the task hasn't run yet; this avoids |
8819 | * trying to poke a half-freed task state from generic code. | 8818 | * trying to poke a half-freed task state from generic code. |
8820 | */ | 8819 | */ |
8821 | if (!(task->flags & PF_EXITING)) | 8820 | if (!(task->flags & PF_EXITING)) |
8822 | return; | 8821 | return; |
8823 | 8822 | ||
8824 | sched_move_task(task); | 8823 | sched_move_task(task); |
8825 | } | 8824 | } |
8826 | 8825 | ||
8827 | #ifdef CONFIG_FAIR_GROUP_SCHED | 8826 | #ifdef CONFIG_FAIR_GROUP_SCHED |
8828 | static int cpu_shares_write_u64(struct cgroup *cgrp, struct cftype *cftype, | 8827 | static int cpu_shares_write_u64(struct cgroup *cgrp, struct cftype *cftype, |
8829 | u64 shareval) | 8828 | u64 shareval) |
8830 | { | 8829 | { |
8831 | return sched_group_set_shares(cgroup_tg(cgrp), scale_load(shareval)); | 8830 | return sched_group_set_shares(cgroup_tg(cgrp), scale_load(shareval)); |
8832 | } | 8831 | } |
8833 | 8832 | ||
8834 | static u64 cpu_shares_read_u64(struct cgroup *cgrp, struct cftype *cft) | 8833 | static u64 cpu_shares_read_u64(struct cgroup *cgrp, struct cftype *cft) |
8835 | { | 8834 | { |
8836 | struct task_group *tg = cgroup_tg(cgrp); | 8835 | struct task_group *tg = cgroup_tg(cgrp); |
8837 | 8836 | ||
8838 | return (u64) scale_load_down(tg->shares); | 8837 | return (u64) scale_load_down(tg->shares); |
8839 | } | 8838 | } |
8840 | #endif /* CONFIG_FAIR_GROUP_SCHED */ | 8839 | #endif /* CONFIG_FAIR_GROUP_SCHED */ |
8841 | 8840 | ||
8842 | #ifdef CONFIG_RT_GROUP_SCHED | 8841 | #ifdef CONFIG_RT_GROUP_SCHED |
8843 | static int cpu_rt_runtime_write(struct cgroup *cgrp, struct cftype *cft, | 8842 | static int cpu_rt_runtime_write(struct cgroup *cgrp, struct cftype *cft, |
8844 | s64 val) | 8843 | s64 val) |
8845 | { | 8844 | { |
8846 | return sched_group_set_rt_runtime(cgroup_tg(cgrp), val); | 8845 | return sched_group_set_rt_runtime(cgroup_tg(cgrp), val); |
8847 | } | 8846 | } |
8848 | 8847 | ||
8849 | static s64 cpu_rt_runtime_read(struct cgroup *cgrp, struct cftype *cft) | 8848 | static s64 cpu_rt_runtime_read(struct cgroup *cgrp, struct cftype *cft) |
8850 | { | 8849 | { |
8851 | return sched_group_rt_runtime(cgroup_tg(cgrp)); | 8850 | return sched_group_rt_runtime(cgroup_tg(cgrp)); |
8852 | } | 8851 | } |
8853 | 8852 | ||
8854 | static int cpu_rt_period_write_uint(struct cgroup *cgrp, struct cftype *cftype, | 8853 | static int cpu_rt_period_write_uint(struct cgroup *cgrp, struct cftype *cftype, |
8855 | u64 rt_period_us) | 8854 | u64 rt_period_us) |
8856 | { | 8855 | { |
8857 | return sched_group_set_rt_period(cgroup_tg(cgrp), rt_period_us); | 8856 | return sched_group_set_rt_period(cgroup_tg(cgrp), rt_period_us); |
8858 | } | 8857 | } |
8859 | 8858 | ||
8860 | static u64 cpu_rt_period_read_uint(struct cgroup *cgrp, struct cftype *cft) | 8859 | static u64 cpu_rt_period_read_uint(struct cgroup *cgrp, struct cftype *cft) |
8861 | { | 8860 | { |
8862 | return sched_group_rt_period(cgroup_tg(cgrp)); | 8861 | return sched_group_rt_period(cgroup_tg(cgrp)); |
8863 | } | 8862 | } |
8864 | #endif /* CONFIG_RT_GROUP_SCHED */ | 8863 | #endif /* CONFIG_RT_GROUP_SCHED */ |
8865 | 8864 | ||
8866 | static struct cftype cpu_files[] = { | 8865 | static struct cftype cpu_files[] = { |
8867 | #ifdef CONFIG_FAIR_GROUP_SCHED | 8866 | #ifdef CONFIG_FAIR_GROUP_SCHED |
8868 | { | 8867 | { |
8869 | .name = "shares", | 8868 | .name = "shares", |
8870 | .read_u64 = cpu_shares_read_u64, | 8869 | .read_u64 = cpu_shares_read_u64, |
8871 | .write_u64 = cpu_shares_write_u64, | 8870 | .write_u64 = cpu_shares_write_u64, |
8872 | }, | 8871 | }, |
8873 | #endif | 8872 | #endif |
8874 | #ifdef CONFIG_RT_GROUP_SCHED | 8873 | #ifdef CONFIG_RT_GROUP_SCHED |
8875 | { | 8874 | { |
8876 | .name = "rt_runtime_us", | 8875 | .name = "rt_runtime_us", |
8877 | .read_s64 = cpu_rt_runtime_read, | 8876 | .read_s64 = cpu_rt_runtime_read, |
8878 | .write_s64 = cpu_rt_runtime_write, | 8877 | .write_s64 = cpu_rt_runtime_write, |
8879 | }, | 8878 | }, |
8880 | { | 8879 | { |
8881 | .name = "rt_period_us", | 8880 | .name = "rt_period_us", |
8882 | .read_u64 = cpu_rt_period_read_uint, | 8881 | .read_u64 = cpu_rt_period_read_uint, |
8883 | .write_u64 = cpu_rt_period_write_uint, | 8882 | .write_u64 = cpu_rt_period_write_uint, |
8884 | }, | 8883 | }, |
8885 | #endif | 8884 | #endif |
8886 | }; | 8885 | }; |
8887 | 8886 | ||
8888 | static int cpu_cgroup_populate(struct cgroup_subsys *ss, struct cgroup *cont) | 8887 | static int cpu_cgroup_populate(struct cgroup_subsys *ss, struct cgroup *cont) |
8889 | { | 8888 | { |
8890 | return cgroup_add_files(cont, ss, cpu_files, ARRAY_SIZE(cpu_files)); | 8889 | return cgroup_add_files(cont, ss, cpu_files, ARRAY_SIZE(cpu_files)); |
8891 | } | 8890 | } |
8892 | 8891 | ||
8893 | struct cgroup_subsys cpu_cgroup_subsys = { | 8892 | struct cgroup_subsys cpu_cgroup_subsys = { |
8894 | .name = "cpu", | 8893 | .name = "cpu", |
8895 | .create = cpu_cgroup_create, | 8894 | .create = cpu_cgroup_create, |
8896 | .destroy = cpu_cgroup_destroy, | 8895 | .destroy = cpu_cgroup_destroy, |
8897 | .can_attach_task = cpu_cgroup_can_attach_task, | 8896 | .can_attach_task = cpu_cgroup_can_attach_task, |
8898 | .attach_task = cpu_cgroup_attach_task, | 8897 | .attach_task = cpu_cgroup_attach_task, |
8899 | .exit = cpu_cgroup_exit, | 8898 | .exit = cpu_cgroup_exit, |
8900 | .populate = cpu_cgroup_populate, | 8899 | .populate = cpu_cgroup_populate, |
8901 | .subsys_id = cpu_cgroup_subsys_id, | 8900 | .subsys_id = cpu_cgroup_subsys_id, |
8902 | .early_init = 1, | 8901 | .early_init = 1, |
8903 | }; | 8902 | }; |
8904 | 8903 | ||
8905 | #endif /* CONFIG_CGROUP_SCHED */ | 8904 | #endif /* CONFIG_CGROUP_SCHED */ |
8906 | 8905 | ||
8907 | #ifdef CONFIG_CGROUP_CPUACCT | 8906 | #ifdef CONFIG_CGROUP_CPUACCT |
8908 | 8907 | ||
8909 | /* | 8908 | /* |
8910 | * CPU accounting code for task groups. | 8909 | * CPU accounting code for task groups. |
8911 | * | 8910 | * |
8912 | * Based on the work by Paul Menage (menage@google.com) and Balbir Singh | 8911 | * Based on the work by Paul Menage (menage@google.com) and Balbir Singh |
8913 | * (balbir@in.ibm.com). | 8912 | * (balbir@in.ibm.com). |
8914 | */ | 8913 | */ |
8915 | 8914 | ||
8916 | /* track cpu usage of a group of tasks and its child groups */ | 8915 | /* track cpu usage of a group of tasks and its child groups */ |
8917 | struct cpuacct { | 8916 | struct cpuacct { |
8918 | struct cgroup_subsys_state css; | 8917 | struct cgroup_subsys_state css; |
8919 | /* cpuusage holds pointer to a u64-type object on every cpu */ | 8918 | /* cpuusage holds pointer to a u64-type object on every cpu */ |
8920 | u64 __percpu *cpuusage; | 8919 | u64 __percpu *cpuusage; |
8921 | struct percpu_counter cpustat[CPUACCT_STAT_NSTATS]; | 8920 | struct percpu_counter cpustat[CPUACCT_STAT_NSTATS]; |
8922 | struct cpuacct *parent; | 8921 | struct cpuacct *parent; |
8923 | }; | 8922 | }; |
8924 | 8923 | ||
8925 | struct cgroup_subsys cpuacct_subsys; | 8924 | struct cgroup_subsys cpuacct_subsys; |
8926 | 8925 | ||
8927 | /* return cpu accounting group corresponding to this container */ | 8926 | /* return cpu accounting group corresponding to this container */ |
8928 | static inline struct cpuacct *cgroup_ca(struct cgroup *cgrp) | 8927 | static inline struct cpuacct *cgroup_ca(struct cgroup *cgrp) |
8929 | { | 8928 | { |
8930 | return container_of(cgroup_subsys_state(cgrp, cpuacct_subsys_id), | 8929 | return container_of(cgroup_subsys_state(cgrp, cpuacct_subsys_id), |
8931 | struct cpuacct, css); | 8930 | struct cpuacct, css); |
8932 | } | 8931 | } |
8933 | 8932 | ||
8934 | /* return cpu accounting group to which this task belongs */ | 8933 | /* return cpu accounting group to which this task belongs */ |
8935 | static inline struct cpuacct *task_ca(struct task_struct *tsk) | 8934 | static inline struct cpuacct *task_ca(struct task_struct *tsk) |
8936 | { | 8935 | { |
8937 | return container_of(task_subsys_state(tsk, cpuacct_subsys_id), | 8936 | return container_of(task_subsys_state(tsk, cpuacct_subsys_id), |
8938 | struct cpuacct, css); | 8937 | struct cpuacct, css); |
8939 | } | 8938 | } |
8940 | 8939 | ||
8941 | /* create a new cpu accounting group */ | 8940 | /* create a new cpu accounting group */ |
8942 | static struct cgroup_subsys_state *cpuacct_create( | 8941 | static struct cgroup_subsys_state *cpuacct_create( |
8943 | struct cgroup_subsys *ss, struct cgroup *cgrp) | 8942 | struct cgroup_subsys *ss, struct cgroup *cgrp) |
8944 | { | 8943 | { |
8945 | struct cpuacct *ca = kzalloc(sizeof(*ca), GFP_KERNEL); | 8944 | struct cpuacct *ca = kzalloc(sizeof(*ca), GFP_KERNEL); |
8946 | int i; | 8945 | int i; |
8947 | 8946 | ||
8948 | if (!ca) | 8947 | if (!ca) |
8949 | goto out; | 8948 | goto out; |
8950 | 8949 | ||
8951 | ca->cpuusage = alloc_percpu(u64); | 8950 | ca->cpuusage = alloc_percpu(u64); |
8952 | if (!ca->cpuusage) | 8951 | if (!ca->cpuusage) |
8953 | goto out_free_ca; | 8952 | goto out_free_ca; |
8954 | 8953 | ||
8955 | for (i = 0; i < CPUACCT_STAT_NSTATS; i++) | 8954 | for (i = 0; i < CPUACCT_STAT_NSTATS; i++) |
8956 | if (percpu_counter_init(&ca->cpustat[i], 0)) | 8955 | if (percpu_counter_init(&ca->cpustat[i], 0)) |
8957 | goto out_free_counters; | 8956 | goto out_free_counters; |
8958 | 8957 | ||
8959 | if (cgrp->parent) | 8958 | if (cgrp->parent) |
8960 | ca->parent = cgroup_ca(cgrp->parent); | 8959 | ca->parent = cgroup_ca(cgrp->parent); |
8961 | 8960 | ||
8962 | return &ca->css; | 8961 | return &ca->css; |
8963 | 8962 | ||
8964 | out_free_counters: | 8963 | out_free_counters: |
8965 | while (--i >= 0) | 8964 | while (--i >= 0) |
8966 | percpu_counter_destroy(&ca->cpustat[i]); | 8965 | percpu_counter_destroy(&ca->cpustat[i]); |
8967 | free_percpu(ca->cpuusage); | 8966 | free_percpu(ca->cpuusage); |
8968 | out_free_ca: | 8967 | out_free_ca: |
8969 | kfree(ca); | 8968 | kfree(ca); |
8970 | out: | 8969 | out: |
8971 | return ERR_PTR(-ENOMEM); | 8970 | return ERR_PTR(-ENOMEM); |
8972 | } | 8971 | } |
8973 | 8972 | ||
8974 | /* destroy an existing cpu accounting group */ | 8973 | /* destroy an existing cpu accounting group */ |
8975 | static void | 8974 | static void |
8976 | cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp) | 8975 | cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp) |
8977 | { | 8976 | { |
8978 | struct cpuacct *ca = cgroup_ca(cgrp); | 8977 | struct cpuacct *ca = cgroup_ca(cgrp); |
8979 | int i; | 8978 | int i; |
8980 | 8979 | ||
8981 | for (i = 0; i < CPUACCT_STAT_NSTATS; i++) | 8980 | for (i = 0; i < CPUACCT_STAT_NSTATS; i++) |
8982 | percpu_counter_destroy(&ca->cpustat[i]); | 8981 | percpu_counter_destroy(&ca->cpustat[i]); |
8983 | free_percpu(ca->cpuusage); | 8982 | free_percpu(ca->cpuusage); |
8984 | kfree(ca); | 8983 | kfree(ca); |
8985 | } | 8984 | } |
8986 | 8985 | ||
8987 | static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu) | 8986 | static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu) |
8988 | { | 8987 | { |
8989 | u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); | 8988 | u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); |
8990 | u64 data; | 8989 | u64 data; |
8991 | 8990 | ||
8992 | #ifndef CONFIG_64BIT | 8991 | #ifndef CONFIG_64BIT |
8993 | /* | 8992 | /* |
8994 | * Take rq->lock to make 64-bit read safe on 32-bit platforms. | 8993 | * Take rq->lock to make 64-bit read safe on 32-bit platforms. |
8995 | */ | 8994 | */ |
8996 | raw_spin_lock_irq(&cpu_rq(cpu)->lock); | 8995 | raw_spin_lock_irq(&cpu_rq(cpu)->lock); |
8997 | data = *cpuusage; | 8996 | data = *cpuusage; |
8998 | raw_spin_unlock_irq(&cpu_rq(cpu)->lock); | 8997 | raw_spin_unlock_irq(&cpu_rq(cpu)->lock); |
8999 | #else | 8998 | #else |
9000 | data = *cpuusage; | 8999 | data = *cpuusage; |
9001 | #endif | 9000 | #endif |
9002 | 9001 | ||
9003 | return data; | 9002 | return data; |
9004 | } | 9003 | } |
9005 | 9004 | ||
9006 | static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val) | 9005 | static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val) |
9007 | { | 9006 | { |
9008 | u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); | 9007 | u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); |
9009 | 9008 | ||
9010 | #ifndef CONFIG_64BIT | 9009 | #ifndef CONFIG_64BIT |
9011 | /* | 9010 | /* |
9012 | * Take rq->lock to make 64-bit write safe on 32-bit platforms. | 9011 | * Take rq->lock to make 64-bit write safe on 32-bit platforms. |
9013 | */ | 9012 | */ |
9014 | raw_spin_lock_irq(&cpu_rq(cpu)->lock); | 9013 | raw_spin_lock_irq(&cpu_rq(cpu)->lock); |
9015 | *cpuusage = val; | 9014 | *cpuusage = val; |
9016 | raw_spin_unlock_irq(&cpu_rq(cpu)->lock); | 9015 | raw_spin_unlock_irq(&cpu_rq(cpu)->lock); |
9017 | #else | 9016 | #else |
9018 | *cpuusage = val; | 9017 | *cpuusage = val; |
9019 | #endif | 9018 | #endif |
9020 | } | 9019 | } |
9021 | 9020 | ||
9022 | /* return total cpu usage (in nanoseconds) of a group */ | 9021 | /* return total cpu usage (in nanoseconds) of a group */ |
9023 | static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft) | 9022 | static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft) |
9024 | { | 9023 | { |
9025 | struct cpuacct *ca = cgroup_ca(cgrp); | 9024 | struct cpuacct *ca = cgroup_ca(cgrp); |
9026 | u64 totalcpuusage = 0; | 9025 | u64 totalcpuusage = 0; |
9027 | int i; | 9026 | int i; |
9028 | 9027 | ||
9029 | for_each_present_cpu(i) | 9028 | for_each_present_cpu(i) |
9030 | totalcpuusage += cpuacct_cpuusage_read(ca, i); | 9029 | totalcpuusage += cpuacct_cpuusage_read(ca, i); |
9031 | 9030 | ||
9032 | return totalcpuusage; | 9031 | return totalcpuusage; |
9033 | } | 9032 | } |
9034 | 9033 | ||
9035 | static int cpuusage_write(struct cgroup *cgrp, struct cftype *cftype, | 9034 | static int cpuusage_write(struct cgroup *cgrp, struct cftype *cftype, |
9036 | u64 reset) | 9035 | u64 reset) |
9037 | { | 9036 | { |
9038 | struct cpuacct *ca = cgroup_ca(cgrp); | 9037 | struct cpuacct *ca = cgroup_ca(cgrp); |
9039 | int err = 0; | 9038 | int err = 0; |
9040 | int i; | 9039 | int i; |
9041 | 9040 | ||
9042 | if (reset) { | 9041 | if (reset) { |
9043 | err = -EINVAL; | 9042 | err = -EINVAL; |
9044 | goto out; | 9043 | goto out; |
9045 | } | 9044 | } |
9046 | 9045 | ||
9047 | for_each_present_cpu(i) | 9046 | for_each_present_cpu(i) |
9048 | cpuacct_cpuusage_write(ca, i, 0); | 9047 | cpuacct_cpuusage_write(ca, i, 0); |
9049 | 9048 | ||
9050 | out: | 9049 | out: |
9051 | return err; | 9050 | return err; |
9052 | } | 9051 | } |
9053 | 9052 | ||
9054 | static int cpuacct_percpu_seq_read(struct cgroup *cgroup, struct cftype *cft, | 9053 | static int cpuacct_percpu_seq_read(struct cgroup *cgroup, struct cftype *cft, |
9055 | struct seq_file *m) | 9054 | struct seq_file *m) |
9056 | { | 9055 | { |
9057 | struct cpuacct *ca = cgroup_ca(cgroup); | 9056 | struct cpuacct *ca = cgroup_ca(cgroup); |
9058 | u64 percpu; | 9057 | u64 percpu; |
9059 | int i; | 9058 | int i; |
9060 | 9059 | ||
9061 | for_each_present_cpu(i) { | 9060 | for_each_present_cpu(i) { |
9062 | percpu = cpuacct_cpuusage_read(ca, i); | 9061 | percpu = cpuacct_cpuusage_read(ca, i); |
9063 | seq_printf(m, "%llu ", (unsigned long long) percpu); | 9062 | seq_printf(m, "%llu ", (unsigned long long) percpu); |
9064 | } | 9063 | } |
9065 | seq_printf(m, "\n"); | 9064 | seq_printf(m, "\n"); |
9066 | return 0; | 9065 | return 0; |
9067 | } | 9066 | } |
9068 | 9067 | ||
9069 | static const char *cpuacct_stat_desc[] = { | 9068 | static const char *cpuacct_stat_desc[] = { |
9070 | [CPUACCT_STAT_USER] = "user", | 9069 | [CPUACCT_STAT_USER] = "user", |
9071 | [CPUACCT_STAT_SYSTEM] = "system", | 9070 | [CPUACCT_STAT_SYSTEM] = "system", |
9072 | }; | 9071 | }; |
9073 | 9072 | ||
9074 | static int cpuacct_stats_show(struct cgroup *cgrp, struct cftype *cft, | 9073 | static int cpuacct_stats_show(struct cgroup *cgrp, struct cftype *cft, |
9075 | struct cgroup_map_cb *cb) | 9074 | struct cgroup_map_cb *cb) |
9076 | { | 9075 | { |
9077 | struct cpuacct *ca = cgroup_ca(cgrp); | 9076 | struct cpuacct *ca = cgroup_ca(cgrp); |
9078 | int i; | 9077 | int i; |
9079 | 9078 | ||
9080 | for (i = 0; i < CPUACCT_STAT_NSTATS; i++) { | 9079 | for (i = 0; i < CPUACCT_STAT_NSTATS; i++) { |
9081 | s64 val = percpu_counter_read(&ca->cpustat[i]); | 9080 | s64 val = percpu_counter_read(&ca->cpustat[i]); |
9082 | val = cputime64_to_clock_t(val); | 9081 | val = cputime64_to_clock_t(val); |
9083 | cb->fill(cb, cpuacct_stat_desc[i], val); | 9082 | cb->fill(cb, cpuacct_stat_desc[i], val); |
9084 | } | 9083 | } |
9085 | return 0; | 9084 | return 0; |
9086 | } | 9085 | } |
9087 | 9086 | ||
9088 | static struct cftype files[] = { | 9087 | static struct cftype files[] = { |
9089 | { | 9088 | { |
9090 | .name = "usage", | 9089 | .name = "usage", |
9091 | .read_u64 = cpuusage_read, | 9090 | .read_u64 = cpuusage_read, |
9092 | .write_u64 = cpuusage_write, | 9091 | .write_u64 = cpuusage_write, |
9093 | }, | 9092 | }, |
9094 | { | 9093 | { |
9095 | .name = "usage_percpu", | 9094 | .name = "usage_percpu", |
9096 | .read_seq_string = cpuacct_percpu_seq_read, | 9095 | .read_seq_string = cpuacct_percpu_seq_read, |
9097 | }, | 9096 | }, |
9098 | { | 9097 | { |
9099 | .name = "stat", | 9098 | .name = "stat", |
9100 | .read_map = cpuacct_stats_show, | 9099 | .read_map = cpuacct_stats_show, |
9101 | }, | 9100 | }, |
9102 | }; | 9101 | }; |
9103 | 9102 | ||
9104 | static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp) | 9103 | static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp) |
9105 | { | 9104 | { |
9106 | return cgroup_add_files(cgrp, ss, files, ARRAY_SIZE(files)); | 9105 | return cgroup_add_files(cgrp, ss, files, ARRAY_SIZE(files)); |
9107 | } | 9106 | } |
9108 | 9107 | ||
9109 | /* | 9108 | /* |
9110 | * charge this task's execution time to its accounting group. | 9109 | * charge this task's execution time to its accounting group. |
9111 | * | 9110 | * |
9112 | * called with rq->lock held. | 9111 | * called with rq->lock held. |
9113 | */ | 9112 | */ |
9114 | static void cpuacct_charge(struct task_struct *tsk, u64 cputime) | 9113 | static void cpuacct_charge(struct task_struct *tsk, u64 cputime) |
9115 | { | 9114 | { |
9116 | struct cpuacct *ca; | 9115 | struct cpuacct *ca; |
9117 | int cpu; | 9116 | int cpu; |
9118 | 9117 | ||
9119 | if (unlikely(!cpuacct_subsys.active)) | 9118 | if (unlikely(!cpuacct_subsys.active)) |
9120 | return; | 9119 | return; |
9121 | 9120 | ||
9122 | cpu = task_cpu(tsk); | 9121 | cpu = task_cpu(tsk); |
9123 | 9122 | ||
9124 | rcu_read_lock(); | 9123 | rcu_read_lock(); |
9125 | 9124 | ||
9126 | ca = task_ca(tsk); | 9125 | ca = task_ca(tsk); |
9127 | 9126 | ||
9128 | for (; ca; ca = ca->parent) { | 9127 | for (; ca; ca = ca->parent) { |
9129 | u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); | 9128 | u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); |
9130 | *cpuusage += cputime; | 9129 | *cpuusage += cputime; |
9131 | } | 9130 | } |
9132 | 9131 | ||
9133 | rcu_read_unlock(); | 9132 | rcu_read_unlock(); |
9134 | } | 9133 | } |
9135 | 9134 | ||
9136 | /* | 9135 | /* |
9137 | * When CONFIG_VIRT_CPU_ACCOUNTING is enabled one jiffy can be very large | 9136 | * When CONFIG_VIRT_CPU_ACCOUNTING is enabled one jiffy can be very large |
9138 | * in cputime_t units. As a result, cpuacct_update_stats calls | 9137 | * in cputime_t units. As a result, cpuacct_update_stats calls |
9139 | * percpu_counter_add with values large enough to always overflow the | 9138 | * percpu_counter_add with values large enough to always overflow the |
9140 | * per cpu batch limit, causing bad SMP scalability. | 9139 | * per cpu batch limit, causing bad SMP scalability. |
9141 | * | 9140 | * |
9142 | * To fix this we scale percpu_counter_batch by cputime_one_jiffy so we | 9141 | * To fix this we scale percpu_counter_batch by cputime_one_jiffy so we |
9143 | * batch the same amount of time with CONFIG_VIRT_CPU_ACCOUNTING disabled | 9142 | * batch the same amount of time with CONFIG_VIRT_CPU_ACCOUNTING disabled |
9144 | * and enabled. We cap it at INT_MAX which is the largest allowed batch value. | 9143 | * and enabled. We cap it at INT_MAX which is the largest allowed batch value. |
9145 | */ | 9144 | */ |
9146 | #ifdef CONFIG_SMP | 9145 | #ifdef CONFIG_SMP |
9147 | #define CPUACCT_BATCH \ | 9146 | #define CPUACCT_BATCH \ |
9148 | min_t(long, percpu_counter_batch * cputime_one_jiffy, INT_MAX) | 9147 | min_t(long, percpu_counter_batch * cputime_one_jiffy, INT_MAX) |
9149 | #else | 9148 | #else |
9150 | #define CPUACCT_BATCH 0 | 9149 | #define CPUACCT_BATCH 0 |
9151 | #endif | 9150 | #endif |
9152 | 9151 | ||
9153 | /* | 9152 | /* |
9154 | * Charge the system/user time to the task's accounting group. | 9153 | * Charge the system/user time to the task's accounting group. |
9155 | */ | 9154 | */ |
9156 | static void cpuacct_update_stats(struct task_struct *tsk, | 9155 | static void cpuacct_update_stats(struct task_struct *tsk, |
9157 | enum cpuacct_stat_index idx, cputime_t val) | 9156 | enum cpuacct_stat_index idx, cputime_t val) |
9158 | { | 9157 | { |
9159 | struct cpuacct *ca; | 9158 | struct cpuacct *ca; |
9160 | int batch = CPUACCT_BATCH; | 9159 | int batch = CPUACCT_BATCH; |
9161 | 9160 | ||
9162 | if (unlikely(!cpuacct_subsys.active)) | 9161 | if (unlikely(!cpuacct_subsys.active)) |
9163 | return; | 9162 | return; |
9164 | 9163 | ||
9165 | rcu_read_lock(); | 9164 | rcu_read_lock(); |
9166 | ca = task_ca(tsk); | 9165 | ca = task_ca(tsk); |
9167 | 9166 | ||
9168 | do { | 9167 | do { |
9169 | __percpu_counter_add(&ca->cpustat[idx], val, batch); | 9168 | __percpu_counter_add(&ca->cpustat[idx], val, batch); |
9170 | ca = ca->parent; | 9169 | ca = ca->parent; |
9171 | } while (ca); | 9170 | } while (ca); |
9172 | rcu_read_unlock(); | 9171 | rcu_read_unlock(); |
9173 | } | 9172 | } |
9174 | 9173 | ||
9175 | struct cgroup_subsys cpuacct_subsys = { | 9174 | struct cgroup_subsys cpuacct_subsys = { |
9176 | .name = "cpuacct", | 9175 | .name = "cpuacct", |
9177 | .create = cpuacct_create, | 9176 | .create = cpuacct_create, |
9178 | .destroy = cpuacct_destroy, | 9177 | .destroy = cpuacct_destroy, |
9179 | .populate = cpuacct_populate, | 9178 | .populate = cpuacct_populate, |
9180 | .subsys_id = cpuacct_subsys_id, | 9179 | .subsys_id = cpuacct_subsys_id, |
9181 | }; | 9180 | }; |
9182 | #endif /* CONFIG_CGROUP_CPUACCT */ | 9181 | #endif /* CONFIG_CGROUP_CPUACCT */ |
9183 | 9182 | ||
9184 | 9183 |
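For readers unfamiliar with the clamp in CPUACCT_BATCH above: it multiplies the default percpu_counter batch by the size of one jiffy in cputime_t units and caps the product at INT_MAX, exactly as the comment describes. A standalone sketch with invented numbers (neither value is taken from this tree, they only illustrate the arithmetic):

	/*
	 * Userspace sketch of the CPUACCT_BATCH clamp; the values of
	 * percpu_counter_batch and cputime_one_jiffy are made up.
	 */
	#include <limits.h>
	#include <stdio.h>

	static long min_long(long a, long b) { return a < b ? a : b; }

	int main(void)
	{
		long percpu_counter_batch = 32;      /* hypothetical default batch */
		long cputime_one_jiffy = 10000000;   /* hypothetical jiffy size in cputime_t units */

		/* mirrors: min_t(long, percpu_counter_batch * cputime_one_jiffy, INT_MAX) */
		long batch = min_long(percpu_counter_batch * cputime_one_jiffy, INT_MAX);

		printf("CPUACCT_BATCH would be %ld (capped at %d)\n", batch, INT_MAX);
		return 0;
	}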
net/mac80211/sta_info.c
1 | /* | 1 | /* |
2 | * Copyright 2002-2005, Instant802 Networks, Inc. | 2 | * Copyright 2002-2005, Instant802 Networks, Inc. |
3 | * Copyright 2006-2007 Jiri Benc <jbenc@suse.cz> | 3 | * Copyright 2006-2007 Jiri Benc <jbenc@suse.cz> |
4 | * | 4 | * |
5 | * This program is free software; you can redistribute it and/or modify | 5 | * This program is free software; you can redistribute it and/or modify |
6 | * it under the terms of the GNU General Public License version 2 as | 6 | * it under the terms of the GNU General Public License version 2 as |
7 | * published by the Free Software Foundation. | 7 | * published by the Free Software Foundation. |
8 | */ | 8 | */ |
9 | 9 | ||
10 | #include <linux/module.h> | 10 | #include <linux/module.h> |
11 | #include <linux/init.h> | 11 | #include <linux/init.h> |
12 | #include <linux/netdevice.h> | 12 | #include <linux/netdevice.h> |
13 | #include <linux/types.h> | 13 | #include <linux/types.h> |
14 | #include <linux/slab.h> | 14 | #include <linux/slab.h> |
15 | #include <linux/skbuff.h> | 15 | #include <linux/skbuff.h> |
16 | #include <linux/if_arp.h> | 16 | #include <linux/if_arp.h> |
17 | #include <linux/timer.h> | 17 | #include <linux/timer.h> |
18 | #include <linux/rtnetlink.h> | 18 | #include <linux/rtnetlink.h> |
19 | 19 | ||
20 | #include <net/mac80211.h> | 20 | #include <net/mac80211.h> |
21 | #include "ieee80211_i.h" | 21 | #include "ieee80211_i.h" |
22 | #include "driver-ops.h" | 22 | #include "driver-ops.h" |
23 | #include "rate.h" | 23 | #include "rate.h" |
24 | #include "sta_info.h" | 24 | #include "sta_info.h" |
25 | #include "debugfs_sta.h" | 25 | #include "debugfs_sta.h" |
26 | #include "mesh.h" | 26 | #include "mesh.h" |
27 | 27 | ||
28 | /** | 28 | /** |
29 | * DOC: STA information lifetime rules | 29 | * DOC: STA information lifetime rules |
30 | * | 30 | * |
31 | * STA info structures (&struct sta_info) are managed in a hash table | 31 | * STA info structures (&struct sta_info) are managed in a hash table |
32 | * for faster lookup and a list for iteration. They are managed using | 32 | * for faster lookup and a list for iteration. They are managed using |
33 | * RCU, i.e. access to the list and hash table is protected by RCU. | 33 | * RCU, i.e. access to the list and hash table is protected by RCU. |
34 | * | 34 | * |
35 | * Upon allocating a STA info structure with sta_info_alloc(), the caller | 35 | * Upon allocating a STA info structure with sta_info_alloc(), the caller |
36 | * owns that structure. It must then insert it into the hash table using | 36 | * owns that structure. It must then insert it into the hash table using |
37 | * either sta_info_insert() or sta_info_insert_rcu(); only in the latter | 37 | * either sta_info_insert() or sta_info_insert_rcu(); only in the latter |
38 | * case (which acquires an rcu read section but must not be called from | 38 | * case (which acquires an rcu read section but must not be called from |
39 | * within one) will the pointer still be valid after the call. Note that | 39 | * within one) will the pointer still be valid after the call. Note that |
40 | * the caller may not do much with the STA info before inserting it, in | 40 | * the caller may not do much with the STA info before inserting it, in |
41 | * particular, it may not start any mesh peer link management or add | 41 | * particular, it may not start any mesh peer link management or add |
42 | * encryption keys. | 42 | * encryption keys. |
43 | * | 43 | * |
44 | * When the insertion fails (sta_info_insert() returns non-zero), the | 44 | * When the insertion fails (sta_info_insert() returns non-zero), the |
45 | * structure will have been freed by sta_info_insert()! | 45 | * structure will have been freed by sta_info_insert()! |
46 | * | 46 | * |
47 | * Station entries are added by mac80211 when you establish a link with a | 47 | * Station entries are added by mac80211 when you establish a link with a |
48 | * peer. This means different things for the different types of interfaces | 48 | * peer. This means different things for the different types of interfaces |
49 | * we support. For a regular station this means we add the AP sta when we | 49 | * we support. For a regular station this means we add the AP sta when we |
50 | * receive an association response from the AP. For IBSS this occurs when we | 50 | * receive an association response from the AP. For IBSS this occurs when we |
51 | * get to know about a peer on the same IBSS. For WDS we add the sta for | 51 | * get to know about a peer on the same IBSS. For WDS we add the sta for |
52 | * the peer immediately upon device open. When using AP mode we add stations | 52 | * the peer immediately upon device open. When using AP mode we add stations |
53 | * for each peer upon request from userspace through nl80211. | 53 | * for each peer upon request from userspace through nl80211. |
54 | * | 54 | * |
55 | * In order to remove a STA info structure, various sta_info_destroy_*() | 55 | * In order to remove a STA info structure, various sta_info_destroy_*() |
56 | * calls are available. | 56 | * calls are available. |
57 | * | 57 | * |
58 | * There is no concept of ownership on a STA entry, each structure is | 58 | * There is no concept of ownership on a STA entry, each structure is |
59 | * owned by the global hash table/list until it is removed. All users of | 59 | * owned by the global hash table/list until it is removed. All users of |
60 | * the structure need to be RCU protected so that the structure won't be | 60 | * the structure need to be RCU protected so that the structure won't be |
61 | * freed before they are done using it. | 61 | * freed before they are done using it. |
62 | */ | 62 | */ |
63 | 63 | ||
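A rough caller-side sketch of the lifetime rules documented above; the function example_add_peer and its context are hypothetical, only sta_info_alloc() and sta_info_insert() are real, and error handling is abbreviated:

	/*
	 * Hypothetical caller, not part of mac80211: allocate, configure,
	 * then insert.  After a failed sta_info_insert() the structure has
	 * already been freed and must not be touched again.
	 */
	static int example_add_peer(struct ieee80211_sub_if_data *sdata, u8 *addr)
	{
		struct sta_info *sta;
		int err;

		sta = sta_info_alloc(sdata, addr, GFP_KERNEL);
		if (!sta)
			return -ENOMEM;

		/* ... set up fields the caller still owns, but no keys yet ... */

		err = sta_info_insert(sta);
		if (err)
			return err;	/* sta was freed by sta_info_insert() */

		return 0;
	}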
64 | /* Caller must hold local->sta_lock */ | 64 | /* Caller must hold local->sta_lock */ |
65 | static int sta_info_hash_del(struct ieee80211_local *local, | 65 | static int sta_info_hash_del(struct ieee80211_local *local, |
66 | struct sta_info *sta) | 66 | struct sta_info *sta) |
67 | { | 67 | { |
68 | struct sta_info *s; | 68 | struct sta_info *s; |
69 | 69 | ||
70 | s = rcu_dereference_protected(local->sta_hash[STA_HASH(sta->sta.addr)], | 70 | s = rcu_dereference_protected(local->sta_hash[STA_HASH(sta->sta.addr)], |
71 | lockdep_is_held(&local->sta_lock)); | 71 | lockdep_is_held(&local->sta_lock)); |
72 | if (!s) | 72 | if (!s) |
73 | return -ENOENT; | 73 | return -ENOENT; |
74 | if (s == sta) { | 74 | if (s == sta) { |
75 | rcu_assign_pointer(local->sta_hash[STA_HASH(sta->sta.addr)], | 75 | rcu_assign_pointer(local->sta_hash[STA_HASH(sta->sta.addr)], |
76 | s->hnext); | 76 | s->hnext); |
77 | return 0; | 77 | return 0; |
78 | } | 78 | } |
79 | 79 | ||
80 | while (rcu_access_pointer(s->hnext) && | 80 | while (rcu_access_pointer(s->hnext) && |
81 | rcu_access_pointer(s->hnext) != sta) | 81 | rcu_access_pointer(s->hnext) != sta) |
82 | s = rcu_dereference_protected(s->hnext, | 82 | s = rcu_dereference_protected(s->hnext, |
83 | lockdep_is_held(&local->sta_lock)); | 83 | lockdep_is_held(&local->sta_lock)); |
84 | if (rcu_access_pointer(s->hnext)) { | 84 | if (rcu_access_pointer(s->hnext)) { |
85 | rcu_assign_pointer(s->hnext, sta->hnext); | 85 | rcu_assign_pointer(s->hnext, sta->hnext); |
86 | return 0; | 86 | return 0; |
87 | } | 87 | } |
88 | 88 | ||
89 | return -ENOENT; | 89 | return -ENOENT; |
90 | } | 90 | } |
91 | 91 | ||
92 | /* protected by RCU */ | 92 | /* protected by RCU */ |
93 | struct sta_info *sta_info_get(struct ieee80211_sub_if_data *sdata, | 93 | struct sta_info *sta_info_get(struct ieee80211_sub_if_data *sdata, |
94 | const u8 *addr) | 94 | const u8 *addr) |
95 | { | 95 | { |
96 | struct ieee80211_local *local = sdata->local; | 96 | struct ieee80211_local *local = sdata->local; |
97 | struct sta_info *sta; | 97 | struct sta_info *sta; |
98 | 98 | ||
99 | sta = rcu_dereference_check(local->sta_hash[STA_HASH(addr)], | 99 | sta = rcu_dereference_check(local->sta_hash[STA_HASH(addr)], |
100 | rcu_read_lock_held() || | ||
101 | lockdep_is_held(&local->sta_lock) || | 100 | lockdep_is_held(&local->sta_lock) || |
102 | lockdep_is_held(&local->sta_mtx)); | 101 | lockdep_is_held(&local->sta_mtx)); |
103 | while (sta) { | 102 | while (sta) { |
104 | if (sta->sdata == sdata && | 103 | if (sta->sdata == sdata && |
105 | memcmp(sta->sta.addr, addr, ETH_ALEN) == 0) | 104 | memcmp(sta->sta.addr, addr, ETH_ALEN) == 0) |
106 | break; | 105 | break; |
107 | sta = rcu_dereference_check(sta->hnext, | 106 | sta = rcu_dereference_check(sta->hnext, |
108 | rcu_read_lock_held() || | ||
109 | lockdep_is_held(&local->sta_lock) || | 107 | lockdep_is_held(&local->sta_lock) || |
110 | lockdep_is_held(&local->sta_mtx)); | 108 | lockdep_is_held(&local->sta_mtx)); |
111 | } | 109 | } |
112 | return sta; | 110 | return sta; |
113 | } | 111 | } |
114 | 112 | ||
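The hunks above drop the explicit rcu_read_lock_held() term and keep only the lockdep conditions in rcu_dereference_check(). A generic, self-contained sketch of the resulting pattern (my_head, my_lock and my_mutex are placeholder names, not mac80211 symbols):

	/*
	 * Placeholder pattern only -- not mac80211 code.  The checker
	 * condition names the locks that also protect the pointer; being
	 * inside an RCU read-side critical section is covered implicitly.
	 */
	#include <linux/rcupdate.h>
	#include <linux/spinlock.h>
	#include <linux/mutex.h>

	struct foo {
		int val;
	};

	static struct foo __rcu *my_head;
	static DEFINE_SPINLOCK(my_lock);
	static DEFINE_MUTEX(my_mutex);

	static struct foo *foo_lookup(void)
	{
		return rcu_dereference_check(my_head,
					     lockdep_is_held(&my_lock) ||
					     lockdep_is_held(&my_mutex));
	}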
115 | /* | 113 | /* |
116 | * Get sta info either from the specified interface | 114 | * Get sta info either from the specified interface |
117 | * or from one of its vlans | 115 | * or from one of its vlans |
118 | */ | 116 | */ |
119 | struct sta_info *sta_info_get_bss(struct ieee80211_sub_if_data *sdata, | 117 | struct sta_info *sta_info_get_bss(struct ieee80211_sub_if_data *sdata, |
120 | const u8 *addr) | 118 | const u8 *addr) |
121 | { | 119 | { |
122 | struct ieee80211_local *local = sdata->local; | 120 | struct ieee80211_local *local = sdata->local; |
123 | struct sta_info *sta; | 121 | struct sta_info *sta; |
124 | 122 | ||
125 | sta = rcu_dereference_check(local->sta_hash[STA_HASH(addr)], | 123 | sta = rcu_dereference_check(local->sta_hash[STA_HASH(addr)], |
126 | rcu_read_lock_held() || | ||
127 | lockdep_is_held(&local->sta_lock) || | 124 | lockdep_is_held(&local->sta_lock) || |
128 | lockdep_is_held(&local->sta_mtx)); | 125 | lockdep_is_held(&local->sta_mtx)); |
129 | while (sta) { | 126 | while (sta) { |
130 | if ((sta->sdata == sdata || | 127 | if ((sta->sdata == sdata || |
131 | (sta->sdata->bss && sta->sdata->bss == sdata->bss)) && | 128 | (sta->sdata->bss && sta->sdata->bss == sdata->bss)) && |
132 | memcmp(sta->sta.addr, addr, ETH_ALEN) == 0) | 129 | memcmp(sta->sta.addr, addr, ETH_ALEN) == 0) |
133 | break; | 130 | break; |
134 | sta = rcu_dereference_check(sta->hnext, | 131 | sta = rcu_dereference_check(sta->hnext, |
135 | rcu_read_lock_held() || | ||
136 | lockdep_is_held(&local->sta_lock) || | 132 | lockdep_is_held(&local->sta_lock) || |
137 | lockdep_is_held(&local->sta_mtx)); | 133 | lockdep_is_held(&local->sta_mtx)); |
138 | } | 134 | } |
139 | return sta; | 135 | return sta; |
140 | } | 136 | } |
141 | 137 | ||
142 | struct sta_info *sta_info_get_by_idx(struct ieee80211_sub_if_data *sdata, | 138 | struct sta_info *sta_info_get_by_idx(struct ieee80211_sub_if_data *sdata, |
143 | int idx) | 139 | int idx) |
144 | { | 140 | { |
145 | struct ieee80211_local *local = sdata->local; | 141 | struct ieee80211_local *local = sdata->local; |
146 | struct sta_info *sta; | 142 | struct sta_info *sta; |
147 | int i = 0; | 143 | int i = 0; |
148 | 144 | ||
149 | list_for_each_entry_rcu(sta, &local->sta_list, list) { | 145 | list_for_each_entry_rcu(sta, &local->sta_list, list) { |
150 | if (sdata != sta->sdata) | 146 | if (sdata != sta->sdata) |
151 | continue; | 147 | continue; |
152 | if (i < idx) { | 148 | if (i < idx) { |
153 | ++i; | 149 | ++i; |
154 | continue; | 150 | continue; |
155 | } | 151 | } |
156 | return sta; | 152 | return sta; |
157 | } | 153 | } |
158 | 154 | ||
159 | return NULL; | 155 | return NULL; |
160 | } | 156 | } |
161 | 157 | ||
162 | /** | 158 | /** |
163 | * __sta_info_free - internal STA free helper | 159 | * __sta_info_free - internal STA free helper |
164 | * | 160 | * |
165 | * @local: pointer to the global information | 161 | * @local: pointer to the global information |
166 | * @sta: STA info to free | 162 | * @sta: STA info to free |
167 | * | 163 | * |
168 | * This function must undo everything done by sta_info_alloc() | 164 | * This function must undo everything done by sta_info_alloc() |
169 | * that may happen before sta_info_insert(). | 165 | * that may happen before sta_info_insert(). |
170 | */ | 166 | */ |
171 | static void __sta_info_free(struct ieee80211_local *local, | 167 | static void __sta_info_free(struct ieee80211_local *local, |
172 | struct sta_info *sta) | 168 | struct sta_info *sta) |
173 | { | 169 | { |
174 | if (sta->rate_ctrl) { | 170 | if (sta->rate_ctrl) { |
175 | rate_control_free_sta(sta); | 171 | rate_control_free_sta(sta); |
176 | rate_control_put(sta->rate_ctrl); | 172 | rate_control_put(sta->rate_ctrl); |
177 | } | 173 | } |
178 | 174 | ||
179 | #ifdef CONFIG_MAC80211_VERBOSE_DEBUG | 175 | #ifdef CONFIG_MAC80211_VERBOSE_DEBUG |
180 | wiphy_debug(local->hw.wiphy, "Destroyed STA %pM\n", sta->sta.addr); | 176 | wiphy_debug(local->hw.wiphy, "Destroyed STA %pM\n", sta->sta.addr); |
181 | #endif /* CONFIG_MAC80211_VERBOSE_DEBUG */ | 177 | #endif /* CONFIG_MAC80211_VERBOSE_DEBUG */ |
182 | 178 | ||
183 | kfree(sta); | 179 | kfree(sta); |
184 | } | 180 | } |
185 | 181 | ||
186 | /* Caller must hold local->sta_lock */ | 182 | /* Caller must hold local->sta_lock */ |
187 | static void sta_info_hash_add(struct ieee80211_local *local, | 183 | static void sta_info_hash_add(struct ieee80211_local *local, |
188 | struct sta_info *sta) | 184 | struct sta_info *sta) |
189 | { | 185 | { |
190 | sta->hnext = local->sta_hash[STA_HASH(sta->sta.addr)]; | 186 | sta->hnext = local->sta_hash[STA_HASH(sta->sta.addr)]; |
191 | rcu_assign_pointer(local->sta_hash[STA_HASH(sta->sta.addr)], sta); | 187 | rcu_assign_pointer(local->sta_hash[STA_HASH(sta->sta.addr)], sta); |
192 | } | 188 | } |
193 | 189 | ||
194 | static void sta_unblock(struct work_struct *wk) | 190 | static void sta_unblock(struct work_struct *wk) |
195 | { | 191 | { |
196 | struct sta_info *sta; | 192 | struct sta_info *sta; |
197 | 193 | ||
198 | sta = container_of(wk, struct sta_info, drv_unblock_wk); | 194 | sta = container_of(wk, struct sta_info, drv_unblock_wk); |
199 | 195 | ||
200 | if (sta->dead) | 196 | if (sta->dead) |
201 | return; | 197 | return; |
202 | 198 | ||
203 | if (!test_sta_flags(sta, WLAN_STA_PS_STA)) | 199 | if (!test_sta_flags(sta, WLAN_STA_PS_STA)) |
204 | ieee80211_sta_ps_deliver_wakeup(sta); | 200 | ieee80211_sta_ps_deliver_wakeup(sta); |
205 | else if (test_and_clear_sta_flags(sta, WLAN_STA_PSPOLL)) { | 201 | else if (test_and_clear_sta_flags(sta, WLAN_STA_PSPOLL)) { |
206 | clear_sta_flags(sta, WLAN_STA_PS_DRIVER); | 202 | clear_sta_flags(sta, WLAN_STA_PS_DRIVER); |
207 | ieee80211_sta_ps_deliver_poll_response(sta); | 203 | ieee80211_sta_ps_deliver_poll_response(sta); |
208 | } else | 204 | } else |
209 | clear_sta_flags(sta, WLAN_STA_PS_DRIVER); | 205 | clear_sta_flags(sta, WLAN_STA_PS_DRIVER); |
210 | } | 206 | } |
211 | 207 | ||
212 | static int sta_prepare_rate_control(struct ieee80211_local *local, | 208 | static int sta_prepare_rate_control(struct ieee80211_local *local, |
213 | struct sta_info *sta, gfp_t gfp) | 209 | struct sta_info *sta, gfp_t gfp) |
214 | { | 210 | { |
215 | if (local->hw.flags & IEEE80211_HW_HAS_RATE_CONTROL) | 211 | if (local->hw.flags & IEEE80211_HW_HAS_RATE_CONTROL) |
216 | return 0; | 212 | return 0; |
217 | 213 | ||
218 | sta->rate_ctrl = rate_control_get(local->rate_ctrl); | 214 | sta->rate_ctrl = rate_control_get(local->rate_ctrl); |
219 | sta->rate_ctrl_priv = rate_control_alloc_sta(sta->rate_ctrl, | 215 | sta->rate_ctrl_priv = rate_control_alloc_sta(sta->rate_ctrl, |
220 | &sta->sta, gfp); | 216 | &sta->sta, gfp); |
221 | if (!sta->rate_ctrl_priv) { | 217 | if (!sta->rate_ctrl_priv) { |
222 | rate_control_put(sta->rate_ctrl); | 218 | rate_control_put(sta->rate_ctrl); |
223 | return -ENOMEM; | 219 | return -ENOMEM; |
224 | } | 220 | } |
225 | 221 | ||
226 | return 0; | 222 | return 0; |
227 | } | 223 | } |
228 | 224 | ||
229 | struct sta_info *sta_info_alloc(struct ieee80211_sub_if_data *sdata, | 225 | struct sta_info *sta_info_alloc(struct ieee80211_sub_if_data *sdata, |
230 | u8 *addr, gfp_t gfp) | 226 | u8 *addr, gfp_t gfp) |
231 | { | 227 | { |
232 | struct ieee80211_local *local = sdata->local; | 228 | struct ieee80211_local *local = sdata->local; |
233 | struct sta_info *sta; | 229 | struct sta_info *sta; |
234 | struct timespec uptime; | 230 | struct timespec uptime; |
235 | int i; | 231 | int i; |
236 | 232 | ||
237 | sta = kzalloc(sizeof(*sta) + local->hw.sta_data_size, gfp); | 233 | sta = kzalloc(sizeof(*sta) + local->hw.sta_data_size, gfp); |
238 | if (!sta) | 234 | if (!sta) |
239 | return NULL; | 235 | return NULL; |
240 | 236 | ||
241 | spin_lock_init(&sta->lock); | 237 | spin_lock_init(&sta->lock); |
242 | spin_lock_init(&sta->flaglock); | 238 | spin_lock_init(&sta->flaglock); |
243 | INIT_WORK(&sta->drv_unblock_wk, sta_unblock); | 239 | INIT_WORK(&sta->drv_unblock_wk, sta_unblock); |
244 | INIT_WORK(&sta->ampdu_mlme.work, ieee80211_ba_session_work); | 240 | INIT_WORK(&sta->ampdu_mlme.work, ieee80211_ba_session_work); |
245 | mutex_init(&sta->ampdu_mlme.mtx); | 241 | mutex_init(&sta->ampdu_mlme.mtx); |
246 | 242 | ||
247 | memcpy(sta->sta.addr, addr, ETH_ALEN); | 243 | memcpy(sta->sta.addr, addr, ETH_ALEN); |
248 | sta->local = local; | 244 | sta->local = local; |
249 | sta->sdata = sdata; | 245 | sta->sdata = sdata; |
250 | sta->last_rx = jiffies; | 246 | sta->last_rx = jiffies; |
251 | 247 | ||
252 | do_posix_clock_monotonic_gettime(&uptime); | 248 | do_posix_clock_monotonic_gettime(&uptime); |
253 | sta->last_connected = uptime.tv_sec; | 249 | sta->last_connected = uptime.tv_sec; |
254 | ewma_init(&sta->avg_signal, 1024, 8); | 250 | ewma_init(&sta->avg_signal, 1024, 8); |
255 | 251 | ||
256 | if (sta_prepare_rate_control(local, sta, gfp)) { | 252 | if (sta_prepare_rate_control(local, sta, gfp)) { |
257 | kfree(sta); | 253 | kfree(sta); |
258 | return NULL; | 254 | return NULL; |
259 | } | 255 | } |
260 | 256 | ||
261 | for (i = 0; i < STA_TID_NUM; i++) { | 257 | for (i = 0; i < STA_TID_NUM; i++) { |
262 | /* | 258 | /* |
263 | * timer_to_tid must be initialized with identity mapping | 259 | * timer_to_tid must be initialized with identity mapping |
264 | * to enable session_timer's data differentiation. See | 260 | * to enable session_timer's data differentiation. See |
265 | * sta_rx_agg_session_timer_expired for usage. | 261 | * sta_rx_agg_session_timer_expired for usage. |
266 | */ | 262 | */ |
267 | sta->timer_to_tid[i] = i; | 263 | sta->timer_to_tid[i] = i; |
268 | } | 264 | } |
269 | skb_queue_head_init(&sta->ps_tx_buf); | 265 | skb_queue_head_init(&sta->ps_tx_buf); |
270 | skb_queue_head_init(&sta->tx_filtered); | 266 | skb_queue_head_init(&sta->tx_filtered); |
271 | 267 | ||
272 | for (i = 0; i < NUM_RX_DATA_QUEUES; i++) | 268 | for (i = 0; i < NUM_RX_DATA_QUEUES; i++) |
273 | sta->last_seq_ctrl[i] = cpu_to_le16(USHRT_MAX); | 269 | sta->last_seq_ctrl[i] = cpu_to_le16(USHRT_MAX); |
274 | 270 | ||
275 | #ifdef CONFIG_MAC80211_VERBOSE_DEBUG | 271 | #ifdef CONFIG_MAC80211_VERBOSE_DEBUG |
276 | wiphy_debug(local->hw.wiphy, "Allocated STA %pM\n", sta->sta.addr); | 272 | wiphy_debug(local->hw.wiphy, "Allocated STA %pM\n", sta->sta.addr); |
277 | #endif /* CONFIG_MAC80211_VERBOSE_DEBUG */ | 273 | #endif /* CONFIG_MAC80211_VERBOSE_DEBUG */ |
278 | 274 | ||
279 | #ifdef CONFIG_MAC80211_MESH | 275 | #ifdef CONFIG_MAC80211_MESH |
280 | sta->plink_state = NL80211_PLINK_LISTEN; | 276 | sta->plink_state = NL80211_PLINK_LISTEN; |
281 | init_timer(&sta->plink_timer); | 277 | init_timer(&sta->plink_timer); |
282 | #endif | 278 | #endif |
283 | 279 | ||
284 | return sta; | 280 | return sta; |
285 | } | 281 | } |
286 | 282 | ||
287 | static int sta_info_finish_insert(struct sta_info *sta, bool async) | 283 | static int sta_info_finish_insert(struct sta_info *sta, bool async) |
288 | { | 284 | { |
289 | struct ieee80211_local *local = sta->local; | 285 | struct ieee80211_local *local = sta->local; |
290 | struct ieee80211_sub_if_data *sdata = sta->sdata; | 286 | struct ieee80211_sub_if_data *sdata = sta->sdata; |
291 | struct station_info sinfo; | 287 | struct station_info sinfo; |
292 | unsigned long flags; | 288 | unsigned long flags; |
293 | int err = 0; | 289 | int err = 0; |
294 | 290 | ||
295 | lockdep_assert_held(&local->sta_mtx); | 291 | lockdep_assert_held(&local->sta_mtx); |
296 | 292 | ||
297 | /* notify driver */ | 293 | /* notify driver */ |
298 | if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN) | 294 | if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN) |
299 | sdata = container_of(sdata->bss, | 295 | sdata = container_of(sdata->bss, |
300 | struct ieee80211_sub_if_data, | 296 | struct ieee80211_sub_if_data, |
301 | u.ap); | 297 | u.ap); |
302 | err = drv_sta_add(local, sdata, &sta->sta); | 298 | err = drv_sta_add(local, sdata, &sta->sta); |
303 | if (err) { | 299 | if (err) { |
304 | if (!async) | 300 | if (!async) |
305 | return err; | 301 | return err; |
306 | printk(KERN_DEBUG "%s: failed to add IBSS STA %pM to driver (%d)" | 302 | printk(KERN_DEBUG "%s: failed to add IBSS STA %pM to driver (%d)" |
307 | " - keeping it anyway.\n", | 303 | " - keeping it anyway.\n", |
308 | sdata->name, sta->sta.addr, err); | 304 | sdata->name, sta->sta.addr, err); |
309 | } else { | 305 | } else { |
310 | sta->uploaded = true; | 306 | sta->uploaded = true; |
311 | #ifdef CONFIG_MAC80211_VERBOSE_DEBUG | 307 | #ifdef CONFIG_MAC80211_VERBOSE_DEBUG |
312 | if (async) | 308 | if (async) |
313 | wiphy_debug(local->hw.wiphy, | 309 | wiphy_debug(local->hw.wiphy, |
314 | "Finished adding IBSS STA %pM\n", | 310 | "Finished adding IBSS STA %pM\n", |
315 | sta->sta.addr); | 311 | sta->sta.addr); |
316 | #endif | 312 | #endif |
317 | } | 313 | } |
318 | 314 | ||
319 | sdata = sta->sdata; | 315 | sdata = sta->sdata; |
320 | 316 | ||
321 | if (!async) { | 317 | if (!async) { |
322 | local->num_sta++; | 318 | local->num_sta++; |
323 | local->sta_generation++; | 319 | local->sta_generation++; |
324 | smp_mb(); | 320 | smp_mb(); |
325 | 321 | ||
326 | /* make the station visible */ | 322 | /* make the station visible */ |
327 | spin_lock_irqsave(&local->sta_lock, flags); | 323 | spin_lock_irqsave(&local->sta_lock, flags); |
328 | sta_info_hash_add(local, sta); | 324 | sta_info_hash_add(local, sta); |
329 | spin_unlock_irqrestore(&local->sta_lock, flags); | 325 | spin_unlock_irqrestore(&local->sta_lock, flags); |
330 | } | 326 | } |
331 | 327 | ||
332 | list_add(&sta->list, &local->sta_list); | 328 | list_add(&sta->list, &local->sta_list); |
333 | 329 | ||
334 | ieee80211_sta_debugfs_add(sta); | 330 | ieee80211_sta_debugfs_add(sta); |
335 | rate_control_add_sta_debugfs(sta); | 331 | rate_control_add_sta_debugfs(sta); |
336 | 332 | ||
337 | sinfo.filled = 0; | 333 | sinfo.filled = 0; |
338 | sinfo.generation = local->sta_generation; | 334 | sinfo.generation = local->sta_generation; |
339 | cfg80211_new_sta(sdata->dev, sta->sta.addr, &sinfo, GFP_KERNEL); | 335 | cfg80211_new_sta(sdata->dev, sta->sta.addr, &sinfo, GFP_KERNEL); |
340 | 336 | ||
341 | 337 | ||
342 | return 0; | 338 | return 0; |
343 | } | 339 | } |
344 | 340 | ||
345 | static void sta_info_finish_pending(struct ieee80211_local *local) | 341 | static void sta_info_finish_pending(struct ieee80211_local *local) |
346 | { | 342 | { |
347 | struct sta_info *sta; | 343 | struct sta_info *sta; |
348 | unsigned long flags; | 344 | unsigned long flags; |
349 | 345 | ||
350 | spin_lock_irqsave(&local->sta_lock, flags); | 346 | spin_lock_irqsave(&local->sta_lock, flags); |
351 | while (!list_empty(&local->sta_pending_list)) { | 347 | while (!list_empty(&local->sta_pending_list)) { |
352 | sta = list_first_entry(&local->sta_pending_list, | 348 | sta = list_first_entry(&local->sta_pending_list, |
353 | struct sta_info, list); | 349 | struct sta_info, list); |
354 | list_del(&sta->list); | 350 | list_del(&sta->list); |
355 | spin_unlock_irqrestore(&local->sta_lock, flags); | 351 | spin_unlock_irqrestore(&local->sta_lock, flags); |
356 | 352 | ||
357 | sta_info_finish_insert(sta, true); | 353 | sta_info_finish_insert(sta, true); |
358 | 354 | ||
359 | spin_lock_irqsave(&local->sta_lock, flags); | 355 | spin_lock_irqsave(&local->sta_lock, flags); |
360 | } | 356 | } |
361 | spin_unlock_irqrestore(&local->sta_lock, flags); | 357 | spin_unlock_irqrestore(&local->sta_lock, flags); |
362 | } | 358 | } |
363 | 359 | ||
364 | static void sta_info_finish_work(struct work_struct *work) | 360 | static void sta_info_finish_work(struct work_struct *work) |
365 | { | 361 | { |
366 | struct ieee80211_local *local = | 362 | struct ieee80211_local *local = |
367 | container_of(work, struct ieee80211_local, sta_finish_work); | 363 | container_of(work, struct ieee80211_local, sta_finish_work); |
368 | 364 | ||
369 | mutex_lock(&local->sta_mtx); | 365 | mutex_lock(&local->sta_mtx); |
370 | sta_info_finish_pending(local); | 366 | sta_info_finish_pending(local); |
371 | mutex_unlock(&local->sta_mtx); | 367 | mutex_unlock(&local->sta_mtx); |
372 | } | 368 | } |
373 | 369 | ||
374 | int sta_info_insert_rcu(struct sta_info *sta) __acquires(RCU) | 370 | int sta_info_insert_rcu(struct sta_info *sta) __acquires(RCU) |
375 | { | 371 | { |
376 | struct ieee80211_local *local = sta->local; | 372 | struct ieee80211_local *local = sta->local; |
377 | struct ieee80211_sub_if_data *sdata = sta->sdata; | 373 | struct ieee80211_sub_if_data *sdata = sta->sdata; |
378 | unsigned long flags; | 374 | unsigned long flags; |
379 | int err = 0; | 375 | int err = 0; |
380 | 376 | ||
381 | /* | 377 | /* |
382 | * Can't be a WARN_ON because it can be triggered through a race: | 378 | * Can't be a WARN_ON because it can be triggered through a race: |
383 | * something inserts a STA (on one CPU) without holding the RTNL | 379 | * something inserts a STA (on one CPU) without holding the RTNL |
384 | * and another CPU turns off the net device. | 380 | * and another CPU turns off the net device. |
385 | */ | 381 | */ |
386 | if (unlikely(!ieee80211_sdata_running(sdata))) { | 382 | if (unlikely(!ieee80211_sdata_running(sdata))) { |
387 | err = -ENETDOWN; | 383 | err = -ENETDOWN; |
388 | rcu_read_lock(); | 384 | rcu_read_lock(); |
389 | goto out_free; | 385 | goto out_free; |
390 | } | 386 | } |
391 | 387 | ||
392 | if (WARN_ON(compare_ether_addr(sta->sta.addr, sdata->vif.addr) == 0 || | 388 | if (WARN_ON(compare_ether_addr(sta->sta.addr, sdata->vif.addr) == 0 || |
393 | is_multicast_ether_addr(sta->sta.addr))) { | 389 | is_multicast_ether_addr(sta->sta.addr))) { |
394 | err = -EINVAL; | 390 | err = -EINVAL; |
395 | rcu_read_lock(); | 391 | rcu_read_lock(); |
396 | goto out_free; | 392 | goto out_free; |
397 | } | 393 | } |
398 | 394 | ||
399 | /* | 395 | /* |
400 | * In ad-hoc mode, we sometimes need to insert stations | 396 | * In ad-hoc mode, we sometimes need to insert stations |
401 | * from tasklet context from the RX path. To avoid races, | 397 | * from tasklet context from the RX path. To avoid races, |
402 | * always do so in that case -- see the comment below. | 398 | * always do so in that case -- see the comment below. |
403 | */ | 399 | */ |
404 | if (sdata->vif.type == NL80211_IFTYPE_ADHOC) { | 400 | if (sdata->vif.type == NL80211_IFTYPE_ADHOC) { |
405 | spin_lock_irqsave(&local->sta_lock, flags); | 401 | spin_lock_irqsave(&local->sta_lock, flags); |
406 | /* check if STA exists already */ | 402 | /* check if STA exists already */ |
407 | if (sta_info_get_bss(sdata, sta->sta.addr)) { | 403 | if (sta_info_get_bss(sdata, sta->sta.addr)) { |
408 | spin_unlock_irqrestore(&local->sta_lock, flags); | 404 | spin_unlock_irqrestore(&local->sta_lock, flags); |
409 | rcu_read_lock(); | 405 | rcu_read_lock(); |
410 | err = -EEXIST; | 406 | err = -EEXIST; |
411 | goto out_free; | 407 | goto out_free; |
412 | } | 408 | } |
413 | 409 | ||
414 | local->num_sta++; | 410 | local->num_sta++; |
415 | local->sta_generation++; | 411 | local->sta_generation++; |
416 | smp_mb(); | 412 | smp_mb(); |
417 | sta_info_hash_add(local, sta); | 413 | sta_info_hash_add(local, sta); |
418 | 414 | ||
419 | list_add_tail(&sta->list, &local->sta_pending_list); | 415 | list_add_tail(&sta->list, &local->sta_pending_list); |
420 | 416 | ||
421 | rcu_read_lock(); | 417 | rcu_read_lock(); |
422 | spin_unlock_irqrestore(&local->sta_lock, flags); | 418 | spin_unlock_irqrestore(&local->sta_lock, flags); |
423 | 419 | ||
424 | #ifdef CONFIG_MAC80211_VERBOSE_DEBUG | 420 | #ifdef CONFIG_MAC80211_VERBOSE_DEBUG |
425 | wiphy_debug(local->hw.wiphy, "Added IBSS STA %pM\n", | 421 | wiphy_debug(local->hw.wiphy, "Added IBSS STA %pM\n", |
426 | sta->sta.addr); | 422 | sta->sta.addr); |
427 | #endif /* CONFIG_MAC80211_VERBOSE_DEBUG */ | 423 | #endif /* CONFIG_MAC80211_VERBOSE_DEBUG */ |
428 | 424 | ||
429 | ieee80211_queue_work(&local->hw, &local->sta_finish_work); | 425 | ieee80211_queue_work(&local->hw, &local->sta_finish_work); |
430 | 426 | ||
431 | return 0; | 427 | return 0; |
432 | } | 428 | } |
433 | 429 | ||
434 | /* | 430 | /* |
435 | * At first glance, this will look racy, because the code | 431 | * At first glance, this will look racy, because the code |
436 | * below this point, which inserts a station with sleeping, | 432 | * below this point, which inserts a station with sleeping, |
437 | * unlocks the sta_lock between checking existence in the | 433 | * unlocks the sta_lock between checking existence in the |
438 | * hash table and inserting into it. | 434 | * hash table and inserting into it. |
439 | * | 435 | * |
440 | * However, it is not racy against itself because it keeps | 436 | * However, it is not racy against itself because it keeps |
441 | * the mutex locked. It still seems to race against the | 437 | * the mutex locked. It still seems to race against the |
442 | * above code that atomically inserts the station... That, | 438 | * above code that atomically inserts the station... That, |
443 | * however, is not true because the above code can only | 439 | * however, is not true because the above code can only |
444 | * be invoked for IBSS interfaces, and the below code will | 440 | * be invoked for IBSS interfaces, and the below code will |
445 | * not be -- and the two do not race against each other as | 441 | * not be -- and the two do not race against each other as |
446 | * the hash table also keys off the interface. | 442 | * the hash table also keys off the interface. |
447 | */ | 443 | */ |
448 | 444 | ||
449 | might_sleep(); | 445 | might_sleep(); |
450 | 446 | ||
451 | mutex_lock(&local->sta_mtx); | 447 | mutex_lock(&local->sta_mtx); |
452 | 448 | ||
453 | spin_lock_irqsave(&local->sta_lock, flags); | 449 | spin_lock_irqsave(&local->sta_lock, flags); |
454 | /* check if STA exists already */ | 450 | /* check if STA exists already */ |
455 | if (sta_info_get_bss(sdata, sta->sta.addr)) { | 451 | if (sta_info_get_bss(sdata, sta->sta.addr)) { |
456 | spin_unlock_irqrestore(&local->sta_lock, flags); | 452 | spin_unlock_irqrestore(&local->sta_lock, flags); |
457 | mutex_unlock(&local->sta_mtx); | 453 | mutex_unlock(&local->sta_mtx); |
458 | rcu_read_lock(); | 454 | rcu_read_lock(); |
459 | err = -EEXIST; | 455 | err = -EEXIST; |
460 | goto out_free; | 456 | goto out_free; |
461 | } | 457 | } |
462 | 458 | ||
463 | spin_unlock_irqrestore(&local->sta_lock, flags); | 459 | spin_unlock_irqrestore(&local->sta_lock, flags); |
464 | 460 | ||
465 | err = sta_info_finish_insert(sta, false); | 461 | err = sta_info_finish_insert(sta, false); |
466 | if (err) { | 462 | if (err) { |
467 | mutex_unlock(&local->sta_mtx); | 463 | mutex_unlock(&local->sta_mtx); |
468 | rcu_read_lock(); | 464 | rcu_read_lock(); |
469 | goto out_free; | 465 | goto out_free; |
470 | } | 466 | } |
471 | 467 | ||
472 | #ifdef CONFIG_MAC80211_VERBOSE_DEBUG | 468 | #ifdef CONFIG_MAC80211_VERBOSE_DEBUG |
473 | wiphy_debug(local->hw.wiphy, "Inserted STA %pM\n", sta->sta.addr); | 469 | wiphy_debug(local->hw.wiphy, "Inserted STA %pM\n", sta->sta.addr); |
474 | #endif /* CONFIG_MAC80211_VERBOSE_DEBUG */ | 470 | #endif /* CONFIG_MAC80211_VERBOSE_DEBUG */ |
475 | 471 | ||
476 | /* move reference to rcu-protected */ | 472 | /* move reference to rcu-protected */ |
477 | rcu_read_lock(); | 473 | rcu_read_lock(); |
478 | mutex_unlock(&local->sta_mtx); | 474 | mutex_unlock(&local->sta_mtx); |
479 | 475 | ||
480 | if (ieee80211_vif_is_mesh(&sdata->vif)) | 476 | if (ieee80211_vif_is_mesh(&sdata->vif)) |
481 | mesh_accept_plinks_update(sdata); | 477 | mesh_accept_plinks_update(sdata); |
482 | 478 | ||
483 | return 0; | 479 | return 0; |
484 | out_free: | 480 | out_free: |
485 | BUG_ON(!err); | 481 | BUG_ON(!err); |
486 | __sta_info_free(local, sta); | 482 | __sta_info_free(local, sta); |
487 | return err; | 483 | return err; |
488 | } | 484 | } |
489 | 485 | ||
490 | int sta_info_insert(struct sta_info *sta) | 486 | int sta_info_insert(struct sta_info *sta) |
491 | { | 487 | { |
492 | int err = sta_info_insert_rcu(sta); | 488 | int err = sta_info_insert_rcu(sta); |
493 | 489 | ||
494 | rcu_read_unlock(); | 490 | rcu_read_unlock(); |
495 | 491 | ||
496 | return err; | 492 | return err; |
497 | } | 493 | } |
498 | 494 | ||
499 | static inline void __bss_tim_set(struct ieee80211_if_ap *bss, u16 aid) | 495 | static inline void __bss_tim_set(struct ieee80211_if_ap *bss, u16 aid) |
500 | { | 496 | { |
501 | /* | 497 | /* |
502 | * This format has been mandated by the IEEE specifications, | 498 | * This format has been mandated by the IEEE specifications, |
503 | * so this line may not be changed to use the __set_bit() format. | 499 | * so this line may not be changed to use the __set_bit() format. |
504 | */ | 500 | */ |
505 | bss->tim[aid / 8] |= (1 << (aid % 8)); | 501 | bss->tim[aid / 8] |= (1 << (aid % 8)); |
506 | } | 502 | } |
507 | 503 | ||
508 | static inline void __bss_tim_clear(struct ieee80211_if_ap *bss, u16 aid) | 504 | static inline void __bss_tim_clear(struct ieee80211_if_ap *bss, u16 aid) |
509 | { | 505 | { |
510 | /* | 506 | /* |
511 | * This format has been mandated by the IEEE specifications, | 507 | * This format has been mandated by the IEEE specifications, |
512 | * so this line may not be changed to use the __clear_bit() format. | 508 | * so this line may not be changed to use the __clear_bit() format. |
513 | */ | 509 | */ |
514 | bss->tim[aid / 8] &= ~(1 << (aid % 8)); | 510 | bss->tim[aid / 8] &= ~(1 << (aid % 8)); |
515 | } | 511 | } |
516 | 512 | ||
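To make the aid / 8 and aid % 8 arithmetic above concrete, a small standalone program with an arbitrary example AID (11 is not a value taken from this code):

	/*
	 * Standalone illustration of the TIM bitmap layout used above:
	 * AID 11 lands in byte 1, bit 3 of the bitmap.
	 */
	#include <stdio.h>

	int main(void)
	{
		unsigned char tim[32] = { 0 };
		unsigned int aid = 11;               /* hypothetical association ID */

		tim[aid / 8] |= (unsigned char)(1 << (aid % 8));    /* set */
		printf("byte %u = 0x%02x\n", aid / 8, tim[aid / 8]); /* byte 1 = 0x08 */

		tim[aid / 8] &= (unsigned char)~(1 << (aid % 8));   /* clear */
		printf("byte %u = 0x%02x\n", aid / 8, tim[aid / 8]); /* byte 1 = 0x00 */
		return 0;
	}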
517 | static void __sta_info_set_tim_bit(struct ieee80211_if_ap *bss, | 513 | static void __sta_info_set_tim_bit(struct ieee80211_if_ap *bss, |
518 | struct sta_info *sta) | 514 | struct sta_info *sta) |
519 | { | 515 | { |
520 | BUG_ON(!bss); | 516 | BUG_ON(!bss); |
521 | 517 | ||
522 | __bss_tim_set(bss, sta->sta.aid); | 518 | __bss_tim_set(bss, sta->sta.aid); |
523 | 519 | ||
524 | if (sta->local->ops->set_tim) { | 520 | if (sta->local->ops->set_tim) { |
525 | sta->local->tim_in_locked_section = true; | 521 | sta->local->tim_in_locked_section = true; |
526 | drv_set_tim(sta->local, &sta->sta, true); | 522 | drv_set_tim(sta->local, &sta->sta, true); |
527 | sta->local->tim_in_locked_section = false; | 523 | sta->local->tim_in_locked_section = false; |
528 | } | 524 | } |
529 | } | 525 | } |
530 | 526 | ||
531 | void sta_info_set_tim_bit(struct sta_info *sta) | 527 | void sta_info_set_tim_bit(struct sta_info *sta) |
532 | { | 528 | { |
533 | unsigned long flags; | 529 | unsigned long flags; |
534 | 530 | ||
535 | BUG_ON(!sta->sdata->bss); | 531 | BUG_ON(!sta->sdata->bss); |
536 | 532 | ||
537 | spin_lock_irqsave(&sta->local->sta_lock, flags); | 533 | spin_lock_irqsave(&sta->local->sta_lock, flags); |
538 | __sta_info_set_tim_bit(sta->sdata->bss, sta); | 534 | __sta_info_set_tim_bit(sta->sdata->bss, sta); |
539 | spin_unlock_irqrestore(&sta->local->sta_lock, flags); | 535 | spin_unlock_irqrestore(&sta->local->sta_lock, flags); |
540 | } | 536 | } |
541 | 537 | ||
542 | static void __sta_info_clear_tim_bit(struct ieee80211_if_ap *bss, | 538 | static void __sta_info_clear_tim_bit(struct ieee80211_if_ap *bss, |
543 | struct sta_info *sta) | 539 | struct sta_info *sta) |
544 | { | 540 | { |
545 | BUG_ON(!bss); | 541 | BUG_ON(!bss); |
546 | 542 | ||
547 | __bss_tim_clear(bss, sta->sta.aid); | 543 | __bss_tim_clear(bss, sta->sta.aid); |
548 | 544 | ||
549 | if (sta->local->ops->set_tim) { | 545 | if (sta->local->ops->set_tim) { |
550 | sta->local->tim_in_locked_section = true; | 546 | sta->local->tim_in_locked_section = true; |
551 | drv_set_tim(sta->local, &sta->sta, false); | 547 | drv_set_tim(sta->local, &sta->sta, false); |
552 | sta->local->tim_in_locked_section = false; | 548 | sta->local->tim_in_locked_section = false; |
553 | } | 549 | } |
554 | } | 550 | } |
555 | 551 | ||
556 | void sta_info_clear_tim_bit(struct sta_info *sta) | 552 | void sta_info_clear_tim_bit(struct sta_info *sta) |
557 | { | 553 | { |
558 | unsigned long flags; | 554 | unsigned long flags; |
559 | 555 | ||
560 | BUG_ON(!sta->sdata->bss); | 556 | BUG_ON(!sta->sdata->bss); |
561 | 557 | ||
562 | spin_lock_irqsave(&sta->local->sta_lock, flags); | 558 | spin_lock_irqsave(&sta->local->sta_lock, flags); |
563 | __sta_info_clear_tim_bit(sta->sdata->bss, sta); | 559 | __sta_info_clear_tim_bit(sta->sdata->bss, sta); |
564 | spin_unlock_irqrestore(&sta->local->sta_lock, flags); | 560 | spin_unlock_irqrestore(&sta->local->sta_lock, flags); |
565 | } | 561 | } |
566 | 562 | ||
567 | static int sta_info_buffer_expired(struct sta_info *sta, | 563 | static int sta_info_buffer_expired(struct sta_info *sta, |
568 | struct sk_buff *skb) | 564 | struct sk_buff *skb) |
569 | { | 565 | { |
570 | struct ieee80211_tx_info *info; | 566 | struct ieee80211_tx_info *info; |
571 | int timeout; | 567 | int timeout; |
572 | 568 | ||
573 | if (!skb) | 569 | if (!skb) |
574 | return 0; | 570 | return 0; |
575 | 571 | ||
576 | info = IEEE80211_SKB_CB(skb); | 572 | info = IEEE80211_SKB_CB(skb); |
577 | 573 | ||
578 | /* Timeout: (2 * listen_interval * beacon_int * 1024 / 1000000) sec */ | 574 | /* Timeout: (2 * listen_interval * beacon_int * 1024 / 1000000) sec */ |
579 | timeout = (sta->listen_interval * | 575 | timeout = (sta->listen_interval * |
580 | sta->sdata->vif.bss_conf.beacon_int * | 576 | sta->sdata->vif.bss_conf.beacon_int * |
581 | 32 / 15625) * HZ; | 577 | 32 / 15625) * HZ; |
582 | if (timeout < STA_TX_BUFFER_EXPIRE) | 578 | if (timeout < STA_TX_BUFFER_EXPIRE) |
583 | timeout = STA_TX_BUFFER_EXPIRE; | 579 | timeout = STA_TX_BUFFER_EXPIRE; |
584 | return time_after(jiffies, info->control.jiffies + timeout); | 580 | return time_after(jiffies, info->control.jiffies + timeout); |
585 | } | 581 | } |
586 | 582 | ||
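The factor 32 / 15625 in sta_info_buffer_expired() is the comment's 2 * 1024 / 1000000 reduced to lowest terms (2048/1000000 = 32/15625). A throwaway check with invented listen_interval and beacon_int values, neither taken from a real configuration:

	/*
	 * Quick check that 32/15625 matches 2 * 1024 / 1000000; the input
	 * values below are example numbers only.
	 */
	#include <stdio.h>

	int main(void)
	{
		long listen_interval = 10;   /* hypothetical */
		long beacon_int = 100;       /* hypothetical, in TU (1 TU = 1024 us) */

		double secs_a = listen_interval * beacon_int * 2.0 * 1024.0 / 1000000.0;
		double secs_b = listen_interval * beacon_int * 32.0 / 15625.0;

		printf("%.6f s vs %.6f s\n", secs_a, secs_b);  /* both print 2.048000 */
		return 0;
	}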
587 | 583 | ||
588 | static bool sta_info_cleanup_expire_buffered(struct ieee80211_local *local, | 584 | static bool sta_info_cleanup_expire_buffered(struct ieee80211_local *local, |
589 | struct sta_info *sta) | 585 | struct sta_info *sta) |
590 | { | 586 | { |
591 | unsigned long flags; | 587 | unsigned long flags; |
592 | struct sk_buff *skb; | 588 | struct sk_buff *skb; |
593 | 589 | ||
594 | if (skb_queue_empty(&sta->ps_tx_buf)) | 590 | if (skb_queue_empty(&sta->ps_tx_buf)) |
595 | return false; | 591 | return false; |
596 | 592 | ||
597 | for (;;) { | 593 | for (;;) { |
598 | spin_lock_irqsave(&sta->ps_tx_buf.lock, flags); | 594 | spin_lock_irqsave(&sta->ps_tx_buf.lock, flags); |
599 | skb = skb_peek(&sta->ps_tx_buf); | 595 | skb = skb_peek(&sta->ps_tx_buf); |
600 | if (sta_info_buffer_expired(sta, skb)) | 596 | if (sta_info_buffer_expired(sta, skb)) |
601 | skb = __skb_dequeue(&sta->ps_tx_buf); | 597 | skb = __skb_dequeue(&sta->ps_tx_buf); |
602 | else | 598 | else |
603 | skb = NULL; | 599 | skb = NULL; |
604 | spin_unlock_irqrestore(&sta->ps_tx_buf.lock, flags); | 600 | spin_unlock_irqrestore(&sta->ps_tx_buf.lock, flags); |
605 | 601 | ||
606 | if (!skb) | 602 | if (!skb) |
607 | break; | 603 | break; |
608 | 604 | ||
609 | local->total_ps_buffered--; | 605 | local->total_ps_buffered--; |
610 | #ifdef CONFIG_MAC80211_VERBOSE_PS_DEBUG | 606 | #ifdef CONFIG_MAC80211_VERBOSE_PS_DEBUG |
611 | printk(KERN_DEBUG "Buffered frame expired (STA %pM)\n", | 607 | printk(KERN_DEBUG "Buffered frame expired (STA %pM)\n", |
612 | sta->sta.addr); | 608 | sta->sta.addr); |
613 | #endif | 609 | #endif |
614 | dev_kfree_skb(skb); | 610 | dev_kfree_skb(skb); |
615 | 611 | ||
616 | if (skb_queue_empty(&sta->ps_tx_buf) && | 612 | if (skb_queue_empty(&sta->ps_tx_buf) && |
617 | !test_sta_flags(sta, WLAN_STA_PS_DRIVER_BUF)) | 613 | !test_sta_flags(sta, WLAN_STA_PS_DRIVER_BUF)) |
618 | sta_info_clear_tim_bit(sta); | 614 | sta_info_clear_tim_bit(sta); |
619 | } | 615 | } |
620 | 616 | ||
621 | return true; | 617 | return true; |
622 | } | 618 | } |
623 | 619 | ||
624 | static int __must_check __sta_info_destroy(struct sta_info *sta) | 620 | static int __must_check __sta_info_destroy(struct sta_info *sta) |
625 | { | 621 | { |
626 | struct ieee80211_local *local; | 622 | struct ieee80211_local *local; |
627 | struct ieee80211_sub_if_data *sdata; | 623 | struct ieee80211_sub_if_data *sdata; |
628 | struct sk_buff *skb; | 624 | struct sk_buff *skb; |
629 | unsigned long flags; | 625 | unsigned long flags; |
630 | int ret, i; | 626 | int ret, i; |
631 | 627 | ||
632 | might_sleep(); | 628 | might_sleep(); |
633 | 629 | ||
634 | if (!sta) | 630 | if (!sta) |
635 | return -ENOENT; | 631 | return -ENOENT; |
636 | 632 | ||
637 | local = sta->local; | 633 | local = sta->local; |
638 | sdata = sta->sdata; | 634 | sdata = sta->sdata; |
639 | 635 | ||
640 | /* | 636 | /* |
641 | * Before removing the station from the driver and | 637 | * Before removing the station from the driver and |
642 | * rate control, it might still start new aggregation | 638 | * rate control, it might still start new aggregation |
643 | * sessions -- block that to make sure the tear-down | 639 | * sessions -- block that to make sure the tear-down |
644 | * will be sufficient. | 640 | * will be sufficient. |
645 | */ | 641 | */ |
646 | set_sta_flags(sta, WLAN_STA_BLOCK_BA); | 642 | set_sta_flags(sta, WLAN_STA_BLOCK_BA); |
647 | ieee80211_sta_tear_down_BA_sessions(sta, true); | 643 | ieee80211_sta_tear_down_BA_sessions(sta, true); |
648 | 644 | ||
649 | spin_lock_irqsave(&local->sta_lock, flags); | 645 | spin_lock_irqsave(&local->sta_lock, flags); |
650 | ret = sta_info_hash_del(local, sta); | 646 | ret = sta_info_hash_del(local, sta); |
651 | /* this might still be on the pending list ... which is fine */ | 647 | /* this might still be on the pending list ... which is fine */ |
652 | if (!ret) | 648 | if (!ret) |
653 | list_del(&sta->list); | 649 | list_del(&sta->list); |
654 | spin_unlock_irqrestore(&local->sta_lock, flags); | 650 | spin_unlock_irqrestore(&local->sta_lock, flags); |
655 | if (ret) | 651 | if (ret) |
656 | return ret; | 652 | return ret; |
657 | 653 | ||
658 | mutex_lock(&local->key_mtx); | 654 | mutex_lock(&local->key_mtx); |
659 | for (i = 0; i < NUM_DEFAULT_KEYS; i++) | 655 | for (i = 0; i < NUM_DEFAULT_KEYS; i++) |
660 | __ieee80211_key_free(key_mtx_dereference(local, sta->gtk[i])); | 656 | __ieee80211_key_free(key_mtx_dereference(local, sta->gtk[i])); |
661 | if (sta->ptk) | 657 | if (sta->ptk) |
662 | __ieee80211_key_free(key_mtx_dereference(local, sta->ptk)); | 658 | __ieee80211_key_free(key_mtx_dereference(local, sta->ptk)); |
663 | mutex_unlock(&local->key_mtx); | 659 | mutex_unlock(&local->key_mtx); |
664 | 660 | ||
665 | sta->dead = true; | 661 | sta->dead = true; |
666 | 662 | ||
667 | if (test_and_clear_sta_flags(sta, | 663 | if (test_and_clear_sta_flags(sta, |
668 | WLAN_STA_PS_STA | WLAN_STA_PS_DRIVER)) { | 664 | WLAN_STA_PS_STA | WLAN_STA_PS_DRIVER)) { |
669 | BUG_ON(!sdata->bss); | 665 | BUG_ON(!sdata->bss); |
670 | 666 | ||
671 | atomic_dec(&sdata->bss->num_sta_ps); | 667 | atomic_dec(&sdata->bss->num_sta_ps); |
672 | __sta_info_clear_tim_bit(sdata->bss, sta); | 668 | __sta_info_clear_tim_bit(sdata->bss, sta); |
673 | } | 669 | } |
674 | 670 | ||
675 | local->num_sta--; | 671 | local->num_sta--; |
676 | local->sta_generation++; | 672 | local->sta_generation++; |
677 | 673 | ||
678 | if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN) | 674 | if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN) |
679 | rcu_assign_pointer(sdata->u.vlan.sta, NULL); | 675 | rcu_assign_pointer(sdata->u.vlan.sta, NULL); |
680 | 676 | ||
681 | if (sta->uploaded) { | 677 | if (sta->uploaded) { |
682 | if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN) | 678 | if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN) |
683 | sdata = container_of(sdata->bss, | 679 | sdata = container_of(sdata->bss, |
684 | struct ieee80211_sub_if_data, | 680 | struct ieee80211_sub_if_data, |
685 | u.ap); | 681 | u.ap); |
686 | drv_sta_remove(local, sdata, &sta->sta); | 682 | drv_sta_remove(local, sdata, &sta->sta); |
687 | sdata = sta->sdata; | 683 | sdata = sta->sdata; |
688 | } | 684 | } |
689 | 685 | ||
690 | /* | 686 | /* |
691 | * At this point, after we wait for an RCU grace period, | 687 | * At this point, after we wait for an RCU grace period, |
692 | * neither mac80211 nor the driver can reference this | 688 | * neither mac80211 nor the driver can reference this |
693 | * sta struct any more except by still existing timers | 689 | * sta struct any more except by still existing timers |
694 | * associated with this station that we clean up below. | 690 | * associated with this station that we clean up below. |
695 | */ | 691 | */ |
696 | synchronize_rcu(); | 692 | synchronize_rcu(); |
697 | 693 | ||
698 | #ifdef CONFIG_MAC80211_MESH | 694 | #ifdef CONFIG_MAC80211_MESH |
699 | if (ieee80211_vif_is_mesh(&sdata->vif)) | 695 | if (ieee80211_vif_is_mesh(&sdata->vif)) |
700 | mesh_accept_plinks_update(sdata); | 696 | mesh_accept_plinks_update(sdata); |
701 | #endif | 697 | #endif |
702 | 698 | ||
703 | #ifdef CONFIG_MAC80211_VERBOSE_DEBUG | 699 | #ifdef CONFIG_MAC80211_VERBOSE_DEBUG |
704 | wiphy_debug(local->hw.wiphy, "Removed STA %pM\n", sta->sta.addr); | 700 | wiphy_debug(local->hw.wiphy, "Removed STA %pM\n", sta->sta.addr); |
705 | #endif /* CONFIG_MAC80211_VERBOSE_DEBUG */ | 701 | #endif /* CONFIG_MAC80211_VERBOSE_DEBUG */ |
706 | cancel_work_sync(&sta->drv_unblock_wk); | 702 | cancel_work_sync(&sta->drv_unblock_wk); |
707 | 703 | ||
708 | cfg80211_del_sta(sdata->dev, sta->sta.addr, GFP_KERNEL); | 704 | cfg80211_del_sta(sdata->dev, sta->sta.addr, GFP_KERNEL); |
709 | 705 | ||
710 | rate_control_remove_sta_debugfs(sta); | 706 | rate_control_remove_sta_debugfs(sta); |
711 | ieee80211_sta_debugfs_remove(sta); | 707 | ieee80211_sta_debugfs_remove(sta); |
712 | 708 | ||
713 | #ifdef CONFIG_MAC80211_MESH | 709 | #ifdef CONFIG_MAC80211_MESH |
714 | if (ieee80211_vif_is_mesh(&sta->sdata->vif)) { | 710 | if (ieee80211_vif_is_mesh(&sta->sdata->vif)) { |
715 | mesh_plink_deactivate(sta); | 711 | mesh_plink_deactivate(sta); |
716 | del_timer_sync(&sta->plink_timer); | 712 | del_timer_sync(&sta->plink_timer); |
717 | } | 713 | } |
718 | #endif | 714 | #endif |
719 | 715 | ||
720 | while ((skb = skb_dequeue(&sta->ps_tx_buf)) != NULL) { | 716 | while ((skb = skb_dequeue(&sta->ps_tx_buf)) != NULL) { |
721 | local->total_ps_buffered--; | 717 | local->total_ps_buffered--; |
722 | dev_kfree_skb_any(skb); | 718 | dev_kfree_skb_any(skb); |
723 | } | 719 | } |
724 | 720 | ||
725 | while ((skb = skb_dequeue(&sta->tx_filtered)) != NULL) | 721 | while ((skb = skb_dequeue(&sta->tx_filtered)) != NULL) |
726 | dev_kfree_skb_any(skb); | 722 | dev_kfree_skb_any(skb); |
727 | 723 | ||
728 | __sta_info_free(local, sta); | 724 | __sta_info_free(local, sta); |
729 | 725 | ||
730 | return 0; | 726 | return 0; |
731 | } | 727 | } |
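Editor's note: the teardown above follows the standard RCU removal sequence seen in this hunk: the station is first unpublished from all RCU-visible pointers (e.g. rcu_assign_pointer(sdata->u.vlan.sta, NULL)), synchronize_rcu() then waits out any readers that might still be traversing those structures, and only afterwards are timers cancelled and the memory freed. A minimal sketch of that unpublish/wait/free pattern, using hypothetical names (my_obj, my_obj_slot, my_obj_lock) rather than mac80211's real structures:

    #include <linux/rcupdate.h>
    #include <linux/slab.h>
    #include <linux/spinlock.h>

    /* Hypothetical object published through an __rcu pointer. */
    struct my_obj {
            int data;
    };

    static struct my_obj __rcu *my_obj_slot;
    static DEFINE_SPINLOCK(my_obj_lock);

    static void my_obj_destroy(void)
    {
            struct my_obj *obj;

            spin_lock(&my_obj_lock);
            obj = rcu_dereference_protected(my_obj_slot,
                                            lockdep_is_held(&my_obj_lock));
            rcu_assign_pointer(my_obj_slot, NULL);  /* unpublish */
            spin_unlock(&my_obj_lock);

            if (!obj)
                    return;

            synchronize_rcu();      /* wait for all pre-existing readers */
            kfree(obj);             /* no reader can still see obj now */
    }

Blocking in synchronize_rcu() is what lets the caller cancel timers and free the entry inline afterwards; a call_rcu() callback would defer that work instead.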
732 | 728 | ||
733 | int sta_info_destroy_addr(struct ieee80211_sub_if_data *sdata, const u8 *addr) | 729 | int sta_info_destroy_addr(struct ieee80211_sub_if_data *sdata, const u8 *addr) |
734 | { | 730 | { |
735 | struct sta_info *sta; | 731 | struct sta_info *sta; |
736 | int ret; | 732 | int ret; |
737 | 733 | ||
738 | mutex_lock(&sdata->local->sta_mtx); | 734 | mutex_lock(&sdata->local->sta_mtx); |
739 | sta = sta_info_get(sdata, addr); | 735 | sta = sta_info_get(sdata, addr); |
740 | ret = __sta_info_destroy(sta); | 736 | ret = __sta_info_destroy(sta); |
741 | mutex_unlock(&sdata->local->sta_mtx); | 737 | mutex_unlock(&sdata->local->sta_mtx); |
742 | 738 | ||
743 | return ret; | 739 | return ret; |
744 | } | 740 | } |
745 | 741 | ||
746 | int sta_info_destroy_addr_bss(struct ieee80211_sub_if_data *sdata, | 742 | int sta_info_destroy_addr_bss(struct ieee80211_sub_if_data *sdata, |
747 | const u8 *addr) | 743 | const u8 *addr) |
748 | { | 744 | { |
749 | struct sta_info *sta; | 745 | struct sta_info *sta; |
750 | int ret; | 746 | int ret; |
751 | 747 | ||
752 | mutex_lock(&sdata->local->sta_mtx); | 748 | mutex_lock(&sdata->local->sta_mtx); |
753 | sta = sta_info_get_bss(sdata, addr); | 749 | sta = sta_info_get_bss(sdata, addr); |
754 | ret = __sta_info_destroy(sta); | 750 | ret = __sta_info_destroy(sta); |
755 | mutex_unlock(&sdata->local->sta_mtx); | 751 | mutex_unlock(&sdata->local->sta_mtx); |
756 | 752 | ||
757 | return ret; | 753 | return ret; |
758 | } | 754 | } |
759 | 755 | ||
760 | static void sta_info_cleanup(unsigned long data) | 756 | static void sta_info_cleanup(unsigned long data) |
761 | { | 757 | { |
762 | struct ieee80211_local *local = (struct ieee80211_local *) data; | 758 | struct ieee80211_local *local = (struct ieee80211_local *) data; |
763 | struct sta_info *sta; | 759 | struct sta_info *sta; |
764 | bool timer_needed = false; | 760 | bool timer_needed = false; |
765 | 761 | ||
766 | rcu_read_lock(); | 762 | rcu_read_lock(); |
767 | list_for_each_entry_rcu(sta, &local->sta_list, list) | 763 | list_for_each_entry_rcu(sta, &local->sta_list, list) |
768 | if (sta_info_cleanup_expire_buffered(local, sta)) | 764 | if (sta_info_cleanup_expire_buffered(local, sta)) |
769 | timer_needed = true; | 765 | timer_needed = true; |
770 | rcu_read_unlock(); | 766 | rcu_read_unlock(); |
771 | 767 | ||
772 | if (local->quiescing) | 768 | if (local->quiescing) |
773 | return; | 769 | return; |
774 | 770 | ||
775 | if (!timer_needed) | 771 | if (!timer_needed) |
776 | return; | 772 | return; |
777 | 773 | ||
778 | mod_timer(&local->sta_cleanup, | 774 | mod_timer(&local->sta_cleanup, |
779 | round_jiffies(jiffies + STA_INFO_CLEANUP_INTERVAL)); | 775 | round_jiffies(jiffies + STA_INFO_CLEANUP_INTERVAL)); |
780 | } | 776 | } |
781 | 777 | ||
782 | void sta_info_init(struct ieee80211_local *local) | 778 | void sta_info_init(struct ieee80211_local *local) |
783 | { | 779 | { |
784 | spin_lock_init(&local->sta_lock); | 780 | spin_lock_init(&local->sta_lock); |
785 | mutex_init(&local->sta_mtx); | 781 | mutex_init(&local->sta_mtx); |
786 | INIT_LIST_HEAD(&local->sta_list); | 782 | INIT_LIST_HEAD(&local->sta_list); |
787 | INIT_LIST_HEAD(&local->sta_pending_list); | 783 | INIT_LIST_HEAD(&local->sta_pending_list); |
788 | INIT_WORK(&local->sta_finish_work, sta_info_finish_work); | 784 | INIT_WORK(&local->sta_finish_work, sta_info_finish_work); |
789 | 785 | ||
790 | setup_timer(&local->sta_cleanup, sta_info_cleanup, | 786 | setup_timer(&local->sta_cleanup, sta_info_cleanup, |
791 | (unsigned long)local); | 787 | (unsigned long)local); |
792 | } | 788 | } |
793 | 789 | ||
794 | void sta_info_stop(struct ieee80211_local *local) | 790 | void sta_info_stop(struct ieee80211_local *local) |
795 | { | 791 | { |
796 | del_timer(&local->sta_cleanup); | 792 | del_timer(&local->sta_cleanup); |
797 | sta_info_flush(local, NULL); | 793 | sta_info_flush(local, NULL); |
798 | } | 794 | } |
799 | 795 | ||
800 | /** | 796 | /** |
801 | * sta_info_flush - flush matching STA entries from the STA table | 797 | * sta_info_flush - flush matching STA entries from the STA table |
802 | * | 798 | * |
803 | * Returns the number of removed STA entries. | 799 | * Returns the number of removed STA entries. |
804 | * | 800 | * |
805 | * @local: local interface data | 801 | * @local: local interface data |
806 | * @sdata: matching rule for the net device (sta->dev) or %NULL to match all STAs | 802 | * @sdata: matching rule for the net device (sta->dev) or %NULL to match all STAs |
807 | */ | 803 | */ |
808 | int sta_info_flush(struct ieee80211_local *local, | 804 | int sta_info_flush(struct ieee80211_local *local, |
809 | struct ieee80211_sub_if_data *sdata) | 805 | struct ieee80211_sub_if_data *sdata) |
810 | { | 806 | { |
811 | struct sta_info *sta, *tmp; | 807 | struct sta_info *sta, *tmp; |
812 | int ret = 0; | 808 | int ret = 0; |
813 | 809 | ||
814 | might_sleep(); | 810 | might_sleep(); |
815 | 811 | ||
816 | mutex_lock(&local->sta_mtx); | 812 | mutex_lock(&local->sta_mtx); |
817 | 813 | ||
818 | sta_info_finish_pending(local); | 814 | sta_info_finish_pending(local); |
819 | 815 | ||
820 | list_for_each_entry_safe(sta, tmp, &local->sta_list, list) { | 816 | list_for_each_entry_safe(sta, tmp, &local->sta_list, list) { |
821 | if (!sdata || sdata == sta->sdata) | 817 | if (!sdata || sdata == sta->sdata) |
822 | WARN_ON(__sta_info_destroy(sta)); | 818 | WARN_ON(__sta_info_destroy(sta)); |
823 | } | 819 | } |
824 | mutex_unlock(&local->sta_mtx); | 820 | mutex_unlock(&local->sta_mtx); |
825 | 821 | ||
826 | return ret; | 822 | return ret; |
827 | } | 823 | } |
828 | 824 | ||
829 | void ieee80211_sta_expire(struct ieee80211_sub_if_data *sdata, | 825 | void ieee80211_sta_expire(struct ieee80211_sub_if_data *sdata, |
830 | unsigned long exp_time) | 826 | unsigned long exp_time) |
831 | { | 827 | { |
832 | struct ieee80211_local *local = sdata->local; | 828 | struct ieee80211_local *local = sdata->local; |
833 | struct sta_info *sta, *tmp; | 829 | struct sta_info *sta, *tmp; |
834 | 830 | ||
835 | mutex_lock(&local->sta_mtx); | 831 | mutex_lock(&local->sta_mtx); |
836 | list_for_each_entry_safe(sta, tmp, &local->sta_list, list) | 832 | list_for_each_entry_safe(sta, tmp, &local->sta_list, list) |
837 | if (time_after(jiffies, sta->last_rx + exp_time)) { | 833 | if (time_after(jiffies, sta->last_rx + exp_time)) { |
838 | #ifdef CONFIG_MAC80211_IBSS_DEBUG | 834 | #ifdef CONFIG_MAC80211_IBSS_DEBUG |
839 | printk(KERN_DEBUG "%s: expiring inactive STA %pM\n", | 835 | printk(KERN_DEBUG "%s: expiring inactive STA %pM\n", |
840 | sdata->name, sta->sta.addr); | 836 | sdata->name, sta->sta.addr); |
841 | #endif | 837 | #endif |
842 | WARN_ON(__sta_info_destroy(sta)); | 838 | WARN_ON(__sta_info_destroy(sta)); |
843 | } | 839 | } |
844 | mutex_unlock(&local->sta_mtx); | 840 | mutex_unlock(&local->sta_mtx); |
845 | } | 841 | } |
846 | 842 | ||
847 | struct ieee80211_sta *ieee80211_find_sta_by_ifaddr(struct ieee80211_hw *hw, | 843 | struct ieee80211_sta *ieee80211_find_sta_by_ifaddr(struct ieee80211_hw *hw, |
848 | const u8 *addr, | 844 | const u8 *addr, |
849 | const u8 *localaddr) | 845 | const u8 *localaddr) |
850 | { | 846 | { |
851 | struct sta_info *sta, *nxt; | 847 | struct sta_info *sta, *nxt; |
852 | 848 | ||
853 | /* | 849 | /* |
854 | * Just return a random station if localaddr is NULL | 850 | * Just return a random station if localaddr is NULL |
855 | * ... first in list. | 851 | * ... first in list. |
856 | */ | 852 | */ |
857 | for_each_sta_info(hw_to_local(hw), addr, sta, nxt) { | 853 | for_each_sta_info(hw_to_local(hw), addr, sta, nxt) { |
858 | if (localaddr && | 854 | if (localaddr && |
859 | compare_ether_addr(sta->sdata->vif.addr, localaddr) != 0) | 855 | compare_ether_addr(sta->sdata->vif.addr, localaddr) != 0) |
860 | continue; | 856 | continue; |
861 | if (!sta->uploaded) | 857 | if (!sta->uploaded) |
862 | return NULL; | 858 | return NULL; |
863 | return &sta->sta; | 859 | return &sta->sta; |
864 | } | 860 | } |
865 | 861 | ||
866 | return NULL; | 862 | return NULL; |
867 | } | 863 | } |
868 | EXPORT_SYMBOL_GPL(ieee80211_find_sta_by_ifaddr); | 864 | EXPORT_SYMBOL_GPL(ieee80211_find_sta_by_ifaddr); |
869 | 865 | ||
870 | struct ieee80211_sta *ieee80211_find_sta(struct ieee80211_vif *vif, | 866 | struct ieee80211_sta *ieee80211_find_sta(struct ieee80211_vif *vif, |
871 | const u8 *addr) | 867 | const u8 *addr) |
872 | { | 868 | { |
873 | struct sta_info *sta; | 869 | struct sta_info *sta; |
874 | 870 | ||
875 | if (!vif) | 871 | if (!vif) |
876 | return NULL; | 872 | return NULL; |
877 | 873 | ||
878 | sta = sta_info_get_bss(vif_to_sdata(vif), addr); | 874 | sta = sta_info_get_bss(vif_to_sdata(vif), addr); |
879 | if (!sta) | 875 | if (!sta) |
880 | return NULL; | 876 | return NULL; |
881 | 877 | ||
882 | if (!sta->uploaded) | 878 | if (!sta->uploaded) |
883 | return NULL; | 879 | return NULL; |
884 | 880 | ||
885 | return &sta->sta; | 881 | return &sta->sta; |
886 | } | 882 | } |
887 | EXPORT_SYMBOL(ieee80211_find_sta); | 883 | EXPORT_SYMBOL(ieee80211_find_sta); |
888 | 884 | ||
889 | static void clear_sta_ps_flags(void *_sta) | 885 | static void clear_sta_ps_flags(void *_sta) |
890 | { | 886 | { |
891 | struct sta_info *sta = _sta; | 887 | struct sta_info *sta = _sta; |
892 | 888 | ||
893 | clear_sta_flags(sta, WLAN_STA_PS_DRIVER | WLAN_STA_PS_STA); | 889 | clear_sta_flags(sta, WLAN_STA_PS_DRIVER | WLAN_STA_PS_STA); |
894 | } | 890 | } |
895 | 891 | ||
896 | /* powersave support code */ | 892 | /* powersave support code */ |
897 | void ieee80211_sta_ps_deliver_wakeup(struct sta_info *sta) | 893 | void ieee80211_sta_ps_deliver_wakeup(struct sta_info *sta) |
898 | { | 894 | { |
899 | struct ieee80211_sub_if_data *sdata = sta->sdata; | 895 | struct ieee80211_sub_if_data *sdata = sta->sdata; |
900 | struct ieee80211_local *local = sdata->local; | 896 | struct ieee80211_local *local = sdata->local; |
901 | int sent, buffered; | 897 | int sent, buffered; |
902 | 898 | ||
903 | clear_sta_flags(sta, WLAN_STA_PS_DRIVER_BUF); | 899 | clear_sta_flags(sta, WLAN_STA_PS_DRIVER_BUF); |
904 | if (!(local->hw.flags & IEEE80211_HW_AP_LINK_PS)) | 900 | if (!(local->hw.flags & IEEE80211_HW_AP_LINK_PS)) |
905 | drv_sta_notify(local, sdata, STA_NOTIFY_AWAKE, &sta->sta); | 901 | drv_sta_notify(local, sdata, STA_NOTIFY_AWAKE, &sta->sta); |
906 | 902 | ||
907 | if (!skb_queue_empty(&sta->ps_tx_buf)) | 903 | if (!skb_queue_empty(&sta->ps_tx_buf)) |
908 | sta_info_clear_tim_bit(sta); | 904 | sta_info_clear_tim_bit(sta); |
909 | 905 | ||
910 | /* Send all buffered frames to the station */ | 906 | /* Send all buffered frames to the station */ |
911 | sent = ieee80211_add_pending_skbs(local, &sta->tx_filtered); | 907 | sent = ieee80211_add_pending_skbs(local, &sta->tx_filtered); |
912 | buffered = ieee80211_add_pending_skbs_fn(local, &sta->ps_tx_buf, | 908 | buffered = ieee80211_add_pending_skbs_fn(local, &sta->ps_tx_buf, |
913 | clear_sta_ps_flags, sta); | 909 | clear_sta_ps_flags, sta); |
914 | sent += buffered; | 910 | sent += buffered; |
915 | local->total_ps_buffered -= buffered; | 911 | local->total_ps_buffered -= buffered; |
916 | 912 | ||
917 | #ifdef CONFIG_MAC80211_VERBOSE_PS_DEBUG | 913 | #ifdef CONFIG_MAC80211_VERBOSE_PS_DEBUG |
918 | printk(KERN_DEBUG "%s: STA %pM aid %d sending %d filtered/%d PS frames " | 914 | printk(KERN_DEBUG "%s: STA %pM aid %d sending %d filtered/%d PS frames " |
919 | "since STA not sleeping anymore\n", sdata->name, | 915 | "since STA not sleeping anymore\n", sdata->name, |
920 | sta->sta.addr, sta->sta.aid, sent - buffered, buffered); | 916 | sta->sta.addr, sta->sta.aid, sent - buffered, buffered); |
921 | #endif /* CONFIG_MAC80211_VERBOSE_PS_DEBUG */ | 917 | #endif /* CONFIG_MAC80211_VERBOSE_PS_DEBUG */ |
922 | } | 918 | } |
923 | 919 | ||
924 | void ieee80211_sta_ps_deliver_poll_response(struct sta_info *sta) | 920 | void ieee80211_sta_ps_deliver_poll_response(struct sta_info *sta) |
925 | { | 921 | { |
926 | struct ieee80211_sub_if_data *sdata = sta->sdata; | 922 | struct ieee80211_sub_if_data *sdata = sta->sdata; |
927 | struct ieee80211_local *local = sdata->local; | 923 | struct ieee80211_local *local = sdata->local; |
928 | struct sk_buff *skb; | 924 | struct sk_buff *skb; |
929 | int no_pending_pkts; | 925 | int no_pending_pkts; |
930 | 926 | ||
931 | skb = skb_dequeue(&sta->tx_filtered); | 927 | skb = skb_dequeue(&sta->tx_filtered); |
932 | if (!skb) { | 928 | if (!skb) { |
933 | skb = skb_dequeue(&sta->ps_tx_buf); | 929 | skb = skb_dequeue(&sta->ps_tx_buf); |
934 | if (skb) | 930 | if (skb) |
935 | local->total_ps_buffered--; | 931 | local->total_ps_buffered--; |
936 | } | 932 | } |
937 | no_pending_pkts = skb_queue_empty(&sta->tx_filtered) && | 933 | no_pending_pkts = skb_queue_empty(&sta->tx_filtered) && |
938 | skb_queue_empty(&sta->ps_tx_buf); | 934 | skb_queue_empty(&sta->ps_tx_buf); |
939 | 935 | ||
940 | if (skb) { | 936 | if (skb) { |
941 | struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); | 937 | struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); |
942 | struct ieee80211_hdr *hdr = | 938 | struct ieee80211_hdr *hdr = |
943 | (struct ieee80211_hdr *) skb->data; | 939 | (struct ieee80211_hdr *) skb->data; |
944 | 940 | ||
945 | /* | 941 | /* |
946 | * Tell TX path to send this frame even though the STA may | 942 | * Tell TX path to send this frame even though the STA may |
947 | * still remain in PS mode after this frame exchange. | 943 | * still remain in PS mode after this frame exchange. |
948 | */ | 944 | */ |
949 | info->flags |= IEEE80211_TX_CTL_PSPOLL_RESPONSE; | 945 | info->flags |= IEEE80211_TX_CTL_PSPOLL_RESPONSE; |
950 | 946 | ||
951 | #ifdef CONFIG_MAC80211_VERBOSE_PS_DEBUG | 947 | #ifdef CONFIG_MAC80211_VERBOSE_PS_DEBUG |
952 | printk(KERN_DEBUG "STA %pM aid %d: PS Poll (entries after %d)\n", | 948 | printk(KERN_DEBUG "STA %pM aid %d: PS Poll (entries after %d)\n", |
953 | sta->sta.addr, sta->sta.aid, | 949 | sta->sta.addr, sta->sta.aid, |
954 | skb_queue_len(&sta->ps_tx_buf)); | 950 | skb_queue_len(&sta->ps_tx_buf)); |
955 | #endif /* CONFIG_MAC80211_VERBOSE_PS_DEBUG */ | 951 | #endif /* CONFIG_MAC80211_VERBOSE_PS_DEBUG */ |
956 | 952 | ||
957 | /* Use MoreData flag to indicate whether there are more | 953 | /* Use MoreData flag to indicate whether there are more |
958 | * buffered frames for this STA */ | 954 | * buffered frames for this STA */ |
959 | if (no_pending_pkts) | 955 | if (no_pending_pkts) |
960 | hdr->frame_control &= cpu_to_le16(~IEEE80211_FCTL_MOREDATA); | 956 | hdr->frame_control &= cpu_to_le16(~IEEE80211_FCTL_MOREDATA); |
961 | else | 957 | else |
962 | hdr->frame_control |= cpu_to_le16(IEEE80211_FCTL_MOREDATA); | 958 | hdr->frame_control |= cpu_to_le16(IEEE80211_FCTL_MOREDATA); |
963 | 959 | ||
964 | ieee80211_add_pending_skb(local, skb); | 960 | ieee80211_add_pending_skb(local, skb); |
965 | 961 | ||
966 | if (no_pending_pkts) | 962 | if (no_pending_pkts) |
967 | sta_info_clear_tim_bit(sta); | 963 | sta_info_clear_tim_bit(sta); |
968 | #ifdef CONFIG_MAC80211_VERBOSE_PS_DEBUG | 964 | #ifdef CONFIG_MAC80211_VERBOSE_PS_DEBUG |
969 | } else { | 965 | } else { |
970 | /* | 966 | /* |
971 | * FIXME: This can be the result of a race condition between | 967 | * FIXME: This can be the result of a race condition between |
972 | * us expiring a frame and the station polling for it. | 968 | * us expiring a frame and the station polling for it. |
973 | * Should we send it a null-func frame indicating we | 969 | * Should we send it a null-func frame indicating we |
974 | * have nothing buffered for it? | 970 | * have nothing buffered for it? |
975 | */ | 971 | */ |
976 | printk(KERN_DEBUG "%s: STA %pM sent PS Poll even " | 972 | printk(KERN_DEBUG "%s: STA %pM sent PS Poll even " |
977 | "though there are no buffered frames for it\n", | 973 | "though there are no buffered frames for it\n", |
978 | sdata->name, sta->sta.addr); | 974 | sdata->name, sta->sta.addr); |
979 | #endif /* CONFIG_MAC80211_VERBOSE_PS_DEBUG */ | 975 | #endif /* CONFIG_MAC80211_VERBOSE_PS_DEBUG */ |
980 | } | 976 | } |
981 | } | 977 | } |
982 | 978 | ||
983 | void ieee80211_sta_block_awake(struct ieee80211_hw *hw, | 979 | void ieee80211_sta_block_awake(struct ieee80211_hw *hw, |
984 | struct ieee80211_sta *pubsta, bool block) | 980 | struct ieee80211_sta *pubsta, bool block) |
985 | { | 981 | { |
986 | struct sta_info *sta = container_of(pubsta, struct sta_info, sta); | 982 | struct sta_info *sta = container_of(pubsta, struct sta_info, sta); |
987 | 983 | ||
988 | trace_api_sta_block_awake(sta->local, pubsta, block); | 984 | trace_api_sta_block_awake(sta->local, pubsta, block); |
989 | 985 | ||
990 | if (block) | 986 | if (block) |
991 | set_sta_flags(sta, WLAN_STA_PS_DRIVER); | 987 | set_sta_flags(sta, WLAN_STA_PS_DRIVER); |
992 | else if (test_sta_flags(sta, WLAN_STA_PS_DRIVER)) | 988 | else if (test_sta_flags(sta, WLAN_STA_PS_DRIVER)) |
993 | ieee80211_queue_work(hw, &sta->drv_unblock_wk); | 989 | ieee80211_queue_work(hw, &sta->drv_unblock_wk); |
994 | } | 990 | } |
995 | EXPORT_SYMBOL(ieee80211_sta_block_awake); | 991 | EXPORT_SYMBOL(ieee80211_sta_block_awake); |
996 | 992 | ||
997 | void ieee80211_sta_set_tim(struct ieee80211_sta *pubsta) | 993 | void ieee80211_sta_set_tim(struct ieee80211_sta *pubsta) |
998 | { | 994 | { |
999 | struct sta_info *sta = container_of(pubsta, struct sta_info, sta); | 995 | struct sta_info *sta = container_of(pubsta, struct sta_info, sta); |
1000 | 996 | ||
1001 | set_sta_flags(sta, WLAN_STA_PS_DRIVER_BUF); | 997 | set_sta_flags(sta, WLAN_STA_PS_DRIVER_BUF); |
1002 | sta_info_set_tim_bit(sta); | 998 | sta_info_set_tim_bit(sta); |
1003 | } | 999 | } |
1004 | EXPORT_SYMBOL(ieee80211_sta_set_tim); | 1000 | EXPORT_SYMBOL(ieee80211_sta_set_tim); |
1005 | 1001 |
net/netlabel/netlabel_domainhash.c
1 | /* | 1 | /* |
2 | * NetLabel Domain Hash Table | 2 | * NetLabel Domain Hash Table |
3 | * | 3 | * |
4 | * This file manages the domain hash table that NetLabel uses to determine | 4 | * This file manages the domain hash table that NetLabel uses to determine |
5 | * which network labeling protocol to use for a given domain. The NetLabel | 5 | * which network labeling protocol to use for a given domain. The NetLabel |
6 | * system manages static and dynamic label mappings for network protocols such | 6 | * system manages static and dynamic label mappings for network protocols such |
7 | * as CIPSO and RIPSO. | 7 | * as CIPSO and RIPSO. |
8 | * | 8 | * |
9 | * Author: Paul Moore <paul.moore@hp.com> | 9 | * Author: Paul Moore <paul.moore@hp.com> |
10 | * | 10 | * |
11 | */ | 11 | */ |
12 | 12 | ||
13 | /* | 13 | /* |
14 | * (c) Copyright Hewlett-Packard Development Company, L.P., 2006, 2008 | 14 | * (c) Copyright Hewlett-Packard Development Company, L.P., 2006, 2008 |
15 | * | 15 | * |
16 | * This program is free software; you can redistribute it and/or modify | 16 | * This program is free software; you can redistribute it and/or modify |
17 | * it under the terms of the GNU General Public License as published by | 17 | * it under the terms of the GNU General Public License as published by |
18 | * the Free Software Foundation; either version 2 of the License, or | 18 | * the Free Software Foundation; either version 2 of the License, or |
19 | * (at your option) any later version. | 19 | * (at your option) any later version. |
20 | * | 20 | * |
21 | * This program is distributed in the hope that it will be useful, | 21 | * This program is distributed in the hope that it will be useful, |
22 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | 22 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
23 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See | 23 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See |
24 | * the GNU General Public License for more details. | 24 | * the GNU General Public License for more details. |
25 | * | 25 | * |
26 | * You should have received a copy of the GNU General Public License | 26 | * You should have received a copy of the GNU General Public License |
27 | * along with this program; if not, write to the Free Software | 27 | * along with this program; if not, write to the Free Software |
28 | * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | 28 | * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA |
29 | * | 29 | * |
30 | */ | 30 | */ |
31 | 31 | ||
32 | #include <linux/types.h> | 32 | #include <linux/types.h> |
33 | #include <linux/rculist.h> | 33 | #include <linux/rculist.h> |
34 | #include <linux/skbuff.h> | 34 | #include <linux/skbuff.h> |
35 | #include <linux/spinlock.h> | 35 | #include <linux/spinlock.h> |
36 | #include <linux/string.h> | 36 | #include <linux/string.h> |
37 | #include <linux/audit.h> | 37 | #include <linux/audit.h> |
38 | #include <linux/slab.h> | 38 | #include <linux/slab.h> |
39 | #include <net/netlabel.h> | 39 | #include <net/netlabel.h> |
40 | #include <net/cipso_ipv4.h> | 40 | #include <net/cipso_ipv4.h> |
41 | #include <asm/bug.h> | 41 | #include <asm/bug.h> |
42 | 42 | ||
43 | #include "netlabel_mgmt.h" | 43 | #include "netlabel_mgmt.h" |
44 | #include "netlabel_addrlist.h" | 44 | #include "netlabel_addrlist.h" |
45 | #include "netlabel_domainhash.h" | 45 | #include "netlabel_domainhash.h" |
46 | #include "netlabel_user.h" | 46 | #include "netlabel_user.h" |
47 | 47 | ||
48 | struct netlbl_domhsh_tbl { | 48 | struct netlbl_domhsh_tbl { |
49 | struct list_head *tbl; | 49 | struct list_head *tbl; |
50 | u32 size; | 50 | u32 size; |
51 | }; | 51 | }; |
52 | 52 | ||
53 | /* Domain hash table */ | 53 | /* Domain hash table */ |
54 | /* updates should be so rare that having one spinlock for the entire hash table | 54 | /* updates should be so rare that having one spinlock for the entire hash table |
55 | * should be okay */ | 55 | * should be okay */ |
56 | static DEFINE_SPINLOCK(netlbl_domhsh_lock); | 56 | static DEFINE_SPINLOCK(netlbl_domhsh_lock); |
57 | #define netlbl_domhsh_rcu_deref(p) \ | 57 | #define netlbl_domhsh_rcu_deref(p) \ |
58 | rcu_dereference_check(p, rcu_read_lock_held() || \ | 58 | rcu_dereference_check(p, lockdep_is_held(&netlbl_domhsh_lock)) |
59 | lockdep_is_held(&netlbl_domhsh_lock)) | ||
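Editor's note: this hunk is the point of the commit for this file: since ca5ecddf, rcu_dereference_check() folds rcu_read_lock_held() into its condition itself, so a wrapper macro only has to supply the extra lockdep expression. A before/after sketch, with example_lock and example_deref as purely hypothetical names:

    #include <linux/rcupdate.h>
    #include <linux/spinlock.h>

    static DEFINE_SPINLOCK(example_lock);

    /* Before ca5ecddf the caller spelled out the reader-side check by hand: */
    #define example_deref_old(p) \
            rcu_dereference_check(p, rcu_read_lock_held() || \
                                     lockdep_is_held(&example_lock))

    /* Afterwards rcu_read_lock_held() is implied, so only the update-side
     * condition (the spinlock) needs to be named: */
    #define example_deref(p) \
            rcu_dereference_check(p, lockdep_is_held(&example_lock))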
60 | static struct netlbl_domhsh_tbl *netlbl_domhsh = NULL; | 59 | static struct netlbl_domhsh_tbl *netlbl_domhsh = NULL; |
61 | static struct netlbl_dom_map *netlbl_domhsh_def = NULL; | 60 | static struct netlbl_dom_map *netlbl_domhsh_def = NULL; |
62 | 61 | ||
63 | /* | 62 | /* |
64 | * Domain Hash Table Helper Functions | 63 | * Domain Hash Table Helper Functions |
65 | */ | 64 | */ |
66 | 65 | ||
67 | /** | 66 | /** |
68 | * netlbl_domhsh_free_entry - Frees a domain hash table entry | 67 | * netlbl_domhsh_free_entry - Frees a domain hash table entry |
69 | * @entry: the entry's RCU field | 68 | * @entry: the entry's RCU field |
70 | * | 69 | * |
71 | * Description: | 70 | * Description: |
72 | * This function is designed to be used as a callback to the call_rcu() | 71 | * This function is designed to be used as a callback to the call_rcu() |
73 | * function so that the memory allocated to a hash table entry can be released | 72 | * function so that the memory allocated to a hash table entry can be released |
74 | * safely. | 73 | * safely. |
75 | * | 74 | * |
76 | */ | 75 | */ |
77 | static void netlbl_domhsh_free_entry(struct rcu_head *entry) | 76 | static void netlbl_domhsh_free_entry(struct rcu_head *entry) |
78 | { | 77 | { |
79 | struct netlbl_dom_map *ptr; | 78 | struct netlbl_dom_map *ptr; |
80 | struct netlbl_af4list *iter4; | 79 | struct netlbl_af4list *iter4; |
81 | struct netlbl_af4list *tmp4; | 80 | struct netlbl_af4list *tmp4; |
82 | #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) | 81 | #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) |
83 | struct netlbl_af6list *iter6; | 82 | struct netlbl_af6list *iter6; |
84 | struct netlbl_af6list *tmp6; | 83 | struct netlbl_af6list *tmp6; |
85 | #endif /* IPv6 */ | 84 | #endif /* IPv6 */ |
86 | 85 | ||
87 | ptr = container_of(entry, struct netlbl_dom_map, rcu); | 86 | ptr = container_of(entry, struct netlbl_dom_map, rcu); |
88 | if (ptr->type == NETLBL_NLTYPE_ADDRSELECT) { | 87 | if (ptr->type == NETLBL_NLTYPE_ADDRSELECT) { |
89 | netlbl_af4list_foreach_safe(iter4, tmp4, | 88 | netlbl_af4list_foreach_safe(iter4, tmp4, |
90 | &ptr->type_def.addrsel->list4) { | 89 | &ptr->type_def.addrsel->list4) { |
91 | netlbl_af4list_remove_entry(iter4); | 90 | netlbl_af4list_remove_entry(iter4); |
92 | kfree(netlbl_domhsh_addr4_entry(iter4)); | 91 | kfree(netlbl_domhsh_addr4_entry(iter4)); |
93 | } | 92 | } |
94 | #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) | 93 | #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) |
95 | netlbl_af6list_foreach_safe(iter6, tmp6, | 94 | netlbl_af6list_foreach_safe(iter6, tmp6, |
96 | &ptr->type_def.addrsel->list6) { | 95 | &ptr->type_def.addrsel->list6) { |
97 | netlbl_af6list_remove_entry(iter6); | 96 | netlbl_af6list_remove_entry(iter6); |
98 | kfree(netlbl_domhsh_addr6_entry(iter6)); | 97 | kfree(netlbl_domhsh_addr6_entry(iter6)); |
99 | } | 98 | } |
100 | #endif /* IPv6 */ | 99 | #endif /* IPv6 */ |
101 | } | 100 | } |
102 | kfree(ptr->domain); | 101 | kfree(ptr->domain); |
103 | kfree(ptr); | 102 | kfree(ptr); |
104 | } | 103 | } |
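Editor's note: netlbl_domhsh_free_entry() is the call_rcu() callback used by netlbl_domhsh_remove_entry() further down; the rcu_head handed to the callback is embedded in the entry, so container_of() recovers the full structure once the grace period has elapsed. A generic sketch of that pattern with hypothetical types and names (my_entry, my_entry_free_rcu):

    #include <linux/rcupdate.h>
    #include <linux/slab.h>

    struct my_entry {
            char *name;
            struct rcu_head rcu;    /* embedded handle for call_rcu() */
    };

    static void my_entry_free_rcu(struct rcu_head *head)
    {
            struct my_entry *entry = container_of(head, struct my_entry, rcu);

            kfree(entry->name);
            kfree(entry);
    }

    static void my_entry_remove(struct my_entry *entry)
    {
            /* ...unlink entry from all RCU-visible lists under the writer lock... */
            call_rcu(&entry->rcu, my_entry_free_rcu);  /* free after grace period */
    }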
105 | 104 | ||
106 | /** | 105 | /** |
107 | * netlbl_domhsh_hash - Hashing function for the domain hash table | 106 | * netlbl_domhsh_hash - Hashing function for the domain hash table |
108 | * @domain: the domain name to hash | 107 | * @domain: the domain name to hash |
109 | * | 108 | * |
110 | * Description: | 109 | * Description: |
111 | * This is the hashing function for the domain hash table, it returns the | 110 | * This is the hashing function for the domain hash table, it returns the |
112 | * correct bucket number for the domain. The caller is responsible for | 111 | * correct bucket number for the domain. The caller is responsible for |
113 | * ensuring that the hash table is protected with either a RCU read lock or the | 112 | * ensuring that the hash table is protected with either a RCU read lock or the |
114 | * hash table lock. | 113 | * hash table lock. |
115 | * | 114 | * |
116 | */ | 115 | */ |
117 | static u32 netlbl_domhsh_hash(const char *key) | 116 | static u32 netlbl_domhsh_hash(const char *key) |
118 | { | 117 | { |
119 | u32 iter; | 118 | u32 iter; |
120 | u32 val; | 119 | u32 val; |
121 | u32 len; | 120 | u32 len; |
122 | 121 | ||
123 | /* This is taken (with slight modification) from | 122 | /* This is taken (with slight modification) from |
124 | * security/selinux/ss/symtab.c:symhash() */ | 123 | * security/selinux/ss/symtab.c:symhash() */ |
125 | 124 | ||
126 | for (iter = 0, val = 0, len = strlen(key); iter < len; iter++) | 125 | for (iter = 0, val = 0, len = strlen(key); iter < len; iter++) |
127 | val = (val << 4 | (val >> (8 * sizeof(u32) - 4))) ^ key[iter]; | 126 | val = (val << 4 | (val >> (8 * sizeof(u32) - 4))) ^ key[iter]; |
128 | return val & (netlbl_domhsh_rcu_deref(netlbl_domhsh)->size - 1); | 127 | return val & (netlbl_domhsh_rcu_deref(netlbl_domhsh)->size - 1); |
129 | } | 128 | } |
130 | 129 | ||
131 | /** | 130 | /** |
132 | * netlbl_domhsh_search - Search for a domain entry | 131 | * netlbl_domhsh_search - Search for a domain entry |
133 | * @domain: the domain | 132 | * @domain: the domain |
134 | * | 133 | * |
135 | * Description: | 134 | * Description: |
136 | * Searches the domain hash table and returns a pointer to the hash table | 135 | * Searches the domain hash table and returns a pointer to the hash table |
137 | * entry if found, otherwise NULL is returned. The caller is responsible for | 136 | * entry if found, otherwise NULL is returned. The caller is responsible for |
138 | * ensuring that the hash table is protected with either a RCU read lock or the | 137 | * ensuring that the hash table is protected with either a RCU read lock or the |
139 | * hash table lock. | 138 | * hash table lock. |
140 | * | 139 | * |
141 | */ | 140 | */ |
142 | static struct netlbl_dom_map *netlbl_domhsh_search(const char *domain) | 141 | static struct netlbl_dom_map *netlbl_domhsh_search(const char *domain) |
143 | { | 142 | { |
144 | u32 bkt; | 143 | u32 bkt; |
145 | struct list_head *bkt_list; | 144 | struct list_head *bkt_list; |
146 | struct netlbl_dom_map *iter; | 145 | struct netlbl_dom_map *iter; |
147 | 146 | ||
148 | if (domain != NULL) { | 147 | if (domain != NULL) { |
149 | bkt = netlbl_domhsh_hash(domain); | 148 | bkt = netlbl_domhsh_hash(domain); |
150 | bkt_list = &netlbl_domhsh_rcu_deref(netlbl_domhsh)->tbl[bkt]; | 149 | bkt_list = &netlbl_domhsh_rcu_deref(netlbl_domhsh)->tbl[bkt]; |
151 | list_for_each_entry_rcu(iter, bkt_list, list) | 150 | list_for_each_entry_rcu(iter, bkt_list, list) |
152 | if (iter->valid && strcmp(iter->domain, domain) == 0) | 151 | if (iter->valid && strcmp(iter->domain, domain) == 0) |
153 | return iter; | 152 | return iter; |
154 | } | 153 | } |
155 | 154 | ||
156 | return NULL; | 155 | return NULL; |
157 | } | 156 | } |
158 | 157 | ||
159 | /** | 158 | /** |
160 | * netlbl_domhsh_search_def - Search for a domain entry | 159 | * netlbl_domhsh_search_def - Search for a domain entry |
161 | * @domain: the domain | 160 | * @domain: the domain |
162 | * @def: return default if no match is found | 161 | * @def: return default if no match is found |
163 | * | 162 | * |
164 | * Description: | 163 | * Description: |
165 | * Searches the domain hash table and returns a pointer to the hash table | 164 | * Searches the domain hash table and returns a pointer to the hash table |
166 | * entry if an exact match is found; if an exact match is not present in the | 165 | * entry if an exact match is found; if an exact match is not present in the |
167 | * hash table then the default entry is returned if valid, otherwise NULL is | 166 | * hash table then the default entry is returned if valid, otherwise NULL is |
168 | * returned. The caller is responsible for ensuring that the hash table is | 167 | * returned. The caller is responsible for ensuring that the hash table is |
169 | * protected with either a RCU read lock or the hash table lock. | 168 | * protected with either a RCU read lock or the hash table lock. |
170 | * | 169 | * |
171 | */ | 170 | */ |
172 | static struct netlbl_dom_map *netlbl_domhsh_search_def(const char *domain) | 171 | static struct netlbl_dom_map *netlbl_domhsh_search_def(const char *domain) |
173 | { | 172 | { |
174 | struct netlbl_dom_map *entry; | 173 | struct netlbl_dom_map *entry; |
175 | 174 | ||
176 | entry = netlbl_domhsh_search(domain); | 175 | entry = netlbl_domhsh_search(domain); |
177 | if (entry == NULL) { | 176 | if (entry == NULL) { |
178 | entry = netlbl_domhsh_rcu_deref(netlbl_domhsh_def); | 177 | entry = netlbl_domhsh_rcu_deref(netlbl_domhsh_def); |
179 | if (entry != NULL && !entry->valid) | 178 | if (entry != NULL && !entry->valid) |
180 | entry = NULL; | 179 | entry = NULL; |
181 | } | 180 | } |
182 | 181 | ||
183 | return entry; | 182 | return entry; |
184 | } | 183 | } |
185 | 184 | ||
186 | /** | 185 | /** |
187 | * netlbl_domhsh_audit_add - Generate an audit entry for an add event | 186 | * netlbl_domhsh_audit_add - Generate an audit entry for an add event |
188 | * @entry: the entry being added | 187 | * @entry: the entry being added |
189 | * @addr4: the IPv4 address information | 188 | * @addr4: the IPv4 address information |
190 | * @addr6: the IPv6 address information | 189 | * @addr6: the IPv6 address information |
191 | * @result: the result code | 190 | * @result: the result code |
192 | * @audit_info: NetLabel audit information | 191 | * @audit_info: NetLabel audit information |
193 | * | 192 | * |
194 | * Description: | 193 | * Description: |
195 | * Generate an audit record for adding a new NetLabel/LSM mapping entry with | 194 | * Generate an audit record for adding a new NetLabel/LSM mapping entry with |
196 | * the given information. Caller is responsible for holding the necessary | 195 | * the given information. Caller is responsible for holding the necessary |
197 | * locks. | 196 | * locks. |
198 | * | 197 | * |
199 | */ | 198 | */ |
200 | static void netlbl_domhsh_audit_add(struct netlbl_dom_map *entry, | 199 | static void netlbl_domhsh_audit_add(struct netlbl_dom_map *entry, |
201 | struct netlbl_af4list *addr4, | 200 | struct netlbl_af4list *addr4, |
202 | struct netlbl_af6list *addr6, | 201 | struct netlbl_af6list *addr6, |
203 | int result, | 202 | int result, |
204 | struct netlbl_audit *audit_info) | 203 | struct netlbl_audit *audit_info) |
205 | { | 204 | { |
206 | struct audit_buffer *audit_buf; | 205 | struct audit_buffer *audit_buf; |
207 | struct cipso_v4_doi *cipsov4 = NULL; | 206 | struct cipso_v4_doi *cipsov4 = NULL; |
208 | u32 type; | 207 | u32 type; |
209 | 208 | ||
210 | audit_buf = netlbl_audit_start_common(AUDIT_MAC_MAP_ADD, audit_info); | 209 | audit_buf = netlbl_audit_start_common(AUDIT_MAC_MAP_ADD, audit_info); |
211 | if (audit_buf != NULL) { | 210 | if (audit_buf != NULL) { |
212 | audit_log_format(audit_buf, " nlbl_domain=%s", | 211 | audit_log_format(audit_buf, " nlbl_domain=%s", |
213 | entry->domain ? entry->domain : "(default)"); | 212 | entry->domain ? entry->domain : "(default)"); |
214 | if (addr4 != NULL) { | 213 | if (addr4 != NULL) { |
215 | struct netlbl_domaddr4_map *map4; | 214 | struct netlbl_domaddr4_map *map4; |
216 | map4 = netlbl_domhsh_addr4_entry(addr4); | 215 | map4 = netlbl_domhsh_addr4_entry(addr4); |
217 | type = map4->type; | 216 | type = map4->type; |
218 | cipsov4 = map4->type_def.cipsov4; | 217 | cipsov4 = map4->type_def.cipsov4; |
219 | netlbl_af4list_audit_addr(audit_buf, 0, NULL, | 218 | netlbl_af4list_audit_addr(audit_buf, 0, NULL, |
220 | addr4->addr, addr4->mask); | 219 | addr4->addr, addr4->mask); |
221 | #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) | 220 | #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) |
222 | } else if (addr6 != NULL) { | 221 | } else if (addr6 != NULL) { |
223 | struct netlbl_domaddr6_map *map6; | 222 | struct netlbl_domaddr6_map *map6; |
224 | map6 = netlbl_domhsh_addr6_entry(addr6); | 223 | map6 = netlbl_domhsh_addr6_entry(addr6); |
225 | type = map6->type; | 224 | type = map6->type; |
226 | netlbl_af6list_audit_addr(audit_buf, 0, NULL, | 225 | netlbl_af6list_audit_addr(audit_buf, 0, NULL, |
227 | &addr6->addr, &addr6->mask); | 226 | &addr6->addr, &addr6->mask); |
228 | #endif /* IPv6 */ | 227 | #endif /* IPv6 */ |
229 | } else { | 228 | } else { |
230 | type = entry->type; | 229 | type = entry->type; |
231 | cipsov4 = entry->type_def.cipsov4; | 230 | cipsov4 = entry->type_def.cipsov4; |
232 | } | 231 | } |
233 | switch (type) { | 232 | switch (type) { |
234 | case NETLBL_NLTYPE_UNLABELED: | 233 | case NETLBL_NLTYPE_UNLABELED: |
235 | audit_log_format(audit_buf, " nlbl_protocol=unlbl"); | 234 | audit_log_format(audit_buf, " nlbl_protocol=unlbl"); |
236 | break; | 235 | break; |
237 | case NETLBL_NLTYPE_CIPSOV4: | 236 | case NETLBL_NLTYPE_CIPSOV4: |
238 | BUG_ON(cipsov4 == NULL); | 237 | BUG_ON(cipsov4 == NULL); |
239 | audit_log_format(audit_buf, | 238 | audit_log_format(audit_buf, |
240 | " nlbl_protocol=cipsov4 cipso_doi=%u", | 239 | " nlbl_protocol=cipsov4 cipso_doi=%u", |
241 | cipsov4->doi); | 240 | cipsov4->doi); |
242 | break; | 241 | break; |
243 | } | 242 | } |
244 | audit_log_format(audit_buf, " res=%u", result == 0 ? 1 : 0); | 243 | audit_log_format(audit_buf, " res=%u", result == 0 ? 1 : 0); |
245 | audit_log_end(audit_buf); | 244 | audit_log_end(audit_buf); |
246 | } | 245 | } |
247 | } | 246 | } |
248 | 247 | ||
249 | /* | 248 | /* |
250 | * Domain Hash Table Functions | 249 | * Domain Hash Table Functions |
251 | */ | 250 | */ |
252 | 251 | ||
253 | /** | 252 | /** |
254 | * netlbl_domhsh_init - Init for the domain hash | 253 | * netlbl_domhsh_init - Init for the domain hash |
255 | * @size: the number of bits to use for the hash buckets | 254 | * @size: the number of bits to use for the hash buckets |
256 | * | 255 | * |
257 | * Description: | 256 | * Description: |
258 | * Initializes the domain hash table, should be called only by | 257 | * Initializes the domain hash table, should be called only by |
259 | * netlbl_user_init() during initialization. Returns zero on success, non-zero | 258 | * netlbl_user_init() during initialization. Returns zero on success, non-zero |
260 | * values on error. | 259 | * values on error. |
261 | * | 260 | * |
262 | */ | 261 | */ |
263 | int __init netlbl_domhsh_init(u32 size) | 262 | int __init netlbl_domhsh_init(u32 size) |
264 | { | 263 | { |
265 | u32 iter; | 264 | u32 iter; |
266 | struct netlbl_domhsh_tbl *hsh_tbl; | 265 | struct netlbl_domhsh_tbl *hsh_tbl; |
267 | 266 | ||
268 | if (size == 0) | 267 | if (size == 0) |
269 | return -EINVAL; | 268 | return -EINVAL; |
270 | 269 | ||
271 | hsh_tbl = kmalloc(sizeof(*hsh_tbl), GFP_KERNEL); | 270 | hsh_tbl = kmalloc(sizeof(*hsh_tbl), GFP_KERNEL); |
272 | if (hsh_tbl == NULL) | 271 | if (hsh_tbl == NULL) |
273 | return -ENOMEM; | 272 | return -ENOMEM; |
274 | hsh_tbl->size = 1 << size; | 273 | hsh_tbl->size = 1 << size; |
275 | hsh_tbl->tbl = kcalloc(hsh_tbl->size, | 274 | hsh_tbl->tbl = kcalloc(hsh_tbl->size, |
276 | sizeof(struct list_head), | 275 | sizeof(struct list_head), |
277 | GFP_KERNEL); | 276 | GFP_KERNEL); |
278 | if (hsh_tbl->tbl == NULL) { | 277 | if (hsh_tbl->tbl == NULL) { |
279 | kfree(hsh_tbl); | 278 | kfree(hsh_tbl); |
280 | return -ENOMEM; | 279 | return -ENOMEM; |
281 | } | 280 | } |
282 | for (iter = 0; iter < hsh_tbl->size; iter++) | 281 | for (iter = 0; iter < hsh_tbl->size; iter++) |
283 | INIT_LIST_HEAD(&hsh_tbl->tbl[iter]); | 282 | INIT_LIST_HEAD(&hsh_tbl->tbl[iter]); |
284 | 283 | ||
285 | spin_lock(&netlbl_domhsh_lock); | 284 | spin_lock(&netlbl_domhsh_lock); |
286 | rcu_assign_pointer(netlbl_domhsh, hsh_tbl); | 285 | rcu_assign_pointer(netlbl_domhsh, hsh_tbl); |
287 | spin_unlock(&netlbl_domhsh_lock); | 286 | spin_unlock(&netlbl_domhsh_lock); |
288 | 287 | ||
289 | return 0; | 288 | return 0; |
290 | } | 289 | } |
291 | 290 | ||
292 | /** | 291 | /** |
293 | * netlbl_domhsh_add - Adds an entry to the domain hash table | 292 | * netlbl_domhsh_add - Adds an entry to the domain hash table |
294 | * @entry: the entry to add | 293 | * @entry: the entry to add |
295 | * @audit_info: NetLabel audit information | 294 | * @audit_info: NetLabel audit information |
296 | * | 295 | * |
297 | * Description: | 296 | * Description: |
298 | * Adds a new entry to the domain hash table and handles any updates to the | 297 | * Adds a new entry to the domain hash table and handles any updates to the |
299 | * lower level protocol handler (i.e. CIPSO). Returns zero on success, | 298 | * lower level protocol handler (i.e. CIPSO). Returns zero on success, |
300 | * negative on failure. | 299 | * negative on failure. |
301 | * | 300 | * |
302 | */ | 301 | */ |
303 | int netlbl_domhsh_add(struct netlbl_dom_map *entry, | 302 | int netlbl_domhsh_add(struct netlbl_dom_map *entry, |
304 | struct netlbl_audit *audit_info) | 303 | struct netlbl_audit *audit_info) |
305 | { | 304 | { |
306 | int ret_val = 0; | 305 | int ret_val = 0; |
307 | struct netlbl_dom_map *entry_old; | 306 | struct netlbl_dom_map *entry_old; |
308 | struct netlbl_af4list *iter4; | 307 | struct netlbl_af4list *iter4; |
309 | struct netlbl_af4list *tmp4; | 308 | struct netlbl_af4list *tmp4; |
310 | #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) | 309 | #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) |
311 | struct netlbl_af6list *iter6; | 310 | struct netlbl_af6list *iter6; |
312 | struct netlbl_af6list *tmp6; | 311 | struct netlbl_af6list *tmp6; |
313 | #endif /* IPv6 */ | 312 | #endif /* IPv6 */ |
314 | 313 | ||
315 | /* XXX - we can remove this RCU read lock as the spinlock protects the | 314 | /* XXX - we can remove this RCU read lock as the spinlock protects the |
316 | * entire function, but before we do we need to fixup the | 315 | * entire function, but before we do we need to fixup the |
317 | * netlbl_af[4,6]list RCU functions to do "the right thing" with | 316 | * netlbl_af[4,6]list RCU functions to do "the right thing" with |
318 | * respect to rcu_dereference() when only a spinlock is held. */ | 317 | * respect to rcu_dereference() when only a spinlock is held. */ |
319 | rcu_read_lock(); | 318 | rcu_read_lock(); |
320 | spin_lock(&netlbl_domhsh_lock); | 319 | spin_lock(&netlbl_domhsh_lock); |
321 | if (entry->domain != NULL) | 320 | if (entry->domain != NULL) |
322 | entry_old = netlbl_domhsh_search(entry->domain); | 321 | entry_old = netlbl_domhsh_search(entry->domain); |
323 | else | 322 | else |
324 | entry_old = netlbl_domhsh_search_def(entry->domain); | 323 | entry_old = netlbl_domhsh_search_def(entry->domain); |
325 | if (entry_old == NULL) { | 324 | if (entry_old == NULL) { |
326 | entry->valid = 1; | 325 | entry->valid = 1; |
327 | 326 | ||
328 | if (entry->domain != NULL) { | 327 | if (entry->domain != NULL) { |
329 | u32 bkt = netlbl_domhsh_hash(entry->domain); | 328 | u32 bkt = netlbl_domhsh_hash(entry->domain); |
330 | list_add_tail_rcu(&entry->list, | 329 | list_add_tail_rcu(&entry->list, |
331 | &rcu_dereference(netlbl_domhsh)->tbl[bkt]); | 330 | &rcu_dereference(netlbl_domhsh)->tbl[bkt]); |
332 | } else { | 331 | } else { |
333 | INIT_LIST_HEAD(&entry->list); | 332 | INIT_LIST_HEAD(&entry->list); |
334 | rcu_assign_pointer(netlbl_domhsh_def, entry); | 333 | rcu_assign_pointer(netlbl_domhsh_def, entry); |
335 | } | 334 | } |
336 | 335 | ||
337 | if (entry->type == NETLBL_NLTYPE_ADDRSELECT) { | 336 | if (entry->type == NETLBL_NLTYPE_ADDRSELECT) { |
338 | netlbl_af4list_foreach_rcu(iter4, | 337 | netlbl_af4list_foreach_rcu(iter4, |
339 | &entry->type_def.addrsel->list4) | 338 | &entry->type_def.addrsel->list4) |
340 | netlbl_domhsh_audit_add(entry, iter4, NULL, | 339 | netlbl_domhsh_audit_add(entry, iter4, NULL, |
341 | ret_val, audit_info); | 340 | ret_val, audit_info); |
342 | #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) | 341 | #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) |
343 | netlbl_af6list_foreach_rcu(iter6, | 342 | netlbl_af6list_foreach_rcu(iter6, |
344 | &entry->type_def.addrsel->list6) | 343 | &entry->type_def.addrsel->list6) |
345 | netlbl_domhsh_audit_add(entry, NULL, iter6, | 344 | netlbl_domhsh_audit_add(entry, NULL, iter6, |
346 | ret_val, audit_info); | 345 | ret_val, audit_info); |
347 | #endif /* IPv6 */ | 346 | #endif /* IPv6 */ |
348 | } else | 347 | } else |
349 | netlbl_domhsh_audit_add(entry, NULL, NULL, | 348 | netlbl_domhsh_audit_add(entry, NULL, NULL, |
350 | ret_val, audit_info); | 349 | ret_val, audit_info); |
351 | } else if (entry_old->type == NETLBL_NLTYPE_ADDRSELECT && | 350 | } else if (entry_old->type == NETLBL_NLTYPE_ADDRSELECT && |
352 | entry->type == NETLBL_NLTYPE_ADDRSELECT) { | 351 | entry->type == NETLBL_NLTYPE_ADDRSELECT) { |
353 | struct list_head *old_list4; | 352 | struct list_head *old_list4; |
354 | struct list_head *old_list6; | 353 | struct list_head *old_list6; |
355 | 354 | ||
356 | old_list4 = &entry_old->type_def.addrsel->list4; | 355 | old_list4 = &entry_old->type_def.addrsel->list4; |
357 | old_list6 = &entry_old->type_def.addrsel->list6; | 356 | old_list6 = &entry_old->type_def.addrsel->list6; |
358 | 357 | ||
359 | /* we only allow the addition of address selectors if all of | 358 | /* we only allow the addition of address selectors if all of |
360 | * the selectors do not exist in the existing domain map */ | 359 | * the selectors do not exist in the existing domain map */ |
361 | netlbl_af4list_foreach_rcu(iter4, | 360 | netlbl_af4list_foreach_rcu(iter4, |
362 | &entry->type_def.addrsel->list4) | 361 | &entry->type_def.addrsel->list4) |
363 | if (netlbl_af4list_search_exact(iter4->addr, | 362 | if (netlbl_af4list_search_exact(iter4->addr, |
364 | iter4->mask, | 363 | iter4->mask, |
365 | old_list4)) { | 364 | old_list4)) { |
366 | ret_val = -EEXIST; | 365 | ret_val = -EEXIST; |
367 | goto add_return; | 366 | goto add_return; |
368 | } | 367 | } |
369 | #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) | 368 | #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) |
370 | netlbl_af6list_foreach_rcu(iter6, | 369 | netlbl_af6list_foreach_rcu(iter6, |
371 | &entry->type_def.addrsel->list6) | 370 | &entry->type_def.addrsel->list6) |
372 | if (netlbl_af6list_search_exact(&iter6->addr, | 371 | if (netlbl_af6list_search_exact(&iter6->addr, |
373 | &iter6->mask, | 372 | &iter6->mask, |
374 | old_list6)) { | 373 | old_list6)) { |
375 | ret_val = -EEXIST; | 374 | ret_val = -EEXIST; |
376 | goto add_return; | 375 | goto add_return; |
377 | } | 376 | } |
378 | #endif /* IPv6 */ | 377 | #endif /* IPv6 */ |
379 | 378 | ||
380 | netlbl_af4list_foreach_safe(iter4, tmp4, | 379 | netlbl_af4list_foreach_safe(iter4, tmp4, |
381 | &entry->type_def.addrsel->list4) { | 380 | &entry->type_def.addrsel->list4) { |
382 | netlbl_af4list_remove_entry(iter4); | 381 | netlbl_af4list_remove_entry(iter4); |
383 | iter4->valid = 1; | 382 | iter4->valid = 1; |
384 | ret_val = netlbl_af4list_add(iter4, old_list4); | 383 | ret_val = netlbl_af4list_add(iter4, old_list4); |
385 | netlbl_domhsh_audit_add(entry_old, iter4, NULL, | 384 | netlbl_domhsh_audit_add(entry_old, iter4, NULL, |
386 | ret_val, audit_info); | 385 | ret_val, audit_info); |
387 | if (ret_val != 0) | 386 | if (ret_val != 0) |
388 | goto add_return; | 387 | goto add_return; |
389 | } | 388 | } |
390 | #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) | 389 | #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) |
391 | netlbl_af6list_foreach_safe(iter6, tmp6, | 390 | netlbl_af6list_foreach_safe(iter6, tmp6, |
392 | &entry->type_def.addrsel->list6) { | 391 | &entry->type_def.addrsel->list6) { |
393 | netlbl_af6list_remove_entry(iter6); | 392 | netlbl_af6list_remove_entry(iter6); |
394 | iter6->valid = 1; | 393 | iter6->valid = 1; |
395 | ret_val = netlbl_af6list_add(iter6, old_list6); | 394 | ret_val = netlbl_af6list_add(iter6, old_list6); |
396 | netlbl_domhsh_audit_add(entry_old, NULL, iter6, | 395 | netlbl_domhsh_audit_add(entry_old, NULL, iter6, |
397 | ret_val, audit_info); | 396 | ret_val, audit_info); |
398 | if (ret_val != 0) | 397 | if (ret_val != 0) |
399 | goto add_return; | 398 | goto add_return; |
400 | } | 399 | } |
401 | #endif /* IPv6 */ | 400 | #endif /* IPv6 */ |
402 | } else | 401 | } else |
403 | ret_val = -EINVAL; | 402 | ret_val = -EINVAL; |
404 | 403 | ||
405 | add_return: | 404 | add_return: |
406 | spin_unlock(&netlbl_domhsh_lock); | 405 | spin_unlock(&netlbl_domhsh_lock); |
407 | rcu_read_unlock(); | 406 | rcu_read_unlock(); |
408 | return ret_val; | 407 | return ret_val; |
409 | } | 408 | } |
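Editor's note: the XXX comment in this function describes exactly the situation the commit is about: netlbl_domhsh_lock already serializes the whole function, and the rcu_read_lock() is only taken to satisfy the rcu_dereference() calls inside the netlbl_af[4,6]list helpers. The usual update-side idiom for "writer lock held, no RCU read section needed" is rcu_dereference_protected(); a small sketch, not part of the patch (example_update() is hypothetical, the pointer and lock names are the real ones from this file):

    static void example_update(void)
    {
            struct netlbl_domhsh_tbl *tbl;

            spin_lock(&netlbl_domhsh_lock);
            tbl = rcu_dereference_protected(netlbl_domhsh,
                                            lockdep_is_held(&netlbl_domhsh_lock));
            /* ...modify tbl, publishing new entries with list_add_tail_rcu()... */
            spin_unlock(&netlbl_domhsh_lock);
    }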
410 | 409 | ||
411 | /** | 410 | /** |
412 | * netlbl_domhsh_add_default - Adds the default entry to the domain hash table | 411 | * netlbl_domhsh_add_default - Adds the default entry to the domain hash table |
413 | * @entry: the entry to add | 412 | * @entry: the entry to add |
414 | * @audit_info: NetLabel audit information | 413 | * @audit_info: NetLabel audit information |
415 | * | 414 | * |
416 | * Description: | 415 | * Description: |
417 | * Adds a new default entry to the domain hash table and handles any updates | 416 | * Adds a new default entry to the domain hash table and handles any updates |
418 | * to the lower level protocol handler (i.e. CIPSO). Returns zero on success, | 417 | * to the lower level protocol handler (i.e. CIPSO). Returns zero on success, |
419 | * negative on failure. | 418 | * negative on failure. |
420 | * | 419 | * |
421 | */ | 420 | */ |
422 | int netlbl_domhsh_add_default(struct netlbl_dom_map *entry, | 421 | int netlbl_domhsh_add_default(struct netlbl_dom_map *entry, |
423 | struct netlbl_audit *audit_info) | 422 | struct netlbl_audit *audit_info) |
424 | { | 423 | { |
425 | return netlbl_domhsh_add(entry, audit_info); | 424 | return netlbl_domhsh_add(entry, audit_info); |
426 | } | 425 | } |
427 | 426 | ||
428 | /** | 427 | /** |
429 | * netlbl_domhsh_remove_entry - Removes a given entry from the domain table | 428 | * netlbl_domhsh_remove_entry - Removes a given entry from the domain table |
430 | * @entry: the entry to remove | 429 | * @entry: the entry to remove |
431 | * @audit_info: NetLabel audit information | 430 | * @audit_info: NetLabel audit information |
432 | * | 431 | * |
433 | * Description: | 432 | * Description: |
434 | * Removes an entry from the domain hash table and handles any updates to the | 433 | * Removes an entry from the domain hash table and handles any updates to the |
435 | * lower level protocol handler (i.e. CIPSO). Caller is responsible for | 434 | * lower level protocol handler (i.e. CIPSO). Caller is responsible for |
436 | * ensuring that the RCU read lock is held. Returns zero on success, negative | 435 | * ensuring that the RCU read lock is held. Returns zero on success, negative |
437 | * on failure. | 436 | * on failure. |
438 | * | 437 | * |
439 | */ | 438 | */ |
440 | int netlbl_domhsh_remove_entry(struct netlbl_dom_map *entry, | 439 | int netlbl_domhsh_remove_entry(struct netlbl_dom_map *entry, |
441 | struct netlbl_audit *audit_info) | 440 | struct netlbl_audit *audit_info) |
442 | { | 441 | { |
443 | int ret_val = 0; | 442 | int ret_val = 0; |
444 | struct audit_buffer *audit_buf; | 443 | struct audit_buffer *audit_buf; |
445 | 444 | ||
446 | if (entry == NULL) | 445 | if (entry == NULL) |
447 | return -ENOENT; | 446 | return -ENOENT; |
448 | 447 | ||
449 | spin_lock(&netlbl_domhsh_lock); | 448 | spin_lock(&netlbl_domhsh_lock); |
450 | if (entry->valid) { | 449 | if (entry->valid) { |
451 | entry->valid = 0; | 450 | entry->valid = 0; |
452 | if (entry != rcu_dereference(netlbl_domhsh_def)) | 451 | if (entry != rcu_dereference(netlbl_domhsh_def)) |
453 | list_del_rcu(&entry->list); | 452 | list_del_rcu(&entry->list); |
454 | else | 453 | else |
455 | rcu_assign_pointer(netlbl_domhsh_def, NULL); | 454 | rcu_assign_pointer(netlbl_domhsh_def, NULL); |
456 | } else | 455 | } else |
457 | ret_val = -ENOENT; | 456 | ret_val = -ENOENT; |
458 | spin_unlock(&netlbl_domhsh_lock); | 457 | spin_unlock(&netlbl_domhsh_lock); |
459 | 458 | ||
460 | audit_buf = netlbl_audit_start_common(AUDIT_MAC_MAP_DEL, audit_info); | 459 | audit_buf = netlbl_audit_start_common(AUDIT_MAC_MAP_DEL, audit_info); |
461 | if (audit_buf != NULL) { | 460 | if (audit_buf != NULL) { |
462 | audit_log_format(audit_buf, | 461 | audit_log_format(audit_buf, |
463 | " nlbl_domain=%s res=%u", | 462 | " nlbl_domain=%s res=%u", |
464 | entry->domain ? entry->domain : "(default)", | 463 | entry->domain ? entry->domain : "(default)", |
465 | ret_val == 0 ? 1 : 0); | 464 | ret_val == 0 ? 1 : 0); |
466 | audit_log_end(audit_buf); | 465 | audit_log_end(audit_buf); |
467 | } | 466 | } |
468 | 467 | ||
469 | if (ret_val == 0) { | 468 | if (ret_val == 0) { |
470 | struct netlbl_af4list *iter4; | 469 | struct netlbl_af4list *iter4; |
471 | struct netlbl_domaddr4_map *map4; | 470 | struct netlbl_domaddr4_map *map4; |
472 | 471 | ||
473 | switch (entry->type) { | 472 | switch (entry->type) { |
474 | case NETLBL_NLTYPE_ADDRSELECT: | 473 | case NETLBL_NLTYPE_ADDRSELECT: |
475 | netlbl_af4list_foreach_rcu(iter4, | 474 | netlbl_af4list_foreach_rcu(iter4, |
476 | &entry->type_def.addrsel->list4) { | 475 | &entry->type_def.addrsel->list4) { |
477 | map4 = netlbl_domhsh_addr4_entry(iter4); | 476 | map4 = netlbl_domhsh_addr4_entry(iter4); |
478 | cipso_v4_doi_putdef(map4->type_def.cipsov4); | 477 | cipso_v4_doi_putdef(map4->type_def.cipsov4); |
479 | } | 478 | } |
480 | /* no need to check the IPv6 list since we currently | 479 | /* no need to check the IPv6 list since we currently |
481 | * support only unlabeled protocols for IPv6 */ | 480 | * support only unlabeled protocols for IPv6 */ |
482 | break; | 481 | break; |
483 | case NETLBL_NLTYPE_CIPSOV4: | 482 | case NETLBL_NLTYPE_CIPSOV4: |
484 | cipso_v4_doi_putdef(entry->type_def.cipsov4); | 483 | cipso_v4_doi_putdef(entry->type_def.cipsov4); |
485 | break; | 484 | break; |
486 | } | 485 | } |
487 | call_rcu(&entry->rcu, netlbl_domhsh_free_entry); | 486 | call_rcu(&entry->rcu, netlbl_domhsh_free_entry); |
488 | } | 487 | } |
489 | 488 | ||
490 | return ret_val; | 489 | return ret_val; |
491 | } | 490 | } |
492 | 491 | ||
493 | /** | 492 | /** |
494 | * netlbl_domhsh_remove_af4 - Removes an address selector entry | 493 | * netlbl_domhsh_remove_af4 - Removes an address selector entry |
495 | * @domain: the domain | 494 | * @domain: the domain |
496 | * @addr: IPv4 address | 495 | * @addr: IPv4 address |
497 | * @mask: IPv4 address mask | 496 | * @mask: IPv4 address mask |
498 | * @audit_info: NetLabel audit information | 497 | * @audit_info: NetLabel audit information |
499 | * | 498 | * |
500 | * Description: | 499 | * Description: |
501 | * Removes an individual address selector from a domain mapping and potentially | 500 | * Removes an individual address selector from a domain mapping and potentially |
502 | * the entire mapping if it is empty. Returns zero on success, negative values | 501 | * the entire mapping if it is empty. Returns zero on success, negative values |
503 | * on failure. | 502 | * on failure. |
504 | * | 503 | * |
505 | */ | 504 | */ |
506 | int netlbl_domhsh_remove_af4(const char *domain, | 505 | int netlbl_domhsh_remove_af4(const char *domain, |
507 | const struct in_addr *addr, | 506 | const struct in_addr *addr, |
508 | const struct in_addr *mask, | 507 | const struct in_addr *mask, |
509 | struct netlbl_audit *audit_info) | 508 | struct netlbl_audit *audit_info) |
510 | { | 509 | { |
511 | struct netlbl_dom_map *entry_map; | 510 | struct netlbl_dom_map *entry_map; |
512 | struct netlbl_af4list *entry_addr; | 511 | struct netlbl_af4list *entry_addr; |
513 | struct netlbl_af4list *iter4; | 512 | struct netlbl_af4list *iter4; |
514 | #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) | 513 | #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) |
515 | struct netlbl_af6list *iter6; | 514 | struct netlbl_af6list *iter6; |
516 | #endif /* IPv6 */ | 515 | #endif /* IPv6 */ |
517 | struct netlbl_domaddr4_map *entry; | 516 | struct netlbl_domaddr4_map *entry; |
518 | 517 | ||
519 | rcu_read_lock(); | 518 | rcu_read_lock(); |
520 | 519 | ||
521 | if (domain) | 520 | if (domain) |
522 | entry_map = netlbl_domhsh_search(domain); | 521 | entry_map = netlbl_domhsh_search(domain); |
523 | else | 522 | else |
524 | entry_map = netlbl_domhsh_search_def(domain); | 523 | entry_map = netlbl_domhsh_search_def(domain); |
525 | if (entry_map == NULL || entry_map->type != NETLBL_NLTYPE_ADDRSELECT) | 524 | if (entry_map == NULL || entry_map->type != NETLBL_NLTYPE_ADDRSELECT) |
526 | goto remove_af4_failure; | 525 | goto remove_af4_failure; |
527 | 526 | ||
528 | spin_lock(&netlbl_domhsh_lock); | 527 | spin_lock(&netlbl_domhsh_lock); |
529 | entry_addr = netlbl_af4list_remove(addr->s_addr, mask->s_addr, | 528 | entry_addr = netlbl_af4list_remove(addr->s_addr, mask->s_addr, |
530 | &entry_map->type_def.addrsel->list4); | 529 | &entry_map->type_def.addrsel->list4); |
531 | spin_unlock(&netlbl_domhsh_lock); | 530 | spin_unlock(&netlbl_domhsh_lock); |
532 | 531 | ||
533 | if (entry_addr == NULL) | 532 | if (entry_addr == NULL) |
534 | goto remove_af4_failure; | 533 | goto remove_af4_failure; |
535 | netlbl_af4list_foreach_rcu(iter4, &entry_map->type_def.addrsel->list4) | 534 | netlbl_af4list_foreach_rcu(iter4, &entry_map->type_def.addrsel->list4) |
536 | goto remove_af4_single_addr; | 535 | goto remove_af4_single_addr; |
537 | #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) | 536 | #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) |
538 | netlbl_af6list_foreach_rcu(iter6, &entry_map->type_def.addrsel->list6) | 537 | netlbl_af6list_foreach_rcu(iter6, &entry_map->type_def.addrsel->list6) |
539 | goto remove_af4_single_addr; | 538 | goto remove_af4_single_addr; |
540 | #endif /* IPv6 */ | 539 | #endif /* IPv6 */ |
541 | /* the domain mapping is empty so remove it from the mapping table */ | 540 | /* the domain mapping is empty so remove it from the mapping table */ |
542 | netlbl_domhsh_remove_entry(entry_map, audit_info); | 541 | netlbl_domhsh_remove_entry(entry_map, audit_info); |
543 | 542 | ||
544 | remove_af4_single_addr: | 543 | remove_af4_single_addr: |
545 | rcu_read_unlock(); | 544 | rcu_read_unlock(); |
546 | /* yick, we can't use call_rcu here because we don't have a rcu head | 545 | /* yick, we can't use call_rcu here because we don't have a rcu head |
547 | * pointer but hopefully this should be a rare case so the pause | 546 | * pointer but hopefully this should be a rare case so the pause |
548 | * shouldn't be a problem */ | 547 | * shouldn't be a problem */ |
549 | synchronize_rcu(); | 548 | synchronize_rcu(); |
550 | entry = netlbl_domhsh_addr4_entry(entry_addr); | 549 | entry = netlbl_domhsh_addr4_entry(entry_addr); |
551 | cipso_v4_doi_putdef(entry->type_def.cipsov4); | 550 | cipso_v4_doi_putdef(entry->type_def.cipsov4); |
552 | kfree(entry); | 551 | kfree(entry); |
553 | return 0; | 552 | return 0; |
554 | 553 | ||
555 | remove_af4_failure: | 554 | remove_af4_failure: |
556 | rcu_read_unlock(); | 555 | rcu_read_unlock(); |
557 | return -ENOENT; | 556 | return -ENOENT; |
558 | } | 557 | } |
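The comment in netlbl_domhsh_remove_af4() points at why synchronize_rcu() is used: struct netlbl_domaddr4_map has no embedded rcu_head, so the writer has to wait out the grace period itself before calling kfree(). A minimal sketch of the two deferred-free shapes, using a hypothetical struct foo rather than any NetLabel type:

        #include <linux/list.h>
        #include <linux/rcupdate.h>
        #include <linux/slab.h>

        struct foo {                            /* hypothetical, not a NetLabel type */
                struct list_head list;
                struct rcu_head rcu;            /* enables kfree_rcu()/call_rcu() */
        };

        /* With an embedded rcu_head the writer never blocks; the caller is
         * assumed to hold the update-side lock across the unlink. */
        static void foo_del_async(struct foo *f)
        {
                list_del_rcu(&f->list);
                kfree_rcu(f, rcu);              /* freed once a grace period elapses */
        }

        /* Without an rcu_head (the situation above) the writer must wait for
         * the grace period itself and then free normally. */
        static void foo_del_sync(struct foo *f)
        {
                list_del_rcu(&f->list);
                synchronize_rcu();              /* all pre-existing readers are done */
                kfree(f);
        }

Embedding an rcu_head costs two pointers per object but keeps the writer non-blocking, which is why the unlabeled hash table entries later in this commit carry one and use kfree_rcu().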
559 | 558 | ||
560 | /** | 559 | /** |
561 | * netlbl_domhsh_remove - Removes an entry from the domain hash table | 560 | * netlbl_domhsh_remove - Removes an entry from the domain hash table |
562 | * @domain: the domain to remove | 561 | * @domain: the domain to remove |
563 | * @audit_info: NetLabel audit information | 562 | * @audit_info: NetLabel audit information |
564 | * | 563 | * |
565 | * Description: | 564 | * Description: |
566 | * Removes an entry from the domain hash table and handles any updates to the | 565 | * Removes an entry from the domain hash table and handles any updates to the |
567 | * lower level protocol handler (i.e. CIPSO). Returns zero on success, | 566 | * lower level protocol handler (i.e. CIPSO). Returns zero on success, |
568 | * negative on failure. | 567 | * negative on failure. |
569 | * | 568 | * |
570 | */ | 569 | */ |
571 | int netlbl_domhsh_remove(const char *domain, struct netlbl_audit *audit_info) | 570 | int netlbl_domhsh_remove(const char *domain, struct netlbl_audit *audit_info) |
572 | { | 571 | { |
573 | int ret_val; | 572 | int ret_val; |
574 | struct netlbl_dom_map *entry; | 573 | struct netlbl_dom_map *entry; |
575 | 574 | ||
576 | rcu_read_lock(); | 575 | rcu_read_lock(); |
577 | if (domain) | 576 | if (domain) |
578 | entry = netlbl_domhsh_search(domain); | 577 | entry = netlbl_domhsh_search(domain); |
579 | else | 578 | else |
580 | entry = netlbl_domhsh_search_def(domain); | 579 | entry = netlbl_domhsh_search_def(domain); |
581 | ret_val = netlbl_domhsh_remove_entry(entry, audit_info); | 580 | ret_val = netlbl_domhsh_remove_entry(entry, audit_info); |
582 | rcu_read_unlock(); | 581 | rcu_read_unlock(); |
583 | 582 | ||
584 | return ret_val; | 583 | return ret_val; |
585 | } | 584 | } |
586 | 585 | ||
587 | /** | 586 | /** |
588 | * netlbl_domhsh_remove_default - Removes the default entry from the table | 587 | * netlbl_domhsh_remove_default - Removes the default entry from the table |
589 | * @audit_info: NetLabel audit information | 588 | * @audit_info: NetLabel audit information |
590 | * | 589 | * |
591 | * Description: | 590 | * Description: |
592 | * Removes/resets the default entry for the domain hash table and handles any | 591 | * Removes/resets the default entry for the domain hash table and handles any |
593 | * updates to the lower level protocol handler (i.e. CIPSO). Returns zero on | 592 | * updates to the lower level protocol handler (i.e. CIPSO). Returns zero on |
594 | * success, non-zero on failure. | 593 | * success, non-zero on failure. |
595 | * | 594 | * |
596 | */ | 595 | */ |
597 | int netlbl_domhsh_remove_default(struct netlbl_audit *audit_info) | 596 | int netlbl_domhsh_remove_default(struct netlbl_audit *audit_info) |
598 | { | 597 | { |
599 | return netlbl_domhsh_remove(NULL, audit_info); | 598 | return netlbl_domhsh_remove(NULL, audit_info); |
600 | } | 599 | } |
601 | 600 | ||
602 | /** | 601 | /** |
603 | * netlbl_domhsh_getentry - Get an entry from the domain hash table | 602 | * netlbl_domhsh_getentry - Get an entry from the domain hash table |
604 | * @domain: the domain name to search for | 603 | * @domain: the domain name to search for |
605 | * | 604 | * |
606 | * Description: | 605 | * Description: |
607 | * Look through the domain hash table searching for an entry to match @domain, | 606 | * Look through the domain hash table searching for an entry to match @domain, |
608 | * return a pointer to a copy of the entry or NULL. The caller is responsible | 607 | * return a pointer to a copy of the entry or NULL. The caller is responsible |
609 | * for ensuring that rcu_read_[un]lock() is called. | 608 | * for ensuring that rcu_read_[un]lock() is called. |
610 | * | 609 | * |
611 | */ | 610 | */ |
612 | struct netlbl_dom_map *netlbl_domhsh_getentry(const char *domain) | 611 | struct netlbl_dom_map *netlbl_domhsh_getentry(const char *domain) |
613 | { | 612 | { |
614 | return netlbl_domhsh_search_def(domain); | 613 | return netlbl_domhsh_search_def(domain); |
615 | } | 614 | } |
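The locking contract spelled out in the comment (and repeated for the _af4/_af6 variants below) means a caller wraps the lookup and every use of the returned pointer in one read-side critical section. A hedged caller sketch; the field read and the NETLBL_NLTYPE_NONE fallback are purely illustrative:

        static u32 example_lookup_type(const char *domain)
        {
                struct netlbl_dom_map *entry;
                u32 type = NETLBL_NLTYPE_NONE;  /* illustrative fallback */

                rcu_read_lock();
                entry = netlbl_domhsh_getentry(domain);
                if (entry != NULL)
                        type = entry->type;     /* only valid before rcu_read_unlock() */
                rcu_read_unlock();

                return type;
        }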
616 | 615 | ||
617 | /** | 616 | /** |
618 | * netlbl_domhsh_getentry_af4 - Get an entry from the domain hash table | 617 | * netlbl_domhsh_getentry_af4 - Get an entry from the domain hash table |
619 | * @domain: the domain name to search for | 618 | * @domain: the domain name to search for |
620 | * @addr: the IP address to search for | 619 | * @addr: the IP address to search for |
621 | * | 620 | * |
622 | * Description: | 621 | * Description: |
623 | * Look through the domain hash table searching for an entry to match @domain | 622 | * Look through the domain hash table searching for an entry to match @domain |
624 | * and @addr, return a pointer to a copy of the entry or NULL. The caller is | 623 | * and @addr, return a pointer to a copy of the entry or NULL. The caller is |
625 | * responsible for ensuring that rcu_read_[un]lock() is called. | 624 | * responsible for ensuring that rcu_read_[un]lock() is called. |
626 | * | 625 | * |
627 | */ | 626 | */ |
628 | struct netlbl_domaddr4_map *netlbl_domhsh_getentry_af4(const char *domain, | 627 | struct netlbl_domaddr4_map *netlbl_domhsh_getentry_af4(const char *domain, |
629 | __be32 addr) | 628 | __be32 addr) |
630 | { | 629 | { |
631 | struct netlbl_dom_map *dom_iter; | 630 | struct netlbl_dom_map *dom_iter; |
632 | struct netlbl_af4list *addr_iter; | 631 | struct netlbl_af4list *addr_iter; |
633 | 632 | ||
634 | dom_iter = netlbl_domhsh_search_def(domain); | 633 | dom_iter = netlbl_domhsh_search_def(domain); |
635 | if (dom_iter == NULL) | 634 | if (dom_iter == NULL) |
636 | return NULL; | 635 | return NULL; |
637 | if (dom_iter->type != NETLBL_NLTYPE_ADDRSELECT) | 636 | if (dom_iter->type != NETLBL_NLTYPE_ADDRSELECT) |
638 | return NULL; | 637 | return NULL; |
639 | 638 | ||
640 | addr_iter = netlbl_af4list_search(addr, | 639 | addr_iter = netlbl_af4list_search(addr, |
641 | &dom_iter->type_def.addrsel->list4); | 640 | &dom_iter->type_def.addrsel->list4); |
642 | if (addr_iter == NULL) | 641 | if (addr_iter == NULL) |
643 | return NULL; | 642 | return NULL; |
644 | 643 | ||
645 | return netlbl_domhsh_addr4_entry(addr_iter); | 644 | return netlbl_domhsh_addr4_entry(addr_iter); |
646 | } | 645 | } |
647 | 646 | ||
648 | #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) | 647 | #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) |
649 | /** | 648 | /** |
650 | * netlbl_domhsh_getentry_af6 - Get an entry from the domain hash table | 649 | * netlbl_domhsh_getentry_af6 - Get an entry from the domain hash table |
651 | * @domain: the domain name to search for | 650 | * @domain: the domain name to search for |
652 | * @addr: the IP address to search for | 651 | * @addr: the IP address to search for |
653 | * | 652 | * |
654 | * Description: | 653 | * Description: |
655 | * Look through the domain hash table searching for an entry to match @domain | 654 | * Look through the domain hash table searching for an entry to match @domain |
656 | * and @addr, return a pointer to a copy of the entry or NULL. The caller is | 655 | * and @addr, return a pointer to a copy of the entry or NULL. The caller is |
657 | * responsible for ensuring that rcu_read_[un]lock() is called. | 656 | * responsible for ensuring that rcu_read_[un]lock() is called. |
658 | * | 657 | * |
659 | */ | 658 | */ |
660 | struct netlbl_domaddr6_map *netlbl_domhsh_getentry_af6(const char *domain, | 659 | struct netlbl_domaddr6_map *netlbl_domhsh_getentry_af6(const char *domain, |
661 | const struct in6_addr *addr) | 660 | const struct in6_addr *addr) |
662 | { | 661 | { |
663 | struct netlbl_dom_map *dom_iter; | 662 | struct netlbl_dom_map *dom_iter; |
664 | struct netlbl_af6list *addr_iter; | 663 | struct netlbl_af6list *addr_iter; |
665 | 664 | ||
666 | dom_iter = netlbl_domhsh_search_def(domain); | 665 | dom_iter = netlbl_domhsh_search_def(domain); |
667 | if (dom_iter == NULL) | 666 | if (dom_iter == NULL) |
668 | return NULL; | 667 | return NULL; |
669 | if (dom_iter->type != NETLBL_NLTYPE_ADDRSELECT) | 668 | if (dom_iter->type != NETLBL_NLTYPE_ADDRSELECT) |
670 | return NULL; | 669 | return NULL; |
671 | 670 | ||
672 | addr_iter = netlbl_af6list_search(addr, | 671 | addr_iter = netlbl_af6list_search(addr, |
673 | &dom_iter->type_def.addrsel->list6); | 672 | &dom_iter->type_def.addrsel->list6); |
674 | if (addr_iter == NULL) | 673 | if (addr_iter == NULL) |
675 | return NULL; | 674 | return NULL; |
676 | 675 | ||
677 | return netlbl_domhsh_addr6_entry(addr_iter); | 676 | return netlbl_domhsh_addr6_entry(addr_iter); |
678 | } | 677 | } |
679 | #endif /* IPv6 */ | 678 | #endif /* IPv6 */ |
680 | 679 | ||
681 | /** | 680 | /** |
682 | * netlbl_domhsh_walk - Iterate through the domain mapping hash table | 681 | * netlbl_domhsh_walk - Iterate through the domain mapping hash table |
683 | * @skip_bkt: the number of buckets to skip at the start | 682 | * @skip_bkt: the number of buckets to skip at the start |
684 | * @skip_chain: the number of entries to skip in the first iterated bucket | 683 | * @skip_chain: the number of entries to skip in the first iterated bucket |
685 | * @callback: callback for each entry | 684 | * @callback: callback for each entry |
686 | * @cb_arg: argument for the callback function | 685 | * @cb_arg: argument for the callback function |
687 | * | 686 | * |
688 | * Description: | 687 | * Description: |
689 | * Iterate over the domain mapping hash table, skipping the first @skip_bkt | 688 | * Iterate over the domain mapping hash table, skipping the first @skip_bkt |
690 | * buckets and @skip_chain entries. For each entry in the table call | 689 | * buckets and @skip_chain entries. For each entry in the table call |
691 | * @callback, if @callback returns a negative value stop 'walking' through the | 690 | * @callback, if @callback returns a negative value stop 'walking' through the |
692 | * table and return. Updates the values in @skip_bkt and @skip_chain on | 691 | * table and return. Updates the values in @skip_bkt and @skip_chain on |
693 | * return. Returns zero on success, negative values on failure. | 692 | * return. Returns zero on success, negative values on failure. |
694 | * | 693 | * |
695 | */ | 694 | */ |
696 | int netlbl_domhsh_walk(u32 *skip_bkt, | 695 | int netlbl_domhsh_walk(u32 *skip_bkt, |
697 | u32 *skip_chain, | 696 | u32 *skip_chain, |
698 | int (*callback) (struct netlbl_dom_map *entry, void *arg), | 697 | int (*callback) (struct netlbl_dom_map *entry, void *arg), |
699 | void *cb_arg) | 698 | void *cb_arg) |
700 | { | 699 | { |
701 | int ret_val = -ENOENT; | 700 | int ret_val = -ENOENT; |
702 | u32 iter_bkt; | 701 | u32 iter_bkt; |
703 | struct list_head *iter_list; | 702 | struct list_head *iter_list; |
704 | struct netlbl_dom_map *iter_entry; | 703 | struct netlbl_dom_map *iter_entry; |
705 | u32 chain_cnt = 0; | 704 | u32 chain_cnt = 0; |
706 | 705 | ||
707 | rcu_read_lock(); | 706 | rcu_read_lock(); |
708 | for (iter_bkt = *skip_bkt; | 707 | for (iter_bkt = *skip_bkt; |
709 | iter_bkt < rcu_dereference(netlbl_domhsh)->size; | 708 | iter_bkt < rcu_dereference(netlbl_domhsh)->size; |
710 | iter_bkt++, chain_cnt = 0) { | 709 | iter_bkt++, chain_cnt = 0) { |
711 | iter_list = &rcu_dereference(netlbl_domhsh)->tbl[iter_bkt]; | 710 | iter_list = &rcu_dereference(netlbl_domhsh)->tbl[iter_bkt]; |
712 | list_for_each_entry_rcu(iter_entry, iter_list, list) | 711 | list_for_each_entry_rcu(iter_entry, iter_list, list) |
713 | if (iter_entry->valid) { | 712 | if (iter_entry->valid) { |
714 | if (chain_cnt++ < *skip_chain) | 713 | if (chain_cnt++ < *skip_chain) |
715 | continue; | 714 | continue; |
716 | ret_val = callback(iter_entry, cb_arg); | 715 | ret_val = callback(iter_entry, cb_arg); |
717 | if (ret_val < 0) { | 716 | if (ret_val < 0) { |
718 | chain_cnt--; | 717 | chain_cnt--; |
719 | goto walk_return; | 718 | goto walk_return; |
720 | } | 719 | } |
721 | } | 720 | } |
722 | } | 721 | } |
723 | 722 | ||
724 | walk_return: | 723 | walk_return: |
725 | rcu_read_unlock(); | 724 | rcu_read_unlock(); |
726 | *skip_bkt = iter_bkt; | 725 | *skip_bkt = iter_bkt; |
727 | *skip_chain = chain_cnt; | 726 | *skip_chain = chain_cnt; |
728 | return ret_val; | 727 | return ret_val; |
729 | } | 728 | } |
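A sketch of driving netlbl_domhsh_walk(); the callback and the counter are hypothetical. Because @skip_bkt and @skip_chain are updated on return, a walk that a callback aborted with a negative value can later be resumed from the same cursor, which is the shape a netlink dump handler needs:

        static int count_valid_cb(struct netlbl_dom_map *entry, void *arg)
        {
                u32 *count = arg;

                (*count)++;
                return 0;                       /* a negative return stops the walk */
        }

        static u32 count_valid_entries(void)
        {
                u32 skip_bkt = 0;               /* cursor: first bucket to visit */
                u32 skip_chain = 0;             /* cursor: entries to skip in that bucket */
                u32 count = 0;

                netlbl_domhsh_walk(&skip_bkt, &skip_chain, count_valid_cb, &count);
                /* skip_bkt/skip_chain now record where the walk stopped */
                return count;
        }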
730 | 729 |
net/netlabel/netlabel_unlabeled.c
1 | /* | 1 | /* |
2 | * NetLabel Unlabeled Support | 2 | * NetLabel Unlabeled Support |
3 | * | 3 | * |
4 | * This file defines functions for dealing with unlabeled packets for the | 4 | * This file defines functions for dealing with unlabeled packets for the |
5 | * NetLabel system. The NetLabel system manages static and dynamic label | 5 | * NetLabel system. The NetLabel system manages static and dynamic label |
6 | * mappings for network protocols such as CIPSO and RIPSO. | 6 | * mappings for network protocols such as CIPSO and RIPSO. |
7 | * | 7 | * |
8 | * Author: Paul Moore <paul.moore@hp.com> | 8 | * Author: Paul Moore <paul.moore@hp.com> |
9 | * | 9 | * |
10 | */ | 10 | */ |
11 | 11 | ||
12 | /* | 12 | /* |
13 | * (c) Copyright Hewlett-Packard Development Company, L.P., 2006 - 2008 | 13 | * (c) Copyright Hewlett-Packard Development Company, L.P., 2006 - 2008 |
14 | * | 14 | * |
15 | * This program is free software; you can redistribute it and/or modify | 15 | * This program is free software; you can redistribute it and/or modify |
16 | * it under the terms of the GNU General Public License as published by | 16 | * it under the terms of the GNU General Public License as published by |
17 | * the Free Software Foundation; either version 2 of the License, or | 17 | * the Free Software Foundation; either version 2 of the License, or |
18 | * (at your option) any later version. | 18 | * (at your option) any later version. |
19 | * | 19 | * |
20 | * This program is distributed in the hope that it will be useful, | 20 | * This program is distributed in the hope that it will be useful, |
21 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | 21 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
22 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See | 22 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See |
23 | * the GNU General Public License for more details. | 23 | * the GNU General Public License for more details. |
24 | * | 24 | * |
25 | * You should have received a copy of the GNU General Public License | 25 | * You should have received a copy of the GNU General Public License |
26 | * along with this program; if not, write to the Free Software | 26 | * along with this program; if not, write to the Free Software |
27 | * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | 27 | * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA |
28 | * | 28 | * |
29 | */ | 29 | */ |
30 | 30 | ||
31 | #include <linux/types.h> | 31 | #include <linux/types.h> |
32 | #include <linux/rcupdate.h> | 32 | #include <linux/rcupdate.h> |
33 | #include <linux/list.h> | 33 | #include <linux/list.h> |
34 | #include <linux/spinlock.h> | 34 | #include <linux/spinlock.h> |
35 | #include <linux/socket.h> | 35 | #include <linux/socket.h> |
36 | #include <linux/string.h> | 36 | #include <linux/string.h> |
37 | #include <linux/skbuff.h> | 37 | #include <linux/skbuff.h> |
38 | #include <linux/audit.h> | 38 | #include <linux/audit.h> |
39 | #include <linux/in.h> | 39 | #include <linux/in.h> |
40 | #include <linux/in6.h> | 40 | #include <linux/in6.h> |
41 | #include <linux/ip.h> | 41 | #include <linux/ip.h> |
42 | #include <linux/ipv6.h> | 42 | #include <linux/ipv6.h> |
43 | #include <linux/notifier.h> | 43 | #include <linux/notifier.h> |
44 | #include <linux/netdevice.h> | 44 | #include <linux/netdevice.h> |
45 | #include <linux/security.h> | 45 | #include <linux/security.h> |
46 | #include <linux/slab.h> | 46 | #include <linux/slab.h> |
47 | #include <net/sock.h> | 47 | #include <net/sock.h> |
48 | #include <net/netlink.h> | 48 | #include <net/netlink.h> |
49 | #include <net/genetlink.h> | 49 | #include <net/genetlink.h> |
50 | #include <net/ip.h> | 50 | #include <net/ip.h> |
51 | #include <net/ipv6.h> | 51 | #include <net/ipv6.h> |
52 | #include <net/net_namespace.h> | 52 | #include <net/net_namespace.h> |
53 | #include <net/netlabel.h> | 53 | #include <net/netlabel.h> |
54 | #include <asm/bug.h> | 54 | #include <asm/bug.h> |
55 | #include <asm/atomic.h> | 55 | #include <asm/atomic.h> |
56 | 56 | ||
57 | #include "netlabel_user.h" | 57 | #include "netlabel_user.h" |
58 | #include "netlabel_addrlist.h" | 58 | #include "netlabel_addrlist.h" |
59 | #include "netlabel_domainhash.h" | 59 | #include "netlabel_domainhash.h" |
60 | #include "netlabel_unlabeled.h" | 60 | #include "netlabel_unlabeled.h" |
61 | #include "netlabel_mgmt.h" | 61 | #include "netlabel_mgmt.h" |
62 | 62 | ||
63 | /* NOTE: at present we always use init's network namespace since we don't | 63 | /* NOTE: at present we always use init's network namespace since we don't |
64 | * presently support different namespaces even though the majority of | 64 | * presently support different namespaces even though the majority of |
65 | * the functions in this file are "namespace safe" */ | 65 | * the functions in this file are "namespace safe" */ |
66 | 66 | ||
67 | /* The unlabeled connection hash table which we use to map network interfaces | 67 | /* The unlabeled connection hash table which we use to map network interfaces |
68 | * and addresses of unlabeled packets to a user specified secid value for the | 68 | * and addresses of unlabeled packets to a user specified secid value for the |
69 | * LSM. The hash table is used to lookup the network interface entry | 69 | * LSM. The hash table is used to lookup the network interface entry |
70 | * (struct netlbl_unlhsh_iface) and then the interface entry is used to | 70 | * (struct netlbl_unlhsh_iface) and then the interface entry is used to |
71 | * lookup an IP address match from an ordered list. If a network interface | 71 | * lookup an IP address match from an ordered list. If a network interface |
72 | * match can not be found in the hash table then the default entry | 72 | * match can not be found in the hash table then the default entry |
73 | * (netlbl_unlhsh_def) is used. The IP address entry list | 73 | * (netlbl_unlhsh_def) is used. The IP address entry list |
74 | * (struct netlbl_unlhsh_addr) is ordered such that the entries with a | 74 | * (struct netlbl_unlhsh_addr) is ordered such that the entries with a |
75 | * larger netmask come first. | 75 | * larger netmask come first. |
76 | */ | 76 | */ |
77 | struct netlbl_unlhsh_tbl { | 77 | struct netlbl_unlhsh_tbl { |
78 | struct list_head *tbl; | 78 | struct list_head *tbl; |
79 | u32 size; | 79 | u32 size; |
80 | }; | 80 | }; |
81 | #define netlbl_unlhsh_addr4_entry(iter) \ | 81 | #define netlbl_unlhsh_addr4_entry(iter) \ |
82 | container_of(iter, struct netlbl_unlhsh_addr4, list) | 82 | container_of(iter, struct netlbl_unlhsh_addr4, list) |
83 | struct netlbl_unlhsh_addr4 { | 83 | struct netlbl_unlhsh_addr4 { |
84 | u32 secid; | 84 | u32 secid; |
85 | 85 | ||
86 | struct netlbl_af4list list; | 86 | struct netlbl_af4list list; |
87 | struct rcu_head rcu; | 87 | struct rcu_head rcu; |
88 | }; | 88 | }; |
89 | #define netlbl_unlhsh_addr6_entry(iter) \ | 89 | #define netlbl_unlhsh_addr6_entry(iter) \ |
90 | container_of(iter, struct netlbl_unlhsh_addr6, list) | 90 | container_of(iter, struct netlbl_unlhsh_addr6, list) |
91 | struct netlbl_unlhsh_addr6 { | 91 | struct netlbl_unlhsh_addr6 { |
92 | u32 secid; | 92 | u32 secid; |
93 | 93 | ||
94 | struct netlbl_af6list list; | 94 | struct netlbl_af6list list; |
95 | struct rcu_head rcu; | 95 | struct rcu_head rcu; |
96 | }; | 96 | }; |
97 | struct netlbl_unlhsh_iface { | 97 | struct netlbl_unlhsh_iface { |
98 | int ifindex; | 98 | int ifindex; |
99 | struct list_head addr4_list; | 99 | struct list_head addr4_list; |
100 | struct list_head addr6_list; | 100 | struct list_head addr6_list; |
101 | 101 | ||
102 | u32 valid; | 102 | u32 valid; |
103 | struct list_head list; | 103 | struct list_head list; |
104 | struct rcu_head rcu; | 104 | struct rcu_head rcu; |
105 | }; | 105 | }; |
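The netlbl_unlhsh_addr4_entry()/addr6_entry() macros above are plain container_of(): the address lists chain the embedded struct netlbl_af4list/af6list nodes, and the macro steps back out to the structure that carries the secid. A minimal illustration with a hypothetical wrapper type:

        #include <linux/kernel.h>               /* container_of() */

        struct wrapper {                        /* hypothetical */
                u32 secid;
                struct netlbl_af4list list;     /* embedded node that the lists link */
        };

        static u32 wrapper_secid(struct netlbl_af4list *node)
        {
                /* recover the enclosing structure from the embedded member */
                struct wrapper *w = container_of(node, struct wrapper, list);

                return w->secid;
        }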
106 | 106 | ||
107 | /* Argument struct for netlbl_unlhsh_walk() */ | 107 | /* Argument struct for netlbl_unlhsh_walk() */ |
108 | struct netlbl_unlhsh_walk_arg { | 108 | struct netlbl_unlhsh_walk_arg { |
109 | struct netlink_callback *nl_cb; | 109 | struct netlink_callback *nl_cb; |
110 | struct sk_buff *skb; | 110 | struct sk_buff *skb; |
111 | u32 seq; | 111 | u32 seq; |
112 | }; | 112 | }; |
113 | 113 | ||
114 | /* Unlabeled connection hash table */ | 114 | /* Unlabeled connection hash table */ |
115 | /* updates should be so rare that having one spinlock for the entire | 115 | /* updates should be so rare that having one spinlock for the entire |
116 | * hash table should be okay */ | 116 | * hash table should be okay */ |
117 | static DEFINE_SPINLOCK(netlbl_unlhsh_lock); | 117 | static DEFINE_SPINLOCK(netlbl_unlhsh_lock); |
118 | #define netlbl_unlhsh_rcu_deref(p) \ | 118 | #define netlbl_unlhsh_rcu_deref(p) \ |
119 | rcu_dereference_check(p, rcu_read_lock_held() || \ | 119 | rcu_dereference_check(p, lockdep_is_held(&netlbl_unlhsh_lock)) |
120 | lockdep_is_held(&netlbl_unlhsh_lock)) | ||
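This hunk is the commit's actual change to the file: rcu_dereference_check() now includes rcu_read_lock_held() on its own, so the helper macro only needs to name the update-side lock for lockdep. The resulting pattern, sketched with hypothetical table and lock names:

        struct my_tbl {                         /* hypothetical */
                u32 size;
        };

        static DEFINE_SPINLOCK(my_update_lock);
        static struct my_tbl __rcu *my_table;   /* assumed published at init time */

        /* Accepted from inside rcu_read_lock() or while holding my_update_lock;
         * anything else triggers a lockdep complaint. */
        #define my_deref(p) \
                rcu_dereference_check(p, lockdep_is_held(&my_update_lock))

        static u32 my_table_size(void)
        {
                u32 size;

                rcu_read_lock();                        /* read side */
                size = my_deref(my_table)->size;
                rcu_read_unlock();

                return size;
        }

        static void my_table_replace(struct my_tbl *new_tbl)
        {
                spin_lock(&my_update_lock);             /* update side */
                rcu_assign_pointer(my_table, new_tbl);
                spin_unlock(&my_update_lock);
        }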
121 | static struct netlbl_unlhsh_tbl *netlbl_unlhsh = NULL; | 120 | static struct netlbl_unlhsh_tbl *netlbl_unlhsh = NULL; |
122 | static struct netlbl_unlhsh_iface *netlbl_unlhsh_def = NULL; | 121 | static struct netlbl_unlhsh_iface *netlbl_unlhsh_def = NULL; |
123 | 122 | ||
124 | /* Accept unlabeled packets flag */ | 123 | /* Accept unlabeled packets flag */ |
125 | static u8 netlabel_unlabel_acceptflg = 0; | 124 | static u8 netlabel_unlabel_acceptflg = 0; |
126 | 125 | ||
127 | /* NetLabel Generic NETLINK unlabeled family */ | 126 | /* NetLabel Generic NETLINK unlabeled family */ |
128 | static struct genl_family netlbl_unlabel_gnl_family = { | 127 | static struct genl_family netlbl_unlabel_gnl_family = { |
129 | .id = GENL_ID_GENERATE, | 128 | .id = GENL_ID_GENERATE, |
130 | .hdrsize = 0, | 129 | .hdrsize = 0, |
131 | .name = NETLBL_NLTYPE_UNLABELED_NAME, | 130 | .name = NETLBL_NLTYPE_UNLABELED_NAME, |
132 | .version = NETLBL_PROTO_VERSION, | 131 | .version = NETLBL_PROTO_VERSION, |
133 | .maxattr = NLBL_UNLABEL_A_MAX, | 132 | .maxattr = NLBL_UNLABEL_A_MAX, |
134 | }; | 133 | }; |
135 | 134 | ||
136 | /* NetLabel Netlink attribute policy */ | 135 | /* NetLabel Netlink attribute policy */ |
137 | static const struct nla_policy netlbl_unlabel_genl_policy[NLBL_UNLABEL_A_MAX + 1] = { | 136 | static const struct nla_policy netlbl_unlabel_genl_policy[NLBL_UNLABEL_A_MAX + 1] = { |
138 | [NLBL_UNLABEL_A_ACPTFLG] = { .type = NLA_U8 }, | 137 | [NLBL_UNLABEL_A_ACPTFLG] = { .type = NLA_U8 }, |
139 | [NLBL_UNLABEL_A_IPV6ADDR] = { .type = NLA_BINARY, | 138 | [NLBL_UNLABEL_A_IPV6ADDR] = { .type = NLA_BINARY, |
140 | .len = sizeof(struct in6_addr) }, | 139 | .len = sizeof(struct in6_addr) }, |
141 | [NLBL_UNLABEL_A_IPV6MASK] = { .type = NLA_BINARY, | 140 | [NLBL_UNLABEL_A_IPV6MASK] = { .type = NLA_BINARY, |
142 | .len = sizeof(struct in6_addr) }, | 141 | .len = sizeof(struct in6_addr) }, |
143 | [NLBL_UNLABEL_A_IPV4ADDR] = { .type = NLA_BINARY, | 142 | [NLBL_UNLABEL_A_IPV4ADDR] = { .type = NLA_BINARY, |
144 | .len = sizeof(struct in_addr) }, | 143 | .len = sizeof(struct in_addr) }, |
145 | [NLBL_UNLABEL_A_IPV4MASK] = { .type = NLA_BINARY, | 144 | [NLBL_UNLABEL_A_IPV4MASK] = { .type = NLA_BINARY, |
146 | .len = sizeof(struct in_addr) }, | 145 | .len = sizeof(struct in_addr) }, |
147 | [NLBL_UNLABEL_A_IFACE] = { .type = NLA_NUL_STRING, | 146 | [NLBL_UNLABEL_A_IFACE] = { .type = NLA_NUL_STRING, |
148 | .len = IFNAMSIZ - 1 }, | 147 | .len = IFNAMSIZ - 1 }, |
149 | [NLBL_UNLABEL_A_SECCTX] = { .type = NLA_BINARY } | 148 | [NLBL_UNLABEL_A_SECCTX] = { .type = NLA_BINARY } |
150 | }; | 149 | }; |
151 | 150 | ||
152 | /* | 151 | /* |
153 | * Unlabeled Connection Hash Table Functions | 152 | * Unlabeled Connection Hash Table Functions |
154 | */ | 153 | */ |
155 | 154 | ||
156 | /** | 155 | /** |
157 | * netlbl_unlhsh_free_iface - Frees an interface entry from the hash table | 156 | * netlbl_unlhsh_free_iface - Frees an interface entry from the hash table |
158 | * @entry: the entry's RCU field | 157 | * @entry: the entry's RCU field |
159 | * | 158 | * |
160 | * Description: | 159 | * Description: |
161 | * This function is designed to be used as a callback to the call_rcu() | 160 | * This function is designed to be used as a callback to the call_rcu() |
162 | * function so that memory allocated to a hash table interface entry can be | 161 | * function so that memory allocated to a hash table interface entry can be |
163 | * released safely. It is important to note that this function does not free | 162 | * released safely. It is important to note that this function does not free |
164 | * the IPv4 and IPv6 address lists contained as part of an interface entry. It | 163 | * the IPv4 and IPv6 address lists contained as part of an interface entry. It |
165 | * is up to the rest of the code to make sure an interface entry is only freed | 164 | * is up to the rest of the code to make sure an interface entry is only freed |
166 | * once its address lists are empty. | 165 | * once its address lists are empty. |
167 | * | 166 | * |
168 | */ | 167 | */ |
169 | static void netlbl_unlhsh_free_iface(struct rcu_head *entry) | 168 | static void netlbl_unlhsh_free_iface(struct rcu_head *entry) |
170 | { | 169 | { |
171 | struct netlbl_unlhsh_iface *iface; | 170 | struct netlbl_unlhsh_iface *iface; |
172 | struct netlbl_af4list *iter4; | 171 | struct netlbl_af4list *iter4; |
173 | struct netlbl_af4list *tmp4; | 172 | struct netlbl_af4list *tmp4; |
174 | #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) | 173 | #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) |
175 | struct netlbl_af6list *iter6; | 174 | struct netlbl_af6list *iter6; |
176 | struct netlbl_af6list *tmp6; | 175 | struct netlbl_af6list *tmp6; |
177 | #endif /* IPv6 */ | 176 | #endif /* IPv6 */ |
178 | 177 | ||
179 | iface = container_of(entry, struct netlbl_unlhsh_iface, rcu); | 178 | iface = container_of(entry, struct netlbl_unlhsh_iface, rcu); |
180 | 179 | ||
181 | /* no need for locks here since we are the only one with access to this | 180 | /* no need for locks here since we are the only one with access to this |
182 | * structure */ | 181 | * structure */ |
183 | 182 | ||
184 | netlbl_af4list_foreach_safe(iter4, tmp4, &iface->addr4_list) { | 183 | netlbl_af4list_foreach_safe(iter4, tmp4, &iface->addr4_list) { |
185 | netlbl_af4list_remove_entry(iter4); | 184 | netlbl_af4list_remove_entry(iter4); |
186 | kfree(netlbl_unlhsh_addr4_entry(iter4)); | 185 | kfree(netlbl_unlhsh_addr4_entry(iter4)); |
187 | } | 186 | } |
188 | #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) | 187 | #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) |
189 | netlbl_af6list_foreach_safe(iter6, tmp6, &iface->addr6_list) { | 188 | netlbl_af6list_foreach_safe(iter6, tmp6, &iface->addr6_list) { |
190 | netlbl_af6list_remove_entry(iter6); | 189 | netlbl_af6list_remove_entry(iter6); |
191 | kfree(netlbl_unlhsh_addr6_entry(iter6)); | 190 | kfree(netlbl_unlhsh_addr6_entry(iter6)); |
192 | } | 191 | } |
193 | #endif /* IPv6 */ | 192 | #endif /* IPv6 */ |
194 | kfree(iface); | 193 | kfree(iface); |
195 | } | 194 | } |
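netlbl_unlhsh_free_iface() has the standard call_rcu() callback shape: it receives only the rcu_head, recovers the enclosing interface entry with container_of(), and may then tear down the address lists without locks because no reader can still hold a reference. Condensed, for a hypothetical object:

        struct bar {                            /* hypothetical */
                struct list_head list;
                struct rcu_head rcu;
        };

        static void bar_free_rcu(struct rcu_head *head)
        {
                /* called after a grace period; step back to the full object */
                struct bar *b = container_of(head, struct bar, rcu);

                kfree(b);
        }

        static void bar_del(struct bar *b)      /* caller holds the update-side lock */
        {
                list_del_rcu(&b->list);
                call_rcu(&b->rcu, bar_free_rcu);
        }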
196 | 195 | ||
197 | /** | 196 | /** |
198 | * netlbl_unlhsh_hash - Hashing function for the hash table | 197 | * netlbl_unlhsh_hash - Hashing function for the hash table |
199 | * @ifindex: the network interface/device to hash | 198 | * @ifindex: the network interface/device to hash |
200 | * | 199 | * |
201 | * Description: | 200 | * Description: |
202 | * This is the hashing function for the unlabeled hash table, it returns the | 201 | * This is the hashing function for the unlabeled hash table, it returns the |
203 | * bucket number for the given device/interface. The caller is responsible for | 202 | * bucket number for the given device/interface. The caller is responsible for |
204 | * ensuring that the hash table is protected with either a RCU read lock or | 203 | * ensuring that the hash table is protected with either a RCU read lock or |
205 | * the hash table lock. | 204 | * the hash table lock. |
206 | * | 205 | * |
207 | */ | 206 | */ |
208 | static u32 netlbl_unlhsh_hash(int ifindex) | 207 | static u32 netlbl_unlhsh_hash(int ifindex) |
209 | { | 208 | { |
210 | return ifindex & (netlbl_unlhsh_rcu_deref(netlbl_unlhsh)->size - 1); | 209 | return ifindex & (netlbl_unlhsh_rcu_deref(netlbl_unlhsh)->size - 1); |
211 | } | 210 | } |
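Masking with (size - 1) is only equivalent to a modulo when the table size is a power of two, which the hash table setup is assumed to guarantee; the mask then simply keeps the low bits of the ifindex. The arithmetic in isolation, with an assumed 32-bucket table:

        static u32 example_bucket(int ifindex)
        {
                u32 size = 32;                  /* assumed power of two */

                /* size - 1 == 0x1f keeps the low 5 bits: ifindex 37 -> bucket 5 */
                return ifindex & (size - 1);
        }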
212 | 211 | ||
213 | /** | 212 | /** |
214 | * netlbl_unlhsh_search_iface - Search for a matching interface entry | 213 | * netlbl_unlhsh_search_iface - Search for a matching interface entry |
215 | * @ifindex: the network interface | 214 | * @ifindex: the network interface |
216 | * | 215 | * |
217 | * Description: | 216 | * Description: |
218 | * Searches the unlabeled connection hash table and returns a pointer to the | 217 | * Searches the unlabeled connection hash table and returns a pointer to the |
219 | * interface entry which matches @ifindex, otherwise NULL is returned. The | 218 | * interface entry which matches @ifindex, otherwise NULL is returned. The |
220 | * caller is responsible for ensuring that the hash table is protected with | 219 | * caller is responsible for ensuring that the hash table is protected with |
221 | * either a RCU read lock or the hash table lock. | 220 | * either a RCU read lock or the hash table lock. |
222 | * | 221 | * |
223 | */ | 222 | */ |
224 | static struct netlbl_unlhsh_iface *netlbl_unlhsh_search_iface(int ifindex) | 223 | static struct netlbl_unlhsh_iface *netlbl_unlhsh_search_iface(int ifindex) |
225 | { | 224 | { |
226 | u32 bkt; | 225 | u32 bkt; |
227 | struct list_head *bkt_list; | 226 | struct list_head *bkt_list; |
228 | struct netlbl_unlhsh_iface *iter; | 227 | struct netlbl_unlhsh_iface *iter; |
229 | 228 | ||
230 | bkt = netlbl_unlhsh_hash(ifindex); | 229 | bkt = netlbl_unlhsh_hash(ifindex); |
231 | bkt_list = &netlbl_unlhsh_rcu_deref(netlbl_unlhsh)->tbl[bkt]; | 230 | bkt_list = &netlbl_unlhsh_rcu_deref(netlbl_unlhsh)->tbl[bkt]; |
232 | list_for_each_entry_rcu(iter, bkt_list, list) | 231 | list_for_each_entry_rcu(iter, bkt_list, list) |
233 | if (iter->valid && iter->ifindex == ifindex) | 232 | if (iter->valid && iter->ifindex == ifindex) |
234 | return iter; | 233 | return iter; |
235 | 234 | ||
236 | return NULL; | 235 | return NULL; |
237 | } | 236 | } |
238 | 237 | ||
239 | /** | 238 | /** |
240 | * netlbl_unlhsh_add_addr4 - Add a new IPv4 address entry to the hash table | 239 | * netlbl_unlhsh_add_addr4 - Add a new IPv4 address entry to the hash table |
241 | * @iface: the associated interface entry | 240 | * @iface: the associated interface entry |
242 | * @addr: IPv4 address in network byte order | 241 | * @addr: IPv4 address in network byte order |
243 | * @mask: IPv4 address mask in network byte order | 242 | * @mask: IPv4 address mask in network byte order |
244 | * @secid: LSM secid value for entry | 243 | * @secid: LSM secid value for entry |
245 | * | 244 | * |
246 | * Description: | 245 | * Description: |
247 | * Add a new address entry into the unlabeled connection hash table using the | 246 | * Add a new address entry into the unlabeled connection hash table using the |
248 | * interface entry specified by @iface. On success zero is returned, otherwise | 247 | * interface entry specified by @iface. On success zero is returned, otherwise |
249 | * a negative value is returned. | 248 | * a negative value is returned. |
250 | * | 249 | * |
251 | */ | 250 | */ |
252 | static int netlbl_unlhsh_add_addr4(struct netlbl_unlhsh_iface *iface, | 251 | static int netlbl_unlhsh_add_addr4(struct netlbl_unlhsh_iface *iface, |
253 | const struct in_addr *addr, | 252 | const struct in_addr *addr, |
254 | const struct in_addr *mask, | 253 | const struct in_addr *mask, |
255 | u32 secid) | 254 | u32 secid) |
256 | { | 255 | { |
257 | int ret_val; | 256 | int ret_val; |
258 | struct netlbl_unlhsh_addr4 *entry; | 257 | struct netlbl_unlhsh_addr4 *entry; |
259 | 258 | ||
260 | entry = kzalloc(sizeof(*entry), GFP_ATOMIC); | 259 | entry = kzalloc(sizeof(*entry), GFP_ATOMIC); |
261 | if (entry == NULL) | 260 | if (entry == NULL) |
262 | return -ENOMEM; | 261 | return -ENOMEM; |
263 | 262 | ||
264 | entry->list.addr = addr->s_addr & mask->s_addr; | 263 | entry->list.addr = addr->s_addr & mask->s_addr; |
265 | entry->list.mask = mask->s_addr; | 264 | entry->list.mask = mask->s_addr; |
266 | entry->list.valid = 1; | 265 | entry->list.valid = 1; |
267 | entry->secid = secid; | 266 | entry->secid = secid; |
268 | 267 | ||
269 | spin_lock(&netlbl_unlhsh_lock); | 268 | spin_lock(&netlbl_unlhsh_lock); |
270 | ret_val = netlbl_af4list_add(&entry->list, &iface->addr4_list); | 269 | ret_val = netlbl_af4list_add(&entry->list, &iface->addr4_list); |
271 | spin_unlock(&netlbl_unlhsh_lock); | 270 | spin_unlock(&netlbl_unlhsh_lock); |
272 | 271 | ||
273 | if (ret_val != 0) | 272 | if (ret_val != 0) |
274 | kfree(entry); | 273 | kfree(entry); |
275 | return ret_val; | 274 | return ret_val; |
276 | } | 275 | } |
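netlbl_unlhsh_add_addr4() shows the usual RCU insertion discipline: allocate with GFP_ATOMIC (the caller sits inside rcu_read_lock()), initialise every field, and only then publish under the spinlock so readers never observe a half-built entry. The same ordering, stripped down around a hypothetical entry type:

        struct qux {                            /* hypothetical */
                u32 secid;
                u32 valid;
                struct list_head list;
        };

        static int publish_entry(u32 secid, struct list_head *head, spinlock_t *lock)
        {
                struct qux *entry;

                entry = kzalloc(sizeof(*entry), GFP_ATOMIC);    /* atomic context */
                if (entry == NULL)
                        return -ENOMEM;

                entry->secid = secid;           /* 1. initialise fully ... */
                entry->valid = 1;

                spin_lock(lock);                /* 2. ... then publish */
                list_add_tail_rcu(&entry->list, head);
                spin_unlock(lock);

                return 0;
        }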
277 | 276 | ||
278 | #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) | 277 | #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) |
279 | /** | 278 | /** |
280 | * netlbl_unlhsh_add_addr6 - Add a new IPv6 address entry to the hash table | 279 | * netlbl_unlhsh_add_addr6 - Add a new IPv6 address entry to the hash table |
281 | * @iface: the associated interface entry | 280 | * @iface: the associated interface entry |
282 | * @addr: IPv6 address in network byte order | 281 | * @addr: IPv6 address in network byte order |
283 | * @mask: IPv6 address mask in network byte order | 282 | * @mask: IPv6 address mask in network byte order |
284 | * @secid: LSM secid value for entry | 283 | * @secid: LSM secid value for entry |
285 | * | 284 | * |
286 | * Description: | 285 | * Description: |
287 | * Add a new address entry into the unlabeled connection hash table using the | 286 | * Add a new address entry into the unlabeled connection hash table using the |
288 | * interface entry specified by @iface. On success zero is returned, otherwise | 287 | * interface entry specified by @iface. On success zero is returned, otherwise |
289 | * a negative value is returned. | 288 | * a negative value is returned. |
290 | * | 289 | * |
291 | */ | 290 | */ |
292 | static int netlbl_unlhsh_add_addr6(struct netlbl_unlhsh_iface *iface, | 291 | static int netlbl_unlhsh_add_addr6(struct netlbl_unlhsh_iface *iface, |
293 | const struct in6_addr *addr, | 292 | const struct in6_addr *addr, |
294 | const struct in6_addr *mask, | 293 | const struct in6_addr *mask, |
295 | u32 secid) | 294 | u32 secid) |
296 | { | 295 | { |
297 | int ret_val; | 296 | int ret_val; |
298 | struct netlbl_unlhsh_addr6 *entry; | 297 | struct netlbl_unlhsh_addr6 *entry; |
299 | 298 | ||
300 | entry = kzalloc(sizeof(*entry), GFP_ATOMIC); | 299 | entry = kzalloc(sizeof(*entry), GFP_ATOMIC); |
301 | if (entry == NULL) | 300 | if (entry == NULL) |
302 | return -ENOMEM; | 301 | return -ENOMEM; |
303 | 302 | ||
304 | ipv6_addr_copy(&entry->list.addr, addr); | 303 | ipv6_addr_copy(&entry->list.addr, addr); |
305 | entry->list.addr.s6_addr32[0] &= mask->s6_addr32[0]; | 304 | entry->list.addr.s6_addr32[0] &= mask->s6_addr32[0]; |
306 | entry->list.addr.s6_addr32[1] &= mask->s6_addr32[1]; | 305 | entry->list.addr.s6_addr32[1] &= mask->s6_addr32[1]; |
307 | entry->list.addr.s6_addr32[2] &= mask->s6_addr32[2]; | 306 | entry->list.addr.s6_addr32[2] &= mask->s6_addr32[2]; |
308 | entry->list.addr.s6_addr32[3] &= mask->s6_addr32[3]; | 307 | entry->list.addr.s6_addr32[3] &= mask->s6_addr32[3]; |
309 | ipv6_addr_copy(&entry->list.mask, mask); | 308 | ipv6_addr_copy(&entry->list.mask, mask); |
310 | entry->list.valid = 1; | 309 | entry->list.valid = 1; |
311 | entry->secid = secid; | 310 | entry->secid = secid; |
312 | 311 | ||
313 | spin_lock(&netlbl_unlhsh_lock); | 312 | spin_lock(&netlbl_unlhsh_lock); |
314 | ret_val = netlbl_af6list_add(&entry->list, &iface->addr6_list); | 313 | ret_val = netlbl_af6list_add(&entry->list, &iface->addr6_list); |
315 | spin_unlock(&netlbl_unlhsh_lock); | 314 | spin_unlock(&netlbl_unlhsh_lock); |
316 | 315 | ||
317 | if (ret_val != 0) | 316 | if (ret_val != 0) |
318 | kfree(entry); | 317 | kfree(entry); |
319 | return 0; | 318 | return 0; |
320 | } | 319 | } |
321 | #endif /* IPv6 */ | 320 | #endif /* IPv6 */ |
322 | 321 | ||
323 | /** | 322 | /** |
324 | * netlbl_unlhsh_add_iface - Adds a new interface entry to the hash table | 323 | * netlbl_unlhsh_add_iface - Adds a new interface entry to the hash table |
325 | * @ifindex: network interface | 324 | * @ifindex: network interface |
326 | * | 325 | * |
327 | * Description: | 326 | * Description: |
328 | * Add a new, empty, interface entry into the unlabeled connection hash table. | 327 | * Add a new, empty, interface entry into the unlabeled connection hash table. |
329 | * On success a pointer to the new interface entry is returned, on failure NULL | 328 | * On success a pointer to the new interface entry is returned, on failure NULL |
330 | * is returned. | 329 | * is returned. |
331 | * | 330 | * |
332 | */ | 331 | */ |
333 | static struct netlbl_unlhsh_iface *netlbl_unlhsh_add_iface(int ifindex) | 332 | static struct netlbl_unlhsh_iface *netlbl_unlhsh_add_iface(int ifindex) |
334 | { | 333 | { |
335 | u32 bkt; | 334 | u32 bkt; |
336 | struct netlbl_unlhsh_iface *iface; | 335 | struct netlbl_unlhsh_iface *iface; |
337 | 336 | ||
338 | iface = kzalloc(sizeof(*iface), GFP_ATOMIC); | 337 | iface = kzalloc(sizeof(*iface), GFP_ATOMIC); |
339 | if (iface == NULL) | 338 | if (iface == NULL) |
340 | return NULL; | 339 | return NULL; |
341 | 340 | ||
342 | iface->ifindex = ifindex; | 341 | iface->ifindex = ifindex; |
343 | INIT_LIST_HEAD(&iface->addr4_list); | 342 | INIT_LIST_HEAD(&iface->addr4_list); |
344 | INIT_LIST_HEAD(&iface->addr6_list); | 343 | INIT_LIST_HEAD(&iface->addr6_list); |
345 | iface->valid = 1; | 344 | iface->valid = 1; |
346 | 345 | ||
347 | spin_lock(&netlbl_unlhsh_lock); | 346 | spin_lock(&netlbl_unlhsh_lock); |
348 | if (ifindex > 0) { | 347 | if (ifindex > 0) { |
349 | bkt = netlbl_unlhsh_hash(ifindex); | 348 | bkt = netlbl_unlhsh_hash(ifindex); |
350 | if (netlbl_unlhsh_search_iface(ifindex) != NULL) | 349 | if (netlbl_unlhsh_search_iface(ifindex) != NULL) |
351 | goto add_iface_failure; | 350 | goto add_iface_failure; |
352 | list_add_tail_rcu(&iface->list, | 351 | list_add_tail_rcu(&iface->list, |
353 | &netlbl_unlhsh_rcu_deref(netlbl_unlhsh)->tbl[bkt]); | 352 | &netlbl_unlhsh_rcu_deref(netlbl_unlhsh)->tbl[bkt]); |
354 | } else { | 353 | } else { |
355 | INIT_LIST_HEAD(&iface->list); | 354 | INIT_LIST_HEAD(&iface->list); |
356 | if (netlbl_unlhsh_rcu_deref(netlbl_unlhsh_def) != NULL) | 355 | if (netlbl_unlhsh_rcu_deref(netlbl_unlhsh_def) != NULL) |
357 | goto add_iface_failure; | 356 | goto add_iface_failure; |
358 | rcu_assign_pointer(netlbl_unlhsh_def, iface); | 357 | rcu_assign_pointer(netlbl_unlhsh_def, iface); |
359 | } | 358 | } |
360 | spin_unlock(&netlbl_unlhsh_lock); | 359 | spin_unlock(&netlbl_unlhsh_lock); |
361 | 360 | ||
362 | return iface; | 361 | return iface; |
363 | 362 | ||
364 | add_iface_failure: | 363 | add_iface_failure: |
365 | spin_unlock(&netlbl_unlhsh_lock); | 364 | spin_unlock(&netlbl_unlhsh_lock); |
366 | kfree(iface); | 365 | kfree(iface); |
367 | return NULL; | 366 | return NULL; |
368 | } | 367 | } |
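For the default (ifindex == 0) case there is no bucket list to insert into, so the single slot is published with rcu_assign_pointer(), which orders the initialising stores before the pointer becomes visible; readers pair it with rcu_dereference(), as netlbl_unlhsh_add() does below. Sketched with hypothetical names:

        struct def_entry {                      /* hypothetical */
                u32 secid;
        };

        static struct def_entry __rcu *default_entry;

        static void publish_default(struct def_entry *entry)   /* update lock held */
        {
                /* every field of *entry must already be set */
                rcu_assign_pointer(default_entry, entry);
        }

        static u32 read_default_secid(void)
        {
                struct def_entry *entry;
                u32 secid = 0;

                rcu_read_lock();
                entry = rcu_dereference(default_entry); /* pairs with the assign */
                if (entry != NULL)
                        secid = entry->secid;
                rcu_read_unlock();

                return secid;
        }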
369 | 368 | ||
370 | /** | 369 | /** |
371 | * netlbl_unlhsh_add - Adds a new entry to the unlabeled connection hash table | 370 | * netlbl_unlhsh_add - Adds a new entry to the unlabeled connection hash table |
372 | * @net: network namespace | 371 | * @net: network namespace |
373 | * @dev_name: interface name | 372 | * @dev_name: interface name |
374 | * @addr: IP address in network byte order | 373 | * @addr: IP address in network byte order |
375 | * @mask: address mask in network byte order | 374 | * @mask: address mask in network byte order |
376 | * @addr_len: length of address/mask (4 for IPv4, 16 for IPv6) | 375 | * @addr_len: length of address/mask (4 for IPv4, 16 for IPv6) |
377 | * @secid: LSM secid value for the entry | 376 | * @secid: LSM secid value for the entry |
378 | * @audit_info: NetLabel audit information | 377 | * @audit_info: NetLabel audit information |
379 | * | 378 | * |
380 | * Description: | 379 | * Description: |
381 | * Adds a new entry to the unlabeled connection hash table. Returns zero on | 380 | * Adds a new entry to the unlabeled connection hash table. Returns zero on |
382 | * success, negative values on failure. | 381 | * success, negative values on failure. |
383 | * | 382 | * |
384 | */ | 383 | */ |
385 | int netlbl_unlhsh_add(struct net *net, | 384 | int netlbl_unlhsh_add(struct net *net, |
386 | const char *dev_name, | 385 | const char *dev_name, |
387 | const void *addr, | 386 | const void *addr, |
388 | const void *mask, | 387 | const void *mask, |
389 | u32 addr_len, | 388 | u32 addr_len, |
390 | u32 secid, | 389 | u32 secid, |
391 | struct netlbl_audit *audit_info) | 390 | struct netlbl_audit *audit_info) |
392 | { | 391 | { |
393 | int ret_val; | 392 | int ret_val; |
394 | int ifindex; | 393 | int ifindex; |
395 | struct net_device *dev; | 394 | struct net_device *dev; |
396 | struct netlbl_unlhsh_iface *iface; | 395 | struct netlbl_unlhsh_iface *iface; |
397 | struct audit_buffer *audit_buf = NULL; | 396 | struct audit_buffer *audit_buf = NULL; |
398 | char *secctx = NULL; | 397 | char *secctx = NULL; |
399 | u32 secctx_len; | 398 | u32 secctx_len; |
400 | 399 | ||
401 | if (addr_len != sizeof(struct in_addr) && | 400 | if (addr_len != sizeof(struct in_addr) && |
402 | addr_len != sizeof(struct in6_addr)) | 401 | addr_len != sizeof(struct in6_addr)) |
403 | return -EINVAL; | 402 | return -EINVAL; |
404 | 403 | ||
405 | rcu_read_lock(); | 404 | rcu_read_lock(); |
406 | if (dev_name != NULL) { | 405 | if (dev_name != NULL) { |
407 | dev = dev_get_by_name_rcu(net, dev_name); | 406 | dev = dev_get_by_name_rcu(net, dev_name); |
408 | if (dev == NULL) { | 407 | if (dev == NULL) { |
409 | ret_val = -ENODEV; | 408 | ret_val = -ENODEV; |
410 | goto unlhsh_add_return; | 409 | goto unlhsh_add_return; |
411 | } | 410 | } |
412 | ifindex = dev->ifindex; | 411 | ifindex = dev->ifindex; |
413 | iface = netlbl_unlhsh_search_iface(ifindex); | 412 | iface = netlbl_unlhsh_search_iface(ifindex); |
414 | } else { | 413 | } else { |
415 | ifindex = 0; | 414 | ifindex = 0; |
416 | iface = rcu_dereference(netlbl_unlhsh_def); | 415 | iface = rcu_dereference(netlbl_unlhsh_def); |
417 | } | 416 | } |
418 | if (iface == NULL) { | 417 | if (iface == NULL) { |
419 | iface = netlbl_unlhsh_add_iface(ifindex); | 418 | iface = netlbl_unlhsh_add_iface(ifindex); |
420 | if (iface == NULL) { | 419 | if (iface == NULL) { |
421 | ret_val = -ENOMEM; | 420 | ret_val = -ENOMEM; |
422 | goto unlhsh_add_return; | 421 | goto unlhsh_add_return; |
423 | } | 422 | } |
424 | } | 423 | } |
425 | audit_buf = netlbl_audit_start_common(AUDIT_MAC_UNLBL_STCADD, | 424 | audit_buf = netlbl_audit_start_common(AUDIT_MAC_UNLBL_STCADD, |
426 | audit_info); | 425 | audit_info); |
427 | switch (addr_len) { | 426 | switch (addr_len) { |
428 | case sizeof(struct in_addr): { | 427 | case sizeof(struct in_addr): { |
429 | struct in_addr *addr4, *mask4; | 428 | struct in_addr *addr4, *mask4; |
430 | 429 | ||
431 | addr4 = (struct in_addr *)addr; | 430 | addr4 = (struct in_addr *)addr; |
432 | mask4 = (struct in_addr *)mask; | 431 | mask4 = (struct in_addr *)mask; |
433 | ret_val = netlbl_unlhsh_add_addr4(iface, addr4, mask4, secid); | 432 | ret_val = netlbl_unlhsh_add_addr4(iface, addr4, mask4, secid); |
434 | if (audit_buf != NULL) | 433 | if (audit_buf != NULL) |
435 | netlbl_af4list_audit_addr(audit_buf, 1, | 434 | netlbl_af4list_audit_addr(audit_buf, 1, |
436 | dev_name, | 435 | dev_name, |
437 | addr4->s_addr, | 436 | addr4->s_addr, |
438 | mask4->s_addr); | 437 | mask4->s_addr); |
439 | break; | 438 | break; |
440 | } | 439 | } |
441 | #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) | 440 | #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) |
442 | case sizeof(struct in6_addr): { | 441 | case sizeof(struct in6_addr): { |
443 | struct in6_addr *addr6, *mask6; | 442 | struct in6_addr *addr6, *mask6; |
444 | 443 | ||
445 | addr6 = (struct in6_addr *)addr; | 444 | addr6 = (struct in6_addr *)addr; |
446 | mask6 = (struct in6_addr *)mask; | 445 | mask6 = (struct in6_addr *)mask; |
447 | ret_val = netlbl_unlhsh_add_addr6(iface, addr6, mask6, secid); | 446 | ret_val = netlbl_unlhsh_add_addr6(iface, addr6, mask6, secid); |
448 | if (audit_buf != NULL) | 447 | if (audit_buf != NULL) |
449 | netlbl_af6list_audit_addr(audit_buf, 1, | 448 | netlbl_af6list_audit_addr(audit_buf, 1, |
450 | dev_name, | 449 | dev_name, |
451 | addr6, mask6); | 450 | addr6, mask6); |
452 | break; | 451 | break; |
453 | } | 452 | } |
454 | #endif /* IPv6 */ | 453 | #endif /* IPv6 */ |
455 | default: | 454 | default: |
456 | ret_val = -EINVAL; | 455 | ret_val = -EINVAL; |
457 | } | 456 | } |
458 | if (ret_val == 0) | 457 | if (ret_val == 0) |
459 | atomic_inc(&netlabel_mgmt_protocount); | 458 | atomic_inc(&netlabel_mgmt_protocount); |
460 | 459 | ||
461 | unlhsh_add_return: | 460 | unlhsh_add_return: |
462 | rcu_read_unlock(); | 461 | rcu_read_unlock(); |
463 | if (audit_buf != NULL) { | 462 | if (audit_buf != NULL) { |
464 | if (security_secid_to_secctx(secid, | 463 | if (security_secid_to_secctx(secid, |
465 | &secctx, | 464 | &secctx, |
466 | &secctx_len) == 0) { | 465 | &secctx_len) == 0) { |
467 | audit_log_format(audit_buf, " sec_obj=%s", secctx); | 466 | audit_log_format(audit_buf, " sec_obj=%s", secctx); |
468 | security_release_secctx(secctx, secctx_len); | 467 | security_release_secctx(secctx, secctx_len); |
469 | } | 468 | } |
470 | audit_log_format(audit_buf, " res=%u", ret_val == 0 ? 1 : 0); | 469 | audit_log_format(audit_buf, " res=%u", ret_val == 0 ? 1 : 0); |
471 | audit_log_end(audit_buf); | 470 | audit_log_end(audit_buf); |
472 | } | 471 | } |
473 | return ret_val; | 472 | return ret_val; |
474 | } | 473 | } |
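A hedged example of calling netlbl_unlhsh_add() for a single IPv4 host; the device name, secid and the zeroed audit structure are placeholders (a real caller would fill the audit info from the triggering netlink request):

        static int example_add_static_label(u32 some_secid)
        {
                struct in_addr addr = { .s_addr = htonl(0xc0a80001) };  /* 192.168.0.1 */
                struct in_addr mask = { .s_addr = htonl(0xffffffff) };  /* /32 */
                struct netlbl_audit audit_info = { 0 };                 /* placeholder */
                int ret;

                ret = netlbl_unlhsh_add(&init_net, "eth0", &addr, &mask,
                                        sizeof(addr), some_secid, &audit_info);
                if (ret != 0)
                        pr_warn("unlabeled static add failed: %d\n", ret);

                return ret;
        }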
475 | 474 | ||
476 | /** | 475 | /** |
477 | * netlbl_unlhsh_remove_addr4 - Remove an IPv4 address entry | 476 | * netlbl_unlhsh_remove_addr4 - Remove an IPv4 address entry |
478 | * @net: network namespace | 477 | * @net: network namespace |
479 | * @iface: interface entry | 478 | * @iface: interface entry |
480 | * @addr: IP address | 479 | * @addr: IP address |
481 | * @mask: IP address mask | 480 | * @mask: IP address mask |
482 | * @audit_info: NetLabel audit information | 481 | * @audit_info: NetLabel audit information |
483 | * | 482 | * |
484 | * Description: | 483 | * Description: |
485 | * Remove an IP address entry from the unlabeled connection hash table. | 484 | * Remove an IP address entry from the unlabeled connection hash table. |
486 | * Returns zero on success, negative values on failure. | 485 | * Returns zero on success, negative values on failure. |
487 | * | 486 | * |
488 | */ | 487 | */ |
489 | static int netlbl_unlhsh_remove_addr4(struct net *net, | 488 | static int netlbl_unlhsh_remove_addr4(struct net *net, |
490 | struct netlbl_unlhsh_iface *iface, | 489 | struct netlbl_unlhsh_iface *iface, |
491 | const struct in_addr *addr, | 490 | const struct in_addr *addr, |
492 | const struct in_addr *mask, | 491 | const struct in_addr *mask, |
493 | struct netlbl_audit *audit_info) | 492 | struct netlbl_audit *audit_info) |
494 | { | 493 | { |
495 | struct netlbl_af4list *list_entry; | 494 | struct netlbl_af4list *list_entry; |
496 | struct netlbl_unlhsh_addr4 *entry; | 495 | struct netlbl_unlhsh_addr4 *entry; |
497 | struct audit_buffer *audit_buf; | 496 | struct audit_buffer *audit_buf; |
498 | struct net_device *dev; | 497 | struct net_device *dev; |
499 | char *secctx; | 498 | char *secctx; |
500 | u32 secctx_len; | 499 | u32 secctx_len; |
501 | 500 | ||
502 | spin_lock(&netlbl_unlhsh_lock); | 501 | spin_lock(&netlbl_unlhsh_lock); |
503 | list_entry = netlbl_af4list_remove(addr->s_addr, mask->s_addr, | 502 | list_entry = netlbl_af4list_remove(addr->s_addr, mask->s_addr, |
504 | &iface->addr4_list); | 503 | &iface->addr4_list); |
505 | spin_unlock(&netlbl_unlhsh_lock); | 504 | spin_unlock(&netlbl_unlhsh_lock); |
506 | if (list_entry != NULL) | 505 | if (list_entry != NULL) |
507 | entry = netlbl_unlhsh_addr4_entry(list_entry); | 506 | entry = netlbl_unlhsh_addr4_entry(list_entry); |
508 | else | 507 | else |
509 | entry = NULL; | 508 | entry = NULL; |
510 | 509 | ||
511 | audit_buf = netlbl_audit_start_common(AUDIT_MAC_UNLBL_STCDEL, | 510 | audit_buf = netlbl_audit_start_common(AUDIT_MAC_UNLBL_STCDEL, |
512 | audit_info); | 511 | audit_info); |
513 | if (audit_buf != NULL) { | 512 | if (audit_buf != NULL) { |
514 | dev = dev_get_by_index(net, iface->ifindex); | 513 | dev = dev_get_by_index(net, iface->ifindex); |
515 | netlbl_af4list_audit_addr(audit_buf, 1, | 514 | netlbl_af4list_audit_addr(audit_buf, 1, |
516 | (dev != NULL ? dev->name : NULL), | 515 | (dev != NULL ? dev->name : NULL), |
517 | addr->s_addr, mask->s_addr); | 516 | addr->s_addr, mask->s_addr); |
518 | if (dev != NULL) | 517 | if (dev != NULL) |
519 | dev_put(dev); | 518 | dev_put(dev); |
520 | if (entry != NULL && | 519 | if (entry != NULL && |
521 | security_secid_to_secctx(entry->secid, | 520 | security_secid_to_secctx(entry->secid, |
522 | &secctx, &secctx_len) == 0) { | 521 | &secctx, &secctx_len) == 0) { |
523 | audit_log_format(audit_buf, " sec_obj=%s", secctx); | 522 | audit_log_format(audit_buf, " sec_obj=%s", secctx); |
524 | security_release_secctx(secctx, secctx_len); | 523 | security_release_secctx(secctx, secctx_len); |
525 | } | 524 | } |
526 | audit_log_format(audit_buf, " res=%u", entry != NULL ? 1 : 0); | 525 | audit_log_format(audit_buf, " res=%u", entry != NULL ? 1 : 0); |
527 | audit_log_end(audit_buf); | 526 | audit_log_end(audit_buf); |
528 | } | 527 | } |
529 | 528 | ||
530 | if (entry == NULL) | 529 | if (entry == NULL) |
531 | return -ENOENT; | 530 | return -ENOENT; |
532 | 531 | ||
533 | kfree_rcu(entry, rcu); | 532 | kfree_rcu(entry, rcu); |
534 | return 0; | 533 | return 0; |
535 | } | 534 | } |
536 | 535 | ||
537 | #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) | 536 | #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) |
538 | /** | 537 | /** |
539 | * netlbl_unlhsh_remove_addr6 - Remove an IPv6 address entry | 538 | * netlbl_unlhsh_remove_addr6 - Remove an IPv6 address entry |
540 | * @net: network namespace | 539 | * @net: network namespace |
541 | * @iface: interface entry | 540 | * @iface: interface entry |
542 | * @addr: IP address | 541 | * @addr: IP address |
543 | * @mask: IP address mask | 542 | * @mask: IP address mask |
544 | * @audit_info: NetLabel audit information | 543 | * @audit_info: NetLabel audit information |
545 | * | 544 | * |
546 | * Description: | 545 | * Description: |
547 | * Remove an IP address entry from the unlabeled connection hash table. | 546 | * Remove an IP address entry from the unlabeled connection hash table. |
548 | * Returns zero on success, negative values on failure. | 547 | * Returns zero on success, negative values on failure. |
549 | * | 548 | * |
550 | */ | 549 | */ |
551 | static int netlbl_unlhsh_remove_addr6(struct net *net, | 550 | static int netlbl_unlhsh_remove_addr6(struct net *net, |
552 | struct netlbl_unlhsh_iface *iface, | 551 | struct netlbl_unlhsh_iface *iface, |
553 | const struct in6_addr *addr, | 552 | const struct in6_addr *addr, |
554 | const struct in6_addr *mask, | 553 | const struct in6_addr *mask, |
555 | struct netlbl_audit *audit_info) | 554 | struct netlbl_audit *audit_info) |
556 | { | 555 | { |
557 | struct netlbl_af6list *list_entry; | 556 | struct netlbl_af6list *list_entry; |
558 | struct netlbl_unlhsh_addr6 *entry; | 557 | struct netlbl_unlhsh_addr6 *entry; |
559 | struct audit_buffer *audit_buf; | 558 | struct audit_buffer *audit_buf; |
560 | struct net_device *dev; | 559 | struct net_device *dev; |
561 | char *secctx; | 560 | char *secctx; |
562 | u32 secctx_len; | 561 | u32 secctx_len; |
563 | 562 | ||
564 | spin_lock(&netlbl_unlhsh_lock); | 563 | spin_lock(&netlbl_unlhsh_lock); |
565 | list_entry = netlbl_af6list_remove(addr, mask, &iface->addr6_list); | 564 | list_entry = netlbl_af6list_remove(addr, mask, &iface->addr6_list); |
566 | spin_unlock(&netlbl_unlhsh_lock); | 565 | spin_unlock(&netlbl_unlhsh_lock); |
567 | if (list_entry != NULL) | 566 | if (list_entry != NULL) |
568 | entry = netlbl_unlhsh_addr6_entry(list_entry); | 567 | entry = netlbl_unlhsh_addr6_entry(list_entry); |
569 | else | 568 | else |
570 | entry = NULL; | 569 | entry = NULL; |
571 | 570 | ||
572 | audit_buf = netlbl_audit_start_common(AUDIT_MAC_UNLBL_STCDEL, | 571 | audit_buf = netlbl_audit_start_common(AUDIT_MAC_UNLBL_STCDEL, |
573 | audit_info); | 572 | audit_info); |
574 | if (audit_buf != NULL) { | 573 | if (audit_buf != NULL) { |
575 | dev = dev_get_by_index(net, iface->ifindex); | 574 | dev = dev_get_by_index(net, iface->ifindex); |
576 | netlbl_af6list_audit_addr(audit_buf, 1, | 575 | netlbl_af6list_audit_addr(audit_buf, 1, |
577 | (dev != NULL ? dev->name : NULL), | 576 | (dev != NULL ? dev->name : NULL), |
578 | addr, mask); | 577 | addr, mask); |
579 | if (dev != NULL) | 578 | if (dev != NULL) |
580 | dev_put(dev); | 579 | dev_put(dev); |
581 | if (entry != NULL && | 580 | if (entry != NULL && |
582 | security_secid_to_secctx(entry->secid, | 581 | security_secid_to_secctx(entry->secid, |
583 | &secctx, &secctx_len) == 0) { | 582 | &secctx, &secctx_len) == 0) { |
584 | audit_log_format(audit_buf, " sec_obj=%s", secctx); | 583 | audit_log_format(audit_buf, " sec_obj=%s", secctx); |
585 | security_release_secctx(secctx, secctx_len); | 584 | security_release_secctx(secctx, secctx_len); |
586 | } | 585 | } |
587 | audit_log_format(audit_buf, " res=%u", entry != NULL ? 1 : 0); | 586 | audit_log_format(audit_buf, " res=%u", entry != NULL ? 1 : 0); |
588 | audit_log_end(audit_buf); | 587 | audit_log_end(audit_buf); |
589 | } | 588 | } |
590 | 589 | ||
591 | if (entry == NULL) | 590 | if (entry == NULL) |
592 | return -ENOENT; | 591 | return -ENOENT; |
593 | 592 | ||
594 | kfree_rcu(entry, rcu); | 593 | kfree_rcu(entry, rcu); |
595 | return 0; | 594 | return 0; |
596 | } | 595 | } |
597 | #endif /* IPv6 */ | 596 | #endif /* IPv6 */ |
598 | 597 | ||
599 | /** | 598 | /** |
600 | * netlbl_unlhsh_condremove_iface - Remove an interface entry | 599 | * netlbl_unlhsh_condremove_iface - Remove an interface entry |
601 | * @iface: the interface entry | 600 | * @iface: the interface entry |
602 | * | 601 | * |
603 | * Description: | 602 | * Description: |
604 | * Remove an interface entry from the unlabeled connection hash table if it is | 603 | * Remove an interface entry from the unlabeled connection hash table if it is |
605 | * empty. An interface entry is considered to be empty if there are no | 604 | * empty. An interface entry is considered to be empty if there are no |
606 | * address entries assigned to it. | 605 | * address entries assigned to it. |
607 | * | 606 | * |
608 | */ | 607 | */ |
609 | static void netlbl_unlhsh_condremove_iface(struct netlbl_unlhsh_iface *iface) | 608 | static void netlbl_unlhsh_condremove_iface(struct netlbl_unlhsh_iface *iface) |
610 | { | 609 | { |
611 | struct netlbl_af4list *iter4; | 610 | struct netlbl_af4list *iter4; |
612 | #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) | 611 | #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) |
613 | struct netlbl_af6list *iter6; | 612 | struct netlbl_af6list *iter6; |
614 | #endif /* IPv6 */ | 613 | #endif /* IPv6 */ |
615 | 614 | ||
616 | spin_lock(&netlbl_unlhsh_lock); | 615 | spin_lock(&netlbl_unlhsh_lock); |
617 | netlbl_af4list_foreach_rcu(iter4, &iface->addr4_list) | 616 | netlbl_af4list_foreach_rcu(iter4, &iface->addr4_list) |
618 | goto unlhsh_condremove_failure; | 617 | goto unlhsh_condremove_failure; |
619 | #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) | 618 | #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) |
620 | netlbl_af6list_foreach_rcu(iter6, &iface->addr6_list) | 619 | netlbl_af6list_foreach_rcu(iter6, &iface->addr6_list) |
621 | goto unlhsh_condremove_failure; | 620 | goto unlhsh_condremove_failure; |
622 | #endif /* IPv6 */ | 621 | #endif /* IPv6 */ |
623 | iface->valid = 0; | 622 | iface->valid = 0; |
624 | if (iface->ifindex > 0) | 623 | if (iface->ifindex > 0) |
625 | list_del_rcu(&iface->list); | 624 | list_del_rcu(&iface->list); |
626 | else | 625 | else |
627 | rcu_assign_pointer(netlbl_unlhsh_def, NULL); | 626 | rcu_assign_pointer(netlbl_unlhsh_def, NULL); |
628 | spin_unlock(&netlbl_unlhsh_lock); | 627 | spin_unlock(&netlbl_unlhsh_lock); |
629 | 628 | ||
630 | call_rcu(&iface->rcu, netlbl_unlhsh_free_iface); | 629 | call_rcu(&iface->rcu, netlbl_unlhsh_free_iface); |
631 | return; | 630 | return; |
632 | 631 | ||
633 | unlhsh_condremove_failure: | 632 | unlhsh_condremove_failure: |
634 | spin_unlock(&netlbl_unlhsh_lock); | 633 | spin_unlock(&netlbl_unlhsh_lock); |
635 | } | 634 | } |
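When the retired object owns more than a single allocation, the file uses call_rcu() with a dedicated callback instead of kfree_rcu(); netlbl_unlhsh_free_iface (defined earlier in this file) plays that role for interface entries. A hedged sketch of the callback shape, again with hypothetical demo_* names:

#include <linux/list.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>

struct demo_iface {
	struct list_head list;
	int ifindex;
	struct rcu_head rcu;
};

static void demo_free_iface(struct rcu_head *head)
{
	/* recover the enclosing object from its embedded rcu_head */
	struct demo_iface *iface = container_of(head, struct demo_iface, rcu);

	/* release whatever the entry owns (address lists etc.), then itself */
	kfree(iface);
}

static void demo_retire_iface(struct demo_iface *iface)
{
	/* caller has already unlinked @iface under the writer lock */
	call_rcu(&iface->rcu, demo_free_iface);
}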
636 | 635 | ||
637 | /** | 636 | /** |
638 | * netlbl_unlhsh_remove - Remove an entry from the unlabeled hash table | 637 | * netlbl_unlhsh_remove - Remove an entry from the unlabeled hash table |
639 | * @net: network namespace | 638 | * @net: network namespace |
640 | * @dev_name: interface name | 639 | * @dev_name: interface name |
641 | * @addr: IP address in network byte order | 640 | * @addr: IP address in network byte order |
642 | * @mask: address mask in network byte order | 641 | * @mask: address mask in network byte order |
643 | * @addr_len: length of address/mask (4 for IPv4, 16 for IPv6) | 642 | * @addr_len: length of address/mask (4 for IPv4, 16 for IPv6) |
644 | * @audit_info: NetLabel audit information | 643 | * @audit_info: NetLabel audit information |
645 | * | 644 | * |
646 | * Description: | 645 | * Description: |
647 | * Removes an existing entry from the unlabeled connection hash table. | 646 | * Removes an existing entry from the unlabeled connection hash table. |
648 | * Returns zero on success, negative values on failure. | 647 | * Returns zero on success, negative values on failure. |
649 | * | 648 | * |
650 | */ | 649 | */ |
651 | int netlbl_unlhsh_remove(struct net *net, | 650 | int netlbl_unlhsh_remove(struct net *net, |
652 | const char *dev_name, | 651 | const char *dev_name, |
653 | const void *addr, | 652 | const void *addr, |
654 | const void *mask, | 653 | const void *mask, |
655 | u32 addr_len, | 654 | u32 addr_len, |
656 | struct netlbl_audit *audit_info) | 655 | struct netlbl_audit *audit_info) |
657 | { | 656 | { |
658 | int ret_val; | 657 | int ret_val; |
659 | struct net_device *dev; | 658 | struct net_device *dev; |
660 | struct netlbl_unlhsh_iface *iface; | 659 | struct netlbl_unlhsh_iface *iface; |
661 | 660 | ||
662 | if (addr_len != sizeof(struct in_addr) && | 661 | if (addr_len != sizeof(struct in_addr) && |
663 | addr_len != sizeof(struct in6_addr)) | 662 | addr_len != sizeof(struct in6_addr)) |
664 | return -EINVAL; | 663 | return -EINVAL; |
665 | 664 | ||
666 | rcu_read_lock(); | 665 | rcu_read_lock(); |
667 | if (dev_name != NULL) { | 666 | if (dev_name != NULL) { |
668 | dev = dev_get_by_name_rcu(net, dev_name); | 667 | dev = dev_get_by_name_rcu(net, dev_name); |
669 | if (dev == NULL) { | 668 | if (dev == NULL) { |
670 | ret_val = -ENODEV; | 669 | ret_val = -ENODEV; |
671 | goto unlhsh_remove_return; | 670 | goto unlhsh_remove_return; |
672 | } | 671 | } |
673 | iface = netlbl_unlhsh_search_iface(dev->ifindex); | 672 | iface = netlbl_unlhsh_search_iface(dev->ifindex); |
674 | } else | 673 | } else |
675 | iface = rcu_dereference(netlbl_unlhsh_def); | 674 | iface = rcu_dereference(netlbl_unlhsh_def); |
676 | if (iface == NULL) { | 675 | if (iface == NULL) { |
677 | ret_val = -ENOENT; | 676 | ret_val = -ENOENT; |
678 | goto unlhsh_remove_return; | 677 | goto unlhsh_remove_return; |
679 | } | 678 | } |
680 | switch (addr_len) { | 679 | switch (addr_len) { |
681 | case sizeof(struct in_addr): | 680 | case sizeof(struct in_addr): |
682 | ret_val = netlbl_unlhsh_remove_addr4(net, | 681 | ret_val = netlbl_unlhsh_remove_addr4(net, |
683 | iface, addr, mask, | 682 | iface, addr, mask, |
684 | audit_info); | 683 | audit_info); |
685 | break; | 684 | break; |
686 | #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) | 685 | #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) |
687 | case sizeof(struct in6_addr): | 686 | case sizeof(struct in6_addr): |
688 | ret_val = netlbl_unlhsh_remove_addr6(net, | 687 | ret_val = netlbl_unlhsh_remove_addr6(net, |
689 | iface, addr, mask, | 688 | iface, addr, mask, |
690 | audit_info); | 689 | audit_info); |
691 | break; | 690 | break; |
692 | #endif /* IPv6 */ | 691 | #endif /* IPv6 */ |
693 | default: | 692 | default: |
694 | ret_val = -EINVAL; | 693 | ret_val = -EINVAL; |
695 | } | 694 | } |
696 | if (ret_val == 0) { | 695 | if (ret_val == 0) { |
697 | netlbl_unlhsh_condremove_iface(iface); | 696 | netlbl_unlhsh_condremove_iface(iface); |
698 | atomic_dec(&netlabel_mgmt_protocount); | 697 | atomic_dec(&netlabel_mgmt_protocount); |
699 | } | 698 | } |
700 | 699 | ||
701 | unlhsh_remove_return: | 700 | unlhsh_remove_return: |
702 | rcu_read_unlock(); | 701 | rcu_read_unlock(); |
703 | return ret_val; | 702 | return ret_val; |
704 | } | 703 | } |
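netlbl_unlhsh_remove() is a typical RCU reader: the whole lookup runs under rcu_read_lock(), dev_get_by_name_rcu() is valid only inside that section, and the default entry is fetched with plain rcu_dereference(). The commit itself concerns the lockdep-checked variant: because rcu_dereference_check() now folds in rcu_read_lock_held() automatically, an accessor that may also be called by updaters only has to spell out the extra condition. A hedged sketch with a hypothetical demo_def pointer (same shape as netlbl_unlhsh_def):

#include <linux/lockdep.h>
#include <linux/rcupdate.h>
#include <linux/spinlock.h>

struct demo_iface;

static struct demo_iface __rcu *demo_def;
static DEFINE_SPINLOCK(demo_lock);

/* reader: must run under rcu_read_lock() */
static struct demo_iface *demo_def_rcu(void)
{
	return rcu_dereference(demo_def);
}

/* reader or updater: rcu_read_lock_held() is implied by the _check() form,
 * so only the additional lockdep condition needs to be listed */
static struct demo_iface *demo_def_any(void)
{
	return rcu_dereference_check(demo_def,
				     lockdep_is_held(&demo_lock));
}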
705 | 704 | ||
706 | /* | 705 | /* |
707 | * General Helper Functions | 706 | * General Helper Functions |
708 | */ | 707 | */ |
709 | 708 | ||
710 | /** | 709 | /** |
711 | * netlbl_unlhsh_netdev_handler - Network device notification handler | 710 | * netlbl_unlhsh_netdev_handler - Network device notification handler |
712 | * @this: notifier block | 711 | * @this: notifier block |
713 | * @event: the event | 712 | * @event: the event |
714 | * @ptr: the network device (cast to void) | 713 | * @ptr: the network device (cast to void) |
715 | * | 714 | * |
716 | * Description: | 715 | * Description: |
717 | * Handle network device events, although at present all we care about is a | 716 | * Handle network device events, although at present all we care about is a |
718 | * network device going away. In the case of a device going away we clear any | 717 | * network device going away. In the case of a device going away we clear any |
719 | * related entries from the unlabeled connection hash table. | 718 | * related entries from the unlabeled connection hash table. |
720 | * | 719 | * |
721 | */ | 720 | */ |
722 | static int netlbl_unlhsh_netdev_handler(struct notifier_block *this, | 721 | static int netlbl_unlhsh_netdev_handler(struct notifier_block *this, |
723 | unsigned long event, | 722 | unsigned long event, |
724 | void *ptr) | 723 | void *ptr) |
725 | { | 724 | { |
726 | struct net_device *dev = ptr; | 725 | struct net_device *dev = ptr; |
727 | struct netlbl_unlhsh_iface *iface = NULL; | 726 | struct netlbl_unlhsh_iface *iface = NULL; |
728 | 727 | ||
729 | if (!net_eq(dev_net(dev), &init_net)) | 728 | if (!net_eq(dev_net(dev), &init_net)) |
730 | return NOTIFY_DONE; | 729 | return NOTIFY_DONE; |
731 | 730 | ||
732 | /* XXX - should this be a check for NETDEV_DOWN or _UNREGISTER? */ | 731 | /* XXX - should this be a check for NETDEV_DOWN or _UNREGISTER? */ |
733 | if (event == NETDEV_DOWN) { | 732 | if (event == NETDEV_DOWN) { |
734 | spin_lock(&netlbl_unlhsh_lock); | 733 | spin_lock(&netlbl_unlhsh_lock); |
735 | iface = netlbl_unlhsh_search_iface(dev->ifindex); | 734 | iface = netlbl_unlhsh_search_iface(dev->ifindex); |
736 | if (iface != NULL && iface->valid) { | 735 | if (iface != NULL && iface->valid) { |
737 | iface->valid = 0; | 736 | iface->valid = 0; |
738 | list_del_rcu(&iface->list); | 737 | list_del_rcu(&iface->list); |
739 | } else | 738 | } else |
740 | iface = NULL; | 739 | iface = NULL; |
741 | spin_unlock(&netlbl_unlhsh_lock); | 740 | spin_unlock(&netlbl_unlhsh_lock); |
742 | } | 741 | } |
743 | 742 | ||
744 | if (iface != NULL) | 743 | if (iface != NULL) |
745 | call_rcu(&iface->rcu, netlbl_unlhsh_free_iface); | 744 | call_rcu(&iface->rcu, netlbl_unlhsh_free_iface); |
746 | 745 | ||
747 | return NOTIFY_DONE; | 746 | return NOTIFY_DONE; |
748 | } | 747 | } |
749 | 748 | ||
750 | /** | 749 | /** |
751 | * netlbl_unlabel_acceptflg_set - Set the unlabeled accept flag | 750 | * netlbl_unlabel_acceptflg_set - Set the unlabeled accept flag |
752 | * @value: desired value | 751 | * @value: desired value |
753 | * @audit_info: NetLabel audit information | 752 | * @audit_info: NetLabel audit information |
754 | * | 753 | * |
755 | * Description: | 754 | * Description: |
756 | * Set the value of the unlabeled accept flag to @value. | 755 | * Set the value of the unlabeled accept flag to @value. |
757 | * | 756 | * |
758 | */ | 757 | */ |
759 | static void netlbl_unlabel_acceptflg_set(u8 value, | 758 | static void netlbl_unlabel_acceptflg_set(u8 value, |
760 | struct netlbl_audit *audit_info) | 759 | struct netlbl_audit *audit_info) |
761 | { | 760 | { |
762 | struct audit_buffer *audit_buf; | 761 | struct audit_buffer *audit_buf; |
763 | u8 old_val; | 762 | u8 old_val; |
764 | 763 | ||
765 | old_val = netlabel_unlabel_acceptflg; | 764 | old_val = netlabel_unlabel_acceptflg; |
766 | netlabel_unlabel_acceptflg = value; | 765 | netlabel_unlabel_acceptflg = value; |
767 | audit_buf = netlbl_audit_start_common(AUDIT_MAC_UNLBL_ALLOW, | 766 | audit_buf = netlbl_audit_start_common(AUDIT_MAC_UNLBL_ALLOW, |
768 | audit_info); | 767 | audit_info); |
769 | if (audit_buf != NULL) { | 768 | if (audit_buf != NULL) { |
770 | audit_log_format(audit_buf, | 769 | audit_log_format(audit_buf, |
771 | " unlbl_accept=%u old=%u", value, old_val); | 770 | " unlbl_accept=%u old=%u", value, old_val); |
772 | audit_log_end(audit_buf); | 771 | audit_log_end(audit_buf); |
773 | } | 772 | } |
774 | } | 773 | } |
775 | 774 | ||
776 | /** | 775 | /** |
777 | * netlbl_unlabel_addrinfo_get - Get the IPv4/6 address information | 776 | * netlbl_unlabel_addrinfo_get - Get the IPv4/6 address information |
778 | * @info: the Generic NETLINK info block | 777 | * @info: the Generic NETLINK info block |
779 | * @addr: the IP address | 778 | * @addr: the IP address |
780 | * @mask: the IP address mask | 779 | * @mask: the IP address mask |
781 | * @len: the address length | 780 | * @len: the address length |
782 | * | 781 | * |
783 | * Description: | 782 | * Description: |
784 | * Examine the Generic NETLINK message and extract the IP address information. | 783 | * Examine the Generic NETLINK message and extract the IP address information. |
785 | * Returns zero on success, negative values on failure. | 784 | * Returns zero on success, negative values on failure. |
786 | * | 785 | * |
787 | */ | 786 | */ |
788 | static int netlbl_unlabel_addrinfo_get(struct genl_info *info, | 787 | static int netlbl_unlabel_addrinfo_get(struct genl_info *info, |
789 | void **addr, | 788 | void **addr, |
790 | void **mask, | 789 | void **mask, |
791 | u32 *len) | 790 | u32 *len) |
792 | { | 791 | { |
793 | u32 addr_len; | 792 | u32 addr_len; |
794 | 793 | ||
795 | if (info->attrs[NLBL_UNLABEL_A_IPV4ADDR]) { | 794 | if (info->attrs[NLBL_UNLABEL_A_IPV4ADDR]) { |
796 | addr_len = nla_len(info->attrs[NLBL_UNLABEL_A_IPV4ADDR]); | 795 | addr_len = nla_len(info->attrs[NLBL_UNLABEL_A_IPV4ADDR]); |
797 | if (addr_len != sizeof(struct in_addr) && | 796 | if (addr_len != sizeof(struct in_addr) && |
798 | addr_len != nla_len(info->attrs[NLBL_UNLABEL_A_IPV4MASK])) | 797 | addr_len != nla_len(info->attrs[NLBL_UNLABEL_A_IPV4MASK])) |
799 | return -EINVAL; | 798 | return -EINVAL; |
800 | *len = addr_len; | 799 | *len = addr_len; |
801 | *addr = nla_data(info->attrs[NLBL_UNLABEL_A_IPV4ADDR]); | 800 | *addr = nla_data(info->attrs[NLBL_UNLABEL_A_IPV4ADDR]); |
802 | *mask = nla_data(info->attrs[NLBL_UNLABEL_A_IPV4MASK]); | 801 | *mask = nla_data(info->attrs[NLBL_UNLABEL_A_IPV4MASK]); |
803 | return 0; | 802 | return 0; |
804 | } else if (info->attrs[NLBL_UNLABEL_A_IPV6ADDR]) { | 803 | } else if (info->attrs[NLBL_UNLABEL_A_IPV6ADDR]) { |
805 | addr_len = nla_len(info->attrs[NLBL_UNLABEL_A_IPV6ADDR]); | 804 | addr_len = nla_len(info->attrs[NLBL_UNLABEL_A_IPV6ADDR]); |
806 | if (addr_len != sizeof(struct in6_addr) && | 805 | if (addr_len != sizeof(struct in6_addr) && |
807 | addr_len != nla_len(info->attrs[NLBL_UNLABEL_A_IPV6MASK])) | 806 | addr_len != nla_len(info->attrs[NLBL_UNLABEL_A_IPV6MASK])) |
808 | return -EINVAL; | 807 | return -EINVAL; |
809 | *len = addr_len; | 808 | *len = addr_len; |
810 | *addr = nla_data(info->attrs[NLBL_UNLABEL_A_IPV6ADDR]); | 809 | *addr = nla_data(info->attrs[NLBL_UNLABEL_A_IPV6ADDR]); |
811 | *mask = nla_data(info->attrs[NLBL_UNLABEL_A_IPV6MASK]); | 810 | *mask = nla_data(info->attrs[NLBL_UNLABEL_A_IPV6MASK]); |
812 | return 0; | 811 | return 0; |
813 | } | 812 | } |
814 | 813 | ||
815 | return -EINVAL; | 814 | return -EINVAL; |
816 | } | 815 | } |
817 | 816 | ||
818 | /* | 817 | /* |
819 | * NetLabel Command Handlers | 818 | * NetLabel Command Handlers |
820 | */ | 819 | */ |
821 | 820 | ||
822 | /** | 821 | /** |
823 | * netlbl_unlabel_accept - Handle an ACCEPT message | 822 | * netlbl_unlabel_accept - Handle an ACCEPT message |
824 | * @skb: the NETLINK buffer | 823 | * @skb: the NETLINK buffer |
825 | * @info: the Generic NETLINK info block | 824 | * @info: the Generic NETLINK info block |
826 | * | 825 | * |
827 | * Description: | 826 | * Description: |
828 | * Process a user generated ACCEPT message and set the accept flag accordingly. | 827 | * Process a user generated ACCEPT message and set the accept flag accordingly. |
829 | * Returns zero on success, negative values on failure. | 828 | * Returns zero on success, negative values on failure. |
830 | * | 829 | * |
831 | */ | 830 | */ |
832 | static int netlbl_unlabel_accept(struct sk_buff *skb, struct genl_info *info) | 831 | static int netlbl_unlabel_accept(struct sk_buff *skb, struct genl_info *info) |
833 | { | 832 | { |
834 | u8 value; | 833 | u8 value; |
835 | struct netlbl_audit audit_info; | 834 | struct netlbl_audit audit_info; |
836 | 835 | ||
837 | if (info->attrs[NLBL_UNLABEL_A_ACPTFLG]) { | 836 | if (info->attrs[NLBL_UNLABEL_A_ACPTFLG]) { |
838 | value = nla_get_u8(info->attrs[NLBL_UNLABEL_A_ACPTFLG]); | 837 | value = nla_get_u8(info->attrs[NLBL_UNLABEL_A_ACPTFLG]); |
839 | if (value == 1 || value == 0) { | 838 | if (value == 1 || value == 0) { |
840 | netlbl_netlink_auditinfo(skb, &audit_info); | 839 | netlbl_netlink_auditinfo(skb, &audit_info); |
841 | netlbl_unlabel_acceptflg_set(value, &audit_info); | 840 | netlbl_unlabel_acceptflg_set(value, &audit_info); |
842 | return 0; | 841 | return 0; |
843 | } | 842 | } |
844 | } | 843 | } |
845 | 844 | ||
846 | return -EINVAL; | 845 | return -EINVAL; |
847 | } | 846 | } |
848 | 847 | ||
849 | /** | 848 | /** |
850 | * netlbl_unlabel_list - Handle a LIST message | 849 | * netlbl_unlabel_list - Handle a LIST message |
851 | * @skb: the NETLINK buffer | 850 | * @skb: the NETLINK buffer |
852 | * @info: the Generic NETLINK info block | 851 | * @info: the Generic NETLINK info block |
853 | * | 852 | * |
854 | * Description: | 853 | * Description: |
855 | * Process a user generated LIST message and respond with the current status. | 854 | * Process a user generated LIST message and respond with the current status. |
856 | * Returns zero on success, negative values on failure. | 855 | * Returns zero on success, negative values on failure. |
857 | * | 856 | * |
858 | */ | 857 | */ |
859 | static int netlbl_unlabel_list(struct sk_buff *skb, struct genl_info *info) | 858 | static int netlbl_unlabel_list(struct sk_buff *skb, struct genl_info *info) |
860 | { | 859 | { |
861 | int ret_val = -EINVAL; | 860 | int ret_val = -EINVAL; |
862 | struct sk_buff *ans_skb; | 861 | struct sk_buff *ans_skb; |
863 | void *data; | 862 | void *data; |
864 | 863 | ||
865 | ans_skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); | 864 | ans_skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); |
866 | if (ans_skb == NULL) | 865 | if (ans_skb == NULL) |
867 | goto list_failure; | 866 | goto list_failure; |
868 | data = genlmsg_put_reply(ans_skb, info, &netlbl_unlabel_gnl_family, | 867 | data = genlmsg_put_reply(ans_skb, info, &netlbl_unlabel_gnl_family, |
869 | 0, NLBL_UNLABEL_C_LIST); | 868 | 0, NLBL_UNLABEL_C_LIST); |
870 | if (data == NULL) { | 869 | if (data == NULL) { |
871 | ret_val = -ENOMEM; | 870 | ret_val = -ENOMEM; |
872 | goto list_failure; | 871 | goto list_failure; |
873 | } | 872 | } |
874 | 873 | ||
875 | ret_val = nla_put_u8(ans_skb, | 874 | ret_val = nla_put_u8(ans_skb, |
876 | NLBL_UNLABEL_A_ACPTFLG, | 875 | NLBL_UNLABEL_A_ACPTFLG, |
877 | netlabel_unlabel_acceptflg); | 876 | netlabel_unlabel_acceptflg); |
878 | if (ret_val != 0) | 877 | if (ret_val != 0) |
879 | goto list_failure; | 878 | goto list_failure; |
880 | 879 | ||
881 | genlmsg_end(ans_skb, data); | 880 | genlmsg_end(ans_skb, data); |
882 | return genlmsg_reply(ans_skb, info); | 881 | return genlmsg_reply(ans_skb, info); |
883 | 882 | ||
884 | list_failure: | 883 | list_failure: |
885 | kfree_skb(ans_skb); | 884 | kfree_skb(ans_skb); |
886 | return ret_val; | 885 | return ret_val; |
887 | } | 886 | } |
888 | 887 | ||
889 | /** | 888 | /** |
890 | * netlbl_unlabel_staticadd - Handle a STATICADD message | 889 | * netlbl_unlabel_staticadd - Handle a STATICADD message |
891 | * @skb: the NETLINK buffer | 890 | * @skb: the NETLINK buffer |
892 | * @info: the Generic NETLINK info block | 891 | * @info: the Generic NETLINK info block |
893 | * | 892 | * |
894 | * Description: | 893 | * Description: |
895 | * Process a user generated STATICADD message and add a new unlabeled | 894 | * Process a user generated STATICADD message and add a new unlabeled |
896 | * connection entry to the hash table. Returns zero on success, negative | 895 | * connection entry to the hash table. Returns zero on success, negative |
897 | * values on failure. | 896 | * values on failure. |
898 | * | 897 | * |
899 | */ | 898 | */ |
900 | static int netlbl_unlabel_staticadd(struct sk_buff *skb, | 899 | static int netlbl_unlabel_staticadd(struct sk_buff *skb, |
901 | struct genl_info *info) | 900 | struct genl_info *info) |
902 | { | 901 | { |
903 | int ret_val; | 902 | int ret_val; |
904 | char *dev_name; | 903 | char *dev_name; |
905 | void *addr; | 904 | void *addr; |
906 | void *mask; | 905 | void *mask; |
907 | u32 addr_len; | 906 | u32 addr_len; |
908 | u32 secid; | 907 | u32 secid; |
909 | struct netlbl_audit audit_info; | 908 | struct netlbl_audit audit_info; |
910 | 909 | ||
911 | /* Don't allow users to add both IPv4 and IPv6 addresses for a | 910 | /* Don't allow users to add both IPv4 and IPv6 addresses for a |
912 | * single entry. However, allow users to create two entries, one each | 911 | * single entry. However, allow users to create two entries, one each |
913 | * for IPv4 and IPv6, with the same LSM security context which should | 912 | * for IPv4 and IPv6, with the same LSM security context which should |
914 | * achieve the same result. */ | 913 | * achieve the same result. */ |
915 | if (!info->attrs[NLBL_UNLABEL_A_SECCTX] || | 914 | if (!info->attrs[NLBL_UNLABEL_A_SECCTX] || |
916 | !info->attrs[NLBL_UNLABEL_A_IFACE] || | 915 | !info->attrs[NLBL_UNLABEL_A_IFACE] || |
917 | !((!info->attrs[NLBL_UNLABEL_A_IPV4ADDR] || | 916 | !((!info->attrs[NLBL_UNLABEL_A_IPV4ADDR] || |
918 | !info->attrs[NLBL_UNLABEL_A_IPV4MASK]) ^ | 917 | !info->attrs[NLBL_UNLABEL_A_IPV4MASK]) ^ |
919 | (!info->attrs[NLBL_UNLABEL_A_IPV6ADDR] || | 918 | (!info->attrs[NLBL_UNLABEL_A_IPV6ADDR] || |
920 | !info->attrs[NLBL_UNLABEL_A_IPV6MASK]))) | 919 | !info->attrs[NLBL_UNLABEL_A_IPV6MASK]))) |
921 | return -EINVAL; | 920 | return -EINVAL; |
922 | 921 | ||
923 | netlbl_netlink_auditinfo(skb, &audit_info); | 922 | netlbl_netlink_auditinfo(skb, &audit_info); |
924 | 923 | ||
925 | ret_val = netlbl_unlabel_addrinfo_get(info, &addr, &mask, &addr_len); | 924 | ret_val = netlbl_unlabel_addrinfo_get(info, &addr, &mask, &addr_len); |
926 | if (ret_val != 0) | 925 | if (ret_val != 0) |
927 | return ret_val; | 926 | return ret_val; |
928 | dev_name = nla_data(info->attrs[NLBL_UNLABEL_A_IFACE]); | 927 | dev_name = nla_data(info->attrs[NLBL_UNLABEL_A_IFACE]); |
929 | ret_val = security_secctx_to_secid( | 928 | ret_val = security_secctx_to_secid( |
930 | nla_data(info->attrs[NLBL_UNLABEL_A_SECCTX]), | 929 | nla_data(info->attrs[NLBL_UNLABEL_A_SECCTX]), |
931 | nla_len(info->attrs[NLBL_UNLABEL_A_SECCTX]), | 930 | nla_len(info->attrs[NLBL_UNLABEL_A_SECCTX]), |
932 | &secid); | 931 | &secid); |
933 | if (ret_val != 0) | 932 | if (ret_val != 0) |
934 | return ret_val; | 933 | return ret_val; |
935 | 934 | ||
936 | return netlbl_unlhsh_add(&init_net, | 935 | return netlbl_unlhsh_add(&init_net, |
937 | dev_name, addr, mask, addr_len, secid, | 936 | dev_name, addr, mask, addr_len, secid, |
938 | &audit_info); | 937 | &audit_info); |
939 | } | 938 | } |
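The attribute check at the top of netlbl_unlabel_staticadd() (and its variants in the handlers that follow) packs a lot into one expression: SECCTX and IFACE must be present, and exactly one complete address/mask pair may be supplied. An equivalent restatement with named booleans, offered only as a hypothetical helper rather than how the kernel code is written:

/* true when the request carries exactly one complete addr/mask pair */
static bool demo_one_addr_pair(struct genl_info *info)
{
	bool has_v4 = info->attrs[NLBL_UNLABEL_A_IPV4ADDR] &&
		      info->attrs[NLBL_UNLABEL_A_IPV4MASK];
	bool has_v6 = info->attrs[NLBL_UNLABEL_A_IPV6ADDR] &&
		      info->attrs[NLBL_UNLABEL_A_IPV6MASK];

	/* the original rejects on !((!a4 || !m4) ^ (!a6 || !m6)),
	 * which is the same as requiring has_v4 XOR has_v6 */
	return has_v4 ^ has_v6;
}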
940 | 939 | ||
941 | /** | 940 | /** |
942 | * netlbl_unlabel_staticadddef - Handle a STATICADDDEF message | 941 | * netlbl_unlabel_staticadddef - Handle a STATICADDDEF message |
943 | * @skb: the NETLINK buffer | 942 | * @skb: the NETLINK buffer |
944 | * @info: the Generic NETLINK info block | 943 | * @info: the Generic NETLINK info block |
945 | * | 944 | * |
946 | * Description: | 945 | * Description: |
947 | * Process a user generated STATICADDDEF message and add a new default | 946 | * Process a user generated STATICADDDEF message and add a new default |
948 | * unlabeled connection entry. Returns zero on success, negative values on | 947 | * unlabeled connection entry. Returns zero on success, negative values on |
949 | * failure. | 948 | * failure. |
950 | * | 949 | * |
951 | */ | 950 | */ |
952 | static int netlbl_unlabel_staticadddef(struct sk_buff *skb, | 951 | static int netlbl_unlabel_staticadddef(struct sk_buff *skb, |
953 | struct genl_info *info) | 952 | struct genl_info *info) |
954 | { | 953 | { |
955 | int ret_val; | 954 | int ret_val; |
956 | void *addr; | 955 | void *addr; |
957 | void *mask; | 956 | void *mask; |
958 | u32 addr_len; | 957 | u32 addr_len; |
959 | u32 secid; | 958 | u32 secid; |
960 | struct netlbl_audit audit_info; | 959 | struct netlbl_audit audit_info; |
961 | 960 | ||
962 | /* Don't allow users to add both IPv4 and IPv6 addresses for a | 961 | /* Don't allow users to add both IPv4 and IPv6 addresses for a |
963 | * single entry. However, allow users to create two entries, one each | 962 | * single entry. However, allow users to create two entries, one each |
964 | * for IPv4 and IPv6, with the same LSM security context which should | 963 | * for IPv4 and IPv6, with the same LSM security context which should |
965 | * achieve the same result. */ | 964 | * achieve the same result. */ |
966 | if (!info->attrs[NLBL_UNLABEL_A_SECCTX] || | 965 | if (!info->attrs[NLBL_UNLABEL_A_SECCTX] || |
967 | !((!info->attrs[NLBL_UNLABEL_A_IPV4ADDR] || | 966 | !((!info->attrs[NLBL_UNLABEL_A_IPV4ADDR] || |
968 | !info->attrs[NLBL_UNLABEL_A_IPV4MASK]) ^ | 967 | !info->attrs[NLBL_UNLABEL_A_IPV4MASK]) ^ |
969 | (!info->attrs[NLBL_UNLABEL_A_IPV6ADDR] || | 968 | (!info->attrs[NLBL_UNLABEL_A_IPV6ADDR] || |
970 | !info->attrs[NLBL_UNLABEL_A_IPV6MASK]))) | 969 | !info->attrs[NLBL_UNLABEL_A_IPV6MASK]))) |
971 | return -EINVAL; | 970 | return -EINVAL; |
972 | 971 | ||
973 | netlbl_netlink_auditinfo(skb, &audit_info); | 972 | netlbl_netlink_auditinfo(skb, &audit_info); |
974 | 973 | ||
975 | ret_val = netlbl_unlabel_addrinfo_get(info, &addr, &mask, &addr_len); | 974 | ret_val = netlbl_unlabel_addrinfo_get(info, &addr, &mask, &addr_len); |
976 | if (ret_val != 0) | 975 | if (ret_val != 0) |
977 | return ret_val; | 976 | return ret_val; |
978 | ret_val = security_secctx_to_secid( | 977 | ret_val = security_secctx_to_secid( |
979 | nla_data(info->attrs[NLBL_UNLABEL_A_SECCTX]), | 978 | nla_data(info->attrs[NLBL_UNLABEL_A_SECCTX]), |
980 | nla_len(info->attrs[NLBL_UNLABEL_A_SECCTX]), | 979 | nla_len(info->attrs[NLBL_UNLABEL_A_SECCTX]), |
981 | &secid); | 980 | &secid); |
982 | if (ret_val != 0) | 981 | if (ret_val != 0) |
983 | return ret_val; | 982 | return ret_val; |
984 | 983 | ||
985 | return netlbl_unlhsh_add(&init_net, | 984 | return netlbl_unlhsh_add(&init_net, |
986 | NULL, addr, mask, addr_len, secid, | 985 | NULL, addr, mask, addr_len, secid, |
987 | &audit_info); | 986 | &audit_info); |
988 | } | 987 | } |
989 | 988 | ||
990 | /** | 989 | /** |
991 | * netlbl_unlabel_staticremove - Handle a STATICREMOVE message | 990 | * netlbl_unlabel_staticremove - Handle a STATICREMOVE message |
992 | * @skb: the NETLINK buffer | 991 | * @skb: the NETLINK buffer |
993 | * @info: the Generic NETLINK info block | 992 | * @info: the Generic NETLINK info block |
994 | * | 993 | * |
995 | * Description: | 994 | * Description: |
996 | * Process a user generated STATICREMOVE message and remove the specified | 995 | * Process a user generated STATICREMOVE message and remove the specified |
997 | * unlabeled connection entry. Returns zero on success, negative values on | 996 | * unlabeled connection entry. Returns zero on success, negative values on |
998 | * failure. | 997 | * failure. |
999 | * | 998 | * |
1000 | */ | 999 | */ |
1001 | static int netlbl_unlabel_staticremove(struct sk_buff *skb, | 1000 | static int netlbl_unlabel_staticremove(struct sk_buff *skb, |
1002 | struct genl_info *info) | 1001 | struct genl_info *info) |
1003 | { | 1002 | { |
1004 | int ret_val; | 1003 | int ret_val; |
1005 | char *dev_name; | 1004 | char *dev_name; |
1006 | void *addr; | 1005 | void *addr; |
1007 | void *mask; | 1006 | void *mask; |
1008 | u32 addr_len; | 1007 | u32 addr_len; |
1009 | struct netlbl_audit audit_info; | 1008 | struct netlbl_audit audit_info; |
1010 | 1009 | ||
1011 | /* See the note in netlbl_unlabel_staticadd() about not allowing both | 1010 | /* See the note in netlbl_unlabel_staticadd() about not allowing both |
1012 | * IPv4 and IPv6 in the same entry. */ | 1011 | * IPv4 and IPv6 in the same entry. */ |
1013 | if (!info->attrs[NLBL_UNLABEL_A_IFACE] || | 1012 | if (!info->attrs[NLBL_UNLABEL_A_IFACE] || |
1014 | !((!info->attrs[NLBL_UNLABEL_A_IPV4ADDR] || | 1013 | !((!info->attrs[NLBL_UNLABEL_A_IPV4ADDR] || |
1015 | !info->attrs[NLBL_UNLABEL_A_IPV4MASK]) ^ | 1014 | !info->attrs[NLBL_UNLABEL_A_IPV4MASK]) ^ |
1016 | (!info->attrs[NLBL_UNLABEL_A_IPV6ADDR] || | 1015 | (!info->attrs[NLBL_UNLABEL_A_IPV6ADDR] || |
1017 | !info->attrs[NLBL_UNLABEL_A_IPV6MASK]))) | 1016 | !info->attrs[NLBL_UNLABEL_A_IPV6MASK]))) |
1018 | return -EINVAL; | 1017 | return -EINVAL; |
1019 | 1018 | ||
1020 | netlbl_netlink_auditinfo(skb, &audit_info); | 1019 | netlbl_netlink_auditinfo(skb, &audit_info); |
1021 | 1020 | ||
1022 | ret_val = netlbl_unlabel_addrinfo_get(info, &addr, &mask, &addr_len); | 1021 | ret_val = netlbl_unlabel_addrinfo_get(info, &addr, &mask, &addr_len); |
1023 | if (ret_val != 0) | 1022 | if (ret_val != 0) |
1024 | return ret_val; | 1023 | return ret_val; |
1025 | dev_name = nla_data(info->attrs[NLBL_UNLABEL_A_IFACE]); | 1024 | dev_name = nla_data(info->attrs[NLBL_UNLABEL_A_IFACE]); |
1026 | 1025 | ||
1027 | return netlbl_unlhsh_remove(&init_net, | 1026 | return netlbl_unlhsh_remove(&init_net, |
1028 | dev_name, addr, mask, addr_len, | 1027 | dev_name, addr, mask, addr_len, |
1029 | &audit_info); | 1028 | &audit_info); |
1030 | } | 1029 | } |
1031 | 1030 | ||
1032 | /** | 1031 | /** |
1033 | * netlbl_unlabel_staticremovedef - Handle a STATICREMOVEDEF message | 1032 | * netlbl_unlabel_staticremovedef - Handle a STATICREMOVEDEF message |
1034 | * @skb: the NETLINK buffer | 1033 | * @skb: the NETLINK buffer |
1035 | * @info: the Generic NETLINK info block | 1034 | * @info: the Generic NETLINK info block |
1036 | * | 1035 | * |
1037 | * Description: | 1036 | * Description: |
1038 | * Process a user generated STATICREMOVEDEF message and remove the default | 1037 | * Process a user generated STATICREMOVEDEF message and remove the default |
1039 | * unlabeled connection entry. Returns zero on success, negative values on | 1038 | * unlabeled connection entry. Returns zero on success, negative values on |
1040 | * failure. | 1039 | * failure. |
1041 | * | 1040 | * |
1042 | */ | 1041 | */ |
1043 | static int netlbl_unlabel_staticremovedef(struct sk_buff *skb, | 1042 | static int netlbl_unlabel_staticremovedef(struct sk_buff *skb, |
1044 | struct genl_info *info) | 1043 | struct genl_info *info) |
1045 | { | 1044 | { |
1046 | int ret_val; | 1045 | int ret_val; |
1047 | void *addr; | 1046 | void *addr; |
1048 | void *mask; | 1047 | void *mask; |
1049 | u32 addr_len; | 1048 | u32 addr_len; |
1050 | struct netlbl_audit audit_info; | 1049 | struct netlbl_audit audit_info; |
1051 | 1050 | ||
1052 | /* See the note in netlbl_unlabel_staticadd() about not allowing both | 1051 | /* See the note in netlbl_unlabel_staticadd() about not allowing both |
1053 | * IPv4 and IPv6 in the same entry. */ | 1052 | * IPv4 and IPv6 in the same entry. */ |
1054 | if (!((!info->attrs[NLBL_UNLABEL_A_IPV4ADDR] || | 1053 | if (!((!info->attrs[NLBL_UNLABEL_A_IPV4ADDR] || |
1055 | !info->attrs[NLBL_UNLABEL_A_IPV4MASK]) ^ | 1054 | !info->attrs[NLBL_UNLABEL_A_IPV4MASK]) ^ |
1056 | (!info->attrs[NLBL_UNLABEL_A_IPV6ADDR] || | 1055 | (!info->attrs[NLBL_UNLABEL_A_IPV6ADDR] || |
1057 | !info->attrs[NLBL_UNLABEL_A_IPV6MASK]))) | 1056 | !info->attrs[NLBL_UNLABEL_A_IPV6MASK]))) |
1058 | return -EINVAL; | 1057 | return -EINVAL; |
1059 | 1058 | ||
1060 | netlbl_netlink_auditinfo(skb, &audit_info); | 1059 | netlbl_netlink_auditinfo(skb, &audit_info); |
1061 | 1060 | ||
1062 | ret_val = netlbl_unlabel_addrinfo_get(info, &addr, &mask, &addr_len); | 1061 | ret_val = netlbl_unlabel_addrinfo_get(info, &addr, &mask, &addr_len); |
1063 | if (ret_val != 0) | 1062 | if (ret_val != 0) |
1064 | return ret_val; | 1063 | return ret_val; |
1065 | 1064 | ||
1066 | return netlbl_unlhsh_remove(&init_net, | 1065 | return netlbl_unlhsh_remove(&init_net, |
1067 | NULL, addr, mask, addr_len, | 1066 | NULL, addr, mask, addr_len, |
1068 | &audit_info); | 1067 | &audit_info); |
1069 | } | 1068 | } |
1070 | 1069 | ||
1071 | 1070 | ||
1072 | /** | 1071 | /** |
1073 | * netlbl_unlabel_staticlist_gen - Generate messages for STATICLIST[DEF] | 1072 | * netlbl_unlabel_staticlist_gen - Generate messages for STATICLIST[DEF] |
1074 | * @cmd: command/message | 1073 | * @cmd: command/message |
1075 | * @iface: the interface entry | 1074 | * @iface: the interface entry |
1076 | * @addr4: the IPv4 address entry | 1075 | * @addr4: the IPv4 address entry |
1077 | * @addr6: the IPv6 address entry | 1076 | * @addr6: the IPv6 address entry |
1078 | * @arg: the netlbl_unlhsh_walk_arg structure | 1077 | * @arg: the netlbl_unlhsh_walk_arg structure |
1079 | * | 1078 | * |
1080 | * Description: | 1079 | * Description: |
1081 | * This function is designed to be used to generate a response for a | 1080 | * This function is designed to be used to generate a response for a |
1082 | * STATICLIST or STATICLISTDEF message. When called, either @addr4 or @addr6 | 1081 | * STATICLIST or STATICLISTDEF message. When called, either @addr4 or @addr6 |
1083 | * can be specified, but not both; the unspecified entry should be set to | 1082 | * can be specified, but not both; the unspecified entry should be set to |
1084 | * NULL by the caller. Returns the size of the message on success, negative | 1083 | * NULL by the caller. Returns the size of the message on success, negative |
1085 | * values on failure. | 1084 | * values on failure. |
1086 | * | 1085 | * |
1087 | */ | 1086 | */ |
1088 | static int netlbl_unlabel_staticlist_gen(u32 cmd, | 1087 | static int netlbl_unlabel_staticlist_gen(u32 cmd, |
1089 | const struct netlbl_unlhsh_iface *iface, | 1088 | const struct netlbl_unlhsh_iface *iface, |
1090 | const struct netlbl_unlhsh_addr4 *addr4, | 1089 | const struct netlbl_unlhsh_addr4 *addr4, |
1091 | const struct netlbl_unlhsh_addr6 *addr6, | 1090 | const struct netlbl_unlhsh_addr6 *addr6, |
1092 | void *arg) | 1091 | void *arg) |
1093 | { | 1092 | { |
1094 | int ret_val = -ENOMEM; | 1093 | int ret_val = -ENOMEM; |
1095 | struct netlbl_unlhsh_walk_arg *cb_arg = arg; | 1094 | struct netlbl_unlhsh_walk_arg *cb_arg = arg; |
1096 | struct net_device *dev; | 1095 | struct net_device *dev; |
1097 | void *data; | 1096 | void *data; |
1098 | u32 secid; | 1097 | u32 secid; |
1099 | char *secctx; | 1098 | char *secctx; |
1100 | u32 secctx_len; | 1099 | u32 secctx_len; |
1101 | 1100 | ||
1102 | data = genlmsg_put(cb_arg->skb, NETLINK_CB(cb_arg->nl_cb->skb).pid, | 1101 | data = genlmsg_put(cb_arg->skb, NETLINK_CB(cb_arg->nl_cb->skb).pid, |
1103 | cb_arg->seq, &netlbl_unlabel_gnl_family, | 1102 | cb_arg->seq, &netlbl_unlabel_gnl_family, |
1104 | NLM_F_MULTI, cmd); | 1103 | NLM_F_MULTI, cmd); |
1105 | if (data == NULL) | 1104 | if (data == NULL) |
1106 | goto list_cb_failure; | 1105 | goto list_cb_failure; |
1107 | 1106 | ||
1108 | if (iface->ifindex > 0) { | 1107 | if (iface->ifindex > 0) { |
1109 | dev = dev_get_by_index(&init_net, iface->ifindex); | 1108 | dev = dev_get_by_index(&init_net, iface->ifindex); |
1110 | if (!dev) { | 1109 | if (!dev) { |
1111 | ret_val = -ENODEV; | 1110 | ret_val = -ENODEV; |
1112 | goto list_cb_failure; | 1111 | goto list_cb_failure; |
1113 | } | 1112 | } |
1114 | ret_val = nla_put_string(cb_arg->skb, | 1113 | ret_val = nla_put_string(cb_arg->skb, |
1115 | NLBL_UNLABEL_A_IFACE, dev->name); | 1114 | NLBL_UNLABEL_A_IFACE, dev->name); |
1116 | dev_put(dev); | 1115 | dev_put(dev); |
1117 | if (ret_val != 0) | 1116 | if (ret_val != 0) |
1118 | goto list_cb_failure; | 1117 | goto list_cb_failure; |
1119 | } | 1118 | } |
1120 | 1119 | ||
1121 | if (addr4) { | 1120 | if (addr4) { |
1122 | struct in_addr addr_struct; | 1121 | struct in_addr addr_struct; |
1123 | 1122 | ||
1124 | addr_struct.s_addr = addr4->list.addr; | 1123 | addr_struct.s_addr = addr4->list.addr; |
1125 | ret_val = nla_put(cb_arg->skb, | 1124 | ret_val = nla_put(cb_arg->skb, |
1126 | NLBL_UNLABEL_A_IPV4ADDR, | 1125 | NLBL_UNLABEL_A_IPV4ADDR, |
1127 | sizeof(struct in_addr), | 1126 | sizeof(struct in_addr), |
1128 | &addr_struct); | 1127 | &addr_struct); |
1129 | if (ret_val != 0) | 1128 | if (ret_val != 0) |
1130 | goto list_cb_failure; | 1129 | goto list_cb_failure; |
1131 | 1130 | ||
1132 | addr_struct.s_addr = addr4->list.mask; | 1131 | addr_struct.s_addr = addr4->list.mask; |
1133 | ret_val = nla_put(cb_arg->skb, | 1132 | ret_val = nla_put(cb_arg->skb, |
1134 | NLBL_UNLABEL_A_IPV4MASK, | 1133 | NLBL_UNLABEL_A_IPV4MASK, |
1135 | sizeof(struct in_addr), | 1134 | sizeof(struct in_addr), |
1136 | &addr_struct); | 1135 | &addr_struct); |
1137 | if (ret_val != 0) | 1136 | if (ret_val != 0) |
1138 | goto list_cb_failure; | 1137 | goto list_cb_failure; |
1139 | 1138 | ||
1140 | secid = addr4->secid; | 1139 | secid = addr4->secid; |
1141 | } else { | 1140 | } else { |
1142 | ret_val = nla_put(cb_arg->skb, | 1141 | ret_val = nla_put(cb_arg->skb, |
1143 | NLBL_UNLABEL_A_IPV6ADDR, | 1142 | NLBL_UNLABEL_A_IPV6ADDR, |
1144 | sizeof(struct in6_addr), | 1143 | sizeof(struct in6_addr), |
1145 | &addr6->list.addr); | 1144 | &addr6->list.addr); |
1146 | if (ret_val != 0) | 1145 | if (ret_val != 0) |
1147 | goto list_cb_failure; | 1146 | goto list_cb_failure; |
1148 | 1147 | ||
1149 | ret_val = nla_put(cb_arg->skb, | 1148 | ret_val = nla_put(cb_arg->skb, |
1150 | NLBL_UNLABEL_A_IPV6MASK, | 1149 | NLBL_UNLABEL_A_IPV6MASK, |
1151 | sizeof(struct in6_addr), | 1150 | sizeof(struct in6_addr), |
1152 | &addr6->list.mask); | 1151 | &addr6->list.mask); |
1153 | if (ret_val != 0) | 1152 | if (ret_val != 0) |
1154 | goto list_cb_failure; | 1153 | goto list_cb_failure; |
1155 | 1154 | ||
1156 | secid = addr6->secid; | 1155 | secid = addr6->secid; |
1157 | } | 1156 | } |
1158 | 1157 | ||
1159 | ret_val = security_secid_to_secctx(secid, &secctx, &secctx_len); | 1158 | ret_val = security_secid_to_secctx(secid, &secctx, &secctx_len); |
1160 | if (ret_val != 0) | 1159 | if (ret_val != 0) |
1161 | goto list_cb_failure; | 1160 | goto list_cb_failure; |
1162 | ret_val = nla_put(cb_arg->skb, | 1161 | ret_val = nla_put(cb_arg->skb, |
1163 | NLBL_UNLABEL_A_SECCTX, | 1162 | NLBL_UNLABEL_A_SECCTX, |
1164 | secctx_len, | 1163 | secctx_len, |
1165 | secctx); | 1164 | secctx); |
1166 | security_release_secctx(secctx, secctx_len); | 1165 | security_release_secctx(secctx, secctx_len); |
1167 | if (ret_val != 0) | 1166 | if (ret_val != 0) |
1168 | goto list_cb_failure; | 1167 | goto list_cb_failure; |
1169 | 1168 | ||
1170 | cb_arg->seq++; | 1169 | cb_arg->seq++; |
1171 | return genlmsg_end(cb_arg->skb, data); | 1170 | return genlmsg_end(cb_arg->skb, data); |
1172 | 1171 | ||
1173 | list_cb_failure: | 1172 | list_cb_failure: |
1174 | genlmsg_cancel(cb_arg->skb, data); | 1173 | genlmsg_cancel(cb_arg->skb, data); |
1175 | return ret_val; | 1174 | return ret_val; |
1176 | } | 1175 | } |
1177 | 1176 | ||
1178 | /** | 1177 | /** |
1179 | * netlbl_unlabel_staticlist - Handle a STATICLIST message | 1178 | * netlbl_unlabel_staticlist - Handle a STATICLIST message |
1180 | * @skb: the NETLINK buffer | 1179 | * @skb: the NETLINK buffer |
1181 | * @cb: the NETLINK callback | 1180 | * @cb: the NETLINK callback |
1182 | * | 1181 | * |
1183 | * Description: | 1182 | * Description: |
1184 | * Process a user generated STATICLIST message and dump the unlabeled | 1183 | * Process a user generated STATICLIST message and dump the unlabeled |
1185 | * connection hash table in a form suitable for use in a kernel generated | 1184 | * connection hash table in a form suitable for use in a kernel generated |
1186 | * STATICLIST message. Returns the length of @skb. | 1185 | * STATICLIST message. Returns the length of @skb. |
1187 | * | 1186 | * |
1188 | */ | 1187 | */ |
1189 | static int netlbl_unlabel_staticlist(struct sk_buff *skb, | 1188 | static int netlbl_unlabel_staticlist(struct sk_buff *skb, |
1190 | struct netlink_callback *cb) | 1189 | struct netlink_callback *cb) |
1191 | { | 1190 | { |
1192 | struct netlbl_unlhsh_walk_arg cb_arg; | 1191 | struct netlbl_unlhsh_walk_arg cb_arg; |
1193 | u32 skip_bkt = cb->args[0]; | 1192 | u32 skip_bkt = cb->args[0]; |
1194 | u32 skip_chain = cb->args[1]; | 1193 | u32 skip_chain = cb->args[1]; |
1195 | u32 skip_addr4 = cb->args[2]; | 1194 | u32 skip_addr4 = cb->args[2]; |
1196 | u32 skip_addr6 = cb->args[3]; | 1195 | u32 skip_addr6 = cb->args[3]; |
1197 | u32 iter_bkt; | 1196 | u32 iter_bkt; |
1198 | u32 iter_chain = 0, iter_addr4 = 0, iter_addr6 = 0; | 1197 | u32 iter_chain = 0, iter_addr4 = 0, iter_addr6 = 0; |
1199 | struct netlbl_unlhsh_iface *iface; | 1198 | struct netlbl_unlhsh_iface *iface; |
1200 | struct list_head *iter_list; | 1199 | struct list_head *iter_list; |
1201 | struct netlbl_af4list *addr4; | 1200 | struct netlbl_af4list *addr4; |
1202 | #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) | 1201 | #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) |
1203 | struct netlbl_af6list *addr6; | 1202 | struct netlbl_af6list *addr6; |
1204 | #endif | 1203 | #endif |
1205 | 1204 | ||
1206 | cb_arg.nl_cb = cb; | 1205 | cb_arg.nl_cb = cb; |
1207 | cb_arg.skb = skb; | 1206 | cb_arg.skb = skb; |
1208 | cb_arg.seq = cb->nlh->nlmsg_seq; | 1207 | cb_arg.seq = cb->nlh->nlmsg_seq; |
1209 | 1208 | ||
1210 | rcu_read_lock(); | 1209 | rcu_read_lock(); |
1211 | for (iter_bkt = skip_bkt; | 1210 | for (iter_bkt = skip_bkt; |
1212 | iter_bkt < rcu_dereference(netlbl_unlhsh)->size; | 1211 | iter_bkt < rcu_dereference(netlbl_unlhsh)->size; |
1213 | iter_bkt++, iter_chain = 0, iter_addr4 = 0, iter_addr6 = 0) { | 1212 | iter_bkt++, iter_chain = 0, iter_addr4 = 0, iter_addr6 = 0) { |
1214 | iter_list = &rcu_dereference(netlbl_unlhsh)->tbl[iter_bkt]; | 1213 | iter_list = &rcu_dereference(netlbl_unlhsh)->tbl[iter_bkt]; |
1215 | list_for_each_entry_rcu(iface, iter_list, list) { | 1214 | list_for_each_entry_rcu(iface, iter_list, list) { |
1216 | if (!iface->valid || | 1215 | if (!iface->valid || |
1217 | iter_chain++ < skip_chain) | 1216 | iter_chain++ < skip_chain) |
1218 | continue; | 1217 | continue; |
1219 | netlbl_af4list_foreach_rcu(addr4, | 1218 | netlbl_af4list_foreach_rcu(addr4, |
1220 | &iface->addr4_list) { | 1219 | &iface->addr4_list) { |
1221 | if (iter_addr4++ < skip_addr4) | 1220 | if (iter_addr4++ < skip_addr4) |
1222 | continue; | 1221 | continue; |
1223 | if (netlbl_unlabel_staticlist_gen( | 1222 | if (netlbl_unlabel_staticlist_gen( |
1224 | NLBL_UNLABEL_C_STATICLIST, | 1223 | NLBL_UNLABEL_C_STATICLIST, |
1225 | iface, | 1224 | iface, |
1226 | netlbl_unlhsh_addr4_entry(addr4), | 1225 | netlbl_unlhsh_addr4_entry(addr4), |
1227 | NULL, | 1226 | NULL, |
1228 | &cb_arg) < 0) { | 1227 | &cb_arg) < 0) { |
1229 | iter_addr4--; | 1228 | iter_addr4--; |
1230 | iter_chain--; | 1229 | iter_chain--; |
1231 | goto unlabel_staticlist_return; | 1230 | goto unlabel_staticlist_return; |
1232 | } | 1231 | } |
1233 | } | 1232 | } |
1234 | #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) | 1233 | #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) |
1235 | netlbl_af6list_foreach_rcu(addr6, | 1234 | netlbl_af6list_foreach_rcu(addr6, |
1236 | &iface->addr6_list) { | 1235 | &iface->addr6_list) { |
1237 | if (iter_addr6++ < skip_addr6) | 1236 | if (iter_addr6++ < skip_addr6) |
1238 | continue; | 1237 | continue; |
1239 | if (netlbl_unlabel_staticlist_gen( | 1238 | if (netlbl_unlabel_staticlist_gen( |
1240 | NLBL_UNLABEL_C_STATICLIST, | 1239 | NLBL_UNLABEL_C_STATICLIST, |
1241 | iface, | 1240 | iface, |
1242 | NULL, | 1241 | NULL, |
1243 | netlbl_unlhsh_addr6_entry(addr6), | 1242 | netlbl_unlhsh_addr6_entry(addr6), |
1244 | &cb_arg) < 0) { | 1243 | &cb_arg) < 0) { |
1245 | iter_addr6--; | 1244 | iter_addr6--; |
1246 | iter_chain--; | 1245 | iter_chain--; |
1247 | goto unlabel_staticlist_return; | 1246 | goto unlabel_staticlist_return; |
1248 | } | 1247 | } |
1249 | } | 1248 | } |
1250 | #endif /* IPv6 */ | 1249 | #endif /* IPv6 */ |
1251 | } | 1250 | } |
1252 | } | 1251 | } |
1253 | 1252 | ||
1254 | unlabel_staticlist_return: | 1253 | unlabel_staticlist_return: |
1255 | rcu_read_unlock(); | 1254 | rcu_read_unlock(); |
1256 | cb->args[0] = skip_bkt; | 1255 | cb->args[0] = skip_bkt; |
1257 | cb->args[1] = skip_chain; | 1256 | cb->args[1] = skip_chain; |
1258 | cb->args[2] = skip_addr4; | 1257 | cb->args[2] = skip_addr4; |
1259 | cb->args[3] = skip_addr6; | 1258 | cb->args[3] = skip_addr6; |
1260 | return skb->len; | 1259 | return skb->len; |
1261 | } | 1260 | } |
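The dump handlers use cb->args[] as a resume cursor: when the reply skb fills up, the walk stops and the next invocation skips over what was already emitted. A minimal sketch of just that cursor idea, with the netlink message building stripped out (demo_* names and the emit() callback are hypothetical):

#include <linux/list.h>
#include <linux/netlink.h>
#include <linux/rcupdate.h>
#include <linux/types.h>

struct demo_entry {
	struct list_head list;
};

static LIST_HEAD(demo_list);

/* dumpit-style walker: emit() returns false when the skb is full */
static int demo_dump(struct netlink_callback *cb,
		     bool (*emit)(const struct demo_entry *entry))
{
	u32 skip = cb->args[0];		/* entries sent by earlier passes */
	u32 iter = 0;
	int sent = 0;
	struct demo_entry *entry;

	rcu_read_lock();
	list_for_each_entry_rcu(entry, &demo_list, list) {
		if (iter++ < skip)
			continue;	/* already emitted last time */
		if (!emit(entry)) {
			iter--;		/* not sent; retry it next pass */
			break;
		}
		sent++;
	}
	rcu_read_unlock();

	cb->args[0] = iter;		/* resume point for the next call */
	return sent;
}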
1262 | 1261 | ||
1263 | /** | 1262 | /** |
1264 | * netlbl_unlabel_staticlistdef - Handle a STATICLISTDEF message | 1263 | * netlbl_unlabel_staticlistdef - Handle a STATICLISTDEF message |
1265 | * @skb: the NETLINK buffer | 1264 | * @skb: the NETLINK buffer |
1266 | * @cb: the NETLINK callback | 1265 | * @cb: the NETLINK callback |
1267 | * | 1266 | * |
1268 | * Description: | 1267 | * Description: |
1269 | * Process a user generated STATICLISTDEF message and dump the default | 1268 | * Process a user generated STATICLISTDEF message and dump the default |
1270 | * unlabeled connection entry in a form suitable for use in a kernel generated | 1269 | * unlabeled connection entry in a form suitable for use in a kernel generated |
1271 | * STATICLISTDEF message. Returns the length of @skb. | 1270 | * STATICLISTDEF message. Returns the length of @skb. |
1272 | * | 1271 | * |
1273 | */ | 1272 | */ |
1274 | static int netlbl_unlabel_staticlistdef(struct sk_buff *skb, | 1273 | static int netlbl_unlabel_staticlistdef(struct sk_buff *skb, |
1275 | struct netlink_callback *cb) | 1274 | struct netlink_callback *cb) |
1276 | { | 1275 | { |
1277 | struct netlbl_unlhsh_walk_arg cb_arg; | 1276 | struct netlbl_unlhsh_walk_arg cb_arg; |
1278 | struct netlbl_unlhsh_iface *iface; | 1277 | struct netlbl_unlhsh_iface *iface; |
1279 | u32 skip_addr4 = cb->args[0]; | 1278 | u32 skip_addr4 = cb->args[0]; |
1280 | u32 skip_addr6 = cb->args[1]; | 1279 | u32 skip_addr6 = cb->args[1]; |
1281 | u32 iter_addr4 = 0; | 1280 | u32 iter_addr4 = 0; |
1282 | struct netlbl_af4list *addr4; | 1281 | struct netlbl_af4list *addr4; |
1283 | #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) | 1282 | #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) |
1284 | u32 iter_addr6 = 0; | 1283 | u32 iter_addr6 = 0; |
1285 | struct netlbl_af6list *addr6; | 1284 | struct netlbl_af6list *addr6; |
1286 | #endif | 1285 | #endif |
1287 | 1286 | ||
1288 | cb_arg.nl_cb = cb; | 1287 | cb_arg.nl_cb = cb; |
1289 | cb_arg.skb = skb; | 1288 | cb_arg.skb = skb; |
1290 | cb_arg.seq = cb->nlh->nlmsg_seq; | 1289 | cb_arg.seq = cb->nlh->nlmsg_seq; |
1291 | 1290 | ||
1292 | rcu_read_lock(); | 1291 | rcu_read_lock(); |
1293 | iface = rcu_dereference(netlbl_unlhsh_def); | 1292 | iface = rcu_dereference(netlbl_unlhsh_def); |
1294 | if (iface == NULL || !iface->valid) | 1293 | if (iface == NULL || !iface->valid) |
1295 | goto unlabel_staticlistdef_return; | 1294 | goto unlabel_staticlistdef_return; |
1296 | 1295 | ||
1297 | netlbl_af4list_foreach_rcu(addr4, &iface->addr4_list) { | 1296 | netlbl_af4list_foreach_rcu(addr4, &iface->addr4_list) { |
1298 | if (iter_addr4++ < skip_addr4) | 1297 | if (iter_addr4++ < skip_addr4) |
1299 | continue; | 1298 | continue; |
1300 | if (netlbl_unlabel_staticlist_gen(NLBL_UNLABEL_C_STATICLISTDEF, | 1299 | if (netlbl_unlabel_staticlist_gen(NLBL_UNLABEL_C_STATICLISTDEF, |
1301 | iface, | 1300 | iface, |
1302 | netlbl_unlhsh_addr4_entry(addr4), | 1301 | netlbl_unlhsh_addr4_entry(addr4), |
1303 | NULL, | 1302 | NULL, |
1304 | &cb_arg) < 0) { | 1303 | &cb_arg) < 0) { |
1305 | iter_addr4--; | 1304 | iter_addr4--; |
1306 | goto unlabel_staticlistdef_return; | 1305 | goto unlabel_staticlistdef_return; |
1307 | } | 1306 | } |
1308 | } | 1307 | } |
1309 | #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) | 1308 | #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) |
1310 | netlbl_af6list_foreach_rcu(addr6, &iface->addr6_list) { | 1309 | netlbl_af6list_foreach_rcu(addr6, &iface->addr6_list) { |
1311 | if (iter_addr6++ < skip_addr6) | 1310 | if (iter_addr6++ < skip_addr6) |
1312 | continue; | 1311 | continue; |
1313 | if (netlbl_unlabel_staticlist_gen(NLBL_UNLABEL_C_STATICLISTDEF, | 1312 | if (netlbl_unlabel_staticlist_gen(NLBL_UNLABEL_C_STATICLISTDEF, |
1314 | iface, | 1313 | iface, |
1315 | NULL, | 1314 | NULL, |
1316 | netlbl_unlhsh_addr6_entry(addr6), | 1315 | netlbl_unlhsh_addr6_entry(addr6), |
1317 | &cb_arg) < 0) { | 1316 | &cb_arg) < 0) { |
1318 | iter_addr6--; | 1317 | iter_addr6--; |
1319 | goto unlabel_staticlistdef_return; | 1318 | goto unlabel_staticlistdef_return; |
1320 | } | 1319 | } |
1321 | } | 1320 | } |
1322 | #endif /* IPv6 */ | 1321 | #endif /* IPv6 */ |
1323 | 1322 | ||
1324 | unlabel_staticlistdef_return: | 1323 | unlabel_staticlistdef_return: |
1325 | rcu_read_unlock(); | 1324 | rcu_read_unlock(); |
1326 | cb->args[0] = skip_addr4; | 1325 | cb->args[0] = skip_addr4; |
1327 | cb->args[1] = skip_addr6; | 1326 | cb->args[1] = skip_addr6; |
1328 | return skb->len; | 1327 | return skb->len; |
1329 | } | 1328 | } |
1330 | 1329 | ||
1331 | /* | 1330 | /* |
1332 | * NetLabel Generic NETLINK Command Definitions | 1331 | * NetLabel Generic NETLINK Command Definitions |
1333 | */ | 1332 | */ |
1334 | 1333 | ||
1335 | static struct genl_ops netlbl_unlabel_genl_ops[] = { | 1334 | static struct genl_ops netlbl_unlabel_genl_ops[] = { |
1336 | { | 1335 | { |
1337 | .cmd = NLBL_UNLABEL_C_STATICADD, | 1336 | .cmd = NLBL_UNLABEL_C_STATICADD, |
1338 | .flags = GENL_ADMIN_PERM, | 1337 | .flags = GENL_ADMIN_PERM, |
1339 | .policy = netlbl_unlabel_genl_policy, | 1338 | .policy = netlbl_unlabel_genl_policy, |
1340 | .doit = netlbl_unlabel_staticadd, | 1339 | .doit = netlbl_unlabel_staticadd, |
1341 | .dumpit = NULL, | 1340 | .dumpit = NULL, |
1342 | }, | 1341 | }, |
1343 | { | 1342 | { |
1344 | .cmd = NLBL_UNLABEL_C_STATICREMOVE, | 1343 | .cmd = NLBL_UNLABEL_C_STATICREMOVE, |
1345 | .flags = GENL_ADMIN_PERM, | 1344 | .flags = GENL_ADMIN_PERM, |
1346 | .policy = netlbl_unlabel_genl_policy, | 1345 | .policy = netlbl_unlabel_genl_policy, |
1347 | .doit = netlbl_unlabel_staticremove, | 1346 | .doit = netlbl_unlabel_staticremove, |
1348 | .dumpit = NULL, | 1347 | .dumpit = NULL, |
1349 | }, | 1348 | }, |
1350 | { | 1349 | { |
1351 | .cmd = NLBL_UNLABEL_C_STATICLIST, | 1350 | .cmd = NLBL_UNLABEL_C_STATICLIST, |
1352 | .flags = 0, | 1351 | .flags = 0, |
1353 | .policy = netlbl_unlabel_genl_policy, | 1352 | .policy = netlbl_unlabel_genl_policy, |
1354 | .doit = NULL, | 1353 | .doit = NULL, |
1355 | .dumpit = netlbl_unlabel_staticlist, | 1354 | .dumpit = netlbl_unlabel_staticlist, |
1356 | }, | 1355 | }, |
1357 | { | 1356 | { |
1358 | .cmd = NLBL_UNLABEL_C_STATICADDDEF, | 1357 | .cmd = NLBL_UNLABEL_C_STATICADDDEF, |
1359 | .flags = GENL_ADMIN_PERM, | 1358 | .flags = GENL_ADMIN_PERM, |
1360 | .policy = netlbl_unlabel_genl_policy, | 1359 | .policy = netlbl_unlabel_genl_policy, |
1361 | .doit = netlbl_unlabel_staticadddef, | 1360 | .doit = netlbl_unlabel_staticadddef, |
1362 | .dumpit = NULL, | 1361 | .dumpit = NULL, |
1363 | }, | 1362 | }, |
1364 | { | 1363 | { |
1365 | .cmd = NLBL_UNLABEL_C_STATICREMOVEDEF, | 1364 | .cmd = NLBL_UNLABEL_C_STATICREMOVEDEF, |
1366 | .flags = GENL_ADMIN_PERM, | 1365 | .flags = GENL_ADMIN_PERM, |
1367 | .policy = netlbl_unlabel_genl_policy, | 1366 | .policy = netlbl_unlabel_genl_policy, |
1368 | .doit = netlbl_unlabel_staticremovedef, | 1367 | .doit = netlbl_unlabel_staticremovedef, |
1369 | .dumpit = NULL, | 1368 | .dumpit = NULL, |
1370 | }, | 1369 | }, |
1371 | { | 1370 | { |
1372 | .cmd = NLBL_UNLABEL_C_STATICLISTDEF, | 1371 | .cmd = NLBL_UNLABEL_C_STATICLISTDEF, |
1373 | .flags = 0, | 1372 | .flags = 0, |
1374 | .policy = netlbl_unlabel_genl_policy, | 1373 | .policy = netlbl_unlabel_genl_policy, |
1375 | .doit = NULL, | 1374 | .doit = NULL, |
1376 | .dumpit = netlbl_unlabel_staticlistdef, | 1375 | .dumpit = netlbl_unlabel_staticlistdef, |
1377 | }, | 1376 | }, |
1378 | { | 1377 | { |
1379 | .cmd = NLBL_UNLABEL_C_ACCEPT, | 1378 | .cmd = NLBL_UNLABEL_C_ACCEPT, |
1380 | .flags = GENL_ADMIN_PERM, | 1379 | .flags = GENL_ADMIN_PERM, |
1381 | .policy = netlbl_unlabel_genl_policy, | 1380 | .policy = netlbl_unlabel_genl_policy, |
1382 | .doit = netlbl_unlabel_accept, | 1381 | .doit = netlbl_unlabel_accept, |
1383 | .dumpit = NULL, | 1382 | .dumpit = NULL, |
1384 | }, | 1383 | }, |
1385 | { | 1384 | { |
1386 | .cmd = NLBL_UNLABEL_C_LIST, | 1385 | .cmd = NLBL_UNLABEL_C_LIST, |
1387 | .flags = 0, | 1386 | .flags = 0, |
1388 | .policy = netlbl_unlabel_genl_policy, | 1387 | .policy = netlbl_unlabel_genl_policy, |
1389 | .doit = netlbl_unlabel_list, | 1388 | .doit = netlbl_unlabel_list, |
1390 | .dumpit = NULL, | 1389 | .dumpit = NULL, |
1391 | }, | 1390 | }, |
1392 | }; | 1391 | }; |
1393 | 1392 | ||
1394 | /* | 1393 | /* |
1395 | * NetLabel Generic NETLINK Protocol Functions | 1394 | * NetLabel Generic NETLINK Protocol Functions |
1396 | */ | 1395 | */ |
1397 | 1396 | ||
1398 | /** | 1397 | /** |
1399 | * netlbl_unlabel_genl_init - Register the Unlabeled NetLabel component | 1398 | * netlbl_unlabel_genl_init - Register the Unlabeled NetLabel component |
1400 | * | 1399 | * |
1401 | * Description: | 1400 | * Description: |
1402 | * Register the unlabeled packet NetLabel component with the Generic NETLINK | 1401 | * Register the unlabeled packet NetLabel component with the Generic NETLINK |
1403 | * mechanism. Returns zero on success, negative values on failure. | 1402 | * mechanism. Returns zero on success, negative values on failure. |
1404 | * | 1403 | * |
1405 | */ | 1404 | */ |
1406 | int __init netlbl_unlabel_genl_init(void) | 1405 | int __init netlbl_unlabel_genl_init(void) |
1407 | { | 1406 | { |
1408 | return genl_register_family_with_ops(&netlbl_unlabel_gnl_family, | 1407 | return genl_register_family_with_ops(&netlbl_unlabel_gnl_family, |
1409 | netlbl_unlabel_genl_ops, ARRAY_SIZE(netlbl_unlabel_genl_ops)); | 1408 | netlbl_unlabel_genl_ops, ARRAY_SIZE(netlbl_unlabel_genl_ops)); |
1410 | } | 1409 | } |
1411 | 1410 | ||
1412 | /* | 1411 | /* |
1413 | * NetLabel KAPI Hooks | 1412 | * NetLabel KAPI Hooks |
1414 | */ | 1413 | */ |
1415 | 1414 | ||
1416 | static struct notifier_block netlbl_unlhsh_netdev_notifier = { | 1415 | static struct notifier_block netlbl_unlhsh_netdev_notifier = { |
1417 | .notifier_call = netlbl_unlhsh_netdev_handler, | 1416 | .notifier_call = netlbl_unlhsh_netdev_handler, |
1418 | }; | 1417 | }; |
1419 | 1418 | ||
1420 | /** | 1419 | /** |
1421 | * netlbl_unlabel_init - Initialize the unlabeled connection hash table | 1420 | * netlbl_unlabel_init - Initialize the unlabeled connection hash table |
1422 | * @size: the number of bits to use for the hash buckets | 1421 | * @size: the number of bits to use for the hash buckets |
1423 | * | 1422 | * |
1424 | * Description: | 1423 | * Description: |
1425 | * Initializes the unlabeled connection hash table and registers a network | 1424 | * Initializes the unlabeled connection hash table and registers a network |
1426 | * device notification handler. This function should only be called by the | 1425 | * device notification handler. This function should only be called by the |
1427 | * NetLabel subsystem itself during initialization. Returns zero on success, | 1426 | * NetLabel subsystem itself during initialization. Returns zero on success, |
1428 | * non-zero values on error. | 1427 | * non-zero values on error. |
1429 | * | 1428 | * |
1430 | */ | 1429 | */ |
1431 | int __init netlbl_unlabel_init(u32 size) | 1430 | int __init netlbl_unlabel_init(u32 size) |
1432 | { | 1431 | { |
1433 | u32 iter; | 1432 | u32 iter; |
1434 | struct netlbl_unlhsh_tbl *hsh_tbl; | 1433 | struct netlbl_unlhsh_tbl *hsh_tbl; |
1435 | 1434 | ||
1436 | if (size == 0) | 1435 | if (size == 0) |
1437 | return -EINVAL; | 1436 | return -EINVAL; |
1438 | 1437 | ||
1439 | hsh_tbl = kmalloc(sizeof(*hsh_tbl), GFP_KERNEL); | 1438 | hsh_tbl = kmalloc(sizeof(*hsh_tbl), GFP_KERNEL); |
1440 | if (hsh_tbl == NULL) | 1439 | if (hsh_tbl == NULL) |
1441 | return -ENOMEM; | 1440 | return -ENOMEM; |
1442 | hsh_tbl->size = 1 << size; | 1441 | hsh_tbl->size = 1 << size; |
1443 | hsh_tbl->tbl = kcalloc(hsh_tbl->size, | 1442 | hsh_tbl->tbl = kcalloc(hsh_tbl->size, |
1444 | sizeof(struct list_head), | 1443 | sizeof(struct list_head), |
1445 | GFP_KERNEL); | 1444 | GFP_KERNEL); |
1446 | if (hsh_tbl->tbl == NULL) { | 1445 | if (hsh_tbl->tbl == NULL) { |
1447 | kfree(hsh_tbl); | 1446 | kfree(hsh_tbl); |
1448 | return -ENOMEM; | 1447 | return -ENOMEM; |
1449 | } | 1448 | } |
1450 | for (iter = 0; iter < hsh_tbl->size; iter++) | 1449 | for (iter = 0; iter < hsh_tbl->size; iter++) |
1451 | INIT_LIST_HEAD(&hsh_tbl->tbl[iter]); | 1450 | INIT_LIST_HEAD(&hsh_tbl->tbl[iter]); |
1452 | 1451 | ||
1453 | rcu_read_lock(); | 1452 | rcu_read_lock(); |
1454 | spin_lock(&netlbl_unlhsh_lock); | 1453 | spin_lock(&netlbl_unlhsh_lock); |
1455 | rcu_assign_pointer(netlbl_unlhsh, hsh_tbl); | 1454 | rcu_assign_pointer(netlbl_unlhsh, hsh_tbl); |
1456 | spin_unlock(&netlbl_unlhsh_lock); | 1455 | spin_unlock(&netlbl_unlhsh_lock); |
1457 | rcu_read_unlock(); | 1456 | rcu_read_unlock(); |
1458 | 1457 | ||
1459 | register_netdevice_notifier(&netlbl_unlhsh_netdev_notifier); | 1458 | register_netdevice_notifier(&netlbl_unlhsh_netdev_notifier); |
1460 | 1459 | ||
1461 | return 0; | 1460 | return 0; |
1462 | } | 1461 | } |
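netlbl_unlabel_init() finishes initializing the bucket array before publishing the table, and rcu_assign_pointer() supplies the memory ordering that guarantees a reader who sees the new pointer also sees the initialized buckets. A compact sketch of that initialize-then-publish order (hypothetical demo_* names):

#include <linux/errno.h>
#include <linux/list.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/types.h>

struct demo_tbl {
	struct list_head *tbl;
	u32 size;
};

static struct demo_tbl __rcu *demo_hsh;
static DEFINE_SPINLOCK(demo_hsh_lock);

static int demo_init(u32 bits)
{
	struct demo_tbl *tbl;
	u32 i;

	tbl = kmalloc(sizeof(*tbl), GFP_KERNEL);
	if (tbl == NULL)
		return -ENOMEM;
	tbl->size = 1U << bits;
	tbl->tbl = kcalloc(tbl->size, sizeof(struct list_head), GFP_KERNEL);
	if (tbl->tbl == NULL) {
		kfree(tbl);
		return -ENOMEM;
	}
	for (i = 0; i < tbl->size; i++)
		INIT_LIST_HEAD(&tbl->tbl[i]);	/* initialize everything first */

	spin_lock(&demo_hsh_lock);
	rcu_assign_pointer(demo_hsh, tbl);	/* ...then publish the pointer */
	spin_unlock(&demo_hsh_lock);
	return 0;
}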
1463 | 1462 | ||
1464 | /** | 1463 | /** |
1465 | * netlbl_unlabel_getattr - Get the security attributes for an unlabeled packet | 1464 | * netlbl_unlabel_getattr - Get the security attributes for an unlabeled packet |
1466 | * @skb: the packet | 1465 | * @skb: the packet |
1467 | * @family: protocol family | 1466 | * @family: protocol family |
1468 | * @secattr: the security attributes | 1467 | * @secattr: the security attributes |
1469 | * | 1468 | * |
1470 | * Description: | 1469 | * Description: |
1471 | * Determine the security attributes, if any, for an unlabeled packet and return | 1470 | * Determine the security attributes, if any, for an unlabeled packet and return |
1472 | * them in @secattr. Returns zero on success and negative values on failure. | 1471 | * them in @secattr. Returns zero on success and negative values on failure. |
1473 | * | 1472 | * |
1474 | */ | 1473 | */ |
1475 | int netlbl_unlabel_getattr(const struct sk_buff *skb, | 1474 | int netlbl_unlabel_getattr(const struct sk_buff *skb, |
1476 | u16 family, | 1475 | u16 family, |
1477 | struct netlbl_lsm_secattr *secattr) | 1476 | struct netlbl_lsm_secattr *secattr) |
1478 | { | 1477 | { |
1479 | struct netlbl_unlhsh_iface *iface; | 1478 | struct netlbl_unlhsh_iface *iface; |
1480 | 1479 | ||
1481 | rcu_read_lock(); | 1480 | rcu_read_lock(); |
1482 | iface = netlbl_unlhsh_search_iface(skb->skb_iif); | 1481 | iface = netlbl_unlhsh_search_iface(skb->skb_iif); |
1483 | if (iface == NULL) | 1482 | if (iface == NULL) |
1484 | iface = rcu_dereference(netlbl_unlhsh_def); | 1483 | iface = rcu_dereference(netlbl_unlhsh_def); |
1485 | if (iface == NULL || !iface->valid) | 1484 | if (iface == NULL || !iface->valid) |
1486 | goto unlabel_getattr_nolabel; | 1485 | goto unlabel_getattr_nolabel; |
1487 | switch (family) { | 1486 | switch (family) { |
1488 | case PF_INET: { | 1487 | case PF_INET: { |
1489 | struct iphdr *hdr4; | 1488 | struct iphdr *hdr4; |
1490 | struct netlbl_af4list *addr4; | 1489 | struct netlbl_af4list *addr4; |
1491 | 1490 | ||
1492 | hdr4 = ip_hdr(skb); | 1491 | hdr4 = ip_hdr(skb); |
1493 | addr4 = netlbl_af4list_search(hdr4->saddr, | 1492 | addr4 = netlbl_af4list_search(hdr4->saddr, |
1494 | &iface->addr4_list); | 1493 | &iface->addr4_list); |
1495 | if (addr4 == NULL) | 1494 | if (addr4 == NULL) |
1496 | goto unlabel_getattr_nolabel; | 1495 | goto unlabel_getattr_nolabel; |
1497 | secattr->attr.secid = netlbl_unlhsh_addr4_entry(addr4)->secid; | 1496 | secattr->attr.secid = netlbl_unlhsh_addr4_entry(addr4)->secid; |
1498 | break; | 1497 | break; |
1499 | } | 1498 | } |
1500 | #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) | 1499 | #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) |
1501 | case PF_INET6: { | 1500 | case PF_INET6: { |
1502 | struct ipv6hdr *hdr6; | 1501 | struct ipv6hdr *hdr6; |
1503 | struct netlbl_af6list *addr6; | 1502 | struct netlbl_af6list *addr6; |
1504 | 1503 | ||
1505 | hdr6 = ipv6_hdr(skb); | 1504 | hdr6 = ipv6_hdr(skb); |
1506 | addr6 = netlbl_af6list_search(&hdr6->saddr, | 1505 | addr6 = netlbl_af6list_search(&hdr6->saddr, |
1507 | &iface->addr6_list); | 1506 | &iface->addr6_list); |
1508 | if (addr6 == NULL) | 1507 | if (addr6 == NULL) |
1509 | goto unlabel_getattr_nolabel; | 1508 | goto unlabel_getattr_nolabel; |
1510 | secattr->attr.secid = netlbl_unlhsh_addr6_entry(addr6)->secid; | 1509 | secattr->attr.secid = netlbl_unlhsh_addr6_entry(addr6)->secid; |
1511 | break; | 1510 | break; |
1512 | } | 1511 | } |
1513 | #endif /* IPv6 */ | 1512 | #endif /* IPv6 */ |
1514 | default: | 1513 | default: |
1515 | goto unlabel_getattr_nolabel; | 1514 | goto unlabel_getattr_nolabel; |
1516 | } | 1515 | } |
1517 | rcu_read_unlock(); | 1516 | rcu_read_unlock(); |
1518 | 1517 | ||
1519 | secattr->flags |= NETLBL_SECATTR_SECID; | 1518 | secattr->flags |= NETLBL_SECATTR_SECID; |
1520 | secattr->type = NETLBL_NLTYPE_UNLABELED; | 1519 | secattr->type = NETLBL_NLTYPE_UNLABELED; |
1521 | return 0; | 1520 | return 0; |
1522 | 1521 | ||
1523 | unlabel_getattr_nolabel: | 1522 | unlabel_getattr_nolabel: |
1524 | rcu_read_unlock(); | 1523 | rcu_read_unlock(); |
1525 | if (netlabel_unlabel_acceptflg == 0) | 1524 | if (netlabel_unlabel_acceptflg == 0) |
1526 | return -ENOMSG; | 1525 | return -ENOMSG; |
1527 | secattr->type = NETLBL_NLTYPE_UNLABELED; | 1526 | secattr->type = NETLBL_NLTYPE_UNLABELED; |
1528 | return 0; | 1527 | return 0; |
1529 | } | 1528 | } |
1530 | 1529 | ||
1531 | /** | 1530 | /** |
1532 | * netlbl_unlabel_defconf - Set the default config to allow unlabeled packets | 1531 | * netlbl_unlabel_defconf - Set the default config to allow unlabeled packets |
1533 | * | 1532 | * |
1534 | * Description: | 1533 | * Description: |
1535 | * Set the default NetLabel configuration to allow incoming unlabeled packets | 1534 | * Set the default NetLabel configuration to allow incoming unlabeled packets |
1536 | * and to send unlabeled network traffic by default. | 1535 | * and to send unlabeled network traffic by default. |
1537 | * | 1536 | * |
1538 | */ | 1537 | */ |
1539 | int __init netlbl_unlabel_defconf(void) | 1538 | int __init netlbl_unlabel_defconf(void) |
1540 | { | 1539 | { |
1541 | int ret_val; | 1540 | int ret_val; |
1542 | struct netlbl_dom_map *entry; | 1541 | struct netlbl_dom_map *entry; |
1543 | struct netlbl_audit audit_info; | 1542 | struct netlbl_audit audit_info; |
1544 | 1543 | ||
1545 | /* Only the kernel is allowed to call this function and the only time | 1544 | /* Only the kernel is allowed to call this function and the only time |
1546 | * it is called is at bootup before the audit subsystem is reporting | 1545 | * it is called is at bootup before the audit subsystem is reporting |
1547 | * messages so don't worry too much about these values. */ | 1546 | * messages so don't worry too much about these values. */ |
1548 | security_task_getsecid(current, &audit_info.secid); | 1547 | security_task_getsecid(current, &audit_info.secid); |
1549 | audit_info.loginuid = 0; | 1548 | audit_info.loginuid = 0; |
1550 | audit_info.sessionid = 0; | 1549 | audit_info.sessionid = 0; |
1551 | 1550 | ||
1552 | entry = kzalloc(sizeof(*entry), GFP_KERNEL); | 1551 | entry = kzalloc(sizeof(*entry), GFP_KERNEL); |
1553 | if (entry == NULL) | 1552 | if (entry == NULL) |
1554 | return -ENOMEM; | 1553 | return -ENOMEM; |
1555 | entry->type = NETLBL_NLTYPE_UNLABELED; | 1554 | entry->type = NETLBL_NLTYPE_UNLABELED; |
1556 | ret_val = netlbl_domhsh_add_default(entry, &audit_info); | 1555 | ret_val = netlbl_domhsh_add_default(entry, &audit_info); |
1557 | if (ret_val != 0) | 1556 | if (ret_val != 0) |
1558 | return ret_val; | 1557 | return ret_val; |
1559 | 1558 | ||
1560 | netlbl_unlabel_acceptflg_set(1, &audit_info); | 1559 | netlbl_unlabel_acceptflg_set(1, &audit_info); |
1561 | 1560 | ||
1562 | return 0; | 1561 | return 0; |
1563 | } | 1562 | } |
1564 | 1563 |
security/keys/keyring.c
1 | /* Keyring handling | 1 | /* Keyring handling |
2 | * | 2 | * |
3 | * Copyright (C) 2004-2005, 2008 Red Hat, Inc. All Rights Reserved. | 3 | * Copyright (C) 2004-2005, 2008 Red Hat, Inc. All Rights Reserved. |
4 | * Written by David Howells (dhowells@redhat.com) | 4 | * Written by David Howells (dhowells@redhat.com) |
5 | * | 5 | * |
6 | * This program is free software; you can redistribute it and/or | 6 | * This program is free software; you can redistribute it and/or |
7 | * modify it under the terms of the GNU General Public License | 7 | * modify it under the terms of the GNU General Public License |
8 | * as published by the Free Software Foundation; either version | 8 | * as published by the Free Software Foundation; either version |
9 | * 2 of the License, or (at your option) any later version. | 9 | * 2 of the License, or (at your option) any later version. |
10 | */ | 10 | */ |
11 | 11 | ||
12 | #include <linux/module.h> | 12 | #include <linux/module.h> |
13 | #include <linux/init.h> | 13 | #include <linux/init.h> |
14 | #include <linux/sched.h> | 14 | #include <linux/sched.h> |
15 | #include <linux/slab.h> | 15 | #include <linux/slab.h> |
16 | #include <linux/security.h> | 16 | #include <linux/security.h> |
17 | #include <linux/seq_file.h> | 17 | #include <linux/seq_file.h> |
18 | #include <linux/err.h> | 18 | #include <linux/err.h> |
19 | #include <keys/keyring-type.h> | 19 | #include <keys/keyring-type.h> |
20 | #include <linux/uaccess.h> | 20 | #include <linux/uaccess.h> |
21 | #include "internal.h" | 21 | #include "internal.h" |
22 | 22 | ||
23 | #define rcu_dereference_locked_keyring(keyring) \ | 23 | #define rcu_dereference_locked_keyring(keyring) \ |
24 | (rcu_dereference_protected( \ | 24 | (rcu_dereference_protected( \ |
25 | (keyring)->payload.subscriptions, \ | 25 | (keyring)->payload.subscriptions, \ |
26 | rwsem_is_locked((struct rw_semaphore *)&(keyring)->sem))) | 26 | rwsem_is_locked((struct rw_semaphore *)&(keyring)->sem))) |
27 | 27 | ||
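The helper above is the update-side counterpart of rcu_dereference(): rcu_dereference_protected() skips the read-side machinery entirely and instead takes a condition (here rwsem_is_locked() on the keyring semaphore) stating why the access is safe, which CONFIG_PROVE_RCU can then verify. A hedged sketch of the same idiom, using an invented cfg object guarded by a mutex rather than anything from the keys code:

/* Illustrative sketch only: cfg, cur_cfg, cfg_mutex are made up. */
#include <linux/mutex.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>

struct cfg {
        int value;
};

static struct cfg __rcu *cur_cfg;       /* only written under cfg_mutex */
static DEFINE_MUTEX(cfg_mutex);

static void cfg_replace(struct cfg *new_cfg)
{
        struct cfg *old;

        mutex_lock(&cfg_mutex);
        /*
         * No rcu_read_lock() here: the mutex already pins the pointer,
         * and the condition lets lockdep/sparse check that claim.
         */
        old = rcu_dereference_protected(cur_cfg,
                                        lockdep_is_held(&cfg_mutex));
        rcu_assign_pointer(cur_cfg, new_cfg);
        mutex_unlock(&cfg_mutex);

        if (old) {
                synchronize_rcu();      /* wait out existing readers */
                kfree(old);
        }
}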
28 | #define KEY_LINK_FIXQUOTA 1UL | 28 | #define KEY_LINK_FIXQUOTA 1UL |
29 | 29 | ||
30 | /* | 30 | /* |
31 | * When plumbing the depths of the key tree, this sets a hard limit | 31 | * When plumbing the depths of the key tree, this sets a hard limit |
32 | * on how deep we're willing to go. | 32 | * on how deep we're willing to go. |
33 | */ | 33 | */ |
34 | #define KEYRING_SEARCH_MAX_DEPTH 6 | 34 | #define KEYRING_SEARCH_MAX_DEPTH 6 |
35 | 35 | ||
36 | /* | 36 | /* |
37 | * We keep all named keyrings in a hash to speed looking them up. | 37 | * We keep all named keyrings in a hash to speed looking them up. |
38 | */ | 38 | */ |
39 | #define KEYRING_NAME_HASH_SIZE (1 << 5) | 39 | #define KEYRING_NAME_HASH_SIZE (1 << 5) |
40 | 40 | ||
41 | static struct list_head keyring_name_hash[KEYRING_NAME_HASH_SIZE]; | 41 | static struct list_head keyring_name_hash[KEYRING_NAME_HASH_SIZE]; |
42 | static DEFINE_RWLOCK(keyring_name_lock); | 42 | static DEFINE_RWLOCK(keyring_name_lock); |
43 | 43 | ||
44 | static inline unsigned keyring_hash(const char *desc) | 44 | static inline unsigned keyring_hash(const char *desc) |
45 | { | 45 | { |
46 | unsigned bucket = 0; | 46 | unsigned bucket = 0; |
47 | 47 | ||
48 | for (; *desc; desc++) | 48 | for (; *desc; desc++) |
49 | bucket += (unsigned char)*desc; | 49 | bucket += (unsigned char)*desc; |
50 | 50 | ||
51 | return bucket & (KEYRING_NAME_HASH_SIZE - 1); | 51 | return bucket & (KEYRING_NAME_HASH_SIZE - 1); |
52 | } | 52 | } |
53 | 53 | ||
54 | /* | 54 | /* |
55 | * The keyring key type definition. Keyrings are simply keys of this type and | 55 | * The keyring key type definition. Keyrings are simply keys of this type and |
56 | * can be treated as ordinary keys in addition to having their own special | 56 | * can be treated as ordinary keys in addition to having their own special |
57 | * operations. | 57 | * operations. |
58 | */ | 58 | */ |
59 | static int keyring_instantiate(struct key *keyring, | 59 | static int keyring_instantiate(struct key *keyring, |
60 | const void *data, size_t datalen); | 60 | const void *data, size_t datalen); |
61 | static int keyring_match(const struct key *keyring, const void *criterion); | 61 | static int keyring_match(const struct key *keyring, const void *criterion); |
62 | static void keyring_revoke(struct key *keyring); | 62 | static void keyring_revoke(struct key *keyring); |
63 | static void keyring_destroy(struct key *keyring); | 63 | static void keyring_destroy(struct key *keyring); |
64 | static void keyring_describe(const struct key *keyring, struct seq_file *m); | 64 | static void keyring_describe(const struct key *keyring, struct seq_file *m); |
65 | static long keyring_read(const struct key *keyring, | 65 | static long keyring_read(const struct key *keyring, |
66 | char __user *buffer, size_t buflen); | 66 | char __user *buffer, size_t buflen); |
67 | 67 | ||
68 | struct key_type key_type_keyring = { | 68 | struct key_type key_type_keyring = { |
69 | .name = "keyring", | 69 | .name = "keyring", |
70 | .def_datalen = sizeof(struct keyring_list), | 70 | .def_datalen = sizeof(struct keyring_list), |
71 | .instantiate = keyring_instantiate, | 71 | .instantiate = keyring_instantiate, |
72 | .match = keyring_match, | 72 | .match = keyring_match, |
73 | .revoke = keyring_revoke, | 73 | .revoke = keyring_revoke, |
74 | .destroy = keyring_destroy, | 74 | .destroy = keyring_destroy, |
75 | .describe = keyring_describe, | 75 | .describe = keyring_describe, |
76 | .read = keyring_read, | 76 | .read = keyring_read, |
77 | }; | 77 | }; |
78 | EXPORT_SYMBOL(key_type_keyring); | 78 | EXPORT_SYMBOL(key_type_keyring); |
79 | 79 | ||
80 | /* | 80 | /* |
81 | * Semaphore to serialise link/link calls to prevent two link calls in parallel | 81 | * Semaphore to serialise link/link calls to prevent two link calls in parallel |
82 | * introducing a cycle. | 82 | * introducing a cycle. |
83 | */ | 83 | */ |
84 | static DECLARE_RWSEM(keyring_serialise_link_sem); | 84 | static DECLARE_RWSEM(keyring_serialise_link_sem); |
85 | 85 | ||
86 | /* | 86 | /* |
87 | * Publish the name of a keyring so that it can be found by name (if it has | 87 | * Publish the name of a keyring so that it can be found by name (if it has |
88 | * one). | 88 | * one). |
89 | */ | 89 | */ |
90 | static void keyring_publish_name(struct key *keyring) | 90 | static void keyring_publish_name(struct key *keyring) |
91 | { | 91 | { |
92 | int bucket; | 92 | int bucket; |
93 | 93 | ||
94 | if (keyring->description) { | 94 | if (keyring->description) { |
95 | bucket = keyring_hash(keyring->description); | 95 | bucket = keyring_hash(keyring->description); |
96 | 96 | ||
97 | write_lock(&keyring_name_lock); | 97 | write_lock(&keyring_name_lock); |
98 | 98 | ||
99 | if (!keyring_name_hash[bucket].next) | 99 | if (!keyring_name_hash[bucket].next) |
100 | INIT_LIST_HEAD(&keyring_name_hash[bucket]); | 100 | INIT_LIST_HEAD(&keyring_name_hash[bucket]); |
101 | 101 | ||
102 | list_add_tail(&keyring->type_data.link, | 102 | list_add_tail(&keyring->type_data.link, |
103 | &keyring_name_hash[bucket]); | 103 | &keyring_name_hash[bucket]); |
104 | 104 | ||
105 | write_unlock(&keyring_name_lock); | 105 | write_unlock(&keyring_name_lock); |
106 | } | 106 | } |
107 | } | 107 | } |
108 | 108 | ||
109 | /* | 109 | /* |
110 | * Initialise a keyring. | 110 | * Initialise a keyring. |
111 | * | 111 | * |
112 | * Returns 0 on success, -EINVAL if given any data. | 112 | * Returns 0 on success, -EINVAL if given any data. |
113 | */ | 113 | */ |
114 | static int keyring_instantiate(struct key *keyring, | 114 | static int keyring_instantiate(struct key *keyring, |
115 | const void *data, size_t datalen) | 115 | const void *data, size_t datalen) |
116 | { | 116 | { |
117 | int ret; | 117 | int ret; |
118 | 118 | ||
119 | ret = -EINVAL; | 119 | ret = -EINVAL; |
120 | if (datalen == 0) { | 120 | if (datalen == 0) { |
121 | /* make the keyring available by name if it has one */ | 121 | /* make the keyring available by name if it has one */ |
122 | keyring_publish_name(keyring); | 122 | keyring_publish_name(keyring); |
123 | ret = 0; | 123 | ret = 0; |
124 | } | 124 | } |
125 | 125 | ||
126 | return ret; | 126 | return ret; |
127 | } | 127 | } |
128 | 128 | ||
129 | /* | 129 | /* |
130 | * Match keyrings on their name | 130 | * Match keyrings on their name |
131 | */ | 131 | */ |
132 | static int keyring_match(const struct key *keyring, const void *description) | 132 | static int keyring_match(const struct key *keyring, const void *description) |
133 | { | 133 | { |
134 | return keyring->description && | 134 | return keyring->description && |
135 | strcmp(keyring->description, description) == 0; | 135 | strcmp(keyring->description, description) == 0; |
136 | } | 136 | } |
137 | 137 | ||
138 | /* | 138 | /* |
139 | * Clean up a keyring when it is destroyed. Unpublish its name if it had one | 139 | * Clean up a keyring when it is destroyed. Unpublish its name if it had one |
140 | * and dispose of its data. | 140 | * and dispose of its data. |
141 | */ | 141 | */ |
142 | static void keyring_destroy(struct key *keyring) | 142 | static void keyring_destroy(struct key *keyring) |
143 | { | 143 | { |
144 | struct keyring_list *klist; | 144 | struct keyring_list *klist; |
145 | int loop; | 145 | int loop; |
146 | 146 | ||
147 | if (keyring->description) { | 147 | if (keyring->description) { |
148 | write_lock(&keyring_name_lock); | 148 | write_lock(&keyring_name_lock); |
149 | 149 | ||
150 | if (keyring->type_data.link.next != NULL && | 150 | if (keyring->type_data.link.next != NULL && |
151 | !list_empty(&keyring->type_data.link)) | 151 | !list_empty(&keyring->type_data.link)) |
152 | list_del(&keyring->type_data.link); | 152 | list_del(&keyring->type_data.link); |
153 | 153 | ||
154 | write_unlock(&keyring_name_lock); | 154 | write_unlock(&keyring_name_lock); |
155 | } | 155 | } |
156 | 156 | ||
157 | klist = rcu_dereference_check(keyring->payload.subscriptions, | 157 | klist = rcu_dereference_check(keyring->payload.subscriptions, |
158 | rcu_read_lock_held() || | ||
159 | atomic_read(&keyring->usage) == 0); | 158 | atomic_read(&keyring->usage) == 0); |
160 | if (klist) { | 159 | if (klist) { |
161 | for (loop = klist->nkeys - 1; loop >= 0; loop--) | 160 | for (loop = klist->nkeys - 1; loop >= 0; loop--) |
162 | key_put(klist->keys[loop]); | 161 | key_put(klist->keys[loop]); |
163 | kfree(klist); | 162 | kfree(klist); |
164 | } | 163 | } |
165 | } | 164 | } |
166 | 165 | ||
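The keyring_destroy() hunk above is the change this commit makes to keyring.c: because rcu_dereference_check() now folds rcu_read_lock_held() into its condition automatically, the caller only supplies the extra, keyring-specific justification (a usage count of zero). A minimal hedged sketch of the resulting calling convention, with invented thing/gbl_ptr/gbl_lock names:

/* Illustrative sketch only: thing, gbl_ptr, gbl_lock are made up. */
#include <linux/rcupdate.h>
#include <linux/spinlock.h>

struct thing {
        int data;
};

static struct thing __rcu *gbl_ptr;
static DEFINE_SPINLOCK(gbl_lock);

/* Safe either under rcu_read_lock() or with gbl_lock held. */
static struct thing *thing_access(void)
{
        /*
         * rcu_read_lock_held() no longer needs to be spelled out:
         * rcu_dereference_check() already includes it, so only the
         * caller-specific part of the condition is passed.
         */
        return rcu_dereference_check(gbl_ptr, lockdep_is_held(&gbl_lock));
}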
167 | /* | 166 | /* |
168 | * Describe a keyring for /proc. | 167 | * Describe a keyring for /proc. |
169 | */ | 168 | */ |
170 | static void keyring_describe(const struct key *keyring, struct seq_file *m) | 169 | static void keyring_describe(const struct key *keyring, struct seq_file *m) |
171 | { | 170 | { |
172 | struct keyring_list *klist; | 171 | struct keyring_list *klist; |
173 | 172 | ||
174 | if (keyring->description) | 173 | if (keyring->description) |
175 | seq_puts(m, keyring->description); | 174 | seq_puts(m, keyring->description); |
176 | else | 175 | else |
177 | seq_puts(m, "[anon]"); | 176 | seq_puts(m, "[anon]"); |
178 | 177 | ||
179 | if (key_is_instantiated(keyring)) { | 178 | if (key_is_instantiated(keyring)) { |
180 | rcu_read_lock(); | 179 | rcu_read_lock(); |
181 | klist = rcu_dereference(keyring->payload.subscriptions); | 180 | klist = rcu_dereference(keyring->payload.subscriptions); |
182 | if (klist) | 181 | if (klist) |
183 | seq_printf(m, ": %u/%u", klist->nkeys, klist->maxkeys); | 182 | seq_printf(m, ": %u/%u", klist->nkeys, klist->maxkeys); |
184 | else | 183 | else |
185 | seq_puts(m, ": empty"); | 184 | seq_puts(m, ": empty"); |
186 | rcu_read_unlock(); | 185 | rcu_read_unlock(); |
187 | } | 186 | } |
188 | } | 187 | } |
189 | 188 | ||
190 | /* | 189 | /* |
191 | * Read a list of key IDs from the keyring's contents in binary form | 190 | * Read a list of key IDs from the keyring's contents in binary form |
192 | * | 191 | * |
193 | * The keyring's semaphore is read-locked by the caller. | 192 | * The keyring's semaphore is read-locked by the caller. |
194 | */ | 193 | */ |
195 | static long keyring_read(const struct key *keyring, | 194 | static long keyring_read(const struct key *keyring, |
196 | char __user *buffer, size_t buflen) | 195 | char __user *buffer, size_t buflen) |
197 | { | 196 | { |
198 | struct keyring_list *klist; | 197 | struct keyring_list *klist; |
199 | struct key *key; | 198 | struct key *key; |
200 | size_t qty, tmp; | 199 | size_t qty, tmp; |
201 | int loop, ret; | 200 | int loop, ret; |
202 | 201 | ||
203 | ret = 0; | 202 | ret = 0; |
204 | klist = rcu_dereference_locked_keyring(keyring); | 203 | klist = rcu_dereference_locked_keyring(keyring); |
205 | if (klist) { | 204 | if (klist) { |
206 | /* calculate how much data we could return */ | 205 | /* calculate how much data we could return */ |
207 | qty = klist->nkeys * sizeof(key_serial_t); | 206 | qty = klist->nkeys * sizeof(key_serial_t); |
208 | 207 | ||
209 | if (buffer && buflen > 0) { | 208 | if (buffer && buflen > 0) { |
210 | if (buflen > qty) | 209 | if (buflen > qty) |
211 | buflen = qty; | 210 | buflen = qty; |
212 | 211 | ||
213 | /* copy the IDs of the subscribed keys into the | 212 | /* copy the IDs of the subscribed keys into the |
214 | * buffer */ | 213 | * buffer */ |
215 | ret = -EFAULT; | 214 | ret = -EFAULT; |
216 | 215 | ||
217 | for (loop = 0; loop < klist->nkeys; loop++) { | 216 | for (loop = 0; loop < klist->nkeys; loop++) { |
218 | key = klist->keys[loop]; | 217 | key = klist->keys[loop]; |
219 | 218 | ||
220 | tmp = sizeof(key_serial_t); | 219 | tmp = sizeof(key_serial_t); |
221 | if (tmp > buflen) | 220 | if (tmp > buflen) |
222 | tmp = buflen; | 221 | tmp = buflen; |
223 | 222 | ||
224 | if (copy_to_user(buffer, | 223 | if (copy_to_user(buffer, |
225 | &key->serial, | 224 | &key->serial, |
226 | tmp) != 0) | 225 | tmp) != 0) |
227 | goto error; | 226 | goto error; |
228 | 227 | ||
229 | buflen -= tmp; | 228 | buflen -= tmp; |
230 | if (buflen == 0) | 229 | if (buflen == 0) |
231 | break; | 230 | break; |
232 | buffer += tmp; | 231 | buffer += tmp; |
233 | } | 232 | } |
234 | } | 233 | } |
235 | 234 | ||
236 | ret = qty; | 235 | ret = qty; |
237 | } | 236 | } |
238 | 237 | ||
239 | error: | 238 | error: |
240 | return ret; | 239 | return ret; |
241 | } | 240 | } |
242 | 241 | ||
243 | /* | 242 | /* |
244 | * Allocate a keyring and link into the destination keyring. | 243 | * Allocate a keyring and link into the destination keyring. |
245 | */ | 244 | */ |
246 | struct key *keyring_alloc(const char *description, uid_t uid, gid_t gid, | 245 | struct key *keyring_alloc(const char *description, uid_t uid, gid_t gid, |
247 | const struct cred *cred, unsigned long flags, | 246 | const struct cred *cred, unsigned long flags, |
248 | struct key *dest) | 247 | struct key *dest) |
249 | { | 248 | { |
250 | struct key *keyring; | 249 | struct key *keyring; |
251 | int ret; | 250 | int ret; |
252 | 251 | ||
253 | keyring = key_alloc(&key_type_keyring, description, | 252 | keyring = key_alloc(&key_type_keyring, description, |
254 | uid, gid, cred, | 253 | uid, gid, cred, |
255 | (KEY_POS_ALL & ~KEY_POS_SETATTR) | KEY_USR_ALL, | 254 | (KEY_POS_ALL & ~KEY_POS_SETATTR) | KEY_USR_ALL, |
256 | flags); | 255 | flags); |
257 | 256 | ||
258 | if (!IS_ERR(keyring)) { | 257 | if (!IS_ERR(keyring)) { |
259 | ret = key_instantiate_and_link(keyring, NULL, 0, dest, NULL); | 258 | ret = key_instantiate_and_link(keyring, NULL, 0, dest, NULL); |
260 | if (ret < 0) { | 259 | if (ret < 0) { |
261 | key_put(keyring); | 260 | key_put(keyring); |
262 | keyring = ERR_PTR(ret); | 261 | keyring = ERR_PTR(ret); |
263 | } | 262 | } |
264 | } | 263 | } |
265 | 264 | ||
266 | return keyring; | 265 | return keyring; |
267 | } | 266 | } |
268 | 267 | ||
269 | /** | 268 | /** |
270 | * keyring_search_aux - Search a keyring tree for a key matching some criteria | 269 | * keyring_search_aux - Search a keyring tree for a key matching some criteria |
271 | * @keyring_ref: A pointer to the keyring with possession indicator. | 270 | * @keyring_ref: A pointer to the keyring with possession indicator. |
272 | * @cred: The credentials to use for permissions checks. | 271 | * @cred: The credentials to use for permissions checks. |
273 | * @type: The type of key to search for. | 272 | * @type: The type of key to search for. |
274 | * @description: Parameter for @match. | 273 | * @description: Parameter for @match. |
275 | * @match: Function to rule on whether or not a key is the one required. | 274 | * @match: Function to rule on whether or not a key is the one required. |
276 | * @no_state_check: Don't check if a matching key is bad | 275 | * @no_state_check: Don't check if a matching key is bad |
277 | * | 276 | * |
278 | * Search the supplied keyring tree for a key that matches the criteria given. | 277 | * Search the supplied keyring tree for a key that matches the criteria given. |
279 | * The root keyring and any linked keyrings must grant Search permission to the | 278 | * The root keyring and any linked keyrings must grant Search permission to the |
280 | * caller to be searchable and keys can only be found if they too grant Search | 279 | * caller to be searchable and keys can only be found if they too grant Search |
281 | * to the caller. The possession flag on the root keyring pointer controls use | 280 | * to the caller. The possession flag on the root keyring pointer controls use |
282 | * of the possessor bits in permissions checking of the entire tree. In | 281 | * of the possessor bits in permissions checking of the entire tree. In |
283 | * addition, the LSM gets to forbid keyring searches and key matches. | 282 | * addition, the LSM gets to forbid keyring searches and key matches. |
284 | * | 283 | * |
285 | * The search is performed as a breadth-then-depth search up to the prescribed | 284 | * The search is performed as a breadth-then-depth search up to the prescribed |
286 | * limit (KEYRING_SEARCH_MAX_DEPTH). | 285 | * limit (KEYRING_SEARCH_MAX_DEPTH). |
287 | * | 286 | * |
288 | * Keys are matched to the type provided and are then filtered by the match | 287 | * Keys are matched to the type provided and are then filtered by the match |
289 | * function, which is given the description to use in any way it sees fit. The | 288 | * function, which is given the description to use in any way it sees fit. The |
290 | * match function may use any attributes of a key that it wishes to | 289 | * match function may use any attributes of a key that it wishes to |
291 | * determine the match. Normally the match function from the key type would be | 290 | * determine the match. Normally the match function from the key type would be |
292 | * used. | 291 | * used. |
293 | * | 292 | * |
294 | * RCU is used to prevent the keyring key lists from disappearing without the | 293 | * RCU is used to prevent the keyring key lists from disappearing without the |
295 | * need to take lots of locks. | 294 | * need to take lots of locks. |
296 | * | 295 | * |
297 | * Returns a pointer to the found key and increments the key usage count if | 296 | * Returns a pointer to the found key and increments the key usage count if |
298 | * successful; -EAGAIN if no matching keys were found, or if expired or revoked | 297 | * successful; -EAGAIN if no matching keys were found, or if expired or revoked |
299 | * keys were found; -ENOKEY if only negative keys were found; -ENOTDIR if the | 298 | * keys were found; -ENOKEY if only negative keys were found; -ENOTDIR if the |
300 | * specified keyring wasn't a keyring. | 299 | * specified keyring wasn't a keyring. |
301 | * | 300 | * |
302 | * In the case of a successful return, the possession attribute from | 301 | * In the case of a successful return, the possession attribute from |
303 | * @keyring_ref is propagated to the returned key reference. | 302 | * @keyring_ref is propagated to the returned key reference. |
304 | */ | 303 | */ |
305 | key_ref_t keyring_search_aux(key_ref_t keyring_ref, | 304 | key_ref_t keyring_search_aux(key_ref_t keyring_ref, |
306 | const struct cred *cred, | 305 | const struct cred *cred, |
307 | struct key_type *type, | 306 | struct key_type *type, |
308 | const void *description, | 307 | const void *description, |
309 | key_match_func_t match, | 308 | key_match_func_t match, |
310 | bool no_state_check) | 309 | bool no_state_check) |
311 | { | 310 | { |
312 | struct { | 311 | struct { |
313 | struct keyring_list *keylist; | 312 | struct keyring_list *keylist; |
314 | int kix; | 313 | int kix; |
315 | } stack[KEYRING_SEARCH_MAX_DEPTH]; | 314 | } stack[KEYRING_SEARCH_MAX_DEPTH]; |
316 | 315 | ||
317 | struct keyring_list *keylist; | 316 | struct keyring_list *keylist; |
318 | struct timespec now; | 317 | struct timespec now; |
319 | unsigned long possessed, kflags; | 318 | unsigned long possessed, kflags; |
320 | struct key *keyring, *key; | 319 | struct key *keyring, *key; |
321 | key_ref_t key_ref; | 320 | key_ref_t key_ref; |
322 | long err; | 321 | long err; |
323 | int sp, kix; | 322 | int sp, kix; |
324 | 323 | ||
325 | keyring = key_ref_to_ptr(keyring_ref); | 324 | keyring = key_ref_to_ptr(keyring_ref); |
326 | possessed = is_key_possessed(keyring_ref); | 325 | possessed = is_key_possessed(keyring_ref); |
327 | key_check(keyring); | 326 | key_check(keyring); |
328 | 327 | ||
329 | /* top keyring must have search permission to begin the search */ | 328 | /* top keyring must have search permission to begin the search */ |
330 | err = key_task_permission(keyring_ref, cred, KEY_SEARCH); | 329 | err = key_task_permission(keyring_ref, cred, KEY_SEARCH); |
331 | if (err < 0) { | 330 | if (err < 0) { |
332 | key_ref = ERR_PTR(err); | 331 | key_ref = ERR_PTR(err); |
333 | goto error; | 332 | goto error; |
334 | } | 333 | } |
335 | 334 | ||
336 | key_ref = ERR_PTR(-ENOTDIR); | 335 | key_ref = ERR_PTR(-ENOTDIR); |
337 | if (keyring->type != &key_type_keyring) | 336 | if (keyring->type != &key_type_keyring) |
338 | goto error; | 337 | goto error; |
339 | 338 | ||
340 | rcu_read_lock(); | 339 | rcu_read_lock(); |
341 | 340 | ||
342 | now = current_kernel_time(); | 341 | now = current_kernel_time(); |
343 | err = -EAGAIN; | 342 | err = -EAGAIN; |
344 | sp = 0; | 343 | sp = 0; |
345 | 344 | ||
346 | /* firstly we should check to see if this top-level keyring is what we | 345 | /* firstly we should check to see if this top-level keyring is what we |
347 | * are looking for */ | 346 | * are looking for */ |
348 | key_ref = ERR_PTR(-EAGAIN); | 347 | key_ref = ERR_PTR(-EAGAIN); |
349 | kflags = keyring->flags; | 348 | kflags = keyring->flags; |
350 | if (keyring->type == type && match(keyring, description)) { | 349 | if (keyring->type == type && match(keyring, description)) { |
351 | key = keyring; | 350 | key = keyring; |
352 | if (no_state_check) | 351 | if (no_state_check) |
353 | goto found; | 352 | goto found; |
354 | 353 | ||
355 | /* check it isn't negative and hasn't expired or been | 354 | /* check it isn't negative and hasn't expired or been |
356 | * revoked */ | 355 | * revoked */ |
357 | if (kflags & (1 << KEY_FLAG_REVOKED)) | 356 | if (kflags & (1 << KEY_FLAG_REVOKED)) |
358 | goto error_2; | 357 | goto error_2; |
359 | if (key->expiry && now.tv_sec >= key->expiry) | 358 | if (key->expiry && now.tv_sec >= key->expiry) |
360 | goto error_2; | 359 | goto error_2; |
361 | key_ref = ERR_PTR(key->type_data.reject_error); | 360 | key_ref = ERR_PTR(key->type_data.reject_error); |
362 | if (kflags & (1 << KEY_FLAG_NEGATIVE)) | 361 | if (kflags & (1 << KEY_FLAG_NEGATIVE)) |
363 | goto error_2; | 362 | goto error_2; |
364 | goto found; | 363 | goto found; |
365 | } | 364 | } |
366 | 365 | ||
367 | /* otherwise, the top keyring must not be revoked, expired, or | 366 | /* otherwise, the top keyring must not be revoked, expired, or |
368 | * negatively instantiated if we are to search it */ | 367 | * negatively instantiated if we are to search it */ |
369 | key_ref = ERR_PTR(-EAGAIN); | 368 | key_ref = ERR_PTR(-EAGAIN); |
370 | if (kflags & ((1 << KEY_FLAG_REVOKED) | (1 << KEY_FLAG_NEGATIVE)) || | 369 | if (kflags & ((1 << KEY_FLAG_REVOKED) | (1 << KEY_FLAG_NEGATIVE)) || |
371 | (keyring->expiry && now.tv_sec >= keyring->expiry)) | 370 | (keyring->expiry && now.tv_sec >= keyring->expiry)) |
372 | goto error_2; | 371 | goto error_2; |
373 | 372 | ||
374 | /* start processing a new keyring */ | 373 | /* start processing a new keyring */ |
375 | descend: | 374 | descend: |
376 | if (test_bit(KEY_FLAG_REVOKED, &keyring->flags)) | 375 | if (test_bit(KEY_FLAG_REVOKED, &keyring->flags)) |
377 | goto not_this_keyring; | 376 | goto not_this_keyring; |
378 | 377 | ||
379 | keylist = rcu_dereference(keyring->payload.subscriptions); | 378 | keylist = rcu_dereference(keyring->payload.subscriptions); |
380 | if (!keylist) | 379 | if (!keylist) |
381 | goto not_this_keyring; | 380 | goto not_this_keyring; |
382 | 381 | ||
383 | /* iterate through the keys in this keyring first */ | 382 | /* iterate through the keys in this keyring first */ |
384 | for (kix = 0; kix < keylist->nkeys; kix++) { | 383 | for (kix = 0; kix < keylist->nkeys; kix++) { |
385 | key = keylist->keys[kix]; | 384 | key = keylist->keys[kix]; |
386 | kflags = key->flags; | 385 | kflags = key->flags; |
387 | 386 | ||
388 | /* ignore keys not of this type */ | 387 | /* ignore keys not of this type */ |
389 | if (key->type != type) | 388 | if (key->type != type) |
390 | continue; | 389 | continue; |
391 | 390 | ||
392 | /* skip revoked keys and expired keys */ | 391 | /* skip revoked keys and expired keys */ |
393 | if (!no_state_check) { | 392 | if (!no_state_check) { |
394 | if (kflags & (1 << KEY_FLAG_REVOKED)) | 393 | if (kflags & (1 << KEY_FLAG_REVOKED)) |
395 | continue; | 394 | continue; |
396 | 395 | ||
397 | if (key->expiry && now.tv_sec >= key->expiry) | 396 | if (key->expiry && now.tv_sec >= key->expiry) |
398 | continue; | 397 | continue; |
399 | } | 398 | } |
400 | 399 | ||
401 | /* keys that don't match */ | 400 | /* keys that don't match */ |
402 | if (!match(key, description)) | 401 | if (!match(key, description)) |
403 | continue; | 402 | continue; |
404 | 403 | ||
405 | /* key must have search permissions */ | 404 | /* key must have search permissions */ |
406 | if (key_task_permission(make_key_ref(key, possessed), | 405 | if (key_task_permission(make_key_ref(key, possessed), |
407 | cred, KEY_SEARCH) < 0) | 406 | cred, KEY_SEARCH) < 0) |
408 | continue; | 407 | continue; |
409 | 408 | ||
410 | if (no_state_check) | 409 | if (no_state_check) |
411 | goto found; | 410 | goto found; |
412 | 411 | ||
413 | /* we set a different error code if we pass a negative key */ | 412 | /* we set a different error code if we pass a negative key */ |
414 | if (kflags & (1 << KEY_FLAG_NEGATIVE)) { | 413 | if (kflags & (1 << KEY_FLAG_NEGATIVE)) { |
415 | err = key->type_data.reject_error; | 414 | err = key->type_data.reject_error; |
416 | continue; | 415 | continue; |
417 | } | 416 | } |
418 | 417 | ||
419 | goto found; | 418 | goto found; |
420 | } | 419 | } |
421 | 420 | ||
422 | /* search through the keyrings nested in this one */ | 421 | /* search through the keyrings nested in this one */ |
423 | kix = 0; | 422 | kix = 0; |
424 | ascend: | 423 | ascend: |
425 | for (; kix < keylist->nkeys; kix++) { | 424 | for (; kix < keylist->nkeys; kix++) { |
426 | key = keylist->keys[kix]; | 425 | key = keylist->keys[kix]; |
427 | if (key->type != &key_type_keyring) | 426 | if (key->type != &key_type_keyring) |
428 | continue; | 427 | continue; |
429 | 428 | ||
430 | /* recursively search nested keyrings | 429 | /* recursively search nested keyrings |
431 | * - only search keyrings for which we have search permission | 430 | * - only search keyrings for which we have search permission |
432 | */ | 431 | */ |
433 | if (sp >= KEYRING_SEARCH_MAX_DEPTH) | 432 | if (sp >= KEYRING_SEARCH_MAX_DEPTH) |
434 | continue; | 433 | continue; |
435 | 434 | ||
436 | if (key_task_permission(make_key_ref(key, possessed), | 435 | if (key_task_permission(make_key_ref(key, possessed), |
437 | cred, KEY_SEARCH) < 0) | 436 | cred, KEY_SEARCH) < 0) |
438 | continue; | 437 | continue; |
439 | 438 | ||
440 | /* stack the current position */ | 439 | /* stack the current position */ |
441 | stack[sp].keylist = keylist; | 440 | stack[sp].keylist = keylist; |
442 | stack[sp].kix = kix; | 441 | stack[sp].kix = kix; |
443 | sp++; | 442 | sp++; |
444 | 443 | ||
445 | /* begin again with the new keyring */ | 444 | /* begin again with the new keyring */ |
446 | keyring = key; | 445 | keyring = key; |
447 | goto descend; | 446 | goto descend; |
448 | } | 447 | } |
449 | 448 | ||
450 | /* the keyring we're looking at was disqualified or didn't contain a | 449 | /* the keyring we're looking at was disqualified or didn't contain a |
451 | * matching key */ | 450 | * matching key */ |
452 | not_this_keyring: | 451 | not_this_keyring: |
453 | if (sp > 0) { | 452 | if (sp > 0) { |
454 | /* resume the processing of a keyring higher up in the tree */ | 453 | /* resume the processing of a keyring higher up in the tree */ |
455 | sp--; | 454 | sp--; |
456 | keylist = stack[sp].keylist; | 455 | keylist = stack[sp].keylist; |
457 | kix = stack[sp].kix + 1; | 456 | kix = stack[sp].kix + 1; |
458 | goto ascend; | 457 | goto ascend; |
459 | } | 458 | } |
460 | 459 | ||
461 | key_ref = ERR_PTR(err); | 460 | key_ref = ERR_PTR(err); |
462 | goto error_2; | 461 | goto error_2; |
463 | 462 | ||
464 | /* we found a viable match */ | 463 | /* we found a viable match */ |
465 | found: | 464 | found: |
466 | atomic_inc(&key->usage); | 465 | atomic_inc(&key->usage); |
467 | key_check(key); | 466 | key_check(key); |
468 | key_ref = make_key_ref(key, possessed); | 467 | key_ref = make_key_ref(key, possessed); |
469 | error_2: | 468 | error_2: |
470 | rcu_read_unlock(); | 469 | rcu_read_unlock(); |
471 | error: | 470 | error: |
472 | return key_ref; | 471 | return key_ref; |
473 | } | 472 | } |
474 | 473 | ||
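The kerneldoc above describes the traversal as a breadth-then-depth walk bounded by KEYRING_SEARCH_MAX_DEPTH, carried out iteratively with a small explicit stack rather than recursion (keyring_detect_cycle() further down uses the same shape). Purely as an illustration of that control flow, here is a stripped-down user-space sketch with all RCU, permission and key-state checks omitted and an invented node type:

/* Illustrative sketch only: a user-space mock-up, not a keys API. */
#include <stddef.h>

#define MAX_DEPTH 6     /* mirrors KEYRING_SEARCH_MAX_DEPTH */

struct node {
        int id;
        struct node **children; /* NULL-terminated array, or NULL */
};

/* Breadth-then-depth search for @id below @root, bounded by MAX_DEPTH. */
static struct node *search(struct node *root, int id)
{
        struct {
                struct node **list;
                int ix;
        } stack[MAX_DEPTH];
        struct node **list;
        int sp = 0, ix;

        if (root->id == id)
                return root;
        list = root->children;
descend:
        if (!list)
                goto not_this_level;
        /* first check every entry at this level ... */
        for (ix = 0; list[ix]; ix++)
                if (list[ix]->id == id)
                        return list[ix];
        ix = 0;
ascend:
        /* ... then descend into children, depth permitting */
        for (; list[ix]; ix++) {
                if (!list[ix]->children || sp >= MAX_DEPTH)
                        continue;
                /* stack the current position */
                stack[sp].list = list;
                stack[sp].ix = ix;
                sp++;
                list = list[ix]->children;
                goto descend;
        }
not_this_level:
        if (sp > 0) {
                /* resume where the parent level left off */
                sp--;
                list = stack[sp].list;
                ix = stack[sp].ix + 1;
                goto ascend;
        }
        return NULL;
}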
475 | /** | 474 | /** |
476 | * keyring_search - Search the supplied keyring tree for a matching key | 475 | * keyring_search - Search the supplied keyring tree for a matching key |
477 | * @keyring: The root of the keyring tree to be searched. | 476 | * @keyring: The root of the keyring tree to be searched. |
478 | * @type: The type of keyring we want to find. | 477 | * @type: The type of keyring we want to find. |
479 | * @description: The name of the keyring we want to find. | 478 | * @description: The name of the keyring we want to find. |
480 | * | 479 | * |
481 | * As keyring_search_aux() above, but using the current task's credentials and | 480 | * As keyring_search_aux() above, but using the current task's credentials and |
482 | * type's default matching function. | 481 | * type's default matching function. |
483 | */ | 482 | */ |
484 | key_ref_t keyring_search(key_ref_t keyring, | 483 | key_ref_t keyring_search(key_ref_t keyring, |
485 | struct key_type *type, | 484 | struct key_type *type, |
486 | const char *description) | 485 | const char *description) |
487 | { | 486 | { |
488 | if (!type->match) | 487 | if (!type->match) |
489 | return ERR_PTR(-ENOKEY); | 488 | return ERR_PTR(-ENOKEY); |
490 | 489 | ||
491 | return keyring_search_aux(keyring, current->cred, | 490 | return keyring_search_aux(keyring, current->cred, |
492 | type, description, type->match, false); | 491 | type, description, type->match, false); |
493 | } | 492 | } |
494 | EXPORT_SYMBOL(keyring_search); | 493 | EXPORT_SYMBOL(keyring_search); |
495 | 494 | ||
496 | /* | 495 | /* |
497 | * Search the given keyring only (no recursion). | 496 | * Search the given keyring only (no recursion). |
498 | * | 497 | * |
499 | * The caller must guarantee that the keyring is a keyring and that the | 498 | * The caller must guarantee that the keyring is a keyring and that the |
500 | * permission is granted to search the keyring as no check is made here. | 499 | * permission is granted to search the keyring as no check is made here. |
501 | * | 500 | * |
502 | * RCU is used to make it unnecessary to lock the keyring key list here. | 501 | * RCU is used to make it unnecessary to lock the keyring key list here. |
503 | * | 502 | * |
504 | * Returns a pointer to the found key with usage count incremented if | 503 | * Returns a pointer to the found key with usage count incremented if |
505 | * successful and returns -ENOKEY if not found. Revoked keys and keys not | 504 | * successful and returns -ENOKEY if not found. Revoked keys and keys not |
506 | * providing the requested permission are skipped over. | 505 | * providing the requested permission are skipped over. |
507 | * | 506 | * |
508 | * If successful, the possession indicator is propagated from the keyring ref | 507 | * If successful, the possession indicator is propagated from the keyring ref |
509 | * to the returned key reference. | 508 | * to the returned key reference. |
510 | */ | 509 | */ |
511 | key_ref_t __keyring_search_one(key_ref_t keyring_ref, | 510 | key_ref_t __keyring_search_one(key_ref_t keyring_ref, |
512 | const struct key_type *ktype, | 511 | const struct key_type *ktype, |
513 | const char *description, | 512 | const char *description, |
514 | key_perm_t perm) | 513 | key_perm_t perm) |
515 | { | 514 | { |
516 | struct keyring_list *klist; | 515 | struct keyring_list *klist; |
517 | unsigned long possessed; | 516 | unsigned long possessed; |
518 | struct key *keyring, *key; | 517 | struct key *keyring, *key; |
519 | int loop; | 518 | int loop; |
520 | 519 | ||
521 | keyring = key_ref_to_ptr(keyring_ref); | 520 | keyring = key_ref_to_ptr(keyring_ref); |
522 | possessed = is_key_possessed(keyring_ref); | 521 | possessed = is_key_possessed(keyring_ref); |
523 | 522 | ||
524 | rcu_read_lock(); | 523 | rcu_read_lock(); |
525 | 524 | ||
526 | klist = rcu_dereference(keyring->payload.subscriptions); | 525 | klist = rcu_dereference(keyring->payload.subscriptions); |
527 | if (klist) { | 526 | if (klist) { |
528 | for (loop = 0; loop < klist->nkeys; loop++) { | 527 | for (loop = 0; loop < klist->nkeys; loop++) { |
529 | key = klist->keys[loop]; | 528 | key = klist->keys[loop]; |
530 | 529 | ||
531 | if (key->type == ktype && | 530 | if (key->type == ktype && |
532 | (!key->type->match || | 531 | (!key->type->match || |
533 | key->type->match(key, description)) && | 532 | key->type->match(key, description)) && |
534 | key_permission(make_key_ref(key, possessed), | 533 | key_permission(make_key_ref(key, possessed), |
535 | perm) == 0 && | 534 | perm) == 0 && |
536 | !test_bit(KEY_FLAG_REVOKED, &key->flags) | 535 | !test_bit(KEY_FLAG_REVOKED, &key->flags) |
537 | ) | 536 | ) |
538 | goto found; | 537 | goto found; |
539 | } | 538 | } |
540 | } | 539 | } |
541 | 540 | ||
542 | rcu_read_unlock(); | 541 | rcu_read_unlock(); |
543 | return ERR_PTR(-ENOKEY); | 542 | return ERR_PTR(-ENOKEY); |
544 | 543 | ||
545 | found: | 544 | found: |
546 | atomic_inc(&key->usage); | 545 | atomic_inc(&key->usage); |
547 | rcu_read_unlock(); | 546 | rcu_read_unlock(); |
548 | return make_key_ref(key, possessed); | 547 | return make_key_ref(key, possessed); |
549 | } | 548 | } |
550 | 549 | ||
551 | /* | 550 | /* |
552 | * Find a keyring with the specified name. | 551 | * Find a keyring with the specified name. |
553 | * | 552 | * |
554 | * All named keyrings in the current user namespace are searched, provided they | 553 | * All named keyrings in the current user namespace are searched, provided they |
555 | * grant Search permission directly to the caller (unless this check is | 554 | * grant Search permission directly to the caller (unless this check is |
556 | * skipped). Keyrings whose usage counts have reached zero or which have been | 555 | * skipped). Keyrings whose usage counts have reached zero or which have been |
557 | * revoked are skipped. | 556 | * revoked are skipped. |
558 | * | 557 | * |
559 | * Returns a pointer to the keyring with the keyring's refcount having been | 558 | * Returns a pointer to the keyring with the keyring's refcount having been |
560 | * incremented on success. -ENOKEY is returned if a key could not be found. | 559 | * incremented on success. -ENOKEY is returned if a key could not be found. |
561 | */ | 560 | */ |
562 | struct key *find_keyring_by_name(const char *name, bool skip_perm_check) | 561 | struct key *find_keyring_by_name(const char *name, bool skip_perm_check) |
563 | { | 562 | { |
564 | struct key *keyring; | 563 | struct key *keyring; |
565 | int bucket; | 564 | int bucket; |
566 | 565 | ||
567 | if (!name) | 566 | if (!name) |
568 | return ERR_PTR(-EINVAL); | 567 | return ERR_PTR(-EINVAL); |
569 | 568 | ||
570 | bucket = keyring_hash(name); | 569 | bucket = keyring_hash(name); |
571 | 570 | ||
572 | read_lock(&keyring_name_lock); | 571 | read_lock(&keyring_name_lock); |
573 | 572 | ||
574 | if (keyring_name_hash[bucket].next) { | 573 | if (keyring_name_hash[bucket].next) { |
575 | /* search this hash bucket for a keyring with a matching name | 574 | /* search this hash bucket for a keyring with a matching name |
576 | * that's readable and that hasn't been revoked */ | 575 | * that's readable and that hasn't been revoked */ |
577 | list_for_each_entry(keyring, | 576 | list_for_each_entry(keyring, |
578 | &keyring_name_hash[bucket], | 577 | &keyring_name_hash[bucket], |
579 | type_data.link | 578 | type_data.link |
580 | ) { | 579 | ) { |
581 | if (keyring->user->user_ns != current_user_ns()) | 580 | if (keyring->user->user_ns != current_user_ns()) |
582 | continue; | 581 | continue; |
583 | 582 | ||
584 | if (test_bit(KEY_FLAG_REVOKED, &keyring->flags)) | 583 | if (test_bit(KEY_FLAG_REVOKED, &keyring->flags)) |
585 | continue; | 584 | continue; |
586 | 585 | ||
587 | if (strcmp(keyring->description, name) != 0) | 586 | if (strcmp(keyring->description, name) != 0) |
588 | continue; | 587 | continue; |
589 | 588 | ||
590 | if (!skip_perm_check && | 589 | if (!skip_perm_check && |
591 | key_permission(make_key_ref(keyring, 0), | 590 | key_permission(make_key_ref(keyring, 0), |
592 | KEY_SEARCH) < 0) | 591 | KEY_SEARCH) < 0) |
593 | continue; | 592 | continue; |
594 | 593 | ||
595 | /* we've got a match but we might end up racing with | 594 | /* we've got a match but we might end up racing with |
596 | * key_cleanup() if the keyring is currently 'dead' | 595 | * key_cleanup() if the keyring is currently 'dead' |
597 | * (ie. it has a zero usage count) */ | 596 | * (ie. it has a zero usage count) */ |
598 | if (!atomic_inc_not_zero(&keyring->usage)) | 597 | if (!atomic_inc_not_zero(&keyring->usage)) |
599 | continue; | 598 | continue; |
600 | goto out; | 599 | goto out; |
601 | } | 600 | } |
602 | } | 601 | } |
603 | 602 | ||
604 | keyring = ERR_PTR(-ENOKEY); | 603 | keyring = ERR_PTR(-ENOKEY); |
605 | out: | 604 | out: |
606 | read_unlock(&keyring_name_lock); | 605 | read_unlock(&keyring_name_lock); |
607 | return keyring; | 606 | return keyring; |
608 | } | 607 | } |
609 | 608 | ||
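The lookup above takes its reference with atomic_inc_not_zero() precisely because it may race with key_cleanup() on a keyring whose usage count has already reached zero; such a keyring must be skipped, not revived. A small hedged sketch of that conditional-get idiom, with an invented obj type rather than struct key:

/* Illustrative sketch only: obj is a made-up type, not a keys API. */
#include <linux/atomic.h>

struct obj {
        atomic_t usage;
        /* ... payload ... */
};

/* Take a reference only if the object has not already started dying. */
static struct obj *obj_get_live(struct obj *obj)
{
        /*
         * atomic_inc_not_zero() fails once usage has hit zero, so an
         * object already committed to teardown is never resurrected.
         */
        if (!atomic_inc_not_zero(&obj->usage))
                return NULL;
        return obj;
}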
610 | /* | 609 | /* |
611 | * See if a cycle will be created by inserting acyclic tree B in acyclic | 610 | * See if a cycle will be created by inserting acyclic tree B in acyclic |
612 | * tree A at the topmost level (ie: as a direct child of A). | 611 | * tree A at the topmost level (ie: as a direct child of A). |
613 | * | 612 | * |
614 | * Since we are adding B to A at the top level, checking for cycles should just | 613 | * Since we are adding B to A at the top level, checking for cycles should just |
615 | * be a matter of seeing if node A is somewhere in tree B. | 614 | * be a matter of seeing if node A is somewhere in tree B. |
616 | */ | 615 | */ |
617 | static int keyring_detect_cycle(struct key *A, struct key *B) | 616 | static int keyring_detect_cycle(struct key *A, struct key *B) |
618 | { | 617 | { |
619 | struct { | 618 | struct { |
620 | struct keyring_list *keylist; | 619 | struct keyring_list *keylist; |
621 | int kix; | 620 | int kix; |
622 | } stack[KEYRING_SEARCH_MAX_DEPTH]; | 621 | } stack[KEYRING_SEARCH_MAX_DEPTH]; |
623 | 622 | ||
624 | struct keyring_list *keylist; | 623 | struct keyring_list *keylist; |
625 | struct key *subtree, *key; | 624 | struct key *subtree, *key; |
626 | int sp, kix, ret; | 625 | int sp, kix, ret; |
627 | 626 | ||
628 | rcu_read_lock(); | 627 | rcu_read_lock(); |
629 | 628 | ||
630 | ret = -EDEADLK; | 629 | ret = -EDEADLK; |
631 | if (A == B) | 630 | if (A == B) |
632 | goto cycle_detected; | 631 | goto cycle_detected; |
633 | 632 | ||
634 | subtree = B; | 633 | subtree = B; |
635 | sp = 0; | 634 | sp = 0; |
636 | 635 | ||
637 | /* start processing a new keyring */ | 636 | /* start processing a new keyring */ |
638 | descend: | 637 | descend: |
639 | if (test_bit(KEY_FLAG_REVOKED, &subtree->flags)) | 638 | if (test_bit(KEY_FLAG_REVOKED, &subtree->flags)) |
640 | goto not_this_keyring; | 639 | goto not_this_keyring; |
641 | 640 | ||
642 | keylist = rcu_dereference(subtree->payload.subscriptions); | 641 | keylist = rcu_dereference(subtree->payload.subscriptions); |
643 | if (!keylist) | 642 | if (!keylist) |
644 | goto not_this_keyring; | 643 | goto not_this_keyring; |
645 | kix = 0; | 644 | kix = 0; |
646 | 645 | ||
647 | ascend: | 646 | ascend: |
648 | /* iterate through the remaining keys in this keyring */ | 647 | /* iterate through the remaining keys in this keyring */ |
649 | for (; kix < keylist->nkeys; kix++) { | 648 | for (; kix < keylist->nkeys; kix++) { |
650 | key = keylist->keys[kix]; | 649 | key = keylist->keys[kix]; |
651 | 650 | ||
652 | if (key == A) | 651 | if (key == A) |
653 | goto cycle_detected; | 652 | goto cycle_detected; |
654 | 653 | ||
655 | /* recursively check nested keyrings */ | 654 | /* recursively check nested keyrings */ |
656 | if (key->type == &key_type_keyring) { | 655 | if (key->type == &key_type_keyring) { |
657 | if (sp >= KEYRING_SEARCH_MAX_DEPTH) | 656 | if (sp >= KEYRING_SEARCH_MAX_DEPTH) |
658 | goto too_deep; | 657 | goto too_deep; |
659 | 658 | ||
660 | /* stack the current position */ | 659 | /* stack the current position */ |
661 | stack[sp].keylist = keylist; | 660 | stack[sp].keylist = keylist; |
662 | stack[sp].kix = kix; | 661 | stack[sp].kix = kix; |
663 | sp++; | 662 | sp++; |
664 | 663 | ||
665 | /* begin again with the new keyring */ | 664 | /* begin again with the new keyring */ |
666 | subtree = key; | 665 | subtree = key; |
667 | goto descend; | 666 | goto descend; |
668 | } | 667 | } |
669 | } | 668 | } |
670 | 669 | ||
671 | /* the keyring we're looking at was disqualified or didn't contain a | 670 | /* the keyring we're looking at was disqualified or didn't contain a |
672 | * matching key */ | 671 | * matching key */ |
673 | not_this_keyring: | 672 | not_this_keyring: |
674 | if (sp > 0) { | 673 | if (sp > 0) { |
675 | /* resume the checking of a keyring higher up in the tree */ | 674 | /* resume the checking of a keyring higher up in the tree */ |
676 | sp--; | 675 | sp--; |
677 | keylist = stack[sp].keylist; | 676 | keylist = stack[sp].keylist; |
678 | kix = stack[sp].kix + 1; | 677 | kix = stack[sp].kix + 1; |
679 | goto ascend; | 678 | goto ascend; |
680 | } | 679 | } |
681 | 680 | ||
682 | ret = 0; /* no cycles detected */ | 681 | ret = 0; /* no cycles detected */ |
683 | 682 | ||
684 | error: | 683 | error: |
685 | rcu_read_unlock(); | 684 | rcu_read_unlock(); |
686 | return ret; | 685 | return ret; |
687 | 686 | ||
688 | too_deep: | 687 | too_deep: |
689 | ret = -ELOOP; | 688 | ret = -ELOOP; |
690 | goto error; | 689 | goto error; |
691 | 690 | ||
692 | cycle_detected: | 691 | cycle_detected: |
693 | ret = -EDEADLK; | 692 | ret = -EDEADLK; |
694 | goto error; | 693 | goto error; |
695 | } | 694 | } |
696 | 695 | ||
697 | /* | 696 | /* |
698 | * Dispose of a keyring list after the RCU grace period, freeing the unlinked | 697 | * Dispose of a keyring list after the RCU grace period, freeing the unlinked |
699 | * key | 698 | * key |
700 | */ | 699 | */ |
701 | static void keyring_unlink_rcu_disposal(struct rcu_head *rcu) | 700 | static void keyring_unlink_rcu_disposal(struct rcu_head *rcu) |
702 | { | 701 | { |
703 | struct keyring_list *klist = | 702 | struct keyring_list *klist = |
704 | container_of(rcu, struct keyring_list, rcu); | 703 | container_of(rcu, struct keyring_list, rcu); |
705 | 704 | ||
706 | if (klist->delkey != USHRT_MAX) | 705 | if (klist->delkey != USHRT_MAX) |
707 | key_put(klist->keys[klist->delkey]); | 706 | key_put(klist->keys[klist->delkey]); |
708 | kfree(klist); | 707 | kfree(klist); |
709 | } | 708 | } |
710 | 709 | ||
711 | /* | 710 | /* |
712 | * Preallocate memory so that a key can be linked into a keyring. | 711 | * Preallocate memory so that a key can be linked into a keyring. |
713 | */ | 712 | */ |
714 | int __key_link_begin(struct key *keyring, const struct key_type *type, | 713 | int __key_link_begin(struct key *keyring, const struct key_type *type, |
715 | const char *description, unsigned long *_prealloc) | 714 | const char *description, unsigned long *_prealloc) |
716 | __acquires(&keyring->sem) | 715 | __acquires(&keyring->sem) |
717 | { | 716 | { |
718 | struct keyring_list *klist, *nklist; | 717 | struct keyring_list *klist, *nklist; |
719 | unsigned long prealloc; | 718 | unsigned long prealloc; |
720 | unsigned max; | 719 | unsigned max; |
721 | size_t size; | 720 | size_t size; |
722 | int loop, ret; | 721 | int loop, ret; |
723 | 722 | ||
724 | kenter("%d,%s,%s,", key_serial(keyring), type->name, description); | 723 | kenter("%d,%s,%s,", key_serial(keyring), type->name, description); |
725 | 724 | ||
726 | if (keyring->type != &key_type_keyring) | 725 | if (keyring->type != &key_type_keyring) |
727 | return -ENOTDIR; | 726 | return -ENOTDIR; |
728 | 727 | ||
729 | down_write(&keyring->sem); | 728 | down_write(&keyring->sem); |
730 | 729 | ||
731 | ret = -EKEYREVOKED; | 730 | ret = -EKEYREVOKED; |
732 | if (test_bit(KEY_FLAG_REVOKED, &keyring->flags)) | 731 | if (test_bit(KEY_FLAG_REVOKED, &keyring->flags)) |
733 | goto error_krsem; | 732 | goto error_krsem; |
734 | 733 | ||
735 | /* serialise link/link calls to prevent parallel calls causing a cycle | 734 | /* serialise link/link calls to prevent parallel calls causing a cycle |
736 | * when linking two keyrings in opposite orders */ | 735 | * when linking two keyrings in opposite orders */ |
737 | if (type == &key_type_keyring) | 736 | if (type == &key_type_keyring) |
738 | down_write(&keyring_serialise_link_sem); | 737 | down_write(&keyring_serialise_link_sem); |
739 | 738 | ||
740 | klist = rcu_dereference_locked_keyring(keyring); | 739 | klist = rcu_dereference_locked_keyring(keyring); |
741 | 740 | ||
742 | /* see if there's a matching key we can displace */ | 741 | /* see if there's a matching key we can displace */ |
743 | if (klist && klist->nkeys > 0) { | 742 | if (klist && klist->nkeys > 0) { |
744 | for (loop = klist->nkeys - 1; loop >= 0; loop--) { | 743 | for (loop = klist->nkeys - 1; loop >= 0; loop--) { |
745 | if (klist->keys[loop]->type == type && | 744 | if (klist->keys[loop]->type == type && |
746 | strcmp(klist->keys[loop]->description, | 745 | strcmp(klist->keys[loop]->description, |
747 | description) == 0 | 746 | description) == 0 |
748 | ) { | 747 | ) { |
749 | /* found a match - we'll replace this one with | 748 | /* found a match - we'll replace this one with |
750 | * the new key */ | 749 | * the new key */ |
751 | size = sizeof(struct key *) * klist->maxkeys; | 750 | size = sizeof(struct key *) * klist->maxkeys; |
752 | size += sizeof(*klist); | 751 | size += sizeof(*klist); |
753 | BUG_ON(size > PAGE_SIZE); | 752 | BUG_ON(size > PAGE_SIZE); |
754 | 753 | ||
755 | ret = -ENOMEM; | 754 | ret = -ENOMEM; |
756 | nklist = kmemdup(klist, size, GFP_KERNEL); | 755 | nklist = kmemdup(klist, size, GFP_KERNEL); |
757 | if (!nklist) | 756 | if (!nklist) |
758 | goto error_sem; | 757 | goto error_sem; |
759 | 758 | ||
760 | /* note replacement slot */ | 759 | /* note replacement slot */ |
761 | klist->delkey = nklist->delkey = loop; | 760 | klist->delkey = nklist->delkey = loop; |
762 | prealloc = (unsigned long)nklist; | 761 | prealloc = (unsigned long)nklist; |
763 | goto done; | 762 | goto done; |
764 | } | 763 | } |
765 | } | 764 | } |
766 | } | 765 | } |
767 | 766 | ||
768 | /* check that we aren't going to overrun the user's quota */ | 767 | /* check that we aren't going to overrun the user's quota */ |
769 | ret = key_payload_reserve(keyring, | 768 | ret = key_payload_reserve(keyring, |
770 | keyring->datalen + KEYQUOTA_LINK_BYTES); | 769 | keyring->datalen + KEYQUOTA_LINK_BYTES); |
771 | if (ret < 0) | 770 | if (ret < 0) |
772 | goto error_sem; | 771 | goto error_sem; |
773 | 772 | ||
774 | if (klist && klist->nkeys < klist->maxkeys) { | 773 | if (klist && klist->nkeys < klist->maxkeys) { |
775 | /* there's sufficient slack space to append directly */ | 774 | /* there's sufficient slack space to append directly */ |
776 | nklist = NULL; | 775 | nklist = NULL; |
777 | prealloc = KEY_LINK_FIXQUOTA; | 776 | prealloc = KEY_LINK_FIXQUOTA; |
778 | } else { | 777 | } else { |
779 | /* grow the key list */ | 778 | /* grow the key list */ |
780 | max = 4; | 779 | max = 4; |
781 | if (klist) | 780 | if (klist) |
782 | max += klist->maxkeys; | 781 | max += klist->maxkeys; |
783 | 782 | ||
784 | ret = -ENFILE; | 783 | ret = -ENFILE; |
785 | if (max > USHRT_MAX - 1) | 784 | if (max > USHRT_MAX - 1) |
786 | goto error_quota; | 785 | goto error_quota; |
787 | size = sizeof(*klist) + sizeof(struct key *) * max; | 786 | size = sizeof(*klist) + sizeof(struct key *) * max; |
788 | if (size > PAGE_SIZE) | 787 | if (size > PAGE_SIZE) |
789 | goto error_quota; | 788 | goto error_quota; |
790 | 789 | ||
791 | ret = -ENOMEM; | 790 | ret = -ENOMEM; |
792 | nklist = kmalloc(size, GFP_KERNEL); | 791 | nklist = kmalloc(size, GFP_KERNEL); |
793 | if (!nklist) | 792 | if (!nklist) |
794 | goto error_quota; | 793 | goto error_quota; |
795 | 794 | ||
796 | nklist->maxkeys = max; | 795 | nklist->maxkeys = max; |
797 | if (klist) { | 796 | if (klist) { |
798 | memcpy(nklist->keys, klist->keys, | 797 | memcpy(nklist->keys, klist->keys, |
799 | sizeof(struct key *) * klist->nkeys); | 798 | sizeof(struct key *) * klist->nkeys); |
800 | nklist->delkey = klist->nkeys; | 799 | nklist->delkey = klist->nkeys; |
801 | nklist->nkeys = klist->nkeys + 1; | 800 | nklist->nkeys = klist->nkeys + 1; |
802 | klist->delkey = USHRT_MAX; | 801 | klist->delkey = USHRT_MAX; |
803 | } else { | 802 | } else { |
804 | nklist->nkeys = 1; | 803 | nklist->nkeys = 1; |
805 | nklist->delkey = 0; | 804 | nklist->delkey = 0; |
806 | } | 805 | } |
807 | 806 | ||
808 | /* add the key into the new space */ | 807 | /* add the key into the new space */ |
809 | nklist->keys[nklist->delkey] = NULL; | 808 | nklist->keys[nklist->delkey] = NULL; |
810 | } | 809 | } |
811 | 810 | ||
812 | prealloc = (unsigned long)nklist | KEY_LINK_FIXQUOTA; | 811 | prealloc = (unsigned long)nklist | KEY_LINK_FIXQUOTA; |
813 | done: | 812 | done: |
814 | *_prealloc = prealloc; | 813 | *_prealloc = prealloc; |
815 | kleave(" = 0"); | 814 | kleave(" = 0"); |
816 | return 0; | 815 | return 0; |
817 | 816 | ||
818 | error_quota: | 817 | error_quota: |
819 | /* undo the quota changes */ | 818 | /* undo the quota changes */ |
820 | key_payload_reserve(keyring, | 819 | key_payload_reserve(keyring, |
821 | keyring->datalen - KEYQUOTA_LINK_BYTES); | 820 | keyring->datalen - KEYQUOTA_LINK_BYTES); |
822 | error_sem: | 821 | error_sem: |
823 | if (type == &key_type_keyring) | 822 | if (type == &key_type_keyring) |
824 | up_write(&keyring_serialise_link_sem); | 823 | up_write(&keyring_serialise_link_sem); |
825 | error_krsem: | 824 | error_krsem: |
826 | up_write(&keyring->sem); | 825 | up_write(&keyring->sem); |
827 | kleave(" = %d", ret); | 826 | kleave(" = %d", ret); |
828 | return ret; | 827 | return ret; |
829 | } | 828 | } |
830 | 829 | ||
831 | /* | 830 | /* |
832 | * Check that already instantiated keys aren't going to be a problem. | 831 | * Check that already instantiated keys aren't going to be a problem. |
833 | * | 832 | * |
834 | * The caller must have called __key_link_begin(). There is no need to call | 833 | * The caller must have called __key_link_begin(). There is no need to call |
835 | * this for keys that were created after __key_link_begin() was called. | 834 | * this for keys that were created after __key_link_begin() was called. |
836 | */ | 835 | */ |
837 | int __key_link_check_live_key(struct key *keyring, struct key *key) | 836 | int __key_link_check_live_key(struct key *keyring, struct key *key) |
838 | { | 837 | { |
839 | if (key->type == &key_type_keyring) | 838 | if (key->type == &key_type_keyring) |
840 | /* check that we aren't going to create a cycle by linking one | 839 | /* check that we aren't going to create a cycle by linking one |
841 | * keyring to another */ | 840 | * keyring to another */ |
842 | return keyring_detect_cycle(keyring, key); | 841 | return keyring_detect_cycle(keyring, key); |
843 | return 0; | 842 | return 0; |
844 | } | 843 | } |
845 | 844 | ||
846 | /* | 845 | /* |
847 | * Link a key into a keyring. | 846 | * Link a key into a keyring. |
848 | * | 847 | * |
849 | * Must be called with __key_link_begin() having been called. Discards any | 848 | * Must be called with __key_link_begin() having been called. Discards any |
850 | * already extant link to a matching key if there is one, so that each keyring | 849 | * already extant link to a matching key if there is one, so that each keyring |
851 | * holds at most one link to any given key of a particular type+description | 850 | * holds at most one link to any given key of a particular type+description |
852 | * combination. | 851 | * combination. |
853 | */ | 852 | */ |
854 | void __key_link(struct key *keyring, struct key *key, | 853 | void __key_link(struct key *keyring, struct key *key, |
855 | unsigned long *_prealloc) | 854 | unsigned long *_prealloc) |
856 | { | 855 | { |
857 | struct keyring_list *klist, *nklist; | 856 | struct keyring_list *klist, *nklist; |
858 | 857 | ||
859 | nklist = (struct keyring_list *)(*_prealloc & ~KEY_LINK_FIXQUOTA); | 858 | nklist = (struct keyring_list *)(*_prealloc & ~KEY_LINK_FIXQUOTA); |
860 | *_prealloc = 0; | 859 | *_prealloc = 0; |
861 | 860 | ||
862 | kenter("%d,%d,%p", keyring->serial, key->serial, nklist); | 861 | kenter("%d,%d,%p", keyring->serial, key->serial, nklist); |
863 | 862 | ||
864 | klist = rcu_dereference_protected(keyring->payload.subscriptions, | 863 | klist = rcu_dereference_protected(keyring->payload.subscriptions, |
865 | rwsem_is_locked(&keyring->sem)); | 864 | rwsem_is_locked(&keyring->sem)); |
866 | 865 | ||
867 | atomic_inc(&key->usage); | 866 | atomic_inc(&key->usage); |
868 | 867 | ||
869 | /* there's a matching key we can displace or an empty slot in a newly | 868 | /* there's a matching key we can displace or an empty slot in a newly |
870 | * allocated list we can fill */ | 869 | * allocated list we can fill */ |
871 | if (nklist) { | 870 | if (nklist) { |
872 | kdebug("replace %hu/%hu/%hu", | 871 | kdebug("replace %hu/%hu/%hu", |
873 | nklist->delkey, nklist->nkeys, nklist->maxkeys); | 872 | nklist->delkey, nklist->nkeys, nklist->maxkeys); |
874 | 873 | ||
875 | nklist->keys[nklist->delkey] = key; | 874 | nklist->keys[nklist->delkey] = key; |
876 | 875 | ||
877 | rcu_assign_pointer(keyring->payload.subscriptions, nklist); | 876 | rcu_assign_pointer(keyring->payload.subscriptions, nklist); |
878 | 877 | ||
879 | /* dispose of the old keyring list and, if there was one, the | 878 | /* dispose of the old keyring list and, if there was one, the |
880 | * displaced key */ | 879 | * displaced key */ |
881 | if (klist) { | 880 | if (klist) { |
882 | kdebug("dispose %hu/%hu/%hu", | 881 | kdebug("dispose %hu/%hu/%hu", |
883 | klist->delkey, klist->nkeys, klist->maxkeys); | 882 | klist->delkey, klist->nkeys, klist->maxkeys); |
884 | call_rcu(&klist->rcu, keyring_unlink_rcu_disposal); | 883 | call_rcu(&klist->rcu, keyring_unlink_rcu_disposal); |
885 | } | 884 | } |
886 | } else { | 885 | } else { |
887 | /* there's sufficient slack space to append directly */ | 886 | /* there's sufficient slack space to append directly */ |
888 | klist->keys[klist->nkeys] = key; | 887 | klist->keys[klist->nkeys] = key; |
889 | smp_wmb(); | 888 | smp_wmb(); |
890 | klist->nkeys++; | 889 | klist->nkeys++; |
891 | } | 890 | } |
892 | } | 891 | } |
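
For context, here is a minimal sketch of the updater-side RCU pattern that __key_link() follows: name the lock you hold in the condition passed to rcu_dereference_protected(), publish the replacement with rcu_assign_pointer(), and defer freeing the old copy with call_rcu(). This is a generic illustration, not code from this file; all example_* identifiers are hypothetical.

/* Generic illustration only; example_* names are made up for this sketch. */
struct example_payload {
	struct rcu_head rcu;
	int data;
};

static struct example_payload __rcu *example_ptr;
static DECLARE_RWSEM(example_sem);

static void example_free_rcu(struct rcu_head *rcu)
{
	kfree(container_of(rcu, struct example_payload, rcu));
}

static void example_replace(struct example_payload *new)
{
	struct example_payload *old;

	down_write(&example_sem);
	/* Holding the write lock is the only justification we need to name. */
	old = rcu_dereference_protected(example_ptr,
					rwsem_is_locked(&example_sem));
	rcu_assign_pointer(example_ptr, new);
	up_write(&example_sem);

	if (old)
		call_rcu(&old->rcu, example_free_rcu); /* freed after grace period */
}
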
893 | 892 | ||
894 | /* | 893 | /* |
895 | * Finish linking a key into a keyring. | 894 | * Finish linking a key into a keyring. |
896 | * | 895 | * |
897 | * Must be called with __key_link_begin() having been called. | 896 | * Must be called with __key_link_begin() having been called. |
898 | */ | 897 | */ |
899 | void __key_link_end(struct key *keyring, struct key_type *type, | 898 | void __key_link_end(struct key *keyring, struct key_type *type, |
900 | unsigned long prealloc) | 899 | unsigned long prealloc) |
901 | __releases(&keyring->sem) | 900 | __releases(&keyring->sem) |
902 | { | 901 | { |
903 | BUG_ON(type == NULL); | 902 | BUG_ON(type == NULL); |
904 | BUG_ON(type->name == NULL); | 903 | BUG_ON(type->name == NULL); |
905 | kenter("%d,%s,%lx", keyring->serial, type->name, prealloc); | 904 | kenter("%d,%s,%lx", keyring->serial, type->name, prealloc); |
906 | 905 | ||
907 | if (type == &key_type_keyring) | 906 | if (type == &key_type_keyring) |
908 | up_write(&keyring_serialise_link_sem); | 907 | up_write(&keyring_serialise_link_sem); |
909 | 908 | ||
910 | if (prealloc) { | 909 | if (prealloc) { |
911 | if (prealloc & KEY_LINK_FIXQUOTA) | 910 | if (prealloc & KEY_LINK_FIXQUOTA) |
912 | key_payload_reserve(keyring, | 911 | key_payload_reserve(keyring, |
913 | keyring->datalen - | 912 | keyring->datalen - |
914 | KEYQUOTA_LINK_BYTES); | 913 | KEYQUOTA_LINK_BYTES); |
915 | kfree((struct keyring_list *)(prealloc & ~KEY_LINK_FIXQUOTA)); | 914 | kfree((struct keyring_list *)(prealloc & ~KEY_LINK_FIXQUOTA)); |
916 | } | 915 | } |
917 | up_write(&keyring->sem); | 916 | up_write(&keyring->sem); |
918 | } | 917 | } |
919 | 918 | ||
920 | /** | 919 | /** |
921 | * key_link - Link a key to a keyring | 920 | * key_link - Link a key to a keyring |
922 | * @keyring: The keyring to make the link in. | 921 | * @keyring: The keyring to make the link in. |
923 | * @key: The key to link to. | 922 | * @key: The key to link to. |
924 | * | 923 | * |
925 | * Make a link in a keyring to a key, such that the keyring holds a reference | 924 | * Make a link in a keyring to a key, such that the keyring holds a reference |
926 | * on that key and the key can potentially be found by searching that keyring. | 925 | * on that key and the key can potentially be found by searching that keyring. |
927 | * | 926 | * |
928 | * This function will write-lock the keyring's semaphore and will consume some | 927 | * This function will write-lock the keyring's semaphore and will consume some |
929 | * of the user's key data quota to hold the link. | 928 | * of the user's key data quota to hold the link. |
930 | * | 929 | * |
931 | * Returns 0 if successful, -ENOTDIR if the keyring isn't a keyring, | 930 | * Returns 0 if successful, -ENOTDIR if the keyring isn't a keyring, |
932 | * -EKEYREVOKED if the keyring has been revoked, -ENFILE if the keyring is | 931 | * -EKEYREVOKED if the keyring has been revoked, -ENFILE if the keyring is |
933 | * full, -EDQUOT if there is insufficient key data quota remaining to add | 932 | * full, -EDQUOT if there is insufficient key data quota remaining to add |
934 | * another link or -ENOMEM if there's insufficient memory. | 933 | * another link or -ENOMEM if there's insufficient memory. |
935 | * | 934 | * |
936 | * It is assumed that the caller has checked that it is permitted for a link to | 935 | * It is assumed that the caller has checked that it is permitted for a link to |
937 | * be made (the keyring should have Write permission and the key Link | 936 | * be made (the keyring should have Write permission and the key Link |
938 | * permission). | 937 | * permission). |
939 | */ | 938 | */ |
940 | int key_link(struct key *keyring, struct key *key) | 939 | int key_link(struct key *keyring, struct key *key) |
941 | { | 940 | { |
942 | unsigned long prealloc; | 941 | unsigned long prealloc; |
943 | int ret; | 942 | int ret; |
944 | 943 | ||
945 | key_check(keyring); | 944 | key_check(keyring); |
946 | key_check(key); | 945 | key_check(key); |
947 | 946 | ||
948 | ret = __key_link_begin(keyring, key->type, key->description, &prealloc); | 947 | ret = __key_link_begin(keyring, key->type, key->description, &prealloc); |
949 | if (ret == 0) { | 948 | if (ret == 0) { |
950 | ret = __key_link_check_live_key(keyring, key); | 949 | ret = __key_link_check_live_key(keyring, key); |
951 | if (ret == 0) | 950 | if (ret == 0) |
952 | __key_link(keyring, key, &prealloc); | 951 | __key_link(keyring, key, &prealloc); |
953 | __key_link_end(keyring, key->type, prealloc); | 952 | __key_link_end(keyring, key->type, prealloc); |
954 | } | 953 | } |
955 | 954 | ||
956 | return ret; | 955 | return ret; |
957 | } | 956 | } |
958 | EXPORT_SYMBOL(key_link); | 957 | EXPORT_SYMBOL(key_link); |
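
A minimal caller sketch for key_link() (hypothetical example_* names, not part of this patch), assuming the caller already holds references on both keys and has verified Write permission on the keyring and Link permission on the key:

/* Hypothetical caller: link @key into @keyring and report a full keyring. */
static int example_link_key(struct key *keyring, struct key *key)
{
	int ret = key_link(keyring, key);

	if (ret == -ENFILE)
		pr_warn("keyring %d is full\n", key_serial(keyring));
	return ret;
}
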
959 | 958 | ||
960 | /** | 959 | /** |
961 | * key_unlink - Unlink the first link to a key from a keyring. | 960 | * key_unlink - Unlink the first link to a key from a keyring. |
962 | * @keyring: The keyring to remove the link from. | 961 | * @keyring: The keyring to remove the link from. |
963 | * @key: The key the link is to. | 962 | * @key: The key the link is to. |
964 | * | 963 | * |
965 | * Remove a link from a keyring to a key. | 964 | * Remove a link from a keyring to a key. |
966 | * | 965 | * |
967 | * This function will write-lock the keyring's semaphore. | 966 | * This function will write-lock the keyring's semaphore. |
968 | * | 967 | * |
969 | * Returns 0 if successful, -ENOTDIR if the keyring isn't a keyring, -ENOENT if | 968 | * Returns 0 if successful, -ENOTDIR if the keyring isn't a keyring, -ENOENT if |
970 | * the key isn't linked to by the keyring or -ENOMEM if there's insufficient | 969 | * the key isn't linked to by the keyring or -ENOMEM if there's insufficient |
971 | * memory. | 970 | * memory. |
972 | * | 971 | * |
973 | * It is assumed that the caller has checked that it is permitted for a link to | 972 | * It is assumed that the caller has checked that it is permitted for a link to |
974 | * be removed (the keyring should have Write permission; no permissions are | 973 | * be removed (the keyring should have Write permission; no permissions are |
975 | * required on the key). | 974 | * required on the key). |
976 | */ | 975 | */ |
977 | int key_unlink(struct key *keyring, struct key *key) | 976 | int key_unlink(struct key *keyring, struct key *key) |
978 | { | 977 | { |
979 | struct keyring_list *klist, *nklist; | 978 | struct keyring_list *klist, *nklist; |
980 | int loop, ret; | 979 | int loop, ret; |
981 | 980 | ||
982 | key_check(keyring); | 981 | key_check(keyring); |
983 | key_check(key); | 982 | key_check(key); |
984 | 983 | ||
985 | ret = -ENOTDIR; | 984 | ret = -ENOTDIR; |
986 | if (keyring->type != &key_type_keyring) | 985 | if (keyring->type != &key_type_keyring) |
987 | goto error; | 986 | goto error; |
988 | 987 | ||
989 | down_write(&keyring->sem); | 988 | down_write(&keyring->sem); |
990 | 989 | ||
991 | klist = rcu_dereference_locked_keyring(keyring); | 990 | klist = rcu_dereference_locked_keyring(keyring); |
992 | if (klist) { | 991 | if (klist) { |
993 | /* search the keyring for the key */ | 992 | /* search the keyring for the key */ |
994 | for (loop = 0; loop < klist->nkeys; loop++) | 993 | for (loop = 0; loop < klist->nkeys; loop++) |
995 | if (klist->keys[loop] == key) | 994 | if (klist->keys[loop] == key) |
996 | goto key_is_present; | 995 | goto key_is_present; |
997 | } | 996 | } |
998 | 997 | ||
999 | up_write(&keyring->sem); | 998 | up_write(&keyring->sem); |
1000 | ret = -ENOENT; | 999 | ret = -ENOENT; |
1001 | goto error; | 1000 | goto error; |
1002 | 1001 | ||
1003 | key_is_present: | 1002 | key_is_present: |
1004 | /* we need to copy the key list for RCU purposes */ | 1003 | /* we need to copy the key list for RCU purposes */ |
1005 | nklist = kmalloc(sizeof(*klist) + | 1004 | nklist = kmalloc(sizeof(*klist) + |
1006 | sizeof(struct key *) * klist->maxkeys, | 1005 | sizeof(struct key *) * klist->maxkeys, |
1007 | GFP_KERNEL); | 1006 | GFP_KERNEL); |
1008 | if (!nklist) | 1007 | if (!nklist) |
1009 | goto nomem; | 1008 | goto nomem; |
1010 | nklist->maxkeys = klist->maxkeys; | 1009 | nklist->maxkeys = klist->maxkeys; |
1011 | nklist->nkeys = klist->nkeys - 1; | 1010 | nklist->nkeys = klist->nkeys - 1; |
1012 | 1011 | ||
1013 | if (loop > 0) | 1012 | if (loop > 0) |
1014 | memcpy(&nklist->keys[0], | 1013 | memcpy(&nklist->keys[0], |
1015 | &klist->keys[0], | 1014 | &klist->keys[0], |
1016 | loop * sizeof(struct key *)); | 1015 | loop * sizeof(struct key *)); |
1017 | 1016 | ||
1018 | if (loop < nklist->nkeys) | 1017 | if (loop < nklist->nkeys) |
1019 | memcpy(&nklist->keys[loop], | 1018 | memcpy(&nklist->keys[loop], |
1020 | &klist->keys[loop + 1], | 1019 | &klist->keys[loop + 1], |
1021 | (nklist->nkeys - loop) * sizeof(struct key *)); | 1020 | (nklist->nkeys - loop) * sizeof(struct key *)); |
1022 | 1021 | ||
1023 | /* adjust the user's quota */ | 1022 | /* adjust the user's quota */ |
1024 | key_payload_reserve(keyring, | 1023 | key_payload_reserve(keyring, |
1025 | keyring->datalen - KEYQUOTA_LINK_BYTES); | 1024 | keyring->datalen - KEYQUOTA_LINK_BYTES); |
1026 | 1025 | ||
1027 | rcu_assign_pointer(keyring->payload.subscriptions, nklist); | 1026 | rcu_assign_pointer(keyring->payload.subscriptions, nklist); |
1028 | 1027 | ||
1029 | up_write(&keyring->sem); | 1028 | up_write(&keyring->sem); |
1030 | 1029 | ||
1031 | /* schedule for later cleanup */ | 1030 | /* schedule for later cleanup */ |
1032 | klist->delkey = loop; | 1031 | klist->delkey = loop; |
1033 | call_rcu(&klist->rcu, keyring_unlink_rcu_disposal); | 1032 | call_rcu(&klist->rcu, keyring_unlink_rcu_disposal); |
1034 | 1033 | ||
1035 | ret = 0; | 1034 | ret = 0; |
1036 | 1035 | ||
1037 | error: | 1036 | error: |
1038 | return ret; | 1037 | return ret; |
1039 | nomem: | 1038 | nomem: |
1040 | ret = -ENOMEM; | 1039 | ret = -ENOMEM; |
1041 | up_write(&keyring->sem); | 1040 | up_write(&keyring->sem); |
1042 | goto error; | 1041 | goto error; |
1043 | } | 1042 | } |
1044 | EXPORT_SYMBOL(key_unlink); | 1043 | EXPORT_SYMBOL(key_unlink); |
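
A matching sketch for key_unlink(), again with hypothetical names; only Write permission on the keyring is required:

/* Hypothetical caller: drop the first link to @key from @keyring. */
static int example_unlink_key(struct key *keyring, struct key *key)
{
	int ret = key_unlink(keyring, key);

	if (ret == -ENOENT)
		pr_debug("key %d not linked to keyring %d\n",
			 key_serial(key), key_serial(keyring));
	return ret;
}
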
1045 | 1044 | ||
1046 | /* | 1045 | /* |
1047 | * Dispose of a keyring list after the RCU grace period, releasing the keys it | 1046 | * Dispose of a keyring list after the RCU grace period, releasing the keys it |
1048 | * links to. | 1047 | * links to. |
1049 | */ | 1048 | */ |
1050 | static void keyring_clear_rcu_disposal(struct rcu_head *rcu) | 1049 | static void keyring_clear_rcu_disposal(struct rcu_head *rcu) |
1051 | { | 1050 | { |
1052 | struct keyring_list *klist; | 1051 | struct keyring_list *klist; |
1053 | int loop; | 1052 | int loop; |
1054 | 1053 | ||
1055 | klist = container_of(rcu, struct keyring_list, rcu); | 1054 | klist = container_of(rcu, struct keyring_list, rcu); |
1056 | 1055 | ||
1057 | for (loop = klist->nkeys - 1; loop >= 0; loop--) | 1056 | for (loop = klist->nkeys - 1; loop >= 0; loop--) |
1058 | key_put(klist->keys[loop]); | 1057 | key_put(klist->keys[loop]); |
1059 | 1058 | ||
1060 | kfree(klist); | 1059 | kfree(klist); |
1061 | } | 1060 | } |
1062 | 1061 | ||
1063 | /** | 1062 | /** |
1064 | * keyring_clear - Clear a keyring | 1063 | * keyring_clear - Clear a keyring |
1065 | * @keyring: The keyring to clear. | 1064 | * @keyring: The keyring to clear. |
1066 | * | 1065 | * |
1067 | * Clear the contents of the specified keyring. | 1066 | * Clear the contents of the specified keyring. |
1068 | * | 1067 | * |
1069 | * Returns 0 if successful or -ENOTDIR if the keyring isn't a keyring. | 1068 | * Returns 0 if successful or -ENOTDIR if the keyring isn't a keyring. |
1070 | */ | 1069 | */ |
1071 | int keyring_clear(struct key *keyring) | 1070 | int keyring_clear(struct key *keyring) |
1072 | { | 1071 | { |
1073 | struct keyring_list *klist; | 1072 | struct keyring_list *klist; |
1074 | int ret; | 1073 | int ret; |
1075 | 1074 | ||
1076 | ret = -ENOTDIR; | 1075 | ret = -ENOTDIR; |
1077 | if (keyring->type == &key_type_keyring) { | 1076 | if (keyring->type == &key_type_keyring) { |
1078 | /* detach the pointer block with the locks held */ | 1077 | /* detach the pointer block with the locks held */ |
1079 | down_write(&keyring->sem); | 1078 | down_write(&keyring->sem); |
1080 | 1079 | ||
1081 | klist = rcu_dereference_locked_keyring(keyring); | 1080 | klist = rcu_dereference_locked_keyring(keyring); |
1082 | if (klist) { | 1081 | if (klist) { |
1083 | /* adjust the quota */ | 1082 | /* adjust the quota */ |
1084 | key_payload_reserve(keyring, | 1083 | key_payload_reserve(keyring, |
1085 | sizeof(struct keyring_list)); | 1084 | sizeof(struct keyring_list)); |
1086 | 1085 | ||
1087 | rcu_assign_pointer(keyring->payload.subscriptions, | 1086 | rcu_assign_pointer(keyring->payload.subscriptions, |
1088 | NULL); | 1087 | NULL); |
1089 | } | 1088 | } |
1090 | 1089 | ||
1091 | up_write(&keyring->sem); | 1090 | up_write(&keyring->sem); |
1092 | 1091 | ||
1093 | /* free the keys after the locks have been dropped */ | 1092 | /* free the keys after the locks have been dropped */ |
1094 | if (klist) | 1093 | if (klist) |
1095 | call_rcu(&klist->rcu, keyring_clear_rcu_disposal); | 1094 | call_rcu(&klist->rcu, keyring_clear_rcu_disposal); |
1096 | 1095 | ||
1097 | ret = 0; | 1096 | ret = 0; |
1098 | } | 1097 | } |
1099 | 1098 | ||
1100 | return ret; | 1099 | return ret; |
1101 | } | 1100 | } |
1102 | EXPORT_SYMBOL(keyring_clear); | 1101 | EXPORT_SYMBOL(keyring_clear); |
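
A brief caller's-eye sketch for keyring_clear() (hypothetical name; the only failure mode is passing a key that isn't a keyring):

/* Hypothetical caller: empty @keyring; -ENOTDIR means it wasn't a keyring. */
static void example_empty_keyring(struct key *keyring)
{
	if (keyring_clear(keyring) == -ENOTDIR)
		pr_err("key %d is not a keyring\n", key_serial(keyring));
}
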
1103 | 1102 | ||
1104 | /* | 1103 | /* |
1105 | * Dispose of the links from a revoked keyring. | 1104 | * Dispose of the links from a revoked keyring. |
1106 | * | 1105 | * |
1107 | * This is called with the key sem write-locked. | 1106 | * This is called with the key sem write-locked. |
1108 | */ | 1107 | */ |
1109 | static void keyring_revoke(struct key *keyring) | 1108 | static void keyring_revoke(struct key *keyring) |
1110 | { | 1109 | { |
1111 | struct keyring_list *klist; | 1110 | struct keyring_list *klist; |
1112 | 1111 | ||
1113 | klist = rcu_dereference_locked_keyring(keyring); | 1112 | klist = rcu_dereference_locked_keyring(keyring); |
1114 | 1113 | ||
1115 | /* adjust the quota */ | 1114 | /* adjust the quota */ |
1116 | key_payload_reserve(keyring, 0); | 1115 | key_payload_reserve(keyring, 0); |
1117 | 1116 | ||
1118 | if (klist) { | 1117 | if (klist) { |
1119 | rcu_assign_pointer(keyring->payload.subscriptions, NULL); | 1118 | rcu_assign_pointer(keyring->payload.subscriptions, NULL); |
1120 | call_rcu(&klist->rcu, keyring_clear_rcu_disposal); | 1119 | call_rcu(&klist->rcu, keyring_clear_rcu_disposal); |
1121 | } | 1120 | } |
1122 | } | 1121 | } |
1123 | 1122 | ||
1124 | /* | 1123 | /* |
1125 | * Determine whether a key is dead. | 1124 | * Determine whether a key is dead. |
1126 | */ | 1125 | */ |
1127 | static bool key_is_dead(struct key *key, time_t limit) | 1126 | static bool key_is_dead(struct key *key, time_t limit) |
1128 | { | 1127 | { |
1129 | return test_bit(KEY_FLAG_DEAD, &key->flags) || | 1128 | return test_bit(KEY_FLAG_DEAD, &key->flags) || |
1130 | (key->expiry > 0 && key->expiry <= limit); | 1129 | (key->expiry > 0 && key->expiry <= limit); |
1131 | } | 1130 | } |
1132 | 1131 | ||
1133 | /* | 1132 | /* |
1134 | * Collect garbage from the contents of a keyring, replacing the old list with | 1133 | * Collect garbage from the contents of a keyring, replacing the old list with |
1135 | * a new one with the pointers all shuffled down. | 1134 | * a new one with the pointers all shuffled down. |
1136 | * | 1135 | * |
1137 | * Dead keys are classed as ones that are flagged as being dead or are revoked, | 1136 | * Dead keys are classed as ones that are flagged as being dead or are revoked, |
1138 | * expired or negative keys that were revoked or expired before the specified | 1137 | * expired or negative keys that were revoked or expired before the specified |
1139 | * limit. | 1138 | * limit. |
1140 | */ | 1139 | */ |
1141 | void keyring_gc(struct key *keyring, time_t limit) | 1140 | void keyring_gc(struct key *keyring, time_t limit) |
1142 | { | 1141 | { |
1143 | struct keyring_list *klist, *new; | 1142 | struct keyring_list *klist, *new; |
1144 | struct key *key; | 1143 | struct key *key; |
1145 | int loop, keep, max; | 1144 | int loop, keep, max; |
1146 | 1145 | ||
1147 | kenter("{%x,%s}", key_serial(keyring), keyring->description); | 1146 | kenter("{%x,%s}", key_serial(keyring), keyring->description); |
1148 | 1147 | ||
1149 | down_write(&keyring->sem); | 1148 | down_write(&keyring->sem); |
1150 | 1149 | ||
1151 | klist = rcu_dereference_locked_keyring(keyring); | 1150 | klist = rcu_dereference_locked_keyring(keyring); |
1152 | if (!klist) | 1151 | if (!klist) |
1153 | goto no_klist; | 1152 | goto no_klist; |
1154 | 1153 | ||
1155 | /* work out how many subscriptions we're keeping */ | 1154 | /* work out how many subscriptions we're keeping */ |
1156 | keep = 0; | 1155 | keep = 0; |
1157 | for (loop = klist->nkeys - 1; loop >= 0; loop--) | 1156 | for (loop = klist->nkeys - 1; loop >= 0; loop--) |
1158 | if (!key_is_dead(klist->keys[loop], limit)) | 1157 | if (!key_is_dead(klist->keys[loop], limit)) |
1159 | keep++; | 1158 | keep++; |
1160 | 1159 | ||
1161 | if (keep == klist->nkeys) | 1160 | if (keep == klist->nkeys) |
1162 | goto just_return; | 1161 | goto just_return; |
1163 | 1162 | ||
1164 | /* allocate a new keyring payload */ | 1163 | /* allocate a new keyring payload */ |
1165 | max = roundup(keep, 4); | 1164 | max = roundup(keep, 4); |
1166 | new = kmalloc(sizeof(struct keyring_list) + max * sizeof(struct key *), | 1165 | new = kmalloc(sizeof(struct keyring_list) + max * sizeof(struct key *), |
1167 | GFP_KERNEL); | 1166 | GFP_KERNEL); |
1168 | if (!new) | 1167 | if (!new) |
1169 | goto nomem; | 1168 | goto nomem; |
1170 | new->maxkeys = max; | 1169 | new->maxkeys = max; |
1171 | new->nkeys = 0; | 1170 | new->nkeys = 0; |
1172 | new->delkey = 0; | 1171 | new->delkey = 0; |
1173 | 1172 | ||
1174 | /* install the live keys | 1173 | /* install the live keys |
1175 | * - must take care as expired keys may be updated back to life | 1174 | * - must take care as expired keys may be updated back to life |
1176 | */ | 1175 | */ |
1177 | keep = 0; | 1176 | keep = 0; |
1178 | for (loop = klist->nkeys - 1; loop >= 0; loop--) { | 1177 | for (loop = klist->nkeys - 1; loop >= 0; loop--) { |
1179 | key = klist->keys[loop]; | 1178 | key = klist->keys[loop]; |
1180 | if (!key_is_dead(key, limit)) { | 1179 | if (!key_is_dead(key, limit)) { |
1181 | if (keep >= max) | 1180 | if (keep >= max) |
1182 | goto discard_new; | 1181 | goto discard_new; |
1183 | new->keys[keep++] = key_get(key); | 1182 | new->keys[keep++] = key_get(key); |
1184 | } | 1183 | } |
1185 | } | 1184 | } |
1186 | new->nkeys = keep; | 1185 | new->nkeys = keep; |
1187 | 1186 | ||
1188 | /* adjust the quota */ | 1187 | /* adjust the quota */ |
1189 | key_payload_reserve(keyring, | 1188 | key_payload_reserve(keyring, |
1190 | sizeof(struct keyring_list) + | 1189 | sizeof(struct keyring_list) + |
1191 | KEYQUOTA_LINK_BYTES * keep); | 1190 | KEYQUOTA_LINK_BYTES * keep); |
1192 | 1191 | ||
1193 | if (keep == 0) { | 1192 | if (keep == 0) { |
1194 | rcu_assign_pointer(keyring->payload.subscriptions, NULL); | 1193 | rcu_assign_pointer(keyring->payload.subscriptions, NULL); |
1195 | kfree(new); | 1194 | kfree(new); |
1196 | } else { | 1195 | } else { |
1197 | rcu_assign_pointer(keyring->payload.subscriptions, new); | 1196 | rcu_assign_pointer(keyring->payload.subscriptions, new); |
1198 | } | 1197 | } |
1199 | 1198 | ||
1200 | up_write(&keyring->sem); | 1199 | up_write(&keyring->sem); |
1201 | 1200 | ||
1202 | call_rcu(&klist->rcu, keyring_clear_rcu_disposal); | 1201 | call_rcu(&klist->rcu, keyring_clear_rcu_disposal); |
1203 | kleave(" [yes]"); | 1202 | kleave(" [yes]"); |
1204 | return; | 1203 | return; |
1205 | 1204 | ||
1206 | discard_new: | 1205 | discard_new: |
1207 | new->nkeys = keep; | 1206 | new->nkeys = keep; |
1208 | keyring_clear_rcu_disposal(&new->rcu); | 1207 | keyring_clear_rcu_disposal(&new->rcu); |
1209 | up_write(&keyring->sem); | 1208 | up_write(&keyring->sem); |
1210 | kleave(" [discard]"); | 1209 | kleave(" [discard]"); |
1211 | return; | 1210 | return; |
1212 | 1211 | ||
1213 | just_return: | 1212 | just_return: |
1214 | up_write(&keyring->sem); | 1213 | up_write(&keyring->sem); |
1215 | kleave(" [no dead]"); | 1214 | kleave(" [no dead]"); |
1216 | return; | 1215 | return; |
1217 | 1216 | ||
1218 | no_klist: | 1217 | no_klist: |
1219 | up_write(&keyring->sem); | 1218 | up_write(&keyring->sem); |
1220 | kleave(" [no_klist]"); | 1219 | kleave(" [no_klist]"); |
1221 | return; | 1220 | return; |
1222 | 1221 | ||
1223 | nomem: | 1222 | nomem: |
1224 | up_write(&keyring->sem); | 1223 | up_write(&keyring->sem); |
1225 | kleave(" [oom]"); | 1224 | kleave(" [oom]"); |
1226 | } | 1225 | } |
1227 | 1226 |