Commit 3116f0e3df0a67ad56f15dd4c5f6cefb04bb4a98

Authored by Paul Menage
Committed by Linus Torvalds
1 parent c27e8818a0

CGroup API files: move "releasable" to cgroup_debug subsystem

The "releasable" control file provided by the cgroup framework exports the
state of a per-cgroup flag that's related to the notify-on-release feature.
This isn't generally useful unless you're trying to debug this
particular feature of cgroups.

This patch moves the "releasable" file to the cgroup_debug subsystem.

Signed-off-by: Paul Menage <menage@google.com>
Cc: "Li Zefan" <lizf@cn.fujitsu.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: Paul Jackson <pj@sgi.com>
Cc: Pavel Emelyanov <xemul@openvz.org>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: "YAMAMOTO Takashi" <yamamoto@valinux.co.jp>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

Showing 3 changed files with 22 additions and 24 deletions
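For context before the diffs: the move is straightforward because the cftype API declared in include/linux/cgroup.h (shown below) lets any subsystem export a read-only flag via a read_u64 handler registered from its populate() method. A minimal sketch of what the cgroup_debug side could look like, assuming that same API; the names releasable_read_u64, debug_files and debug_populate are illustrative, not necessarily those used in the patch:

        /* Report the CGRP_RELEASABLE bit of this cgroup as a single integer.
         * (Illustrative sketch, not the patch itself.) */
        static u64 releasable_read_u64(struct cgroup *cgrp, struct cftype *cft)
        {
                return test_bit(CGRP_RELEASABLE, &cgrp->flags);
        }

        static struct cftype debug_files[] = {
                {
                        .name = "releasable",
                        .read_u64 = releasable_read_u64,
                },
        };

        /* Called by the framework when a cgroup directory is populated. */
        static int debug_populate(struct cgroup_subsys *ss, struct cgroup *cgrp)
        {
                return cgroup_add_files(cgrp, ss, debug_files,
                                        ARRAY_SIZE(debug_files));
        }

With the debug subsystem mounted on a hierarchy (e.g. mount -t cgroup -o debug), "releasable" then appears only there, rather than on every hierarchy.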

include/linux/cgroup.h
#ifndef _LINUX_CGROUP_H
#define _LINUX_CGROUP_H
/*
 * cgroup interface
 *
 * Copyright (C) 2003 BULL SA
 * Copyright (C) 2004-2006 Silicon Graphics, Inc.
 *
 */

#include <linux/sched.h>
#include <linux/kref.h>
#include <linux/cpumask.h>
#include <linux/nodemask.h>
#include <linux/rcupdate.h>
#include <linux/cgroupstats.h>
#include <linux/prio_heap.h>

#ifdef CONFIG_CGROUPS

struct cgroupfs_root;
struct cgroup_subsys;
struct inode;

extern int cgroup_init_early(void);
extern int cgroup_init(void);
extern void cgroup_init_smp(void);
extern void cgroup_lock(void);
extern void cgroup_unlock(void);
extern void cgroup_fork(struct task_struct *p);
extern void cgroup_fork_callbacks(struct task_struct *p);
extern void cgroup_post_fork(struct task_struct *p);
extern void cgroup_exit(struct task_struct *p, int run_callbacks);
extern int cgroupstats_build(struct cgroupstats *stats,
                             struct dentry *dentry);

extern struct file_operations proc_cgroup_operations;

/* Define the enumeration of all cgroup subsystems */
#define SUBSYS(_x) _x ## _subsys_id,
enum cgroup_subsys_id {
#include <linux/cgroup_subsys.h>
        CGROUP_SUBSYS_COUNT
};
#undef SUBSYS

/* Per-subsystem/per-cgroup state maintained by the system. */
struct cgroup_subsys_state {
        /* The cgroup that this subsystem is attached to. Useful
         * for subsystems that want to know about the cgroup
         * hierarchy structure */
        struct cgroup *cgroup;

        /* State maintained by the cgroup system to allow
         * subsystems to be "busy". Should be accessed via css_get()
         * and css_put() */

        atomic_t refcnt;

        unsigned long flags;
};

/* bits in struct cgroup_subsys_state flags field */
enum {
        CSS_ROOT, /* This CSS is the root of the subsystem */
};

/*
 * Call css_get() to hold a reference on the cgroup;
 *
 */

static inline void css_get(struct cgroup_subsys_state *css)
{
        /* We don't need to reference count the root state */
        if (!test_bit(CSS_ROOT, &css->flags))
                atomic_inc(&css->refcnt);
}
/*
 * css_put() should be called to release a reference taken by
 * css_get()
 */

extern void __css_put(struct cgroup_subsys_state *css);
static inline void css_put(struct cgroup_subsys_state *css)
{
        if (!test_bit(CSS_ROOT, &css->flags))
                __css_put(css);
}

+/* bits in struct cgroup flags field */
+enum {
+        /* Control Group is dead */
+        CGRP_REMOVED,
+        /* Control Group has previously had a child cgroup or a task,
+         * but no longer (only if CGRP_NOTIFY_ON_RELEASE is set) */
+        CGRP_RELEASABLE,
+        /* Control Group requires release notifications to userspace */
+        CGRP_NOTIFY_ON_RELEASE,
+};
+
struct cgroup {
        unsigned long flags;            /* "unsigned long" so bitops work */

        /* count users of this cgroup. >0 means busy, but doesn't
         * necessarily indicate the number of tasks in the
         * cgroup */
        atomic_t count;

        /*
         * We link our 'sibling' struct into our parent's 'children'.
         * Our children link their 'sibling' into our 'children'.
         */
        struct list_head sibling;       /* my parent's children */
        struct list_head children;      /* my children */

        struct cgroup *parent;          /* my parent */
        struct dentry *dentry;          /* cgroup fs entry */

        /* Private pointers for each registered subsystem */
        struct cgroup_subsys_state *subsys[CGROUP_SUBSYS_COUNT];

        struct cgroupfs_root *root;
        struct cgroup *top_cgroup;

        /*
         * List of cg_cgroup_links pointing at css_sets with
         * tasks in this cgroup. Protected by css_set_lock
         */
        struct list_head css_sets;

        /*
         * Linked list running through all cgroups that can
         * potentially be reaped by the release agent. Protected by
         * release_list_lock
         */
        struct list_head release_list;
};

/* A css_set is a structure holding pointers to a set of
 * cgroup_subsys_state objects. This saves space in the task struct
 * object and speeds up fork()/exit(), since a single inc/dec and a
 * list_add()/del() can bump the reference count on the entire
 * cgroup set for a task.
 */

struct css_set {

        /* Reference count */
        struct kref ref;

        /*
         * List running through all cgroup groups. Protected by
         * css_set_lock
         */
        struct list_head list;

        /*
         * List running through all tasks using this cgroup
         * group. Protected by css_set_lock
         */
        struct list_head tasks;

        /*
         * List of cg_cgroup_link objects on link chains from
         * cgroups referenced from this css_set. Protected by
         * css_set_lock
         */
        struct list_head cg_links;

        /*
         * Set of subsystem states, one for each subsystem. This array
         * is immutable after creation apart from the init_css_set
         * during subsystem registration (at boot time).
         */
        struct cgroup_subsys_state *subsys[CGROUP_SUBSYS_COUNT];

};

/*
 * cgroup_map_cb is an abstract callback API for reporting map-valued
 * control files
 */

struct cgroup_map_cb {
        int (*fill)(struct cgroup_map_cb *cb, const char *key, u64 value);
        void *state;
};

/* struct cftype:
 *
 * The files in the cgroup filesystem mostly have a very simple read/write
 * handling, some common function will take care of it. Nevertheless some cases
 * (read tasks) are special and therefore I define this structure for every
 * kind of file.
 *
 *
 * When reading/writing to a file:
 *      - the cgroup to use is file->f_dentry->d_parent->d_fsdata
 *      - the 'cftype' of the file is file->f_dentry->d_fsdata
 */

#define MAX_CFTYPE_NAME 64
struct cftype {
        /* By convention, the name should begin with the name of the
         * subsystem, followed by a period */
        char name[MAX_CFTYPE_NAME];
        int private;
        int (*open) (struct inode *inode, struct file *file);
        ssize_t (*read) (struct cgroup *cgrp, struct cftype *cft,
                         struct file *file,
                         char __user *buf, size_t nbytes, loff_t *ppos);
        /*
         * read_u64() is a shortcut for the common case of returning a
         * single integer. Use it in place of read()
         */
        u64 (*read_u64) (struct cgroup *cgrp, struct cftype *cft);
        /*
         * read_map() is used for defining a map of key/value
         * pairs. It should call cb->fill(cb, key, value) for each
         * entry. The key/value pairs (and their ordering) should not
         * change between reboots.
         */
        int (*read_map) (struct cgroup *cont, struct cftype *cft,
                         struct cgroup_map_cb *cb);

        ssize_t (*write) (struct cgroup *cgrp, struct cftype *cft,
                          struct file *file,
                          const char __user *buf, size_t nbytes, loff_t *ppos);

        /*
         * write_u64() is a shortcut for the common case of accepting
         * a single integer (as parsed by simple_strtoull) from
         * userspace. Use in place of write(); return 0 or error.
         */
        int (*write_u64) (struct cgroup *cgrp, struct cftype *cft, u64 val);

        int (*release) (struct inode *inode, struct file *file);
};

struct cgroup_scanner {
        struct cgroup *cg;
        int (*test_task)(struct task_struct *p, struct cgroup_scanner *scan);
        void (*process_task)(struct task_struct *p,
                        struct cgroup_scanner *scan);
        struct ptr_heap *heap;
};

/* Add a new file to the given cgroup directory. Should only be
 * called by subsystems from within a populate() method */
int cgroup_add_file(struct cgroup *cgrp, struct cgroup_subsys *subsys,
                    const struct cftype *cft);

/* Add a set of new files to the given cgroup directory. Should
 * only be called by subsystems from within a populate() method */
int cgroup_add_files(struct cgroup *cgrp,
                     struct cgroup_subsys *subsys,
                     const struct cftype cft[],
                     int count);

int cgroup_is_removed(const struct cgroup *cgrp);

int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen);

int cgroup_task_count(const struct cgroup *cgrp);

/* Return true if the cgroup is a descendant of the current cgroup */
int cgroup_is_descendant(const struct cgroup *cgrp);

/* Control Group subsystem type. See Documentation/cgroups.txt for details */

struct cgroup_subsys {
        struct cgroup_subsys_state *(*create)(struct cgroup_subsys *ss,
                                              struct cgroup *cgrp);
        void (*pre_destroy)(struct cgroup_subsys *ss, struct cgroup *cgrp);
        void (*destroy)(struct cgroup_subsys *ss, struct cgroup *cgrp);
        int (*can_attach)(struct cgroup_subsys *ss,
                          struct cgroup *cgrp, struct task_struct *tsk);
        void (*attach)(struct cgroup_subsys *ss, struct cgroup *cgrp,
                       struct cgroup *old_cgrp, struct task_struct *tsk);
        void (*fork)(struct cgroup_subsys *ss, struct task_struct *task);
        void (*exit)(struct cgroup_subsys *ss, struct task_struct *task);
        int (*populate)(struct cgroup_subsys *ss,
                        struct cgroup *cgrp);
        void (*post_clone)(struct cgroup_subsys *ss, struct cgroup *cgrp);
        void (*bind)(struct cgroup_subsys *ss, struct cgroup *root);
        int subsys_id;
        int active;
        int disabled;
        int early_init;
#define MAX_CGROUP_TYPE_NAMELEN 32
        const char *name;

        /* Protected by RCU */
        struct cgroupfs_root *root;

        struct list_head sibling;

        void *private;
};

#define SUBSYS(_x) extern struct cgroup_subsys _x ## _subsys;
#include <linux/cgroup_subsys.h>
#undef SUBSYS

static inline struct cgroup_subsys_state *cgroup_subsys_state(
        struct cgroup *cgrp, int subsys_id)
{
        return cgrp->subsys[subsys_id];
}

static inline struct cgroup_subsys_state *task_subsys_state(
        struct task_struct *task, int subsys_id)
{
        return rcu_dereference(task->cgroups->subsys[subsys_id]);
}

static inline struct cgroup* task_cgroup(struct task_struct *task,
                int subsys_id)
{
        return task_subsys_state(task, subsys_id)->cgroup;
}

int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *ss);

/* A cgroup_iter should be treated as an opaque object */
struct cgroup_iter {
        struct list_head *cg_link;
        struct list_head *task;
};

/* To iterate across the tasks in a cgroup:
 *
 * 1) call cgroup_iter_start to intialize an iterator
 *
 * 2) call cgroup_iter_next() to retrieve member tasks until it
 *    returns NULL or until you want to end the iteration
 *
 * 3) call cgroup_iter_end() to destroy the iterator.
 *
 * Or, call cgroup_scan_tasks() to iterate through every task in a cpuset.
 *    - cgroup_scan_tasks() holds the css_set_lock when calling the test_task()
 *      callback, but not while calling the process_task() callback.
 */
void cgroup_iter_start(struct cgroup *cgrp, struct cgroup_iter *it);
struct task_struct *cgroup_iter_next(struct cgroup *cgrp,
                                     struct cgroup_iter *it);
void cgroup_iter_end(struct cgroup *cgrp, struct cgroup_iter *it);
int cgroup_scan_tasks(struct cgroup_scanner *scan);
int cgroup_attach_task(struct cgroup *, struct task_struct *);

#else /* !CONFIG_CGROUPS */

static inline int cgroup_init_early(void) { return 0; }
static inline int cgroup_init(void) { return 0; }
static inline void cgroup_init_smp(void) {}
static inline void cgroup_fork(struct task_struct *p) {}
static inline void cgroup_fork_callbacks(struct task_struct *p) {}
static inline void cgroup_post_fork(struct task_struct *p) {}
static inline void cgroup_exit(struct task_struct *p, int callbacks) {}

static inline void cgroup_lock(void) {}
static inline void cgroup_unlock(void) {}
static inline int cgroupstats_build(struct cgroupstats *stats,
                                    struct dentry *dentry)
{
        return -EINVAL;
}

#endif /* !CONFIG_CGROUPS */

#endif /* _LINUX_CGROUP_H */

1 /* 1 /*
2 * Generic process-grouping system. 2 * Generic process-grouping system.
3 * 3 *
4 * Based originally on the cpuset system, extracted by Paul Menage 4 * Based originally on the cpuset system, extracted by Paul Menage
5 * Copyright (C) 2006 Google, Inc 5 * Copyright (C) 2006 Google, Inc
6 * 6 *
7 * Copyright notices from the original cpuset code: 7 * Copyright notices from the original cpuset code:
8 * -------------------------------------------------- 8 * --------------------------------------------------
9 * Copyright (C) 2003 BULL SA. 9 * Copyright (C) 2003 BULL SA.
10 * Copyright (C) 2004-2006 Silicon Graphics, Inc. 10 * Copyright (C) 2004-2006 Silicon Graphics, Inc.
11 * 11 *
12 * Portions derived from Patrick Mochel's sysfs code. 12 * Portions derived from Patrick Mochel's sysfs code.
13 * sysfs is Copyright (c) 2001-3 Patrick Mochel 13 * sysfs is Copyright (c) 2001-3 Patrick Mochel
14 * 14 *
15 * 2003-10-10 Written by Simon Derr. 15 * 2003-10-10 Written by Simon Derr.
16 * 2003-10-22 Updates by Stephen Hemminger. 16 * 2003-10-22 Updates by Stephen Hemminger.
17 * 2004 May-July Rework by Paul Jackson. 17 * 2004 May-July Rework by Paul Jackson.
18 * --------------------------------------------------- 18 * ---------------------------------------------------
19 * 19 *
20 * This file is subject to the terms and conditions of the GNU General Public 20 * This file is subject to the terms and conditions of the GNU General Public
21 * License. See the file COPYING in the main directory of the Linux 21 * License. See the file COPYING in the main directory of the Linux
22 * distribution for more details. 22 * distribution for more details.
23 */ 23 */
24 24
25 #include <linux/cgroup.h> 25 #include <linux/cgroup.h>
26 #include <linux/errno.h> 26 #include <linux/errno.h>
27 #include <linux/fs.h> 27 #include <linux/fs.h>
28 #include <linux/kernel.h> 28 #include <linux/kernel.h>
29 #include <linux/list.h> 29 #include <linux/list.h>
30 #include <linux/mm.h> 30 #include <linux/mm.h>
31 #include <linux/mutex.h> 31 #include <linux/mutex.h>
32 #include <linux/mount.h> 32 #include <linux/mount.h>
33 #include <linux/pagemap.h> 33 #include <linux/pagemap.h>
34 #include <linux/proc_fs.h> 34 #include <linux/proc_fs.h>
35 #include <linux/rcupdate.h> 35 #include <linux/rcupdate.h>
36 #include <linux/sched.h> 36 #include <linux/sched.h>
37 #include <linux/backing-dev.h> 37 #include <linux/backing-dev.h>
38 #include <linux/seq_file.h> 38 #include <linux/seq_file.h>
39 #include <linux/slab.h> 39 #include <linux/slab.h>
40 #include <linux/magic.h> 40 #include <linux/magic.h>
41 #include <linux/spinlock.h> 41 #include <linux/spinlock.h>
42 #include <linux/string.h> 42 #include <linux/string.h>
43 #include <linux/sort.h> 43 #include <linux/sort.h>
44 #include <linux/kmod.h> 44 #include <linux/kmod.h>
45 #include <linux/delayacct.h> 45 #include <linux/delayacct.h>
46 #include <linux/cgroupstats.h> 46 #include <linux/cgroupstats.h>
47 47
48 #include <asm/atomic.h> 48 #include <asm/atomic.h>
49 49
50 static DEFINE_MUTEX(cgroup_mutex); 50 static DEFINE_MUTEX(cgroup_mutex);
51 51
52 /* Generate an array of cgroup subsystem pointers */ 52 /* Generate an array of cgroup subsystem pointers */
53 #define SUBSYS(_x) &_x ## _subsys, 53 #define SUBSYS(_x) &_x ## _subsys,
54 54
55 static struct cgroup_subsys *subsys[] = { 55 static struct cgroup_subsys *subsys[] = {
56 #include <linux/cgroup_subsys.h> 56 #include <linux/cgroup_subsys.h>
57 }; 57 };
58 58
59 /* 59 /*
60 * A cgroupfs_root represents the root of a cgroup hierarchy, 60 * A cgroupfs_root represents the root of a cgroup hierarchy,
61 * and may be associated with a superblock to form an active 61 * and may be associated with a superblock to form an active
62 * hierarchy 62 * hierarchy
63 */ 63 */
64 struct cgroupfs_root { 64 struct cgroupfs_root {
65 struct super_block *sb; 65 struct super_block *sb;
66 66
67 /* 67 /*
68 * The bitmask of subsystems intended to be attached to this 68 * The bitmask of subsystems intended to be attached to this
69 * hierarchy 69 * hierarchy
70 */ 70 */
71 unsigned long subsys_bits; 71 unsigned long subsys_bits;
72 72
73 /* The bitmask of subsystems currently attached to this hierarchy */ 73 /* The bitmask of subsystems currently attached to this hierarchy */
74 unsigned long actual_subsys_bits; 74 unsigned long actual_subsys_bits;
75 75
76 /* A list running through the attached subsystems */ 76 /* A list running through the attached subsystems */
77 struct list_head subsys_list; 77 struct list_head subsys_list;
78 78
79 /* The root cgroup for this hierarchy */ 79 /* The root cgroup for this hierarchy */
80 struct cgroup top_cgroup; 80 struct cgroup top_cgroup;
81 81
82 /* Tracks how many cgroups are currently defined in hierarchy.*/ 82 /* Tracks how many cgroups are currently defined in hierarchy.*/
83 int number_of_cgroups; 83 int number_of_cgroups;
84 84
85 /* A list running through the mounted hierarchies */ 85 /* A list running through the mounted hierarchies */
86 struct list_head root_list; 86 struct list_head root_list;
87 87
88 /* Hierarchy-specific flags */ 88 /* Hierarchy-specific flags */
89 unsigned long flags; 89 unsigned long flags;
90 90
91 /* The path to use for release notifications. No locking 91 /* The path to use for release notifications. No locking
92 * between setting and use - so if userspace updates this 92 * between setting and use - so if userspace updates this
93 * while child cgroups exist, you could miss a 93 * while child cgroups exist, you could miss a
94 * notification. We ensure that it's always a valid 94 * notification. We ensure that it's always a valid
95 * NUL-terminated string */ 95 * NUL-terminated string */
96 char release_agent_path[PATH_MAX]; 96 char release_agent_path[PATH_MAX];
97 }; 97 };
98 98
99 99
100 /* 100 /*
101 * The "rootnode" hierarchy is the "dummy hierarchy", reserved for the 101 * The "rootnode" hierarchy is the "dummy hierarchy", reserved for the
102 * subsystems that are otherwise unattached - it never has more than a 102 * subsystems that are otherwise unattached - it never has more than a
103 * single cgroup, and all tasks are part of that cgroup. 103 * single cgroup, and all tasks are part of that cgroup.
104 */ 104 */
105 static struct cgroupfs_root rootnode; 105 static struct cgroupfs_root rootnode;
106 106
107 /* The list of hierarchy roots */ 107 /* The list of hierarchy roots */
108 108
109 static LIST_HEAD(roots); 109 static LIST_HEAD(roots);
110 static int root_count; 110 static int root_count;
111 111
112 /* dummytop is a shorthand for the dummy hierarchy's top cgroup */ 112 /* dummytop is a shorthand for the dummy hierarchy's top cgroup */
113 #define dummytop (&rootnode.top_cgroup) 113 #define dummytop (&rootnode.top_cgroup)
114 114
115 /* This flag indicates whether tasks in the fork and exit paths should 115 /* This flag indicates whether tasks in the fork and exit paths should
116 * check for fork/exit handlers to call. This avoids us having to do 116 * check for fork/exit handlers to call. This avoids us having to do
117 * extra work in the fork/exit path if none of the subsystems need to 117 * extra work in the fork/exit path if none of the subsystems need to
118 * be called. 118 * be called.
119 */ 119 */
120 static int need_forkexit_callback; 120 static int need_forkexit_callback;
121 121
122 /* bits in struct cgroup flags field */
123 enum {
124 /* Control Group is dead */
125 CGRP_REMOVED,
126 /* Control Group has previously had a child cgroup or a task,
127 * but no longer (only if CGRP_NOTIFY_ON_RELEASE is set) */
128 CGRP_RELEASABLE,
129 /* Control Group requires release notifications to userspace */
130 CGRP_NOTIFY_ON_RELEASE,
131 };
132
133 /* convenient tests for these bits */ 122 /* convenient tests for these bits */
134 inline int cgroup_is_removed(const struct cgroup *cgrp) 123 inline int cgroup_is_removed(const struct cgroup *cgrp)
135 { 124 {
136 return test_bit(CGRP_REMOVED, &cgrp->flags); 125 return test_bit(CGRP_REMOVED, &cgrp->flags);
137 } 126 }
138 127
139 /* bits in struct cgroupfs_root flags field */ 128 /* bits in struct cgroupfs_root flags field */
140 enum { 129 enum {
141 ROOT_NOPREFIX, /* mounted subsystems have no named prefix */ 130 ROOT_NOPREFIX, /* mounted subsystems have no named prefix */
142 }; 131 };
143 132
144 static int cgroup_is_releasable(const struct cgroup *cgrp) 133 static int cgroup_is_releasable(const struct cgroup *cgrp)
145 { 134 {
146 const int bits = 135 const int bits =
147 (1 << CGRP_RELEASABLE) | 136 (1 << CGRP_RELEASABLE) |
148 (1 << CGRP_NOTIFY_ON_RELEASE); 137 (1 << CGRP_NOTIFY_ON_RELEASE);
149 return (cgrp->flags & bits) == bits; 138 return (cgrp->flags & bits) == bits;
150 } 139 }
151 140
152 static int notify_on_release(const struct cgroup *cgrp) 141 static int notify_on_release(const struct cgroup *cgrp)
153 { 142 {
154 return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); 143 return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
155 } 144 }
156 145
157 /* 146 /*
158 * for_each_subsys() allows you to iterate on each subsystem attached to 147 * for_each_subsys() allows you to iterate on each subsystem attached to
159 * an active hierarchy 148 * an active hierarchy
160 */ 149 */
161 #define for_each_subsys(_root, _ss) \ 150 #define for_each_subsys(_root, _ss) \
162 list_for_each_entry(_ss, &_root->subsys_list, sibling) 151 list_for_each_entry(_ss, &_root->subsys_list, sibling)
163 152
164 /* for_each_root() allows you to iterate across the active hierarchies */ 153 /* for_each_root() allows you to iterate across the active hierarchies */
165 #define for_each_root(_root) \ 154 #define for_each_root(_root) \
166 list_for_each_entry(_root, &roots, root_list) 155 list_for_each_entry(_root, &roots, root_list)
167 156
168 /* the list of cgroups eligible for automatic release. Protected by 157 /* the list of cgroups eligible for automatic release. Protected by
169 * release_list_lock */ 158 * release_list_lock */
170 static LIST_HEAD(release_list); 159 static LIST_HEAD(release_list);
171 static DEFINE_SPINLOCK(release_list_lock); 160 static DEFINE_SPINLOCK(release_list_lock);
172 static void cgroup_release_agent(struct work_struct *work); 161 static void cgroup_release_agent(struct work_struct *work);
173 static DECLARE_WORK(release_agent_work, cgroup_release_agent); 162 static DECLARE_WORK(release_agent_work, cgroup_release_agent);
174 static void check_for_release(struct cgroup *cgrp); 163 static void check_for_release(struct cgroup *cgrp);
175 164
176 /* Link structure for associating css_set objects with cgroups */ 165 /* Link structure for associating css_set objects with cgroups */
177 struct cg_cgroup_link { 166 struct cg_cgroup_link {
178 /* 167 /*
179 * List running through cg_cgroup_links associated with a 168 * List running through cg_cgroup_links associated with a
180 * cgroup, anchored on cgroup->css_sets 169 * cgroup, anchored on cgroup->css_sets
181 */ 170 */
182 struct list_head cgrp_link_list; 171 struct list_head cgrp_link_list;
183 /* 172 /*
184 * List running through cg_cgroup_links pointing at a 173 * List running through cg_cgroup_links pointing at a
185 * single css_set object, anchored on css_set->cg_links 174 * single css_set object, anchored on css_set->cg_links
186 */ 175 */
187 struct list_head cg_link_list; 176 struct list_head cg_link_list;
188 struct css_set *cg; 177 struct css_set *cg;
189 }; 178 };
190 179
191 /* The default css_set - used by init and its children prior to any 180 /* The default css_set - used by init and its children prior to any
192 * hierarchies being mounted. It contains a pointer to the root state 181 * hierarchies being mounted. It contains a pointer to the root state
193 * for each subsystem. Also used to anchor the list of css_sets. Not 182 * for each subsystem. Also used to anchor the list of css_sets. Not
194 * reference-counted, to improve performance when child cgroups 183 * reference-counted, to improve performance when child cgroups
195 * haven't been created. 184 * haven't been created.
196 */ 185 */
197 186
198 static struct css_set init_css_set; 187 static struct css_set init_css_set;
199 static struct cg_cgroup_link init_css_set_link; 188 static struct cg_cgroup_link init_css_set_link;
200 189
201 /* css_set_lock protects the list of css_set objects, and the 190 /* css_set_lock protects the list of css_set objects, and the
202 * chain of tasks off each css_set. Nests outside task->alloc_lock 191 * chain of tasks off each css_set. Nests outside task->alloc_lock
203 * due to cgroup_iter_start() */ 192 * due to cgroup_iter_start() */
204 static DEFINE_RWLOCK(css_set_lock); 193 static DEFINE_RWLOCK(css_set_lock);
205 static int css_set_count; 194 static int css_set_count;
206 195
207 /* We don't maintain the lists running through each css_set to its 196 /* We don't maintain the lists running through each css_set to its
208 * task until after the first call to cgroup_iter_start(). This 197 * task until after the first call to cgroup_iter_start(). This
209 * reduces the fork()/exit() overhead for people who have cgroups 198 * reduces the fork()/exit() overhead for people who have cgroups
210 * compiled into their kernel but not actually in use */ 199 * compiled into their kernel but not actually in use */
211 static int use_task_css_set_links; 200 static int use_task_css_set_links;
212 201
213 /* When we create or destroy a css_set, the operation simply 202 /* When we create or destroy a css_set, the operation simply
214 * takes/releases a reference count on all the cgroups referenced 203 * takes/releases a reference count on all the cgroups referenced
215 * by subsystems in this css_set. This can end up multiple-counting 204 * by subsystems in this css_set. This can end up multiple-counting
216 * some cgroups, but that's OK - the ref-count is just a 205 * some cgroups, but that's OK - the ref-count is just a
217 * busy/not-busy indicator; ensuring that we only count each cgroup 206 * busy/not-busy indicator; ensuring that we only count each cgroup
218 * once would require taking a global lock to ensure that no 207 * once would require taking a global lock to ensure that no
219 * subsystems moved between hierarchies while we were doing so. 208 * subsystems moved between hierarchies while we were doing so.
220 * 209 *
221 * Possible TODO: decide at boot time based on the number of 210 * Possible TODO: decide at boot time based on the number of
222 * registered subsystems and the number of CPUs or NUMA nodes whether 211 * registered subsystems and the number of CPUs or NUMA nodes whether
223 * it's better for performance to ref-count every subsystem, or to 212 * it's better for performance to ref-count every subsystem, or to
224 * take a global lock and only add one ref count to each hierarchy. 213 * take a global lock and only add one ref count to each hierarchy.
225 */ 214 */
226 215
227 /* 216 /*
228 * unlink a css_set from the list and free it 217 * unlink a css_set from the list and free it
229 */ 218 */
230 static void unlink_css_set(struct css_set *cg) 219 static void unlink_css_set(struct css_set *cg)
231 { 220 {
232 write_lock(&css_set_lock); 221 write_lock(&css_set_lock);
233 list_del(&cg->list); 222 list_del(&cg->list);
234 css_set_count--; 223 css_set_count--;
235 while (!list_empty(&cg->cg_links)) { 224 while (!list_empty(&cg->cg_links)) {
236 struct cg_cgroup_link *link; 225 struct cg_cgroup_link *link;
237 link = list_entry(cg->cg_links.next, 226 link = list_entry(cg->cg_links.next,
238 struct cg_cgroup_link, cg_link_list); 227 struct cg_cgroup_link, cg_link_list);
239 list_del(&link->cg_link_list); 228 list_del(&link->cg_link_list);
240 list_del(&link->cgrp_link_list); 229 list_del(&link->cgrp_link_list);
241 kfree(link); 230 kfree(link);
242 } 231 }
243 write_unlock(&css_set_lock); 232 write_unlock(&css_set_lock);
244 } 233 }
245 234
246 static void __release_css_set(struct kref *k, int taskexit) 235 static void __release_css_set(struct kref *k, int taskexit)
247 { 236 {
248 int i; 237 int i;
249 struct css_set *cg = container_of(k, struct css_set, ref); 238 struct css_set *cg = container_of(k, struct css_set, ref);
250 239
251 unlink_css_set(cg); 240 unlink_css_set(cg);
252 241
253 rcu_read_lock(); 242 rcu_read_lock();
254 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 243 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
255 struct cgroup *cgrp = cg->subsys[i]->cgroup; 244 struct cgroup *cgrp = cg->subsys[i]->cgroup;
256 if (atomic_dec_and_test(&cgrp->count) && 245 if (atomic_dec_and_test(&cgrp->count) &&
257 notify_on_release(cgrp)) { 246 notify_on_release(cgrp)) {
258 if (taskexit) 247 if (taskexit)
259 set_bit(CGRP_RELEASABLE, &cgrp->flags); 248 set_bit(CGRP_RELEASABLE, &cgrp->flags);
260 check_for_release(cgrp); 249 check_for_release(cgrp);
261 } 250 }
262 } 251 }
263 rcu_read_unlock(); 252 rcu_read_unlock();
264 kfree(cg); 253 kfree(cg);
265 } 254 }
266 255
267 static void release_css_set(struct kref *k) 256 static void release_css_set(struct kref *k)
268 { 257 {
269 __release_css_set(k, 0); 258 __release_css_set(k, 0);
270 } 259 }
271 260
272 static void release_css_set_taskexit(struct kref *k) 261 static void release_css_set_taskexit(struct kref *k)
273 { 262 {
274 __release_css_set(k, 1); 263 __release_css_set(k, 1);
275 } 264 }
276 265
277 /* 266 /*
278 * refcounted get/put for css_set objects 267 * refcounted get/put for css_set objects
279 */ 268 */
280 static inline void get_css_set(struct css_set *cg) 269 static inline void get_css_set(struct css_set *cg)
281 { 270 {
282 kref_get(&cg->ref); 271 kref_get(&cg->ref);
283 } 272 }
284 273
285 static inline void put_css_set(struct css_set *cg) 274 static inline void put_css_set(struct css_set *cg)
286 { 275 {
287 kref_put(&cg->ref, release_css_set); 276 kref_put(&cg->ref, release_css_set);
288 } 277 }
289 278
290 static inline void put_css_set_taskexit(struct css_set *cg) 279 static inline void put_css_set_taskexit(struct css_set *cg)
291 { 280 {
292 kref_put(&cg->ref, release_css_set_taskexit); 281 kref_put(&cg->ref, release_css_set_taskexit);
293 } 282 }
294 283
295 /* 284 /*
296 * find_existing_css_set() is a helper for 285 * find_existing_css_set() is a helper for
297 * find_css_set(), and checks to see whether an existing 286 * find_css_set(), and checks to see whether an existing
298 * css_set is suitable. This currently walks a linked-list for 287 * css_set is suitable. This currently walks a linked-list for
299 * simplicity; a later patch will use a hash table for better 288 * simplicity; a later patch will use a hash table for better
300 * performance 289 * performance
301 * 290 *
302 * oldcg: the cgroup group that we're using before the cgroup 291 * oldcg: the cgroup group that we're using before the cgroup
303 * transition 292 * transition
304 * 293 *
305 * cgrp: the cgroup that we're moving into 294 * cgrp: the cgroup that we're moving into
306 * 295 *
307 * template: location in which to build the desired set of subsystem 296 * template: location in which to build the desired set of subsystem
308 * state objects for the new cgroup group 297 * state objects for the new cgroup group
309 */ 298 */
310 static struct css_set *find_existing_css_set( 299 static struct css_set *find_existing_css_set(
311 struct css_set *oldcg, 300 struct css_set *oldcg,
312 struct cgroup *cgrp, 301 struct cgroup *cgrp,
313 struct cgroup_subsys_state *template[]) 302 struct cgroup_subsys_state *template[])
314 { 303 {
315 int i; 304 int i;
316 struct cgroupfs_root *root = cgrp->root; 305 struct cgroupfs_root *root = cgrp->root;
317 struct list_head *l = &init_css_set.list; 306 struct list_head *l = &init_css_set.list;
318 307
319 /* Built the set of subsystem state objects that we want to 308 /* Built the set of subsystem state objects that we want to
320 * see in the new css_set */ 309 * see in the new css_set */
321 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 310 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
322 if (root->subsys_bits & (1UL << i)) { 311 if (root->subsys_bits & (1UL << i)) {
323 /* Subsystem is in this hierarchy. So we want 312 /* Subsystem is in this hierarchy. So we want
324 * the subsystem state from the new 313 * the subsystem state from the new
325 * cgroup */ 314 * cgroup */
326 template[i] = cgrp->subsys[i]; 315 template[i] = cgrp->subsys[i];
327 } else { 316 } else {
328 /* Subsystem is not in this hierarchy, so we 317 /* Subsystem is not in this hierarchy, so we
329 * don't want to change the subsystem state */ 318 * don't want to change the subsystem state */
330 template[i] = oldcg->subsys[i]; 319 template[i] = oldcg->subsys[i];
331 } 320 }
332 } 321 }
333 322
334 /* Look through existing cgroup groups to find one to reuse */ 323 /* Look through existing cgroup groups to find one to reuse */
335 do { 324 do {
336 struct css_set *cg = 325 struct css_set *cg =
337 list_entry(l, struct css_set, list); 326 list_entry(l, struct css_set, list);
338 327
339 if (!memcmp(template, cg->subsys, sizeof(cg->subsys))) { 328 if (!memcmp(template, cg->subsys, sizeof(cg->subsys))) {
340 /* All subsystems matched */ 329 /* All subsystems matched */
341 return cg; 330 return cg;
342 } 331 }
343 /* Try the next cgroup group */ 332 /* Try the next cgroup group */
344 l = l->next; 333 l = l->next;
345 } while (l != &init_css_set.list); 334 } while (l != &init_css_set.list);
346 335
347 /* No existing cgroup group matched */ 336 /* No existing cgroup group matched */
348 return NULL; 337 return NULL;
349 } 338 }
350 339
351 /* 340 /*
352 * allocate_cg_links() allocates "count" cg_cgroup_link structures 341 * allocate_cg_links() allocates "count" cg_cgroup_link structures
353 * and chains them on tmp through their cgrp_link_list fields. Returns 0 on 342 * and chains them on tmp through their cgrp_link_list fields. Returns 0 on
354 * success or a negative error 343 * success or a negative error
355 */ 344 */
356 static int allocate_cg_links(int count, struct list_head *tmp) 345 static int allocate_cg_links(int count, struct list_head *tmp)
357 { 346 {
358 struct cg_cgroup_link *link; 347 struct cg_cgroup_link *link;
359 int i; 348 int i;
360 INIT_LIST_HEAD(tmp); 349 INIT_LIST_HEAD(tmp);
361 for (i = 0; i < count; i++) { 350 for (i = 0; i < count; i++) {
362 link = kmalloc(sizeof(*link), GFP_KERNEL); 351 link = kmalloc(sizeof(*link), GFP_KERNEL);
363 if (!link) { 352 if (!link) {
364 while (!list_empty(tmp)) { 353 while (!list_empty(tmp)) {
365 link = list_entry(tmp->next, 354 link = list_entry(tmp->next,
366 struct cg_cgroup_link, 355 struct cg_cgroup_link,
367 cgrp_link_list); 356 cgrp_link_list);
368 list_del(&link->cgrp_link_list); 357 list_del(&link->cgrp_link_list);
369 kfree(link); 358 kfree(link);
370 } 359 }
371 return -ENOMEM; 360 return -ENOMEM;
372 } 361 }
373 list_add(&link->cgrp_link_list, tmp); 362 list_add(&link->cgrp_link_list, tmp);
374 } 363 }
375 return 0; 364 return 0;
376 } 365 }
377 366
378 static void free_cg_links(struct list_head *tmp) 367 static void free_cg_links(struct list_head *tmp)
379 { 368 {
380 while (!list_empty(tmp)) { 369 while (!list_empty(tmp)) {
381 struct cg_cgroup_link *link; 370 struct cg_cgroup_link *link;
382 link = list_entry(tmp->next, 371 link = list_entry(tmp->next,
383 struct cg_cgroup_link, 372 struct cg_cgroup_link,
384 cgrp_link_list); 373 cgrp_link_list);
385 list_del(&link->cgrp_link_list); 374 list_del(&link->cgrp_link_list);
386 kfree(link); 375 kfree(link);
387 } 376 }
388 } 377 }
389 378
390 /* 379 /*
391 * find_css_set() takes an existing cgroup group and a 380 * find_css_set() takes an existing cgroup group and a
392 * cgroup object, and returns a css_set object that's 381 * cgroup object, and returns a css_set object that's
393 * equivalent to the old group, but with the given cgroup 382 * equivalent to the old group, but with the given cgroup
394 * substituted into the appropriate hierarchy. Must be called with 383 * substituted into the appropriate hierarchy. Must be called with
395 * cgroup_mutex held 384 * cgroup_mutex held
396 */ 385 */
397 static struct css_set *find_css_set( 386 static struct css_set *find_css_set(
398 struct css_set *oldcg, struct cgroup *cgrp) 387 struct css_set *oldcg, struct cgroup *cgrp)
399 { 388 {
400 struct css_set *res; 389 struct css_set *res;
401 struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT]; 390 struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT];
402 int i; 391 int i;
403 392
404 struct list_head tmp_cg_links; 393 struct list_head tmp_cg_links;
405 struct cg_cgroup_link *link; 394 struct cg_cgroup_link *link;
406 395
407 /* First see if we already have a cgroup group that matches 396 /* First see if we already have a cgroup group that matches
408 * the desired set */ 397 * the desired set */
409 write_lock(&css_set_lock); 398 write_lock(&css_set_lock);
410 res = find_existing_css_set(oldcg, cgrp, template); 399 res = find_existing_css_set(oldcg, cgrp, template);
411 if (res) 400 if (res)
412 get_css_set(res); 401 get_css_set(res);
413 write_unlock(&css_set_lock); 402 write_unlock(&css_set_lock);
414 403
415 if (res) 404 if (res)
416 return res; 405 return res;
417 406
418 res = kmalloc(sizeof(*res), GFP_KERNEL); 407 res = kmalloc(sizeof(*res), GFP_KERNEL);
419 if (!res) 408 if (!res)
420 return NULL; 409 return NULL;
421 410
422 /* Allocate all the cg_cgroup_link objects that we'll need */ 411 /* Allocate all the cg_cgroup_link objects that we'll need */
423 if (allocate_cg_links(root_count, &tmp_cg_links) < 0) { 412 if (allocate_cg_links(root_count, &tmp_cg_links) < 0) {
424 kfree(res); 413 kfree(res);
425 return NULL; 414 return NULL;
426 } 415 }
427 416
428 kref_init(&res->ref); 417 kref_init(&res->ref);
429 INIT_LIST_HEAD(&res->cg_links); 418 INIT_LIST_HEAD(&res->cg_links);
430 INIT_LIST_HEAD(&res->tasks); 419 INIT_LIST_HEAD(&res->tasks);
431 420
432 /* Copy the set of subsystem state objects generated in 421 /* Copy the set of subsystem state objects generated in
433 * find_existing_css_set() */ 422 * find_existing_css_set() */
434 memcpy(res->subsys, template, sizeof(res->subsys)); 423 memcpy(res->subsys, template, sizeof(res->subsys));
435 424
436 write_lock(&css_set_lock); 425 write_lock(&css_set_lock);
437 /* Add reference counts and links from the new css_set. */ 426 /* Add reference counts and links from the new css_set. */
438 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 427 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
439 struct cgroup *cgrp = res->subsys[i]->cgroup; 428 struct cgroup *cgrp = res->subsys[i]->cgroup;
440 struct cgroup_subsys *ss = subsys[i]; 429 struct cgroup_subsys *ss = subsys[i];
441 atomic_inc(&cgrp->count); 430 atomic_inc(&cgrp->count);
442 /* 431 /*
443 * We want to add a link once per cgroup, so we 432 * We want to add a link once per cgroup, so we
444 * only do it for the first subsystem in each 433 * only do it for the first subsystem in each
445 * hierarchy 434 * hierarchy
446 */ 435 */
447 if (ss->root->subsys_list.next == &ss->sibling) { 436 if (ss->root->subsys_list.next == &ss->sibling) {
448 BUG_ON(list_empty(&tmp_cg_links)); 437 BUG_ON(list_empty(&tmp_cg_links));
449 link = list_entry(tmp_cg_links.next, 438 link = list_entry(tmp_cg_links.next,
450 struct cg_cgroup_link, 439 struct cg_cgroup_link,
451 cgrp_link_list); 440 cgrp_link_list);
452 list_del(&link->cgrp_link_list); 441 list_del(&link->cgrp_link_list);
453 list_add(&link->cgrp_link_list, &cgrp->css_sets); 442 list_add(&link->cgrp_link_list, &cgrp->css_sets);
454 link->cg = res; 443 link->cg = res;
455 list_add(&link->cg_link_list, &res->cg_links); 444 list_add(&link->cg_link_list, &res->cg_links);
456 } 445 }
457 } 446 }
458 if (list_empty(&rootnode.subsys_list)) { 447 if (list_empty(&rootnode.subsys_list)) {
459 link = list_entry(tmp_cg_links.next, 448 link = list_entry(tmp_cg_links.next,
460 struct cg_cgroup_link, 449 struct cg_cgroup_link,
461 cgrp_link_list); 450 cgrp_link_list);
462 list_del(&link->cgrp_link_list); 451 list_del(&link->cgrp_link_list);
463 list_add(&link->cgrp_link_list, &dummytop->css_sets); 452 list_add(&link->cgrp_link_list, &dummytop->css_sets);
464 link->cg = res; 453 link->cg = res;
465 list_add(&link->cg_link_list, &res->cg_links); 454 list_add(&link->cg_link_list, &res->cg_links);
466 } 455 }
467 456
468 BUG_ON(!list_empty(&tmp_cg_links)); 457 BUG_ON(!list_empty(&tmp_cg_links));
469 458
470 /* Link this cgroup group into the list */ 459 /* Link this cgroup group into the list */
471 list_add(&res->list, &init_css_set.list); 460 list_add(&res->list, &init_css_set.list);
472 css_set_count++; 461 css_set_count++;
473 write_unlock(&css_set_lock); 462 write_unlock(&css_set_lock);
474 463
475 return res; 464 return res;
476 } 465 }
477 466
478 /* 467 /*
479 * There is one global cgroup mutex. We also require taking 468 * There is one global cgroup mutex. We also require taking
480 * task_lock() when dereferencing a task's cgroup subsys pointers. 469 * task_lock() when dereferencing a task's cgroup subsys pointers.
481 * See "The task_lock() exception", at the end of this comment. 470 * See "The task_lock() exception", at the end of this comment.
482 * 471 *
483 * A task must hold cgroup_mutex to modify cgroups. 472 * A task must hold cgroup_mutex to modify cgroups.
484 * 473 *
485 * Any task can increment and decrement the count field without lock. 474 * Any task can increment and decrement the count field without lock.
486 * So in general, code holding cgroup_mutex can't rely on the count 475 * So in general, code holding cgroup_mutex can't rely on the count
487 * field not changing. However, if the count goes to zero, then only 476 * field not changing. However, if the count goes to zero, then only
488 * cgroup_attach_task() can increment it again. Because a count of zero 477 * cgroup_attach_task() can increment it again. Because a count of zero
489 * means that no tasks are currently attached, therefore there is no 478 * means that no tasks are currently attached, therefore there is no
490 * way a task attached to that cgroup can fork (the other way to 479 * way a task attached to that cgroup can fork (the other way to
491 * increment the count). So code holding cgroup_mutex can safely 480 * increment the count). So code holding cgroup_mutex can safely
492 * assume that if the count is zero, it will stay zero. Similarly, if 481 * assume that if the count is zero, it will stay zero. Similarly, if
493 * a task holds cgroup_mutex on a cgroup with zero count, it 482 * a task holds cgroup_mutex on a cgroup with zero count, it
494 * knows that the cgroup won't be removed, as cgroup_rmdir() 483 * knows that the cgroup won't be removed, as cgroup_rmdir()
495 * needs that mutex. 484 * needs that mutex.
496 * 485 *
497 * The cgroup_common_file_write handler for operations that modify 486 * The cgroup_common_file_write handler for operations that modify
498 * the cgroup hierarchy holds cgroup_mutex across the entire operation, 487 * the cgroup hierarchy holds cgroup_mutex across the entire operation,
499 * single threading all such cgroup modifications across the system. 488 * single threading all such cgroup modifications across the system.
500 * 489 *
501 * The fork and exit callbacks cgroup_fork() and cgroup_exit(), don't 490 * The fork and exit callbacks cgroup_fork() and cgroup_exit(), don't
502 * (usually) take cgroup_mutex. These are the two most performance 491 * (usually) take cgroup_mutex. These are the two most performance
503 * critical pieces of code here. The exception occurs on cgroup_exit(), 492 * critical pieces of code here. The exception occurs on cgroup_exit(),
504 * when a task in a notify_on_release cgroup exits. Then cgroup_mutex 493 * when a task in a notify_on_release cgroup exits. Then cgroup_mutex
505 * is taken, and if the cgroup count is zero, a usermode call made 494 * is taken, and if the cgroup count is zero, a usermode call made
506 * to the release agent with the name of the cgroup (path relative to 495 * to the release agent with the name of the cgroup (path relative to
507 * the root of cgroup file system) as the argument. 496 * the root of cgroup file system) as the argument.
508 * 497 *
509 * A cgroup can only be deleted if both its 'count' of using tasks 498 * A cgroup can only be deleted if both its 'count' of using tasks
510 * is zero, and its list of 'children' cgroups is empty. Since all 499 * is zero, and its list of 'children' cgroups is empty. Since all
511 * tasks in the system use _some_ cgroup, and since there is always at 500 * tasks in the system use _some_ cgroup, and since there is always at
512 * least one task in the system (init, pid == 1), top_cgroup 501 * least one task in the system (init, pid == 1), top_cgroup
513 * always has child cgroups and/or using tasks. So we don't 502 * always has child cgroups and/or using tasks. So we don't
514 * need a special hack to ensure that top_cgroup cannot be deleted. 503 * need a special hack to ensure that top_cgroup cannot be deleted.
515 * 504 *
516 * The task_lock() exception 505 * The task_lock() exception
517 * 506 *
518 * The need for this exception arises from the action of 507 * The need for this exception arises from the action of
519 * cgroup_attach_task(), which overwrites one task's cgroup pointer with 508 * cgroup_attach_task(), which overwrites one task's cgroup pointer with
520 * another. It does so using cgroup_mutex, however there are 509 * another. It does so using cgroup_mutex, however there are
521 * several performance critical places that need to reference 510 * several performance critical places that need to reference
522 * task->cgroup without the expense of grabbing a system global 511 * task->cgroup without the expense of grabbing a system global
523 * mutex. Therefore, except as noted below, when dereferencing or, as 512 * mutex. Therefore, except as noted below, when dereferencing or, as
524 * in cgroup_attach_task(), modifying a task's cgroup pointer, we use 513 * in cgroup_attach_task(), modifying a task's cgroup pointer, we use
525 * task_lock(), which acts on a spinlock (task->alloc_lock) already in 514 * task_lock(), which acts on a spinlock (task->alloc_lock) already in
526 * the task_struct routinely used for such matters. 515 * the task_struct routinely used for such matters.
527 * 516 *
528 * P.S. One more locking exception. RCU is used to guard the 517 * P.S. One more locking exception. RCU is used to guard the
529 * update of a task's cgroup pointer by cgroup_attach_task() 518 * update of a task's cgroup pointer by cgroup_attach_task()
530 */ 519 */
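
The task_lock() rule above can be made concrete with a reader-side sketch (a hypothetical helper, not part of this patch): pin the pointer with task_lock(), copy what is needed, and drop the lock again.

	/* Hypothetical: fetch the cgroup that 'tsk' belongs to in the
	 * hierarchy owning 'subsys_id', without taking cgroup_mutex.
	 * task_lock() keeps cgroup_attach_task() from switching
	 * tsk->cgroups underneath us. */
	static struct cgroup *peek_task_cgroup(struct task_struct *tsk,
					       int subsys_id)
	{
		struct cgroup *cgrp;

		task_lock(tsk);
		cgrp = tsk->cgroups->subsys[subsys_id]->cgroup;
		task_unlock(tsk);
		/* cgrp is only guaranteed to outlive this point if the
		 * caller also holds cgroup_mutex or an RCU read lock. */
		return cgrp;
	}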
531 520
532 /** 521 /**
533 * cgroup_lock - lock out any changes to cgroup structures 522 * cgroup_lock - lock out any changes to cgroup structures
534 * 523 *
535 */ 524 */
536 void cgroup_lock(void) 525 void cgroup_lock(void)
537 { 526 {
538 mutex_lock(&cgroup_mutex); 527 mutex_lock(&cgroup_mutex);
539 } 528 }
540 529
541 /** 530 /**
542 * cgroup_unlock - release lock on cgroup changes 531 * cgroup_unlock - release lock on cgroup changes
543 * 532 *
544 * Undo the lock taken in a previous cgroup_lock() call. 533 * Undo the lock taken in a previous cgroup_lock() call.
545 */ 534 */
546 void cgroup_unlock(void) 535 void cgroup_unlock(void)
547 { 536 {
548 mutex_unlock(&cgroup_mutex); 537 mutex_unlock(&cgroup_mutex);
549 } 538 }
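
These two trivial wrappers exist so that subsystem code can serialize against hierarchy changes without naming cgroup_mutex directly. A typical (hypothetical) caller brackets its inspection of hierarchy state:

	cgroup_lock();
	/* no cgroup can be created, removed or re-bound here */
	if (list_empty(&cgrp->children))
		pr_debug("cgroup has no children\n");	/* hypothetical use */
	cgroup_unlock();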
550 539
551 /* 540 /*
552 * A couple of forward declarations are required, due to the cyclic reference loop: 541 * A couple of forward declarations are required, due to the cyclic reference loop:
553 * cgroup_mkdir -> cgroup_create -> cgroup_populate_dir -> 542 * cgroup_mkdir -> cgroup_create -> cgroup_populate_dir ->
554 * cgroup_add_file -> cgroup_create_file -> cgroup_dir_inode_operations 543 * cgroup_add_file -> cgroup_create_file -> cgroup_dir_inode_operations
555 * -> cgroup_mkdir. 544 * -> cgroup_mkdir.
556 */ 545 */
557 546
558 static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, int mode); 547 static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, int mode);
559 static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry); 548 static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry);
560 static int cgroup_populate_dir(struct cgroup *cgrp); 549 static int cgroup_populate_dir(struct cgroup *cgrp);
561 static struct inode_operations cgroup_dir_inode_operations; 550 static struct inode_operations cgroup_dir_inode_operations;
562 static struct file_operations proc_cgroupstats_operations; 551 static struct file_operations proc_cgroupstats_operations;
563 552
564 static struct backing_dev_info cgroup_backing_dev_info = { 553 static struct backing_dev_info cgroup_backing_dev_info = {
565 .capabilities = BDI_CAP_NO_ACCT_DIRTY | BDI_CAP_NO_WRITEBACK, 554 .capabilities = BDI_CAP_NO_ACCT_DIRTY | BDI_CAP_NO_WRITEBACK,
566 }; 555 };
567 556
568 static struct inode *cgroup_new_inode(mode_t mode, struct super_block *sb) 557 static struct inode *cgroup_new_inode(mode_t mode, struct super_block *sb)
569 { 558 {
570 struct inode *inode = new_inode(sb); 559 struct inode *inode = new_inode(sb);
571 560
572 if (inode) { 561 if (inode) {
573 inode->i_mode = mode; 562 inode->i_mode = mode;
574 inode->i_uid = current->fsuid; 563 inode->i_uid = current->fsuid;
575 inode->i_gid = current->fsgid; 564 inode->i_gid = current->fsgid;
576 inode->i_blocks = 0; 565 inode->i_blocks = 0;
577 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; 566 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
578 inode->i_mapping->backing_dev_info = &cgroup_backing_dev_info; 567 inode->i_mapping->backing_dev_info = &cgroup_backing_dev_info;
579 } 568 }
580 return inode; 569 return inode;
581 } 570 }
582 571
583 /* 572 /*
584 * Call subsys's pre_destroy handler. 573 * Call subsys's pre_destroy handler.
585 * This is called before css refcnt check. 574 * This is called before css refcnt check.
586 */ 575 */
587 static void cgroup_call_pre_destroy(struct cgroup *cgrp) 576 static void cgroup_call_pre_destroy(struct cgroup *cgrp)
588 { 577 {
589 struct cgroup_subsys *ss; 578 struct cgroup_subsys *ss;
590 for_each_subsys(cgrp->root, ss) 579 for_each_subsys(cgrp->root, ss)
591 if (ss->pre_destroy && cgrp->subsys[ss->subsys_id]) 580 if (ss->pre_destroy && cgrp->subsys[ss->subsys_id])
592 ss->pre_destroy(ss, cgrp); 581 ss->pre_destroy(ss, cgrp);
593 return; 582 return;
594 } 583 }
595 584
596 static void cgroup_diput(struct dentry *dentry, struct inode *inode) 585 static void cgroup_diput(struct dentry *dentry, struct inode *inode)
597 { 586 {
598 /* is dentry a directory? if so, kfree() the associated cgroup */ 587 /* is dentry a directory? if so, kfree() the associated cgroup */
599 if (S_ISDIR(inode->i_mode)) { 588 if (S_ISDIR(inode->i_mode)) {
600 struct cgroup *cgrp = dentry->d_fsdata; 589 struct cgroup *cgrp = dentry->d_fsdata;
601 struct cgroup_subsys *ss; 590 struct cgroup_subsys *ss;
602 BUG_ON(!(cgroup_is_removed(cgrp))); 591 BUG_ON(!(cgroup_is_removed(cgrp)));
603 /* It's possible for external users to be holding css 592 /* It's possible for external users to be holding css
604 * reference counts on a cgroup; css_put() needs to 593 * reference counts on a cgroup; css_put() needs to
605 * be able to access the cgroup after decrementing 594 * be able to access the cgroup after decrementing
606 * the reference count in order to know if it needs to 595 * the reference count in order to know if it needs to
607 * queue the cgroup to be handled by the release 596 * queue the cgroup to be handled by the release
608 * agent */ 597 * agent */
609 synchronize_rcu(); 598 synchronize_rcu();
610 599
611 mutex_lock(&cgroup_mutex); 600 mutex_lock(&cgroup_mutex);
612 /* 601 /*
613 * Release the subsystem state objects. 602 * Release the subsystem state objects.
614 */ 603 */
615 for_each_subsys(cgrp->root, ss) { 604 for_each_subsys(cgrp->root, ss) {
616 if (cgrp->subsys[ss->subsys_id]) 605 if (cgrp->subsys[ss->subsys_id])
617 ss->destroy(ss, cgrp); 606 ss->destroy(ss, cgrp);
618 } 607 }
619 608
620 cgrp->root->number_of_cgroups--; 609 cgrp->root->number_of_cgroups--;
621 mutex_unlock(&cgroup_mutex); 610 mutex_unlock(&cgroup_mutex);
622 611
623 /* Drop the active superblock reference that we took when we 612 /* Drop the active superblock reference that we took when we
624 * created the cgroup */ 613 * created the cgroup */
625 deactivate_super(cgrp->root->sb); 614 deactivate_super(cgrp->root->sb);
626 615
627 kfree(cgrp); 616 kfree(cgrp);
628 } 617 }
629 iput(inode); 618 iput(inode);
630 } 619 }
631 620
632 static void remove_dir(struct dentry *d) 621 static void remove_dir(struct dentry *d)
633 { 622 {
634 struct dentry *parent = dget(d->d_parent); 623 struct dentry *parent = dget(d->d_parent);
635 624
636 d_delete(d); 625 d_delete(d);
637 simple_rmdir(parent->d_inode, d); 626 simple_rmdir(parent->d_inode, d);
638 dput(parent); 627 dput(parent);
639 } 628 }
640 629
641 static void cgroup_clear_directory(struct dentry *dentry) 630 static void cgroup_clear_directory(struct dentry *dentry)
642 { 631 {
643 struct list_head *node; 632 struct list_head *node;
644 633
645 BUG_ON(!mutex_is_locked(&dentry->d_inode->i_mutex)); 634 BUG_ON(!mutex_is_locked(&dentry->d_inode->i_mutex));
646 spin_lock(&dcache_lock); 635 spin_lock(&dcache_lock);
647 node = dentry->d_subdirs.next; 636 node = dentry->d_subdirs.next;
648 while (node != &dentry->d_subdirs) { 637 while (node != &dentry->d_subdirs) {
649 struct dentry *d = list_entry(node, struct dentry, d_u.d_child); 638 struct dentry *d = list_entry(node, struct dentry, d_u.d_child);
650 list_del_init(node); 639 list_del_init(node);
651 if (d->d_inode) { 640 if (d->d_inode) {
652 /* This should never be called on a cgroup 641 /* This should never be called on a cgroup
653 * directory with child cgroups */ 642 * directory with child cgroups */
654 BUG_ON(d->d_inode->i_mode & S_IFDIR); 643 BUG_ON(d->d_inode->i_mode & S_IFDIR);
655 d = dget_locked(d); 644 d = dget_locked(d);
656 spin_unlock(&dcache_lock); 645 spin_unlock(&dcache_lock);
657 d_delete(d); 646 d_delete(d);
658 simple_unlink(dentry->d_inode, d); 647 simple_unlink(dentry->d_inode, d);
659 dput(d); 648 dput(d);
660 spin_lock(&dcache_lock); 649 spin_lock(&dcache_lock);
661 } 650 }
662 node = dentry->d_subdirs.next; 651 node = dentry->d_subdirs.next;
663 } 652 }
664 spin_unlock(&dcache_lock); 653 spin_unlock(&dcache_lock);
665 } 654 }
666 655
667 /* 656 /*
668 * NOTE: the dentry must have been dget()'ed 657 * NOTE: the dentry must have been dget()'ed
669 */ 658 */
670 static void cgroup_d_remove_dir(struct dentry *dentry) 659 static void cgroup_d_remove_dir(struct dentry *dentry)
671 { 660 {
672 cgroup_clear_directory(dentry); 661 cgroup_clear_directory(dentry);
673 662
674 spin_lock(&dcache_lock); 663 spin_lock(&dcache_lock);
675 list_del_init(&dentry->d_u.d_child); 664 list_del_init(&dentry->d_u.d_child);
676 spin_unlock(&dcache_lock); 665 spin_unlock(&dcache_lock);
677 remove_dir(dentry); 666 remove_dir(dentry);
678 } 667 }
679 668
680 static int rebind_subsystems(struct cgroupfs_root *root, 669 static int rebind_subsystems(struct cgroupfs_root *root,
681 unsigned long final_bits) 670 unsigned long final_bits)
682 { 671 {
683 unsigned long added_bits, removed_bits; 672 unsigned long added_bits, removed_bits;
684 struct cgroup *cgrp = &root->top_cgroup; 673 struct cgroup *cgrp = &root->top_cgroup;
685 int i; 674 int i;
686 675
687 removed_bits = root->actual_subsys_bits & ~final_bits; 676 removed_bits = root->actual_subsys_bits & ~final_bits;
688 added_bits = final_bits & ~root->actual_subsys_bits; 677 added_bits = final_bits & ~root->actual_subsys_bits;
689 /* Check that any added subsystems are currently free */ 678 /* Check that any added subsystems are currently free */
690 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 679 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
691 unsigned long bit = 1UL << i; 680 unsigned long bit = 1UL << i;
692 struct cgroup_subsys *ss = subsys[i]; 681 struct cgroup_subsys *ss = subsys[i];
693 if (!(bit & added_bits)) 682 if (!(bit & added_bits))
694 continue; 683 continue;
695 if (ss->root != &rootnode) { 684 if (ss->root != &rootnode) {
696 /* Subsystem isn't free */ 685 /* Subsystem isn't free */
697 return -EBUSY; 686 return -EBUSY;
698 } 687 }
699 } 688 }
700 689
701 /* Currently we don't handle adding/removing subsystems when 690 /* Currently we don't handle adding/removing subsystems when
702 * any child cgroups exist. This is theoretically supportable 691 * any child cgroups exist. This is theoretically supportable
703 * but involves complex error handling, so it's being left until 692 * but involves complex error handling, so it's being left until
704 * later */ 693 * later */
705 if (!list_empty(&cgrp->children)) 694 if (!list_empty(&cgrp->children))
706 return -EBUSY; 695 return -EBUSY;
707 696
708 /* Process each subsystem */ 697 /* Process each subsystem */
709 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 698 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
710 struct cgroup_subsys *ss = subsys[i]; 699 struct cgroup_subsys *ss = subsys[i];
711 unsigned long bit = 1UL << i; 700 unsigned long bit = 1UL << i;
712 if (bit & added_bits) { 701 if (bit & added_bits) {
713 /* We're binding this subsystem to this hierarchy */ 702 /* We're binding this subsystem to this hierarchy */
714 BUG_ON(cgrp->subsys[i]); 703 BUG_ON(cgrp->subsys[i]);
715 BUG_ON(!dummytop->subsys[i]); 704 BUG_ON(!dummytop->subsys[i]);
716 BUG_ON(dummytop->subsys[i]->cgroup != dummytop); 705 BUG_ON(dummytop->subsys[i]->cgroup != dummytop);
717 cgrp->subsys[i] = dummytop->subsys[i]; 706 cgrp->subsys[i] = dummytop->subsys[i];
718 cgrp->subsys[i]->cgroup = cgrp; 707 cgrp->subsys[i]->cgroup = cgrp;
719 list_add(&ss->sibling, &root->subsys_list); 708 list_add(&ss->sibling, &root->subsys_list);
720 rcu_assign_pointer(ss->root, root); 709 rcu_assign_pointer(ss->root, root);
721 if (ss->bind) 710 if (ss->bind)
722 ss->bind(ss, cgrp); 711 ss->bind(ss, cgrp);
723 712
724 } else if (bit & removed_bits) { 713 } else if (bit & removed_bits) {
725 /* We're removing this subsystem */ 714 /* We're removing this subsystem */
726 BUG_ON(cgrp->subsys[i] != dummytop->subsys[i]); 715 BUG_ON(cgrp->subsys[i] != dummytop->subsys[i]);
727 BUG_ON(cgrp->subsys[i]->cgroup != cgrp); 716 BUG_ON(cgrp->subsys[i]->cgroup != cgrp);
728 if (ss->bind) 717 if (ss->bind)
729 ss->bind(ss, dummytop); 718 ss->bind(ss, dummytop);
730 dummytop->subsys[i]->cgroup = dummytop; 719 dummytop->subsys[i]->cgroup = dummytop;
731 cgrp->subsys[i] = NULL; 720 cgrp->subsys[i] = NULL;
732 rcu_assign_pointer(subsys[i]->root, &rootnode); 721 rcu_assign_pointer(subsys[i]->root, &rootnode);
733 list_del(&ss->sibling); 722 list_del(&ss->sibling);
734 } else if (bit & final_bits) { 723 } else if (bit & final_bits) {
735 /* Subsystem state should already exist */ 724 /* Subsystem state should already exist */
736 BUG_ON(!cgrp->subsys[i]); 725 BUG_ON(!cgrp->subsys[i]);
737 } else { 726 } else {
738 /* Subsystem state shouldn't exist */ 727 /* Subsystem state shouldn't exist */
739 BUG_ON(cgrp->subsys[i]); 728 BUG_ON(cgrp->subsys[i]);
740 } 729 }
741 } 730 }
742 root->subsys_bits = root->actual_subsys_bits = final_bits; 731 root->subsys_bits = root->actual_subsys_bits = final_bits;
743 synchronize_rcu(); 732 synchronize_rcu();
744 733
745 return 0; 734 return 0;
746 } 735 }
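
The added_bits/removed_bits computation at the top of rebind_subsystems() is plain set arithmetic on the hierarchy's subsystem bitmask. A worked example (the subsystem indices are hypothetical): if only subsystem 0 is currently bound and a remount requests subsystems 0 and 2:

	unsigned long actual = 0x1;	/* currently bound: bit 0 */
	unsigned long final  = 0x5;	/* requested: bits 0 and 2 */

	unsigned long removed = actual & ~final;	/* == 0x0, nothing to unbind */
	unsigned long added   = final & ~actual;	/* == 0x4, bind subsystem 2 */

Subsystem 0 then falls into the 'bit & final_bits' arm of the loop and is only sanity-checked.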
747 736
748 static int cgroup_show_options(struct seq_file *seq, struct vfsmount *vfs) 737 static int cgroup_show_options(struct seq_file *seq, struct vfsmount *vfs)
749 { 738 {
750 struct cgroupfs_root *root = vfs->mnt_sb->s_fs_info; 739 struct cgroupfs_root *root = vfs->mnt_sb->s_fs_info;
751 struct cgroup_subsys *ss; 740 struct cgroup_subsys *ss;
752 741
753 mutex_lock(&cgroup_mutex); 742 mutex_lock(&cgroup_mutex);
754 for_each_subsys(root, ss) 743 for_each_subsys(root, ss)
755 seq_printf(seq, ",%s", ss->name); 744 seq_printf(seq, ",%s", ss->name);
756 if (test_bit(ROOT_NOPREFIX, &root->flags)) 745 if (test_bit(ROOT_NOPREFIX, &root->flags))
757 seq_puts(seq, ",noprefix"); 746 seq_puts(seq, ",noprefix");
758 if (strlen(root->release_agent_path)) 747 if (strlen(root->release_agent_path))
759 seq_printf(seq, ",release_agent=%s", root->release_agent_path); 748 seq_printf(seq, ",release_agent=%s", root->release_agent_path);
760 mutex_unlock(&cgroup_mutex); 749 mutex_unlock(&cgroup_mutex);
761 return 0; 750 return 0;
762 } 751 }
763 752
764 struct cgroup_sb_opts { 753 struct cgroup_sb_opts {
765 unsigned long subsys_bits; 754 unsigned long subsys_bits;
766 unsigned long flags; 755 unsigned long flags;
767 char *release_agent; 756 char *release_agent;
768 }; 757 };
769 758
770 /* Convert a hierarchy specifier into a bitmask of subsystems and 759 /* Convert a hierarchy specifier into a bitmask of subsystems and
771 * flags. */ 760 * flags. */
772 static int parse_cgroupfs_options(char *data, 761 static int parse_cgroupfs_options(char *data,
773 struct cgroup_sb_opts *opts) 762 struct cgroup_sb_opts *opts)
774 { 763 {
775 char *token, *o = data ?: "all"; 764 char *token, *o = data ?: "all";
776 765
777 opts->subsys_bits = 0; 766 opts->subsys_bits = 0;
778 opts->flags = 0; 767 opts->flags = 0;
779 opts->release_agent = NULL; 768 opts->release_agent = NULL;
780 769
781 while ((token = strsep(&o, ",")) != NULL) { 770 while ((token = strsep(&o, ",")) != NULL) {
782 if (!*token) 771 if (!*token)
783 return -EINVAL; 772 return -EINVAL;
784 if (!strcmp(token, "all")) { 773 if (!strcmp(token, "all")) {
785 /* Add all non-disabled subsystems */ 774 /* Add all non-disabled subsystems */
786 int i; 775 int i;
787 opts->subsys_bits = 0; 776 opts->subsys_bits = 0;
788 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 777 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
789 struct cgroup_subsys *ss = subsys[i]; 778 struct cgroup_subsys *ss = subsys[i];
790 if (!ss->disabled) 779 if (!ss->disabled)
791 opts->subsys_bits |= 1ul << i; 780 opts->subsys_bits |= 1ul << i;
792 } 781 }
793 } else if (!strcmp(token, "noprefix")) { 782 } else if (!strcmp(token, "noprefix")) {
794 set_bit(ROOT_NOPREFIX, &opts->flags); 783 set_bit(ROOT_NOPREFIX, &opts->flags);
795 } else if (!strncmp(token, "release_agent=", 14)) { 784 } else if (!strncmp(token, "release_agent=", 14)) {
796 /* Specifying two release agents is forbidden */ 785 /* Specifying two release agents is forbidden */
797 if (opts->release_agent) 786 if (opts->release_agent)
798 return -EINVAL; 787 return -EINVAL;
799 opts->release_agent = kzalloc(PATH_MAX, GFP_KERNEL); 788 opts->release_agent = kzalloc(PATH_MAX, GFP_KERNEL);
800 if (!opts->release_agent) 789 if (!opts->release_agent)
801 return -ENOMEM; 790 return -ENOMEM;
802 strncpy(opts->release_agent, token + 14, PATH_MAX - 1); 791 strncpy(opts->release_agent, token + 14, PATH_MAX - 1);
803 opts->release_agent[PATH_MAX - 1] = 0; 792 opts->release_agent[PATH_MAX - 1] = 0;
804 } else { 793 } else {
805 struct cgroup_subsys *ss; 794 struct cgroup_subsys *ss;
806 int i; 795 int i;
807 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 796 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
808 ss = subsys[i]; 797 ss = subsys[i];
809 if (!strcmp(token, ss->name)) { 798 if (!strcmp(token, ss->name)) {
810 if (!ss->disabled) 799 if (!ss->disabled)
811 set_bit(i, &opts->subsys_bits); 800 set_bit(i, &opts->subsys_bits);
812 break; 801 break;
813 } 802 }
814 } 803 }
815 if (i == CGROUP_SUBSYS_COUNT) 804 if (i == CGROUP_SUBSYS_COUNT)
816 return -ENOENT; 805 return -ENOENT;
817 } 806 }
818 } 807 }
819 808
820 /* We can't have an empty hierarchy */ 809 /* We can't have an empty hierarchy */
821 if (!opts->subsys_bits) 810 if (!opts->subsys_bits)
822 return -EINVAL; 811 return -EINVAL;
823 812
824 return 0; 813 return 0;
825 } 814 }
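
A usage sketch (the option string and agent path are hypothetical): strsep() pokes NUL bytes into its argument, so the option string must be writable, and on success the caller owns the kzalloc()'d release_agent copy:

	char buf[] = "cpuset,noprefix,release_agent=/sbin/cgroup_release";
	struct cgroup_sb_opts opts;

	if (!parse_cgroupfs_options(buf, &opts)) {
		/* cpuset's bit is now set in opts.subsys_bits,
		 * ROOT_NOPREFIX is set in opts.flags, and
		 * opts.release_agent points at a copy of the path. */
		kfree(opts.release_agent);
	}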
826 815
827 static int cgroup_remount(struct super_block *sb, int *flags, char *data) 816 static int cgroup_remount(struct super_block *sb, int *flags, char *data)
828 { 817 {
829 int ret = 0; 818 int ret = 0;
830 struct cgroupfs_root *root = sb->s_fs_info; 819 struct cgroupfs_root *root = sb->s_fs_info;
831 struct cgroup *cgrp = &root->top_cgroup; 820 struct cgroup *cgrp = &root->top_cgroup;
832 struct cgroup_sb_opts opts; 821 struct cgroup_sb_opts opts;
833 822
834 mutex_lock(&cgrp->dentry->d_inode->i_mutex); 823 mutex_lock(&cgrp->dentry->d_inode->i_mutex);
835 mutex_lock(&cgroup_mutex); 824 mutex_lock(&cgroup_mutex);
836 825
837 /* See what subsystems are wanted */ 826 /* See what subsystems are wanted */
838 ret = parse_cgroupfs_options(data, &opts); 827 ret = parse_cgroupfs_options(data, &opts);
839 if (ret) 828 if (ret)
840 goto out_unlock; 829 goto out_unlock;
841 830
842 /* Don't allow flags to change at remount */ 831 /* Don't allow flags to change at remount */
843 if (opts.flags != root->flags) { 832 if (opts.flags != root->flags) {
844 ret = -EINVAL; 833 ret = -EINVAL;
845 goto out_unlock; 834 goto out_unlock;
846 } 835 }
847 836
848 ret = rebind_subsystems(root, opts.subsys_bits); 837 ret = rebind_subsystems(root, opts.subsys_bits);
849 838
850 /* (re)populate subsystem files */ 839 /* (re)populate subsystem files */
851 if (!ret) 840 if (!ret)
852 cgroup_populate_dir(cgrp); 841 cgroup_populate_dir(cgrp);
853 842
854 if (opts.release_agent) 843 if (opts.release_agent)
855 strcpy(root->release_agent_path, opts.release_agent); 844 strcpy(root->release_agent_path, opts.release_agent);
856 out_unlock: 845 out_unlock:
857 if (opts.release_agent) 846 if (opts.release_agent)
858 kfree(opts.release_agent); 847 kfree(opts.release_agent);
859 mutex_unlock(&cgroup_mutex); 848 mutex_unlock(&cgroup_mutex);
860 mutex_unlock(&cgrp->dentry->d_inode->i_mutex); 849 mutex_unlock(&cgrp->dentry->d_inode->i_mutex);
861 return ret; 850 return ret;
862 } 851 }
863 852
864 static struct super_operations cgroup_ops = { 853 static struct super_operations cgroup_ops = {
865 .statfs = simple_statfs, 854 .statfs = simple_statfs,
866 .drop_inode = generic_delete_inode, 855 .drop_inode = generic_delete_inode,
867 .show_options = cgroup_show_options, 856 .show_options = cgroup_show_options,
868 .remount_fs = cgroup_remount, 857 .remount_fs = cgroup_remount,
869 }; 858 };
870 859
871 static void init_cgroup_root(struct cgroupfs_root *root) 860 static void init_cgroup_root(struct cgroupfs_root *root)
872 { 861 {
873 struct cgroup *cgrp = &root->top_cgroup; 862 struct cgroup *cgrp = &root->top_cgroup;
874 INIT_LIST_HEAD(&root->subsys_list); 863 INIT_LIST_HEAD(&root->subsys_list);
875 INIT_LIST_HEAD(&root->root_list); 864 INIT_LIST_HEAD(&root->root_list);
876 root->number_of_cgroups = 1; 865 root->number_of_cgroups = 1;
877 cgrp->root = root; 866 cgrp->root = root;
878 cgrp->top_cgroup = cgrp; 867 cgrp->top_cgroup = cgrp;
879 INIT_LIST_HEAD(&cgrp->sibling); 868 INIT_LIST_HEAD(&cgrp->sibling);
880 INIT_LIST_HEAD(&cgrp->children); 869 INIT_LIST_HEAD(&cgrp->children);
881 INIT_LIST_HEAD(&cgrp->css_sets); 870 INIT_LIST_HEAD(&cgrp->css_sets);
882 INIT_LIST_HEAD(&cgrp->release_list); 871 INIT_LIST_HEAD(&cgrp->release_list);
883 } 872 }
884 873
885 static int cgroup_test_super(struct super_block *sb, void *data) 874 static int cgroup_test_super(struct super_block *sb, void *data)
886 { 875 {
887 struct cgroupfs_root *new = data; 876 struct cgroupfs_root *new = data;
888 struct cgroupfs_root *root = sb->s_fs_info; 877 struct cgroupfs_root *root = sb->s_fs_info;
889 878
890 /* First check subsystems */ 879 /* First check subsystems */
891 if (new->subsys_bits != root->subsys_bits) 880 if (new->subsys_bits != root->subsys_bits)
892 return 0; 881 return 0;
893 882
894 /* Next check flags */ 883 /* Next check flags */
895 if (new->flags != root->flags) 884 if (new->flags != root->flags)
896 return 0; 885 return 0;
897 886
898 return 1; 887 return 1;
899 } 888 }
900 889
901 static int cgroup_set_super(struct super_block *sb, void *data) 890 static int cgroup_set_super(struct super_block *sb, void *data)
902 { 891 {
903 int ret; 892 int ret;
904 struct cgroupfs_root *root = data; 893 struct cgroupfs_root *root = data;
905 894
906 ret = set_anon_super(sb, NULL); 895 ret = set_anon_super(sb, NULL);
907 if (ret) 896 if (ret)
908 return ret; 897 return ret;
909 898
910 sb->s_fs_info = root; 899 sb->s_fs_info = root;
911 root->sb = sb; 900 root->sb = sb;
912 901
913 sb->s_blocksize = PAGE_CACHE_SIZE; 902 sb->s_blocksize = PAGE_CACHE_SIZE;
914 sb->s_blocksize_bits = PAGE_CACHE_SHIFT; 903 sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
915 sb->s_magic = CGROUP_SUPER_MAGIC; 904 sb->s_magic = CGROUP_SUPER_MAGIC;
916 sb->s_op = &cgroup_ops; 905 sb->s_op = &cgroup_ops;
917 906
918 return 0; 907 return 0;
919 } 908 }
920 909
921 static int cgroup_get_rootdir(struct super_block *sb) 910 static int cgroup_get_rootdir(struct super_block *sb)
922 { 911 {
923 struct inode *inode = 912 struct inode *inode =
924 cgroup_new_inode(S_IFDIR | S_IRUGO | S_IXUGO | S_IWUSR, sb); 913 cgroup_new_inode(S_IFDIR | S_IRUGO | S_IXUGO | S_IWUSR, sb);
925 struct dentry *dentry; 914 struct dentry *dentry;
926 915
927 if (!inode) 916 if (!inode)
928 return -ENOMEM; 917 return -ENOMEM;
929 918
930 inode->i_fop = &simple_dir_operations; 919 inode->i_fop = &simple_dir_operations;
931 inode->i_op = &cgroup_dir_inode_operations; 920 inode->i_op = &cgroup_dir_inode_operations;
932 /* directories start off with i_nlink == 2 (for "." entry) */ 921 /* directories start off with i_nlink == 2 (for "." entry) */
933 inc_nlink(inode); 922 inc_nlink(inode);
934 dentry = d_alloc_root(inode); 923 dentry = d_alloc_root(inode);
935 if (!dentry) { 924 if (!dentry) {
936 iput(inode); 925 iput(inode);
937 return -ENOMEM; 926 return -ENOMEM;
938 } 927 }
939 sb->s_root = dentry; 928 sb->s_root = dentry;
940 return 0; 929 return 0;
941 } 930 }
942 931
943 static int cgroup_get_sb(struct file_system_type *fs_type, 932 static int cgroup_get_sb(struct file_system_type *fs_type,
944 int flags, const char *unused_dev_name, 933 int flags, const char *unused_dev_name,
945 void *data, struct vfsmount *mnt) 934 void *data, struct vfsmount *mnt)
946 { 935 {
947 struct cgroup_sb_opts opts; 936 struct cgroup_sb_opts opts;
948 int ret = 0; 937 int ret = 0;
949 struct super_block *sb; 938 struct super_block *sb;
950 struct cgroupfs_root *root; 939 struct cgroupfs_root *root;
951 struct list_head tmp_cg_links, *l; 940 struct list_head tmp_cg_links, *l;
952 INIT_LIST_HEAD(&tmp_cg_links); 941 INIT_LIST_HEAD(&tmp_cg_links);
953 942
954 /* First find the desired set of subsystems */ 943 /* First find the desired set of subsystems */
955 ret = parse_cgroupfs_options(data, &opts); 944 ret = parse_cgroupfs_options(data, &opts);
956 if (ret) { 945 if (ret) {
957 if (opts.release_agent) 946 if (opts.release_agent)
958 kfree(opts.release_agent); 947 kfree(opts.release_agent);
959 return ret; 948 return ret;
960 } 949 }
961 950
962 root = kzalloc(sizeof(*root), GFP_KERNEL); 951 root = kzalloc(sizeof(*root), GFP_KERNEL);
963 if (!root) { 952 if (!root) {
964 if (opts.release_agent) 953 if (opts.release_agent)
965 kfree(opts.release_agent); 954 kfree(opts.release_agent);
966 return -ENOMEM; 955 return -ENOMEM;
967 } 956 }
968 957
969 init_cgroup_root(root); 958 init_cgroup_root(root);
970 root->subsys_bits = opts.subsys_bits; 959 root->subsys_bits = opts.subsys_bits;
971 root->flags = opts.flags; 960 root->flags = opts.flags;
972 if (opts.release_agent) { 961 if (opts.release_agent) {
973 strcpy(root->release_agent_path, opts.release_agent); 962 strcpy(root->release_agent_path, opts.release_agent);
974 kfree(opts.release_agent); 963 kfree(opts.release_agent);
975 } 964 }
976 965
977 sb = sget(fs_type, cgroup_test_super, cgroup_set_super, root); 966 sb = sget(fs_type, cgroup_test_super, cgroup_set_super, root);
978 967
979 if (IS_ERR(sb)) { 968 if (IS_ERR(sb)) {
980 kfree(root); 969 kfree(root);
981 return PTR_ERR(sb); 970 return PTR_ERR(sb);
982 } 971 }
983 972
984 if (sb->s_fs_info != root) { 973 if (sb->s_fs_info != root) {
985 /* Reusing an existing superblock */ 974 /* Reusing an existing superblock */
986 BUG_ON(sb->s_root == NULL); 975 BUG_ON(sb->s_root == NULL);
987 kfree(root); 976 kfree(root);
988 root = NULL; 977 root = NULL;
989 } else { 978 } else {
990 /* New superblock */ 979 /* New superblock */
991 struct cgroup *cgrp = &root->top_cgroup; 980 struct cgroup *cgrp = &root->top_cgroup;
992 struct inode *inode; 981 struct inode *inode;
993 982
994 BUG_ON(sb->s_root != NULL); 983 BUG_ON(sb->s_root != NULL);
995 984
996 ret = cgroup_get_rootdir(sb); 985 ret = cgroup_get_rootdir(sb);
997 if (ret) 986 if (ret)
998 goto drop_new_super; 987 goto drop_new_super;
999 inode = sb->s_root->d_inode; 988 inode = sb->s_root->d_inode;
1000 989
1001 mutex_lock(&inode->i_mutex); 990 mutex_lock(&inode->i_mutex);
1002 mutex_lock(&cgroup_mutex); 991 mutex_lock(&cgroup_mutex);
1003 992
1004 /* 993 /*
1005 * We're accessing css_set_count without locking 994 * We're accessing css_set_count without locking
1006 * css_set_lock here, but that's OK - it can only be 995 * css_set_lock here, but that's OK - it can only be
1007 * increased by someone holding cgroup_lock, and 996 * increased by someone holding cgroup_lock, and
1008 * that's us. The worst that can happen is that we 997 * that's us. The worst that can happen is that we
1009 * have some link structures left over 998 * have some link structures left over
1010 */ 999 */
1011 ret = allocate_cg_links(css_set_count, &tmp_cg_links); 1000 ret = allocate_cg_links(css_set_count, &tmp_cg_links);
1012 if (ret) { 1001 if (ret) {
1013 mutex_unlock(&cgroup_mutex); 1002 mutex_unlock(&cgroup_mutex);
1014 mutex_unlock(&inode->i_mutex); 1003 mutex_unlock(&inode->i_mutex);
1015 goto drop_new_super; 1004 goto drop_new_super;
1016 } 1005 }
1017 1006
1018 ret = rebind_subsystems(root, root->subsys_bits); 1007 ret = rebind_subsystems(root, root->subsys_bits);
1019 if (ret == -EBUSY) { 1008 if (ret == -EBUSY) {
1020 mutex_unlock(&cgroup_mutex); 1009 mutex_unlock(&cgroup_mutex);
1021 mutex_unlock(&inode->i_mutex); 1010 mutex_unlock(&inode->i_mutex);
1022 goto drop_new_super; 1011 goto drop_new_super;
1023 } 1012 }
1024 1013
1025 /* EBUSY should be the only error here */ 1014 /* EBUSY should be the only error here */
1026 BUG_ON(ret); 1015 BUG_ON(ret);
1027 1016
1028 list_add(&root->root_list, &roots); 1017 list_add(&root->root_list, &roots);
1029 root_count++; 1018 root_count++;
1030 1019
1031 sb->s_root->d_fsdata = &root->top_cgroup; 1020 sb->s_root->d_fsdata = &root->top_cgroup;
1032 root->top_cgroup.dentry = sb->s_root; 1021 root->top_cgroup.dentry = sb->s_root;
1033 1022
1034 /* Link the top cgroup in this hierarchy into all 1023 /* Link the top cgroup in this hierarchy into all
1035 * the css_set objects */ 1024 * the css_set objects */
1036 write_lock(&css_set_lock); 1025 write_lock(&css_set_lock);
1037 l = &init_css_set.list; 1026 l = &init_css_set.list;
1038 do { 1027 do {
1039 struct css_set *cg; 1028 struct css_set *cg;
1040 struct cg_cgroup_link *link; 1029 struct cg_cgroup_link *link;
1041 cg = list_entry(l, struct css_set, list); 1030 cg = list_entry(l, struct css_set, list);
1042 BUG_ON(list_empty(&tmp_cg_links)); 1031 BUG_ON(list_empty(&tmp_cg_links));
1043 link = list_entry(tmp_cg_links.next, 1032 link = list_entry(tmp_cg_links.next,
1044 struct cg_cgroup_link, 1033 struct cg_cgroup_link,
1045 cgrp_link_list); 1034 cgrp_link_list);
1046 list_del(&link->cgrp_link_list); 1035 list_del(&link->cgrp_link_list);
1047 link->cg = cg; 1036 link->cg = cg;
1048 list_add(&link->cgrp_link_list, 1037 list_add(&link->cgrp_link_list,
1049 &root->top_cgroup.css_sets); 1038 &root->top_cgroup.css_sets);
1050 list_add(&link->cg_link_list, &cg->cg_links); 1039 list_add(&link->cg_link_list, &cg->cg_links);
1051 l = l->next; 1040 l = l->next;
1052 } while (l != &init_css_set.list); 1041 } while (l != &init_css_set.list);
1053 write_unlock(&css_set_lock); 1042 write_unlock(&css_set_lock);
1054 1043
1055 free_cg_links(&tmp_cg_links); 1044 free_cg_links(&tmp_cg_links);
1056 1045
1057 BUG_ON(!list_empty(&cgrp->sibling)); 1046 BUG_ON(!list_empty(&cgrp->sibling));
1058 BUG_ON(!list_empty(&cgrp->children)); 1047 BUG_ON(!list_empty(&cgrp->children));
1059 BUG_ON(root->number_of_cgroups != 1); 1048 BUG_ON(root->number_of_cgroups != 1);
1060 1049
1061 cgroup_populate_dir(cgrp); 1050 cgroup_populate_dir(cgrp);
1062 mutex_unlock(&inode->i_mutex); 1051 mutex_unlock(&inode->i_mutex);
1063 mutex_unlock(&cgroup_mutex); 1052 mutex_unlock(&cgroup_mutex);
1064 } 1053 }
1065 1054
1066 return simple_set_mnt(mnt, sb); 1055 return simple_set_mnt(mnt, sb);
1067 1056
1068 drop_new_super: 1057 drop_new_super:
1069 up_write(&sb->s_umount); 1058 up_write(&sb->s_umount);
1070 deactivate_super(sb); 1059 deactivate_super(sb);
1071 free_cg_links(&tmp_cg_links); 1060 free_cg_links(&tmp_cg_links);
1072 return ret; 1061 return ret;
1073 } 1062 }
1074 1063
1075 static void cgroup_kill_sb(struct super_block *sb) { 1064 static void cgroup_kill_sb(struct super_block *sb) {
1076 struct cgroupfs_root *root = sb->s_fs_info; 1065 struct cgroupfs_root *root = sb->s_fs_info;
1077 struct cgroup *cgrp = &root->top_cgroup; 1066 struct cgroup *cgrp = &root->top_cgroup;
1078 int ret; 1067 int ret;
1079 1068
1080 BUG_ON(!root); 1069 BUG_ON(!root);
1081 1070
1082 BUG_ON(root->number_of_cgroups != 1); 1071 BUG_ON(root->number_of_cgroups != 1);
1083 BUG_ON(!list_empty(&cgrp->children)); 1072 BUG_ON(!list_empty(&cgrp->children));
1084 BUG_ON(!list_empty(&cgrp->sibling)); 1073 BUG_ON(!list_empty(&cgrp->sibling));
1085 1074
1086 mutex_lock(&cgroup_mutex); 1075 mutex_lock(&cgroup_mutex);
1087 1076
1088 /* Rebind all subsystems back to the default hierarchy */ 1077 /* Rebind all subsystems back to the default hierarchy */
1089 ret = rebind_subsystems(root, 0); 1078 ret = rebind_subsystems(root, 0);
1090 /* Shouldn't be able to fail ... */ 1079 /* Shouldn't be able to fail ... */
1091 BUG_ON(ret); 1080 BUG_ON(ret);
1092 1081
1093 /* 1082 /*
1094 * Release all the links from css_sets to this hierarchy's 1083 * Release all the links from css_sets to this hierarchy's
1095 * root cgroup 1084 * root cgroup
1096 */ 1085 */
1097 write_lock(&css_set_lock); 1086 write_lock(&css_set_lock);
1098 while (!list_empty(&cgrp->css_sets)) { 1087 while (!list_empty(&cgrp->css_sets)) {
1099 struct cg_cgroup_link *link; 1088 struct cg_cgroup_link *link;
1100 link = list_entry(cgrp->css_sets.next, 1089 link = list_entry(cgrp->css_sets.next,
1101 struct cg_cgroup_link, cgrp_link_list); 1090 struct cg_cgroup_link, cgrp_link_list);
1102 list_del(&link->cg_link_list); 1091 list_del(&link->cg_link_list);
1103 list_del(&link->cgrp_link_list); 1092 list_del(&link->cgrp_link_list);
1104 kfree(link); 1093 kfree(link);
1105 } 1094 }
1106 write_unlock(&css_set_lock); 1095 write_unlock(&css_set_lock);
1107 1096
1108 if (!list_empty(&root->root_list)) { 1097 if (!list_empty(&root->root_list)) {
1109 list_del(&root->root_list); 1098 list_del(&root->root_list);
1110 root_count--; 1099 root_count--;
1111 } 1100 }
1112 mutex_unlock(&cgroup_mutex); 1101 mutex_unlock(&cgroup_mutex);
1113 1102
1114 kfree(root); 1103 kfree(root);
1115 kill_litter_super(sb); 1104 kill_litter_super(sb);
1116 } 1105 }
1117 1106
1118 static struct file_system_type cgroup_fs_type = { 1107 static struct file_system_type cgroup_fs_type = {
1119 .name = "cgroup", 1108 .name = "cgroup",
1120 .get_sb = cgroup_get_sb, 1109 .get_sb = cgroup_get_sb,
1121 .kill_sb = cgroup_kill_sb, 1110 .kill_sb = cgroup_kill_sb,
1122 }; 1111 };
1123 1112
1124 static inline struct cgroup *__d_cgrp(struct dentry *dentry) 1113 static inline struct cgroup *__d_cgrp(struct dentry *dentry)
1125 { 1114 {
1126 return dentry->d_fsdata; 1115 return dentry->d_fsdata;
1127 } 1116 }
1128 1117
1129 static inline struct cftype *__d_cft(struct dentry *dentry) 1118 static inline struct cftype *__d_cft(struct dentry *dentry)
1130 { 1119 {
1131 return dentry->d_fsdata; 1120 return dentry->d_fsdata;
1132 } 1121 }
1133 1122
1134 /** 1123 /**
1135 * cgroup_path - generate the path of a cgroup 1124 * cgroup_path - generate the path of a cgroup
1136 * @cgrp: the cgroup in question 1125 * @cgrp: the cgroup in question
1137 * @buf: the buffer to write the path into 1126 * @buf: the buffer to write the path into
1138 * @buflen: the length of the buffer 1127 * @buflen: the length of the buffer
1139 * 1128 *
1140 * Called with cgroup_mutex held. Writes path of cgroup into buf. 1129 * Called with cgroup_mutex held. Writes path of cgroup into buf.
1141 * Returns 0 on success, -errno on error. 1130 * Returns 0 on success, -errno on error.
1142 */ 1131 */
1143 int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen) 1132 int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
1144 { 1133 {
1145 char *start; 1134 char *start;
1146 1135
1147 if (cgrp == dummytop) { 1136 if (cgrp == dummytop) {
1148 /* 1137 /*
1149 * Inactive subsystems have no dentry for their root 1138 * Inactive subsystems have no dentry for their root
1150 * cgroup 1139 * cgroup
1151 */ 1140 */
1152 strcpy(buf, "/"); 1141 strcpy(buf, "/");
1153 return 0; 1142 return 0;
1154 } 1143 }
1155 1144
1156 start = buf + buflen; 1145 start = buf + buflen;
1157 1146
1158 *--start = '\0'; 1147 *--start = '\0';
1159 for (;;) { 1148 for (;;) {
1160 int len = cgrp->dentry->d_name.len; 1149 int len = cgrp->dentry->d_name.len;
1161 if ((start -= len) < buf) 1150 if ((start -= len) < buf)
1162 return -ENAMETOOLONG; 1151 return -ENAMETOOLONG;
1163 memcpy(start, cgrp->dentry->d_name.name, len); 1152 memcpy(start, cgrp->dentry->d_name.name, len);
1164 cgrp = cgrp->parent; 1153 cgrp = cgrp->parent;
1165 if (!cgrp) 1154 if (!cgrp)
1166 break; 1155 break;
1167 if (!cgrp->parent) 1156 if (!cgrp->parent)
1168 continue; 1157 continue;
1169 if (--start < buf) 1158 if (--start < buf)
1170 return -ENAMETOOLONG; 1159 return -ENAMETOOLONG;
1171 *start = '/'; 1160 *start = '/';
1172 } 1161 }
1173 memmove(buf, start, buf + buflen - start); 1162 memmove(buf, start, buf + buflen - start);
1174 return 0; 1163 return 0;
1175 } 1164 }
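
The path is assembled right to left from the end of the buffer, so no intermediate copies are needed: for a cgroup b nested under a, the loop writes "b", then "/", then "a", and finally the root dentry's own name "/", yielding "/a/b" after the memmove(). A caller sketch (the printk is hypothetical):

	char *buf = kmalloc(PAGE_SIZE, GFP_KERNEL);

	if (buf) {
		mutex_lock(&cgroup_mutex);	/* required by cgroup_path() */
		if (!cgroup_path(cgrp, buf, PAGE_SIZE))
			printk(KERN_DEBUG "cgroup path: %s\n", buf);
		mutex_unlock(&cgroup_mutex);
		kfree(buf);
	}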
1176 1165
1177 /* 1166 /*
1178 * Return the first subsystem attached to a cgroup's hierarchy, and 1167 * Return the first subsystem attached to a cgroup's hierarchy, and
1179 * its subsystem id. 1168 * its subsystem id.
1180 */ 1169 */
1181 1170
1182 static void get_first_subsys(const struct cgroup *cgrp, 1171 static void get_first_subsys(const struct cgroup *cgrp,
1183 struct cgroup_subsys_state **css, int *subsys_id) 1172 struct cgroup_subsys_state **css, int *subsys_id)
1184 { 1173 {
1185 const struct cgroupfs_root *root = cgrp->root; 1174 const struct cgroupfs_root *root = cgrp->root;
1186 const struct cgroup_subsys *test_ss; 1175 const struct cgroup_subsys *test_ss;
1187 BUG_ON(list_empty(&root->subsys_list)); 1176 BUG_ON(list_empty(&root->subsys_list));
1188 test_ss = list_entry(root->subsys_list.next, 1177 test_ss = list_entry(root->subsys_list.next,
1189 struct cgroup_subsys, sibling); 1178 struct cgroup_subsys, sibling);
1190 if (css) { 1179 if (css) {
1191 *css = cgrp->subsys[test_ss->subsys_id]; 1180 *css = cgrp->subsys[test_ss->subsys_id];
1192 BUG_ON(!*css); 1181 BUG_ON(!*css);
1193 } 1182 }
1194 if (subsys_id) 1183 if (subsys_id)
1195 *subsys_id = test_ss->subsys_id; 1184 *subsys_id = test_ss->subsys_id;
1196 } 1185 }
1197 1186
1198 /** 1187 /**
1199 * cgroup_attach_task - attach task 'tsk' to cgroup 'cgrp' 1188 * cgroup_attach_task - attach task 'tsk' to cgroup 'cgrp'
1200 * @cgrp: the cgroup the task is attaching to 1189 * @cgrp: the cgroup the task is attaching to
1201 * @tsk: the task to be attached 1190 * @tsk: the task to be attached
1202 * 1191 *
1203 * Call holding cgroup_mutex. May take task_lock of 1192 * Call holding cgroup_mutex. May take task_lock of
1204 * the task 'tsk' during call. 1193 * the task 'tsk' during call.
1205 */ 1194 */
1206 int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) 1195 int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
1207 { 1196 {
1208 int retval = 0; 1197 int retval = 0;
1209 struct cgroup_subsys *ss; 1198 struct cgroup_subsys *ss;
1210 struct cgroup *oldcgrp; 1199 struct cgroup *oldcgrp;
1211 struct css_set *cg = tsk->cgroups; 1200 struct css_set *cg = tsk->cgroups;
1212 struct css_set *newcg; 1201 struct css_set *newcg;
1213 struct cgroupfs_root *root = cgrp->root; 1202 struct cgroupfs_root *root = cgrp->root;
1214 int subsys_id; 1203 int subsys_id;
1215 1204
1216 get_first_subsys(cgrp, NULL, &subsys_id); 1205 get_first_subsys(cgrp, NULL, &subsys_id);
1217 1206
1218 /* Nothing to do if the task is already in that cgroup */ 1207 /* Nothing to do if the task is already in that cgroup */
1219 oldcgrp = task_cgroup(tsk, subsys_id); 1208 oldcgrp = task_cgroup(tsk, subsys_id);
1220 if (cgrp == oldcgrp) 1209 if (cgrp == oldcgrp)
1221 return 0; 1210 return 0;
1222 1211
1223 for_each_subsys(root, ss) { 1212 for_each_subsys(root, ss) {
1224 if (ss->can_attach) { 1213 if (ss->can_attach) {
1225 retval = ss->can_attach(ss, cgrp, tsk); 1214 retval = ss->can_attach(ss, cgrp, tsk);
1226 if (retval) 1215 if (retval)
1227 return retval; 1216 return retval;
1228 } 1217 }
1229 } 1218 }
1230 1219
1231 /* 1220 /*
1232 * Locate or allocate a new css_set for this task, 1221 * Locate or allocate a new css_set for this task,
1233 * based on its final set of cgroups 1222 * based on its final set of cgroups
1234 */ 1223 */
1235 newcg = find_css_set(cg, cgrp); 1224 newcg = find_css_set(cg, cgrp);
1236 if (!newcg) 1225 if (!newcg)
1237 return -ENOMEM; 1226 return -ENOMEM;
1238 1227
1239 task_lock(tsk); 1228 task_lock(tsk);
1240 if (tsk->flags & PF_EXITING) { 1229 if (tsk->flags & PF_EXITING) {
1241 task_unlock(tsk); 1230 task_unlock(tsk);
1242 put_css_set(newcg); 1231 put_css_set(newcg);
1243 return -ESRCH; 1232 return -ESRCH;
1244 } 1233 }
1245 rcu_assign_pointer(tsk->cgroups, newcg); 1234 rcu_assign_pointer(tsk->cgroups, newcg);
1246 task_unlock(tsk); 1235 task_unlock(tsk);
1247 1236
1248 /* Update the css_set linked lists if we're using them */ 1237 /* Update the css_set linked lists if we're using them */
1249 write_lock(&css_set_lock); 1238 write_lock(&css_set_lock);
1250 if (!list_empty(&tsk->cg_list)) { 1239 if (!list_empty(&tsk->cg_list)) {
1251 list_del(&tsk->cg_list); 1240 list_del(&tsk->cg_list);
1252 list_add(&tsk->cg_list, &newcg->tasks); 1241 list_add(&tsk->cg_list, &newcg->tasks);
1253 } 1242 }
1254 write_unlock(&css_set_lock); 1243 write_unlock(&css_set_lock);
1255 1244
1256 for_each_subsys(root, ss) { 1245 for_each_subsys(root, ss) {
1257 if (ss->attach) 1246 if (ss->attach)
1258 ss->attach(ss, cgrp, oldcgrp, tsk); 1247 ss->attach(ss, cgrp, oldcgrp, tsk);
1259 } 1248 }
1260 set_bit(CGRP_RELEASABLE, &oldcgrp->flags); 1249 set_bit(CGRP_RELEASABLE, &oldcgrp->flags);
1261 synchronize_rcu(); 1250 synchronize_rcu();
1262 put_css_set(cg); 1251 put_css_set(cg);
1263 return 0; 1252 return 0;
1264 } 1253 }
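
A minimal caller sketch (error handling elided): the function must be entered with cgroup_mutex held, which the cgroup_lock()/cgroup_unlock() wrappers above provide.

	cgroup_lock();	/* cgroup_attach_task() requires cgroup_mutex */
	if (cgroup_attach_task(cgrp, current))
		pr_debug("attach failed\n");	/* hypothetical */
	cgroup_unlock();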
1265 1254
1266 /* 1255 /*
1267 * Attach task with pid 'pid' to cgroup 'cgrp'. Call with 1256 * Attach task with pid 'pid' to cgroup 'cgrp'. Call with
1268 * cgroup_mutex held; may take task_lock of the task 1257 * cgroup_mutex held; may take task_lock of the task
1269 */ 1258 */
1270 static int attach_task_by_pid(struct cgroup *cgrp, char *pidbuf) 1259 static int attach_task_by_pid(struct cgroup *cgrp, char *pidbuf)
1271 { 1260 {
1272 pid_t pid; 1261 pid_t pid;
1273 struct task_struct *tsk; 1262 struct task_struct *tsk;
1274 int ret; 1263 int ret;
1275 1264
1276 if (sscanf(pidbuf, "%d", &pid) != 1) 1265 if (sscanf(pidbuf, "%d", &pid) != 1)
1277 return -EIO; 1266 return -EIO;
1278 1267
1279 if (pid) { 1268 if (pid) {
1280 rcu_read_lock(); 1269 rcu_read_lock();
1281 tsk = find_task_by_vpid(pid); 1270 tsk = find_task_by_vpid(pid);
1282 if (!tsk || tsk->flags & PF_EXITING) { 1271 if (!tsk || tsk->flags & PF_EXITING) {
1283 rcu_read_unlock(); 1272 rcu_read_unlock();
1284 return -ESRCH; 1273 return -ESRCH;
1285 } 1274 }
1286 get_task_struct(tsk); 1275 get_task_struct(tsk);
1287 rcu_read_unlock(); 1276 rcu_read_unlock();
1288 1277
1289 if ((current->euid) && (current->euid != tsk->uid) 1278 if ((current->euid) && (current->euid != tsk->uid)
1290 && (current->euid != tsk->suid)) { 1279 && (current->euid != tsk->suid)) {
1291 put_task_struct(tsk); 1280 put_task_struct(tsk);
1292 return -EACCES; 1281 return -EACCES;
1293 } 1282 }
1294 } else { 1283 } else {
1295 tsk = current; 1284 tsk = current;
1296 get_task_struct(tsk); 1285 get_task_struct(tsk);
1297 } 1286 }
1298 1287
1299 ret = cgroup_attach_task(cgrp, tsk); 1288 ret = cgroup_attach_task(cgrp, tsk);
1300 put_task_struct(tsk); 1289 put_task_struct(tsk);
1301 return ret; 1290 return ret;
1302 } 1291 }
1303 1292
1304 /* The various types of files and directories in a cgroup file system */ 1293 /* The various types of files and directories in a cgroup file system */
1305 enum cgroup_filetype { 1294 enum cgroup_filetype {
1306 FILE_ROOT, 1295 FILE_ROOT,
1307 FILE_DIR, 1296 FILE_DIR,
1308 FILE_TASKLIST, 1297 FILE_TASKLIST,
1309 FILE_NOTIFY_ON_RELEASE, 1298 FILE_NOTIFY_ON_RELEASE,
1310 FILE_RELEASABLE,
1311 FILE_RELEASE_AGENT, 1299 FILE_RELEASE_AGENT,
1312 }; 1300 };
1313 1301
1314 static ssize_t cgroup_write_u64(struct cgroup *cgrp, struct cftype *cft, 1302 static ssize_t cgroup_write_u64(struct cgroup *cgrp, struct cftype *cft,
1315 struct file *file, 1303 struct file *file,
1316 const char __user *userbuf, 1304 const char __user *userbuf,
1317 size_t nbytes, loff_t *unused_ppos) 1305 size_t nbytes, loff_t *unused_ppos)
1318 { 1306 {
1319 char buffer[64]; 1307 char buffer[64];
1320 int retval = 0; 1308 int retval = 0;
1321 u64 val; 1309 u64 val;
1322 char *end; 1310 char *end;
1323 1311
1324 if (!nbytes) 1312 if (!nbytes)
1325 return -EINVAL; 1313 return -EINVAL;
1326 if (nbytes >= sizeof(buffer)) 1314 if (nbytes >= sizeof(buffer))
1327 return -E2BIG; 1315 return -E2BIG;
1328 if (copy_from_user(buffer, userbuf, nbytes)) 1316 if (copy_from_user(buffer, userbuf, nbytes))
1329 return -EFAULT; 1317 return -EFAULT;
1330 1318
1331 buffer[nbytes] = 0; /* nul-terminate */ 1319 buffer[nbytes] = 0; /* nul-terminate */
1332 strstrip(buffer); 1320 strstrip(buffer);
1333 val = simple_strtoull(buffer, &end, 0); 1321 val = simple_strtoull(buffer, &end, 0);
1334 if (*end) 1322 if (*end)
1335 return -EINVAL; 1323 return -EINVAL;
1336 1324
1337 /* Pass to subsystem */ 1325 /* Pass to subsystem */
1338 retval = cft->write_u64(cgrp, cft, val); 1326 retval = cft->write_u64(cgrp, cft, val);
1339 if (!retval) 1327 if (!retval)
1340 retval = nbytes; 1328 retval = nbytes;
1341 return retval; 1329 return retval;
1342 } 1330 }
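
This handler backs any control file whose cftype supplies write_u64 (and cgroup_read_u64 below does the same for read_u64). A hypothetical subsystem file accepting only 0 or 1 might look like:

	static u64 foo_enable_read(struct cgroup *cgrp, struct cftype *cft)
	{
		return 0;	/* hypothetical: report per-cgroup state */
	}

	static int foo_enable_write(struct cgroup *cgrp, struct cftype *cft,
				    u64 val)
	{
		if (val > 1)
			return -EINVAL;
		/* hypothetical: update per-cgroup state here */
		return 0;
	}

	static struct cftype cft_foo_enable = {
		.name = "foo.enable",
		.read_u64 = foo_enable_read,
		.write_u64 = foo_enable_write,
	};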
1343 1331
1344 static ssize_t cgroup_common_file_write(struct cgroup *cgrp, 1332 static ssize_t cgroup_common_file_write(struct cgroup *cgrp,
1345 struct cftype *cft, 1333 struct cftype *cft,
1346 struct file *file, 1334 struct file *file,
1347 const char __user *userbuf, 1335 const char __user *userbuf,
1348 size_t nbytes, loff_t *unused_ppos) 1336 size_t nbytes, loff_t *unused_ppos)
1349 { 1337 {
1350 enum cgroup_filetype type = cft->private; 1338 enum cgroup_filetype type = cft->private;
1351 char *buffer; 1339 char *buffer;
1352 int retval = 0; 1340 int retval = 0;
1353 1341
1354 if (nbytes >= PATH_MAX) 1342 if (nbytes >= PATH_MAX)
1355 return -E2BIG; 1343 return -E2BIG;
1356 1344
1357 /* +1 for nul-terminator */ 1345 /* +1 for nul-terminator */
1358 buffer = kmalloc(nbytes + 1, GFP_KERNEL); 1346 buffer = kmalloc(nbytes + 1, GFP_KERNEL);
1359 if (buffer == NULL) 1347 if (buffer == NULL)
1360 return -ENOMEM; 1348 return -ENOMEM;
1361 1349
1362 if (copy_from_user(buffer, userbuf, nbytes)) { 1350 if (copy_from_user(buffer, userbuf, nbytes)) {
1363 retval = -EFAULT; 1351 retval = -EFAULT;
1364 goto out1; 1352 goto out1;
1365 } 1353 }
1366 buffer[nbytes] = 0; /* nul-terminate */ 1354 buffer[nbytes] = 0; /* nul-terminate */
1367 strstrip(buffer); /* strip -just- trailing whitespace */ 1355 strstrip(buffer); /* strip -just- trailing whitespace */
1368 1356
1369 mutex_lock(&cgroup_mutex); 1357 mutex_lock(&cgroup_mutex);
1370 1358
1371 /* 1359 /*
1372 * This was already checked for in cgroup_file_write(), but 1360 * This was already checked for in cgroup_file_write(), but
1373 * check again now we're holding cgroup_mutex. 1361 * check again now we're holding cgroup_mutex.
1374 */ 1362 */
1375 if (cgroup_is_removed(cgrp)) { 1363 if (cgroup_is_removed(cgrp)) {
1376 retval = -ENODEV; 1364 retval = -ENODEV;
1377 goto out2; 1365 goto out2;
1378 } 1366 }
1379 1367
1380 switch (type) { 1368 switch (type) {
1381 case FILE_TASKLIST: 1369 case FILE_TASKLIST:
1382 retval = attach_task_by_pid(cgrp, buffer); 1370 retval = attach_task_by_pid(cgrp, buffer);
1383 break; 1371 break;
1384 case FILE_NOTIFY_ON_RELEASE: 1372 case FILE_NOTIFY_ON_RELEASE:
1385 clear_bit(CGRP_RELEASABLE, &cgrp->flags); 1373 clear_bit(CGRP_RELEASABLE, &cgrp->flags);
1386 if (simple_strtoul(buffer, NULL, 10) != 0) 1374 if (simple_strtoul(buffer, NULL, 10) != 0)
1387 set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); 1375 set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
1388 else 1376 else
1389 clear_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); 1377 clear_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
1390 break; 1378 break;
1391 case FILE_RELEASE_AGENT: 1379 case FILE_RELEASE_AGENT:
1392 BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX); 1380 BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX);
1393 strcpy(cgrp->root->release_agent_path, buffer); 1381 strcpy(cgrp->root->release_agent_path, buffer);
1394 break; 1382 break;
1395 default: 1383 default:
1396 retval = -EINVAL; 1384 retval = -EINVAL;
1397 goto out2; 1385 goto out2;
1398 } 1386 }
1399 1387
1400 if (retval == 0) 1388 if (retval == 0)
1401 retval = nbytes; 1389 retval = nbytes;
1402 out2: 1390 out2:
1403 mutex_unlock(&cgroup_mutex); 1391 mutex_unlock(&cgroup_mutex);
1404 out1: 1392 out1:
1405 kfree(buffer); 1393 kfree(buffer);
1406 return retval; 1394 return retval;
1407 } 1395 }
1408 1396
1409 static ssize_t cgroup_file_write(struct file *file, const char __user *buf, 1397 static ssize_t cgroup_file_write(struct file *file, const char __user *buf,
1410 size_t nbytes, loff_t *ppos) 1398 size_t nbytes, loff_t *ppos)
1411 { 1399 {
1412 struct cftype *cft = __d_cft(file->f_dentry); 1400 struct cftype *cft = __d_cft(file->f_dentry);
1413 struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent); 1401 struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);
1414 1402
1415 if (!cft || cgroup_is_removed(cgrp)) 1403 if (!cft || cgroup_is_removed(cgrp))
1416 return -ENODEV; 1404 return -ENODEV;
1417 if (cft->write) 1405 if (cft->write)
1418 return cft->write(cgrp, cft, file, buf, nbytes, ppos); 1406 return cft->write(cgrp, cft, file, buf, nbytes, ppos);
1419 if (cft->write_u64) 1407 if (cft->write_u64)
1420 return cgroup_write_u64(cgrp, cft, file, buf, nbytes, ppos); 1408 return cgroup_write_u64(cgrp, cft, file, buf, nbytes, ppos);
1421 return -EINVAL; 1409 return -EINVAL;
1422 } 1410 }
1423 1411
1424 static ssize_t cgroup_read_u64(struct cgroup *cgrp, struct cftype *cft, 1412 static ssize_t cgroup_read_u64(struct cgroup *cgrp, struct cftype *cft,
1425 struct file *file, 1413 struct file *file,
1426 char __user *buf, size_t nbytes, 1414 char __user *buf, size_t nbytes,
1427 loff_t *ppos) 1415 loff_t *ppos)
1428 { 1416 {
1429 char tmp[64]; 1417 char tmp[64];
1430 u64 val = cft->read_u64(cgrp, cft); 1418 u64 val = cft->read_u64(cgrp, cft);
1431 int len = sprintf(tmp, "%llu\n", (unsigned long long) val); 1419 int len = sprintf(tmp, "%llu\n", (unsigned long long) val);
1432 1420
1433 return simple_read_from_buffer(buf, nbytes, ppos, tmp, len); 1421 return simple_read_from_buffer(buf, nbytes, ppos, tmp, len);
1434 } 1422 }
1435 1423
1436 static ssize_t cgroup_common_file_read(struct cgroup *cgrp, 1424 static ssize_t cgroup_common_file_read(struct cgroup *cgrp,
1437 struct cftype *cft, 1425 struct cftype *cft,
1438 struct file *file, 1426 struct file *file,
1439 char __user *buf, 1427 char __user *buf,
1440 size_t nbytes, loff_t *ppos) 1428 size_t nbytes, loff_t *ppos)
1441 { 1429 {
1442 enum cgroup_filetype type = cft->private; 1430 enum cgroup_filetype type = cft->private;
1443 char *page; 1431 char *page;
1444 ssize_t retval = 0; 1432 ssize_t retval = 0;
1445 char *s; 1433 char *s;
1446 1434
1447 if (!(page = (char *)__get_free_page(GFP_KERNEL))) 1435 if (!(page = (char *)__get_free_page(GFP_KERNEL)))
1448 return -ENOMEM; 1436 return -ENOMEM;
1449 1437
1450 s = page; 1438 s = page;
1451 1439
1452 switch (type) { 1440 switch (type) {
1453 case FILE_RELEASE_AGENT: 1441 case FILE_RELEASE_AGENT:
1454 { 1442 {
1455 struct cgroupfs_root *root; 1443 struct cgroupfs_root *root;
1456 size_t n; 1444 size_t n;
1457 mutex_lock(&cgroup_mutex); 1445 mutex_lock(&cgroup_mutex);
1458 root = cgrp->root; 1446 root = cgrp->root;
1459 n = strnlen(root->release_agent_path, 1447 n = strnlen(root->release_agent_path,
1460 sizeof(root->release_agent_path)); 1448 sizeof(root->release_agent_path));
1461 n = min(n, (size_t) PAGE_SIZE); 1449 n = min(n, (size_t) PAGE_SIZE);
1462 strncpy(s, root->release_agent_path, n); 1450 strncpy(s, root->release_agent_path, n);
1463 mutex_unlock(&cgroup_mutex); 1451 mutex_unlock(&cgroup_mutex);
1464 s += n; 1452 s += n;
1465 break; 1453 break;
1466 } 1454 }
1467 default: 1455 default:
1468 retval = -EINVAL; 1456 retval = -EINVAL;
1469 goto out; 1457 goto out;
1470 } 1458 }
1471 *s++ = '\n'; 1459 *s++ = '\n';
1472 1460
1473 retval = simple_read_from_buffer(buf, nbytes, ppos, page, s - page); 1461 retval = simple_read_from_buffer(buf, nbytes, ppos, page, s - page);
1474 out: 1462 out:
1475 free_page((unsigned long)page); 1463 free_page((unsigned long)page);
1476 return retval; 1464 return retval;
1477 } 1465 }
1478 1466
1479 static ssize_t cgroup_file_read(struct file *file, char __user *buf, 1467 static ssize_t cgroup_file_read(struct file *file, char __user *buf,
1480 size_t nbytes, loff_t *ppos) 1468 size_t nbytes, loff_t *ppos)
1481 { 1469 {
1482 struct cftype *cft = __d_cft(file->f_dentry); 1470 struct cftype *cft = __d_cft(file->f_dentry);
1483 struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent); 1471 struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);
1484 1472
1485 if (!cft || cgroup_is_removed(cgrp)) 1473 if (!cft || cgroup_is_removed(cgrp))
1486 return -ENODEV; 1474 return -ENODEV;
1487 1475
1488 if (cft->read) 1476 if (cft->read)
1489 return cft->read(cgrp, cft, file, buf, nbytes, ppos); 1477 return cft->read(cgrp, cft, file, buf, nbytes, ppos);
1490 if (cft->read_u64) 1478 if (cft->read_u64)
1491 return cgroup_read_u64(cgrp, cft, file, buf, nbytes, ppos); 1479 return cgroup_read_u64(cgrp, cft, file, buf, nbytes, ppos);
1492 return -EINVAL; 1480 return -EINVAL;
1493 } 1481 }
1494 1482
1495 /* 1483 /*
1496 * seqfile ops/methods for returning structured data. Currently just 1484 * seqfile ops/methods for returning structured data. Currently just
1497 * supports string->u64 maps, but can be extended in the future. 1485 * supports string->u64 maps, but can be extended in the future.
1498 */ 1486 */
1499 1487
1500 struct cgroup_seqfile_state { 1488 struct cgroup_seqfile_state {
1501 struct cftype *cft; 1489 struct cftype *cft;
1502 struct cgroup *cgroup; 1490 struct cgroup *cgroup;
1503 }; 1491 };
1504 1492
1505 static int cgroup_map_add(struct cgroup_map_cb *cb, const char *key, u64 value) 1493 static int cgroup_map_add(struct cgroup_map_cb *cb, const char *key, u64 value)
1506 { 1494 {
1507 struct seq_file *sf = cb->state; 1495 struct seq_file *sf = cb->state;
1508 return seq_printf(sf, "%s %llu\n", key, (unsigned long long)value); 1496 return seq_printf(sf, "%s %llu\n", key, (unsigned long long)value);
1509 } 1497 }
1510 1498
1511 static int cgroup_seqfile_show(struct seq_file *m, void *arg) 1499 static int cgroup_seqfile_show(struct seq_file *m, void *arg)
1512 { 1500 {
1513 struct cgroup_seqfile_state *state = m->private; 1501 struct cgroup_seqfile_state *state = m->private;
1514 struct cftype *cft = state->cft; 1502 struct cftype *cft = state->cft;
1515 struct cgroup_map_cb cb = { 1503 struct cgroup_map_cb cb = {
1516 .fill = cgroup_map_add, 1504 .fill = cgroup_map_add,
1517 .state = m, 1505 .state = m,
1518 }; 1506 };
1519 return cft->read_map(state->cgroup, cft, &cb); 1507 return cft->read_map(state->cgroup, cft, &cb);
1520 } 1508 }
1521 1509
1522 int cgroup_seqfile_release(struct inode *inode, struct file *file) 1510 int cgroup_seqfile_release(struct inode *inode, struct file *file)
1523 { 1511 {
1524 struct seq_file *seq = file->private_data; 1512 struct seq_file *seq = file->private_data;
1525 kfree(seq->private); 1513 kfree(seq->private);
1526 return single_release(inode, file); 1514 return single_release(inode, file);
1527 } 1515 }
1528 1516
1529 static struct file_operations cgroup_seqfile_operations = { 1517 static struct file_operations cgroup_seqfile_operations = {
1530 .read = seq_read, 1518 .read = seq_read,
1531 .llseek = seq_lseek, 1519 .llseek = seq_lseek,
1532 .release = cgroup_seqfile_release, 1520 .release = cgroup_seqfile_release,
1533 }; 1521 };
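
A subsystem opts into this seqfile path simply by providing read_map in its cftype; cgroup_file_open() below then swaps in cgroup_seqfile_operations, and each cb->fill() call becomes one "key value" line via cgroup_map_add(). A hypothetical example:

	static int foo_stat_read_map(struct cgroup *cgrp, struct cftype *cft,
				     struct cgroup_map_cb *cb)
	{
		cb->fill(cb, "hits", 42);	/* shown as "hits 42" */
		cb->fill(cb, "misses", 7);	/* shown as "misses 7" */
		return 0;
	}

	static struct cftype cft_foo_stat = {
		.name = "foo.stat",
		.read_map = foo_stat_read_map,
	};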
1534 1522
1535 static int cgroup_file_open(struct inode *inode, struct file *file) 1523 static int cgroup_file_open(struct inode *inode, struct file *file)
1536 { 1524 {
1537 int err; 1525 int err;
1538 struct cftype *cft; 1526 struct cftype *cft;
1539 1527
1540 err = generic_file_open(inode, file); 1528 err = generic_file_open(inode, file);
1541 if (err) 1529 if (err)
1542 return err; 1530 return err;
1543 1531
1544 cft = __d_cft(file->f_dentry); 1532 cft = __d_cft(file->f_dentry);
1545 if (!cft) 1533 if (!cft)
1546 return -ENODEV; 1534 return -ENODEV;
1547 if (cft->read_map) { 1535 if (cft->read_map) {
1548 struct cgroup_seqfile_state *state = 1536 struct cgroup_seqfile_state *state =
1549 kzalloc(sizeof(*state), GFP_USER); 1537 kzalloc(sizeof(*state), GFP_USER);
1550 if (!state) 1538 if (!state)
1551 return -ENOMEM; 1539 return -ENOMEM;
1552 state->cft = cft; 1540 state->cft = cft;
1553 state->cgroup = __d_cgrp(file->f_dentry->d_parent); 1541 state->cgroup = __d_cgrp(file->f_dentry->d_parent);
1554 file->f_op = &cgroup_seqfile_operations; 1542 file->f_op = &cgroup_seqfile_operations;
1555 err = single_open(file, cgroup_seqfile_show, state); 1543 err = single_open(file, cgroup_seqfile_show, state);
1556 if (err < 0) 1544 if (err < 0)
1557 kfree(state); 1545 kfree(state);
1558 } else if (cft->open) 1546 } else if (cft->open)
1559 err = cft->open(inode, file); 1547 err = cft->open(inode, file);
1560 else 1548 else
1561 err = 0; 1549 err = 0;
1562 1550
1563 return err; 1551 return err;
1564 } 1552 }
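
For structured files, a subsystem supplies read_map() instead, and cgroup_file_open() above transparently swaps in the seqfile operations. A sketch of such a callback, assuming a hypothetical "ex_" subsystem; only cgroup_map_cb and its fill() hook come from the framework shown here:

	static int ex_stats_read_map(struct cgroup *cgrp, struct cftype *cft,
				     struct cgroup_map_cb *cb)
	{
		/* each fill() call becomes one "key value" output line */
		cb->fill(cb, "widgets", 42);
		cb->fill(cb, "gadgets", 7);
		return 0;
	}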
1565 1553
1566 static int cgroup_file_release(struct inode *inode, struct file *file) 1554 static int cgroup_file_release(struct inode *inode, struct file *file)
1567 { 1555 {
1568 struct cftype *cft = __d_cft(file->f_dentry); 1556 struct cftype *cft = __d_cft(file->f_dentry);
1569 if (cft->release) 1557 if (cft->release)
1570 return cft->release(inode, file); 1558 return cft->release(inode, file);
1571 return 0; 1559 return 0;
1572 } 1560 }
1573 1561
1574 /* 1562 /*
1575 * cgroup_rename - Only allow simple rename of directories in place. 1563 * cgroup_rename - Only allow simple rename of directories in place.
1576 */ 1564 */
1577 static int cgroup_rename(struct inode *old_dir, struct dentry *old_dentry, 1565 static int cgroup_rename(struct inode *old_dir, struct dentry *old_dentry,
1578 struct inode *new_dir, struct dentry *new_dentry) 1566 struct inode *new_dir, struct dentry *new_dentry)
1579 { 1567 {
1580 if (!S_ISDIR(old_dentry->d_inode->i_mode)) 1568 if (!S_ISDIR(old_dentry->d_inode->i_mode))
1581 return -ENOTDIR; 1569 return -ENOTDIR;
1582 if (new_dentry->d_inode) 1570 if (new_dentry->d_inode)
1583 return -EEXIST; 1571 return -EEXIST;
1584 if (old_dir != new_dir) 1572 if (old_dir != new_dir)
1585 return -EIO; 1573 return -EIO;
1586 return simple_rename(old_dir, old_dentry, new_dir, new_dentry); 1574 return simple_rename(old_dir, old_dentry, new_dir, new_dentry);
1587 } 1575 }
1588 1576
1589 static struct file_operations cgroup_file_operations = { 1577 static struct file_operations cgroup_file_operations = {
1590 .read = cgroup_file_read, 1578 .read = cgroup_file_read,
1591 .write = cgroup_file_write, 1579 .write = cgroup_file_write,
1592 .llseek = generic_file_llseek, 1580 .llseek = generic_file_llseek,
1593 .open = cgroup_file_open, 1581 .open = cgroup_file_open,
1594 .release = cgroup_file_release, 1582 .release = cgroup_file_release,
1595 }; 1583 };
1596 1584
1597 static struct inode_operations cgroup_dir_inode_operations = { 1585 static struct inode_operations cgroup_dir_inode_operations = {
1598 .lookup = simple_lookup, 1586 .lookup = simple_lookup,
1599 .mkdir = cgroup_mkdir, 1587 .mkdir = cgroup_mkdir,
1600 .rmdir = cgroup_rmdir, 1588 .rmdir = cgroup_rmdir,
1601 .rename = cgroup_rename, 1589 .rename = cgroup_rename,
1602 }; 1590 };
1603 1591
1604 static int cgroup_create_file(struct dentry *dentry, int mode, 1592 static int cgroup_create_file(struct dentry *dentry, int mode,
1605 struct super_block *sb) 1593 struct super_block *sb)
1606 { 1594 {
1607 static struct dentry_operations cgroup_dops = { 1595 static struct dentry_operations cgroup_dops = {
1608 .d_iput = cgroup_diput, 1596 .d_iput = cgroup_diput,
1609 }; 1597 };
1610 1598
1611 struct inode *inode; 1599 struct inode *inode;
1612 1600
1613 if (!dentry) 1601 if (!dentry)
1614 return -ENOENT; 1602 return -ENOENT;
1615 if (dentry->d_inode) 1603 if (dentry->d_inode)
1616 return -EEXIST; 1604 return -EEXIST;
1617 1605
1618 inode = cgroup_new_inode(mode, sb); 1606 inode = cgroup_new_inode(mode, sb);
1619 if (!inode) 1607 if (!inode)
1620 return -ENOMEM; 1608 return -ENOMEM;
1621 1609
1622 if (S_ISDIR(mode)) { 1610 if (S_ISDIR(mode)) {
1623 inode->i_op = &cgroup_dir_inode_operations; 1611 inode->i_op = &cgroup_dir_inode_operations;
1624 inode->i_fop = &simple_dir_operations; 1612 inode->i_fop = &simple_dir_operations;
1625 1613
1626 /* start off with i_nlink == 2 (for "." entry) */ 1614 /* start off with i_nlink == 2 (for "." entry) */
1627 inc_nlink(inode); 1615 inc_nlink(inode);
1628 1616
1629 /* start with the directory inode held, so that we can 1617 /* start with the directory inode held, so that we can
1630 * populate it without racing with another mkdir */ 1618 * populate it without racing with another mkdir */
1631 mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD); 1619 mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD);
1632 } else if (S_ISREG(mode)) { 1620 } else if (S_ISREG(mode)) {
1633 inode->i_size = 0; 1621 inode->i_size = 0;
1634 inode->i_fop = &cgroup_file_operations; 1622 inode->i_fop = &cgroup_file_operations;
1635 } 1623 }
1636 dentry->d_op = &cgroup_dops; 1624 dentry->d_op = &cgroup_dops;
1637 d_instantiate(dentry, inode); 1625 d_instantiate(dentry, inode);
1638 dget(dentry); /* Extra count - pin the dentry in core */ 1626 dget(dentry); /* Extra count - pin the dentry in core */
1639 return 0; 1627 return 0;
1640 } 1628 }
1641 1629
1642 /* 1630 /*
1643 * cgroup_create_dir - create a directory for an object. 1631 * cgroup_create_dir - create a directory for an object.
1644 * @cgrp: the cgroup we create the directory for. It must have a valid 1632 * @cgrp: the cgroup we create the directory for. It must have a valid
 1645 * ->parent field; its ->dentry field is filled in here. 1633 * ->parent field; its ->dentry field is filled in here.
1646 * @dentry: dentry of the new cgroup 1634 * @dentry: dentry of the new cgroup
1647 * @mode: mode to set on new directory. 1635 * @mode: mode to set on new directory.
1648 */ 1636 */
1649 static int cgroup_create_dir(struct cgroup *cgrp, struct dentry *dentry, 1637 static int cgroup_create_dir(struct cgroup *cgrp, struct dentry *dentry,
1650 int mode) 1638 int mode)
1651 { 1639 {
1652 struct dentry *parent; 1640 struct dentry *parent;
1653 int error = 0; 1641 int error = 0;
1654 1642
1655 parent = cgrp->parent->dentry; 1643 parent = cgrp->parent->dentry;
1656 error = cgroup_create_file(dentry, S_IFDIR | mode, cgrp->root->sb); 1644 error = cgroup_create_file(dentry, S_IFDIR | mode, cgrp->root->sb);
1657 if (!error) { 1645 if (!error) {
1658 dentry->d_fsdata = cgrp; 1646 dentry->d_fsdata = cgrp;
1659 inc_nlink(parent->d_inode); 1647 inc_nlink(parent->d_inode);
1660 cgrp->dentry = dentry; 1648 cgrp->dentry = dentry;
1661 dget(dentry); 1649 dget(dentry);
1662 } 1650 }
1663 dput(dentry); 1651 dput(dentry);
1664 1652
1665 return error; 1653 return error;
1666 } 1654 }
1667 1655
1668 int cgroup_add_file(struct cgroup *cgrp, 1656 int cgroup_add_file(struct cgroup *cgrp,
1669 struct cgroup_subsys *subsys, 1657 struct cgroup_subsys *subsys,
1670 const struct cftype *cft) 1658 const struct cftype *cft)
1671 { 1659 {
1672 struct dentry *dir = cgrp->dentry; 1660 struct dentry *dir = cgrp->dentry;
1673 struct dentry *dentry; 1661 struct dentry *dentry;
1674 int error; 1662 int error;
1675 1663
1676 char name[MAX_CGROUP_TYPE_NAMELEN + MAX_CFTYPE_NAME + 2] = { 0 }; 1664 char name[MAX_CGROUP_TYPE_NAMELEN + MAX_CFTYPE_NAME + 2] = { 0 };
1677 if (subsys && !test_bit(ROOT_NOPREFIX, &cgrp->root->flags)) { 1665 if (subsys && !test_bit(ROOT_NOPREFIX, &cgrp->root->flags)) {
1678 strcpy(name, subsys->name); 1666 strcpy(name, subsys->name);
1679 strcat(name, "."); 1667 strcat(name, ".");
1680 } 1668 }
1681 strcat(name, cft->name); 1669 strcat(name, cft->name);
1682 BUG_ON(!mutex_is_locked(&dir->d_inode->i_mutex)); 1670 BUG_ON(!mutex_is_locked(&dir->d_inode->i_mutex));
1683 dentry = lookup_one_len(name, dir, strlen(name)); 1671 dentry = lookup_one_len(name, dir, strlen(name));
1684 if (!IS_ERR(dentry)) { 1672 if (!IS_ERR(dentry)) {
1685 error = cgroup_create_file(dentry, 0644 | S_IFREG, 1673 error = cgroup_create_file(dentry, 0644 | S_IFREG,
1686 cgrp->root->sb); 1674 cgrp->root->sb);
1687 if (!error) 1675 if (!error)
1688 dentry->d_fsdata = (void *)cft; 1676 dentry->d_fsdata = (void *)cft;
1689 dput(dentry); 1677 dput(dentry);
1690 } else 1678 } else
1691 error = PTR_ERR(dentry); 1679 error = PTR_ERR(dentry);
1692 return error; 1680 return error;
1693 } 1681 }
1694 1682
1695 int cgroup_add_files(struct cgroup *cgrp, 1683 int cgroup_add_files(struct cgroup *cgrp,
1696 struct cgroup_subsys *subsys, 1684 struct cgroup_subsys *subsys,
1697 const struct cftype cft[], 1685 const struct cftype cft[],
1698 int count) 1686 int count)
1699 { 1687 {
1700 int i, err; 1688 int i, err;
1701 for (i = 0; i < count; i++) { 1689 for (i = 0; i < count; i++) {
1702 err = cgroup_add_file(cgrp, subsys, &cft[i]); 1690 err = cgroup_add_file(cgrp, subsys, &cft[i]);
1703 if (err) 1691 if (err)
1704 return err; 1692 return err;
1705 } 1693 }
1706 return 0; 1694 return 0;
1707 } 1695 }
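
Subsystems typically call cgroup_add_files() from their populate() callback. A sketch, reusing the hypothetical ex_task_count_read() from the earlier example:

	static struct cftype ex_files[] = {
		{
			.name = "count",
			.read_u64 = ex_task_count_read,
		},
	};

	static int ex_populate(struct cgroup_subsys *ss, struct cgroup *cgrp)
	{
		/* appears as "<ss->name>.count" unless the hierarchy
		 * was mounted with the noprefix option */
		return cgroup_add_files(cgrp, ss, ex_files,
					ARRAY_SIZE(ex_files));
	}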
1708 1696
1709 /** 1697 /**
1710 * cgroup_task_count - count the number of tasks in a cgroup. 1698 * cgroup_task_count - count the number of tasks in a cgroup.
1711 * @cgrp: the cgroup in question 1699 * @cgrp: the cgroup in question
1712 * 1700 *
1713 * Return the number of tasks in the cgroup. 1701 * Return the number of tasks in the cgroup.
1714 */ 1702 */
1715 int cgroup_task_count(const struct cgroup *cgrp) 1703 int cgroup_task_count(const struct cgroup *cgrp)
1716 { 1704 {
1717 int count = 0; 1705 int count = 0;
1718 struct list_head *l; 1706 struct list_head *l;
1719 1707
1720 read_lock(&css_set_lock); 1708 read_lock(&css_set_lock);
1721 l = cgrp->css_sets.next; 1709 l = cgrp->css_sets.next;
1722 while (l != &cgrp->css_sets) { 1710 while (l != &cgrp->css_sets) {
1723 struct cg_cgroup_link *link = 1711 struct cg_cgroup_link *link =
1724 list_entry(l, struct cg_cgroup_link, cgrp_link_list); 1712 list_entry(l, struct cg_cgroup_link, cgrp_link_list);
1725 count += atomic_read(&link->cg->ref.refcount); 1713 count += atomic_read(&link->cg->ref.refcount);
1726 l = l->next; 1714 l = l->next;
1727 } 1715 }
1728 read_unlock(&css_set_lock); 1716 read_unlock(&css_set_lock);
1729 return count; 1717 return count;
1730 } 1718 }
1731 1719
1732 /* 1720 /*
1733 * Advance a list_head iterator. The iterator should be positioned at 1721 * Advance a list_head iterator. The iterator should be positioned at
1734 * the start of a css_set 1722 * the start of a css_set
1735 */ 1723 */
1736 static void cgroup_advance_iter(struct cgroup *cgrp, 1724 static void cgroup_advance_iter(struct cgroup *cgrp,
1737 struct cgroup_iter *it) 1725 struct cgroup_iter *it)
1738 { 1726 {
1739 struct list_head *l = it->cg_link; 1727 struct list_head *l = it->cg_link;
1740 struct cg_cgroup_link *link; 1728 struct cg_cgroup_link *link;
1741 struct css_set *cg; 1729 struct css_set *cg;
1742 1730
1743 /* Advance to the next non-empty css_set */ 1731 /* Advance to the next non-empty css_set */
1744 do { 1732 do {
1745 l = l->next; 1733 l = l->next;
1746 if (l == &cgrp->css_sets) { 1734 if (l == &cgrp->css_sets) {
1747 it->cg_link = NULL; 1735 it->cg_link = NULL;
1748 return; 1736 return;
1749 } 1737 }
1750 link = list_entry(l, struct cg_cgroup_link, cgrp_link_list); 1738 link = list_entry(l, struct cg_cgroup_link, cgrp_link_list);
1751 cg = link->cg; 1739 cg = link->cg;
1752 } while (list_empty(&cg->tasks)); 1740 } while (list_empty(&cg->tasks));
1753 it->cg_link = l; 1741 it->cg_link = l;
1754 it->task = cg->tasks.next; 1742 it->task = cg->tasks.next;
1755 } 1743 }
1756 1744
1757 /* 1745 /*
1758 * To reduce the fork() overhead for systems that are not actually 1746 * To reduce the fork() overhead for systems that are not actually
1759 * using their cgroups capability, we don't maintain the lists running 1747 * using their cgroups capability, we don't maintain the lists running
1760 * through each css_set to its tasks until we see the list actually 1748 * through each css_set to its tasks until we see the list actually
1761 * used - in other words after the first call to cgroup_iter_start(). 1749 * used - in other words after the first call to cgroup_iter_start().
1762 * 1750 *
1763 * The tasklist_lock is not held here, as do_each_thread() and 1751 * The tasklist_lock is not held here, as do_each_thread() and
1764 * while_each_thread() are protected by RCU. 1752 * while_each_thread() are protected by RCU.
1765 */ 1753 */
1766 static void cgroup_enable_task_cg_lists(void) 1754 static void cgroup_enable_task_cg_lists(void)
1767 { 1755 {
1768 struct task_struct *p, *g; 1756 struct task_struct *p, *g;
1769 write_lock(&css_set_lock); 1757 write_lock(&css_set_lock);
1770 use_task_css_set_links = 1; 1758 use_task_css_set_links = 1;
1771 do_each_thread(g, p) { 1759 do_each_thread(g, p) {
1772 task_lock(p); 1760 task_lock(p);
1773 /* 1761 /*
1774 * We should check if the process is exiting, otherwise 1762 * We should check if the process is exiting, otherwise
 1775 * it will race with cgroup_exit(): the list entry 1763 * it will race with cgroup_exit(): the list entry
 1776 * won't be deleted even though the process has exited. 1764 * won't be deleted even though the process has exited.
1777 */ 1765 */
1778 if (!(p->flags & PF_EXITING) && list_empty(&p->cg_list)) 1766 if (!(p->flags & PF_EXITING) && list_empty(&p->cg_list))
1779 list_add(&p->cg_list, &p->cgroups->tasks); 1767 list_add(&p->cg_list, &p->cgroups->tasks);
1780 task_unlock(p); 1768 task_unlock(p);
1781 } while_each_thread(g, p); 1769 } while_each_thread(g, p);
1782 write_unlock(&css_set_lock); 1770 write_unlock(&css_set_lock);
1783 } 1771 }
1784 1772
1785 void cgroup_iter_start(struct cgroup *cgrp, struct cgroup_iter *it) 1773 void cgroup_iter_start(struct cgroup *cgrp, struct cgroup_iter *it)
1786 { 1774 {
1787 /* 1775 /*
1788 * The first time anyone tries to iterate across a cgroup, 1776 * The first time anyone tries to iterate across a cgroup,
1789 * we need to enable the list linking each css_set to its 1777 * we need to enable the list linking each css_set to its
1790 * tasks, and fix up all existing tasks. 1778 * tasks, and fix up all existing tasks.
1791 */ 1779 */
1792 if (!use_task_css_set_links) 1780 if (!use_task_css_set_links)
1793 cgroup_enable_task_cg_lists(); 1781 cgroup_enable_task_cg_lists();
1794 1782
1795 read_lock(&css_set_lock); 1783 read_lock(&css_set_lock);
1796 it->cg_link = &cgrp->css_sets; 1784 it->cg_link = &cgrp->css_sets;
1797 cgroup_advance_iter(cgrp, it); 1785 cgroup_advance_iter(cgrp, it);
1798 } 1786 }
1799 1787
1800 struct task_struct *cgroup_iter_next(struct cgroup *cgrp, 1788 struct task_struct *cgroup_iter_next(struct cgroup *cgrp,
1801 struct cgroup_iter *it) 1789 struct cgroup_iter *it)
1802 { 1790 {
1803 struct task_struct *res; 1791 struct task_struct *res;
1804 struct list_head *l = it->task; 1792 struct list_head *l = it->task;
1805 1793
 1806 /* If the iterator's cg_link is NULL, we have no tasks */ 1794 /* If the iterator's cg_link is NULL, we have no tasks */
1807 if (!it->cg_link) 1795 if (!it->cg_link)
1808 return NULL; 1796 return NULL;
1809 res = list_entry(l, struct task_struct, cg_list); 1797 res = list_entry(l, struct task_struct, cg_list);
1810 /* Advance iterator to find next entry */ 1798 /* Advance iterator to find next entry */
1811 l = l->next; 1799 l = l->next;
1812 if (l == &res->cgroups->tasks) { 1800 if (l == &res->cgroups->tasks) {
1813 /* We reached the end of this task list - move on to 1801 /* We reached the end of this task list - move on to
1814 * the next cg_cgroup_link */ 1802 * the next cg_cgroup_link */
1815 cgroup_advance_iter(cgrp, it); 1803 cgroup_advance_iter(cgrp, it);
1816 } else { 1804 } else {
1817 it->task = l; 1805 it->task = l;
1818 } 1806 }
1819 return res; 1807 return res;
1820 } 1808 }
1821 1809
1822 void cgroup_iter_end(struct cgroup *cgrp, struct cgroup_iter *it) 1810 void cgroup_iter_end(struct cgroup *cgrp, struct cgroup_iter *it)
1823 { 1811 {
1824 read_unlock(&css_set_lock); 1812 read_unlock(&css_set_lock);
1825 } 1813 }
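
Taken together, the three functions above form the task-iteration API. A sketch of the intended calling pattern; do_something_cheap() is a hypothetical stand-in:

	struct cgroup_iter it;
	struct task_struct *task;

	cgroup_iter_start(cgrp, &it);
	while ((task = cgroup_iter_next(cgrp, &it)))
		do_something_cheap(task);
	cgroup_iter_end(cgrp, &it);

Note that css_set_lock is read-held from start to end, so the loop body must be fast and must not sleep; callers that need to do real per-task work should use cgroup_scan_tasks() below instead.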
1826 1814
1827 static inline int started_after_time(struct task_struct *t1, 1815 static inline int started_after_time(struct task_struct *t1,
1828 struct timespec *time, 1816 struct timespec *time,
1829 struct task_struct *t2) 1817 struct task_struct *t2)
1830 { 1818 {
1831 int start_diff = timespec_compare(&t1->start_time, time); 1819 int start_diff = timespec_compare(&t1->start_time, time);
1832 if (start_diff > 0) { 1820 if (start_diff > 0) {
1833 return 1; 1821 return 1;
1834 } else if (start_diff < 0) { 1822 } else if (start_diff < 0) {
1835 return 0; 1823 return 0;
1836 } else { 1824 } else {
1837 /* 1825 /*
1838 * Arbitrarily, if two processes started at the same 1826 * Arbitrarily, if two processes started at the same
1839 * time, we'll say that the lower pointer value 1827 * time, we'll say that the lower pointer value
1840 * started first. Note that t2 may have exited by now 1828 * started first. Note that t2 may have exited by now
1841 * so this may not be a valid pointer any longer, but 1829 * so this may not be a valid pointer any longer, but
1842 * that's fine - it still serves to distinguish 1830 * that's fine - it still serves to distinguish
1843 * between two tasks started (effectively) simultaneously. 1831 * between two tasks started (effectively) simultaneously.
1844 */ 1832 */
1845 return t1 > t2; 1833 return t1 > t2;
1846 } 1834 }
1847 } 1835 }
1848 1836
1849 /* 1837 /*
1850 * This function is a callback from heap_insert() and is used to order 1838 * This function is a callback from heap_insert() and is used to order
1851 * the heap. 1839 * the heap.
1852 * In this case we order the heap in descending task start time. 1840 * In this case we order the heap in descending task start time.
1853 */ 1841 */
1854 static inline int started_after(void *p1, void *p2) 1842 static inline int started_after(void *p1, void *p2)
1855 { 1843 {
1856 struct task_struct *t1 = p1; 1844 struct task_struct *t1 = p1;
1857 struct task_struct *t2 = p2; 1845 struct task_struct *t2 = p2;
1858 return started_after_time(t1, &t2->start_time, t2); 1846 return started_after_time(t1, &t2->start_time, t2);
1859 } 1847 }
1860 1848
1861 /** 1849 /**
 1862 * cgroup_scan_tasks - iterate through all the tasks in a cgroup 1850 * cgroup_scan_tasks - iterate through all the tasks in a cgroup
1863 * @scan: struct cgroup_scanner containing arguments for the scan 1851 * @scan: struct cgroup_scanner containing arguments for the scan
1864 * 1852 *
1865 * Arguments include pointers to callback functions test_task() and 1853 * Arguments include pointers to callback functions test_task() and
1866 * process_task(). 1854 * process_task().
1867 * Iterate through all the tasks in a cgroup, calling test_task() for each, 1855 * Iterate through all the tasks in a cgroup, calling test_task() for each,
1868 * and if it returns true, call process_task() for it also. 1856 * and if it returns true, call process_task() for it also.
1869 * The test_task pointer may be NULL, meaning always true (select all tasks). 1857 * The test_task pointer may be NULL, meaning always true (select all tasks).
1870 * Effectively duplicates cgroup_iter_{start,next,end}() 1858 * Effectively duplicates cgroup_iter_{start,next,end}()
1871 * but does not lock css_set_lock for the call to process_task(). 1859 * but does not lock css_set_lock for the call to process_task().
1872 * The struct cgroup_scanner may be embedded in any structure of the caller's 1860 * The struct cgroup_scanner may be embedded in any structure of the caller's
1873 * creation. 1861 * creation.
1874 * It is guaranteed that process_task() will act on every task that 1862 * It is guaranteed that process_task() will act on every task that
1875 * is a member of the cgroup for the duration of this call. This 1863 * is a member of the cgroup for the duration of this call. This
1876 * function may or may not call process_task() for tasks that exit 1864 * function may or may not call process_task() for tasks that exit
1877 * or move to a different cgroup during the call, or are forked or 1865 * or move to a different cgroup during the call, or are forked or
1878 * move into the cgroup during the call. 1866 * move into the cgroup during the call.
1879 * 1867 *
1880 * Note that test_task() may be called with locks held, and may in some 1868 * Note that test_task() may be called with locks held, and may in some
1881 * situations be called multiple times for the same task, so it should 1869 * situations be called multiple times for the same task, so it should
1882 * be cheap. 1870 * be cheap.
1883 * If the heap pointer in the struct cgroup_scanner is non-NULL, a heap has been 1871 * If the heap pointer in the struct cgroup_scanner is non-NULL, a heap has been
1884 * pre-allocated and will be used for heap operations (and its "gt" member will 1872 * pre-allocated and will be used for heap operations (and its "gt" member will
1885 * be overwritten), else a temporary heap will be used (allocation of which 1873 * be overwritten), else a temporary heap will be used (allocation of which
1886 * may cause this function to fail). 1874 * may cause this function to fail).
1887 */ 1875 */
1888 int cgroup_scan_tasks(struct cgroup_scanner *scan) 1876 int cgroup_scan_tasks(struct cgroup_scanner *scan)
1889 { 1877 {
1890 int retval, i; 1878 int retval, i;
1891 struct cgroup_iter it; 1879 struct cgroup_iter it;
1892 struct task_struct *p, *dropped; 1880 struct task_struct *p, *dropped;
1893 /* Never dereference latest_task, since it's not refcounted */ 1881 /* Never dereference latest_task, since it's not refcounted */
1894 struct task_struct *latest_task = NULL; 1882 struct task_struct *latest_task = NULL;
1895 struct ptr_heap tmp_heap; 1883 struct ptr_heap tmp_heap;
1896 struct ptr_heap *heap; 1884 struct ptr_heap *heap;
1897 struct timespec latest_time = { 0, 0 }; 1885 struct timespec latest_time = { 0, 0 };
1898 1886
1899 if (scan->heap) { 1887 if (scan->heap) {
1900 /* The caller supplied our heap and pre-allocated its memory */ 1888 /* The caller supplied our heap and pre-allocated its memory */
1901 heap = scan->heap; 1889 heap = scan->heap;
1902 heap->gt = &started_after; 1890 heap->gt = &started_after;
1903 } else { 1891 } else {
1904 /* We need to allocate our own heap memory */ 1892 /* We need to allocate our own heap memory */
1905 heap = &tmp_heap; 1893 heap = &tmp_heap;
1906 retval = heap_init(heap, PAGE_SIZE, GFP_KERNEL, &started_after); 1894 retval = heap_init(heap, PAGE_SIZE, GFP_KERNEL, &started_after);
1907 if (retval) 1895 if (retval)
1908 /* cannot allocate the heap */ 1896 /* cannot allocate the heap */
1909 return retval; 1897 return retval;
1910 } 1898 }
1911 1899
1912 again: 1900 again:
1913 /* 1901 /*
1914 * Scan tasks in the cgroup, using the scanner's "test_task" callback 1902 * Scan tasks in the cgroup, using the scanner's "test_task" callback
1915 * to determine which are of interest, and using the scanner's 1903 * to determine which are of interest, and using the scanner's
1916 * "process_task" callback to process any of them that need an update. 1904 * "process_task" callback to process any of them that need an update.
1917 * Since we don't want to hold any locks during the task updates, 1905 * Since we don't want to hold any locks during the task updates,
1918 * gather tasks to be processed in a heap structure. 1906 * gather tasks to be processed in a heap structure.
1919 * The heap is sorted by descending task start time. 1907 * The heap is sorted by descending task start time.
1920 * If the statically-sized heap fills up, we overflow tasks that 1908 * If the statically-sized heap fills up, we overflow tasks that
1921 * started later, and in future iterations only consider tasks that 1909 * started later, and in future iterations only consider tasks that
1922 * started after the latest task in the previous pass. This 1910 * started after the latest task in the previous pass. This
1923 * guarantees forward progress and that we don't miss any tasks. 1911 * guarantees forward progress and that we don't miss any tasks.
1924 */ 1912 */
1925 heap->size = 0; 1913 heap->size = 0;
1926 cgroup_iter_start(scan->cg, &it); 1914 cgroup_iter_start(scan->cg, &it);
1927 while ((p = cgroup_iter_next(scan->cg, &it))) { 1915 while ((p = cgroup_iter_next(scan->cg, &it))) {
1928 /* 1916 /*
1929 * Only affect tasks that qualify per the caller's callback, 1917 * Only affect tasks that qualify per the caller's callback,
 1930 * if the caller provided one 1918 * if the caller provided one
1931 */ 1919 */
1932 if (scan->test_task && !scan->test_task(p, scan)) 1920 if (scan->test_task && !scan->test_task(p, scan))
1933 continue; 1921 continue;
1934 /* 1922 /*
1935 * Only process tasks that started after the last task 1923 * Only process tasks that started after the last task
1936 * we processed 1924 * we processed
1937 */ 1925 */
1938 if (!started_after_time(p, &latest_time, latest_task)) 1926 if (!started_after_time(p, &latest_time, latest_task))
1939 continue; 1927 continue;
1940 dropped = heap_insert(heap, p); 1928 dropped = heap_insert(heap, p);
1941 if (dropped == NULL) { 1929 if (dropped == NULL) {
1942 /* 1930 /*
1943 * The new task was inserted; the heap wasn't 1931 * The new task was inserted; the heap wasn't
1944 * previously full 1932 * previously full
1945 */ 1933 */
1946 get_task_struct(p); 1934 get_task_struct(p);
1947 } else if (dropped != p) { 1935 } else if (dropped != p) {
1948 /* 1936 /*
1949 * The new task was inserted, and pushed out a 1937 * The new task was inserted, and pushed out a
1950 * different task 1938 * different task
1951 */ 1939 */
1952 get_task_struct(p); 1940 get_task_struct(p);
1953 put_task_struct(dropped); 1941 put_task_struct(dropped);
1954 } 1942 }
1955 /* 1943 /*
1956 * Else the new task was newer than anything already in 1944 * Else the new task was newer than anything already in
1957 * the heap and wasn't inserted 1945 * the heap and wasn't inserted
1958 */ 1946 */
1959 } 1947 }
1960 cgroup_iter_end(scan->cg, &it); 1948 cgroup_iter_end(scan->cg, &it);
1961 1949
1962 if (heap->size) { 1950 if (heap->size) {
1963 for (i = 0; i < heap->size; i++) { 1951 for (i = 0; i < heap->size; i++) {
1964 struct task_struct *q = heap->ptrs[i]; 1952 struct task_struct *q = heap->ptrs[i];
1965 if (i == 0) { 1953 if (i == 0) {
1966 latest_time = q->start_time; 1954 latest_time = q->start_time;
1967 latest_task = q; 1955 latest_task = q;
1968 } 1956 }
1969 /* Process the task per the caller's callback */ 1957 /* Process the task per the caller's callback */
1970 scan->process_task(q, scan); 1958 scan->process_task(q, scan);
1971 put_task_struct(q); 1959 put_task_struct(q);
1972 } 1960 }
1973 /* 1961 /*
1974 * If we had to process any tasks at all, scan again 1962 * If we had to process any tasks at all, scan again
1975 * in case some of them were in the middle of forking 1963 * in case some of them were in the middle of forking
1976 * children that didn't get processed. 1964 * children that didn't get processed.
1977 * Not the most efficient way to do it, but it avoids 1965 * Not the most efficient way to do it, but it avoids
1978 * having to take callback_mutex in the fork path 1966 * having to take callback_mutex in the fork path
1979 */ 1967 */
1980 goto again; 1968 goto again;
1981 } 1969 }
1982 if (heap == &tmp_heap) 1970 if (heap == &tmp_heap)
1983 heap_free(&tmp_heap); 1971 heap_free(&tmp_heap);
1984 return 0; 1972 return 0;
1985 } 1973 }
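
A sketch of how a caller might use the scanner, embedding the struct cgroup_scanner in a context structure of its own creation as the comment above suggests (all "ex_" names are hypothetical):

	struct ex_scan_ctx {
		struct cgroup_scanner scan;
		int count;
	};

	static void ex_count_one(struct task_struct *p,
				 struct cgroup_scanner *scan)
	{
		struct ex_scan_ctx *ctx =
			container_of(scan, struct ex_scan_ctx, scan);
		ctx->count++;
	}

	static int ex_count_tasks(struct cgroup *cgrp)
	{
		struct ex_scan_ctx ctx = {
			.scan = {
				.cg = cgrp,
				.test_task = NULL,	/* select all tasks */
				.process_task = ex_count_one,
				.heap = NULL,	/* use a temporary heap */
			},
		};
		int ret = cgroup_scan_tasks(&ctx.scan);
		return ret ? ret : ctx.count;
	}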
1986 1974
1987 /* 1975 /*
1988 * Stuff for reading the 'tasks' file. 1976 * Stuff for reading the 'tasks' file.
1989 * 1977 *
1990 * Reading this file can return large amounts of data if a cgroup has 1978 * Reading this file can return large amounts of data if a cgroup has
1991 * *lots* of attached tasks. So it may need several calls to read(), 1979 * *lots* of attached tasks. So it may need several calls to read(),
1992 * but we cannot guarantee that the information we produce is correct 1980 * but we cannot guarantee that the information we produce is correct
1993 * unless we produce it entirely atomically. 1981 * unless we produce it entirely atomically.
1994 * 1982 *
 1995 * Upon tasks file open(), a struct ctr_struct is allocated; it 1983 * Upon tasks file open(), a struct ctr_struct is allocated; it
 1996 * holds a pointer to an array (also allocated here). The struct 1984 * holds a pointer to an array (also allocated here). The struct
1997 * ctr_struct * is stored in file->private_data. Its resources will 1985 * ctr_struct * is stored in file->private_data. Its resources will
1998 * be freed by release() when the file is closed. The array is used 1986 * be freed by release() when the file is closed. The array is used
1999 * to sprintf the PIDs and then used by read(). 1987 * to sprintf the PIDs and then used by read().
2000 */ 1988 */
2001 struct ctr_struct { 1989 struct ctr_struct {
2002 char *buf; 1990 char *buf;
2003 int bufsz; 1991 int bufsz;
2004 }; 1992 };
2005 1993
2006 /* 1994 /*
2007 * Load into 'pidarray' up to 'npids' of the tasks using cgroup 1995 * Load into 'pidarray' up to 'npids' of the tasks using cgroup
2008 * 'cgrp'. Return actual number of pids loaded. No need to 1996 * 'cgrp'. Return actual number of pids loaded. No need to
2009 * task_lock(p) when reading out p->cgroup, since we're in an RCU 1997 * task_lock(p) when reading out p->cgroup, since we're in an RCU
2010 * read section, so the css_set can't go away, and is 1998 * read section, so the css_set can't go away, and is
2011 * immutable after creation. 1999 * immutable after creation.
2012 */ 2000 */
2013 static int pid_array_load(pid_t *pidarray, int npids, struct cgroup *cgrp) 2001 static int pid_array_load(pid_t *pidarray, int npids, struct cgroup *cgrp)
2014 { 2002 {
2015 int n = 0; 2003 int n = 0;
2016 struct cgroup_iter it; 2004 struct cgroup_iter it;
2017 struct task_struct *tsk; 2005 struct task_struct *tsk;
2018 cgroup_iter_start(cgrp, &it); 2006 cgroup_iter_start(cgrp, &it);
2019 while ((tsk = cgroup_iter_next(cgrp, &it))) { 2007 while ((tsk = cgroup_iter_next(cgrp, &it))) {
2020 if (unlikely(n == npids)) 2008 if (unlikely(n == npids))
2021 break; 2009 break;
2022 pidarray[n++] = task_pid_vnr(tsk); 2010 pidarray[n++] = task_pid_vnr(tsk);
2023 } 2011 }
2024 cgroup_iter_end(cgrp, &it); 2012 cgroup_iter_end(cgrp, &it);
2025 return n; 2013 return n;
2026 } 2014 }
2027 2015
2028 /** 2016 /**
2029 * cgroupstats_build - build and fill cgroupstats 2017 * cgroupstats_build - build and fill cgroupstats
2030 * @stats: cgroupstats to fill information into 2018 * @stats: cgroupstats to fill information into
2031 * @dentry: A dentry entry belonging to the cgroup for which stats have 2019 * @dentry: A dentry entry belonging to the cgroup for which stats have
2032 * been requested. 2020 * been requested.
2033 * 2021 *
2034 * Build and fill cgroupstats so that taskstats can export it to user 2022 * Build and fill cgroupstats so that taskstats can export it to user
2035 * space. 2023 * space.
2036 */ 2024 */
2037 int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry) 2025 int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry)
2038 { 2026 {
2039 int ret = -EINVAL; 2027 int ret = -EINVAL;
2040 struct cgroup *cgrp; 2028 struct cgroup *cgrp;
2041 struct cgroup_iter it; 2029 struct cgroup_iter it;
2042 struct task_struct *tsk; 2030 struct task_struct *tsk;
2043 /* 2031 /*
2044 * Validate dentry by checking the superblock operations 2032 * Validate dentry by checking the superblock operations
2045 */ 2033 */
2046 if (dentry->d_sb->s_op != &cgroup_ops) 2034 if (dentry->d_sb->s_op != &cgroup_ops)
2047 goto err; 2035 goto err;
2048 2036
2049 ret = 0; 2037 ret = 0;
2050 cgrp = dentry->d_fsdata; 2038 cgrp = dentry->d_fsdata;
2051 rcu_read_lock(); 2039 rcu_read_lock();
2052 2040
2053 cgroup_iter_start(cgrp, &it); 2041 cgroup_iter_start(cgrp, &it);
2054 while ((tsk = cgroup_iter_next(cgrp, &it))) { 2042 while ((tsk = cgroup_iter_next(cgrp, &it))) {
2055 switch (tsk->state) { 2043 switch (tsk->state) {
2056 case TASK_RUNNING: 2044 case TASK_RUNNING:
2057 stats->nr_running++; 2045 stats->nr_running++;
2058 break; 2046 break;
2059 case TASK_INTERRUPTIBLE: 2047 case TASK_INTERRUPTIBLE:
2060 stats->nr_sleeping++; 2048 stats->nr_sleeping++;
2061 break; 2049 break;
2062 case TASK_UNINTERRUPTIBLE: 2050 case TASK_UNINTERRUPTIBLE:
2063 stats->nr_uninterruptible++; 2051 stats->nr_uninterruptible++;
2064 break; 2052 break;
2065 case TASK_STOPPED: 2053 case TASK_STOPPED:
2066 stats->nr_stopped++; 2054 stats->nr_stopped++;
2067 break; 2055 break;
2068 default: 2056 default:
2069 if (delayacct_is_task_waiting_on_io(tsk)) 2057 if (delayacct_is_task_waiting_on_io(tsk))
2070 stats->nr_io_wait++; 2058 stats->nr_io_wait++;
2071 break; 2059 break;
2072 } 2060 }
2073 } 2061 }
2074 cgroup_iter_end(cgrp, &it); 2062 cgroup_iter_end(cgrp, &it);
2075 2063
2076 rcu_read_unlock(); 2064 rcu_read_unlock();
2077 err: 2065 err:
2078 return ret; 2066 return ret;
2079 } 2067 }
2080 2068
2081 static int cmppid(const void *a, const void *b) 2069 static int cmppid(const void *a, const void *b)
2082 { 2070 {
2083 return *(pid_t *)a - *(pid_t *)b; 2071 return *(pid_t *)a - *(pid_t *)b;
2084 } 2072 }
2085 2073
2086 /* 2074 /*
2087 * Convert array 'a' of 'npids' pid_t's to a string of newline separated 2075 * Convert array 'a' of 'npids' pid_t's to a string of newline separated
2088 * decimal pids in 'buf'. Don't write more than 'sz' chars, but return 2076 * decimal pids in 'buf'. Don't write more than 'sz' chars, but return
2089 * count 'cnt' of how many chars would be written if buf were large enough. 2077 * count 'cnt' of how many chars would be written if buf were large enough.
2090 */ 2078 */
2091 static int pid_array_to_buf(char *buf, int sz, pid_t *a, int npids) 2079 static int pid_array_to_buf(char *buf, int sz, pid_t *a, int npids)
2092 { 2080 {
2093 int cnt = 0; 2081 int cnt = 0;
2094 int i; 2082 int i;
2095 2083
2096 for (i = 0; i < npids; i++) 2084 for (i = 0; i < npids; i++)
2097 cnt += snprintf(buf + cnt, max(sz - cnt, 0), "%d\n", a[i]); 2085 cnt += snprintf(buf + cnt, max(sz - cnt, 0), "%d\n", a[i]);
2098 return cnt; 2086 return cnt;
2099 } 2087 }
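
This relies on the snprintf() contract: the return value is the length the output would have needed, even when it was truncated to fit 'sz'. That is what lets cgroup_tasks_open() below size the real buffer with a first pass into a one-byte scratch variable. Illustrated in isolation:

	char tiny[1];
	/* returns 6 ("31337\n"), although nothing useful fits in tiny */
	int need = snprintf(tiny, sizeof(tiny), "%d\n", 31337);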
2100 2088
2101 /* 2089 /*
2102 * Handle an open on 'tasks' file. Prepare a buffer listing the 2090 * Handle an open on 'tasks' file. Prepare a buffer listing the
 2103 * process IDs of tasks currently attached to the cgroup being opened. 2091 * process IDs of tasks currently attached to the cgroup being opened.
2104 * 2092 *
2105 * Does not require any specific cgroup mutexes, and does not take any. 2093 * Does not require any specific cgroup mutexes, and does not take any.
2106 */ 2094 */
2107 static int cgroup_tasks_open(struct inode *unused, struct file *file) 2095 static int cgroup_tasks_open(struct inode *unused, struct file *file)
2108 { 2096 {
2109 struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent); 2097 struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);
2110 struct ctr_struct *ctr; 2098 struct ctr_struct *ctr;
2111 pid_t *pidarray; 2099 pid_t *pidarray;
2112 int npids; 2100 int npids;
2113 char c; 2101 char c;
2114 2102
2115 if (!(file->f_mode & FMODE_READ)) 2103 if (!(file->f_mode & FMODE_READ))
2116 return 0; 2104 return 0;
2117 2105
2118 ctr = kmalloc(sizeof(*ctr), GFP_KERNEL); 2106 ctr = kmalloc(sizeof(*ctr), GFP_KERNEL);
2119 if (!ctr) 2107 if (!ctr)
2120 goto err0; 2108 goto err0;
2121 2109
2122 /* 2110 /*
2123 * If cgroup gets more users after we read count, we won't have 2111 * If cgroup gets more users after we read count, we won't have
2124 * enough space - tough. This race is indistinguishable to the 2112 * enough space - tough. This race is indistinguishable to the
2125 * caller from the case that the additional cgroup users didn't 2113 * caller from the case that the additional cgroup users didn't
2126 * show up until sometime later on. 2114 * show up until sometime later on.
2127 */ 2115 */
2128 npids = cgroup_task_count(cgrp); 2116 npids = cgroup_task_count(cgrp);
2129 if (npids) { 2117 if (npids) {
2130 pidarray = kmalloc(npids * sizeof(pid_t), GFP_KERNEL); 2118 pidarray = kmalloc(npids * sizeof(pid_t), GFP_KERNEL);
2131 if (!pidarray) 2119 if (!pidarray)
2132 goto err1; 2120 goto err1;
2133 2121
2134 npids = pid_array_load(pidarray, npids, cgrp); 2122 npids = pid_array_load(pidarray, npids, cgrp);
2135 sort(pidarray, npids, sizeof(pid_t), cmppid, NULL); 2123 sort(pidarray, npids, sizeof(pid_t), cmppid, NULL);
2136 2124
2137 /* Call pid_array_to_buf() twice, first just to get bufsz */ 2125 /* Call pid_array_to_buf() twice, first just to get bufsz */
2138 ctr->bufsz = pid_array_to_buf(&c, sizeof(c), pidarray, npids) + 1; 2126 ctr->bufsz = pid_array_to_buf(&c, sizeof(c), pidarray, npids) + 1;
2139 ctr->buf = kmalloc(ctr->bufsz, GFP_KERNEL); 2127 ctr->buf = kmalloc(ctr->bufsz, GFP_KERNEL);
2140 if (!ctr->buf) 2128 if (!ctr->buf)
2141 goto err2; 2129 goto err2;
2142 ctr->bufsz = pid_array_to_buf(ctr->buf, ctr->bufsz, pidarray, npids); 2130 ctr->bufsz = pid_array_to_buf(ctr->buf, ctr->bufsz, pidarray, npids);
2143 2131
2144 kfree(pidarray); 2132 kfree(pidarray);
2145 } else { 2133 } else {
2146 ctr->buf = NULL; 2134 ctr->buf = NULL;
2147 ctr->bufsz = 0; 2135 ctr->bufsz = 0;
2148 } 2136 }
2149 file->private_data = ctr; 2137 file->private_data = ctr;
2150 return 0; 2138 return 0;
2151 2139
2152 err2: 2140 err2:
2153 kfree(pidarray); 2141 kfree(pidarray);
2154 err1: 2142 err1:
2155 kfree(ctr); 2143 kfree(ctr);
2156 err0: 2144 err0:
2157 return -ENOMEM; 2145 return -ENOMEM;
2158 } 2146 }
2159 2147
2160 static ssize_t cgroup_tasks_read(struct cgroup *cgrp, 2148 static ssize_t cgroup_tasks_read(struct cgroup *cgrp,
2161 struct cftype *cft, 2149 struct cftype *cft,
2162 struct file *file, char __user *buf, 2150 struct file *file, char __user *buf,
2163 size_t nbytes, loff_t *ppos) 2151 size_t nbytes, loff_t *ppos)
2164 { 2152 {
2165 struct ctr_struct *ctr = file->private_data; 2153 struct ctr_struct *ctr = file->private_data;
2166 2154
2167 return simple_read_from_buffer(buf, nbytes, ppos, ctr->buf, ctr->bufsz); 2155 return simple_read_from_buffer(buf, nbytes, ppos, ctr->buf, ctr->bufsz);
2168 } 2156 }
2169 2157
2170 static int cgroup_tasks_release(struct inode *unused_inode, 2158 static int cgroup_tasks_release(struct inode *unused_inode,
2171 struct file *file) 2159 struct file *file)
2172 { 2160 {
2173 struct ctr_struct *ctr; 2161 struct ctr_struct *ctr;
2174 2162
2175 if (file->f_mode & FMODE_READ) { 2163 if (file->f_mode & FMODE_READ) {
2176 ctr = file->private_data; 2164 ctr = file->private_data;
2177 kfree(ctr->buf); 2165 kfree(ctr->buf);
2178 kfree(ctr); 2166 kfree(ctr);
2179 } 2167 }
2180 return 0; 2168 return 0;
2181 } 2169 }
2182 2170
2183 static u64 cgroup_read_notify_on_release(struct cgroup *cgrp, 2171 static u64 cgroup_read_notify_on_release(struct cgroup *cgrp,
2184 struct cftype *cft) 2172 struct cftype *cft)
2185 { 2173 {
2186 return notify_on_release(cgrp); 2174 return notify_on_release(cgrp);
2187 } 2175 }
2188 2176
2189 static u64 cgroup_read_releasable(struct cgroup *cgrp, struct cftype *cft)
2190 {
2191 return test_bit(CGRP_RELEASABLE, &cgrp->flags);
2192 }
2193
2194 /* 2177 /*
2195 * for the common functions, 'private' gives the type of file 2178 * for the common functions, 'private' gives the type of file
2196 */ 2179 */
2197 static struct cftype files[] = { 2180 static struct cftype files[] = {
2198 { 2181 {
2199 .name = "tasks", 2182 .name = "tasks",
2200 .open = cgroup_tasks_open, 2183 .open = cgroup_tasks_open,
2201 .read = cgroup_tasks_read, 2184 .read = cgroup_tasks_read,
2202 .write = cgroup_common_file_write, 2185 .write = cgroup_common_file_write,
2203 .release = cgroup_tasks_release, 2186 .release = cgroup_tasks_release,
2204 .private = FILE_TASKLIST, 2187 .private = FILE_TASKLIST,
2205 }, 2188 },
2206 2189
2207 { 2190 {
2208 .name = "notify_on_release", 2191 .name = "notify_on_release",
2209 .read_u64 = cgroup_read_notify_on_release, 2192 .read_u64 = cgroup_read_notify_on_release,
2210 .write = cgroup_common_file_write, 2193 .write = cgroup_common_file_write,
2211 .private = FILE_NOTIFY_ON_RELEASE, 2194 .private = FILE_NOTIFY_ON_RELEASE,
2212 }, 2195 },
2213
2214 {
2215 .name = "releasable",
2216 .read_u64 = cgroup_read_releasable,
2217 .private = FILE_RELEASABLE,
2218 }
2219 }; 2196 };
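
This hunk is the point of the patch: cgroup_read_releasable() and the "releasable" cftype entry (the single-column, deleted lines above) drop out of the core file list, so base cgroup directories no longer expose this debugging flag. A sketch of the counterpart the patch adds in the cgroup_debug subsystem; the exact identifiers in kernel/cgroup_debug.c are not visible in this hunk, so treat the names as assumptions:

	static u64 releasable_read(struct cgroup *cgroup, struct cftype *cft)
	{
		return test_bit(CGRP_RELEASABLE, &cgroup->flags);
	}

	/* registered as "releasable" from the debug subsystem's
	 * populate() callback, alongside its other debug files */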
2220 2197
2221 static struct cftype cft_release_agent = { 2198 static struct cftype cft_release_agent = {
2222 .name = "release_agent", 2199 .name = "release_agent",
2223 .read = cgroup_common_file_read, 2200 .read = cgroup_common_file_read,
2224 .write = cgroup_common_file_write, 2201 .write = cgroup_common_file_write,
2225 .private = FILE_RELEASE_AGENT, 2202 .private = FILE_RELEASE_AGENT,
2226 }; 2203 };
2227 2204
2228 static int cgroup_populate_dir(struct cgroup *cgrp) 2205 static int cgroup_populate_dir(struct cgroup *cgrp)
2229 { 2206 {
2230 int err; 2207 int err;
2231 struct cgroup_subsys *ss; 2208 struct cgroup_subsys *ss;
2232 2209
2233 /* First clear out any existing files */ 2210 /* First clear out any existing files */
2234 cgroup_clear_directory(cgrp->dentry); 2211 cgroup_clear_directory(cgrp->dentry);
2235 2212
2236 err = cgroup_add_files(cgrp, NULL, files, ARRAY_SIZE(files)); 2213 err = cgroup_add_files(cgrp, NULL, files, ARRAY_SIZE(files));
2237 if (err < 0) 2214 if (err < 0)
2238 return err; 2215 return err;
2239 2216
2240 if (cgrp == cgrp->top_cgroup) { 2217 if (cgrp == cgrp->top_cgroup) {
2241 if ((err = cgroup_add_file(cgrp, NULL, &cft_release_agent)) < 0) 2218 if ((err = cgroup_add_file(cgrp, NULL, &cft_release_agent)) < 0)
2242 return err; 2219 return err;
2243 } 2220 }
2244 2221
2245 for_each_subsys(cgrp->root, ss) { 2222 for_each_subsys(cgrp->root, ss) {
2246 if (ss->populate && (err = ss->populate(ss, cgrp)) < 0) 2223 if (ss->populate && (err = ss->populate(ss, cgrp)) < 0)
2247 return err; 2224 return err;
2248 } 2225 }
2249 2226
2250 return 0; 2227 return 0;
2251 } 2228 }
2252 2229
2253 static void init_cgroup_css(struct cgroup_subsys_state *css, 2230 static void init_cgroup_css(struct cgroup_subsys_state *css,
2254 struct cgroup_subsys *ss, 2231 struct cgroup_subsys *ss,
2255 struct cgroup *cgrp) 2232 struct cgroup *cgrp)
2256 { 2233 {
2257 css->cgroup = cgrp; 2234 css->cgroup = cgrp;
2258 atomic_set(&css->refcnt, 0); 2235 atomic_set(&css->refcnt, 0);
2259 css->flags = 0; 2236 css->flags = 0;
2260 if (cgrp == dummytop) 2237 if (cgrp == dummytop)
2261 set_bit(CSS_ROOT, &css->flags); 2238 set_bit(CSS_ROOT, &css->flags);
2262 BUG_ON(cgrp->subsys[ss->subsys_id]); 2239 BUG_ON(cgrp->subsys[ss->subsys_id]);
2263 cgrp->subsys[ss->subsys_id] = css; 2240 cgrp->subsys[ss->subsys_id] = css;
2264 } 2241 }
2265 2242
2266 /* 2243 /*
2267 * cgroup_create - create a cgroup 2244 * cgroup_create - create a cgroup
2268 * @parent: cgroup that will be parent of the new cgroup 2245 * @parent: cgroup that will be parent of the new cgroup
2269 * @dentry: dentry of the new cgroup 2246 * @dentry: dentry of the new cgroup
2270 * @mode: mode to set on new inode 2247 * @mode: mode to set on new inode
2271 * 2248 *
2272 * Must be called with the mutex on the parent inode held 2249 * Must be called with the mutex on the parent inode held
2273 */ 2250 */
2274 static long cgroup_create(struct cgroup *parent, struct dentry *dentry, 2251 static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
2275 int mode) 2252 int mode)
2276 { 2253 {
2277 struct cgroup *cgrp; 2254 struct cgroup *cgrp;
2278 struct cgroupfs_root *root = parent->root; 2255 struct cgroupfs_root *root = parent->root;
2279 int err = 0; 2256 int err = 0;
2280 struct cgroup_subsys *ss; 2257 struct cgroup_subsys *ss;
2281 struct super_block *sb = root->sb; 2258 struct super_block *sb = root->sb;
2282 2259
2283 cgrp = kzalloc(sizeof(*cgrp), GFP_KERNEL); 2260 cgrp = kzalloc(sizeof(*cgrp), GFP_KERNEL);
2284 if (!cgrp) 2261 if (!cgrp)
2285 return -ENOMEM; 2262 return -ENOMEM;
2286 2263
2287 /* Grab a reference on the superblock so the hierarchy doesn't 2264 /* Grab a reference on the superblock so the hierarchy doesn't
2288 * get deleted on unmount if there are child cgroups. This 2265 * get deleted on unmount if there are child cgroups. This
2289 * can be done outside cgroup_mutex, since the sb can't 2266 * can be done outside cgroup_mutex, since the sb can't
2290 * disappear while someone has an open control file on the 2267 * disappear while someone has an open control file on the
2291 * fs */ 2268 * fs */
2292 atomic_inc(&sb->s_active); 2269 atomic_inc(&sb->s_active);
2293 2270
2294 mutex_lock(&cgroup_mutex); 2271 mutex_lock(&cgroup_mutex);
2295 2272
2296 INIT_LIST_HEAD(&cgrp->sibling); 2273 INIT_LIST_HEAD(&cgrp->sibling);
2297 INIT_LIST_HEAD(&cgrp->children); 2274 INIT_LIST_HEAD(&cgrp->children);
2298 INIT_LIST_HEAD(&cgrp->css_sets); 2275 INIT_LIST_HEAD(&cgrp->css_sets);
2299 INIT_LIST_HEAD(&cgrp->release_list); 2276 INIT_LIST_HEAD(&cgrp->release_list);
2300 2277
2301 cgrp->parent = parent; 2278 cgrp->parent = parent;
2302 cgrp->root = parent->root; 2279 cgrp->root = parent->root;
2303 cgrp->top_cgroup = parent->top_cgroup; 2280 cgrp->top_cgroup = parent->top_cgroup;
2304 2281
2305 if (notify_on_release(parent)) 2282 if (notify_on_release(parent))
2306 set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); 2283 set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
2307 2284
2308 for_each_subsys(root, ss) { 2285 for_each_subsys(root, ss) {
2309 struct cgroup_subsys_state *css = ss->create(ss, cgrp); 2286 struct cgroup_subsys_state *css = ss->create(ss, cgrp);
2310 if (IS_ERR(css)) { 2287 if (IS_ERR(css)) {
2311 err = PTR_ERR(css); 2288 err = PTR_ERR(css);
2312 goto err_destroy; 2289 goto err_destroy;
2313 } 2290 }
2314 init_cgroup_css(css, ss, cgrp); 2291 init_cgroup_css(css, ss, cgrp);
2315 } 2292 }
2316 2293
2317 list_add(&cgrp->sibling, &cgrp->parent->children); 2294 list_add(&cgrp->sibling, &cgrp->parent->children);
2318 root->number_of_cgroups++; 2295 root->number_of_cgroups++;
2319 2296
2320 err = cgroup_create_dir(cgrp, dentry, mode); 2297 err = cgroup_create_dir(cgrp, dentry, mode);
2321 if (err < 0) 2298 if (err < 0)
2322 goto err_remove; 2299 goto err_remove;
2323 2300
2324 /* The cgroup directory was pre-locked for us */ 2301 /* The cgroup directory was pre-locked for us */
2325 BUG_ON(!mutex_is_locked(&cgrp->dentry->d_inode->i_mutex)); 2302 BUG_ON(!mutex_is_locked(&cgrp->dentry->d_inode->i_mutex));
2326 2303
2327 err = cgroup_populate_dir(cgrp); 2304 err = cgroup_populate_dir(cgrp);
2328 /* If err < 0, we have a half-filled directory - oh well ;) */ 2305 /* If err < 0, we have a half-filled directory - oh well ;) */
2329 2306
2330 mutex_unlock(&cgroup_mutex); 2307 mutex_unlock(&cgroup_mutex);
2331 mutex_unlock(&cgrp->dentry->d_inode->i_mutex); 2308 mutex_unlock(&cgrp->dentry->d_inode->i_mutex);
2332 2309
2333 return 0; 2310 return 0;
2334 2311
2335 err_remove: 2312 err_remove:
2336 2313
2337 list_del(&cgrp->sibling); 2314 list_del(&cgrp->sibling);
2338 root->number_of_cgroups--; 2315 root->number_of_cgroups--;
2339 2316
2340 err_destroy: 2317 err_destroy:
2341 2318
2342 for_each_subsys(root, ss) { 2319 for_each_subsys(root, ss) {
2343 if (cgrp->subsys[ss->subsys_id]) 2320 if (cgrp->subsys[ss->subsys_id])
2344 ss->destroy(ss, cgrp); 2321 ss->destroy(ss, cgrp);
2345 } 2322 }
2346 2323
2347 mutex_unlock(&cgroup_mutex); 2324 mutex_unlock(&cgroup_mutex);
2348 2325
2349 /* Release the reference count that we took on the superblock */ 2326 /* Release the reference count that we took on the superblock */
2350 deactivate_super(sb); 2327 deactivate_super(sb);
2351 2328
2352 kfree(cgrp); 2329 kfree(cgrp);
2353 return err; 2330 return err;
2354 } 2331 }
2355 2332
2356 static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, int mode) 2333 static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, int mode)
2357 { 2334 {
2358 struct cgroup *c_parent = dentry->d_parent->d_fsdata; 2335 struct cgroup *c_parent = dentry->d_parent->d_fsdata;
2359 2336
2360 /* the vfs holds inode->i_mutex already */ 2337 /* the vfs holds inode->i_mutex already */
2361 return cgroup_create(c_parent, dentry, mode | S_IFDIR); 2338 return cgroup_create(c_parent, dentry, mode | S_IFDIR);
2362 } 2339 }
2363 2340
2364 static inline int cgroup_has_css_refs(struct cgroup *cgrp) 2341 static inline int cgroup_has_css_refs(struct cgroup *cgrp)
2365 { 2342 {
2366 /* Check the reference count on each subsystem. Since we 2343 /* Check the reference count on each subsystem. Since we
2367 * already established that there are no tasks in the 2344 * already established that there are no tasks in the
2368 * cgroup, if the css refcount is also 0, then there should 2345 * cgroup, if the css refcount is also 0, then there should
2369 * be no outstanding references, so the subsystem is safe to 2346 * be no outstanding references, so the subsystem is safe to
2370 * destroy. We scan across all subsystems rather than using 2347 * destroy. We scan across all subsystems rather than using
2371 * the per-hierarchy linked list of mounted subsystems since 2348 * the per-hierarchy linked list of mounted subsystems since
2372 * we can be called via check_for_release() with no 2349 * we can be called via check_for_release() with no
2373 * synchronization other than RCU, and the subsystem linked 2350 * synchronization other than RCU, and the subsystem linked
2374 * list isn't RCU-safe */ 2351 * list isn't RCU-safe */
2375 int i; 2352 int i;
2376 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 2353 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
2377 struct cgroup_subsys *ss = subsys[i]; 2354 struct cgroup_subsys *ss = subsys[i];
2378 struct cgroup_subsys_state *css; 2355 struct cgroup_subsys_state *css;
2379 /* Skip subsystems not in this hierarchy */ 2356 /* Skip subsystems not in this hierarchy */
2380 if (ss->root != cgrp->root) 2357 if (ss->root != cgrp->root)
2381 continue; 2358 continue;
2382 css = cgrp->subsys[ss->subsys_id]; 2359 css = cgrp->subsys[ss->subsys_id];
2383 /* When called from check_for_release() it's possible 2360 /* When called from check_for_release() it's possible
2384 * that by this point the cgroup has been removed 2361 * that by this point the cgroup has been removed
2385 * and the css deleted. But a false-positive doesn't 2362 * and the css deleted. But a false-positive doesn't
2386 * matter, since it can only happen if the cgroup 2363 * matter, since it can only happen if the cgroup
2387 * has been deleted and hence no longer needs the 2364 * has been deleted and hence no longer needs the
2388 * release agent to be called anyway. */ 2365 * release agent to be called anyway. */
2389 if (css && atomic_read(&css->refcnt)) 2366 if (css && atomic_read(&css->refcnt))
2390 return 1; 2367 return 1;
2391 } 2368 }
2392 return 0; 2369 return 0;
2393 } 2370 }
2394 2371
2395 static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry) 2372 static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
2396 { 2373 {
2397 struct cgroup *cgrp = dentry->d_fsdata; 2374 struct cgroup *cgrp = dentry->d_fsdata;
2398 struct dentry *d; 2375 struct dentry *d;
2399 struct cgroup *parent; 2376 struct cgroup *parent;
2400 struct super_block *sb; 2377 struct super_block *sb;
2401 struct cgroupfs_root *root; 2378 struct cgroupfs_root *root;
2402 2379
 2403 /* the vfs already holds i_mutex on both the parent and the victim */ 2380 /* the vfs already holds i_mutex on both the parent and the victim */
2404 2381
2405 mutex_lock(&cgroup_mutex); 2382 mutex_lock(&cgroup_mutex);
2406 if (atomic_read(&cgrp->count) != 0) { 2383 if (atomic_read(&cgrp->count) != 0) {
2407 mutex_unlock(&cgroup_mutex); 2384 mutex_unlock(&cgroup_mutex);
2408 return -EBUSY; 2385 return -EBUSY;
2409 } 2386 }
2410 if (!list_empty(&cgrp->children)) { 2387 if (!list_empty(&cgrp->children)) {
2411 mutex_unlock(&cgroup_mutex); 2388 mutex_unlock(&cgroup_mutex);
2412 return -EBUSY; 2389 return -EBUSY;
2413 } 2390 }
2414 2391
2415 parent = cgrp->parent; 2392 parent = cgrp->parent;
2416 root = cgrp->root; 2393 root = cgrp->root;
2417 sb = root->sb; 2394 sb = root->sb;
2418 2395
2419 /* 2396 /*
 2420 * Call the pre_destroy handlers of the subsystems, notifying 2397 * Call the pre_destroy handlers of the subsystems, notifying
 2421 * them that an rmdir() request has arrived. 2398 * them that an rmdir() request has arrived.
2422 */ 2399 */
2423 cgroup_call_pre_destroy(cgrp); 2400 cgroup_call_pre_destroy(cgrp);
2424 2401
2425 if (cgroup_has_css_refs(cgrp)) { 2402 if (cgroup_has_css_refs(cgrp)) {
2426 mutex_unlock(&cgroup_mutex); 2403 mutex_unlock(&cgroup_mutex);
2427 return -EBUSY; 2404 return -EBUSY;
2428 } 2405 }
2429 2406
2430 spin_lock(&release_list_lock); 2407 spin_lock(&release_list_lock);
2431 set_bit(CGRP_REMOVED, &cgrp->flags); 2408 set_bit(CGRP_REMOVED, &cgrp->flags);
2432 if (!list_empty(&cgrp->release_list)) 2409 if (!list_empty(&cgrp->release_list))
2433 list_del(&cgrp->release_list); 2410 list_del(&cgrp->release_list);
2434 spin_unlock(&release_list_lock); 2411 spin_unlock(&release_list_lock);
2435 /* delete my sibling from parent->children */ 2412 /* delete my sibling from parent->children */
2436 list_del(&cgrp->sibling); 2413 list_del(&cgrp->sibling);
2437 spin_lock(&cgrp->dentry->d_lock); 2414 spin_lock(&cgrp->dentry->d_lock);
2438 d = dget(cgrp->dentry); 2415 d = dget(cgrp->dentry);
2439 cgrp->dentry = NULL; 2416 cgrp->dentry = NULL;
2440 spin_unlock(&d->d_lock); 2417 spin_unlock(&d->d_lock);
2441 2418
2442 cgroup_d_remove_dir(d); 2419 cgroup_d_remove_dir(d);
2443 dput(d); 2420 dput(d);
2444 2421
2445 set_bit(CGRP_RELEASABLE, &parent->flags); 2422 set_bit(CGRP_RELEASABLE, &parent->flags);
2446 check_for_release(parent); 2423 check_for_release(parent);
2447 2424
2448 mutex_unlock(&cgroup_mutex); 2425 mutex_unlock(&cgroup_mutex);
2449 return 0; 2426 return 0;
2450 } 2427 }
2451 2428
2452 static void cgroup_init_subsys(struct cgroup_subsys *ss) 2429 static void cgroup_init_subsys(struct cgroup_subsys *ss)
2453 { 2430 {
2454 struct cgroup_subsys_state *css; 2431 struct cgroup_subsys_state *css;
2455 struct list_head *l; 2432 struct list_head *l;
2456 2433
2457 printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name); 2434 printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name);
2458 2435
2459 /* Create the top cgroup state for this subsystem */ 2436 /* Create the top cgroup state for this subsystem */
2460 ss->root = &rootnode; 2437 ss->root = &rootnode;
2461 css = ss->create(ss, dummytop); 2438 css = ss->create(ss, dummytop);
2462 /* We don't handle early failures gracefully */ 2439 /* We don't handle early failures gracefully */
2463 BUG_ON(IS_ERR(css)); 2440 BUG_ON(IS_ERR(css));
2464 init_cgroup_css(css, ss, dummytop); 2441 init_cgroup_css(css, ss, dummytop);
2465 2442
2466 /* Update all cgroup groups to contain a subsys 2443 /* Update all cgroup groups to contain a subsys
2467 * pointer to this state - since the subsystem is 2444 * pointer to this state - since the subsystem is
2468 * newly registered, all tasks and hence all cgroup 2445 * newly registered, all tasks and hence all cgroup
2469 * groups are in the subsystem's top cgroup. */ 2446 * groups are in the subsystem's top cgroup. */
2470 write_lock(&css_set_lock); 2447 write_lock(&css_set_lock);
2471 l = &init_css_set.list; 2448 l = &init_css_set.list;
2472 do { 2449 do {
2473 struct css_set *cg = 2450 struct css_set *cg =
2474 list_entry(l, struct css_set, list); 2451 list_entry(l, struct css_set, list);
2475 cg->subsys[ss->subsys_id] = dummytop->subsys[ss->subsys_id]; 2452 cg->subsys[ss->subsys_id] = dummytop->subsys[ss->subsys_id];
2476 l = l->next; 2453 l = l->next;
2477 } while (l != &init_css_set.list); 2454 } while (l != &init_css_set.list);
2478 write_unlock(&css_set_lock); 2455 write_unlock(&css_set_lock);
2479 2456
2480 /* If this subsystem requested that it be notified with fork 2457 /* If this subsystem requested that it be notified with fork
2481 * events, we should send it one now for every process in the 2458 * events, we should send it one now for every process in the
2482 * system */ 2459 * system */
2483 if (ss->fork) { 2460 if (ss->fork) {
2484 struct task_struct *g, *p; 2461 struct task_struct *g, *p;
2485 2462
2486 read_lock(&tasklist_lock); 2463 read_lock(&tasklist_lock);
2487 do_each_thread(g, p) { 2464 do_each_thread(g, p) {
2488 ss->fork(ss, p); 2465 ss->fork(ss, p);
2489 } while_each_thread(g, p); 2466 } while_each_thread(g, p);
2490 read_unlock(&tasklist_lock); 2467 read_unlock(&tasklist_lock);
2491 } 2468 }
2492 2469
2493 need_forkexit_callback |= ss->fork || ss->exit; 2470 need_forkexit_callback |= ss->fork || ss->exit;
2494 2471
2495 ss->active = 1; 2472 ss->active = 1;
2496 } 2473 }
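
For context, a hedged sketch (editorial, not part of the patch) of the minimal subsystem shape that cgroup_init_subsys() brings up. The names my_subsys and my_subsys_id are hypothetical; a real subsystem also needs an entry in include/linux/cgroup_subsys.h so that its subsys_id matches its slot in the subsys[] array, which cgroup_init_early() BUG_ONs below.

/*
 * Hypothetical subsystem sketch; my_subsys_id is assumed to come
 * from an entry in include/linux/cgroup_subsys.h.
 */
#include <linux/cgroup.h>
#include <linux/err.h>
#include <linux/slab.h>

static struct cgroup_subsys_state *my_create(struct cgroup_subsys *ss,
                                             struct cgroup *cgrp)
{
        struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL);

        if (!css)
                return ERR_PTR(-ENOMEM);
        return css;             /* init_cgroup_css() links it to cgrp */
}

static void my_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
{
        kfree(cgrp->subsys[my_subsys_id]);
}

static void my_fork(struct cgroup_subsys *ss, struct task_struct *task)
{
        /* Because .fork is set, cgroup_init_subsys() replays this
         * callback for every task already in the system at
         * registration time. */
}

struct cgroup_subsys my_subsys = {
        .name = "my",
        .create = my_create,
        .destroy = my_destroy,
        .fork = my_fork,
        .subsys_id = my_subsys_id,
};
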
2497 2474
2498 /** 2475 /**
2499 * cgroup_init_early - cgroup initialization at system boot 2476 * cgroup_init_early - cgroup initialization at system boot
2500 * 2477 *
2501 * Initialize cgroups at system boot, and initialize any 2478 * Initialize cgroups at system boot, and initialize any
2502 * subsystems that request early init. 2479 * subsystems that request early init.
2503 */ 2480 */
2504 int __init cgroup_init_early(void) 2481 int __init cgroup_init_early(void)
2505 { 2482 {
2506 int i; 2483 int i;
2507 kref_init(&init_css_set.ref); 2484 kref_init(&init_css_set.ref);
2508 kref_get(&init_css_set.ref); 2485 kref_get(&init_css_set.ref);
2509 INIT_LIST_HEAD(&init_css_set.list); 2486 INIT_LIST_HEAD(&init_css_set.list);
2510 INIT_LIST_HEAD(&init_css_set.cg_links); 2487 INIT_LIST_HEAD(&init_css_set.cg_links);
2511 INIT_LIST_HEAD(&init_css_set.tasks); 2488 INIT_LIST_HEAD(&init_css_set.tasks);
2512 css_set_count = 1; 2489 css_set_count = 1;
2513 init_cgroup_root(&rootnode); 2490 init_cgroup_root(&rootnode);
2514 list_add(&rootnode.root_list, &roots); 2491 list_add(&rootnode.root_list, &roots);
2515 root_count = 1; 2492 root_count = 1;
2516 init_task.cgroups = &init_css_set; 2493 init_task.cgroups = &init_css_set;
2517 2494
2518 init_css_set_link.cg = &init_css_set; 2495 init_css_set_link.cg = &init_css_set;
2519 list_add(&init_css_set_link.cgrp_link_list, 2496 list_add(&init_css_set_link.cgrp_link_list,
2520 &rootnode.top_cgroup.css_sets); 2497 &rootnode.top_cgroup.css_sets);
2521 list_add(&init_css_set_link.cg_link_list, 2498 list_add(&init_css_set_link.cg_link_list,
2522 &init_css_set.cg_links); 2499 &init_css_set.cg_links);
2523 2500
2524 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 2501 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
2525 struct cgroup_subsys *ss = subsys[i]; 2502 struct cgroup_subsys *ss = subsys[i];
2526 2503
2527 BUG_ON(!ss->name); 2504 BUG_ON(!ss->name);
2528 BUG_ON(strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN); 2505 BUG_ON(strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN);
2529 BUG_ON(!ss->create); 2506 BUG_ON(!ss->create);
2530 BUG_ON(!ss->destroy); 2507 BUG_ON(!ss->destroy);
2531 if (ss->subsys_id != i) { 2508 if (ss->subsys_id != i) {
2532 printk(KERN_ERR "cgroup: Subsys %s id == %d\n", 2509 printk(KERN_ERR "cgroup: Subsys %s id == %d\n",
2533 ss->name, ss->subsys_id); 2510 ss->name, ss->subsys_id);
2534 BUG(); 2511 BUG();
2535 } 2512 }
2536 2513
2537 if (ss->early_init) 2514 if (ss->early_init)
2538 cgroup_init_subsys(ss); 2515 cgroup_init_subsys(ss);
2539 } 2516 }
2540 return 0; 2517 return 0;
2541 } 2518 }
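
Only subsystems that set .early_init take the cgroup_init_subsys() path this early; a hedged fragment of what that opt-in looks like (subsystem name hypothetical, my_create/my_destroy as in the sketch above):

struct cgroup_subsys my_early_subsys = {
        .name = "my_early",
        .create = my_create,            /* runs before cgroup_init() */
        .destroy = my_destroy,
        .subsys_id = my_early_subsys_id,
        .early_init = 1,                /* picked up by cgroup_init_early() */
};
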
2542 2519
2543 /** 2520 /**
2544 * cgroup_init - cgroup initialization 2521 * cgroup_init - cgroup initialization
2545 * 2522 *
2546 * Register cgroup filesystem and /proc file, and initialize 2523 * Register cgroup filesystem and /proc file, and initialize
2547 * any subsystems that didn't request early init. 2524 * any subsystems that didn't request early init.
2548 */ 2525 */
2549 int __init cgroup_init(void) 2526 int __init cgroup_init(void)
2550 { 2527 {
2551 int err; 2528 int err;
2552 int i; 2529 int i;
2553 struct proc_dir_entry *entry; 2530 struct proc_dir_entry *entry;
2554 2531
2555 err = bdi_init(&cgroup_backing_dev_info); 2532 err = bdi_init(&cgroup_backing_dev_info);
2556 if (err) 2533 if (err)
2557 return err; 2534 return err;
2558 2535
2559 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 2536 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
2560 struct cgroup_subsys *ss = subsys[i]; 2537 struct cgroup_subsys *ss = subsys[i];
2561 if (!ss->early_init) 2538 if (!ss->early_init)
2562 cgroup_init_subsys(ss); 2539 cgroup_init_subsys(ss);
2563 } 2540 }
2564 2541
2565 err = register_filesystem(&cgroup_fs_type); 2542 err = register_filesystem(&cgroup_fs_type);
2566 if (err < 0) 2543 if (err < 0)
2567 goto out; 2544 goto out;
2568 2545
2569 entry = create_proc_entry("cgroups", 0, NULL); 2546 entry = create_proc_entry("cgroups", 0, NULL);
2570 if (entry) 2547 if (entry)
2571 entry->proc_fops = &proc_cgroupstats_operations; 2548 entry->proc_fops = &proc_cgroupstats_operations;
2572 2549
2573 out: 2550 out:
2574 if (err) 2551 if (err)
2575 bdi_destroy(&cgroup_backing_dev_info); 2552 bdi_destroy(&cgroup_backing_dev_info);
2576 2553
2577 return err; 2554 return err;
2578 } 2555 }
2579 2556
2580 /* 2557 /*
2581 * proc_cgroup_show() 2558 * proc_cgroup_show()
2582 * - Print task's cgroup paths into seq_file, one line for each hierarchy 2559 * - Print task's cgroup paths into seq_file, one line for each hierarchy
2583 * - Used for /proc/<pid>/cgroup. 2560 * - Used for /proc/<pid>/cgroup.
2584 * - No need to task_lock(tsk) on this tsk->cgroup reference, as it 2561 * - No need to task_lock(tsk) on this tsk->cgroup reference, as it
2585 * doesn't really matter if tsk->cgroup changes after we read it, 2562 * doesn't really matter if tsk->cgroup changes after we read it,
2586 * and we take cgroup_mutex, keeping cgroup_attach_task() from changing it 2563 * and we take cgroup_mutex, keeping cgroup_attach_task() from changing it
2587 * anyway. No need to check that tsk->cgroup != NULL, thanks to 2564 * anyway. No need to check that tsk->cgroup != NULL, thanks to
2588 * the_top_cgroup_hack in cgroup_exit(), which sets an exiting task's 2565 * the_top_cgroup_hack in cgroup_exit(), which sets an exiting task's
2589 * cgroup to top_cgroup. 2566 * cgroup to top_cgroup.
2590 */ 2567 */
2591 2568
2592 /* TODO: Use a proper seq_file iterator */ 2569 /* TODO: Use a proper seq_file iterator */
2593 static int proc_cgroup_show(struct seq_file *m, void *v) 2570 static int proc_cgroup_show(struct seq_file *m, void *v)
2594 { 2571 {
2595 struct pid *pid; 2572 struct pid *pid;
2596 struct task_struct *tsk; 2573 struct task_struct *tsk;
2597 char *buf; 2574 char *buf;
2598 int retval; 2575 int retval;
2599 struct cgroupfs_root *root; 2576 struct cgroupfs_root *root;
2600 2577
2601 retval = -ENOMEM; 2578 retval = -ENOMEM;
2602 buf = kmalloc(PAGE_SIZE, GFP_KERNEL); 2579 buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
2603 if (!buf) 2580 if (!buf)
2604 goto out; 2581 goto out;
2605 2582
2606 retval = -ESRCH; 2583 retval = -ESRCH;
2607 pid = m->private; 2584 pid = m->private;
2608 tsk = get_pid_task(pid, PIDTYPE_PID); 2585 tsk = get_pid_task(pid, PIDTYPE_PID);
2609 if (!tsk) 2586 if (!tsk)
2610 goto out_free; 2587 goto out_free;
2611 2588
2612 retval = 0; 2589 retval = 0;
2613 2590
2614 mutex_lock(&cgroup_mutex); 2591 mutex_lock(&cgroup_mutex);
2615 2592
2616 for_each_root(root) { 2593 for_each_root(root) {
2617 struct cgroup_subsys *ss; 2594 struct cgroup_subsys *ss;
2618 struct cgroup *cgrp; 2595 struct cgroup *cgrp;
2619 int subsys_id; 2596 int subsys_id;
2620 int count = 0; 2597 int count = 0;
2621 2598
2622 /* Skip this hierarchy if it has no active subsystems */ 2599 /* Skip this hierarchy if it has no active subsystems */
2623 if (!root->actual_subsys_bits) 2600 if (!root->actual_subsys_bits)
2624 continue; 2601 continue;
2625 seq_printf(m, "%lu:", root->subsys_bits); 2602 seq_printf(m, "%lu:", root->subsys_bits);
2626 for_each_subsys(root, ss) 2603 for_each_subsys(root, ss)
2627 seq_printf(m, "%s%s", count++ ? "," : "", ss->name); 2604 seq_printf(m, "%s%s", count++ ? "," : "", ss->name);
2628 seq_putc(m, ':'); 2605 seq_putc(m, ':');
2629 get_first_subsys(&root->top_cgroup, NULL, &subsys_id); 2606 get_first_subsys(&root->top_cgroup, NULL, &subsys_id);
2630 cgrp = task_cgroup(tsk, subsys_id); 2607 cgrp = task_cgroup(tsk, subsys_id);
2631 retval = cgroup_path(cgrp, buf, PAGE_SIZE); 2608 retval = cgroup_path(cgrp, buf, PAGE_SIZE);
2632 if (retval < 0) 2609 if (retval < 0)
2633 goto out_unlock; 2610 goto out_unlock;
2634 seq_puts(m, buf); 2611 seq_puts(m, buf);
2635 seq_putc(m, '\n'); 2612 seq_putc(m, '\n');
2636 } 2613 }
2637 2614
2638 out_unlock: 2615 out_unlock:
2639 mutex_unlock(&cgroup_mutex); 2616 mutex_unlock(&cgroup_mutex);
2640 put_task_struct(tsk); 2617 put_task_struct(tsk);
2641 out_free: 2618 out_free:
2642 kfree(buf); 2619 kfree(buf);
2643 out: 2620 out:
2644 return retval; 2621 return retval;
2645 } 2622 }
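
As a usage illustration (userspace, editorial): each line this handler emits has the form <subsys_bits>:<comma-separated subsystems>:<cgroup path>, so a trivial reader is:

#include <stdio.h>

int main(void)
{
        /* Echoes /proc/self/cgroup; with this kernel the first field
         * is the hierarchy's subsys_bits mask, e.g. "2:cpuset:/". */
        char line[4096];
        FILE *f = fopen("/proc/self/cgroup", "r");

        if (!f) {
                perror("/proc/self/cgroup");
                return 1;
        }
        while (fgets(line, sizeof(line), f))
                fputs(line, stdout);
        fclose(f);
        return 0;
}
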
2646 2623
2647 static int cgroup_open(struct inode *inode, struct file *file) 2624 static int cgroup_open(struct inode *inode, struct file *file)
2648 { 2625 {
2649 struct pid *pid = PROC_I(inode)->pid; 2626 struct pid *pid = PROC_I(inode)->pid;
2650 return single_open(file, proc_cgroup_show, pid); 2627 return single_open(file, proc_cgroup_show, pid);
2651 } 2628 }
2652 2629
2653 struct file_operations proc_cgroup_operations = { 2630 struct file_operations proc_cgroup_operations = {
2654 .open = cgroup_open, 2631 .open = cgroup_open,
2655 .read = seq_read, 2632 .read = seq_read,
2656 .llseek = seq_lseek, 2633 .llseek = seq_lseek,
2657 .release = single_release, 2634 .release = single_release,
2658 }; 2635 };
2659 2636
2660 /* Display information about each subsystem and each hierarchy */ 2637 /* Display information about each subsystem and each hierarchy */
2661 static int proc_cgroupstats_show(struct seq_file *m, void *v) 2638 static int proc_cgroupstats_show(struct seq_file *m, void *v)
2662 { 2639 {
2663 int i; 2640 int i;
2664 2641
2665 seq_puts(m, "#subsys_name\thierarchy\tnum_cgroups\tenabled\n"); 2642 seq_puts(m, "#subsys_name\thierarchy\tnum_cgroups\tenabled\n");
2666 mutex_lock(&cgroup_mutex); 2643 mutex_lock(&cgroup_mutex);
2667 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 2644 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
2668 struct cgroup_subsys *ss = subsys[i]; 2645 struct cgroup_subsys *ss = subsys[i];
2669 seq_printf(m, "%s\t%lu\t%d\t%d\n", 2646 seq_printf(m, "%s\t%lu\t%d\t%d\n",
2670 ss->name, ss->root->subsys_bits, 2647 ss->name, ss->root->subsys_bits,
2671 ss->root->number_of_cgroups, !ss->disabled); 2648 ss->root->number_of_cgroups, !ss->disabled);
2672 } 2649 }
2673 mutex_unlock(&cgroup_mutex); 2650 mutex_unlock(&cgroup_mutex);
2674 return 0; 2651 return 0;
2675 } 2652 }
2676 2653
2677 static int cgroupstats_open(struct inode *inode, struct file *file) 2654 static int cgroupstats_open(struct inode *inode, struct file *file)
2678 { 2655 {
2679 return single_open(file, proc_cgroupstats_show, NULL); 2656 return single_open(file, proc_cgroupstats_show, NULL);
2680 } 2657 }
2681 2658
2682 static struct file_operations proc_cgroupstats_operations = { 2659 static struct file_operations proc_cgroupstats_operations = {
2683 .open = cgroupstats_open, 2660 .open = cgroupstats_open,
2684 .read = seq_read, 2661 .read = seq_read,
2685 .llseek = seq_lseek, 2662 .llseek = seq_lseek,
2686 .release = single_release, 2663 .release = single_release,
2687 }; 2664 };
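
The resulting /proc/cgroups output is the header above plus one tab-separated row per compiled-in subsystem; the values below are illustrative only:

#subsys_name	hierarchy	num_cgroups	enabled
cpuset	2	4	1
debug	0	1	1
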
2688 2665
2689 /** 2666 /**
2690 * cgroup_fork - attach newly forked task to its parent's cgroup. 2667 * cgroup_fork - attach newly forked task to its parent's cgroup.
2691 * @child: pointer to task_struct of the newly forked child process. 2668 * @child: pointer to task_struct of the newly forked child process.
2692 * 2669 *
2693 * Description: A task inherits its parent's cgroup at fork(). 2670 * Description: A task inherits its parent's cgroup at fork().
2694 * 2671 *
2695 * A pointer to the shared css_set was automatically copied in 2672 * A pointer to the shared css_set was automatically copied in
2696 * fork.c by dup_task_struct(). However, we ignore that copy, since 2673 * fork.c by dup_task_struct(). However, we ignore that copy, since
2697 * it was not made under the protection of RCU or cgroup_mutex, so 2674 * it was not made under the protection of RCU or cgroup_mutex, so
2698 * might no longer be a valid cgroup pointer. cgroup_attach_task() might 2675 * might no longer be a valid cgroup pointer. cgroup_attach_task() might
2699 * have already changed current->cgroups, allowing the previously 2676 * have already changed current->cgroups, allowing the previously
2700 * referenced cgroup group to be removed and freed. 2677 * referenced cgroup group to be removed and freed.
2701 * 2678 *
2702 * At the point that cgroup_fork() is called, 'current' is the parent 2679 * At the point that cgroup_fork() is called, 'current' is the parent
2703 * task, and the passed argument 'child' points to the child task. 2680 * task, and the passed argument 'child' points to the child task.
2704 */ 2681 */
2705 void cgroup_fork(struct task_struct *child) 2682 void cgroup_fork(struct task_struct *child)
2706 { 2683 {
2707 task_lock(current); 2684 task_lock(current);
2708 child->cgroups = current->cgroups; 2685 child->cgroups = current->cgroups;
2709 get_css_set(child->cgroups); 2686 get_css_set(child->cgroups);
2710 task_unlock(current); 2687 task_unlock(current);
2711 INIT_LIST_HEAD(&child->cg_list); 2688 INIT_LIST_HEAD(&child->cg_list);
2712 } 2689 }
2713 2690
2714 /** 2691 /**
2715 * cgroup_fork_callbacks - run fork callbacks 2692 * cgroup_fork_callbacks - run fork callbacks
2716 * @child: the new task 2693 * @child: the new task
2717 * 2694 *
2718 * Called on a new task very soon before adding it to the 2695 * Called on a new task very soon before adding it to the
2719 * tasklist. No need to take any locks since no-one can 2696 * tasklist. No need to take any locks since no-one can
2720 * be operating on this task. 2697 * be operating on this task.
2721 */ 2698 */
2722 void cgroup_fork_callbacks(struct task_struct *child) 2699 void cgroup_fork_callbacks(struct task_struct *child)
2723 { 2700 {
2724 if (need_forkexit_callback) { 2701 if (need_forkexit_callback) {
2725 int i; 2702 int i;
2726 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 2703 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
2727 struct cgroup_subsys *ss = subsys[i]; 2704 struct cgroup_subsys *ss = subsys[i];
2728 if (ss->fork) 2705 if (ss->fork)
2729 ss->fork(ss, child); 2706 ss->fork(ss, child);
2730 } 2707 }
2731 } 2708 }
2732 } 2709 }
2733 2710
2734 /** 2711 /**
2735 * cgroup_post_fork - called on a new task after adding it to the task list 2712 * cgroup_post_fork - called on a new task after adding it to the task list
2736 * @child: the task in question 2713 * @child: the task in question
2737 * 2714 *
2738 * Adds the task to the list running through its css_set if necessary. 2715 * Adds the task to the list running through its css_set if necessary.
2739 * Has to be after the task is visible on the task list in case we race 2716 * Has to be after the task is visible on the task list in case we race
2740 * with the first call to cgroup_iter_start() - to guarantee that the 2717 * with the first call to cgroup_iter_start() - to guarantee that the
2741 * new task ends up on its list. 2718 * new task ends up on its list.
2742 */ 2719 */
2743 void cgroup_post_fork(struct task_struct *child) 2720 void cgroup_post_fork(struct task_struct *child)
2744 { 2721 {
2745 if (use_task_css_set_links) { 2722 if (use_task_css_set_links) {
2746 write_lock(&css_set_lock); 2723 write_lock(&css_set_lock);
2747 if (list_empty(&child->cg_list)) 2724 if (list_empty(&child->cg_list))
2748 list_add(&child->cg_list, &child->cgroups->tasks); 2725 list_add(&child->cg_list, &child->cgroups->tasks);
2749 write_unlock(&css_set_lock); 2726 write_unlock(&css_set_lock);
2750 } 2727 }
2751 } 2728 }
2752 /** 2729 /**
2753 * cgroup_exit - detach cgroup from exiting task 2730 * cgroup_exit - detach cgroup from exiting task
2754 * @tsk: pointer to task_struct of exiting process 2731 * @tsk: pointer to task_struct of exiting process
2755 * @run_callbacks: run exit callbacks? 2732 * @run_callbacks: run exit callbacks?
2756 * 2733 *
2757 * Description: Detach cgroup from @tsk and release it. 2734 * Description: Detach cgroup from @tsk and release it.
2758 * 2735 *
2759 * Note that cgroups marked notify_on_release force every task in 2736 * Note that cgroups marked notify_on_release force every task in
2760 * them to take the global cgroup_mutex when exiting. 2737 * them to take the global cgroup_mutex when exiting.
2761 * This could impact scaling on very large systems. Be reluctant to 2738 * This could impact scaling on very large systems. Be reluctant to
2762 * use notify_on_release cgroups where very high task exit scaling 2739 * use notify_on_release cgroups where very high task exit scaling
2763 * is required on large systems. 2740 * is required on large systems.
2764 * 2741 *
2765 * the_top_cgroup_hack: 2742 * the_top_cgroup_hack:
2766 * 2743 *
2767 * Set the exiting task's cgroup to the root cgroup (top_cgroup). 2744 * Set the exiting task's cgroup to the root cgroup (top_cgroup).
2768 * 2745 *
2769 * We call cgroup_exit() while the task is still competent to 2746 * We call cgroup_exit() while the task is still competent to
2770 * handle notify_on_release(), then leave the task attached to the 2747 * handle notify_on_release(), then leave the task attached to the
2771 * root cgroup in each hierarchy for the remainder of its exit. 2748 * root cgroup in each hierarchy for the remainder of its exit.
2772 * 2749 *
2773 * To do this properly, we would increment the reference count on 2750 * To do this properly, we would increment the reference count on
2774 * top_cgroup, and near the very end of the kernel/exit.c do_exit() 2751 * top_cgroup, and near the very end of the kernel/exit.c do_exit()
2775 * code we would add a second cgroup function call, to drop that 2752 * code we would add a second cgroup function call, to drop that
2776 * reference. This would just create an unnecessary hot spot on 2753 * reference. This would just create an unnecessary hot spot on
2777 * the top_cgroup reference count, to no avail. 2754 * the top_cgroup reference count, to no avail.
2778 * 2755 *
2779 * Normally, holding a reference to a cgroup without bumping its 2756 * Normally, holding a reference to a cgroup without bumping its
2780 * count is unsafe. The cgroup could go away, or someone could 2757 * count is unsafe. The cgroup could go away, or someone could
2781 * attach us to a different cgroup, decrementing the count on 2758 * attach us to a different cgroup, decrementing the count on
2782 * the first cgroup that we never incremented. But in this case, 2759 * the first cgroup that we never incremented. But in this case,
2783 * top_cgroup isn't going away, and either the task has PF_EXITING set, 2760 * top_cgroup isn't going away, and either the task has PF_EXITING set,
2784 * which wards off any cgroup_attach_task() attempts, or the task is a failed 2761 * which wards off any cgroup_attach_task() attempts, or the task is a failed
2785 * fork, never visible to cgroup_attach_task. 2762 * fork, never visible to cgroup_attach_task.
2786 */ 2763 */
2787 void cgroup_exit(struct task_struct *tsk, int run_callbacks) 2764 void cgroup_exit(struct task_struct *tsk, int run_callbacks)
2788 { 2765 {
2789 int i; 2766 int i;
2790 struct css_set *cg; 2767 struct css_set *cg;
2791 2768
2792 if (run_callbacks && need_forkexit_callback) { 2769 if (run_callbacks && need_forkexit_callback) {
2793 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 2770 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
2794 struct cgroup_subsys *ss = subsys[i]; 2771 struct cgroup_subsys *ss = subsys[i];
2795 if (ss->exit) 2772 if (ss->exit)
2796 ss->exit(ss, tsk); 2773 ss->exit(ss, tsk);
2797 } 2774 }
2798 } 2775 }
2799 2776
2800 /* 2777 /*
2801 * Unlink from the css_set task list if necessary. 2778 * Unlink from the css_set task list if necessary.
2802 * Optimistically check cg_list before taking 2779 * Optimistically check cg_list before taking
2803 * css_set_lock 2780 * css_set_lock
2804 */ 2781 */
2805 if (!list_empty(&tsk->cg_list)) { 2782 if (!list_empty(&tsk->cg_list)) {
2806 write_lock(&css_set_lock); 2783 write_lock(&css_set_lock);
2807 if (!list_empty(&tsk->cg_list)) 2784 if (!list_empty(&tsk->cg_list))
2808 list_del(&tsk->cg_list); 2785 list_del(&tsk->cg_list);
2809 write_unlock(&css_set_lock); 2786 write_unlock(&css_set_lock);
2810 } 2787 }
2811 2788
2812 /* Reassign the task to the init_css_set. */ 2789 /* Reassign the task to the init_css_set. */
2813 task_lock(tsk); 2790 task_lock(tsk);
2814 cg = tsk->cgroups; 2791 cg = tsk->cgroups;
2815 tsk->cgroups = &init_css_set; 2792 tsk->cgroups = &init_css_set;
2816 task_unlock(tsk); 2793 task_unlock(tsk);
2817 if (cg) 2794 if (cg)
2818 put_css_set_taskexit(cg); 2795 put_css_set_taskexit(cg);
2819 } 2796 }
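
For orientation, a hedged sketch of the caller's ordering; do_exit() in kernel/exit.c passes run_callbacks = 1, and this is illustrative, not verbatim exit.c:

/* Illustrative only: the relative ordering that makes the comment's
 * PF_EXITING argument work. */
static void example_exit_ordering(struct task_struct *tsk)
{
        tsk->flags |= PF_EXITING;       /* set earlier in the exit path;
                                         * wards off cgroup_attach_task() */
        /* ... other teardown ... */
        cgroup_exit(tsk, 1);            /* run ss->exit() hooks, then park
                                         * tsk on init_css_set */
}
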
2820 2797
2821 /** 2798 /**
2822 * cgroup_clone - clone the cgroup the given subsystem is attached to 2799 * cgroup_clone - clone the cgroup the given subsystem is attached to
2823 * @tsk: the task to be moved 2800 * @tsk: the task to be moved
2824 * @subsys: the given subsystem 2801 * @subsys: the given subsystem
2825 * 2802 *
2826 * Duplicate the current cgroup in the hierarchy that the given 2803 * Duplicate the current cgroup in the hierarchy that the given
2827 * subsystem is attached to, and move this task into the new 2804 * subsystem is attached to, and move this task into the new
2828 * child. 2805 * child.
2829 */ 2806 */
2830 int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys) 2807 int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys)
2831 { 2808 {
2832 struct dentry *dentry; 2809 struct dentry *dentry;
2833 int ret = 0; 2810 int ret = 0;
2834 char nodename[MAX_CGROUP_TYPE_NAMELEN]; 2811 char nodename[MAX_CGROUP_TYPE_NAMELEN];
2835 struct cgroup *parent, *child; 2812 struct cgroup *parent, *child;
2836 struct inode *inode; 2813 struct inode *inode;
2837 struct css_set *cg; 2814 struct css_set *cg;
2838 struct cgroupfs_root *root; 2815 struct cgroupfs_root *root;
2839 struct cgroup_subsys *ss; 2816 struct cgroup_subsys *ss;
2840 2817
2841 /* We shouldn't be called by an unregistered subsystem */ 2818 /* We shouldn't be called by an unregistered subsystem */
2842 BUG_ON(!subsys->active); 2819 BUG_ON(!subsys->active);
2843 2820
2844 /* First figure out what hierarchy and cgroup we're dealing 2821 /* First figure out what hierarchy and cgroup we're dealing
2845 * with, and pin them so we can drop cgroup_mutex */ 2822 * with, and pin them so we can drop cgroup_mutex */
2846 mutex_lock(&cgroup_mutex); 2823 mutex_lock(&cgroup_mutex);
2847 again: 2824 again:
2848 root = subsys->root; 2825 root = subsys->root;
2849 if (root == &rootnode) { 2826 if (root == &rootnode) {
2850 printk(KERN_INFO 2827 printk(KERN_INFO
2851 "Not cloning cgroup for unused subsystem %s\n", 2828 "Not cloning cgroup for unused subsystem %s\n",
2852 subsys->name); 2829 subsys->name);
2853 mutex_unlock(&cgroup_mutex); 2830 mutex_unlock(&cgroup_mutex);
2854 return 0; 2831 return 0;
2855 } 2832 }
2856 cg = tsk->cgroups; 2833 cg = tsk->cgroups;
2857 parent = task_cgroup(tsk, subsys->subsys_id); 2834 parent = task_cgroup(tsk, subsys->subsys_id);
2858 2835
2859 snprintf(nodename, MAX_CGROUP_TYPE_NAMELEN, "node_%d", tsk->pid); 2836 snprintf(nodename, MAX_CGROUP_TYPE_NAMELEN, "node_%d", tsk->pid);
2860 2837
2861 /* Pin the hierarchy */ 2838 /* Pin the hierarchy */
2862 atomic_inc(&parent->root->sb->s_active); 2839 atomic_inc(&parent->root->sb->s_active);
2863 2840
2864 /* Keep the cgroup alive */ 2841 /* Keep the cgroup alive */
2865 get_css_set(cg); 2842 get_css_set(cg);
2866 mutex_unlock(&cgroup_mutex); 2843 mutex_unlock(&cgroup_mutex);
2867 2844
2868 /* Now do the VFS work to create a cgroup */ 2845 /* Now do the VFS work to create a cgroup */
2869 inode = parent->dentry->d_inode; 2846 inode = parent->dentry->d_inode;
2870 2847
2871 /* Hold the parent directory mutex across this operation to 2848 /* Hold the parent directory mutex across this operation to
2872 * stop anyone else deleting the new cgroup */ 2849 * stop anyone else deleting the new cgroup */
2873 mutex_lock(&inode->i_mutex); 2850 mutex_lock(&inode->i_mutex);
2874 dentry = lookup_one_len(nodename, parent->dentry, strlen(nodename)); 2851 dentry = lookup_one_len(nodename, parent->dentry, strlen(nodename));
2875 if (IS_ERR(dentry)) { 2852 if (IS_ERR(dentry)) {
2876 printk(KERN_INFO 2853 printk(KERN_INFO
2877 "cgroup: Couldn't allocate dentry for %s: %ld\n", nodename, 2854 "cgroup: Couldn't allocate dentry for %s: %ld\n", nodename,
2878 PTR_ERR(dentry)); 2855 PTR_ERR(dentry));
2879 ret = PTR_ERR(dentry); 2856 ret = PTR_ERR(dentry);
2880 goto out_release; 2857 goto out_release;
2881 } 2858 }
2882 2859
2883 /* Create the cgroup directory, which also creates the cgroup */ 2860 /* Create the cgroup directory, which also creates the cgroup */
2884 ret = vfs_mkdir(inode, dentry, S_IFDIR | 0755); 2861 ret = vfs_mkdir(inode, dentry, S_IFDIR | 0755);
2885 child = __d_cgrp(dentry); 2862 child = __d_cgrp(dentry);
2886 dput(dentry); 2863 dput(dentry);
2887 if (ret) { 2864 if (ret) {
2888 printk(KERN_INFO 2865 printk(KERN_INFO
2889 "Failed to create cgroup %s: %d\n", nodename, 2866 "Failed to create cgroup %s: %d\n", nodename,
2890 ret); 2867 ret);
2891 goto out_release; 2868 goto out_release;
2892 } 2869 }
2893 2870
2894 if (!child) { 2871 if (!child) {
2895 printk(KERN_INFO 2872 printk(KERN_INFO
2896 "Couldn't find new cgroup %s\n", nodename); 2873 "Couldn't find new cgroup %s\n", nodename);
2897 ret = -ENOMEM; 2874 ret = -ENOMEM;
2898 goto out_release; 2875 goto out_release;
2899 } 2876 }
2900 2877
2901 /* The cgroup now exists. Retake cgroup_mutex and check 2878 /* The cgroup now exists. Retake cgroup_mutex and check
2902 * that we're still in the same state that we thought we 2879 * that we're still in the same state that we thought we
2903 * were. */ 2880 * were. */
2904 mutex_lock(&cgroup_mutex); 2881 mutex_lock(&cgroup_mutex);
2905 if ((root != subsys->root) || 2882 if ((root != subsys->root) ||
2906 (parent != task_cgroup(tsk, subsys->subsys_id))) { 2883 (parent != task_cgroup(tsk, subsys->subsys_id))) {
2907 /* Aargh, we raced ... */ 2884 /* Aargh, we raced ... */
2908 mutex_unlock(&inode->i_mutex); 2885 mutex_unlock(&inode->i_mutex);
2909 put_css_set(cg); 2886 put_css_set(cg);
2910 2887
2911 deactivate_super(parent->root->sb); 2888 deactivate_super(parent->root->sb);
2912 /* The cgroup is still accessible in the VFS, but 2889 /* The cgroup is still accessible in the VFS, but
2913 * we're not going to try to rmdir() it at this 2890 * we're not going to try to rmdir() it at this
2914 * point. */ 2891 * point. */
2915 printk(KERN_INFO 2892 printk(KERN_INFO
2916 "Race in cgroup_clone() - leaking cgroup %s\n", 2893 "Race in cgroup_clone() - leaking cgroup %s\n",
2917 nodename); 2894 nodename);
2918 goto again; 2895 goto again;
2919 } 2896 }
2920 2897
2921 /* do any required auto-setup */ 2898 /* do any required auto-setup */
2922 for_each_subsys(root, ss) { 2899 for_each_subsys(root, ss) {
2923 if (ss->post_clone) 2900 if (ss->post_clone)
2924 ss->post_clone(ss, child); 2901 ss->post_clone(ss, child);
2925 } 2902 }
2926 2903
2927 /* All seems fine. Finish by moving the task into the new cgroup */ 2904 /* All seems fine. Finish by moving the task into the new cgroup */
2928 ret = cgroup_attach_task(child, tsk); 2905 ret = cgroup_attach_task(child, tsk);
2929 mutex_unlock(&cgroup_mutex); 2906 mutex_unlock(&cgroup_mutex);
2930 2907
2931 out_release: 2908 out_release:
2932 mutex_unlock(&inode->i_mutex); 2909 mutex_unlock(&inode->i_mutex);
2933 2910
2934 mutex_lock(&cgroup_mutex); 2911 mutex_lock(&cgroup_mutex);
2935 put_css_set(cg); 2912 put_css_set(cg);
2936 mutex_unlock(&cgroup_mutex); 2913 mutex_unlock(&cgroup_mutex);
2937 deactivate_super(parent->root->sb); 2914 deactivate_super(parent->root->sb);
2938 return ret; 2915 return ret;
2939 } 2916 }
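
The intended caller here is the ns (nsproxy) subsystem; a hedged sketch of such a call site, assuming an ns_subsys registered elsewhere (as in kernel/ns_cgroup.c):

/* Hedged sketch: clone the task's cgroup in the ns hierarchy when it
 * unshares a namespace, moving it into a child named "node_<pid>". */
int example_ns_clone(struct task_struct *tsk)
{
        return cgroup_clone(tsk, &ns_subsys);
}
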
2940 2917
2941 /** 2918 /**
2942 * cgroup_is_descendant - see if @cgrp is a descendant of current task's cgrp 2919 * cgroup_is_descendant - see if @cgrp is a descendant of current task's cgrp
2943 * @cgrp: the cgroup in question 2920 * @cgrp: the cgroup in question
2944 * 2921 *
2945 * See if @cgrp is a descendant of the current task's cgroup in 2922 * See if @cgrp is a descendant of the current task's cgroup in
2946 * the appropriate hierarchy. 2923 * the appropriate hierarchy.
2947 * 2924 *
2948 * If we are sending in dummytop, then presumably we are creating 2925 * If we are sending in dummytop, then presumably we are creating
2949 * the top cgroup in the subsystem. 2926 * the top cgroup in the subsystem.
2950 * 2927 *
2951 * Called only by the ns (nsproxy) cgroup. 2928 * Called only by the ns (nsproxy) cgroup.
2952 */ 2929 */
2953 int cgroup_is_descendant(const struct cgroup *cgrp) 2930 int cgroup_is_descendant(const struct cgroup *cgrp)
2954 { 2931 {
2955 int ret; 2932 int ret;
2956 struct cgroup *target; 2933 struct cgroup *target;
2957 int subsys_id; 2934 int subsys_id;
2958 2935
2959 if (cgrp == dummytop) 2936 if (cgrp == dummytop)
2960 return 1; 2937 return 1;
2961 2938
2962 get_first_subsys(cgrp, NULL, &subsys_id); 2939 get_first_subsys(cgrp, NULL, &subsys_id);
2963 target = task_cgroup(current, subsys_id); 2940 target = task_cgroup(current, subsys_id);
2964 while (cgrp != target && cgrp != cgrp->top_cgroup) 2941 while (cgrp != target && cgrp != cgrp->top_cgroup)
2965 cgrp = cgrp->parent; 2942 cgrp = cgrp->parent;
2966 ret = (cgrp == target); 2943 ret = (cgrp == target);
2967 return ret; 2944 return ret;
2968 } 2945 }
2969 2946
2970 static void check_for_release(struct cgroup *cgrp) 2947 static void check_for_release(struct cgroup *cgrp)
2971 { 2948 {
2972 /* All of these checks rely on RCU to keep the cgroup 2949 /* All of these checks rely on RCU to keep the cgroup
2973 * structure alive */ 2950 * structure alive */
2974 if (cgroup_is_releasable(cgrp) && !atomic_read(&cgrp->count) 2951 if (cgroup_is_releasable(cgrp) && !atomic_read(&cgrp->count)
2975 && list_empty(&cgrp->children) && !cgroup_has_css_refs(cgrp)) { 2952 && list_empty(&cgrp->children) && !cgroup_has_css_refs(cgrp)) {
2976 /* Control Group is currently removable. If it's not 2953 /* Control Group is currently removable. If it's not
2977 * already queued for a userspace notification, queue 2954 * already queued for a userspace notification, queue
2978 * it now */ 2955 * it now */
2979 int need_schedule_work = 0; 2956 int need_schedule_work = 0;
2980 spin_lock(&release_list_lock); 2957 spin_lock(&release_list_lock);
2981 if (!cgroup_is_removed(cgrp) && 2958 if (!cgroup_is_removed(cgrp) &&
2982 list_empty(&cgrp->release_list)) { 2959 list_empty(&cgrp->release_list)) {
2983 list_add(&cgrp->release_list, &release_list); 2960 list_add(&cgrp->release_list, &release_list);
2984 need_schedule_work = 1; 2961 need_schedule_work = 1;
2985 } 2962 }
2986 spin_unlock(&release_list_lock); 2963 spin_unlock(&release_list_lock);
2987 if (need_schedule_work) 2964 if (need_schedule_work)
2988 schedule_work(&release_agent_work); 2965 schedule_work(&release_agent_work);
2989 } 2966 }
2990 } 2967 }
2991 2968
2992 void __css_put(struct cgroup_subsys_state *css) 2969 void __css_put(struct cgroup_subsys_state *css)
2993 { 2970 {
2994 struct cgroup *cgrp = css->cgroup; 2971 struct cgroup *cgrp = css->cgroup;
2995 rcu_read_lock(); 2972 rcu_read_lock();
2996 if (atomic_dec_and_test(&css->refcnt) && notify_on_release(cgrp)) { 2973 if (atomic_dec_and_test(&css->refcnt) && notify_on_release(cgrp)) {
2997 set_bit(CGRP_RELEASABLE, &cgrp->flags); 2974 set_bit(CGRP_RELEASABLE, &cgrp->flags);
2998 check_for_release(cgrp); 2975 check_for_release(cgrp);
2999 } 2976 }
3000 rcu_read_unlock(); 2977 rcu_read_unlock();
3001 } 2978 }
3002 2979
3003 /* 2980 /*
3004 * Notify userspace when a cgroup is released, by running the 2981 * Notify userspace when a cgroup is released, by running the
3005 * configured release agent with the name of the cgroup (path 2982 * configured release agent with the name of the cgroup (path
3006 * relative to the root of cgroup file system) as the argument. 2983 * relative to the root of cgroup file system) as the argument.
3007 * 2984 *
3008 * Most likely, this user command will try to rmdir this cgroup. 2985 * Most likely, this user command will try to rmdir this cgroup.
3009 * 2986 *
3010 * This races with the possibility that some other task will be 2987 * This races with the possibility that some other task will be
3011 * attached to this cgroup before it is removed, or that some other 2988 * attached to this cgroup before it is removed, or that some other
3012 * user task will 'mkdir' a child cgroup of this cgroup. That's ok. 2989 * user task will 'mkdir' a child cgroup of this cgroup. That's ok.
3013 * The presumed 'rmdir' will fail quietly if this cgroup is no longer 2990 * The presumed 'rmdir' will fail quietly if this cgroup is no longer
3014 * unused, and this cgroup will be reprieved from its death sentence, 2991 * unused, and this cgroup will be reprieved from its death sentence,
3015 * to continue to serve a useful existence. Next time it's released, 2992 * to continue to serve a useful existence. Next time it's released,
3016 * we will get notified again, if it still has 'notify_on_release' set. 2993 * we will get notified again, if it still has 'notify_on_release' set.
3017 * 2994 *
3018 * The final arg to call_usermodehelper() is UMH_WAIT_EXEC, which 2995 * The final arg to call_usermodehelper() is UMH_WAIT_EXEC, which
3019 * means only wait until the task is successfully execve()'d. The 2996 * means only wait until the task is successfully execve()'d. The
3020 * separate release agent task is forked by call_usermodehelper(), 2997 * separate release agent task is forked by call_usermodehelper(),
3021 * then control in this thread returns here, without waiting for the 2998 * then control in this thread returns here, without waiting for the
3022 * release agent task. We don't bother to wait because the caller of 2999 * release agent task. We don't bother to wait because the caller of
3023 * this routine has no use for the exit status of the release agent 3000 * this routine has no use for the exit status of the release agent
3024 * task, so no sense holding our caller up for that. 3001 * task, so no sense holding our caller up for that.
3025 */ 3002 */
3026 static void cgroup_release_agent(struct work_struct *work) 3003 static void cgroup_release_agent(struct work_struct *work)
3027 { 3004 {
3028 BUG_ON(work != &release_agent_work); 3005 BUG_ON(work != &release_agent_work);
3029 mutex_lock(&cgroup_mutex); 3006 mutex_lock(&cgroup_mutex);
3030 spin_lock(&release_list_lock); 3007 spin_lock(&release_list_lock);
3031 while (!list_empty(&release_list)) { 3008 while (!list_empty(&release_list)) {
3032 char *argv[3], *envp[3]; 3009 char *argv[3], *envp[3];
3033 int i; 3010 int i;
3034 char *pathbuf; 3011 char *pathbuf;
3035 struct cgroup *cgrp = list_entry(release_list.next, 3012 struct cgroup *cgrp = list_entry(release_list.next,
3036 struct cgroup, 3013 struct cgroup,
3037 release_list); 3014 release_list);
3038 list_del_init(&cgrp->release_list); 3015 list_del_init(&cgrp->release_list);
3039 spin_unlock(&release_list_lock); 3016 spin_unlock(&release_list_lock);
3040 pathbuf = kmalloc(PAGE_SIZE, GFP_KERNEL); 3017 pathbuf = kmalloc(PAGE_SIZE, GFP_KERNEL);
3041 if (!pathbuf) { 3018 if (!pathbuf) {
3042 spin_lock(&release_list_lock); 3019 spin_lock(&release_list_lock);
3043 continue; 3020 continue;
3044 } 3021 }
3045 3022
3046 if (cgroup_path(cgrp, pathbuf, PAGE_SIZE) < 0) { 3023 if (cgroup_path(cgrp, pathbuf, PAGE_SIZE) < 0) {
3047 kfree(pathbuf); 3024 kfree(pathbuf);
3048 spin_lock(&release_list_lock); 3025 spin_lock(&release_list_lock);
3049 continue; 3026 continue;
3050 } 3027 }
3051 3028
3052 i = 0; 3029 i = 0;
3053 argv[i++] = cgrp->root->release_agent_path; 3030 argv[i++] = cgrp->root->release_agent_path;
3054 argv[i++] = (char *)pathbuf; 3031 argv[i++] = (char *)pathbuf;
3055 argv[i] = NULL; 3032 argv[i] = NULL;
3056 3033
3057 i = 0; 3034 i = 0;
3058 /* minimal command environment */ 3035 /* minimal command environment */
3059 envp[i++] = "HOME=/"; 3036 envp[i++] = "HOME=/";
3060 envp[i++] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin"; 3037 envp[i++] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin";
3061 envp[i] = NULL; 3038 envp[i] = NULL;
3062 3039
3063 /* Drop the lock while we invoke the usermode helper, 3040 /* Drop the lock while we invoke the usermode helper,
3064 * since the exec could involve hitting disk and hence 3041 * since the exec could involve hitting disk and hence
3065 * be a slow process */ 3042 * be a slow process */
3066 mutex_unlock(&cgroup_mutex); 3043 mutex_unlock(&cgroup_mutex);
3067 call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC); 3044 call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC);
3068 kfree(pathbuf); 3045 kfree(pathbuf);
3069 mutex_lock(&cgroup_mutex); 3046 mutex_lock(&cgroup_mutex);
3070 spin_lock(&release_list_lock); 3047 spin_lock(&release_list_lock);
3071 } 3048 }
3072 spin_unlock(&release_list_lock); 3049 spin_unlock(&release_list_lock);
3073 mutex_unlock(&cgroup_mutex); 3050 mutex_unlock(&cgroup_mutex);
3074 } 3051 }
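
For illustration, a minimal release agent in C. The kernel passes the cgroup's path relative to the hierarchy root as argv[1] (see the argv[] setup above); the /dev/cgroup mount point is an assumption, so substitute wherever the hierarchy is actually mounted.

#include <stdio.h>
#include <unistd.h>

int main(int argc, char **argv)
{
        char path[4096];

        if (argc != 2)
                return 1;
        /* argv[1] is root-relative, e.g. "/node_1234" */
        snprintf(path, sizeof(path), "/dev/cgroup%s", argv[1]);
        if (rmdir(path) < 0)
                return 1;       /* fails quietly if the cgroup was
                                 * reprieved; see the comment above */
        return 0;
}
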
3075 3052
3076 static int __init cgroup_disable(char *str) 3053 static int __init cgroup_disable(char *str)
3077 { 3054 {
3078 int i; 3055 int i;
3079 char *token; 3056 char *token;
3080 3057
3081 while ((token = strsep(&str, ",")) != NULL) { 3058 while ((token = strsep(&str, ",")) != NULL) {
3082 if (!*token) 3059 if (!*token)
3083 continue; 3060 continue;
3084 3061
3085 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 3062 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
3086 struct cgroup_subsys *ss = subsys[i]; 3063 struct cgroup_subsys *ss = subsys[i];
3087 3064
3088 if (!strcmp(token, ss->name)) { 3065 if (!strcmp(token, ss->name)) {
3089 ss->disabled = 1; 3066 ss->disabled = 1;
3090 printk(KERN_INFO "Disabling %s control group" 3067 printk(KERN_INFO "Disabling %s control group"
3091 " subsystem\n", ss->name); 3068 " subsystem\n", ss->name);
3092 break; 3069 break;
3093 } 3070 }
3094 } 3071 }
3095 } 3072 }
3096 return 1; 3073 return 1;
3097 } 3074 }
3098 __setup("cgroup_disable=", cgroup_disable); 3075 __setup("cgroup_disable=", cgroup_disable);
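
Usage note: this is a boot-time parameter, so passing e.g. cgroup_disable=memory on the kernel command line (for a subsystem compiled into the kernel) marks that subsystem disabled before any hierarchy can be mounted; the !ss->disabled expression in proc_cgroupstats_show() above is what surfaces the result in the "enabled" column of /proc/cgroups.
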
3099 3076
kernel/cgroup_debug.c
1 /* 1 /*
2 * kernel/ccontainer_debug.c - Example cgroup subsystem that 2 * kernel/cgroup_debug.c - Example cgroup subsystem that
3 * exposes debug info 3 * exposes debug info
4 * 4 *
5 * Copyright (C) Google Inc, 2007 5 * Copyright (C) Google Inc, 2007
6 * 6 *
7 * Developed by Paul Menage (menage@google.com) 7 * Developed by Paul Menage (menage@google.com)
8 * 8 *
9 */ 9 */
10 10
11 #include <linux/cgroup.h> 11 #include <linux/cgroup.h>
12 #include <linux/fs.h> 12 #include <linux/fs.h>
13 #include <linux/slab.h> 13 #include <linux/slab.h>
14 #include <linux/rcupdate.h> 14 #include <linux/rcupdate.h>
15 15
16 #include <asm/atomic.h> 16 #include <asm/atomic.h>
17 17
18 static struct cgroup_subsys_state *debug_create(struct cgroup_subsys *ss, 18 static struct cgroup_subsys_state *debug_create(struct cgroup_subsys *ss,
19 struct cgroup *cont) 19 struct cgroup *cont)
20 { 20 {
21 struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL); 21 struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL);
22 22
23 if (!css) 23 if (!css)
24 return ERR_PTR(-ENOMEM); 24 return ERR_PTR(-ENOMEM);
25 25
26 return css; 26 return css;
27 } 27 }
28 28
29 static void debug_destroy(struct cgroup_subsys *ss, struct cgroup *cont) 29 static void debug_destroy(struct cgroup_subsys *ss, struct cgroup *cont)
30 { 30 {
31 kfree(cont->subsys[debug_subsys_id]); 31 kfree(cont->subsys[debug_subsys_id]);
32 } 32 }
33 33
34 static u64 cgroup_refcount_read(struct cgroup *cont, struct cftype *cft) 34 static u64 cgroup_refcount_read(struct cgroup *cont, struct cftype *cft)
35 { 35 {
36 return atomic_read(&cont->count); 36 return atomic_read(&cont->count);
37 } 37 }
38 38
39 static u64 taskcount_read(struct cgroup *cont, struct cftype *cft) 39 static u64 taskcount_read(struct cgroup *cont, struct cftype *cft)
40 { 40 {
41 u64 count; 41 u64 count;
42 42
43 cgroup_lock(); 43 cgroup_lock();
44 count = cgroup_task_count(cont); 44 count = cgroup_task_count(cont);
45 cgroup_unlock(); 45 cgroup_unlock();
46 return count; 46 return count;
47 } 47 }
48 48
49 static u64 current_css_set_read(struct cgroup *cont, struct cftype *cft) 49 static u64 current_css_set_read(struct cgroup *cont, struct cftype *cft)
50 { 50 {
51 return (u64)(long)current->cgroups; 51 return (u64)(long)current->cgroups;
52 } 52 }
53 53
54 static u64 current_css_set_refcount_read(struct cgroup *cont, 54 static u64 current_css_set_refcount_read(struct cgroup *cont,
55 struct cftype *cft) 55 struct cftype *cft)
56 { 56 {
57 u64 count; 57 u64 count;
58 58
59 rcu_read_lock(); 59 rcu_read_lock();
60 count = atomic_read(&current->cgroups->ref.refcount); 60 count = atomic_read(&current->cgroups->ref.refcount);
61 rcu_read_unlock(); 61 rcu_read_unlock();
62 return count; 62 return count;
63 } 63 }
64 64
65 static u64 releasable_read(struct cgroup *cgrp, struct cftype *cft)
66 {
67 return test_bit(CGRP_RELEASABLE, &cgrp->flags);
68 }
69
65 static struct cftype files[] = { 70 static struct cftype files[] = {
66 { 71 {
67 .name = "cgroup_refcount", 72 .name = "cgroup_refcount",
68 .read_u64 = cgroup_refcount_read, 73 .read_u64 = cgroup_refcount_read,
69 }, 74 },
70 { 75 {
71 .name = "taskcount", 76 .name = "taskcount",
72 .read_u64 = taskcount_read, 77 .read_u64 = taskcount_read,
73 }, 78 },
74 79
75 { 80 {
76 .name = "current_css_set", 81 .name = "current_css_set",
77 .read_u64 = current_css_set_read, 82 .read_u64 = current_css_set_read,
78 }, 83 },
79 84
80 { 85 {
81 .name = "current_css_set_refcount", 86 .name = "current_css_set_refcount",
82 .read_u64 = current_css_set_refcount_read, 87 .read_u64 = current_css_set_refcount_read,
83 }, 88 },
89
90 {
91 .name = "releasable",
92 .read_u64 = releasable_read,
93 }
84 }; 94 };
85 95
86 static int debug_populate(struct cgroup_subsys *ss, struct cgroup *cont) 96 static int debug_populate(struct cgroup_subsys *ss, struct cgroup *cont)
87 { 97 {
88 return cgroup_add_files(cont, ss, files, ARRAY_SIZE(files)); 98 return cgroup_add_files(cont, ss, files, ARRAY_SIZE(files));
89 } 99 }
90 100
91 struct cgroup_subsys debug_subsys = { 101 struct cgroup_subsys debug_subsys = {
92 .name = "debug", 102 .name = "debug",
93 .create = debug_create, 103 .create = debug_create,
94 .destroy = debug_destroy, 104 .destroy = debug_destroy,
95 .populate = debug_populate, 105 .populate = debug_populate,
96 .subsys_id = debug_subsys_id, 106 .subsys_id = debug_subsys_id,
97 }; 107 };
98 108
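
With this change, the flag behind "releasable" is only visible when the debug subsystem is mounted, e.g. via mount -t cgroup -o debug cgroup /mnt (an illustrative mount point); the file then appears in each cgroup directory, as debug.releasable unless the hierarchy was mounted with the noprefix option.
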