Commit 6803c006282768ec850760766a6e4eb1a6ff87df

Authored by Tejun Heo
1 parent bd53d617b3

cgroup: add css_set->dfl_cgrp

To implement the unified hierarchy behavior, we'll need to be able to
determine the associated cgroup on the default hierarchy from css_set.
Let's add css_set->dfl_cgrp so that it can be accessed conveniently
and efficiently.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Li Zefan <lizefan@huawei.com>
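
For orientation, here is a minimal sketch of how the new field can be consulted once it is populated. The helper name task_dfl_cgroup() is illustrative and is not part of this patch; it only relies on task_css_set(), which already exists in include/linux/cgroup.h, and on the dfl_cgrp field added below.

/*
 * Illustrative sketch, not part of this commit: with css_set->dfl_cgrp in
 * place, the cgroup on the default hierarchy can be read straight from a
 * task's css_set instead of being derived from an individual css.
 */
static inline struct cgroup *task_dfl_cgroup(struct task_struct *task)
{
	/* the caller must satisfy task_css_set_check(), e.g. hold
	 * cgroup_mutex, css_set_rwsem or an RCU read lock */
	return task_css_set(task)->dfl_cgrp;
}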

Showing 2 changed files with 7 additions and 0 deletions

include/linux/cgroup.h
1 #ifndef _LINUX_CGROUP_H 1 #ifndef _LINUX_CGROUP_H
2 #define _LINUX_CGROUP_H 2 #define _LINUX_CGROUP_H
3 /* 3 /*
4 * cgroup interface 4 * cgroup interface
5 * 5 *
6 * Copyright (C) 2003 BULL SA 6 * Copyright (C) 2003 BULL SA
7 * Copyright (C) 2004-2006 Silicon Graphics, Inc. 7 * Copyright (C) 2004-2006 Silicon Graphics, Inc.
8 * 8 *
9 */ 9 */
10 10
11 #include <linux/sched.h> 11 #include <linux/sched.h>
12 #include <linux/cpumask.h> 12 #include <linux/cpumask.h>
13 #include <linux/nodemask.h> 13 #include <linux/nodemask.h>
14 #include <linux/rcupdate.h> 14 #include <linux/rcupdate.h>
15 #include <linux/rculist.h> 15 #include <linux/rculist.h>
16 #include <linux/cgroupstats.h> 16 #include <linux/cgroupstats.h>
17 #include <linux/rwsem.h> 17 #include <linux/rwsem.h>
18 #include <linux/idr.h> 18 #include <linux/idr.h>
19 #include <linux/workqueue.h> 19 #include <linux/workqueue.h>
20 #include <linux/fs.h> 20 #include <linux/fs.h>
21 #include <linux/percpu-refcount.h> 21 #include <linux/percpu-refcount.h>
22 #include <linux/seq_file.h> 22 #include <linux/seq_file.h>
23 #include <linux/kernfs.h> 23 #include <linux/kernfs.h>
24 24
25 #ifdef CONFIG_CGROUPS 25 #ifdef CONFIG_CGROUPS
26 26
27 struct cgroup_root; 27 struct cgroup_root;
28 struct cgroup_subsys; 28 struct cgroup_subsys;
29 struct inode; 29 struct inode;
30 struct cgroup; 30 struct cgroup;
31 31
32 extern int cgroup_init_early(void); 32 extern int cgroup_init_early(void);
33 extern int cgroup_init(void); 33 extern int cgroup_init(void);
34 extern void cgroup_fork(struct task_struct *p); 34 extern void cgroup_fork(struct task_struct *p);
35 extern void cgroup_post_fork(struct task_struct *p); 35 extern void cgroup_post_fork(struct task_struct *p);
36 extern void cgroup_exit(struct task_struct *p); 36 extern void cgroup_exit(struct task_struct *p);
37 extern int cgroupstats_build(struct cgroupstats *stats, 37 extern int cgroupstats_build(struct cgroupstats *stats,
38 struct dentry *dentry); 38 struct dentry *dentry);
39 39
40 extern int proc_cgroup_show(struct seq_file *, void *); 40 extern int proc_cgroup_show(struct seq_file *, void *);
41 41
42 /* define the enumeration of all cgroup subsystems */ 42 /* define the enumeration of all cgroup subsystems */
43 #define SUBSYS(_x) _x ## _cgrp_id, 43 #define SUBSYS(_x) _x ## _cgrp_id,
44 enum cgroup_subsys_id { 44 enum cgroup_subsys_id {
45 #include <linux/cgroup_subsys.h> 45 #include <linux/cgroup_subsys.h>
46 CGROUP_SUBSYS_COUNT, 46 CGROUP_SUBSYS_COUNT,
47 }; 47 };
48 #undef SUBSYS 48 #undef SUBSYS
49 49
50 /* Per-subsystem/per-cgroup state maintained by the system. */ 50 /* Per-subsystem/per-cgroup state maintained by the system. */
51 struct cgroup_subsys_state { 51 struct cgroup_subsys_state {
52 /* the cgroup that this css is attached to */ 52 /* the cgroup that this css is attached to */
53 struct cgroup *cgroup; 53 struct cgroup *cgroup;
54 54
55 /* the cgroup subsystem that this css is attached to */ 55 /* the cgroup subsystem that this css is attached to */
56 struct cgroup_subsys *ss; 56 struct cgroup_subsys *ss;
57 57
58 /* reference count - access via css_[try]get() and css_put() */ 58 /* reference count - access via css_[try]get() and css_put() */
59 struct percpu_ref refcnt; 59 struct percpu_ref refcnt;
60 60
61 /* the parent css */ 61 /* the parent css */
62 struct cgroup_subsys_state *parent; 62 struct cgroup_subsys_state *parent;
63 63
64 unsigned long flags; 64 unsigned long flags;
65 65
66 /* percpu_ref killing and RCU release */ 66 /* percpu_ref killing and RCU release */
67 struct rcu_head rcu_head; 67 struct rcu_head rcu_head;
68 struct work_struct destroy_work; 68 struct work_struct destroy_work;
69 }; 69 };
70 70
71 /* bits in struct cgroup_subsys_state flags field */ 71 /* bits in struct cgroup_subsys_state flags field */
72 enum { 72 enum {
73 CSS_ROOT = (1 << 0), /* this CSS is the root of the subsystem */ 73 CSS_ROOT = (1 << 0), /* this CSS is the root of the subsystem */
74 CSS_ONLINE = (1 << 1), /* between ->css_online() and ->css_offline() */ 74 CSS_ONLINE = (1 << 1), /* between ->css_online() and ->css_offline() */
75 }; 75 };
76 76
77 /** 77 /**
78 * css_get - obtain a reference on the specified css 78 * css_get - obtain a reference on the specified css
79 * @css: target css 79 * @css: target css
80 * 80 *
81 * The caller must already have a reference. 81 * The caller must already have a reference.
82 */ 82 */
83 static inline void css_get(struct cgroup_subsys_state *css) 83 static inline void css_get(struct cgroup_subsys_state *css)
84 { 84 {
85 /* We don't need to reference count the root state */ 85 /* We don't need to reference count the root state */
86 if (!(css->flags & CSS_ROOT)) 86 if (!(css->flags & CSS_ROOT))
87 percpu_ref_get(&css->refcnt); 87 percpu_ref_get(&css->refcnt);
88 } 88 }
89 89
90 /** 90 /**
91 * css_tryget - try to obtain a reference on the specified css 91 * css_tryget - try to obtain a reference on the specified css
92 * @css: target css 92 * @css: target css
93 * 93 *
94 * Obtain a reference on @css if it's alive. The caller naturally needs to 94 * Obtain a reference on @css if it's alive. The caller naturally needs to
95 * ensure that @css is accessible but doesn't have to be holding a 95 * ensure that @css is accessible but doesn't have to be holding a
96 * reference on it - IOW, RCU protected access is good enough for this 96 * reference on it - IOW, RCU protected access is good enough for this
97 * function. Returns %true if a reference count was successfully obtained; 97 * function. Returns %true if a reference count was successfully obtained;
98 * %false otherwise. 98 * %false otherwise.
99 */ 99 */
100 static inline bool css_tryget(struct cgroup_subsys_state *css) 100 static inline bool css_tryget(struct cgroup_subsys_state *css)
101 { 101 {
102 if (css->flags & CSS_ROOT) 102 if (css->flags & CSS_ROOT)
103 return true; 103 return true;
104 return percpu_ref_tryget(&css->refcnt); 104 return percpu_ref_tryget(&css->refcnt);
105 } 105 }
106 106
107 /** 107 /**
108 * css_put - put a css reference 108 * css_put - put a css reference
109 * @css: target css 109 * @css: target css
110 * 110 *
111 * Put a reference obtained via css_get() and css_tryget(). 111 * Put a reference obtained via css_get() and css_tryget().
112 */ 112 */
113 static inline void css_put(struct cgroup_subsys_state *css) 113 static inline void css_put(struct cgroup_subsys_state *css)
114 { 114 {
115 if (!(css->flags & CSS_ROOT)) 115 if (!(css->flags & CSS_ROOT))
116 percpu_ref_put(&css->refcnt); 116 percpu_ref_put(&css->refcnt);
117 } 117 }
118 118
119 /* bits in struct cgroup flags field */ 119 /* bits in struct cgroup flags field */
120 enum { 120 enum {
121 /* Control Group is dead */ 121 /* Control Group is dead */
122 CGRP_DEAD, 122 CGRP_DEAD,
123 /* 123 /*
124 * Control Group has previously had a child cgroup or a task, 124 * Control Group has previously had a child cgroup or a task,
125 * but no longer (only if CGRP_NOTIFY_ON_RELEASE is set) 125 * but no longer (only if CGRP_NOTIFY_ON_RELEASE is set)
126 */ 126 */
127 CGRP_RELEASABLE, 127 CGRP_RELEASABLE,
128 /* Control Group requires release notifications to userspace */ 128 /* Control Group requires release notifications to userspace */
129 CGRP_NOTIFY_ON_RELEASE, 129 CGRP_NOTIFY_ON_RELEASE,
130 /* 130 /*
131 * Clone the parent's configuration when creating a new child 131 * Clone the parent's configuration when creating a new child
132 * cpuset cgroup. For historical reasons, this option can be 132 * cpuset cgroup. For historical reasons, this option can be
133 * specified at mount time and thus is implemented here. 133 * specified at mount time and thus is implemented here.
134 */ 134 */
135 CGRP_CPUSET_CLONE_CHILDREN, 135 CGRP_CPUSET_CLONE_CHILDREN,
136 /* see the comment above CGRP_ROOT_SANE_BEHAVIOR for details */ 136 /* see the comment above CGRP_ROOT_SANE_BEHAVIOR for details */
137 CGRP_SANE_BEHAVIOR, 137 CGRP_SANE_BEHAVIOR,
138 }; 138 };
139 139
140 struct cgroup { 140 struct cgroup {
141 unsigned long flags; /* "unsigned long" so bitops work */ 141 unsigned long flags; /* "unsigned long" so bitops work */
142 142
143 /* 143 /*
144 * idr allocated in-hierarchy ID. 144 * idr allocated in-hierarchy ID.
145 * 145 *
146 * The ID of the root cgroup is always 0, and a new cgroup 146 * The ID of the root cgroup is always 0, and a new cgroup
147 * will be assigned the smallest available ID. 147 * will be assigned the smallest available ID.
148 * 148 *
149 * Allocating/Removing ID must be protected by cgroup_mutex. 149 * Allocating/Removing ID must be protected by cgroup_mutex.
150 */ 150 */
151 int id; 151 int id;
152 152
153 /* the number of attached css's */ 153 /* the number of attached css's */
154 int nr_css; 154 int nr_css;
155 155
156 atomic_t refcnt; 156 atomic_t refcnt;
157 157
158 /* 158 /*
159 * We link our 'sibling' struct into our parent's 'children'. 159 * We link our 'sibling' struct into our parent's 'children'.
160 * Our children link their 'sibling' into our 'children'. 160 * Our children link their 'sibling' into our 'children'.
161 */ 161 */
162 struct list_head sibling; /* my parent's children */ 162 struct list_head sibling; /* my parent's children */
163 struct list_head children; /* my children */ 163 struct list_head children; /* my children */
164 164
165 struct cgroup *parent; /* my parent */ 165 struct cgroup *parent; /* my parent */
166 struct kernfs_node *kn; /* cgroup kernfs entry */ 166 struct kernfs_node *kn; /* cgroup kernfs entry */
167 167
168 /* 168 /*
169 * Monotonically increasing unique serial number which defines a 169 * Monotonically increasing unique serial number which defines a
170 * uniform order among all cgroups. It's guaranteed that all 170 * uniform order among all cgroups. It's guaranteed that all
171 * ->children lists are in the ascending order of ->serial_nr. 171 * ->children lists are in the ascending order of ->serial_nr.
172 * It's used to allow interrupting and resuming iterations. 172 * It's used to allow interrupting and resuming iterations.
173 */ 173 */
174 u64 serial_nr; 174 u64 serial_nr;
175 175
176 /* the bitmask of subsystems enabled on the child cgroups */ 176 /* the bitmask of subsystems enabled on the child cgroups */
177 unsigned long child_subsys_mask; 177 unsigned long child_subsys_mask;
178 178
179 /* Private pointers for each registered subsystem */ 179 /* Private pointers for each registered subsystem */
180 struct cgroup_subsys_state __rcu *subsys[CGROUP_SUBSYS_COUNT]; 180 struct cgroup_subsys_state __rcu *subsys[CGROUP_SUBSYS_COUNT];
181 181
182 struct cgroup_root *root; 182 struct cgroup_root *root;
183 183
184 /* 184 /*
185 * List of cgrp_cset_links pointing at css_sets with tasks in this 185 * List of cgrp_cset_links pointing at css_sets with tasks in this
186 * cgroup. Protected by css_set_lock. 186 * cgroup. Protected by css_set_lock.
187 */ 187 */
188 struct list_head cset_links; 188 struct list_head cset_links;
189 189
190 /* 190 /*
191 * On the default hierarchy, a css_set for a cgroup with some 191 * On the default hierarchy, a css_set for a cgroup with some
192 * subsys disabled will point to css's which are associated with 192 * subsys disabled will point to css's which are associated with
193 * the closest ancestor which has the subsys enabled. The 193 * the closest ancestor which has the subsys enabled. The
194 * following lists all css_sets which point to this cgroup's css 194 * following lists all css_sets which point to this cgroup's css
195 * for the given subsystem. 195 * for the given subsystem.
196 */ 196 */
197 struct list_head e_csets[CGROUP_SUBSYS_COUNT]; 197 struct list_head e_csets[CGROUP_SUBSYS_COUNT];
198 198
199 /* 199 /*
200 * Linked list running through all cgroups that can 200 * Linked list running through all cgroups that can
201 * potentially be reaped by the release agent. Protected by 201 * potentially be reaped by the release agent. Protected by
202 * release_list_lock 202 * release_list_lock
203 */ 203 */
204 struct list_head release_list; 204 struct list_head release_list;
205 205
206 /* 206 /*
207 * list of pidlists, up to two for each namespace (one for procs, one 207 * list of pidlists, up to two for each namespace (one for procs, one
208 * for tasks); created on demand. 208 * for tasks); created on demand.
209 */ 209 */
210 struct list_head pidlists; 210 struct list_head pidlists;
211 struct mutex pidlist_mutex; 211 struct mutex pidlist_mutex;
212 212
213 /* dummy css with NULL ->ss, points back to this cgroup */ 213 /* dummy css with NULL ->ss, points back to this cgroup */
214 struct cgroup_subsys_state dummy_css; 214 struct cgroup_subsys_state dummy_css;
215 215
216 /* For css percpu_ref killing and RCU-protected deletion */ 216 /* For css percpu_ref killing and RCU-protected deletion */
217 struct rcu_head rcu_head; 217 struct rcu_head rcu_head;
218 struct work_struct destroy_work; 218 struct work_struct destroy_work;
219 }; 219 };
220 220
221 #define MAX_CGROUP_ROOT_NAMELEN 64 221 #define MAX_CGROUP_ROOT_NAMELEN 64
222 222
223 /* cgroup_root->flags */ 223 /* cgroup_root->flags */
224 enum { 224 enum {
225 /* 225 /*
226 * Unfortunately, cgroup core and various controllers are riddled 226 * Unfortunately, cgroup core and various controllers are riddled
227 * with idiosyncrasies and pointless options. The following flag, 227 * with idiosyncrasies and pointless options. The following flag,
228 * when set, will force sane behavior - some options are forced on, 228 * when set, will force sane behavior - some options are forced on,
229 * others are disallowed, and some controllers will change their 229 * others are disallowed, and some controllers will change their
230 * hierarchical or other behaviors. 230 * hierarchical or other behaviors.
231 * 231 *
232 * The set of behaviors affected by this flag are still being 232 * The set of behaviors affected by this flag are still being
233 * determined and developed and the mount option for this flag is 233 * determined and developed and the mount option for this flag is
234 * prefixed with __DEVEL__. The prefix will be dropped once we 234 * prefixed with __DEVEL__. The prefix will be dropped once we
235 * reach the point where all behaviors are compatible with the 235 * reach the point where all behaviors are compatible with the
236 * planned unified hierarchy, which will automatically turn on this 236 * planned unified hierarchy, which will automatically turn on this
237 * flag. 237 * flag.
238 * 238 *
239 * The following are the behaviors currently affected by this flag. 239 * The following are the behaviors currently affected by this flag.
240 * 240 *
241 * - Mount options "noprefix", "xattr", "clone_children", 241 * - Mount options "noprefix", "xattr", "clone_children",
242 * "release_agent" and "name" are disallowed. 242 * "release_agent" and "name" are disallowed.
243 * 243 *
244 * - When mounting an existing superblock, mount options should 244 * - When mounting an existing superblock, mount options should
245 * match. 245 * match.
246 * 246 *
247 * - Remount is disallowed. 247 * - Remount is disallowed.
248 * 248 *
249 * - rename(2) is disallowed. 249 * - rename(2) is disallowed.
250 * 250 *
251 * - "tasks" is removed. Everything should be at process 251 * - "tasks" is removed. Everything should be at process
252 * granularity. Use "cgroup.procs" instead. 252 * granularity. Use "cgroup.procs" instead.
253 * 253 *
254 * - "cgroup.procs" is not sorted. pids will be unique unless they 254 * - "cgroup.procs" is not sorted. pids will be unique unless they
255 * got recycled in between reads. 255 * got recycled in between reads.
256 * 256 *
257 * - "release_agent" and "notify_on_release" are removed. 257 * - "release_agent" and "notify_on_release" are removed.
258 * Replacement notification mechanism will be implemented. 258 * Replacement notification mechanism will be implemented.
259 * 259 *
260 * - "cgroup.clone_children" is removed. 260 * - "cgroup.clone_children" is removed.
261 * 261 *
262 * - If mount is requested with sane_behavior but without any 262 * - If mount is requested with sane_behavior but without any
263 * subsystem, the default unified hierarchy is mounted. 263 * subsystem, the default unified hierarchy is mounted.
264 * 264 *
265 * - cpuset: tasks will be kept in empty cpusets when hotplug happens 265 * - cpuset: tasks will be kept in empty cpusets when hotplug happens
266 * and take masks of ancestors with non-empty cpus/mems, instead of 266 * and take masks of ancestors with non-empty cpus/mems, instead of
267 * being moved to an ancestor. 267 * being moved to an ancestor.
268 * 268 *
269 * - cpuset: a task can be moved into an empty cpuset, and again it 269 * - cpuset: a task can be moved into an empty cpuset, and again it
270 * takes masks of ancestors. 270 * takes masks of ancestors.
271 * 271 *
272 * - memcg: use_hierarchy is on by default and the cgroup file for 272 * - memcg: use_hierarchy is on by default and the cgroup file for
273 * the flag is not created. 273 * the flag is not created.
274 * 274 *
275 * - blkcg: blk-throttle becomes properly hierarchical. 275 * - blkcg: blk-throttle becomes properly hierarchical.
276 */ 276 */
277 CGRP_ROOT_SANE_BEHAVIOR = (1 << 0), 277 CGRP_ROOT_SANE_BEHAVIOR = (1 << 0),
278 278
279 CGRP_ROOT_NOPREFIX = (1 << 1), /* mounted subsystems have no named prefix */ 279 CGRP_ROOT_NOPREFIX = (1 << 1), /* mounted subsystems have no named prefix */
280 CGRP_ROOT_XATTR = (1 << 2), /* supports extended attributes */ 280 CGRP_ROOT_XATTR = (1 << 2), /* supports extended attributes */
281 281
282 /* mount options live below bit 16 */ 282 /* mount options live below bit 16 */
283 CGRP_ROOT_OPTION_MASK = (1 << 16) - 1, 283 CGRP_ROOT_OPTION_MASK = (1 << 16) - 1,
284 }; 284 };
285 285
286 /* 286 /*
287 * A cgroup_root represents the root of a cgroup hierarchy, and may be 287 * A cgroup_root represents the root of a cgroup hierarchy, and may be
288 * associated with a kernfs_root to form an active hierarchy. This is 288 * associated with a kernfs_root to form an active hierarchy. This is
289 * internal to cgroup core. Don't access directly from controllers. 289 * internal to cgroup core. Don't access directly from controllers.
290 */ 290 */
291 struct cgroup_root { 291 struct cgroup_root {
292 struct kernfs_root *kf_root; 292 struct kernfs_root *kf_root;
293 293
294 /* The bitmask of subsystems attached to this hierarchy */ 294 /* The bitmask of subsystems attached to this hierarchy */
295 unsigned long subsys_mask; 295 unsigned long subsys_mask;
296 296
297 /* Unique id for this hierarchy. */ 297 /* Unique id for this hierarchy. */
298 int hierarchy_id; 298 int hierarchy_id;
299 299
300 /* The root cgroup. Root is destroyed on its release. */ 300 /* The root cgroup. Root is destroyed on its release. */
301 struct cgroup cgrp; 301 struct cgroup cgrp;
302 302
303 /* Number of cgroups in the hierarchy, used only for /proc/cgroups */ 303 /* Number of cgroups in the hierarchy, used only for /proc/cgroups */
304 atomic_t nr_cgrps; 304 atomic_t nr_cgrps;
305 305
306 /* A list running through the active hierarchies */ 306 /* A list running through the active hierarchies */
307 struct list_head root_list; 307 struct list_head root_list;
308 308
309 /* Hierarchy-specific flags */ 309 /* Hierarchy-specific flags */
310 unsigned long flags; 310 unsigned long flags;
311 311
312 /* IDs for cgroups in this hierarchy */ 312 /* IDs for cgroups in this hierarchy */
313 struct idr cgroup_idr; 313 struct idr cgroup_idr;
314 314
315 /* The path to use for release notifications. */ 315 /* The path to use for release notifications. */
316 char release_agent_path[PATH_MAX]; 316 char release_agent_path[PATH_MAX];
317 317
318 /* The name for this hierarchy - may be empty */ 318 /* The name for this hierarchy - may be empty */
319 char name[MAX_CGROUP_ROOT_NAMELEN]; 319 char name[MAX_CGROUP_ROOT_NAMELEN];
320 }; 320 };
321 321
322 /* 322 /*
323 * A css_set is a structure holding pointers to a set of 323 * A css_set is a structure holding pointers to a set of
324 * cgroup_subsys_state objects. This saves space in the task struct 324 * cgroup_subsys_state objects. This saves space in the task struct
325 * object and speeds up fork()/exit(), since a single inc/dec and a 325 * object and speeds up fork()/exit(), since a single inc/dec and a
326 * list_add()/del() can bump the reference count on the entire cgroup 326 * list_add()/del() can bump the reference count on the entire cgroup
327 * set for a task. 327 * set for a task.
328 */ 328 */
329 329
330 struct css_set { 330 struct css_set {
331 331
332 /* Reference count */ 332 /* Reference count */
333 atomic_t refcount; 333 atomic_t refcount;
334 334
335 /* 335 /*
336 * List running through all cgroup groups in the same hash 336 * List running through all cgroup groups in the same hash
337 * slot. Protected by css_set_lock 337 * slot. Protected by css_set_lock
338 */ 338 */
339 struct hlist_node hlist; 339 struct hlist_node hlist;
340 340
341 /* 341 /*
342 * Lists running through all tasks using this cgroup group. 342 * Lists running through all tasks using this cgroup group.
343 * mg_tasks lists tasks which belong to this cset but are in the 343 * mg_tasks lists tasks which belong to this cset but are in the
344 * process of being migrated out or in. Protected by 344 * process of being migrated out or in. Protected by
345 * css_set_rwsem, but, during migration, once tasks are moved to 345 * css_set_rwsem, but, during migration, once tasks are moved to
346 * mg_tasks, it can be read safely while holding cgroup_mutex. 346 * mg_tasks, it can be read safely while holding cgroup_mutex.
347 */ 347 */
348 struct list_head tasks; 348 struct list_head tasks;
349 struct list_head mg_tasks; 349 struct list_head mg_tasks;
350 350
351 /* 351 /*
352 * List of cgrp_cset_links pointing at cgroups referenced from this 352 * List of cgrp_cset_links pointing at cgroups referenced from this
353 * css_set. Protected by css_set_lock. 353 * css_set. Protected by css_set_lock.
354 */ 354 */
355 struct list_head cgrp_links; 355 struct list_head cgrp_links;
356 356
357 /* the default cgroup associated with this css_set */
358 struct cgroup *dfl_cgrp;
359
357 /* 360 /*
358 * Set of subsystem states, one for each subsystem. This array is 361 * Set of subsystem states, one for each subsystem. This array is
359 * immutable after creation apart from the init_css_set during 362 * immutable after creation apart from the init_css_set during
360 * subsystem registration (at boot time). 363 * subsystem registration (at boot time).
361 */ 364 */
362 struct cgroup_subsys_state *subsys[CGROUP_SUBSYS_COUNT]; 365 struct cgroup_subsys_state *subsys[CGROUP_SUBSYS_COUNT];
363 366
364 /* 367 /*
365 * List of csets participating in the on-going migration either as 368 * List of csets participating in the on-going migration either as
366 * source or destination. Protected by cgroup_mutex. 369 * source or destination. Protected by cgroup_mutex.
367 */ 370 */
368 struct list_head mg_preload_node; 371 struct list_head mg_preload_node;
369 struct list_head mg_node; 372 struct list_head mg_node;
370 373
371 /* 374 /*
372 * If this cset is acting as the source of migration the following 375 * If this cset is acting as the source of migration the following
373 * two fields are set. mg_src_cgrp is the source cgroup of the 376 * two fields are set. mg_src_cgrp is the source cgroup of the
374 * on-going migration and mg_dst_cset is the destination cset the 377 * on-going migration and mg_dst_cset is the destination cset the
375 * target tasks on this cset should be migrated to. Protected by 378 * target tasks on this cset should be migrated to. Protected by
376 * cgroup_mutex. 379 * cgroup_mutex.
377 */ 380 */
378 struct cgroup *mg_src_cgrp; 381 struct cgroup *mg_src_cgrp;
379 struct css_set *mg_dst_cset; 382 struct css_set *mg_dst_cset;
380 383
381 /* 384 /*
382 * On the default hierarchy, ->subsys[ssid] may point to a css 385 * On the default hierarchy, ->subsys[ssid] may point to a css
383 * attached to an ancestor instead of the cgroup this css_set is 386 * attached to an ancestor instead of the cgroup this css_set is
384 * associated with. The following node is anchored at 387 * associated with. The following node is anchored at
385 * ->subsys[ssid]->cgroup->e_csets[ssid] and provides a way to 388 * ->subsys[ssid]->cgroup->e_csets[ssid] and provides a way to
386 * iterate through all css's attached to a given cgroup. 389 * iterate through all css's attached to a given cgroup.
387 */ 390 */
388 struct list_head e_cset_node[CGROUP_SUBSYS_COUNT]; 391 struct list_head e_cset_node[CGROUP_SUBSYS_COUNT];
389 392
390 /* For RCU-protected deletion */ 393 /* For RCU-protected deletion */
391 struct rcu_head rcu_head; 394 struct rcu_head rcu_head;
392 }; 395 };
393 396
394 /* 397 /*
395 * struct cftype: handler definitions for cgroup control files 398 * struct cftype: handler definitions for cgroup control files
396 * 399 *
397 * When reading/writing to a file: 400 * When reading/writing to a file:
398 * - the cgroup to use is file->f_dentry->d_parent->d_fsdata 401 * - the cgroup to use is file->f_dentry->d_parent->d_fsdata
399 * - the 'cftype' of the file is file->f_dentry->d_fsdata 402 * - the 'cftype' of the file is file->f_dentry->d_fsdata
400 */ 403 */
401 404
402 /* cftype->flags */ 405 /* cftype->flags */
403 enum { 406 enum {
404 CFTYPE_ONLY_ON_ROOT = (1 << 0), /* only create on root cgrp */ 407 CFTYPE_ONLY_ON_ROOT = (1 << 0), /* only create on root cgrp */
405 CFTYPE_NOT_ON_ROOT = (1 << 1), /* don't create on root cgrp */ 408 CFTYPE_NOT_ON_ROOT = (1 << 1), /* don't create on root cgrp */
406 CFTYPE_INSANE = (1 << 2), /* don't create if sane_behavior */ 409 CFTYPE_INSANE = (1 << 2), /* don't create if sane_behavior */
407 CFTYPE_NO_PREFIX = (1 << 3), /* (DON'T USE FOR NEW FILES) no subsys prefix */ 410 CFTYPE_NO_PREFIX = (1 << 3), /* (DON'T USE FOR NEW FILES) no subsys prefix */
408 CFTYPE_ONLY_ON_DFL = (1 << 4), /* only on default hierarchy */ 411 CFTYPE_ONLY_ON_DFL = (1 << 4), /* only on default hierarchy */
409 }; 412 };
410 413
411 #define MAX_CFTYPE_NAME 64 414 #define MAX_CFTYPE_NAME 64
412 415
413 struct cftype { 416 struct cftype {
414 /* 417 /*
415 * By convention, the name should begin with the name of the 418 * By convention, the name should begin with the name of the
416 * subsystem, followed by a period. Zero length string indicates 419 * subsystem, followed by a period. Zero length string indicates
417 * end of cftype array. 420 * end of cftype array.
418 */ 421 */
419 char name[MAX_CFTYPE_NAME]; 422 char name[MAX_CFTYPE_NAME];
420 int private; 423 int private;
421 /* 424 /*
422 * If not 0, file mode is set to this value, otherwise it will 425 * If not 0, file mode is set to this value, otherwise it will
423 * be figured out automatically 426 * be figured out automatically
424 */ 427 */
425 umode_t mode; 428 umode_t mode;
426 429
427 /* 430 /*
428 * The maximum length of string, excluding trailing nul, that can 431 * The maximum length of string, excluding trailing nul, that can
429 * be passed to write_string. If < PAGE_SIZE-1, PAGE_SIZE-1 is 432 * be passed to write_string. If < PAGE_SIZE-1, PAGE_SIZE-1 is
430 * assumed. 433 * assumed.
431 */ 434 */
432 size_t max_write_len; 435 size_t max_write_len;
433 436
434 /* CFTYPE_* flags */ 437 /* CFTYPE_* flags */
435 unsigned int flags; 438 unsigned int flags;
436 439
437 /* 440 /*
438 * Fields used for internal bookkeeping. Initialized automatically 441 * Fields used for internal bookkeeping. Initialized automatically
439 * during registration. 442 * during registration.
440 */ 443 */
441 struct cgroup_subsys *ss; /* NULL for cgroup core files */ 444 struct cgroup_subsys *ss; /* NULL for cgroup core files */
442 struct list_head node; /* anchored at ss->cfts */ 445 struct list_head node; /* anchored at ss->cfts */
443 struct kernfs_ops *kf_ops; 446 struct kernfs_ops *kf_ops;
444 447
445 /* 448 /*
446 * read_u64() is a shortcut for the common case of returning a 449 * read_u64() is a shortcut for the common case of returning a
447 * single integer. Use it in place of read() 450 * single integer. Use it in place of read()
448 */ 451 */
449 u64 (*read_u64)(struct cgroup_subsys_state *css, struct cftype *cft); 452 u64 (*read_u64)(struct cgroup_subsys_state *css, struct cftype *cft);
450 /* 453 /*
451 * read_s64() is a signed version of read_u64() 454 * read_s64() is a signed version of read_u64()
452 */ 455 */
453 s64 (*read_s64)(struct cgroup_subsys_state *css, struct cftype *cft); 456 s64 (*read_s64)(struct cgroup_subsys_state *css, struct cftype *cft);
454 457
455 /* generic seq_file read interface */ 458 /* generic seq_file read interface */
456 int (*seq_show)(struct seq_file *sf, void *v); 459 int (*seq_show)(struct seq_file *sf, void *v);
457 460
458 /* optional ops, implement all or none */ 461 /* optional ops, implement all or none */
459 void *(*seq_start)(struct seq_file *sf, loff_t *ppos); 462 void *(*seq_start)(struct seq_file *sf, loff_t *ppos);
460 void *(*seq_next)(struct seq_file *sf, void *v, loff_t *ppos); 463 void *(*seq_next)(struct seq_file *sf, void *v, loff_t *ppos);
461 void (*seq_stop)(struct seq_file *sf, void *v); 464 void (*seq_stop)(struct seq_file *sf, void *v);
462 465
463 /* 466 /*
464 * write_u64() is a shortcut for the common case of accepting 467 * write_u64() is a shortcut for the common case of accepting
465 * a single integer (as parsed by simple_strtoull) from 468 * a single integer (as parsed by simple_strtoull) from
466 * userspace. Use in place of write(); return 0 or error. 469 * userspace. Use in place of write(); return 0 or error.
467 */ 470 */
468 int (*write_u64)(struct cgroup_subsys_state *css, struct cftype *cft, 471 int (*write_u64)(struct cgroup_subsys_state *css, struct cftype *cft,
469 u64 val); 472 u64 val);
470 /* 473 /*
471 * write_s64() is a signed version of write_u64() 474 * write_s64() is a signed version of write_u64()
472 */ 475 */
473 int (*write_s64)(struct cgroup_subsys_state *css, struct cftype *cft, 476 int (*write_s64)(struct cgroup_subsys_state *css, struct cftype *cft,
474 s64 val); 477 s64 val);
475 478
476 /* 479 /*
477 * write_string() is passed a nul-terminated kernelspace 480 * write_string() is passed a nul-terminated kernelspace
478 * buffer of maximum length determined by max_write_len. 481 * buffer of maximum length determined by max_write_len.
479 * Returns 0 or -ve error code. 482 * Returns 0 or -ve error code.
480 */ 483 */
481 int (*write_string)(struct cgroup_subsys_state *css, struct cftype *cft, 484 int (*write_string)(struct cgroup_subsys_state *css, struct cftype *cft,
482 char *buffer); 485 char *buffer);
483 /* 486 /*
484 * trigger() callback can be used to get some kick from the 487 * trigger() callback can be used to get some kick from the
485 * userspace, when the actual string written is not important 488 * userspace, when the actual string written is not important
486 * at all. The private field can be used to determine the 489 * at all. The private field can be used to determine the
487 * kick type for multiplexing. 490 * kick type for multiplexing.
488 */ 491 */
489 int (*trigger)(struct cgroup_subsys_state *css, unsigned int event); 492 int (*trigger)(struct cgroup_subsys_state *css, unsigned int event);
490 493
491 #ifdef CONFIG_DEBUG_LOCK_ALLOC 494 #ifdef CONFIG_DEBUG_LOCK_ALLOC
492 struct lock_class_key lockdep_key; 495 struct lock_class_key lockdep_key;
493 #endif 496 #endif
494 }; 497 };
495 498
496 extern struct cgroup_root cgrp_dfl_root; 499 extern struct cgroup_root cgrp_dfl_root;
497 500
498 static inline bool cgroup_on_dfl(const struct cgroup *cgrp) 501 static inline bool cgroup_on_dfl(const struct cgroup *cgrp)
499 { 502 {
500 return cgrp->root == &cgrp_dfl_root; 503 return cgrp->root == &cgrp_dfl_root;
501 } 504 }
502 505
503 /* 506 /*
504 * See the comment above CGRP_ROOT_SANE_BEHAVIOR for details. This 507 * See the comment above CGRP_ROOT_SANE_BEHAVIOR for details. This
505 * function can be called as long as @cgrp is accessible. 508 * function can be called as long as @cgrp is accessible.
506 */ 509 */
507 static inline bool cgroup_sane_behavior(const struct cgroup *cgrp) 510 static inline bool cgroup_sane_behavior(const struct cgroup *cgrp)
508 { 511 {
509 return cgrp->root->flags & CGRP_ROOT_SANE_BEHAVIOR; 512 return cgrp->root->flags & CGRP_ROOT_SANE_BEHAVIOR;
510 } 513 }
511 514
512 /* no synchronization, the result can only be used as a hint */ 515 /* no synchronization, the result can only be used as a hint */
513 static inline bool cgroup_has_tasks(struct cgroup *cgrp) 516 static inline bool cgroup_has_tasks(struct cgroup *cgrp)
514 { 517 {
515 return !list_empty(&cgrp->cset_links); 518 return !list_empty(&cgrp->cset_links);
516 } 519 }
517 520
518 /* returns ino associated with a cgroup, 0 indicates unmounted root */ 521 /* returns ino associated with a cgroup, 0 indicates unmounted root */
519 static inline ino_t cgroup_ino(struct cgroup *cgrp) 522 static inline ino_t cgroup_ino(struct cgroup *cgrp)
520 { 523 {
521 if (cgrp->kn) 524 if (cgrp->kn)
522 return cgrp->kn->ino; 525 return cgrp->kn->ino;
523 else 526 else
524 return 0; 527 return 0;
525 } 528 }
526 529
527 static inline struct cftype *seq_cft(struct seq_file *seq) 530 static inline struct cftype *seq_cft(struct seq_file *seq)
528 { 531 {
529 struct kernfs_open_file *of = seq->private; 532 struct kernfs_open_file *of = seq->private;
530 533
531 return of->kn->priv; 534 return of->kn->priv;
532 } 535 }
533 536
534 struct cgroup_subsys_state *seq_css(struct seq_file *seq); 537 struct cgroup_subsys_state *seq_css(struct seq_file *seq);
535 538
536 /* 539 /*
537 * Name / path handling functions. All are thin wrappers around the kernfs 540 * Name / path handling functions. All are thin wrappers around the kernfs
538 * counterparts and can be called under any context. 541 * counterparts and can be called under any context.
539 */ 542 */
540 543
541 static inline int cgroup_name(struct cgroup *cgrp, char *buf, size_t buflen) 544 static inline int cgroup_name(struct cgroup *cgrp, char *buf, size_t buflen)
542 { 545 {
543 return kernfs_name(cgrp->kn, buf, buflen); 546 return kernfs_name(cgrp->kn, buf, buflen);
544 } 547 }
545 548
546 static inline char * __must_check cgroup_path(struct cgroup *cgrp, char *buf, 549 static inline char * __must_check cgroup_path(struct cgroup *cgrp, char *buf,
547 size_t buflen) 550 size_t buflen)
548 { 551 {
549 return kernfs_path(cgrp->kn, buf, buflen); 552 return kernfs_path(cgrp->kn, buf, buflen);
550 } 553 }
551 554
552 static inline void pr_cont_cgroup_name(struct cgroup *cgrp) 555 static inline void pr_cont_cgroup_name(struct cgroup *cgrp)
553 { 556 {
554 pr_cont_kernfs_name(cgrp->kn); 557 pr_cont_kernfs_name(cgrp->kn);
555 } 558 }
556 559
557 static inline void pr_cont_cgroup_path(struct cgroup *cgrp) 560 static inline void pr_cont_cgroup_path(struct cgroup *cgrp)
558 { 561 {
559 pr_cont_kernfs_path(cgrp->kn); 562 pr_cont_kernfs_path(cgrp->kn);
560 } 563 }
561 564
562 char *task_cgroup_path(struct task_struct *task, char *buf, size_t buflen); 565 char *task_cgroup_path(struct task_struct *task, char *buf, size_t buflen);
563 566
564 int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts); 567 int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts);
565 int cgroup_rm_cftypes(struct cftype *cfts); 568 int cgroup_rm_cftypes(struct cftype *cfts);
566 569
567 bool cgroup_is_descendant(struct cgroup *cgrp, struct cgroup *ancestor); 570 bool cgroup_is_descendant(struct cgroup *cgrp, struct cgroup *ancestor);
568 571
569 /* 572 /*
570 * Control Group taskset, used to pass around set of tasks to cgroup_subsys 573 * Control Group taskset, used to pass around set of tasks to cgroup_subsys
571 * methods. 574 * methods.
572 */ 575 */
573 struct cgroup_taskset; 576 struct cgroup_taskset;
574 struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset); 577 struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset);
575 struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset); 578 struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset);
576 579
577 /** 580 /**
578 * cgroup_taskset_for_each - iterate cgroup_taskset 581 * cgroup_taskset_for_each - iterate cgroup_taskset
579 * @task: the loop cursor 582 * @task: the loop cursor
580 * @tset: taskset to iterate 583 * @tset: taskset to iterate
581 */ 584 */
582 #define cgroup_taskset_for_each(task, tset) \ 585 #define cgroup_taskset_for_each(task, tset) \
583 for ((task) = cgroup_taskset_first((tset)); (task); \ 586 for ((task) = cgroup_taskset_first((tset)); (task); \
584 (task) = cgroup_taskset_next((tset))) 587 (task) = cgroup_taskset_next((tset)))
585 588
586 /* 589 /*
587 * Control Group subsystem type. 590 * Control Group subsystem type.
588 * See Documentation/cgroups/cgroups.txt for details 591 * See Documentation/cgroups/cgroups.txt for details
589 */ 592 */
590 593
591 struct cgroup_subsys { 594 struct cgroup_subsys {
592 struct cgroup_subsys_state *(*css_alloc)(struct cgroup_subsys_state *parent_css); 595 struct cgroup_subsys_state *(*css_alloc)(struct cgroup_subsys_state *parent_css);
593 int (*css_online)(struct cgroup_subsys_state *css); 596 int (*css_online)(struct cgroup_subsys_state *css);
594 void (*css_offline)(struct cgroup_subsys_state *css); 597 void (*css_offline)(struct cgroup_subsys_state *css);
595 void (*css_free)(struct cgroup_subsys_state *css); 598 void (*css_free)(struct cgroup_subsys_state *css);
596 599
597 int (*can_attach)(struct cgroup_subsys_state *css, 600 int (*can_attach)(struct cgroup_subsys_state *css,
598 struct cgroup_taskset *tset); 601 struct cgroup_taskset *tset);
599 void (*cancel_attach)(struct cgroup_subsys_state *css, 602 void (*cancel_attach)(struct cgroup_subsys_state *css,
600 struct cgroup_taskset *tset); 603 struct cgroup_taskset *tset);
601 void (*attach)(struct cgroup_subsys_state *css, 604 void (*attach)(struct cgroup_subsys_state *css,
602 struct cgroup_taskset *tset); 605 struct cgroup_taskset *tset);
603 void (*fork)(struct task_struct *task); 606 void (*fork)(struct task_struct *task);
604 void (*exit)(struct cgroup_subsys_state *css, 607 void (*exit)(struct cgroup_subsys_state *css,
605 struct cgroup_subsys_state *old_css, 608 struct cgroup_subsys_state *old_css,
606 struct task_struct *task); 609 struct task_struct *task);
607 void (*bind)(struct cgroup_subsys_state *root_css); 610 void (*bind)(struct cgroup_subsys_state *root_css);
608 611
609 int disabled; 612 int disabled;
610 int early_init; 613 int early_init;
611 614
612 /* 615 /*
613 * If %false, this subsystem is properly hierarchical - 616 * If %false, this subsystem is properly hierarchical -
614 * configuration, resource accounting and restriction on a parent 617 * configuration, resource accounting and restriction on a parent
615 * cgroup cover those of its children. If %true, hierarchy support 618 * cgroup cover those of its children. If %true, hierarchy support
616 * is broken in some ways - some subsystems ignore hierarchy 619 * is broken in some ways - some subsystems ignore hierarchy
617 * completely while others are only implemented half-way. 620 * completely while others are only implemented half-way.
618 * 621 *
619 * It's now disallowed to create nested cgroups if the subsystem is 622 * It's now disallowed to create nested cgroups if the subsystem is
620 * broken and cgroup core will emit a warning message on such 623 * broken and cgroup core will emit a warning message on such
621 * cases. Eventually, all subsystems will be made properly 624 * cases. Eventually, all subsystems will be made properly
622 * hierarchical and this will go away. 625 * hierarchical and this will go away.
623 */ 626 */
624 bool broken_hierarchy; 627 bool broken_hierarchy;
625 bool warned_broken_hierarchy; 628 bool warned_broken_hierarchy;
626 629
627 /* the following two fields are initialized automatically during boot */ 630 /* the following two fields are initialized automatically during boot */
628 int id; 631 int id;
629 #define MAX_CGROUP_TYPE_NAMELEN 32 632 #define MAX_CGROUP_TYPE_NAMELEN 32
630 const char *name; 633 const char *name;
631 634
632 /* link to parent, protected by cgroup_lock() */ 635 /* link to parent, protected by cgroup_lock() */
633 struct cgroup_root *root; 636 struct cgroup_root *root;
634 637
635 /* 638 /*
636 * List of cftypes. Each entry is the first entry of an array 639 * List of cftypes. Each entry is the first entry of an array
637 * terminated by zero length name. 640 * terminated by zero length name.
638 */ 641 */
639 struct list_head cfts; 642 struct list_head cfts;
640 643
641 /* base cftypes, automatically registered with subsys itself */ 644 /* base cftypes, automatically registered with subsys itself */
642 struct cftype *base_cftypes; 645 struct cftype *base_cftypes;
643 }; 646 };
644 647
645 #define SUBSYS(_x) extern struct cgroup_subsys _x ## _cgrp_subsys; 648 #define SUBSYS(_x) extern struct cgroup_subsys _x ## _cgrp_subsys;
646 #include <linux/cgroup_subsys.h> 649 #include <linux/cgroup_subsys.h>
647 #undef SUBSYS 650 #undef SUBSYS
648 651
649 /** 652 /**
650 * css_parent - find the parent css 653 * css_parent - find the parent css
651 * @css: the target cgroup_subsys_state 654 * @css: the target cgroup_subsys_state
652 * 655 *
653 * Return the parent css of @css. This function is guaranteed to return 656 * Return the parent css of @css. This function is guaranteed to return
654 * non-NULL parent as long as @css isn't the root. 657 * non-NULL parent as long as @css isn't the root.
655 */ 658 */
656 static inline 659 static inline
657 struct cgroup_subsys_state *css_parent(struct cgroup_subsys_state *css) 660 struct cgroup_subsys_state *css_parent(struct cgroup_subsys_state *css)
658 { 661 {
659 return css->parent; 662 return css->parent;
660 } 663 }
661 664
662 /** 665 /**
663 * task_css_set_check - obtain a task's css_set with extra access conditions 666 * task_css_set_check - obtain a task's css_set with extra access conditions
664 * @task: the task to obtain css_set for 667 * @task: the task to obtain css_set for
665 * @__c: extra condition expression to be passed to rcu_dereference_check() 668 * @__c: extra condition expression to be passed to rcu_dereference_check()
666 * 669 *
667 * A task's css_set is RCU protected, initialized and exited while holding 670 * A task's css_set is RCU protected, initialized and exited while holding
668 * task_lock(), and can only be modified while holding both cgroup_mutex 671 * task_lock(), and can only be modified while holding both cgroup_mutex
669 * and task_lock() while the task is alive. This macro verifies that the 672 * and task_lock() while the task is alive. This macro verifies that the
670 * caller is inside proper critical section and returns @task's css_set. 673 * caller is inside proper critical section and returns @task's css_set.
671 * 674 *
672 * The caller can also specify additional allowed conditions via @__c, such 675 * The caller can also specify additional allowed conditions via @__c, such
673 * as locks used during the cgroup_subsys::attach() methods. 676 * as locks used during the cgroup_subsys::attach() methods.
674 */ 677 */
675 #ifdef CONFIG_PROVE_RCU 678 #ifdef CONFIG_PROVE_RCU
676 extern struct mutex cgroup_mutex; 679 extern struct mutex cgroup_mutex;
677 extern struct rw_semaphore css_set_rwsem; 680 extern struct rw_semaphore css_set_rwsem;
678 #define task_css_set_check(task, __c) \ 681 #define task_css_set_check(task, __c) \
679 rcu_dereference_check((task)->cgroups, \ 682 rcu_dereference_check((task)->cgroups, \
680 lockdep_is_held(&cgroup_mutex) || \ 683 lockdep_is_held(&cgroup_mutex) || \
681 lockdep_is_held(&css_set_rwsem) || \ 684 lockdep_is_held(&css_set_rwsem) || \
682 ((task)->flags & PF_EXITING) || (__c)) 685 ((task)->flags & PF_EXITING) || (__c))
683 #else 686 #else
684 #define task_css_set_check(task, __c) \ 687 #define task_css_set_check(task, __c) \
685 rcu_dereference((task)->cgroups) 688 rcu_dereference((task)->cgroups)
686 #endif 689 #endif
687 690
688 /** 691 /**
689 * task_css_check - obtain css for (task, subsys) w/ extra access conds 692 * task_css_check - obtain css for (task, subsys) w/ extra access conds
690 * @task: the target task 693 * @task: the target task
691 * @subsys_id: the target subsystem ID 694 * @subsys_id: the target subsystem ID
692 * @__c: extra condition expression to be passed to rcu_dereference_check() 695 * @__c: extra condition expression to be passed to rcu_dereference_check()
693 * 696 *
694 * Return the cgroup_subsys_state for the (@task, @subsys_id) pair. The 697 * Return the cgroup_subsys_state for the (@task, @subsys_id) pair. The
695 * synchronization rules are the same as task_css_set_check(). 698 * synchronization rules are the same as task_css_set_check().
696 */ 699 */
697 #define task_css_check(task, subsys_id, __c) \ 700 #define task_css_check(task, subsys_id, __c) \
698 task_css_set_check((task), (__c))->subsys[(subsys_id)] 701 task_css_set_check((task), (__c))->subsys[(subsys_id)]
699 702
700 /** 703 /**
701 * task_css_set - obtain a task's css_set 704 * task_css_set - obtain a task's css_set
702 * @task: the task to obtain css_set for 705 * @task: the task to obtain css_set for
703 * 706 *
704 * See task_css_set_check(). 707 * See task_css_set_check().
705 */ 708 */
706 static inline struct css_set *task_css_set(struct task_struct *task) 709 static inline struct css_set *task_css_set(struct task_struct *task)
707 { 710 {
708 return task_css_set_check(task, false); 711 return task_css_set_check(task, false);
709 } 712 }
710 713
711 /** 714 /**
712 * task_css - obtain css for (task, subsys) 715 * task_css - obtain css for (task, subsys)
713 * @task: the target task 716 * @task: the target task
714 * @subsys_id: the target subsystem ID 717 * @subsys_id: the target subsystem ID
715 * 718 *
716 * See task_css_check(). 719 * See task_css_check().
717 */ 720 */
718 static inline struct cgroup_subsys_state *task_css(struct task_struct *task, 721 static inline struct cgroup_subsys_state *task_css(struct task_struct *task,
719 int subsys_id) 722 int subsys_id)
720 { 723 {
721 return task_css_check(task, subsys_id, false); 724 return task_css_check(task, subsys_id, false);
722 } 725 }
723 726
724 static inline struct cgroup *task_cgroup(struct task_struct *task, 727 static inline struct cgroup *task_cgroup(struct task_struct *task,
725 int subsys_id) 728 int subsys_id)
726 { 729 {
727 return task_css(task, subsys_id)->cgroup; 730 return task_css(task, subsys_id)->cgroup;
728 } 731 }
729 732
730 struct cgroup_subsys_state *css_next_child(struct cgroup_subsys_state *pos, 733 struct cgroup_subsys_state *css_next_child(struct cgroup_subsys_state *pos,
731 struct cgroup_subsys_state *parent); 734 struct cgroup_subsys_state *parent);
732 735
733 struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss); 736 struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss);
734 737
735 /** 738 /**
736 * css_for_each_child - iterate through children of a css 739 * css_for_each_child - iterate through children of a css
737 * @pos: the css * to use as the loop cursor 740 * @pos: the css * to use as the loop cursor
738 * @parent: css whose children to walk 741 * @parent: css whose children to walk
739 * 742 *
740 * Walk @parent's children. Must be called under rcu_read_lock(). A child 743 * Walk @parent's children. Must be called under rcu_read_lock(). A child
741 * css which hasn't finished ->css_online() or already has finished 744 * css which hasn't finished ->css_online() or already has finished
742 * ->css_offline() may show up during traversal and it's each subsystem's 745 * ->css_offline() may show up during traversal and it's each subsystem's
743 * responsibility to verify that each @pos is alive. 746 * responsibility to verify that each @pos is alive.
744 * 747 *
745 * If a subsystem synchronizes against the parent in its ->css_online() and 748 * If a subsystem synchronizes against the parent in its ->css_online() and
746 * before starting iterating, a css which finished ->css_online() is 749 * before starting iterating, a css which finished ->css_online() is
747 * guaranteed to be visible in the future iterations. 750 * guaranteed to be visible in the future iterations.
748 * 751 *
749 * It is allowed to temporarily drop RCU read lock during iteration. The 752 * It is allowed to temporarily drop RCU read lock during iteration. The
750 * caller is responsible for ensuring that @pos remains accessible until 753 * caller is responsible for ensuring that @pos remains accessible until
751 * the start of the next iteration by, for example, bumping the css refcnt. 754 * the start of the next iteration by, for example, bumping the css refcnt.
752 */ 755 */
753 #define css_for_each_child(pos, parent) \ 756 #define css_for_each_child(pos, parent) \
754 for ((pos) = css_next_child(NULL, (parent)); (pos); \ 757 for ((pos) = css_next_child(NULL, (parent)); (pos); \
755 (pos) = css_next_child((pos), (parent))) 758 (pos) = css_next_child((pos), (parent)))
756 759
757 struct cgroup_subsys_state * 760 struct cgroup_subsys_state *
758 css_next_descendant_pre(struct cgroup_subsys_state *pos, 761 css_next_descendant_pre(struct cgroup_subsys_state *pos,
759 struct cgroup_subsys_state *css); 762 struct cgroup_subsys_state *css);
760 763
761 struct cgroup_subsys_state * 764 struct cgroup_subsys_state *
762 css_rightmost_descendant(struct cgroup_subsys_state *pos); 765 css_rightmost_descendant(struct cgroup_subsys_state *pos);
763 766
764 /** 767 /**
765 * css_for_each_descendant_pre - pre-order walk of a css's descendants 768 * css_for_each_descendant_pre - pre-order walk of a css's descendants
766 * @pos: the css * to use as the loop cursor 769 * @pos: the css * to use as the loop cursor
767 * @root: css whose descendants to walk 770 * @root: css whose descendants to walk
768 * 771 *
769 * Walk @root's descendants. @root is included in the iteration and the 772 * Walk @root's descendants. @root is included in the iteration and the
770 * first node to be visited. Must be called under rcu_read_lock(). A 773 * first node to be visited. Must be called under rcu_read_lock(). A
771 * descendant css which hasn't finished ->css_online() or already has 774 * descendant css which hasn't finished ->css_online() or already has
772 * finished ->css_offline() may show up during traversal and it's each 775 * finished ->css_offline() may show up during traversal and it's each
773 * subsystem's responsibility to verify that each @pos is alive. 776 * subsystem's responsibility to verify that each @pos is alive.
774 * 777 *
775 * If a subsystem synchronizes against the parent in its ->css_online() and 778 * If a subsystem synchronizes against the parent in its ->css_online() and
776 * before starting iterating, and synchronizes against @pos on each 779 * before starting iterating, and synchronizes against @pos on each
777 * iteration, any descendant css which finished ->css_online() is 780 * iteration, any descendant css which finished ->css_online() is
778 * guaranteed to be visible in the future iterations. 781 * guaranteed to be visible in the future iterations.
779 * 782 *
780 * In other words, the following guarantees that a descendant can't escape 783 * In other words, the following guarantees that a descendant can't escape
781 * state updates of its ancestors. 784 * state updates of its ancestors.
782 * 785 *
783 * my_online(@css) 786 * my_online(@css)
784 * { 787 * {
785 * Lock @css's parent and @css; 788 * Lock @css's parent and @css;
786 * Inherit state from the parent; 789 * Inherit state from the parent;
787 * Unlock both. 790 * Unlock both.
788 * } 791 * }
789 * 792 *
790 * my_update_state(@css) 793 * my_update_state(@css)
791 * { 794 * {
792 * css_for_each_descendant_pre(@pos, @css) { 795 * css_for_each_descendant_pre(@pos, @css) {
793 * Lock @pos; 796 * Lock @pos;
794 * if (@pos == @css) 797 * if (@pos == @css)
795 * Update @css's state; 798 * Update @css's state;
796 * else 799 * else
797 * Verify @pos is alive and inherit state from its parent; 800 * Verify @pos is alive and inherit state from its parent;
798 * Unlock @pos; 801 * Unlock @pos;
799 * } 802 * }
800 * } 803 * }
801 * 804 *
802 * As long as the inheriting step, including checking the parent state, is 805 * As long as the inheriting step, including checking the parent state, is
803 * enclosed inside @pos locking, double-locking the parent isn't necessary 806 * enclosed inside @pos locking, double-locking the parent isn't necessary
804 * while inheriting. The state update to the parent is guaranteed to be 807 * while inheriting. The state update to the parent is guaranteed to be
805 * visible by walking order and, as long as inheriting operations to the 808 * visible by walking order and, as long as inheriting operations to the
806 * same @pos are atomic to each other, multiple updates racing each other 809 * same @pos are atomic to each other, multiple updates racing each other
807 * still result in the correct state. It's guaranteed that at least one 810 * still result in the correct state. It's guaranteed that at least one
808 * inheritance happens for any css after the latest update to its parent. 811 * inheritance happens for any css after the latest update to its parent.
809 * 812 *
810 * If checking parent's state requires locking the parent, each inheriting 813 * If checking parent's state requires locking the parent, each inheriting
811 * iteration should lock and unlock both @pos->parent and @pos. 814 * iteration should lock and unlock both @pos->parent and @pos.
812 * 815 *
813 * Alternatively, a subsystem may choose to use a single global lock to 816 * Alternatively, a subsystem may choose to use a single global lock to
814 * synchronize ->css_online() and ->css_offline() against tree-walking 817 * synchronize ->css_online() and ->css_offline() against tree-walking
815 * operations. 818 * operations.
816 * 819 *
817 * It is allowed to temporarily drop RCU read lock during iteration. The 820 * It is allowed to temporarily drop RCU read lock during iteration. The
818 * caller is responsible for ensuring that @pos remains accessible until 821 * caller is responsible for ensuring that @pos remains accessible until
819 * the start of the next iteration by, for example, bumping the css refcnt. 822 * the start of the next iteration by, for example, bumping the css refcnt.
820 */ 823 */
821 #define css_for_each_descendant_pre(pos, css) \ 824 #define css_for_each_descendant_pre(pos, css) \
822 for ((pos) = css_next_descendant_pre(NULL, (css)); (pos); \ 825 for ((pos) = css_next_descendant_pre(NULL, (css)); (pos); \
823 (pos) = css_next_descendant_pre((pos), (css))) 826 (pos) = css_next_descendant_pre((pos), (css)))
824 827
825 struct cgroup_subsys_state * 828 struct cgroup_subsys_state *
826 css_next_descendant_post(struct cgroup_subsys_state *pos, 829 css_next_descendant_post(struct cgroup_subsys_state *pos,
827 struct cgroup_subsys_state *css); 830 struct cgroup_subsys_state *css);
828 831
829 /** 832 /**
830 * css_for_each_descendant_post - post-order walk of a css's descendants 833 * css_for_each_descendant_post - post-order walk of a css's descendants
831 * @pos: the css * to use as the loop cursor 834 * @pos: the css * to use as the loop cursor
832 * @css: css whose descendants to walk 835 * @css: css whose descendants to walk
833 * 836 *
834 * Similar to css_for_each_descendant_pre() but performs post-order 837 * Similar to css_for_each_descendant_pre() but performs post-order
835 * traversal instead. @root is included in the iteration and the last 838 * traversal instead. @root is included in the iteration and the last
836 * node to be visited. Note that the walk visibility guarantee described 839 * node to be visited. Note that the walk visibility guarantee described
837 * in pre-order walk doesn't apply the same to post-order walks. 840 * in pre-order walk doesn't apply the same to post-order walks.
838 */ 841 */
839 #define css_for_each_descendant_post(pos, css) \ 842 #define css_for_each_descendant_post(pos, css) \
840 for ((pos) = css_next_descendant_post(NULL, (css)); (pos); \ 843 for ((pos) = css_next_descendant_post(NULL, (css)); (pos); \
841 (pos) = css_next_descendant_post((pos), (css))) 844 (pos) = css_next_descendant_post((pos), (css)))
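Conversely, a post-order walk visits children before their parents, which suits bottom-up teardown. A hedged sketch, with release_css_cache() standing in as a made-up per-css cleanup helper:

static void release_subtree_caches(struct cgroup_subsys_state *root_css)
{
	struct cgroup_subsys_state *pos;

	rcu_read_lock();
	css_for_each_descendant_post(pos, root_css)
		release_css_cache(pos);	/* hypothetical helper */
	rcu_read_unlock();
}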
842 845
843 /* A css_task_iter should be treated as an opaque object */ 846 /* A css_task_iter should be treated as an opaque object */
844 struct css_task_iter { 847 struct css_task_iter {
845 struct cgroup_subsys *ss; 848 struct cgroup_subsys *ss;
846 849
847 struct list_head *cset_pos; 850 struct list_head *cset_pos;
848 struct list_head *cset_head; 851 struct list_head *cset_head;
849 852
850 struct list_head *task_pos; 853 struct list_head *task_pos;
851 struct list_head *tasks_head; 854 struct list_head *tasks_head;
852 struct list_head *mg_tasks_head; 855 struct list_head *mg_tasks_head;
853 }; 856 };
854 857
855 void css_task_iter_start(struct cgroup_subsys_state *css, 858 void css_task_iter_start(struct cgroup_subsys_state *css,
856 struct css_task_iter *it); 859 struct css_task_iter *it);
857 struct task_struct *css_task_iter_next(struct css_task_iter *it); 860 struct task_struct *css_task_iter_next(struct css_task_iter *it);
858 void css_task_iter_end(struct css_task_iter *it); 861 void css_task_iter_end(struct css_task_iter *it);
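The three declarations above form a start/next/end protocol. A sketch (not from this commit) that counts the tasks attached to a css; the function name is invented here:

static int count_css_tasks(struct cgroup_subsys_state *css)
{
	struct css_task_iter it;
	struct task_struct *task;
	int count = 0;

	css_task_iter_start(css, &it);
	while ((task = css_task_iter_next(&it)))
		count++;
	css_task_iter_end(&it);

	return count;
}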
859 862
860 int cgroup_attach_task_all(struct task_struct *from, struct task_struct *); 863 int cgroup_attach_task_all(struct task_struct *from, struct task_struct *);
861 int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from); 864 int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from);
862 865
863 struct cgroup_subsys_state *css_tryget_from_dir(struct dentry *dentry, 866 struct cgroup_subsys_state *css_tryget_from_dir(struct dentry *dentry,
864 struct cgroup_subsys *ss); 867 struct cgroup_subsys *ss);
865 868
866 #else /* !CONFIG_CGROUPS */ 869 #else /* !CONFIG_CGROUPS */
867 870
868 static inline int cgroup_init_early(void) { return 0; } 871 static inline int cgroup_init_early(void) { return 0; }
869 static inline int cgroup_init(void) { return 0; } 872 static inline int cgroup_init(void) { return 0; }
870 static inline void cgroup_fork(struct task_struct *p) {} 873 static inline void cgroup_fork(struct task_struct *p) {}
871 static inline void cgroup_post_fork(struct task_struct *p) {} 874 static inline void cgroup_post_fork(struct task_struct *p) {}
872 static inline void cgroup_exit(struct task_struct *p) {} 875 static inline void cgroup_exit(struct task_struct *p) {}
873 876
874 static inline int cgroupstats_build(struct cgroupstats *stats, 877 static inline int cgroupstats_build(struct cgroupstats *stats,
875 struct dentry *dentry) 878 struct dentry *dentry)
876 { 879 {
877 return -EINVAL; 880 return -EINVAL;
878 } 881 }
879 882
880 /* No cgroups - nothing to do */ 883 /* No cgroups - nothing to do */
881 static inline int cgroup_attach_task_all(struct task_struct *from, 884 static inline int cgroup_attach_task_all(struct task_struct *from,
882 struct task_struct *t) 885 struct task_struct *t)
883 { 886 {
884 return 0; 887 return 0;
885 } 888 }
886 889
887 #endif /* !CONFIG_CGROUPS */ 890 #endif /* !CONFIG_CGROUPS */
888 891
889 #endif /* _LINUX_CGROUP_H */ 892 #endif /* _LINUX_CGROUP_H */
890 893
1 /* 1 /*
2 * Generic process-grouping system. 2 * Generic process-grouping system.
3 * 3 *
4 * Based originally on the cpuset system, extracted by Paul Menage 4 * Based originally on the cpuset system, extracted by Paul Menage
5 * Copyright (C) 2006 Google, Inc 5 * Copyright (C) 2006 Google, Inc
6 * 6 *
7 * Notifications support 7 * Notifications support
8 * Copyright (C) 2009 Nokia Corporation 8 * Copyright (C) 2009 Nokia Corporation
9 * Author: Kirill A. Shutemov 9 * Author: Kirill A. Shutemov
10 * 10 *
11 * Copyright notices from the original cpuset code: 11 * Copyright notices from the original cpuset code:
12 * -------------------------------------------------- 12 * --------------------------------------------------
13 * Copyright (C) 2003 BULL SA. 13 * Copyright (C) 2003 BULL SA.
14 * Copyright (C) 2004-2006 Silicon Graphics, Inc. 14 * Copyright (C) 2004-2006 Silicon Graphics, Inc.
15 * 15 *
16 * Portions derived from Patrick Mochel's sysfs code. 16 * Portions derived from Patrick Mochel's sysfs code.
17 * sysfs is Copyright (c) 2001-3 Patrick Mochel 17 * sysfs is Copyright (c) 2001-3 Patrick Mochel
18 * 18 *
19 * 2003-10-10 Written by Simon Derr. 19 * 2003-10-10 Written by Simon Derr.
20 * 2003-10-22 Updates by Stephen Hemminger. 20 * 2003-10-22 Updates by Stephen Hemminger.
21 * 2004 May-July Rework by Paul Jackson. 21 * 2004 May-July Rework by Paul Jackson.
22 * --------------------------------------------------- 22 * ---------------------------------------------------
23 * 23 *
24 * This file is subject to the terms and conditions of the GNU General Public 24 * This file is subject to the terms and conditions of the GNU General Public
25 * License. See the file COPYING in the main directory of the Linux 25 * License. See the file COPYING in the main directory of the Linux
26 * distribution for more details. 26 * distribution for more details.
27 */ 27 */
28 28
29 #include <linux/cgroup.h> 29 #include <linux/cgroup.h>
30 #include <linux/cred.h> 30 #include <linux/cred.h>
31 #include <linux/ctype.h> 31 #include <linux/ctype.h>
32 #include <linux/errno.h> 32 #include <linux/errno.h>
33 #include <linux/init_task.h> 33 #include <linux/init_task.h>
34 #include <linux/kernel.h> 34 #include <linux/kernel.h>
35 #include <linux/list.h> 35 #include <linux/list.h>
36 #include <linux/mm.h> 36 #include <linux/mm.h>
37 #include <linux/mutex.h> 37 #include <linux/mutex.h>
38 #include <linux/mount.h> 38 #include <linux/mount.h>
39 #include <linux/pagemap.h> 39 #include <linux/pagemap.h>
40 #include <linux/proc_fs.h> 40 #include <linux/proc_fs.h>
41 #include <linux/rcupdate.h> 41 #include <linux/rcupdate.h>
42 #include <linux/sched.h> 42 #include <linux/sched.h>
43 #include <linux/slab.h> 43 #include <linux/slab.h>
44 #include <linux/spinlock.h> 44 #include <linux/spinlock.h>
45 #include <linux/rwsem.h> 45 #include <linux/rwsem.h>
46 #include <linux/string.h> 46 #include <linux/string.h>
47 #include <linux/sort.h> 47 #include <linux/sort.h>
48 #include <linux/kmod.h> 48 #include <linux/kmod.h>
49 #include <linux/delayacct.h> 49 #include <linux/delayacct.h>
50 #include <linux/cgroupstats.h> 50 #include <linux/cgroupstats.h>
51 #include <linux/hashtable.h> 51 #include <linux/hashtable.h>
52 #include <linux/pid_namespace.h> 52 #include <linux/pid_namespace.h>
53 #include <linux/idr.h> 53 #include <linux/idr.h>
54 #include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */ 54 #include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */
55 #include <linux/kthread.h> 55 #include <linux/kthread.h>
56 #include <linux/delay.h> 56 #include <linux/delay.h>
57 57
58 #include <linux/atomic.h> 58 #include <linux/atomic.h>
59 59
60 /* 60 /*
61 * pidlists linger the following amount before being destroyed. The goal 61 * pidlists linger the following amount before being destroyed. The goal
62 * is avoiding frequent destruction in the middle of consecutive read calls 62 * is avoiding frequent destruction in the middle of consecutive read calls
63 * Expiring in the middle is a performance problem not a correctness one. 63 * Expiring in the middle is a performance problem not a correctness one.
64 * 1 sec should be enough. 64 * 1 sec should be enough.
65 */ 65 */
66 #define CGROUP_PIDLIST_DESTROY_DELAY HZ 66 #define CGROUP_PIDLIST_DESTROY_DELAY HZ
67 67
68 #define CGROUP_FILE_NAME_MAX (MAX_CGROUP_TYPE_NAMELEN + \ 68 #define CGROUP_FILE_NAME_MAX (MAX_CGROUP_TYPE_NAMELEN + \
69 MAX_CFTYPE_NAME + 2) 69 MAX_CFTYPE_NAME + 2)
70 70
71 /* 71 /*
72 * cgroup_tree_mutex nests above cgroup_mutex and protects cftypes, file 72 * cgroup_tree_mutex nests above cgroup_mutex and protects cftypes, file
73 * creation/removal and hierarchy changing operations including cgroup 73 * creation/removal and hierarchy changing operations including cgroup
74 * creation, removal, css association and controller rebinding. This outer 74 * creation, removal, css association and controller rebinding. This outer
75 * lock is needed mainly to resolve the circular dependency between kernfs 75 * lock is needed mainly to resolve the circular dependency between kernfs
76 * active ref and cgroup_mutex. cgroup_tree_mutex nests above both. 76 * active ref and cgroup_mutex. cgroup_tree_mutex nests above both.
77 */ 77 */
78 static DEFINE_MUTEX(cgroup_tree_mutex); 78 static DEFINE_MUTEX(cgroup_tree_mutex);
79 79
80 /* 80 /*
81 * cgroup_mutex is the master lock. Any modification to cgroup or its 81 * cgroup_mutex is the master lock. Any modification to cgroup or its
82 * hierarchy must be performed while holding it. 82 * hierarchy must be performed while holding it.
83 * 83 *
84 * css_set_rwsem protects task->cgroups pointer, the list of css_set 84 * css_set_rwsem protects task->cgroups pointer, the list of css_set
85 * objects, and the chain of tasks off each css_set. 85 * objects, and the chain of tasks off each css_set.
86 * 86 *
87 * These locks are exported if CONFIG_PROVE_RCU so that accessors in 87 * These locks are exported if CONFIG_PROVE_RCU so that accessors in
88 * cgroup.h can use them for lockdep annotations. 88 * cgroup.h can use them for lockdep annotations.
89 */ 89 */
90 #ifdef CONFIG_PROVE_RCU 90 #ifdef CONFIG_PROVE_RCU
91 DEFINE_MUTEX(cgroup_mutex); 91 DEFINE_MUTEX(cgroup_mutex);
92 DECLARE_RWSEM(css_set_rwsem); 92 DECLARE_RWSEM(css_set_rwsem);
93 EXPORT_SYMBOL_GPL(cgroup_mutex); 93 EXPORT_SYMBOL_GPL(cgroup_mutex);
94 EXPORT_SYMBOL_GPL(css_set_rwsem); 94 EXPORT_SYMBOL_GPL(css_set_rwsem);
95 #else 95 #else
96 static DEFINE_MUTEX(cgroup_mutex); 96 static DEFINE_MUTEX(cgroup_mutex);
97 static DECLARE_RWSEM(css_set_rwsem); 97 static DECLARE_RWSEM(css_set_rwsem);
98 #endif 98 #endif
99 99
100 /* 100 /*
101 * Protects cgroup_subsys->release_agent_path. Modifying it also requires 101 * Protects cgroup_subsys->release_agent_path. Modifying it also requires
102 * cgroup_mutex. Reading requires either cgroup_mutex or this spinlock. 102 * cgroup_mutex. Reading requires either cgroup_mutex or this spinlock.
103 */ 103 */
104 static DEFINE_SPINLOCK(release_agent_path_lock); 104 static DEFINE_SPINLOCK(release_agent_path_lock);
105 105
106 #define cgroup_assert_mutexes_or_rcu_locked() \ 106 #define cgroup_assert_mutexes_or_rcu_locked() \
107 rcu_lockdep_assert(rcu_read_lock_held() || \ 107 rcu_lockdep_assert(rcu_read_lock_held() || \
108 lockdep_is_held(&cgroup_tree_mutex) || \ 108 lockdep_is_held(&cgroup_tree_mutex) || \
109 lockdep_is_held(&cgroup_mutex), \ 109 lockdep_is_held(&cgroup_mutex), \
110 "cgroup_[tree_]mutex or RCU read lock required"); 110 "cgroup_[tree_]mutex or RCU read lock required");
111 111
112 /* 112 /*
113 * cgroup destruction makes heavy use of work items and there can be a lot 113 * cgroup destruction makes heavy use of work items and there can be a lot
114 * of concurrent destructions. Use a separate workqueue so that cgroup 114 * of concurrent destructions. Use a separate workqueue so that cgroup
115 * destruction work items don't end up filling up max_active of system_wq 115 * destruction work items don't end up filling up max_active of system_wq
116 * which may lead to deadlock. 116 * which may lead to deadlock.
117 */ 117 */
118 static struct workqueue_struct *cgroup_destroy_wq; 118 static struct workqueue_struct *cgroup_destroy_wq;
119 119
120 /* 120 /*
121 * pidlist destructions need to be flushed on cgroup destruction. Use a 121 * pidlist destructions need to be flushed on cgroup destruction. Use a
122 * separate workqueue as flush domain. 122 * separate workqueue as flush domain.
123 */ 123 */
124 static struct workqueue_struct *cgroup_pidlist_destroy_wq; 124 static struct workqueue_struct *cgroup_pidlist_destroy_wq;
125 125
126 /* generate an array of cgroup subsystem pointers */ 126 /* generate an array of cgroup subsystem pointers */
127 #define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys, 127 #define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys,
128 static struct cgroup_subsys *cgroup_subsys[] = { 128 static struct cgroup_subsys *cgroup_subsys[] = {
129 #include <linux/cgroup_subsys.h> 129 #include <linux/cgroup_subsys.h>
130 }; 130 };
131 #undef SUBSYS 131 #undef SUBSYS
132 132
133 /* array of cgroup subsystem names */ 133 /* array of cgroup subsystem names */
134 #define SUBSYS(_x) [_x ## _cgrp_id] = #_x, 134 #define SUBSYS(_x) [_x ## _cgrp_id] = #_x,
135 static const char *cgroup_subsys_name[] = { 135 static const char *cgroup_subsys_name[] = {
136 #include <linux/cgroup_subsys.h> 136 #include <linux/cgroup_subsys.h>
137 }; 137 };
138 #undef SUBSYS 138 #undef SUBSYS
139 139
140 /* 140 /*
141 * The default hierarchy, reserved for the subsystems that are otherwise 141 * The default hierarchy, reserved for the subsystems that are otherwise
142 * unattached - it never has more than a single cgroup, and all tasks are 142 * unattached - it never has more than a single cgroup, and all tasks are
143 * part of that cgroup. 143 * part of that cgroup.
144 */ 144 */
145 struct cgroup_root cgrp_dfl_root; 145 struct cgroup_root cgrp_dfl_root;
146 146
147 /* 147 /*
148 * The default hierarchy always exists but is hidden until mounted for the 148 * The default hierarchy always exists but is hidden until mounted for the
149 * first time. This is for backward compatibility. 149 * first time. This is for backward compatibility.
150 */ 150 */
151 static bool cgrp_dfl_root_visible; 151 static bool cgrp_dfl_root_visible;
152 152
153 /* The list of hierarchy roots */ 153 /* The list of hierarchy roots */
154 154
155 static LIST_HEAD(cgroup_roots); 155 static LIST_HEAD(cgroup_roots);
156 static int cgroup_root_count; 156 static int cgroup_root_count;
157 157
158 /* hierarchy ID allocation and mapping, protected by cgroup_mutex */ 158 /* hierarchy ID allocation and mapping, protected by cgroup_mutex */
159 static DEFINE_IDR(cgroup_hierarchy_idr); 159 static DEFINE_IDR(cgroup_hierarchy_idr);
160 160
161 /* 161 /*
162 * Assign a monotonically increasing serial number to cgroups. It 162 * Assign a monotonically increasing serial number to cgroups. It
163 * guarantees cgroups with bigger numbers are newer than those with smaller 163 * guarantees cgroups with bigger numbers are newer than those with smaller
164 * numbers. Also, as cgroups are always appended to the parent's 164 * numbers. Also, as cgroups are always appended to the parent's
165 * ->children list, it guarantees that sibling cgroups are always sorted in 165 * ->children list, it guarantees that sibling cgroups are always sorted in
166 * the ascending serial number order on the list. Protected by 166 * the ascending serial number order on the list. Protected by
167 * cgroup_mutex. 167 * cgroup_mutex.
168 */ 168 */
169 static u64 cgroup_serial_nr_next = 1; 169 static u64 cgroup_serial_nr_next = 1;
170 170
171 /* This flag indicates whether tasks in the fork and exit paths should 171 /* This flag indicates whether tasks in the fork and exit paths should
172 * check for fork/exit handlers to call. This avoids us having to do 172 * check for fork/exit handlers to call. This avoids us having to do
173 * extra work in the fork/exit path if none of the subsystems need to 173 * extra work in the fork/exit path if none of the subsystems need to
174 * be called. 174 * be called.
175 */ 175 */
176 static int need_forkexit_callback __read_mostly; 176 static int need_forkexit_callback __read_mostly;
177 177
178 static struct cftype cgroup_base_files[]; 178 static struct cftype cgroup_base_files[];
179 179
180 static void cgroup_put(struct cgroup *cgrp); 180 static void cgroup_put(struct cgroup *cgrp);
181 static int rebind_subsystems(struct cgroup_root *dst_root, 181 static int rebind_subsystems(struct cgroup_root *dst_root,
182 unsigned long ss_mask); 182 unsigned long ss_mask);
183 static void cgroup_destroy_css_killed(struct cgroup *cgrp); 183 static void cgroup_destroy_css_killed(struct cgroup *cgrp);
184 static int cgroup_destroy_locked(struct cgroup *cgrp); 184 static int cgroup_destroy_locked(struct cgroup *cgrp);
185 static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[], 185 static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[],
186 bool is_add); 186 bool is_add);
187 static void cgroup_pidlist_destroy_all(struct cgroup *cgrp); 187 static void cgroup_pidlist_destroy_all(struct cgroup *cgrp);
188 188
189 /** 189 /**
190 * cgroup_css - obtain a cgroup's css for the specified subsystem 190 * cgroup_css - obtain a cgroup's css for the specified subsystem
191 * @cgrp: the cgroup of interest 191 * @cgrp: the cgroup of interest
192 * @ss: the subsystem of interest (%NULL returns the dummy_css) 192 * @ss: the subsystem of interest (%NULL returns the dummy_css)
193 * 193 *
194 * Return @cgrp's css (cgroup_subsys_state) associated with @ss. This 194 * Return @cgrp's css (cgroup_subsys_state) associated with @ss. This
195 * function must be called either under cgroup_mutex or rcu_read_lock() and 195 * function must be called either under cgroup_mutex or rcu_read_lock() and
196 * the caller is responsible for pinning the returned css if it wants to 196 * the caller is responsible for pinning the returned css if it wants to
197 * keep accessing it outside the said locks. This function may return 197 * keep accessing it outside the said locks. This function may return
198 * %NULL if @cgrp doesn't have @ss enabled. 198 * %NULL if @cgrp doesn't have @ss enabled.
199 */ 199 */
200 static struct cgroup_subsys_state *cgroup_css(struct cgroup *cgrp, 200 static struct cgroup_subsys_state *cgroup_css(struct cgroup *cgrp,
201 struct cgroup_subsys *ss) 201 struct cgroup_subsys *ss)
202 { 202 {
203 if (ss) 203 if (ss)
204 return rcu_dereference_check(cgrp->subsys[ss->id], 204 return rcu_dereference_check(cgrp->subsys[ss->id],
205 lockdep_is_held(&cgroup_tree_mutex) || 205 lockdep_is_held(&cgroup_tree_mutex) ||
206 lockdep_is_held(&cgroup_mutex)); 206 lockdep_is_held(&cgroup_mutex));
207 else 207 else
208 return &cgrp->dummy_css; 208 return &cgrp->dummy_css;
209 } 209 }
210 210
211 /** 211 /**
212 * cgroup_e_css - obtain a cgroup's effective css for the specified subsystem 212 * cgroup_e_css - obtain a cgroup's effective css for the specified subsystem
213 * @cgrp: the cgroup of interest 213 * @cgrp: the cgroup of interest
214 * @ss: the subsystem of interest (%NULL returns the dummy_css) 214 * @ss: the subsystem of interest (%NULL returns the dummy_css)
215 * 215 *
216 * Similar to cgroup_css() but returns the effective css, which is defined 216 * Similar to cgroup_css() but returns the effective css, which is defined
217 * as the matching css of the nearest ancestor including self which has @ss 217 * as the matching css of the nearest ancestor including self which has @ss
218 * enabled. If @ss is associated with the hierarchy @cgrp is on, this 218 * enabled. If @ss is associated with the hierarchy @cgrp is on, this
219 * function is guaranteed to return non-NULL css. 219 * function is guaranteed to return non-NULL css.
220 */ 220 */
221 static struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgrp, 221 static struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgrp,
222 struct cgroup_subsys *ss) 222 struct cgroup_subsys *ss)
223 { 223 {
224 lockdep_assert_held(&cgroup_mutex); 224 lockdep_assert_held(&cgroup_mutex);
225 225
226 if (!ss) 226 if (!ss)
227 return &cgrp->dummy_css; 227 return &cgrp->dummy_css;
228 228
229 if (!(cgrp->root->subsys_mask & (1 << ss->id))) 229 if (!(cgrp->root->subsys_mask & (1 << ss->id)))
230 return NULL; 230 return NULL;
231 231
232 while (cgrp->parent && 232 while (cgrp->parent &&
233 !(cgrp->parent->child_subsys_mask & (1 << ss->id))) 233 !(cgrp->parent->child_subsys_mask & (1 << ss->id)))
234 cgrp = cgrp->parent; 234 cgrp = cgrp->parent;
235 235
236 return cgroup_css(cgrp, ss); 236 return cgroup_css(cgrp, ss);
237 } 237 }
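To make the distinction concrete, a small sketch (not in this diff): cgroup_css() is NULL when @ss isn't attached to @cgrp itself, while cgroup_e_css() falls back to the nearest ancestor with @ss enabled. The wrapper name is invented.

static bool __maybe_unused css_is_inherited(struct cgroup *cgrp,
					    struct cgroup_subsys *ss)
{
	lockdep_assert_held(&cgroup_mutex);

	/* no local css, but an ancestor provides the effective one */
	return !cgroup_css(cgrp, ss) && cgroup_e_css(cgrp, ss);
}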
238 238
239 /* convenient tests for these bits */ 239 /* convenient tests for these bits */
240 static inline bool cgroup_is_dead(const struct cgroup *cgrp) 240 static inline bool cgroup_is_dead(const struct cgroup *cgrp)
241 { 241 {
242 return test_bit(CGRP_DEAD, &cgrp->flags); 242 return test_bit(CGRP_DEAD, &cgrp->flags);
243 } 243 }
244 244
245 struct cgroup_subsys_state *seq_css(struct seq_file *seq) 245 struct cgroup_subsys_state *seq_css(struct seq_file *seq)
246 { 246 {
247 struct kernfs_open_file *of = seq->private; 247 struct kernfs_open_file *of = seq->private;
248 struct cgroup *cgrp = of->kn->parent->priv; 248 struct cgroup *cgrp = of->kn->parent->priv;
249 struct cftype *cft = seq_cft(seq); 249 struct cftype *cft = seq_cft(seq);
250 250
251 /* 251 /*
252 * This is an open and unprotected implementation of cgroup_css(). 252 * This is an open and unprotected implementation of cgroup_css().
253 * seq_css() is only called from a kernfs file operation which has 253 * seq_css() is only called from a kernfs file operation which has
254 * an active reference on the file. Because all the subsystem 254 * an active reference on the file. Because all the subsystem
255 * files are drained before a css is disassociated with a cgroup, 255 * files are drained before a css is disassociated with a cgroup,
256 * the matching css from the cgroup's subsys table is guaranteed to 256 * the matching css from the cgroup's subsys table is guaranteed to
257 * be and stay valid until the enclosing operation is complete. 257 * be and stay valid until the enclosing operation is complete.
258 */ 258 */
259 if (cft->ss) 259 if (cft->ss)
260 return rcu_dereference_raw(cgrp->subsys[cft->ss->id]); 260 return rcu_dereference_raw(cgrp->subsys[cft->ss->id]);
261 else 261 else
262 return &cgrp->dummy_css; 262 return &cgrp->dummy_css;
263 } 263 }
264 EXPORT_SYMBOL_GPL(seq_css); 264 EXPORT_SYMBOL_GPL(seq_css);
265 265
266 /** 266 /**
267 * cgroup_is_descendant - test ancestry 267 * cgroup_is_descendant - test ancestry
268 * @cgrp: the cgroup to be tested 268 * @cgrp: the cgroup to be tested
269 * @ancestor: possible ancestor of @cgrp 269 * @ancestor: possible ancestor of @cgrp
270 * 270 *
271 * Test whether @cgrp is a descendant of @ancestor. It also returns %true 271 * Test whether @cgrp is a descendant of @ancestor. It also returns %true
272 * if @cgrp == @ancestor. This function is safe to call as long as @cgrp 272 * if @cgrp == @ancestor. This function is safe to call as long as @cgrp
273 * and @ancestor are accessible. 273 * and @ancestor are accessible.
274 */ 274 */
275 bool cgroup_is_descendant(struct cgroup *cgrp, struct cgroup *ancestor) 275 bool cgroup_is_descendant(struct cgroup *cgrp, struct cgroup *ancestor)
276 { 276 {
277 while (cgrp) { 277 while (cgrp) {
278 if (cgrp == ancestor) 278 if (cgrp == ancestor)
279 return true; 279 return true;
280 cgrp = cgrp->parent; 280 cgrp = cgrp->parent;
281 } 281 }
282 return false; 282 return false;
283 } 283 }
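A one-line wrapper sketch showing the typical call pattern, e.g. checking whether one css sits inside another's subtree; css_under() is a made-up name, not an existing kernel helper.

static bool __maybe_unused css_under(struct cgroup_subsys_state *css,
				     struct cgroup_subsys_state *root)
{
	return cgroup_is_descendant(css->cgroup, root->cgroup);
}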
284 284
285 static int cgroup_is_releasable(const struct cgroup *cgrp) 285 static int cgroup_is_releasable(const struct cgroup *cgrp)
286 { 286 {
287 const int bits = 287 const int bits =
288 (1 << CGRP_RELEASABLE) | 288 (1 << CGRP_RELEASABLE) |
289 (1 << CGRP_NOTIFY_ON_RELEASE); 289 (1 << CGRP_NOTIFY_ON_RELEASE);
290 return (cgrp->flags & bits) == bits; 290 return (cgrp->flags & bits) == bits;
291 } 291 }
292 292
293 static int notify_on_release(const struct cgroup *cgrp) 293 static int notify_on_release(const struct cgroup *cgrp)
294 { 294 {
295 return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); 295 return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
296 } 296 }
297 297
298 /** 298 /**
299 * for_each_css - iterate all css's of a cgroup 299 * for_each_css - iterate all css's of a cgroup
300 * @css: the iteration cursor 300 * @css: the iteration cursor
301 * @ssid: the index of the subsystem, CGROUP_SUBSYS_COUNT after reaching the end 301 * @ssid: the index of the subsystem, CGROUP_SUBSYS_COUNT after reaching the end
302 * @cgrp: the target cgroup to iterate css's of 302 * @cgrp: the target cgroup to iterate css's of
303 * 303 *
304 * Should be called under cgroup_[tree_]mutex. 304 * Should be called under cgroup_[tree_]mutex.
305 */ 305 */
306 #define for_each_css(css, ssid, cgrp) \ 306 #define for_each_css(css, ssid, cgrp) \
307 for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++) \ 307 for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++) \
308 if (!((css) = rcu_dereference_check( \ 308 if (!((css) = rcu_dereference_check( \
309 (cgrp)->subsys[(ssid)], \ 309 (cgrp)->subsys[(ssid)], \
310 lockdep_is_held(&cgroup_tree_mutex) || \ 310 lockdep_is_held(&cgroup_tree_mutex) || \
311 lockdep_is_held(&cgroup_mutex)))) { } \ 311 lockdep_is_held(&cgroup_mutex)))) { } \
312 else 312 else
313 313
314 /** 314 /**
315 * for_each_e_css - iterate all effective css's of a cgroup 315 * for_each_e_css - iterate all effective css's of a cgroup
316 * @css: the iteration cursor 316 * @css: the iteration cursor
317 * @ssid: the index of the subsystem, CGROUP_SUBSYS_COUNT after reaching the end 317 * @ssid: the index of the subsystem, CGROUP_SUBSYS_COUNT after reaching the end
318 * @cgrp: the target cgroup to iterate css's of 318 * @cgrp: the target cgroup to iterate css's of
319 * 319 *
320 * Should be called under cgroup_[tree_]mutex. 320 * Should be called under cgroup_[tree_]mutex.
321 */ 321 */
322 #define for_each_e_css(css, ssid, cgrp) \ 322 #define for_each_e_css(css, ssid, cgrp) \
323 for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++) \ 323 for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++) \
324 if (!((css) = cgroup_e_css(cgrp, cgroup_subsys[(ssid)]))) \ 324 if (!((css) = cgroup_e_css(cgrp, cgroup_subsys[(ssid)]))) \
325 ; \ 325 ; \
326 else 326 else
327 327
328 /** 328 /**
329 * for_each_subsys - iterate all enabled cgroup subsystems 329 * for_each_subsys - iterate all enabled cgroup subsystems
330 * @ss: the iteration cursor 330 * @ss: the iteration cursor
331 * @ssid: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end 331 * @ssid: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end
332 */ 332 */
333 #define for_each_subsys(ss, ssid) \ 333 #define for_each_subsys(ss, ssid) \
334 for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT && \ 334 for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT && \
335 (((ss) = cgroup_subsys[ssid]) || true); (ssid)++) 335 (((ss) = cgroup_subsys[ssid]) || true); (ssid)++)
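A hedged debugging sketch using the iterator together with the cgroup_subsys_name[] table defined above; print_subsys_names() is not a real kernel function.

static void __maybe_unused print_subsys_names(void)
{
	struct cgroup_subsys *ss;
	int ssid;

	for_each_subsys(ss, ssid)
		pr_info("cgroup: subsys %s has id %d\n",
			cgroup_subsys_name[ssid], ssid);
}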
336 336
337 /* iterate across the hierarchies */ 337 /* iterate across the hierarchies */
338 #define for_each_root(root) \ 338 #define for_each_root(root) \
339 list_for_each_entry((root), &cgroup_roots, root_list) 339 list_for_each_entry((root), &cgroup_roots, root_list)
340 340
341 /** 341 /**
342 * cgroup_lock_live_group - take cgroup_mutex and check that cgrp is alive. 342 * cgroup_lock_live_group - take cgroup_mutex and check that cgrp is alive.
343 * @cgrp: the cgroup to be checked for liveness 343 * @cgrp: the cgroup to be checked for liveness
344 * 344 *
345 * On success, returns true; the mutex should be later unlocked. On 345 * On success, returns true; the mutex should be later unlocked. On
346 * failure returns false with no lock held. 346 * failure returns false with no lock held.
347 */ 347 */
348 static bool cgroup_lock_live_group(struct cgroup *cgrp) 348 static bool cgroup_lock_live_group(struct cgroup *cgrp)
349 { 349 {
350 mutex_lock(&cgroup_mutex); 350 mutex_lock(&cgroup_mutex);
351 if (cgroup_is_dead(cgrp)) { 351 if (cgroup_is_dead(cgrp)) {
352 mutex_unlock(&cgroup_mutex); 352 mutex_unlock(&cgroup_mutex);
353 return false; 353 return false;
354 } 354 }
355 return true; 355 return true;
356 } 356 }
357 357
358 /* the list of cgroups eligible for automatic release. Protected by 358 /* the list of cgroups eligible for automatic release. Protected by
359 * release_list_lock */ 359 * release_list_lock */
360 static LIST_HEAD(release_list); 360 static LIST_HEAD(release_list);
361 static DEFINE_RAW_SPINLOCK(release_list_lock); 361 static DEFINE_RAW_SPINLOCK(release_list_lock);
362 static void cgroup_release_agent(struct work_struct *work); 362 static void cgroup_release_agent(struct work_struct *work);
363 static DECLARE_WORK(release_agent_work, cgroup_release_agent); 363 static DECLARE_WORK(release_agent_work, cgroup_release_agent);
364 static void check_for_release(struct cgroup *cgrp); 364 static void check_for_release(struct cgroup *cgrp);
365 365
366 /* 366 /*
367 * A cgroup can be associated with multiple css_sets as different tasks may 367 * A cgroup can be associated with multiple css_sets as different tasks may
368 * belong to different cgroups on different hierarchies. In the other 368 * belong to different cgroups on different hierarchies. In the other
369 * direction, a css_set is naturally associated with multiple cgroups. 369 * direction, a css_set is naturally associated with multiple cgroups.
370 * This M:N relationship is represented by the following link structure 370 * This M:N relationship is represented by the following link structure
371 * which exists for each association and allows traversing the associations 371 * which exists for each association and allows traversing the associations
372 * from both sides. 372 * from both sides.
373 */ 373 */
374 struct cgrp_cset_link { 374 struct cgrp_cset_link {
375 /* the cgroup and css_set this link associates */ 375 /* the cgroup and css_set this link associates */
376 struct cgroup *cgrp; 376 struct cgroup *cgrp;
377 struct css_set *cset; 377 struct css_set *cset;
378 378
379 /* list of cgrp_cset_links anchored at cgrp->cset_links */ 379 /* list of cgrp_cset_links anchored at cgrp->cset_links */
380 struct list_head cset_link; 380 struct list_head cset_link;
381 381
382 /* list of cgrp_cset_links anchored at css_set->cgrp_links */ 382 /* list of cgrp_cset_links anchored at css_set->cgrp_links */
383 struct list_head cgrp_link; 383 struct list_head cgrp_link;
384 }; 384 };
385 385
386 /* 386 /*
387 * The default css_set - used by init and its children prior to any 387 * The default css_set - used by init and its children prior to any
388 * hierarchies being mounted. It contains a pointer to the root state 388 * hierarchies being mounted. It contains a pointer to the root state
389 * for each subsystem. Also used to anchor the list of css_sets. Not 389 * for each subsystem. Also used to anchor the list of css_sets. Not
390 * reference-counted, to improve performance when child cgroups 390 * reference-counted, to improve performance when child cgroups
391 * haven't been created. 391 * haven't been created.
392 */ 392 */
393 static struct css_set init_css_set = { 393 static struct css_set init_css_set = {
394 .refcount = ATOMIC_INIT(1), 394 .refcount = ATOMIC_INIT(1),
395 .cgrp_links = LIST_HEAD_INIT(init_css_set.cgrp_links), 395 .cgrp_links = LIST_HEAD_INIT(init_css_set.cgrp_links),
396 .tasks = LIST_HEAD_INIT(init_css_set.tasks), 396 .tasks = LIST_HEAD_INIT(init_css_set.tasks),
397 .mg_tasks = LIST_HEAD_INIT(init_css_set.mg_tasks), 397 .mg_tasks = LIST_HEAD_INIT(init_css_set.mg_tasks),
398 .mg_preload_node = LIST_HEAD_INIT(init_css_set.mg_preload_node), 398 .mg_preload_node = LIST_HEAD_INIT(init_css_set.mg_preload_node),
399 .mg_node = LIST_HEAD_INIT(init_css_set.mg_node), 399 .mg_node = LIST_HEAD_INIT(init_css_set.mg_node),
400 }; 400 };
401 401
402 static int css_set_count = 1; /* 1 for init_css_set */ 402 static int css_set_count = 1; /* 1 for init_css_set */
403 403
404 /* 404 /*
405 * hash table for css_sets. This improves the performance of finding 405 * hash table for css_sets. This improves the performance of finding
406 * an existing css_set. This hash doesn't (currently) take into 406 * an existing css_set. This hash doesn't (currently) take into
407 * account cgroups in empty hierarchies. 407 * account cgroups in empty hierarchies.
408 */ 408 */
409 #define CSS_SET_HASH_BITS 7 409 #define CSS_SET_HASH_BITS 7
410 static DEFINE_HASHTABLE(css_set_table, CSS_SET_HASH_BITS); 410 static DEFINE_HASHTABLE(css_set_table, CSS_SET_HASH_BITS);
411 411
412 static unsigned long css_set_hash(struct cgroup_subsys_state *css[]) 412 static unsigned long css_set_hash(struct cgroup_subsys_state *css[])
413 { 413 {
414 unsigned long key = 0UL; 414 unsigned long key = 0UL;
415 struct cgroup_subsys *ss; 415 struct cgroup_subsys *ss;
416 int i; 416 int i;
417 417
418 for_each_subsys(ss, i) 418 for_each_subsys(ss, i)
419 key += (unsigned long)css[i]; 419 key += (unsigned long)css[i];
420 key = (key >> 16) ^ key; 420 key = (key >> 16) ^ key;
421 421
422 return key; 422 return key;
423 } 423 }
424 424
425 static void put_css_set_locked(struct css_set *cset, bool taskexit) 425 static void put_css_set_locked(struct css_set *cset, bool taskexit)
426 { 426 {
427 struct cgrp_cset_link *link, *tmp_link; 427 struct cgrp_cset_link *link, *tmp_link;
428 struct cgroup_subsys *ss; 428 struct cgroup_subsys *ss;
429 int ssid; 429 int ssid;
430 430
431 lockdep_assert_held(&css_set_rwsem); 431 lockdep_assert_held(&css_set_rwsem);
432 432
433 if (!atomic_dec_and_test(&cset->refcount)) 433 if (!atomic_dec_and_test(&cset->refcount))
434 return; 434 return;
435 435
436 /* This css_set is dead. unlink it and release cgroup refcounts */ 436 /* This css_set is dead. unlink it and release cgroup refcounts */
437 for_each_subsys(ss, ssid) 437 for_each_subsys(ss, ssid)
438 list_del(&cset->e_cset_node[ssid]); 438 list_del(&cset->e_cset_node[ssid]);
439 hash_del(&cset->hlist); 439 hash_del(&cset->hlist);
440 css_set_count--; 440 css_set_count--;
441 441
442 list_for_each_entry_safe(link, tmp_link, &cset->cgrp_links, cgrp_link) { 442 list_for_each_entry_safe(link, tmp_link, &cset->cgrp_links, cgrp_link) {
443 struct cgroup *cgrp = link->cgrp; 443 struct cgroup *cgrp = link->cgrp;
444 444
445 list_del(&link->cset_link); 445 list_del(&link->cset_link);
446 list_del(&link->cgrp_link); 446 list_del(&link->cgrp_link);
447 447
448 /* @cgrp can't go away while we're holding css_set_rwsem */ 448 /* @cgrp can't go away while we're holding css_set_rwsem */
449 if (list_empty(&cgrp->cset_links) && notify_on_release(cgrp)) { 449 if (list_empty(&cgrp->cset_links) && notify_on_release(cgrp)) {
450 if (taskexit) 450 if (taskexit)
451 set_bit(CGRP_RELEASABLE, &cgrp->flags); 451 set_bit(CGRP_RELEASABLE, &cgrp->flags);
452 check_for_release(cgrp); 452 check_for_release(cgrp);
453 } 453 }
454 454
455 kfree(link); 455 kfree(link);
456 } 456 }
457 457
458 kfree_rcu(cset, rcu_head); 458 kfree_rcu(cset, rcu_head);
459 } 459 }
460 460
461 static void put_css_set(struct css_set *cset, bool taskexit) 461 static void put_css_set(struct css_set *cset, bool taskexit)
462 { 462 {
463 /* 463 /*
464 * Ensure that the refcount doesn't hit zero while any readers 464 * Ensure that the refcount doesn't hit zero while any readers
465 * can see it. Similar to atomic_dec_and_lock(), but for an 465 * can see it. Similar to atomic_dec_and_lock(), but for an
466 * rwlock 466 * rwlock
467 */ 467 */
468 if (atomic_add_unless(&cset->refcount, -1, 1)) 468 if (atomic_add_unless(&cset->refcount, -1, 1))
469 return; 469 return;
470 470
471 down_write(&css_set_rwsem); 471 down_write(&css_set_rwsem);
472 put_css_set_locked(cset, taskexit); 472 put_css_set_locked(cset, taskexit);
473 up_write(&css_set_rwsem); 473 up_write(&css_set_rwsem);
474 } 474 }
475 475
476 /* 476 /*
477 * refcounted get/put for css_set objects 477 * refcounted get/put for css_set objects
478 */ 478 */
479 static inline void get_css_set(struct css_set *cset) 479 static inline void get_css_set(struct css_set *cset)
480 { 480 {
481 atomic_inc(&cset->refcount); 481 atomic_inc(&cset->refcount);
482 } 482 }
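A sketch of the get/put discipline around css_set_rwsem, assuming the task_css_set() accessor from cgroup.h; the wrapper name is invented and is not part of this commit.

static struct css_set *pin_task_css_set(struct task_struct *task)
{
	struct css_set *cset;

	down_read(&css_set_rwsem);
	cset = task_css_set(task);
	get_css_set(cset);
	up_read(&css_set_rwsem);

	/* caller eventually drops the reference with put_css_set(cset, false) */
	return cset;
}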
483 483
484 /** 484 /**
485 * compare_css_sets - helper function for find_existing_css_set(). 485 * compare_css_sets - helper function for find_existing_css_set().
486 * @cset: candidate css_set being tested 486 * @cset: candidate css_set being tested
487 * @old_cset: existing css_set for a task 487 * @old_cset: existing css_set for a task
488 * @new_cgrp: cgroup that's being entered by the task 488 * @new_cgrp: cgroup that's being entered by the task
489 * @template: desired set of css pointers in css_set (pre-calculated) 489 * @template: desired set of css pointers in css_set (pre-calculated)
490 * 490 *
491 * Returns true if "cset" matches "old_cset" except for the hierarchy 491 * Returns true if "cset" matches "old_cset" except for the hierarchy
492 * which "new_cgrp" belongs to, for which it should match "new_cgrp". 492 * which "new_cgrp" belongs to, for which it should match "new_cgrp".
493 */ 493 */
494 static bool compare_css_sets(struct css_set *cset, 494 static bool compare_css_sets(struct css_set *cset,
495 struct css_set *old_cset, 495 struct css_set *old_cset,
496 struct cgroup *new_cgrp, 496 struct cgroup *new_cgrp,
497 struct cgroup_subsys_state *template[]) 497 struct cgroup_subsys_state *template[])
498 { 498 {
499 struct list_head *l1, *l2; 499 struct list_head *l1, *l2;
500 500
501 /* 501 /*
502 * On the default hierarchy, there can be csets which are 502 * On the default hierarchy, there can be csets which are
503 * associated with the same set of cgroups but different csses. 503 * associated with the same set of cgroups but different csses.
504 * Let's first ensure that csses match. 504 * Let's first ensure that csses match.
505 */ 505 */
506 if (memcmp(template, cset->subsys, sizeof(cset->subsys))) 506 if (memcmp(template, cset->subsys, sizeof(cset->subsys)))
507 return false; 507 return false;
508 508
509 /* 509 /*
510 * Compare cgroup pointers in order to distinguish between 510 * Compare cgroup pointers in order to distinguish between
511 * different cgroups in hierarchies. As different cgroups may 511 * different cgroups in hierarchies. As different cgroups may
512 * share the same effective css, this comparison is always 512 * share the same effective css, this comparison is always
513 * necessary. 513 * necessary.
514 */ 514 */
515 l1 = &cset->cgrp_links; 515 l1 = &cset->cgrp_links;
516 l2 = &old_cset->cgrp_links; 516 l2 = &old_cset->cgrp_links;
517 while (1) { 517 while (1) {
518 struct cgrp_cset_link *link1, *link2; 518 struct cgrp_cset_link *link1, *link2;
519 struct cgroup *cgrp1, *cgrp2; 519 struct cgroup *cgrp1, *cgrp2;
520 520
521 l1 = l1->next; 521 l1 = l1->next;
522 l2 = l2->next; 522 l2 = l2->next;
523 /* See if we reached the end - both lists are equal length. */ 523 /* See if we reached the end - both lists are equal length. */
524 if (l1 == &cset->cgrp_links) { 524 if (l1 == &cset->cgrp_links) {
525 BUG_ON(l2 != &old_cset->cgrp_links); 525 BUG_ON(l2 != &old_cset->cgrp_links);
526 break; 526 break;
527 } else { 527 } else {
528 BUG_ON(l2 == &old_cset->cgrp_links); 528 BUG_ON(l2 == &old_cset->cgrp_links);
529 } 529 }
530 /* Locate the cgroups associated with these links. */ 530 /* Locate the cgroups associated with these links. */
531 link1 = list_entry(l1, struct cgrp_cset_link, cgrp_link); 531 link1 = list_entry(l1, struct cgrp_cset_link, cgrp_link);
532 link2 = list_entry(l2, struct cgrp_cset_link, cgrp_link); 532 link2 = list_entry(l2, struct cgrp_cset_link, cgrp_link);
533 cgrp1 = link1->cgrp; 533 cgrp1 = link1->cgrp;
534 cgrp2 = link2->cgrp; 534 cgrp2 = link2->cgrp;
535 /* Hierarchies should be linked in the same order. */ 535 /* Hierarchies should be linked in the same order. */
536 BUG_ON(cgrp1->root != cgrp2->root); 536 BUG_ON(cgrp1->root != cgrp2->root);
537 537
538 /* 538 /*
539 * If this hierarchy is the hierarchy of the cgroup 539 * If this hierarchy is the hierarchy of the cgroup
540 * that's changing, then we need to check that this 540 * that's changing, then we need to check that this
541 * css_set points to the new cgroup; if it's any other 541 * css_set points to the new cgroup; if it's any other
542 * hierarchy, then this css_set should point to the 542 * hierarchy, then this css_set should point to the
543 * same cgroup as the old css_set. 543 * same cgroup as the old css_set.
544 */ 544 */
545 if (cgrp1->root == new_cgrp->root) { 545 if (cgrp1->root == new_cgrp->root) {
546 if (cgrp1 != new_cgrp) 546 if (cgrp1 != new_cgrp)
547 return false; 547 return false;
548 } else { 548 } else {
549 if (cgrp1 != cgrp2) 549 if (cgrp1 != cgrp2)
550 return false; 550 return false;
551 } 551 }
552 } 552 }
553 return true; 553 return true;
554 } 554 }
555 555
556 /** 556 /**
557 * find_existing_css_set - init css array and find the matching css_set 557 * find_existing_css_set - init css array and find the matching css_set
558 * @old_cset: the css_set that we're using before the cgroup transition 558 * @old_cset: the css_set that we're using before the cgroup transition
559 * @cgrp: the cgroup that we're moving into 559 * @cgrp: the cgroup that we're moving into
560 * @template: out param for the new set of csses, should be clear on entry 560 * @template: out param for the new set of csses, should be clear on entry
561 */ 561 */
562 static struct css_set *find_existing_css_set(struct css_set *old_cset, 562 static struct css_set *find_existing_css_set(struct css_set *old_cset,
563 struct cgroup *cgrp, 563 struct cgroup *cgrp,
564 struct cgroup_subsys_state *template[]) 564 struct cgroup_subsys_state *template[])
565 { 565 {
566 struct cgroup_root *root = cgrp->root; 566 struct cgroup_root *root = cgrp->root;
567 struct cgroup_subsys *ss; 567 struct cgroup_subsys *ss;
568 struct css_set *cset; 568 struct css_set *cset;
569 unsigned long key; 569 unsigned long key;
570 int i; 570 int i;
571 571
572 /* 572 /*
573 * Build the set of subsystem state objects that we want to see in the 573 * Build the set of subsystem state objects that we want to see in the
574 * new css_set. While subsystems can change globally, the entries here 574 * new css_set. While subsystems can change globally, the entries here
575 * won't change, so no need for locking. 575 * won't change, so no need for locking.
576 */ 576 */
577 for_each_subsys(ss, i) { 577 for_each_subsys(ss, i) {
578 if (root->subsys_mask & (1UL << i)) { 578 if (root->subsys_mask & (1UL << i)) {
579 /* 579 /*
580 * @ss is in this hierarchy, so we want the 580 * @ss is in this hierarchy, so we want the
581 * effective css from @cgrp. 581 * effective css from @cgrp.
582 */ 582 */
583 template[i] = cgroup_e_css(cgrp, ss); 583 template[i] = cgroup_e_css(cgrp, ss);
584 } else { 584 } else {
585 /* 585 /*
586 * @ss is not in this hierarchy, so we don't want 586 * @ss is not in this hierarchy, so we don't want
587 * to change the css. 587 * to change the css.
588 */ 588 */
589 template[i] = old_cset->subsys[i]; 589 template[i] = old_cset->subsys[i];
590 } 590 }
591 } 591 }
592 592
593 key = css_set_hash(template); 593 key = css_set_hash(template);
594 hash_for_each_possible(css_set_table, cset, hlist, key) { 594 hash_for_each_possible(css_set_table, cset, hlist, key) {
595 if (!compare_css_sets(cset, old_cset, cgrp, template)) 595 if (!compare_css_sets(cset, old_cset, cgrp, template))
596 continue; 596 continue;
597 597
598 /* This css_set matches what we need */ 598 /* This css_set matches what we need */
599 return cset; 599 return cset;
600 } 600 }
601 601
602 /* No existing cgroup group matched */ 602 /* No existing cgroup group matched */
603 return NULL; 603 return NULL;
604 } 604 }
605 605
606 static void free_cgrp_cset_links(struct list_head *links_to_free) 606 static void free_cgrp_cset_links(struct list_head *links_to_free)
607 { 607 {
608 struct cgrp_cset_link *link, *tmp_link; 608 struct cgrp_cset_link *link, *tmp_link;
609 609
610 list_for_each_entry_safe(link, tmp_link, links_to_free, cset_link) { 610 list_for_each_entry_safe(link, tmp_link, links_to_free, cset_link) {
611 list_del(&link->cset_link); 611 list_del(&link->cset_link);
612 kfree(link); 612 kfree(link);
613 } 613 }
614 } 614 }
615 615
616 /** 616 /**
617 * allocate_cgrp_cset_links - allocate cgrp_cset_links 617 * allocate_cgrp_cset_links - allocate cgrp_cset_links
618 * @count: the number of links to allocate 618 * @count: the number of links to allocate
619 * @tmp_links: list_head the allocated links are put on 619 * @tmp_links: list_head the allocated links are put on
620 * 620 *
621 * Allocate @count cgrp_cset_link structures and chain them on @tmp_links 621 * Allocate @count cgrp_cset_link structures and chain them on @tmp_links
622 * through ->cset_link. Returns 0 on success or -errno. 622 * through ->cset_link. Returns 0 on success or -errno.
623 */ 623 */
624 static int allocate_cgrp_cset_links(int count, struct list_head *tmp_links) 624 static int allocate_cgrp_cset_links(int count, struct list_head *tmp_links)
625 { 625 {
626 struct cgrp_cset_link *link; 626 struct cgrp_cset_link *link;
627 int i; 627 int i;
628 628
629 INIT_LIST_HEAD(tmp_links); 629 INIT_LIST_HEAD(tmp_links);
630 630
631 for (i = 0; i < count; i++) { 631 for (i = 0; i < count; i++) {
632 link = kzalloc(sizeof(*link), GFP_KERNEL); 632 link = kzalloc(sizeof(*link), GFP_KERNEL);
633 if (!link) { 633 if (!link) {
634 free_cgrp_cset_links(tmp_links); 634 free_cgrp_cset_links(tmp_links);
635 return -ENOMEM; 635 return -ENOMEM;
636 } 636 }
637 list_add(&link->cset_link, tmp_links); 637 list_add(&link->cset_link, tmp_links);
638 } 638 }
639 return 0; 639 return 0;
640 } 640 }
641 641
642 /** 642 /**
643 * link_css_set - a helper function to link a css_set to a cgroup 643 * link_css_set - a helper function to link a css_set to a cgroup
644 * @tmp_links: cgrp_cset_link objects allocated by allocate_cgrp_cset_links() 644 * @tmp_links: cgrp_cset_link objects allocated by allocate_cgrp_cset_links()
645 * @cset: the css_set to be linked 645 * @cset: the css_set to be linked
646 * @cgrp: the destination cgroup 646 * @cgrp: the destination cgroup
647 */ 647 */
648 static void link_css_set(struct list_head *tmp_links, struct css_set *cset, 648 static void link_css_set(struct list_head *tmp_links, struct css_set *cset,
649 struct cgroup *cgrp) 649 struct cgroup *cgrp)
650 { 650 {
651 struct cgrp_cset_link *link; 651 struct cgrp_cset_link *link;
652 652
653 BUG_ON(list_empty(tmp_links)); 653 BUG_ON(list_empty(tmp_links));
654
655 if (cgroup_on_dfl(cgrp))
656 cset->dfl_cgrp = cgrp;
657
654 link = list_first_entry(tmp_links, struct cgrp_cset_link, cset_link); 658 link = list_first_entry(tmp_links, struct cgrp_cset_link, cset_link);
655 link->cset = cset; 659 link->cset = cset;
656 link->cgrp = cgrp; 660 link->cgrp = cgrp;
657 list_move(&link->cset_link, &cgrp->cset_links); 661 list_move(&link->cset_link, &cgrp->cset_links);
658 /* 662 /*
659 * Always add links to the tail of the list so that the list 663 * Always add links to the tail of the list so that the list
660 * is sorted by order of hierarchy creation 664 * is sorted by order of hierarchy creation
661 */ 665 */
662 list_add_tail(&link->cgrp_link, &cset->cgrp_links); 666 list_add_tail(&link->cgrp_link, &cset->cgrp_links);
663 } 667 }
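The dfl_cgrp assignment added above is what the commit message describes: once every css_set caches its default-hierarchy cgroup, the lookup becomes a single dereference. A hypothetical accessor (not part of this commit) might look like the following, assuming task_css_set() from cgroup.h:

static struct cgroup *task_dfl_cgroup(struct task_struct *task)
{
	/* caller must hold css_set_rwsem so task->cgroups can't change */
	lockdep_assert_held(&css_set_rwsem);

	return task_css_set(task)->dfl_cgrp;
}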
664 668
665 /** 669 /**
666 * find_css_set - return a new css_set with one cgroup updated 670 * find_css_set - return a new css_set with one cgroup updated
667 * @old_cset: the baseline css_set 671 * @old_cset: the baseline css_set
668 * @cgrp: the cgroup to be updated 672 * @cgrp: the cgroup to be updated
669 * 673 *
670 * Return a new css_set that's equivalent to @old_cset, but with @cgrp 674 * Return a new css_set that's equivalent to @old_cset, but with @cgrp
671 * substituted into the appropriate hierarchy. 675 * substituted into the appropriate hierarchy.
672 */ 676 */
673 static struct css_set *find_css_set(struct css_set *old_cset, 677 static struct css_set *find_css_set(struct css_set *old_cset,
674 struct cgroup *cgrp) 678 struct cgroup *cgrp)
675 { 679 {
676 struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT] = { }; 680 struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT] = { };
677 struct css_set *cset; 681 struct css_set *cset;
678 struct list_head tmp_links; 682 struct list_head tmp_links;
679 struct cgrp_cset_link *link; 683 struct cgrp_cset_link *link;
680 struct cgroup_subsys *ss; 684 struct cgroup_subsys *ss;
681 unsigned long key; 685 unsigned long key;
682 int ssid; 686 int ssid;
683 687
684 lockdep_assert_held(&cgroup_mutex); 688 lockdep_assert_held(&cgroup_mutex);
685 689
686 /* First see if we already have a cgroup group that matches 690 /* First see if we already have a cgroup group that matches
687 * the desired set */ 691 * the desired set */
688 down_read(&css_set_rwsem); 692 down_read(&css_set_rwsem);
689 cset = find_existing_css_set(old_cset, cgrp, template); 693 cset = find_existing_css_set(old_cset, cgrp, template);
690 if (cset) 694 if (cset)
691 get_css_set(cset); 695 get_css_set(cset);
692 up_read(&css_set_rwsem); 696 up_read(&css_set_rwsem);
693 697
694 if (cset) 698 if (cset)
695 return cset; 699 return cset;
696 700
697 cset = kzalloc(sizeof(*cset), GFP_KERNEL); 701 cset = kzalloc(sizeof(*cset), GFP_KERNEL);
698 if (!cset) 702 if (!cset)
699 return NULL; 703 return NULL;
700 704
701 /* Allocate all the cgrp_cset_link objects that we'll need */ 705 /* Allocate all the cgrp_cset_link objects that we'll need */
702 if (allocate_cgrp_cset_links(cgroup_root_count, &tmp_links) < 0) { 706 if (allocate_cgrp_cset_links(cgroup_root_count, &tmp_links) < 0) {
703 kfree(cset); 707 kfree(cset);
704 return NULL; 708 return NULL;
705 } 709 }
706 710
707 atomic_set(&cset->refcount, 1); 711 atomic_set(&cset->refcount, 1);
708 INIT_LIST_HEAD(&cset->cgrp_links); 712 INIT_LIST_HEAD(&cset->cgrp_links);
709 INIT_LIST_HEAD(&cset->tasks); 713 INIT_LIST_HEAD(&cset->tasks);
710 INIT_LIST_HEAD(&cset->mg_tasks); 714 INIT_LIST_HEAD(&cset->mg_tasks);
711 INIT_LIST_HEAD(&cset->mg_preload_node); 715 INIT_LIST_HEAD(&cset->mg_preload_node);
712 INIT_LIST_HEAD(&cset->mg_node); 716 INIT_LIST_HEAD(&cset->mg_node);
713 INIT_HLIST_NODE(&cset->hlist); 717 INIT_HLIST_NODE(&cset->hlist);
714 718
715 /* Copy the set of subsystem state objects generated in 719 /* Copy the set of subsystem state objects generated in
716 * find_existing_css_set() */ 720 * find_existing_css_set() */
717 memcpy(cset->subsys, template, sizeof(cset->subsys)); 721 memcpy(cset->subsys, template, sizeof(cset->subsys));
718 722
719 down_write(&css_set_rwsem); 723 down_write(&css_set_rwsem);
720 /* Add reference counts and links from the new css_set. */ 724 /* Add reference counts and links from the new css_set. */
721 list_for_each_entry(link, &old_cset->cgrp_links, cgrp_link) { 725 list_for_each_entry(link, &old_cset->cgrp_links, cgrp_link) {
722 struct cgroup *c = link->cgrp; 726 struct cgroup *c = link->cgrp;
723 727
724 if (c->root == cgrp->root) 728 if (c->root == cgrp->root)
725 c = cgrp; 729 c = cgrp;
726 link_css_set(&tmp_links, cset, c); 730 link_css_set(&tmp_links, cset, c);
727 } 731 }
728 732
729 BUG_ON(!list_empty(&tmp_links)); 733 BUG_ON(!list_empty(&tmp_links));
730 734
731 css_set_count++; 735 css_set_count++;
732 736
733 /* Add @cset to the hash table */ 737 /* Add @cset to the hash table */
734 key = css_set_hash(cset->subsys); 738 key = css_set_hash(cset->subsys);
735 hash_add(css_set_table, &cset->hlist, key); 739 hash_add(css_set_table, &cset->hlist, key);
736 740
737 for_each_subsys(ss, ssid) 741 for_each_subsys(ss, ssid)
738 list_add_tail(&cset->e_cset_node[ssid], 742 list_add_tail(&cset->e_cset_node[ssid],
739 &cset->subsys[ssid]->cgroup->e_csets[ssid]); 743 &cset->subsys[ssid]->cgroup->e_csets[ssid]);
740 744
741 up_write(&css_set_rwsem); 745 up_write(&css_set_rwsem);
742 746
743 return cset; 747 return cset;
744 } 748 }
745 749
746 static struct cgroup_root *cgroup_root_from_kf(struct kernfs_root *kf_root) 750 static struct cgroup_root *cgroup_root_from_kf(struct kernfs_root *kf_root)
747 { 751 {
748 struct cgroup *root_cgrp = kf_root->kn->priv; 752 struct cgroup *root_cgrp = kf_root->kn->priv;
749 753
750 return root_cgrp->root; 754 return root_cgrp->root;
751 } 755 }
752 756
753 static int cgroup_init_root_id(struct cgroup_root *root) 757 static int cgroup_init_root_id(struct cgroup_root *root)
754 { 758 {
755 int id; 759 int id;
756 760
757 lockdep_assert_held(&cgroup_mutex); 761 lockdep_assert_held(&cgroup_mutex);
758 762
759 id = idr_alloc_cyclic(&cgroup_hierarchy_idr, root, 0, 0, GFP_KERNEL); 763 id = idr_alloc_cyclic(&cgroup_hierarchy_idr, root, 0, 0, GFP_KERNEL);
760 if (id < 0) 764 if (id < 0)
761 return id; 765 return id;
762 766
763 root->hierarchy_id = id; 767 root->hierarchy_id = id;
764 return 0; 768 return 0;
765 } 769 }
766 770
767 static void cgroup_exit_root_id(struct cgroup_root *root) 771 static void cgroup_exit_root_id(struct cgroup_root *root)
768 { 772 {
769 lockdep_assert_held(&cgroup_mutex); 773 lockdep_assert_held(&cgroup_mutex);
770 774
771 if (root->hierarchy_id) { 775 if (root->hierarchy_id) {
772 idr_remove(&cgroup_hierarchy_idr, root->hierarchy_id); 776 idr_remove(&cgroup_hierarchy_idr, root->hierarchy_id);
773 root->hierarchy_id = 0; 777 root->hierarchy_id = 0;
774 } 778 }
775 } 779 }
776 780
777 static void cgroup_free_root(struct cgroup_root *root) 781 static void cgroup_free_root(struct cgroup_root *root)
778 { 782 {
779 if (root) { 783 if (root) {
780 /* hierarchy ID should already have been released */ 784 /* hierarchy ID should already have been released */
781 WARN_ON_ONCE(root->hierarchy_id); 785 WARN_ON_ONCE(root->hierarchy_id);
782 786
783 idr_destroy(&root->cgroup_idr); 787 idr_destroy(&root->cgroup_idr);
784 kfree(root); 788 kfree(root);
785 } 789 }
786 } 790 }
787 791
788 static void cgroup_destroy_root(struct cgroup_root *root) 792 static void cgroup_destroy_root(struct cgroup_root *root)
789 { 793 {
790 struct cgroup *cgrp = &root->cgrp; 794 struct cgroup *cgrp = &root->cgrp;
791 struct cgrp_cset_link *link, *tmp_link; 795 struct cgrp_cset_link *link, *tmp_link;
792 796
793 mutex_lock(&cgroup_tree_mutex); 797 mutex_lock(&cgroup_tree_mutex);
794 mutex_lock(&cgroup_mutex); 798 mutex_lock(&cgroup_mutex);
795 799
796 BUG_ON(atomic_read(&root->nr_cgrps)); 800 BUG_ON(atomic_read(&root->nr_cgrps));
797 BUG_ON(!list_empty(&cgrp->children)); 801 BUG_ON(!list_empty(&cgrp->children));
798 802
799 /* Rebind all subsystems back to the default hierarchy */ 803 /* Rebind all subsystems back to the default hierarchy */
800 rebind_subsystems(&cgrp_dfl_root, root->subsys_mask); 804 rebind_subsystems(&cgrp_dfl_root, root->subsys_mask);
801 805
802 /* 806 /*
803 * Release all the links from cset_links to this hierarchy's 807 * Release all the links from cset_links to this hierarchy's
804 * root cgroup 808 * root cgroup
805 */ 809 */
806 down_write(&css_set_rwsem); 810 down_write(&css_set_rwsem);
807 811
808 list_for_each_entry_safe(link, tmp_link, &cgrp->cset_links, cset_link) { 812 list_for_each_entry_safe(link, tmp_link, &cgrp->cset_links, cset_link) {
809 list_del(&link->cset_link); 813 list_del(&link->cset_link);
810 list_del(&link->cgrp_link); 814 list_del(&link->cgrp_link);
811 kfree(link); 815 kfree(link);
812 } 816 }
813 up_write(&css_set_rwsem); 817 up_write(&css_set_rwsem);
814 818
815 if (!list_empty(&root->root_list)) { 819 if (!list_empty(&root->root_list)) {
816 list_del(&root->root_list); 820 list_del(&root->root_list);
817 cgroup_root_count--; 821 cgroup_root_count--;
818 } 822 }
819 823
820 cgroup_exit_root_id(root); 824 cgroup_exit_root_id(root);
821 825
822 mutex_unlock(&cgroup_mutex); 826 mutex_unlock(&cgroup_mutex);
823 mutex_unlock(&cgroup_tree_mutex); 827 mutex_unlock(&cgroup_tree_mutex);
824 828
825 kernfs_destroy_root(root->kf_root); 829 kernfs_destroy_root(root->kf_root);
826 cgroup_free_root(root); 830 cgroup_free_root(root);
827 } 831 }
828 832
829 /* look up cgroup associated with given css_set on the specified hierarchy */ 833 /* look up cgroup associated with given css_set on the specified hierarchy */
830 static struct cgroup *cset_cgroup_from_root(struct css_set *cset, 834 static struct cgroup *cset_cgroup_from_root(struct css_set *cset,
831 struct cgroup_root *root) 835 struct cgroup_root *root)
832 { 836 {
833 struct cgroup *res = NULL; 837 struct cgroup *res = NULL;
834 838
835 lockdep_assert_held(&cgroup_mutex); 839 lockdep_assert_held(&cgroup_mutex);
836 lockdep_assert_held(&css_set_rwsem); 840 lockdep_assert_held(&css_set_rwsem);
837 841
838 if (cset == &init_css_set) { 842 if (cset == &init_css_set) {
839 res = &root->cgrp; 843 res = &root->cgrp;
840 } else { 844 } else {
841 struct cgrp_cset_link *link; 845 struct cgrp_cset_link *link;
842 846
843 list_for_each_entry(link, &cset->cgrp_links, cgrp_link) { 847 list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
844 struct cgroup *c = link->cgrp; 848 struct cgroup *c = link->cgrp;
845 849
846 if (c->root == root) { 850 if (c->root == root) {
847 res = c; 851 res = c;
848 break; 852 break;
849 } 853 }
850 } 854 }
851 } 855 }
852 856
853 BUG_ON(!res); 857 BUG_ON(!res);
854 return res; 858 return res;
855 } 859 }
856 860
857 /* 861 /*
858 * Return the cgroup for "task" from the given hierarchy. Must be 862 * Return the cgroup for "task" from the given hierarchy. Must be
859 * called with cgroup_mutex and css_set_rwsem held. 863 * called with cgroup_mutex and css_set_rwsem held.
860 */ 864 */
861 static struct cgroup *task_cgroup_from_root(struct task_struct *task, 865 static struct cgroup *task_cgroup_from_root(struct task_struct *task,
862 struct cgroup_root *root) 866 struct cgroup_root *root)
863 { 867 {
864 /* 868 /*
865 * No need to lock the task - since we hold cgroup_mutex the 869 * No need to lock the task - since we hold cgroup_mutex the
866 * task can't change groups, so the only thing that can happen 870 * task can't change groups, so the only thing that can happen
867 * is that it exits and its css is set back to init_css_set. 871 * is that it exits and its css is set back to init_css_set.
868 */ 872 */
869 return cset_cgroup_from_root(task_css_set(task), root); 873 return cset_cgroup_from_root(task_css_set(task), root);
870 } 874 }
871 875
872 /* 876 /*
873 * A task must hold cgroup_mutex to modify cgroups. 877 * A task must hold cgroup_mutex to modify cgroups.
874 * 878 *
875 * Any task can increment and decrement the count field without lock. 879 * Any task can increment and decrement the count field without lock.
876 * So in general, code holding cgroup_mutex can't rely on the count 880 * So in general, code holding cgroup_mutex can't rely on the count
877 * field not changing. However, if the count goes to zero, then only 881 * field not changing. However, if the count goes to zero, then only
878 * cgroup_attach_task() can increment it again. A count of zero 882 * cgroup_attach_task() can increment it again. A count of zero
879 * means that no tasks are currently attached; therefore there is no 883 * means that no tasks are currently attached; therefore there is no
880 * way a task attached to that cgroup can fork (the other way to 884 * way a task attached to that cgroup can fork (the other way to
881 * increment the count). So code holding cgroup_mutex can safely 885 * increment the count). So code holding cgroup_mutex can safely
882 * assume that if the count is zero, it will stay zero. Similarly, if 886 * assume that if the count is zero, it will stay zero. Similarly, if
883 * a task holds cgroup_mutex on a cgroup with zero count, it 887 * a task holds cgroup_mutex on a cgroup with zero count, it
884 * knows that the cgroup won't be removed, as cgroup_rmdir() 888 * knows that the cgroup won't be removed, as cgroup_rmdir()
885 * needs that mutex. 889 * needs that mutex.
886 * 890 *
887 * The fork and exit callbacks cgroup_fork() and cgroup_exit(), don't 891 * The fork and exit callbacks cgroup_fork() and cgroup_exit(), don't
888 * (usually) take cgroup_mutex. These are the two most performance 892 * (usually) take cgroup_mutex. These are the two most performance
889 * critical pieces of code here. The exception occurs on cgroup_exit(), 893 * critical pieces of code here. The exception occurs on cgroup_exit(),
890 * when a task in a notify_on_release cgroup exits. Then cgroup_mutex 894 * when a task in a notify_on_release cgroup exits. Then cgroup_mutex
891 * is taken, and if the cgroup count is zero, a usermode call is made 895 * is taken, and if the cgroup count is zero, a usermode call is made
892 * to the release agent with the name of the cgroup (path relative to 896 * to the release agent with the name of the cgroup (path relative to
893 * the root of cgroup file system) as the argument. 897 * the root of cgroup file system) as the argument.
894 * 898 *
895 * A cgroup can only be deleted if both its 'count' of using tasks 899 * A cgroup can only be deleted if both its 'count' of using tasks
896 * is zero, and its list of 'children' cgroups is empty. Since all 900 * is zero, and its list of 'children' cgroups is empty. Since all
897 * tasks in the system use _some_ cgroup, and since there is always at 901 * tasks in the system use _some_ cgroup, and since there is always at
898 * least one task in the system (init, pid == 1), therefore, root cgroup 902 * least one task in the system (init, pid == 1), therefore, root cgroup
899 * least one task in the system (init, pid == 1), the root cgroup 903 * least one task in the system (init, pid == 1), the root cgroup
900 * always has child cgroups and/or attached tasks. So we don't 904 * always has child cgroups and/or attached tasks. So we don't
901 * 905 *
902 * P.S. One more locking exception. RCU is used to guard the 906 * P.S. One more locking exception. RCU is used to guard the
903 * update of a task's cgroup pointer by cgroup_attach_task(). 907 * update of a task's cgroup pointer by cgroup_attach_task().
904 */ 908 */
905 909
906 static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask); 910 static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask);
907 static struct kernfs_syscall_ops cgroup_kf_syscall_ops; 911 static struct kernfs_syscall_ops cgroup_kf_syscall_ops;
908 static const struct file_operations proc_cgroupstats_operations; 912 static const struct file_operations proc_cgroupstats_operations;
909 913
910 static char *cgroup_file_name(struct cgroup *cgrp, const struct cftype *cft, 914 static char *cgroup_file_name(struct cgroup *cgrp, const struct cftype *cft,
911 char *buf) 915 char *buf)
912 { 916 {
913 if (cft->ss && !(cft->flags & CFTYPE_NO_PREFIX) && 917 if (cft->ss && !(cft->flags & CFTYPE_NO_PREFIX) &&
914 !(cgrp->root->flags & CGRP_ROOT_NOPREFIX)) 918 !(cgrp->root->flags & CGRP_ROOT_NOPREFIX))
915 snprintf(buf, CGROUP_FILE_NAME_MAX, "%s.%s", 919 snprintf(buf, CGROUP_FILE_NAME_MAX, "%s.%s",
916 cft->ss->name, cft->name); 920 cft->ss->name, cft->name);
917 else 921 else
918 strncpy(buf, cft->name, CGROUP_FILE_NAME_MAX); 922 strncpy(buf, cft->name, CGROUP_FILE_NAME_MAX);
919 return buf; 923 return buf;
920 } 924 }
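
/*
 * Illustrative note (not part of cgroup.c): with the prefixing rule above,
 * a control file belonging to a subsystem, e.g. the memory controller's
 * "limit_in_bytes", appears as "memory.limit_in_bytes" in a cgroup
 * directory, while cgroup core files with cft->ss == NULL, such as
 * "tasks", keep their bare names.
 */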
921 925
922 /** 926 /**
923 * cgroup_file_mode - deduce file mode of a control file 927 * cgroup_file_mode - deduce file mode of a control file
924 * @cft: the control file in question 928 * @cft: the control file in question
925 * 929 *
926 * returns cft->mode if ->mode is not 0 930 * returns cft->mode if ->mode is not 0
927 * returns S_IRUGO|S_IWUSR if it has both a read and a write handler 931 * returns S_IRUGO|S_IWUSR if it has both a read and a write handler
928 * returns S_IRUGO if it has only a read handler 932 * returns S_IRUGO if it has only a read handler
929 * returns S_IWUSR if it has only a write handler 933 * returns S_IWUSR if it has only a write handler
930 */ 934 */
931 static umode_t cgroup_file_mode(const struct cftype *cft) 935 static umode_t cgroup_file_mode(const struct cftype *cft)
932 { 936 {
933 umode_t mode = 0; 937 umode_t mode = 0;
934 938
935 if (cft->mode) 939 if (cft->mode)
936 return cft->mode; 940 return cft->mode;
937 941
938 if (cft->read_u64 || cft->read_s64 || cft->seq_show) 942 if (cft->read_u64 || cft->read_s64 || cft->seq_show)
939 mode |= S_IRUGO; 943 mode |= S_IRUGO;
940 944
941 if (cft->write_u64 || cft->write_s64 || cft->write_string || 945 if (cft->write_u64 || cft->write_s64 || cft->write_string ||
942 cft->trigger) 946 cft->trigger)
943 mode |= S_IWUSR; 947 mode |= S_IWUSR;
944 948
945 return mode; 949 return mode;
946 } 950 }
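
/*
 * Sketch (illustration only, not part of this file): given the rules in
 * cgroup_file_mode(), a hypothetical cftype such as
 *
 *	{ .name = "example", .read_u64 = ex_read_u64, .write_u64 = ex_write_u64 }
 *
 * would get S_IRUGO | S_IWUSR (0644), while one that only sets .seq_show
 * would get S_IRUGO (0444). "example", ex_read_u64 and ex_write_u64 are
 * made-up names used only for this sketch.
 */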
947 951
948 static void cgroup_free_fn(struct work_struct *work) 952 static void cgroup_free_fn(struct work_struct *work)
949 { 953 {
950 struct cgroup *cgrp = container_of(work, struct cgroup, destroy_work); 954 struct cgroup *cgrp = container_of(work, struct cgroup, destroy_work);
951 955
952 atomic_dec(&cgrp->root->nr_cgrps); 956 atomic_dec(&cgrp->root->nr_cgrps);
953 cgroup_pidlist_destroy_all(cgrp); 957 cgroup_pidlist_destroy_all(cgrp);
954 958
955 if (cgrp->parent) { 959 if (cgrp->parent) {
956 /* 960 /*
957 * We get a ref to the parent, and put the ref when this 961 * We get a ref to the parent, and put the ref when this
958 * cgroup is being freed, so it's guaranteed that the 962 * cgroup is being freed, so it's guaranteed that the
959 * parent won't be destroyed before its children. 963 * parent won't be destroyed before its children.
960 */ 964 */
961 cgroup_put(cgrp->parent); 965 cgroup_put(cgrp->parent);
962 kernfs_put(cgrp->kn); 966 kernfs_put(cgrp->kn);
963 kfree(cgrp); 967 kfree(cgrp);
964 } else { 968 } else {
965 /* 969 /*
966 * This is root cgroup's refcnt reaching zero, which 970 * This is root cgroup's refcnt reaching zero, which
967 * indicates that the root should be released. 971 * indicates that the root should be released.
968 */ 972 */
969 cgroup_destroy_root(cgrp->root); 973 cgroup_destroy_root(cgrp->root);
970 } 974 }
971 } 975 }
972 976
973 static void cgroup_free_rcu(struct rcu_head *head) 977 static void cgroup_free_rcu(struct rcu_head *head)
974 { 978 {
975 struct cgroup *cgrp = container_of(head, struct cgroup, rcu_head); 979 struct cgroup *cgrp = container_of(head, struct cgroup, rcu_head);
976 980
977 INIT_WORK(&cgrp->destroy_work, cgroup_free_fn); 981 INIT_WORK(&cgrp->destroy_work, cgroup_free_fn);
978 queue_work(cgroup_destroy_wq, &cgrp->destroy_work); 982 queue_work(cgroup_destroy_wq, &cgrp->destroy_work);
979 } 983 }
980 984
981 static void cgroup_get(struct cgroup *cgrp) 985 static void cgroup_get(struct cgroup *cgrp)
982 { 986 {
983 WARN_ON_ONCE(cgroup_is_dead(cgrp)); 987 WARN_ON_ONCE(cgroup_is_dead(cgrp));
984 WARN_ON_ONCE(atomic_read(&cgrp->refcnt) <= 0); 988 WARN_ON_ONCE(atomic_read(&cgrp->refcnt) <= 0);
985 atomic_inc(&cgrp->refcnt); 989 atomic_inc(&cgrp->refcnt);
986 } 990 }
987 991
988 static void cgroup_put(struct cgroup *cgrp) 992 static void cgroup_put(struct cgroup *cgrp)
989 { 993 {
990 if (!atomic_dec_and_test(&cgrp->refcnt)) 994 if (!atomic_dec_and_test(&cgrp->refcnt))
991 return; 995 return;
992 if (WARN_ON_ONCE(cgrp->parent && !cgroup_is_dead(cgrp))) 996 if (WARN_ON_ONCE(cgrp->parent && !cgroup_is_dead(cgrp)))
993 return; 997 return;
994 998
995 /* 999 /*
996 * XXX: cgrp->id is only used to look up css's. As cgroup and 1000 * XXX: cgrp->id is only used to look up css's. As cgroup and
997 * css's lifetimes will be decoupled, it should be made 1001 * css's lifetimes will be decoupled, it should be made
998 * per-subsystem and moved to css->id so that lookups are 1002 * per-subsystem and moved to css->id so that lookups are
999 * successful until the target css is released. 1003 * successful until the target css is released.
1000 */ 1004 */
1001 mutex_lock(&cgroup_mutex); 1005 mutex_lock(&cgroup_mutex);
1002 idr_remove(&cgrp->root->cgroup_idr, cgrp->id); 1006 idr_remove(&cgrp->root->cgroup_idr, cgrp->id);
1003 mutex_unlock(&cgroup_mutex); 1007 mutex_unlock(&cgroup_mutex);
1004 cgrp->id = -1; 1008 cgrp->id = -1;
1005 1009
1006 call_rcu(&cgrp->rcu_head, cgroup_free_rcu); 1010 call_rcu(&cgrp->rcu_head, cgroup_free_rcu);
1007 } 1011 }
1008 1012
1009 static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft) 1013 static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft)
1010 { 1014 {
1011 char name[CGROUP_FILE_NAME_MAX]; 1015 char name[CGROUP_FILE_NAME_MAX];
1012 1016
1013 lockdep_assert_held(&cgroup_tree_mutex); 1017 lockdep_assert_held(&cgroup_tree_mutex);
1014 kernfs_remove_by_name(cgrp->kn, cgroup_file_name(cgrp, cft, name)); 1018 kernfs_remove_by_name(cgrp->kn, cgroup_file_name(cgrp, cft, name));
1015 } 1019 }
1016 1020
1017 /** 1021 /**
1018 * cgroup_clear_dir - remove subsys files in a cgroup directory 1022 * cgroup_clear_dir - remove subsys files in a cgroup directory
1019 * @cgrp: target cgroup 1023 * @cgrp: target cgroup
1020 * @subsys_mask: mask of the subsystem ids whose files should be removed 1024 * @subsys_mask: mask of the subsystem ids whose files should be removed
1021 */ 1025 */
1022 static void cgroup_clear_dir(struct cgroup *cgrp, unsigned long subsys_mask) 1026 static void cgroup_clear_dir(struct cgroup *cgrp, unsigned long subsys_mask)
1023 { 1027 {
1024 struct cgroup_subsys *ss; 1028 struct cgroup_subsys *ss;
1025 int i; 1029 int i;
1026 1030
1027 for_each_subsys(ss, i) { 1031 for_each_subsys(ss, i) {
1028 struct cftype *cfts; 1032 struct cftype *cfts;
1029 1033
1030 if (!test_bit(i, &subsys_mask)) 1034 if (!test_bit(i, &subsys_mask))
1031 continue; 1035 continue;
1032 list_for_each_entry(cfts, &ss->cfts, node) 1036 list_for_each_entry(cfts, &ss->cfts, node)
1033 cgroup_addrm_files(cgrp, cfts, false); 1037 cgroup_addrm_files(cgrp, cfts, false);
1034 } 1038 }
1035 } 1039 }
1036 1040
1037 static int rebind_subsystems(struct cgroup_root *dst_root, 1041 static int rebind_subsystems(struct cgroup_root *dst_root,
1038 unsigned long ss_mask) 1042 unsigned long ss_mask)
1039 { 1043 {
1040 struct cgroup_subsys *ss; 1044 struct cgroup_subsys *ss;
1041 int ssid, i, ret; 1045 int ssid, i, ret;
1042 1046
1043 lockdep_assert_held(&cgroup_tree_mutex); 1047 lockdep_assert_held(&cgroup_tree_mutex);
1044 lockdep_assert_held(&cgroup_mutex); 1048 lockdep_assert_held(&cgroup_mutex);
1045 1049
1046 for_each_subsys(ss, ssid) { 1050 for_each_subsys(ss, ssid) {
1047 if (!(ss_mask & (1 << ssid))) 1051 if (!(ss_mask & (1 << ssid)))
1048 continue; 1052 continue;
1049 1053
1050 /* if @ss is on the default root, we can always move it */ 1054 /* if @ss is on the default root, we can always move it */
1051 if (ss->root == &cgrp_dfl_root) 1055 if (ss->root == &cgrp_dfl_root)
1052 continue; 1056 continue;
1053 1057
1054 /* if @ss has non-root cgroups attached to it, can't move */ 1058 /* if @ss has non-root cgroups attached to it, can't move */
1055 if (!list_empty(&ss->root->cgrp.children)) 1059 if (!list_empty(&ss->root->cgrp.children))
1056 return -EBUSY; 1060 return -EBUSY;
1057 1061
1058 /* can't move between two non-default roots either */ 1062 /* can't move between two non-default roots either */
1059 if (dst_root != &cgrp_dfl_root) 1063 if (dst_root != &cgrp_dfl_root)
1060 return -EBUSY; 1064 return -EBUSY;
1061 } 1065 }
1062 1066
1063 ret = cgroup_populate_dir(&dst_root->cgrp, ss_mask); 1067 ret = cgroup_populate_dir(&dst_root->cgrp, ss_mask);
1064 if (ret) { 1068 if (ret) {
1065 if (dst_root != &cgrp_dfl_root) 1069 if (dst_root != &cgrp_dfl_root)
1066 return ret; 1070 return ret;
1067 1071
1068 /* 1072 /*
1069 * Rebinding back to the default root is not allowed to 1073 * Rebinding back to the default root is not allowed to
1070 * fail. Using both default and non-default roots should 1074 * fail. Using both default and non-default roots should
1071 * be rare. Moving subsystems back and forth even more so. 1075 * be rare. Moving subsystems back and forth even more so.
1072 * Just warn about it and continue. 1076 * Just warn about it and continue.
1073 */ 1077 */
1074 if (cgrp_dfl_root_visible) { 1078 if (cgrp_dfl_root_visible) {
1075 pr_warning("cgroup: failed to create files (%d) while rebinding 0x%lx to default root\n", 1079 pr_warning("cgroup: failed to create files (%d) while rebinding 0x%lx to default root\n",
1076 ret, ss_mask); 1080 ret, ss_mask);
1077 pr_warning("cgroup: you may retry by moving them to a different hierarchy and unbinding\n"); 1081 pr_warning("cgroup: you may retry by moving them to a different hierarchy and unbinding\n");
1078 } 1082 }
1079 } 1083 }
1080 1084
1081 /* 1085 /*
1082 * Nothing can fail from this point on. Remove files for the 1086 * Nothing can fail from this point on. Remove files for the
1083 * removed subsystems and rebind each subsystem. 1087 * removed subsystems and rebind each subsystem.
1084 */ 1088 */
1085 mutex_unlock(&cgroup_mutex); 1089 mutex_unlock(&cgroup_mutex);
1086 for_each_subsys(ss, ssid) 1090 for_each_subsys(ss, ssid)
1087 if (ss_mask & (1 << ssid)) 1091 if (ss_mask & (1 << ssid))
1088 cgroup_clear_dir(&ss->root->cgrp, 1 << ssid); 1092 cgroup_clear_dir(&ss->root->cgrp, 1 << ssid);
1089 mutex_lock(&cgroup_mutex); 1093 mutex_lock(&cgroup_mutex);
1090 1094
1091 for_each_subsys(ss, ssid) { 1095 for_each_subsys(ss, ssid) {
1092 struct cgroup_root *src_root; 1096 struct cgroup_root *src_root;
1093 struct cgroup_subsys_state *css; 1097 struct cgroup_subsys_state *css;
1094 struct css_set *cset; 1098 struct css_set *cset;
1095 1099
1096 if (!(ss_mask & (1 << ssid))) 1100 if (!(ss_mask & (1 << ssid)))
1097 continue; 1101 continue;
1098 1102
1099 src_root = ss->root; 1103 src_root = ss->root;
1100 css = cgroup_css(&src_root->cgrp, ss); 1104 css = cgroup_css(&src_root->cgrp, ss);
1101 1105
1102 WARN_ON(!css || cgroup_css(&dst_root->cgrp, ss)); 1106 WARN_ON(!css || cgroup_css(&dst_root->cgrp, ss));
1103 1107
1104 RCU_INIT_POINTER(src_root->cgrp.subsys[ssid], NULL); 1108 RCU_INIT_POINTER(src_root->cgrp.subsys[ssid], NULL);
1105 rcu_assign_pointer(dst_root->cgrp.subsys[ssid], css); 1109 rcu_assign_pointer(dst_root->cgrp.subsys[ssid], css);
1106 ss->root = dst_root; 1110 ss->root = dst_root;
1107 css->cgroup = &dst_root->cgrp; 1111 css->cgroup = &dst_root->cgrp;
1108 1112
1109 down_write(&css_set_rwsem); 1113 down_write(&css_set_rwsem);
1110 hash_for_each(css_set_table, i, cset, hlist) 1114 hash_for_each(css_set_table, i, cset, hlist)
1111 list_move_tail(&cset->e_cset_node[ss->id], 1115 list_move_tail(&cset->e_cset_node[ss->id],
1112 &dst_root->cgrp.e_csets[ss->id]); 1116 &dst_root->cgrp.e_csets[ss->id]);
1113 up_write(&css_set_rwsem); 1117 up_write(&css_set_rwsem);
1114 1118
1115 src_root->subsys_mask &= ~(1 << ssid); 1119 src_root->subsys_mask &= ~(1 << ssid);
1116 src_root->cgrp.child_subsys_mask &= ~(1 << ssid); 1120 src_root->cgrp.child_subsys_mask &= ~(1 << ssid);
1117 1121
1118 /* default hierarchy doesn't enable controllers by default */ 1122 /* default hierarchy doesn't enable controllers by default */
1119 dst_root->subsys_mask |= 1 << ssid; 1123 dst_root->subsys_mask |= 1 << ssid;
1120 if (dst_root != &cgrp_dfl_root) 1124 if (dst_root != &cgrp_dfl_root)
1121 dst_root->cgrp.child_subsys_mask |= 1 << ssid; 1125 dst_root->cgrp.child_subsys_mask |= 1 << ssid;
1122 1126
1123 if (ss->bind) 1127 if (ss->bind)
1124 ss->bind(css); 1128 ss->bind(css);
1125 } 1129 }
1126 1130
1127 kernfs_activate(dst_root->cgrp.kn); 1131 kernfs_activate(dst_root->cgrp.kn);
1128 return 0; 1132 return 0;
1129 } 1133 }
1130 1134
1131 static int cgroup_show_options(struct seq_file *seq, 1135 static int cgroup_show_options(struct seq_file *seq,
1132 struct kernfs_root *kf_root) 1136 struct kernfs_root *kf_root)
1133 { 1137 {
1134 struct cgroup_root *root = cgroup_root_from_kf(kf_root); 1138 struct cgroup_root *root = cgroup_root_from_kf(kf_root);
1135 struct cgroup_subsys *ss; 1139 struct cgroup_subsys *ss;
1136 int ssid; 1140 int ssid;
1137 1141
1138 for_each_subsys(ss, ssid) 1142 for_each_subsys(ss, ssid)
1139 if (root->subsys_mask & (1 << ssid)) 1143 if (root->subsys_mask & (1 << ssid))
1140 seq_printf(seq, ",%s", ss->name); 1144 seq_printf(seq, ",%s", ss->name);
1141 if (root->flags & CGRP_ROOT_SANE_BEHAVIOR) 1145 if (root->flags & CGRP_ROOT_SANE_BEHAVIOR)
1142 seq_puts(seq, ",sane_behavior"); 1146 seq_puts(seq, ",sane_behavior");
1143 if (root->flags & CGRP_ROOT_NOPREFIX) 1147 if (root->flags & CGRP_ROOT_NOPREFIX)
1144 seq_puts(seq, ",noprefix"); 1148 seq_puts(seq, ",noprefix");
1145 if (root->flags & CGRP_ROOT_XATTR) 1149 if (root->flags & CGRP_ROOT_XATTR)
1146 seq_puts(seq, ",xattr"); 1150 seq_puts(seq, ",xattr");
1147 1151
1148 spin_lock(&release_agent_path_lock); 1152 spin_lock(&release_agent_path_lock);
1149 if (strlen(root->release_agent_path)) 1153 if (strlen(root->release_agent_path))
1150 seq_printf(seq, ",release_agent=%s", root->release_agent_path); 1154 seq_printf(seq, ",release_agent=%s", root->release_agent_path);
1151 spin_unlock(&release_agent_path_lock); 1155 spin_unlock(&release_agent_path_lock);
1152 1156
1153 if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags)) 1157 if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags))
1154 seq_puts(seq, ",clone_children"); 1158 seq_puts(seq, ",clone_children");
1155 if (strlen(root->name)) 1159 if (strlen(root->name))
1156 seq_printf(seq, ",name=%s", root->name); 1160 seq_printf(seq, ",name=%s", root->name);
1157 return 0; 1161 return 0;
1158 } 1162 }
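
/*
 * Example output (illustration only, not part of this file): a hierarchy
 * mounted with the cpuset controller and a custom name would contribute
 * an option string like ",cpuset,name=mygrp" to its /proc/mounts entry;
 * "mygrp" is a made-up name used only for this example.
 */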
1159 1163
1160 struct cgroup_sb_opts { 1164 struct cgroup_sb_opts {
1161 unsigned long subsys_mask; 1165 unsigned long subsys_mask;
1162 unsigned long flags; 1166 unsigned long flags;
1163 char *release_agent; 1167 char *release_agent;
1164 bool cpuset_clone_children; 1168 bool cpuset_clone_children;
1165 char *name; 1169 char *name;
1166 /* User explicitly requested empty subsystem */ 1170 /* User explicitly requested empty subsystem */
1167 bool none; 1171 bool none;
1168 }; 1172 };
1169 1173
1170 /* 1174 /*
1171 * Convert a hierarchy specifier into a bitmask of subsystems and 1175 * Convert a hierarchy specifier into a bitmask of subsystems and
1172 * flags. Call with cgroup_mutex held to protect the cgroup_subsys[] 1176 * flags. Call with cgroup_mutex held to protect the cgroup_subsys[]
1173 * array. This function takes refcounts on subsystems to be used, unless it 1177 * array. This function takes refcounts on subsystems to be used, unless it
1174 * returns error, in which case no refcounts are taken. 1178 * returns error, in which case no refcounts are taken.
1175 */ 1179 */
1176 static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) 1180 static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
1177 { 1181 {
1178 char *token, *o = data; 1182 char *token, *o = data;
1179 bool all_ss = false, one_ss = false; 1183 bool all_ss = false, one_ss = false;
1180 unsigned long mask = (unsigned long)-1; 1184 unsigned long mask = (unsigned long)-1;
1181 struct cgroup_subsys *ss; 1185 struct cgroup_subsys *ss;
1182 int i; 1186 int i;
1183 1187
1184 BUG_ON(!mutex_is_locked(&cgroup_mutex)); 1188 BUG_ON(!mutex_is_locked(&cgroup_mutex));
1185 1189
1186 #ifdef CONFIG_CPUSETS 1190 #ifdef CONFIG_CPUSETS
1187 mask = ~(1UL << cpuset_cgrp_id); 1191 mask = ~(1UL << cpuset_cgrp_id);
1188 #endif 1192 #endif
1189 1193
1190 memset(opts, 0, sizeof(*opts)); 1194 memset(opts, 0, sizeof(*opts));
1191 1195
1192 while ((token = strsep(&o, ",")) != NULL) { 1196 while ((token = strsep(&o, ",")) != NULL) {
1193 if (!*token) 1197 if (!*token)
1194 return -EINVAL; 1198 return -EINVAL;
1195 if (!strcmp(token, "none")) { 1199 if (!strcmp(token, "none")) {
1196 /* Explicitly have no subsystems */ 1200 /* Explicitly have no subsystems */
1197 opts->none = true; 1201 opts->none = true;
1198 continue; 1202 continue;
1199 } 1203 }
1200 if (!strcmp(token, "all")) { 1204 if (!strcmp(token, "all")) {
1201 /* Mutually exclusive option 'all' + subsystem name */ 1205 /* Mutually exclusive option 'all' + subsystem name */
1202 if (one_ss) 1206 if (one_ss)
1203 return -EINVAL; 1207 return -EINVAL;
1204 all_ss = true; 1208 all_ss = true;
1205 continue; 1209 continue;
1206 } 1210 }
1207 if (!strcmp(token, "__DEVEL__sane_behavior")) { 1211 if (!strcmp(token, "__DEVEL__sane_behavior")) {
1208 opts->flags |= CGRP_ROOT_SANE_BEHAVIOR; 1212 opts->flags |= CGRP_ROOT_SANE_BEHAVIOR;
1209 continue; 1213 continue;
1210 } 1214 }
1211 if (!strcmp(token, "noprefix")) { 1215 if (!strcmp(token, "noprefix")) {
1212 opts->flags |= CGRP_ROOT_NOPREFIX; 1216 opts->flags |= CGRP_ROOT_NOPREFIX;
1213 continue; 1217 continue;
1214 } 1218 }
1215 if (!strcmp(token, "clone_children")) { 1219 if (!strcmp(token, "clone_children")) {
1216 opts->cpuset_clone_children = true; 1220 opts->cpuset_clone_children = true;
1217 continue; 1221 continue;
1218 } 1222 }
1219 if (!strcmp(token, "xattr")) { 1223 if (!strcmp(token, "xattr")) {
1220 opts->flags |= CGRP_ROOT_XATTR; 1224 opts->flags |= CGRP_ROOT_XATTR;
1221 continue; 1225 continue;
1222 } 1226 }
1223 if (!strncmp(token, "release_agent=", 14)) { 1227 if (!strncmp(token, "release_agent=", 14)) {
1224 /* Specifying two release agents is forbidden */ 1228 /* Specifying two release agents is forbidden */
1225 if (opts->release_agent) 1229 if (opts->release_agent)
1226 return -EINVAL; 1230 return -EINVAL;
1227 opts->release_agent = 1231 opts->release_agent =
1228 kstrndup(token + 14, PATH_MAX - 1, GFP_KERNEL); 1232 kstrndup(token + 14, PATH_MAX - 1, GFP_KERNEL);
1229 if (!opts->release_agent) 1233 if (!opts->release_agent)
1230 return -ENOMEM; 1234 return -ENOMEM;
1231 continue; 1235 continue;
1232 } 1236 }
1233 if (!strncmp(token, "name=", 5)) { 1237 if (!strncmp(token, "name=", 5)) {
1234 const char *name = token + 5; 1238 const char *name = token + 5;
1235 /* Can't specify an empty name */ 1239 /* Can't specify an empty name */
1236 if (!strlen(name)) 1240 if (!strlen(name))
1237 return -EINVAL; 1241 return -EINVAL;
1238 /* Must match [\w.-]+ */ 1242 /* Must match [\w.-]+ */
1239 for (i = 0; i < strlen(name); i++) { 1243 for (i = 0; i < strlen(name); i++) {
1240 char c = name[i]; 1244 char c = name[i];
1241 if (isalnum(c)) 1245 if (isalnum(c))
1242 continue; 1246 continue;
1243 if ((c == '.') || (c == '-') || (c == '_')) 1247 if ((c == '.') || (c == '-') || (c == '_'))
1244 continue; 1248 continue;
1245 return -EINVAL; 1249 return -EINVAL;
1246 } 1250 }
1247 /* Specifying two names is forbidden */ 1251 /* Specifying two names is forbidden */
1248 if (opts->name) 1252 if (opts->name)
1249 return -EINVAL; 1253 return -EINVAL;
1250 opts->name = kstrndup(name, 1254 opts->name = kstrndup(name,
1251 MAX_CGROUP_ROOT_NAMELEN - 1, 1255 MAX_CGROUP_ROOT_NAMELEN - 1,
1252 GFP_KERNEL); 1256 GFP_KERNEL);
1253 if (!opts->name) 1257 if (!opts->name)
1254 return -ENOMEM; 1258 return -ENOMEM;
1255 1259
1256 continue; 1260 continue;
1257 } 1261 }
1258 1262
1259 for_each_subsys(ss, i) { 1263 for_each_subsys(ss, i) {
1260 if (strcmp(token, ss->name)) 1264 if (strcmp(token, ss->name))
1261 continue; 1265 continue;
1262 if (ss->disabled) 1266 if (ss->disabled)
1263 continue; 1267 continue;
1264 1268
1265 /* Mutually exclusive option 'all' + subsystem name */ 1269 /* Mutually exclusive option 'all' + subsystem name */
1266 if (all_ss) 1270 if (all_ss)
1267 return -EINVAL; 1271 return -EINVAL;
1268 set_bit(i, &opts->subsys_mask); 1272 set_bit(i, &opts->subsys_mask);
1269 one_ss = true; 1273 one_ss = true;
1270 1274
1271 break; 1275 break;
1272 } 1276 }
1273 if (i == CGROUP_SUBSYS_COUNT) 1277 if (i == CGROUP_SUBSYS_COUNT)
1274 return -ENOENT; 1278 return -ENOENT;
1275 } 1279 }
1276 1280
1277 /* Consistency checks */ 1281 /* Consistency checks */
1278 1282
1279 if (opts->flags & CGRP_ROOT_SANE_BEHAVIOR) { 1283 if (opts->flags & CGRP_ROOT_SANE_BEHAVIOR) {
1280 pr_warning("cgroup: sane_behavior: this is still under development and its behaviors will change, proceed at your own risk\n"); 1284 pr_warning("cgroup: sane_behavior: this is still under development and its behaviors will change, proceed at your own risk\n");
1281 1285
1282 if ((opts->flags & (CGRP_ROOT_NOPREFIX | CGRP_ROOT_XATTR)) || 1286 if ((opts->flags & (CGRP_ROOT_NOPREFIX | CGRP_ROOT_XATTR)) ||
1283 opts->cpuset_clone_children || opts->release_agent || 1287 opts->cpuset_clone_children || opts->release_agent ||
1284 opts->name) { 1288 opts->name) {
1285 pr_err("cgroup: sane_behavior: noprefix, xattr, clone_children, release_agent and name are not allowed\n"); 1289 pr_err("cgroup: sane_behavior: noprefix, xattr, clone_children, release_agent and name are not allowed\n");
1286 return -EINVAL; 1290 return -EINVAL;
1287 } 1291 }
1288 } else { 1292 } else {
1289 /* 1293 /*
1290 * If the 'all' option was specified, select all the 1294 * If the 'all' option was specified, select all the
1291 * subsystems; otherwise, if neither 'none', 'name=' nor any 1295 * subsystems; otherwise, if neither 'none', 'name=' nor any
1292 * subsystem name was specified, default to 'all'. 1296 * subsystem name was specified, default to 'all'.
1293 */ 1297 */
1294 if (all_ss || (!one_ss && !opts->none && !opts->name)) 1298 if (all_ss || (!one_ss && !opts->none && !opts->name))
1295 for_each_subsys(ss, i) 1299 for_each_subsys(ss, i)
1296 if (!ss->disabled) 1300 if (!ss->disabled)
1297 set_bit(i, &opts->subsys_mask); 1301 set_bit(i, &opts->subsys_mask);
1298 1302
1299 /* 1303 /*
1300 * We either have to specify by name or by subsystems. (So 1304 * We either have to specify by name or by subsystems. (So
1301 * all empty hierarchies must have a name). 1305 * all empty hierarchies must have a name).
1302 */ 1306 */
1303 if (!opts->subsys_mask && !opts->name) 1307 if (!opts->subsys_mask && !opts->name)
1304 return -EINVAL; 1308 return -EINVAL;
1305 } 1309 }
1306 1310
1307 /* 1311 /*
1308 * Option noprefix was introduced just for backward compatibility 1312 * Option noprefix was introduced just for backward compatibility
1309 * with the old cpuset, so we allow noprefix only if mounting just 1313 * with the old cpuset, so we allow noprefix only if mounting just
1310 * the cpuset subsystem. 1314 * the cpuset subsystem.
1311 */ 1315 */
1312 if ((opts->flags & CGRP_ROOT_NOPREFIX) && (opts->subsys_mask & mask)) 1316 if ((opts->flags & CGRP_ROOT_NOPREFIX) && (opts->subsys_mask & mask))
1313 return -EINVAL; 1317 return -EINVAL;
1314 1318
1315 1319
1316 /* Can't specify "none" and some subsystems */ 1320 /* Can't specify "none" and some subsystems */
1317 if (opts->subsys_mask && opts->none) 1321 if (opts->subsys_mask && opts->none)
1318 return -EINVAL; 1322 return -EINVAL;
1319 1323
1320 return 0; 1324 return 0;
1321 } 1325 }
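
/*
 * Illustrative example (not part of this file): a legacy mount such as
 *
 *	mount -t cgroup -o cpu,cpuacct,name=mygrp none /sys/fs/cgroup/mygrp
 *
 * reaches this parser with data == "cpu,cpuacct,name=mygrp"; the cpu and
 * cpuacct bits get set in opts->subsys_mask and opts->name becomes
 * "mygrp". The name and mount point are made up for the example.
 */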
1322 1326
1323 static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data) 1327 static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data)
1324 { 1328 {
1325 int ret = 0; 1329 int ret = 0;
1326 struct cgroup_root *root = cgroup_root_from_kf(kf_root); 1330 struct cgroup_root *root = cgroup_root_from_kf(kf_root);
1327 struct cgroup_sb_opts opts; 1331 struct cgroup_sb_opts opts;
1328 unsigned long added_mask, removed_mask; 1332 unsigned long added_mask, removed_mask;
1329 1333
1330 if (root->flags & CGRP_ROOT_SANE_BEHAVIOR) { 1334 if (root->flags & CGRP_ROOT_SANE_BEHAVIOR) {
1331 pr_err("cgroup: sane_behavior: remount is not allowed\n"); 1335 pr_err("cgroup: sane_behavior: remount is not allowed\n");
1332 return -EINVAL; 1336 return -EINVAL;
1333 } 1337 }
1334 1338
1335 mutex_lock(&cgroup_tree_mutex); 1339 mutex_lock(&cgroup_tree_mutex);
1336 mutex_lock(&cgroup_mutex); 1340 mutex_lock(&cgroup_mutex);
1337 1341
1338 /* See what subsystems are wanted */ 1342 /* See what subsystems are wanted */
1339 ret = parse_cgroupfs_options(data, &opts); 1343 ret = parse_cgroupfs_options(data, &opts);
1340 if (ret) 1344 if (ret)
1341 goto out_unlock; 1345 goto out_unlock;
1342 1346
1343 if (opts.subsys_mask != root->subsys_mask || opts.release_agent) 1347 if (opts.subsys_mask != root->subsys_mask || opts.release_agent)
1344 pr_warning("cgroup: option changes via remount are deprecated (pid=%d comm=%s)\n", 1348 pr_warning("cgroup: option changes via remount are deprecated (pid=%d comm=%s)\n",
1345 task_tgid_nr(current), current->comm); 1349 task_tgid_nr(current), current->comm);
1346 1350
1347 added_mask = opts.subsys_mask & ~root->subsys_mask; 1351 added_mask = opts.subsys_mask & ~root->subsys_mask;
1348 removed_mask = root->subsys_mask & ~opts.subsys_mask; 1352 removed_mask = root->subsys_mask & ~opts.subsys_mask;
1349 1353
1350 /* Don't allow flags or name to change at remount */ 1354 /* Don't allow flags or name to change at remount */
1351 if (((opts.flags ^ root->flags) & CGRP_ROOT_OPTION_MASK) || 1355 if (((opts.flags ^ root->flags) & CGRP_ROOT_OPTION_MASK) ||
1352 (opts.name && strcmp(opts.name, root->name))) { 1356 (opts.name && strcmp(opts.name, root->name))) {
1353 pr_err("cgroup: option or name mismatch, new: 0x%lx \"%s\", old: 0x%lx \"%s\"\n", 1357 pr_err("cgroup: option or name mismatch, new: 0x%lx \"%s\", old: 0x%lx \"%s\"\n",
1354 opts.flags & CGRP_ROOT_OPTION_MASK, opts.name ?: "", 1358 opts.flags & CGRP_ROOT_OPTION_MASK, opts.name ?: "",
1355 root->flags & CGRP_ROOT_OPTION_MASK, root->name); 1359 root->flags & CGRP_ROOT_OPTION_MASK, root->name);
1356 ret = -EINVAL; 1360 ret = -EINVAL;
1357 goto out_unlock; 1361 goto out_unlock;
1358 } 1362 }
1359 1363
1360 /* remounting is not allowed for populated hierarchies */ 1364 /* remounting is not allowed for populated hierarchies */
1361 if (!list_empty(&root->cgrp.children)) { 1365 if (!list_empty(&root->cgrp.children)) {
1362 ret = -EBUSY; 1366 ret = -EBUSY;
1363 goto out_unlock; 1367 goto out_unlock;
1364 } 1368 }
1365 1369
1366 ret = rebind_subsystems(root, added_mask); 1370 ret = rebind_subsystems(root, added_mask);
1367 if (ret) 1371 if (ret)
1368 goto out_unlock; 1372 goto out_unlock;
1369 1373
1370 rebind_subsystems(&cgrp_dfl_root, removed_mask); 1374 rebind_subsystems(&cgrp_dfl_root, removed_mask);
1371 1375
1372 if (opts.release_agent) { 1376 if (opts.release_agent) {
1373 spin_lock(&release_agent_path_lock); 1377 spin_lock(&release_agent_path_lock);
1374 strcpy(root->release_agent_path, opts.release_agent); 1378 strcpy(root->release_agent_path, opts.release_agent);
1375 spin_unlock(&release_agent_path_lock); 1379 spin_unlock(&release_agent_path_lock);
1376 } 1380 }
1377 out_unlock: 1381 out_unlock:
1378 kfree(opts.release_agent); 1382 kfree(opts.release_agent);
1379 kfree(opts.name); 1383 kfree(opts.name);
1380 mutex_unlock(&cgroup_mutex); 1384 mutex_unlock(&cgroup_mutex);
1381 mutex_unlock(&cgroup_tree_mutex); 1385 mutex_unlock(&cgroup_tree_mutex);
1382 return ret; 1386 return ret;
1383 } 1387 }
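
/*
 * Illustrative example (not part of this file): a remount such as
 *
 *	mount -o remount,cpu,cpuacct none /sys/fs/cgroup/cpu
 *
 * recomputes added_mask/removed_mask against the hierarchy's current
 * subsys_mask and rebinds controllers accordingly; it fails with -EBUSY
 * once the hierarchy already has child cgroups. The path above is an
 * example only.
 */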
1384 1388
1385 /* 1389 /*
1386 * To reduce the fork() overhead for systems that are not actually using 1390 * To reduce the fork() overhead for systems that are not actually using
1387 * their cgroups capability, we don't maintain the lists running through 1391 * their cgroups capability, we don't maintain the lists running through
1388 * each css_set to its tasks until we see the list actually used - in other 1392 * each css_set to its tasks until we see the list actually used - in other
1389 * words after the first mount. 1393 * words after the first mount.
1390 */ 1394 */
1391 static bool use_task_css_set_links __read_mostly; 1395 static bool use_task_css_set_links __read_mostly;
1392 1396
1393 static void cgroup_enable_task_cg_lists(void) 1397 static void cgroup_enable_task_cg_lists(void)
1394 { 1398 {
1395 struct task_struct *p, *g; 1399 struct task_struct *p, *g;
1396 1400
1397 down_write(&css_set_rwsem); 1401 down_write(&css_set_rwsem);
1398 1402
1399 if (use_task_css_set_links) 1403 if (use_task_css_set_links)
1400 goto out_unlock; 1404 goto out_unlock;
1401 1405
1402 use_task_css_set_links = true; 1406 use_task_css_set_links = true;
1403 1407
1404 /* 1408 /*
1405 * We need tasklist_lock because RCU is not safe against 1409 * We need tasklist_lock because RCU is not safe against
1406 * while_each_thread(). Besides, a forking task that has passed 1410 * while_each_thread(). Besides, a forking task that has passed
1407 * cgroup_post_fork() without seeing use_task_css_set_links = 1 1411 * cgroup_post_fork() without seeing use_task_css_set_links = 1
1408 * is not guaranteed to have its child immediately visible in the 1412 * is not guaranteed to have its child immediately visible in the
1409 * tasklist if we walk through it with RCU. 1413 * tasklist if we walk through it with RCU.
1410 */ 1414 */
1411 read_lock(&tasklist_lock); 1415 read_lock(&tasklist_lock);
1412 do_each_thread(g, p) { 1416 do_each_thread(g, p) {
1413 WARN_ON_ONCE(!list_empty(&p->cg_list) || 1417 WARN_ON_ONCE(!list_empty(&p->cg_list) ||
1414 task_css_set(p) != &init_css_set); 1418 task_css_set(p) != &init_css_set);
1415 1419
1416 /* 1420 /*
1417 * We should check if the process is exiting, otherwise 1421 * We should check if the process is exiting, otherwise
1418 * we would race with cgroup_exit() and the list 1422 * we would race with cgroup_exit() and the list
1419 * entry would never be deleted even though the process has exited. 1423 * entry would never be deleted even though the process has exited.
1420 * Do it while holding siglock so that we don't end up 1424 * Do it while holding siglock so that we don't end up
1421 * racing against cgroup_exit(). 1425 * racing against cgroup_exit().
1422 */ 1426 */
1423 spin_lock_irq(&p->sighand->siglock); 1427 spin_lock_irq(&p->sighand->siglock);
1424 if (!(p->flags & PF_EXITING)) { 1428 if (!(p->flags & PF_EXITING)) {
1425 struct css_set *cset = task_css_set(p); 1429 struct css_set *cset = task_css_set(p);
1426 1430
1427 list_add(&p->cg_list, &cset->tasks); 1431 list_add(&p->cg_list, &cset->tasks);
1428 get_css_set(cset); 1432 get_css_set(cset);
1429 } 1433 }
1430 spin_unlock_irq(&p->sighand->siglock); 1434 spin_unlock_irq(&p->sighand->siglock);
1431 } while_each_thread(g, p); 1435 } while_each_thread(g, p);
1432 read_unlock(&tasklist_lock); 1436 read_unlock(&tasklist_lock);
1433 out_unlock: 1437 out_unlock:
1434 up_write(&css_set_rwsem); 1438 up_write(&css_set_rwsem);
1435 } 1439 }
1436 1440
1437 static void init_cgroup_housekeeping(struct cgroup *cgrp) 1441 static void init_cgroup_housekeeping(struct cgroup *cgrp)
1438 { 1442 {
1439 struct cgroup_subsys *ss; 1443 struct cgroup_subsys *ss;
1440 int ssid; 1444 int ssid;
1441 1445
1442 atomic_set(&cgrp->refcnt, 1); 1446 atomic_set(&cgrp->refcnt, 1);
1443 INIT_LIST_HEAD(&cgrp->sibling); 1447 INIT_LIST_HEAD(&cgrp->sibling);
1444 INIT_LIST_HEAD(&cgrp->children); 1448 INIT_LIST_HEAD(&cgrp->children);
1445 INIT_LIST_HEAD(&cgrp->cset_links); 1449 INIT_LIST_HEAD(&cgrp->cset_links);
1446 INIT_LIST_HEAD(&cgrp->release_list); 1450 INIT_LIST_HEAD(&cgrp->release_list);
1447 INIT_LIST_HEAD(&cgrp->pidlists); 1451 INIT_LIST_HEAD(&cgrp->pidlists);
1448 mutex_init(&cgrp->pidlist_mutex); 1452 mutex_init(&cgrp->pidlist_mutex);
1449 cgrp->dummy_css.cgroup = cgrp; 1453 cgrp->dummy_css.cgroup = cgrp;
1450 1454
1451 for_each_subsys(ss, ssid) 1455 for_each_subsys(ss, ssid)
1452 INIT_LIST_HEAD(&cgrp->e_csets[ssid]); 1456 INIT_LIST_HEAD(&cgrp->e_csets[ssid]);
1453 } 1457 }
1454 1458
1455 static void init_cgroup_root(struct cgroup_root *root, 1459 static void init_cgroup_root(struct cgroup_root *root,
1456 struct cgroup_sb_opts *opts) 1460 struct cgroup_sb_opts *opts)
1457 { 1461 {
1458 struct cgroup *cgrp = &root->cgrp; 1462 struct cgroup *cgrp = &root->cgrp;
1459 1463
1460 INIT_LIST_HEAD(&root->root_list); 1464 INIT_LIST_HEAD(&root->root_list);
1461 atomic_set(&root->nr_cgrps, 1); 1465 atomic_set(&root->nr_cgrps, 1);
1462 cgrp->root = root; 1466 cgrp->root = root;
1463 init_cgroup_housekeeping(cgrp); 1467 init_cgroup_housekeeping(cgrp);
1464 idr_init(&root->cgroup_idr); 1468 idr_init(&root->cgroup_idr);
1465 1469
1466 root->flags = opts->flags; 1470 root->flags = opts->flags;
1467 if (opts->release_agent) 1471 if (opts->release_agent)
1468 strcpy(root->release_agent_path, opts->release_agent); 1472 strcpy(root->release_agent_path, opts->release_agent);
1469 if (opts->name) 1473 if (opts->name)
1470 strcpy(root->name, opts->name); 1474 strcpy(root->name, opts->name);
1471 if (opts->cpuset_clone_children) 1475 if (opts->cpuset_clone_children)
1472 set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags); 1476 set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags);
1473 } 1477 }
1474 1478
1475 static int cgroup_setup_root(struct cgroup_root *root, unsigned long ss_mask) 1479 static int cgroup_setup_root(struct cgroup_root *root, unsigned long ss_mask)
1476 { 1480 {
1477 LIST_HEAD(tmp_links); 1481 LIST_HEAD(tmp_links);
1478 struct cgroup *root_cgrp = &root->cgrp; 1482 struct cgroup *root_cgrp = &root->cgrp;
1479 struct css_set *cset; 1483 struct css_set *cset;
1480 int i, ret; 1484 int i, ret;
1481 1485
1482 lockdep_assert_held(&cgroup_tree_mutex); 1486 lockdep_assert_held(&cgroup_tree_mutex);
1483 lockdep_assert_held(&cgroup_mutex); 1487 lockdep_assert_held(&cgroup_mutex);
1484 1488
1485 ret = idr_alloc(&root->cgroup_idr, root_cgrp, 0, 1, GFP_KERNEL); 1489 ret = idr_alloc(&root->cgroup_idr, root_cgrp, 0, 1, GFP_KERNEL);
1486 if (ret < 0) 1490 if (ret < 0)
1487 goto out; 1491 goto out;
1488 root_cgrp->id = ret; 1492 root_cgrp->id = ret;
1489 1493
1490 /* 1494 /*
1491 * We're accessing css_set_count without locking css_set_rwsem here, 1495 * We're accessing css_set_count without locking css_set_rwsem here,
1492 * but that's OK - it can only be increased by someone holding 1496 * but that's OK - it can only be increased by someone holding
1493 * cgroup_lock, and that's us. The worst that can happen is that we 1497 * cgroup_lock, and that's us. The worst that can happen is that we
1494 * have some link structures left over 1498 * have some link structures left over
1495 */ 1499 */
1496 ret = allocate_cgrp_cset_links(css_set_count, &tmp_links); 1500 ret = allocate_cgrp_cset_links(css_set_count, &tmp_links);
1497 if (ret) 1501 if (ret)
1498 goto out; 1502 goto out;
1499 1503
1500 ret = cgroup_init_root_id(root); 1504 ret = cgroup_init_root_id(root);
1501 if (ret) 1505 if (ret)
1502 goto out; 1506 goto out;
1503 1507
1504 root->kf_root = kernfs_create_root(&cgroup_kf_syscall_ops, 1508 root->kf_root = kernfs_create_root(&cgroup_kf_syscall_ops,
1505 KERNFS_ROOT_CREATE_DEACTIVATED, 1509 KERNFS_ROOT_CREATE_DEACTIVATED,
1506 root_cgrp); 1510 root_cgrp);
1507 if (IS_ERR(root->kf_root)) { 1511 if (IS_ERR(root->kf_root)) {
1508 ret = PTR_ERR(root->kf_root); 1512 ret = PTR_ERR(root->kf_root);
1509 goto exit_root_id; 1513 goto exit_root_id;
1510 } 1514 }
1511 root_cgrp->kn = root->kf_root->kn; 1515 root_cgrp->kn = root->kf_root->kn;
1512 1516
1513 ret = cgroup_addrm_files(root_cgrp, cgroup_base_files, true); 1517 ret = cgroup_addrm_files(root_cgrp, cgroup_base_files, true);
1514 if (ret) 1518 if (ret)
1515 goto destroy_root; 1519 goto destroy_root;
1516 1520
1517 ret = rebind_subsystems(root, ss_mask); 1521 ret = rebind_subsystems(root, ss_mask);
1518 if (ret) 1522 if (ret)
1519 goto destroy_root; 1523 goto destroy_root;
1520 1524
1521 /* 1525 /*
1522 * There must be no failure case after here, since rebinding takes 1526 * There must be no failure case after here, since rebinding takes
1523 * care of subsystems' refcounts, which are explicitly dropped in 1527 * care of subsystems' refcounts, which are explicitly dropped in
1524 * the failure exit path. 1528 * the failure exit path.
1525 */ 1529 */
1526 list_add(&root->root_list, &cgroup_roots); 1530 list_add(&root->root_list, &cgroup_roots);
1527 cgroup_root_count++; 1531 cgroup_root_count++;
1528 1532
1529 /* 1533 /*
1530 * Link the root cgroup in this hierarchy into all the css_set 1534 * Link the root cgroup in this hierarchy into all the css_set
1531 * objects. 1535 * objects.
1532 */ 1536 */
1533 down_write(&css_set_rwsem); 1537 down_write(&css_set_rwsem);
1534 hash_for_each(css_set_table, i, cset, hlist) 1538 hash_for_each(css_set_table, i, cset, hlist)
1535 link_css_set(&tmp_links, cset, root_cgrp); 1539 link_css_set(&tmp_links, cset, root_cgrp);
1536 up_write(&css_set_rwsem); 1540 up_write(&css_set_rwsem);
1537 1541
1538 BUG_ON(!list_empty(&root_cgrp->children)); 1542 BUG_ON(!list_empty(&root_cgrp->children));
1539 BUG_ON(atomic_read(&root->nr_cgrps) != 1); 1543 BUG_ON(atomic_read(&root->nr_cgrps) != 1);
1540 1544
1541 kernfs_activate(root_cgrp->kn); 1545 kernfs_activate(root_cgrp->kn);
1542 ret = 0; 1546 ret = 0;
1543 goto out; 1547 goto out;
1544 1548
1545 destroy_root: 1549 destroy_root:
1546 kernfs_destroy_root(root->kf_root); 1550 kernfs_destroy_root(root->kf_root);
1547 root->kf_root = NULL; 1551 root->kf_root = NULL;
1548 exit_root_id: 1552 exit_root_id:
1549 cgroup_exit_root_id(root); 1553 cgroup_exit_root_id(root);
1550 out: 1554 out:
1551 free_cgrp_cset_links(&tmp_links); 1555 free_cgrp_cset_links(&tmp_links);
1552 return ret; 1556 return ret;
1553 } 1557 }
1554 1558
1555 static struct dentry *cgroup_mount(struct file_system_type *fs_type, 1559 static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1556 int flags, const char *unused_dev_name, 1560 int flags, const char *unused_dev_name,
1557 void *data) 1561 void *data)
1558 { 1562 {
1559 struct cgroup_root *root; 1563 struct cgroup_root *root;
1560 struct cgroup_sb_opts opts; 1564 struct cgroup_sb_opts opts;
1561 struct dentry *dentry; 1565 struct dentry *dentry;
1562 int ret; 1566 int ret;
1563 bool new_sb; 1567 bool new_sb;
1564 1568
1565 /* 1569 /*
1566 * The first time anyone tries to mount a cgroup, enable the list 1570 * The first time anyone tries to mount a cgroup, enable the list
1567 * linking each css_set to its tasks and fix up all existing tasks. 1571 * linking each css_set to its tasks and fix up all existing tasks.
1568 */ 1572 */
1569 if (!use_task_css_set_links) 1573 if (!use_task_css_set_links)
1570 cgroup_enable_task_cg_lists(); 1574 cgroup_enable_task_cg_lists();
1571 1575
1572 mutex_lock(&cgroup_tree_mutex); 1576 mutex_lock(&cgroup_tree_mutex);
1573 mutex_lock(&cgroup_mutex); 1577 mutex_lock(&cgroup_mutex);
1574 1578
1575 /* First find the desired set of subsystems */ 1579 /* First find the desired set of subsystems */
1576 ret = parse_cgroupfs_options(data, &opts); 1580 ret = parse_cgroupfs_options(data, &opts);
1577 if (ret) 1581 if (ret)
1578 goto out_unlock; 1582 goto out_unlock;
1579 retry: 1583 retry:
1580 /* look for a matching existing root */ 1584 /* look for a matching existing root */
1581 if (!opts.subsys_mask && !opts.none && !opts.name) { 1585 if (!opts.subsys_mask && !opts.none && !opts.name) {
1582 cgrp_dfl_root_visible = true; 1586 cgrp_dfl_root_visible = true;
1583 root = &cgrp_dfl_root; 1587 root = &cgrp_dfl_root;
1584 cgroup_get(&root->cgrp); 1588 cgroup_get(&root->cgrp);
1585 ret = 0; 1589 ret = 0;
1586 goto out_unlock; 1590 goto out_unlock;
1587 } 1591 }
1588 1592
1589 for_each_root(root) { 1593 for_each_root(root) {
1590 bool name_match = false; 1594 bool name_match = false;
1591 1595
1592 if (root == &cgrp_dfl_root) 1596 if (root == &cgrp_dfl_root)
1593 continue; 1597 continue;
1594 1598
1595 /* 1599 /*
1596 * If we asked for a name then it must match. Also, if 1600 * If we asked for a name then it must match. Also, if
1597 * name matches but subsys_mask doesn't, we should fail. 1601 * name matches but subsys_mask doesn't, we should fail.
1598 * Remember whether name matched. 1602 * Remember whether name matched.
1599 */ 1603 */
1600 if (opts.name) { 1604 if (opts.name) {
1601 if (strcmp(opts.name, root->name)) 1605 if (strcmp(opts.name, root->name))
1602 continue; 1606 continue;
1603 name_match = true; 1607 name_match = true;
1604 } 1608 }
1605 1609
1606 /* 1610 /*
1607 * If we asked for subsystems (or explicitly for no 1611 * If we asked for subsystems (or explicitly for no
1608 * subsystems) then they must match. 1612 * subsystems) then they must match.
1609 */ 1613 */
1610 if ((opts.subsys_mask || opts.none) && 1614 if ((opts.subsys_mask || opts.none) &&
1611 (opts.subsys_mask != root->subsys_mask)) { 1615 (opts.subsys_mask != root->subsys_mask)) {
1612 if (!name_match) 1616 if (!name_match)
1613 continue; 1617 continue;
1614 ret = -EBUSY; 1618 ret = -EBUSY;
1615 goto out_unlock; 1619 goto out_unlock;
1616 } 1620 }
1617 1621
1618 if ((root->flags ^ opts.flags) & CGRP_ROOT_OPTION_MASK) { 1622 if ((root->flags ^ opts.flags) & CGRP_ROOT_OPTION_MASK) {
1619 if ((root->flags | opts.flags) & CGRP_ROOT_SANE_BEHAVIOR) { 1623 if ((root->flags | opts.flags) & CGRP_ROOT_SANE_BEHAVIOR) {
1620 pr_err("cgroup: sane_behavior: new mount options should match the existing superblock\n"); 1624 pr_err("cgroup: sane_behavior: new mount options should match the existing superblock\n");
1621 ret = -EINVAL; 1625 ret = -EINVAL;
1622 goto out_unlock; 1626 goto out_unlock;
1623 } else { 1627 } else {
1624 pr_warning("cgroup: new mount options do not match the existing superblock, will be ignored\n"); 1628 pr_warning("cgroup: new mount options do not match the existing superblock, will be ignored\n");
1625 } 1629 }
1626 } 1630 }
1627 1631
1628 /* 1632 /*
1629 * A root's lifetime is governed by its root cgroup. Zero 1633 * A root's lifetime is governed by its root cgroup. Zero
1630 * ref indicates that the root is being destroyed. Wait for 1634 * ref indicates that the root is being destroyed. Wait for
1631 * destruction to complete so that the subsystems are free. 1635 * destruction to complete so that the subsystems are free.
1632 * We can use wait_queue for the wait but this path is 1636 * We can use wait_queue for the wait but this path is
1633 * We could use a wait_queue for the wait, but this path is 1637 * We could use a wait_queue for the wait, but this path is
1634 */ 1638 */
1635 if (!atomic_inc_not_zero(&root->cgrp.refcnt)) { 1639 if (!atomic_inc_not_zero(&root->cgrp.refcnt)) {
1636 mutex_unlock(&cgroup_mutex); 1640 mutex_unlock(&cgroup_mutex);
1637 mutex_unlock(&cgroup_tree_mutex); 1641 mutex_unlock(&cgroup_tree_mutex);
1638 msleep(10); 1642 msleep(10);
1639 mutex_lock(&cgroup_tree_mutex); 1643 mutex_lock(&cgroup_tree_mutex);
1640 mutex_lock(&cgroup_mutex); 1644 mutex_lock(&cgroup_mutex);
1641 goto retry; 1645 goto retry;
1642 } 1646 }
1643 1647
1644 ret = 0; 1648 ret = 0;
1645 goto out_unlock; 1649 goto out_unlock;
1646 } 1650 }
1647 1651
1648 /* 1652 /*
1649 * No such thing, create a new one. name= matching without subsys 1653 * No such thing, create a new one. name= matching without subsys
1650 * specification is allowed for already existing hierarchies but we 1654 * specification is allowed for already existing hierarchies but we
1651 * can't create new one without subsys specification. 1655 * can't create new one without subsys specification.
1652 */ 1656 */
1653 if (!opts.subsys_mask && !opts.none) { 1657 if (!opts.subsys_mask && !opts.none) {
1654 ret = -EINVAL; 1658 ret = -EINVAL;
1655 goto out_unlock; 1659 goto out_unlock;
1656 } 1660 }
1657 1661
1658 root = kzalloc(sizeof(*root), GFP_KERNEL); 1662 root = kzalloc(sizeof(*root), GFP_KERNEL);
1659 if (!root) { 1663 if (!root) {
1660 ret = -ENOMEM; 1664 ret = -ENOMEM;
1661 goto out_unlock; 1665 goto out_unlock;
1662 } 1666 }
1663 1667
1664 init_cgroup_root(root, &opts); 1668 init_cgroup_root(root, &opts);
1665 1669
1666 ret = cgroup_setup_root(root, opts.subsys_mask); 1670 ret = cgroup_setup_root(root, opts.subsys_mask);
1667 if (ret) 1671 if (ret)
1668 cgroup_free_root(root); 1672 cgroup_free_root(root);
1669 1673
1670 out_unlock: 1674 out_unlock:
1671 mutex_unlock(&cgroup_mutex); 1675 mutex_unlock(&cgroup_mutex);
1672 mutex_unlock(&cgroup_tree_mutex); 1676 mutex_unlock(&cgroup_tree_mutex);
1673 1677
1674 kfree(opts.release_agent); 1678 kfree(opts.release_agent);
1675 kfree(opts.name); 1679 kfree(opts.name);
1676 1680
1677 if (ret) 1681 if (ret)
1678 return ERR_PTR(ret); 1682 return ERR_PTR(ret);
1679 1683
1680 dentry = kernfs_mount(fs_type, flags, root->kf_root, &new_sb); 1684 dentry = kernfs_mount(fs_type, flags, root->kf_root, &new_sb);
1681 if (IS_ERR(dentry) || !new_sb) 1685 if (IS_ERR(dentry) || !new_sb)
1682 cgroup_put(&root->cgrp); 1686 cgroup_put(&root->cgrp);
1683 return dentry; 1687 return dentry;
1684 } 1688 }
1685 1689
1686 static void cgroup_kill_sb(struct super_block *sb) 1690 static void cgroup_kill_sb(struct super_block *sb)
1687 { 1691 {
1688 struct kernfs_root *kf_root = kernfs_root_from_sb(sb); 1692 struct kernfs_root *kf_root = kernfs_root_from_sb(sb);
1689 struct cgroup_root *root = cgroup_root_from_kf(kf_root); 1693 struct cgroup_root *root = cgroup_root_from_kf(kf_root);
1690 1694
1691 cgroup_put(&root->cgrp); 1695 cgroup_put(&root->cgrp);
1692 kernfs_kill_sb(sb); 1696 kernfs_kill_sb(sb);
1693 } 1697 }
1694 1698
1695 static struct file_system_type cgroup_fs_type = { 1699 static struct file_system_type cgroup_fs_type = {
1696 .name = "cgroup", 1700 .name = "cgroup",
1697 .mount = cgroup_mount, 1701 .mount = cgroup_mount,
1698 .kill_sb = cgroup_kill_sb, 1702 .kill_sb = cgroup_kill_sb,
1699 }; 1703 };
1700 1704
1701 static struct kobject *cgroup_kobj; 1705 static struct kobject *cgroup_kobj;
1702 1706
1703 /** 1707 /**
1704 * task_cgroup_path - cgroup path of a task in the first cgroup hierarchy 1708 * task_cgroup_path - cgroup path of a task in the first cgroup hierarchy
1705 * @task: target task 1709 * @task: target task
1706 * @buf: the buffer to write the path into 1710 * @buf: the buffer to write the path into
1707 * @buflen: the length of the buffer 1711 * @buflen: the length of the buffer
1708 * 1712 *
1709 * Determine @task's cgroup on the first (the one with the lowest non-zero 1713 * Determine @task's cgroup on the first (the one with the lowest non-zero
1710 * hierarchy_id) cgroup hierarchy and copy its path into @buf. This 1714 * hierarchy_id) cgroup hierarchy and copy its path into @buf. This
1711 * function grabs cgroup_mutex and shouldn't be used inside locks used by 1715 * function grabs cgroup_mutex and shouldn't be used inside locks used by
1712 * cgroup controller callbacks. 1716 * cgroup controller callbacks.
1713 * 1717 *
1714 * Return value is the same as kernfs_path(). 1718 * Return value is the same as kernfs_path().
1715 */ 1719 */
1716 char *task_cgroup_path(struct task_struct *task, char *buf, size_t buflen) 1720 char *task_cgroup_path(struct task_struct *task, char *buf, size_t buflen)
1717 { 1721 {
1718 struct cgroup_root *root; 1722 struct cgroup_root *root;
1719 struct cgroup *cgrp; 1723 struct cgroup *cgrp;
1720 int hierarchy_id = 1; 1724 int hierarchy_id = 1;
1721 char *path = NULL; 1725 char *path = NULL;
1722 1726
1723 mutex_lock(&cgroup_mutex); 1727 mutex_lock(&cgroup_mutex);
1724 down_read(&css_set_rwsem); 1728 down_read(&css_set_rwsem);
1725 1729
1726 root = idr_get_next(&cgroup_hierarchy_idr, &hierarchy_id); 1730 root = idr_get_next(&cgroup_hierarchy_idr, &hierarchy_id);
1727 1731
1728 if (root) { 1732 if (root) {
1729 cgrp = task_cgroup_from_root(task, root); 1733 cgrp = task_cgroup_from_root(task, root);
1730 path = cgroup_path(cgrp, buf, buflen); 1734 path = cgroup_path(cgrp, buf, buflen);
1731 } else { 1735 } else {
1732 /* if no hierarchy exists, everyone is in "/" */ 1736 /* if no hierarchy exists, everyone is in "/" */
1733 if (strlcpy(buf, "/", buflen) < buflen) 1737 if (strlcpy(buf, "/", buflen) < buflen)
1734 path = buf; 1738 path = buf;
1735 } 1739 }
1736 1740
1737 up_read(&css_set_rwsem); 1741 up_read(&css_set_rwsem);
1738 mutex_unlock(&cgroup_mutex); 1742 mutex_unlock(&cgroup_mutex);
1739 return path; 1743 return path;
1740 } 1744 }
1741 EXPORT_SYMBOL_GPL(task_cgroup_path); 1745 EXPORT_SYMBOL_GPL(task_cgroup_path);
1742 1746
1743 /* used to track tasks and other necessary states during migration */ 1747 /* used to track tasks and other necessary states during migration */
1744 struct cgroup_taskset { 1748 struct cgroup_taskset {
1745 /* the src and dst cset list running through cset->mg_node */ 1749 /* the src and dst cset list running through cset->mg_node */
1746 struct list_head src_csets; 1750 struct list_head src_csets;
1747 struct list_head dst_csets; 1751 struct list_head dst_csets;
1748 1752
1749 /* 1753 /*
1750 * Fields for cgroup_taskset_*() iteration. 1754 * Fields for cgroup_taskset_*() iteration.
1751 * 1755 *
1752 * Before migration is committed, the target migration tasks are on 1756 * Before migration is committed, the target migration tasks are on
1753 * ->mg_tasks of the csets on ->src_csets. After, on ->mg_tasks of 1757 * ->mg_tasks of the csets on ->src_csets. After, on ->mg_tasks of
1754 * the csets on ->dst_csets. ->csets points to either ->src_csets 1758 * the csets on ->dst_csets. ->csets points to either ->src_csets
1755 * or ->dst_csets depending on whether migration is committed. 1759 * or ->dst_csets depending on whether migration is committed.
1756 * 1760 *
1757 * ->cur_cset and ->cur_task point to the current task position 1761 * ->cur_cset and ->cur_task point to the current task position
1758 * during iteration. 1762 * during iteration.
1759 */ 1763 */
1760 struct list_head *csets; 1764 struct list_head *csets;
1761 struct css_set *cur_cset; 1765 struct css_set *cur_cset;
1762 struct task_struct *cur_task; 1766 struct task_struct *cur_task;
1763 }; 1767 };
1764 1768
1765 /** 1769 /**
1766 * cgroup_taskset_first - reset taskset and return the first task 1770 * cgroup_taskset_first - reset taskset and return the first task
1767 * @tset: taskset of interest 1771 * @tset: taskset of interest
1768 * 1772 *
1769 * @tset iteration is initialized and the first task is returned. 1773 * @tset iteration is initialized and the first task is returned.
1770 */ 1774 */
1771 struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset) 1775 struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset)
1772 { 1776 {
1773 tset->cur_cset = list_first_entry(tset->csets, struct css_set, mg_node); 1777 tset->cur_cset = list_first_entry(tset->csets, struct css_set, mg_node);
1774 tset->cur_task = NULL; 1778 tset->cur_task = NULL;
1775 1779
1776 return cgroup_taskset_next(tset); 1780 return cgroup_taskset_next(tset);
1777 } 1781 }
1778 1782
1779 /** 1783 /**
1780 * cgroup_taskset_next - iterate to the next task in taskset 1784 * cgroup_taskset_next - iterate to the next task in taskset
1781 * @tset: taskset of interest 1785 * @tset: taskset of interest
1782 * 1786 *
1783 * Return the next task in @tset. Iteration must have been initialized 1787 * Return the next task in @tset. Iteration must have been initialized
1784 * with cgroup_taskset_first(). 1788 * with cgroup_taskset_first().
1785 */ 1789 */
1786 struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset) 1790 struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset)
1787 { 1791 {
1788 struct css_set *cset = tset->cur_cset; 1792 struct css_set *cset = tset->cur_cset;
1789 struct task_struct *task = tset->cur_task; 1793 struct task_struct *task = tset->cur_task;
1790 1794
1791 while (&cset->mg_node != tset->csets) { 1795 while (&cset->mg_node != tset->csets) {
1792 if (!task) 1796 if (!task)
1793 task = list_first_entry(&cset->mg_tasks, 1797 task = list_first_entry(&cset->mg_tasks,
1794 struct task_struct, cg_list); 1798 struct task_struct, cg_list);
1795 else 1799 else
1796 task = list_next_entry(task, cg_list); 1800 task = list_next_entry(task, cg_list);
1797 1801
1798 if (&task->cg_list != &cset->mg_tasks) { 1802 if (&task->cg_list != &cset->mg_tasks) {
1799 tset->cur_cset = cset; 1803 tset->cur_cset = cset;
1800 tset->cur_task = task; 1804 tset->cur_task = task;
1801 return task; 1805 return task;
1802 } 1806 }
1803 1807
1804 cset = list_next_entry(cset, mg_node); 1808 cset = list_next_entry(cset, mg_node);
1805 task = NULL; 1809 task = NULL;
1806 } 1810 }
1807 1811
1808 return NULL; 1812 return NULL;
1809 } 1813 }
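A controller callback can walk a taskset with the two iterators above. The sketch below is illustrative only: demo_attach() is a hypothetical ->attach() implementation whose signature mirrors the css->ss->attach(css, &tset) calls made by cgroup_migrate() further down.

/* Hypothetical ->attach() callback: visit every task being migrated. */
static void demo_attach(struct cgroup_subsys_state *css,
			struct cgroup_taskset *tset)
{
	struct task_struct *task;

	for (task = cgroup_taskset_first(tset); task;
	     task = cgroup_taskset_next(tset))
		; /* per-task controller work would go here */
}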
1810 1814
1811 /** 1815 /**
1812 * cgroup_task_migrate - move a task from one cgroup to another. 1816 * cgroup_task_migrate - move a task from one cgroup to another.
1813 * @old_cgrp: the cgroup @tsk is being migrated from 1817 * @old_cgrp: the cgroup @tsk is being migrated from
1814 * @tsk: the task being migrated 1818 * @tsk: the task being migrated
1815 * @new_cset: the new css_set @tsk is being attached to 1819 * @new_cset: the new css_set @tsk is being attached to
1816 * 1820 *
1817 * Must be called with cgroup_mutex, threadgroup and css_set_rwsem locked. 1821 * Must be called with cgroup_mutex, threadgroup and css_set_rwsem locked.
1818 */ 1822 */
1819 static void cgroup_task_migrate(struct cgroup *old_cgrp, 1823 static void cgroup_task_migrate(struct cgroup *old_cgrp,
1820 struct task_struct *tsk, 1824 struct task_struct *tsk,
1821 struct css_set *new_cset) 1825 struct css_set *new_cset)
1822 { 1826 {
1823 struct css_set *old_cset; 1827 struct css_set *old_cset;
1824 1828
1825 lockdep_assert_held(&cgroup_mutex); 1829 lockdep_assert_held(&cgroup_mutex);
1826 lockdep_assert_held(&css_set_rwsem); 1830 lockdep_assert_held(&css_set_rwsem);
1827 1831
1828 /* 1832 /*
1829 * We are synchronized through threadgroup_lock() against PF_EXITING 1833 * We are synchronized through threadgroup_lock() against PF_EXITING
1830 * setting such that we can't race against cgroup_exit() changing the 1834 * setting such that we can't race against cgroup_exit() changing the
1831 * css_set to init_css_set and dropping the old one. 1835 * css_set to init_css_set and dropping the old one.
1832 */ 1836 */
1833 WARN_ON_ONCE(tsk->flags & PF_EXITING); 1837 WARN_ON_ONCE(tsk->flags & PF_EXITING);
1834 old_cset = task_css_set(tsk); 1838 old_cset = task_css_set(tsk);
1835 1839
1836 get_css_set(new_cset); 1840 get_css_set(new_cset);
1837 rcu_assign_pointer(tsk->cgroups, new_cset); 1841 rcu_assign_pointer(tsk->cgroups, new_cset);
1838 1842
1839 /* 1843 /*
1840 * Use move_tail so that cgroup_taskset_first() still returns the 1844 * Use move_tail so that cgroup_taskset_first() still returns the
1841 * leader after migration. This works because cgroup_migrate() 1845 * leader after migration. This works because cgroup_migrate()
1842 * ensures that the dst_cset of the leader is the first on the 1846 * ensures that the dst_cset of the leader is the first on the
1843 * tset's dst_csets list. 1847 * tset's dst_csets list.
1844 */ 1848 */
1845 list_move_tail(&tsk->cg_list, &new_cset->mg_tasks); 1849 list_move_tail(&tsk->cg_list, &new_cset->mg_tasks);
1846 1850
1847 /* 1851 /*
1848 * We just gained a reference on old_cset by taking it from the 1852 * We just gained a reference on old_cset by taking it from the
1849 * task. As trading it for new_cset is protected by cgroup_mutex, 1853 * task. As trading it for new_cset is protected by cgroup_mutex,
1850 * we're safe to drop it here; it will be freed under RCU. 1854 * we're safe to drop it here; it will be freed under RCU.
1851 */ 1855 */
1852 set_bit(CGRP_RELEASABLE, &old_cgrp->flags); 1856 set_bit(CGRP_RELEASABLE, &old_cgrp->flags);
1853 put_css_set_locked(old_cset, false); 1857 put_css_set_locked(old_cset, false);
1854 } 1858 }
1855 1859
1856 /** 1860 /**
1857 * cgroup_migrate_finish - cleanup after attach 1861 * cgroup_migrate_finish - cleanup after attach
1858 * @preloaded_csets: list of preloaded css_sets 1862 * @preloaded_csets: list of preloaded css_sets
1859 * 1863 *
1860 * Undo cgroup_migrate_add_src() and cgroup_migrate_prepare_dst(). See 1864 * Undo cgroup_migrate_add_src() and cgroup_migrate_prepare_dst(). See
1861 * those functions for details. 1865 * those functions for details.
1862 */ 1866 */
1863 static void cgroup_migrate_finish(struct list_head *preloaded_csets) 1867 static void cgroup_migrate_finish(struct list_head *preloaded_csets)
1864 { 1868 {
1865 struct css_set *cset, *tmp_cset; 1869 struct css_set *cset, *tmp_cset;
1866 1870
1867 lockdep_assert_held(&cgroup_mutex); 1871 lockdep_assert_held(&cgroup_mutex);
1868 1872
1869 down_write(&css_set_rwsem); 1873 down_write(&css_set_rwsem);
1870 list_for_each_entry_safe(cset, tmp_cset, preloaded_csets, mg_preload_node) { 1874 list_for_each_entry_safe(cset, tmp_cset, preloaded_csets, mg_preload_node) {
1871 cset->mg_src_cgrp = NULL; 1875 cset->mg_src_cgrp = NULL;
1872 cset->mg_dst_cset = NULL; 1876 cset->mg_dst_cset = NULL;
1873 list_del_init(&cset->mg_preload_node); 1877 list_del_init(&cset->mg_preload_node);
1874 put_css_set_locked(cset, false); 1878 put_css_set_locked(cset, false);
1875 } 1879 }
1876 up_write(&css_set_rwsem); 1880 up_write(&css_set_rwsem);
1877 } 1881 }
1878 1882
1879 /** 1883 /**
1880 * cgroup_migrate_add_src - add a migration source css_set 1884 * cgroup_migrate_add_src - add a migration source css_set
1881 * @src_cset: the source css_set to add 1885 * @src_cset: the source css_set to add
1882 * @dst_cgrp: the destination cgroup 1886 * @dst_cgrp: the destination cgroup
1883 * @preloaded_csets: list of preloaded css_sets 1887 * @preloaded_csets: list of preloaded css_sets
1884 * 1888 *
1885 * Tasks belonging to @src_cset are about to be migrated to @dst_cgrp. Pin 1889 * Tasks belonging to @src_cset are about to be migrated to @dst_cgrp. Pin
1886 * @src_cset and add it to @preloaded_csets, which should later be cleaned 1890 * @src_cset and add it to @preloaded_csets, which should later be cleaned
1887 * up by cgroup_migrate_finish(). 1891 * up by cgroup_migrate_finish().
1888 * 1892 *
1889 * This function may be called without holding threadgroup_lock even if the 1893 * This function may be called without holding threadgroup_lock even if the
1890 * target is a process. Threads may be created and destroyed but as long 1894 * target is a process. Threads may be created and destroyed but as long
1891 * as cgroup_mutex is not dropped, no new css_set can be put into play and 1895 * as cgroup_mutex is not dropped, no new css_set can be put into play and
1892 * the preloaded css_sets are guaranteed to cover all migrations. 1896 * the preloaded css_sets are guaranteed to cover all migrations.
1893 */ 1897 */
1894 static void cgroup_migrate_add_src(struct css_set *src_cset, 1898 static void cgroup_migrate_add_src(struct css_set *src_cset,
1895 struct cgroup *dst_cgrp, 1899 struct cgroup *dst_cgrp,
1896 struct list_head *preloaded_csets) 1900 struct list_head *preloaded_csets)
1897 { 1901 {
1898 struct cgroup *src_cgrp; 1902 struct cgroup *src_cgrp;
1899 1903
1900 lockdep_assert_held(&cgroup_mutex); 1904 lockdep_assert_held(&cgroup_mutex);
1901 lockdep_assert_held(&css_set_rwsem); 1905 lockdep_assert_held(&css_set_rwsem);
1902 1906
1903 src_cgrp = cset_cgroup_from_root(src_cset, dst_cgrp->root); 1907 src_cgrp = cset_cgroup_from_root(src_cset, dst_cgrp->root);
1904 1908
1905 /* nothing to do if this cset already belongs to the cgroup */ 1909 /* nothing to do if this cset already belongs to the cgroup */
1906 if (src_cgrp == dst_cgrp) 1910 if (src_cgrp == dst_cgrp)
1907 return; 1911 return;
1908 1912
1909 if (!list_empty(&src_cset->mg_preload_node)) 1913 if (!list_empty(&src_cset->mg_preload_node))
1910 return; 1914 return;
1911 1915
1912 WARN_ON(src_cset->mg_src_cgrp); 1916 WARN_ON(src_cset->mg_src_cgrp);
1913 WARN_ON(!list_empty(&src_cset->mg_tasks)); 1917 WARN_ON(!list_empty(&src_cset->mg_tasks));
1914 WARN_ON(!list_empty(&src_cset->mg_node)); 1918 WARN_ON(!list_empty(&src_cset->mg_node));
1915 1919
1916 src_cset->mg_src_cgrp = src_cgrp; 1920 src_cset->mg_src_cgrp = src_cgrp;
1917 get_css_set(src_cset); 1921 get_css_set(src_cset);
1918 list_add(&src_cset->mg_preload_node, preloaded_csets); 1922 list_add(&src_cset->mg_preload_node, preloaded_csets);
1919 } 1923 }
1920 1924
1921 /** 1925 /**
1922 * cgroup_migrate_prepare_dst - prepare destination css_sets for migration 1926 * cgroup_migrate_prepare_dst - prepare destination css_sets for migration
1923 * @dst_cgrp: the destination cgroup 1927 * @dst_cgrp: the destination cgroup
1924 * @preloaded_csets: list of preloaded source css_sets 1928 * @preloaded_csets: list of preloaded source css_sets
1925 * 1929 *
1926 * Tasks are about to be moved to @dst_cgrp and all the source css_sets 1930 * Tasks are about to be moved to @dst_cgrp and all the source css_sets
1927 * have been preloaded to @preloaded_csets. This function looks up and 1931 * have been preloaded to @preloaded_csets. This function looks up and
1928 * pins all destination css_sets, links each to its source, and puts them on 1932 * pins all destination css_sets, links each to its source, and puts them on
1929 * @preloaded_csets. 1933 * @preloaded_csets.
1930 * 1934 *
1931 * This function must be called after cgroup_migrate_add_src() has been 1935 * This function must be called after cgroup_migrate_add_src() has been
1932 * called on each migration source css_set. After migration is performed 1936 * called on each migration source css_set. After migration is performed
1933 * using cgroup_migrate(), cgroup_migrate_finish() must be called on 1937 * using cgroup_migrate(), cgroup_migrate_finish() must be called on
1934 * @preloaded_csets. 1938 * @preloaded_csets.
1935 */ 1939 */
1936 static int cgroup_migrate_prepare_dst(struct cgroup *dst_cgrp, 1940 static int cgroup_migrate_prepare_dst(struct cgroup *dst_cgrp,
1937 struct list_head *preloaded_csets) 1941 struct list_head *preloaded_csets)
1938 { 1942 {
1939 LIST_HEAD(csets); 1943 LIST_HEAD(csets);
1940 struct css_set *src_cset; 1944 struct css_set *src_cset;
1941 1945
1942 lockdep_assert_held(&cgroup_mutex); 1946 lockdep_assert_held(&cgroup_mutex);
1943 1947
1944 /* look up the dst cset for each src cset and link it to src */ 1948 /* look up the dst cset for each src cset and link it to src */
1945 list_for_each_entry(src_cset, preloaded_csets, mg_preload_node) { 1949 list_for_each_entry(src_cset, preloaded_csets, mg_preload_node) {
1946 struct css_set *dst_cset; 1950 struct css_set *dst_cset;
1947 1951
1948 dst_cset = find_css_set(src_cset, dst_cgrp); 1952 dst_cset = find_css_set(src_cset, dst_cgrp);
1949 if (!dst_cset) 1953 if (!dst_cset)
1950 goto err; 1954 goto err;
1951 1955
1952 WARN_ON_ONCE(src_cset->mg_dst_cset || dst_cset->mg_dst_cset); 1956 WARN_ON_ONCE(src_cset->mg_dst_cset || dst_cset->mg_dst_cset);
1953 src_cset->mg_dst_cset = dst_cset; 1957 src_cset->mg_dst_cset = dst_cset;
1954 1958
1955 if (list_empty(&dst_cset->mg_preload_node)) 1959 if (list_empty(&dst_cset->mg_preload_node))
1956 list_add(&dst_cset->mg_preload_node, &csets); 1960 list_add(&dst_cset->mg_preload_node, &csets);
1957 else 1961 else
1958 put_css_set(dst_cset, false); 1962 put_css_set(dst_cset, false);
1959 } 1963 }
1960 1964
1961 list_splice(&csets, preloaded_csets); 1965 list_splice(&csets, preloaded_csets);
1962 return 0; 1966 return 0;
1963 err: 1967 err:
1964 cgroup_migrate_finish(&csets); 1968 cgroup_migrate_finish(&csets);
1965 return -ENOMEM; 1969 return -ENOMEM;
1966 } 1970 }
1967 1971
1968 /** 1972 /**
1969 * cgroup_migrate - migrate a process or task to a cgroup 1973 * cgroup_migrate - migrate a process or task to a cgroup
1970 * @cgrp: the destination cgroup 1974 * @cgrp: the destination cgroup
1971 * @leader: the leader of the process or the task to migrate 1975 * @leader: the leader of the process or the task to migrate
1972 * @threadgroup: whether @leader points to the whole process or a single task 1976 * @threadgroup: whether @leader points to the whole process or a single task
1973 * 1977 *
1974 * Migrate a process or task denoted by @leader to @cgrp. If migrating a 1978 * Migrate a process or task denoted by @leader to @cgrp. If migrating a
1975 * process, the caller must be holding threadgroup_lock of @leader. The 1979 * process, the caller must be holding threadgroup_lock of @leader. The
1976 * caller is also responsible for invoking cgroup_migrate_add_src() and 1980 * caller is also responsible for invoking cgroup_migrate_add_src() and
1977 * cgroup_migrate_prepare_dst() on the targets before invoking this 1981 * cgroup_migrate_prepare_dst() on the targets before invoking this
1978 * function and following up with cgroup_migrate_finish(). 1982 * function and following up with cgroup_migrate_finish().
1979 * 1983 *
1980 * As long as a controller's ->can_attach() doesn't fail, this function is 1984 * As long as a controller's ->can_attach() doesn't fail, this function is
1981 * guaranteed to succeed. This means that, excluding ->can_attach() 1985 * guaranteed to succeed. This means that, excluding ->can_attach()
1982 * failure, when migrating multiple targets, the success or failure can be 1986 * failure, when migrating multiple targets, the success or failure can be
1983 * decided for all targets by invoking cgroup_migrate_prepare_dst() before 1987 * decided for all targets by invoking cgroup_migrate_prepare_dst() before
1984 * actually starting the migration. 1988 * actually starting the migration.
1985 */ 1989 */
1986 static int cgroup_migrate(struct cgroup *cgrp, struct task_struct *leader, 1990 static int cgroup_migrate(struct cgroup *cgrp, struct task_struct *leader,
1987 bool threadgroup) 1991 bool threadgroup)
1988 { 1992 {
1989 struct cgroup_taskset tset = { 1993 struct cgroup_taskset tset = {
1990 .src_csets = LIST_HEAD_INIT(tset.src_csets), 1994 .src_csets = LIST_HEAD_INIT(tset.src_csets),
1991 .dst_csets = LIST_HEAD_INIT(tset.dst_csets), 1995 .dst_csets = LIST_HEAD_INIT(tset.dst_csets),
1992 .csets = &tset.src_csets, 1996 .csets = &tset.src_csets,
1993 }; 1997 };
1994 struct cgroup_subsys_state *css, *failed_css = NULL; 1998 struct cgroup_subsys_state *css, *failed_css = NULL;
1995 struct css_set *cset, *tmp_cset; 1999 struct css_set *cset, *tmp_cset;
1996 struct task_struct *task, *tmp_task; 2000 struct task_struct *task, *tmp_task;
1997 int i, ret; 2001 int i, ret;
1998 2002
1999 /* 2003 /*
2000 * Prevent freeing of tasks while we take a snapshot. Tasks that are 2004 * Prevent freeing of tasks while we take a snapshot. Tasks that are
2001 * already PF_EXITING could be freed from underneath us unless we 2005 * already PF_EXITING could be freed from underneath us unless we
2002 * take an rcu_read_lock. 2006 * take an rcu_read_lock.
2003 */ 2007 */
2004 down_write(&css_set_rwsem); 2008 down_write(&css_set_rwsem);
2005 rcu_read_lock(); 2009 rcu_read_lock();
2006 task = leader; 2010 task = leader;
2007 do { 2011 do {
2008 /* @task either already exited or can't exit until the end */ 2012 /* @task either already exited or can't exit until the end */
2009 if (task->flags & PF_EXITING) 2013 if (task->flags & PF_EXITING)
2010 goto next; 2014 goto next;
2011 2015
2012 /* leave @task alone if post_fork() hasn't linked it yet */ 2016 /* leave @task alone if post_fork() hasn't linked it yet */
2013 if (list_empty(&task->cg_list)) 2017 if (list_empty(&task->cg_list))
2014 goto next; 2018 goto next;
2015 2019
2016 cset = task_css_set(task); 2020 cset = task_css_set(task);
2017 if (!cset->mg_src_cgrp) 2021 if (!cset->mg_src_cgrp)
2018 goto next; 2022 goto next;
2019 2023
2020 /* 2024 /*
2021 * cgroup_taskset_first() must always return the leader. 2025 * cgroup_taskset_first() must always return the leader.
2022 * Take care to avoid disturbing the ordering. 2026 * Take care to avoid disturbing the ordering.
2023 */ 2027 */
2024 list_move_tail(&task->cg_list, &cset->mg_tasks); 2028 list_move_tail(&task->cg_list, &cset->mg_tasks);
2025 if (list_empty(&cset->mg_node)) 2029 if (list_empty(&cset->mg_node))
2026 list_add_tail(&cset->mg_node, &tset.src_csets); 2030 list_add_tail(&cset->mg_node, &tset.src_csets);
2027 if (list_empty(&cset->mg_dst_cset->mg_node)) 2031 if (list_empty(&cset->mg_dst_cset->mg_node))
2028 list_move_tail(&cset->mg_dst_cset->mg_node, 2032 list_move_tail(&cset->mg_dst_cset->mg_node,
2029 &tset.dst_csets); 2033 &tset.dst_csets);
2030 next: 2034 next:
2031 if (!threadgroup) 2035 if (!threadgroup)
2032 break; 2036 break;
2033 } while_each_thread(leader, task); 2037 } while_each_thread(leader, task);
2034 rcu_read_unlock(); 2038 rcu_read_unlock();
2035 up_write(&css_set_rwsem); 2039 up_write(&css_set_rwsem);
2036 2040
2037 /* methods shouldn't be called if no task is actually migrating */ 2041 /* methods shouldn't be called if no task is actually migrating */
2038 if (list_empty(&tset.src_csets)) 2042 if (list_empty(&tset.src_csets))
2039 return 0; 2043 return 0;
2040 2044
2041 /* check that we can legitimately attach to the cgroup */ 2045 /* check that we can legitimately attach to the cgroup */
2042 for_each_e_css(css, i, cgrp) { 2046 for_each_e_css(css, i, cgrp) {
2043 if (css->ss->can_attach) { 2047 if (css->ss->can_attach) {
2044 ret = css->ss->can_attach(css, &tset); 2048 ret = css->ss->can_attach(css, &tset);
2045 if (ret) { 2049 if (ret) {
2046 failed_css = css; 2050 failed_css = css;
2047 goto out_cancel_attach; 2051 goto out_cancel_attach;
2048 } 2052 }
2049 } 2053 }
2050 } 2054 }
2051 2055
2052 /* 2056 /*
2053 * Now that we're guaranteed success, proceed to move all tasks to 2057 * Now that we're guaranteed success, proceed to move all tasks to
2054 * the new cgroup. There are no failure cases after here, so this 2058 * the new cgroup. There are no failure cases after here, so this
2055 * is the commit point. 2059 * is the commit point.
2056 */ 2060 */
2057 down_write(&css_set_rwsem); 2061 down_write(&css_set_rwsem);
2058 list_for_each_entry(cset, &tset.src_csets, mg_node) { 2062 list_for_each_entry(cset, &tset.src_csets, mg_node) {
2059 list_for_each_entry_safe(task, tmp_task, &cset->mg_tasks, cg_list) 2063 list_for_each_entry_safe(task, tmp_task, &cset->mg_tasks, cg_list)
2060 cgroup_task_migrate(cset->mg_src_cgrp, task, 2064 cgroup_task_migrate(cset->mg_src_cgrp, task,
2061 cset->mg_dst_cset); 2065 cset->mg_dst_cset);
2062 } 2066 }
2063 up_write(&css_set_rwsem); 2067 up_write(&css_set_rwsem);
2064 2068
2065 /* 2069 /*
2066 * Migration is committed, all target tasks are now on dst_csets. 2070 * Migration is committed, all target tasks are now on dst_csets.
2067 * Nothing is sensitive to fork() after this point. Notify 2071 * Nothing is sensitive to fork() after this point. Notify
2068 * controllers that migration is complete. 2072 * controllers that migration is complete.
2069 */ 2073 */
2070 tset.csets = &tset.dst_csets; 2074 tset.csets = &tset.dst_csets;
2071 2075
2072 for_each_e_css(css, i, cgrp) 2076 for_each_e_css(css, i, cgrp)
2073 if (css->ss->attach) 2077 if (css->ss->attach)
2074 css->ss->attach(css, &tset); 2078 css->ss->attach(css, &tset);
2075 2079
2076 ret = 0; 2080 ret = 0;
2077 goto out_release_tset; 2081 goto out_release_tset;
2078 2082
2079 out_cancel_attach: 2083 out_cancel_attach:
2080 for_each_e_css(css, i, cgrp) { 2084 for_each_e_css(css, i, cgrp) {
2081 if (css == failed_css) 2085 if (css == failed_css)
2082 break; 2086 break;
2083 if (css->ss->cancel_attach) 2087 if (css->ss->cancel_attach)
2084 css->ss->cancel_attach(css, &tset); 2088 css->ss->cancel_attach(css, &tset);
2085 } 2089 }
2086 out_release_tset: 2090 out_release_tset:
2087 down_write(&css_set_rwsem); 2091 down_write(&css_set_rwsem);
2088 list_splice_init(&tset.dst_csets, &tset.src_csets); 2092 list_splice_init(&tset.dst_csets, &tset.src_csets);
2089 list_for_each_entry_safe(cset, tmp_cset, &tset.src_csets, mg_node) { 2093 list_for_each_entry_safe(cset, tmp_cset, &tset.src_csets, mg_node) {
2090 list_splice_tail_init(&cset->mg_tasks, &cset->tasks); 2094 list_splice_tail_init(&cset->mg_tasks, &cset->tasks);
2091 list_del_init(&cset->mg_node); 2095 list_del_init(&cset->mg_node);
2092 } 2096 }
2093 up_write(&css_set_rwsem); 2097 up_write(&css_set_rwsem);
2094 return ret; 2098 return ret;
2095 } 2099 }
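The helpers above form a four-step protocol; cgroup_attach_task() below is the real in-tree user. As a hedged summary of the call order for a single task (demo_migrate_one() is hypothetical; cgroup_mutex is held, plus threadgroup_lock when moving a whole process):

/* Sketch only: restates the add_src -> prepare_dst -> migrate -> finish
 * sequence that cgroup_attach_task() performs. */
static int demo_migrate_one(struct cgroup *dst_cgrp, struct task_struct *task)
{
	LIST_HEAD(preloaded_csets);
	int ret;

	/* 1. pin the source css_set */
	down_read(&css_set_rwsem);
	cgroup_migrate_add_src(task_css_set(task), dst_cgrp, &preloaded_csets);
	up_read(&css_set_rwsem);

	/* 2. look up and pin the matching destination css_set */
	ret = cgroup_migrate_prepare_dst(dst_cgrp, &preloaded_csets);

	/* 3. commit the migration (single task, hence "false") */
	if (!ret)
		ret = cgroup_migrate(dst_cgrp, task, false);

	/* 4. drop the references taken in steps 1 and 2 */
	cgroup_migrate_finish(&preloaded_csets);
	return ret;
}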
2096 2100
2097 /** 2101 /**
2098 * cgroup_attach_task - attach a task or a whole threadgroup to a cgroup 2102 * cgroup_attach_task - attach a task or a whole threadgroup to a cgroup
2099 * @dst_cgrp: the cgroup to attach to 2103 * @dst_cgrp: the cgroup to attach to
2100 * @leader: the task or the leader of the threadgroup to be attached 2104 * @leader: the task or the leader of the threadgroup to be attached
2101 * @threadgroup: attach the whole threadgroup? 2105 * @threadgroup: attach the whole threadgroup?
2102 * 2106 *
2103 * Call holding cgroup_mutex and threadgroup_lock of @leader. 2107 * Call holding cgroup_mutex and threadgroup_lock of @leader.
2104 */ 2108 */
2105 static int cgroup_attach_task(struct cgroup *dst_cgrp, 2109 static int cgroup_attach_task(struct cgroup *dst_cgrp,
2106 struct task_struct *leader, bool threadgroup) 2110 struct task_struct *leader, bool threadgroup)
2107 { 2111 {
2108 LIST_HEAD(preloaded_csets); 2112 LIST_HEAD(preloaded_csets);
2109 struct task_struct *task; 2113 struct task_struct *task;
2110 int ret; 2114 int ret;
2111 2115
2112 /* look up all src csets */ 2116 /* look up all src csets */
2113 down_read(&css_set_rwsem); 2117 down_read(&css_set_rwsem);
2114 rcu_read_lock(); 2118 rcu_read_lock();
2115 task = leader; 2119 task = leader;
2116 do { 2120 do {
2117 cgroup_migrate_add_src(task_css_set(task), dst_cgrp, 2121 cgroup_migrate_add_src(task_css_set(task), dst_cgrp,
2118 &preloaded_csets); 2122 &preloaded_csets);
2119 if (!threadgroup) 2123 if (!threadgroup)
2120 break; 2124 break;
2121 } while_each_thread(leader, task); 2125 } while_each_thread(leader, task);
2122 rcu_read_unlock(); 2126 rcu_read_unlock();
2123 up_read(&css_set_rwsem); 2127 up_read(&css_set_rwsem);
2124 2128
2125 /* prepare dst csets and commit */ 2129 /* prepare dst csets and commit */
2126 ret = cgroup_migrate_prepare_dst(dst_cgrp, &preloaded_csets); 2130 ret = cgroup_migrate_prepare_dst(dst_cgrp, &preloaded_csets);
2127 if (!ret) 2131 if (!ret)
2128 ret = cgroup_migrate(dst_cgrp, leader, threadgroup); 2132 ret = cgroup_migrate(dst_cgrp, leader, threadgroup);
2129 2133
2130 cgroup_migrate_finish(&preloaded_csets); 2134 cgroup_migrate_finish(&preloaded_csets);
2131 return ret; 2135 return ret;
2132 } 2136 }
2133 2137
2134 /* 2138 /*
2135 * Find the task_struct of the task to attach by vpid and pass it along to the 2139 * Find the task_struct of the task to attach by vpid and pass it along to the
2136 * function to attach either it or all tasks in its threadgroup. Will lock 2140 * function to attach either it or all tasks in its threadgroup. Will lock
2137 * cgroup_mutex and threadgroup. 2141 * cgroup_mutex and threadgroup.
2138 */ 2142 */
2139 static int attach_task_by_pid(struct cgroup *cgrp, u64 pid, bool threadgroup) 2143 static int attach_task_by_pid(struct cgroup *cgrp, u64 pid, bool threadgroup)
2140 { 2144 {
2141 struct task_struct *tsk; 2145 struct task_struct *tsk;
2142 const struct cred *cred = current_cred(), *tcred; 2146 const struct cred *cred = current_cred(), *tcred;
2143 int ret; 2147 int ret;
2144 2148
2145 if (!cgroup_lock_live_group(cgrp)) 2149 if (!cgroup_lock_live_group(cgrp))
2146 return -ENODEV; 2150 return -ENODEV;
2147 2151
2148 retry_find_task: 2152 retry_find_task:
2149 rcu_read_lock(); 2153 rcu_read_lock();
2150 if (pid) { 2154 if (pid) {
2151 tsk = find_task_by_vpid(pid); 2155 tsk = find_task_by_vpid(pid);
2152 if (!tsk) { 2156 if (!tsk) {
2153 rcu_read_unlock(); 2157 rcu_read_unlock();
2154 ret = -ESRCH; 2158 ret = -ESRCH;
2155 goto out_unlock_cgroup; 2159 goto out_unlock_cgroup;
2156 } 2160 }
2157 /* 2161 /*
2158 * even if we're attaching all tasks in the thread group, we 2162 * even if we're attaching all tasks in the thread group, we
2159 * only need to check permissions on one of them. 2163 * only need to check permissions on one of them.
2160 */ 2164 */
2161 tcred = __task_cred(tsk); 2165 tcred = __task_cred(tsk);
2162 if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) && 2166 if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) &&
2163 !uid_eq(cred->euid, tcred->uid) && 2167 !uid_eq(cred->euid, tcred->uid) &&
2164 !uid_eq(cred->euid, tcred->suid)) { 2168 !uid_eq(cred->euid, tcred->suid)) {
2165 rcu_read_unlock(); 2169 rcu_read_unlock();
2166 ret = -EACCES; 2170 ret = -EACCES;
2167 goto out_unlock_cgroup; 2171 goto out_unlock_cgroup;
2168 } 2172 }
2169 } else 2173 } else
2170 tsk = current; 2174 tsk = current;
2171 2175
2172 if (threadgroup) 2176 if (threadgroup)
2173 tsk = tsk->group_leader; 2177 tsk = tsk->group_leader;
2174 2178
2175 /* 2179 /*
2176 * Workqueue threads may acquire PF_NO_SETAFFINITY and become 2180 * Workqueue threads may acquire PF_NO_SETAFFINITY and become
2177 * trapped in a cpuset, or an RT worker may be born in a cgroup 2181 * trapped in a cpuset, or an RT worker may be born in a cgroup
2178 * with no rt_runtime allocated. Just say no. 2182 * with no rt_runtime allocated. Just say no.
2179 */ 2183 */
2180 if (tsk == kthreadd_task || (tsk->flags & PF_NO_SETAFFINITY)) { 2184 if (tsk == kthreadd_task || (tsk->flags & PF_NO_SETAFFINITY)) {
2181 ret = -EINVAL; 2185 ret = -EINVAL;
2182 rcu_read_unlock(); 2186 rcu_read_unlock();
2183 goto out_unlock_cgroup; 2187 goto out_unlock_cgroup;
2184 } 2188 }
2185 2189
2186 get_task_struct(tsk); 2190 get_task_struct(tsk);
2187 rcu_read_unlock(); 2191 rcu_read_unlock();
2188 2192
2189 threadgroup_lock(tsk); 2193 threadgroup_lock(tsk);
2190 if (threadgroup) { 2194 if (threadgroup) {
2191 if (!thread_group_leader(tsk)) { 2195 if (!thread_group_leader(tsk)) {
2192 /* 2196 /*
2193 * a race with de_thread from another thread's exec() 2197 * a race with de_thread from another thread's exec()
2194 * may strip us of our leadership; if this happens, 2198 * may strip us of our leadership; if this happens,
2195 * there is no choice but to throw this task away and 2199 * there is no choice but to throw this task away and
2196 * try again; this is 2200 * try again; this is
2197 * "double-double-toil-and-trouble-check locking". 2201 * "double-double-toil-and-trouble-check locking".
2198 */ 2202 */
2199 threadgroup_unlock(tsk); 2203 threadgroup_unlock(tsk);
2200 put_task_struct(tsk); 2204 put_task_struct(tsk);
2201 goto retry_find_task; 2205 goto retry_find_task;
2202 } 2206 }
2203 } 2207 }
2204 2208
2205 ret = cgroup_attach_task(cgrp, tsk, threadgroup); 2209 ret = cgroup_attach_task(cgrp, tsk, threadgroup);
2206 2210
2207 threadgroup_unlock(tsk); 2211 threadgroup_unlock(tsk);
2208 2212
2209 put_task_struct(tsk); 2213 put_task_struct(tsk);
2210 out_unlock_cgroup: 2214 out_unlock_cgroup:
2211 mutex_unlock(&cgroup_mutex); 2215 mutex_unlock(&cgroup_mutex);
2212 return ret; 2216 return ret;
2213 } 2217 }
2214 2218
2215 /** 2219 /**
2216 * cgroup_attach_task_all - attach task 'tsk' to all cgroups of task 'from' 2220 * cgroup_attach_task_all - attach task 'tsk' to all cgroups of task 'from'
2217 * @from: attach to all cgroups of a given task 2221 * @from: attach to all cgroups of a given task
2218 * @tsk: the task to be attached 2222 * @tsk: the task to be attached
2219 */ 2223 */
2220 int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk) 2224 int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
2221 { 2225 {
2222 struct cgroup_root *root; 2226 struct cgroup_root *root;
2223 int retval = 0; 2227 int retval = 0;
2224 2228
2225 mutex_lock(&cgroup_mutex); 2229 mutex_lock(&cgroup_mutex);
2226 for_each_root(root) { 2230 for_each_root(root) {
2227 struct cgroup *from_cgrp; 2231 struct cgroup *from_cgrp;
2228 2232
2229 if (root == &cgrp_dfl_root) 2233 if (root == &cgrp_dfl_root)
2230 continue; 2234 continue;
2231 2235
2232 down_read(&css_set_rwsem); 2236 down_read(&css_set_rwsem);
2233 from_cgrp = task_cgroup_from_root(from, root); 2237 from_cgrp = task_cgroup_from_root(from, root);
2234 up_read(&css_set_rwsem); 2238 up_read(&css_set_rwsem);
2235 2239
2236 retval = cgroup_attach_task(from_cgrp, tsk, false); 2240 retval = cgroup_attach_task(from_cgrp, tsk, false);
2237 if (retval) 2241 if (retval)
2238 break; 2242 break;
2239 } 2243 }
2240 mutex_unlock(&cgroup_mutex); 2244 mutex_unlock(&cgroup_mutex);
2241 2245
2242 return retval; 2246 return retval;
2243 } 2247 }
2244 EXPORT_SYMBOL_GPL(cgroup_attach_task_all); 2248 EXPORT_SYMBOL_GPL(cgroup_attach_task_all);
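cgroup_attach_task_all() is exported for use by other subsystems. A hedged usage sketch (demo_adopt_owner_cgroups() is hypothetical): a kernel helper thread created on behalf of @owner can be placed into all of @owner's cgroups.

#include <linux/cgroup.h>
#include <linux/printk.h>

static void demo_adopt_owner_cgroups(struct task_struct *owner,
				     struct task_struct *worker)
{
	int err = cgroup_attach_task_all(owner, worker);

	if (err)
		pr_warn("failed to attach worker to owner's cgroups: %d\n",
			err);
}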
2245 2249
2246 static int cgroup_tasks_write(struct cgroup_subsys_state *css, 2250 static int cgroup_tasks_write(struct cgroup_subsys_state *css,
2247 struct cftype *cft, u64 pid) 2251 struct cftype *cft, u64 pid)
2248 { 2252 {
2249 return attach_task_by_pid(css->cgroup, pid, false); 2253 return attach_task_by_pid(css->cgroup, pid, false);
2250 } 2254 }
2251 2255
2252 static int cgroup_procs_write(struct cgroup_subsys_state *css, 2256 static int cgroup_procs_write(struct cgroup_subsys_state *css,
2253 struct cftype *cft, u64 tgid) 2257 struct cftype *cft, u64 tgid)
2254 { 2258 {
2255 return attach_task_by_pid(css->cgroup, tgid, true); 2259 return attach_task_by_pid(css->cgroup, tgid, true);
2256 } 2260 }
2257 2261
2258 static int cgroup_release_agent_write(struct cgroup_subsys_state *css, 2262 static int cgroup_release_agent_write(struct cgroup_subsys_state *css,
2259 struct cftype *cft, char *buffer) 2263 struct cftype *cft, char *buffer)
2260 { 2264 {
2261 struct cgroup_root *root = css->cgroup->root; 2265 struct cgroup_root *root = css->cgroup->root;
2262 2266
2263 BUILD_BUG_ON(sizeof(root->release_agent_path) < PATH_MAX); 2267 BUILD_BUG_ON(sizeof(root->release_agent_path) < PATH_MAX);
2264 if (!cgroup_lock_live_group(css->cgroup)) 2268 if (!cgroup_lock_live_group(css->cgroup))
2265 return -ENODEV; 2269 return -ENODEV;
2266 spin_lock(&release_agent_path_lock); 2270 spin_lock(&release_agent_path_lock);
2267 strlcpy(root->release_agent_path, buffer, 2271 strlcpy(root->release_agent_path, buffer,
2268 sizeof(root->release_agent_path)); 2272 sizeof(root->release_agent_path));
2269 spin_unlock(&release_agent_path_lock); 2273 spin_unlock(&release_agent_path_lock);
2270 mutex_unlock(&cgroup_mutex); 2274 mutex_unlock(&cgroup_mutex);
2271 return 0; 2275 return 0;
2272 } 2276 }
2273 2277
2274 static int cgroup_release_agent_show(struct seq_file *seq, void *v) 2278 static int cgroup_release_agent_show(struct seq_file *seq, void *v)
2275 { 2279 {
2276 struct cgroup *cgrp = seq_css(seq)->cgroup; 2280 struct cgroup *cgrp = seq_css(seq)->cgroup;
2277 2281
2278 if (!cgroup_lock_live_group(cgrp)) 2282 if (!cgroup_lock_live_group(cgrp))
2279 return -ENODEV; 2283 return -ENODEV;
2280 seq_puts(seq, cgrp->root->release_agent_path); 2284 seq_puts(seq, cgrp->root->release_agent_path);
2281 seq_putc(seq, '\n'); 2285 seq_putc(seq, '\n');
2282 mutex_unlock(&cgroup_mutex); 2286 mutex_unlock(&cgroup_mutex);
2283 return 0; 2287 return 0;
2284 } 2288 }
2285 2289
2286 static int cgroup_sane_behavior_show(struct seq_file *seq, void *v) 2290 static int cgroup_sane_behavior_show(struct seq_file *seq, void *v)
2287 { 2291 {
2288 struct cgroup *cgrp = seq_css(seq)->cgroup; 2292 struct cgroup *cgrp = seq_css(seq)->cgroup;
2289 2293
2290 seq_printf(seq, "%d\n", cgroup_sane_behavior(cgrp)); 2294 seq_printf(seq, "%d\n", cgroup_sane_behavior(cgrp));
2291 return 0; 2295 return 0;
2292 } 2296 }
2293 2297
2294 static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf, 2298 static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf,
2295 size_t nbytes, loff_t off) 2299 size_t nbytes, loff_t off)
2296 { 2300 {
2297 struct cgroup *cgrp = of->kn->parent->priv; 2301 struct cgroup *cgrp = of->kn->parent->priv;
2298 struct cftype *cft = of->kn->priv; 2302 struct cftype *cft = of->kn->priv;
2299 struct cgroup_subsys_state *css; 2303 struct cgroup_subsys_state *css;
2300 int ret; 2304 int ret;
2301 2305
2302 /* 2306 /*
2303 * kernfs guarantees that a file isn't deleted with operations in 2307 * kernfs guarantees that a file isn't deleted with operations in
2304 * flight, which means that the matching css is and stays alive and 2308 * flight, which means that the matching css is and stays alive and
2305 * doesn't need to be pinned. The RCU locking is not necessary 2309 * doesn't need to be pinned. The RCU locking is not necessary
2306 * either. It's just for the convenience of using cgroup_css(). 2310 * either. It's just for the convenience of using cgroup_css().
2307 */ 2311 */
2308 rcu_read_lock(); 2312 rcu_read_lock();
2309 css = cgroup_css(cgrp, cft->ss); 2313 css = cgroup_css(cgrp, cft->ss);
2310 rcu_read_unlock(); 2314 rcu_read_unlock();
2311 2315
2312 if (cft->write_string) { 2316 if (cft->write_string) {
2313 ret = cft->write_string(css, cft, strstrip(buf)); 2317 ret = cft->write_string(css, cft, strstrip(buf));
2314 } else if (cft->write_u64) { 2318 } else if (cft->write_u64) {
2315 unsigned long long v; 2319 unsigned long long v;
2316 ret = kstrtoull(buf, 0, &v); 2320 ret = kstrtoull(buf, 0, &v);
2317 if (!ret) 2321 if (!ret)
2318 ret = cft->write_u64(css, cft, v); 2322 ret = cft->write_u64(css, cft, v);
2319 } else if (cft->write_s64) { 2323 } else if (cft->write_s64) {
2320 long long v; 2324 long long v;
2321 ret = kstrtoll(buf, 0, &v); 2325 ret = kstrtoll(buf, 0, &v);
2322 if (!ret) 2326 if (!ret)
2323 ret = cft->write_s64(css, cft, v); 2327 ret = cft->write_s64(css, cft, v);
2324 } else if (cft->trigger) { 2328 } else if (cft->trigger) {
2325 ret = cft->trigger(css, (unsigned int)cft->private); 2329 ret = cft->trigger(css, (unsigned int)cft->private);
2326 } else { 2330 } else {
2327 ret = -EINVAL; 2331 ret = -EINVAL;
2328 } 2332 }
2329 2333
2330 return ret ?: nbytes; 2334 return ret ?: nbytes;
2331 } 2335 }
2332 2336
2333 static void *cgroup_seqfile_start(struct seq_file *seq, loff_t *ppos) 2337 static void *cgroup_seqfile_start(struct seq_file *seq, loff_t *ppos)
2334 { 2338 {
2335 return seq_cft(seq)->seq_start(seq, ppos); 2339 return seq_cft(seq)->seq_start(seq, ppos);
2336 } 2340 }
2337 2341
2338 static void *cgroup_seqfile_next(struct seq_file *seq, void *v, loff_t *ppos) 2342 static void *cgroup_seqfile_next(struct seq_file *seq, void *v, loff_t *ppos)
2339 { 2343 {
2340 return seq_cft(seq)->seq_next(seq, v, ppos); 2344 return seq_cft(seq)->seq_next(seq, v, ppos);
2341 } 2345 }
2342 2346
2343 static void cgroup_seqfile_stop(struct seq_file *seq, void *v) 2347 static void cgroup_seqfile_stop(struct seq_file *seq, void *v)
2344 { 2348 {
2345 seq_cft(seq)->seq_stop(seq, v); 2349 seq_cft(seq)->seq_stop(seq, v);
2346 } 2350 }
2347 2351
2348 static int cgroup_seqfile_show(struct seq_file *m, void *arg) 2352 static int cgroup_seqfile_show(struct seq_file *m, void *arg)
2349 { 2353 {
2350 struct cftype *cft = seq_cft(m); 2354 struct cftype *cft = seq_cft(m);
2351 struct cgroup_subsys_state *css = seq_css(m); 2355 struct cgroup_subsys_state *css = seq_css(m);
2352 2356
2353 if (cft->seq_show) 2357 if (cft->seq_show)
2354 return cft->seq_show(m, arg); 2358 return cft->seq_show(m, arg);
2355 2359
2356 if (cft->read_u64) 2360 if (cft->read_u64)
2357 seq_printf(m, "%llu\n", cft->read_u64(css, cft)); 2361 seq_printf(m, "%llu\n", cft->read_u64(css, cft));
2358 else if (cft->read_s64) 2362 else if (cft->read_s64)
2359 seq_printf(m, "%lld\n", cft->read_s64(css, cft)); 2363 seq_printf(m, "%lld\n", cft->read_s64(css, cft));
2360 else 2364 else
2361 return -EINVAL; 2365 return -EINVAL;
2362 return 0; 2366 return 0;
2363 } 2367 }
2364 2368
2365 static struct kernfs_ops cgroup_kf_single_ops = { 2369 static struct kernfs_ops cgroup_kf_single_ops = {
2366 .atomic_write_len = PAGE_SIZE, 2370 .atomic_write_len = PAGE_SIZE,
2367 .write = cgroup_file_write, 2371 .write = cgroup_file_write,
2368 .seq_show = cgroup_seqfile_show, 2372 .seq_show = cgroup_seqfile_show,
2369 }; 2373 };
2370 2374
2371 static struct kernfs_ops cgroup_kf_ops = { 2375 static struct kernfs_ops cgroup_kf_ops = {
2372 .atomic_write_len = PAGE_SIZE, 2376 .atomic_write_len = PAGE_SIZE,
2373 .write = cgroup_file_write, 2377 .write = cgroup_file_write,
2374 .seq_start = cgroup_seqfile_start, 2378 .seq_start = cgroup_seqfile_start,
2375 .seq_next = cgroup_seqfile_next, 2379 .seq_next = cgroup_seqfile_next,
2376 .seq_stop = cgroup_seqfile_stop, 2380 .seq_stop = cgroup_seqfile_stop,
2377 .seq_show = cgroup_seqfile_show, 2381 .seq_show = cgroup_seqfile_show,
2378 }; 2382 };
2379 2383
2380 /* 2384 /*
2381 * cgroup_rename - Only allow simple rename of directories in place. 2385 * cgroup_rename - Only allow simple rename of directories in place.
2382 */ 2386 */
2383 static int cgroup_rename(struct kernfs_node *kn, struct kernfs_node *new_parent, 2387 static int cgroup_rename(struct kernfs_node *kn, struct kernfs_node *new_parent,
2384 const char *new_name_str) 2388 const char *new_name_str)
2385 { 2389 {
2386 struct cgroup *cgrp = kn->priv; 2390 struct cgroup *cgrp = kn->priv;
2387 int ret; 2391 int ret;
2388 2392
2389 if (kernfs_type(kn) != KERNFS_DIR) 2393 if (kernfs_type(kn) != KERNFS_DIR)
2390 return -ENOTDIR; 2394 return -ENOTDIR;
2391 if (kn->parent != new_parent) 2395 if (kn->parent != new_parent)
2392 return -EIO; 2396 return -EIO;
2393 2397
2394 /* 2398 /*
2395 * This isn't a proper migration and its usefulness is very 2399 * This isn't a proper migration and its usefulness is very
2396 * limited. Disallow if sane_behavior. 2400 * limited. Disallow if sane_behavior.
2397 */ 2401 */
2398 if (cgroup_sane_behavior(cgrp)) 2402 if (cgroup_sane_behavior(cgrp))
2399 return -EPERM; 2403 return -EPERM;
2400 2404
2401 /* 2405 /*
2402 * We're gonna grab cgroup_tree_mutex which nests outside kernfs 2406 * We're gonna grab cgroup_tree_mutex which nests outside kernfs
2403 * active_ref. kernfs_rename() doesn't require active_ref 2407 * active_ref. kernfs_rename() doesn't require active_ref
2404 * protection. Break them before grabbing cgroup_tree_mutex. 2408 * protection. Break them before grabbing cgroup_tree_mutex.
2405 */ 2409 */
2406 kernfs_break_active_protection(new_parent); 2410 kernfs_break_active_protection(new_parent);
2407 kernfs_break_active_protection(kn); 2411 kernfs_break_active_protection(kn);
2408 2412
2409 mutex_lock(&cgroup_tree_mutex); 2413 mutex_lock(&cgroup_tree_mutex);
2410 mutex_lock(&cgroup_mutex); 2414 mutex_lock(&cgroup_mutex);
2411 2415
2412 ret = kernfs_rename(kn, new_parent, new_name_str); 2416 ret = kernfs_rename(kn, new_parent, new_name_str);
2413 2417
2414 mutex_unlock(&cgroup_mutex); 2418 mutex_unlock(&cgroup_mutex);
2415 mutex_unlock(&cgroup_tree_mutex); 2419 mutex_unlock(&cgroup_tree_mutex);
2416 2420
2417 kernfs_unbreak_active_protection(kn); 2421 kernfs_unbreak_active_protection(kn);
2418 kernfs_unbreak_active_protection(new_parent); 2422 kernfs_unbreak_active_protection(new_parent);
2419 return ret; 2423 return ret;
2420 } 2424 }
2421 2425
2422 /* set uid and gid of cgroup dirs and files to that of the creator */ 2426 /* set uid and gid of cgroup dirs and files to that of the creator */
2423 static int cgroup_kn_set_ugid(struct kernfs_node *kn) 2427 static int cgroup_kn_set_ugid(struct kernfs_node *kn)
2424 { 2428 {
2425 struct iattr iattr = { .ia_valid = ATTR_UID | ATTR_GID, 2429 struct iattr iattr = { .ia_valid = ATTR_UID | ATTR_GID,
2426 .ia_uid = current_fsuid(), 2430 .ia_uid = current_fsuid(),
2427 .ia_gid = current_fsgid(), }; 2431 .ia_gid = current_fsgid(), };
2428 2432
2429 if (uid_eq(iattr.ia_uid, GLOBAL_ROOT_UID) && 2433 if (uid_eq(iattr.ia_uid, GLOBAL_ROOT_UID) &&
2430 gid_eq(iattr.ia_gid, GLOBAL_ROOT_GID)) 2434 gid_eq(iattr.ia_gid, GLOBAL_ROOT_GID))
2431 return 0; 2435 return 0;
2432 2436
2433 return kernfs_setattr(kn, &iattr); 2437 return kernfs_setattr(kn, &iattr);
2434 } 2438 }
2435 2439
2436 static int cgroup_add_file(struct cgroup *cgrp, struct cftype *cft) 2440 static int cgroup_add_file(struct cgroup *cgrp, struct cftype *cft)
2437 { 2441 {
2438 char name[CGROUP_FILE_NAME_MAX]; 2442 char name[CGROUP_FILE_NAME_MAX];
2439 struct kernfs_node *kn; 2443 struct kernfs_node *kn;
2440 struct lock_class_key *key = NULL; 2444 struct lock_class_key *key = NULL;
2441 int ret; 2445 int ret;
2442 2446
2443 #ifdef CONFIG_DEBUG_LOCK_ALLOC 2447 #ifdef CONFIG_DEBUG_LOCK_ALLOC
2444 key = &cft->lockdep_key; 2448 key = &cft->lockdep_key;
2445 #endif 2449 #endif
2446 kn = __kernfs_create_file(cgrp->kn, cgroup_file_name(cgrp, cft, name), 2450 kn = __kernfs_create_file(cgrp->kn, cgroup_file_name(cgrp, cft, name),
2447 cgroup_file_mode(cft), 0, cft->kf_ops, cft, 2451 cgroup_file_mode(cft), 0, cft->kf_ops, cft,
2448 NULL, false, key); 2452 NULL, false, key);
2449 if (IS_ERR(kn)) 2453 if (IS_ERR(kn))
2450 return PTR_ERR(kn); 2454 return PTR_ERR(kn);
2451 2455
2452 ret = cgroup_kn_set_ugid(kn); 2456 ret = cgroup_kn_set_ugid(kn);
2453 if (ret) 2457 if (ret)
2454 kernfs_remove(kn); 2458 kernfs_remove(kn);
2455 return ret; 2459 return ret;
2456 } 2460 }
2457 2461
2458 /** 2462 /**
2459 * cgroup_addrm_files - add or remove files in a cgroup directory 2463 * cgroup_addrm_files - add or remove files in a cgroup directory
2460 * @cgrp: the target cgroup 2464 * @cgrp: the target cgroup
2461 * @cfts: array of cftypes to be added 2465 * @cfts: array of cftypes to be added
2462 * @is_add: whether to add or remove 2466 * @is_add: whether to add or remove
2463 * 2467 *
2464 * Depending on @is_add, add or remove files defined by @cfts on @cgrp. 2468 * Depending on @is_add, add or remove files defined by @cfts on @cgrp.
2465 * For removals, this function never fails. If addition fails, this 2469 * For removals, this function never fails. If addition fails, this
2466 * function doesn't remove files already added. The caller is responsible 2470 * function doesn't remove files already added. The caller is responsible
2467 * for cleaning up. 2471 * for cleaning up.
2468 */ 2472 */
2469 static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[], 2473 static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[],
2470 bool is_add) 2474 bool is_add)
2471 { 2475 {
2472 struct cftype *cft; 2476 struct cftype *cft;
2473 int ret; 2477 int ret;
2474 2478
2475 lockdep_assert_held(&cgroup_tree_mutex); 2479 lockdep_assert_held(&cgroup_tree_mutex);
2476 2480
2477 for (cft = cfts; cft->name[0] != '\0'; cft++) { 2481 for (cft = cfts; cft->name[0] != '\0'; cft++) {
2478 /* does cft->flags tell us to skip this file on @cgrp? */ 2482 /* does cft->flags tell us to skip this file on @cgrp? */
2479 if ((cft->flags & CFTYPE_ONLY_ON_DFL) && !cgroup_on_dfl(cgrp)) 2483 if ((cft->flags & CFTYPE_ONLY_ON_DFL) && !cgroup_on_dfl(cgrp))
2480 continue; 2484 continue;
2481 if ((cft->flags & CFTYPE_INSANE) && cgroup_sane_behavior(cgrp)) 2485 if ((cft->flags & CFTYPE_INSANE) && cgroup_sane_behavior(cgrp))
2482 continue; 2486 continue;
2483 if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgrp->parent) 2487 if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgrp->parent)
2484 continue; 2488 continue;
2485 if ((cft->flags & CFTYPE_ONLY_ON_ROOT) && cgrp->parent) 2489 if ((cft->flags & CFTYPE_ONLY_ON_ROOT) && cgrp->parent)
2486 continue; 2490 continue;
2487 2491
2488 if (is_add) { 2492 if (is_add) {
2489 ret = cgroup_add_file(cgrp, cft); 2493 ret = cgroup_add_file(cgrp, cft);
2490 if (ret) { 2494 if (ret) {
2491 pr_warn("cgroup_addrm_files: failed to add %s, err=%d\n", 2495 pr_warn("cgroup_addrm_files: failed to add %s, err=%d\n",
2492 cft->name, ret); 2496 cft->name, ret);
2493 return ret; 2497 return ret;
2494 } 2498 }
2495 } else { 2499 } else {
2496 cgroup_rm_file(cgrp, cft); 2500 cgroup_rm_file(cgrp, cft);
2497 } 2501 }
2498 } 2502 }
2499 return 0; 2503 return 0;
2500 } 2504 }
2501 2505
2502 static int cgroup_apply_cftypes(struct cftype *cfts, bool is_add) 2506 static int cgroup_apply_cftypes(struct cftype *cfts, bool is_add)
2503 { 2507 {
2504 LIST_HEAD(pending); 2508 LIST_HEAD(pending);
2505 struct cgroup_subsys *ss = cfts[0].ss; 2509 struct cgroup_subsys *ss = cfts[0].ss;
2506 struct cgroup *root = &ss->root->cgrp; 2510 struct cgroup *root = &ss->root->cgrp;
2507 struct cgroup_subsys_state *css; 2511 struct cgroup_subsys_state *css;
2508 int ret = 0; 2512 int ret = 0;
2509 2513
2510 lockdep_assert_held(&cgroup_tree_mutex); 2514 lockdep_assert_held(&cgroup_tree_mutex);
2511 2515
2512 /* add/rm files for all cgroups created before */ 2516 /* add/rm files for all cgroups created before */
2513 css_for_each_descendant_pre(css, cgroup_css(root, ss)) { 2517 css_for_each_descendant_pre(css, cgroup_css(root, ss)) {
2514 struct cgroup *cgrp = css->cgroup; 2518 struct cgroup *cgrp = css->cgroup;
2515 2519
2516 if (cgroup_is_dead(cgrp)) 2520 if (cgroup_is_dead(cgrp))
2517 continue; 2521 continue;
2518 2522
2519 ret = cgroup_addrm_files(cgrp, cfts, is_add); 2523 ret = cgroup_addrm_files(cgrp, cfts, is_add);
2520 if (ret) 2524 if (ret)
2521 break; 2525 break;
2522 } 2526 }
2523 2527
2524 if (is_add && !ret) 2528 if (is_add && !ret)
2525 kernfs_activate(root->kn); 2529 kernfs_activate(root->kn);
2526 return ret; 2530 return ret;
2527 } 2531 }
2528 2532
2529 static void cgroup_exit_cftypes(struct cftype *cfts) 2533 static void cgroup_exit_cftypes(struct cftype *cfts)
2530 { 2534 {
2531 struct cftype *cft; 2535 struct cftype *cft;
2532 2536
2533 for (cft = cfts; cft->name[0] != '\0'; cft++) { 2537 for (cft = cfts; cft->name[0] != '\0'; cft++) {
2534 /* free copy for custom atomic_write_len, see init_cftypes() */ 2538 /* free copy for custom atomic_write_len, see init_cftypes() */
2535 if (cft->max_write_len && cft->max_write_len != PAGE_SIZE) 2539 if (cft->max_write_len && cft->max_write_len != PAGE_SIZE)
2536 kfree(cft->kf_ops); 2540 kfree(cft->kf_ops);
2537 cft->kf_ops = NULL; 2541 cft->kf_ops = NULL;
2538 cft->ss = NULL; 2542 cft->ss = NULL;
2539 } 2543 }
2540 } 2544 }
2541 2545
2542 static int cgroup_init_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) 2546 static int cgroup_init_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
2543 { 2547 {
2544 struct cftype *cft; 2548 struct cftype *cft;
2545 2549
2546 for (cft = cfts; cft->name[0] != '\0'; cft++) { 2550 for (cft = cfts; cft->name[0] != '\0'; cft++) {
2547 struct kernfs_ops *kf_ops; 2551 struct kernfs_ops *kf_ops;
2548 2552
2549 WARN_ON(cft->ss || cft->kf_ops); 2553 WARN_ON(cft->ss || cft->kf_ops);
2550 2554
2551 if (cft->seq_start) 2555 if (cft->seq_start)
2552 kf_ops = &cgroup_kf_ops; 2556 kf_ops = &cgroup_kf_ops;
2553 else 2557 else
2554 kf_ops = &cgroup_kf_single_ops; 2558 kf_ops = &cgroup_kf_single_ops;
2555 2559
2556 /* 2560 /*
2557 * Ugh... if @cft wants a custom max_write_len, we need to 2561 * Ugh... if @cft wants a custom max_write_len, we need to
2558 * make a copy of kf_ops to set its atomic_write_len. 2562 * make a copy of kf_ops to set its atomic_write_len.
2559 */ 2563 */
2560 if (cft->max_write_len && cft->max_write_len != PAGE_SIZE) { 2564 if (cft->max_write_len && cft->max_write_len != PAGE_SIZE) {
2561 kf_ops = kmemdup(kf_ops, sizeof(*kf_ops), GFP_KERNEL); 2565 kf_ops = kmemdup(kf_ops, sizeof(*kf_ops), GFP_KERNEL);
2562 if (!kf_ops) { 2566 if (!kf_ops) {
2563 cgroup_exit_cftypes(cfts); 2567 cgroup_exit_cftypes(cfts);
2564 return -ENOMEM; 2568 return -ENOMEM;
2565 } 2569 }
2566 kf_ops->atomic_write_len = cft->max_write_len; 2570 kf_ops->atomic_write_len = cft->max_write_len;
2567 } 2571 }
2568 2572
2569 cft->kf_ops = kf_ops; 2573 cft->kf_ops = kf_ops;
2570 cft->ss = ss; 2574 cft->ss = ss;
2571 } 2575 }
2572 2576
2573 return 0; 2577 return 0;
2574 } 2578 }
2575 2579
2576 static int cgroup_rm_cftypes_locked(struct cftype *cfts) 2580 static int cgroup_rm_cftypes_locked(struct cftype *cfts)
2577 { 2581 {
2578 lockdep_assert_held(&cgroup_tree_mutex); 2582 lockdep_assert_held(&cgroup_tree_mutex);
2579 2583
2580 if (!cfts || !cfts[0].ss) 2584 if (!cfts || !cfts[0].ss)
2581 return -ENOENT; 2585 return -ENOENT;
2582 2586
2583 list_del(&cfts->node); 2587 list_del(&cfts->node);
2584 cgroup_apply_cftypes(cfts, false); 2588 cgroup_apply_cftypes(cfts, false);
2585 cgroup_exit_cftypes(cfts); 2589 cgroup_exit_cftypes(cfts);
2586 return 0; 2590 return 0;
2587 } 2591 }
2588 2592
2589 /** 2593 /**
2590 * cgroup_rm_cftypes - remove an array of cftypes from a subsystem 2594 * cgroup_rm_cftypes - remove an array of cftypes from a subsystem
2591 * @cfts: zero-length name terminated array of cftypes 2595 * @cfts: zero-length name terminated array of cftypes
2592 * 2596 *
2593 * Unregister @cfts. Files described by @cfts are removed from all 2597 * Unregister @cfts. Files described by @cfts are removed from all
2594 * existing cgroups and all future cgroups won't have them either. This 2598 * existing cgroups and all future cgroups won't have them either. This
2595 * function can be called anytime whether @cfts' subsys is attached or not. 2599 * function can be called anytime whether @cfts' subsys is attached or not.
2596 * 2600 *
2597 * Returns 0 on successful unregistration, -ENOENT if @cfts is not 2601 * Returns 0 on successful unregistration, -ENOENT if @cfts is not
2598 * registered. 2602 * registered.
2599 */ 2603 */
2600 int cgroup_rm_cftypes(struct cftype *cfts) 2604 int cgroup_rm_cftypes(struct cftype *cfts)
2601 { 2605 {
2602 int ret; 2606 int ret;
2603 2607
2604 mutex_lock(&cgroup_tree_mutex); 2608 mutex_lock(&cgroup_tree_mutex);
2605 ret = cgroup_rm_cftypes_locked(cfts); 2609 ret = cgroup_rm_cftypes_locked(cfts);
2606 mutex_unlock(&cgroup_tree_mutex); 2610 mutex_unlock(&cgroup_tree_mutex);
2607 return ret; 2611 return ret;
2608 } 2612 }
2609 2613
2610 /** 2614 /**
2611 * cgroup_add_cftypes - add an array of cftypes to a subsystem 2615 * cgroup_add_cftypes - add an array of cftypes to a subsystem
2612 * @ss: target cgroup subsystem 2616 * @ss: target cgroup subsystem
2613 * @cfts: zero-length name terminated array of cftypes 2617 * @cfts: zero-length name terminated array of cftypes
2614 * 2618 *
2615 * Register @cfts to @ss. Files described by @cfts are created for all 2619 * Register @cfts to @ss. Files described by @cfts are created for all
2616 * existing cgroups to which @ss is attached and all future cgroups will 2620 * existing cgroups to which @ss is attached and all future cgroups will
2617 * have them too. This function can be called anytime whether @ss is 2621 * have them too. This function can be called anytime whether @ss is
2618 * attached or not. 2622 * attached or not.
2619 * 2623 *
2620 * Returns 0 on successful registration, -errno on failure. Note that this 2624 * Returns 0 on successful registration, -errno on failure. Note that this
2621 * function currently returns 0 as long as @cfts registration is successful 2625 * function currently returns 0 as long as @cfts registration is successful
2622 * even if some file creation attempts on existing cgroups fail. 2626 * even if some file creation attempts on existing cgroups fail.
2623 */ 2627 */
2624 int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) 2628 int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
2625 { 2629 {
2626 int ret; 2630 int ret;
2627 2631
2628 if (!cfts || cfts[0].name[0] == '\0') 2632 if (!cfts || cfts[0].name[0] == '\0')
2629 return 0; 2633 return 0;
2630 2634
2631 ret = cgroup_init_cftypes(ss, cfts); 2635 ret = cgroup_init_cftypes(ss, cfts);
2632 if (ret) 2636 if (ret)
2633 return ret; 2637 return ret;
2634 2638
2635 mutex_lock(&cgroup_tree_mutex); 2639 mutex_lock(&cgroup_tree_mutex);
2636 2640
2637 list_add_tail(&cfts->node, &ss->cfts); 2641 list_add_tail(&cfts->node, &ss->cfts);
2638 ret = cgroup_apply_cftypes(cfts, true); 2642 ret = cgroup_apply_cftypes(cfts, true);
2639 if (ret) 2643 if (ret)
2640 cgroup_rm_cftypes_locked(cfts); 2644 cgroup_rm_cftypes_locked(cfts);
2641 2645
2642 mutex_unlock(&cgroup_tree_mutex); 2646 mutex_unlock(&cgroup_tree_mutex);
2643 return ret; 2647 return ret;
2644 } 2648 }
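To illustrate the cftype interface that cgroup_add_cftypes() registers, here is a hedged sketch of a controller describing a single u64 knob; every demo_* identifier is hypothetical, and the stored value is a global placeholder rather than real per-cgroup state.

static u64 demo_limit;

/* read back via cgroup_seqfile_show() -> cft->read_u64() */
static u64 demo_limit_read(struct cgroup_subsys_state *css,
			   struct cftype *cft)
{
	return demo_limit;
}

/* updated via cgroup_file_write() -> cft->write_u64() */
static int demo_limit_write(struct cgroup_subsys_state *css,
			    struct cftype *cft, u64 val)
{
	demo_limit = val;
	return 0;
}

static struct cftype demo_files[] = {
	{
		.name = "demo.limit",
		.read_u64 = demo_limit_read,
		.write_u64 = demo_limit_write,
	},
	{ }	/* terminator: entry with a zero-length name */
};

/* typically invoked from the controller's init path:
 *	cgroup_add_cftypes(&demo_cgrp_subsys, demo_files);
 */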
2645 2649
2646 /** 2650 /**
2647 * cgroup_task_count - count the number of tasks in a cgroup. 2651 * cgroup_task_count - count the number of tasks in a cgroup.
2648 * @cgrp: the cgroup in question 2652 * @cgrp: the cgroup in question
2649 * 2653 *
2650 * Return the number of tasks in the cgroup. 2654 * Return the number of tasks in the cgroup.
2651 */ 2655 */
2652 static int cgroup_task_count(const struct cgroup *cgrp) 2656 static int cgroup_task_count(const struct cgroup *cgrp)
2653 { 2657 {
2654 int count = 0; 2658 int count = 0;
2655 struct cgrp_cset_link *link; 2659 struct cgrp_cset_link *link;
2656 2660
2657 down_read(&css_set_rwsem); 2661 down_read(&css_set_rwsem);
2658 list_for_each_entry(link, &cgrp->cset_links, cset_link) 2662 list_for_each_entry(link, &cgrp->cset_links, cset_link)
2659 count += atomic_read(&link->cset->refcount); 2663 count += atomic_read(&link->cset->refcount);
2660 up_read(&css_set_rwsem); 2664 up_read(&css_set_rwsem);
2661 return count; 2665 return count;
2662 } 2666 }
2663 2667
2664 /** 2668 /**
2665 * css_next_child - find the next child of a given css 2669 * css_next_child - find the next child of a given css
2666 * @pos_css: the current position (%NULL to initiate traversal) 2670 * @pos_css: the current position (%NULL to initiate traversal)
2667 * @parent_css: css whose children to walk 2671 * @parent_css: css whose children to walk
2668 * 2672 *
2669 * This function returns the next child of @parent_css and should be called 2673 * This function returns the next child of @parent_css and should be called
2670 * under either cgroup_mutex or RCU read lock. The only requirement is 2674 * under either cgroup_mutex or RCU read lock. The only requirement is
2671 * that @parent_css and @pos_css are accessible. The next sibling is 2675 * that @parent_css and @pos_css are accessible. The next sibling is
2672 * guaranteed to be returned regardless of their states. 2676 * guaranteed to be returned regardless of their states.
2673 */ 2677 */
2674 struct cgroup_subsys_state * 2678 struct cgroup_subsys_state *
2675 css_next_child(struct cgroup_subsys_state *pos_css, 2679 css_next_child(struct cgroup_subsys_state *pos_css,
2676 struct cgroup_subsys_state *parent_css) 2680 struct cgroup_subsys_state *parent_css)
2677 { 2681 {
2678 struct cgroup *pos = pos_css ? pos_css->cgroup : NULL; 2682 struct cgroup *pos = pos_css ? pos_css->cgroup : NULL;
2679 struct cgroup *cgrp = parent_css->cgroup; 2683 struct cgroup *cgrp = parent_css->cgroup;
2680 struct cgroup *next; 2684 struct cgroup *next;
2681 2685
2682 cgroup_assert_mutexes_or_rcu_locked(); 2686 cgroup_assert_mutexes_or_rcu_locked();
2683 2687
2684 /* 2688 /*
2685 * @pos could already have been removed. Once a cgroup is removed, 2689 * @pos could already have been removed. Once a cgroup is removed,
2686 * its ->sibling.next is no longer updated when its next sibling 2690 * its ->sibling.next is no longer updated when its next sibling
2687 * changes. As CGRP_DEAD assertion is serialized and happens 2691 * changes. As CGRP_DEAD assertion is serialized and happens
2688 * before the cgroup is taken off the ->sibling list, if we see it 2692 * before the cgroup is taken off the ->sibling list, if we see it
2689 * unasserted, it's guaranteed that the next sibling hasn't 2693 * unasserted, it's guaranteed that the next sibling hasn't
2690 * finished its grace period even if it's already removed, and thus 2694 * finished its grace period even if it's already removed, and thus
2691 * safe to dereference from this RCU critical section. If 2695 * safe to dereference from this RCU critical section. If
2692 * ->sibling.next is inaccessible, cgroup_is_dead() is guaranteed 2696 * ->sibling.next is inaccessible, cgroup_is_dead() is guaranteed
2693 * to be visible as %true here. 2697 * to be visible as %true here.
2694 * 2698 *
2695 * If @pos is dead, its next pointer can't be dereferenced; 2699 * If @pos is dead, its next pointer can't be dereferenced;
2696 * however, as each cgroup is given a monotonically increasing 2700 * however, as each cgroup is given a monotonically increasing
2697 * unique serial number and always appended to the sibling list, 2701 * unique serial number and always appended to the sibling list,
2698 * the next one can be found by walking the parent's children until 2702 * the next one can be found by walking the parent's children until
2699 * we see a cgroup with higher serial number than @pos's. While 2703 * we see a cgroup with higher serial number than @pos's. While
2700 * this path can be slower, it's taken only when either the current 2704 * this path can be slower, it's taken only when either the current
2701 * cgroup is removed or iteration and removal race. 2705 * cgroup is removed or iteration and removal race.
2702 */ 2706 */
2703 if (!pos) { 2707 if (!pos) {
2704 next = list_entry_rcu(cgrp->children.next, struct cgroup, sibling); 2708 next = list_entry_rcu(cgrp->children.next, struct cgroup, sibling);
2705 } else if (likely(!cgroup_is_dead(pos))) { 2709 } else if (likely(!cgroup_is_dead(pos))) {
2706 next = list_entry_rcu(pos->sibling.next, struct cgroup, sibling); 2710 next = list_entry_rcu(pos->sibling.next, struct cgroup, sibling);
2707 } else { 2711 } else {
2708 list_for_each_entry_rcu(next, &cgrp->children, sibling) 2712 list_for_each_entry_rcu(next, &cgrp->children, sibling)
2709 if (next->serial_nr > pos->serial_nr) 2713 if (next->serial_nr > pos->serial_nr)
2710 break; 2714 break;
2711 } 2715 }
2712 2716
2713 /* 2717 /*
2714 * @next, if not pointing to the head, can be dereferenced and is 2718 * @next, if not pointing to the head, can be dereferenced and is
2715 * the next sibling; however, it might have @ss disabled. If so, 2719 * the next sibling; however, it might have @ss disabled. If so,
2716 * fast-forward to the next enabled one. 2720 * fast-forward to the next enabled one.
2717 */ 2721 */
2718 while (&next->sibling != &cgrp->children) { 2722 while (&next->sibling != &cgrp->children) {
2719 struct cgroup_subsys_state *next_css = cgroup_css(next, parent_css->ss); 2723 struct cgroup_subsys_state *next_css = cgroup_css(next, parent_css->ss);
2720 2724
2721 if (next_css) 2725 if (next_css)
2722 return next_css; 2726 return next_css;
2723 next = list_entry_rcu(next->sibling.next, struct cgroup, sibling); 2727 next = list_entry_rcu(next->sibling.next, struct cgroup, sibling);
2724 } 2728 }
2725 return NULL; 2729 return NULL;
2726 } 2730 }
2727 2731
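As a usage note (an illustrative sketch, not part of this commit): css_next_child() is normally consumed through the css_for_each_child() iterator declared in include/linux/cgroup.h. A minimal example, assuming the surrounding file already includes linux/cgroup.h and the caller already holds @parent_css, needing only RCU protection for the walk:

/*
 * Hypothetical helper, for illustration only: count the direct
 * children of @parent_css.  The walk needs only the RCU read lock;
 * a returned child may not be online yet (or may already be dying),
 * so real users typically pin it with css_tryget() before doing
 * anything more involved with it.
 */
static int count_children(struct cgroup_subsys_state *parent_css)
{
	struct cgroup_subsys_state *child;
	int n = 0;

	rcu_read_lock();
	css_for_each_child(child, parent_css)
		n++;
	rcu_read_unlock();

	return n;
}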
2728 /** 2732 /**
2729 * css_next_descendant_pre - find the next descendant for pre-order walk 2733 * css_next_descendant_pre - find the next descendant for pre-order walk
2730 * @pos: the current position (%NULL to initiate traversal) 2734 * @pos: the current position (%NULL to initiate traversal)
2731 * @root: css whose descendants to walk 2735 * @root: css whose descendants to walk
2732 * 2736 *
2733 * To be used by css_for_each_descendant_pre(). Find the next descendant 2737 * To be used by css_for_each_descendant_pre(). Find the next descendant
2734 * to visit for pre-order traversal of @root's descendants. @root is 2738 * to visit for pre-order traversal of @root's descendants. @root is
2735 * included in the iteration and the first node to be visited. 2739 * included in the iteration and the first node to be visited.
2736 * 2740 *
2737 * While this function requires cgroup_mutex or RCU read locking, it 2741 * While this function requires cgroup_mutex or RCU read locking, it
2738 * doesn't require the whole traversal to be contained in a single critical 2742 * doesn't require the whole traversal to be contained in a single critical
2739 * section. This function will return the correct next descendant as long 2743 * section. This function will return the correct next descendant as long
2740 * as both @pos and @root are accessible and @pos is a descendant of @root. 2744 * as both @pos and @root are accessible and @pos is a descendant of @root.
2741 */ 2745 */
2742 struct cgroup_subsys_state * 2746 struct cgroup_subsys_state *
2743 css_next_descendant_pre(struct cgroup_subsys_state *pos, 2747 css_next_descendant_pre(struct cgroup_subsys_state *pos,
2744 struct cgroup_subsys_state *root) 2748 struct cgroup_subsys_state *root)
2745 { 2749 {
2746 struct cgroup_subsys_state *next; 2750 struct cgroup_subsys_state *next;
2747 2751
2748 cgroup_assert_mutexes_or_rcu_locked(); 2752 cgroup_assert_mutexes_or_rcu_locked();
2749 2753
2750 /* if first iteration, visit @root */ 2754 /* if first iteration, visit @root */
2751 if (!pos) 2755 if (!pos)
2752 return root; 2756 return root;
2753 2757
2754 /* visit the first child if exists */ 2758 /* visit the first child if exists */
2755 next = css_next_child(NULL, pos); 2759 next = css_next_child(NULL, pos);
2756 if (next) 2760 if (next)
2757 return next; 2761 return next;
2758 2762
2759 /* no child, visit my or the closest ancestor's next sibling */ 2763 /* no child, visit my or the closest ancestor's next sibling */
2760 while (pos != root) { 2764 while (pos != root) {
2761 next = css_next_child(pos, css_parent(pos)); 2765 next = css_next_child(pos, css_parent(pos));
2762 if (next) 2766 if (next)
2763 return next; 2767 return next;
2764 pos = css_parent(pos); 2768 pos = css_parent(pos);
2765 } 2769 }
2766 2770
2767 return NULL; 2771 return NULL;
2768 } 2772 }
2769 2773
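For reference (an illustrative sketch, not from this commit): pre-order walks are normally written with the css_for_each_descendant_pre() wrapper, which is built on css_next_descendant_pre() above and therefore visits @root first and every parent before its children. propagate_config() is a made-up stand-in for whatever per-css work the caller does:

	struct cgroup_subsys_state *pos;

	rcu_read_lock();
	css_for_each_descendant_pre(pos, root)
		propagate_config(pos);	/* hypothetical top-down update */
	rcu_read_unlock();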
2770 /** 2774 /**
2771 * css_rightmost_descendant - return the rightmost descendant of a css 2775 * css_rightmost_descendant - return the rightmost descendant of a css
2772 * @pos: css of interest 2776 * @pos: css of interest
2773 * 2777 *
2774 * Return the rightmost descendant of @pos. If there's no descendant, @pos 2778 * Return the rightmost descendant of @pos. If there's no descendant, @pos
2775 * is returned. This can be used during pre-order traversal to skip 2779 * is returned. This can be used during pre-order traversal to skip
2776 * subtree of @pos. 2780 * subtree of @pos.
2777 * 2781 *
2778 * While this function requires cgroup_mutex or RCU read locking, it 2782 * While this function requires cgroup_mutex or RCU read locking, it
2779 * doesn't require the whole traversal to be contained in a single critical 2783 * doesn't require the whole traversal to be contained in a single critical
2780 * section. This function will return the correct rightmost descendant as 2784 * section. This function will return the correct rightmost descendant as
2781 * long as @pos is accessible. 2785 * long as @pos is accessible.
2782 */ 2786 */
2783 struct cgroup_subsys_state * 2787 struct cgroup_subsys_state *
2784 css_rightmost_descendant(struct cgroup_subsys_state *pos) 2788 css_rightmost_descendant(struct cgroup_subsys_state *pos)
2785 { 2789 {
2786 struct cgroup_subsys_state *last, *tmp; 2790 struct cgroup_subsys_state *last, *tmp;
2787 2791
2788 cgroup_assert_mutexes_or_rcu_locked(); 2792 cgroup_assert_mutexes_or_rcu_locked();
2789 2793
2790 do { 2794 do {
2791 last = pos; 2795 last = pos;
2792 /* ->prev isn't RCU safe, walk ->next till the end */ 2796 /* ->prev isn't RCU safe, walk ->next till the end */
2793 pos = NULL; 2797 pos = NULL;
2794 css_for_each_child(tmp, last) 2798 css_for_each_child(tmp, last)
2795 pos = tmp; 2799 pos = tmp;
2796 } while (pos); 2800 } while (pos);
2797 2801
2798 return last; 2802 return last;
2799 } 2803 }
2800 2804
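The comment above notes that css_rightmost_descendant() can be used to prune a subtree during a pre-order walk; a hedged sketch of that pattern follows, where subtree_is_interesting() and handle_css() are made up for illustration:

	struct cgroup_subsys_state *pos;

	rcu_read_lock();
	css_for_each_descendant_pre(pos, root) {
		if (!subtree_is_interesting(pos)) {
			/*
			 * Jump @pos to its rightmost descendant so the
			 * next step of the pre-order walk continues with
			 * @pos's next sibling (or an ancestor's sibling)
			 * instead of descending into this subtree.
			 */
			pos = css_rightmost_descendant(pos);
			continue;
		}
		handle_css(pos);
	}
	rcu_read_unlock();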
2801 static struct cgroup_subsys_state * 2805 static struct cgroup_subsys_state *
2802 css_leftmost_descendant(struct cgroup_subsys_state *pos) 2806 css_leftmost_descendant(struct cgroup_subsys_state *pos)
2803 { 2807 {
2804 struct cgroup_subsys_state *last; 2808 struct cgroup_subsys_state *last;
2805 2809
2806 do { 2810 do {
2807 last = pos; 2811 last = pos;
2808 pos = css_next_child(NULL, pos); 2812 pos = css_next_child(NULL, pos);
2809 } while (pos); 2813 } while (pos);
2810 2814
2811 return last; 2815 return last;
2812 } 2816 }
2813 2817
2814 /** 2818 /**
2815 * css_next_descendant_post - find the next descendant for post-order walk 2819 * css_next_descendant_post - find the next descendant for post-order walk
2816 * @pos: the current position (%NULL to initiate traversal) 2820 * @pos: the current position (%NULL to initiate traversal)
2817 * @root: css whose descendants to walk 2821 * @root: css whose descendants to walk
2818 * 2822 *
2819 * To be used by css_for_each_descendant_post(). Find the next descendant 2823 * To be used by css_for_each_descendant_post(). Find the next descendant
2820 * to visit for post-order traversal of @root's descendants. @root is 2824 * to visit for post-order traversal of @root's descendants. @root is
2821 * included in the iteration and the last node to be visited. 2825 * included in the iteration and the last node to be visited.
2822 * 2826 *
2823 * While this function requires cgroup_mutex or RCU read locking, it 2827 * While this function requires cgroup_mutex or RCU read locking, it
2824 * doesn't require the whole traversal to be contained in a single critical 2828 * doesn't require the whole traversal to be contained in a single critical
2825 * section. This function will return the correct next descendant as long 2829 * section. This function will return the correct next descendant as long
2826 * as both @pos and @root are accessible and @pos is a descendant of 2830 * as both @pos and @root are accessible and @pos is a descendant of
2827 * @root. 2831 * @root.
2828 */ 2832 */
2829 struct cgroup_subsys_state * 2833 struct cgroup_subsys_state *
2830 css_next_descendant_post(struct cgroup_subsys_state *pos, 2834 css_next_descendant_post(struct cgroup_subsys_state *pos,
2831 struct cgroup_subsys_state *root) 2835 struct cgroup_subsys_state *root)
2832 { 2836 {
2833 struct cgroup_subsys_state *next; 2837 struct cgroup_subsys_state *next;
2834 2838
2835 cgroup_assert_mutexes_or_rcu_locked(); 2839 cgroup_assert_mutexes_or_rcu_locked();
2836 2840
2837 /* if first iteration, visit leftmost descendant which may be @root */ 2841 /* if first iteration, visit leftmost descendant which may be @root */
2838 if (!pos) 2842 if (!pos)
2839 return css_leftmost_descendant(root); 2843 return css_leftmost_descendant(root);
2840 2844
2841 /* if we visited @root, we're done */ 2845 /* if we visited @root, we're done */
2842 if (pos == root) 2846 if (pos == root)
2843 return NULL; 2847 return NULL;
2844 2848
2845 /* if there's an unvisited sibling, visit its leftmost descendant */ 2849 /* if there's an unvisited sibling, visit its leftmost descendant */
2846 next = css_next_child(pos, css_parent(pos)); 2850 next = css_next_child(pos, css_parent(pos));
2847 if (next) 2851 if (next)
2848 return css_leftmost_descendant(next); 2852 return css_leftmost_descendant(next);
2849 2853
2850 /* no sibling left, visit parent */ 2854 /* no sibling left, visit parent */
2851 return css_parent(pos); 2855 return css_parent(pos);
2852 } 2856 }
2853 2857
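Correspondingly for post-order (again only an illustrative sketch): css_for_each_descendant_post() visits every descendant before its parent and visits @root last, which suits bottom-up aggregation or teardown. read_local_stat() is a hypothetical per-css counter:

	struct cgroup_subsys_state *pos;
	u64 total = 0;

	rcu_read_lock();
	css_for_each_descendant_post(pos, root)
		total += read_local_stat(pos);
	rcu_read_unlock();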
2854 /** 2858 /**
2855 * css_advance_task_iter - advance a task iterator to the next css_set 2859 * css_advance_task_iter - advance a task iterator to the next css_set
2856 * @it: the iterator to advance 2860 * @it: the iterator to advance
2857 * 2861 *
2858 * Advance @it to the next css_set to walk. 2862 * Advance @it to the next css_set to walk.
2859 */ 2863 */
2860 static void css_advance_task_iter(struct css_task_iter *it) 2864 static void css_advance_task_iter(struct css_task_iter *it)
2861 { 2865 {
2862 struct list_head *l = it->cset_pos; 2866 struct list_head *l = it->cset_pos;
2863 struct cgrp_cset_link *link; 2867 struct cgrp_cset_link *link;
2864 struct css_set *cset; 2868 struct css_set *cset;
2865 2869
2866 /* Advance to the next non-empty css_set */ 2870 /* Advance to the next non-empty css_set */
2867 do { 2871 do {
2868 l = l->next; 2872 l = l->next;
2869 if (l == it->cset_head) { 2873 if (l == it->cset_head) {
2870 it->cset_pos = NULL; 2874 it->cset_pos = NULL;
2871 return; 2875 return;
2872 } 2876 }
2873 2877
2874 if (it->ss) { 2878 if (it->ss) {
2875 cset = container_of(l, struct css_set, 2879 cset = container_of(l, struct css_set,
2876 e_cset_node[it->ss->id]); 2880 e_cset_node[it->ss->id]);
2877 } else { 2881 } else {
2878 link = list_entry(l, struct cgrp_cset_link, cset_link); 2882 link = list_entry(l, struct cgrp_cset_link, cset_link);
2879 cset = link->cset; 2883 cset = link->cset;
2880 } 2884 }
2881 } while (list_empty(&cset->tasks) && list_empty(&cset->mg_tasks)); 2885 } while (list_empty(&cset->tasks) && list_empty(&cset->mg_tasks));
2882 2886
2883 it->cset_pos = l; 2887 it->cset_pos = l;
2884 2888
2885 if (!list_empty(&cset->tasks)) 2889 if (!list_empty(&cset->tasks))
2886 it->task_pos = cset->tasks.next; 2890 it->task_pos = cset->tasks.next;
2887 else 2891 else
2888 it->task_pos = cset->mg_tasks.next; 2892 it->task_pos = cset->mg_tasks.next;
2889 2893
2890 it->tasks_head = &cset->tasks; 2894 it->tasks_head = &cset->tasks;
2891 it->mg_tasks_head = &cset->mg_tasks; 2895 it->mg_tasks_head = &cset->mg_tasks;
2892 } 2896 }
2893 2897
2894 /** 2898 /**
2895 * css_task_iter_start - initiate task iteration 2899 * css_task_iter_start - initiate task iteration
2896 * @css: the css to walk tasks of 2900 * @css: the css to walk tasks of
2897 * @it: the task iterator to use 2901 * @it: the task iterator to use
2898 * 2902 *
2899 * Initiate iteration through the tasks of @css. The caller can call 2903 * Initiate iteration through the tasks of @css. The caller can call
2900 * css_task_iter_next() to walk through the tasks until the function 2904 * css_task_iter_next() to walk through the tasks until the function
2901 * returns NULL. On completion of iteration, css_task_iter_end() must be 2905 * returns NULL. On completion of iteration, css_task_iter_end() must be
2902 * called. 2906 * called.
2903 * 2907 *
2904 * Note that this function acquires a lock which is released when the 2908 * Note that this function acquires a lock which is released when the
2905 * iteration finishes. The caller can't sleep while iteration is in 2909 * iteration finishes. The caller can't sleep while iteration is in
2906 * progress. 2910 * progress.
2907 */ 2911 */
2908 void css_task_iter_start(struct cgroup_subsys_state *css, 2912 void css_task_iter_start(struct cgroup_subsys_state *css,
2909 struct css_task_iter *it) 2913 struct css_task_iter *it)
2910 __acquires(css_set_rwsem) 2914 __acquires(css_set_rwsem)
2911 { 2915 {
2912 /* no one should try to iterate before mounting cgroups */ 2916 /* no one should try to iterate before mounting cgroups */
2913 WARN_ON_ONCE(!use_task_css_set_links); 2917 WARN_ON_ONCE(!use_task_css_set_links);
2914 2918
2915 down_read(&css_set_rwsem); 2919 down_read(&css_set_rwsem);
2916 2920
2917 it->ss = css->ss; 2921 it->ss = css->ss;
2918 2922
2919 if (it->ss) 2923 if (it->ss)
2920 it->cset_pos = &css->cgroup->e_csets[css->ss->id]; 2924 it->cset_pos = &css->cgroup->e_csets[css->ss->id];
2921 else 2925 else
2922 it->cset_pos = &css->cgroup->cset_links; 2926 it->cset_pos = &css->cgroup->cset_links;
2923 2927
2924 it->cset_head = it->cset_pos; 2928 it->cset_head = it->cset_pos;
2925 2929
2926 css_advance_task_iter(it); 2930 css_advance_task_iter(it);
2927 } 2931 }
2928 2932
2929 /** 2933 /**
2930 * css_task_iter_next - return the next task for the iterator 2934 * css_task_iter_next - return the next task for the iterator
2931 * @it: the task iterator being iterated 2935 * @it: the task iterator being iterated
2932 * 2936 *
2933 * The "next" function for task iteration. @it should have been 2937 * The "next" function for task iteration. @it should have been
2934 * initialized via css_task_iter_start(). Returns NULL when the iteration 2938 * initialized via css_task_iter_start(). Returns NULL when the iteration
2935 * reaches the end. 2939 * reaches the end.
2936 */ 2940 */
2937 struct task_struct *css_task_iter_next(struct css_task_iter *it) 2941 struct task_struct *css_task_iter_next(struct css_task_iter *it)
2938 { 2942 {
2939 struct task_struct *res; 2943 struct task_struct *res;
2940 struct list_head *l = it->task_pos; 2944 struct list_head *l = it->task_pos;
2941 2945
2942 /* If the iterator's cset position is NULL, we have no tasks */ 2946 /* If the iterator's cset position is NULL, we have no tasks */
2943 if (!it->cset_pos) 2947 if (!it->cset_pos)
2944 return NULL; 2948 return NULL;
2945 res = list_entry(l, struct task_struct, cg_list); 2949 res = list_entry(l, struct task_struct, cg_list);
2946 2950
2947 /* 2951 /*
2948 * Advance iterator to find next entry. cset->tasks is consumed 2952 * Advance iterator to find next entry. cset->tasks is consumed
2949 * first and then ->mg_tasks. After ->mg_tasks, we move onto the 2953 * first and then ->mg_tasks. After ->mg_tasks, we move onto the
2950 * next cset. 2954 * next cset.
2951 */ 2955 */
2952 l = l->next; 2956 l = l->next;
2953 2957
2954 if (l == it->tasks_head) 2958 if (l == it->tasks_head)
2955 l = it->mg_tasks_head->next; 2959 l = it->mg_tasks_head->next;
2956 2960
2957 if (l == it->mg_tasks_head) 2961 if (l == it->mg_tasks_head)
2958 css_advance_task_iter(it); 2962 css_advance_task_iter(it);
2959 else 2963 else
2960 it->task_pos = l; 2964 it->task_pos = l;
2961 2965
2962 return res; 2966 return res;
2963 } 2967 }
2964 2968
2965 /** 2969 /**
2966 * css_task_iter_end - finish task iteration 2970 * css_task_iter_end - finish task iteration
2967 * @it: the task iterator to finish 2971 * @it: the task iterator to finish
2968 * 2972 *
2969 * Finish task iteration started by css_task_iter_start(). 2973 * Finish task iteration started by css_task_iter_start().
2970 */ 2974 */
2971 void css_task_iter_end(struct css_task_iter *it) 2975 void css_task_iter_end(struct css_task_iter *it)
2972 __releases(css_set_rwsem) 2976 __releases(css_set_rwsem)
2973 { 2977 {
2974 up_read(&css_set_rwsem); 2978 up_read(&css_set_rwsem);
2975 } 2979 }
2976 2980
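The three functions above are always used as a start/next/end triple; a minimal sketch (not from this commit), where @css is whatever cgroup_subsys_state the caller wants to scan. As the comment above css_task_iter_start() says, the caller must not sleep while the iteration is in progress:

	struct css_task_iter it;
	struct task_struct *task;

	css_task_iter_start(css, &it);
	while ((task = css_task_iter_next(&it)))
		pr_debug("pid %d is in this css\n", task_pid_nr(task));
	css_task_iter_end(&it);

cgroup_transfer_tasks() and pidlist_array_load() below follow the same pattern.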
2977 /** 2981 /**
2978 * cgroup_transfer_tasks - move tasks from one cgroup to another 2982 * cgroup_transfer_tasks - move tasks from one cgroup to another
2979 * @to: cgroup to which the tasks will be moved 2983 * @to: cgroup to which the tasks will be moved
2980 * @from: cgroup in which the tasks currently reside 2984 * @from: cgroup in which the tasks currently reside
2981 * 2985 *
2982 * Locking rules between cgroup_post_fork() and the migration path 2986 * Locking rules between cgroup_post_fork() and the migration path
2983 * guarantee that, if a task is forking while being migrated, the new child 2987 * guarantee that, if a task is forking while being migrated, the new child
2984 * is guaranteed to be either visible in the source cgroup after the 2988 * is guaranteed to be either visible in the source cgroup after the
2985 * parent's migration is complete or put into the target cgroup. No task 2989 * parent's migration is complete or put into the target cgroup. No task
2986 * can slip out of migration through forking. 2990 * can slip out of migration through forking.
2987 */ 2991 */
2988 int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from) 2992 int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
2989 { 2993 {
2990 LIST_HEAD(preloaded_csets); 2994 LIST_HEAD(preloaded_csets);
2991 struct cgrp_cset_link *link; 2995 struct cgrp_cset_link *link;
2992 struct css_task_iter it; 2996 struct css_task_iter it;
2993 struct task_struct *task; 2997 struct task_struct *task;
2994 int ret; 2998 int ret;
2995 2999
2996 mutex_lock(&cgroup_mutex); 3000 mutex_lock(&cgroup_mutex);
2997 3001
2998 /* all tasks in @from are being moved, all csets are source */ 3002 /* all tasks in @from are being moved, all csets are source */
2999 down_read(&css_set_rwsem); 3003 down_read(&css_set_rwsem);
3000 list_for_each_entry(link, &from->cset_links, cset_link) 3004 list_for_each_entry(link, &from->cset_links, cset_link)
3001 cgroup_migrate_add_src(link->cset, to, &preloaded_csets); 3005 cgroup_migrate_add_src(link->cset, to, &preloaded_csets);
3002 up_read(&css_set_rwsem); 3006 up_read(&css_set_rwsem);
3003 3007
3004 ret = cgroup_migrate_prepare_dst(to, &preloaded_csets); 3008 ret = cgroup_migrate_prepare_dst(to, &preloaded_csets);
3005 if (ret) 3009 if (ret)
3006 goto out_err; 3010 goto out_err;
3007 3011
3008 /* 3012 /*
3009 * Migrate tasks one-by-one until @from is empty. This fails iff 3013 * Migrate tasks one-by-one until @from is empty. This fails iff
3010 * ->can_attach() fails. 3014 * ->can_attach() fails.
3011 */ 3015 */
3012 do { 3016 do {
3013 css_task_iter_start(&from->dummy_css, &it); 3017 css_task_iter_start(&from->dummy_css, &it);
3014 task = css_task_iter_next(&it); 3018 task = css_task_iter_next(&it);
3015 if (task) 3019 if (task)
3016 get_task_struct(task); 3020 get_task_struct(task);
3017 css_task_iter_end(&it); 3021 css_task_iter_end(&it);
3018 3022
3019 if (task) { 3023 if (task) {
3020 ret = cgroup_migrate(to, task, false); 3024 ret = cgroup_migrate(to, task, false);
3021 put_task_struct(task); 3025 put_task_struct(task);
3022 } 3026 }
3023 } while (task && !ret); 3027 } while (task && !ret);
3024 out_err: 3028 out_err:
3025 cgroup_migrate_finish(&preloaded_csets); 3029 cgroup_migrate_finish(&preloaded_csets);
3026 mutex_unlock(&cgroup_mutex); 3030 mutex_unlock(&cgroup_mutex);
3027 return ret; 3031 return ret;
3028 } 3032 }
3029 3033
3030 /* 3034 /*
3031 * Stuff for reading the 'tasks'/'procs' files. 3035 * Stuff for reading the 'tasks'/'procs' files.
3032 * 3036 *
3033 * Reading this file can return large amounts of data if a cgroup has 3037 * Reading this file can return large amounts of data if a cgroup has
3034 * *lots* of attached tasks. So it may need several calls to read(), 3038 * *lots* of attached tasks. So it may need several calls to read(),
3035 * but we cannot guarantee that the information we produce is correct 3039 * but we cannot guarantee that the information we produce is correct
3036 * unless we produce it entirely atomically. 3040 * unless we produce it entirely atomically.
3037 * 3041 *
3038 */ 3042 */
3039 3043
3040 /* which pidlist file are we talking about? */ 3044 /* which pidlist file are we talking about? */
3041 enum cgroup_filetype { 3045 enum cgroup_filetype {
3042 CGROUP_FILE_PROCS, 3046 CGROUP_FILE_PROCS,
3043 CGROUP_FILE_TASKS, 3047 CGROUP_FILE_TASKS,
3044 }; 3048 };
3045 3049
3046 /* 3050 /*
3047 * A pidlist is a list of pids that virtually represents the contents of one 3051 * A pidlist is a list of pids that virtually represents the contents of one
3048 * of the cgroup files ("procs" or "tasks"). We keep a list of such pidlists, 3052 * of the cgroup files ("procs" or "tasks"). We keep a list of such pidlists,
3049 * a pair (one each for procs, tasks) for each pid namespace that's relevant 3053 * a pair (one each for procs, tasks) for each pid namespace that's relevant
3050 * to the cgroup. 3054 * to the cgroup.
3051 */ 3055 */
3052 struct cgroup_pidlist { 3056 struct cgroup_pidlist {
3053 /* 3057 /*
3054 * used to find which pidlist is wanted. doesn't change as long as 3058 * used to find which pidlist is wanted. doesn't change as long as
3055 * this particular list stays in the list. 3059 * this particular list stays in the list.
3056 */ 3060 */
3057 struct { enum cgroup_filetype type; struct pid_namespace *ns; } key; 3061 struct { enum cgroup_filetype type; struct pid_namespace *ns; } key;
3058 /* array of xids */ 3062 /* array of xids */
3059 pid_t *list; 3063 pid_t *list;
3060 /* how many elements the above list has */ 3064 /* how many elements the above list has */
3061 int length; 3065 int length;
3062 /* each of these stored in a list by its cgroup */ 3066 /* each of these stored in a list by its cgroup */
3063 struct list_head links; 3067 struct list_head links;
3064 /* pointer to the cgroup we belong to, for list removal purposes */ 3068 /* pointer to the cgroup we belong to, for list removal purposes */
3065 struct cgroup *owner; 3069 struct cgroup *owner;
3066 /* for delayed destruction */ 3070 /* for delayed destruction */
3067 struct delayed_work destroy_dwork; 3071 struct delayed_work destroy_dwork;
3068 }; 3072 };
3069 3073
3070 /* 3074 /*
3071 * The following two functions "fix" the issue where there are more pids 3075 * The following two functions "fix" the issue where there are more pids
3072 * than kmalloc will give memory for; in such cases, we use vmalloc/vfree. 3076 * than kmalloc will give memory for; in such cases, we use vmalloc/vfree.
3073 * TODO: replace with a kernel-wide solution to this problem 3077 * TODO: replace with a kernel-wide solution to this problem
3074 */ 3078 */
3075 #define PIDLIST_TOO_LARGE(c) ((c) * sizeof(pid_t) > (PAGE_SIZE * 2)) 3079 #define PIDLIST_TOO_LARGE(c) ((c) * sizeof(pid_t) > (PAGE_SIZE * 2))
3076 static void *pidlist_allocate(int count) 3080 static void *pidlist_allocate(int count)
3077 { 3081 {
3078 if (PIDLIST_TOO_LARGE(count)) 3082 if (PIDLIST_TOO_LARGE(count))
3079 return vmalloc(count * sizeof(pid_t)); 3083 return vmalloc(count * sizeof(pid_t));
3080 else 3084 else
3081 return kmalloc(count * sizeof(pid_t), GFP_KERNEL); 3085 return kmalloc(count * sizeof(pid_t), GFP_KERNEL);
3082 } 3086 }
3083 3087
3084 static void pidlist_free(void *p) 3088 static void pidlist_free(void *p)
3085 { 3089 {
3086 if (is_vmalloc_addr(p)) 3090 if (is_vmalloc_addr(p))
3087 vfree(p); 3091 vfree(p);
3088 else 3092 else
3089 kfree(p); 3093 kfree(p);
3090 } 3094 }
3091 3095
3092 /* 3096 /*
3093 * Used to destroy all pidlists lingering while waiting for the destroy timer. None 3097 * Used to destroy all pidlists lingering while waiting for the destroy timer. None
3094 * should be left afterwards. 3098 * should be left afterwards.
3095 */ 3099 */
3096 static void cgroup_pidlist_destroy_all(struct cgroup *cgrp) 3100 static void cgroup_pidlist_destroy_all(struct cgroup *cgrp)
3097 { 3101 {
3098 struct cgroup_pidlist *l, *tmp_l; 3102 struct cgroup_pidlist *l, *tmp_l;
3099 3103
3100 mutex_lock(&cgrp->pidlist_mutex); 3104 mutex_lock(&cgrp->pidlist_mutex);
3101 list_for_each_entry_safe(l, tmp_l, &cgrp->pidlists, links) 3105 list_for_each_entry_safe(l, tmp_l, &cgrp->pidlists, links)
3102 mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork, 0); 3106 mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork, 0);
3103 mutex_unlock(&cgrp->pidlist_mutex); 3107 mutex_unlock(&cgrp->pidlist_mutex);
3104 3108
3105 flush_workqueue(cgroup_pidlist_destroy_wq); 3109 flush_workqueue(cgroup_pidlist_destroy_wq);
3106 BUG_ON(!list_empty(&cgrp->pidlists)); 3110 BUG_ON(!list_empty(&cgrp->pidlists));
3107 } 3111 }
3108 3112
3109 static void cgroup_pidlist_destroy_work_fn(struct work_struct *work) 3113 static void cgroup_pidlist_destroy_work_fn(struct work_struct *work)
3110 { 3114 {
3111 struct delayed_work *dwork = to_delayed_work(work); 3115 struct delayed_work *dwork = to_delayed_work(work);
3112 struct cgroup_pidlist *l = container_of(dwork, struct cgroup_pidlist, 3116 struct cgroup_pidlist *l = container_of(dwork, struct cgroup_pidlist,
3113 destroy_dwork); 3117 destroy_dwork);
3114 struct cgroup_pidlist *tofree = NULL; 3118 struct cgroup_pidlist *tofree = NULL;
3115 3119
3116 mutex_lock(&l->owner->pidlist_mutex); 3120 mutex_lock(&l->owner->pidlist_mutex);
3117 3121
3118 /* 3122 /*
3119 * Destroy iff we didn't get queued again. The state won't change 3123 * Destroy iff we didn't get queued again. The state won't change
3120 * as destroy_dwork can only be queued while locked. 3124 * as destroy_dwork can only be queued while locked.
3121 */ 3125 */
3122 if (!delayed_work_pending(dwork)) { 3126 if (!delayed_work_pending(dwork)) {
3123 list_del(&l->links); 3127 list_del(&l->links);
3124 pidlist_free(l->list); 3128 pidlist_free(l->list);
3125 put_pid_ns(l->key.ns); 3129 put_pid_ns(l->key.ns);
3126 tofree = l; 3130 tofree = l;
3127 } 3131 }
3128 3132
3129 mutex_unlock(&l->owner->pidlist_mutex); 3133 mutex_unlock(&l->owner->pidlist_mutex);
3130 kfree(tofree); 3134 kfree(tofree);
3131 } 3135 }
3132 3136
3133 /* 3137 /*
3134 * pidlist_uniq - given a kmalloc()ed list, strip out all duplicate entries 3138 * pidlist_uniq - given a kmalloc()ed list, strip out all duplicate entries
3135 * Returns the number of unique elements. 3139 * Returns the number of unique elements.
3136 */ 3140 */
3137 static int pidlist_uniq(pid_t *list, int length) 3141 static int pidlist_uniq(pid_t *list, int length)
3138 { 3142 {
3139 int src, dest = 1; 3143 int src, dest = 1;
3140 3144
3141 /* 3145 /*
3142 * we presume the 0th element is unique, so src starts at 1. trivial 3146 * we presume the 0th element is unique, so src starts at 1. trivial
3143 * edge cases first; no work needs to be done for either 3147 * edge cases first; no work needs to be done for either
3144 */ 3148 */
3145 if (length == 0 || length == 1) 3149 if (length == 0 || length == 1)
3146 return length; 3150 return length;
3147 /* src and dest walk down the list; dest counts unique elements */ 3151 /* src and dest walk down the list; dest counts unique elements */
3148 for (src = 1; src < length; src++) { 3152 for (src = 1; src < length; src++) {
3149 /* find next unique element */ 3153 /* find next unique element */
3150 while (list[src] == list[src-1]) { 3154 while (list[src] == list[src-1]) {
3151 src++; 3155 src++;
3152 if (src == length) 3156 if (src == length)
3153 goto after; 3157 goto after;
3154 } 3158 }
3155 /* dest always points to where the next unique element goes */ 3159 /* dest always points to where the next unique element goes */
3156 list[dest] = list[src]; 3160 list[dest] = list[src];
3157 dest++; 3161 dest++;
3158 } 3162 }
3159 after: 3163 after:
3160 return dest; 3164 return dest;
3161 } 3165 }
3162 3166
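As a quick worked example with made-up values: for the already-sorted input {3, 3, 5, 7, 7, 7} of length 6, the loop copies 5 into slot 1 and 7 into slot 2 and the function returns 3, so the caller treats {3, 5, 7} as the deduplicated list and simply ignores the remaining slots.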
3163 /* 3167 /*
3164 * The two pid files - tasks and cgroup.procs - guaranteed that the result 3168 * The two pid files - tasks and cgroup.procs - guaranteed that the result
3165 * is sorted, which forced this whole pidlist fiasco. As pid order is 3169 * is sorted, which forced this whole pidlist fiasco. As pid order is
3166 * different per namespace, each namespace needs a differently sorted list, 3170 * different per namespace, each namespace needs a differently sorted list,
3167 * making it impossible to use, for example, a single rbtree of member tasks 3171 * making it impossible to use, for example, a single rbtree of member tasks
3168 * sorted by task pointer. As pidlists can be fairly large, allocating one 3172 * sorted by task pointer. As pidlists can be fairly large, allocating one
3169 * per open file is dangerous, so cgroup had to implement shared pool of 3173 * per open file is dangerous, so cgroup had to implement shared pool of
3170 * pidlists keyed by cgroup and namespace. 3174 * pidlists keyed by cgroup and namespace.
3171 * 3175 *
3172 * All this extra complexity was caused by the original implementation 3176 * All this extra complexity was caused by the original implementation
3173 * committing to an entirely unnecessary property. In the long term, we 3177 * committing to an entirely unnecessary property. In the long term, we
3174 * want to do away with it. Explicitly scramble sort order if 3178 * want to do away with it. Explicitly scramble sort order if
3175 * sane_behavior so that no such expectation exists in the new interface. 3179 * sane_behavior so that no such expectation exists in the new interface.
3176 * 3180 *
3177 * Scrambling is done by swapping every two consecutive bits, which is 3181 * Scrambling is done by swapping every two consecutive bits, which is
3178 * a non-identity one-to-one mapping that disturbs sort order sufficiently. 3182 * a non-identity one-to-one mapping that disturbs sort order sufficiently.
3179 */ 3183 */
3180 static pid_t pid_fry(pid_t pid) 3184 static pid_t pid_fry(pid_t pid)
3181 { 3185 {
3182 unsigned a = pid & 0x55555555; 3186 unsigned a = pid & 0x55555555;
3183 unsigned b = pid & 0xAAAAAAAA; 3187 unsigned b = pid & 0xAAAAAAAA;
3184 3188
3185 return (a << 1) | (b >> 1); 3189 return (a << 1) | (b >> 1);
3186 } 3190 }
3187 3191
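To make the scrambling concrete with an arbitrary example value: pid 6 is 0b0110, so the even-position bits give a = 0b0100 and the odd-position bits give b = 0b0010, and (a << 1) | (b >> 1) = 0b1001 = 9. Feeding 9 back through pid_fry() yields 6 again: swapping adjacent bit pairs is its own inverse, so the mapping never collides while still breaking the numeric ordering userspace might otherwise come to rely on.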
3188 static pid_t cgroup_pid_fry(struct cgroup *cgrp, pid_t pid) 3192 static pid_t cgroup_pid_fry(struct cgroup *cgrp, pid_t pid)
3189 { 3193 {
3190 if (cgroup_sane_behavior(cgrp)) 3194 if (cgroup_sane_behavior(cgrp))
3191 return pid_fry(pid); 3195 return pid_fry(pid);
3192 else 3196 else
3193 return pid; 3197 return pid;
3194 } 3198 }
3195 3199
3196 static int cmppid(const void *a, const void *b) 3200 static int cmppid(const void *a, const void *b)
3197 { 3201 {
3198 return *(pid_t *)a - *(pid_t *)b; 3202 return *(pid_t *)a - *(pid_t *)b;
3199 } 3203 }
3200 3204
3201 static int fried_cmppid(const void *a, const void *b) 3205 static int fried_cmppid(const void *a, const void *b)
3202 { 3206 {
3203 return pid_fry(*(pid_t *)a) - pid_fry(*(pid_t *)b); 3207 return pid_fry(*(pid_t *)a) - pid_fry(*(pid_t *)b);
3204 } 3208 }
3205 3209
3206 static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp, 3210 static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp,
3207 enum cgroup_filetype type) 3211 enum cgroup_filetype type)
3208 { 3212 {
3209 struct cgroup_pidlist *l; 3213 struct cgroup_pidlist *l;
3210 /* don't need task_nsproxy() if we're looking at ourself */ 3214 /* don't need task_nsproxy() if we're looking at ourself */
3211 struct pid_namespace *ns = task_active_pid_ns(current); 3215 struct pid_namespace *ns = task_active_pid_ns(current);
3212 3216
3213 lockdep_assert_held(&cgrp->pidlist_mutex); 3217 lockdep_assert_held(&cgrp->pidlist_mutex);
3214 3218
3215 list_for_each_entry(l, &cgrp->pidlists, links) 3219 list_for_each_entry(l, &cgrp->pidlists, links)
3216 if (l->key.type == type && l->key.ns == ns) 3220 if (l->key.type == type && l->key.ns == ns)
3217 return l; 3221 return l;
3218 return NULL; 3222 return NULL;
3219 } 3223 }
3220 3224
3221 /* 3225 /*
3222 * find the appropriate pidlist for our purpose (given procs vs tasks) 3226 * find the appropriate pidlist for our purpose (given procs vs tasks)
3223 * the caller must hold cgrp->pidlist_mutex. Returns the matching 3227 * the caller must hold cgrp->pidlist_mutex. Returns the matching
3224 * pidlist, creating a new one if necessary, or NULL if we're out of 3228 * pidlist, creating a new one if necessary, or NULL if we're out of
3225 * memory. 3229 * memory.
3226 */ 3230 */
3227 static struct cgroup_pidlist *cgroup_pidlist_find_create(struct cgroup *cgrp, 3231 static struct cgroup_pidlist *cgroup_pidlist_find_create(struct cgroup *cgrp,
3228 enum cgroup_filetype type) 3232 enum cgroup_filetype type)
3229 { 3233 {
3230 struct cgroup_pidlist *l; 3234 struct cgroup_pidlist *l;
3231 3235
3232 lockdep_assert_held(&cgrp->pidlist_mutex); 3236 lockdep_assert_held(&cgrp->pidlist_mutex);
3233 3237
3234 l = cgroup_pidlist_find(cgrp, type); 3238 l = cgroup_pidlist_find(cgrp, type);
3235 if (l) 3239 if (l)
3236 return l; 3240 return l;
3237 3241
3238 /* entry not found; create a new one */ 3242 /* entry not found; create a new one */
3239 l = kzalloc(sizeof(struct cgroup_pidlist), GFP_KERNEL); 3243 l = kzalloc(sizeof(struct cgroup_pidlist), GFP_KERNEL);
3240 if (!l) 3244 if (!l)
3241 return l; 3245 return l;
3242 3246
3243 INIT_DELAYED_WORK(&l->destroy_dwork, cgroup_pidlist_destroy_work_fn); 3247 INIT_DELAYED_WORK(&l->destroy_dwork, cgroup_pidlist_destroy_work_fn);
3244 l->key.type = type; 3248 l->key.type = type;
3245 /* don't need task_nsproxy() if we're looking at ourself */ 3249 /* don't need task_nsproxy() if we're looking at ourself */
3246 l->key.ns = get_pid_ns(task_active_pid_ns(current)); 3250 l->key.ns = get_pid_ns(task_active_pid_ns(current));
3247 l->owner = cgrp; 3251 l->owner = cgrp;
3248 list_add(&l->links, &cgrp->pidlists); 3252 list_add(&l->links, &cgrp->pidlists);
3249 return l; 3253 return l;
3250 } 3254 }
3251 3255
3252 /* 3256 /*
3253 * Load a cgroup's pidarray with either procs' tgids or tasks' pids 3257 * Load a cgroup's pidarray with either procs' tgids or tasks' pids
3254 */ 3258 */
3255 static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type, 3259 static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,
3256 struct cgroup_pidlist **lp) 3260 struct cgroup_pidlist **lp)
3257 { 3261 {
3258 pid_t *array; 3262 pid_t *array;
3259 int length; 3263 int length;
3260 int pid, n = 0; /* used for populating the array */ 3264 int pid, n = 0; /* used for populating the array */
3261 struct css_task_iter it; 3265 struct css_task_iter it;
3262 struct task_struct *tsk; 3266 struct task_struct *tsk;
3263 struct cgroup_pidlist *l; 3267 struct cgroup_pidlist *l;
3264 3268
3265 lockdep_assert_held(&cgrp->pidlist_mutex); 3269 lockdep_assert_held(&cgrp->pidlist_mutex);
3266 3270
3267 /* 3271 /*
3268 * If cgroup gets more users after we read count, we won't have 3272 * If cgroup gets more users after we read count, we won't have
3269 * enough space - tough. This race is indistinguishable to the 3273 * enough space - tough. This race is indistinguishable to the
3270 * caller from the case that the additional cgroup users didn't 3274 * caller from the case that the additional cgroup users didn't
3271 * show up until sometime later on. 3275 * show up until sometime later on.
3272 */ 3276 */
3273 length = cgroup_task_count(cgrp); 3277 length = cgroup_task_count(cgrp);
3274 array = pidlist_allocate(length); 3278 array = pidlist_allocate(length);
3275 if (!array) 3279 if (!array)
3276 return -ENOMEM; 3280 return -ENOMEM;
3277 /* now, populate the array */ 3281 /* now, populate the array */
3278 css_task_iter_start(&cgrp->dummy_css, &it); 3282 css_task_iter_start(&cgrp->dummy_css, &it);
3279 while ((tsk = css_task_iter_next(&it))) { 3283 while ((tsk = css_task_iter_next(&it))) {
3280 if (unlikely(n == length)) 3284 if (unlikely(n == length))
3281 break; 3285 break;
3282 /* get tgid or pid for procs or tasks file respectively */ 3286 /* get tgid or pid for procs or tasks file respectively */
3283 if (type == CGROUP_FILE_PROCS) 3287 if (type == CGROUP_FILE_PROCS)
3284 pid = task_tgid_vnr(tsk); 3288 pid = task_tgid_vnr(tsk);
3285 else 3289 else
3286 pid = task_pid_vnr(tsk); 3290 pid = task_pid_vnr(tsk);
3287 if (pid > 0) /* make sure to only use valid results */ 3291 if (pid > 0) /* make sure to only use valid results */
3288 array[n++] = pid; 3292 array[n++] = pid;
3289 } 3293 }
3290 css_task_iter_end(&it); 3294 css_task_iter_end(&it);
3291 length = n; 3295 length = n;
3292 /* now sort & (if procs) strip out duplicates */ 3296 /* now sort & (if procs) strip out duplicates */
3293 if (cgroup_sane_behavior(cgrp)) 3297 if (cgroup_sane_behavior(cgrp))
3294 sort(array, length, sizeof(pid_t), fried_cmppid, NULL); 3298 sort(array, length, sizeof(pid_t), fried_cmppid, NULL);
3295 else 3299 else
3296 sort(array, length, sizeof(pid_t), cmppid, NULL); 3300 sort(array, length, sizeof(pid_t), cmppid, NULL);
3297 if (type == CGROUP_FILE_PROCS) 3301 if (type == CGROUP_FILE_PROCS)
3298 length = pidlist_uniq(array, length); 3302 length = pidlist_uniq(array, length);
3299 3303
3300 l = cgroup_pidlist_find_create(cgrp, type); 3304 l = cgroup_pidlist_find_create(cgrp, type);
3301 if (!l) { 3305 if (!l) {
3302 mutex_unlock(&cgrp->pidlist_mutex); 3306 mutex_unlock(&cgrp->pidlist_mutex);
3303 pidlist_free(array); 3307 pidlist_free(array);
3304 return -ENOMEM; 3308 return -ENOMEM;
3305 } 3309 }
3306 3310
3307 /* store array, freeing old if necessary */ 3311 /* store array, freeing old if necessary */
3308 pidlist_free(l->list); 3312 pidlist_free(l->list);
3309 l->list = array; 3313 l->list = array;
3310 l->length = length; 3314 l->length = length;
3311 *lp = l; 3315 *lp = l;
3312 return 0; 3316 return 0;
3313 } 3317 }
3314 3318
3315 /** 3319 /**
3316 * cgroupstats_build - build and fill cgroupstats 3320 * cgroupstats_build - build and fill cgroupstats
3317 * @stats: cgroupstats to fill information into 3321 * @stats: cgroupstats to fill information into
3318 * @dentry: A dentry entry belonging to the cgroup for which stats have 3322 * @dentry: A dentry entry belonging to the cgroup for which stats have
3319 * been requested. 3323 * been requested.
3320 * 3324 *
3321 * Build and fill cgroupstats so that taskstats can export it to user 3325 * Build and fill cgroupstats so that taskstats can export it to user
3322 * space. 3326 * space.
3323 */ 3327 */
3324 int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry) 3328 int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry)
3325 { 3329 {
3326 struct kernfs_node *kn = kernfs_node_from_dentry(dentry); 3330 struct kernfs_node *kn = kernfs_node_from_dentry(dentry);
3327 struct cgroup *cgrp; 3331 struct cgroup *cgrp;
3328 struct css_task_iter it; 3332 struct css_task_iter it;
3329 struct task_struct *tsk; 3333 struct task_struct *tsk;
3330 3334
3331 /* it should be kernfs_node belonging to cgroupfs and is a directory */ 3335 /* it should be kernfs_node belonging to cgroupfs and is a directory */
3332 if (dentry->d_sb->s_type != &cgroup_fs_type || !kn || 3336 if (dentry->d_sb->s_type != &cgroup_fs_type || !kn ||
3333 kernfs_type(kn) != KERNFS_DIR) 3337 kernfs_type(kn) != KERNFS_DIR)
3334 return -EINVAL; 3338 return -EINVAL;
3335 3339
3336 mutex_lock(&cgroup_mutex); 3340 mutex_lock(&cgroup_mutex);
3337 3341
3338 /* 3342 /*
3339 * We aren't being called from kernfs and there's no guarantee on 3343 * We aren't being called from kernfs and there's no guarantee on
3340 * @kn->priv's validity. For this and css_tryget_from_dir(), 3344 * @kn->priv's validity. For this and css_tryget_from_dir(),
3341 * @kn->priv is RCU safe. Let's do the RCU dancing. 3345 * @kn->priv is RCU safe. Let's do the RCU dancing.
3342 */ 3346 */
3343 rcu_read_lock(); 3347 rcu_read_lock();
3344 cgrp = rcu_dereference(kn->priv); 3348 cgrp = rcu_dereference(kn->priv);
3345 if (!cgrp || cgroup_is_dead(cgrp)) { 3349 if (!cgrp || cgroup_is_dead(cgrp)) {
3346 rcu_read_unlock(); 3350 rcu_read_unlock();
3347 mutex_unlock(&cgroup_mutex); 3351 mutex_unlock(&cgroup_mutex);
3348 return -ENOENT; 3352 return -ENOENT;
3349 } 3353 }
3350 rcu_read_unlock(); 3354 rcu_read_unlock();
3351 3355
3352 css_task_iter_start(&cgrp->dummy_css, &it); 3356 css_task_iter_start(&cgrp->dummy_css, &it);
3353 while ((tsk = css_task_iter_next(&it))) { 3357 while ((tsk = css_task_iter_next(&it))) {
3354 switch (tsk->state) { 3358 switch (tsk->state) {
3355 case TASK_RUNNING: 3359 case TASK_RUNNING:
3356 stats->nr_running++; 3360 stats->nr_running++;
3357 break; 3361 break;
3358 case TASK_INTERRUPTIBLE: 3362 case TASK_INTERRUPTIBLE:
3359 stats->nr_sleeping++; 3363 stats->nr_sleeping++;
3360 break; 3364 break;
3361 case TASK_UNINTERRUPTIBLE: 3365 case TASK_UNINTERRUPTIBLE:
3362 stats->nr_uninterruptible++; 3366 stats->nr_uninterruptible++;
3363 break; 3367 break;
3364 case TASK_STOPPED: 3368 case TASK_STOPPED:
3365 stats->nr_stopped++; 3369 stats->nr_stopped++;
3366 break; 3370 break;
3367 default: 3371 default:
3368 if (delayacct_is_task_waiting_on_io(tsk)) 3372 if (delayacct_is_task_waiting_on_io(tsk))
3369 stats->nr_io_wait++; 3373 stats->nr_io_wait++;
3370 break; 3374 break;
3371 } 3375 }
3372 } 3376 }
3373 css_task_iter_end(&it); 3377 css_task_iter_end(&it);
3374 3378
3375 mutex_unlock(&cgroup_mutex); 3379 mutex_unlock(&cgroup_mutex);
3376 return 0; 3380 return 0;
3377 } 3381 }
3378 3382
3379 3383
3380 /* 3384 /*
3381 * seq_file methods for the tasks/procs files. The seq_file position is the 3385 * seq_file methods for the tasks/procs files. The seq_file position is the
3382 * next pid to display; the seq_file iterator is a pointer to the pid 3386 * next pid to display; the seq_file iterator is a pointer to the pid
3383 * in the pidlist's ->list array. 3387 * in the pidlist's ->list array.
3384 */ 3388 */
3385 3389
3386 static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos) 3390 static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos)
3387 { 3391 {
3388 /* 3392 /*
3389 * Initially we receive a position value that corresponds to 3393 * Initially we receive a position value that corresponds to
3390 * one more than the last pid shown (or 0 on the first call or 3394 * one more than the last pid shown (or 0 on the first call or
3391 * after a seek to the start). Use a binary-search to find the 3395 * after a seek to the start). Use a binary-search to find the
3392 * next pid to display, if any 3396 * next pid to display, if any
3393 */ 3397 */
3394 struct kernfs_open_file *of = s->private; 3398 struct kernfs_open_file *of = s->private;
3395 struct cgroup *cgrp = seq_css(s)->cgroup; 3399 struct cgroup *cgrp = seq_css(s)->cgroup;
3396 struct cgroup_pidlist *l; 3400 struct cgroup_pidlist *l;
3397 enum cgroup_filetype type = seq_cft(s)->private; 3401 enum cgroup_filetype type = seq_cft(s)->private;
3398 int index = 0, pid = *pos; 3402 int index = 0, pid = *pos;
3399 int *iter, ret; 3403 int *iter, ret;
3400 3404
3401 mutex_lock(&cgrp->pidlist_mutex); 3405 mutex_lock(&cgrp->pidlist_mutex);
3402 3406
3403 /* 3407 /*
3404 * !NULL @of->priv indicates that this isn't the first start() 3408 * !NULL @of->priv indicates that this isn't the first start()
3405 * after open. If the matching pidlist is around, we can use that. 3409 * after open. If the matching pidlist is around, we can use that.
3406 * Look for it. Note that @of->priv can't be used directly. It 3410 * Look for it. Note that @of->priv can't be used directly. It
3407 * could already have been destroyed. 3411 * could already have been destroyed.
3408 */ 3412 */
3409 if (of->priv) 3413 if (of->priv)
3410 of->priv = cgroup_pidlist_find(cgrp, type); 3414 of->priv = cgroup_pidlist_find(cgrp, type);
3411 3415
3412 /* 3416 /*
3413 * Either this is the first start() after open or the matching 3417 * Either this is the first start() after open or the matching
3414 * pidlist has been destroyed in between. Create a new one. 3418 * pidlist has been destroyed in between. Create a new one.
3415 */ 3419 */
3416 if (!of->priv) { 3420 if (!of->priv) {
3417 ret = pidlist_array_load(cgrp, type, 3421 ret = pidlist_array_load(cgrp, type,
3418 (struct cgroup_pidlist **)&of->priv); 3422 (struct cgroup_pidlist **)&of->priv);
3419 if (ret) 3423 if (ret)
3420 return ERR_PTR(ret); 3424 return ERR_PTR(ret);
3421 } 3425 }
3422 l = of->priv; 3426 l = of->priv;
3423 3427
3424 if (pid) { 3428 if (pid) {
3425 int end = l->length; 3429 int end = l->length;
3426 3430
3427 while (index < end) { 3431 while (index < end) {
3428 int mid = (index + end) / 2; 3432 int mid = (index + end) / 2;
3429 if (cgroup_pid_fry(cgrp, l->list[mid]) == pid) { 3433 if (cgroup_pid_fry(cgrp, l->list[mid]) == pid) {
3430 index = mid; 3434 index = mid;
3431 break; 3435 break;
3432 } else if (cgroup_pid_fry(cgrp, l->list[mid]) <= pid) 3436 } else if (cgroup_pid_fry(cgrp, l->list[mid]) <= pid)
3433 index = mid + 1; 3437 index = mid + 1;
3434 else 3438 else
3435 end = mid; 3439 end = mid;
3436 } 3440 }
3437 } 3441 }
3438 /* If we're off the end of the array, we're done */ 3442 /* If we're off the end of the array, we're done */
3439 if (index >= l->length) 3443 if (index >= l->length)
3440 return NULL; 3444 return NULL;
3441 /* Update the abstract position to be the actual pid that we found */ 3445 /* Update the abstract position to be the actual pid that we found */
3442 iter = l->list + index; 3446 iter = l->list + index;
3443 *pos = cgroup_pid_fry(cgrp, *iter); 3447 *pos = cgroup_pid_fry(cgrp, *iter);
3444 return iter; 3448 return iter;
3445 } 3449 }
3446 3450
3447 static void cgroup_pidlist_stop(struct seq_file *s, void *v) 3451 static void cgroup_pidlist_stop(struct seq_file *s, void *v)
3448 { 3452 {
3449 struct kernfs_open_file *of = s->private; 3453 struct kernfs_open_file *of = s->private;
3450 struct cgroup_pidlist *l = of->priv; 3454 struct cgroup_pidlist *l = of->priv;
3451 3455
3452 if (l) 3456 if (l)
3453 mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork, 3457 mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork,
3454 CGROUP_PIDLIST_DESTROY_DELAY); 3458 CGROUP_PIDLIST_DESTROY_DELAY);
3455 mutex_unlock(&seq_css(s)->cgroup->pidlist_mutex); 3459 mutex_unlock(&seq_css(s)->cgroup->pidlist_mutex);
3456 } 3460 }
3457 3461
3458 static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos) 3462 static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos)
3459 { 3463 {
3460 struct kernfs_open_file *of = s->private; 3464 struct kernfs_open_file *of = s->private;
3461 struct cgroup_pidlist *l = of->priv; 3465 struct cgroup_pidlist *l = of->priv;
3462 pid_t *p = v; 3466 pid_t *p = v;
3463 pid_t *end = l->list + l->length; 3467 pid_t *end = l->list + l->length;
3464 /* 3468 /*
3465 * Advance to the next pid in the array. If this goes off the 3469 * Advance to the next pid in the array. If this goes off the
3466 * end, we're done 3470 * end, we're done
3467 */ 3471 */
3468 p++; 3472 p++;
3469 if (p >= end) { 3473 if (p >= end) {
3470 return NULL; 3474 return NULL;
3471 } else { 3475 } else {
3472 *pos = cgroup_pid_fry(seq_css(s)->cgroup, *p); 3476 *pos = cgroup_pid_fry(seq_css(s)->cgroup, *p);
3473 return p; 3477 return p;
3474 } 3478 }
3475 } 3479 }
3476 3480
3477 static int cgroup_pidlist_show(struct seq_file *s, void *v) 3481 static int cgroup_pidlist_show(struct seq_file *s, void *v)
3478 { 3482 {
3479 return seq_printf(s, "%d\n", *(int *)v); 3483 return seq_printf(s, "%d\n", *(int *)v);
3480 } 3484 }
3481 3485
3482 /* 3486 /*
3483 * seq_operations functions for iterating on pidlists through seq_file - 3487 * seq_operations functions for iterating on pidlists through seq_file -
3484 * independent of whether it's tasks or procs 3488 * independent of whether it's tasks or procs
3485 */ 3489 */
3486 static const struct seq_operations cgroup_pidlist_seq_operations = { 3490 static const struct seq_operations cgroup_pidlist_seq_operations = {
3487 .start = cgroup_pidlist_start, 3491 .start = cgroup_pidlist_start,
3488 .stop = cgroup_pidlist_stop, 3492 .stop = cgroup_pidlist_stop,
3489 .next = cgroup_pidlist_next, 3493 .next = cgroup_pidlist_next,
3490 .show = cgroup_pidlist_show, 3494 .show = cgroup_pidlist_show,
3491 }; 3495 };
3492 3496
3493 static u64 cgroup_read_notify_on_release(struct cgroup_subsys_state *css, 3497 static u64 cgroup_read_notify_on_release(struct cgroup_subsys_state *css,
3494 struct cftype *cft) 3498 struct cftype *cft)
3495 { 3499 {
3496 return notify_on_release(css->cgroup); 3500 return notify_on_release(css->cgroup);
3497 } 3501 }
3498 3502
3499 static int cgroup_write_notify_on_release(struct cgroup_subsys_state *css, 3503 static int cgroup_write_notify_on_release(struct cgroup_subsys_state *css,
3500 struct cftype *cft, u64 val) 3504 struct cftype *cft, u64 val)
3501 { 3505 {
3502 clear_bit(CGRP_RELEASABLE, &css->cgroup->flags); 3506 clear_bit(CGRP_RELEASABLE, &css->cgroup->flags);
3503 if (val) 3507 if (val)
3504 set_bit(CGRP_NOTIFY_ON_RELEASE, &css->cgroup->flags); 3508 set_bit(CGRP_NOTIFY_ON_RELEASE, &css->cgroup->flags);
3505 else 3509 else
3506 clear_bit(CGRP_NOTIFY_ON_RELEASE, &css->cgroup->flags); 3510 clear_bit(CGRP_NOTIFY_ON_RELEASE, &css->cgroup->flags);
3507 return 0; 3511 return 0;
3508 } 3512 }
3509 3513
3510 static u64 cgroup_clone_children_read(struct cgroup_subsys_state *css, 3514 static u64 cgroup_clone_children_read(struct cgroup_subsys_state *css,
3511 struct cftype *cft) 3515 struct cftype *cft)
3512 { 3516 {
3513 return test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags); 3517 return test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags);
3514 } 3518 }
3515 3519
3516 static int cgroup_clone_children_write(struct cgroup_subsys_state *css, 3520 static int cgroup_clone_children_write(struct cgroup_subsys_state *css,
3517 struct cftype *cft, u64 val) 3521 struct cftype *cft, u64 val)
3518 { 3522 {
3519 if (val) 3523 if (val)
3520 set_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags); 3524 set_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags);
3521 else 3525 else
3522 clear_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags); 3526 clear_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags);
3523 return 0; 3527 return 0;
3524 } 3528 }
3525 3529
3526 static struct cftype cgroup_base_files[] = { 3530 static struct cftype cgroup_base_files[] = {
3527 { 3531 {
3528 .name = "cgroup.procs", 3532 .name = "cgroup.procs",
3529 .seq_start = cgroup_pidlist_start, 3533 .seq_start = cgroup_pidlist_start,
3530 .seq_next = cgroup_pidlist_next, 3534 .seq_next = cgroup_pidlist_next,
3531 .seq_stop = cgroup_pidlist_stop, 3535 .seq_stop = cgroup_pidlist_stop,
3532 .seq_show = cgroup_pidlist_show, 3536 .seq_show = cgroup_pidlist_show,
3533 .private = CGROUP_FILE_PROCS, 3537 .private = CGROUP_FILE_PROCS,
3534 .write_u64 = cgroup_procs_write, 3538 .write_u64 = cgroup_procs_write,
3535 .mode = S_IRUGO | S_IWUSR, 3539 .mode = S_IRUGO | S_IWUSR,
3536 }, 3540 },
3537 { 3541 {
3538 .name = "cgroup.clone_children", 3542 .name = "cgroup.clone_children",
3539 .flags = CFTYPE_INSANE, 3543 .flags = CFTYPE_INSANE,
3540 .read_u64 = cgroup_clone_children_read, 3544 .read_u64 = cgroup_clone_children_read,
3541 .write_u64 = cgroup_clone_children_write, 3545 .write_u64 = cgroup_clone_children_write,
3542 }, 3546 },
3543 { 3547 {
3544 .name = "cgroup.sane_behavior", 3548 .name = "cgroup.sane_behavior",
3545 .flags = CFTYPE_ONLY_ON_ROOT, 3549 .flags = CFTYPE_ONLY_ON_ROOT,
3546 .seq_show = cgroup_sane_behavior_show, 3550 .seq_show = cgroup_sane_behavior_show,
3547 }, 3551 },
3548 3552
3549 /* 3553 /*
3550 * Historical crazy stuff. These don't have "cgroup." prefix and 3554 * Historical crazy stuff. These don't have "cgroup." prefix and
3551 * don't exist if sane_behavior. If you're depending on these, be 3555 * don't exist if sane_behavior. If you're depending on these, be
3552 * prepared to be burned. 3556 * prepared to be burned.
3553 */ 3557 */
3554 { 3558 {
3555 .name = "tasks", 3559 .name = "tasks",
3556 .flags = CFTYPE_INSANE, /* use "procs" instead */ 3560 .flags = CFTYPE_INSANE, /* use "procs" instead */
3557 .seq_start = cgroup_pidlist_start, 3561 .seq_start = cgroup_pidlist_start,
3558 .seq_next = cgroup_pidlist_next, 3562 .seq_next = cgroup_pidlist_next,
3559 .seq_stop = cgroup_pidlist_stop, 3563 .seq_stop = cgroup_pidlist_stop,
3560 .seq_show = cgroup_pidlist_show, 3564 .seq_show = cgroup_pidlist_show,
3561 .private = CGROUP_FILE_TASKS, 3565 .private = CGROUP_FILE_TASKS,
3562 .write_u64 = cgroup_tasks_write, 3566 .write_u64 = cgroup_tasks_write,
3563 .mode = S_IRUGO | S_IWUSR, 3567 .mode = S_IRUGO | S_IWUSR,
3564 }, 3568 },
3565 { 3569 {
3566 .name = "notify_on_release", 3570 .name = "notify_on_release",
3567 .flags = CFTYPE_INSANE, 3571 .flags = CFTYPE_INSANE,
3568 .read_u64 = cgroup_read_notify_on_release, 3572 .read_u64 = cgroup_read_notify_on_release,
3569 .write_u64 = cgroup_write_notify_on_release, 3573 .write_u64 = cgroup_write_notify_on_release,
3570 }, 3574 },
3571 { 3575 {
3572 .name = "release_agent", 3576 .name = "release_agent",
3573 .flags = CFTYPE_INSANE | CFTYPE_ONLY_ON_ROOT, 3577 .flags = CFTYPE_INSANE | CFTYPE_ONLY_ON_ROOT,
3574 .seq_show = cgroup_release_agent_show, 3578 .seq_show = cgroup_release_agent_show,
3575 .write_string = cgroup_release_agent_write, 3579 .write_string = cgroup_release_agent_write,
3576 .max_write_len = PATH_MAX - 1, 3580 .max_write_len = PATH_MAX - 1,
3577 }, 3581 },
3578 { } /* terminate */ 3582 { } /* terminate */
3579 }; 3583 };
3580 3584
3581 /** 3585 /**
3582 * cgroup_populate_dir - create subsys files in a cgroup directory 3586 * cgroup_populate_dir - create subsys files in a cgroup directory
3583 * @cgrp: target cgroup 3587 * @cgrp: target cgroup
3584 * @subsys_mask: mask of the subsystem ids whose files should be added 3588 * @subsys_mask: mask of the subsystem ids whose files should be added
3585 * 3589 *
3586 * On failure, no file is added. 3590 * On failure, no file is added.
3587 */ 3591 */
3588 static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask) 3592 static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask)
3589 { 3593 {
3590 struct cgroup_subsys *ss; 3594 struct cgroup_subsys *ss;
3591 int i, ret = 0; 3595 int i, ret = 0;
3592 3596
3593 /* process cftsets of each subsystem */ 3597 /* process cftsets of each subsystem */
3594 for_each_subsys(ss, i) { 3598 for_each_subsys(ss, i) {
3595 struct cftype *cfts; 3599 struct cftype *cfts;
3596 3600
3597 if (!test_bit(i, &subsys_mask)) 3601 if (!test_bit(i, &subsys_mask))
3598 continue; 3602 continue;
3599 3603
3600 list_for_each_entry(cfts, &ss->cfts, node) { 3604 list_for_each_entry(cfts, &ss->cfts, node) {
3601 ret = cgroup_addrm_files(cgrp, cfts, true); 3605 ret = cgroup_addrm_files(cgrp, cfts, true);
3602 if (ret < 0) 3606 if (ret < 0)
3603 goto err; 3607 goto err;
3604 } 3608 }
3605 } 3609 }
3606 return 0; 3610 return 0;
3607 err: 3611 err:
3608 cgroup_clear_dir(cgrp, subsys_mask); 3612 cgroup_clear_dir(cgrp, subsys_mask);
3609 return ret; 3613 return ret;
3610 } 3614 }
3611 3615
3612 /* 3616 /*
3613 * css destruction is four-stage process. 3617 * css destruction is four-stage process.
3614 * 3618 *
3615 * 1. Destruction starts. Killing of the percpu_ref is initiated. 3619 * 1. Destruction starts. Killing of the percpu_ref is initiated.
3616 * Implemented in kill_css(). 3620 * Implemented in kill_css().
3617 * 3621 *
3618 * 2. When the percpu_ref is confirmed to be visible as killed on all CPUs 3622 * 2. When the percpu_ref is confirmed to be visible as killed on all CPUs
3619 * and thus css_tryget() is guaranteed to fail, the css can be offlined 3623 * and thus css_tryget() is guaranteed to fail, the css can be offlined
3620 * by invoking offline_css(). After offlining, the base ref is put. 3624 * by invoking offline_css(). After offlining, the base ref is put.
3621 * Implemented in css_killed_work_fn(). 3625 * Implemented in css_killed_work_fn().
3622 * 3626 *
3623 * 3. When the percpu_ref reaches zero, the only possible remaining 3627 * 3. When the percpu_ref reaches zero, the only possible remaining
3624 * accessors are inside RCU read sections. css_release() schedules the 3628 * accessors are inside RCU read sections. css_release() schedules the
3625 * RCU callback. 3629 * RCU callback.
3626 * 3630 *
3627 * 4. After the grace period, the css can be freed. Implemented in 3631 * 4. After the grace period, the css can be freed. Implemented in
3628 * css_free_work_fn(). 3632 * css_free_work_fn().
3629 * 3633 *
3630 * It is actually hairier because both step 2 and 4 require process context 3634 * It is actually hairier because both step 2 and 4 require process context
3631 * and thus involve punting to css->destroy_work adding two additional 3635 * and thus involve punting to css->destroy_work adding two additional
3632 * steps to the already complex sequence. 3636 * steps to the already complex sequence.
3633 */ 3637 */
3634 static void css_free_work_fn(struct work_struct *work) 3638 static void css_free_work_fn(struct work_struct *work)
3635 { 3639 {
3636 struct cgroup_subsys_state *css = 3640 struct cgroup_subsys_state *css =
3637 container_of(work, struct cgroup_subsys_state, destroy_work); 3641 container_of(work, struct cgroup_subsys_state, destroy_work);
3638 struct cgroup *cgrp = css->cgroup; 3642 struct cgroup *cgrp = css->cgroup;
3639 3643
3640 if (css->parent) 3644 if (css->parent)
3641 css_put(css->parent); 3645 css_put(css->parent);
3642 3646
3643 css->ss->css_free(css); 3647 css->ss->css_free(css);
3644 cgroup_put(cgrp); 3648 cgroup_put(cgrp);
3645 } 3649 }
3646 3650
3647 static void css_free_rcu_fn(struct rcu_head *rcu_head) 3651 static void css_free_rcu_fn(struct rcu_head *rcu_head)
3648 { 3652 {
3649 struct cgroup_subsys_state *css = 3653 struct cgroup_subsys_state *css =
3650 container_of(rcu_head, struct cgroup_subsys_state, rcu_head); 3654 container_of(rcu_head, struct cgroup_subsys_state, rcu_head);
3651 3655
3652 INIT_WORK(&css->destroy_work, css_free_work_fn); 3656 INIT_WORK(&css->destroy_work, css_free_work_fn);
3653 queue_work(cgroup_destroy_wq, &css->destroy_work); 3657 queue_work(cgroup_destroy_wq, &css->destroy_work);
3654 } 3658 }
3655 3659
3656 static void css_release(struct percpu_ref *ref) 3660 static void css_release(struct percpu_ref *ref)
3657 { 3661 {
3658 struct cgroup_subsys_state *css = 3662 struct cgroup_subsys_state *css =
3659 container_of(ref, struct cgroup_subsys_state, refcnt); 3663 container_of(ref, struct cgroup_subsys_state, refcnt);
3660 3664
3661 RCU_INIT_POINTER(css->cgroup->subsys[css->ss->id], NULL); 3665 RCU_INIT_POINTER(css->cgroup->subsys[css->ss->id], NULL);
3662 call_rcu(&css->rcu_head, css_free_rcu_fn); 3666 call_rcu(&css->rcu_head, css_free_rcu_fn);
3663 } 3667 }
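css_release() and css_free_rcu_fn() follow the standard kernel idiom for RCU-deferred freeing: unpublish the object, then hand its rcu_head to call_rcu() so the actual free happens only after a grace period, when no reader can still hold a pointer to it. A minimal sketch of the bare pattern, with a hypothetical struct foo standing in for the css:

    /* kernel-side sketch of the generic call_rcu() deferred-free pattern */
    struct foo {
        struct rcu_head rcu_head;
        /* ... payload read under rcu_read_lock() ... */
    };

    static void foo_free_rcu(struct rcu_head *head)
    {
        struct foo *f = container_of(head, struct foo, rcu_head);

        kfree(f);
    }

    static void foo_release(struct foo *f)
    {
        /* clear RCU-visible pointers to f first (cf. RCU_INIT_POINTER in
         * css_release() above), then defer the actual free */
        call_rcu(&f->rcu_head, foo_free_rcu);
    }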
3664 3668
3665 static void init_css(struct cgroup_subsys_state *css, struct cgroup_subsys *ss, 3669 static void init_css(struct cgroup_subsys_state *css, struct cgroup_subsys *ss,
3666 struct cgroup *cgrp) 3670 struct cgroup *cgrp)
3667 { 3671 {
3668 css->cgroup = cgrp; 3672 css->cgroup = cgrp;
3669 css->ss = ss; 3673 css->ss = ss;
3670 css->flags = 0; 3674 css->flags = 0;
3671 3675
3672 if (cgrp->parent) 3676 if (cgrp->parent)
3673 css->parent = cgroup_css(cgrp->parent, ss); 3677 css->parent = cgroup_css(cgrp->parent, ss);
3674 else 3678 else
3675 css->flags |= CSS_ROOT; 3679 css->flags |= CSS_ROOT;
3676 3680
3677 BUG_ON(cgroup_css(cgrp, ss)); 3681 BUG_ON(cgroup_css(cgrp, ss));
3678 } 3682 }
3679 3683
3680 /* invoke ->css_online() on a new CSS and mark it online if successful */ 3684 /* invoke ->css_online() on a new CSS and mark it online if successful */
3681 static int online_css(struct cgroup_subsys_state *css) 3685 static int online_css(struct cgroup_subsys_state *css)
3682 { 3686 {
3683 struct cgroup_subsys *ss = css->ss; 3687 struct cgroup_subsys *ss = css->ss;
3684 int ret = 0; 3688 int ret = 0;
3685 3689
3686 lockdep_assert_held(&cgroup_tree_mutex); 3690 lockdep_assert_held(&cgroup_tree_mutex);
3687 lockdep_assert_held(&cgroup_mutex); 3691 lockdep_assert_held(&cgroup_mutex);
3688 3692
3689 if (ss->css_online) 3693 if (ss->css_online)
3690 ret = ss->css_online(css); 3694 ret = ss->css_online(css);
3691 if (!ret) { 3695 if (!ret) {
3692 css->flags |= CSS_ONLINE; 3696 css->flags |= CSS_ONLINE;
3693 css->cgroup->nr_css++; 3697 css->cgroup->nr_css++;
3694 rcu_assign_pointer(css->cgroup->subsys[ss->id], css); 3698 rcu_assign_pointer(css->cgroup->subsys[ss->id], css);
3695 } 3699 }
3696 return ret; 3700 return ret;
3697 } 3701 }
3698 3702
3699 /* if the CSS is online, invoke ->css_offline() on it and mark it offline */ 3703 /* if the CSS is online, invoke ->css_offline() on it and mark it offline */
3700 static void offline_css(struct cgroup_subsys_state *css) 3704 static void offline_css(struct cgroup_subsys_state *css)
3701 { 3705 {
3702 struct cgroup_subsys *ss = css->ss; 3706 struct cgroup_subsys *ss = css->ss;
3703 3707
3704 lockdep_assert_held(&cgroup_tree_mutex); 3708 lockdep_assert_held(&cgroup_tree_mutex);
3705 lockdep_assert_held(&cgroup_mutex); 3709 lockdep_assert_held(&cgroup_mutex);
3706 3710
3707 if (!(css->flags & CSS_ONLINE)) 3711 if (!(css->flags & CSS_ONLINE))
3708 return; 3712 return;
3709 3713
3710 if (ss->css_offline) 3714 if (ss->css_offline)
3711 ss->css_offline(css); 3715 ss->css_offline(css);
3712 3716
3713 css->flags &= ~CSS_ONLINE; 3717 css->flags &= ~CSS_ONLINE;
3714 css->cgroup->nr_css--; 3718 css->cgroup->nr_css--;
3715 RCU_INIT_POINTER(css->cgroup->subsys[ss->id], NULL); 3719 RCU_INIT_POINTER(css->cgroup->subsys[ss->id], NULL);
3716 } 3720 }
3717 3721
3718 /** 3722 /**
3719 * create_css - create a cgroup_subsys_state 3723 * create_css - create a cgroup_subsys_state
3720 * @cgrp: the cgroup new css will be associated with 3724 * @cgrp: the cgroup new css will be associated with
3721 * @ss: the subsys of new css 3725 * @ss: the subsys of new css
3722 * 3726 *
3723 * Create a new css associated with @cgrp - @ss pair. On success, the new 3727 * Create a new css associated with @cgrp - @ss pair. On success, the new
3724 * css is online and installed in @cgrp with all interface files created. 3728 * css is online and installed in @cgrp with all interface files created.
3725 * Returns 0 on success, -errno on failure. 3729 * Returns 0 on success, -errno on failure.
3726 */ 3730 */
3727 static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss) 3731 static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss)
3728 { 3732 {
3729 struct cgroup *parent = cgrp->parent; 3733 struct cgroup *parent = cgrp->parent;
3730 struct cgroup_subsys_state *css; 3734 struct cgroup_subsys_state *css;
3731 int err; 3735 int err;
3732 3736
3733 lockdep_assert_held(&cgroup_mutex); 3737 lockdep_assert_held(&cgroup_mutex);
3734 3738
3735 css = ss->css_alloc(cgroup_css(parent, ss)); 3739 css = ss->css_alloc(cgroup_css(parent, ss));
3736 if (IS_ERR(css)) 3740 if (IS_ERR(css))
3737 return PTR_ERR(css); 3741 return PTR_ERR(css);
3738 3742
3739 err = percpu_ref_init(&css->refcnt, css_release); 3743 err = percpu_ref_init(&css->refcnt, css_release);
3740 if (err) 3744 if (err)
3741 goto err_free_css; 3745 goto err_free_css;
3742 3746
3743 init_css(css, ss, cgrp); 3747 init_css(css, ss, cgrp);
3744 3748
3745 err = cgroup_populate_dir(cgrp, 1 << ss->id); 3749 err = cgroup_populate_dir(cgrp, 1 << ss->id);
3746 if (err) 3750 if (err)
3747 goto err_free_percpu_ref; 3751 goto err_free_percpu_ref;
3748 3752
3749 err = online_css(css); 3753 err = online_css(css);
3750 if (err) 3754 if (err)
3751 goto err_clear_dir; 3755 goto err_clear_dir;
3752 3756
3753 cgroup_get(cgrp); 3757 cgroup_get(cgrp);
3754 css_get(css->parent); 3758 css_get(css->parent);
3755 3759
3756 if (ss->broken_hierarchy && !ss->warned_broken_hierarchy && 3760 if (ss->broken_hierarchy && !ss->warned_broken_hierarchy &&
3757 parent->parent) { 3761 parent->parent) {
3758 pr_warning("cgroup: %s (%d) created nested cgroup for controller \"%s\" which has incomplete hierarchy support. Nested cgroups may change behavior in the future.\n", 3762 pr_warning("cgroup: %s (%d) created nested cgroup for controller \"%s\" which has incomplete hierarchy support. Nested cgroups may change behavior in the future.\n",
3759 current->comm, current->pid, ss->name); 3763 current->comm, current->pid, ss->name);
3760 if (!strcmp(ss->name, "memory")) 3764 if (!strcmp(ss->name, "memory"))
3761 pr_warning("cgroup: \"memory\" requires setting use_hierarchy to 1 on the root.\n"); 3765 pr_warning("cgroup: \"memory\" requires setting use_hierarchy to 1 on the root.\n");
3762 ss->warned_broken_hierarchy = true; 3766 ss->warned_broken_hierarchy = true;
3763 } 3767 }
3764 3768
3765 return 0; 3769 return 0;
3766 3770
3767 err_clear_dir: 3771 err_clear_dir:
3768 cgroup_clear_dir(css->cgroup, 1 << css->ss->id); 3772 cgroup_clear_dir(css->cgroup, 1 << css->ss->id);
3769 err_free_percpu_ref: 3773 err_free_percpu_ref:
3770 percpu_ref_cancel_init(&css->refcnt); 3774 percpu_ref_cancel_init(&css->refcnt);
3771 err_free_css: 3775 err_free_css:
3772 ss->css_free(css); 3776 ss->css_free(css);
3773 return err; 3777 return err;
3774 } 3778 }
3775 3779
3776 /** 3780 /**
3777 * cgroup_create - create a cgroup 3781 * cgroup_create - create a cgroup
3778 * @parent: cgroup that will be parent of the new cgroup 3782 * @parent: cgroup that will be parent of the new cgroup
3779 * @name: name of the new cgroup 3783 * @name: name of the new cgroup
3780 * @mode: mode to set on new cgroup 3784 * @mode: mode to set on new cgroup
3781 */ 3785 */
3782 static long cgroup_create(struct cgroup *parent, const char *name, 3786 static long cgroup_create(struct cgroup *parent, const char *name,
3783 umode_t mode) 3787 umode_t mode)
3784 { 3788 {
3785 struct cgroup *cgrp; 3789 struct cgroup *cgrp;
3786 struct cgroup_root *root = parent->root; 3790 struct cgroup_root *root = parent->root;
3787 int ssid, err; 3791 int ssid, err;
3788 struct cgroup_subsys *ss; 3792 struct cgroup_subsys *ss;
3789 struct kernfs_node *kn; 3793 struct kernfs_node *kn;
3790 3794
3791 /* allocate the cgroup and its ID, 0 is reserved for the root */ 3795 /* allocate the cgroup and its ID, 0 is reserved for the root */
3792 cgrp = kzalloc(sizeof(*cgrp), GFP_KERNEL); 3796 cgrp = kzalloc(sizeof(*cgrp), GFP_KERNEL);
3793 if (!cgrp) 3797 if (!cgrp)
3794 return -ENOMEM; 3798 return -ENOMEM;
3795 3799
3796 mutex_lock(&cgroup_tree_mutex); 3800 mutex_lock(&cgroup_tree_mutex);
3797 3801
3798 /* 3802 /*
3799 * Only live parents can have children. Note that the liveliness 3803 * Only live parents can have children. Note that the liveliness
3800 * check isn't strictly necessary because cgroup_mkdir() and 3804 * check isn't strictly necessary because cgroup_mkdir() and
3801 * cgroup_rmdir() are fully synchronized by i_mutex; however, do it 3805 * cgroup_rmdir() are fully synchronized by i_mutex; however, do it
3802 * anyway so that locking is contained inside cgroup proper and we 3806 * anyway so that locking is contained inside cgroup proper and we
3803 * don't get nasty surprises if we ever grow another caller. 3807 * don't get nasty surprises if we ever grow another caller.
3804 */ 3808 */
3805 if (!cgroup_lock_live_group(parent)) { 3809 if (!cgroup_lock_live_group(parent)) {
3806 err = -ENODEV; 3810 err = -ENODEV;
3807 goto err_unlock_tree; 3811 goto err_unlock_tree;
3808 } 3812 }
3809 3813
3810 /* 3814 /*
3811 * Temporarily set the pointer to NULL, so idr_find() won't return 3815 * Temporarily set the pointer to NULL, so idr_find() won't return
3812 * a half-baked cgroup. 3816 * a half-baked cgroup.
3813 */ 3817 */
3814 cgrp->id = idr_alloc(&root->cgroup_idr, NULL, 1, 0, GFP_KERNEL); 3818 cgrp->id = idr_alloc(&root->cgroup_idr, NULL, 1, 0, GFP_KERNEL);
3815 if (cgrp->id < 0) { 3819 if (cgrp->id < 0) {
3816 err = -ENOMEM; 3820 err = -ENOMEM;
3817 goto err_unlock; 3821 goto err_unlock;
3818 } 3822 }
3819 3823
3820 init_cgroup_housekeeping(cgrp); 3824 init_cgroup_housekeeping(cgrp);
3821 3825
3822 cgrp->parent = parent; 3826 cgrp->parent = parent;
3823 cgrp->dummy_css.parent = &parent->dummy_css; 3827 cgrp->dummy_css.parent = &parent->dummy_css;
3824 cgrp->root = parent->root; 3828 cgrp->root = parent->root;
3825 3829
3826 if (notify_on_release(parent)) 3830 if (notify_on_release(parent))
3827 set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); 3831 set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
3828 3832
3829 if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &parent->flags)) 3833 if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &parent->flags))
3830 set_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags); 3834 set_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags);
3831 3835
3832 /* create the directory */ 3836 /* create the directory */
3833 kn = kernfs_create_dir(parent->kn, name, mode, cgrp); 3837 kn = kernfs_create_dir(parent->kn, name, mode, cgrp);
3834 if (IS_ERR(kn)) { 3838 if (IS_ERR(kn)) {
3835 err = PTR_ERR(kn); 3839 err = PTR_ERR(kn);
3836 goto err_free_id; 3840 goto err_free_id;
3837 } 3841 }
3838 cgrp->kn = kn; 3842 cgrp->kn = kn;
3839 3843
3840 /* 3844 /*
3841 * This extra ref will be put in cgroup_free_fn() and guarantees 3845 * This extra ref will be put in cgroup_free_fn() and guarantees
3842 * that @cgrp->kn is always accessible. 3846 * that @cgrp->kn is always accessible.
3843 */ 3847 */
3844 kernfs_get(kn); 3848 kernfs_get(kn);
3845 3849
3846 cgrp->serial_nr = cgroup_serial_nr_next++; 3850 cgrp->serial_nr = cgroup_serial_nr_next++;
3847 3851
3848 /* allocation complete, commit to creation */ 3852 /* allocation complete, commit to creation */
3849 list_add_tail_rcu(&cgrp->sibling, &cgrp->parent->children); 3853 list_add_tail_rcu(&cgrp->sibling, &cgrp->parent->children);
3850 atomic_inc(&root->nr_cgrps); 3854 atomic_inc(&root->nr_cgrps);
3851 cgroup_get(parent); 3855 cgroup_get(parent);
3852 3856
3853 /* 3857 /*
3854 * @cgrp is now fully operational. If something fails after this 3858 * @cgrp is now fully operational. If something fails after this
3855 * point, it'll be released via the normal destruction path. 3859 * point, it'll be released via the normal destruction path.
3856 */ 3860 */
3857 idr_replace(&root->cgroup_idr, cgrp, cgrp->id); 3861 idr_replace(&root->cgroup_idr, cgrp, cgrp->id);
3858 3862
3859 err = cgroup_kn_set_ugid(kn); 3863 err = cgroup_kn_set_ugid(kn);
3860 if (err) 3864 if (err)
3861 goto err_destroy; 3865 goto err_destroy;
3862 3866
3863 err = cgroup_addrm_files(cgrp, cgroup_base_files, true); 3867 err = cgroup_addrm_files(cgrp, cgroup_base_files, true);
3864 if (err) 3868 if (err)
3865 goto err_destroy; 3869 goto err_destroy;
3866 3870
3867 /* let's create and online css's */ 3871 /* let's create and online css's */
3868 for_each_subsys(ss, ssid) { 3872 for_each_subsys(ss, ssid) {
3869 if (parent->child_subsys_mask & (1 << ssid)) { 3873 if (parent->child_subsys_mask & (1 << ssid)) {
3870 err = create_css(cgrp, ss); 3874 err = create_css(cgrp, ss);
3871 if (err) 3875 if (err)
3872 goto err_destroy; 3876 goto err_destroy;
3873 } 3877 }
3874 } 3878 }
3875 3879
3876 /* 3880 /*
3877 * On the default hierarchy, a child doesn't automatically inherit 3881 * On the default hierarchy, a child doesn't automatically inherit
3878 * child_subsys_mask from the parent. Each is configured manually. 3882 * child_subsys_mask from the parent. Each is configured manually.
3879 */ 3883 */
3880 if (!cgroup_on_dfl(cgrp)) 3884 if (!cgroup_on_dfl(cgrp))
3881 cgrp->child_subsys_mask = parent->child_subsys_mask; 3885 cgrp->child_subsys_mask = parent->child_subsys_mask;
3882 3886
3883 kernfs_activate(kn); 3887 kernfs_activate(kn);
3884 3888
3885 mutex_unlock(&cgroup_mutex); 3889 mutex_unlock(&cgroup_mutex);
3886 mutex_unlock(&cgroup_tree_mutex); 3890 mutex_unlock(&cgroup_tree_mutex);
3887 3891
3888 return 0; 3892 return 0;
3889 3893
3890 err_free_id: 3894 err_free_id:
3891 idr_remove(&root->cgroup_idr, cgrp->id); 3895 idr_remove(&root->cgroup_idr, cgrp->id);
3892 err_unlock: 3896 err_unlock:
3893 mutex_unlock(&cgroup_mutex); 3897 mutex_unlock(&cgroup_mutex);
3894 err_unlock_tree: 3898 err_unlock_tree:
3895 mutex_unlock(&cgroup_tree_mutex); 3899 mutex_unlock(&cgroup_tree_mutex);
3896 kfree(cgrp); 3900 kfree(cgrp);
3897 return err; 3901 return err;
3898 3902
3899 err_destroy: 3903 err_destroy:
3900 cgroup_destroy_locked(cgrp); 3904 cgroup_destroy_locked(cgrp);
3901 mutex_unlock(&cgroup_mutex); 3905 mutex_unlock(&cgroup_mutex);
3902 mutex_unlock(&cgroup_tree_mutex); 3906 mutex_unlock(&cgroup_tree_mutex);
3903 return err; 3907 return err;
3904 } 3908 }
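All of the above is driven from userland by nothing more than mkdir(2) on a mounted hierarchy (see cgroup_mkdir() below). A runnable sketch, with an assumed mount point and cgroup name:

    #include <stdio.h>
    #include <sys/stat.h>
    #include <sys/types.h>

    int main(void)
    {
        /* hypothetical path; mkdir(2) lands in cgroup_mkdir() ->
         * cgroup_create(), which allocates the cgroup and its ID, creates
         * the kernfs directory and base files, and onlines enabled css's */
        if (mkdir("/sys/fs/cgroup/cpu/demo", 0755) != 0) {
            perror("mkdir");
            return 1;
        }
        return 0;
    }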
3905 3909
3906 static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, 3910 static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name,
3907 umode_t mode) 3911 umode_t mode)
3908 { 3912 {
3909 struct cgroup *parent = parent_kn->priv; 3913 struct cgroup *parent = parent_kn->priv;
3910 int ret; 3914 int ret;
3911 3915
3912 /* 3916 /*
3913 * cgroup_create() grabs cgroup_tree_mutex which nests outside 3917 * cgroup_create() grabs cgroup_tree_mutex which nests outside
3914 * kernfs active_ref and cgroup_create() already synchronizes 3918 * kernfs active_ref and cgroup_create() already synchronizes
3915 * properly against removal through cgroup_lock_live_group(). 3919 * properly against removal through cgroup_lock_live_group().
3916 * Break it before calling cgroup_create(). 3920 * Break it before calling cgroup_create().
3917 */ 3921 */
3918 cgroup_get(parent); 3922 cgroup_get(parent);
3919 kernfs_break_active_protection(parent_kn); 3923 kernfs_break_active_protection(parent_kn);
3920 3924
3921 ret = cgroup_create(parent, name, mode); 3925 ret = cgroup_create(parent, name, mode);
3922 3926
3923 kernfs_unbreak_active_protection(parent_kn); 3927 kernfs_unbreak_active_protection(parent_kn);
3924 cgroup_put(parent); 3928 cgroup_put(parent);
3925 return ret; 3929 return ret;
3926 } 3930 }
3927 3931
3928 /* 3932 /*
3929 * This is called when the refcnt of a css is confirmed to be killed. 3933 * This is called when the refcnt of a css is confirmed to be killed.
3930 * css_tryget() is now guaranteed to fail. 3934 * css_tryget() is now guaranteed to fail.
3931 */ 3935 */
3932 static void css_killed_work_fn(struct work_struct *work) 3936 static void css_killed_work_fn(struct work_struct *work)
3933 { 3937 {
3934 struct cgroup_subsys_state *css = 3938 struct cgroup_subsys_state *css =
3935 container_of(work, struct cgroup_subsys_state, destroy_work); 3939 container_of(work, struct cgroup_subsys_state, destroy_work);
3936 struct cgroup *cgrp = css->cgroup; 3940 struct cgroup *cgrp = css->cgroup;
3937 3941
3938 mutex_lock(&cgroup_tree_mutex); 3942 mutex_lock(&cgroup_tree_mutex);
3939 mutex_lock(&cgroup_mutex); 3943 mutex_lock(&cgroup_mutex);
3940 3944
3941 /* 3945 /*
3942 * css_tryget() is guaranteed to fail now. Tell subsystems to 3946 * css_tryget() is guaranteed to fail now. Tell subsystems to
3943 * initiate destruction. 3947 * initiate destruction.
3944 */ 3948 */
3945 offline_css(css); 3949 offline_css(css);
3946 3950
3947 /* 3951 /*
3948 * If @cgrp is marked dead, it's waiting for refs of all css's to 3952 * If @cgrp is marked dead, it's waiting for refs of all css's to
3949 * be disabled before proceeding to the second phase of cgroup 3953 * be disabled before proceeding to the second phase of cgroup
3950 * destruction. If we are the last one, kick it off. 3954 * destruction. If we are the last one, kick it off.
3951 */ 3955 */
3952 if (!cgrp->nr_css && cgroup_is_dead(cgrp)) 3956 if (!cgrp->nr_css && cgroup_is_dead(cgrp))
3953 cgroup_destroy_css_killed(cgrp); 3957 cgroup_destroy_css_killed(cgrp);
3954 3958
3955 mutex_unlock(&cgroup_mutex); 3959 mutex_unlock(&cgroup_mutex);
3956 mutex_unlock(&cgroup_tree_mutex); 3960 mutex_unlock(&cgroup_tree_mutex);
3957 3961
3958 /* 3962 /*
3959 * Put the css refs from kill_css(). Each css holds an extra 3963 * Put the css refs from kill_css(). Each css holds an extra
3960 * reference to the cgroup's dentry and cgroup removal proceeds 3964 * reference to the cgroup's dentry and cgroup removal proceeds
3961 * regardless of css refs. On the last put of each css, whenever 3965 * regardless of css refs. On the last put of each css, whenever
3962 * that may be, the extra dentry ref is put so that dentry 3966 * that may be, the extra dentry ref is put so that dentry
3963 * destruction happens only after all css's are released. 3967 * destruction happens only after all css's are released.
3964 */ 3968 */
3965 css_put(css); 3969 css_put(css);
3966 } 3970 }
3967 3971
3968 /* css kill confirmation processing requires process context, bounce */ 3972 /* css kill confirmation processing requires process context, bounce */
3969 static void css_killed_ref_fn(struct percpu_ref *ref) 3973 static void css_killed_ref_fn(struct percpu_ref *ref)
3970 { 3974 {
3971 struct cgroup_subsys_state *css = 3975 struct cgroup_subsys_state *css =
3972 container_of(ref, struct cgroup_subsys_state, refcnt); 3976 container_of(ref, struct cgroup_subsys_state, refcnt);
3973 3977
3974 INIT_WORK(&css->destroy_work, css_killed_work_fn); 3978 INIT_WORK(&css->destroy_work, css_killed_work_fn);
3975 queue_work(cgroup_destroy_wq, &css->destroy_work); 3979 queue_work(cgroup_destroy_wq, &css->destroy_work);
3976 } 3980 }
3977 3981
3978 /** 3982 /**
3979 * kill_css - destroy a css 3983 * kill_css - destroy a css
3980 * @css: css to destroy 3984 * @css: css to destroy
3981 * 3985 *
3982 * This function initiates destruction of @css by removing cgroup interface 3986 * This function initiates destruction of @css by removing cgroup interface
3983 * files and putting its base reference. ->css_offline() will be invoked 3987 * files and putting its base reference. ->css_offline() will be invoked
3984 * asynchronously once css_tryget() is guaranteed to fail and when the 3988 * asynchronously once css_tryget() is guaranteed to fail and when the
3985 * reference count reaches zero, @css will be released. 3989 * reference count reaches zero, @css will be released.
3986 */ 3990 */
3987 static void kill_css(struct cgroup_subsys_state *css) 3991 static void kill_css(struct cgroup_subsys_state *css)
3988 { 3992 {
3989 lockdep_assert_held(&cgroup_tree_mutex); 3993 lockdep_assert_held(&cgroup_tree_mutex);
3990 3994
3991 /* 3995 /*
3992 * This must happen before css is disassociated with its cgroup. 3996 * This must happen before css is disassociated with its cgroup.
3993 * See seq_css() for details. 3997 * See seq_css() for details.
3994 */ 3998 */
3995 cgroup_clear_dir(css->cgroup, 1 << css->ss->id); 3999 cgroup_clear_dir(css->cgroup, 1 << css->ss->id);
3996 4000
3997 /* 4001 /*
3998 * Killing would put the base ref, but we need to keep it alive 4002 * Killing would put the base ref, but we need to keep it alive
3999 * until after ->css_offline(). 4003 * until after ->css_offline().
4000 */ 4004 */
4001 css_get(css); 4005 css_get(css);
4002 4006
4003 /* 4007 /*
4004 * cgroup core guarantees that, by the time ->css_offline() is 4008 * cgroup core guarantees that, by the time ->css_offline() is
4005 * invoked, no new css reference will be given out via 4009 * invoked, no new css reference will be given out via
4006 * css_tryget(). We can't simply call percpu_ref_kill() and 4010 * css_tryget(). We can't simply call percpu_ref_kill() and
4007 * proceed to offlining css's because percpu_ref_kill() doesn't 4011 * proceed to offlining css's because percpu_ref_kill() doesn't
4008 * guarantee that the ref is seen as killed on all CPUs on return. 4012 * guarantee that the ref is seen as killed on all CPUs on return.
4009 * 4013 *
4010 * Use percpu_ref_kill_and_confirm() to get notifications as each 4014 * Use percpu_ref_kill_and_confirm() to get notifications as each
4011 * css is confirmed to be seen as killed on all CPUs. 4015 * css is confirmed to be seen as killed on all CPUs.
4012 */ 4016 */
4013 percpu_ref_kill_and_confirm(&css->refcnt, css_killed_ref_fn); 4017 percpu_ref_kill_and_confirm(&css->refcnt, css_killed_ref_fn);
4014 } 4018 }
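kill_css() leans on the generic percpu_ref kill-and-confirm pattern: after percpu_ref_kill_and_confirm(), the confirmation callback runs once every CPU is guaranteed to see the ref as killed (so tryget fails), and the release callback runs when the count finally reaches zero. A hedged sketch of that pattern in isolation, with a hypothetical struct bar in place of the css:

    /* kernel-side sketch of the percpu_ref lifecycle kill_css() builds on */
    struct bar {
        struct percpu_ref ref;
    };

    static void bar_killed(struct percpu_ref *ref)
    {
        /* analogous to css_killed_ref_fn(): new references can no longer
         * be taken anywhere, so it is safe to start offlining */
    }

    static void bar_release(struct percpu_ref *ref)
    {
        /* analogous to css_release(): the last reference is gone */
    }

    static int bar_init(struct bar *b)
    {
        return percpu_ref_init(&b->ref, bar_release);
    }

    static void bar_kill(struct bar *b)
    {
        percpu_ref_kill_and_confirm(&b->ref, bar_killed);
    }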
4015 4019
4016 /** 4020 /**
4017 * cgroup_destroy_locked - the first stage of cgroup destruction 4021 * cgroup_destroy_locked - the first stage of cgroup destruction
4018 * @cgrp: cgroup to be destroyed 4022 * @cgrp: cgroup to be destroyed
4019 * 4023 *
4020 * css's make use of percpu refcnts whose killing latency shouldn't be 4024 * css's make use of percpu refcnts whose killing latency shouldn't be
4021 * exposed to userland and are RCU protected. Also, cgroup core needs to 4025 * exposed to userland and are RCU protected. Also, cgroup core needs to
4022 * guarantee that css_tryget() won't succeed by the time ->css_offline() is 4026 * guarantee that css_tryget() won't succeed by the time ->css_offline() is
4023 * invoked. To satisfy all the requirements, destruction is implemented in 4027 * invoked. To satisfy all the requirements, destruction is implemented in
4024 * the following two steps. 4028 * the following two steps.
4025 * 4029 *
4026 * s1. Verify @cgrp can be destroyed and mark it dying. Remove all 4030 * s1. Verify @cgrp can be destroyed and mark it dying. Remove all
4027 * userland visible parts and start killing the percpu refcnts of 4031 * userland visible parts and start killing the percpu refcnts of
4028 * css's. Set up so that the next stage will be kicked off once all 4032 * css's. Set up so that the next stage will be kicked off once all
4029 * the percpu refcnts are confirmed to be killed. 4033 * the percpu refcnts are confirmed to be killed.
4030 * 4034 *
4031 * s2. Invoke ->css_offline(), mark the cgroup dead and proceed with the 4035 * s2. Invoke ->css_offline(), mark the cgroup dead and proceed with the
4032 * rest of destruction. Once all cgroup references are gone, the 4036 * rest of destruction. Once all cgroup references are gone, the
4033 * cgroup is RCU-freed. 4037 * cgroup is RCU-freed.
4034 * 4038 *
4035 * This function implements s1. After this step, @cgrp is gone as far as 4039 * This function implements s1. After this step, @cgrp is gone as far as
4036 * the userland is concerned and a new cgroup with the same name may be 4040 * the userland is concerned and a new cgroup with the same name may be
4037 * created. As cgroup doesn't care about the names internally, this 4041 * created. As cgroup doesn't care about the names internally, this
4038 * doesn't cause any problem. 4042 * doesn't cause any problem.
4039 */ 4043 */
4040 static int cgroup_destroy_locked(struct cgroup *cgrp) 4044 static int cgroup_destroy_locked(struct cgroup *cgrp)
4041 __releases(&cgroup_mutex) __acquires(&cgroup_mutex) 4045 __releases(&cgroup_mutex) __acquires(&cgroup_mutex)
4042 { 4046 {
4043 struct cgroup *child; 4047 struct cgroup *child;
4044 struct cgroup_subsys_state *css; 4048 struct cgroup_subsys_state *css;
4045 bool empty; 4049 bool empty;
4046 int ssid; 4050 int ssid;
4047 4051
4048 lockdep_assert_held(&cgroup_tree_mutex); 4052 lockdep_assert_held(&cgroup_tree_mutex);
4049 lockdep_assert_held(&cgroup_mutex); 4053 lockdep_assert_held(&cgroup_mutex);
4050 4054
4051 /* 4055 /*
4052 * css_set_rwsem synchronizes access to ->cset_links and prevents 4056 * css_set_rwsem synchronizes access to ->cset_links and prevents
4053 * @cgrp from being removed while put_css_set() is in progress. 4057 * @cgrp from being removed while put_css_set() is in progress.
4054 */ 4058 */
4055 down_read(&css_set_rwsem); 4059 down_read(&css_set_rwsem);
4056 empty = list_empty(&cgrp->cset_links); 4060 empty = list_empty(&cgrp->cset_links);
4057 up_read(&css_set_rwsem); 4061 up_read(&css_set_rwsem);
4058 if (!empty) 4062 if (!empty)
4059 return -EBUSY; 4063 return -EBUSY;
4060 4064
4061 /* 4065 /*
4062 * Make sure there are no live children. We can't test ->children 4066 * Make sure there are no live children. We can't test ->children
4063 * emptiness as dead children linger on it while being destroyed; 4067 * emptiness as dead children linger on it while being destroyed;
4064 * otherwise, "rmdir parent/child parent" may fail with -EBUSY. 4068 * otherwise, "rmdir parent/child parent" may fail with -EBUSY.
4065 */ 4069 */
4066 empty = true; 4070 empty = true;
4067 rcu_read_lock(); 4071 rcu_read_lock();
4068 list_for_each_entry_rcu(child, &cgrp->children, sibling) { 4072 list_for_each_entry_rcu(child, &cgrp->children, sibling) {
4069 empty = cgroup_is_dead(child); 4073 empty = cgroup_is_dead(child);
4070 if (!empty) 4074 if (!empty)
4071 break; 4075 break;
4072 } 4076 }
4073 rcu_read_unlock(); 4077 rcu_read_unlock();
4074 if (!empty) 4078 if (!empty)
4075 return -EBUSY; 4079 return -EBUSY;
4076 4080
4077 /* 4081 /*
4078 * Mark @cgrp dead. This prevents further task migration and child 4082 * Mark @cgrp dead. This prevents further task migration and child
4079 * creation by disabling cgroup_lock_live_group(). Note that 4083 * creation by disabling cgroup_lock_live_group(). Note that
4080 * CGRP_DEAD assertion is depended upon by css_next_child() to 4084 * CGRP_DEAD assertion is depended upon by css_next_child() to
4081 * resume iteration after dropping RCU read lock. See 4085 * resume iteration after dropping RCU read lock. See
4082 * css_next_child() for details. 4086 * css_next_child() for details.
4083 */ 4087 */
4084 set_bit(CGRP_DEAD, &cgrp->flags); 4088 set_bit(CGRP_DEAD, &cgrp->flags);
4085 4089
4086 /* 4090 /*
4087 * Initiate massacre of all css's. cgroup_destroy_css_killed() 4091 * Initiate massacre of all css's. cgroup_destroy_css_killed()
4088 * will be invoked to perform the rest of destruction once the 4092 * will be invoked to perform the rest of destruction once the
4089 * percpu refs of all css's are confirmed to be killed. This 4093 * percpu refs of all css's are confirmed to be killed. This
4090 * involves removing the subsystem's files, so cgroup_mutex is dropped. 4094 * involves removing the subsystem's files, so cgroup_mutex is dropped.
4091 */ 4095 */
4092 mutex_unlock(&cgroup_mutex); 4096 mutex_unlock(&cgroup_mutex);
4093 for_each_css(css, ssid, cgrp) 4097 for_each_css(css, ssid, cgrp)
4094 kill_css(css); 4098 kill_css(css);
4095 mutex_lock(&cgroup_mutex); 4099 mutex_lock(&cgroup_mutex);
4096 4100
4097 /* CGRP_DEAD is set, remove from ->release_list for the last time */ 4101 /* CGRP_DEAD is set, remove from ->release_list for the last time */
4098 raw_spin_lock(&release_list_lock); 4102 raw_spin_lock(&release_list_lock);
4099 if (!list_empty(&cgrp->release_list)) 4103 if (!list_empty(&cgrp->release_list))
4100 list_del_init(&cgrp->release_list); 4104 list_del_init(&cgrp->release_list);
4101 raw_spin_unlock(&release_list_lock); 4105 raw_spin_unlock(&release_list_lock);
4102 4106
4103 /* 4107 /*
4104 * If @cgrp has css's attached, the second stage of cgroup 4108 * If @cgrp has css's attached, the second stage of cgroup
4105 * destruction is kicked off from css_killed_work_fn() after the 4109 * destruction is kicked off from css_killed_work_fn() after the
4106 * refs of all attached css's are killed. If @cgrp doesn't have 4110 * refs of all attached css's are killed. If @cgrp doesn't have
4107 * any css, we kick it off here. 4111 * any css, we kick it off here.
4108 */ 4112 */
4109 if (!cgrp->nr_css) 4113 if (!cgrp->nr_css)
4110 cgroup_destroy_css_killed(cgrp); 4114 cgroup_destroy_css_killed(cgrp);
4111 4115
4112 /* remove @cgrp directory along with the base files */ 4116 /* remove @cgrp directory along with the base files */
4113 mutex_unlock(&cgroup_mutex); 4117 mutex_unlock(&cgroup_mutex);
4114 4118
4115 /* 4119 /*
4116 * There are two control paths which try to determine cgroup from 4120 * There are two control paths which try to determine cgroup from
4117 * dentry without going through kernfs - cgroupstats_build() and 4121 * dentry without going through kernfs - cgroupstats_build() and
4118 * css_tryget_from_dir(). Those are supported by RCU protecting 4122 * css_tryget_from_dir(). Those are supported by RCU protecting
4119 * clearing of cgrp->kn->priv backpointer, which should happen 4123 * clearing of cgrp->kn->priv backpointer, which should happen
4120 * after all files under it have been removed. 4124 * after all files under it have been removed.
4121 */ 4125 */
4122 kernfs_remove(cgrp->kn); /* @cgrp has an extra ref on its kn */ 4126 kernfs_remove(cgrp->kn); /* @cgrp has an extra ref on its kn */
4123 RCU_INIT_POINTER(*(void __rcu __force **)&cgrp->kn->priv, NULL); 4127 RCU_INIT_POINTER(*(void __rcu __force **)&cgrp->kn->priv, NULL);
4124 4128
4125 mutex_lock(&cgroup_mutex); 4129 mutex_lock(&cgroup_mutex);
4126 4130
4127 return 0; 4131 return 0;
4128 }; 4132 };
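From userland this first stage is triggered by rmdir(2); a cgroup that still contains tasks or live children is refused with -EBUSY, exactly as the checks above implement. A runnable sketch (the path is an assumption):

    #include <errno.h>
    #include <stdio.h>
    #include <unistd.h>

    int main(void)
    {
        if (rmdir("/sys/fs/cgroup/cpu/demo") != 0) {
            if (errno == EBUSY)
                fprintf(stderr, "cgroup still has tasks or children\n");
            else
                perror("rmdir");
            return 1;
        }
        return 0;
    }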
4129 4133
4130 /** 4134 /**
4131 * cgroup_destroy_css_killed - the second step of cgroup destruction 4135 * cgroup_destroy_css_killed - the second step of cgroup destruction
4132 * @work: cgroup->destroy_free_work 4136 * @work: cgroup->destroy_free_work
4133 * 4137 *
4134 * This function is invoked from a work item for a cgroup which is being 4138 * This function is invoked from a work item for a cgroup which is being
4135 * destroyed after all css's are offlined and performs the rest of 4139 * destroyed after all css's are offlined and performs the rest of
4136 * destruction. This is the second step of destruction described in the 4140 * destruction. This is the second step of destruction described in the
4137 * comment above cgroup_destroy_locked(). 4141 * comment above cgroup_destroy_locked().
4138 */ 4142 */
4139 static void cgroup_destroy_css_killed(struct cgroup *cgrp) 4143 static void cgroup_destroy_css_killed(struct cgroup *cgrp)
4140 { 4144 {
4141 struct cgroup *parent = cgrp->parent; 4145 struct cgroup *parent = cgrp->parent;
4142 4146
4143 lockdep_assert_held(&cgroup_tree_mutex); 4147 lockdep_assert_held(&cgroup_tree_mutex);
4144 lockdep_assert_held(&cgroup_mutex); 4148 lockdep_assert_held(&cgroup_mutex);
4145 4149
4146 /* delete this cgroup from parent->children */ 4150 /* delete this cgroup from parent->children */
4147 list_del_rcu(&cgrp->sibling); 4151 list_del_rcu(&cgrp->sibling);
4148 4152
4149 cgroup_put(cgrp); 4153 cgroup_put(cgrp);
4150 4154
4151 set_bit(CGRP_RELEASABLE, &parent->flags); 4155 set_bit(CGRP_RELEASABLE, &parent->flags);
4152 check_for_release(parent); 4156 check_for_release(parent);
4153 } 4157 }
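check_for_release() is what eventually invokes the hierarchy's release_agent for an emptied cgroup with notify_on_release set (the agent is run with the path of the emptied cgroup as its argument). A minimal userspace sketch that opts a cgroup into that behavior; the mount point, cgroup name and agent path are assumptions:

    #include <stdio.h>

    /* write a short string to a cgroup control file */
    static int write_str(const char *path, const char *val)
    {
        FILE *f = fopen(path, "w");

        if (!f)
            return -1;
        fputs(val, f);
        return fclose(f);
    }

    int main(void)
    {
        if (write_str("/sys/fs/cgroup/cpu/release_agent",
                      "/usr/local/bin/cgroup-reaper") ||
            write_str("/sys/fs/cgroup/cpu/demo/notify_on_release", "1")) {
            perror("write");
            return 1;
        }
        return 0;
    }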
4154 4158
4155 static int cgroup_rmdir(struct kernfs_node *kn) 4159 static int cgroup_rmdir(struct kernfs_node *kn)
4156 { 4160 {
4157 struct cgroup *cgrp = kn->priv; 4161 struct cgroup *cgrp = kn->priv;
4158 int ret = 0; 4162 int ret = 0;
4159 4163
4160 /* 4164 /*
4161 * This is self-destruction but @kn can't be removed while this 4165 * This is self-destruction but @kn can't be removed while this
4162 * callback is in progress. Let's break active protection. Once 4166 * callback is in progress. Let's break active protection. Once
4163 * the protection is broken, @cgrp can be destroyed at any point. 4167 * the protection is broken, @cgrp can be destroyed at any point.
4164 * Pin it so that it stays accessible. 4168 * Pin it so that it stays accessible.
4165 */ 4169 */
4166 cgroup_get(cgrp); 4170 cgroup_get(cgrp);
4167 kernfs_break_active_protection(kn); 4171 kernfs_break_active_protection(kn);
4168 4172
4169 mutex_lock(&cgroup_tree_mutex); 4173 mutex_lock(&cgroup_tree_mutex);
4170 mutex_lock(&cgroup_mutex); 4174 mutex_lock(&cgroup_mutex);
4171 4175
4172 /* 4176 /*
4173 * @cgrp might already have been destroyed while we're trying to 4177 * @cgrp might already have been destroyed while we're trying to
4174 * grab the mutexes. 4178 * grab the mutexes.
4175 */ 4179 */
4176 if (!cgroup_is_dead(cgrp)) 4180 if (!cgroup_is_dead(cgrp))
4177 ret = cgroup_destroy_locked(cgrp); 4181 ret = cgroup_destroy_locked(cgrp);
4178 4182
4179 mutex_unlock(&cgroup_mutex); 4183 mutex_unlock(&cgroup_mutex);
4180 mutex_unlock(&cgroup_tree_mutex); 4184 mutex_unlock(&cgroup_tree_mutex);
4181 4185
4182 kernfs_unbreak_active_protection(kn); 4186 kernfs_unbreak_active_protection(kn);
4183 cgroup_put(cgrp); 4187 cgroup_put(cgrp);
4184 return ret; 4188 return ret;
4185 } 4189 }
4186 4190
4187 static struct kernfs_syscall_ops cgroup_kf_syscall_ops = { 4191 static struct kernfs_syscall_ops cgroup_kf_syscall_ops = {
4188 .remount_fs = cgroup_remount, 4192 .remount_fs = cgroup_remount,
4189 .show_options = cgroup_show_options, 4193 .show_options = cgroup_show_options,
4190 .mkdir = cgroup_mkdir, 4194 .mkdir = cgroup_mkdir,
4191 .rmdir = cgroup_rmdir, 4195 .rmdir = cgroup_rmdir,
4192 .rename = cgroup_rename, 4196 .rename = cgroup_rename,
4193 }; 4197 };
4194 4198
4195 static void __init cgroup_init_subsys(struct cgroup_subsys *ss) 4199 static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
4196 { 4200 {
4197 struct cgroup_subsys_state *css; 4201 struct cgroup_subsys_state *css;
4198 4202
4199 printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name); 4203 printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name);
4200 4204
4201 mutex_lock(&cgroup_tree_mutex); 4205 mutex_lock(&cgroup_tree_mutex);
4202 mutex_lock(&cgroup_mutex); 4206 mutex_lock(&cgroup_mutex);
4203 4207
4204 INIT_LIST_HEAD(&ss->cfts); 4208 INIT_LIST_HEAD(&ss->cfts);
4205 4209
4206 /* Create the root cgroup state for this subsystem */ 4210 /* Create the root cgroup state for this subsystem */
4207 ss->root = &cgrp_dfl_root; 4211 ss->root = &cgrp_dfl_root;
4208 css = ss->css_alloc(cgroup_css(&cgrp_dfl_root.cgrp, ss)); 4212 css = ss->css_alloc(cgroup_css(&cgrp_dfl_root.cgrp, ss));
4209 /* We don't handle early failures gracefully */ 4213 /* We don't handle early failures gracefully */
4210 BUG_ON(IS_ERR(css)); 4214 BUG_ON(IS_ERR(css));
4211 init_css(css, ss, &cgrp_dfl_root.cgrp); 4215 init_css(css, ss, &cgrp_dfl_root.cgrp);
4212 4216
4213 /* Update the init_css_set to contain a subsys 4217 /* Update the init_css_set to contain a subsys
4214 * pointer to this state - since the subsystem is 4218 * pointer to this state - since the subsystem is
4215 * newly registered, all tasks and hence the 4219 * newly registered, all tasks and hence the
4216 * init_css_set is in the subsystem's root cgroup. */ 4220 * init_css_set is in the subsystem's root cgroup. */
4217 init_css_set.subsys[ss->id] = css; 4221 init_css_set.subsys[ss->id] = css;
4218 4222
4219 need_forkexit_callback |= ss->fork || ss->exit; 4223 need_forkexit_callback |= ss->fork || ss->exit;
4220 4224
4221 /* At system boot, before all subsystems have been 4225 /* At system boot, before all subsystems have been
4222 * registered, no tasks have been forked, so we don't 4226 * registered, no tasks have been forked, so we don't
4223 * need to invoke fork callbacks here. */ 4227 * need to invoke fork callbacks here. */
4224 BUG_ON(!list_empty(&init_task.tasks)); 4228 BUG_ON(!list_empty(&init_task.tasks));
4225 4229
4226 BUG_ON(online_css(css)); 4230 BUG_ON(online_css(css));
4227 4231
4228 cgrp_dfl_root.subsys_mask |= 1 << ss->id; 4232 cgrp_dfl_root.subsys_mask |= 1 << ss->id;
4229 4233
4230 mutex_unlock(&cgroup_mutex); 4234 mutex_unlock(&cgroup_mutex);
4231 mutex_unlock(&cgroup_tree_mutex); 4235 mutex_unlock(&cgroup_tree_mutex);
4232 } 4236 }
4233 4237
4234 /** 4238 /**
4235 * cgroup_init_early - cgroup initialization at system boot 4239 * cgroup_init_early - cgroup initialization at system boot
4236 * 4240 *
4237 * Initialize cgroups at system boot, and initialize any 4241 * Initialize cgroups at system boot, and initialize any
4238 * subsystems that request early init. 4242 * subsystems that request early init.
4239 */ 4243 */
4240 int __init cgroup_init_early(void) 4244 int __init cgroup_init_early(void)
4241 { 4245 {
4242 static struct cgroup_sb_opts __initdata opts = 4246 static struct cgroup_sb_opts __initdata opts =
4243 { .flags = CGRP_ROOT_SANE_BEHAVIOR }; 4247 { .flags = CGRP_ROOT_SANE_BEHAVIOR };
4244 struct cgroup_subsys *ss; 4248 struct cgroup_subsys *ss;
4245 int i; 4249 int i;
4246 4250
4247 init_cgroup_root(&cgrp_dfl_root, &opts); 4251 init_cgroup_root(&cgrp_dfl_root, &opts);
4248 RCU_INIT_POINTER(init_task.cgroups, &init_css_set); 4252 RCU_INIT_POINTER(init_task.cgroups, &init_css_set);
4249 4253
4250 for_each_subsys(ss, i) { 4254 for_each_subsys(ss, i) {
4251 WARN(!ss->css_alloc || !ss->css_free || ss->name || ss->id, 4255 WARN(!ss->css_alloc || !ss->css_free || ss->name || ss->id,
4252 "invalid cgroup_subsys %d:%s css_alloc=%p css_free=%p name:id=%d:%s\n", 4256 "invalid cgroup_subsys %d:%s css_alloc=%p css_free=%p name:id=%d:%s\n",
4253 i, cgroup_subsys_name[i], ss->css_alloc, ss->css_free, 4257 i, cgroup_subsys_name[i], ss->css_alloc, ss->css_free,
4254 ss->id, ss->name); 4258 ss->id, ss->name);
4255 WARN(strlen(cgroup_subsys_name[i]) > MAX_CGROUP_TYPE_NAMELEN, 4259 WARN(strlen(cgroup_subsys_name[i]) > MAX_CGROUP_TYPE_NAMELEN,
4256 "cgroup_subsys_name %s too long\n", cgroup_subsys_name[i]); 4260 "cgroup_subsys_name %s too long\n", cgroup_subsys_name[i]);
4257 4261
4258 ss->id = i; 4262 ss->id = i;
4259 ss->name = cgroup_subsys_name[i]; 4263 ss->name = cgroup_subsys_name[i];
4260 4264
4261 if (ss->early_init) 4265 if (ss->early_init)
4262 cgroup_init_subsys(ss); 4266 cgroup_init_subsys(ss);
4263 } 4267 }
4264 return 0; 4268 return 0;
4265 } 4269 }
4266 4270
4267 /** 4271 /**
4268 * cgroup_init - cgroup initialization 4272 * cgroup_init - cgroup initialization
4269 * 4273 *
4270 * Register cgroup filesystem and /proc file, and initialize 4274 * Register cgroup filesystem and /proc file, and initialize
4271 * any subsystems that didn't request early init. 4275 * any subsystems that didn't request early init.
4272 */ 4276 */
4273 int __init cgroup_init(void) 4277 int __init cgroup_init(void)
4274 { 4278 {
4275 struct cgroup_subsys *ss; 4279 struct cgroup_subsys *ss;
4276 unsigned long key; 4280 unsigned long key;
4277 int ssid, err; 4281 int ssid, err;
4278 4282
4279 BUG_ON(cgroup_init_cftypes(NULL, cgroup_base_files)); 4283 BUG_ON(cgroup_init_cftypes(NULL, cgroup_base_files));
4280 4284
4281 mutex_lock(&cgroup_tree_mutex); 4285 mutex_lock(&cgroup_tree_mutex);
4282 mutex_lock(&cgroup_mutex); 4286 mutex_lock(&cgroup_mutex);
4283 4287
4284 /* Add init_css_set to the hash table */ 4288 /* Add init_css_set to the hash table */
4285 key = css_set_hash(init_css_set.subsys); 4289 key = css_set_hash(init_css_set.subsys);
4286 hash_add(css_set_table, &init_css_set.hlist, key); 4290 hash_add(css_set_table, &init_css_set.hlist, key);
4287 4291
4288 BUG_ON(cgroup_setup_root(&cgrp_dfl_root, 0)); 4292 BUG_ON(cgroup_setup_root(&cgrp_dfl_root, 0));
4289 4293
4290 mutex_unlock(&cgroup_mutex); 4294 mutex_unlock(&cgroup_mutex);
4291 mutex_unlock(&cgroup_tree_mutex); 4295 mutex_unlock(&cgroup_tree_mutex);
4292 4296
4293 for_each_subsys(ss, ssid) { 4297 for_each_subsys(ss, ssid) {
4294 if (!ss->early_init) 4298 if (!ss->early_init)
4295 cgroup_init_subsys(ss); 4299 cgroup_init_subsys(ss);
4296 4300
4297 list_add_tail(&init_css_set.e_cset_node[ssid], 4301 list_add_tail(&init_css_set.e_cset_node[ssid],
4298 &cgrp_dfl_root.cgrp.e_csets[ssid]); 4302 &cgrp_dfl_root.cgrp.e_csets[ssid]);
4299 4303
4300 /* 4304 /*
4301 * cftype registration needs kmalloc and can't be done 4305 * cftype registration needs kmalloc and can't be done
4302 * during early_init. Register base cftypes separately. 4306 * during early_init. Register base cftypes separately.
4303 */ 4307 */
4304 if (ss->base_cftypes) 4308 if (ss->base_cftypes)
4305 WARN_ON(cgroup_add_cftypes(ss, ss->base_cftypes)); 4309 WARN_ON(cgroup_add_cftypes(ss, ss->base_cftypes));
4306 } 4310 }
4307 4311
4308 cgroup_kobj = kobject_create_and_add("cgroup", fs_kobj); 4312 cgroup_kobj = kobject_create_and_add("cgroup", fs_kobj);
4309 if (!cgroup_kobj) 4313 if (!cgroup_kobj)
4310 return -ENOMEM; 4314 return -ENOMEM;
4311 4315
4312 err = register_filesystem(&cgroup_fs_type); 4316 err = register_filesystem(&cgroup_fs_type);
4313 if (err < 0) { 4317 if (err < 0) {
4314 kobject_put(cgroup_kobj); 4318 kobject_put(cgroup_kobj);
4315 return err; 4319 return err;
4316 } 4320 }
4317 4321
4318 proc_create("cgroups", 0, NULL, &proc_cgroupstats_operations); 4322 proc_create("cgroups", 0, NULL, &proc_cgroupstats_operations);
4319 return 0; 4323 return 0;
4320 } 4324 }
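Once cgroup_init() has registered the filesystem, hierarchies are instantiated by mounting "cgroup" with the wanted subsystems as mount options. A runnable sketch (mount point and controller choice are assumptions; requires CAP_SYS_ADMIN):

    #include <stdio.h>
    #include <sys/mount.h>

    int main(void)
    {
        /* attach the "cpu" controller to a hierarchy at this path */
        if (mount("cgroup", "/sys/fs/cgroup/cpu", "cgroup", 0, "cpu") != 0) {
            perror("mount");
            return 1;
        }
        return 0;
    }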
4321 4325
4322 static int __init cgroup_wq_init(void) 4326 static int __init cgroup_wq_init(void)
4323 { 4327 {
4324 /* 4328 /*
4325 * There isn't much point in executing destruction path in 4329 * There isn't much point in executing destruction path in
4326 * parallel. Good chunk is serialized with cgroup_mutex anyway. 4330 * parallel. Good chunk is serialized with cgroup_mutex anyway.
4327 * Use 1 for @max_active. 4331 * Use 1 for @max_active.
4328 * 4332 *
4329 * We would prefer to do this in cgroup_init() above, but that 4333 * We would prefer to do this in cgroup_init() above, but that
4330 * is called before init_workqueues(): so leave this until after. 4334 * is called before init_workqueues(): so leave this until after.
4331 */ 4335 */
4332 cgroup_destroy_wq = alloc_workqueue("cgroup_destroy", 0, 1); 4336 cgroup_destroy_wq = alloc_workqueue("cgroup_destroy", 0, 1);
4333 BUG_ON(!cgroup_destroy_wq); 4337 BUG_ON(!cgroup_destroy_wq);
4334 4338
4335 /* 4339 /*
4336 * Used to destroy pidlists and separate to serve as flush domain. 4340 * Used to destroy pidlists and separate to serve as flush domain.
4337 * Cap @max_active to 1 too. 4341 * Cap @max_active to 1 too.
4338 */ 4342 */
4339 cgroup_pidlist_destroy_wq = alloc_workqueue("cgroup_pidlist_destroy", 4343 cgroup_pidlist_destroy_wq = alloc_workqueue("cgroup_pidlist_destroy",
4340 0, 1); 4344 0, 1);
4341 BUG_ON(!cgroup_pidlist_destroy_wq); 4345 BUG_ON(!cgroup_pidlist_destroy_wq);
4342 4346
4343 return 0; 4347 return 0;
4344 } 4348 }
4345 core_initcall(cgroup_wq_init); 4349 core_initcall(cgroup_wq_init);
4346 4350
4347 /* 4351 /*
4348 * proc_cgroup_show() 4352 * proc_cgroup_show()
4349 * - Print task's cgroup paths into seq_file, one line for each hierarchy 4353 * - Print task's cgroup paths into seq_file, one line for each hierarchy
4350 * - Used for /proc/<pid>/cgroup. 4354 * - Used for /proc/<pid>/cgroup.
4351 */ 4355 */
4352 4356
4353 /* TODO: Use a proper seq_file iterator */ 4357 /* TODO: Use a proper seq_file iterator */
4354 int proc_cgroup_show(struct seq_file *m, void *v) 4358 int proc_cgroup_show(struct seq_file *m, void *v)
4355 { 4359 {
4356 struct pid *pid; 4360 struct pid *pid;
4357 struct task_struct *tsk; 4361 struct task_struct *tsk;
4358 char *buf, *path; 4362 char *buf, *path;
4359 int retval; 4363 int retval;
4360 struct cgroup_root *root; 4364 struct cgroup_root *root;
4361 4365
4362 retval = -ENOMEM; 4366 retval = -ENOMEM;
4363 buf = kmalloc(PATH_MAX, GFP_KERNEL); 4367 buf = kmalloc(PATH_MAX, GFP_KERNEL);
4364 if (!buf) 4368 if (!buf)
4365 goto out; 4369 goto out;
4366 4370
4367 retval = -ESRCH; 4371 retval = -ESRCH;
4368 pid = m->private; 4372 pid = m->private;
4369 tsk = get_pid_task(pid, PIDTYPE_PID); 4373 tsk = get_pid_task(pid, PIDTYPE_PID);
4370 if (!tsk) 4374 if (!tsk)
4371 goto out_free; 4375 goto out_free;
4372 4376
4373 retval = 0; 4377 retval = 0;
4374 4378
4375 mutex_lock(&cgroup_mutex); 4379 mutex_lock(&cgroup_mutex);
4376 down_read(&css_set_rwsem); 4380 down_read(&css_set_rwsem);
4377 4381
4378 for_each_root(root) { 4382 for_each_root(root) {
4379 struct cgroup_subsys *ss; 4383 struct cgroup_subsys *ss;
4380 struct cgroup *cgrp; 4384 struct cgroup *cgrp;
4381 int ssid, count = 0; 4385 int ssid, count = 0;
4382 4386
4383 if (root == &cgrp_dfl_root && !cgrp_dfl_root_visible) 4387 if (root == &cgrp_dfl_root && !cgrp_dfl_root_visible)
4384 continue; 4388 continue;
4385 4389
4386 seq_printf(m, "%d:", root->hierarchy_id); 4390 seq_printf(m, "%d:", root->hierarchy_id);
4387 for_each_subsys(ss, ssid) 4391 for_each_subsys(ss, ssid)
4388 if (root->subsys_mask & (1 << ssid)) 4392 if (root->subsys_mask & (1 << ssid))
4389 seq_printf(m, "%s%s", count++ ? "," : "", ss->name); 4393 seq_printf(m, "%s%s", count++ ? "," : "", ss->name);
4390 if (strlen(root->name)) 4394 if (strlen(root->name))
4391 seq_printf(m, "%sname=%s", count ? "," : "", 4395 seq_printf(m, "%sname=%s", count ? "," : "",
4392 root->name); 4396 root->name);
4393 seq_putc(m, ':'); 4397 seq_putc(m, ':');
4394 cgrp = task_cgroup_from_root(tsk, root); 4398 cgrp = task_cgroup_from_root(tsk, root);
4395 path = cgroup_path(cgrp, buf, PATH_MAX); 4399 path = cgroup_path(cgrp, buf, PATH_MAX);
4396 if (!path) { 4400 if (!path) {
4397 retval = -ENAMETOOLONG; 4401 retval = -ENAMETOOLONG;
4398 goto out_unlock; 4402 goto out_unlock;
4399 } 4403 }
4400 seq_puts(m, path); 4404 seq_puts(m, path);
4401 seq_putc(m, '\n'); 4405 seq_putc(m, '\n');
4402 } 4406 }
4403 4407
4404 out_unlock: 4408 out_unlock:
4405 up_read(&css_set_rwsem); 4409 up_read(&css_set_rwsem);
4406 mutex_unlock(&cgroup_mutex); 4410 mutex_unlock(&cgroup_mutex);
4407 put_task_struct(tsk); 4411 put_task_struct(tsk);
4408 out_free: 4412 out_free:
4409 kfree(buf); 4413 kfree(buf);
4410 out: 4414 out:
4411 return retval; 4415 return retval;
4412 } 4416 }
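The output format is one line per hierarchy, "hierarchy-id:subsystem-list:path". A small runnable reader for the calling task:

    #include <stdio.h>

    int main(void)
    {
        char line[4096];
        FILE *f = fopen("/proc/self/cgroup", "r");

        if (!f) {
            perror("fopen");
            return 1;
        }
        /* each line is "id:comma-separated-subsystems[,name=...]:path",
         * exactly as proc_cgroup_show() prints it above */
        while (fgets(line, sizeof(line), f))
            fputs(line, stdout);
        fclose(f);
        return 0;
    }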
4413 4417
4414 /* Display information about each subsystem and each hierarchy */ 4418 /* Display information about each subsystem and each hierarchy */
4415 static int proc_cgroupstats_show(struct seq_file *m, void *v) 4419 static int proc_cgroupstats_show(struct seq_file *m, void *v)
4416 { 4420 {
4417 struct cgroup_subsys *ss; 4421 struct cgroup_subsys *ss;
4418 int i; 4422 int i;
4419 4423
4420 seq_puts(m, "#subsys_name\thierarchy\tnum_cgroups\tenabled\n"); 4424 seq_puts(m, "#subsys_name\thierarchy\tnum_cgroups\tenabled\n");
4421 /* 4425 /*
4422 * ideally we don't want subsystems moving around while we do this. 4426 * ideally we don't want subsystems moving around while we do this.
4423 * cgroup_mutex is also necessary to guarantee an atomic snapshot of 4427 * cgroup_mutex is also necessary to guarantee an atomic snapshot of
4424 * subsys/hierarchy state. 4428 * subsys/hierarchy state.
4425 */ 4429 */
4426 mutex_lock(&cgroup_mutex); 4430 mutex_lock(&cgroup_mutex);
4427 4431
4428 for_each_subsys(ss, i) 4432 for_each_subsys(ss, i)
4429 seq_printf(m, "%s\t%d\t%d\t%d\n", 4433 seq_printf(m, "%s\t%d\t%d\t%d\n",
4430 ss->name, ss->root->hierarchy_id, 4434 ss->name, ss->root->hierarchy_id,
4431 atomic_read(&ss->root->nr_cgrps), !ss->disabled); 4435 atomic_read(&ss->root->nr_cgrps), !ss->disabled);
4432 4436
4433 mutex_unlock(&cgroup_mutex); 4437 mutex_unlock(&cgroup_mutex);
4434 return 0; 4438 return 0;
4435 } 4439 }
4436 4440
4437 static int cgroupstats_open(struct inode *inode, struct file *file) 4441 static int cgroupstats_open(struct inode *inode, struct file *file)
4438 { 4442 {
4439 return single_open(file, proc_cgroupstats_show, NULL); 4443 return single_open(file, proc_cgroupstats_show, NULL);
4440 } 4444 }
4441 4445
4442 static const struct file_operations proc_cgroupstats_operations = { 4446 static const struct file_operations proc_cgroupstats_operations = {
4443 .open = cgroupstats_open, 4447 .open = cgroupstats_open,
4444 .read = seq_read, 4448 .read = seq_read,
4445 .llseek = seq_lseek, 4449 .llseek = seq_lseek,
4446 .release = single_release, 4450 .release = single_release,
4447 }; 4451 };
4448 4452
4449 /** 4453 /**
4450 * cgroup_fork - initialize cgroup related fields during copy_process() 4454 * cgroup_fork - initialize cgroup related fields during copy_process()
4451 * @child: pointer to task_struct of the newly forked child process. 4455 * @child: pointer to task_struct of the newly forked child process.
4452 * 4456 *
4453 * A task is associated with the init_css_set until cgroup_post_fork() 4457 * A task is associated with the init_css_set until cgroup_post_fork()
4454 * attaches it to the parent's css_set. Empty cg_list indicates that 4458 * attaches it to the parent's css_set. Empty cg_list indicates that
4455 * @child isn't holding a reference to its css_set. 4459 * @child isn't holding a reference to its css_set.
4456 */ 4460 */
4457 void cgroup_fork(struct task_struct *child) 4461 void cgroup_fork(struct task_struct *child)
4458 { 4462 {
4459 RCU_INIT_POINTER(child->cgroups, &init_css_set); 4463 RCU_INIT_POINTER(child->cgroups, &init_css_set);
4460 INIT_LIST_HEAD(&child->cg_list); 4464 INIT_LIST_HEAD(&child->cg_list);
4461 } 4465 }
4462 4466
4463 /** 4467 /**
4464 * cgroup_post_fork - called on a new task after adding it to the task list 4468 * cgroup_post_fork - called on a new task after adding it to the task list
4465 * @child: the task in question 4469 * @child: the task in question
4466 * 4470 *
4467 * Adds the task to the list running through its css_set if necessary and 4471 * Adds the task to the list running through its css_set if necessary and
4468 * calls the subsystem fork() callbacks. Has to be after the task is 4472 * calls the subsystem fork() callbacks. Has to be after the task is
4469 * visible on the task list in case we race with the first call to 4473 * visible on the task list in case we race with the first call to
4470 * cgroup_task_iter_start() - to guarantee that the new task ends up on its 4474 * cgroup_task_iter_start() - to guarantee that the new task ends up on its
4471 * list. 4475 * list.
4472 */ 4476 */
4473 void cgroup_post_fork(struct task_struct *child) 4477 void cgroup_post_fork(struct task_struct *child)
4474 { 4478 {
4475 struct cgroup_subsys *ss; 4479 struct cgroup_subsys *ss;
4476 int i; 4480 int i;
4477 4481
4478 /* 4482 /*
4479 * This may race against cgroup_enable_task_cg_lists(). As that 4483 * This may race against cgroup_enable_task_cg_lists(). As that
4480 * function sets use_task_css_set_links before grabbing 4484 * function sets use_task_css_set_links before grabbing
4481 * tasklist_lock and we just went through tasklist_lock to add 4485 * tasklist_lock and we just went through tasklist_lock to add
4482 * @child, it's guaranteed that either we see the set 4486 * @child, it's guaranteed that either we see the set
4483 * use_task_css_set_links or cgroup_enable_task_cg_lists() sees 4487 * use_task_css_set_links or cgroup_enable_task_cg_lists() sees
4484 * @child during its iteration. 4488 * @child during its iteration.
4485 * 4489 *
4486 * If we won the race, @child is associated with %current's 4490 * If we won the race, @child is associated with %current's
4487 * css_set. Grabbing css_set_rwsem guarantees both that the 4491 * css_set. Grabbing css_set_rwsem guarantees both that the
4488 * association is stable, and, on completion of the parent's 4492 * association is stable, and, on completion of the parent's
4489 * migration, @child is visible in the source of migration or 4493 * migration, @child is visible in the source of migration or
4490 * already in the destination cgroup. This guarantee is necessary 4494 * already in the destination cgroup. This guarantee is necessary
4491 * when implementing operations which need to migrate all tasks of 4495 * when implementing operations which need to migrate all tasks of
4492 * a cgroup to another. 4496 * a cgroup to another.
4493 * 4497 *
4494 * Note that if we lose to cgroup_enable_task_cg_lists(), @child 4498 * Note that if we lose to cgroup_enable_task_cg_lists(), @child
4495 * will remain in init_css_set. This is safe because all tasks are 4499 * will remain in init_css_set. This is safe because all tasks are
4496 * in the init_css_set before cg_links is enabled and there's no 4500 * in the init_css_set before cg_links is enabled and there's no
4497 * operation which transfers all tasks out of init_css_set. 4501 * operation which transfers all tasks out of init_css_set.
4498 */ 4502 */
4499 if (use_task_css_set_links) { 4503 if (use_task_css_set_links) {
4500 struct css_set *cset; 4504 struct css_set *cset;
4501 4505
4502 down_write(&css_set_rwsem); 4506 down_write(&css_set_rwsem);
4503 cset = task_css_set(current); 4507 cset = task_css_set(current);
4504 if (list_empty(&child->cg_list)) { 4508 if (list_empty(&child->cg_list)) {
4505 rcu_assign_pointer(child->cgroups, cset); 4509 rcu_assign_pointer(child->cgroups, cset);
4506 list_add(&child->cg_list, &cset->tasks); 4510 list_add(&child->cg_list, &cset->tasks);
4507 get_css_set(cset); 4511 get_css_set(cset);
4508 } 4512 }
4509 up_write(&css_set_rwsem); 4513 up_write(&css_set_rwsem);
4510 } 4514 }
4511 4515
4512 /* 4516 /*
4513 * Call ss->fork(). This must happen after @child is linked on 4517 * Call ss->fork(). This must happen after @child is linked on
4514 * css_set; otherwise, @child might change state between ->fork() 4518 * css_set; otherwise, @child might change state between ->fork()
4515 * and addition to css_set. 4519 * and addition to css_set.
4516 */ 4520 */
4517 if (need_forkexit_callback) { 4521 if (need_forkexit_callback) {
4518 for_each_subsys(ss, i) 4522 for_each_subsys(ss, i)
4519 if (ss->fork) 4523 if (ss->fork)
4520 ss->fork(child); 4524 ss->fork(child);
4521 } 4525 }
4522 } 4526 }
4523 4527
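The ->fork() callbacks invoked above are per-controller hooks with the signature implied by the call ss->fork(child). The snippet below is a hypothetical sketch (example_fork and example_cgrp_id are made-up names, not a real controller) of what such a handler might look like; it relies on the fact that @child is already linked on its css_set by the time ->fork() runs:

/*
 * Hypothetical ->fork() handler sketch.  "example_cgrp_id" is a
 * placeholder subsystem id; a real handler would do per-controller
 * bookkeeping for the new task here.
 */
static void example_fork(struct task_struct *task)
{
	struct cgroup_subsys_state *css;

	rcu_read_lock();
	css = task_css(task, example_cgrp_id);
	pr_debug("example: task %d forked into css %p\n",
		 task_pid_nr(task), css);
	rcu_read_unlock();
}
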
4524 /** 4528 /**
4525 * cgroup_exit - detach cgroup from exiting task 4529 * cgroup_exit - detach cgroup from exiting task
4526 * @tsk: pointer to task_struct of exiting process 4530 * @tsk: pointer to task_struct of exiting process
4527 * 4531 *
4528 * Description: Detach cgroup from @tsk and release it. 4532 * Description: Detach cgroup from @tsk and release it.
4529 * 4533 *
4530 * Note that cgroups marked notify_on_release force every task in 4534 * Note that cgroups marked notify_on_release force every task in
4531 * them to take the global cgroup_mutex mutex when exiting. 4535 * them to take the global cgroup_mutex mutex when exiting.
4532 * This could impact scaling on very large systems. Be reluctant to 4536 * This could impact scaling on very large systems. Be reluctant to
4533 * use notify_on_release cgroups where very high task exit scaling 4537 * use notify_on_release cgroups where very high task exit scaling
4534 * is required on large systems. 4538 * is required on large systems.
4535 * 4539 *
4536 * We set the exiting task's cgroup to the root cgroup (top_cgroup). We 4540 * We set the exiting task's cgroup to the root cgroup (top_cgroup). We
4537 * call cgroup_exit() while the task is still competent to handle 4541 * call cgroup_exit() while the task is still competent to handle
4538 * notify_on_release(), then leave the task attached to the root cgroup in 4542 * notify_on_release(), then leave the task attached to the root cgroup in
4539 * each hierarchy for the remainder of its exit. No need to bother with 4543 * each hierarchy for the remainder of its exit. No need to bother with
4540 * init_css_set refcnting. init_css_set never goes away and we can't race 4544 * init_css_set refcnting. init_css_set never goes away and we can't race
4541 * with migration path - PF_EXITING is visible to migration path. 4545 * with migration path - PF_EXITING is visible to migration path.
4542 */ 4546 */
4543 void cgroup_exit(struct task_struct *tsk) 4547 void cgroup_exit(struct task_struct *tsk)
4544 { 4548 {
4545 struct cgroup_subsys *ss; 4549 struct cgroup_subsys *ss;
4546 struct css_set *cset; 4550 struct css_set *cset;
4547 bool put_cset = false; 4551 bool put_cset = false;
4548 int i; 4552 int i;
4549 4553
4550 /* 4554 /*
4551 * Unlink @tsk from its css_set. As migration path can't race 4555 * Unlink @tsk from its css_set. As migration path can't race
4552 * with us, we can check cg_list without grabbing css_set_rwsem. 4556 * with us, we can check cg_list without grabbing css_set_rwsem.
4553 */ 4557 */
4554 if (!list_empty(&tsk->cg_list)) { 4558 if (!list_empty(&tsk->cg_list)) {
4555 down_write(&css_set_rwsem); 4559 down_write(&css_set_rwsem);
4556 list_del_init(&tsk->cg_list); 4560 list_del_init(&tsk->cg_list);
4557 up_write(&css_set_rwsem); 4561 up_write(&css_set_rwsem);
4558 put_cset = true; 4562 put_cset = true;
4559 } 4563 }
4560 4564
4561 /* Reassign the task to the init_css_set. */ 4565 /* Reassign the task to the init_css_set. */
4562 cset = task_css_set(tsk); 4566 cset = task_css_set(tsk);
4563 RCU_INIT_POINTER(tsk->cgroups, &init_css_set); 4567 RCU_INIT_POINTER(tsk->cgroups, &init_css_set);
4564 4568
4565 if (need_forkexit_callback) { 4569 if (need_forkexit_callback) {
4566 /* see cgroup_post_fork() for details */ 4570 /* see cgroup_post_fork() for details */
4567 for_each_subsys(ss, i) { 4571 for_each_subsys(ss, i) {
4568 if (ss->exit) { 4572 if (ss->exit) {
4569 struct cgroup_subsys_state *old_css = cset->subsys[i]; 4573 struct cgroup_subsys_state *old_css = cset->subsys[i];
4570 struct cgroup_subsys_state *css = task_css(tsk, i); 4574 struct cgroup_subsys_state *css = task_css(tsk, i);
4571 4575
4572 ss->exit(css, old_css, tsk); 4576 ss->exit(css, old_css, tsk);
4573 } 4577 }
4574 } 4578 }
4575 } 4579 }
4576 4580
4577 if (put_cset) 4581 if (put_cset)
4578 put_css_set(cset, true); 4582 put_css_set(cset, true);
4579 } 4583 }
4580 4584
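The ss->exit() callbacks above receive both the css the task is left attached to after the switch to init_css_set (@css) and the css it actually ran in (@old_css). A hypothetical sketch of a matching handler (again, "example" is not a real controller):

/*
 * Hypothetical ->exit() handler sketch matching the invocation in
 * cgroup_exit(): @old_css is the css the task belonged to while it
 * ran, @css is its css after the reassignment to init_css_set.
 */
static void example_exit(struct cgroup_subsys_state *css,
			 struct cgroup_subsys_state *old_css,
			 struct task_struct *task)
{
	/* uncharge or tear down per-task state against @old_css here */
	pr_debug("example: task %d exiting from css %p\n",
		 task_pid_nr(task), old_css);
}
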
4581 static void check_for_release(struct cgroup *cgrp) 4585 static void check_for_release(struct cgroup *cgrp)
4582 { 4586 {
4583 if (cgroup_is_releasable(cgrp) && 4587 if (cgroup_is_releasable(cgrp) &&
4584 list_empty(&cgrp->cset_links) && list_empty(&cgrp->children)) { 4588 list_empty(&cgrp->cset_links) && list_empty(&cgrp->children)) {
4585 /* 4589 /*
4586 * Control Group is currently removable. If it's not 4590 * Control Group is currently removable. If it's not
4587 * already queued for a userspace notification, queue 4591 * already queued for a userspace notification, queue
4588 * it now 4592 * it now
4589 */ 4593 */
4590 int need_schedule_work = 0; 4594 int need_schedule_work = 0;
4591 4595
4592 raw_spin_lock(&release_list_lock); 4596 raw_spin_lock(&release_list_lock);
4593 if (!cgroup_is_dead(cgrp) && 4597 if (!cgroup_is_dead(cgrp) &&
4594 list_empty(&cgrp->release_list)) { 4598 list_empty(&cgrp->release_list)) {
4595 list_add(&cgrp->release_list, &release_list); 4599 list_add(&cgrp->release_list, &release_list);
4596 need_schedule_work = 1; 4600 need_schedule_work = 1;
4597 } 4601 }
4598 raw_spin_unlock(&release_list_lock); 4602 raw_spin_unlock(&release_list_lock);
4599 if (need_schedule_work) 4603 if (need_schedule_work)
4600 schedule_work(&release_agent_work); 4604 schedule_work(&release_agent_work);
4601 } 4605 }
4602 } 4606 }
4603 4607
4604 /* 4608 /*
4605 * Notify userspace when a cgroup is released, by running the 4609 * Notify userspace when a cgroup is released, by running the
4606 * configured release agent with the name of the cgroup (path 4610 * configured release agent with the name of the cgroup (path
4607 * relative to the root of cgroup file system) as the argument. 4611 * relative to the root of cgroup file system) as the argument.
4608 * 4612 *
4609 * Most likely, this user command will try to rmdir this cgroup. 4613 * Most likely, this user command will try to rmdir this cgroup.
4610 * 4614 *
4611 * This races with the possibility that some other task will be 4615 * This races with the possibility that some other task will be
4612 * attached to this cgroup before it is removed, or that some other 4616 * attached to this cgroup before it is removed, or that some other
4613 * user task will 'mkdir' a child cgroup of this cgroup. That's ok. 4617 * user task will 'mkdir' a child cgroup of this cgroup. That's ok.
4614 * The presumed 'rmdir' will fail quietly if this cgroup is no longer 4618 * The presumed 'rmdir' will fail quietly if this cgroup is no longer
4615 * unused, and this cgroup will be reprieved from its death sentence, 4619 * unused, and this cgroup will be reprieved from its death sentence,
4616 * to continue to serve a useful existence. Next time it's released, 4620 * to continue to serve a useful existence. Next time it's released,
4617 * we will get notified again, if it still has 'notify_on_release' set. 4621 * we will get notified again, if it still has 'notify_on_release' set.
4618 * 4622 *
4619 * The final arg to call_usermodehelper() is UMH_WAIT_EXEC, which 4623 * The final arg to call_usermodehelper() is UMH_WAIT_EXEC, which
4620 * means only wait until the task is successfully execve()'d. The 4624 * means only wait until the task is successfully execve()'d. The
4621 * separate release agent task is forked by call_usermodehelper(), 4625 * separate release agent task is forked by call_usermodehelper(),
4622 * then control in this thread returns here, without waiting for the 4626 * then control in this thread returns here, without waiting for the
4623 * release agent task. We don't bother to wait because the caller of 4627 * release agent task. We don't bother to wait because the caller of
4624 * this routine has no use for the exit status of the release agent 4628 * this routine has no use for the exit status of the release agent
4625 * task, so no sense holding our caller up for that. 4629 * task, so no sense holding our caller up for that.
4626 */ 4630 */
4627 static void cgroup_release_agent(struct work_struct *work) 4631 static void cgroup_release_agent(struct work_struct *work)
4628 { 4632 {
4629 BUG_ON(work != &release_agent_work); 4633 BUG_ON(work != &release_agent_work);
4630 mutex_lock(&cgroup_mutex); 4634 mutex_lock(&cgroup_mutex);
4631 raw_spin_lock(&release_list_lock); 4635 raw_spin_lock(&release_list_lock);
4632 while (!list_empty(&release_list)) { 4636 while (!list_empty(&release_list)) {
4633 char *argv[3], *envp[3]; 4637 char *argv[3], *envp[3];
4634 int i; 4638 int i;
4635 char *pathbuf = NULL, *agentbuf = NULL, *path; 4639 char *pathbuf = NULL, *agentbuf = NULL, *path;
4636 struct cgroup *cgrp = list_entry(release_list.next, 4640 struct cgroup *cgrp = list_entry(release_list.next,
4637 struct cgroup, 4641 struct cgroup,
4638 release_list); 4642 release_list);
4639 list_del_init(&cgrp->release_list); 4643 list_del_init(&cgrp->release_list);
4640 raw_spin_unlock(&release_list_lock); 4644 raw_spin_unlock(&release_list_lock);
4641 pathbuf = kmalloc(PATH_MAX, GFP_KERNEL); 4645 pathbuf = kmalloc(PATH_MAX, GFP_KERNEL);
4642 if (!pathbuf) 4646 if (!pathbuf)
4643 goto continue_free; 4647 goto continue_free;
4644 path = cgroup_path(cgrp, pathbuf, PATH_MAX); 4648 path = cgroup_path(cgrp, pathbuf, PATH_MAX);
4645 if (!path) 4649 if (!path)
4646 goto continue_free; 4650 goto continue_free;
4647 agentbuf = kstrdup(cgrp->root->release_agent_path, GFP_KERNEL); 4651 agentbuf = kstrdup(cgrp->root->release_agent_path, GFP_KERNEL);
4648 if (!agentbuf) 4652 if (!agentbuf)
4649 goto continue_free; 4653 goto continue_free;
4650 4654
4651 i = 0; 4655 i = 0;
4652 argv[i++] = agentbuf; 4656 argv[i++] = agentbuf;
4653 argv[i++] = path; 4657 argv[i++] = path;
4654 argv[i] = NULL; 4658 argv[i] = NULL;
4655 4659
4656 i = 0; 4660 i = 0;
4657 /* minimal command environment */ 4661 /* minimal command environment */
4658 envp[i++] = "HOME=/"; 4662 envp[i++] = "HOME=/";
4659 envp[i++] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin"; 4663 envp[i++] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin";
4660 envp[i] = NULL; 4664 envp[i] = NULL;
4661 4665
4662 /* Drop the lock while we invoke the usermode helper, 4666 /* Drop the lock while we invoke the usermode helper,
4663 * since the exec could involve hitting disk and hence 4667 * since the exec could involve hitting disk and hence
4664 * be a slow process */ 4668 * be a slow process */
4665 mutex_unlock(&cgroup_mutex); 4669 mutex_unlock(&cgroup_mutex);
4666 call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC); 4670 call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC);
4667 mutex_lock(&cgroup_mutex); 4671 mutex_lock(&cgroup_mutex);
4668 continue_free: 4672 continue_free:
4669 kfree(pathbuf); 4673 kfree(pathbuf);
4670 kfree(agentbuf); 4674 kfree(agentbuf);
4671 raw_spin_lock(&release_list_lock); 4675 raw_spin_lock(&release_list_lock);
4672 } 4676 }
4673 raw_spin_unlock(&release_list_lock); 4677 raw_spin_unlock(&release_list_lock);
4674 mutex_unlock(&cgroup_mutex); 4678 mutex_unlock(&cgroup_mutex);
4675 } 4679 }
4676 4680
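As the comment above cgroup_release_agent() describes, the configured agent is exec'd with the released cgroup's path (relative to the hierarchy root, beginning with '/') as its single argument and only a minimal HOME/PATH environment. Below is a minimal userspace sketch of such an agent; the mount point is an assumption for illustration only, since the kernel does not pass it:

/*
 * Hypothetical release agent.  HIER_ROOT is a made-up mount point; a
 * real agent must know where its own hierarchy is mounted.  The rmdir
 * simply fails if the cgroup has been repopulated in the meantime,
 * which is the "reprieve" case described above.
 */
#include <errno.h>
#include <stdio.h>
#include <unistd.h>

#define HIER_ROOT "/sys/fs/cgroup/example"

int main(int argc, char **argv)
{
	char path[4096];

	if (argc < 2)
		return 1;

	/* argv[1] is the cgroup path from the hierarchy root, e.g. "/foo/bar" */
	snprintf(path, sizeof(path), "%s%s", HIER_ROOT, argv[1]);

	if (rmdir(path) && errno != ENOENT)
		perror("release_agent: rmdir");

	return 0;
}
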
4677 static int __init cgroup_disable(char *str) 4681 static int __init cgroup_disable(char *str)
4678 { 4682 {
4679 struct cgroup_subsys *ss; 4683 struct cgroup_subsys *ss;
4680 char *token; 4684 char *token;
4681 int i; 4685 int i;
4682 4686
4683 while ((token = strsep(&str, ",")) != NULL) { 4687 while ((token = strsep(&str, ",")) != NULL) {
4684 if (!*token) 4688 if (!*token)
4685 continue; 4689 continue;
4686 4690
4687 for_each_subsys(ss, i) { 4691 for_each_subsys(ss, i) {
4688 if (!strcmp(token, ss->name)) { 4692 if (!strcmp(token, ss->name)) {
4689 ss->disabled = 1; 4693 ss->disabled = 1;
4690 printk(KERN_INFO "Disabling %s control group" 4694 printk(KERN_INFO "Disabling %s control group"
4691 " subsystem\n", ss->name); 4695 " subsystem\n", ss->name);
4692 break; 4696 break;
4693 } 4697 }
4694 } 4698 }
4695 } 4699 }
4696 return 1; 4700 return 1;
4697 } 4701 }
4698 __setup("cgroup_disable=", cgroup_disable); 4702 __setup("cgroup_disable=", cgroup_disable);
4699 4703
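The handler above is wired to the cgroup_disable= boot parameter, which takes a comma-separated list of controller names matching ss->name. For instance, booting with

	cgroup_disable=memory

marks the memory controller disabled and logs "Disabling memory control group subsystem" during early init, per the printk in cgroup_disable() above.
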
4700 /** 4704 /**
4701 * css_tryget_from_dir - get corresponding css from the dentry of a cgroup dir 4705 * css_tryget_from_dir - get corresponding css from the dentry of a cgroup dir
4702 * @dentry: directory dentry of interest 4706 * @dentry: directory dentry of interest
4703 * @ss: subsystem of interest 4707 * @ss: subsystem of interest
4704 * 4708 *
4705 * If @dentry is a directory for a cgroup which has @ss enabled on it, try 4709 * If @dentry is a directory for a cgroup which has @ss enabled on it, try
4706 * to get the corresponding css and return it. If such a css doesn't exist 4710 * to get the corresponding css and return it. If such a css doesn't exist
4707 * or can't be pinned, an ERR_PTR value is returned. 4711 * or can't be pinned, an ERR_PTR value is returned.
4708 */ 4712 */
4709 struct cgroup_subsys_state *css_tryget_from_dir(struct dentry *dentry, 4713 struct cgroup_subsys_state *css_tryget_from_dir(struct dentry *dentry,
4710 struct cgroup_subsys *ss) 4714 struct cgroup_subsys *ss)
4711 { 4715 {
4712 struct kernfs_node *kn = kernfs_node_from_dentry(dentry); 4716 struct kernfs_node *kn = kernfs_node_from_dentry(dentry);
4713 struct cgroup_subsys_state *css = NULL; 4717 struct cgroup_subsys_state *css = NULL;
4714 struct cgroup *cgrp; 4718 struct cgroup *cgrp;
4715 4719
4716 /* is @dentry a cgroup dir? */ 4720 /* is @dentry a cgroup dir? */
4717 if (dentry->d_sb->s_type != &cgroup_fs_type || !kn || 4721 if (dentry->d_sb->s_type != &cgroup_fs_type || !kn ||
4718 kernfs_type(kn) != KERNFS_DIR) 4722 kernfs_type(kn) != KERNFS_DIR)
4719 return ERR_PTR(-EBADF); 4723 return ERR_PTR(-EBADF);
4720 4724
4721 rcu_read_lock(); 4725 rcu_read_lock();
4722 4726
4723 /* 4727 /*
4724 * This path doesn't originate from kernfs and @kn could already 4728 * This path doesn't originate from kernfs and @kn could already
4725 * have been or be removed at any point. @kn->priv is RCU 4729 * have been or be removed at any point. @kn->priv is RCU
4726 * protected for this access. See destroy_locked() for details. 4730 * protected for this access. See destroy_locked() for details.
4727 */ 4731 */
4728 cgrp = rcu_dereference(kn->priv); 4732 cgrp = rcu_dereference(kn->priv);
4729 if (cgrp) 4733 if (cgrp)
4730 css = cgroup_css(cgrp, ss); 4734 css = cgroup_css(cgrp, ss);
4731 4735
4732 if (!css || !css_tryget(css)) 4736 if (!css || !css_tryget(css))
4733 css = ERR_PTR(-ENOENT); 4737 css = ERR_PTR(-ENOENT);
4734 4738
4735 rcu_read_unlock(); 4739 rcu_read_unlock();
4736 return css; 4740 return css;
4737 } 4741 }
4738 4742
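A hypothetical in-kernel caller might resolve an open cgroup directory file descriptor to the css of a particular subsystem along these lines (example_cgrp_subsys is a placeholder and error handling is abbreviated; this is a sketch, not an existing user):

/*
 * Hypothetical caller sketch for css_tryget_from_dir().  On success the
 * caller owns a reference to the returned css and must drop it with
 * css_put() when done.
 */
static struct cgroup_subsys_state *example_css_from_fd(int fd)
{
	struct cgroup_subsys_state *css;
	struct fd f = fdget(fd);

	if (!f.file)
		return ERR_PTR(-EBADF);

	css = css_tryget_from_dir(f.file->f_path.dentry, &example_cgrp_subsys);
	fdput(f);
	return css;
}
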
4739 /** 4743 /**
4740 * css_from_id - lookup css by id 4744 * css_from_id - lookup css by id
4741 * @id: the cgroup id 4745 * @id: the cgroup id
4742 * @ss: cgroup subsys to be looked into 4746 * @ss: cgroup subsys to be looked into
4743 * 4747 *
4744 * Returns the css if there's a valid one with @id, otherwise returns NULL. 4748 * Returns the css if there's a valid one with @id, otherwise returns NULL.
4745 * Should be called under rcu_read_lock(). 4749 * Should be called under rcu_read_lock().
4746 */ 4750 */
4747 struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss) 4751 struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss)
4748 { 4752 {
4749 struct cgroup *cgrp; 4753 struct cgroup *cgrp;
4750 4754
4751 cgroup_assert_mutexes_or_rcu_locked(); 4755 cgroup_assert_mutexes_or_rcu_locked();
4752 4756
4753 cgrp = idr_find(&ss->root->cgroup_idr, id); 4757 cgrp = idr_find(&ss->root->cgroup_idr, id);
4754 if (cgrp) 4758 if (cgrp)
4755 return cgroup_css(cgrp, ss); 4759 return cgroup_css(cgrp, ss);
4756 return NULL; 4760 return NULL;
4757 } 4761 }
4758 4762
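As the comment notes, css_from_id() must be called under rcu_read_lock() and the returned css is not pinned; a hypothetical lookup helper might pair it with css_tryget(), much like css_tryget_from_dir() above (example_pin_css_by_id and example_cgrp_subsys are placeholder names):

/*
 * Hypothetical lookup sketch: find a css by id under RCU and pin it.
 * Returns NULL if no live css with @id exists for the placeholder
 * subsystem example_cgrp_subsys.
 */
static struct cgroup_subsys_state *example_pin_css_by_id(int id)
{
	struct cgroup_subsys_state *css;

	rcu_read_lock();
	css = css_from_id(id, &example_cgrp_subsys);
	if (css && !css_tryget(css))
		css = NULL;
	rcu_read_unlock();

	return css;
}
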
4759 #ifdef CONFIG_CGROUP_DEBUG 4763 #ifdef CONFIG_CGROUP_DEBUG
4760 static struct cgroup_subsys_state * 4764 static struct cgroup_subsys_state *
4761 debug_css_alloc(struct cgroup_subsys_state *parent_css) 4765 debug_css_alloc(struct cgroup_subsys_state *parent_css)
4762 { 4766 {
4763 struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL); 4767 struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL);
4764 4768
4765 if (!css) 4769 if (!css)
4766 return ERR_PTR(-ENOMEM); 4770 return ERR_PTR(-ENOMEM);
4767 4771
4768 return css; 4772 return css;
4769 } 4773 }
4770 4774
4771 static void debug_css_free(struct cgroup_subsys_state *css) 4775 static void debug_css_free(struct cgroup_subsys_state *css)
4772 { 4776 {
4773 kfree(css); 4777 kfree(css);
4774 } 4778 }
4775 4779
4776 static u64 debug_taskcount_read(struct cgroup_subsys_state *css, 4780 static u64 debug_taskcount_read(struct cgroup_subsys_state *css,
4777 struct cftype *cft) 4781 struct cftype *cft)
4778 { 4782 {
4779 return cgroup_task_count(css->cgroup); 4783 return cgroup_task_count(css->cgroup);
4780 } 4784 }
4781 4785
4782 static u64 current_css_set_read(struct cgroup_subsys_state *css, 4786 static u64 current_css_set_read(struct cgroup_subsys_state *css,
4783 struct cftype *cft) 4787 struct cftype *cft)
4784 { 4788 {
4785 return (u64)(unsigned long)current->cgroups; 4789 return (u64)(unsigned long)current->cgroups;
4786 } 4790 }
4787 4791
4788 static u64 current_css_set_refcount_read(struct cgroup_subsys_state *css, 4792 static u64 current_css_set_refcount_read(struct cgroup_subsys_state *css,
4789 struct cftype *cft) 4793 struct cftype *cft)
4790 { 4794 {
4791 u64 count; 4795 u64 count;
4792 4796
4793 rcu_read_lock(); 4797 rcu_read_lock();
4794 count = atomic_read(&task_css_set(current)->refcount); 4798 count = atomic_read(&task_css_set(current)->refcount);
4795 rcu_read_unlock(); 4799 rcu_read_unlock();
4796 return count; 4800 return count;
4797 } 4801 }
4798 4802
4799 static int current_css_set_cg_links_read(struct seq_file *seq, void *v) 4803 static int current_css_set_cg_links_read(struct seq_file *seq, void *v)
4800 { 4804 {
4801 struct cgrp_cset_link *link; 4805 struct cgrp_cset_link *link;
4802 struct css_set *cset; 4806 struct css_set *cset;
4803 char *name_buf; 4807 char *name_buf;
4804 4808
4805 name_buf = kmalloc(NAME_MAX + 1, GFP_KERNEL); 4809 name_buf = kmalloc(NAME_MAX + 1, GFP_KERNEL);
4806 if (!name_buf) 4810 if (!name_buf)
4807 return -ENOMEM; 4811 return -ENOMEM;
4808 4812
4809 down_read(&css_set_rwsem); 4813 down_read(&css_set_rwsem);
4810 rcu_read_lock(); 4814 rcu_read_lock();
4811 cset = rcu_dereference(current->cgroups); 4815 cset = rcu_dereference(current->cgroups);
4812 list_for_each_entry(link, &cset->cgrp_links, cgrp_link) { 4816 list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
4813 struct cgroup *c = link->cgrp; 4817 struct cgroup *c = link->cgrp;
4814 4818
4815 cgroup_name(c, name_buf, NAME_MAX + 1); 4819 cgroup_name(c, name_buf, NAME_MAX + 1);
4816 seq_printf(seq, "Root %d group %s\n", 4820 seq_printf(seq, "Root %d group %s\n",
4817 c->root->hierarchy_id, name_buf); 4821 c->root->hierarchy_id, name_buf);
4818 } 4822 }
4819 rcu_read_unlock(); 4823 rcu_read_unlock();
4820 up_read(&css_set_rwsem); 4824 up_read(&css_set_rwsem);
4821 kfree(name_buf); 4825 kfree(name_buf);
4822 return 0; 4826 return 0;
4823 } 4827 }
4824 4828
4825 #define MAX_TASKS_SHOWN_PER_CSS 25 4829 #define MAX_TASKS_SHOWN_PER_CSS 25
4826 static int cgroup_css_links_read(struct seq_file *seq, void *v) 4830 static int cgroup_css_links_read(struct seq_file *seq, void *v)
4827 { 4831 {
4828 struct cgroup_subsys_state *css = seq_css(seq); 4832 struct cgroup_subsys_state *css = seq_css(seq);
4829 struct cgrp_cset_link *link; 4833 struct cgrp_cset_link *link;
4830 4834
4831 down_read(&css_set_rwsem); 4835 down_read(&css_set_rwsem);
4832 list_for_each_entry(link, &css->cgroup->cset_links, cset_link) { 4836 list_for_each_entry(link, &css->cgroup->cset_links, cset_link) {
4833 struct css_set *cset = link->cset; 4837 struct css_set *cset = link->cset;
4834 struct task_struct *task; 4838 struct task_struct *task;
4835 int count = 0; 4839 int count = 0;
4836 4840
4837 seq_printf(seq, "css_set %p\n", cset); 4841 seq_printf(seq, "css_set %p\n", cset);
4838 4842
4839 list_for_each_entry(task, &cset->tasks, cg_list) { 4843 list_for_each_entry(task, &cset->tasks, cg_list) {
4840 if (count++ > MAX_TASKS_SHOWN_PER_CSS) 4844 if (count++ > MAX_TASKS_SHOWN_PER_CSS)
4841 goto overflow; 4845 goto overflow;
4842 seq_printf(seq, " task %d\n", task_pid_vnr(task)); 4846 seq_printf(seq, " task %d\n", task_pid_vnr(task));
4843 } 4847 }
4844 4848
4845 list_for_each_entry(task, &cset->mg_tasks, cg_list) { 4849 list_for_each_entry(task, &cset->mg_tasks, cg_list) {
4846 if (count++ > MAX_TASKS_SHOWN_PER_CSS) 4850 if (count++ > MAX_TASKS_SHOWN_PER_CSS)
4847 goto overflow; 4851 goto overflow;
4848 seq_printf(seq, " task %d\n", task_pid_vnr(task)); 4852 seq_printf(seq, " task %d\n", task_pid_vnr(task));
4849 } 4853 }
4850 continue; 4854 continue;
4851 overflow: 4855 overflow:
4852 seq_puts(seq, " ...\n"); 4856 seq_puts(seq, " ...\n");
4853 } 4857 }
4854 up_read(&css_set_rwsem); 4858 up_read(&css_set_rwsem);
4855 return 0; 4859 return 0;
4856 } 4860 }
4857 4861
4858 static u64 releasable_read(struct cgroup_subsys_state *css, struct cftype *cft) 4862 static u64 releasable_read(struct cgroup_subsys_state *css, struct cftype *cft)
4859 { 4863 {
4860 return test_bit(CGRP_RELEASABLE, &css->cgroup->flags); 4864 return test_bit(CGRP_RELEASABLE, &css->cgroup->flags);
4861 } 4865 }
4862 4866
4863 static struct cftype debug_files[] = { 4867 static struct cftype debug_files[] = {
4864 { 4868 {
4865 .name = "taskcount", 4869 .name = "taskcount",
4866 .read_u64 = debug_taskcount_read, 4870 .read_u64 = debug_taskcount_read,
4867 }, 4871 },
4868 4872
4869 { 4873 {
4870 .name = "current_css_set", 4874 .name = "current_css_set",
4871 .read_u64 = current_css_set_read, 4875 .read_u64 = current_css_set_read,
4872 }, 4876 },
4873 4877
4874 { 4878 {
4875 .name = "current_css_set_refcount", 4879 .name = "current_css_set_refcount",
4876 .read_u64 = current_css_set_refcount_read, 4880 .read_u64 = current_css_set_refcount_read,
4877 }, 4881 },
4878 4882
4879 { 4883 {
4880 .name = "current_css_set_cg_links", 4884 .name = "current_css_set_cg_links",
4881 .seq_show = current_css_set_cg_links_read, 4885 .seq_show = current_css_set_cg_links_read,
4882 }, 4886 },
4883 4887
4884 { 4888 {
4885 .name = "cgroup_css_links", 4889 .name = "cgroup_css_links",
4886 .seq_show = cgroup_css_links_read, 4890 .seq_show = cgroup_css_links_read,
4887 }, 4891 },
4888 4892
4889 { 4893 {
4890 .name = "releasable", 4894 .name = "releasable",
4891 .read_u64 = releasable_read, 4895 .read_u64 = releasable_read,
4892 }, 4896 },
4893 4897
4894 { } /* terminate */ 4898 { } /* terminate */
4895 }; 4899 };
4896 4900
4897 struct cgroup_subsys debug_cgrp_subsys = { 4901 struct cgroup_subsys debug_cgrp_subsys = {
4898 .css_alloc = debug_css_alloc, 4902 .css_alloc = debug_css_alloc,
4899 .css_free = debug_css_free, 4903 .css_free = debug_css_free,
4900 .base_cftypes = debug_files, 4904 .base_cftypes = debug_files,
4901 }; 4905 };
4902 #endif /* CONFIG_CGROUP_DEBUG */ 4906 #endif /* CONFIG_CGROUP_DEBUG */
4903 4907