kernel/cgroup.c

  /*
   *  Generic process-grouping system.
   *
   *  Based originally on the cpuset system, extracted by Paul Menage
   *  Copyright (C) 2006 Google, Inc
   *
   *  Copyright notices from the original cpuset code:
   *  --------------------------------------------------
   *  Copyright (C) 2003 BULL SA.
   *  Copyright (C) 2004-2006 Silicon Graphics, Inc.
   *
   *  Portions derived from Patrick Mochel's sysfs code.
   *  sysfs is Copyright (c) 2001-3 Patrick Mochel
   *
   *  2003-10-10 Written by Simon Derr.
   *  2003-10-22 Updates by Stephen Hemminger.
   *  2004 May-July Rework by Paul Jackson.
   *  ---------------------------------------------------
   *
   *  This file is subject to the terms and conditions of the GNU General Public
   *  License.  See the file COPYING in the main directory of the Linux
   *  distribution for more details.
   */
  
  #include <linux/cgroup.h>
  #include <linux/errno.h>
  #include <linux/fs.h>
  #include <linux/kernel.h>
  #include <linux/list.h>
  #include <linux/mm.h>
  #include <linux/mutex.h>
  #include <linux/mount.h>
  #include <linux/pagemap.h>
  #include <linux/proc_fs.h>
  #include <linux/rcupdate.h>
  #include <linux/sched.h>
  #include <linux/backing-dev.h>
  #include <linux/seq_file.h>
  #include <linux/slab.h>
  #include <linux/magic.h>
  #include <linux/spinlock.h>
  #include <linux/string.h>
  #include <linux/sort.h>
  #include <linux/kmod.h>
  #include <linux/delayacct.h>
  #include <linux/cgroupstats.h>
  #include <linux/hash.h>
  #include <linux/namei.h>

  #include <asm/atomic.h>
  static DEFINE_MUTEX(cgroup_mutex);
  /* Generate an array of cgroup subsystem pointers */
  #define SUBSYS(_x) &_x ## _subsys,
  
  static struct cgroup_subsys *subsys[] = {
  #include <linux/cgroup_subsys.h>
  };
  
  /*
   * A cgroupfs_root represents the root of a cgroup hierarchy,
   * and may be associated with a superblock to form an active
   * hierarchy
   */
  struct cgroupfs_root {
  	struct super_block *sb;
  
  	/*
  	 * The bitmask of subsystems intended to be attached to this
  	 * hierarchy
  	 */
  	unsigned long subsys_bits;
  
  	/* The bitmask of subsystems currently attached to this hierarchy */
  	unsigned long actual_subsys_bits;
  
  	/* A list running through the attached subsystems */
  	struct list_head subsys_list;
  
  	/* The root cgroup for this hierarchy */
  	struct cgroup top_cgroup;
  
  	/* Tracks how many cgroups are currently defined in hierarchy.*/
  	int number_of_cgroups;
  	/* A list running through the active hierarchies */
  	struct list_head root_list;
  
  	/* Hierarchy-specific flags */
  	unsigned long flags;

  	/* The path to use for release notifications. */
  	char release_agent_path[PATH_MAX];
  };
  /*
   * The "rootnode" hierarchy is the "dummy hierarchy", reserved for the
   * subsystems that are otherwise unattached - it never has more than a
   * single cgroup, and all tasks are part of that cgroup.
   */
  static struct cgroupfs_root rootnode;
  /*
   * CSS ID -- ID per subsys's Cgroup Subsys State (CSS). Used only when
   * cgroup_subsys->use_id != 0.
   */
  #define CSS_ID_MAX	(65535)
  struct css_id {
  	/*
  	 * The css to which this ID points. This pointer is set to a valid value
  	 * after the cgroup is populated. If the cgroup is removed, this will be
  	 * NULL. This pointer is expected to be RCU-safe because destroy()
  	 * is called after synchronize_rcu(). But for safe use, css_is_removed()
  	 * or css_tryget() should be used to avoid races.
  	 */
  	struct cgroup_subsys_state *css;
  	/*
  	 * ID of this css.
  	 */
  	unsigned short id;
  	/*
  	 * Depth in the hierarchy to which this ID belongs.
  	 */
  	unsigned short depth;
  	/*
  	 * ID is freed by RCU. (and lookup routine is RCU safe.)
  	 */
  	struct rcu_head rcu_head;
  	/*
  	 * Hierarchy of CSS IDs this ID belongs to.
  	 */
  	unsigned short stack[0]; /* Array of Length (depth+1) */
  };
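  /*
   * Illustrative note (a sketch, not code from this file): stack[] holds the
   * IDs along the path from the root, so an ancestry test can be a single
   * comparison, roughly
   *
   *	child_id->stack[parent_id->depth] == parent_id->id
   *
   * The helpers that actually use this live with the rest of the CSS ID code
   * later in this file.
   */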
  /* The list of hierarchy roots */
  
  static LIST_HEAD(roots);
  static int root_count;
  
  /* dummytop is a shorthand for the dummy hierarchy's top cgroup */
  #define dummytop (&rootnode.top_cgroup)
  
  /* This flag indicates whether tasks in the fork and exit paths should
   * check for fork/exit handlers to call. This avoids us having to do
   * extra work in the fork/exit path if none of the subsystems need to
   * be called.
   */
  static int need_forkexit_callback __read_mostly;

  /* convenient tests for these bits */
  inline int cgroup_is_removed(const struct cgroup *cgrp)
  {
  	return test_bit(CGRP_REMOVED, &cgrp->flags);
  }
  
  /* bits in struct cgroupfs_root flags field */
  enum {
  	ROOT_NOPREFIX, /* mounted subsystems have no named prefix */
  };
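  /*
   * Note: ROOT_NOPREFIX corresponds to the "noprefix" mount option parsed
   * below; with it set, per-subsystem control files are created without the
   * "<subsys>." prefix (e.g. "cpus" rather than "cpuset.cpus").
   */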
  static int cgroup_is_releasable(const struct cgroup *cgrp)
  {
  	const int bits =
  		(1 << CGRP_RELEASABLE) |
  		(1 << CGRP_NOTIFY_ON_RELEASE);
  	return (cgrp->flags & bits) == bits;
  }
  static int notify_on_release(const struct cgroup *cgrp)
  {
  	return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
  }
  /*
   * for_each_subsys() allows you to iterate on each subsystem attached to
   * an active hierarchy
   */
  #define for_each_subsys(_root, _ss) \
  list_for_each_entry(_ss, &_root->subsys_list, sibling)
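  /*
   * Illustrative usage sketch (the same pattern as the callers later in this
   * file):
   *
   *	struct cgroup_subsys *ss;
   *
   *	for_each_subsys(cgrp->root, ss)
   *		printk(KERN_DEBUG "bound subsystem: %s\n", ss->name);
   */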
  /* for_each_active_root() allows you to iterate across the active hierarchies */
  #define for_each_active_root(_root) \
  list_for_each_entry(_root, &roots, root_list)
  /* the list of cgroups eligible for automatic release. Protected by
   * release_list_lock */
  static LIST_HEAD(release_list);
  static DEFINE_SPINLOCK(release_list_lock);
  static void cgroup_release_agent(struct work_struct *work);
  static DECLARE_WORK(release_agent_work, cgroup_release_agent);
  static void check_for_release(struct cgroup *cgrp);

  /* Link structure for associating css_set objects with cgroups */
  struct cg_cgroup_link {
  	/*
  	 * List running through cg_cgroup_links associated with a
  	 * cgroup, anchored on cgroup->css_sets
  	 */
  	struct list_head cgrp_link_list;
  	/*
  	 * List running through cg_cgroup_links pointing at a
  	 * single css_set object, anchored on css_set->cg_links
  	 */
  	struct list_head cg_link_list;
  	struct css_set *cg;
  };
  
  /* The default css_set - used by init and its children prior to any
   * hierarchies being mounted. It contains a pointer to the root state
   * for each subsystem. Also used to anchor the list of css_sets. Not
   * reference-counted, to improve performance when child cgroups
   * haven't been created.
   */
  
  static struct css_set init_css_set;
  static struct cg_cgroup_link init_css_set_link;
  static int cgroup_subsys_init_idr(struct cgroup_subsys *ss);
  /* css_set_lock protects the list of css_set objects, and the
   * chain of tasks off each css_set.  Nests outside task->alloc_lock
   * due to cgroup_iter_start() */
  static DEFINE_RWLOCK(css_set_lock);
  static int css_set_count;
  /* hash table for cgroup groups. This improves the performance of
   * finding an existing css_set */
  #define CSS_SET_HASH_BITS	7
  #define CSS_SET_TABLE_SIZE	(1 << CSS_SET_HASH_BITS)
  static struct hlist_head css_set_table[CSS_SET_TABLE_SIZE];
  
  static struct hlist_head *css_set_hash(struct cgroup_subsys_state *css[])
  {
  	int i;
  	int index;
  	unsigned long tmp = 0UL;
  
  	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++)
  		tmp += (unsigned long)css[i];
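  	/* fold the high-order bits of the pointer sum into the low-order bits */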
  	tmp = (tmp >> 16) ^ tmp;
  
  	index = hash_long(tmp, CSS_SET_HASH_BITS);
  
  	return &css_set_table[index];
  }
  /* We don't maintain the lists running through each css_set to its
   * tasks until after the first call to cgroup_iter_start(). This
   * reduces the fork()/exit() overhead for people who have cgroups
   * compiled into their kernel but not actually in use */
  static int use_task_css_set_links __read_mostly;
  
  /* When we create or destroy a css_set, the operation simply
   * takes/releases a reference count on all the cgroups referenced
   * by subsystems in this css_set. This can end up multiple-counting
   * some cgroups, but that's OK - the ref-count is just a
   * busy/not-busy indicator; ensuring that we only count each cgroup
   * once would require taking a global lock to ensure that no
   * subsystems moved between hierarchies while we were doing so.
   *
   * Possible TODO: decide at boot time based on the number of
   * registered subsystems and the number of CPUs or NUMA nodes whether
   * it's better for performance to ref-count every subsystem, or to
   * take a global lock and only add one ref count to each hierarchy.
   */
  
  /*
   * unlink a css_set from the list and free it
   */
  static void unlink_css_set(struct css_set *cg)
  {
  	struct cg_cgroup_link *link;
  	struct cg_cgroup_link *saved_link;
  	hlist_del(&cg->hlist);
  	css_set_count--;
  
  	list_for_each_entry_safe(link, saved_link, &cg->cg_links,
  				 cg_link_list) {
  		list_del(&link->cg_link_list);
  		list_del(&link->cgrp_link_list);
  		kfree(link);
  	}
  }
  static void __put_css_set(struct css_set *cg, int taskexit)
  {
  	int i;
  	/*
  	 * Ensure that the refcount doesn't hit zero while any readers
  	 * can see it. Similar to atomic_dec_and_lock(), but for an
  	 * rwlock
  	 */
  	if (atomic_add_unless(&cg->refcount, -1, 1))
  		return;
  	write_lock(&css_set_lock);
  	if (!atomic_dec_and_test(&cg->refcount)) {
  		write_unlock(&css_set_lock);
  		return;
  	}
  	unlink_css_set(cg);
  	write_unlock(&css_set_lock);
  
  	rcu_read_lock();
  	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
  		struct cgroup *cgrp = rcu_dereference(cg->subsys[i]->cgroup);
  		if (atomic_dec_and_test(&cgrp->count) &&
  		    notify_on_release(cgrp)) {
  			if (taskexit)
  				set_bit(CGRP_RELEASABLE, &cgrp->flags);
  			check_for_release(cgrp);
  		}
  	}
  	rcu_read_unlock();
  	kfree(cg);
  }
  /*
   * refcounted get/put for css_set objects
   */
  static inline void get_css_set(struct css_set *cg)
  {
  	atomic_inc(&cg->refcount);
  }
  
  static inline void put_css_set(struct css_set *cg)
  {
  	__put_css_set(cg, 0);
  }
  static inline void put_css_set_taskexit(struct css_set *cg)
  {
  	__put_css_set(cg, 1);
  }
  /*
   * find_existing_css_set() is a helper for
   * find_css_set(), and checks to see whether an existing
   * css_set is suitable.
   *
   * oldcg: the cgroup group that we're using before the cgroup
   * transition
   *
   * cgrp: the cgroup that we're moving into
   *
   * template: location in which to build the desired set of subsystem
   * state objects for the new cgroup group
   */
  static struct css_set *find_existing_css_set(
  	struct css_set *oldcg,
  	struct cgroup *cgrp,
  	struct cgroup_subsys_state *template[])
  {
  	int i;
  	struct cgroupfs_root *root = cgrp->root;
  	struct hlist_head *hhead;
  	struct hlist_node *node;
  	struct css_set *cg;
  
  	/* Built the set of subsystem state objects that we want to
  	 * see in the new css_set */
  	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
  		if (root->subsys_bits & (1UL << i)) {
  			/* Subsystem is in this hierarchy. So we want
  			 * the subsystem state from the new
  			 * cgroup */
  			template[i] = cgrp->subsys[i];
  		} else {
  			/* Subsystem is not in this hierarchy, so we
  			 * don't want to change the subsystem state */
  			template[i] = oldcg->subsys[i];
  		}
  	}
  	hhead = css_set_hash(template);
  	hlist_for_each_entry(cg, node, hhead, hlist) {
  		if (!memcmp(template, cg->subsys, sizeof(cg->subsys))) {
  			/* All subsystems matched */
  			return cg;
  		}
  	}
  
  	/* No existing cgroup group matched */
  	return NULL;
  }
  static void free_cg_links(struct list_head *tmp)
  {
  	struct cg_cgroup_link *link;
  	struct cg_cgroup_link *saved_link;
  
  	list_for_each_entry_safe(link, saved_link, tmp, cgrp_link_list) {
  		list_del(&link->cgrp_link_list);
  		kfree(link);
  	}
  }
  /*
   * allocate_cg_links() allocates "count" cg_cgroup_link structures
   * and chains them on tmp through their cgrp_link_list fields. Returns 0 on
   * success or a negative error
   */
  static int allocate_cg_links(int count, struct list_head *tmp)
  {
  	struct cg_cgroup_link *link;
  	int i;
  	INIT_LIST_HEAD(tmp);
  	for (i = 0; i < count; i++) {
  		link = kmalloc(sizeof(*link), GFP_KERNEL);
  		if (!link) {
  			free_cg_links(tmp);
  			return -ENOMEM;
  		}
  		list_add(&link->cgrp_link_list, tmp);
  	}
  	return 0;
  }
  /**
   * link_css_set - a helper function to link a css_set to a cgroup
   * @tmp_cg_links: cg_cgroup_link objects allocated by allocate_cg_links()
   * @cg: the css_set to be linked
   * @cgrp: the destination cgroup
   */
  static void link_css_set(struct list_head *tmp_cg_links,
  			 struct css_set *cg, struct cgroup *cgrp)
  {
  	struct cg_cgroup_link *link;
  
  	BUG_ON(list_empty(tmp_cg_links));
  	link = list_first_entry(tmp_cg_links, struct cg_cgroup_link,
  				cgrp_link_list);
  	link->cg = cg;
  	list_move(&link->cgrp_link_list, &cgrp->css_sets);
  	list_add(&link->cg_link_list, &cg->cg_links);
  }
  /*
   * find_css_set() takes an existing cgroup group and a
   * cgroup object, and returns a css_set object that's
   * equivalent to the old group, but with the given cgroup
   * substituted into the appropriate hierarchy. Must be called with
   * cgroup_mutex held
   */
  static struct css_set *find_css_set(
  	struct css_set *oldcg, struct cgroup *cgrp)
  {
  	struct css_set *res;
  	struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT];
  	int i;
  
  	struct list_head tmp_cg_links;

  	struct hlist_head *hhead;
  	/* First see if we already have a cgroup group that matches
  	 * the desired set */
  	read_lock(&css_set_lock);
  	res = find_existing_css_set(oldcg, cgrp, template);
  	if (res)
  		get_css_set(res);
  	read_unlock(&css_set_lock);
  
  	if (res)
  		return res;
  
  	res = kmalloc(sizeof(*res), GFP_KERNEL);
  	if (!res)
  		return NULL;
  
  	/* Allocate all the cg_cgroup_link objects that we'll need */
  	if (allocate_cg_links(root_count, &tmp_cg_links) < 0) {
  		kfree(res);
  		return NULL;
  	}
  	atomic_set(&res->refcount, 1);
  	INIT_LIST_HEAD(&res->cg_links);
  	INIT_LIST_HEAD(&res->tasks);
  	INIT_HLIST_NODE(&res->hlist);
  
  	/* Copy the set of subsystem state objects generated in
  	 * find_existing_css_set() */
  	memcpy(res->subsys, template, sizeof(res->subsys));
  
  	write_lock(&css_set_lock);
  	/* Add reference counts and links from the new css_set. */
  	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
  		struct cgroup *cgrp = res->subsys[i]->cgroup;
  		struct cgroup_subsys *ss = subsys[i];
  		atomic_inc(&cgrp->count);
  		/*
  		 * We want to add a link once per cgroup, so we
  		 * only do it for the first subsystem in each
  		 * hierarchy
  		 */
  		if (ss->root->subsys_list.next == &ss->sibling)
  			link_css_set(&tmp_cg_links, res, cgrp);
  	}
  	if (list_empty(&rootnode.subsys_list))
  		link_css_set(&tmp_cg_links, res, dummytop);
  
  	BUG_ON(!list_empty(&tmp_cg_links));
  	css_set_count++;
  
  	/* Add this cgroup group to the hash table */
  	hhead = css_set_hash(res->subsys);
  	hlist_add_head(&res->hlist, hhead);
  	write_unlock(&css_set_lock);
  
  	return res;
  }
  /*
   * There is one global cgroup mutex. We also require taking
   * task_lock() when dereferencing a task's cgroup subsys pointers.
   * See "The task_lock() exception", at the end of this comment.
   *
   * A task must hold cgroup_mutex to modify cgroups.
   *
   * Any task can increment and decrement the count field without lock.
   * So in general, code holding cgroup_mutex can't rely on the count
   * field not changing.  However, if the count goes to zero, then only
   * cgroup_attach_task() can increment it again.  Because a count of zero
   * means that no tasks are currently attached, therefore there is no
   * way a task attached to that cgroup can fork (the other way to
   * increment the count).  So code holding cgroup_mutex can safely
   * assume that if the count is zero, it will stay zero. Similarly, if
   * a task holds cgroup_mutex on a cgroup with zero count, it
   * knows that the cgroup won't be removed, as cgroup_rmdir()
   * needs that mutex.
   *
   * The fork and exit callbacks, cgroup_fork() and cgroup_exit(), don't
   * (usually) take cgroup_mutex.  These are the two most performance
   * critical pieces of code here.  The exception occurs on cgroup_exit(),
   * when a task in a notify_on_release cgroup exits.  Then cgroup_mutex
   * is taken, and if the cgroup count is zero, a usermode call is made
   * to the release agent with the name of the cgroup (path relative to
   * the root of cgroup file system) as the argument.
   *
   * A cgroup can only be deleted if both its 'count' of using tasks
   * is zero, and its list of 'children' cgroups is empty.  Since all
   * tasks in the system use _some_ cgroup, and since there is always at
   * least one task in the system (init, pid == 1), therefore, top_cgroup
   * always has either child cgroups and/or using tasks.  So we don't
   * need a special hack to ensure that top_cgroup cannot be deleted.
   *
   *	The task_lock() exception
   *
   * The need for this exception arises from the action of
   * cgroup_attach_task(), which overwrites one task's cgroup pointer with
   * another.  It does so using cgroup_mutex; however, there are
   * several performance critical places that need to reference
   * task->cgroup without the expense of grabbing a system global
   * mutex.  Therefore except as noted below, when dereferencing or, as
   * in cgroup_attach_task(), modifying a task's cgroup pointer we use
   * task_lock(), which acts on a spinlock (task->alloc_lock) already in
   * the task_struct routinely used for such matters.
   *
   * P.S.  One more locking exception.  RCU is used to guard the
   * update of a task's cgroup pointer by cgroup_attach_task().
   */
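  /*
   * Illustrative sketch of the task_lock() pattern described above; this is
   * the same sequence cgroup_attach_task() uses below to pin a task's
   * css_set:
   *
   *	task_lock(tsk);
   *	cg = tsk->cgroups;
   *	get_css_set(cg);
   *	task_unlock(tsk);
   */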
  /**
   * cgroup_lock - lock out any changes to cgroup structures
   *
   */
  void cgroup_lock(void)
  {
  	mutex_lock(&cgroup_mutex);
  }
  
  /**
   * cgroup_unlock - release lock on cgroup changes
   *
   * Undo the lock taken in a previous cgroup_lock() call.
   */
  void cgroup_unlock(void)
  {
  	mutex_unlock(&cgroup_mutex);
  }
  
  /*
   * A couple of forward declarations required, due to cyclic reference loop:
   * cgroup_mkdir -> cgroup_create -> cgroup_populate_dir ->
   * cgroup_add_file -> cgroup_create_file -> cgroup_dir_inode_operations
   * -> cgroup_mkdir.
   */
  
  static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, int mode);
  static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry);
  static int cgroup_populate_dir(struct cgroup *cgrp);
  static struct inode_operations cgroup_dir_inode_operations;
  static struct file_operations proc_cgroupstats_operations;
  
  static struct backing_dev_info cgroup_backing_dev_info = {
  	.capabilities	= BDI_CAP_NO_ACCT_AND_WRITEBACK,
  };

  static int alloc_css_id(struct cgroup_subsys *ss,
  			struct cgroup *parent, struct cgroup *child);
  static struct inode *cgroup_new_inode(mode_t mode, struct super_block *sb)
  {
  	struct inode *inode = new_inode(sb);
  
  	if (inode) {
  		inode->i_mode = mode;
  		inode->i_uid = current_fsuid();
  		inode->i_gid = current_fsgid();
  		inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
  		inode->i_mapping->backing_dev_info = &cgroup_backing_dev_info;
  	}
  	return inode;
  }
  /*
   * Call subsys's pre_destroy handler.
   * This is called before css refcnt check.
   */
  static int cgroup_call_pre_destroy(struct cgroup *cgrp)
  {
  	struct cgroup_subsys *ss;
  	int ret = 0;
  	for_each_subsys(cgrp->root, ss)
  		if (ss->pre_destroy) {
  			ret = ss->pre_destroy(ss, cgrp);
  			if (ret)
  				break;
  		}
  	return ret;
  }
  static void free_cgroup_rcu(struct rcu_head *obj)
  {
  	struct cgroup *cgrp = container_of(obj, struct cgroup, rcu_head);
  
  	kfree(cgrp);
  }
  static void cgroup_diput(struct dentry *dentry, struct inode *inode)
  {
  	/* is dentry a directory? if so, kfree() the associated cgroup */
  	if (S_ISDIR(inode->i_mode)) {
  		struct cgroup *cgrp = dentry->d_fsdata;
  		struct cgroup_subsys *ss;
  		BUG_ON(!(cgroup_is_removed(cgrp)));
  		/* It's possible for external users to be holding css
  		 * reference counts on a cgroup; css_put() needs to
  		 * be able to access the cgroup after decrementing
  		 * the reference count in order to know if it needs to
  		 * queue the cgroup to be handled by the release
  		 * agent */
  		synchronize_rcu();
  
  		mutex_lock(&cgroup_mutex);
  		/*
  		 * Release the subsystem state objects.
  		 */
  		for_each_subsys(cgrp->root, ss)
  			ss->destroy(ss, cgrp);
  
  		cgrp->root->number_of_cgroups--;
  		mutex_unlock(&cgroup_mutex);
  		/*
  		 * Drop the active superblock reference that we took when we
  		 * created the cgroup
  		 */
  		deactivate_super(cgrp->root->sb);
  		call_rcu(&cgrp->rcu_head, free_cgroup_rcu);
  	}
  	iput(inode);
  }
  
  static void remove_dir(struct dentry *d)
  {
  	struct dentry *parent = dget(d->d_parent);
  
  	d_delete(d);
  	simple_rmdir(parent->d_inode, d);
  	dput(parent);
  }
  
  static void cgroup_clear_directory(struct dentry *dentry)
  {
  	struct list_head *node;
  
  	BUG_ON(!mutex_is_locked(&dentry->d_inode->i_mutex));
  	spin_lock(&dcache_lock);
  	node = dentry->d_subdirs.next;
  	while (node != &dentry->d_subdirs) {
  		struct dentry *d = list_entry(node, struct dentry, d_u.d_child);
  		list_del_init(node);
  		if (d->d_inode) {
  			/* This should never be called on a cgroup
  			 * directory with child cgroups */
  			BUG_ON(d->d_inode->i_mode & S_IFDIR);
  			d = dget_locked(d);
  			spin_unlock(&dcache_lock);
  			d_delete(d);
  			simple_unlink(dentry->d_inode, d);
  			dput(d);
  			spin_lock(&dcache_lock);
  		}
  		node = dentry->d_subdirs.next;
  	}
  	spin_unlock(&dcache_lock);
  }
  
  /*
   * NOTE : the dentry must have been dget()'ed
   */
  static void cgroup_d_remove_dir(struct dentry *dentry)
  {
  	cgroup_clear_directory(dentry);
  
  	spin_lock(&dcache_lock);
  	list_del_init(&dentry->d_u.d_child);
  	spin_unlock(&dcache_lock);
  	remove_dir(dentry);
  }
  /*
   * A queue for tasks waiting to rmdir() a cgroup. A task will sleep when
   * cgroup->count == 0 && list_empty(&cgroup->children) && some subsystem
   * still holds a reference to css->refcnt. In general, this refcnt is
   * expected to drop to zero soon.
   *
   * CGRP_WAIT_ON_RMDIR flag is modified under cgroup's inode->i_mutex;
   */
  DECLARE_WAIT_QUEUE_HEAD(cgroup_rmdir_waitq);
  
  static void cgroup_wakeup_rmdir_waiters(const struct cgroup *cgrp)
  {
  	if (unlikely(test_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags)))
  		wake_up_all(&cgroup_rmdir_waitq);
  }
  static int rebind_subsystems(struct cgroupfs_root *root,
  			      unsigned long final_bits)
  {
  	unsigned long added_bits, removed_bits;
  	struct cgroup *cgrp = &root->top_cgroup;
  	int i;
  
  	removed_bits = root->actual_subsys_bits & ~final_bits;
  	added_bits = final_bits & ~root->actual_subsys_bits;
  	/* Check that any added subsystems are currently free */
  	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
  		unsigned long bit = 1UL << i;
  		struct cgroup_subsys *ss = subsys[i];
  		if (!(bit & added_bits))
  			continue;
  		if (ss->root != &rootnode) {
  			/* Subsystem isn't free */
  			return -EBUSY;
  		}
  	}
  
  	/* Currently we don't handle adding/removing subsystems when
  	 * any child cgroups exist. This is theoretically supportable
  	 * but involves complex error handling, so it's being left until
  	 * later */
  	if (root->number_of_cgroups > 1)
  		return -EBUSY;
  
  	/* Process each subsystem */
  	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
  		struct cgroup_subsys *ss = subsys[i];
  		unsigned long bit = 1UL << i;
  		if (bit & added_bits) {
  			/* We're binding this subsystem to this hierarchy */
  			BUG_ON(cgrp->subsys[i]);
  			BUG_ON(!dummytop->subsys[i]);
  			BUG_ON(dummytop->subsys[i]->cgroup != dummytop);
  			mutex_lock(&ss->hierarchy_mutex);
  			cgrp->subsys[i] = dummytop->subsys[i];
  			cgrp->subsys[i]->cgroup = cgrp;
  			list_move(&ss->sibling, &root->subsys_list);
  			ss->root = root;
  			if (ss->bind)
  				ss->bind(ss, cgrp);
  			mutex_unlock(&ss->hierarchy_mutex);
  		} else if (bit & removed_bits) {
  			/* We're removing this subsystem */
  			BUG_ON(cgrp->subsys[i] != dummytop->subsys[i]);
  			BUG_ON(cgrp->subsys[i]->cgroup != cgrp);
  			mutex_lock(&ss->hierarchy_mutex);
  			if (ss->bind)
  				ss->bind(ss, dummytop);
  			dummytop->subsys[i]->cgroup = dummytop;
  			cgrp->subsys[i] = NULL;
  			subsys[i]->root = &rootnode;
  			list_move(&ss->sibling, &rootnode.subsys_list);
  			mutex_unlock(&ss->hierarchy_mutex);
  		} else if (bit & final_bits) {
  			/* Subsystem state should already exist */
  			BUG_ON(!cgrp->subsys[i]);
  		} else {
  			/* Subsystem state shouldn't exist */
  			BUG_ON(cgrp->subsys[i]);
  		}
  	}
  	root->subsys_bits = root->actual_subsys_bits = final_bits;
  	synchronize_rcu();
  
  	return 0;
  }
  
  static int cgroup_show_options(struct seq_file *seq, struct vfsmount *vfs)
  {
  	struct cgroupfs_root *root = vfs->mnt_sb->s_fs_info;
  	struct cgroup_subsys *ss;
  
  	mutex_lock(&cgroup_mutex);
  	for_each_subsys(root, ss)
  		seq_printf(seq, ",%s", ss->name);
  	if (test_bit(ROOT_NOPREFIX, &root->flags))
  		seq_puts(seq, ",noprefix");
  	if (strlen(root->release_agent_path))
  		seq_printf(seq, ",release_agent=%s", root->release_agent_path);
  	mutex_unlock(&cgroup_mutex);
  	return 0;
  }
  
  struct cgroup_sb_opts {
  	unsigned long subsys_bits;
  	unsigned long flags;
  	char *release_agent;
  };
  
  /* Convert a hierarchy specifier into a bitmask of subsystems and
   * flags. */
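  /*
   * For example (illustrative), a mount such as
   *
   *	mount -t cgroup -o cpuset,noprefix,release_agent=/sbin/agent none /mnt
   *
   * arrives here with data = "cpuset,noprefix,release_agent=/sbin/agent";
   * "all" (or a NULL data pointer) selects every non-disabled subsystem.
   */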
  static int parse_cgroupfs_options(char *data,
  				     struct cgroup_sb_opts *opts)
  {
  	char *token, *o = data ?: "all";
  
  	opts->subsys_bits = 0;
  	opts->flags = 0;
  	opts->release_agent = NULL;
  
  	while ((token = strsep(&o, ",")) != NULL) {
  		if (!*token)
  			return -EINVAL;
  		if (!strcmp(token, "all")) {
  			/* Add all non-disabled subsystems */
  			int i;
  			opts->subsys_bits = 0;
  			for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
  				struct cgroup_subsys *ss = subsys[i];
  				if (!ss->disabled)
  					opts->subsys_bits |= 1ul << i;
  			}
  		} else if (!strcmp(token, "noprefix")) {
  			set_bit(ROOT_NOPREFIX, &opts->flags);
  		} else if (!strncmp(token, "release_agent=", 14)) {
  			/* Specifying two release agents is forbidden */
  			if (opts->release_agent)
  				return -EINVAL;
  			opts->release_agent = kzalloc(PATH_MAX, GFP_KERNEL);
  			if (!opts->release_agent)
  				return -ENOMEM;
  			strncpy(opts->release_agent, token + 14, PATH_MAX - 1);
  			opts->release_agent[PATH_MAX - 1] = 0;
  		} else {
  			struct cgroup_subsys *ss;
  			int i;
  			for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
  				ss = subsys[i];
  				if (!strcmp(token, ss->name)) {
  					if (!ss->disabled)
  						set_bit(i, &opts->subsys_bits);
  					break;
  				}
  			}
  			if (i == CGROUP_SUBSYS_COUNT)
  				return -ENOENT;
  		}
  	}
  
  	/* We can't have an empty hierarchy */
  	if (!opts->subsys_bits)
  		return -EINVAL;
  
  	return 0;
  }
  
  static int cgroup_remount(struct super_block *sb, int *flags, char *data)
  {
  	int ret = 0;
  	struct cgroupfs_root *root = sb->s_fs_info;
  	struct cgroup *cgrp = &root->top_cgroup;
  	struct cgroup_sb_opts opts;
  	mutex_lock(&cgrp->dentry->d_inode->i_mutex);
  	mutex_lock(&cgroup_mutex);
  
  	/* See what subsystems are wanted */
  	ret = parse_cgroupfs_options(data, &opts);
  	if (ret)
  		goto out_unlock;
  
  	/* Don't allow flags to change at remount */
  	if (opts.flags != root->flags) {
  		ret = -EINVAL;
  		goto out_unlock;
  	}
  
  	ret = rebind_subsystems(root, opts.subsys_bits);
  	if (ret)
  		goto out_unlock;
  
  	/* (re)populate subsystem files */
  	cgroup_populate_dir(cgrp);

  	if (opts.release_agent)
  		strcpy(root->release_agent_path, opts.release_agent);
   out_unlock:
  	kfree(opts.release_agent);
  	mutex_unlock(&cgroup_mutex);
  	mutex_unlock(&cgrp->dentry->d_inode->i_mutex);
  	return ret;
  }
  
  static struct super_operations cgroup_ops = {
  	.statfs = simple_statfs,
  	.drop_inode = generic_delete_inode,
  	.show_options = cgroup_show_options,
  	.remount_fs = cgroup_remount,
  };
  static void init_cgroup_housekeeping(struct cgroup *cgrp)
  {
  	INIT_LIST_HEAD(&cgrp->sibling);
  	INIT_LIST_HEAD(&cgrp->children);
  	INIT_LIST_HEAD(&cgrp->css_sets);
  	INIT_LIST_HEAD(&cgrp->release_list);
  	init_rwsem(&cgrp->pids_mutex);
  }
  static void init_cgroup_root(struct cgroupfs_root *root)
  {
  	struct cgroup *cgrp = &root->top_cgroup;
  	INIT_LIST_HEAD(&root->subsys_list);
  	INIT_LIST_HEAD(&root->root_list);
  	root->number_of_cgroups = 1;
  	cgrp->root = root;
  	cgrp->top_cgroup = cgrp;
  	init_cgroup_housekeeping(cgrp);
  }
  
  static int cgroup_test_super(struct super_block *sb, void *data)
  {
  	struct cgroupfs_root *new = data;
  	struct cgroupfs_root *root = sb->s_fs_info;
  
  	/* First check subsystems */
  	if (new->subsys_bits != root->subsys_bits)
  	    return 0;
  
  	/* Next check flags */
  	if (new->flags != root->flags)
  		return 0;
  
  	return 1;
  }
  
  static int cgroup_set_super(struct super_block *sb, void *data)
  {
  	int ret;
  	struct cgroupfs_root *root = data;
  
  	ret = set_anon_super(sb, NULL);
  	if (ret)
  		return ret;
  
  	sb->s_fs_info = root;
  	root->sb = sb;
  
  	sb->s_blocksize = PAGE_CACHE_SIZE;
  	sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
  	sb->s_magic = CGROUP_SUPER_MAGIC;
  	sb->s_op = &cgroup_ops;
  
  	return 0;
  }
  
  static int cgroup_get_rootdir(struct super_block *sb)
  {
  	struct inode *inode =
  		cgroup_new_inode(S_IFDIR | S_IRUGO | S_IXUGO | S_IWUSR, sb);
  	struct dentry *dentry;
  
  	if (!inode)
  		return -ENOMEM;
  	inode->i_fop = &simple_dir_operations;
  	inode->i_op = &cgroup_dir_inode_operations;
  	/* directories start off with i_nlink == 2 (for "." entry) */
  	inc_nlink(inode);
  	dentry = d_alloc_root(inode);
  	if (!dentry) {
  		iput(inode);
  		return -ENOMEM;
  	}
  	sb->s_root = dentry;
  	return 0;
  }
  
  static int cgroup_get_sb(struct file_system_type *fs_type,
  			 int flags, const char *unused_dev_name,
  			 void *data, struct vfsmount *mnt)
  {
  	struct cgroup_sb_opts opts;
  	int ret = 0;
  	struct super_block *sb;
  	struct cgroupfs_root *root;
  	struct list_head tmp_cg_links;
  
  	/* First find the desired set of subsystems */
  	ret = parse_cgroupfs_options(data, &opts);
  	if (ret) {
  		kfree(opts.release_agent);
  		return ret;
  	}
  
  	root = kzalloc(sizeof(*root), GFP_KERNEL);
  	if (!root) {
  		kfree(opts.release_agent);
  		return -ENOMEM;
  	}
  
  	init_cgroup_root(root);
  	root->subsys_bits = opts.subsys_bits;
  	root->flags = opts.flags;
  	if (opts.release_agent) {
  		strcpy(root->release_agent_path, opts.release_agent);
  		kfree(opts.release_agent);
  	}
  
  	sb = sget(fs_type, cgroup_test_super, cgroup_set_super, root);
  
  	if (IS_ERR(sb)) {
  		kfree(root);
  		return PTR_ERR(sb);
  	}
  
  	if (sb->s_fs_info != root) {
  		/* Reusing an existing superblock */
  		BUG_ON(sb->s_root == NULL);
  		kfree(root);
  		root = NULL;
  	} else {
  		/* New superblock */
  		struct cgroup *root_cgrp = &root->top_cgroup;
  		struct inode *inode;
  		int i;
  
  		BUG_ON(sb->s_root != NULL);
  
  		ret = cgroup_get_rootdir(sb);
  		if (ret)
  			goto drop_new_super;
  		inode = sb->s_root->d_inode;

  		mutex_lock(&inode->i_mutex);
  		mutex_lock(&cgroup_mutex);
  		/*
  		 * We're accessing css_set_count without locking
  		 * css_set_lock here, but that's OK - it can only be
  		 * increased by someone holding cgroup_lock, and
  		 * that's us. The worst that can happen is that we
  		 * have some link structures left over
  		 */
  		ret = allocate_cg_links(css_set_count, &tmp_cg_links);
  		if (ret) {
  			mutex_unlock(&cgroup_mutex);
  			mutex_unlock(&inode->i_mutex);
  			goto drop_new_super;
  		}
  		ret = rebind_subsystems(root, root->subsys_bits);
  		if (ret == -EBUSY) {
  			mutex_unlock(&cgroup_mutex);
  			mutex_unlock(&inode->i_mutex);
  			goto free_cg_links;
  		}
  
  		/* EBUSY should be the only error here */
  		BUG_ON(ret);
  
  		list_add(&root->root_list, &roots);
  		root_count++;

  		sb->s_root->d_fsdata = root_cgrp;
  		root->top_cgroup.dentry = sb->s_root;
  		/* Link the top cgroup in this hierarchy into all
  		 * the css_set objects */
  		write_lock(&css_set_lock);
  		for (i = 0; i < CSS_SET_TABLE_SIZE; i++) {
  			struct hlist_head *hhead = &css_set_table[i];
  			struct hlist_node *node;
  			struct css_set *cg;

  			hlist_for_each_entry(cg, node, hhead, hlist)
  				link_css_set(&tmp_cg_links, cg, root_cgrp);
  		}
  		write_unlock(&css_set_lock);
  
  		free_cg_links(&tmp_cg_links);
  		BUG_ON(!list_empty(&root_cgrp->sibling));
  		BUG_ON(!list_empty(&root_cgrp->children));
  		BUG_ON(root->number_of_cgroups != 1);
  		cgroup_populate_dir(root_cgrp);
  		mutex_unlock(&inode->i_mutex);
  		mutex_unlock(&cgroup_mutex);
  	}
  	simple_set_mnt(mnt, sb);
  	return 0;

   free_cg_links:
  	free_cg_links(&tmp_cg_links);
   drop_new_super:
  	deactivate_locked_super(sb);
  	return ret;
  }
  
  static void cgroup_kill_sb(struct super_block *sb) {
  	struct cgroupfs_root *root = sb->s_fs_info;
  	struct cgroup *cgrp = &root->top_cgroup;
  	int ret;
  	struct cg_cgroup_link *link;
  	struct cg_cgroup_link *saved_link;
  
  	BUG_ON(!root);
  
  	BUG_ON(root->number_of_cgroups != 1);
  	BUG_ON(!list_empty(&cgrp->children));
  	BUG_ON(!list_empty(&cgrp->sibling));
  
  	mutex_lock(&cgroup_mutex);
  
  	/* Rebind all subsystems back to the default hierarchy */
  	ret = rebind_subsystems(root, 0);
  	/* Shouldn't be able to fail ... */
  	BUG_ON(ret);
  	/*
  	 * Release all the links from css_sets to this hierarchy's
  	 * root cgroup
  	 */
  	write_lock(&css_set_lock);
  
  	list_for_each_entry_safe(link, saved_link, &cgrp->css_sets,
  				 cgrp_link_list) {
  		list_del(&link->cg_link_list);
  		list_del(&link->cgrp_link_list);
  		kfree(link);
  	}
  	write_unlock(&css_set_lock);
  	if (!list_empty(&root->root_list)) {
  		list_del(&root->root_list);
  		root_count--;
  	}

  	mutex_unlock(&cgroup_mutex);
  	kill_litter_super(sb);
  	kfree(root);
  }
  
  static struct file_system_type cgroup_fs_type = {
  	.name = "cgroup",
  	.get_sb = cgroup_get_sb,
  	.kill_sb = cgroup_kill_sb,
  };
  static inline struct cgroup *__d_cgrp(struct dentry *dentry)
  {
  	return dentry->d_fsdata;
  }
  
  static inline struct cftype *__d_cft(struct dentry *dentry)
  {
  	return dentry->d_fsdata;
  }
  /**
   * cgroup_path - generate the path of a cgroup
   * @cgrp: the cgroup in question
   * @buf: the buffer to write the path into
   * @buflen: the length of the buffer
   *
   * Called with cgroup_mutex held or else with an RCU-protected cgroup
   * reference.  Writes path of cgroup into buf.  Returns 0 on success,
   * -errno on error.
   */
  int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
  {
  	char *start;
  	struct dentry *dentry = rcu_dereference(cgrp->dentry);

  	if (!dentry || cgrp == dummytop) {
  		/*
  		 * Inactive subsystems have no dentry for their root
  		 * cgroup
  		 */
  		strcpy(buf, "/");
  		return 0;
  	}
  
  	start = buf + buflen;
  
  	*--start = '\0';
  	for (;;) {
  		int len = dentry->d_name.len;
  		if ((start -= len) < buf)
  			return -ENAMETOOLONG;
  		memcpy(start, cgrp->dentry->d_name.name, len);
  		cgrp = cgrp->parent;
  		if (!cgrp)
  			break;
  		dentry = rcu_dereference(cgrp->dentry);
  		if (!cgrp->parent)
  			continue;
  		if (--start < buf)
  			return -ENAMETOOLONG;
  		*start = '/';
  	}
  	memmove(buf, start, buf + buflen - start);
  	return 0;
  }
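  /*
   * Illustrative caller sketch (assumes a page-sized scratch buffer and that
   * the caller already holds cgroup_mutex or an RCU read lock, per the
   * comment above):
   *
   *	char *buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
   *
   *	if (buf && cgroup_path(cgrp, buf, PAGE_SIZE) == 0)
   *		printk(KERN_DEBUG "cgroup path: %s\n", buf);
   *	kfree(buf);
   */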
  /*
   * Return the first subsystem attached to a cgroup's hierarchy, and
   * its subsystem id.
   */
bd89aabc6   Paul Menage   Control groups: R...
1195
  static void get_first_subsys(const struct cgroup *cgrp,
bbcb81d09   Paul Menage   Task Control Grou...
1196
1197
  			struct cgroup_subsys_state **css, int *subsys_id)
  {
bd89aabc6   Paul Menage   Control groups: R...
1198
  	const struct cgroupfs_root *root = cgrp->root;
bbcb81d09   Paul Menage   Task Control Grou...
1199
1200
1201
1202
1203
  	const struct cgroup_subsys *test_ss;
  	BUG_ON(list_empty(&root->subsys_list));
  	test_ss = list_entry(root->subsys_list.next,
  			     struct cgroup_subsys, sibling);
  	if (css) {
bd89aabc6   Paul Menage   Control groups: R...
1204
  		*css = cgrp->subsys[test_ss->subsys_id];
bbcb81d09   Paul Menage   Task Control Grou...
1205
1206
1207
1208
1209
  		BUG_ON(!*css);
  	}
  	if (subsys_id)
  		*subsys_id = test_ss->subsys_id;
  }
a043e3b2c   Li Zefan   cgroup: fix comments
1210
1211
1212
1213
  /**
   * cgroup_attach_task - attach task 'tsk' to cgroup 'cgrp'
   * @cgrp: the cgroup the task is attaching to
   * @tsk: the task to be attached
bbcb81d09   Paul Menage   Task Control Grou...
1214
   *
a043e3b2c   Li Zefan   cgroup: fix comments
1215
1216
   * Call holding cgroup_mutex. May take task_lock of
   * the task 'tsk' during call.
bbcb81d09   Paul Menage   Task Control Grou...
1217
   */
956db3ca0   Cliff Wickman   hotplug cpu: move...
1218
  int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
bbcb81d09   Paul Menage   Task Control Grou...
1219
1220
1221
  {
  	int retval = 0;
  	struct cgroup_subsys *ss;
bd89aabc6   Paul Menage   Control groups: R...
1222
  	struct cgroup *oldcgrp;
77efecd9e   Lai Jiangshan   cgroups: call fin...
1223
  	struct css_set *cg;
817929ec2   Paul Menage   Task Control Grou...
1224
  	struct css_set *newcg;
bd89aabc6   Paul Menage   Control groups: R...
1225
  	struct cgroupfs_root *root = cgrp->root;
bbcb81d09   Paul Menage   Task Control Grou...
1226
  	int subsys_id;
bd89aabc6   Paul Menage   Control groups: R...
1227
  	get_first_subsys(cgrp, NULL, &subsys_id);
bbcb81d09   Paul Menage   Task Control Grou...
1228
1229
  
  	/* Nothing to do if the task is already in that cgroup */
bd89aabc6   Paul Menage   Control groups: R...
1230
1231
  	oldcgrp = task_cgroup(tsk, subsys_id);
  	if (cgrp == oldcgrp)
bbcb81d09   Paul Menage   Task Control Grou...
1232
1233
1234
1235
  		return 0;
  
  	for_each_subsys(root, ss) {
  		if (ss->can_attach) {
bd89aabc6   Paul Menage   Control groups: R...
1236
  			retval = ss->can_attach(ss, cgrp, tsk);
e18f6318e   Paul Jackson   cgroup brace codi...
1237
  			if (retval)
bbcb81d09   Paul Menage   Task Control Grou...
1238
  				return retval;
bbcb81d09   Paul Menage   Task Control Grou...
1239
1240
  		}
  	}
77efecd9e   Lai Jiangshan   cgroups: call fin...
1241
1242
1243
1244
  	task_lock(tsk);
  	cg = tsk->cgroups;
  	get_css_set(cg);
  	task_unlock(tsk);
817929ec2   Paul Menage   Task Control Grou...
1245
1246
1247
1248
  	/*
  	 * Locate or allocate a new css_set for this task,
  	 * based on its final set of cgroups
  	 */
bd89aabc6   Paul Menage   Control groups: R...
1249
  	newcg = find_css_set(cg, cgrp);
77efecd9e   Lai Jiangshan   cgroups: call fin...
1250
  	put_css_set(cg);
e18f6318e   Paul Jackson   cgroup brace codi...
1251
  	if (!newcg)
817929ec2   Paul Menage   Task Control Grou...
1252
  		return -ENOMEM;
817929ec2   Paul Menage   Task Control Grou...
1253

bbcb81d09   Paul Menage   Task Control Grou...
1254
1255
1256
  	task_lock(tsk);
  	if (tsk->flags & PF_EXITING) {
  		task_unlock(tsk);
817929ec2   Paul Menage   Task Control Grou...
1257
  		put_css_set(newcg);
bbcb81d09   Paul Menage   Task Control Grou...
1258
1259
  		return -ESRCH;
  	}
817929ec2   Paul Menage   Task Control Grou...
1260
  	rcu_assign_pointer(tsk->cgroups, newcg);
bbcb81d09   Paul Menage   Task Control Grou...
1261
  	task_unlock(tsk);
817929ec2   Paul Menage   Task Control Grou...
1262
1263
1264
1265
1266
1267
1268
  	/* Update the css_set linked lists if we're using them */
  	write_lock(&css_set_lock);
  	if (!list_empty(&tsk->cg_list)) {
  		list_del(&tsk->cg_list);
  		list_add(&tsk->cg_list, &newcg->tasks);
  	}
  	write_unlock(&css_set_lock);
bbcb81d09   Paul Menage   Task Control Grou...
1269
  	for_each_subsys(root, ss) {
e18f6318e   Paul Jackson   cgroup brace codi...
1270
  		if (ss->attach)
bd89aabc6   Paul Menage   Control groups: R...
1271
  			ss->attach(ss, cgrp, oldcgrp, tsk);
bbcb81d09   Paul Menage   Task Control Grou...
1272
  	}
bd89aabc6   Paul Menage   Control groups: R...
1273
  	set_bit(CGRP_RELEASABLE, &oldcgrp->flags);
bbcb81d09   Paul Menage   Task Control Grou...
1274
  	synchronize_rcu();
817929ec2   Paul Menage   Task Control Grou...
1275
  	put_css_set(cg);
ec64f5154   KAMEZAWA Hiroyuki   cgroup: fix frequ...
1276
1277
1278
1279
1280
1281
  
  	/*
  	 * wake up rmdir() waiter. the rmdir should fail since the cgroup
  	 * is no longer empty.
  	 */
  	cgroup_wakeup_rmdir_waiters(cgrp);
bbcb81d09   Paul Menage   Task Control Grou...
1282
1283
1284
1285
  	return 0;
  }
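  /*
   * Illustrative caller sketch (not part of the original file): since
   * cgroup_attach_task() must be called with cgroup_mutex held, a
   * typical in-kernel user looks roughly like
   *
   *	cgroup_lock();
   *	ret = cgroup_attach_task(cgrp, tsk);
   *	cgroup_unlock();
   *
   * The write path through cgroup_tasks_write() below takes the mutex
   * via cgroup_lock_live_group() before reaching this function.
   */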
  
  /*
af351026a   Paul Menage   cgroup files: tur...
1286
1287
   * Attach task with pid 'pid' to cgroup 'cgrp'. Call with cgroup_mutex
   * held. May take task_lock of task
bbcb81d09   Paul Menage   Task Control Grou...
1288
   */
af351026a   Paul Menage   cgroup files: tur...
1289
  static int attach_task_by_pid(struct cgroup *cgrp, u64 pid)
bbcb81d09   Paul Menage   Task Control Grou...
1290
  {
bbcb81d09   Paul Menage   Task Control Grou...
1291
  	struct task_struct *tsk;
c69e8d9c0   David Howells   CRED: Use RCU to ...
1292
  	const struct cred *cred = current_cred(), *tcred;
bbcb81d09   Paul Menage   Task Control Grou...
1293
  	int ret;
bbcb81d09   Paul Menage   Task Control Grou...
1294
1295
  	if (pid) {
  		rcu_read_lock();
73507f335   Pavel Emelyanov   Handle pid namesp...
1296
  		tsk = find_task_by_vpid(pid);
bbcb81d09   Paul Menage   Task Control Grou...
1297
1298
1299
1300
  		if (!tsk || tsk->flags & PF_EXITING) {
  			rcu_read_unlock();
  			return -ESRCH;
  		}
bbcb81d09   Paul Menage   Task Control Grou...
1301

c69e8d9c0   David Howells   CRED: Use RCU to ...
1302
1303
1304
1305
1306
  		tcred = __task_cred(tsk);
  		if (cred->euid &&
  		    cred->euid != tcred->uid &&
  		    cred->euid != tcred->suid) {
  			rcu_read_unlock();
bbcb81d09   Paul Menage   Task Control Grou...
1307
1308
  			return -EACCES;
  		}
c69e8d9c0   David Howells   CRED: Use RCU to ...
1309
1310
  		get_task_struct(tsk);
  		rcu_read_unlock();
bbcb81d09   Paul Menage   Task Control Grou...
1311
1312
1313
1314
  	} else {
  		tsk = current;
  		get_task_struct(tsk);
  	}
956db3ca0   Cliff Wickman   hotplug cpu: move...
1315
  	ret = cgroup_attach_task(cgrp, tsk);
bbcb81d09   Paul Menage   Task Control Grou...
1316
1317
1318
  	put_task_struct(tsk);
  	return ret;
  }
af351026a   Paul Menage   cgroup files: tur...
1319
1320
1321
1322
1323
1324
1325
1326
1327
  static int cgroup_tasks_write(struct cgroup *cgrp, struct cftype *cft, u64 pid)
  {
  	int ret;
  	if (!cgroup_lock_live_group(cgrp))
  		return -ENODEV;
  	ret = attach_task_by_pid(cgrp, pid);
  	cgroup_unlock();
  	return ret;
  }
ddbcc7e8e   Paul Menage   Task Control Grou...
1328
  /* The various types of files and directories in a cgroup file system */
ddbcc7e8e   Paul Menage   Task Control Grou...
1329
1330
1331
1332
  enum cgroup_filetype {
  	FILE_ROOT,
  	FILE_DIR,
  	FILE_TASKLIST,
81a6a5cdd   Paul Menage   Task Control Grou...
1333
  	FILE_NOTIFY_ON_RELEASE,
81a6a5cdd   Paul Menage   Task Control Grou...
1334
  	FILE_RELEASE_AGENT,
ddbcc7e8e   Paul Menage   Task Control Grou...
1335
  };
e788e066c   Paul Menage   cgroup files: mov...
1336
1337
1338
1339
  /**
   * cgroup_lock_live_group - take cgroup_mutex and check that cgrp is alive.
   * @cgrp: the cgroup to be checked for liveness
   *
84eea8428   Paul Menage   cgroups: misc cle...
1340
1341
   * On success, returns true; the lock should be later released with
   * cgroup_unlock(). On failure returns false with no lock held.
e788e066c   Paul Menage   cgroup files: mov...
1342
   */
84eea8428   Paul Menage   cgroups: misc cle...
1343
  bool cgroup_lock_live_group(struct cgroup *cgrp)
e788e066c   Paul Menage   cgroup files: mov...
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
  {
  	mutex_lock(&cgroup_mutex);
  	if (cgroup_is_removed(cgrp)) {
  		mutex_unlock(&cgroup_mutex);
  		return false;
  	}
  	return true;
  }
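  /*
   * Illustrative pattern (not part of the original file): control file
   * handlers that need the cgroup to stay alive while they work under
   * cgroup_mutex typically do
   *
   *	if (!cgroup_lock_live_group(cgrp))
   *		return -ENODEV;
   *	...read or update cgrp state...
   *	cgroup_unlock();
   *
   * as cgroup_tasks_write() above and cgroup_release_agent_write()
   * below both do.
   */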
  
  static int cgroup_release_agent_write(struct cgroup *cgrp, struct cftype *cft,
  				      const char *buffer)
  {
  	BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX);
  	if (!cgroup_lock_live_group(cgrp))
  		return -ENODEV;
  	strcpy(cgrp->root->release_agent_path, buffer);
84eea8428   Paul Menage   cgroups: misc cle...
1360
  	cgroup_unlock();
e788e066c   Paul Menage   cgroup files: mov...
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
  	return 0;
  }
  
  static int cgroup_release_agent_show(struct cgroup *cgrp, struct cftype *cft,
  				     struct seq_file *seq)
  {
  	if (!cgroup_lock_live_group(cgrp))
  		return -ENODEV;
  	seq_puts(seq, cgrp->root->release_agent_path);
  	seq_putc(seq, '\n');
84eea8428   Paul Menage   cgroups: misc cle...
1372
  	cgroup_unlock();
e788e066c   Paul Menage   cgroup files: mov...
1373
1374
  	return 0;
  }
84eea8428   Paul Menage   cgroups: misc cle...
1375
1376
  /* A buffer size big enough for numbers or short strings */
  #define CGROUP_LOCAL_BUFFER_SIZE 64
e73d2c61d   Paul Menage   CGroups _s64 file...
1377
  static ssize_t cgroup_write_X64(struct cgroup *cgrp, struct cftype *cft,
f4c753b7e   Paul Menage   CGroup API files:...
1378
1379
1380
  				struct file *file,
  				const char __user *userbuf,
  				size_t nbytes, loff_t *unused_ppos)
355e0c48b   Paul Menage   Add cgroup write_...
1381
  {
84eea8428   Paul Menage   cgroups: misc cle...
1382
  	char buffer[CGROUP_LOCAL_BUFFER_SIZE];
355e0c48b   Paul Menage   Add cgroup write_...
1383
  	int retval = 0;
355e0c48b   Paul Menage   Add cgroup write_...
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
  	char *end;
  
  	if (!nbytes)
  		return -EINVAL;
  	if (nbytes >= sizeof(buffer))
  		return -E2BIG;
  	if (copy_from_user(buffer, userbuf, nbytes))
  		return -EFAULT;
  
  	buffer[nbytes] = 0;     /* nul-terminate */
b7269dfc8   Paul Menage   CGroup API files:...
1394
  	strstrip(buffer);
e73d2c61d   Paul Menage   CGroups _s64 file...
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
  	if (cft->write_u64) {
  		u64 val = simple_strtoull(buffer, &end, 0);
  		if (*end)
  			return -EINVAL;
  		retval = cft->write_u64(cgrp, cft, val);
  	} else {
  		s64 val = simple_strtoll(buffer, &end, 0);
  		if (*end)
  			return -EINVAL;
  		retval = cft->write_s64(cgrp, cft, val);
  	}
355e0c48b   Paul Menage   Add cgroup write_...
1406
1407
1408
1409
  	if (!retval)
  		retval = nbytes;
  	return retval;
  }
db3b14978   Paul Menage   cgroup files: add...
1410
1411
1412
1413
1414
  static ssize_t cgroup_write_string(struct cgroup *cgrp, struct cftype *cft,
  				   struct file *file,
  				   const char __user *userbuf,
  				   size_t nbytes, loff_t *unused_ppos)
  {
84eea8428   Paul Menage   cgroups: misc cle...
1415
  	char local_buffer[CGROUP_LOCAL_BUFFER_SIZE];
db3b14978   Paul Menage   cgroup files: add...
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
  	int retval = 0;
  	size_t max_bytes = cft->max_write_len;
  	char *buffer = local_buffer;
  
  	if (!max_bytes)
  		max_bytes = sizeof(local_buffer) - 1;
  	if (nbytes >= max_bytes)
  		return -E2BIG;
  	/* Allocate a dynamic buffer if we need one */
  	if (nbytes >= sizeof(local_buffer)) {
  		buffer = kmalloc(nbytes + 1, GFP_KERNEL);
  		if (buffer == NULL)
  			return -ENOMEM;
  	}
5a3eb9f6b   Li Zefan   cgroup: fix possi...
1430
1431
1432
1433
  	if (nbytes && copy_from_user(buffer, userbuf, nbytes)) {
  		retval = -EFAULT;
  		goto out;
  	}
db3b14978   Paul Menage   cgroup files: add...
1434
1435
1436
1437
1438
1439
  
  	buffer[nbytes] = 0;     /* nul-terminate */
  	strstrip(buffer);
  	retval = cft->write_string(cgrp, cft, buffer);
  	if (!retval)
  		retval = nbytes;
5a3eb9f6b   Li Zefan   cgroup: fix possi...
1440
  out:
db3b14978   Paul Menage   cgroup files: add...
1441
1442
1443
1444
  	if (buffer != local_buffer)
  		kfree(buffer);
  	return retval;
  }
ddbcc7e8e   Paul Menage   Task Control Grou...
1445
1446
1447
1448
  static ssize_t cgroup_file_write(struct file *file, const char __user *buf,
  						size_t nbytes, loff_t *ppos)
  {
  	struct cftype *cft = __d_cft(file->f_dentry);
bd89aabc6   Paul Menage   Control groups: R...
1449
  	struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);
ddbcc7e8e   Paul Menage   Task Control Grou...
1450

75139b827   Li Zefan   cgroups: remove s...
1451
  	if (cgroup_is_removed(cgrp))
ddbcc7e8e   Paul Menage   Task Control Grou...
1452
  		return -ENODEV;
355e0c48b   Paul Menage   Add cgroup write_...
1453
  	if (cft->write)
bd89aabc6   Paul Menage   Control groups: R...
1454
  		return cft->write(cgrp, cft, file, buf, nbytes, ppos);
e73d2c61d   Paul Menage   CGroups _s64 file...
1455
1456
  	if (cft->write_u64 || cft->write_s64)
  		return cgroup_write_X64(cgrp, cft, file, buf, nbytes, ppos);
db3b14978   Paul Menage   cgroup files: add...
1457
1458
  	if (cft->write_string)
  		return cgroup_write_string(cgrp, cft, file, buf, nbytes, ppos);
d447ea2f3   Pavel Emelyanov   cgroups: add the ...
1459
1460
1461
1462
  	if (cft->trigger) {
  		int ret = cft->trigger(cgrp, (unsigned int)cft->private);
  		return ret ? ret : nbytes;
  	}
355e0c48b   Paul Menage   Add cgroup write_...
1463
  	return -EINVAL;
ddbcc7e8e   Paul Menage   Task Control Grou...
1464
  }
f4c753b7e   Paul Menage   CGroup API files:...
1465
1466
1467
1468
  static ssize_t cgroup_read_u64(struct cgroup *cgrp, struct cftype *cft,
  			       struct file *file,
  			       char __user *buf, size_t nbytes,
  			       loff_t *ppos)
ddbcc7e8e   Paul Menage   Task Control Grou...
1469
  {
84eea8428   Paul Menage   cgroups: misc cle...
1470
  	char tmp[CGROUP_LOCAL_BUFFER_SIZE];
f4c753b7e   Paul Menage   CGroup API files:...
1471
  	u64 val = cft->read_u64(cgrp, cft);
ddbcc7e8e   Paul Menage   Task Control Grou...
1472
1473
1474
1475
1476
  	int len = sprintf(tmp, "%llu
  ", (unsigned long long) val);
  
  	return simple_read_from_buffer(buf, nbytes, ppos, tmp, len);
  }
e73d2c61d   Paul Menage   CGroups _s64 file...
1477
1478
1479
1480
1481
  static ssize_t cgroup_read_s64(struct cgroup *cgrp, struct cftype *cft,
  			       struct file *file,
  			       char __user *buf, size_t nbytes,
  			       loff_t *ppos)
  {
84eea8428   Paul Menage   cgroups: misc cle...
1482
  	char tmp[CGROUP_LOCAL_BUFFER_SIZE];
e73d2c61d   Paul Menage   CGroups _s64 file...
1483
1484
1485
1486
1487
1488
  	s64 val = cft->read_s64(cgrp, cft);
  	int len = sprintf(tmp, "%lld
  ", (long long) val);
  
  	return simple_read_from_buffer(buf, nbytes, ppos, tmp, len);
  }
ddbcc7e8e   Paul Menage   Task Control Grou...
1489
1490
1491
1492
  static ssize_t cgroup_file_read(struct file *file, char __user *buf,
  				   size_t nbytes, loff_t *ppos)
  {
  	struct cftype *cft = __d_cft(file->f_dentry);
bd89aabc6   Paul Menage   Control groups: R...
1493
  	struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);
ddbcc7e8e   Paul Menage   Task Control Grou...
1494

75139b827   Li Zefan   cgroups: remove s...
1495
  	if (cgroup_is_removed(cgrp))
ddbcc7e8e   Paul Menage   Task Control Grou...
1496
1497
1498
  		return -ENODEV;
  
  	if (cft->read)
bd89aabc6   Paul Menage   Control groups: R...
1499
  		return cft->read(cgrp, cft, file, buf, nbytes, ppos);
f4c753b7e   Paul Menage   CGroup API files:...
1500
1501
  	if (cft->read_u64)
  		return cgroup_read_u64(cgrp, cft, file, buf, nbytes, ppos);
e73d2c61d   Paul Menage   CGroups _s64 file...
1502
1503
  	if (cft->read_s64)
  		return cgroup_read_s64(cgrp, cft, file, buf, nbytes, ppos);
ddbcc7e8e   Paul Menage   Task Control Grou...
1504
1505
  	return -EINVAL;
  }
917965696   Paul Menage   CGroup API files:...
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
  /*
   * seqfile ops/methods for returning structured data. Currently just
   * supports string->u64 maps, but can be extended in future.
   */
  
  struct cgroup_seqfile_state {
  	struct cftype *cft;
  	struct cgroup *cgroup;
  };
  
  static int cgroup_map_add(struct cgroup_map_cb *cb, const char *key, u64 value)
  {
  	struct seq_file *sf = cb->state;
  	return seq_printf(sf, "%s %llu
  ", key, (unsigned long long)value);
  }
  
  static int cgroup_seqfile_show(struct seq_file *m, void *arg)
  {
  	struct cgroup_seqfile_state *state = m->private;
  	struct cftype *cft = state->cft;
29486df32   Serge E. Hallyn   cgroups: introduc...
1527
1528
1529
1530
1531
1532
1533
1534
  	if (cft->read_map) {
  		struct cgroup_map_cb cb = {
  			.fill = cgroup_map_add,
  			.state = m,
  		};
  		return cft->read_map(state->cgroup, cft, &cb);
  	}
  	return cft->read_seq_string(state->cgroup, cft, m);
917965696   Paul Menage   CGroup API files:...
1535
  }
96930a636   Adrian Bunk   make cgroup_seqfi...
1536
  static int cgroup_seqfile_release(struct inode *inode, struct file *file)
917965696   Paul Menage   CGroup API files:...
1537
1538
1539
1540
1541
1542
1543
1544
  {
  	struct seq_file *seq = file->private_data;
  	kfree(seq->private);
  	return single_release(inode, file);
  }
  
  static struct file_operations cgroup_seqfile_operations = {
  	.read = seq_read,
e788e066c   Paul Menage   cgroup files: mov...
1545
  	.write = cgroup_file_write,
917965696   Paul Menage   CGroup API files:...
1546
1547
1548
  	.llseek = seq_lseek,
  	.release = cgroup_seqfile_release,
  };
ddbcc7e8e   Paul Menage   Task Control Grou...
1549
1550
1551
1552
1553
1554
1555
1556
  static int cgroup_file_open(struct inode *inode, struct file *file)
  {
  	int err;
  	struct cftype *cft;
  
  	err = generic_file_open(inode, file);
  	if (err)
  		return err;
ddbcc7e8e   Paul Menage   Task Control Grou...
1557
  	cft = __d_cft(file->f_dentry);
75139b827   Li Zefan   cgroups: remove s...
1558

29486df32   Serge E. Hallyn   cgroups: introduc...
1559
  	if (cft->read_map || cft->read_seq_string) {
917965696   Paul Menage   CGroup API files:...
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
  		struct cgroup_seqfile_state *state =
  			kzalloc(sizeof(*state), GFP_USER);
  		if (!state)
  			return -ENOMEM;
  		state->cft = cft;
  		state->cgroup = __d_cgrp(file->f_dentry->d_parent);
  		file->f_op = &cgroup_seqfile_operations;
  		err = single_open(file, cgroup_seqfile_show, state);
  		if (err < 0)
  			kfree(state);
  	} else if (cft->open)
ddbcc7e8e   Paul Menage   Task Control Grou...
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
  		err = cft->open(inode, file);
  	else
  		err = 0;
  
  	return err;
  }
  
  static int cgroup_file_release(struct inode *inode, struct file *file)
  {
  	struct cftype *cft = __d_cft(file->f_dentry);
  	if (cft->release)
  		return cft->release(inode, file);
  	return 0;
  }
  
  /*
   * cgroup_rename - Only allow simple rename of directories in place.
   */
  static int cgroup_rename(struct inode *old_dir, struct dentry *old_dentry,
  			    struct inode *new_dir, struct dentry *new_dentry)
  {
  	if (!S_ISDIR(old_dentry->d_inode->i_mode))
  		return -ENOTDIR;
  	if (new_dentry->d_inode)
  		return -EEXIST;
  	if (old_dir != new_dir)
  		return -EIO;
  	return simple_rename(old_dir, old_dentry, new_dir, new_dentry);
  }
  
  static struct file_operations cgroup_file_operations = {
  	.read = cgroup_file_read,
  	.write = cgroup_file_write,
  	.llseek = generic_file_llseek,
  	.open = cgroup_file_open,
  	.release = cgroup_file_release,
  };
  
  static struct inode_operations cgroup_dir_inode_operations = {
  	.lookup = simple_lookup,
  	.mkdir = cgroup_mkdir,
  	.rmdir = cgroup_rmdir,
  	.rename = cgroup_rename,
  };
099fca322   Li Zefan   cgroups: show cor...
1615
  static int cgroup_create_file(struct dentry *dentry, mode_t mode,
ddbcc7e8e   Paul Menage   Task Control Grou...
1616
1617
  				struct super_block *sb)
  {
3ba13d179   Al Viro   constify dentry_o...
1618
  	static const struct dentry_operations cgroup_dops = {
ddbcc7e8e   Paul Menage   Task Control Grou...
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
  		.d_iput = cgroup_diput,
  	};
  
  	struct inode *inode;
  
  	if (!dentry)
  		return -ENOENT;
  	if (dentry->d_inode)
  		return -EEXIST;
  
  	inode = cgroup_new_inode(mode, sb);
  	if (!inode)
  		return -ENOMEM;
  
  	if (S_ISDIR(mode)) {
  		inode->i_op = &cgroup_dir_inode_operations;
  		inode->i_fop = &simple_dir_operations;
  
  		/* start off with i_nlink == 2 (for "." entry) */
  		inc_nlink(inode);
  
  		/* start with the directory inode held, so that we can
  		 * populate it without racing with another mkdir */
817929ec2   Paul Menage   Task Control Grou...
1642
  		mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD);
ddbcc7e8e   Paul Menage   Task Control Grou...
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
  	} else if (S_ISREG(mode)) {
  		inode->i_size = 0;
  		inode->i_fop = &cgroup_file_operations;
  	}
  	dentry->d_op = &cgroup_dops;
  	d_instantiate(dentry, inode);
  	dget(dentry);	/* Extra count - pin the dentry in core */
  	return 0;
  }
  
  /*
a043e3b2c   Li Zefan   cgroup: fix comments
1654
1655
1656
1657
1658
   * cgroup_create_dir - create a directory for an object.
   * @cgrp: the cgroup we create the directory for. It must have a valid
   *        ->parent field. And we are going to fill its ->dentry field.
   * @dentry: dentry of the new cgroup
   * @mode: mode to set on new directory.
ddbcc7e8e   Paul Menage   Task Control Grou...
1659
   */
bd89aabc6   Paul Menage   Control groups: R...
1660
  static int cgroup_create_dir(struct cgroup *cgrp, struct dentry *dentry,
099fca322   Li Zefan   cgroups: show cor...
1661
  				mode_t mode)
ddbcc7e8e   Paul Menage   Task Control Grou...
1662
1663
1664
  {
  	struct dentry *parent;
  	int error = 0;
bd89aabc6   Paul Menage   Control groups: R...
1665
1666
  	parent = cgrp->parent->dentry;
  	error = cgroup_create_file(dentry, S_IFDIR | mode, cgrp->root->sb);
ddbcc7e8e   Paul Menage   Task Control Grou...
1667
  	if (!error) {
bd89aabc6   Paul Menage   Control groups: R...
1668
  		dentry->d_fsdata = cgrp;
ddbcc7e8e   Paul Menage   Task Control Grou...
1669
  		inc_nlink(parent->d_inode);
a47295e6b   Paul Menage   cgroups: make cgr...
1670
  		rcu_assign_pointer(cgrp->dentry, dentry);
ddbcc7e8e   Paul Menage   Task Control Grou...
1671
1672
1673
1674
1675
1676
  		dget(dentry);
  	}
  	dput(dentry);
  
  	return error;
  }
099fca322   Li Zefan   cgroups: show cor...
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
  /**
   * cgroup_file_mode - deduce file mode of a control file
   * @cft: the control file in question
   *
   * returns cft->mode if ->mode is not 0
   * returns S_IRUGO|S_IWUSR if it has both a read and a write handler
   * returns S_IRUGO if it has only a read handler
   * returns S_IWUSR if it has only a write handler
   */
  static mode_t cgroup_file_mode(const struct cftype *cft)
  {
  	mode_t mode = 0;
  
  	if (cft->mode)
  		return cft->mode;
  
  	if (cft->read || cft->read_u64 || cft->read_s64 ||
  	    cft->read_map || cft->read_seq_string)
  		mode |= S_IRUGO;
  
  	if (cft->write || cft->write_u64 || cft->write_s64 ||
  	    cft->write_string || cft->trigger)
  		mode |= S_IWUSR;
  
  	return mode;
  }
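  /*
   * Worked example (illustrative): a cftype with only .read_u64 set is
   * created 0444 (S_IRUGO), one with only .write_u64 is created 0200
   * (S_IWUSR), and one with both is created 0644 (S_IRUGO | S_IWUSR),
   * unless an explicit ->mode overrides the deduction above.
   */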
bd89aabc6   Paul Menage   Control groups: R...
1703
  int cgroup_add_file(struct cgroup *cgrp,
ddbcc7e8e   Paul Menage   Task Control Grou...
1704
1705
1706
  		       struct cgroup_subsys *subsys,
  		       const struct cftype *cft)
  {
bd89aabc6   Paul Menage   Control groups: R...
1707
  	struct dentry *dir = cgrp->dentry;
ddbcc7e8e   Paul Menage   Task Control Grou...
1708
1709
  	struct dentry *dentry;
  	int error;
099fca322   Li Zefan   cgroups: show cor...
1710
  	mode_t mode;
ddbcc7e8e   Paul Menage   Task Control Grou...
1711
1712
  
  	char name[MAX_CGROUP_TYPE_NAMELEN + MAX_CFTYPE_NAME + 2] = { 0 };
bd89aabc6   Paul Menage   Control groups: R...
1713
  	if (subsys && !test_bit(ROOT_NOPREFIX, &cgrp->root->flags)) {
ddbcc7e8e   Paul Menage   Task Control Grou...
1714
1715
1716
1717
1718
1719
1720
  		strcpy(name, subsys->name);
  		strcat(name, ".");
  	}
  	strcat(name, cft->name);
  	BUG_ON(!mutex_is_locked(&dir->d_inode->i_mutex));
  	dentry = lookup_one_len(name, dir, strlen(name));
  	if (!IS_ERR(dentry)) {
099fca322   Li Zefan   cgroups: show cor...
1721
1722
  		mode = cgroup_file_mode(cft);
  		error = cgroup_create_file(dentry, mode | S_IFREG,
bd89aabc6   Paul Menage   Control groups: R...
1723
  						cgrp->root->sb);
ddbcc7e8e   Paul Menage   Task Control Grou...
1724
1725
1726
1727
1728
1729
1730
  		if (!error)
  			dentry->d_fsdata = (void *)cft;
  		dput(dentry);
  	} else
  		error = PTR_ERR(dentry);
  	return error;
  }
bd89aabc6   Paul Menage   Control groups: R...
1731
  int cgroup_add_files(struct cgroup *cgrp,
ddbcc7e8e   Paul Menage   Task Control Grou...
1732
1733
1734
1735
1736
1737
  			struct cgroup_subsys *subsys,
  			const struct cftype cft[],
  			int count)
  {
  	int i, err;
  	for (i = 0; i < count; i++) {
bd89aabc6   Paul Menage   Control groups: R...
1738
  		err = cgroup_add_file(cgrp, subsys, &cft[i]);
ddbcc7e8e   Paul Menage   Task Control Grou...
1739
1740
1741
1742
1743
  		if (err)
  			return err;
  	}
  	return 0;
  }
a043e3b2c   Li Zefan   cgroup: fix comments
1744
1745
1746
1747
1748
1749
  /**
   * cgroup_task_count - count the number of tasks in a cgroup.
   * @cgrp: the cgroup in question
   *
   * Return the number of tasks in the cgroup.
   */
bd89aabc6   Paul Menage   Control groups: R...
1750
  int cgroup_task_count(const struct cgroup *cgrp)
bbcb81d09   Paul Menage   Task Control Grou...
1751
1752
  {
  	int count = 0;
71cbb949d   KOSAKI Motohiro   cgroup: list_for_...
1753
  	struct cg_cgroup_link *link;
817929ec2   Paul Menage   Task Control Grou...
1754
1755
  
  	read_lock(&css_set_lock);
71cbb949d   KOSAKI Motohiro   cgroup: list_for_...
1756
  	list_for_each_entry(link, &cgrp->css_sets, cgrp_link_list) {
146aa1bd0   Lai Jiangshan   cgroups: fix prob...
1757
  		count += atomic_read(&link->cg->refcount);
817929ec2   Paul Menage   Task Control Grou...
1758
1759
  	}
  	read_unlock(&css_set_lock);
bbcb81d09   Paul Menage   Task Control Grou...
1760
1761
1762
1763
  	return count;
  }
  
  /*
817929ec2   Paul Menage   Task Control Grou...
1764
1765
1766
   * Advance a list_head iterator.  The iterator should be positioned at
   * the start of a css_set
   */
bd89aabc6   Paul Menage   Control groups: R...
1767
  static void cgroup_advance_iter(struct cgroup *cgrp,
817929ec2   Paul Menage   Task Control Grou...
1768
1769
1770
1771
1772
1773
1774
1775
1776
  					  struct cgroup_iter *it)
  {
  	struct list_head *l = it->cg_link;
  	struct cg_cgroup_link *link;
  	struct css_set *cg;
  
  	/* Advance to the next non-empty css_set */
  	do {
  		l = l->next;
bd89aabc6   Paul Menage   Control groups: R...
1777
  		if (l == &cgrp->css_sets) {
817929ec2   Paul Menage   Task Control Grou...
1778
1779
1780
  			it->cg_link = NULL;
  			return;
  		}
bd89aabc6   Paul Menage   Control groups: R...
1781
  		link = list_entry(l, struct cg_cgroup_link, cgrp_link_list);
817929ec2   Paul Menage   Task Control Grou...
1782
1783
1784
1785
1786
  		cg = link->cg;
  	} while (list_empty(&cg->tasks));
  	it->cg_link = l;
  	it->task = cg->tasks.next;
  }
31a7df01f   Cliff Wickman   cgroups: mechanis...
1787
1788
1789
1790
1791
1792
1793
1794
1795
  /*
   * To reduce the fork() overhead for systems that are not actually
   * using their cgroups capability, we don't maintain the lists running
   * through each css_set to its tasks until we see the list actually
   * used - in other words after the first call to cgroup_iter_start().
   *
   * The tasklist_lock is not held here, as do_each_thread() and
   * while_each_thread() are protected by RCU.
   */
3df91fe30   Adrian Bunk   make cgroup_enabl...
1796
  static void cgroup_enable_task_cg_lists(void)
31a7df01f   Cliff Wickman   cgroups: mechanis...
1797
1798
1799
1800
1801
1802
  {
  	struct task_struct *p, *g;
  	write_lock(&css_set_lock);
  	use_task_css_set_links = 1;
  	do_each_thread(g, p) {
  		task_lock(p);
0e04388f0   Li Zefan   cgroup: fix a rac...
1803
1804
1805
1806
1807
1808
  		/*
  		 * We should check if the process is exiting, otherwise
  		 * it will race with cgroup_exit(): the list entry won't
  		 * be deleted even though the process has exited.
  		 */
  		if (!(p->flags & PF_EXITING) && list_empty(&p->cg_list))
31a7df01f   Cliff Wickman   cgroups: mechanis...
1809
1810
1811
1812
1813
  			list_add(&p->cg_list, &p->cgroups->tasks);
  		task_unlock(p);
  	} while_each_thread(g, p);
  	write_unlock(&css_set_lock);
  }
bd89aabc6   Paul Menage   Control groups: R...
1814
  void cgroup_iter_start(struct cgroup *cgrp, struct cgroup_iter *it)
817929ec2   Paul Menage   Task Control Grou...
1815
1816
1817
1818
1819
1820
  {
  	/*
  	 * The first time anyone tries to iterate across a cgroup,
  	 * we need to enable the list linking each css_set to its
  	 * tasks, and fix up all existing tasks.
  	 */
31a7df01f   Cliff Wickman   cgroups: mechanis...
1821
1822
  	if (!use_task_css_set_links)
  		cgroup_enable_task_cg_lists();
817929ec2   Paul Menage   Task Control Grou...
1823
  	read_lock(&css_set_lock);
bd89aabc6   Paul Menage   Control groups: R...
1824
1825
  	it->cg_link = &cgrp->css_sets;
  	cgroup_advance_iter(cgrp, it);
817929ec2   Paul Menage   Task Control Grou...
1826
  }
bd89aabc6   Paul Menage   Control groups: R...
1827
  struct task_struct *cgroup_iter_next(struct cgroup *cgrp,
817929ec2   Paul Menage   Task Control Grou...
1828
1829
1830
1831
  					struct cgroup_iter *it)
  {
  	struct task_struct *res;
  	struct list_head *l = it->task;
2019f634c   Lai Jiangshan   cgroups: fix cgro...
1832
  	struct cg_cgroup_link *link;
817929ec2   Paul Menage   Task Control Grou...
1833
1834
1835
1836
1837
1838
1839
  
  	/* If the iterator cg is NULL, we have no tasks */
  	if (!it->cg_link)
  		return NULL;
  	res = list_entry(l, struct task_struct, cg_list);
  	/* Advance iterator to find next entry */
  	l = l->next;
2019f634c   Lai Jiangshan   cgroups: fix cgro...
1840
1841
  	link = list_entry(it->cg_link, struct cg_cgroup_link, cgrp_link_list);
  	if (l == &link->cg->tasks) {
817929ec2   Paul Menage   Task Control Grou...
1842
1843
  		/* We reached the end of this task list - move on to
  		 * the next cg_cgroup_link */
bd89aabc6   Paul Menage   Control groups: R...
1844
  		cgroup_advance_iter(cgrp, it);
817929ec2   Paul Menage   Task Control Grou...
1845
1846
1847
1848
1849
  	} else {
  		it->task = l;
  	}
  	return res;
  }
bd89aabc6   Paul Menage   Control groups: R...
1850
  void cgroup_iter_end(struct cgroup *cgrp, struct cgroup_iter *it)
817929ec2   Paul Menage   Task Control Grou...
1851
1852
1853
  {
  	read_unlock(&css_set_lock);
  }
31a7df01f   Cliff Wickman   cgroups: mechanis...
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
  static inline int started_after_time(struct task_struct *t1,
  				     struct timespec *time,
  				     struct task_struct *t2)
  {
  	int start_diff = timespec_compare(&t1->start_time, time);
  	if (start_diff > 0) {
  		return 1;
  	} else if (start_diff < 0) {
  		return 0;
  	} else {
  		/*
  		 * Arbitrarily, if two processes started at the same
  		 * time, we'll say that the lower pointer value
  		 * started first. Note that t2 may have exited by now
  		 * so this may not be a valid pointer any longer, but
  		 * that's fine - it still serves to distinguish
  		 * between two tasks started (effectively) simultaneously.
  		 */
  		return t1 > t2;
  	}
  }
  
  /*
   * This function is a callback from heap_insert() and is used to order
   * the heap.
   * In this case we order the heap in descending task start time.
   */
  static inline int started_after(void *p1, void *p2)
  {
  	struct task_struct *t1 = p1;
  	struct task_struct *t2 = p2;
  	return started_after_time(t1, &t2->start_time, t2);
  }
  
  /**
   * cgroup_scan_tasks - iterate though all the tasks in a cgroup
   * @scan: struct cgroup_scanner containing arguments for the scan
   *
   * Arguments include pointers to callback functions test_task() and
   * process_task().
   * Iterate through all the tasks in a cgroup, calling test_task() for each,
   * and if it returns true, call process_task() for it also.
   * The test_task pointer may be NULL, meaning always true (select all tasks).
   * Effectively duplicates cgroup_iter_{start,next,end}()
   * but does not lock css_set_lock for the call to process_task().
   * The struct cgroup_scanner may be embedded in any structure of the caller's
   * creation.
   * It is guaranteed that process_task() will act on every task that
   * is a member of the cgroup for the duration of this call. This
   * function may or may not call process_task() for tasks that exit
   * or move to a different cgroup during the call, or are forked or
   * move into the cgroup during the call.
   *
   * Note that test_task() may be called with locks held, and may in some
   * situations be called multiple times for the same task, so it should
   * be cheap.
   * If the heap pointer in the struct cgroup_scanner is non-NULL, a heap has been
   * pre-allocated and will be used for heap operations (and its "gt" member will
   * be overwritten), else a temporary heap will be used (allocation of which
   * may cause this function to fail).
   */
  int cgroup_scan_tasks(struct cgroup_scanner *scan)
  {
  	int retval, i;
  	struct cgroup_iter it;
  	struct task_struct *p, *dropped;
  	/* Never dereference latest_task, since it's not refcounted */
  	struct task_struct *latest_task = NULL;
  	struct ptr_heap tmp_heap;
  	struct ptr_heap *heap;
  	struct timespec latest_time = { 0, 0 };
  
  	if (scan->heap) {
  		/* The caller supplied our heap and pre-allocated its memory */
  		heap = scan->heap;
  		heap->gt = &started_after;
  	} else {
  		/* We need to allocate our own heap memory */
  		heap = &tmp_heap;
  		retval = heap_init(heap, PAGE_SIZE, GFP_KERNEL, &started_after);
  		if (retval)
  			/* cannot allocate the heap */
  			return retval;
  	}
  
   again:
  	/*
  	 * Scan tasks in the cgroup, using the scanner's "test_task" callback
  	 * to determine which are of interest, and using the scanner's
  	 * "process_task" callback to process any of them that need an update.
  	 * Since we don't want to hold any locks during the task updates,
  	 * gather tasks to be processed in a heap structure.
  	 * The heap is sorted by descending task start time.
  	 * If the statically-sized heap fills up, we overflow tasks that
  	 * started later, and in future iterations only consider tasks that
  	 * started after the latest task in the previous pass. This
  	 * guarantees forward progress and that we don't miss any tasks.
  	 */
  	heap->size = 0;
  	cgroup_iter_start(scan->cg, &it);
  	while ((p = cgroup_iter_next(scan->cg, &it))) {
  		/*
  		 * Only affect tasks that qualify per the caller's callback,
  		 * if one was provided
  		 */
  		if (scan->test_task && !scan->test_task(p, scan))
  			continue;
  		/*
  		 * Only process tasks that started after the last task
  		 * we processed
  		 */
  		if (!started_after_time(p, &latest_time, latest_task))
  			continue;
  		dropped = heap_insert(heap, p);
  		if (dropped == NULL) {
  			/*
  			 * The new task was inserted; the heap wasn't
  			 * previously full
  			 */
  			get_task_struct(p);
  		} else if (dropped != p) {
  			/*
  			 * The new task was inserted, and pushed out a
  			 * different task
  			 */
  			get_task_struct(p);
  			put_task_struct(dropped);
  		}
  		/*
  		 * Else the new task was newer than anything already in
  		 * the heap and wasn't inserted
  		 */
  	}
  	cgroup_iter_end(scan->cg, &it);
  
  	if (heap->size) {
  		for (i = 0; i < heap->size; i++) {
4fe91d518   Paul Jackson   cgroup: fix spars...
1991
  			struct task_struct *q = heap->ptrs[i];
31a7df01f   Cliff Wickman   cgroups: mechanis...
1992
  			if (i == 0) {
4fe91d518   Paul Jackson   cgroup: fix spars...
1993
1994
  				latest_time = q->start_time;
  				latest_task = q;
31a7df01f   Cliff Wickman   cgroups: mechanis...
1995
1996
  			}
  			/* Process the task per the caller's callback */
4fe91d518   Paul Jackson   cgroup: fix spars...
1997
1998
  			scan->process_task(q, scan);
  			put_task_struct(q);
31a7df01f   Cliff Wickman   cgroups: mechanis...
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
  		}
  		/*
  		 * If we had to process any tasks at all, scan again
  		 * in case some of them were in the middle of forking
  		 * children that didn't get processed.
  		 * Not the most efficient way to do it, but it avoids
  		 * having to take callback_mutex in the fork path
  		 */
  		goto again;
  	}
  	if (heap == &tmp_heap)
  		heap_free(&tmp_heap);
  	return 0;
  }
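  /*
   * Minimal usage sketch (illustrative, not part of the original file),
   * using the cgroup_scanner fields referenced above:
   *
   *	struct cgroup_scanner scan = {
   *		.cg = cgrp,
   *		.test_task = my_test,
   *		.process_task = my_process,
   *		.heap = NULL,
   *	};
   *	cgroup_scan_tasks(&scan);
   *
   * "my_test" and "my_process" are hypothetical caller-supplied
   * callbacks; .test_task may be NULL to select all tasks, and a NULL
   * .heap makes this function allocate a temporary heap. cpuset is an
   * in-tree user of this interface.
   */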
817929ec2   Paul Menage   Task Control Grou...
2013
  /*
bbcb81d09   Paul Menage   Task Control Grou...
2014
2015
2016
2017
2018
2019
2020
   * Stuff for reading the 'tasks' file.
   *
   * Reading this file can return large amounts of data if a cgroup has
   * *lots* of attached tasks. So it may need several calls to read(),
   * but we cannot guarantee that the information we produce is correct
   * unless we produce it entirely atomically.
   *
bbcb81d09   Paul Menage   Task Control Grou...
2021
   */
bbcb81d09   Paul Menage   Task Control Grou...
2022
2023
2024
  
  /*
   * Load into 'pidarray' up to 'npids' of the tasks using cgroup
bd89aabc6   Paul Menage   Control groups: R...
2025
   * 'cgrp'.  Return actual number of pids loaded.  No need to
bbcb81d09   Paul Menage   Task Control Grou...
2026
2027
2028
2029
   * task_lock(p) when reading out p->cgroup, since we're in an RCU
   * read section, so the css_set can't go away, and is
   * immutable after creation.
   */
bd89aabc6   Paul Menage   Control groups: R...
2030
  static int pid_array_load(pid_t *pidarray, int npids, struct cgroup *cgrp)
bbcb81d09   Paul Menage   Task Control Grou...
2031
  {
e7b80bb69   Gowrishankar M   cgroups: skip pro...
2032
  	int n = 0, pid;
817929ec2   Paul Menage   Task Control Grou...
2033
2034
  	struct cgroup_iter it;
  	struct task_struct *tsk;
bd89aabc6   Paul Menage   Control groups: R...
2035
2036
  	cgroup_iter_start(cgrp, &it);
  	while ((tsk = cgroup_iter_next(cgrp, &it))) {
817929ec2   Paul Menage   Task Control Grou...
2037
2038
  		if (unlikely(n == npids))
  			break;
e7b80bb69   Gowrishankar M   cgroups: skip pro...
2039
2040
2041
  		pid = task_pid_vnr(tsk);
  		if (pid > 0)
  			pidarray[n++] = pid;
817929ec2   Paul Menage   Task Control Grou...
2042
  	}
bd89aabc6   Paul Menage   Control groups: R...
2043
  	cgroup_iter_end(cgrp, &it);
bbcb81d09   Paul Menage   Task Control Grou...
2044
2045
  	return n;
  }
846c7bb05   Balbir Singh   Add cgroupstats
2046
  /**
a043e3b2c   Li Zefan   cgroup: fix comments
2047
   * cgroupstats_build - build and fill cgroupstats
846c7bb05   Balbir Singh   Add cgroupstats
2048
2049
2050
   * @stats: cgroupstats to fill information into
   * @dentry: A dentry entry belonging to the cgroup for which stats have
   * been requested.
a043e3b2c   Li Zefan   cgroup: fix comments
2051
2052
2053
   *
   * Build and fill cgroupstats so that taskstats can export it to user
   * space.
846c7bb05   Balbir Singh   Add cgroupstats
2054
2055
2056
2057
   */
  int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry)
  {
  	int ret = -EINVAL;
bd89aabc6   Paul Menage   Control groups: R...
2058
  	struct cgroup *cgrp;
846c7bb05   Balbir Singh   Add cgroupstats
2059
2060
  	struct cgroup_iter it;
  	struct task_struct *tsk;
33d283bef   Li Zefan   cgroups: fix a se...
2061

846c7bb05   Balbir Singh   Add cgroupstats
2062
  	/*
33d283bef   Li Zefan   cgroups: fix a se...
2063
2064
  	 * Validate dentry by checking the superblock operations,
  	 * and make sure it's a directory.
846c7bb05   Balbir Singh   Add cgroupstats
2065
  	 */
33d283bef   Li Zefan   cgroups: fix a se...
2066
2067
  	if (dentry->d_sb->s_op != &cgroup_ops ||
  	    !S_ISDIR(dentry->d_inode->i_mode))
846c7bb05   Balbir Singh   Add cgroupstats
2068
2069
2070
  		 goto err;
  
  	ret = 0;
bd89aabc6   Paul Menage   Control groups: R...
2071
  	cgrp = dentry->d_fsdata;
846c7bb05   Balbir Singh   Add cgroupstats
2072

bd89aabc6   Paul Menage   Control groups: R...
2073
2074
  	cgroup_iter_start(cgrp, &it);
  	while ((tsk = cgroup_iter_next(cgrp, &it))) {
846c7bb05   Balbir Singh   Add cgroupstats
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
  		switch (tsk->state) {
  		case TASK_RUNNING:
  			stats->nr_running++;
  			break;
  		case TASK_INTERRUPTIBLE:
  			stats->nr_sleeping++;
  			break;
  		case TASK_UNINTERRUPTIBLE:
  			stats->nr_uninterruptible++;
  			break;
  		case TASK_STOPPED:
  			stats->nr_stopped++;
  			break;
  		default:
  			if (delayacct_is_task_waiting_on_io(tsk))
  				stats->nr_io_wait++;
  			break;
  		}
  	}
bd89aabc6   Paul Menage   Control groups: R...
2094
  	cgroup_iter_end(cgrp, &it);
846c7bb05   Balbir Singh   Add cgroupstats
2095

846c7bb05   Balbir Singh   Add cgroupstats
2096
2097
2098
  err:
  	return ret;
  }
bbcb81d09   Paul Menage   Task Control Grou...
2099
2100
2101
2102
  static int cmppid(const void *a, const void *b)
  {
  	return *(pid_t *)a - *(pid_t *)b;
  }
cc31edcee   Paul Menage   cgroups: convert ...
2103

bbcb81d09   Paul Menage   Task Control Grou...
2104
  /*
cc31edcee   Paul Menage   cgroups: convert ...
2105
2106
2107
   * seq_file methods for the "tasks" file. The seq_file position is the
   * next pid to display; the seq_file iterator is a pointer to the pid
   * in the cgroup->tasks_pids array.
bbcb81d09   Paul Menage   Task Control Grou...
2108
   */
cc31edcee   Paul Menage   cgroups: convert ...
2109
2110
  
  static void *cgroup_tasks_start(struct seq_file *s, loff_t *pos)
bbcb81d09   Paul Menage   Task Control Grou...
2111
  {
cc31edcee   Paul Menage   cgroups: convert ...
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
  	/*
  	 * Initially we receive a position value that corresponds to
  	 * one more than the last pid shown (or 0 on the first call or
  	 * after a seek to the start). Use a binary-search to find the
  	 * next pid to display, if any
  	 */
  	struct cgroup *cgrp = s->private;
  	int index = 0, pid = *pos;
  	int *iter;
  
  	down_read(&cgrp->pids_mutex);
  	if (pid) {
  		int end = cgrp->pids_length;
207777664   Stephen Rothwell   cgroup: remove un...
2125

cc31edcee   Paul Menage   cgroups: convert ...
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
  		while (index < end) {
  			int mid = (index + end) / 2;
  			if (cgrp->tasks_pids[mid] == pid) {
  				index = mid;
  				break;
  			} else if (cgrp->tasks_pids[mid] <= pid)
  				index = mid + 1;
  			else
  				end = mid;
  		}
  	}
  	/* If we're off the end of the array, we're done */
  	if (index >= cgrp->pids_length)
  		return NULL;
  	/* Update the abstract position to be the actual pid that we found */
  	iter = cgrp->tasks_pids + index;
  	*pos = *iter;
  	return iter;
  }
  
  static void cgroup_tasks_stop(struct seq_file *s, void *v)
  {
  	struct cgroup *cgrp = s->private;
  	up_read(&cgrp->pids_mutex);
  }
  
  static void *cgroup_tasks_next(struct seq_file *s, void *v, loff_t *pos)
  {
  	struct cgroup *cgrp = s->private;
  	int *p = v;
  	int *end = cgrp->tasks_pids + cgrp->pids_length;
  
  	/*
  	 * Advance to the next pid in the array. If this goes off the
  	 * end, we're done
  	 */
  	p++;
  	if (p >= end) {
  		return NULL;
  	} else {
  		*pos = *p;
  		return p;
  	}
  }
  
  static int cgroup_tasks_show(struct seq_file *s, void *v)
  {
  	return seq_printf(s, "%d
  ", *(int *)v);
  }
bbcb81d09   Paul Menage   Task Control Grou...
2176

cc31edcee   Paul Menage   cgroups: convert ...
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
  static struct seq_operations cgroup_tasks_seq_operations = {
  	.start = cgroup_tasks_start,
  	.stop = cgroup_tasks_stop,
  	.next = cgroup_tasks_next,
  	.show = cgroup_tasks_show,
  };
  
  static void release_cgroup_pid_array(struct cgroup *cgrp)
  {
  	down_write(&cgrp->pids_mutex);
  	BUG_ON(!cgrp->pids_use_count);
  	if (!--cgrp->pids_use_count) {
  		kfree(cgrp->tasks_pids);
  		cgrp->tasks_pids = NULL;
  		cgrp->pids_length = 0;
  	}
  	up_write(&cgrp->pids_mutex);
bbcb81d09   Paul Menage   Task Control Grou...
2194
  }
cc31edcee   Paul Menage   cgroups: convert ...
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
  static int cgroup_tasks_release(struct inode *inode, struct file *file)
  {
  	struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);
  
  	if (!(file->f_mode & FMODE_READ))
  		return 0;
  
  	release_cgroup_pid_array(cgrp);
  	return seq_release(inode, file);
  }
  
  static struct file_operations cgroup_tasks_operations = {
  	.read = seq_read,
  	.llseek = seq_lseek,
  	.write = cgroup_file_write,
  	.release = cgroup_tasks_release,
  };
bbcb81d09   Paul Menage   Task Control Grou...
2212
  /*
cc31edcee   Paul Menage   cgroups: convert ...
2213
   * Handle an open on 'tasks' file.  Prepare an array containing the
bbcb81d09   Paul Menage   Task Control Grou...
2214
   * process id's of tasks currently attached to the cgroup being opened.
bbcb81d09   Paul Menage   Task Control Grou...
2215
   */
cc31edcee   Paul Menage   cgroups: convert ...
2216

bbcb81d09   Paul Menage   Task Control Grou...
2217
2218
  static int cgroup_tasks_open(struct inode *unused, struct file *file)
  {
bd89aabc6   Paul Menage   Control groups: R...
2219
  	struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);
bbcb81d09   Paul Menage   Task Control Grou...
2220
2221
  	pid_t *pidarray;
  	int npids;
cc31edcee   Paul Menage   cgroups: convert ...
2222
  	int retval;
bbcb81d09   Paul Menage   Task Control Grou...
2223

cc31edcee   Paul Menage   cgroups: convert ...
2224
  	/* Nothing to do for write-only files */
bbcb81d09   Paul Menage   Task Control Grou...
2225
2226
  	if (!(file->f_mode & FMODE_READ))
  		return 0;
bbcb81d09   Paul Menage   Task Control Grou...
2227
2228
2229
2230
2231
2232
  	/*
  	 * If cgroup gets more users after we read count, we won't have
  	 * enough space - tough.  This race is indistinguishable to the
  	 * caller from the case that the additional cgroup users didn't
  	 * show up until sometime later on.
  	 */
bd89aabc6   Paul Menage   Control groups: R...
2233
  	npids = cgroup_task_count(cgrp);
cc31edcee   Paul Menage   cgroups: convert ...
2234
2235
2236
2237
2238
  	pidarray = kmalloc(npids * sizeof(pid_t), GFP_KERNEL);
  	if (!pidarray)
  		return -ENOMEM;
  	npids = pid_array_load(pidarray, npids, cgrp);
  	sort(pidarray, npids, sizeof(pid_t), cmppid, NULL);
bbcb81d09   Paul Menage   Task Control Grou...
2239

cc31edcee   Paul Menage   cgroups: convert ...
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
  	/*
  	 * Store the array in the cgroup, freeing the old
  	 * array if necessary
  	 */
  	down_write(&cgrp->pids_mutex);
  	kfree(cgrp->tasks_pids);
  	cgrp->tasks_pids = pidarray;
  	cgrp->pids_length = npids;
  	cgrp->pids_use_count++;
  	up_write(&cgrp->pids_mutex);
  
  	file->f_op = &cgroup_tasks_operations;
  
  	retval = seq_open(file, &cgroup_tasks_seq_operations);
  	if (retval) {
  		release_cgroup_pid_array(cgrp);
  		return retval;
bbcb81d09   Paul Menage   Task Control Grou...
2257
  	}
cc31edcee   Paul Menage   cgroups: convert ...
2258
  	((struct seq_file *)file->private_data)->private = cgrp;
bbcb81d09   Paul Menage   Task Control Grou...
2259
2260
  	return 0;
  }
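  /*
   * From userspace (illustrative): reading the per-cgroup "tasks" file,
   * e.g. "cat <mountpoint>/<cgroup>/tasks", goes through this open path
   * and then the seq_file iterator above, producing one pid per line.
   */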
bd89aabc6   Paul Menage   Control groups: R...
2261
  static u64 cgroup_read_notify_on_release(struct cgroup *cgrp,
81a6a5cdd   Paul Menage   Task Control Grou...
2262
2263
  					    struct cftype *cft)
  {
bd89aabc6   Paul Menage   Control groups: R...
2264
  	return notify_on_release(cgrp);
81a6a5cdd   Paul Menage   Task Control Grou...
2265
  }
6379c1061   Paul Menage   cgroup files: mov...
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
  static int cgroup_write_notify_on_release(struct cgroup *cgrp,
  					  struct cftype *cft,
  					  u64 val)
  {
  	clear_bit(CGRP_RELEASABLE, &cgrp->flags);
  	if (val)
  		set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
  	else
  		clear_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
  	return 0;
  }
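  /*
   * From userspace (illustrative):
   * "echo 1 > <mountpoint>/<cgroup>/notify_on_release" reaches this
   * handler via cgroup_write_X64() and sets CGRP_NOTIFY_ON_RELEASE;
   * writing 0 clears it.
   */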
bbcb81d09   Paul Menage   Task Control Grou...
2277
2278
2279
  /*
   * for the common functions, 'private' gives the type of file
   */
81a6a5cdd   Paul Menage   Task Control Grou...
2280
2281
2282
2283
  static struct cftype files[] = {
  	{
  		.name = "tasks",
  		.open = cgroup_tasks_open,
af351026a   Paul Menage   cgroup files: tur...
2284
  		.write_u64 = cgroup_tasks_write,
81a6a5cdd   Paul Menage   Task Control Grou...
2285
2286
  		.release = cgroup_tasks_release,
  		.private = FILE_TASKLIST,
099fca322   Li Zefan   cgroups: show cor...
2287
  		.mode = S_IRUGO | S_IWUSR,
81a6a5cdd   Paul Menage   Task Control Grou...
2288
2289
2290
2291
  	},
  
  	{
  		.name = "notify_on_release",
f4c753b7e   Paul Menage   CGroup API files:...
2292
  		.read_u64 = cgroup_read_notify_on_release,
6379c1061   Paul Menage   cgroup files: mov...
2293
  		.write_u64 = cgroup_write_notify_on_release,
81a6a5cdd   Paul Menage   Task Control Grou...
2294
2295
  		.private = FILE_NOTIFY_ON_RELEASE,
  	},
81a6a5cdd   Paul Menage   Task Control Grou...
2296
2297
2298
2299
  };
  
  static struct cftype cft_release_agent = {
  	.name = "release_agent",
e788e066c   Paul Menage   cgroup files: mov...
2300
2301
2302
  	.read_seq_string = cgroup_release_agent_show,
  	.write_string = cgroup_release_agent_write,
  	.max_write_len = PATH_MAX,
81a6a5cdd   Paul Menage   Task Control Grou...
2303
  	.private = FILE_RELEASE_AGENT,
bbcb81d09   Paul Menage   Task Control Grou...
2304
  };
bd89aabc6   Paul Menage   Control groups: R...
2305
  static int cgroup_populate_dir(struct cgroup *cgrp)
ddbcc7e8e   Paul Menage   Task Control Grou...
2306
2307
2308
2309
2310
  {
  	int err;
  	struct cgroup_subsys *ss;
  
  	/* First clear out any existing files */
bd89aabc6   Paul Menage   Control groups: R...
2311
  	cgroup_clear_directory(cgrp->dentry);
ddbcc7e8e   Paul Menage   Task Control Grou...
2312

bd89aabc6   Paul Menage   Control groups: R...
2313
  	err = cgroup_add_files(cgrp, NULL, files, ARRAY_SIZE(files));
bbcb81d09   Paul Menage   Task Control Grou...
2314
2315
  	if (err < 0)
  		return err;
bd89aabc6   Paul Menage   Control groups: R...
2316
2317
  	if (cgrp == cgrp->top_cgroup) {
  		if ((err = cgroup_add_file(cgrp, NULL, &cft_release_agent)) < 0)
81a6a5cdd   Paul Menage   Task Control Grou...
2318
2319
  			return err;
  	}
bd89aabc6   Paul Menage   Control groups: R...
2320
2321
  	for_each_subsys(cgrp->root, ss) {
  		if (ss->populate && (err = ss->populate(ss, cgrp)) < 0)
ddbcc7e8e   Paul Menage   Task Control Grou...
2322
2323
  			return err;
  	}
38460b48d   KAMEZAWA Hiroyuki   cgroup: CSS ID su...
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
  	/* This cgroup is ready now */
  	for_each_subsys(cgrp->root, ss) {
  		struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
  		/*
  		 * Update id->css pointer and make this css visible from
  		 * CSS ID functions. This pointer will be dereferenced
  		 * from RCU-read-side without locks.
  		 */
  		if (css->id)
  			rcu_assign_pointer(css->id->css, css);
  	}
ddbcc7e8e   Paul Menage   Task Control Grou...
2335
2336
2337
2338
2339
2340
  
  	return 0;
  }
  
  static void init_cgroup_css(struct cgroup_subsys_state *css,
  			       struct cgroup_subsys *ss,
bd89aabc6   Paul Menage   Control groups: R...
2341
  			       struct cgroup *cgrp)
ddbcc7e8e   Paul Menage   Task Control Grou...
2342
  {
bd89aabc6   Paul Menage   Control groups: R...
2343
  	css->cgroup = cgrp;
e7c5ec919   Paul Menage   cgroups: add css_...
2344
  	atomic_set(&css->refcnt, 1);
ddbcc7e8e   Paul Menage   Task Control Grou...
2345
  	css->flags = 0;
38460b48d   KAMEZAWA Hiroyuki   cgroup: CSS ID su...
2346
  	css->id = NULL;
bd89aabc6   Paul Menage   Control groups: R...
2347
  	if (cgrp == dummytop)
ddbcc7e8e   Paul Menage   Task Control Grou...
2348
  		set_bit(CSS_ROOT, &css->flags);
bd89aabc6   Paul Menage   Control groups: R...
2349
2350
  	BUG_ON(cgrp->subsys[ss->subsys_id]);
  	cgrp->subsys[ss->subsys_id] = css;
ddbcc7e8e   Paul Menage   Task Control Grou...
2351
  }
999cd8a45   Paul Menage   cgroups: add a pe...
2352
2353
2354
2355
2356
2357
2358
2359
  static void cgroup_lock_hierarchy(struct cgroupfs_root *root)
  {
  	/* We need to take each hierarchy_mutex in a consistent order */
  	int i;
  
  	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
  		struct cgroup_subsys *ss = subsys[i];
  		if (ss->root == root)
cfebe563b   Li Zefan   cgroups: fix lock...
2360
  			mutex_lock(&ss->hierarchy_mutex);
999cd8a45   Paul Menage   cgroups: add a pe...
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
  	}
  }
  
  static void cgroup_unlock_hierarchy(struct cgroupfs_root *root)
  {
  	int i;
  
  	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
  		struct cgroup_subsys *ss = subsys[i];
  		if (ss->root == root)
  			mutex_unlock(&ss->hierarchy_mutex);
  	}
  }
ddbcc7e8e   Paul Menage   Task Control Grou...
2374
  /*
a043e3b2c   Li Zefan   cgroup: fix comments
2375
2376
2377
2378
   * cgroup_create - create a cgroup
   * @parent: cgroup that will be parent of the new cgroup
   * @dentry: dentry of the new cgroup
   * @mode: mode to set on new inode
ddbcc7e8e   Paul Menage   Task Control Grou...
2379
   *
a043e3b2c   Li Zefan   cgroup: fix comments
2380
   * Must be called with the mutex on the parent inode held
ddbcc7e8e   Paul Menage   Task Control Grou...
2381
   */
ddbcc7e8e   Paul Menage   Task Control Grou...
2382
  static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
099fca322   Li Zefan   cgroups: show cor...
2383
  			     mode_t mode)
ddbcc7e8e   Paul Menage   Task Control Grou...
2384
  {
bd89aabc6   Paul Menage   Control groups: R...
2385
  	struct cgroup *cgrp;
ddbcc7e8e   Paul Menage   Task Control Grou...
2386
2387
2388
2389
  	struct cgroupfs_root *root = parent->root;
  	int err = 0;
  	struct cgroup_subsys *ss;
  	struct super_block *sb = root->sb;
bd89aabc6   Paul Menage   Control groups: R...
2390
2391
  	cgrp = kzalloc(sizeof(*cgrp), GFP_KERNEL);
  	if (!cgrp)
ddbcc7e8e   Paul Menage   Task Control Grou...
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
  		return -ENOMEM;
  
  	/* Grab a reference on the superblock so the hierarchy doesn't
  	 * get deleted on unmount if there are child cgroups.  This
  	 * can be done outside cgroup_mutex, since the sb can't
  	 * disappear while someone has an open control file on the
  	 * fs */
  	atomic_inc(&sb->s_active);
  
  	mutex_lock(&cgroup_mutex);
cc31edcee   Paul Menage   cgroups: convert ...
2402
  	init_cgroup_housekeeping(cgrp);
ddbcc7e8e   Paul Menage   Task Control Grou...
2403

  	cgrp->parent = parent;
  	cgrp->root = parent->root;
  	cgrp->top_cgroup = parent->top_cgroup;

  	if (notify_on_release(parent))
  		set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
  	for_each_subsys(root, ss) {
  		struct cgroup_subsys_state *css = ss->create(ss, cgrp);
  		if (IS_ERR(css)) {
  			err = PTR_ERR(css);
  			goto err_destroy;
  		}
  		init_cgroup_css(css, ss, cgrp);
  		if (ss->use_id)
  			if (alloc_css_id(ss, parent, cgrp))
  				goto err_destroy;
		/* On error, the ->destroy() callback has to free the assigned ID. */
  	}
  	cgroup_lock_hierarchy(root);
  	list_add(&cgrp->sibling, &cgrp->parent->children);
  	cgroup_unlock_hierarchy(root);
  	root->number_of_cgroups++;
  	err = cgroup_create_dir(cgrp, dentry, mode);
  	if (err < 0)
  		goto err_remove;
  
  	/* The cgroup directory was pre-locked for us */
  	BUG_ON(!mutex_is_locked(&cgrp->dentry->d_inode->i_mutex));

  	err = cgroup_populate_dir(cgrp);
  	/* If err < 0, we have a half-filled directory - oh well ;) */
  
  	mutex_unlock(&cgroup_mutex);
  	mutex_unlock(&cgrp->dentry->d_inode->i_mutex);
  
  	return 0;
  
   err_remove:
  	cgroup_lock_hierarchy(root);
  	list_del(&cgrp->sibling);
  	cgroup_unlock_hierarchy(root);
  	root->number_of_cgroups--;
  
   err_destroy:
  
  	for_each_subsys(root, ss) {
  		if (cgrp->subsys[ss->subsys_id])
  			ss->destroy(ss, cgrp);
  	}
  
  	mutex_unlock(&cgroup_mutex);
  
  	/* Release the reference count that we took on the superblock */
  	deactivate_super(sb);
  	kfree(cgrp);
  	return err;
  }
  
  static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, int mode)
  {
  	struct cgroup *c_parent = dentry->d_parent->d_fsdata;
  
  	/* the vfs holds inode->i_mutex already */
  	return cgroup_create(c_parent, dentry, mode | S_IFDIR);
  }
  static int cgroup_has_css_refs(struct cgroup *cgrp)
  {
  	/* Check the reference count on each subsystem. Since we
  	 * already established that there are no tasks in the
  	 * cgroup, if the css refcount is also 1, then there should
  	 * be no outstanding references, so the subsystem is safe to
  	 * destroy. We scan across all subsystems rather than using
  	 * the per-hierarchy linked list of mounted subsystems since
  	 * we can be called via check_for_release() with no
  	 * synchronization other than RCU, and the subsystem linked
  	 * list isn't RCU-safe */
  	int i;
  	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
  		struct cgroup_subsys *ss = subsys[i];
  		struct cgroup_subsys_state *css;
  		/* Skip subsystems not in this hierarchy */
  		if (ss->root != cgrp->root)
  			continue;
  		css = cgrp->subsys[ss->subsys_id];
  		/* When called from check_for_release() it's possible
  		 * that by this point the cgroup has been removed
  		 * and the css deleted. But a false-positive doesn't
  		 * matter, since it can only happen if the cgroup
  		 * has been deleted and hence no longer needs the
  		 * release agent to be called anyway. */
  		if (css && (atomic_read(&css->refcnt) > 1))
  			return 1;
  	}
  	return 0;
  }
  /*
   * Atomically mark all (or else none) of the cgroup's CSS objects as
   * CSS_REMOVED. Return true on success, or false if the cgroup has
   * busy subsystems. Call with cgroup_mutex held
   */
  
  static int cgroup_clear_css_refs(struct cgroup *cgrp)
  {
  	struct cgroup_subsys *ss;
  	unsigned long flags;
  	bool failed = false;
  	local_irq_save(flags);
  	for_each_subsys(cgrp->root, ss) {
  		struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
  		int refcnt;
  		while (1) {
  			/* We can only remove a CSS with a refcnt==1 */
  			refcnt = atomic_read(&css->refcnt);
  			if (refcnt > 1) {
  				failed = true;
  				goto done;
  			}
  			BUG_ON(!refcnt);
  			/*
  			 * Drop the refcnt to 0 while we check other
  			 * subsystems. This will cause any racing
  			 * css_tryget() to spin until we set the
  			 * CSS_REMOVED bits or abort
  			 */
  			if (atomic_cmpxchg(&css->refcnt, refcnt, 0) == refcnt)
  				break;
  			cpu_relax();
  		}
  	}
   done:
  	for_each_subsys(cgrp->root, ss) {
  		struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
  		if (failed) {
  			/*
  			 * Restore old refcnt if we previously managed
  			 * to clear it from 1 to 0
  			 */
  			if (!atomic_read(&css->refcnt))
  				atomic_set(&css->refcnt, 1);
  		} else {
  			/* Commit the fact that the CSS is removed */
  			set_bit(CSS_REMOVED, &css->flags);
  		}
  	}
  	local_irq_restore(flags);
  	return !failed;
  }
  static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
  {
  	struct cgroup *cgrp = dentry->d_fsdata;
  	struct dentry *d;
  	struct cgroup *parent;
  	DEFINE_WAIT(wait);
  	int ret;
  
  	/* the vfs holds both inode->i_mutex already */
  again:
  	mutex_lock(&cgroup_mutex);
  	if (atomic_read(&cgrp->count) != 0) {
  		mutex_unlock(&cgroup_mutex);
  		return -EBUSY;
  	}
  	if (!list_empty(&cgrp->children)) {
  		mutex_unlock(&cgroup_mutex);
  		return -EBUSY;
  	}
  	mutex_unlock(&cgroup_mutex);

  	/*
	 * Call the pre_destroy() handlers of each subsystem, to notify them
	 * that an rmdir() request has arrived.
  	 */
  	ret = cgroup_call_pre_destroy(cgrp);
  	if (ret)
  		return ret;

  	mutex_lock(&cgroup_mutex);
  	parent = cgrp->parent;
  	if (atomic_read(&cgrp->count) || !list_empty(&cgrp->children)) {
  		mutex_unlock(&cgroup_mutex);
  		return -EBUSY;
  	}
  	/*
	 * css_put/get is provided for subsystems to grab a refcnt on a css. In
	 * the typical case, a subsystem holds no reference after pre_destroy().
	 * But under hierarchy management, some *temporary* refcnts may still be
	 * held. To avoid returning -EBUSY to userspace, a waitqueue is used: if
	 * the subsystem is really busy, it should return -EBUSY from
	 * pre_destroy(); wake_up is called when css_put() drops the refcnt to 0.
  	 */
  	set_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
  	prepare_to_wait(&cgroup_rmdir_waitq, &wait, TASK_INTERRUPTIBLE);
  
  	if (!cgroup_clear_css_refs(cgrp)) {
  		mutex_unlock(&cgroup_mutex);
  		schedule();
  		finish_wait(&cgroup_rmdir_waitq, &wait);
  		clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
  		if (signal_pending(current))
  			return -EINTR;
  		goto again;
  	}
	/* No css_tryget() can succeed after this point. */
  	finish_wait(&cgroup_rmdir_waitq, &wait);
  	clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);

  	spin_lock(&release_list_lock);
  	set_bit(CGRP_REMOVED, &cgrp->flags);
  	if (!list_empty(&cgrp->release_list))
  		list_del(&cgrp->release_list);
  	spin_unlock(&release_list_lock);
  
  	cgroup_lock_hierarchy(cgrp->root);
  	/* delete this cgroup from parent->children */
  	list_del(&cgrp->sibling);
  	cgroup_unlock_hierarchy(cgrp->root);
  	spin_lock(&cgrp->dentry->d_lock);
  	d = dget(cgrp->dentry);
  	spin_unlock(&d->d_lock);
  
  	cgroup_d_remove_dir(d);
  	dput(d);

  	set_bit(CGRP_RELEASABLE, &parent->flags);
  	check_for_release(parent);
  	mutex_unlock(&cgroup_mutex);
  	return 0;
  }
  static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
  {
  	struct cgroup_subsys_state *css;
  
  	printk(KERN_INFO "Initializing cgroup subsys %s
  ", ss->name);
  
  	/* Create the top cgroup state for this subsystem */
  	list_add(&ss->sibling, &rootnode.subsys_list);
  	ss->root = &rootnode;
  	css = ss->create(ss, dummytop);
  	/* We don't handle early failures gracefully */
  	BUG_ON(IS_ERR(css));
  	init_cgroup_css(css, ss, dummytop);
  	/* Update the init_css_set to contain a subsys
  	 * pointer to this state - since the subsystem is
  	 * newly registered, all tasks and hence the
  	 * init_css_set is in the subsystem's top cgroup. */
  	init_css_set.subsys[ss->subsys_id] = dummytop->subsys[ss->subsys_id];
  
  	need_forkexit_callback |= ss->fork || ss->exit;
  	/* At system boot, before all subsystems have been
  	 * registered, no tasks have been forked, so we don't
  	 * need to invoke fork callbacks here. */
  	BUG_ON(!list_empty(&init_task.tasks));
  	mutex_init(&ss->hierarchy_mutex);
  	lockdep_set_class(&ss->hierarchy_mutex, &ss->subsys_key);
  	ss->active = 1;
  }
  
  /**
   * cgroup_init_early - cgroup initialization at system boot
   *
   * Initialize cgroups at system boot, and initialize any
   * subsystems that request early init.
   */
  int __init cgroup_init_early(void)
  {
  	int i;
  	atomic_set(&init_css_set.refcount, 1);
  	INIT_LIST_HEAD(&init_css_set.cg_links);
  	INIT_LIST_HEAD(&init_css_set.tasks);
  	INIT_HLIST_NODE(&init_css_set.hlist);
  	css_set_count = 1;
  	init_cgroup_root(&rootnode);
  	root_count = 1;
  	init_task.cgroups = &init_css_set;
  
  	init_css_set_link.cg = &init_css_set;
  	list_add(&init_css_set_link.cgrp_link_list,
  		 &rootnode.top_cgroup.css_sets);
  	list_add(&init_css_set_link.cg_link_list,
  		 &init_css_set.cg_links);

  	for (i = 0; i < CSS_SET_TABLE_SIZE; i++)
  		INIT_HLIST_HEAD(&css_set_table[i]);
  	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
  		struct cgroup_subsys *ss = subsys[i];
  
  		BUG_ON(!ss->name);
  		BUG_ON(strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN);
  		BUG_ON(!ss->create);
  		BUG_ON(!ss->destroy);
  		if (ss->subsys_id != i) {
  			printk(KERN_ERR "cgroup: Subsys %s id == %d
  ",
  			       ss->name, ss->subsys_id);
  			BUG();
  		}
  
  		if (ss->early_init)
  			cgroup_init_subsys(ss);
  	}
  	return 0;
  }
  
  /**
   * cgroup_init - cgroup initialization
   *
   * Register cgroup filesystem and /proc file, and initialize
   * any subsystems that didn't request early init.
   */
  int __init cgroup_init(void)
  {
  	int err;
  	int i;
  	struct hlist_head *hhead;
  
  	err = bdi_init(&cgroup_backing_dev_info);
  	if (err)
  		return err;
  
  	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
  		struct cgroup_subsys *ss = subsys[i];
  		if (!ss->early_init)
  			cgroup_init_subsys(ss);
  		if (ss->use_id)
  			cgroup_subsys_init_idr(ss);
  	}
  	/* Add init_css_set to the hash table */
  	hhead = css_set_hash(init_css_set.subsys);
  	hlist_add_head(&init_css_set.hlist, hhead);
  	err = register_filesystem(&cgroup_fs_type);
  	if (err < 0)
  		goto out;
  	proc_create("cgroups", 0, NULL, &proc_cgroupstats_operations);

  out:
  	if (err)
  		bdi_destroy(&cgroup_backing_dev_info);
  	return err;
  }

  /*
   * proc_cgroup_show()
   *  - Print task's cgroup paths into seq_file, one line for each hierarchy
   *  - Used for /proc/<pid>/cgroup.
   *  - No need to task_lock(tsk) on this tsk->cgroup reference, as it
   *    doesn't really matter if tsk->cgroup changes after we read it,
   *    and we take cgroup_mutex, keeping cgroup_attach_task() from changing it
   *    anyway.  No need to check that tsk->cgroup != NULL, thanks to
 *    the_top_cgroup_hack in cgroup_exit(), which sets an exiting task's
 *    cgroup to top_cgroup.
   */
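/*
 * For illustration, each line emitted below has the form
 *   <hierarchy-subsys-bitmask>:<comma-separated-subsystem-names>:<cgroup-path>
 * e.g. "2:cpu:/mygroup" (example values only).
 */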
  
  /* TODO: Use a proper seq_file iterator */
  static int proc_cgroup_show(struct seq_file *m, void *v)
  {
  	struct pid *pid;
  	struct task_struct *tsk;
  	char *buf;
  	int retval;
  	struct cgroupfs_root *root;
  
  	retval = -ENOMEM;
  	buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
  	if (!buf)
  		goto out;
  
  	retval = -ESRCH;
  	pid = m->private;
  	tsk = get_pid_task(pid, PIDTYPE_PID);
  	if (!tsk)
  		goto out_free;
  
  	retval = 0;
  
  	mutex_lock(&cgroup_mutex);
  	for_each_active_root(root) {
  		struct cgroup_subsys *ss;
  		struct cgroup *cgrp;
  		int subsys_id;
  		int count = 0;
  		seq_printf(m, "%lu:", root->subsys_bits);
  		for_each_subsys(root, ss)
  			seq_printf(m, "%s%s", count++ ? "," : "", ss->name);
  		seq_putc(m, ':');
  		get_first_subsys(&root->top_cgroup, NULL, &subsys_id);
  		cgrp = task_cgroup(tsk, subsys_id);
  		retval = cgroup_path(cgrp, buf, PAGE_SIZE);
  		if (retval < 0)
  			goto out_unlock;
  		seq_puts(m, buf);
		seq_putc(m, '\n');
  	}
  
  out_unlock:
  	mutex_unlock(&cgroup_mutex);
  	put_task_struct(tsk);
  out_free:
  	kfree(buf);
  out:
  	return retval;
  }
  
  static int cgroup_open(struct inode *inode, struct file *file)
  {
  	struct pid *pid = PROC_I(inode)->pid;
  	return single_open(file, proc_cgroup_show, pid);
  }
  
  struct file_operations proc_cgroup_operations = {
  	.open		= cgroup_open,
  	.read		= seq_read,
  	.llseek		= seq_lseek,
  	.release	= single_release,
  };
  
  /* Display information about each subsystem and each hierarchy */
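/*
 * The resulting /proc/cgroups output looks roughly like this
 * (illustrative values):
 *
 *   #subsys_name	hierarchy	num_cgroups	enabled
 *   cpuset		2		4		1
 */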
  static int proc_cgroupstats_show(struct seq_file *m, void *v)
  {
  	int i;

  	seq_puts(m, "#subsys_name\thierarchy\tnum_cgroups\tenabled
  ");
  	mutex_lock(&cgroup_mutex);
  	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
  		struct cgroup_subsys *ss = subsys[i];
  		seq_printf(m, "%s\t%lu\t%d\t%d
  ",
  			   ss->name, ss->root->subsys_bits,
  			   ss->root->number_of_cgroups, !ss->disabled);
  	}
  	mutex_unlock(&cgroup_mutex);
  	return 0;
  }
  
  static int cgroupstats_open(struct inode *inode, struct file *file)
  {
  	return single_open(file, proc_cgroupstats_show, NULL);
  }
  
  static struct file_operations proc_cgroupstats_operations = {
  	.open = cgroupstats_open,
  	.read = seq_read,
  	.llseek = seq_lseek,
  	.release = single_release,
  };
  /**
 * cgroup_fork - attach newly forked task to its parent's cgroup.
 * @child: pointer to task_struct of the newly forked child process.
   *
   * Description: A task inherits its parent's cgroup at fork().
   *
   * A pointer to the shared css_set was automatically copied in
   * fork.c by dup_task_struct().  However, we ignore that copy, since
   * it was not made under the protection of RCU or cgroup_mutex, so
   * might no longer be a valid cgroup pointer.  cgroup_attach_task() might
   * have already changed current->cgroups, allowing the previously
   * referenced cgroup group to be removed and freed.
   *
   * At the point that cgroup_fork() is called, 'current' is the parent
   * task, and the passed argument 'child' points to the child task.
   */
  void cgroup_fork(struct task_struct *child)
  {
  	task_lock(current);
  	child->cgroups = current->cgroups;
  	get_css_set(child->cgroups);
  	task_unlock(current);
  	INIT_LIST_HEAD(&child->cg_list);
  }
  
  /**
   * cgroup_fork_callbacks - run fork callbacks
   * @child: the new task
   *
   * Called on a new task very soon before adding it to the
   * tasklist. No need to take any locks since no-one can
   * be operating on this task.
   */
  void cgroup_fork_callbacks(struct task_struct *child)
  {
  	if (need_forkexit_callback) {
  		int i;
  		for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
  			struct cgroup_subsys *ss = subsys[i];
  			if (ss->fork)
  				ss->fork(ss, child);
  		}
  	}
  }
  
  /**
   * cgroup_post_fork - called on a new task after adding it to the task list
   * @child: the task in question
   *
   * Adds the task to the list running through its css_set if necessary.
   * Has to be after the task is visible on the task list in case we race
   * with the first call to cgroup_iter_start() - to guarantee that the
   * new task ends up on its list.
   */
  void cgroup_post_fork(struct task_struct *child)
  {
  	if (use_task_css_set_links) {
  		write_lock(&css_set_lock);
  		task_lock(child);
  		if (list_empty(&child->cg_list))
  			list_add(&child->cg_list, &child->cgroups->tasks);
  		task_unlock(child);
  		write_unlock(&css_set_lock);
  	}
  }
  /**
   * cgroup_exit - detach cgroup from exiting task
   * @tsk: pointer to task_struct of exiting process
   * @run_callback: run exit callbacks?
   *
   * Description: Detach cgroup from @tsk and release it.
   *
   * Note that cgroups marked notify_on_release force every task in
   * them to take the global cgroup_mutex mutex when exiting.
   * This could impact scaling on very large systems.  Be reluctant to
   * use notify_on_release cgroups where very high task exit scaling
   * is required on large systems.
   *
   * the_top_cgroup_hack:
   *
 *    Set the exiting task's cgroup to the root cgroup (top_cgroup).
   *
   *    We call cgroup_exit() while the task is still competent to
   *    handle notify_on_release(), then leave the task attached to the
   *    root cgroup in each hierarchy for the remainder of its exit.
   *
   *    To do this properly, we would increment the reference count on
   *    top_cgroup, and near the very end of the kernel/exit.c do_exit()
   *    code we would add a second cgroup function call, to drop that
   *    reference.  This would just create an unnecessary hot spot on
   *    the top_cgroup reference count, to no avail.
   *
   *    Normally, holding a reference to a cgroup without bumping its
   *    count is unsafe.   The cgroup could go away, or someone could
   *    attach us to a different cgroup, decrementing the count on
   *    the first cgroup that we never incremented.  But in this case,
   *    top_cgroup isn't going away, and either task has PF_EXITING set,
   *    which wards off any cgroup_attach_task() attempts, or task is a failed
   *    fork, never visible to cgroup_attach_task.
   */
  void cgroup_exit(struct task_struct *tsk, int run_callbacks)
  {
  	int i;
  	struct css_set *cg;
  
  	if (run_callbacks && need_forkexit_callback) {
  		for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
  			struct cgroup_subsys *ss = subsys[i];
  			if (ss->exit)
  				ss->exit(ss, tsk);
  		}
  	}
  
  	/*
  	 * Unlink from the css_set task list if necessary.
  	 * Optimistically check cg_list before taking
  	 * css_set_lock
  	 */
  	if (!list_empty(&tsk->cg_list)) {
  		write_lock(&css_set_lock);
  		if (!list_empty(&tsk->cg_list))
  			list_del(&tsk->cg_list);
  		write_unlock(&css_set_lock);
  	}
  	/* Reassign the task to the init_css_set. */
  	task_lock(tsk);
  	cg = tsk->cgroups;
  	tsk->cgroups = &init_css_set;
  	task_unlock(tsk);
  	if (cg)
  		put_css_set_taskexit(cg);
  }
  
  /**
   * cgroup_clone - clone the cgroup the given subsystem is attached to
   * @tsk: the task to be moved
   * @subsys: the given subsystem
   * @nodename: the name for the new cgroup
   *
   * Duplicate the current cgroup in the hierarchy that the given
   * subsystem is attached to, and move this task into the new
   * child.
   */
  int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys,
  							char *nodename)
  {
  	struct dentry *dentry;
  	int ret = 0;
  	struct cgroup *parent, *child;
  	struct inode *inode;
  	struct css_set *cg;
  	struct cgroupfs_root *root;
  	struct cgroup_subsys *ss;
  
  	/* We shouldn't be called by an unregistered subsystem */
  	BUG_ON(!subsys->active);
  
  	/* First figure out what hierarchy and cgroup we're dealing
  	 * with, and pin them so we can drop cgroup_mutex */
  	mutex_lock(&cgroup_mutex);
   again:
  	root = subsys->root;
  	if (root == &rootnode) {
  		mutex_unlock(&cgroup_mutex);
  		return 0;
  	}

  	/* Pin the hierarchy */
  	if (!atomic_inc_not_zero(&root->sb->s_active)) {
  		/* We race with the final deactivate_super() */
  		mutex_unlock(&cgroup_mutex);
  		return 0;
  	}

  	/* Keep the cgroup alive */
  	task_lock(tsk);
  	parent = task_cgroup(tsk, subsys->subsys_id);
  	cg = tsk->cgroups;
  	get_css_set(cg);
  	task_unlock(tsk);

  	mutex_unlock(&cgroup_mutex);
  
  	/* Now do the VFS work to create a cgroup */
  	inode = parent->dentry->d_inode;
  
  	/* Hold the parent directory mutex across this operation to
  	 * stop anyone else deleting the new cgroup */
  	mutex_lock(&inode->i_mutex);
  	dentry = lookup_one_len(nodename, parent->dentry, strlen(nodename));
  	if (IS_ERR(dentry)) {
  		printk(KERN_INFO
  		       "cgroup: Couldn't allocate dentry for %s: %ld
  ", nodename,
  		       PTR_ERR(dentry));
  		ret = PTR_ERR(dentry);
  		goto out_release;
  	}
  
  	/* Create the cgroup directory, which also creates the cgroup */
  	ret = vfs_mkdir(inode, dentry, 0755);
  	child = __d_cgrp(dentry);
  	dput(dentry);
  	if (ret) {
  		printk(KERN_INFO
  		       "Failed to create cgroup %s: %d
  ", nodename,
  		       ret);
  		goto out_release;
  	}
  	/* The cgroup now exists. Retake cgroup_mutex and check
  	 * that we're still in the same state that we thought we
  	 * were. */
  	mutex_lock(&cgroup_mutex);
  	if ((root != subsys->root) ||
  	    (parent != task_cgroup(tsk, subsys->subsys_id))) {
  		/* Aargh, we raced ... */
  		mutex_unlock(&inode->i_mutex);
  		put_css_set(cg);

  		deactivate_super(root->sb);
  		/* The cgroup is still accessible in the VFS, but
  		 * we're not going to try to rmdir() it at this
  		 * point. */
  		printk(KERN_INFO
  		       "Race in cgroup_clone() - leaking cgroup %s
  ",
  		       nodename);
  		goto again;
  	}
  
  	/* do any required auto-setup */
  	for_each_subsys(root, ss) {
  		if (ss->post_clone)
  			ss->post_clone(ss, child);
  	}
  
  	/* All seems fine. Finish by moving the task into the new cgroup */
  	ret = cgroup_attach_task(child, tsk);
  	mutex_unlock(&cgroup_mutex);
  
   out_release:
  	mutex_unlock(&inode->i_mutex);
  
  	mutex_lock(&cgroup_mutex);
  	put_css_set(cg);
  	mutex_unlock(&cgroup_mutex);
  	deactivate_super(root->sb);
  	return ret;
  }
  /**
   * cgroup_is_descendant - see if @cgrp is a descendant of @task's cgrp
   * @cgrp: the cgroup in question
   * @task: the task in question
   *
   * See if @cgrp is a descendant of @task's cgroup in the appropriate
   * hierarchy.
   *
   * If we are sending in dummytop, then presumably we are creating
   * the top cgroup in the subsystem.
   *
   * Called only by the ns (nsproxy) cgroup.
   */
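/*
 * Example (illustrative paths): if @task lives in /a of the relevant
 * hierarchy, this returns 1 for @cgrp == /a or /a/b, and 0 for an
 * unrelated cgroup such as /c.
 */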
  int cgroup_is_descendant(const struct cgroup *cgrp, struct task_struct *task)
  {
  	int ret;
  	struct cgroup *target;
  	int subsys_id;
  	if (cgrp == dummytop)
  		return 1;
  	get_first_subsys(cgrp, NULL, &subsys_id);
  	target = task_cgroup(task, subsys_id);
  	while (cgrp != target && cgrp!= cgrp->top_cgroup)
  		cgrp = cgrp->parent;
  	ret = (cgrp == target);
  	return ret;
  }

  static void check_for_release(struct cgroup *cgrp)
  {
  	/* All of these checks rely on RCU to keep the cgroup
  	 * structure alive */
  	if (cgroup_is_releasable(cgrp) && !atomic_read(&cgrp->count)
  	    && list_empty(&cgrp->children) && !cgroup_has_css_refs(cgrp)) {
		/* Control Group is currently removable. If it's not
  		 * already queued for a userspace notification, queue
  		 * it now */
  		int need_schedule_work = 0;
  		spin_lock(&release_list_lock);
  		if (!cgroup_is_removed(cgrp) &&
  		    list_empty(&cgrp->release_list)) {
  			list_add(&cgrp->release_list, &release_list);
  			need_schedule_work = 1;
  		}
  		spin_unlock(&release_list_lock);
  		if (need_schedule_work)
  			schedule_work(&release_agent_work);
  	}
  }
  
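/*
 * Drop a reference taken with css_get()/css_tryget().  When the count
 * falls back to its base value, the cgroup is flagged releasable and
 * checked for release (if notify_on_release is set), and any task
 * sleeping in cgroup_rmdir() is woken.
 */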
  void __css_put(struct cgroup_subsys_state *css)
  {
  	struct cgroup *cgrp = css->cgroup;
  	rcu_read_lock();
  	if (atomic_dec_return(&css->refcnt) == 1) {
  		if (notify_on_release(cgrp)) {
  			set_bit(CGRP_RELEASABLE, &cgrp->flags);
  			check_for_release(cgrp);
  		}
  		cgroup_wakeup_rmdir_waiters(cgrp);
  	}
  	rcu_read_unlock();
  }
  
  /*
   * Notify userspace when a cgroup is released, by running the
   * configured release agent with the name of the cgroup (path
   * relative to the root of cgroup file system) as the argument.
   *
   * Most likely, this user command will try to rmdir this cgroup.
   *
   * This races with the possibility that some other task will be
   * attached to this cgroup before it is removed, or that some other
   * user task will 'mkdir' a child cgroup of this cgroup.  That's ok.
   * The presumed 'rmdir' will fail quietly if this cgroup is no longer
   * unused, and this cgroup will be reprieved from its death sentence,
   * to continue to serve a useful existence.  Next time it's released,
   * we will get notified again, if it still has 'notify_on_release' set.
   *
   * The final arg to call_usermodehelper() is UMH_WAIT_EXEC, which
   * means only wait until the task is successfully execve()'d.  The
   * separate release agent task is forked by call_usermodehelper(),
   * then control in this thread returns here, without waiting for the
   * release agent task.  We don't bother to wait because the caller of
   * this routine has no use for the exit status of the release agent
   * task, so no sense holding our caller up for that.
   */
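/*
 * Concretely, the helper is invoked roughly as (illustrative paths)
 *   argv[0] = cgrp->root->release_agent_path, e.g. "/sbin/my_release_agent"
 *   argv[1] = path of the released cgroup, e.g. "/mygroup"
 * with a minimal HOME/PATH environment.
 */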
  static void cgroup_release_agent(struct work_struct *work)
  {
  	BUG_ON(work != &release_agent_work);
  	mutex_lock(&cgroup_mutex);
  	spin_lock(&release_list_lock);
  	while (!list_empty(&release_list)) {
  		char *argv[3], *envp[3];
  		int i;
  		char *pathbuf = NULL, *agentbuf = NULL;
  		struct cgroup *cgrp = list_entry(release_list.next,
  						    struct cgroup,
  						    release_list);
  		list_del_init(&cgrp->release_list);
  		spin_unlock(&release_list_lock);
  		pathbuf = kmalloc(PAGE_SIZE, GFP_KERNEL);
  		if (!pathbuf)
  			goto continue_free;
  		if (cgroup_path(cgrp, pathbuf, PAGE_SIZE) < 0)
  			goto continue_free;
  		agentbuf = kstrdup(cgrp->root->release_agent_path, GFP_KERNEL);
  		if (!agentbuf)
  			goto continue_free;
  
  		i = 0;
  		argv[i++] = agentbuf;
  		argv[i++] = pathbuf;
  		argv[i] = NULL;
  
  		i = 0;
  		/* minimal command environment */
  		envp[i++] = "HOME=/";
  		envp[i++] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin";
  		envp[i] = NULL;
  
  		/* Drop the lock while we invoke the usermode helper,
  		 * since the exec could involve hitting disk and hence
  		 * be a slow process */
  		mutex_unlock(&cgroup_mutex);
  		call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC);
  		mutex_lock(&cgroup_mutex);
   continue_free:
  		kfree(pathbuf);
  		kfree(agentbuf);
  		spin_lock(&release_list_lock);
  	}
  	spin_unlock(&release_list_lock);
  	mutex_unlock(&cgroup_mutex);
  }
  
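/*
 * Handle the "cgroup_disable=" boot option.  For example (illustrative
 * subsystem names), booting with
 *   cgroup_disable=memory,cpuset
 * marks the listed subsystems as disabled.
 */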
  static int __init cgroup_disable(char *str)
  {
  	int i;
  	char *token;
  
  	while ((token = strsep(&str, ",")) != NULL) {
  		if (!*token)
  			continue;
  
  		for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
  			struct cgroup_subsys *ss = subsys[i];
  
  			if (!strcmp(token, ss->name)) {
  				ss->disabled = 1;
  				printk(KERN_INFO "Disabling %s control group"
  					" subsystem
  ", ss->name);
  				break;
  			}
  		}
  	}
  	return 1;
  }
  __setup("cgroup_disable=", cgroup_disable);
  
  /*
 * Functions for CSS ID.
   */
  
  /*
 * To get an ID other than 0, this should be called when !cgroup_is_removed().
   */
  unsigned short css_id(struct cgroup_subsys_state *css)
  {
  	struct css_id *cssid = rcu_dereference(css->id);
  
  	if (cssid)
  		return cssid->id;
  	return 0;
  }
  
  unsigned short css_depth(struct cgroup_subsys_state *css)
  {
  	struct css_id *cssid = rcu_dereference(css->id);
  
  	if (cssid)
  		return cssid->depth;
  	return 0;
  }
  
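/*
 * Each css_id records its ancestry in id->stack[]: stack[d] holds the id
 * of the ancestor at depth d, and stack[depth] is the css's own id (see
 * alloc_css_id() below).  For example (illustrative ids), a css at depth
 * 2 with ancestor ids 1 and 4 and own id 9 has stack = {1, 4, 9}, so the
 * ancestor test is a single lookup at the candidate root's depth.
 */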
  bool css_is_ancestor(struct cgroup_subsys_state *child,
  		    const struct cgroup_subsys_state *root)
  {
  	struct css_id *child_id = rcu_dereference(child->id);
  	struct css_id *root_id = rcu_dereference(root->id);
  
  	if (!child_id || !root_id || (child_id->depth < root_id->depth))
  		return false;
  	return child_id->stack[root_id->depth] == root_id->id;
  }
  
  static void __free_css_id_cb(struct rcu_head *head)
  {
  	struct css_id *id;
  
  	id = container_of(head, struct css_id, rcu_head);
  	kfree(id);
  }
  
  void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css)
  {
  	struct css_id *id = css->id;
  	/* When this is called before css_id initialization, id can be NULL */
  	if (!id)
  		return;
  
  	BUG_ON(!ss->use_id);
  
  	rcu_assign_pointer(id->css, NULL);
  	rcu_assign_pointer(css->id, NULL);
  	spin_lock(&ss->id_lock);
  	idr_remove(&ss->idr, id->id);
  	spin_unlock(&ss->id_lock);
  	call_rcu(&id->rcu_head, __free_css_id_cb);
  }
  
  /*
 * This is called by init or create(). Calls to this function are always
 * serialized (by cgroup_mutex at create()).
   */
  
  static struct css_id *get_new_cssid(struct cgroup_subsys *ss, int depth)
  {
  	struct css_id *newid;
  	int myid, error, size;
  
  	BUG_ON(!ss->use_id);
  
  	size = sizeof(*newid) + sizeof(unsigned short) * (depth + 1);
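	/* Room for (depth + 1) stack entries: ancestors at 0..depth-1, self at [depth]. */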
  	newid = kzalloc(size, GFP_KERNEL);
  	if (!newid)
  		return ERR_PTR(-ENOMEM);
  	/* get id */
  	if (unlikely(!idr_pre_get(&ss->idr, GFP_KERNEL))) {
  		error = -ENOMEM;
  		goto err_out;
  	}
  	spin_lock(&ss->id_lock);
	/* Don't use 0; allocate an ID in the range 1-65535 */
  	error = idr_get_new_above(&ss->idr, newid, 1, &myid);
  	spin_unlock(&ss->id_lock);
  
	/* Returns an error when there is no free space for a new ID. */
  	if (error) {
  		error = -ENOSPC;
  		goto err_out;
  	}
  	if (myid > CSS_ID_MAX)
  		goto remove_idr;
  
  	newid->id = myid;
  	newid->depth = depth;
  	return newid;
  remove_idr:
  	error = -ENOSPC;
  	spin_lock(&ss->id_lock);
  	idr_remove(&ss->idr, myid);
  	spin_unlock(&ss->id_lock);
  err_out:
  	kfree(newid);
  	return ERR_PTR(error);
  
  }
  
  static int __init cgroup_subsys_init_idr(struct cgroup_subsys *ss)
  {
  	struct css_id *newid;
  	struct cgroup_subsys_state *rootcss;
  
  	spin_lock_init(&ss->id_lock);
  	idr_init(&ss->idr);
  
  	rootcss = init_css_set.subsys[ss->subsys_id];
  	newid = get_new_cssid(ss, 0);
  	if (IS_ERR(newid))
  		return PTR_ERR(newid);
  
  	newid->stack[0] = newid->id;
  	newid->css = rootcss;
  	rootcss->id = newid;
  	return 0;
  }
  
  static int alloc_css_id(struct cgroup_subsys *ss, struct cgroup *parent,
  			struct cgroup *child)
  {
  	int subsys_id, i, depth = 0;
  	struct cgroup_subsys_state *parent_css, *child_css;
  	struct css_id *child_id, *parent_id = NULL;
  
  	subsys_id = ss->subsys_id;
  	parent_css = parent->subsys[subsys_id];
  	child_css = child->subsys[subsys_id];
  	depth = css_depth(parent_css) + 1;
  	parent_id = parent_css->id;
  
  	child_id = get_new_cssid(ss, depth);
  	if (IS_ERR(child_id))
  		return PTR_ERR(child_id);
  
  	for (i = 0; i < depth; i++)
  		child_id->stack[i] = parent_id->stack[i];
  	child_id->stack[depth] = child_id->id;
  	/*
  	 * child_id->css pointer will be set after this cgroup is available
  	 * see cgroup_populate_dir()
  	 */
  	rcu_assign_pointer(child_css->id, child_id);
  
  	return 0;
  }
  
  /**
   * css_lookup - lookup css by id
   * @ss: cgroup subsys to be looked into.
   * @id: the id
   *
   * Returns pointer to cgroup_subsys_state if there is valid one with id.
   * NULL if not. Should be called under rcu_read_lock()
   */
  struct cgroup_subsys_state *css_lookup(struct cgroup_subsys *ss, int id)
  {
  	struct css_id *cssid = NULL;
  
  	BUG_ON(!ss->use_id);
  	cssid = idr_find(&ss->idr, id);
  
  	if (unlikely(!cssid))
  		return NULL;
  
  	return rcu_dereference(cssid->css);
  }
  
  /**
   * css_get_next - lookup next cgroup under specified hierarchy.
   * @ss: pointer to subsystem
   * @id: current position of iteration.
   * @root: pointer to css. search tree under this.
   * @foundid: position of found object.
   *
   * Search next css under the specified hierarchy of rootid. Calling under
   * rcu_read_lock() is necessary. Returns NULL if it reaches the end.
   */
  struct cgroup_subsys_state *
  css_get_next(struct cgroup_subsys *ss, int id,
  	     struct cgroup_subsys_state *root, int *foundid)
  {
  	struct cgroup_subsys_state *ret = NULL;
  	struct css_id *tmp;
  	int tmpid;
  	int rootid = css_id(root);
  	int depth = css_depth(root);
  
  	if (!rootid)
  		return NULL;
  
  	BUG_ON(!ss->use_id);
  	/* fill start point for scan */
  	tmpid = id;
  	while (1) {
  		/*
  		 * scan next entry from bitmap(tree), tmpid is updated after
  		 * idr_get_next().
  		 */
  		spin_lock(&ss->id_lock);
  		tmp = idr_get_next(&ss->idr, &tmpid);
  		spin_unlock(&ss->id_lock);
  
  		if (!tmp)
  			break;
  		if (tmp->depth >= depth && tmp->stack[depth] == rootid) {
  			ret = rcu_dereference(tmp->css);
  			if (ret) {
  				*foundid = tmpid;
  				break;
  			}
  		}
  		/* continue to scan from next id */
  		tmpid = tmpid + 1;
  	}
  	return ret;
  }