Blame view

kernel/cgroup.c 93.7 KB
ddbcc7e8e   Paul Menage   Task Control Grou...
1
  /*
ddbcc7e8e   Paul Menage   Task Control Grou...
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
   *  Generic process-grouping system.
   *
   *  Based originally on the cpuset system, extracted by Paul Menage
   *  Copyright (C) 2006 Google, Inc
   *
   *  Copyright notices from the original cpuset code:
   *  --------------------------------------------------
   *  Copyright (C) 2003 BULL SA.
   *  Copyright (C) 2004-2006 Silicon Graphics, Inc.
   *
   *  Portions derived from Patrick Mochel's sysfs code.
   *  sysfs is Copyright (c) 2001-3 Patrick Mochel
   *
   *  2003-10-10 Written by Simon Derr.
   *  2003-10-22 Updates by Stephen Hemminger.
   *  2004 May-July Rework by Paul Jackson.
   *  ---------------------------------------------------
   *
   *  This file is subject to the terms and conditions of the GNU General Public
   *  License.  See the file COPYING in the main directory of the Linux
   *  distribution for more details.
   */
  
  #include <linux/cgroup.h>
  #include <linux/errno.h>
  #include <linux/fs.h>
  #include <linux/kernel.h>
  #include <linux/list.h>
  #include <linux/mm.h>
  #include <linux/mutex.h>
  #include <linux/mount.h>
  #include <linux/pagemap.h>
a424316ca   Paul Menage   Task Control Grou...
34
  #include <linux/proc_fs.h>
ddbcc7e8e   Paul Menage   Task Control Grou...
35
36
  #include <linux/rcupdate.h>
  #include <linux/sched.h>
817929ec2   Paul Menage   Task Control Grou...
37
  #include <linux/backing-dev.h>
ddbcc7e8e   Paul Menage   Task Control Grou...
38
39
40
41
42
  #include <linux/seq_file.h>
  #include <linux/slab.h>
  #include <linux/magic.h>
  #include <linux/spinlock.h>
  #include <linux/string.h>
bbcb81d09   Paul Menage   Task Control Grou...
43
  #include <linux/sort.h>
81a6a5cdd   Paul Menage   Task Control Grou...
44
  #include <linux/kmod.h>
846c7bb05   Balbir Singh   Add cgroupstats
45
46
  #include <linux/delayacct.h>
  #include <linux/cgroupstats.h>
472b1053f   Li Zefan   cgroups: use a ha...
47
  #include <linux/hash.h>
3f8206d49   Al Viro   [PATCH] get rid o...
48
  #include <linux/namei.h>
337eb00a2   Alessio Igor Bogani   Push BKL down int...
49
  #include <linux/smp_lock.h>
846c7bb05   Balbir Singh   Add cgroupstats
50

ddbcc7e8e   Paul Menage   Task Control Grou...
51
  #include <asm/atomic.h>
81a6a5cdd   Paul Menage   Task Control Grou...
52
  static DEFINE_MUTEX(cgroup_mutex);
ddbcc7e8e   Paul Menage   Task Control Grou...
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
  /*
   * Generate an array of cgroup subsystem pointers.
   *
   * SUBSYS(_x) expands to "&_x_subsys," (address of the object named
   * <_x>_subsys followed by a comma), so including <linux/cgroup_subsys.h>
   * inside the initializer below yields one array element per SUBSYS()
   * invocation in that header (presumably one per configured subsystem -
   * the header is not visible from here).
   */
  #define SUBSYS(_x) &_x ## _subsys,
  
  static struct cgroup_subsys *subsys[] = {
  #include <linux/cgroup_subsys.h>
  };
  
  /*
   * A cgroupfs_root represents the root of a cgroup hierarchy,
   * and may be associated with a superblock to form an active
   * hierarchy
   */
  struct cgroupfs_root {
  	struct super_block *sb;
  
  	/*
  	 * The bitmask of subsystems intended to be attached to this
  	 * hierarchy
  	 */
  	unsigned long subsys_bits;
  
  	/* The bitmask of subsystems currently attached to this hierarchy */
  	unsigned long actual_subsys_bits;
  
  	/* A list running through the attached subsystems */
  	struct list_head subsys_list;
  
  	/* The root cgroup for this hierarchy */
  	struct cgroup top_cgroup;
  
  	/* Tracks how many cgroups are currently defined in hierarchy.*/
  	int number_of_cgroups;
e5f6a8609   Li Zefan   cgroups: make roo...
85
  	/* A list running through the active hierarchies */
ddbcc7e8e   Paul Menage   Task Control Grou...
86
87
88
89
  	struct list_head root_list;
  
  	/* Hierarchy-specific flags */
  	unsigned long flags;
81a6a5cdd   Paul Menage   Task Control Grou...
90

e788e066c   Paul Menage   cgroup files: mov...
91
  	/* The path to use for release notifications. */
81a6a5cdd   Paul Menage   Task Control Grou...
92
  	char release_agent_path[PATH_MAX];
ddbcc7e8e   Paul Menage   Task Control Grou...
93
  };
ddbcc7e8e   Paul Menage   Task Control Grou...
94
95
96
97
98
99
  /*
   * The "rootnode" hierarchy is the "dummy hierarchy", reserved for the
   * subsystems that are otherwise unattached - it never has more than a
   * single cgroup, and all tasks are part of that cgroup.
   */
  static struct cgroupfs_root rootnode;
38460b48d   KAMEZAWA Hiroyuki   cgroup: CSS ID su...
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
  /*
   * CSS ID -- ID per subsys's Cgroup Subsys State(CSS). used only when
   * cgroup_subsys->use_id != 0.
   */
  #define CSS_ID_MAX	(65535)
  struct css_id {
  	/*
  	 * The css to which this ID points. This pointer is set to a valid
  	 * value after the cgroup is populated. If the cgroup is removed,
  	 * this will be NULL.
  	 * This pointer is expected to be RCU-safe because destroy()
  	 * is called after synchronize_rcu(). But for safe use,
  	 * css_is_removed() and css_tryget() should be used to avoid races.
  	 */
  	struct cgroup_subsys_state *css;
  	/*
  	 * ID of this css.
  	 */
  	unsigned short id;
  	/*
  	 * Depth in the hierarchy to which this ID belongs.
  	 */
  	unsigned short depth;
  	/*
  	 * ID is freed by RCU. (and lookup routine is RCU safe.)
  	 */
  	struct rcu_head rcu_head;
  	/*
  	 * Hierarchy the CSS ID belongs to: an array of length (depth+1).
  	 * Declared as a flexible array member, so it adds nothing to
  	 * sizeof(struct css_id); the storage must be allocated together
  	 * with the structure.
  	 */
  	unsigned short stack[];
  };
ddbcc7e8e   Paul Menage   Task Control Grou...
131
132
133
  /* The list of hierarchy roots */
  
  static LIST_HEAD(roots);
817929ec2   Paul Menage   Task Control Grou...
134
  static int root_count;
ddbcc7e8e   Paul Menage   Task Control Grou...
135
136
137
138
139
  
  /* dummytop is a shorthand for the dummy hierarchy's top cgroup */
  #define dummytop (&rootnode.top_cgroup)
  
  /* This flag indicates whether tasks in the fork and exit paths should
a043e3b2c   Li Zefan   cgroup: fix comments
140
141
142
   * check for fork/exit handlers to call. This avoids us having to do
   * extra work in the fork/exit path if none of the subsystems need to
   * be called.
ddbcc7e8e   Paul Menage   Task Control Grou...
143
   */
8947f9d5b   Li Zefan   cgroups: annotate...
144
  static int need_forkexit_callback __read_mostly;
ddbcc7e8e   Paul Menage   Task Control Grou...
145

ddbcc7e8e   Paul Menage   Task Control Grou...
146
  /* convenient tests for these bits */
bd89aabc6   Paul Menage   Control groups: R...
147
  inline int cgroup_is_removed(const struct cgroup *cgrp)
ddbcc7e8e   Paul Menage   Task Control Grou...
148
  {
bd89aabc6   Paul Menage   Control groups: R...
149
  	return test_bit(CGRP_REMOVED, &cgrp->flags);
ddbcc7e8e   Paul Menage   Task Control Grou...
150
151
152
153
154
155
  }
  
  /* bits in struct cgroupfs_root flags field */
  enum {
  	ROOT_NOPREFIX, /* mounted subsystems have no named prefix */
  };
e9685a03c   Adrian Bunk   kernel/cgroup.c: ...
156
  static int cgroup_is_releasable(const struct cgroup *cgrp)
81a6a5cdd   Paul Menage   Task Control Grou...
157
158
  {
  	const int bits =
bd89aabc6   Paul Menage   Control groups: R...
159
160
161
  		(1 << CGRP_RELEASABLE) |
  		(1 << CGRP_NOTIFY_ON_RELEASE);
  	return (cgrp->flags & bits) == bits;
81a6a5cdd   Paul Menage   Task Control Grou...
162
  }
e9685a03c   Adrian Bunk   kernel/cgroup.c: ...
163
  static int notify_on_release(const struct cgroup *cgrp)
81a6a5cdd   Paul Menage   Task Control Grou...
164
  {
bd89aabc6   Paul Menage   Control groups: R...
165
  	return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
81a6a5cdd   Paul Menage   Task Control Grou...
166
  }
ddbcc7e8e   Paul Menage   Task Control Grou...
167
168
169
170
171
172
  /*
   * for_each_subsys() allows you to iterate on each subsystem attached to
   * an active hierarchy.
   *
   * @_root: pointer to the cgroupfs_root whose subsys_list is walked
   * @_ss:   struct cgroup_subsys * cursor, linked through its sibling field
   *
   * @_root is parenthesized so that any pointer-valued expression can be
   * passed without operator-precedence surprises.
   */
  #define for_each_subsys(_root, _ss) \
  list_for_each_entry(_ss, &(_root)->subsys_list, sibling)
e5f6a8609   Li Zefan   cgroups: make roo...
173
174
  /* for_each_active_root() allows you to iterate across the active hierarchies */
  #define for_each_active_root(_root) \
ddbcc7e8e   Paul Menage   Task Control Grou...
175
  list_for_each_entry(_root, &roots, root_list)
81a6a5cdd   Paul Menage   Task Control Grou...
176
177
178
179
180
181
  /* the list of cgroups eligible for automatic release. Protected by
   * release_list_lock */
  static LIST_HEAD(release_list);
  static DEFINE_SPINLOCK(release_list_lock);
  static void cgroup_release_agent(struct work_struct *work);
  static DECLARE_WORK(release_agent_work, cgroup_release_agent);
bd89aabc6   Paul Menage   Control groups: R...
182
  static void check_for_release(struct cgroup *cgrp);
81a6a5cdd   Paul Menage   Task Control Grou...
183

817929ec2   Paul Menage   Task Control Grou...
184
185
186
187
188
189
  /* Link structure for associating css_set objects with cgroups */
  struct cg_cgroup_link {
  	/*
  	 * List running through cg_cgroup_links associated with a
  	 * cgroup, anchored on cgroup->css_sets
  	 */
bd89aabc6   Paul Menage   Control groups: R...
190
  	struct list_head cgrp_link_list;
817929ec2   Paul Menage   Task Control Grou...
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
  	/*
  	 * List running through cg_cgroup_links pointing at a
  	 * single css_set object, anchored on css_set->cg_links
  	 */
  	struct list_head cg_link_list;
  	struct css_set *cg;
  };
  
  /* The default css_set - used by init and its children prior to any
   * hierarchies being mounted. It contains a pointer to the root state
   * for each subsystem. Also used to anchor the list of css_sets. Not
   * reference-counted, to improve performance when child cgroups
   * haven't been created.
   */
  
  static struct css_set init_css_set;
  static struct cg_cgroup_link init_css_set_link;
38460b48d   KAMEZAWA Hiroyuki   cgroup: CSS ID su...
208
  static int cgroup_subsys_init_idr(struct cgroup_subsys *ss);
817929ec2   Paul Menage   Task Control Grou...
209
210
211
212
213
  /* css_set_lock protects the list of css_set objects, and the
   * chain of tasks off each css_set.  Nests outside task->alloc_lock
   * due to cgroup_iter_start() */
  static DEFINE_RWLOCK(css_set_lock);
  static int css_set_count;
472b1053f   Li Zefan   cgroups: use a ha...
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
  /* hash table for cgroup groups. This improves the performance to
   * find an existing css_set */
  #define CSS_SET_HASH_BITS	7
  #define CSS_SET_TABLE_SIZE	(1 << CSS_SET_HASH_BITS)
  static struct hlist_head css_set_table[CSS_SET_TABLE_SIZE];
  
  /*
   * css_set_hash - pick the css_set_table bucket for a set of subsystem states
   * @css: array of CGROUP_SUBSYS_COUNT subsystem state pointers
   *
   * The pointer values are summed, the sum shifted right by 16 is XORed
   * back into it, and the result is reduced to CSS_SET_HASH_BITS bits with
   * hash_long().  Returns the address of the selected bucket head.
   */
  static struct hlist_head *css_set_hash(struct cgroup_subsys_state *css[])
  {
  	unsigned long sum = 0UL;
  	int slot;
  	int n;
  
  	for (n = 0; n < CGROUP_SUBSYS_COUNT; n++)
  		sum += (unsigned long)css[n];
  	sum ^= sum >> 16;
  
  	slot = hash_long(sum, CSS_SET_HASH_BITS);
  	return css_set_table + slot;
  }
817929ec2   Paul Menage   Task Control Grou...
234
235
236
237
  /* We don't maintain the lists running through each css_set to its
   * task until after the first call to cgroup_iter_start(). This
   * reduces the fork()/exit() overhead for people who have cgroups
   * compiled into their kernel but not actually in use */
8947f9d5b   Li Zefan   cgroups: annotate...
238
  static int use_task_css_set_links __read_mostly;
817929ec2   Paul Menage   Task Control Grou...
239
240
241
242
243
244
245
  
  /* When we create or destroy a css_set, the operation simply
   * takes/releases a reference count on all the cgroups referenced
   * by subsystems in this css_set. This can end up multiple-counting
   * some cgroups, but that's OK - the ref-count is just a
   * busy/not-busy indicator; ensuring that we only count each cgroup
   * once would require taking a global lock to ensure that no
b4f48b636   Paul Menage   Task Control Grou...
246
247
248
249
250
251
252
   * subsystems moved between hierarchies while we were doing so.
   *
   * Possible TODO: decide at boot time based on the number of
   * registered subsystems and the number of CPUs or NUMA nodes whether
   * it's better for performance to ref-count every subsystem, or to
   * take a global lock and only add one ref count to each hierarchy.
   */
817929ec2   Paul Menage   Task Control Grou...
253
254
255
256
  
  /*
   * unlink a css_set from the list and free it
   */
81a6a5cdd   Paul Menage   Task Control Grou...
257
  static void unlink_css_set(struct css_set *cg)
b4f48b636   Paul Menage   Task Control Grou...
258
  {
71cbb949d   KOSAKI Motohiro   cgroup: list_for_...
259
260
  	struct cg_cgroup_link *link;
  	struct cg_cgroup_link *saved_link;
472b1053f   Li Zefan   cgroups: use a ha...
261
  	hlist_del(&cg->hlist);
817929ec2   Paul Menage   Task Control Grou...
262
  	css_set_count--;
71cbb949d   KOSAKI Motohiro   cgroup: list_for_...
263
264
265
  
  	list_for_each_entry_safe(link, saved_link, &cg->cg_links,
  				 cg_link_list) {
817929ec2   Paul Menage   Task Control Grou...
266
  		list_del(&link->cg_link_list);
bd89aabc6   Paul Menage   Control groups: R...
267
  		list_del(&link->cgrp_link_list);
817929ec2   Paul Menage   Task Control Grou...
268
269
  		kfree(link);
  	}
81a6a5cdd   Paul Menage   Task Control Grou...
270
  }
146aa1bd0   Lai Jiangshan   cgroups: fix prob...
271
  static void __put_css_set(struct css_set *cg, int taskexit)
81a6a5cdd   Paul Menage   Task Control Grou...
272
273
  {
  	int i;
146aa1bd0   Lai Jiangshan   cgroups: fix prob...
274
275
276
277
278
279
280
281
282
283
284
285
  	/*
  	 * Ensure that the refcount doesn't hit zero while any readers
  	 * can see it. Similar to atomic_dec_and_lock(), but for an
  	 * rwlock
  	 */
  	if (atomic_add_unless(&cg->refcount, -1, 1))
  		return;
  	write_lock(&css_set_lock);
  	if (!atomic_dec_and_test(&cg->refcount)) {
  		write_unlock(&css_set_lock);
  		return;
  	}
81a6a5cdd   Paul Menage   Task Control Grou...
286
  	unlink_css_set(cg);
146aa1bd0   Lai Jiangshan   cgroups: fix prob...
287
  	write_unlock(&css_set_lock);
81a6a5cdd   Paul Menage   Task Control Grou...
288
289
290
  
  	rcu_read_lock();
  	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
a47295e6b   Paul Menage   cgroups: make cgr...
291
  		struct cgroup *cgrp = rcu_dereference(cg->subsys[i]->cgroup);
bd89aabc6   Paul Menage   Control groups: R...
292
293
  		if (atomic_dec_and_test(&cgrp->count) &&
  		    notify_on_release(cgrp)) {
81a6a5cdd   Paul Menage   Task Control Grou...
294
  			if (taskexit)
bd89aabc6   Paul Menage   Control groups: R...
295
296
  				set_bit(CGRP_RELEASABLE, &cgrp->flags);
  			check_for_release(cgrp);
81a6a5cdd   Paul Menage   Task Control Grou...
297
298
299
  		}
  	}
  	rcu_read_unlock();
817929ec2   Paul Menage   Task Control Grou...
300
  	kfree(cg);
b4f48b636   Paul Menage   Task Control Grou...
301
  }
817929ec2   Paul Menage   Task Control Grou...
302
303
304
305
306
  /*
   * refcounted get/put for css_set objects
   */
  static inline void get_css_set(struct css_set *cg)
  {
146aa1bd0   Lai Jiangshan   cgroups: fix prob...
307
  	atomic_inc(&cg->refcount);
817929ec2   Paul Menage   Task Control Grou...
308
309
310
311
  }
  
  static inline void put_css_set(struct css_set *cg)
  {
146aa1bd0   Lai Jiangshan   cgroups: fix prob...
312
  	__put_css_set(cg, 0);
817929ec2   Paul Menage   Task Control Grou...
313
  }
81a6a5cdd   Paul Menage   Task Control Grou...
314
315
  static inline void put_css_set_taskexit(struct css_set *cg)
  {
146aa1bd0   Lai Jiangshan   cgroups: fix prob...
316
  	__put_css_set(cg, 1);
81a6a5cdd   Paul Menage   Task Control Grou...
317
  }
817929ec2   Paul Menage   Task Control Grou...
318
319
320
  /*
   * find_existing_css_set() is a helper for
   * find_css_set(), and checks to see whether an existing
472b1053f   Li Zefan   cgroups: use a ha...
321
   * css_set is suitable.
817929ec2   Paul Menage   Task Control Grou...
322
323
324
325
   *
   * oldcg: the cgroup group that we're using before the cgroup
   * transition
   *
bd89aabc6   Paul Menage   Control groups: R...
326
   * cgrp: the cgroup that we're moving into
817929ec2   Paul Menage   Task Control Grou...
327
328
329
330
   *
   * template: location in which to build the desired set of subsystem
   * state objects for the new cgroup group
   */
817929ec2   Paul Menage   Task Control Grou...
331
332
  static struct css_set *find_existing_css_set(
  	struct css_set *oldcg,
bd89aabc6   Paul Menage   Control groups: R...
333
  	struct cgroup *cgrp,
817929ec2   Paul Menage   Task Control Grou...
334
  	struct cgroup_subsys_state *template[])
b4f48b636   Paul Menage   Task Control Grou...
335
336
  {
  	int i;
bd89aabc6   Paul Menage   Control groups: R...
337
  	struct cgroupfs_root *root = cgrp->root;
472b1053f   Li Zefan   cgroups: use a ha...
338
339
340
  	struct hlist_head *hhead;
  	struct hlist_node *node;
  	struct css_set *cg;
817929ec2   Paul Menage   Task Control Grou...
341
342
343
344
  
  	/* Built the set of subsystem state objects that we want to
  	 * see in the new css_set */
  	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
8d53d55d2   Li Zefan   cgroup: fix subsy...
345
  		if (root->subsys_bits & (1UL << i)) {
817929ec2   Paul Menage   Task Control Grou...
346
347
348
  			/* Subsystem is in this hierarchy. So we want
  			 * the subsystem state from the new
  			 * cgroup */
bd89aabc6   Paul Menage   Control groups: R...
349
  			template[i] = cgrp->subsys[i];
817929ec2   Paul Menage   Task Control Grou...
350
351
352
353
354
355
  		} else {
  			/* Subsystem is not in this hierarchy, so we
  			 * don't want to change the subsystem state */
  			template[i] = oldcg->subsys[i];
  		}
  	}
472b1053f   Li Zefan   cgroups: use a ha...
356
357
  	hhead = css_set_hash(template);
  	hlist_for_each_entry(cg, node, hhead, hlist) {
817929ec2   Paul Menage   Task Control Grou...
358
359
360
361
  		if (!memcmp(template, cg->subsys, sizeof(cg->subsys))) {
  			/* All subsystems matched */
  			return cg;
  		}
472b1053f   Li Zefan   cgroups: use a ha...
362
  	}
817929ec2   Paul Menage   Task Control Grou...
363
364
365
366
  
  	/* No existing cgroup group matched */
  	return NULL;
  }
36553434f   Li Zefan   cgroup: remove du...
367
368
369
370
371
372
373
374
375
376
  static void free_cg_links(struct list_head *tmp)
  {
  	struct cg_cgroup_link *link;
  	struct cg_cgroup_link *saved_link;
  
  	list_for_each_entry_safe(link, saved_link, tmp, cgrp_link_list) {
  		list_del(&link->cgrp_link_list);
  		kfree(link);
  	}
  }
817929ec2   Paul Menage   Task Control Grou...
377
378
  /*
   * allocate_cg_links() allocates "count" cg_cgroup_link structures
bd89aabc6   Paul Menage   Control groups: R...
379
   * and chains them on tmp through their cgrp_link_list fields. Returns 0 on
817929ec2   Paul Menage   Task Control Grou...
380
381
   * success or a negative error
   */
817929ec2   Paul Menage   Task Control Grou...
382
383
384
385
386
387
388
389
  static int allocate_cg_links(int count, struct list_head *tmp)
  {
  	struct cg_cgroup_link *link;
  	int i;
  	INIT_LIST_HEAD(tmp);
  	for (i = 0; i < count; i++) {
  		link = kmalloc(sizeof(*link), GFP_KERNEL);
  		if (!link) {
36553434f   Li Zefan   cgroup: remove du...
390
  			free_cg_links(tmp);
817929ec2   Paul Menage   Task Control Grou...
391
392
  			return -ENOMEM;
  		}
bd89aabc6   Paul Menage   Control groups: R...
393
  		list_add(&link->cgrp_link_list, tmp);
817929ec2   Paul Menage   Task Control Grou...
394
395
396
  	}
  	return 0;
  }
c12f65d43   Li Zefan   cgroups: introduc...
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
  /**
   * link_css_set - a helper function to link a css_set to a cgroup
   * @tmp_cg_links: cg_cgroup_link objects allocated by allocate_cg_links()
   * @cg: the css_set to be linked
   * @cgrp: the destination cgroup
   */
  static void link_css_set(struct list_head *tmp_cg_links,
  			 struct css_set *cg, struct cgroup *cgrp)
  {
  	struct cg_cgroup_link *link;
  
  	BUG_ON(list_empty(tmp_cg_links));
  	link = list_first_entry(tmp_cg_links, struct cg_cgroup_link,
  				cgrp_link_list);
  	link->cg = cg;
  	list_move(&link->cgrp_link_list, &cgrp->css_sets);
  	list_add(&link->cg_link_list, &cg->cg_links);
  }
817929ec2   Paul Menage   Task Control Grou...
415
416
417
418
419
420
421
  /*
   * find_css_set() takes an existing cgroup group and a
   * cgroup object, and returns a css_set object that's
   * equivalent to the old group, but with the given cgroup
   * substituted into the appropriate hierarchy. Must be called with
   * cgroup_mutex held
   */
817929ec2   Paul Menage   Task Control Grou...
422
  static struct css_set *find_css_set(
bd89aabc6   Paul Menage   Control groups: R...
423
  	struct css_set *oldcg, struct cgroup *cgrp)
817929ec2   Paul Menage   Task Control Grou...
424
425
426
427
428
429
  {
  	struct css_set *res;
  	struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT];
  	int i;
  
  	struct list_head tmp_cg_links;
817929ec2   Paul Menage   Task Control Grou...
430

472b1053f   Li Zefan   cgroups: use a ha...
431
  	struct hlist_head *hhead;
817929ec2   Paul Menage   Task Control Grou...
432
433
  	/* First see if we already have a cgroup group that matches
  	 * the desired set */
7e9abd89c   Li Zefan   cgroup: use read ...
434
  	read_lock(&css_set_lock);
bd89aabc6   Paul Menage   Control groups: R...
435
  	res = find_existing_css_set(oldcg, cgrp, template);
817929ec2   Paul Menage   Task Control Grou...
436
437
  	if (res)
  		get_css_set(res);
7e9abd89c   Li Zefan   cgroup: use read ...
438
  	read_unlock(&css_set_lock);
817929ec2   Paul Menage   Task Control Grou...
439
440
441
442
443
444
445
446
447
448
449
450
451
  
  	if (res)
  		return res;
  
  	res = kmalloc(sizeof(*res), GFP_KERNEL);
  	if (!res)
  		return NULL;
  
  	/* Allocate all the cg_cgroup_link objects that we'll need */
  	if (allocate_cg_links(root_count, &tmp_cg_links) < 0) {
  		kfree(res);
  		return NULL;
  	}
146aa1bd0   Lai Jiangshan   cgroups: fix prob...
452
  	atomic_set(&res->refcount, 1);
817929ec2   Paul Menage   Task Control Grou...
453
454
  	INIT_LIST_HEAD(&res->cg_links);
  	INIT_LIST_HEAD(&res->tasks);
472b1053f   Li Zefan   cgroups: use a ha...
455
  	INIT_HLIST_NODE(&res->hlist);
817929ec2   Paul Menage   Task Control Grou...
456
457
458
459
460
461
462
463
  
  	/* Copy the set of subsystem state objects generated in
  	 * find_existing_css_set() */
  	memcpy(res->subsys, template, sizeof(res->subsys));
  
  	write_lock(&css_set_lock);
  	/* Add reference counts and links from the new css_set. */
  	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
bd89aabc6   Paul Menage   Control groups: R...
464
  		struct cgroup *cgrp = res->subsys[i]->cgroup;
817929ec2   Paul Menage   Task Control Grou...
465
  		struct cgroup_subsys *ss = subsys[i];
bd89aabc6   Paul Menage   Control groups: R...
466
  		atomic_inc(&cgrp->count);
817929ec2   Paul Menage   Task Control Grou...
467
468
469
470
471
  		/*
  		 * We want to add a link once per cgroup, so we
  		 * only do it for the first subsystem in each
  		 * hierarchy
  		 */
c12f65d43   Li Zefan   cgroups: introduc...
472
473
  		if (ss->root->subsys_list.next == &ss->sibling)
  			link_css_set(&tmp_cg_links, res, cgrp);
817929ec2   Paul Menage   Task Control Grou...
474
  	}
c12f65d43   Li Zefan   cgroups: introduc...
475
476
  	if (list_empty(&rootnode.subsys_list))
  		link_css_set(&tmp_cg_links, res, dummytop);
817929ec2   Paul Menage   Task Control Grou...
477
478
  
  	BUG_ON(!list_empty(&tmp_cg_links));
817929ec2   Paul Menage   Task Control Grou...
479
  	css_set_count++;
472b1053f   Li Zefan   cgroups: use a ha...
480
481
482
483
  
  	/* Add this cgroup group to the hash table */
  	hhead = css_set_hash(res->subsys);
  	hlist_add_head(&res->hlist, hhead);
817929ec2   Paul Menage   Task Control Grou...
484
485
486
  	write_unlock(&css_set_lock);
  
  	return res;
b4f48b636   Paul Menage   Task Control Grou...
487
  }
ddbcc7e8e   Paul Menage   Task Control Grou...
488
489
490
491
492
493
494
495
496
497
  /*
   * There is one global cgroup mutex. We also require taking
   * task_lock() when dereferencing a task's cgroup subsys pointers.
   * See "The task_lock() exception", at the end of this comment.
   *
   * A task must hold cgroup_mutex to modify cgroups.
   *
   * Any task can increment and decrement the count field without lock.
   * So in general, code holding cgroup_mutex can't rely on the count
   * field not changing.  However, if the count goes to zero, then only
956db3ca0   Cliff Wickman   hotplug cpu: move...
498
   * cgroup_attach_task() can increment it again.  Because a count of zero
ddbcc7e8e   Paul Menage   Task Control Grou...
499
500
501
502
503
504
505
506
   * means that no tasks are currently attached, therefore there is no
   * way a task attached to that cgroup can fork (the other way to
   * increment the count).  So code holding cgroup_mutex can safely
   * assume that if the count is zero, it will stay zero. Similarly, if
   * a task holds cgroup_mutex on a cgroup with zero count, it
   * knows that the cgroup won't be removed, as cgroup_rmdir()
   * needs that mutex.
   *
ddbcc7e8e   Paul Menage   Task Control Grou...
507
508
509
510
511
   * The fork and exit callbacks cgroup_fork() and cgroup_exit(), don't
   * (usually) take cgroup_mutex.  These are the two most performance
   * critical pieces of code here.  The exception occurs on cgroup_exit(),
   * when a task in a notify_on_release cgroup exits.  Then cgroup_mutex
   * is taken, and if the cgroup count is zero, a usermode call made
a043e3b2c   Li Zefan   cgroup: fix comments
512
513
   * to the release agent with the name of the cgroup (path relative to
   * the root of cgroup file system) as the argument.
ddbcc7e8e   Paul Menage   Task Control Grou...
514
515
516
517
518
519
520
521
522
523
524
   *
   * A cgroup can only be deleted if both its 'count' of using tasks
   * is zero, and its list of 'children' cgroups is empty.  Since all
   * tasks in the system use _some_ cgroup, and since there is always at
   * least one task in the system (init, pid == 1), therefore, top_cgroup
   * always has either children cgroups and/or using tasks.  So we don't
   * need a special hack to ensure that top_cgroup cannot be deleted.
   *
   *	The task_lock() exception
   *
   * The need for this exception arises from the action of
956db3ca0   Cliff Wickman   hotplug cpu: move...
525
   * cgroup_attach_task(), which overwrites one task's cgroup pointer with
a043e3b2c   Li Zefan   cgroup: fix comments
526
   * another.  It does so using cgroup_mutex, however there are
ddbcc7e8e   Paul Menage   Task Control Grou...
527
528
529
   * several performance critical places that need to reference
   * task->cgroup without the expense of grabbing a system global
   * mutex.  Therefore except as noted below, when dereferencing or, as
956db3ca0   Cliff Wickman   hotplug cpu: move...
530
   * in cgroup_attach_task(), modifying a task's cgroup pointer we use
ddbcc7e8e   Paul Menage   Task Control Grou...
531
532
533
534
   * task_lock(), which acts on a spinlock (task->alloc_lock) already in
   * the task_struct routinely used for such matters.
   *
   * P.S.  One more locking exception.  RCU is used to guard the
956db3ca0   Cliff Wickman   hotplug cpu: move...
535
   * update of a task's cgroup pointer by cgroup_attach_task()
ddbcc7e8e   Paul Menage   Task Control Grou...
536
   */
ddbcc7e8e   Paul Menage   Task Control Grou...
537
538
539
540
  /**
   * cgroup_lock - lock out any changes to cgroup structures
   *
   */
ddbcc7e8e   Paul Menage   Task Control Grou...
541
542
543
544
545
546
547
548
549
550
  void cgroup_lock(void)
  {
  	/* Acquire the single global cgroup_mutex; pair with cgroup_unlock(). */
  	mutex_lock(&cgroup_mutex);
  }
  
  /**
   * cgroup_unlock - release lock on cgroup changes
   *
   * Undo the lock taken in a previous cgroup_lock() call.
   */
ddbcc7e8e   Paul Menage   Task Control Grou...
551
552
553
554
555
556
557
558
559
560
561
562
563
564
  void cgroup_unlock(void)
  {
  	/* Release the global cgroup_mutex taken by cgroup_lock(). */
  	mutex_unlock(&cgroup_mutex);
  }
  
  /*
   * A couple of forward declarations required, due to cyclic reference loop:
   * cgroup_mkdir -> cgroup_create -> cgroup_populate_dir ->
   * cgroup_add_file -> cgroup_create_file -> cgroup_dir_inode_operations
   * -> cgroup_mkdir.
   */
  
  static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, int mode);
  static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry);
bd89aabc6   Paul Menage   Control groups: R...
565
  static int cgroup_populate_dir(struct cgroup *cgrp);
ddbcc7e8e   Paul Menage   Task Control Grou...
566
  static struct inode_operations cgroup_dir_inode_operations;
a424316ca   Paul Menage   Task Control Grou...
567
568
569
  static struct file_operations proc_cgroupstats_operations;
  
  static struct backing_dev_info cgroup_backing_dev_info = {
e4ad08fe6   Miklos Szeredi   mm: bdi: add sepa...
570
  	.capabilities	= BDI_CAP_NO_ACCT_AND_WRITEBACK,
a424316ca   Paul Menage   Task Control Grou...
571
  };
ddbcc7e8e   Paul Menage   Task Control Grou...
572

38460b48d   KAMEZAWA Hiroyuki   cgroup: CSS ID su...
573
574
  static int alloc_css_id(struct cgroup_subsys *ss,
  			struct cgroup *parent, struct cgroup *child);
ddbcc7e8e   Paul Menage   Task Control Grou...
575
576
577
  /*
   * cgroup_new_inode - allocate and initialise an inode on superblock @sb
   * @mode: value stored in the inode's i_mode
   * @sb: superblock handed to new_inode()
   *
   * Ownership is taken from the caller's fsuid/fsgid, all three timestamps
   * are set to CURRENT_TIME and the mapping is pointed at
   * cgroup_backing_dev_info.  Returns the inode, or NULL if new_inode()
   * returned NULL.
   */
  static struct inode *cgroup_new_inode(mode_t mode, struct super_block *sb)
  {
  	struct inode *inode;
  
  	inode = new_inode(sb);
  	if (!inode)
  		return NULL;
  
  	inode->i_mode = mode;
  	inode->i_uid = current_fsuid();
  	inode->i_gid = current_fsgid();
  	/* Single chained assignment so all three stamps get the same value */
  	inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
  	inode->i_mapping->backing_dev_info = &cgroup_backing_dev_info;
  
  	return inode;
  }
4fca88c87   KAMEZAWA Hiroyuki   memory cgroup enh...
588
589
590
591
  /*
   * Call subsys's pre_destroy handler.
   * This is called before css refcnt check.
   */
ec64f5154   KAMEZAWA Hiroyuki   cgroup: fix frequ...
592
  static int cgroup_call_pre_destroy(struct cgroup *cgrp)
4fca88c87   KAMEZAWA Hiroyuki   memory cgroup enh...
593
594
  {
  	struct cgroup_subsys *ss;
ec64f5154   KAMEZAWA Hiroyuki   cgroup: fix frequ...
595
  	int ret = 0;
4fca88c87   KAMEZAWA Hiroyuki   memory cgroup enh...
596
  	for_each_subsys(cgrp->root, ss)
ec64f5154   KAMEZAWA Hiroyuki   cgroup: fix frequ...
597
598
599
600
601
602
  		if (ss->pre_destroy) {
  			ret = ss->pre_destroy(ss, cgrp);
  			if (ret)
  				break;
  		}
  	return ret;
4fca88c87   KAMEZAWA Hiroyuki   memory cgroup enh...
603
  }
a47295e6b   Paul Menage   cgroups: make cgr...
604
605
606
607
608
609
  /*
   * RCU callback: @obj is the rcu_head embedded in a struct cgroup; recover
   * the enclosing cgroup and free it.
   */
  static void free_cgroup_rcu(struct rcu_head *obj)
  {
  	kfree(container_of(obj, struct cgroup, rcu_head));
  }
ddbcc7e8e   Paul Menage   Task Control Grou...
610
611
612
613
  static void cgroup_diput(struct dentry *dentry, struct inode *inode)
  {
  	/* is dentry a directory ? if so, kfree() associated cgroup */
  	if (S_ISDIR(inode->i_mode)) {
bd89aabc6   Paul Menage   Control groups: R...
614
  		struct cgroup *cgrp = dentry->d_fsdata;
8dc4f3e17   Paul Menage   cgroups: move cgr...
615
  		struct cgroup_subsys *ss;
bd89aabc6   Paul Menage   Control groups: R...
616
  		BUG_ON(!(cgroup_is_removed(cgrp)));
81a6a5cdd   Paul Menage   Task Control Grou...
617
618
619
620
621
622
623
  		/* It's possible for external users to be holding css
  		 * reference counts on a cgroup; css_put() needs to
  		 * be able to access the cgroup after decrementing
  		 * the reference count in order to know if it needs to
  		 * queue the cgroup to be handled by the release
  		 * agent */
  		synchronize_rcu();
8dc4f3e17   Paul Menage   cgroups: move cgr...
624
625
626
627
628
  
  		mutex_lock(&cgroup_mutex);
  		/*
  		 * Release the subsystem state objects.
  		 */
75139b827   Li Zefan   cgroups: remove s...
629
630
  		for_each_subsys(cgrp->root, ss)
  			ss->destroy(ss, cgrp);
8dc4f3e17   Paul Menage   cgroups: move cgr...
631
632
633
  
  		cgrp->root->number_of_cgroups--;
  		mutex_unlock(&cgroup_mutex);
a47295e6b   Paul Menage   cgroups: make cgr...
634
635
636
637
  		/*
  		 * Drop the active superblock reference that we took when we
  		 * created the cgroup
  		 */
8dc4f3e17   Paul Menage   cgroups: move cgr...
638
  		deactivate_super(cgrp->root->sb);
a47295e6b   Paul Menage   cgroups: make cgr...
639
  		call_rcu(&cgrp->rcu_head, free_cgroup_rcu);
ddbcc7e8e   Paul Menage   Task Control Grou...
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
  	}
  	iput(inode);
  }
  
  /*
   * d_delete() the directory dentry @d and simple_rmdir() it from its
   * parent.  A reference on the parent is held for the duration.
   */
  static void remove_dir(struct dentry *d)
  {
  	struct dentry *dir = dget(d->d_parent);

  	d_delete(d);
  	simple_rmdir(dir->d_inode, d);
  	dput(dir);
  }
  
  /*
   * Unlink every entry found on @dentry's d_subdirs list.  The caller must
   * hold the i_mutex of @dentry's inode.  dcache_lock is dropped around the
   * actual unlink of each child and re-taken before the list is examined
   * again, which is why the walk always restarts from the list head.
   */
  static void cgroup_clear_directory(struct dentry *dentry)
  {
  	BUG_ON(!mutex_is_locked(&dentry->d_inode->i_mutex));

  	spin_lock(&dcache_lock);
  	while (!list_empty(&dentry->d_subdirs)) {
  		struct list_head *first = dentry->d_subdirs.next;
  		struct dentry *child =
  			list_entry(first, struct dentry, d_u.d_child);

  		list_del_init(first);
  		if (!child->d_inode)
  			continue;

  		/*
  		 * Must never be called on a cgroup directory that still
  		 * has child cgroups.
  		 */
  		BUG_ON(child->d_inode->i_mode & S_IFDIR);
  		child = dget_locked(child);
  		spin_unlock(&dcache_lock);
  		d_delete(child);
  		simple_unlink(dentry->d_inode, child);
  		dput(child);
  		spin_lock(&dcache_lock);
  	}
  	spin_unlock(&dcache_lock);
  }
  
  /*
   * Empty the cgroup directory @dentry, take it off its parent's child list
   * and remove the directory itself.
   *
   * NOTE: the caller must already have dget()'ed @dentry.
   */
  static void cgroup_d_remove_dir(struct dentry *dentry)
  {
  	/* First get rid of everything inside the directory */
  	cgroup_clear_directory(dentry);

  	/* Detach from the parent's list under dcache_lock */
  	spin_lock(&dcache_lock);
  	list_del_init(&dentry->d_u.d_child);
  	spin_unlock(&dcache_lock);

  	remove_dir(dentry);
  }
ec64f5154   KAMEZAWA Hiroyuki   cgroup: fix frequ...
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
  /*
   * A queue for waiters doing rmdir() on a cgroup. A task will sleep when
   * cgroup->count == 0 && list_empty(&cgroup->children) && a subsystem still
   * holds a reference to css->refcnt. In general, this refcnt is expected to
   * go down to zero soon.
   *
   * The CGRP_WAIT_ON_RMDIR flag is modified under the cgroup's inode->i_mutex.
   */
  DECLARE_WAIT_QUEUE_HEAD(cgroup_rmdir_waitq);
  
  /*
   * Wake everything sleeping on cgroup_rmdir_waitq, but only when @cgrp has
   * its CGRP_WAIT_ON_RMDIR flag set.
   */
  static void cgroup_wakeup_rmdir_waiters(const struct cgroup *cgrp)
  {
  	/* Common case: nobody is waiting on this cgroup */
  	if (likely(!test_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags)))
  		return;

  	wake_up_all(&cgroup_rmdir_waitq);
  }
ddbcc7e8e   Paul Menage   Task Control Grou...
706
707
708
709
  static int rebind_subsystems(struct cgroupfs_root *root,
  			      unsigned long final_bits)
  {
  	unsigned long added_bits, removed_bits;
bd89aabc6   Paul Menage   Control groups: R...
710
  	struct cgroup *cgrp = &root->top_cgroup;
ddbcc7e8e   Paul Menage   Task Control Grou...
711
712
713
714
715
716
  	int i;
  
  	removed_bits = root->actual_subsys_bits & ~final_bits;
  	added_bits = final_bits & ~root->actual_subsys_bits;
  	/* Check that any added subsystems are currently free */
  	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
8d53d55d2   Li Zefan   cgroup: fix subsy...
717
  		unsigned long bit = 1UL << i;
ddbcc7e8e   Paul Menage   Task Control Grou...
718
719
720
721
722
723
724
725
726
727
728
729
730
  		struct cgroup_subsys *ss = subsys[i];
  		if (!(bit & added_bits))
  			continue;
  		if (ss->root != &rootnode) {
  			/* Subsystem isn't free */
  			return -EBUSY;
  		}
  	}
  
  	/* Currently we don't handle adding/removing subsystems when
  	 * any child cgroups exist. This is theoretically supportable
  	 * but involves complex error handling, so it's being left until
  	 * later */
307257cf4   Paul Menage   cgroups: fix a ra...
731
  	if (root->number_of_cgroups > 1)
ddbcc7e8e   Paul Menage   Task Control Grou...
732
733
734
735
736
737
738
739
  		return -EBUSY;
  
  	/* Process each subsystem */
  	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
  		struct cgroup_subsys *ss = subsys[i];
  		unsigned long bit = 1UL << i;
  		if (bit & added_bits) {
  			/* We're binding this subsystem to this hierarchy */
bd89aabc6   Paul Menage   Control groups: R...
740
  			BUG_ON(cgrp->subsys[i]);
ddbcc7e8e   Paul Menage   Task Control Grou...
741
742
  			BUG_ON(!dummytop->subsys[i]);
  			BUG_ON(dummytop->subsys[i]->cgroup != dummytop);
999cd8a45   Paul Menage   cgroups: add a pe...
743
  			mutex_lock(&ss->hierarchy_mutex);
bd89aabc6   Paul Menage   Control groups: R...
744
745
  			cgrp->subsys[i] = dummytop->subsys[i];
  			cgrp->subsys[i]->cgroup = cgrp;
33a68ac1c   Li Zefan   cgroups: add inac...
746
  			list_move(&ss->sibling, &root->subsys_list);
b2aa30f7b   Lai Jiangshan   cgroups: don't pu...
747
  			ss->root = root;
ddbcc7e8e   Paul Menage   Task Control Grou...
748
  			if (ss->bind)
bd89aabc6   Paul Menage   Control groups: R...
749
  				ss->bind(ss, cgrp);
999cd8a45   Paul Menage   cgroups: add a pe...
750
  			mutex_unlock(&ss->hierarchy_mutex);
ddbcc7e8e   Paul Menage   Task Control Grou...
751
752
  		} else if (bit & removed_bits) {
  			/* We're removing this subsystem */
bd89aabc6   Paul Menage   Control groups: R...
753
754
  			BUG_ON(cgrp->subsys[i] != dummytop->subsys[i]);
  			BUG_ON(cgrp->subsys[i]->cgroup != cgrp);
999cd8a45   Paul Menage   cgroups: add a pe...
755
  			mutex_lock(&ss->hierarchy_mutex);
ddbcc7e8e   Paul Menage   Task Control Grou...
756
757
758
  			if (ss->bind)
  				ss->bind(ss, dummytop);
  			dummytop->subsys[i]->cgroup = dummytop;
bd89aabc6   Paul Menage   Control groups: R...
759
  			cgrp->subsys[i] = NULL;
b2aa30f7b   Lai Jiangshan   cgroups: don't pu...
760
  			subsys[i]->root = &rootnode;
33a68ac1c   Li Zefan   cgroups: add inac...
761
  			list_move(&ss->sibling, &rootnode.subsys_list);
999cd8a45   Paul Menage   cgroups: add a pe...
762
  			mutex_unlock(&ss->hierarchy_mutex);
ddbcc7e8e   Paul Menage   Task Control Grou...
763
764
  		} else if (bit & final_bits) {
  			/* Subsystem state should already exist */
bd89aabc6   Paul Menage   Control groups: R...
765
  			BUG_ON(!cgrp->subsys[i]);
ddbcc7e8e   Paul Menage   Task Control Grou...
766
767
  		} else {
  			/* Subsystem state shouldn't exist */
bd89aabc6   Paul Menage   Control groups: R...
768
  			BUG_ON(cgrp->subsys[i]);
ddbcc7e8e   Paul Menage   Task Control Grou...
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
  		}
  	}
  	root->subsys_bits = root->actual_subsys_bits = final_bits;
  	synchronize_rcu();
  
  	return 0;
  }
  
  static int cgroup_show_options(struct seq_file *seq, struct vfsmount *vfs)
  {
  	struct cgroupfs_root *root = vfs->mnt_sb->s_fs_info;
  	struct cgroup_subsys *ss;
  
  	mutex_lock(&cgroup_mutex);
  	for_each_subsys(root, ss)
  		seq_printf(seq, ",%s", ss->name);
  	if (test_bit(ROOT_NOPREFIX, &root->flags))
  		seq_puts(seq, ",noprefix");
81a6a5cdd   Paul Menage   Task Control Grou...
787
788
  	if (strlen(root->release_agent_path))
  		seq_printf(seq, ",release_agent=%s", root->release_agent_path);
ddbcc7e8e   Paul Menage   Task Control Grou...
789
790
791
792
793
794
795
  	mutex_unlock(&cgroup_mutex);
  	return 0;
  }
  
  /* Mount options in the form produced by parse_cgroupfs_options(). */
  struct cgroup_sb_opts {
  	/* one bit per requested subsystem, indexed like subsys[] */
  	unsigned long subsys_bits;
  	/* option flag bits such as ROOT_NOPREFIX */
  	unsigned long flags;
  	/* kzalloc()ed copy of the release_agent= value, NULL if absent */
  	char *release_agent;
  };
  
  /* Convert a hierarchy specifier into a bitmask of subsystems and
   * flags. */
  static int parse_cgroupfs_options(char *data,
  				     struct cgroup_sb_opts *opts)
  {
  	char *token, *o = data ?: "all";
f9ab5b5b0   Li Zefan   cgroups: forbid n...
805
806
807
808
809
  	unsigned long mask = (unsigned long)-1;
  
  #ifdef CONFIG_CPUSETS
  	mask = ~(1UL << cpuset_subsys_id);
  #endif
ddbcc7e8e   Paul Menage   Task Control Grou...
810
811
812
  
  	opts->subsys_bits = 0;
  	opts->flags = 0;
81a6a5cdd   Paul Menage   Task Control Grou...
813
  	opts->release_agent = NULL;
ddbcc7e8e   Paul Menage   Task Control Grou...
814
815
816
817
818
  
  	while ((token = strsep(&o, ",")) != NULL) {
  		if (!*token)
  			return -EINVAL;
  		if (!strcmp(token, "all")) {
8bab8dded   Paul Menage   cgroups: add cgro...
819
820
821
822
823
824
825
826
  			/* Add all non-disabled subsystems */
  			int i;
  			opts->subsys_bits = 0;
  			for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
  				struct cgroup_subsys *ss = subsys[i];
  				if (!ss->disabled)
  					opts->subsys_bits |= 1ul << i;
  			}
ddbcc7e8e   Paul Menage   Task Control Grou...
827
828
  		} else if (!strcmp(token, "noprefix")) {
  			set_bit(ROOT_NOPREFIX, &opts->flags);
81a6a5cdd   Paul Menage   Task Control Grou...
829
830
831
832
833
834
835
836
837
  		} else if (!strncmp(token, "release_agent=", 14)) {
  			/* Specifying two release agents is forbidden */
  			if (opts->release_agent)
  				return -EINVAL;
  			opts->release_agent = kzalloc(PATH_MAX, GFP_KERNEL);
  			if (!opts->release_agent)
  				return -ENOMEM;
  			strncpy(opts->release_agent, token + 14, PATH_MAX - 1);
  			opts->release_agent[PATH_MAX - 1] = 0;
ddbcc7e8e   Paul Menage   Task Control Grou...
838
839
840
841
842
843
  		} else {
  			struct cgroup_subsys *ss;
  			int i;
  			for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
  				ss = subsys[i];
  				if (!strcmp(token, ss->name)) {
8bab8dded   Paul Menage   cgroups: add cgro...
844
845
  					if (!ss->disabled)
  						set_bit(i, &opts->subsys_bits);
ddbcc7e8e   Paul Menage   Task Control Grou...
846
847
848
849
850
851
852
  					break;
  				}
  			}
  			if (i == CGROUP_SUBSYS_COUNT)
  				return -ENOENT;
  		}
  	}
f9ab5b5b0   Li Zefan   cgroups: forbid n...
853
854
855
856
857
858
859
860
  	/*
  	 * Option noprefix was introduced just for backward compatibility
  	 * with the old cpuset, so we allow noprefix only if mounting just
  	 * the cpuset subsystem.
  	 */
  	if (test_bit(ROOT_NOPREFIX, &opts->flags) &&
  	    (opts->subsys_bits & mask))
  		return -EINVAL;
ddbcc7e8e   Paul Menage   Task Control Grou...
861
862
863
864
865
866
867
868
869
870
871
  	/* We can't have an empty hierarchy */
  	if (!opts->subsys_bits)
  		return -EINVAL;
  
  	return 0;
  }
  
  static int cgroup_remount(struct super_block *sb, int *flags, char *data)
  {
  	int ret = 0;
  	struct cgroupfs_root *root = sb->s_fs_info;
bd89aabc6   Paul Menage   Control groups: R...
872
  	struct cgroup *cgrp = &root->top_cgroup;
ddbcc7e8e   Paul Menage   Task Control Grou...
873
  	struct cgroup_sb_opts opts;
337eb00a2   Alessio Igor Bogani   Push BKL down int...
874
  	lock_kernel();
bd89aabc6   Paul Menage   Control groups: R...
875
  	mutex_lock(&cgrp->dentry->d_inode->i_mutex);
ddbcc7e8e   Paul Menage   Task Control Grou...
876
877
878
879
880
881
882
883
884
885
886
887
888
889
  	mutex_lock(&cgroup_mutex);
  
  	/* See what subsystems are wanted */
  	ret = parse_cgroupfs_options(data, &opts);
  	if (ret)
  		goto out_unlock;
  
  	/* Don't allow flags to change at remount */
  	if (opts.flags != root->flags) {
  		ret = -EINVAL;
  		goto out_unlock;
  	}
  
  	ret = rebind_subsystems(root, opts.subsys_bits);
0670e08bd   Li Zefan   cgroups: don't ch...
890
891
  	if (ret)
  		goto out_unlock;
ddbcc7e8e   Paul Menage   Task Control Grou...
892
893
  
  	/* (re)populate subsystem files */
0670e08bd   Li Zefan   cgroups: don't ch...
894
  	cgroup_populate_dir(cgrp);
ddbcc7e8e   Paul Menage   Task Control Grou...
895

81a6a5cdd   Paul Menage   Task Control Grou...
896
897
  	if (opts.release_agent)
  		strcpy(root->release_agent_path, opts.release_agent);
ddbcc7e8e   Paul Menage   Task Control Grou...
898
   out_unlock:
66bdc9cfc   Jesper Juhl   kernel/cgroup.c: ...
899
  	kfree(opts.release_agent);
ddbcc7e8e   Paul Menage   Task Control Grou...
900
  	mutex_unlock(&cgroup_mutex);
bd89aabc6   Paul Menage   Control groups: R...
901
  	mutex_unlock(&cgrp->dentry->d_inode->i_mutex);
337eb00a2   Alessio Igor Bogani   Push BKL down int...
902
  	unlock_kernel();
ddbcc7e8e   Paul Menage   Task Control Grou...
903
904
905
906
907
908
909
910
911
  	return ret;
  }
  
  /* Superblock operations; installed as sb->s_op by cgroup_set_super(). */
  static struct super_operations cgroup_ops = {
  	.statfs		= simple_statfs,
  	.drop_inode	= generic_delete_inode,
  	.show_options	= cgroup_show_options,
  	.remount_fs	= cgroup_remount,
  };
cc31edcee   Paul Menage   cgroups: convert ...
912
913
914
915
916
917
918
919
  /*
   * Initialise the list heads (sibling, children, css_sets, release_list)
   * and the pids_mutex rwsem embedded in @cgrp.
   */
  static void init_cgroup_housekeeping(struct cgroup *cgrp)
  {
  	init_rwsem(&cgrp->pids_mutex);

  	INIT_LIST_HEAD(&cgrp->sibling);
  	INIT_LIST_HEAD(&cgrp->children);
  	INIT_LIST_HEAD(&cgrp->css_sets);
  	INIT_LIST_HEAD(&cgrp->release_list);
  }
ddbcc7e8e   Paul Menage   Task Control Grou...
920
921
  static void init_cgroup_root(struct cgroupfs_root *root)
  {
bd89aabc6   Paul Menage   Control groups: R...
922
  	struct cgroup *cgrp = &root->top_cgroup;
ddbcc7e8e   Paul Menage   Task Control Grou...
923
924
925
  	INIT_LIST_HEAD(&root->subsys_list);
  	INIT_LIST_HEAD(&root->root_list);
  	root->number_of_cgroups = 1;
bd89aabc6   Paul Menage   Control groups: R...
926
927
  	cgrp->root = root;
  	cgrp->top_cgroup = cgrp;
cc31edcee   Paul Menage   cgroups: convert ...
928
  	init_cgroup_housekeeping(cgrp);
ddbcc7e8e   Paul Menage   Task Control Grou...
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
  }
  
  /*
   * sget() comparison callback: returns 1 when the existing superblock @sb
   * belongs to a hierarchy with the same subsystem bits and the same flags
   * as the candidate root passed in @data, 0 otherwise.
   */
  static int cgroup_test_super(struct super_block *sb, void *data)
  {
  	struct cgroupfs_root *existing = sb->s_fs_info;
  	struct cgroupfs_root *wanted = data;

  	/* Subsystems are compared first, then the flags */
  	return wanted->subsys_bits == existing->subsys_bits &&
  	       wanted->flags == existing->flags;
  }
  
  /*
   * sget() setup callback: initialise the new superblock @sb as an
   * anonymous superblock and tie it to the cgroupfs_root passed in @data.
   * Returns 0, or the error from set_anon_super().
   */
  static int cgroup_set_super(struct super_block *sb, void *data)
  {
  	struct cgroupfs_root *root = data;
  	int err = set_anon_super(sb, NULL);

  	if (err)
  		return err;

  	/* Cross-link superblock and hierarchy root */
  	root->sb = sb;
  	sb->s_fs_info = root;

  	sb->s_op = &cgroup_ops;
  	sb->s_magic = CGROUP_SUPER_MAGIC;
  	sb->s_blocksize = PAGE_CACHE_SIZE;
  	sb->s_blocksize_bits = PAGE_CACHE_SHIFT;

  	return 0;
  }
  
  static int cgroup_get_rootdir(struct super_block *sb)
  {
  	struct inode *inode =
  		cgroup_new_inode(S_IFDIR | S_IRUGO | S_IXUGO | S_IWUSR, sb);
  	struct dentry *dentry;
  
  	if (!inode)
  		return -ENOMEM;
ddbcc7e8e   Paul Menage   Task Control Grou...
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
  	inode->i_fop = &simple_dir_operations;
  	inode->i_op = &cgroup_dir_inode_operations;
  	/* directories start off with i_nlink == 2 (for "." entry) */
  	inc_nlink(inode);
  	dentry = d_alloc_root(inode);
  	if (!dentry) {
  		iput(inode);
  		return -ENOMEM;
  	}
  	sb->s_root = dentry;
  	return 0;
  }
  
  static int cgroup_get_sb(struct file_system_type *fs_type,
  			 int flags, const char *unused_dev_name,
  			 void *data, struct vfsmount *mnt)
  {
  	struct cgroup_sb_opts opts;
  	int ret = 0;
  	struct super_block *sb;
  	struct cgroupfs_root *root;
28fd5dfc1   Li Zefan   cgroups: remove t...
996
  	struct list_head tmp_cg_links;
ddbcc7e8e   Paul Menage   Task Control Grou...
997
998
999
  
  	/* First find the desired set of subsystems */
  	ret = parse_cgroupfs_options(data, &opts);
81a6a5cdd   Paul Menage   Task Control Grou...
1000
  	if (ret) {
66bdc9cfc   Jesper Juhl   kernel/cgroup.c: ...
1001
  		kfree(opts.release_agent);
ddbcc7e8e   Paul Menage   Task Control Grou...
1002
  		return ret;
81a6a5cdd   Paul Menage   Task Control Grou...
1003
  	}
ddbcc7e8e   Paul Menage   Task Control Grou...
1004
1005
  
  	root = kzalloc(sizeof(*root), GFP_KERNEL);
f77707384   Li Zefan   cgroup: fix memor...
1006
  	if (!root) {
66bdc9cfc   Jesper Juhl   kernel/cgroup.c: ...
1007
  		kfree(opts.release_agent);
ddbcc7e8e   Paul Menage   Task Control Grou...
1008
  		return -ENOMEM;
f77707384   Li Zefan   cgroup: fix memor...
1009
  	}
ddbcc7e8e   Paul Menage   Task Control Grou...
1010
1011
1012
1013
  
  	init_cgroup_root(root);
  	root->subsys_bits = opts.subsys_bits;
  	root->flags = opts.flags;
81a6a5cdd   Paul Menage   Task Control Grou...
1014
1015
1016
1017
  	if (opts.release_agent) {
  		strcpy(root->release_agent_path, opts.release_agent);
  		kfree(opts.release_agent);
  	}
ddbcc7e8e   Paul Menage   Task Control Grou...
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
  
  	sb = sget(fs_type, cgroup_test_super, cgroup_set_super, root);
  
  	if (IS_ERR(sb)) {
  		kfree(root);
  		return PTR_ERR(sb);
  	}
  
  	if (sb->s_fs_info != root) {
  		/* Reusing an existing superblock */
  		BUG_ON(sb->s_root == NULL);
  		kfree(root);
  		root = NULL;
  	} else {
  		/* New superblock */
c12f65d43   Li Zefan   cgroups: introduc...
1033
  		struct cgroup *root_cgrp = &root->top_cgroup;
817929ec2   Paul Menage   Task Control Grou...
1034
  		struct inode *inode;
28fd5dfc1   Li Zefan   cgroups: remove t...
1035
  		int i;
ddbcc7e8e   Paul Menage   Task Control Grou...
1036
1037
1038
1039
1040
1041
  
  		BUG_ON(sb->s_root != NULL);
  
  		ret = cgroup_get_rootdir(sb);
  		if (ret)
  			goto drop_new_super;
817929ec2   Paul Menage   Task Control Grou...
1042
  		inode = sb->s_root->d_inode;
ddbcc7e8e   Paul Menage   Task Control Grou...
1043

817929ec2   Paul Menage   Task Control Grou...
1044
  		mutex_lock(&inode->i_mutex);
ddbcc7e8e   Paul Menage   Task Control Grou...
1045
  		mutex_lock(&cgroup_mutex);
817929ec2   Paul Menage   Task Control Grou...
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
  		/*
  		 * We're accessing css_set_count without locking
  		 * css_set_lock here, but that's OK - it can only be
  		 * increased by someone holding cgroup_lock, and
  		 * that's us. The worst that can happen is that we
  		 * have some link structures left over
  		 */
  		ret = allocate_cg_links(css_set_count, &tmp_cg_links);
  		if (ret) {
  			mutex_unlock(&cgroup_mutex);
  			mutex_unlock(&inode->i_mutex);
  			goto drop_new_super;
  		}
ddbcc7e8e   Paul Menage   Task Control Grou...
1059
1060
1061
  		ret = rebind_subsystems(root, root->subsys_bits);
  		if (ret == -EBUSY) {
  			mutex_unlock(&cgroup_mutex);
817929ec2   Paul Menage   Task Control Grou...
1062
  			mutex_unlock(&inode->i_mutex);
20ca9b3f4   Li Zefan   cgroups: avoid ac...
1063
  			goto free_cg_links;
ddbcc7e8e   Paul Menage   Task Control Grou...
1064
1065
1066
1067
1068
1069
  		}
  
  		/* EBUSY should be the only error here */
  		BUG_ON(ret);
  
  		list_add(&root->root_list, &roots);
817929ec2   Paul Menage   Task Control Grou...
1070
  		root_count++;
ddbcc7e8e   Paul Menage   Task Control Grou...
1071

c12f65d43   Li Zefan   cgroups: introduc...
1072
  		sb->s_root->d_fsdata = root_cgrp;
ddbcc7e8e   Paul Menage   Task Control Grou...
1073
  		root->top_cgroup.dentry = sb->s_root;
817929ec2   Paul Menage   Task Control Grou...
1074
1075
1076
  		/* Link the top cgroup in this hierarchy into all
  		 * the css_set objects */
  		write_lock(&css_set_lock);
28fd5dfc1   Li Zefan   cgroups: remove t...
1077
1078
1079
  		for (i = 0; i < CSS_SET_TABLE_SIZE; i++) {
  			struct hlist_head *hhead = &css_set_table[i];
  			struct hlist_node *node;
817929ec2   Paul Menage   Task Control Grou...
1080
  			struct css_set *cg;
28fd5dfc1   Li Zefan   cgroups: remove t...
1081

c12f65d43   Li Zefan   cgroups: introduc...
1082
1083
  			hlist_for_each_entry(cg, node, hhead, hlist)
  				link_css_set(&tmp_cg_links, cg, root_cgrp);
28fd5dfc1   Li Zefan   cgroups: remove t...
1084
  		}
817929ec2   Paul Menage   Task Control Grou...
1085
1086
1087
  		write_unlock(&css_set_lock);
  
  		free_cg_links(&tmp_cg_links);
c12f65d43   Li Zefan   cgroups: introduc...
1088
1089
  		BUG_ON(!list_empty(&root_cgrp->sibling));
  		BUG_ON(!list_empty(&root_cgrp->children));
ddbcc7e8e   Paul Menage   Task Control Grou...
1090
  		BUG_ON(root->number_of_cgroups != 1);
c12f65d43   Li Zefan   cgroups: introduc...
1091
  		cgroup_populate_dir(root_cgrp);
817929ec2   Paul Menage   Task Control Grou...
1092
  		mutex_unlock(&inode->i_mutex);
ddbcc7e8e   Paul Menage   Task Control Grou...
1093
1094
  		mutex_unlock(&cgroup_mutex);
  	}
a3ec947c8   Sukadev Bhattiprolu   vfs: simple_set_m...
1095
1096
  	simple_set_mnt(mnt, sb);
  	return 0;
ddbcc7e8e   Paul Menage   Task Control Grou...
1097

20ca9b3f4   Li Zefan   cgroups: avoid ac...
1098
1099
   free_cg_links:
  	free_cg_links(&tmp_cg_links);
ddbcc7e8e   Paul Menage   Task Control Grou...
1100
   drop_new_super:
6f5bbff9a   Al Viro   Convert obvious p...
1101
  	deactivate_locked_super(sb);
ddbcc7e8e   Paul Menage   Task Control Grou...
1102
1103
1104
1105
1106
  	return ret;
  }
  
  static void cgroup_kill_sb(struct super_block *sb) {
  	struct cgroupfs_root *root = sb->s_fs_info;
bd89aabc6   Paul Menage   Control groups: R...
1107
  	struct cgroup *cgrp = &root->top_cgroup;
ddbcc7e8e   Paul Menage   Task Control Grou...
1108
  	int ret;
71cbb949d   KOSAKI Motohiro   cgroup: list_for_...
1109
1110
  	struct cg_cgroup_link *link;
  	struct cg_cgroup_link *saved_link;
ddbcc7e8e   Paul Menage   Task Control Grou...
1111
1112
1113
1114
  
  	BUG_ON(!root);
  
  	BUG_ON(root->number_of_cgroups != 1);
bd89aabc6   Paul Menage   Control groups: R...
1115
1116
  	BUG_ON(!list_empty(&cgrp->children));
  	BUG_ON(!list_empty(&cgrp->sibling));
ddbcc7e8e   Paul Menage   Task Control Grou...
1117
1118
1119
1120
1121
1122
1123
  
  	mutex_lock(&cgroup_mutex);
  
  	/* Rebind all subsystems back to the default hierarchy */
  	ret = rebind_subsystems(root, 0);
  	/* Shouldn't be able to fail ... */
  	BUG_ON(ret);
817929ec2   Paul Menage   Task Control Grou...
1124
1125
1126
1127
1128
  	/*
  	 * Release all the links from css_sets to this hierarchy's
  	 * root cgroup
  	 */
  	write_lock(&css_set_lock);
71cbb949d   KOSAKI Motohiro   cgroup: list_for_...
1129
1130
1131
  
  	list_for_each_entry_safe(link, saved_link, &cgrp->css_sets,
  				 cgrp_link_list) {
817929ec2   Paul Menage   Task Control Grou...
1132
  		list_del(&link->cg_link_list);
bd89aabc6   Paul Menage   Control groups: R...
1133
  		list_del(&link->cgrp_link_list);
817929ec2   Paul Menage   Task Control Grou...
1134
1135
1136
  		kfree(link);
  	}
  	write_unlock(&css_set_lock);
839ec5452   Paul Menage   cgroup: fix root_...
1137
1138
1139
1140
  	if (!list_empty(&root->root_list)) {
  		list_del(&root->root_list);
  		root_count--;
  	}
e5f6a8609   Li Zefan   cgroups: make roo...
1141

ddbcc7e8e   Paul Menage   Task Control Grou...
1142
  	mutex_unlock(&cgroup_mutex);
ddbcc7e8e   Paul Menage   Task Control Grou...
1143
  	kill_litter_super(sb);
67e055d14   Li Zefan   cgroups: fix poss...
1144
  	kfree(root);
ddbcc7e8e   Paul Menage   Task Control Grou...
1145
1146
1147
1148
1149
1150
1151
  }
  
  /* Filesystem type descriptor for the "cgroup" filesystem. */
  static struct file_system_type cgroup_fs_type = {
  	.name		= "cgroup",
  	.get_sb		= cgroup_get_sb,
  	.kill_sb	= cgroup_kill_sb,
  };
bd89aabc6   Paul Menage   Control groups: R...
1152
  static inline struct cgroup *__d_cgrp(struct dentry *dentry)
ddbcc7e8e   Paul Menage   Task Control Grou...
1153
1154
1155
1156
1157
1158
1159
1160
  {
  	return dentry->d_fsdata;
  }
  
  /* Interpret @dentry's d_fsdata as a struct cftype pointer. */
  static inline struct cftype *__d_cft(struct dentry *dentry)
  {
  	struct cftype *cft = dentry->d_fsdata;

  	return cft;
  }
a043e3b2c   Li Zefan   cgroup: fix comments
1161
1162
1163
1164
1165
1166
  /**
   * cgroup_path - generate the path of a cgroup
   * @cgrp: the cgroup in question
   * @buf: the buffer to write the path into
   * @buflen: the length of the buffer
   *
a47295e6b   Paul Menage   cgroups: make cgr...
1167
1168
1169
   * Called with cgroup_mutex held or else with an RCU-protected cgroup
   * reference.  Writes path of cgroup into buf.  Returns 0 on success,
   * -errno on error.
ddbcc7e8e   Paul Menage   Task Control Grou...
1170
   */
bd89aabc6   Paul Menage   Control groups: R...
1171
  int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
ddbcc7e8e   Paul Menage   Task Control Grou...
1172
1173
  {
  	char *start;
a47295e6b   Paul Menage   cgroups: make cgr...
1174
  	struct dentry *dentry = rcu_dereference(cgrp->dentry);
ddbcc7e8e   Paul Menage   Task Control Grou...
1175

a47295e6b   Paul Menage   cgroups: make cgr...
1176
  	if (!dentry || cgrp == dummytop) {
ddbcc7e8e   Paul Menage   Task Control Grou...
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
  		/*
  		 * Inactive subsystems have no dentry for their root
  		 * cgroup
  		 */
  		strcpy(buf, "/");
  		return 0;
  	}
  
  	start = buf + buflen;
  
  	*--start = '\0';
  	for (;;) {
a47295e6b   Paul Menage   cgroups: make cgr...
1189
  		int len = dentry->d_name.len;
ddbcc7e8e   Paul Menage   Task Control Grou...
1190
1191
  		if ((start -= len) < buf)
  			return -ENAMETOOLONG;
bd89aabc6   Paul Menage   Control groups: R...
1192
1193
1194
  		memcpy(start, cgrp->dentry->d_name.name, len);
  		cgrp = cgrp->parent;
  		if (!cgrp)
ddbcc7e8e   Paul Menage   Task Control Grou...
1195
  			break;
a47295e6b   Paul Menage   cgroups: make cgr...
1196
  		dentry = rcu_dereference(cgrp->dentry);
bd89aabc6   Paul Menage   Control groups: R...
1197
  		if (!cgrp->parent)
ddbcc7e8e   Paul Menage   Task Control Grou...
1198
1199
1200
1201
1202
1203
1204
1205
  			continue;
  		if (--start < buf)
  			return -ENAMETOOLONG;
  		*start = '/';
  	}
  	memmove(buf, start, buf + buflen - start);
  	return 0;
  }
bbcb81d09   Paul Menage   Task Control Grou...
1206
1207
1208
1209
  /*
   * Return the first subsystem attached to a cgroup's hierarchy, and
   * its subsystem id.
   */
bd89aabc6   Paul Menage   Control groups: R...
1210
  static void get_first_subsys(const struct cgroup *cgrp,
bbcb81d09   Paul Menage   Task Control Grou...
1211
1212
  			struct cgroup_subsys_state **css, int *subsys_id)
  {
bd89aabc6   Paul Menage   Control groups: R...
1213
  	const struct cgroupfs_root *root = cgrp->root;
bbcb81d09   Paul Menage   Task Control Grou...
1214
1215
1216
1217
1218
  	const struct cgroup_subsys *test_ss;
  	BUG_ON(list_empty(&root->subsys_list));
  	test_ss = list_entry(root->subsys_list.next,
  			     struct cgroup_subsys, sibling);
  	if (css) {
bd89aabc6   Paul Menage   Control groups: R...
1219
  		*css = cgrp->subsys[test_ss->subsys_id];
bbcb81d09   Paul Menage   Task Control Grou...
1220
1221
1222
1223
1224
  		BUG_ON(!*css);
  	}
  	if (subsys_id)
  		*subsys_id = test_ss->subsys_id;
  }
a043e3b2c   Li Zefan   cgroup: fix comments
1225
1226
1227
1228
  /**
   * cgroup_attach_task - attach task 'tsk' to cgroup 'cgrp'
   * @cgrp: the cgroup the task is attaching to
   * @tsk: the task to be attached
bbcb81d09   Paul Menage   Task Control Grou...
1229
   *
a043e3b2c   Li Zefan   cgroup: fix comments
1230
1231
   * Call holding cgroup_mutex. May take task_lock of
   * the task 'tsk' during call.
bbcb81d09   Paul Menage   Task Control Grou...
1232
   */
956db3ca0   Cliff Wickman   hotplug cpu: move...
1233
  int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
bbcb81d09   Paul Menage   Task Control Grou...
1234
1235
1236
  {
  	int retval = 0;
  	struct cgroup_subsys *ss;
bd89aabc6   Paul Menage   Control groups: R...
1237
  	struct cgroup *oldcgrp;
77efecd9e   Lai Jiangshan   cgroups: call fin...
1238
  	struct css_set *cg;
817929ec2   Paul Menage   Task Control Grou...
1239
  	struct css_set *newcg;
bd89aabc6   Paul Menage   Control groups: R...
1240
  	struct cgroupfs_root *root = cgrp->root;
bbcb81d09   Paul Menage   Task Control Grou...
1241
  	int subsys_id;
bd89aabc6   Paul Menage   Control groups: R...
1242
  	get_first_subsys(cgrp, NULL, &subsys_id);
bbcb81d09   Paul Menage   Task Control Grou...
1243
1244
  
  	/* Nothing to do if the task is already in that cgroup */
bd89aabc6   Paul Menage   Control groups: R...
1245
1246
  	oldcgrp = task_cgroup(tsk, subsys_id);
  	if (cgrp == oldcgrp)
bbcb81d09   Paul Menage   Task Control Grou...
1247
1248
1249
1250
  		return 0;
  
  	for_each_subsys(root, ss) {
  		if (ss->can_attach) {
bd89aabc6   Paul Menage   Control groups: R...
1251
  			retval = ss->can_attach(ss, cgrp, tsk);
e18f6318e   Paul Jackson   cgroup brace codi...
1252
  			if (retval)
bbcb81d09   Paul Menage   Task Control Grou...
1253
  				return retval;
bbcb81d09   Paul Menage   Task Control Grou...
1254
1255
  		}
  	}
77efecd9e   Lai Jiangshan   cgroups: call fin...
1256
1257
1258
1259
  	task_lock(tsk);
  	cg = tsk->cgroups;
  	get_css_set(cg);
  	task_unlock(tsk);
817929ec2   Paul Menage   Task Control Grou...
1260
1261
1262
1263
  	/*
  	 * Locate or allocate a new css_set for this task,
  	 * based on its final set of cgroups
  	 */
bd89aabc6   Paul Menage   Control groups: R...
1264
  	newcg = find_css_set(cg, cgrp);
77efecd9e   Lai Jiangshan   cgroups: call fin...
1265
  	put_css_set(cg);
e18f6318e   Paul Jackson   cgroup brace codi...
1266
  	if (!newcg)
817929ec2   Paul Menage   Task Control Grou...
1267
  		return -ENOMEM;
817929ec2   Paul Menage   Task Control Grou...
1268

bbcb81d09   Paul Menage   Task Control Grou...
1269
1270
1271
  	task_lock(tsk);
  	if (tsk->flags & PF_EXITING) {
  		task_unlock(tsk);
817929ec2   Paul Menage   Task Control Grou...
1272
  		put_css_set(newcg);
bbcb81d09   Paul Menage   Task Control Grou...
1273
1274
  		return -ESRCH;
  	}
817929ec2   Paul Menage   Task Control Grou...
1275
  	rcu_assign_pointer(tsk->cgroups, newcg);
bbcb81d09   Paul Menage   Task Control Grou...
1276
  	task_unlock(tsk);
817929ec2   Paul Menage   Task Control Grou...
1277
1278
1279
1280
1281
1282
1283
  	/* Update the css_set linked lists if we're using them */
  	write_lock(&css_set_lock);
  	if (!list_empty(&tsk->cg_list)) {
  		list_del(&tsk->cg_list);
  		list_add(&tsk->cg_list, &newcg->tasks);
  	}
  	write_unlock(&css_set_lock);
bbcb81d09   Paul Menage   Task Control Grou...
1284
  	for_each_subsys(root, ss) {
e18f6318e   Paul Jackson   cgroup brace codi...
1285
  		if (ss->attach)
bd89aabc6   Paul Menage   Control groups: R...
1286
  			ss->attach(ss, cgrp, oldcgrp, tsk);
bbcb81d09   Paul Menage   Task Control Grou...
1287
  	}
bd89aabc6   Paul Menage   Control groups: R...
1288
  	set_bit(CGRP_RELEASABLE, &oldcgrp->flags);
bbcb81d09   Paul Menage   Task Control Grou...
1289
  	synchronize_rcu();
817929ec2   Paul Menage   Task Control Grou...
1290
  	put_css_set(cg);
ec64f5154   KAMEZAWA Hiroyuki   cgroup: fix frequ...
1291
1292
1293
1294
1295
1296
  
  	/*
  	 * wake up rmdir() waiter. the rmdir should fail since the cgroup
  	 * is no longer empty.
  	 */
  	cgroup_wakeup_rmdir_waiters(cgrp);
bbcb81d09   Paul Menage   Task Control Grou...
1297
1298
1299
1300
  	return 0;
  }
  
  /*
af351026a   Paul Menage   cgroup files: tur...
1301
1302
   * Attach task with pid 'pid' to cgroup 'cgrp'. Call with cgroup_mutex
   * held. May take task_lock of task
bbcb81d09   Paul Menage   Task Control Grou...
1303
   */
af351026a   Paul Menage   cgroup files: tur...
1304
  static int attach_task_by_pid(struct cgroup *cgrp, u64 pid)
bbcb81d09   Paul Menage   Task Control Grou...
1305
  {
bbcb81d09   Paul Menage   Task Control Grou...
1306
  	struct task_struct *tsk;
c69e8d9c0   David Howells   CRED: Use RCU to ...
1307
  	const struct cred *cred = current_cred(), *tcred;
bbcb81d09   Paul Menage   Task Control Grou...
1308
  	int ret;
bbcb81d09   Paul Menage   Task Control Grou...
1309
1310
  	if (pid) {
  		rcu_read_lock();
73507f335   Pavel Emelyanov   Handle pid namesp...
1311
  		tsk = find_task_by_vpid(pid);
bbcb81d09   Paul Menage   Task Control Grou...
1312
1313
1314
1315
  		if (!tsk || tsk->flags & PF_EXITING) {
  			rcu_read_unlock();
  			return -ESRCH;
  		}
bbcb81d09   Paul Menage   Task Control Grou...
1316

c69e8d9c0   David Howells   CRED: Use RCU to ...
1317
1318
1319
1320
1321
  		tcred = __task_cred(tsk);
  		if (cred->euid &&
  		    cred->euid != tcred->uid &&
  		    cred->euid != tcred->suid) {
  			rcu_read_unlock();
bbcb81d09   Paul Menage   Task Control Grou...
1322
1323
  			return -EACCES;
  		}
c69e8d9c0   David Howells   CRED: Use RCU to ...
1324
1325
  		get_task_struct(tsk);
  		rcu_read_unlock();
bbcb81d09   Paul Menage   Task Control Grou...
1326
1327
1328
1329
  	} else {
  		tsk = current;
  		get_task_struct(tsk);
  	}
956db3ca0   Cliff Wickman   hotplug cpu: move...
1330
  	ret = cgroup_attach_task(cgrp, tsk);
bbcb81d09   Paul Menage   Task Control Grou...
1331
1332
1333
  	put_task_struct(tsk);
  	return ret;
  }
af351026a   Paul Menage   cgroup files: tur...
1334
1335
1336
1337
1338
1339
1340
1341
1342
  /*
   * write_u64-style handler that moves a task into @cgrp.
   *
   * @cgrp: destination cgroup
   * @cft:  control file being written (not used here)
   * @pid:  pid of the task to attach; attach_task_by_pid() treats 0 as
   *        "the calling task"
   *
   * Takes cgroup_mutex through cgroup_lock_live_group() for the duration
   * of the attach.  Returns -ENODEV if the cgroup has already been
   * removed, otherwise whatever attach_task_by_pid() returns.
   */
  static int cgroup_tasks_write(struct cgroup *cgrp, struct cftype *cft, u64 pid)
  {
  	int err = -ENODEV;

  	if (cgroup_lock_live_group(cgrp)) {
  		err = attach_task_by_pid(cgrp, pid);
  		cgroup_unlock();
  	}
  	return err;
  }
ddbcc7e8e   Paul Menage   Task Control Grou...
1343
  /* The various types of files and directories in a cgroup file system */
ddbcc7e8e   Paul Menage   Task Control Grou...
1344
1345
1346
1347
  enum cgroup_filetype {
  	FILE_ROOT,
  	FILE_DIR,
  	FILE_TASKLIST,
81a6a5cdd   Paul Menage   Task Control Grou...
1348
  	FILE_NOTIFY_ON_RELEASE,
81a6a5cdd   Paul Menage   Task Control Grou...
1349
  	FILE_RELEASE_AGENT,
ddbcc7e8e   Paul Menage   Task Control Grou...
1350
  };
e788e066c   Paul Menage   cgroup files: mov...
1351
1352
1353
1354
  /**
   * cgroup_lock_live_group - take cgroup_mutex and check that cgrp is alive.
   * @cgrp: the cgroup to be checked for liveness
   *
84eea8428   Paul Menage   cgroups: misc cle...
1355
1356
   * On success, returns true; the lock should be later released with
   * cgroup_unlock(). On failure returns false with no lock held.
e788e066c   Paul Menage   cgroup files: mov...
1357
   */
84eea8428   Paul Menage   cgroups: misc cle...
1358
  bool cgroup_lock_live_group(struct cgroup *cgrp)
e788e066c   Paul Menage   cgroup files: mov...
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
  {
  	mutex_lock(&cgroup_mutex);
  	if (cgroup_is_removed(cgrp)) {
  		mutex_unlock(&cgroup_mutex);
  		return false;
  	}
  	return true;
  }
  
  static int cgroup_release_agent_write(struct cgroup *cgrp, struct cftype *cft,
  				      const char *buffer)
  {
  	BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX);
  	if (!cgroup_lock_live_group(cgrp))
  		return -ENODEV;
  	strcpy(cgrp->root->release_agent_path, buffer);
84eea8428   Paul Menage   cgroups: misc cle...
1375
  	cgroup_unlock();
e788e066c   Paul Menage   cgroup files: mov...
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
  	return 0;
  }
  
  static int cgroup_release_agent_show(struct cgroup *cgrp, struct cftype *cft,
  				     struct seq_file *seq)
  {
  	if (!cgroup_lock_live_group(cgrp))
  		return -ENODEV;
  	seq_puts(seq, cgrp->root->release_agent_path);
  	seq_putc(seq, '
  ');
84eea8428   Paul Menage   cgroups: misc cle...
1387
  	cgroup_unlock();
e788e066c   Paul Menage   cgroup files: mov...
1388
1389
  	return 0;
  }
84eea8428   Paul Menage   cgroups: misc cle...
1390
1391
  /* A buffer size big enough for numbers or short strings */
  #define CGROUP_LOCAL_BUFFER_SIZE 64
e73d2c61d   Paul Menage   CGroups _s64 file...
1392
  static ssize_t cgroup_write_X64(struct cgroup *cgrp, struct cftype *cft,
f4c753b7e   Paul Menage   CGroup API files:...
1393
1394
1395
  				struct file *file,
  				const char __user *userbuf,
  				size_t nbytes, loff_t *unused_ppos)
355e0c48b   Paul Menage   Add cgroup write_...
1396
  {
84eea8428   Paul Menage   cgroups: misc cle...
1397
  	char buffer[CGROUP_LOCAL_BUFFER_SIZE];
355e0c48b   Paul Menage   Add cgroup write_...
1398
  	int retval = 0;
355e0c48b   Paul Menage   Add cgroup write_...
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
  	char *end;
  
  	if (!nbytes)
  		return -EINVAL;
  	if (nbytes >= sizeof(buffer))
  		return -E2BIG;
  	if (copy_from_user(buffer, userbuf, nbytes))
  		return -EFAULT;
  
  	buffer[nbytes] = 0;     /* nul-terminate */
b7269dfc8   Paul Menage   CGroup API files:...
1409
  	strstrip(buffer);
e73d2c61d   Paul Menage   CGroups _s64 file...
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
  	if (cft->write_u64) {
  		u64 val = simple_strtoull(buffer, &end, 0);
  		if (*end)
  			return -EINVAL;
  		retval = cft->write_u64(cgrp, cft, val);
  	} else {
  		s64 val = simple_strtoll(buffer, &end, 0);
  		if (*end)
  			return -EINVAL;
  		retval = cft->write_s64(cgrp, cft, val);
  	}
355e0c48b   Paul Menage   Add cgroup write_...
1421
1422
1423
1424
  	if (!retval)
  		retval = nbytes;
  	return retval;
  }
db3b14978   Paul Menage   cgroup files: add...
1425
1426
1427
1428
1429
  static ssize_t cgroup_write_string(struct cgroup *cgrp, struct cftype *cft,
  				   struct file *file,
  				   const char __user *userbuf,
  				   size_t nbytes, loff_t *unused_ppos)
  {
84eea8428   Paul Menage   cgroups: misc cle...
1430
  	char local_buffer[CGROUP_LOCAL_BUFFER_SIZE];
db3b14978   Paul Menage   cgroup files: add...
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
  	int retval = 0;
  	size_t max_bytes = cft->max_write_len;
  	char *buffer = local_buffer;
  
  	if (!max_bytes)
  		max_bytes = sizeof(local_buffer) - 1;
  	if (nbytes >= max_bytes)
  		return -E2BIG;
  	/* Allocate a dynamic buffer if we need one */
  	if (nbytes >= sizeof(local_buffer)) {
  		buffer = kmalloc(nbytes + 1, GFP_KERNEL);
  		if (buffer == NULL)
  			return -ENOMEM;
  	}
5a3eb9f6b   Li Zefan   cgroup: fix possi...
1445
1446
1447
1448
  	if (nbytes && copy_from_user(buffer, userbuf, nbytes)) {
  		retval = -EFAULT;
  		goto out;
  	}
db3b14978   Paul Menage   cgroup files: add...
1449
1450
1451
1452
1453
1454
  
  	buffer[nbytes] = 0;     /* nul-terminate */
  	strstrip(buffer);
  	retval = cft->write_string(cgrp, cft, buffer);
  	if (!retval)
  		retval = nbytes;
5a3eb9f6b   Li Zefan   cgroup: fix possi...
1455
  out:
db3b14978   Paul Menage   cgroup files: add...
1456
1457
1458
1459
  	if (buffer != local_buffer)
  		kfree(buffer);
  	return retval;
  }
ddbcc7e8e   Paul Menage   Task Control Grou...
1460
1461
1462
1463
  static ssize_t cgroup_file_write(struct file *file, const char __user *buf,
  						size_t nbytes, loff_t *ppos)
  {
  	struct cftype *cft = __d_cft(file->f_dentry);
bd89aabc6   Paul Menage   Control groups: R...
1464
  	struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);
ddbcc7e8e   Paul Menage   Task Control Grou...
1465

75139b827   Li Zefan   cgroups: remove s...
1466
  	if (cgroup_is_removed(cgrp))
ddbcc7e8e   Paul Menage   Task Control Grou...
1467
  		return -ENODEV;
355e0c48b   Paul Menage   Add cgroup write_...
1468
  	if (cft->write)
bd89aabc6   Paul Menage   Control groups: R...
1469
  		return cft->write(cgrp, cft, file, buf, nbytes, ppos);
e73d2c61d   Paul Menage   CGroups _s64 file...
1470
1471
  	if (cft->write_u64 || cft->write_s64)
  		return cgroup_write_X64(cgrp, cft, file, buf, nbytes, ppos);
db3b14978   Paul Menage   cgroup files: add...
1472
1473
  	if (cft->write_string)
  		return cgroup_write_string(cgrp, cft, file, buf, nbytes, ppos);
d447ea2f3   Pavel Emelyanov   cgroups: add the ...
1474
1475
1476
1477
  	if (cft->trigger) {
  		int ret = cft->trigger(cgrp, (unsigned int)cft->private);
  		return ret ? ret : nbytes;
  	}
355e0c48b   Paul Menage   Add cgroup write_...
1478
  	return -EINVAL;
ddbcc7e8e   Paul Menage   Task Control Grou...
1479
  }
f4c753b7e   Paul Menage   CGroup API files:...
1480
1481
1482
1483
  static ssize_t cgroup_read_u64(struct cgroup *cgrp, struct cftype *cft,
  			       struct file *file,
  			       char __user *buf, size_t nbytes,
  			       loff_t *ppos)
ddbcc7e8e   Paul Menage   Task Control Grou...
1484
  {
84eea8428   Paul Menage   cgroups: misc cle...
1485
  	char tmp[CGROUP_LOCAL_BUFFER_SIZE];
f4c753b7e   Paul Menage   CGroup API files:...
1486
  	u64 val = cft->read_u64(cgrp, cft);
ddbcc7e8e   Paul Menage   Task Control Grou...
1487
1488
1489
1490
1491
  	int len = sprintf(tmp, "%llu
  ", (unsigned long long) val);
  
  	return simple_read_from_buffer(buf, nbytes, ppos, tmp, len);
  }
e73d2c61d   Paul Menage   CGroups _s64 file...
1492
1493
1494
1495
1496
  static ssize_t cgroup_read_s64(struct cgroup *cgrp, struct cftype *cft,
  			       struct file *file,
  			       char __user *buf, size_t nbytes,
  			       loff_t *ppos)
  {
84eea8428   Paul Menage   cgroups: misc cle...
1497
  	char tmp[CGROUP_LOCAL_BUFFER_SIZE];
e73d2c61d   Paul Menage   CGroups _s64 file...
1498
1499
1500
1501
1502
1503
  	s64 val = cft->read_s64(cgrp, cft);
  	int len = sprintf(tmp, "%lld
  ", (long long) val);
  
  	return simple_read_from_buffer(buf, nbytes, ppos, tmp, len);
  }
ddbcc7e8e   Paul Menage   Task Control Grou...
1504
1505
1506
1507
  static ssize_t cgroup_file_read(struct file *file, char __user *buf,
  				   size_t nbytes, loff_t *ppos)
  {
  	struct cftype *cft = __d_cft(file->f_dentry);
bd89aabc6   Paul Menage   Control groups: R...
1508
  	struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);
ddbcc7e8e   Paul Menage   Task Control Grou...
1509

75139b827   Li Zefan   cgroups: remove s...
1510
  	if (cgroup_is_removed(cgrp))
ddbcc7e8e   Paul Menage   Task Control Grou...
1511
1512
1513
  		return -ENODEV;
  
  	if (cft->read)
bd89aabc6   Paul Menage   Control groups: R...
1514
  		return cft->read(cgrp, cft, file, buf, nbytes, ppos);
f4c753b7e   Paul Menage   CGroup API files:...
1515
1516
  	if (cft->read_u64)
  		return cgroup_read_u64(cgrp, cft, file, buf, nbytes, ppos);
e73d2c61d   Paul Menage   CGroups _s64 file...
1517
1518
  	if (cft->read_s64)
  		return cgroup_read_s64(cgrp, cft, file, buf, nbytes, ppos);
ddbcc7e8e   Paul Menage   Task Control Grou...
1519
1520
  	return -EINVAL;
  }
917965696   Paul Menage   CGroup API files:...
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
  /*
   * seqfile ops/methods for returning structured data. Currently just
   * supports string->u64 maps, but can be extended in future.
   */
  
  /*
   * Per-open state for a seq_file-backed control file.  Allocated in
   * cgroup_file_open(), handed to single_open() as the seq_file's private
   * data, and freed in cgroup_seqfile_release().
   */
  struct cgroup_seqfile_state {
  	/* control file that was opened (supplies read_map/read_seq_string) */
  	struct cftype *cft;
  	/* cgroup of the directory that contains the opened file */
  	struct cgroup *cgroup;
  };
  
  /*
   * cgroup_map_add - emit one "key value" line of a read_map() file.
   * @cb:    map callback descriptor; cb->state holds the seq_file that
   *         cgroup_seqfile_show() stored there
   * @key:   name of the entry
   * @value: value of the entry, printed as an unsigned decimal
   *
   * Installed as the ->fill callback by cgroup_seqfile_show().
   * Returns the result of seq_printf().
   */
  static int cgroup_map_add(struct cgroup_map_cb *cb, const char *key, u64 value)
  {
  	struct seq_file *sf = cb->state;

  	/* The format must be a single-line literal ending in "\n". */
  	return seq_printf(sf, "%s %llu\n", key, (unsigned long long)value);
  }
  
  static int cgroup_seqfile_show(struct seq_file *m, void *arg)
  {
  	struct cgroup_seqfile_state *state = m->private;
  	struct cftype *cft = state->cft;
29486df32   Serge E. Hallyn   cgroups: introduc...
1542
1543
1544
1545
1546
1547
1548
1549
  	if (cft->read_map) {
  		struct cgroup_map_cb cb = {
  			.fill = cgroup_map_add,
  			.state = m,
  		};
  		return cft->read_map(state->cgroup, cft, &cb);
  	}
  	return cft->read_seq_string(state->cgroup, cft, m);
917965696   Paul Menage   CGroup API files:...
1550
  }
96930a636   Adrian Bunk   make cgroup_seqfi...
1551
  static int cgroup_seqfile_release(struct inode *inode, struct file *file)
917965696   Paul Menage   CGroup API files:...
1552
1553
1554
1555
1556
1557
1558
1559
  {
  	struct seq_file *seq = file->private_data;
  	kfree(seq->private);
  	return single_release(inode, file);
  }
  
  static struct file_operations cgroup_seqfile_operations = {
  	.read = seq_read,
e788e066c   Paul Menage   cgroup files: mov...
1560
  	.write = cgroup_file_write,
917965696   Paul Menage   CGroup API files:...
1561
1562
1563
  	.llseek = seq_lseek,
  	.release = cgroup_seqfile_release,
  };
ddbcc7e8e   Paul Menage   Task Control Grou...
1564
1565
1566
1567
1568
1569
1570
1571
  static int cgroup_file_open(struct inode *inode, struct file *file)
  {
  	int err;
  	struct cftype *cft;
  
  	err = generic_file_open(inode, file);
  	if (err)
  		return err;
ddbcc7e8e   Paul Menage   Task Control Grou...
1572
  	cft = __d_cft(file->f_dentry);
75139b827   Li Zefan   cgroups: remove s...
1573

29486df32   Serge E. Hallyn   cgroups: introduc...
1574
  	if (cft->read_map || cft->read_seq_string) {
917965696   Paul Menage   CGroup API files:...
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
  		struct cgroup_seqfile_state *state =
  			kzalloc(sizeof(*state), GFP_USER);
  		if (!state)
  			return -ENOMEM;
  		state->cft = cft;
  		state->cgroup = __d_cgrp(file->f_dentry->d_parent);
  		file->f_op = &cgroup_seqfile_operations;
  		err = single_open(file, cgroup_seqfile_show, state);
  		if (err < 0)
  			kfree(state);
  	} else if (cft->open)
ddbcc7e8e   Paul Menage   Task Control Grou...
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
  		err = cft->open(inode, file);
  	else
  		err = 0;
  
  	return err;
  }
  
  /*
   * ->release for cgroup control files: forward to the control file's own
   * release hook when it has one, otherwise report success.
   */
  static int cgroup_file_release(struct inode *inode, struct file *file)
  {
  	struct cftype *cft = __d_cft(file->f_dentry);

  	return cft->release ? cft->release(inode, file) : 0;
  }
  
  /*
   * cgroup_rename - rename a cgroup directory within its parent.
   *
   * Only directories may be renamed, the target name must not already
   * exist, and the entry must stay in the same parent directory.
   * Returns -ENOTDIR, -EEXIST or -EIO respectively when one of those
   * conditions is violated (checked in that order); otherwise returns the
   * result of simple_rename().
   */
  static int cgroup_rename(struct inode *old_dir, struct dentry *old_dentry,
  			    struct inode *new_dir, struct dentry *new_dentry)
  {
  	int err;

  	if (!S_ISDIR(old_dentry->d_inode->i_mode))
  		err = -ENOTDIR;
  	else if (new_dentry->d_inode)
  		err = -EEXIST;
  	else if (old_dir != new_dir)
  		err = -EIO;
  	else
  		err = simple_rename(old_dir, old_dentry, new_dir, new_dentry);

  	return err;
  }
  
  /*
   * File operations for regular cgroup control files.  cgroup_create_file()
   * installs this table as i_fop on S_ISREG inodes; cgroup_file_open() may
   * later switch an open file over to cgroup_seqfile_operations when the
   * control file provides read_map or read_seq_string.
   */
  static struct file_operations cgroup_file_operations = {
  	.read = cgroup_file_read,
  	.write = cgroup_file_write,
  	.llseek = generic_file_llseek,
  	.open = cgroup_file_open,
  	.release = cgroup_file_release,
  };
  
  /*
   * Inode operations for cgroup directories.  cgroup_create_file() installs
   * this table as i_op on S_ISDIR inodes.
   */
  static struct inode_operations cgroup_dir_inode_operations = {
  	.lookup = simple_lookup,
  	.mkdir = cgroup_mkdir,
  	.rmdir = cgroup_rmdir,
  	.rename = cgroup_rename,
  };
099fca322   Li Zefan   cgroups: show cor...
1630
  static int cgroup_create_file(struct dentry *dentry, mode_t mode,
ddbcc7e8e   Paul Menage   Task Control Grou...
1631
1632
  				struct super_block *sb)
  {
3ba13d179   Al Viro   constify dentry_o...
1633
  	static const struct dentry_operations cgroup_dops = {
ddbcc7e8e   Paul Menage   Task Control Grou...
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
  		.d_iput = cgroup_diput,
  	};
  
  	struct inode *inode;
  
  	if (!dentry)
  		return -ENOENT;
  	if (dentry->d_inode)
  		return -EEXIST;
  
  	inode = cgroup_new_inode(mode, sb);
  	if (!inode)
  		return -ENOMEM;
  
  	if (S_ISDIR(mode)) {
  		inode->i_op = &cgroup_dir_inode_operations;
  		inode->i_fop = &simple_dir_operations;
  
  		/* start off with i_nlink == 2 (for "." entry) */
  		inc_nlink(inode);
  
  		/* start with the directory inode held, so that we can
  		 * populate it without racing with another mkdir */
817929ec2   Paul Menage   Task Control Grou...
1657
  		mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD);
ddbcc7e8e   Paul Menage   Task Control Grou...
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
  	} else if (S_ISREG(mode)) {
  		inode->i_size = 0;
  		inode->i_fop = &cgroup_file_operations;
  	}
  	dentry->d_op = &cgroup_dops;
  	d_instantiate(dentry, inode);
  	dget(dentry);	/* Extra count - pin the dentry in core */
  	return 0;
  }
  
  /*
a043e3b2c   Li Zefan   cgroup: fix comments
1669
1670
1671
1672
1673
   * cgroup_create_dir - create a directory for an object.
   * @cgrp: the cgroup we create the directory for. It must have a valid
   *        ->parent field. And we are going to fill its ->dentry field.
   * @dentry: dentry of the new cgroup
   * @mode: mode to set on new directory.
ddbcc7e8e   Paul Menage   Task Control Grou...
1674
   */
bd89aabc6   Paul Menage   Control groups: R...
1675
  static int cgroup_create_dir(struct cgroup *cgrp, struct dentry *dentry,
099fca322   Li Zefan   cgroups: show cor...
1676
  				mode_t mode)
ddbcc7e8e   Paul Menage   Task Control Grou...
1677
1678
1679
  {
  	struct dentry *parent;
  	int error = 0;
bd89aabc6   Paul Menage   Control groups: R...
1680
1681
  	parent = cgrp->parent->dentry;
  	error = cgroup_create_file(dentry, S_IFDIR | mode, cgrp->root->sb);
ddbcc7e8e   Paul Menage   Task Control Grou...
1682
  	if (!error) {
bd89aabc6   Paul Menage   Control groups: R...
1683
  		dentry->d_fsdata = cgrp;
ddbcc7e8e   Paul Menage   Task Control Grou...
1684
  		inc_nlink(parent->d_inode);
a47295e6b   Paul Menage   cgroups: make cgr...
1685
  		rcu_assign_pointer(cgrp->dentry, dentry);
ddbcc7e8e   Paul Menage   Task Control Grou...
1686
1687
1688
1689
1690
1691
  		dget(dentry);
  	}
  	dput(dentry);
  
  	return error;
  }
099fca322   Li Zefan   cgroups: show cor...
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
  /**
   * cgroup_file_mode - deduce file mode of a control file
   * @cft: the control file in question
   *
   * An explicitly configured, non-zero cft->mode is returned unchanged.
   * Otherwise the mode is derived from the handlers that are present:
   *   - S_IRUGO is set when any read-side handler exists (read, read_u64,
   *     read_s64, read_map, read_seq_string);
   *   - S_IWUSR is set when any write-side handler exists (write,
   *     write_u64, write_s64, write_string, trigger).
   * A control file with no handlers at all yields 0.
   */
  static mode_t cgroup_file_mode(const struct cftype *cft)
  {
  	int has_reader, has_writer;

  	if (cft->mode != 0)
  		return cft->mode;

  	has_reader = cft->read || cft->read_u64 || cft->read_s64 ||
  		     cft->read_map || cft->read_seq_string;
  	has_writer = cft->write || cft->write_u64 || cft->write_s64 ||
  		     cft->write_string || cft->trigger;

  	return (has_reader ? S_IRUGO : 0) | (has_writer ? S_IWUSR : 0);
  }
bd89aabc6   Paul Menage   Control groups: R...
1718
  int cgroup_add_file(struct cgroup *cgrp,
ddbcc7e8e   Paul Menage   Task Control Grou...
1719
1720
1721
  		       struct cgroup_subsys *subsys,
  		       const struct cftype *cft)
  {
bd89aabc6   Paul Menage   Control groups: R...
1722
  	struct dentry *dir = cgrp->dentry;
ddbcc7e8e   Paul Menage   Task Control Grou...
1723
1724
  	struct dentry *dentry;
  	int error;
099fca322   Li Zefan   cgroups: show cor...
1725
  	mode_t mode;
ddbcc7e8e   Paul Menage   Task Control Grou...
1726
1727
  
  	char name[MAX_CGROUP_TYPE_NAMELEN + MAX_CFTYPE_NAME + 2] = { 0 };
bd89aabc6   Paul Menage   Control groups: R...
1728
  	if (subsys && !test_bit(ROOT_NOPREFIX, &cgrp->root->flags)) {
ddbcc7e8e   Paul Menage   Task Control Grou...
1729
1730
1731
1732
1733
1734
1735
  		strcpy(name, subsys->name);
  		strcat(name, ".");
  	}
  	strcat(name, cft->name);
  	BUG_ON(!mutex_is_locked(&dir->d_inode->i_mutex));
  	dentry = lookup_one_len(name, dir, strlen(name));
  	if (!IS_ERR(dentry)) {
099fca322   Li Zefan   cgroups: show cor...
1736
1737
  		mode = cgroup_file_mode(cft);
  		error = cgroup_create_file(dentry, mode | S_IFREG,
bd89aabc6   Paul Menage   Control groups: R...
1738
  						cgrp->root->sb);
ddbcc7e8e   Paul Menage   Task Control Grou...
1739
1740
1741
1742
1743
1744
1745
  		if (!error)
  			dentry->d_fsdata = (void *)cft;
  		dput(dentry);
  	} else
  		error = PTR_ERR(dentry);
  	return error;
  }
bd89aabc6   Paul Menage   Control groups: R...
1746
  int cgroup_add_files(struct cgroup *cgrp,
ddbcc7e8e   Paul Menage   Task Control Grou...
1747
1748
1749
1750
1751
1752
  			struct cgroup_subsys *subsys,
  			const struct cftype cft[],
  			int count)
  {
  	int i, err;
  	for (i = 0; i < count; i++) {
bd89aabc6   Paul Menage   Control groups: R...
1753
  		err = cgroup_add_file(cgrp, subsys, &cft[i]);
ddbcc7e8e   Paul Menage   Task Control Grou...
1754
1755
1756
1757
1758
  		if (err)
  			return err;
  	}
  	return 0;
  }
a043e3b2c   Li Zefan   cgroup: fix comments
1759
1760
1761
1762
1763
1764
  /**
   * cgroup_task_count - count the number of tasks in a cgroup.
   * @cgrp: the cgroup in question
   *
   * Return the number of tasks in the cgroup.
   */
bd89aabc6   Paul Menage   Control groups: R...
1765
  int cgroup_task_count(const struct cgroup *cgrp)
bbcb81d09   Paul Menage   Task Control Grou...
1766
1767
  {
  	int count = 0;
71cbb949d   KOSAKI Motohiro   cgroup: list_for_...
1768
  	struct cg_cgroup_link *link;
817929ec2   Paul Menage   Task Control Grou...
1769
1770
  
  	read_lock(&css_set_lock);
71cbb949d   KOSAKI Motohiro   cgroup: list_for_...
1771
  	list_for_each_entry(link, &cgrp->css_sets, cgrp_link_list) {
146aa1bd0   Lai Jiangshan   cgroups: fix prob...
1772
  		count += atomic_read(&link->cg->refcount);
817929ec2   Paul Menage   Task Control Grou...
1773
1774
  	}
  	read_unlock(&css_set_lock);
bbcb81d09   Paul Menage   Task Control Grou...
1775
1776
1777
1778
  	return count;
  }
  
  /*
817929ec2   Paul Menage   Task Control Grou...
1779
1780
1781
   * Advance a list_head iterator.  The iterator should be positioned at
   * the start of a css_set
   */
bd89aabc6   Paul Menage   Control groups: R...
1782
  static void cgroup_advance_iter(struct cgroup *cgrp,
817929ec2   Paul Menage   Task Control Grou...
1783
1784
1785
1786
1787
1788
1789
1790
1791
  					  struct cgroup_iter *it)
  {
  	struct list_head *l = it->cg_link;
  	struct cg_cgroup_link *link;
  	struct css_set *cg;
  
  	/* Advance to the next non-empty css_set */
  	do {
  		l = l->next;
bd89aabc6   Paul Menage   Control groups: R...
1792
  		if (l == &cgrp->css_sets) {
817929ec2   Paul Menage   Task Control Grou...
1793
1794
1795
  			it->cg_link = NULL;
  			return;
  		}
bd89aabc6   Paul Menage   Control groups: R...
1796
  		link = list_entry(l, struct cg_cgroup_link, cgrp_link_list);
817929ec2   Paul Menage   Task Control Grou...
1797
1798
1799
1800
1801
  		cg = link->cg;
  	} while (list_empty(&cg->tasks));
  	it->cg_link = l;
  	it->task = cg->tasks.next;
  }
31a7df01f   Cliff Wickman   cgroups: mechanis...
1802
1803
1804
1805
1806
1807
1808
1809
1810
  /*
   * To reduce the fork() overhead for systems that are not actually
   * using their cgroups capability, we don't maintain the lists running
   * through each css_set to its tasks until we see the list actually
   * used - in other words after the first call to cgroup_iter_start().
   *
   * The tasklist_lock is not held here, as do_each_thread() and
   * while_each_thread() are protected by RCU.
   */
3df91fe30   Adrian Bunk   make cgroup_enabl...
1811
  static void cgroup_enable_task_cg_lists(void)
31a7df01f   Cliff Wickman   cgroups: mechanis...
1812
1813
1814
1815
1816
1817
  {
  	struct task_struct *p, *g;
  	write_lock(&css_set_lock);
  	use_task_css_set_links = 1;
  	do_each_thread(g, p) {
  		task_lock(p);
0e04388f0   Li Zefan   cgroup: fix a rac...
1818
1819
1820
1821
1822
1823
  		/*
  		 * We should check if the process is exiting, otherwise
  		 * it will race with cgroup_exit() in that the list
  		 * entry won't be deleted though the process has exited.
  		 */
  		if (!(p->flags & PF_EXITING) && list_empty(&p->cg_list))
31a7df01f   Cliff Wickman   cgroups: mechanis...
1824
1825
1826
1827
1828
  			list_add(&p->cg_list, &p->cgroups->tasks);
  		task_unlock(p);
  	} while_each_thread(g, p);
  	write_unlock(&css_set_lock);
  }
bd89aabc6   Paul Menage   Control groups: R...
1829
  void cgroup_iter_start(struct cgroup *cgrp, struct cgroup_iter *it)
817929ec2   Paul Menage   Task Control Grou...
1830
1831
1832
1833
1834
1835
  {
  	/*
  	 * The first time anyone tries to iterate across a cgroup,
  	 * we need to enable the list linking each css_set to its
  	 * tasks, and fix up all existing tasks.
  	 */
31a7df01f   Cliff Wickman   cgroups: mechanis...
1836
1837
  	if (!use_task_css_set_links)
  		cgroup_enable_task_cg_lists();
817929ec2   Paul Menage   Task Control Grou...
1838
  	read_lock(&css_set_lock);
bd89aabc6   Paul Menage   Control groups: R...
1839
1840
  	it->cg_link = &cgrp->css_sets;
  	cgroup_advance_iter(cgrp, it);
817929ec2   Paul Menage   Task Control Grou...
1841
  }
bd89aabc6   Paul Menage   Control groups: R...
1842
  struct task_struct *cgroup_iter_next(struct cgroup *cgrp,
817929ec2   Paul Menage   Task Control Grou...
1843
1844
1845
1846
  					struct cgroup_iter *it)
  {
  	struct task_struct *res;
  	struct list_head *l = it->task;
2019f634c   Lai Jiangshan   cgroups: fix cgro...
1847
  	struct cg_cgroup_link *link;
817929ec2   Paul Menage   Task Control Grou...
1848
1849
1850
1851
1852
1853
1854
  
  	/* If the iterator cg is NULL, we have no tasks */
  	if (!it->cg_link)
  		return NULL;
  	res = list_entry(l, struct task_struct, cg_list);
  	/* Advance iterator to find next entry */
  	l = l->next;
2019f634c   Lai Jiangshan   cgroups: fix cgro...
1855
1856
  	link = list_entry(it->cg_link, struct cg_cgroup_link, cgrp_link_list);
  	if (l == &link->cg->tasks) {
817929ec2   Paul Menage   Task Control Grou...
1857
1858
  		/* We reached the end of this task list - move on to
  		 * the next cg_cgroup_link */
bd89aabc6   Paul Menage   Control groups: R...
1859
  		cgroup_advance_iter(cgrp, it);
817929ec2   Paul Menage   Task Control Grou...
1860
1861
1862
1863
1864
  	} else {
  		it->task = l;
  	}
  	return res;
  }
bd89aabc6   Paul Menage   Control groups: R...
1865
  void cgroup_iter_end(struct cgroup *cgrp, struct cgroup_iter *it)
817929ec2   Paul Menage   Task Control Grou...
1866
1867
1868
  {
  	read_unlock(&css_set_lock);
  }
31a7df01f   Cliff Wickman   cgroups: mechanis...
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
  /*
   * Decide whether task @t1 started after the point in time @time.
   *
   * Returns 1 if t1's start_time is later than @time and 0 if it is
   * earlier.  When the two times are equal the tie is broken by comparing
   * the task pointers @t1 and @t2.
   */
  static inline int started_after_time(struct task_struct *t1,
  				     struct timespec *time,
  				     struct task_struct *t2)
  {
  	int cmp = timespec_compare(&t1->start_time, time);

  	if (cmp != 0)
  		return cmp > 0;

  	/*
  	 * Same start time: arbitrarily treat the lower pointer value as
  	 * having started first.  t2 is only compared, never dereferenced,
  	 * so it does not matter if that task has exited in the meantime;
  	 * the value still distinguishes two tasks that started
  	 * (effectively) simultaneously.
  	 */
  	return t1 > t2;
  }
  
  /*
   * Heap ordering callback (passed as the "gt" comparator to heap_init()
   * and stored in heap->gt by cgroup_scan_tasks()).  Both arguments are
   * task_struct pointers; the result is that of started_after_time() for
   * the first task against the second task's start time, so the heap is
   * ordered by descending task start time.
   */
  static inline int started_after(void *p1, void *p2)
  {
  	struct task_struct *lhs = p1;
  	struct task_struct *rhs = p2;

  	return started_after_time(lhs, &rhs->start_time, rhs);
  }
  
  /**
   * cgroup_scan_tasks - iterate through all the tasks in a cgroup
   * @scan: struct cgroup_scanner containing arguments for the scan
   *
   * Arguments include pointers to callback functions test_task() and
   * process_task().
   * Iterate through all the tasks in a cgroup, calling test_task() for each,
   * and if it returns true, call process_task() for it also.
   * The test_task pointer may be NULL, meaning always true (select all tasks).
   * Effectively duplicates cgroup_iter_{start,next,end}()
   * but does not lock css_set_lock for the call to process_task().
   * The struct cgroup_scanner may be embedded in any structure of the caller's
   * creation.
   * It is guaranteed that process_task() will act on every task that
   * is a member of the cgroup for the duration of this call. This
   * function may or may not call process_task() for tasks that exit
   * or move to a different cgroup during the call, or are forked or
   * move into the cgroup during the call.
   *
   * Note that test_task() may be called with locks held, and may in some
   * situations be called multiple times for the same task, so it should
   * be cheap.
   * If the heap pointer in the struct cgroup_scanner is non-NULL, a heap has been
   * pre-allocated and will be used for heap operations (and its "gt" member will
   * be overwritten), else a temporary heap will be used (allocation of which
   * may cause this function to fail).
   */
  int cgroup_scan_tasks(struct cgroup_scanner *scan)
  {
  	int retval, i;
  	struct cgroup_iter it;
  	struct task_struct *p, *dropped;
  	/* Never dereference latest_task, since it's not refcounted */
  	struct task_struct *latest_task = NULL;
  	struct ptr_heap tmp_heap;
  	struct ptr_heap *heap;
  	struct timespec latest_time = { 0, 0 };
  
  	if (scan->heap) {
  		/* The caller supplied our heap and pre-allocated its memory */
  		heap = scan->heap;
  		heap->gt = &started_after;
  	} else {
  		/* We need to allocate our own heap memory */
  		heap = &tmp_heap;
  		retval = heap_init(heap, PAGE_SIZE, GFP_KERNEL, &started_after);
  		if (retval)
  			/* cannot allocate the heap */
  			return retval;
  	}
  
   again:
  	/*
  	 * Scan tasks in the cgroup, using the scanner's "test_task" callback
  	 * to determine which are of interest, and using the scanner's
  	 * "process_task" callback to process any of them that need an update.
  	 * Since we don't want to hold any locks during the task updates,
  	 * gather tasks to be processed in a heap structure.
  	 * The heap is sorted by descending task start time.
  	 * If the statically-sized heap fills up, we overflow tasks that
  	 * started later, and in future iterations only consider tasks that
  	 * started after the latest task in the previous pass. This
  	 * guarantees forward progress and that we don't miss any tasks.
  	 */
  	heap->size = 0;
  	cgroup_iter_start(scan->cg, &it);
  	while ((p = cgroup_iter_next(scan->cg, &it))) {
  		/*
  		 * Only affect tasks that qualify per the caller's callback,
  		 * if he provided one
  		 */
  		if (scan->test_task && !scan->test_task(p, scan))
  			continue;
  		/*
  		 * Only process tasks that started after the last task
  		 * we processed
  		 */
  		if (!started_after_time(p, &latest_time, latest_task))
  			continue;
  		dropped = heap_insert(heap, p);
  		if (dropped == NULL) {
  			/*
  			 * The new task was inserted; the heap wasn't
  			 * previously full
  			 */
  			get_task_struct(p);
  		} else if (dropped != p) {
  			/*
  			 * The new task was inserted, and pushed out a
  			 * different task
  			 */
  			get_task_struct(p);
  			put_task_struct(dropped);
  		}
  		/*
  		 * Else the new task was newer than anything already in
  		 * the heap and wasn't inserted
  		 */
  	}
  	cgroup_iter_end(scan->cg, &it);
  
  	if (heap->size) {
  		for (i = 0; i < heap->size; i++) {
4fe91d518   Paul Jackson   cgroup: fix spars...
2006
  			struct task_struct *q = heap->ptrs[i];
31a7df01f   Cliff Wickman   cgroups: mechanis...
2007
  			if (i == 0) {
4fe91d518   Paul Jackson   cgroup: fix spars...
2008
2009
  				latest_time = q->start_time;
  				latest_task = q;
31a7df01f   Cliff Wickman   cgroups: mechanis...
2010
2011
  			}
  			/* Process the task per the caller's callback */
4fe91d518   Paul Jackson   cgroup: fix spars...
2012
2013
  			scan->process_task(q, scan);
  			put_task_struct(q);
31a7df01f   Cliff Wickman   cgroups: mechanis...
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
  		}
  		/*
  		 * If we had to process any tasks at all, scan again
  		 * in case some of them were in the middle of forking
  		 * children that didn't get processed.
  		 * Not the most efficient way to do it, but it avoids
  		 * having to take callback_mutex in the fork path
  		 */
  		goto again;
  	}
  	if (heap == &tmp_heap)
  		heap_free(&tmp_heap);
  	return 0;
  }
817929ec2   Paul Menage   Task Control Grou...
2028
  /*
bbcb81d09   Paul Menage   Task Control Grou...
2029
2030
2031
2032
2033
2034
2035
   * Stuff for reading the 'tasks' file.
   *
   * Reading this file can return large amounts of data if a cgroup has
   * *lots* of attached tasks. So it may need several calls to read(),
   * but we cannot guarantee that the information we produce is correct
   * unless we produce it entirely atomically.
   *
bbcb81d09   Paul Menage   Task Control Grou...
2036
   */
bbcb81d09   Paul Menage   Task Control Grou...
2037
2038
2039
  
  /*
   * Load into 'pidarray' up to 'npids' of the tasks using cgroup
bd89aabc6   Paul Menage   Control groups: R...
2040
   * 'cgrp'.  Return actual number of pids loaded.  No need to
bbcb81d09   Paul Menage   Task Control Grou...
2041
2042
2043
2044
   * task_lock(p) when reading out p->cgroup, since we're in an RCU
   * read section, so the css_set can't go away, and is
   * immutable after creation.
   */
bd89aabc6   Paul Menage   Control groups: R...
2045
  static int pid_array_load(pid_t *pidarray, int npids, struct cgroup *cgrp)
bbcb81d09   Paul Menage   Task Control Grou...
2046
  {
e7b80bb69   Gowrishankar M   cgroups: skip pro...
2047
  	int n = 0, pid;
817929ec2   Paul Menage   Task Control Grou...
2048
2049
  	struct cgroup_iter it;
  	struct task_struct *tsk;
bd89aabc6   Paul Menage   Control groups: R...
2050
2051
  	cgroup_iter_start(cgrp, &it);
  	while ((tsk = cgroup_iter_next(cgrp, &it))) {
817929ec2   Paul Menage   Task Control Grou...
2052
2053
  		if (unlikely(n == npids))
  			break;
e7b80bb69   Gowrishankar M   cgroups: skip pro...
2054
2055
2056
  		pid = task_pid_vnr(tsk);
  		if (pid > 0)
  			pidarray[n++] = pid;
817929ec2   Paul Menage   Task Control Grou...
2057
  	}
bd89aabc6   Paul Menage   Control groups: R...
2058
  	cgroup_iter_end(cgrp, &it);
bbcb81d09   Paul Menage   Task Control Grou...
2059
2060
  	return n;
  }
846c7bb05   Balbir Singh   Add cgroupstats
2061
  /**
a043e3b2c   Li Zefan   cgroup: fix comments
2062
   * cgroupstats_build - build and fill cgroupstats
846c7bb05   Balbir Singh   Add cgroupstats
2063
2064
2065
   * @stats: cgroupstats to fill information into
   * @dentry: A dentry entry belonging to the cgroup for which stats have
   * been requested.
a043e3b2c   Li Zefan   cgroup: fix comments
2066
2067
2068
   *
   * Build and fill cgroupstats so that taskstats can export it to user
   * space.
846c7bb05   Balbir Singh   Add cgroupstats
2069
2070
2071
2072
   */
  int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry)
  {
  	int ret = -EINVAL;
bd89aabc6   Paul Menage   Control groups: R...
2073
  	struct cgroup *cgrp;
846c7bb05   Balbir Singh   Add cgroupstats
2074
2075
  	struct cgroup_iter it;
  	struct task_struct *tsk;
33d283bef   Li Zefan   cgroups: fix a se...
2076

846c7bb05   Balbir Singh   Add cgroupstats
2077
  	/*
33d283bef   Li Zefan   cgroups: fix a se...
2078
2079
  	 * Validate dentry by checking the superblock operations,
  	 * and make sure it's a directory.
846c7bb05   Balbir Singh   Add cgroupstats
2080
  	 */
33d283bef   Li Zefan   cgroups: fix a se...
2081
2082
  	if (dentry->d_sb->s_op != &cgroup_ops ||
  	    !S_ISDIR(dentry->d_inode->i_mode))
846c7bb05   Balbir Singh   Add cgroupstats
2083
2084
2085
  		 goto err;
  
  	ret = 0;
bd89aabc6   Paul Menage   Control groups: R...
2086
  	cgrp = dentry->d_fsdata;
846c7bb05   Balbir Singh   Add cgroupstats
2087

bd89aabc6   Paul Menage   Control groups: R...
2088
2089
  	cgroup_iter_start(cgrp, &it);
  	while ((tsk = cgroup_iter_next(cgrp, &it))) {
846c7bb05   Balbir Singh   Add cgroupstats
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
  		switch (tsk->state) {
  		case TASK_RUNNING:
  			stats->nr_running++;
  			break;
  		case TASK_INTERRUPTIBLE:
  			stats->nr_sleeping++;
  			break;
  		case TASK_UNINTERRUPTIBLE:
  			stats->nr_uninterruptible++;
  			break;
  		case TASK_STOPPED:
  			stats->nr_stopped++;
  			break;
  		default:
  			if (delayacct_is_task_waiting_on_io(tsk))
  				stats->nr_io_wait++;
  			break;
  		}
  	}
bd89aabc6   Paul Menage   Control groups: R...
2109
  	cgroup_iter_end(cgrp, &it);
846c7bb05   Balbir Singh   Add cgroupstats
2110

846c7bb05   Balbir Singh   Add cgroupstats
2111
2112
2113
  err:
  	return ret;
  }
bbcb81d09   Paul Menage   Task Control Grou...
2114
2115
2116
2117
  /*
   * sort() comparator for arrays of pid_t.
   *
   * Returns a negative value, zero, or a positive value when the pid at
   * 'a' is respectively smaller than, equal to, or greater than the pid
   * at 'b'.  The result is built from comparisons rather than from a
   * subtraction, so it cannot overflow and always carries the right
   * sign, whatever the operand values are.
   */
  static int cmppid(const void *a, const void *b)
  {
  	const pid_t lhs = *(const pid_t *)a;
  	const pid_t rhs = *(const pid_t *)b;

  	return (lhs > rhs) - (lhs < rhs);
  }
cc31edcee   Paul Menage   cgroups: convert ...
2118

bbcb81d09   Paul Menage   Task Control Grou...
2119
  /*
cc31edcee   Paul Menage   cgroups: convert ...
2120
2121
2122
   * seq_file methods for the "tasks" file. The seq_file position is the
   * next pid to display; the seq_file iterator is a pointer to the pid
   * in the cgroup->tasks_pids array.
bbcb81d09   Paul Menage   Task Control Grou...
2123
   */
cc31edcee   Paul Menage   cgroups: convert ...
2124
2125
  
  static void *cgroup_tasks_start(struct seq_file *s, loff_t *pos)
  {
  	/*
  	 * The incoming position is one more than the last pid shown, or
  	 * 0 on the first call or after a seek to the start.  A non-zero
  	 * position is located in the cgroup's pid array by binary
  	 * search: either the matching entry, or the slot where the
  	 * search converged when there is no exact match.
  	 *
  	 * pids_mutex is taken for reading here and is still held on
  	 * every return path, including the NULL one; it is dropped in
  	 * cgroup_tasks_stop().
  	 */
  	struct cgroup *cgrp = s->private;
  	int target = *pos;
  	int lo = 0;
  	int *found;

  	down_read(&cgrp->pids_mutex);
  	if (target != 0) {
  		int hi = cgrp->pids_length;

  		while (lo < hi) {
  			int mid = (lo + hi) / 2;
  			int cur = cgrp->tasks_pids[mid];

  			if (cur == target) {
  				lo = mid;
  				break;
  			}
  			if (cur < target)
  				lo = mid + 1;
  			else
  				hi = mid;
  		}
  	}
  	/* If we're off the end of the array, we're done */
  	if (lo >= cgrp->pids_length)
  		return NULL;
  	/* Update the abstract position to be the actual pid that we found */
  	found = cgrp->tasks_pids + lo;
  	*pos = *found;
  	return found;
  }
  
  /*
   * seq_file ->stop: release the read hold on the cgroup's pids_mutex.
   * 'v' is not used.
   */
  static void cgroup_tasks_stop(struct seq_file *s, void *v)
  {
  	struct cgroup *owner = s->private;

  	up_read(&owner->pids_mutex);
  }
  
  /*
   * seq_file ->next: step from the pid entry 'v' to the following entry
   * of the cgroup's pid array.  Returns NULL once the step runs off the
   * end of the array; otherwise stores the new entry's pid in *pos and
   * returns a pointer to that entry.
   */
  static void *cgroup_tasks_next(struct seq_file *s, void *v, loff_t *pos)
  {
  	struct cgroup *cgrp = s->private;
  	int *limit = cgrp->tasks_pids + cgrp->pids_length;
  	int *succ = (int *)v + 1;

  	if (succ >= limit)
  		return NULL;

  	*pos = *succ;
  	return succ;
  }
  
  /*
   * seq_file ->show: print the pid that 'v' points at as a decimal
   * number followed by a newline.  Returns the result of seq_printf().
   */
  static int cgroup_tasks_show(struct seq_file *s, void *v)
  {
  	return seq_printf(s, "%d\n", *(int *)v);
  }
bbcb81d09   Paul Menage   Task Control Grou...
2191

cc31edcee   Paul Menage   cgroups: convert ...
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
  /* seq_file iterator callbacks used for reading a cgroup's "tasks" file */
  static struct seq_operations cgroup_tasks_seq_operations = {
  	.start	= cgroup_tasks_start,
  	.next	= cgroup_tasks_next,
  	.show	= cgroup_tasks_show,
  	.stop	= cgroup_tasks_stop,
  };
  
  /*
   * Drop one user of the cgroup's cached pid array.  When the use count
   * reaches zero the array is freed and the cached pointer and length
   * are reset.  It is a BUG to call this with a use count of zero.
   * Everything is done under pids_mutex held for writing.
   */
  static void release_cgroup_pid_array(struct cgroup *cgrp)
  {
  	down_write(&cgrp->pids_mutex);
  	BUG_ON(cgrp->pids_use_count == 0);
  	cgrp->pids_use_count--;
  	if (cgrp->pids_use_count == 0) {
  		kfree(cgrp->tasks_pids);
  		cgrp->pids_length = 0;
  		cgrp->tasks_pids = NULL;
  	}
  	up_write(&cgrp->pids_mutex);
  }
cc31edcee   Paul Menage   cgroups: convert ...
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
  /*
   * ->release for the "tasks" file.  A file that was not opened for
   * reading has nothing to undo and returns 0.  Otherwise the cgroup's
   * pid array use is dropped and the result of seq_release() is
   * returned.
   */
  static int cgroup_tasks_release(struct inode *inode, struct file *file)
  {
  	struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);

  	if (file->f_mode & FMODE_READ) {
  		release_cgroup_pid_array(cgrp);
  		return seq_release(inode, file);
  	}

  	return 0;
  }
  
  /*
   * File operations installed on a "tasks" file by cgroup_tasks_open()
   * when it is opened for reading.
   */
  static struct file_operations cgroup_tasks_operations = {
  	.llseek		= seq_lseek,
  	.read		= seq_read,
  	.write		= cgroup_file_write,
  	.release	= cgroup_tasks_release,
  };
bbcb81d09   Paul Menage   Task Control Grou...
2227
  /*
cc31edcee   Paul Menage   cgroups: convert ...
2228
   * Handle an open on 'tasks' file.  Prepare an array containing the
bbcb81d09   Paul Menage   Task Control Grou...
2229
   * process id's of tasks currently attached to the cgroup being opened.
bbcb81d09   Paul Menage   Task Control Grou...
2230
   */
cc31edcee   Paul Menage   cgroups: convert ...
2231

bbcb81d09   Paul Menage   Task Control Grou...
2232
2233
  static int cgroup_tasks_open(struct inode *unused, struct file *file)
  {
bd89aabc6   Paul Menage   Control groups: R...
2234
  	struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);
bbcb81d09   Paul Menage   Task Control Grou...
2235
2236
  	pid_t *pidarray;
  	int npids;
cc31edcee   Paul Menage   cgroups: convert ...
2237
  	int retval;
bbcb81d09   Paul Menage   Task Control Grou...
2238

cc31edcee   Paul Menage   cgroups: convert ...
2239
  	/* Nothing to do for write-only files */
bbcb81d09   Paul Menage   Task Control Grou...
2240
2241
  	if (!(file->f_mode & FMODE_READ))
  		return 0;
bbcb81d09   Paul Menage   Task Control Grou...
2242
2243
2244
2245
2246
2247
  	/*
  	 * If cgroup gets more users after we read count, we won't have
  	 * enough space - tough.  This race is indistinguishable to the
  	 * caller from the case that the additional cgroup users didn't
  	 * show up until sometime later on.
  	 */
bd89aabc6   Paul Menage   Control groups: R...
2248
  	npids = cgroup_task_count(cgrp);
cc31edcee   Paul Menage   cgroups: convert ...
2249
2250
2251
2252
2253
  	pidarray = kmalloc(npids * sizeof(pid_t), GFP_KERNEL);
  	if (!pidarray)
  		return -ENOMEM;
  	npids = pid_array_load(pidarray, npids, cgrp);
  	sort(pidarray, npids, sizeof(pid_t), cmppid, NULL);
bbcb81d09   Paul Menage   Task Control Grou...
2254

cc31edcee   Paul Menage   cgroups: convert ...
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
  	/*
  	 * Store the array in the cgroup, freeing the old
  	 * array if necessary
  	 */
  	down_write(&cgrp->pids_mutex);
  	kfree(cgrp->tasks_pids);
  	cgrp->tasks_pids = pidarray;
  	cgrp->pids_length = npids;
  	cgrp->pids_use_count++;
  	up_write(&cgrp->pids_mutex);
  
  	file->f_op = &cgroup_tasks_operations;
  
  	retval = seq_open(file, &cgroup_tasks_seq_operations);
  	if (retval) {
  		release_cgroup_pid_array(cgrp);
  		return retval;
bbcb81d09   Paul Menage   Task Control Grou...
2272
  	}
cc31edcee   Paul Menage   cgroups: convert ...
2273
  	((struct seq_file *)file->private_data)->private = cgrp;
bbcb81d09   Paul Menage   Task Control Grou...
2274
2275
  	return 0;
  }
bd89aabc6   Paul Menage   Control groups: R...
2276
  /*
   * read_u64 handler for the "notify_on_release" file: report the value
   * of notify_on_release() for this cgroup.  'cft' is not used.
   */
  static u64 cgroup_read_notify_on_release(struct cgroup *cgrp,
  					    struct cftype *cft)
  {
  	u64 setting = notify_on_release(cgrp);

  	return setting;
  }
6379c1061   Paul Menage   cgroup files: mov...
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
  /*
   * write_u64 handler for the "notify_on_release" file.  The
   * CGRP_RELEASABLE flag is always cleared first; CGRP_NOTIFY_ON_RELEASE
   * is then set for a non-zero 'val' and cleared for zero.  Always
   * returns 0.  'cft' is not used.
   */
  static int cgroup_write_notify_on_release(struct cgroup *cgrp,
  					  struct cftype *cft,
  					  u64 val)
  {
  	clear_bit(CGRP_RELEASABLE, &cgrp->flags);

  	if (!val)
  		clear_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
  	else
  		set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);

  	return 0;
  }
bbcb81d09   Paul Menage   Task Control Grou...
2292
2293
2294
  /*
   * for the common functions, 'private' gives the type of file
   */
81a6a5cdd   Paul Menage   Task Control Grou...
2295
2296
2297
2298
  static struct cftype files[] = {
  	{
  		.name = "tasks",
  		.open = cgroup_tasks_open,
af351026a   Paul Menage   cgroup files: tur...
2299
  		.write_u64 = cgroup_tasks_write,
81a6a5cdd   Paul Menage   Task Control Grou...
2300
2301
  		.release = cgroup_tasks_release,
  		.private = FILE_TASKLIST,
099fca322   Li Zefan   cgroups: show cor...
2302
  		.mode = S_IRUGO | S_IWUSR,
81a6a5cdd   Paul Menage   Task Control Grou...
2303
2304
2305
2306
  	},
  
  	{
  		.name = "notify_on_release",
f4c753b7e   Paul Menage   CGroup API files:...
2307
  		.read_u64 = cgroup_read_notify_on_release,
6379c1061   Paul Menage   cgroup files: mov...
2308
  		.write_u64 = cgroup_write_notify_on_release,
81a6a5cdd   Paul Menage   Task Control Grou...
2309
2310
  		.private = FILE_NOTIFY_ON_RELEASE,
  	},
81a6a5cdd   Paul Menage   Task Control Grou...
2311
2312
2313
2314
  };
  
  static struct cftype cft_release_agent = {
  	.name = "release_agent",
e788e066c   Paul Menage   cgroup files: mov...
2315
2316
2317
  	.read_seq_string = cgroup_release_agent_show,
  	.write_string = cgroup_release_agent_write,
  	.max_write_len = PATH_MAX,
81a6a5cdd   Paul Menage   Task Control Grou...
2318
  	.private = FILE_RELEASE_AGENT,
bbcb81d09   Paul Menage   Task Control Grou...
2319
  };
bd89aabc6   Paul Menage   Control groups: R...
2320
  static int cgroup_populate_dir(struct cgroup *cgrp)
ddbcc7e8e   Paul Menage   Task Control Grou...
2321
2322
2323
2324
2325
  {
  	int err;
  	struct cgroup_subsys *ss;
  
  	/* First clear out any existing files */
bd89aabc6   Paul Menage   Control groups: R...
2326
  	cgroup_clear_directory(cgrp->dentry);
ddbcc7e8e   Paul Menage   Task Control Grou...
2327

bd89aabc6   Paul Menage   Control groups: R...
2328
  	err = cgroup_add_files(cgrp, NULL, files, ARRAY_SIZE(files));
bbcb81d09   Paul Menage   Task Control Grou...
2329
2330
  	if (err < 0)
  		return err;
bd89aabc6   Paul Menage   Control groups: R...
2331
2332
  	if (cgrp == cgrp->top_cgroup) {
  		if ((err = cgroup_add_file(cgrp, NULL, &cft_release_agent)) < 0)
81a6a5cdd   Paul Menage   Task Control Grou...
2333
2334
  			return err;
  	}
bd89aabc6   Paul Menage   Control groups: R...
2335
2336
  	for_each_subsys(cgrp->root, ss) {
  		if (ss->populate && (err = ss->populate(ss, cgrp)) < 0)
ddbcc7e8e   Paul Menage   Task Control Grou...
2337
2338
  			return err;
  	}
38460b48d   KAMEZAWA Hiroyuki   cgroup: CSS ID su...
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
  	/* This cgroup is ready now */
  	for_each_subsys(cgrp->root, ss) {
  		struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
  		/*
  		 * Update id->css pointer and make this css visible from
  		 * CSS ID functions. This pointer will be dereferened
  		 * from RCU-read-side without locks.
  		 */
  		if (css->id)
  			rcu_assign_pointer(css->id->css, css);
  	}
ddbcc7e8e   Paul Menage   Task Control Grou...
2350
2351
2352
2353
2354
2355
  
  	return 0;
  }
  
  static void init_cgroup_css(struct cgroup_subsys_state *css,
  			       struct cgroup_subsys *ss,
bd89aabc6   Paul Menage   Control groups: R...
2356
  			       struct cgroup *cgrp)
ddbcc7e8e   Paul Menage   Task Control Grou...
2357
  {
bd89aabc6   Paul Menage   Control groups: R...
2358
  	css->cgroup = cgrp;
e7c5ec919   Paul Menage   cgroups: add css_...
2359
  	atomic_set(&css->refcnt, 1);
ddbcc7e8e   Paul Menage   Task Control Grou...
2360
  	css->flags = 0;
38460b48d   KAMEZAWA Hiroyuki   cgroup: CSS ID su...
2361
  	css->id = NULL;
bd89aabc6   Paul Menage   Control groups: R...
2362
  	if (cgrp == dummytop)
ddbcc7e8e   Paul Menage   Task Control Grou...
2363
  		set_bit(CSS_ROOT, &css->flags);
bd89aabc6   Paul Menage   Control groups: R...
2364
2365
  	BUG_ON(cgrp->subsys[ss->subsys_id]);
  	cgrp->subsys[ss->subsys_id] = css;
ddbcc7e8e   Paul Menage   Task Control Grou...
2366
  }
999cd8a45   Paul Menage   cgroups: add a pe...
2367
2368
2369
2370
2371
2372
2373
2374
  /*
   * Take the hierarchy_mutex of every subsystem attached to 'root'.
   * The mutexes are always taken in ascending subsystem-index order so
   * that each caller acquires them in a consistent order.
   */
  static void cgroup_lock_hierarchy(struct cgroupfs_root *root)
  {
  	int idx;

  	for (idx = 0; idx < CGROUP_SUBSYS_COUNT; idx++) {
  		struct cgroup_subsys *ss = subsys[idx];

  		if (ss->root != root)
  			continue;
  		mutex_lock(&ss->hierarchy_mutex);
  	}
  }
  
  /*
   * Release the hierarchy_mutex of every subsystem attached to 'root',
   * walking the subsystems in ascending index order.
   */
  static void cgroup_unlock_hierarchy(struct cgroupfs_root *root)
  {
  	int idx;

  	for (idx = 0; idx < CGROUP_SUBSYS_COUNT; idx++) {
  		struct cgroup_subsys *ss = subsys[idx];

  		if (ss->root != root)
  			continue;
  		mutex_unlock(&ss->hierarchy_mutex);
  	}
  }
ddbcc7e8e   Paul Menage   Task Control Grou...
2389
  /*
a043e3b2c   Li Zefan   cgroup: fix comments
2390
2391
2392
2393
   * cgroup_create - create a cgroup
   * @parent: cgroup that will be parent of the new cgroup
   * @dentry: dentry of the new cgroup
   * @mode: mode to set on new inode
ddbcc7e8e   Paul Menage   Task Control Grou...
2394
   *
a043e3b2c   Li Zefan   cgroup: fix comments
2395
   * Must be called with the mutex on the parent inode held
ddbcc7e8e   Paul Menage   Task Control Grou...
2396
   */
ddbcc7e8e   Paul Menage   Task Control Grou...
2397
  static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
099fca322   Li Zefan   cgroups: show cor...
2398
  			     mode_t mode)
ddbcc7e8e   Paul Menage   Task Control Grou...
2399
  {
bd89aabc6   Paul Menage   Control groups: R...
2400
  	struct cgroup *cgrp;
ddbcc7e8e   Paul Menage   Task Control Grou...
2401
2402
2403
2404
  	struct cgroupfs_root *root = parent->root;
  	int err = 0;
  	struct cgroup_subsys *ss;
  	struct super_block *sb = root->sb;
bd89aabc6   Paul Menage   Control groups: R...
2405
2406
  	cgrp = kzalloc(sizeof(*cgrp), GFP_KERNEL);
  	if (!cgrp)
ddbcc7e8e   Paul Menage   Task Control Grou...
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
  		return -ENOMEM;
  
  	/* Grab a reference on the superblock so the hierarchy doesn't
  	 * get deleted on unmount if there are child cgroups.  This
  	 * can be done outside cgroup_mutex, since the sb can't
  	 * disappear while someone has an open control file on the
  	 * fs */
  	atomic_inc(&sb->s_active);
  
  	mutex_lock(&cgroup_mutex);
cc31edcee   Paul Menage   cgroups: convert ...
2417
  	init_cgroup_housekeeping(cgrp);
ddbcc7e8e   Paul Menage   Task Control Grou...
2418

bd89aabc6   Paul Menage   Control groups: R...
2419
2420
2421
  	cgrp->parent = parent;
  	cgrp->root = parent->root;
  	cgrp->top_cgroup = parent->top_cgroup;
ddbcc7e8e   Paul Menage   Task Control Grou...
2422

b6abdb0e6   Li Zefan   cgroup: fix defau...
2423
2424
  	if (notify_on_release(parent))
  		set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
ddbcc7e8e   Paul Menage   Task Control Grou...
2425
  	for_each_subsys(root, ss) {
bd89aabc6   Paul Menage   Control groups: R...
2426
  		struct cgroup_subsys_state *css = ss->create(ss, cgrp);
ddbcc7e8e   Paul Menage   Task Control Grou...
2427
2428
2429
2430
  		if (IS_ERR(css)) {
  			err = PTR_ERR(css);
  			goto err_destroy;
  		}
bd89aabc6   Paul Menage   Control groups: R...
2431
  		init_cgroup_css(css, ss, cgrp);
38460b48d   KAMEZAWA Hiroyuki   cgroup: CSS ID su...
2432
2433
2434
2435
  		if (ss->use_id)
  			if (alloc_css_id(ss, parent, cgrp))
  				goto err_destroy;
  		/* At error, ->destroy() callback has to free assigned ID. */
ddbcc7e8e   Paul Menage   Task Control Grou...
2436
  	}
999cd8a45   Paul Menage   cgroups: add a pe...
2437
  	cgroup_lock_hierarchy(root);
bd89aabc6   Paul Menage   Control groups: R...
2438
  	list_add(&cgrp->sibling, &cgrp->parent->children);
999cd8a45   Paul Menage   cgroups: add a pe...
2439
  	cgroup_unlock_hierarchy(root);
ddbcc7e8e   Paul Menage   Task Control Grou...
2440
  	root->number_of_cgroups++;
bd89aabc6   Paul Menage   Control groups: R...
2441
  	err = cgroup_create_dir(cgrp, dentry, mode);
ddbcc7e8e   Paul Menage   Task Control Grou...
2442
2443
2444
2445
  	if (err < 0)
  		goto err_remove;
  
  	/* The cgroup directory was pre-locked for us */
bd89aabc6   Paul Menage   Control groups: R...
2446
  	BUG_ON(!mutex_is_locked(&cgrp->dentry->d_inode->i_mutex));
ddbcc7e8e   Paul Menage   Task Control Grou...
2447

bd89aabc6   Paul Menage   Control groups: R...
2448
  	err = cgroup_populate_dir(cgrp);
ddbcc7e8e   Paul Menage   Task Control Grou...
2449
2450
2451
  	/* If err < 0, we have a half-filled directory - oh well ;) */
  
  	mutex_unlock(&cgroup_mutex);
bd89aabc6   Paul Menage   Control groups: R...
2452
  	mutex_unlock(&cgrp->dentry->d_inode->i_mutex);
ddbcc7e8e   Paul Menage   Task Control Grou...
2453
2454
2455
2456
  
  	return 0;
  
   err_remove:
baef99a08   KAMEZAWA Hiroyuki   cgroups: use hier...
2457
  	cgroup_lock_hierarchy(root);
bd89aabc6   Paul Menage   Control groups: R...
2458
  	list_del(&cgrp->sibling);
baef99a08   KAMEZAWA Hiroyuki   cgroups: use hier...
2459
  	cgroup_unlock_hierarchy(root);
ddbcc7e8e   Paul Menage   Task Control Grou...
2460
2461
2462
2463
2464
  	root->number_of_cgroups--;
  
   err_destroy:
  
  	for_each_subsys(root, ss) {
bd89aabc6   Paul Menage   Control groups: R...
2465
2466
  		if (cgrp->subsys[ss->subsys_id])
  			ss->destroy(ss, cgrp);
ddbcc7e8e   Paul Menage   Task Control Grou...
2467
2468
2469
2470
2471
2472
  	}
  
  	mutex_unlock(&cgroup_mutex);
  
  	/* Release the reference count that we took on the superblock */
  	deactivate_super(sb);
bd89aabc6   Paul Menage   Control groups: R...
2473
  	kfree(cgrp);
ddbcc7e8e   Paul Menage   Task Control Grou...
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
  	return err;
  }
  
  /*
   * mkdir handler: create a child cgroup under the cgroup stored in the
   * parent dentry's d_fsdata.  S_IFDIR is added to 'mode'.  'dir' is not
   * used.  Returns the result of cgroup_create().
   */
  static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, int mode)
  {
  	struct dentry *parent_dentry = dentry->d_parent;
  	struct cgroup *c_parent = parent_dentry->d_fsdata;

  	/* the vfs holds inode->i_mutex already */
  	return cgroup_create(c_parent, dentry, mode | S_IFDIR);
  }
55b6fd016   Li Zefan   cgroup: uninline ...
2484
  static int cgroup_has_css_refs(struct cgroup *cgrp)
81a6a5cdd   Paul Menage   Task Control Grou...
2485
2486
2487
  {
  	/* Check the reference count on each subsystem. Since we
  	 * already established that there are no tasks in the
e7c5ec919   Paul Menage   cgroups: add css_...
2488
  	 * cgroup, if the css refcount is also 1, then there should
81a6a5cdd   Paul Menage   Task Control Grou...
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
  	 * be no outstanding references, so the subsystem is safe to
  	 * destroy. We scan across all subsystems rather than using
  	 * the per-hierarchy linked list of mounted subsystems since
  	 * we can be called via check_for_release() with no
  	 * synchronization other than RCU, and the subsystem linked
  	 * list isn't RCU-safe */
  	int i;
  	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
  		struct cgroup_subsys *ss = subsys[i];
  		struct cgroup_subsys_state *css;
  		/* Skip subsystems not in this hierarchy */
bd89aabc6   Paul Menage   Control groups: R...
2500
  		if (ss->root != cgrp->root)
81a6a5cdd   Paul Menage   Task Control Grou...
2501
  			continue;
bd89aabc6   Paul Menage   Control groups: R...
2502
  		css = cgrp->subsys[ss->subsys_id];
81a6a5cdd   Paul Menage   Task Control Grou...
2503
2504
2505
2506
2507
2508
  		/* When called from check_for_release() it's possible
  		 * that by this point the cgroup has been removed
  		 * and the css deleted. But a false-positive doesn't
  		 * matter, since it can only happen if the cgroup
  		 * has been deleted and hence no longer needs the
  		 * release agent to be called anyway. */
e7c5ec919   Paul Menage   cgroups: add css_...
2509
  		if (css && (atomic_read(&css->refcnt) > 1))
81a6a5cdd   Paul Menage   Task Control Grou...
2510
  			return 1;
81a6a5cdd   Paul Menage   Task Control Grou...
2511
2512
2513
  	}
  	return 0;
  }
e7c5ec919   Paul Menage   cgroups: add css_...
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
  /*
   * Atomically mark all (or else none) of the cgroup's CSS objects as
   * CSS_REMOVED. Return true on success, or false if the cgroup has
   * busy subsystems. Call with cgroup_mutex held
   *
   * Works in two passes with local interrupts disabled: the first
   * pass drops each refcount from 1 to 0 and stops at the first state
   * whose refcount is above 1; the second pass either commits
   * CSS_REMOVED on every state or puts the zeroed refcounts back to 1.
   */

  static int cgroup_clear_css_refs(struct cgroup *cgrp)
  {
  	struct cgroup_subsys *ss;
  	unsigned long flags;
  	bool busy = false;

  	local_irq_save(flags);
  	for_each_subsys(cgrp->root, ss) {
  		struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];

  		for (;;) {
  			/* We can only remove a CSS with a refcnt==1 */
  			int seen = atomic_read(&css->refcnt);

  			if (seen > 1) {
  				busy = true;
  				goto done;
  			}
  			BUG_ON(!seen);
  			/*
  			 * Drop the refcnt to 0 while we check other
  			 * subsystems. This will cause any racing
  			 * css_tryget() to spin until we set the
  			 * CSS_REMOVED bits or abort
  			 */
  			if (atomic_cmpxchg(&css->refcnt, seen, 0) == seen)
  				break;
  			/* The refcount changed under us; look again */
  			cpu_relax();
  		}
  	}
   done:
  	for_each_subsys(cgrp->root, ss) {
  		struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];

  		if (!busy) {
  			/* Commit the fact that the CSS is removed */
  			set_bit(CSS_REMOVED, &css->flags);
  		} else if (!atomic_read(&css->refcnt)) {
  			/*
  			 * Restore old refcnt if we previously managed
  			 * to clear it from 1 to 0
  			 */
  			atomic_set(&css->refcnt, 1);
  		}
  	}
  	local_irq_restore(flags);

  	return !busy;
  }
ddbcc7e8e   Paul Menage   Task Control Grou...
2566
2567
  static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
  {
bd89aabc6   Paul Menage   Control groups: R...
2568
  	struct cgroup *cgrp = dentry->d_fsdata;
ddbcc7e8e   Paul Menage   Task Control Grou...
2569
2570
  	struct dentry *d;
  	struct cgroup *parent;
ec64f5154   KAMEZAWA Hiroyuki   cgroup: fix frequ...
2571
2572
  	DEFINE_WAIT(wait);
  	int ret;
ddbcc7e8e   Paul Menage   Task Control Grou...
2573
2574
  
  	/* the vfs holds both inode->i_mutex already */
ec64f5154   KAMEZAWA Hiroyuki   cgroup: fix frequ...
2575
  again:
ddbcc7e8e   Paul Menage   Task Control Grou...
2576
  	mutex_lock(&cgroup_mutex);
bd89aabc6   Paul Menage   Control groups: R...
2577
  	if (atomic_read(&cgrp->count) != 0) {
ddbcc7e8e   Paul Menage   Task Control Grou...
2578
2579
2580
  		mutex_unlock(&cgroup_mutex);
  		return -EBUSY;
  	}
bd89aabc6   Paul Menage   Control groups: R...
2581
  	if (!list_empty(&cgrp->children)) {
ddbcc7e8e   Paul Menage   Task Control Grou...
2582
2583
2584
  		mutex_unlock(&cgroup_mutex);
  		return -EBUSY;
  	}
3fa59dfbc   KAMEZAWA Hiroyuki   cgroup: fix poten...
2585
  	mutex_unlock(&cgroup_mutex);
a043e3b2c   Li Zefan   cgroup: fix comments
2586

4fca88c87   KAMEZAWA Hiroyuki   memory cgroup enh...
2587
  	/*
a043e3b2c   Li Zefan   cgroup: fix comments
2588
2589
  	 * Call pre_destroy handlers of subsys. Notify subsystems
  	 * that rmdir() request comes.
4fca88c87   KAMEZAWA Hiroyuki   memory cgroup enh...
2590
  	 */
ec64f5154   KAMEZAWA Hiroyuki   cgroup: fix frequ...
2591
2592
2593
  	ret = cgroup_call_pre_destroy(cgrp);
  	if (ret)
  		return ret;
ddbcc7e8e   Paul Menage   Task Control Grou...
2594

3fa59dfbc   KAMEZAWA Hiroyuki   cgroup: fix poten...
2595
2596
  	mutex_lock(&cgroup_mutex);
  	parent = cgrp->parent;
ec64f5154   KAMEZAWA Hiroyuki   cgroup: fix frequ...
2597
  	if (atomic_read(&cgrp->count) || !list_empty(&cgrp->children)) {
ddbcc7e8e   Paul Menage   Task Control Grou...
2598
2599
2600
  		mutex_unlock(&cgroup_mutex);
  		return -EBUSY;
  	}
ec64f5154   KAMEZAWA Hiroyuki   cgroup: fix frequ...
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
  	/*
  	 * css_put/get is provided for subsys to grab refcnt to css. In typical
  	 * case, subsystem has no reference after pre_destroy(). But, under
  	 * hierarchy management, some *temporal* refcnt can be hold.
  	 * To avoid returning -EBUSY to a user, waitqueue is used. If subsys
  	 * is really busy, it should return -EBUSY at pre_destroy(). wake_up
  	 * is called when css_put() is called and refcnt goes down to 0.
  	 */
  	set_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
  	prepare_to_wait(&cgroup_rmdir_waitq, &wait, TASK_INTERRUPTIBLE);
  
  	if (!cgroup_clear_css_refs(cgrp)) {
  		mutex_unlock(&cgroup_mutex);
  		schedule();
  		finish_wait(&cgroup_rmdir_waitq, &wait);
  		clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
  		if (signal_pending(current))
  			return -EINTR;
  		goto again;
  	}
  	/* NO css_tryget() can success after here. */
  	finish_wait(&cgroup_rmdir_waitq, &wait);
  	clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
ddbcc7e8e   Paul Menage   Task Control Grou...
2624

81a6a5cdd   Paul Menage   Task Control Grou...
2625
  	spin_lock(&release_list_lock);
bd89aabc6   Paul Menage   Control groups: R...
2626
2627
2628
  	set_bit(CGRP_REMOVED, &cgrp->flags);
  	if (!list_empty(&cgrp->release_list))
  		list_del(&cgrp->release_list);
81a6a5cdd   Paul Menage   Task Control Grou...
2629
  	spin_unlock(&release_list_lock);
999cd8a45   Paul Menage   cgroups: add a pe...
2630
2631
2632
  
  	cgroup_lock_hierarchy(cgrp->root);
  	/* delete this cgroup from parent->children */
bd89aabc6   Paul Menage   Control groups: R...
2633
  	list_del(&cgrp->sibling);
999cd8a45   Paul Menage   cgroups: add a pe...
2634
  	cgroup_unlock_hierarchy(cgrp->root);
bd89aabc6   Paul Menage   Control groups: R...
2635
2636
  	spin_lock(&cgrp->dentry->d_lock);
  	d = dget(cgrp->dentry);
ddbcc7e8e   Paul Menage   Task Control Grou...
2637
2638
2639
2640
  	spin_unlock(&d->d_lock);
  
  	cgroup_d_remove_dir(d);
  	dput(d);
ddbcc7e8e   Paul Menage   Task Control Grou...
2641

bd89aabc6   Paul Menage   Control groups: R...
2642
  	set_bit(CGRP_RELEASABLE, &parent->flags);
81a6a5cdd   Paul Menage   Task Control Grou...
2643
  	check_for_release(parent);
ddbcc7e8e   Paul Menage   Task Control Grou...
2644
  	mutex_unlock(&cgroup_mutex);
ddbcc7e8e   Paul Menage   Task Control Grou...
2645
2646
  	return 0;
  }
06a119204   Li Zefan   cgroup: annotate ...
2647
  /*
   * Boot-time registration of one subsystem: attach it to rootnode,
   * create and initialise its state for dummytop, point init_css_set at
   * that state, set up its hierarchy_mutex and mark it active.  A
   * failing ->create() is a BUG, as is finding init_task.tasks
   * non-empty.
   */
  static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
  {
  	struct cgroup_subsys_state *css;

  	printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name);

  	/* Create the top cgroup state for this subsystem */
  	list_add(&ss->sibling, &rootnode.subsys_list);
  	ss->root = &rootnode;
  	css = ss->create(ss, dummytop);
  	/* We don't handle early failures gracefully */
  	BUG_ON(IS_ERR(css));
  	init_cgroup_css(css, ss, dummytop);

  	/* Update the init_css_set to contain a subsys
  	 * pointer to this state - since the subsystem is
  	 * newly registered, all tasks and hence the
  	 * init_css_set is in the subsystem's top cgroup. */
  	init_css_set.subsys[ss->subsys_id] = dummytop->subsys[ss->subsys_id];

  	/* Remember whether any subsystem has a fork or exit callback */
  	need_forkexit_callback |= ss->fork || ss->exit;

  	/* At system boot, before all subsystems have been
  	 * registered, no tasks have been forked, so we don't
  	 * need to invoke fork callbacks here. */
  	BUG_ON(!list_empty(&init_task.tasks));

  	mutex_init(&ss->hierarchy_mutex);
  	lockdep_set_class(&ss->hierarchy_mutex, &ss->subsys_key);
  	ss->active = 1;
  }
  
  /**
a043e3b2c   Li Zefan   cgroup: fix comments
2678
2679
2680
2681
   * cgroup_init_early - cgroup initialization at system boot
   *
   * Initialize cgroups at system boot, and initialize any
   * subsystems that request early init.
ddbcc7e8e   Paul Menage   Task Control Grou...
2682
2683
2684
2685
   */
  int __init cgroup_init_early(void)
  {
  	int i;
  
  	/* Set up the single boot-time css_set */
  	atomic_set(&init_css_set.refcount, 1);
  	INIT_LIST_HEAD(&init_css_set.cg_links);
  	INIT_LIST_HEAD(&init_css_set.tasks);
  	INIT_HLIST_NODE(&init_css_set.hlist);
  	css_set_count = 1;
  
  	/* Set up the root hierarchy and hand init_task the boot css_set */
  	init_cgroup_root(&rootnode);
  	root_count = 1;
  	init_task.cgroups = &init_css_set;
  
  	/* Link init_css_set and rootnode's top cgroup to each other */
  	init_css_set_link.cg = &init_css_set;
  	list_add(&init_css_set_link.cgrp_link_list,
  		 &rootnode.top_cgroup.css_sets);
  	list_add(&init_css_set_link.cg_link_list,
  		 &init_css_set.cg_links);
  
  	for (i = 0; i < CSS_SET_TABLE_SIZE; i++)
  		INIT_HLIST_HEAD(&css_set_table[i]);
  
  	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
  		struct cgroup_subsys *ss = subsys[i];
  
  		/* Sanity-check the statically registered subsystem */
  		BUG_ON(!ss->name);
  		BUG_ON(strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN);
  		BUG_ON(!ss->create);
  		BUG_ON(!ss->destroy);
  		if (ss->subsys_id != i) {
  			/* The id must match the slot in subsys[] */
  			printk(KERN_ERR "cgroup: Subsys %s id == %d\n",
  			       ss->name, ss->subsys_id);
  			BUG();
  		}
  
  		/* The remaining subsystems are initialised by cgroup_init() */
  		if (ss->early_init)
  			cgroup_init_subsys(ss);
  	}
  	return 0;
  }
  
  /**
a043e3b2c   Li Zefan   cgroup: fix comments
2724
2725
2726
2727
   * cgroup_init - cgroup initialization
   *
   * Register cgroup filesystem and /proc file, and initialize
   * any subsystems that didn't request early init.
ddbcc7e8e   Paul Menage   Task Control Grou...
2728
2729
2730
2731
2732
   */
  int __init cgroup_init(void)
  {
  	int err;
  	int i;
  	struct hlist_head *hhead;
  
  	err = bdi_init(&cgroup_backing_dev_info);
  	if (err)
  		return err;
  
  	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
  		struct cgroup_subsys *ss = subsys[i];
  
  		/* Early-init subsystems were handled by cgroup_init_early() */
  		if (!ss->early_init)
  			cgroup_init_subsys(ss);
  
  		if (ss->use_id) {
  			/*
  			 * Do not silently drop a failure here: without a
  			 * root css_id, alloc_css_id() would later copy from
  			 * a NULL parent id.  Bail out before the filesystem
  			 * is registered.
  			 */
  			err = cgroup_subsys_init_idr(ss);
  			if (err)
  				goto out;
  		}
  	}
  
  	/* Add init_css_set to the hash table */
  	hhead = css_set_hash(init_css_set.subsys);
  	hlist_add_head(&init_css_set.hlist, hhead);
  
  	err = register_filesystem(&cgroup_fs_type);
  	if (err < 0)
  		goto out;
  
  	/* Best effort: a missing "cgroups" proc entry is not treated as fatal */
  	proc_create("cgroups", 0, NULL, &proc_cgroupstats_operations);
  
  out:
  	if (err)
  		bdi_destroy(&cgroup_backing_dev_info);
  
  	return err;
  }
b4f48b636   Paul Menage   Task Control Grou...
2759

a424316ca   Paul Menage   Task Control Grou...
2760
2761
2762
2763
2764
2765
  /*
   * proc_cgroup_show()
   *  - Print task's cgroup paths into seq_file, one line for each hierarchy
   *  - Used for /proc/<pid>/cgroup.
   *  - No need to task_lock(tsk) on this tsk->cgroup reference, as it
   *    doesn't really matter if tsk->cgroup changes after we read it,
956db3ca0   Cliff Wickman   hotplug cpu: move...
2766
   *    and we take cgroup_mutex, keeping cgroup_attach_task() from changing it
a424316ca   Paul Menage   Task Control Grou...
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
   *    anyway.  No need to check that tsk->cgroup != NULL, thanks to
   *    the_top_cgroup_hack in cgroup_exit(), which sets an exiting tasks
   *    cgroup to top_cgroup.
   */
  
  /* TODO: Use a proper seq_file iterator */
  static int proc_cgroup_show(struct seq_file *m, void *v)
  {
  	struct pid *pid;
  	struct task_struct *tsk;
  	char *buf;
  	int retval;
  	struct cgroupfs_root *root;
  
  	retval = -ENOMEM;
  	buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
  	if (!buf)
  		goto out;
  
  	retval = -ESRCH;
  	pid = m->private;
  	tsk = get_pid_task(pid, PIDTYPE_PID);
  	if (!tsk)
  		goto out_free;
  
  	retval = 0;
  
  	mutex_lock(&cgroup_mutex);
e5f6a8609   Li Zefan   cgroups: make roo...
2795
  	for_each_active_root(root) {
a424316ca   Paul Menage   Task Control Grou...
2796
  		struct cgroup_subsys *ss;
bd89aabc6   Paul Menage   Control groups: R...
2797
  		struct cgroup *cgrp;
a424316ca   Paul Menage   Task Control Grou...
2798
2799
  		int subsys_id;
  		int count = 0;
b6c3006d2   Paul Menage   cgroups: include ...
2800
  		seq_printf(m, "%lu:", root->subsys_bits);
a424316ca   Paul Menage   Task Control Grou...
2801
2802
2803
2804
  		for_each_subsys(root, ss)
  			seq_printf(m, "%s%s", count++ ? "," : "", ss->name);
  		seq_putc(m, ':');
  		get_first_subsys(&root->top_cgroup, NULL, &subsys_id);
bd89aabc6   Paul Menage   Control groups: R...
2805
2806
  		cgrp = task_cgroup(tsk, subsys_id);
  		retval = cgroup_path(cgrp, buf, PAGE_SIZE);
a424316ca   Paul Menage   Task Control Grou...
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
  		if (retval < 0)
  			goto out_unlock;
  		seq_puts(m, buf);
  		seq_putc(m, '
  ');
  	}
  
  out_unlock:
  	mutex_unlock(&cgroup_mutex);
  	put_task_struct(tsk);
  out_free:
  	kfree(buf);
  out:
  	return retval;
  }
  
  /*
   * Open handler for the per-task cgroup file: binds proc_cgroup_show() to
   * the file and passes the inode's struct pid through as seq_file private
   * data.
   */
  static int cgroup_open(struct inode *inode, struct file *file)
  {
  	return single_open(file, proc_cgroup_show, PROC_I(inode)->pid);
  }
  
  /*
   * File operations for the per-task cgroup listing produced by
   * proc_cgroup_show(); everything except ->open is the stock
   * single-record seq_file implementation.
   */
  struct file_operations proc_cgroup_operations = {
  	.open		= cgroup_open,
  	.read		= seq_read,
  	.llseek		= seq_lseek,
  	.release	= single_release,
  };
  
  /* Display information about each subsystem and each hierarchy */
  static int proc_cgroupstats_show(struct seq_file *m, void *v)
  {
  	int i;
a424316ca   Paul Menage   Task Control Grou...
2840

8bab8dded   Paul Menage   cgroups: add cgro...
2841
2842
  	seq_puts(m, "#subsys_name\thierarchy\tnum_cgroups\tenabled
  ");
a424316ca   Paul Menage   Task Control Grou...
2843
  	mutex_lock(&cgroup_mutex);
a424316ca   Paul Menage   Task Control Grou...
2844
2845
  	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
  		struct cgroup_subsys *ss = subsys[i];
8bab8dded   Paul Menage   cgroups: add cgro...
2846
2847
  		seq_printf(m, "%s\t%lu\t%d\t%d
  ",
817929ec2   Paul Menage   Task Control Grou...
2848
  			   ss->name, ss->root->subsys_bits,
8bab8dded   Paul Menage   cgroups: add cgro...
2849
  			   ss->root->number_of_cgroups, !ss->disabled);
a424316ca   Paul Menage   Task Control Grou...
2850
2851
2852
2853
2854
2855
2856
  	}
  	mutex_unlock(&cgroup_mutex);
  	return 0;
  }
  
  static int cgroupstats_open(struct inode *inode, struct file *file)
  {
9dce07f1a   Al Viro   NULL noise: fs/*,...
2857
  	return single_open(file, proc_cgroupstats_show, NULL);
a424316ca   Paul Menage   Task Control Grou...
2858
2859
2860
2861
2862
2863
2864
2865
  }
  
  /*
   * File operations for the "cgroups" proc entry created in cgroup_init();
   * content comes from proc_cgroupstats_show() via cgroupstats_open().
   */
  static struct file_operations proc_cgroupstats_operations = {
  	.open = cgroupstats_open,
  	.read = seq_read,
  	.llseek = seq_lseek,
  	.release = single_release,
  };
b4f48b636   Paul Menage   Task Control Grou...
2866
2867
  /**
   * cgroup_fork - attach newly forked task to its parents cgroup.
a043e3b2c   Li Zefan   cgroup: fix comments
2868
   * @child: pointer to task_struct of the newly forked child process.
b4f48b636   Paul Menage   Task Control Grou...
2869
2870
2871
2872
2873
2874
   *
   * Description: A task inherits its parent's cgroup at fork().
   *
   * A pointer to the shared css_set was automatically copied in
   * fork.c by dup_task_struct().  However, we ignore that copy, since
   * it was not made under the protection of RCU or cgroup_mutex, so
956db3ca0   Cliff Wickman   hotplug cpu: move...
2875
   * might no longer be a valid cgroup pointer.  cgroup_attach_task() might
817929ec2   Paul Menage   Task Control Grou...
2876
2877
   * have already changed current->cgroups, allowing the previously
   * referenced cgroup group to be removed and freed.
b4f48b636   Paul Menage   Task Control Grou...
2878
2879
2880
2881
2882
2883
   *
   * At the point that cgroup_fork() is called, 'current' is the parent
   * task, and the passed argument 'child' points to the child task.
   */
  void cgroup_fork(struct task_struct *child)
  {
  	struct css_set *parent_cg;
  
  	/*
  	 * Re-read the parent's css_set under task_lock and take a
  	 * reference on it before handing it to the child.
  	 */
  	task_lock(current);
  	parent_cg = current->cgroups;
  	child->cgroups = parent_cg;
  	get_css_set(parent_cg);
  	task_unlock(current);
  
  	/* The child starts out unlinked from any css_set task list */
  	INIT_LIST_HEAD(&child->cg_list);
  }
  
  /**
a043e3b2c   Li Zefan   cgroup: fix comments
2892
2893
2894
2895
2896
2897
   * cgroup_fork_callbacks - run fork callbacks
   * @child: the new task
   *
   * Called on a new task very soon before adding it to the
   * tasklist. No need to take any locks since no-one can
   * be operating on this task.
b4f48b636   Paul Menage   Task Control Grou...
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
   */
  void cgroup_fork_callbacks(struct task_struct *child)
  {
  	int idx;
  
  	/* Fast path: no subsystem registered a fork or exit hook */
  	if (!need_forkexit_callback)
  		return;
  
  	for (idx = 0; idx < CGROUP_SUBSYS_COUNT; idx++) {
  		struct cgroup_subsys *ss = subsys[idx];
  
  		if (ss->fork)
  			ss->fork(ss, child);
  	}
  }
  
  /**
a043e3b2c   Li Zefan   cgroup: fix comments
2912
2913
2914
2915
2916
2917
2918
2919
   * cgroup_post_fork - called on a new task after adding it to the task list
   * @child: the task in question
   *
   * Adds the task to the list running through its css_set if necessary.
   * Has to be after the task is visible on the task list in case we race
   * with the first call to cgroup_iter_start() - to guarantee that the
   * new task ends up on its list.
   */
817929ec2   Paul Menage   Task Control Grou...
2920
2921
2922
2923
  void cgroup_post_fork(struct task_struct *child)
  {
  	if (use_task_css_set_links) {
  		write_lock(&css_set_lock);
b12b533fa   Lai Jiangshan   cgroups: add lock...
2924
  		task_lock(child);
817929ec2   Paul Menage   Task Control Grou...
2925
2926
  		if (list_empty(&child->cg_list))
  			list_add(&child->cg_list, &child->cgroups->tasks);
b12b533fa   Lai Jiangshan   cgroups: add lock...
2927
  		task_unlock(child);
817929ec2   Paul Menage   Task Control Grou...
2928
2929
2930
2931
  		write_unlock(&css_set_lock);
  	}
  }
  /**
b4f48b636   Paul Menage   Task Control Grou...
2932
2933
   * cgroup_exit - detach cgroup from exiting task
   * @tsk: pointer to task_struct of exiting process
a043e3b2c   Li Zefan   cgroup: fix comments
2934
   * @run_callbacks: run exit callbacks?
b4f48b636   Paul Menage   Task Control Grou...
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
   *
   * Description: Detach cgroup from @tsk and release it.
   *
   * Note that cgroups marked notify_on_release force every task in
   * them to take the global cgroup_mutex mutex when exiting.
   * This could impact scaling on very large systems.  Be reluctant to
   * use notify_on_release cgroups where very high task exit scaling
   * is required on large systems.
   *
   * the_top_cgroup_hack:
   *
   *    Set the exiting tasks cgroup to the root cgroup (top_cgroup).
   *
   *    We call cgroup_exit() while the task is still competent to
   *    handle notify_on_release(), then leave the task attached to the
   *    root cgroup in each hierarchy for the remainder of its exit.
   *
   *    To do this properly, we would increment the reference count on
   *    top_cgroup, and near the very end of the kernel/exit.c do_exit()
   *    code we would add a second cgroup function call, to drop that
   *    reference.  This would just create an unnecessary hot spot on
   *    the top_cgroup reference count, to no avail.
   *
   *    Normally, holding a reference to a cgroup without bumping its
   *    count is unsafe.   The cgroup could go away, or someone could
   *    attach us to a different cgroup, decrementing the count on
   *    the first cgroup that we never incremented.  But in this case,
   *    top_cgroup isn't going away, and either task has PF_EXITING set,
956db3ca0   Cliff Wickman   hotplug cpu: move...
2963
2964
   *    which wards off any cgroup_attach_task() attempts, or task is a failed
   *    fork, never visible to cgroup_attach_task.
b4f48b636   Paul Menage   Task Control Grou...
2965
2966
2967
2968
   */
  void cgroup_exit(struct task_struct *tsk, int run_callbacks)
  {
  	int i;
817929ec2   Paul Menage   Task Control Grou...
2969
  	struct css_set *cg;
b4f48b636   Paul Menage   Task Control Grou...
2970
2971
2972
2973
2974
2975
2976
2977
  
  	if (run_callbacks && need_forkexit_callback) {
  		for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
  			struct cgroup_subsys *ss = subsys[i];
  			if (ss->exit)
  				ss->exit(ss, tsk);
  		}
  	}
817929ec2   Paul Menage   Task Control Grou...
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
  
  	/*
  	 * Unlink from the css_set task list if necessary.
  	 * Optimistically check cg_list before taking
  	 * css_set_lock
  	 */
  	if (!list_empty(&tsk->cg_list)) {
  		write_lock(&css_set_lock);
  		if (!list_empty(&tsk->cg_list))
  			list_del(&tsk->cg_list);
  		write_unlock(&css_set_lock);
  	}
b4f48b636   Paul Menage   Task Control Grou...
2990
2991
  	/* Reassign the task to the init_css_set. */
  	task_lock(tsk);
817929ec2   Paul Menage   Task Control Grou...
2992
2993
  	cg = tsk->cgroups;
  	tsk->cgroups = &init_css_set;
b4f48b636   Paul Menage   Task Control Grou...
2994
  	task_unlock(tsk);
817929ec2   Paul Menage   Task Control Grou...
2995
  	if (cg)
81a6a5cdd   Paul Menage   Task Control Grou...
2996
  		put_css_set_taskexit(cg);
b4f48b636   Paul Menage   Task Control Grou...
2997
  }
697f41610   Paul Menage   Task Control Grou...
2998
2999
  
  /**
a043e3b2c   Li Zefan   cgroup: fix comments
3000
3001
3002
   * cgroup_clone - clone the cgroup the given subsystem is attached to
   * @tsk: the task to be moved
   * @subsys: the given subsystem
e885dcde7   Serge E. Hallyn   cgroup_clone: use...
3003
   * @nodename: the name for the new cgroup
a043e3b2c   Li Zefan   cgroup: fix comments
3004
3005
3006
3007
   *
   * Duplicate the current cgroup in the hierarchy that the given
   * subsystem is attached to, and move this task into the new
   * child.
697f41610   Paul Menage   Task Control Grou...
3008
   */
e885dcde7   Serge E. Hallyn   cgroup_clone: use...
3009
3010
  int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys,
  							char *nodename)
697f41610   Paul Menage   Task Control Grou...
3011
3012
3013
  {
  	struct dentry *dentry;
  	int ret = 0;
697f41610   Paul Menage   Task Control Grou...
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
  	struct cgroup *parent, *child;
  	struct inode *inode;
  	struct css_set *cg;
  	struct cgroupfs_root *root;
  	struct cgroup_subsys *ss;
  
  	/* We shouldn't be called by an unregistered subsystem */
  	BUG_ON(!subsys->active);
  
  	/* First figure out what hierarchy and cgroup we're dealing
  	 * with, and pin them so we can drop cgroup_mutex */
  	mutex_lock(&cgroup_mutex);
   again:
  	root = subsys->root;
  	if (root == &rootnode) {
697f41610   Paul Menage   Task Control Grou...
3029
3030
3031
  		mutex_unlock(&cgroup_mutex);
  		return 0;
  	}
697f41610   Paul Menage   Task Control Grou...
3032

697f41610   Paul Menage   Task Control Grou...
3033
  	/* Pin the hierarchy */
1404f0656   Li Zefan   cgroups: fix lock...
3034
  	if (!atomic_inc_not_zero(&root->sb->s_active)) {
7b574b7b0   Li Zefan   cgroups: fix a ra...
3035
3036
3037
3038
  		/* We race with the final deactivate_super() */
  		mutex_unlock(&cgroup_mutex);
  		return 0;
  	}
697f41610   Paul Menage   Task Control Grou...
3039

817929ec2   Paul Menage   Task Control Grou...
3040
  	/* Keep the cgroup alive */
1404f0656   Li Zefan   cgroups: fix lock...
3041
3042
3043
  	task_lock(tsk);
  	parent = task_cgroup(tsk, subsys->subsys_id);
  	cg = tsk->cgroups;
817929ec2   Paul Menage   Task Control Grou...
3044
  	get_css_set(cg);
104cbd553   Lai Jiangshan   cgroups: use task...
3045
  	task_unlock(tsk);
1404f0656   Li Zefan   cgroups: fix lock...
3046

697f41610   Paul Menage   Task Control Grou...
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
  	mutex_unlock(&cgroup_mutex);
  
  	/* Now do the VFS work to create a cgroup */
  	inode = parent->dentry->d_inode;
  
  	/* Hold the parent directory mutex across this operation to
  	 * stop anyone else deleting the new cgroup */
  	mutex_lock(&inode->i_mutex);
  	dentry = lookup_one_len(nodename, parent->dentry, strlen(nodename));
  	if (IS_ERR(dentry)) {
  		printk(KERN_INFO
cfe36bde5   Diego Calleja   Improve cgroup pr...
3058
3059
  		       "cgroup: Couldn't allocate dentry for %s: %ld
  ", nodename,
697f41610   Paul Menage   Task Control Grou...
3060
3061
3062
3063
3064
3065
  		       PTR_ERR(dentry));
  		ret = PTR_ERR(dentry);
  		goto out_release;
  	}
  
  	/* Create the cgroup directory, which also creates the cgroup */
75139b827   Li Zefan   cgroups: remove s...
3066
  	ret = vfs_mkdir(inode, dentry, 0755);
bd89aabc6   Paul Menage   Control groups: R...
3067
  	child = __d_cgrp(dentry);
697f41610   Paul Menage   Task Control Grou...
3068
3069
3070
3071
3072
3073
3074
3075
  	dput(dentry);
  	if (ret) {
  		printk(KERN_INFO
  		       "Failed to create cgroup %s: %d
  ", nodename,
  		       ret);
  		goto out_release;
  	}
697f41610   Paul Menage   Task Control Grou...
3076
3077
3078
3079
3080
3081
3082
3083
  	/* The cgroup now exists. Retake cgroup_mutex and check
  	 * that we're still in the same state that we thought we
  	 * were. */
  	mutex_lock(&cgroup_mutex);
  	if ((root != subsys->root) ||
  	    (parent != task_cgroup(tsk, subsys->subsys_id))) {
  		/* Aargh, we raced ... */
  		mutex_unlock(&inode->i_mutex);
817929ec2   Paul Menage   Task Control Grou...
3084
  		put_css_set(cg);
697f41610   Paul Menage   Task Control Grou...
3085

1404f0656   Li Zefan   cgroups: fix lock...
3086
  		deactivate_super(root->sb);
697f41610   Paul Menage   Task Control Grou...
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
  		/* The cgroup is still accessible in the VFS, but
  		 * we're not going to try to rmdir() it at this
  		 * point. */
  		printk(KERN_INFO
  		       "Race in cgroup_clone() - leaking cgroup %s
  ",
  		       nodename);
  		goto again;
  	}
  
  	/* do any required auto-setup */
  	for_each_subsys(root, ss) {
  		if (ss->post_clone)
  			ss->post_clone(ss, child);
  	}
  
  	/* All seems fine. Finish by moving the task into the new cgroup */
956db3ca0   Cliff Wickman   hotplug cpu: move...
3104
  	ret = cgroup_attach_task(child, tsk);
697f41610   Paul Menage   Task Control Grou...
3105
3106
3107
3108
  	mutex_unlock(&cgroup_mutex);
  
   out_release:
  	mutex_unlock(&inode->i_mutex);
81a6a5cdd   Paul Menage   Task Control Grou...
3109
3110
  
  	mutex_lock(&cgroup_mutex);
817929ec2   Paul Menage   Task Control Grou...
3111
  	put_css_set(cg);
81a6a5cdd   Paul Menage   Task Control Grou...
3112
  	mutex_unlock(&cgroup_mutex);
1404f0656   Li Zefan   cgroups: fix lock...
3113
  	deactivate_super(root->sb);
697f41610   Paul Menage   Task Control Grou...
3114
3115
  	return ret;
  }
a043e3b2c   Li Zefan   cgroup: fix comments
3116
  /**
313e924c0   Grzegorz Nosek   cgroups: relax ns...
3117
   * cgroup_is_descendant - see if @cgrp is a descendant of @task's cgrp
a043e3b2c   Li Zefan   cgroup: fix comments
3118
   * @cgrp: the cgroup in question
313e924c0   Grzegorz Nosek   cgroups: relax ns...
3119
   * @task: the task in question
a043e3b2c   Li Zefan   cgroup: fix comments
3120
   *
313e924c0   Grzegorz Nosek   cgroups: relax ns...
3121
3122
   * See if @cgrp is a descendant of @task's cgroup in the appropriate
   * hierarchy.
697f41610   Paul Menage   Task Control Grou...
3123
3124
3125
3126
3127
3128
   *
   * If we are sending in dummytop, then presumably we are creating
   * the top cgroup in the subsystem.
   *
   * Called only by the ns (nsproxy) cgroup.
   */
313e924c0   Grzegorz Nosek   cgroups: relax ns...
3129
  int cgroup_is_descendant(const struct cgroup *cgrp, struct task_struct *task)
697f41610   Paul Menage   Task Control Grou...
3130
3131
3132
3133
  {
  	int ret;
  	struct cgroup *target;
  	int subsys_id;
bd89aabc6   Paul Menage   Control groups: R...
3134
  	if (cgrp == dummytop)
697f41610   Paul Menage   Task Control Grou...
3135
  		return 1;
bd89aabc6   Paul Menage   Control groups: R...
3136
  	get_first_subsys(cgrp, NULL, &subsys_id);
313e924c0   Grzegorz Nosek   cgroups: relax ns...
3137
  	target = task_cgroup(task, subsys_id);
bd89aabc6   Paul Menage   Control groups: R...
3138
3139
3140
  	while (cgrp != target && cgrp!= cgrp->top_cgroup)
  		cgrp = cgrp->parent;
  	ret = (cgrp == target);
697f41610   Paul Menage   Task Control Grou...
3141
3142
  	return ret;
  }
81a6a5cdd   Paul Menage   Task Control Grou...
3143

bd89aabc6   Paul Menage   Control groups: R...
3144
  static void check_for_release(struct cgroup *cgrp)
81a6a5cdd   Paul Menage   Task Control Grou...
3145
3146
3147
  {
  	/* All of these checks rely on RCU to keep the cgroup
  	 * structure alive */
bd89aabc6   Paul Menage   Control groups: R...
3148
3149
  	if (cgroup_is_releasable(cgrp) && !atomic_read(&cgrp->count)
  	    && list_empty(&cgrp->children) && !cgroup_has_css_refs(cgrp)) {
81a6a5cdd   Paul Menage   Task Control Grou...
3150
3151
3152
3153
3154
  		/* Control Group is currently removeable. If it's not
  		 * already queued for a userspace notification, queue
  		 * it now */
  		int need_schedule_work = 0;
  		spin_lock(&release_list_lock);
bd89aabc6   Paul Menage   Control groups: R...
3155
3156
3157
  		if (!cgroup_is_removed(cgrp) &&
  		    list_empty(&cgrp->release_list)) {
  			list_add(&cgrp->release_list, &release_list);
81a6a5cdd   Paul Menage   Task Control Grou...
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
  			need_schedule_work = 1;
  		}
  		spin_unlock(&release_list_lock);
  		if (need_schedule_work)
  			schedule_work(&release_agent_work);
  	}
  }
  
  void __css_put(struct cgroup_subsys_state *css)
  {
bd89aabc6   Paul Menage   Control groups: R...
3168
  	struct cgroup *cgrp = css->cgroup;
81a6a5cdd   Paul Menage   Task Control Grou...
3169
  	rcu_read_lock();
ec64f5154   KAMEZAWA Hiroyuki   cgroup: fix frequ...
3170
3171
3172
3173
3174
3175
  	if (atomic_dec_return(&css->refcnt) == 1) {
  		if (notify_on_release(cgrp)) {
  			set_bit(CGRP_RELEASABLE, &cgrp->flags);
  			check_for_release(cgrp);
  		}
  		cgroup_wakeup_rmdir_waiters(cgrp);
81a6a5cdd   Paul Menage   Task Control Grou...
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
  	}
  	rcu_read_unlock();
  }
  
  /*
   * Notify userspace when a cgroup is released, by running the
   * configured release agent with the name of the cgroup (path
   * relative to the root of cgroup file system) as the argument.
   *
   * Most likely, this user command will try to rmdir this cgroup.
   *
   * This races with the possibility that some other task will be
   * attached to this cgroup before it is removed, or that some other
   * user task will 'mkdir' a child cgroup of this cgroup.  That's ok.
   * The presumed 'rmdir' will fail quietly if this cgroup is no longer
   * unused, and this cgroup will be reprieved from its death sentence,
   * to continue to serve a useful existence.  Next time it's released,
   * we will get notified again, if it still has 'notify_on_release' set.
   *
   * The final arg to call_usermodehelper() is UMH_WAIT_EXEC, which
   * means only wait until the task is successfully execve()'d.  The
   * separate release agent task is forked by call_usermodehelper(),
   * then control in this thread returns here, without waiting for the
   * release agent task.  We don't bother to wait because the caller of
   * this routine has no use for the exit status of the release agent
   * task, so no sense holding our caller up for that.
81a6a5cdd   Paul Menage   Task Control Grou...
3202
   */
81a6a5cdd   Paul Menage   Task Control Grou...
3203
3204
3205
3206
3207
3208
3209
3210
  static void cgroup_release_agent(struct work_struct *work)
  {
  	BUG_ON(work != &release_agent_work);
  	mutex_lock(&cgroup_mutex);
  	spin_lock(&release_list_lock);
  	while (!list_empty(&release_list)) {
  		char *argv[3], *envp[3];
  		int i;
e788e066c   Paul Menage   cgroup files: mov...
3211
  		char *pathbuf = NULL, *agentbuf = NULL;
bd89aabc6   Paul Menage   Control groups: R...
3212
  		struct cgroup *cgrp = list_entry(release_list.next,
81a6a5cdd   Paul Menage   Task Control Grou...
3213
3214
  						    struct cgroup,
  						    release_list);
bd89aabc6   Paul Menage   Control groups: R...
3215
  		list_del_init(&cgrp->release_list);
81a6a5cdd   Paul Menage   Task Control Grou...
3216
3217
  		spin_unlock(&release_list_lock);
  		pathbuf = kmalloc(PAGE_SIZE, GFP_KERNEL);
e788e066c   Paul Menage   cgroup files: mov...
3218
3219
3220
3221
3222
3223
3224
  		if (!pathbuf)
  			goto continue_free;
  		if (cgroup_path(cgrp, pathbuf, PAGE_SIZE) < 0)
  			goto continue_free;
  		agentbuf = kstrdup(cgrp->root->release_agent_path, GFP_KERNEL);
  		if (!agentbuf)
  			goto continue_free;
81a6a5cdd   Paul Menage   Task Control Grou...
3225
3226
  
  		i = 0;
e788e066c   Paul Menage   cgroup files: mov...
3227
3228
  		argv[i++] = agentbuf;
  		argv[i++] = pathbuf;
81a6a5cdd   Paul Menage   Task Control Grou...
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
  		argv[i] = NULL;
  
  		i = 0;
  		/* minimal command environment */
  		envp[i++] = "HOME=/";
  		envp[i++] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin";
  		envp[i] = NULL;
  
  		/* Drop the lock while we invoke the usermode helper,
  		 * since the exec could involve hitting disk and hence
  		 * be a slow process */
  		mutex_unlock(&cgroup_mutex);
  		call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC);
81a6a5cdd   Paul Menage   Task Control Grou...
3242
  		mutex_lock(&cgroup_mutex);
e788e066c   Paul Menage   cgroup files: mov...
3243
3244
3245
   continue_free:
  		kfree(pathbuf);
  		kfree(agentbuf);
81a6a5cdd   Paul Menage   Task Control Grou...
3246
3247
3248
3249
3250
  		spin_lock(&release_list_lock);
  	}
  	spin_unlock(&release_list_lock);
  	mutex_unlock(&cgroup_mutex);
  }
8bab8dded   Paul Menage   cgroups: add cgro...
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
  
  /*
   * cgroup_disable - handler for the "cgroup_disable=" boot parameter
   * @str: comma-separated list of subsystem names
   *
   * Sets ->disabled on every subsystem whose name appears in @str; empty
   * tokens and unknown names are ignored.  Always returns 1.
   */
  static int __init cgroup_disable(char *str)
  {
  	char *token;
  
  	while ((token = strsep(&str, ",")) != NULL) {
  		int i;
  
  		/* Skip empty entries such as "a,,b" */
  		if (!*token)
  			continue;
  
  		for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
  			struct cgroup_subsys *ss = subsys[i];
  
  			if (strcmp(token, ss->name))
  				continue;
  
  			ss->disabled = 1;
  			printk(KERN_INFO "Disabling %s control group"
  				" subsystem\n", ss->name);
  			break;
  		}
  	}
  	return 1;
  }
  __setup("cgroup_disable=", cgroup_disable);
38460b48d   KAMEZAWA Hiroyuki   cgroup: CSS ID su...
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
  
  /*
   * Functions for CSS ID.
   */
  
  /*
   * To get an ID other than 0, this should be called when !cgroup_is_removed().
   */
  unsigned short css_id(struct cgroup_subsys_state *css)
  {
  	struct css_id *cssid = rcu_dereference(css->id);
  
  	/* 0 is reported when no css_id is attached */
  	return cssid ? cssid->id : 0;
  }
  
  /*
   * css_depth - depth recorded in @css's css_id, or 0 when @css has no
   * css_id attached.
   */
  unsigned short css_depth(struct cgroup_subsys_state *css)
  {
  	struct css_id *cssid = rcu_dereference(css->id);
  
  	return cssid ? cssid->depth : 0;
  }
  
  bool css_is_ancestor(struct cgroup_subsys_state *child,
0b7f569e4   KAMEZAWA Hiroyuki   memcg: fix OOM ki...
3303
  		    const struct cgroup_subsys_state *root)
38460b48d   KAMEZAWA Hiroyuki   cgroup: CSS ID su...
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
  {
  	struct css_id *child_id = rcu_dereference(child->id);
  	struct css_id *root_id = rcu_dereference(root->id);
  
  	if (!child_id || !root_id || (child_id->depth < root_id->depth))
  		return false;
  	return child_id->stack[root_id->depth] == root_id->id;
  }
  
  /* RCU callback queued by free_css_id(): frees the css_id embedding @head */
  static void __free_css_id_cb(struct rcu_head *head)
  {
  	kfree(container_of(head, struct css_id, rcu_head));
  }
  
  /*
   * free_css_id - detach and release the css_id of @css
   * @ss: subsystem owning the idr the id was allocated from
   * @css: the css whose id is released
   *
   * Clears the pointers between @css and its id, removes the id from
   * @ss's idr and frees the structure through call_rcu().
   */
  void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css)
  {
  	struct css_id *cssid = css->id;
  
  	/* Can be NULL when called before the css_id was initialised */
  	if (!cssid)
  		return;
  
  	BUG_ON(!ss->use_id);
  
  	/* Break both links before the id number is returned to the idr */
  	rcu_assign_pointer(cssid->css, NULL);
  	rcu_assign_pointer(css->id, NULL);
  
  	spin_lock(&ss->id_lock);
  	idr_remove(&ss->idr, cssid->id);
  	spin_unlock(&ss->id_lock);
  
  	call_rcu(&cssid->rcu_head, __free_css_id_cb);
  }
  
  /*
   * This is called by init or create(). Then, calls to this function are
   * always serialized (By cgroup_mutex() at create()).
   */
  
  static struct css_id *get_new_cssid(struct cgroup_subsys *ss, int depth)
  {
  	struct css_id *cssid;
  	int new_id, err, size;
  
  	BUG_ON(!ss->use_id);
  
  	/* Header plus one stack slot per level, 0..depth inclusive */
  	size = sizeof(*cssid) + sizeof(unsigned short) * (depth + 1);
  	cssid = kzalloc(size, GFP_KERNEL);
  	if (!cssid)
  		return ERR_PTR(-ENOMEM);
  
  	/* Preload the idr outside the spinlock */
  	if (unlikely(!idr_pre_get(&ss->idr, GFP_KERNEL))) {
  		err = -ENOMEM;
  		goto fail;
  	}
  
  	/* Don't use 0: ids are allocated starting from 1 */
  	spin_lock(&ss->id_lock);
  	err = idr_get_new_above(&ss->idr, cssid, 1, &new_id);
  	spin_unlock(&ss->id_lock);
  
  	/* Any allocation failure is reported as "no space left" */
  	if (err) {
  		err = -ENOSPC;
  		goto fail;
  	}
  
  	/* An id beyond CSS_ID_MAX is handed back and treated as failure */
  	if (new_id > CSS_ID_MAX) {
  		err = -ENOSPC;
  		spin_lock(&ss->id_lock);
  		idr_remove(&ss->idr, new_id);
  		spin_unlock(&ss->id_lock);
  		goto fail;
  	}
  
  	cssid->id = new_id;
  	cssid->depth = depth;
  	return cssid;
  
  fail:
  	kfree(cssid);
  	return ERR_PTR(err);
  }
  
  /*
   * Set up the id allocator of @ss and give the subsystem's root css
   * (taken from init_css_set) its css_id at depth 0.
   *
   * Returns 0 on success or the negative errno from get_new_cssid().
   */
  static int __init cgroup_subsys_init_idr(struct cgroup_subsys *ss)
  {
  	struct cgroup_subsys_state *root_css;
  	struct css_id *root_id;

  	spin_lock_init(&ss->id_lock);
  	idr_init(&ss->idr);

  	root_id = get_new_cssid(ss, 0);
  	if (IS_ERR(root_id))
  		return PTR_ERR(root_id);

  	/* At depth 0 the stack consists of the root's own id only. */
  	root_id->stack[0] = root_id->id;

  	/* Link the id and the root css to each other. */
  	root_css = init_css_set.subsys[ss->subsys_id];
  	root_id->css = root_css;
  	root_css->id = root_id;

  	return 0;
  }
  
  /*
   * alloc_css_id - create the css_id for @child's state in subsystem @ss.
   * @ss: subsystem the id belongs to.
   * @parent: cgroup whose css_id supplies the leading stack entries.
   * @child: cgroup whose css receives the new id.
   *
   * The new id sits one level deeper than the parent's.  Its stack is the
   * parent's stack followed by the child's own id number.
   *
   * Returns 0 on success or the negative errno from get_new_cssid().
   */
  static int alloc_css_id(struct cgroup_subsys *ss, struct cgroup *parent,
  			struct cgroup *child)
  {
  	struct cgroup_subsys_state *pcss = parent->subsys[ss->subsys_id];
  	struct cgroup_subsys_state *ccss = child->subsys[ss->subsys_id];
  	int depth = css_depth(pcss) + 1;
  	struct css_id *pid = pcss->id;
  	struct css_id *cid;
  	int level = 0;

  	cid = get_new_cssid(ss, depth);
  	if (IS_ERR(cid))
  		return PTR_ERR(cid);

  	/* Inherit every ancestor id from the parent ... */
  	while (level < depth) {
  		cid->stack[level] = pid->stack[level];
  		level++;
  	}
  	/* ... and terminate the path with our own id. */
  	cid->stack[depth] = cid->id;

  	/*
  	 * cid->css is deliberately left unset here; it is filled in once
  	 * the cgroup becomes available, see cgroup_populate_dir().
  	 */
  	rcu_assign_pointer(ccss->id, cid);

  	return 0;
  }
  
  /**
   * css_lookup - find the css registered under an id
   * @ss: cgroup subsystem whose idr is searched (must have use_id set).
   * @id: id number to look up.
   *
   * Returns the cgroup_subsys_state currently attached to @id, or NULL
   * when the id is not in the idr or has no css attached.  Should be
   * called under rcu_read_lock().
   */
  struct cgroup_subsys_state *css_lookup(struct cgroup_subsys *ss, int id)
  {
  	struct css_id *entry;

  	BUG_ON(!ss->use_id);

  	entry = idr_find(&ss->idr, id);

  	return unlikely(!entry) ? NULL : rcu_dereference(entry->css);
  }
  
  /**
   * css_get_next - find the next css below a given root
   * @ss: pointer to the subsystem (must have use_id set).
   * @id: position at which the scan of the idr starts.
   * @root: css whose subtree is searched.
   * @foundid: set to the id of the returned css; not written when NULL
   *           is returned.
   *
   * Walks the subsystem idr from @id upwards and returns the first entry
   * that has @root's id at @root's depth in its stack and a css attached.
   * Must be called under rcu_read_lock().  Returns NULL when css_id(@root)
   * is 0 or when the idr has no further matching entry.
   */
  struct cgroup_subsys_state *
  css_get_next(struct cgroup_subsys *ss, int id,
  	     struct cgroup_subsys_state *root, int *foundid)
  {
  	struct cgroup_subsys_state *found;
  	struct css_id *entry;
  	int pos;
  	int rootid = css_id(root);
  	int depth = css_depth(root);

  	if (!rootid)
  		return NULL;

  	BUG_ON(!ss->use_id);

  	/* Each pass that does not return resumes at the id after 'pos'. */
  	for (pos = id; ; pos++) {
  		/*
  		 * Fetch the next populated slot of the idr; idr_get_next()
  		 * updates 'pos' to the id of the entry it returns.
  		 */
  		spin_lock(&ss->id_lock);
  		entry = idr_get_next(&ss->idr, &pos);
  		spin_unlock(&ss->id_lock);

  		/* Ran off the end of the idr. */
  		if (!entry)
  			return NULL;

  		/* Skip entries that are not under 'root'. */
  		if (entry->depth < depth || entry->stack[depth] != rootid)
  			continue;

  		/* An id whose css pointer is NULL is skipped too. */
  		found = rcu_dereference(entry->css);
  		if (found) {
  			*foundid = pos;
  			return found;
  		}
  	}
  }