kernel/cgroup.c
  /*
   *  Generic process-grouping system.
   *
   *  Based originally on the cpuset system, extracted by Paul Menage
   *  Copyright (C) 2006 Google, Inc
   *
   *  Notifications support
   *  Copyright (C) 2009 Nokia Corporation
   *  Author: Kirill A. Shutemov
   *
   *  Copyright notices from the original cpuset code:
   *  --------------------------------------------------
   *  Copyright (C) 2003 BULL SA.
   *  Copyright (C) 2004-2006 Silicon Graphics, Inc.
   *
   *  Portions derived from Patrick Mochel's sysfs code.
   *  sysfs is Copyright (c) 2001-3 Patrick Mochel
   *
   *  2003-10-10 Written by Simon Derr.
   *  2003-10-22 Updates by Stephen Hemminger.
   *  2004 May-July Rework by Paul Jackson.
   *  ---------------------------------------------------
   *
   *  This file is subject to the terms and conditions of the GNU General Public
   *  License.  See the file COPYING in the main directory of the Linux
   *  distribution for more details.
   */
  
  #include <linux/cgroup.h>
  #include <linux/cred.h>
  #include <linux/ctype.h>
  #include <linux/errno.h>
  #include <linux/fs.h>
  #include <linux/init_task.h>
  #include <linux/kernel.h>
  #include <linux/list.h>
  #include <linux/mm.h>
  #include <linux/mutex.h>
  #include <linux/mount.h>
  #include <linux/pagemap.h>
  #include <linux/proc_fs.h>
  #include <linux/rcupdate.h>
  #include <linux/sched.h>
  #include <linux/backing-dev.h>
  #include <linux/seq_file.h>
  #include <linux/slab.h>
  #include <linux/magic.h>
  #include <linux/spinlock.h>
  #include <linux/string.h>
  #include <linux/sort.h>
  #include <linux/kmod.h>
  #include <linux/module.h>
  #include <linux/delayacct.h>
  #include <linux/cgroupstats.h>
  #include <linux/hash.h>
  #include <linux/namei.h>
  #include <linux/pid_namespace.h>
  #include <linux/idr.h>
  #include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */
  #include <linux/eventfd.h>
  #include <linux/poll.h>
  #include <linux/flex_array.h> /* used in cgroup_attach_proc */

  #include <linux/atomic.h>

  static DEFINE_MUTEX(cgroup_mutex);
  /*
   * Generate an array of cgroup subsystem pointers. At boot time, this is
   * populated up to CGROUP_BUILTIN_SUBSYS_COUNT, and modular subsystems are
   * registered after that. The mutable section of this array is protected by
   * cgroup_mutex.
   */
  #define SUBSYS(_x) &_x ## _subsys,
  static struct cgroup_subsys *subsys[CGROUP_SUBSYS_COUNT] = {
  #include <linux/cgroup_subsys.h>
  };
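
/*
 * Editorial note (not in the original source): each SUBSYS(name) entry
 * in linux/cgroup_subsys.h expands, via the macro above, to an array
 * initializer of the form
 *
 *	&cpuset_subsys,
 *
 * so subsys[] holds one pointer per known subsystem, indexed by its
 * *_subsys_id enum value; slots for modular subsystems stay NULL until
 * they register, as the comment above describes.
 */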
  #define MAX_CGROUP_ROOT_NAMELEN 64
  /*
   * A cgroupfs_root represents the root of a cgroup hierarchy,
   * and may be associated with a superblock to form an active
   * hierarchy
   */
  struct cgroupfs_root {
  	struct super_block *sb;
  
  	/*
  	 * The bitmask of subsystems intended to be attached to this
  	 * hierarchy
  	 */
  	unsigned long subsys_bits;
  	/* Unique id for this hierarchy. */
  	int hierarchy_id;
  	/* The bitmask of subsystems currently attached to this hierarchy */
  	unsigned long actual_subsys_bits;
  
  	/* A list running through the attached subsystems */
  	struct list_head subsys_list;
  
  	/* The root cgroup for this hierarchy */
  	struct cgroup top_cgroup;
  
  	/* Tracks how many cgroups are currently defined in hierarchy.*/
  	int number_of_cgroups;
  	/* A list running through the active hierarchies */
  	struct list_head root_list;
  
  	/* Hierarchy-specific flags */
  	unsigned long flags;

  	/* The path to use for release notifications. */
  	char release_agent_path[PATH_MAX];
  
  	/* The name for this hierarchy - may be empty */
  	char name[MAX_CGROUP_ROOT_NAMELEN];
  };
  /*
   * The "rootnode" hierarchy is the "dummy hierarchy", reserved for the
   * subsystems that are otherwise unattached - it never has more than a
   * single cgroup, and all tasks are part of that cgroup.
   */
  static struct cgroupfs_root rootnode;
  /*
 * CSS ID -- ID per subsys's Cgroup Subsys State (CSS). Used only when
   * cgroup_subsys->use_id != 0.
   */
  #define CSS_ID_MAX	(65535)
  struct css_id {
  	/*
	 * The css to which this ID points. This pointer is set to a valid
	 * value after the cgroup is populated. If the cgroup is removed,
	 * this will be NULL. This pointer is expected to be RCU-safe
	 * because destroy() is called after synchronize_rcu(). But for safe
	 * use, css_is_removed() or css_tryget() should be used to avoid races.
  	 */
  	struct cgroup_subsys_state __rcu *css;
  	/*
  	 * ID of this css.
  	 */
  	unsigned short id;
  	/*
  	 * Depth in hierarchy which this ID belongs to.
  	 */
  	unsigned short depth;
  	/*
  	 * ID is freed by RCU. (and lookup routine is RCU safe.)
  	 */
  	struct rcu_head rcu_head;
  	/*
  	 * Hierarchy of CSS ID belongs to.
  	 */
	unsigned short stack[0]; /* Array of length (depth+1) */
  };
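
/*
 * Worked example (editorial, not from the original source): for a css
 * at depth 2 whose root ancestor has ID 1, whose parent has ID 7, and
 * whose own ID is 42, the flexible array holds
 *
 *	stack[] = { 1, 7, 42 }		(so stack[depth] == id)
 *
 * which makes "is X an ancestor of Y" a single array lookup:
 * Y->stack[X->depth] == X->id.
 */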
  /*
 * cgroup_event represents events which userspace wants to receive.
   */
  struct cgroup_event {
  	/*
  	 * Cgroup which the event belongs to.
  	 */
  	struct cgroup *cgrp;
  	/*
	 * Control file with which the event is associated.
  	 */
  	struct cftype *cft;
  	/*
  	 * eventfd to signal userspace about the event.
  	 */
  	struct eventfd_ctx *eventfd;
  	/*
	 * Each of these is stored in a list by the cgroup.
  	 */
  	struct list_head list;
  	/*
  	 * All fields below needed to unregister event when
  	 * userspace closes eventfd.
  	 */
  	poll_table pt;
  	wait_queue_head_t *wqh;
  	wait_queue_t wait;
  	struct work_struct remove;
  };
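
/*
 * Editorial note: userspace arms one of these by writing
 * "<event_fd> <control_fd> [args]" to a cgroup's cgroup.event_control
 * file; the eventfd then becomes readable when the watched control
 * file (for example a memory threshold) fires, or when the cgroup is
 * removed.
 */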

  /* The list of hierarchy roots */
  
  static LIST_HEAD(roots);
  static int root_count;

  static DEFINE_IDA(hierarchy_ida);
  static int next_hierarchy_id;
  static DEFINE_SPINLOCK(hierarchy_id_lock);
  /* dummytop is a shorthand for the dummy hierarchy's top cgroup */
  #define dummytop (&rootnode.top_cgroup)
  
  /* This flag indicates whether tasks in the fork and exit paths should
   * check for fork/exit handlers to call. This avoids us having to do
   * extra work in the fork/exit path if none of the subsystems need to
   * be called.
   */
  static int need_forkexit_callback __read_mostly;

  #ifdef CONFIG_PROVE_LOCKING
  int cgroup_lock_is_held(void)
  {
  	return lockdep_is_held(&cgroup_mutex);
  }
  #else /* #ifdef CONFIG_PROVE_LOCKING */
  int cgroup_lock_is_held(void)
  {
  	return mutex_is_locked(&cgroup_mutex);
  }
  #endif /* #else #ifdef CONFIG_PROVE_LOCKING */
  
  EXPORT_SYMBOL_GPL(cgroup_lock_is_held);
  /* convenient tests for these bits */
  inline int cgroup_is_removed(const struct cgroup *cgrp)
  {
  	return test_bit(CGRP_REMOVED, &cgrp->flags);
  }
  
  /* bits in struct cgroupfs_root flags field */
  enum {
  	ROOT_NOPREFIX, /* mounted subsystems have no named prefix */
  };
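
/*
 * Example (editorial): a hierarchy mounted with "-o noprefix,cpuset"
 * exposes control files named "cpus", "mems", ... instead of
 * "cpuset.cpus", "cpuset.mems", ..., matching the legacy cpuset
 * filesystem layout (see the mask check in parse_cgroupfs_options()).
 */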
  static int cgroup_is_releasable(const struct cgroup *cgrp)
  {
  	const int bits =
  		(1 << CGRP_RELEASABLE) |
  		(1 << CGRP_NOTIFY_ON_RELEASE);
  	return (cgrp->flags & bits) == bits;
  }
  static int notify_on_release(const struct cgroup *cgrp)
  {
  	return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
  }
  static int clone_children(const struct cgroup *cgrp)
  {
  	return test_bit(CGRP_CLONE_CHILDREN, &cgrp->flags);
  }
  /*
   * for_each_subsys() allows you to iterate on each subsystem attached to
   * an active hierarchy
   */
  #define for_each_subsys(_root, _ss) \
  list_for_each_entry(_ss, &_root->subsys_list, sibling)
  /* for_each_active_root() allows you to iterate across the active hierarchies */
  #define for_each_active_root(_root) \
  list_for_each_entry(_root, &roots, root_list)
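
/*
 * Minimal usage sketch for the two iterators above (editorial;
 * hypothetical caller, cgroup_mutex held):
 *
 *	struct cgroupfs_root *root;
 *	struct cgroup_subsys *ss;
 *
 *	for_each_active_root(root)
 *		for_each_subsys(root, ss)
 *			printk(KERN_DEBUG "%s on %s\n", ss->name, root->name);
 */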
  /* the list of cgroups eligible for automatic release. Protected by
   * release_list_lock */
  static LIST_HEAD(release_list);
  static DEFINE_SPINLOCK(release_list_lock);
  static void cgroup_release_agent(struct work_struct *work);
  static DECLARE_WORK(release_agent_work, cgroup_release_agent);
  static void check_for_release(struct cgroup *cgrp);

  /* Link structure for associating css_set objects with cgroups */
  struct cg_cgroup_link {
  	/*
  	 * List running through cg_cgroup_links associated with a
  	 * cgroup, anchored on cgroup->css_sets
  	 */
  	struct list_head cgrp_link_list;
  	struct cgroup *cgrp;
  	/*
  	 * List running through cg_cgroup_links pointing at a
  	 * single css_set object, anchored on css_set->cg_links
  	 */
  	struct list_head cg_link_list;
  	struct css_set *cg;
  };
  
  /* The default css_set - used by init and its children prior to any
   * hierarchies being mounted. It contains a pointer to the root state
   * for each subsystem. Also used to anchor the list of css_sets. Not
   * reference-counted, to improve performance when child cgroups
   * haven't been created.
   */
  
  static struct css_set init_css_set;
  static struct cg_cgroup_link init_css_set_link;
  static int cgroup_init_idr(struct cgroup_subsys *ss,
  			   struct cgroup_subsys_state *css);

  /* css_set_lock protects the list of css_set objects, and the
   * chain of tasks off each css_set.  Nests outside task->alloc_lock
   * due to cgroup_iter_start() */
  static DEFINE_RWLOCK(css_set_lock);
  static int css_set_count;
  /*
 * hash table for cgroup groups. This improves the performance of finding
   * an existing css_set. This hash doesn't (currently) take into
   * account cgroups in empty hierarchies.
   */
  #define CSS_SET_HASH_BITS	7
  #define CSS_SET_TABLE_SIZE	(1 << CSS_SET_HASH_BITS)
  static struct hlist_head css_set_table[CSS_SET_TABLE_SIZE];
  
  static struct hlist_head *css_set_hash(struct cgroup_subsys_state *css[])
  {
  	int i;
  	int index;
  	unsigned long tmp = 0UL;
  
  	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++)
  		tmp += (unsigned long)css[i];
  	tmp = (tmp >> 16) ^ tmp;
  
  	index = hash_long(tmp, CSS_SET_HASH_BITS);
  
  	return &css_set_table[index];
  }
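
/*
 * Editorial note: the key is simply the sum of the css pointers with
 * one fold; collisions are harmless because every lookup walks the
 * hlist chain and compares the full template (see compare_css_sets()
 * and find_existing_css_set() below).
 */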
  /* We don't maintain the lists running through each css_set to its
   * task until after the first call to cgroup_iter_start(). This
   * reduces the fork()/exit() overhead for people who have cgroups
   * compiled into their kernel but not actually in use */
  static int use_task_css_set_links __read_mostly;

  static void __put_css_set(struct css_set *cg, int taskexit)
  {
  	struct cg_cgroup_link *link;
  	struct cg_cgroup_link *saved_link;
  	/*
  	 * Ensure that the refcount doesn't hit zero while any readers
  	 * can see it. Similar to atomic_dec_and_lock(), but for an
  	 * rwlock
  	 */
  	if (atomic_add_unless(&cg->refcount, -1, 1))
  		return;
  	write_lock(&css_set_lock);
  	if (!atomic_dec_and_test(&cg->refcount)) {
  		write_unlock(&css_set_lock);
  		return;
  	}

  	/* This css_set is dead. unlink it and release cgroup refcounts */
  	hlist_del(&cg->hlist);
  	css_set_count--;
  
  	list_for_each_entry_safe(link, saved_link, &cg->cg_links,
  				 cg_link_list) {
  		struct cgroup *cgrp = link->cgrp;
  		list_del(&link->cg_link_list);
  		list_del(&link->cgrp_link_list);
  		if (atomic_dec_and_test(&cgrp->count) &&
  		    notify_on_release(cgrp)) {
  			if (taskexit)
  				set_bit(CGRP_RELEASABLE, &cgrp->flags);
  			check_for_release(cgrp);
  		}
  
  		kfree(link);
  	}
  
  	write_unlock(&css_set_lock);
  	kfree_rcu(cg, rcu_head);
  }
  /*
   * refcounted get/put for css_set objects
   */
  static inline void get_css_set(struct css_set *cg)
  {
  	atomic_inc(&cg->refcount);
  }
  
  static inline void put_css_set(struct css_set *cg)
  {
  	__put_css_set(cg, 0);
  }
  static inline void put_css_set_taskexit(struct css_set *cg)
  {
  	__put_css_set(cg, 1);
  }
  /*
   * compare_css_sets - helper function for find_existing_css_set().
   * @cg: candidate css_set being tested
   * @old_cg: existing css_set for a task
   * @new_cgrp: cgroup that's being entered by the task
   * @template: desired set of css pointers in css_set (pre-calculated)
   *
   * Returns true if "cg" matches "old_cg" except for the hierarchy
   * which "new_cgrp" belongs to, for which it should match "new_cgrp".
   */
  static bool compare_css_sets(struct css_set *cg,
  			     struct css_set *old_cg,
  			     struct cgroup *new_cgrp,
  			     struct cgroup_subsys_state *template[])
  {
  	struct list_head *l1, *l2;
  
  	if (memcmp(template, cg->subsys, sizeof(cg->subsys))) {
  		/* Not all subsystems matched */
  		return false;
  	}
  
  	/*
  	 * Compare cgroup pointers in order to distinguish between
	 * different cgroups in hierarchies with no subsystems. We
  	 * could get by with just this check alone (and skip the
  	 * memcmp above) but on most setups the memcmp check will
  	 * avoid the need for this more expensive check on almost all
  	 * candidates.
  	 */
  
  	l1 = &cg->cg_links;
  	l2 = &old_cg->cg_links;
  	while (1) {
  		struct cg_cgroup_link *cgl1, *cgl2;
  		struct cgroup *cg1, *cg2;
  
  		l1 = l1->next;
  		l2 = l2->next;
  		/* See if we reached the end - both lists are equal length. */
  		if (l1 == &cg->cg_links) {
  			BUG_ON(l2 != &old_cg->cg_links);
  			break;
  		} else {
  			BUG_ON(l2 == &old_cg->cg_links);
  		}
  		/* Locate the cgroups associated with these links. */
  		cgl1 = list_entry(l1, struct cg_cgroup_link, cg_link_list);
  		cgl2 = list_entry(l2, struct cg_cgroup_link, cg_link_list);
  		cg1 = cgl1->cgrp;
  		cg2 = cgl2->cgrp;
  		/* Hierarchies should be linked in the same order. */
  		BUG_ON(cg1->root != cg2->root);
  
  		/*
  		 * If this hierarchy is the hierarchy of the cgroup
  		 * that's changing, then we need to check that this
  		 * css_set points to the new cgroup; if it's any other
  		 * hierarchy, then this css_set should point to the
  		 * same cgroup as the old css_set.
  		 */
  		if (cg1->root == new_cgrp->root) {
  			if (cg1 != new_cgrp)
  				return false;
  		} else {
  			if (cg1 != cg2)
  				return false;
  		}
  	}
  	return true;
  }
  
  /*
   * find_existing_css_set() is a helper for
   * find_css_set(), and checks to see whether an existing
   * css_set is suitable.
   *
   * oldcg: the cgroup group that we're using before the cgroup
   * transition
   *
   * cgrp: the cgroup that we're moving into
   *
   * template: location in which to build the desired set of subsystem
   * state objects for the new cgroup group
   */
  static struct css_set *find_existing_css_set(
  	struct css_set *oldcg,
  	struct cgroup *cgrp,
  	struct cgroup_subsys_state *template[])
  {
  	int i;
  	struct cgroupfs_root *root = cgrp->root;
  	struct hlist_head *hhead;
  	struct hlist_node *node;
  	struct css_set *cg;

  	/*
  	 * Build the set of subsystem state objects that we want to see in the
	 * new css_set. While subsystems can change globally, the entries here
  	 * won't change, so no need for locking.
  	 */
  	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
  		if (root->subsys_bits & (1UL << i)) {
  			/* Subsystem is in this hierarchy. So we want
  			 * the subsystem state from the new
  			 * cgroup */
  			template[i] = cgrp->subsys[i];
  		} else {
  			/* Subsystem is not in this hierarchy, so we
  			 * don't want to change the subsystem state */
  			template[i] = oldcg->subsys[i];
  		}
  	}
  	hhead = css_set_hash(template);
  	hlist_for_each_entry(cg, node, hhead, hlist) {
  		if (!compare_css_sets(cg, oldcg, cgrp, template))
  			continue;
  
  		/* This css_set matches what we need */
  		return cg;
  	}
  
  	/* No existing cgroup group matched */
  	return NULL;
  }
  static void free_cg_links(struct list_head *tmp)
  {
  	struct cg_cgroup_link *link;
  	struct cg_cgroup_link *saved_link;
  
  	list_for_each_entry_safe(link, saved_link, tmp, cgrp_link_list) {
  		list_del(&link->cgrp_link_list);
  		kfree(link);
  	}
  }
  /*
   * allocate_cg_links() allocates "count" cg_cgroup_link structures
   * and chains them on tmp through their cgrp_link_list fields. Returns 0 on
   * success or a negative error
   */
  static int allocate_cg_links(int count, struct list_head *tmp)
  {
  	struct cg_cgroup_link *link;
  	int i;
  	INIT_LIST_HEAD(tmp);
  	for (i = 0; i < count; i++) {
  		link = kmalloc(sizeof(*link), GFP_KERNEL);
  		if (!link) {
  			free_cg_links(tmp);
  			return -ENOMEM;
  		}
  		list_add(&link->cgrp_link_list, tmp);
  	}
  	return 0;
  }
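
/*
 * Editorial note: links are allocated up front (one per active
 * hierarchy, hence the root_count argument in find_css_set() below)
 * so that link_css_set() can later consume them under css_set_lock
 * without any sleeping allocation.
 */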
  /**
   * link_css_set - a helper function to link a css_set to a cgroup
   * @tmp_cg_links: cg_cgroup_link objects allocated by allocate_cg_links()
   * @cg: the css_set to be linked
   * @cgrp: the destination cgroup
   */
  static void link_css_set(struct list_head *tmp_cg_links,
  			 struct css_set *cg, struct cgroup *cgrp)
  {
  	struct cg_cgroup_link *link;
  
  	BUG_ON(list_empty(tmp_cg_links));
  	link = list_first_entry(tmp_cg_links, struct cg_cgroup_link,
  				cgrp_link_list);
  	link->cg = cg;
  	link->cgrp = cgrp;
  	atomic_inc(&cgrp->count);
  	list_move(&link->cgrp_link_list, &cgrp->css_sets);
  	/*
  	 * Always add links to the tail of the list so that the list
  	 * is sorted by order of hierarchy creation
  	 */
  	list_add_tail(&link->cg_link_list, &cg->cg_links);
  }
  /*
   * find_css_set() takes an existing cgroup group and a
   * cgroup object, and returns a css_set object that's
   * equivalent to the old group, but with the given cgroup
   * substituted into the appropriate hierarchy. Must be called with
   * cgroup_mutex held
   */
  static struct css_set *find_css_set(
  	struct css_set *oldcg, struct cgroup *cgrp)
  {
  	struct css_set *res;
  	struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT];
  
  	struct list_head tmp_cg_links;

  	struct hlist_head *hhead;
  	struct cg_cgroup_link *link;

  	/* First see if we already have a cgroup group that matches
  	 * the desired set */
  	read_lock(&css_set_lock);
  	res = find_existing_css_set(oldcg, cgrp, template);
  	if (res)
  		get_css_set(res);
  	read_unlock(&css_set_lock);
  
  	if (res)
  		return res;
  
  	res = kmalloc(sizeof(*res), GFP_KERNEL);
  	if (!res)
  		return NULL;
  
  	/* Allocate all the cg_cgroup_link objects that we'll need */
  	if (allocate_cg_links(root_count, &tmp_cg_links) < 0) {
  		kfree(res);
  		return NULL;
  	}
  	atomic_set(&res->refcount, 1);
  	INIT_LIST_HEAD(&res->cg_links);
  	INIT_LIST_HEAD(&res->tasks);
  	INIT_HLIST_NODE(&res->hlist);
  
  	/* Copy the set of subsystem state objects generated in
  	 * find_existing_css_set() */
  	memcpy(res->subsys, template, sizeof(res->subsys));
  
  	write_lock(&css_set_lock);
  	/* Add reference counts and links from the new css_set. */
  	list_for_each_entry(link, &oldcg->cg_links, cg_link_list) {
  		struct cgroup *c = link->cgrp;
  		if (c->root == cgrp->root)
  			c = cgrp;
  		link_css_set(&tmp_cg_links, res, c);
  	}
  
  	BUG_ON(!list_empty(&tmp_cg_links));
  	css_set_count++;
  
  	/* Add this cgroup group to the hash table */
  	hhead = css_set_hash(res->subsys);
  	hlist_add_head(&res->hlist, hhead);
  	write_unlock(&css_set_lock);
  
  	return res;
  }
  /*
   * Return the cgroup for "task" from the given hierarchy. Must be
   * called with cgroup_mutex held.
   */
  static struct cgroup *task_cgroup_from_root(struct task_struct *task,
  					    struct cgroupfs_root *root)
  {
  	struct css_set *css;
  	struct cgroup *res = NULL;
  
  	BUG_ON(!mutex_is_locked(&cgroup_mutex));
  	read_lock(&css_set_lock);
  	/*
  	 * No need to lock the task - since we hold cgroup_mutex the
  	 * task can't change groups, so the only thing that can happen
  	 * is that it exits and its css is set back to init_css_set.
  	 */
  	css = task->cgroups;
  	if (css == &init_css_set) {
  		res = &root->top_cgroup;
  	} else {
  		struct cg_cgroup_link *link;
  		list_for_each_entry(link, &css->cg_links, cg_link_list) {
  			struct cgroup *c = link->cgrp;
  			if (c->root == root) {
  				res = c;
  				break;
  			}
  		}
  	}
  	read_unlock(&css_set_lock);
  	BUG_ON(!res);
  	return res;
  }
  
  /*
   * There is one global cgroup mutex. We also require taking
   * task_lock() when dereferencing a task's cgroup subsys pointers.
   * See "The task_lock() exception", at the end of this comment.
   *
   * A task must hold cgroup_mutex to modify cgroups.
   *
   * Any task can increment and decrement the count field without lock.
   * So in general, code holding cgroup_mutex can't rely on the count
   * field not changing.  However, if the count goes to zero, then only
   * cgroup_attach_task() can increment it again.  Because a count of zero
   * means that no tasks are currently attached, therefore there is no
   * way a task attached to that cgroup can fork (the other way to
   * increment the count).  So code holding cgroup_mutex can safely
   * assume that if the count is zero, it will stay zero. Similarly, if
   * a task holds cgroup_mutex on a cgroup with zero count, it
   * knows that the cgroup won't be removed, as cgroup_rmdir()
   * needs that mutex.
   *
 * The fork and exit callbacks, cgroup_fork() and cgroup_exit(), don't
 * (usually) take cgroup_mutex.  These are the two most performance
 * critical pieces of code here.  The exception occurs on cgroup_exit(),
 * when a task in a notify_on_release cgroup exits.  Then cgroup_mutex
 * is taken, and if the cgroup count is zero, a usermode call is made
   * to the release agent with the name of the cgroup (path relative to
   * the root of cgroup file system) as the argument.
   *
   * A cgroup can only be deleted if both its 'count' of using tasks
   * is zero, and its list of 'children' cgroups is empty.  Since all
   * tasks in the system use _some_ cgroup, and since there is always at
   * least one task in the system (init, pid == 1), therefore, top_cgroup
   * always has either children cgroups and/or using tasks.  So we don't
   * need a special hack to ensure that top_cgroup cannot be deleted.
   *
   *	The task_lock() exception
   *
   * The need for this exception arises from the action of
 * cgroup_attach_task(), which overwrites one task's cgroup pointer with
   * another.  It does so using cgroup_mutex, however there are
   * several performance critical places that need to reference
   * task->cgroup without the expense of grabbing a system global
   * mutex.  Therefore except as noted below, when dereferencing or, as
 * in cgroup_attach_task(), modifying a task's cgroup pointer we use
   * task_lock(), which acts on a spinlock (task->alloc_lock) already in
   * the task_struct routinely used for such matters.
   *
   * P.S.  One more locking exception.  RCU is used to guard the
 * update of a task's cgroup pointer by cgroup_attach_task()
   */
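
/*
 * Sketch of the task_lock() rule above (editorial; hypothetical
 * reader of a task's cgroup state):
 *
 *	task_lock(tsk);
 *	cg = tsk->cgroups;	<- cannot be switched while locked
 *	... use cg ...
 *	task_unlock(tsk);
 *
 * Read-mostly fast paths can instead use rcu_read_lock() around an
 * rcu_dereference() of tsk->cgroups, per the P.S. above.
 */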
  /**
   * cgroup_lock - lock out any changes to cgroup structures
   *
   */
  void cgroup_lock(void)
  {
  	mutex_lock(&cgroup_mutex);
  }
  EXPORT_SYMBOL_GPL(cgroup_lock);
  
  /**
   * cgroup_unlock - release lock on cgroup changes
   *
   * Undo the lock taken in a previous cgroup_lock() call.
   */
  void cgroup_unlock(void)
  {
  	mutex_unlock(&cgroup_mutex);
  }
  EXPORT_SYMBOL_GPL(cgroup_unlock);
  
  /*
   * A couple of forward declarations required, due to cyclic reference loop:
   * cgroup_mkdir -> cgroup_create -> cgroup_populate_dir ->
   * cgroup_add_file -> cgroup_create_file -> cgroup_dir_inode_operations
   * -> cgroup_mkdir.
   */
  
  static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, int mode);
  static struct dentry *cgroup_lookup(struct inode *, struct dentry *, struct nameidata *);
  static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry);
  static int cgroup_populate_dir(struct cgroup *cgrp);
  static const struct inode_operations cgroup_dir_inode_operations;
  static const struct file_operations proc_cgroupstats_operations;
  
  static struct backing_dev_info cgroup_backing_dev_info = {
  	.name		= "cgroup",
  	.capabilities	= BDI_CAP_NO_ACCT_AND_WRITEBACK,
  };

  static int alloc_css_id(struct cgroup_subsys *ss,
  			struct cgroup *parent, struct cgroup *child);
  static struct inode *cgroup_new_inode(mode_t mode, struct super_block *sb)
  {
  	struct inode *inode = new_inode(sb);
  
  	if (inode) {
  		inode->i_ino = get_next_ino();
  		inode->i_mode = mode;
  		inode->i_uid = current_fsuid();
  		inode->i_gid = current_fsgid();
  		inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
  		inode->i_mapping->backing_dev_info = &cgroup_backing_dev_info;
  	}
  	return inode;
  }
  /*
   * Call subsys's pre_destroy handler.
   * This is called before css refcnt check.
   */
  static int cgroup_call_pre_destroy(struct cgroup *cgrp)
  {
  	struct cgroup_subsys *ss;
  	int ret = 0;
  	for_each_subsys(cgrp->root, ss)
  		if (ss->pre_destroy) {
  			ret = ss->pre_destroy(ss, cgrp);
  			if (ret)
  				break;
  		}

  	return ret;
  }
  static void cgroup_diput(struct dentry *dentry, struct inode *inode)
  {
  	/* is dentry a directory ? if so, kfree() associated cgroup */
  	if (S_ISDIR(inode->i_mode)) {
  		struct cgroup *cgrp = dentry->d_fsdata;
  		struct cgroup_subsys *ss;
  		BUG_ON(!(cgroup_is_removed(cgrp)));
  		/* It's possible for external users to be holding css
  		 * reference counts on a cgroup; css_put() needs to
  		 * be able to access the cgroup after decrementing
  		 * the reference count in order to know if it needs to
  		 * queue the cgroup to be handled by the release
  		 * agent */
  		synchronize_rcu();
  
  		mutex_lock(&cgroup_mutex);
  		/*
  		 * Release the subsystem state objects.
  		 */
  		for_each_subsys(cgrp->root, ss)
  			ss->destroy(ss, cgrp);
  
  		cgrp->root->number_of_cgroups--;
  		mutex_unlock(&cgroup_mutex);
  		/*
  		 * Drop the active superblock reference that we took when we
  		 * created the cgroup
  		 */
  		deactivate_super(cgrp->root->sb);
  		/*
  		 * if we're getting rid of the cgroup, refcount should ensure
  		 * that there are no pidlists left.
  		 */
  		BUG_ON(!list_empty(&cgrp->pidlists));
  		kfree_rcu(cgrp, rcu_head);
  	}
  	iput(inode);
  }
  static int cgroup_delete(const struct dentry *d)
  {
  	return 1;
  }
  static void remove_dir(struct dentry *d)
  {
  	struct dentry *parent = dget(d->d_parent);
  
  	d_delete(d);
  	simple_rmdir(parent->d_inode, d);
  	dput(parent);
  }
  
  static void cgroup_clear_directory(struct dentry *dentry)
  {
  	struct list_head *node;
  
  	BUG_ON(!mutex_is_locked(&dentry->d_inode->i_mutex));
  	spin_lock(&dentry->d_lock);
  	node = dentry->d_subdirs.next;
  	while (node != &dentry->d_subdirs) {
  		struct dentry *d = list_entry(node, struct dentry, d_u.d_child);
  
  		spin_lock_nested(&d->d_lock, DENTRY_D_LOCK_NESTED);
  		list_del_init(node);
  		if (d->d_inode) {
  			/* This should never be called on a cgroup
  			 * directory with child cgroups */
  			BUG_ON(d->d_inode->i_mode & S_IFDIR);
  			dget_dlock(d);
  			spin_unlock(&d->d_lock);
  			spin_unlock(&dentry->d_lock);
  			d_delete(d);
  			simple_unlink(dentry->d_inode, d);
  			dput(d);
  			spin_lock(&dentry->d_lock);
  		} else
  			spin_unlock(&d->d_lock);
  		node = dentry->d_subdirs.next;
  	}
  	spin_unlock(&dentry->d_lock);
  }
  
  /*
   * NOTE : the dentry must have been dget()'ed
   */
  static void cgroup_d_remove_dir(struct dentry *dentry)
  {
  	struct dentry *parent;
  	cgroup_clear_directory(dentry);
  	parent = dentry->d_parent;
  	spin_lock(&parent->d_lock);
  	spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
  	list_del_init(&dentry->d_u.d_child);
  	spin_unlock(&dentry->d_lock);
  	spin_unlock(&parent->d_lock);
  	remove_dir(dentry);
  }
  /*
 * A queue for waiters doing rmdir() on a cgroup. A task will sleep when
 * cgroup->count == 0 && list_empty(&cgroup->children) && a subsys still
 * has some reference to css->refcnt. In general, this refcnt is expected
 * to go down to zero soon.
   *
   * CGRP_WAIT_ON_RMDIR flag is set under cgroup's inode->i_mutex;
   */
  DECLARE_WAIT_QUEUE_HEAD(cgroup_rmdir_waitq);
  static void cgroup_wakeup_rmdir_waiter(struct cgroup *cgrp)
  {
  	if (unlikely(test_and_clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags)))
  		wake_up_all(&cgroup_rmdir_waitq);
  }
  void cgroup_exclude_rmdir(struct cgroup_subsys_state *css)
  {
  	css_get(css);
  }
  
  void cgroup_release_and_wakeup_rmdir(struct cgroup_subsys_state *css)
  {
  	cgroup_wakeup_rmdir_waiter(css->cgroup);
  	css_put(css);
  }
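
/*
 * Illustrative pairing for the two helpers above, as a subsystem might
 * use them to keep a racing rmdir() parked until an asynchronous
 * operation completes (editorial; hypothetical caller):
 *
 *	cgroup_exclude_rmdir(css);
 *	... long-running work that temporarily pins the css ...
 *	cgroup_release_and_wakeup_rmdir(css);
 */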
  /*
   * Call with cgroup_mutex held. Drops reference counts on modules, including
   * any duplicate ones that parse_cgroupfs_options took. If this function
   * returns an error, no reference counts are touched.
   */
  static int rebind_subsystems(struct cgroupfs_root *root,
  			      unsigned long final_bits)
  {
  	unsigned long added_bits, removed_bits;
  	struct cgroup *cgrp = &root->top_cgroup;
  	int i;
  	BUG_ON(!mutex_is_locked(&cgroup_mutex));
  	removed_bits = root->actual_subsys_bits & ~final_bits;
  	added_bits = final_bits & ~root->actual_subsys_bits;
  	/* Check that any added subsystems are currently free */
  	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
  		unsigned long bit = 1UL << i;
  		struct cgroup_subsys *ss = subsys[i];
  		if (!(bit & added_bits))
  			continue;
  		/*
  		 * Nobody should tell us to do a subsys that doesn't exist:
  		 * parse_cgroupfs_options should catch that case and refcounts
  		 * ensure that subsystems won't disappear once selected.
  		 */
  		BUG_ON(ss == NULL);
  		if (ss->root != &rootnode) {
  			/* Subsystem isn't free */
  			return -EBUSY;
  		}
  	}
  
  	/* Currently we don't handle adding/removing subsystems when
  	 * any child cgroups exist. This is theoretically supportable
  	 * but involves complex error handling, so it's being left until
  	 * later */
  	if (root->number_of_cgroups > 1)
  		return -EBUSY;
  
  	/* Process each subsystem */
  	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
  		struct cgroup_subsys *ss = subsys[i];
  		unsigned long bit = 1UL << i;
  		if (bit & added_bits) {
  			/* We're binding this subsystem to this hierarchy */
  			BUG_ON(ss == NULL);
  			BUG_ON(cgrp->subsys[i]);
  			BUG_ON(!dummytop->subsys[i]);
  			BUG_ON(dummytop->subsys[i]->cgroup != dummytop);
  			mutex_lock(&ss->hierarchy_mutex);
  			cgrp->subsys[i] = dummytop->subsys[i];
  			cgrp->subsys[i]->cgroup = cgrp;
  			list_move(&ss->sibling, &root->subsys_list);
  			ss->root = root;
  			if (ss->bind)
  				ss->bind(ss, cgrp);
  			mutex_unlock(&ss->hierarchy_mutex);
  			/* refcount was already taken, and we're keeping it */
  		} else if (bit & removed_bits) {
  			/* We're removing this subsystem */
  			BUG_ON(ss == NULL);
  			BUG_ON(cgrp->subsys[i] != dummytop->subsys[i]);
  			BUG_ON(cgrp->subsys[i]->cgroup != cgrp);
  			mutex_lock(&ss->hierarchy_mutex);
  			if (ss->bind)
  				ss->bind(ss, dummytop);
  			dummytop->subsys[i]->cgroup = dummytop;
  			cgrp->subsys[i] = NULL;
  			subsys[i]->root = &rootnode;
  			list_move(&ss->sibling, &rootnode.subsys_list);
  			mutex_unlock(&ss->hierarchy_mutex);
  			/* subsystem is now free - drop reference on module */
  			module_put(ss->module);
  		} else if (bit & final_bits) {
  			/* Subsystem state should already exist */
  			BUG_ON(ss == NULL);
  			BUG_ON(!cgrp->subsys[i]);
  			/*
  			 * a refcount was taken, but we already had one, so
  			 * drop the extra reference.
  			 */
  			module_put(ss->module);
  #ifdef CONFIG_MODULE_UNLOAD
  			BUG_ON(ss->module && !module_refcount(ss->module));
  #endif
  		} else {
  			/* Subsystem state shouldn't exist */
  			BUG_ON(cgrp->subsys[i]);
  		}
  	}
  	root->subsys_bits = root->actual_subsys_bits = final_bits;
  	synchronize_rcu();
  
  	return 0;
  }
  
  static int cgroup_show_options(struct seq_file *seq, struct vfsmount *vfs)
  {
  	struct cgroupfs_root *root = vfs->mnt_sb->s_fs_info;
  	struct cgroup_subsys *ss;
  
  	mutex_lock(&cgroup_mutex);
  	for_each_subsys(root, ss)
  		seq_printf(seq, ",%s", ss->name);
  	if (test_bit(ROOT_NOPREFIX, &root->flags))
  		seq_puts(seq, ",noprefix");
  	if (strlen(root->release_agent_path))
  		seq_printf(seq, ",release_agent=%s", root->release_agent_path);
  	if (clone_children(&root->top_cgroup))
  		seq_puts(seq, ",clone_children");
  	if (strlen(root->name))
  		seq_printf(seq, ",name=%s", root->name);
  	mutex_unlock(&cgroup_mutex);
  	return 0;
  }
  
  struct cgroup_sb_opts {
  	unsigned long subsys_bits;
  	unsigned long flags;
  	char *release_agent;
  	bool clone_children;
  	char *name;
  	/* User explicitly requested empty subsystem */
  	bool none;
  
  	struct cgroupfs_root *new_root;

  };
  /*
   * Convert a hierarchy specifier into a bitmask of subsystems and flags. Call
   * with cgroup_mutex held to protect the subsys[] array. This function takes
   * refcounts on subsystems to be used, unless it returns error, in which case
   * no refcounts are taken.
   */
  static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
  {
  	char *token, *o = data;
  	bool all_ss = false, one_ss = false;
  	unsigned long mask = (unsigned long)-1;
  	int i;
  	bool module_pin_failed = false;

  	BUG_ON(!mutex_is_locked(&cgroup_mutex));
  #ifdef CONFIG_CPUSETS
  	mask = ~(1UL << cpuset_subsys_id);
  #endif

  	memset(opts, 0, sizeof(*opts));
  
  	while ((token = strsep(&o, ",")) != NULL) {
  		if (!*token)
  			return -EINVAL;
  		if (!strcmp(token, "none")) {
  			/* Explicitly have no subsystems */
  			opts->none = true;
  			continue;
  		}
  		if (!strcmp(token, "all")) {
  			/* Mutually exclusive option 'all' + subsystem name */
  			if (one_ss)
  				return -EINVAL;
  			all_ss = true;
  			continue;
  		}
  		if (!strcmp(token, "noprefix")) {
  			set_bit(ROOT_NOPREFIX, &opts->flags);
  			continue;
  		}
  		if (!strcmp(token, "clone_children")) {
  			opts->clone_children = true;
  			continue;
  		}
  		if (!strncmp(token, "release_agent=", 14)) {
  			/* Specifying two release agents is forbidden */
  			if (opts->release_agent)
  				return -EINVAL;
  			opts->release_agent =
  				kstrndup(token + 14, PATH_MAX - 1, GFP_KERNEL);
  			if (!opts->release_agent)
  				return -ENOMEM;
  			continue;
  		}
  		if (!strncmp(token, "name=", 5)) {
  			const char *name = token + 5;
  			/* Can't specify an empty name */
  			if (!strlen(name))
  				return -EINVAL;
  			/* Must match [\w.-]+ */
  			for (i = 0; i < strlen(name); i++) {
  				char c = name[i];
  				if (isalnum(c))
  					continue;
  				if ((c == '.') || (c == '-') || (c == '_'))
  					continue;
  				return -EINVAL;
  			}
  			/* Specifying two names is forbidden */
  			if (opts->name)
  				return -EINVAL;
  			opts->name = kstrndup(name,
  					      MAX_CGROUP_ROOT_NAMELEN - 1,
  					      GFP_KERNEL);
  			if (!opts->name)
  				return -ENOMEM;
  
  			continue;
  		}
  
  		for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
  			struct cgroup_subsys *ss = subsys[i];
  			if (ss == NULL)
  				continue;
  			if (strcmp(token, ss->name))
  				continue;
  			if (ss->disabled)
  				continue;
  
  			/* Mutually exclusive option 'all' + subsystem name */
  			if (all_ss)
  				return -EINVAL;
  			set_bit(i, &opts->subsys_bits);
  			one_ss = true;
  
  			break;
  		}
  		if (i == CGROUP_SUBSYS_COUNT)
  			return -ENOENT;
  	}
  
  	/*
	 * If the 'all' option was specified, select all the subsystems.
	 * Otherwise, if the 'all', 'none' and subsystem name options were
	 * all unspecified, default to 'all'.
  	 */
  	if (all_ss || (!all_ss && !one_ss && !opts->none)) {
  		for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
  			struct cgroup_subsys *ss = subsys[i];
  			if (ss == NULL)
  				continue;
  			if (ss->disabled)
  				continue;
  			set_bit(i, &opts->subsys_bits);
  		}
  	}
  	/* Consistency checks */
  	/*
  	 * Option noprefix was introduced just for backward compatibility
  	 * with the old cpuset, so we allow noprefix only if mounting just
  	 * the cpuset subsystem.
  	 */
  	if (test_bit(ROOT_NOPREFIX, &opts->flags) &&
  	    (opts->subsys_bits & mask))
  		return -EINVAL;
2c6ab6d20   Paul Menage   cgroups: allow cg...
1148
1149
1150
1151
1152
1153
1154
1155
1156
  
  	/* Can't specify "none" and some subsystems */
  	if (opts->subsys_bits && opts->none)
  		return -EINVAL;
  
  	/*
  	 * We either have to specify by name or by subsystems. (So all
  	 * empty hierarchies must have a name).
  	 */
c6d57f331   Paul Menage   cgroups: support ...
1157
  	if (!opts->subsys_bits && !opts->name)
ddbcc7e8e   Paul Menage   Task Control Grou...
1158
  		return -EINVAL;

	/*
	 * Grab references on all the modules we'll need, so the subsystems
	 * don't dance around before rebind_subsystems attaches them. This may
	 * take duplicate reference counts on a subsystem that's already used,
	 * but rebind_subsystems handles this case.
	 */
	for (i = CGROUP_BUILTIN_SUBSYS_COUNT; i < CGROUP_SUBSYS_COUNT; i++) {
		unsigned long bit = 1UL << i;

		if (!(bit & opts->subsys_bits))
			continue;
		if (!try_module_get(subsys[i]->module)) {
			module_pin_failed = true;
			break;
		}
	}
	if (module_pin_failed) {
		/*
		 * oops, one of the modules was going away. this means that we
		 * raced with a module_delete call, and to the user this is
		 * essentially a "subsystem doesn't exist" case.
		 */
		for (i--; i >= CGROUP_BUILTIN_SUBSYS_COUNT; i--) {
			/* drop refcounts only on the ones we took */
			unsigned long bit = 1UL << i;

			if (!(bit & opts->subsys_bits))
				continue;
			module_put(subsys[i]->module);
		}
		return -ENOENT;
	}

	return 0;
}
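
/*
 * Usage note (an illustrative sketch, not part of this file): the token
 * strings accepted above arrive through the data argument of mount(2).
 * From userspace, under an assumed mount point /mnt/cg:
 *
 *	#include <sys/mount.h>
 *
 *	// named hierarchy with just the cpuset subsystem, cpuset-style
 *	// file names (no prefix) and a custom release agent
 *	if (mount("cgroup", "/mnt/cg", "cgroup", 0,
 *		  "cpuset,noprefix,name=mycg,release_agent=/sbin/cgagent") < 0)
 *		perror("mount");
 *
 * "all", "none" and "clone_children" combine subject to the consistency
 * rules checked above.
 */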

static void drop_parsed_module_refcounts(unsigned long subsys_bits)
{
	int i;
	for (i = CGROUP_BUILTIN_SUBSYS_COUNT; i < CGROUP_SUBSYS_COUNT; i++) {
		unsigned long bit = 1UL << i;

		if (!(bit & subsys_bits))
			continue;
		module_put(subsys[i]->module);
	}
}

static int cgroup_remount(struct super_block *sb, int *flags, char *data)
{
	int ret = 0;
	struct cgroupfs_root *root = sb->s_fs_info;
	struct cgroup *cgrp = &root->top_cgroup;
	struct cgroup_sb_opts opts;

	mutex_lock(&cgrp->dentry->d_inode->i_mutex);
	mutex_lock(&cgroup_mutex);

	/* See what subsystems are wanted */
	ret = parse_cgroupfs_options(data, &opts);
	if (ret)
		goto out_unlock;

	/* Don't allow flags or name to change at remount */
	if (opts.flags != root->flags ||
	    (opts.name && strcmp(opts.name, root->name))) {
		ret = -EINVAL;
		drop_parsed_module_refcounts(opts.subsys_bits);
		goto out_unlock;
	}

	ret = rebind_subsystems(root, opts.subsys_bits);
	if (ret) {
		drop_parsed_module_refcounts(opts.subsys_bits);
		goto out_unlock;
	}

	/* (re)populate subsystem files */
	cgroup_populate_dir(cgrp);

	if (opts.release_agent)
		strcpy(root->release_agent_path, opts.release_agent);
 out_unlock:
	kfree(opts.release_agent);
	kfree(opts.name);
	mutex_unlock(&cgroup_mutex);
	mutex_unlock(&cgrp->dentry->d_inode->i_mutex);
	return ret;
}

static const struct super_operations cgroup_ops = {
	.statfs = simple_statfs,
	.drop_inode = generic_delete_inode,
	.show_options = cgroup_show_options,
	.remount_fs = cgroup_remount,
};

static void init_cgroup_housekeeping(struct cgroup *cgrp)
{
	INIT_LIST_HEAD(&cgrp->sibling);
	INIT_LIST_HEAD(&cgrp->children);
	INIT_LIST_HEAD(&cgrp->css_sets);
	INIT_LIST_HEAD(&cgrp->release_list);
	INIT_LIST_HEAD(&cgrp->pidlists);
	mutex_init(&cgrp->pidlist_mutex);
	INIT_LIST_HEAD(&cgrp->event_list);
	spin_lock_init(&cgrp->event_list_lock);
}

static void init_cgroup_root(struct cgroupfs_root *root)
{
	struct cgroup *cgrp = &root->top_cgroup;
	INIT_LIST_HEAD(&root->subsys_list);
	INIT_LIST_HEAD(&root->root_list);
	root->number_of_cgroups = 1;
	cgrp->root = root;
	cgrp->top_cgroup = cgrp;
	init_cgroup_housekeeping(cgrp);
}

static bool init_root_id(struct cgroupfs_root *root)
{
	int ret = 0;

	do {
		if (!ida_pre_get(&hierarchy_ida, GFP_KERNEL))
			return false;
		spin_lock(&hierarchy_id_lock);
		/* Try to allocate the next unused ID */
		ret = ida_get_new_above(&hierarchy_ida, next_hierarchy_id,
					&root->hierarchy_id);
		if (ret == -ENOSPC)
			/* Try again starting from 0 */
			ret = ida_get_new(&hierarchy_ida, &root->hierarchy_id);
		if (!ret) {
			next_hierarchy_id = root->hierarchy_id + 1;
		} else if (ret != -EAGAIN) {
			/* Can only get here if the 31-bit IDR is full ... */
			BUG_ON(ret);
		}
		spin_unlock(&hierarchy_id_lock);
	} while (ret);
	return true;
}
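
/*
 * Aside (an illustrative sketch, not part of this file): init_root_id()
 * above follows the legacy IDA idiom - preload memory outside the lock,
 * attempt allocation under the lock, retry on -EAGAIN. The same shape for
 * a hypothetical ID space looks like this:
 */
static DEFINE_SPINLOCK(example_lock);	/* hypothetical */
static DEFINE_IDA(example_ida);		/* hypothetical */

static int example_get_id(int *id)
{
	int ret;

	do {
		if (!ida_pre_get(&example_ida, GFP_KERNEL))
			return -ENOMEM;	/* preload failed outright */
		spin_lock(&example_lock);
		ret = ida_get_new(&example_ida, id);
		spin_unlock(&example_lock);
	} while (ret == -EAGAIN);	/* raced: someone consumed our preload */
	return ret;
}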

static int cgroup_test_super(struct super_block *sb, void *data)
{
	struct cgroup_sb_opts *opts = data;
	struct cgroupfs_root *root = sb->s_fs_info;

	/* If we asked for a name then it must match */
	if (opts->name && strcmp(opts->name, root->name))
		return 0;

	/*
	 * If we asked for subsystems (or explicitly for no
	 * subsystems) then they must match
	 */
	if ((opts->subsys_bits || opts->none)
	    && (opts->subsys_bits != root->subsys_bits))
		return 0;

	return 1;
}

static struct cgroupfs_root *cgroup_root_from_opts(struct cgroup_sb_opts *opts)
{
	struct cgroupfs_root *root;

	if (!opts->subsys_bits && !opts->none)
		return NULL;

	root = kzalloc(sizeof(*root), GFP_KERNEL);
	if (!root)
		return ERR_PTR(-ENOMEM);

	if (!init_root_id(root)) {
		kfree(root);
		return ERR_PTR(-ENOMEM);
	}
	init_cgroup_root(root);

	root->subsys_bits = opts->subsys_bits;
	root->flags = opts->flags;
	if (opts->release_agent)
		strcpy(root->release_agent_path, opts->release_agent);
	if (opts->name)
		strcpy(root->name, opts->name);
	if (opts->clone_children)
		set_bit(CGRP_CLONE_CHILDREN, &root->top_cgroup.flags);

	return root;
}

static void cgroup_drop_root(struct cgroupfs_root *root)
{
	if (!root)
		return;

	BUG_ON(!root->hierarchy_id);
	spin_lock(&hierarchy_id_lock);
	ida_remove(&hierarchy_ida, root->hierarchy_id);
	spin_unlock(&hierarchy_id_lock);
	kfree(root);
}

static int cgroup_set_super(struct super_block *sb, void *data)
{
	int ret;
	struct cgroup_sb_opts *opts = data;

	/* If we don't have a new root, we can't set up a new sb */
	if (!opts->new_root)
		return -EINVAL;

	BUG_ON(!opts->subsys_bits && !opts->none);

	ret = set_anon_super(sb, NULL);
	if (ret)
		return ret;

	sb->s_fs_info = opts->new_root;
	opts->new_root->sb = sb;
  
  	sb->s_blocksize = PAGE_CACHE_SIZE;
  	sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
  	sb->s_magic = CGROUP_SUPER_MAGIC;
  	sb->s_op = &cgroup_ops;
  
  	return 0;
  }
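
/*
 * Aside (an illustrative sketch): cgroup_test_super() and cgroup_set_super()
 * are the compare/fill callbacks that sget() uses to share one superblock
 * per hierarchy. Roughly, sget() behaves like:
 *
 *	list_for_each_entry(old, &fs_type->fs_supers, s_instances)
 *		if (test(old, data))
 *			return old;		// reuse the existing sb
 *	s = alloc_super(fs_type);
 *	err = set(s, data);			// initialise the new sb
 *	...
 *
 * so mounting with options matching an already-active hierarchy returns
 * that hierarchy's superblock instead of creating a new one.
 */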
  
  static int cgroup_get_rootdir(struct super_block *sb)
  {
	static const struct dentry_operations cgroup_dops = {
		.d_iput = cgroup_diput,
		.d_delete = cgroup_delete,
	};
	struct inode *inode =
		cgroup_new_inode(S_IFDIR | S_IRUGO | S_IXUGO | S_IWUSR, sb);
	struct dentry *dentry;

	if (!inode)
		return -ENOMEM;

	inode->i_fop = &simple_dir_operations;
	inode->i_op = &cgroup_dir_inode_operations;
	/* directories start off with i_nlink == 2 (for "." entry) */
	inc_nlink(inode);
	dentry = d_alloc_root(inode);
	if (!dentry) {
		iput(inode);
		return -ENOMEM;
	}
	sb->s_root = dentry;
	/* for everything else we want ->d_op set */
	sb->s_d_op = &cgroup_dops;
	return 0;
}

static struct dentry *cgroup_mount(struct file_system_type *fs_type,
			 int flags, const char *unused_dev_name,
			 void *data)
{
	struct cgroup_sb_opts opts;
	struct cgroupfs_root *root;
	int ret = 0;
	struct super_block *sb;
	struct cgroupfs_root *new_root;

	/* First find the desired set of subsystems */
	mutex_lock(&cgroup_mutex);
	ret = parse_cgroupfs_options(data, &opts);
	mutex_unlock(&cgroup_mutex);
	if (ret)
		goto out_err;

	/*
	 * Allocate a new cgroup root. We may not need it if we're
	 * reusing an existing hierarchy.
	 */
	new_root = cgroup_root_from_opts(&opts);
	if (IS_ERR(new_root)) {
		ret = PTR_ERR(new_root);
		goto drop_modules;
	}
	opts.new_root = new_root;

	/* Locate an existing or new sb for this hierarchy */
	sb = sget(fs_type, cgroup_test_super, cgroup_set_super, &opts);
	if (IS_ERR(sb)) {
		ret = PTR_ERR(sb);
		cgroup_drop_root(opts.new_root);
		goto drop_modules;
	}

	root = sb->s_fs_info;
	BUG_ON(!root);
	if (root == opts.new_root) {
		/* We used the new root structure, so this is a new hierarchy */
		struct list_head tmp_cg_links;
		struct cgroup *root_cgrp = &root->top_cgroup;
		struct inode *inode;
		struct cgroupfs_root *existing_root;
		const struct cred *cred;
		int i;

		BUG_ON(sb->s_root != NULL);

		ret = cgroup_get_rootdir(sb);
		if (ret)
			goto drop_new_super;
		inode = sb->s_root->d_inode;

		mutex_lock(&inode->i_mutex);
		mutex_lock(&cgroup_mutex);

		if (strlen(root->name)) {
			/* Check for name clashes with existing mounts */
			for_each_active_root(existing_root) {
				if (!strcmp(existing_root->name, root->name)) {
					ret = -EBUSY;
					mutex_unlock(&cgroup_mutex);
					mutex_unlock(&inode->i_mutex);
					goto drop_new_super;
				}
			}
		}

		/*
		 * We're accessing css_set_count without locking
		 * css_set_lock here, but that's OK - it can only be
		 * increased by someone holding cgroup_lock, and
		 * that's us. The worst that can happen is that we
		 * have some link structures left over
		 */
		ret = allocate_cg_links(css_set_count, &tmp_cg_links);
		if (ret) {
			mutex_unlock(&cgroup_mutex);
			mutex_unlock(&inode->i_mutex);
			goto drop_new_super;
		}

		ret = rebind_subsystems(root, root->subsys_bits);
		if (ret == -EBUSY) {
			mutex_unlock(&cgroup_mutex);
			mutex_unlock(&inode->i_mutex);
			free_cg_links(&tmp_cg_links);
			goto drop_new_super;
		}
		/*
		 * There must be no failure case after here, since rebinding
		 * takes care of subsystems' refcounts, which are explicitly
		 * dropped in the failure exit path.
		 */

		/* EBUSY should be the only error here */
		BUG_ON(ret);

		list_add(&root->root_list, &roots);
		root_count++;

		sb->s_root->d_fsdata = root_cgrp;
		root->top_cgroup.dentry = sb->s_root;

		/* Link the top cgroup in this hierarchy into all
		 * the css_set objects */
		write_lock(&css_set_lock);
		for (i = 0; i < CSS_SET_TABLE_SIZE; i++) {
			struct hlist_head *hhead = &css_set_table[i];
			struct hlist_node *node;
			struct css_set *cg;

			hlist_for_each_entry(cg, node, hhead, hlist)
				link_css_set(&tmp_cg_links, cg, root_cgrp);
		}
		write_unlock(&css_set_lock);

		free_cg_links(&tmp_cg_links);

		BUG_ON(!list_empty(&root_cgrp->sibling));
		BUG_ON(!list_empty(&root_cgrp->children));
		BUG_ON(root->number_of_cgroups != 1);

		cred = override_creds(&init_cred);
		cgroup_populate_dir(root_cgrp);
		revert_creds(cred);
		mutex_unlock(&cgroup_mutex);
		mutex_unlock(&inode->i_mutex);
	} else {
		/*
		 * We re-used an existing hierarchy - the new root (if
		 * any) is not needed
		 */
		cgroup_drop_root(opts.new_root);
		/* no subsys rebinding, so refcounts don't change */
		drop_parsed_module_refcounts(opts.subsys_bits);
	}

	kfree(opts.release_agent);
	kfree(opts.name);
	return dget(sb->s_root);

 drop_new_super:
	deactivate_locked_super(sb);
 drop_modules:
	drop_parsed_module_refcounts(opts.subsys_bits);
 out_err:
	kfree(opts.release_agent);
	kfree(opts.name);
	return ERR_PTR(ret);
}

static void cgroup_kill_sb(struct super_block *sb) {
	struct cgroupfs_root *root = sb->s_fs_info;
	struct cgroup *cgrp = &root->top_cgroup;
	int ret;
	struct cg_cgroup_link *link;
	struct cg_cgroup_link *saved_link;

	BUG_ON(!root);

	BUG_ON(root->number_of_cgroups != 1);
	BUG_ON(!list_empty(&cgrp->children));
	BUG_ON(!list_empty(&cgrp->sibling));

	mutex_lock(&cgroup_mutex);

	/* Rebind all subsystems back to the default hierarchy */
	ret = rebind_subsystems(root, 0);
	/* Shouldn't be able to fail ... */
	BUG_ON(ret);

	/*
	 * Release all the links from css_sets to this hierarchy's
	 * root cgroup
	 */
	write_lock(&css_set_lock);

	list_for_each_entry_safe(link, saved_link, &cgrp->css_sets,
				 cgrp_link_list) {
		list_del(&link->cg_link_list);
		list_del(&link->cgrp_link_list);
		kfree(link);
	}
	write_unlock(&css_set_lock);

	if (!list_empty(&root->root_list)) {
		list_del(&root->root_list);
		root_count--;
	}

	mutex_unlock(&cgroup_mutex);
	kill_litter_super(sb);
	cgroup_drop_root(root);
}

static struct file_system_type cgroup_fs_type = {
	.name = "cgroup",
	.mount = cgroup_mount,
	.kill_sb = cgroup_kill_sb,
};

static struct kobject *cgroup_kobj;

static inline struct cgroup *__d_cgrp(struct dentry *dentry)
{
	return dentry->d_fsdata;
}

static inline struct cftype *__d_cft(struct dentry *dentry)
{
	return dentry->d_fsdata;
}

/**
 * cgroup_path - generate the path of a cgroup
 * @cgrp: the cgroup in question
 * @buf: the buffer to write the path into
 * @buflen: the length of the buffer
 *
 * Called with cgroup_mutex held or else with an RCU-protected cgroup
 * reference.  Writes path of cgroup into buf.  Returns 0 on success,
 * -errno on error.
 */
int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
{
	char *start;
	struct dentry *dentry = rcu_dereference_check(cgrp->dentry,
						      cgroup_lock_is_held());

	if (!dentry || cgrp == dummytop) {
		/*
		 * Inactive subsystems have no dentry for their root
		 * cgroup
		 */
		strcpy(buf, "/");
		return 0;
	}

	start = buf + buflen;

	*--start = '\0';
	for (;;) {
		int len = dentry->d_name.len;

		if ((start -= len) < buf)
			return -ENAMETOOLONG;
		memcpy(start, dentry->d_name.name, len);
		cgrp = cgrp->parent;
		if (!cgrp)
			break;

		dentry = rcu_dereference_check(cgrp->dentry,
					       cgroup_lock_is_held());
		if (!cgrp->parent)
			continue;
		if (--start < buf)
			return -ENAMETOOLONG;
		*start = '/';
	}
	memmove(buf, start, buf + buflen - start);
	return 0;
}
EXPORT_SYMBOL_GPL(cgroup_path);
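
/*
 * Usage sketch (hypothetical caller, not part of this file): the buffer is
 * filled from the end, so callers simply pass a buffer they own plus its
 * full length, holding either cgroup_mutex or an RCU read lock:
 */
static void example_print_cgroup(struct cgroup *cgrp)
{
	char *buf = kmalloc(PATH_MAX, GFP_KERNEL);	/* hypothetical helper */

	if (!buf)
		return;
	rcu_read_lock();
	if (!cgroup_path(cgrp, buf, PATH_MAX))
		printk(KERN_INFO "cgroup: %s\n", buf);
	rcu_read_unlock();
	kfree(buf);
}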
ddbcc7e8e   Paul Menage   Task Control Grou...
1650

  /*
   * cgroup_task_migrate - move a task from one cgroup to another.
   *
   * 'guarantee' is set if the caller promises that a new css_set for the task
   * will already exist. If not set, this function might sleep, and can fail with
   * -ENOMEM. Otherwise, it can only fail with -ESRCH.
   */
  static int cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp,
  			       struct task_struct *tsk, bool guarantee)
  {
  	struct css_set *oldcg;
  	struct css_set *newcg;
  
  	/*
  	 * get old css_set. we need to take task_lock and refcount it, because
  	 * an exiting task can change its css_set to init_css_set and drop its
  	 * old one without taking cgroup_mutex.
  	 */
  	task_lock(tsk);
  	oldcg = tsk->cgroups;
  	get_css_set(oldcg);
  	task_unlock(tsk);
  
  	/* locate or allocate a new css_set for this task. */
  	if (guarantee) {
  		/* we know the css_set we want already exists. */
  		struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT];
  		read_lock(&css_set_lock);
  		newcg = find_existing_css_set(oldcg, cgrp, template);
  		BUG_ON(!newcg);
  		get_css_set(newcg);
  		read_unlock(&css_set_lock);
  	} else {
  		might_sleep();
  		/* find_css_set will give us newcg already referenced. */
  		newcg = find_css_set(oldcg, cgrp);
  		if (!newcg) {
  			put_css_set(oldcg);
  			return -ENOMEM;
  		}
  	}
  	put_css_set(oldcg);
  
  	/* if PF_EXITING is set, the tsk->cgroups pointer is no longer safe. */
  	task_lock(tsk);
  	if (tsk->flags & PF_EXITING) {
  		task_unlock(tsk);
  		put_css_set(newcg);
  		return -ESRCH;
  	}
  	rcu_assign_pointer(tsk->cgroups, newcg);
  	task_unlock(tsk);
  
  	/* Update the css_set linked lists if we're using them */
  	write_lock(&css_set_lock);
  	if (!list_empty(&tsk->cg_list))
  		list_move(&tsk->cg_list, &newcg->tasks);
  	write_unlock(&css_set_lock);
  
  	/*
  	 * We just gained a reference on oldcg by taking it from the task. As
  	 * trading it for newcg is protected by cgroup_mutex, we're safe to drop
  	 * it here; it will be freed under RCU.
  	 */
  	put_css_set(oldcg);
  
  	set_bit(CGRP_RELEASABLE, &oldcgrp->flags);
  	return 0;
  }

/**
 * cgroup_attach_task - attach task 'tsk' to cgroup 'cgrp'
 * @cgrp: the cgroup the task is attaching to
 * @tsk: the task to be attached
 *
 * Call holding cgroup_mutex. May take task_lock of
 * the task 'tsk' during call.
 */
int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
{
	int retval;
	struct cgroup_subsys *ss, *failed_ss = NULL;
	struct cgroup *oldcgrp;
	struct cgroupfs_root *root = cgrp->root;

	/* Nothing to do if the task is already in that cgroup */
	oldcgrp = task_cgroup_from_root(tsk, root);
	if (cgrp == oldcgrp)
		return 0;

	for_each_subsys(root, ss) {
		if (ss->can_attach) {
			retval = ss->can_attach(ss, cgrp, tsk);
			if (retval) {
				/*
				 * Remember on which subsystem the can_attach()
				 * failed, so that we only call cancel_attach()
				 * against the subsystems whose can_attach()
				 * succeeded. (See below)
				 */
				failed_ss = ss;
				goto out;
			}
		}
		if (ss->can_attach_task) {
			retval = ss->can_attach_task(cgrp, tsk);
			if (retval) {
				failed_ss = ss;
				goto out;
			}
		}
	}

	retval = cgroup_task_migrate(cgrp, oldcgrp, tsk, false);
	if (retval)
		goto out;

	for_each_subsys(root, ss) {
		if (ss->pre_attach)
			ss->pre_attach(cgrp);
		if (ss->attach_task)
			ss->attach_task(cgrp, tsk);
		if (ss->attach)
			ss->attach(ss, cgrp, oldcgrp, tsk);
	}

	synchronize_rcu();

	/*
	 * wake up rmdir() waiter. the rmdir should fail since the cgroup
	 * is no longer empty.
	 */
	cgroup_wakeup_rmdir_waiter(cgrp);
out:
	if (retval) {
		for_each_subsys(root, ss) {
			if (ss == failed_ss)
				/*
				 * This subsystem was the one that failed the
				 * can_attach() check earlier, so we don't need
				 * to call cancel_attach() against it or any
				 * remaining subsystems.
				 */
				break;
			if (ss->cancel_attach)
				ss->cancel_attach(ss, cgrp, tsk);
		}
	}
	return retval;
}

/**
 * cgroup_attach_task_all - attach task 'tsk' to all cgroups of task 'from'
 * @from: attach to all cgroups of a given task
 * @tsk: the task to be attached
 */
int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
{
	struct cgroupfs_root *root;
	int retval = 0;

	cgroup_lock();
	for_each_active_root(root) {
		struct cgroup *from_cg = task_cgroup_from_root(from, root);

		retval = cgroup_attach_task(from_cg, tsk);
		if (retval)
			break;
	}
	cgroup_unlock();

	return retval;
}
EXPORT_SYMBOL_GPL(cgroup_attach_task_all);
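
/*
 * Usage note (an illustrative sketch; owner/worker are hypothetical): a
 * kernel caller that spawns a helper thread on behalf of a user task can
 * keep the helper charged to the same hierarchies as its owner:
 *
 *	err = cgroup_attach_task_all(owner, worker);
 *	if (err)
 *		kthread_stop(worker);	// some hierarchy refused the move
 *
 * vhost, for example, uses this for its worker threads.
 */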

  /*
   * cgroup_attach_proc works in two stages, the first of which prefetches all
   * new css_sets needed (to make sure we have enough memory before committing
   * to the move) and stores them in a list of entries of the following type.
   * TODO: possible optimization: use css_set->rcu_head for chaining instead
   */
  struct cg_list_entry {
  	struct css_set *cg;
  	struct list_head links;
  };
  
  static bool css_set_check_fetched(struct cgroup *cgrp,
  				  struct task_struct *tsk, struct css_set *cg,
  				  struct list_head *newcg_list)
  {
  	struct css_set *newcg;
  	struct cg_list_entry *cg_entry;
  	struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT];
  
  	read_lock(&css_set_lock);
  	newcg = find_existing_css_set(cg, cgrp, template);
  	if (newcg)
  		get_css_set(newcg);
  	read_unlock(&css_set_lock);
  
  	/* doesn't exist at all? */
  	if (!newcg)
  		return false;
  	/* see if it's already in the list */
  	list_for_each_entry(cg_entry, newcg_list, links) {
  		if (cg_entry->cg == newcg) {
  			put_css_set(newcg);
  			return true;
  		}
  	}
  
  	/* not found */
  	put_css_set(newcg);
  	return false;
  }
  
  /*
   * Find the new css_set and store it in the list in preparation for moving the
   * given task to the given cgroup. Returns 0 or -ENOMEM.
   */
  static int css_set_prefetch(struct cgroup *cgrp, struct css_set *cg,
  			    struct list_head *newcg_list)
  {
  	struct css_set *newcg;
  	struct cg_list_entry *cg_entry;
  
  	/* ensure a new css_set will exist for this thread */
  	newcg = find_css_set(cg, cgrp);
  	if (!newcg)
  		return -ENOMEM;
  	/* add it to the list */
  	cg_entry = kmalloc(sizeof(struct cg_list_entry), GFP_KERNEL);
  	if (!cg_entry) {
  		put_css_set(newcg);
  		return -ENOMEM;
  	}
  	cg_entry->cg = newcg;
  	list_add(&cg_entry->links, newcg_list);
  	return 0;
  }
  
  /**
   * cgroup_attach_proc - attach all threads in a threadgroup to a cgroup
   * @cgrp: the cgroup to attach to
   * @leader: the threadgroup leader task_struct of the group to be attached
   *
   * Call holding cgroup_mutex and the threadgroup_fork_lock of the leader. Will
   * take task_lock of each thread in leader's threadgroup individually in turn.
   */
  int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
  {
  	int retval, i, group_size;
  	struct cgroup_subsys *ss, *failed_ss = NULL;
  	bool cancel_failed_ss = false;
  	/* guaranteed to be initialized later, but the compiler needs this */
  	struct cgroup *oldcgrp = NULL;
  	struct css_set *oldcg;
  	struct cgroupfs_root *root = cgrp->root;
  	/* threadgroup list cursor and array */
  	struct task_struct *tsk;
  	struct flex_array *group;
  	/*
  	 * we need to make sure we have css_sets for all the tasks we're
  	 * going to move -before- we actually start moving them, so that in
  	 * case we get an ENOMEM we can bail out before making any changes.
  	 */
  	struct list_head newcg_list;
  	struct cg_list_entry *cg_entry, *temp_nobe;
  
  	/*
  	 * step 0: in order to do expensive, possibly blocking operations for
  	 * every thread, we cannot iterate the thread group list, since it needs
  	 * rcu or tasklist locked. instead, build an array of all threads in the
  	 * group - threadgroup_fork_lock prevents new threads from appearing,
  	 * and if threads exit, this will just be an over-estimate.
  	 */
  	group_size = get_nr_threads(leader);
  	/* flex_array supports very large thread-groups better than kmalloc. */
  	group = flex_array_alloc(sizeof(struct task_struct *), group_size,
  				 GFP_KERNEL);
  	if (!group)
  		return -ENOMEM;
  	/* pre-allocate to guarantee space while iterating in rcu read-side. */
  	retval = flex_array_prealloc(group, 0, group_size - 1, GFP_KERNEL);
  	if (retval)
  		goto out_free_group_list;
  
  	/* prevent changes to the threadgroup list while we take a snapshot. */
  	rcu_read_lock();
  	if (!thread_group_leader(leader)) {
  		/*
  		 * a race with de_thread from another thread's exec() may strip
  		 * us of our leadership, making while_each_thread unsafe to use
  		 * on this task. if this happens, there is no choice but to
  		 * throw this task away and try again (from cgroup_procs_write);
  		 * this is "double-double-toil-and-trouble-check locking".
  		 */
  		rcu_read_unlock();
  		retval = -EAGAIN;
  		goto out_free_group_list;
  	}
  	/* take a reference on each task in the group to go in the array. */
  	tsk = leader;
  	i = 0;
  	do {
  		/* as per above, nr_threads may decrease, but not increase. */
  		BUG_ON(i >= group_size);
  		get_task_struct(tsk);
  		/*
  		 * saying GFP_ATOMIC has no effect here because we did prealloc
  		 * earlier, but it's good form to communicate our expectations.
  		 */
  		retval = flex_array_put_ptr(group, i, tsk, GFP_ATOMIC);
  		BUG_ON(retval != 0);
  		i++;
  	} while_each_thread(leader, tsk);
  	/* remember the number of threads in the array for later. */
  	group_size = i;
  	rcu_read_unlock();
  
  	/*
  	 * step 1: check that we can legitimately attach to the cgroup.
  	 */
  	for_each_subsys(root, ss) {
  		if (ss->can_attach) {
  			retval = ss->can_attach(ss, cgrp, leader);
  			if (retval) {
  				failed_ss = ss;
  				goto out_cancel_attach;
  			}
  		}
  		/* a callback to be run on every thread in the threadgroup. */
  		if (ss->can_attach_task) {
  			/* run on each task in the threadgroup. */
  			for (i = 0; i < group_size; i++) {
  				tsk = flex_array_get_ptr(group, i);
  				retval = ss->can_attach_task(cgrp, tsk);
  				if (retval) {
  					failed_ss = ss;
  					cancel_failed_ss = true;
  					goto out_cancel_attach;
  				}
  			}
  		}
  	}
  
  	/*
  	 * step 2: make sure css_sets exist for all threads to be migrated.
  	 * we use find_css_set, which allocates a new one if necessary.
  	 */
  	INIT_LIST_HEAD(&newcg_list);
  	for (i = 0; i < group_size; i++) {
  		tsk = flex_array_get_ptr(group, i);
  		/* nothing to do if this task is already in the cgroup */
  		oldcgrp = task_cgroup_from_root(tsk, root);
  		if (cgrp == oldcgrp)
  			continue;
  		/* get old css_set pointer */
  		task_lock(tsk);
  		if (tsk->flags & PF_EXITING) {
  			/* ignore this task if it's going away */
  			task_unlock(tsk);
  			continue;
  		}
  		oldcg = tsk->cgroups;
  		get_css_set(oldcg);
  		task_unlock(tsk);
  		/* see if the new one for us is already in the list? */
  		if (css_set_check_fetched(cgrp, tsk, oldcg, &newcg_list)) {
  			/* was already there, nothing to do. */
  			put_css_set(oldcg);
  		} else {
  			/* we don't already have it. get new one. */
  			retval = css_set_prefetch(cgrp, oldcg, &newcg_list);
  			put_css_set(oldcg);
  			if (retval)
  				goto out_list_teardown;
  		}
  	}
  
  	/*
  	 * step 3: now that we're guaranteed success wrt the css_sets, proceed
  	 * to move all tasks to the new cgroup, calling ss->attach_task for each
  	 * one along the way. there are no failure cases after here, so this is
  	 * the commit point.
  	 */
  	for_each_subsys(root, ss) {
  		if (ss->pre_attach)
  			ss->pre_attach(cgrp);
  	}
  	for (i = 0; i < group_size; i++) {
  		tsk = flex_array_get_ptr(group, i);
  		/* leave current thread as it is if it's already there */
  		oldcgrp = task_cgroup_from_root(tsk, root);
  		if (cgrp == oldcgrp)
  			continue;
  		/* attach each task to each subsystem */
  		for_each_subsys(root, ss) {
  			if (ss->attach_task)
  				ss->attach_task(cgrp, tsk);
  		}
  		/* if the thread is PF_EXITING, it can just get skipped. */
  		retval = cgroup_task_migrate(cgrp, oldcgrp, tsk, true);
  		BUG_ON(retval != 0 && retval != -ESRCH);
  	}
  	/* nothing is sensitive to fork() after this point. */
  
  	/*
  	 * step 4: do expensive, non-thread-specific subsystem callbacks.
  	 * TODO: if ever a subsystem needs to know the oldcgrp for each task
  	 * being moved, this call will need to be reworked to communicate that.
  	 */
  	for_each_subsys(root, ss) {
  		if (ss->attach)
  			ss->attach(ss, cgrp, oldcgrp, leader);
  	}
  
  	/*
  	 * step 5: success! and cleanup
  	 */
  	synchronize_rcu();
  	cgroup_wakeup_rmdir_waiter(cgrp);
  	retval = 0;
  out_list_teardown:
  	/* clean up the list of prefetched css_sets. */
  	list_for_each_entry_safe(cg_entry, temp_nobe, &newcg_list, links) {
  		list_del(&cg_entry->links);
  		put_css_set(cg_entry->cg);
  		kfree(cg_entry);
  	}
  out_cancel_attach:
  	/* same deal as in cgroup_attach_task */
  	if (retval) {
  		for_each_subsys(root, ss) {
  			if (ss == failed_ss) {
  				if (cancel_failed_ss && ss->cancel_attach)
  					ss->cancel_attach(ss, cgrp, leader);
  				break;
  			}
  			if (ss->cancel_attach)
  				ss->cancel_attach(ss, cgrp, leader);
  		}
  	}
  	/* clean up the array of referenced threads in the group. */
  	for (i = 0; i < group_size; i++) {
  		tsk = flex_array_get_ptr(group, i);
  		put_task_struct(tsk);
  	}
  out_free_group_list:
  	flex_array_free(group);
  	return retval;
  }
  
  /*
   * Find the task_struct of the task to attach by vpid and pass it along to the
   * function to attach either it or all tasks in its threadgroup. Will take
   * cgroup_mutex; may take task_lock of task.
 */
static int attach_task_by_pid(struct cgroup *cgrp, u64 pid, bool threadgroup)
{
	struct task_struct *tsk;
	const struct cred *cred = current_cred(), *tcred;
	int ret;

	if (!cgroup_lock_live_group(cgrp))
		return -ENODEV;

	if (pid) {
		rcu_read_lock();
		tsk = find_task_by_vpid(pid);
		if (!tsk) {
			rcu_read_unlock();
			cgroup_unlock();
			return -ESRCH;
		}
		if (threadgroup) {
			/*
			 * RCU protects this access, since tsk was found in the
			 * tid map. a race with de_thread may cause group_leader
			 * to stop being the leader, but cgroup_attach_proc will
			 * detect it later.
			 */
			tsk = tsk->group_leader;
		} else if (tsk->flags & PF_EXITING) {
			/* optimization for the single-task-only case */
			rcu_read_unlock();
			cgroup_unlock();
			return -ESRCH;
		}

		/*
		 * even if we're attaching all tasks in the thread group, we
		 * only need to check permissions on one of them.
		 */
		tcred = __task_cred(tsk);
		if (cred->euid &&
		    cred->euid != tcred->uid &&
		    cred->euid != tcred->suid) {
			rcu_read_unlock();
			cgroup_unlock();
			return -EACCES;
		}
		get_task_struct(tsk);
		rcu_read_unlock();
	} else {
		if (threadgroup)
			tsk = current->group_leader;
		else
			tsk = current;
		get_task_struct(tsk);
	}

	if (threadgroup) {
		threadgroup_fork_write_lock(tsk);
		ret = cgroup_attach_proc(cgrp, tsk);
		threadgroup_fork_write_unlock(tsk);
	} else {
		ret = cgroup_attach_task(cgrp, tsk);
	}
	put_task_struct(tsk);
	cgroup_unlock();
	return ret;
}

static int cgroup_tasks_write(struct cgroup *cgrp, struct cftype *cft, u64 pid)
{
	return attach_task_by_pid(cgrp, pid, false);
}

static int cgroup_procs_write(struct cgroup *cgrp, struct cftype *cft, u64 tgid)
{
	int ret;
	do {
		/*
		 * attach_proc fails with -EAGAIN if threadgroup leadership
		 * changes in the middle of the operation, in which case we need
		 * to find the task_struct for the new leader and start over.
		 */
		ret = attach_task_by_pid(cgrp, tgid, true);
	} while (ret == -EAGAIN);
	return ret;
}

/**
 * cgroup_lock_live_group - take cgroup_mutex and check that cgrp is alive.
 * @cgrp: the cgroup to be checked for liveness
 *
 * On success, returns true; the lock should be later released with
 * cgroup_unlock(). On failure returns false with no lock held.
 */
bool cgroup_lock_live_group(struct cgroup *cgrp)
{
	mutex_lock(&cgroup_mutex);
	if (cgroup_is_removed(cgrp)) {
		mutex_unlock(&cgroup_mutex);
		return false;
	}
	return true;
}
EXPORT_SYMBOL_GPL(cgroup_lock_live_group);

static int cgroup_release_agent_write(struct cgroup *cgrp, struct cftype *cft,
				      const char *buffer)
{
	BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX);
	if (strlen(buffer) >= PATH_MAX)
		return -EINVAL;
	if (!cgroup_lock_live_group(cgrp))
		return -ENODEV;
	strcpy(cgrp->root->release_agent_path, buffer);
	cgroup_unlock();
	return 0;
}

static int cgroup_release_agent_show(struct cgroup *cgrp, struct cftype *cft,
				     struct seq_file *seq)
{
	if (!cgroup_lock_live_group(cgrp))
		return -ENODEV;
	seq_puts(seq, cgrp->root->release_agent_path);
	seq_putc(seq, '\n');
	cgroup_unlock();
	return 0;
}
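
/*
 * Usage note (illustrative; paths are assumptions): these two handlers back
 * the "release_agent" control file at the root of a hierarchy, so the agent
 * set at mount time can be inspected or replaced later:
 *
 *	echo /sbin/cgagent > /mnt/cg/release_agent
 *	cat /mnt/cg/release_agent
 *
 * The agent is invoked with the path of a notify_on_release cgroup that has
 * just become empty.
 */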

/* A buffer size big enough for numbers or short strings */
#define CGROUP_LOCAL_BUFFER_SIZE 64

static ssize_t cgroup_write_X64(struct cgroup *cgrp, struct cftype *cft,
				struct file *file,
				const char __user *userbuf,
				size_t nbytes, loff_t *unused_ppos)
{
	char buffer[CGROUP_LOCAL_BUFFER_SIZE];
	int retval = 0;
	char *end;

	if (!nbytes)
		return -EINVAL;
	if (nbytes >= sizeof(buffer))
		return -E2BIG;
	if (copy_from_user(buffer, userbuf, nbytes))
		return -EFAULT;

	buffer[nbytes] = 0;     /* nul-terminate */
	if (cft->write_u64) {
		u64 val = simple_strtoull(strstrip(buffer), &end, 0);
		if (*end)
			return -EINVAL;
		retval = cft->write_u64(cgrp, cft, val);
	} else {
		s64 val = simple_strtoll(strstrip(buffer), &end, 0);
		if (*end)
			return -EINVAL;
		retval = cft->write_s64(cgrp, cft, val);
	}
	if (!retval)
		retval = nbytes;
	return retval;
}

static ssize_t cgroup_write_string(struct cgroup *cgrp, struct cftype *cft,
				   struct file *file,
				   const char __user *userbuf,
				   size_t nbytes, loff_t *unused_ppos)
{
	char local_buffer[CGROUP_LOCAL_BUFFER_SIZE];
	int retval = 0;
	size_t max_bytes = cft->max_write_len;
	char *buffer = local_buffer;

	if (!max_bytes)
		max_bytes = sizeof(local_buffer) - 1;
	if (nbytes >= max_bytes)
		return -E2BIG;
	/* Allocate a dynamic buffer if we need one */
	if (nbytes >= sizeof(local_buffer)) {
		buffer = kmalloc(nbytes + 1, GFP_KERNEL);
		if (buffer == NULL)
			return -ENOMEM;
	}
	if (nbytes && copy_from_user(buffer, userbuf, nbytes)) {
		retval = -EFAULT;
		goto out;
	}

	buffer[nbytes] = 0;     /* nul-terminate */
	retval = cft->write_string(cgrp, cft, strstrip(buffer));
	if (!retval)
		retval = nbytes;
out:
	if (buffer != local_buffer)
		kfree(buffer);
	return retval;
}

static ssize_t cgroup_file_write(struct file *file, const char __user *buf,
						size_t nbytes, loff_t *ppos)
{
	struct cftype *cft = __d_cft(file->f_dentry);
	struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);

	if (cgroup_is_removed(cgrp))
		return -ENODEV;
	if (cft->write)
		return cft->write(cgrp, cft, file, buf, nbytes, ppos);
	if (cft->write_u64 || cft->write_s64)
		return cgroup_write_X64(cgrp, cft, file, buf, nbytes, ppos);
	if (cft->write_string)
		return cgroup_write_string(cgrp, cft, file, buf, nbytes, ppos);
	if (cft->trigger) {
		int ret = cft->trigger(cgrp, (unsigned int)cft->private);
		return ret ? ret : nbytes;
	}
	return -EINVAL;
}
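
/*
 * Usage sketch (hypothetical controller, not part of this file): the
 * dispatch above means a subsystem declares only the handler matching its
 * file's data type, e.g. a u64-valued "weight" file:
 */
static u64 example_weight_read(struct cgroup *cgrp, struct cftype *cft)
{
	return 10;	/* hypothetical: report the group's weight */
}

static int example_weight_write(struct cgroup *cgrp, struct cftype *cft,
				u64 val)
{
	return val ? 0 : -EINVAL;	/* hypothetical: store the weight */
}

static struct cftype example_files[] = {
	{
		.name = "weight",
		.read_u64 = example_weight_read,	/* shown via cgroup_read_u64() */
		.write_u64 = example_weight_write,	/* parsed by cgroup_write_X64() */
	},
};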

static ssize_t cgroup_read_u64(struct cgroup *cgrp, struct cftype *cft,
			       struct file *file,
			       char __user *buf, size_t nbytes,
			       loff_t *ppos)
{
	char tmp[CGROUP_LOCAL_BUFFER_SIZE];
	u64 val = cft->read_u64(cgrp, cft);
	int len = sprintf(tmp, "%llu\n", (unsigned long long) val);

	return simple_read_from_buffer(buf, nbytes, ppos, tmp, len);
}

static ssize_t cgroup_read_s64(struct cgroup *cgrp, struct cftype *cft,
			       struct file *file,
			       char __user *buf, size_t nbytes,
			       loff_t *ppos)
{
	char tmp[CGROUP_LOCAL_BUFFER_SIZE];
	s64 val = cft->read_s64(cgrp, cft);
	int len = sprintf(tmp, "%lld\n", (long long) val);

	return simple_read_from_buffer(buf, nbytes, ppos, tmp, len);
}
ddbcc7e8e   Paul Menage   Task Control Grou...
2340
2341
2342
2343
  static ssize_t cgroup_file_read(struct file *file, char __user *buf,
  				   size_t nbytes, loff_t *ppos)
  {
  	struct cftype *cft = __d_cft(file->f_dentry);
bd89aabc6   Paul Menage   Control groups: R...
2344
  	struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);
ddbcc7e8e   Paul Menage   Task Control Grou...
2345

75139b827   Li Zefan   cgroups: remove s...
2346
  	if (cgroup_is_removed(cgrp))
ddbcc7e8e   Paul Menage   Task Control Grou...
2347
2348
2349
  		return -ENODEV;
  
  	if (cft->read)
bd89aabc6   Paul Menage   Control groups: R...
2350
  		return cft->read(cgrp, cft, file, buf, nbytes, ppos);
f4c753b7e   Paul Menage   CGroup API files:...
2351
2352
  	if (cft->read_u64)
  		return cgroup_read_u64(cgrp, cft, file, buf, nbytes, ppos);
e73d2c61d   Paul Menage   CGroups _s64 file...
2353
2354
  	if (cft->read_s64)
  		return cgroup_read_s64(cgrp, cft, file, buf, nbytes, ppos);
ddbcc7e8e   Paul Menage   Task Control Grou...
2355
2356
  	return -EINVAL;
  }
917965696   Paul Menage   CGroup API files:...
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
  /*
   * seqfile ops/methods for returning structured data. Currently just
   * supports string->u64 maps, but can be extended in future.
   */
  
  struct cgroup_seqfile_state {
  	struct cftype *cft;
  	struct cgroup *cgroup;
  };
  
  static int cgroup_map_add(struct cgroup_map_cb *cb, const char *key, u64 value)
  {
  	struct seq_file *sf = cb->state;
  	return seq_printf(sf, "%s %llu
  ", key, (unsigned long long)value);
  }
  
  static int cgroup_seqfile_show(struct seq_file *m, void *arg)
  {
  	struct cgroup_seqfile_state *state = m->private;
  	struct cftype *cft = state->cft;
29486df32   Serge E. Hallyn   cgroups: introduc...
2378
2379
2380
2381
2382
2383
2384
2385
  	if (cft->read_map) {
  		struct cgroup_map_cb cb = {
  			.fill = cgroup_map_add,
  			.state = m,
  		};
  		return cft->read_map(state->cgroup, cft, &cb);
  	}
  	return cft->read_seq_string(state->cgroup, cft, m);
917965696   Paul Menage   CGroup API files:...
2386
  }
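  /*
   * Illustrative sketch (an assumption, not taken from this file): a
   * subsystem's ->read_map callback emits key/value pairs through the
   * callback structure built above, e.g.:
   *
   *	static int example_read_map(struct cgroup *cgrp, struct cftype *cft,
   *				    struct cgroup_map_cb *cb)
   *	{
   *		return cb->fill(cb, "example_key", 42);
   *	}
   *
   * cb->fill() lands in cgroup_map_add(), which prints "example_key 42"
   * followed by a newline into the seq_file.
   */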
96930a636   Adrian Bunk   make cgroup_seqfi...
2387
  static int cgroup_seqfile_release(struct inode *inode, struct file *file)
917965696   Paul Menage   CGroup API files:...
2388
2389
2390
2391
2392
  {
  	struct seq_file *seq = file->private_data;
  	kfree(seq->private);
  	return single_release(inode, file);
  }
828c09509   Alexey Dobriyan   const: constify r...
2393
  static const struct file_operations cgroup_seqfile_operations = {
917965696   Paul Menage   CGroup API files:...
2394
  	.read = seq_read,
e788e066c   Paul Menage   cgroup files: mov...
2395
  	.write = cgroup_file_write,
917965696   Paul Menage   CGroup API files:...
2396
2397
2398
  	.llseek = seq_lseek,
  	.release = cgroup_seqfile_release,
  };
ddbcc7e8e   Paul Menage   Task Control Grou...
2399
2400
2401
2402
2403
2404
2405
2406
  static int cgroup_file_open(struct inode *inode, struct file *file)
  {
  	int err;
  	struct cftype *cft;
  
  	err = generic_file_open(inode, file);
  	if (err)
  		return err;
ddbcc7e8e   Paul Menage   Task Control Grou...
2407
  	cft = __d_cft(file->f_dentry);
75139b827   Li Zefan   cgroups: remove s...
2408

29486df32   Serge E. Hallyn   cgroups: introduc...
2409
  	if (cft->read_map || cft->read_seq_string) {
917965696   Paul Menage   CGroup API files:...
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
  		struct cgroup_seqfile_state *state =
  			kzalloc(sizeof(*state), GFP_USER);
  		if (!state)
  			return -ENOMEM;
  		state->cft = cft;
  		state->cgroup = __d_cgrp(file->f_dentry->d_parent);
  		file->f_op = &cgroup_seqfile_operations;
  		err = single_open(file, cgroup_seqfile_show, state);
  		if (err < 0)
  			kfree(state);
  	} else if (cft->open)
ddbcc7e8e   Paul Menage   Task Control Grou...
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
  		err = cft->open(inode, file);
  	else
  		err = 0;
  
  	return err;
  }
  
  static int cgroup_file_release(struct inode *inode, struct file *file)
  {
  	struct cftype *cft = __d_cft(file->f_dentry);
  	if (cft->release)
  		return cft->release(inode, file);
  	return 0;
  }
  
  /*
   * cgroup_rename - Only allow simple rename of directories in place.
   */
  static int cgroup_rename(struct inode *old_dir, struct dentry *old_dentry,
  			    struct inode *new_dir, struct dentry *new_dentry)
  {
  	if (!S_ISDIR(old_dentry->d_inode->i_mode))
  		return -ENOTDIR;
  	if (new_dentry->d_inode)
  		return -EEXIST;
  	if (old_dir != new_dir)
  		return -EIO;
  	return simple_rename(old_dir, old_dentry, new_dir, new_dentry);
  }
828c09509   Alexey Dobriyan   const: constify r...
2450
  static const struct file_operations cgroup_file_operations = {
ddbcc7e8e   Paul Menage   Task Control Grou...
2451
2452
2453
2454
2455
2456
  	.read = cgroup_file_read,
  	.write = cgroup_file_write,
  	.llseek = generic_file_llseek,
  	.open = cgroup_file_open,
  	.release = cgroup_file_release,
  };
6e1d5dcc2   Alexey Dobriyan   const: mark remai...
2457
  static const struct inode_operations cgroup_dir_inode_operations = {
c72a04e34   Al Viro   cgroup_fs: fix cg...
2458
  	.lookup = cgroup_lookup,
ddbcc7e8e   Paul Menage   Task Control Grou...
2459
2460
2461
2462
  	.mkdir = cgroup_mkdir,
  	.rmdir = cgroup_rmdir,
  	.rename = cgroup_rename,
  };
c72a04e34   Al Viro   cgroup_fs: fix cg...
2463
2464
2465
2466
2467
2468
2469
  static struct dentry *cgroup_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
  {
  	if (dentry->d_name.len > NAME_MAX)
  		return ERR_PTR(-ENAMETOOLONG);
  	d_add(dentry, NULL);
  	return NULL;
  }
0dea11687   Kirill A. Shutemov   cgroup: implement...
2470
2471
2472
2473
2474
2475
2476
2477
2478
  /*
   * Check if a file is a control file
   */
  static inline struct cftype *__file_cft(struct file *file)
  {
  	if (file->f_dentry->d_inode->i_fop != &cgroup_file_operations)
  		return ERR_PTR(-EINVAL);
  	return __d_cft(file->f_dentry);
  }
5adcee1d8   Nick Piggin   cgroup fs: avoid ...
2479
2480
2481
  static int cgroup_create_file(struct dentry *dentry, mode_t mode,
  				struct super_block *sb)
  {
ddbcc7e8e   Paul Menage   Task Control Grou...
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
  	struct inode *inode;
  
  	if (!dentry)
  		return -ENOENT;
  	if (dentry->d_inode)
  		return -EEXIST;
  
  	inode = cgroup_new_inode(mode, sb);
  	if (!inode)
  		return -ENOMEM;
  
  	if (S_ISDIR(mode)) {
  		inode->i_op = &cgroup_dir_inode_operations;
  		inode->i_fop = &simple_dir_operations;
  
  		/* start off with i_nlink == 2 (for "." entry) */
  		inc_nlink(inode);
  
  		/* start with the directory inode held, so that we can
  		 * populate it without racing with another mkdir */
817929ec2   Paul Menage   Task Control Grou...
2502
  		mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD);
ddbcc7e8e   Paul Menage   Task Control Grou...
2503
2504
2505
2506
  	} else if (S_ISREG(mode)) {
  		inode->i_size = 0;
  		inode->i_fop = &cgroup_file_operations;
  	}
ddbcc7e8e   Paul Menage   Task Control Grou...
2507
2508
2509
2510
2511
2512
  	d_instantiate(dentry, inode);
  	dget(dentry);	/* Extra count - pin the dentry in core */
  	return 0;
  }
  
  /*
a043e3b2c   Li Zefan   cgroup: fix comments
2513
2514
2515
2516
2517
   * cgroup_create_dir - create a directory for an object.
   * @cgrp: the cgroup we create the directory for. It must have a valid
   *        ->parent field. And we are going to fill its ->dentry field.
   * @dentry: dentry of the new cgroup
   * @mode: mode to set on new directory.
ddbcc7e8e   Paul Menage   Task Control Grou...
2518
   */
bd89aabc6   Paul Menage   Control groups: R...
2519
  static int cgroup_create_dir(struct cgroup *cgrp, struct dentry *dentry,
099fca322   Li Zefan   cgroups: show cor...
2520
  				mode_t mode)
ddbcc7e8e   Paul Menage   Task Control Grou...
2521
2522
2523
  {
  	struct dentry *parent;
  	int error = 0;
bd89aabc6   Paul Menage   Control groups: R...
2524
2525
  	parent = cgrp->parent->dentry;
  	error = cgroup_create_file(dentry, S_IFDIR | mode, cgrp->root->sb);
ddbcc7e8e   Paul Menage   Task Control Grou...
2526
  	if (!error) {
bd89aabc6   Paul Menage   Control groups: R...
2527
  		dentry->d_fsdata = cgrp;
ddbcc7e8e   Paul Menage   Task Control Grou...
2528
  		inc_nlink(parent->d_inode);
a47295e6b   Paul Menage   cgroups: make cgr...
2529
  		rcu_assign_pointer(cgrp->dentry, dentry);
ddbcc7e8e   Paul Menage   Task Control Grou...
2530
2531
2532
2533
2534
2535
  		dget(dentry);
  	}
  	dput(dentry);
  
  	return error;
  }
099fca322   Li Zefan   cgroups: show cor...
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
  /**
   * cgroup_file_mode - deduce file mode of a control file
   * @cft: the control file in question
   *
   * returns cft->mode if ->mode is not 0
   * returns S_IRUGO|S_IWUSR if it has both a read and a write handler
   * returns S_IRUGO if it has only a read handler
   * returns S_IWUSR if it has only a write handler
   */
  static mode_t cgroup_file_mode(const struct cftype *cft)
  {
  	mode_t mode = 0;
  
  	if (cft->mode)
  		return cft->mode;
  
  	if (cft->read || cft->read_u64 || cft->read_s64 ||
  	    cft->read_map || cft->read_seq_string)
  		mode |= S_IRUGO;
  
  	if (cft->write || cft->write_u64 || cft->write_s64 ||
  	    cft->write_string || cft->trigger)
  		mode |= S_IWUSR;
  
  	return mode;
  }
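  /*
   * Worked example: a cftype that supplies both .read_u64 and .write_u64
   * and leaves .mode at 0 gets S_IRUGO | S_IWUSR (0644); one supplying
   * only .write_string gets S_IWUSR (0200).
   */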
bd89aabc6   Paul Menage   Control groups: R...
2562
  int cgroup_add_file(struct cgroup *cgrp,
ddbcc7e8e   Paul Menage   Task Control Grou...
2563
2564
2565
  		       struct cgroup_subsys *subsys,
  		       const struct cftype *cft)
  {
bd89aabc6   Paul Menage   Control groups: R...
2566
  	struct dentry *dir = cgrp->dentry;
ddbcc7e8e   Paul Menage   Task Control Grou...
2567
2568
  	struct dentry *dentry;
  	int error;
099fca322   Li Zefan   cgroups: show cor...
2569
  	mode_t mode;
ddbcc7e8e   Paul Menage   Task Control Grou...
2570
2571
  
  	char name[MAX_CGROUP_TYPE_NAMELEN + MAX_CFTYPE_NAME + 2] = { 0 };
bd89aabc6   Paul Menage   Control groups: R...
2572
  	if (subsys && !test_bit(ROOT_NOPREFIX, &cgrp->root->flags)) {
ddbcc7e8e   Paul Menage   Task Control Grou...
2573
2574
2575
2576
2577
2578
2579
  		strcpy(name, subsys->name);
  		strcat(name, ".");
  	}
  	strcat(name, cft->name);
  	BUG_ON(!mutex_is_locked(&dir->d_inode->i_mutex));
  	dentry = lookup_one_len(name, dir, strlen(name));
  	if (!IS_ERR(dentry)) {
099fca322   Li Zefan   cgroups: show cor...
2580
2581
  		mode = cgroup_file_mode(cft);
  		error = cgroup_create_file(dentry, mode | S_IFREG,
bd89aabc6   Paul Menage   Control groups: R...
2582
  						cgrp->root->sb);
ddbcc7e8e   Paul Menage   Task Control Grou...
2583
2584
2585
2586
2587
2588
2589
  		if (!error)
  			dentry->d_fsdata = (void *)cft;
  		dput(dentry);
  	} else
  		error = PTR_ERR(dentry);
  	return error;
  }
e6a1105ba   Ben Blum   cgroups: subsyste...
2590
  EXPORT_SYMBOL_GPL(cgroup_add_file);
ddbcc7e8e   Paul Menage   Task Control Grou...
2591

bd89aabc6   Paul Menage   Control groups: R...
2592
  int cgroup_add_files(struct cgroup *cgrp,
ddbcc7e8e   Paul Menage   Task Control Grou...
2593
2594
2595
2596
2597
2598
  			struct cgroup_subsys *subsys,
  			const struct cftype cft[],
  			int count)
  {
  	int i, err;
  	for (i = 0; i < count; i++) {
bd89aabc6   Paul Menage   Control groups: R...
2599
  		err = cgroup_add_file(cgrp, subsys, &cft[i]);
ddbcc7e8e   Paul Menage   Task Control Grou...
2600
2601
2602
2603
2604
  		if (err)
  			return err;
  	}
  	return 0;
  }
e6a1105ba   Ben Blum   cgroups: subsyste...
2605
  EXPORT_SYMBOL_GPL(cgroup_add_files);
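  /*
   * Usage sketch (assumed, following the ->populate() pattern used by
   * cgroup_populate_dir() later in this file): a subsystem registers a
   * static array of control files when its directory is populated:
   *
   *	static struct cftype example_files[] = {
   *		{
   *			.name = "usage",
   *			.read_u64 = example_usage_read,
   *		},
   *	};
   *
   *	static int example_populate(struct cgroup_subsys *ss,
   *				    struct cgroup *cgrp)
   *	{
   *		return cgroup_add_files(cgrp, ss, example_files,
   *					ARRAY_SIZE(example_files));
   *	}
   *
   * example_usage_read() is a hypothetical ->read_u64 handler; the file
   * shows up as "<subsys>.usage" unless the root was mounted with the
   * noprefix option checked in cgroup_add_file() above.
   */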
ddbcc7e8e   Paul Menage   Task Control Grou...
2606

a043e3b2c   Li Zefan   cgroup: fix comments
2607
2608
2609
2610
2611
2612
  /**
   * cgroup_task_count - count the number of tasks in a cgroup.
   * @cgrp: the cgroup in question
   *
   * Return the number of tasks in the cgroup.
   */
bd89aabc6   Paul Menage   Control groups: R...
2613
  int cgroup_task_count(const struct cgroup *cgrp)
bbcb81d09   Paul Menage   Task Control Grou...
2614
2615
  {
  	int count = 0;
71cbb949d   KOSAKI Motohiro   cgroup: list_for_...
2616
  	struct cg_cgroup_link *link;
817929ec2   Paul Menage   Task Control Grou...
2617
2618
  
  	read_lock(&css_set_lock);
71cbb949d   KOSAKI Motohiro   cgroup: list_for_...
2619
  	list_for_each_entry(link, &cgrp->css_sets, cgrp_link_list) {
146aa1bd0   Lai Jiangshan   cgroups: fix prob...
2620
  		count += atomic_read(&link->cg->refcount);
817929ec2   Paul Menage   Task Control Grou...
2621
2622
  	}
  	read_unlock(&css_set_lock);
bbcb81d09   Paul Menage   Task Control Grou...
2623
2624
2625
2626
  	return count;
  }
  
  /*
817929ec2   Paul Menage   Task Control Grou...
2627
2628
2629
   * Advance a list_head iterator.  The iterator should be positioned at
   * the start of a css_set
   */
bd89aabc6   Paul Menage   Control groups: R...
2630
  static void cgroup_advance_iter(struct cgroup *cgrp,
7717f7ba9   Paul Menage   cgroups: add a ba...
2631
  				struct cgroup_iter *it)
817929ec2   Paul Menage   Task Control Grou...
2632
2633
2634
2635
2636
2637
2638
2639
  {
  	struct list_head *l = it->cg_link;
  	struct cg_cgroup_link *link;
  	struct css_set *cg;
  
  	/* Advance to the next non-empty css_set */
  	do {
  		l = l->next;
bd89aabc6   Paul Menage   Control groups: R...
2640
  		if (l == &cgrp->css_sets) {
817929ec2   Paul Menage   Task Control Grou...
2641
2642
2643
  			it->cg_link = NULL;
  			return;
  		}
bd89aabc6   Paul Menage   Control groups: R...
2644
  		link = list_entry(l, struct cg_cgroup_link, cgrp_link_list);
817929ec2   Paul Menage   Task Control Grou...
2645
2646
2647
2648
2649
  		cg = link->cg;
  	} while (list_empty(&cg->tasks));
  	it->cg_link = l;
  	it->task = cg->tasks.next;
  }
31a7df01f   Cliff Wickman   cgroups: mechanis...
2650
2651
2652
2653
2654
2655
2656
2657
2658
  /*
   * To reduce the fork() overhead for systems that are not actually
   * using their cgroups capability, we don't maintain the lists running
   * through each css_set to its tasks until we see the list actually
   * used - in other words after the first call to cgroup_iter_start().
   *
   * The tasklist_lock is not held here, as do_each_thread() and
   * while_each_thread() are protected by RCU.
   */
3df91fe30   Adrian Bunk   make cgroup_enabl...
2659
  static void cgroup_enable_task_cg_lists(void)
31a7df01f   Cliff Wickman   cgroups: mechanis...
2660
2661
2662
2663
2664
2665
  {
  	struct task_struct *p, *g;
  	write_lock(&css_set_lock);
  	use_task_css_set_links = 1;
  	do_each_thread(g, p) {
  		task_lock(p);
0e04388f0   Li Zefan   cgroup: fix a rac...
2666
2667
2668
2669
2670
2671
  		/*
  		 * We should check if the process is exiting, otherwise
  		 * it will race with cgroup_exit() in that the list
  		 * entry won't be deleted even though the process has exited.
  		 */
  		if (!(p->flags & PF_EXITING) && list_empty(&p->cg_list))
31a7df01f   Cliff Wickman   cgroups: mechanis...
2672
2673
2674
2675
2676
  			list_add(&p->cg_list, &p->cgroups->tasks);
  		task_unlock(p);
  	} while_each_thread(g, p);
  	write_unlock(&css_set_lock);
  }
bd89aabc6   Paul Menage   Control groups: R...
2677
  void cgroup_iter_start(struct cgroup *cgrp, struct cgroup_iter *it)
817929ec2   Paul Menage   Task Control Grou...
2678
2679
2680
2681
2682
2683
  {
  	/*
  	 * The first time anyone tries to iterate across a cgroup,
  	 * we need to enable the list linking each css_set to its
  	 * tasks, and fix up all existing tasks.
  	 */
31a7df01f   Cliff Wickman   cgroups: mechanis...
2684
2685
  	if (!use_task_css_set_links)
  		cgroup_enable_task_cg_lists();
817929ec2   Paul Menage   Task Control Grou...
2686
  	read_lock(&css_set_lock);
bd89aabc6   Paul Menage   Control groups: R...
2687
2688
  	it->cg_link = &cgrp->css_sets;
  	cgroup_advance_iter(cgrp, it);
817929ec2   Paul Menage   Task Control Grou...
2689
  }
bd89aabc6   Paul Menage   Control groups: R...
2690
  struct task_struct *cgroup_iter_next(struct cgroup *cgrp,
817929ec2   Paul Menage   Task Control Grou...
2691
2692
2693
2694
  					struct cgroup_iter *it)
  {
  	struct task_struct *res;
  	struct list_head *l = it->task;
2019f634c   Lai Jiangshan   cgroups: fix cgro...
2695
  	struct cg_cgroup_link *link;
817929ec2   Paul Menage   Task Control Grou...
2696
2697
2698
2699
2700
2701
2702
  
  	/* If the iterator cg is NULL, we have no tasks */
  	if (!it->cg_link)
  		return NULL;
  	res = list_entry(l, struct task_struct, cg_list);
  	/* Advance iterator to find next entry */
  	l = l->next;
2019f634c   Lai Jiangshan   cgroups: fix cgro...
2703
2704
  	link = list_entry(it->cg_link, struct cg_cgroup_link, cgrp_link_list);
  	if (l == &link->cg->tasks) {
817929ec2   Paul Menage   Task Control Grou...
2705
2706
  		/* We reached the end of this task list - move on to
  		 * the next cg_cgroup_link */
bd89aabc6   Paul Menage   Control groups: R...
2707
  		cgroup_advance_iter(cgrp, it);
817929ec2   Paul Menage   Task Control Grou...
2708
2709
2710
2711
2712
  	} else {
  		it->task = l;
  	}
  	return res;
  }
bd89aabc6   Paul Menage   Control groups: R...
2713
  void cgroup_iter_end(struct cgroup *cgrp, struct cgroup_iter *it)
817929ec2   Paul Menage   Task Control Grou...
2714
2715
2716
  {
  	read_unlock(&css_set_lock);
  }
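  /*
   * Canonical use of the iterator triple above, as followed by
   * cgroupstats_build() and pidlist_array_load() later in this file:
   *
   *	struct cgroup_iter it;
   *	struct task_struct *tsk;
   *
   *	cgroup_iter_start(cgrp, &it);
   *	while ((tsk = cgroup_iter_next(cgrp, &it)))
   *		...;	(css_set_lock is read-held here, so don't sleep)
   *	cgroup_iter_end(cgrp, &it);
   */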
31a7df01f   Cliff Wickman   cgroups: mechanis...
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
  static inline int started_after_time(struct task_struct *t1,
  				     struct timespec *time,
  				     struct task_struct *t2)
  {
  	int start_diff = timespec_compare(&t1->start_time, time);
  	if (start_diff > 0) {
  		return 1;
  	} else if (start_diff < 0) {
  		return 0;
  	} else {
  		/*
  		 * Arbitrarily, if two processes started at the same
  		 * time, we'll say that the lower pointer value
  		 * started first. Note that t2 may have exited by now
  		 * so this may not be a valid pointer any longer, but
  		 * that's fine - it still serves to distinguish
  		 * between two tasks started (effectively) simultaneously.
  		 */
  		return t1 > t2;
  	}
  }
  
  /*
   * This function is a callback from heap_insert() and is used to order
   * the heap.
   * In this case we order the heap in descending task start time.
   */
  static inline int started_after(void *p1, void *p2)
  {
  	struct task_struct *t1 = p1;
  	struct task_struct *t2 = p2;
  	return started_after_time(t1, &t2->start_time, t2);
  }
  
  /**
   * cgroup_scan_tasks - iterate though all the tasks in a cgroup
   * @scan: struct cgroup_scanner containing arguments for the scan
   *
   * Arguments include pointers to callback functions test_task() and
   * process_task().
   * Iterate through all the tasks in a cgroup, calling test_task() for each,
   * and if it returns true, call process_task() for it also.
   * The test_task pointer may be NULL, meaning always true (select all tasks).
   * Effectively duplicates cgroup_iter_{start,next,end}()
   * but does not lock css_set_lock for the call to process_task().
   * The struct cgroup_scanner may be embedded in any structure of the caller's
   * creation.
   * It is guaranteed that process_task() will act on every task that
   * is a member of the cgroup for the duration of this call. This
   * function may or may not call process_task() for tasks that exit
   * or move to a different cgroup during the call, or are forked or
   * move into the cgroup during the call.
   *
   * Note that test_task() may be called with locks held, and may in some
   * situations be called multiple times for the same task, so it should
   * be cheap.
   * If the heap pointer in the struct cgroup_scanner is non-NULL, a heap has been
   * pre-allocated and will be used for heap operations (and its "gt" member will
   * be overwritten), else a temporary heap will be used (allocation of which
   * may cause this function to fail).
   */
  int cgroup_scan_tasks(struct cgroup_scanner *scan)
  {
  	int retval, i;
  	struct cgroup_iter it;
  	struct task_struct *p, *dropped;
  	/* Never dereference latest_task, since it's not refcounted */
  	struct task_struct *latest_task = NULL;
  	struct ptr_heap tmp_heap;
  	struct ptr_heap *heap;
  	struct timespec latest_time = { 0, 0 };
  
  	if (scan->heap) {
  		/* The caller supplied our heap and pre-allocated its memory */
  		heap = scan->heap;
  		heap->gt = &started_after;
  	} else {
  		/* We need to allocate our own heap memory */
  		heap = &tmp_heap;
  		retval = heap_init(heap, PAGE_SIZE, GFP_KERNEL, &started_after);
  		if (retval)
  			/* cannot allocate the heap */
  			return retval;
  	}
  
   again:
  	/*
  	 * Scan tasks in the cgroup, using the scanner's "test_task" callback
  	 * to determine which are of interest, and using the scanner's
  	 * "process_task" callback to process any of them that need an update.
  	 * Since we don't want to hold any locks during the task updates,
  	 * gather tasks to be processed in a heap structure.
  	 * The heap is sorted by descending task start time.
  	 * If the statically-sized heap fills up, we overflow tasks that
  	 * started later, and in future iterations only consider tasks that
  	 * started after the latest task in the previous pass. This
  	 * guarantees forward progress and that we don't miss any tasks.
  	 */
  	heap->size = 0;
  	cgroup_iter_start(scan->cg, &it);
  	while ((p = cgroup_iter_next(scan->cg, &it))) {
  		/*
  		 * Only affect tasks that qualify per the caller's callback,
  		 * if one was provided
  		 */
  		if (scan->test_task && !scan->test_task(p, scan))
  			continue;
  		/*
  		 * Only process tasks that started after the last task
  		 * we processed
  		 */
  		if (!started_after_time(p, &latest_time, latest_task))
  			continue;
  		dropped = heap_insert(heap, p);
  		if (dropped == NULL) {
  			/*
  			 * The new task was inserted; the heap wasn't
  			 * previously full
  			 */
  			get_task_struct(p);
  		} else if (dropped != p) {
  			/*
  			 * The new task was inserted, and pushed out a
  			 * different task
  			 */
  			get_task_struct(p);
  			put_task_struct(dropped);
  		}
  		/*
  		 * Else the new task was newer than anything already in
  		 * the heap and wasn't inserted
  		 */
  	}
  	cgroup_iter_end(scan->cg, &it);
  
  	if (heap->size) {
  		for (i = 0; i < heap->size; i++) {
4fe91d518   Paul Jackson   cgroup: fix spars...
2854
  			struct task_struct *q = heap->ptrs[i];
31a7df01f   Cliff Wickman   cgroups: mechanis...
2855
  			if (i == 0) {
4fe91d518   Paul Jackson   cgroup: fix spars...
2856
2857
  				latest_time = q->start_time;
  				latest_task = q;
31a7df01f   Cliff Wickman   cgroups: mechanis...
2858
2859
  			}
  			/* Process the task per the caller's callback */
4fe91d518   Paul Jackson   cgroup: fix spars...
2860
2861
  			scan->process_task(q, scan);
  			put_task_struct(q);
31a7df01f   Cliff Wickman   cgroups: mechanis...
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
  		}
  		/*
  		 * If we had to process any tasks at all, scan again
  		 * in case some of them were in the middle of forking
  		 * children that didn't get processed.
  		 * Not the most efficient way to do it, but it avoids
  		 * having to take callback_mutex in the fork path
  		 */
  		goto again;
  	}
  	if (heap == &tmp_heap)
  		heap_free(&tmp_heap);
  	return 0;
  }
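  /*
   * Usage sketch (an assumption based only on the scanner fields
   * referenced above):
   *
   *	static void example_process(struct task_struct *p,
   *				    struct cgroup_scanner *scan)
   *	{
   *		...	(runs without css_set_lock held)
   *	}
   *
   *	struct cgroup_scanner scan = {
   *		.cg = cgrp,
   *		.test_task = NULL,	(NULL selects every task)
   *		.process_task = example_process,
   *		.heap = NULL,		(let a temporary heap be used)
   *	};
   *	cgroup_scan_tasks(&scan);
   */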
817929ec2   Paul Menage   Task Control Grou...
2876
  /*
102a775e3   Ben Blum   cgroups: add a re...
2877
   * Stuff for reading the 'tasks'/'procs' files.
bbcb81d09   Paul Menage   Task Control Grou...
2878
2879
2880
2881
2882
2883
   *
   * Reading this file can return large amounts of data if a cgroup has
   * *lots* of attached tasks. So it may need several calls to read(),
   * but we cannot guarantee that the information we produce is correct
   * unless we produce it entirely atomically.
   *
bbcb81d09   Paul Menage   Task Control Grou...
2884
   */
bbcb81d09   Paul Menage   Task Control Grou...
2885
2886
  
  /*
d1d9fd330   Ben Blum   cgroups: use vmal...
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
   * The following two functions "fix" the issue where there are more pids
   * than kmalloc will give memory for; in such cases, we use vmalloc/vfree.
   * TODO: replace with a kernel-wide solution to this problem
   */
  #define PIDLIST_TOO_LARGE(c) ((c) * sizeof(pid_t) > (PAGE_SIZE * 2))
  static void *pidlist_allocate(int count)
  {
  	if (PIDLIST_TOO_LARGE(count))
  		return vmalloc(count * sizeof(pid_t));
  	else
  		return kmalloc(count * sizeof(pid_t), GFP_KERNEL);
  }
  static void pidlist_free(void *p)
  {
  	if (is_vmalloc_addr(p))
  		vfree(p);
  	else
  		kfree(p);
  }
  static void *pidlist_resize(void *p, int newcount)
  {
  	void *newlist;
  	/* note: if new alloc fails, old p will still be valid either way */
  	if (is_vmalloc_addr(p)) {
  		newlist = vmalloc(newcount * sizeof(pid_t));
  		if (!newlist)
  			return NULL;
  		memcpy(newlist, p, newcount * sizeof(pid_t));
  		vfree(p);
  	} else {
  		newlist = krealloc(p, newcount * sizeof(pid_t), GFP_KERNEL);
  	}
  	return newlist;
  }
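  /*
   * Worked example of the threshold above: with 4KB pages and a 4-byte
   * pid_t, PIDLIST_TOO_LARGE(c) is true once c * 4 > 8192, i.e. for
   * anything beyond 2048 pids, and the allocation switches from kmalloc
   * to vmalloc. (Both sizes vary by architecture; the numbers are only
   * an illustration.)
   */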
  
  /*
102a775e3   Ben Blum   cgroups: add a re...
2923
2924
2925
2926
   * pidlist_uniq - given a kmalloc()ed list, strip out all duplicate entries
   * If the new stripped list is sufficiently smaller and there's enough memory
   * to allocate a new buffer, will let go of the unneeded memory. Returns the
   * number of unique elements.
bbcb81d09   Paul Menage   Task Control Grou...
2927
   */
102a775e3   Ben Blum   cgroups: add a re...
2928
2929
2930
  /* is the size difference enough that we should re-allocate the array? */
  #define PIDLIST_REALLOC_DIFFERENCE(old, new) ((old) - PAGE_SIZE >= (new))
  static int pidlist_uniq(pid_t **p, int length)
bbcb81d09   Paul Menage   Task Control Grou...
2931
  {
102a775e3   Ben Blum   cgroups: add a re...
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
  	int src, dest = 1;
  	pid_t *list = *p;
  	pid_t *newlist;
  
  	/*
  	 * we presume the 0th element is unique, so src starts at 1. trivial
  	 * edge cases first; no work needs to be done for either
  	 */
  	if (length == 0 || length == 1)
  		return length;
  	/* src and dest walk down the list; dest counts unique elements */
  	for (src = 1; src < length; src++) {
  		/* find next unique element */
  		while (list[src] == list[src-1]) {
  			src++;
  			if (src == length)
  				goto after;
  		}
  		/* dest always points to where the next unique element goes */
  		list[dest] = list[src];
  		dest++;
  	}
  after:
  	/*
  	 * if the length difference is large enough, we want to allocate a
  	 * smaller buffer to save memory. if this fails due to out of memory,
  	 * we'll just stay with what we've got.
  	 */
  	if (PIDLIST_REALLOC_DIFFERENCE(length, dest)) {
d1d9fd330   Ben Blum   cgroups: use vmal...
2961
  		newlist = pidlist_resize(list, dest);
102a775e3   Ben Blum   cgroups: add a re...
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
  		if (newlist)
  			*p = newlist;
  	}
  	return dest;
  }
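  /*
   * Worked example: a sorted input of { 1, 1, 2, 3, 3, 3 } is compacted
   * in place to { 1, 2, 3 } and 3 is returned; the buffer is only
   * reallocated if the shrinkage satisfies PIDLIST_REALLOC_DIFFERENCE().
   */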
  
  static int cmppid(const void *a, const void *b)
  {
  	return *(pid_t *)a - *(pid_t *)b;
  }
  
  /*
72a8cb30d   Ben Blum   cgroups: ensure c...
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
   * find the appropriate pidlist for our purpose (given procs vs tasks)
   * returns with the lock on that pidlist already held, and takes care
   * of the use count, or returns NULL with no locks held if we're out of
   * memory.
   */
  static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp,
  						  enum cgroup_filetype type)
  {
  	struct cgroup_pidlist *l;
  	/* don't need task_nsproxy() if we're looking at ourself */
b70cc5fdb   Li Zefan   cgroups: clean up...
2984
  	struct pid_namespace *ns = current->nsproxy->pid_ns;
72a8cb30d   Ben Blum   cgroups: ensure c...
2985
2986
2987
2988
2989
2990
2991
2992
2993
  	/*
  	 * We can't drop the pidlist_mutex before taking the l->mutex in case
  	 * the last ref-holder is trying to remove l from the list at the same
  	 * time. Holding the pidlist_mutex precludes somebody taking whichever
  	 * list we find out from under us - compare cgroup_release_pid_array().
  	 */
  	mutex_lock(&cgrp->pidlist_mutex);
  	list_for_each_entry(l, &cgrp->pidlists, links) {
  		if (l->key.type == type && l->key.ns == ns) {
72a8cb30d   Ben Blum   cgroups: ensure c...
2994
2995
2996
  			/* make sure l doesn't vanish out from under us */
  			down_write(&l->mutex);
  			mutex_unlock(&cgrp->pidlist_mutex);
72a8cb30d   Ben Blum   cgroups: ensure c...
2997
2998
2999
3000
3001
3002
3003
  			return l;
  		}
  	}
  	/* entry not found; create a new one */
  	l = kmalloc(sizeof(struct cgroup_pidlist), GFP_KERNEL);
  	if (!l) {
  		mutex_unlock(&cgrp->pidlist_mutex);
72a8cb30d   Ben Blum   cgroups: ensure c...
3004
3005
3006
3007
3008
  		return l;
  	}
  	init_rwsem(&l->mutex);
  	down_write(&l->mutex);
  	l->key.type = type;
b70cc5fdb   Li Zefan   cgroups: clean up...
3009
  	l->key.ns = get_pid_ns(ns);
72a8cb30d   Ben Blum   cgroups: ensure c...
3010
3011
3012
3013
3014
3015
3016
3017
3018
  	l->use_count = 0; /* don't increment here */
  	l->list = NULL;
  	l->owner = cgrp;
  	list_add(&l->links, &cgrp->pidlists);
  	mutex_unlock(&cgrp->pidlist_mutex);
  	return l;
  }
  
  /*
102a775e3   Ben Blum   cgroups: add a re...
3019
3020
   * Load a cgroup's pidarray with either procs' tgids or tasks' pids
   */
72a8cb30d   Ben Blum   cgroups: ensure c...
3021
3022
  static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,
  			      struct cgroup_pidlist **lp)
102a775e3   Ben Blum   cgroups: add a re...
3023
3024
3025
3026
  {
  	pid_t *array;
  	int length;
  	int pid, n = 0; /* used for populating the array */
817929ec2   Paul Menage   Task Control Grou...
3027
3028
  	struct cgroup_iter it;
  	struct task_struct *tsk;
102a775e3   Ben Blum   cgroups: add a re...
3029
3030
3031
3032
3033
3034
3035
3036
3037
  	struct cgroup_pidlist *l;
  
  	/*
  	 * If cgroup gets more users after we read count, we won't have
  	 * enough space - tough.  This race is indistinguishable to the
  	 * caller from the case that the additional cgroup users didn't
  	 * show up until sometime later on.
  	 */
  	length = cgroup_task_count(cgrp);
d1d9fd330   Ben Blum   cgroups: use vmal...
3038
  	array = pidlist_allocate(length);
102a775e3   Ben Blum   cgroups: add a re...
3039
3040
3041
  	if (!array)
  		return -ENOMEM;
  	/* now, populate the array */
bd89aabc6   Paul Menage   Control groups: R...
3042
3043
  	cgroup_iter_start(cgrp, &it);
  	while ((tsk = cgroup_iter_next(cgrp, &it))) {
102a775e3   Ben Blum   cgroups: add a re...
3044
  		if (unlikely(n == length))
817929ec2   Paul Menage   Task Control Grou...
3045
  			break;
102a775e3   Ben Blum   cgroups: add a re...
3046
  		/* get tgid or pid for procs or tasks file respectively */
72a8cb30d   Ben Blum   cgroups: ensure c...
3047
3048
3049
3050
  		if (type == CGROUP_FILE_PROCS)
  			pid = task_tgid_vnr(tsk);
  		else
  			pid = task_pid_vnr(tsk);
102a775e3   Ben Blum   cgroups: add a re...
3051
3052
  		if (pid > 0) /* make sure to only use valid results */
  			array[n++] = pid;
817929ec2   Paul Menage   Task Control Grou...
3053
  	}
bd89aabc6   Paul Menage   Control groups: R...
3054
  	cgroup_iter_end(cgrp, &it);
102a775e3   Ben Blum   cgroups: add a re...
3055
3056
3057
  	length = n;
  	/* now sort & (if procs) strip out duplicates */
  	sort(array, length, sizeof(pid_t), cmppid, NULL);
72a8cb30d   Ben Blum   cgroups: ensure c...
3058
  	if (type == CGROUP_FILE_PROCS)
102a775e3   Ben Blum   cgroups: add a re...
3059
  		length = pidlist_uniq(&array, length);
72a8cb30d   Ben Blum   cgroups: ensure c...
3060
3061
  	l = cgroup_pidlist_find(cgrp, type);
  	if (!l) {
d1d9fd330   Ben Blum   cgroups: use vmal...
3062
  		pidlist_free(array);
72a8cb30d   Ben Blum   cgroups: ensure c...
3063
  		return -ENOMEM;
102a775e3   Ben Blum   cgroups: add a re...
3064
  	}
72a8cb30d   Ben Blum   cgroups: ensure c...
3065
  	/* store array, freeing old if necessary - lock already held */
d1d9fd330   Ben Blum   cgroups: use vmal...
3066
  	pidlist_free(l->list);
102a775e3   Ben Blum   cgroups: add a re...
3067
3068
3069
3070
  	l->list = array;
  	l->length = length;
  	l->use_count++;
  	up_write(&l->mutex);
72a8cb30d   Ben Blum   cgroups: ensure c...
3071
  	*lp = l;
102a775e3   Ben Blum   cgroups: add a re...
3072
  	return 0;
bbcb81d09   Paul Menage   Task Control Grou...
3073
  }
846c7bb05   Balbir Singh   Add cgroupstats
3074
  /**
a043e3b2c   Li Zefan   cgroup: fix comments
3075
   * cgroupstats_build - build and fill cgroupstats
846c7bb05   Balbir Singh   Add cgroupstats
3076
3077
3078
   * @stats: cgroupstats to fill information into
   * @dentry: A dentry entry belonging to the cgroup for which stats have
   * been requested.
a043e3b2c   Li Zefan   cgroup: fix comments
3079
3080
3081
   *
   * Build and fill cgroupstats so that taskstats can export it to user
   * space.
846c7bb05   Balbir Singh   Add cgroupstats
3082
3083
3084
3085
   */
  int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry)
  {
  	int ret = -EINVAL;
bd89aabc6   Paul Menage   Control groups: R...
3086
  	struct cgroup *cgrp;
846c7bb05   Balbir Singh   Add cgroupstats
3087
3088
  	struct cgroup_iter it;
  	struct task_struct *tsk;
33d283bef   Li Zefan   cgroups: fix a se...
3089

846c7bb05   Balbir Singh   Add cgroupstats
3090
  	/*
33d283bef   Li Zefan   cgroups: fix a se...
3091
3092
  	 * Validate dentry by checking the superblock operations,
  	 * and make sure it's a directory.
846c7bb05   Balbir Singh   Add cgroupstats
3093
  	 */
33d283bef   Li Zefan   cgroups: fix a se...
3094
3095
  	if (dentry->d_sb->s_op != &cgroup_ops ||
  	    !S_ISDIR(dentry->d_inode->i_mode))
846c7bb05   Balbir Singh   Add cgroupstats
3096
3097
3098
  		 goto err;
  
  	ret = 0;
bd89aabc6   Paul Menage   Control groups: R...
3099
  	cgrp = dentry->d_fsdata;
846c7bb05   Balbir Singh   Add cgroupstats
3100

bd89aabc6   Paul Menage   Control groups: R...
3101
3102
  	cgroup_iter_start(cgrp, &it);
  	while ((tsk = cgroup_iter_next(cgrp, &it))) {
846c7bb05   Balbir Singh   Add cgroupstats
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
  		switch (tsk->state) {
  		case TASK_RUNNING:
  			stats->nr_running++;
  			break;
  		case TASK_INTERRUPTIBLE:
  			stats->nr_sleeping++;
  			break;
  		case TASK_UNINTERRUPTIBLE:
  			stats->nr_uninterruptible++;
  			break;
  		case TASK_STOPPED:
  			stats->nr_stopped++;
  			break;
  		default:
  			if (delayacct_is_task_waiting_on_io(tsk))
  				stats->nr_io_wait++;
  			break;
  		}
  	}
bd89aabc6   Paul Menage   Control groups: R...
3122
  	cgroup_iter_end(cgrp, &it);
846c7bb05   Balbir Singh   Add cgroupstats
3123

846c7bb05   Balbir Singh   Add cgroupstats
3124
3125
3126
  err:
  	return ret;
  }
8f3ff2086   Paul Menage   cgroups: revert "...
3127

bbcb81d09   Paul Menage   Task Control Grou...
3128
  /*
102a775e3   Ben Blum   cgroups: add a re...
3129
   * seq_file methods for the tasks/procs files. The seq_file position is the
cc31edcee   Paul Menage   cgroups: convert ...
3130
   * next pid to display; the seq_file iterator is a pointer to the pid
102a775e3   Ben Blum   cgroups: add a re...
3131
   * in the cgroup->l->list array.
bbcb81d09   Paul Menage   Task Control Grou...
3132
   */
cc31edcee   Paul Menage   cgroups: convert ...
3133

102a775e3   Ben Blum   cgroups: add a re...
3134
  static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos)
bbcb81d09   Paul Menage   Task Control Grou...
3135
  {
cc31edcee   Paul Menage   cgroups: convert ...
3136
3137
3138
3139
3140
3141
  	/*
  	 * Initially we receive a position value that corresponds to
  	 * one more than the last pid shown (or 0 on the first call or
  	 * after a seek to the start). Use a binary-search to find the
  	 * next pid to display, if any
  	 */
102a775e3   Ben Blum   cgroups: add a re...
3142
  	struct cgroup_pidlist *l = s->private;
cc31edcee   Paul Menage   cgroups: convert ...
3143
3144
  	int index = 0, pid = *pos;
  	int *iter;
102a775e3   Ben Blum   cgroups: add a re...
3145
  	down_read(&l->mutex);
cc31edcee   Paul Menage   cgroups: convert ...
3146
  	if (pid) {
102a775e3   Ben Blum   cgroups: add a re...
3147
  		int end = l->length;
207777664   Stephen Rothwell   cgroup: remove un...
3148

cc31edcee   Paul Menage   cgroups: convert ...
3149
3150
  		while (index < end) {
  			int mid = (index + end) / 2;
102a775e3   Ben Blum   cgroups: add a re...
3151
  			if (l->list[mid] == pid) {
cc31edcee   Paul Menage   cgroups: convert ...
3152
3153
  				index = mid;
  				break;
102a775e3   Ben Blum   cgroups: add a re...
3154
  			} else if (l->list[mid] <= pid)
cc31edcee   Paul Menage   cgroups: convert ...
3155
3156
3157
3158
3159
3160
  				index = mid + 1;
  			else
  				end = mid;
  		}
  	}
  	/* If we're off the end of the array, we're done */
102a775e3   Ben Blum   cgroups: add a re...
3161
  	if (index >= l->length)
cc31edcee   Paul Menage   cgroups: convert ...
3162
3163
  		return NULL;
  	/* Update the abstract position to be the actual pid that we found */
102a775e3   Ben Blum   cgroups: add a re...
3164
  	iter = l->list + index;
cc31edcee   Paul Menage   cgroups: convert ...
3165
3166
3167
  	*pos = *iter;
  	return iter;
  }
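  /*
   * Example of the resume semantics above: if the previous read ended
   * after showing pid 40 from the sorted list { 10, 20, 40, 50 }, the
   * next call arrives with *pos == 41; the binary search settles on
   * index 3, iteration resumes at pid 50, and *pos is rewritten to 50.
   */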
102a775e3   Ben Blum   cgroups: add a re...
3168
  static void cgroup_pidlist_stop(struct seq_file *s, void *v)
cc31edcee   Paul Menage   cgroups: convert ...
3169
  {
102a775e3   Ben Blum   cgroups: add a re...
3170
3171
  	struct cgroup_pidlist *l = s->private;
  	up_read(&l->mutex);
cc31edcee   Paul Menage   cgroups: convert ...
3172
  }
102a775e3   Ben Blum   cgroups: add a re...
3173
  static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos)
cc31edcee   Paul Menage   cgroups: convert ...
3174
  {
102a775e3   Ben Blum   cgroups: add a re...
3175
3176
3177
  	struct cgroup_pidlist *l = s->private;
  	pid_t *p = v;
  	pid_t *end = l->list + l->length;
cc31edcee   Paul Menage   cgroups: convert ...
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
  	/*
  	 * Advance to the next pid in the array. If this goes off the
  	 * end, we're done
  	 */
  	p++;
  	if (p >= end) {
  		return NULL;
  	} else {
  		*pos = *p;
  		return p;
  	}
  }
102a775e3   Ben Blum   cgroups: add a re...
3190
  static int cgroup_pidlist_show(struct seq_file *s, void *v)
cc31edcee   Paul Menage   cgroups: convert ...
3191
3192
3193
3194
  {
  	return seq_printf(s, "%d
  ", *(int *)v);
  }
bbcb81d09   Paul Menage   Task Control Grou...
3195

102a775e3   Ben Blum   cgroups: add a re...
3196
3197
3198
3199
3200
3201
3202
3203
3204
  /*
   * seq_operations functions for iterating on pidlists through seq_file -
   * independent of whether it's tasks or procs
   */
  static const struct seq_operations cgroup_pidlist_seq_operations = {
  	.start = cgroup_pidlist_start,
  	.stop = cgroup_pidlist_stop,
  	.next = cgroup_pidlist_next,
  	.show = cgroup_pidlist_show,
cc31edcee   Paul Menage   cgroups: convert ...
3205
  };
102a775e3   Ben Blum   cgroups: add a re...
3206
  static void cgroup_release_pid_array(struct cgroup_pidlist *l)
cc31edcee   Paul Menage   cgroups: convert ...
3207
  {
72a8cb30d   Ben Blum   cgroups: ensure c...
3208
3209
3210
3211
3212
3213
3214
  	/*
  	 * the case where we're the last user of this particular pidlist will
  	 * have us remove it from the cgroup's list, which entails taking the
  	 * mutex. since in pidlist_find the pidlist->lock depends on cgroup->
  	 * pidlist_mutex, we have to take pidlist_mutex first.
  	 */
  	mutex_lock(&l->owner->pidlist_mutex);
102a775e3   Ben Blum   cgroups: add a re...
3215
3216
3217
  	down_write(&l->mutex);
  	BUG_ON(!l->use_count);
  	if (!--l->use_count) {
72a8cb30d   Ben Blum   cgroups: ensure c...
3218
3219
3220
  		/* we're the last user if refcount is 0; remove and free */
  		list_del(&l->links);
  		mutex_unlock(&l->owner->pidlist_mutex);
d1d9fd330   Ben Blum   cgroups: use vmal...
3221
  		pidlist_free(l->list);
72a8cb30d   Ben Blum   cgroups: ensure c...
3222
3223
3224
3225
  		put_pid_ns(l->key.ns);
  		up_write(&l->mutex);
  		kfree(l);
  		return;
cc31edcee   Paul Menage   cgroups: convert ...
3226
  	}
72a8cb30d   Ben Blum   cgroups: ensure c...
3227
  	mutex_unlock(&l->owner->pidlist_mutex);
102a775e3   Ben Blum   cgroups: add a re...
3228
  	up_write(&l->mutex);
bbcb81d09   Paul Menage   Task Control Grou...
3229
  }
102a775e3   Ben Blum   cgroups: add a re...
3230
  static int cgroup_pidlist_release(struct inode *inode, struct file *file)
cc31edcee   Paul Menage   cgroups: convert ...
3231
  {
102a775e3   Ben Blum   cgroups: add a re...
3232
  	struct cgroup_pidlist *l;
cc31edcee   Paul Menage   cgroups: convert ...
3233
3234
  	if (!(file->f_mode & FMODE_READ))
  		return 0;
102a775e3   Ben Blum   cgroups: add a re...
3235
3236
3237
3238
3239
3240
  	/*
  	 * the seq_file will only be initialized if the file was opened for
  	 * reading; hence we check if it's not null only in that case.
  	 */
  	l = ((struct seq_file *)file->private_data)->private;
  	cgroup_release_pid_array(l);
cc31edcee   Paul Menage   cgroups: convert ...
3241
3242
  	return seq_release(inode, file);
  }
102a775e3   Ben Blum   cgroups: add a re...
3243
  static const struct file_operations cgroup_pidlist_operations = {
cc31edcee   Paul Menage   cgroups: convert ...
3244
3245
3246
  	.read = seq_read,
  	.llseek = seq_lseek,
  	.write = cgroup_file_write,
102a775e3   Ben Blum   cgroups: add a re...
3247
  	.release = cgroup_pidlist_release,
cc31edcee   Paul Menage   cgroups: convert ...
3248
  };
bbcb81d09   Paul Menage   Task Control Grou...
3249
  /*
102a775e3   Ben Blum   cgroups: add a re...
3250
3251
3252
   * The following functions handle opens on a file that displays a pidlist
   * (tasks or procs). Prepare an array of the process/thread IDs of whoever's
   * in the cgroup.
bbcb81d09   Paul Menage   Task Control Grou...
3253
   */
102a775e3   Ben Blum   cgroups: add a re...
3254
  /* helper function for the two below it */
72a8cb30d   Ben Blum   cgroups: ensure c...
3255
  static int cgroup_pidlist_open(struct file *file, enum cgroup_filetype type)
bbcb81d09   Paul Menage   Task Control Grou...
3256
  {
bd89aabc6   Paul Menage   Control groups: R...
3257
  	struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);
72a8cb30d   Ben Blum   cgroups: ensure c...
3258
  	struct cgroup_pidlist *l;
cc31edcee   Paul Menage   cgroups: convert ...
3259
  	int retval;
bbcb81d09   Paul Menage   Task Control Grou...
3260

cc31edcee   Paul Menage   cgroups: convert ...
3261
  	/* Nothing to do for write-only files */
bbcb81d09   Paul Menage   Task Control Grou...
3262
3263
  	if (!(file->f_mode & FMODE_READ))
  		return 0;
102a775e3   Ben Blum   cgroups: add a re...
3264
  	/* have the array populated */
72a8cb30d   Ben Blum   cgroups: ensure c...
3265
  	retval = pidlist_array_load(cgrp, type, &l);
102a775e3   Ben Blum   cgroups: add a re...
3266
3267
3268
3269
  	if (retval)
  		return retval;
  	/* configure file information */
  	file->f_op = &cgroup_pidlist_operations;
cc31edcee   Paul Menage   cgroups: convert ...
3270

102a775e3   Ben Blum   cgroups: add a re...
3271
  	retval = seq_open(file, &cgroup_pidlist_seq_operations);
cc31edcee   Paul Menage   cgroups: convert ...
3272
  	if (retval) {
102a775e3   Ben Blum   cgroups: add a re...
3273
  		cgroup_release_pid_array(l);
cc31edcee   Paul Menage   cgroups: convert ...
3274
  		return retval;
bbcb81d09   Paul Menage   Task Control Grou...
3275
  	}
102a775e3   Ben Blum   cgroups: add a re...
3276
  	((struct seq_file *)file->private_data)->private = l;
bbcb81d09   Paul Menage   Task Control Grou...
3277
3278
  	return 0;
  }
102a775e3   Ben Blum   cgroups: add a re...
3279
3280
  static int cgroup_tasks_open(struct inode *unused, struct file *file)
  {
72a8cb30d   Ben Blum   cgroups: ensure c...
3281
  	return cgroup_pidlist_open(file, CGROUP_FILE_TASKS);
102a775e3   Ben Blum   cgroups: add a re...
3282
3283
3284
  }
  static int cgroup_procs_open(struct inode *unused, struct file *file)
  {
72a8cb30d   Ben Blum   cgroups: ensure c...
3285
  	return cgroup_pidlist_open(file, CGROUP_FILE_PROCS);
102a775e3   Ben Blum   cgroups: add a re...
3286
  }
bbcb81d09   Paul Menage   Task Control Grou...
3287

bd89aabc6   Paul Menage   Control groups: R...
3288
  static u64 cgroup_read_notify_on_release(struct cgroup *cgrp,
81a6a5cdd   Paul Menage   Task Control Grou...
3289
3290
  					    struct cftype *cft)
  {
bd89aabc6   Paul Menage   Control groups: R...
3291
  	return notify_on_release(cgrp);
81a6a5cdd   Paul Menage   Task Control Grou...
3292
  }
6379c1061   Paul Menage   cgroup files: mov...
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
  static int cgroup_write_notify_on_release(struct cgroup *cgrp,
  					  struct cftype *cft,
  					  u64 val)
  {
  	clear_bit(CGRP_RELEASABLE, &cgrp->flags);
  	if (val)
  		set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
  	else
  		clear_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
  	return 0;
  }
bbcb81d09   Paul Menage   Task Control Grou...
3304
  /*
0dea11687   Kirill A. Shutemov   cgroup: implement...
3305
3306
3307
3308
3309
3310
3311
3312
3313
   * Unregister event and free resources.
   *
   * Gets called from workqueue.
   */
  static void cgroup_event_remove(struct work_struct *work)
  {
  	struct cgroup_event *event = container_of(work, struct cgroup_event,
  			remove);
  	struct cgroup *cgrp = event->cgrp;
0dea11687   Kirill A. Shutemov   cgroup: implement...
3314
3315
3316
  	event->cft->unregister_event(cgrp, event->cft, event->eventfd);
  
  	eventfd_ctx_put(event->eventfd);
0dea11687   Kirill A. Shutemov   cgroup: implement...
3317
  	kfree(event);
a0a4db548   Kirill A. Shutemov   cgroups: remove e...
3318
  	dput(cgrp->dentry);
0dea11687   Kirill A. Shutemov   cgroup: implement...
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
  }
  
  /*
   * Gets called on POLLHUP on eventfd when user closes it.
   *
   * Called with wqh->lock held and interrupts disabled.
   */
  static int cgroup_event_wake(wait_queue_t *wait, unsigned mode,
  		int sync, void *key)
  {
  	struct cgroup_event *event = container_of(wait,
  			struct cgroup_event, wait);
  	struct cgroup *cgrp = event->cgrp;
  	unsigned long flags = (unsigned long)key;
  
  	if (flags & POLLHUP) {
a93d2f174   Changli Gao   sched, wait: Use ...
3335
  		__remove_wait_queue(event->wqh, &event->wait);
0dea11687   Kirill A. Shutemov   cgroup: implement...
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
  		spin_lock(&cgrp->event_list_lock);
  		list_del(&event->list);
  		spin_unlock(&cgrp->event_list_lock);
  		/*
  		 * We are in atomic context, but cgroup_event_remove() may
  		 * sleep, so we have to call it in workqueue.
  		 */
  		schedule_work(&event->remove);
  	}
  
  	return 0;
  }
  
  static void cgroup_event_ptable_queue_proc(struct file *file,
  		wait_queue_head_t *wqh, poll_table *pt)
  {
  	struct cgroup_event *event = container_of(pt,
  			struct cgroup_event, pt);
  
  	event->wqh = wqh;
  	add_wait_queue(wqh, &event->wait);
  }
  
  /*
   * Parse input and register new cgroup event handler.
   *
   * Input must be in format '<event_fd> <control_fd> <args>'.
   * Interpretation of args is defined by control file implementation.
   */
  static int cgroup_write_event_control(struct cgroup *cgrp, struct cftype *cft,
  				      const char *buffer)
  {
  	struct cgroup_event *event = NULL;
  	unsigned int efd, cfd;
  	struct file *efile = NULL;
  	struct file *cfile = NULL;
  	char *endp;
  	int ret;
  
  	efd = simple_strtoul(buffer, &endp, 10);
  	if (*endp != ' ')
  		return -EINVAL;
  	buffer = endp + 1;
  
  	cfd = simple_strtoul(buffer, &endp, 10);
  	if ((*endp != ' ') && (*endp != '\0'))
  		return -EINVAL;
  	buffer = endp + 1;
  
  	event = kzalloc(sizeof(*event), GFP_KERNEL);
  	if (!event)
  		return -ENOMEM;
  	event->cgrp = cgrp;
  	INIT_LIST_HEAD(&event->list);
  	init_poll_funcptr(&event->pt, cgroup_event_ptable_queue_proc);
  	init_waitqueue_func_entry(&event->wait, cgroup_event_wake);
  	INIT_WORK(&event->remove, cgroup_event_remove);
  
  	efile = eventfd_fget(efd);
  	if (IS_ERR(efile)) {
  		ret = PTR_ERR(efile);
  		goto fail;
  	}
  
  	event->eventfd = eventfd_ctx_fileget(efile);
  	if (IS_ERR(event->eventfd)) {
  		ret = PTR_ERR(event->eventfd);
  		goto fail;
  	}
  
  	cfile = fget(cfd);
  	if (!cfile) {
  		ret = -EBADF;
  		goto fail;
  	}
  
  	/* the process needs read permission on the control file */
3bfa784a6   Al Viro   kill file_permiss...
3413
3414
  	/* AV: shouldn't we check that it's been opened for read instead? */
  	ret = inode_permission(cfile->f_path.dentry->d_inode, MAY_READ);
0dea11687   Kirill A. Shutemov   cgroup: implement...
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
  	if (ret < 0)
  		goto fail;
  
  	event->cft = __file_cft(cfile);
  	if (IS_ERR(event->cft)) {
  		ret = PTR_ERR(event->cft);
  		goto fail;
  	}
  
  	if (!event->cft->register_event || !event->cft->unregister_event) {
  		ret = -EINVAL;
  		goto fail;
  	}
  
  	ret = event->cft->register_event(cgrp, event->cft,
  			event->eventfd, buffer);
  	if (ret)
  		goto fail;
  
  	if (efile->f_op->poll(efile, &event->pt) & POLLHUP) {
  		event->cft->unregister_event(cgrp, event->cft, event->eventfd);
  		ret = 0;
  		goto fail;
  	}
a0a4db548   Kirill A. Shutemov   cgroups: remove e...
3439
3440
3441
3442
3443
3444
  	/*
  	 * Events should be removed after rmdir of cgroup directory, but before
  	 * destroying subsystem state objects. Let's take reference to cgroup
  	 * directory dentry to do that.
  	 */
  	dget(cgrp->dentry);
0dea11687   Kirill A. Shutemov   cgroup: implement...
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
  	spin_lock(&cgrp->event_list_lock);
  	list_add(&event->list, &cgrp->event_list);
  	spin_unlock(&cgrp->event_list_lock);
  
  	fput(cfile);
  	fput(efile);
  
  	return 0;
  
  fail:
  	if (cfile)
  		fput(cfile);
  
  	if (event && event->eventfd && !IS_ERR(event->eventfd))
  		eventfd_ctx_put(event->eventfd);
  
  	if (!IS_ERR_OR_NULL(efile))
  		fput(efile);
  
  	kfree(event);
  
  	return ret;
  }
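  /*
   * Userspace sketch (illustrative; assumes a control file such as
   * memcg's memory.usage_in_bytes that implements register_event, and a
   * hierarchy mounted at the hypothetical path /cgroup/foo):
   *
   *	int efd = eventfd(0, 0);
   *	int cfd = open("/cgroup/foo/memory.usage_in_bytes", O_RDONLY);
   *	int wfd = open("/cgroup/foo/cgroup.event_control", O_WRONLY);
   *	char buf[64];
   *	uint64_t cnt;
   *
   *	snprintf(buf, sizeof(buf), "%d %d 1000000", efd, cfd);
   *	write(wfd, buf, strlen(buf));
   *	read(efd, &cnt, sizeof(cnt));	(blocks until the event fires)
   */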
97978e6d1   Daniel Lezcano   cgroup: add clone...
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
  static u64 cgroup_clone_children_read(struct cgroup *cgrp,
  				    struct cftype *cft)
  {
  	return clone_children(cgrp);
  }
  
  static int cgroup_clone_children_write(struct cgroup *cgrp,
  				     struct cftype *cft,
  				     u64 val)
  {
  	if (val)
  		set_bit(CGRP_CLONE_CHILDREN, &cgrp->flags);
  	else
  		clear_bit(CGRP_CLONE_CHILDREN, &cgrp->flags);
  	return 0;
  }
0dea11687   Kirill A. Shutemov   cgroup: implement...
3484
  /*
bbcb81d09   Paul Menage   Task Control Grou...
3485
3486
   * for the common functions, 'private' gives the type of file
   */
102a775e3   Ben Blum   cgroups: add a re...
3487
3488
  /* for hysterical raisins, we can't put this on the older files */
  #define CGROUP_FILE_GENERIC_PREFIX "cgroup."
81a6a5cdd   Paul Menage   Task Control Grou...
3489
3490
3491
3492
  static struct cftype files[] = {
  	{
  		.name = "tasks",
  		.open = cgroup_tasks_open,
af351026a   Paul Menage   cgroup files: tur...
3493
  		.write_u64 = cgroup_tasks_write,
102a775e3   Ben Blum   cgroups: add a re...
3494
  		.release = cgroup_pidlist_release,
099fca322   Li Zefan   cgroups: show cor...
3495
  		.mode = S_IRUGO | S_IWUSR,
81a6a5cdd   Paul Menage   Task Control Grou...
3496
  	},
102a775e3   Ben Blum   cgroups: add a re...
3497
3498
3499
  	{
  		.name = CGROUP_FILE_GENERIC_PREFIX "procs",
  		.open = cgroup_procs_open,
74a1166df   Ben Blum   cgroups: make pro...
3500
  		.write_u64 = cgroup_procs_write,
102a775e3   Ben Blum   cgroups: add a re...
3501
  		.release = cgroup_pidlist_release,
74a1166df   Ben Blum   cgroups: make pro...
3502
  		.mode = S_IRUGO | S_IWUSR,
102a775e3   Ben Blum   cgroups: add a re...
3503
  	},
81a6a5cdd   Paul Menage   Task Control Grou...
3504
3505
  	{
  		.name = "notify_on_release",
f4c753b7e   Paul Menage   CGroup API files:...
3506
  		.read_u64 = cgroup_read_notify_on_release,
6379c1061   Paul Menage   cgroup files: mov...
3507
  		.write_u64 = cgroup_write_notify_on_release,
81a6a5cdd   Paul Menage   Task Control Grou...
3508
  	},
0dea11687   Kirill A. Shutemov   cgroup: implement...
3509
3510
3511
3512
3513
  	{
  		.name = CGROUP_FILE_GENERIC_PREFIX "event_control",
  		.write_string = cgroup_write_event_control,
  		.mode = S_IWUGO,
  	},
97978e6d1   Daniel Lezcano   cgroup: add clone...
3514
3515
3516
3517
3518
  	{
  		.name = "cgroup.clone_children",
  		.read_u64 = cgroup_clone_children_read,
  		.write_u64 = cgroup_clone_children_write,
  	},
81a6a5cdd   Paul Menage   Task Control Grou...
3519
3520
3521
3522
  };
  
  static struct cftype cft_release_agent = {
  	.name = "release_agent",
e788e066c   Paul Menage   cgroup files: mov...
3523
3524
3525
  	.read_seq_string = cgroup_release_agent_show,
  	.write_string = cgroup_release_agent_write,
  	.max_write_len = PATH_MAX,
bbcb81d09   Paul Menage   Task Control Grou...
3526
  };
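  /*
   * Taken together, the base files above appear in every cgroup
   * directory as "tasks", "cgroup.procs", "notify_on_release",
   * "cgroup.event_control" and "cgroup.clone_children";
   * cgroup_populate_dir() below adds "release_agent" in the root
   * cgroup only.
   */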
bd89aabc6   Paul Menage   Control groups: R...
3527
  static int cgroup_populate_dir(struct cgroup *cgrp)
ddbcc7e8e   Paul Menage   Task Control Grou...
3528
3529
3530
3531
3532
  {
  	int err;
  	struct cgroup_subsys *ss;
  
  	/* First clear out any existing files */
bd89aabc6   Paul Menage   Control groups: R...
3533
  	cgroup_clear_directory(cgrp->dentry);
ddbcc7e8e   Paul Menage   Task Control Grou...
3534

bd89aabc6   Paul Menage   Control groups: R...
3535
  	err = cgroup_add_files(cgrp, NULL, files, ARRAY_SIZE(files));
bbcb81d09   Paul Menage   Task Control Grou...
3536
3537
  	if (err < 0)
  		return err;
bd89aabc6   Paul Menage   Control groups: R...
3538
3539
  	if (cgrp == cgrp->top_cgroup) {
  		if ((err = cgroup_add_file(cgrp, NULL, &cft_release_agent)) < 0)
81a6a5cdd   Paul Menage   Task Control Grou...
3540
3541
  			return err;
  	}
bd89aabc6   Paul Menage   Control groups: R...
3542
3543
  	for_each_subsys(cgrp->root, ss) {
  		if (ss->populate && (err = ss->populate(ss, cgrp)) < 0)
ddbcc7e8e   Paul Menage   Task Control Grou...
3544
3545
  			return err;
  	}
38460b48d   KAMEZAWA Hiroyuki   cgroup: CSS ID su...
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
  	/* This cgroup is ready now */
  	for_each_subsys(cgrp->root, ss) {
  		struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
  		/*
  		 * Update id->css pointer and make this css visible from
  		 * CSS ID functions. This pointer will be dereferened
  		 * from RCU-read-side without locks.
  		 */
  		if (css->id)
  			rcu_assign_pointer(css->id->css, css);
  	}
ddbcc7e8e   Paul Menage   Task Control Grou...
3557
3558
3559
3560
3561
3562
  
  	return 0;
  }
  
  static void init_cgroup_css(struct cgroup_subsys_state *css,
  			       struct cgroup_subsys *ss,
			       struct cgroup *cgrp)
{
	css->cgroup = cgrp;
	atomic_set(&css->refcnt, 1);
	css->flags = 0;
	css->id = NULL;
	if (cgrp == dummytop)
		set_bit(CSS_ROOT, &css->flags);
	BUG_ON(cgrp->subsys[ss->subsys_id]);
	cgrp->subsys[ss->subsys_id] = css;
}

static void cgroup_lock_hierarchy(struct cgroupfs_root *root)
{
	/* We need to take each hierarchy_mutex in a consistent order */
	int i;
	/*
	 * No worry about a race with rebind_subsystems that might mess up the
	 * locking order, since both parties are under cgroup_mutex.
	 */
	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
		struct cgroup_subsys *ss = subsys[i];
		if (ss == NULL)
			continue;
		if (ss->root == root)
			mutex_lock(&ss->hierarchy_mutex);
	}
}

static void cgroup_unlock_hierarchy(struct cgroupfs_root *root)
{
	int i;

	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
		struct cgroup_subsys *ss = subsys[i];
		if (ss == NULL)
			continue;
		if (ss->root == root)
			mutex_unlock(&ss->hierarchy_mutex);
	}
}
/*
 * cgroup_create - create a cgroup
 * @parent: cgroup that will be parent of the new cgroup
 * @dentry: dentry of the new cgroup
 * @mode: mode to set on new inode
 *
 * Must be called with the mutex on the parent inode held
 */
static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
			     mode_t mode)
{
	struct cgroup *cgrp;
	struct cgroupfs_root *root = parent->root;
	int err = 0;
	struct cgroup_subsys *ss;
	struct super_block *sb = root->sb;

	cgrp = kzalloc(sizeof(*cgrp), GFP_KERNEL);
	if (!cgrp)
		return -ENOMEM;

	/* Grab a reference on the superblock so the hierarchy doesn't
	 * get deleted on unmount if there are child cgroups.  This
	 * can be done outside cgroup_mutex, since the sb can't
	 * disappear while someone has an open control file on the
	 * fs */
	atomic_inc(&sb->s_active);

	mutex_lock(&cgroup_mutex);
	init_cgroup_housekeeping(cgrp);

	cgrp->parent = parent;
	cgrp->root = parent->root;
	cgrp->top_cgroup = parent->top_cgroup;

	if (notify_on_release(parent))
		set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
	if (clone_children(parent))
		set_bit(CGRP_CLONE_CHILDREN, &cgrp->flags);

	for_each_subsys(root, ss) {
		struct cgroup_subsys_state *css = ss->create(ss, cgrp);

		if (IS_ERR(css)) {
			err = PTR_ERR(css);
			goto err_destroy;
		}
		init_cgroup_css(css, ss, cgrp);
		if (ss->use_id) {
			err = alloc_css_id(ss, parent, cgrp);
			if (err)
				goto err_destroy;
		}
		/* At error, ->destroy() callback has to free assigned ID. */
		if (clone_children(parent) && ss->post_clone)
			ss->post_clone(ss, cgrp);
	}

	cgroup_lock_hierarchy(root);
	list_add(&cgrp->sibling, &cgrp->parent->children);
	cgroup_unlock_hierarchy(root);
	root->number_of_cgroups++;

	err = cgroup_create_dir(cgrp, dentry, mode);
	if (err < 0)
		goto err_remove;

	/* The cgroup directory was pre-locked for us */
	BUG_ON(!mutex_is_locked(&cgrp->dentry->d_inode->i_mutex));

	err = cgroup_populate_dir(cgrp);
	/* If err < 0, we have a half-filled directory - oh well ;) */

	mutex_unlock(&cgroup_mutex);
	mutex_unlock(&cgrp->dentry->d_inode->i_mutex);

	return 0;

 err_remove:
	cgroup_lock_hierarchy(root);
	list_del(&cgrp->sibling);
	cgroup_unlock_hierarchy(root);
	root->number_of_cgroups--;

 err_destroy:

	for_each_subsys(root, ss) {
		if (cgrp->subsys[ss->subsys_id])
			ss->destroy(ss, cgrp);
	}

	mutex_unlock(&cgroup_mutex);

	/* Release the reference count that we took on the superblock */
	deactivate_super(sb);

	kfree(cgrp);
  	return err;
  }
  
  static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, int mode)
  {
  	struct cgroup *c_parent = dentry->d_parent->d_fsdata;
  
  	/* the vfs holds inode->i_mutex already */
  	return cgroup_create(c_parent, dentry, mode | S_IFDIR);
  }
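
/*
 * Illustrative note (not in the original source): cgroup_mkdir() is wired
 * up as the cgroupfs ->mkdir() inode operation, so a plain mkdir(2) on a
 * mounted hierarchy is what ends up in cgroup_create(), e.g. (hypothetical
 * paths):
 *
 *	mount -t cgroup -o cpu none /mnt/cgroup
 *	mkdir /mnt/cgroup/mygroup
 */
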
static int cgroup_has_css_refs(struct cgroup *cgrp)
{
	/* Check the reference count on each subsystem. Since we
	 * already established that there are no tasks in the
	 * cgroup, if the css refcount is also 1, then there should
	 * be no outstanding references, so the subsystem is safe to
	 * destroy. We scan across all subsystems rather than using
	 * the per-hierarchy linked list of mounted subsystems since
	 * we can be called via check_for_release() with no
	 * synchronization other than RCU, and the subsystem linked
	 * list isn't RCU-safe */
	int i;
	/*
	 * We won't need to lock the subsys array, because the subsystems
	 * we're concerned about aren't going anywhere since our cgroup root
	 * has a reference on them.
	 */
	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
		struct cgroup_subsys *ss = subsys[i];
		struct cgroup_subsys_state *css;
		/* Skip subsystems not present or not in this hierarchy */
		if (ss == NULL || ss->root != cgrp->root)
			continue;
		css = cgrp->subsys[ss->subsys_id];
		/* When called from check_for_release() it's possible
		 * that by this point the cgroup has been removed
		 * and the css deleted. But a false-positive doesn't
		 * matter, since it can only happen if the cgroup
		 * has been deleted and hence no longer needs the
		 * release agent to be called anyway. */
		if (css && (atomic_read(&css->refcnt) > 1))
			return 1;
	}
	return 0;
}

/*
 * Atomically mark all (or else none) of the cgroup's CSS objects as
 * CSS_REMOVED. Return true on success, or false if the cgroup has
 * busy subsystems. Call with cgroup_mutex held
 */

static int cgroup_clear_css_refs(struct cgroup *cgrp)
{
	struct cgroup_subsys *ss;
	unsigned long flags;
	bool failed = false;
	local_irq_save(flags);
	for_each_subsys(cgrp->root, ss) {
		struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
		int refcnt;
		while (1) {
			/* We can only remove a CSS with a refcnt==1 */
			refcnt = atomic_read(&css->refcnt);
			if (refcnt > 1) {
				failed = true;
				goto done;
			}
			BUG_ON(!refcnt);
			/*
			 * Drop the refcnt to 0 while we check other
			 * subsystems. This will cause any racing
			 * css_tryget() to spin until we set the
			 * CSS_REMOVED bits or abort
			 */
			if (atomic_cmpxchg(&css->refcnt, refcnt, 0) == refcnt)
				break;
			cpu_relax();
		}
	}
 done:
	for_each_subsys(cgrp->root, ss) {
		struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
		if (failed) {
			/*
			 * Restore old refcnt if we previously managed
			 * to clear it from 1 to 0
			 */
			if (!atomic_read(&css->refcnt))
				atomic_set(&css->refcnt, 1);
		} else {
			/* Commit the fact that the CSS is removed */
			set_bit(CSS_REMOVED, &css->flags);
		}
	}
	local_irq_restore(flags);
	return !failed;
}

static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
{
	struct cgroup *cgrp = dentry->d_fsdata;
	struct dentry *d;
	struct cgroup *parent;
	DEFINE_WAIT(wait);
	struct cgroup_event *event, *tmp;
	int ret;

	/* the vfs holds both inode->i_mutex already */
again:
	mutex_lock(&cgroup_mutex);
	if (atomic_read(&cgrp->count) != 0) {
		mutex_unlock(&cgroup_mutex);
		return -EBUSY;
	}
	if (!list_empty(&cgrp->children)) {
		mutex_unlock(&cgroup_mutex);
		return -EBUSY;
	}
	mutex_unlock(&cgroup_mutex);

	/*
	 * In general, a subsystem has no css->refcnt after pre_destroy(). But
	 * in racy cases, a subsystem may have to get css->refcnt after
	 * pre_destroy(), and that makes rmdir return -EBUSY, sometimes far
	 * too often. To avoid that, we use a waitqueue for cgroup's rmdir.
	 * CGRP_WAIT_ON_RMDIR is for synchronizing rmdir and the subsystem's
	 * reference count handling. Please see css_get/put and css_tryget()
	 * and the cgroup_wakeup_rmdir_waiter() implementation.
	 */
	set_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);

	/*
	 * Call pre_destroy handlers of subsys. Notify subsystems
	 * that an rmdir() request has come in.
	 */
	ret = cgroup_call_pre_destroy(cgrp);
	if (ret) {
		clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
		return ret;
	}

	mutex_lock(&cgroup_mutex);
	parent = cgrp->parent;
	if (atomic_read(&cgrp->count) || !list_empty(&cgrp->children)) {
		clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
		mutex_unlock(&cgroup_mutex);
		return -EBUSY;
	}
	prepare_to_wait(&cgroup_rmdir_waitq, &wait, TASK_INTERRUPTIBLE);
	if (!cgroup_clear_css_refs(cgrp)) {
		mutex_unlock(&cgroup_mutex);
		/*
		 * Because someone may call cgroup_wakeup_rmdir_waiter() before
		 * prepare_to_wait(), we need to check this flag.
		 */
		if (test_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags))
			schedule();
		finish_wait(&cgroup_rmdir_waitq, &wait);
		clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
		if (signal_pending(current))
			return -EINTR;
		goto again;
	}
	/* No css_tryget() can succeed after this point. */
	finish_wait(&cgroup_rmdir_waitq, &wait);
	clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);

	spin_lock(&release_list_lock);
	set_bit(CGRP_REMOVED, &cgrp->flags);
	if (!list_empty(&cgrp->release_list))
		list_del_init(&cgrp->release_list);
	spin_unlock(&release_list_lock);

	cgroup_lock_hierarchy(cgrp->root);
	/* delete this cgroup from parent->children */
	list_del_init(&cgrp->sibling);
	cgroup_unlock_hierarchy(cgrp->root);
	d = dget(cgrp->dentry);

	cgroup_d_remove_dir(d);
	dput(d);

	set_bit(CGRP_RELEASABLE, &parent->flags);
	check_for_release(parent);
	/*
	 * Unregister events and notify userspace.
	 * Notify userspace about cgroup removal only after rmdir of the
	 * cgroup directory, to avoid a race between userspace and kernelspace.
	 */
	spin_lock(&cgrp->event_list_lock);
	list_for_each_entry_safe(event, tmp, &cgrp->event_list, list) {
		list_del(&event->list);
		remove_wait_queue(event->wqh, &event->wait);
		eventfd_signal(event->eventfd, 1);
		schedule_work(&event->remove);
	}
	spin_unlock(&cgrp->event_list_lock);
	mutex_unlock(&cgroup_mutex);
  	return 0;
  }
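
/*
 * Illustrative note (not in the original source): cgroup_rmdir() is the
 * cgroupfs ->rmdir() inode operation, so an empty, task-free cgroup is
 * removed with a plain rmdir(2) from userspace, e.g. (hypothetical path):
 *
 *	rmdir /mnt/cgroup/mygroup
 */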

static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
{
	struct cgroup_subsys_state *css;

	printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name);

	/* Create the top cgroup state for this subsystem */
	list_add(&ss->sibling, &rootnode.subsys_list);
	ss->root = &rootnode;
	css = ss->create(ss, dummytop);
	/* We don't handle early failures gracefully */
	BUG_ON(IS_ERR(css));
	init_cgroup_css(css, ss, dummytop);
	/* Update the init_css_set to contain a subsys
	 * pointer to this state - since the subsystem is
	 * newly registered, all tasks and hence the
	 * init_css_set is in the subsystem's top cgroup. */
	init_css_set.subsys[ss->subsys_id] = dummytop->subsys[ss->subsys_id];

	need_forkexit_callback |= ss->fork || ss->exit;
	/* At system boot, before all subsystems have been
	 * registered, no tasks have been forked, so we don't
	 * need to invoke fork callbacks here. */
	BUG_ON(!list_empty(&init_task.tasks));
	mutex_init(&ss->hierarchy_mutex);
	lockdep_set_class(&ss->hierarchy_mutex, &ss->subsys_key);
	ss->active = 1;

	/* this function shouldn't be used with modular subsystems, since they
	 * need to register a subsys_id, among other things */
	BUG_ON(ss->module);
}
  
  /**
   * cgroup_load_subsys: load and register a modular subsystem at runtime
   * @ss: the subsystem to load
   *
   * This function should be called in a modular subsystem's initcall. If the
   * subsystem is built as a module, it will be assigned a new subsys_id and set
   * up for use. If the subsystem is built-in anyway, work is delegated to the
   * simpler cgroup_init_subsys.
   */
  int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
  {
  	int i;
  	struct cgroup_subsys_state *css;
  
  	/* check name and function validity */
  	if (ss->name == NULL || strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN ||
  	    ss->create == NULL || ss->destroy == NULL)
  		return -EINVAL;
  
  	/*
  	 * we don't support callbacks in modular subsystems. this check is
  	 * before the ss->module check for consistency; a subsystem that could
  	 * be a module should still have no callbacks even if the user isn't
  	 * compiling it as one.
  	 */
  	if (ss->fork || ss->exit)
  		return -EINVAL;
  
  	/*
  	 * an optionally modular subsystem is built-in: we want to do nothing,
  	 * since cgroup_init_subsys will have already taken care of it.
  	 */
  	if (ss->module == NULL) {
  		/* a few sanity checks */
  		BUG_ON(ss->subsys_id >= CGROUP_BUILTIN_SUBSYS_COUNT);
  		BUG_ON(subsys[ss->subsys_id] != ss);
  		return 0;
  	}
  
  	/*
  	 * need to register a subsys id before anything else - for example,
  	 * init_cgroup_css needs it.
  	 */
  	mutex_lock(&cgroup_mutex);
  	/* find the first empty slot in the array */
  	for (i = CGROUP_BUILTIN_SUBSYS_COUNT; i < CGROUP_SUBSYS_COUNT; i++) {
  		if (subsys[i] == NULL)
  			break;
  	}
  	if (i == CGROUP_SUBSYS_COUNT) {
  		/* maximum number of subsystems already registered! */
  		mutex_unlock(&cgroup_mutex);
  		return -EBUSY;
  	}
  	/* assign ourselves the subsys_id */
  	ss->subsys_id = i;
  	subsys[i] = ss;
  
  	/*
  	 * no ss->create seems to need anything important in the ss struct, so
  	 * this can happen first (i.e. before the rootnode attachment).
  	 */
  	css = ss->create(ss, dummytop);
  	if (IS_ERR(css)) {
  		/* failure case - need to deassign the subsys[] slot. */
  		subsys[i] = NULL;
  		mutex_unlock(&cgroup_mutex);
  		return PTR_ERR(css);
  	}
  
  	list_add(&ss->sibling, &rootnode.subsys_list);
  	ss->root = &rootnode;
  
  	/* our new subsystem will be attached to the dummy hierarchy. */
  	init_cgroup_css(css, ss, dummytop);
  	/* init_idr must be after init_cgroup_css because it sets css->id. */
  	if (ss->use_id) {
  		int ret = cgroup_init_idr(ss, css);
  		if (ret) {
  			dummytop->subsys[ss->subsys_id] = NULL;
  			ss->destroy(ss, dummytop);
  			subsys[i] = NULL;
  			mutex_unlock(&cgroup_mutex);
  			return ret;
  		}
  	}
  
  	/*
  	 * Now we need to entangle the css into the existing css_sets. unlike
  	 * in cgroup_init_subsys, there are now multiple css_sets, so each one
  	 * will need a new pointer to it; done by iterating the css_set_table.
  	 * furthermore, modifying the existing css_sets will corrupt the hash
  	 * table state, so each changed css_set will need its hash recomputed.
  	 * this is all done under the css_set_lock.
  	 */
  	write_lock(&css_set_lock);
  	for (i = 0; i < CSS_SET_TABLE_SIZE; i++) {
  		struct css_set *cg;
  		struct hlist_node *node, *tmp;
  		struct hlist_head *bucket = &css_set_table[i], *new_bucket;
  
  		hlist_for_each_entry_safe(cg, node, tmp, bucket, hlist) {
  			/* skip entries that we already rehashed */
  			if (cg->subsys[ss->subsys_id])
  				continue;
  			/* remove existing entry */
  			hlist_del(&cg->hlist);
  			/* set new value */
  			cg->subsys[ss->subsys_id] = css;
  			/* recompute hash and restore entry */
  			new_bucket = css_set_hash(cg->subsys);
  			hlist_add_head(&cg->hlist, new_bucket);
  		}
  	}
  	write_unlock(&css_set_lock);
  
  	mutex_init(&ss->hierarchy_mutex);
  	lockdep_set_class(&ss->hierarchy_mutex, &ss->subsys_key);
  	ss->active = 1;
  	/* success! */
  	mutex_unlock(&cgroup_mutex);
  	return 0;
  }
  EXPORT_SYMBOL_GPL(cgroup_load_subsys);
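
/*
 * Illustrative sketch (not in the original source): per the comment above,
 * a modular subsystem registers itself from its initcall, roughly:
 *
 *	static int __init example_cgroup_init(void)	// hypothetical module
 *	{
 *		return cgroup_load_subsys(&example_subsys);
 *	}
 *	module_init(example_cgroup_init);
 */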
  
  /**
   * cgroup_unload_subsys: unload a modular subsystem
   * @ss: the subsystem to unload
   *
   * This function should be called in a modular subsystem's exitcall. When this
   * function is invoked, the refcount on the subsystem's module will be 0, so
   * the subsystem will not be attached to any hierarchy.
   */
  void cgroup_unload_subsys(struct cgroup_subsys *ss)
  {
  	struct cg_cgroup_link *link;
  	struct hlist_head *hhead;
  
  	BUG_ON(ss->module == NULL);
  
  	/*
  	 * we shouldn't be called if the subsystem is in use, and the use of
  	 * try_module_get in parse_cgroupfs_options should ensure that it
  	 * doesn't start being used while we're killing it off.
  	 */
  	BUG_ON(ss->root != &rootnode);
  
  	mutex_lock(&cgroup_mutex);
  	/* deassign the subsys_id */
  	BUG_ON(ss->subsys_id < CGROUP_BUILTIN_SUBSYS_COUNT);
  	subsys[ss->subsys_id] = NULL;
  
  	/* remove subsystem from rootnode's list of subsystems */
  	list_del_init(&ss->sibling);
  
  	/*
  	 * disentangle the css from all css_sets attached to the dummytop. as
  	 * in loading, we need to pay our respects to the hashtable gods.
  	 */
  	write_lock(&css_set_lock);
  	list_for_each_entry(link, &dummytop->css_sets, cgrp_link_list) {
  		struct css_set *cg = link->cg;
  
  		hlist_del(&cg->hlist);
  		BUG_ON(!cg->subsys[ss->subsys_id]);
  		cg->subsys[ss->subsys_id] = NULL;
  		hhead = css_set_hash(cg->subsys);
  		hlist_add_head(&cg->hlist, hhead);
  	}
  	write_unlock(&css_set_lock);
  
  	/*
  	 * remove subsystem's css from the dummytop and free it - need to free
  	 * before marking as null because ss->destroy needs the cgrp->subsys
  	 * pointer to find their state. note that this also takes care of
  	 * freeing the css_id.
  	 */
  	ss->destroy(ss, dummytop);
  	dummytop->subsys[ss->subsys_id] = NULL;
  
  	mutex_unlock(&cgroup_mutex);
  }
  EXPORT_SYMBOL_GPL(cgroup_unload_subsys);
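
/*
 * Illustrative sketch (not in the original source): the counterpart of the
 * load-time initcall, for the same hypothetical module:
 *
 *	static void __exit example_cgroup_exit(void)
 *	{
 *		cgroup_unload_subsys(&example_subsys);
 *	}
 *	module_exit(example_cgroup_exit);
 */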
  
  /**
 * cgroup_init_early - cgroup initialization at system boot
 *
 * Initialize cgroups at system boot, and initialize any
 * subsystems that request early init.
 */
int __init cgroup_init_early(void)
{
	int i;
	atomic_set(&init_css_set.refcount, 1);
	INIT_LIST_HEAD(&init_css_set.cg_links);
	INIT_LIST_HEAD(&init_css_set.tasks);
	INIT_HLIST_NODE(&init_css_set.hlist);
	css_set_count = 1;
	init_cgroup_root(&rootnode);
	root_count = 1;
	init_task.cgroups = &init_css_set;

	init_css_set_link.cg = &init_css_set;
	init_css_set_link.cgrp = dummytop;
	list_add(&init_css_set_link.cgrp_link_list,
		 &rootnode.top_cgroup.css_sets);
	list_add(&init_css_set_link.cg_link_list,
		 &init_css_set.cg_links);

	for (i = 0; i < CSS_SET_TABLE_SIZE; i++)
		INIT_HLIST_HEAD(&css_set_table[i]);
	/* at bootup time, we don't worry about modular subsystems */
	for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
		struct cgroup_subsys *ss = subsys[i];

		BUG_ON(!ss->name);
		BUG_ON(strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN);
		BUG_ON(!ss->create);
		BUG_ON(!ss->destroy);
		if (ss->subsys_id != i) {
			printk(KERN_ERR "cgroup: Subsys %s id == %d\n",
			       ss->name, ss->subsys_id);
			BUG();
		}

		if (ss->early_init)
			cgroup_init_subsys(ss);
	}
	return 0;
}
  
  /**
 * cgroup_init - cgroup initialization
 *
 * Register cgroup filesystem and /proc file, and initialize
 * any subsystems that didn't request early init.
 */
int __init cgroup_init(void)
{
	int err;
	int i;
	struct hlist_head *hhead;

	err = bdi_init(&cgroup_backing_dev_info);
	if (err)
		return err;

	/* at bootup time, we don't worry about modular subsystems */
	for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
		struct cgroup_subsys *ss = subsys[i];
		if (!ss->early_init)
			cgroup_init_subsys(ss);
		if (ss->use_id)
			cgroup_init_idr(ss, init_css_set.subsys[ss->subsys_id]);
	}
	/* Add init_css_set to the hash table */
	hhead = css_set_hash(init_css_set.subsys);
	hlist_add_head(&init_css_set.hlist, hhead);
	BUG_ON(!init_root_id(&rootnode));

	cgroup_kobj = kobject_create_and_add("cgroup", fs_kobj);
	if (!cgroup_kobj) {
		err = -ENOMEM;
		goto out;
	}
	err = register_filesystem(&cgroup_fs_type);
	if (err < 0) {
		kobject_put(cgroup_kobj);
		goto out;
	}

	proc_create("cgroups", 0, NULL, &proc_cgroupstats_operations);

out:
	if (err)
		bdi_destroy(&cgroup_backing_dev_info);
	return err;
}

/*
 * proc_cgroup_show()
 *  - Print task's cgroup paths into seq_file, one line for each hierarchy
 *  - Used for /proc/<pid>/cgroup.
 *  - No need to task_lock(tsk) on this tsk->cgroup reference, as it
 *    doesn't really matter if tsk->cgroup changes after we read it,
 *    and we take cgroup_mutex, keeping cgroup_attach_task() from changing it
 *    anyway.  No need to check that tsk->cgroup != NULL, thanks to
 *    the_top_cgroup_hack in cgroup_exit(), which sets an exiting task's
 *    cgroup to top_cgroup.
 */

/* TODO: Use a proper seq_file iterator */
static int proc_cgroup_show(struct seq_file *m, void *v)
{
	struct pid *pid;
	struct task_struct *tsk;
	char *buf;
	int retval;
	struct cgroupfs_root *root;

	retval = -ENOMEM;
	buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
	if (!buf)
		goto out;

	retval = -ESRCH;
	pid = m->private;
	tsk = get_pid_task(pid, PIDTYPE_PID);
	if (!tsk)
		goto out_free;

	retval = 0;

	mutex_lock(&cgroup_mutex);
	for_each_active_root(root) {
		struct cgroup_subsys *ss;
		struct cgroup *cgrp;
		int count = 0;
		seq_printf(m, "%d:", root->hierarchy_id);
		for_each_subsys(root, ss)
			seq_printf(m, "%s%s", count++ ? "," : "", ss->name);
		if (strlen(root->name))
			seq_printf(m, "%sname=%s", count ? "," : "",
				   root->name);
		seq_putc(m, ':');
		cgrp = task_cgroup_from_root(tsk, root);
		retval = cgroup_path(cgrp, buf, PAGE_SIZE);
		if (retval < 0)
			goto out_unlock;
		seq_puts(m, buf);
		seq_putc(m, '\n');
  	}
  
  out_unlock:
  	mutex_unlock(&cgroup_mutex);
  	put_task_struct(tsk);
  out_free:
  	kfree(buf);
  out:
  	return retval;
  }
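
/*
 * Illustrative note (not in the original source): with the format built
 * above, one /proc/<pid>/cgroup line reads, with hypothetical values:
 *
 *	2:cpu,cpuacct:/mygroup
 *
 * i.e. hierarchy id, comma-separated subsystem names (plus name=... for a
 * named hierarchy), a ':', then the task's cgroup path in that hierarchy.
 */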
  
  static int cgroup_open(struct inode *inode, struct file *file)
  {
  	struct pid *pid = PROC_I(inode)->pid;
  	return single_open(file, proc_cgroup_show, pid);
  }
  const struct file_operations proc_cgroup_operations = {
  	.open		= cgroup_open,
  	.read		= seq_read,
  	.llseek		= seq_lseek,
  	.release	= single_release,
  };
  
  /* Display information about each subsystem and each hierarchy */
  static int proc_cgroupstats_show(struct seq_file *m, void *v)
  {
  	int i;

	seq_puts(m, "#subsys_name\thierarchy\tnum_cgroups\tenabled\n");
	/*
	 * ideally we don't want subsystems moving around while we do this.
	 * cgroup_mutex is also necessary to guarantee an atomic snapshot of
	 * subsys/hierarchy state.
	 */
	mutex_lock(&cgroup_mutex);
	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
		struct cgroup_subsys *ss = subsys[i];
		if (ss == NULL)
			continue;
		seq_printf(m, "%s\t%d\t%d\t%d\n",
			   ss->name, ss->root->hierarchy_id,
			   ss->root->number_of_cgroups, !ss->disabled);
  	}
  	mutex_unlock(&cgroup_mutex);
  	return 0;
  }
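
/*
 * Illustrative note (not in the original source): the resulting
 * /proc/cgroups output looks like, with hypothetical values:
 *
 *	#subsys_name	hierarchy	num_cgroups	enabled
 *	cpuset		1		4		1
 *	cpu		2		1		1
 */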
  
  static int cgroupstats_open(struct inode *inode, struct file *file)
  {
  	return single_open(file, proc_cgroupstats_show, NULL);
  }
  static const struct file_operations proc_cgroupstats_operations = {
  	.open = cgroupstats_open,
  	.read = seq_read,
  	.llseek = seq_lseek,
  	.release = single_release,
  };

/**
 * cgroup_fork - attach a newly forked task to its parent's cgroup.
 * @child: pointer to task_struct of the forked child process.
 *
 * Description: A task inherits its parent's cgroup at fork().
 *
 * A pointer to the shared css_set was automatically copied in
 * fork.c by dup_task_struct().  However, we ignore that copy, since
 * it was not made under the protection of RCU or cgroup_mutex, so
 * might no longer be a valid cgroup pointer.  cgroup_attach_task() might
 * have already changed current->cgroups, allowing the previously
 * referenced css_set to be removed and freed.
 *
 * At the point that cgroup_fork() is called, 'current' is the parent
 * task, and the passed argument 'child' points to the child task.
 */
void cgroup_fork(struct task_struct *child)
{
	task_lock(current);
	child->cgroups = current->cgroups;
	get_css_set(child->cgroups);
	task_unlock(current);
	INIT_LIST_HEAD(&child->cg_list);
}

/**
 * cgroup_fork_callbacks - run fork callbacks
 * @child: the new task
 *
 * Called on a new task very soon before adding it to the
 * tasklist. No need to take any locks since no-one can
 * be operating on this task.
 */
void cgroup_fork_callbacks(struct task_struct *child)
{
	if (need_forkexit_callback) {
		int i;
		/*
		 * forkexit callbacks are only supported for builtin
		 * subsystems, and the builtin section of the subsys array is
		 * immutable, so we don't need to lock the subsys array here.
		 */
		for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
			struct cgroup_subsys *ss = subsys[i];
			if (ss->fork)
				ss->fork(ss, child);
		}
	}
}
  
  /**
 * cgroup_post_fork - called on a new task after adding it to the task list
 * @child: the task in question
 *
 * Adds the task to the list running through its css_set if necessary.
 * Has to be after the task is visible on the task list in case we race
 * with the first call to cgroup_iter_start() - to guarantee that the
 * new task ends up on its list.
 */
void cgroup_post_fork(struct task_struct *child)
{
	if (use_task_css_set_links) {
		write_lock(&css_set_lock);
		task_lock(child);
		if (list_empty(&child->cg_list))
			list_add(&child->cg_list, &child->cgroups->tasks);
		task_unlock(child);
		write_unlock(&css_set_lock);
	}
}
  /**
 * cgroup_exit - detach cgroup from exiting task
 * @tsk: pointer to task_struct of exiting process
 * @run_callbacks: run exit callbacks?
 *
 * Description: Detach cgroup from @tsk and release it.
 *
 * Note that cgroups marked notify_on_release force every task in
 * them to take the global cgroup_mutex mutex when exiting.
 * This could impact scaling on very large systems.  Be reluctant to
 * use notify_on_release cgroups where very high task exit scaling
 * is required on large systems.
 *
 * the_top_cgroup_hack:
 *
 *    Set the exiting task's cgroup to the root cgroup (top_cgroup).
 *
 *    We call cgroup_exit() while the task is still competent to
 *    handle notify_on_release(), then leave the task attached to the
 *    root cgroup in each hierarchy for the remainder of its exit.
 *
 *    To do this properly, we would increment the reference count on
 *    top_cgroup, and near the very end of the kernel/exit.c do_exit()
 *    code we would add a second cgroup function call, to drop that
 *    reference.  This would just create an unnecessary hot spot on
 *    the top_cgroup reference count, to no avail.
 *
 *    Normally, holding a reference to a cgroup without bumping its
 *    count is unsafe.  The cgroup could go away, or someone could
 *    attach us to a different cgroup, decrementing the count on
 *    the first cgroup that we never incremented.  But in this case,
 *    top_cgroup isn't going away, and either the task has PF_EXITING set,
 *    which wards off any cgroup_attach_task() attempts, or the task is a
 *    failed fork, never visible to cgroup_attach_task.
 */
void cgroup_exit(struct task_struct *tsk, int run_callbacks)
{
	struct css_set *cg;
	int i;

	/*
	 * Unlink from the css_set task list if necessary.
	 * Optimistically check cg_list before taking
	 * css_set_lock
	 */
	if (!list_empty(&tsk->cg_list)) {
		write_lock(&css_set_lock);
		if (!list_empty(&tsk->cg_list))
			list_del_init(&tsk->cg_list);
		write_unlock(&css_set_lock);
	}
	/* Reassign the task to the init_css_set. */
	task_lock(tsk);
	cg = tsk->cgroups;
	tsk->cgroups = &init_css_set;

	if (run_callbacks && need_forkexit_callback) {
		/*
		 * modular subsystems can't use callbacks, so no need to lock
		 * the subsys array
		 */
		for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
			struct cgroup_subsys *ss = subsys[i];
			if (ss->exit) {
				struct cgroup *old_cgrp =
					rcu_dereference_raw(cg->subsys[i])->cgroup;
				struct cgroup *cgrp = task_cgroup(tsk, i);
				ss->exit(ss, cgrp, old_cgrp, tsk);
			}
		}
	}
	task_unlock(tsk);

	if (cg)
		put_css_set_taskexit(cg);
}

/**
 * cgroup_is_descendant - see if @cgrp is a descendant of @task's cgrp
 * @cgrp: the cgroup in question
 * @task: the task in question
 *
 * See if @cgrp is a descendant of @task's cgroup in the appropriate
 * hierarchy.
 *
 * If we are sending in dummytop, then presumably we are creating
 * the top cgroup in the subsystem.
 *
 * Called only by the ns (nsproxy) cgroup.
 */
int cgroup_is_descendant(const struct cgroup *cgrp, struct task_struct *task)
{
	int ret;
	struct cgroup *target;

	if (cgrp == dummytop)
		return 1;
	target = task_cgroup_from_root(task, cgrp->root);
	while (cgrp != target && cgrp != cgrp->top_cgroup)
		cgrp = cgrp->parent;
	ret = (cgrp == target);
	return ret;
}

static void check_for_release(struct cgroup *cgrp)
{
	/* All of these checks rely on RCU to keep the cgroup
	 * structure alive */
	if (cgroup_is_releasable(cgrp) && !atomic_read(&cgrp->count)
	    && list_empty(&cgrp->children) && !cgroup_has_css_refs(cgrp)) {
		/* Control Group is currently removable. If it's not
		 * already queued for a userspace notification, queue
		 * it now */
		int need_schedule_work = 0;
		spin_lock(&release_list_lock);
		if (!cgroup_is_removed(cgrp) &&
		    list_empty(&cgrp->release_list)) {
			list_add(&cgrp->release_list, &release_list);
			need_schedule_work = 1;
		}
		spin_unlock(&release_list_lock);
		if (need_schedule_work)
			schedule_work(&release_agent_work);
	}
}

/* Caller must verify that the css is not for root cgroup */
void __css_put(struct cgroup_subsys_state *css, int count)
{
	struct cgroup *cgrp = css->cgroup;
	int val;
	rcu_read_lock();
	val = atomic_sub_return(count, &css->refcnt);
	if (val == 1) {
		if (notify_on_release(cgrp)) {
			set_bit(CGRP_RELEASABLE, &cgrp->flags);
			check_for_release(cgrp);
		}
		cgroup_wakeup_rmdir_waiter(cgrp);
	}
	rcu_read_unlock();
	WARN_ON_ONCE(val < 1);
}
EXPORT_SYMBOL_GPL(__css_put);
  
  /*
   * Notify userspace when a cgroup is released, by running the
   * configured release agent with the name of the cgroup (path
   * relative to the root of cgroup file system) as the argument.
   *
   * Most likely, this user command will try to rmdir this cgroup.
   *
   * This races with the possibility that some other task will be
   * attached to this cgroup before it is removed, or that some other
   * user task will 'mkdir' a child cgroup of this cgroup.  That's ok.
   * The presumed 'rmdir' will fail quietly if this cgroup is no longer
   * unused, and this cgroup will be reprieved from its death sentence,
   * to continue to serve a useful existence.  Next time it's released,
   * we will get notified again, if it still has 'notify_on_release' set.
   *
   * The final arg to call_usermodehelper() is UMH_WAIT_EXEC, which
   * means only wait until the task is successfully execve()'d.  The
   * separate release agent task is forked by call_usermodehelper(),
   * then control in this thread returns here, without waiting for the
   * release agent task.  We don't bother to wait because the caller of
   * this routine has no use for the exit status of the release agent
   * task, so no sense holding our caller up for that.
   */
static void cgroup_release_agent(struct work_struct *work)
{
	BUG_ON(work != &release_agent_work);
	mutex_lock(&cgroup_mutex);
	spin_lock(&release_list_lock);
	while (!list_empty(&release_list)) {
		char *argv[3], *envp[3];
		int i;
		char *pathbuf = NULL, *agentbuf = NULL;
		struct cgroup *cgrp = list_entry(release_list.next,
						    struct cgroup,
						    release_list);
		list_del_init(&cgrp->release_list);
		spin_unlock(&release_list_lock);
		pathbuf = kmalloc(PAGE_SIZE, GFP_KERNEL);
		if (!pathbuf)
			goto continue_free;
		if (cgroup_path(cgrp, pathbuf, PAGE_SIZE) < 0)
			goto continue_free;
		agentbuf = kstrdup(cgrp->root->release_agent_path, GFP_KERNEL);
		if (!agentbuf)
			goto continue_free;

		i = 0;
		argv[i++] = agentbuf;
		argv[i++] = pathbuf;
		argv[i] = NULL;

		i = 0;
		/* minimal command environment */
		envp[i++] = "HOME=/";
		envp[i++] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin";
		envp[i] = NULL;

		/* Drop the lock while we invoke the usermode helper,
		 * since the exec could involve hitting disk and hence
		 * be a slow process */
		mutex_unlock(&cgroup_mutex);
		call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC);
		mutex_lock(&cgroup_mutex);
 continue_free:
		kfree(pathbuf);
		kfree(agentbuf);
		spin_lock(&release_list_lock);
	}
	spin_unlock(&release_list_lock);
	mutex_unlock(&cgroup_mutex);
  }
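
/*
 * Illustrative note (not in the original source): given the argv built
 * above, the helper is invoked as "<release_agent_path> <cgroup-path>",
 * e.g. with hypothetical values:
 *
 *	/sbin/my_release_agent /mygroup
 */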

static int __init cgroup_disable(char *str)
{
	int i;
	char *token;

	while ((token = strsep(&str, ",")) != NULL) {
		if (!*token)
			continue;
		/*
		 * cgroup_disable, being at boot time, can't know about module
		 * subsystems, so we don't worry about them.
		 */
		for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
			struct cgroup_subsys *ss = subsys[i];

			if (!strcmp(token, ss->name)) {
				ss->disabled = 1;
				printk(KERN_INFO "Disabling %s control group"
					" subsystem\n", ss->name);
				break;
			}
		}
	}
	return 1;
}
  __setup("cgroup_disable=", cgroup_disable);
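
/*
 * Illustrative note (not in the original source): __setup() registers a
 * kernel boot parameter, so a built-in subsystem can be switched off from
 * the kernel command line, e.g.:
 *
 *	cgroup_disable=memory
 */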

/*
 * Functions for CSS ID.
 */

/*
 * To get an ID other than 0, this should be called when !cgroup_is_removed().
 */
unsigned short css_id(struct cgroup_subsys_state *css)
{
	struct css_id *cssid;

	/*
	 * This css_id() can return a correct value when someone has a refcnt
	 * on this css or this is under rcu_read_lock(). Once css->id is
	 * allocated, it's unchanged until freed.
	 */
	cssid = rcu_dereference_check(css->id, atomic_read(&css->refcnt));

	if (cssid)
		return cssid->id;
	return 0;
}
EXPORT_SYMBOL_GPL(css_id);

unsigned short css_depth(struct cgroup_subsys_state *css)
{
	struct css_id *cssid;
	cssid = rcu_dereference_check(css->id, atomic_read(&css->refcnt));

	if (cssid)
		return cssid->depth;
	return 0;
}
EXPORT_SYMBOL_GPL(css_depth);
38460b48d   KAMEZAWA Hiroyuki   cgroup: CSS ID su...
4668

747388d78   KAMEZAWA Hiroyuki   memcg: fix css_is...
4669
4670
4671
4672
4673
4674
4675
4676
4677
4678
4679
4680
  /**
   *  css_is_ancestor - test "root" css is an ancestor of "child"
   * @child: the css to be tested.
   * @root: the css supporsed to be an ancestor of the child.
   *
   * Returns true if "root" is an ancestor of "child" in its hierarchy. Because
   * this function reads css->id, this use rcu_dereference() and rcu_read_lock().
   * But, considering usual usage, the csses should be valid objects after test.
   * Assuming that the caller will do some action to the child if this returns
   * returns true, the caller must take "child";s reference count.
   * If "child" is valid object and this returns true, "root" is valid, too.
   */
38460b48d   KAMEZAWA Hiroyuki   cgroup: CSS ID su...
4681
  bool css_is_ancestor(struct cgroup_subsys_state *child,
0b7f569e4   KAMEZAWA Hiroyuki   memcg: fix OOM ki...
4682
  		    const struct cgroup_subsys_state *root)
38460b48d   KAMEZAWA Hiroyuki   cgroup: CSS ID su...
4683
  {
747388d78   KAMEZAWA Hiroyuki   memcg: fix css_is...
4684
4685
4686
  	struct css_id *child_id;
  	struct css_id *root_id;
  	bool ret = true;
38460b48d   KAMEZAWA Hiroyuki   cgroup: CSS ID su...
4687

747388d78   KAMEZAWA Hiroyuki   memcg: fix css_is...
4688
4689
4690
4691
4692
4693
4694
4695
4696
4697
  	rcu_read_lock();
  	child_id  = rcu_dereference(child->id);
  	root_id = rcu_dereference(root->id);
  	if (!child_id
  	    || !root_id
  	    || (child_id->depth < root_id->depth)
  	    || (child_id->stack[root_id->depth] != root_id->id))
  		ret = false;
  	rcu_read_unlock();
  	return ret;
  }
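
  /*
   * Worked example of the ancestry encoding (illustrative numbers only):
   * each css_id records the IDs on its path from the root in ->stack[],
   * indexed by depth, so for a root with id 1 and a grandchild with id 7:
   *
   *	root_id:  depth = 0, stack = { 1 }
   *	child_id: depth = 2, stack = { 1, 4, 7 }
   *
   * css_is_ancestor() then only has to check
   * child_id->stack[root_id->depth] == root_id->id, i.e. stack[0] == 1
   * here, which holds, so it returns true.
   */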
  void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css)
  {
  	struct css_id *id = css->id;
  	/* When this is called before css_id initialization, id can be NULL */
  	if (!id)
  		return;
  
  	BUG_ON(!ss->use_id);
  
  	rcu_assign_pointer(id->css, NULL);
  	rcu_assign_pointer(css->id, NULL);
  	spin_lock(&ss->id_lock);
  	idr_remove(&ss->idr, id->id);
  	spin_unlock(&ss->id_lock);
  	kfree_rcu(id, rcu_head);
  }
  EXPORT_SYMBOL_GPL(free_css_id);
  
  /*
   * This is called by init or create(). Calls to this function are therefore
   * always serialized (by cgroup_mutex, held at create()).
   */
  
  static struct css_id *get_new_cssid(struct cgroup_subsys *ss, int depth)
  {
  	struct css_id *newid;
  	int myid, error, size;
  
  	BUG_ON(!ss->use_id);
  
  	size = sizeof(*newid) + sizeof(unsigned short) * (depth + 1);
  	newid = kzalloc(size, GFP_KERNEL);
  	if (!newid)
  		return ERR_PTR(-ENOMEM);
  	/* get id */
  	if (unlikely(!idr_pre_get(&ss->idr, GFP_KERNEL))) {
  		error = -ENOMEM;
  		goto err_out;
  	}
  	spin_lock(&ss->id_lock);
  	/* Don't use 0; allocate an ID in the range 1-65535 */
  	error = idr_get_new_above(&ss->idr, newid, 1, &myid);
  	spin_unlock(&ss->id_lock);
  
  	/* Returns an error when there is no free space for a new ID. */
  	if (error) {
  		error = -ENOSPC;
  		goto err_out;
  	}
  	if (myid > CSS_ID_MAX)
  		goto remove_idr;
  
  	newid->id = myid;
  	newid->depth = depth;
  	return newid;
  remove_idr:
  	error = -ENOSPC;
  	spin_lock(&ss->id_lock);
  	idr_remove(&ss->idr, myid);
  	spin_unlock(&ss->id_lock);
  err_out:
  	kfree(newid);
  	return ERR_PTR(error);
  }
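
  /*
   * Note on the two-step IDR allocation above (the API of this era): the
   * caller preloads the IDR's internal node cache outside the lock, then
   * takes the spinlock only for the actual slot assignment.  A minimal
   * sketch of the same pattern, assuming a caller-owned "struct idr
   * my_idr" protected by a hypothetical "my_lock":
   *
   *	if (!idr_pre_get(&my_idr, GFP_KERNEL))
   *		return -ENOMEM;
   *	spin_lock(&my_lock);
   *	error = idr_get_new_above(&my_idr, ptr, 1, &new_id);
   *	spin_unlock(&my_lock);
   */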
  static int __init_or_module cgroup_init_idr(struct cgroup_subsys *ss,
  					    struct cgroup_subsys_state *rootcss)
  {
  	struct css_id *newid;
  
  	spin_lock_init(&ss->id_lock);
  	idr_init(&ss->idr);
  	newid = get_new_cssid(ss, 0);
  	if (IS_ERR(newid))
  		return PTR_ERR(newid);
  
  	newid->stack[0] = newid->id;
  	newid->css = rootcss;
  	rootcss->id = newid;
  	return 0;
  }
  
  static int alloc_css_id(struct cgroup_subsys *ss, struct cgroup *parent,
  			struct cgroup *child)
  {
  	int subsys_id, i, depth = 0;
  	struct cgroup_subsys_state *parent_css, *child_css;
  	struct css_id *child_id, *parent_id;
  
  	subsys_id = ss->subsys_id;
  	parent_css = parent->subsys[subsys_id];
  	child_css = child->subsys[subsys_id];
  	parent_id = parent_css->id;
  	depth = parent_id->depth + 1;
  
  	child_id = get_new_cssid(ss, depth);
  	if (IS_ERR(child_id))
  		return PTR_ERR(child_id);
  
  	for (i = 0; i < depth; i++)
  		child_id->stack[i] = parent_id->stack[i];
  	child_id->stack[depth] = child_id->id;
  	/*
  	 * The child_id->css pointer will be set after this cgroup is
  	 * available; see cgroup_populate_dir().
  	 */
  	rcu_assign_pointer(child_css->id, child_id);
  
  	return 0;
  }
  
  /**
   * css_lookup - lookup css by id
   * @ss: cgroup subsys to be looked into.
   * @id: the id
   *
   * Returns a pointer to the cgroup_subsys_state if there is a valid one with
   * the given id, NULL otherwise. Should be called under rcu_read_lock().
   */
  struct cgroup_subsys_state *css_lookup(struct cgroup_subsys *ss, int id)
  {
  	struct css_id *cssid = NULL;
  
  	BUG_ON(!ss->use_id);
  	cssid = idr_find(&ss->idr, id);
  
  	if (unlikely(!cssid))
  		return NULL;
  
  	return rcu_dereference(cssid->css);
  }
  EXPORT_SYMBOL_GPL(css_lookup);
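
  /*
   * Example (illustrative sketch): turning an id obtained earlier from
   * css_id() back into a css, under rcu_read_lock() as required above.
   * "my_ss" is a hypothetical subsystem pointer with use_id set.
   *
   *	struct cgroup_subsys_state *css;
   *
   *	rcu_read_lock();
   *	css = css_lookup(my_ss, id);
   *	if (css)
   *		do_something(css);
   *	rcu_read_unlock();
   *
   * do_something() is hypothetical; the css is only guaranteed stable
   * inside the RCU section unless a refcount is taken.
   */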
  
  /**
   * css_get_next - lookup the next cgroup under the specified hierarchy.
   * @ss: pointer to the subsystem.
   * @id: current position of the iteration.
   * @root: pointer to a css; the tree under this css is searched.
   * @foundid: position of the found object.
   *
   * Search for the next css under the specified hierarchy of rootid. Calling
   * under rcu_read_lock() is necessary. Returns NULL when the end is reached.
   */
  struct cgroup_subsys_state *
  css_get_next(struct cgroup_subsys *ss, int id,
  	     struct cgroup_subsys_state *root, int *foundid)
  {
  	struct cgroup_subsys_state *ret = NULL;
  	struct css_id *tmp;
  	int tmpid;
  	int rootid = css_id(root);
  	int depth = css_depth(root);
  
  	if (!rootid)
  		return NULL;
  
  	BUG_ON(!ss->use_id);
  	/* fill start point for scan */
  	tmpid = id;
  	while (1) {
  		/*
  		 * Scan the next entry from the bitmap (tree); tmpid is
  		 * updated by idr_get_next().
  		 */
  		spin_lock(&ss->id_lock);
  		tmp = idr_get_next(&ss->idr, &tmpid);
  		spin_unlock(&ss->id_lock);
  
  		if (!tmp)
  			break;
  		if (tmp->depth >= depth && tmp->stack[depth] == rootid) {
  			ret = rcu_dereference(tmp->css);
  			if (ret) {
  				*foundid = tmpid;
  				break;
  			}
  		}
  		/* continue to scan from next id */
  		tmpid = tmpid + 1;
  	}
  	return ret;
  }
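
  /*
   * Example (illustrative sketch): visiting every live css below "root",
   * resuming one past each returned position.  Iterators built on this
   * interface (e.g. in memcg) follow the same shape.  "my_ss" and visit()
   * are hypothetical.
   *
   *	struct cgroup_subsys_state *pos;
   *	int found, nextid = 1;
   *
   *	rcu_read_lock();
   *	while ((pos = css_get_next(my_ss, nextid, root, &found))) {
   *		visit(pos);
   *		nextid = found + 1;
   *	}
   *	rcu_read_unlock();
   */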
  /*
   * Get the corresponding css from a file opened on a cgroupfs directory.
   */
  struct cgroup_subsys_state *cgroup_css_from_dir(struct file *f, int id)
  {
  	struct cgroup *cgrp;
  	struct inode *inode;
  	struct cgroup_subsys_state *css;
  
  	inode = f->f_dentry->d_inode;
  	/* check in cgroup filesystem dir */
  	if (inode->i_op != &cgroup_dir_inode_operations)
  		return ERR_PTR(-EBADF);
  
  	if (id < 0 || id >= CGROUP_SUBSYS_COUNT)
  		return ERR_PTR(-EINVAL);
  
  	/* get cgroup */
  	cgrp = __d_cgrp(f->f_dentry);
  	css = cgrp->subsys[id];
  	return css ? css : ERR_PTR(-ENOENT);
  }
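
  /*
   * Example (illustrative sketch): a subsystem resolving its own css from
   * a file descriptor that userspace opened on a cgroupfs directory (this
   * is how the perf cgroup code uses this helper):
   *
   *	css = cgroup_css_from_dir(file, my_subsys_id);
   *	if (IS_ERR(css))
   *		return PTR_ERR(css);
   *
   * "my_subsys_id" stands for the caller's own subsys_id.
   */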
  #ifdef CONFIG_CGROUP_DEBUG
  static struct cgroup_subsys_state *debug_create(struct cgroup_subsys *ss,
  						   struct cgroup *cont)
  {
  	struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL);
  
  	if (!css)
  		return ERR_PTR(-ENOMEM);
  
  	return css;
  }
  
  static void debug_destroy(struct cgroup_subsys *ss, struct cgroup *cont)
  {
  	kfree(cont->subsys[debug_subsys_id]);
  }
  
  static u64 cgroup_refcount_read(struct cgroup *cont, struct cftype *cft)
  {
  	return atomic_read(&cont->count);
  }
  
  static u64 debug_taskcount_read(struct cgroup *cont, struct cftype *cft)
  {
  	return cgroup_task_count(cont);
  }
  
  static u64 current_css_set_read(struct cgroup *cont, struct cftype *cft)
  {
  	return (u64)(unsigned long)current->cgroups;
  }
  
  static u64 current_css_set_refcount_read(struct cgroup *cont,
  					   struct cftype *cft)
  {
  	u64 count;
  
  	rcu_read_lock();
  	count = atomic_read(&current->cgroups->refcount);
  	rcu_read_unlock();
  	return count;
  }
  static int current_css_set_cg_links_read(struct cgroup *cont,
  					 struct cftype *cft,
  					 struct seq_file *seq)
  {
  	struct cg_cgroup_link *link;
  	struct css_set *cg;
  
  	read_lock(&css_set_lock);
  	rcu_read_lock();
  	cg = rcu_dereference(current->cgroups);
  	list_for_each_entry(link, &cg->cg_links, cg_link_list) {
  		struct cgroup *c = link->cgrp;
  		const char *name;
  
  		if (c->dentry)
  			name = c->dentry->d_name.name;
  		else
  			name = "?";
  		seq_printf(seq, "Root %d group %s
  ",
  			   c->root->hierarchy_id, name);
  	}
  	rcu_read_unlock();
  	read_unlock(&css_set_lock);
  	return 0;
  }
  
  #define MAX_TASKS_SHOWN_PER_CSS 25
  static int cgroup_css_links_read(struct cgroup *cont,
  				 struct cftype *cft,
  				 struct seq_file *seq)
  {
  	struct cg_cgroup_link *link;
  
  	read_lock(&css_set_lock);
  	list_for_each_entry(link, &cont->css_sets, cgrp_link_list) {
  		struct css_set *cg = link->cg;
  		struct task_struct *task;
  		int count = 0;
  		seq_printf(seq, "css_set %p
  ", cg);
  		list_for_each_entry(task, &cg->tasks, cg_list) {
  			if (count++ > MAX_TASKS_SHOWN_PER_CSS) {
  				seq_puts(seq, "  ...
  ");
  				break;
  			} else {
  				seq_printf(seq, "  task %d
  ",
  					   task_pid_vnr(task));
  			}
  		}
  	}
  	read_unlock(&css_set_lock);
  	return 0;
  }
  static u64 releasable_read(struct cgroup *cgrp, struct cftype *cft)
  {
  	return test_bit(CGRP_RELEASABLE, &cgrp->flags);
  }
  
  static struct cftype debug_files[] =  {
  	{
  		.name = "cgroup_refcount",
  		.read_u64 = cgroup_refcount_read,
  	},
  	{
  		.name = "taskcount",
  		.read_u64 = debug_taskcount_read,
  	},
  
  	{
  		.name = "current_css_set",
  		.read_u64 = current_css_set_read,
  	},
  
  	{
  		.name = "current_css_set_refcount",
  		.read_u64 = current_css_set_refcount_read,
  	},
  
  	{
  		.name = "current_css_set_cg_links",
  		.read_seq_string = current_css_set_cg_links_read,
  	},
  
  	{
  		.name = "cgroup_css_links",
  		.read_seq_string = cgroup_css_links_read,
  	},
  
  	{
  		.name = "releasable",
  		.read_u64 = releasable_read,
  	},
  };
  
  static int debug_populate(struct cgroup_subsys *ss, struct cgroup *cont)
  {
  	return cgroup_add_files(cont, ss, debug_files,
  				ARRAY_SIZE(debug_files));
  }
  
  struct cgroup_subsys debug_subsys = {
  	.name = "debug",
  	.create = debug_create,
  	.destroy = debug_destroy,
  	.populate = debug_populate,
  	.subsys_id = debug_subsys_id,
  };
  #endif /* CONFIG_CGROUP_DEBUG */