/*
 *  Generic process-grouping system.
 *
 *  Based originally on the cpuset system, extracted by Paul Menage
 *  Copyright (C) 2006 Google, Inc
 *
 *  Notifications support
 *  Copyright (C) 2009 Nokia Corporation
 *  Author: Kirill A. Shutemov
 *
 *  Copyright notices from the original cpuset code:
 *  --------------------------------------------------
 *  Copyright (C) 2003 BULL SA.
 *  Copyright (C) 2004-2006 Silicon Graphics, Inc.
 *
 *  Portions derived from Patrick Mochel's sysfs code.
 *  sysfs is Copyright (c) 2001-3 Patrick Mochel
 *
 *  2003-10-10 Written by Simon Derr.
 *  2003-10-22 Updates by Stephen Hemminger.
 *  2004 May-July Rework by Paul Jackson.
 *  ---------------------------------------------------
 *
 *  This file is subject to the terms and conditions of the GNU General Public
 *  License.  See the file COPYING in the main directory of the Linux
 *  distribution for more details.
 */
  
  #include <linux/cgroup.h>
2ce9738ba   eparis@redhat   cgroupfs: use ini...
30
  #include <linux/cred.h>
c6d57f331   Paul Menage   cgroups: support ...
31
  #include <linux/ctype.h>
ddbcc7e8e   Paul Menage   Task Control Grou...
32
33
  #include <linux/errno.h>
  #include <linux/fs.h>
2ce9738ba   eparis@redhat   cgroupfs: use ini...
34
  #include <linux/init_task.h>
ddbcc7e8e   Paul Menage   Task Control Grou...
35
36
37
38
39
40
  #include <linux/kernel.h>
  #include <linux/list.h>
  #include <linux/mm.h>
  #include <linux/mutex.h>
  #include <linux/mount.h>
  #include <linux/pagemap.h>
a424316ca   Paul Menage   Task Control Grou...
41
  #include <linux/proc_fs.h>
ddbcc7e8e   Paul Menage   Task Control Grou...
42
43
  #include <linux/rcupdate.h>
  #include <linux/sched.h>
817929ec2   Paul Menage   Task Control Grou...
44
  #include <linux/backing-dev.h>
ddbcc7e8e   Paul Menage   Task Control Grou...
45
46
47
48
49
  #include <linux/seq_file.h>
  #include <linux/slab.h>
  #include <linux/magic.h>
  #include <linux/spinlock.h>
  #include <linux/string.h>
bbcb81d09   Paul Menage   Task Control Grou...
50
  #include <linux/sort.h>
81a6a5cdd   Paul Menage   Task Control Grou...
51
  #include <linux/kmod.h>
e6a1105ba   Ben Blum   cgroups: subsyste...
52
  #include <linux/module.h>
846c7bb05   Balbir Singh   Add cgroupstats
53
54
  #include <linux/delayacct.h>
  #include <linux/cgroupstats.h>
472b1053f   Li Zefan   cgroups: use a ha...
55
  #include <linux/hash.h>
3f8206d49   Al Viro   [PATCH] get rid o...
56
  #include <linux/namei.h>
096b7fe01   Li Zefan   cgroups: fix pid ...
57
  #include <linux/pid_namespace.h>
2c6ab6d20   Paul Menage   cgroups: allow cg...
58
  #include <linux/idr.h>
d1d9fd330   Ben Blum   cgroups: use vmal...
59
  #include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */
0dea11687   Kirill A. Shutemov   cgroup: implement...
60
61
  #include <linux/eventfd.h>
  #include <linux/poll.h>
d846687d7   Ben Blum   cgroups: use flex...
62
  #include <linux/flex_array.h> /* used in cgroup_attach_proc */
846c7bb05   Balbir Singh   Add cgroupstats
63

60063497a   Arun Sharma   atomic: use <linu...
64
  #include <linux/atomic.h>
ddbcc7e8e   Paul Menage   Task Control Grou...
65

e25e2cbb4   Tejun Heo   cgroup: add cgrou...
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
  /*
   * cgroup_mutex is the master lock.  Any modification to cgroup or its
   * hierarchy must be performed while holding it.
   *
   * cgroup_root_mutex nests inside cgroup_mutex and should be held to modify
   * cgroupfs_root of any cgroup hierarchy - subsys list, flags,
   * release_agent_path and so on.  Modifying requires both cgroup_mutex and
   * cgroup_root_mutex.  Readers can acquire either of the two.  This is to
   * break the following locking order cycle.
   *
   *  A. cgroup_mutex -> cred_guard_mutex -> s_type->i_mutex_key -> namespace_sem
   *  B. namespace_sem -> cgroup_mutex
   *
   * B happens only through cgroup_show_options() and using cgroup_root_mutex
   * breaks it.
   */
81a6a5cdd   Paul Menage   Task Control Grou...
82
  static DEFINE_MUTEX(cgroup_mutex);
e25e2cbb4   Tejun Heo   cgroup: add cgrou...
83
  static DEFINE_MUTEX(cgroup_root_mutex);
81a6a5cdd   Paul Menage   Task Control Grou...
84

aae8aab40   Ben Blum   cgroups: revamp s...
85
86
87
88
89
90
  /*
   * Generate an array of cgroup subsystem pointers. At boot time, this is
   * populated up to CGROUP_BUILTIN_SUBSYS_COUNT, and modular subsystems are
   * registered after that. The mutable section of this array is protected by
   * cgroup_mutex.
   */
ddbcc7e8e   Paul Menage   Task Control Grou...
91
  #define SUBSYS(_x) &_x ## _subsys,
aae8aab40   Ben Blum   cgroups: revamp s...
92
  static struct cgroup_subsys *subsys[CGROUP_SUBSYS_COUNT] = {
ddbcc7e8e   Paul Menage   Task Control Grou...
93
94
  #include <linux/cgroup_subsys.h>
  };
c6d57f331   Paul Menage   cgroups: support ...
95
  #define MAX_CGROUP_ROOT_NAMELEN 64
ddbcc7e8e   Paul Menage   Task Control Grou...
96
97
98
99
100
101
102
103
104
105
106
107
108
  /*
   * A cgroupfs_root represents the root of a cgroup hierarchy,
   * and may be associated with a superblock to form an active
   * hierarchy
   */
  struct cgroupfs_root {
  	struct super_block *sb;
  
  	/*
  	 * The bitmask of subsystems intended to be attached to this
  	 * hierarchy
  	 */
  	unsigned long subsys_bits;
2c6ab6d20   Paul Menage   cgroups: allow cg...
109
110
  	/* Unique id for this hierarchy. */
  	int hierarchy_id;
ddbcc7e8e   Paul Menage   Task Control Grou...
111
112
113
114
115
116
117
118
119
120
121
  	/* The bitmask of subsystems currently attached to this hierarchy */
  	unsigned long actual_subsys_bits;
  
  	/* A list running through the attached subsystems */
  	struct list_head subsys_list;
  
  	/* The root cgroup for this hierarchy */
  	struct cgroup top_cgroup;
  
  	/* Tracks how many cgroups are currently defined in hierarchy.*/
  	int number_of_cgroups;
e5f6a8609   Li Zefan   cgroups: make roo...
122
  	/* A list running through the active hierarchies */
ddbcc7e8e   Paul Menage   Task Control Grou...
123
124
125
126
  	struct list_head root_list;
  
  	/* Hierarchy-specific flags */
  	unsigned long flags;
81a6a5cdd   Paul Menage   Task Control Grou...
127

e788e066c   Paul Menage   cgroup files: mov...
128
  	/* The path to use for release notifications. */
81a6a5cdd   Paul Menage   Task Control Grou...
129
  	char release_agent_path[PATH_MAX];
c6d57f331   Paul Menage   cgroups: support ...
130
131
132
  
  	/* The name for this hierarchy - may be empty */
  	char name[MAX_CGROUP_ROOT_NAMELEN];
ddbcc7e8e   Paul Menage   Task Control Grou...
133
  };
ddbcc7e8e   Paul Menage   Task Control Grou...
134
135
136
137
138
139
  /*
   * The "rootnode" hierarchy is the "dummy hierarchy", reserved for the
   * subsystems that are otherwise unattached - it never has more than a
   * single cgroup, and all tasks are part of that cgroup.
   */
  static struct cgroupfs_root rootnode;
38460b48d   KAMEZAWA Hiroyuki   cgroup: CSS ID su...
140
141
142
143
144
145
146
147
148
149
150
151
152
  /*
   * CSS ID -- ID per subsys's Cgroup Subsys State(CSS). used only when
   * cgroup_subsys->use_id != 0.
   */
  #define CSS_ID_MAX	(65535)
  struct css_id {
  	/*
  	 * The css to which this ID points. This pointer is set to valid value
  	 * after cgroup is populated. If cgroup is removed, this will be NULL.
  	 * This pointer is expected to be RCU-safe because destroy()
  	 * is called after synchronize_rcu(). But for safe use, css_is_removed()
  	 * css_tryget() should be used for avoiding race.
  	 */
2c392b8c3   Arnd Bergmann   cgroups: __rcu an...
153
  	struct cgroup_subsys_state __rcu *css;
38460b48d   KAMEZAWA Hiroyuki   cgroup: CSS ID su...
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
  	/*
  	 * ID of this css.
  	 */
  	unsigned short id;
  	/*
  	 * Depth in hierarchy which this ID belongs to.
  	 */
  	unsigned short depth;
  	/*
  	 * ID is freed by RCU. (and lookup routine is RCU safe.)
  	 */
  	struct rcu_head rcu_head;
  	/*
  	 * Hierarchy of CSS ID belongs to.
  	 */
  	unsigned short stack[0]; /* Array of Length (depth+1) */
  };
0dea11687   Kirill A. Shutemov   cgroup: implement...
171
  /*
25985edce   Lucas De Marchi   Fix common misspe...
172
   * cgroup_event represents events which userspace want to receive.
0dea11687   Kirill A. Shutemov   cgroup: implement...
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
   */
  struct cgroup_event {
  	/*
  	 * Cgroup which the event belongs to.
  	 */
  	struct cgroup *cgrp;
  	/*
  	 * Control file which the event associated.
  	 */
  	struct cftype *cft;
  	/*
  	 * eventfd to signal userspace about the event.
  	 */
  	struct eventfd_ctx *eventfd;
  	/*
  	 * Each of these stored in a list by the cgroup.
  	 */
  	struct list_head list;
  	/*
  	 * All fields below needed to unregister event when
  	 * userspace closes eventfd.
  	 */
  	poll_table pt;
  	wait_queue_head_t *wqh;
  	wait_queue_t wait;
  	struct work_struct remove;
  };
38460b48d   KAMEZAWA Hiroyuki   cgroup: CSS ID su...
200

ddbcc7e8e   Paul Menage   Task Control Grou...
201
202
203
  /* The list of hierarchy roots */
  
  static LIST_HEAD(roots);
817929ec2   Paul Menage   Task Control Grou...
204
  static int root_count;
ddbcc7e8e   Paul Menage   Task Control Grou...
205

2c6ab6d20   Paul Menage   cgroups: allow cg...
206
207
208
  static DEFINE_IDA(hierarchy_ida);
  static int next_hierarchy_id;
  static DEFINE_SPINLOCK(hierarchy_id_lock);
ddbcc7e8e   Paul Menage   Task Control Grou...
209
210
211
212
  /* dummytop is a shorthand for the dummy hierarchy's top cgroup */
  #define dummytop (&rootnode.top_cgroup)
  
  /* This flag indicates whether tasks in the fork and exit paths should
a043e3b2c   Li Zefan   cgroup: fix comments
213
214
215
   * check for fork/exit handlers to call. This avoids us having to do
   * extra work in the fork/exit path if none of the subsystems need to
   * be called.
ddbcc7e8e   Paul Menage   Task Control Grou...
216
   */
8947f9d5b   Li Zefan   cgroups: annotate...
217
  static int need_forkexit_callback __read_mostly;
ddbcc7e8e   Paul Menage   Task Control Grou...
218

d11c563dd   Paul E. McKenney   sched: Use lockde...
219
220
221
222
223
224
225
226
227
228
229
230
231
/*
 * cgroup_lock_is_held - report whether cgroup_mutex is held.
 *
 * With lockdep enabled (CONFIG_PROVE_LOCKING) we can ask whether the
 * current context holds the mutex; without lockdep the best available
 * approximation is whether *anyone* holds it.  Exported so modular
 * subsystems can use it in lockdep/RCU annotations.
 */
#ifdef CONFIG_PROVE_LOCKING
int cgroup_lock_is_held(void)
{
	/* precise: true only if held by the current lock context */
	return lockdep_is_held(&cgroup_mutex);
}
#else /* #ifdef CONFIG_PROVE_LOCKING */
int cgroup_lock_is_held(void)
{
	/* approximate: true if any task holds the mutex */
	return mutex_is_locked(&cgroup_mutex);
}
#endif /* #else #ifdef CONFIG_PROVE_LOCKING */

EXPORT_SYMBOL_GPL(cgroup_lock_is_held);
ddbcc7e8e   Paul Menage   Task Control Grou...
232
  /* convenient tests for these bits */
bd89aabc6   Paul Menage   Control groups: R...
233
  inline int cgroup_is_removed(const struct cgroup *cgrp)
ddbcc7e8e   Paul Menage   Task Control Grou...
234
  {
bd89aabc6   Paul Menage   Control groups: R...
235
  	return test_bit(CGRP_REMOVED, &cgrp->flags);
ddbcc7e8e   Paul Menage   Task Control Grou...
236
237
238
239
240
241
  }
  
  /* bits in struct cgroupfs_root flags field */
  enum {
  	ROOT_NOPREFIX, /* mounted subsystems have no named prefix */
  };
e9685a03c   Adrian Bunk   kernel/cgroup.c: ...
242
  static int cgroup_is_releasable(const struct cgroup *cgrp)
81a6a5cdd   Paul Menage   Task Control Grou...
243
244
  {
  	const int bits =
bd89aabc6   Paul Menage   Control groups: R...
245
246
247
  		(1 << CGRP_RELEASABLE) |
  		(1 << CGRP_NOTIFY_ON_RELEASE);
  	return (cgrp->flags & bits) == bits;
81a6a5cdd   Paul Menage   Task Control Grou...
248
  }
e9685a03c   Adrian Bunk   kernel/cgroup.c: ...
249
  static int notify_on_release(const struct cgroup *cgrp)
81a6a5cdd   Paul Menage   Task Control Grou...
250
  {
bd89aabc6   Paul Menage   Control groups: R...
251
  	return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
81a6a5cdd   Paul Menage   Task Control Grou...
252
  }
97978e6d1   Daniel Lezcano   cgroup: add clone...
253
254
255
256
  static int clone_children(const struct cgroup *cgrp)
  {
  	return test_bit(CGRP_CLONE_CHILDREN, &cgrp->flags);
  }
ddbcc7e8e   Paul Menage   Task Control Grou...
257
258
259
260
261
262
/*
 * for_each_subsys() allows you to iterate on each subsystem attached to
 * an active hierarchy
 *
 * NOTE(review): subsys_list membership is mutated under cgroup_mutex, so
 * iteration presumably requires that mutex held — confirm at call sites.
 */
#define for_each_subsys(_root, _ss) \
list_for_each_entry(_ss, &_root->subsys_list, sibling)

/* for_each_active_root() allows you to iterate across the active hierarchies */
#define for_each_active_root(_root) \
list_for_each_entry(_root, &roots, root_list)
81a6a5cdd   Paul Menage   Task Control Grou...
266
267
268
  /* the list of cgroups eligible for automatic release. Protected by
   * release_list_lock */
  static LIST_HEAD(release_list);
cdcc136ff   Thomas Gleixner   locking, sched, c...
269
  static DEFINE_RAW_SPINLOCK(release_list_lock);
81a6a5cdd   Paul Menage   Task Control Grou...
270
271
  static void cgroup_release_agent(struct work_struct *work);
  static DECLARE_WORK(release_agent_work, cgroup_release_agent);
bd89aabc6   Paul Menage   Control groups: R...
272
  static void check_for_release(struct cgroup *cgrp);
81a6a5cdd   Paul Menage   Task Control Grou...
273

817929ec2   Paul Menage   Task Control Grou...
274
275
276
277
278
279
  /* Link structure for associating css_set objects with cgroups */
  struct cg_cgroup_link {
  	/*
  	 * List running through cg_cgroup_links associated with a
  	 * cgroup, anchored on cgroup->css_sets
  	 */
bd89aabc6   Paul Menage   Control groups: R...
280
  	struct list_head cgrp_link_list;
7717f7ba9   Paul Menage   cgroups: add a ba...
281
  	struct cgroup *cgrp;
817929ec2   Paul Menage   Task Control Grou...
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
  	/*
  	 * List running through cg_cgroup_links pointing at a
  	 * single css_set object, anchored on css_set->cg_links
  	 */
  	struct list_head cg_link_list;
  	struct css_set *cg;
  };
  
  /* The default css_set - used by init and its children prior to any
   * hierarchies being mounted. It contains a pointer to the root state
   * for each subsystem. Also used to anchor the list of css_sets. Not
   * reference-counted, to improve performance when child cgroups
   * haven't been created.
   */
  
  static struct css_set init_css_set;
  static struct cg_cgroup_link init_css_set_link;
e6a1105ba   Ben Blum   cgroups: subsyste...
299
300
  static int cgroup_init_idr(struct cgroup_subsys *ss,
  			   struct cgroup_subsys_state *css);
38460b48d   KAMEZAWA Hiroyuki   cgroup: CSS ID su...
301

817929ec2   Paul Menage   Task Control Grou...
302
303
304
305
306
  /* css_set_lock protects the list of css_set objects, and the
   * chain of tasks off each css_set.  Nests outside task->alloc_lock
   * due to cgroup_iter_start() */
  static DEFINE_RWLOCK(css_set_lock);
  static int css_set_count;
7717f7ba9   Paul Menage   cgroups: add a ba...
307
308
309
310
311
  /*
   * hash table for cgroup groups. This improves the performance to find
   * an existing css_set. This hash doesn't (currently) take into
   * account cgroups in empty hierarchies.
   */
472b1053f   Li Zefan   cgroups: use a ha...
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
  #define CSS_SET_HASH_BITS	7
  #define CSS_SET_TABLE_SIZE	(1 << CSS_SET_HASH_BITS)
  static struct hlist_head css_set_table[CSS_SET_TABLE_SIZE];
  
  static struct hlist_head *css_set_hash(struct cgroup_subsys_state *css[])
  {
  	int i;
  	int index;
  	unsigned long tmp = 0UL;
  
  	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++)
  		tmp += (unsigned long)css[i];
  	tmp = (tmp >> 16) ^ tmp;
  
  	index = hash_long(tmp, CSS_SET_HASH_BITS);
  
  	return &css_set_table[index];
  }
817929ec2   Paul Menage   Task Control Grou...
330
331
332
333
  /* We don't maintain the lists running through each css_set to its
   * task until after the first call to cgroup_iter_start(). This
   * reduces the fork()/exit() overhead for people who have cgroups
   * compiled into their kernel but not actually in use */
8947f9d5b   Li Zefan   cgroups: annotate...
334
  static int use_task_css_set_links __read_mostly;
817929ec2   Paul Menage   Task Control Grou...
335

2c6ab6d20   Paul Menage   cgroups: allow cg...
336
/*
 * __put_css_set - drop one reference on a css_set, tearing it down on
 * the final put.
 * @cg: the css_set being released
 * @taskexit: non-zero when called from the task-exit path; emptied
 *	cgroups are then marked CGRP_RELEASABLE before the release check.
 */
static void __put_css_set(struct css_set *cg, int taskexit)
{
	struct cg_cgroup_link *link;
	struct cg_cgroup_link *saved_link;

	/*
	 * Ensure that the refcount doesn't hit zero while any readers
	 * can see it. Similar to atomic_dec_and_lock(), but for an
	 * rwlock
	 */
	if (atomic_add_unless(&cg->refcount, -1, 1))
		return;
	/* possibly the last reference: retry the drop under the write lock */
	write_lock(&css_set_lock);
	if (!atomic_dec_and_test(&cg->refcount)) {
		/* someone else took a reference in the meantime */
		write_unlock(&css_set_lock);
		return;
	}

	/* This css_set is dead. unlink it and release cgroup refcounts */
	hlist_del(&cg->hlist);
	css_set_count--;

	list_for_each_entry_safe(link, saved_link, &cg->cg_links,
				 cg_link_list) {
		struct cgroup *cgrp = link->cgrp;
		list_del(&link->cg_link_list);
		list_del(&link->cgrp_link_list);
		/*
		 * Dropping the last count may leave @cgrp empty; trigger
		 * the release machinery if notification was requested.
		 */
		if (atomic_dec_and_test(&cgrp->count) &&
		    notify_on_release(cgrp)) {
			if (taskexit)
				set_bit(CGRP_RELEASABLE, &cgrp->flags);
			check_for_release(cgrp);
		}

		kfree(link);
	}

	write_unlock(&css_set_lock);
	/* RCU readers may still see @cg; defer the actual free */
	kfree_rcu(cg, rcu_head);
}
817929ec2   Paul Menage   Task Control Grou...
375
376
377
378
379
/*
 * refcounted get/put for css_set objects
 */
static inline void get_css_set(struct css_set *cg)
{
	atomic_inc(&cg->refcount);
}

/* drop a reference from a non-exit path */
static inline void put_css_set(struct css_set *cg)
{
	__put_css_set(cg, 0);
}

/* drop a reference from the task-exit path (may mark cgroups releasable) */
static inline void put_css_set_taskexit(struct css_set *cg)
{
	__put_css_set(cg, 1);
}
817929ec2   Paul Menage   Task Control Grou...
391
  /*
7717f7ba9   Paul Menage   cgroups: add a ba...
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
   * compare_css_sets - helper function for find_existing_css_set().
   * @cg: candidate css_set being tested
   * @old_cg: existing css_set for a task
   * @new_cgrp: cgroup that's being entered by the task
   * @template: desired set of css pointers in css_set (pre-calculated)
   *
   * Returns true if "cg" matches "old_cg" except for the hierarchy
   * which "new_cgrp" belongs to, for which it should match "new_cgrp".
   */
static bool compare_css_sets(struct css_set *cg,
			     struct css_set *old_cg,
			     struct cgroup *new_cgrp,
			     struct cgroup_subsys_state *template[])
{
	struct list_head *l1, *l2;

	/* cheap reject: the subsystem state pointers must match exactly */
	if (memcmp(template, cg->subsys, sizeof(cg->subsys))) {
		/* Not all subsystems matched */
		return false;
	}

	/*
	 * Compare cgroup pointers in order to distinguish between
	 * different cgroups in hierarchies with no subsystems. We
	 * could get by with just this check alone (and skip the
	 * memcmp above) but on most setups the memcmp check will
	 * avoid the need for this more expensive check on almost all
	 * candidates.
	 */

	/* walk the two cg_links lists in lockstep */
	l1 = &cg->cg_links;
	l2 = &old_cg->cg_links;
	while (1) {
		struct cg_cgroup_link *cgl1, *cgl2;
		struct cgroup *cg1, *cg2;

		l1 = l1->next;
		l2 = l2->next;
		/* See if we reached the end - both lists are equal length. */
		if (l1 == &cg->cg_links) {
			BUG_ON(l2 != &old_cg->cg_links);
			break;
		} else {
			BUG_ON(l2 == &old_cg->cg_links);
		}
		/* Locate the cgroups associated with these links. */
		cgl1 = list_entry(l1, struct cg_cgroup_link, cg_link_list);
		cgl2 = list_entry(l2, struct cg_cgroup_link, cg_link_list);
		cg1 = cgl1->cgrp;
		cg2 = cgl2->cgrp;
		/* Hierarchies should be linked in the same order. */
		BUG_ON(cg1->root != cg2->root);

		/*
		 * If this hierarchy is the hierarchy of the cgroup
		 * that's changing, then we need to check that this
		 * css_set points to the new cgroup; if it's any other
		 * hierarchy, then this css_set should point to the
		 * same cgroup as the old css_set.
		 */
		if (cg1->root == new_cgrp->root) {
			if (cg1 != new_cgrp)
				return false;
		} else {
			if (cg1 != cg2)
				return false;
		}
	}
	return true;
}
  
  /*
817929ec2   Paul Menage   Task Control Grou...
464
465
   * find_existing_css_set() is a helper for
   * find_css_set(), and checks to see whether an existing
472b1053f   Li Zefan   cgroups: use a ha...
466
   * css_set is suitable.
817929ec2   Paul Menage   Task Control Grou...
467
468
469
470
   *
   * oldcg: the cgroup group that we're using before the cgroup
   * transition
   *
bd89aabc6   Paul Menage   Control groups: R...
471
   * cgrp: the cgroup that we're moving into
817929ec2   Paul Menage   Task Control Grou...
472
473
474
475
   *
   * template: location in which to build the desired set of subsystem
   * state objects for the new cgroup group
   */
817929ec2   Paul Menage   Task Control Grou...
476
477
/*
 * Build the desired template of subsystem states for a task moving
 * into @cgrp, then look it up in the css_set hash table.  Returns the
 * matching css_set (no reference taken) or NULL if none exists yet.
 */
static struct css_set *find_existing_css_set(
	struct css_set *oldcg,
	struct cgroup *cgrp,
	struct cgroup_subsys_state *template[])
{
	int i;
	struct cgroupfs_root *root = cgrp->root;
	struct hlist_head *hhead;
	struct hlist_node *node;
	struct css_set *cg;

	/*
	 * Build the set of subsystem state objects that we want to see in the
	 * new css_set. while subsystems can change globally, the entries here
	 * won't change, so no need for locking.
	 */
	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
		if (root->subsys_bits & (1UL << i)) {
			/* Subsystem is in this hierarchy. So we want
			 * the subsystem state from the new
			 * cgroup */
			template[i] = cgrp->subsys[i];
		} else {
			/* Subsystem is not in this hierarchy, so we
			 * don't want to change the subsystem state */
			template[i] = oldcg->subsys[i];
		}
	}

	/* scan the hash bucket for a css_set matching the template */
	hhead = css_set_hash(template);
	hlist_for_each_entry(cg, node, hhead, hlist) {
		if (!compare_css_sets(cg, oldcg, cgrp, template))
			continue;

		/* This css_set matches what we need */
		return cg;
	}

	/* No existing cgroup group matched */
	return NULL;
}
36553434f   Li Zefan   cgroup: remove du...
516
517
518
519
520
521
522
523
524
525
  static void free_cg_links(struct list_head *tmp)
  {
  	struct cg_cgroup_link *link;
  	struct cg_cgroup_link *saved_link;
  
  	list_for_each_entry_safe(link, saved_link, tmp, cgrp_link_list) {
  		list_del(&link->cgrp_link_list);
  		kfree(link);
  	}
  }
817929ec2   Paul Menage   Task Control Grou...
526
527
  /*
   * allocate_cg_links() allocates "count" cg_cgroup_link structures
bd89aabc6   Paul Menage   Control groups: R...
528
   * and chains them on tmp through their cgrp_link_list fields. Returns 0 on
817929ec2   Paul Menage   Task Control Grou...
529
530
   * success or a negative error
   */
817929ec2   Paul Menage   Task Control Grou...
531
532
533
534
535
536
537
538
  static int allocate_cg_links(int count, struct list_head *tmp)
  {
  	struct cg_cgroup_link *link;
  	int i;
  	INIT_LIST_HEAD(tmp);
  	for (i = 0; i < count; i++) {
  		link = kmalloc(sizeof(*link), GFP_KERNEL);
  		if (!link) {
36553434f   Li Zefan   cgroup: remove du...
539
  			free_cg_links(tmp);
817929ec2   Paul Menage   Task Control Grou...
540
541
  			return -ENOMEM;
  		}
bd89aabc6   Paul Menage   Control groups: R...
542
  		list_add(&link->cgrp_link_list, tmp);
817929ec2   Paul Menage   Task Control Grou...
543
544
545
  	}
  	return 0;
  }
c12f65d43   Li Zefan   cgroups: introduc...
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
/**
 * link_css_set - a helper function to link a css_set to a cgroup
 * @tmp_cg_links: cg_cgroup_link objects allocated by allocate_cg_links()
 * @cg: the css_set to be linked
 * @cgrp: the destination cgroup
 */
static void link_css_set(struct list_head *tmp_cg_links,
			 struct css_set *cg, struct cgroup *cgrp)
{
	struct cg_cgroup_link *link;

	/* consume one of the pre-allocated links; caller must have enough */
	BUG_ON(list_empty(tmp_cg_links));
	link = list_first_entry(tmp_cg_links, struct cg_cgroup_link,
				cgrp_link_list);
	link->cg = cg;
	link->cgrp = cgrp;
	/* the new link pins the cgroup */
	atomic_inc(&cgrp->count);
	list_move(&link->cgrp_link_list, &cgrp->css_sets);
	/*
	 * Always add links to the tail of the list so that the list
	 * is sorted by order of hierarchy creation
	 */
	list_add_tail(&link->cg_link_list, &cg->cg_links);
}
817929ec2   Paul Menage   Task Control Grou...
570
571
572
573
574
575
576
/*
 * find_css_set() takes an existing cgroup group and a
 * cgroup object, and returns a css_set object that's
 * equivalent to the old group, but with the given cgroup
 * substituted into the appropriate hierarchy. Must be called with
 * cgroup_mutex held
 *
 * Returns a referenced css_set, or NULL on allocation failure.
 */
static struct css_set *find_css_set(
	struct css_set *oldcg, struct cgroup *cgrp)
{
	struct css_set *res;
	struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT];

	struct list_head tmp_cg_links;

	struct hlist_head *hhead;
	struct cg_cgroup_link *link;

	/* First see if we already have a cgroup group that matches
	 * the desired set */
	read_lock(&css_set_lock);
	res = find_existing_css_set(oldcg, cgrp, template);
	if (res)
		get_css_set(res);
	read_unlock(&css_set_lock);

	if (res)
		return res;

	res = kmalloc(sizeof(*res), GFP_KERNEL);
	if (!res)
		return NULL;

	/* Allocate all the cg_cgroup_link objects that we'll need */
	if (allocate_cg_links(root_count, &tmp_cg_links) < 0) {
		kfree(res);
		return NULL;
	}

	/* start with one reference, owned by the caller */
	atomic_set(&res->refcount, 1);
	INIT_LIST_HEAD(&res->cg_links);
	INIT_LIST_HEAD(&res->tasks);
	INIT_HLIST_NODE(&res->hlist);

	/* Copy the set of subsystem state objects generated in
	 * find_existing_css_set() */
	memcpy(res->subsys, template, sizeof(res->subsys));

	write_lock(&css_set_lock);
	/* Add reference counts and links from the new css_set. */
	list_for_each_entry(link, &oldcg->cg_links, cg_link_list) {
		struct cgroup *c = link->cgrp;
		/* substitute @cgrp for the old cgroup in its hierarchy */
		if (c->root == cgrp->root)
			c = cgrp;
		link_css_set(&tmp_cg_links, res, c);
	}

	/* every pre-allocated link must have been consumed */
	BUG_ON(!list_empty(&tmp_cg_links));

	css_set_count++;

	/* Add this cgroup group to the hash table */
	hhead = css_set_hash(res->subsys);
	hlist_add_head(&res->hlist, hhead);

	write_unlock(&css_set_lock);

	return res;
}
ddbcc7e8e   Paul Menage   Task Control Grou...
636
  /*
7717f7ba9   Paul Menage   cgroups: add a ba...
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
   * Return the cgroup for "task" from the given hierarchy. Must be
   * called with cgroup_mutex held.
   */
  static struct cgroup *task_cgroup_from_root(struct task_struct *task,
  					    struct cgroupfs_root *root)
  {
  	struct css_set *css;
  	struct cgroup *res = NULL;
  
  	BUG_ON(!mutex_is_locked(&cgroup_mutex));
  	read_lock(&css_set_lock);
  	/*
  	 * No need to lock the task - since we hold cgroup_mutex the
  	 * task can't change groups, so the only thing that can happen
  	 * is that it exits and its css is set back to init_css_set.
  	 */
  	css = task->cgroups;
  	if (css == &init_css_set) {
  		res = &root->top_cgroup;
  	} else {
  		struct cg_cgroup_link *link;
  		list_for_each_entry(link, &css->cg_links, cg_link_list) {
  			struct cgroup *c = link->cgrp;
  			if (c->root == root) {
  				res = c;
  				break;
  			}
  		}
  	}
  	read_unlock(&css_set_lock);
  	BUG_ON(!res);
  	return res;
  }
  
  /*
ddbcc7e8e   Paul Menage   Task Control Grou...
672
673
674
675
676
677
678
679
680
   * There is one global cgroup mutex. We also require taking
   * task_lock() when dereferencing a task's cgroup subsys pointers.
   * See "The task_lock() exception", at the end of this comment.
   *
   * A task must hold cgroup_mutex to modify cgroups.
   *
   * Any task can increment and decrement the count field without lock.
   * So in general, code holding cgroup_mutex can't rely on the count
   * field not changing.  However, if the count goes to zero, then only
956db3ca0   Cliff Wickman   hotplug cpu: move...
681
   * cgroup_attach_task() can increment it again.  Because a count of zero
ddbcc7e8e   Paul Menage   Task Control Grou...
682
683
684
685
686
687
688
689
   * means that no tasks are currently attached, therefore there is no
   * way a task attached to that cgroup can fork (the other way to
   * increment the count).  So code holding cgroup_mutex can safely
   * assume that if the count is zero, it will stay zero. Similarly, if
   * a task holds cgroup_mutex on a cgroup with zero count, it
   * knows that the cgroup won't be removed, as cgroup_rmdir()
   * needs that mutex.
   *
ddbcc7e8e   Paul Menage   Task Control Grou...
690
691
692
693
694
   * The fork and exit callbacks cgroup_fork() and cgroup_exit(), don't
   * (usually) take cgroup_mutex.  These are the two most performance
   * critical pieces of code here.  The exception occurs on cgroup_exit(),
   * when a task in a notify_on_release cgroup exits.  Then cgroup_mutex
   * is taken, and if the cgroup count is zero, a usermode call made
a043e3b2c   Li Zefan   cgroup: fix comments
695
696
   * to the release agent with the name of the cgroup (path relative to
   * the root of cgroup file system) as the argument.
ddbcc7e8e   Paul Menage   Task Control Grou...
697
698
699
700
701
702
703
704
705
706
707
   *
   * A cgroup can only be deleted if both its 'count' of using tasks
   * is zero, and its list of 'children' cgroups is empty.  Since all
   * tasks in the system use _some_ cgroup, and since there is always at
   * least one task in the system (init, pid == 1), therefore, top_cgroup
   * always has either children cgroups and/or using tasks.  So we don't
   * need a special hack to ensure that top_cgroup cannot be deleted.
   *
   *	The task_lock() exception
   *
   * The need for this exception arises from the action of
956db3ca0   Cliff Wickman   hotplug cpu: move...
708
 * cgroup_attach_task(), which overwrites one task's cgroup pointer with
a043e3b2c   Li Zefan   cgroup: fix comments
709
   * another.  It does so using cgroup_mutex, however there are
ddbcc7e8e   Paul Menage   Task Control Grou...
710
711
712
   * several performance critical places that need to reference
   * task->cgroup without the expense of grabbing a system global
   * mutex.  Therefore except as noted below, when dereferencing or, as
956db3ca0   Cliff Wickman   hotplug cpu: move...
713
 * in cgroup_attach_task(), modifying a task's cgroup pointer we use
ddbcc7e8e   Paul Menage   Task Control Grou...
714
715
716
717
   * task_lock(), which acts on a spinlock (task->alloc_lock) already in
   * the task_struct routinely used for such matters.
   *
   * P.S.  One more locking exception.  RCU is used to guard the
956db3ca0   Cliff Wickman   hotplug cpu: move...
718
 * update of a task's cgroup pointer by cgroup_attach_task()
ddbcc7e8e   Paul Menage   Task Control Grou...
719
   */
ddbcc7e8e   Paul Menage   Task Control Grou...
720
721
722
723
  /**
   * cgroup_lock - lock out any changes to cgroup structures
   *
   */
ddbcc7e8e   Paul Menage   Task Control Grou...
724
725
726
727
void cgroup_lock(void)
{
	/* Global mutex serializing all cgroup hierarchy modifications. */
	mutex_lock(&cgroup_mutex);
}
67523c48a   Ben Blum   cgroups: blkio su...
728
  EXPORT_SYMBOL_GPL(cgroup_lock);
ddbcc7e8e   Paul Menage   Task Control Grou...
729
730
731
732
733
734
  
  /**
   * cgroup_unlock - release lock on cgroup changes
   *
   * Undo the lock taken in a previous cgroup_lock() call.
   */
ddbcc7e8e   Paul Menage   Task Control Grou...
735
736
737
738
void cgroup_unlock(void)
{
	/* Pairs with the mutex_lock() taken in cgroup_lock(). */
	mutex_unlock(&cgroup_mutex);
}
67523c48a   Ben Blum   cgroups: blkio su...
739
  EXPORT_SYMBOL_GPL(cgroup_unlock);
ddbcc7e8e   Paul Menage   Task Control Grou...
740
741
742
743
744
745
746
  
  /*
   * A couple of forward declarations required, due to cyclic reference loop:
   * cgroup_mkdir -> cgroup_create -> cgroup_populate_dir ->
   * cgroup_add_file -> cgroup_create_file -> cgroup_dir_inode_operations
   * -> cgroup_mkdir.
   */
18bb1db3e   Al Viro   switch vfs_mkdir(...
747
  static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode);
c72a04e34   Al Viro   cgroup_fs: fix cg...
748
  static struct dentry *cgroup_lookup(struct inode *, struct dentry *, struct nameidata *);
ddbcc7e8e   Paul Menage   Task Control Grou...
749
  static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry);
bd89aabc6   Paul Menage   Control groups: R...
750
  static int cgroup_populate_dir(struct cgroup *cgrp);
6e1d5dcc2   Alexey Dobriyan   const: mark remai...
751
  static const struct inode_operations cgroup_dir_inode_operations;
828c09509   Alexey Dobriyan   const: constify r...
752
  static const struct file_operations proc_cgroupstats_operations;
a424316ca   Paul Menage   Task Control Grou...
753
754
  
  static struct backing_dev_info cgroup_backing_dev_info = {
d993831fa   Jens Axboe   writeback: add na...
755
  	.name		= "cgroup",
e4ad08fe6   Miklos Szeredi   mm: bdi: add sepa...
756
  	.capabilities	= BDI_CAP_NO_ACCT_AND_WRITEBACK,
a424316ca   Paul Menage   Task Control Grou...
757
  };
ddbcc7e8e   Paul Menage   Task Control Grou...
758

38460b48d   KAMEZAWA Hiroyuki   cgroup: CSS ID su...
759
760
  static int alloc_css_id(struct cgroup_subsys *ss,
  			struct cgroup *parent, struct cgroup *child);
a5e7ed328   Al Viro   cgroup: propagate...
761
  static struct inode *cgroup_new_inode(umode_t mode, struct super_block *sb)
ddbcc7e8e   Paul Menage   Task Control Grou...
762
763
  {
  	struct inode *inode = new_inode(sb);
ddbcc7e8e   Paul Menage   Task Control Grou...
764
765
  
  	if (inode) {
85fe4025c   Christoph Hellwig   fs: do not assign...
766
  		inode->i_ino = get_next_ino();
ddbcc7e8e   Paul Menage   Task Control Grou...
767
  		inode->i_mode = mode;
76aac0e9a   David Howells   CRED: Wrap task c...
768
769
  		inode->i_uid = current_fsuid();
  		inode->i_gid = current_fsgid();
ddbcc7e8e   Paul Menage   Task Control Grou...
770
771
772
773
774
  		inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
  		inode->i_mapping->backing_dev_info = &cgroup_backing_dev_info;
  	}
  	return inode;
  }
4fca88c87   KAMEZAWA Hiroyuki   memory cgroup enh...
775
776
777
778
  /*
   * Call subsys's pre_destroy handler.
   * This is called before css refcnt check.
   */
ec64f5154   KAMEZAWA Hiroyuki   cgroup: fix frequ...
779
  static int cgroup_call_pre_destroy(struct cgroup *cgrp)
4fca88c87   KAMEZAWA Hiroyuki   memory cgroup enh...
780
781
  {
  	struct cgroup_subsys *ss;
ec64f5154   KAMEZAWA Hiroyuki   cgroup: fix frequ...
782
  	int ret = 0;
4fca88c87   KAMEZAWA Hiroyuki   memory cgroup enh...
783
  	for_each_subsys(cgrp->root, ss)
ec64f5154   KAMEZAWA Hiroyuki   cgroup: fix frequ...
784
785
786
  		if (ss->pre_destroy) {
  			ret = ss->pre_destroy(ss, cgrp);
  			if (ret)
4ab78683c   Kirill A. Shutemov   cgroups: fix race...
787
  				break;
ec64f5154   KAMEZAWA Hiroyuki   cgroup: fix frequ...
788
  		}
0dea11687   Kirill A. Shutemov   cgroup: implement...
789

ec64f5154   KAMEZAWA Hiroyuki   cgroup: fix frequ...
790
  	return ret;
4fca88c87   KAMEZAWA Hiroyuki   memory cgroup enh...
791
  }
ddbcc7e8e   Paul Menage   Task Control Grou...
792
793
794
795
  static void cgroup_diput(struct dentry *dentry, struct inode *inode)
  {
  	/* is dentry a directory ? if so, kfree() associated cgroup */
  	if (S_ISDIR(inode->i_mode)) {
bd89aabc6   Paul Menage   Control groups: R...
796
  		struct cgroup *cgrp = dentry->d_fsdata;
8dc4f3e17   Paul Menage   cgroups: move cgr...
797
  		struct cgroup_subsys *ss;
bd89aabc6   Paul Menage   Control groups: R...
798
  		BUG_ON(!(cgroup_is_removed(cgrp)));
81a6a5cdd   Paul Menage   Task Control Grou...
799
800
801
802
803
804
805
  		/* It's possible for external users to be holding css
  		 * reference counts on a cgroup; css_put() needs to
  		 * be able to access the cgroup after decrementing
  		 * the reference count in order to know if it needs to
  		 * queue the cgroup to be handled by the release
  		 * agent */
  		synchronize_rcu();
8dc4f3e17   Paul Menage   cgroups: move cgr...
806
807
808
809
810
  
  		mutex_lock(&cgroup_mutex);
  		/*
  		 * Release the subsystem state objects.
  		 */
75139b827   Li Zefan   cgroups: remove s...
811
812
  		for_each_subsys(cgrp->root, ss)
  			ss->destroy(ss, cgrp);
8dc4f3e17   Paul Menage   cgroups: move cgr...
813
814
815
  
  		cgrp->root->number_of_cgroups--;
  		mutex_unlock(&cgroup_mutex);
a47295e6b   Paul Menage   cgroups: make cgr...
816
817
818
819
  		/*
  		 * Drop the active superblock reference that we took when we
  		 * created the cgroup
  		 */
8dc4f3e17   Paul Menage   cgroups: move cgr...
820
  		deactivate_super(cgrp->root->sb);
72a8cb30d   Ben Blum   cgroups: ensure c...
821
822
823
824
825
  		/*
  		 * if we're getting rid of the cgroup, refcount should ensure
  		 * that there are no pidlists left.
  		 */
  		BUG_ON(!list_empty(&cgrp->pidlists));
f2da1c40d   Lai Jiangshan   cgroup,rcu: conve...
826
  		kfree_rcu(cgrp, rcu_head);
ddbcc7e8e   Paul Menage   Task Control Grou...
827
828
829
  	}
  	iput(inode);
  }
c72a04e34   Al Viro   cgroup_fs: fix cg...
830
831
832
833
  static int cgroup_delete(const struct dentry *d)
  {
  	return 1;
  }
ddbcc7e8e   Paul Menage   Task Control Grou...
834
835
836
837
838
839
840
841
842
843
844
845
846
847
/* Delete and rmdir the directory dentry @d from its parent directory. */
static void remove_dir(struct dentry *d)
{
	struct dentry *parent = dget(d->d_parent);

	d_delete(d);
	simple_rmdir(parent->d_inode, d);
	dput(parent);
}
  
  static void cgroup_clear_directory(struct dentry *dentry)
  {
  	struct list_head *node;
  
  	BUG_ON(!mutex_is_locked(&dentry->d_inode->i_mutex));
2fd6b7f50   Nick Piggin   fs: dcache scale ...
848
  	spin_lock(&dentry->d_lock);
ddbcc7e8e   Paul Menage   Task Control Grou...
849
850
851
  	node = dentry->d_subdirs.next;
  	while (node != &dentry->d_subdirs) {
  		struct dentry *d = list_entry(node, struct dentry, d_u.d_child);
2fd6b7f50   Nick Piggin   fs: dcache scale ...
852
853
  
  		spin_lock_nested(&d->d_lock, DENTRY_D_LOCK_NESTED);
ddbcc7e8e   Paul Menage   Task Control Grou...
854
855
856
857
858
  		list_del_init(node);
  		if (d->d_inode) {
  			/* This should never be called on a cgroup
  			 * directory with child cgroups */
  			BUG_ON(d->d_inode->i_mode & S_IFDIR);
dc0474be3   Nick Piggin   fs: dcache ration...
859
  			dget_dlock(d);
2fd6b7f50   Nick Piggin   fs: dcache scale ...
860
861
  			spin_unlock(&d->d_lock);
  			spin_unlock(&dentry->d_lock);
ddbcc7e8e   Paul Menage   Task Control Grou...
862
863
864
  			d_delete(d);
  			simple_unlink(dentry->d_inode, d);
  			dput(d);
2fd6b7f50   Nick Piggin   fs: dcache scale ...
865
866
867
  			spin_lock(&dentry->d_lock);
  		} else
  			spin_unlock(&d->d_lock);
ddbcc7e8e   Paul Menage   Task Control Grou...
868
869
  		node = dentry->d_subdirs.next;
  	}
2fd6b7f50   Nick Piggin   fs: dcache scale ...
870
  	spin_unlock(&dentry->d_lock);
ddbcc7e8e   Paul Menage   Task Control Grou...
871
872
873
874
875
876
877
  }
  
  /*
   * NOTE : the dentry must have been dget()'ed
   */
  static void cgroup_d_remove_dir(struct dentry *dentry)
  {
2fd6b7f50   Nick Piggin   fs: dcache scale ...
878
  	struct dentry *parent;
ddbcc7e8e   Paul Menage   Task Control Grou...
879
  	cgroup_clear_directory(dentry);
2fd6b7f50   Nick Piggin   fs: dcache scale ...
880
881
  	parent = dentry->d_parent;
  	spin_lock(&parent->d_lock);
3ec762ad8   Li Zefan   cgroups: Fix a lo...
882
  	spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
ddbcc7e8e   Paul Menage   Task Control Grou...
883
  	list_del_init(&dentry->d_u.d_child);
2fd6b7f50   Nick Piggin   fs: dcache scale ...
884
885
  	spin_unlock(&dentry->d_lock);
  	spin_unlock(&parent->d_lock);
ddbcc7e8e   Paul Menage   Task Control Grou...
886
887
  	remove_dir(dentry);
  }
ec64f5154   KAMEZAWA Hiroyuki   cgroup: fix frequ...
888
889
890
891
892
893
  /*
   * A queue for waiters to do rmdir() cgroup. A tasks will sleep when
   * cgroup->count == 0 && list_empty(&cgroup->children) && subsys has some
   * reference to css->refcnt. In general, this refcnt is expected to goes down
   * to zero, soon.
   *
887032670   KAMEZAWA Hiroyuki   cgroup avoid perm...
894
   * CGRP_WAIT_ON_RMDIR flag is set under cgroup's inode->i_mutex;
ec64f5154   KAMEZAWA Hiroyuki   cgroup: fix frequ...
895
   */
1c6c3fad8   Kirill A. Shutemov   cgroup: mark cgro...
896
  static DECLARE_WAIT_QUEUE_HEAD(cgroup_rmdir_waitq);
ec64f5154   KAMEZAWA Hiroyuki   cgroup: fix frequ...
897

887032670   KAMEZAWA Hiroyuki   cgroup avoid perm...
898
  static void cgroup_wakeup_rmdir_waiter(struct cgroup *cgrp)
ec64f5154   KAMEZAWA Hiroyuki   cgroup: fix frequ...
899
  {
887032670   KAMEZAWA Hiroyuki   cgroup avoid perm...
900
  	if (unlikely(test_and_clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags)))
ec64f5154   KAMEZAWA Hiroyuki   cgroup: fix frequ...
901
902
  		wake_up_all(&cgroup_rmdir_waitq);
  }
887032670   KAMEZAWA Hiroyuki   cgroup avoid perm...
903
904
905
906
907
908
909
910
911
912
  void cgroup_exclude_rmdir(struct cgroup_subsys_state *css)
  {
  	css_get(css);
  }
  
/*
 * Drop the reference taken by cgroup_exclude_rmdir() and wake any task
 * waiting in rmdir() on this css's cgroup.
 */
void cgroup_release_and_wakeup_rmdir(struct cgroup_subsys_state *css)
{
	cgroup_wakeup_rmdir_waiter(css->cgroup);
	css_put(css);
}
aae8aab40   Ben Blum   cgroups: revamp s...
913
  /*
cf5d5941f   Ben Blum   cgroups: subsyste...
914
915
916
   * Call with cgroup_mutex held. Drops reference counts on modules, including
   * any duplicate ones that parse_cgroupfs_options took. If this function
   * returns an error, no reference counts are touched.
aae8aab40   Ben Blum   cgroups: revamp s...
917
   */
ddbcc7e8e   Paul Menage   Task Control Grou...
918
919
920
921
  static int rebind_subsystems(struct cgroupfs_root *root,
  			      unsigned long final_bits)
  {
  	unsigned long added_bits, removed_bits;
bd89aabc6   Paul Menage   Control groups: R...
922
  	struct cgroup *cgrp = &root->top_cgroup;
ddbcc7e8e   Paul Menage   Task Control Grou...
923
  	int i;
aae8aab40   Ben Blum   cgroups: revamp s...
924
  	BUG_ON(!mutex_is_locked(&cgroup_mutex));
e25e2cbb4   Tejun Heo   cgroup: add cgrou...
925
  	BUG_ON(!mutex_is_locked(&cgroup_root_mutex));
aae8aab40   Ben Blum   cgroups: revamp s...
926

ddbcc7e8e   Paul Menage   Task Control Grou...
927
928
929
930
  	removed_bits = root->actual_subsys_bits & ~final_bits;
  	added_bits = final_bits & ~root->actual_subsys_bits;
  	/* Check that any added subsystems are currently free */
  	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
8d53d55d2   Li Zefan   cgroup: fix subsy...
931
  		unsigned long bit = 1UL << i;
ddbcc7e8e   Paul Menage   Task Control Grou...
932
933
934
  		struct cgroup_subsys *ss = subsys[i];
  		if (!(bit & added_bits))
  			continue;
aae8aab40   Ben Blum   cgroups: revamp s...
935
936
937
938
939
940
  		/*
  		 * Nobody should tell us to do a subsys that doesn't exist:
  		 * parse_cgroupfs_options should catch that case and refcounts
  		 * ensure that subsystems won't disappear once selected.
  		 */
  		BUG_ON(ss == NULL);
ddbcc7e8e   Paul Menage   Task Control Grou...
941
942
943
944
945
946
947
948
949
950
  		if (ss->root != &rootnode) {
  			/* Subsystem isn't free */
  			return -EBUSY;
  		}
  	}
  
  	/* Currently we don't handle adding/removing subsystems when
  	 * any child cgroups exist. This is theoretically supportable
  	 * but involves complex error handling, so it's being left until
  	 * later */
307257cf4   Paul Menage   cgroups: fix a ra...
951
  	if (root->number_of_cgroups > 1)
ddbcc7e8e   Paul Menage   Task Control Grou...
952
953
954
955
956
957
958
959
  		return -EBUSY;
  
  	/* Process each subsystem */
  	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
  		struct cgroup_subsys *ss = subsys[i];
  		unsigned long bit = 1UL << i;
  		if (bit & added_bits) {
  			/* We're binding this subsystem to this hierarchy */
aae8aab40   Ben Blum   cgroups: revamp s...
960
  			BUG_ON(ss == NULL);
bd89aabc6   Paul Menage   Control groups: R...
961
  			BUG_ON(cgrp->subsys[i]);
ddbcc7e8e   Paul Menage   Task Control Grou...
962
963
  			BUG_ON(!dummytop->subsys[i]);
  			BUG_ON(dummytop->subsys[i]->cgroup != dummytop);
999cd8a45   Paul Menage   cgroups: add a pe...
964
  			mutex_lock(&ss->hierarchy_mutex);
bd89aabc6   Paul Menage   Control groups: R...
965
966
  			cgrp->subsys[i] = dummytop->subsys[i];
  			cgrp->subsys[i]->cgroup = cgrp;
33a68ac1c   Li Zefan   cgroups: add inac...
967
  			list_move(&ss->sibling, &root->subsys_list);
b2aa30f7b   Lai Jiangshan   cgroups: don't pu...
968
  			ss->root = root;
ddbcc7e8e   Paul Menage   Task Control Grou...
969
  			if (ss->bind)
bd89aabc6   Paul Menage   Control groups: R...
970
  				ss->bind(ss, cgrp);
999cd8a45   Paul Menage   cgroups: add a pe...
971
  			mutex_unlock(&ss->hierarchy_mutex);
cf5d5941f   Ben Blum   cgroups: subsyste...
972
  			/* refcount was already taken, and we're keeping it */
ddbcc7e8e   Paul Menage   Task Control Grou...
973
974
  		} else if (bit & removed_bits) {
  			/* We're removing this subsystem */
aae8aab40   Ben Blum   cgroups: revamp s...
975
  			BUG_ON(ss == NULL);
bd89aabc6   Paul Menage   Control groups: R...
976
977
  			BUG_ON(cgrp->subsys[i] != dummytop->subsys[i]);
  			BUG_ON(cgrp->subsys[i]->cgroup != cgrp);
999cd8a45   Paul Menage   cgroups: add a pe...
978
  			mutex_lock(&ss->hierarchy_mutex);
ddbcc7e8e   Paul Menage   Task Control Grou...
979
980
981
  			if (ss->bind)
  				ss->bind(ss, dummytop);
  			dummytop->subsys[i]->cgroup = dummytop;
bd89aabc6   Paul Menage   Control groups: R...
982
  			cgrp->subsys[i] = NULL;
b2aa30f7b   Lai Jiangshan   cgroups: don't pu...
983
  			subsys[i]->root = &rootnode;
33a68ac1c   Li Zefan   cgroups: add inac...
984
  			list_move(&ss->sibling, &rootnode.subsys_list);
999cd8a45   Paul Menage   cgroups: add a pe...
985
  			mutex_unlock(&ss->hierarchy_mutex);
cf5d5941f   Ben Blum   cgroups: subsyste...
986
987
  			/* subsystem is now free - drop reference on module */
  			module_put(ss->module);
ddbcc7e8e   Paul Menage   Task Control Grou...
988
989
  		} else if (bit & final_bits) {
  			/* Subsystem state should already exist */
aae8aab40   Ben Blum   cgroups: revamp s...
990
  			BUG_ON(ss == NULL);
bd89aabc6   Paul Menage   Control groups: R...
991
  			BUG_ON(!cgrp->subsys[i]);
cf5d5941f   Ben Blum   cgroups: subsyste...
992
993
994
995
996
997
998
999
  			/*
  			 * a refcount was taken, but we already had one, so
  			 * drop the extra reference.
  			 */
  			module_put(ss->module);
  #ifdef CONFIG_MODULE_UNLOAD
  			BUG_ON(ss->module && !module_refcount(ss->module));
  #endif
ddbcc7e8e   Paul Menage   Task Control Grou...
1000
1001
  		} else {
  			/* Subsystem state shouldn't exist */
bd89aabc6   Paul Menage   Control groups: R...
1002
  			BUG_ON(cgrp->subsys[i]);
ddbcc7e8e   Paul Menage   Task Control Grou...
1003
1004
1005
1006
1007
1008
1009
  		}
  	}
  	root->subsys_bits = root->actual_subsys_bits = final_bits;
  	synchronize_rcu();
  
  	return 0;
  }
34c80b1d9   Al Viro   vfs: switch ->sho...
1010
  static int cgroup_show_options(struct seq_file *seq, struct dentry *dentry)
ddbcc7e8e   Paul Menage   Task Control Grou...
1011
  {
34c80b1d9   Al Viro   vfs: switch ->sho...
1012
  	struct cgroupfs_root *root = dentry->d_sb->s_fs_info;
ddbcc7e8e   Paul Menage   Task Control Grou...
1013
  	struct cgroup_subsys *ss;
e25e2cbb4   Tejun Heo   cgroup: add cgrou...
1014
  	mutex_lock(&cgroup_root_mutex);
ddbcc7e8e   Paul Menage   Task Control Grou...
1015
1016
1017
1018
  	for_each_subsys(root, ss)
  		seq_printf(seq, ",%s", ss->name);
  	if (test_bit(ROOT_NOPREFIX, &root->flags))
  		seq_puts(seq, ",noprefix");
81a6a5cdd   Paul Menage   Task Control Grou...
1019
1020
  	if (strlen(root->release_agent_path))
  		seq_printf(seq, ",release_agent=%s", root->release_agent_path);
97978e6d1   Daniel Lezcano   cgroup: add clone...
1021
1022
  	if (clone_children(&root->top_cgroup))
  		seq_puts(seq, ",clone_children");
c6d57f331   Paul Menage   cgroups: support ...
1023
1024
  	if (strlen(root->name))
  		seq_printf(seq, ",name=%s", root->name);
e25e2cbb4   Tejun Heo   cgroup: add cgrou...
1025
  	mutex_unlock(&cgroup_root_mutex);
ddbcc7e8e   Paul Menage   Task Control Grou...
1026
1027
1028
1029
1030
1031
  	return 0;
  }
  
  struct cgroup_sb_opts {
  	unsigned long subsys_bits;
  	unsigned long flags;
81a6a5cdd   Paul Menage   Task Control Grou...
1032
  	char *release_agent;
97978e6d1   Daniel Lezcano   cgroup: add clone...
1033
  	bool clone_children;
c6d57f331   Paul Menage   cgroups: support ...
1034
  	char *name;
2c6ab6d20   Paul Menage   cgroups: allow cg...
1035
1036
  	/* User explicitly requested empty subsystem */
  	bool none;
c6d57f331   Paul Menage   cgroups: support ...
1037
1038
  
  	struct cgroupfs_root *new_root;
2c6ab6d20   Paul Menage   cgroups: allow cg...
1039

ddbcc7e8e   Paul Menage   Task Control Grou...
1040
  };
aae8aab40   Ben Blum   cgroups: revamp s...
1041
1042
  /*
   * Convert a hierarchy specifier into a bitmask of subsystems and flags. Call
cf5d5941f   Ben Blum   cgroups: subsyste...
1043
1044
1045
   * with cgroup_mutex held to protect the subsys[] array. This function takes
   * refcounts on subsystems to be used, unless it returns error, in which case
   * no refcounts are taken.
aae8aab40   Ben Blum   cgroups: revamp s...
1046
   */
cf5d5941f   Ben Blum   cgroups: subsyste...
1047
  static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
ddbcc7e8e   Paul Menage   Task Control Grou...
1048
  {
32a8cf235   Daniel Lezcano   cgroup: make the ...
1049
1050
  	char *token, *o = data;
  	bool all_ss = false, one_ss = false;
f9ab5b5b0   Li Zefan   cgroups: forbid n...
1051
  	unsigned long mask = (unsigned long)-1;
cf5d5941f   Ben Blum   cgroups: subsyste...
1052
1053
  	int i;
  	bool module_pin_failed = false;
f9ab5b5b0   Li Zefan   cgroups: forbid n...
1054

aae8aab40   Ben Blum   cgroups: revamp s...
1055
  	BUG_ON(!mutex_is_locked(&cgroup_mutex));
f9ab5b5b0   Li Zefan   cgroups: forbid n...
1056
1057
1058
  #ifdef CONFIG_CPUSETS
  	mask = ~(1UL << cpuset_subsys_id);
  #endif
ddbcc7e8e   Paul Menage   Task Control Grou...
1059

c6d57f331   Paul Menage   cgroups: support ...
1060
  	memset(opts, 0, sizeof(*opts));
ddbcc7e8e   Paul Menage   Task Control Grou...
1061
1062
1063
1064
  
  	while ((token = strsep(&o, ",")) != NULL) {
  		if (!*token)
  			return -EINVAL;
32a8cf235   Daniel Lezcano   cgroup: make the ...
1065
  		if (!strcmp(token, "none")) {
2c6ab6d20   Paul Menage   cgroups: allow cg...
1066
1067
  			/* Explicitly have no subsystems */
  			opts->none = true;
32a8cf235   Daniel Lezcano   cgroup: make the ...
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
  			continue;
  		}
  		if (!strcmp(token, "all")) {
  			/* Mutually exclusive option 'all' + subsystem name */
  			if (one_ss)
  				return -EINVAL;
  			all_ss = true;
  			continue;
  		}
  		if (!strcmp(token, "noprefix")) {
ddbcc7e8e   Paul Menage   Task Control Grou...
1078
  			set_bit(ROOT_NOPREFIX, &opts->flags);
32a8cf235   Daniel Lezcano   cgroup: make the ...
1079
1080
1081
  			continue;
  		}
  		if (!strcmp(token, "clone_children")) {
97978e6d1   Daniel Lezcano   cgroup: add clone...
1082
  			opts->clone_children = true;
32a8cf235   Daniel Lezcano   cgroup: make the ...
1083
1084
1085
  			continue;
  		}
  		if (!strncmp(token, "release_agent=", 14)) {
81a6a5cdd   Paul Menage   Task Control Grou...
1086
1087
1088
  			/* Specifying two release agents is forbidden */
  			if (opts->release_agent)
  				return -EINVAL;
c6d57f331   Paul Menage   cgroups: support ...
1089
  			opts->release_agent =
e400c2852   Dan Carpenter   cgroups: save spa...
1090
  				kstrndup(token + 14, PATH_MAX - 1, GFP_KERNEL);
81a6a5cdd   Paul Menage   Task Control Grou...
1091
1092
  			if (!opts->release_agent)
  				return -ENOMEM;
32a8cf235   Daniel Lezcano   cgroup: make the ...
1093
1094
1095
  			continue;
  		}
  		if (!strncmp(token, "name=", 5)) {
c6d57f331   Paul Menage   cgroups: support ...
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
  			const char *name = token + 5;
  			/* Can't specify an empty name */
  			if (!strlen(name))
  				return -EINVAL;
  			/* Must match [\w.-]+ */
  			for (i = 0; i < strlen(name); i++) {
  				char c = name[i];
  				if (isalnum(c))
  					continue;
  				if ((c == '.') || (c == '-') || (c == '_'))
  					continue;
  				return -EINVAL;
  			}
  			/* Specifying two names is forbidden */
  			if (opts->name)
  				return -EINVAL;
  			opts->name = kstrndup(name,
e400c2852   Dan Carpenter   cgroups: save spa...
1113
  					      MAX_CGROUP_ROOT_NAMELEN - 1,
c6d57f331   Paul Menage   cgroups: support ...
1114
1115
1116
  					      GFP_KERNEL);
  			if (!opts->name)
  				return -ENOMEM;
32a8cf235   Daniel Lezcano   cgroup: make the ...
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
  
  			continue;
  		}
  
  		for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
  			struct cgroup_subsys *ss = subsys[i];
  			if (ss == NULL)
  				continue;
  			if (strcmp(token, ss->name))
  				continue;
  			if (ss->disabled)
  				continue;
  
  			/* Mutually exclusive option 'all' + subsystem name */
  			if (all_ss)
  				return -EINVAL;
  			set_bit(i, &opts->subsys_bits);
  			one_ss = true;
  
  			break;
  		}
  		if (i == CGROUP_SUBSYS_COUNT)
  			return -ENOENT;
  	}
  
  	/*
  	 * If the 'all' option was specified select all the subsystems,
0d19ea866   Li Zefan   cgroup: fix to al...
1144
1145
  	 * otherwise if 'none', 'name=' and a subsystem name options
  	 * were not specified, let's default to 'all'
32a8cf235   Daniel Lezcano   cgroup: make the ...
1146
  	 */
0d19ea866   Li Zefan   cgroup: fix to al...
1147
  	if (all_ss || (!one_ss && !opts->none && !opts->name)) {
32a8cf235   Daniel Lezcano   cgroup: make the ...
1148
1149
1150
1151
1152
1153
1154
  		for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
  			struct cgroup_subsys *ss = subsys[i];
  			if (ss == NULL)
  				continue;
  			if (ss->disabled)
  				continue;
  			set_bit(i, &opts->subsys_bits);
ddbcc7e8e   Paul Menage   Task Control Grou...
1155
1156
  		}
  	}
2c6ab6d20   Paul Menage   cgroups: allow cg...
1157
  	/* Consistency checks */
f9ab5b5b0   Li Zefan   cgroups: forbid n...
1158
1159
1160
1161
1162
1163
1164
1165
  	/*
  	 * Option noprefix was introduced just for backward compatibility
  	 * with the old cpuset, so we allow noprefix only if mounting just
  	 * the cpuset subsystem.
  	 */
  	if (test_bit(ROOT_NOPREFIX, &opts->flags) &&
  	    (opts->subsys_bits & mask))
  		return -EINVAL;
2c6ab6d20   Paul Menage   cgroups: allow cg...
1166
1167
1168
1169
1170
1171
1172
1173
1174
  
  	/* Can't specify "none" and some subsystems */
  	if (opts->subsys_bits && opts->none)
  		return -EINVAL;
  
  	/*
  	 * We either have to specify by name or by subsystems. (So all
  	 * empty hierarchies must have a name).
  	 */
c6d57f331   Paul Menage   cgroups: support ...
1175
  	if (!opts->subsys_bits && !opts->name)
ddbcc7e8e   Paul Menage   Task Control Grou...
1176
  		return -EINVAL;
cf5d5941f   Ben Blum   cgroups: subsyste...
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
  	/*
  	 * Grab references on all the modules we'll need, so the subsystems
  	 * don't dance around before rebind_subsystems attaches them. This may
  	 * take duplicate reference counts on a subsystem that's already used,
  	 * but rebind_subsystems handles this case.
  	 */
  	for (i = CGROUP_BUILTIN_SUBSYS_COUNT; i < CGROUP_SUBSYS_COUNT; i++) {
  		unsigned long bit = 1UL << i;
  
  		if (!(bit & opts->subsys_bits))
  			continue;
  		if (!try_module_get(subsys[i]->module)) {
  			module_pin_failed = true;
  			break;
  		}
  	}
  	if (module_pin_failed) {
  		/*
  		 * oops, one of the modules was going away. this means that we
  		 * raced with a module_delete call, and to the user this is
  		 * essentially a "subsystem doesn't exist" case.
  		 */
  		for (i--; i >= CGROUP_BUILTIN_SUBSYS_COUNT; i--) {
  			/* drop refcounts only on the ones we took */
  			unsigned long bit = 1UL << i;
  
  			if (!(bit & opts->subsys_bits))
  				continue;
  			module_put(subsys[i]->module);
  		}
  		return -ENOENT;
  	}
ddbcc7e8e   Paul Menage   Task Control Grou...
1209
1210
  	return 0;
  }
cf5d5941f   Ben Blum   cgroups: subsyste...
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
  static void drop_parsed_module_refcounts(unsigned long subsys_bits)
  {
  	int i;
  	for (i = CGROUP_BUILTIN_SUBSYS_COUNT; i < CGROUP_SUBSYS_COUNT; i++) {
  		unsigned long bit = 1UL << i;
  
  		if (!(bit & subsys_bits))
  			continue;
  		module_put(subsys[i]->module);
  	}
  }
ddbcc7e8e   Paul Menage   Task Control Grou...
1222
1223
1224
1225
  static int cgroup_remount(struct super_block *sb, int *flags, char *data)
  {
  	int ret = 0;
  	struct cgroupfs_root *root = sb->s_fs_info;
bd89aabc6   Paul Menage   Control groups: R...
1226
  	struct cgroup *cgrp = &root->top_cgroup;
ddbcc7e8e   Paul Menage   Task Control Grou...
1227
  	struct cgroup_sb_opts opts;
bd89aabc6   Paul Menage   Control groups: R...
1228
  	mutex_lock(&cgrp->dentry->d_inode->i_mutex);
ddbcc7e8e   Paul Menage   Task Control Grou...
1229
  	mutex_lock(&cgroup_mutex);
e25e2cbb4   Tejun Heo   cgroup: add cgrou...
1230
  	mutex_lock(&cgroup_root_mutex);
ddbcc7e8e   Paul Menage   Task Control Grou...
1231
1232
1233
1234
1235
  
  	/* See what subsystems are wanted */
  	ret = parse_cgroupfs_options(data, &opts);
  	if (ret)
  		goto out_unlock;
cf5d5941f   Ben Blum   cgroups: subsyste...
1236
1237
1238
  	/* Don't allow flags or name to change at remount */
  	if (opts.flags != root->flags ||
  	    (opts.name && strcmp(opts.name, root->name))) {
c6d57f331   Paul Menage   cgroups: support ...
1239
  		ret = -EINVAL;
cf5d5941f   Ben Blum   cgroups: subsyste...
1240
  		drop_parsed_module_refcounts(opts.subsys_bits);
c6d57f331   Paul Menage   cgroups: support ...
1241
1242
  		goto out_unlock;
  	}
ddbcc7e8e   Paul Menage   Task Control Grou...
1243
  	ret = rebind_subsystems(root, opts.subsys_bits);
cf5d5941f   Ben Blum   cgroups: subsyste...
1244
1245
  	if (ret) {
  		drop_parsed_module_refcounts(opts.subsys_bits);
0670e08bd   Li Zefan   cgroups: don't ch...
1246
  		goto out_unlock;
cf5d5941f   Ben Blum   cgroups: subsyste...
1247
  	}
ddbcc7e8e   Paul Menage   Task Control Grou...
1248
1249
  
  	/* (re)populate subsystem files */
0670e08bd   Li Zefan   cgroups: don't ch...
1250
  	cgroup_populate_dir(cgrp);
ddbcc7e8e   Paul Menage   Task Control Grou...
1251

81a6a5cdd   Paul Menage   Task Control Grou...
1252
1253
  	if (opts.release_agent)
  		strcpy(root->release_agent_path, opts.release_agent);
ddbcc7e8e   Paul Menage   Task Control Grou...
1254
   out_unlock:
66bdc9cfc   Jesper Juhl   kernel/cgroup.c: ...
1255
  	kfree(opts.release_agent);
c6d57f331   Paul Menage   cgroups: support ...
1256
  	kfree(opts.name);
e25e2cbb4   Tejun Heo   cgroup: add cgrou...
1257
  	mutex_unlock(&cgroup_root_mutex);
ddbcc7e8e   Paul Menage   Task Control Grou...
1258
  	mutex_unlock(&cgroup_mutex);
bd89aabc6   Paul Menage   Control groups: R...
1259
  	mutex_unlock(&cgrp->dentry->d_inode->i_mutex);
ddbcc7e8e   Paul Menage   Task Control Grou...
1260
1261
  	return ret;
  }
b87221de6   Alexey Dobriyan   const: mark remai...
1262
/*
 * Superblock operations shared by every mounted cgroup hierarchy.
 * Only remount is cgroup-specific; the rest use generic VFS helpers.
 */
static const struct super_operations cgroup_ops = {
	.statfs = simple_statfs,
	.drop_inode = generic_delete_inode,
	.show_options = cgroup_show_options,
	.remount_fs = cgroup_remount,
};
cc31edcee   Paul Menage   cgroups: convert ...
1268
1269
1270
1271
1272
1273
  static void init_cgroup_housekeeping(struct cgroup *cgrp)
  {
  	INIT_LIST_HEAD(&cgrp->sibling);
  	INIT_LIST_HEAD(&cgrp->children);
  	INIT_LIST_HEAD(&cgrp->css_sets);
  	INIT_LIST_HEAD(&cgrp->release_list);
72a8cb30d   Ben Blum   cgroups: ensure c...
1274
1275
  	INIT_LIST_HEAD(&cgrp->pidlists);
  	mutex_init(&cgrp->pidlist_mutex);
0dea11687   Kirill A. Shutemov   cgroup: implement...
1276
1277
  	INIT_LIST_HEAD(&cgrp->event_list);
  	spin_lock_init(&cgrp->event_list_lock);
cc31edcee   Paul Menage   cgroups: convert ...
1278
  }
c6d57f331   Paul Menage   cgroups: support ...
1279

ddbcc7e8e   Paul Menage   Task Control Grou...
1280
1281
  static void init_cgroup_root(struct cgroupfs_root *root)
  {
bd89aabc6   Paul Menage   Control groups: R...
1282
  	struct cgroup *cgrp = &root->top_cgroup;
ddbcc7e8e   Paul Menage   Task Control Grou...
1283
1284
1285
  	INIT_LIST_HEAD(&root->subsys_list);
  	INIT_LIST_HEAD(&root->root_list);
  	root->number_of_cgroups = 1;
bd89aabc6   Paul Menage   Control groups: R...
1286
1287
  	cgrp->root = root;
  	cgrp->top_cgroup = cgrp;
cc31edcee   Paul Menage   cgroups: convert ...
1288
  	init_cgroup_housekeeping(cgrp);
ddbcc7e8e   Paul Menage   Task Control Grou...
1289
  }
2c6ab6d20   Paul Menage   cgroups: allow cg...
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
  static bool init_root_id(struct cgroupfs_root *root)
  {
  	int ret = 0;
  
  	do {
  		if (!ida_pre_get(&hierarchy_ida, GFP_KERNEL))
  			return false;
  		spin_lock(&hierarchy_id_lock);
  		/* Try to allocate the next unused ID */
  		ret = ida_get_new_above(&hierarchy_ida, next_hierarchy_id,
  					&root->hierarchy_id);
  		if (ret == -ENOSPC)
  			/* Try again starting from 0 */
  			ret = ida_get_new(&hierarchy_ida, &root->hierarchy_id);
  		if (!ret) {
  			next_hierarchy_id = root->hierarchy_id + 1;
  		} else if (ret != -EAGAIN) {
  			/* Can only get here if the 31-bit IDR is full ... */
  			BUG_ON(ret);
  		}
  		spin_unlock(&hierarchy_id_lock);
  	} while (ret);
  	return true;
  }
ddbcc7e8e   Paul Menage   Task Control Grou...
1314
1315
  static int cgroup_test_super(struct super_block *sb, void *data)
  {
c6d57f331   Paul Menage   cgroups: support ...
1316
  	struct cgroup_sb_opts *opts = data;
ddbcc7e8e   Paul Menage   Task Control Grou...
1317
  	struct cgroupfs_root *root = sb->s_fs_info;
c6d57f331   Paul Menage   cgroups: support ...
1318
1319
1320
  	/* If we asked for a name then it must match */
  	if (opts->name && strcmp(opts->name, root->name))
  		return 0;
ddbcc7e8e   Paul Menage   Task Control Grou...
1321

2c6ab6d20   Paul Menage   cgroups: allow cg...
1322
1323
1324
1325
1326
1327
  	/*
  	 * If we asked for subsystems (or explicitly for no
  	 * subsystems) then they must match
  	 */
  	if ((opts->subsys_bits || opts->none)
  	    && (opts->subsys_bits != root->subsys_bits))
ddbcc7e8e   Paul Menage   Task Control Grou...
1328
1329
1330
1331
  		return 0;
  
  	return 1;
  }
c6d57f331   Paul Menage   cgroups: support ...
1332
1333
1334
  static struct cgroupfs_root *cgroup_root_from_opts(struct cgroup_sb_opts *opts)
  {
  	struct cgroupfs_root *root;
2c6ab6d20   Paul Menage   cgroups: allow cg...
1335
  	if (!opts->subsys_bits && !opts->none)
c6d57f331   Paul Menage   cgroups: support ...
1336
1337
1338
1339
1340
  		return NULL;
  
  	root = kzalloc(sizeof(*root), GFP_KERNEL);
  	if (!root)
  		return ERR_PTR(-ENOMEM);
2c6ab6d20   Paul Menage   cgroups: allow cg...
1341
1342
1343
1344
  	if (!init_root_id(root)) {
  		kfree(root);
  		return ERR_PTR(-ENOMEM);
  	}
c6d57f331   Paul Menage   cgroups: support ...
1345
  	init_cgroup_root(root);
2c6ab6d20   Paul Menage   cgroups: allow cg...
1346

c6d57f331   Paul Menage   cgroups: support ...
1347
1348
1349
1350
1351
1352
  	root->subsys_bits = opts->subsys_bits;
  	root->flags = opts->flags;
  	if (opts->release_agent)
  		strcpy(root->release_agent_path, opts->release_agent);
  	if (opts->name)
  		strcpy(root->name, opts->name);
97978e6d1   Daniel Lezcano   cgroup: add clone...
1353
1354
  	if (opts->clone_children)
  		set_bit(CGRP_CLONE_CHILDREN, &root->top_cgroup.flags);
c6d57f331   Paul Menage   cgroups: support ...
1355
1356
  	return root;
  }
2c6ab6d20   Paul Menage   cgroups: allow cg...
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
/*
 * Release a cgroupfs_root: return its hierarchy id to hierarchy_ida
 * and free the structure.  Tolerates NULL so callers can pass an
 * unused opts.new_root unconditionally.
 */
static void cgroup_drop_root(struct cgroupfs_root *root)
{
	if (!root)
		return;

	/* a valid id must have been allocated by init_root_id() */
	BUG_ON(!root->hierarchy_id);
	spin_lock(&hierarchy_id_lock);
	ida_remove(&hierarchy_ida, root->hierarchy_id);
	spin_unlock(&hierarchy_id_lock);
	kfree(root);
}
ddbcc7e8e   Paul Menage   Task Control Grou...
1368
1369
1370
  static int cgroup_set_super(struct super_block *sb, void *data)
  {
  	int ret;
c6d57f331   Paul Menage   cgroups: support ...
1371
1372
1373
1374
1375
  	struct cgroup_sb_opts *opts = data;
  
  	/* If we don't have a new root, we can't set up a new sb */
  	if (!opts->new_root)
  		return -EINVAL;
2c6ab6d20   Paul Menage   cgroups: allow cg...
1376
  	BUG_ON(!opts->subsys_bits && !opts->none);
ddbcc7e8e   Paul Menage   Task Control Grou...
1377
1378
1379
1380
  
  	ret = set_anon_super(sb, NULL);
  	if (ret)
  		return ret;
c6d57f331   Paul Menage   cgroups: support ...
1381
1382
  	sb->s_fs_info = opts->new_root;
  	opts->new_root->sb = sb;
ddbcc7e8e   Paul Menage   Task Control Grou...
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
  
  	sb->s_blocksize = PAGE_CACHE_SIZE;
  	sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
  	sb->s_magic = CGROUP_SUPER_MAGIC;
  	sb->s_op = &cgroup_ops;
  
  	return 0;
  }
  
  static int cgroup_get_rootdir(struct super_block *sb)
  {
0df6a63f8   Al Viro   switch cgroup
1394
1395
  	static const struct dentry_operations cgroup_dops = {
  		.d_iput = cgroup_diput,
c72a04e34   Al Viro   cgroup_fs: fix cg...
1396
  		.d_delete = cgroup_delete,
0df6a63f8   Al Viro   switch cgroup
1397
  	};
ddbcc7e8e   Paul Menage   Task Control Grou...
1398
1399
1400
1401
1402
1403
  	struct inode *inode =
  		cgroup_new_inode(S_IFDIR | S_IRUGO | S_IXUGO | S_IWUSR, sb);
  	struct dentry *dentry;
  
  	if (!inode)
  		return -ENOMEM;
ddbcc7e8e   Paul Menage   Task Control Grou...
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
  	inode->i_fop = &simple_dir_operations;
  	inode->i_op = &cgroup_dir_inode_operations;
  	/* directories start off with i_nlink == 2 (for "." entry) */
  	inc_nlink(inode);
  	dentry = d_alloc_root(inode);
  	if (!dentry) {
  		iput(inode);
  		return -ENOMEM;
  	}
  	sb->s_root = dentry;
0df6a63f8   Al Viro   switch cgroup
1414
1415
  	/* for everything else we want ->d_op set */
  	sb->s_d_op = &cgroup_dops;
ddbcc7e8e   Paul Menage   Task Control Grou...
1416
1417
  	return 0;
  }
f7e835710   Al Viro   convert cgroup an...
1418
/*
 * cgroup_mount - ->mount for cgroup_fs_type
 *
 * Parse the mount options, then either create a brand-new hierarchy or
 * reuse an existing superblock whose options match (cgroup_test_super).
 * Module refcounts taken by parse_cgroupfs_options() are kept on the
 * new-hierarchy success path (rebind_subsystems() then owns dropping
 * them) and explicitly dropped on the reuse path and on every error
 * path.  Returns the root dentry or an ERR_PTR.
 */
static struct dentry *cgroup_mount(struct file_system_type *fs_type,
			 int flags, const char *unused_dev_name,
			 void *data)
{
	struct cgroup_sb_opts opts;
	struct cgroupfs_root *root;
	int ret = 0;
	struct super_block *sb;
	struct cgroupfs_root *new_root;
	struct inode *inode;	/* only assigned on the new-hierarchy path */

	/* First find the desired set of subsystems */
	mutex_lock(&cgroup_mutex);
	ret = parse_cgroupfs_options(data, &opts);
	mutex_unlock(&cgroup_mutex);
	if (ret)
		goto out_err;

	/*
	 * Allocate a new cgroup root. We may not need it if we're
	 * reusing an existing hierarchy.
	 */
	new_root = cgroup_root_from_opts(&opts);
	if (IS_ERR(new_root)) {
		ret = PTR_ERR(new_root);
		goto drop_modules;
	}
	opts.new_root = new_root;

	/* Locate an existing or new sb for this hierarchy */
	sb = sget(fs_type, cgroup_test_super, cgroup_set_super, &opts);
	if (IS_ERR(sb)) {
		ret = PTR_ERR(sb);
		cgroup_drop_root(opts.new_root);
		goto drop_modules;
	}

	root = sb->s_fs_info;
	BUG_ON(!root);
	if (root == opts.new_root) {
		/* We used the new root structure, so this is a new hierarchy */
		struct list_head tmp_cg_links;
		struct cgroup *root_cgrp = &root->top_cgroup;
		struct cgroupfs_root *existing_root;
		const struct cred *cred;
		int i;

		BUG_ON(sb->s_root != NULL);

		ret = cgroup_get_rootdir(sb);
		if (ret)
			goto drop_new_super;
		inode = sb->s_root->d_inode;

		/* lock order: i_mutex -> cgroup_mutex -> cgroup_root_mutex */
		mutex_lock(&inode->i_mutex);
		mutex_lock(&cgroup_mutex);
		mutex_lock(&cgroup_root_mutex);

		/* Check for name clashes with existing mounts */
		ret = -EBUSY;
		if (strlen(root->name))
			for_each_active_root(existing_root)
				if (!strcmp(existing_root->name, root->name))
					goto unlock_drop;

		/*
		 * We're accessing css_set_count without locking
		 * css_set_lock here, but that's OK - it can only be
		 * increased by someone holding cgroup_lock, and
		 * that's us. The worst that can happen is that we
		 * have some link structures left over
		 */
		ret = allocate_cg_links(css_set_count, &tmp_cg_links);
		if (ret)
			goto unlock_drop;

		ret = rebind_subsystems(root, root->subsys_bits);
		if (ret == -EBUSY) {
			free_cg_links(&tmp_cg_links);
			goto unlock_drop;
		}
		/*
		 * There must be no failure case after here, since rebinding
		 * takes care of subsystems' refcounts, which are explicitly
		 * dropped in the failure exit path.
		 */

		/* EBUSY should be the only error here */
		BUG_ON(ret);

		/* publish the hierarchy on the global roots list */
		list_add(&root->root_list, &roots);
		root_count++;

		sb->s_root->d_fsdata = root_cgrp;
		root->top_cgroup.dentry = sb->s_root;

		/* Link the top cgroup in this hierarchy into all
		 * the css_set objects */
		write_lock(&css_set_lock);
		for (i = 0; i < CSS_SET_TABLE_SIZE; i++) {
			struct hlist_head *hhead = &css_set_table[i];
			struct hlist_node *node;
			struct css_set *cg;

			hlist_for_each_entry(cg, node, hhead, hlist)
				link_css_set(&tmp_cg_links, cg, root_cgrp);
		}
		write_unlock(&css_set_lock);

		/* any pre-allocated links we did not need */
		free_cg_links(&tmp_cg_links);

		BUG_ON(!list_empty(&root_cgrp->sibling));
		BUG_ON(!list_empty(&root_cgrp->children));
		BUG_ON(root->number_of_cgroups != 1);

		/* populate as init so file creation isn't credential-limited */
		cred = override_creds(&init_cred);
		cgroup_populate_dir(root_cgrp);
		revert_creds(cred);
		mutex_unlock(&cgroup_root_mutex);
		mutex_unlock(&cgroup_mutex);
		mutex_unlock(&inode->i_mutex);
	} else {
		/*
		 * We re-used an existing hierarchy - the new root (if
		 * any) is not needed
		 */
		cgroup_drop_root(opts.new_root);
		/* no subsys rebinding, so refcounts don't change */
		drop_parsed_module_refcounts(opts.subsys_bits);
	}

	/* success: the option strings were copied into the root */
	kfree(opts.release_agent);
	kfree(opts.name);
	return dget(sb->s_root);

	/* only reachable with all three mutexes held; inode is valid here */
 unlock_drop:
	mutex_unlock(&cgroup_root_mutex);
	mutex_unlock(&cgroup_mutex);
	mutex_unlock(&inode->i_mutex);
 drop_new_super:
	deactivate_locked_super(sb);
 drop_modules:
	drop_parsed_module_refcounts(opts.subsys_bits);
 out_err:
	kfree(opts.release_agent);
	kfree(opts.name);
	return ERR_PTR(ret);
}
  
/*
 * cgroup_kill_sb - ->kill_sb for cgroup_fs_type
 *
 * Tear down a hierarchy on final unmount: rebind its subsystems back
 * to the default (dummy) hierarchy, sever all css_set links to its top
 * cgroup, unpublish it from the roots list, then release the VFS
 * superblock and the root structure itself.  All child cgroups must
 * already be gone (enforced by the BUG_ONs below).
 */
static void cgroup_kill_sb(struct super_block *sb) {
	struct cgroupfs_root *root = sb->s_fs_info;
	struct cgroup *cgrp = &root->top_cgroup;
	int ret;
	struct cg_cgroup_link *link;
	struct cg_cgroup_link *saved_link;

	BUG_ON(!root);

	/* only the top cgroup may remain at unmount time */
	BUG_ON(root->number_of_cgroups != 1);
	BUG_ON(!list_empty(&cgrp->children));
	BUG_ON(!list_empty(&cgrp->sibling));

	mutex_lock(&cgroup_mutex);
	mutex_lock(&cgroup_root_mutex);

	/* Rebind all subsystems back to the default hierarchy */
	ret = rebind_subsystems(root, 0);
	/* Shouldn't be able to fail ... */
	BUG_ON(ret);

	/*
	 * Release all the links from css_sets to this hierarchy's
	 * root cgroup
	 */
	write_lock(&css_set_lock);

	list_for_each_entry_safe(link, saved_link, &cgrp->css_sets,
				 cgrp_link_list) {
		/* unlink from both the css_set side and the cgroup side */
		list_del(&link->cg_link_list);
		list_del(&link->cgrp_link_list);
		kfree(link);
	}
	write_unlock(&css_set_lock);

	/* unpublish; the root may already be off the list (empty check) */
	if (!list_empty(&root->root_list)) {
		list_del(&root->root_list);
		root_count--;
	}

	mutex_unlock(&cgroup_root_mutex);
	mutex_unlock(&cgroup_mutex);

	kill_litter_super(sb);
	/* frees @root and returns its hierarchy id */
	cgroup_drop_root(root);
}
  
/*
 * Filesystem type behind "mount -t cgroup"; ->mount and ->kill_sb
 * above drive hierarchy creation and teardown.
 */
static struct file_system_type cgroup_fs_type = {
	.name = "cgroup",
	.mount = cgroup_mount,
	.kill_sb = cgroup_kill_sb,
};
676db4af0   Greg Kroah-Hartman   cgroupfs: create ...
1611
  static struct kobject *cgroup_kobj;
bd89aabc6   Paul Menage   Control groups: R...
1612
/* Return the cgroup stashed in a cgroup directory dentry's ->d_fsdata. */
static inline struct cgroup *__d_cgrp(struct dentry *dentry)
{
	return dentry->d_fsdata;
}
  
/*
 * Return the cftype stashed in ->d_fsdata.  Shares the slot with
 * __d_cgrp(); the caller must know whether it holds a directory or a
 * control-file dentry.
 */
static inline struct cftype *__d_cft(struct dentry *dentry)
{
	return dentry->d_fsdata;
}
a043e3b2c   Li Zefan   cgroup: fix comments
1621
1622
1623
1624
1625
1626
/**
 * cgroup_path - generate the path of a cgroup
 * @cgrp: the cgroup in question
 * @buf: the buffer to write the path into
 * @buflen: the length of the buffer
 *
 * Called with cgroup_mutex held or else with an RCU-protected cgroup
 * reference.  Writes path of cgroup into buf.  Returns 0 on success,
 * -errno on error.
 */
int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
{
	char *start;
	struct dentry *dentry = rcu_dereference_check(cgrp->dentry,
						      cgroup_lock_is_held());

	if (!dentry || cgrp == dummytop) {
		/*
		 * Inactive subsystems have no dentry for their root
		 * cgroup
		 */
		strcpy(buf, "/");
		return 0;
	}

	/* build the path right-to-left, starting from the terminator */
	start = buf + buflen;

	*--start = '\0';
	for (;;) {
		int len = dentry->d_name.len;

		/* prepend this component; fail if it would underrun buf */
		if ((start -= len) < buf)
			return -ENAMETOOLONG;
		memcpy(start, dentry->d_name.name, len);
		cgrp = cgrp->parent;
		if (!cgrp)
			break;

		dentry = rcu_dereference_check(cgrp->dentry,
					       cgroup_lock_is_held());
		/* no '/' before the hierarchy root's own name */
		if (!cgrp->parent)
			continue;
		if (--start < buf)
			return -ENAMETOOLONG;
		*start = '/';
	}
	/* slide the assembled path down to the start of the buffer */
	memmove(buf, start, buf + buflen - start);
	return 0;
}
EXPORT_SYMBOL_GPL(cgroup_path);
ddbcc7e8e   Paul Menage   Task Control Grou...
1671

74a1166df   Ben Blum   cgroups: make pro...
1672
  /*
2f7ee5691   Tejun Heo   cgroup: introduce...
1673
1674
   * Control Group taskset
   */
134d33737   Tejun Heo   cgroup: improve o...
1675
1676
1677
1678
  struct task_and_cgroup {
  	struct task_struct	*task;
  	struct cgroup		*cgrp;
  };
2f7ee5691   Tejun Heo   cgroup: introduce...
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
/*
 * The set of tasks being migrated, as handed to subsystem callbacks.
 * Either the embedded @single entry is used (when @tc_array is NULL)
 * or the flex_array of task_and_cgroup pairs - never both.
 */
struct cgroup_taskset {
	struct task_and_cgroup	single;		/* used iff !tc_array */
	struct flex_array	*tc_array;	/* multi-task storage, may be NULL */
	int			tc_array_len;	/* entries in tc_array */
	int			idx;		/* iteration cursor for _next() */
	struct cgroup		*cur_cgrp;	/* cgroup of last task returned */
};
  
  /**
   * cgroup_taskset_first - reset taskset and return the first task
   * @tset: taskset of interest
   *
   * @tset iteration is initialized and the first task is returned.
   */
  struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset)
  {
  	if (tset->tc_array) {
  		tset->idx = 0;
  		return cgroup_taskset_next(tset);
  	} else {
  		tset->cur_cgrp = tset->single.cgrp;
  		return tset->single.task;
  	}
  }
  EXPORT_SYMBOL_GPL(cgroup_taskset_first);
  
  /**
   * cgroup_taskset_next - iterate to the next task in taskset
   * @tset: taskset of interest
   *
   * Return the next task in @tset.  Iteration must have been initialized
   * with cgroup_taskset_first().
   */
  struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset)
  {
  	struct task_and_cgroup *tc;
  
  	if (!tset->tc_array || tset->idx >= tset->tc_array_len)
  		return NULL;
  
  	tc = flex_array_get(tset->tc_array, tset->idx++);
  	tset->cur_cgrp = tc->cgrp;
  	return tc->task;
  }
  EXPORT_SYMBOL_GPL(cgroup_taskset_next);
  
/**
 * cgroup_taskset_cur_cgroup - return the matching cgroup for the current task
 * @tset: taskset of interest
 *
 * Return the cgroup for the current (last returned) task of @tset.  This
 * function must be preceded by either cgroup_taskset_first() or
 * cgroup_taskset_next().
 */
struct cgroup *cgroup_taskset_cur_cgroup(struct cgroup_taskset *tset)
{
	/* cur_cgrp is refreshed by each _first()/_next() call */
	return tset->cur_cgrp;
}
EXPORT_SYMBOL_GPL(cgroup_taskset_cur_cgroup);
  
  /**
   * cgroup_taskset_size - return the number of tasks in taskset
   * @tset: taskset of interest
   */
  int cgroup_taskset_size(struct cgroup_taskset *tset)
  {
  	return tset->tc_array ? tset->tc_array_len : 1;
  }
  EXPORT_SYMBOL_GPL(cgroup_taskset_size);
74a1166df   Ben Blum   cgroups: make pro...
1748
1749
1750
1751
1752
/*
 * cgroup_task_migrate - move a task from one cgroup to another.
 *
 * 'guarantee' is set if the caller promises that a new css_set for the task
 * will already exist. If not set, this function might sleep, and can fail with
 * -ENOMEM. Must be called with cgroup_mutex and threadgroup locked.
 */
static int cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp,
			       struct task_struct *tsk, bool guarantee)
{
	struct css_set *oldcg;
	struct css_set *newcg;

	/*
	 * We are synchronized through threadgroup_lock() against PF_EXITING
	 * setting such that we can't race against cgroup_exit() changing the
	 * css_set to init_css_set and dropping the old one.
	 */
	WARN_ON_ONCE(tsk->flags & PF_EXITING);
	oldcg = tsk->cgroups;

	/* locate or allocate a new css_set for this task. */
	if (guarantee) {
		/* we know the css_set we want already exists. */
		struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT];
		read_lock(&css_set_lock);
		newcg = find_existing_css_set(oldcg, cgrp, template);
		BUG_ON(!newcg);
		/* take our own reference before dropping the lock */
		get_css_set(newcg);
		read_unlock(&css_set_lock);
	} else {
		might_sleep();
		/* find_css_set will give us newcg already referenced. */
		newcg = find_css_set(oldcg, cgrp);
		if (!newcg)
			return -ENOMEM;
	}

	/* commit point: publish the new css_set on the task */
	task_lock(tsk);
	rcu_assign_pointer(tsk->cgroups, newcg);
	task_unlock(tsk);

	/* Update the css_set linked lists if we're using them */
	write_lock(&css_set_lock);
	if (!list_empty(&tsk->cg_list))
		list_move(&tsk->cg_list, &newcg->tasks);
	write_unlock(&css_set_lock);

	/*
	 * We just gained a reference on oldcg by taking it from the task. As
	 * trading it for newcg is protected by cgroup_mutex, we're safe to drop
	 * it here; it will be freed under RCU.
	 */
	put_css_set(oldcg);

	/* the source cgroup may now be empty and thus releasable */
	set_bit(CGRP_RELEASABLE, &oldcgrp->flags);
	return 0;
}
a043e3b2c   Li Zefan   cgroup: fix comments
1806
1807
1808
1809
/**
 * cgroup_attach_task - attach task 'tsk' to cgroup 'cgrp'
 * @cgrp: the cgroup the task is attaching to
 * @tsk: the task to be attached
 *
 * Call with cgroup_mutex and threadgroup locked. May take task_lock of
 * @tsk during call.
 *
 * Returns 0 on success, -ESRCH if @tsk is exiting, or a subsystem /
 * -ENOMEM error.  On failure, cancel_attach() is invoked for every
 * subsystem whose can_attach() had already succeeded.
 */
int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
{
	int retval;
	struct cgroup_subsys *ss, *failed_ss = NULL;
	struct cgroup *oldcgrp;
	struct cgroupfs_root *root = cgrp->root;
	struct cgroup_taskset tset = { };

	/* @tsk either already exited or can't exit until the end */
	if (tsk->flags & PF_EXITING)
		return -ESRCH;

	/* Nothing to do if the task is already in that cgroup */
	oldcgrp = task_cgroup_from_root(tsk, root);
	if (cgrp == oldcgrp)
		return 0;

	/* single-task taskset handed to the subsystem callbacks */
	tset.single.task = tsk;
	tset.single.cgrp = oldcgrp;

	/* phase 1: ask every subsystem for permission */
	for_each_subsys(root, ss) {
		if (ss->can_attach) {
			retval = ss->can_attach(ss, cgrp, &tset);
			if (retval) {
				/*
				 * Remember on which subsystem the can_attach()
				 * failed, so that we only call cancel_attach()
				 * against the subsystems whose can_attach()
				 * succeeded. (See below)
				 */
				failed_ss = ss;
				goto out;
			}
		}
	}

	/* phase 2: actually switch the task's css_set */
	retval = cgroup_task_migrate(cgrp, oldcgrp, tsk, false);
	if (retval)
		goto out;

	/* phase 3: notify subsystems; no failures allowed past migrate */
	for_each_subsys(root, ss) {
		if (ss->attach)
			ss->attach(ss, cgrp, &tset);
	}

	synchronize_rcu();

	/*
	 * wake up rmdir() waiter. the rmdir should fail since the cgroup
	 * is no longer empty.
	 */
	cgroup_wakeup_rmdir_waiter(cgrp);
out:
	if (retval) {
		for_each_subsys(root, ss) {
			if (ss == failed_ss)
				/*
				 * This subsystem was the one that failed the
				 * can_attach() check earlier, so we don't need
				 * to call cancel_attach() against it or any
				 * remaining subsystems.
				 */
				break;
			if (ss->cancel_attach)
				ss->cancel_attach(ss, cgrp, &tset);
		}
	}
	return retval;
}
d7926ee38   Sridhar Samudrala   cgroups: Add an A...
1880
  /**
31583bb0c   Michael S. Tsirkin   cgroups: fix API ...
1881
1882
   * cgroup_attach_task_all - attach task 'tsk' to all cgroups of task 'from'
   * @from: attach to all cgroups of a given task
d7926ee38   Sridhar Samudrala   cgroups: Add an A...
1883
1884
   * @tsk: the task to be attached
   */
31583bb0c   Michael S. Tsirkin   cgroups: fix API ...
1885
  int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
d7926ee38   Sridhar Samudrala   cgroups: Add an A...
1886
1887
  {
  	struct cgroupfs_root *root;
d7926ee38   Sridhar Samudrala   cgroups: Add an A...
1888
1889
1890
1891
  	int retval = 0;
  
  	cgroup_lock();
  	for_each_active_root(root) {
31583bb0c   Michael S. Tsirkin   cgroups: fix API ...
1892
1893
1894
  		struct cgroup *from_cg = task_cgroup_from_root(from, root);
  
  		retval = cgroup_attach_task(from_cg, tsk);
d7926ee38   Sridhar Samudrala   cgroups: Add an A...
1895
1896
1897
1898
1899
1900
1901
  		if (retval)
  			break;
  	}
  	cgroup_unlock();
  
  	return retval;
  }
31583bb0c   Michael S. Tsirkin   cgroups: fix API ...
1902
  EXPORT_SYMBOL_GPL(cgroup_attach_task_all);
d7926ee38   Sridhar Samudrala   cgroups: Add an A...
1903

bbcb81d09   Paul Menage   Task Control Grou...
1904
/*
 * cgroup_attach_proc works in two stages, the first of which prefetches all
 * new css_sets needed (to make sure we have enough memory before committing
 * to the move) and stores them in a list of entries of the following type.
 * TODO: possible optimization: use css_set->rcu_head for chaining instead
 */
struct cg_list_entry {
	/* prefetched css_set; holds a reference dropped at list teardown */
	struct css_set *cg;
	/* link in the caller's local newcg_list */
	struct list_head links;
};
  
/*
 * Check whether the css_set that @cg would map to after moving into @cgrp
 * either already exists globally or has already been prefetched onto
 * @newcg_list.  Returns true when no further prefetch is needed.
 */
static bool css_set_check_fetched(struct cgroup *cgrp,
				  struct task_struct *tsk, struct css_set *cg,
				  struct list_head *newcg_list)
{
	struct css_set *newcg;
	struct cg_list_entry *cg_entry;
	struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT];

	/* css_set_lock guards the global css_set table lookup */
	read_lock(&css_set_lock);
	newcg = find_existing_css_set(cg, cgrp, template);
	read_unlock(&css_set_lock);

	/* doesn't exist at all? */
	if (!newcg)
		return false;
	/* see if it's already in the list */
	list_for_each_entry(cg_entry, newcg_list, links)
		if (cg_entry->cg == newcg)
			return true;

	/* not found */
	return false;
}
  
  /*
   * Find the new css_set and store it in the list in preparation for moving the
   * given task to the given cgroup. Returns 0 or -ENOMEM.
   */
  static int css_set_prefetch(struct cgroup *cgrp, struct css_set *cg,
  			    struct list_head *newcg_list)
  {
  	struct css_set *newcg;
  	struct cg_list_entry *cg_entry;
  
  	/* ensure a new css_set will exist for this thread */
  	newcg = find_css_set(cg, cgrp);
  	if (!newcg)
  		return -ENOMEM;
  	/* add it to the list */
  	cg_entry = kmalloc(sizeof(struct cg_list_entry), GFP_KERNEL);
  	if (!cg_entry) {
  		put_css_set(newcg);
  		return -ENOMEM;
  	}
  	cg_entry->cg = newcg;
  	list_add(&cg_entry->links, newcg_list);
  	return 0;
  }
  
/**
 * cgroup_attach_proc - attach all threads in a threadgroup to a cgroup
 * @cgrp: the cgroup to attach to
 * @leader: the threadgroup leader task_struct of the group to be attached
 *
 * Call holding cgroup_mutex and the group_rwsem of the leader. Will take
 * task_lock of each thread in leader's threadgroup individually in turn.
 */
static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
{
	int retval, i, group_size;
	struct cgroup_subsys *ss, *failed_ss = NULL;
	/* guaranteed to be initialized later, but the compiler needs this */
	struct css_set *oldcg;
	struct cgroupfs_root *root = cgrp->root;
	/* threadgroup list cursor and array */
	struct task_struct *tsk;
	struct task_and_cgroup *tc;
	struct flex_array *group;
	struct cgroup_taskset tset = { };
	/*
	 * we need to make sure we have css_sets for all the tasks we're
	 * going to move -before- we actually start moving them, so that in
	 * case we get an ENOMEM we can bail out before making any changes.
	 */
	struct list_head newcg_list;
	struct cg_list_entry *cg_entry, *temp_nobe;

	/*
	 * step 0: in order to do expensive, possibly blocking operations for
	 * every thread, we cannot iterate the thread group list, since it needs
	 * rcu or tasklist locked. instead, build an array of all threads in the
	 * group - group_rwsem prevents new threads from appearing, and if
	 * threads exit, this will just be an over-estimate.
	 */
	group_size = get_nr_threads(leader);
	/* flex_array supports very large thread-groups better than kmalloc. */
	group = flex_array_alloc(sizeof(*tc), group_size, GFP_KERNEL);
	if (!group)
		return -ENOMEM;
	/* pre-allocate to guarantee space while iterating in rcu read-side. */
	retval = flex_array_prealloc(group, 0, group_size - 1, GFP_KERNEL);
	if (retval)
		goto out_free_group_list;

	/* prevent changes to the threadgroup list while we take a snapshot. */
	read_lock(&tasklist_lock);
	if (!thread_group_leader(leader)) {
		/*
		 * a race with de_thread from another thread's exec() may strip
		 * us of our leadership, making while_each_thread unsafe to use
		 * on this task. if this happens, there is no choice but to
		 * throw this task away and try again (from cgroup_procs_write);
		 * this is "double-double-toil-and-trouble-check locking".
		 */
		read_unlock(&tasklist_lock);
		retval = -EAGAIN;
		goto out_free_group_list;
	}

	/* take a snapshot of all threads that actually need migrating */
	tsk = leader;
	i = 0;
	do {
		struct task_and_cgroup ent;

		/* @tsk either already exited or can't exit until the end */
		if (tsk->flags & PF_EXITING)
			continue;

		/* as per above, nr_threads may decrease, but not increase. */
		BUG_ON(i >= group_size);
		/*
		 * saying GFP_ATOMIC has no effect here because we did prealloc
		 * earlier, but it's good form to communicate our expectations.
		 */
		ent.task = tsk;
		ent.cgrp = task_cgroup_from_root(tsk, root);
		/* nothing to do if this task is already in the cgroup */
		if (ent.cgrp == cgrp)
			continue;
		retval = flex_array_put(group, i, &ent, GFP_ATOMIC);
		BUG_ON(retval != 0);
		i++;
	} while_each_thread(leader, tsk);
	/* remember the number of threads in the array for later. */
	group_size = i;
	tset.tc_array = group;
	tset.tc_array_len = group_size;
	read_unlock(&tasklist_lock);

	/* methods shouldn't be called if no task is actually migrating */
	retval = 0;
	if (!group_size)
		goto out_free_group_list;

	/*
	 * step 1: check that we can legitimately attach to the cgroup.
	 */
	for_each_subsys(root, ss) {
		if (ss->can_attach) {
			retval = ss->can_attach(ss, cgrp, &tset);
			if (retval) {
				failed_ss = ss;
				goto out_cancel_attach;
			}
		}
	}

	/*
	 * step 2: make sure css_sets exist for all threads to be migrated.
	 * we use find_css_set, which allocates a new one if necessary.
	 */
	INIT_LIST_HEAD(&newcg_list);
	for (i = 0; i < group_size; i++) {
		tc = flex_array_get(group, i);
		oldcg = tc->task->cgroups;

		/* if we don't already have it in the list get a new one */
		if (!css_set_check_fetched(cgrp, tc->task, oldcg,
					   &newcg_list)) {
			retval = css_set_prefetch(cgrp, oldcg, &newcg_list);
			if (retval)
				goto out_list_teardown;
		}
	}

	/*
	 * step 3: now that we're guaranteed success wrt the css_sets,
	 * proceed to move all tasks to the new cgroup.  There are no
	 * failure cases after here, so this is the commit point.
	 */
	for (i = 0; i < group_size; i++) {
		tc = flex_array_get(group, i);
		retval = cgroup_task_migrate(cgrp, tc->cgrp, tc->task, true);
		/* cannot fail past the commit point */
		BUG_ON(retval);
	}
	/* nothing is sensitive to fork() after this point. */

	/*
	 * step 4: do subsystem attach callbacks.
	 */
	for_each_subsys(root, ss) {
		if (ss->attach)
			ss->attach(ss, cgrp, &tset);
	}

	/*
	 * step 5: success! and cleanup
	 */
	synchronize_rcu();
	cgroup_wakeup_rmdir_waiter(cgrp);
	retval = 0;
out_list_teardown:
	/* clean up the list of prefetched css_sets. */
	list_for_each_entry_safe(cg_entry, temp_nobe, &newcg_list, links) {
		list_del(&cg_entry->links);
		put_css_set(cg_entry->cg);
		kfree(cg_entry);
	}
out_cancel_attach:
	/* same deal as in cgroup_attach_task */
	if (retval) {
		for_each_subsys(root, ss) {
			/*
			 * failed_ss never saw a successful can_attach(), so
			 * it and everything after it get no cancel callback.
			 */
			if (ss == failed_ss)
				break;
			if (ss->cancel_attach)
				ss->cancel_attach(ss, cgrp, &tset);
		}
	}
out_free_group_list:
	flex_array_free(group);
	return retval;
}
  
/*
 * Find the task_struct of the task to attach by vpid and pass it along to the
 * function to attach either it or all tasks in its threadgroup. Will lock
 * cgroup_mutex and threadgroup; may take task_lock of task.
 *
 * @pid == 0 means "attach current (or current's group leader)".
 */
static int attach_task_by_pid(struct cgroup *cgrp, u64 pid, bool threadgroup)
{
	struct task_struct *tsk;
	const struct cred *cred = current_cred(), *tcred;
	int ret;

	if (!cgroup_lock_live_group(cgrp))
		return -ENODEV;

	if (pid) {
		rcu_read_lock();
		tsk = find_task_by_vpid(pid);
		if (!tsk) {
			rcu_read_unlock();
			cgroup_unlock();
			return -ESRCH;
		}
		if (threadgroup) {
			/*
			 * RCU protects this access, since tsk was found in the
			 * tid map. a race with de_thread may cause group_leader
			 * to stop being the leader, but cgroup_attach_proc will
			 * detect it later.
			 */
			tsk = tsk->group_leader;
		}
		/*
		 * even if we're attaching all tasks in the thread group, we
		 * only need to check permissions on one of them.
		 */
		tcred = __task_cred(tsk);
		if (cred->euid &&
		    cred->euid != tcred->uid &&
		    cred->euid != tcred->suid) {
			rcu_read_unlock();
			cgroup_unlock();
			return -EACCES;
		}
		/* pin the task before leaving the RCU read-side section */
		get_task_struct(tsk);
		rcu_read_unlock();
	} else {
		if (threadgroup)
			tsk = current->group_leader;
		else
			tsk = current;
		get_task_struct(tsk);
	}

	/* stabilize the threadgroup while attaching */
	threadgroup_lock(tsk);

	if (threadgroup)
		ret = cgroup_attach_proc(cgrp, tsk);
	else
		ret = cgroup_attach_task(cgrp, tsk);

	threadgroup_unlock(tsk);

	put_task_struct(tsk);
	cgroup_unlock();
	return ret;
}
af351026a   Paul Menage   cgroup files: tur...
2198
2199
  static int cgroup_tasks_write(struct cgroup *cgrp, struct cftype *cft, u64 pid)
  {
74a1166df   Ben Blum   cgroups: make pro...
2200
2201
2202
2203
2204
  	return attach_task_by_pid(cgrp, pid, false);
  }
  
  static int cgroup_procs_write(struct cgroup *cgrp, struct cftype *cft, u64 tgid)
  {
af351026a   Paul Menage   cgroup files: tur...
2205
  	int ret;
74a1166df   Ben Blum   cgroups: make pro...
2206
2207
2208
2209
2210
2211
2212
2213
  	do {
  		/*
  		 * attach_proc fails with -EAGAIN if threadgroup leadership
  		 * changes in the middle of the operation, in which case we need
  		 * to find the task_struct for the new leader and start over.
  		 */
  		ret = attach_task_by_pid(cgrp, tgid, true);
  	} while (ret == -EAGAIN);
af351026a   Paul Menage   cgroup files: tur...
2214
2215
  	return ret;
  }
e788e066c   Paul Menage   cgroup files: mov...
2216
2217
2218
2219
  /**
   * cgroup_lock_live_group - take cgroup_mutex and check that cgrp is alive.
   * @cgrp: the cgroup to be checked for liveness
   *
84eea8428   Paul Menage   cgroups: misc cle...
2220
2221
   * On success, returns true; the lock should be later released with
   * cgroup_unlock(). On failure returns false with no lock held.
e788e066c   Paul Menage   cgroup files: mov...
2222
   */
84eea8428   Paul Menage   cgroups: misc cle...
2223
  bool cgroup_lock_live_group(struct cgroup *cgrp)
e788e066c   Paul Menage   cgroup files: mov...
2224
2225
2226
2227
2228
2229
2230
2231
  {
  	mutex_lock(&cgroup_mutex);
  	if (cgroup_is_removed(cgrp)) {
  		mutex_unlock(&cgroup_mutex);
  		return false;
  	}
  	return true;
  }
67523c48a   Ben Blum   cgroups: blkio su...
2232
  EXPORT_SYMBOL_GPL(cgroup_lock_live_group);
e788e066c   Paul Menage   cgroup files: mov...
2233
2234
2235
2236
2237
  
/*
 * "release_agent" write handler: record the userspace helper path on the
 * hierarchy root.  @buffer has already been stripped by the string write
 * path; length is bounded by the root's fixed-size path buffer.
 */
static int cgroup_release_agent_write(struct cgroup *cgrp, struct cftype *cft,
				      const char *buffer)
{
	BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX);
	if (strlen(buffer) >= PATH_MAX)
		return -EINVAL;
	if (!cgroup_lock_live_group(cgrp))
		return -ENODEV;
	/* cgroup_root_mutex serializes updates of release_agent_path */
	mutex_lock(&cgroup_root_mutex);
	strcpy(cgrp->root->release_agent_path, buffer);
	mutex_unlock(&cgroup_root_mutex);
	cgroup_unlock();
	return 0;
}
  
/* "release_agent" read handler: emit the stored path plus a newline. */
static int cgroup_release_agent_show(struct cgroup *cgrp, struct cftype *cft,
				     struct seq_file *seq)
{
	if (!cgroup_lock_live_group(cgrp))
		return -ENODEV;
	seq_puts(seq, cgrp->root->release_agent_path);
	seq_putc(seq, '\n');
	cgroup_unlock();
	return 0;
}
84eea8428   Paul Menage   cgroups: misc cle...
2260
2261
  /* A buffer size big enough for numbers or short strings */
  #define CGROUP_LOCAL_BUFFER_SIZE 64
e73d2c61d   Paul Menage   CGroups _s64 file...
2262
/*
 * Write path for cftypes that provide ->write_u64 or ->write_s64: copy
 * the user buffer, parse it as a (possibly signed) integer and pass the
 * value to the callback.  Returns bytes consumed or a negative errno.
 */
static ssize_t cgroup_write_X64(struct cgroup *cgrp, struct cftype *cft,
				struct file *file,
				const char __user *userbuf,
				size_t nbytes, loff_t *unused_ppos)
{
	char buffer[CGROUP_LOCAL_BUFFER_SIZE];
	int retval = 0;
	char *end;

	if (!nbytes)
		return -EINVAL;
	if (nbytes >= sizeof(buffer))
		return -E2BIG;
	if (copy_from_user(buffer, userbuf, nbytes))
		return -EFAULT;

	buffer[nbytes] = 0;     /* nul-terminate */
	if (cft->write_u64) {
		u64 val = simple_strtoull(strstrip(buffer), &end, 0);
		/* reject trailing garbage after the number */
		if (*end)
			return -EINVAL;
		retval = cft->write_u64(cgrp, cft, val);
	} else {
		s64 val = simple_strtoll(strstrip(buffer), &end, 0);
		if (*end)
			return -EINVAL;
		retval = cft->write_s64(cgrp, cft, val);
	}
	/* on success report the whole write as consumed */
	if (!retval)
		retval = nbytes;
	return retval;
}
db3b14978   Paul Menage   cgroup files: add...
2294
2295
2296
2297
2298
/*
 * Write path for cftypes that provide ->write_string: copy and
 * nul-terminate the user buffer (heap-allocating only when it exceeds
 * the local stack buffer), strip whitespace and pass it on.
 */
static ssize_t cgroup_write_string(struct cgroup *cgrp, struct cftype *cft,
				   struct file *file,
				   const char __user *userbuf,
				   size_t nbytes, loff_t *unused_ppos)
{
	char local_buffer[CGROUP_LOCAL_BUFFER_SIZE];
	int retval = 0;
	size_t max_bytes = cft->max_write_len;
	char *buffer = local_buffer;

	if (!max_bytes)
		max_bytes = sizeof(local_buffer) - 1;
	if (nbytes >= max_bytes)
		return -E2BIG;
	/* Allocate a dynamic buffer if we need one */
	if (nbytes >= sizeof(local_buffer)) {
		buffer = kmalloc(nbytes + 1, GFP_KERNEL);
		if (buffer == NULL)
			return -ENOMEM;
	}
	if (nbytes && copy_from_user(buffer, userbuf, nbytes)) {
		retval = -EFAULT;
		goto out;
	}

	buffer[nbytes] = 0;     /* nul-terminate */
	retval = cft->write_string(cgrp, cft, strstrip(buffer));
	if (!retval)
		retval = nbytes;
out:
	/* only free if we fell back to the dynamic buffer */
	if (buffer != local_buffer)
		kfree(buffer);
	return retval;
}
ddbcc7e8e   Paul Menage   Task Control Grou...
2328
2329
2330
2331
/*
 * Top-level write for cgroup control files: dispatch to whichever
 * handler the file's cftype provides, in priority order.
 */
static ssize_t cgroup_file_write(struct file *file, const char __user *buf,
						size_t nbytes, loff_t *ppos)
{
	struct cftype *cft = __d_cft(file->f_dentry);
	struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);

	/* the cgroup may have been removed since the file was opened */
	if (cgroup_is_removed(cgrp))
		return -ENODEV;
	if (cft->write)
		return cft->write(cgrp, cft, file, buf, nbytes, ppos);
	if (cft->write_u64 || cft->write_s64)
		return cgroup_write_X64(cgrp, cft, file, buf, nbytes, ppos);
	if (cft->write_string)
		return cgroup_write_string(cgrp, cft, file, buf, nbytes, ppos);
	if (cft->trigger) {
		/* trigger files ignore the written data entirely */
		int ret = cft->trigger(cgrp, (unsigned int)cft->private);
		return ret ? ret : nbytes;
	}
	return -EINVAL;
}
f4c753b7e   Paul Menage   CGroup API files:...
2348
2349
2350
2351
/* Read path for ->read_u64 cftypes: format the value as decimal + '\n'. */
static ssize_t cgroup_read_u64(struct cgroup *cgrp, struct cftype *cft,
			       struct file *file,
			       char __user *buf, size_t nbytes,
			       loff_t *ppos)
{
	char tmp[CGROUP_LOCAL_BUFFER_SIZE];
	u64 val = cft->read_u64(cgrp, cft);
	int len = sprintf(tmp, "%llu\n", (unsigned long long) val);

	return simple_read_from_buffer(buf, nbytes, ppos, tmp, len);
}
e73d2c61d   Paul Menage   CGroups _s64 file...
2360
2361
2362
2363
2364
/* Read path for ->read_s64 cftypes: signed counterpart of the above. */
static ssize_t cgroup_read_s64(struct cgroup *cgrp, struct cftype *cft,
			       struct file *file,
			       char __user *buf, size_t nbytes,
			       loff_t *ppos)
{
	char tmp[CGROUP_LOCAL_BUFFER_SIZE];
	s64 val = cft->read_s64(cgrp, cft);
	int len = sprintf(tmp, "%lld\n", (long long) val);

	return simple_read_from_buffer(buf, nbytes, ppos, tmp, len);
}
ddbcc7e8e   Paul Menage   Task Control Grou...
2372
2373
2374
2375
/*
 * Top-level read for cgroup control files: dispatch to the cftype's
 * read handler (raw, u64 or s64).  seq_file-based reads are handled
 * separately via cgroup_file_open().
 */
static ssize_t cgroup_file_read(struct file *file, char __user *buf,
				   size_t nbytes, loff_t *ppos)
{
	struct cftype *cft = __d_cft(file->f_dentry);
	struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);

	if (cgroup_is_removed(cgrp))
		return -ENODEV;

	if (cft->read)
		return cft->read(cgrp, cft, file, buf, nbytes, ppos);
	if (cft->read_u64)
		return cgroup_read_u64(cgrp, cft, file, buf, nbytes, ppos);
	if (cft->read_s64)
		return cgroup_read_s64(cgrp, cft, file, buf, nbytes, ppos);
	return -EINVAL;
}
917965696   Paul Menage   CGroup API files:...
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
  /*
   * seqfile ops/methods for returning structured data. Currently just
   * supports string->u64 maps, but can be extended in future.
   */
  
  struct cgroup_seqfile_state {
  	struct cftype *cft;
  	struct cgroup *cgroup;
  };
  
/* ->fill callback handed to read_map: emit one "key value" line. */
static int cgroup_map_add(struct cgroup_map_cb *cb, const char *key, u64 value)
{
	struct seq_file *sf = cb->state;
	return seq_printf(sf, "%s %llu\n", key, (unsigned long long)value);
}
  
/* seq_file show method: render via ->read_map or ->read_seq_string. */
static int cgroup_seqfile_show(struct seq_file *m, void *arg)
{
	struct cgroup_seqfile_state *state = m->private;
	struct cftype *cft = state->cft;

	if (cft->read_map) {
		struct cgroup_map_cb cb = {
			.fill = cgroup_map_add,
			.state = m,
		};
		return cft->read_map(state->cgroup, cft, &cb);
	}
	return cft->read_seq_string(state->cgroup, cft, m);
}
96930a636   Adrian Bunk   make cgroup_seqfi...
2419
/* Free the cgroup_seqfile_state allocated in cgroup_file_open(). */
static int cgroup_seqfile_release(struct inode *inode, struct file *file)
{
	struct seq_file *seq = file->private_data;
	kfree(seq->private);
	return single_release(inode, file);
}
828c09509   Alexey Dobriyan   const: constify r...
2425
/* file_operations installed when a cftype uses seq_file-based reads */
static const struct file_operations cgroup_seqfile_operations = {
	.read = seq_read,
	.write = cgroup_file_write,
	.llseek = seq_lseek,
	.release = cgroup_seqfile_release,
};
ddbcc7e8e   Paul Menage   Task Control Grou...
2431
2432
2433
2434
2435
2436
2437
2438
/*
 * Open a cgroup control file.  For map/seq cftypes, switch the file to
 * the seq_file operations and allocate the per-open state; otherwise
 * defer to the cftype's own ->open if any.
 */
static int cgroup_file_open(struct inode *inode, struct file *file)
{
	int err;
	struct cftype *cft;

	err = generic_file_open(inode, file);
	if (err)
		return err;
	cft = __d_cft(file->f_dentry);

	if (cft->read_map || cft->read_seq_string) {
		struct cgroup_seqfile_state *state =
			kzalloc(sizeof(*state), GFP_USER);
		if (!state)
			return -ENOMEM;
		state->cft = cft;
		state->cgroup = __d_cgrp(file->f_dentry->d_parent);
		file->f_op = &cgroup_seqfile_operations;
		err = single_open(file, cgroup_seqfile_show, state);
		if (err < 0)
			kfree(state);	/* single_open failed; undo alloc */
	} else if (cft->open)
		err = cft->open(inode, file);
	else
		err = 0;

	return err;
}
  
/* Release hook: forward to the cftype's ->release when it has one. */
static int cgroup_file_release(struct inode *inode, struct file *file)
{
	struct cftype *cft = __d_cft(file->f_dentry);
	if (cft->release)
		return cft->release(inode, file);
	return 0;
}
  
/*
 * cgroup_rename - Only allow simple rename of directories in place.
 */
static int cgroup_rename(struct inode *old_dir, struct dentry *old_dentry,
			    struct inode *new_dir, struct dentry *new_dentry)
{
	/* only cgroup directories may be renamed, never control files */
	if (!S_ISDIR(old_dentry->d_inode->i_mode))
		return -ENOTDIR;
	if (new_dentry->d_inode)
		return -EEXIST;
	/* renaming across directories would re-parent the cgroup: refuse */
	if (old_dir != new_dir)
		return -EIO;
	return simple_rename(old_dir, old_dentry, new_dir, new_dentry);
}
828c09509   Alexey Dobriyan   const: constify r...
2482
/* default file_operations for cgroup control files */
static const struct file_operations cgroup_file_operations = {
	.read = cgroup_file_read,
	.write = cgroup_file_write,
	.llseek = generic_file_llseek,
	.open = cgroup_file_open,
	.release = cgroup_file_release,
};
6e1d5dcc2   Alexey Dobriyan   const: mark remai...
2489
/* inode_operations for cgroup directories */
static const struct inode_operations cgroup_dir_inode_operations = {
	.lookup = cgroup_lookup,
	.mkdir = cgroup_mkdir,
	.rmdir = cgroup_rmdir,
	.rename = cgroup_rename,
};
c72a04e34   Al Viro   cgroup_fs: fix cg...
2495
2496
2497
2498
2499
2500
2501
/*
 * Lookup in a cgroup directory.  Control files are instantiated up
 * front, so any name that reaches here gets a negative dentry.
 */
static struct dentry *cgroup_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
{
	if (dentry->d_name.len > NAME_MAX)
		return ERR_PTR(-ENAMETOOLONG);
	d_add(dentry, NULL);	/* negative dentry */
	return NULL;
}
0dea11687   Kirill A. Shutemov   cgroup: implement...
2502
2503
2504
2505
2506
2507
2508
2509
2510
/*
 * Check if a file is a control file; if so return its cftype,
 * otherwise an ERR_PTR.
 */
static inline struct cftype *__file_cft(struct file *file)
{
	if (file->f_dentry->d_inode->i_fop != &cgroup_file_operations)
		return ERR_PTR(-EINVAL);
	return __d_cft(file->f_dentry);
}
a5e7ed328   Al Viro   cgroup: propagate...
2511
/*
 * cgroup_create_file - instantiate an inode behind a (negative) dentry
 * @dentry: dentry to instantiate
 * @mode: file type (S_IFDIR/S_IFREG) plus permission bits
 * @sb: superblock of the cgroup filesystem
 *
 * For directories, returns with the new inode's i_mutex held so the
 * caller can populate it without racing a concurrent mkdir.
 */
static int cgroup_create_file(struct dentry *dentry, umode_t mode,
				struct super_block *sb)
{
	struct inode *inode;

	if (!dentry)
		return -ENOENT;
	if (dentry->d_inode)
		return -EEXIST;

	inode = cgroup_new_inode(mode, sb);
	if (!inode)
		return -ENOMEM;

	if (S_ISDIR(mode)) {
		inode->i_op = &cgroup_dir_inode_operations;
		inode->i_fop = &simple_dir_operations;

		/* start off with i_nlink == 2 (for "." entry) */
		inc_nlink(inode);

		/* start with the directory inode held, so that we can
		 * populate it without racing with another mkdir */
		mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD);
	} else if (S_ISREG(mode)) {
		inode->i_size = 0;
		inode->i_fop = &cgroup_file_operations;
	}
	d_instantiate(dentry, inode);
	dget(dentry);	/* Extra count - pin the dentry in core */
	return 0;
}
  
/*
 * cgroup_create_dir - create a directory for an object.
 * @cgrp: the cgroup we create the directory for. It must have a valid
 *        ->parent field. And we are going to fill its ->dentry field.
 * @dentry: dentry of the new cgroup
 * @mode: mode to set on new directory.
 */
static int cgroup_create_dir(struct cgroup *cgrp, struct dentry *dentry,
				umode_t mode)
{
	struct dentry *parent;
	int error = 0;

	parent = cgrp->parent->dentry;
	error = cgroup_create_file(dentry, S_IFDIR | mode, cgrp->root->sb);
	if (!error) {
		dentry->d_fsdata = cgrp;
		/* new subdirectory contributes a ".." link to the parent */
		inc_nlink(parent->d_inode);
		rcu_assign_pointer(cgrp->dentry, dentry);
		dget(dentry);
	}
	dput(dentry);

	return error;
}
099fca322   Li Zefan   cgroups: show cor...
2568
2569
2570
2571
2572
2573
2574
2575
2576
  /**
   * cgroup_file_mode - deduce file mode of a control file
   * @cft: the control file in question
   *
   * returns cft->mode if ->mode is not 0
   * returns S_IRUGO|S_IWUSR if it has both a read and a write handler
   * returns S_IRUGO if it has only a read handler
   * returns S_IWUSR if it has only a write hander
   */
a5e7ed328   Al Viro   cgroup: propagate...
2577
  static umode_t cgroup_file_mode(const struct cftype *cft)
099fca322   Li Zefan   cgroups: show cor...
2578
  {
a5e7ed328   Al Viro   cgroup: propagate...
2579
  	umode_t mode = 0;
099fca322   Li Zefan   cgroups: show cor...
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
  
  	if (cft->mode)
  		return cft->mode;
  
  	if (cft->read || cft->read_u64 || cft->read_s64 ||
  	    cft->read_map || cft->read_seq_string)
  		mode |= S_IRUGO;
  
  	if (cft->write || cft->write_u64 || cft->write_s64 ||
  	    cft->write_string || cft->trigger)
  		mode |= S_IWUSR;
  
  	return mode;
  }
bd89aabc6   Paul Menage   Control groups: R...
2594
/*
 * cgroup_add_file - create one control file in a cgroup's directory
 * @cgrp: the cgroup the file belongs to
 * @subsys: subsystem providing the file; when non-NULL (and the root is
 *          not mounted with ROOT_NOPREFIX) the file is named
 *          "<subsys->name>.<cft->name>"
 * @cft: description of the file (name, mode, handlers)
 *
 * The caller must hold the directory inode's i_mutex (BUG_ON otherwise).
 * Returns 0 on success or a negative errno.
 */
int cgroup_add_file(struct cgroup *cgrp,
		       struct cgroup_subsys *subsys,
		       const struct cftype *cft)
{
	struct dentry *dir = cgrp->dentry;
	struct dentry *dentry;
	int error;
	umode_t mode;

	/* room for "<subsys>.<name>" plus the separating '.' and NUL */
	char name[MAX_CGROUP_TYPE_NAMELEN + MAX_CFTYPE_NAME + 2] = { 0 };
	if (subsys && !test_bit(ROOT_NOPREFIX, &cgrp->root->flags)) {
		strcpy(name, subsys->name);
		strcat(name, ".");
	}
	strcat(name, cft->name);
	BUG_ON(!mutex_is_locked(&dir->d_inode->i_mutex));
	dentry = lookup_one_len(name, dir, strlen(name));
	if (!IS_ERR(dentry)) {
		mode = cgroup_file_mode(cft);
		error = cgroup_create_file(dentry, mode | S_IFREG,
						cgrp->root->sb);
		if (!error)
			/* stash the cftype so read/write handlers can find it */
			dentry->d_fsdata = (void *)cft;
		dput(dentry);
	} else
		error = PTR_ERR(dentry);
	return error;
}
EXPORT_SYMBOL_GPL(cgroup_add_file);
ddbcc7e8e   Paul Menage   Task Control Grou...
2623

bd89aabc6   Paul Menage   Control groups: R...
2624
/*
 * cgroup_add_files - add an array of control files to a cgroup
 * @cgrp: the target cgroup
 * @subsys: subsystem the files belong to (used as a name prefix); may be
 *          NULL (see cgroup_add_file())
 * @cft: array of @count control file definitions
 * @count: number of entries in @cft
 *
 * Returns 0 on success; stops at and returns the first error from
 * cgroup_add_file() (files created before the failure are kept).
 */
int cgroup_add_files(struct cgroup *cgrp,
			struct cgroup_subsys *subsys,
			const struct cftype cft[],
			int count)
{
	int i, err;
	for (i = 0; i < count; i++) {
		err = cgroup_add_file(cgrp, subsys, &cft[i]);
		if (err)
			return err;
	}
	return 0;
}
EXPORT_SYMBOL_GPL(cgroup_add_files);
ddbcc7e8e   Paul Menage   Task Control Grou...
2638

a043e3b2c   Li Zefan   cgroup: fix comments
2639
2640
2641
2642
2643
2644
/**
 * cgroup_task_count - count the number of tasks in a cgroup.
 * @cgrp: the cgroup in question
 *
 * Return the number of tasks in the cgroup.  Computed by summing the
 * refcounts of every css_set linked to @cgrp under css_set_lock
 * (each attached task holds a reference on its css_set).
 */
int cgroup_task_count(const struct cgroup *cgrp)
{
	int count = 0;
	struct cg_cgroup_link *link;

	read_lock(&css_set_lock);
	list_for_each_entry(link, &cgrp->css_sets, cgrp_link_list) {
		count += atomic_read(&link->cg->refcount);
	}
	read_unlock(&css_set_lock);
	return count;
}
  
/*
 * Advance a list_head iterator.  The iterator should be positioned at
 * the start of a css_set.  Skips empty css_sets; when the cgroup's list
 * is exhausted, it->cg_link is set to NULL to signal end of iteration.
 * Caller must hold css_set_lock (taken by cgroup_iter_start()).
 */
static void cgroup_advance_iter(struct cgroup *cgrp,
				struct cgroup_iter *it)
{
	struct list_head *l = it->cg_link;
	struct cg_cgroup_link *link;
	struct css_set *cg;

	/* Advance to the next non-empty css_set */
	do {
		l = l->next;
		if (l == &cgrp->css_sets) {
			/* wrapped around: no more css_sets to visit */
			it->cg_link = NULL;
			return;
		}
		link = list_entry(l, struct cg_cgroup_link, cgrp_link_list);
		cg = link->cg;
	} while (list_empty(&cg->tasks));
	it->cg_link = l;
	it->task = cg->tasks.next;
}
31a7df01f   Cliff Wickman   cgroups: mechanis...
2682
2683
2684
2685
2686
2687
2688
2689
2690
/*
 * To reduce the fork() overhead for systems that are not actually
 * using their cgroups capability, we don't maintain the lists running
 * through each css_set to its tasks until we see the list actually
 * used - in other words after the first call to cgroup_iter_start().
 *
 * The tasklist_lock is not held here, as do_each_thread() and
 * while_each_thread() are protected by RCU.
 */
static void cgroup_enable_task_cg_lists(void)
{
	struct task_struct *p, *g;
	write_lock(&css_set_lock);
	use_task_css_set_links = 1;
	do_each_thread(g, p) {
		task_lock(p);
		/*
		 * We should check if the process is exiting, otherwise
		 * it will race with cgroup_exit() in that the list
		 * entry won't be deleted though the process has exited.
		 */
		if (!(p->flags & PF_EXITING) && list_empty(&p->cg_list))
			list_add(&p->cg_list, &p->cgroups->tasks);
		task_unlock(p);
	} while_each_thread(g, p);
	write_unlock(&css_set_lock);
}
bd89aabc6   Paul Menage   Control groups: R...
2709
/*
 * cgroup_iter_start - begin iterating over the tasks attached to @cgrp
 * @cgrp: cgroup to iterate
 * @it: iterator to initialise
 *
 * Takes css_set_lock for reading (released by cgroup_iter_end()) and
 * positions @it at the first task, if any.
 */
void cgroup_iter_start(struct cgroup *cgrp, struct cgroup_iter *it)
	__acquires(css_set_lock)
{
	/*
	 * The first time anyone tries to iterate across a cgroup,
	 * we need to enable the list linking each css_set to its
	 * tasks, and fix up all existing tasks.
	 */
	if (!use_task_css_set_links)
		cgroup_enable_task_cg_lists();
	read_lock(&css_set_lock);
	it->cg_link = &cgrp->css_sets;
	cgroup_advance_iter(cgrp, it);
}
bd89aabc6   Paul Menage   Control groups: R...
2723
/*
 * cgroup_iter_next - return the current task and advance the iterator
 * @cgrp: cgroup being iterated
 * @it: iterator previously set up by cgroup_iter_start()
 *
 * Returns the next task, or NULL when the iteration is exhausted.
 * Caller still holds css_set_lock from cgroup_iter_start().
 */
struct task_struct *cgroup_iter_next(struct cgroup *cgrp,
					struct cgroup_iter *it)
{
	struct task_struct *res;
	struct list_head *l = it->task;
	struct cg_cgroup_link *link;

	/* If the iterator cg is NULL, we have no tasks */
	if (!it->cg_link)
		return NULL;
	res = list_entry(l, struct task_struct, cg_list);
	/* Advance iterator to find next entry */
	l = l->next;
	link = list_entry(it->cg_link, struct cg_cgroup_link, cgrp_link_list);
	if (l == &link->cg->tasks) {
		/* We reached the end of this task list - move on to
		 * the next cg_cgroup_link */
		cgroup_advance_iter(cgrp, it);
	} else {
		it->task = l;
	}
	return res;
}
bd89aabc6   Paul Menage   Control groups: R...
2746
/*
 * cgroup_iter_end - finish an iteration started by cgroup_iter_start()
 * @cgrp: cgroup that was iterated (unused here)
 * @it: the iterator (unused here)
 *
 * Drops css_set_lock taken by cgroup_iter_start().
 */
void cgroup_iter_end(struct cgroup *cgrp, struct cgroup_iter *it)
	__releases(css_set_lock)
{
	read_unlock(&css_set_lock);
}
31a7df01f   Cliff Wickman   cgroups: mechanis...
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
  static inline int started_after_time(struct task_struct *t1,
  				     struct timespec *time,
  				     struct task_struct *t2)
  {
  	int start_diff = timespec_compare(&t1->start_time, time);
  	if (start_diff > 0) {
  		return 1;
  	} else if (start_diff < 0) {
  		return 0;
  	} else {
  		/*
  		 * Arbitrarily, if two processes started at the same
  		 * time, we'll say that the lower pointer value
  		 * started first. Note that t2 may have exited by now
  		 * so this may not be a valid pointer any longer, but
  		 * that's fine - it still serves to distinguish
  		 * between two tasks started (effectively) simultaneously.
  		 */
  		return t1 > t2;
  	}
  }
  
  /*
   * This function is a callback from heap_insert() and is used to order
   * the heap.
   * In this case we order the heap in descending task start time.
   */
  static inline int started_after(void *p1, void *p2)
  {
  	struct task_struct *t1 = p1;
  	struct task_struct *t2 = p2;
  	return started_after_time(t1, &t2->start_time, t2);
  }
  
/**
 * cgroup_scan_tasks - iterate though all the tasks in a cgroup
 * @scan: struct cgroup_scanner containing arguments for the scan
 *
 * Arguments include pointers to callback functions test_task() and
 * process_task().
 * Iterate through all the tasks in a cgroup, calling test_task() for each,
 * and if it returns true, call process_task() for it also.
 * The test_task pointer may be NULL, meaning always true (select all tasks).
 * Effectively duplicates cgroup_iter_{start,next,end}()
 * but does not lock css_set_lock for the call to process_task().
 * The struct cgroup_scanner may be embedded in any structure of the caller's
 * creation.
 * It is guaranteed that process_task() will act on every task that
 * is a member of the cgroup for the duration of this call. This
 * function may or may not call process_task() for tasks that exit
 * or move to a different cgroup during the call, or are forked or
 * move into the cgroup during the call.
 *
 * Note that test_task() may be called with locks held, and may in some
 * situations be called multiple times for the same task, so it should
 * be cheap.
 * If the heap pointer in the struct cgroup_scanner is non-NULL, a heap has been
 * pre-allocated and will be used for heap operations (and its "gt" member will
 * be overwritten), else a temporary heap will be used (allocation of which
 * may cause this function to fail).
 */
int cgroup_scan_tasks(struct cgroup_scanner *scan)
{
	int retval, i;
	struct cgroup_iter it;
	struct task_struct *p, *dropped;
	/* Never dereference latest_task, since it's not refcounted */
	struct task_struct *latest_task = NULL;
	struct ptr_heap tmp_heap;
	struct ptr_heap *heap;
	struct timespec latest_time = { 0, 0 };

	if (scan->heap) {
		/* The caller supplied our heap and pre-allocated its memory */
		heap = scan->heap;
		heap->gt = &started_after;
	} else {
		/* We need to allocate our own heap memory */
		heap = &tmp_heap;
		retval = heap_init(heap, PAGE_SIZE, GFP_KERNEL, &started_after);
		if (retval)
			/* cannot allocate the heap */
			return retval;
	}

 again:
	/*
	 * Scan tasks in the cgroup, using the scanner's "test_task" callback
	 * to determine which are of interest, and using the scanner's
	 * "process_task" callback to process any of them that need an update.
	 * Since we don't want to hold any locks during the task updates,
	 * gather tasks to be processed in a heap structure.
	 * The heap is sorted by descending task start time.
	 * If the statically-sized heap fills up, we overflow tasks that
	 * started later, and in future iterations only consider tasks that
	 * started after the latest task in the previous pass. This
	 * guarantees forward progress and that we don't miss any tasks.
	 */
	heap->size = 0;
	cgroup_iter_start(scan->cg, &it);
	while ((p = cgroup_iter_next(scan->cg, &it))) {
		/*
		 * Only affect tasks that qualify per the caller's callback,
		 * if he provided one
		 */
		if (scan->test_task && !scan->test_task(p, scan))
			continue;
		/*
		 * Only process tasks that started after the last task
		 * we processed
		 */
		if (!started_after_time(p, &latest_time, latest_task))
			continue;
		dropped = heap_insert(heap, p);
		if (dropped == NULL) {
			/*
			 * The new task was inserted; the heap wasn't
			 * previously full
			 */
			get_task_struct(p);
		} else if (dropped != p) {
			/*
			 * The new task was inserted, and pushed out a
			 * different task
			 */
			get_task_struct(p);
			put_task_struct(dropped);
		}
		/*
		 * Else the new task was newer than anything already in
		 * the heap and wasn't inserted
		 */
	}
	cgroup_iter_end(scan->cg, &it);

	if (heap->size) {
		for (i = 0; i < heap->size; i++) {
			struct task_struct *q = heap->ptrs[i];
			if (i == 0) {
				/* remember the oldest task of this pass as
				 * the threshold for the next pass */
				latest_time = q->start_time;
				latest_task = q;
			}
			/* Process the task per the caller's callback */
			scan->process_task(q, scan);
			put_task_struct(q);
		}
		/*
		 * If we had to process any tasks at all, scan again
		 * in case some of them were in the middle of forking
		 * children that didn't get processed.
		 * Not the most efficient way to do it, but it avoids
		 * having to take callback_mutex in the fork path
		 */
		goto again;
	}
	if (heap == &tmp_heap)
		heap_free(&tmp_heap);
	return 0;
}
817929ec2   Paul Menage   Task Control Grou...
2910
/*
 * Stuff for reading the 'tasks'/'procs' files.
 *
 * Reading this file can return large amounts of data if a cgroup has
 * *lots* of attached tasks. So it may need several calls to read(),
 * but we cannot guarantee that the information we produce is correct
 * unless we produce it entirely atomically.
 */
bbcb81d09   Paul Menage   Task Control Grou...
2919
2920
  
/*
 * The following two functions "fix" the issue where there are more pids
 * than kmalloc will give memory for; in such cases, we use vmalloc/vfree.
 * TODO: replace with a kernel-wide solution to this problem
 */
#define PIDLIST_TOO_LARGE(c) ((c) * sizeof(pid_t) > (PAGE_SIZE * 2))
/* allocate a pid array of @count entries, via vmalloc when too big */
static void *pidlist_allocate(int count)
{
	if (PIDLIST_TOO_LARGE(count))
		return vmalloc(count * sizeof(pid_t));
	else
		return kmalloc(count * sizeof(pid_t), GFP_KERNEL);
}
/* free a buffer from pidlist_allocate()/pidlist_resize(); NULL is OK */
static void pidlist_free(void *p)
{
	if (is_vmalloc_addr(p))
		vfree(p);
	else
		kfree(p);
}
/*
 * Resize a pidlist buffer to @newcount entries.
 *
 * NOTE(review): the vmalloc path memcpy()s @newcount entries out of @p,
 * so this is only safe when shrinking — which is what the sole caller,
 * pidlist_uniq(), does.  Growing a vmalloc'd list here would read past
 * the end of the old buffer.
 */
static void *pidlist_resize(void *p, int newcount)
{
	void *newlist;
	/* note: if new alloc fails, old p will still be valid either way */
	if (is_vmalloc_addr(p)) {
		newlist = vmalloc(newcount * sizeof(pid_t));
		if (!newlist)
			return NULL;
		memcpy(newlist, p, newcount * sizeof(pid_t));
		vfree(p);
	} else {
		newlist = krealloc(p, newcount * sizeof(pid_t), GFP_KERNEL);
	}
	return newlist;
}
  
/*
 * pidlist_uniq - given a kmalloc()ed list, strip out all duplicate entries
 * If the new stripped list is sufficiently smaller and there's enough memory
 * to allocate a new buffer, will let go of the unneeded memory. Returns the
 * number of unique elements.
 *
 * The list must already be sorted ascending (the caller sorts it with
 * cmppid() before calling), since only adjacent entries are compared.
 */
/* is the size difference enough that we should re-allocate the array? */
#define PIDLIST_REALLOC_DIFFERENCE(old, new) ((old) - PAGE_SIZE >= (new))
static int pidlist_uniq(pid_t **p, int length)
{
	int src, dest = 1;
	pid_t *list = *p;
	pid_t *newlist;

	/*
	 * we presume the 0th element is unique, so i starts at 1. trivial
	 * edge cases first; no work needs to be done for either
	 */
	if (length == 0 || length == 1)
		return length;
	/* src and dest walk down the list; dest counts unique elements */
	for (src = 1; src < length; src++) {
		/* find next unique element */
		while (list[src] == list[src-1]) {
			src++;
			if (src == length)
				goto after;
		}
		/* dest always points to where the next unique element goes */
		list[dest] = list[src];
		dest++;
	}
after:
	/*
	 * if the length difference is large enough, we want to allocate a
	 * smaller buffer to save memory. if this fails due to out of memory,
	 * we'll just stay with what we've got.
	 */
	if (PIDLIST_REALLOC_DIFFERENCE(length, dest)) {
		newlist = pidlist_resize(list, dest);
		if (newlist)
			*p = newlist;
	}
	return dest;
}
  
  static int cmppid(const void *a, const void *b)
  {
  	return *(pid_t *)a - *(pid_t *)b;
  }
  
/*
 * find the appropriate pidlist for our purpose (given procs vs tasks)
 * returns with the lock on that pidlist already held, and takes care
 * of the use count, or returns NULL with no locks held if we're out of
 * memory.
 */
static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp,
						  enum cgroup_filetype type)
{
	struct cgroup_pidlist *l;
	/* don't need task_nsproxy() if we're looking at ourself */
	struct pid_namespace *ns = current->nsproxy->pid_ns;

	/*
	 * We can't drop the pidlist_mutex before taking the l->mutex in case
	 * the last ref-holder is trying to remove l from the list at the same
	 * time. Holding the pidlist_mutex precludes somebody taking whichever
	 * list we find out from under us - compare release_pid_array().
	 */
	mutex_lock(&cgrp->pidlist_mutex);
	list_for_each_entry(l, &cgrp->pidlists, links) {
		if (l->key.type == type && l->key.ns == ns) {
			/* make sure l doesn't vanish out from under us */
			down_write(&l->mutex);
			mutex_unlock(&cgrp->pidlist_mutex);
			return l;
		}
	}
	/* entry not found; create a new one */
	l = kmalloc(sizeof(struct cgroup_pidlist), GFP_KERNEL);
	if (!l) {
		mutex_unlock(&cgrp->pidlist_mutex);
		return l;
	}
	init_rwsem(&l->mutex);
	down_write(&l->mutex);
	l->key.type = type;
	l->key.ns = get_pid_ns(ns);
	l->use_count = 0; /* don't increment here */
	l->list = NULL;
	l->owner = cgrp;
	list_add(&l->links, &cgrp->pidlists);
	mutex_unlock(&cgrp->pidlist_mutex);
	return l;
}
  
/*
 * Load a cgroup's pidarray with either procs' tgids or tasks' pids
 *
 * On success, stores the (sorted, deduplicated for procs) pidlist in
 * *@lp with its l->mutex held for write and its use count bumped, and
 * returns 0; returns -ENOMEM on allocation failure.
 */
static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,
			      struct cgroup_pidlist **lp)
{
	pid_t *array;
	int length;
	int pid, n = 0; /* used for populating the array */
	struct cgroup_iter it;
	struct task_struct *tsk;
	struct cgroup_pidlist *l;

	/*
	 * If cgroup gets more users after we read count, we won't have
	 * enough space - tough.  This race is indistinguishable to the
	 * caller from the case that the additional cgroup users didn't
	 * show up until sometime later on.
	 */
	length = cgroup_task_count(cgrp);
	array = pidlist_allocate(length);
	if (!array)
		return -ENOMEM;
	/* now, populate the array */
	cgroup_iter_start(cgrp, &it);
	while ((tsk = cgroup_iter_next(cgrp, &it))) {
		if (unlikely(n == length))
			break;
		/* get tgid or pid for procs or tasks file respectively */
		if (type == CGROUP_FILE_PROCS)
			pid = task_tgid_vnr(tsk);
		else
			pid = task_pid_vnr(tsk);
		if (pid > 0) /* make sure to only use valid results */
			array[n++] = pid;
	}
	cgroup_iter_end(cgrp, &it);
	length = n;
	/* now sort & (if procs) strip out duplicates */
	sort(array, length, sizeof(pid_t), cmppid, NULL);
	if (type == CGROUP_FILE_PROCS)
		length = pidlist_uniq(&array, length);
	l = cgroup_pidlist_find(cgrp, type);
	if (!l) {
		pidlist_free(array);
		return -ENOMEM;
	}
	/* store array, freeing old if necessary - lock already held */
	pidlist_free(l->list);
	l->list = array;
	l->length = length;
	l->use_count++;
	up_write(&l->mutex);
	*lp = l;
	return 0;
}
846c7bb05   Balbir Singh   Add cgroupstats
3108
/**
 * cgroupstats_build - build and fill cgroupstats
 * @stats: cgroupstats to fill information into
 * @dentry: A dentry entry belonging to the cgroup for which stats have
 * been requested.
 *
 * Build and fill cgroupstats so that taskstats can export it to user
 * space.
 *
 * Returns 0 on success, -EINVAL if @dentry is not a cgroup directory.
 */
int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry)
{
	int ret = -EINVAL;
	struct cgroup *cgrp;
	struct cgroup_iter it;
	struct task_struct *tsk;

	/*
	 * Validate dentry by checking the superblock operations,
	 * and make sure it's a directory.
	 */
	if (dentry->d_sb->s_op != &cgroup_ops ||
	    !S_ISDIR(dentry->d_inode->i_mode))
		 goto err;

	ret = 0;
	cgrp = dentry->d_fsdata;

	/* tally each attached task into the bucket for its state */
	cgroup_iter_start(cgrp, &it);
	while ((tsk = cgroup_iter_next(cgrp, &it))) {
		switch (tsk->state) {
		case TASK_RUNNING:
			stats->nr_running++;
			break;
		case TASK_INTERRUPTIBLE:
			stats->nr_sleeping++;
			break;
		case TASK_UNINTERRUPTIBLE:
			stats->nr_uninterruptible++;
			break;
		case TASK_STOPPED:
			stats->nr_stopped++;
			break;
		default:
			if (delayacct_is_task_waiting_on_io(tsk))
				stats->nr_io_wait++;
			break;
		}
	}
	cgroup_iter_end(cgrp, &it);

err:
	return ret;
}
8f3ff2086   Paul Menage   cgroups: revert "...
3161

bbcb81d09   Paul Menage   Task Control Grou...
3162
/*
 * seq_file methods for the tasks/procs files. The seq_file position is the
 * next pid to display; the seq_file iterator is a pointer to the pid
 * in the cgroup->l->list array.
 */
cc31edcee   Paul Menage   cgroups: convert ...
3167

102a775e3   Ben Blum   cgroups: add a re...
3168
/*
 * seq_file ->start(): position the iterator at the first pid >= *pos.
 * Takes l->mutex for reading; released in cgroup_pidlist_stop().
 */
static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos)
{
	/*
	 * Initially we receive a position value that corresponds to
	 * one more than the last pid shown (or 0 on the first call or
	 * after a seek to the start). Use a binary-search to find the
	 * next pid to display, if any
	 */
	struct cgroup_pidlist *l = s->private;
	int index = 0, pid = *pos;
	int *iter;

	down_read(&l->mutex);
	if (pid) {
		int end = l->length;

		while (index < end) {
			int mid = (index + end) / 2;
			if (l->list[mid] == pid) {
				index = mid;
				break;
			} else if (l->list[mid] <= pid)
				index = mid + 1;
			else
				end = mid;
		}
	}
	/* If we're off the end of the array, we're done */
	if (index >= l->length)
		return NULL;
	/* Update the abstract position to be the actual pid that we found */
	iter = l->list + index;
	*pos = *iter;
	return iter;
}
102a775e3   Ben Blum   cgroups: add a re...
3202
/*
 * seq_file ->stop(): drop the read lock taken in cgroup_pidlist_start().
 */
static void cgroup_pidlist_stop(struct seq_file *s, void *v)
{
	struct cgroup_pidlist *l = s->private;
	up_read(&l->mutex);
}
102a775e3   Ben Blum   cgroups: add a re...
3207
/*
 * seq_file ->next(): step to the following pid in the array, updating
 * *pos to its value; returns NULL at the end of the list.
 */
static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos)
{
	struct cgroup_pidlist *l = s->private;
	pid_t *p = v;
	pid_t *end = l->list + l->length;
	/*
	 * Advance to the next pid in the array. If this goes off the
	 * end, we're done
	 */
	p++;
	if (p >= end) {
		return NULL;
	} else {
		*pos = *p;
		return p;
	}
}
102a775e3   Ben Blum   cgroups: add a re...
3224
  static int cgroup_pidlist_show(struct seq_file *s, void *v)
cc31edcee   Paul Menage   cgroups: convert ...
3225
3226
3227
3228
  {
  	return seq_printf(s, "%d
  ", *(int *)v);
  }
bbcb81d09   Paul Menage   Task Control Grou...
3229

102a775e3   Ben Blum   cgroups: add a re...
3230
3231
3232
3233
3234
3235
3236
3237
3238
  /*
   * seq_operations functions for iterating on pidlists through seq_file -
   * independent of whether it's tasks or procs
   */
  static const struct seq_operations cgroup_pidlist_seq_operations = {
  	.start = cgroup_pidlist_start,
  	.stop = cgroup_pidlist_stop,
  	.next = cgroup_pidlist_next,
  	.show = cgroup_pidlist_show,
cc31edcee   Paul Menage   cgroups: convert ...
3239
  };
102a775e3   Ben Blum   cgroups: add a re...
3240
  static void cgroup_release_pid_array(struct cgroup_pidlist *l)
cc31edcee   Paul Menage   cgroups: convert ...
3241
  {
72a8cb30d   Ben Blum   cgroups: ensure c...
3242
3243
3244
3245
3246
3247
3248
  	/*
  	 * the case where we're the last user of this particular pidlist will
  	 * have us remove it from the cgroup's list, which entails taking the
  	 * mutex. since in pidlist_find the pidlist->lock depends on cgroup->
  	 * pidlist_mutex, we have to take pidlist_mutex first.
  	 */
  	mutex_lock(&l->owner->pidlist_mutex);
102a775e3   Ben Blum   cgroups: add a re...
3249
3250
3251
  	down_write(&l->mutex);
  	BUG_ON(!l->use_count);
  	if (!--l->use_count) {
72a8cb30d   Ben Blum   cgroups: ensure c...
3252
3253
3254
  		/* we're the last user if refcount is 0; remove and free */
  		list_del(&l->links);
  		mutex_unlock(&l->owner->pidlist_mutex);
d1d9fd330   Ben Blum   cgroups: use vmal...
3255
  		pidlist_free(l->list);
72a8cb30d   Ben Blum   cgroups: ensure c...
3256
3257
3258
3259
  		put_pid_ns(l->key.ns);
  		up_write(&l->mutex);
  		kfree(l);
  		return;
cc31edcee   Paul Menage   cgroups: convert ...
3260
  	}
72a8cb30d   Ben Blum   cgroups: ensure c...
3261
  	mutex_unlock(&l->owner->pidlist_mutex);
102a775e3   Ben Blum   cgroups: add a re...
3262
  	up_write(&l->mutex);
bbcb81d09   Paul Menage   Task Control Grou...
3263
  }
102a775e3   Ben Blum   cgroups: add a re...
3264
  static int cgroup_pidlist_release(struct inode *inode, struct file *file)
cc31edcee   Paul Menage   cgroups: convert ...
3265
  {
102a775e3   Ben Blum   cgroups: add a re...
3266
  	struct cgroup_pidlist *l;
cc31edcee   Paul Menage   cgroups: convert ...
3267
3268
  	if (!(file->f_mode & FMODE_READ))
  		return 0;
102a775e3   Ben Blum   cgroups: add a re...
3269
3270
3271
3272
3273
3274
  	/*
  	 * the seq_file will only be initialized if the file was opened for
  	 * reading; hence we check if it's not null only in that case.
  	 */
  	l = ((struct seq_file *)file->private_data)->private;
  	cgroup_release_pid_array(l);
cc31edcee   Paul Menage   cgroups: convert ...
3275
3276
  	return seq_release(inode, file);
  }
102a775e3   Ben Blum   cgroups: add a re...
3277
  static const struct file_operations cgroup_pidlist_operations = {
cc31edcee   Paul Menage   cgroups: convert ...
3278
3279
3280
  	.read = seq_read,
  	.llseek = seq_lseek,
  	.write = cgroup_file_write,
102a775e3   Ben Blum   cgroups: add a re...
3281
  	.release = cgroup_pidlist_release,
cc31edcee   Paul Menage   cgroups: convert ...
3282
  };
bbcb81d09   Paul Menage   Task Control Grou...
3283
  /*
102a775e3   Ben Blum   cgroups: add a re...
3284
3285
3286
   * The following functions handle opens on a file that displays a pidlist
   * (tasks or procs). Prepare an array of the process/thread IDs of whoever's
   * in the cgroup.
bbcb81d09   Paul Menage   Task Control Grou...
3287
   */
102a775e3   Ben Blum   cgroups: add a re...
3288
  /* helper function for the two below it */
72a8cb30d   Ben Blum   cgroups: ensure c...
3289
  static int cgroup_pidlist_open(struct file *file, enum cgroup_filetype type)
bbcb81d09   Paul Menage   Task Control Grou...
3290
  {
bd89aabc6   Paul Menage   Control groups: R...
3291
  	struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);
72a8cb30d   Ben Blum   cgroups: ensure c...
3292
  	struct cgroup_pidlist *l;
cc31edcee   Paul Menage   cgroups: convert ...
3293
  	int retval;
bbcb81d09   Paul Menage   Task Control Grou...
3294

cc31edcee   Paul Menage   cgroups: convert ...
3295
  	/* Nothing to do for write-only files */
bbcb81d09   Paul Menage   Task Control Grou...
3296
3297
  	if (!(file->f_mode & FMODE_READ))
  		return 0;
102a775e3   Ben Blum   cgroups: add a re...
3298
  	/* have the array populated */
72a8cb30d   Ben Blum   cgroups: ensure c...
3299
  	retval = pidlist_array_load(cgrp, type, &l);
102a775e3   Ben Blum   cgroups: add a re...
3300
3301
3302
3303
  	if (retval)
  		return retval;
  	/* configure file information */
  	file->f_op = &cgroup_pidlist_operations;
cc31edcee   Paul Menage   cgroups: convert ...
3304

102a775e3   Ben Blum   cgroups: add a re...
3305
  	retval = seq_open(file, &cgroup_pidlist_seq_operations);
cc31edcee   Paul Menage   cgroups: convert ...
3306
  	if (retval) {
102a775e3   Ben Blum   cgroups: add a re...
3307
  		cgroup_release_pid_array(l);
cc31edcee   Paul Menage   cgroups: convert ...
3308
  		return retval;
bbcb81d09   Paul Menage   Task Control Grou...
3309
  	}
102a775e3   Ben Blum   cgroups: add a re...
3310
  	((struct seq_file *)file->private_data)->private = l;
bbcb81d09   Paul Menage   Task Control Grou...
3311
3312
  	return 0;
  }
/* open() for the "tasks" file: one pid per thread in the cgroup */
static int cgroup_tasks_open(struct inode *unused, struct file *file)
{
	return cgroup_pidlist_open(file, CGROUP_FILE_TASKS);
}
/* open() for the "cgroup.procs" file: one pid per thread-group leader */
static int cgroup_procs_open(struct inode *unused, struct file *file)
{
	return cgroup_pidlist_open(file, CGROUP_FILE_PROCS);
}

/* read_u64 handler for "notify_on_release": 1 if the flag is set, else 0 */
static u64 cgroup_read_notify_on_release(struct cgroup *cgrp,
					    struct cftype *cft)
{
	return notify_on_release(cgrp);
}
6379c1061   Paul Menage   cgroup files: mov...
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
  static int cgroup_write_notify_on_release(struct cgroup *cgrp,
  					  struct cftype *cft,
  					  u64 val)
  {
  	clear_bit(CGRP_RELEASABLE, &cgrp->flags);
  	if (val)
  		set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
  	else
  		clear_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
  	return 0;
  }
/*
 * Unregister event and free resources.
 *
 * Gets called from workqueue.
 */
static void cgroup_event_remove(struct work_struct *work)
{
	struct cgroup_event *event = container_of(work, struct cgroup_event,
			remove);
	struct cgroup *cgrp = event->cgrp;

	/* tell the control file's owner the eventfd is going away */
	event->cft->unregister_event(cgrp, event->cft, event->eventfd);

	eventfd_ctx_put(event->eventfd);
	kfree(event);
	/* drop the cgroup dentry reference taken at registration time */
	dput(cgrp->dentry);
}
  
/*
 * Gets called on POLLHUP on eventfd when user closes it.
 *
 * Called with wqh->lock held and interrupts disabled.
 */
static int cgroup_event_wake(wait_queue_t *wait, unsigned mode,
		int sync, void *key)
{
	struct cgroup_event *event = container_of(wait,
			struct cgroup_event, wait);
	struct cgroup *cgrp = event->cgrp;
	/* key carries the poll event mask that woke us */
	unsigned long flags = (unsigned long)key;

	if (flags & POLLHUP) {
		/* eventfd side is gone: detach from waitqueue and cgroup */
		__remove_wait_queue(event->wqh, &event->wait);
		spin_lock(&cgrp->event_list_lock);
		list_del(&event->list);
		spin_unlock(&cgrp->event_list_lock);
		/*
		 * We are in atomic context, but cgroup_event_remove() may
		 * sleep, so we have to call it in workqueue.
		 */
		schedule_work(&event->remove);
	}

	return 0;
}
  
  static void cgroup_event_ptable_queue_proc(struct file *file,
  		wait_queue_head_t *wqh, poll_table *pt)
  {
  	struct cgroup_event *event = container_of(pt,
  			struct cgroup_event, pt);
  
  	event->wqh = wqh;
  	add_wait_queue(wqh, &event->wait);
  }
  
/*
 * Parse input and register new cgroup event handler.
 *
 * Input must be in format '<event_fd> <control_fd> <args>'.
 * Interpretation of args is defined by control file implementation.
 *
 * Returns 0 on success or a negative errno.  On success, holds a
 * reference on the cgroup's dentry until the eventfd is closed.
 */
static int cgroup_write_event_control(struct cgroup *cgrp, struct cftype *cft,
				      const char *buffer)
{
	struct cgroup_event *event = NULL;
	unsigned int efd, cfd;
	struct file *efile = NULL;
	struct file *cfile = NULL;
	char *endp;
	int ret;

	/* first token: the eventfd's file descriptor */
	efd = simple_strtoul(buffer, &endp, 10);
	if (*endp != ' ')
		return -EINVAL;
	buffer = endp + 1;

	/* second token: the control file's descriptor; args may follow */
	cfd = simple_strtoul(buffer, &endp, 10);
	if ((*endp != ' ') && (*endp != '\0'))
		return -EINVAL;
	buffer = endp + 1;

	event = kzalloc(sizeof(*event), GFP_KERNEL);
	if (!event)
		return -ENOMEM;
	event->cgrp = cgrp;
	INIT_LIST_HEAD(&event->list);
	init_poll_funcptr(&event->pt, cgroup_event_ptable_queue_proc);
	init_waitqueue_func_entry(&event->wait, cgroup_event_wake);
	INIT_WORK(&event->remove, cgroup_event_remove);

	efile = eventfd_fget(efd);
	if (IS_ERR(efile)) {
		ret = PTR_ERR(efile);
		goto fail;
	}

	event->eventfd = eventfd_ctx_fileget(efile);
	if (IS_ERR(event->eventfd)) {
		ret = PTR_ERR(event->eventfd);
		goto fail;
	}

	cfile = fget(cfd);
	if (!cfile) {
		ret = -EBADF;
		goto fail;
	}

	/* the process need read permission on control file */
	/* AV: shouldn't we check that it's been opened for read instead? */
	ret = inode_permission(cfile->f_path.dentry->d_inode, MAY_READ);
	if (ret < 0)
		goto fail;

	event->cft = __file_cft(cfile);
	if (IS_ERR(event->cft)) {
		ret = PTR_ERR(event->cft);
		goto fail;
	}

	/* only control files that opt in to eventfd notification qualify */
	if (!event->cft->register_event || !event->cft->unregister_event) {
		ret = -EINVAL;
		goto fail;
	}

	ret = event->cft->register_event(cgrp, event->cft,
			event->eventfd, buffer);
	if (ret)
		goto fail;

	/* eventfd already closed before we finished registering: unwind */
	if (efile->f_op->poll(efile, &event->pt) & POLLHUP) {
		event->cft->unregister_event(cgrp, event->cft, event->eventfd);
		ret = 0;
		goto fail;
	}

	/*
	 * Events should be removed after rmdir of cgroup directory, but before
	 * destroying subsystem state objects. Let's take reference to cgroup
	 * directory dentry to do that.
	 */
	dget(cgrp->dentry);

	spin_lock(&cgrp->event_list_lock);
	list_add(&event->list, &cgrp->event_list);
	spin_unlock(&cgrp->event_list_lock);

	fput(cfile);
	fput(efile);

	return 0;

fail:
	/* release whatever subset of resources was acquired before failing */
	if (cfile)
		fput(cfile);

	if (event && event->eventfd && !IS_ERR(event->eventfd))
		eventfd_ctx_put(event->eventfd);

	if (!IS_ERR_OR_NULL(efile))
		fput(efile);

	kfree(event);

	return ret;
}
/* read_u64 handler for "cgroup.clone_children" */
static u64 cgroup_clone_children_read(struct cgroup *cgrp,
				    struct cftype *cft)
{
	return clone_children(cgrp);
}

/* write_u64 handler for "cgroup.clone_children": non-zero enables */
static int cgroup_clone_children_write(struct cgroup *cgrp,
				     struct cftype *cft,
				     u64 val)
{
	if (!val)
		clear_bit(CGRP_CLONE_CHILDREN, &cgrp->flags);
	else
		set_bit(CGRP_CLONE_CHILDREN, &cgrp->flags);
	return 0;
}
/*
 * for the common functions, 'private' gives the type of file
 */
/* for hysterical raisins, we can't put this on the older files */
#define CGROUP_FILE_GENERIC_PREFIX "cgroup."
/*
 * Base control files created in every cgroup directory.  NOTE: the order
 * of entries here is the order the files are created in.
 */
static struct cftype files[] = {
	{
		/* legacy per-thread pid list (pre-"cgroup." naming) */
		.name = "tasks",
		.open = cgroup_tasks_open,
		.write_u64 = cgroup_tasks_write,
		.release = cgroup_pidlist_release,
		.mode = S_IRUGO | S_IWUSR,
	},
	{
		/* per-process (thread-group leader) pid list */
		.name = CGROUP_FILE_GENERIC_PREFIX "procs",
		.open = cgroup_procs_open,
		.write_u64 = cgroup_procs_write,
		.release = cgroup_pidlist_release,
		.mode = S_IRUGO | S_IWUSR,
	},
	{
		.name = "notify_on_release",
		.read_u64 = cgroup_read_notify_on_release,
		.write_u64 = cgroup_write_notify_on_release,
	},
	{
		/* write-only: registers eventfd-based notifications */
		.name = CGROUP_FILE_GENERIC_PREFIX "event_control",
		.write_string = cgroup_write_event_control,
		.mode = S_IWUGO,
	},
	{
		.name = "cgroup.clone_children",
		.read_u64 = cgroup_clone_children_read,
		.write_u64 = cgroup_clone_children_write,
	},
};
  
/* "release_agent" file: only created in the hierarchy's root cgroup */
static struct cftype cft_release_agent = {
	.name = "release_agent",
	.read_seq_string = cgroup_release_agent_show,
	.write_string = cgroup_release_agent_write,
	.max_write_len = PATH_MAX,
};
bd89aabc6   Paul Menage   Control groups: R...
3561
  static int cgroup_populate_dir(struct cgroup *cgrp)
ddbcc7e8e   Paul Menage   Task Control Grou...
3562
3563
3564
3565
3566
  {
  	int err;
  	struct cgroup_subsys *ss;
  
  	/* First clear out any existing files */
bd89aabc6   Paul Menage   Control groups: R...
3567
  	cgroup_clear_directory(cgrp->dentry);
ddbcc7e8e   Paul Menage   Task Control Grou...
3568

bd89aabc6   Paul Menage   Control groups: R...
3569
  	err = cgroup_add_files(cgrp, NULL, files, ARRAY_SIZE(files));
bbcb81d09   Paul Menage   Task Control Grou...
3570
3571
  	if (err < 0)
  		return err;
bd89aabc6   Paul Menage   Control groups: R...
3572
3573
  	if (cgrp == cgrp->top_cgroup) {
  		if ((err = cgroup_add_file(cgrp, NULL, &cft_release_agent)) < 0)
81a6a5cdd   Paul Menage   Task Control Grou...
3574
3575
  			return err;
  	}
bd89aabc6   Paul Menage   Control groups: R...
3576
3577
  	for_each_subsys(cgrp->root, ss) {
  		if (ss->populate && (err = ss->populate(ss, cgrp)) < 0)
ddbcc7e8e   Paul Menage   Task Control Grou...
3578
3579
  			return err;
  	}
38460b48d   KAMEZAWA Hiroyuki   cgroup: CSS ID su...
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
  	/* This cgroup is ready now */
  	for_each_subsys(cgrp->root, ss) {
  		struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
  		/*
  		 * Update id->css pointer and make this css visible from
  		 * CSS ID functions. This pointer will be dereferened
  		 * from RCU-read-side without locks.
  		 */
  		if (css->id)
  			rcu_assign_pointer(css->id->css, css);
  	}
ddbcc7e8e   Paul Menage   Task Control Grou...
3591
3592
3593
3594
3595
3596
  
  	return 0;
  }
  
  static void init_cgroup_css(struct cgroup_subsys_state *css,
  			       struct cgroup_subsys *ss,
bd89aabc6   Paul Menage   Control groups: R...
3597
  			       struct cgroup *cgrp)
ddbcc7e8e   Paul Menage   Task Control Grou...
3598
  {
bd89aabc6   Paul Menage   Control groups: R...
3599
  	css->cgroup = cgrp;
e7c5ec919   Paul Menage   cgroups: add css_...
3600
  	atomic_set(&css->refcnt, 1);
ddbcc7e8e   Paul Menage   Task Control Grou...
3601
  	css->flags = 0;
38460b48d   KAMEZAWA Hiroyuki   cgroup: CSS ID su...
3602
  	css->id = NULL;
bd89aabc6   Paul Menage   Control groups: R...
3603
  	if (cgrp == dummytop)
ddbcc7e8e   Paul Menage   Task Control Grou...
3604
  		set_bit(CSS_ROOT, &css->flags);
bd89aabc6   Paul Menage   Control groups: R...
3605
3606
  	BUG_ON(cgrp->subsys[ss->subsys_id]);
  	cgrp->subsys[ss->subsys_id] = css;
ddbcc7e8e   Paul Menage   Task Control Grou...
3607
  }
999cd8a45   Paul Menage   cgroups: add a pe...
3608
3609
3610
3611
  static void cgroup_lock_hierarchy(struct cgroupfs_root *root)
  {
  	/* We need to take each hierarchy_mutex in a consistent order */
  	int i;
aae8aab40   Ben Blum   cgroups: revamp s...
3612
3613
3614
3615
  	/*
  	 * No worry about a race with rebind_subsystems that might mess up the
  	 * locking order, since both parties are under cgroup_mutex.
  	 */
999cd8a45   Paul Menage   cgroups: add a pe...
3616
3617
  	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
  		struct cgroup_subsys *ss = subsys[i];
aae8aab40   Ben Blum   cgroups: revamp s...
3618
3619
  		if (ss == NULL)
  			continue;
999cd8a45   Paul Menage   cgroups: add a pe...
3620
  		if (ss->root == root)
cfebe563b   Li Zefan   cgroups: fix lock...
3621
  			mutex_lock(&ss->hierarchy_mutex);
999cd8a45   Paul Menage   cgroups: add a pe...
3622
3623
3624
3625
3626
3627
3628
3629
3630
  	}
  }
  
  static void cgroup_unlock_hierarchy(struct cgroupfs_root *root)
  {
  	int i;
  
  	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
  		struct cgroup_subsys *ss = subsys[i];
aae8aab40   Ben Blum   cgroups: revamp s...
3631
3632
  		if (ss == NULL)
  			continue;
999cd8a45   Paul Menage   cgroups: add a pe...
3633
3634
3635
3636
  		if (ss->root == root)
  			mutex_unlock(&ss->hierarchy_mutex);
  	}
  }
/*
 * cgroup_create - create a cgroup
 * @parent: cgroup that will be parent of the new cgroup
 * @dentry: dentry of the new cgroup
 * @mode: mode to set on new inode
 *
 * Must be called with the mutex on the parent inode held
 */
static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
			     umode_t mode)
{
	struct cgroup *cgrp;
	struct cgroupfs_root *root = parent->root;
	int err = 0;
	struct cgroup_subsys *ss;
	struct super_block *sb = root->sb;

	cgrp = kzalloc(sizeof(*cgrp), GFP_KERNEL);
	if (!cgrp)
		return -ENOMEM;

	/* Grab a reference on the superblock so the hierarchy doesn't
	 * get deleted on unmount if there are child cgroups.  This
	 * can be done outside cgroup_mutex, since the sb can't
	 * disappear while someone has an open control file on the
	 * fs */
	atomic_inc(&sb->s_active);

	mutex_lock(&cgroup_mutex);

	init_cgroup_housekeeping(cgrp);

	cgrp->parent = parent;
	cgrp->root = parent->root;
	cgrp->top_cgroup = parent->top_cgroup;

	/* new cgroups inherit these two flags from their parent */
	if (notify_on_release(parent))
		set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);

	if (clone_children(parent))
		set_bit(CGRP_CLONE_CHILDREN, &cgrp->flags);

	/* ask every bound subsystem to create its state for this cgroup */
	for_each_subsys(root, ss) {
		struct cgroup_subsys_state *css = ss->create(ss, cgrp);

		if (IS_ERR(css)) {
			err = PTR_ERR(css);
			goto err_destroy;
		}
		init_cgroup_css(css, ss, cgrp);
		if (ss->use_id) {
			err = alloc_css_id(ss, parent, cgrp);
			if (err)
				goto err_destroy;
		}
		/* At error, ->destroy() callback has to free assigned ID. */
		if (clone_children(parent) && ss->post_clone)
			ss->post_clone(ss, cgrp);
	}

	/* link into the parent's children list under the hierarchy locks */
	cgroup_lock_hierarchy(root);
	list_add(&cgrp->sibling, &cgrp->parent->children);
	cgroup_unlock_hierarchy(root);
	root->number_of_cgroups++;

	err = cgroup_create_dir(cgrp, dentry, mode);
	if (err < 0)
		goto err_remove;

	/* The cgroup directory was pre-locked for us */
	BUG_ON(!mutex_is_locked(&cgrp->dentry->d_inode->i_mutex));

	err = cgroup_populate_dir(cgrp);
	/* If err < 0, we have a half-filled directory - oh well ;) */

	mutex_unlock(&cgroup_mutex);
	mutex_unlock(&cgrp->dentry->d_inode->i_mutex);

	return 0;

 err_remove:

	/* undo the list insertion done above */
	cgroup_lock_hierarchy(root);
	list_del(&cgrp->sibling);
	cgroup_unlock_hierarchy(root);
	root->number_of_cgroups--;

 err_destroy:

	/* tear down whichever subsystem states were already created */
	for_each_subsys(root, ss) {
		if (cgrp->subsys[ss->subsys_id])
			ss->destroy(ss, cgrp);
	}

	mutex_unlock(&cgroup_mutex);

	/* Release the reference count that we took on the superblock */
	deactivate_super(sb);

	kfree(cgrp);
	return err;
}
18bb1db3e   Al Viro   switch vfs_mkdir(...
3731
  static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
ddbcc7e8e   Paul Menage   Task Control Grou...
3732
3733
3734
3735
3736
3737
  {
  	struct cgroup *c_parent = dentry->d_parent->d_fsdata;
  
  	/* the vfs holds inode->i_mutex already */
  	return cgroup_create(c_parent, dentry, mode | S_IFDIR);
  }
55b6fd016   Li Zefan   cgroup: uninline ...
3738
  static int cgroup_has_css_refs(struct cgroup *cgrp)
81a6a5cdd   Paul Menage   Task Control Grou...
3739
3740
3741
  {
  	/* Check the reference count on each subsystem. Since we
  	 * already established that there are no tasks in the
e7c5ec919   Paul Menage   cgroups: add css_...
3742
  	 * cgroup, if the css refcount is also 1, then there should
81a6a5cdd   Paul Menage   Task Control Grou...
3743
3744
3745
3746
3747
3748
3749
  	 * be no outstanding references, so the subsystem is safe to
  	 * destroy. We scan across all subsystems rather than using
  	 * the per-hierarchy linked list of mounted subsystems since
  	 * we can be called via check_for_release() with no
  	 * synchronization other than RCU, and the subsystem linked
  	 * list isn't RCU-safe */
  	int i;
aae8aab40   Ben Blum   cgroups: revamp s...
3750
3751
3752
3753
3754
  	/*
  	 * We won't need to lock the subsys array, because the subsystems
  	 * we're concerned about aren't going anywhere since our cgroup root
  	 * has a reference on them.
  	 */
81a6a5cdd   Paul Menage   Task Control Grou...
3755
3756
3757
  	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
  		struct cgroup_subsys *ss = subsys[i];
  		struct cgroup_subsys_state *css;
aae8aab40   Ben Blum   cgroups: revamp s...
3758
3759
  		/* Skip subsystems not present or not in this hierarchy */
  		if (ss == NULL || ss->root != cgrp->root)
81a6a5cdd   Paul Menage   Task Control Grou...
3760
  			continue;
bd89aabc6   Paul Menage   Control groups: R...
3761
  		css = cgrp->subsys[ss->subsys_id];
81a6a5cdd   Paul Menage   Task Control Grou...
3762
3763
3764
3765
3766
3767
  		/* When called from check_for_release() it's possible
  		 * that by this point the cgroup has been removed
  		 * and the css deleted. But a false-positive doesn't
  		 * matter, since it can only happen if the cgroup
  		 * has been deleted and hence no longer needs the
  		 * release agent to be called anyway. */
e7c5ec919   Paul Menage   cgroups: add css_...
3768
  		if (css && (atomic_read(&css->refcnt) > 1))
81a6a5cdd   Paul Menage   Task Control Grou...
3769
  			return 1;
81a6a5cdd   Paul Menage   Task Control Grou...
3770
3771
3772
  	}
  	return 0;
  }
/*
 * Atomically mark all (or else none) of the cgroup's CSS objects as
 * CSS_REMOVED. Return true on success, or false if the cgroup has
 * busy subsystems. Call with cgroup_mutex held
 */

static int cgroup_clear_css_refs(struct cgroup *cgrp)
{
	struct cgroup_subsys *ss;
	unsigned long flags;
	bool failed = false;
	/* irqs off: keeps the refcnt==0 window bounded on this CPU */
	local_irq_save(flags);
	for_each_subsys(cgrp->root, ss) {
		struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
		int refcnt;
		while (1) {
			/* We can only remove a CSS with a refcnt==1 */
			refcnt = atomic_read(&css->refcnt);
			if (refcnt > 1) {
				failed = true;
				goto done;
			}
			BUG_ON(!refcnt);
			/*
			 * Drop the refcnt to 0 while we check other
			 * subsystems. This will cause any racing
			 * css_tryget() to spin until we set the
			 * CSS_REMOVED bits or abort
			 */
			if (atomic_cmpxchg(&css->refcnt, refcnt, 0) == refcnt)
				break;
			/* lost the race with a css_get/put; retry */
			cpu_relax();
		}
	}
 done:
	/* second pass: commit (CSS_REMOVED) or roll back every css */
	for_each_subsys(cgrp->root, ss) {
		struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
		if (failed) {
			/*
			 * Restore old refcnt if we previously managed
			 * to clear it from 1 to 0
			 */
			if (!atomic_read(&css->refcnt))
				atomic_set(&css->refcnt, 1);
		} else {
			/* Commit the fact that the CSS is removed */
			set_bit(CSS_REMOVED, &css->flags);
		}
	}
	local_irq_restore(flags);
	return !failed;
}
ddbcc7e8e   Paul Menage   Task Control Grou...
3825
3826
  static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
  {
bd89aabc6   Paul Menage   Control groups: R...
3827
  	struct cgroup *cgrp = dentry->d_fsdata;
ddbcc7e8e   Paul Menage   Task Control Grou...
3828
3829
  	struct dentry *d;
  	struct cgroup *parent;
ec64f5154   KAMEZAWA Hiroyuki   cgroup: fix frequ...
3830
  	DEFINE_WAIT(wait);
4ab78683c   Kirill A. Shutemov   cgroups: fix race...
3831
  	struct cgroup_event *event, *tmp;
ec64f5154   KAMEZAWA Hiroyuki   cgroup: fix frequ...
3832
  	int ret;
ddbcc7e8e   Paul Menage   Task Control Grou...
3833
3834
  
  	/* the vfs holds both inode->i_mutex already */
ec64f5154   KAMEZAWA Hiroyuki   cgroup: fix frequ...
3835
  again:
ddbcc7e8e   Paul Menage   Task Control Grou...
3836
  	mutex_lock(&cgroup_mutex);
bd89aabc6   Paul Menage   Control groups: R...
3837
  	if (atomic_read(&cgrp->count) != 0) {
ddbcc7e8e   Paul Menage   Task Control Grou...
3838
3839
3840
  		mutex_unlock(&cgroup_mutex);
  		return -EBUSY;
  	}
bd89aabc6   Paul Menage   Control groups: R...
3841
  	if (!list_empty(&cgrp->children)) {
ddbcc7e8e   Paul Menage   Task Control Grou...
3842
3843
3844
  		mutex_unlock(&cgroup_mutex);
  		return -EBUSY;
  	}
3fa59dfbc   KAMEZAWA Hiroyuki   cgroup: fix poten...
3845
  	mutex_unlock(&cgroup_mutex);
a043e3b2c   Li Zefan   cgroup: fix comments
3846

4fca88c87   KAMEZAWA Hiroyuki   memory cgroup enh...
3847
  	/*
887032670   KAMEZAWA Hiroyuki   cgroup avoid perm...
3848
3849
3850
3851
3852
3853
3854
3855
3856
3857
3858
  	 * In general, subsystem has no css->refcnt after pre_destroy(). But
  	 * in racy cases, subsystem may have to get css->refcnt after
  	 * pre_destroy() and it makes rmdir return with -EBUSY. This sometimes
  	 * make rmdir return -EBUSY too often. To avoid that, we use waitqueue
  	 * for cgroup's rmdir. CGRP_WAIT_ON_RMDIR is for synchronizing rmdir
  	 * and subsystem's reference count handling. Please see css_get/put
  	 * and css_tryget() and cgroup_wakeup_rmdir_waiter() implementation.
  	 */
  	set_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
  
  	/*
a043e3b2c   Li Zefan   cgroup: fix comments
3859
3860
  	 * Call pre_destroy handlers of subsys. Notify subsystems
  	 * that rmdir() request comes.
4fca88c87   KAMEZAWA Hiroyuki   memory cgroup enh...
3861
  	 */
ec64f5154   KAMEZAWA Hiroyuki   cgroup: fix frequ...
3862
  	ret = cgroup_call_pre_destroy(cgrp);
887032670   KAMEZAWA Hiroyuki   cgroup avoid perm...
3863
3864
  	if (ret) {
  		clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
ec64f5154   KAMEZAWA Hiroyuki   cgroup: fix frequ...
3865
  		return ret;
887032670   KAMEZAWA Hiroyuki   cgroup avoid perm...
3866
  	}
ddbcc7e8e   Paul Menage   Task Control Grou...
3867

3fa59dfbc   KAMEZAWA Hiroyuki   cgroup: fix poten...
3868
3869
  	mutex_lock(&cgroup_mutex);
  	parent = cgrp->parent;
ec64f5154   KAMEZAWA Hiroyuki   cgroup: fix frequ...
3870
  	if (atomic_read(&cgrp->count) || !list_empty(&cgrp->children)) {
887032670   KAMEZAWA Hiroyuki   cgroup avoid perm...
3871
  		clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
ddbcc7e8e   Paul Menage   Task Control Grou...
3872
3873
3874
  		mutex_unlock(&cgroup_mutex);
  		return -EBUSY;
  	}
ec64f5154   KAMEZAWA Hiroyuki   cgroup: fix frequ...
3875
  	prepare_to_wait(&cgroup_rmdir_waitq, &wait, TASK_INTERRUPTIBLE);
ec64f5154   KAMEZAWA Hiroyuki   cgroup: fix frequ...
3876
3877
  	if (!cgroup_clear_css_refs(cgrp)) {
  		mutex_unlock(&cgroup_mutex);
887032670   KAMEZAWA Hiroyuki   cgroup avoid perm...
3878
3879
3880
3881
3882
3883
  		/*
  		 * Because someone may call cgroup_wakeup_rmdir_waiter() before
  		 * prepare_to_wait(), we need to check this flag.
  		 */
  		if (test_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags))
  			schedule();
ec64f5154   KAMEZAWA Hiroyuki   cgroup: fix frequ...
3884
3885
3886
3887
3888
3889
3890
3891
3892
  		finish_wait(&cgroup_rmdir_waitq, &wait);
  		clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
  		if (signal_pending(current))
  			return -EINTR;
  		goto again;
  	}
  	/* NO css_tryget() can success after here. */
  	finish_wait(&cgroup_rmdir_waitq, &wait);
  	clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
ddbcc7e8e   Paul Menage   Task Control Grou...
3893

cdcc136ff   Thomas Gleixner   locking, sched, c...
3894
  	raw_spin_lock(&release_list_lock);
bd89aabc6   Paul Menage   Control groups: R...
3895
3896
  	set_bit(CGRP_REMOVED, &cgrp->flags);
  	if (!list_empty(&cgrp->release_list))
8d2587970   Phil Carmody   cgroups: if you l...
3897
  		list_del_init(&cgrp->release_list);
cdcc136ff   Thomas Gleixner   locking, sched, c...
3898
  	raw_spin_unlock(&release_list_lock);
999cd8a45   Paul Menage   cgroups: add a pe...
3899
3900
3901
  
  	cgroup_lock_hierarchy(cgrp->root);
  	/* delete this cgroup from parent->children */
8d2587970   Phil Carmody   cgroups: if you l...
3902
  	list_del_init(&cgrp->sibling);
999cd8a45   Paul Menage   cgroups: add a pe...
3903
  	cgroup_unlock_hierarchy(cgrp->root);
bd89aabc6   Paul Menage   Control groups: R...
3904
  	d = dget(cgrp->dentry);
ddbcc7e8e   Paul Menage   Task Control Grou...
3905
3906
3907
  
  	cgroup_d_remove_dir(d);
  	dput(d);
ddbcc7e8e   Paul Menage   Task Control Grou...
3908

bd89aabc6   Paul Menage   Control groups: R...
3909
  	set_bit(CGRP_RELEASABLE, &parent->flags);
81a6a5cdd   Paul Menage   Task Control Grou...
3910
  	check_for_release(parent);
4ab78683c   Kirill A. Shutemov   cgroups: fix race...
3911
3912
3913
3914
3915
3916
3917
3918
3919
3920
3921
3922
3923
  	/*
  	 * Unregister events and notify userspace.
  	 * Notify userspace about cgroup removing only after rmdir of cgroup
  	 * directory to avoid race between userspace and kernelspace
  	 */
  	spin_lock(&cgrp->event_list_lock);
  	list_for_each_entry_safe(event, tmp, &cgrp->event_list, list) {
  		list_del(&event->list);
  		remove_wait_queue(event->wqh, &event->wait);
  		eventfd_signal(event->eventfd, 1);
  		schedule_work(&event->remove);
  	}
  	spin_unlock(&cgrp->event_list_lock);
ddbcc7e8e   Paul Menage   Task Control Grou...
3924
  	mutex_unlock(&cgroup_mutex);
ddbcc7e8e   Paul Menage   Task Control Grou...
3925
3926
  	return 0;
  }
06a119204   Li Zefan   cgroup: annotate ...
3927
  static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
ddbcc7e8e   Paul Menage   Task Control Grou...
3928
  {
ddbcc7e8e   Paul Menage   Task Control Grou...
3929
  	struct cgroup_subsys_state *css;
cfe36bde5   Diego Calleja   Improve cgroup pr...
3930
3931
3932
  
  	printk(KERN_INFO "Initializing cgroup subsys %s
  ", ss->name);
ddbcc7e8e   Paul Menage   Task Control Grou...
3933
3934
  
  	/* Create the top cgroup state for this subsystem */
33a68ac1c   Li Zefan   cgroups: add inac...
3935
  	list_add(&ss->sibling, &rootnode.subsys_list);
ddbcc7e8e   Paul Menage   Task Control Grou...
3936
3937
3938
3939
3940
  	ss->root = &rootnode;
  	css = ss->create(ss, dummytop);
  	/* We don't handle early failures gracefully */
  	BUG_ON(IS_ERR(css));
  	init_cgroup_css(css, ss, dummytop);
e8d55fdeb   Li Zefan   cgroups: simplify...
3941
  	/* Update the init_css_set to contain a subsys
817929ec2   Paul Menage   Task Control Grou...
3942
  	 * pointer to this state - since the subsystem is
e8d55fdeb   Li Zefan   cgroups: simplify...
3943
3944
3945
  	 * newly registered, all tasks and hence the
  	 * init_css_set is in the subsystem's top cgroup. */
  	init_css_set.subsys[ss->subsys_id] = dummytop->subsys[ss->subsys_id];
ddbcc7e8e   Paul Menage   Task Control Grou...
3946
3947
  
  	need_forkexit_callback |= ss->fork || ss->exit;
e8d55fdeb   Li Zefan   cgroups: simplify...
3948
3949
3950
3951
  	/* At system boot, before all subsystems have been
  	 * registered, no tasks have been forked, so we don't
  	 * need to invoke fork callbacks here. */
  	BUG_ON(!list_empty(&init_task.tasks));
999cd8a45   Paul Menage   cgroups: add a pe...
3952
  	mutex_init(&ss->hierarchy_mutex);
cfebe563b   Li Zefan   cgroups: fix lock...
3953
  	lockdep_set_class(&ss->hierarchy_mutex, &ss->subsys_key);
ddbcc7e8e   Paul Menage   Task Control Grou...
3954
  	ss->active = 1;
e6a1105ba   Ben Blum   cgroups: subsyste...
3955
3956
3957
3958
3959
3960
3961
3962
3963
3964
3965
  
  	/* this function shouldn't be used with modular subsystems, since they
  	 * need to register a subsys_id, among other things */
  	BUG_ON(ss->module);
  }
  
  /**
   * cgroup_load_subsys: load and register a modular subsystem at runtime
   * @ss: the subsystem to load
   *
   * This function should be called in a modular subsystem's initcall. If the
883931612   Thomas Weber   Fix typos in comm...
3966
   * subsystem is built as a module, it will be assigned a new subsys_id and set
e6a1105ba   Ben Blum   cgroups: subsyste...
3967
3968
3969
3970
3971
3972
3973
3974
3975
3976
3977
3978
3979
3980
3981
3982
3983
3984
3985
3986
3987
3988
3989
3990
3991
3992
3993
3994
3995
3996
3997
3998
3999
4000
4001
4002
4003
4004
4005
4006
4007
4008
4009
4010
4011
4012
4013
4014
4015
4016
4017
4018
4019
4020
4021
4022
4023
4024
4025
4026
4027
4028
4029
4030
4031
4032
4033
4034
4035
4036
4037
4038
4039
4040
4041
4042
4043
4044
4045
4046
4047
4048
4049
4050
4051
4052
4053
4054
4055
4056
4057
4058
4059
4060
4061
4062
4063
4064
4065
4066
4067
4068
4069
4070
4071
4072
4073
4074
4075
4076
4077
4078
4079
   * up for use. If the subsystem is built-in anyway, work is delegated to the
   * simpler cgroup_init_subsys.
   */
  int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
  {
  	int i;
  	struct cgroup_subsys_state *css;
  
  	/* check name and function validity */
  	if (ss->name == NULL || strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN ||
  	    ss->create == NULL || ss->destroy == NULL)
  		return -EINVAL;
  
  	/*
  	 * we don't support callbacks in modular subsystems. this check is
  	 * before the ss->module check for consistency; a subsystem that could
  	 * be a module should still have no callbacks even if the user isn't
  	 * compiling it as one.
  	 */
  	if (ss->fork || ss->exit)
  		return -EINVAL;
  
  	/*
  	 * an optionally modular subsystem is built-in: we want to do nothing,
  	 * since cgroup_init_subsys will have already taken care of it.
  	 */
  	if (ss->module == NULL) {
  		/* a few sanity checks */
  		BUG_ON(ss->subsys_id >= CGROUP_BUILTIN_SUBSYS_COUNT);
  		BUG_ON(subsys[ss->subsys_id] != ss);
  		return 0;
  	}
  
  	/*
  	 * need to register a subsys id before anything else - for example,
  	 * init_cgroup_css needs it.
  	 */
  	mutex_lock(&cgroup_mutex);
  	/* find the first empty slot in the array */
  	for (i = CGROUP_BUILTIN_SUBSYS_COUNT; i < CGROUP_SUBSYS_COUNT; i++) {
  		if (subsys[i] == NULL)
  			break;
  	}
  	if (i == CGROUP_SUBSYS_COUNT) {
  		/* maximum number of subsystems already registered! */
  		mutex_unlock(&cgroup_mutex);
  		return -EBUSY;
  	}
  	/* assign ourselves the subsys_id */
  	ss->subsys_id = i;
  	subsys[i] = ss;
  
  	/*
  	 * no ss->create seems to need anything important in the ss struct, so
  	 * this can happen first (i.e. before the rootnode attachment).
  	 */
  	css = ss->create(ss, dummytop);
  	if (IS_ERR(css)) {
  		/* failure case - need to deassign the subsys[] slot. */
  		subsys[i] = NULL;
  		mutex_unlock(&cgroup_mutex);
  		return PTR_ERR(css);
  	}
  
  	list_add(&ss->sibling, &rootnode.subsys_list);
  	ss->root = &rootnode;
  
  	/* our new subsystem will be attached to the dummy hierarchy. */
  	init_cgroup_css(css, ss, dummytop);
  	/* init_idr must be after init_cgroup_css because it sets css->id. */
  	if (ss->use_id) {
  		int ret = cgroup_init_idr(ss, css);
  		if (ret) {
  			dummytop->subsys[ss->subsys_id] = NULL;
  			ss->destroy(ss, dummytop);
  			subsys[i] = NULL;
  			mutex_unlock(&cgroup_mutex);
  			return ret;
  		}
  	}
  
  	/*
  	 * Now we need to entangle the css into the existing css_sets. unlike
  	 * in cgroup_init_subsys, there are now multiple css_sets, so each one
  	 * will need a new pointer to it; done by iterating the css_set_table.
  	 * furthermore, modifying the existing css_sets will corrupt the hash
  	 * table state, so each changed css_set will need its hash recomputed.
  	 * this is all done under the css_set_lock.
  	 */
  	write_lock(&css_set_lock);
  	for (i = 0; i < CSS_SET_TABLE_SIZE; i++) {
  		struct css_set *cg;
  		struct hlist_node *node, *tmp;
  		struct hlist_head *bucket = &css_set_table[i], *new_bucket;
  
  		hlist_for_each_entry_safe(cg, node, tmp, bucket, hlist) {
  			/* skip entries that we already rehashed */
  			if (cg->subsys[ss->subsys_id])
  				continue;
  			/* remove existing entry */
  			hlist_del(&cg->hlist);
  			/* set new value */
  			cg->subsys[ss->subsys_id] = css;
  			/* recompute hash and restore entry */
  			new_bucket = css_set_hash(cg->subsys);
  			hlist_add_head(&cg->hlist, new_bucket);
  		}
  	}
  	write_unlock(&css_set_lock);
  
  	mutex_init(&ss->hierarchy_mutex);
  	lockdep_set_class(&ss->hierarchy_mutex, &ss->subsys_key);
  	ss->active = 1;
e6a1105ba   Ben Blum   cgroups: subsyste...
4080
4081
4082
  	/* success! */
  	mutex_unlock(&cgroup_mutex);
  	return 0;
ddbcc7e8e   Paul Menage   Task Control Grou...
4083
  }
e6a1105ba   Ben Blum   cgroups: subsyste...
4084
  EXPORT_SYMBOL_GPL(cgroup_load_subsys);
ddbcc7e8e   Paul Menage   Task Control Grou...
4085
4086
  
  /**
cf5d5941f   Ben Blum   cgroups: subsyste...
4087
4088
4089
4090
4091
4092
4093
4094
4095
4096
4097
4098
4099
4100
4101
4102
4103
4104
4105
4106
4107
4108
4109
4110
4111
4112
4113
   * cgroup_unload_subsys: unload a modular subsystem
   * @ss: the subsystem to unload
   *
   * This function should be called in a modular subsystem's exitcall. When this
   * function is invoked, the refcount on the subsystem's module will be 0, so
   * the subsystem will not be attached to any hierarchy.
   */
  void cgroup_unload_subsys(struct cgroup_subsys *ss)
  {
  	struct cg_cgroup_link *link;
  	struct hlist_head *hhead;
  
  	BUG_ON(ss->module == NULL);
  
  	/*
  	 * we shouldn't be called if the subsystem is in use, and the use of
  	 * try_module_get in parse_cgroupfs_options should ensure that it
  	 * doesn't start being used while we're killing it off.
  	 */
  	BUG_ON(ss->root != &rootnode);
  
  	mutex_lock(&cgroup_mutex);
  	/* deassign the subsys_id */
  	BUG_ON(ss->subsys_id < CGROUP_BUILTIN_SUBSYS_COUNT);
  	subsys[ss->subsys_id] = NULL;
  
  	/* remove subsystem from rootnode's list of subsystems */
8d2587970   Phil Carmody   cgroups: if you l...
4114
  	list_del_init(&ss->sibling);
cf5d5941f   Ben Blum   cgroups: subsyste...
4115
4116
4117
4118
4119
4120
4121
4122
4123
4124
4125
4126
4127
4128
4129
4130
4131
4132
4133
4134
4135
4136
4137
4138
4139
4140
4141
4142
4143
4144
4145
  
  	/*
  	 * disentangle the css from all css_sets attached to the dummytop. as
  	 * in loading, we need to pay our respects to the hashtable gods.
  	 */
  	write_lock(&css_set_lock);
  	list_for_each_entry(link, &dummytop->css_sets, cgrp_link_list) {
  		struct css_set *cg = link->cg;
  
  		hlist_del(&cg->hlist);
  		BUG_ON(!cg->subsys[ss->subsys_id]);
  		cg->subsys[ss->subsys_id] = NULL;
  		hhead = css_set_hash(cg->subsys);
  		hlist_add_head(&cg->hlist, hhead);
  	}
  	write_unlock(&css_set_lock);
  
  	/*
  	 * remove subsystem's css from the dummytop and free it - need to free
  	 * before marking as null because ss->destroy needs the cgrp->subsys
  	 * pointer to find their state. note that this also takes care of
  	 * freeing the css_id.
  	 */
  	ss->destroy(ss, dummytop);
  	dummytop->subsys[ss->subsys_id] = NULL;
  
  	mutex_unlock(&cgroup_mutex);
  }
  EXPORT_SYMBOL_GPL(cgroup_unload_subsys);
  
  /**
a043e3b2c   Li Zefan   cgroup: fix comments
4146
4147
4148
4149
   * cgroup_init_early - cgroup initialization at system boot
   *
   * Initialize cgroups at system boot, and initialize any
   * subsystems that request early init.
ddbcc7e8e   Paul Menage   Task Control Grou...
4150
4151
4152
4153
   */
  int __init cgroup_init_early(void)
  {
  	int i;
146aa1bd0   Lai Jiangshan   cgroups: fix prob...
4154
  	atomic_set(&init_css_set.refcount, 1);
817929ec2   Paul Menage   Task Control Grou...
4155
4156
  	INIT_LIST_HEAD(&init_css_set.cg_links);
  	INIT_LIST_HEAD(&init_css_set.tasks);
472b1053f   Li Zefan   cgroups: use a ha...
4157
  	INIT_HLIST_NODE(&init_css_set.hlist);
817929ec2   Paul Menage   Task Control Grou...
4158
  	css_set_count = 1;
ddbcc7e8e   Paul Menage   Task Control Grou...
4159
  	init_cgroup_root(&rootnode);
817929ec2   Paul Menage   Task Control Grou...
4160
4161
4162
4163
  	root_count = 1;
  	init_task.cgroups = &init_css_set;
  
  	init_css_set_link.cg = &init_css_set;
7717f7ba9   Paul Menage   cgroups: add a ba...
4164
  	init_css_set_link.cgrp = dummytop;
bd89aabc6   Paul Menage   Control groups: R...
4165
  	list_add(&init_css_set_link.cgrp_link_list,
817929ec2   Paul Menage   Task Control Grou...
4166
4167
4168
  		 &rootnode.top_cgroup.css_sets);
  	list_add(&init_css_set_link.cg_link_list,
  		 &init_css_set.cg_links);
ddbcc7e8e   Paul Menage   Task Control Grou...
4169

472b1053f   Li Zefan   cgroups: use a ha...
4170
4171
  	for (i = 0; i < CSS_SET_TABLE_SIZE; i++)
  		INIT_HLIST_HEAD(&css_set_table[i]);
aae8aab40   Ben Blum   cgroups: revamp s...
4172
4173
  	/* at bootup time, we don't worry about modular subsystems */
  	for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
ddbcc7e8e   Paul Menage   Task Control Grou...
4174
4175
4176
4177
4178
4179
4180
  		struct cgroup_subsys *ss = subsys[i];
  
  		BUG_ON(!ss->name);
  		BUG_ON(strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN);
  		BUG_ON(!ss->create);
  		BUG_ON(!ss->destroy);
  		if (ss->subsys_id != i) {
cfe36bde5   Diego Calleja   Improve cgroup pr...
4181
4182
  			printk(KERN_ERR "cgroup: Subsys %s id == %d
  ",
ddbcc7e8e   Paul Menage   Task Control Grou...
4183
4184
4185
4186
4187
4188
4189
4190
4191
4192
4193
  			       ss->name, ss->subsys_id);
  			BUG();
  		}
  
  		if (ss->early_init)
  			cgroup_init_subsys(ss);
  	}
  	return 0;
  }
  
  /**
a043e3b2c   Li Zefan   cgroup: fix comments
4194
4195
4196
4197
   * cgroup_init - cgroup initialization
   *
   * Register cgroup filesystem and /proc file, and initialize
   * any subsystems that didn't request early init.
ddbcc7e8e   Paul Menage   Task Control Grou...
4198
4199
4200
4201
4202
   */
  int __init cgroup_init(void)
  {
  	int err;
  	int i;
472b1053f   Li Zefan   cgroups: use a ha...
4203
  	struct hlist_head *hhead;
a424316ca   Paul Menage   Task Control Grou...
4204
4205
4206
4207
  
  	err = bdi_init(&cgroup_backing_dev_info);
  	if (err)
  		return err;
ddbcc7e8e   Paul Menage   Task Control Grou...
4208

aae8aab40   Ben Blum   cgroups: revamp s...
4209
4210
  	/* at bootup time, we don't worry about modular subsystems */
  	for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
ddbcc7e8e   Paul Menage   Task Control Grou...
4211
4212
4213
  		struct cgroup_subsys *ss = subsys[i];
  		if (!ss->early_init)
  			cgroup_init_subsys(ss);
38460b48d   KAMEZAWA Hiroyuki   cgroup: CSS ID su...
4214
  		if (ss->use_id)
e6a1105ba   Ben Blum   cgroups: subsyste...
4215
  			cgroup_init_idr(ss, init_css_set.subsys[ss->subsys_id]);
ddbcc7e8e   Paul Menage   Task Control Grou...
4216
  	}
472b1053f   Li Zefan   cgroups: use a ha...
4217
4218
4219
  	/* Add init_css_set to the hash table */
  	hhead = css_set_hash(init_css_set.subsys);
  	hlist_add_head(&init_css_set.hlist, hhead);
2c6ab6d20   Paul Menage   cgroups: allow cg...
4220
  	BUG_ON(!init_root_id(&rootnode));
676db4af0   Greg Kroah-Hartman   cgroupfs: create ...
4221
4222
4223
4224
4225
4226
  
  	cgroup_kobj = kobject_create_and_add("cgroup", fs_kobj);
  	if (!cgroup_kobj) {
  		err = -ENOMEM;
  		goto out;
  	}
ddbcc7e8e   Paul Menage   Task Control Grou...
4227
  	err = register_filesystem(&cgroup_fs_type);
676db4af0   Greg Kroah-Hartman   cgroupfs: create ...
4228
4229
  	if (err < 0) {
  		kobject_put(cgroup_kobj);
ddbcc7e8e   Paul Menage   Task Control Grou...
4230
  		goto out;
676db4af0   Greg Kroah-Hartman   cgroupfs: create ...
4231
  	}
ddbcc7e8e   Paul Menage   Task Control Grou...
4232

46ae220be   Li Zefan   cgroup: switch to...
4233
  	proc_create("cgroups", 0, NULL, &proc_cgroupstats_operations);
a424316ca   Paul Menage   Task Control Grou...
4234

ddbcc7e8e   Paul Menage   Task Control Grou...
4235
  out:
a424316ca   Paul Menage   Task Control Grou...
4236
4237
  	if (err)
  		bdi_destroy(&cgroup_backing_dev_info);
ddbcc7e8e   Paul Menage   Task Control Grou...
4238
4239
  	return err;
  }
b4f48b636   Paul Menage   Task Control Grou...
4240

a424316ca   Paul Menage   Task Control Grou...
4241
4242
4243
4244
4245
4246
  /*
   * proc_cgroup_show()
   *  - Print task's cgroup paths into seq_file, one line for each hierarchy
   *  - Used for /proc/<pid>/cgroup.
   *  - No need to task_lock(tsk) on this tsk->cgroup reference, as it
   *    doesn't really matter if tsk->cgroup changes after we read it,
956db3ca0   Cliff Wickman   hotplug cpu: move...
4247
   *    and we take cgroup_mutex, keeping cgroup_attach_task() from changing it
a424316ca   Paul Menage   Task Control Grou...
4248
4249
4250
4251
4252
4253
4254
4255
4256
4257
4258
4259
4260
4261
4262
4263
4264
4265
4266
4267
4268
4269
4270
4271
4272
4273
4274
4275
   *    anyway.  No need to check that tsk->cgroup != NULL, thanks to
   *    the_top_cgroup_hack in cgroup_exit(), which sets an exiting tasks
   *    cgroup to top_cgroup.
   */
  
  /* TODO: Use a proper seq_file iterator */
  static int proc_cgroup_show(struct seq_file *m, void *v)
  {
  	struct pid *pid;
  	struct task_struct *tsk;
  	char *buf;
  	int retval;
  	struct cgroupfs_root *root;
  
  	retval = -ENOMEM;
  	buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
  	if (!buf)
  		goto out;
  
  	retval = -ESRCH;
  	pid = m->private;
  	tsk = get_pid_task(pid, PIDTYPE_PID);
  	if (!tsk)
  		goto out_free;
  
  	retval = 0;
  
  	mutex_lock(&cgroup_mutex);
e5f6a8609   Li Zefan   cgroups: make roo...
4276
  	for_each_active_root(root) {
a424316ca   Paul Menage   Task Control Grou...
4277
  		struct cgroup_subsys *ss;
bd89aabc6   Paul Menage   Control groups: R...
4278
  		struct cgroup *cgrp;
a424316ca   Paul Menage   Task Control Grou...
4279
  		int count = 0;
2c6ab6d20   Paul Menage   cgroups: allow cg...
4280
  		seq_printf(m, "%d:", root->hierarchy_id);
a424316ca   Paul Menage   Task Control Grou...
4281
4282
  		for_each_subsys(root, ss)
  			seq_printf(m, "%s%s", count++ ? "," : "", ss->name);
c6d57f331   Paul Menage   cgroups: support ...
4283
4284
4285
  		if (strlen(root->name))
  			seq_printf(m, "%sname=%s", count ? "," : "",
  				   root->name);
a424316ca   Paul Menage   Task Control Grou...
4286
  		seq_putc(m, ':');
7717f7ba9   Paul Menage   cgroups: add a ba...
4287
  		cgrp = task_cgroup_from_root(tsk, root);
bd89aabc6   Paul Menage   Control groups: R...
4288
  		retval = cgroup_path(cgrp, buf, PAGE_SIZE);
a424316ca   Paul Menage   Task Control Grou...
4289
4290
4291
4292
4293
4294
4295
4296
4297
4298
4299
4300
4301
4302
4303
4304
4305
4306
4307
4308
4309
  		if (retval < 0)
  			goto out_unlock;
  		seq_puts(m, buf);
  		seq_putc(m, '
  ');
  	}
  
  out_unlock:
  	mutex_unlock(&cgroup_mutex);
  	put_task_struct(tsk);
  out_free:
  	kfree(buf);
  out:
  	return retval;
  }
  
  static int cgroup_open(struct inode *inode, struct file *file)
  {
  	struct pid *pid = PROC_I(inode)->pid;
  	return single_open(file, proc_cgroup_show, pid);
  }
828c09509   Alexey Dobriyan   const: constify r...
4310
  const struct file_operations proc_cgroup_operations = {
a424316ca   Paul Menage   Task Control Grou...
4311
4312
4313
4314
4315
4316
4317
4318
4319
4320
  	.open		= cgroup_open,
  	.read		= seq_read,
  	.llseek		= seq_lseek,
  	.release	= single_release,
  };
  
  /* Display information about each subsystem and each hierarchy */
  static int proc_cgroupstats_show(struct seq_file *m, void *v)
  {
  	int i;
a424316ca   Paul Menage   Task Control Grou...
4321

8bab8dded   Paul Menage   cgroups: add cgro...
4322
4323
  	seq_puts(m, "#subsys_name\thierarchy\tnum_cgroups\tenabled
  ");
aae8aab40   Ben Blum   cgroups: revamp s...
4324
4325
4326
4327
4328
  	/*
  	 * ideally we don't want subsystems moving around while we do this.
  	 * cgroup_mutex is also necessary to guarantee an atomic snapshot of
  	 * subsys/hierarchy state.
  	 */
a424316ca   Paul Menage   Task Control Grou...
4329
  	mutex_lock(&cgroup_mutex);
a424316ca   Paul Menage   Task Control Grou...
4330
4331
  	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
  		struct cgroup_subsys *ss = subsys[i];
aae8aab40   Ben Blum   cgroups: revamp s...
4332
4333
  		if (ss == NULL)
  			continue;
2c6ab6d20   Paul Menage   cgroups: allow cg...
4334
4335
4336
  		seq_printf(m, "%s\t%d\t%d\t%d
  ",
  			   ss->name, ss->root->hierarchy_id,
8bab8dded   Paul Menage   cgroups: add cgro...
4337
  			   ss->root->number_of_cgroups, !ss->disabled);
a424316ca   Paul Menage   Task Control Grou...
4338
4339
4340
4341
4342
4343
4344
  	}
  	mutex_unlock(&cgroup_mutex);
  	return 0;
  }
  
  static int cgroupstats_open(struct inode *inode, struct file *file)
  {
9dce07f1a   Al Viro   NULL noise: fs/*,...
4345
  	return single_open(file, proc_cgroupstats_show, NULL);
a424316ca   Paul Menage   Task Control Grou...
4346
  }
828c09509   Alexey Dobriyan   const: constify r...
4347
  static const struct file_operations proc_cgroupstats_operations = {
a424316ca   Paul Menage   Task Control Grou...
4348
4349
4350
4351
4352
  	.open = cgroupstats_open,
  	.read = seq_read,
  	.llseek = seq_lseek,
  	.release = single_release,
  };
b4f48b636   Paul Menage   Task Control Grou...
4353
4354
  /**
   * cgroup_fork - attach newly forked task to its parents cgroup.
a043e3b2c   Li Zefan   cgroup: fix comments
4355
   * @child: pointer to task_struct of forking parent process.
b4f48b636   Paul Menage   Task Control Grou...
4356
4357
4358
4359
4360
   *
   * Description: A task inherits its parent's cgroup at fork().
   *
   * A pointer to the shared css_set was automatically copied in
   * fork.c by dup_task_struct().  However, we ignore that copy, since
7e381b0eb   Frederic Weisbecker   cgroup: Drop task...
4361
4362
4363
4364
4365
4366
4367
4368
4369
4370
4371
   * it was not made under the protection of RCU, cgroup_mutex or
   * threadgroup_change_begin(), so it might no longer be a valid
   * cgroup pointer.  cgroup_attach_task() might have already changed
   * current->cgroups, allowing the previously referenced cgroup
   * group to be removed and freed.
   *
   * Outside the pointer validity we also need to process the css_set
   * inheritance between threadgoup_change_begin() and
   * threadgoup_change_end(), this way there is no leak in any process
   * wide migration performed by cgroup_attach_proc() that could otherwise
   * miss a thread because it is too early or too late in the fork stage.
b4f48b636   Paul Menage   Task Control Grou...
4372
4373
4374
4375
4376
4377
   *
   * At the point that cgroup_fork() is called, 'current' is the parent
   * task, and the passed argument 'child' points to the child task.
   */
  void cgroup_fork(struct task_struct *child)
  {
7e381b0eb   Frederic Weisbecker   cgroup: Drop task...
4378
4379
4380
4381
4382
4383
  	/*
  	 * We don't need to task_lock() current because current->cgroups
  	 * can't be changed concurrently here. The parent obviously hasn't
  	 * exited and called cgroup_exit(), and we are synchronized against
  	 * cgroup migration through threadgroup_change_begin().
  	 */
817929ec2   Paul Menage   Task Control Grou...
4384
4385
  	child->cgroups = current->cgroups;
  	get_css_set(child->cgroups);
817929ec2   Paul Menage   Task Control Grou...
4386
  	INIT_LIST_HEAD(&child->cg_list);
b4f48b636   Paul Menage   Task Control Grou...
4387
4388
4389
  }
  
  /**
a043e3b2c   Li Zefan   cgroup: fix comments
4390
4391
4392
4393
4394
4395
   * cgroup_fork_callbacks - run fork callbacks
   * @child: the new task
   *
   * Called on a new task very soon before adding it to the
   * tasklist. No need to take any locks since no-one can
   * be operating on this task.
b4f48b636   Paul Menage   Task Control Grou...
4396
4397
4398
4399
4400
   */
  void cgroup_fork_callbacks(struct task_struct *child)
  {
  	if (need_forkexit_callback) {
  		int i;
aae8aab40   Ben Blum   cgroups: revamp s...
4401
4402
4403
4404
4405
4406
  		/*
  		 * forkexit callbacks are only supported for builtin
  		 * subsystems, and the builtin section of the subsys array is
  		 * immutable, so we don't need to lock the subsys array here.
  		 */
  		for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
b4f48b636   Paul Menage   Task Control Grou...
4407
4408
4409
4410
4411
4412
4413
4414
  			struct cgroup_subsys *ss = subsys[i];
  			if (ss->fork)
  				ss->fork(ss, child);
  		}
  	}
  }
  
  /**
a043e3b2c   Li Zefan   cgroup: fix comments
4415
4416
4417
4418
4419
4420
4421
4422
   * cgroup_post_fork - called on a new task after adding it to the task list
   * @child: the task in question
   *
   * Adds the task to the list running through its css_set if necessary.
   * Has to be after the task is visible on the task list in case we race
   * with the first call to cgroup_iter_start() - to guarantee that the
   * new task ends up on its list.
   */
817929ec2   Paul Menage   Task Control Grou...
4423
4424
4425
4426
  void cgroup_post_fork(struct task_struct *child)
  {
  	if (use_task_css_set_links) {
  		write_lock(&css_set_lock);
7e3aa30ac   Frederic Weisbecker   cgroup: Remove ta...
4427
4428
4429
4430
4431
4432
4433
4434
4435
4436
4437
  		if (list_empty(&child->cg_list)) {
  			/*
  			 * It's safe to use child->cgroups without task_lock()
  			 * here because we are protected through
  			 * threadgroup_change_begin() against concurrent
  			 * css_set change in cgroup_task_migrate(). Also
  			 * the task can't exit at that point until
  			 * wake_up_new_task() is called, so we are protected
  			 * against cgroup_exit() setting child->cgroup to
  			 * init_css_set.
  			 */
817929ec2   Paul Menage   Task Control Grou...
4438
  			list_add(&child->cg_list, &child->cgroups->tasks);
7e3aa30ac   Frederic Weisbecker   cgroup: Remove ta...
4439
  		}
817929ec2   Paul Menage   Task Control Grou...
4440
4441
4442
4443
  		write_unlock(&css_set_lock);
  	}
  }
  /**
b4f48b636   Paul Menage   Task Control Grou...
4444
4445
   * cgroup_exit - detach cgroup from exiting task
   * @tsk: pointer to task_struct of exiting process
a043e3b2c   Li Zefan   cgroup: fix comments
4446
   * @run_callback: run exit callbacks?
b4f48b636   Paul Menage   Task Control Grou...
4447
4448
4449
4450
4451
4452
4453
4454
4455
4456
4457
4458
4459
4460
4461
4462
4463
4464
4465
4466
4467
4468
4469
4470
4471
4472
4473
4474
   *
   * Description: Detach cgroup from @tsk and release it.
   *
   * Note that cgroups marked notify_on_release force every task in
   * them to take the global cgroup_mutex mutex when exiting.
   * This could impact scaling on very large systems.  Be reluctant to
   * use notify_on_release cgroups where very high task exit scaling
   * is required on large systems.
   *
   * the_top_cgroup_hack:
   *
   *    Set the exiting tasks cgroup to the root cgroup (top_cgroup).
   *
   *    We call cgroup_exit() while the task is still competent to
   *    handle notify_on_release(), then leave the task attached to the
   *    root cgroup in each hierarchy for the remainder of its exit.
   *
   *    To do this properly, we would increment the reference count on
   *    top_cgroup, and near the very end of the kernel/exit.c do_exit()
   *    code we would add a second cgroup function call, to drop that
   *    reference.  This would just create an unnecessary hot spot on
   *    the top_cgroup reference count, to no avail.
   *
   *    Normally, holding a reference to a cgroup without bumping its
   *    count is unsafe.   The cgroup could go away, or someone could
   *    attach us to a different cgroup, decrementing the count on
   *    the first cgroup that we never incremented.  But in this case,
   *    top_cgroup isn't going away, and either task has PF_EXITING set,
956db3ca0   Cliff Wickman   hotplug cpu: move...
4475
4476
   *    which wards off any cgroup_attach_task() attempts, or task is a failed
   *    fork, never visible to cgroup_attach_task.
b4f48b636   Paul Menage   Task Control Grou...
4477
4478
4479
   */
  void cgroup_exit(struct task_struct *tsk, int run_callbacks)
  {
817929ec2   Paul Menage   Task Control Grou...
4480
  	struct css_set *cg;
d41d5a016   Peter Zijlstra   cgroup: Fix cgrou...
4481
  	int i;
817929ec2   Paul Menage   Task Control Grou...
4482
4483
4484
4485
4486
4487
4488
4489
4490
  
  	/*
  	 * Unlink from the css_set task list if necessary.
  	 * Optimistically check cg_list before taking
  	 * css_set_lock
  	 */
  	if (!list_empty(&tsk->cg_list)) {
  		write_lock(&css_set_lock);
  		if (!list_empty(&tsk->cg_list))
8d2587970   Phil Carmody   cgroups: if you l...
4491
  			list_del_init(&tsk->cg_list);
817929ec2   Paul Menage   Task Control Grou...
4492
4493
  		write_unlock(&css_set_lock);
  	}
b4f48b636   Paul Menage   Task Control Grou...
4494
4495
  	/* Reassign the task to the init_css_set. */
  	task_lock(tsk);
817929ec2   Paul Menage   Task Control Grou...
4496
4497
  	cg = tsk->cgroups;
  	tsk->cgroups = &init_css_set;
d41d5a016   Peter Zijlstra   cgroup: Fix cgrou...
4498
4499
4500
4501
4502
4503
4504
4505
4506
4507
4508
4509
4510
4511
4512
4513
  
  	if (run_callbacks && need_forkexit_callback) {
  		/*
  		 * modular subsystems can't use callbacks, so no need to lock
  		 * the subsys array
  		 */
  		for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
  			struct cgroup_subsys *ss = subsys[i];
  			if (ss->exit) {
  				struct cgroup *old_cgrp =
  					rcu_dereference_raw(cg->subsys[i])->cgroup;
  				struct cgroup *cgrp = task_cgroup(tsk, i);
  				ss->exit(ss, cgrp, old_cgrp, tsk);
  			}
  		}
  	}
b4f48b636   Paul Menage   Task Control Grou...
4514
  	task_unlock(tsk);
d41d5a016   Peter Zijlstra   cgroup: Fix cgrou...
4515

817929ec2   Paul Menage   Task Control Grou...
4516
  	if (cg)
81a6a5cdd   Paul Menage   Task Control Grou...
4517
  		put_css_set_taskexit(cg);
b4f48b636   Paul Menage   Task Control Grou...
4518
  }
697f41610   Paul Menage   Task Control Grou...
4519
4520
  
  /**
313e924c0   Grzegorz Nosek   cgroups: relax ns...
4521
   * cgroup_is_descendant - see if @cgrp is a descendant of @task's cgrp
a043e3b2c   Li Zefan   cgroup: fix comments
4522
   * @cgrp: the cgroup in question
313e924c0   Grzegorz Nosek   cgroups: relax ns...
4523
   * @task: the task in question
a043e3b2c   Li Zefan   cgroup: fix comments
4524
   *
313e924c0   Grzegorz Nosek   cgroups: relax ns...
4525
4526
   * See if @cgrp is a descendant of @task's cgroup in the appropriate
   * hierarchy.
697f41610   Paul Menage   Task Control Grou...
4527
4528
4529
4530
4531
4532
   *
   * If we are sending in dummytop, then presumably we are creating
   * the top cgroup in the subsystem.
   *
   * Called only by the ns (nsproxy) cgroup.
   */
313e924c0   Grzegorz Nosek   cgroups: relax ns...
4533
  int cgroup_is_descendant(const struct cgroup *cgrp, struct task_struct *task)
697f41610   Paul Menage   Task Control Grou...
4534
4535
4536
  {
  	int ret;
  	struct cgroup *target;
697f41610   Paul Menage   Task Control Grou...
4537

bd89aabc6   Paul Menage   Control groups: R...
4538
  	if (cgrp == dummytop)
697f41610   Paul Menage   Task Control Grou...
4539
  		return 1;
7717f7ba9   Paul Menage   cgroups: add a ba...
4540
  	target = task_cgroup_from_root(task, cgrp->root);
bd89aabc6   Paul Menage   Control groups: R...
4541
4542
4543
  	while (cgrp != target && cgrp!= cgrp->top_cgroup)
  		cgrp = cgrp->parent;
  	ret = (cgrp == target);
697f41610   Paul Menage   Task Control Grou...
4544
4545
  	return ret;
  }
81a6a5cdd   Paul Menage   Task Control Grou...
4546

bd89aabc6   Paul Menage   Control groups: R...
4547
  static void check_for_release(struct cgroup *cgrp)
81a6a5cdd   Paul Menage   Task Control Grou...
4548
4549
4550
  {
  	/* All of these checks rely on RCU to keep the cgroup
  	 * structure alive */
bd89aabc6   Paul Menage   Control groups: R...
4551
4552
  	if (cgroup_is_releasable(cgrp) && !atomic_read(&cgrp->count)
  	    && list_empty(&cgrp->children) && !cgroup_has_css_refs(cgrp)) {
81a6a5cdd   Paul Menage   Task Control Grou...
4553
4554
4555
4556
  		/* Control Group is currently removeable. If it's not
  		 * already queued for a userspace notification, queue
  		 * it now */
  		int need_schedule_work = 0;
cdcc136ff   Thomas Gleixner   locking, sched, c...
4557
  		raw_spin_lock(&release_list_lock);
bd89aabc6   Paul Menage   Control groups: R...
4558
4559
4560
  		if (!cgroup_is_removed(cgrp) &&
  		    list_empty(&cgrp->release_list)) {
  			list_add(&cgrp->release_list, &release_list);
81a6a5cdd   Paul Menage   Task Control Grou...
4561
4562
  			need_schedule_work = 1;
  		}
cdcc136ff   Thomas Gleixner   locking, sched, c...
4563
  		raw_spin_unlock(&release_list_lock);
81a6a5cdd   Paul Menage   Task Control Grou...
4564
4565
4566
4567
  		if (need_schedule_work)
  			schedule_work(&release_agent_work);
  	}
  }
d7b9fff71   Daisuke Nishimura   cgroup: introduce...
4568
4569
/* Caller must verify that the css is not for root cgroup */
void __css_put(struct cgroup_subsys_state *css, int count)
{
	struct cgroup *cgrp = css->cgroup;
	int val;
	rcu_read_lock();
	val = atomic_sub_return(count, &css->refcnt);
	/* val == 1 means only the cgroup's own base reference remains,
	 * i.e. the last external user just went away. */
	if (val == 1) {
		/* If userspace asked for release notifications, mark the
		 * cgroup releasable and maybe queue the notification. */
		if (notify_on_release(cgrp)) {
			set_bit(CGRP_RELEASABLE, &cgrp->flags);
			check_for_release(cgrp);
		}
		/* Wake anyone blocked in rmdir waiting for refs to drain. */
		cgroup_wakeup_rmdir_waiter(cgrp);
	}
	rcu_read_unlock();
	/* Dropping below the base reference indicates a get/put imbalance. */
	WARN_ON_ONCE(val < 1);
}
EXPORT_SYMBOL_GPL(__css_put);
81a6a5cdd   Paul Menage   Task Control Grou...
4586
4587
4588
4589
4590
4591
4592
4593
4594
4595
4596
4597
4598
4599
4600
4601
4602
4603
4604
4605
4606
4607
4608
  
/*
 * Notify userspace when a cgroup is released, by running the
 * configured release agent with the name of the cgroup (path
 * relative to the root of cgroup file system) as the argument.
 *
 * Most likely, this user command will try to rmdir this cgroup.
 *
 * This races with the possibility that some other task will be
 * attached to this cgroup before it is removed, or that some other
 * user task will 'mkdir' a child cgroup of this cgroup.  That's ok.
 * The presumed 'rmdir' will fail quietly if this cgroup is no longer
 * unused, and this cgroup will be reprieved from its death sentence,
 * to continue to serve a useful existence.  Next time it's released,
 * we will get notified again, if it still has 'notify_on_release' set.
 *
 * The final arg to call_usermodehelper() is UMH_WAIT_EXEC, which
 * means only wait until the task is successfully execve()'d.  The
 * separate release agent task is forked by call_usermodehelper(),
 * then control in this thread returns here, without waiting for the
 * release agent task.  We don't bother to wait because the caller of
 * this routine has no use for the exit status of the release agent
 * task, so no sense holding our caller up for that.
 */
static void cgroup_release_agent(struct work_struct *work)
{
	BUG_ON(work != &release_agent_work);
	mutex_lock(&cgroup_mutex);
	raw_spin_lock(&release_list_lock);
	/* Drain release_list; the spinlock is dropped and retaken inside
	 * the loop around the sleeping allocations and the exec. */
	while (!list_empty(&release_list)) {
		char *argv[3], *envp[3];
		int i;
		char *pathbuf = NULL, *agentbuf = NULL;
		struct cgroup *cgrp = list_entry(release_list.next,
						    struct cgroup,
						    release_list);
		/* Detach the entry before dropping the lock so nobody
		 * processes this cgroup twice. */
		list_del_init(&cgrp->release_list);
		raw_spin_unlock(&release_list_lock);
		pathbuf = kmalloc(PAGE_SIZE, GFP_KERNEL);
		if (!pathbuf)
			goto continue_free;
		if (cgroup_path(cgrp, pathbuf, PAGE_SIZE) < 0)
			goto continue_free;
		agentbuf = kstrdup(cgrp->root->release_agent_path, GFP_KERNEL);
		if (!agentbuf)
			goto continue_free;

		i = 0;
		argv[i++] = agentbuf;
		argv[i++] = pathbuf;
		argv[i] = NULL;

		i = 0;
		/* minimal command environment */
		envp[i++] = "HOME=/";
		envp[i++] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin";
		envp[i] = NULL;

		/* Drop the lock while we invoke the usermode helper,
		 * since the exec could involve hitting disk and hence
		 * be a slow process */
		mutex_unlock(&cgroup_mutex);
		call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC);
		mutex_lock(&cgroup_mutex);
 continue_free:
		kfree(pathbuf);
		kfree(agentbuf);
		raw_spin_lock(&release_list_lock);
	}
	raw_spin_unlock(&release_list_lock);
	mutex_unlock(&cgroup_mutex);
}
8bab8dded   Paul Menage   cgroups: add cgro...
4658
4659
4660
4661
4662
4663
4664
4665
4666
  
  static int __init cgroup_disable(char *str)
  {
  	int i;
  	char *token;
  
  	while ((token = strsep(&str, ",")) != NULL) {
  		if (!*token)
  			continue;
aae8aab40   Ben Blum   cgroups: revamp s...
4667
4668
4669
4670
4671
  		/*
  		 * cgroup_disable, being at boot time, can't know about module
  		 * subsystems, so we don't worry about them.
  		 */
  		for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
8bab8dded   Paul Menage   cgroups: add cgro...
4672
4673
4674
4675
4676
4677
4678
4679
4680
4681
4682
4683
4684
4685
  			struct cgroup_subsys *ss = subsys[i];
  
  			if (!strcmp(token, ss->name)) {
  				ss->disabled = 1;
  				printk(KERN_INFO "Disabling %s control group"
  					" subsystem
  ", ss->name);
  				break;
  			}
  		}
  	}
  	return 1;
  }
  __setup("cgroup_disable=", cgroup_disable);
38460b48d   KAMEZAWA Hiroyuki   cgroup: CSS ID su...
4686
4687
4688
4689
4690
4691
4692
4693
4694
4695
  
/*
 * Functions for CSS ID.
 */
  
  /*
   *To get ID other than 0, this should be called when !cgroup_is_removed().
   */
  unsigned short css_id(struct cgroup_subsys_state *css)
  {
7f0f15464   KAMEZAWA Hiroyuki   memcg: fix css_id...
4696
4697
4698
4699
4700
4701
4702
  	struct css_id *cssid;
  
  	/*
  	 * This css_id() can return correct value when somone has refcnt
  	 * on this or this is under rcu_read_lock(). Once css->id is allocated,
  	 * it's unchanged until freed.
  	 */
d8bf4ca9c   Michal Hocko   rcu: treewide: Do...
4703
  	cssid = rcu_dereference_check(css->id, atomic_read(&css->refcnt));
38460b48d   KAMEZAWA Hiroyuki   cgroup: CSS ID su...
4704
4705
4706
4707
4708
  
  	if (cssid)
  		return cssid->id;
  	return 0;
  }
67523c48a   Ben Blum   cgroups: blkio su...
4709
  EXPORT_SYMBOL_GPL(css_id);
38460b48d   KAMEZAWA Hiroyuki   cgroup: CSS ID su...
4710
4711
4712
  
  unsigned short css_depth(struct cgroup_subsys_state *css)
  {
7f0f15464   KAMEZAWA Hiroyuki   memcg: fix css_id...
4713
  	struct css_id *cssid;
d8bf4ca9c   Michal Hocko   rcu: treewide: Do...
4714
  	cssid = rcu_dereference_check(css->id, atomic_read(&css->refcnt));
38460b48d   KAMEZAWA Hiroyuki   cgroup: CSS ID su...
4715
4716
4717
4718
4719
  
  	if (cssid)
  		return cssid->depth;
  	return 0;
  }
67523c48a   Ben Blum   cgroups: blkio su...
4720
  EXPORT_SYMBOL_GPL(css_depth);
38460b48d   KAMEZAWA Hiroyuki   cgroup: CSS ID su...
4721

747388d78   KAMEZAWA Hiroyuki   memcg: fix css_is...
4722
4723
4724
4725
4726
4727
4728
4729
4730
4731
4732
4733
/**
 * css_is_ancestor - test "root" css is an ancestor of "child"
 * @child: the css to be tested.
 * @root: the css supposed to be an ancestor of the child.
 *
 * Returns true if "root" is an ancestor of "child" in its hierarchy. Because
 * this function reads css->id, it uses rcu_dereference() and rcu_read_lock().
 * But, considering usual usage, the csses should be valid objects after test.
 * Assuming that the caller will do some action to the child if this returns
 * true, the caller must take "child"'s reference count.
 * If "child" is a valid object and this returns true, "root" is valid, too.
 */
bool css_is_ancestor(struct cgroup_subsys_state *child,
		    const struct cgroup_subsys_state *root)
{
	struct css_id *child_id;
	struct css_id *root_id;
	bool ret = true;

	rcu_read_lock();
	child_id  = rcu_dereference(child->id);
	root_id = rcu_dereference(root->id);
	/* An ancestor's id appears at its own depth in every descendant's
	 * ancestor stack; a missing id or a shallower child means
	 * "not an ancestor". */
	if (!child_id
	    || !root_id
	    || (child_id->depth < root_id->depth)
	    || (child_id->stack[root_id->depth] != root_id->id))
		ret = false;
	rcu_read_unlock();
	return ret;
}
38460b48d   KAMEZAWA Hiroyuki   cgroup: CSS ID su...
4752
4753
4754
4755
4756
4757
4758
4759
4760
4761
4762
/*
 * Detach and free the css_id attached to @css: unpublish both pointers,
 * remove the id from the subsystem's idr and free it after an RCU grace
 * period so concurrent readers stay safe.
 */
void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css)
{
	struct css_id *id = css->id;
	/* When this is called before css_id initialization, id can be NULL */
	if (!id)
		return;

	BUG_ON(!ss->use_id);

	/* Unpublish before removal so RCU readers observe NULL rather
	 * than a soon-to-be-freed id. */
	rcu_assign_pointer(id->css, NULL);
	rcu_assign_pointer(css->id, NULL);
	write_lock(&ss->id_lock);
	idr_remove(&ss->idr, id->id);
	write_unlock(&ss->id_lock);
	/* Defer the actual free until after a grace period. */
	kfree_rcu(id, rcu_head);
}
EXPORT_SYMBOL_GPL(free_css_id);
38460b48d   KAMEZAWA Hiroyuki   cgroup: CSS ID su...
4769
4770
4771
4772
4773
4774
4775
4776
4777
4778
4779
4780
4781
4782
4783
4784
4785
4786
4787
4788
4789
4790
  
/*
 * This is called by init or create(). Then, calls to this function are
 * always serialized (By cgroup_mutex() at create()).
 */

static struct css_id *get_new_cssid(struct cgroup_subsys *ss, int depth)
{
	struct css_id *newid;
	int myid, error, size;

	BUG_ON(!ss->use_id);

	/* Room for the ancestor-id stack: @depth entries plus our own id. */
	size = sizeof(*newid) + sizeof(unsigned short) * (depth + 1);
	newid = kzalloc(size, GFP_KERNEL);
	if (!newid)
		return ERR_PTR(-ENOMEM);
	/* get id */
	if (unlikely(!idr_pre_get(&ss->idr, GFP_KERNEL))) {
		error = -ENOMEM;
		goto err_out;
	}
	write_lock(&ss->id_lock);
	/* Don't use 0. allocates an ID of 1-65535 */
	error = idr_get_new_above(&ss->idr, newid, 1, &myid);
	write_unlock(&ss->id_lock);

	/* Returns error when there are no free spaces for new ID.*/
	if (error) {
		error = -ENOSPC;
		goto err_out;
	}
	if (myid > CSS_ID_MAX)
		goto remove_idr;

	newid->id = myid;
	newid->depth = depth;
	return newid;
remove_idr:
	/* The id exceeded the unsigned-short range; back it out of the idr. */
	error = -ENOSPC;
	write_lock(&ss->id_lock);
	idr_remove(&ss->idr, myid);
	write_unlock(&ss->id_lock);
err_out:
	kfree(newid);
	return ERR_PTR(error);

}
e6a1105ba   Ben Blum   cgroups: subsyste...
4817
4818
  static int __init_or_module cgroup_init_idr(struct cgroup_subsys *ss,
  					    struct cgroup_subsys_state *rootcss)
38460b48d   KAMEZAWA Hiroyuki   cgroup: CSS ID su...
4819
4820
  {
  	struct css_id *newid;
38460b48d   KAMEZAWA Hiroyuki   cgroup: CSS ID su...
4821

c1e2ee2dc   Andrew Bresticker   memcg: replace ss...
4822
  	rwlock_init(&ss->id_lock);
38460b48d   KAMEZAWA Hiroyuki   cgroup: CSS ID su...
4823
  	idr_init(&ss->idr);
38460b48d   KAMEZAWA Hiroyuki   cgroup: CSS ID su...
4824
4825
4826
4827
4828
4829
4830
4831
4832
4833
4834
4835
4836
4837
4838
  	newid = get_new_cssid(ss, 0);
  	if (IS_ERR(newid))
  		return PTR_ERR(newid);
  
  	newid->stack[0] = newid->id;
  	newid->css = rootcss;
  	rootcss->id = newid;
  	return 0;
  }
  
  static int alloc_css_id(struct cgroup_subsys *ss, struct cgroup *parent,
  			struct cgroup *child)
  {
  	int subsys_id, i, depth = 0;
  	struct cgroup_subsys_state *parent_css, *child_css;
fae9c7917   Li Zefan   cgroup: Fix an RC...
4839
  	struct css_id *child_id, *parent_id;
38460b48d   KAMEZAWA Hiroyuki   cgroup: CSS ID su...
4840
4841
4842
4843
  
  	subsys_id = ss->subsys_id;
  	parent_css = parent->subsys[subsys_id];
  	child_css = child->subsys[subsys_id];
38460b48d   KAMEZAWA Hiroyuki   cgroup: CSS ID su...
4844
  	parent_id = parent_css->id;
94b3dd0f7   Greg Thelen   cgroups: alloc_cs...
4845
  	depth = parent_id->depth + 1;
38460b48d   KAMEZAWA Hiroyuki   cgroup: CSS ID su...
4846
4847
4848
4849
4850
4851
4852
4853
4854
4855
4856
4857
4858
4859
4860
4861
4862
4863
4864
4865
4866
4867
4868
4869
4870
4871
4872
4873
4874
4875
4876
4877
4878
4879
4880
4881
4882
  
  	child_id = get_new_cssid(ss, depth);
  	if (IS_ERR(child_id))
  		return PTR_ERR(child_id);
  
  	for (i = 0; i < depth; i++)
  		child_id->stack[i] = parent_id->stack[i];
  	child_id->stack[depth] = child_id->id;
  	/*
  	 * child_id->css pointer will be set after this cgroup is available
  	 * see cgroup_populate_dir()
  	 */
  	rcu_assign_pointer(child_css->id, child_id);
  
  	return 0;
  }
  
  /**
   * css_lookup - lookup css by id
   * @ss: cgroup subsys to be looked into.
   * @id: the id
   *
   * Returns pointer to cgroup_subsys_state if there is valid one with id.
   * NULL if not. Should be called under rcu_read_lock()
   */
  struct cgroup_subsys_state *css_lookup(struct cgroup_subsys *ss, int id)
  {
  	struct css_id *cssid = NULL;
  
  	BUG_ON(!ss->use_id);
  	cssid = idr_find(&ss->idr, id);
  
  	if (unlikely(!cssid))
  		return NULL;
  
  	return rcu_dereference(cssid->css);
  }
67523c48a   Ben Blum   cgroups: blkio su...
4883
  EXPORT_SYMBOL_GPL(css_lookup);
38460b48d   KAMEZAWA Hiroyuki   cgroup: CSS ID su...
4884
4885
4886
4887
4888
4889
4890
4891
4892
4893
4894
4895
4896
4897
4898
4899
4900
4901
4902
4903
4904
4905
4906
4907
4908
4909
4910
4911
4912
4913
4914
4915
  
/**
 * css_get_next - lookup next cgroup under specified hierarchy.
 * @ss: pointer to subsystem
 * @id: current position of iteration.
 * @root: pointer to css. search tree under this.
 * @foundid: position of found object.
 *
 * Search next css under the specified hierarchy of rootid. Calling under
 * rcu_read_lock() is necessary. Returns NULL if it reaches the end.
 */
struct cgroup_subsys_state *
css_get_next(struct cgroup_subsys *ss, int id,
	     struct cgroup_subsys_state *root, int *foundid)
{
	struct cgroup_subsys_state *ret = NULL;
	struct css_id *tmp;
	int tmpid;
	int rootid = css_id(root);
	int depth = css_depth(root);

	/* rootid == 0 means @root has no valid css_id; nothing to scan. */
	if (!rootid)
		return NULL;

	BUG_ON(!ss->use_id);
	/* fill start point for scan */
	tmpid = id;
	while (1) {
		/*
		 * scan next entry from bitmap(tree), tmpid is updated after
		 * idr_get_next().
		 */
		read_lock(&ss->id_lock);
		tmp = idr_get_next(&ss->idr, &tmpid);
		read_unlock(&ss->id_lock);

		if (!tmp)
			break;
		/* @tmp is under @root iff @root's id appears at @root's
		 * depth in @tmp's ancestor stack. */
		if (tmp->depth >= depth && tmp->stack[depth] == rootid) {
			ret = rcu_dereference(tmp->css);
			/* Skip ids whose css was already torn down. */
			if (ret) {
				*foundid = tmpid;
				break;
			}
		}
		/* continue to scan from next id */
		tmpid = tmpid + 1;
	}
	return ret;
}
e5d1367f1   Stephane Eranian   perf: Add cgroup ...
4934
4935
4936
4937
4938
4939
4940
4941
4942
4943
4944
4945
4946
4947
4948
4949
4950
4951
4952
4953
4954
4955
  /*
   * get corresponding css from file open on cgroupfs directory
   */
  struct cgroup_subsys_state *cgroup_css_from_dir(struct file *f, int id)
  {
  	struct cgroup *cgrp;
  	struct inode *inode;
  	struct cgroup_subsys_state *css;
  
  	inode = f->f_dentry->d_inode;
  	/* check in cgroup filesystem dir */
  	if (inode->i_op != &cgroup_dir_inode_operations)
  		return ERR_PTR(-EBADF);
  
  	if (id < 0 || id >= CGROUP_SUBSYS_COUNT)
  		return ERR_PTR(-EINVAL);
  
  	/* get cgroup */
  	cgrp = __d_cgrp(f->f_dentry);
  	css = cgrp->subsys[id];
  	return css ? css : ERR_PTR(-ENOENT);
  }
fe6934354   Paul Menage   cgroups: move the...
4956
4957
4958
4959
4960
4961
4962
4963
4964
4965
4966
4967
4968
4969
4970
4971
4972
4973
4974
4975
4976
4977
4978
4979
4980
4981
4982
4983
4984
4985
4986
4987
4988
4989
4990
4991
4992
4993
4994
4995
4996
4997
  #ifdef CONFIG_CGROUP_DEBUG
  static struct cgroup_subsys_state *debug_create(struct cgroup_subsys *ss,
  						   struct cgroup *cont)
  {
  	struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL);
  
  	if (!css)
  		return ERR_PTR(-ENOMEM);
  
  	return css;
  }
  
  static void debug_destroy(struct cgroup_subsys *ss, struct cgroup *cont)
  {
  	kfree(cont->subsys[debug_subsys_id]);
  }
  
  static u64 cgroup_refcount_read(struct cgroup *cont, struct cftype *cft)
  {
  	return atomic_read(&cont->count);
  }
  
  static u64 debug_taskcount_read(struct cgroup *cont, struct cftype *cft)
  {
  	return cgroup_task_count(cont);
  }
  
  static u64 current_css_set_read(struct cgroup *cont, struct cftype *cft)
  {
  	return (u64)(unsigned long)current->cgroups;
  }
  
  static u64 current_css_set_refcount_read(struct cgroup *cont,
  					   struct cftype *cft)
  {
  	u64 count;
  
  	rcu_read_lock();
  	count = atomic_read(&current->cgroups->refcount);
  	rcu_read_unlock();
  	return count;
  }
7717f7ba9   Paul Menage   cgroups: add a ba...
4998
4999
5000
5001
5002
5003
5004
5005
5006
5007
5008
5009
5010
5011
5012
5013
5014
5015
  static int current_css_set_cg_links_read(struct cgroup *cont,
  					 struct cftype *cft,
  					 struct seq_file *seq)
  {
  	struct cg_cgroup_link *link;
  	struct css_set *cg;
  
  	read_lock(&css_set_lock);
  	rcu_read_lock();
  	cg = rcu_dereference(current->cgroups);
  	list_for_each_entry(link, &cg->cg_links, cg_link_list) {
  		struct cgroup *c = link->cgrp;
  		const char *name;
  
  		if (c->dentry)
  			name = c->dentry->d_name.name;
  		else
  			name = "?";
2c6ab6d20   Paul Menage   cgroups: allow cg...
5016
5017
5018
  		seq_printf(seq, "Root %d group %s
  ",
  			   c->root->hierarchy_id, name);
7717f7ba9   Paul Menage   cgroups: add a ba...
5019
5020
5021
5022
5023
5024
5025
5026
5027
5028
5029
5030
5031
5032
5033
5034
5035
5036
5037
5038
5039
5040
5041
5042
5043
5044
5045
5046
5047
5048
5049
5050
5051
5052
5053
  	}
  	rcu_read_unlock();
  	read_unlock(&css_set_lock);
  	return 0;
  }
  
  #define MAX_TASKS_SHOWN_PER_CSS 25
  static int cgroup_css_links_read(struct cgroup *cont,
  				 struct cftype *cft,
  				 struct seq_file *seq)
  {
  	struct cg_cgroup_link *link;
  
  	read_lock(&css_set_lock);
  	list_for_each_entry(link, &cont->css_sets, cgrp_link_list) {
  		struct css_set *cg = link->cg;
  		struct task_struct *task;
  		int count = 0;
  		seq_printf(seq, "css_set %p
  ", cg);
  		list_for_each_entry(task, &cg->tasks, cg_list) {
  			if (count++ > MAX_TASKS_SHOWN_PER_CSS) {
  				seq_puts(seq, "  ...
  ");
  				break;
  			} else {
  				seq_printf(seq, "  task %d
  ",
  					   task_pid_vnr(task));
  			}
  		}
  	}
  	read_unlock(&css_set_lock);
  	return 0;
  }
fe6934354   Paul Menage   cgroups: move the...
5054
5055
5056
5057
5058
5059
5060
5061
5062
5063
5064
5065
5066
5067
5068
5069
5070
5071
5072
5073
5074
5075
5076
5077
5078
5079
  static u64 releasable_read(struct cgroup *cgrp, struct cftype *cft)
  {
  	return test_bit(CGRP_RELEASABLE, &cgrp->flags);
  }
  
/* Control files exported by the debug subsystem. */
static struct cftype debug_files[] =  {
	{
		/* raw cgroup reference count */
		.name = "cgroup_refcount",
		.read_u64 = cgroup_refcount_read,
	},
	{
		/* number of attached tasks */
		.name = "taskcount",
		.read_u64 = debug_taskcount_read,
	},

	{
		/* kernel address of current task's css_set */
		.name = "current_css_set",
		.read_u64 = current_css_set_read,
	},

	{
		/* refcount of current task's css_set */
		.name = "current_css_set_refcount",
		.read_u64 = current_css_set_refcount_read,
	},

	{
		/* per-hierarchy links of current task's css_set */
		.name = "current_css_set_cg_links",
		.read_seq_string = current_css_set_cg_links_read,
	},

	{
		/* css_sets (and their tasks) attached to this cgroup */
		.name = "cgroup_css_links",
		.read_seq_string = cgroup_css_links_read,
	},

	{
		/* whether CGRP_RELEASABLE is set */
		.name = "releasable",
		.read_u64 = releasable_read,
	},
};
  
/* Create the debug subsystem's control files in @cont's directory. */
static int debug_populate(struct cgroup_subsys *ss, struct cgroup *cont)
{
	return cgroup_add_files(cont, ss, debug_files,
				ARRAY_SIZE(debug_files));
}
  
/* Debug subsystem definition; compiled only under CONFIG_CGROUP_DEBUG. */
struct cgroup_subsys debug_subsys = {
	.name = "debug",
	.create = debug_create,
	.destroy = debug_destroy,
	.populate = debug_populate,
	.subsys_id = debug_subsys_id,
};
  #endif /* CONFIG_CGROUP_DEBUG */