Blame view

kernel/cgroup.c 129 KB
ddbcc7e8e   Paul Menage   Task Control Grou...
1
  /*
ddbcc7e8e   Paul Menage   Task Control Grou...
2
3
4
5
6
   *  Generic process-grouping system.
   *
   *  Based originally on the cpuset system, extracted by Paul Menage
   *  Copyright (C) 2006 Google, Inc
   *
0dea11687   Kirill A. Shutemov   cgroup: implement...
7
8
9
10
   *  Notifications support
   *  Copyright (C) 2009 Nokia Corporation
   *  Author: Kirill A. Shutemov
   *
ddbcc7e8e   Paul Menage   Task Control Grou...
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
   *  Copyright notices from the original cpuset code:
   *  --------------------------------------------------
   *  Copyright (C) 2003 BULL SA.
   *  Copyright (C) 2004-2006 Silicon Graphics, Inc.
   *
   *  Portions derived from Patrick Mochel's sysfs code.
   *  sysfs is Copyright (c) 2001-3 Patrick Mochel
   *
   *  2003-10-10 Written by Simon Derr.
   *  2003-10-22 Updates by Stephen Hemminger.
   *  2004 May-July Rework by Paul Jackson.
   *  ---------------------------------------------------
   *
   *  This file is subject to the terms and conditions of the GNU General Public
   *  License.  See the file COPYING in the main directory of the Linux
   *  distribution for more details.
   */
  
  #include <linux/cgroup.h>
c6d57f331   Paul Menage   cgroups: support ...
30
  #include <linux/ctype.h>
ddbcc7e8e   Paul Menage   Task Control Grou...
31
32
33
34
35
36
37
38
  #include <linux/errno.h>
  #include <linux/fs.h>
  #include <linux/kernel.h>
  #include <linux/list.h>
  #include <linux/mm.h>
  #include <linux/mutex.h>
  #include <linux/mount.h>
  #include <linux/pagemap.h>
a424316ca   Paul Menage   Task Control Grou...
39
  #include <linux/proc_fs.h>
ddbcc7e8e   Paul Menage   Task Control Grou...
40
41
  #include <linux/rcupdate.h>
  #include <linux/sched.h>
817929ec2   Paul Menage   Task Control Grou...
42
  #include <linux/backing-dev.h>
ddbcc7e8e   Paul Menage   Task Control Grou...
43
44
45
46
47
  #include <linux/seq_file.h>
  #include <linux/slab.h>
  #include <linux/magic.h>
  #include <linux/spinlock.h>
  #include <linux/string.h>
bbcb81d09   Paul Menage   Task Control Grou...
48
  #include <linux/sort.h>
81a6a5cdd   Paul Menage   Task Control Grou...
49
  #include <linux/kmod.h>
e6a1105ba   Ben Blum   cgroups: subsyste...
50
  #include <linux/module.h>
846c7bb05   Balbir Singh   Add cgroupstats
51
52
  #include <linux/delayacct.h>
  #include <linux/cgroupstats.h>
472b1053f   Li Zefan   cgroups: use a ha...
53
  #include <linux/hash.h>
3f8206d49   Al Viro   [PATCH] get rid o...
54
  #include <linux/namei.h>
096b7fe01   Li Zefan   cgroups: fix pid ...
55
  #include <linux/pid_namespace.h>
2c6ab6d20   Paul Menage   cgroups: allow cg...
56
  #include <linux/idr.h>
d1d9fd330   Ben Blum   cgroups: use vmal...
57
  #include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */
0dea11687   Kirill A. Shutemov   cgroup: implement...
58
59
  #include <linux/eventfd.h>
  #include <linux/poll.h>
846c7bb05   Balbir Singh   Add cgroupstats
60

ddbcc7e8e   Paul Menage   Task Control Grou...
61
  #include <asm/atomic.h>
81a6a5cdd   Paul Menage   Task Control Grou...
62
  static DEFINE_MUTEX(cgroup_mutex);
aae8aab40   Ben Blum   cgroups: revamp s...
63
64
65
66
67
68
  /*
   * Generate an array of cgroup subsystem pointers. At boot time, this is
   * populated up to CGROUP_BUILTIN_SUBSYS_COUNT, and modular subsystems are
   * registered after that. The mutable section of this array is protected by
   * cgroup_mutex.
   */
ddbcc7e8e   Paul Menage   Task Control Grou...
69
  #define SUBSYS(_x) &_x ## _subsys,
aae8aab40   Ben Blum   cgroups: revamp s...
70
  static struct cgroup_subsys *subsys[CGROUP_SUBSYS_COUNT] = {
ddbcc7e8e   Paul Menage   Task Control Grou...
71
72
  #include <linux/cgroup_subsys.h>
  };
c6d57f331   Paul Menage   cgroups: support ...
73
  #define MAX_CGROUP_ROOT_NAMELEN 64
ddbcc7e8e   Paul Menage   Task Control Grou...
74
75
76
77
78
79
80
81
82
83
84
85
86
  /*
   * A cgroupfs_root represents the root of a cgroup hierarchy,
   * and may be associated with a superblock to form an active
   * hierarchy
   */
  struct cgroupfs_root {
  	struct super_block *sb;
  
  	/*
  	 * The bitmask of subsystems intended to be attached to this
  	 * hierarchy
  	 */
  	unsigned long subsys_bits;
2c6ab6d20   Paul Menage   cgroups: allow cg...
87
88
  	/* Unique id for this hierarchy. */
  	int hierarchy_id;
ddbcc7e8e   Paul Menage   Task Control Grou...
89
90
91
92
93
94
95
96
97
98
99
  	/* The bitmask of subsystems currently attached to this hierarchy */
  	unsigned long actual_subsys_bits;
  
  	/* A list running through the attached subsystems */
  	struct list_head subsys_list;
  
  	/* The root cgroup for this hierarchy */
  	struct cgroup top_cgroup;
  
  	/* Tracks how many cgroups are currently defined in hierarchy.*/
  	int number_of_cgroups;
e5f6a8609   Li Zefan   cgroups: make roo...
100
  	/* A list running through the active hierarchies */
ddbcc7e8e   Paul Menage   Task Control Grou...
101
102
103
104
  	struct list_head root_list;
  
  	/* Hierarchy-specific flags */
  	unsigned long flags;
81a6a5cdd   Paul Menage   Task Control Grou...
105

e788e066c   Paul Menage   cgroup files: mov...
106
  	/* The path to use for release notifications. */
81a6a5cdd   Paul Menage   Task Control Grou...
107
  	char release_agent_path[PATH_MAX];
c6d57f331   Paul Menage   cgroups: support ...
108
109
110
  
  	/* The name for this hierarchy - may be empty */
  	char name[MAX_CGROUP_ROOT_NAMELEN];
ddbcc7e8e   Paul Menage   Task Control Grou...
111
  };
ddbcc7e8e   Paul Menage   Task Control Grou...
112
113
114
115
116
117
  /*
   * The "rootnode" hierarchy is the "dummy hierarchy", reserved for the
   * subsystems that are otherwise unattached - it never has more than a
   * single cgroup, and all tasks are part of that cgroup.
   */
  static struct cgroupfs_root rootnode;
38460b48d   KAMEZAWA Hiroyuki   cgroup: CSS ID su...
118
119
120
121
122
123
124
125
126
127
128
129
130
  /*
   * CSS ID -- ID per subsys's Cgroup Subsys State (CSS). Used only when
   * cgroup_subsys->use_id != 0.
   */
  #define CSS_ID_MAX	(65535)
  struct css_id {
  	/*
  	 * The css to which this ID points. This pointer is set to valid value
  	 * after cgroup is populated. If cgroup is removed, this will be NULL.
  	 * This pointer is expected to be RCU-safe because destroy()
  	 * is called after synchronize_rcu(). But for safe use, css_is_removed()
  	 * css_tryget() should be used for avoiding race.
  	 */
2c392b8c3   Arnd Bergmann   cgroups: __rcu an...
131
  	struct cgroup_subsys_state __rcu *css;
38460b48d   KAMEZAWA Hiroyuki   cgroup: CSS ID su...
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
  	/*
  	 * ID of this css.
  	 */
  	unsigned short id;
  	/*
  	 * Depth in hierarchy which this ID belongs to.
  	 */
  	unsigned short depth;
  	/*
  	 * ID is freed by RCU. (and lookup routine is RCU safe.)
  	 */
  	struct rcu_head rcu_head;
  	/*
  	 * Hierarchy which this CSS ID belongs to.
  	 */
  	unsigned short stack[0]; /* Array of Length (depth+1) */
  };
0dea11687   Kirill A. Shutemov   cgroup: implement...
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
  /*
   * cgroup_event represents an event which userspace wants to receive.
   */
  struct cgroup_event {
  	/*
  	 * Cgroup which the event belongs to.
  	 */
  	struct cgroup *cgrp;
  	/*
  	 * Control file which the event is associated with.
  	 */
  	struct cftype *cft;
  	/*
  	 * eventfd used to signal userspace about the event.
  	 */
  	struct eventfd_ctx *eventfd;
  	/*
  	 * Each of these is stored in a list by the cgroup.
  	 */
  	struct list_head list;
  	/*
  	 * All fields below are needed to unregister the event when
  	 * userspace closes the eventfd.
  	 */
  	poll_table pt;
  	wait_queue_head_t *wqh;
  	wait_queue_t wait;
  	struct work_struct remove;
  };
38460b48d   KAMEZAWA Hiroyuki   cgroup: CSS ID su...
178

ddbcc7e8e   Paul Menage   Task Control Grou...
179
180
181
  /* The list of hierarchy roots */
  
  static LIST_HEAD(roots);
817929ec2   Paul Menage   Task Control Grou...
182
  static int root_count;
ddbcc7e8e   Paul Menage   Task Control Grou...
183

2c6ab6d20   Paul Menage   cgroups: allow cg...
184
185
186
  static DEFINE_IDA(hierarchy_ida);
  static int next_hierarchy_id;
  static DEFINE_SPINLOCK(hierarchy_id_lock);
ddbcc7e8e   Paul Menage   Task Control Grou...
187
188
189
190
  /* dummytop is a shorthand for the dummy hierarchy's top cgroup */
  #define dummytop (&rootnode.top_cgroup)
  
  /* This flag indicates whether tasks in the fork and exit paths should
a043e3b2c   Li Zefan   cgroup: fix comments
191
192
193
   * check for fork/exit handlers to call. This avoids us having to do
   * extra work in the fork/exit path if none of the subsystems need to
   * be called.
ddbcc7e8e   Paul Menage   Task Control Grou...
194
   */
8947f9d5b   Li Zefan   cgroups: annotate...
195
  static int need_forkexit_callback __read_mostly;
ddbcc7e8e   Paul Menage   Task Control Grou...
196

d11c563dd   Paul E. McKenney   sched: Use lockde...
197
198
199
200
201
202
203
204
205
206
207
208
209
  /*
   * cgroup_lock_is_held - query the state of cgroup_mutex.
   *
   * When CONFIG_PROVE_LOCKING is enabled the answer comes from
   * lockdep_is_held(); otherwise it comes from mutex_is_locked().
   */
  int cgroup_lock_is_held(void)
  {
  #ifdef CONFIG_PROVE_LOCKING
  	return lockdep_is_held(&cgroup_mutex);
  #else /* !CONFIG_PROVE_LOCKING */
  	return mutex_is_locked(&cgroup_mutex);
  #endif /* CONFIG_PROVE_LOCKING */
  }
  
  EXPORT_SYMBOL_GPL(cgroup_lock_is_held);
ddbcc7e8e   Paul Menage   Task Control Grou...
210
  /* convenient tests for these bits */
bd89aabc6   Paul Menage   Control groups: R...
211
  inline int cgroup_is_removed(const struct cgroup *cgrp)
ddbcc7e8e   Paul Menage   Task Control Grou...
212
  {
bd89aabc6   Paul Menage   Control groups: R...
213
  	return test_bit(CGRP_REMOVED, &cgrp->flags);
ddbcc7e8e   Paul Menage   Task Control Grou...
214
215
216
217
218
219
  }
  
  /* bits in struct cgroupfs_root flags field */
  enum {
  	ROOT_NOPREFIX, /* mounted subsystems have no named prefix */
  };
e9685a03c   Adrian Bunk   kernel/cgroup.c: ...
220
  static int cgroup_is_releasable(const struct cgroup *cgrp)
81a6a5cdd   Paul Menage   Task Control Grou...
221
222
  {
  	const int bits =
bd89aabc6   Paul Menage   Control groups: R...
223
224
225
  		(1 << CGRP_RELEASABLE) |
  		(1 << CGRP_NOTIFY_ON_RELEASE);
  	return (cgrp->flags & bits) == bits;
81a6a5cdd   Paul Menage   Task Control Grou...
226
  }
e9685a03c   Adrian Bunk   kernel/cgroup.c: ...
227
  static int notify_on_release(const struct cgroup *cgrp)
81a6a5cdd   Paul Menage   Task Control Grou...
228
  {
bd89aabc6   Paul Menage   Control groups: R...
229
  	return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
81a6a5cdd   Paul Menage   Task Control Grou...
230
  }
97978e6d1   Daniel Lezcano   cgroup: add clone...
231
232
233
234
  /* Report whether the CGRP_CLONE_CHILDREN bit is set in @cgrp's flags. */
  static int clone_children(const struct cgroup *cgrp)
  {
  	const unsigned long *flag_word = &cgrp->flags;

  	return test_bit(CGRP_CLONE_CHILDREN, flag_word);
  }
ddbcc7e8e   Paul Menage   Task Control Grou...
235
236
237
238
239
240
  /*
   * for_each_subsys() allows you to iterate on each subsystem attached to
   * an active hierarchy
   */
  #define for_each_subsys(_root, _ss) \
  list_for_each_entry(_ss, &_root->subsys_list, sibling)
e5f6a8609   Li Zefan   cgroups: make roo...
241
242
  /* for_each_active_root() allows you to iterate across the active hierarchies */
  #define for_each_active_root(_root) \
ddbcc7e8e   Paul Menage   Task Control Grou...
243
  list_for_each_entry(_root, &roots, root_list)
81a6a5cdd   Paul Menage   Task Control Grou...
244
245
246
247
248
249
  /* the list of cgroups eligible for automatic release. Protected by
   * release_list_lock */
  static LIST_HEAD(release_list);
  static DEFINE_SPINLOCK(release_list_lock);
  static void cgroup_release_agent(struct work_struct *work);
  static DECLARE_WORK(release_agent_work, cgroup_release_agent);
bd89aabc6   Paul Menage   Control groups: R...
250
  static void check_for_release(struct cgroup *cgrp);
81a6a5cdd   Paul Menage   Task Control Grou...
251

817929ec2   Paul Menage   Task Control Grou...
252
253
254
255
256
257
  /* Link structure for associating css_set objects with cgroups */
  struct cg_cgroup_link {
  	/*
  	 * List running through cg_cgroup_links associated with a
  	 * cgroup, anchored on cgroup->css_sets
  	 */
bd89aabc6   Paul Menage   Control groups: R...
258
  	struct list_head cgrp_link_list;
7717f7ba9   Paul Menage   cgroups: add a ba...
259
  	struct cgroup *cgrp;
817929ec2   Paul Menage   Task Control Grou...
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
  	/*
  	 * List running through cg_cgroup_links pointing at a
  	 * single css_set object, anchored on css_set->cg_links
  	 */
  	struct list_head cg_link_list;
  	struct css_set *cg;
  };
  
  /* The default css_set - used by init and its children prior to any
   * hierarchies being mounted. It contains a pointer to the root state
   * for each subsystem. Also used to anchor the list of css_sets. Not
   * reference-counted, to improve performance when child cgroups
   * haven't been created.
   */
  
  static struct css_set init_css_set;
  static struct cg_cgroup_link init_css_set_link;
e6a1105ba   Ben Blum   cgroups: subsyste...
277
278
  static int cgroup_init_idr(struct cgroup_subsys *ss,
  			   struct cgroup_subsys_state *css);
38460b48d   KAMEZAWA Hiroyuki   cgroup: CSS ID su...
279

817929ec2   Paul Menage   Task Control Grou...
280
281
282
283
284
  /* css_set_lock protects the list of css_set objects, and the
   * chain of tasks off each css_set.  Nests outside task->alloc_lock
   * due to cgroup_iter_start() */
  static DEFINE_RWLOCK(css_set_lock);
  static int css_set_count;
7717f7ba9   Paul Menage   cgroups: add a ba...
285
286
287
288
289
  /*
   * hash table for cgroup groups. This improves the performance to find
   * an existing css_set. This hash doesn't (currently) take into
   * account cgroups in empty hierarchies.
   */
472b1053f   Li Zefan   cgroups: use a ha...
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
  #define CSS_SET_HASH_BITS	7
  #define CSS_SET_TABLE_SIZE	(1 << CSS_SET_HASH_BITS)
  static struct hlist_head css_set_table[CSS_SET_TABLE_SIZE];
  
  /*
   * css_set_hash - pick the css_set_table bucket for a set of css pointers.
   * @css: array of CGROUP_SUBSYS_COUNT subsystem state pointers
   *
   * The pointer values are summed, the sum is XOR-folded with itself
   * shifted right by 16 bits, and the result is reduced to
   * CSS_SET_HASH_BITS bits with hash_long().
   *
   * Returns the address of the selected bucket head in css_set_table.
   */
  static struct hlist_head *css_set_hash(struct cgroup_subsys_state *css[])
  {
  	unsigned long sum = 0UL;
  	int slot = 0;
  	int bucket;

  	while (slot < CGROUP_SUBSYS_COUNT) {
  		sum += (unsigned long)css[slot];
  		slot++;
  	}
  	sum ^= sum >> 16;

  	bucket = hash_long(sum, CSS_SET_HASH_BITS);

  	return css_set_table + bucket;
  }
c378369d8   Ben Blum   cgroups: change c...
308
309
310
311
312
  /*
   * RCU callback: @obj is the rcu_head embedded in a css_set; recover the
   * enclosing css_set and kfree() it.
   */
  static void free_css_set_rcu(struct rcu_head *obj)
  {
  	kfree(container_of(obj, struct css_set, rcu_head));
  }
817929ec2   Paul Menage   Task Control Grou...
313
314
315
316
  /* We don't maintain the lists running through each css_set to its
   * task until after the first call to cgroup_iter_start(). This
   * reduces the fork()/exit() overhead for people who have cgroups
   * compiled into their kernel but not actually in use */
8947f9d5b   Li Zefan   cgroups: annotate...
317
  static int use_task_css_set_links __read_mostly;
817929ec2   Paul Menage   Task Control Grou...
318

2c6ab6d20   Paul Menage   cgroups: allow cg...
319
  static void __put_css_set(struct css_set *cg, int taskexit)
b4f48b636   Paul Menage   Task Control Grou...
320
  {
71cbb949d   KOSAKI Motohiro   cgroup: list_for_...
321
322
  	struct cg_cgroup_link *link;
  	struct cg_cgroup_link *saved_link;
146aa1bd0   Lai Jiangshan   cgroups: fix prob...
323
324
325
326
327
328
329
330
331
332
333
334
  	/*
  	 * Ensure that the refcount doesn't hit zero while any readers
  	 * can see it. Similar to atomic_dec_and_lock(), but for an
  	 * rwlock
  	 */
  	if (atomic_add_unless(&cg->refcount, -1, 1))
  		return;
  	write_lock(&css_set_lock);
  	if (!atomic_dec_and_test(&cg->refcount)) {
  		write_unlock(&css_set_lock);
  		return;
  	}
81a6a5cdd   Paul Menage   Task Control Grou...
335

2c6ab6d20   Paul Menage   cgroups: allow cg...
336
337
338
339
340
341
342
343
344
  	/* This css_set is dead. unlink it and release cgroup refcounts */
  	hlist_del(&cg->hlist);
  	css_set_count--;
  
  	list_for_each_entry_safe(link, saved_link, &cg->cg_links,
  				 cg_link_list) {
  		struct cgroup *cgrp = link->cgrp;
  		list_del(&link->cg_link_list);
  		list_del(&link->cgrp_link_list);
bd89aabc6   Paul Menage   Control groups: R...
345
346
  		if (atomic_dec_and_test(&cgrp->count) &&
  		    notify_on_release(cgrp)) {
81a6a5cdd   Paul Menage   Task Control Grou...
347
  			if (taskexit)
bd89aabc6   Paul Menage   Control groups: R...
348
349
  				set_bit(CGRP_RELEASABLE, &cgrp->flags);
  			check_for_release(cgrp);
81a6a5cdd   Paul Menage   Task Control Grou...
350
  		}
2c6ab6d20   Paul Menage   cgroups: allow cg...
351
352
  
  		kfree(link);
81a6a5cdd   Paul Menage   Task Control Grou...
353
  	}
2c6ab6d20   Paul Menage   cgroups: allow cg...
354
355
  
  	write_unlock(&css_set_lock);
c378369d8   Ben Blum   cgroups: change c...
356
  	call_rcu(&cg->rcu_head, free_css_set_rcu);
b4f48b636   Paul Menage   Task Control Grou...
357
  }
817929ec2   Paul Menage   Task Control Grou...
358
359
360
361
362
  /*
   * refcounted get/put for css_set objects
   */
  static inline void get_css_set(struct css_set *cg)
  {
146aa1bd0   Lai Jiangshan   cgroups: fix prob...
363
  	atomic_inc(&cg->refcount);
817929ec2   Paul Menage   Task Control Grou...
364
365
366
367
  }
  
  static inline void put_css_set(struct css_set *cg)
  {
146aa1bd0   Lai Jiangshan   cgroups: fix prob...
368
  	__put_css_set(cg, 0);
817929ec2   Paul Menage   Task Control Grou...
369
  }
81a6a5cdd   Paul Menage   Task Control Grou...
370
371
  static inline void put_css_set_taskexit(struct css_set *cg)
  {
146aa1bd0   Lai Jiangshan   cgroups: fix prob...
372
  	__put_css_set(cg, 1);
81a6a5cdd   Paul Menage   Task Control Grou...
373
  }
817929ec2   Paul Menage   Task Control Grou...
374
  /*
7717f7ba9   Paul Menage   cgroups: add a ba...
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
   * compare_css_sets - helper function for find_existing_css_set().
   * @cg: candidate css_set being tested
   * @old_cg: existing css_set for a task
   * @new_cgrp: cgroup that's being entered by the task
   * @template: desired set of css pointers in css_set (pre-calculated)
   *
   * Returns true if "cg" matches "old_cg" except for the hierarchy
   * which "new_cgrp" belongs to, for which it should match "new_cgrp".
   */
  static bool compare_css_sets(struct css_set *cg,
  			     struct css_set *old_cg,
  			     struct cgroup *new_cgrp,
  			     struct cgroup_subsys_state *template[])
  {
  	struct list_head *const cand_head = &cg->cg_links;
  	struct list_head *const old_head = &old_cg->cg_links;
  	struct list_head *cand_pos, *old_pos;

  	/* Cheap first pass: every subsystem state pointer has to match. */
  	if (memcmp(template, cg->subsys, sizeof(cg->subsys)) != 0)
  		return false;

  	/*
  	 * The subsystem pointers cannot tell apart different cgroups in
  	 * hierarchies with no subsystems, so the cgroup pointers are
  	 * compared as well. This walk alone would be sufficient, but the
  	 * memcmp above rejects almost all candidates on most setups
  	 * before this more expensive check is needed.
  	 *
  	 * Both link lists are walked in lockstep.
  	 */
  	for (cand_pos = cand_head->next, old_pos = old_head->next;
  	     cand_pos != cand_head;
  	     cand_pos = cand_pos->next, old_pos = old_pos->next) {
  		struct cgroup *cand_cgrp, *old_cgrp;

  		/* Both lists are of equal length, so neither ends early. */
  		BUG_ON(old_pos == old_head);

  		/* Locate the cgroups associated with these links. */
  		cand_cgrp = list_entry(cand_pos, struct cg_cgroup_link,
  				       cg_link_list)->cgrp;
  		old_cgrp = list_entry(old_pos, struct cg_cgroup_link,
  				      cg_link_list)->cgrp;

  		/* Hierarchies should be linked in the same order. */
  		BUG_ON(cand_cgrp->root != old_cgrp->root);

  		/*
  		 * In the hierarchy of the cgroup that is changing, the
  		 * candidate must point at the new cgroup; in every other
  		 * hierarchy it must point at the same cgroup as the old
  		 * css_set does.
  		 */
  		if (cand_cgrp != (cand_cgrp->root == new_cgrp->root ?
  				  new_cgrp : old_cgrp))
  			return false;
  	}

  	/* The candidate list is exhausted; the old one must be too. */
  	BUG_ON(old_pos != old_head);

  	return true;
  }
  
  /*
817929ec2   Paul Menage   Task Control Grou...
447
448
   * find_existing_css_set() is a helper for
   * find_css_set(), and checks to see whether an existing
472b1053f   Li Zefan   cgroups: use a ha...
449
   * css_set is suitable.
817929ec2   Paul Menage   Task Control Grou...
450
451
452
453
   *
   * oldcg: the cgroup group that we're using before the cgroup
   * transition
   *
bd89aabc6   Paul Menage   Control groups: R...
454
   * cgrp: the cgroup that we're moving into
817929ec2   Paul Menage   Task Control Grou...
455
456
457
458
   *
   * template: location in which to build the desired set of subsystem
   * state objects for the new cgroup group
   */
817929ec2   Paul Menage   Task Control Grou...
459
460
  static struct css_set *find_existing_css_set(
  	struct css_set *oldcg,
bd89aabc6   Paul Menage   Control groups: R...
461
  	struct cgroup *cgrp,
817929ec2   Paul Menage   Task Control Grou...
462
  	struct cgroup_subsys_state *template[])
b4f48b636   Paul Menage   Task Control Grou...
463
464
  {
  	int i;
bd89aabc6   Paul Menage   Control groups: R...
465
  	struct cgroupfs_root *root = cgrp->root;
472b1053f   Li Zefan   cgroups: use a ha...
466
467
468
  	struct hlist_head *hhead;
  	struct hlist_node *node;
  	struct css_set *cg;
817929ec2   Paul Menage   Task Control Grou...
469

aae8aab40   Ben Blum   cgroups: revamp s...
470
471
472
473
474
  	/*
  	 * Build the set of subsystem state objects that we want to see in the
  	 * new css_set. While subsystems can change globally, the entries here
  	 * won't change, so no need for locking.
  	 */
817929ec2   Paul Menage   Task Control Grou...
475
  	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
8d53d55d2   Li Zefan   cgroup: fix subsy...
476
  		if (root->subsys_bits & (1UL << i)) {
817929ec2   Paul Menage   Task Control Grou...
477
478
479
  			/* Subsystem is in this hierarchy. So we want
  			 * the subsystem state from the new
  			 * cgroup */
bd89aabc6   Paul Menage   Control groups: R...
480
  			template[i] = cgrp->subsys[i];
817929ec2   Paul Menage   Task Control Grou...
481
482
483
484
485
486
  		} else {
  			/* Subsystem is not in this hierarchy, so we
  			 * don't want to change the subsystem state */
  			template[i] = oldcg->subsys[i];
  		}
  	}
472b1053f   Li Zefan   cgroups: use a ha...
487
488
  	hhead = css_set_hash(template);
  	hlist_for_each_entry(cg, node, hhead, hlist) {
7717f7ba9   Paul Menage   cgroups: add a ba...
489
490
491
492
493
  		if (!compare_css_sets(cg, oldcg, cgrp, template))
  			continue;
  
  		/* This css_set matches what we need */
  		return cg;
472b1053f   Li Zefan   cgroups: use a ha...
494
  	}
817929ec2   Paul Menage   Task Control Grou...
495
496
497
498
  
  	/* No existing cgroup group matched */
  	return NULL;
  }
36553434f   Li Zefan   cgroup: remove du...
499
500
501
502
503
504
505
506
507
508
  static void free_cg_links(struct list_head *tmp)
  {
  	struct cg_cgroup_link *link;
  	struct cg_cgroup_link *saved_link;
  
  	list_for_each_entry_safe(link, saved_link, tmp, cgrp_link_list) {
  		list_del(&link->cgrp_link_list);
  		kfree(link);
  	}
  }
817929ec2   Paul Menage   Task Control Grou...
509
510
  /*
   * allocate_cg_links() allocates "count" cg_cgroup_link structures
bd89aabc6   Paul Menage   Control groups: R...
511
   * and chains them on tmp through their cgrp_link_list fields. Returns 0 on
817929ec2   Paul Menage   Task Control Grou...
512
513
   * success or a negative error
   */
817929ec2   Paul Menage   Task Control Grou...
514
515
516
517
518
519
520
521
  static int allocate_cg_links(int count, struct list_head *tmp)
  {
  	struct cg_cgroup_link *link;
  	int i;
  	INIT_LIST_HEAD(tmp);
  	for (i = 0; i < count; i++) {
  		link = kmalloc(sizeof(*link), GFP_KERNEL);
  		if (!link) {
36553434f   Li Zefan   cgroup: remove du...
522
  			free_cg_links(tmp);
817929ec2   Paul Menage   Task Control Grou...
523
524
  			return -ENOMEM;
  		}
bd89aabc6   Paul Menage   Control groups: R...
525
  		list_add(&link->cgrp_link_list, tmp);
817929ec2   Paul Menage   Task Control Grou...
526
527
528
  	}
  	return 0;
  }
c12f65d43   Li Zefan   cgroups: introduc...
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
  /**
   * link_css_set - a helper function to link a css_set to a cgroup
   * @tmp_cg_links: cg_cgroup_link objects allocated by allocate_cg_links()
   * @cg: the css_set to be linked
   * @cgrp: the destination cgroup
   */
  static void link_css_set(struct list_head *tmp_cg_links,
  			 struct css_set *cg, struct cgroup *cgrp)
  {
  	struct cg_cgroup_link *link;
  
  	BUG_ON(list_empty(tmp_cg_links));
  	link = list_first_entry(tmp_cg_links, struct cg_cgroup_link,
  				cgrp_link_list);
  	link->cg = cg;
7717f7ba9   Paul Menage   cgroups: add a ba...
544
  	link->cgrp = cgrp;
2c6ab6d20   Paul Menage   cgroups: allow cg...
545
  	atomic_inc(&cgrp->count);
c12f65d43   Li Zefan   cgroups: introduc...
546
  	list_move(&link->cgrp_link_list, &cgrp->css_sets);
7717f7ba9   Paul Menage   cgroups: add a ba...
547
548
549
550
551
  	/*
  	 * Always add links to the tail of the list so that the list
  	 * is sorted by order of hierarchy creation
  	 */
  	list_add_tail(&link->cg_link_list, &cg->cg_links);
c12f65d43   Li Zefan   cgroups: introduc...
552
  }
817929ec2   Paul Menage   Task Control Grou...
553
554
555
556
557
558
559
  /*
   * find_css_set() takes an existing cgroup group and a
   * cgroup object, and returns a css_set object that's
   * equivalent to the old group, but with the given cgroup
   * substituted into the appropriate hierarchy. Must be called with
   * cgroup_mutex held
   */
817929ec2   Paul Menage   Task Control Grou...
560
  static struct css_set *find_css_set(
bd89aabc6   Paul Menage   Control groups: R...
561
  	struct css_set *oldcg, struct cgroup *cgrp)
817929ec2   Paul Menage   Task Control Grou...
562
563
564
  {
  	struct css_set *res;
  	struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT];
817929ec2   Paul Menage   Task Control Grou...
565
566
  
  	struct list_head tmp_cg_links;
817929ec2   Paul Menage   Task Control Grou...
567

472b1053f   Li Zefan   cgroups: use a ha...
568
  	struct hlist_head *hhead;
7717f7ba9   Paul Menage   cgroups: add a ba...
569
  	struct cg_cgroup_link *link;
472b1053f   Li Zefan   cgroups: use a ha...
570

817929ec2   Paul Menage   Task Control Grou...
571
572
  	/* First see if we already have a cgroup group that matches
  	 * the desired set */
7e9abd89c   Li Zefan   cgroup: use read ...
573
  	read_lock(&css_set_lock);
bd89aabc6   Paul Menage   Control groups: R...
574
  	res = find_existing_css_set(oldcg, cgrp, template);
817929ec2   Paul Menage   Task Control Grou...
575
576
  	if (res)
  		get_css_set(res);
7e9abd89c   Li Zefan   cgroup: use read ...
577
  	read_unlock(&css_set_lock);
817929ec2   Paul Menage   Task Control Grou...
578
579
580
581
582
583
584
585
586
587
588
589
590
  
  	if (res)
  		return res;
  
  	res = kmalloc(sizeof(*res), GFP_KERNEL);
  	if (!res)
  		return NULL;
  
  	/* Allocate all the cg_cgroup_link objects that we'll need */
  	if (allocate_cg_links(root_count, &tmp_cg_links) < 0) {
  		kfree(res);
  		return NULL;
  	}
146aa1bd0   Lai Jiangshan   cgroups: fix prob...
591
  	atomic_set(&res->refcount, 1);
817929ec2   Paul Menage   Task Control Grou...
592
593
  	INIT_LIST_HEAD(&res->cg_links);
  	INIT_LIST_HEAD(&res->tasks);
472b1053f   Li Zefan   cgroups: use a ha...
594
  	INIT_HLIST_NODE(&res->hlist);
817929ec2   Paul Menage   Task Control Grou...
595
596
597
598
599
600
601
  
  	/* Copy the set of subsystem state objects generated in
  	 * find_existing_css_set() */
  	memcpy(res->subsys, template, sizeof(res->subsys));
  
  	write_lock(&css_set_lock);
  	/* Add reference counts and links from the new css_set. */
7717f7ba9   Paul Menage   cgroups: add a ba...
602
603
604
605
606
607
  	list_for_each_entry(link, &oldcg->cg_links, cg_link_list) {
  		struct cgroup *c = link->cgrp;
  		if (c->root == cgrp->root)
  			c = cgrp;
  		link_css_set(&tmp_cg_links, res, c);
  	}
817929ec2   Paul Menage   Task Control Grou...
608
609
  
  	BUG_ON(!list_empty(&tmp_cg_links));
817929ec2   Paul Menage   Task Control Grou...
610
  	css_set_count++;
472b1053f   Li Zefan   cgroups: use a ha...
611
612
613
614
  
  	/* Add this cgroup group to the hash table */
  	hhead = css_set_hash(res->subsys);
  	hlist_add_head(&res->hlist, hhead);
817929ec2   Paul Menage   Task Control Grou...
615
616
617
  	write_unlock(&css_set_lock);
  
  	return res;
b4f48b636   Paul Menage   Task Control Grou...
618
  }
ddbcc7e8e   Paul Menage   Task Control Grou...
619
  /*
7717f7ba9   Paul Menage   cgroups: add a ba...
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
   * Return the cgroup for "task" from the given hierarchy. Must be
   * called with cgroup_mutex held.
   */
  static struct cgroup *task_cgroup_from_root(struct task_struct *task,
  					    struct cgroupfs_root *root)
  {
  	struct css_set *css;
  	struct cgroup *res = NULL;
  
  	BUG_ON(!mutex_is_locked(&cgroup_mutex));
  	read_lock(&css_set_lock);
  	/*
  	 * No need to lock the task - since we hold cgroup_mutex the
  	 * task can't change groups, so the only thing that can happen
  	 * is that it exits and its css is set back to init_css_set.
  	 */
  	css = task->cgroups;
  	if (css == &init_css_set) {
  		res = &root->top_cgroup;
  	} else {
  		struct cg_cgroup_link *link;
  		list_for_each_entry(link, &css->cg_links, cg_link_list) {
  			struct cgroup *c = link->cgrp;
  			if (c->root == root) {
  				res = c;
  				break;
  			}
  		}
  	}
  	read_unlock(&css_set_lock);
  	BUG_ON(!res);
  	return res;
  }
  
  /*
ddbcc7e8e   Paul Menage   Task Control Grou...
655
656
657
658
659
660
661
662
663
   * There is one global cgroup mutex. We also require taking
   * task_lock() when dereferencing a task's cgroup subsys pointers.
   * See "The task_lock() exception", at the end of this comment.
   *
   * A task must hold cgroup_mutex to modify cgroups.
   *
   * Any task can increment and decrement the count field without lock.
   * So in general, code holding cgroup_mutex can't rely on the count
   * field not changing.  However, if the count goes to zero, then only
956db3ca0   Cliff Wickman   hotplug cpu: move...
664
   * cgroup_attach_task() can increment it again.  Because a count of zero
ddbcc7e8e   Paul Menage   Task Control Grou...
665
666
667
668
669
670
671
672
   * means that no tasks are currently attached, therefore there is no
   * way a task attached to that cgroup can fork (the other way to
   * increment the count).  So code holding cgroup_mutex can safely
   * assume that if the count is zero, it will stay zero. Similarly, if
   * a task holds cgroup_mutex on a cgroup with zero count, it
   * knows that the cgroup won't be removed, as cgroup_rmdir()
   * needs that mutex.
   *
ddbcc7e8e   Paul Menage   Task Control Grou...
673
674
675
676
677
   * The fork and exit callbacks cgroup_fork() and cgroup_exit(), don't
   * (usually) take cgroup_mutex.  These are the two most performance
   * critical pieces of code here.  The exception occurs on cgroup_exit(),
   * when a task in a notify_on_release cgroup exits.  Then cgroup_mutex
   * is taken, and if the cgroup count is zero, a usermode call made
a043e3b2c   Li Zefan   cgroup: fix comments
678
679
   * to the release agent with the name of the cgroup (path relative to
   * the root of cgroup file system) as the argument.
ddbcc7e8e   Paul Menage   Task Control Grou...
680
681
682
683
684
685
686
687
688
689
690
   *
   * A cgroup can only be deleted if both its 'count' of using tasks
   * is zero, and its list of 'children' cgroups is empty.  Since all
   * tasks in the system use _some_ cgroup, and since there is always at
   * least one task in the system (init, pid == 1), therefore, top_cgroup
   * always has either children cgroups and/or using tasks.  So we don't
   * need a special hack to ensure that top_cgroup cannot be deleted.
   *
   *	The task_lock() exception
   *
   * The need for this exception arises from the action of
956db3ca0   Cliff Wickman   hotplug cpu: move...
691
   * cgroup_attach_task(), which overwrites one task's cgroup pointer with
a043e3b2c   Li Zefan   cgroup: fix comments
692
   * another.  It does so using cgroup_mutex, however there are
ddbcc7e8e   Paul Menage   Task Control Grou...
693
694
695
   * several performance critical places that need to reference
   * task->cgroup without the expense of grabbing a system global
   * mutex.  Therefore except as noted below, when dereferencing or, as
956db3ca0   Cliff Wickman   hotplug cpu: move...
696
   * in cgroup_attach_task(), modifying a task's cgroup pointer we use
ddbcc7e8e   Paul Menage   Task Control Grou...
697
698
699
700
   * task_lock(), which acts on a spinlock (task->alloc_lock) already in
   * the task_struct routinely used for such matters.
   *
   * P.S.  One more locking exception.  RCU is used to guard the
956db3ca0   Cliff Wickman   hotplug cpu: move...
701
   * update of a task's cgroup pointer by cgroup_attach_task()
ddbcc7e8e   Paul Menage   Task Control Grou...
702
   */
ddbcc7e8e   Paul Menage   Task Control Grou...
703
704
705
706
  /**
   * cgroup_lock - lock out any changes to cgroup structures
   *
   */
ddbcc7e8e   Paul Menage   Task Control Grou...
707
708
709
710
  void cgroup_lock(void)
  {
  	/* Acquire the global cgroup_mutex; released by cgroup_unlock(). */
  	mutex_lock(&cgroup_mutex);
  }
67523c48a   Ben Blum   cgroups: blkio su...
711
  EXPORT_SYMBOL_GPL(cgroup_lock);
ddbcc7e8e   Paul Menage   Task Control Grou...
712
713
714
715
716
717
  
  /**
   * cgroup_unlock - release lock on cgroup changes
   *
   * Undo the lock taken in a previous cgroup_lock() call.
   */
ddbcc7e8e   Paul Menage   Task Control Grou...
718
719
720
721
  void cgroup_unlock(void)
  {
  	/* Release the global cgroup_mutex taken by cgroup_lock(). */
  	mutex_unlock(&cgroup_mutex);
  }
67523c48a   Ben Blum   cgroups: blkio su...
722
  EXPORT_SYMBOL_GPL(cgroup_unlock);
ddbcc7e8e   Paul Menage   Task Control Grou...
723
724
725
726
727
728
729
730
731
  
  /*
   * A couple of forward declarations required, due to cyclic reference loop:
   * cgroup_mkdir -> cgroup_create -> cgroup_populate_dir ->
   * cgroup_add_file -> cgroup_create_file -> cgroup_dir_inode_operations
   * -> cgroup_mkdir.
   */
  
  static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, int mode);
c72a04e34   Al Viro   cgroup_fs: fix cg...
732
  static struct dentry *cgroup_lookup(struct inode *, struct dentry *, struct nameidata *);
ddbcc7e8e   Paul Menage   Task Control Grou...
733
  static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry);
bd89aabc6   Paul Menage   Control groups: R...
734
  static int cgroup_populate_dir(struct cgroup *cgrp);
6e1d5dcc2   Alexey Dobriyan   const: mark remai...
735
  static const struct inode_operations cgroup_dir_inode_operations;
828c09509   Alexey Dobriyan   const: constify r...
736
  static const struct file_operations proc_cgroupstats_operations;
a424316ca   Paul Menage   Task Control Grou...
737
738
  
  /*
   * Backing-device descriptor shared by all cgroup inodes: cgroup_new_inode()
   * points every new inode's mapping at this object.
   */
  static struct backing_dev_info cgroup_backing_dev_info = {
d993831fa   Jens Axboe   writeback: add na...
739
  	.name		= "cgroup",
e4ad08fe6   Miklos Szeredi   mm: bdi: add sepa...
740
  	.capabilities	= BDI_CAP_NO_ACCT_AND_WRITEBACK,
a424316ca   Paul Menage   Task Control Grou...
741
  };
ddbcc7e8e   Paul Menage   Task Control Grou...
742

38460b48d   KAMEZAWA Hiroyuki   cgroup: CSS ID su...
743
744
  static int alloc_css_id(struct cgroup_subsys *ss,
  			struct cgroup *parent, struct cgroup *child);
ddbcc7e8e   Paul Menage   Task Control Grou...
745
746
747
  /*
   * Allocate an inode on @sb and fill in the fields every cgroup inode shares:
   * a fresh inode number, the requested @mode, the caller's fsuid/fsgid, the
   * current time for all three timestamps, and the cgroup backing_dev_info.
   *
   * Returns the new inode, or NULL if new_inode() failed.
   */
  static struct inode *cgroup_new_inode(mode_t mode, struct super_block *sb)
  {
  	struct inode *inode = new_inode(sb);

  	if (!inode)
  		return NULL;

  	inode->i_ino = get_next_ino();
  	inode->i_mode = mode;
  	inode->i_uid = current_fsuid();
  	inode->i_gid = current_fsgid();
  	inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
  	inode->i_mapping->backing_dev_info = &cgroup_backing_dev_info;

  	return inode;
  }
4fca88c87   KAMEZAWA Hiroyuki   memory cgroup enh...
759
760
761
762
  /*
   * Call subsys's pre_destroy handler.
   * This is called before css refcnt check.
   */
ec64f5154   KAMEZAWA Hiroyuki   cgroup: fix frequ...
763
  static int cgroup_call_pre_destroy(struct cgroup *cgrp)
4fca88c87   KAMEZAWA Hiroyuki   memory cgroup enh...
764
765
  {
  	struct cgroup_subsys *ss;
ec64f5154   KAMEZAWA Hiroyuki   cgroup: fix frequ...
766
  	int ret = 0;
4fca88c87   KAMEZAWA Hiroyuki   memory cgroup enh...
767
  	for_each_subsys(cgrp->root, ss)
ec64f5154   KAMEZAWA Hiroyuki   cgroup: fix frequ...
768
769
770
  		if (ss->pre_destroy) {
  			ret = ss->pre_destroy(ss, cgrp);
  			if (ret)
4ab78683c   Kirill A. Shutemov   cgroups: fix race...
771
  				break;
ec64f5154   KAMEZAWA Hiroyuki   cgroup: fix frequ...
772
  		}
0dea11687   Kirill A. Shutemov   cgroup: implement...
773

ec64f5154   KAMEZAWA Hiroyuki   cgroup: fix frequ...
774
  	return ret;
4fca88c87   KAMEZAWA Hiroyuki   memory cgroup enh...
775
  }
a47295e6b   Paul Menage   cgroups: make cgr...
776
777
778
779
780
781
  /*
   * RCU callback queued by cgroup_diput(): recover the cgroup that embeds
   * @obj as its rcu_head and free it.
   */
  static void free_cgroup_rcu(struct rcu_head *obj)
  {
  	kfree(container_of(obj, struct cgroup, rcu_head));
  }
ddbcc7e8e   Paul Menage   Task Control Grou...
782
783
784
785
  static void cgroup_diput(struct dentry *dentry, struct inode *inode)
  {
  	/* is dentry a directory ? if so, kfree() associated cgroup */
  	if (S_ISDIR(inode->i_mode)) {
bd89aabc6   Paul Menage   Control groups: R...
786
  		struct cgroup *cgrp = dentry->d_fsdata;
8dc4f3e17   Paul Menage   cgroups: move cgr...
787
  		struct cgroup_subsys *ss;
bd89aabc6   Paul Menage   Control groups: R...
788
  		BUG_ON(!(cgroup_is_removed(cgrp)));
81a6a5cdd   Paul Menage   Task Control Grou...
789
790
791
792
793
794
795
  		/* It's possible for external users to be holding css
  		 * reference counts on a cgroup; css_put() needs to
  		 * be able to access the cgroup after decrementing
  		 * the reference count in order to know if it needs to
  		 * queue the cgroup to be handled by the release
  		 * agent */
  		synchronize_rcu();
8dc4f3e17   Paul Menage   cgroups: move cgr...
796
797
798
799
800
  
  		mutex_lock(&cgroup_mutex);
  		/*
  		 * Release the subsystem state objects.
  		 */
75139b827   Li Zefan   cgroups: remove s...
801
802
  		for_each_subsys(cgrp->root, ss)
  			ss->destroy(ss, cgrp);
8dc4f3e17   Paul Menage   cgroups: move cgr...
803
804
805
  
  		cgrp->root->number_of_cgroups--;
  		mutex_unlock(&cgroup_mutex);
a47295e6b   Paul Menage   cgroups: make cgr...
806
807
808
809
  		/*
  		 * Drop the active superblock reference that we took when we
  		 * created the cgroup
  		 */
8dc4f3e17   Paul Menage   cgroups: move cgr...
810
  		deactivate_super(cgrp->root->sb);
72a8cb30d   Ben Blum   cgroups: ensure c...
811
812
813
814
815
  		/*
  		 * if we're getting rid of the cgroup, refcount should ensure
  		 * that there are no pidlists left.
  		 */
  		BUG_ON(!list_empty(&cgrp->pidlists));
a47295e6b   Paul Menage   cgroups: make cgr...
816
  		call_rcu(&cgrp->rcu_head, free_cgroup_rcu);
ddbcc7e8e   Paul Menage   Task Control Grou...
817
818
819
  	}
  	iput(inode);
  }
c72a04e34   Al Viro   cgroup_fs: fix cg...
820
821
822
823
  /*
   * Unconditionally returns 1 for any dentry @d.
   * NOTE(review): presumably wired up as the dentry d_delete operation so that
   * unused cgroup dentries are never cached -- confirm against the
   * dentry_operations table elsewhere in this file.
   */
  static int cgroup_delete(const struct dentry *d)
  {
  	return 1;
  }
ddbcc7e8e   Paul Menage   Task Control Grou...
824
825
826
827
828
829
830
831
832
833
834
835
836
837
  /*
   * Remove directory dentry @d from its parent: d_delete() it, then
   * simple_rmdir() it from the parent's inode.
   */
  static void remove_dir(struct dentry *d)
  {
  	/* Hold a reference on the parent for the duration of the removal. */
  	struct dentry *parent = dget(d->d_parent);

  	d_delete(d);
  	simple_rmdir(parent->d_inode, d);
  	dput(parent);
  }
  
  static void cgroup_clear_directory(struct dentry *dentry)
  {
  	struct list_head *node;
  
  	BUG_ON(!mutex_is_locked(&dentry->d_inode->i_mutex));
2fd6b7f50   Nick Piggin   fs: dcache scale ...
838
  	spin_lock(&dentry->d_lock);
ddbcc7e8e   Paul Menage   Task Control Grou...
839
840
841
  	node = dentry->d_subdirs.next;
  	while (node != &dentry->d_subdirs) {
  		struct dentry *d = list_entry(node, struct dentry, d_u.d_child);
2fd6b7f50   Nick Piggin   fs: dcache scale ...
842
843
  
  		spin_lock_nested(&d->d_lock, DENTRY_D_LOCK_NESTED);
ddbcc7e8e   Paul Menage   Task Control Grou...
844
845
846
847
848
  		list_del_init(node);
  		if (d->d_inode) {
  			/* This should never be called on a cgroup
  			 * directory with child cgroups */
  			BUG_ON(d->d_inode->i_mode & S_IFDIR);
dc0474be3   Nick Piggin   fs: dcache ration...
849
  			dget_dlock(d);
2fd6b7f50   Nick Piggin   fs: dcache scale ...
850
851
  			spin_unlock(&d->d_lock);
  			spin_unlock(&dentry->d_lock);
ddbcc7e8e   Paul Menage   Task Control Grou...
852
853
854
  			d_delete(d);
  			simple_unlink(dentry->d_inode, d);
  			dput(d);
2fd6b7f50   Nick Piggin   fs: dcache scale ...
855
856
857
  			spin_lock(&dentry->d_lock);
  		} else
  			spin_unlock(&d->d_lock);
ddbcc7e8e   Paul Menage   Task Control Grou...
858
859
  		node = dentry->d_subdirs.next;
  	}
2fd6b7f50   Nick Piggin   fs: dcache scale ...
860
  	spin_unlock(&dentry->d_lock);
ddbcc7e8e   Paul Menage   Task Control Grou...
861
862
863
864
865
866
867
  }
  
  /*
   * NOTE : the dentry must have been dget()'ed
   */
  /*
   * Remove the cgroup directory @dentry: unlink all of its entries, detach it
   * from its parent's child list, then remove the directory itself via
   * remove_dir().
   */
  static void cgroup_d_remove_dir(struct dentry *dentry)
  {
2fd6b7f50   Nick Piggin   fs: dcache scale ...
868
  	struct dentry *parent;
ddbcc7e8e   Paul Menage   Task Control Grou...
869
  	cgroup_clear_directory(dentry);
2fd6b7f50   Nick Piggin   fs: dcache scale ...
870
871
  	/* Lock order: parent's d_lock first, then the child's (nested). */
  	parent = dentry->d_parent;
  	spin_lock(&parent->d_lock);
3ec762ad8   Li Zefan   cgroups: Fix a lo...
872
  	spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
ddbcc7e8e   Paul Menage   Task Control Grou...
873
  	list_del_init(&dentry->d_u.d_child);
2fd6b7f50   Nick Piggin   fs: dcache scale ...
874
875
  	spin_unlock(&dentry->d_lock);
  	spin_unlock(&parent->d_lock);
ddbcc7e8e   Paul Menage   Task Control Grou...
876
877
  	/* Both d_locks are released before the actual removal. */
  	remove_dir(dentry);
  }
ec64f5154   KAMEZAWA Hiroyuki   cgroup: fix frequ...
878
879
880
881
882
883
  /*
   * A queue for waiters to do rmdir() cgroup. A task will sleep when
   * cgroup->count == 0 && list_empty(&cgroup->children) && subsys has some
   * reference to css->refcnt. In general, this refcnt is expected to go down
   * to zero, soon.
   *
887032670   KAMEZAWA Hiroyuki   cgroup avoid perm...
884
   * CGRP_WAIT_ON_RMDIR flag is set under cgroup's inode->i_mutex;
ec64f5154   KAMEZAWA Hiroyuki   cgroup: fix frequ...
885
886
   */
  DECLARE_WAIT_QUEUE_HEAD(cgroup_rmdir_waitq);
887032670   KAMEZAWA Hiroyuki   cgroup avoid perm...
887
  /*
   * If @cgrp has CGRP_WAIT_ON_RMDIR set, atomically clear it and wake every
   * task sleeping on cgroup_rmdir_waitq.  Does nothing when the flag is clear.
   */
  static void cgroup_wakeup_rmdir_waiter(struct cgroup *cgrp)
ec64f5154   KAMEZAWA Hiroyuki   cgroup: fix frequ...
888
  {
887032670   KAMEZAWA Hiroyuki   cgroup avoid perm...
889
  	if (unlikely(test_and_clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags)))
ec64f5154   KAMEZAWA Hiroyuki   cgroup: fix frequ...
890
891
  		wake_up_all(&cgroup_rmdir_waitq);
  }
887032670   KAMEZAWA Hiroyuki   cgroup avoid perm...
892
893
894
895
896
897
898
899
900
901
  /*
   * Take a reference on @css.  The matching release is
   * cgroup_release_and_wakeup_rmdir(), which drops it with css_put().
   */
  void cgroup_exclude_rmdir(struct cgroup_subsys_state *css)
  {
  	css_get(css);
  }
  
  /*
   * Wake any rmdir() waiter on @css's cgroup (if CGRP_WAIT_ON_RMDIR is set)
   * and then drop the reference on @css.
   */
  void cgroup_release_and_wakeup_rmdir(struct cgroup_subsys_state *css)
  {
  	/* Wake before the put: css->cgroup is read while the ref is held. */
  	cgroup_wakeup_rmdir_waiter(css->cgroup);
  	css_put(css);
  }
aae8aab40   Ben Blum   cgroups: revamp s...
902
  /*
cf5d5941f   Ben Blum   cgroups: subsyste...
903
904
905
   * Call with cgroup_mutex held. Drops reference counts on modules, including
   * any duplicate ones that parse_cgroupfs_options took. If this function
   * returns an error, no reference counts are touched.
aae8aab40   Ben Blum   cgroups: revamp s...
906
   */
ddbcc7e8e   Paul Menage   Task Control Grou...
907
908
909
910
  static int rebind_subsystems(struct cgroupfs_root *root,
  			      unsigned long final_bits)
  {
  	unsigned long added_bits, removed_bits;
bd89aabc6   Paul Menage   Control groups: R...
911
  	struct cgroup *cgrp = &root->top_cgroup;
ddbcc7e8e   Paul Menage   Task Control Grou...
912
  	int i;
aae8aab40   Ben Blum   cgroups: revamp s...
913
  	BUG_ON(!mutex_is_locked(&cgroup_mutex));
ddbcc7e8e   Paul Menage   Task Control Grou...
914
915
916
917
  	removed_bits = root->actual_subsys_bits & ~final_bits;
  	added_bits = final_bits & ~root->actual_subsys_bits;
  	/* Check that any added subsystems are currently free */
  	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
8d53d55d2   Li Zefan   cgroup: fix subsy...
918
  		unsigned long bit = 1UL << i;
ddbcc7e8e   Paul Menage   Task Control Grou...
919
920
921
  		struct cgroup_subsys *ss = subsys[i];
  		if (!(bit & added_bits))
  			continue;
aae8aab40   Ben Blum   cgroups: revamp s...
922
923
924
925
926
927
  		/*
  		 * Nobody should tell us to do a subsys that doesn't exist:
  		 * parse_cgroupfs_options should catch that case and refcounts
  		 * ensure that subsystems won't disappear once selected.
  		 */
  		BUG_ON(ss == NULL);
ddbcc7e8e   Paul Menage   Task Control Grou...
928
929
930
931
932
933
934
935
936
937
  		if (ss->root != &rootnode) {
  			/* Subsystem isn't free */
  			return -EBUSY;
  		}
  	}
  
  	/* Currently we don't handle adding/removing subsystems when
  	 * any child cgroups exist. This is theoretically supportable
  	 * but involves complex error handling, so it's being left until
  	 * later */
307257cf4   Paul Menage   cgroups: fix a ra...
938
  	if (root->number_of_cgroups > 1)
ddbcc7e8e   Paul Menage   Task Control Grou...
939
940
941
942
943
944
945
946
  		return -EBUSY;
  
  	/* Process each subsystem */
  	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
  		struct cgroup_subsys *ss = subsys[i];
  		unsigned long bit = 1UL << i;
  		if (bit & added_bits) {
  			/* We're binding this subsystem to this hierarchy */
aae8aab40   Ben Blum   cgroups: revamp s...
947
  			BUG_ON(ss == NULL);
bd89aabc6   Paul Menage   Control groups: R...
948
  			BUG_ON(cgrp->subsys[i]);
ddbcc7e8e   Paul Menage   Task Control Grou...
949
950
  			BUG_ON(!dummytop->subsys[i]);
  			BUG_ON(dummytop->subsys[i]->cgroup != dummytop);
999cd8a45   Paul Menage   cgroups: add a pe...
951
  			mutex_lock(&ss->hierarchy_mutex);
bd89aabc6   Paul Menage   Control groups: R...
952
953
  			cgrp->subsys[i] = dummytop->subsys[i];
  			cgrp->subsys[i]->cgroup = cgrp;
33a68ac1c   Li Zefan   cgroups: add inac...
954
  			list_move(&ss->sibling, &root->subsys_list);
b2aa30f7b   Lai Jiangshan   cgroups: don't pu...
955
  			ss->root = root;
ddbcc7e8e   Paul Menage   Task Control Grou...
956
  			if (ss->bind)
bd89aabc6   Paul Menage   Control groups: R...
957
  				ss->bind(ss, cgrp);
999cd8a45   Paul Menage   cgroups: add a pe...
958
  			mutex_unlock(&ss->hierarchy_mutex);
cf5d5941f   Ben Blum   cgroups: subsyste...
959
  			/* refcount was already taken, and we're keeping it */
ddbcc7e8e   Paul Menage   Task Control Grou...
960
961
  		} else if (bit & removed_bits) {
  			/* We're removing this subsystem */
aae8aab40   Ben Blum   cgroups: revamp s...
962
  			BUG_ON(ss == NULL);
bd89aabc6   Paul Menage   Control groups: R...
963
964
  			BUG_ON(cgrp->subsys[i] != dummytop->subsys[i]);
  			BUG_ON(cgrp->subsys[i]->cgroup != cgrp);
999cd8a45   Paul Menage   cgroups: add a pe...
965
  			mutex_lock(&ss->hierarchy_mutex);
ddbcc7e8e   Paul Menage   Task Control Grou...
966
967
968
  			if (ss->bind)
  				ss->bind(ss, dummytop);
  			dummytop->subsys[i]->cgroup = dummytop;
bd89aabc6   Paul Menage   Control groups: R...
969
  			cgrp->subsys[i] = NULL;
b2aa30f7b   Lai Jiangshan   cgroups: don't pu...
970
  			subsys[i]->root = &rootnode;
33a68ac1c   Li Zefan   cgroups: add inac...
971
  			list_move(&ss->sibling, &rootnode.subsys_list);
999cd8a45   Paul Menage   cgroups: add a pe...
972
  			mutex_unlock(&ss->hierarchy_mutex);
cf5d5941f   Ben Blum   cgroups: subsyste...
973
974
  			/* subsystem is now free - drop reference on module */
  			module_put(ss->module);
ddbcc7e8e   Paul Menage   Task Control Grou...
975
976
  		} else if (bit & final_bits) {
  			/* Subsystem state should already exist */
aae8aab40   Ben Blum   cgroups: revamp s...
977
  			BUG_ON(ss == NULL);
bd89aabc6   Paul Menage   Control groups: R...
978
  			BUG_ON(!cgrp->subsys[i]);
cf5d5941f   Ben Blum   cgroups: subsyste...
979
980
981
982
983
984
985
986
  			/*
  			 * a refcount was taken, but we already had one, so
  			 * drop the extra reference.
  			 */
  			module_put(ss->module);
  #ifdef CONFIG_MODULE_UNLOAD
  			BUG_ON(ss->module && !module_refcount(ss->module));
  #endif
ddbcc7e8e   Paul Menage   Task Control Grou...
987
988
  		} else {
  			/* Subsystem state shouldn't exist */
bd89aabc6   Paul Menage   Control groups: R...
989
  			BUG_ON(cgrp->subsys[i]);
ddbcc7e8e   Paul Menage   Task Control Grou...
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
  		}
  	}
  	root->subsys_bits = root->actual_subsys_bits = final_bits;
  	synchronize_rcu();
  
  	return 0;
  }
  
  static int cgroup_show_options(struct seq_file *seq, struct vfsmount *vfs)
  {
  	struct cgroupfs_root *root = vfs->mnt_sb->s_fs_info;
  	struct cgroup_subsys *ss;
  
  	mutex_lock(&cgroup_mutex);
  	for_each_subsys(root, ss)
  		seq_printf(seq, ",%s", ss->name);
  	if (test_bit(ROOT_NOPREFIX, &root->flags))
  		seq_puts(seq, ",noprefix");
81a6a5cdd   Paul Menage   Task Control Grou...
1008
1009
  	if (strlen(root->release_agent_path))
  		seq_printf(seq, ",release_agent=%s", root->release_agent_path);
97978e6d1   Daniel Lezcano   cgroup: add clone...
1010
1011
  	if (clone_children(&root->top_cgroup))
  		seq_puts(seq, ",clone_children");
c6d57f331   Paul Menage   cgroups: support ...
1012
1013
  	if (strlen(root->name))
  		seq_printf(seq, ",name=%s", root->name);
ddbcc7e8e   Paul Menage   Task Control Grou...
1014
1015
1016
1017
1018
1019
1020
  	mutex_unlock(&cgroup_mutex);
  	return 0;
  }
  
  /*
   * Parsed form of a cgroup mount/remount option string, filled in by
   * parse_cgroupfs_options().
   */
  struct cgroup_sb_opts {
  	/* One bit per selected subsystem, indexed like subsys[] */
  	unsigned long subsys_bits;
  	/* Root flag bits; the parser sets ROOT_NOPREFIX for "noprefix" */
  	unsigned long flags;
81a6a5cdd   Paul Menage   Task Control Grou...
1021
  	/* kstrndup()'ed value of "release_agent=", or NULL if not given */
  	char *release_agent;
97978e6d1   Daniel Lezcano   cgroup: add clone...
1022
  	/* Set when the "clone_children" option is present */
  	bool clone_children;
c6d57f331   Paul Menage   cgroups: support ...
1023
  	/* kstrndup()'ed value of "name=", or NULL if not given */
  	char *name;
2c6ab6d20   Paul Menage   cgroups: allow cg...
1024
1025
  	/* User explicitly requested empty subsystem */
  	bool none;
c6d57f331   Paul Menage   cgroups: support ...
1026
1027
  
  	/* NOTE(review): not set by the parser; presumably filled in by the mount path -- confirm */
  	struct cgroupfs_root *new_root;
2c6ab6d20   Paul Menage   cgroups: allow cg...
1028

ddbcc7e8e   Paul Menage   Task Control Grou...
1029
  };
aae8aab40   Ben Blum   cgroups: revamp s...
1030
1031
  /*
   * Convert a hierarchy specifier into a bitmask of subsystems and flags. Call
cf5d5941f   Ben Blum   cgroups: subsyste...
1032
1033
1034
   * with cgroup_mutex held to protect the subsys[] array. This function takes
   * refcounts on subsystems to be used, unless it returns error, in which case
   * no refcounts are taken.
aae8aab40   Ben Blum   cgroups: revamp s...
1035
   */
cf5d5941f   Ben Blum   cgroups: subsyste...
1036
  static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
ddbcc7e8e   Paul Menage   Task Control Grou...
1037
  {
32a8cf235   Daniel Lezcano   cgroup: make the ...
1038
1039
  	char *token, *o = data;
  	bool all_ss = false, one_ss = false;
f9ab5b5b0   Li Zefan   cgroups: forbid n...
1040
  	unsigned long mask = (unsigned long)-1;
cf5d5941f   Ben Blum   cgroups: subsyste...
1041
1042
  	int i;
  	bool module_pin_failed = false;
f9ab5b5b0   Li Zefan   cgroups: forbid n...
1043

aae8aab40   Ben Blum   cgroups: revamp s...
1044
  	BUG_ON(!mutex_is_locked(&cgroup_mutex));
f9ab5b5b0   Li Zefan   cgroups: forbid n...
1045
1046
1047
  #ifdef CONFIG_CPUSETS
  	mask = ~(1UL << cpuset_subsys_id);
  #endif
ddbcc7e8e   Paul Menage   Task Control Grou...
1048

c6d57f331   Paul Menage   cgroups: support ...
1049
  	memset(opts, 0, sizeof(*opts));
ddbcc7e8e   Paul Menage   Task Control Grou...
1050
1051
1052
1053
  
  	while ((token = strsep(&o, ",")) != NULL) {
  		if (!*token)
  			return -EINVAL;
32a8cf235   Daniel Lezcano   cgroup: make the ...
1054
  		if (!strcmp(token, "none")) {
2c6ab6d20   Paul Menage   cgroups: allow cg...
1055
1056
  			/* Explicitly have no subsystems */
  			opts->none = true;
32a8cf235   Daniel Lezcano   cgroup: make the ...
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
  			continue;
  		}
  		if (!strcmp(token, "all")) {
  			/* Mutually exclusive option 'all' + subsystem name */
  			if (one_ss)
  				return -EINVAL;
  			all_ss = true;
  			continue;
  		}
  		if (!strcmp(token, "noprefix")) {
ddbcc7e8e   Paul Menage   Task Control Grou...
1067
  			set_bit(ROOT_NOPREFIX, &opts->flags);
32a8cf235   Daniel Lezcano   cgroup: make the ...
1068
1069
1070
  			continue;
  		}
  		if (!strcmp(token, "clone_children")) {
97978e6d1   Daniel Lezcano   cgroup: add clone...
1071
  			opts->clone_children = true;
32a8cf235   Daniel Lezcano   cgroup: make the ...
1072
1073
1074
  			continue;
  		}
  		if (!strncmp(token, "release_agent=", 14)) {
81a6a5cdd   Paul Menage   Task Control Grou...
1075
1076
1077
  			/* Specifying two release agents is forbidden */
  			if (opts->release_agent)
  				return -EINVAL;
c6d57f331   Paul Menage   cgroups: support ...
1078
  			opts->release_agent =
e400c2852   Dan Carpenter   cgroups: save spa...
1079
  				kstrndup(token + 14, PATH_MAX - 1, GFP_KERNEL);
81a6a5cdd   Paul Menage   Task Control Grou...
1080
1081
  			if (!opts->release_agent)
  				return -ENOMEM;
32a8cf235   Daniel Lezcano   cgroup: make the ...
1082
1083
1084
  			continue;
  		}
  		if (!strncmp(token, "name=", 5)) {
c6d57f331   Paul Menage   cgroups: support ...
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
  			const char *name = token + 5;
  			/* Can't specify an empty name */
  			if (!strlen(name))
  				return -EINVAL;
  			/* Must match [\w.-]+ */
  			for (i = 0; i < strlen(name); i++) {
  				char c = name[i];
  				if (isalnum(c))
  					continue;
  				if ((c == '.') || (c == '-') || (c == '_'))
  					continue;
  				return -EINVAL;
  			}
  			/* Specifying two names is forbidden */
  			if (opts->name)
  				return -EINVAL;
  			opts->name = kstrndup(name,
e400c2852   Dan Carpenter   cgroups: save spa...
1102
  					      MAX_CGROUP_ROOT_NAMELEN - 1,
c6d57f331   Paul Menage   cgroups: support ...
1103
1104
1105
  					      GFP_KERNEL);
  			if (!opts->name)
  				return -ENOMEM;
32a8cf235   Daniel Lezcano   cgroup: make the ...
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
  
  			continue;
  		}
  
  		for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
  			struct cgroup_subsys *ss = subsys[i];
  			if (ss == NULL)
  				continue;
  			if (strcmp(token, ss->name))
  				continue;
  			if (ss->disabled)
  				continue;
  
  			/* Mutually exclusive option 'all' + subsystem name */
  			if (all_ss)
  				return -EINVAL;
  			set_bit(i, &opts->subsys_bits);
  			one_ss = true;
  
  			break;
  		}
  		if (i == CGROUP_SUBSYS_COUNT)
  			return -ENOENT;
  	}
  
  	/*
  	 * If the 'all' option was specified select all the subsystems,
  	 * otherwise 'all, 'none' and a subsystem name options were not
  	 * specified, let's default to 'all'
  	 */
  	if (all_ss || (!all_ss && !one_ss && !opts->none)) {
  		for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
  			struct cgroup_subsys *ss = subsys[i];
  			if (ss == NULL)
  				continue;
  			if (ss->disabled)
  				continue;
  			set_bit(i, &opts->subsys_bits);
ddbcc7e8e   Paul Menage   Task Control Grou...
1144
1145
  		}
  	}
2c6ab6d20   Paul Menage   cgroups: allow cg...
1146
  	/* Consistency checks */
f9ab5b5b0   Li Zefan   cgroups: forbid n...
1147
1148
1149
1150
1151
1152
1153
1154
  	/*
  	 * Option noprefix was introduced just for backward compatibility
  	 * with the old cpuset, so we allow noprefix only if mounting just
  	 * the cpuset subsystem.
  	 */
  	if (test_bit(ROOT_NOPREFIX, &opts->flags) &&
  	    (opts->subsys_bits & mask))
  		return -EINVAL;
2c6ab6d20   Paul Menage   cgroups: allow cg...
1155
1156
1157
1158
1159
1160
1161
1162
1163
  
  	/* Can't specify "none" and some subsystems */
  	if (opts->subsys_bits && opts->none)
  		return -EINVAL;
  
  	/*
  	 * We either have to specify by name or by subsystems. (So all
  	 * empty hierarchies must have a name).
  	 */
c6d57f331   Paul Menage   cgroups: support ...
1164
  	if (!opts->subsys_bits && !opts->name)
ddbcc7e8e   Paul Menage   Task Control Grou...
1165
  		return -EINVAL;
cf5d5941f   Ben Blum   cgroups: subsyste...
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
  	/*
  	 * Grab references on all the modules we'll need, so the subsystems
  	 * don't dance around before rebind_subsystems attaches them. This may
  	 * take duplicate reference counts on a subsystem that's already used,
  	 * but rebind_subsystems handles this case.
  	 */
  	for (i = CGROUP_BUILTIN_SUBSYS_COUNT; i < CGROUP_SUBSYS_COUNT; i++) {
  		unsigned long bit = 1UL << i;
  
  		if (!(bit & opts->subsys_bits))
  			continue;
  		if (!try_module_get(subsys[i]->module)) {
  			module_pin_failed = true;
  			break;
  		}
  	}
  	if (module_pin_failed) {
  		/*
  		 * oops, one of the modules was going away. this means that we
  		 * raced with a module_delete call, and to the user this is
  		 * essentially a "subsystem doesn't exist" case.
  		 */
  		for (i--; i >= CGROUP_BUILTIN_SUBSYS_COUNT; i--) {
  			/* drop refcounts only on the ones we took */
  			unsigned long bit = 1UL << i;
  
  			if (!(bit & opts->subsys_bits))
  				continue;
  			module_put(subsys[i]->module);
  		}
  		return -ENOENT;
  	}
ddbcc7e8e   Paul Menage   Task Control Grou...
1198
1199
  	return 0;
  }
cf5d5941f   Ben Blum   cgroups: subsyste...
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
  /*
   * Drop the module reference of every modular subsystem (index at or above
   * CGROUP_BUILTIN_SUBSYS_COUNT) whose bit is set in @subsys_bits.
   */
  static void drop_parsed_module_refcounts(unsigned long subsys_bits)
  {
  	int idx = CGROUP_BUILTIN_SUBSYS_COUNT;

  	while (idx < CGROUP_SUBSYS_COUNT) {
  		if (subsys_bits & (1UL << idx))
  			module_put(subsys[idx]->module);
  		idx++;
  	}
  }
ddbcc7e8e   Paul Menage   Task Control Grou...
1211
1212
1213
1214
  static int cgroup_remount(struct super_block *sb, int *flags, char *data)
  {
  	int ret = 0;
  	struct cgroupfs_root *root = sb->s_fs_info;
bd89aabc6   Paul Menage   Control groups: R...
1215
  	struct cgroup *cgrp = &root->top_cgroup;
ddbcc7e8e   Paul Menage   Task Control Grou...
1216
  	struct cgroup_sb_opts opts;
bd89aabc6   Paul Menage   Control groups: R...
1217
  	mutex_lock(&cgrp->dentry->d_inode->i_mutex);
ddbcc7e8e   Paul Menage   Task Control Grou...
1218
1219
1220
1221
1222
1223
  	mutex_lock(&cgroup_mutex);
  
  	/* See what subsystems are wanted */
  	ret = parse_cgroupfs_options(data, &opts);
  	if (ret)
  		goto out_unlock;
cf5d5941f   Ben Blum   cgroups: subsyste...
1224
1225
1226
  	/* Don't allow flags or name to change at remount */
  	if (opts.flags != root->flags ||
  	    (opts.name && strcmp(opts.name, root->name))) {
c6d57f331   Paul Menage   cgroups: support ...
1227
  		ret = -EINVAL;
cf5d5941f   Ben Blum   cgroups: subsyste...
1228
  		drop_parsed_module_refcounts(opts.subsys_bits);
c6d57f331   Paul Menage   cgroups: support ...
1229
1230
  		goto out_unlock;
  	}
ddbcc7e8e   Paul Menage   Task Control Grou...
1231
  	ret = rebind_subsystems(root, opts.subsys_bits);
cf5d5941f   Ben Blum   cgroups: subsyste...
1232
1233
  	if (ret) {
  		drop_parsed_module_refcounts(opts.subsys_bits);
0670e08bd   Li Zefan   cgroups: don't ch...
1234
  		goto out_unlock;
cf5d5941f   Ben Blum   cgroups: subsyste...
1235
  	}
ddbcc7e8e   Paul Menage   Task Control Grou...
1236
1237
  
  	/* (re)populate subsystem files */
0670e08bd   Li Zefan   cgroups: don't ch...
1238
  	cgroup_populate_dir(cgrp);
ddbcc7e8e   Paul Menage   Task Control Grou...
1239

81a6a5cdd   Paul Menage   Task Control Grou...
1240
1241
  	if (opts.release_agent)
  		strcpy(root->release_agent_path, opts.release_agent);
ddbcc7e8e   Paul Menage   Task Control Grou...
1242
   out_unlock:
66bdc9cfc   Jesper Juhl   kernel/cgroup.c: ...
1243
  	kfree(opts.release_agent);
c6d57f331   Paul Menage   cgroups: support ...
1244
  	kfree(opts.name);
ddbcc7e8e   Paul Menage   Task Control Grou...
1245
  	mutex_unlock(&cgroup_mutex);
bd89aabc6   Paul Menage   Control groups: R...
1246
  	mutex_unlock(&cgrp->dentry->d_inode->i_mutex);
ddbcc7e8e   Paul Menage   Task Control Grou...
1247
1248
  	return ret;
  }
b87221de6   Alexey Dobriyan   const: mark remai...
1249
  /*
   * Superblock operations for cgroup filesystems: generic helpers for statfs
   * and inode dropping, plus the cgroup-specific option display and remount
   * handlers defined in this file.
   */
  static const struct super_operations cgroup_ops = {
ddbcc7e8e   Paul Menage   Task Control Grou...
1250
1251
1252
1253
1254
  	.statfs = simple_statfs,
  	.drop_inode = generic_delete_inode,
  	.show_options = cgroup_show_options,
  	.remount_fs = cgroup_remount,
  };
cc31edcee   Paul Menage   cgroups: convert ...
1255
1256
1257
1258
1259
1260
  /*
   * Initialise the list heads and locks embedded in @cgrp.  Does not touch
   * any other field of the cgroup.
   */
  static void init_cgroup_housekeeping(struct cgroup *cgrp)
  {
  	/* hierarchy linkage */
  	INIT_LIST_HEAD(&cgrp->children);
  	INIT_LIST_HEAD(&cgrp->sibling);

  	/* css_set and release-agent bookkeeping */
  	INIT_LIST_HEAD(&cgrp->css_sets);
  	INIT_LIST_HEAD(&cgrp->release_list);

  	/* pidlists and their mutex */
  	mutex_init(&cgrp->pidlist_mutex);
  	INIT_LIST_HEAD(&cgrp->pidlists);

  	/* event list and its spinlock */
  	spin_lock_init(&cgrp->event_list_lock);
  	INIT_LIST_HEAD(&cgrp->event_list);
  }
c6d57f331   Paul Menage   cgroups: support ...
1266

ddbcc7e8e   Paul Menage   Task Control Grou...
1267
1268
  static void init_cgroup_root(struct cgroupfs_root *root)
  {
bd89aabc6   Paul Menage   Control groups: R...
1269
  	struct cgroup *cgrp = &root->top_cgroup;
ddbcc7e8e   Paul Menage   Task Control Grou...
1270
1271
1272
  	INIT_LIST_HEAD(&root->subsys_list);
  	INIT_LIST_HEAD(&root->root_list);
  	root->number_of_cgroups = 1;
bd89aabc6   Paul Menage   Control groups: R...
1273
1274
  	cgrp->root = root;
  	cgrp->top_cgroup = cgrp;
cc31edcee   Paul Menage   cgroups: convert ...
1275
  	init_cgroup_housekeeping(cgrp);
ddbcc7e8e   Paul Menage   Task Control Grou...
1276
  }
2c6ab6d20   Paul Menage   cgroups: allow cg...
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
  /*
   * Allocate a hierarchy ID for @root from hierarchy_ida, preferring the next
   * unused ID at or above next_hierarchy_id and wrapping around to the lowest
   * free ID when that range is exhausted.  The allocation is retried for as
   * long as the ida reports a non-zero result.
   *
   * Returns true once root->hierarchy_id has been assigned, false if
   * ida_pre_get() could not preallocate memory.
   */
  static bool init_root_id(struct cgroupfs_root *root)
  {
  	int err;

  	for (;;) {
  		if (!ida_pre_get(&hierarchy_ida, GFP_KERNEL))
  			return false;

  		spin_lock(&hierarchy_id_lock);
  		/* Try to allocate the next unused ID */
  		err = ida_get_new_above(&hierarchy_ida, next_hierarchy_id,
  					&root->hierarchy_id);
  		/* Try again starting from 0 */
  		if (err == -ENOSPC)
  			err = ida_get_new(&hierarchy_ida, &root->hierarchy_id);

  		if (err == 0) {
  			next_hierarchy_id = root->hierarchy_id + 1;
  		} else {
  			/* Can only get here if the 31-bit IDR is full ... */
  			BUG_ON(err != -EAGAIN);
  		}
  		spin_unlock(&hierarchy_id_lock);

  		if (!err)
  			return true;
  	}
  }
ddbcc7e8e   Paul Menage   Task Control Grou...
1301
1302
  static int cgroup_test_super(struct super_block *sb, void *data)
  {
c6d57f331   Paul Menage   cgroups: support ...
1303
  	struct cgroup_sb_opts *opts = data;
ddbcc7e8e   Paul Menage   Task Control Grou...
1304
  	struct cgroupfs_root *root = sb->s_fs_info;
c6d57f331   Paul Menage   cgroups: support ...
1305
1306
1307
  	/* If we asked for a name then it must match */
  	if (opts->name && strcmp(opts->name, root->name))
  		return 0;
ddbcc7e8e   Paul Menage   Task Control Grou...
1308

2c6ab6d20   Paul Menage   cgroups: allow cg...
1309
1310
1311
1312
1313
1314
  	/*
  	 * If we asked for subsystems (or explicitly for no
  	 * subsystems) then they must match
  	 */
  	if ((opts->subsys_bits || opts->none)
  	    && (opts->subsys_bits != root->subsys_bits))
ddbcc7e8e   Paul Menage   Task Control Grou...
1315
1316
1317
1318
  		return 0;
  
  	return 1;
  }
c6d57f331   Paul Menage   cgroups: support ...
1319
1320
1321
  static struct cgroupfs_root *cgroup_root_from_opts(struct cgroup_sb_opts *opts)
  {
  	struct cgroupfs_root *root;
2c6ab6d20   Paul Menage   cgroups: allow cg...
1322
  	if (!opts->subsys_bits && !opts->none)
c6d57f331   Paul Menage   cgroups: support ...
1323
1324
1325
1326
1327
  		return NULL;
  
  	root = kzalloc(sizeof(*root), GFP_KERNEL);
  	if (!root)
  		return ERR_PTR(-ENOMEM);
2c6ab6d20   Paul Menage   cgroups: allow cg...
1328
1329
1330
1331
  	if (!init_root_id(root)) {
  		kfree(root);
  		return ERR_PTR(-ENOMEM);
  	}
c6d57f331   Paul Menage   cgroups: support ...
1332
  	init_cgroup_root(root);
2c6ab6d20   Paul Menage   cgroups: allow cg...
1333

c6d57f331   Paul Menage   cgroups: support ...
1334
1335
1336
1337
1338
1339
  	root->subsys_bits = opts->subsys_bits;
  	root->flags = opts->flags;
  	if (opts->release_agent)
  		strcpy(root->release_agent_path, opts->release_agent);
  	if (opts->name)
  		strcpy(root->name, opts->name);
97978e6d1   Daniel Lezcano   cgroup: add clone...
1340
1341
  	if (opts->clone_children)
  		set_bit(CGRP_CLONE_CHILDREN, &root->top_cgroup.flags);
c6d57f331   Paul Menage   cgroups: support ...
1342
1343
  	return root;
  }
2c6ab6d20   Paul Menage   cgroups: allow cg...
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
  /*
   * cgroup_drop_root - release a hierarchy root
   * @root: the root to free; NULL is accepted and ignored
   *
   * Gives the root's hierarchy ID back to hierarchy_ida and frees the
   * structure.  A root without a hierarchy ID is a bug.
   */
  static void cgroup_drop_root(struct cgroupfs_root *root)
  {
  	if (root) {
  		BUG_ON(root->hierarchy_id == 0);

  		spin_lock(&hierarchy_id_lock);
  		ida_remove(&hierarchy_ida, root->hierarchy_id);
  		spin_unlock(&hierarchy_id_lock);

  		kfree(root);
  	}
  }
ddbcc7e8e   Paul Menage   Task Control Grou...
1355
1356
1357
  static int cgroup_set_super(struct super_block *sb, void *data)
  {
  	int ret;
c6d57f331   Paul Menage   cgroups: support ...
1358
1359
1360
1361
1362
  	struct cgroup_sb_opts *opts = data;
  
  	/* If we don't have a new root, we can't set up a new sb */
  	if (!opts->new_root)
  		return -EINVAL;
2c6ab6d20   Paul Menage   cgroups: allow cg...
1363
  	BUG_ON(!opts->subsys_bits && !opts->none);
ddbcc7e8e   Paul Menage   Task Control Grou...
1364
1365
1366
1367
  
  	ret = set_anon_super(sb, NULL);
  	if (ret)
  		return ret;
c6d57f331   Paul Menage   cgroups: support ...
1368
1369
  	sb->s_fs_info = opts->new_root;
  	opts->new_root->sb = sb;
ddbcc7e8e   Paul Menage   Task Control Grou...
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
  
  	sb->s_blocksize = PAGE_CACHE_SIZE;
  	sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
  	sb->s_magic = CGROUP_SUPER_MAGIC;
  	sb->s_op = &cgroup_ops;
  
  	return 0;
  }
  
  static int cgroup_get_rootdir(struct super_block *sb)
  {
0df6a63f8   Al Viro   switch cgroup
1381
1382
  	static const struct dentry_operations cgroup_dops = {
  		.d_iput = cgroup_diput,
c72a04e34   Al Viro   cgroup_fs: fix cg...
1383
  		.d_delete = cgroup_delete,
0df6a63f8   Al Viro   switch cgroup
1384
  	};
ddbcc7e8e   Paul Menage   Task Control Grou...
1385
1386
1387
1388
1389
1390
  	struct inode *inode =
  		cgroup_new_inode(S_IFDIR | S_IRUGO | S_IXUGO | S_IWUSR, sb);
  	struct dentry *dentry;
  
  	if (!inode)
  		return -ENOMEM;
ddbcc7e8e   Paul Menage   Task Control Grou...
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
  	inode->i_fop = &simple_dir_operations;
  	inode->i_op = &cgroup_dir_inode_operations;
  	/* directories start off with i_nlink == 2 (for "." entry) */
  	inc_nlink(inode);
  	dentry = d_alloc_root(inode);
  	if (!dentry) {
  		iput(inode);
  		return -ENOMEM;
  	}
  	sb->s_root = dentry;
0df6a63f8   Al Viro   switch cgroup
1401
1402
  	/* for everything else we want ->d_op set */
  	sb->s_d_op = &cgroup_dops;
ddbcc7e8e   Paul Menage   Task Control Grou...
1403
1404
  	return 0;
  }
f7e835710   Al Viro   convert cgroup an...
1405
  /*
   * cgroup_mount - the ->mount handler of cgroup_fs_type
   * @fs_type: filesystem type, handed on to sget()
   * @flags: not referenced in the body
   * @unused_dev_name: not referenced in the body
   * @data: mount option data, handed to parse_cgroupfs_options()
   *
   * Parses the options, prepares a candidate root and asks sget() for a
   * matching or new superblock.  If the candidate root ended up attached to
   * the superblock, the new hierarchy is populated; otherwise the candidate
   * is dropped again.
   *
   * Returns a reference on the superblock's root dentry on success or an
   * ERR_PTR() carrying the error code on failure.
   */
  static struct dentry *cgroup_mount(struct file_system_type *fs_type,
ddbcc7e8e   Paul Menage   Task Control Grou...
1406
  			 int flags, const char *unused_dev_name,
f7e835710   Al Viro   convert cgroup an...
1407
  			 void *data)
ddbcc7e8e   Paul Menage   Task Control Grou...
1408
1409
  {
  	struct cgroup_sb_opts opts;
c6d57f331   Paul Menage   cgroups: support ...
1410
  	struct cgroupfs_root *root;
ddbcc7e8e   Paul Menage   Task Control Grou...
1411
1412
  	int ret = 0;
  	struct super_block *sb;
c6d57f331   Paul Menage   cgroups: support ...
1413
  	struct cgroupfs_root *new_root;
ddbcc7e8e   Paul Menage   Task Control Grou...
1414
1415
  
  	/* First find the desired set of subsystems */
aae8aab40   Ben Blum   cgroups: revamp s...
1416
  	mutex_lock(&cgroup_mutex);
ddbcc7e8e   Paul Menage   Task Control Grou...
1417
  	ret = parse_cgroupfs_options(data, &opts);
aae8aab40   Ben Blum   cgroups: revamp s...
1418
  	mutex_unlock(&cgroup_mutex);
c6d57f331   Paul Menage   cgroups: support ...
1419
1420
  	/* on a parse error only the option strings are freed (out_err) */
  	if (ret)
  		goto out_err;
ddbcc7e8e   Paul Menage   Task Control Grou...
1421

c6d57f331   Paul Menage   cgroups: support ...
1422
1423
1424
1425
1426
1427
1428
  	/*
  	 * Allocate a new cgroup root. We may not need it if we're
  	 * reusing an existing hierarchy.
  	 */
  	new_root = cgroup_root_from_opts(&opts);
  	if (IS_ERR(new_root)) {
  		ret = PTR_ERR(new_root);
cf5d5941f   Ben Blum   cgroups: subsyste...
1429
  		goto drop_modules;
81a6a5cdd   Paul Menage   Task Control Grou...
1430
  	}
c6d57f331   Paul Menage   cgroups: support ...
1431
  	/* may be NULL; cgroup_set_super() then refuses to create a new sb */
  	opts.new_root = new_root;
ddbcc7e8e   Paul Menage   Task Control Grou...
1432

c6d57f331   Paul Menage   cgroups: support ...
1433
1434
  	/* Locate an existing or new sb for this hierarchy */
  	sb = sget(fs_type, cgroup_test_super, cgroup_set_super, &opts);
ddbcc7e8e   Paul Menage   Task Control Grou...
1435
  	if (IS_ERR(sb)) {
c6d57f331   Paul Menage   cgroups: support ...
1436
  		ret = PTR_ERR(sb);
2c6ab6d20   Paul Menage   cgroups: allow cg...
1437
  		cgroup_drop_root(opts.new_root);
cf5d5941f   Ben Blum   cgroups: subsyste...
1438
  		goto drop_modules;
ddbcc7e8e   Paul Menage   Task Control Grou...
1439
  	}
c6d57f331   Paul Menage   cgroups: support ...
1440
1441
1442
1443
1444
  	root = sb->s_fs_info;
  	BUG_ON(!root);
  	if (root == opts.new_root) {
  		/* We used the new root structure, so this is a new hierarchy */
  		struct list_head tmp_cg_links;
c12f65d43   Li Zefan   cgroups: introduc...
1445
  		struct cgroup *root_cgrp = &root->top_cgroup;
817929ec2   Paul Menage   Task Control Grou...
1446
  		struct inode *inode;
c6d57f331   Paul Menage   cgroups: support ...
1447
  		struct cgroupfs_root *existing_root;
28fd5dfc1   Li Zefan   cgroups: remove t...
1448
  		int i;
ddbcc7e8e   Paul Menage   Task Control Grou...
1449
1450
1451
1452
1453
1454
  
  		BUG_ON(sb->s_root != NULL);
  
  		ret = cgroup_get_rootdir(sb);
  		if (ret)
  			goto drop_new_super;
817929ec2   Paul Menage   Task Control Grou...
1455
  		inode = sb->s_root->d_inode;
ddbcc7e8e   Paul Menage   Task Control Grou...
1456

817929ec2   Paul Menage   Task Control Grou...
1457
  		/*
  		 * Lock order: the root directory's i_mutex first, then
  		 * cgroup_mutex.  Every exit from this branch releases them
  		 * in the reverse order.
  		 */
  		mutex_lock(&inode->i_mutex);
ddbcc7e8e   Paul Menage   Task Control Grou...
1458
  		mutex_lock(&cgroup_mutex);
c6d57f331   Paul Menage   cgroups: support ...
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
  		if (strlen(root->name)) {
  			/* Check for name clashes with existing mounts */
  			for_each_active_root(existing_root) {
  				if (!strcmp(existing_root->name, root->name)) {
  					ret = -EBUSY;
  					mutex_unlock(&cgroup_mutex);
  					mutex_unlock(&inode->i_mutex);
  					goto drop_new_super;
  				}
  			}
  		}
817929ec2   Paul Menage   Task Control Grou...
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
  		/*
  		 * We're accessing css_set_count without locking
  		 * css_set_lock here, but that's OK - it can only be
  		 * increased by someone holding cgroup_lock, and
  		 * that's us. The worst that can happen is that we
  		 * have some link structures left over
  		 */
  		ret = allocate_cg_links(css_set_count, &tmp_cg_links);
  		if (ret) {
  			mutex_unlock(&cgroup_mutex);
  			mutex_unlock(&inode->i_mutex);
  			goto drop_new_super;
  		}
ddbcc7e8e   Paul Menage   Task Control Grou...
1483
1484
1485
  		/* bind the requested subsystems to the new root */
  		ret = rebind_subsystems(root, root->subsys_bits);
  		if (ret == -EBUSY) {
  			mutex_unlock(&cgroup_mutex);
817929ec2   Paul Menage   Task Control Grou...
1486
  			mutex_unlock(&inode->i_mutex);
c6d57f331   Paul Menage   cgroups: support ...
1487
1488
  			free_cg_links(&tmp_cg_links);
  			goto drop_new_super;
ddbcc7e8e   Paul Menage   Task Control Grou...
1489
  		}
cf5d5941f   Ben Blum   cgroups: subsyste...
1490
1491
1492
1493
1494
  		/*
  		 * There must be no failure case after here, since rebinding
  		 * takes care of subsystems' refcounts, which are explicitly
  		 * dropped in the failure exit path.
  		 */
ddbcc7e8e   Paul Menage   Task Control Grou...
1495
1496
1497
1498
1499
  
  		/* EBUSY should be the only error here */
  		BUG_ON(ret);
  
  		/* publish the root on the global list of roots */
  		list_add(&root->root_list, &roots);
817929ec2   Paul Menage   Task Control Grou...
1500
  		root_count++;
ddbcc7e8e   Paul Menage   Task Control Grou...
1501

c12f65d43   Li Zefan   cgroups: introduc...
1502
  		/* cross-link the root dentry and the top cgroup */
  		sb->s_root->d_fsdata = root_cgrp;
ddbcc7e8e   Paul Menage   Task Control Grou...
1503
  		root->top_cgroup.dentry = sb->s_root;
817929ec2   Paul Menage   Task Control Grou...
1504
1505
1506
  		/* Link the top cgroup in this hierarchy into all
  		 * the css_set objects */
  		write_lock(&css_set_lock);
28fd5dfc1   Li Zefan   cgroups: remove t...
1507
1508
1509
  		for (i = 0; i < CSS_SET_TABLE_SIZE; i++) {
  			struct hlist_head *hhead = &css_set_table[i];
  			struct hlist_node *node;
817929ec2   Paul Menage   Task Control Grou...
1510
  			struct css_set *cg;
28fd5dfc1   Li Zefan   cgroups: remove t...
1511

c12f65d43   Li Zefan   cgroups: introduc...
1512
1513
  			hlist_for_each_entry(cg, node, hhead, hlist)
  				link_css_set(&tmp_cg_links, cg, root_cgrp);
28fd5dfc1   Li Zefan   cgroups: remove t...
1514
  		}
817929ec2   Paul Menage   Task Control Grou...
1515
1516
1517
  		write_unlock(&css_set_lock);
  
  		/* release whatever is still on the temporary link list */
  		free_cg_links(&tmp_cg_links);
c12f65d43   Li Zefan   cgroups: introduc...
1518
1519
  		BUG_ON(!list_empty(&root_cgrp->sibling));
  		BUG_ON(!list_empty(&root_cgrp->children));
ddbcc7e8e   Paul Menage   Task Control Grou...
1520
  		BUG_ON(root->number_of_cgroups != 1);
c12f65d43   Li Zefan   cgroups: introduc...
1521
  		cgroup_populate_dir(root_cgrp);
ddbcc7e8e   Paul Menage   Task Control Grou...
1522
  		mutex_unlock(&cgroup_mutex);
34f77a90f   Xiaotian Feng   cgroups: make unl...
1523
  		mutex_unlock(&inode->i_mutex);
c6d57f331   Paul Menage   cgroups: support ...
1524
1525
1526
1527
1528
  	} else {
  		/*
  		 * We re-used an existing hierarchy - the new root (if
  		 * any) is not needed
  		 */
2c6ab6d20   Paul Menage   cgroups: allow cg...
1529
  		cgroup_drop_root(opts.new_root);
cf5d5941f   Ben Blum   cgroups: subsyste...
1530
1531
  		/* no subsys rebinding, so refcounts don't change */
  		drop_parsed_module_refcounts(opts.subsys_bits);
ddbcc7e8e   Paul Menage   Task Control Grou...
1532
  	}
c6d57f331   Paul Menage   cgroups: support ...
1533
1534
  	/* success: the option strings are no longer needed */
  	kfree(opts.release_agent);
  	kfree(opts.name);
f7e835710   Al Viro   convert cgroup an...
1535
  	return dget(sb->s_root);
ddbcc7e8e   Paul Menage   Task Control Grou...
1536
1537
  
  	/* failure exits: each label falls through into the ones below it */
   drop_new_super:
6f5bbff9a   Al Viro   Convert obvious p...
1538
  	deactivate_locked_super(sb);
cf5d5941f   Ben Blum   cgroups: subsyste...
1539
1540
   drop_modules:
  	drop_parsed_module_refcounts(opts.subsys_bits);
c6d57f331   Paul Menage   cgroups: support ...
1541
1542
1543
   out_err:
  	kfree(opts.release_agent);
  	kfree(opts.name);
f7e835710   Al Viro   convert cgroup an...
1544
  	return ERR_PTR(ret);
ddbcc7e8e   Paul Menage   Task Control Grou...
1545
1546
1547
1548
  }
  
  static void cgroup_kill_sb(struct super_block *sb) {
  	struct cgroupfs_root *root = sb->s_fs_info;
bd89aabc6   Paul Menage   Control groups: R...
1549
  	struct cgroup *cgrp = &root->top_cgroup;
ddbcc7e8e   Paul Menage   Task Control Grou...
1550
  	int ret;
71cbb949d   KOSAKI Motohiro   cgroup: list_for_...
1551
1552
  	struct cg_cgroup_link *link;
  	struct cg_cgroup_link *saved_link;
ddbcc7e8e   Paul Menage   Task Control Grou...
1553
1554
1555
1556
  
  	BUG_ON(!root);
  
  	BUG_ON(root->number_of_cgroups != 1);
bd89aabc6   Paul Menage   Control groups: R...
1557
1558
  	BUG_ON(!list_empty(&cgrp->children));
  	BUG_ON(!list_empty(&cgrp->sibling));
ddbcc7e8e   Paul Menage   Task Control Grou...
1559
1560
1561
1562
1563
1564
1565
  
  	mutex_lock(&cgroup_mutex);
  
  	/* Rebind all subsystems back to the default hierarchy */
  	ret = rebind_subsystems(root, 0);
  	/* Shouldn't be able to fail ... */
  	BUG_ON(ret);
817929ec2   Paul Menage   Task Control Grou...
1566
1567
1568
1569
1570
  	/*
  	 * Release all the links from css_sets to this hierarchy's
  	 * root cgroup
  	 */
  	write_lock(&css_set_lock);
71cbb949d   KOSAKI Motohiro   cgroup: list_for_...
1571
1572
1573
  
  	list_for_each_entry_safe(link, saved_link, &cgrp->css_sets,
  				 cgrp_link_list) {
817929ec2   Paul Menage   Task Control Grou...
1574
  		list_del(&link->cg_link_list);
bd89aabc6   Paul Menage   Control groups: R...
1575
  		list_del(&link->cgrp_link_list);
817929ec2   Paul Menage   Task Control Grou...
1576
1577
1578
  		kfree(link);
  	}
  	write_unlock(&css_set_lock);
839ec5452   Paul Menage   cgroup: fix root_...
1579
1580
1581
1582
  	if (!list_empty(&root->root_list)) {
  		list_del(&root->root_list);
  		root_count--;
  	}
e5f6a8609   Li Zefan   cgroups: make roo...
1583

ddbcc7e8e   Paul Menage   Task Control Grou...
1584
  	mutex_unlock(&cgroup_mutex);
ddbcc7e8e   Paul Menage   Task Control Grou...
1585
  	kill_litter_super(sb);
2c6ab6d20   Paul Menage   cgroups: allow cg...
1586
  	cgroup_drop_root(root);
ddbcc7e8e   Paul Menage   Task Control Grou...
1587
1588
1589
1590
  }
  
  static struct file_system_type cgroup_fs_type = {
  	.name = "cgroup",
f7e835710   Al Viro   convert cgroup an...
1591
  	.mount = cgroup_mount,
ddbcc7e8e   Paul Menage   Task Control Grou...
1592
1593
  	.kill_sb = cgroup_kill_sb,
  };
676db4af0   Greg Kroah-Hartman   cgroupfs: create ...
1594
  /* NOTE(review): not referenced in this part of the file; purpose to be confirmed at its users */
  static struct kobject *cgroup_kobj;
bd89aabc6   Paul Menage   Control groups: R...
1595
  static inline struct cgroup *__d_cgrp(struct dentry *dentry)
ddbcc7e8e   Paul Menage   Task Control Grou...
1596
1597
1598
1599
1600
1601
1602
1603
  {
  	return dentry->d_fsdata;
  }
  
  /* Interpret a dentry's ->d_fsdata as the cftype stored there. */
  static inline struct cftype *__d_cft(struct dentry *dentry)
  {
  	struct cftype *cft = dentry->d_fsdata;

  	return cft;
  }
a043e3b2c   Li Zefan   cgroup: fix comments
1604
1605
1606
1607
1608
1609
  /**
   * cgroup_path - generate the path of a cgroup
   * @cgrp: the cgroup in question
   * @buf: the buffer to write the path into
   * @buflen: the length of the buffer
   *
a47295e6b   Paul Menage   cgroups: make cgr...
1610
1611
1612
   * Called with cgroup_mutex held or else with an RCU-protected cgroup
   * reference.  Writes path of cgroup into buf.  Returns 0 on success,
   * -errno on error.
ddbcc7e8e   Paul Menage   Task Control Grou...
1613
   */
bd89aabc6   Paul Menage   Control groups: R...
1614
  int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
ddbcc7e8e   Paul Menage   Task Control Grou...
1615
1616
  {
  	char *start;
9a9686b63   Li Zefan   cgroup: Fix an RC...
1617
1618
1619
  	struct dentry *dentry = rcu_dereference_check(cgrp->dentry,
  						      rcu_read_lock_held() ||
  						      cgroup_lock_is_held());
ddbcc7e8e   Paul Menage   Task Control Grou...
1620

a47295e6b   Paul Menage   cgroups: make cgr...
1621
  	if (!dentry || cgrp == dummytop) {
ddbcc7e8e   Paul Menage   Task Control Grou...
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
  		/*
  		 * Inactive subsystems have no dentry for their root
  		 * cgroup
  		 */
  		strcpy(buf, "/");
  		return 0;
  	}
  
  	start = buf + buflen;
  
  	*--start = '\0';
  	for (;;) {
a47295e6b   Paul Menage   cgroups: make cgr...
1634
  		int len = dentry->d_name.len;
9a9686b63   Li Zefan   cgroup: Fix an RC...
1635

ddbcc7e8e   Paul Menage   Task Control Grou...
1636
1637
  		if ((start -= len) < buf)
  			return -ENAMETOOLONG;
9a9686b63   Li Zefan   cgroup: Fix an RC...
1638
  		memcpy(start, dentry->d_name.name, len);
bd89aabc6   Paul Menage   Control groups: R...
1639
1640
  		cgrp = cgrp->parent;
  		if (!cgrp)
ddbcc7e8e   Paul Menage   Task Control Grou...
1641
  			break;
9a9686b63   Li Zefan   cgroup: Fix an RC...
1642
1643
1644
1645
  
  		dentry = rcu_dereference_check(cgrp->dentry,
  					       rcu_read_lock_held() ||
  					       cgroup_lock_is_held());
bd89aabc6   Paul Menage   Control groups: R...
1646
  		if (!cgrp->parent)
ddbcc7e8e   Paul Menage   Task Control Grou...
1647
1648
1649
1650
1651
1652
1653
1654
  			continue;
  		if (--start < buf)
  			return -ENAMETOOLONG;
  		*start = '/';
  	}
  	memmove(buf, start, buf + buflen - start);
  	return 0;
  }
67523c48a   Ben Blum   cgroups: blkio su...
1655
  EXPORT_SYMBOL_GPL(cgroup_path);
ddbcc7e8e   Paul Menage   Task Control Grou...
1656

a043e3b2c   Li Zefan   cgroup: fix comments
1657
1658
1659
1660
  /**
   * cgroup_attach_task - attach task 'tsk' to cgroup 'cgrp'
   * @cgrp: the cgroup the task is attaching to
   * @tsk: the task to be attached
bbcb81d09   Paul Menage   Task Control Grou...
1661
   *
a043e3b2c   Li Zefan   cgroup: fix comments
1662
1663
   * Call holding cgroup_mutex. May take task_lock of
   * the task 'tsk' during call.
bbcb81d09   Paul Menage   Task Control Grou...
1664
   */
956db3ca0   Cliff Wickman   hotplug cpu: move...
1665
  /*
   * Returns 0 when the task already is in @cgrp or was moved, the error
   * of the first failing ->can_attach() callback, -ENOMEM when no css_set
   * could be found or allocated, or -ESRCH when the task is exiting.
   */
  int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
bbcb81d09   Paul Menage   Task Control Grou...
1666
1667
  {
  	int retval = 0;
2468c7234   Daisuke Nishimura   cgroup: introduce...
1668
  	struct cgroup_subsys *ss, *failed_ss = NULL;
bd89aabc6   Paul Menage   Control groups: R...
1669
  	struct cgroup *oldcgrp;
77efecd9e   Lai Jiangshan   cgroups: call fin...
1670
  	struct css_set *cg;
817929ec2   Paul Menage   Task Control Grou...
1671
  	struct css_set *newcg;
bd89aabc6   Paul Menage   Control groups: R...
1672
  	struct cgroupfs_root *root = cgrp->root;
bbcb81d09   Paul Menage   Task Control Grou...
1673
1674
  
  	/* Nothing to do if the task is already in that cgroup */
7717f7ba9   Paul Menage   cgroups: add a ba...
1675
  	oldcgrp = task_cgroup_from_root(tsk, root);
bd89aabc6   Paul Menage   Control groups: R...
1676
  	if (cgrp == oldcgrp)
bbcb81d09   Paul Menage   Task Control Grou...
1677
1678
1679
1680
  		return 0;
  
  	/* every subsystem of this hierarchy may veto the move */
  	for_each_subsys(root, ss) {
  		if (ss->can_attach) {
be367d099   Ben Blum   cgroups: let ss->...
1681
  			retval = ss->can_attach(ss, cgrp, tsk, false);
2468c7234   Daisuke Nishimura   cgroup: introduce...
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
  			if (retval) {
  				/*
  				 * Remember on which subsystem the can_attach()
  				 * failed, so that we only call cancel_attach()
  				 * against the subsystems whose can_attach()
  				 * succeeded. (See below)
  				 */
  				failed_ss = ss;
  				goto out;
  			}
bbcb81d09   Paul Menage   Task Control Grou...
1692
1693
  		}
  	}
77efecd9e   Lai Jiangshan   cgroups: call fin...
1694
1695
1696
1697
  	/* hold a reference on the task's current css_set across find_css_set() */
  	task_lock(tsk);
  	cg = tsk->cgroups;
  	get_css_set(cg);
  	task_unlock(tsk);
817929ec2   Paul Menage   Task Control Grou...
1698
1699
1700
1701
  	/*
  	 * Locate or allocate a new css_set for this task,
  	 * based on its final set of cgroups
  	 */
bd89aabc6   Paul Menage   Control groups: R...
1702
  	newcg = find_css_set(cg, cgrp);
77efecd9e   Lai Jiangshan   cgroups: call fin...
1703
  	put_css_set(cg);
2468c7234   Daisuke Nishimura   cgroup: introduce...
1704
1705
1706
1707
  	if (!newcg) {
  		retval = -ENOMEM;
  		goto out;
  	}
817929ec2   Paul Menage   Task Control Grou...
1708

bbcb81d09   Paul Menage   Task Control Grou...
1709
1710
1711
  	task_lock(tsk);
  	/* an exiting task is not moved; give the new css_set back */
  	if (tsk->flags & PF_EXITING) {
  		task_unlock(tsk);
817929ec2   Paul Menage   Task Control Grou...
1712
  		put_css_set(newcg);
2468c7234   Daisuke Nishimura   cgroup: introduce...
1713
1714
  		retval = -ESRCH;
  		goto out;
bbcb81d09   Paul Menage   Task Control Grou...
1715
  	}
817929ec2   Paul Menage   Task Control Grou...
1716
  	/* switch the task over to the new css_set */
  	rcu_assign_pointer(tsk->cgroups, newcg);
bbcb81d09   Paul Menage   Task Control Grou...
1717
  	task_unlock(tsk);
817929ec2   Paul Menage   Task Control Grou...
1718
1719
1720
1721
1722
1723
1724
  	/* Update the css_set linked lists if we're using them */
  	write_lock(&css_set_lock);
  	if (!list_empty(&tsk->cg_list)) {
  		list_del(&tsk->cg_list);
  		list_add(&tsk->cg_list, &newcg->tasks);
  	}
  	write_unlock(&css_set_lock);
bbcb81d09   Paul Menage   Task Control Grou...
1725
  	/* tell every subsystem that the move has happened */
  	for_each_subsys(root, ss) {
e18f6318e   Paul Jackson   cgroup brace codi...
1726
  		if (ss->attach)
be367d099   Ben Blum   cgroups: let ss->...
1727
  			ss->attach(ss, cgrp, oldcgrp, tsk, false);
bbcb81d09   Paul Menage   Task Control Grou...
1728
  	}
bd89aabc6   Paul Menage   Control groups: R...
1729
  	set_bit(CGRP_RELEASABLE, &oldcgrp->flags);
bbcb81d09   Paul Menage   Task Control Grou...
1730
  	synchronize_rcu();
817929ec2   Paul Menage   Task Control Grou...
1731
  	/*
  	 * NOTE(review): presumably drops the reference that tsk->cgroups
  	 * used to hold on the old css_set - confirm against get_css_set()
  	 * users.
  	 */
  	put_css_set(cg);
ec64f5154   KAMEZAWA Hiroyuki   cgroup: fix frequ...
1732
1733
1734
1735
1736
  
  	/*
  	 * wake up rmdir() waiter. the rmdir should fail since the cgroup
  	 * is no longer empty.
  	 */
887032670   KAMEZAWA Hiroyuki   cgroup avoid perm...
1737
  	cgroup_wakeup_rmdir_waiter(cgrp);
2468c7234   Daisuke Nishimura   cgroup: introduce...
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
  out:
  	/* on failure, undo can_attach() for the subsystems that accepted */
  	if (retval) {
  		for_each_subsys(root, ss) {
  			if (ss == failed_ss)
  				/*
  				 * This subsystem was the one that failed the
  				 * can_attach() check earlier, so we don't need
  				 * to call cancel_attach() against it or any
  				 * remaining subsystems.
  				 */
  				break;
  			if (ss->cancel_attach)
  				ss->cancel_attach(ss, cgrp, tsk, false);
  		}
  	}
  	return retval;
bbcb81d09   Paul Menage   Task Control Grou...
1754
  }
d7926ee38   Sridhar Samudrala   cgroups: Add an A...
1755
  /**
31583bb0c   Michael S. Tsirkin   cgroups: fix API ...
1756
1757
   * cgroup_attach_task_all - attach task 'tsk' to all cgroups of task 'from'
   * @from: attach to all cgroups of a given task
d7926ee38   Sridhar Samudrala   cgroups: Add an A...
1758
1759
   * @tsk: the task to be attached
   */
31583bb0c   Michael S. Tsirkin   cgroups: fix API ...
1760
  int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
d7926ee38   Sridhar Samudrala   cgroups: Add an A...
1761
1762
  {
  	struct cgroupfs_root *root;
d7926ee38   Sridhar Samudrala   cgroups: Add an A...
1763
1764
1765
1766
  	int retval = 0;
  
  	cgroup_lock();
  	for_each_active_root(root) {
31583bb0c   Michael S. Tsirkin   cgroups: fix API ...
1767
1768
1769
  		struct cgroup *from_cg = task_cgroup_from_root(from, root);
  
  		retval = cgroup_attach_task(from_cg, tsk);
d7926ee38   Sridhar Samudrala   cgroups: Add an A...
1770
1771
1772
1773
1774
1775
1776
  		if (retval)
  			break;
  	}
  	cgroup_unlock();
  
  	return retval;
  }
31583bb0c   Michael S. Tsirkin   cgroups: fix API ...
1777
  EXPORT_SYMBOL_GPL(cgroup_attach_task_all);
d7926ee38   Sridhar Samudrala   cgroups: Add an A...
1778

bbcb81d09   Paul Menage   Task Control Grou...
1779
  /*
af351026a   Paul Menage   cgroup files: tur...
1780
1781
   * Attach task with pid 'pid' to cgroup 'cgrp'. Call with cgroup_mutex
   * held. May take task_lock of task
bbcb81d09   Paul Menage   Task Control Grou...
1782
   */
af351026a   Paul Menage   cgroup files: tur...
1783
  static int attach_task_by_pid(struct cgroup *cgrp, u64 pid)
bbcb81d09   Paul Menage   Task Control Grou...
1784
  {
bbcb81d09   Paul Menage   Task Control Grou...
1785
  	struct task_struct *tsk;
c69e8d9c0   David Howells   CRED: Use RCU to ...
1786
  	const struct cred *cred = current_cred(), *tcred;
bbcb81d09   Paul Menage   Task Control Grou...
1787
  	int ret;
bbcb81d09   Paul Menage   Task Control Grou...
1788
1789
  	if (pid) {
  		rcu_read_lock();
73507f335   Pavel Emelyanov   Handle pid namesp...
1790
  		tsk = find_task_by_vpid(pid);
bbcb81d09   Paul Menage   Task Control Grou...
1791
1792
1793
1794
  		if (!tsk || tsk->flags & PF_EXITING) {
  			rcu_read_unlock();
  			return -ESRCH;
  		}
bbcb81d09   Paul Menage   Task Control Grou...
1795

c69e8d9c0   David Howells   CRED: Use RCU to ...
1796
1797
1798
1799
1800
  		tcred = __task_cred(tsk);
  		if (cred->euid &&
  		    cred->euid != tcred->uid &&
  		    cred->euid != tcred->suid) {
  			rcu_read_unlock();
bbcb81d09   Paul Menage   Task Control Grou...
1801
1802
  			return -EACCES;
  		}
c69e8d9c0   David Howells   CRED: Use RCU to ...
1803
1804
  		get_task_struct(tsk);
  		rcu_read_unlock();
bbcb81d09   Paul Menage   Task Control Grou...
1805
1806
1807
1808
  	} else {
  		tsk = current;
  		get_task_struct(tsk);
  	}
956db3ca0   Cliff Wickman   hotplug cpu: move...
1809
  	ret = cgroup_attach_task(cgrp, tsk);
bbcb81d09   Paul Menage   Task Control Grou...
1810
1811
1812
  	put_task_struct(tsk);
  	return ret;
  }
af351026a   Paul Menage   cgroup files: tur...
1813
1814
1815
1816
1817
1818
1819
1820
1821
  /*
   * cgroup_tasks_write - attach the task with the written pid to @cgrp
   *
   * Returns -ENODEV if @cgrp has been removed, otherwise whatever
   * attach_task_by_pid() returns.
   */
  static int cgroup_tasks_write(struct cgroup *cgrp, struct cftype *cft, u64 pid)
  {
  	int ret = -ENODEV;

  	if (cgroup_lock_live_group(cgrp)) {
  		ret = attach_task_by_pid(cgrp, pid);
  		cgroup_unlock();
  	}
  	return ret;
  }
e788e066c   Paul Menage   cgroup files: mov...
1822
1823
1824
1825
  /**
   * cgroup_lock_live_group - take cgroup_mutex and check that cgrp is alive.
   * @cgrp: the cgroup to be checked for liveness
   *
84eea8428   Paul Menage   cgroups: misc cle...
1826
1827
   * On success, returns true; the lock should be later released with
   * cgroup_unlock(). On failure returns false with no lock held.
e788e066c   Paul Menage   cgroup files: mov...
1828
   */
84eea8428   Paul Menage   cgroups: misc cle...
1829
  bool cgroup_lock_live_group(struct cgroup *cgrp)
e788e066c   Paul Menage   cgroup files: mov...
1830
1831
1832
1833
1834
1835
1836
1837
  {
  	mutex_lock(&cgroup_mutex);
  	if (cgroup_is_removed(cgrp)) {
  		mutex_unlock(&cgroup_mutex);
  		return false;
  	}
  	return true;
  }
67523c48a   Ben Blum   cgroups: blkio su...
1838
  EXPORT_SYMBOL_GPL(cgroup_lock_live_group);
e788e066c   Paul Menage   cgroup files: mov...
1839
1840
1841
1842
1843
  
  static int cgroup_release_agent_write(struct cgroup *cgrp, struct cftype *cft,
  				      const char *buffer)
  {
  	BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX);
f4a2589fe   Evgeny Kuznetsov   cgroups: add chec...
1844
1845
  	if (strlen(buffer) >= PATH_MAX)
  		return -EINVAL;
e788e066c   Paul Menage   cgroup files: mov...
1846
1847
1848
  	if (!cgroup_lock_live_group(cgrp))
  		return -ENODEV;
  	strcpy(cgrp->root->release_agent_path, buffer);
84eea8428   Paul Menage   cgroups: misc cle...
1849
  	cgroup_unlock();
e788e066c   Paul Menage   cgroup files: mov...
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
  	return 0;
  }
  
  /*
   * cgroup_release_agent_show - print the hierarchy's release agent path
   * @cgrp: a cgroup of the hierarchy whose root holds the path
   * @cft: unused
   * @seq: seq_file to print into
   *
   * Emits the path followed by a newline.  Returns 0 on success or
   * -ENODEV if @cgrp has been removed.
   */
  static int cgroup_release_agent_show(struct cgroup *cgrp, struct cftype *cft,
  				     struct seq_file *seq)
  {
  	if (!cgroup_lock_live_group(cgrp))
  		return -ENODEV;
  	seq_puts(seq, cgrp->root->release_agent_path);
  	/* terminate the line; the literal must be the '\n' escape */
  	seq_putc(seq, '\n');
  	cgroup_unlock();
  	return 0;
  }
84eea8428   Paul Menage   cgroups: misc cle...
1864
1865
  /*
   * Size in bytes of the on-stack scratch buffers used by the file
   * read/write helpers below; big enough for numbers or short strings.
   */
  #define CGROUP_LOCAL_BUFFER_SIZE 64
e73d2c61d   Paul Menage   CGroups _s64 file...
1866
  /*
   * cgroup_write_X64 - parse a user-supplied number and hand it to the
   * cftype's write_u64() or write_s64() handler
   * @cgrp: the cgroup the file belongs to
   * @cft: the file type; write_u64 is preferred when it is set
   * @file: unused
   * @userbuf: user buffer holding the textual number
   * @nbytes: number of bytes in @userbuf
   * @unused_ppos: unused
   *
   * Returns @nbytes on success, -EINVAL for an empty write or trailing
   * garbage after the number, -E2BIG if the input does not fit the local
   * buffer, -EFAULT on a failed user copy, or the handler's error.
   */
  static ssize_t cgroup_write_X64(struct cgroup *cgrp, struct cftype *cft,
  				struct file *file,
  				const char __user *userbuf,
  				size_t nbytes, loff_t *unused_ppos)
  {
  	char buffer[CGROUP_LOCAL_BUFFER_SIZE];
  	char *text;
  	char *end;
  	int retval = 0;

  	if (nbytes == 0)
  		return -EINVAL;
  	/* one byte stays reserved for the terminating NUL */
  	if (nbytes >= sizeof(buffer))
  		return -E2BIG;
  	if (copy_from_user(buffer, userbuf, nbytes))
  		return -EFAULT;

  	buffer[nbytes] = 0;     /* nul-terminate */
  	text = strstrip(buffer);

  	if (cft->write_u64) {
  		u64 val = simple_strtoull(text, &end, 0);

  		if (*end)
  			return -EINVAL;
  		retval = cft->write_u64(cgrp, cft, val);
  	} else {
  		s64 val = simple_strtoll(text, &end, 0);

  		if (*end)
  			return -EINVAL;
  		retval = cft->write_s64(cgrp, cft, val);
  	}

  	/* a handler result of 0 means the whole write was consumed */
  	if (retval == 0)
  		retval = nbytes;
  	return retval;
  }
db3b14978   Paul Menage   cgroup files: add...
1898
1899
1900
1901
1902
  /*
   * cgroup_write_string - pass a user-supplied string to ->write_string()
   * @cgrp: the cgroup the file belongs to
   * @cft: the file type; max_write_len bounds the input (0 selects the
   *       capacity of the local buffer)
   * @file: unused
   * @userbuf: user buffer holding the string
   * @nbytes: number of bytes in @userbuf
   * @unused_ppos: unused
   *
   * The input is NUL-terminated and stripped of surrounding whitespace
   * before the handler sees it.
   *
   * Returns @nbytes on success, -E2BIG if the input is too long, -ENOMEM
   * if a larger buffer cannot be allocated, -EFAULT on a failed user copy,
   * or the handler's error.
   */
  static ssize_t cgroup_write_string(struct cgroup *cgrp, struct cftype *cft,
  				   struct file *file,
  				   const char __user *userbuf,
  				   size_t nbytes, loff_t *unused_ppos)
  {
  	char local_buffer[CGROUP_LOCAL_BUFFER_SIZE];
  	char *buffer = local_buffer;
  	size_t limit = cft->max_write_len;
  	int retval = 0;

  	if (limit == 0)
  		limit = sizeof(local_buffer) - 1;
  	if (nbytes >= limit)
  		return -E2BIG;

  	/* Allocate a dynamic buffer if we need one */
  	if (nbytes >= sizeof(local_buffer)) {
  		buffer = kmalloc(nbytes + 1, GFP_KERNEL);
  		if (buffer == NULL)
  			return -ENOMEM;
  	}

  	if (nbytes && copy_from_user(buffer, userbuf, nbytes)) {
  		retval = -EFAULT;
  	} else {
  		buffer[nbytes] = 0;     /* nul-terminate */
  		retval = cft->write_string(cgrp, cft, strstrip(buffer));
  		if (retval == 0)
  			retval = nbytes;
  	}

  	/* only a heap buffer has to be released */
  	if (buffer != local_buffer)
  		kfree(buffer);
  	return retval;
  }
ddbcc7e8e   Paul Menage   Task Control Grou...
1932
1933
1934
1935
  /*
   * write() entry point for cgroup control files.
   *
   * Dispatches to the first write-style handler the cftype provides, in this
   * order: ->write, ->write_u64/->write_s64, ->write_string, ->trigger.
   * Returns -ENODEV if the owning cgroup has been removed and -EINVAL if the
   * cftype has no write-style handler at all.
   */
  static ssize_t cgroup_file_write(struct file *file, const char __user *buf,
  						size_t nbytes, loff_t *ppos)
  {
  	struct dentry *dentry = file->f_dentry;
  	struct cftype *cft = __d_cft(dentry);
  	struct cgroup *cgrp = __d_cgrp(dentry->d_parent);
  	ssize_t ret;
  
  	if (cgroup_is_removed(cgrp)) {
  		ret = -ENODEV;
  	} else if (cft->write) {
  		ret = cft->write(cgrp, cft, file, buf, nbytes, ppos);
  	} else if (cft->write_u64 || cft->write_s64) {
  		ret = cgroup_write_X64(cgrp, cft, file, buf, nbytes, ppos);
  	} else if (cft->write_string) {
  		ret = cgroup_write_string(cgrp, cft, file, buf, nbytes, ppos);
  	} else if (cft->trigger) {
  		/* A trigger ignores the written data; success consumes it all */
  		ret = cft->trigger(cgrp, (unsigned int)cft->private);
  		if (!ret)
  			ret = nbytes;
  	} else {
  		ret = -EINVAL;
  	}
  	return ret;
  }
f4c753b7e   Paul Menage   CGroup API files:...
1952
1953
1954
1955
  /*
   * Generic read handler for control files that expose ->read_u64.
   *
   * Formats the value returned by cft->read_u64() as an unsigned decimal
   * followed by a newline, then copies the requested window of that text to
   * userspace via simple_read_from_buffer().  Returns whatever
   * simple_read_from_buffer() returns.
   */
  static ssize_t cgroup_read_u64(struct cgroup *cgrp, struct cftype *cft,
  			       struct file *file,
  			       char __user *buf, size_t nbytes,
  			       loff_t *ppos)
  {
  	/*
  	 * Worst case output is 20 digits + '\n' + NUL = 22 bytes.
  	 * NOTE(review): assumes CGROUP_LOCAL_BUFFER_SIZE >= 22 - confirm.
  	 */
  	char tmp[CGROUP_LOCAL_BUFFER_SIZE];
  	u64 val = cft->read_u64(cgrp, cft);
  	int len = sprintf(tmp, "%llu\n", (unsigned long long) val);
  
  	return simple_read_from_buffer(buf, nbytes, ppos, tmp, len);
  }
e73d2c61d   Paul Menage   CGroups _s64 file...
1964
1965
1966
1967
1968
  /*
   * Generic read handler for control files that expose ->read_s64.
   *
   * Formats the value returned by cft->read_s64() as a signed decimal
   * followed by a newline, then copies the requested window of that text to
   * userspace via simple_read_from_buffer().  Returns whatever
   * simple_read_from_buffer() returns.
   */
  static ssize_t cgroup_read_s64(struct cgroup *cgrp, struct cftype *cft,
  			       struct file *file,
  			       char __user *buf, size_t nbytes,
  			       loff_t *ppos)
  {
  	/*
  	 * Worst case output is sign + 19 digits + '\n' + NUL = 22 bytes.
  	 * NOTE(review): assumes CGROUP_LOCAL_BUFFER_SIZE >= 22 - confirm.
  	 */
  	char tmp[CGROUP_LOCAL_BUFFER_SIZE];
  	s64 val = cft->read_s64(cgrp, cft);
  	int len = sprintf(tmp, "%lld\n", (long long) val);
  
  	return simple_read_from_buffer(buf, nbytes, ppos, tmp, len);
  }
ddbcc7e8e   Paul Menage   Task Control Grou...
1976
1977
1978
1979
  /*
   * read() entry point for cgroup control files.
   *
   * Dispatches to the first read-style handler the cftype provides, in this
   * order: ->read, ->read_u64, ->read_s64.  Returns -ENODEV if the owning
   * cgroup has been removed and -EINVAL if none of those handlers is set.
   */
  static ssize_t cgroup_file_read(struct file *file, char __user *buf,
  				   size_t nbytes, loff_t *ppos)
  {
  	struct dentry *dentry = file->f_dentry;
  	struct cftype *cft = __d_cft(dentry);
  	struct cgroup *cgrp = __d_cgrp(dentry->d_parent);
  	ssize_t ret;
  
  	if (cgroup_is_removed(cgrp))
  		ret = -ENODEV;
  	else if (cft->read)
  		ret = cft->read(cgrp, cft, file, buf, nbytes, ppos);
  	else if (cft->read_u64)
  		ret = cgroup_read_u64(cgrp, cft, file, buf, nbytes, ppos);
  	else if (cft->read_s64)
  		ret = cgroup_read_s64(cgrp, cft, file, buf, nbytes, ppos);
  	else
  		ret = -EINVAL;
  
  	return ret;
  }
917965696   Paul Menage   CGroup API files:...
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
  /*
   * seqfile ops/methods for returning structured data. Used for control
   * files that provide ->read_map (string->u64 maps) or ->read_seq_string;
   * can be extended in future.
   */
  
  /*
   * Per-open state for a seq_file-backed control file.  Allocated in
   * cgroup_file_open(), handed to single_open() as the seq_file private
   * data, and freed in cgroup_seqfile_release().
   */
  struct cgroup_seqfile_state {
  	struct cftype *cft;	/* control file being read */
  	struct cgroup *cgroup;	/* cgroup whose directory holds the file */
  };
  
  /*
   * ->fill callback installed by cgroup_seqfile_show() for ->read_map files:
   * emits one "key value" line into the seq_file stored in cb->state.
   * Returns the result of seq_printf().
   */
  static int cgroup_map_add(struct cgroup_map_cb *cb, const char *key, u64 value)
  {
  	struct seq_file *sf = cb->state;
  	return seq_printf(sf, "%s %llu\n", key, (unsigned long long)value);
  }
  
  static int cgroup_seqfile_show(struct seq_file *m, void *arg)
  {
  	struct cgroup_seqfile_state *state = m->private;
  	struct cftype *cft = state->cft;
29486df32   Serge E. Hallyn   cgroups: introduc...
2014
2015
2016
2017
2018
2019
2020
2021
  	if (cft->read_map) {
  		struct cgroup_map_cb cb = {
  			.fill = cgroup_map_add,
  			.state = m,
  		};
  		return cft->read_map(state->cgroup, cft, &cb);
  	}
  	return cft->read_seq_string(state->cgroup, cft, m);
917965696   Paul Menage   CGroup API files:...
2022
  }
96930a636   Adrian Bunk   make cgroup_seqfi...
2023
  static int cgroup_seqfile_release(struct inode *inode, struct file *file)
917965696   Paul Menage   CGroup API files:...
2024
2025
2026
2027
2028
  {
  	struct seq_file *seq = file->private_data;
  	kfree(seq->private);
  	return single_release(inode, file);
  }
828c09509   Alexey Dobriyan   const: constify r...
2029
  static const struct file_operations cgroup_seqfile_operations = {
917965696   Paul Menage   CGroup API files:...
2030
  	.read = seq_read,
e788e066c   Paul Menage   cgroup files: mov...
2031
  	.write = cgroup_file_write,
917965696   Paul Menage   CGroup API files:...
2032
2033
2034
  	.llseek = seq_lseek,
  	.release = cgroup_seqfile_release,
  };
ddbcc7e8e   Paul Menage   Task Control Grou...
2035
2036
2037
2038
2039
2040
2041
2042
  static int cgroup_file_open(struct inode *inode, struct file *file)
  {
  	int err;
  	struct cftype *cft;
  
  	err = generic_file_open(inode, file);
  	if (err)
  		return err;
ddbcc7e8e   Paul Menage   Task Control Grou...
2043
  	cft = __d_cft(file->f_dentry);
75139b827   Li Zefan   cgroups: remove s...
2044

29486df32   Serge E. Hallyn   cgroups: introduc...
2045
  	if (cft->read_map || cft->read_seq_string) {
917965696   Paul Menage   CGroup API files:...
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
  		struct cgroup_seqfile_state *state =
  			kzalloc(sizeof(*state), GFP_USER);
  		if (!state)
  			return -ENOMEM;
  		state->cft = cft;
  		state->cgroup = __d_cgrp(file->f_dentry->d_parent);
  		file->f_op = &cgroup_seqfile_operations;
  		err = single_open(file, cgroup_seqfile_show, state);
  		if (err < 0)
  			kfree(state);
  	} else if (cft->open)
ddbcc7e8e   Paul Menage   Task Control Grou...
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
  		err = cft->open(inode, file);
  	else
  		err = 0;
  
  	return err;
  }
  
  /*
   * ->release for plain control files: forwards to the cftype's own
   * ->release when it has one, otherwise reports success.
   */
  static int cgroup_file_release(struct inode *inode, struct file *file)
  {
  	struct cftype *cft = __d_cft(file->f_dentry);
  
  	return cft->release ? cft->release(inode, file) : 0;
  }
  
  /*
   * cgroup_rename - Only allow simple rename of directories in place.
   *
   * Checks are made in this order: the source must be a directory
   * (-ENOTDIR), the target name must not already exist (-EEXIST), and the
   * parent directory must not change (-EIO).  Anything that passes is
   * handed to simple_rename().
   */
  static int cgroup_rename(struct inode *old_dir, struct dentry *old_dentry,
  			    struct inode *new_dir, struct dentry *new_dentry)
  {
  	int ret;
  
  	if (!S_ISDIR(old_dentry->d_inode->i_mode))
  		ret = -ENOTDIR;
  	else if (new_dentry->d_inode)
  		ret = -EEXIST;
  	else if (old_dir != new_dir)
  		ret = -EIO;
  	else
  		ret = simple_rename(old_dir, old_dentry, new_dir, new_dentry);
  
  	return ret;
  }
828c09509   Alexey Dobriyan   const: constify r...
2086
  static const struct file_operations cgroup_file_operations = {
ddbcc7e8e   Paul Menage   Task Control Grou...
2087
2088
2089
2090
2091
2092
  	.read = cgroup_file_read,
  	.write = cgroup_file_write,
  	.llseek = generic_file_llseek,
  	.open = cgroup_file_open,
  	.release = cgroup_file_release,
  };
6e1d5dcc2   Alexey Dobriyan   const: mark remai...
2093
  static const struct inode_operations cgroup_dir_inode_operations = {
c72a04e34   Al Viro   cgroup_fs: fix cg...
2094
  	.lookup = cgroup_lookup,
ddbcc7e8e   Paul Menage   Task Control Grou...
2095
2096
2097
2098
  	.mkdir = cgroup_mkdir,
  	.rmdir = cgroup_rmdir,
  	.rename = cgroup_rename,
  };
c72a04e34   Al Viro   cgroup_fs: fix cg...
2099
2100
2101
2102
2103
2104
2105
  /*
   * ->lookup for cgroup directories.  Rejects names longer than NAME_MAX
   * with -ENAMETOOLONG; otherwise attaches the dentry with no inode
   * (d_add(dentry, NULL)) and returns NULL.
   */
  static struct dentry *cgroup_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
  {
  	if (dentry->d_name.len <= NAME_MAX) {
  		d_add(dentry, NULL);
  		return NULL;
  	}
  	return ERR_PTR(-ENAMETOOLONG);
  }
0dea11687   Kirill A. Shutemov   cgroup: implement...
2106
2107
2108
2109
2110
2111
2112
2113
2114
  /*
   * Check if a file is a control file
   */
  static inline struct cftype *__file_cft(struct file *file)
  {
  	if (file->f_dentry->d_inode->i_fop != &cgroup_file_operations)
  		return ERR_PTR(-EINVAL);
  	return __d_cft(file->f_dentry);
  }
5adcee1d8   Nick Piggin   cgroup fs: avoid ...
2115
2116
2117
  /*
   * Create the inode for a new cgroup directory or control file and attach
   * it to @dentry.
   *
   * @dentry: dentry to instantiate; must not already have an inode
   * @mode:   file type and permission bits for the new inode
   * @sb:     superblock the inode is allocated from
   *
   * For a directory the inode is returned with its i_mutex held.
   * On success an extra dentry reference is taken to pin it in core.
   *
   * Returns 0 on success, -ENOENT for a NULL dentry, -EEXIST if the dentry
   * already has an inode, or -ENOMEM if no inode could be allocated.
   */
  static int cgroup_create_file(struct dentry *dentry, mode_t mode,
  				struct super_block *sb)
  {
  	struct inode *inode;
  
  	if (!dentry)
  		return -ENOENT;
  	if (dentry->d_inode)
  		return -EEXIST;
  
  	inode = cgroup_new_inode(mode, sb);
  	if (!inode)
  		return -ENOMEM;
  
  	switch (mode & S_IFMT) {
  	case S_IFDIR:
  		inode->i_op = &cgroup_dir_inode_operations;
  		inode->i_fop = &simple_dir_operations;
  
  		/* start off with i_nlink == 2 (for "." entry) */
  		inc_nlink(inode);
  
  		/*
  		 * start with the directory inode held, so that we can
  		 * populate it without racing with another mkdir
  		 */
  		mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD);
  		break;
  	case S_IFREG:
  		inode->i_size = 0;
  		inode->i_fop = &cgroup_file_operations;
  		break;
  	default:
  		/* other file types get no cgroup-specific operations */
  		break;
  	}
  
  	d_instantiate(dentry, inode);
  	/* Extra count - pin the dentry in core */
  	dget(dentry);
  	return 0;
  }
  
  /*
a043e3b2c   Li Zefan   cgroup: fix comments
2149
2150
2151
2152
2153
   * cgroup_create_dir - create a directory for an object.
   * @cgrp: the cgroup we create the directory for. It must have a valid
   *        ->parent field. And we are going to fill its ->dentry field.
   * @dentry: dentry of the new cgroup
   * @mode: mode to set on new directory.
ddbcc7e8e   Paul Menage   Task Control Grou...
2154
   */
bd89aabc6   Paul Menage   Control groups: R...
2155
  static int cgroup_create_dir(struct cgroup *cgrp, struct dentry *dentry,
099fca322   Li Zefan   cgroups: show cor...
2156
  				mode_t mode)
ddbcc7e8e   Paul Menage   Task Control Grou...
2157
2158
2159
  {
  	struct dentry *parent;
  	int error = 0;
bd89aabc6   Paul Menage   Control groups: R...
2160
2161
  	parent = cgrp->parent->dentry;
  	error = cgroup_create_file(dentry, S_IFDIR | mode, cgrp->root->sb);
ddbcc7e8e   Paul Menage   Task Control Grou...
2162
  	if (!error) {
bd89aabc6   Paul Menage   Control groups: R...
2163
  		dentry->d_fsdata = cgrp;
ddbcc7e8e   Paul Menage   Task Control Grou...
2164
  		inc_nlink(parent->d_inode);
a47295e6b   Paul Menage   cgroups: make cgr...
2165
  		rcu_assign_pointer(cgrp->dentry, dentry);
ddbcc7e8e   Paul Menage   Task Control Grou...
2166
2167
2168
2169
2170
2171
  		dget(dentry);
  	}
  	dput(dentry);
  
  	return error;
  }
099fca322   Li Zefan   cgroups: show cor...
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
  /**
   * cgroup_file_mode - deduce file mode of a control file
   * @cft: the control file in question
   *
   * returns cft->mode if ->mode is not 0
   * returns S_IRUGO|S_IWUSR if it has both a read and a write handler
   * returns S_IRUGO if it has only a read handler
   * returns S_IWUSR if it has only a write handler
   * returns 0 if it has neither
   */
  static mode_t cgroup_file_mode(const struct cftype *cft)
  {
  	const int readable = cft->read || cft->read_u64 || cft->read_s64 ||
  			     cft->read_map || cft->read_seq_string;
  	const int writable = cft->write || cft->write_u64 || cft->write_s64 ||
  			     cft->write_string || cft->trigger;
  
  	/* An explicit mode always wins over the deduced one */
  	if (cft->mode)
  		return cft->mode;
  
  	return (readable ? S_IRUGO : 0) | (writable ? S_IWUSR : 0);
  }
bd89aabc6   Paul Menage   Control groups: R...
2198
  int cgroup_add_file(struct cgroup *cgrp,
ddbcc7e8e   Paul Menage   Task Control Grou...
2199
2200
2201
  		       struct cgroup_subsys *subsys,
  		       const struct cftype *cft)
  {
bd89aabc6   Paul Menage   Control groups: R...
2202
  	struct dentry *dir = cgrp->dentry;
ddbcc7e8e   Paul Menage   Task Control Grou...
2203
2204
  	struct dentry *dentry;
  	int error;
099fca322   Li Zefan   cgroups: show cor...
2205
  	mode_t mode;
ddbcc7e8e   Paul Menage   Task Control Grou...
2206
2207
  
  	char name[MAX_CGROUP_TYPE_NAMELEN + MAX_CFTYPE_NAME + 2] = { 0 };
bd89aabc6   Paul Menage   Control groups: R...
2208
  	if (subsys && !test_bit(ROOT_NOPREFIX, &cgrp->root->flags)) {
ddbcc7e8e   Paul Menage   Task Control Grou...
2209
2210
2211
2212
2213
2214
2215
  		strcpy(name, subsys->name);
  		strcat(name, ".");
  	}
  	strcat(name, cft->name);
  	BUG_ON(!mutex_is_locked(&dir->d_inode->i_mutex));
  	dentry = lookup_one_len(name, dir, strlen(name));
  	if (!IS_ERR(dentry)) {
099fca322   Li Zefan   cgroups: show cor...
2216
2217
  		mode = cgroup_file_mode(cft);
  		error = cgroup_create_file(dentry, mode | S_IFREG,
bd89aabc6   Paul Menage   Control groups: R...
2218
  						cgrp->root->sb);
ddbcc7e8e   Paul Menage   Task Control Grou...
2219
2220
2221
2222
2223
2224
2225
  		if (!error)
  			dentry->d_fsdata = (void *)cft;
  		dput(dentry);
  	} else
  		error = PTR_ERR(dentry);
  	return error;
  }
e6a1105ba   Ben Blum   cgroups: subsyste...
2226
  EXPORT_SYMBOL_GPL(cgroup_add_file);
ddbcc7e8e   Paul Menage   Task Control Grou...
2227

bd89aabc6   Paul Menage   Control groups: R...
2228
  /*
   * cgroup_add_files - create an array of control files in a cgroup.
   * @cgrp: cgroup whose directory receives the files
   * @subsys: owning subsystem, passed through to cgroup_add_file()
   * @cft: array of control file descriptions
   * @count: number of entries in @cft
   *
   * Stops at the first failure and returns its error; files created before
   * the failure are left in place.  Returns 0 if every file was added.
   */
  int cgroup_add_files(struct cgroup *cgrp,
  			struct cgroup_subsys *subsys,
  			const struct cftype cft[],
  			int count)
  {
  	int idx;
  	int ret = 0;
  
  	for (idx = 0; idx < count && !ret; idx++)
  		ret = cgroup_add_file(cgrp, subsys, &cft[idx]);
  
  	return ret;
  }
  EXPORT_SYMBOL_GPL(cgroup_add_files);
ddbcc7e8e   Paul Menage   Task Control Grou...
2242

a043e3b2c   Li Zefan   cgroup: fix comments
2243
2244
2245
2246
2247
2248
  /**
   * cgroup_task_count - count the number of tasks in a cgroup.
   * @cgrp: the cgroup in question
   *
   * Return the number of tasks in the cgroup.
   */
bd89aabc6   Paul Menage   Control groups: R...
2249
  int cgroup_task_count(const struct cgroup *cgrp)
bbcb81d09   Paul Menage   Task Control Grou...
2250
2251
  {
  	int count = 0;
71cbb949d   KOSAKI Motohiro   cgroup: list_for_...
2252
  	struct cg_cgroup_link *link;
817929ec2   Paul Menage   Task Control Grou...
2253
2254
  
  	read_lock(&css_set_lock);
71cbb949d   KOSAKI Motohiro   cgroup: list_for_...
2255
  	list_for_each_entry(link, &cgrp->css_sets, cgrp_link_list) {
146aa1bd0   Lai Jiangshan   cgroups: fix prob...
2256
  		count += atomic_read(&link->cg->refcount);
817929ec2   Paul Menage   Task Control Grou...
2257
2258
  	}
  	read_unlock(&css_set_lock);
bbcb81d09   Paul Menage   Task Control Grou...
2259
2260
2261
2262
  	return count;
  }
  
  /*
817929ec2   Paul Menage   Task Control Grou...
2263
2264
2265
   * Advance a list_head iterator.  The iterator should be positioned at
   * the start of a css_set
   */
bd89aabc6   Paul Menage   Control groups: R...
2266
  static void cgroup_advance_iter(struct cgroup *cgrp,
7717f7ba9   Paul Menage   cgroups: add a ba...
2267
  				struct cgroup_iter *it)
817929ec2   Paul Menage   Task Control Grou...
2268
2269
2270
2271
2272
2273
2274
2275
  {
  	struct list_head *l = it->cg_link;
  	struct cg_cgroup_link *link;
  	struct css_set *cg;
  
  	/* Advance to the next non-empty css_set */
  	do {
  		l = l->next;
bd89aabc6   Paul Menage   Control groups: R...
2276
  		if (l == &cgrp->css_sets) {
817929ec2   Paul Menage   Task Control Grou...
2277
2278
2279
  			it->cg_link = NULL;
  			return;
  		}
bd89aabc6   Paul Menage   Control groups: R...
2280
  		link = list_entry(l, struct cg_cgroup_link, cgrp_link_list);
817929ec2   Paul Menage   Task Control Grou...
2281
2282
2283
2284
2285
  		cg = link->cg;
  	} while (list_empty(&cg->tasks));
  	it->cg_link = l;
  	it->task = cg->tasks.next;
  }
31a7df01f   Cliff Wickman   cgroups: mechanis...
2286
2287
2288
2289
2290
2291
2292
2293
2294
  /*
   * To reduce the fork() overhead for systems that are not actually
   * using their cgroups capability, we don't maintain the lists running
   * through each css_set to its tasks until we see the list actually
   * used - in other words after the first call to cgroup_iter_start().
   *
   * The tasklist_lock is not held here, as do_each_thread() and
   * while_each_thread() are protected by RCU.
   */
3df91fe30   Adrian Bunk   make cgroup_enabl...
2295
  static void cgroup_enable_task_cg_lists(void)
31a7df01f   Cliff Wickman   cgroups: mechanis...
2296
2297
2298
2299
2300
2301
  {
  	struct task_struct *p, *g;
  	write_lock(&css_set_lock);
  	use_task_css_set_links = 1;
  	do_each_thread(g, p) {
  		task_lock(p);
0e04388f0   Li Zefan   cgroup: fix a rac...
2302
2303
2304
2305
2306
2307
  		/*
  		 * We should check if the process is exiting, otherwise
  		 * it will race with cgroup_exit() in that the list
  		 * entry won't be deleted though the process has exited.
  		 */
  		if (!(p->flags & PF_EXITING) && list_empty(&p->cg_list))
31a7df01f   Cliff Wickman   cgroups: mechanis...
2308
2309
2310
2311
2312
  			list_add(&p->cg_list, &p->cgroups->tasks);
  		task_unlock(p);
  	} while_each_thread(g, p);
  	write_unlock(&css_set_lock);
  }
bd89aabc6   Paul Menage   Control groups: R...
2313
  void cgroup_iter_start(struct cgroup *cgrp, struct cgroup_iter *it)
817929ec2   Paul Menage   Task Control Grou...
2314
2315
2316
2317
2318
2319
  {
  	/*
  	 * The first time anyone tries to iterate across a cgroup,
  	 * we need to enable the list linking each css_set to its
  	 * tasks, and fix up all existing tasks.
  	 */
31a7df01f   Cliff Wickman   cgroups: mechanis...
2320
2321
  	if (!use_task_css_set_links)
  		cgroup_enable_task_cg_lists();
817929ec2   Paul Menage   Task Control Grou...
2322
  	read_lock(&css_set_lock);
bd89aabc6   Paul Menage   Control groups: R...
2323
2324
  	it->cg_link = &cgrp->css_sets;
  	cgroup_advance_iter(cgrp, it);
817929ec2   Paul Menage   Task Control Grou...
2325
  }
bd89aabc6   Paul Menage   Control groups: R...
2326
  struct task_struct *cgroup_iter_next(struct cgroup *cgrp,
817929ec2   Paul Menage   Task Control Grou...
2327
2328
2329
2330
  					struct cgroup_iter *it)
  {
  	struct task_struct *res;
  	struct list_head *l = it->task;
2019f634c   Lai Jiangshan   cgroups: fix cgro...
2331
  	struct cg_cgroup_link *link;
817929ec2   Paul Menage   Task Control Grou...
2332
2333
2334
2335
2336
2337
2338
  
  	/* If the iterator cg is NULL, we have no tasks */
  	if (!it->cg_link)
  		return NULL;
  	res = list_entry(l, struct task_struct, cg_list);
  	/* Advance iterator to find next entry */
  	l = l->next;
2019f634c   Lai Jiangshan   cgroups: fix cgro...
2339
2340
  	link = list_entry(it->cg_link, struct cg_cgroup_link, cgrp_link_list);
  	if (l == &link->cg->tasks) {
817929ec2   Paul Menage   Task Control Grou...
2341
2342
  		/* We reached the end of this task list - move on to
  		 * the next cg_cgroup_link */
bd89aabc6   Paul Menage   Control groups: R...
2343
  		cgroup_advance_iter(cgrp, it);
817929ec2   Paul Menage   Task Control Grou...
2344
2345
2346
2347
2348
  	} else {
  		it->task = l;
  	}
  	return res;
  }
bd89aabc6   Paul Menage   Control groups: R...
2349
  void cgroup_iter_end(struct cgroup *cgrp, struct cgroup_iter *it)
817929ec2   Paul Menage   Task Control Grou...
2350
2351
2352
  {
  	read_unlock(&css_set_lock);
  }
31a7df01f   Cliff Wickman   cgroups: mechanis...
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
  /*
   * Return 1 if @t1 started after @time, 0 if it started before.
   * @t2 is only used as a tie-breaker when the start times are equal and
   * is never dereferenced.
   */
  static inline int started_after_time(struct task_struct *t1,
  				     struct timespec *time,
  				     struct task_struct *t2)
  {
  	int cmp = timespec_compare(&t1->start_time, time);
  
  	if (cmp)
  		return cmp > 0;
  
  	/*
  	 * Arbitrarily, if two processes started at the same
  	 * time, we'll say that the lower pointer value
  	 * started first. Note that t2 may have exited by now
  	 * so this may not be a valid pointer any longer, but
  	 * that's fine - it still serves to distinguish
  	 * between two tasks started (effectively) simultaneously.
  	 */
  	return t1 > t2;
  }
  
  /*
   * This function is a callback from heap_insert() and is used to order
   * the heap.
   * In this case we order the heap in descending task start time.
   * Both arguments are struct task_struct pointers passed as void *.
   */
  static inline int started_after(void *p1, void *p2)
  {
  	struct task_struct *later = p1;
  	struct task_struct *earlier = p2;
  
  	return started_after_time(later, &earlier->start_time, earlier);
  }
  
  /**
   * cgroup_scan_tasks - iterate though all the tasks in a cgroup
   * @scan: struct cgroup_scanner containing arguments for the scan
   *
   * Arguments include pointers to callback functions test_task() and
   * process_task().
   * Iterate through all the tasks in a cgroup, calling test_task() for each,
   * and if it returns true, call process_task() for it also.
   * The test_task pointer may be NULL, meaning always true (select all tasks).
   * Effectively duplicates cgroup_iter_{start,next,end}()
   * but does not lock css_set_lock for the call to process_task().
   * The struct cgroup_scanner may be embedded in any structure of the caller's
   * creation.
   * It is guaranteed that process_task() will act on every task that
   * is a member of the cgroup for the duration of this call. This
   * function may or may not call process_task() for tasks that exit
   * or move to a different cgroup during the call, or are forked or
   * move into the cgroup during the call.
   *
   * Note that test_task() may be called with locks held, and may in some
   * situations be called multiple times for the same task, so it should
   * be cheap.
   * If the heap pointer in the struct cgroup_scanner is non-NULL, a heap has been
   * pre-allocated and will be used for heap operations (and its "gt" member will
   * be overwritten), else a temporary heap will be used (allocation of which
   * may cause this function to fail).
   */
  int cgroup_scan_tasks(struct cgroup_scanner *scan)
  {
  	int retval, i;
  	struct cgroup_iter it;
  	struct task_struct *p, *dropped;
  	/* Never dereference latest_task, since it's not refcounted */
  	struct task_struct *latest_task = NULL;
  	struct ptr_heap tmp_heap;
  	struct ptr_heap *heap;
  	struct timespec latest_time = { 0, 0 };
  
  	if (scan->heap) {
  		/* The caller supplied our heap and pre-allocated its memory */
  		heap = scan->heap;
  		heap->gt = &started_after;
  	} else {
  		/* We need to allocate our own heap memory */
  		heap = &tmp_heap;
  		retval = heap_init(heap, PAGE_SIZE, GFP_KERNEL, &started_after);
  		if (retval)
  			/* cannot allocate the heap */
  			return retval;
  	}
  
   again:
  	/*
  	 * Scan tasks in the cgroup, using the scanner's "test_task" callback
  	 * to determine which are of interest, and using the scanner's
  	 * "process_task" callback to process any of them that need an update.
  	 * Since we don't want to hold any locks during the task updates,
  	 * gather tasks to be processed in a heap structure.
  	 * The heap is sorted by descending task start time.
  	 * If the statically-sized heap fills up, we overflow tasks that
  	 * started later, and in future iterations only consider tasks that
  	 * started after the latest task in the previous pass. This
  	 * guarantees forward progress and that we don't miss any tasks.
  	 */
  	heap->size = 0;
  	cgroup_iter_start(scan->cg, &it);
  	while ((p = cgroup_iter_next(scan->cg, &it))) {
  		/*
  		 * Only affect tasks that qualify per the caller's callback,
  		 * if he provided one
  		 */
  		if (scan->test_task && !scan->test_task(p, scan))
  			continue;
  		/*
  		 * Only process tasks that started after the last task
  		 * we processed
  		 */
  		if (!started_after_time(p, &latest_time, latest_task))
  			continue;
  		dropped = heap_insert(heap, p);
  		if (dropped == NULL) {
  			/*
  			 * The new task was inserted; the heap wasn't
  			 * previously full
  			 */
  			get_task_struct(p);
  		} else if (dropped != p) {
  			/*
  			 * The new task was inserted, and pushed out a
  			 * different task
  			 */
  			get_task_struct(p);
  			put_task_struct(dropped);
  		}
  		/*
  		 * Else the new task was newer than anything already in
  		 * the heap and wasn't inserted
  		 */
  	}
  	cgroup_iter_end(scan->cg, &it);
  
  	if (heap->size) {
  		for (i = 0; i < heap->size; i++) {
4fe91d518   Paul Jackson   cgroup: fix spars...
2490
  			struct task_struct *q = heap->ptrs[i];
31a7df01f   Cliff Wickman   cgroups: mechanis...
2491
  			if (i == 0) {
4fe91d518   Paul Jackson   cgroup: fix spars...
2492
2493
  				latest_time = q->start_time;
  				latest_task = q;
31a7df01f   Cliff Wickman   cgroups: mechanis...
2494
2495
  			}
  			/* Process the task per the caller's callback */
4fe91d518   Paul Jackson   cgroup: fix spars...
2496
2497
  			scan->process_task(q, scan);
  			put_task_struct(q);
31a7df01f   Cliff Wickman   cgroups: mechanis...
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
  		}
  		/*
  		 * If we had to process any tasks at all, scan again
  		 * in case some of them were in the middle of forking
  		 * children that didn't get processed.
  		 * Not the most efficient way to do it, but it avoids
  		 * having to take callback_mutex in the fork path
  		 */
  		goto again;
  	}
  	if (heap == &tmp_heap)
  		heap_free(&tmp_heap);
  	return 0;
  }
817929ec2   Paul Menage   Task Control Grou...
2512
  /*
102a775e3   Ben Blum   cgroups: add a re...
2513
   * Stuff for reading the 'tasks'/'procs' files.
bbcb81d09   Paul Menage   Task Control Grou...
2514
2515
2516
2517
2518
2519
   *
   * Reading this file can return large amounts of data if a cgroup has
   * *lots* of attached tasks. So it may need several calls to read(),
   * but we cannot guarantee that the information we produce is correct
   * unless we produce it entirely atomically.
   *
bbcb81d09   Paul Menage   Task Control Grou...
2520
   */
bbcb81d09   Paul Menage   Task Control Grou...
2521
2522
  
  /*
d1d9fd330   Ben Blum   cgroups: use vmal...
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
   * The following two functions "fix" the issue where there are more pids
   * than kmalloc will give memory for; in such cases, we use vmalloc/vfree.
   * TODO: replace with a kernel-wide solution to this problem
   */
  #define PIDLIST_TOO_LARGE(c) ((c) * sizeof(pid_t) > (PAGE_SIZE * 2))
  static void *pidlist_allocate(int count)
  {
  	if (PIDLIST_TOO_LARGE(count))
  		return vmalloc(count * sizeof(pid_t));
  	else
  		return kmalloc(count * sizeof(pid_t), GFP_KERNEL);
  }
  /*
   * Free a pid array with whichever allocator's release function matches
   * its address: vfree() for vmalloc addresses, kfree() otherwise.
   */
  static void pidlist_free(void *p)
  {
  	if (!is_vmalloc_addr(p)) {
  		kfree(p);
  		return;
  	}
  	vfree(p);
  }
  static void *pidlist_resize(void *p, int newcount)
  {
  	void *newlist;
  	/* note: if new alloc fails, old p will still be valid either way */
  	if (is_vmalloc_addr(p)) {
  		newlist = vmalloc(newcount * sizeof(pid_t));
  		if (!newlist)
  			return NULL;
  		memcpy(newlist, p, newcount * sizeof(pid_t));
  		vfree(p);
  	} else {
  		newlist = krealloc(p, newcount * sizeof(pid_t), GFP_KERNEL);
  	}
  	return newlist;
  }
  
  /*
102a775e3   Ben Blum   cgroups: add a re...
2559
2560
2561
2562
   * pidlist_uniq - given a kmalloc()ed list, strip out all duplicate entries
   * If the new stripped list is sufficiently smaller and there's enough memory
   * to allocate a new buffer, will let go of the unneeded memory. Returns the
   * number of unique elements.
bbcb81d09   Paul Menage   Task Control Grou...
2563
   */
102a775e3   Ben Blum   cgroups: add a re...
2564
2565
2566
  /* is the size difference enough that we should re-allocate the array? */
  #define PIDLIST_REALLOC_DIFFERENCE(old, new) ((old) - PAGE_SIZE >= (new))
  static int pidlist_uniq(pid_t **p, int length)
bbcb81d09   Paul Menage   Task Control Grou...
2567
  {
102a775e3   Ben Blum   cgroups: add a re...
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
  	int src, dest = 1;
  	pid_t *list = *p;
  	pid_t *newlist;
  
  	/*
  	 * we presume the 0th element is unique, so i starts at 1. trivial
  	 * edge cases first; no work needs to be done for either
  	 */
  	if (length == 0 || length == 1)
  		return length;
  	/* src and dest walk down the list; dest counts unique elements */
  	for (src = 1; src < length; src++) {
  		/* find next unique element */
  		while (list[src] == list[src-1]) {
  			src++;
  			if (src == length)
  				goto after;
  		}
  		/* dest always points to where the next unique element goes */
  		list[dest] = list[src];
  		dest++;
  	}
  after:
  	/*
  	 * if the length difference is large enough, we want to allocate a
  	 * smaller buffer to save memory. if this fails due to out of memory,
  	 * we'll just stay with what we've got.
  	 */
  	if (PIDLIST_REALLOC_DIFFERENCE(length, dest)) {
d1d9fd330   Ben Blum   cgroups: use vmal...
2597
  		newlist = pidlist_resize(list, dest);
102a775e3   Ben Blum   cgroups: add a re...
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
  		if (newlist)
  			*p = newlist;
  	}
  	return dest;
  }
  
  /*
   * Three-way comparison of two pid_t values for sorting.
   * @a, @b: pointers to the pid_t values to compare
   *
   * Returns a negative value, zero or a positive value when *a is less
   * than, equal to or greater than *b.  Comparing rather than subtracting
   * keeps the result correct for any pair of values (no signed overflow),
   * and the const qualifier of the arguments is preserved.
   */
  static int cmppid(const void *a, const void *b)
  {
  	const pid_t lhs = *(const pid_t *)a;
  	const pid_t rhs = *(const pid_t *)b;
  
  	return (lhs > rhs) - (lhs < rhs);
  }
  
  /*
72a8cb30d   Ben Blum   cgroups: ensure c...
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
   * find the appropriate pidlist for our purpose (given procs vs tasks)
   * returns with the lock on that pidlist already held, and takes care
   * of the use count, or returns NULL with no locks held if we're out of
   * memory.
   */
  static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp,
  						  enum cgroup_filetype type)
  {
  	struct cgroup_pidlist *l;
  	/* don't need task_nsproxy() if we're looking at ourself */
b70cc5fdb   Li Zefan   cgroups: clean up...
2620
  	struct pid_namespace *ns = current->nsproxy->pid_ns;
72a8cb30d   Ben Blum   cgroups: ensure c...
2621
2622
2623
2624
2625
2626
2627
2628
2629
  	/*
  	 * We can't drop the pidlist_mutex before taking the l->mutex in case
  	 * the last ref-holder is trying to remove l from the list at the same
  	 * time. Holding the pidlist_mutex precludes somebody taking whichever
  	 * list we find out from under us - compare release_pid_array().
  	 */
  	mutex_lock(&cgrp->pidlist_mutex);
  	list_for_each_entry(l, &cgrp->pidlists, links) {
  		if (l->key.type == type && l->key.ns == ns) {
72a8cb30d   Ben Blum   cgroups: ensure c...
2630
2631
2632
  			/* make sure l doesn't vanish out from under us */
  			down_write(&l->mutex);
  			mutex_unlock(&cgrp->pidlist_mutex);
72a8cb30d   Ben Blum   cgroups: ensure c...
2633
2634
2635
2636
2637
2638
2639
  			return l;
  		}
  	}
  	/* entry not found; create a new one */
  	l = kmalloc(sizeof(struct cgroup_pidlist), GFP_KERNEL);
  	if (!l) {
  		mutex_unlock(&cgrp->pidlist_mutex);
72a8cb30d   Ben Blum   cgroups: ensure c...
2640
2641
2642
2643
2644
  		return l;
  	}
  	init_rwsem(&l->mutex);
  	down_write(&l->mutex);
  	l->key.type = type;
b70cc5fdb   Li Zefan   cgroups: clean up...
2645
  	l->key.ns = get_pid_ns(ns);
72a8cb30d   Ben Blum   cgroups: ensure c...
2646
2647
2648
2649
2650
2651
2652
2653
2654
  	l->use_count = 0; /* don't increment here */
  	l->list = NULL;
  	l->owner = cgrp;
  	list_add(&l->links, &cgrp->pidlists);
  	mutex_unlock(&cgrp->pidlist_mutex);
  	return l;
  }
  
  /*
102a775e3   Ben Blum   cgroups: add a re...
2655
2656
   * Load a cgroup's pidarray with either procs' tgids or tasks' pids
   */
72a8cb30d   Ben Blum   cgroups: ensure c...
2657
2658
  static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,
  			      struct cgroup_pidlist **lp)
102a775e3   Ben Blum   cgroups: add a re...
2659
2660
2661
2662
  {
  	pid_t *array;
  	int length;
  	int pid, n = 0; /* used for populating the array */
817929ec2   Paul Menage   Task Control Grou...
2663
2664
  	struct cgroup_iter it;
  	struct task_struct *tsk;
102a775e3   Ben Blum   cgroups: add a re...
2665
2666
2667
2668
2669
2670
2671
2672
2673
  	struct cgroup_pidlist *l;
  
  	/*
  	 * If cgroup gets more users after we read count, we won't have
  	 * enough space - tough.  This race is indistinguishable to the
  	 * caller from the case that the additional cgroup users didn't
  	 * show up until sometime later on.
  	 */
  	length = cgroup_task_count(cgrp);
d1d9fd330   Ben Blum   cgroups: use vmal...
2674
  	array = pidlist_allocate(length);
102a775e3   Ben Blum   cgroups: add a re...
2675
2676
2677
  	if (!array)
  		return -ENOMEM;
  	/* now, populate the array */
bd89aabc6   Paul Menage   Control groups: R...
2678
2679
  	cgroup_iter_start(cgrp, &it);
  	while ((tsk = cgroup_iter_next(cgrp, &it))) {
102a775e3   Ben Blum   cgroups: add a re...
2680
  		if (unlikely(n == length))
817929ec2   Paul Menage   Task Control Grou...
2681
  			break;
102a775e3   Ben Blum   cgroups: add a re...
2682
  		/* get tgid or pid for procs or tasks file respectively */
72a8cb30d   Ben Blum   cgroups: ensure c...
2683
2684
2685
2686
  		if (type == CGROUP_FILE_PROCS)
  			pid = task_tgid_vnr(tsk);
  		else
  			pid = task_pid_vnr(tsk);
102a775e3   Ben Blum   cgroups: add a re...
2687
2688
  		if (pid > 0) /* make sure to only use valid results */
  			array[n++] = pid;
817929ec2   Paul Menage   Task Control Grou...
2689
  	}
bd89aabc6   Paul Menage   Control groups: R...
2690
  	cgroup_iter_end(cgrp, &it);
102a775e3   Ben Blum   cgroups: add a re...
2691
2692
2693
  	length = n;
  	/* now sort & (if procs) strip out duplicates */
  	sort(array, length, sizeof(pid_t), cmppid, NULL);
72a8cb30d   Ben Blum   cgroups: ensure c...
2694
  	if (type == CGROUP_FILE_PROCS)
102a775e3   Ben Blum   cgroups: add a re...
2695
  		length = pidlist_uniq(&array, length);
72a8cb30d   Ben Blum   cgroups: ensure c...
2696
2697
  	l = cgroup_pidlist_find(cgrp, type);
  	if (!l) {
d1d9fd330   Ben Blum   cgroups: use vmal...
2698
  		pidlist_free(array);
72a8cb30d   Ben Blum   cgroups: ensure c...
2699
  		return -ENOMEM;
102a775e3   Ben Blum   cgroups: add a re...
2700
  	}
72a8cb30d   Ben Blum   cgroups: ensure c...
2701
  	/* store array, freeing old if necessary - lock already held */
d1d9fd330   Ben Blum   cgroups: use vmal...
2702
  	pidlist_free(l->list);
102a775e3   Ben Blum   cgroups: add a re...
2703
2704
2705
2706
  	l->list = array;
  	l->length = length;
  	l->use_count++;
  	up_write(&l->mutex);
72a8cb30d   Ben Blum   cgroups: ensure c...
2707
  	*lp = l;
102a775e3   Ben Blum   cgroups: add a re...
2708
  	return 0;
bbcb81d09   Paul Menage   Task Control Grou...
2709
  }
846c7bb05   Balbir Singh   Add cgroupstats
2710
  /**
a043e3b2c   Li Zefan   cgroup: fix comments
2711
   * cgroupstats_build - build and fill cgroupstats
846c7bb05   Balbir Singh   Add cgroupstats
2712
2713
2714
   * @stats: cgroupstats to fill information into
   * @dentry: A dentry entry belonging to the cgroup for which stats have
   * been requested.
a043e3b2c   Li Zefan   cgroup: fix comments
2715
2716
2717
   *
   * Build and fill cgroupstats so that taskstats can export it to user
   * space.
846c7bb05   Balbir Singh   Add cgroupstats
2718
2719
2720
2721
   */
  int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry)
  {
  	int ret = -EINVAL;
bd89aabc6   Paul Menage   Control groups: R...
2722
  	struct cgroup *cgrp;
846c7bb05   Balbir Singh   Add cgroupstats
2723
2724
  	struct cgroup_iter it;
  	struct task_struct *tsk;
33d283bef   Li Zefan   cgroups: fix a se...
2725

846c7bb05   Balbir Singh   Add cgroupstats
2726
  	/*
33d283bef   Li Zefan   cgroups: fix a se...
2727
2728
  	 * Validate dentry by checking the superblock operations,
  	 * and make sure it's a directory.
846c7bb05   Balbir Singh   Add cgroupstats
2729
  	 */
33d283bef   Li Zefan   cgroups: fix a se...
2730
2731
  	if (dentry->d_sb->s_op != &cgroup_ops ||
  	    !S_ISDIR(dentry->d_inode->i_mode))
846c7bb05   Balbir Singh   Add cgroupstats
2732
2733
2734
  		 goto err;
  
  	ret = 0;
bd89aabc6   Paul Menage   Control groups: R...
2735
  	cgrp = dentry->d_fsdata;
846c7bb05   Balbir Singh   Add cgroupstats
2736

bd89aabc6   Paul Menage   Control groups: R...
2737
2738
  	cgroup_iter_start(cgrp, &it);
  	while ((tsk = cgroup_iter_next(cgrp, &it))) {
846c7bb05   Balbir Singh   Add cgroupstats
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
  		switch (tsk->state) {
  		case TASK_RUNNING:
  			stats->nr_running++;
  			break;
  		case TASK_INTERRUPTIBLE:
  			stats->nr_sleeping++;
  			break;
  		case TASK_UNINTERRUPTIBLE:
  			stats->nr_uninterruptible++;
  			break;
  		case TASK_STOPPED:
  			stats->nr_stopped++;
  			break;
  		default:
  			if (delayacct_is_task_waiting_on_io(tsk))
  				stats->nr_io_wait++;
  			break;
  		}
  	}
bd89aabc6   Paul Menage   Control groups: R...
2758
  	cgroup_iter_end(cgrp, &it);
846c7bb05   Balbir Singh   Add cgroupstats
2759

846c7bb05   Balbir Singh   Add cgroupstats
2760
2761
2762
  err:
  	return ret;
  }
8f3ff2086   Paul Menage   cgroups: revert "...
2763

bbcb81d09   Paul Menage   Task Control Grou...
2764
  /*
102a775e3   Ben Blum   cgroups: add a re...
2765
   * seq_file methods for the tasks/procs files. The seq_file position is the
cc31edcee   Paul Menage   cgroups: convert ...
2766
   * next pid to display; the seq_file iterator is a pointer to the pid
102a775e3   Ben Blum   cgroups: add a re...
2767
   * in the cgroup->l->list array.
bbcb81d09   Paul Menage   Task Control Grou...
2768
   */
cc31edcee   Paul Menage   cgroups: convert ...
2769

102a775e3   Ben Blum   cgroups: add a re...
2770
  static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos)
bbcb81d09   Paul Menage   Task Control Grou...
2771
  {
cc31edcee   Paul Menage   cgroups: convert ...
2772
2773
2774
2775
2776
2777
  	/*
  	 * Initially we receive a position value that corresponds to
  	 * one more than the last pid shown (or 0 on the first call or
  	 * after a seek to the start). Use a binary-search to find the
  	 * next pid to display, if any
  	 */
102a775e3   Ben Blum   cgroups: add a re...
2778
  	struct cgroup_pidlist *l = s->private;
cc31edcee   Paul Menage   cgroups: convert ...
2779
2780
  	int index = 0, pid = *pos;
  	int *iter;
102a775e3   Ben Blum   cgroups: add a re...
2781
  	down_read(&l->mutex);
cc31edcee   Paul Menage   cgroups: convert ...
2782
  	if (pid) {
102a775e3   Ben Blum   cgroups: add a re...
2783
  		int end = l->length;
207777664   Stephen Rothwell   cgroup: remove un...
2784

cc31edcee   Paul Menage   cgroups: convert ...
2785
2786
  		while (index < end) {
  			int mid = (index + end) / 2;
102a775e3   Ben Blum   cgroups: add a re...
2787
  			if (l->list[mid] == pid) {
cc31edcee   Paul Menage   cgroups: convert ...
2788
2789
  				index = mid;
  				break;
102a775e3   Ben Blum   cgroups: add a re...
2790
  			} else if (l->list[mid] <= pid)
cc31edcee   Paul Menage   cgroups: convert ...
2791
2792
2793
2794
2795
2796
  				index = mid + 1;
  			else
  				end = mid;
  		}
  	}
  	/* If we're off the end of the array, we're done */
102a775e3   Ben Blum   cgroups: add a re...
2797
  	if (index >= l->length)
cc31edcee   Paul Menage   cgroups: convert ...
2798
2799
  		return NULL;
  	/* Update the abstract position to be the actual pid that we found */
102a775e3   Ben Blum   cgroups: add a re...
2800
  	iter = l->list + index;
cc31edcee   Paul Menage   cgroups: convert ...
2801
2802
2803
  	*pos = *iter;
  	return iter;
  }
102a775e3   Ben Blum   cgroups: add a re...
2804
  static void cgroup_pidlist_stop(struct seq_file *s, void *v)
cc31edcee   Paul Menage   cgroups: convert ...
2805
  {
102a775e3   Ben Blum   cgroups: add a re...
2806
2807
  	struct cgroup_pidlist *l = s->private;
  	up_read(&l->mutex);
cc31edcee   Paul Menage   cgroups: convert ...
2808
  }
102a775e3   Ben Blum   cgroups: add a re...
2809
  /*
   * seq_file ->next: step from the entry @v to the following pid in the
   * array.  Returns the new entry and stores its pid in *pos, or returns
   * NULL (leaving *pos alone) once the end of the array is reached.
   */
  static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos)
  {
  	struct cgroup_pidlist *l = s->private;
  	pid_t *next = (pid_t *)v + 1;

  	/* stepping past the last element means we are done */
  	if (next >= l->list + l->length)
  		return NULL;

  	*pos = *next;
  	return next;
  }
102a775e3   Ben Blum   cgroups: add a re...
2826
  /*
   * seq_file ->show: print the pid that the iterator @v points at, followed
   * by a newline.  Returns whatever seq_printf() returns.
   */
  static int cgroup_pidlist_show(struct seq_file *s, void *v)
  {
  	return seq_printf(s, "%d\n", *(int *)v);
  }
bbcb81d09   Paul Menage   Task Control Grou...
2831

102a775e3   Ben Blum   cgroups: add a re...
2832
2833
2834
2835
2836
2837
2838
2839
2840
  /*
   * seq_file iteration callbacks for pidlists; the same table serves both
   * the tasks file and the procs file.
   */
  static const struct seq_operations cgroup_pidlist_seq_operations = {
  	.start	= cgroup_pidlist_start,
  	.next	= cgroup_pidlist_next,
  	.stop	= cgroup_pidlist_stop,
  	.show	= cgroup_pidlist_show,
  };
102a775e3   Ben Blum   cgroups: add a re...
2842
  static void cgroup_release_pid_array(struct cgroup_pidlist *l)
cc31edcee   Paul Menage   cgroups: convert ...
2843
  {
72a8cb30d   Ben Blum   cgroups: ensure c...
2844
2845
2846
2847
2848
2849
2850
  	/*
  	 * the case where we're the last user of this particular pidlist will
  	 * have us remove it from the cgroup's list, which entails taking the
  	 * mutex. since in pidlist_find the pidlist->lock depends on cgroup->
  	 * pidlist_mutex, we have to take pidlist_mutex first.
  	 */
  	mutex_lock(&l->owner->pidlist_mutex);
102a775e3   Ben Blum   cgroups: add a re...
2851
2852
2853
  	down_write(&l->mutex);
  	BUG_ON(!l->use_count);
  	if (!--l->use_count) {
72a8cb30d   Ben Blum   cgroups: ensure c...
2854
2855
2856
  		/* we're the last user if refcount is 0; remove and free */
  		list_del(&l->links);
  		mutex_unlock(&l->owner->pidlist_mutex);
d1d9fd330   Ben Blum   cgroups: use vmal...
2857
  		pidlist_free(l->list);
72a8cb30d   Ben Blum   cgroups: ensure c...
2858
2859
2860
2861
  		put_pid_ns(l->key.ns);
  		up_write(&l->mutex);
  		kfree(l);
  		return;
cc31edcee   Paul Menage   cgroups: convert ...
2862
  	}
72a8cb30d   Ben Blum   cgroups: ensure c...
2863
  	mutex_unlock(&l->owner->pidlist_mutex);
102a775e3   Ben Blum   cgroups: add a re...
2864
  	up_write(&l->mutex);
bbcb81d09   Paul Menage   Task Control Grou...
2865
  }
102a775e3   Ben Blum   cgroups: add a re...
2866
  static int cgroup_pidlist_release(struct inode *inode, struct file *file)
cc31edcee   Paul Menage   cgroups: convert ...
2867
  {
102a775e3   Ben Blum   cgroups: add a re...
2868
  	struct cgroup_pidlist *l;
cc31edcee   Paul Menage   cgroups: convert ...
2869
2870
  	if (!(file->f_mode & FMODE_READ))
  		return 0;
102a775e3   Ben Blum   cgroups: add a re...
2871
2872
2873
2874
2875
2876
  	/*
  	 * the seq_file will only be initialized if the file was opened for
  	 * reading; hence we check if it's not null only in that case.
  	 */
  	l = ((struct seq_file *)file->private_data)->private;
  	cgroup_release_pid_array(l);
cc31edcee   Paul Menage   cgroups: convert ...
2877
2878
  	return seq_release(inode, file);
  }
102a775e3   Ben Blum   cgroups: add a re...
2879
  static const struct file_operations cgroup_pidlist_operations = {
cc31edcee   Paul Menage   cgroups: convert ...
2880
2881
2882
  	.read = seq_read,
  	.llseek = seq_lseek,
  	.write = cgroup_file_write,
102a775e3   Ben Blum   cgroups: add a re...
2883
  	.release = cgroup_pidlist_release,
cc31edcee   Paul Menage   cgroups: convert ...
2884
  };
bbcb81d09   Paul Menage   Task Control Grou...
2885
  /*
102a775e3   Ben Blum   cgroups: add a re...
2886
2887
2888
   * The following functions handle opens on a file that displays a pidlist
   * (tasks or procs). Prepare an array of the process/thread IDs of whoever's
   * in the cgroup.
bbcb81d09   Paul Menage   Task Control Grou...
2889
   */
102a775e3   Ben Blum   cgroups: add a re...
2890
  /* helper function for the two below it */
72a8cb30d   Ben Blum   cgroups: ensure c...
2891
  static int cgroup_pidlist_open(struct file *file, enum cgroup_filetype type)
bbcb81d09   Paul Menage   Task Control Grou...
2892
  {
bd89aabc6   Paul Menage   Control groups: R...
2893
  	struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);
72a8cb30d   Ben Blum   cgroups: ensure c...
2894
  	struct cgroup_pidlist *l;
cc31edcee   Paul Menage   cgroups: convert ...
2895
  	int retval;
bbcb81d09   Paul Menage   Task Control Grou...
2896

cc31edcee   Paul Menage   cgroups: convert ...
2897
  	/* Nothing to do for write-only files */
bbcb81d09   Paul Menage   Task Control Grou...
2898
2899
  	if (!(file->f_mode & FMODE_READ))
  		return 0;
102a775e3   Ben Blum   cgroups: add a re...
2900
  	/* have the array populated */
72a8cb30d   Ben Blum   cgroups: ensure c...
2901
  	retval = pidlist_array_load(cgrp, type, &l);
102a775e3   Ben Blum   cgroups: add a re...
2902
2903
2904
2905
  	if (retval)
  		return retval;
  	/* configure file information */
  	file->f_op = &cgroup_pidlist_operations;
cc31edcee   Paul Menage   cgroups: convert ...
2906

102a775e3   Ben Blum   cgroups: add a re...
2907
  	retval = seq_open(file, &cgroup_pidlist_seq_operations);
cc31edcee   Paul Menage   cgroups: convert ...
2908
  	if (retval) {
102a775e3   Ben Blum   cgroups: add a re...
2909
  		cgroup_release_pid_array(l);
cc31edcee   Paul Menage   cgroups: convert ...
2910
  		return retval;
bbcb81d09   Paul Menage   Task Control Grou...
2911
  	}
102a775e3   Ben Blum   cgroups: add a re...
2912
  	((struct seq_file *)file->private_data)->private = l;
bbcb81d09   Paul Menage   Task Control Grou...
2913
2914
  	return 0;
  }
102a775e3   Ben Blum   cgroups: add a re...
2915
2916
  static int cgroup_tasks_open(struct inode *unused, struct file *file)
  {
72a8cb30d   Ben Blum   cgroups: ensure c...
2917
  	return cgroup_pidlist_open(file, CGROUP_FILE_TASKS);
102a775e3   Ben Blum   cgroups: add a re...
2918
2919
2920
  }
  static int cgroup_procs_open(struct inode *unused, struct file *file)
  {
72a8cb30d   Ben Blum   cgroups: ensure c...
2921
  	return cgroup_pidlist_open(file, CGROUP_FILE_PROCS);
102a775e3   Ben Blum   cgroups: add a re...
2922
  }
bbcb81d09   Paul Menage   Task Control Grou...
2923

bd89aabc6   Paul Menage   Control groups: R...
2924
  /*
   * read_u64 handler for "notify_on_release": reports the value of
   * notify_on_release() for @cgrp.  @cft is unused.
   */
  static u64 cgroup_read_notify_on_release(struct cgroup *cgrp,
  					    struct cftype *cft)
  {
  	u64 enabled = notify_on_release(cgrp);

  	return enabled;
  }
6379c1061   Paul Menage   cgroup files: mov...
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
  /*
   * write_u64 handler for "notify_on_release".  Clears CGRP_RELEASABLE and
   * then sets CGRP_NOTIFY_ON_RELEASE for a non-zero @val or clears it for
   * zero.  Always returns 0.
   */
  static int cgroup_write_notify_on_release(struct cgroup *cgrp,
  					  struct cftype *cft,
  					  u64 val)
  {
  	clear_bit(CGRP_RELEASABLE, &cgrp->flags);

  	if (!val) {
  		clear_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
  		return 0;
  	}

  	set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
  	return 0;
  }
bbcb81d09   Paul Menage   Task Control Grou...
2940
  /*
0dea11687   Kirill A. Shutemov   cgroup: implement...
2941
2942
2943
2944
2945
2946
2947
2948
2949
   * Unregister event and free resources.
   *
   * Gets called from workqueue.
   */
  static void cgroup_event_remove(struct work_struct *work)
  {
  	struct cgroup_event *event = container_of(work, struct cgroup_event,
  			remove);
  	struct cgroup *cgrp = event->cgrp;
0dea11687   Kirill A. Shutemov   cgroup: implement...
2950
2951
2952
  	event->cft->unregister_event(cgrp, event->cft, event->eventfd);
  
  	eventfd_ctx_put(event->eventfd);
0dea11687   Kirill A. Shutemov   cgroup: implement...
2953
  	kfree(event);
a0a4db548   Kirill A. Shutemov   cgroups: remove e...
2954
  	dput(cgrp->dentry);
0dea11687   Kirill A. Shutemov   cgroup: implement...
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
  }
  
  /*
   * Gets called on POLLHUP on eventfd when user closes it.
   *
   * Called with wqh->lock held and interrupts disabled.
   *
   * On POLLHUP the event is taken off the wait queue and off the cgroup's
   * event list, and its removal work is scheduled.  Always returns 0.
   */
  static int cgroup_event_wake(wait_queue_t *wait, unsigned mode,
  		int sync, void *key)
  {
  	struct cgroup_event *event;
  	struct cgroup *cgrp;
  	unsigned long flags = (unsigned long)key;

  	event = container_of(wait, struct cgroup_event, wait);
  	cgrp = event->cgrp;

  	if (!(flags & POLLHUP))
  		return 0;

  	__remove_wait_queue(event->wqh, &event->wait);

  	spin_lock(&cgrp->event_list_lock);
  	list_del(&event->list);
  	spin_unlock(&cgrp->event_list_lock);

  	/*
  	 * This is atomic context, but cgroup_event_remove() may sleep, so
  	 * it has to be run from a workqueue.
  	 */
  	schedule_work(&event->remove);

  	return 0;
  }
  
  /*
   * poll_table queueing callback: records the wait queue head in the event
   * that embeds @pt and adds the event's wait entry to that queue.
   */
  static void cgroup_event_ptable_queue_proc(struct file *file,
  		wait_queue_head_t *wqh, poll_table *pt)
  {
  	struct cgroup_event *event;

  	event = container_of(pt, struct cgroup_event, pt);
  	event->wqh = wqh;
  	add_wait_queue(wqh, &event->wait);
  }
  
  /*
   * Parse input and register new cgroup event handler.
   *
   * Input must be in format '<event_fd> <control_fd> <args>'.
   * Interpretation of args is defined by control file implementation.
   * <args> may be omitted, in which case the control file's register_event
   * callback receives an empty string.
   *
   * Returns 0 on success (including the case where the eventfd was already
   * hung up and the event was immediately unregistered again), -EINVAL on
   * malformed input or a control file without event callbacks, -ENOMEM if
   * the event cannot be allocated, -EBADF for a bad control fd, or the error
   * reported by the eventfd lookup, the permission check, __file_cft() or
   * the register_event callback.
   */
  static int cgroup_write_event_control(struct cgroup *cgrp, struct cftype *cft,
  				      const char *buffer)
  {
  	struct cgroup_event *event = NULL;
  	unsigned int efd, cfd;
  	struct file *efile = NULL;
  	struct file *cfile = NULL;
  	char *endp;
  	int ret;

  	efd = simple_strtoul(buffer, &endp, 10);
  	if (*endp != ' ')
  		return -EINVAL;
  	buffer = endp + 1;

  	cfd = simple_strtoul(buffer, &endp, 10);
  	if (*endp == ' ')
  		buffer = endp + 1;
  	else if (*endp == '\0')
  		/* no <args>: stay on the terminator, never step past it */
  		buffer = endp;
  	else
  		return -EINVAL;

  	event = kzalloc(sizeof(*event), GFP_KERNEL);
  	if (!event)
  		return -ENOMEM;
  	event->cgrp = cgrp;
  	INIT_LIST_HEAD(&event->list);
  	init_poll_funcptr(&event->pt, cgroup_event_ptable_queue_proc);
  	init_waitqueue_func_entry(&event->wait, cgroup_event_wake);
  	INIT_WORK(&event->remove, cgroup_event_remove);

  	efile = eventfd_fget(efd);
  	if (IS_ERR(efile)) {
  		ret = PTR_ERR(efile);
  		goto fail;
  	}

  	event->eventfd = eventfd_ctx_fileget(efile);
  	if (IS_ERR(event->eventfd)) {
  		ret = PTR_ERR(event->eventfd);
  		goto fail;
  	}

  	cfile = fget(cfd);
  	if (!cfile) {
  		ret = -EBADF;
  		goto fail;
  	}

  	/* the process need read permission on control file */
  	ret = file_permission(cfile, MAY_READ);
  	if (ret < 0)
  		goto fail;

  	event->cft = __file_cft(cfile);
  	if (IS_ERR(event->cft)) {
  		ret = PTR_ERR(event->cft);
  		goto fail;
  	}

  	if (!event->cft->register_event || !event->cft->unregister_event) {
  		ret = -EINVAL;
  		goto fail;
  	}

  	ret = event->cft->register_event(cgrp, event->cft,
  			event->eventfd, buffer);
  	if (ret)
  		goto fail;

  	/*
  	 * Hook the event onto the eventfd's wait queue.  If the eventfd has
  	 * already been hung up, undo the registration and report success.
  	 */
  	if (efile->f_op->poll(efile, &event->pt) & POLLHUP) {
  		event->cft->unregister_event(cgrp, event->cft, event->eventfd);
  		ret = 0;
  		goto fail;
  	}

  	/*
  	 * Events should be removed after rmdir of cgroup directory, but before
  	 * destroying subsystem state objects. Let's take reference to cgroup
  	 * directory dentry to do that.
  	 */
  	dget(cgrp->dentry);

  	spin_lock(&cgrp->event_list_lock);
  	list_add(&event->list, &cgrp->event_list);
  	spin_unlock(&cgrp->event_list_lock);

  	fput(cfile);
  	fput(efile);

  	return 0;

  fail:
  	if (cfile)
  		fput(cfile);

  	/* event is always allocated by the time any path jumps here */
  	if (!IS_ERR_OR_NULL(event->eventfd))
  		eventfd_ctx_put(event->eventfd);

  	if (!IS_ERR_OR_NULL(efile))
  		fput(efile);

  	kfree(event);

  	return ret;
  }
97978e6d1   Daniel Lezcano   cgroup: add clone...
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
  /*
   * read_u64 handler for "cgroup.clone_children": reports the value of
   * clone_children() for @cgrp.  @cft is unused.
   */
  static u64 cgroup_clone_children_read(struct cgroup *cgrp,
  				    struct cftype *cft)
  {
  	u64 enabled = clone_children(cgrp);

  	return enabled;
  }
  
  /*
   * write_u64 handler for "cgroup.clone_children": sets CGRP_CLONE_CHILDREN
   * for a non-zero @val and clears it for zero.  Always returns 0.
   */
  static int cgroup_clone_children_write(struct cgroup *cgrp,
  				     struct cftype *cft,
  				     u64 val)
  {
  	if (!val) {
  		clear_bit(CGRP_CLONE_CHILDREN, &cgrp->flags);
  		return 0;
  	}

  	set_bit(CGRP_CLONE_CHILDREN, &cgrp->flags);
  	return 0;
  }
0dea11687   Kirill A. Shutemov   cgroup: implement...
3119
  /*
bbcb81d09   Paul Menage   Task Control Grou...
3120
3121
   * for the common functions, 'private' gives the type of file
   */
102a775e3   Ben Blum   cgroups: add a re...
3122
3123
  /* for hysterical raisins, we can't put this on the older files */
  #define CGROUP_FILE_GENERIC_PREFIX "cgroup."
81a6a5cdd   Paul Menage   Task Control Grou...
3124
3125
3126
3127
  static struct cftype files[] = {
  	{
  		.name = "tasks",
  		.open = cgroup_tasks_open,
af351026a   Paul Menage   cgroup files: tur...
3128
  		.write_u64 = cgroup_tasks_write,
102a775e3   Ben Blum   cgroups: add a re...
3129
  		.release = cgroup_pidlist_release,
099fca322   Li Zefan   cgroups: show cor...
3130
  		.mode = S_IRUGO | S_IWUSR,
81a6a5cdd   Paul Menage   Task Control Grou...
3131
  	},
102a775e3   Ben Blum   cgroups: add a re...
3132
3133
3134
3135
3136
3137
3138
  	{
  		.name = CGROUP_FILE_GENERIC_PREFIX "procs",
  		.open = cgroup_procs_open,
  		/* .write_u64 = cgroup_procs_write, TODO */
  		.release = cgroup_pidlist_release,
  		.mode = S_IRUGO,
  	},
81a6a5cdd   Paul Menage   Task Control Grou...
3139
3140
  	{
  		.name = "notify_on_release",
f4c753b7e   Paul Menage   CGroup API files:...
3141
  		.read_u64 = cgroup_read_notify_on_release,
6379c1061   Paul Menage   cgroup files: mov...
3142
  		.write_u64 = cgroup_write_notify_on_release,
81a6a5cdd   Paul Menage   Task Control Grou...
3143
  	},
0dea11687   Kirill A. Shutemov   cgroup: implement...
3144
3145
3146
3147
3148
  	{
  		.name = CGROUP_FILE_GENERIC_PREFIX "event_control",
  		.write_string = cgroup_write_event_control,
  		.mode = S_IWUGO,
  	},
97978e6d1   Daniel Lezcano   cgroup: add clone...
3149
3150
3151
3152
3153
  	{
  		.name = "cgroup.clone_children",
  		.read_u64 = cgroup_clone_children_read,
  		.write_u64 = cgroup_clone_children_write,
  	},
81a6a5cdd   Paul Menage   Task Control Grou...
3154
3155
3156
3157
  };
  
  static struct cftype cft_release_agent = {
  	.name = "release_agent",
e788e066c   Paul Menage   cgroup files: mov...
3158
3159
3160
  	.read_seq_string = cgroup_release_agent_show,
  	.write_string = cgroup_release_agent_write,
  	.max_write_len = PATH_MAX,
bbcb81d09   Paul Menage   Task Control Grou...
3161
  };
bd89aabc6   Paul Menage   Control groups: R...
3162
  static int cgroup_populate_dir(struct cgroup *cgrp)
ddbcc7e8e   Paul Menage   Task Control Grou...
3163
3164
3165
3166
3167
  {
  	int err;
  	struct cgroup_subsys *ss;
  
  	/* First clear out any existing files */
bd89aabc6   Paul Menage   Control groups: R...
3168
  	cgroup_clear_directory(cgrp->dentry);
ddbcc7e8e   Paul Menage   Task Control Grou...
3169

bd89aabc6   Paul Menage   Control groups: R...
3170
  	err = cgroup_add_files(cgrp, NULL, files, ARRAY_SIZE(files));
bbcb81d09   Paul Menage   Task Control Grou...
3171
3172
  	if (err < 0)
  		return err;
bd89aabc6   Paul Menage   Control groups: R...
3173
3174
  	if (cgrp == cgrp->top_cgroup) {
  		if ((err = cgroup_add_file(cgrp, NULL, &cft_release_agent)) < 0)
81a6a5cdd   Paul Menage   Task Control Grou...
3175
3176
  			return err;
  	}
bd89aabc6   Paul Menage   Control groups: R...
3177
3178
  	for_each_subsys(cgrp->root, ss) {
  		if (ss->populate && (err = ss->populate(ss, cgrp)) < 0)
ddbcc7e8e   Paul Menage   Task Control Grou...
3179
3180
  			return err;
  	}
38460b48d   KAMEZAWA Hiroyuki   cgroup: CSS ID su...
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
  	/* This cgroup is ready now */
  	for_each_subsys(cgrp->root, ss) {
  		struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
  		/*
  		 * Update id->css pointer and make this css visible from
  		 * CSS ID functions. This pointer will be dereferened
  		 * from RCU-read-side without locks.
  		 */
  		if (css->id)
  			rcu_assign_pointer(css->id->css, css);
  	}
ddbcc7e8e   Paul Menage   Task Control Grou...
3192
3193
3194
3195
3196
3197
  
  	return 0;
  }
  
  static void init_cgroup_css(struct cgroup_subsys_state *css,
  			       struct cgroup_subsys *ss,
bd89aabc6   Paul Menage   Control groups: R...
3198
  			       struct cgroup *cgrp)
ddbcc7e8e   Paul Menage   Task Control Grou...
3199
  {
bd89aabc6   Paul Menage   Control groups: R...
3200
  	css->cgroup = cgrp;
e7c5ec919   Paul Menage   cgroups: add css_...
3201
  	atomic_set(&css->refcnt, 1);
ddbcc7e8e   Paul Menage   Task Control Grou...
3202
  	css->flags = 0;
38460b48d   KAMEZAWA Hiroyuki   cgroup: CSS ID su...
3203
  	css->id = NULL;
bd89aabc6   Paul Menage   Control groups: R...
3204
  	if (cgrp == dummytop)
ddbcc7e8e   Paul Menage   Task Control Grou...
3205
  		set_bit(CSS_ROOT, &css->flags);
bd89aabc6   Paul Menage   Control groups: R...
3206
3207
  	BUG_ON(cgrp->subsys[ss->subsys_id]);
  	cgrp->subsys[ss->subsys_id] = css;
ddbcc7e8e   Paul Menage   Task Control Grou...
3208
  }
999cd8a45   Paul Menage   cgroups: add a pe...
3209
3210
3211
3212
  static void cgroup_lock_hierarchy(struct cgroupfs_root *root)
  {
  	/* We need to take each hierarchy_mutex in a consistent order */
  	int i;
aae8aab40   Ben Blum   cgroups: revamp s...
3213
3214
3215
3216
  	/*
  	 * No worry about a race with rebind_subsystems that might mess up the
  	 * locking order, since both parties are under cgroup_mutex.
  	 */
999cd8a45   Paul Menage   cgroups: add a pe...
3217
3218
  	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
  		struct cgroup_subsys *ss = subsys[i];
aae8aab40   Ben Blum   cgroups: revamp s...
3219
3220
  		if (ss == NULL)
  			continue;
999cd8a45   Paul Menage   cgroups: add a pe...
3221
  		if (ss->root == root)
cfebe563b   Li Zefan   cgroups: fix lock...
3222
  			mutex_lock(&ss->hierarchy_mutex);
999cd8a45   Paul Menage   cgroups: add a pe...
3223
3224
3225
3226
3227
3228
3229
3230
3231
  	}
  }
  
  static void cgroup_unlock_hierarchy(struct cgroupfs_root *root)
  {
  	int i;
  
  	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
  		struct cgroup_subsys *ss = subsys[i];
aae8aab40   Ben Blum   cgroups: revamp s...
3232
3233
  		if (ss == NULL)
  			continue;
999cd8a45   Paul Menage   cgroups: add a pe...
3234
3235
3236
3237
  		if (ss->root == root)
  			mutex_unlock(&ss->hierarchy_mutex);
  	}
  }
ddbcc7e8e   Paul Menage   Task Control Grou...
3238
  /*
a043e3b2c   Li Zefan   cgroup: fix comments
3239
3240
3241
3242
   * cgroup_create - create a cgroup
   * @parent: cgroup that will be parent of the new cgroup
   * @dentry: dentry of the new cgroup
   * @mode: mode to set on new inode
ddbcc7e8e   Paul Menage   Task Control Grou...
3243
   *
a043e3b2c   Li Zefan   cgroup: fix comments
3244
   * Must be called with the mutex on the parent inode held
ddbcc7e8e   Paul Menage   Task Control Grou...
3245
   */
ddbcc7e8e   Paul Menage   Task Control Grou...
3246
  static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
099fca322   Li Zefan   cgroups: show cor...
3247
  			     mode_t mode)
ddbcc7e8e   Paul Menage   Task Control Grou...
3248
  {
bd89aabc6   Paul Menage   Control groups: R...
3249
  	struct cgroup *cgrp;
ddbcc7e8e   Paul Menage   Task Control Grou...
3250
3251
3252
3253
  	struct cgroupfs_root *root = parent->root;
  	int err = 0;
  	struct cgroup_subsys *ss;
  	struct super_block *sb = root->sb;
bd89aabc6   Paul Menage   Control groups: R...
3254
3255
  	cgrp = kzalloc(sizeof(*cgrp), GFP_KERNEL);
  	if (!cgrp)
ddbcc7e8e   Paul Menage   Task Control Grou...
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
  		return -ENOMEM;
  
  	/* Grab a reference on the superblock so the hierarchy doesn't
  	 * get deleted on unmount if there are child cgroups.  This
  	 * can be done outside cgroup_mutex, since the sb can't
  	 * disappear while someone has an open control file on the
  	 * fs */
  	atomic_inc(&sb->s_active);
  
  	mutex_lock(&cgroup_mutex);
cc31edcee   Paul Menage   cgroups: convert ...
3266
  	init_cgroup_housekeeping(cgrp);
ddbcc7e8e   Paul Menage   Task Control Grou...
3267

bd89aabc6   Paul Menage   Control groups: R...
3268
3269
3270
  	cgrp->parent = parent;
  	cgrp->root = parent->root;
  	cgrp->top_cgroup = parent->top_cgroup;
ddbcc7e8e   Paul Menage   Task Control Grou...
3271

b6abdb0e6   Li Zefan   cgroup: fix defau...
3272
3273
  	if (notify_on_release(parent))
  		set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
97978e6d1   Daniel Lezcano   cgroup: add clone...
3274
3275
  	if (clone_children(parent))
  		set_bit(CGRP_CLONE_CHILDREN, &cgrp->flags);
ddbcc7e8e   Paul Menage   Task Control Grou...
3276
  	for_each_subsys(root, ss) {
bd89aabc6   Paul Menage   Control groups: R...
3277
  		struct cgroup_subsys_state *css = ss->create(ss, cgrp);
4528fd059   Li Zefan   cgroups: fix to r...
3278

ddbcc7e8e   Paul Menage   Task Control Grou...
3279
3280
3281
3282
  		if (IS_ERR(css)) {
  			err = PTR_ERR(css);
  			goto err_destroy;
  		}
bd89aabc6   Paul Menage   Control groups: R...
3283
  		init_cgroup_css(css, ss, cgrp);
4528fd059   Li Zefan   cgroups: fix to r...
3284
3285
3286
  		if (ss->use_id) {
  			err = alloc_css_id(ss, parent, cgrp);
  			if (err)
38460b48d   KAMEZAWA Hiroyuki   cgroup: CSS ID su...
3287
  				goto err_destroy;
4528fd059   Li Zefan   cgroups: fix to r...
3288
  		}
38460b48d   KAMEZAWA Hiroyuki   cgroup: CSS ID su...
3289
  		/* At error, ->destroy() callback has to free assigned ID. */
97978e6d1   Daniel Lezcano   cgroup: add clone...
3290
3291
  		if (clone_children(parent) && ss->post_clone)
  			ss->post_clone(ss, cgrp);
ddbcc7e8e   Paul Menage   Task Control Grou...
3292
  	}
999cd8a45   Paul Menage   cgroups: add a pe...
3293
  	cgroup_lock_hierarchy(root);
bd89aabc6   Paul Menage   Control groups: R...
3294
  	list_add(&cgrp->sibling, &cgrp->parent->children);
999cd8a45   Paul Menage   cgroups: add a pe...
3295
  	cgroup_unlock_hierarchy(root);
ddbcc7e8e   Paul Menage   Task Control Grou...
3296
  	root->number_of_cgroups++;
bd89aabc6   Paul Menage   Control groups: R...
3297
  	err = cgroup_create_dir(cgrp, dentry, mode);
ddbcc7e8e   Paul Menage   Task Control Grou...
3298
3299
3300
3301
  	if (err < 0)
  		goto err_remove;
  
  	/* The cgroup directory was pre-locked for us */
bd89aabc6   Paul Menage   Control groups: R...
3302
  	BUG_ON(!mutex_is_locked(&cgrp->dentry->d_inode->i_mutex));
ddbcc7e8e   Paul Menage   Task Control Grou...
3303

bd89aabc6   Paul Menage   Control groups: R...
3304
  	err = cgroup_populate_dir(cgrp);
ddbcc7e8e   Paul Menage   Task Control Grou...
3305
3306
3307
  	/* If err < 0, we have a half-filled directory - oh well ;) */
  
  	mutex_unlock(&cgroup_mutex);
bd89aabc6   Paul Menage   Control groups: R...
3308
  	mutex_unlock(&cgrp->dentry->d_inode->i_mutex);
ddbcc7e8e   Paul Menage   Task Control Grou...
3309
3310
3311
3312
  
  	return 0;
  
   err_remove:
baef99a08   KAMEZAWA Hiroyuki   cgroups: use hier...
3313
  	cgroup_lock_hierarchy(root);
bd89aabc6   Paul Menage   Control groups: R...
3314
  	list_del(&cgrp->sibling);
baef99a08   KAMEZAWA Hiroyuki   cgroups: use hier...
3315
  	cgroup_unlock_hierarchy(root);
ddbcc7e8e   Paul Menage   Task Control Grou...
3316
3317
3318
3319
3320
  	root->number_of_cgroups--;
  
   err_destroy:
  
  	for_each_subsys(root, ss) {
bd89aabc6   Paul Menage   Control groups: R...
3321
3322
  		if (cgrp->subsys[ss->subsys_id])
  			ss->destroy(ss, cgrp);
ddbcc7e8e   Paul Menage   Task Control Grou...
3323
3324
3325
3326
3327
3328
  	}
  
  	mutex_unlock(&cgroup_mutex);
  
  	/* Release the reference count that we took on the superblock */
  	deactivate_super(sb);
bd89aabc6   Paul Menage   Control groups: R...
3329
  	kfree(cgrp);
ddbcc7e8e   Paul Menage   Task Control Grou...
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
  	return err;
  }
  
  /*
   * cgroup_mkdir - create a cgroup for a new directory entry.
   * @dir: inode of the containing directory (not used here)
   * @dentry: dentry of the directory being created
   * @mode: requested mode bits; S_IFDIR is OR-ed in before the create call
   *
   * The parent cgroup is taken from the parent dentry's d_fsdata.  Returns
   * whatever cgroup_create() returns.
   */
  static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, int mode)
  {
  	struct dentry *parent_dentry = dentry->d_parent;
  	struct cgroup *parent_cgrp = parent_dentry->d_fsdata;

  	/* the vfs holds inode->i_mutex already */
  	return cgroup_create(parent_cgrp, dentry, mode | S_IFDIR);
  }
55b6fd016   Li Zefan   cgroup: uninline ...
3340
  static int cgroup_has_css_refs(struct cgroup *cgrp)
81a6a5cdd   Paul Menage   Task Control Grou...
3341
3342
3343
  {
  	/* Check the reference count on each subsystem. Since we
  	 * already established that there are no tasks in the
e7c5ec919   Paul Menage   cgroups: add css_...
3344
  	 * cgroup, if the css refcount is also 1, then there should
81a6a5cdd   Paul Menage   Task Control Grou...
3345
3346
3347
3348
3349
3350
3351
  	 * be no outstanding references, so the subsystem is safe to
  	 * destroy. We scan across all subsystems rather than using
  	 * the per-hierarchy linked list of mounted subsystems since
  	 * we can be called via check_for_release() with no
  	 * synchronization other than RCU, and the subsystem linked
  	 * list isn't RCU-safe */
  	int i;
aae8aab40   Ben Blum   cgroups: revamp s...
3352
3353
3354
3355
3356
  	/*
  	 * We won't need to lock the subsys array, because the subsystems
  	 * we're concerned about aren't going anywhere since our cgroup root
  	 * has a reference on them.
  	 */
81a6a5cdd   Paul Menage   Task Control Grou...
3357
3358
3359
  	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
  		struct cgroup_subsys *ss = subsys[i];
  		struct cgroup_subsys_state *css;
aae8aab40   Ben Blum   cgroups: revamp s...
3360
3361
  		/* Skip subsystems not present or not in this hierarchy */
  		if (ss == NULL || ss->root != cgrp->root)
81a6a5cdd   Paul Menage   Task Control Grou...
3362
  			continue;
bd89aabc6   Paul Menage   Control groups: R...
3363
  		css = cgrp->subsys[ss->subsys_id];
81a6a5cdd   Paul Menage   Task Control Grou...
3364
3365
3366
3367
3368
3369
  		/* When called from check_for_release() it's possible
  		 * that by this point the cgroup has been removed
  		 * and the css deleted. But a false-positive doesn't
  		 * matter, since it can only happen if the cgroup
  		 * has been deleted and hence no longer needs the
  		 * release agent to be called anyway. */
e7c5ec919   Paul Menage   cgroups: add css_...
3370
  		if (css && (atomic_read(&css->refcnt) > 1))
81a6a5cdd   Paul Menage   Task Control Grou...
3371
  			return 1;
81a6a5cdd   Paul Menage   Task Control Grou...
3372
3373
3374
  	}
  	return 0;
  }
e7c5ec919   Paul Menage   cgroups: add css_...
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
  /*
   * cgroup_clear_css_refs - all-or-nothing transition of a cgroup's CSS
   * objects to CSS_REMOVED.
   * @cgrp: the cgroup whose subsystem states are to be marked removed
   *
   * Either every CSS object of @cgrp ends up with CSS_REMOVED set, or none
   * does.  Returns true on success, false if the cgroup has busy subsystems.
   * Call with cgroup_mutex held.
   */
  static int cgroup_clear_css_refs(struct cgroup *cgrp)
  {
  	struct cgroup_subsys *ss;
  	struct cgroup_subsys_state *css;
  	unsigned long irq_flags;
  	bool busy = false;

  	local_irq_save(irq_flags);

  	/* Pass 1: take every refcnt from 1 down to 0, stop at a busy css */
  	for_each_subsys(cgrp->root, ss) {
  		css = cgrp->subsys[ss->subsys_id];

  		for (;;) {
  			int cur = atomic_read(&css->refcnt);

  			/* We can only remove a CSS with a refcnt==1 */
  			if (cur > 1) {
  				busy = true;
  				goto finish;
  			}
  			BUG_ON(!cur);
  			/*
  			 * Drop the refcnt to 0 while we check other
  			 * subsystems. This will cause any racing
  			 * css_tryget() to spin until we set the
  			 * CSS_REMOVED bits or abort
  			 */
  			if (atomic_cmpxchg(&css->refcnt, cur, 0) == cur)
  				break;
  			cpu_relax();
  		}
  	}

  finish:
  	/* Pass 2: commit the removal, or roll back what pass 1 zeroed */
  	for_each_subsys(cgrp->root, ss) {
  		css = cgrp->subsys[ss->subsys_id];

  		if (!busy) {
  			/* Commit the fact that the CSS is removed */
  			set_bit(CSS_REMOVED, &css->flags);
  			continue;
  		}
  		/*
  		 * Restore old refcnt if we previously managed
  		 * to clear it from 1 to 0
  		 */
  		if (atomic_read(&css->refcnt) == 0)
  			atomic_set(&css->refcnt, 1);
  	}

  	local_irq_restore(irq_flags);
  	return !busy;
  }
ddbcc7e8e   Paul Menage   Task Control Grou...
3427
3428
  static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
  {
bd89aabc6   Paul Menage   Control groups: R...
3429
  	struct cgroup *cgrp = dentry->d_fsdata;
ddbcc7e8e   Paul Menage   Task Control Grou...
3430
3431
  	struct dentry *d;
  	struct cgroup *parent;
ec64f5154   KAMEZAWA Hiroyuki   cgroup: fix frequ...
3432
  	DEFINE_WAIT(wait);
4ab78683c   Kirill A. Shutemov   cgroups: fix race...
3433
  	struct cgroup_event *event, *tmp;
ec64f5154   KAMEZAWA Hiroyuki   cgroup: fix frequ...
3434
  	int ret;
ddbcc7e8e   Paul Menage   Task Control Grou...
3435
3436
  
  	/* the vfs holds both inode->i_mutex already */
ec64f5154   KAMEZAWA Hiroyuki   cgroup: fix frequ...
3437
  again:
ddbcc7e8e   Paul Menage   Task Control Grou...
3438
  	mutex_lock(&cgroup_mutex);
bd89aabc6   Paul Menage   Control groups: R...
3439
  	if (atomic_read(&cgrp->count) != 0) {
ddbcc7e8e   Paul Menage   Task Control Grou...
3440
3441
3442
  		mutex_unlock(&cgroup_mutex);
  		return -EBUSY;
  	}
bd89aabc6   Paul Menage   Control groups: R...
3443
  	if (!list_empty(&cgrp->children)) {
ddbcc7e8e   Paul Menage   Task Control Grou...
3444
3445
3446
  		mutex_unlock(&cgroup_mutex);
  		return -EBUSY;
  	}
3fa59dfbc   KAMEZAWA Hiroyuki   cgroup: fix poten...
3447
  	mutex_unlock(&cgroup_mutex);
a043e3b2c   Li Zefan   cgroup: fix comments
3448

4fca88c87   KAMEZAWA Hiroyuki   memory cgroup enh...
3449
  	/*
887032670   KAMEZAWA Hiroyuki   cgroup avoid perm...
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
  	 * In general, subsystem has no css->refcnt after pre_destroy(). But
  	 * in racy cases, subsystem may have to get css->refcnt after
  	 * pre_destroy() and it makes rmdir return with -EBUSY. This sometimes
  	 * make rmdir return -EBUSY too often. To avoid that, we use waitqueue
  	 * for cgroup's rmdir. CGRP_WAIT_ON_RMDIR is for synchronizing rmdir
  	 * and subsystem's reference count handling. Please see css_get/put
  	 * and css_tryget() and cgroup_wakeup_rmdir_waiter() implementation.
  	 */
  	set_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
  
  	/*
a043e3b2c   Li Zefan   cgroup: fix comments
3461
3462
  	 * Call pre_destroy handlers of subsys. Notify subsystems
  	 * that rmdir() request comes.
4fca88c87   KAMEZAWA Hiroyuki   memory cgroup enh...
3463
  	 */
ec64f5154   KAMEZAWA Hiroyuki   cgroup: fix frequ...
3464
  	ret = cgroup_call_pre_destroy(cgrp);
887032670   KAMEZAWA Hiroyuki   cgroup avoid perm...
3465
3466
  	if (ret) {
  		clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
ec64f5154   KAMEZAWA Hiroyuki   cgroup: fix frequ...
3467
  		return ret;
887032670   KAMEZAWA Hiroyuki   cgroup avoid perm...
3468
  	}
ddbcc7e8e   Paul Menage   Task Control Grou...
3469

3fa59dfbc   KAMEZAWA Hiroyuki   cgroup: fix poten...
3470
3471
  	mutex_lock(&cgroup_mutex);
  	parent = cgrp->parent;
ec64f5154   KAMEZAWA Hiroyuki   cgroup: fix frequ...
3472
  	if (atomic_read(&cgrp->count) || !list_empty(&cgrp->children)) {
887032670   KAMEZAWA Hiroyuki   cgroup avoid perm...
3473
  		clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
ddbcc7e8e   Paul Menage   Task Control Grou...
3474
3475
3476
  		mutex_unlock(&cgroup_mutex);
  		return -EBUSY;
  	}
ec64f5154   KAMEZAWA Hiroyuki   cgroup: fix frequ...
3477
  	prepare_to_wait(&cgroup_rmdir_waitq, &wait, TASK_INTERRUPTIBLE);
ec64f5154   KAMEZAWA Hiroyuki   cgroup: fix frequ...
3478
3479
  	if (!cgroup_clear_css_refs(cgrp)) {
  		mutex_unlock(&cgroup_mutex);
887032670   KAMEZAWA Hiroyuki   cgroup avoid perm...
3480
3481
3482
3483
3484
3485
  		/*
  		 * Because someone may call cgroup_wakeup_rmdir_waiter() before
  		 * prepare_to_wait(), we need to check this flag.
  		 */
  		if (test_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags))
  			schedule();
ec64f5154   KAMEZAWA Hiroyuki   cgroup: fix frequ...
3486
3487
3488
3489
3490
3491
3492
3493
3494
  		finish_wait(&cgroup_rmdir_waitq, &wait);
  		clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
  		if (signal_pending(current))
  			return -EINTR;
  		goto again;
  	}
  	/* NO css_tryget() can success after here. */
  	finish_wait(&cgroup_rmdir_waitq, &wait);
  	clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
ddbcc7e8e   Paul Menage   Task Control Grou...
3495

81a6a5cdd   Paul Menage   Task Control Grou...
3496
  	spin_lock(&release_list_lock);
bd89aabc6   Paul Menage   Control groups: R...
3497
3498
3499
  	set_bit(CGRP_REMOVED, &cgrp->flags);
  	if (!list_empty(&cgrp->release_list))
  		list_del(&cgrp->release_list);
81a6a5cdd   Paul Menage   Task Control Grou...
3500
  	spin_unlock(&release_list_lock);
999cd8a45   Paul Menage   cgroups: add a pe...
3501
3502
3503
  
  	cgroup_lock_hierarchy(cgrp->root);
  	/* delete this cgroup from parent->children */
bd89aabc6   Paul Menage   Control groups: R...
3504
  	list_del(&cgrp->sibling);
999cd8a45   Paul Menage   cgroups: add a pe...
3505
  	cgroup_unlock_hierarchy(cgrp->root);
bd89aabc6   Paul Menage   Control groups: R...
3506
  	d = dget(cgrp->dentry);
ddbcc7e8e   Paul Menage   Task Control Grou...
3507
3508
3509
  
  	cgroup_d_remove_dir(d);
  	dput(d);
ddbcc7e8e   Paul Menage   Task Control Grou...
3510

bd89aabc6   Paul Menage   Control groups: R...
3511
  	set_bit(CGRP_RELEASABLE, &parent->flags);
81a6a5cdd   Paul Menage   Task Control Grou...
3512
  	check_for_release(parent);
4ab78683c   Kirill A. Shutemov   cgroups: fix race...
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
  	/*
  	 * Unregister events and notify userspace.
  	 * Notify userspace about cgroup removing only after rmdir of cgroup
  	 * directory to avoid race between userspace and kernelspace
  	 */
  	spin_lock(&cgrp->event_list_lock);
  	list_for_each_entry_safe(event, tmp, &cgrp->event_list, list) {
  		list_del(&event->list);
  		remove_wait_queue(event->wqh, &event->wait);
  		eventfd_signal(event->eventfd, 1);
  		schedule_work(&event->remove);
  	}
  	spin_unlock(&cgrp->event_list_lock);
ddbcc7e8e   Paul Menage   Task Control Grou...
3526
  	mutex_unlock(&cgroup_mutex);
ddbcc7e8e   Paul Menage   Task Control Grou...
3527
3528
  	return 0;
  }
06a119204   Li Zefan   cgroup: annotate ...
3529
  /*
   * cgroup_init_subsys - boot-time initialization of one subsystem.
   * @ss: the subsystem to initialize
   *
   * Links @ss into rootnode, creates its state for dummytop and publishes
   * that state through init_css_set.  A failing ss->create() is fatal
   * (BUG), and so is calling this for a subsystem with ss->module set.
   */
  static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
  {
  	struct cgroup_subsys_state *css;

  	printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name);

  	/* Create the top cgroup state for this subsystem */
  	list_add(&ss->sibling, &rootnode.subsys_list);
  	ss->root = &rootnode;
  	css = ss->create(ss, dummytop);
  	/* We don't handle early failures gracefully */
  	BUG_ON(IS_ERR(css));
  	init_cgroup_css(css, ss, dummytop);

  	/* Update the init_css_set to contain a subsys
  	 * pointer to this state - since the subsystem is
  	 * newly registered, all tasks and hence the
  	 * init_css_set is in the subsystem's top cgroup. */
  	init_css_set.subsys[ss->subsys_id] = dummytop->subsys[ss->subsys_id];

  	need_forkexit_callback |= ss->fork || ss->exit;

  	/* At system boot, before all subsystems have been
  	 * registered, no tasks have been forked, so we don't
  	 * need to invoke fork callbacks here. */
  	BUG_ON(!list_empty(&init_task.tasks));

  	mutex_init(&ss->hierarchy_mutex);
  	lockdep_set_class(&ss->hierarchy_mutex, &ss->subsys_key);
  	ss->active = 1;

  	/* this function shouldn't be used with modular subsystems, since they
  	 * need to register a subsys_id, among other things */
  	BUG_ON(ss->module);
  }
  
  /**
   * cgroup_load_subsys: load and register a modular subsystem at runtime
   * @ss: the subsystem to load
   *
   * This function should be called in a modular subsystem's initcall. If the
   * subsystem is built as a module, it will be assigned a new subsys_id and set
   * up for use. If the subsystem is built-in anyway, work is delegated to the
   * simpler cgroup_init_subsys.
   *
   * Returns 0 on success, -EINVAL for a missing/overlong name, missing
   * create/destroy callbacks or the presence of fork/exit callbacks, -EBUSY
   * when no free subsys[] slot is left, or the error reported by
   * ss->create() / cgroup_init_idr().
   */
  int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
  {
  	int i;
  	struct cgroup_subsys_state *css;

  	/* check name and function validity */
  	if (ss->name == NULL || strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN ||
  	    ss->create == NULL || ss->destroy == NULL)
  		return -EINVAL;

  	/*
  	 * we don't support callbacks in modular subsystems. this check is
  	 * before the ss->module check for consistency; a subsystem that could
  	 * be a module should still have no callbacks even if the user isn't
  	 * compiling it as one.
  	 */
  	if (ss->fork || ss->exit)
  		return -EINVAL;

  	/*
  	 * an optionally modular subsystem is built-in: we want to do nothing,
  	 * since cgroup_init_subsys will have already taken care of it.
  	 */
  	if (ss->module == NULL) {
  		/* a few sanity checks */
  		BUG_ON(ss->subsys_id >= CGROUP_BUILTIN_SUBSYS_COUNT);
  		BUG_ON(subsys[ss->subsys_id] != ss);
  		return 0;
  	}

  	/*
  	 * need to register a subsys id before anything else - for example,
  	 * init_cgroup_css needs it.
  	 */
  	mutex_lock(&cgroup_mutex);
  	/* find the first empty slot in the array */
  	for (i = CGROUP_BUILTIN_SUBSYS_COUNT; i < CGROUP_SUBSYS_COUNT; i++) {
  		if (subsys[i] == NULL)
  			break;
  	}
  	if (i == CGROUP_SUBSYS_COUNT) {
  		/* maximum number of subsystems already registered! */
  		mutex_unlock(&cgroup_mutex);
  		return -EBUSY;
  	}
  	/* assign ourselves the subsys_id */
  	ss->subsys_id = i;
  	subsys[i] = ss;

  	/*
  	 * no ss->create seems to need anything important in the ss struct, so
  	 * this can happen first (i.e. before the rootnode attachment).
  	 */
  	css = ss->create(ss, dummytop);
  	if (IS_ERR(css)) {
  		/* failure case - need to deassign the subsys[] slot. */
  		subsys[i] = NULL;
  		mutex_unlock(&cgroup_mutex);
  		return PTR_ERR(css);
  	}

  	list_add(&ss->sibling, &rootnode.subsys_list);
  	ss->root = &rootnode;

  	/* our new subsystem will be attached to the dummy hierarchy. */
  	init_cgroup_css(css, ss, dummytop);
  	/* init_idr must be after init_cgroup_css because it sets css->id. */
  	if (ss->use_id) {
  		int ret = cgroup_init_idr(ss, css);
  		if (ret) {
  			/*
  			 * Unwind in the same order cgroup_unload_subsys
  			 * uses: ss->destroy needs the cgrp->subsys pointer
  			 * to find its state, so it must run before the
  			 * pointer is cleared.
  			 */
  			ss->destroy(ss, dummytop);
  			dummytop->subsys[ss->subsys_id] = NULL;
  			/* don't leave ss linked on rootnode's subsys list */
  			list_del(&ss->sibling);
  			subsys[i] = NULL;
  			mutex_unlock(&cgroup_mutex);
  			return ret;
  		}
  	}

  	/*
  	 * Now we need to entangle the css into the existing css_sets. unlike
  	 * in cgroup_init_subsys, there are now multiple css_sets, so each one
  	 * will need a new pointer to it; done by iterating the css_set_table.
  	 * furthermore, modifying the existing css_sets will corrupt the hash
  	 * table state, so each changed css_set will need its hash recomputed.
  	 * this is all done under the css_set_lock.
  	 */
  	write_lock(&css_set_lock);
  	for (i = 0; i < CSS_SET_TABLE_SIZE; i++) {
  		struct css_set *cg;
  		struct hlist_node *node, *tmp;
  		struct hlist_head *bucket = &css_set_table[i], *new_bucket;

  		hlist_for_each_entry_safe(cg, node, tmp, bucket, hlist) {
  			/* skip entries that we already rehashed */
  			if (cg->subsys[ss->subsys_id])
  				continue;
  			/* remove existing entry */
  			hlist_del(&cg->hlist);
  			/* set new value */
  			cg->subsys[ss->subsys_id] = css;
  			/* recompute hash and restore entry */
  			new_bucket = css_set_hash(cg->subsys);
  			hlist_add_head(&cg->hlist, new_bucket);
  		}
  	}
  	write_unlock(&css_set_lock);

  	mutex_init(&ss->hierarchy_mutex);
  	lockdep_set_class(&ss->hierarchy_mutex, &ss->subsys_key);
  	ss->active = 1;

  	/* success! */
  	mutex_unlock(&cgroup_mutex);
  	return 0;
  }
  EXPORT_SYMBOL_GPL(cgroup_load_subsys);
ddbcc7e8e   Paul Menage   Task Control Grou...
3687
3688
  
  /**
   * cgroup_unload_subsys: unload a modular subsystem
   * @ss: the subsystem to unload
   *
   * This function should be called in a modular subsystem's exitcall. When this
   * function is invoked, the refcount on the subsystem's module will be 0, so
   * the subsystem will not be attached to any hierarchy.
   */
  void cgroup_unload_subsys(struct cgroup_subsys *ss)
  {
  	struct cg_cgroup_link *lnk;

  	/* only modular subsystems may be unloaded */
  	BUG_ON(ss->module == NULL);

  	/*
  	 * we shouldn't be called if the subsystem is in use, and the use of
  	 * try_module_get in parse_cgroupfs_options should ensure that it
  	 * doesn't start being used while we're killing it off.
  	 */
  	BUG_ON(ss->root != &rootnode);

  	mutex_lock(&cgroup_mutex);

  	/* give the subsys_id slot back */
  	BUG_ON(ss->subsys_id < CGROUP_BUILTIN_SUBSYS_COUNT);
  	subsys[ss->subsys_id] = NULL;

  	/* unlink from rootnode's list of subsystems */
  	list_del(&ss->sibling);

  	/*
  	 * disentangle the css from all css_sets attached to the dummytop.
  	 * changing a css_set's subsys[] changes its hash, so every entry is
  	 * pulled out of the table and re-inserted in its new bucket.
  	 */
  	write_lock(&css_set_lock);
  	list_for_each_entry(lnk, &dummytop->css_sets, cgrp_link_list) {
  		struct css_set *cg = lnk->cg;
  		struct hlist_head *bucket;

  		hlist_del(&cg->hlist);
  		BUG_ON(!cg->subsys[ss->subsys_id]);
  		cg->subsys[ss->subsys_id] = NULL;
  		bucket = css_set_hash(cg->subsys);
  		hlist_add_head(&cg->hlist, bucket);
  	}
  	write_unlock(&css_set_lock);

  	/*
  	 * remove subsystem's css from the dummytop and free it - need to free
  	 * before marking as null because ss->destroy needs the cgrp->subsys
  	 * pointer to find their state. note that this also takes care of
  	 * freeing the css_id.
  	 */
  	ss->destroy(ss, dummytop);
  	dummytop->subsys[ss->subsys_id] = NULL;

  	mutex_unlock(&cgroup_mutex);
  }
  EXPORT_SYMBOL_GPL(cgroup_unload_subsys);
  
  /**
   * cgroup_init_early - cgroup initialization at system boot
   *
   * Initialize cgroups at system boot, and initialize any
   * subsystems that request early init.
   *
   * Sets up init_css_set, rootnode and the css_set hash table, links
   * init_css_set to dummytop, sanity-checks every built-in subsystem (BUG on
   * a missing name/callback or mismatched subsys_id) and always returns 0.
   */
  int __init cgroup_init_early(void)
  {
  	int i;

  	atomic_set(&init_css_set.refcount, 1);
  	INIT_LIST_HEAD(&init_css_set.cg_links);
  	INIT_LIST_HEAD(&init_css_set.tasks);
  	INIT_HLIST_NODE(&init_css_set.hlist);
  	css_set_count = 1;
  	init_cgroup_root(&rootnode);
  	root_count = 1;
  	init_task.cgroups = &init_css_set;

  	/* Link init_css_set and dummytop through the static link object */
  	init_css_set_link.cg = &init_css_set;
  	init_css_set_link.cgrp = dummytop;
  	list_add(&init_css_set_link.cgrp_link_list,
  		 &rootnode.top_cgroup.css_sets);
  	list_add(&init_css_set_link.cg_link_list,
  		 &init_css_set.cg_links);

  	for (i = 0; i < CSS_SET_TABLE_SIZE; i++)
  		INIT_HLIST_HEAD(&css_set_table[i]);

  	/* at bootup time, we don't worry about modular subsystems */
  	for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
  		struct cgroup_subsys *ss = subsys[i];

  		BUG_ON(!ss->name);
  		BUG_ON(strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN);
  		BUG_ON(!ss->create);
  		BUG_ON(!ss->destroy);
  		/* A built-in subsystem's id must equal its subsys[] index */
  		if (ss->subsys_id != i) {
  			printk(KERN_ERR "cgroup: Subsys %s id == %d\n",
  			       ss->name, ss->subsys_id);
  			BUG();
  		}

  		if (ss->early_init)
  			cgroup_init_subsys(ss);
  	}
  	return 0;
  }
  
  /**
   * cgroup_init - cgroup initialization
   *
   * Register cgroup filesystem and /proc file, and initialize
   * any subsystems that didn't request early init.
   *
   * Returns 0 on success; otherwise the error from bdi_init(), -ENOMEM if
   * the "cgroup" kobject cannot be created, or the error from
   * register_filesystem().
   */
  int __init cgroup_init(void)
  {
  	struct hlist_head *bucket;
  	int idx;
  	int err;

  	err = bdi_init(&cgroup_backing_dev_info);
  	if (err)
  		return err;

  	/* at bootup time, we don't worry about modular subsystems */
  	for (idx = 0; idx < CGROUP_BUILTIN_SUBSYS_COUNT; idx++) {
  		struct cgroup_subsys *ss = subsys[idx];

  		/* early_init subsystems were handled in cgroup_init_early() */
  		if (!ss->early_init)
  			cgroup_init_subsys(ss);
  		if (ss->use_id)
  			cgroup_init_idr(ss, init_css_set.subsys[ss->subsys_id]);
  	}

  	/* Add init_css_set to the hash table */
  	bucket = css_set_hash(init_css_set.subsys);
  	hlist_add_head(&init_css_set.hlist, bucket);
  	BUG_ON(!init_root_id(&rootnode));

  	cgroup_kobj = kobject_create_and_add("cgroup", fs_kobj);
  	if (cgroup_kobj) {
  		err = register_filesystem(&cgroup_fs_type);
  		if (err < 0)
  			kobject_put(cgroup_kobj);
  		else
  			proc_create("cgroups", 0, NULL,
  				    &proc_cgroupstats_operations);
  	} else {
  		err = -ENOMEM;
  	}

  	/* Undo bdi_init() on any failure */
  	if (err)
  		bdi_destroy(&cgroup_backing_dev_info);
  	return err;
  }
b4f48b636   Paul Menage   Task Control Grou...
3842

a424316ca   Paul Menage   Task Control Grou...
3843
3844
3845
3846
3847
3848
  /*
   * proc_cgroup_show()
   *  - Print task's cgroup paths into seq_file, one line for each hierarchy
   *  - Used for /proc/<pid>/cgroup.
   *  - No need to task_lock(tsk) on this tsk->cgroup reference, as it
   *    doesn't really matter if tsk->cgroup changes after we read it,
956db3ca0   Cliff Wickman   hotplug cpu: move...
3849
   *    and we take cgroup_mutex, keeping cgroup_attach_task() from changing it
a424316ca   Paul Menage   Task Control Grou...
3850
3851
3852
3853
3854
3855
3856
3857
3858
3859
3860
3861
3862
3863
3864
3865
3866
3867
3868
3869
3870
3871
3872
3873
3874
3875
3876
3877
   *    anyway.  No need to check that tsk->cgroup != NULL, thanks to
   *    the_top_cgroup_hack in cgroup_exit(), which sets an exiting tasks
   *    cgroup to top_cgroup.
   */
  
  /* TODO: Use a proper seq_file iterator */
  static int proc_cgroup_show(struct seq_file *m, void *v)
  {
  	struct pid *pid;
  	struct task_struct *tsk;
  	char *buf;
  	int retval;
  	struct cgroupfs_root *root;
  
  	retval = -ENOMEM;
  	buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
  	if (!buf)
  		goto out;
  
  	retval = -ESRCH;
  	pid = m->private;
  	tsk = get_pid_task(pid, PIDTYPE_PID);
  	if (!tsk)
  		goto out_free;
  
  	retval = 0;
  
  	mutex_lock(&cgroup_mutex);
e5f6a8609   Li Zefan   cgroups: make roo...
3878
  	for_each_active_root(root) {
a424316ca   Paul Menage   Task Control Grou...
3879
  		struct cgroup_subsys *ss;
bd89aabc6   Paul Menage   Control groups: R...
3880
  		struct cgroup *cgrp;
a424316ca   Paul Menage   Task Control Grou...
3881
  		int count = 0;
2c6ab6d20   Paul Menage   cgroups: allow cg...
3882
  		seq_printf(m, "%d:", root->hierarchy_id);
a424316ca   Paul Menage   Task Control Grou...
3883
3884
  		for_each_subsys(root, ss)
  			seq_printf(m, "%s%s", count++ ? "," : "", ss->name);
c6d57f331   Paul Menage   cgroups: support ...
3885
3886
3887
  		if (strlen(root->name))
  			seq_printf(m, "%sname=%s", count ? "," : "",
  				   root->name);
a424316ca   Paul Menage   Task Control Grou...
3888
  		seq_putc(m, ':');
7717f7ba9   Paul Menage   cgroups: add a ba...
3889
  		cgrp = task_cgroup_from_root(tsk, root);
bd89aabc6   Paul Menage   Control groups: R...
3890
  		retval = cgroup_path(cgrp, buf, PAGE_SIZE);
a424316ca   Paul Menage   Task Control Grou...
3891
3892
3893
3894
3895
3896
3897
3898
3899
3900
3901
3902
3903
3904
3905
3906
3907
3908
3909
3910
3911
  		if (retval < 0)
  			goto out_unlock;
  		seq_puts(m, buf);
  		seq_putc(m, '
  ');
  	}
  
  out_unlock:
  	mutex_unlock(&cgroup_mutex);
  	put_task_struct(tsk);
  out_free:
  	kfree(buf);
  out:
  	return retval;
  }
  
  /*
   * cgroup_open - open handler that binds proc_cgroup_show() to @file.
   *
   * The struct pid stored in the proc inode is handed to single_open() as
   * the seq_file's private data.  Returns single_open()'s result.
   */
  static int cgroup_open(struct inode *inode, struct file *file)
  {
  	struct pid *task_pid = PROC_I(inode)->pid;

  	return single_open(file, proc_cgroup_show, task_pid);
  }
828c09509   Alexey Dobriyan   const: constify r...
3912
  const struct file_operations proc_cgroup_operations = {
a424316ca   Paul Menage   Task Control Grou...
3913
3914
3915
3916
3917
3918
3919
3920
3921
3922
  	.open		= cgroup_open,
  	.read		= seq_read,
  	.llseek		= seq_lseek,
  	.release	= single_release,
  };
  
  /* Display information about each subsystem and each hierarchy */
  static int proc_cgroupstats_show(struct seq_file *m, void *v)
  {
  	int i;
a424316ca   Paul Menage   Task Control Grou...
3923

8bab8dded   Paul Menage   cgroups: add cgro...
3924
3925
  	seq_puts(m, "#subsys_name\thierarchy\tnum_cgroups\tenabled
  ");
aae8aab40   Ben Blum   cgroups: revamp s...
3926
3927
3928
3929
3930
  	/*
  	 * ideally we don't want subsystems moving around while we do this.
  	 * cgroup_mutex is also necessary to guarantee an atomic snapshot of
  	 * subsys/hierarchy state.
  	 */
a424316ca   Paul Menage   Task Control Grou...
3931
  	mutex_lock(&cgroup_mutex);
a424316ca   Paul Menage   Task Control Grou...
3932
3933
  	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
  		struct cgroup_subsys *ss = subsys[i];
aae8aab40   Ben Blum   cgroups: revamp s...
3934
3935
  		if (ss == NULL)
  			continue;
2c6ab6d20   Paul Menage   cgroups: allow cg...
3936
3937
3938
  		seq_printf(m, "%s\t%d\t%d\t%d
  ",
  			   ss->name, ss->root->hierarchy_id,
8bab8dded   Paul Menage   cgroups: add cgro...
3939
  			   ss->root->number_of_cgroups, !ss->disabled);
a424316ca   Paul Menage   Task Control Grou...
3940
3941
3942
3943
3944
3945
3946
  	}
  	mutex_unlock(&cgroup_mutex);
  	return 0;
  }
  
  static int cgroupstats_open(struct inode *inode, struct file *file)
  {
9dce07f1a   Al Viro   NULL noise: fs/*,...
3947
  	return single_open(file, proc_cgroupstats_show, NULL);
a424316ca   Paul Menage   Task Control Grou...
3948
  }
828c09509   Alexey Dobriyan   const: constify r...
3949
  static const struct file_operations proc_cgroupstats_operations = {
a424316ca   Paul Menage   Task Control Grou...
3950
3951
3952
3953
3954
  	.open = cgroupstats_open,
  	.read = seq_read,
  	.llseek = seq_lseek,
  	.release = single_release,
  };
b4f48b636   Paul Menage   Task Control Grou...
3955
3956
  /**
   * cgroup_fork - attach newly forked task to its parents cgroup.
   * @child: pointer to task_struct of the newly forked child process.
   *
   * Description: A task inherits its parent's cgroup at fork().
   *
   * A pointer to the shared css_set was automatically copied in
   * fork.c by dup_task_struct().  However, we ignore that copy, since
   * it was not made under the protection of RCU or cgroup_mutex, so
   * might no longer be a valid cgroup pointer.  cgroup_attach_task() might
   * have already changed current->cgroups, allowing the previously
   * referenced cgroup group to be removed and freed.
   *
   * At the point that cgroup_fork() is called, 'current' is the parent
   * task, and the passed argument 'child' points to the child task.
   */
  void cgroup_fork(struct task_struct *child)
  {
  	/*
  	 * Re-read the parent's css_set and take a reference on it while
  	 * holding the parent's task lock.
  	 */
  	task_lock(current);
  	child->cgroups = current->cgroups;
  	get_css_set(child->cgroups);
  	task_unlock(current);
  	/* The child starts out unlinked from any css_set task list. */
  	INIT_LIST_HEAD(&child->cg_list);
  }
  
  /**
a043e3b2c   Li Zefan   cgroup: fix comments
3981
3982
3983
3984
3985
3986
   * cgroup_fork_callbacks - run fork callbacks
   * @child: the new task
   *
   * Called on a new task very soon before adding it to the
   * tasklist. No need to take any locks since no-one can
   * be operating on this task.
b4f48b636   Paul Menage   Task Control Grou...
3987
3988
3989
3990
3991
   */
  void cgroup_fork_callbacks(struct task_struct *child)
  {
  	if (need_forkexit_callback) {
  		int i;
aae8aab40   Ben Blum   cgroups: revamp s...
3992
3993
3994
3995
3996
3997
  		/*
  		 * forkexit callbacks are only supported for builtin
  		 * subsystems, and the builtin section of the subsys array is
  		 * immutable, so we don't need to lock the subsys array here.
  		 */
  		for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
b4f48b636   Paul Menage   Task Control Grou...
3998
3999
4000
4001
4002
4003
4004
4005
  			struct cgroup_subsys *ss = subsys[i];
  			if (ss->fork)
  				ss->fork(ss, child);
  		}
  	}
  }
  
  /**
a043e3b2c   Li Zefan   cgroup: fix comments
4006
4007
4008
4009
4010
4011
4012
4013
   * cgroup_post_fork - called on a new task after adding it to the task list
   * @child: the task in question
   *
   * Adds the task to the list running through its css_set if necessary.
   * Has to be after the task is visible on the task list in case we race
   * with the first call to cgroup_iter_start() - to guarantee that the
   * new task ends up on its list.
   */
817929ec2   Paul Menage   Task Control Grou...
4014
4015
4016
4017
  void cgroup_post_fork(struct task_struct *child)
  {
  	if (use_task_css_set_links) {
  		write_lock(&css_set_lock);
b12b533fa   Lai Jiangshan   cgroups: add lock...
4018
  		task_lock(child);
817929ec2   Paul Menage   Task Control Grou...
4019
4020
  		if (list_empty(&child->cg_list))
  			list_add(&child->cg_list, &child->cgroups->tasks);
b12b533fa   Lai Jiangshan   cgroups: add lock...
4021
  		task_unlock(child);
817929ec2   Paul Menage   Task Control Grou...
4022
4023
4024
4025
  		write_unlock(&css_set_lock);
  	}
  }
  /**
b4f48b636   Paul Menage   Task Control Grou...
4026
4027
   * cgroup_exit - detach cgroup from exiting task
   * @tsk: pointer to task_struct of exiting process
a043e3b2c   Li Zefan   cgroup: fix comments
4028
   * @run_callback: run exit callbacks?
b4f48b636   Paul Menage   Task Control Grou...
4029
4030
4031
4032
4033
4034
4035
4036
4037
4038
4039
4040
4041
4042
4043
4044
4045
4046
4047
4048
4049
4050
4051
4052
4053
4054
4055
4056
   *
   * Description: Detach cgroup from @tsk and release it.
   *
   * Note that cgroups marked notify_on_release force every task in
   * them to take the global cgroup_mutex mutex when exiting.
   * This could impact scaling on very large systems.  Be reluctant to
   * use notify_on_release cgroups where very high task exit scaling
   * is required on large systems.
   *
   * the_top_cgroup_hack:
   *
   *    Set the exiting tasks cgroup to the root cgroup (top_cgroup).
   *
   *    We call cgroup_exit() while the task is still competent to
   *    handle notify_on_release(), then leave the task attached to the
   *    root cgroup in each hierarchy for the remainder of its exit.
   *
   *    To do this properly, we would increment the reference count on
   *    top_cgroup, and near the very end of the kernel/exit.c do_exit()
   *    code we would add a second cgroup function call, to drop that
   *    reference.  This would just create an unnecessary hot spot on
   *    the top_cgroup reference count, to no avail.
   *
   *    Normally, holding a reference to a cgroup without bumping its
   *    count is unsafe.   The cgroup could go away, or someone could
   *    attach us to a different cgroup, decrementing the count on
   *    the first cgroup that we never incremented.  But in this case,
   *    top_cgroup isn't going away, and either task has PF_EXITING set,
956db3ca0   Cliff Wickman   hotplug cpu: move...
4057
4058
   *    which wards off any cgroup_attach_task() attempts, or task is a failed
   *    fork, never visible to cgroup_attach_task.
b4f48b636   Paul Menage   Task Control Grou...
4059
4060
4061
4062
   */
  void cgroup_exit(struct task_struct *tsk, int run_callbacks)
  {
  	int i;
817929ec2   Paul Menage   Task Control Grou...
4063
  	struct css_set *cg;
b4f48b636   Paul Menage   Task Control Grou...
4064
4065
  
  	if (run_callbacks && need_forkexit_callback) {
aae8aab40   Ben Blum   cgroups: revamp s...
4066
4067
4068
4069
4070
  		/*
  		 * modular subsystems can't use callbacks, so no need to lock
  		 * the subsys array
  		 */
  		for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
b4f48b636   Paul Menage   Task Control Grou...
4071
4072
4073
4074
4075
  			struct cgroup_subsys *ss = subsys[i];
  			if (ss->exit)
  				ss->exit(ss, tsk);
  		}
  	}
817929ec2   Paul Menage   Task Control Grou...
4076
4077
4078
4079
4080
4081
4082
4083
4084
4085
4086
4087
  
  	/*
  	 * Unlink from the css_set task list if necessary.
  	 * Optimistically check cg_list before taking
  	 * css_set_lock
  	 */
  	if (!list_empty(&tsk->cg_list)) {
  		write_lock(&css_set_lock);
  		if (!list_empty(&tsk->cg_list))
  			list_del(&tsk->cg_list);
  		write_unlock(&css_set_lock);
  	}
b4f48b636   Paul Menage   Task Control Grou...
4088
4089
  	/* Reassign the task to the init_css_set. */
  	task_lock(tsk);
817929ec2   Paul Menage   Task Control Grou...
4090
4091
  	cg = tsk->cgroups;
  	tsk->cgroups = &init_css_set;
b4f48b636   Paul Menage   Task Control Grou...
4092
  	task_unlock(tsk);
817929ec2   Paul Menage   Task Control Grou...
4093
  	if (cg)
81a6a5cdd   Paul Menage   Task Control Grou...
4094
  		put_css_set_taskexit(cg);
b4f48b636   Paul Menage   Task Control Grou...
4095
  }
697f41610   Paul Menage   Task Control Grou...
4096
4097
  
  /**
a043e3b2c   Li Zefan   cgroup: fix comments
4098
4099
4100
   * cgroup_clone - clone the cgroup the given subsystem is attached to
   * @tsk: the task to be moved
   * @subsys: the given subsystem
e885dcde7   Serge E. Hallyn   cgroup_clone: use...
4101
   * @nodename: the name for the new cgroup
a043e3b2c   Li Zefan   cgroup: fix comments
4102
4103
4104
4105
   *
   * Duplicate the current cgroup in the hierarchy that the given
   * subsystem is attached to, and move this task into the new
   * child.
697f41610   Paul Menage   Task Control Grou...
4106
   */
e885dcde7   Serge E. Hallyn   cgroup_clone: use...
4107
4108
  int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys,
  							char *nodename)
697f41610   Paul Menage   Task Control Grou...
4109
4110
4111
  {
  	struct dentry *dentry;
  	int ret = 0;
697f41610   Paul Menage   Task Control Grou...
4112
4113
4114
4115
4116
4117
4118
4119
4120
4121
4122
4123
4124
4125
4126
  	struct cgroup *parent, *child;
  	struct inode *inode;
  	struct css_set *cg;
  	struct cgroupfs_root *root;
  	struct cgroup_subsys *ss;
  
  	/* We shouldn't be called by an unregistered subsystem */
  	BUG_ON(!subsys->active);
  
  	/* First figure out what hierarchy and cgroup we're dealing
  	 * with, and pin them so we can drop cgroup_mutex */
  	mutex_lock(&cgroup_mutex);
   again:
  	root = subsys->root;
  	if (root == &rootnode) {
697f41610   Paul Menage   Task Control Grou...
4127
4128
4129
  		mutex_unlock(&cgroup_mutex);
  		return 0;
  	}
697f41610   Paul Menage   Task Control Grou...
4130

697f41610   Paul Menage   Task Control Grou...
4131
  	/* Pin the hierarchy */
1404f0656   Li Zefan   cgroups: fix lock...
4132
  	if (!atomic_inc_not_zero(&root->sb->s_active)) {
7b574b7b0   Li Zefan   cgroups: fix a ra...
4133
4134
4135
4136
  		/* We race with the final deactivate_super() */
  		mutex_unlock(&cgroup_mutex);
  		return 0;
  	}
697f41610   Paul Menage   Task Control Grou...
4137

817929ec2   Paul Menage   Task Control Grou...
4138
  	/* Keep the cgroup alive */
1404f0656   Li Zefan   cgroups: fix lock...
4139
4140
4141
  	task_lock(tsk);
  	parent = task_cgroup(tsk, subsys->subsys_id);
  	cg = tsk->cgroups;
817929ec2   Paul Menage   Task Control Grou...
4142
  	get_css_set(cg);
104cbd553   Lai Jiangshan   cgroups: use task...
4143
  	task_unlock(tsk);
1404f0656   Li Zefan   cgroups: fix lock...
4144

697f41610   Paul Menage   Task Control Grou...
4145
4146
4147
4148
4149
4150
4151
4152
4153
4154
4155
  	mutex_unlock(&cgroup_mutex);
  
  	/* Now do the VFS work to create a cgroup */
  	inode = parent->dentry->d_inode;
  
  	/* Hold the parent directory mutex across this operation to
  	 * stop anyone else deleting the new cgroup */
  	mutex_lock(&inode->i_mutex);
  	dentry = lookup_one_len(nodename, parent->dentry, strlen(nodename));
  	if (IS_ERR(dentry)) {
  		printk(KERN_INFO
cfe36bde5   Diego Calleja   Improve cgroup pr...
4156
4157
  		       "cgroup: Couldn't allocate dentry for %s: %ld
  ", nodename,
697f41610   Paul Menage   Task Control Grou...
4158
4159
4160
4161
4162
4163
  		       PTR_ERR(dentry));
  		ret = PTR_ERR(dentry);
  		goto out_release;
  	}
  
  	/* Create the cgroup directory, which also creates the cgroup */
75139b827   Li Zefan   cgroups: remove s...
4164
  	ret = vfs_mkdir(inode, dentry, 0755);
bd89aabc6   Paul Menage   Control groups: R...
4165
  	child = __d_cgrp(dentry);
697f41610   Paul Menage   Task Control Grou...
4166
4167
4168
4169
4170
4171
4172
4173
  	dput(dentry);
  	if (ret) {
  		printk(KERN_INFO
  		       "Failed to create cgroup %s: %d
  ", nodename,
  		       ret);
  		goto out_release;
  	}
697f41610   Paul Menage   Task Control Grou...
4174
4175
4176
4177
4178
4179
4180
4181
  	/* The cgroup now exists. Retake cgroup_mutex and check
  	 * that we're still in the same state that we thought we
  	 * were. */
  	mutex_lock(&cgroup_mutex);
  	if ((root != subsys->root) ||
  	    (parent != task_cgroup(tsk, subsys->subsys_id))) {
  		/* Aargh, we raced ... */
  		mutex_unlock(&inode->i_mutex);
817929ec2   Paul Menage   Task Control Grou...
4182
  		put_css_set(cg);
697f41610   Paul Menage   Task Control Grou...
4183

1404f0656   Li Zefan   cgroups: fix lock...
4184
  		deactivate_super(root->sb);
697f41610   Paul Menage   Task Control Grou...
4185
4186
4187
4188
4189
4190
4191
4192
4193
4194
4195
4196
4197
4198
4199
4200
4201
  		/* The cgroup is still accessible in the VFS, but
  		 * we're not going to try to rmdir() it at this
  		 * point. */
  		printk(KERN_INFO
  		       "Race in cgroup_clone() - leaking cgroup %s
  ",
  		       nodename);
  		goto again;
  	}
  
  	/* do any required auto-setup */
  	for_each_subsys(root, ss) {
  		if (ss->post_clone)
  			ss->post_clone(ss, child);
  	}
  
  	/* All seems fine. Finish by moving the task into the new cgroup */
956db3ca0   Cliff Wickman   hotplug cpu: move...
4202
  	ret = cgroup_attach_task(child, tsk);
697f41610   Paul Menage   Task Control Grou...
4203
4204
4205
4206
  	mutex_unlock(&cgroup_mutex);
  
   out_release:
  	mutex_unlock(&inode->i_mutex);
81a6a5cdd   Paul Menage   Task Control Grou...
4207
4208
  
  	mutex_lock(&cgroup_mutex);
817929ec2   Paul Menage   Task Control Grou...
4209
  	put_css_set(cg);
81a6a5cdd   Paul Menage   Task Control Grou...
4210
  	mutex_unlock(&cgroup_mutex);
1404f0656   Li Zefan   cgroups: fix lock...
4211
  	deactivate_super(root->sb);
697f41610   Paul Menage   Task Control Grou...
4212
4213
  	return ret;
  }
a043e3b2c   Li Zefan   cgroup: fix comments
4214
  /**
313e924c0   Grzegorz Nosek   cgroups: relax ns...
4215
   * cgroup_is_descendant - see if @cgrp is a descendant of @task's cgrp
a043e3b2c   Li Zefan   cgroup: fix comments
4216
   * @cgrp: the cgroup in question
313e924c0   Grzegorz Nosek   cgroups: relax ns...
4217
   * @task: the task in question
a043e3b2c   Li Zefan   cgroup: fix comments
4218
   *
313e924c0   Grzegorz Nosek   cgroups: relax ns...
4219
4220
   * See if @cgrp is a descendant of @task's cgroup in the appropriate
   * hierarchy.
697f41610   Paul Menage   Task Control Grou...
4221
4222
4223
4224
4225
4226
   *
   * If we are sending in dummytop, then presumably we are creating
   * the top cgroup in the subsystem.
   *
   * Called only by the ns (nsproxy) cgroup.
   */
313e924c0   Grzegorz Nosek   cgroups: relax ns...
4227
  int cgroup_is_descendant(const struct cgroup *cgrp, struct task_struct *task)
697f41610   Paul Menage   Task Control Grou...
4228
4229
4230
  {
  	int ret;
  	struct cgroup *target;
697f41610   Paul Menage   Task Control Grou...
4231

bd89aabc6   Paul Menage   Control groups: R...
4232
  	if (cgrp == dummytop)
697f41610   Paul Menage   Task Control Grou...
4233
  		return 1;
7717f7ba9   Paul Menage   cgroups: add a ba...
4234
  	target = task_cgroup_from_root(task, cgrp->root);
bd89aabc6   Paul Menage   Control groups: R...
4235
4236
4237
  	while (cgrp != target && cgrp!= cgrp->top_cgroup)
  		cgrp = cgrp->parent;
  	ret = (cgrp == target);
697f41610   Paul Menage   Task Control Grou...
4238
4239
  	return ret;
  }
81a6a5cdd   Paul Menage   Task Control Grou...
4240

bd89aabc6   Paul Menage   Control groups: R...
4241
  static void check_for_release(struct cgroup *cgrp)
81a6a5cdd   Paul Menage   Task Control Grou...
4242
4243
4244
  {
  	/* All of these checks rely on RCU to keep the cgroup
  	 * structure alive */
bd89aabc6   Paul Menage   Control groups: R...
4245
4246
  	if (cgroup_is_releasable(cgrp) && !atomic_read(&cgrp->count)
  	    && list_empty(&cgrp->children) && !cgroup_has_css_refs(cgrp)) {
81a6a5cdd   Paul Menage   Task Control Grou...
4247
4248
4249
4250
4251
  		/* Control Group is currently removeable. If it's not
  		 * already queued for a userspace notification, queue
  		 * it now */
  		int need_schedule_work = 0;
  		spin_lock(&release_list_lock);
bd89aabc6   Paul Menage   Control groups: R...
4252
4253
4254
  		if (!cgroup_is_removed(cgrp) &&
  		    list_empty(&cgrp->release_list)) {
  			list_add(&cgrp->release_list, &release_list);
81a6a5cdd   Paul Menage   Task Control Grou...
4255
4256
4257
4258
4259
4260
4261
  			need_schedule_work = 1;
  		}
  		spin_unlock(&release_list_lock);
  		if (need_schedule_work)
  			schedule_work(&release_agent_work);
  	}
  }
d7b9fff71   Daisuke Nishimura   cgroup: introduce...
4262
4263
  /* Caller must verify that the css is not for root cgroup */
  void __css_put(struct cgroup_subsys_state *css, int count)
81a6a5cdd   Paul Menage   Task Control Grou...
4264
  {
bd89aabc6   Paul Menage   Control groups: R...
4265
  	struct cgroup *cgrp = css->cgroup;
3dece8347   KAMEZAWA Hiroyuki   cgroup: catch bad...
4266
  	int val;
81a6a5cdd   Paul Menage   Task Control Grou...
4267
  	rcu_read_lock();
d7b9fff71   Daisuke Nishimura   cgroup: introduce...
4268
  	val = atomic_sub_return(count, &css->refcnt);
3dece8347   KAMEZAWA Hiroyuki   cgroup: catch bad...
4269
  	if (val == 1) {
ec64f5154   KAMEZAWA Hiroyuki   cgroup: fix frequ...
4270
4271
4272
4273
  		if (notify_on_release(cgrp)) {
  			set_bit(CGRP_RELEASABLE, &cgrp->flags);
  			check_for_release(cgrp);
  		}
887032670   KAMEZAWA Hiroyuki   cgroup avoid perm...
4274
  		cgroup_wakeup_rmdir_waiter(cgrp);
81a6a5cdd   Paul Menage   Task Control Grou...
4275
4276
  	}
  	rcu_read_unlock();
3dece8347   KAMEZAWA Hiroyuki   cgroup: catch bad...
4277
  	WARN_ON_ONCE(val < 1);
81a6a5cdd   Paul Menage   Task Control Grou...
4278
  }
67523c48a   Ben Blum   cgroups: blkio su...
4279
  EXPORT_SYMBOL_GPL(__css_put);
81a6a5cdd   Paul Menage   Task Control Grou...
4280
4281
4282
4283
4284
4285
4286
4287
4288
4289
4290
4291
4292
4293
4294
4295
4296
4297
4298
4299
4300
4301
4302
  
  /*
   * Notify userspace when a cgroup is released, by running the
   * configured release agent with the name of the cgroup (path
   * relative to the root of cgroup file system) as the argument.
   *
   * Most likely, this user command will try to rmdir this cgroup.
   *
   * This races with the possibility that some other task will be
   * attached to this cgroup before it is removed, or that some other
   * user task will 'mkdir' a child cgroup of this cgroup.  That's ok.
   * The presumed 'rmdir' will fail quietly if this cgroup is no longer
   * unused, and this cgroup will be reprieved from its death sentence,
   * to continue to serve a useful existence.  Next time it's released,
   * we will get notified again, if it still has 'notify_on_release' set.
   *
   * The final arg to call_usermodehelper() is UMH_WAIT_EXEC, which
   * means only wait until the task is successfully execve()'d.  The
   * separate release agent task is forked by call_usermodehelper(),
   * then control in this thread returns here, without waiting for the
   * release agent task.  We don't bother to wait because the caller of
   * this routine has no use for the exit status of the release agent
   * task, so no sense holding our caller up for that.
81a6a5cdd   Paul Menage   Task Control Grou...
4303
   */
81a6a5cdd   Paul Menage   Task Control Grou...
4304
4305
4306
4307
4308
4309
4310
4311
  static void cgroup_release_agent(struct work_struct *work)
  {
  	BUG_ON(work != &release_agent_work);
  	mutex_lock(&cgroup_mutex);
  	spin_lock(&release_list_lock);
  	while (!list_empty(&release_list)) {
  		char *argv[3], *envp[3];
  		int i;
e788e066c   Paul Menage   cgroup files: mov...
4312
  		char *pathbuf = NULL, *agentbuf = NULL;
bd89aabc6   Paul Menage   Control groups: R...
4313
  		struct cgroup *cgrp = list_entry(release_list.next,
81a6a5cdd   Paul Menage   Task Control Grou...
4314
4315
  						    struct cgroup,
  						    release_list);
bd89aabc6   Paul Menage   Control groups: R...
4316
  		list_del_init(&cgrp->release_list);
81a6a5cdd   Paul Menage   Task Control Grou...
4317
4318
  		spin_unlock(&release_list_lock);
  		pathbuf = kmalloc(PAGE_SIZE, GFP_KERNEL);
e788e066c   Paul Menage   cgroup files: mov...
4319
4320
4321
4322
4323
4324
4325
  		if (!pathbuf)
  			goto continue_free;
  		if (cgroup_path(cgrp, pathbuf, PAGE_SIZE) < 0)
  			goto continue_free;
  		agentbuf = kstrdup(cgrp->root->release_agent_path, GFP_KERNEL);
  		if (!agentbuf)
  			goto continue_free;
81a6a5cdd   Paul Menage   Task Control Grou...
4326
4327
  
  		i = 0;
e788e066c   Paul Menage   cgroup files: mov...
4328
4329
  		argv[i++] = agentbuf;
  		argv[i++] = pathbuf;
81a6a5cdd   Paul Menage   Task Control Grou...
4330
4331
4332
4333
4334
4335
4336
4337
4338
4339
4340
4341
4342
  		argv[i] = NULL;
  
  		i = 0;
  		/* minimal command environment */
  		envp[i++] = "HOME=/";
  		envp[i++] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin";
  		envp[i] = NULL;
  
  		/* Drop the lock while we invoke the usermode helper,
  		 * since the exec could involve hitting disk and hence
  		 * be a slow process */
  		mutex_unlock(&cgroup_mutex);
  		call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC);
81a6a5cdd   Paul Menage   Task Control Grou...
4343
  		mutex_lock(&cgroup_mutex);
e788e066c   Paul Menage   cgroup files: mov...
4344
4345
4346
   continue_free:
  		kfree(pathbuf);
  		kfree(agentbuf);
81a6a5cdd   Paul Menage   Task Control Grou...
4347
4348
4349
4350
4351
  		spin_lock(&release_list_lock);
  	}
  	spin_unlock(&release_list_lock);
  	mutex_unlock(&cgroup_mutex);
  }
8bab8dded   Paul Menage   cgroups: add cgro...
4352
4353
4354
4355
4356
4357
4358
4359
4360
  
  /*
   * Handler for the "cgroup_disable=" boot parameter.
   *
   * @str is a comma-separated list of subsystem names; empty tokens are
   * skipped.  Each token that matches the name of a builtin subsystem marks
   * that subsystem disabled and logs a message.  Always returns 1.
   */
  static int __init cgroup_disable(char *str)
  {
  	int i;
  	char *token;
  
  	while ((token = strsep(&str, ",")) != NULL) {
  		if (!*token)
  			continue;
  		/*
  		 * cgroup_disable, being at boot time, can't know about module
  		 * subsystems, so we don't worry about them.
  		 */
  		for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
  			struct cgroup_subsys *ss = subsys[i];
  
  			if (!strcmp(token, ss->name)) {
  				ss->disabled = 1;
  				printk(KERN_INFO "Disabling %s control group"
  					" subsystem\n", ss->name);
  				break;
  			}
  		}
  	}
  	return 1;
  }
  __setup("cgroup_disable=", cgroup_disable);
38460b48d   KAMEZAWA Hiroyuki   cgroup: CSS ID su...
4380
4381
4382
4383
4384
4385
4386
4387
4388
4389
  
  /*
   * Functions for CSS ID.
   */
  
  /*
   *To get ID other than 0, this should be called when !cgroup_is_removed().
   */
  unsigned short css_id(struct cgroup_subsys_state *css)
  {
7f0f15464   KAMEZAWA Hiroyuki   memcg: fix css_id...
4390
4391
4392
4393
4394
4395
4396
4397
4398
  	struct css_id *cssid;
  
  	/*
  	 * This css_id() can return correct value when somone has refcnt
  	 * on this or this is under rcu_read_lock(). Once css->id is allocated,
  	 * it's unchanged until freed.
  	 */
  	cssid = rcu_dereference_check(css->id,
  			rcu_read_lock_held() || atomic_read(&css->refcnt));
38460b48d   KAMEZAWA Hiroyuki   cgroup: CSS ID su...
4399
4400
4401
4402
4403
  
  	if (cssid)
  		return cssid->id;
  	return 0;
  }
67523c48a   Ben Blum   cgroups: blkio su...
4404
  EXPORT_SYMBOL_GPL(css_id);
38460b48d   KAMEZAWA Hiroyuki   cgroup: CSS ID su...
4405
4406
4407
  
  unsigned short css_depth(struct cgroup_subsys_state *css)
  {
7f0f15464   KAMEZAWA Hiroyuki   memcg: fix css_id...
4408
4409
4410
4411
  	struct css_id *cssid;
  
  	cssid = rcu_dereference_check(css->id,
  			rcu_read_lock_held() || atomic_read(&css->refcnt));
38460b48d   KAMEZAWA Hiroyuki   cgroup: CSS ID su...
4412
4413
4414
4415
4416
  
  	if (cssid)
  		return cssid->depth;
  	return 0;
  }
67523c48a   Ben Blum   cgroups: blkio su...
4417
  EXPORT_SYMBOL_GPL(css_depth);
38460b48d   KAMEZAWA Hiroyuki   cgroup: CSS ID su...
4418

747388d78   KAMEZAWA Hiroyuki   memcg: fix css_is...
4419
4420
4421
4422
4423
4424
4425
4426
4427
4428
4429
4430
  /**
   *  css_is_ancestor - test "root" css is an ancestor of "child"
   * @child: the css to be tested.
   * @root: the css supporsed to be an ancestor of the child.
   *
   * Returns true if "root" is an ancestor of "child" in its hierarchy. Because
   * this function reads css->id, this use rcu_dereference() and rcu_read_lock().
   * But, considering usual usage, the csses should be valid objects after test.
   * Assuming that the caller will do some action to the child if this returns
   * returns true, the caller must take "child";s reference count.
   * If "child" is valid object and this returns true, "root" is valid, too.
   */
38460b48d   KAMEZAWA Hiroyuki   cgroup: CSS ID su...
4431
  bool css_is_ancestor(struct cgroup_subsys_state *child,
0b7f569e4   KAMEZAWA Hiroyuki   memcg: fix OOM ki...
4432
  		    const struct cgroup_subsys_state *root)
38460b48d   KAMEZAWA Hiroyuki   cgroup: CSS ID su...
4433
  {
747388d78   KAMEZAWA Hiroyuki   memcg: fix css_is...
4434
4435
4436
  	struct css_id *child_id;
  	struct css_id *root_id;
  	bool ret = true;
38460b48d   KAMEZAWA Hiroyuki   cgroup: CSS ID su...
4437

747388d78   KAMEZAWA Hiroyuki   memcg: fix css_is...
4438
4439
4440
4441
4442
4443
4444
4445
4446
4447
  	rcu_read_lock();
  	child_id  = rcu_dereference(child->id);
  	root_id = rcu_dereference(root->id);
  	if (!child_id
  	    || !root_id
  	    || (child_id->depth < root_id->depth)
  	    || (child_id->stack[root_id->depth] != root_id->id))
  		ret = false;
  	rcu_read_unlock();
  	return ret;
38460b48d   KAMEZAWA Hiroyuki   cgroup: CSS ID su...
4448
4449
4450
4451
4452
4453
4454
4455
4456
4457
4458
4459
4460
4461
4462
4463
4464
4465
4466
4467
4468
4469
4470
4471
4472
4473
  }
  
  /*
   * RCU callback queued by free_css_id(): recovers the css_id that embeds
   * @head and frees it.
   */
  static void __free_css_id_cb(struct rcu_head *head)
  {
  	kfree(container_of(head, struct css_id, rcu_head));
  }
  
  /*
   * Detach and release the css_id attached to @css, if any.
   *
   * Both cross pointers (id->css and css->id) are cleared first, the ID is
   * then removed from @ss's idr under ss->id_lock, and the css_id itself is
   * freed through call_rcu().
   */
  void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css)
  {
  	struct css_id *cssid = css->id;
  
  	/* Called before css_id initialization: nothing to release. */
  	if (cssid == NULL)
  		return;
  
  	BUG_ON(!ss->use_id);
  
  	rcu_assign_pointer(cssid->css, NULL);
  	rcu_assign_pointer(css->id, NULL);
  
  	spin_lock(&ss->id_lock);
  	idr_remove(&ss->idr, cssid->id);
  	spin_unlock(&ss->id_lock);
  
  	call_rcu(&cssid->rcu_head, __free_css_id_cb);
  }
67523c48a   Ben Blum   cgroups: blkio su...
4474
  EXPORT_SYMBOL_GPL(free_css_id);
38460b48d   KAMEZAWA Hiroyuki   cgroup: CSS ID su...
4475
4476
4477
4478
4479
4480
4481
4482
4483
4484
4485
4486
4487
4488
4489
4490
4491
4492
4493
4494
4495
4496
4497
4498
4499
4500
4501
4502
4503
4504
4505
4506
4507
4508
4509
4510
4511
4512
4513
4514
4515
4516
4517
4518
4519
4520
4521
4522
  
  /*
   * Allocate a css_id with room for (@depth + 1) stack entries and reserve
   * a fresh ID for it in @ss's idr.
   *
   * Called from init or create(), so calls are always serialized (by
   * cgroup_mutex at create()).
   *
   * Returns the new css_id with ->id and ->depth filled in, or
   * ERR_PTR(-ENOMEM) / ERR_PTR(-ENOSPC) on failure.
   */
  
  static struct css_id *get_new_cssid(struct cgroup_subsys *ss, int depth)
  {
  	struct css_id *entry;
  	int new_id, err, bytes;
  
  	BUG_ON(!ss->use_id);
  
  	bytes = sizeof(*entry) + sizeof(unsigned short) * (depth + 1);
  	entry = kzalloc(bytes, GFP_KERNEL);
  	if (!entry)
  		return ERR_PTR(-ENOMEM);
  
  	/* Preload the idr before taking the spinlock. */
  	if (unlikely(!idr_pre_get(&ss->idr, GFP_KERNEL))) {
  		err = -ENOMEM;
  		goto free_entry;
  	}
  
  	/* ID 0 is never handed out; allocation starts at 1. */
  	spin_lock(&ss->id_lock);
  	err = idr_get_new_above(&ss->idr, entry, 1, &new_id);
  	spin_unlock(&ss->id_lock);
  
  	/* Any allocation failure is reported as "no space left". */
  	if (err) {
  		err = -ENOSPC;
  		goto free_entry;
  	}
  
  	if (new_id <= CSS_ID_MAX) {
  		entry->id = new_id;
  		entry->depth = depth;
  		return entry;
  	}
  
  	/* The ID is above CSS_ID_MAX: give it back and fail. */
  	err = -ENOSPC;
  	spin_lock(&ss->id_lock);
  	idr_remove(&ss->idr, new_id);
  	spin_unlock(&ss->id_lock);
  free_entry:
  	kfree(entry);
  	return ERR_PTR(err);
  }
e6a1105ba   Ben Blum   cgroups: subsyste...
4523
4524
  /*
   * Initialise @ss's ID lock and idr, then allocate the depth-0 css_id and
   * attach it to @rootcss.
   *
   * Returns 0 on success or the error from get_new_cssid().
   */
  static int __init_or_module cgroup_init_idr(struct cgroup_subsys *ss,
  					    struct cgroup_subsys_state *rootcss)
  {
  	struct css_id *newid;
  
  	spin_lock_init(&ss->id_lock);
  	idr_init(&ss->idr);
  
  	newid = get_new_cssid(ss, 0);
  	if (IS_ERR(newid))
  		return PTR_ERR(newid);
  
  	/* At depth 0 the stack holds only this entry's own id. */
  	newid->stack[0] = newid->id;
  	newid->css = rootcss;
  	rootcss->id = newid;
  	return 0;
  }
  
  static int alloc_css_id(struct cgroup_subsys *ss, struct cgroup *parent,
  			struct cgroup *child)
  {
  	int subsys_id, i, depth = 0;
  	struct cgroup_subsys_state *parent_css, *child_css;
fae9c7917   Li Zefan   cgroup: Fix an RC...
4545
  	struct css_id *child_id, *parent_id;
38460b48d   KAMEZAWA Hiroyuki   cgroup: CSS ID su...
4546
4547
4548
4549
  
  	subsys_id = ss->subsys_id;
  	parent_css = parent->subsys[subsys_id];
  	child_css = child->subsys[subsys_id];
38460b48d   KAMEZAWA Hiroyuki   cgroup: CSS ID su...
4550
  	parent_id = parent_css->id;
94b3dd0f7   Greg Thelen   cgroups: alloc_cs...
4551
  	depth = parent_id->depth + 1;
38460b48d   KAMEZAWA Hiroyuki   cgroup: CSS ID su...
4552
4553
4554
4555
4556
4557
4558
4559
4560
4561
4562
4563
4564
4565
4566
4567
4568
4569
4570
4571
4572
4573
4574
4575
4576
4577
4578
4579
4580
4581
4582
4583
4584
4585
4586
4587
4588
  
  	child_id = get_new_cssid(ss, depth);
  	if (IS_ERR(child_id))
  		return PTR_ERR(child_id);
  
  	for (i = 0; i < depth; i++)
  		child_id->stack[i] = parent_id->stack[i];
  	child_id->stack[depth] = child_id->id;
  	/*
  	 * child_id->css pointer will be set after this cgroup is available
  	 * see cgroup_populate_dir()
  	 */
  	rcu_assign_pointer(child_css->id, child_id);
  
  	return 0;
  }
  
  /**
   * css_lookup - find the css registered under an ID
   * @ss: the cgroup subsystem whose idr is searched.
   * @id: the ID to look up.
   *
   * Must be called under rcu_read_lock().  Returns the
   * cgroup_subsys_state currently attached to @id, or NULL when the idr
   * has no entry for @id or the entry has no css attached.
   */
  struct cgroup_subsys_state *css_lookup(struct cgroup_subsys *ss, int id)
  {
  	struct css_id *entry;
  
  	BUG_ON(!ss->use_id);
  
  	entry = idr_find(&ss->idr, id);
  	return unlikely(!entry) ? NULL : rcu_dereference(entry->css);
  }
67523c48a   Ben Blum   cgroups: blkio su...
4589
  EXPORT_SYMBOL_GPL(css_lookup);
38460b48d   KAMEZAWA Hiroyuki   cgroup: CSS ID su...
4590
4591
4592
4593
4594
4595
4596
4597
4598
4599
4600
4601
4602
4603
4604
4605
4606
4607
4608
4609
4610
4611
4612
4613
4614
4615
4616
4617
4618
4619
4620
4621
4622
4623
4624
4625
4626
4627
4628
4629
4630
4631
4632
4633
4634
4635
4636
4637
4638
4639
  
  /**
   * css_get_next - find the next css at or after an ID within a subtree
   * @ss: pointer to subsystem
   * @id: ID at which the scan starts.
   * @root: css whose subtree is searched.
   * @foundid: set to the ID of the css that is returned.
   *
   * Must be called under rcu_read_lock().  Walks @ss's idr upwards from @id
   * and returns the first css whose ID stack contains @root's id at @root's
   * depth.  Returns NULL when @root has no ID or the idr is exhausted;
   * @foundid is left untouched in that case.
   */
  struct cgroup_subsys_state *
  css_get_next(struct cgroup_subsys *ss, int id,
  	     struct cgroup_subsys_state *root, int *foundid)
  {
  	struct cgroup_subsys_state *found = NULL;
  	struct css_id *cand;
  	int pos;
  	int rootid = css_id(root);
  	int depth = css_depth(root);
  
  	if (!rootid)
  		return NULL;
  
  	BUG_ON(!ss->use_id);
  
  	/* Each pass resumes the scan just past the previous candidate. */
  	for (pos = id; ; pos++) {
  		/* idr_get_next() updates pos to the ID of the entry it returns. */
  		spin_lock(&ss->id_lock);
  		cand = idr_get_next(&ss->idr, &pos);
  		spin_unlock(&ss->id_lock);
  
  		if (!cand)
  			break;
  		/* Skip entries that are not below @root. */
  		if (cand->depth < depth || cand->stack[depth] != rootid)
  			continue;
  
  		found = rcu_dereference(cand->css);
  		if (found) {
  			*foundid = pos;
  			break;
  		}
  	}
  	return found;
  }
fe6934354   Paul Menage   cgroups: move the...
4640
4641
4642
4643
4644
4645
4646
4647
4648
4649
4650
4651
4652
4653
4654
4655
4656
4657
4658
4659
4660
4661
4662
4663
4664
4665
4666
4667
4668
4669
4670
4671
4672
4673
4674
4675
4676
4677
4678
4679
4680
4681
  #ifdef CONFIG_CGROUP_DEBUG
  /*
   * create() callback of the debug subsystem: allocates a zeroed
   * cgroup_subsys_state, or returns ERR_PTR(-ENOMEM) if that fails.
   */
  static struct cgroup_subsys_state *debug_create(struct cgroup_subsys *ss,
  						   struct cgroup *cont)
  {
  	struct cgroup_subsys_state *state;
  
  	state = kzalloc(sizeof(*state), GFP_KERNEL);
  	if (state)
  		return state;
  
  	return ERR_PTR(-ENOMEM);
  }
  
  /*
   * Free the state stored in the cgroup's debug_subsys_id slot
   * (presumably the object debug_create() allocated).
   */
  static void debug_destroy(struct cgroup_subsys *ss, struct cgroup *cont)
  {
  	struct cgroup_subsys_state *state = cont->subsys[debug_subsys_id];
  
  	kfree(state);
  }
  
  /* Debug file handler: report the current value of the cgroup's count. */
  static u64 cgroup_refcount_read(struct cgroup *cont, struct cftype *cft)
  {
  	u64 refs = atomic_read(&cont->count);
  
  	return refs;
  }
  
  /* Debug file handler: report what cgroup_task_count() returns for @cont. */
  static u64 debug_taskcount_read(struct cgroup *cont, struct cftype *cft)
  {
  	u64 nr_tasks = cgroup_task_count(cont);
  
  	return nr_tasks;
  }
  
  static u64 current_css_set_read(struct cgroup *cont, struct cftype *cft)
  {
  	return (u64)(unsigned long)current->cgroups;
  }
  
  static u64 current_css_set_refcount_read(struct cgroup *cont,
  					   struct cftype *cft)
  {
  	u64 count;
  
  	rcu_read_lock();
  	count = atomic_read(&current->cgroups->refcount);
  	rcu_read_unlock();
  	return count;
  }
7717f7ba9   Paul Menage   cgroups: add a ba...
4682
4683
4684
4685
4686
4687
4688
4689
4690
4691
4692
4693
4694
4695
4696
4697
4698
4699
  /*
   * Debug file handler: for every cg_cgroup_link on the reading task's
   * css_set, print one line holding the hierarchy id of the linked cgroup's
   * root and the cgroup's dentry name ("?" when the cgroup has no dentry).
   * The list is walked with css_set_lock read-held, inside an RCU read-side
   * section. Always returns 0.
   */
  static int current_css_set_cg_links_read(struct cgroup *cont,
  					 struct cftype *cft,
  					 struct seq_file *seq)
  {
  	struct cg_cgroup_link *link;
  	struct css_set *cg;
  
  	read_lock(&css_set_lock);
  	rcu_read_lock();
  	cg = rcu_dereference(current->cgroups);
  	list_for_each_entry(link, &cg->cg_links, cg_link_list) {
  		struct cgroup *c = link->cgrp;
  		const char *name;
  
  		if (c->dentry)
  			name = c->dentry->d_name.name;
  		else
  			name = "?";
  		seq_printf(seq, "Root %d group %s\n",
  			   c->root->hierarchy_id, name);
  	}
  	rcu_read_unlock();
  	read_unlock(&css_set_lock);
  	return 0;
  }
  
  /* Upper bound on the task lines printed for each css_set. */
  #define MAX_TASKS_SHOWN_PER_CSS 25
  /*
   * Debug file handler: for every css_set linked to @cont, print the css_set
   * address followed by the pid (task_pid_vnr()) of its tasks. At most
   * MAX_TASKS_SHOWN_PER_CSS tasks are listed per css_set; if there are more,
   * a single "  ..." line marks the truncation. The lists are walked with
   * css_set_lock read-held. Always returns 0.
   */
  static int cgroup_css_links_read(struct cgroup *cont,
  				 struct cftype *cft,
  				 struct seq_file *seq)
  {
  	struct cg_cgroup_link *link;
  
  	read_lock(&css_set_lock);
  	list_for_each_entry(link, &cont->css_sets, cgrp_link_list) {
  		struct css_set *cg = link->cg;
  		struct task_struct *task;
  		int count = 0;
  
  		seq_printf(seq, "css_set %p\n", cg);
  		list_for_each_entry(task, &cg->tasks, cg_list) {
  			/*
  			 * ">=" so that exactly MAX_TASKS_SHOWN_PER_CSS tasks
  			 * are printed before truncating; ">" let one extra
  			 * task through.
  			 */
  			if (count++ >= MAX_TASKS_SHOWN_PER_CSS) {
  				seq_puts(seq, "  ...\n");
  				break;
  			}
  			seq_printf(seq, "  task %d\n", task_pid_vnr(task));
  		}
  	}
  	read_unlock(&css_set_lock);
  	return 0;
  }
fe6934354   Paul Menage   cgroups: move the...
4738
4739
4740
4741
4742
4743
4744
4745
4746
4747
4748
4749
4750
4751
4752
4753
4754
4755
4756
4757
4758
4759
4760
4761
4762
4763
  /* Debug file handler: report the CGRP_RELEASABLE bit of the cgroup's flags. */
  static u64 releasable_read(struct cgroup *cgrp, struct cftype *cft)
  {
  	int releasable = test_bit(CGRP_RELEASABLE, &cgrp->flags);
  
  	return releasable;
  }
  
  static struct cftype debug_files[] =  {
  	{
  		.name = "cgroup_refcount",
  		.read_u64 = cgroup_refcount_read,
  	},
  	{
  		.name = "taskcount",
  		.read_u64 = debug_taskcount_read,
  	},
  
  	{
  		.name = "current_css_set",
  		.read_u64 = current_css_set_read,
  	},
  
  	{
  		.name = "current_css_set_refcount",
  		.read_u64 = current_css_set_refcount_read,
  	},
  
  	{
7717f7ba9   Paul Menage   cgroups: add a ba...
4764
4765
4766
4767
4768
4769
4770
4771
4772
4773
  		.name = "current_css_set_cg_links",
  		.read_seq_string = current_css_set_cg_links_read,
  	},
  
  	{
  		.name = "cgroup_css_links",
  		.read_seq_string = cgroup_css_links_read,
  	},
  
  	{
fe6934354   Paul Menage   cgroups: move the...
4774
4775
4776
4777
4778
4779
4780
4781
4782
4783
4784
4785
4786
4787
4788
4789
4790
4791
4792
  		.name = "releasable",
  		.read_u64 = releasable_read,
  	},
  };
  
  /*
   * Register every entry of debug_files on @cont through cgroup_add_files()
   * and pass its return value back to the caller.
   */
  static int debug_populate(struct cgroup_subsys *ss, struct cgroup *cont)
  {
  	int nr_files = ARRAY_SIZE(debug_files);
  
  	return cgroup_add_files(cont, ss, debug_files, nr_files);
  }
  
  /* Subsystem descriptor tying the debug callbacks to the "debug" name and id. */
  struct cgroup_subsys debug_subsys = {
  	.subsys_id = debug_subsys_id,
  	.name = "debug",
  	.populate = debug_populate,
  	.create = debug_create,
  	.destroy = debug_destroy,
  };
  #endif /* CONFIG_CGROUP_DEBUG */