Blame view

kernel/cgroup.c 125 KB
ddbcc7e8e   Paul Menage   Task Control Grou...
1
  /*
ddbcc7e8e   Paul Menage   Task Control Grou...
2
3
4
5
6
   *  Generic process-grouping system.
   *
   *  Based originally on the cpuset system, extracted by Paul Menage
   *  Copyright (C) 2006 Google, Inc
   *
0dea11687   Kirill A. Shutemov   cgroup: implement...
7
8
9
10
   *  Notifications support
   *  Copyright (C) 2009 Nokia Corporation
   *  Author: Kirill A. Shutemov
   *
ddbcc7e8e   Paul Menage   Task Control Grou...
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
   *  Copyright notices from the original cpuset code:
   *  --------------------------------------------------
   *  Copyright (C) 2003 BULL SA.
   *  Copyright (C) 2004-2006 Silicon Graphics, Inc.
   *
   *  Portions derived from Patrick Mochel's sysfs code.
   *  sysfs is Copyright (c) 2001-3 Patrick Mochel
   *
   *  2003-10-10 Written by Simon Derr.
   *  2003-10-22 Updates by Stephen Hemminger.
   *  2004 May-July Rework by Paul Jackson.
   *  ---------------------------------------------------
   *
   *  This file is subject to the terms and conditions of the GNU General Public
   *  License.  See the file COPYING in the main directory of the Linux
   *  distribution for more details.
   */
  
  #include <linux/cgroup.h>
c6d57f331   Paul Menage   cgroups: support ...
30
  #include <linux/ctype.h>
ddbcc7e8e   Paul Menage   Task Control Grou...
31
32
33
34
35
36
37
38
  #include <linux/errno.h>
  #include <linux/fs.h>
  #include <linux/kernel.h>
  #include <linux/list.h>
  #include <linux/mm.h>
  #include <linux/mutex.h>
  #include <linux/mount.h>
  #include <linux/pagemap.h>
a424316ca   Paul Menage   Task Control Grou...
39
  #include <linux/proc_fs.h>
ddbcc7e8e   Paul Menage   Task Control Grou...
40
41
  #include <linux/rcupdate.h>
  #include <linux/sched.h>
817929ec2   Paul Menage   Task Control Grou...
42
  #include <linux/backing-dev.h>
ddbcc7e8e   Paul Menage   Task Control Grou...
43
44
45
46
47
  #include <linux/seq_file.h>
  #include <linux/slab.h>
  #include <linux/magic.h>
  #include <linux/spinlock.h>
  #include <linux/string.h>
bbcb81d09   Paul Menage   Task Control Grou...
48
  #include <linux/sort.h>
81a6a5cdd   Paul Menage   Task Control Grou...
49
  #include <linux/kmod.h>
e6a1105ba   Ben Blum   cgroups: subsyste...
50
  #include <linux/module.h>
846c7bb05   Balbir Singh   Add cgroupstats
51
52
  #include <linux/delayacct.h>
  #include <linux/cgroupstats.h>
472b1053f   Li Zefan   cgroups: use a ha...
53
  #include <linux/hash.h>
3f8206d49   Al Viro   [PATCH] get rid o...
54
  #include <linux/namei.h>
337eb00a2   Alessio Igor Bogani   Push BKL down int...
55
  #include <linux/smp_lock.h>
096b7fe01   Li Zefan   cgroups: fix pid ...
56
  #include <linux/pid_namespace.h>
2c6ab6d20   Paul Menage   cgroups: allow cg...
57
  #include <linux/idr.h>
d1d9fd330   Ben Blum   cgroups: use vmal...
58
  #include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */
0dea11687   Kirill A. Shutemov   cgroup: implement...
59
60
  #include <linux/eventfd.h>
  #include <linux/poll.h>
846c7bb05   Balbir Singh   Add cgroupstats
61

ddbcc7e8e   Paul Menage   Task Control Grou...
62
  #include <asm/atomic.h>
81a6a5cdd   Paul Menage   Task Control Grou...
63
  static DEFINE_MUTEX(cgroup_mutex);
aae8aab40   Ben Blum   cgroups: revamp s...
64
65
66
67
68
69
  /*
   * Generate an array of cgroup subsystem pointers. At boot time, this is
   * populated up to CGROUP_BUILTIN_SUBSYS_COUNT, and modular subsystems are
   * registered after that. The mutable section of this array is protected by
   * cgroup_mutex.
   */
ddbcc7e8e   Paul Menage   Task Control Grou...
70
  #define SUBSYS(_x) &_x ## _subsys,
aae8aab40   Ben Blum   cgroups: revamp s...
71
  static struct cgroup_subsys *subsys[CGROUP_SUBSYS_COUNT] = {
ddbcc7e8e   Paul Menage   Task Control Grou...
72
73
  #include <linux/cgroup_subsys.h>
  };
c6d57f331   Paul Menage   cgroups: support ...
74
  #define MAX_CGROUP_ROOT_NAMELEN 64
ddbcc7e8e   Paul Menage   Task Control Grou...
75
76
77
78
79
80
81
82
83
84
85
86
87
  /*
   * A cgroupfs_root represents the root of a cgroup hierarchy,
   * and may be associated with a superblock to form an active
   * hierarchy
   */
  struct cgroupfs_root {
  	struct super_block *sb;
  
  	/*
  	 * The bitmask of subsystems intended to be attached to this
  	 * hierarchy
  	 */
  	unsigned long subsys_bits;
2c6ab6d20   Paul Menage   cgroups: allow cg...
88
89
  	/* Unique id for this hierarchy. */
  	int hierarchy_id;
ddbcc7e8e   Paul Menage   Task Control Grou...
90
91
92
93
94
95
96
97
98
99
100
  	/* The bitmask of subsystems currently attached to this hierarchy */
  	unsigned long actual_subsys_bits;
  
  	/* A list running through the attached subsystems */
  	struct list_head subsys_list;
  
  	/* The root cgroup for this hierarchy */
  	struct cgroup top_cgroup;
  
  	/* Tracks how many cgroups are currently defined in hierarchy.*/
  	int number_of_cgroups;
e5f6a8609   Li Zefan   cgroups: make roo...
101
  	/* A list running through the active hierarchies */
ddbcc7e8e   Paul Menage   Task Control Grou...
102
103
104
105
  	struct list_head root_list;
  
  	/* Hierarchy-specific flags */
  	unsigned long flags;
81a6a5cdd   Paul Menage   Task Control Grou...
106

e788e066c   Paul Menage   cgroup files: mov...
107
  	/* The path to use for release notifications. */
81a6a5cdd   Paul Menage   Task Control Grou...
108
  	char release_agent_path[PATH_MAX];
c6d57f331   Paul Menage   cgroups: support ...
109
110
111
  
  	/* The name for this hierarchy - may be empty */
  	char name[MAX_CGROUP_ROOT_NAMELEN];
ddbcc7e8e   Paul Menage   Task Control Grou...
112
  };
ddbcc7e8e   Paul Menage   Task Control Grou...
113
114
115
116
117
118
  /*
   * The "rootnode" hierarchy is the "dummy hierarchy", reserved for the
   * subsystems that are otherwise unattached - it never has more than a
   * single cgroup, and all tasks are part of that cgroup.
   */
  static struct cgroupfs_root rootnode;
38460b48d   KAMEZAWA Hiroyuki   cgroup: CSS ID su...
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
  /*
   * CSS ID -- a per-subsystem ID for a Cgroup Subsys State (CSS). Used only
   * when cgroup_subsys->use_id != 0.
   */
  #define CSS_ID_MAX	(65535)
  struct css_id {
  	/*
  	 * The css to which this ID points. This pointer is set to a valid
  	 * value after the cgroup is populated. If the cgroup is removed, this
  	 * will be NULL.
  	 * This pointer is expected to be RCU-safe because destroy()
  	 * is called after synchronize_rcu(). But for safe use, css_is_removed()
  	 * or css_tryget() should be used to avoid races.
  	 */
  	struct cgroup_subsys_state *css;
  	/*
  	 * ID of this css.
  	 */
  	unsigned short id;
  	/*
  	 * Depth in the hierarchy which this ID belongs to.
  	 */
  	unsigned short depth;
  	/*
  	 * ID is freed by RCU. (and the lookup routine is RCU safe.)
  	 */
  	struct rcu_head rcu_head;
  	/*
  	 * Hierarchy which this CSS ID belongs to: an array of length
  	 * (depth + 1). Declared as a C99 flexible array member rather than
  	 * the GNU zero-length-array extension; it must remain the last
  	 * member, and sizeof(struct css_id) still excludes it.
  	 */
  	unsigned short stack[];
  };
0dea11687   Kirill A. Shutemov   cgroup: implement...
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
  /*
   * cgroup_event represents an event which userspace wants to receive.
   */
  struct cgroup_event {
  	/*
  	 * Cgroup which the event belongs to.
  	 */
  	struct cgroup *cgrp;
  	/*
  	 * Control file which the event is associated with.
  	 */
  	struct cftype *cft;
  	/*
  	 * eventfd used to signal userspace about the event.
  	 */
  	struct eventfd_ctx *eventfd;
  	/*
  	 * Each of these is stored in a list by the cgroup.
  	 */
  	struct list_head list;
  	/*
  	 * All fields below are needed to unregister the event when
  	 * userspace closes the eventfd.
  	 */
  	poll_table pt;
  	wait_queue_head_t *wqh;
  	wait_queue_t wait;
  	struct work_struct remove;
  };
38460b48d   KAMEZAWA Hiroyuki   cgroup: CSS ID su...
179

ddbcc7e8e   Paul Menage   Task Control Grou...
180
181
182
  /* The list of hierarchy roots */
  
  static LIST_HEAD(roots);
817929ec2   Paul Menage   Task Control Grou...
183
  static int root_count;
ddbcc7e8e   Paul Menage   Task Control Grou...
184

2c6ab6d20   Paul Menage   cgroups: allow cg...
185
186
187
  static DEFINE_IDA(hierarchy_ida);
  static int next_hierarchy_id;
  static DEFINE_SPINLOCK(hierarchy_id_lock);
ddbcc7e8e   Paul Menage   Task Control Grou...
188
189
190
191
  /* dummytop is a shorthand for the dummy hierarchy's top cgroup */
  #define dummytop (&rootnode.top_cgroup)
  
  /* This flag indicates whether tasks in the fork and exit paths should
a043e3b2c   Li Zefan   cgroup: fix comments
192
193
194
   * check for fork/exit handlers to call. This avoids us having to do
   * extra work in the fork/exit path if none of the subsystems need to
   * be called.
ddbcc7e8e   Paul Menage   Task Control Grou...
195
   */
8947f9d5b   Li Zefan   cgroups: annotate...
196
  static int need_forkexit_callback __read_mostly;
ddbcc7e8e   Paul Menage   Task Control Grou...
197

d11c563dd   Paul E. McKenney   sched: Use lockde...
198
199
200
201
202
203
204
205
206
207
208
209
210
  /*
   * cgroup_lock_is_held - report whether cgroup_mutex is held
   *
   * Returns the result of lockdep_is_held() on cgroup_mutex when
   * CONFIG_PROVE_LOCKING is enabled, and of mutex_is_locked() otherwise.
   */
  int cgroup_lock_is_held(void)
  {
  #ifdef CONFIG_PROVE_LOCKING
  	return lockdep_is_held(&cgroup_mutex);
  #else
  	return mutex_is_locked(&cgroup_mutex);
  #endif
  }
  EXPORT_SYMBOL_GPL(cgroup_lock_is_held);
ddbcc7e8e   Paul Menage   Task Control Grou...
211
  /* convenient tests for these bits */
bd89aabc6   Paul Menage   Control groups: R...
212
  inline int cgroup_is_removed(const struct cgroup *cgrp)
ddbcc7e8e   Paul Menage   Task Control Grou...
213
  {
bd89aabc6   Paul Menage   Control groups: R...
214
  	return test_bit(CGRP_REMOVED, &cgrp->flags);
ddbcc7e8e   Paul Menage   Task Control Grou...
215
216
217
218
219
220
  }
  
  /* bits in struct cgroupfs_root flags field */
  enum {
  	ROOT_NOPREFIX, /* mounted subsystems have no named prefix */
  };
e9685a03c   Adrian Bunk   kernel/cgroup.c: ...
221
  static int cgroup_is_releasable(const struct cgroup *cgrp)
81a6a5cdd   Paul Menage   Task Control Grou...
222
223
  {
  	const int bits =
bd89aabc6   Paul Menage   Control groups: R...
224
225
226
  		(1 << CGRP_RELEASABLE) |
  		(1 << CGRP_NOTIFY_ON_RELEASE);
  	return (cgrp->flags & bits) == bits;
81a6a5cdd   Paul Menage   Task Control Grou...
227
  }
e9685a03c   Adrian Bunk   kernel/cgroup.c: ...
228
  static int notify_on_release(const struct cgroup *cgrp)
81a6a5cdd   Paul Menage   Task Control Grou...
229
  {
bd89aabc6   Paul Menage   Control groups: R...
230
  	return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
81a6a5cdd   Paul Menage   Task Control Grou...
231
  }
ddbcc7e8e   Paul Menage   Task Control Grou...
232
233
234
235
236
237
  /*
   * for_each_subsys() allows you to iterate on each subsystem attached to
   * an active hierarchy.
   *
   * @_root: pointer to the hierarchy root whose subsys_list is walked; it is
   *         parenthesized in the expansion so that any pointer expression
   *         (not just a plain identifier) can be passed.
   * @_ss:   iteration cursor, linked through its "sibling" member.
   */
  #define for_each_subsys(_root, _ss) \
  list_for_each_entry(_ss, &(_root)->subsys_list, sibling)
e5f6a8609   Li Zefan   cgroups: make roo...
238
239
  /* for_each_active_root() allows you to iterate across the active hierarchies */
  #define for_each_active_root(_root) \
ddbcc7e8e   Paul Menage   Task Control Grou...
240
  list_for_each_entry(_root, &roots, root_list)
81a6a5cdd   Paul Menage   Task Control Grou...
241
242
243
244
245
246
  /* the list of cgroups eligible for automatic release. Protected by
   * release_list_lock */
  static LIST_HEAD(release_list);
  static DEFINE_SPINLOCK(release_list_lock);
  static void cgroup_release_agent(struct work_struct *work);
  static DECLARE_WORK(release_agent_work, cgroup_release_agent);
bd89aabc6   Paul Menage   Control groups: R...
247
  static void check_for_release(struct cgroup *cgrp);
81a6a5cdd   Paul Menage   Task Control Grou...
248

817929ec2   Paul Menage   Task Control Grou...
249
250
251
252
253
254
  /* Link structure for associating css_set objects with cgroups */
  struct cg_cgroup_link {
  	/*
  	 * List running through cg_cgroup_links associated with a
  	 * cgroup, anchored on cgroup->css_sets
  	 */
bd89aabc6   Paul Menage   Control groups: R...
255
  	struct list_head cgrp_link_list;
7717f7ba9   Paul Menage   cgroups: add a ba...
256
  	struct cgroup *cgrp;
817929ec2   Paul Menage   Task Control Grou...
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
  	/*
  	 * List running through cg_cgroup_links pointing at a
  	 * single css_set object, anchored on css_set->cg_links
  	 */
  	struct list_head cg_link_list;
  	struct css_set *cg;
  };
  
  /* The default css_set - used by init and its children prior to any
   * hierarchies being mounted. It contains a pointer to the root state
   * for each subsystem. Also used to anchor the list of css_sets. Not
   * reference-counted, to improve performance when child cgroups
   * haven't been created.
   */
  
  static struct css_set init_css_set;
  static struct cg_cgroup_link init_css_set_link;
e6a1105ba   Ben Blum   cgroups: subsyste...
274
275
  static int cgroup_init_idr(struct cgroup_subsys *ss,
  			   struct cgroup_subsys_state *css);
38460b48d   KAMEZAWA Hiroyuki   cgroup: CSS ID su...
276

817929ec2   Paul Menage   Task Control Grou...
277
278
279
280
281
  /* css_set_lock protects the list of css_set objects, and the
   * chain of tasks off each css_set.  Nests outside task->alloc_lock
   * due to cgroup_iter_start() */
  static DEFINE_RWLOCK(css_set_lock);
  static int css_set_count;
7717f7ba9   Paul Menage   cgroups: add a ba...
282
283
284
285
286
  /*
   * hash table for cgroup groups. This improves the performance to find
   * an existing css_set. This hash doesn't (currently) take into
   * account cgroups in empty hierarchies.
   */
472b1053f   Li Zefan   cgroups: use a ha...
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
  #define CSS_SET_HASH_BITS	7
  #define CSS_SET_TABLE_SIZE	(1 << CSS_SET_HASH_BITS)
  static struct hlist_head css_set_table[CSS_SET_TABLE_SIZE];
  
  /*
   * css_set_hash - pick the css_set_table bucket for an array of css pointers
   * @css: array of CGROUP_SUBSYS_COUNT subsystem state pointers
   *
   * Returns a pointer to the hash-list head in css_set_table selected by
   * hashing the pointer values in @css.
   */
  static struct hlist_head *css_set_hash(struct cgroup_subsys_state *css[])
  {
  	unsigned long key = 0UL;
  	int bucket;
  	int ss;
  
  	/* Combine all pointer values into a single word. */
  	for (ss = 0; ss < CGROUP_SUBSYS_COUNT; ss++)
  		key += (unsigned long)css[ss];
  
  	/* Mix the higher bits into the lower ones before hashing. */
  	key ^= key >> 16;
  
  	bucket = hash_long(key, CSS_SET_HASH_BITS);
  
  	return css_set_table + bucket;
  }
c378369d8   Ben Blum   cgroups: change c...
305
306
307
308
309
  /*
   * RCU callback (queued via call_rcu() in __put_css_set()): frees the
   * css_set that embeds @obj as its rcu_head member.
   */
  static void free_css_set_rcu(struct rcu_head *obj)
  {
  	kfree(container_of(obj, struct css_set, rcu_head));
  }
817929ec2   Paul Menage   Task Control Grou...
310
311
312
313
  /* We don't maintain the lists running through each css_set to its
   * task until after the first call to cgroup_iter_start(). This
   * reduces the fork()/exit() overhead for people who have cgroups
   * compiled into their kernel but not actually in use */
8947f9d5b   Li Zefan   cgroups: annotate...
314
  static int use_task_css_set_links __read_mostly;
817929ec2   Paul Menage   Task Control Grou...
315

2c6ab6d20   Paul Menage   cgroups: allow cg...
316
  static void __put_css_set(struct css_set *cg, int taskexit)
b4f48b636   Paul Menage   Task Control Grou...
317
  {
71cbb949d   KOSAKI Motohiro   cgroup: list_for_...
318
319
  	struct cg_cgroup_link *link;
  	struct cg_cgroup_link *saved_link;
146aa1bd0   Lai Jiangshan   cgroups: fix prob...
320
321
322
323
324
325
326
327
328
329
330
331
  	/*
  	 * Ensure that the refcount doesn't hit zero while any readers
  	 * can see it. Similar to atomic_dec_and_lock(), but for an
  	 * rwlock
  	 */
  	if (atomic_add_unless(&cg->refcount, -1, 1))
  		return;
  	write_lock(&css_set_lock);
  	if (!atomic_dec_and_test(&cg->refcount)) {
  		write_unlock(&css_set_lock);
  		return;
  	}
81a6a5cdd   Paul Menage   Task Control Grou...
332

2c6ab6d20   Paul Menage   cgroups: allow cg...
333
334
335
336
337
338
339
340
341
  	/* This css_set is dead. unlink it and release cgroup refcounts */
  	hlist_del(&cg->hlist);
  	css_set_count--;
  
  	list_for_each_entry_safe(link, saved_link, &cg->cg_links,
  				 cg_link_list) {
  		struct cgroup *cgrp = link->cgrp;
  		list_del(&link->cg_link_list);
  		list_del(&link->cgrp_link_list);
bd89aabc6   Paul Menage   Control groups: R...
342
343
  		if (atomic_dec_and_test(&cgrp->count) &&
  		    notify_on_release(cgrp)) {
81a6a5cdd   Paul Menage   Task Control Grou...
344
  			if (taskexit)
bd89aabc6   Paul Menage   Control groups: R...
345
346
  				set_bit(CGRP_RELEASABLE, &cgrp->flags);
  			check_for_release(cgrp);
81a6a5cdd   Paul Menage   Task Control Grou...
347
  		}
2c6ab6d20   Paul Menage   cgroups: allow cg...
348
349
  
  		kfree(link);
81a6a5cdd   Paul Menage   Task Control Grou...
350
  	}
2c6ab6d20   Paul Menage   cgroups: allow cg...
351
352
  
  	write_unlock(&css_set_lock);
c378369d8   Ben Blum   cgroups: change c...
353
  	call_rcu(&cg->rcu_head, free_css_set_rcu);
b4f48b636   Paul Menage   Task Control Grou...
354
  }
817929ec2   Paul Menage   Task Control Grou...
355
356
357
358
359
  /*
   * refcounted get/put for css_set objects
   */
  static inline void get_css_set(struct css_set *cg)
  {
146aa1bd0   Lai Jiangshan   cgroups: fix prob...
360
  	atomic_inc(&cg->refcount);
817929ec2   Paul Menage   Task Control Grou...
361
362
363
364
  }
  
  static inline void put_css_set(struct css_set *cg)
  {
146aa1bd0   Lai Jiangshan   cgroups: fix prob...
365
  	__put_css_set(cg, 0);
817929ec2   Paul Menage   Task Control Grou...
366
  }
81a6a5cdd   Paul Menage   Task Control Grou...
367
368
  static inline void put_css_set_taskexit(struct css_set *cg)
  {
146aa1bd0   Lai Jiangshan   cgroups: fix prob...
369
  	__put_css_set(cg, 1);
81a6a5cdd   Paul Menage   Task Control Grou...
370
  }
817929ec2   Paul Menage   Task Control Grou...
371
  /*
7717f7ba9   Paul Menage   cgroups: add a ba...
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
   * compare_css_sets - helper function for find_existing_css_set().
   * @cg: candidate css_set being tested
   * @old_cg: existing css_set for a task
   * @new_cgrp: cgroup that's being entered by the task
   * @template: desired set of css pointers in css_set (pre-calculated)
   *
   * Returns true if "cg" matches "old_cg" except for the hierarchy
   * which "new_cgrp" belongs to, for which it should match "new_cgrp".
   */
  static bool compare_css_sets(struct css_set *cg,
  			     struct css_set *old_cg,
  			     struct cgroup *new_cgrp,
  			     struct cgroup_subsys_state *template[])
  {
  	struct list_head *cand_head = &cg->cg_links;
  	struct list_head *old_head = &old_cg->cg_links;
  	struct list_head *cand_pos, *old_pos;
  
  	/* Cheap filter first: every subsystem pointer has to match. */
  	if (memcmp(template, cg->subsys, sizeof(cg->subsys)) != 0)
  		return false;
  
  	/*
  	 * The subsystem pointers cannot distinguish between different
  	 * cgroups in hierarchies with no subsystems, so also compare the
  	 * cgroup pointers by walking both link lists in lockstep. This
  	 * walk alone would be sufficient, but on most setups the memcmp
  	 * above rejects almost all candidates before we pay for it.
  	 */
  	for (cand_pos = cand_head->next, old_pos = old_head->next;
  	     cand_pos != cand_head;
  	     cand_pos = cand_pos->next, old_pos = old_pos->next) {
  		struct cgroup *cand_cgrp, *old_cgrp, *wanted;
  
  		/* Both lists are expected to be of equal length. */
  		BUG_ON(old_pos == old_head);
  
  		/* Locate the cgroups associated with these links. */
  		cand_cgrp = list_entry(cand_pos, struct cg_cgroup_link,
  				       cg_link_list)->cgrp;
  		old_cgrp = list_entry(old_pos, struct cg_cgroup_link,
  				      cg_link_list)->cgrp;
  
  		/* Hierarchies should be linked in the same order. */
  		BUG_ON(cand_cgrp->root != old_cgrp->root);
  
  		/*
  		 * In the hierarchy of the cgroup that is changing, the
  		 * candidate must point at the new cgroup; in every other
  		 * hierarchy it must point at the same cgroup as the old
  		 * css_set does.
  		 */
  		wanted = (cand_cgrp->root == new_cgrp->root) ?
  			 new_cgrp : old_cgrp;
  		if (cand_cgrp != wanted)
  			return false;
  	}
  
  	/* The candidate list is exhausted; the old list must be as well. */
  	BUG_ON(old_pos != old_head);
  
  	return true;
  }
  
  /*
817929ec2   Paul Menage   Task Control Grou...
444
445
   * find_existing_css_set() is a helper for
   * find_css_set(), and checks to see whether an existing
472b1053f   Li Zefan   cgroups: use a ha...
446
   * css_set is suitable.
817929ec2   Paul Menage   Task Control Grou...
447
448
449
450
   *
   * oldcg: the cgroup group that we're using before the cgroup
   * transition
   *
bd89aabc6   Paul Menage   Control groups: R...
451
   * cgrp: the cgroup that we're moving into
817929ec2   Paul Menage   Task Control Grou...
452
453
454
455
   *
   * template: location in which to build the desired set of subsystem
   * state objects for the new cgroup group
   */
817929ec2   Paul Menage   Task Control Grou...
456
457
  static struct css_set *find_existing_css_set(
  	struct css_set *oldcg,
bd89aabc6   Paul Menage   Control groups: R...
458
  	struct cgroup *cgrp,
817929ec2   Paul Menage   Task Control Grou...
459
  	struct cgroup_subsys_state *template[])
b4f48b636   Paul Menage   Task Control Grou...
460
461
  {
  	int i;
bd89aabc6   Paul Menage   Control groups: R...
462
  	struct cgroupfs_root *root = cgrp->root;
472b1053f   Li Zefan   cgroups: use a ha...
463
464
465
  	struct hlist_head *hhead;
  	struct hlist_node *node;
  	struct css_set *cg;
817929ec2   Paul Menage   Task Control Grou...
466

aae8aab40   Ben Blum   cgroups: revamp s...
467
468
469
470
471
  	/*
  	 * Build the set of subsystem state objects that we want to see in the
  	 * new css_set. while subsystems can change globally, the entries here
  	 * won't change, so no need for locking.
  	 */
817929ec2   Paul Menage   Task Control Grou...
472
  	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
8d53d55d2   Li Zefan   cgroup: fix subsy...
473
  		if (root->subsys_bits & (1UL << i)) {
817929ec2   Paul Menage   Task Control Grou...
474
475
476
  			/* Subsystem is in this hierarchy. So we want
  			 * the subsystem state from the new
  			 * cgroup */
bd89aabc6   Paul Menage   Control groups: R...
477
  			template[i] = cgrp->subsys[i];
817929ec2   Paul Menage   Task Control Grou...
478
479
480
481
482
483
  		} else {
  			/* Subsystem is not in this hierarchy, so we
  			 * don't want to change the subsystem state */
  			template[i] = oldcg->subsys[i];
  		}
  	}
472b1053f   Li Zefan   cgroups: use a ha...
484
485
  	hhead = css_set_hash(template);
  	hlist_for_each_entry(cg, node, hhead, hlist) {
7717f7ba9   Paul Menage   cgroups: add a ba...
486
487
488
489
490
  		if (!compare_css_sets(cg, oldcg, cgrp, template))
  			continue;
  
  		/* This css_set matches what we need */
  		return cg;
472b1053f   Li Zefan   cgroups: use a ha...
491
  	}
817929ec2   Paul Menage   Task Control Grou...
492
493
494
495
  
  	/* No existing cgroup group matched */
  	return NULL;
  }
36553434f   Li Zefan   cgroup: remove du...
496
497
498
499
500
501
502
503
504
505
  static void free_cg_links(struct list_head *tmp)
  {
  	struct cg_cgroup_link *link;
  	struct cg_cgroup_link *saved_link;
  
  	list_for_each_entry_safe(link, saved_link, tmp, cgrp_link_list) {
  		list_del(&link->cgrp_link_list);
  		kfree(link);
  	}
  }
817929ec2   Paul Menage   Task Control Grou...
506
507
  /*
   * allocate_cg_links() allocates "count" cg_cgroup_link structures
bd89aabc6   Paul Menage   Control groups: R...
508
   * and chains them on tmp through their cgrp_link_list fields. Returns 0 on
817929ec2   Paul Menage   Task Control Grou...
509
510
   * success or a negative error
   */
817929ec2   Paul Menage   Task Control Grou...
511
512
513
514
515
516
517
518
  static int allocate_cg_links(int count, struct list_head *tmp)
  {
  	struct cg_cgroup_link *link;
  	int i;
  	INIT_LIST_HEAD(tmp);
  	for (i = 0; i < count; i++) {
  		link = kmalloc(sizeof(*link), GFP_KERNEL);
  		if (!link) {
36553434f   Li Zefan   cgroup: remove du...
519
  			free_cg_links(tmp);
817929ec2   Paul Menage   Task Control Grou...
520
521
  			return -ENOMEM;
  		}
bd89aabc6   Paul Menage   Control groups: R...
522
  		list_add(&link->cgrp_link_list, tmp);
817929ec2   Paul Menage   Task Control Grou...
523
524
525
  	}
  	return 0;
  }
c12f65d43   Li Zefan   cgroups: introduc...
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
  /**
   * link_css_set - a helper function to link a css_set to a cgroup
   * @tmp_cg_links: cg_cgroup_link objects allocated by allocate_cg_links()
   * @cg: the css_set to be linked
   * @cgrp: the destination cgroup
   */
  static void link_css_set(struct list_head *tmp_cg_links,
  			 struct css_set *cg, struct cgroup *cgrp)
  {
  	struct cg_cgroup_link *link;
  
  	BUG_ON(list_empty(tmp_cg_links));
  	link = list_first_entry(tmp_cg_links, struct cg_cgroup_link,
  				cgrp_link_list);
  	link->cg = cg;
7717f7ba9   Paul Menage   cgroups: add a ba...
541
  	link->cgrp = cgrp;
2c6ab6d20   Paul Menage   cgroups: allow cg...
542
  	atomic_inc(&cgrp->count);
c12f65d43   Li Zefan   cgroups: introduc...
543
  	list_move(&link->cgrp_link_list, &cgrp->css_sets);
7717f7ba9   Paul Menage   cgroups: add a ba...
544
545
546
547
548
  	/*
  	 * Always add links to the tail of the list so that the list
  	 * is sorted by order of hierarchy creation
  	 */
  	list_add_tail(&link->cg_link_list, &cg->cg_links);
c12f65d43   Li Zefan   cgroups: introduc...
549
  }
817929ec2   Paul Menage   Task Control Grou...
550
551
552
553
554
555
556
  /*
   * find_css_set() takes an existing cgroup group and a
   * cgroup object, and returns a css_set object that's
   * equivalent to the old group, but with the given cgroup
   * substituted into the appropriate hierarchy. Must be called with
   * cgroup_mutex held
   */
817929ec2   Paul Menage   Task Control Grou...
557
  static struct css_set *find_css_set(
bd89aabc6   Paul Menage   Control groups: R...
558
  	struct css_set *oldcg, struct cgroup *cgrp)
817929ec2   Paul Menage   Task Control Grou...
559
560
561
  {
  	struct css_set *res;
  	struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT];
817929ec2   Paul Menage   Task Control Grou...
562
563
  
  	struct list_head tmp_cg_links;
817929ec2   Paul Menage   Task Control Grou...
564

472b1053f   Li Zefan   cgroups: use a ha...
565
  	struct hlist_head *hhead;
7717f7ba9   Paul Menage   cgroups: add a ba...
566
  	struct cg_cgroup_link *link;
472b1053f   Li Zefan   cgroups: use a ha...
567

817929ec2   Paul Menage   Task Control Grou...
568
569
  	/* First see if we already have a cgroup group that matches
  	 * the desired set */
7e9abd89c   Li Zefan   cgroup: use read ...
570
  	read_lock(&css_set_lock);
bd89aabc6   Paul Menage   Control groups: R...
571
  	res = find_existing_css_set(oldcg, cgrp, template);
817929ec2   Paul Menage   Task Control Grou...
572
573
  	if (res)
  		get_css_set(res);
7e9abd89c   Li Zefan   cgroup: use read ...
574
  	read_unlock(&css_set_lock);
817929ec2   Paul Menage   Task Control Grou...
575
576
577
578
579
580
581
582
583
584
585
586
587
  
  	if (res)
  		return res;
  
  	res = kmalloc(sizeof(*res), GFP_KERNEL);
  	if (!res)
  		return NULL;
  
  	/* Allocate all the cg_cgroup_link objects that we'll need */
  	if (allocate_cg_links(root_count, &tmp_cg_links) < 0) {
  		kfree(res);
  		return NULL;
  	}
146aa1bd0   Lai Jiangshan   cgroups: fix prob...
588
  	atomic_set(&res->refcount, 1);
817929ec2   Paul Menage   Task Control Grou...
589
590
  	INIT_LIST_HEAD(&res->cg_links);
  	INIT_LIST_HEAD(&res->tasks);
472b1053f   Li Zefan   cgroups: use a ha...
591
  	INIT_HLIST_NODE(&res->hlist);
817929ec2   Paul Menage   Task Control Grou...
592
593
594
595
596
597
598
  
  	/* Copy the set of subsystem state objects generated in
  	 * find_existing_css_set() */
  	memcpy(res->subsys, template, sizeof(res->subsys));
  
  	write_lock(&css_set_lock);
  	/* Add reference counts and links from the new css_set. */
7717f7ba9   Paul Menage   cgroups: add a ba...
599
600
601
602
603
604
  	list_for_each_entry(link, &oldcg->cg_links, cg_link_list) {
  		struct cgroup *c = link->cgrp;
  		if (c->root == cgrp->root)
  			c = cgrp;
  		link_css_set(&tmp_cg_links, res, c);
  	}
817929ec2   Paul Menage   Task Control Grou...
605
606
  
  	BUG_ON(!list_empty(&tmp_cg_links));
817929ec2   Paul Menage   Task Control Grou...
607
  	css_set_count++;
472b1053f   Li Zefan   cgroups: use a ha...
608
609
610
611
  
  	/* Add this cgroup group to the hash table */
  	hhead = css_set_hash(res->subsys);
  	hlist_add_head(&res->hlist, hhead);
817929ec2   Paul Menage   Task Control Grou...
612
613
614
  	write_unlock(&css_set_lock);
  
  	return res;
b4f48b636   Paul Menage   Task Control Grou...
615
  }
ddbcc7e8e   Paul Menage   Task Control Grou...
616
  /*
7717f7ba9   Paul Menage   cgroups: add a ba...
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
   * Return the cgroup for "task" from the given hierarchy. Must be
   * called with cgroup_mutex held.
   */
  static struct cgroup *task_cgroup_from_root(struct task_struct *task,
  					    struct cgroupfs_root *root)
  {
  	struct css_set *css;
  	struct cgroup *res = NULL;
  
  	BUG_ON(!mutex_is_locked(&cgroup_mutex));
  	read_lock(&css_set_lock);
  	/*
  	 * No need to lock the task - since we hold cgroup_mutex the
  	 * task can't change groups, so the only thing that can happen
  	 * is that it exits and its css is set back to init_css_set.
  	 */
  	css = task->cgroups;
  	if (css == &init_css_set) {
  		res = &root->top_cgroup;
  	} else {
  		struct cg_cgroup_link *link;
  		list_for_each_entry(link, &css->cg_links, cg_link_list) {
  			struct cgroup *c = link->cgrp;
  			if (c->root == root) {
  				res = c;
  				break;
  			}
  		}
  	}
  	read_unlock(&css_set_lock);
  	BUG_ON(!res);
  	return res;
  }
  
  /*
ddbcc7e8e   Paul Menage   Task Control Grou...
652
653
654
655
656
657
658
659
660
   * There is one global cgroup mutex. We also require taking
   * task_lock() when dereferencing a task's cgroup subsys pointers.
   * See "The task_lock() exception", at the end of this comment.
   *
   * A task must hold cgroup_mutex to modify cgroups.
   *
   * Any task can increment and decrement the count field without lock.
   * So in general, code holding cgroup_mutex can't rely on the count
   * field not changing.  However, if the count goes to zero, then only
956db3ca0   Cliff Wickman   hotplug cpu: move...
661
   * cgroup_attach_task() can increment it again.  Because a count of zero
ddbcc7e8e   Paul Menage   Task Control Grou...
662
663
664
665
666
667
668
669
   * means that no tasks are currently attached, therefore there is no
   * way a task attached to that cgroup can fork (the other way to
   * increment the count).  So code holding cgroup_mutex can safely
   * assume that if the count is zero, it will stay zero. Similarly, if
   * a task holds cgroup_mutex on a cgroup with zero count, it
   * knows that the cgroup won't be removed, as cgroup_rmdir()
   * needs that mutex.
   *
ddbcc7e8e   Paul Menage   Task Control Grou...
670
671
672
673
674
   * The fork and exit callbacks cgroup_fork() and cgroup_exit(), don't
   * (usually) take cgroup_mutex.  These are the two most performance
   * critical pieces of code here.  The exception occurs on cgroup_exit(),
   * when a task in a notify_on_release cgroup exits.  Then cgroup_mutex
   * is taken, and if the cgroup count is zero, a usermode call made
a043e3b2c   Li Zefan   cgroup: fix comments
675
676
   * to the release agent with the name of the cgroup (path relative to
   * the root of cgroup file system) as the argument.
ddbcc7e8e   Paul Menage   Task Control Grou...
677
678
679
680
681
682
683
684
685
686
687
   *
   * A cgroup can only be deleted if both its 'count' of using tasks
   * is zero, and its list of 'children' cgroups is empty.  Since all
   * tasks in the system use _some_ cgroup, and since there is always at
   * least one task in the system (init, pid == 1), therefore, top_cgroup
   * always has either children cgroups and/or using tasks.  So we don't
   * need a special hack to ensure that top_cgroup cannot be deleted.
   *
   *	The task_lock() exception
   *
   * The need for this exception arises from the action of
956db3ca0   Cliff Wickman   hotplug cpu: move...
688
   * cgroup_attach_task(), which overwrites one task's cgroup pointer with
a043e3b2c   Li Zefan   cgroup: fix comments
689
   * another.  It does so using cgroup_mutex, however there are
ddbcc7e8e   Paul Menage   Task Control Grou...
690
691
692
   * several performance critical places that need to reference
   * task->cgroup without the expense of grabbing a system global
   * mutex.  Therefore except as noted below, when dereferencing or, as
956db3ca0   Cliff Wickman   hotplug cpu: move...
693
   * in cgroup_attach_task(), modifying a task's cgroup pointer we use
ddbcc7e8e   Paul Menage   Task Control Grou...
694
695
696
697
   * task_lock(), which acts on a spinlock (task->alloc_lock) already in
   * the task_struct routinely used for such matters.
   *
   * P.S.  One more locking exception.  RCU is used to guard the
956db3ca0   Cliff Wickman   hotplug cpu: move...
698
   * update of a task's cgroup pointer by cgroup_attach_task()
ddbcc7e8e   Paul Menage   Task Control Grou...
699
   */
ddbcc7e8e   Paul Menage   Task Control Grou...
700
701
702
703
  /**
   * cgroup_lock - lock out any changes to cgroup structures
   *
   */
ddbcc7e8e   Paul Menage   Task Control Grou...
704
705
706
707
  void cgroup_lock(void)
  {
  	/* Take the global cgroup_mutex; released again by cgroup_unlock(). */
  	mutex_lock(&cgroup_mutex);
  }
67523c48a   Ben Blum   cgroups: blkio su...
708
  EXPORT_SYMBOL_GPL(cgroup_lock);
ddbcc7e8e   Paul Menage   Task Control Grou...
709
710
711
712
713
714
  
  /**
   * cgroup_unlock - release lock on cgroup changes
   *
   * Undo the lock taken in a previous cgroup_lock() call.
   */
ddbcc7e8e   Paul Menage   Task Control Grou...
715
716
717
718
  void cgroup_unlock(void)
  {
  	/* Release the global cgroup_mutex taken by cgroup_lock(). */
  	mutex_unlock(&cgroup_mutex);
  }
67523c48a   Ben Blum   cgroups: blkio su...
719
  EXPORT_SYMBOL_GPL(cgroup_unlock);
ddbcc7e8e   Paul Menage   Task Control Grou...
720
721
722
723
724
725
726
727
728
729
  
  /*
   * A couple of forward declarations required, due to cyclic reference loop:
   * cgroup_mkdir -> cgroup_create -> cgroup_populate_dir ->
   * cgroup_add_file -> cgroup_create_file -> cgroup_dir_inode_operations
   * -> cgroup_mkdir.
   */
  
  static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, int mode);
  static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry);
bd89aabc6   Paul Menage   Control groups: R...
730
  static int cgroup_populate_dir(struct cgroup *cgrp);
6e1d5dcc2   Alexey Dobriyan   const: mark remai...
731
  static const struct inode_operations cgroup_dir_inode_operations;
828c09509   Alexey Dobriyan   const: constify r...
732
  static const struct file_operations proc_cgroupstats_operations;
a424316ca   Paul Menage   Task Control Grou...
733
734
  
  static struct backing_dev_info cgroup_backing_dev_info = {
d993831fa   Jens Axboe   writeback: add na...
735
  	.name		= "cgroup",
e4ad08fe6   Miklos Szeredi   mm: bdi: add sepa...
736
  	.capabilities	= BDI_CAP_NO_ACCT_AND_WRITEBACK,
a424316ca   Paul Menage   Task Control Grou...
737
  };
ddbcc7e8e   Paul Menage   Task Control Grou...
738

38460b48d   KAMEZAWA Hiroyuki   cgroup: CSS ID su...
739
740
  static int alloc_css_id(struct cgroup_subsys *ss,
  			struct cgroup *parent, struct cgroup *child);
ddbcc7e8e   Paul Menage   Task Control Grou...
741
742
743
  /*
   * Allocate an inode on @sb and initialise it with @mode, the caller's
   * fsuid/fsgid, the current time and the cgroup backing_dev_info.
   *
   * Returns the new inode, or NULL if new_inode() could not allocate one.
   */
  static struct inode *cgroup_new_inode(mode_t mode, struct super_block *sb)
  {
  	struct inode *inode = new_inode(sb);

  	if (!inode)
  		return NULL;

  	inode->i_mode = mode;
  	inode->i_uid = current_fsuid();
  	inode->i_gid = current_fsgid();

  	/* Sample the clock once so all three timestamps agree. */
  	inode->i_ctime = CURRENT_TIME;
  	inode->i_mtime = inode->i_ctime;
  	inode->i_atime = inode->i_mtime;

  	inode->i_mapping->backing_dev_info = &cgroup_backing_dev_info;
  	return inode;
  }
4fca88c87   KAMEZAWA Hiroyuki   memory cgroup enh...
754
755
756
757
  /*
   * Call subsys's pre_destroy handler.
   * This is called before css refcnt check.
   */
ec64f5154   KAMEZAWA Hiroyuki   cgroup: fix frequ...
758
  static int cgroup_call_pre_destroy(struct cgroup *cgrp)
4fca88c87   KAMEZAWA Hiroyuki   memory cgroup enh...
759
760
  {
  	struct cgroup_subsys *ss;
ec64f5154   KAMEZAWA Hiroyuki   cgroup: fix frequ...
761
  	int ret = 0;
4fca88c87   KAMEZAWA Hiroyuki   memory cgroup enh...
762
  	for_each_subsys(cgrp->root, ss)
ec64f5154   KAMEZAWA Hiroyuki   cgroup: fix frequ...
763
764
765
  		if (ss->pre_destroy) {
  			ret = ss->pre_destroy(ss, cgrp);
  			if (ret)
4ab78683c   Kirill A. Shutemov   cgroups: fix race...
766
  				break;
ec64f5154   KAMEZAWA Hiroyuki   cgroup: fix frequ...
767
  		}
0dea11687   Kirill A. Shutemov   cgroup: implement...
768

ec64f5154   KAMEZAWA Hiroyuki   cgroup: fix frequ...
769
  	return ret;
4fca88c87   KAMEZAWA Hiroyuki   memory cgroup enh...
770
  }
a47295e6b   Paul Menage   cgroups: make cgr...
771
772
773
774
775
776
  /* RCU callback: free the cgroup whose embedded rcu_head is @obj. */
  static void free_cgroup_rcu(struct rcu_head *obj)
  {
  	kfree(container_of(obj, struct cgroup, rcu_head));
  }
ddbcc7e8e   Paul Menage   Task Control Grou...
777
778
779
780
  static void cgroup_diput(struct dentry *dentry, struct inode *inode)
  {
  	/* is dentry a directory ? if so, kfree() associated cgroup */
  	if (S_ISDIR(inode->i_mode)) {
bd89aabc6   Paul Menage   Control groups: R...
781
  		struct cgroup *cgrp = dentry->d_fsdata;
8dc4f3e17   Paul Menage   cgroups: move cgr...
782
  		struct cgroup_subsys *ss;
bd89aabc6   Paul Menage   Control groups: R...
783
  		BUG_ON(!(cgroup_is_removed(cgrp)));
81a6a5cdd   Paul Menage   Task Control Grou...
784
785
786
787
788
789
790
  		/* It's possible for external users to be holding css
  		 * reference counts on a cgroup; css_put() needs to
  		 * be able to access the cgroup after decrementing
  		 * the reference count in order to know if it needs to
  		 * queue the cgroup to be handled by the release
  		 * agent */
  		synchronize_rcu();
8dc4f3e17   Paul Menage   cgroups: move cgr...
791
792
793
794
795
  
  		mutex_lock(&cgroup_mutex);
  		/*
  		 * Release the subsystem state objects.
  		 */
75139b827   Li Zefan   cgroups: remove s...
796
797
  		for_each_subsys(cgrp->root, ss)
  			ss->destroy(ss, cgrp);
8dc4f3e17   Paul Menage   cgroups: move cgr...
798
799
800
  
  		cgrp->root->number_of_cgroups--;
  		mutex_unlock(&cgroup_mutex);
a47295e6b   Paul Menage   cgroups: make cgr...
801
802
803
804
  		/*
  		 * Drop the active superblock reference that we took when we
  		 * created the cgroup
  		 */
8dc4f3e17   Paul Menage   cgroups: move cgr...
805
  		deactivate_super(cgrp->root->sb);
72a8cb30d   Ben Blum   cgroups: ensure c...
806
807
808
809
810
  		/*
  		 * if we're getting rid of the cgroup, refcount should ensure
  		 * that there are no pidlists left.
  		 */
  		BUG_ON(!list_empty(&cgrp->pidlists));
a47295e6b   Paul Menage   cgroups: make cgr...
811
  		call_rcu(&cgrp->rcu_head, free_cgroup_rcu);
ddbcc7e8e   Paul Menage   Task Control Grou...
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
  	}
  	iput(inode);
  }
  
  /*
   * Delete directory dentry @d and remove it from its parent directory.
   * A reference on the parent is held across the removal.
   */
  static void remove_dir(struct dentry *d)
  {
  	struct dentry *dir;

  	dir = dget(d->d_parent);
  	d_delete(d);
  	simple_rmdir(dir->d_inode, d);
  	dput(dir);
  }
  
  static void cgroup_clear_directory(struct dentry *dentry)
  {
  	struct list_head *node;
  
  	BUG_ON(!mutex_is_locked(&dentry->d_inode->i_mutex));
  	spin_lock(&dcache_lock);
  	node = dentry->d_subdirs.next;
  	while (node != &dentry->d_subdirs) {
  		struct dentry *d = list_entry(node, struct dentry, d_u.d_child);
  		list_del_init(node);
  		if (d->d_inode) {
  			/* This should never be called on a cgroup
  			 * directory with child cgroups */
  			BUG_ON(d->d_inode->i_mode & S_IFDIR);
  			d = dget_locked(d);
  			spin_unlock(&dcache_lock);
  			d_delete(d);
  			simple_unlink(dentry->d_inode, d);
  			dput(d);
  			spin_lock(&dcache_lock);
  		}
  		node = dentry->d_subdirs.next;
  	}
  	spin_unlock(&dcache_lock);
  }
  
  /*
   * Remove a cgroup directory: unlink all entries inside it, detach the
   * dentry from its parent's child list, then remove the directory itself.
   *
   * NOTE : the dentry must have been dget()'ed
   */
  static void cgroup_d_remove_dir(struct dentry *dentry)
  {
  	/* Empty the directory first. */
  	cgroup_clear_directory(dentry);

  	/* Unhook this dentry from its parent's list under dcache_lock. */
  	spin_lock(&dcache_lock);
  	list_del_init(&dentry->d_u.d_child);
  	spin_unlock(&dcache_lock);
  	remove_dir(dentry);
  }
ec64f5154   KAMEZAWA Hiroyuki   cgroup: fix frequ...
863
864
865
866
867
868
  /*
   * A queue for waiters to do rmdir() cgroup. A task will sleep when
   * cgroup->count == 0 && list_empty(&cgroup->children) && subsys has some
   * reference to css->refcnt. In general, this refcnt is expected to go down
   * to zero, soon.
   *
887032670   KAMEZAWA Hiroyuki   cgroup avoid perm...
869
   * CGRP_WAIT_ON_RMDIR flag is set under cgroup's inode->i_mutex;
ec64f5154   KAMEZAWA Hiroyuki   cgroup: fix frequ...
870
871
   */
  DECLARE_WAIT_QUEUE_HEAD(cgroup_rmdir_waitq);
887032670   KAMEZAWA Hiroyuki   cgroup avoid perm...
872
  static void cgroup_wakeup_rmdir_waiter(struct cgroup *cgrp)
ec64f5154   KAMEZAWA Hiroyuki   cgroup: fix frequ...
873
  {
887032670   KAMEZAWA Hiroyuki   cgroup avoid perm...
874
  	if (unlikely(test_and_clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags)))
ec64f5154   KAMEZAWA Hiroyuki   cgroup: fix frequ...
875
876
  		wake_up_all(&cgroup_rmdir_waitq);
  }
887032670   KAMEZAWA Hiroyuki   cgroup avoid perm...
877
878
879
880
881
882
883
884
885
886
  void cgroup_exclude_rmdir(struct cgroup_subsys_state *css)
  {
  	/*
  	 * Take a reference on @css.  NOTE(review): presumably this is what
  	 * holds off a concurrent rmdir until the matching
  	 * cgroup_release_and_wakeup_rmdir() call - confirm against
  	 * cgroup_rmdir().
  	 */
  	css_get(css);
  }
  
  void cgroup_release_and_wakeup_rmdir(struct cgroup_subsys_state *css)
  {
  	/* Wake any rmdir waiters on this css's cgroup, then drop our reference. */
  	cgroup_wakeup_rmdir_waiter(css->cgroup);
  	css_put(css);
  }
aae8aab40   Ben Blum   cgroups: revamp s...
887
  /*
cf5d5941f   Ben Blum   cgroups: subsyste...
888
889
890
   * Call with cgroup_mutex held. Drops reference counts on modules, including
   * any duplicate ones that parse_cgroupfs_options took. If this function
   * returns an error, no reference counts are touched.
aae8aab40   Ben Blum   cgroups: revamp s...
891
   */
ddbcc7e8e   Paul Menage   Task Control Grou...
892
893
894
895
  static int rebind_subsystems(struct cgroupfs_root *root,
  			      unsigned long final_bits)
  {
  	unsigned long added_bits, removed_bits;
bd89aabc6   Paul Menage   Control groups: R...
896
  	struct cgroup *cgrp = &root->top_cgroup;
ddbcc7e8e   Paul Menage   Task Control Grou...
897
  	int i;
aae8aab40   Ben Blum   cgroups: revamp s...
898
  	BUG_ON(!mutex_is_locked(&cgroup_mutex));
ddbcc7e8e   Paul Menage   Task Control Grou...
899
900
901
902
  	removed_bits = root->actual_subsys_bits & ~final_bits;
  	added_bits = final_bits & ~root->actual_subsys_bits;
  	/* Check that any added subsystems are currently free */
  	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
8d53d55d2   Li Zefan   cgroup: fix subsy...
903
  		unsigned long bit = 1UL << i;
ddbcc7e8e   Paul Menage   Task Control Grou...
904
905
906
  		struct cgroup_subsys *ss = subsys[i];
  		if (!(bit & added_bits))
  			continue;
aae8aab40   Ben Blum   cgroups: revamp s...
907
908
909
910
911
912
  		/*
  		 * Nobody should tell us to do a subsys that doesn't exist:
  		 * parse_cgroupfs_options should catch that case and refcounts
  		 * ensure that subsystems won't disappear once selected.
  		 */
  		BUG_ON(ss == NULL);
ddbcc7e8e   Paul Menage   Task Control Grou...
913
914
915
916
917
918
919
920
921
922
  		if (ss->root != &rootnode) {
  			/* Subsystem isn't free */
  			return -EBUSY;
  		}
  	}
  
  	/* Currently we don't handle adding/removing subsystems when
  	 * any child cgroups exist. This is theoretically supportable
  	 * but involves complex error handling, so it's being left until
  	 * later */
307257cf4   Paul Menage   cgroups: fix a ra...
923
  	if (root->number_of_cgroups > 1)
ddbcc7e8e   Paul Menage   Task Control Grou...
924
925
926
927
928
929
930
931
  		return -EBUSY;
  
  	/* Process each subsystem */
  	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
  		struct cgroup_subsys *ss = subsys[i];
  		unsigned long bit = 1UL << i;
  		if (bit & added_bits) {
  			/* We're binding this subsystem to this hierarchy */
aae8aab40   Ben Blum   cgroups: revamp s...
932
  			BUG_ON(ss == NULL);
bd89aabc6   Paul Menage   Control groups: R...
933
  			BUG_ON(cgrp->subsys[i]);
ddbcc7e8e   Paul Menage   Task Control Grou...
934
935
  			BUG_ON(!dummytop->subsys[i]);
  			BUG_ON(dummytop->subsys[i]->cgroup != dummytop);
999cd8a45   Paul Menage   cgroups: add a pe...
936
  			mutex_lock(&ss->hierarchy_mutex);
bd89aabc6   Paul Menage   Control groups: R...
937
938
  			cgrp->subsys[i] = dummytop->subsys[i];
  			cgrp->subsys[i]->cgroup = cgrp;
33a68ac1c   Li Zefan   cgroups: add inac...
939
  			list_move(&ss->sibling, &root->subsys_list);
b2aa30f7b   Lai Jiangshan   cgroups: don't pu...
940
  			ss->root = root;
ddbcc7e8e   Paul Menage   Task Control Grou...
941
  			if (ss->bind)
bd89aabc6   Paul Menage   Control groups: R...
942
  				ss->bind(ss, cgrp);
999cd8a45   Paul Menage   cgroups: add a pe...
943
  			mutex_unlock(&ss->hierarchy_mutex);
cf5d5941f   Ben Blum   cgroups: subsyste...
944
  			/* refcount was already taken, and we're keeping it */
ddbcc7e8e   Paul Menage   Task Control Grou...
945
946
  		} else if (bit & removed_bits) {
  			/* We're removing this subsystem */
aae8aab40   Ben Blum   cgroups: revamp s...
947
  			BUG_ON(ss == NULL);
bd89aabc6   Paul Menage   Control groups: R...
948
949
  			BUG_ON(cgrp->subsys[i] != dummytop->subsys[i]);
  			BUG_ON(cgrp->subsys[i]->cgroup != cgrp);
999cd8a45   Paul Menage   cgroups: add a pe...
950
  			mutex_lock(&ss->hierarchy_mutex);
ddbcc7e8e   Paul Menage   Task Control Grou...
951
952
953
  			if (ss->bind)
  				ss->bind(ss, dummytop);
  			dummytop->subsys[i]->cgroup = dummytop;
bd89aabc6   Paul Menage   Control groups: R...
954
  			cgrp->subsys[i] = NULL;
b2aa30f7b   Lai Jiangshan   cgroups: don't pu...
955
  			subsys[i]->root = &rootnode;
33a68ac1c   Li Zefan   cgroups: add inac...
956
  			list_move(&ss->sibling, &rootnode.subsys_list);
999cd8a45   Paul Menage   cgroups: add a pe...
957
  			mutex_unlock(&ss->hierarchy_mutex);
cf5d5941f   Ben Blum   cgroups: subsyste...
958
959
  			/* subsystem is now free - drop reference on module */
  			module_put(ss->module);
ddbcc7e8e   Paul Menage   Task Control Grou...
960
961
  		} else if (bit & final_bits) {
  			/* Subsystem state should already exist */
aae8aab40   Ben Blum   cgroups: revamp s...
962
  			BUG_ON(ss == NULL);
bd89aabc6   Paul Menage   Control groups: R...
963
  			BUG_ON(!cgrp->subsys[i]);
cf5d5941f   Ben Blum   cgroups: subsyste...
964
965
966
967
968
969
970
971
  			/*
  			 * a refcount was taken, but we already had one, so
  			 * drop the extra reference.
  			 */
  			module_put(ss->module);
  #ifdef CONFIG_MODULE_UNLOAD
  			BUG_ON(ss->module && !module_refcount(ss->module));
  #endif
ddbcc7e8e   Paul Menage   Task Control Grou...
972
973
  		} else {
  			/* Subsystem state shouldn't exist */
bd89aabc6   Paul Menage   Control groups: R...
974
  			BUG_ON(cgrp->subsys[i]);
ddbcc7e8e   Paul Menage   Task Control Grou...
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
  		}
  	}
  	root->subsys_bits = root->actual_subsys_bits = final_bits;
  	synchronize_rcu();
  
  	return 0;
  }
  
  /*
   * Emit the mount options of the hierarchy behind @vfs into @seq: one
   * entry per bound subsystem, then "noprefix", "release_agent=" and
   * "name=" when they apply.  Always returns 0.
   */
  static int cgroup_show_options(struct seq_file *seq, struct vfsmount *vfs)
  {
  	struct cgroupfs_root *root = vfs->mnt_sb->s_fs_info;
  	struct cgroup_subsys *ss;

  	mutex_lock(&cgroup_mutex);

  	for_each_subsys(root, ss)
  		seq_printf(seq, ",%s", ss->name);

  	if (test_bit(ROOT_NOPREFIX, &root->flags))
  		seq_puts(seq, ",noprefix");

  	/* Only non-empty strings are reported. */
  	if (root->release_agent_path[0] != '\0')
  		seq_printf(seq, ",release_agent=%s", root->release_agent_path);
  	if (root->name[0] != '\0')
  		seq_printf(seq, ",name=%s", root->name);

  	mutex_unlock(&cgroup_mutex);
  	return 0;
  }
  
  struct cgroup_sb_opts {
  	unsigned long subsys_bits;
  	unsigned long flags;
81a6a5cdd   Paul Menage   Task Control Grou...
1004
  	char *release_agent;
c6d57f331   Paul Menage   cgroups: support ...
1005
  	char *name;
2c6ab6d20   Paul Menage   cgroups: allow cg...
1006
1007
  	/* User explicitly requested empty subsystem */
  	bool none;
c6d57f331   Paul Menage   cgroups: support ...
1008
1009
  
  	struct cgroupfs_root *new_root;
2c6ab6d20   Paul Menage   cgroups: allow cg...
1010

ddbcc7e8e   Paul Menage   Task Control Grou...
1011
  };
aae8aab40   Ben Blum   cgroups: revamp s...
1012
1013
  /*
   * Convert a hierarchy specifier into a bitmask of subsystems and flags. Call
cf5d5941f   Ben Blum   cgroups: subsyste...
1014
1015
1016
   * with cgroup_mutex held to protect the subsys[] array. This function takes
   * refcounts on subsystems to be used, unless it returns error, in which case
   * no refcounts are taken.
aae8aab40   Ben Blum   cgroups: revamp s...
1017
   */
cf5d5941f   Ben Blum   cgroups: subsyste...
1018
  static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
ddbcc7e8e   Paul Menage   Task Control Grou...
1019
1020
  {
  	char *token, *o = data ?: "all";
f9ab5b5b0   Li Zefan   cgroups: forbid n...
1021
  	unsigned long mask = (unsigned long)-1;
cf5d5941f   Ben Blum   cgroups: subsyste...
1022
1023
  	int i;
  	bool module_pin_failed = false;
f9ab5b5b0   Li Zefan   cgroups: forbid n...
1024

aae8aab40   Ben Blum   cgroups: revamp s...
1025
  	BUG_ON(!mutex_is_locked(&cgroup_mutex));
f9ab5b5b0   Li Zefan   cgroups: forbid n...
1026
1027
1028
  #ifdef CONFIG_CPUSETS
  	mask = ~(1UL << cpuset_subsys_id);
  #endif
ddbcc7e8e   Paul Menage   Task Control Grou...
1029

c6d57f331   Paul Menage   cgroups: support ...
1030
  	memset(opts, 0, sizeof(*opts));
ddbcc7e8e   Paul Menage   Task Control Grou...
1031
1032
1033
1034
1035
  
  	while ((token = strsep(&o, ",")) != NULL) {
  		if (!*token)
  			return -EINVAL;
  		if (!strcmp(token, "all")) {
8bab8dded   Paul Menage   cgroups: add cgro...
1036
  			/* Add all non-disabled subsystems */
8bab8dded   Paul Menage   cgroups: add cgro...
1037
1038
1039
  			opts->subsys_bits = 0;
  			for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
  				struct cgroup_subsys *ss = subsys[i];
aae8aab40   Ben Blum   cgroups: revamp s...
1040
1041
  				if (ss == NULL)
  					continue;
8bab8dded   Paul Menage   cgroups: add cgro...
1042
1043
1044
  				if (!ss->disabled)
  					opts->subsys_bits |= 1ul << i;
  			}
2c6ab6d20   Paul Menage   cgroups: allow cg...
1045
1046
1047
  		} else if (!strcmp(token, "none")) {
  			/* Explicitly have no subsystems */
  			opts->none = true;
ddbcc7e8e   Paul Menage   Task Control Grou...
1048
1049
  		} else if (!strcmp(token, "noprefix")) {
  			set_bit(ROOT_NOPREFIX, &opts->flags);
81a6a5cdd   Paul Menage   Task Control Grou...
1050
1051
1052
1053
  		} else if (!strncmp(token, "release_agent=", 14)) {
  			/* Specifying two release agents is forbidden */
  			if (opts->release_agent)
  				return -EINVAL;
c6d57f331   Paul Menage   cgroups: support ...
1054
1055
  			opts->release_agent =
  				kstrndup(token + 14, PATH_MAX, GFP_KERNEL);
81a6a5cdd   Paul Menage   Task Control Grou...
1056
1057
  			if (!opts->release_agent)
  				return -ENOMEM;
c6d57f331   Paul Menage   cgroups: support ...
1058
  		} else if (!strncmp(token, "name=", 5)) {
c6d57f331   Paul Menage   cgroups: support ...
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
  			const char *name = token + 5;
  			/* Can't specify an empty name */
  			if (!strlen(name))
  				return -EINVAL;
  			/* Must match [\w.-]+ */
  			for (i = 0; i < strlen(name); i++) {
  				char c = name[i];
  				if (isalnum(c))
  					continue;
  				if ((c == '.') || (c == '-') || (c == '_'))
  					continue;
  				return -EINVAL;
  			}
  			/* Specifying two names is forbidden */
  			if (opts->name)
  				return -EINVAL;
  			opts->name = kstrndup(name,
  					      MAX_CGROUP_ROOT_NAMELEN,
  					      GFP_KERNEL);
  			if (!opts->name)
  				return -ENOMEM;
ddbcc7e8e   Paul Menage   Task Control Grou...
1080
1081
  		} else {
  			struct cgroup_subsys *ss;
ddbcc7e8e   Paul Menage   Task Control Grou...
1082
1083
  			for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
  				ss = subsys[i];
aae8aab40   Ben Blum   cgroups: revamp s...
1084
1085
  				if (ss == NULL)
  					continue;
ddbcc7e8e   Paul Menage   Task Control Grou...
1086
  				if (!strcmp(token, ss->name)) {
8bab8dded   Paul Menage   cgroups: add cgro...
1087
1088
  					if (!ss->disabled)
  						set_bit(i, &opts->subsys_bits);
ddbcc7e8e   Paul Menage   Task Control Grou...
1089
1090
1091
1092
1093
1094
1095
  					break;
  				}
  			}
  			if (i == CGROUP_SUBSYS_COUNT)
  				return -ENOENT;
  		}
  	}
2c6ab6d20   Paul Menage   cgroups: allow cg...
1096
  	/* Consistency checks */
f9ab5b5b0   Li Zefan   cgroups: forbid n...
1097
1098
1099
1100
1101
1102
1103
1104
  	/*
  	 * Option noprefix was introduced just for backward compatibility
  	 * with the old cpuset, so we allow noprefix only if mounting just
  	 * the cpuset subsystem.
  	 */
  	if (test_bit(ROOT_NOPREFIX, &opts->flags) &&
  	    (opts->subsys_bits & mask))
  		return -EINVAL;
2c6ab6d20   Paul Menage   cgroups: allow cg...
1105
1106
1107
1108
1109
1110
1111
1112
1113
  
  	/* Can't specify "none" and some subsystems */
  	if (opts->subsys_bits && opts->none)
  		return -EINVAL;
  
  	/*
  	 * We either have to specify by name or by subsystems. (So all
  	 * empty hierarchies must have a name).
  	 */
c6d57f331   Paul Menage   cgroups: support ...
1114
  	if (!opts->subsys_bits && !opts->name)
ddbcc7e8e   Paul Menage   Task Control Grou...
1115
  		return -EINVAL;
cf5d5941f   Ben Blum   cgroups: subsyste...
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
  	/*
  	 * Grab references on all the modules we'll need, so the subsystems
  	 * don't dance around before rebind_subsystems attaches them. This may
  	 * take duplicate reference counts on a subsystem that's already used,
  	 * but rebind_subsystems handles this case.
  	 */
  	for (i = CGROUP_BUILTIN_SUBSYS_COUNT; i < CGROUP_SUBSYS_COUNT; i++) {
  		unsigned long bit = 1UL << i;
  
  		if (!(bit & opts->subsys_bits))
  			continue;
  		if (!try_module_get(subsys[i]->module)) {
  			module_pin_failed = true;
  			break;
  		}
  	}
  	if (module_pin_failed) {
  		/*
  		 * oops, one of the modules was going away. this means that we
  		 * raced with a module_delete call, and to the user this is
  		 * essentially a "subsystem doesn't exist" case.
  		 */
  		for (i--; i >= CGROUP_BUILTIN_SUBSYS_COUNT; i--) {
  			/* drop refcounts only on the ones we took */
  			unsigned long bit = 1UL << i;
  
  			if (!(bit & opts->subsys_bits))
  				continue;
  			module_put(subsys[i]->module);
  		}
  		return -ENOENT;
  	}
ddbcc7e8e   Paul Menage   Task Control Grou...
1148
1149
  	return 0;
  }
cf5d5941f   Ben Blum   cgroups: subsyste...
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
  /*
   * Drop one module reference for every modular subsystem (index at or
   * above CGROUP_BUILTIN_SUBSYS_COUNT) whose bit is set in @subsys_bits.
   */
  static void drop_parsed_module_refcounts(unsigned long subsys_bits)
  {
  	int idx;

  	for (idx = CGROUP_BUILTIN_SUBSYS_COUNT;
  	     idx < CGROUP_SUBSYS_COUNT; idx++) {
  		if (subsys_bits & (1UL << idx))
  			module_put(subsys[idx]->module);
  	}
  }
ddbcc7e8e   Paul Menage   Task Control Grou...
1161
1162
1163
1164
  static int cgroup_remount(struct super_block *sb, int *flags, char *data)
  {
  	int ret = 0;
  	struct cgroupfs_root *root = sb->s_fs_info;
bd89aabc6   Paul Menage   Control groups: R...
1165
  	struct cgroup *cgrp = &root->top_cgroup;
ddbcc7e8e   Paul Menage   Task Control Grou...
1166
  	struct cgroup_sb_opts opts;
337eb00a2   Alessio Igor Bogani   Push BKL down int...
1167
  	lock_kernel();
bd89aabc6   Paul Menage   Control groups: R...
1168
  	mutex_lock(&cgrp->dentry->d_inode->i_mutex);
ddbcc7e8e   Paul Menage   Task Control Grou...
1169
1170
1171
1172
1173
1174
  	mutex_lock(&cgroup_mutex);
  
  	/* See what subsystems are wanted */
  	ret = parse_cgroupfs_options(data, &opts);
  	if (ret)
  		goto out_unlock;
cf5d5941f   Ben Blum   cgroups: subsyste...
1175
1176
1177
  	/* Don't allow flags or name to change at remount */
  	if (opts.flags != root->flags ||
  	    (opts.name && strcmp(opts.name, root->name))) {
c6d57f331   Paul Menage   cgroups: support ...
1178
  		ret = -EINVAL;
cf5d5941f   Ben Blum   cgroups: subsyste...
1179
  		drop_parsed_module_refcounts(opts.subsys_bits);
c6d57f331   Paul Menage   cgroups: support ...
1180
1181
  		goto out_unlock;
  	}
ddbcc7e8e   Paul Menage   Task Control Grou...
1182
  	ret = rebind_subsystems(root, opts.subsys_bits);
cf5d5941f   Ben Blum   cgroups: subsyste...
1183
1184
  	if (ret) {
  		drop_parsed_module_refcounts(opts.subsys_bits);
0670e08bd   Li Zefan   cgroups: don't ch...
1185
  		goto out_unlock;
cf5d5941f   Ben Blum   cgroups: subsyste...
1186
  	}
ddbcc7e8e   Paul Menage   Task Control Grou...
1187
1188
  
  	/* (re)populate subsystem files */
0670e08bd   Li Zefan   cgroups: don't ch...
1189
  	cgroup_populate_dir(cgrp);
ddbcc7e8e   Paul Menage   Task Control Grou...
1190

81a6a5cdd   Paul Menage   Task Control Grou...
1191
1192
  	if (opts.release_agent)
  		strcpy(root->release_agent_path, opts.release_agent);
ddbcc7e8e   Paul Menage   Task Control Grou...
1193
   out_unlock:
66bdc9cfc   Jesper Juhl   kernel/cgroup.c: ...
1194
  	kfree(opts.release_agent);
c6d57f331   Paul Menage   cgroups: support ...
1195
  	kfree(opts.name);
ddbcc7e8e   Paul Menage   Task Control Grou...
1196
  	mutex_unlock(&cgroup_mutex);
bd89aabc6   Paul Menage   Control groups: R...
1197
  	mutex_unlock(&cgrp->dentry->d_inode->i_mutex);
337eb00a2   Alessio Igor Bogani   Push BKL down int...
1198
  	unlock_kernel();
ddbcc7e8e   Paul Menage   Task Control Grou...
1199
1200
  	return ret;
  }
b87221de6   Alexey Dobriyan   const: mark remai...
1201
  static const struct super_operations cgroup_ops = {
ddbcc7e8e   Paul Menage   Task Control Grou...
1202
1203
1204
1205
1206
  	.statfs = simple_statfs,
  	.drop_inode = generic_delete_inode,
  	.show_options = cgroup_show_options,
  	.remount_fs = cgroup_remount,
  };
cc31edcee   Paul Menage   cgroups: convert ...
1207
1208
1209
1210
1211
1212
  /* Initialise the list heads and locks embedded in @cgrp. */
  static void init_cgroup_housekeeping(struct cgroup *cgrp)
  {
  	/* List heads */
  	INIT_LIST_HEAD(&cgrp->sibling);
  	INIT_LIST_HEAD(&cgrp->children);
  	INIT_LIST_HEAD(&cgrp->css_sets);
  	INIT_LIST_HEAD(&cgrp->release_list);
  	INIT_LIST_HEAD(&cgrp->pidlists);
  	INIT_LIST_HEAD(&cgrp->event_list);

  	/* Locks */
  	mutex_init(&cgrp->pidlist_mutex);
  	spin_lock_init(&cgrp->event_list_lock);
  }
c6d57f331   Paul Menage   cgroups: support ...
1218

ddbcc7e8e   Paul Menage   Task Control Grou...
1219
1220
  static void init_cgroup_root(struct cgroupfs_root *root)
  {
bd89aabc6   Paul Menage   Control groups: R...
1221
  	struct cgroup *cgrp = &root->top_cgroup;
ddbcc7e8e   Paul Menage   Task Control Grou...
1222
1223
1224
  	INIT_LIST_HEAD(&root->subsys_list);
  	INIT_LIST_HEAD(&root->root_list);
  	root->number_of_cgroups = 1;
bd89aabc6   Paul Menage   Control groups: R...
1225
1226
  	cgrp->root = root;
  	cgrp->top_cgroup = cgrp;
cc31edcee   Paul Menage   cgroups: convert ...
1227
  	init_cgroup_housekeeping(cgrp);
ddbcc7e8e   Paul Menage   Task Control Grou...
1228
  }
2c6ab6d20   Paul Menage   cgroups: allow cg...
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
  /*
   * Allocate a hierarchy ID for @root from hierarchy_ida.
   *
   * Returns true once an ID has been stored in root->hierarchy_id, or
   * false if ida_pre_get() could not preallocate memory.
   */
  static bool init_root_id(struct cgroupfs_root *root)
  {
  	for (;;) {
  		int err;

  		if (!ida_pre_get(&hierarchy_ida, GFP_KERNEL))
  			return false;

  		spin_lock(&hierarchy_id_lock);
  		/* Try to allocate the next unused ID */
  		err = ida_get_new_above(&hierarchy_ida, next_hierarchy_id,
  					&root->hierarchy_id);
  		/* Try again starting from 0 */
  		if (err == -ENOSPC)
  			err = ida_get_new(&hierarchy_ida, &root->hierarchy_id);

  		if (err == 0)
  			next_hierarchy_id = root->hierarchy_id + 1;
  		else
  			/* Can only get here if the 31-bit IDR is full ... */
  			BUG_ON(err != -EAGAIN);
  		spin_unlock(&hierarchy_id_lock);

  		/* -EAGAIN: go round again with a fresh preallocation. */
  		if (err == 0)
  			return true;
  	}
  }
ddbcc7e8e   Paul Menage   Task Control Grou...
1253
1254
  static int cgroup_test_super(struct super_block *sb, void *data)
  {
c6d57f331   Paul Menage   cgroups: support ...
1255
  	struct cgroup_sb_opts *opts = data;
ddbcc7e8e   Paul Menage   Task Control Grou...
1256
  	struct cgroupfs_root *root = sb->s_fs_info;
c6d57f331   Paul Menage   cgroups: support ...
1257
1258
1259
  	/* If we asked for a name then it must match */
  	if (opts->name && strcmp(opts->name, root->name))
  		return 0;
ddbcc7e8e   Paul Menage   Task Control Grou...
1260

2c6ab6d20   Paul Menage   cgroups: allow cg...
1261
1262
1263
1264
1265
1266
  	/*
  	 * If we asked for subsystems (or explicitly for no
  	 * subsystems) then they must match
  	 */
  	if ((opts->subsys_bits || opts->none)
  	    && (opts->subsys_bits != root->subsys_bits))
ddbcc7e8e   Paul Menage   Task Control Grou...
1267
1268
1269
1270
  		return 0;
  
  	return 1;
  }
c6d57f331   Paul Menage   cgroups: support ...
1271
1272
1273
  static struct cgroupfs_root *cgroup_root_from_opts(struct cgroup_sb_opts *opts)
  {
  	struct cgroupfs_root *root;
2c6ab6d20   Paul Menage   cgroups: allow cg...
1274
  	if (!opts->subsys_bits && !opts->none)
c6d57f331   Paul Menage   cgroups: support ...
1275
1276
1277
1278
1279
  		return NULL;
  
  	root = kzalloc(sizeof(*root), GFP_KERNEL);
  	if (!root)
  		return ERR_PTR(-ENOMEM);
2c6ab6d20   Paul Menage   cgroups: allow cg...
1280
1281
1282
1283
  	if (!init_root_id(root)) {
  		kfree(root);
  		return ERR_PTR(-ENOMEM);
  	}
c6d57f331   Paul Menage   cgroups: support ...
1284
  	init_cgroup_root(root);
2c6ab6d20   Paul Menage   cgroups: allow cg...
1285

c6d57f331   Paul Menage   cgroups: support ...
1286
1287
1288
1289
1290
1291
1292
1293
  	root->subsys_bits = opts->subsys_bits;
  	root->flags = opts->flags;
  	if (opts->release_agent)
  		strcpy(root->release_agent_path, opts->release_agent);
  	if (opts->name)
  		strcpy(root->name, opts->name);
  	return root;
  }
2c6ab6d20   Paul Menage   cgroups: allow cg...
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
  static void cgroup_drop_root(struct cgroupfs_root *root)
  {
  	if (!root)
  		return;
  
  	BUG_ON(!root->hierarchy_id);
  	spin_lock(&hierarchy_id_lock);
  	ida_remove(&hierarchy_ida, root->hierarchy_id);
  	spin_unlock(&hierarchy_id_lock);
  	kfree(root);
  }
ddbcc7e8e   Paul Menage   Task Control Grou...
1305
1306
1307
  static int cgroup_set_super(struct super_block *sb, void *data)
  {
  	int ret;
c6d57f331   Paul Menage   cgroups: support ...
1308
1309
1310
1311
1312
  	struct cgroup_sb_opts *opts = data;
  
  	/* If we don't have a new root, we can't set up a new sb */
  	if (!opts->new_root)
  		return -EINVAL;
2c6ab6d20   Paul Menage   cgroups: allow cg...
1313
  	BUG_ON(!opts->subsys_bits && !opts->none);
ddbcc7e8e   Paul Menage   Task Control Grou...
1314
1315
1316
1317
  
  	ret = set_anon_super(sb, NULL);
  	if (ret)
  		return ret;
c6d57f331   Paul Menage   cgroups: support ...
1318
1319
  	sb->s_fs_info = opts->new_root;
  	opts->new_root->sb = sb;
ddbcc7e8e   Paul Menage   Task Control Grou...
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
  
  	sb->s_blocksize = PAGE_CACHE_SIZE;
  	sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
  	sb->s_magic = CGROUP_SUPER_MAGIC;
  	sb->s_op = &cgroup_ops;
  
  	return 0;
  }
  
  static int cgroup_get_rootdir(struct super_block *sb)
  {
  	struct inode *inode =
  		cgroup_new_inode(S_IFDIR | S_IRUGO | S_IXUGO | S_IWUSR, sb);
  	struct dentry *dentry;
  
  	if (!inode)
  		return -ENOMEM;
ddbcc7e8e   Paul Menage   Task Control Grou...
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
  	inode->i_fop = &simple_dir_operations;
  	inode->i_op = &cgroup_dir_inode_operations;
  	/* directories start off with i_nlink == 2 (for "." entry) */
  	inc_nlink(inode);
  	dentry = d_alloc_root(inode);
  	if (!dentry) {
  		iput(inode);
  		return -ENOMEM;
  	}
  	sb->s_root = dentry;
  	return 0;
  }
  
  static int cgroup_get_sb(struct file_system_type *fs_type,
  			 int flags, const char *unused_dev_name,
  			 void *data, struct vfsmount *mnt)
  {
  	struct cgroup_sb_opts opts;
c6d57f331   Paul Menage   cgroups: support ...
1355
  	struct cgroupfs_root *root;
ddbcc7e8e   Paul Menage   Task Control Grou...
1356
1357
  	int ret = 0;
  	struct super_block *sb;
c6d57f331   Paul Menage   cgroups: support ...
1358
  	struct cgroupfs_root *new_root;
ddbcc7e8e   Paul Menage   Task Control Grou...
1359
1360
  
  	/* First find the desired set of subsystems */
aae8aab40   Ben Blum   cgroups: revamp s...
1361
  	mutex_lock(&cgroup_mutex);
ddbcc7e8e   Paul Menage   Task Control Grou...
1362
  	ret = parse_cgroupfs_options(data, &opts);
aae8aab40   Ben Blum   cgroups: revamp s...
1363
  	mutex_unlock(&cgroup_mutex);
c6d57f331   Paul Menage   cgroups: support ...
1364
1365
  	if (ret)
  		goto out_err;
ddbcc7e8e   Paul Menage   Task Control Grou...
1366

c6d57f331   Paul Menage   cgroups: support ...
1367
1368
1369
1370
1371
1372
1373
  	/*
  	 * Allocate a new cgroup root. We may not need it if we're
  	 * reusing an existing hierarchy.
  	 */
  	new_root = cgroup_root_from_opts(&opts);
  	if (IS_ERR(new_root)) {
  		ret = PTR_ERR(new_root);
cf5d5941f   Ben Blum   cgroups: subsyste...
1374
  		goto drop_modules;
81a6a5cdd   Paul Menage   Task Control Grou...
1375
  	}
c6d57f331   Paul Menage   cgroups: support ...
1376
  	opts.new_root = new_root;
ddbcc7e8e   Paul Menage   Task Control Grou...
1377

c6d57f331   Paul Menage   cgroups: support ...
1378
1379
  	/* Locate an existing or new sb for this hierarchy */
  	sb = sget(fs_type, cgroup_test_super, cgroup_set_super, &opts);
ddbcc7e8e   Paul Menage   Task Control Grou...
1380
  	if (IS_ERR(sb)) {
c6d57f331   Paul Menage   cgroups: support ...
1381
  		ret = PTR_ERR(sb);
2c6ab6d20   Paul Menage   cgroups: allow cg...
1382
  		cgroup_drop_root(opts.new_root);
cf5d5941f   Ben Blum   cgroups: subsyste...
1383
  		goto drop_modules;
ddbcc7e8e   Paul Menage   Task Control Grou...
1384
  	}
c6d57f331   Paul Menage   cgroups: support ...
1385
1386
1387
1388
1389
  	root = sb->s_fs_info;
  	BUG_ON(!root);
  	if (root == opts.new_root) {
  		/* We used the new root structure, so this is a new hierarchy */
  		struct list_head tmp_cg_links;
c12f65d43   Li Zefan   cgroups: introduc...
1390
  		struct cgroup *root_cgrp = &root->top_cgroup;
817929ec2   Paul Menage   Task Control Grou...
1391
  		struct inode *inode;
c6d57f331   Paul Menage   cgroups: support ...
1392
  		struct cgroupfs_root *existing_root;
28fd5dfc1   Li Zefan   cgroups: remove t...
1393
  		int i;
ddbcc7e8e   Paul Menage   Task Control Grou...
1394
1395
1396
1397
1398
1399
  
  		BUG_ON(sb->s_root != NULL);
  
  		ret = cgroup_get_rootdir(sb);
  		if (ret)
  			goto drop_new_super;
817929ec2   Paul Menage   Task Control Grou...
1400
  		inode = sb->s_root->d_inode;
ddbcc7e8e   Paul Menage   Task Control Grou...
1401

817929ec2   Paul Menage   Task Control Grou...
1402
  		mutex_lock(&inode->i_mutex);
ddbcc7e8e   Paul Menage   Task Control Grou...
1403
  		mutex_lock(&cgroup_mutex);
c6d57f331   Paul Menage   cgroups: support ...
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
  		if (strlen(root->name)) {
  			/* Check for name clashes with existing mounts */
  			for_each_active_root(existing_root) {
  				if (!strcmp(existing_root->name, root->name)) {
  					ret = -EBUSY;
  					mutex_unlock(&cgroup_mutex);
  					mutex_unlock(&inode->i_mutex);
  					goto drop_new_super;
  				}
  			}
  		}
817929ec2   Paul Menage   Task Control Grou...
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
  		/*
  		 * We're accessing css_set_count without locking
  		 * css_set_lock here, but that's OK - it can only be
  		 * increased by someone holding cgroup_lock, and
  		 * that's us. The worst that can happen is that we
  		 * have some link structures left over
  		 */
  		ret = allocate_cg_links(css_set_count, &tmp_cg_links);
  		if (ret) {
  			mutex_unlock(&cgroup_mutex);
  			mutex_unlock(&inode->i_mutex);
  			goto drop_new_super;
  		}
ddbcc7e8e   Paul Menage   Task Control Grou...
1428
1429
1430
  		ret = rebind_subsystems(root, root->subsys_bits);
  		if (ret == -EBUSY) {
  			mutex_unlock(&cgroup_mutex);
817929ec2   Paul Menage   Task Control Grou...
1431
  			mutex_unlock(&inode->i_mutex);
c6d57f331   Paul Menage   cgroups: support ...
1432
1433
  			free_cg_links(&tmp_cg_links);
  			goto drop_new_super;
ddbcc7e8e   Paul Menage   Task Control Grou...
1434
  		}
cf5d5941f   Ben Blum   cgroups: subsyste...
1435
1436
1437
1438
1439
  		/*
  		 * There must be no failure case after here, since rebinding
  		 * takes care of subsystems' refcounts, which are explicitly
  		 * dropped in the failure exit path.
  		 */
ddbcc7e8e   Paul Menage   Task Control Grou...
1440
1441
1442
1443
1444
  
  		/* EBUSY should be the only error here */
  		BUG_ON(ret);
  
  		list_add(&root->root_list, &roots);
817929ec2   Paul Menage   Task Control Grou...
1445
  		root_count++;
ddbcc7e8e   Paul Menage   Task Control Grou...
1446

c12f65d43   Li Zefan   cgroups: introduc...
1447
  		sb->s_root->d_fsdata = root_cgrp;
ddbcc7e8e   Paul Menage   Task Control Grou...
1448
  		root->top_cgroup.dentry = sb->s_root;
817929ec2   Paul Menage   Task Control Grou...
1449
1450
1451
  		/* Link the top cgroup in this hierarchy into all
  		 * the css_set objects */
  		write_lock(&css_set_lock);
28fd5dfc1   Li Zefan   cgroups: remove t...
1452
1453
1454
  		for (i = 0; i < CSS_SET_TABLE_SIZE; i++) {
  			struct hlist_head *hhead = &css_set_table[i];
  			struct hlist_node *node;
817929ec2   Paul Menage   Task Control Grou...
1455
  			struct css_set *cg;
28fd5dfc1   Li Zefan   cgroups: remove t...
1456

c12f65d43   Li Zefan   cgroups: introduc...
1457
1458
  			hlist_for_each_entry(cg, node, hhead, hlist)
  				link_css_set(&tmp_cg_links, cg, root_cgrp);
28fd5dfc1   Li Zefan   cgroups: remove t...
1459
  		}
817929ec2   Paul Menage   Task Control Grou...
1460
1461
1462
  		write_unlock(&css_set_lock);
  
  		free_cg_links(&tmp_cg_links);
c12f65d43   Li Zefan   cgroups: introduc...
1463
1464
  		BUG_ON(!list_empty(&root_cgrp->sibling));
  		BUG_ON(!list_empty(&root_cgrp->children));
ddbcc7e8e   Paul Menage   Task Control Grou...
1465
  		BUG_ON(root->number_of_cgroups != 1);
c12f65d43   Li Zefan   cgroups: introduc...
1466
  		cgroup_populate_dir(root_cgrp);
ddbcc7e8e   Paul Menage   Task Control Grou...
1467
  		mutex_unlock(&cgroup_mutex);
34f77a90f   Xiaotian Feng   cgroups: make unl...
1468
  		mutex_unlock(&inode->i_mutex);
c6d57f331   Paul Menage   cgroups: support ...
1469
1470
1471
1472
1473
  	} else {
  		/*
  		 * We re-used an existing hierarchy - the new root (if
  		 * any) is not needed
  		 */
2c6ab6d20   Paul Menage   cgroups: allow cg...
1474
  		cgroup_drop_root(opts.new_root);
cf5d5941f   Ben Blum   cgroups: subsyste...
1475
1476
  		/* no subsys rebinding, so refcounts don't change */
  		drop_parsed_module_refcounts(opts.subsys_bits);
ddbcc7e8e   Paul Menage   Task Control Grou...
1477
  	}
a3ec947c8   Sukadev Bhattiprolu   vfs: simple_set_m...
1478
  	simple_set_mnt(mnt, sb);
c6d57f331   Paul Menage   cgroups: support ...
1479
1480
  	kfree(opts.release_agent);
  	kfree(opts.name);
a3ec947c8   Sukadev Bhattiprolu   vfs: simple_set_m...
1481
  	return 0;
ddbcc7e8e   Paul Menage   Task Control Grou...
1482
1483
  
   drop_new_super:
6f5bbff9a   Al Viro   Convert obvious p...
1484
  	deactivate_locked_super(sb);
cf5d5941f   Ben Blum   cgroups: subsyste...
1485
1486
   drop_modules:
  	drop_parsed_module_refcounts(opts.subsys_bits);
c6d57f331   Paul Menage   cgroups: support ...
1487
1488
1489
   out_err:
  	kfree(opts.release_agent);
  	kfree(opts.name);
ddbcc7e8e   Paul Menage   Task Control Grou...
1490
1491
1492
1493
1494
  	return ret;
  }
  
  static void cgroup_kill_sb(struct super_block *sb) {
  	struct cgroupfs_root *root = sb->s_fs_info;
bd89aabc6   Paul Menage   Control groups: R...
1495
  	struct cgroup *cgrp = &root->top_cgroup;
ddbcc7e8e   Paul Menage   Task Control Grou...
1496
  	int ret;
71cbb949d   KOSAKI Motohiro   cgroup: list_for_...
1497
1498
  	struct cg_cgroup_link *link;
  	struct cg_cgroup_link *saved_link;
ddbcc7e8e   Paul Menage   Task Control Grou...
1499
1500
1501
1502
  
  	BUG_ON(!root);
  
  	BUG_ON(root->number_of_cgroups != 1);
bd89aabc6   Paul Menage   Control groups: R...
1503
1504
  	BUG_ON(!list_empty(&cgrp->children));
  	BUG_ON(!list_empty(&cgrp->sibling));
ddbcc7e8e   Paul Menage   Task Control Grou...
1505
1506
1507
1508
1509
1510
1511
  
  	mutex_lock(&cgroup_mutex);
  
  	/* Rebind all subsystems back to the default hierarchy */
  	ret = rebind_subsystems(root, 0);
  	/* Shouldn't be able to fail ... */
  	BUG_ON(ret);
817929ec2   Paul Menage   Task Control Grou...
1512
1513
1514
1515
1516
  	/*
  	 * Release all the links from css_sets to this hierarchy's
  	 * root cgroup
  	 */
  	write_lock(&css_set_lock);
71cbb949d   KOSAKI Motohiro   cgroup: list_for_...
1517
1518
1519
  
  	list_for_each_entry_safe(link, saved_link, &cgrp->css_sets,
  				 cgrp_link_list) {
817929ec2   Paul Menage   Task Control Grou...
1520
  		list_del(&link->cg_link_list);
bd89aabc6   Paul Menage   Control groups: R...
1521
  		list_del(&link->cgrp_link_list);
817929ec2   Paul Menage   Task Control Grou...
1522
1523
1524
  		kfree(link);
  	}
  	write_unlock(&css_set_lock);
839ec5452   Paul Menage   cgroup: fix root_...
1525
1526
1527
1528
  	if (!list_empty(&root->root_list)) {
  		list_del(&root->root_list);
  		root_count--;
  	}
e5f6a8609   Li Zefan   cgroups: make roo...
1529

ddbcc7e8e   Paul Menage   Task Control Grou...
1530
  	mutex_unlock(&cgroup_mutex);
ddbcc7e8e   Paul Menage   Task Control Grou...
1531
  	kill_litter_super(sb);
2c6ab6d20   Paul Menage   cgroups: allow cg...
1532
  	cgroup_drop_root(root);
ddbcc7e8e   Paul Menage   Task Control Grou...
1533
1534
1535
1536
1537
1538
1539
  }
  
  /*
   * Filesystem type descriptor for "cgroup": superblocks are obtained
   * through cgroup_get_sb() and torn down through cgroup_kill_sb().
   */
  static struct file_system_type cgroup_fs_type = {
  	.name		= "cgroup",
  	.get_sb		= cgroup_get_sb,
  	.kill_sb	= cgroup_kill_sb,
  };
bd89aabc6   Paul Menage   Control groups: R...
1540
  static inline struct cgroup *__d_cgrp(struct dentry *dentry)
ddbcc7e8e   Paul Menage   Task Control Grou...
1541
1542
1543
1544
1545
1546
1547
1548
  {
  	return dentry->d_fsdata;
  }
  
  /* Interpret the private data of @dentry as a cftype pointer. */
  static inline struct cftype *__d_cft(struct dentry *dentry)
  {
  	struct cftype *cft = dentry->d_fsdata;

  	return cft;
  }
a043e3b2c   Li Zefan   cgroup: fix comments
1549
1550
1551
1552
1553
1554
  /**
   * cgroup_path - generate the path of a cgroup
   * @cgrp: the cgroup in question
   * @buf: the buffer to write the path into
   * @buflen: the length of the buffer
   *
a47295e6b   Paul Menage   cgroups: make cgr...
1555
1556
1557
   * Called with cgroup_mutex held or else with an RCU-protected cgroup
   * reference.  Writes path of cgroup into buf.  Returns 0 on success,
   * -errno on error.
ddbcc7e8e   Paul Menage   Task Control Grou...
1558
   */
bd89aabc6   Paul Menage   Control groups: R...
1559
  int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
ddbcc7e8e   Paul Menage   Task Control Grou...
1560
1561
  {
  	char *start;
a47295e6b   Paul Menage   cgroups: make cgr...
1562
  	struct dentry *dentry = rcu_dereference(cgrp->dentry);
ddbcc7e8e   Paul Menage   Task Control Grou...
1563

a47295e6b   Paul Menage   cgroups: make cgr...
1564
  	if (!dentry || cgrp == dummytop) {
ddbcc7e8e   Paul Menage   Task Control Grou...
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
  		/*
  		 * Inactive subsystems have no dentry for their root
  		 * cgroup
  		 */
  		strcpy(buf, "/");
  		return 0;
  	}
  
  	start = buf + buflen;
  
  	*--start = '\0';
  	for (;;) {
a47295e6b   Paul Menage   cgroups: make cgr...
1577
  		int len = dentry->d_name.len;
ddbcc7e8e   Paul Menage   Task Control Grou...
1578
1579
  		if ((start -= len) < buf)
  			return -ENAMETOOLONG;
bd89aabc6   Paul Menage   Control groups: R...
1580
1581
1582
  		memcpy(start, cgrp->dentry->d_name.name, len);
  		cgrp = cgrp->parent;
  		if (!cgrp)
ddbcc7e8e   Paul Menage   Task Control Grou...
1583
  			break;
a47295e6b   Paul Menage   cgroups: make cgr...
1584
  		dentry = rcu_dereference(cgrp->dentry);
bd89aabc6   Paul Menage   Control groups: R...
1585
  		if (!cgrp->parent)
ddbcc7e8e   Paul Menage   Task Control Grou...
1586
1587
1588
1589
1590
1591
1592
1593
  			continue;
  		if (--start < buf)
  			return -ENAMETOOLONG;
  		*start = '/';
  	}
  	memmove(buf, start, buf + buflen - start);
  	return 0;
  }
67523c48a   Ben Blum   cgroups: blkio su...
1594
  EXPORT_SYMBOL_GPL(cgroup_path);
ddbcc7e8e   Paul Menage   Task Control Grou...
1595

a043e3b2c   Li Zefan   cgroup: fix comments
1596
1597
1598
1599
  /**
   * cgroup_attach_task - attach task 'tsk' to cgroup 'cgrp'
   * @cgrp: the cgroup the task is attaching to
   * @tsk: the task to be attached
bbcb81d09   Paul Menage   Task Control Grou...
1600
   *
a043e3b2c   Li Zefan   cgroup: fix comments
1601
1602
   * Call holding cgroup_mutex. May take task_lock of
   * the task 'tsk' during call.
bbcb81d09   Paul Menage   Task Control Grou...
1603
   */
956db3ca0   Cliff Wickman   hotplug cpu: move...
1604
  int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
bbcb81d09   Paul Menage   Task Control Grou...
1605
1606
  {
  	int retval = 0;
2468c7234   Daisuke Nishimura   cgroup: introduce...
1607
  	struct cgroup_subsys *ss, *failed_ss = NULL;
bd89aabc6   Paul Menage   Control groups: R...
1608
  	struct cgroup *oldcgrp;
77efecd9e   Lai Jiangshan   cgroups: call fin...
1609
  	struct css_set *cg;
817929ec2   Paul Menage   Task Control Grou...
1610
  	struct css_set *newcg;
bd89aabc6   Paul Menage   Control groups: R...
1611
  	struct cgroupfs_root *root = cgrp->root;
bbcb81d09   Paul Menage   Task Control Grou...
1612
1613
  
  	/* Nothing to do if the task is already in that cgroup */
7717f7ba9   Paul Menage   cgroups: add a ba...
1614
  	oldcgrp = task_cgroup_from_root(tsk, root);
bd89aabc6   Paul Menage   Control groups: R...
1615
  	if (cgrp == oldcgrp)
bbcb81d09   Paul Menage   Task Control Grou...
1616
1617
1618
1619
  		return 0;
  
  	for_each_subsys(root, ss) {
  		if (ss->can_attach) {
be367d099   Ben Blum   cgroups: let ss->...
1620
  			retval = ss->can_attach(ss, cgrp, tsk, false);
2468c7234   Daisuke Nishimura   cgroup: introduce...
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
  			if (retval) {
  				/*
  				 * Remember on which subsystem the can_attach()
  				 * failed, so that we only call cancel_attach()
  				 * against the subsystems whose can_attach()
  				 * succeeded. (See below)
  				 */
  				failed_ss = ss;
  				goto out;
  			}
bbcb81d09   Paul Menage   Task Control Grou...
1631
1632
  		}
  	}
77efecd9e   Lai Jiangshan   cgroups: call fin...
1633
1634
1635
1636
  	task_lock(tsk);
  	cg = tsk->cgroups;
  	get_css_set(cg);
  	task_unlock(tsk);
817929ec2   Paul Menage   Task Control Grou...
1637
1638
1639
1640
  	/*
  	 * Locate or allocate a new css_set for this task,
  	 * based on its final set of cgroups
  	 */
bd89aabc6   Paul Menage   Control groups: R...
1641
  	newcg = find_css_set(cg, cgrp);
77efecd9e   Lai Jiangshan   cgroups: call fin...
1642
  	put_css_set(cg);
2468c7234   Daisuke Nishimura   cgroup: introduce...
1643
1644
1645
1646
  	if (!newcg) {
  		retval = -ENOMEM;
  		goto out;
  	}
817929ec2   Paul Menage   Task Control Grou...
1647

bbcb81d09   Paul Menage   Task Control Grou...
1648
1649
1650
  	task_lock(tsk);
  	if (tsk->flags & PF_EXITING) {
  		task_unlock(tsk);
817929ec2   Paul Menage   Task Control Grou...
1651
  		put_css_set(newcg);
2468c7234   Daisuke Nishimura   cgroup: introduce...
1652
1653
  		retval = -ESRCH;
  		goto out;
bbcb81d09   Paul Menage   Task Control Grou...
1654
  	}
817929ec2   Paul Menage   Task Control Grou...
1655
  	rcu_assign_pointer(tsk->cgroups, newcg);
bbcb81d09   Paul Menage   Task Control Grou...
1656
  	task_unlock(tsk);
817929ec2   Paul Menage   Task Control Grou...
1657
1658
1659
1660
1661
1662
1663
  	/* Update the css_set linked lists if we're using them */
  	write_lock(&css_set_lock);
  	if (!list_empty(&tsk->cg_list)) {
  		list_del(&tsk->cg_list);
  		list_add(&tsk->cg_list, &newcg->tasks);
  	}
  	write_unlock(&css_set_lock);
bbcb81d09   Paul Menage   Task Control Grou...
1664
  	for_each_subsys(root, ss) {
e18f6318e   Paul Jackson   cgroup brace codi...
1665
  		if (ss->attach)
be367d099   Ben Blum   cgroups: let ss->...
1666
  			ss->attach(ss, cgrp, oldcgrp, tsk, false);
bbcb81d09   Paul Menage   Task Control Grou...
1667
  	}
bd89aabc6   Paul Menage   Control groups: R...
1668
  	set_bit(CGRP_RELEASABLE, &oldcgrp->flags);
bbcb81d09   Paul Menage   Task Control Grou...
1669
  	synchronize_rcu();
817929ec2   Paul Menage   Task Control Grou...
1670
  	put_css_set(cg);
ec64f5154   KAMEZAWA Hiroyuki   cgroup: fix frequ...
1671
1672
1673
1674
1675
  
  	/*
  	 * wake up rmdir() waiter. the rmdir should fail since the cgroup
  	 * is no longer empty.
  	 */
887032670   KAMEZAWA Hiroyuki   cgroup avoid perm...
1676
  	cgroup_wakeup_rmdir_waiter(cgrp);
2468c7234   Daisuke Nishimura   cgroup: introduce...
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
  out:
  	if (retval) {
  		for_each_subsys(root, ss) {
  			if (ss == failed_ss)
  				/*
  				 * This subsystem was the one that failed the
  				 * can_attach() check earlier, so we don't need
  				 * to call cancel_attach() against it or any
  				 * remaining subsystems.
  				 */
  				break;
  			if (ss->cancel_attach)
  				ss->cancel_attach(ss, cgrp, tsk, false);
  		}
  	}
  	return retval;
bbcb81d09   Paul Menage   Task Control Grou...
1693
1694
1695
  }
  
  /*
af351026a   Paul Menage   cgroup files: tur...
1696
1697
   * Attach task with pid 'pid' to cgroup 'cgrp'. Call with cgroup_mutex
   * held. May take task_lock of task
bbcb81d09   Paul Menage   Task Control Grou...
1698
   */
af351026a   Paul Menage   cgroup files: tur...
1699
  static int attach_task_by_pid(struct cgroup *cgrp, u64 pid)
bbcb81d09   Paul Menage   Task Control Grou...
1700
  {
bbcb81d09   Paul Menage   Task Control Grou...
1701
  	struct task_struct *tsk;
c69e8d9c0   David Howells   CRED: Use RCU to ...
1702
  	const struct cred *cred = current_cred(), *tcred;
bbcb81d09   Paul Menage   Task Control Grou...
1703
  	int ret;
bbcb81d09   Paul Menage   Task Control Grou...
1704
1705
  	if (pid) {
  		rcu_read_lock();
73507f335   Pavel Emelyanov   Handle pid namesp...
1706
  		tsk = find_task_by_vpid(pid);
bbcb81d09   Paul Menage   Task Control Grou...
1707
1708
1709
1710
  		if (!tsk || tsk->flags & PF_EXITING) {
  			rcu_read_unlock();
  			return -ESRCH;
  		}
bbcb81d09   Paul Menage   Task Control Grou...
1711

c69e8d9c0   David Howells   CRED: Use RCU to ...
1712
1713
1714
1715
1716
  		tcred = __task_cred(tsk);
  		if (cred->euid &&
  		    cred->euid != tcred->uid &&
  		    cred->euid != tcred->suid) {
  			rcu_read_unlock();
bbcb81d09   Paul Menage   Task Control Grou...
1717
1718
  			return -EACCES;
  		}
c69e8d9c0   David Howells   CRED: Use RCU to ...
1719
1720
  		get_task_struct(tsk);
  		rcu_read_unlock();
bbcb81d09   Paul Menage   Task Control Grou...
1721
1722
1723
1724
  	} else {
  		tsk = current;
  		get_task_struct(tsk);
  	}
956db3ca0   Cliff Wickman   hotplug cpu: move...
1725
  	ret = cgroup_attach_task(cgrp, tsk);
bbcb81d09   Paul Menage   Task Control Grou...
1726
1727
1728
  	put_task_struct(tsk);
  	return ret;
  }
af351026a   Paul Menage   cgroup files: tur...
1729
1730
1731
1732
1733
1734
1735
1736
1737
  /*
   * Write handler taking a pid: attach that task to @cgrp while holding
   * cgroup_mutex.  Returns -ENODEV if the cgroup has been removed.
   */
  static int cgroup_tasks_write(struct cgroup *cgrp, struct cftype *cft, u64 pid)
  {
  	int err = -ENODEV;

  	if (cgroup_lock_live_group(cgrp)) {
  		err = attach_task_by_pid(cgrp, pid);
  		cgroup_unlock();
  	}
  	return err;
  }
e788e066c   Paul Menage   cgroup files: mov...
1738
1739
1740
1741
  /**
   * cgroup_lock_live_group - take cgroup_mutex and check that cgrp is alive.
   * @cgrp: the cgroup to be checked for liveness
   *
84eea8428   Paul Menage   cgroups: misc cle...
1742
1743
   * On success, returns true; the lock should be later released with
   * cgroup_unlock(). On failure returns false with no lock held.
e788e066c   Paul Menage   cgroup files: mov...
1744
   */
84eea8428   Paul Menage   cgroups: misc cle...
1745
  bool cgroup_lock_live_group(struct cgroup *cgrp)
e788e066c   Paul Menage   cgroup files: mov...
1746
1747
1748
1749
1750
1751
1752
1753
  {
  	mutex_lock(&cgroup_mutex);
  	if (cgroup_is_removed(cgrp)) {
  		mutex_unlock(&cgroup_mutex);
  		return false;
  	}
  	return true;
  }
67523c48a   Ben Blum   cgroups: blkio su...
1754
  EXPORT_SYMBOL_GPL(cgroup_lock_live_group);
e788e066c   Paul Menage   cgroup files: mov...
1755
1756
1757
1758
1759
1760
1761
1762
  
  /*
   * Store @buffer as the release agent path of the hierarchy that @cgrp
   * belongs to.  Returns 0 on success, -ENODEV if the cgroup has been
   * removed.
   */
  static int cgroup_release_agent_write(struct cgroup *cgrp, struct cftype *cft,
  				      const char *buffer)
  {
  	/* the destination field must hold at least PATH_MAX bytes */
  	BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX);

  	if (!cgroup_lock_live_group(cgrp))
  		return -ENODEV;

  	strcpy(cgrp->root->release_agent_path, buffer);

  	cgroup_unlock();
  	return 0;
  }
  
  /*
   * Emit the release agent path of @cgrp's hierarchy into @seq, followed
   * by a newline.  Returns 0 on success, -ENODEV if the cgroup has been
   * removed.
   */
  static int cgroup_release_agent_show(struct cgroup *cgrp, struct cftype *cft,
  				     struct seq_file *seq)
  {
  	if (!cgroup_lock_live_group(cgrp))
  		return -ENODEV;
  	/* read the path under cgroup_mutex, which the write side also takes */
  	seq_puts(seq, cgrp->root->release_agent_path);
  	seq_putc(seq, '\n');
  	cgroup_unlock();
  	return 0;
  }
84eea8428   Paul Menage   cgroups: misc cle...
1778
1779
  /*
   * A buffer size big enough for numbers or short strings.  Used to size
   * the on-stack scratch buffers of the cgroup file read/write helpers.
   */
  #define CGROUP_LOCAL_BUFFER_SIZE 64
e73d2c61d   Paul Menage   CGroups _s64 file...
1780
  /*
   * Generic write handler for files backed by write_u64()/write_s64().
   *
   * Copies at most CGROUP_LOCAL_BUFFER_SIZE - 1 bytes from userspace,
   * strips surrounding whitespace, parses the text as an integer (base
   * auto-detected) and passes it to the cftype callback.  Returns
   * @nbytes on success, -EINVAL for an empty write or trailing garbage,
   * -E2BIG for oversized input, -EFAULT on a failed copy, or the
   * callback's error.
   */
  static ssize_t cgroup_write_X64(struct cgroup *cgrp, struct cftype *cft,
  				struct file *file,
  				const char __user *userbuf,
  				size_t nbytes, loff_t *unused_ppos)
  {
  	char buffer[CGROUP_LOCAL_BUFFER_SIZE];
  	char *text;
  	char *end;
  	int retval = 0;

  	if (nbytes == 0)
  		return -EINVAL;
  	/* leave room for the terminating NUL */
  	if (nbytes >= sizeof(buffer))
  		return -E2BIG;
  	if (copy_from_user(buffer, userbuf, nbytes))
  		return -EFAULT;

  	buffer[nbytes] = 0;     /* nul-terminate */
  	text = strstrip(buffer);

  	if (cft->write_u64) {
  		u64 val = simple_strtoull(text, &end, 0);

  		if (*end)
  			return -EINVAL;
  		retval = cft->write_u64(cgrp, cft, val);
  	} else {
  		s64 val = simple_strtoll(text, &end, 0);

  		if (*end)
  			return -EINVAL;
  		retval = cft->write_s64(cgrp, cft, val);
  	}

  	/* a callback result of 0 means the whole write was consumed */
  	if (!retval)
  		retval = nbytes;
  	return retval;
  }
db3b14978   Paul Menage   cgroup files: add...
1812
1813
1814
1815
1816
  /*
   * Generic write handler for files backed by write_string().
   *
   * The input limit is cft->max_write_len, or CGROUP_LOCAL_BUFFER_SIZE - 1
   * when that is zero.  Input that fits is staged in an on-stack buffer,
   * larger input in a kmalloc()ed one.  The NUL-terminated, whitespace-
   * stripped string is handed to the callback.  Returns @nbytes on
   * success, -E2BIG, -ENOMEM, -EFAULT, or the callback's error.
   */
  static ssize_t cgroup_write_string(struct cgroup *cgrp, struct cftype *cft,
  				   struct file *file,
  				   const char __user *userbuf,
  				   size_t nbytes, loff_t *unused_ppos)
  {
  	char local_buffer[CGROUP_LOCAL_BUFFER_SIZE];
  	char *buffer = local_buffer;
  	size_t max_bytes = cft->max_write_len;
  	int retval = 0;

  	if (max_bytes == 0)
  		max_bytes = sizeof(local_buffer) - 1;
  	if (nbytes >= max_bytes)
  		return -E2BIG;

  	/* Fall back to a dynamic buffer when the stack one is too small */
  	if (nbytes >= sizeof(local_buffer)) {
  		buffer = kmalloc(nbytes + 1, GFP_KERNEL);
  		if (!buffer)
  			return -ENOMEM;
  	}

  	if (nbytes && copy_from_user(buffer, userbuf, nbytes)) {
  		retval = -EFAULT;
  		goto out;
  	}

  	buffer[nbytes] = 0;     /* nul-terminate */
  	retval = cft->write_string(cgrp, cft, strstrip(buffer));
  	if (!retval)
  		retval = nbytes;
  out:
  	/* only the dynamically allocated buffer needs freeing */
  	if (buffer != local_buffer)
  		kfree(buffer);
  	return retval;
  }
ddbcc7e8e   Paul Menage   Task Control Grou...
1846
1847
1848
1849
  /*
   * write() entry point for cgroup control files.
   *
   * Dispatches to the first handler the file's cftype provides, in this
   * order: write, write_u64/write_s64, write_string, trigger.  Returns
   * -ENODEV if the owning cgroup has been removed and -EINVAL if the
   * cftype has no write handler at all.
   */
  static ssize_t cgroup_file_write(struct file *file, const char __user *buf,
  						size_t nbytes, loff_t *ppos)
  {
  	struct dentry *dentry = file->f_dentry;
  	struct cftype *cft = __d_cft(dentry);
  	struct cgroup *cgrp = __d_cgrp(dentry->d_parent);

  	if (cgroup_is_removed(cgrp))
  		return -ENODEV;

  	if (cft->write)
  		return cft->write(cgrp, cft, file, buf, nbytes, ppos);

  	if (cft->write_u64 || cft->write_s64)
  		return cgroup_write_X64(cgrp, cft, file, buf, nbytes, ppos);

  	if (cft->write_string)
  		return cgroup_write_string(cgrp, cft, file, buf, nbytes, ppos);

  	if (cft->trigger) {
  		/* the written data is ignored; any write fires the trigger */
  		int ret = cft->trigger(cgrp, (unsigned int)cft->private);
  		return ret ? ret : nbytes;
  	}

  	return -EINVAL;
  }
f4c753b7e   Paul Menage   CGroup API files:...
1866
1867
1868
1869
  /*
   * Generic read handler for files backed by read_u64(): format the value
   * as an unsigned decimal followed by a newline and copy the requested
   * window of that text to userspace.
   */
  static ssize_t cgroup_read_u64(struct cgroup *cgrp, struct cftype *cft,
  			       struct file *file,
  			       char __user *buf, size_t nbytes,
  			       loff_t *ppos)
  {
  	char tmp[CGROUP_LOCAL_BUFFER_SIZE];
  	u64 val = cft->read_u64(cgrp, cft);
  	int len = sprintf(tmp, "%llu\n", (unsigned long long) val);

  	return simple_read_from_buffer(buf, nbytes, ppos, tmp, len);
  }
e73d2c61d   Paul Menage   CGroups _s64 file...
1878
1879
1880
1881
1882
  /*
   * Generic read handler for files backed by read_s64(): format the value
   * as a signed decimal followed by a newline and copy the requested
   * window of that text to userspace.
   */
  static ssize_t cgroup_read_s64(struct cgroup *cgrp, struct cftype *cft,
  			       struct file *file,
  			       char __user *buf, size_t nbytes,
  			       loff_t *ppos)
  {
  	char tmp[CGROUP_LOCAL_BUFFER_SIZE];
  	s64 val = cft->read_s64(cgrp, cft);
  	int len = sprintf(tmp, "%lld\n", (long long) val);

  	return simple_read_from_buffer(buf, nbytes, ppos, tmp, len);
  }
ddbcc7e8e   Paul Menage   Task Control Grou...
1890
1891
1892
1893
  /*
   * read() entry point for cgroup control files.
   *
   * Dispatches to the first handler the file's cftype provides, in this
   * order: read, read_u64, read_s64.  Returns -ENODEV if the owning
   * cgroup has been removed and -EINVAL if none of those handlers is set.
   */
  static ssize_t cgroup_file_read(struct file *file, char __user *buf,
  				   size_t nbytes, loff_t *ppos)
  {
  	struct dentry *dentry = file->f_dentry;
  	struct cftype *cft = __d_cft(dentry);
  	struct cgroup *cgrp = __d_cgrp(dentry->d_parent);

  	if (cgroup_is_removed(cgrp))
  		return -ENODEV;

  	if (cft->read)
  		return cft->read(cgrp, cft, file, buf, nbytes, ppos);

  	if (cft->read_u64)
  		return cgroup_read_u64(cgrp, cft, file, buf, nbytes, ppos);

  	if (cft->read_s64)
  		return cgroup_read_s64(cgrp, cft, file, buf, nbytes, ppos);

  	return -EINVAL;
  }
917965696   Paul Menage   CGroup API files:...
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
  /*
   * seqfile ops/methods for returning structured data.  Two kinds of
   * cftype callback are served through this path: read_map (string->u64
   * maps) and read_seq_string.  Can be extended in future.
   *
   * One cgroup_seqfile_state is allocated per open file and stored as
   * the seq_file's private data.
   */
  
  struct cgroup_seqfile_state {
  	/* the control file type whose callback produces the output */
  	struct cftype *cft;
  	/* the cgroup the opened file belongs to */
  	struct cgroup *cgroup;
  };
  
  /*
   * cgroup_map_cb fill callback: append one "key value" line to the
   * seq_file stashed in cb->state.  Returns the seq_printf() result.
   */
  static int cgroup_map_add(struct cgroup_map_cb *cb, const char *key, u64 value)
  {
  	struct seq_file *sf = cb->state;
  	return seq_printf(sf, "%s %llu\n", key, (unsigned long long)value);
  }
  
  static int cgroup_seqfile_show(struct seq_file *m, void *arg)
  {
  	struct cgroup_seqfile_state *state = m->private;
  	struct cftype *cft = state->cft;
29486df32   Serge E. Hallyn   cgroups: introduc...
1928
1929
1930
1931
1932
1933
1934
1935
  	if (cft->read_map) {
  		struct cgroup_map_cb cb = {
  			.fill = cgroup_map_add,
  			.state = m,
  		};
  		return cft->read_map(state->cgroup, cft, &cb);
  	}
  	return cft->read_seq_string(state->cgroup, cft, m);
917965696   Paul Menage   CGroup API files:...
1936
  }
96930a636   Adrian Bunk   make cgroup_seqfi...
1937
  static int cgroup_seqfile_release(struct inode *inode, struct file *file)
917965696   Paul Menage   CGroup API files:...
1938
1939
1940
1941
1942
  {
  	struct seq_file *seq = file->private_data;
  	kfree(seq->private);
  	return single_release(inode, file);
  }
828c09509   Alexey Dobriyan   const: constify r...
1943
  static const struct file_operations cgroup_seqfile_operations = {
917965696   Paul Menage   CGroup API files:...
1944
  	.read = seq_read,
e788e066c   Paul Menage   cgroup files: mov...
1945
  	.write = cgroup_file_write,
917965696   Paul Menage   CGroup API files:...
1946
1947
1948
  	.llseek = seq_lseek,
  	.release = cgroup_seqfile_release,
  };
ddbcc7e8e   Paul Menage   Task Control Grou...
1949
1950
1951
1952
1953
1954
1955
1956
  static int cgroup_file_open(struct inode *inode, struct file *file)
  {
  	int err;
  	struct cftype *cft;
  
  	err = generic_file_open(inode, file);
  	if (err)
  		return err;
ddbcc7e8e   Paul Menage   Task Control Grou...
1957
  	cft = __d_cft(file->f_dentry);
75139b827   Li Zefan   cgroups: remove s...
1958

29486df32   Serge E. Hallyn   cgroups: introduc...
1959
  	if (cft->read_map || cft->read_seq_string) {
917965696   Paul Menage   CGroup API files:...
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
  		struct cgroup_seqfile_state *state =
  			kzalloc(sizeof(*state), GFP_USER);
  		if (!state)
  			return -ENOMEM;
  		state->cft = cft;
  		state->cgroup = __d_cgrp(file->f_dentry->d_parent);
  		file->f_op = &cgroup_seqfile_operations;
  		err = single_open(file, cgroup_seqfile_show, state);
  		if (err < 0)
  			kfree(state);
  	} else if (cft->open)
ddbcc7e8e   Paul Menage   Task Control Grou...
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
  		err = cft->open(inode, file);
  	else
  		err = 0;
  
  	return err;
  }
  
  /*
   * ->release() for plain control files: delegate to the cftype's own
   * release handler when there is one, otherwise report success.
   */
  static int cgroup_file_release(struct inode *inode, struct file *file)
  {
  	struct cftype *cft = __d_cft(file->f_dentry);
  
  	return cft->release ? cft->release(inode, file) : 0;
  }
  
  /*
   * cgroup_rename - rename a cgroup directory within its current parent.
   *
   * Rejected with -ENOTDIR when the source is not a directory, -EEXIST when
   * the target name already has an inode, and -EIO when the rename would
   * move the directory to a different parent.  Everything else is handed to
   * simple_rename().
   */
  static int cgroup_rename(struct inode *old_dir, struct dentry *old_dentry,
  			    struct inode *new_dir, struct dentry *new_dentry)
  {
  	int err = 0;
  
  	if (!S_ISDIR(old_dentry->d_inode->i_mode))
  		err = -ENOTDIR;
  	else if (new_dentry->d_inode)
  		err = -EEXIST;
  	else if (old_dir != new_dir)
  		err = -EIO;
  
  	if (err)
  		return err;
  
  	return simple_rename(old_dir, old_dentry, new_dir, new_dentry);
  }
828c09509   Alexey Dobriyan   const: constify r...
2000
  static const struct file_operations cgroup_file_operations = {
ddbcc7e8e   Paul Menage   Task Control Grou...
2001
2002
2003
2004
2005
2006
  	.read = cgroup_file_read,
  	.write = cgroup_file_write,
  	.llseek = generic_file_llseek,
  	.open = cgroup_file_open,
  	.release = cgroup_file_release,
  };
6e1d5dcc2   Alexey Dobriyan   const: mark remai...
2007
  static const struct inode_operations cgroup_dir_inode_operations = {
ddbcc7e8e   Paul Menage   Task Control Grou...
2008
2009
2010
2011
2012
  	.lookup = simple_lookup,
  	.mkdir = cgroup_mkdir,
  	.rmdir = cgroup_rmdir,
  	.rename = cgroup_rename,
  };
0dea11687   Kirill A. Shutemov   cgroup: implement...
2013
2014
2015
2016
2017
2018
2019
2020
2021
  /*
   * Check if a file is a control file
   */
  static inline struct cftype *__file_cft(struct file *file)
  {
  	if (file->f_dentry->d_inode->i_fop != &cgroup_file_operations)
  		return ERR_PTR(-EINVAL);
  	return __d_cft(file->f_dentry);
  }
099fca322   Li Zefan   cgroups: show cor...
2022
  /*
   * cgroup_create_file - attach a new inode to @dentry
   * @dentry: negative dentry to instantiate
   * @mode: file type and permission bits for the new inode
   * @sb: superblock the inode is allocated on
   *
   * Regular files get cgroup_file_operations; directories get the cgroup
   * directory inode operations and are returned with their i_mutex held.
   * An extra dentry reference is taken to pin the dentry in core.
   *
   * Returns 0 on success, -ENOENT for a NULL dentry, -EEXIST if the dentry
   * already has an inode, or -ENOMEM if no inode could be allocated.
   */
  static int cgroup_create_file(struct dentry *dentry, mode_t mode,
  				struct super_block *sb)
  {
  	static const struct dentry_operations cgroup_dops = {
  		.d_iput = cgroup_diput,
  	};
  	struct inode *inode;
  
  	if (!dentry)
  		return -ENOENT;
  	if (dentry->d_inode)
  		return -EEXIST;
  
  	inode = cgroup_new_inode(mode, sb);
  	if (!inode)
  		return -ENOMEM;
  
  	if (S_ISREG(mode)) {
  		inode->i_fop = &cgroup_file_operations;
  		inode->i_size = 0;
  	} else if (S_ISDIR(mode)) {
  		inode->i_fop = &simple_dir_operations;
  		inode->i_op = &cgroup_dir_inode_operations;
  
  		/* start off with i_nlink == 2 (for "." entry) */
  		inc_nlink(inode);
  
  		/*
  		 * Hand the directory back with its inode mutex held so the
  		 * caller can populate it without racing with another mkdir.
  		 */
  		mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD);
  	}
  
  	dentry->d_op = &cgroup_dops;
  	d_instantiate(dentry, inode);
  	dget(dentry);	/* Extra count - pin the dentry in core */
  
  	return 0;
  }
  
  /*
a043e3b2c   Li Zefan   cgroup: fix comments
2061
2062
2063
2064
2065
   * cgroup_create_dir - create a directory for an object.
   * @cgrp: the cgroup we create the directory for. It must have a valid
   *        ->parent field. And we are going to fill its ->dentry field.
   * @dentry: dentry of the new cgroup
   * @mode: mode to set on new directory.
ddbcc7e8e   Paul Menage   Task Control Grou...
2066
   */
bd89aabc6   Paul Menage   Control groups: R...
2067
  static int cgroup_create_dir(struct cgroup *cgrp, struct dentry *dentry,
099fca322   Li Zefan   cgroups: show cor...
2068
  				mode_t mode)
ddbcc7e8e   Paul Menage   Task Control Grou...
2069
2070
2071
  {
  	struct dentry *parent;
  	int error = 0;
bd89aabc6   Paul Menage   Control groups: R...
2072
2073
  	parent = cgrp->parent->dentry;
  	error = cgroup_create_file(dentry, S_IFDIR | mode, cgrp->root->sb);
ddbcc7e8e   Paul Menage   Task Control Grou...
2074
  	if (!error) {
bd89aabc6   Paul Menage   Control groups: R...
2075
  		dentry->d_fsdata = cgrp;
ddbcc7e8e   Paul Menage   Task Control Grou...
2076
  		inc_nlink(parent->d_inode);
a47295e6b   Paul Menage   cgroups: make cgr...
2077
  		rcu_assign_pointer(cgrp->dentry, dentry);
ddbcc7e8e   Paul Menage   Task Control Grou...
2078
2079
2080
2081
2082
2083
  		dget(dentry);
  	}
  	dput(dentry);
  
  	return error;
  }
099fca322   Li Zefan   cgroups: show cor...
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
  /**
   * cgroup_file_mode - deduce file mode of a control file
   * @cft: the control file in question
   *
   * An explicit, non-zero cft->mode always wins.  Otherwise the mode is
   * built from the handlers present: S_IRUGO if any read handler is set,
   * S_IWUSR if any write handler (or trigger) is set, both if both kinds
   * exist, and 0 if there are none.
   */
  static mode_t cgroup_file_mode(const struct cftype *cft)
  {
  	int readable, writable;
  	mode_t mode = 0;
  
  	if (cft->mode)
  		return cft->mode;
  
  	readable = cft->read || cft->read_u64 || cft->read_s64 ||
  		   cft->read_map || cft->read_seq_string;
  	writable = cft->write || cft->write_u64 || cft->write_s64 ||
  		   cft->write_string || cft->trigger;
  
  	if (readable)
  		mode |= S_IRUGO;
  	if (writable)
  		mode |= S_IWUSR;
  
  	return mode;
  }
bd89aabc6   Paul Menage   Control groups: R...
2110
  int cgroup_add_file(struct cgroup *cgrp,
ddbcc7e8e   Paul Menage   Task Control Grou...
2111
2112
2113
  		       struct cgroup_subsys *subsys,
  		       const struct cftype *cft)
  {
bd89aabc6   Paul Menage   Control groups: R...
2114
  	struct dentry *dir = cgrp->dentry;
ddbcc7e8e   Paul Menage   Task Control Grou...
2115
2116
  	struct dentry *dentry;
  	int error;
099fca322   Li Zefan   cgroups: show cor...
2117
  	mode_t mode;
ddbcc7e8e   Paul Menage   Task Control Grou...
2118
2119
  
  	char name[MAX_CGROUP_TYPE_NAMELEN + MAX_CFTYPE_NAME + 2] = { 0 };
bd89aabc6   Paul Menage   Control groups: R...
2120
  	if (subsys && !test_bit(ROOT_NOPREFIX, &cgrp->root->flags)) {
ddbcc7e8e   Paul Menage   Task Control Grou...
2121
2122
2123
2124
2125
2126
2127
  		strcpy(name, subsys->name);
  		strcat(name, ".");
  	}
  	strcat(name, cft->name);
  	BUG_ON(!mutex_is_locked(&dir->d_inode->i_mutex));
  	dentry = lookup_one_len(name, dir, strlen(name));
  	if (!IS_ERR(dentry)) {
099fca322   Li Zefan   cgroups: show cor...
2128
2129
  		mode = cgroup_file_mode(cft);
  		error = cgroup_create_file(dentry, mode | S_IFREG,
bd89aabc6   Paul Menage   Control groups: R...
2130
  						cgrp->root->sb);
ddbcc7e8e   Paul Menage   Task Control Grou...
2131
2132
2133
2134
2135
2136
2137
  		if (!error)
  			dentry->d_fsdata = (void *)cft;
  		dput(dentry);
  	} else
  		error = PTR_ERR(dentry);
  	return error;
  }
e6a1105ba   Ben Blum   cgroups: subsyste...
2138
  EXPORT_SYMBOL_GPL(cgroup_add_file);
ddbcc7e8e   Paul Menage   Task Control Grou...
2139

bd89aabc6   Paul Menage   Control groups: R...
2140
  /*
   * cgroup_add_files - create a batch of control files in a cgroup directory
   * @cgrp: cgroup whose directory receives the files
   * @subsys: owning subsystem, passed through to cgroup_add_file()
   * @cft: array of control file descriptions
   * @count: number of entries in @cft
   *
   * Stops at the first failure and returns its error code; files created
   * before that point are left in place.  Returns 0 if all were created.
   */
  int cgroup_add_files(struct cgroup *cgrp,
  			struct cgroup_subsys *subsys,
  			const struct cftype cft[],
  			int count)
  {
  	int idx = 0;
  	int err = 0;
  
  	while (idx < count && !err) {
  		err = cgroup_add_file(cgrp, subsys, &cft[idx]);
  		idx++;
  	}
  
  	return err;
  }
e6a1105ba   Ben Blum   cgroups: subsyste...
2153
  EXPORT_SYMBOL_GPL(cgroup_add_files);
ddbcc7e8e   Paul Menage   Task Control Grou...
2154

a043e3b2c   Li Zefan   cgroup: fix comments
2155
2156
2157
2158
2159
2160
  /**
   * cgroup_task_count - count the number of tasks in a cgroup.
   * @cgrp: the cgroup in question
   *
   * Return the number of tasks in the cgroup.
   */
bd89aabc6   Paul Menage   Control groups: R...
2161
  int cgroup_task_count(const struct cgroup *cgrp)
bbcb81d09   Paul Menage   Task Control Grou...
2162
2163
  {
  	int count = 0;
71cbb949d   KOSAKI Motohiro   cgroup: list_for_...
2164
  	struct cg_cgroup_link *link;
817929ec2   Paul Menage   Task Control Grou...
2165
2166
  
  	read_lock(&css_set_lock);
71cbb949d   KOSAKI Motohiro   cgroup: list_for_...
2167
  	list_for_each_entry(link, &cgrp->css_sets, cgrp_link_list) {
146aa1bd0   Lai Jiangshan   cgroups: fix prob...
2168
  		count += atomic_read(&link->cg->refcount);
817929ec2   Paul Menage   Task Control Grou...
2169
2170
  	}
  	read_unlock(&css_set_lock);
bbcb81d09   Paul Menage   Task Control Grou...
2171
2172
2173
2174
  	return count;
  }
  
  /*
817929ec2   Paul Menage   Task Control Grou...
2175
2176
2177
   * Advance a list_head iterator.  The iterator should be positioned at
   * the start of a css_set
   */
bd89aabc6   Paul Menage   Control groups: R...
2178
  static void cgroup_advance_iter(struct cgroup *cgrp,
7717f7ba9   Paul Menage   cgroups: add a ba...
2179
  				struct cgroup_iter *it)
817929ec2   Paul Menage   Task Control Grou...
2180
2181
2182
2183
2184
2185
2186
2187
  {
  	struct list_head *l = it->cg_link;
  	struct cg_cgroup_link *link;
  	struct css_set *cg;
  
  	/* Advance to the next non-empty css_set */
  	do {
  		l = l->next;
bd89aabc6   Paul Menage   Control groups: R...
2188
  		if (l == &cgrp->css_sets) {
817929ec2   Paul Menage   Task Control Grou...
2189
2190
2191
  			it->cg_link = NULL;
  			return;
  		}
bd89aabc6   Paul Menage   Control groups: R...
2192
  		link = list_entry(l, struct cg_cgroup_link, cgrp_link_list);
817929ec2   Paul Menage   Task Control Grou...
2193
2194
2195
2196
2197
  		cg = link->cg;
  	} while (list_empty(&cg->tasks));
  	it->cg_link = l;
  	it->task = cg->tasks.next;
  }
31a7df01f   Cliff Wickman   cgroups: mechanis...
2198
2199
2200
2201
2202
2203
2204
2205
2206
  /*
   * To reduce the fork() overhead for systems that are not actually
   * using their cgroups capability, we don't maintain the lists running
   * through each css_set to its tasks until we see the list actually
   * used - in other words after the first call to cgroup_iter_start().
   *
   * The tasklist_lock is not held here, as do_each_thread() and
   * while_each_thread() are protected by RCU.
   */
3df91fe30   Adrian Bunk   make cgroup_enabl...
2207
  static void cgroup_enable_task_cg_lists(void)
31a7df01f   Cliff Wickman   cgroups: mechanis...
2208
2209
2210
2211
2212
2213
  {
  	struct task_struct *p, *g;
  	write_lock(&css_set_lock);
  	use_task_css_set_links = 1;
  	do_each_thread(g, p) {
  		task_lock(p);
0e04388f0   Li Zefan   cgroup: fix a rac...
2214
2215
2216
2217
2218
2219
  		/*
  		 * We should check if the process is exiting, otherwise
  		 * it will race with cgroup_exit() in that the list
  		 * entry won't be deleted though the process has exited.
  		 */
  		if (!(p->flags & PF_EXITING) && list_empty(&p->cg_list))
31a7df01f   Cliff Wickman   cgroups: mechanis...
2220
2221
2222
2223
2224
  			list_add(&p->cg_list, &p->cgroups->tasks);
  		task_unlock(p);
  	} while_each_thread(g, p);
  	write_unlock(&css_set_lock);
  }
bd89aabc6   Paul Menage   Control groups: R...
2225
  void cgroup_iter_start(struct cgroup *cgrp, struct cgroup_iter *it)
817929ec2   Paul Menage   Task Control Grou...
2226
2227
2228
2229
2230
2231
  {
  	/*
  	 * The first time anyone tries to iterate across a cgroup,
  	 * we need to enable the list linking each css_set to its
  	 * tasks, and fix up all existing tasks.
  	 */
31a7df01f   Cliff Wickman   cgroups: mechanis...
2232
2233
  	if (!use_task_css_set_links)
  		cgroup_enable_task_cg_lists();
817929ec2   Paul Menage   Task Control Grou...
2234
  	read_lock(&css_set_lock);
bd89aabc6   Paul Menage   Control groups: R...
2235
2236
  	it->cg_link = &cgrp->css_sets;
  	cgroup_advance_iter(cgrp, it);
817929ec2   Paul Menage   Task Control Grou...
2237
  }
bd89aabc6   Paul Menage   Control groups: R...
2238
  struct task_struct *cgroup_iter_next(struct cgroup *cgrp,
817929ec2   Paul Menage   Task Control Grou...
2239
2240
2241
2242
  					struct cgroup_iter *it)
  {
  	struct task_struct *res;
  	struct list_head *l = it->task;
2019f634c   Lai Jiangshan   cgroups: fix cgro...
2243
  	struct cg_cgroup_link *link;
817929ec2   Paul Menage   Task Control Grou...
2244
2245
2246
2247
2248
2249
2250
  
  	/* If the iterator cg is NULL, we have no tasks */
  	if (!it->cg_link)
  		return NULL;
  	res = list_entry(l, struct task_struct, cg_list);
  	/* Advance iterator to find next entry */
  	l = l->next;
2019f634c   Lai Jiangshan   cgroups: fix cgro...
2251
2252
  	link = list_entry(it->cg_link, struct cg_cgroup_link, cgrp_link_list);
  	if (l == &link->cg->tasks) {
817929ec2   Paul Menage   Task Control Grou...
2253
2254
  		/* We reached the end of this task list - move on to
  		 * the next cg_cgroup_link */
bd89aabc6   Paul Menage   Control groups: R...
2255
  		cgroup_advance_iter(cgrp, it);
817929ec2   Paul Menage   Task Control Grou...
2256
2257
2258
2259
2260
  	} else {
  		it->task = l;
  	}
  	return res;
  }
bd89aabc6   Paul Menage   Control groups: R...
2261
  void cgroup_iter_end(struct cgroup *cgrp, struct cgroup_iter *it)
817929ec2   Paul Menage   Task Control Grou...
2262
2263
2264
  {
  	read_unlock(&css_set_lock);
  }
31a7df01f   Cliff Wickman   cgroups: mechanis...
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
  /*
   * Return 1 if @t1 started after @time, 0 if it started before.  When the
   * start times are equal the tie is broken by comparing the task pointers
   * @t1 and @t2.
   */
  static inline int started_after_time(struct task_struct *t1,
  				     struct timespec *time,
  				     struct task_struct *t2)
  {
  	int cmp = timespec_compare(&t1->start_time, time);
  
  	if (cmp != 0)
  		return cmp > 0;
  
  	/*
  	 * Same start time: arbitrarily treat the lower pointer value as
  	 * having started first.  t2 is only compared, never dereferenced,
  	 * so it does not matter if that task has exited in the meantime;
  	 * the value still tells apart two tasks started (effectively)
  	 * simultaneously.
  	 */
  	return t1 > t2;
  }
  
  /*
   * Heap ordering callback (see heap_insert()): non-zero when task @p1
   * started after task @p2, which keeps the heap ordered by descending
   * task start time.
   */
  static inline int started_after(void *p1, void *p2)
  {
  	struct task_struct *later = p1;
  	struct task_struct *earlier = p2;
  
  	return started_after_time(later, &earlier->start_time, earlier);
  }
  
  /**
   * cgroup_scan_tasks - iterate though all the tasks in a cgroup
   * @scan: struct cgroup_scanner containing arguments for the scan
   *
   * Arguments include pointers to callback functions test_task() and
   * process_task().
   * Iterate through all the tasks in a cgroup, calling test_task() for each,
   * and if it returns true, call process_task() for it also.
   * The test_task pointer may be NULL, meaning always true (select all tasks).
   * Effectively duplicates cgroup_iter_{start,next,end}()
   * but does not lock css_set_lock for the call to process_task().
   * The struct cgroup_scanner may be embedded in any structure of the caller's
   * creation.
   * It is guaranteed that process_task() will act on every task that
   * is a member of the cgroup for the duration of this call. This
   * function may or may not call process_task() for tasks that exit
   * or move to a different cgroup during the call, or are forked or
   * move into the cgroup during the call.
   *
   * Note that test_task() may be called with locks held, and may in some
   * situations be called multiple times for the same task, so it should
   * be cheap.
   * If the heap pointer in the struct cgroup_scanner is non-NULL, a heap has been
   * pre-allocated and will be used for heap operations (and its "gt" member will
   * be overwritten), else a temporary heap will be used (allocation of which
   * may cause this function to fail).
   */
  int cgroup_scan_tasks(struct cgroup_scanner *scan)
  {
  	int retval, i;
  	struct cgroup_iter it;
  	struct task_struct *p, *dropped;
  	/* Never dereference latest_task, since it's not refcounted */
  	struct task_struct *latest_task = NULL;
  	struct ptr_heap tmp_heap;
  	struct ptr_heap *heap;
  	struct timespec latest_time = { 0, 0 };
  
  	if (scan->heap) {
  		/* The caller supplied our heap and pre-allocated its memory */
  		heap = scan->heap;
  		heap->gt = &started_after;
  	} else {
  		/* We need to allocate our own heap memory */
  		heap = &tmp_heap;
  		retval = heap_init(heap, PAGE_SIZE, GFP_KERNEL, &started_after);
  		if (retval)
  			/* cannot allocate the heap */
  			return retval;
  	}
  
   again:
  	/*
  	 * Scan tasks in the cgroup, using the scanner's "test_task" callback
  	 * to determine which are of interest, and using the scanner's
  	 * "process_task" callback to process any of them that need an update.
  	 * Since we don't want to hold any locks during the task updates,
  	 * gather tasks to be processed in a heap structure.
  	 * The heap is sorted by descending task start time.
  	 * If the statically-sized heap fills up, we overflow tasks that
  	 * started later, and in future iterations only consider tasks that
  	 * started after the latest task in the previous pass. This
  	 * guarantees forward progress and that we don't miss any tasks.
  	 */
  	heap->size = 0;
  	cgroup_iter_start(scan->cg, &it);
  	while ((p = cgroup_iter_next(scan->cg, &it))) {
  		/*
  		 * Only affect tasks that qualify per the caller's callback,
  		 * if he provided one
  		 */
  		if (scan->test_task && !scan->test_task(p, scan))
  			continue;
  		/*
  		 * Only process tasks that started after the last task
  		 * we processed
  		 */
  		if (!started_after_time(p, &latest_time, latest_task))
  			continue;
  		dropped = heap_insert(heap, p);
  		if (dropped == NULL) {
  			/*
  			 * The new task was inserted; the heap wasn't
  			 * previously full
  			 */
  			get_task_struct(p);
  		} else if (dropped != p) {
  			/*
  			 * The new task was inserted, and pushed out a
  			 * different task
  			 */
  			get_task_struct(p);
  			put_task_struct(dropped);
  		}
  		/*
  		 * Else the new task was newer than anything already in
  		 * the heap and wasn't inserted
  		 */
  	}
  	cgroup_iter_end(scan->cg, &it);
  
  	if (heap->size) {
  		for (i = 0; i < heap->size; i++) {
4fe91d518   Paul Jackson   cgroup: fix spars...
2402
  			struct task_struct *q = heap->ptrs[i];
31a7df01f   Cliff Wickman   cgroups: mechanis...
2403
  			if (i == 0) {
4fe91d518   Paul Jackson   cgroup: fix spars...
2404
2405
  				latest_time = q->start_time;
  				latest_task = q;
31a7df01f   Cliff Wickman   cgroups: mechanis...
2406
2407
  			}
  			/* Process the task per the caller's callback */
4fe91d518   Paul Jackson   cgroup: fix spars...
2408
2409
  			scan->process_task(q, scan);
  			put_task_struct(q);
31a7df01f   Cliff Wickman   cgroups: mechanis...
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
  		}
  		/*
  		 * If we had to process any tasks at all, scan again
  		 * in case some of them were in the middle of forking
  		 * children that didn't get processed.
  		 * Not the most efficient way to do it, but it avoids
  		 * having to take callback_mutex in the fork path
  		 */
  		goto again;
  	}
  	if (heap == &tmp_heap)
  		heap_free(&tmp_heap);
  	return 0;
  }
817929ec2   Paul Menage   Task Control Grou...
2424
  /*
102a775e3   Ben Blum   cgroups: add a re...
2425
   * Stuff for reading the 'tasks'/'procs' files.
bbcb81d09   Paul Menage   Task Control Grou...
2426
2427
2428
2429
2430
2431
   *
   * Reading this file can return large amounts of data if a cgroup has
   * *lots* of attached tasks. So it may need several calls to read(),
   * but we cannot guarantee that the information we produce is correct
   * unless we produce it entirely atomically.
   *
bbcb81d09   Paul Menage   Task Control Grou...
2432
   */
bbcb81d09   Paul Menage   Task Control Grou...
2433
2434
  
  /*
d1d9fd330   Ben Blum   cgroups: use vmal...
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
   * The following three helpers (allocate, free, resize) "fix" the issue where
   * there are more pids than kmalloc will give memory for; in such cases, we
   * use vmalloc/vfree.
   * TODO: replace with a kernel-wide solution to this problem
   */
  #define PIDLIST_TOO_LARGE(c) ((c) * sizeof(pid_t) > (PAGE_SIZE * 2))
  static void *pidlist_allocate(int count)
  {
  	if (PIDLIST_TOO_LARGE(count))
  		return vmalloc(count * sizeof(pid_t));
  	else
  		return kmalloc(count * sizeof(pid_t), GFP_KERNEL);
  }
  /*
   * Release a pid array with the deallocator that matches how it was
   * obtained, decided by whether the address lies in vmalloc space.
   */
  static void pidlist_free(void *p)
  {
  	if (!is_vmalloc_addr(p)) {
  		kfree(p);
  		return;
  	}
  	vfree(p);
  }
  static void *pidlist_resize(void *p, int newcount)
  {
  	void *newlist;
  	/* note: if new alloc fails, old p will still be valid either way */
  	if (is_vmalloc_addr(p)) {
  		newlist = vmalloc(newcount * sizeof(pid_t));
  		if (!newlist)
  			return NULL;
  		memcpy(newlist, p, newcount * sizeof(pid_t));
  		vfree(p);
  	} else {
  		newlist = krealloc(p, newcount * sizeof(pid_t), GFP_KERNEL);
  	}
  	return newlist;
  }
  
  /*
102a775e3   Ben Blum   cgroups: add a re...
2471
2472
2473
2474
   * pidlist_uniq - given a kmalloc()ed list, strip out all duplicate entries
   * If the new stripped list is sufficiently smaller and there's enough memory
   * to allocate a new buffer, will let go of the unneeded memory. Returns the
   * number of unique elements.
bbcb81d09   Paul Menage   Task Control Grou...
2475
   */
102a775e3   Ben Blum   cgroups: add a re...
2476
2477
2478
  /* is the size difference enough that we should re-allocate the array? */
  #define PIDLIST_REALLOC_DIFFERENCE(old, new) ((old) - PAGE_SIZE >= (new))
  static int pidlist_uniq(pid_t **p, int length)
bbcb81d09   Paul Menage   Task Control Grou...
2479
  {
102a775e3   Ben Blum   cgroups: add a re...
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
  	int src, dest = 1;
  	pid_t *list = *p;
  	pid_t *newlist;
  
  	/*
  	 * we presume the 0th element is unique, so i starts at 1. trivial
  	 * edge cases first; no work needs to be done for either
  	 */
  	if (length == 0 || length == 1)
  		return length;
  	/* src and dest walk down the list; dest counts unique elements */
  	for (src = 1; src < length; src++) {
  		/* find next unique element */
  		while (list[src] == list[src-1]) {
  			src++;
  			if (src == length)
  				goto after;
  		}
  		/* dest always points to where the next unique element goes */
  		list[dest] = list[src];
  		dest++;
  	}
  after:
  	/*
  	 * if the length difference is large enough, we want to allocate a
  	 * smaller buffer to save memory. if this fails due to out of memory,
  	 * we'll just stay with what we've got.
  	 */
  	if (PIDLIST_REALLOC_DIFFERENCE(length, dest)) {
d1d9fd330   Ben Blum   cgroups: use vmal...
2509
  		newlist = pidlist_resize(list, dest);
102a775e3   Ben Blum   cgroups: add a re...
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
  		if (newlist)
  			*p = newlist;
  	}
  	return dest;
  }
  
  /*
   * sort() comparison callback for pid_t arrays: negative, zero or positive
   * when *a is lower than, equal to or greater than *b.  Uses comparisons
   * rather than subtraction so the result cannot overflow, and keeps the
   * pointers const-qualified.
   */
  static int cmppid(const void *a, const void *b)
  {
  	const pid_t lhs = *(const pid_t *)a;
  	const pid_t rhs = *(const pid_t *)b;
  
  	return (lhs > rhs) - (lhs < rhs);
  }
  
  /*
72a8cb30d   Ben Blum   cgroups: ensure c...
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
   * find the appropriate pidlist for our purpose (given procs vs tasks)
   * returns with the lock on that pidlist already held, and takes care
   * of the use count, or returns NULL with no locks held if we're out of
   * memory.
   */
  static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp,
  						  enum cgroup_filetype type)
  {
  	struct cgroup_pidlist *l;
  	/* don't need task_nsproxy() if we're looking at ourself */
b70cc5fdb   Li Zefan   cgroups: clean up...
2532
  	struct pid_namespace *ns = current->nsproxy->pid_ns;
72a8cb30d   Ben Blum   cgroups: ensure c...
2533
2534
2535
2536
2537
2538
2539
2540
2541
  	/*
  	 * We can't drop the pidlist_mutex before taking the l->mutex in case
  	 * the last ref-holder is trying to remove l from the list at the same
  	 * time. Holding the pidlist_mutex precludes somebody taking whichever
  	 * list we find out from under us - compare release_pid_array().
  	 */
  	mutex_lock(&cgrp->pidlist_mutex);
  	list_for_each_entry(l, &cgrp->pidlists, links) {
  		if (l->key.type == type && l->key.ns == ns) {
72a8cb30d   Ben Blum   cgroups: ensure c...
2542
2543
2544
  			/* make sure l doesn't vanish out from under us */
  			down_write(&l->mutex);
  			mutex_unlock(&cgrp->pidlist_mutex);
72a8cb30d   Ben Blum   cgroups: ensure c...
2545
2546
2547
2548
2549
2550
2551
  			return l;
  		}
  	}
  	/* entry not found; create a new one */
  	l = kmalloc(sizeof(struct cgroup_pidlist), GFP_KERNEL);
  	if (!l) {
  		mutex_unlock(&cgrp->pidlist_mutex);
72a8cb30d   Ben Blum   cgroups: ensure c...
2552
2553
2554
2555
2556
  		return l;
  	}
  	init_rwsem(&l->mutex);
  	down_write(&l->mutex);
  	l->key.type = type;
b70cc5fdb   Li Zefan   cgroups: clean up...
2557
  	l->key.ns = get_pid_ns(ns);
72a8cb30d   Ben Blum   cgroups: ensure c...
2558
2559
2560
2561
2562
2563
2564
2565
2566
  	l->use_count = 0; /* don't increment here */
  	l->list = NULL;
  	l->owner = cgrp;
  	list_add(&l->links, &cgrp->pidlists);
  	mutex_unlock(&cgrp->pidlist_mutex);
  	return l;
  }
  
  /*
102a775e3   Ben Blum   cgroups: add a re...
2567
2568
   * Load a cgroup's pidarray with either procs' tgids or tasks' pids
   */
72a8cb30d   Ben Blum   cgroups: ensure c...
2569
2570
  static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,
  			      struct cgroup_pidlist **lp)
102a775e3   Ben Blum   cgroups: add a re...
2571
2572
2573
2574
  {
  	pid_t *array;
  	int length;
  	int pid, n = 0; /* used for populating the array */
817929ec2   Paul Menage   Task Control Grou...
2575
2576
  	struct cgroup_iter it;
  	struct task_struct *tsk;
102a775e3   Ben Blum   cgroups: add a re...
2577
2578
2579
2580
2581
2582
2583
2584
2585
  	struct cgroup_pidlist *l;
  
  	/*
  	 * If cgroup gets more users after we read count, we won't have
  	 * enough space - tough.  This race is indistinguishable to the
  	 * caller from the case that the additional cgroup users didn't
  	 * show up until sometime later on.
  	 */
  	length = cgroup_task_count(cgrp);
d1d9fd330   Ben Blum   cgroups: use vmal...
2586
  	array = pidlist_allocate(length);
102a775e3   Ben Blum   cgroups: add a re...
2587
2588
2589
  	if (!array)
  		return -ENOMEM;
  	/* now, populate the array */
bd89aabc6   Paul Menage   Control groups: R...
2590
2591
  	cgroup_iter_start(cgrp, &it);
  	while ((tsk = cgroup_iter_next(cgrp, &it))) {
102a775e3   Ben Blum   cgroups: add a re...
2592
  		if (unlikely(n == length))
817929ec2   Paul Menage   Task Control Grou...
2593
  			break;
102a775e3   Ben Blum   cgroups: add a re...
2594
  		/* get tgid or pid for procs or tasks file respectively */
72a8cb30d   Ben Blum   cgroups: ensure c...
2595
2596
2597
2598
  		if (type == CGROUP_FILE_PROCS)
  			pid = task_tgid_vnr(tsk);
  		else
  			pid = task_pid_vnr(tsk);
102a775e3   Ben Blum   cgroups: add a re...
2599
2600
  		if (pid > 0) /* make sure to only use valid results */
  			array[n++] = pid;
817929ec2   Paul Menage   Task Control Grou...
2601
  	}
bd89aabc6   Paul Menage   Control groups: R...
2602
  	cgroup_iter_end(cgrp, &it);
102a775e3   Ben Blum   cgroups: add a re...
2603
2604
2605
  	length = n;
  	/* now sort & (if procs) strip out duplicates */
  	sort(array, length, sizeof(pid_t), cmppid, NULL);
72a8cb30d   Ben Blum   cgroups: ensure c...
2606
  	if (type == CGROUP_FILE_PROCS)
102a775e3   Ben Blum   cgroups: add a re...
2607
  		length = pidlist_uniq(&array, length);
72a8cb30d   Ben Blum   cgroups: ensure c...
2608
2609
  	l = cgroup_pidlist_find(cgrp, type);
  	if (!l) {
d1d9fd330   Ben Blum   cgroups: use vmal...
2610
  		pidlist_free(array);
72a8cb30d   Ben Blum   cgroups: ensure c...
2611
  		return -ENOMEM;
102a775e3   Ben Blum   cgroups: add a re...
2612
  	}
72a8cb30d   Ben Blum   cgroups: ensure c...
2613
  	/* store array, freeing old if necessary - lock already held */
d1d9fd330   Ben Blum   cgroups: use vmal...
2614
  	pidlist_free(l->list);
102a775e3   Ben Blum   cgroups: add a re...
2615
2616
2617
2618
  	l->list = array;
  	l->length = length;
  	l->use_count++;
  	up_write(&l->mutex);
72a8cb30d   Ben Blum   cgroups: ensure c...
2619
  	*lp = l;
102a775e3   Ben Blum   cgroups: add a re...
2620
  	return 0;
bbcb81d09   Paul Menage   Task Control Grou...
2621
  }
846c7bb05   Balbir Singh   Add cgroupstats
2622
  /**
a043e3b2c   Li Zefan   cgroup: fix comments
2623
   * cgroupstats_build - build and fill cgroupstats
846c7bb05   Balbir Singh   Add cgroupstats
2624
2625
2626
   * @stats: cgroupstats to fill information into
   * @dentry: A dentry entry belonging to the cgroup for which stats have
   * been requested.
a043e3b2c   Li Zefan   cgroup: fix comments
2627
2628
2629
   *
   * Build and fill cgroupstats so that taskstats can export it to user
   * space.
846c7bb05   Balbir Singh   Add cgroupstats
2630
2631
2632
2633
   */
  int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry)
  {
  	int ret = -EINVAL;
bd89aabc6   Paul Menage   Control groups: R...
2634
  	struct cgroup *cgrp;
846c7bb05   Balbir Singh   Add cgroupstats
2635
2636
  	struct cgroup_iter it;
  	struct task_struct *tsk;
33d283bef   Li Zefan   cgroups: fix a se...
2637

846c7bb05   Balbir Singh   Add cgroupstats
2638
  	/*
33d283bef   Li Zefan   cgroups: fix a se...
2639
2640
  	 * Validate dentry by checking the superblock operations,
  	 * and make sure it's a directory.
846c7bb05   Balbir Singh   Add cgroupstats
2641
  	 */
33d283bef   Li Zefan   cgroups: fix a se...
2642
2643
  	if (dentry->d_sb->s_op != &cgroup_ops ||
  	    !S_ISDIR(dentry->d_inode->i_mode))
846c7bb05   Balbir Singh   Add cgroupstats
2644
2645
2646
  		 goto err;
  
  	ret = 0;
bd89aabc6   Paul Menage   Control groups: R...
2647
  	cgrp = dentry->d_fsdata;
846c7bb05   Balbir Singh   Add cgroupstats
2648

bd89aabc6   Paul Menage   Control groups: R...
2649
2650
  	cgroup_iter_start(cgrp, &it);
  	while ((tsk = cgroup_iter_next(cgrp, &it))) {
846c7bb05   Balbir Singh   Add cgroupstats
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
  		switch (tsk->state) {
  		case TASK_RUNNING:
  			stats->nr_running++;
  			break;
  		case TASK_INTERRUPTIBLE:
  			stats->nr_sleeping++;
  			break;
  		case TASK_UNINTERRUPTIBLE:
  			stats->nr_uninterruptible++;
  			break;
  		case TASK_STOPPED:
  			stats->nr_stopped++;
  			break;
  		default:
  			if (delayacct_is_task_waiting_on_io(tsk))
  				stats->nr_io_wait++;
  			break;
  		}
  	}
bd89aabc6   Paul Menage   Control groups: R...
2670
  	cgroup_iter_end(cgrp, &it);
846c7bb05   Balbir Singh   Add cgroupstats
2671

846c7bb05   Balbir Singh   Add cgroupstats
2672
2673
2674
  err:
  	return ret;
  }
8f3ff2086   Paul Menage   cgroups: revert "...
2675

bbcb81d09   Paul Menage   Task Control Grou...
2676
  /*
102a775e3   Ben Blum   cgroups: add a re...
2677
   * seq_file methods for the tasks/procs files. The seq_file position is the
cc31edcee   Paul Menage   cgroups: convert ...
2678
   * next pid to display; the seq_file iterator is a pointer to the pid
102a775e3   Ben Blum   cgroups: add a re...
2679
   * in the cgroup->l->list array.
bbcb81d09   Paul Menage   Task Control Grou...
2680
   */
cc31edcee   Paul Menage   cgroups: convert ...
2681

102a775e3   Ben Blum   cgroups: add a re...
2682
  static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos)
  {
  	/*
  	 * *pos carries the pid to resume from; 0 means "start at the first
  	 * entry". For a non-zero value the array is bisected to locate that
  	 * pid or, if it is no longer present, the slot where the search
  	 * converged.
  	 *
  	 * The read lock taken here is left held on every return path; the
  	 * matching up_read() lives in cgroup_pidlist_stop().
  	 */
  	struct cgroup_pidlist *l = s->private;
  	int target = *pos;
  	int lo = 0;
  	int *found;
  
  	down_read(&l->mutex);
  	if (target) {
  		int hi = l->length;
  
  		while (lo < hi) {
  			int mid = (lo + hi) / 2;
  
  			if (l->list[mid] == target) {
  				lo = mid;
  				break;
  			}
  			if (l->list[mid] < target)
  				lo = mid + 1;
  			else
  				hi = mid;
  		}
  	}
  	/* Ran past the last entry: nothing left to show */
  	if (lo >= l->length)
  		return NULL;
  	/* Make the position reflect the pid that will actually be shown */
  	found = l->list + lo;
  	*pos = *found;
  	return found;
  }
102a775e3   Ben Blum   cgroups: add a re...
2716
  static void cgroup_pidlist_stop(struct seq_file *s, void *v)
cc31edcee   Paul Menage   cgroups: convert ...
2717
  {
102a775e3   Ben Blum   cgroups: add a re...
2718
2719
  	struct cgroup_pidlist *l = s->private;
  	up_read(&l->mutex);
cc31edcee   Paul Menage   cgroups: convert ...
2720
  }
102a775e3   Ben Blum   cgroups: add a re...
2721
  /*
   * seq_file ->next: step to the array slot following @v. Returns NULL once
   * the end of the pid array is reached; otherwise stores the pid found there
   * in *pos and returns a pointer to it.
   */
  static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos)
  {
  	struct cgroup_pidlist *l = s->private;
  	pid_t *limit = l->list + l->length;
  	pid_t *next = v;
  
  	next++;
  	if (next >= limit)
  		return NULL;
  
  	*pos = *next;
  	return next;
  }
102a775e3   Ben Blum   cgroups: add a re...
2738
  /*
   * seq_file ->show: print the pid that iterator @v points at, in decimal,
   * followed by a newline. Returns whatever seq_printf() returns.
   */
  static int cgroup_pidlist_show(struct seq_file *s, void *v)
  {
  	/* the "\n" escape must stay inside the literal, on one source line */
  	return seq_printf(s, "%d\n", *(int *)v);
  }
bbcb81d09   Paul Menage   Task Control Grou...
2743

102a775e3   Ben Blum   cgroups: add a re...
2744
2745
2746
2747
2748
2749
2750
2751
2752
  /*
   * seq_operations functions for iterating on pidlists through seq_file -
   * independent of whether it's tasks or procs
   */
  static const struct seq_operations cgroup_pidlist_seq_operations = {
  	/* listed in the order an iteration pass uses them */
  	.start	= cgroup_pidlist_start,
  	.show	= cgroup_pidlist_show,
  	.next	= cgroup_pidlist_next,
  	.stop	= cgroup_pidlist_stop,
  };
102a775e3   Ben Blum   cgroups: add a re...
2754
  static void cgroup_release_pid_array(struct cgroup_pidlist *l)
cc31edcee   Paul Menage   cgroups: convert ...
2755
  {
72a8cb30d   Ben Blum   cgroups: ensure c...
2756
2757
2758
2759
2760
2761
2762
  	/*
  	 * the case where we're the last user of this particular pidlist will
  	 * have us remove it from the cgroup's list, which entails taking the
  	 * mutex. since in pidlist_find the pidlist->lock depends on cgroup->
  	 * pidlist_mutex, we have to take pidlist_mutex first.
  	 */
  	mutex_lock(&l->owner->pidlist_mutex);
102a775e3   Ben Blum   cgroups: add a re...
2763
2764
2765
  	down_write(&l->mutex);
  	BUG_ON(!l->use_count);
  	if (!--l->use_count) {
72a8cb30d   Ben Blum   cgroups: ensure c...
2766
2767
2768
  		/* we're the last user if refcount is 0; remove and free */
  		list_del(&l->links);
  		mutex_unlock(&l->owner->pidlist_mutex);
d1d9fd330   Ben Blum   cgroups: use vmal...
2769
  		pidlist_free(l->list);
72a8cb30d   Ben Blum   cgroups: ensure c...
2770
2771
2772
2773
  		put_pid_ns(l->key.ns);
  		up_write(&l->mutex);
  		kfree(l);
  		return;
cc31edcee   Paul Menage   cgroups: convert ...
2774
  	}
72a8cb30d   Ben Blum   cgroups: ensure c...
2775
  	mutex_unlock(&l->owner->pidlist_mutex);
102a775e3   Ben Blum   cgroups: add a re...
2776
  	up_write(&l->mutex);
bbcb81d09   Paul Menage   Task Control Grou...
2777
  }
102a775e3   Ben Blum   cgroups: add a re...
2778
  static int cgroup_pidlist_release(struct inode *inode, struct file *file)
cc31edcee   Paul Menage   cgroups: convert ...
2779
  {
102a775e3   Ben Blum   cgroups: add a re...
2780
  	struct cgroup_pidlist *l;
cc31edcee   Paul Menage   cgroups: convert ...
2781
2782
  	if (!(file->f_mode & FMODE_READ))
  		return 0;
102a775e3   Ben Blum   cgroups: add a re...
2783
2784
2785
2786
2787
2788
  	/*
  	 * the seq_file will only be initialized if the file was opened for
  	 * reading; hence we check if it's not null only in that case.
  	 */
  	l = ((struct seq_file *)file->private_data)->private;
  	cgroup_release_pid_array(l);
cc31edcee   Paul Menage   cgroups: convert ...
2789
2790
  	return seq_release(inode, file);
  }
102a775e3   Ben Blum   cgroups: add a re...
2791
  static const struct file_operations cgroup_pidlist_operations = {
cc31edcee   Paul Menage   cgroups: convert ...
2792
2793
2794
  	.read = seq_read,
  	.llseek = seq_lseek,
  	.write = cgroup_file_write,
102a775e3   Ben Blum   cgroups: add a re...
2795
  	.release = cgroup_pidlist_release,
cc31edcee   Paul Menage   cgroups: convert ...
2796
  };
bbcb81d09   Paul Menage   Task Control Grou...
2797
  /*
102a775e3   Ben Blum   cgroups: add a re...
2798
2799
2800
   * The following functions handle opens on a file that displays a pidlist
   * (tasks or procs). Prepare an array of the process/thread IDs of whoever's
   * in the cgroup.
bbcb81d09   Paul Menage   Task Control Grou...
2801
   */
102a775e3   Ben Blum   cgroups: add a re...
2802
  /* helper function for the two below it */
72a8cb30d   Ben Blum   cgroups: ensure c...
2803
  static int cgroup_pidlist_open(struct file *file, enum cgroup_filetype type)
bbcb81d09   Paul Menage   Task Control Grou...
2804
  {
bd89aabc6   Paul Menage   Control groups: R...
2805
  	struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);
72a8cb30d   Ben Blum   cgroups: ensure c...
2806
  	struct cgroup_pidlist *l;
cc31edcee   Paul Menage   cgroups: convert ...
2807
  	int retval;
bbcb81d09   Paul Menage   Task Control Grou...
2808

cc31edcee   Paul Menage   cgroups: convert ...
2809
  	/* Nothing to do for write-only files */
bbcb81d09   Paul Menage   Task Control Grou...
2810
2811
  	if (!(file->f_mode & FMODE_READ))
  		return 0;
102a775e3   Ben Blum   cgroups: add a re...
2812
  	/* have the array populated */
72a8cb30d   Ben Blum   cgroups: ensure c...
2813
  	retval = pidlist_array_load(cgrp, type, &l);
102a775e3   Ben Blum   cgroups: add a re...
2814
2815
2816
2817
  	if (retval)
  		return retval;
  	/* configure file information */
  	file->f_op = &cgroup_pidlist_operations;
cc31edcee   Paul Menage   cgroups: convert ...
2818

102a775e3   Ben Blum   cgroups: add a re...
2819
  	retval = seq_open(file, &cgroup_pidlist_seq_operations);
cc31edcee   Paul Menage   cgroups: convert ...
2820
  	if (retval) {
102a775e3   Ben Blum   cgroups: add a re...
2821
  		cgroup_release_pid_array(l);
cc31edcee   Paul Menage   cgroups: convert ...
2822
  		return retval;
bbcb81d09   Paul Menage   Task Control Grou...
2823
  	}
102a775e3   Ben Blum   cgroups: add a re...
2824
  	((struct seq_file *)file->private_data)->private = l;
bbcb81d09   Paul Menage   Task Control Grou...
2825
2826
  	return 0;
  }
102a775e3   Ben Blum   cgroups: add a re...
2827
2828
  static int cgroup_tasks_open(struct inode *unused, struct file *file)
  {
72a8cb30d   Ben Blum   cgroups: ensure c...
2829
  	return cgroup_pidlist_open(file, CGROUP_FILE_TASKS);
102a775e3   Ben Blum   cgroups: add a re...
2830
2831
2832
  }
  static int cgroup_procs_open(struct inode *unused, struct file *file)
  {
72a8cb30d   Ben Blum   cgroups: ensure c...
2833
  	return cgroup_pidlist_open(file, CGROUP_FILE_PROCS);
102a775e3   Ben Blum   cgroups: add a re...
2834
  }
bbcb81d09   Paul Menage   Task Control Grou...
2835

bd89aabc6   Paul Menage   Control groups: R...
2836
  /* read_u64 handler of "notify_on_release": report notify_on_release(cgrp) */
  static u64 cgroup_read_notify_on_release(struct cgroup *cgrp,
  					    struct cftype *cft)
  {
  	u64 enabled = notify_on_release(cgrp);
  
  	return enabled;
  }
6379c1061   Paul Menage   cgroup files: mov...
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
  /*
   * write_u64 handler of "notify_on_release". Always clears CGRP_RELEASABLE,
   * then sets CGRP_NOTIFY_ON_RELEASE for a non-zero @val and clears it for
   * zero. Always returns 0.
   */
  static int cgroup_write_notify_on_release(struct cgroup *cgrp,
  					  struct cftype *cft,
  					  u64 val)
  {
  	clear_bit(CGRP_RELEASABLE, &cgrp->flags);
  
  	if (!val) {
  		clear_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
  		return 0;
  	}
  
  	set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
  	return 0;
  }
bbcb81d09   Paul Menage   Task Control Grou...
2852
  /*
0dea11687   Kirill A. Shutemov   cgroup: implement...
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
   * Unregister event and free resources.
   *
   * Gets called from workqueue.
   */
  static void cgroup_event_remove(struct work_struct *work)
  {
  	struct cgroup_event *event = container_of(work, struct cgroup_event,
  			remove);
  	struct cgroup *cgrp = event->cgrp;
  
  	/* TODO: check return code */
  	event->cft->unregister_event(cgrp, event->cft, event->eventfd);
  
  	eventfd_ctx_put(event->eventfd);
0dea11687   Kirill A. Shutemov   cgroup: implement...
2867
  	kfree(event);
a0a4db548   Kirill A. Shutemov   cgroups: remove e...
2868
  	dput(cgrp->dentry);
0dea11687   Kirill A. Shutemov   cgroup: implement...
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
  }
  
  /*
   * Gets called on POLLHUP on eventfd when user closes it.
   *
   * Called with wqh->lock held and interrupts disabled.
   */
  static int cgroup_event_wake(wait_queue_t *wait, unsigned mode,
  		int sync, void *key)
  {
  	struct cgroup_event *ev = container_of(wait,
  			struct cgroup_event, wait);
  	struct cgroup *cgrp = ev->cgrp;
  	unsigned long poll_flags = (unsigned long)key;
  
  	/* Only a hang-up is of interest here */
  	if (!(poll_flags & POLLHUP))
  		return 0;
  
  	/* Detach from the eventfd wait queue and from the cgroup's list */
  	remove_wait_queue_locked(ev->wqh, &ev->wait);
  	spin_lock(&cgrp->event_list_lock);
  	list_del(&ev->list);
  	spin_unlock(&cgrp->event_list_lock);
  
  	/*
  	 * This runs in atomic context while cgroup_event_remove() may
  	 * sleep, so the actual teardown is deferred to a workqueue.
  	 */
  	schedule_work(&ev->remove);
  
  	return 0;
  }
  
  /*
   * poll_table queueing callback: record which wait queue head the event is
   * attached to and add the event's wait entry to it.
   */
  static void cgroup_event_ptable_queue_proc(struct file *file,
  		wait_queue_head_t *wqh, poll_table *pt)
  {
  	struct cgroup_event *ev;
  
  	ev = container_of(pt, struct cgroup_event, pt);
  	/* kept so cgroup_event_wake() can detach from the same queue */
  	ev->wqh = wqh;
  	add_wait_queue(wqh, &ev->wait);
  }
  
  /*
   * Parse input and register new cgroup event handler.
   *
   * Input must be in format '<event_fd> <control_fd> <args>'.
   * Interpretation of args is defined by control file implementation.
   */
  static int cgroup_write_event_control(struct cgroup *cgrp, struct cftype *cft,
  				      const char *buffer)
  {
  	struct cgroup_event *event = NULL;
  	unsigned int efd, cfd;
  	struct file *efile = NULL;
  	struct file *cfile = NULL;
  	char *endp;
  	int ret;
  
  	/* <event_fd> must be followed by exactly one space */
  	efd = simple_strtoul(buffer, &endp, 10);
  	if (*endp != ' ')
  		return -EINVAL;
  	buffer = endp + 1;
  
  	/* <control_fd> is followed by a space and <args>, or ends the input */
  	cfd = simple_strtoul(buffer, &endp, 10);
  	if ((*endp != ' ') && (*endp != '\0'))
  		return -EINVAL;
  	/*
  	 * Step over the separator only if there is one. When the input ends
  	 * right after <control_fd>, stay on the terminating NUL so that
  	 * register_event() sees an empty argument string rather than a
  	 * pointer past the end of the string.
  	 */
  	buffer = (*endp == ' ') ? endp + 1 : endp;
  
  	event = kzalloc(sizeof(*event), GFP_KERNEL);
  	if (!event)
  		return -ENOMEM;
  	event->cgrp = cgrp;
  	INIT_LIST_HEAD(&event->list);
  	init_poll_funcptr(&event->pt, cgroup_event_ptable_queue_proc);
  	init_waitqueue_func_entry(&event->wait, cgroup_event_wake);
  	INIT_WORK(&event->remove, cgroup_event_remove);
  
  	efile = eventfd_fget(efd);
  	if (IS_ERR(efile)) {
  		ret = PTR_ERR(efile);
  		goto fail;
  	}
  
  	event->eventfd = eventfd_ctx_fileget(efile);
  	if (IS_ERR(event->eventfd)) {
  		ret = PTR_ERR(event->eventfd);
  		goto fail;
  	}
  
  	cfile = fget(cfd);
  	if (!cfile) {
  		ret = -EBADF;
  		goto fail;
  	}
  
  	/* the process need read permission on control file */
  	ret = file_permission(cfile, MAY_READ);
  	if (ret < 0)
  		goto fail;
  
  	event->cft = __file_cft(cfile);
  	if (IS_ERR(event->cft)) {
  		ret = PTR_ERR(event->cft);
  		goto fail;
  	}
  
  	/* the control file must implement both halves of the event API */
  	if (!event->cft->register_event || !event->cft->unregister_event) {
  		ret = -EINVAL;
  		goto fail;
  	}
  
  	ret = event->cft->register_event(cgrp, event->cft,
  			event->eventfd, buffer);
  	if (ret)
  		goto fail;
  
  	/*
  	 * Polling hooks the event into the eventfd's wait queue through
  	 * cgroup_event_ptable_queue_proc(). If the eventfd already reports
  	 * a hang-up, undo the registration and report success.
  	 */
  	if (efile->f_op->poll(efile, &event->pt) & POLLHUP) {
  		event->cft->unregister_event(cgrp, event->cft, event->eventfd);
  		ret = 0;
  		goto fail;
  	}
  
  	/*
  	 * Events should be removed after rmdir of cgroup directory, but before
  	 * destroying subsystem state objects. Let's take reference to cgroup
  	 * directory dentry to do that.
  	 */
  	dget(cgrp->dentry);
  
  	spin_lock(&cgrp->event_list_lock);
  	list_add(&event->list, &cgrp->event_list);
  	spin_unlock(&cgrp->event_list_lock);
  
  	fput(cfile);
  	fput(efile);
  
  	return 0;
  
  fail:
  	/* release only what was actually acquired before the failure */
  	if (cfile)
  		fput(cfile);
  
  	if (event && event->eventfd && !IS_ERR(event->eventfd))
  		eventfd_ctx_put(event->eventfd);
  
  	if (!IS_ERR_OR_NULL(efile))
  		fput(efile);
  
  	kfree(event);
  
  	return ret;
  }
  
  /*
bbcb81d09   Paul Menage   Task Control Grou...
3019
3020
   * for the common functions, 'private' gives the type of file
   */
102a775e3   Ben Blum   cgroups: add a re...
3021
3022
  /* for hysterical raisins, we can't put this on the older files */
  #define CGROUP_FILE_GENERIC_PREFIX "cgroup."
81a6a5cdd   Paul Menage   Task Control Grou...
3023
3024
3025
3026
  static struct cftype files[] = {
  	{
  		.name = "tasks",
  		.open = cgroup_tasks_open,
af351026a   Paul Menage   cgroup files: tur...
3027
  		.write_u64 = cgroup_tasks_write,
102a775e3   Ben Blum   cgroups: add a re...
3028
  		.release = cgroup_pidlist_release,
099fca322   Li Zefan   cgroups: show cor...
3029
  		.mode = S_IRUGO | S_IWUSR,
81a6a5cdd   Paul Menage   Task Control Grou...
3030
  	},
102a775e3   Ben Blum   cgroups: add a re...
3031
3032
3033
3034
3035
3036
3037
  	{
  		.name = CGROUP_FILE_GENERIC_PREFIX "procs",
  		.open = cgroup_procs_open,
  		/* .write_u64 = cgroup_procs_write, TODO */
  		.release = cgroup_pidlist_release,
  		.mode = S_IRUGO,
  	},
81a6a5cdd   Paul Menage   Task Control Grou...
3038
3039
  	{
  		.name = "notify_on_release",
f4c753b7e   Paul Menage   CGroup API files:...
3040
  		.read_u64 = cgroup_read_notify_on_release,
6379c1061   Paul Menage   cgroup files: mov...
3041
  		.write_u64 = cgroup_write_notify_on_release,
81a6a5cdd   Paul Menage   Task Control Grou...
3042
  	},
0dea11687   Kirill A. Shutemov   cgroup: implement...
3043
3044
3045
3046
3047
  	{
  		.name = CGROUP_FILE_GENERIC_PREFIX "event_control",
  		.write_string = cgroup_write_event_control,
  		.mode = S_IWUGO,
  	},
81a6a5cdd   Paul Menage   Task Control Grou...
3048
3049
3050
3051
  };
  
  static struct cftype cft_release_agent = {
  	.name = "release_agent",
e788e066c   Paul Menage   cgroup files: mov...
3052
3053
3054
  	.read_seq_string = cgroup_release_agent_show,
  	.write_string = cgroup_release_agent_write,
  	.max_write_len = PATH_MAX,
bbcb81d09   Paul Menage   Task Control Grou...
3055
  };
bd89aabc6   Paul Menage   Control groups: R...
3056
  static int cgroup_populate_dir(struct cgroup *cgrp)
ddbcc7e8e   Paul Menage   Task Control Grou...
3057
3058
3059
3060
3061
  {
  	int err;
  	struct cgroup_subsys *ss;
  
  	/* First clear out any existing files */
bd89aabc6   Paul Menage   Control groups: R...
3062
  	cgroup_clear_directory(cgrp->dentry);
ddbcc7e8e   Paul Menage   Task Control Grou...
3063

bd89aabc6   Paul Menage   Control groups: R...
3064
  	err = cgroup_add_files(cgrp, NULL, files, ARRAY_SIZE(files));
bbcb81d09   Paul Menage   Task Control Grou...
3065
3066
  	if (err < 0)
  		return err;
bd89aabc6   Paul Menage   Control groups: R...
3067
3068
  	if (cgrp == cgrp->top_cgroup) {
  		if ((err = cgroup_add_file(cgrp, NULL, &cft_release_agent)) < 0)
81a6a5cdd   Paul Menage   Task Control Grou...
3069
3070
  			return err;
  	}
bd89aabc6   Paul Menage   Control groups: R...
3071
3072
  	for_each_subsys(cgrp->root, ss) {
  		if (ss->populate && (err = ss->populate(ss, cgrp)) < 0)
ddbcc7e8e   Paul Menage   Task Control Grou...
3073
3074
  			return err;
  	}
38460b48d   KAMEZAWA Hiroyuki   cgroup: CSS ID su...
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
  	/* This cgroup is ready now */
  	for_each_subsys(cgrp->root, ss) {
  		struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
  		/*
  		 * Update id->css pointer and make this css visible from
  		 * CSS ID functions. This pointer will be dereferened
  		 * from RCU-read-side without locks.
  		 */
  		if (css->id)
  			rcu_assign_pointer(css->id->css, css);
  	}
ddbcc7e8e   Paul Menage   Task Control Grou...
3086
3087
3088
3089
3090
3091
  
  	return 0;
  }
  
  static void init_cgroup_css(struct cgroup_subsys_state *css,
  			       struct cgroup_subsys *ss,
bd89aabc6   Paul Menage   Control groups: R...
3092
  			       struct cgroup *cgrp)
ddbcc7e8e   Paul Menage   Task Control Grou...
3093
  {
bd89aabc6   Paul Menage   Control groups: R...
3094
  	css->cgroup = cgrp;
e7c5ec919   Paul Menage   cgroups: add css_...
3095
  	atomic_set(&css->refcnt, 1);
ddbcc7e8e   Paul Menage   Task Control Grou...
3096
  	css->flags = 0;
38460b48d   KAMEZAWA Hiroyuki   cgroup: CSS ID su...
3097
  	css->id = NULL;
bd89aabc6   Paul Menage   Control groups: R...
3098
  	if (cgrp == dummytop)
ddbcc7e8e   Paul Menage   Task Control Grou...
3099
  		set_bit(CSS_ROOT, &css->flags);
bd89aabc6   Paul Menage   Control groups: R...
3100
3101
  	BUG_ON(cgrp->subsys[ss->subsys_id]);
  	cgrp->subsys[ss->subsys_id] = css;
ddbcc7e8e   Paul Menage   Task Control Grou...
3102
  }
999cd8a45   Paul Menage   cgroups: add a pe...
3103
3104
3105
3106
  static void cgroup_lock_hierarchy(struct cgroupfs_root *root)
  {
  	/* We need to take each hierarchy_mutex in a consistent order */
  	int i;
aae8aab40   Ben Blum   cgroups: revamp s...
3107
3108
3109
3110
  	/*
  	 * No worry about a race with rebind_subsystems that might mess up the
  	 * locking order, since both parties are under cgroup_mutex.
  	 */
999cd8a45   Paul Menage   cgroups: add a pe...
3111
3112
  	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
  		struct cgroup_subsys *ss = subsys[i];
aae8aab40   Ben Blum   cgroups: revamp s...
3113
3114
  		if (ss == NULL)
  			continue;
999cd8a45   Paul Menage   cgroups: add a pe...
3115
  		if (ss->root == root)
cfebe563b   Li Zefan   cgroups: fix lock...
3116
  			mutex_lock(&ss->hierarchy_mutex);
999cd8a45   Paul Menage   cgroups: add a pe...
3117
3118
3119
3120
3121
3122
3123
3124
3125
  	}
  }
  
  static void cgroup_unlock_hierarchy(struct cgroupfs_root *root)
  {
  	int i;
  
  	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
  		struct cgroup_subsys *ss = subsys[i];
aae8aab40   Ben Blum   cgroups: revamp s...
3126
3127
  		if (ss == NULL)
  			continue;
999cd8a45   Paul Menage   cgroups: add a pe...
3128
3129
3130
3131
  		if (ss->root == root)
  			mutex_unlock(&ss->hierarchy_mutex);
  	}
  }
ddbcc7e8e   Paul Menage   Task Control Grou...
3132
  /*
a043e3b2c   Li Zefan   cgroup: fix comments
3133
3134
3135
3136
   * cgroup_create - create a cgroup
   * @parent: cgroup that will be parent of the new cgroup
   * @dentry: dentry of the new cgroup
   * @mode: mode to set on new inode
ddbcc7e8e   Paul Menage   Task Control Grou...
3137
   *
a043e3b2c   Li Zefan   cgroup: fix comments
3138
   * Must be called with the mutex on the parent inode held
ddbcc7e8e   Paul Menage   Task Control Grou...
3139
   */
ddbcc7e8e   Paul Menage   Task Control Grou...
3140
  static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
099fca322   Li Zefan   cgroups: show cor...
3141
  			     mode_t mode)
ddbcc7e8e   Paul Menage   Task Control Grou...
3142
  {
bd89aabc6   Paul Menage   Control groups: R...
3143
  	struct cgroup *cgrp;
ddbcc7e8e   Paul Menage   Task Control Grou...
3144
3145
3146
3147
  	struct cgroupfs_root *root = parent->root;
  	int err = 0;
  	struct cgroup_subsys *ss;
  	struct super_block *sb = root->sb;
bd89aabc6   Paul Menage   Control groups: R...
3148
3149
  	cgrp = kzalloc(sizeof(*cgrp), GFP_KERNEL);
  	if (!cgrp)
ddbcc7e8e   Paul Menage   Task Control Grou...
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
  		return -ENOMEM;
  
  	/* Grab a reference on the superblock so the hierarchy doesn't
  	 * get deleted on unmount if there are child cgroups.  This
  	 * can be done outside cgroup_mutex, since the sb can't
  	 * disappear while someone has an open control file on the
  	 * fs */
  	atomic_inc(&sb->s_active);
  
  	mutex_lock(&cgroup_mutex);
cc31edcee   Paul Menage   cgroups: convert ...
3160
  	init_cgroup_housekeeping(cgrp);
ddbcc7e8e   Paul Menage   Task Control Grou...
3161

bd89aabc6   Paul Menage   Control groups: R...
3162
3163
3164
  	cgrp->parent = parent;
  	cgrp->root = parent->root;
  	cgrp->top_cgroup = parent->top_cgroup;
ddbcc7e8e   Paul Menage   Task Control Grou...
3165

b6abdb0e6   Li Zefan   cgroup: fix defau...
3166
3167
  	if (notify_on_release(parent))
  		set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
ddbcc7e8e   Paul Menage   Task Control Grou...
3168
  	for_each_subsys(root, ss) {
bd89aabc6   Paul Menage   Control groups: R...
3169
  		struct cgroup_subsys_state *css = ss->create(ss, cgrp);
4528fd059   Li Zefan   cgroups: fix to r...
3170

ddbcc7e8e   Paul Menage   Task Control Grou...
3171
3172
3173
3174
  		if (IS_ERR(css)) {
  			err = PTR_ERR(css);
  			goto err_destroy;
  		}
bd89aabc6   Paul Menage   Control groups: R...
3175
  		init_cgroup_css(css, ss, cgrp);
4528fd059   Li Zefan   cgroups: fix to r...
3176
3177
3178
  		if (ss->use_id) {
  			err = alloc_css_id(ss, parent, cgrp);
  			if (err)
38460b48d   KAMEZAWA Hiroyuki   cgroup: CSS ID su...
3179
  				goto err_destroy;
4528fd059   Li Zefan   cgroups: fix to r...
3180
  		}
38460b48d   KAMEZAWA Hiroyuki   cgroup: CSS ID su...
3181
  		/* At error, ->destroy() callback has to free assigned ID. */
ddbcc7e8e   Paul Menage   Task Control Grou...
3182
  	}
999cd8a45   Paul Menage   cgroups: add a pe...
3183
  	cgroup_lock_hierarchy(root);
bd89aabc6   Paul Menage   Control groups: R...
3184
  	list_add(&cgrp->sibling, &cgrp->parent->children);
999cd8a45   Paul Menage   cgroups: add a pe...
3185
  	cgroup_unlock_hierarchy(root);
ddbcc7e8e   Paul Menage   Task Control Grou...
3186
  	root->number_of_cgroups++;
bd89aabc6   Paul Menage   Control groups: R...
3187
  	err = cgroup_create_dir(cgrp, dentry, mode);
ddbcc7e8e   Paul Menage   Task Control Grou...
3188
3189
3190
3191
  	if (err < 0)
  		goto err_remove;
  
  	/* The cgroup directory was pre-locked for us */
bd89aabc6   Paul Menage   Control groups: R...
3192
  	BUG_ON(!mutex_is_locked(&cgrp->dentry->d_inode->i_mutex));
ddbcc7e8e   Paul Menage   Task Control Grou...
3193

bd89aabc6   Paul Menage   Control groups: R...
3194
  	err = cgroup_populate_dir(cgrp);
ddbcc7e8e   Paul Menage   Task Control Grou...
3195
3196
3197
  	/* If err < 0, we have a half-filled directory - oh well ;) */
  
  	mutex_unlock(&cgroup_mutex);
bd89aabc6   Paul Menage   Control groups: R...
3198
  	mutex_unlock(&cgrp->dentry->d_inode->i_mutex);
ddbcc7e8e   Paul Menage   Task Control Grou...
3199
3200
3201
3202
  
  	return 0;
  
   err_remove:
baef99a08   KAMEZAWA Hiroyuki   cgroups: use hier...
3203
  	cgroup_lock_hierarchy(root);
bd89aabc6   Paul Menage   Control groups: R...
3204
  	list_del(&cgrp->sibling);
baef99a08   KAMEZAWA Hiroyuki   cgroups: use hier...
3205
  	cgroup_unlock_hierarchy(root);
ddbcc7e8e   Paul Menage   Task Control Grou...
3206
3207
3208
3209
3210
  	root->number_of_cgroups--;
  
   err_destroy:
  
  	for_each_subsys(root, ss) {
bd89aabc6   Paul Menage   Control groups: R...
3211
3212
  		if (cgrp->subsys[ss->subsys_id])
  			ss->destroy(ss, cgrp);
ddbcc7e8e   Paul Menage   Task Control Grou...
3213
3214
3215
3216
3217
3218
  	}
  
  	mutex_unlock(&cgroup_mutex);
  
  	/* Release the reference count that we took on the superblock */
  	deactivate_super(sb);
bd89aabc6   Paul Menage   Control groups: R...
3219
  	kfree(cgrp);
ddbcc7e8e   Paul Menage   Task Control Grou...
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
  	return err;
  }
  
  /* mkdir handler: create a cgroup for @dentry below its parent's cgroup */
  static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, int mode)
  {
  	struct cgroup *parent_cgrp;
  
  	parent_cgrp = dentry->d_parent->d_fsdata;
  
  	/* the vfs holds inode->i_mutex already */
  	return cgroup_create(parent_cgrp, dentry, mode | S_IFDIR);
  }
55b6fd016   Li Zefan   cgroup: uninline ...
3230
  static int cgroup_has_css_refs(struct cgroup *cgrp)
81a6a5cdd   Paul Menage   Task Control Grou...
3231
3232
3233
  {
  	/* Check the reference count on each subsystem. Since we
  	 * already established that there are no tasks in the
e7c5ec919   Paul Menage   cgroups: add css_...
3234
  	 * cgroup, if the css refcount is also 1, then there should
81a6a5cdd   Paul Menage   Task Control Grou...
3235
3236
3237
3238
3239
3240
3241
  	 * be no outstanding references, so the subsystem is safe to
  	 * destroy. We scan across all subsystems rather than using
  	 * the per-hierarchy linked list of mounted subsystems since
  	 * we can be called via check_for_release() with no
  	 * synchronization other than RCU, and the subsystem linked
  	 * list isn't RCU-safe */
  	int i;
aae8aab40   Ben Blum   cgroups: revamp s...
3242
3243
3244
3245
3246
  	/*
  	 * We won't need to lock the subsys array, because the subsystems
  	 * we're concerned about aren't going anywhere since our cgroup root
  	 * has a reference on them.
  	 */
81a6a5cdd   Paul Menage   Task Control Grou...
3247
3248
3249
  	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
  		struct cgroup_subsys *ss = subsys[i];
  		struct cgroup_subsys_state *css;
aae8aab40   Ben Blum   cgroups: revamp s...
3250
3251
  		/* Skip subsystems not present or not in this hierarchy */
  		if (ss == NULL || ss->root != cgrp->root)
81a6a5cdd   Paul Menage   Task Control Grou...
3252
  			continue;
bd89aabc6   Paul Menage   Control groups: R...
3253
  		css = cgrp->subsys[ss->subsys_id];
81a6a5cdd   Paul Menage   Task Control Grou...
3254
3255
3256
3257
3258
3259
  		/* When called from check_for_release() it's possible
  		 * that by this point the cgroup has been removed
  		 * and the css deleted. But a false-positive doesn't
  		 * matter, since it can only happen if the cgroup
  		 * has been deleted and hence no longer needs the
  		 * release agent to be called anyway. */
e7c5ec919   Paul Menage   cgroups: add css_...
3260
  		if (css && (atomic_read(&css->refcnt) > 1))
81a6a5cdd   Paul Menage   Task Control Grou...
3261
  			return 1;
81a6a5cdd   Paul Menage   Task Control Grou...
3262
3263
3264
  	}
  	return 0;
  }
e7c5ec919   Paul Menage   cgroups: add css_...
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
  /*
   * Atomically mark all (or else none) of the cgroup's CSS objects as
   * CSS_REMOVED. Return true on success, or false if the cgroup has
   * busy subsystems. Call with cgroup_mutex held
   */
  
  static int cgroup_clear_css_refs(struct cgroup *cgrp)
  {
  	struct cgroup_subsys *ss;
  	unsigned long irq_flags;
  	bool busy = false;
  
  	local_irq_save(irq_flags);
  
  	/* Phase 1: try to take every css from refcnt 1 down to 0 */
  	for_each_subsys(cgrp->root, ss) {
  		struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
  		int refcnt;
  
  		for (;;) {
  			/* We can only remove a CSS with a refcnt==1 */
  			refcnt = atomic_read(&css->refcnt);
  			if (refcnt > 1) {
  				busy = true;
  				goto done;
  			}
  			BUG_ON(!refcnt);
  			/*
  			 * Park the refcnt at 0 while the remaining
  			 * subsystems are checked. A racing css_tryget()
  			 * will spin until the CSS_REMOVED bits are set or
  			 * the attempt is aborted.
  			 */
  			if (atomic_cmpxchg(&css->refcnt, refcnt, 0) == refcnt)
  				break;
  			cpu_relax();
  		}
  	}
   done:
  	/* Phase 2: commit the removal everywhere, or roll it back */
  	for_each_subsys(cgrp->root, ss) {
  		struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
  
  		if (!busy) {
  			/* Commit the fact that the CSS is removed */
  			set_bit(CSS_REMOVED, &css->flags);
  		} else if (!atomic_read(&css->refcnt)) {
  			/*
  			 * Restore the old refcnt where phase 1 managed to
  			 * clear it from 1 to 0
  			 */
  			atomic_set(&css->refcnt, 1);
  		}
  	}
  	local_irq_restore(irq_flags);
  	return !busy;
  }
ddbcc7e8e   Paul Menage   Task Control Grou...
3317
3318
  static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
  {
bd89aabc6   Paul Menage   Control groups: R...
3319
  	struct cgroup *cgrp = dentry->d_fsdata;
ddbcc7e8e   Paul Menage   Task Control Grou...
3320
3321
  	struct dentry *d;
  	struct cgroup *parent;
ec64f5154   KAMEZAWA Hiroyuki   cgroup: fix frequ...
3322
  	DEFINE_WAIT(wait);
4ab78683c   Kirill A. Shutemov   cgroups: fix race...
3323
  	struct cgroup_event *event, *tmp;
ec64f5154   KAMEZAWA Hiroyuki   cgroup: fix frequ...
3324
  	int ret;
ddbcc7e8e   Paul Menage   Task Control Grou...
3325
3326
  
  	/* the vfs holds both inode->i_mutex already */
ec64f5154   KAMEZAWA Hiroyuki   cgroup: fix frequ...
3327
  again:
ddbcc7e8e   Paul Menage   Task Control Grou...
3328
  	mutex_lock(&cgroup_mutex);
bd89aabc6   Paul Menage   Control groups: R...
3329
  	if (atomic_read(&cgrp->count) != 0) {
ddbcc7e8e   Paul Menage   Task Control Grou...
3330
3331
3332
  		mutex_unlock(&cgroup_mutex);
  		return -EBUSY;
  	}
bd89aabc6   Paul Menage   Control groups: R...
3333
  	if (!list_empty(&cgrp->children)) {
ddbcc7e8e   Paul Menage   Task Control Grou...
3334
3335
3336
  		mutex_unlock(&cgroup_mutex);
  		return -EBUSY;
  	}
3fa59dfbc   KAMEZAWA Hiroyuki   cgroup: fix poten...
3337
  	mutex_unlock(&cgroup_mutex);
a043e3b2c   Li Zefan   cgroup: fix comments
3338

4fca88c87   KAMEZAWA Hiroyuki   memory cgroup enh...
3339
  	/*
887032670   KAMEZAWA Hiroyuki   cgroup avoid perm...
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
  	 * In general, subsystem has no css->refcnt after pre_destroy(). But
  	 * in racy cases, subsystem may have to get css->refcnt after
  	 * pre_destroy() and it makes rmdir return with -EBUSY. This sometimes
  	 * make rmdir return -EBUSY too often. To avoid that, we use waitqueue
  	 * for cgroup's rmdir. CGRP_WAIT_ON_RMDIR is for synchronizing rmdir
  	 * and subsystem's reference count handling. Please see css_get/put
  	 * and css_tryget() and cgroup_wakeup_rmdir_waiter() implementation.
  	 */
  	set_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
  
  	/*
a043e3b2c   Li Zefan   cgroup: fix comments
3351
3352
  	 * Call pre_destroy handlers of subsys. Notify subsystems
  	 * that rmdir() request comes.
4fca88c87   KAMEZAWA Hiroyuki   memory cgroup enh...
3353
  	 */
ec64f5154   KAMEZAWA Hiroyuki   cgroup: fix frequ...
3354
  	ret = cgroup_call_pre_destroy(cgrp);
887032670   KAMEZAWA Hiroyuki   cgroup avoid perm...
3355
3356
  	if (ret) {
  		clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
ec64f5154   KAMEZAWA Hiroyuki   cgroup: fix frequ...
3357
  		return ret;
887032670   KAMEZAWA Hiroyuki   cgroup avoid perm...
3358
  	}
ddbcc7e8e   Paul Menage   Task Control Grou...
3359

3fa59dfbc   KAMEZAWA Hiroyuki   cgroup: fix poten...
3360
3361
  	mutex_lock(&cgroup_mutex);
  	parent = cgrp->parent;
ec64f5154   KAMEZAWA Hiroyuki   cgroup: fix frequ...
3362
  	if (atomic_read(&cgrp->count) || !list_empty(&cgrp->children)) {
887032670   KAMEZAWA Hiroyuki   cgroup avoid perm...
3363
  		clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
ddbcc7e8e   Paul Menage   Task Control Grou...
3364
3365
3366
  		mutex_unlock(&cgroup_mutex);
  		return -EBUSY;
  	}
ec64f5154   KAMEZAWA Hiroyuki   cgroup: fix frequ...
3367
  	prepare_to_wait(&cgroup_rmdir_waitq, &wait, TASK_INTERRUPTIBLE);
ec64f5154   KAMEZAWA Hiroyuki   cgroup: fix frequ...
3368
3369
  	if (!cgroup_clear_css_refs(cgrp)) {
  		mutex_unlock(&cgroup_mutex);
887032670   KAMEZAWA Hiroyuki   cgroup avoid perm...
3370
3371
3372
3373
3374
3375
  		/*
  		 * Because someone may call cgroup_wakeup_rmdir_waiter() before
  		 * prepare_to_wait(), we need to check this flag.
  		 */
  		if (test_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags))
  			schedule();
ec64f5154   KAMEZAWA Hiroyuki   cgroup: fix frequ...
3376
3377
3378
3379
3380
3381
3382
3383
3384
  		finish_wait(&cgroup_rmdir_waitq, &wait);
  		clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
  		if (signal_pending(current))
  			return -EINTR;
  		goto again;
  	}
  	/* NO css_tryget() can success after here. */
  	finish_wait(&cgroup_rmdir_waitq, &wait);
  	clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
ddbcc7e8e   Paul Menage   Task Control Grou...
3385

81a6a5cdd   Paul Menage   Task Control Grou...
3386
  	spin_lock(&release_list_lock);
bd89aabc6   Paul Menage   Control groups: R...
3387
3388
3389
  	set_bit(CGRP_REMOVED, &cgrp->flags);
  	if (!list_empty(&cgrp->release_list))
  		list_del(&cgrp->release_list);
81a6a5cdd   Paul Menage   Task Control Grou...
3390
  	spin_unlock(&release_list_lock);
999cd8a45   Paul Menage   cgroups: add a pe...
3391
3392
3393
  
  	cgroup_lock_hierarchy(cgrp->root);
  	/* delete this cgroup from parent->children */
bd89aabc6   Paul Menage   Control groups: R...
3394
  	list_del(&cgrp->sibling);
999cd8a45   Paul Menage   cgroups: add a pe...
3395
  	cgroup_unlock_hierarchy(cgrp->root);
bd89aabc6   Paul Menage   Control groups: R...
3396
3397
  	spin_lock(&cgrp->dentry->d_lock);
  	d = dget(cgrp->dentry);
ddbcc7e8e   Paul Menage   Task Control Grou...
3398
3399
3400
3401
  	spin_unlock(&d->d_lock);
  
  	cgroup_d_remove_dir(d);
  	dput(d);
ddbcc7e8e   Paul Menage   Task Control Grou...
3402

bd89aabc6   Paul Menage   Control groups: R...
3403
  	set_bit(CGRP_RELEASABLE, &parent->flags);
81a6a5cdd   Paul Menage   Task Control Grou...
3404
  	check_for_release(parent);
4ab78683c   Kirill A. Shutemov   cgroups: fix race...
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
  	/*
  	 * Unregister events and notify userspace.
  	 * Notify userspace about cgroup removing only after rmdir of cgroup
  	 * directory to avoid race between userspace and kernelspace
  	 */
  	spin_lock(&cgrp->event_list_lock);
  	list_for_each_entry_safe(event, tmp, &cgrp->event_list, list) {
  		list_del(&event->list);
  		remove_wait_queue(event->wqh, &event->wait);
  		eventfd_signal(event->eventfd, 1);
  		schedule_work(&event->remove);
  	}
  	spin_unlock(&cgrp->event_list_lock);
ddbcc7e8e   Paul Menage   Task Control Grou...
3418
  	mutex_unlock(&cgroup_mutex);
ddbcc7e8e   Paul Menage   Task Control Grou...
3419
3420
  	return 0;
  }
06a119204   Li Zefan   cgroup: annotate ...
3421
  /*
   * cgroup_init_subsys - boot-time registration of a built-in subsystem
   * @ss: the subsystem to initialize
   *
   * Attaches @ss to rootnode, creates its css for dummytop and records that
   * css in init_css_set.  Any failure here is fatal (BUG).
   */
  static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
  {
  	struct cgroup_subsys_state *css;

  	printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name);

  	/* Create the top cgroup state for this subsystem */
  	list_add(&ss->sibling, &rootnode.subsys_list);
  	ss->root = &rootnode;
  	css = ss->create(ss, dummytop);
  	/* We don't handle early failures gracefully */
  	BUG_ON(IS_ERR(css));
  	init_cgroup_css(css, ss, dummytop);

  	/* Update the init_css_set to contain a subsys
  	 * pointer to this state - since the subsystem is
  	 * newly registered, all tasks and hence the
  	 * init_css_set is in the subsystem's top cgroup. */
  	init_css_set.subsys[ss->subsys_id] = dummytop->subsys[ss->subsys_id];

  	need_forkexit_callback |= ss->fork || ss->exit;

  	/* At system boot, before all subsystems have been
  	 * registered, no tasks have been forked, so we don't
  	 * need to invoke fork callbacks here. */
  	BUG_ON(!list_empty(&init_task.tasks));

  	mutex_init(&ss->hierarchy_mutex);
  	lockdep_set_class(&ss->hierarchy_mutex, &ss->subsys_key);
  	ss->active = 1;

  	/* this function shouldn't be used with modular subsystems, since they
  	 * need to register a subsys_id, among other things */
  	BUG_ON(ss->module);
  }
  
  /**
   * cgroup_load_subsys: load and register a modular subsystem at runtime
   * @ss: the subsystem to load
   *
   * This function should be called in a modular subsystem's initcall. If the
   * subsystem is built as a module, it will be assigned a new subsys_id and set
   * up for use. If the subsystem is built-in anyway, work is delegated to the
   * simpler cgroup_init_subsys.
   *
   * Returns 0 on success, -EINVAL if @ss is malformed or uses fork/exit
   * callbacks, -EBUSY if no free subsys_id slot is left, or the error
   * reported by ss->create() / cgroup_init_idr().
   */
  int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
  {
  	int i;
  	struct cgroup_subsys_state *css;

  	/* check name and function validity */
  	if (ss->name == NULL || strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN ||
  	    ss->create == NULL || ss->destroy == NULL)
  		return -EINVAL;

  	/*
  	 * we don't support callbacks in modular subsystems. this check is
  	 * before the ss->module check for consistency; a subsystem that could
  	 * be a module should still have no callbacks even if the user isn't
  	 * compiling it as one.
  	 */
  	if (ss->fork || ss->exit)
  		return -EINVAL;

  	/*
  	 * an optionally modular subsystem is built-in: we want to do nothing,
  	 * since cgroup_init_subsys will have already taken care of it.
  	 */
  	if (ss->module == NULL) {
  		/* a few sanity checks */
  		BUG_ON(ss->subsys_id >= CGROUP_BUILTIN_SUBSYS_COUNT);
  		BUG_ON(subsys[ss->subsys_id] != ss);
  		return 0;
  	}

  	/*
  	 * need to register a subsys id before anything else - for example,
  	 * init_cgroup_css needs it.
  	 */
  	mutex_lock(&cgroup_mutex);
  	/* find the first empty slot in the array */
  	for (i = CGROUP_BUILTIN_SUBSYS_COUNT; i < CGROUP_SUBSYS_COUNT; i++) {
  		if (subsys[i] == NULL)
  			break;
  	}
  	if (i == CGROUP_SUBSYS_COUNT) {
  		/* maximum number of subsystems already registered! */
  		mutex_unlock(&cgroup_mutex);
  		return -EBUSY;
  	}
  	/* assign ourselves the subsys_id */
  	ss->subsys_id = i;
  	subsys[i] = ss;

  	/*
  	 * no ss->create seems to need anything important in the ss struct, so
  	 * this can happen first (i.e. before the rootnode attachment).
  	 */
  	css = ss->create(ss, dummytop);
  	if (IS_ERR(css)) {
  		/* failure case - need to deassign the subsys[] slot. */
  		subsys[i] = NULL;
  		mutex_unlock(&cgroup_mutex);
  		return PTR_ERR(css);
  	}

  	list_add(&ss->sibling, &rootnode.subsys_list);
  	ss->root = &rootnode;

  	/* our new subsystem will be attached to the dummy hierarchy. */
  	init_cgroup_css(css, ss, dummytop);
  	/* init_idr must be after init_cgroup_css because it sets css->id. */
  	if (ss->use_id) {
  		int ret = cgroup_init_idr(ss, css);
  		if (ret) {
  			/*
  			 * Unwind everything done so far. ss->destroy needs
  			 * the dummytop->subsys pointer to find its state, so
  			 * free first and clear the pointer afterwards (same
  			 * order as cgroup_unload_subsys). The subsystem must
  			 * also come off rootnode's subsys_list again. No
  			 * css_set has been touched yet at this point.
  			 */
  			ss->destroy(ss, dummytop);
  			dummytop->subsys[ss->subsys_id] = NULL;
  			list_del(&ss->sibling);
  			subsys[i] = NULL;
  			mutex_unlock(&cgroup_mutex);
  			return ret;
  		}
  	}

  	/*
  	 * Now we need to entangle the css into the existing css_sets. unlike
  	 * in cgroup_init_subsys, there are now multiple css_sets, so each one
  	 * will need a new pointer to it; done by iterating the css_set_table.
  	 * furthermore, modifying the existing css_sets will corrupt the hash
  	 * table state, so each changed css_set will need its hash recomputed.
  	 * this is all done under the css_set_lock.
  	 */
  	write_lock(&css_set_lock);
  	for (i = 0; i < CSS_SET_TABLE_SIZE; i++) {
  		struct css_set *cg;
  		struct hlist_node *node, *tmp;
  		struct hlist_head *bucket = &css_set_table[i], *new_bucket;

  		hlist_for_each_entry_safe(cg, node, tmp, bucket, hlist) {
  			/* skip entries that we already rehashed */
  			if (cg->subsys[ss->subsys_id])
  				continue;
  			/* remove existing entry */
  			hlist_del(&cg->hlist);
  			/* set new value */
  			cg->subsys[ss->subsys_id] = css;
  			/* recompute hash and restore entry */
  			new_bucket = css_set_hash(cg->subsys);
  			hlist_add_head(&cg->hlist, new_bucket);
  		}
  	}
  	write_unlock(&css_set_lock);

  	mutex_init(&ss->hierarchy_mutex);
  	lockdep_set_class(&ss->hierarchy_mutex, &ss->subsys_key);
  	ss->active = 1;

  	/* success! */
  	mutex_unlock(&cgroup_mutex);
  	return 0;
  }
e6a1105ba   Ben Blum   cgroups: subsyste...
3578
  EXPORT_SYMBOL_GPL(cgroup_load_subsys);
ddbcc7e8e   Paul Menage   Task Control Grou...
3579
3580
  
  /**
cf5d5941f   Ben Blum   cgroups: subsyste...
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
   * cgroup_unload_subsys: unload a modular subsystem
   * @ss: the subsystem to unload
   *
   * This function should be called in a modular subsystem's exitcall. When this
   * function is invoked, the refcount on the subsystem's module will be 0, so
   * the subsystem will not be attached to any hierarchy.
   */
  void cgroup_unload_subsys(struct cgroup_subsys *ss)
  {
  	struct cg_cgroup_link *cg_link;

  	/* only modular subsystems may be unloaded */
  	BUG_ON(ss->module == NULL);

  	/*
  	 * The subsystem must not be in use when we get here; try_module_get
  	 * in parse_cgroupfs_options should keep it from becoming used while
  	 * it is being torn down.
  	 */
  	BUG_ON(ss->root != &rootnode);

  	mutex_lock(&cgroup_mutex);

  	/* give the subsys_id slot back */
  	BUG_ON(ss->subsys_id < CGROUP_BUILTIN_SUBSYS_COUNT);
  	subsys[ss->subsys_id] = NULL;

  	/* drop the subsystem from rootnode's subsystem list */
  	list_del(&ss->sibling);

  	/*
  	 * Detach the css from every css_set linked to dummytop. Clearing the
  	 * pointer changes the set's hash, so each set is unhashed first and
  	 * re-inserted into the bucket matching its new contents.
  	 */
  	write_lock(&css_set_lock);
  	list_for_each_entry(cg_link, &dummytop->css_sets, cgrp_link_list) {
  		struct css_set *set = cg_link->cg;
  		struct hlist_head *bucket;

  		hlist_del(&set->hlist);
  		BUG_ON(!set->subsys[ss->subsys_id]);
  		set->subsys[ss->subsys_id] = NULL;
  		bucket = css_set_hash(set->subsys);
  		hlist_add_head(&set->hlist, bucket);
  	}
  	write_unlock(&css_set_lock);

  	/*
  	 * Free the subsystem's css on dummytop. The free has to precede the
  	 * NULL assignment because ss->destroy locates its state through the
  	 * cgrp->subsys pointer. This also releases the css_id.
  	 */
  	ss->destroy(ss, dummytop);
  	dummytop->subsys[ss->subsys_id] = NULL;

  	mutex_unlock(&cgroup_mutex);
  }
  EXPORT_SYMBOL_GPL(cgroup_unload_subsys);
  
  /**
a043e3b2c   Li Zefan   cgroup: fix comments
3640
3641
3642
3643
   * cgroup_init_early - cgroup initialization at system boot
   *
   * Initialize cgroups at system boot, and initialize any
   * subsystems that request early init.
ddbcc7e8e   Paul Menage   Task Control Grou...
3644
3645
3646
3647
   */
  /*
   * Sets up init_css_set, rootnode and the css_set hash table, validates
   * every built-in subsystem and initializes those with ->early_init set.
   * Always returns 0; inconsistencies are fatal (BUG).
   */
  int __init cgroup_init_early(void)
  {
  	int i;
  	atomic_set(&init_css_set.refcount, 1);
  	INIT_LIST_HEAD(&init_css_set.cg_links);
  	INIT_LIST_HEAD(&init_css_set.tasks);
  	INIT_HLIST_NODE(&init_css_set.hlist);
  	css_set_count = 1;
  	init_cgroup_root(&rootnode);
  	root_count = 1;
  	init_task.cgroups = &init_css_set;

  	/* link init_css_set and dummytop to each other */
  	init_css_set_link.cg = &init_css_set;
  	init_css_set_link.cgrp = dummytop;
  	list_add(&init_css_set_link.cgrp_link_list,
  		 &rootnode.top_cgroup.css_sets);
  	list_add(&init_css_set_link.cg_link_list,
  		 &init_css_set.cg_links);

  	for (i = 0; i < CSS_SET_TABLE_SIZE; i++)
  		INIT_HLIST_HEAD(&css_set_table[i]);

  	/* at bootup time, we don't worry about modular subsystems */
  	for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
  		struct cgroup_subsys *ss = subsys[i];

  		BUG_ON(!ss->name);
  		BUG_ON(strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN);
  		BUG_ON(!ss->create);
  		BUG_ON(!ss->destroy);
  		/* a built-in subsystem's id must equal its subsys[] index */
  		if (ss->subsys_id != i) {
  			printk(KERN_ERR "cgroup: Subsys %s id == %d\n",
  			       ss->name, ss->subsys_id);
  			BUG();
  		}

  		if (ss->early_init)
  			cgroup_init_subsys(ss);
  	}
  	return 0;
  }
  
  /**
a043e3b2c   Li Zefan   cgroup: fix comments
3688
3689
3690
3691
   * cgroup_init - cgroup initialization
   *
   * Register cgroup filesystem and /proc file, and initialize
   * any subsystems that didn't request early init.
ddbcc7e8e   Paul Menage   Task Control Grou...
3692
3693
3694
3695
3696
   */
  /*
   * Initializes the cgroup backing_dev_info, brings up every built-in
   * subsystem that did not ask for early init, hashes init_css_set and
   * registers the cgroup filesystem plus the "cgroups" proc entry.
   * Returns 0 on success or the error from bdi_init()/register_filesystem().
   */
  int __init cgroup_init(void)
  {
  	int err;
  	int i;
  	struct hlist_head *bucket;

  	err = bdi_init(&cgroup_backing_dev_info);
  	if (err)
  		return err;

  	/* at bootup time, we don't worry about modular subsystems */
  	for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
  		struct cgroup_subsys *ss = subsys[i];

  		if (!ss->early_init)
  			cgroup_init_subsys(ss);
  		/* NOTE(review): the result of cgroup_init_idr() is ignored here */
  		if (ss->use_id)
  			cgroup_init_idr(ss, init_css_set.subsys[ss->subsys_id]);
  	}

  	/* Add init_css_set to the hash table */
  	bucket = css_set_hash(init_css_set.subsys);
  	hlist_add_head(&init_css_set.hlist, bucket);
  	BUG_ON(!init_root_id(&rootnode));

  	err = register_filesystem(&cgroup_fs_type);
  	if (err >= 0)
  		proc_create("cgroups", 0, NULL, &proc_cgroupstats_operations);

  	/* undo bdi_init() when anything above reported an error */
  	if (err)
  		bdi_destroy(&cgroup_backing_dev_info);

  	return err;
  }
b4f48b636   Paul Menage   Task Control Grou...
3725

a424316ca   Paul Menage   Task Control Grou...
3726
3727
3728
3729
3730
3731
  /*
   * proc_cgroup_show()
   *  - Print task's cgroup paths into seq_file, one line for each hierarchy
   *  - Used for /proc/<pid>/cgroup.
   *  - No need to task_lock(tsk) on this tsk->cgroup reference, as it
   *    doesn't really matter if tsk->cgroup changes after we read it,
956db3ca0   Cliff Wickman   hotplug cpu: move...
3732
   *    and we take cgroup_mutex, keeping cgroup_attach_task() from changing it
a424316ca   Paul Menage   Task Control Grou...
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
   *    anyway.  No need to check that tsk->cgroup != NULL, thanks to
   *    the_top_cgroup_hack in cgroup_exit(), which sets an exiting task's
   *    cgroup to top_cgroup.
   */
  
  /* TODO: Use a proper seq_file iterator */
  static int proc_cgroup_show(struct seq_file *m, void *v)
  {
  	struct pid *pid;
  	struct task_struct *tsk;
  	char *buf;
  	int retval;
  	struct cgroupfs_root *root;
  
  	retval = -ENOMEM;
  	buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
  	if (!buf)
  		goto out;
  
  	retval = -ESRCH;
  	pid = m->private;
  	tsk = get_pid_task(pid, PIDTYPE_PID);
  	if (!tsk)
  		goto out_free;
  
  	retval = 0;
  
  	mutex_lock(&cgroup_mutex);
e5f6a8609   Li Zefan   cgroups: make roo...
3761
  	for_each_active_root(root) {
a424316ca   Paul Menage   Task Control Grou...
3762
  		struct cgroup_subsys *ss;
bd89aabc6   Paul Menage   Control groups: R...
3763
  		struct cgroup *cgrp;
a424316ca   Paul Menage   Task Control Grou...
3764
  		int count = 0;
2c6ab6d20   Paul Menage   cgroups: allow cg...
3765
  		seq_printf(m, "%d:", root->hierarchy_id);
a424316ca   Paul Menage   Task Control Grou...
3766
3767
  		for_each_subsys(root, ss)
  			seq_printf(m, "%s%s", count++ ? "," : "", ss->name);
c6d57f331   Paul Menage   cgroups: support ...
3768
3769
3770
  		if (strlen(root->name))
  			seq_printf(m, "%sname=%s", count ? "," : "",
  				   root->name);
a424316ca   Paul Menage   Task Control Grou...
3771
  		seq_putc(m, ':');
7717f7ba9   Paul Menage   cgroups: add a ba...
3772
  		cgrp = task_cgroup_from_root(tsk, root);
bd89aabc6   Paul Menage   Control groups: R...
3773
  		retval = cgroup_path(cgrp, buf, PAGE_SIZE);
a424316ca   Paul Menage   Task Control Grou...
3774
3775
3776
3777
3778
3779
3780
3781
3782
3783
3784
3785
3786
3787
3788
3789
3790
3791
3792
3793
3794
  		if (retval < 0)
  			goto out_unlock;
  		seq_puts(m, buf);
  		seq_putc(m, '
  ');
  	}
  
  out_unlock:
  	mutex_unlock(&cgroup_mutex);
  	put_task_struct(tsk);
  out_free:
  	kfree(buf);
  out:
  	return retval;
  }
  
  /*
   * Open handler: binds proc_cgroup_show() to @file as a single-record
   * seq_file, passing the struct pid taken from the proc inode.
   */
  static int cgroup_open(struct inode *inode, struct file *file)
  {
  	return single_open(file, proc_cgroup_show, PROC_I(inode)->pid);
  }
828c09509   Alexey Dobriyan   const: constify r...
3795
  const struct file_operations proc_cgroup_operations = {
a424316ca   Paul Menage   Task Control Grou...
3796
3797
3798
3799
3800
3801
3802
3803
3804
3805
  	.open		= cgroup_open,
  	.read		= seq_read,
  	.llseek		= seq_lseek,
  	.release	= single_release,
  };
  
  /* Display information about each subsystem and each hierarchy */
  static int proc_cgroupstats_show(struct seq_file *m, void *v)
  {
  	int i;
a424316ca   Paul Menage   Task Control Grou...
3806

8bab8dded   Paul Menage   cgroups: add cgro...
3807
3808
  	seq_puts(m, "#subsys_name\thierarchy\tnum_cgroups\tenabled
  ");
aae8aab40   Ben Blum   cgroups: revamp s...
3809
3810
3811
3812
3813
  	/*
  	 * ideally we don't want subsystems moving around while we do this.
  	 * cgroup_mutex is also necessary to guarantee an atomic snapshot of
  	 * subsys/hierarchy state.
  	 */
a424316ca   Paul Menage   Task Control Grou...
3814
  	mutex_lock(&cgroup_mutex);
a424316ca   Paul Menage   Task Control Grou...
3815
3816
  	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
  		struct cgroup_subsys *ss = subsys[i];
aae8aab40   Ben Blum   cgroups: revamp s...
3817
3818
  		if (ss == NULL)
  			continue;
2c6ab6d20   Paul Menage   cgroups: allow cg...
3819
3820
3821
  		seq_printf(m, "%s\t%d\t%d\t%d
  ",
  			   ss->name, ss->root->hierarchy_id,
8bab8dded   Paul Menage   cgroups: add cgro...
3822
  			   ss->root->number_of_cgroups, !ss->disabled);
a424316ca   Paul Menage   Task Control Grou...
3823
3824
3825
3826
3827
3828
3829
  	}
  	mutex_unlock(&cgroup_mutex);
  	return 0;
  }
  
  static int cgroupstats_open(struct inode *inode, struct file *file)
  {
9dce07f1a   Al Viro   NULL noise: fs/*,...
3830
  	return single_open(file, proc_cgroupstats_show, NULL);
a424316ca   Paul Menage   Task Control Grou...
3831
  }
828c09509   Alexey Dobriyan   const: constify r...
3832
  static const struct file_operations proc_cgroupstats_operations = {
a424316ca   Paul Menage   Task Control Grou...
3833
3834
3835
3836
3837
  	.open = cgroupstats_open,
  	.read = seq_read,
  	.llseek = seq_lseek,
  	.release = single_release,
  };
b4f48b636   Paul Menage   Task Control Grou...
3838
3839
  /**
   * cgroup_fork - attach newly forked task to its parent's cgroup.
a043e3b2c   Li Zefan   cgroup: fix comments
3840
   * @child: pointer to task_struct of the newly forked child process.
b4f48b636   Paul Menage   Task Control Grou...
3841
3842
3843
3844
3845
3846
   *
   * Description: A task inherits its parent's cgroup at fork().
   *
   * A pointer to the shared css_set was automatically copied in
   * fork.c by dup_task_struct().  However, we ignore that copy, since
   * it was not made under the protection of RCU or cgroup_mutex, so
956db3ca0   Cliff Wickman   hotplug cpu: move...
3847
   * might no longer be a valid cgroup pointer.  cgroup_attach_task() might
817929ec2   Paul Menage   Task Control Grou...
3848
3849
   * have already changed current->cgroups, allowing the previously
   * referenced cgroup group to be removed and freed.
b4f48b636   Paul Menage   Task Control Grou...
3850
3851
3852
3853
3854
3855
   *
   * At the point that cgroup_fork() is called, 'current' is the parent
   * task, and the passed argument 'child' points to the child task.
   */
  void cgroup_fork(struct task_struct *child)
  {
  	struct css_set *inherited;

  	/* read the parent's css_set and take the reference under task_lock */
  	task_lock(current);
  	inherited = current->cgroups;
  	child->cgroups = inherited;
  	get_css_set(inherited);
  	task_unlock(current);

  	/* the child starts out on no css_set task list */
  	INIT_LIST_HEAD(&child->cg_list);
  }
  
  /**
a043e3b2c   Li Zefan   cgroup: fix comments
3864
3865
3866
3867
3868
3869
   * cgroup_fork_callbacks - run fork callbacks
   * @child: the new task
   *
   * Called on a new task very soon before adding it to the
   * tasklist. No need to take any locks since no-one can
   * be operating on this task.
b4f48b636   Paul Menage   Task Control Grou...
3870
3871
3872
3873
3874
   */
  void cgroup_fork_callbacks(struct task_struct *child)
  {
  	if (need_forkexit_callback) {
  		int i;
aae8aab40   Ben Blum   cgroups: revamp s...
3875
3876
3877
3878
3879
3880
  		/*
  		 * forkexit callbacks are only supported for builtin
  		 * subsystems, and the builtin section of the subsys array is
  		 * immutable, so we don't need to lock the subsys array here.
  		 */
  		for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
b4f48b636   Paul Menage   Task Control Grou...
3881
3882
3883
3884
3885
3886
3887
3888
  			struct cgroup_subsys *ss = subsys[i];
  			if (ss->fork)
  				ss->fork(ss, child);
  		}
  	}
  }
  
  /**
a043e3b2c   Li Zefan   cgroup: fix comments
3889
3890
3891
3892
3893
3894
3895
3896
   * cgroup_post_fork - called on a new task after adding it to the task list
   * @child: the task in question
   *
   * Adds the task to the list running through its css_set if necessary.
   * Has to be after the task is visible on the task list in case we race
   * with the first call to cgroup_iter_start() - to guarantee that the
   * new task ends up on its list.
   */
817929ec2   Paul Menage   Task Control Grou...
3897
3898
3899
3900
  void cgroup_post_fork(struct task_struct *child)
  {
  	if (use_task_css_set_links) {
  		write_lock(&css_set_lock);
b12b533fa   Lai Jiangshan   cgroups: add lock...
3901
  		task_lock(child);
817929ec2   Paul Menage   Task Control Grou...
3902
3903
  		if (list_empty(&child->cg_list))
  			list_add(&child->cg_list, &child->cgroups->tasks);
b12b533fa   Lai Jiangshan   cgroups: add lock...
3904
  		task_unlock(child);
817929ec2   Paul Menage   Task Control Grou...
3905
3906
3907
3908
  		write_unlock(&css_set_lock);
  	}
  }
  /**
b4f48b636   Paul Menage   Task Control Grou...
3909
3910
   * cgroup_exit - detach cgroup from exiting task
   * @tsk: pointer to task_struct of exiting process
a043e3b2c   Li Zefan   cgroup: fix comments
3911
   * @run_callbacks: run exit callbacks?
b4f48b636   Paul Menage   Task Control Grou...
3912
3913
3914
3915
3916
3917
3918
3919
3920
3921
3922
3923
3924
3925
3926
3927
3928
3929
3930
3931
3932
3933
3934
3935
3936
3937
3938
3939
   *
   * Description: Detach cgroup from @tsk and release it.
   *
   * Note that cgroups marked notify_on_release force every task in
   * them to take the global cgroup_mutex mutex when exiting.
   * This could impact scaling on very large systems.  Be reluctant to
   * use notify_on_release cgroups where very high task exit scaling
   * is required on large systems.
   *
   * the_top_cgroup_hack:
   *
   *    Set the exiting tasks cgroup to the root cgroup (top_cgroup).
   *
   *    We call cgroup_exit() while the task is still competent to
   *    handle notify_on_release(), then leave the task attached to the
   *    root cgroup in each hierarchy for the remainder of its exit.
   *
   *    To do this properly, we would increment the reference count on
   *    top_cgroup, and near the very end of the kernel/exit.c do_exit()
   *    code we would add a second cgroup function call, to drop that
   *    reference.  This would just create an unnecessary hot spot on
   *    the top_cgroup reference count, to no avail.
   *
   *    Normally, holding a reference to a cgroup without bumping its
   *    count is unsafe.   The cgroup could go away, or someone could
   *    attach us to a different cgroup, decrementing the count on
   *    the first cgroup that we never incremented.  But in this case,
   *    top_cgroup isn't going away, and either task has PF_EXITING set,
956db3ca0   Cliff Wickman   hotplug cpu: move...
3940
3941
   *    which wards off any cgroup_attach_task() attempts, or task is a failed
   *    fork, never visible to cgroup_attach_task.
b4f48b636   Paul Menage   Task Control Grou...
3942
3943
3944
3945
   */
  void cgroup_exit(struct task_struct *tsk, int run_callbacks)
  {
  	int i;
817929ec2   Paul Menage   Task Control Grou...
3946
  	struct css_set *cg;
b4f48b636   Paul Menage   Task Control Grou...
3947
3948
  
  	if (run_callbacks && need_forkexit_callback) {
aae8aab40   Ben Blum   cgroups: revamp s...
3949
3950
3951
3952
3953
  		/*
  		 * modular subsystems can't use callbacks, so no need to lock
  		 * the subsys array
  		 */
  		for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
b4f48b636   Paul Menage   Task Control Grou...
3954
3955
3956
3957
3958
  			struct cgroup_subsys *ss = subsys[i];
  			if (ss->exit)
  				ss->exit(ss, tsk);
  		}
  	}
817929ec2   Paul Menage   Task Control Grou...
3959
3960
3961
3962
3963
3964
3965
3966
3967
3968
3969
3970
  
  	/*
  	 * Unlink from the css_set task list if necessary.
  	 * Optimistically check cg_list before taking
  	 * css_set_lock
  	 */
  	if (!list_empty(&tsk->cg_list)) {
  		write_lock(&css_set_lock);
  		if (!list_empty(&tsk->cg_list))
  			list_del(&tsk->cg_list);
  		write_unlock(&css_set_lock);
  	}
b4f48b636   Paul Menage   Task Control Grou...
3971
3972
  	/* Reassign the task to the init_css_set. */
  	task_lock(tsk);
817929ec2   Paul Menage   Task Control Grou...
3973
3974
  	cg = tsk->cgroups;
  	tsk->cgroups = &init_css_set;
b4f48b636   Paul Menage   Task Control Grou...
3975
  	task_unlock(tsk);
817929ec2   Paul Menage   Task Control Grou...
3976
  	if (cg)
81a6a5cdd   Paul Menage   Task Control Grou...
3977
  		put_css_set_taskexit(cg);
b4f48b636   Paul Menage   Task Control Grou...
3978
  }
697f41610   Paul Menage   Task Control Grou...
3979
3980
  
  /**
a043e3b2c   Li Zefan   cgroup: fix comments
3981
3982
3983
   * cgroup_clone - clone the cgroup the given subsystem is attached to
   * @tsk: the task to be moved
   * @subsys: the given subsystem
e885dcde7   Serge E. Hallyn   cgroup_clone: use...
3984
   * @nodename: the name for the new cgroup
a043e3b2c   Li Zefan   cgroup: fix comments
3985
3986
3987
3988
   *
   * Duplicate the current cgroup in the hierarchy that the given
   * subsystem is attached to, and move this task into the new
   * child.
697f41610   Paul Menage   Task Control Grou...
3989
   */
e885dcde7   Serge E. Hallyn   cgroup_clone: use...
3990
3991
  int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys,
  							char *nodename)
697f41610   Paul Menage   Task Control Grou...
3992
3993
3994
  {
  	struct dentry *dentry;
  	int ret = 0;
697f41610   Paul Menage   Task Control Grou...
3995
3996
3997
3998
3999
4000
4001
4002
4003
4004
4005
4006
4007
4008
4009
  	struct cgroup *parent, *child;
  	struct inode *inode;
  	struct css_set *cg;
  	struct cgroupfs_root *root;
  	struct cgroup_subsys *ss;
  
  	/* We shouldn't be called by an unregistered subsystem */
  	BUG_ON(!subsys->active);
  
  	/* First figure out what hierarchy and cgroup we're dealing
  	 * with, and pin them so we can drop cgroup_mutex */
  	mutex_lock(&cgroup_mutex);
   again:
  	root = subsys->root;
  	if (root == &rootnode) {
697f41610   Paul Menage   Task Control Grou...
4010
4011
4012
  		mutex_unlock(&cgroup_mutex);
  		return 0;
  	}
697f41610   Paul Menage   Task Control Grou...
4013

697f41610   Paul Menage   Task Control Grou...
4014
  	/* Pin the hierarchy */
1404f0656   Li Zefan   cgroups: fix lock...
4015
  	if (!atomic_inc_not_zero(&root->sb->s_active)) {
7b574b7b0   Li Zefan   cgroups: fix a ra...
4016
4017
4018
4019
  		/* We race with the final deactivate_super() */
  		mutex_unlock(&cgroup_mutex);
  		return 0;
  	}
697f41610   Paul Menage   Task Control Grou...
4020

817929ec2   Paul Menage   Task Control Grou...
4021
  	/* Keep the cgroup alive */
1404f0656   Li Zefan   cgroups: fix lock...
4022
4023
4024
  	task_lock(tsk);
  	parent = task_cgroup(tsk, subsys->subsys_id);
  	cg = tsk->cgroups;
817929ec2   Paul Menage   Task Control Grou...
4025
  	get_css_set(cg);
104cbd553   Lai Jiangshan   cgroups: use task...
4026
  	task_unlock(tsk);
1404f0656   Li Zefan   cgroups: fix lock...
4027

697f41610   Paul Menage   Task Control Grou...
4028
4029
4030
4031
4032
4033
4034
4035
4036
4037
4038
  	mutex_unlock(&cgroup_mutex);
  
  	/* Now do the VFS work to create a cgroup */
  	inode = parent->dentry->d_inode;
  
  	/* Hold the parent directory mutex across this operation to
  	 * stop anyone else deleting the new cgroup */
  	mutex_lock(&inode->i_mutex);
  	dentry = lookup_one_len(nodename, parent->dentry, strlen(nodename));
  	if (IS_ERR(dentry)) {
  		printk(KERN_INFO
cfe36bde5   Diego Calleja   Improve cgroup pr...
4039
4040
  		       "cgroup: Couldn't allocate dentry for %s: %ld
  ", nodename,
697f41610   Paul Menage   Task Control Grou...
4041
4042
4043
4044
4045
4046
  		       PTR_ERR(dentry));
  		ret = PTR_ERR(dentry);
  		goto out_release;
  	}
  
  	/* Create the cgroup directory, which also creates the cgroup */
75139b827   Li Zefan   cgroups: remove s...
4047
  	ret = vfs_mkdir(inode, dentry, 0755);
bd89aabc6   Paul Menage   Control groups: R...
4048
  	child = __d_cgrp(dentry);
697f41610   Paul Menage   Task Control Grou...
4049
4050
4051
4052
4053
4054
4055
4056
  	dput(dentry);
  	if (ret) {
  		printk(KERN_INFO
  		       "Failed to create cgroup %s: %d
  ", nodename,
  		       ret);
  		goto out_release;
  	}
697f41610   Paul Menage   Task Control Grou...
4057
4058
4059
4060
4061
4062
4063
4064
  	/* The cgroup now exists. Retake cgroup_mutex and check
  	 * that we're still in the same state that we thought we
  	 * were. */
  	mutex_lock(&cgroup_mutex);
  	if ((root != subsys->root) ||
  	    (parent != task_cgroup(tsk, subsys->subsys_id))) {
  		/* Aargh, we raced ... */
  		mutex_unlock(&inode->i_mutex);
817929ec2   Paul Menage   Task Control Grou...
4065
  		put_css_set(cg);
697f41610   Paul Menage   Task Control Grou...
4066

1404f0656   Li Zefan   cgroups: fix lock...
4067
  		deactivate_super(root->sb);
697f41610   Paul Menage   Task Control Grou...
4068
4069
4070
4071
4072
4073
4074
4075
4076
4077
4078
4079
4080
4081
4082
4083
4084
  		/* The cgroup is still accessible in the VFS, but
  		 * we're not going to try to rmdir() it at this
  		 * point. */
  		printk(KERN_INFO
  		       "Race in cgroup_clone() - leaking cgroup %s
  ",
  		       nodename);
  		goto again;
  	}
  
  	/* do any required auto-setup */
  	for_each_subsys(root, ss) {
  		if (ss->post_clone)
  			ss->post_clone(ss, child);
  	}
  
  	/* All seems fine. Finish by moving the task into the new cgroup */
956db3ca0   Cliff Wickman   hotplug cpu: move...
4085
  	ret = cgroup_attach_task(child, tsk);
697f41610   Paul Menage   Task Control Grou...
4086
4087
4088
4089
  	mutex_unlock(&cgroup_mutex);
  
   out_release:
  	mutex_unlock(&inode->i_mutex);
81a6a5cdd   Paul Menage   Task Control Grou...
4090
4091
  
  	mutex_lock(&cgroup_mutex);
817929ec2   Paul Menage   Task Control Grou...
4092
  	put_css_set(cg);
81a6a5cdd   Paul Menage   Task Control Grou...
4093
  	mutex_unlock(&cgroup_mutex);
1404f0656   Li Zefan   cgroups: fix lock...
4094
  	deactivate_super(root->sb);
697f41610   Paul Menage   Task Control Grou...
4095
4096
  	return ret;
  }
a043e3b2c   Li Zefan   cgroup: fix comments
4097
  /**
313e924c0   Grzegorz Nosek   cgroups: relax ns...
4098
   * cgroup_is_descendant - see if @cgrp is a descendant of @task's cgrp
a043e3b2c   Li Zefan   cgroup: fix comments
4099
   * @cgrp: the cgroup in question
313e924c0   Grzegorz Nosek   cgroups: relax ns...
4100
   * @task: the task in question
a043e3b2c   Li Zefan   cgroup: fix comments
4101
   *
313e924c0   Grzegorz Nosek   cgroups: relax ns...
4102
4103
   * See if @cgrp is a descendant of @task's cgroup in the appropriate
   * hierarchy.
697f41610   Paul Menage   Task Control Grou...
4104
4105
4106
4107
4108
4109
   *
   * If we are sending in dummytop, then presumably we are creating
   * the top cgroup in the subsystem.
   *
   * Called only by the ns (nsproxy) cgroup.
   */
313e924c0   Grzegorz Nosek   cgroups: relax ns...
4110
  int cgroup_is_descendant(const struct cgroup *cgrp, struct task_struct *task)
697f41610   Paul Menage   Task Control Grou...
4111
4112
4113
  {
  	int ret;
  	struct cgroup *target;
697f41610   Paul Menage   Task Control Grou...
4114

bd89aabc6   Paul Menage   Control groups: R...
4115
  	if (cgrp == dummytop)
697f41610   Paul Menage   Task Control Grou...
4116
  		return 1;
7717f7ba9   Paul Menage   cgroups: add a ba...
4117
  	target = task_cgroup_from_root(task, cgrp->root);
bd89aabc6   Paul Menage   Control groups: R...
4118
4119
4120
  	while (cgrp != target && cgrp!= cgrp->top_cgroup)
  		cgrp = cgrp->parent;
  	ret = (cgrp == target);
697f41610   Paul Menage   Task Control Grou...
4121
4122
  	return ret;
  }
81a6a5cdd   Paul Menage   Task Control Grou...
4123

bd89aabc6   Paul Menage   Control groups: R...
4124
  static void check_for_release(struct cgroup *cgrp)
81a6a5cdd   Paul Menage   Task Control Grou...
4125
4126
4127
  {
  	/* All of these checks rely on RCU to keep the cgroup
  	 * structure alive */
bd89aabc6   Paul Menage   Control groups: R...
4128
4129
  	if (cgroup_is_releasable(cgrp) && !atomic_read(&cgrp->count)
  	    && list_empty(&cgrp->children) && !cgroup_has_css_refs(cgrp)) {
81a6a5cdd   Paul Menage   Task Control Grou...
4130
4131
4132
4133
4134
  		/* Control Group is currently removeable. If it's not
  		 * already queued for a userspace notification, queue
  		 * it now */
  		int need_schedule_work = 0;
  		spin_lock(&release_list_lock);
bd89aabc6   Paul Menage   Control groups: R...
4135
4136
4137
  		if (!cgroup_is_removed(cgrp) &&
  		    list_empty(&cgrp->release_list)) {
  			list_add(&cgrp->release_list, &release_list);
81a6a5cdd   Paul Menage   Task Control Grou...
4138
4139
4140
4141
4142
4143
4144
  			need_schedule_work = 1;
  		}
  		spin_unlock(&release_list_lock);
  		if (need_schedule_work)
  			schedule_work(&release_agent_work);
  	}
  }
d7b9fff71   Daisuke Nishimura   cgroup: introduce...
4145
4146
  /* Caller must verify that the css is not for root cgroup */
  void __css_put(struct cgroup_subsys_state *css, int count)
81a6a5cdd   Paul Menage   Task Control Grou...
4147
  {
bd89aabc6   Paul Menage   Control groups: R...
4148
  	struct cgroup *cgrp = css->cgroup;
3dece8347   KAMEZAWA Hiroyuki   cgroup: catch bad...
4149
  	int val;
81a6a5cdd   Paul Menage   Task Control Grou...
4150
  	rcu_read_lock();
d7b9fff71   Daisuke Nishimura   cgroup: introduce...
4151
  	val = atomic_sub_return(count, &css->refcnt);
3dece8347   KAMEZAWA Hiroyuki   cgroup: catch bad...
4152
  	if (val == 1) {
ec64f5154   KAMEZAWA Hiroyuki   cgroup: fix frequ...
4153
4154
4155
4156
  		if (notify_on_release(cgrp)) {
  			set_bit(CGRP_RELEASABLE, &cgrp->flags);
  			check_for_release(cgrp);
  		}
887032670   KAMEZAWA Hiroyuki   cgroup avoid perm...
4157
  		cgroup_wakeup_rmdir_waiter(cgrp);
81a6a5cdd   Paul Menage   Task Control Grou...
4158
4159
  	}
  	rcu_read_unlock();
3dece8347   KAMEZAWA Hiroyuki   cgroup: catch bad...
4160
  	WARN_ON_ONCE(val < 1);
81a6a5cdd   Paul Menage   Task Control Grou...
4161
  }
67523c48a   Ben Blum   cgroups: blkio su...
4162
  EXPORT_SYMBOL_GPL(__css_put);
81a6a5cdd   Paul Menage   Task Control Grou...
4163
4164
4165
4166
4167
4168
4169
4170
4171
4172
4173
4174
4175
4176
4177
4178
4179
4180
4181
4182
4183
4184
4185
  
  /*
   * Notify userspace when a cgroup is released, by running the
   * configured release agent with the name of the cgroup (path
   * relative to the root of cgroup file system) as the argument.
   *
   * Most likely, this user command will try to rmdir this cgroup.
   *
   * This races with the possibility that some other task will be
   * attached to this cgroup before it is removed, or that some other
   * user task will 'mkdir' a child cgroup of this cgroup.  That's ok.
   * The presumed 'rmdir' will fail quietly if this cgroup is no longer
   * unused, and this cgroup will be reprieved from its death sentence,
   * to continue to serve a useful existence.  Next time it's released,
   * we will get notified again, if it still has 'notify_on_release' set.
   *
   * The final arg to call_usermodehelper() is UMH_WAIT_EXEC, which
   * means only wait until the task is successfully execve()'d.  The
   * separate release agent task is forked by call_usermodehelper(),
   * then control in this thread returns here, without waiting for the
   * release agent task.  We don't bother to wait because the caller of
   * this routine has no use for the exit status of the release agent
   * task, so no sense holding our caller up for that.
81a6a5cdd   Paul Menage   Task Control Grou...
4186
   */
81a6a5cdd   Paul Menage   Task Control Grou...
4187
4188
4189
4190
4191
4192
4193
4194
  static void cgroup_release_agent(struct work_struct *work)
  {
  	BUG_ON(work != &release_agent_work);
  	mutex_lock(&cgroup_mutex);
  	spin_lock(&release_list_lock);
  	while (!list_empty(&release_list)) {
  		char *argv[3], *envp[3];
  		int i;
e788e066c   Paul Menage   cgroup files: mov...
4195
  		char *pathbuf = NULL, *agentbuf = NULL;
bd89aabc6   Paul Menage   Control groups: R...
4196
  		struct cgroup *cgrp = list_entry(release_list.next,
81a6a5cdd   Paul Menage   Task Control Grou...
4197
4198
  						    struct cgroup,
  						    release_list);
bd89aabc6   Paul Menage   Control groups: R...
4199
  		list_del_init(&cgrp->release_list);
81a6a5cdd   Paul Menage   Task Control Grou...
4200
4201
  		spin_unlock(&release_list_lock);
  		pathbuf = kmalloc(PAGE_SIZE, GFP_KERNEL);
e788e066c   Paul Menage   cgroup files: mov...
4202
4203
4204
4205
4206
4207
4208
  		if (!pathbuf)
  			goto continue_free;
  		if (cgroup_path(cgrp, pathbuf, PAGE_SIZE) < 0)
  			goto continue_free;
  		agentbuf = kstrdup(cgrp->root->release_agent_path, GFP_KERNEL);
  		if (!agentbuf)
  			goto continue_free;
81a6a5cdd   Paul Menage   Task Control Grou...
4209
4210
  
  		i = 0;
e788e066c   Paul Menage   cgroup files: mov...
4211
4212
  		argv[i++] = agentbuf;
  		argv[i++] = pathbuf;
81a6a5cdd   Paul Menage   Task Control Grou...
4213
4214
4215
4216
4217
4218
4219
4220
4221
4222
4223
4224
4225
  		argv[i] = NULL;
  
  		i = 0;
  		/* minimal command environment */
  		envp[i++] = "HOME=/";
  		envp[i++] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin";
  		envp[i] = NULL;
  
  		/* Drop the lock while we invoke the usermode helper,
  		 * since the exec could involve hitting disk and hence
  		 * be a slow process */
  		mutex_unlock(&cgroup_mutex);
  		call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC);
81a6a5cdd   Paul Menage   Task Control Grou...
4226
  		mutex_lock(&cgroup_mutex);
e788e066c   Paul Menage   cgroup files: mov...
4227
4228
4229
   continue_free:
  		kfree(pathbuf);
  		kfree(agentbuf);
81a6a5cdd   Paul Menage   Task Control Grou...
4230
4231
4232
4233
4234
  		spin_lock(&release_list_lock);
  	}
  	spin_unlock(&release_list_lock);
  	mutex_unlock(&cgroup_mutex);
  }
8bab8dded   Paul Menage   cgroups: add cgro...
4235
4236
4237
4238
4239
4240
4241
4242
4243
  
  /*
   * Handle a comma-separated list of subsystem names in @str: every
   * built-in subsystem whose name matches a token gets ->disabled set.
   * Empty tokens are skipped and unknown names are silently ignored.
   * Always returns 1.
   */
  static int __init cgroup_disable(char *str)
  {
  	int i;
  	char *token;

  	while ((token = strsep(&str, ",")) != NULL) {
  		if (!*token)
  			continue;
  		/*
  		 * cgroup_disable, being at boot time, can't know about module
  		 * subsystems, so we don't worry about them.
  		 */
  		for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
  			struct cgroup_subsys *ss = subsys[i];

  			if (!strcmp(token, ss->name)) {
  				ss->disabled = 1;
  				printk(KERN_INFO "Disabling %s control group"
  					" subsystem\n", ss->name);
  				break;
  			}
  		}
  	}
  	return 1;
  }
  __setup("cgroup_disable=", cgroup_disable);
38460b48d   KAMEZAWA Hiroyuki   cgroup: CSS ID su...
4263
4264
4265
4266
4267
4268
4269
4270
4271
4272
4273
4274
4275
4276
4277
4278
  
  /*
   * Functions for CSS ID.
   */
  
  /*
   * To get an ID other than 0, this should be called when !cgroup_is_removed().
   */
  unsigned short css_id(struct cgroup_subsys_state *css)
  {
  	struct css_id *entry = rcu_dereference(css->id);

  	/* 0 is returned when the css has no ID attached */
  	return entry ? entry->id : 0;
  }
67523c48a   Ben Blum   cgroups: blkio su...
4279
  EXPORT_SYMBOL_GPL(css_id);
38460b48d   KAMEZAWA Hiroyuki   cgroup: CSS ID su...
4280
4281
4282
4283
4284
4285
4286
4287
4288
  
  unsigned short css_depth(struct cgroup_subsys_state *css)
  {
  	struct css_id *entry = rcu_dereference(css->id);

  	/* 0 is returned when the css has no ID attached */
  	return entry ? entry->depth : 0;
  }
67523c48a   Ben Blum   cgroups: blkio su...
4289
  EXPORT_SYMBOL_GPL(css_depth);
38460b48d   KAMEZAWA Hiroyuki   cgroup: CSS ID su...
4290
4291
  
  bool css_is_ancestor(struct cgroup_subsys_state *child,
0b7f569e4   KAMEZAWA Hiroyuki   memcg: fix OOM ki...
4292
  		    const struct cgroup_subsys_state *root)
38460b48d   KAMEZAWA Hiroyuki   cgroup: CSS ID su...
4293
4294
4295
4296
4297
4298
4299
4300
4301
4302
4303
4304
4305
4306
4307
4308
4309
4310
4311
4312
4313
4314
4315
4316
4317
4318
4319
4320
4321
4322
4323
4324
4325
  {
  	struct css_id *child_id = rcu_dereference(child->id);
  	struct css_id *root_id = rcu_dereference(root->id);
  
  	if (!child_id || !root_id || (child_id->depth < root_id->depth))
  		return false;
  	return child_id->stack[root_id->depth] == root_id->id;
  }
  
  /* RCU callback: free the css_id that embeds @head. */
  static void __free_css_id_cb(struct rcu_head *head)
  {
  	kfree(container_of(head, struct css_id, rcu_head));
  }
  
  void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css)
  {
  	struct css_id *cssid = css->id;

  	/* When this is called before css_id initialization, id can be NULL */
  	if (!cssid)
  		return;

  	BUG_ON(!ss->use_id);

  	/* Break the links in both directions before releasing the number */
  	rcu_assign_pointer(cssid->css, NULL);
  	rcu_assign_pointer(css->id, NULL);

  	spin_lock(&ss->id_lock);
  	idr_remove(&ss->idr, cssid->id);
  	spin_unlock(&ss->id_lock);

  	/* The structure itself is freed from the RCU callback */
  	call_rcu(&cssid->rcu_head, __free_css_id_cb);
  }
67523c48a   Ben Blum   cgroups: blkio su...
4326
  EXPORT_SYMBOL_GPL(free_css_id);
38460b48d   KAMEZAWA Hiroyuki   cgroup: CSS ID su...
4327
4328
4329
4330
4331
4332
4333
4334
4335
4336
4337
4338
4339
4340
4341
4342
4343
4344
4345
4346
4347
4348
4349
4350
4351
4352
4353
4354
4355
4356
4357
4358
4359
4360
4361
4362
4363
4364
4365
4366
4367
4368
4369
4370
4371
4372
4373
4374
  
  /*
   * This is called by init or create(). Then, calls to this function are
   * always serialized (By cgroup_mutex() at create()).
   */
  
  static struct css_id *get_new_cssid(struct cgroup_subsys *ss, int depth)
  {
  	struct css_id *cssid;
  	int new_id, err, bytes;

  	BUG_ON(!ss->use_id);

  	/* Header plus one stack slot per level, 0..depth inclusive */
  	bytes = sizeof(*cssid) + sizeof(unsigned short) * (depth + 1);
  	cssid = kzalloc(bytes, GFP_KERNEL);
  	if (!cssid)
  		return ERR_PTR(-ENOMEM);

  	/* get id */
  	if (unlikely(!idr_pre_get(&ss->idr, GFP_KERNEL))) {
  		err = -ENOMEM;
  		goto free_cssid;
  	}

  	spin_lock(&ss->id_lock);
  	/* Don't use 0. allocates an ID of 1-65535 */
  	err = idr_get_new_above(&ss->idr, cssid, 1, &new_id);
  	spin_unlock(&ss->id_lock);

  	/* Returns error when there are no free spaces for new ID.*/
  	if (err) {
  		err = -ENOSPC;
  		goto free_cssid;
  	}

  	if (new_id > CSS_ID_MAX) {
  		/* Out of range: give the number back before failing */
  		err = -ENOSPC;
  		spin_lock(&ss->id_lock);
  		idr_remove(&ss->idr, new_id);
  		spin_unlock(&ss->id_lock);
  		goto free_cssid;
  	}

  	cssid->id = new_id;
  	cssid->depth = depth;
  	return cssid;

  free_cssid:
  	kfree(cssid);
  	return ERR_PTR(err);
  }
e6a1105ba   Ben Blum   cgroups: subsyste...
4375
4376
  static int __init_or_module cgroup_init_idr(struct cgroup_subsys *ss,
  					    struct cgroup_subsys_state *rootcss)
  {
  	struct css_id *root_id;

  	/* Set up the subsystem's ID lock and IDR before the first use */
  	spin_lock_init(&ss->id_lock);
  	idr_init(&ss->idr);

  	/* The root css lives at depth 0 */
  	root_id = get_new_cssid(ss, 0);
  	if (IS_ERR(root_id))
  		return PTR_ERR(root_id);

  	/* Its stack holds just its own ID; link css and ID both ways */
  	root_id->stack[0] = root_id->id;
  	root_id->css = rootcss;
  	rootcss->id = root_id;

  	return 0;
  }
  
  static int alloc_css_id(struct cgroup_subsys *ss, struct cgroup *parent,
  			struct cgroup *child)
  {
  	int subsys_id = ss->subsys_id;
  	struct cgroup_subsys_state *parent_css = parent->subsys[subsys_id];
  	struct cgroup_subsys_state *child_css = child->subsys[subsys_id];
  	struct css_id *parent_id, *child_id;
  	int level, depth;

  	/* The child sits one level below its parent */
  	depth = css_depth(parent_css) + 1;
  	parent_id = parent_css->id;

  	child_id = get_new_cssid(ss, depth);
  	if (IS_ERR(child_id))
  		return PTR_ERR(child_id);

  	/* Inherit the parent's stack, then append our own ID */
  	for (level = 0; level < depth; level++)
  		child_id->stack[level] = parent_id->stack[level];
  	child_id->stack[depth] = child_id->id;

  	/*
  	 * child_id->css pointer will be set after this cgroup is available
  	 * see cgroup_populate_dir()
  	 */
  	rcu_assign_pointer(child_css->id, child_id);

  	return 0;
  }
  
  /**
   * css_lookup - lookup css by id
   * @ss: cgroup subsys to be looked into.
   * @id: the id
   *
   * Returns pointer to cgroup_subsys_state if there is valid one with id.
   * NULL if not. Should be called under rcu_read_lock()
   */
  struct cgroup_subsys_state *css_lookup(struct cgroup_subsys *ss, int id)
  {
  	struct css_id *entry;

  	BUG_ON(!ss->use_id);

  	entry = idr_find(&ss->idr, id);
  	if (likely(entry))
  		return rcu_dereference(entry->css);

  	/* No css_id is registered under @id */
  	return NULL;
  }
67523c48a   Ben Blum   cgroups: blkio su...
4441
  EXPORT_SYMBOL_GPL(css_lookup);
38460b48d   KAMEZAWA Hiroyuki   cgroup: CSS ID su...
4442
4443
4444
4445
4446
4447
4448
4449
4450
4451
4452
4453
4454
4455
4456
4457
4458
4459
4460
4461
4462
4463
4464
4465
4466
4467
4468
4469
4470
4471
4472
4473
4474
4475
4476
4477
4478
4479
4480
4481
4482
4483
4484
4485
4486
4487
4488
4489
4490
4491
  
  /**
   * css_get_next - lookup next cgroup under specified hierarchy.
   * @ss: pointer to subsystem
   * @id: current position of iteration.
   * @root: pointer to css. search tree under this.
   * @foundid: position of found object.
   *
   * Search next css under the specified hierarchy of rootid. Calling under
   * rcu_read_lock() is necessary. Returns NULL if it reaches the end.
   */
  struct cgroup_subsys_state *
  css_get_next(struct cgroup_subsys *ss, int id,
  	     struct cgroup_subsys_state *root, int *foundid)
  {
  	struct cgroup_subsys_state *css;
  	struct css_id *entry;
  	int pos;
  	int rootid = css_id(root);
  	int depth = css_depth(root);

  	if (!rootid)
  		return NULL;

  	BUG_ON(!ss->use_id);

  	/* Start scanning at @id; each miss resumes at the following id */
  	for (pos = id; ; pos++) {
  		/*
  		 * scan next entry from bitmap(tree), pos is updated after
  		 * idr_get_next().
  		 */
  		spin_lock(&ss->id_lock);
  		entry = idr_get_next(&ss->idr, &pos);
  		spin_unlock(&ss->id_lock);

  		/* No more entries: the scan is over */
  		if (!entry)
  			return NULL;

  		/* Skip entries that are not under @root */
  		if (entry->depth < depth || entry->stack[depth] != rootid)
  			continue;

  		/* An entry whose css pointer is still NULL is skipped too */
  		css = rcu_dereference(entry->css);
  		if (css) {
  			*foundid = pos;
  			return css;
  		}
  	}
  }
fe6934354   Paul Menage   cgroups: move the...
4492
4493
4494
4495
4496
4497
4498
4499
4500
4501
4502
4503
4504
4505
4506
4507
4508
4509
4510
4511
4512
4513
4514
4515
4516
4517
4518
4519
4520
4521
4522
4523
4524
4525
4526
4527
4528
4529
4530
4531
4532
4533
  #ifdef CONFIG_CGROUP_DEBUG
  /* Allocate a zeroed state object; ERR_PTR(-ENOMEM) if that fails. */
  static struct cgroup_subsys_state *debug_create(struct cgroup_subsys *ss,
  						   struct cgroup *cont)
  {
  	struct cgroup_subsys_state *state;

  	state = kzalloc(sizeof(*state), GFP_KERNEL);
  	return state ? state : ERR_PTR(-ENOMEM);
  }
  
  /* Free the debug subsystem's state object attached to @cont. */
  static void debug_destroy(struct cgroup_subsys *ss, struct cgroup *cont)
  {
  	struct cgroup_subsys_state *state = cont->subsys[debug_subsys_id];

  	kfree(state);
  }
  
  /* Report the current value of @cont's ->count. */
  static u64 cgroup_refcount_read(struct cgroup *cont, struct cftype *cft)
  {
  	u64 refs = atomic_read(&cont->count);

  	return refs;
  }
  
  /* Report what cgroup_task_count() returns for @cont. */
  static u64 debug_taskcount_read(struct cgroup *cont, struct cftype *cft)
  {
  	u64 tasks = cgroup_task_count(cont);

  	return tasks;
  }
  
  static u64 current_css_set_read(struct cgroup *cont, struct cftype *cft)
  {
  	return (u64)(unsigned long)current->cgroups;
  }
  
  /* Report the refcount of the reading task's css_set. */
  static u64 current_css_set_refcount_read(struct cgroup *cont,
  					   struct cftype *cft)
  {
  	u64 refs;

  	/* The css_set is only dereferenced inside the RCU read section */
  	rcu_read_lock();
  	refs = atomic_read(&current->cgroups->refcount);
  	rcu_read_unlock();

  	return refs;
  }
7717f7ba9   Paul Menage   cgroups: add a ba...
4534
4535
4536
4537
4538
4539
4540
4541
4542
4543
4544
4545
4546
4547
4548
4549
4550
4551
  /*
   * Print one "Root <hierarchy id> group <name>" line to @seq for every
   * cgroup linked from the reading task's css_set.  A cgroup without a
   * dentry is shown as "?".  Always returns 0.
   */
  static int current_css_set_cg_links_read(struct cgroup *cont,
  					 struct cftype *cft,
  					 struct seq_file *seq)
  {
  	struct cg_cgroup_link *link;
  	struct css_set *cg;

  	read_lock(&css_set_lock);
  	rcu_read_lock();
  	cg = rcu_dereference(current->cgroups);
  	list_for_each_entry(link, &cg->cg_links, cg_link_list) {
  		struct cgroup *c = link->cgrp;
  		const char *name;

  		if (c->dentry)
  			name = c->dentry->d_name.name;
  		else
  			name = "?";
  		seq_printf(seq, "Root %d group %s\n",
  			   c->root->hierarchy_id, name);
  	}
  	rcu_read_unlock();
  	read_unlock(&css_set_lock);
  	return 0;
  }
  
  /* Upper bound on the number of tasks listed for each css_set */
  #define MAX_TASKS_SHOWN_PER_CSS 25

  /*
   * For every css_set linked to @cont, print its address followed by the
   * pids of its tasks.  At most MAX_TASKS_SHOWN_PER_CSS tasks are listed
   * per css_set; if there are more, a "  ..." line marks the cut-off.
   * Always returns 0.
   */
  static int cgroup_css_links_read(struct cgroup *cont,
  				 struct cftype *cft,
  				 struct seq_file *seq)
  {
  	struct cg_cgroup_link *link;

  	read_lock(&css_set_lock);
  	list_for_each_entry(link, &cont->css_sets, cgrp_link_list) {
  		struct css_set *cg = link->cg;
  		struct task_struct *task;
  		int count = 0;

  		seq_printf(seq, "css_set %p\n", cg);
  		list_for_each_entry(task, &cg->tasks, cg_list) {
  			/*
  			 * ">=" keeps the listing to exactly the limit;
  			 * ">" would let one task too many through.
  			 */
  			if (count++ >= MAX_TASKS_SHOWN_PER_CSS) {
  				seq_puts(seq, "  ...\n");
  				break;
  			} else {
  				seq_printf(seq, "  task %d\n",
  					   task_pid_vnr(task));
  			}
  		}
  	}
  	read_unlock(&css_set_lock);
  	return 0;
  }
fe6934354   Paul Menage   cgroups: move the...
4590
4591
4592
4593
4594
4595
4596
4597
4598
4599
4600
4601
4602
4603
4604
4605
4606
4607
4608
4609
4610
4611
4612
4613
4614
4615
  /* Report whether CGRP_RELEASABLE is set in @cgrp's flags. */
  static u64 releasable_read(struct cgroup *cgrp, struct cftype *cft)
  {
  	int releasable = test_bit(CGRP_RELEASABLE, &cgrp->flags);

  	return releasable;
  }
  
  static struct cftype debug_files[] =  {
  	{
  		.name = "cgroup_refcount",
  		.read_u64 = cgroup_refcount_read,
  	},
  	{
  		.name = "taskcount",
  		.read_u64 = debug_taskcount_read,
  	},
  
  	{
  		.name = "current_css_set",
  		.read_u64 = current_css_set_read,
  	},
  
  	{
  		.name = "current_css_set_refcount",
  		.read_u64 = current_css_set_refcount_read,
  	},
  
  	{
7717f7ba9   Paul Menage   cgroups: add a ba...
4616
4617
4618
4619
4620
4621
4622
4623
4624
4625
  		.name = "current_css_set_cg_links",
  		.read_seq_string = current_css_set_cg_links_read,
  	},
  
  	{
  		.name = "cgroup_css_links",
  		.read_seq_string = cgroup_css_links_read,
  	},
  
  	{
fe6934354   Paul Menage   cgroups: move the...
4626
4627
4628
4629
4630
4631
4632
4633
4634
4635
4636
4637
4638
4639
4640
4641
4642
4643
4644
  		.name = "releasable",
  		.read_u64 = releasable_read,
  	},
  };
  
  /* Add every entry of debug_files[] to @cont via cgroup_add_files(). */
  static int debug_populate(struct cgroup_subsys *ss, struct cgroup *cont)
  {
  	int nr_files = ARRAY_SIZE(debug_files);

  	return cgroup_add_files(cont, ss, debug_files, nr_files);
  }
  
  /* Subsystem descriptor for "debug": identity first, then callbacks. */
  struct cgroup_subsys debug_subsys = {
  	.name		= "debug",
  	.subsys_id	= debug_subsys_id,
  	.create		= debug_create,
  	.destroy	= debug_destroy,
  	.populate	= debug_populate,
  };
  #endif /* CONFIG_CGROUP_DEBUG */