Blame view

kernel/cgroup/cgroup-v1.c 33.1 KB
457c89965   Thomas Gleixner   treewide: Add SPD...
1
  // SPDX-License-Identifier: GPL-2.0-only
0a268dbd7   Tejun Heo   cgroup: move cgro...
2
  #include "cgroup-internal.h"
1592c9b22   Tejun Heo   cgroup: move v1 m...
3
  #include <linux/ctype.h>
0a268dbd7   Tejun Heo   cgroup: move cgro...
4
5
  #include <linux/kmod.h>
  #include <linux/sort.h>
1592c9b22   Tejun Heo   cgroup: move v1 m...
6
  #include <linux/delay.h>
0a268dbd7   Tejun Heo   cgroup: move cgro...
7
  #include <linux/mm.h>
c3edc4010   Ingo Molnar   sched/headers: Mo...
8
  #include <linux/sched/signal.h>
56cd69736   Ingo Molnar   sched/headers: Mo...
9
  #include <linux/sched/task.h>
50ff9d130   Ingo Molnar   sched/headers: Re...
10
  #include <linux/magic.h>
0a268dbd7   Tejun Heo   cgroup: move cgro...
11
12
13
14
15
  #include <linux/slab.h>
  #include <linux/vmalloc.h>
  #include <linux/delayacct.h>
  #include <linux/pid_namespace.h>
  #include <linux/cgroupstats.h>
8d2451f49   Al Viro   cgroup1: switch t...
16
  #include <linux/fs_parser.h>
0a268dbd7   Tejun Heo   cgroup: move cgro...
17
18
19
20
21
22
23
24
25
26
27
28
29
  
  #include <trace/events/cgroup.h>
  
  /*
   * pidlists linger the following amount before being destroyed.  The goal
   * is avoiding frequent destruction in the middle of consecutive read calls
   * Expiring in the middle is a performance problem not a correctness one.
   * 1 sec should be enough.
   */
  #define CGROUP_PIDLIST_DESTROY_DELAY	HZ
  
  /* Controllers blocked by the commandline in v1 */
  static u16 cgroup_no_v1_mask;
3fc9c12d2   Tejun Heo   cgroup: Add named...
30
31
  /* disable named v1 mounts */
  static bool cgroup_no_v1_named;
0a268dbd7   Tejun Heo   cgroup: move cgro...
32
33
34
35
36
  /*
   * pidlist destructions need to be flushed on cgroup destruction.  Use a
   * separate workqueue as flush domain.
   */
  static struct workqueue_struct *cgroup_pidlist_destroy_wq;
e7b20d979   Tejun Heo   cgroup: Restructu...
37
  /* protects cgroup_subsys->release_agent_path */
1592c9b22   Tejun Heo   cgroup: move v1 m...
38
  static DEFINE_SPINLOCK(release_agent_path_lock);
0a268dbd7   Tejun Heo   cgroup: move cgro...
39

d62beb7f3   Tejun Heo   cgroup: rename fu...
40
  bool cgroup1_ssid_disabled(int ssid)
0a268dbd7   Tejun Heo   cgroup: move cgro...
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
  {
  	return cgroup_no_v1_mask & (1 << ssid);
  }
  
  /**
   * cgroup_attach_task_all - attach task 'tsk' to all cgroups of task 'from'
   * @from: attach to all cgroups of a given task
   * @tsk: the task to be attached
   *
   * Every hierarchy other than the default one is visited; on each, @tsk is
   * attached to the cgroup @from currently belongs to.  The walk stops at the
   * first failing attach.
   *
   * Returns 0 on success or the error returned by cgroup_attach_task().
   */
  int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
  {
  	struct cgroup_root *root;
  	int ret = 0;
  
  	mutex_lock(&cgroup_mutex);
  	percpu_down_write(&cgroup_threadgroup_rwsem);
  
  	for_each_root(root) {
  		struct cgroup *src_cgrp;
  
  		/* the default hierarchy is left alone */
  		if (root != &cgrp_dfl_root) {
  			/* css_set_lock covers the @from -> cgroup lookup */
  			spin_lock_irq(&css_set_lock);
  			src_cgrp = task_cgroup_from_root(from, root);
  			spin_unlock_irq(&css_set_lock);
  
  			ret = cgroup_attach_task(src_cgrp, tsk, false);
  			if (ret)
  				break;
  		}
  	}
  
  	percpu_up_write(&cgroup_threadgroup_rwsem);
  	mutex_unlock(&cgroup_mutex);
  
  	return ret;
  }
  EXPORT_SYMBOL_GPL(cgroup_attach_task_all);
  
  /**
   * cgroup_trasnsfer_tasks - move tasks from one cgroup to another
   * @to: cgroup to which the tasks will be moved
   * @from: cgroup in which the tasks currently reside
   *
   * Locking rules between cgroup_post_fork() and the migration path
   * guarantee that, if a task is forking while being migrated, the new child
   * is guaranteed to be either visible in the source cgroup after the
   * parent's migration is complete or put into the target cgroup.  No task
   * can slip out of migration through forking.
   */
  int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
  {
e595cd706   Tejun Heo   cgroup: track mig...
91
  	DEFINE_CGROUP_MGCTX(mgctx);
0a268dbd7   Tejun Heo   cgroup: move cgro...
92
93
94
95
96
97
98
  	struct cgrp_cset_link *link;
  	struct css_task_iter it;
  	struct task_struct *task;
  	int ret;
  
  	if (cgroup_on_dfl(to))
  		return -EINVAL;
8cfd8147d   Tejun Heo   cgroup: implement...
99
100
101
  	ret = cgroup_migrate_vet_dst(to);
  	if (ret)
  		return ret;
0a268dbd7   Tejun Heo   cgroup: move cgro...
102
103
104
105
106
107
108
109
  
  	mutex_lock(&cgroup_mutex);
  
  	percpu_down_write(&cgroup_threadgroup_rwsem);
  
  	/* all tasks in @from are being moved, all csets are source */
  	spin_lock_irq(&css_set_lock);
  	list_for_each_entry(link, &from->cset_links, cset_link)
e595cd706   Tejun Heo   cgroup: track mig...
110
  		cgroup_migrate_add_src(link->cset, to, &mgctx);
0a268dbd7   Tejun Heo   cgroup: move cgro...
111
  	spin_unlock_irq(&css_set_lock);
e595cd706   Tejun Heo   cgroup: track mig...
112
  	ret = cgroup_migrate_prepare_dst(&mgctx);
0a268dbd7   Tejun Heo   cgroup: move cgro...
113
114
115
116
117
118
119
120
  	if (ret)
  		goto out_err;
  
  	/*
  	 * Migrate tasks one-by-one until @from is empty.  This fails iff
  	 * ->can_attach() fails.
  	 */
  	do {
bc2fb7ed0   Tejun Heo   cgroup: add @flag...
121
  		css_task_iter_start(&from->self, 0, &it);
116d2f749   Prateek Sood   cgroup: Fix deadl...
122
123
124
125
  
  		do {
  			task = css_task_iter_next(&it);
  		} while (task && (task->flags & PF_EXITING));
0a268dbd7   Tejun Heo   cgroup: move cgro...
126
127
128
129
130
  		if (task)
  			get_task_struct(task);
  		css_task_iter_end(&it);
  
  		if (task) {
bfc2cf6f6   Tejun Heo   cgroup: call subs...
131
  			ret = cgroup_migrate(task, false, &mgctx);
0a268dbd7   Tejun Heo   cgroup: move cgro...
132
  			if (!ret)
e4f8d81c7   Steven Rostedt (VMware)   cgroup/tracing: M...
133
  				TRACE_CGROUP_PATH(transfer_tasks, to, task, false);
0a268dbd7   Tejun Heo   cgroup: move cgro...
134
135
136
137
  			put_task_struct(task);
  		}
  	} while (task && !ret);
  out_err:
e595cd706   Tejun Heo   cgroup: track mig...
138
  	cgroup_migrate_finish(&mgctx);
0a268dbd7   Tejun Heo   cgroup: move cgro...
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
  	percpu_up_write(&cgroup_threadgroup_rwsem);
  	mutex_unlock(&cgroup_mutex);
  	return ret;
  }
  
  /*
   * Stuff for reading the 'tasks'/'procs' files.
   *
   * Reading this file can return large amounts of data if a cgroup has
   * *lots* of attached tasks. So it may need several calls to read(),
   * but we cannot guarantee that the information we produce is correct
   * unless we produce it entirely atomically.
   *
   */
  
  /* which pidlist file are we talking about? */
  enum cgroup_filetype {
  	CGROUP_FILE_PROCS,
  	CGROUP_FILE_TASKS,
  };
  
  /*
   * A pidlist is a list of pids that virtually represents the contents of one
   * of the cgroup files ("procs" or "tasks"). We keep a list of such pidlists,
   * a pair (one each for procs, tasks) for each pid namespace that's relevant
   * to the cgroup.
   */
  struct cgroup_pidlist {
  	/*
  	 * used to find which pidlist is wanted. doesn't change as long as
  	 * this particular list stays in the list.
  	*/
  	struct { enum cgroup_filetype type; struct pid_namespace *ns; } key;
  	/* array of pids or tgids (depending on key.type); freed with kvfree() */
  	pid_t *list;
  	/* how many elements the above list has */
  	int length;
  	/* each of these stored in a list by its cgroup (cgroup->pidlists) */
  	struct list_head links;
  	/* pointer to the cgroup we belong to, for list removal purposes */
  	struct cgroup *owner;
  	/* for delayed destruction; queued on cgroup_pidlist_destroy_wq */
  	struct delayed_work destroy_dwork;
  };
  
  /*
0a268dbd7   Tejun Heo   cgroup: move cgro...
185
186
187
   * Used to destroy all pidlists lingering waiting for destroy timer.  None
   * should be left afterwards.
   */
d62beb7f3   Tejun Heo   cgroup: rename fu...
188
  void cgroup1_pidlist_destroy_all(struct cgroup *cgrp)
0a268dbd7   Tejun Heo   cgroup: move cgro...
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
  {
  	struct cgroup_pidlist *l, *tmp_l;
  
  	mutex_lock(&cgrp->pidlist_mutex);
  	list_for_each_entry_safe(l, tmp_l, &cgrp->pidlists, links)
  		mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork, 0);
  	mutex_unlock(&cgrp->pidlist_mutex);
  
  	flush_workqueue(cgroup_pidlist_destroy_wq);
  	BUG_ON(!list_empty(&cgrp->pidlists));
  }
  
  /*
   * Delayed-work callback that tears down one pidlist: unlinks it from its
   * owner, frees the pid array, drops the pid namespace reference and frees
   * the pidlist itself.  Nothing is done if the work was queued again in the
   * meantime.
   */
  static void cgroup_pidlist_destroy_work_fn(struct work_struct *work)
  {
  	struct delayed_work *dwork = to_delayed_work(work);
  	struct cgroup_pidlist *l = container_of(dwork, struct cgroup_pidlist,
  						destroy_dwork);
  	struct cgroup_pidlist *tofree = NULL;
  
  	mutex_lock(&l->owner->pidlist_mutex);
  
  	/*
  	 * Destroy iff we didn't get queued again.  The state won't change
  	 * as destroy_dwork can only be queued while locked.
  	 */
  	if (!delayed_work_pending(dwork)) {
  		list_del(&l->links);
  		kvfree(l->list);
  		put_pid_ns(l->key.ns);
  		tofree = l;
  	}
  
  	/* @l is still needed to reach the mutex, so free only after unlock */
  	mutex_unlock(&l->owner->pidlist_mutex);
  	kfree(tofree);
  }
  
  /*
   * pidlist_uniq - collapse runs of equal adjacent entries in place
   * @list: array of pids; equal values are expected to be adjacent
   * @length: number of entries in @list
   *
   * The first entry is always kept; every later entry is kept only if it
   * differs from the entry right before it.  Returns the number of entries
   * retained at the front of @list.
   */
  static int pidlist_uniq(pid_t *list, int length)
  {
  	int rd, wr = 1;
  
  	/* nothing to strip from an empty or single-entry list */
  	if (length == 0 || length == 1)
  		return length;
  
  	/* rd scans the input; wr is where the next kept entry is stored */
  	for (rd = 1; rd < length; rd++) {
  		if (list[rd] == list[rd - 1])
  			continue;
  		list[wr++] = list[rd];
  	}
  
  	return wr;
  }
  
  /*
   * The two pid files - tasks and cgroup.procs - guaranteed that the result
   * is sorted, which forced this whole pidlist fiasco.  As pid order is
   * different per namespace, each namespace needs differently sorted list,
   * making it impossible to use, for example, single rbtree of member tasks
   * sorted by task pointer.  As pidlists can be fairly large, allocating one
   * per open file is dangerous, so cgroup had to implement shared pool of
   * pidlists keyed by cgroup and namespace.
   */
  
  /*
   * cmppid - sort() comparison callback ordering pid_t values ascending
   * @a: pointer to the first pid_t
   * @b: pointer to the second pid_t
   *
   * Returns a negative value, zero or a positive value when *@a is less
   * than, equal to or greater than *@b.  The result is computed from
   * comparisons rather than a subtraction so it cannot overflow.
   */
  static int cmppid(const void *a, const void *b)
  {
  	const pid_t lhs = *(const pid_t *)a;
  	const pid_t rhs = *(const pid_t *)b;
  
  	return (lhs > rhs) - (lhs < rhs);
  }
  
  static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp,
  						  enum cgroup_filetype type)
  {
  	struct cgroup_pidlist *l;
  	/* don't need task_nsproxy() if we're looking at ourself */
  	struct pid_namespace *ns = task_active_pid_ns(current);
  
  	lockdep_assert_held(&cgrp->pidlist_mutex);
  
  	list_for_each_entry(l, &cgrp->pidlists, links)
  		if (l->key.type == type && l->key.ns == ns)
  			return l;
  	return NULL;
  }
  
  /*
   * find the appropriate pidlist for our purpose (given procs vs tasks)
   * returns with the lock on that pidlist already held, and takes care
   * of the use count, or returns NULL with no locks held if we're out of
   * memory.
   */
  static struct cgroup_pidlist *cgroup_pidlist_find_create(struct cgroup *cgrp,
  						enum cgroup_filetype type)
  {
  	struct cgroup_pidlist *l;
  
  	lockdep_assert_held(&cgrp->pidlist_mutex);
  
  	l = cgroup_pidlist_find(cgrp, type);
  	if (l)
  		return l;
  
  	/* entry not found; create a new one */
  	l = kzalloc(sizeof(struct cgroup_pidlist), GFP_KERNEL);
  	if (!l)
  		return l;
  
  	INIT_DELAYED_WORK(&l->destroy_dwork, cgroup_pidlist_destroy_work_fn);
  	l->key.type = type;
  	/* don't need task_nsproxy() if we're looking at ourself */
  	l->key.ns = get_pid_ns(task_active_pid_ns(current));
  	l->owner = cgrp;
  	list_add(&l->links, &cgrp->pidlists);
  	return l;
  }
0a268dbd7   Tejun Heo   cgroup: move cgro...
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
  /*
   * Load a cgroup's pidarray with either procs' tgids or tasks' pids
   */
  static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,
  			      struct cgroup_pidlist **lp)
  {
  	pid_t *array;
  	int length;
  	int pid, n = 0; /* used for populating the array */
  	struct css_task_iter it;
  	struct task_struct *tsk;
  	struct cgroup_pidlist *l;
  
  	lockdep_assert_held(&cgrp->pidlist_mutex);
  
  	/*
  	 * If cgroup gets more users after we read count, we won't have
  	 * enough space - tough.  This race is indistinguishable to the
  	 * caller from the case that the additional cgroup users didn't
  	 * show up until sometime later on.
  	 */
  	length = cgroup_task_count(cgrp);
653a23ca7   Marc Koderer   Use kvmalloc in c...
336
  	array = kvmalloc_array(length, sizeof(pid_t), GFP_KERNEL);
0a268dbd7   Tejun Heo   cgroup: move cgro...
337
338
339
  	if (!array)
  		return -ENOMEM;
  	/* now, populate the array */
bc2fb7ed0   Tejun Heo   cgroup: add @flag...
340
  	css_task_iter_start(&cgrp->self, 0, &it);
0a268dbd7   Tejun Heo   cgroup: move cgro...
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
  	while ((tsk = css_task_iter_next(&it))) {
  		if (unlikely(n == length))
  			break;
  		/* get tgid or pid for procs or tasks file respectively */
  		if (type == CGROUP_FILE_PROCS)
  			pid = task_tgid_vnr(tsk);
  		else
  			pid = task_pid_vnr(tsk);
  		if (pid > 0) /* make sure to only use valid results */
  			array[n++] = pid;
  	}
  	css_task_iter_end(&it);
  	length = n;
  	/* now sort & (if procs) strip out duplicates */
  	sort(array, length, sizeof(pid_t), cmppid, NULL);
  	if (type == CGROUP_FILE_PROCS)
  		length = pidlist_uniq(array, length);
  
  	l = cgroup_pidlist_find_create(cgrp, type);
  	if (!l) {
653a23ca7   Marc Koderer   Use kvmalloc in c...
361
  		kvfree(array);
0a268dbd7   Tejun Heo   cgroup: move cgro...
362
363
364
365
  		return -ENOMEM;
  	}
  
  	/* store array, freeing old if necessary */
653a23ca7   Marc Koderer   Use kvmalloc in c...
366
  	kvfree(l->list);
0a268dbd7   Tejun Heo   cgroup: move cgro...
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
  	l->list = array;
  	l->length = length;
  	*lp = l;
  	return 0;
  }
  
  /*
   * seq_file methods for the tasks/procs files. The seq_file position is the
   * next pid to display; the seq_file iterator is a pointer to the pid
   * in the cgroup->l->list array.
   */
  
  /*
   * seq_file ->start() for the tasks/procs files.
   *
   * *@pos is either 0 (first call, or after a seek to the start) or one more
   * than the last pid shown.  A binary search locates the next pid to
   * display, if any; *@pos is then updated to that pid and a pointer to its
   * array slot is returned.  Returns NULL when no pid is left and an
   * ERR_PTR() if the pidlist could not be loaded.
   *
   * The cgroup's pidlist_mutex is taken here and is still held on every
   * return path, including the error return; cgroup_pidlist_stop() is the
   * function that drops it.
   */
  static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos)
  {
  	struct kernfs_open_file *of = s->private;
  	struct cgroup *cgrp = seq_css(s)->cgroup;
  	enum cgroup_filetype type = seq_cft(s)->private;
  	struct cgroup_pidlist *pl;
  	int target = *pos;
  	int lo = 0;
  	int *found;
  
  	mutex_lock(&cgrp->pidlist_mutex);
  
  	/*
  	 * A non-NULL @of->priv means this isn't the first start() after
  	 * open.  The pointer itself can't be trusted, as that pidlist may
  	 * have been destroyed since, so look the pidlist up again.
  	 */
  	if (of->priv)
  		of->priv = cgroup_pidlist_find(cgrp, type);
  
  	/*
  	 * First start() after open, or the pidlist went away in between:
  	 * build a new one.
  	 */
  	if (!of->priv) {
  		int err = pidlist_array_load(cgrp, type,
  					     (struct cgroup_pidlist **)&of->priv);
  
  		if (err)
  			return ERR_PTR(err);
  	}
  	pl = of->priv;
  
  	if (target) {
  		int hi = pl->length;
  
  		/* find @target, or the first entry larger than it */
  		while (lo < hi) {
  			int mid = (lo + hi) / 2;
  			int cur = pl->list[mid];
  
  			if (cur == target) {
  				lo = mid;
  				break;
  			}
  			if (cur < target)
  				lo = mid + 1;
  			else
  				hi = mid;
  		}
  	}
  
  	/* If we're off the end of the array, we're done */
  	if (lo >= pl->length)
  		return NULL;
  
  	/* Update the abstract position to be the actual pid that we found */
  	found = pl->list + lo;
  	*pos = *found;
  	return found;
  }
  
  static void cgroup_pidlist_stop(struct seq_file *s, void *v)
  {
  	struct kernfs_open_file *of = s->private;
  	struct cgroup_pidlist *l = of->priv;
  
  	if (l)
  		mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork,
  				 CGROUP_PIDLIST_DESTROY_DELAY);
  	mutex_unlock(&seq_css(s)->cgroup->pidlist_mutex);
  }
  
  /*
   * seq_file ->next() for the tasks/procs files.  @v points at the current
   * slot of the pidlist array.  Returns a pointer to the following slot and
   * sets *@pos to the pid stored there, or increments *@pos and returns NULL
   * once the end of the array is reached.
   */
  static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos)
  {
  	struct kernfs_open_file *of = s->private;
  	struct cgroup_pidlist *l = of->priv;
  	pid_t *p = v;
  	pid_t *end = l->list + l->length;
  	/*
  	 * Advance to the next pid in the array. If this goes off the
  	 * end, we're done
  	 */
  	p++;
  	if (p >= end) {
  		(*pos)++;
  		return NULL;
  	} else {
  		*pos = *p;
  		return p;
  	}
  }
  
  /*
   * seq_file ->show() for the tasks/procs files: prints the pid that @v
   * points at, followed by a newline.  Always returns 0.
   */
  static int cgroup_pidlist_show(struct seq_file *s, void *v)
  {
  	seq_printf(s, "%d\n", *(int *)v);
  
  	return 0;
  }
715c809d9   Tejun Heo   cgroup: reorganiz...
478
479
480
  /*
   * Common write handler for the v1 "cgroup.procs" and "tasks" files.
   *
   * The task named in @buf is resolved through cgroup_procs_write_start(),
   * the writer's permission is checked, and the task - or its whole thread
   * group when @threadgroup is true - is attached to the cgroup behind @of.
   *
   * The write is permitted when the writer's euid is root or equals the
   * target's uid or suid, or when the writer has CAP_SYS_NICE in the
   * target's user namespace.
   *
   * Returns @nbytes on success, -ENODEV if the cgroup is no longer live,
   * -EACCES if the permission check fails, or the error from
   * cgroup_procs_write_start() / cgroup_attach_task().
   */
  static ssize_t __cgroup1_procs_write(struct kernfs_open_file *of,
  				     char *buf, size_t nbytes, loff_t off,
  				     bool threadgroup)
  {
  	struct cgroup *cgrp;
  	struct task_struct *task;
  	const struct cred *cred, *tcred;
  	ssize_t ret;
  	bool locked;
  
  	cgrp = cgroup_kn_lock_live(of->kn, false);
  	if (!cgrp)
  		return -ENODEV;
  
  	task = cgroup_procs_write_start(buf, threadgroup, &locked);
  	ret = PTR_ERR_OR_ZERO(task);
  	if (ret)
  		goto out_unlock;
  
  	/*
  	 * Even if we're attaching all tasks in the thread group, we only
  	 * need to check permissions on one of them.
  	 */
  	cred = current_cred();
  	tcred = get_task_cred(task);
  	if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) &&
  	    !uid_eq(cred->euid, tcred->uid) &&
  	    !uid_eq(cred->euid, tcred->suid) &&
  	    !ns_capable(tcred->user_ns, CAP_SYS_NICE))
  		ret = -EACCES;
  	put_cred(tcred);
  	if (ret)
  		goto out_finish;
  
  	ret = cgroup_attach_task(cgrp, task, threadgroup);
  
  out_finish:
  	cgroup_procs_write_finish(task, locked);
  out_unlock:
  	cgroup_kn_unlock(of->kn);
  
  	return ret ?: nbytes;
  }
  
  /*
   * Write handler for the v1 "cgroup.procs" file: attaches the whole thread
   * group of the task named in @buf.
   */
  static ssize_t cgroup1_procs_write(struct kernfs_open_file *of,
  				   char *buf, size_t nbytes, loff_t off)
  {
  	const bool whole_group = true;
  
  	return __cgroup1_procs_write(of, buf, nbytes, off, whole_group);
  }
  
  /*
   * Write handler for the v1 "tasks" file: attaches only the single task
   * named in @buf, not its thread group.
   */
  static ssize_t cgroup1_tasks_write(struct kernfs_open_file *of,
  				   char *buf, size_t nbytes, loff_t off)
  {
  	return __cgroup1_procs_write(of, buf, nbytes, off, false);
  }
  
  /*
   * Write handler for the "release_agent" file: stores the whitespace-
   * stripped contents of @buf as the release agent path of the hierarchy.
   *
   * Returns @nbytes on success, -EPERM if the writer is not privileged
   * enough, or -ENODEV if the cgroup is no longer live.
   */
  static ssize_t cgroup_release_agent_write(struct kernfs_open_file *of,
  					  char *buf, size_t nbytes, loff_t off)
  {
  	struct cgroup *cgrp;
  
  	BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX);
  
  	/*
  	 * The stored path is later executed as a usermode helper by
  	 * cgroup1_release_agent(), so only let a writer whose file was
  	 * opened in the initial user namespace and who has CAP_SYS_ADMIN
  	 * set it.
  	 */
  	if ((of->file->f_cred->user_ns != &init_user_ns) ||
  	    !capable(CAP_SYS_ADMIN))
  		return -EPERM;
  
  	cgrp = cgroup_kn_lock_live(of->kn, false);
  	if (!cgrp)
  		return -ENODEV;
  	spin_lock(&release_agent_path_lock);
  	strlcpy(cgrp->root->release_agent_path, strstrip(buf),
  		sizeof(cgrp->root->release_agent_path));
  	spin_unlock(&release_agent_path_lock);
  	cgroup_kn_unlock(of->kn);
  	return nbytes;
  }
  
  /*
   * Show handler for the "release_agent" file: prints the hierarchy's
   * release agent path followed by a newline.  The path is read under
   * release_agent_path_lock.  Always returns 0.
   */
  static int cgroup_release_agent_show(struct seq_file *seq, void *v)
  {
  	struct cgroup *cgrp = seq_css(seq)->cgroup;
  
  	spin_lock(&release_agent_path_lock);
  	seq_puts(seq, cgrp->root->release_agent_path);
  	spin_unlock(&release_agent_path_lock);
  	seq_putc(seq, '\n');
  	return 0;
  }
  
  /*
   * Show handler for "cgroup.sane_behavior": always prints "0" followed by
   * a newline and returns 0.
   */
  static int cgroup_sane_behavior_show(struct seq_file *seq, void *v)
  {
  	seq_puts(seq, "0\n");
  	return 0;
  }
  
  /*
   * read_u64 handler for "notify_on_release": reports the value of
   * notify_on_release() for the cgroup owning @css.
   */
  static u64 cgroup_read_notify_on_release(struct cgroup_subsys_state *css,
  					 struct cftype *cft)
  {
  	struct cgroup *cgrp = css->cgroup;
  
  	return notify_on_release(cgrp);
  }
  
  /*
   * write_u64 handler for "notify_on_release": a zero @val clears
   * CGRP_NOTIFY_ON_RELEASE on the cgroup owning @css, any other value sets
   * it.  Always returns 0.
   */
  static int cgroup_write_notify_on_release(struct cgroup_subsys_state *css,
  					  struct cftype *cft, u64 val)
  {
  	struct cgroup *cgrp = css->cgroup;
  
  	if (!val)
  		clear_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
  	else
  		set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
  
  	return 0;
  }
  
  static u64 cgroup_clone_children_read(struct cgroup_subsys_state *css,
  				      struct cftype *cft)
  {
  	return test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags);
  }
  
  /*
   * write_u64 handler for "cgroup.clone_children": a zero @val clears
   * CGRP_CPUSET_CLONE_CHILDREN on the cgroup owning @css, any other value
   * sets it.  Always returns 0.
   */
  static int cgroup_clone_children_write(struct cgroup_subsys_state *css,
  				       struct cftype *cft, u64 val)
  {
  	struct cgroup *cgrp = css->cgroup;
  
  	if (!val)
  		clear_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags);
  	else
  		set_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags);
  
  	return 0;
  }
  
  /* cgroup core interface files for the legacy hierarchies */
d62beb7f3   Tejun Heo   cgroup: rename fu...
603
  struct cftype cgroup1_base_files[] = {
0a268dbd7   Tejun Heo   cgroup: move cgro...
604
605
606
607
608
609
610
  	{
  		.name = "cgroup.procs",
  		.seq_start = cgroup_pidlist_start,
  		.seq_next = cgroup_pidlist_next,
  		.seq_stop = cgroup_pidlist_stop,
  		.seq_show = cgroup_pidlist_show,
  		.private = CGROUP_FILE_PROCS,
715c809d9   Tejun Heo   cgroup: reorganiz...
611
  		.write = cgroup1_procs_write,
0a268dbd7   Tejun Heo   cgroup: move cgro...
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
  	},
  	{
  		.name = "cgroup.clone_children",
  		.read_u64 = cgroup_clone_children_read,
  		.write_u64 = cgroup_clone_children_write,
  	},
  	{
  		.name = "cgroup.sane_behavior",
  		.flags = CFTYPE_ONLY_ON_ROOT,
  		.seq_show = cgroup_sane_behavior_show,
  	},
  	{
  		.name = "tasks",
  		.seq_start = cgroup_pidlist_start,
  		.seq_next = cgroup_pidlist_next,
  		.seq_stop = cgroup_pidlist_stop,
  		.seq_show = cgroup_pidlist_show,
  		.private = CGROUP_FILE_TASKS,
715c809d9   Tejun Heo   cgroup: reorganiz...
630
  		.write = cgroup1_tasks_write,
0a268dbd7   Tejun Heo   cgroup: move cgro...
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
  	},
  	{
  		.name = "notify_on_release",
  		.read_u64 = cgroup_read_notify_on_release,
  		.write_u64 = cgroup_write_notify_on_release,
  	},
  	{
  		.name = "release_agent",
  		.flags = CFTYPE_ONLY_ON_ROOT,
  		.seq_show = cgroup_release_agent_show,
  		.write = cgroup_release_agent_write,
  		.max_write_len = PATH_MAX - 1,
  	},
  	{ }	/* terminate */
  };
  
  /* Display information about each subsystem and each hierarchy */
3f3942aca   Christoph Hellwig   proc: introduce p...
648
  int proc_cgroupstats_show(struct seq_file *m, void *v)
0a268dbd7   Tejun Heo   cgroup: move cgro...
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
  {
  	struct cgroup_subsys *ss;
  	int i;
  
  	seq_puts(m, "#subsys_name\thierarchy\tnum_cgroups\tenabled
  ");
  	/*
  	 * ideally we don't want subsystems moving around while we do this.
  	 * cgroup_mutex is also necessary to guarantee an atomic snapshot of
  	 * subsys/hierarchy state.
  	 */
  	mutex_lock(&cgroup_mutex);
  
  	for_each_subsys(ss, i)
  		seq_printf(m, "%s\t%d\t%d\t%d
  ",
  			   ss->legacy_name, ss->root->hierarchy_id,
  			   atomic_read(&ss->root->nr_cgrps),
  			   cgroup_ssid_enabled(i));
  
  	mutex_unlock(&cgroup_mutex);
  	return 0;
  }
0a268dbd7   Tejun Heo   cgroup: move cgro...
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
  /**
   * cgroupstats_build - build and fill cgroupstats
   * @stats: cgroupstats to fill information into
   * @dentry: A dentry entry belonging to the cgroup for which stats have
   * been requested.
   *
   * Build and fill cgroupstats so that taskstats can export it to user
   * space.
   */
  int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry)
  {
  	struct kernfs_node *kn = kernfs_node_from_dentry(dentry);
  	struct cgroup *cgrp;
  	struct css_task_iter it;
  	struct task_struct *tsk;
  
  	/* it should be kernfs_node belonging to cgroupfs and is a directory */
  	if (dentry->d_sb->s_type != &cgroup_fs_type || !kn ||
  	    kernfs_type(kn) != KERNFS_DIR)
  		return -EINVAL;
  
  	mutex_lock(&cgroup_mutex);
  
  	/*
  	 * We aren't being called from kernfs and there's no guarantee on
  	 * @kn->priv's validity.  For this and css_tryget_online_from_dir(),
  	 * @kn->priv is RCU safe.  Let's do the RCU dancing.
  	 */
  	rcu_read_lock();
e0aed7c74   Tejun Heo   cgroup: fix RCU r...
701
  	cgrp = rcu_dereference(*(void __rcu __force **)&kn->priv);
0a268dbd7   Tejun Heo   cgroup: move cgro...
702
703
704
705
706
707
  	if (!cgrp || cgroup_is_dead(cgrp)) {
  		rcu_read_unlock();
  		mutex_unlock(&cgroup_mutex);
  		return -ENOENT;
  	}
  	rcu_read_unlock();
bc2fb7ed0   Tejun Heo   cgroup: add @flag...
708
  	css_task_iter_start(&cgrp->self, 0, &it);
0a268dbd7   Tejun Heo   cgroup: move cgro...
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
  	while ((tsk = css_task_iter_next(&it))) {
  		switch (tsk->state) {
  		case TASK_RUNNING:
  			stats->nr_running++;
  			break;
  		case TASK_INTERRUPTIBLE:
  			stats->nr_sleeping++;
  			break;
  		case TASK_UNINTERRUPTIBLE:
  			stats->nr_uninterruptible++;
  			break;
  		case TASK_STOPPED:
  			stats->nr_stopped++;
  			break;
  		default:
  			if (delayacct_is_task_waiting_on_io(tsk))
  				stats->nr_io_wait++;
  			break;
  		}
  	}
  	css_task_iter_end(&it);
  
  	mutex_unlock(&cgroup_mutex);
  	return 0;
  }
d62beb7f3   Tejun Heo   cgroup: rename fu...
734
  void cgroup1_check_for_release(struct cgroup *cgrp)
0a268dbd7   Tejun Heo   cgroup: move cgro...
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
  {
  	if (notify_on_release(cgrp) && !cgroup_is_populated(cgrp) &&
  	    !css_has_online_children(&cgrp->self) && !cgroup_is_dead(cgrp))
  		schedule_work(&cgrp->release_agent_work);
  }
  
  /*
   * Notify userspace when a cgroup is released, by running the
   * configured release agent with the name of the cgroup (path
   * relative to the root of cgroup file system) as the argument.
   *
   * Most likely, this user command will try to rmdir this cgroup.
   *
   * This races with the possibility that some other task will be
   * attached to this cgroup before it is removed, or that some other
   * user task will 'mkdir' a child cgroup of this cgroup.  That's ok.
   * The presumed 'rmdir' will fail quietly if this cgroup is no longer
   * unused, and this cgroup will be reprieved from its death sentence,
   * to continue to serve a useful existence.  Next time it's released,
   * we will get notified again, if it still has 'notify_on_release' set.
   *
   * The final arg to call_usermodehelper() is UMH_WAIT_EXEC, which
   * means only wait until the task is successfully execve()'d.  The
   * separate release agent task is forked by call_usermodehelper(),
   * then control in this thread returns here, without waiting for the
   * release agent task.  We don't bother to wait because the caller of
   * this routine has no use for the exit status of the release agent
   * task, so no sense holding our caller up for that.
   */
d62beb7f3   Tejun Heo   cgroup: rename fu...
764
  void cgroup1_release_agent(struct work_struct *work)
0a268dbd7   Tejun Heo   cgroup: move cgro...
765
766
767
  {
  	struct cgroup *cgrp =
  		container_of(work, struct cgroup, release_agent_work);
e7b20d979   Tejun Heo   cgroup: Restructu...
768
  	char *pathbuf, *agentbuf;
0a268dbd7   Tejun Heo   cgroup: move cgro...
769
770
  	char *argv[3], *envp[3];
  	int ret;
e7b20d979   Tejun Heo   cgroup: Restructu...
771
772
773
  	/* snoop agent path and exit early if empty */
  	if (!cgrp->root->release_agent_path[0])
  		return;
0a268dbd7   Tejun Heo   cgroup: move cgro...
774

e7b20d979   Tejun Heo   cgroup: Restructu...
775
  	/* prepare argument buffers */
0a268dbd7   Tejun Heo   cgroup: move cgro...
776
  	pathbuf = kmalloc(PATH_MAX, GFP_KERNEL);
e7b20d979   Tejun Heo   cgroup: Restructu...
777
778
779
  	agentbuf = kmalloc(PATH_MAX, GFP_KERNEL);
  	if (!pathbuf || !agentbuf)
  		goto out_free;
0a268dbd7   Tejun Heo   cgroup: move cgro...
780

e7b20d979   Tejun Heo   cgroup: Restructu...
781
782
783
784
785
786
787
  	spin_lock(&release_agent_path_lock);
  	strlcpy(agentbuf, cgrp->root->release_agent_path, PATH_MAX);
  	spin_unlock(&release_agent_path_lock);
  	if (!agentbuf[0])
  		goto out_free;
  
  	ret = cgroup_path_ns(cgrp, pathbuf, PATH_MAX, &init_cgroup_ns);
0a268dbd7   Tejun Heo   cgroup: move cgro...
788
  	if (ret < 0 || ret >= PATH_MAX)
e7b20d979   Tejun Heo   cgroup: Restructu...
789
  		goto out_free;
0a268dbd7   Tejun Heo   cgroup: move cgro...
790
791
792
793
794
795
796
797
798
  
  	argv[0] = agentbuf;
  	argv[1] = pathbuf;
  	argv[2] = NULL;
  
  	/* minimal command environment */
  	envp[0] = "HOME=/";
  	envp[1] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin";
  	envp[2] = NULL;
0a268dbd7   Tejun Heo   cgroup: move cgro...
799
  	call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC);
0a268dbd7   Tejun Heo   cgroup: move cgro...
800
801
802
803
804
805
806
807
  out_free:
  	kfree(agentbuf);
  	kfree(pathbuf);
  }
  
  /*
   * cgroup_rename - Only allow simple rename of directories in place.
   */
1592c9b22   Tejun Heo   cgroup: move v1 m...
808
809
  static int cgroup1_rename(struct kernfs_node *kn, struct kernfs_node *new_parent,
  			  const char *new_name_str)
0a268dbd7   Tejun Heo   cgroup: move cgro...
810
811
812
813
814
815
816
817
818
819
  {
  	struct cgroup *cgrp = kn->priv;
  	int ret;
  
  	if (kernfs_type(kn) != KERNFS_DIR)
  		return -ENOTDIR;
  	if (kn->parent != new_parent)
  		return -EIO;
  
  	/*
0a268dbd7   Tejun Heo   cgroup: move cgro...
820
821
822
823
824
825
826
827
828
829
830
  	 * We're gonna grab cgroup_mutex which nests outside kernfs
  	 * active_ref.  kernfs_rename() doesn't require active_ref
  	 * protection.  Break them before grabbing cgroup_mutex.
  	 */
  	kernfs_break_active_protection(new_parent);
  	kernfs_break_active_protection(kn);
  
  	mutex_lock(&cgroup_mutex);
  
  	ret = kernfs_rename(kn, new_parent, new_name_str);
  	if (!ret)
e4f8d81c7   Steven Rostedt (VMware)   cgroup/tracing: M...
831
  		TRACE_CGROUP_PATH(rename, cgrp);
0a268dbd7   Tejun Heo   cgroup: move cgro...
832
833
834
835
836
837
838
  
  	mutex_unlock(&cgroup_mutex);
  
  	kernfs_unbreak_active_protection(kn);
  	kernfs_unbreak_active_protection(new_parent);
  	return ret;
  }
1592c9b22   Tejun Heo   cgroup: move v1 m...
839
840
841
842
843
844
845
846
847
848
849
850
851
  static int cgroup1_show_options(struct seq_file *seq, struct kernfs_root *kf_root)
  {
  	struct cgroup_root *root = cgroup_root_from_kf(kf_root);
  	struct cgroup_subsys *ss;
  	int ssid;
  
  	for_each_subsys(ss, ssid)
  		if (root->subsys_mask & (1 << ssid))
  			seq_show_option(seq, ss->legacy_name, NULL);
  	if (root->flags & CGRP_ROOT_NOPREFIX)
  		seq_puts(seq, ",noprefix");
  	if (root->flags & CGRP_ROOT_XATTR)
  		seq_puts(seq, ",xattr");
e1cba4b85   Waiman Long   cgroup: Add mount...
852
853
  	if (root->flags & CGRP_ROOT_CPUSET_V2_MODE)
  		seq_puts(seq, ",cpuset_v2_mode");
1592c9b22   Tejun Heo   cgroup: move v1 m...
854
855
856
857
858
859
860
861
862
863
864
865
866
  
  	spin_lock(&release_agent_path_lock);
  	if (strlen(root->release_agent_path))
  		seq_show_option(seq, "release_agent",
  				root->release_agent_path);
  	spin_unlock(&release_agent_path_lock);
  
  	if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags))
  		seq_puts(seq, ",clone_children");
  	if (strlen(root->name))
  		seq_show_option(seq, "name", root->name);
  	return 0;
  }
8d2451f49   Al Viro   cgroup1: switch t...
867
868
869
870
871
872
873
874
875
876
  /*
   * Identifiers for the cgroup v1 mount options listed in
   * cgroup1_fs_parameters; cgroup1_parse_param() switches on these.
   */
  enum cgroup1_param {
  	Opt_all = 0,		/* "all" */
  	Opt_clone_children,	/* "clone_children" */
  	Opt_cpuset_v2_mode,	/* "cpuset_v2_mode" */
  	Opt_name,		/* "name=<string>" */
  	Opt_none,		/* "none" */
  	Opt_noprefix,		/* "noprefix" */
  	Opt_release_agent,	/* "release_agent=<string>" */
  	Opt_xattr,		/* "xattr" */
  };
1592c9b22   Tejun Heo   cgroup: move v1 m...
877

d7167b149   Al Viro   fs_parse: fold fs...
878
  const struct fs_parameter_spec cgroup1_fs_parameters[] = {
8d2451f49   Al Viro   cgroup1: switch t...
879
880
881
882
883
884
885
886
887
888
  	fsparam_flag  ("all",		Opt_all),
  	fsparam_flag  ("clone_children", Opt_clone_children),
  	fsparam_flag  ("cpuset_v2_mode", Opt_cpuset_v2_mode),
  	fsparam_string("name",		Opt_name),
  	fsparam_flag  ("none",		Opt_none),
  	fsparam_flag  ("noprefix",	Opt_noprefix),
  	fsparam_string("release_agent",	Opt_release_agent),
  	fsparam_flag  ("xattr",		Opt_xattr),
  	{}
  };
1592c9b22   Tejun Heo   cgroup: move v1 m...
889

8d2451f49   Al Viro   cgroup1: switch t...
890
891
892
893
894
895
  int cgroup1_parse_param(struct fs_context *fc, struct fs_parameter *param)
  {
  	struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
  	struct cgroup_subsys *ss;
  	struct fs_parse_result result;
  	int opt, i;
d7167b149   Al Viro   fs_parse: fold fs...
896
  	opt = fs_parse(fc, cgroup1_fs_parameters, param, &result);
8d2451f49   Al Viro   cgroup1: switch t...
897
898
  	if (opt == -ENOPARAM) {
  		if (strcmp(param->key, "source") == 0) {
bf81221a4   Qinglang Miao   cgroup: Fix memor...
899
900
  			if (fc->source)
  				return invalf(fc, "Multiple sources not supported");
8d2451f49   Al Viro   cgroup1: switch t...
901
902
903
904
  			fc->source = param->string;
  			param->string = NULL;
  			return 0;
  		}
1592c9b22   Tejun Heo   cgroup: move v1 m...
905
  		for_each_subsys(ss, i) {
8d2451f49   Al Viro   cgroup1: switch t...
906
  			if (strcmp(param->key, ss->legacy_name))
1592c9b22   Tejun Heo   cgroup: move v1 m...
907
  				continue;
f5dfb5315   Al Viro   cgroup: take opti...
908
  			ctx->subsys_mask |= (1 << i);
8d2451f49   Al Viro   cgroup1: switch t...
909
  			return 0;
1592c9b22   Tejun Heo   cgroup: move v1 m...
910
  		}
58c025f0e   Al Viro   cgroup1: switch t...
911
  		return invalfc(fc, "Unknown subsys name '%s'", param->key);
8d2451f49   Al Viro   cgroup1: switch t...
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
  	}
  	if (opt < 0)
  		return opt;
  
  	switch (opt) {
  	case Opt_none:
  		/* Explicitly have no subsystems */
  		ctx->none = true;
  		break;
  	case Opt_all:
  		ctx->all_ss = true;
  		break;
  	case Opt_noprefix:
  		ctx->flags |= CGRP_ROOT_NOPREFIX;
  		break;
  	case Opt_clone_children:
  		ctx->cpuset_clone_children = true;
  		break;
  	case Opt_cpuset_v2_mode:
  		ctx->flags |= CGRP_ROOT_CPUSET_V2_MODE;
  		break;
  	case Opt_xattr:
  		ctx->flags |= CGRP_ROOT_XATTR;
  		break;
  	case Opt_release_agent:
  		/* Specifying two release agents is forbidden */
  		if (ctx->release_agent)
58c025f0e   Al Viro   cgroup1: switch t...
939
  			return invalfc(fc, "release_agent respecified");
8d2451f49   Al Viro   cgroup1: switch t...
940
941
942
943
944
945
  		ctx->release_agent = param->string;
  		param->string = NULL;
  		break;
  	case Opt_name:
  		/* blocked by boot param? */
  		if (cgroup_no_v1_named)
1592c9b22   Tejun Heo   cgroup: move v1 m...
946
  			return -ENOENT;
8d2451f49   Al Viro   cgroup1: switch t...
947
948
  		/* Can't specify an empty name */
  		if (!param->size)
58c025f0e   Al Viro   cgroup1: switch t...
949
  			return invalfc(fc, "Empty name");
8d2451f49   Al Viro   cgroup1: switch t...
950
  		if (param->size > MAX_CGROUP_ROOT_NAMELEN - 1)
58c025f0e   Al Viro   cgroup1: switch t...
951
  			return invalfc(fc, "Name too long");
8d2451f49   Al Viro   cgroup1: switch t...
952
953
954
955
956
957
958
  		/* Must match [\w.-]+ */
  		for (i = 0; i < param->size; i++) {
  			char c = param->string[i];
  			if (isalnum(c))
  				continue;
  			if ((c == '.') || (c == '-') || (c == '_'))
  				continue;
58c025f0e   Al Viro   cgroup1: switch t...
959
  			return invalfc(fc, "Invalid name");
8d2451f49   Al Viro   cgroup1: switch t...
960
961
962
  		}
  		/* Specifying two names is forbidden */
  		if (ctx->name)
58c025f0e   Al Viro   cgroup1: switch t...
963
  			return invalfc(fc, "name respecified");
8d2451f49   Al Viro   cgroup1: switch t...
964
965
966
  		ctx->name = param->string;
  		param->string = NULL;
  		break;
1592c9b22   Tejun Heo   cgroup: move v1 m...
967
  	}
f5dfb5315   Al Viro   cgroup: take opti...
968
969
  	return 0;
  }
8d2451f49   Al Viro   cgroup1: switch t...
970
  static int check_cgroupfs_options(struct fs_context *fc)
f5dfb5315   Al Viro   cgroup: take opti...
971
  {
8d2451f49   Al Viro   cgroup1: switch t...
972
  	struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
f5dfb5315   Al Viro   cgroup: take opti...
973
974
975
976
977
978
979
980
981
982
983
984
985
  	u16 mask = U16_MAX;
  	u16 enabled = 0;
  	struct cgroup_subsys *ss;
  	int i;
  
  #ifdef CONFIG_CPUSETS
  	mask = ~((u16)1 << cpuset_cgrp_id);
  #endif
  	for_each_subsys(ss, i)
  		if (cgroup_ssid_enabled(i) && !cgroup1_ssid_disabled(i))
  			enabled |= 1 << i;
  
  	ctx->subsys_mask &= enabled;
1592c9b22   Tejun Heo   cgroup: move v1 m...
986
987
  
  	/*
f5dfb5315   Al Viro   cgroup: take opti...
988
989
  	 * In absense of 'none', 'name=' or subsystem name options,
  	 * let's default to 'all'.
1592c9b22   Tejun Heo   cgroup: move v1 m...
990
  	 */
f5dfb5315   Al Viro   cgroup: take opti...
991
992
993
994
995
996
  	if (!ctx->subsys_mask && !ctx->none && !ctx->name)
  		ctx->all_ss = true;
  
  	if (ctx->all_ss) {
  		/* Mutually exclusive option 'all' + subsystem name */
  		if (ctx->subsys_mask)
58c025f0e   Al Viro   cgroup1: switch t...
997
  			return invalfc(fc, "subsys name conflicts with all");
f5dfb5315   Al Viro   cgroup: take opti...
998
999
1000
  		/* 'all' => select all the subsystems */
  		ctx->subsys_mask = enabled;
  	}
1592c9b22   Tejun Heo   cgroup: move v1 m...
1001
1002
1003
1004
1005
  
  	/*
  	 * We either have to specify by name or by subsystems. (So all
  	 * empty hierarchies must have a name).
  	 */
f5dfb5315   Al Viro   cgroup: take opti...
1006
  	if (!ctx->subsys_mask && !ctx->name)
58c025f0e   Al Viro   cgroup1: switch t...
1007
  		return invalfc(fc, "Need name or subsystem set");
1592c9b22   Tejun Heo   cgroup: move v1 m...
1008
1009
1010
1011
1012
1013
  
  	/*
  	 * Option noprefix was introduced just for backward compatibility
  	 * with the old cpuset, so we allow noprefix only if mounting just
  	 * the cpuset subsystem.
  	 */
f5dfb5315   Al Viro   cgroup: take opti...
1014
  	if ((ctx->flags & CGRP_ROOT_NOPREFIX) && (ctx->subsys_mask & mask))
58c025f0e   Al Viro   cgroup1: switch t...
1015
  		return invalfc(fc, "noprefix used incorrectly");
1592c9b22   Tejun Heo   cgroup: move v1 m...
1016
1017
  
  	/* Can't specify "none" and some subsystems */
f5dfb5315   Al Viro   cgroup: take opti...
1018
  	if (ctx->subsys_mask && ctx->none)
58c025f0e   Al Viro   cgroup1: switch t...
1019
  		return invalfc(fc, "none used incorrectly");
1592c9b22   Tejun Heo   cgroup: move v1 m...
1020
1021
1022
  
  	return 0;
  }
90129625d   Al Viro   cgroup: start swi...
1023
  int cgroup1_reconfigure(struct fs_context *fc)
1592c9b22   Tejun Heo   cgroup: move v1 m...
1024
  {
90129625d   Al Viro   cgroup: start swi...
1025
1026
  	struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
  	struct kernfs_root *kf_root = kernfs_root_from_sb(fc->root->d_sb);
1592c9b22   Tejun Heo   cgroup: move v1 m...
1027
  	struct cgroup_root *root = cgroup_root_from_kf(kf_root);
90129625d   Al Viro   cgroup: start swi...
1028
  	int ret = 0;
1592c9b22   Tejun Heo   cgroup: move v1 m...
1029
1030
1031
1032
1033
  	u16 added_mask, removed_mask;
  
  	cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp);
  
  	/* See what subsystems are wanted */
8d2451f49   Al Viro   cgroup1: switch t...
1034
  	ret = check_cgroupfs_options(fc);
1592c9b22   Tejun Heo   cgroup: move v1 m...
1035
1036
  	if (ret)
  		goto out_unlock;
f5dfb5315   Al Viro   cgroup: take opti...
1037
  	if (ctx->subsys_mask != root->subsys_mask || ctx->release_agent)
1592c9b22   Tejun Heo   cgroup: move v1 m...
1038
1039
1040
  		pr_warn("option changes via remount are deprecated (pid=%d comm=%s)
  ",
  			task_tgid_nr(current), current->comm);
f5dfb5315   Al Viro   cgroup: take opti...
1041
1042
  	added_mask = ctx->subsys_mask & ~root->subsys_mask;
  	removed_mask = root->subsys_mask & ~ctx->subsys_mask;
1592c9b22   Tejun Heo   cgroup: move v1 m...
1043
1044
  
  	/* Don't allow flags or name to change at remount */
f5dfb5315   Al Viro   cgroup: take opti...
1045
1046
  	if ((ctx->flags ^ root->flags) ||
  	    (ctx->name && strcmp(ctx->name, root->name))) {
58c025f0e   Al Viro   cgroup1: switch t...
1047
  		errorfc(fc, "option or name mismatch, new: 0x%x \"%s\", old: 0x%x \"%s\"",
f5dfb5315   Al Viro   cgroup: take opti...
1048
  		       ctx->flags, ctx->name ?: "", root->flags, root->name);
1592c9b22   Tejun Heo   cgroup: move v1 m...
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
  		ret = -EINVAL;
  		goto out_unlock;
  	}
  
  	/* remounting is not allowed for populated hierarchies */
  	if (!list_empty(&root->cgrp.self.children)) {
  		ret = -EBUSY;
  		goto out_unlock;
  	}
  
  	ret = rebind_subsystems(root, added_mask);
  	if (ret)
  		goto out_unlock;
  
  	WARN_ON(rebind_subsystems(&cgrp_dfl_root, removed_mask));
f5dfb5315   Al Viro   cgroup: take opti...
1064
  	if (ctx->release_agent) {
1592c9b22   Tejun Heo   cgroup: move v1 m...
1065
  		spin_lock(&release_agent_path_lock);
f5dfb5315   Al Viro   cgroup: take opti...
1066
  		strcpy(root->release_agent_path, ctx->release_agent);
1592c9b22   Tejun Heo   cgroup: move v1 m...
1067
1068
1069
1070
1071
1072
  		spin_unlock(&release_agent_path_lock);
  	}
  
  	trace_cgroup_remount(root);
  
   out_unlock:
1592c9b22   Tejun Heo   cgroup: move v1 m...
1073
1074
1075
1076
1077
1078
1079
  	mutex_unlock(&cgroup_mutex);
  	return ret;
  }
  
  struct kernfs_syscall_ops cgroup1_kf_syscall_ops = {
  	.rename			= cgroup1_rename,
  	.show_options		= cgroup1_show_options,
1592c9b22   Tejun Heo   cgroup: move v1 m...
1080
1081
1082
1083
  	.mkdir			= cgroup_mkdir,
  	.rmdir			= cgroup_rmdir,
  	.show_path		= cgroup_show_path,
  };
6678889f0   Al Viro   cgroup1_get_tree(...
1084
1085
1086
1087
1088
1089
1090
1091
1092
  /*
   * The guts of cgroup1 mount - find or create cgroup_root to use.
   * Called with cgroup_mutex held; returns 0 on success, -E... on
   * error and positive - in case when the candidate is busy dying.
   * On success it stashes a reference to cgroup_root into given
   * cgroup_fs_context; that reference is *NOT* counting towards the
   * cgroup_root refcount.
   */
  static int cgroup1_root_to_use(struct fs_context *fc)
1592c9b22   Tejun Heo   cgroup: move v1 m...
1093
  {
7feeef586   Al Viro   cgroup: fold cgro...
1094
  	struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
1592c9b22   Tejun Heo   cgroup: move v1 m...
1095
1096
  	struct cgroup_root *root;
  	struct cgroup_subsys *ss;
1592c9b22   Tejun Heo   cgroup: move v1 m...
1097
  	int i, ret;
1592c9b22   Tejun Heo   cgroup: move v1 m...
1098
  	/* First find the desired set of subsystems */
8d2451f49   Al Viro   cgroup1: switch t...
1099
  	ret = check_cgroupfs_options(fc);
1592c9b22   Tejun Heo   cgroup: move v1 m...
1100
  	if (ret)
6678889f0   Al Viro   cgroup1_get_tree(...
1101
  		return ret;
1592c9b22   Tejun Heo   cgroup: move v1 m...
1102
1103
1104
1105
1106
1107
1108
1109
1110
  
  	/*
  	 * Destruction of cgroup root is asynchronous, so subsystems may
  	 * still be dying after the previous unmount.  Let's drain the
  	 * dying subsystems.  We just need to ensure that the ones
  	 * unmounted previously finish dying and don't care about new ones
  	 * starting.  Testing ref liveliness is good enough.
  	 */
  	for_each_subsys(ss, i) {
f5dfb5315   Al Viro   cgroup: take opti...
1111
  		if (!(ctx->subsys_mask & (1 << i)) ||
1592c9b22   Tejun Heo   cgroup: move v1 m...
1112
1113
  		    ss->root == &cgrp_dfl_root)
  			continue;
6678889f0   Al Viro   cgroup1_get_tree(...
1114
1115
  		if (!percpu_ref_tryget_live(&ss->root->cgrp.self.refcnt))
  			return 1;	/* restart */
1592c9b22   Tejun Heo   cgroup: move v1 m...
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
  		cgroup_put(&ss->root->cgrp);
  	}
  
  	for_each_root(root) {
  		bool name_match = false;
  
  		if (root == &cgrp_dfl_root)
  			continue;
  
  		/*
  		 * If we asked for a name then it must match.  Also, if
  		 * name matches but sybsys_mask doesn't, we should fail.
  		 * Remember whether name matched.
  		 */
f5dfb5315   Al Viro   cgroup: take opti...
1130
1131
  		if (ctx->name) {
  			if (strcmp(ctx->name, root->name))
1592c9b22   Tejun Heo   cgroup: move v1 m...
1132
1133
1134
1135
1136
1137
1138
1139
  				continue;
  			name_match = true;
  		}
  
  		/*
  		 * If we asked for subsystems (or explicitly for no
  		 * subsystems) then they must match.
  		 */
f5dfb5315   Al Viro   cgroup: take opti...
1140
1141
  		if ((ctx->subsys_mask || ctx->none) &&
  		    (ctx->subsys_mask != root->subsys_mask)) {
1592c9b22   Tejun Heo   cgroup: move v1 m...
1142
1143
  			if (!name_match)
  				continue;
6678889f0   Al Viro   cgroup1_get_tree(...
1144
  			return -EBUSY;
1592c9b22   Tejun Heo   cgroup: move v1 m...
1145
  		}
f5dfb5315   Al Viro   cgroup: take opti...
1146
  		if (root->flags ^ ctx->flags)
1592c9b22   Tejun Heo   cgroup: move v1 m...
1147
1148
  			pr_warn("new mount options do not match the existing superblock, will be ignored
  ");
cf6299b1d   Al Viro   cgroup: stash cgr...
1149
  		ctx->root = root;
6678889f0   Al Viro   cgroup1_get_tree(...
1150
  		return 0;
1592c9b22   Tejun Heo   cgroup: move v1 m...
1151
1152
1153
1154
1155
1156
1157
  	}
  
  	/*
  	 * No such thing, create a new one.  name= matching without subsys
  	 * specification is allowed for already existing hierarchies but we
  	 * can't create new one without subsys specification.
  	 */
6678889f0   Al Viro   cgroup1_get_tree(...
1158
  	if (!ctx->subsys_mask && !ctx->none)
58c025f0e   Al Viro   cgroup1: switch t...
1159
  		return invalfc(fc, "No subsys list or none specified");
1592c9b22   Tejun Heo   cgroup: move v1 m...
1160
1161
  
  	/* Hierarchies may only be created in the initial cgroup namespace. */
cca8f3271   Al Viro   cgroup: store a r...
1162
  	if (ctx->ns != &init_cgroup_ns)
6678889f0   Al Viro   cgroup1_get_tree(...
1163
  		return -EPERM;
1592c9b22   Tejun Heo   cgroup: move v1 m...
1164
1165
  
  	root = kzalloc(sizeof(*root), GFP_KERNEL);
6678889f0   Al Viro   cgroup1_get_tree(...
1166
1167
  	if (!root)
  		return -ENOMEM;
1592c9b22   Tejun Heo   cgroup: move v1 m...
1168

cf6299b1d   Al Viro   cgroup: stash cgr...
1169
1170
  	ctx->root = root;
  	init_cgroup_root(ctx);
1592c9b22   Tejun Heo   cgroup: move v1 m...
1171

f5dfb5315   Al Viro   cgroup: take opti...
1172
  	ret = cgroup_setup_root(root, ctx->subsys_mask);
1592c9b22   Tejun Heo   cgroup: move v1 m...
1173
1174
  	if (ret)
  		cgroup_free_root(root);
6678889f0   Al Viro   cgroup1_get_tree(...
1175
1176
1177
1178
1179
  	return ret;
  }
  
  int cgroup1_get_tree(struct fs_context *fc)
  {
6678889f0   Al Viro   cgroup1_get_tree(...
1180
1181
1182
1183
  	struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
  	int ret;
  
  	/* Check if the caller has permission to mount. */
cca8f3271   Al Viro   cgroup: store a r...
1184
  	if (!ns_capable(ctx->ns->user_ns, CAP_SYS_ADMIN))
6678889f0   Al Viro   cgroup1_get_tree(...
1185
1186
1187
1188
1189
1190
1191
  		return -EPERM;
  
  	cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp);
  
  	ret = cgroup1_root_to_use(fc);
  	if (!ret && !percpu_ref_tryget_live(&ctx->root->cgrp.self.refcnt))
  		ret = 1;	/* restart */
1592c9b22   Tejun Heo   cgroup: move v1 m...
1192

1592c9b22   Tejun Heo   cgroup: move v1 m...
1193
  	mutex_unlock(&cgroup_mutex);
1592c9b22   Tejun Heo   cgroup: move v1 m...
1194

6678889f0   Al Viro   cgroup1_get_tree(...
1195
  	if (!ret)
cca8f3271   Al Viro   cgroup: store a r...
1196
  		ret = cgroup_do_get_tree(fc);
6678889f0   Al Viro   cgroup1_get_tree(...
1197
1198
  
  	if (!ret && percpu_ref_is_dying(&ctx->root->cgrp.self.refcnt)) {
71d883c37   Al Viro   cgroup_do_mount()...
1199
1200
  		struct super_block *sb = fc->root->d_sb;
  		dput(fc->root);
35ac11842   Al Viro   cgroup: saner ref...
1201
  		deactivate_locked_super(sb);
6678889f0   Al Viro   cgroup1_get_tree(...
1202
1203
1204
1205
  		ret = 1;
  	}
  
  	if (unlikely(ret > 0)) {
35ac11842   Al Viro   cgroup: saner ref...
1206
  		msleep(10);
7feeef586   Al Viro   cgroup: fold cgro...
1207
  		return restart_syscall();
9732adc5d   Zefan Li   cgroup: avoid att...
1208
  	}
71d883c37   Al Viro   cgroup_do_mount()...
1209
  	return ret;
1592c9b22   Tejun Heo   cgroup: move v1 m...
1210
  }
0a268dbd7   Tejun Heo   cgroup: move cgro...
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
  /*
   * Allocate the workqueue used to destroy pidlists.  It is kept
   * separate so it can serve as a flush domain, and @max_active is
   * capped to 1.  Allocation failure is fatal.
   */
  static int __init cgroup1_wq_init(void)
  {
  	struct workqueue_struct *wq;

  	wq = alloc_workqueue("cgroup_pidlist_destroy", 0, 1);
  	BUG_ON(!wq);

  	cgroup_pidlist_destroy_wq = wq;
  	return 0;
  }
  core_initcall(cgroup1_wq_init);
  
  /*
   * Parse the "cgroup_no_v1=" boot parameter: a comma-separated list in
   * which "all" blocks every controller on v1, "named" disables named
   * v1 mounts, and any other token blocks the controller whose name or
   * legacy name it matches.  Empty tokens are skipped.  Always returns 1.
   */
  static int __init cgroup_no_v1(char *str)
  {
  	struct cgroup_subsys *ss;
  	char *token;
  	int i;

  	for (token = strsep(&str, ","); token; token = strsep(&str, ",")) {
  		if (*token == '\0')
  			continue;

  		if (strcmp(token, "all") == 0) {
  			cgroup_no_v1_mask = U16_MAX;
  			continue;
  		}

  		if (strcmp(token, "named") == 0) {
  			cgroup_no_v1_named = true;
  			continue;
  		}

  		/* Block every controller matching by name or legacy name. */
  		for_each_subsys(ss, i) {
  			if (strcmp(token, ss->name) == 0 ||
  			    strcmp(token, ss->legacy_name) == 0)
  				cgroup_no_v1_mask |= 1 << i;
  		}
  	}
  	return 1;
  }
  __setup("cgroup_no_v1=", cgroup_no_v1);