Task Control Groups: add tasks file interface

Add the per-directory "tasks" file for cgroupfs mounts. This allows the user to determine which tasks are members of a cgroup by reading the cgroup's "tasks" file, and to move a task into a cgroup by writing its pid to that file.

Signed-off-by: Paul Menage <menage@google.com>
Cc: Serge E. Hallyn <serue@us.ibm.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Dave Hansen <haveblue@us.ibm.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: Paul Jackson <pj@sgi.com>
Cc: Kirill Korotaev <dev@openvz.org>
Cc: Herbert Poetzl <herbert@13thfloor.at>
Cc: Srivatsa Vaddagiri <vatsa@in.ibm.com>
Cc: Cedric Le Goater <clg@fr.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

Task Control Groups: add tasks file interface
Add the per-directory "tasks" file for cgroupfs mounts. This allows the user to determine which tasks are members of a cgroup by reading the cgroup's "tasks" file, and to move a task into a cgroup by writing its pid to that file.

Signed-off-by: Paul Menage <menage@google.com>
Cc: Serge E. Hallyn <serue@us.ibm.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Dave Hansen <haveblue@us.ibm.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: Paul Jackson <pj@sgi.com>
Cc: Kirill Korotaev <dev@openvz.org>
Cc: Herbert Poetzl <herbert@13thfloor.at>
Cc: Srivatsa Vaddagiri <vatsa@in.ibm.com>
Cc: Cedric Le Goater <clg@fr.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Paul Menage · Linus Torvalds
1 parent ddbcc7e8e5
Showing 2 changed files with 368 additions and 1 deletions Side-by-side Diff
include/linux/cgroup.h
kernel/cgroup.c
@@ -144,6 +144,16 @@
  
 int cgroup_path(const struct cgroup *cont, char *buf, int buflen);
  
+int __cgroup_task_count(const struct cgroup *cont);
+/*
+ * Return the number of tasks attached to 'cont'.
+ *
+ * Convenience wrapper that enters an RCU read-side critical section
+ * around __cgroup_task_count(), so the caller does not need to hold
+ * any lock itself.
+ */
+static inline int cgroup_task_count(const struct cgroup *cont)
+{
+	int nr_tasks;
+
+	rcu_read_lock();
+	nr_tasks = __cgroup_task_count(cont);
+	rcu_read_unlock();
+
+	return nr_tasks;
+}
+
 /* Return true if the cgroup is a descendant of the current cgroup */
 int cgroup_is_descendant(const struct cgroup *cont);
  
@@ -40,7 +40,7 @@
 #include <linux/magic.h>
 #include <linux/spinlock.h>
 #include <linux/string.h>
-
+#include <linux/sort.h>
 #include <asm/atomic.h>
  
 /* Generate an array of cgroup subsystem pointers */
@@ -700,6 +700,127 @@
 	return 0;
 }
  
+/*
+ * Look up the first subsystem on the subsys_list of the hierarchy that
+ * 'cont' belongs to.
+ *
+ * Either output pointer may be NULL, in which case it is skipped:
+ *   css       - receives cont's state object for that subsystem
+ *   subsys_id - receives that subsystem's id
+ *
+ * BUGs if the hierarchy has no subsystems attached, or if 'css' was
+ * requested and 'cont' has no state object for the subsystem.
+ */
+static void get_first_subsys(const struct cgroup *cont,
+			struct cgroup_subsys_state **css, int *subsys_id)
+{
+	const struct cgroupfs_root *hierarchy = cont->root;
+	const struct cgroup_subsys *first;
+	int id;
+
+	BUG_ON(list_empty(&hierarchy->subsys_list));
+	first = list_entry(hierarchy->subsys_list.next,
+			   struct cgroup_subsys, sibling);
+	id = first->subsys_id;
+
+	if (css) {
+		*css = cont->subsys[id];
+		BUG_ON(!*css);
+	}
+	if (subsys_id)
+		*subsys_id = id;
+}
+
+/*
+ * Attach task 'tsk' to cgroup 'cont'
+ *
+ * Call holding cgroup_mutex.  Takes task_lock(tsk) around the update
+ * of the task's subsystem state pointers.
+ *
+ * Steps, in order:
+ *  1. Return 0 straight away if 'tsk' is already in 'cont' (judged by
+ *     the first subsystem of cont's hierarchy).
+ *  2. Ask every subsystem in the hierarchy that has a can_attach()
+ *     callback; the first non-zero result is returned unchanged.
+ *  3. Under task_lock(tsk), fail with -ESRCH if the task is exiting,
+ *     otherwise repoint tsk->cgroups.subsys[] at cont's state objects.
+ *  4. After dropping the lock, call each subsystem's attach() callback
+ *     with both the new and the old cgroup.
+ *  5. synchronize_rcu() before returning 0.
+ */
+static int attach_task(struct cgroup *cont, struct task_struct *tsk)
+{
+	int retval = 0;
+	struct cgroup_subsys *ss;
+	struct cgroup *oldcont;
+	struct css_set *cg = &tsk->cgroups;
+	struct cgroupfs_root *root = cont->root;
+	int i;
+	int subsys_id;
+
+	get_first_subsys(cont, NULL, &subsys_id);
+
+	/* Nothing to do if the task is already in that cgroup */
+	oldcont = task_cgroup(tsk, subsys_id);
+	if (cont == oldcont)
+		return 0;
+
+	for_each_subsys(root, ss) {
+		if (ss->can_attach) {
+			retval = ss->can_attach(ss, cont, tsk);
+			if (retval) {
+				return retval;	/* subsystem vetoed the move */
+			}
+		}
+	}
+
+	task_lock(tsk);
+	if (tsk->flags & PF_EXITING) {
+		task_unlock(tsk);
+		return -ESRCH;
+	}
+	/* Update the css_set pointers for the subsystems in this
+	 * hierarchy; subsystems outside it keep their current state */
+	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
+		if (root->subsys_bits & (1ull << i)) {
+			/* Subsystem is in this hierarchy. So we want
+			 * the subsystem state from the new
+			 * cgroup. Transfer the refcount from the
+			 * old to the new: one count is moved per
+			 * subsystem in the hierarchy */
+			atomic_inc(&cont->count);
+			atomic_dec(&cg->subsys[i]->cgroup->count);
+			rcu_assign_pointer(cg->subsys[i], cont->subsys[i]);
+		}
+	}
+	task_unlock(tsk);
+
+	for_each_subsys(root, ss) {
+		if (ss->attach) {
+			ss->attach(ss, cont, oldcont, tsk);
+		}
+	}
+
+	synchronize_rcu();	/* NOTE(review): presumably lets RCU readers of the old pointers finish - confirm */
+	return 0;
+}
+
+/*
+ * Parse a decimal pid from 'pidbuf' and attach that task to 'cont'.
+ * A pid of 0 selects the calling task.
+ *
+ * Call with cgroup_mutex held; attach_task() may take the task's
+ * task_lock.
+ *
+ * Returns the result of attach_task(), or:
+ *   -EIO     if no integer could be parsed from 'pidbuf'
+ *   -ESRCH   if no such task exists or it is already exiting
+ *   -EACCES  if the caller's euid is non-zero and matches neither the
+ *            target's uid nor its suid
+ */
+static int attach_task_by_pid(struct cgroup *cont, char *pidbuf)
+{
+	pid_t nr;
+	struct task_struct *target;
+	int err;
+
+	if (sscanf(pidbuf, "%d", &nr) != 1)
+		return -EIO;
+
+	if (!nr) {
+		target = current;
+		get_task_struct(target);
+	} else {
+		/* Pin the task before leaving the RCU read section */
+		rcu_read_lock();
+		target = find_task_by_pid(nr);
+		if (!target || (target->flags & PF_EXITING)) {
+			rcu_read_unlock();
+			return -ESRCH;
+		}
+		get_task_struct(target);
+		rcu_read_unlock();
+
+		if (current->euid != 0 &&
+		    current->euid != target->uid &&
+		    current->euid != target->suid) {
+			err = -EACCES;
+			goto out_put;
+		}
+	}
+
+	err = attach_task(cont, target);
+out_put:
+	put_task_struct(target);
+	return err;
+}
+
 /* The various types of files and directories in a cgroup file system */
  
 enum cgroup_filetype {
@@ -708,6 +829,55 @@
 	FILE_TASKLIST,
 };
  
+/*
+ * Write handler shared by the built-in cgroup control files; the kind
+ * of file is taken from cft->private.
+ *
+ * The user data is copied into a nul-terminated kernel buffer and then
+ * dispatched under cgroup_mutex.  Returns 'nbytes' on success, or:
+ *   -E2BIG   if nbytes >= PATH_MAX
+ *   -ENOMEM  if the kernel buffer cannot be allocated
+ *   -EFAULT  if the user buffer cannot be copied
+ *   -ENODEV  if the cgroup has been removed
+ *   -EINVAL  for a file type this handler does not know
+ *   or the error returned by the per-type handler.
+ */
+static ssize_t cgroup_common_file_write(struct cgroup *cont,
+					   struct cftype *cft,
+					   struct file *file,
+					   const char __user *userbuf,
+					   size_t nbytes, loff_t *unused_ppos)
+{
+	enum cgroup_filetype type = cft->private;
+	char *kbuf;
+	int ret;
+
+	if (nbytes >= PATH_MAX)
+		return -E2BIG;
+
+	/* One extra byte so the copy can be nul-terminated */
+	kbuf = kmalloc(nbytes + 1, GFP_KERNEL);
+	if (!kbuf)
+		return -ENOMEM;
+
+	if (copy_from_user(kbuf, userbuf, nbytes)) {
+		ret = -EFAULT;
+		goto out_free;
+	}
+	kbuf[nbytes] = 0;
+
+	mutex_lock(&cgroup_mutex);
+
+	if (cgroup_is_removed(cont))
+		ret = -ENODEV;
+	else if (type == FILE_TASKLIST)
+		ret = attach_task_by_pid(cont, kbuf);
+	else
+		ret = -EINVAL;
+
+	/* A handler result of 0 means the whole write was consumed */
+	if (!ret)
+		ret = nbytes;
+
+	mutex_unlock(&cgroup_mutex);
+out_free:
+	kfree(kbuf);
+	return ret;
+}
+
 static ssize_t cgroup_file_write(struct file *file, const char __user *buf,
 						size_t nbytes, loff_t *ppos)
 {
@@ -914,6 +1084,189 @@
 	return 0;
 }
  
+/*
+ * Count the tasks in a cgroup by walking every thread in the system
+ * and comparing its state for the hierarchy's first subsystem against
+ * the cgroup's.  Could be made more time-efficient but less
+ * space-efficient with more linked lists running through each cgroup
+ * and the css_set structures that reference it.
+ *
+ * Must be called with tasklist_lock held for read or write, or inside
+ * an RCU critical section.
+ */
+int __cgroup_task_count(const struct cgroup *cont)
+{
+	struct cgroup_subsys_state *wanted;
+	struct task_struct *leader, *thread;
+	int id;
+	int nr_tasks = 0;
+
+	get_first_subsys(cont, &wanted, &id);
+
+	do_each_thread(leader, thread) {
+		if (task_subsys_state(thread, id) == wanted)
+			nr_tasks++;
+	} while_each_thread(leader, thread);
+
+	return nr_tasks;
+}
+
+/*
+ * Stuff for reading the 'tasks' file.
+ *
+ * Reading this file can return large amounts of data if a cgroup has
+ * *lots* of attached tasks. So it may need several calls to read(),
+ * but we cannot guarantee that the information we produce is correct
+ * unless we produce it entirely atomically.
+ *
+ * Upon tasks file open(), a struct ctr_struct is allocated, that
+ * will have a pointer to a character buffer (also allocated at open
+ * time).  The struct ctr_struct * is stored in file->private_data.
+ * Its resources will be freed by release() when the file is closed.
+ * The buffer is filled once with the formatted PIDs and every
+ * subsequent read() is served from it.
+ */
+struct ctr_struct {
+	char *buf;	/* newline-separated decimal pids; NULL when the cgroup was empty at open */
+	int bufsz;	/* number of bytes of 'buf' that read() may return */
+};
+
+/*
+ * Fill 'pidarray' with the pids of the tasks attached to 'cont' and
+ * return how many entries were stored.  Loading stops once 'npids'
+ * entries have been written; the limit is tested after each store, so
+ * 'npids' must be at least 1.
+ *
+ * No need to task_lock(p) when reading out a task's cgroup state:
+ * we're in an RCU read section, so the css_set can't go away, and is
+ * immutable after creation.
+ */
+static int pid_array_load(pid_t *pidarray, int npids, struct cgroup *cont)
+{
+	struct cgroup_subsys_state *wanted;
+	struct task_struct *leader, *thread;
+	int id;
+	int loaded = 0;
+
+	get_first_subsys(cont, &wanted, &id);
+
+	rcu_read_lock();
+	do_each_thread(leader, thread) {
+		if (task_subsys_state(thread, id) == wanted) {
+			pidarray[loaded] = pid_nr(task_pid(thread));
+			loaded++;
+			/* goto, not break: this is a nested loop */
+			if (unlikely(loaded == npids))
+				goto done;
+		}
+	} while_each_thread(leader, thread);
+
+done:
+	rcu_read_unlock();
+	return loaded;
+}
+
+/*
+ * sort() comparison callback: order two pid_t values ascending.
+ *
+ * Returns a negative value, zero or a positive value when *a is less
+ * than, equal to or greater than *b.  The operands are compared
+ * directly rather than subtracted, so the result cannot overflow, and
+ * the const qualifier of the arguments is preserved.
+ */
+static int cmppid(const void *a, const void *b)
+{
+	const pid_t lhs = *(const pid_t *)a;
+	const pid_t rhs = *(const pid_t *)b;
+
+	return (lhs > rhs) - (lhs < rhs);
+}
+
+/*
+ * Format the 'npids' entries of array 'a' into 'buf' as decimal
+ * numbers, each followed by a newline.  At most 'sz' chars are
+ * written, but the return value is the number of chars that would be
+ * written if buf were large enough, which lets a caller size the
+ * buffer with a first pass.
+ */
+static int pid_array_to_buf(char *buf, int sz, pid_t *a, int npids)
+{
+	int written = 0;
+	int idx = 0;
+
+	while (idx < npids) {
+		/* Once the buffer is full, keep counting with zero room */
+		int room = max(sz - written, 0);
+
+		written += snprintf(buf + written, room, "%d\n", a[idx]);
+		idx++;
+	}
+	return written;
+}
+
+/*
+ * Handle an open on 'tasks' file.  For readers, build a text snapshot
+ * of the process id's of tasks currently attached to the cgroup being
+ * opened and hang it off file->private_data; opens without read
+ * access need no preparation.
+ *
+ * Does not require any specific cgroup mutexes, and does not take any.
+ *
+ * Returns 0 on success or -ENOMEM if an allocation fails.
+ */
+static int cgroup_tasks_open(struct inode *unused, struct file *file)
+{
+	struct cgroup *cont = __d_cont(file->f_dentry->d_parent);
+	struct ctr_struct *ctr;
+	pid_t *pids;
+	int count;
+	char probe;
+
+	if (!(file->f_mode & FMODE_READ))
+		return 0;
+
+	ctr = kmalloc(sizeof(*ctr), GFP_KERNEL);
+	if (!ctr)
+		return -ENOMEM;
+
+	/*
+	 * If cgroup gets more users after we read count, we won't have
+	 * enough space - tough.  This race is indistinguishable to the
+	 * caller from the case that the additional cgroup users didn't
+	 * show up until sometime later on.
+	 */
+	count = cgroup_task_count(cont);
+	if (!count) {
+		ctr->buf = NULL;
+		ctr->bufsz = 0;
+		goto publish;
+	}
+
+	pids = kmalloc(count * sizeof(pid_t), GFP_KERNEL);
+	if (!pids)
+		goto fail_ctr;
+
+	count = pid_array_load(pids, count, cont);
+	sort(pids, count, sizeof(pid_t), cmppid, NULL);
+
+	/*
+	 * Two formatting passes: the first only measures the text (the
+	 * extra byte leaves room for the terminating nul), the second
+	 * fills the real buffer and records the text length.
+	 */
+	ctr->bufsz = pid_array_to_buf(&probe, sizeof(probe), pids, count) + 1;
+	ctr->buf = kmalloc(ctr->bufsz, GFP_KERNEL);
+	if (!ctr->buf)
+		goto fail_pids;
+	ctr->bufsz = pid_array_to_buf(ctr->buf, ctr->bufsz, pids, count);
+
+	kfree(pids);
+publish:
+	file->private_data = ctr;
+	return 0;
+
+fail_pids:
+	kfree(pids);
+fail_ctr:
+	kfree(ctr);
+	return -ENOMEM;
+}
+
+/*
+ * Read handler for the 'tasks' file: copy out of the text snapshot
+ * stored in file->private_data, honouring and advancing *ppos.
+ */
+static ssize_t cgroup_tasks_read(struct cgroup *cont,
+				    struct cftype *cft,
+				    struct file *file, char __user *buf,
+				    size_t nbytes, loff_t *ppos)
+{
+	struct ctr_struct *snapshot;
+
+	snapshot = file->private_data;
+	return simple_read_from_buffer(buf, nbytes, ppos,
+				       snapshot->buf, snapshot->bufsz);
+}
+
+/*
+ * Release handler for the 'tasks' file: free the snapshot stored in
+ * file->private_data.  Files opened without read access have nothing
+ * to free.  Always returns 0.
+ */
+static int cgroup_tasks_release(struct inode *unused_inode,
+					struct file *file)
+{
+	struct ctr_struct *ctr;
+
+	if (!(file->f_mode & FMODE_READ))
+		return 0;
+
+	ctr = file->private_data;
+	kfree(ctr->buf);
+	kfree(ctr);
+	return 0;
+}
+
+/*
+ * Descriptor for the per-cgroup "tasks" file.  For the common
+ * functions, 'private' gives the type of file.
+ */
+static struct cftype cft_tasks = {
+	.name = "tasks",
+	.private = FILE_TASKLIST,
+	.open = cgroup_tasks_open,
+	.release = cgroup_tasks_release,
+	.read = cgroup_tasks_read,
+	.write = cgroup_common_file_write,
+};
+
 static int cgroup_populate_dir(struct cgroup *cont)
 {
 	int err;
@@ -921,6 +1274,10 @@
  
 	/* First clear out any existing files */
 	cgroup_clear_directory(cont->dentry);
+
+	err = cgroup_add_file(cont, NULL, &cft_tasks);
+	if (err < 0)
+		return err;
  
 	for_each_subsys(cont->root, ss) {
 		if (ss->populate && (err = ss->populate(ss, cont)) < 0)
...	...	@@ -144,6 +144,16 @@
144	144
145	145	int cgroup_path(const struct cgroup *cont, char *buf, int buflen);
146	146
	147	+int __cgroup_task_count(const struct cgroup *cont);
	148	+static inline int cgroup_task_count(const struct cgroup *cont)
	149	+{
	150	+ int task_count;
	151	+ rcu_read_lock();
	152	+ task_count = __cgroup_task_count(cont);
	153	+ rcu_read_unlock();
	154	+ return task_count;
	155	+}
	156	+
147	157	/* Return true if the cgroup is a descendant of the current cgroup */
148	158	int cgroup_is_descendant(const struct cgroup *cont);
149	159
...	...	@@ -40,7 +40,7 @@
40	40	#include <linux/magic.h>
41	41	#include <linux/spinlock.h>
42	42	#include <linux/string.h>
43		-
	43	+#include <linux/sort.h>
44	44	#include <asm/atomic.h>
45	45
46	46	/* Generate an array of cgroup subsystem pointers */
...	...	@@ -700,6 +700,127 @@
700	700	return 0;
701	701	}
702	702
	703	+/*
	704	+ * Return the first subsystem attached to a cgroup's hierarchy, and
	705	+ * its subsystem id.
	706	+ */
	707	+
	708	+static void get_first_subsys(const struct cgroup *cont,
	709	+ struct cgroup_subsys_state **css, int *subsys_id)
	710	+{
	711	+ const struct cgroupfs_root *root = cont->root;
	712	+ const struct cgroup_subsys *test_ss;
	713	+ BUG_ON(list_empty(&root->subsys_list));
	714	+ test_ss = list_entry(root->subsys_list.next,
	715	+ struct cgroup_subsys, sibling);
	716	+ if (css) {
	717	+ *css = cont->subsys[test_ss->subsys_id];
	718	+ BUG_ON(!*css);
	719	+ }
	720	+ if (subsys_id)
	721	+ *subsys_id = test_ss->subsys_id;
	722	+}
	723	+
	724	+/*
	725	+ * Attach task 'tsk' to cgroup 'cont'
	726	+ *
	727	+ * Call holding cgroup_mutex. May take task_lock of
	728	+ * the task 'pid' during call.
	729	+ */
	730	+static int attach_task(struct cgroup *cont, struct task_struct *tsk)
	731	+{
	732	+ int retval = 0;
	733	+ struct cgroup_subsys *ss;
	734	+ struct cgroup *oldcont;
	735	+ struct css_set *cg = &tsk->cgroups;
	736	+ struct cgroupfs_root *root = cont->root;
	737	+ int i;
	738	+ int subsys_id;
	739	+
	740	+ get_first_subsys(cont, NULL, &subsys_id);
	741	+
	742	+ /* Nothing to do if the task is already in that cgroup */
	743	+ oldcont = task_cgroup(tsk, subsys_id);
	744	+ if (cont == oldcont)
	745	+ return 0;
	746	+
	747	+ for_each_subsys(root, ss) {
	748	+ if (ss->can_attach) {
	749	+ retval = ss->can_attach(ss, cont, tsk);
	750	+ if (retval) {
	751	+ return retval;
	752	+ }
	753	+ }
	754	+ }
	755	+
	756	+ task_lock(tsk);
	757	+ if (tsk->flags & PF_EXITING) {
	758	+ task_unlock(tsk);
	759	+ return -ESRCH;
	760	+ }
	761	+ /* Update the css_set pointers for the subsystems in this
	762	+ * hierarchy */
	763	+ for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
	764	+ if (root->subsys_bits & (1ull << i)) {
	765	+ /* Subsystem is in this hierarchy. So we want
	766	+ * the subsystem state from the new
	767	+ * cgroup. Transfer the refcount from the
	768	+ * old to the new */
	769	+ atomic_inc(&cont->count);
	770	+ atomic_dec(&cg->subsys[i]->cgroup->count);
	771	+ rcu_assign_pointer(cg->subsys[i], cont->subsys[i]);
	772	+ }
	773	+ }
	774	+ task_unlock(tsk);
	775	+
	776	+ for_each_subsys(root, ss) {
	777	+ if (ss->attach) {
	778	+ ss->attach(ss, cont, oldcont, tsk);
	779	+ }
	780	+ }
	781	+
	782	+ synchronize_rcu();
	783	+ return 0;
	784	+}
	785	+
	786	+/*
	787	+ * Attach task with pid 'pid' to cgroup 'cont'. Call with
	788	+ * cgroup_mutex, may take task_lock of task
	789	+ */
	790	+static int attach_task_by_pid(struct cgroup *cont, char *pidbuf)
	791	+{
	792	+ pid_t pid;
	793	+ struct task_struct *tsk;
	794	+ int ret;
	795	+
	796	+ if (sscanf(pidbuf, "%d", &pid) != 1)
	797	+ return -EIO;
	798	+
	799	+ if (pid) {
	800	+ rcu_read_lock();
	801	+ tsk = find_task_by_pid(pid);
	802	+ if (!tsk || tsk->flags & PF_EXITING) {
	803	+ rcu_read_unlock();
	804	+ return -ESRCH;
	805	+ }
	806	+ get_task_struct(tsk);
	807	+ rcu_read_unlock();
	808	+
	809	+ if ((current->euid) && (current->euid != tsk->uid)
	810	+ && (current->euid != tsk->suid)) {
	811	+ put_task_struct(tsk);
	812	+ return -EACCES;
	813	+ }
	814	+ } else {
	815	+ tsk = current;
	816	+ get_task_struct(tsk);
	817	+ }
	818	+
	819	+ ret = attach_task(cont, tsk);
	820	+ put_task_struct(tsk);
	821	+ return ret;
	822	+}
	823	+
703	824	/* The various types of files and directories in a cgroup file system */
704	825
705	826	enum cgroup_filetype {
...	...	@@ -708,6 +829,55 @@
708	829	FILE_TASKLIST,
709	830	};
710	831
	832	+static ssize_t cgroup_common_file_write(struct cgroup *cont,
	833	+ struct cftype *cft,
	834	+ struct file *file,
	835	+ const char __user *userbuf,
	836	+ size_t nbytes, loff_t *unused_ppos)
	837	+{
	838	+ enum cgroup_filetype type = cft->private;
	839	+ char *buffer;
	840	+ int retval = 0;
	841	+
	842	+ if (nbytes >= PATH_MAX)
	843	+ return -E2BIG;
	844	+
	845	+ /* +1 for nul-terminator */
	846	+ buffer = kmalloc(nbytes + 1, GFP_KERNEL);
	847	+ if (buffer == NULL)
	848	+ return -ENOMEM;
	849	+
	850	+ if (copy_from_user(buffer, userbuf, nbytes)) {
	851	+ retval = -EFAULT;
	852	+ goto out1;
	853	+ }
	854	+ buffer[nbytes] = 0; /* nul-terminate */
	855	+
	856	+ mutex_lock(&cgroup_mutex);
	857	+
	858	+ if (cgroup_is_removed(cont)) {
	859	+ retval = -ENODEV;
	860	+ goto out2;
	861	+ }
	862	+
	863	+ switch (type) {
	864	+ case FILE_TASKLIST:
	865	+ retval = attach_task_by_pid(cont, buffer);
	866	+ break;
	867	+ default:
	868	+ retval = -EINVAL;
	869	+ goto out2;
	870	+ }
	871	+
	872	+ if (retval == 0)
	873	+ retval = nbytes;
	874	+out2:
	875	+ mutex_unlock(&cgroup_mutex);
	876	+out1:
	877	+ kfree(buffer);
	878	+ return retval;
	879	+}
	880	+
711	881	static ssize_t cgroup_file_write(struct file *file, const char __user *buf,
712	882	size_t nbytes, loff_t *ppos)
713	883	{
...	...	@@ -914,6 +1084,189 @@
914	1084	return 0;
915	1085	}
916	1086
	1087	+/* Count the number of tasks in a cgroup. Could be made more
	1088	+ * time-efficient but less space-efficient with more linked lists
	1089	+ * running through each cgroup and the css_set structures that
	1090	+ * referenced it. Must be called with tasklist_lock held for read or
	1091	+ * write or in an rcu critical section.
	1092	+ */
	1093	+int __cgroup_task_count(const struct cgroup *cont)
	1094	+{
	1095	+ int count = 0;
	1096	+ struct task_struct *g, *p;
	1097	+ struct cgroup_subsys_state *css;
	1098	+ int subsys_id;
	1099	+
	1100	+ get_first_subsys(cont, &css, &subsys_id);
	1101	+ do_each_thread(g, p) {
	1102	+ if (task_subsys_state(p, subsys_id) == css)
	1103	+ count ++;
	1104	+ } while_each_thread(g, p);
	1105	+ return count;
	1106	+}
	1107	+
	1108	+/*
	1109	+ * Stuff for reading the 'tasks' file.
	1110	+ *
	1111	+ * Reading this file can return large amounts of data if a cgroup has
	1112	+ * *lots* of attached tasks. So it may need several calls to read(),
	1113	+ * but we cannot guarantee that the information we produce is correct
	1114	+ * unless we produce it entirely atomically.
	1115	+ *
	1116	+ * Upon tasks file open(), a struct ctr_struct is allocated, that
	1117	+ * will have a pointer to an array (also allocated here). The struct
	1118	+ * ctr_struct * is stored in file->private_data. Its resources will
	1119	+ * be freed by release() when the file is closed. The array is used
	1120	+ * to sprintf the PIDs and then used by read().
	1121	+ */
	1122	+struct ctr_struct {
	1123	+ char *buf;
	1124	+ int bufsz;
	1125	+};
	1126	+
	1127	+/*
	1128	+ * Load into 'pidarray' up to 'npids' of the tasks using cgroup
	1129	+ * 'cont'. Return actual number of pids loaded. No need to
	1130	+ * task_lock(p) when reading out p->cgroup, since we're in an RCU
	1131	+ * read section, so the css_set can't go away, and is
	1132	+ * immutable after creation.
	1133	+ */
	1134	+static int pid_array_load(pid_t *pidarray, int npids, struct cgroup *cont)
	1135	+{
	1136	+ int n = 0;
	1137	+ struct task_struct *g, *p;
	1138	+ struct cgroup_subsys_state *css;
	1139	+ int subsys_id;
	1140	+
	1141	+ get_first_subsys(cont, &css, &subsys_id);
	1142	+ rcu_read_lock();
	1143	+ do_each_thread(g, p) {
	1144	+ if (task_subsys_state(p, subsys_id) == css) {
	1145	+ pidarray[n++] = pid_nr(task_pid(p));
	1146	+ if (unlikely(n == npids))
	1147	+ goto array_full;
	1148	+ }
	1149	+ } while_each_thread(g, p);
	1150	+
	1151	+array_full:
	1152	+ rcu_read_unlock();
	1153	+ return n;
	1154	+}
	1155	+
	1156	+static int cmppid(const void *a, const void *b)
	1157	+{
	1158	+ return *(pid_t *)a - *(pid_t *)b;
	1159	+}
	1160	+
	1161	+/*
	1162	+ * Convert array 'a' of 'npids' pid_t's to a string of newline separated
	1163	+ * decimal pids in 'buf'. Don't write more than 'sz' chars, but return
	1164	+ * count 'cnt' of how many chars would be written if buf were large enough.
	1165	+ */
	1166	+static int pid_array_to_buf(char *buf, int sz, pid_t *a, int npids)
	1167	+{
	1168	+ int cnt = 0;
	1169	+ int i;
	1170	+
	1171	+ for (i = 0; i < npids; i++)
	1172	+ cnt += snprintf(buf + cnt, max(sz - cnt, 0), "%d\n", a[i]);
	1173	+ return cnt;
	1174	+}
	1175	+
	1176	+/*
	1177	+ * Handle an open on 'tasks' file. Prepare a buffer listing the
	1178	+ * process id's of tasks currently attached to the cgroup being opened.
	1179	+ *
	1180	+ * Does not require any specific cgroup mutexes, and does not take any.
	1181	+ */
	1182	+static int cgroup_tasks_open(struct inode *unused, struct file *file)
	1183	+{
	1184	+ struct cgroup *cont = __d_cont(file->f_dentry->d_parent);
	1185	+ struct ctr_struct *ctr;
	1186	+ pid_t *pidarray;
	1187	+ int npids;
	1188	+ char c;
	1189	+
	1190	+ if (!(file->f_mode & FMODE_READ))
	1191	+ return 0;
	1192	+
	1193	+ ctr = kmalloc(sizeof(*ctr), GFP_KERNEL);
	1194	+ if (!ctr)
	1195	+ goto err0;
	1196	+
	1197	+ /*
	1198	+ * If cgroup gets more users after we read count, we won't have
	1199	+ * enough space - tough. This race is indistinguishable to the
	1200	+ * caller from the case that the additional cgroup users didn't
	1201	+ * show up until sometime later on.
	1202	+ */
	1203	+ npids = cgroup_task_count(cont);
	1204	+ if (npids) {
	1205	+ pidarray = kmalloc(npids * sizeof(pid_t), GFP_KERNEL);
	1206	+ if (!pidarray)
	1207	+ goto err1;
	1208	+
	1209	+ npids = pid_array_load(pidarray, npids, cont);
	1210	+ sort(pidarray, npids, sizeof(pid_t), cmppid, NULL);
	1211	+
	1212	+ /* Call pid_array_to_buf() twice, first just to get bufsz */
	1213	+ ctr->bufsz = pid_array_to_buf(&c, sizeof(c), pidarray, npids) + 1;
	1214	+ ctr->buf = kmalloc(ctr->bufsz, GFP_KERNEL);
	1215	+ if (!ctr->buf)
	1216	+ goto err2;
	1217	+ ctr->bufsz = pid_array_to_buf(ctr->buf, ctr->bufsz, pidarray, npids);
	1218	+
	1219	+ kfree(pidarray);
	1220	+ } else {
	1221	+ ctr->buf = 0;
	1222	+ ctr->bufsz = 0;
	1223	+ }
	1224	+ file->private_data = ctr;
	1225	+ return 0;
	1226	+
	1227	+err2:
	1228	+ kfree(pidarray);
	1229	+err1:
	1230	+ kfree(ctr);
	1231	+err0:
	1232	+ return -ENOMEM;
	1233	+}
	1234	+
	1235	+static ssize_t cgroup_tasks_read(struct cgroup *cont,
	1236	+ struct cftype *cft,
	1237	+ struct file *file, char __user *buf,
	1238	+ size_t nbytes, loff_t *ppos)
	1239	+{
	1240	+ struct ctr_struct *ctr = file->private_data;
	1241	+
	1242	+ return simple_read_from_buffer(buf, nbytes, ppos, ctr->buf, ctr->bufsz);
	1243	+}
	1244	+
	1245	+static int cgroup_tasks_release(struct inode *unused_inode,
	1246	+ struct file *file)
	1247	+{
	1248	+ struct ctr_struct *ctr;
	1249	+
	1250	+ if (file->f_mode & FMODE_READ) {
	1251	+ ctr = file->private_data;
	1252	+ kfree(ctr->buf);
	1253	+ kfree(ctr);
	1254	+ }
	1255	+ return 0;
	1256	+}
	1257	+
	1258	+/*
	1259	+ * for the common functions, 'private' gives the type of file
	1260	+ */
	1261	+static struct cftype cft_tasks = {
	1262	+ .name = "tasks",
	1263	+ .open = cgroup_tasks_open,
	1264	+ .read = cgroup_tasks_read,
	1265	+ .write = cgroup_common_file_write,
	1266	+ .release = cgroup_tasks_release,
	1267	+ .private = FILE_TASKLIST,
	1268	+};
	1269	+
917	1270	static int cgroup_populate_dir(struct cgroup *cont)
918	1271	{
919	1272	int err;
...	...	@@ -921,6 +1274,10 @@
921	1274
922	1275	/* First clear out any existing files */
923	1276	cgroup_clear_directory(cont->dentry);
	1277	+
	1278	+ err = cgroup_add_file(cont, NULL, &cft_tasks);
	1279	+ if (err < 0)
	1280	+ return err;
924	1281
925	1282	for_each_subsys(cont->root, ss) {
926	1283	if (ss->populate && (err = ss->populate(ss, cont)) < 0)