Commit bbcb81d09104f0d440974b994c1fc508ccbe9503

Authored by Paul Menage
Committed by Linus Torvalds
1 parent ddbcc7e8e5

Task Control Groups: add tasks file interface

Add the per-directory "tasks" file for cgroupfs mounts. Reading a
cgroup's "tasks" file lists the pids of the tasks that are members of
that cgroup, and writing a pid to the file moves that task into the
cgroup.

Signed-off-by: Paul Menage <menage@google.com>
Cc: Serge E. Hallyn <serue@us.ibm.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Dave Hansen <haveblue@us.ibm.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: Paul Jackson <pj@sgi.com>
Cc: Kirill Korotaev <dev@openvz.org>
Cc: Herbert Poetzl <herbert@13thfloor.at>
Cc: Srivatsa Vaddagiri <vatsa@in.ibm.com>
Cc: Cedric Le Goater <clg@fr.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

Showing 2 changed files with 368 additions and 1 deletions Side-by-side Diff

include/linux/cgroup.h
... ... @@ -144,6 +144,16 @@
144 144  
145 145 int cgroup_path(const struct cgroup *cont, char *buf, int buflen);
146 146  
  147 +int __cgroup_task_count(const struct cgroup *cont);
  148 +static inline int cgroup_task_count(const struct cgroup *cont)
  149 +{
  150 + int task_count;
  151 + rcu_read_lock();
  152 + task_count = __cgroup_task_count(cont);
  153 + rcu_read_unlock();
  154 + return task_count;
  155 +}
  156 +
147 157 /* Return true if the cgroup is a descendant of the current cgroup */
148 158 int cgroup_is_descendant(const struct cgroup *cont);
149 159  
... ... @@ -40,7 +40,7 @@
40 40 #include <linux/magic.h>
41 41 #include <linux/spinlock.h>
42 42 #include <linux/string.h>
43   -
  43 +#include <linux/sort.h>
44 44 #include <asm/atomic.h>
45 45  
46 46 /* Generate an array of cgroup subsystem pointers */
... ... @@ -700,6 +700,127 @@
700 700 return 0;
701 701 }
702 702  
  703 +/*
  704 + * Return the first subsystem attached to a cgroup's hierarchy, and
  705 + * its subsystem id.
  706 + */
  707 +
  708 +static void get_first_subsys(const struct cgroup *cont,
  709 + struct cgroup_subsys_state **css, int *subsys_id)
  710 +{
  711 + const struct cgroupfs_root *root = cont->root;
  712 + const struct cgroup_subsys *test_ss;
  713 + BUG_ON(list_empty(&root->subsys_list));
  714 + test_ss = list_entry(root->subsys_list.next,
  715 + struct cgroup_subsys, sibling);
  716 + if (css) {
  717 + *css = cont->subsys[test_ss->subsys_id];
  718 + BUG_ON(!*css);
  719 + }
  720 + if (subsys_id)
  721 + *subsys_id = test_ss->subsys_id;
  722 +}
  723 +
  724 +/*
  725 + * Attach task 'tsk' to cgroup 'cont'. Returns 0 on success (including when
  726 + * 'tsk' is already in 'cont'), a non-zero can_attach() result, or -ESRCH if 'tsk' is exiting.
  727 + * Call holding cgroup_mutex. May take task_lock of
  728 + * the task 'tsk' during call.
  729 + */
  730 +static int attach_task(struct cgroup *cont, struct task_struct *tsk)
  731 +{
  732 + int retval = 0;
  733 + struct cgroup_subsys *ss;
  734 + struct cgroup *oldcont;
  735 + struct css_set *cg = &tsk->cgroups; /* the task's per-subsystem state pointers, rewritten below */
  736 + struct cgroupfs_root *root = cont->root;
  737 + int i;
  738 + int subsys_id;
  739 +
  740 + get_first_subsys(cont, NULL, &subsys_id); /* only the subsystem id is needed here */
  741 +
  742 + /* Nothing to do if the task is already in that cgroup */
  743 + oldcont = task_cgroup(tsk, subsys_id);
  744 + if (cont == oldcont)
  745 + return 0;
  746 +
  747 + for_each_subsys(root, ss) { /* every subsystem in the hierarchy may veto the move */
  748 + if (ss->can_attach) {
  749 + retval = ss->can_attach(ss, cont, tsk);
  750 + if (retval) {
  751 + return retval;
  752 + }
  753 + }
  754 + }
  755 +
  756 + task_lock(tsk);
  757 + if (tsk->flags & PF_EXITING) { /* checked under task_lock; an exiting task is not moved */
  758 + task_unlock(tsk);
  759 + return -ESRCH;
  760 + }
  761 + /* Update the css_set pointers for the subsystems in this
  762 + * hierarchy */
  763 + for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
  764 + if (root->subsys_bits & (1ull << i)) {
  765 + /* Subsystem is in this hierarchy. So we want
  766 + * the subsystem state from the new
  767 + * cgroup. Transfer the refcount from the
  768 + * old to the new */
  769 + atomic_inc(&cont->count);
  770 + atomic_dec(&cg->subsys[i]->cgroup->count);
  771 + rcu_assign_pointer(cg->subsys[i], cont->subsys[i]);
  772 + }
  773 + }
  774 + task_unlock(tsk);
  775 +
  776 + for_each_subsys(root, ss) { /* notify subsystems after the pointers have been switched */
  777 + if (ss->attach) {
  778 + ss->attach(ss, cont, oldcont, tsk);
  779 + }
  780 + }
  781 +
  782 + synchronize_rcu(); /* NOTE(review): presumably waits out RCU readers still using the old subsys pointers - confirm */
  783 + return 0;
  784 +}
  785 +
  786 +/*
  787 + * Attach task with pid 'pid' to cgroup 'cont'. Call with
  788 + * cgroup_mutex, may take task_lock of task
  789 + */
  790 +static int attach_task_by_pid(struct cgroup *cont, char *pidbuf)
  791 +{
  792 + pid_t pid;
  793 + struct task_struct *tsk;
  794 + int ret;
  795 +
  796 + if (sscanf(pidbuf, "%d", &pid) != 1)
  797 + return -EIO;
  798 +
  799 + if (pid) {
  800 + rcu_read_lock();
  801 + tsk = find_task_by_pid(pid);
  802 + if (!tsk || tsk->flags & PF_EXITING) {
  803 + rcu_read_unlock();
  804 + return -ESRCH;
  805 + }
  806 + get_task_struct(tsk);
  807 + rcu_read_unlock();
  808 +
  809 + if ((current->euid) && (current->euid != tsk->uid)
  810 + && (current->euid != tsk->suid)) {
  811 + put_task_struct(tsk);
  812 + return -EACCES;
  813 + }
  814 + } else {
  815 + tsk = current;
  816 + get_task_struct(tsk);
  817 + }
  818 +
  819 + ret = attach_task(cont, tsk);
  820 + put_task_struct(tsk);
  821 + return ret;
  822 +}
  823 +
703 824 /* The various types of files and directories in a cgroup file system */
704 825  
705 826 enum cgroup_filetype {
... ... @@ -708,6 +829,55 @@
708 829 FILE_TASKLIST,
709 830 };
710 831  
  832 +static ssize_t cgroup_common_file_write(struct cgroup *cont,
  833 + struct cftype *cft,
  834 + struct file *file,
  835 + const char __user *userbuf,
  836 + size_t nbytes, loff_t *unused_ppos)
  837 +{
  838 + enum cgroup_filetype type = cft->private;
  839 + char *buffer;
  840 + int retval = 0;
  841 +
  842 + if (nbytes >= PATH_MAX)
  843 + return -E2BIG;
  844 +
  845 + /* +1 for nul-terminator */
  846 + buffer = kmalloc(nbytes + 1, GFP_KERNEL);
  847 + if (buffer == NULL)
  848 + return -ENOMEM;
  849 +
  850 + if (copy_from_user(buffer, userbuf, nbytes)) {
  851 + retval = -EFAULT;
  852 + goto out1;
  853 + }
  854 + buffer[nbytes] = 0; /* nul-terminate */
  855 +
  856 + mutex_lock(&cgroup_mutex);
  857 +
  858 + if (cgroup_is_removed(cont)) {
  859 + retval = -ENODEV;
  860 + goto out2;
  861 + }
  862 +
  863 + switch (type) {
  864 + case FILE_TASKLIST:
  865 + retval = attach_task_by_pid(cont, buffer);
  866 + break;
  867 + default:
  868 + retval = -EINVAL;
  869 + goto out2;
  870 + }
  871 +
  872 + if (retval == 0)
  873 + retval = nbytes;
  874 +out2:
  875 + mutex_unlock(&cgroup_mutex);
  876 +out1:
  877 + kfree(buffer);
  878 + return retval;
  879 +}
  880 +
711 881 static ssize_t cgroup_file_write(struct file *file, const char __user *buf,
712 882 size_t nbytes, loff_t *ppos)
713 883 {
... ... @@ -914,6 +1084,189 @@
914 1084 return 0;
915 1085 }
916 1086  
  1087 +/* Count the number of tasks in a cgroup. Could be made more
  1088 + * time-efficient but less space-efficient with more linked lists
  1089 + * running through each cgroup and the css_set structures that
  1090 + * referenced it. Must be called with tasklist_lock held for read or
  1091 + * write or in an rcu critical section.
  1092 + */
  1093 +int __cgroup_task_count(const struct cgroup *cont)
  1094 +{
  1095 + int count = 0;
  1096 + struct task_struct *g, *p;
  1097 + struct cgroup_subsys_state *css;
  1098 + int subsys_id;
  1099 +
  1100 + get_first_subsys(cont, &css, &subsys_id);
  1101 + do_each_thread(g, p) {
  1102 + if (task_subsys_state(p, subsys_id) == css)
  1103 + count ++;
  1104 + } while_each_thread(g, p);
  1105 + return count;
  1106 +}
  1107 +
  1108 +/*
  1109 + * Stuff for reading the 'tasks' file.
  1110 + *
  1111 + * Reading this file can return large amounts of data if a cgroup has
  1112 + * *lots* of attached tasks. So it may need several calls to read(),
  1113 + * but we cannot guarantee that the information we produce is correct
  1114 + * unless we produce it entirely atomically.
  1115 + *
  1116 + * Upon tasks file open(), a struct ctr_struct is allocated, that
  1117 + * will have a pointer to a character buffer (also allocated at open()). The struct
  1118 + * ctr_struct * is stored in file->private_data. Its resources will
  1119 + * be freed by release() when the file is closed. The buffer is filled
  1120 + * with the formatted PIDs at open() and then served by read().
  1121 + */
  1122 +struct ctr_struct {
  1123 + char *buf; /* newline-separated decimal pids; NULL (0) if the cgroup had no tasks at open() */
  1124 + int bufsz; /* number of bytes of 'buf' that read() may return; 0 when buf is NULL */
  1125 +};
  1126 +
  1127 +/*
  1128 + * Load into 'pidarray' up to 'npids' of the tasks using cgroup
  1129 + * 'cont'. Return actual number of pids loaded. No need to
  1130 + * task_lock(p) when reading out p->cgroup, since we're in an RCU
  1131 + * read section, so the css_set can't go away, and is
  1132 + * immutable after creation.
  1133 + */
  1134 +static int pid_array_load(pid_t *pidarray, int npids, struct cgroup *cont)
  1135 +{
  1136 + int n = 0;
  1137 + struct task_struct *g, *p;
  1138 + struct cgroup_subsys_state *css;
  1139 + int subsys_id;
  1140 +
  1141 + get_first_subsys(cont, &css, &subsys_id);
  1142 + rcu_read_lock();
  1143 + do_each_thread(g, p) {
  1144 + if (task_subsys_state(p, subsys_id) == css) {
  1145 + pidarray[n++] = pid_nr(task_pid(p));
  1146 + if (unlikely(n == npids))
  1147 + goto array_full;
  1148 + }
  1149 + } while_each_thread(g, p);
  1150 +
  1151 +array_full:
  1152 + rcu_read_unlock();
  1153 + return n;
  1154 +}
  1155 +
  1156 +static int cmppid(const void *a, const void *b)
  1157 +{
  1158 + return *(pid_t *)a - *(pid_t *)b;
  1159 +}
  1160 +
  1161 +/*
  1162 + * Convert array 'a' of 'npids' pid_t's to a string of newline separated
  1163 + * decimal pids in 'buf'. Don't write more than 'sz' chars, but return
  1164 + * count 'cnt' of how many chars would be written if buf were large enough.
  1165 + */
  1166 +static int pid_array_to_buf(char *buf, int sz, pid_t *a, int npids)
  1167 +{
  1168 + int cnt = 0;
  1169 + int i;
  1170 +
  1171 + for (i = 0; i < npids; i++)
  1172 + cnt += snprintf(buf + cnt, max(sz - cnt, 0), "%d\n", a[i]);
  1173 + return cnt;
  1174 +}
  1175 +
  1176 +/*
  1177 + * Handle an open on 'tasks' file. Prepare a buffer listing the
  1178 + * process id's of tasks currently attached to the cgroup being opened.
  1179 + *
  1180 + * Does not require any specific cgroup mutexes, and does not take any.
  1181 + */
  1182 +static int cgroup_tasks_open(struct inode *unused, struct file *file)
  1183 +{
  1184 + struct cgroup *cont = __d_cont(file->f_dentry->d_parent);
  1185 + struct ctr_struct *ctr;
  1186 + pid_t *pidarray;
  1187 + int npids;
  1188 + char c;
  1189 +
  1190 + if (!(file->f_mode & FMODE_READ))
  1191 + return 0;
  1192 +
  1193 + ctr = kmalloc(sizeof(*ctr), GFP_KERNEL);
  1194 + if (!ctr)
  1195 + goto err0;
  1196 +
  1197 + /*
  1198 + * If cgroup gets more users after we read count, we won't have
  1199 + * enough space - tough. This race is indistinguishable to the
  1200 + * caller from the case that the additional cgroup users didn't
  1201 + * show up until sometime later on.
  1202 + */
  1203 + npids = cgroup_task_count(cont);
  1204 + if (npids) {
  1205 + pidarray = kmalloc(npids * sizeof(pid_t), GFP_KERNEL);
  1206 + if (!pidarray)
  1207 + goto err1;
  1208 +
  1209 + npids = pid_array_load(pidarray, npids, cont);
  1210 + sort(pidarray, npids, sizeof(pid_t), cmppid, NULL);
  1211 +
  1212 + /* Call pid_array_to_buf() twice, first just to get bufsz */
  1213 + ctr->bufsz = pid_array_to_buf(&c, sizeof(c), pidarray, npids) + 1;
  1214 + ctr->buf = kmalloc(ctr->bufsz, GFP_KERNEL);
  1215 + if (!ctr->buf)
  1216 + goto err2;
  1217 + ctr->bufsz = pid_array_to_buf(ctr->buf, ctr->bufsz, pidarray, npids);
  1218 +
  1219 + kfree(pidarray);
  1220 + } else {
  1221 + ctr->buf = 0;
  1222 + ctr->bufsz = 0;
  1223 + }
  1224 + file->private_data = ctr;
  1225 + return 0;
  1226 +
  1227 +err2:
  1228 + kfree(pidarray);
  1229 +err1:
  1230 + kfree(ctr);
  1231 +err0:
  1232 + return -ENOMEM;
  1233 +}
  1234 +
  1235 +static ssize_t cgroup_tasks_read(struct cgroup *cont,
  1236 + struct cftype *cft,
  1237 + struct file *file, char __user *buf,
  1238 + size_t nbytes, loff_t *ppos)
  1239 +{
  1240 + struct ctr_struct *ctr = file->private_data;
  1241 +
  1242 + return simple_read_from_buffer(buf, nbytes, ppos, ctr->buf, ctr->bufsz);
  1243 +}
  1244 +
  1245 +static int cgroup_tasks_release(struct inode *unused_inode,
  1246 + struct file *file)
  1247 +{
  1248 + struct ctr_struct *ctr;
  1249 +
  1250 + if (file->f_mode & FMODE_READ) {
  1251 + ctr = file->private_data;
  1252 + kfree(ctr->buf);
  1253 + kfree(ctr);
  1254 + }
  1255 + return 0;
  1256 +}
  1257 +
  1258 +/*
  1259 + * Control file description for the per-cgroup "tasks" file. For the common functions, 'private' gives the type of file.
  1260 + */
  1261 +static struct cftype cft_tasks = {
  1262 + .name = "tasks",
  1263 + .open = cgroup_tasks_open, /* builds the pid listing for readers */
  1264 + .read = cgroup_tasks_read, /* serves the listing built at open() */
  1265 + .write = cgroup_common_file_write, /* dispatches on .private; FILE_TASKLIST attaches a task by pid */
  1266 + .release = cgroup_tasks_release, /* frees the listing */
  1267 + .private = FILE_TASKLIST,
  1268 +};
  1269 +
917 1270 static int cgroup_populate_dir(struct cgroup *cont)
918 1271 {
919 1272 int err;
... ... @@ -921,6 +1274,10 @@
921 1274  
922 1275 /* First clear out any existing files */
923 1276 cgroup_clear_directory(cont->dentry);
  1277 +
  1278 + err = cgroup_add_file(cont, NULL, &cft_tasks);
  1279 + if (err < 0)
  1280 + return err;
924 1281  
925 1282 for_each_subsys(cont->root, ss) {
926 1283 if (ss->populate && (err = ss->populate(ss, cont)) < 0)