Commit bbcb81d09104f0d440974b994c1fc508ccbe9503
Committed by Linus Torvalds
1 parent ddbcc7e8e5
Exists in master and in 4 other branches
Task Control Groups: add tasks file interface
Add the per-directory "tasks" file for cgroupfs mounts; this allows the user to determine which tasks are members of a cgroup by reading a cgroup's "tasks", and to move a task into a cgroup by writing its pid to its "tasks".

Signed-off-by: Paul Menage <menage@google.com>
Cc: Serge E. Hallyn <serue@us.ibm.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Dave Hansen <haveblue@us.ibm.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: Paul Jackson <pj@sgi.com>
Cc: Kirill Korotaev <dev@openvz.org>
Cc: Herbert Poetzl <herbert@13thfloor.at>
Cc: Srivatsa Vaddagiri <vatsa@in.ibm.com>
Cc: Cedric Le Goater <clg@fr.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
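As a quick illustration of the interface described above, the sketch below moves the calling process into a cgroup by writing its pid to the group's "tasks" file. The mount point /dev/cgroup and the group name "mygroup" are assumptions for the example only, not part of the patch; writing "0" instead of a pid likewise attaches the current task.

    #include <stdio.h>
    #include <unistd.h>

    int main(void)
    {
    	/* hypothetical paths: assumes cgroupfs is mounted at /dev/cgroup
    	 * and a child group "mygroup" already exists */
    	FILE *f = fopen("/dev/cgroup/mygroup/tasks", "w");

    	if (!f) {
    		perror("fopen");
    		return 1;
    	}
    	/* writing a pid to "tasks" moves that task into the cgroup;
    	 * getpid() here, or "0" as shorthand for the current task */
    	fprintf(f, "%d\n", getpid());
    	if (fclose(f) != 0) {	/* the buffered write is flushed and checked here */
    		perror("write tasks");
    		return 1;
    	}
    	return 0;
    }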
Showing 2 changed files with 368 additions and 1 deletion
include/linux/cgroup.h
... | ... | @@ -144,6 +144,16 @@ |
144 | 144 | |
145 | 145 | int cgroup_path(const struct cgroup *cont, char *buf, int buflen); |
146 | 146 | |
147 | +int __cgroup_task_count(const struct cgroup *cont); | |
148 | +static inline int cgroup_task_count(const struct cgroup *cont) | |
149 | +{ | |
150 | + int task_count; | |
151 | + rcu_read_lock(); | |
152 | + task_count = __cgroup_task_count(cont); | |
153 | + rcu_read_unlock(); | |
154 | + return task_count; | |
155 | +} | |
156 | + | |
147 | 157 | /* Return true if the cgroup is a descendant of the current cgroup */ |
148 | 158 | int cgroup_is_descendant(const struct cgroup *cont); |
149 | 159 |
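A note on the wrapper added above: cgroup_task_count() takes the RCU read lock itself, so callers can use it directly without extra locking. A minimal hypothetical helper built on it (illustrative only, not part of the patch):

    #include <linux/cgroup.h>

    /* hypothetical helper: no rcu_read_lock() or tasklist_lock needed here,
     * since cgroup_task_count() wraps __cgroup_task_count() in an RCU
     * read-side critical section */
    static inline int example_cgroup_is_empty(const struct cgroup *cont)
    {
    	return cgroup_task_count(cont) == 0;
    }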
kernel/cgroup.c
... | ... | @@ -40,7 +40,7 @@ |
40 | 40 | #include <linux/magic.h> |
41 | 41 | #include <linux/spinlock.h> |
42 | 42 | #include <linux/string.h> |
43 | - | |
43 | +#include <linux/sort.h> | |
44 | 44 | #include <asm/atomic.h> |
45 | 45 | |
46 | 46 | /* Generate an array of cgroup subsystem pointers */ |
... | ... | @@ -700,6 +700,127 @@ |
700 | 700 | return 0; |
701 | 701 | } |
702 | 702 | |
703 | +/* | |
704 | + * Return the first subsystem attached to a cgroup's hierarchy, and | |
705 | + * its subsystem id. | |
706 | + */ | |
707 | + | |
708 | +static void get_first_subsys(const struct cgroup *cont, | |
709 | + struct cgroup_subsys_state **css, int *subsys_id) | |
710 | +{ | |
711 | + const struct cgroupfs_root *root = cont->root; | |
712 | + const struct cgroup_subsys *test_ss; | |
713 | + BUG_ON(list_empty(&root->subsys_list)); | |
714 | + test_ss = list_entry(root->subsys_list.next, | |
715 | + struct cgroup_subsys, sibling); | |
716 | + if (css) { | |
717 | + *css = cont->subsys[test_ss->subsys_id]; | |
718 | + BUG_ON(!*css); | |
719 | + } | |
720 | + if (subsys_id) | |
721 | + *subsys_id = test_ss->subsys_id; | |
722 | +} | |
723 | + | |
724 | +/* | |
725 | + * Attach task 'tsk' to cgroup 'cont' | |
726 | + * | |
727 | + * Call holding cgroup_mutex. May take task_lock of | |
728 | + * the task 'pid' during call. | |
729 | + */ | |
730 | +static int attach_task(struct cgroup *cont, struct task_struct *tsk) | |
731 | +{ | |
732 | + int retval = 0; | |
733 | + struct cgroup_subsys *ss; | |
734 | + struct cgroup *oldcont; | |
735 | + struct css_set *cg = &tsk->cgroups; | |
736 | + struct cgroupfs_root *root = cont->root; | |
737 | + int i; | |
738 | + int subsys_id; | |
739 | + | |
740 | + get_first_subsys(cont, NULL, &subsys_id); | |
741 | + | |
742 | + /* Nothing to do if the task is already in that cgroup */ | |
743 | + oldcont = task_cgroup(tsk, subsys_id); | |
744 | + if (cont == oldcont) | |
745 | + return 0; | |
746 | + | |
747 | + for_each_subsys(root, ss) { | |
748 | + if (ss->can_attach) { | |
749 | + retval = ss->can_attach(ss, cont, tsk); | |
750 | + if (retval) { | |
751 | + return retval; | |
752 | + } | |
753 | + } | |
754 | + } | |
755 | + | |
756 | + task_lock(tsk); | |
757 | + if (tsk->flags & PF_EXITING) { | |
758 | + task_unlock(tsk); | |
759 | + return -ESRCH; | |
760 | + } | |
761 | + /* Update the css_set pointers for the subsystems in this | |
762 | + * hierarchy */ | |
763 | + for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { | |
764 | + if (root->subsys_bits & (1ull << i)) { | |
765 | + /* Subsystem is in this hierarchy. So we want | |
766 | + * the subsystem state from the new | |
767 | + * cgroup. Transfer the refcount from the | |
768 | + * old to the new */ | |
769 | + atomic_inc(&cont->count); | |
770 | + atomic_dec(&cg->subsys[i]->cgroup->count); | |
771 | + rcu_assign_pointer(cg->subsys[i], cont->subsys[i]); | |
772 | + } | |
773 | + } | |
774 | + task_unlock(tsk); | |
775 | + | |
776 | + for_each_subsys(root, ss) { | |
777 | + if (ss->attach) { | |
778 | + ss->attach(ss, cont, oldcont, tsk); | |
779 | + } | |
780 | + } | |
781 | + | |
782 | + synchronize_rcu(); | |
783 | + return 0; | |
784 | +} | |
785 | + | |
786 | +/* | |
787 | + * Attach task with pid 'pid' to cgroup 'cont'. Call with | |
788 | + * cgroup_mutex, may take task_lock of task | |
789 | + */ | |
790 | +static int attach_task_by_pid(struct cgroup *cont, char *pidbuf) | |
791 | +{ | |
792 | + pid_t pid; | |
793 | + struct task_struct *tsk; | |
794 | + int ret; | |
795 | + | |
796 | + if (sscanf(pidbuf, "%d", &pid) != 1) | |
797 | + return -EIO; | |
798 | + | |
799 | + if (pid) { | |
800 | + rcu_read_lock(); | |
801 | + tsk = find_task_by_pid(pid); | |
802 | + if (!tsk || tsk->flags & PF_EXITING) { | |
803 | + rcu_read_unlock(); | |
804 | + return -ESRCH; | |
805 | + } | |
806 | + get_task_struct(tsk); | |
807 | + rcu_read_unlock(); | |
808 | + | |
809 | + if ((current->euid) && (current->euid != tsk->uid) | |
810 | + && (current->euid != tsk->suid)) { | |
811 | + put_task_struct(tsk); | |
812 | + return -EACCES; | |
813 | + } | |
814 | + } else { | |
815 | + tsk = current; | |
816 | + get_task_struct(tsk); | |
817 | + } | |
818 | + | |
819 | + ret = attach_task(cont, tsk); | |
820 | + put_task_struct(tsk); | |
821 | + return ret; | |
822 | +} | |
823 | + | |
703 | 824 | /* The various types of files and directories in a cgroup file system */ |
704 | 825 | |
705 | 826 | enum cgroup_filetype { |
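attach_task() above gives every subsystem bound to the hierarchy a veto through ->can_attach() and a post-move notification through ->attach(). For illustration, hypothetical subsystem callbacks matching those call signatures (the names and the veto policy are assumptions, not from this patch):

    #include <linux/cgroup.h>
    #include <linux/sched.h>
    #include <linux/errno.h>

    /* hypothetical: called by attach_task() before any state is changed;
     * returning non-zero aborts the move */
    static int example_can_attach(struct cgroup_subsys *ss,
    			      struct cgroup *cont, struct task_struct *tsk)
    {
    	/* a real subsystem would enforce its own policy here, e.g.
    	 * refusing tasks that are already exiting */
    	if (tsk->flags & PF_EXITING)
    		return -ESRCH;
    	return 0;
    }

    /* hypothetical: called by attach_task() after the css_set pointers
     * have been switched from the old cgroup to the new one */
    static void example_attach(struct cgroup_subsys *ss, struct cgroup *cont,
    			   struct cgroup *old_cont, struct task_struct *tsk)
    {
    	/* update per-subsystem bookkeeping for tsk's move out of old_cont */
    }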
... | ... | @@ -708,6 +829,55 @@ |
708 | 829 | FILE_TASKLIST, |
709 | 830 | }; |
710 | 831 | |
832 | +static ssize_t cgroup_common_file_write(struct cgroup *cont, | |
833 | + struct cftype *cft, | |
834 | + struct file *file, | |
835 | + const char __user *userbuf, | |
836 | + size_t nbytes, loff_t *unused_ppos) | |
837 | +{ | |
838 | + enum cgroup_filetype type = cft->private; | |
839 | + char *buffer; | |
840 | + int retval = 0; | |
841 | + | |
842 | + if (nbytes >= PATH_MAX) | |
843 | + return -E2BIG; | |
844 | + | |
845 | + /* +1 for nul-terminator */ | |
846 | + buffer = kmalloc(nbytes + 1, GFP_KERNEL); | |
847 | + if (buffer == NULL) | |
848 | + return -ENOMEM; | |
849 | + | |
850 | + if (copy_from_user(buffer, userbuf, nbytes)) { | |
851 | + retval = -EFAULT; | |
852 | + goto out1; | |
853 | + } | |
854 | + buffer[nbytes] = 0; /* nul-terminate */ | |
855 | + | |
856 | + mutex_lock(&cgroup_mutex); | |
857 | + | |
858 | + if (cgroup_is_removed(cont)) { | |
859 | + retval = -ENODEV; | |
860 | + goto out2; | |
861 | + } | |
862 | + | |
863 | + switch (type) { | |
864 | + case FILE_TASKLIST: | |
865 | + retval = attach_task_by_pid(cont, buffer); | |
866 | + break; | |
867 | + default: | |
868 | + retval = -EINVAL; | |
869 | + goto out2; | |
870 | + } | |
871 | + | |
872 | + if (retval == 0) | |
873 | + retval = nbytes; | |
874 | +out2: | |
875 | + mutex_unlock(&cgroup_mutex); | |
876 | +out1: | |
877 | + kfree(buffer); | |
878 | + return retval; | |
879 | +} | |
880 | + | |
711 | 881 | static ssize_t cgroup_file_write(struct file *file, const char __user *buf, |
712 | 882 | size_t nbytes, loff_t *ppos) |
713 | 883 | { |
... | ... | @@ -914,6 +1084,189 @@ |
914 | 1084 | return 0; |
915 | 1085 | } |
916 | 1086 | |
1087 | +/* Count the number of tasks in a cgroup. Could be made more | |
1088 | + * time-efficient but less space-efficient with more linked lists | |
1089 | + * running through each cgroup and the css_set structures that | |
1090 | + * referenced it. Must be called with tasklist_lock held for read or | |
1091 | + * write or in an rcu critical section. | |
1092 | + */ | |
1093 | +int __cgroup_task_count(const struct cgroup *cont) | |
1094 | +{ | |
1095 | + int count = 0; | |
1096 | + struct task_struct *g, *p; | |
1097 | + struct cgroup_subsys_state *css; | |
1098 | + int subsys_id; | |
1099 | + | |
1100 | + get_first_subsys(cont, &css, &subsys_id); | |
1101 | + do_each_thread(g, p) { | |
1102 | + if (task_subsys_state(p, subsys_id) == css) | |
1103 | + count ++; | |
1104 | + } while_each_thread(g, p); | |
1105 | + return count; | |
1106 | +} | |
1107 | + | |
1108 | +/* | |
1109 | + * Stuff for reading the 'tasks' file. | |
1110 | + * | |
1111 | + * Reading this file can return large amounts of data if a cgroup has | |
1112 | + * *lots* of attached tasks. So it may need several calls to read(), | |
1113 | + * but we cannot guarantee that the information we produce is correct | |
1114 | + * unless we produce it entirely atomically. | |
1115 | + * | |
1116 | + * Upon tasks file open(), a struct ctr_struct is allocated, that | |
1117 | + * will have a pointer to an array (also allocated here). The struct | |
1118 | + * ctr_struct * is stored in file->private_data. Its resources will | |
1119 | + * be freed by release() when the file is closed. The array is used | |
1120 | + * to sprintf the PIDs and then used by read(). | |
1121 | + */ | |
1122 | +struct ctr_struct { | |
1123 | + char *buf; | |
1124 | + int bufsz; | |
1125 | +}; | |
1126 | + | |
1127 | +/* | |
1128 | + * Load into 'pidarray' up to 'npids' of the tasks using cgroup | |
1129 | + * 'cont'. Return actual number of pids loaded. No need to | |
1130 | + * task_lock(p) when reading out p->cgroup, since we're in an RCU | |
1131 | + * read section, so the css_set can't go away, and is | |
1132 | + * immutable after creation. | |
1133 | + */ | |
1134 | +static int pid_array_load(pid_t *pidarray, int npids, struct cgroup *cont) | |
1135 | +{ | |
1136 | + int n = 0; | |
1137 | + struct task_struct *g, *p; | |
1138 | + struct cgroup_subsys_state *css; | |
1139 | + int subsys_id; | |
1140 | + | |
1141 | + get_first_subsys(cont, &css, &subsys_id); | |
1142 | + rcu_read_lock(); | |
1143 | + do_each_thread(g, p) { | |
1144 | + if (task_subsys_state(p, subsys_id) == css) { | |
1145 | + pidarray[n++] = pid_nr(task_pid(p)); | |
1146 | + if (unlikely(n == npids)) | |
1147 | + goto array_full; | |
1148 | + } | |
1149 | + } while_each_thread(g, p); | |
1150 | + | |
1151 | +array_full: | |
1152 | + rcu_read_unlock(); | |
1153 | + return n; | |
1154 | +} | |
1155 | + | |
1156 | +static int cmppid(const void *a, const void *b) | |
1157 | +{ | |
1158 | + return *(pid_t *)a - *(pid_t *)b; | |
1159 | +} | |
1160 | + | |
1161 | +/* | |
1162 | + * Convert array 'a' of 'npids' pid_t's to a string of newline separated | |
1163 | + * decimal pids in 'buf'. Don't write more than 'sz' chars, but return | |
1164 | + * count 'cnt' of how many chars would be written if buf were large enough. | |
1165 | + */ | |
1166 | +static int pid_array_to_buf(char *buf, int sz, pid_t *a, int npids) | |
1167 | +{ | |
1168 | + int cnt = 0; | |
1169 | + int i; | |
1170 | + | |
1171 | + for (i = 0; i < npids; i++) | |
1172 | + cnt += snprintf(buf + cnt, max(sz - cnt, 0), "%d\n", a[i]); | |
1173 | + return cnt; | |
1174 | +} | |
1175 | + | |
1176 | +/* | |
1177 | + * Handle an open on 'tasks' file. Prepare a buffer listing the | |
1178 | + * process id's of tasks currently attached to the cgroup being opened. | |
1179 | + * | |
1180 | + * Does not require any specific cgroup mutexes, and does not take any. | |
1181 | + */ | |
1182 | +static int cgroup_tasks_open(struct inode *unused, struct file *file) | |
1183 | +{ | |
1184 | + struct cgroup *cont = __d_cont(file->f_dentry->d_parent); | |
1185 | + struct ctr_struct *ctr; | |
1186 | + pid_t *pidarray; | |
1187 | + int npids; | |
1188 | + char c; | |
1189 | + | |
1190 | + if (!(file->f_mode & FMODE_READ)) | |
1191 | + return 0; | |
1192 | + | |
1193 | + ctr = kmalloc(sizeof(*ctr), GFP_KERNEL); | |
1194 | + if (!ctr) | |
1195 | + goto err0; | |
1196 | + | |
1197 | + /* | |
1198 | + * If cgroup gets more users after we read count, we won't have | |
1199 | + * enough space - tough. This race is indistinguishable to the | |
1200 | + * caller from the case that the additional cgroup users didn't | |
1201 | + * show up until sometime later on. | |
1202 | + */ | |
1203 | + npids = cgroup_task_count(cont); | |
1204 | + if (npids) { | |
1205 | + pidarray = kmalloc(npids * sizeof(pid_t), GFP_KERNEL); | |
1206 | + if (!pidarray) | |
1207 | + goto err1; | |
1208 | + | |
1209 | + npids = pid_array_load(pidarray, npids, cont); | |
1210 | + sort(pidarray, npids, sizeof(pid_t), cmppid, NULL); | |
1211 | + | |
1212 | + /* Call pid_array_to_buf() twice, first just to get bufsz */ | |
1213 | + ctr->bufsz = pid_array_to_buf(&c, sizeof(c), pidarray, npids) + 1; | |
1214 | + ctr->buf = kmalloc(ctr->bufsz, GFP_KERNEL); | |
1215 | + if (!ctr->buf) | |
1216 | + goto err2; | |
1217 | + ctr->bufsz = pid_array_to_buf(ctr->buf, ctr->bufsz, pidarray, npids); | |
1218 | + | |
1219 | + kfree(pidarray); | |
1220 | + } else { | |
1221 | + ctr->buf = 0; | |
1222 | + ctr->bufsz = 0; | |
1223 | + } | |
1224 | + file->private_data = ctr; | |
1225 | + return 0; | |
1226 | + | |
1227 | +err2: | |
1228 | + kfree(pidarray); | |
1229 | +err1: | |
1230 | + kfree(ctr); | |
1231 | +err0: | |
1232 | + return -ENOMEM; | |
1233 | +} | |
1234 | + | |
1235 | +static ssize_t cgroup_tasks_read(struct cgroup *cont, | |
1236 | + struct cftype *cft, | |
1237 | + struct file *file, char __user *buf, | |
1238 | + size_t nbytes, loff_t *ppos) | |
1239 | +{ | |
1240 | + struct ctr_struct *ctr = file->private_data; | |
1241 | + | |
1242 | + return simple_read_from_buffer(buf, nbytes, ppos, ctr->buf, ctr->bufsz); | |
1243 | +} | |
1244 | + | |
1245 | +static int cgroup_tasks_release(struct inode *unused_inode, | |
1246 | + struct file *file) | |
1247 | +{ | |
1248 | + struct ctr_struct *ctr; | |
1249 | + | |
1250 | + if (file->f_mode & FMODE_READ) { | |
1251 | + ctr = file->private_data; | |
1252 | + kfree(ctr->buf); | |
1253 | + kfree(ctr); | |
1254 | + } | |
1255 | + return 0; | |
1256 | +} | |
1257 | + | |
1258 | +/* | |
1259 | + * for the common functions, 'private' gives the type of file | |
1260 | + */ | |
1261 | +static struct cftype cft_tasks = { | |
1262 | + .name = "tasks", | |
1263 | + .open = cgroup_tasks_open, | |
1264 | + .read = cgroup_tasks_read, | |
1265 | + .write = cgroup_common_file_write, | |
1266 | + .release = cgroup_tasks_release, | |
1267 | + .private = FILE_TASKLIST, | |
1268 | +}; | |
1269 | + | |
917 | 1270 | static int cgroup_populate_dir(struct cgroup *cont) |
918 | 1271 | { |
919 | 1272 | int err; |
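The buffer sizing in cgroup_tasks_open() relies on snprintf() reporting how many characters it would have written: pid_array_to_buf() is called once with a one-byte buffer just to learn the required size, and then again into an allocation of exactly that size. The same pattern in plain user-space C, with made-up pids, purely for illustration:

    #include <stdio.h>
    #include <stdlib.h>

    /* same two-pass idea as pid_array_to_buf(): snprintf() returns the number
     * of characters it would have written, so a first pass with a tiny buffer
     * yields the exact size to allocate for the second pass */
    static int pids_to_buf(char *buf, int sz, const int *pids, int npids)
    {
    	int cnt = 0;
    	int i;

    	for (i = 0; i < npids; i++) {
    		int room = (cnt < sz) ? sz - cnt : 0;

    		cnt += snprintf(buf + (room ? cnt : 0), room, "%d\n", pids[i]);
    	}
    	return cnt;
    }

    int main(void)
    {
    	int pids[] = { 1, 42, 31337 };	/* made-up pids */
    	int npids = sizeof(pids) / sizeof(pids[0]);
    	char c;
    	int bufsz;
    	char *buf;

    	bufsz = pids_to_buf(&c, sizeof(c), pids, npids) + 1;	/* first pass: size only */
    	buf = malloc(bufsz);
    	if (!buf)
    		return 1;
    	pids_to_buf(buf, bufsz, pids, npids);	/* second pass: fill the buffer */
    	fputs(buf, stdout);
    	free(buf);
    	return 0;
    }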
... | ... | @@ -921,6 +1274,10 @@ |
921 | 1274 | |
922 | 1275 | /* First clear out any existing files */ |
923 | 1276 | cgroup_clear_directory(cont->dentry); |
1277 | + | |
1278 | + err = cgroup_add_file(cont, NULL, &cft_tasks); | |
1279 | + if (err < 0) | |
1280 | + return err; | |
924 | 1281 | |
925 | 1282 | for_each_subsys(cont->root, ss) { |
926 | 1283 | if (ss->populate && (err = ss->populate(ss, cont)) < 0) |
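Finally, on the read side: cgroup_tasks_read() hands the snapshot built at open() time to simple_read_from_buffer(), so a large task list may take several read() calls to drain while still reflecting a single consistent moment. A user-space sketch of such a reader (the mount point and group name are again assumptions, not part of the patch):

    #include <sys/types.h>
    #include <fcntl.h>
    #include <stdio.h>
    #include <unistd.h>

    int main(void)
    {
    	/* hypothetical path: assumes cgroupfs is mounted at /dev/cgroup
    	 * and the group "mygroup" exists */
    	int fd = open("/dev/cgroup/mygroup/tasks", O_RDONLY);
    	char chunk[64];
    	ssize_t n;

    	if (fd < 0) {
    		perror("open");
    		return 1;
    	}
    	/* the pid snapshot was built at open() time; draining it may take
    	 * several read() calls, but stays internally consistent */
    	while ((n = read(fd, chunk, sizeof(chunk))) > 0)
    		fwrite(chunk, 1, n, stdout);
    	close(fd);
    	return 0;
    }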