Commit 2c6ab6d200827e1c41dc71fff3a2ac7473f51777

Authored by Paul Menage
Committed by Linus Torvalds
1 parent 7717f7ba92

cgroups: allow cgroup hierarchies to be created with no bound subsystems

This patch removes the restriction that a cgroup hierarchy must have at
least one bound subsystem.  The mount option "none" is treated as an
explicit request for no bound subsystems.

A hierarchy with no subsystems can be useful for plain task tracking, and
is also a step towards the support for multiply-bindable subsystems.

As part of this change, the hierarchy id is no longer calculated from the
bitmask of subsystems in the hierarchy (since this is not guaranteed to be
unique) but is allocated via an ida.  Reference counts on cgroups from
css_set objects are now taken explicitly one per hierarchy, rather than
one per subsystem.

Example usage:

mount -t cgroup -o none,name=foo cgroup /mnt/cgroup

Based on the "no-op"/"none" subsystem concept proposed by
kamezawa.hiroyu@jp.fujitsu.com

Signed-off-by: Paul Menage <menage@google.com>
Reviewed-by: Li Zefan <lizf@cn.fujitsu.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: Dhaval Giani <dhaval@linux.vnet.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

Showing 1 changed file with 99 additions and 59 deletions (side-by-side diff)

... ... @@ -49,6 +49,7 @@
49 49 #include <linux/namei.h>
50 50 #include <linux/smp_lock.h>
51 51 #include <linux/pid_namespace.h>
  52 +#include <linux/idr.h>
52 53  
53 54 #include <asm/atomic.h>
54 55  
... ... @@ -77,6 +78,9 @@
77 78 */
78 79 unsigned long subsys_bits;
79 80  
  81 + /* Unique id for this hierarchy. */
  82 + int hierarchy_id;
  83 +
80 84 /* The bitmask of subsystems currently attached to this hierarchy */
81 85 unsigned long actual_subsys_bits;
82 86  
... ... @@ -147,6 +151,10 @@
147 151 static LIST_HEAD(roots);
148 152 static int root_count;
149 153  
  154 +static DEFINE_IDA(hierarchy_ida);
  155 +static int next_hierarchy_id;
  156 +static DEFINE_SPINLOCK(hierarchy_id_lock);
  157 +
150 158 /* dummytop is a shorthand for the dummy hierarchy's top cgroup */
151 159 #define dummytop (&rootnode.top_cgroup)
152 160  
153 161  
... ... @@ -264,42 +272,10 @@
264 272 * compiled into their kernel but not actually in use */
265 273 static int use_task_css_set_links __read_mostly;
266 274  
267   -/* When we create or destroy a css_set, the operation simply
268   - * takes/releases a reference count on all the cgroups referenced
269   - * by subsystems in this css_set. This can end up multiple-counting
270   - * some cgroups, but that's OK - the ref-count is just a
271   - * busy/not-busy indicator; ensuring that we only count each cgroup
272   - * once would require taking a global lock to ensure that no
273   - * subsystems moved between hierarchies while we were doing so.
274   - *
275   - * Possible TODO: decide at boot time based on the number of
276   - * registered subsystems and the number of CPUs or NUMA nodes whether
277   - * it's better for performance to ref-count every subsystem, or to
278   - * take a global lock and only add one ref count to each hierarchy.
279   - */
280   -
281   -/*
282   - * unlink a css_set from the list and free it
283   - */
284   -static void unlink_css_set(struct css_set *cg)
  275 +static void __put_css_set(struct css_set *cg, int taskexit)
285 276 {
286 277 struct cg_cgroup_link *link;
287 278 struct cg_cgroup_link *saved_link;
288   -
289   - hlist_del(&cg->hlist);
290   - css_set_count--;
291   -
292   - list_for_each_entry_safe(link, saved_link, &cg->cg_links,
293   - cg_link_list) {
294   - list_del(&link->cg_link_list);
295   - list_del(&link->cgrp_link_list);
296   - kfree(link);
297   - }
298   -}
299   -
300   -static void __put_css_set(struct css_set *cg, int taskexit)
301   -{
302   - int i;
303 279 /*
304 280 * Ensure that the refcount doesn't hit zero while any readers
305 281 * can see it. Similar to atomic_dec_and_lock(), but for an
306 282  
307 283  
308 284  
... ... @@ -312,20 +288,27 @@
312 288 write_unlock(&css_set_lock);
313 289 return;
314 290 }
315   - unlink_css_set(cg);
316   - write_unlock(&css_set_lock);
317 291  
318   - rcu_read_lock();
319   - for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
320   - struct cgroup *cgrp = rcu_dereference(cg->subsys[i]->cgroup);
  292 + /* This css_set is dead. unlink it and release cgroup refcounts */
  293 + hlist_del(&cg->hlist);
  294 + css_set_count--;
  295 +
  296 + list_for_each_entry_safe(link, saved_link, &cg->cg_links,
  297 + cg_link_list) {
  298 + struct cgroup *cgrp = link->cgrp;
  299 + list_del(&link->cg_link_list);
  300 + list_del(&link->cgrp_link_list);
321 301 if (atomic_dec_and_test(&cgrp->count) &&
322 302 notify_on_release(cgrp)) {
323 303 if (taskexit)
324 304 set_bit(CGRP_RELEASABLE, &cgrp->flags);
325 305 check_for_release(cgrp);
326 306 }
  307 +
  308 + kfree(link);
327 309 }
328   - rcu_read_unlock();
  310 +
  311 + write_unlock(&css_set_lock);
329 312 kfree(cg);
330 313 }
331 314  
... ... @@ -519,6 +502,7 @@
519 502 cgrp_link_list);
520 503 link->cg = cg;
521 504 link->cgrp = cgrp;
  505 + atomic_inc(&cgrp->count);
522 506 list_move(&link->cgrp_link_list, &cgrp->css_sets);
523 507 /*
524 508 * Always add links to the tail of the list so that the list
... ... @@ -539,7 +523,6 @@
539 523 {
540 524 struct css_set *res;
541 525 struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT];
542   - int i;
543 526  
544 527 struct list_head tmp_cg_links;
545 528  
... ... @@ -578,10 +561,6 @@
578 561  
579 562 write_lock(&css_set_lock);
580 563 /* Add reference counts and links from the new css_set. */
581   - for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
582   - struct cgroup *cgrp = res->subsys[i]->cgroup;
583   - atomic_inc(&cgrp->count);
584   - }
585 564 list_for_each_entry(link, &oldcg->cg_links, cg_link_list) {
586 565 struct cgroup *c = link->cgrp;
587 566 if (c->root == cgrp->root)
588 567  
... ... @@ -972,8 +951,11 @@
972 951 unsigned long flags;
973 952 char *release_agent;
974 953 char *name;
  954 + /* User explicitly requested empty subsystem */
  955 + bool none;
975 956  
976 957 struct cgroupfs_root *new_root;
  958 +
977 959 };
978 960  
979 961 /* Convert a hierarchy specifier into a bitmask of subsystems and
... ... @@ -1002,6 +984,9 @@
1002 984 if (!ss->disabled)
1003 985 opts->subsys_bits |= 1ul << i;
1004 986 }
  987 + } else if (!strcmp(token, "none")) {
  988 + /* Explicitly have no subsystems */
  989 + opts->none = true;
1005 990 } else if (!strcmp(token, "noprefix")) {
1006 991 set_bit(ROOT_NOPREFIX, &opts->flags);
1007 992 } else if (!strncmp(token, "release_agent=", 14)) {
... ... @@ -1051,6 +1036,8 @@
1051 1036 }
1052 1037 }
1053 1038  
  1039 + /* Consistency checks */
  1040 +
1054 1041 /*
1055 1042 * Option noprefix was introduced just for backward compatibility
1056 1043 * with the old cpuset, so we allow noprefix only if mounting just
... ... @@ -1060,7 +1047,15 @@
1060 1047 (opts->subsys_bits & mask))
1061 1048 return -EINVAL;
1062 1049  
1063   - /* We can't have an empty hierarchy */
  1050 +
  1051 + /* Can't specify "none" and some subsystems */
  1052 + if (opts->subsys_bits && opts->none)
  1053 + return -EINVAL;
  1054 +
  1055 + /*
  1056 + * We either have to specify by name or by subsystems. (So all
  1057 + * empty hierarchies must have a name).
  1058 + */
1064 1059 if (!opts->subsys_bits && !opts->name)
1065 1060 return -EINVAL;
1066 1061  
... ... @@ -1141,6 +1136,31 @@
1141 1136 init_cgroup_housekeeping(cgrp);
1142 1137 }
1143 1138  
  1139 +static bool init_root_id(struct cgroupfs_root *root)
  1140 +{
  1141 + int ret = 0;
  1142 +
  1143 + do {
  1144 + if (!ida_pre_get(&hierarchy_ida, GFP_KERNEL))
  1145 + return false;
  1146 + spin_lock(&hierarchy_id_lock);
  1147 + /* Try to allocate the next unused ID */
  1148 + ret = ida_get_new_above(&hierarchy_ida, next_hierarchy_id,
  1149 + &root->hierarchy_id);
  1150 + if (ret == -ENOSPC)
  1151 + /* Try again starting from 0 */
  1152 + ret = ida_get_new(&hierarchy_ida, &root->hierarchy_id);
  1153 + if (!ret) {
  1154 + next_hierarchy_id = root->hierarchy_id + 1;
  1155 + } else if (ret != -EAGAIN) {
  1156 + /* Can only get here if the 31-bit IDR is full ... */
  1157 + BUG_ON(ret);
  1158 + }
  1159 + spin_unlock(&hierarchy_id_lock);
  1160 + } while (ret);
  1161 + return true;
  1162 +}
  1163 +
1144 1164 static int cgroup_test_super(struct super_block *sb, void *data)
1145 1165 {
1146 1166 struct cgroup_sb_opts *opts = data;
... ... @@ -1150,8 +1170,12 @@
1150 1170 if (opts->name && strcmp(opts->name, root->name))
1151 1171 return 0;
1152 1172  
1153   - /* If we asked for subsystems then they must match */
1154   - if (opts->subsys_bits && (opts->subsys_bits != root->subsys_bits))
  1173 + /*
  1174 + * If we asked for subsystems (or explicitly for no
  1175 + * subsystems) then they must match
  1176 + */
  1177 + if ((opts->subsys_bits || opts->none)
  1178 + && (opts->subsys_bits != root->subsys_bits))
1155 1179 return 0;
1156 1180  
1157 1181 return 1;
1158 1182  
1159 1183  
... ... @@ -1161,15 +1185,19 @@
1161 1185 {
1162 1186 struct cgroupfs_root *root;
1163 1187  
1164   - /* Empty hierarchies aren't supported */
1165   - if (!opts->subsys_bits)
  1188 + if (!opts->subsys_bits && !opts->none)
1166 1189 return NULL;
1167 1190  
1168 1191 root = kzalloc(sizeof(*root), GFP_KERNEL);
1169 1192 if (!root)
1170 1193 return ERR_PTR(-ENOMEM);
1171 1194  
  1195 + if (!init_root_id(root)) {
  1196 + kfree(root);
  1197 + return ERR_PTR(-ENOMEM);
  1198 + }
1172 1199 init_cgroup_root(root);
  1200 +
1173 1201 root->subsys_bits = opts->subsys_bits;
1174 1202 root->flags = opts->flags;
1175 1203 if (opts->release_agent)
... ... @@ -1179,6 +1207,18 @@
1179 1207 return root;
1180 1208 }
1181 1209  
  1210 +static void cgroup_drop_root(struct cgroupfs_root *root)
  1211 +{
  1212 + if (!root)
  1213 + return;
  1214 +
  1215 + BUG_ON(!root->hierarchy_id);
  1216 + spin_lock(&hierarchy_id_lock);
  1217 + ida_remove(&hierarchy_ida, root->hierarchy_id);
  1218 + spin_unlock(&hierarchy_id_lock);
  1219 + kfree(root);
  1220 +}
  1221 +
1182 1222 static int cgroup_set_super(struct super_block *sb, void *data)
1183 1223 {
1184 1224 int ret;
... ... @@ -1188,7 +1228,7 @@
1188 1228 if (!opts->new_root)
1189 1229 return -EINVAL;
1190 1230  
1191   - BUG_ON(!opts->subsys_bits);
  1231 + BUG_ON(!opts->subsys_bits && !opts->none);
1192 1232  
1193 1233 ret = set_anon_super(sb, NULL);
1194 1234 if (ret)
... ... @@ -1257,7 +1297,7 @@
1257 1297 sb = sget(fs_type, cgroup_test_super, cgroup_set_super, &opts);
1258 1298 if (IS_ERR(sb)) {
1259 1299 ret = PTR_ERR(sb);
1260   - kfree(opts.new_root);
  1300 + cgroup_drop_root(opts.new_root);
1261 1301 goto out_err;
1262 1302 }
1263 1303  
... ... @@ -1351,7 +1391,7 @@
1351 1391 * We re-used an existing hierarchy - the new root (if
1352 1392 * any) is not needed
1353 1393 */
1354   - kfree(opts.new_root);
  1394 + cgroup_drop_root(opts.new_root);
1355 1395 }
1356 1396  
1357 1397 simple_set_mnt(mnt, sb);
... ... @@ -1410,7 +1450,7 @@
1410 1450 mutex_unlock(&cgroup_mutex);
1411 1451  
1412 1452 kill_litter_super(sb);
1413   - kfree(root);
  1453 + cgroup_drop_root(root);
1414 1454 }
1415 1455  
1416 1456 static struct file_system_type cgroup_fs_type = {
... ... @@ -3109,7 +3149,7 @@
3109 3149 /* Add init_css_set to the hash table */
3110 3150 hhead = css_set_hash(init_css_set.subsys);
3111 3151 hlist_add_head(&init_css_set.hlist, hhead);
3112   -
  3152 + BUG_ON(!init_root_id(&rootnode));
3113 3153 err = register_filesystem(&cgroup_fs_type);
3114 3154 if (err < 0)
3115 3155 goto out;
... ... @@ -3164,7 +3204,7 @@
3164 3204 struct cgroup *cgrp;
3165 3205 int count = 0;
3166 3206  
3167   - seq_printf(m, "%lu:", root->subsys_bits);
  3207 + seq_printf(m, "%d:", root->hierarchy_id);
3168 3208 for_each_subsys(root, ss)
3169 3209 seq_printf(m, "%s%s", count++ ? "," : "", ss->name);
3170 3210 if (strlen(root->name))
... ... @@ -3210,8 +3250,8 @@
3210 3250 mutex_lock(&cgroup_mutex);
3211 3251 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
3212 3252 struct cgroup_subsys *ss = subsys[i];
3213   - seq_printf(m, "%s\t%lu\t%d\t%d\n",
3214   - ss->name, ss->root->subsys_bits,
  3253 + seq_printf(m, "%s\t%d\t%d\t%d\n",
  3254 + ss->name, ss->root->hierarchy_id,
3215 3255 ss->root->number_of_cgroups, !ss->disabled);
3216 3256 }
3217 3257 mutex_unlock(&cgroup_mutex);
... ... @@ -3929,8 +3969,8 @@
3929 3969 name = c->dentry->d_name.name;
3930 3970 else
3931 3971 name = "?";
3932   - seq_printf(seq, "Root %lu group %s\n",
3933   - c->root->subsys_bits, name);
  3972 + seq_printf(seq, "Root %d group %s\n",
  3973 + c->root->hierarchy_id, name);
3934 3974 }
3935 3975 rcu_read_unlock();
3936 3976 read_unlock(&css_set_lock);