Commit 8707d8b8c0cbdf4441507f8dded194167da896c7
Committed by: Linus Torvalds
1 parent: 020958b627
Exists in master and in 7 other branches
Fix cpusets update_cpumask
Cause writes to cpuset "cpus" file to update cpus_allowed for member tasks:

- collect batches of tasks under tasklist_lock and then call set_cpus_allowed() on them outside the lock (since this can sleep).

- add a simple generic priority heap type to allow efficient collection of batches of tasks to be processed without duplicating or missing any tasks in subsequent batches.

- make "cpus" file update a no-op if the mask hasn't changed.

- fix race between update_cpumask() and sched_setaffinity() by making sched_setaffinity() post-check that it's not running on any cpus outside cpuset_cpus_allowed().

[akpm@linux-foundation.org: coding-style fixes]
Signed-off-by: Paul Menage <menage@google.com>
Cc: Paul Jackson <pj@sgi.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: Cedric Le Goater <clg@fr.ibm.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Serge Hallyn <serue@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
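The batching scheme in the second bullet can be tried in miniature in userspace. In this hedged sketch (BATCH, the integer "start times", and the O(n) eviction scan are illustrative simplifications; the patch uses the priority heap added below), a bounded buffer plus a start-time watermark processes every task exactly once, no matter how small the buffer is:

#include <stdio.h>

#define NTASKS 10
#define BATCH  3

int main(void)
{
        /* stand-ins for task start times, in arbitrary scan order */
        int start[NTASKS] = { 5, 1, 9, 3, 7, 2, 8, 4, 6, 10 };
        int watermark = 0, processed = 0;

        while (processed < NTASKS) {
                int batch[BATCH], n = 0;

                /* "scan under tasklist_lock": keep the BATCH earliest
                 * starters newer than the watermark */
                for (int i = 0; i < NTASKS; i++) {
                        if (start[i] <= watermark)
                                continue;       /* handled in an earlier pass */
                        if (n < BATCH) {
                                batch[n++] = start[i];
                                continue;
                        }
                        /* buffer full: evict the latest starter, as the
                         * priority heap does on overflow */
                        int max = 0;
                        for (int j = 1; j < n; j++)
                                if (batch[j] > batch[max])
                                        max = j;
                        if (start[i] < batch[max])
                                batch[max] = start[i];
                }

                /* "process outside the lock", then advance the mark */
                int newmark = watermark;
                for (int j = 0; j < n; j++) {
                        printf("handling task started at %d\n", batch[j]);
                        if (batch[j] > newmark)
                                newmark = batch[j];
                }
                processed += n;
                watermark = newmark;
        }
        return 0;
}

Because each pass keeps only the earliest starters above the previous watermark, later passes can never revisit a handled task or skip an unhandled one, which is the forward-progress guarantee the commit relies on.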
Showing 5 changed files with 243 additions and 5 deletions
include/linux/prio_heap.h
@@ -0,0 +1,58 @@
+#ifndef _LINUX_PRIO_HEAP_H
+#define _LINUX_PRIO_HEAP_H
+
+/*
+ * Simple insertion-only static-sized priority heap containing
+ * pointers, based on CLR, chapter 7
+ */
+
+#include <linux/gfp.h>
+
+/**
+ * struct ptr_heap - simple static-sized priority heap
+ * @ptrs: pointer to data area
+ * @max: max number of elements that can be stored in @ptrs
+ * @size: current number of valid elements in @ptrs (in the range 0..@size-1)
+ * @gt: comparison operator, which should implement "greater than"
+ */
+struct ptr_heap {
+        void **ptrs;
+        int max;
+        int size;
+        int (*gt)(void *, void *);
+};
+
+/**
+ * heap_init - initialize an empty heap with a given memory size
+ * @heap: the heap structure to be initialized
+ * @size: amount of memory to use in bytes
+ * @gfp_mask: mask to pass to kmalloc()
+ * @gt: comparison operator, which should implement "greater than"
+ */
+extern int heap_init(struct ptr_heap *heap, size_t size, gfp_t gfp_mask,
+                     int (*gt)(void *, void *));
+
+/**
+ * heap_free - release a heap's storage
+ * @heap: the heap structure whose data should be released
+ */
+void heap_free(struct ptr_heap *heap);
+
+/**
+ * heap_insert - insert a value into the heap and return any overflowed value
+ * @heap: the heap to be operated on
+ * @p: the pointer to be inserted
+ *
+ * Attempts to insert the given value into the priority heap. If the
+ * heap is full prior to the insertion, then the resulting heap will
+ * consist of the smallest @max elements of the original heap and the
+ * new element; the greatest element will be removed from the heap and
+ * returned. Note that the returned element will be the new element
+ * (i.e. no change to the heap) if the new element is greater than all
+ * elements currently in the heap.
+ */
+extern void *heap_insert(struct ptr_heap *heap, void *p);
+
+
+
+#endif /* _LINUX_PRIO_HEAP_H */
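To make the heap_insert() contract concrete, here is a hedged kernel-style sketch of a hypothetical caller; gather_earliest() and newer() are illustrative names, not part of this patch. It keeps the earliest-started tasks from an array, letting the heap evict later starters once the page-sized storage is full:

#include <linux/prio_heap.h>
#include <linux/sched.h>

/* "Greater than" here means "started later"; cf. started_after()
 * in kernel/cpuset.c below. */
static int newer(void *a, void *b)
{
        struct task_struct *t1 = a, *t2 = b;

        return timespec_compare(&t1->start_time, &t2->start_time) > 0;
}

static int gather_earliest(struct task_struct **tasks, int ntasks,
                           struct ptr_heap *heap)
{
        int i, err;

        err = heap_init(heap, PAGE_SIZE, GFP_KERNEL, &newer);
        if (err)
                return err;             /* -ENOMEM */
        for (i = 0; i < ntasks; i++) {
                /* Returns NULL (stored, heap had room), tasks[i]
                 * (rejected: newer than all stored entries), or an
                 * evicted pointer (stored, a later starter fell out). */
                heap_insert(heap, tasks[i]);
        }
        /* caller walks heap->ptrs[0..heap->size-1], then heap_free() */
        return 0;
}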
kernel/cpuset.c
@@ -38,6 +38,7 @@
 #include <linux/mount.h>
 #include <linux/namei.h>
 #include <linux/pagemap.h>
+#include <linux/prio_heap.h>
 #include <linux/proc_fs.h>
 #include <linux/rcupdate.h>
 #include <linux/sched.h>
@@ -701,6 +702,36 @@
 	/* Don't kfree(doms) -- partition_sched_domains() does that. */
 }
 
+static inline int started_after_time(struct task_struct *t1,
+                                     struct timespec *time,
+                                     struct task_struct *t2)
+{
+        int start_diff = timespec_compare(&t1->start_time, time);
+        if (start_diff > 0) {
+                return 1;
+        } else if (start_diff < 0) {
+                return 0;
+        } else {
+                /*
+                 * Arbitrarily, if two processes started at the same
+                 * time, we'll say that the lower pointer value
+                 * started first. Note that t2 may have exited by now
+                 * so this may not be a valid pointer any longer, but
+                 * that's fine - it still serves to distinguish
+                 * between two tasks started (effectively)
+                 * simultaneously.
+                 */
+                return t1 > t2;
+        }
+}
+
+static inline int started_after(void *p1, void *p2)
+{
+        struct task_struct *t1 = p1;
+        struct task_struct *t2 = p2;
+        return started_after_time(t1, &t2->start_time, t2);
+}
+
 /*
  * Call with manage_mutex held. May take callback_mutex during call.
 */
@@ -708,8 +739,15 @@
 static int update_cpumask(struct cpuset *cs, char *buf)
 {
 	struct cpuset trialcs;
-	int retval;
-	int cpus_changed, is_load_balanced;
+	int retval, i;
+	int is_load_balanced;
+	struct cgroup_iter it;
+	struct cgroup *cgrp = cs->css.cgroup;
+	struct task_struct *p, *dropped;
+	/* Never dereference latest_task, since it's not refcounted */
+	struct task_struct *latest_task = NULL;
+	struct ptr_heap heap;
+	struct timespec latest_time = { 0, 0 };
 
 	/* top_cpuset.cpus_allowed tracks cpu_online_map; it's read-only */
 	if (cs == &top_cpuset)
 		return -EACCES;
@@ -736,14 +774,73 @@
 	if (retval < 0)
 		return retval;
 
-	cpus_changed = !cpus_equal(cs->cpus_allowed, trialcs.cpus_allowed);
+	/* Nothing to do if the cpus didn't change */
+	if (cpus_equal(cs->cpus_allowed, trialcs.cpus_allowed))
+		return 0;
+	retval = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, &started_after);
+	if (retval)
+		return retval;
+
 	is_load_balanced = is_sched_load_balance(&trialcs);
 
 	mutex_lock(&callback_mutex);
 	cs->cpus_allowed = trialcs.cpus_allowed;
 	mutex_unlock(&callback_mutex);
 
-	if (cpus_changed && is_load_balanced)
+ again:
+	/*
+	 * Scan tasks in the cpuset, and update the cpumasks of any
+	 * that need an update. Since we can't call set_cpus_allowed()
+	 * while holding tasklist_lock, gather tasks to be processed
+	 * in a heap structure. If the statically-sized heap fills up,
+	 * overflow tasks that started later, and in future iterations
+	 * only consider tasks that started after the latest task in
+	 * the previous pass. This guarantees forward progress and
+	 * that we don't miss any tasks.
+	 */
+	heap.size = 0;
+	cgroup_iter_start(cgrp, &it);
+	while ((p = cgroup_iter_next(cgrp, &it))) {
+		/* Only affect tasks that don't have the right cpus_allowed */
+		if (cpus_equal(p->cpus_allowed, cs->cpus_allowed))
+			continue;
+		/*
+		 * Only process tasks that started after the last task
+		 * we processed
+		 */
+		if (!started_after_time(p, &latest_time, latest_task))
+			continue;
+		dropped = heap_insert(&heap, p);
+		if (dropped == NULL) {
+			/* heap had room: p stored, take a reference */
+			get_task_struct(p);
+		} else if (dropped != p) {
+			/* p stored, a later-started task was evicted */
+			get_task_struct(p);
+			put_task_struct(dropped);
+		}
+		/* else p was rejected: newer than all stored tasks */
+	}
+	cgroup_iter_end(cgrp, &it);
+	if (heap.size) {
+		for (i = 0; i < heap.size; i++) {
+			struct task_struct *p = heap.ptrs[i];
+			if (i == 0) {
+				latest_time = p->start_time;
+				latest_task = p;
+			}
+			set_cpus_allowed(p, cs->cpus_allowed);
+			put_task_struct(p);
+		}
+		/*
+		 * If we had to process any tasks at all, scan again
+		 * in case some of them were in the middle of forking
+		 * children that didn't notice the new cpumask
+		 * restriction. Not the most efficient way to do it,
+		 * but it avoids having to take callback_mutex in the
+		 * fork path.
+		 */
+		goto again;
+	}
+	heap_free(&heap);
+	if (is_load_balanced)
 		rebuild_sched_domains();
 
 	return 0;
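The get_task_struct()/put_task_struct() pairing around heap_insert() above has exactly three cases. A userspace toy (hypothetical, not kernel code; a one-slot "heap" and an integer refcount array stand in for task_struct pinning) shows the discipline: a stored pointer holds one reference, an evicted entry drops its reference, and a rejected newcomer never takes one:

#include <assert.h>
#include <stdio.h>

#define NTASKS 4

static int refcount[NTASKS];            /* stands in for task_struct pins */

static void get_task(int t) { refcount[t]++; }
static void put_task(int t) { refcount[t]--; }

int main(void)
{
        int start[NTASKS] = { 3, 1, 4, 2 };     /* "start times" */
        int slot = -1;          /* one-slot batch keeping the earliest */

        for (int t = 0; t < NTASKS; t++) {
                if (slot < 0) {
                        slot = t;               /* stored: pin it */
                        get_task(t);
                } else if (start[t] < start[slot]) {
                        put_task(slot);         /* evicted: unpin */
                        slot = t;               /* stored: pin */
                        get_task(t);
                }
                /* else rejected: started later, never pinned */
        }

        printf("batch: task %d (started at %d)\n", slot, start[slot]);
        put_task(slot);                         /* processed: unpin */

        for (int t = 0; t < NTASKS; t++)
                assert(refcount[t] == 0);       /* nothing leaked */
        return 0;
}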
kernel/sched.c
@@ -4471,8 +4471,21 @@
 
 	cpus_allowed = cpuset_cpus_allowed(p);
 	cpus_and(new_mask, new_mask, cpus_allowed);
+ again:
 	retval = set_cpus_allowed(p, new_mask);
 
+	if (!retval) {
+		cpus_allowed = cpuset_cpus_allowed(p);
+		if (!cpus_subset(new_mask, cpus_allowed)) {
+			/*
+			 * We must have raced with a concurrent cpuset
+			 * update. Just reset the cpus_allowed to the
+			 * cpuset's cpus_allowed
+			 */
+			new_mask = cpus_allowed;
+			goto again;
+		}
+	}
 out_unlock:
 	put_task_struct(p);
 	mutex_unlock(&sched_hotcpu_mutex);
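This retry loop is the sched_setaffinity() half of the race fix; update_cpumask() above is the other half. A userspace sketch (plain uint64_t bitmasks instead of cpumask_t; setter(), updater(), and the other names are hypothetical stand-ins, not kernel APIs) shows why the post-check converges: whichever side writes the task's mask last, the result stays inside the cpuset. Build with cc -pthread:

#include <assert.h>
#include <pthread.h>
#include <stdint.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static uint64_t cpuset_allowed = 0xff;  /* the cpuset's "cpus" */
static uint64_t task_mask = 0xff;       /* the task's cpus_allowed */

static uint64_t get_allowed(void)
{
        pthread_mutex_lock(&lock);
        uint64_t m = cpuset_allowed;
        pthread_mutex_unlock(&lock);
        return m;
}

static void set_task_mask(uint64_t m)   /* plays set_cpus_allowed() */
{
        pthread_mutex_lock(&lock);
        task_mask = m;
        pthread_mutex_unlock(&lock);
}

static void *setter(void *arg)          /* plays sched_setaffinity() */
{
        uint64_t new_mask = (uint64_t)(uintptr_t)arg & get_allowed();
again:
        set_task_mask(new_mask);
        /* post-check: did a cpuset update race with us? */
        uint64_t allowed = get_allowed();
        if (new_mask & ~allowed) {
                new_mask = allowed;
                goto again;
        }
        return NULL;
}

static void *updater(void *arg)         /* plays update_cpumask() */
{
        (void)arg;
        pthread_mutex_lock(&lock);
        cpuset_allowed = 0x0f;          /* shrink the cpuset */
        if (task_mask & ~cpuset_allowed)
                task_mask = cpuset_allowed;     /* fix up the member */
        pthread_mutex_unlock(&lock);
        return NULL;
}

int main(void)
{
        pthread_t a, b;

        pthread_create(&a, NULL, setter, (void *)(uintptr_t)0x3c);
        pthread_create(&b, NULL, updater, NULL);
        pthread_join(a, NULL);
        pthread_join(b, NULL);
        /* the invariant the patch restores: task mask ⊆ cpuset mask */
        assert(!(task_mask & ~cpuset_allowed));
        printf("final task_mask %#llx\n", (unsigned long long)task_mask);
        return 0;
}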
lib/Makefile
@@ -6,7 +6,7 @@
 	 rbtree.o radix-tree.o dump_stack.o \
 	 idr.o int_sqrt.o bitmap.o extable.o prio_tree.o \
 	 sha1.o irq_regs.o reciprocal_div.o argv_split.o \
-	 proportions.o
+	 proportions.o prio_heap.o
 
 lib-$(CONFIG_MMU) += ioremap.o
 lib-$(CONFIG_SMP) += cpumask.o
lib/prio_heap.c
@@ -0,0 +1,70 @@
+/*
+ * Simple insertion-only static-sized priority heap containing
+ * pointers, based on CLR, chapter 7
+ */
+
+#include <linux/slab.h>
+#include <linux/prio_heap.h>
+
+int heap_init(struct ptr_heap *heap, size_t size, gfp_t gfp_mask,
+              int (*gt)(void *, void *))
+{
+        heap->ptrs = kmalloc(size, gfp_mask);
+        if (!heap->ptrs)
+                return -ENOMEM;
+        heap->size = 0;
+        heap->max = size / sizeof(void *);
+        heap->gt = gt;
+        return 0;
+}
+
+void heap_free(struct ptr_heap *heap)
+{
+        kfree(heap->ptrs);
+}
+
+void *heap_insert(struct ptr_heap *heap, void *p)
+{
+        void *res;
+        void **ptrs = heap->ptrs;
+        int pos;
+
+        if (heap->size < heap->max) {
+                /* Heap insertion */
+                int pos = heap->size++;
+                while (pos > 0 && heap->gt(p, ptrs[(pos-1)/2])) {
+                        ptrs[pos] = ptrs[(pos-1)/2];
+                        pos = (pos-1)/2;
+                }
+                ptrs[pos] = p;
+                return NULL;
+        }
+
+        /* The heap is full, so something will have to be dropped */
+
+        /* If the new pointer is greater than the current max, drop it */
+        if (heap->gt(p, ptrs[0]))
+                return p;
+
+        /* Replace the current max and heapify */
+        res = ptrs[0];
+        ptrs[0] = p;
+        pos = 0;
+
+        while (1) {
+                int left = 2 * pos + 1;
+                int right = 2 * pos + 2;
+                int largest = pos;
+                if (left < heap->size && heap->gt(ptrs[left], p))
+                        largest = left;
+                if (right < heap->size && heap->gt(ptrs[right], ptrs[largest]))
+                        largest = right;
+                if (largest == pos)
+                        break;
+                /* Push p down the heap one level and bump one up */
+                ptrs[pos] = ptrs[largest];
+                ptrs[largest] = p;
+                pos = largest;
+        }
+        return res;
+}
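The algorithm above is easy to exercise outside the kernel. This is a hedged userspace port (assumption: the same heap_insert() logic with the kernel types and kmalloc() dropped) that keeps the 4 smallest values of a stream; the displaced maxima come back out as 9, 8, 7:

#include <stdio.h>

struct ptr_heap {
        void **ptrs;
        int max;
        int size;
        int (*gt)(void *, void *);
};

static int int_gt(void *a, void *b)
{
        return *(int *)a > *(int *)b;
}

/* same algorithm as heap_insert() above */
static void *heap_insert(struct ptr_heap *heap, void *p)
{
        void *res;
        void **ptrs = heap->ptrs;
        int pos;

        if (heap->size < heap->max) {
                pos = heap->size++;
                while (pos > 0 && heap->gt(p, ptrs[(pos-1)/2])) {
                        ptrs[pos] = ptrs[(pos-1)/2];
                        pos = (pos-1)/2;
                }
                ptrs[pos] = p;
                return NULL;
        }
        if (heap->gt(p, ptrs[0]))
                return p;               /* newer than everything kept */
        res = ptrs[0];
        ptrs[0] = p;
        pos = 0;
        while (1) {
                int left = 2 * pos + 1, right = 2 * pos + 2, largest = pos;
                if (left < heap->size && heap->gt(ptrs[left], p))
                        largest = left;
                if (right < heap->size && heap->gt(ptrs[right], ptrs[largest]))
                        largest = right;
                if (largest == pos)
                        break;
                ptrs[pos] = ptrs[largest];
                ptrs[largest] = p;
                pos = largest;
        }
        return res;                     /* the displaced maximum */
}

int main(void)
{
        int vals[] = { 7, 3, 9, 1, 8, 2, 6 };
        void *slots[4];
        struct ptr_heap h = { slots, 4, 0, int_gt };

        for (int i = 0; i < 7; i++) {
                void *dropped = heap_insert(&h, &vals[i]);
                if (dropped)
                        printf("dropped %d\n", *(int *)dropped); /* 9, 8, 7 */
        }
        for (int i = 0; i < h.size; i++)
                printf("kept %d\n", *(int *)h.ptrs[i]); /* 4 smallest, heap order */
        return 0;
}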