Commit 8707d8b8c0cbdf4441507f8dded194167da896c7
Committed by: Linus Torvalds
1 parent: 020958b627
Exists in master and in 7 other branches
Fix cpusets update_cpumask
Cause writes to cpuset "cpus" file to update cpus_allowed for member tasks:

- collect batches of tasks under tasklist_lock and then call set_cpus_allowed() on them outside the lock (since this can sleep).

- add a simple generic priority heap type to allow efficient collection of batches of tasks to be processed without duplicating or missing any tasks in subsequent batches.

- make "cpus" file update a no-op if the mask hasn't changed.

- fix race between update_cpumask() and sched_setaffinity() by making sched_setaffinity() post-check that it's not running on any cpus outside cpuset_cpus_allowed().

[akpm@linux-foundation.org: coding-style fixes]
Signed-off-by: Paul Menage <menage@google.com>
Cc: Paul Jackson <pj@sgi.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: Cedric Le Goater <clg@fr.ibm.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Serge Hallyn <serue@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
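The batching scheme in the second bullet can be tried in miniature in userspace. In this hedged sketch (BATCH, the integer "start times", and the O(n) eviction scan are illustrative simplifications; the patch uses the priority heap added below), a bounded buffer plus a start-time watermark processes every task exactly once, no matter how small the buffer is:

#include <stdio.h>

#define NTASKS 10
#define BATCH  3

int main(void)
{
        /* stand-ins for task start times, in arbitrary scan order */
        int start[NTASKS] = { 5, 1, 9, 3, 7, 2, 8, 4, 6, 10 };
        int watermark = 0, processed = 0;

        while (processed < NTASKS) {
                int batch[BATCH], n = 0;

                /* "scan under tasklist_lock": keep the BATCH earliest
                 * starters newer than the watermark */
                for (int i = 0; i < NTASKS; i++) {
                        if (start[i] <= watermark)
                                continue;       /* handled in an earlier pass */
                        if (n < BATCH) {
                                batch[n++] = start[i];
                                continue;
                        }
                        /* buffer full: evict the latest starter, as the
                         * priority heap does on overflow */
                        int max = 0;
                        for (int j = 1; j < n; j++)
                                if (batch[j] > batch[max])
                                        max = j;
                        if (start[i] < batch[max])
                                batch[max] = start[i];
                }

                /* "process outside the lock", then advance the mark */
                int newmark = watermark;
                for (int j = 0; j < n; j++) {
                        printf("handling task started at %d\n", batch[j]);
                        if (batch[j] > newmark)
                                newmark = batch[j];
                }
                processed += n;
                watermark = newmark;
        }
        return 0;
}

Because each pass keeps only the earliest starters above the previous watermark, later passes can never revisit a handled task or skip an unhandled one, which is the forward-progress guarantee the commit relies on.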
Showing 5 changed files with 243 additions and 5 deletions
include/linux/prio_heap.h
@@ -0,0 +1,58 @@
+#ifndef _LINUX_PRIO_HEAP_H
+#define _LINUX_PRIO_HEAP_H
+
+/*
+ * Simple insertion-only static-sized priority heap containing
+ * pointers, based on CLR, chapter 7
+ */
+
+#include <linux/gfp.h>
+
+/**
+ * struct ptr_heap - simple static-sized priority heap
+ * @ptrs: pointer to data area
+ * @max: max number of elements that can be stored in @ptrs
+ * @size: current number of valid elements in @ptrs (in the range 0..@size-1)
+ * @gt: comparison operator, which should implement "greater than"
+ */
+struct ptr_heap {
+        void **ptrs;
+        int max;
+        int size;
+        int (*gt)(void *, void *);
+};
+
+/**
+ * heap_init - initialize an empty heap with a given memory size
+ * @heap: the heap structure to be initialized
+ * @size: amount of memory to use in bytes
+ * @gfp_mask: mask to pass to kmalloc()
+ * @gt: comparison operator, which should implement "greater than"
+ */
+extern int heap_init(struct ptr_heap *heap, size_t size, gfp_t gfp_mask,
+                     int (*gt)(void *, void *));
+
+/**
+ * heap_free - release a heap's storage
+ * @heap: the heap structure whose data should be released
+ */
+void heap_free(struct ptr_heap *heap);
+
+/**
+ * heap_insert - insert a value into the heap and return any overflowed value
+ * @heap: the heap to be operated on
+ * @p: the pointer to be inserted
+ *
+ * Attempts to insert the given value into the priority heap. If the
+ * heap is full prior to the insertion, then the resulting heap will
+ * consist of the smallest @max elements of the original heap and the
+ * new element; the greatest element will be removed from the heap and
+ * returned. Note that the returned element will be the new element
+ * (i.e. no change to the heap) if the new element is greater than all
+ * elements currently in the heap.
+ */
+extern void *heap_insert(struct ptr_heap *heap, void *p);
+
+
+
+#endif /* _LINUX_PRIO_HEAP_H */
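To make the heap_insert() contract concrete, here is a hedged kernel-style sketch of a hypothetical caller; gather_earliest() and newer() are illustrative names, not part of this patch. It keeps the earliest-started tasks from an array, letting the heap evict later starters once the page-sized storage is full:

#include <linux/prio_heap.h>
#include <linux/sched.h>

/* "Greater than" here means "started later"; cf. started_after()
 * in kernel/cpuset.c below. */
static int newer(void *a, void *b)
{
        struct task_struct *t1 = a, *t2 = b;

        return timespec_compare(&t1->start_time, &t2->start_time) > 0;
}

static int gather_earliest(struct task_struct **tasks, int ntasks,
                           struct ptr_heap *heap)
{
        int i, err;

        err = heap_init(heap, PAGE_SIZE, GFP_KERNEL, &newer);
        if (err)
                return err;             /* -ENOMEM */
        for (i = 0; i < ntasks; i++) {
                /* Returns NULL (stored, heap had room), tasks[i]
                 * (rejected: newer than all stored entries), or an
                 * evicted pointer (stored, a later starter fell out). */
                heap_insert(heap, tasks[i]);
        }
        /* caller walks heap->ptrs[0..heap->size-1], then heap_free() */
        return 0;
}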
kernel/cpuset.c
@@ -38,6 +38,7 @@
 #include <linux/mount.h>
 #include <linux/namei.h>
 #include <linux/pagemap.h>
+#include <linux/prio_heap.h>
 #include <linux/proc_fs.h>
 #include <linux/rcupdate.h>
 #include <linux/sched.h>
@@ -701,6 +702,36 @@
 	/* Don't kfree(doms) -- partition_sched_domains() does that. */
 }
 
+static inline int started_after_time(struct task_struct *t1,
+                                     struct timespec *time,
+                                     struct task_struct *t2)
+{
+        int start_diff = timespec_compare(&t1->start_time, time);
+        if (start_diff > 0) {
+                return 1;
+        } else if (start_diff < 0) {
+                return 0;
+        } else {
+                /*
+                 * Arbitrarily, if two processes started at the same
+                 * time, we'll say that the lower pointer value
+                 * started first. Note that t2 may have exited by now
+                 * so this may not be a valid pointer any longer, but
+                 * that's fine - it still serves to distinguish
+                 * between two tasks started (effectively)
+                 * simultaneously.
+                 */
+                return t1 > t2;
+        }
+}
+
+static inline int started_after(void *p1, void *p2)
+{
+        struct task_struct *t1 = p1;
+        struct task_struct *t2 = p2;
+        return started_after_time(t1, &t2->start_time, t2);
+}
+
 /*
  * Call with manage_mutex held. May take callback_mutex during call.
 */
@@ -708,8 +739,15 @@
 static int update_cpumask(struct cpuset *cs, char *buf)
 {
 	struct cpuset trialcs;
-	int retval;
-	int cpus_changed, is_load_balanced;
+	int retval, i;
+	int is_load_balanced;
+	struct cgroup_iter it;
+	struct cgroup *cgrp = cs->css.cgroup;
+	struct task_struct *p, *dropped;
+	/* Never dereference latest_task, since it's not refcounted */
+	struct task_struct *latest_task = NULL;
+	struct ptr_heap heap;
+	struct timespec latest_time = { 0, 0 };
 
 	/* top_cpuset.cpus_allowed tracks cpu_online_map; it's read-only */
 	if (cs == &top_cpuset)
 		return -EACCES;
@@ -736,14 +774,73 @@
 	if (retval < 0)
 		return retval;
 
-	cpus_changed = !cpus_equal(cs->cpus_allowed, trialcs.cpus_allowed);
+	/* Nothing to do if the cpus didn't change */
+	if (cpus_equal(cs->cpus_allowed, trialcs.cpus_allowed))
+		return 0;
+	retval = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, &started_after);
+	if (retval)
+		return retval;
+
 	is_load_balanced = is_sched_load_balance(&trialcs);
 
 	mutex_lock(&callback_mutex);
 	cs->cpus_allowed = trialcs.cpus_allowed;
 	mutex_unlock(&callback_mutex);
 
-	if (cpus_changed && is_load_balanced)
+ again:
+	/*
+	 * Scan tasks in the cpuset, and update the cpumasks of any
+	 * that need an update. Since we can't call set_cpus_allowed()
+	 * while holding tasklist_lock, gather tasks to be processed
+	 * in a heap structure. If the statically-sized heap fills up,
+	 * overflow tasks that started later, and in future iterations
+	 * only consider tasks that started after the latest task in
+	 * the previous pass. This guarantees forward progress and
+	 * that we don't miss any tasks.
+	 */
+	heap.size = 0;
+	cgroup_iter_start(cgrp, &it);
+	while ((p = cgroup_iter_next(cgrp, &it))) {
+		/* Only affect tasks that don't have the right cpus_allowed */
+		if (cpus_equal(p->cpus_allowed, cs->cpus_allowed))
+			continue;
+		/*
+		 * Only process tasks that started after the last task
+		 * we processed
+		 */
+		if (!started_after_time(p, &latest_time, latest_task))
+			continue;
+		dropped = heap_insert(&heap, p);
+		if (dropped == NULL) {
+			/* heap had room: p stored, take a reference */
+			get_task_struct(p);
+		} else if (dropped != p) {
+			/* p stored, a later-started task was evicted */
+			get_task_struct(p);
+			put_task_struct(dropped);
+		}
+		/* else p was rejected: newer than all stored tasks */
+	}
+	cgroup_iter_end(cgrp, &it);
+	if (heap.size) {
+		for (i = 0; i < heap.size; i++) {
+			struct task_struct *p = heap.ptrs[i];
+			if (i == 0) {
+				latest_time = p->start_time;
+				latest_task = p;
+			}
+			set_cpus_allowed(p, cs->cpus_allowed);
+			put_task_struct(p);
+		}
+		/*
+		 * If we had to process any tasks at all, scan again
+		 * in case some of them were in the middle of forking
+		 * children that didn't notice the new cpumask
+		 * restriction. Not the most efficient way to do it,
+		 * but it avoids having to take callback_mutex in the
+		 * fork path.
+		 */
+		goto again;
+	}
+	heap_free(&heap);
+	if (is_load_balanced)
 		rebuild_sched_domains();
 
 	return 0;
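The get_task_struct()/put_task_struct() pairing around heap_insert() above has exactly three cases. A userspace toy (hypothetical, not kernel code; a one-slot "heap" and an integer refcount array stand in for task_struct pinning) shows the discipline: a stored pointer holds one reference, an evicted entry drops its reference, and a rejected newcomer never takes one:

#include <assert.h>
#include <stdio.h>

#define NTASKS 4

static int refcount[NTASKS];            /* stands in for task_struct pins */

static void get_task(int t) { refcount[t]++; }
static void put_task(int t) { refcount[t]--; }

int main(void)
{
        int start[NTASKS] = { 3, 1, 4, 2 };     /* "start times" */
        int slot = -1;          /* one-slot batch keeping the earliest */

        for (int t = 0; t < NTASKS; t++) {
                if (slot < 0) {
                        slot = t;               /* stored: pin it */
                        get_task(t);
                } else if (start[t] < start[slot]) {
                        put_task(slot);         /* evicted: unpin */
                        slot = t;               /* stored: pin */
                        get_task(t);
                }
                /* else rejected: started later, never pinned */
        }

        printf("batch: task %d (started at %d)\n", slot, start[slot]);
        put_task(slot);                         /* processed: unpin */

        for (int t = 0; t < NTASKS; t++)
                assert(refcount[t] == 0);       /* nothing leaked */
        return 0;
}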
kernel/sched.c
@@ -4471,8 +4471,21 @@
 
 	cpus_allowed = cpuset_cpus_allowed(p);
 	cpus_and(new_mask, new_mask, cpus_allowed);
+ again:
 	retval = set_cpus_allowed(p, new_mask);
 
+	if (!retval) {
+		cpus_allowed = cpuset_cpus_allowed(p);
+		if (!cpus_subset(new_mask, cpus_allowed)) {
+			/*
+			 * We must have raced with a concurrent cpuset
+			 * update. Just reset the cpus_allowed to the
+			 * cpuset's cpus_allowed
+			 */
+			new_mask = cpus_allowed;
+			goto again;
+		}
+	}
 out_unlock:
 	put_task_struct(p);
 	mutex_unlock(&sched_hotcpu_mutex);
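This retry loop is the sched_setaffinity() half of the race fix; update_cpumask() above is the other half. A userspace sketch (plain uint64_t bitmasks instead of cpumask_t; setter(), updater(), and the other names are hypothetical stand-ins, not kernel APIs) shows why the post-check converges: whichever side writes the task's mask last, the result stays inside the cpuset. Build with cc -pthread:

#include <assert.h>
#include <pthread.h>
#include <stdint.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static uint64_t cpuset_allowed = 0xff;  /* the cpuset's "cpus" */
static uint64_t task_mask = 0xff;       /* the task's cpus_allowed */

static uint64_t get_allowed(void)
{
        pthread_mutex_lock(&lock);
        uint64_t m = cpuset_allowed;
        pthread_mutex_unlock(&lock);
        return m;
}

static void set_task_mask(uint64_t m)   /* plays set_cpus_allowed() */
{
        pthread_mutex_lock(&lock);
        task_mask = m;
        pthread_mutex_unlock(&lock);
}

static void *setter(void *arg)          /* plays sched_setaffinity() */
{
        uint64_t new_mask = (uint64_t)(uintptr_t)arg & get_allowed();
again:
        set_task_mask(new_mask);
        /* post-check: did a cpuset update race with us? */
        uint64_t allowed = get_allowed();
        if (new_mask & ~allowed) {
                new_mask = allowed;
                goto again;
        }
        return NULL;
}

static void *updater(void *arg)         /* plays update_cpumask() */
{
        (void)arg;
        pthread_mutex_lock(&lock);
        cpuset_allowed = 0x0f;          /* shrink the cpuset */
        if (task_mask & ~cpuset_allowed)
                task_mask = cpuset_allowed;     /* fix up the member */
        pthread_mutex_unlock(&lock);
        return NULL;
}

int main(void)
{
        pthread_t a, b;

        pthread_create(&a, NULL, setter, (void *)(uintptr_t)0x3c);
        pthread_create(&b, NULL, updater, NULL);
        pthread_join(a, NULL);
        pthread_join(b, NULL);
        /* the invariant the patch restores: task mask ⊆ cpuset mask */
        assert(!(task_mask & ~cpuset_allowed));
        printf("final task_mask %#llx\n", (unsigned long long)task_mask);
        return 0;
}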
lib/Makefile
@@ -6,7 +6,7 @@
 	 rbtree.o radix-tree.o dump_stack.o \
 	 idr.o int_sqrt.o bitmap.o extable.o prio_tree.o \
 	 sha1.o irq_regs.o reciprocal_div.o argv_split.o \
-	 proportions.o
+	 proportions.o prio_heap.o
 
 lib-$(CONFIG_MMU) += ioremap.o
 lib-$(CONFIG_SMP) += cpumask.o
lib/prio_heap.c
@@ -0,0 +1,70 @@
+/*
+ * Simple insertion-only static-sized priority heap containing
+ * pointers, based on CLR, chapter 7
+ */
+
+#include <linux/slab.h>
+#include <linux/prio_heap.h>
+
+int heap_init(struct ptr_heap *heap, size_t size, gfp_t gfp_mask,
+              int (*gt)(void *, void *))
+{
+        heap->ptrs = kmalloc(size, gfp_mask);
+        if (!heap->ptrs)
+                return -ENOMEM;
+        heap->size = 0;
+        heap->max = size / sizeof(void *);
+        heap->gt = gt;
+        return 0;
+}
+
+void heap_free(struct ptr_heap *heap)
+{
+        kfree(heap->ptrs);
+}
+
+void *heap_insert(struct ptr_heap *heap, void *p)
+{
+        void *res;
+        void **ptrs = heap->ptrs;
+        int pos;
+
+        if (heap->size < heap->max) {
+                /* Heap insertion */
+                int pos = heap->size++;
+                while (pos > 0 && heap->gt(p, ptrs[(pos-1)/2])) {
+                        ptrs[pos] = ptrs[(pos-1)/2];
+                        pos = (pos-1)/2;
+                }
+                ptrs[pos] = p;
+                return NULL;
+        }
+
+        /* The heap is full, so something will have to be dropped */
+
+        /* If the new pointer is greater than the current max, drop it */
+        if (heap->gt(p, ptrs[0]))
+                return p;
+
+        /* Replace the current max and heapify */
+        res = ptrs[0];
+        ptrs[0] = p;
+        pos = 0;
+
+        while (1) {
+                int left = 2 * pos + 1;
+                int right = 2 * pos + 2;
+                int largest = pos;
+                if (left < heap->size && heap->gt(ptrs[left], p))
+                        largest = left;
+                if (right < heap->size && heap->gt(ptrs[right], ptrs[largest]))
+                        largest = right;
+                if (largest == pos)
+                        break;
+                /* Push p down the heap one level and bump one up */
+                ptrs[pos] = ptrs[largest];
+                ptrs[largest] = p;
+                pos = largest;
+        }
+        return res;
+}
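The algorithm above is easy to exercise outside the kernel. This is a hedged userspace port (assumption: the same heap_insert() logic with the kernel types and kmalloc() dropped) that keeps the 4 smallest values of a stream; the displaced maxima come back out as 9, 8, 7:

#include <stdio.h>

struct ptr_heap {
        void **ptrs;
        int max;
        int size;
        int (*gt)(void *, void *);
};

static int int_gt(void *a, void *b)
{
        return *(int *)a > *(int *)b;
}

/* same algorithm as heap_insert() above */
static void *heap_insert(struct ptr_heap *heap, void *p)
{
        void *res;
        void **ptrs = heap->ptrs;
        int pos;

        if (heap->size < heap->max) {
                pos = heap->size++;
                while (pos > 0 && heap->gt(p, ptrs[(pos-1)/2])) {
                        ptrs[pos] = ptrs[(pos-1)/2];
                        pos = (pos-1)/2;
                }
                ptrs[pos] = p;
                return NULL;
        }
        if (heap->gt(p, ptrs[0]))
                return p;               /* newer than everything kept */
        res = ptrs[0];
        ptrs[0] = p;
        pos = 0;
        while (1) {
                int left = 2 * pos + 1, right = 2 * pos + 2, largest = pos;
                if (left < heap->size && heap->gt(ptrs[left], p))
                        largest = left;
                if (right < heap->size && heap->gt(ptrs[right], ptrs[largest]))
                        largest = right;
                if (largest == pos)
                        break;
                ptrs[pos] = ptrs[largest];
                ptrs[largest] = p;
                pos = largest;
        }
        return res;                     /* the displaced maximum */
}

int main(void)
{
        int vals[] = { 7, 3, 9, 1, 8, 2, 6 };
        void *slots[4];
        struct ptr_heap h = { slots, 4, 0, int_gt };

        for (int i = 0; i < 7; i++) {
                void *dropped = heap_insert(&h, &vals[i]);
                if (dropped)
                        printf("dropped %d\n", *(int *)dropped); /* 9, 8, 7 */
        }
        for (int i = 0; i < h.size; i++)
                printf("kept %d\n", *(int *)h.ptrs[i]); /* 4 smallest, heap order */
        return 0;
}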