Commit 8707d8b8c0cbdf4441507f8dded194167da896c7

Authored by Paul Menage
Committed by Linus Torvalds
Parent: 020958b627

Fix cpusets update_cpumask

Cause writes to cpuset "cpus" file to update cpus_allowed for member tasks:

- collect batches of tasks under tasklist_lock and then call
  set_cpus_allowed() on them outside the lock (since this can sleep).

- add a simple generic priority heap type to allow efficient collection
  of batches of tasks to be processed without duplicating or missing any
  tasks in subsequent batches.

- make "cpus" file update a no-op if the mask hasn't changed.

- fix race between update_cpumask() and sched_setaffinity() by making
  sched_setaffinity() post-check that it's not running on any cpus outside
  cpuset_cpus_allowed().

[akpm@linux-foundation.org: coding-style fixes]
Signed-off-by: Paul Menage <menage@google.com>
Cc: Paul Jackson <pj@sgi.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: Cedric Le Goater <clg@fr.ibm.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Serge Hallyn <serue@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

Showing 5 changed files with 243 additions and 5 deletions

include/linux/prio_heap.h
+#ifndef _LINUX_PRIO_HEAP_H
+#define _LINUX_PRIO_HEAP_H
+
+/*
+ * Simple insertion-only static-sized priority heap containing
+ * pointers, based on CLR, chapter 7
+ */
+
+#include <linux/gfp.h>
+
+/**
+ * struct ptr_heap - simple static-sized priority heap
+ * @ptrs: pointer to the data area
+ * @max: maximum number of elements that can be stored in @ptrs
+ * @size: current number of valid elements in @ptrs (in the range 0..@size-1)
+ * @gt: comparison operator, which should implement "greater than"
+ */
+struct ptr_heap {
+        void **ptrs;
+        int max;
+        int size;
+        int (*gt)(void *, void *);
+};
+
+/**
+ * heap_init - initialize an empty heap with a given memory size
+ * @heap: the heap structure to be initialized
+ * @size: amount of memory to use in bytes
+ * @gfp_mask: mask to pass to kmalloc()
+ * @gt: comparison operator, which should implement "greater than"
+ */
+extern int heap_init(struct ptr_heap *heap, size_t size, gfp_t gfp_mask,
+                     int (*gt)(void *, void *));
+
+/**
+ * heap_free - release a heap's storage
+ * @heap: the heap structure whose data should be released
+ */
+void heap_free(struct ptr_heap *heap);
+
+/**
+ * heap_insert - insert a value into the heap and return any overflowed value
+ * @heap: the heap to be operated on
+ * @p: the pointer to be inserted
+ *
+ * Attempts to insert the given value into the priority heap. If the
+ * heap is full prior to the insertion, then the resulting heap will
+ * consist of the smallest @max elements of the original heap and the
+ * new element; the greatest element will be removed from the heap and
+ * returned. Note that the returned element will be the new element
+ * (i.e. no change to the heap) if the new element is greater than all
+ * elements currently in the heap.
+ */
+extern void *heap_insert(struct ptr_heap *heap, void *p);
+
+#endif /* _LINUX_PRIO_HEAP_H */
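
As a quick orientation for the API above (not part of the commit): a hypothetical caller that keeps the four smallest values from a stream might look like the following sketch. The names prio_heap_demo and int_gt are made up for illustration.

#include <linux/kernel.h>
#include <linux/prio_heap.h>

/* Comparator implementing "greater than" on the pointed-to ints */
static int int_gt(void *a, void *b)
{
        return *(int *)a > *(int *)b;
}

static int prio_heap_demo(void)
{
        static int vals[] = { 9, 3, 7, 1, 8, 2, 6 };
        struct ptr_heap heap;
        int i, ret;

        /* Room for four pointers; max becomes size / sizeof(void *) */
        ret = heap_init(&heap, 4 * sizeof(void *), GFP_KERNEL, &int_gt);
        if (ret)
                return ret;

        for (i = 0; i < ARRAY_SIZE(vals); i++) {
                /* NULL, the dropped old maximum, or &vals[i] itself */
                void *dropped = heap_insert(&heap, &vals[i]);
                if (dropped)
                        printk(KERN_INFO "dropped %d\n", *(int *)dropped);
        }

        /* heap.ptrs[0..heap.size-1] now point at 1, 2, 3 and 6 */
        heap_free(&heap);
        return 0;
}

Running through this by hand, the heap fills with 9, 3, 7, 1, then the inserts of 8, 2 and 6 each evict the current maximum, printing "dropped 9", "dropped 8" and "dropped 7".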
kernel/cpuset.c
@@ -38,6 +38,7 @@
 #include <linux/mount.h>
 #include <linux/namei.h>
 #include <linux/pagemap.h>
+#include <linux/prio_heap.h>
 #include <linux/proc_fs.h>
 #include <linux/rcupdate.h>
 #include <linux/sched.h>
@@ -701,6 +702,36 @@
         /* Don't kfree(doms) -- partition_sched_domains() does that. */
 }
 
+static inline int started_after_time(struct task_struct *t1,
+                                     struct timespec *time,
+                                     struct task_struct *t2)
+{
+        int start_diff = timespec_compare(&t1->start_time, time);
+        if (start_diff > 0) {
+                return 1;
+        } else if (start_diff < 0) {
+                return 0;
+        } else {
+                /*
+                 * Arbitrarily, if two processes started at the same
+                 * time, we'll say that the lower pointer value
+                 * started first. Note that t2 may have exited by now
+                 * so this may not be a valid pointer any longer, but
+                 * that's fine - it still serves to distinguish
+                 * between two tasks that started (effectively)
+                 * simultaneously.
+                 */
+                return t1 > t2;
+        }
+}
+
+static inline int started_after(void *p1, void *p2)
+{
+        struct task_struct *t1 = p1;
+        struct task_struct *t2 = p2;
+        return started_after_time(t1, &t2->start_time, t2);
+}
+
 /*
  * Call with manage_mutex held. May take callback_mutex during call.
  */
@@ -708,8 +739,15 @@
 static int update_cpumask(struct cpuset *cs, char *buf)
 {
         struct cpuset trialcs;
-        int retval;
-        int cpus_changed, is_load_balanced;
+        int retval, i;
+        int is_load_balanced;
+        struct cgroup_iter it;
+        struct cgroup *cgrp = cs->css.cgroup;
+        struct task_struct *p, *dropped;
+        /* Never dereference latest_task, since it's not refcounted */
+        struct task_struct *latest_task = NULL;
+        struct ptr_heap heap;
+        struct timespec latest_time = { 0, 0 };
 
         /* top_cpuset.cpus_allowed tracks cpu_online_map; it's read-only */
         if (cs == &top_cpuset)
                 return -EACCES;
@@ -736,14 +774,73 @@
         if (retval < 0)
                 return retval;
 
-        cpus_changed = !cpus_equal(cs->cpus_allowed, trialcs.cpus_allowed);
+        /* Nothing to do if the cpus didn't change */
+        if (cpus_equal(cs->cpus_allowed, trialcs.cpus_allowed))
+                return 0;
+        retval = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, &started_after);
+        if (retval)
+                return retval;
+
         is_load_balanced = is_sched_load_balance(&trialcs);
 
         mutex_lock(&callback_mutex);
         cs->cpus_allowed = trialcs.cpus_allowed;
         mutex_unlock(&callback_mutex);
 
-        if (cpus_changed && is_load_balanced)
+ again:
+        /*
+         * Scan tasks in the cpuset, and update the cpumasks of any
+         * that need an update. Since we can't call set_cpus_allowed()
+         * while holding tasklist_lock, gather tasks to be processed
+         * in a heap structure. If the statically-sized heap fills up,
+         * overflow tasks that started later, and in future iterations
+         * only consider tasks that started after the latest task in
+         * the previous pass. This guarantees forward progress and
+         * that we don't miss any tasks.
+         */
+        heap.size = 0;
+        cgroup_iter_start(cgrp, &it);
+        while ((p = cgroup_iter_next(cgrp, &it))) {
+                /* Only affect tasks that don't have the right cpus_allowed */
+                if (cpus_equal(p->cpus_allowed, cs->cpus_allowed))
+                        continue;
+                /*
+                 * Only process tasks that started after the last task
+                 * we processed.
+                 */
+                if (!started_after_time(p, &latest_time, latest_task))
+                        continue;
+                dropped = heap_insert(&heap, p);
+                if (dropped == NULL) {
+                        /* The new task was inserted into the heap */
+                        get_task_struct(p);
+                } else if (dropped != p) {
+                        /* The new task pushed out a different task */
+                        get_task_struct(p);
+                        put_task_struct(dropped);
+                }
+        }
+        cgroup_iter_end(cgrp, &it);
+        if (heap.size) {
+                for (i = 0; i < heap.size; i++) {
+                        struct task_struct *p = heap.ptrs[i];
+                        if (i == 0) {
+                                /* ptrs[0] is the heap root: the latest starter */
+                                latest_time = p->start_time;
+                                latest_task = p;
+                        }
+                        set_cpus_allowed(p, cs->cpus_allowed);
+                        put_task_struct(p);
+                }
+                /*
+                 * If we had to process any tasks at all, scan again
+                 * in case some of them were in the middle of forking
+                 * children that didn't notice the new cpumask
+                 * restriction. Not the most efficient way to do it,
+                 * but it avoids having to take callback_mutex in the
+                 * fork path.
+                 */
+                goto again;
+        }
+        heap_free(&heap);
+        if (is_load_balanced)
                 rebuild_sched_domains();
 
         return 0;
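
Stepping back from the diff: the invariant that makes the rescan loop above safe is that each pass processes only tasks that started strictly after the latest-started task of the previous pass, so no task is processed twice within a scan and none is skipped. Here is a self-contained userspace sketch of that invariant, with plain integers standing in for task start times and a capacity-2 array standing in for the heap; all names are illustrative, not kernel code.

#include <stdio.h>

#define NTASKS 5
#define BATCH  2

int main(void)
{
        /* "Tasks", identified only by their start times */
        int start[NTASKS] = { 40, 10, 30, 20, 50 };
        int latest = -1;   /* latest start time already processed */

        for (;;) {
                int batch[BATCH], n = 0, i, j;

                /* "Under the lock": keep the BATCH earliest starters
                 * among tasks newer than everything already done. */
                for (i = 0; i < NTASKS; i++) {
                        if (start[i] <= latest)
                                continue;
                        if (n < BATCH) {
                                batch[n++] = start[i];
                        } else {
                                /* overflow the later starter */
                                int mx = batch[0] > batch[1] ? 0 : 1;
                                if (start[i] < batch[mx])
                                        batch[mx] = start[i];
                        }
                }
                if (n == 0)
                        break;   /* every task was processed exactly once */

                /* "Outside the lock": process the batch, oldest first */
                for (i = 0; i < n; i++) {
                        for (j = i + 1; j < n; j++) {
                                if (batch[j] < batch[i]) {
                                        int t = batch[i];
                                        batch[i] = batch[j];
                                        batch[j] = t;
                                }
                        }
                        printf("processing task started at %d\n", batch[i]);
                }
                latest = batch[n - 1];   /* next pass: strictly later only */
        }
        return 0;
}

With these values the passes process {10, 20}, then {30, 40}, then {50}, and the fourth pass finds nothing left, mirroring how update_cpumask() terminates when a scan collects an empty heap.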
kernel/sched.c
@@ -4471,8 +4471,21 @@
 
         cpus_allowed = cpuset_cpus_allowed(p);
         cpus_and(new_mask, new_mask, cpus_allowed);
+ again:
         retval = set_cpus_allowed(p, new_mask);
 
+        if (!retval) {
+                cpus_allowed = cpuset_cpus_allowed(p);
+                if (!cpus_subset(new_mask, cpus_allowed)) {
+                        /*
+                         * We must have raced with a concurrent cpuset
+                         * update. Just reset the cpus_allowed to the
+                         * cpuset's cpus_allowed.
+                         */
+                        new_mask = cpus_allowed;
+                        goto again;
+                }
+        }
 out_unlock:
         put_task_struct(p);
         mutex_unlock(&sched_hotcpu_mutex);
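
The sched.c change is an instance of an optimistic check-after-apply pattern: apply the new mask, re-read the constraint, and retry with the tighter mask if a concurrent cpuset update invalidated the work. A minimal userspace analogue follows, with a plain bitmask standing in for cpumask_t; set_affinity, allowed and applied are hypothetical names, not kernel APIs.

#include <stdatomic.h>
#include <stdio.h>

/* The constraint that another thread may tighten at any time */
static _Atomic unsigned int allowed = 0x0f;

static unsigned int applied;   /* the task's current "cpus_allowed" */

static void set_affinity(unsigned int requested)
{
        unsigned int mask = requested & atomic_load(&allowed);

        for (;;) {
                unsigned int now;

                applied = mask;              /* "set_cpus_allowed()" */
                now = atomic_load(&allowed);
                if ((applied & ~now) == 0)   /* still a subset: done */
                        break;
                mask = now;                  /* raced; retry with the update */
        }
}

int main(void)
{
        set_affinity(0x06);
        printf("applied mask: 0x%x\n", applied);
        return 0;
}

The loop terminates because each retry adopts the constraint it just observed; it only repeats if yet another update lands in between, exactly as in the sched_setaffinity() post-check.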
lib/Makefile
@@ -6,7 +6,7 @@
          rbtree.o radix-tree.o dump_stack.o \
          idr.o int_sqrt.o bitmap.o extable.o prio_tree.o \
          sha1.o irq_regs.o reciprocal_div.o argv_split.o \
-         proportions.o
+         proportions.o prio_heap.o
 
 lib-$(CONFIG_MMU) += ioremap.o
 lib-$(CONFIG_SMP) += cpumask.o
lib/prio_heap.c
+/*
+ * Simple insertion-only static-sized priority heap containing
+ * pointers, based on CLR, chapter 7
+ */
+
+#include <linux/slab.h>
+#include <linux/prio_heap.h>
+
+int heap_init(struct ptr_heap *heap, size_t size, gfp_t gfp_mask,
+              int (*gt)(void *, void *))
+{
+        heap->ptrs = kmalloc(size, gfp_mask);
+        if (!heap->ptrs)
+                return -ENOMEM;
+        heap->size = 0;
+        heap->max = size / sizeof(void *);
+        heap->gt = gt;
+        return 0;
+}
+
+void heap_free(struct ptr_heap *heap)
+{
+        kfree(heap->ptrs);
+}
+
+void *heap_insert(struct ptr_heap *heap, void *p)
+{
+        void *res;
+        void **ptrs = heap->ptrs;
+        int pos;
+
+        if (heap->size < heap->max) {
+                /* Heap insertion: sift the new element up from the bottom */
+                pos = heap->size++;
+                while (pos > 0 && heap->gt(p, ptrs[(pos-1)/2])) {
+                        ptrs[pos] = ptrs[(pos-1)/2];
+                        pos = (pos-1)/2;
+                }
+                ptrs[pos] = p;
+                return NULL;
+        }
+
+        /* The heap is full, so something will have to be dropped */
+
+        /* If the new pointer is greater than the current max, drop it */
+        if (heap->gt(p, ptrs[0]))
+                return p;
+
+        /* Replace the current max and heapify */
+        res = ptrs[0];
+        ptrs[0] = p;
+        pos = 0;
+
+        while (1) {
+                int left = 2 * pos + 1;
+                int right = 2 * pos + 2;
+                int largest = pos;
+                if (left < heap->size && heap->gt(ptrs[left], p))
+                        largest = left;
+                if (right < heap->size && heap->gt(ptrs[right], ptrs[largest]))
+                        largest = right;
+                if (largest == pos)
+                        break;
+                /* Push p down the heap one level and bump one up */
+                ptrs[pos] = ptrs[largest];
+                ptrs[largest] = p;
+                pos = largest;
+        }
+        return res;
+}
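
To make heap_insert()'s return contract concrete, here is a short worked trace of the code above (capacity 3, plain integers, illustrative values only):

insert 5, 2, 8 in turn:  the heap fills via sift-up, ptrs = {8, 2, 5};
                         each call returns NULL
insert 9:  gt(9, ptrs[0] = 8) holds, so the heap is left unchanged and
           9 itself is returned (it is greater than everything held)
insert 1:  the old max 8 is replaced by 1 at the root; sift-down swaps
           1 with its larger child 5, giving ptrs = {5, 2, 1}; returns 8

After all five inserts the heap holds {1, 2, 5}, the three smallest values seen, which is exactly the property update_cpumask() relies on to batch the earliest-started tasks first.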