Commit 4225399a66b315d4d1fb1cb61b75dda201c832e3

Authored by Paul Jackson
Committed by Linus Torvalds
1 parent 202f72d5d1

[PATCH] cpuset: rebind vma mempolicies fix

Fix more of a longstanding bug in the cpuset/mempolicy interaction.

NUMA mempolicies (mm/mempolicy.c) are constrained by the current task's cpuset
to just the Memory Nodes allowed by that cpuset.  The kernel maintains
internal state for each mempolicy, tracking what nodes are used for the
MPOL_INTERLEAVE, MPOL_BIND or MPOL_PREFERRED policies.

When a task's cpuset memory placement changes, whether because the cpuset
changed, or because the task was attached to a different cpuset, then the
task's mempolicies have to be rebound to the new cpuset placement, so as to
preserve the cpuset-relative numbering of the nodes in that policy.

An earlier fix handled such mempolicy rebinding for mempolicies attached to a
task.

This fix rebinds mempolicies attached to vma's (address ranges in a task's
address space).  Due to the need to hold the task->mm->mmap_sem semaphore while
updating vma's, the rebinding of vma mempolicies has to be done when the
cpuset memory placement is changed, at which time mmap_sem can be safely
acquired.  The task's mempolicy is rebound later, when the task next attempts
to allocate memory and notices that its task->cpuset_mems_generation is
out-of-date with its cpuset's mems_generation.

Because walking the tasklist to find all tasks attached to a changing cpuset
requires holding tasklist_lock, a spinlock, one cannot update the vma's of the
affected tasks while doing the tasklist scan.  In general, one cannot acquire
a semaphore (which can sleep) while already holding a spinlock (such as
tasklist_lock).  So a list of mm references has to be built up during the
tasklist scan, then the tasklist lock dropped, then for each mm, its mmap_sem
acquired, and the vma's in that mm rebound.

Once the tasklist lock is dropped, affected tasks may fork new tasks, before
their mm's are rebound.  A kernel global 'cpuset_being_rebound' is set to
point to the cpuset being rebound (there can only be one; cpuset modifications
are done under a global 'manage_sem' semaphore), and the mpol_copy code that
is used to copy a task's mempolicies during fork catches such forking tasks,
and ensures their children are also rebound.

When a task is moved to a different cpuset, it is easier, as there is only one
task involved.  Its mm->vma's are scanned, using the same
mpol_rebind_policy() as used above.

It may happen that both the mpol_copy hook and the update done via the
tasklist scan update the same mm twice.  This is ok, as the mempolicies of
each vma in an mm keep track of what mems_allowed they are relative to, and
safely no-op a second request to rebind to the same nodes.

Signed-off-by: Paul Jackson <pj@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

Showing 3 changed files with 137 additions and 0 deletions Side-by-side Diff

include/linux/mempolicy.h
... ... @@ -150,6 +150,16 @@
150 150 extern void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *new);
151 151 extern void mpol_rebind_task(struct task_struct *tsk,
152 152 const nodemask_t *new);
  153 +extern void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new);
  154 +#define set_cpuset_being_rebound(x) (cpuset_being_rebound = (x))
  155 +
  156 +#ifdef CONFIG_CPUSET
  157 +#define current_cpuset_is_being_rebound() \
  158 + (cpuset_being_rebound == current->cpuset)
  159 +#else
  160 +#define current_cpuset_is_being_rebound() 0
  161 +#endif
  162 +
153 163 extern struct mempolicy default_policy;
154 164 extern struct zonelist *huge_zonelist(struct vm_area_struct *vma,
155 165 unsigned long addr);
... ... @@ -165,6 +175,8 @@
165 175 int do_migrate_pages(struct mm_struct *mm,
166 176 const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags);
167 177  
  178 +extern void *cpuset_being_rebound; /* Trigger mpol_copy vma rebind */
  179 +
168 180 #else
169 181  
170 182 struct mempolicy {};
... ... @@ -233,6 +245,12 @@
233 245 const nodemask_t *new)
234 246 {
235 247 }
  248 +
  249 +static inline void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
  250 +{
  251 +}
  252 +
  253 +#define set_cpuset_being_rebound(x) do {} while (0)
236 254  
237 255 static inline struct zonelist *huge_zonelist(struct vm_area_struct *vma,
238 256 unsigned long addr)
... ... @@ -812,12 +812,24 @@
812 812 }
813 813  
814 814 /*
  815 + * Handle user request to change the 'mems' memory placement
  816 + * of a cpuset. Needs to validate the request, update the
  817 + * cpusets mems_allowed and mems_generation, and for each
  818 + * task in the cpuset, rebind any vma mempolicies.
  819 + *
815 820 * Call with manage_sem held. May take callback_sem during call.
  821 + * Will take tasklist_lock, scan tasklist for tasks in cpuset cs,
  822 + * lock each such tasks mm->mmap_sem, scan its vma's and rebind
  823 + * their mempolicies to the cpusets new mems_allowed.
816 824 */
817 825  
818 826 static int update_nodemask(struct cpuset *cs, char *buf)
819 827 {
820 828 struct cpuset trialcs;
  829 + struct task_struct *g, *p;
  830 + struct mm_struct **mmarray;
  831 + int i, n, ntasks;
  832 + int fudge;
821 833 int retval;
822 834  
823 835 trialcs = *cs;
... ... @@ -839,6 +851,76 @@
839 851 cs->mems_generation = atomic_read(&cpuset_mems_generation);
840 852 up(&callback_sem);
841 853  
  854 + set_cpuset_being_rebound(cs); /* causes mpol_copy() rebind */
  855 +
  856 + fudge = 10; /* spare mmarray[] slots */
  857 + fudge += cpus_weight(cs->cpus_allowed); /* imagine one fork-bomb/cpu */
  858 + retval = -ENOMEM;
  859 +
  860 + /*
  861 + * Allocate mmarray[] to hold mm reference for each task
  862 + * in cpuset cs. Can't kmalloc GFP_KERNEL while holding
  863 + * tasklist_lock. We could use GFP_ATOMIC, but with a
  864 + * few more lines of code, we can retry until we get a big
  865 + * enough mmarray[] w/o using GFP_ATOMIC.
  866 + */
  867 + while (1) {
  868 + ntasks = atomic_read(&cs->count); /* guess */
  869 + ntasks += fudge;
  870 + mmarray = kmalloc(ntasks * sizeof(*mmarray), GFP_KERNEL);
  871 + if (!mmarray)
  872 + goto done;
  873 + write_lock_irq(&tasklist_lock); /* block fork */
  874 + if (atomic_read(&cs->count) <= ntasks)
  875 + break; /* got enough */
  876 + write_unlock_irq(&tasklist_lock); /* try again */
  877 + kfree(mmarray);
  878 + }
  879 +
  880 + n = 0;
  881 +
  882 + /* Load up mmarray[] with mm reference for each task in cpuset. */
  883 + do_each_thread(g, p) {
  884 + struct mm_struct *mm;
  885 +
  886 + if (n >= ntasks) {
  887 + printk(KERN_WARNING
  888 + "Cpuset mempolicy rebind incomplete.\n");
  889 + continue;
  890 + }
  891 + if (p->cpuset != cs)
  892 + continue;
  893 + mm = get_task_mm(p);
  894 + if (!mm)
  895 + continue;
  896 + mmarray[n++] = mm;
  897 + } while_each_thread(g, p);
  898 + write_unlock_irq(&tasklist_lock);
  899 +
  900 + /*
  901 + * Now that we've dropped the tasklist spinlock, we can
  902 + * rebind the vma mempolicies of each mm in mmarray[] to their
  903 + * new cpuset, and release that mm. The mpol_rebind_mm()
  904 + * call takes mmap_sem, which we couldn't take while holding
  905 + * tasklist_lock. Forks can happen again now - the mpol_copy()
  906 + * cpuset_being_rebound check will catch such forks, and rebind
  907 + * their vma mempolicies too. Because we still hold the global
  908 + * cpuset manage_sem, we know that no other rebind effort will
  909 + * be contending for the global variable cpuset_being_rebound.
  910 + * It's ok if we rebind the same mm twice; mpol_rebind_mm()
  911 + * is idempotent.
  912 + */
  913 + for (i = 0; i < n; i++) {
  914 + struct mm_struct *mm = mmarray[i];
  915 +
  916 + mpol_rebind_mm(mm, &cs->mems_allowed);
  917 + mmput(mm);
  918 + }
  919 +
  920 + /* We're done rebinding vma's to this cpusets new mems_allowed. */
  921 + kfree(mmarray);
  922 + set_cpuset_being_rebound(NULL);
  923 + retval = 0;
842 924 done:
843 925 return retval;
844 926 }
... ... @@ -1011,6 +1093,7 @@
1011 1093 struct cpuset *oldcs;
1012 1094 cpumask_t cpus;
1013 1095 nodemask_t from, to;
  1096 + struct mm_struct *mm;
1014 1097  
1015 1098 if (sscanf(pidbuf, "%d", &pid) != 1)
1016 1099 return -EIO;
... ... @@ -1060,6 +1143,13 @@
1060 1143 to = cs->mems_allowed;
1061 1144  
1062 1145 up(&callback_sem);
  1146 +
  1147 + mm = get_task_mm(tsk);
  1148 + if (mm) {
  1149 + mpol_rebind_mm(mm, &to);
  1150 + mmput(mm);
  1151 + }
  1152 +
1063 1153 if (is_memory_migrate(cs))
1064 1154 do_migrate_pages(tsk->mm, &from, &to, MPOL_MF_MOVE_ALL);
1065 1155 put_task_struct(tsk);
... ... @@ -1131,6 +1131,15 @@
1131 1131 }
1132 1132 EXPORT_SYMBOL(alloc_pages_current);
1133 1133  
  1134 +/*
  1135 + * If mpol_copy() sees current->cpuset == cpuset_being_rebound, then it
  1136 + * rebinds the mempolicy its copying by calling mpol_rebind_policy()
  1137 + * with the mems_allowed returned by cpuset_mems_allowed(). This
  1138 + * keeps mempolicies cpuset relative after its cpuset moves. See
  1139 + * further kernel/cpuset.c update_nodemask().
  1140 + */
  1141 +void *cpuset_being_rebound;
  1142 +
1134 1143 /* Slow path of a mempolicy copy */
1135 1144 struct mempolicy *__mpol_copy(struct mempolicy *old)
1136 1145 {
... ... @@ -1138,6 +1147,10 @@
1138 1147  
1139 1148 if (!new)
1140 1149 return ERR_PTR(-ENOMEM);
  1150 + if (current_cpuset_is_being_rebound()) {
  1151 + nodemask_t mems = cpuset_mems_allowed(current);
  1152 + mpol_rebind_policy(old, &mems);
  1153 + }
1141 1154 *new = *old;
1142 1155 atomic_set(&new->refcnt, 1);
1143 1156 if (new->policy == MPOL_BIND) {
... ... @@ -1478,6 +1491,22 @@
1478 1491 void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new)
1479 1492 {
1480 1493 mpol_rebind_policy(tsk->mempolicy, new);
  1494 +}
  1495 +
  1496 +/*
  1497 + * Rebind each vma in mm to new nodemask.
  1498 + *
  1499 + * Call holding a reference to mm. Takes mm->mmap_sem during call.
  1500 + */
  1501 +
  1502 +void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
  1503 +{
  1504 + struct vm_area_struct *vma;
  1505 +
  1506 + down_write(&mm->mmap_sem);
  1507 + for (vma = mm->mmap; vma; vma = vma->vm_next)
  1508 + mpol_rebind_policy(vma->vm_policy, new);
  1509 + up_write(&mm->mmap_sem);
1481 1510 }
1482 1511  
1483 1512 /*