Commit 58568d2a8215cb6f55caf2332017d7bdff954e1c

Authored by Miao Xie
Committed by Linus Torvalds
1 parent 950592f7b9

cpuset,mm: update tasks' mems_allowed in time

Fix page cache and slab objects being allocated on disallowed nodes when
memory spread is set, by updating tasks' mems_allowed promptly after their
cpuset's mems is changed.

In order to update tasks' mems_allowed in time, we must modify the memory
policy code, because memory policy was originally applied only in the
process's own context.  After this patch, one task can directly manipulate
another task's mems_allowed, so we use alloc_lock in the task_struct to
protect the task's mems_allowed and memory policy.

In the fast path, however, we do not take the lock to protect them, because
adding a lock there could cause a performance regression.  Without the lock,
a task might momentarily see an empty nodemask while its cpuset's
mems_allowed is changed to a non-overlapping set.  To avoid this, the update
first sets all newly allowed nodes and only then clears the newly disallowed
ones, as the helper quoted below does.
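
For reference, this is the helper added below in kernel/cpuset.c that
performs the two-step update; it is called with the task's alloc_lock held
(task_lock()):

	static void cpuset_change_task_nodemask(struct task_struct *tsk,
						nodemask_t *newmems)
	{
		/* first grow mems_allowed to the union of old and new,
		 * so it is never observed empty */
		nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems);
		mpol_rebind_task(tsk, &tsk->mems_allowed);
		/* then rebind to, and shrink down to, exactly the new mask */
		mpol_rebind_task(tsk, newmems);
		tsk->mems_allowed = *newmems;
	}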

[lee.schermerhorn@hp.com:
  The rework of mpol_new() to extract the adjustment of the node mask
  (which applies cpuset and mpol-flags "context") breaks set_mempolicy()
  and mbind() with MPOL_PREFERRED and a NULL nodemask--i.e., explicit
  local allocation.  Fix this by adding the check for MPOL_PREFERRED and
  an empty node mask to mpol_new_mempolicy().

  Remove the now unneeded 'nodes = NULL' from mpol_new().

  Note that mpol_new_mempolicy() is always called with a non-NULL
  'nodes' parameter now that it has been removed from mpol_new().
  Therefore, we don't need to test nodes for NULL before testing it for
  'empty'.  However, just to be extra paranoid, add a VM_BUG_ON() to
  verify this assumption.]
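
  The check, as it appears in the renamed mpol_set_nodemask() in the
  mm/mempolicy.c hunk below:

	VM_BUG_ON(!nodes);
	if (pol->mode == MPOL_PREFERRED && nodes_empty(*nodes))
		nodes = NULL;	/* explicit local allocation */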
[lee.schermerhorn@hp.com:
  I don't think the function name 'mpol_new_mempolicy' is descriptive
  enough to differentiate it from mpol_new().

  This function applies cpuset 'context', usually constraining the nodes
  to those allowed by the cpuset.  However, when the MPOL_F_RELATIVE_NODES
  flag is set, it also translates the nodes.  So I settled on
  'mpol_set_nodemask()', because the comment block for mpol_new() mentions
  that we need to call this function to "set nodes".

  Some additional minor line length, whitespace and typo cleanup.]
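
  With this split, callers allocate a policy with mpol_new() and then apply
  cpuset context separately under task_lock(); do_set_mempolicy() in the
  mm/mempolicy.c hunk below now does, in outline:

	new = mpol_new(mode, flags, nodes);
	...
	task_lock(current);
	ret = mpol_set_nodemask(new, nodes);
	task_unlock(current);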
Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Christoph Lameter <cl@linux-foundation.org>
Cc: Paul Menage <menage@google.com>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Cc: Yasunori Goto <y-goto@jp.fujitsu.com>
Cc: Pekka Enberg <penberg@cs.helsinki.fi>
Cc: David Rientjes <rientjes@google.com>
Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

Showing 7 changed files with 170 additions and 191 deletions

include/linux/cpuset.h
... ... @@ -18,7 +18,6 @@
18 18  
19 19 extern int number_of_cpusets; /* How many cpusets are defined in system? */
20 20  
21   -extern int cpuset_init_early(void);
22 21 extern int cpuset_init(void);
23 22 extern void cpuset_init_smp(void);
24 23 extern void cpuset_cpus_allowed(struct task_struct *p, struct cpumask *mask);
... ... @@ -27,7 +26,6 @@
27 26 extern nodemask_t cpuset_mems_allowed(struct task_struct *p);
28 27 #define cpuset_current_mems_allowed (current->mems_allowed)
29 28 void cpuset_init_current_mems_allowed(void);
30   -void cpuset_update_task_memory_state(void);
31 29 int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask);
32 30  
33 31 extern int __cpuset_node_allowed_softwall(int node, gfp_t gfp_mask);
34 32  
... ... @@ -92,9 +90,13 @@
92 90  
93 91 extern void cpuset_print_task_mems_allowed(struct task_struct *p);
94 92  
  93 +static inline void set_mems_allowed(nodemask_t nodemask)
  94 +{
  95 + current->mems_allowed = nodemask;
  96 +}
  97 +
95 98 #else /* !CONFIG_CPUSETS */
96 99  
97   -static inline int cpuset_init_early(void) { return 0; }
98 100 static inline int cpuset_init(void) { return 0; }
99 101 static inline void cpuset_init_smp(void) {}
100 102  
... ... @@ -116,7 +118,6 @@
116 118  
117 119 #define cpuset_current_mems_allowed (node_states[N_HIGH_MEMORY])
118 120 static inline void cpuset_init_current_mems_allowed(void) {}
119   -static inline void cpuset_update_task_memory_state(void) {}
120 121  
121 122 static inline int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask)
122 123 {
... ... @@ -185,6 +186,10 @@
185 186 }
186 187  
187 188 static inline void cpuset_print_task_mems_allowed(struct task_struct *p)
  189 +{
  190 +}
  191 +
  192 +static inline void set_mems_allowed(nodemask_t nodemask)
188 193 {
189 194 }
190 195  
include/linux/sched.h
... ... @@ -1318,7 +1318,8 @@
1318 1318 /* Thread group tracking */
1319 1319 u32 parent_exec_id;
1320 1320 u32 self_exec_id;
1321   -/* Protection of (de-)allocation: mm, files, fs, tty, keyrings */
  1321 +/* Protection of (de-)allocation: mm, files, fs, tty, keyrings, mems_allowed,
  1322 + * mempolicy */
1322 1323 spinlock_t alloc_lock;
1323 1324  
1324 1325 #ifdef CONFIG_GENERIC_HARDIRQS
... ... @@ -1386,8 +1387,7 @@
1386 1387 cputime_t acct_timexpd; /* stime + utime since last update */
1387 1388 #endif
1388 1389 #ifdef CONFIG_CPUSETS
1389   - nodemask_t mems_allowed;
1390   - int cpuset_mems_generation;
  1390 + nodemask_t mems_allowed; /* Protected by alloc_lock */
1391 1391 int cpuset_mem_spread_rotor;
1392 1392 #endif
1393 1393 #ifdef CONFIG_CGROUPS
... ... @@ -1410,7 +1410,7 @@
1410 1410 struct list_head perf_counter_list;
1411 1411 #endif
1412 1412 #ifdef CONFIG_NUMA
1413   - struct mempolicy *mempolicy;
  1413 + struct mempolicy *mempolicy; /* Protected by alloc_lock */
1414 1414 short il_next;
1415 1415 #endif
1416 1416 atomic_t fs_excl; /* holding fs exclusive resources */

init/main.c
... ... @@ -670,7 +670,6 @@
670 670 initrd_start = 0;
671 671 }
672 672 #endif
673   - cpuset_init_early();
674 673 page_cgroup_init();
675 674 enable_debug_pagealloc();
676 675 cpu_hotplug_init();
... ... @@ -867,6 +866,11 @@
867 866 static int __init kernel_init(void * unused)
868 867 {
869 868 lock_kernel();
  869 +
  870 + /*
  871 + * init can allocate pages on any node
  872 + */
  873 + set_mems_allowed(node_possible_map);
870 874 /*
871 875 * init can run on any cpu.
872 876 */

kernel/cpuset.c
... ... @@ -97,12 +97,6 @@
97 97  
98 98 struct cpuset *parent; /* my parent */
99 99  
100   - /*
101   - * Copy of global cpuset_mems_generation as of the most
102   - * recent time this cpuset changed its mems_allowed.
103   - */
104   - int mems_generation;
105   -
106 100 struct fmeter fmeter; /* memory_pressure filter */
107 101  
108 102 /* partition number for rebuild_sched_domains() */
... ... @@ -176,27 +170,6 @@
176 170 return test_bit(CS_SPREAD_SLAB, &cs->flags);
177 171 }
178 172  
179   -/*
180   - * Increment this integer everytime any cpuset changes its
181   - * mems_allowed value. Users of cpusets can track this generation
182   - * number, and avoid having to lock and reload mems_allowed unless
183   - * the cpuset they're using changes generation.
184   - *
185   - * A single, global generation is needed because cpuset_attach_task() could
186   - * reattach a task to a different cpuset, which must not have its
187   - * generation numbers aliased with those of that tasks previous cpuset.
188   - *
189   - * Generations are needed for mems_allowed because one task cannot
190   - * modify another's memory placement. So we must enable every task,
191   - * on every visit to __alloc_pages(), to efficiently check whether
192   - * its current->cpuset->mems_allowed has changed, requiring an update
193   - * of its current->mems_allowed.
194   - *
195   - * Since writes to cpuset_mems_generation are guarded by the cgroup lock
196   - * there is no need to mark it atomic.
197   - */
198   -static int cpuset_mems_generation;
199   -
200 173 static struct cpuset top_cpuset = {
201 174 .flags = ((1 << CS_CPU_EXCLUSIVE) | (1 << CS_MEM_EXCLUSIVE)),
202 175 };
... ... @@ -228,8 +201,9 @@
228 201 * If a task is only holding callback_mutex, then it has read-only
229 202 * access to cpusets.
230 203 *
231   - * The task_struct fields mems_allowed and mems_generation may only
232   - * be accessed in the context of that task, so require no locks.
  204 + * Now, the task_struct fields mems_allowed and mempolicy may be changed
  205 + * by other task, we use alloc_lock in the task_struct fields to protect
  206 + * them.
233 207 *
234 208 * The cpuset_common_file_read() handlers only hold callback_mutex across
235 209 * small pieces of code, such as when reading out possibly multi-word
... ... @@ -349,69 +323,6 @@
349 323 tsk->flags &= ~PF_SPREAD_SLAB;
350 324 }
351 325  
352   -/**
353   - * cpuset_update_task_memory_state - update task memory placement
354   - *
355   - * If the current tasks cpusets mems_allowed changed behind our
356   - * backs, update current->mems_allowed, mems_generation and task NUMA
357   - * mempolicy to the new value.
358   - *
359   - * Task mempolicy is updated by rebinding it relative to the
360   - * current->cpuset if a task has its memory placement changed.
361   - * Do not call this routine if in_interrupt().
362   - *
363   - * Call without callback_mutex or task_lock() held. May be
364   - * called with or without cgroup_mutex held. Thanks in part to
365   - * 'the_top_cpuset_hack', the task's cpuset pointer will never
366   - * be NULL. This routine also might acquire callback_mutex during
367   - * call.
368   - *
369   - * Reading current->cpuset->mems_generation doesn't need task_lock
370   - * to guard the current->cpuset derefence, because it is guarded
371   - * from concurrent freeing of current->cpuset using RCU.
372   - *
373   - * The rcu_dereference() is technically probably not needed,
374   - * as I don't actually mind if I see a new cpuset pointer but
375   - * an old value of mems_generation. However this really only
376   - * matters on alpha systems using cpusets heavily. If I dropped
377   - * that rcu_dereference(), it would save them a memory barrier.
378   - * For all other arch's, rcu_dereference is a no-op anyway, and for
379   - * alpha systems not using cpusets, another planned optimization,
380   - * avoiding the rcu critical section for tasks in the root cpuset
381   - * which is statically allocated, so can't vanish, will make this
382   - * irrelevant. Better to use RCU as intended, than to engage in
383   - * some cute trick to save a memory barrier that is impossible to
384   - * test, for alpha systems using cpusets heavily, which might not
385   - * even exist.
386   - *
387   - * This routine is needed to update the per-task mems_allowed data,
388   - * within the tasks context, when it is trying to allocate memory
389   - * (in various mm/mempolicy.c routines) and notices that some other
390   - * task has been modifying its cpuset.
391   - */
392   -
393   -void cpuset_update_task_memory_state(void)
394   -{
395   - int my_cpusets_mem_gen;
396   - struct task_struct *tsk = current;
397   - struct cpuset *cs;
398   -
399   - rcu_read_lock();
400   - my_cpusets_mem_gen = task_cs(tsk)->mems_generation;
401   - rcu_read_unlock();
402   -
403   - if (my_cpusets_mem_gen != tsk->cpuset_mems_generation) {
404   - mutex_lock(&callback_mutex);
405   - task_lock(tsk);
406   - cs = task_cs(tsk); /* Maybe changed when task not locked */
407   - guarantee_online_mems(cs, &tsk->mems_allowed);
408   - tsk->cpuset_mems_generation = cs->mems_generation;
409   - task_unlock(tsk);
410   - mutex_unlock(&callback_mutex);
411   - mpol_rebind_task(tsk, &tsk->mems_allowed);
412   - }
413   -}
414   -
415 326 /*
416 327 * is_cpuset_subset(p, q) - Is cpuset p a subset of cpuset q?
417 328 *
... ... @@ -1017,14 +928,6 @@
1017 928 * other task, the task_struct mems_allowed that we are hacking
1018 929 * is for our current task, which must allocate new pages for that
1019 930 * migrating memory region.
1020   - *
1021   - * We call cpuset_update_task_memory_state() before hacking
1022   - * our tasks mems_allowed, so that we are assured of being in
1023   - * sync with our tasks cpuset, and in particular, callbacks to
1024   - * cpuset_update_task_memory_state() from nested page allocations
1025   - * won't see any mismatch of our cpuset and task mems_generation
1026   - * values, so won't overwrite our hacked tasks mems_allowed
1027   - * nodemask.
1028 931 */
1029 932  
1030 933 static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
... ... @@ -1032,23 +935,38 @@
1032 935 {
1033 936 struct task_struct *tsk = current;
1034 937  
1035   - cpuset_update_task_memory_state();
1036   -
1037   - mutex_lock(&callback_mutex);
1038 938 tsk->mems_allowed = *to;
1039   - mutex_unlock(&callback_mutex);
1040 939  
1041 940 do_migrate_pages(mm, from, to, MPOL_MF_MOVE_ALL);
1042 941  
1043   - mutex_lock(&callback_mutex);
1044 942 guarantee_online_mems(task_cs(tsk),&tsk->mems_allowed);
1045   - mutex_unlock(&callback_mutex);
1046 943 }
1047 944  
1048 945 /*
1049   - * Rebind task's vmas to cpuset's new mems_allowed, and migrate pages to new
1050   - * nodes if memory_migrate flag is set. Called with cgroup_mutex held.
  946 + * cpuset_change_task_nodemask - change task's mems_allowed and mempolicy
  947 + * @tsk: the task to change
  948 + * @newmems: new nodes that the task will be set
  949 + *
  950 + * In order to avoid seeing no nodes if the old and new nodes are disjoint,
  951 + * we structure updates as setting all new allowed nodes, then clearing newly
  952 + * disallowed ones.
  953 + *
  954 + * Called with task's alloc_lock held
1051 955 */
  956 +static void cpuset_change_task_nodemask(struct task_struct *tsk,
  957 + nodemask_t *newmems)
  958 +{
  959 + nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems);
  960 + mpol_rebind_task(tsk, &tsk->mems_allowed);
  961 + mpol_rebind_task(tsk, newmems);
  962 + tsk->mems_allowed = *newmems;
  963 +}
  964 +
  965 +/*
  966 + * Update task's mems_allowed and rebind its mempolicy and vmas' mempolicy
  967 + * of it to cpuset's new mems_allowed, and migrate pages to new nodes if
  968 + * memory_migrate flag is set. Called with cgroup_mutex held.
  969 + */
1052 970 static void cpuset_change_nodemask(struct task_struct *p,
1053 971 struct cgroup_scanner *scan)
1054 972 {
... ... @@ -1056,12 +974,19 @@
1056 974 struct cpuset *cs;
1057 975 int migrate;
1058 976 const nodemask_t *oldmem = scan->data;
  977 + nodemask_t newmems;
1059 978  
  979 + cs = cgroup_cs(scan->cg);
  980 + guarantee_online_mems(cs, &newmems);
  981 +
  982 + task_lock(p);
  983 + cpuset_change_task_nodemask(p, &newmems);
  984 + task_unlock(p);
  985 +
1060 986 mm = get_task_mm(p);
1061 987 if (!mm)
1062 988 return;
1063 989  
1064   - cs = cgroup_cs(scan->cg);
1065 990 migrate = is_memory_migrate(cs);
1066 991  
1067 992 mpol_rebind_mm(mm, &cs->mems_allowed);
... ... @@ -1114,10 +1039,10 @@
1114 1039 /*
1115 1040 * Handle user request to change the 'mems' memory placement
1116 1041 * of a cpuset. Needs to validate the request, update the
1117   - * cpusets mems_allowed and mems_generation, and for each
1118   - * task in the cpuset, rebind any vma mempolicies and if
1119   - * the cpuset is marked 'memory_migrate', migrate the tasks
1120   - * pages to the new memory.
  1042 + * cpusets mems_allowed, and for each task in the cpuset,
  1043 + * update mems_allowed and rebind task's mempolicy and any vma
  1044 + * mempolicies and if the cpuset is marked 'memory_migrate',
  1045 + * migrate the tasks pages to the new memory.
1121 1046 *
1122 1047 * Call with cgroup_mutex held. May take callback_mutex during call.
1123 1048 * Will take tasklist_lock, scan tasklist for tasks in cpuset cs,
... ... @@ -1170,7 +1095,6 @@
1170 1095  
1171 1096 mutex_lock(&callback_mutex);
1172 1097 cs->mems_allowed = trialcs->mems_allowed;
1173   - cs->mems_generation = cpuset_mems_generation++;
1174 1098 mutex_unlock(&callback_mutex);
1175 1099  
1176 1100 update_tasks_nodemask(cs, &oldmem, &heap);
... ... @@ -1434,15 +1358,18 @@
1434 1358  
1435 1359 if (cs == &top_cpuset) {
1436 1360 cpumask_copy(cpus_attach, cpu_possible_mask);
  1361 + to = node_possible_map;
1437 1362 } else {
1438   - mutex_lock(&callback_mutex);
1439 1363 guarantee_online_cpus(cs, cpus_attach);
1440   - mutex_unlock(&callback_mutex);
  1364 + guarantee_online_mems(cs, &to);
1441 1365 }
1442 1366 err = set_cpus_allowed_ptr(tsk, cpus_attach);
1443 1367 if (err)
1444 1368 return;
1445 1369  
  1370 + task_lock(tsk);
  1371 + cpuset_change_task_nodemask(tsk, &to);
  1372 + task_unlock(tsk);
1446 1373 cpuset_update_task_spread_flag(cs, tsk);
1447 1374  
1448 1375 from = oldcs->mems_allowed;
... ... @@ -1848,8 +1775,6 @@
1848 1775 struct cpuset *parent;
1849 1776  
1850 1777 if (!cont->parent) {
1851   - /* This is early initialization for the top cgroup */
1852   - top_cpuset.mems_generation = cpuset_mems_generation++;
1853 1778 return &top_cpuset.css;
1854 1779 }
1855 1780 parent = cgroup_cs(cont->parent);
... ... @@ -1861,7 +1786,6 @@
1861 1786 return ERR_PTR(-ENOMEM);
1862 1787 }
1863 1788  
1864   - cpuset_update_task_memory_state();
1865 1789 cs->flags = 0;
1866 1790 if (is_spread_page(parent))
1867 1791 set_bit(CS_SPREAD_PAGE, &cs->flags);
... ... @@ -1870,7 +1794,6 @@
1870 1794 set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
1871 1795 cpumask_clear(cs->cpus_allowed);
1872 1796 nodes_clear(cs->mems_allowed);
1873   - cs->mems_generation = cpuset_mems_generation++;
1874 1797 fmeter_init(&cs->fmeter);
1875 1798 cs->relax_domain_level = -1;
1876 1799  
... ... @@ -1889,8 +1812,6 @@
1889 1812 {
1890 1813 struct cpuset *cs = cgroup_cs(cont);
1891 1814  
1892   - cpuset_update_task_memory_state();
1893   -
1894 1815 if (is_sched_load_balance(cs))
1895 1816 update_flag(CS_SCHED_LOAD_BALANCE, cs, 0);
1896 1817  
... ... @@ -1911,21 +1832,6 @@
1911 1832 .early_init = 1,
1912 1833 };
1913 1834  
1914   -/*
1915   - * cpuset_init_early - just enough so that the calls to
1916   - * cpuset_update_task_memory_state() in early init code
1917   - * are harmless.
1918   - */
1919   -
1920   -int __init cpuset_init_early(void)
1921   -{
1922   - alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_NOWAIT);
1923   -
1924   - top_cpuset.mems_generation = cpuset_mems_generation++;
1925   - return 0;
1926   -}
1927   -
1928   -
1929 1835 /**
1930 1836 * cpuset_init - initialize cpusets at system boot
1931 1837 *
1932 1838  
... ... @@ -1936,11 +1842,13 @@
1936 1842 {
1937 1843 int err = 0;
1938 1844  
  1845 + if (!alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_KERNEL))
  1846 + BUG();
  1847 +
1939 1848 cpumask_setall(top_cpuset.cpus_allowed);
1940 1849 nodes_setall(top_cpuset.mems_allowed);
1941 1850  
1942 1851 fmeter_init(&top_cpuset.fmeter);
1943   - top_cpuset.mems_generation = cpuset_mems_generation++;
1944 1852 set_bit(CS_SCHED_LOAD_BALANCE, &top_cpuset.flags);
1945 1853 top_cpuset.relax_domain_level = -1;
1946 1854  

kernel/kthread.c
... ... @@ -9,6 +9,7 @@
9 9 #include <linux/kthread.h>
10 10 #include <linux/completion.h>
11 11 #include <linux/err.h>
  12 +#include <linux/cpuset.h>
12 13 #include <linux/unistd.h>
13 14 #include <linux/file.h>
14 15 #include <linux/module.h>
... ... @@ -236,6 +237,7 @@
236 237 ignore_signals(tsk);
237 238 set_user_nice(tsk, KTHREAD_NICE_LEVEL);
238 239 set_cpus_allowed_ptr(tsk, cpu_all_mask);
  240 + set_mems_allowed(node_possible_map);
239 241  
240 242 current->flags |= PF_NOFREEZE | PF_FREEZER_NOSIG;
241 243  

mm/mempolicy.c
... ... @@ -182,13 +182,54 @@
182 182 return 0;
183 183 }
184 184  
185   -/* Create a new policy */
  185 +/*
  186 + * mpol_set_nodemask is called after mpol_new() to set up the nodemask, if
  187 + * any, for the new policy. mpol_new() has already validated the nodes
  188 + * parameter with respect to the policy mode and flags. But, we need to
  189 + * handle an empty nodemask with MPOL_PREFERRED here.
  190 + *
  191 + * Must be called holding task's alloc_lock to protect task's mems_allowed
  192 + * and mempolicy. May also be called holding the mmap_semaphore for write.
  193 + */
  194 +static int mpol_set_nodemask(struct mempolicy *pol, const nodemask_t *nodes)
  195 +{
  196 + nodemask_t cpuset_context_nmask;
  197 + int ret;
  198 +
  199 + /* if mode is MPOL_DEFAULT, pol is NULL. This is right. */
  200 + if (pol == NULL)
  201 + return 0;
  202 +
  203 + VM_BUG_ON(!nodes);
  204 + if (pol->mode == MPOL_PREFERRED && nodes_empty(*nodes))
  205 + nodes = NULL; /* explicit local allocation */
  206 + else {
  207 + if (pol->flags & MPOL_F_RELATIVE_NODES)
  208 + mpol_relative_nodemask(&cpuset_context_nmask, nodes,
  209 + &cpuset_current_mems_allowed);
  210 + else
  211 + nodes_and(cpuset_context_nmask, *nodes,
  212 + cpuset_current_mems_allowed);
  213 + if (mpol_store_user_nodemask(pol))
  214 + pol->w.user_nodemask = *nodes;
  215 + else
  216 + pol->w.cpuset_mems_allowed =
  217 + cpuset_current_mems_allowed;
  218 + }
  219 +
  220 + ret = mpol_ops[pol->mode].create(pol,
  221 + nodes ? &cpuset_context_nmask : NULL);
  222 + return ret;
  223 +}
  224 +
  225 +/*
  226 + * This function just creates a new policy, does some check and simple
  227 + * initialization. You must invoke mpol_set_nodemask() to set nodes.
  228 + */
186 229 static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
187 230 nodemask_t *nodes)
188 231 {
189 232 struct mempolicy *policy;
190   - nodemask_t cpuset_context_nmask;
191   - int ret;
192 233  
193 234 pr_debug("setting mode %d flags %d nodes[0] %lx\n",
194 235 mode, flags, nodes ? nodes_addr(*nodes)[0] : -1);
... ... @@ -210,7 +251,6 @@
210 251 if (((flags & MPOL_F_STATIC_NODES) ||
211 252 (flags & MPOL_F_RELATIVE_NODES)))
212 253 return ERR_PTR(-EINVAL);
213   - nodes = NULL; /* flag local alloc */
214 254 }
215 255 } else if (nodes_empty(*nodes))
216 256 return ERR_PTR(-EINVAL);
... ... @@ -221,30 +261,6 @@
221 261 policy->mode = mode;
222 262 policy->flags = flags;
223 263  
224   - if (nodes) {
225   - /*
226   - * cpuset related setup doesn't apply to local allocation
227   - */
228   - cpuset_update_task_memory_state();
229   - if (flags & MPOL_F_RELATIVE_NODES)
230   - mpol_relative_nodemask(&cpuset_context_nmask, nodes,
231   - &cpuset_current_mems_allowed);
232   - else
233   - nodes_and(cpuset_context_nmask, *nodes,
234   - cpuset_current_mems_allowed);
235   - if (mpol_store_user_nodemask(policy))
236   - policy->w.user_nodemask = *nodes;
237   - else
238   - policy->w.cpuset_mems_allowed =
239   - cpuset_mems_allowed(current);
240   - }
241   -
242   - ret = mpol_ops[mode].create(policy,
243   - nodes ? &cpuset_context_nmask : NULL);
244   - if (ret < 0) {
245   - kmem_cache_free(policy_cache, policy);
246   - return ERR_PTR(ret);
247   - }
248 264 return policy;
249 265 }
250 266  
... ... @@ -324,6 +340,8 @@
324 340 /*
325 341 * Wrapper for mpol_rebind_policy() that just requires task
326 342 * pointer, and updates task mempolicy.
  343 + *
  344 + * Called with task's alloc_lock held.
327 345 */
328 346  
329 347 void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new)
330 348  
... ... @@ -600,8 +618,9 @@
600 618 static long do_set_mempolicy(unsigned short mode, unsigned short flags,
601 619 nodemask_t *nodes)
602 620 {
603   - struct mempolicy *new;
  621 + struct mempolicy *new, *old;
604 622 struct mm_struct *mm = current->mm;
  623 + int ret;
605 624  
606 625 new = mpol_new(mode, flags, nodes);
607 626 if (IS_ERR(new))
... ... @@ -615,20 +634,33 @@
615 634 */
616 635 if (mm)
617 636 down_write(&mm->mmap_sem);
618   - mpol_put(current->mempolicy);
  637 + task_lock(current);
  638 + ret = mpol_set_nodemask(new, nodes);
  639 + if (ret) {
  640 + task_unlock(current);
  641 + if (mm)
  642 + up_write(&mm->mmap_sem);
  643 + mpol_put(new);
  644 + return ret;
  645 + }
  646 + old = current->mempolicy;
619 647 current->mempolicy = new;
620 648 mpol_set_task_struct_flag();
621 649 if (new && new->mode == MPOL_INTERLEAVE &&
622 650 nodes_weight(new->v.nodes))
623 651 current->il_next = first_node(new->v.nodes);
  652 + task_unlock(current);
624 653 if (mm)
625 654 up_write(&mm->mmap_sem);
626 655  
  656 + mpol_put(old);
627 657 return 0;
628 658 }
629 659  
630 660 /*
631 661 * Return nodemask for policy for get_mempolicy() query
  662 + *
  663 + * Called with task's alloc_lock held
632 664 */
633 665 static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes)
634 666 {
... ... @@ -674,7 +706,6 @@
674 706 struct vm_area_struct *vma = NULL;
675 707 struct mempolicy *pol = current->mempolicy;
676 708  
677   - cpuset_update_task_memory_state();
678 709 if (flags &
679 710 ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
680 711 return -EINVAL;
681 712  
... ... @@ -683,7 +714,9 @@
683 714 if (flags & (MPOL_F_NODE|MPOL_F_ADDR))
684 715 return -EINVAL;
685 716 *policy = 0; /* just so it's initialized */
  717 + task_lock(current);
686 718 *nmask = cpuset_current_mems_allowed;
  719 + task_unlock(current);
687 720 return 0;
688 721 }
... ... @@ -738,8 +771,11 @@
738 771 }
739 772  
740 773 err = 0;
741   - if (nmask)
  774 + if (nmask) {
  775 + task_lock(current);
742 776 get_policy_nodemask(pol, nmask);
  777 + task_unlock(current);
  778 + }
743 779  
744 780 out:
745 781 mpol_cond_put(pol);
... ... @@ -979,6 +1015,14 @@
979 1015 return err;
980 1016 }
981 1017 down_write(&mm->mmap_sem);
  1018 + task_lock(current);
  1019 + err = mpol_set_nodemask(new, nmask);
  1020 + task_unlock(current);
  1021 + if (err) {
  1022 + up_write(&mm->mmap_sem);
  1023 + mpol_put(new);
  1024 + return err;
  1025 + }
982 1026 vma = check_range(mm, start, end, nmask,
983 1027 flags | MPOL_MF_INVERT, &pagelist);
984 1028  
... ... @@ -1545,8 +1589,6 @@
1545 1589 struct mempolicy *pol = get_vma_policy(current, vma, addr);
1546 1590 struct zonelist *zl;
1547 1591  
1548   - cpuset_update_task_memory_state();
1549   -
1550 1592 if (unlikely(pol->mode == MPOL_INTERLEAVE)) {
1551 1593 unsigned nid;
1552 1594  
... ... @@ -1593,8 +1635,6 @@
1593 1635 {
1594 1636 struct mempolicy *pol = current->mempolicy;
1595 1637  
1596   - if ((gfp & __GFP_WAIT) && !in_interrupt())
1597   - cpuset_update_task_memory_state();
1598 1638 if (!pol || in_interrupt() || (gfp & __GFP_THISNODE))
1599 1639 pol = &default_policy;
1600 1640  
... ... @@ -1854,6 +1894,8 @@
1854 1894 */
1855 1895 void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
1856 1896 {
  1897 + int ret;
  1898 +
1857 1899 sp->root = RB_ROOT; /* empty tree == default mempolicy */
1858 1900 spin_lock_init(&sp->lock);
... ... @@ -1863,10 +1905,20 @@
1863 1905  
1864 1906 /* contextualize the tmpfs mount point mempolicy */
1865 1907 new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask);
1866   - mpol_put(mpol); /* drop our ref on sb mpol */
1867   - if (IS_ERR(new))
  1908 + if (IS_ERR(new)) {
  1909 + mpol_put(mpol); /* drop our ref on sb mpol */
1868 1910 return; /* no valid nodemask intersection */
  1911 + }
1869 1912  
  1913 + task_lock(current);
  1914 + ret = mpol_set_nodemask(new, &mpol->w.user_nodemask);
  1915 + task_unlock(current);
  1916 + mpol_put(mpol); /* drop our ref on sb mpol */
  1917 + if (ret) {
  1918 + mpol_put(new);
  1919 + return;
  1920 + }
  1921 +
1870 1922 /* Create pseudo-vma that contains just the policy */
1871 1923 memset(&pvma, 0, sizeof(struct vm_area_struct));
1872 1924 pvma.vm_end = TASK_SIZE; /* policy covers entire file */
... ... @@ -2086,8 +2138,19 @@
2086 2138 new = mpol_new(mode, mode_flags, &nodes);
2087 2139 if (IS_ERR(new))
2088 2140 err = 1;
2089   - else if (no_context)
2090   - new->w.user_nodemask = nodes; /* save for contextualization */
  2141 + else {
  2142 + int ret;
  2143 +
  2144 + task_lock(current);
  2145 + ret = mpol_set_nodemask(new, &nodes);
  2146 + task_unlock(current);
  2147 + if (ret)
  2148 + err = 1;
  2149 + else if (no_context) {
  2150 + /* save for contextualization */
  2151 + new->w.user_nodemask = nodes;
  2152 + }
  2153 + }
2091 2154  
2092 2155 out:
2093 2156 /* Restore string for error message */

mm/page_alloc.c
... ... @@ -1569,10 +1569,7 @@
1569 1569  
1570 1570 /* We now go into synchronous reclaim */
1571 1571 cpuset_memory_pressure_bump();
1572   - /*
1573   - * The task's cpuset might have expanded its set of allowable nodes
1574   - */
1575   - cpuset_update_task_memory_state();
  1572 +
1576 1573 p->flags |= PF_MEMALLOC;
1577 1574  
1578 1575 lockdep_set_current_reclaim_state(gfp_mask);