Commit 0e1e7c7a739562a321fda07c7cd2a97a7114f8f8

Authored by Christoph Lameter
Committed by Linus Torvalds
1 parent 523b945855

Memoryless nodes: Use N_HIGH_MEMORY for cpusets

cpusets try to ensure that any node added to a cpuset's mems_allowed is
on-line and contains memory.  The assumption was that online nodes contained
memory.  Thus, it is possible to add memoryless nodes to a cpuset and then add
tasks to this cpuset.  This results in a continuous series of oom-kills and
an apparent system hang.

Change cpusets to use node_states[N_HIGH_MEMORY] [a.k.a.  node_memory_map] in
place of node_online_map when vetting memories.  Return error if admin
attempts to write a non-empty mems_allowed node mask containing only
memoryless-nodes.

Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com>
Signed-off-by: Bob Picco <bob.picco@hp.com>
Signed-off-by: Nishanth Aravamudan <nacc@us.ibm.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Mel Gorman <mel@skynet.ie>
Signed-off-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

Showing 3 changed files with 43 additions and 22 deletions Side-by-side Diff

Documentation/cpusets.txt
... ... @@ -35,7 +35,8 @@
35 35 ----------------------
36 36  
37 37 Cpusets provide a mechanism for assigning a set of CPUs and Memory
38   -Nodes to a set of tasks.
  38 +Nodes to a set of tasks. In this document "Memory Node" refers to
  39 +an on-line node that contains memory.
39 40  
40 41 Cpusets constrain the CPU and Memory placement of tasks to only
41 42 the resources within a tasks current cpuset. They form a nested
... ... @@ -220,8 +221,8 @@
220 221 The cpus and mems files in the root (top_cpuset) cpuset are
221 222 read-only. The cpus file automatically tracks the value of
222 223 cpu_online_map using a CPU hotplug notifier, and the mems file
223   -automatically tracks the value of node_online_map using the
224   -cpuset_track_online_nodes() hook.
  224 +automatically tracks the value of node_states[N_HIGH_MEMORY]--i.e.,
  225 +nodes with memory--using the cpuset_track_online_nodes() hook.
225 226  
226 227  
227 228 1.4 What are exclusive cpusets ?
include/linux/cpuset.h
... ... @@ -93,7 +93,7 @@
93 93 return node_possible_map;
94 94 }
95 95  
96   -#define cpuset_current_mems_allowed (node_online_map)
  96 +#define cpuset_current_mems_allowed (node_states[N_HIGH_MEMORY])
97 97 static inline void cpuset_init_current_mems_allowed(void) {}
98 98 static inline void cpuset_update_task_memory_state(void) {}
99 99 #define cpuset_nodes_subset_current_mems_allowed(nodes) (1)
kernel/cpuset.c
... ... @@ -581,26 +581,28 @@
581 581  
582 582 /*
583 583 * Return in *pmask the portion of a cpusets's mems_allowed that
584   - * are online. If none are online, walk up the cpuset hierarchy
585   - * until we find one that does have some online mems. If we get
586   - * all the way to the top and still haven't found any online mems,
587   - * return node_online_map.
  584 + * are online, with memory. If none are online with memory, walk
  585 + * up the cpuset hierarchy until we find one that does have some
  586 + * online mems. If we get all the way to the top and still haven't
  587 + * found any online mems, return node_states[N_HIGH_MEMORY].
588 588 *
589 589 * One way or another, we guarantee to return some non-empty subset
590   - * of node_online_map.
  590 + * of node_states[N_HIGH_MEMORY].
591 591 *
592 592 * Call with callback_mutex held.
593 593 */
594 594  
595 595 static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask)
596 596 {
597   - while (cs && !nodes_intersects(cs->mems_allowed, node_online_map))
  597 + while (cs && !nodes_intersects(cs->mems_allowed,
  598 + node_states[N_HIGH_MEMORY]))
598 599 cs = cs->parent;
599 600 if (cs)
600   - nodes_and(*pmask, cs->mems_allowed, node_online_map);
  601 + nodes_and(*pmask, cs->mems_allowed,
  602 + node_states[N_HIGH_MEMORY]);
601 603 else
602   - *pmask = node_online_map;
603   - BUG_ON(!nodes_intersects(*pmask, node_online_map));
  604 + *pmask = node_states[N_HIGH_MEMORY];
  605 + BUG_ON(!nodes_intersects(*pmask, node_states[N_HIGH_MEMORY]));
604 606 }
605 607  
606 608 /**
... ... @@ -924,7 +926,10 @@
924 926 int fudge;
925 927 int retval;
926 928  
927   - /* top_cpuset.mems_allowed tracks node_online_map; it's read-only */
  929 + /*
  930 + * top_cpuset.mems_allowed tracks node_states[N_HIGH_MEMORY];
  931 + * it's read-only
  932 + */
928 933 if (cs == &top_cpuset)
929 934 return -EACCES;
930 935  
931 936  
... ... @@ -941,8 +946,21 @@
941 946 retval = nodelist_parse(buf, trialcs.mems_allowed);
942 947 if (retval < 0)
943 948 goto done;
  949 + if (!nodes_intersects(trialcs.mems_allowed,
  950 + node_states[N_HIGH_MEMORY])) {
  951 + /*
  952 + * error if only memoryless nodes specified.
  953 + */
  954 + retval = -ENOSPC;
  955 + goto done;
  956 + }
944 957 }
945   - nodes_and(trialcs.mems_allowed, trialcs.mems_allowed, node_online_map);
  958 + /*
  959 + * Exclude memoryless nodes. We know that trialcs.mems_allowed
  960 + * contains at least one node with memory.
  961 + */
  962 + nodes_and(trialcs.mems_allowed, trialcs.mems_allowed,
  963 + node_states[N_HIGH_MEMORY]);
946 964 oldmem = cs->mems_allowed;
947 965 if (nodes_equal(oldmem, trialcs.mems_allowed)) {
948 966 retval = 0; /* Too easy - nothing to do */
... ... @@ -2098,8 +2116,9 @@
2098 2116  
2099 2117 /*
2100 2118 * The cpus_allowed and mems_allowed nodemasks in the top_cpuset track
2101   - * cpu_online_map and node_online_map. Force the top cpuset to track
2102   - * whats online after any CPU or memory node hotplug or unplug event.
  2119 + * cpu_online_map and node_states[N_HIGH_MEMORY]. Force the top cpuset to
  2120 + * track what's online after any CPU or memory node hotplug or unplug
  2121 + * event.
2103 2122 *
2104 2123 * To ensure that we don't remove a CPU or node from the top cpuset
2105 2124 * that is currently in use by a child cpuset (which would violate
... ... @@ -2119,7 +2138,7 @@
2119 2138  
2120 2139 guarantee_online_cpus_mems_in_subtree(&top_cpuset);
2121 2140 top_cpuset.cpus_allowed = cpu_online_map;
2122   - top_cpuset.mems_allowed = node_online_map;
  2141 + top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
2123 2142  
2124 2143 mutex_unlock(&callback_mutex);
2125 2144 mutex_unlock(&manage_mutex);
... ... @@ -2147,8 +2166,9 @@
2147 2166  
2148 2167 #ifdef CONFIG_MEMORY_HOTPLUG
2149 2168 /*
2150   - * Keep top_cpuset.mems_allowed tracking node_online_map.
2151   - * Call this routine anytime after you change node_online_map.
  2169 + * Keep top_cpuset.mems_allowed tracking node_states[N_HIGH_MEMORY].
  2170 + * Call this routine anytime after you change
  2171 + * node_states[N_HIGH_MEMORY].
2152 2172 * See also the previous routine cpuset_handle_cpuhp().
2153 2173 */
2154 2174  
... ... @@ -2167,7 +2187,7 @@
2167 2187 void __init cpuset_init_smp(void)
2168 2188 {
2169 2189 top_cpuset.cpus_allowed = cpu_online_map;
2170   - top_cpuset.mems_allowed = node_online_map;
  2190 + top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
2171 2191  
2172 2192 hotcpu_notifier(cpuset_handle_cpuhp, 0);
2173 2193 }
... ... @@ -2309,7 +2329,7 @@
2309 2329 *
2310 2330 * Description: Returns the nodemask_t mems_allowed of the cpuset
2311 2331 * attached to the specified @tsk. Guaranteed to return some non-empty
2312   - * subset of node_online_map, even if this means going outside the
  2332 + * subset of node_states[N_HIGH_MEMORY], even if this means going outside the
2313 2333 * tasks cpuset.
2314 2334 **/
2315 2335