Commit 0e1e7c7a739562a321fda07c7cd2a97a7114f8f8
Committed by
Linus Torvalds
1 parent
523b945855
Exists in
master
and in
39 other branches
Memoryless nodes: Use N_HIGH_MEMORY for cpusets
cpusets try to ensure that any node added to a cpuset's mems_allowed is on-line and contains memory. The assumption was that online nodes contained memory. Thus, it is possible to add memoryless nodes to a cpuset and then add tasks to this cpuset. This results in a continuous series of oom-kills and an apparent system hang. Change cpusets to use node_states[N_HIGH_MEMORY] [a.k.a. node_memory_map] in place of node_online_map when vetting memories. Return error if admin attempts to write a non-empty mems_allowed node mask containing only memoryless nodes. Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com> Signed-off-by: Bob Picco <bob.picco@hp.com> Signed-off-by: Nishanth Aravamudan <nacc@us.ibm.com> Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> Cc: Mel Gorman <mel@skynet.ie> Signed-off-by: Christoph Lameter <clameter@sgi.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Showing 3 changed files with 43 additions and 22 deletions Side-by-side Diff
Documentation/cpusets.txt
... | ... | @@ -35,7 +35,8 @@ |
35 | 35 | ---------------------- |
36 | 36 | |
37 | 37 | Cpusets provide a mechanism for assigning a set of CPUs and Memory |
38 | -Nodes to a set of tasks. | |
38 | +Nodes to a set of tasks. In this document "Memory Node" refers to | |
39 | +an on-line node that contains memory. | |
39 | 40 | |
40 | 41 | Cpusets constrain the CPU and Memory placement of tasks to only |
41 | 42 | the resources within a task's current cpuset. They form a nested |
... | ... | @@ -220,8 +221,8 @@ |
220 | 221 | The cpus and mems files in the root (top_cpuset) cpuset are |
221 | 222 | read-only. The cpus file automatically tracks the value of |
222 | 223 | cpu_online_map using a CPU hotplug notifier, and the mems file |
223 | -automatically tracks the value of node_online_map using the | |
224 | -cpuset_track_online_nodes() hook. | |
224 | +automatically tracks the value of node_states[N_HIGH_MEMORY]--i.e., | |
225 | +nodes with memory--using the cpuset_track_online_nodes() hook. | |
225 | 226 | |
226 | 227 | |
227 | 228 | 1.4 What are exclusive cpusets ? |
include/linux/cpuset.h
... | ... | @@ -93,7 +93,7 @@ |
93 | 93 | return node_possible_map; |
94 | 94 | } |
95 | 95 | |
96 | -#define cpuset_current_mems_allowed (node_online_map) | |
96 | +#define cpuset_current_mems_allowed (node_states[N_HIGH_MEMORY]) | |
97 | 97 | static inline void cpuset_init_current_mems_allowed(void) {} |
98 | 98 | static inline void cpuset_update_task_memory_state(void) {} |
99 | 99 | #define cpuset_nodes_subset_current_mems_allowed(nodes) (1) |
kernel/cpuset.c
... | ... | @@ -581,26 +581,28 @@ |
581 | 581 | |
582 | 582 | /* |
583 | 583 | * Return in *pmask the portion of a cpuset's mems_allowed that |
584 | - * are online. If none are online, walk up the cpuset hierarchy | |
585 | - * until we find one that does have some online mems. If we get | |
586 | - * all the way to the top and still haven't found any online mems, | |
587 | - * return node_online_map. | |
584 | + * are online, with memory. If none are online with memory, walk | |
585 | + * up the cpuset hierarchy until we find one that does have some | |
586 | + * online mems. If we get all the way to the top and still haven't | |
587 | + * found any online mems, return node_states[N_HIGH_MEMORY]. | |
588 | 588 | * |
589 | 589 | * One way or another, we guarantee to return some non-empty subset |
590 | - * of node_online_map. | |
590 | + * of node_states[N_HIGH_MEMORY]. | |
591 | 591 | * |
592 | 592 | * Call with callback_mutex held. |
593 | 593 | */ |
594 | 594 | |
595 | 595 | static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask) |
596 | 596 | { |
597 | - while (cs && !nodes_intersects(cs->mems_allowed, node_online_map)) | |
597 | + while (cs && !nodes_intersects(cs->mems_allowed, | |
598 | + node_states[N_HIGH_MEMORY])) | |
598 | 599 | cs = cs->parent; |
599 | 600 | if (cs) |
600 | - nodes_and(*pmask, cs->mems_allowed, node_online_map); | |
601 | + nodes_and(*pmask, cs->mems_allowed, | |
602 | + node_states[N_HIGH_MEMORY]); | |
601 | 603 | else |
602 | - *pmask = node_online_map; | |
603 | - BUG_ON(!nodes_intersects(*pmask, node_online_map)); | |
604 | + *pmask = node_states[N_HIGH_MEMORY]; | |
605 | + BUG_ON(!nodes_intersects(*pmask, node_states[N_HIGH_MEMORY])); | |
604 | 606 | } |
605 | 607 | |
606 | 608 | /** |
... | ... | @@ -924,7 +926,10 @@ |
924 | 926 | int fudge; |
925 | 927 | int retval; |
926 | 928 | |
927 | - /* top_cpuset.mems_allowed tracks node_online_map; it's read-only */ | |
929 | + /* | |
930 | + * top_cpuset.mems_allowed tracks node_states[N_HIGH_MEMORY]; | |
931 | + * it's read-only | |
932 | + */ | |
928 | 933 | if (cs == &top_cpuset) |
929 | 934 | return -EACCES; |
930 | 935 | |
931 | 936 | |
... | ... | @@ -941,8 +946,21 @@ |
941 | 946 | retval = nodelist_parse(buf, trialcs.mems_allowed); |
942 | 947 | if (retval < 0) |
943 | 948 | goto done; |
949 | + if (!nodes_intersects(trialcs.mems_allowed, | |
950 | + node_states[N_HIGH_MEMORY])) { | |
951 | + /* | |
952 | + * error if only memoryless nodes specified. | |
953 | + */ | |
954 | + retval = -ENOSPC; | |
955 | + goto done; | |
956 | + } | |
944 | 957 | } |
945 | - nodes_and(trialcs.mems_allowed, trialcs.mems_allowed, node_online_map); | |
958 | + /* | |
959 | + * Exclude memoryless nodes. We know that trialcs.mems_allowed | |
960 | + * contains at least one node with memory. | |
961 | + */ | |
962 | + nodes_and(trialcs.mems_allowed, trialcs.mems_allowed, | |
963 | + node_states[N_HIGH_MEMORY]); | |
946 | 964 | oldmem = cs->mems_allowed; |
947 | 965 | if (nodes_equal(oldmem, trialcs.mems_allowed)) { |
948 | 966 | retval = 0; /* Too easy - nothing to do */ |
... | ... | @@ -2098,8 +2116,9 @@ |
2098 | 2116 | |
2099 | 2117 | /* |
2100 | 2118 | * The cpus_allowed and mems_allowed nodemasks in the top_cpuset track |
2101 | - * cpu_online_map and node_online_map. Force the top cpuset to track | |
2102 | - * whats online after any CPU or memory node hotplug or unplug event. | |
2119 | + * cpu_online_map and node_states[N_HIGH_MEMORY]. Force the top cpuset to | |
2120 | + * track what's online after any CPU or memory node hotplug or unplug | |
2121 | + * event. | |
2103 | 2122 | * |
2104 | 2123 | * To ensure that we don't remove a CPU or node from the top cpuset |
2105 | 2124 | * that is currently in use by a child cpuset (which would violate |
... | ... | @@ -2119,7 +2138,7 @@ |
2119 | 2138 | |
2120 | 2139 | guarantee_online_cpus_mems_in_subtree(&top_cpuset); |
2121 | 2140 | top_cpuset.cpus_allowed = cpu_online_map; |
2122 | - top_cpuset.mems_allowed = node_online_map; | |
2141 | + top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY]; | |
2123 | 2142 | |
2124 | 2143 | mutex_unlock(&callback_mutex); |
2125 | 2144 | mutex_unlock(&manage_mutex); |
... | ... | @@ -2147,8 +2166,9 @@ |
2147 | 2166 | |
2148 | 2167 | #ifdef CONFIG_MEMORY_HOTPLUG |
2149 | 2168 | /* |
2150 | - * Keep top_cpuset.mems_allowed tracking node_online_map. | |
2151 | - * Call this routine anytime after you change node_online_map. | |
2169 | + * Keep top_cpuset.mems_allowed tracking node_states[N_HIGH_MEMORY]. | |
2170 | + * Call this routine anytime after you change | |
2171 | + * node_states[N_HIGH_MEMORY]. | |
2152 | 2172 | * See also the previous routine cpuset_handle_cpuhp(). |
2153 | 2173 | */ |
2154 | 2174 | |
... | ... | @@ -2167,7 +2187,7 @@ |
2167 | 2187 | void __init cpuset_init_smp(void) |
2168 | 2188 | { |
2169 | 2189 | top_cpuset.cpus_allowed = cpu_online_map; |
2170 | - top_cpuset.mems_allowed = node_online_map; | |
2190 | + top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY]; | |
2171 | 2191 | |
2172 | 2192 | hotcpu_notifier(cpuset_handle_cpuhp, 0); |
2173 | 2193 | } |
... | ... | @@ -2309,7 +2329,7 @@ |
2309 | 2329 | * |
2310 | 2330 | * Description: Returns the nodemask_t mems_allowed of the cpuset |
2311 | 2331 | * attached to the specified @tsk. Guaranteed to return some non-empty |
2312 | - * subset of node_online_map, even if this means going outside the | |
2332 | + * subset of node_states[N_HIGH_MEMORY], even if this means going outside the | |
2313 | 2333 | * task's cpuset. |
2314 | 2334 | **/ |
2315 | 2335 |