Commit 5606e3877ad8baea42f3a71ebde0a03622bbb551

Authored by Mel Gorman
1 parent 03c5a6e163

mm: numa: Migrate on reference policy

This is the simplest possible policy that still does something of note.
When a pte_numa fault is taken, the faulting page is migrated to the
referencing node immediately. Any replacement policy must at least do
better than this; in all likelihood this policy regresses normal workloads.

Signed-off-by: Mel Gorman <mgorman@suse.de>
Acked-by: Rik van Riel <riel@redhat.com>

Showing 2 changed files with 37 additions and 2 deletions
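The hunks below stop short of the fault path, so the following is only an editorial sketch of the placement decision MPOL_F_MORON is meant to drive: on a NUMA hinting fault the policy ignores where the page currently sits and targets the node of the CPU that took the fault. The helper name and signature are invented for illustration and are not part of this commit.

/*
 * Editorial sketch only -- not a hunk from this commit.  Under
 * MPOL_F_MORON the target node is simply the node of the CPU that
 * took the NUMA hinting fault; otherwise the page stays where it is.
 */
static int moron_target_node(struct mempolicy *pol, struct page *page)
{
	int polnid = page_to_nid(page);	/* default: leave the page alone */

	if (pol->flags & MPOL_F_MORON)
		polnid = numa_node_id();	/* chase the referencing CPU */

	return polnid;
}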

include/uapi/linux/mempolicy.h
1 /* 1 /*
2 * NUMA memory policies for Linux. 2 * NUMA memory policies for Linux.
3 * Copyright 2003,2004 Andi Kleen SuSE Labs 3 * Copyright 2003,2004 Andi Kleen SuSE Labs
4 */ 4 */
5 #ifndef _UAPI_LINUX_MEMPOLICY_H 5 #ifndef _UAPI_LINUX_MEMPOLICY_H
6 #define _UAPI_LINUX_MEMPOLICY_H 6 #define _UAPI_LINUX_MEMPOLICY_H
7 7
8 #include <linux/errno.h> 8 #include <linux/errno.h>
9 9
10 10
11 /* 11 /*
12 * Both the MPOL_* mempolicy mode and the MPOL_F_* optional mode flags are 12 * Both the MPOL_* mempolicy mode and the MPOL_F_* optional mode flags are
13 * passed by the user to either set_mempolicy() or mbind() in an 'int' actual. 13 * passed by the user to either set_mempolicy() or mbind() in an 'int' actual.
14 * The MPOL_MODE_FLAGS macro determines the legal set of optional mode flags. 14 * The MPOL_MODE_FLAGS macro determines the legal set of optional mode flags.
15 */ 15 */
16 16
17 /* Policies */ 17 /* Policies */
18 enum { 18 enum {
19 MPOL_DEFAULT, 19 MPOL_DEFAULT,
20 MPOL_PREFERRED, 20 MPOL_PREFERRED,
21 MPOL_BIND, 21 MPOL_BIND,
22 MPOL_INTERLEAVE, 22 MPOL_INTERLEAVE,
23 MPOL_LOCAL, 23 MPOL_LOCAL,
24 MPOL_MAX, /* always last member of enum */ 24 MPOL_MAX, /* always last member of enum */
25 }; 25 };
26 26
27 enum mpol_rebind_step { 27 enum mpol_rebind_step {
28 MPOL_REBIND_ONCE, /* do rebind work at once(not by two step) */ 28 MPOL_REBIND_ONCE, /* do rebind work at once(not by two step) */
29 MPOL_REBIND_STEP1, /* first step(set all the newly nodes) */ 29 MPOL_REBIND_STEP1, /* first step(set all the newly nodes) */
30 MPOL_REBIND_STEP2, /* second step(clean all the disallowed nodes)*/ 30 MPOL_REBIND_STEP2, /* second step(clean all the disallowed nodes)*/
31 MPOL_REBIND_NSTEP, 31 MPOL_REBIND_NSTEP,
32 }; 32 };
33 33
34 /* Flags for set_mempolicy */ 34 /* Flags for set_mempolicy */
35 #define MPOL_F_STATIC_NODES (1 << 15) 35 #define MPOL_F_STATIC_NODES (1 << 15)
36 #define MPOL_F_RELATIVE_NODES (1 << 14) 36 #define MPOL_F_RELATIVE_NODES (1 << 14)
37 37
38 /* 38 /*
39 * MPOL_MODE_FLAGS is the union of all possible optional mode flags passed to 39 * MPOL_MODE_FLAGS is the union of all possible optional mode flags passed to
40 * either set_mempolicy() or mbind(). 40 * either set_mempolicy() or mbind().
41 */ 41 */
42 #define MPOL_MODE_FLAGS (MPOL_F_STATIC_NODES | MPOL_F_RELATIVE_NODES) 42 #define MPOL_MODE_FLAGS (MPOL_F_STATIC_NODES | MPOL_F_RELATIVE_NODES)
43 43
44 /* Flags for get_mempolicy */ 44 /* Flags for get_mempolicy */
45 #define MPOL_F_NODE (1<<0) /* return next IL mode instead of node mask */ 45 #define MPOL_F_NODE (1<<0) /* return next IL mode instead of node mask */
46 #define MPOL_F_ADDR (1<<1) /* look up vma using address */ 46 #define MPOL_F_ADDR (1<<1) /* look up vma using address */
47 #define MPOL_F_MEMS_ALLOWED (1<<2) /* return allowed memories */ 47 #define MPOL_F_MEMS_ALLOWED (1<<2) /* return allowed memories */
48 48
49 /* Flags for mbind */ 49 /* Flags for mbind */
50 #define MPOL_MF_STRICT (1<<0) /* Verify existing pages in the mapping */ 50 #define MPOL_MF_STRICT (1<<0) /* Verify existing pages in the mapping */
51 #define MPOL_MF_MOVE (1<<1) /* Move pages owned by this process to conform 51 #define MPOL_MF_MOVE (1<<1) /* Move pages owned by this process to conform
52 to policy */ 52 to policy */
53 #define MPOL_MF_MOVE_ALL (1<<2) /* Move every page to conform to policy */ 53 #define MPOL_MF_MOVE_ALL (1<<2) /* Move every page to conform to policy */
54 #define MPOL_MF_LAZY (1<<3) /* Modifies '_MOVE: lazy migrate on fault */ 54 #define MPOL_MF_LAZY (1<<3) /* Modifies '_MOVE: lazy migrate on fault */
55 #define MPOL_MF_INTERNAL (1<<4) /* Internal flags start here */ 55 #define MPOL_MF_INTERNAL (1<<4) /* Internal flags start here */
56 56
57 #define MPOL_MF_VALID (MPOL_MF_STRICT | \ 57 #define MPOL_MF_VALID (MPOL_MF_STRICT | \
58 MPOL_MF_MOVE | \ 58 MPOL_MF_MOVE | \
59 MPOL_MF_MOVE_ALL) 59 MPOL_MF_MOVE_ALL)
60 60
61 /* 61 /*
62 * Internal flags that share the struct mempolicy flags word with 62 * Internal flags that share the struct mempolicy flags word with
63 * "mode flags". These flags are allocated from bit 0 up, as they 63 * "mode flags". These flags are allocated from bit 0 up, as they
64 * are never OR'ed into the mode in mempolicy API arguments. 64 * are never OR'ed into the mode in mempolicy API arguments.
65 */ 65 */
66 #define MPOL_F_SHARED (1 << 0) /* identify shared policies */ 66 #define MPOL_F_SHARED (1 << 0) /* identify shared policies */
67 #define MPOL_F_LOCAL (1 << 1) /* preferred local allocation */ 67 #define MPOL_F_LOCAL (1 << 1) /* preferred local allocation */
68 #define MPOL_F_REBINDING (1 << 2) /* identify policies in rebinding */ 68 #define MPOL_F_REBINDING (1 << 2) /* identify policies in rebinding */
69 #define MPOL_F_MOF (1 << 3) /* this policy wants migrate on fault */ 69 #define MPOL_F_MOF (1 << 3) /* this policy wants migrate on fault */
70 #define MPOL_F_MORON (1 << 4) /* Migrate On pte_numa Reference On Node */
70 71
71 72
72 #endif /* _UAPI_LINUX_MEMPOLICY_H */ 73 #endif /* _UAPI_LINUX_MEMPOLICY_H */
73 74

mm/mempolicy.c
1 /* 1 /*
2 * Simple NUMA memory policy for the Linux kernel. 2 * Simple NUMA memory policy for the Linux kernel.
3 * 3 *
4 * Copyright 2003,2004 Andi Kleen, SuSE Labs. 4 * Copyright 2003,2004 Andi Kleen, SuSE Labs.
5 * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc. 5 * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
6 * Subject to the GNU Public License, version 2. 6 * Subject to the GNU Public License, version 2.
7 * 7 *
8 * NUMA policy allows the user to give hints in which node(s) memory should 8 * NUMA policy allows the user to give hints in which node(s) memory should
9 * be allocated. 9 * be allocated.
10 * 10 *
11 * Support four policies per VMA and per process: 11 * Support four policies per VMA and per process:
12 * 12 *
13 * The VMA policy has priority over the process policy for a page fault. 13 * The VMA policy has priority over the process policy for a page fault.
14 * 14 *
15 * interleave Allocate memory interleaved over a set of nodes, 15 * interleave Allocate memory interleaved over a set of nodes,
16 * with normal fallback if it fails. 16 * with normal fallback if it fails.
17 * For VMA based allocations this interleaves based on the 17 * For VMA based allocations this interleaves based on the
18 * offset into the backing object or offset into the mapping 18 * offset into the backing object or offset into the mapping
19 * for anonymous memory. For process policy an process counter 19 * for anonymous memory. For process policy an process counter
20 * is used. 20 * is used.
21 * 21 *
22 * bind Only allocate memory on a specific set of nodes, 22 * bind Only allocate memory on a specific set of nodes,
23 * no fallback. 23 * no fallback.
24 * FIXME: memory is allocated starting with the first node 24 * FIXME: memory is allocated starting with the first node
25 * to the last. It would be better if bind would truly restrict 25 * to the last. It would be better if bind would truly restrict
26 * the allocation to memory nodes instead 26 * the allocation to memory nodes instead
27 * 27 *
28 * preferred Try a specific node first before normal fallback. 28 * preferred Try a specific node first before normal fallback.
29 * As a special case node -1 here means do the allocation 29 * As a special case node -1 here means do the allocation
30 * on the local CPU. This is normally identical to default, 30 * on the local CPU. This is normally identical to default,
31 * but useful to set in a VMA when you have a non default 31 * but useful to set in a VMA when you have a non default
32 * process policy. 32 * process policy.
33 * 33 *
34 * default Allocate on the local node first, or when on a VMA 34 * default Allocate on the local node first, or when on a VMA
35 * use the process policy. This is what Linux always did 35 * use the process policy. This is what Linux always did
36 * in a NUMA aware kernel and still does by, ahem, default. 36 * in a NUMA aware kernel and still does by, ahem, default.
37 * 37 *
38 * The process policy is applied for most non interrupt memory allocations 38 * The process policy is applied for most non interrupt memory allocations
39 * in that process' context. Interrupts ignore the policies and always 39 * in that process' context. Interrupts ignore the policies and always
40 * try to allocate on the local CPU. The VMA policy is only applied for memory 40 * try to allocate on the local CPU. The VMA policy is only applied for memory
41 * allocations for a VMA in the VM. 41 * allocations for a VMA in the VM.
42 * 42 *
43 * Currently there are a few corner cases in swapping where the policy 43 * Currently there are a few corner cases in swapping where the policy
44 * is not applied, but the majority should be handled. When process policy 44 * is not applied, but the majority should be handled. When process policy
45 * is used it is not remembered over swap outs/swap ins. 45 * is used it is not remembered over swap outs/swap ins.
46 * 46 *
47 * Only the highest zone in the zone hierarchy gets policied. Allocations 47 * Only the highest zone in the zone hierarchy gets policied. Allocations
48 * requesting a lower zone just use default policy. This implies that 48 * requesting a lower zone just use default policy. This implies that
49 * on systems with highmem kernel lowmem allocation don't get policied. 49 * on systems with highmem kernel lowmem allocation don't get policied.
50 * Same with GFP_DMA allocations. 50 * Same with GFP_DMA allocations.
51 * 51 *
52 * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between 52 * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
53 * all users and remembered even when nobody has memory mapped. 53 * all users and remembered even when nobody has memory mapped.
54 */ 54 */
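As an aside (editorial illustration, not part of the diff): the policies described in the comment above are selected from userspace through set_mempolicy(2) or mbind(2). A minimal sketch, assuming the libnuma <numaif.h> wrapper is installed (compile with -lnuma):

#include <numaif.h>
#include <stdio.h>

int main(void)
{
	unsigned long nodes = 0x3;	/* nodemask covering nodes 0 and 1 */

	/* interleave this task's new allocations across nodes 0-1 */
	if (set_mempolicy(MPOL_INTERLEAVE, &nodes, 8 * sizeof(nodes)))
		perror("set_mempolicy(MPOL_INTERLEAVE)");

	/* drop back to the default local-allocation policy */
	if (set_mempolicy(MPOL_DEFAULT, NULL, 0))
		perror("set_mempolicy(MPOL_DEFAULT)");

	return 0;
}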
55 55
56 /* Notebook: 56 /* Notebook:
57 fix mmap readahead to honour policy and enable policy for any page cache 57 fix mmap readahead to honour policy and enable policy for any page cache
58 object 58 object
59 statistics for bigpages 59 statistics for bigpages
60 global policy for page cache? currently it uses process policy. Requires 60 global policy for page cache? currently it uses process policy. Requires
61 first item above. 61 first item above.
62 handle mremap for shared memory (currently ignored for the policy) 62 handle mremap for shared memory (currently ignored for the policy)
63 grows down? 63 grows down?
64 make bind policy root only? It can trigger oom much faster and the 64 make bind policy root only? It can trigger oom much faster and the
65 kernel is not always grateful with that. 65 kernel is not always grateful with that.
66 */ 66 */
67 67
68 #include <linux/mempolicy.h> 68 #include <linux/mempolicy.h>
69 #include <linux/mm.h> 69 #include <linux/mm.h>
70 #include <linux/highmem.h> 70 #include <linux/highmem.h>
71 #include <linux/hugetlb.h> 71 #include <linux/hugetlb.h>
72 #include <linux/kernel.h> 72 #include <linux/kernel.h>
73 #include <linux/sched.h> 73 #include <linux/sched.h>
74 #include <linux/nodemask.h> 74 #include <linux/nodemask.h>
75 #include <linux/cpuset.h> 75 #include <linux/cpuset.h>
76 #include <linux/slab.h> 76 #include <linux/slab.h>
77 #include <linux/string.h> 77 #include <linux/string.h>
78 #include <linux/export.h> 78 #include <linux/export.h>
79 #include <linux/nsproxy.h> 79 #include <linux/nsproxy.h>
80 #include <linux/interrupt.h> 80 #include <linux/interrupt.h>
81 #include <linux/init.h> 81 #include <linux/init.h>
82 #include <linux/compat.h> 82 #include <linux/compat.h>
83 #include <linux/swap.h> 83 #include <linux/swap.h>
84 #include <linux/seq_file.h> 84 #include <linux/seq_file.h>
85 #include <linux/proc_fs.h> 85 #include <linux/proc_fs.h>
86 #include <linux/migrate.h> 86 #include <linux/migrate.h>
87 #include <linux/ksm.h> 87 #include <linux/ksm.h>
88 #include <linux/rmap.h> 88 #include <linux/rmap.h>
89 #include <linux/security.h> 89 #include <linux/security.h>
90 #include <linux/syscalls.h> 90 #include <linux/syscalls.h>
91 #include <linux/ctype.h> 91 #include <linux/ctype.h>
92 #include <linux/mm_inline.h> 92 #include <linux/mm_inline.h>
93 #include <linux/mmu_notifier.h> 93 #include <linux/mmu_notifier.h>
94 94
95 #include <asm/tlbflush.h> 95 #include <asm/tlbflush.h>
96 #include <asm/uaccess.h> 96 #include <asm/uaccess.h>
97 #include <linux/random.h> 97 #include <linux/random.h>
98 98
99 #include "internal.h" 99 #include "internal.h"
100 100
101 /* Internal flags */ 101 /* Internal flags */
102 #define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0) /* Skip checks for continuous vmas */ 102 #define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0) /* Skip checks for continuous vmas */
103 #define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1) /* Invert check for nodemask */ 103 #define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1) /* Invert check for nodemask */
104 104
105 static struct kmem_cache *policy_cache; 105 static struct kmem_cache *policy_cache;
106 static struct kmem_cache *sn_cache; 106 static struct kmem_cache *sn_cache;
107 107
108 /* Highest zone. An specific allocation for a zone below that is not 108 /* Highest zone. An specific allocation for a zone below that is not
109 policied. */ 109 policied. */
110 enum zone_type policy_zone = 0; 110 enum zone_type policy_zone = 0;
111 111
112 /* 112 /*
113 * run-time system-wide default policy => local allocation 113 * run-time system-wide default policy => local allocation
114 */ 114 */
115 static struct mempolicy default_policy = { 115 static struct mempolicy default_policy = {
116 .refcnt = ATOMIC_INIT(1), /* never free it */ 116 .refcnt = ATOMIC_INIT(1), /* never free it */
117 .mode = MPOL_PREFERRED, 117 .mode = MPOL_PREFERRED,
118 .flags = MPOL_F_LOCAL, 118 .flags = MPOL_F_LOCAL,
119 }; 119 };
120 120
121 static struct mempolicy preferred_node_policy[MAX_NUMNODES];
122
123 static struct mempolicy *get_task_policy(struct task_struct *p)
124 {
125 struct mempolicy *pol = p->mempolicy;
126 int node;
127
128 if (!pol) {
129 node = numa_node_id();
130 if (node != -1)
131 pol = &preferred_node_policy[node];
132
133 /* preferred_node_policy is not initialised early in boot */
134 if (!pol->mode)
135 pol = NULL;
136 }
137
138 return pol;
139 }
140
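For context (editorial sketch, not a hunk from this commit): preferred_node_policy[] gives tasks that have no explicit mempolicy a per-node fallback object that can carry the new migrate-on-fault flags. The initialisation site is outside the hunks shown here, so the function name and the exact flag combination below are assumptions for illustration only.

/* Sketch: how a per-node fallback table like preferred_node_policy[]
 * could be filled in at boot.  Not the patch's code. */
static void __init sketch_init_fallback_policies(void)
{
	int nid;

	for_each_node(nid) {
		preferred_node_policy[nid] = (struct mempolicy) {
			.refcnt = ATOMIC_INIT(1),
			.mode = MPOL_PREFERRED,
			.flags = MPOL_F_MOF | MPOL_F_MORON,
			.v = { .preferred_node = nid },
		};
	}
}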
121 static const struct mempolicy_operations { 141 static const struct mempolicy_operations {
122 int (*create)(struct mempolicy *pol, const nodemask_t *nodes); 142 int (*create)(struct mempolicy *pol, const nodemask_t *nodes);
123 /* 143 /*
124 * If read-side task has no lock to protect task->mempolicy, write-side 144 * If read-side task has no lock to protect task->mempolicy, write-side
125 * task will rebind the task->mempolicy by two step. The first step is 145 * task will rebind the task->mempolicy by two step. The first step is
126 * setting all the newly nodes, and the second step is cleaning all the 146 * setting all the newly nodes, and the second step is cleaning all the
127 * disallowed nodes. In this way, we can avoid finding no node to alloc 147 * disallowed nodes. In this way, we can avoid finding no node to alloc
128 * page. 148 * page.
129 * If we have a lock to protect task->mempolicy in read-side, we do 149 * If we have a lock to protect task->mempolicy in read-side, we do
130 * rebind directly. 150 * rebind directly.
131 * 151 *
132 * step: 152 * step:
133 * MPOL_REBIND_ONCE - do rebind work at once 153 * MPOL_REBIND_ONCE - do rebind work at once
134 * MPOL_REBIND_STEP1 - set all the newly nodes 154 * MPOL_REBIND_STEP1 - set all the newly nodes
135 * MPOL_REBIND_STEP2 - clean all the disallowed nodes 155 * MPOL_REBIND_STEP2 - clean all the disallowed nodes
136 */ 156 */
137 void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes, 157 void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes,
138 enum mpol_rebind_step step); 158 enum mpol_rebind_step step);
139 } mpol_ops[MPOL_MAX]; 159 } mpol_ops[MPOL_MAX];
140 160
141 /* Check that the nodemask contains at least one populated zone */ 161 /* Check that the nodemask contains at least one populated zone */
142 static int is_valid_nodemask(const nodemask_t *nodemask) 162 static int is_valid_nodemask(const nodemask_t *nodemask)
143 { 163 {
144 int nd, k; 164 int nd, k;
145 165
146 for_each_node_mask(nd, *nodemask) { 166 for_each_node_mask(nd, *nodemask) {
147 struct zone *z; 167 struct zone *z;
148 168
149 for (k = 0; k <= policy_zone; k++) { 169 for (k = 0; k <= policy_zone; k++) {
150 z = &NODE_DATA(nd)->node_zones[k]; 170 z = &NODE_DATA(nd)->node_zones[k];
151 if (z->present_pages > 0) 171 if (z->present_pages > 0)
152 return 1; 172 return 1;
153 } 173 }
154 } 174 }
155 175
156 return 0; 176 return 0;
157 } 177 }
158 178
159 static inline int mpol_store_user_nodemask(const struct mempolicy *pol) 179 static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
160 { 180 {
161 return pol->flags & MPOL_MODE_FLAGS; 181 return pol->flags & MPOL_MODE_FLAGS;
162 } 182 }
163 183
164 static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig, 184 static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig,
165 const nodemask_t *rel) 185 const nodemask_t *rel)
166 { 186 {
167 nodemask_t tmp; 187 nodemask_t tmp;
168 nodes_fold(tmp, *orig, nodes_weight(*rel)); 188 nodes_fold(tmp, *orig, nodes_weight(*rel));
169 nodes_onto(*ret, tmp, *rel); 189 nodes_onto(*ret, tmp, *rel);
170 } 190 }
171 191
172 static int mpol_new_interleave(struct mempolicy *pol, const nodemask_t *nodes) 192 static int mpol_new_interleave(struct mempolicy *pol, const nodemask_t *nodes)
173 { 193 {
174 if (nodes_empty(*nodes)) 194 if (nodes_empty(*nodes))
175 return -EINVAL; 195 return -EINVAL;
176 pol->v.nodes = *nodes; 196 pol->v.nodes = *nodes;
177 return 0; 197 return 0;
178 } 198 }
179 199
180 static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes) 200 static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes)
181 { 201 {
182 if (!nodes) 202 if (!nodes)
183 pol->flags |= MPOL_F_LOCAL; /* local allocation */ 203 pol->flags |= MPOL_F_LOCAL; /* local allocation */
184 else if (nodes_empty(*nodes)) 204 else if (nodes_empty(*nodes))
185 return -EINVAL; /* no allowed nodes */ 205 return -EINVAL; /* no allowed nodes */
186 else 206 else
187 pol->v.preferred_node = first_node(*nodes); 207 pol->v.preferred_node = first_node(*nodes);
188 return 0; 208 return 0;
189 } 209 }
190 210
191 static int mpol_new_bind(struct mempolicy *pol, const nodemask_t *nodes) 211 static int mpol_new_bind(struct mempolicy *pol, const nodemask_t *nodes)
192 { 212 {
193 if (!is_valid_nodemask(nodes)) 213 if (!is_valid_nodemask(nodes))
194 return -EINVAL; 214 return -EINVAL;
195 pol->v.nodes = *nodes; 215 pol->v.nodes = *nodes;
196 return 0; 216 return 0;
197 } 217 }
198 218
199 /* 219 /*
200 * mpol_set_nodemask is called after mpol_new() to set up the nodemask, if 220 * mpol_set_nodemask is called after mpol_new() to set up the nodemask, if
201 * any, for the new policy. mpol_new() has already validated the nodes 221 * any, for the new policy. mpol_new() has already validated the nodes
202 * parameter with respect to the policy mode and flags. But, we need to 222 * parameter with respect to the policy mode and flags. But, we need to
203 * handle an empty nodemask with MPOL_PREFERRED here. 223 * handle an empty nodemask with MPOL_PREFERRED here.
204 * 224 *
205 * Must be called holding task's alloc_lock to protect task's mems_allowed 225 * Must be called holding task's alloc_lock to protect task's mems_allowed
206 * and mempolicy. May also be called holding the mmap_semaphore for write. 226 * and mempolicy. May also be called holding the mmap_semaphore for write.
207 */ 227 */
208 static int mpol_set_nodemask(struct mempolicy *pol, 228 static int mpol_set_nodemask(struct mempolicy *pol,
209 const nodemask_t *nodes, struct nodemask_scratch *nsc) 229 const nodemask_t *nodes, struct nodemask_scratch *nsc)
210 { 230 {
211 int ret; 231 int ret;
212 232
213 /* if mode is MPOL_DEFAULT, pol is NULL. This is right. */ 233 /* if mode is MPOL_DEFAULT, pol is NULL. This is right. */
214 if (pol == NULL) 234 if (pol == NULL)
215 return 0; 235 return 0;
216 /* Check N_HIGH_MEMORY */ 236 /* Check N_HIGH_MEMORY */
217 nodes_and(nsc->mask1, 237 nodes_and(nsc->mask1,
218 cpuset_current_mems_allowed, node_states[N_HIGH_MEMORY]); 238 cpuset_current_mems_allowed, node_states[N_HIGH_MEMORY]);
219 239
220 VM_BUG_ON(!nodes); 240 VM_BUG_ON(!nodes);
221 if (pol->mode == MPOL_PREFERRED && nodes_empty(*nodes)) 241 if (pol->mode == MPOL_PREFERRED && nodes_empty(*nodes))
222 nodes = NULL; /* explicit local allocation */ 242 nodes = NULL; /* explicit local allocation */
223 else { 243 else {
224 if (pol->flags & MPOL_F_RELATIVE_NODES) 244 if (pol->flags & MPOL_F_RELATIVE_NODES)
225 mpol_relative_nodemask(&nsc->mask2, nodes,&nsc->mask1); 245 mpol_relative_nodemask(&nsc->mask2, nodes,&nsc->mask1);
226 else 246 else
227 nodes_and(nsc->mask2, *nodes, nsc->mask1); 247 nodes_and(nsc->mask2, *nodes, nsc->mask1);
228 248
229 if (mpol_store_user_nodemask(pol)) 249 if (mpol_store_user_nodemask(pol))
230 pol->w.user_nodemask = *nodes; 250 pol->w.user_nodemask = *nodes;
231 else 251 else
232 pol->w.cpuset_mems_allowed = 252 pol->w.cpuset_mems_allowed =
233 cpuset_current_mems_allowed; 253 cpuset_current_mems_allowed;
234 } 254 }
235 255
236 if (nodes) 256 if (nodes)
237 ret = mpol_ops[pol->mode].create(pol, &nsc->mask2); 257 ret = mpol_ops[pol->mode].create(pol, &nsc->mask2);
238 else 258 else
239 ret = mpol_ops[pol->mode].create(pol, NULL); 259 ret = mpol_ops[pol->mode].create(pol, NULL);
240 return ret; 260 return ret;
241 } 261 }
242 262
243 /* 263 /*
244 * This function just creates a new policy, does some check and simple 264 * This function just creates a new policy, does some check and simple
245 * initialization. You must invoke mpol_set_nodemask() to set nodes. 265 * initialization. You must invoke mpol_set_nodemask() to set nodes.
246 */ 266 */
247 static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags, 267 static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
248 nodemask_t *nodes) 268 nodemask_t *nodes)
249 { 269 {
250 struct mempolicy *policy; 270 struct mempolicy *policy;
251 271
252 pr_debug("setting mode %d flags %d nodes[0] %lx\n", 272 pr_debug("setting mode %d flags %d nodes[0] %lx\n",
253 mode, flags, nodes ? nodes_addr(*nodes)[0] : -1); 273 mode, flags, nodes ? nodes_addr(*nodes)[0] : -1);
254 274
255 if (mode == MPOL_DEFAULT) { 275 if (mode == MPOL_DEFAULT) {
256 if (nodes && !nodes_empty(*nodes)) 276 if (nodes && !nodes_empty(*nodes))
257 return ERR_PTR(-EINVAL); 277 return ERR_PTR(-EINVAL);
258 return NULL; 278 return NULL;
259 } 279 }
260 VM_BUG_ON(!nodes); 280 VM_BUG_ON(!nodes);
261 281
262 /* 282 /*
263 * MPOL_PREFERRED cannot be used with MPOL_F_STATIC_NODES or 283 * MPOL_PREFERRED cannot be used with MPOL_F_STATIC_NODES or
264 * MPOL_F_RELATIVE_NODES if the nodemask is empty (local allocation). 284 * MPOL_F_RELATIVE_NODES if the nodemask is empty (local allocation).
265 * All other modes require a valid pointer to a non-empty nodemask. 285 * All other modes require a valid pointer to a non-empty nodemask.
266 */ 286 */
267 if (mode == MPOL_PREFERRED) { 287 if (mode == MPOL_PREFERRED) {
268 if (nodes_empty(*nodes)) { 288 if (nodes_empty(*nodes)) {
269 if (((flags & MPOL_F_STATIC_NODES) || 289 if (((flags & MPOL_F_STATIC_NODES) ||
270 (flags & MPOL_F_RELATIVE_NODES))) 290 (flags & MPOL_F_RELATIVE_NODES)))
271 return ERR_PTR(-EINVAL); 291 return ERR_PTR(-EINVAL);
272 } 292 }
273 } else if (mode == MPOL_LOCAL) { 293 } else if (mode == MPOL_LOCAL) {
274 if (!nodes_empty(*nodes)) 294 if (!nodes_empty(*nodes))
275 return ERR_PTR(-EINVAL); 295 return ERR_PTR(-EINVAL);
276 mode = MPOL_PREFERRED; 296 mode = MPOL_PREFERRED;
277 } else if (nodes_empty(*nodes)) 297 } else if (nodes_empty(*nodes))
278 return ERR_PTR(-EINVAL); 298 return ERR_PTR(-EINVAL);
279 policy = kmem_cache_alloc(policy_cache, GFP_KERNEL); 299 policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
280 if (!policy) 300 if (!policy)
281 return ERR_PTR(-ENOMEM); 301 return ERR_PTR(-ENOMEM);
282 atomic_set(&policy->refcnt, 1); 302 atomic_set(&policy->refcnt, 1);
283 policy->mode = mode; 303 policy->mode = mode;
284 policy->flags = flags; 304 policy->flags = flags;
285 305
286 return policy; 306 return policy;
287 } 307 }
288 308
289 /* Slow path of a mpol destructor. */ 309 /* Slow path of a mpol destructor. */
290 void __mpol_put(struct mempolicy *p) 310 void __mpol_put(struct mempolicy *p)
291 { 311 {
292 if (!atomic_dec_and_test(&p->refcnt)) 312 if (!atomic_dec_and_test(&p->refcnt))
293 return; 313 return;
294 kmem_cache_free(policy_cache, p); 314 kmem_cache_free(policy_cache, p);
295 } 315 }
296 316
297 static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes, 317 static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes,
298 enum mpol_rebind_step step) 318 enum mpol_rebind_step step)
299 { 319 {
300 } 320 }
301 321
302 /* 322 /*
303 * step: 323 * step:
304 * MPOL_REBIND_ONCE - do rebind work at once 324 * MPOL_REBIND_ONCE - do rebind work at once
305 * MPOL_REBIND_STEP1 - set all the newly nodes 325 * MPOL_REBIND_STEP1 - set all the newly nodes
306 * MPOL_REBIND_STEP2 - clean all the disallowed nodes 326 * MPOL_REBIND_STEP2 - clean all the disallowed nodes
307 */ 327 */
308 static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes, 328 static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes,
309 enum mpol_rebind_step step) 329 enum mpol_rebind_step step)
310 { 330 {
311 nodemask_t tmp; 331 nodemask_t tmp;
312 332
313 if (pol->flags & MPOL_F_STATIC_NODES) 333 if (pol->flags & MPOL_F_STATIC_NODES)
314 nodes_and(tmp, pol->w.user_nodemask, *nodes); 334 nodes_and(tmp, pol->w.user_nodemask, *nodes);
315 else if (pol->flags & MPOL_F_RELATIVE_NODES) 335 else if (pol->flags & MPOL_F_RELATIVE_NODES)
316 mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes); 336 mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
317 else { 337 else {
318 /* 338 /*
319 * if step == 1, we use ->w.cpuset_mems_allowed to cache the 339 * if step == 1, we use ->w.cpuset_mems_allowed to cache the
320 * result 340 * result
321 */ 341 */
322 if (step == MPOL_REBIND_ONCE || step == MPOL_REBIND_STEP1) { 342 if (step == MPOL_REBIND_ONCE || step == MPOL_REBIND_STEP1) {
323 nodes_remap(tmp, pol->v.nodes, 343 nodes_remap(tmp, pol->v.nodes,
324 pol->w.cpuset_mems_allowed, *nodes); 344 pol->w.cpuset_mems_allowed, *nodes);
325 pol->w.cpuset_mems_allowed = step ? tmp : *nodes; 345 pol->w.cpuset_mems_allowed = step ? tmp : *nodes;
326 } else if (step == MPOL_REBIND_STEP2) { 346 } else if (step == MPOL_REBIND_STEP2) {
327 tmp = pol->w.cpuset_mems_allowed; 347 tmp = pol->w.cpuset_mems_allowed;
328 pol->w.cpuset_mems_allowed = *nodes; 348 pol->w.cpuset_mems_allowed = *nodes;
329 } else 349 } else
330 BUG(); 350 BUG();
331 } 351 }
332 352
333 if (nodes_empty(tmp)) 353 if (nodes_empty(tmp))
334 tmp = *nodes; 354 tmp = *nodes;
335 355
336 if (step == MPOL_REBIND_STEP1) 356 if (step == MPOL_REBIND_STEP1)
337 nodes_or(pol->v.nodes, pol->v.nodes, tmp); 357 nodes_or(pol->v.nodes, pol->v.nodes, tmp);
338 else if (step == MPOL_REBIND_ONCE || step == MPOL_REBIND_STEP2) 358 else if (step == MPOL_REBIND_ONCE || step == MPOL_REBIND_STEP2)
339 pol->v.nodes = tmp; 359 pol->v.nodes = tmp;
340 else 360 else
341 BUG(); 361 BUG();
342 362
343 if (!node_isset(current->il_next, tmp)) { 363 if (!node_isset(current->il_next, tmp)) {
344 current->il_next = next_node(current->il_next, tmp); 364 current->il_next = next_node(current->il_next, tmp);
345 if (current->il_next >= MAX_NUMNODES) 365 if (current->il_next >= MAX_NUMNODES)
346 current->il_next = first_node(tmp); 366 current->il_next = first_node(tmp);
347 if (current->il_next >= MAX_NUMNODES) 367 if (current->il_next >= MAX_NUMNODES)
348 current->il_next = numa_node_id(); 368 current->il_next = numa_node_id();
349 } 369 }
350 } 370 }
351 371
352 static void mpol_rebind_preferred(struct mempolicy *pol, 372 static void mpol_rebind_preferred(struct mempolicy *pol,
353 const nodemask_t *nodes, 373 const nodemask_t *nodes,
354 enum mpol_rebind_step step) 374 enum mpol_rebind_step step)
355 { 375 {
356 nodemask_t tmp; 376 nodemask_t tmp;
357 377
358 if (pol->flags & MPOL_F_STATIC_NODES) { 378 if (pol->flags & MPOL_F_STATIC_NODES) {
359 int node = first_node(pol->w.user_nodemask); 379 int node = first_node(pol->w.user_nodemask);
360 380
361 if (node_isset(node, *nodes)) { 381 if (node_isset(node, *nodes)) {
362 pol->v.preferred_node = node; 382 pol->v.preferred_node = node;
363 pol->flags &= ~MPOL_F_LOCAL; 383 pol->flags &= ~MPOL_F_LOCAL;
364 } else 384 } else
365 pol->flags |= MPOL_F_LOCAL; 385 pol->flags |= MPOL_F_LOCAL;
366 } else if (pol->flags & MPOL_F_RELATIVE_NODES) { 386 } else if (pol->flags & MPOL_F_RELATIVE_NODES) {
367 mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes); 387 mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
368 pol->v.preferred_node = first_node(tmp); 388 pol->v.preferred_node = first_node(tmp);
369 } else if (!(pol->flags & MPOL_F_LOCAL)) { 389 } else if (!(pol->flags & MPOL_F_LOCAL)) {
370 pol->v.preferred_node = node_remap(pol->v.preferred_node, 390 pol->v.preferred_node = node_remap(pol->v.preferred_node,
371 pol->w.cpuset_mems_allowed, 391 pol->w.cpuset_mems_allowed,
372 *nodes); 392 *nodes);
373 pol->w.cpuset_mems_allowed = *nodes; 393 pol->w.cpuset_mems_allowed = *nodes;
374 } 394 }
375 } 395 }
376 396
377 /* 397 /*
378 * mpol_rebind_policy - Migrate a policy to a different set of nodes 398 * mpol_rebind_policy - Migrate a policy to a different set of nodes
379 * 399 *
380 * If read-side task has no lock to protect task->mempolicy, write-side 400 * If read-side task has no lock to protect task->mempolicy, write-side
381 * task will rebind the task->mempolicy by two step. The first step is 401 * task will rebind the task->mempolicy by two step. The first step is
382 * setting all the newly nodes, and the second step is cleaning all the 402 * setting all the newly nodes, and the second step is cleaning all the
383 * disallowed nodes. In this way, we can avoid finding no node to alloc 403 * disallowed nodes. In this way, we can avoid finding no node to alloc
384 * page. 404 * page.
385 * If we have a lock to protect task->mempolicy in read-side, we do 405 * If we have a lock to protect task->mempolicy in read-side, we do
386 * rebind directly. 406 * rebind directly.
387 * 407 *
388 * step: 408 * step:
389 * MPOL_REBIND_ONCE - do rebind work at once 409 * MPOL_REBIND_ONCE - do rebind work at once
390 * MPOL_REBIND_STEP1 - set all the newly nodes 410 * MPOL_REBIND_STEP1 - set all the newly nodes
391 * MPOL_REBIND_STEP2 - clean all the disallowed nodes 411 * MPOL_REBIND_STEP2 - clean all the disallowed nodes
392 */ 412 */
393 static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask, 413 static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask,
394 enum mpol_rebind_step step) 414 enum mpol_rebind_step step)
395 { 415 {
396 if (!pol) 416 if (!pol)
397 return; 417 return;
398 if (!mpol_store_user_nodemask(pol) && step == MPOL_REBIND_ONCE && 418 if (!mpol_store_user_nodemask(pol) && step == MPOL_REBIND_ONCE &&
399 nodes_equal(pol->w.cpuset_mems_allowed, *newmask)) 419 nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
400 return; 420 return;
401 421
402 if (step == MPOL_REBIND_STEP1 && (pol->flags & MPOL_F_REBINDING)) 422 if (step == MPOL_REBIND_STEP1 && (pol->flags & MPOL_F_REBINDING))
403 return; 423 return;
404 424
405 if (step == MPOL_REBIND_STEP2 && !(pol->flags & MPOL_F_REBINDING)) 425 if (step == MPOL_REBIND_STEP2 && !(pol->flags & MPOL_F_REBINDING))
406 BUG(); 426 BUG();
407 427
408 if (step == MPOL_REBIND_STEP1) 428 if (step == MPOL_REBIND_STEP1)
409 pol->flags |= MPOL_F_REBINDING; 429 pol->flags |= MPOL_F_REBINDING;
410 else if (step == MPOL_REBIND_STEP2) 430 else if (step == MPOL_REBIND_STEP2)
411 pol->flags &= ~MPOL_F_REBINDING; 431 pol->flags &= ~MPOL_F_REBINDING;
412 else if (step >= MPOL_REBIND_NSTEP) 432 else if (step >= MPOL_REBIND_NSTEP)
413 BUG(); 433 BUG();
414 434
415 mpol_ops[pol->mode].rebind(pol, newmask, step); 435 mpol_ops[pol->mode].rebind(pol, newmask, step);
416 } 436 }
417 437
418 /* 438 /*
419 * Wrapper for mpol_rebind_policy() that just requires task 439 * Wrapper for mpol_rebind_policy() that just requires task
420 * pointer, and updates task mempolicy. 440 * pointer, and updates task mempolicy.
421 * 441 *
422 * Called with task's alloc_lock held. 442 * Called with task's alloc_lock held.
423 */ 443 */
424 444
425 void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new, 445 void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new,
426 enum mpol_rebind_step step) 446 enum mpol_rebind_step step)
427 { 447 {
428 mpol_rebind_policy(tsk->mempolicy, new, step); 448 mpol_rebind_policy(tsk->mempolicy, new, step);
429 } 449 }
430 450
431 /* 451 /*
432 * Rebind each vma in mm to new nodemask. 452 * Rebind each vma in mm to new nodemask.
433 * 453 *
434 * Call holding a reference to mm. Takes mm->mmap_sem during call. 454 * Call holding a reference to mm. Takes mm->mmap_sem during call.
435 */ 455 */
436 456
437 void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new) 457 void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
438 { 458 {
439 struct vm_area_struct *vma; 459 struct vm_area_struct *vma;
440 460
441 down_write(&mm->mmap_sem); 461 down_write(&mm->mmap_sem);
442 for (vma = mm->mmap; vma; vma = vma->vm_next) 462 for (vma = mm->mmap; vma; vma = vma->vm_next)
443 mpol_rebind_policy(vma->vm_policy, new, MPOL_REBIND_ONCE); 463 mpol_rebind_policy(vma->vm_policy, new, MPOL_REBIND_ONCE);
444 up_write(&mm->mmap_sem); 464 up_write(&mm->mmap_sem);
445 } 465 }
446 466
447 static const struct mempolicy_operations mpol_ops[MPOL_MAX] = { 467 static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
448 [MPOL_DEFAULT] = { 468 [MPOL_DEFAULT] = {
449 .rebind = mpol_rebind_default, 469 .rebind = mpol_rebind_default,
450 }, 470 },
451 [MPOL_INTERLEAVE] = { 471 [MPOL_INTERLEAVE] = {
452 .create = mpol_new_interleave, 472 .create = mpol_new_interleave,
453 .rebind = mpol_rebind_nodemask, 473 .rebind = mpol_rebind_nodemask,
454 }, 474 },
455 [MPOL_PREFERRED] = { 475 [MPOL_PREFERRED] = {
456 .create = mpol_new_preferred, 476 .create = mpol_new_preferred,
457 .rebind = mpol_rebind_preferred, 477 .rebind = mpol_rebind_preferred,
458 }, 478 },
459 [MPOL_BIND] = { 479 [MPOL_BIND] = {
460 .create = mpol_new_bind, 480 .create = mpol_new_bind,
461 .rebind = mpol_rebind_nodemask, 481 .rebind = mpol_rebind_nodemask,
462 }, 482 },
463 }; 483 };
464 484
465 static void migrate_page_add(struct page *page, struct list_head *pagelist, 485 static void migrate_page_add(struct page *page, struct list_head *pagelist,
466 unsigned long flags); 486 unsigned long flags);
467 487
468 /* Scan through pages checking if pages follow certain conditions. */ 488 /* Scan through pages checking if pages follow certain conditions. */
469 static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd, 489 static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
470 unsigned long addr, unsigned long end, 490 unsigned long addr, unsigned long end,
471 const nodemask_t *nodes, unsigned long flags, 491 const nodemask_t *nodes, unsigned long flags,
472 void *private) 492 void *private)
473 { 493 {
474 pte_t *orig_pte; 494 pte_t *orig_pte;
475 pte_t *pte; 495 pte_t *pte;
476 spinlock_t *ptl; 496 spinlock_t *ptl;
477 497
478 orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); 498 orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
479 do { 499 do {
480 struct page *page; 500 struct page *page;
481 int nid; 501 int nid;
482 502
483 if (!pte_present(*pte)) 503 if (!pte_present(*pte))
484 continue; 504 continue;
485 page = vm_normal_page(vma, addr, *pte); 505 page = vm_normal_page(vma, addr, *pte);
486 if (!page) 506 if (!page)
487 continue; 507 continue;
488 /* 508 /*
489 * vm_normal_page() filters out zero pages, but there might 509 * vm_normal_page() filters out zero pages, but there might
490 * still be PageReserved pages to skip, perhaps in a VDSO. 510 * still be PageReserved pages to skip, perhaps in a VDSO.
491 * And we cannot move PageKsm pages sensibly or safely yet. 511 * And we cannot move PageKsm pages sensibly or safely yet.
492 */ 512 */
493 if (PageReserved(page) || PageKsm(page)) 513 if (PageReserved(page) || PageKsm(page))
494 continue; 514 continue;
495 nid = page_to_nid(page); 515 nid = page_to_nid(page);
496 if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT)) 516 if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT))
497 continue; 517 continue;
498 518
499 if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) 519 if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
500 migrate_page_add(page, private, flags); 520 migrate_page_add(page, private, flags);
501 else 521 else
502 break; 522 break;
503 } while (pte++, addr += PAGE_SIZE, addr != end); 523 } while (pte++, addr += PAGE_SIZE, addr != end);
504 pte_unmap_unlock(orig_pte, ptl); 524 pte_unmap_unlock(orig_pte, ptl);
505 return addr != end; 525 return addr != end;
506 } 526 }
507 527
508 static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud, 528 static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
509 unsigned long addr, unsigned long end, 529 unsigned long addr, unsigned long end,
510 const nodemask_t *nodes, unsigned long flags, 530 const nodemask_t *nodes, unsigned long flags,
511 void *private) 531 void *private)
512 { 532 {
513 pmd_t *pmd; 533 pmd_t *pmd;
514 unsigned long next; 534 unsigned long next;
515 535
516 pmd = pmd_offset(pud, addr); 536 pmd = pmd_offset(pud, addr);
517 do { 537 do {
518 next = pmd_addr_end(addr, end); 538 next = pmd_addr_end(addr, end);
519 split_huge_page_pmd(vma->vm_mm, pmd); 539 split_huge_page_pmd(vma->vm_mm, pmd);
520 if (pmd_none_or_trans_huge_or_clear_bad(pmd)) 540 if (pmd_none_or_trans_huge_or_clear_bad(pmd))
521 continue; 541 continue;
522 if (check_pte_range(vma, pmd, addr, next, nodes, 542 if (check_pte_range(vma, pmd, addr, next, nodes,
523 flags, private)) 543 flags, private))
524 return -EIO; 544 return -EIO;
525 } while (pmd++, addr = next, addr != end); 545 } while (pmd++, addr = next, addr != end);
526 return 0; 546 return 0;
527 } 547 }
528 548
529 static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd, 549 static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
530 unsigned long addr, unsigned long end, 550 unsigned long addr, unsigned long end,
531 const nodemask_t *nodes, unsigned long flags, 551 const nodemask_t *nodes, unsigned long flags,
532 void *private) 552 void *private)
533 { 553 {
534 pud_t *pud; 554 pud_t *pud;
535 unsigned long next; 555 unsigned long next;
536 556
537 pud = pud_offset(pgd, addr); 557 pud = pud_offset(pgd, addr);
538 do { 558 do {
539 next = pud_addr_end(addr, end); 559 next = pud_addr_end(addr, end);
540 if (pud_none_or_clear_bad(pud)) 560 if (pud_none_or_clear_bad(pud))
541 continue; 561 continue;
542 if (check_pmd_range(vma, pud, addr, next, nodes, 562 if (check_pmd_range(vma, pud, addr, next, nodes,
543 flags, private)) 563 flags, private))
544 return -EIO; 564 return -EIO;
545 } while (pud++, addr = next, addr != end); 565 } while (pud++, addr = next, addr != end);
546 return 0; 566 return 0;
547 } 567 }
548 568
549 static inline int check_pgd_range(struct vm_area_struct *vma, 569 static inline int check_pgd_range(struct vm_area_struct *vma,
550 unsigned long addr, unsigned long end, 570 unsigned long addr, unsigned long end,
551 const nodemask_t *nodes, unsigned long flags, 571 const nodemask_t *nodes, unsigned long flags,
552 void *private) 572 void *private)
553 { 573 {
554 pgd_t *pgd; 574 pgd_t *pgd;
555 unsigned long next; 575 unsigned long next;
556 576
557 pgd = pgd_offset(vma->vm_mm, addr); 577 pgd = pgd_offset(vma->vm_mm, addr);
558 do { 578 do {
559 next = pgd_addr_end(addr, end); 579 next = pgd_addr_end(addr, end);
560 if (pgd_none_or_clear_bad(pgd)) 580 if (pgd_none_or_clear_bad(pgd))
561 continue; 581 continue;
562 if (check_pud_range(vma, pgd, addr, next, nodes, 582 if (check_pud_range(vma, pgd, addr, next, nodes,
563 flags, private)) 583 flags, private))
564 return -EIO; 584 return -EIO;
565 } while (pgd++, addr = next, addr != end); 585 } while (pgd++, addr = next, addr != end);
566 return 0; 586 return 0;
567 } 587 }
568 588
569 #ifdef CONFIG_ARCH_USES_NUMA_PROT_NONE 589 #ifdef CONFIG_ARCH_USES_NUMA_PROT_NONE
570 /* 590 /*
571 * This is used to mark a range of virtual addresses to be inaccessible. 591 * This is used to mark a range of virtual addresses to be inaccessible.
572 * These are later cleared by a NUMA hinting fault. Depending on these 592 * These are later cleared by a NUMA hinting fault. Depending on these
573 * faults, pages may be migrated for better NUMA placement. 593 * faults, pages may be migrated for better NUMA placement.
574 * 594 *
575 * This is assuming that NUMA faults are handled using PROT_NONE. If 595 * This is assuming that NUMA faults are handled using PROT_NONE. If
576 * an architecture makes a different choice, it will need further 596 * an architecture makes a different choice, it will need further
577 * changes to the core. 597 * changes to the core.
578 */ 598 */
579 unsigned long change_prot_numa(struct vm_area_struct *vma, 599 unsigned long change_prot_numa(struct vm_area_struct *vma,
580 unsigned long addr, unsigned long end) 600 unsigned long addr, unsigned long end)
581 { 601 {
582 int nr_updated; 602 int nr_updated;
583 BUILD_BUG_ON(_PAGE_NUMA != _PAGE_PROTNONE); 603 BUILD_BUG_ON(_PAGE_NUMA != _PAGE_PROTNONE);
584 604
585 nr_updated = change_protection(vma, addr, end, vma->vm_page_prot, 0, 1); 605 nr_updated = change_protection(vma, addr, end, vma->vm_page_prot, 0, 1);
586 if (nr_updated) 606 if (nr_updated)
587 count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated); 607 count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated);
588 608
589 return nr_updated; 609 return nr_updated;
590 } 610 }
591 #else 611 #else
592 static unsigned long change_prot_numa(struct vm_area_struct *vma, 612 static unsigned long change_prot_numa(struct vm_area_struct *vma,
593 unsigned long addr, unsigned long end) 613 unsigned long addr, unsigned long end)
594 { 614 {
595 return 0; 615 return 0;
596 } 616 }
597 #endif /* CONFIG_ARCH_USES_NUMA_PROT_NONE */ 617 #endif /* CONFIG_ARCH_USES_NUMA_PROT_NONE */
598 618
599 /* 619 /*
600 * Check if all pages in a range are on a set of nodes. 620 * Check if all pages in a range are on a set of nodes.
601 * If pagelist != NULL then isolate pages from the LRU and 621 * If pagelist != NULL then isolate pages from the LRU and
602 * put them on the pagelist. 622 * put them on the pagelist.
603 */ 623 */
604 static struct vm_area_struct * 624 static struct vm_area_struct *
605 check_range(struct mm_struct *mm, unsigned long start, unsigned long end, 625 check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
606 const nodemask_t *nodes, unsigned long flags, void *private) 626 const nodemask_t *nodes, unsigned long flags, void *private)
607 { 627 {
608 int err; 628 int err;
609 struct vm_area_struct *first, *vma, *prev; 629 struct vm_area_struct *first, *vma, *prev;
610 630
611 631
612 first = find_vma(mm, start); 632 first = find_vma(mm, start);
613 if (!first) 633 if (!first)
614 return ERR_PTR(-EFAULT); 634 return ERR_PTR(-EFAULT);
615 prev = NULL; 635 prev = NULL;
616 for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) { 636 for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
617 unsigned long endvma = vma->vm_end; 637 unsigned long endvma = vma->vm_end;
618 638
619 if (endvma > end) 639 if (endvma > end)
620 endvma = end; 640 endvma = end;
621 if (vma->vm_start > start) 641 if (vma->vm_start > start)
622 start = vma->vm_start; 642 start = vma->vm_start;
623 643
624 if (!(flags & MPOL_MF_DISCONTIG_OK)) { 644 if (!(flags & MPOL_MF_DISCONTIG_OK)) {
625 if (!vma->vm_next && vma->vm_end < end) 645 if (!vma->vm_next && vma->vm_end < end)
626 return ERR_PTR(-EFAULT); 646 return ERR_PTR(-EFAULT);
627 if (prev && prev->vm_end < vma->vm_start) 647 if (prev && prev->vm_end < vma->vm_start)
628 return ERR_PTR(-EFAULT); 648 return ERR_PTR(-EFAULT);
629 } 649 }
630 650
631 if (is_vm_hugetlb_page(vma)) 651 if (is_vm_hugetlb_page(vma))
632 goto next; 652 goto next;
633 653
634 if (flags & MPOL_MF_LAZY) { 654 if (flags & MPOL_MF_LAZY) {
635 change_prot_numa(vma, start, endvma); 655 change_prot_numa(vma, start, endvma);
636 goto next; 656 goto next;
637 } 657 }
638 658
639 if ((flags & MPOL_MF_STRICT) || 659 if ((flags & MPOL_MF_STRICT) ||
640 ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) && 660 ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) &&
641 vma_migratable(vma))) { 661 vma_migratable(vma))) {
642 662
643 err = check_pgd_range(vma, start, endvma, nodes, 663 err = check_pgd_range(vma, start, endvma, nodes,
644 flags, private); 664 flags, private);
645 if (err) { 665 if (err) {
646 first = ERR_PTR(err); 666 first = ERR_PTR(err);
647 break; 667 break;
648 } 668 }
649 } 669 }
650 next: 670 next:
651 prev = vma; 671 prev = vma;
652 } 672 }
653 return first; 673 return first;
654 } 674 }
655 675
656 /* 676 /*
657 * Apply policy to a single VMA 677 * Apply policy to a single VMA
658 * This must be called with the mmap_sem held for writing. 678 * This must be called with the mmap_sem held for writing.
659 */ 679 */
660 static int vma_replace_policy(struct vm_area_struct *vma, 680 static int vma_replace_policy(struct vm_area_struct *vma,
661 struct mempolicy *pol) 681 struct mempolicy *pol)
662 { 682 {
663 int err; 683 int err;
664 struct mempolicy *old; 684 struct mempolicy *old;
665 struct mempolicy *new; 685 struct mempolicy *new;
666 686
667 pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n", 687 pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
668 vma->vm_start, vma->vm_end, vma->vm_pgoff, 688 vma->vm_start, vma->vm_end, vma->vm_pgoff,
669 vma->vm_ops, vma->vm_file, 689 vma->vm_ops, vma->vm_file,
670 vma->vm_ops ? vma->vm_ops->set_policy : NULL); 690 vma->vm_ops ? vma->vm_ops->set_policy : NULL);
671 691
672 new = mpol_dup(pol); 692 new = mpol_dup(pol);
673 if (IS_ERR(new)) 693 if (IS_ERR(new))
674 return PTR_ERR(new); 694 return PTR_ERR(new);
675 695
676 if (vma->vm_ops && vma->vm_ops->set_policy) { 696 if (vma->vm_ops && vma->vm_ops->set_policy) {
677 err = vma->vm_ops->set_policy(vma, new); 697 err = vma->vm_ops->set_policy(vma, new);
678 if (err) 698 if (err)
679 goto err_out; 699 goto err_out;
680 } 700 }
681 701
682 old = vma->vm_policy; 702 old = vma->vm_policy;
683 vma->vm_policy = new; /* protected by mmap_sem */ 703 vma->vm_policy = new; /* protected by mmap_sem */
684 mpol_put(old); 704 mpol_put(old);
685 705
686 return 0; 706 return 0;
687 err_out: 707 err_out:
688 mpol_put(new); 708 mpol_put(new);
689 return err; 709 return err;
690 } 710 }
691 711
692 /* Step 2: apply policy to a range and do splits. */ 712 /* Step 2: apply policy to a range and do splits. */
693 static int mbind_range(struct mm_struct *mm, unsigned long start, 713 static int mbind_range(struct mm_struct *mm, unsigned long start,
694 unsigned long end, struct mempolicy *new_pol) 714 unsigned long end, struct mempolicy *new_pol)
695 { 715 {
696 struct vm_area_struct *next; 716 struct vm_area_struct *next;
697 struct vm_area_struct *prev; 717 struct vm_area_struct *prev;
698 struct vm_area_struct *vma; 718 struct vm_area_struct *vma;
699 int err = 0; 719 int err = 0;
700 pgoff_t pgoff; 720 pgoff_t pgoff;
701 unsigned long vmstart; 721 unsigned long vmstart;
702 unsigned long vmend; 722 unsigned long vmend;
703 723
704 vma = find_vma(mm, start); 724 vma = find_vma(mm, start);
705 if (!vma || vma->vm_start > start) 725 if (!vma || vma->vm_start > start)
706 return -EFAULT; 726 return -EFAULT;
707 727
708 prev = vma->vm_prev; 728 prev = vma->vm_prev;
709 if (start > vma->vm_start) 729 if (start > vma->vm_start)
710 prev = vma; 730 prev = vma;
711 731
712 for (; vma && vma->vm_start < end; prev = vma, vma = next) { 732 for (; vma && vma->vm_start < end; prev = vma, vma = next) {
713 next = vma->vm_next; 733 next = vma->vm_next;
714 vmstart = max(start, vma->vm_start); 734 vmstart = max(start, vma->vm_start);
715 vmend = min(end, vma->vm_end); 735 vmend = min(end, vma->vm_end);
716 736
717 if (mpol_equal(vma_policy(vma), new_pol)) 737 if (mpol_equal(vma_policy(vma), new_pol))
718 continue; 738 continue;
719 739
720 pgoff = vma->vm_pgoff + 740 pgoff = vma->vm_pgoff +
721 ((vmstart - vma->vm_start) >> PAGE_SHIFT); 741 ((vmstart - vma->vm_start) >> PAGE_SHIFT);
722 prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags, 742 prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags,
723 vma->anon_vma, vma->vm_file, pgoff, 743 vma->anon_vma, vma->vm_file, pgoff,
724 new_pol); 744 new_pol);
725 if (prev) { 745 if (prev) {
726 vma = prev; 746 vma = prev;
727 next = vma->vm_next; 747 next = vma->vm_next;
728 continue; 748 continue;
729 } 749 }
730 if (vma->vm_start != vmstart) { 750 if (vma->vm_start != vmstart) {
731 err = split_vma(vma->vm_mm, vma, vmstart, 1); 751 err = split_vma(vma->vm_mm, vma, vmstart, 1);
732 if (err) 752 if (err)
733 goto out; 753 goto out;
734 } 754 }
735 if (vma->vm_end != vmend) { 755 if (vma->vm_end != vmend) {
736 err = split_vma(vma->vm_mm, vma, vmend, 0); 756 err = split_vma(vma->vm_mm, vma, vmend, 0);
737 if (err) 757 if (err)
738 goto out; 758 goto out;
739 } 759 }
740 err = vma_replace_policy(vma, new_pol); 760 err = vma_replace_policy(vma, new_pol);
741 if (err) 761 if (err)
742 goto out; 762 goto out;
743 } 763 }
744 764
745 out: 765 out:
746 return err; 766 return err;
747 } 767 }
748 768
749 /* 769 /*
750 * Update task->flags PF_MEMPOLICY bit: set iff non-default 770 * Update task->flags PF_MEMPOLICY bit: set iff non-default
751 * mempolicy. Allows more rapid checking of this (combined perhaps 771 * mempolicy. Allows more rapid checking of this (combined perhaps
752 * with other PF_* flag bits) on memory allocation hot code paths. 772 * with other PF_* flag bits) on memory allocation hot code paths.
753 * 773 *
754 * If called from outside this file, the task 'p' should -only- be 774 * If called from outside this file, the task 'p' should -only- be
755 * a newly forked child not yet visible on the task list, because 775 * a newly forked child not yet visible on the task list, because
756 * manipulating the task flags of a visible task is not safe. 776 * manipulating the task flags of a visible task is not safe.
757 * 777 *
758 * The above limitation is why this routine has the funny name 778 * The above limitation is why this routine has the funny name
759 * mpol_fix_fork_child_flag(). 779 * mpol_fix_fork_child_flag().
760 * 780 *
761 * It is also safe to call this with a task pointer of current, 781 * It is also safe to call this with a task pointer of current,
762 * which the static wrapper mpol_set_task_struct_flag() does, 782 * which the static wrapper mpol_set_task_struct_flag() does,
763 * for use within this file. 783 * for use within this file.
764 */ 784 */
765 785
766 void mpol_fix_fork_child_flag(struct task_struct *p) 786 void mpol_fix_fork_child_flag(struct task_struct *p)
767 { 787 {
768 if (p->mempolicy) 788 if (p->mempolicy)
769 p->flags |= PF_MEMPOLICY; 789 p->flags |= PF_MEMPOLICY;
770 else 790 else
771 p->flags &= ~PF_MEMPOLICY; 791 p->flags &= ~PF_MEMPOLICY;
772 } 792 }
773 793
774 static void mpol_set_task_struct_flag(void) 794 static void mpol_set_task_struct_flag(void)
775 { 795 {
776 mpol_fix_fork_child_flag(current); 796 mpol_fix_fork_child_flag(current);
777 } 797 }
778 798
779 /* Set the process memory policy */ 799 /* Set the process memory policy */
780 static long do_set_mempolicy(unsigned short mode, unsigned short flags, 800 static long do_set_mempolicy(unsigned short mode, unsigned short flags,
781 nodemask_t *nodes) 801 nodemask_t *nodes)
782 { 802 {
783 struct mempolicy *new, *old; 803 struct mempolicy *new, *old;
784 struct mm_struct *mm = current->mm; 804 struct mm_struct *mm = current->mm;
785 NODEMASK_SCRATCH(scratch); 805 NODEMASK_SCRATCH(scratch);
786 int ret; 806 int ret;
787 807
788 if (!scratch) 808 if (!scratch)
789 return -ENOMEM; 809 return -ENOMEM;
790 810
791 new = mpol_new(mode, flags, nodes); 811 new = mpol_new(mode, flags, nodes);
792 if (IS_ERR(new)) { 812 if (IS_ERR(new)) {
793 ret = PTR_ERR(new); 813 ret = PTR_ERR(new);
794 goto out; 814 goto out;
795 } 815 }
796 /* 816 /*
797 * prevent changing our mempolicy while show_numa_maps() 817 * prevent changing our mempolicy while show_numa_maps()
798 * is using it. 818 * is using it.
799 * Note: do_set_mempolicy() can be called at init time 819 * Note: do_set_mempolicy() can be called at init time
800 * with no 'mm'. 820 * with no 'mm'.
801 */ 821 */
802 if (mm) 822 if (mm)
803 down_write(&mm->mmap_sem); 823 down_write(&mm->mmap_sem);
804 task_lock(current); 824 task_lock(current);
805 ret = mpol_set_nodemask(new, nodes, scratch); 825 ret = mpol_set_nodemask(new, nodes, scratch);
806 if (ret) { 826 if (ret) {
807 task_unlock(current); 827 task_unlock(current);
808 if (mm) 828 if (mm)
809 up_write(&mm->mmap_sem); 829 up_write(&mm->mmap_sem);
810 mpol_put(new); 830 mpol_put(new);
811 goto out; 831 goto out;
812 } 832 }
813 old = current->mempolicy; 833 old = current->mempolicy;
814 current->mempolicy = new; 834 current->mempolicy = new;
815 mpol_set_task_struct_flag(); 835 mpol_set_task_struct_flag();
816 if (new && new->mode == MPOL_INTERLEAVE && 836 if (new && new->mode == MPOL_INTERLEAVE &&
817 nodes_weight(new->v.nodes)) 837 nodes_weight(new->v.nodes))
818 current->il_next = first_node(new->v.nodes); 838 current->il_next = first_node(new->v.nodes);
819 task_unlock(current); 839 task_unlock(current);
820 if (mm) 840 if (mm)
821 up_write(&mm->mmap_sem); 841 up_write(&mm->mmap_sem);
822 842
823 mpol_put(old); 843 mpol_put(old);
824 ret = 0; 844 ret = 0;
825 out: 845 out:
826 NODEMASK_SCRATCH_FREE(scratch); 846 NODEMASK_SCRATCH_FREE(scratch);
827 return ret; 847 return ret;
828 } 848 }
829 849
830 /* 850 /*
831 * Return nodemask for policy for get_mempolicy() query 851 * Return nodemask for policy for get_mempolicy() query
832 * 852 *
833 * Called with task's alloc_lock held 853 * Called with task's alloc_lock held
834 */ 854 */
835 static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes) 855 static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes)
836 { 856 {
837 nodes_clear(*nodes); 857 nodes_clear(*nodes);
838 if (p == &default_policy) 858 if (p == &default_policy)
839 return; 859 return;
840 860
841 switch (p->mode) { 861 switch (p->mode) {
842 case MPOL_BIND: 862 case MPOL_BIND:
843 /* Fall through */ 863 /* Fall through */
844 case MPOL_INTERLEAVE: 864 case MPOL_INTERLEAVE:
845 *nodes = p->v.nodes; 865 *nodes = p->v.nodes;
846 break; 866 break;
847 case MPOL_PREFERRED: 867 case MPOL_PREFERRED:
848 if (!(p->flags & MPOL_F_LOCAL)) 868 if (!(p->flags & MPOL_F_LOCAL))
849 node_set(p->v.preferred_node, *nodes); 869 node_set(p->v.preferred_node, *nodes);
850 /* else return empty node mask for local allocation */ 870 /* else return empty node mask for local allocation */
851 break; 871 break;
852 default: 872 default:
853 BUG(); 873 BUG();
854 } 874 }
855 } 875 }
856 876
857 static int lookup_node(struct mm_struct *mm, unsigned long addr) 877 static int lookup_node(struct mm_struct *mm, unsigned long addr)
858 { 878 {
859 struct page *p; 879 struct page *p;
860 int err; 880 int err;
861 881
862 err = get_user_pages(current, mm, addr & PAGE_MASK, 1, 0, 0, &p, NULL); 882 err = get_user_pages(current, mm, addr & PAGE_MASK, 1, 0, 0, &p, NULL);
863 if (err >= 0) { 883 if (err >= 0) {
864 err = page_to_nid(p); 884 err = page_to_nid(p);
865 put_page(p); 885 put_page(p);
866 } 886 }
867 return err; 887 return err;
868 } 888 }
869 889
870 /* Retrieve NUMA policy */ 890 /* Retrieve NUMA policy */
871 static long do_get_mempolicy(int *policy, nodemask_t *nmask, 891 static long do_get_mempolicy(int *policy, nodemask_t *nmask,
872 unsigned long addr, unsigned long flags) 892 unsigned long addr, unsigned long flags)
873 { 893 {
874 int err; 894 int err;
875 struct mm_struct *mm = current->mm; 895 struct mm_struct *mm = current->mm;
876 struct vm_area_struct *vma = NULL; 896 struct vm_area_struct *vma = NULL;
877 struct mempolicy *pol = current->mempolicy; 897 struct mempolicy *pol = current->mempolicy;
878 898
879 if (flags & 899 if (flags &
880 ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED)) 900 ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
881 return -EINVAL; 901 return -EINVAL;
882 902
883 if (flags & MPOL_F_MEMS_ALLOWED) { 903 if (flags & MPOL_F_MEMS_ALLOWED) {
884 if (flags & (MPOL_F_NODE|MPOL_F_ADDR)) 904 if (flags & (MPOL_F_NODE|MPOL_F_ADDR))
885 return -EINVAL; 905 return -EINVAL;
886 *policy = 0; /* just so it's initialized */ 906 *policy = 0; /* just so it's initialized */
887 task_lock(current); 907 task_lock(current);
888 *nmask = cpuset_current_mems_allowed; 908 *nmask = cpuset_current_mems_allowed;
889 task_unlock(current); 909 task_unlock(current);
890 return 0; 910 return 0;
891 } 911 }
892 912
893 if (flags & MPOL_F_ADDR) { 913 if (flags & MPOL_F_ADDR) {
894 /* 914 /*
895 * Do NOT fall back to task policy if the 915 * Do NOT fall back to task policy if the
896 * vma/shared policy at addr is NULL. We 916 * vma/shared policy at addr is NULL. We
897 * want to return MPOL_DEFAULT in this case. 917 * want to return MPOL_DEFAULT in this case.
898 */ 918 */
899 down_read(&mm->mmap_sem); 919 down_read(&mm->mmap_sem);
900 vma = find_vma_intersection(mm, addr, addr+1); 920 vma = find_vma_intersection(mm, addr, addr+1);
901 if (!vma) { 921 if (!vma) {
902 up_read(&mm->mmap_sem); 922 up_read(&mm->mmap_sem);
903 return -EFAULT; 923 return -EFAULT;
904 } 924 }
905 if (vma->vm_ops && vma->vm_ops->get_policy) 925 if (vma->vm_ops && vma->vm_ops->get_policy)
906 pol = vma->vm_ops->get_policy(vma, addr); 926 pol = vma->vm_ops->get_policy(vma, addr);
907 else 927 else
908 pol = vma->vm_policy; 928 pol = vma->vm_policy;
909 } else if (addr) 929 } else if (addr)
910 return -EINVAL; 930 return -EINVAL;
911 931
912 if (!pol) 932 if (!pol)
913 pol = &default_policy; /* indicates default behavior */ 933 pol = &default_policy; /* indicates default behavior */
914 934
915 if (flags & MPOL_F_NODE) { 935 if (flags & MPOL_F_NODE) {
916 if (flags & MPOL_F_ADDR) { 936 if (flags & MPOL_F_ADDR) {
917 err = lookup_node(mm, addr); 937 err = lookup_node(mm, addr);
918 if (err < 0) 938 if (err < 0)
919 goto out; 939 goto out;
920 *policy = err; 940 *policy = err;
921 } else if (pol == current->mempolicy && 941 } else if (pol == current->mempolicy &&
922 pol->mode == MPOL_INTERLEAVE) { 942 pol->mode == MPOL_INTERLEAVE) {
923 *policy = current->il_next; 943 *policy = current->il_next;
924 } else { 944 } else {
925 err = -EINVAL; 945 err = -EINVAL;
926 goto out; 946 goto out;
927 } 947 }
928 } else { 948 } else {
929 *policy = pol == &default_policy ? MPOL_DEFAULT : 949 *policy = pol == &default_policy ? MPOL_DEFAULT :
930 pol->mode; 950 pol->mode;
931 /* 951 /*
932 * Internal mempolicy flags must be masked off before exposing 952 * Internal mempolicy flags must be masked off before exposing
933 * the policy to userspace. 953 * the policy to userspace.
934 */ 954 */
935 *policy |= (pol->flags & MPOL_MODE_FLAGS); 955 *policy |= (pol->flags & MPOL_MODE_FLAGS);
936 } 956 }
937 957
938 if (vma) { 958 if (vma) {
939 up_read(&current->mm->mmap_sem); 959 up_read(&current->mm->mmap_sem);
940 vma = NULL; 960 vma = NULL;
941 } 961 }
942 962
943 err = 0; 963 err = 0;
944 if (nmask) { 964 if (nmask) {
945 if (mpol_store_user_nodemask(pol)) { 965 if (mpol_store_user_nodemask(pol)) {
946 *nmask = pol->w.user_nodemask; 966 *nmask = pol->w.user_nodemask;
947 } else { 967 } else {
948 task_lock(current); 968 task_lock(current);
949 get_policy_nodemask(pol, nmask); 969 get_policy_nodemask(pol, nmask);
950 task_unlock(current); 970 task_unlock(current);
951 } 971 }
952 } 972 }
953 973
954 out: 974 out:
955 mpol_cond_put(pol); 975 mpol_cond_put(pol);
956 if (vma) 976 if (vma)
957 up_read(&current->mm->mmap_sem); 977 up_read(&current->mm->mmap_sem);
958 return err; 978 return err;
959 } 979 }
960 980
961 #ifdef CONFIG_MIGRATION 981 #ifdef CONFIG_MIGRATION
962 /* 982 /*
963 * page migration 983 * page migration
964 */ 984 */
965 static void migrate_page_add(struct page *page, struct list_head *pagelist, 985 static void migrate_page_add(struct page *page, struct list_head *pagelist,
966 unsigned long flags) 986 unsigned long flags)
967 { 987 {
968 /* 988 /*
969 * Avoid migrating a page that is shared with others. 989 * Avoid migrating a page that is shared with others.
970 */ 990 */
971 if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1) { 991 if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1) {
972 if (!isolate_lru_page(page)) { 992 if (!isolate_lru_page(page)) {
973 list_add_tail(&page->lru, pagelist); 993 list_add_tail(&page->lru, pagelist);
974 inc_zone_page_state(page, NR_ISOLATED_ANON + 994 inc_zone_page_state(page, NR_ISOLATED_ANON +
975 page_is_file_cache(page)); 995 page_is_file_cache(page));
976 } 996 }
977 } 997 }
978 } 998 }
979 999
980 static struct page *new_node_page(struct page *page, unsigned long node, int **x) 1000 static struct page *new_node_page(struct page *page, unsigned long node, int **x)
981 { 1001 {
982 return alloc_pages_exact_node(node, GFP_HIGHUSER_MOVABLE, 0); 1002 return alloc_pages_exact_node(node, GFP_HIGHUSER_MOVABLE, 0);
983 } 1003 }
984 1004
985 /* 1005 /*
986 * Migrate pages from one node to a target node. 1006 * Migrate pages from one node to a target node.
987 * Returns error or the number of pages not migrated. 1007 * Returns error or the number of pages not migrated.
988 */ 1008 */
989 static int migrate_to_node(struct mm_struct *mm, int source, int dest, 1009 static int migrate_to_node(struct mm_struct *mm, int source, int dest,
990 int flags) 1010 int flags)
991 { 1011 {
992 nodemask_t nmask; 1012 nodemask_t nmask;
993 LIST_HEAD(pagelist); 1013 LIST_HEAD(pagelist);
994 int err = 0; 1014 int err = 0;
995 1015
996 nodes_clear(nmask); 1016 nodes_clear(nmask);
997 node_set(source, nmask); 1017 node_set(source, nmask);
998 1018
999 /* 1019 /*
1000 * This does not "check" the range but isolates all pages that 1020 * This does not "check" the range but isolates all pages that
1001 * need migration. Between passing in the full user address 1021 * need migration. Between passing in the full user address
1002 * space range and MPOL_MF_DISCONTIG_OK, this call cannot fail. 1022 * space range and MPOL_MF_DISCONTIG_OK, this call cannot fail.
1003 */ 1023 */
1004 VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))); 1024 VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)));
1005 check_range(mm, mm->mmap->vm_start, mm->task_size, &nmask, 1025 check_range(mm, mm->mmap->vm_start, mm->task_size, &nmask,
1006 flags | MPOL_MF_DISCONTIG_OK, &pagelist); 1026 flags | MPOL_MF_DISCONTIG_OK, &pagelist);
1007 1027
1008 if (!list_empty(&pagelist)) { 1028 if (!list_empty(&pagelist)) {
1009 err = migrate_pages(&pagelist, new_node_page, dest, 1029 err = migrate_pages(&pagelist, new_node_page, dest,
1010 false, MIGRATE_SYNC, 1030 false, MIGRATE_SYNC,
1011 MR_SYSCALL); 1031 MR_SYSCALL);
1012 if (err) 1032 if (err)
1013 putback_lru_pages(&pagelist); 1033 putback_lru_pages(&pagelist);
1014 } 1034 }
1015 1035
1016 return err; 1036 return err;
1017 } 1037 }
1018 1038
1019 /* 1039 /*
1020 * Move pages between the two nodesets so as to preserve the physical 1040 * Move pages between the two nodesets so as to preserve the physical
1021 * layout as much as possible. 1041 * layout as much as possible.
1022 * 1042 *
1023 * Returns the number of pages that could not be moved. 1043 * Returns the number of pages that could not be moved.
1024 */ 1044 */
1025 int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from, 1045 int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
1026 const nodemask_t *to, int flags) 1046 const nodemask_t *to, int flags)
1027 { 1047 {
1028 int busy = 0; 1048 int busy = 0;
1029 int err; 1049 int err;
1030 nodemask_t tmp; 1050 nodemask_t tmp;
1031 1051
1032 err = migrate_prep(); 1052 err = migrate_prep();
1033 if (err) 1053 if (err)
1034 return err; 1054 return err;
1035 1055
1036 down_read(&mm->mmap_sem); 1056 down_read(&mm->mmap_sem);
1037 1057
1038 err = migrate_vmas(mm, from, to, flags); 1058 err = migrate_vmas(mm, from, to, flags);
1039 if (err) 1059 if (err)
1040 goto out; 1060 goto out;
1041 1061
1042 /* 1062 /*
1043 * Find a 'source' bit set in 'tmp' whose corresponding 'dest' 1063 * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
1044 * bit in 'to' is not also set in 'tmp'. Clear the found 'source' 1064 * bit in 'to' is not also set in 'tmp'. Clear the found 'source'
1045 * bit in 'tmp', and return that <source, dest> pair for migration. 1065 * bit in 'tmp', and return that <source, dest> pair for migration.
1046 * The pair of nodemasks 'to' and 'from' define the map. 1066 * The pair of nodemasks 'to' and 'from' define the map.
1047 * 1067 *
1048 * If no pair of bits is found that way, fall back to picking some 1068 * If no pair of bits is found that way, fall back to picking some
1049 * pair of 'source' and 'dest' bits that are not the same. If the 1069 * pair of 'source' and 'dest' bits that are not the same. If the
1050 * 'source' and 'dest' bits are the same, this represents a node 1070 * 'source' and 'dest' bits are the same, this represents a node
1051 * that will be migrating to itself, so no pages need move. 1071 * that will be migrating to itself, so no pages need move.
1052 * 1072 *
1053 * If no bits are left in 'tmp', or if all remaining bits left 1073 * If no bits are left in 'tmp', or if all remaining bits left
1054 * in 'tmp' correspond to the same bit in 'to', return false 1074 * in 'tmp' correspond to the same bit in 'to', return false
1055 * (nothing left to migrate). 1075 * (nothing left to migrate).
1056 * 1076 *
1057 * This lets us pick a pair of nodes to migrate between, such that 1077 * This lets us pick a pair of nodes to migrate between, such that
1058 * if possible the dest node is not already occupied by some other 1078 * if possible the dest node is not already occupied by some other
1059 * source node, minimizing the risk of overloading the memory on a 1079 * source node, minimizing the risk of overloading the memory on a
1060 * node that would happen if we migrated incoming memory to a node 1080 * node that would happen if we migrated incoming memory to a node
1061 * before migrating outgoing memory sourced from that same node. 1081 * before migrating outgoing memory sourced from that same node.
1062 * 1082 *
1063 * A single scan of tmp is sufficient. As we go, we remember the 1083 * A single scan of tmp is sufficient. As we go, we remember the
1064 * most recent <s, d> pair that moved (s != d). If we find a pair 1084 * most recent <s, d> pair that moved (s != d). If we find a pair
1065 * that not only moved, but what's better, moved to an empty slot 1085 * that not only moved, but what's better, moved to an empty slot
1066 * (d is not set in tmp), then we break out then, with that pair. 1086 * (d is not set in tmp), then we break out then, with that pair.
1067 * Otherwise, when we finish scanning tmp, we at least have the 1087 * Otherwise, when we finish scanning tmp, we at least have the
1068 * most recent <s, d> pair that moved. If we get all the way through 1088 * most recent <s, d> pair that moved. If we get all the way through
1069 * the scan of tmp without finding any node that moved, much less 1089 * the scan of tmp without finding any node that moved, much less
1070 * moved to an empty node, then there is nothing left worth migrating. 1090 * moved to an empty node, then there is nothing left worth migrating.
1071 */ 1091 */
1072 1092
1073 tmp = *from; 1093 tmp = *from;
1074 while (!nodes_empty(tmp)) { 1094 while (!nodes_empty(tmp)) {
1075 int s,d; 1095 int s,d;
1076 int source = -1; 1096 int source = -1;
1077 int dest = 0; 1097 int dest = 0;
1078 1098
1079 for_each_node_mask(s, tmp) { 1099 for_each_node_mask(s, tmp) {
1080 1100
1081 /* 1101 /*
1082 * do_migrate_pages() tries to maintain the relative 1102 * do_migrate_pages() tries to maintain the relative
1083 * node relationship of the pages established between 1103 * node relationship of the pages established between
1084 * threads and memory areas. 1104 * threads and memory areas.
1085 * 1105 *
1086 * However if the number of source nodes is not equal to 1106 * However if the number of source nodes is not equal to
1087 * the number of destination nodes, we cannot preserve 1107 * the number of destination nodes, we cannot preserve
1088 * this node relative relationship. In that case, skip 1108 * this node relative relationship. In that case, skip
1089 * copying memory from a node that is in the destination 1109 * copying memory from a node that is in the destination
1090 * mask. 1110 * mask.
1091 * 1111 *
1092 * Example: [2,3,4] -> [3,4,5] moves everything. 1112 * Example: [2,3,4] -> [3,4,5] moves everything.
1093 * [0-7] -> [3,4,5] moves only 0,1,2,6,7. 1113 * [0-7] -> [3,4,5] moves only 0,1,2,6,7.
1094 */ 1114 */
1095 1115
1096 if ((nodes_weight(*from) != nodes_weight(*to)) && 1116 if ((nodes_weight(*from) != nodes_weight(*to)) &&
1097 (node_isset(s, *to))) 1117 (node_isset(s, *to)))
1098 continue; 1118 continue;
1099 1119
1100 d = node_remap(s, *from, *to); 1120 d = node_remap(s, *from, *to);
1101 if (s == d) 1121 if (s == d)
1102 continue; 1122 continue;
1103 1123
1104 source = s; /* Node moved. Memorize */ 1124 source = s; /* Node moved. Memorize */
1105 dest = d; 1125 dest = d;
1106 1126
1107 /* dest not in remaining from nodes? */ 1127 /* dest not in remaining from nodes? */
1108 if (!node_isset(dest, tmp)) 1128 if (!node_isset(dest, tmp))
1109 break; 1129 break;
1110 } 1130 }
1111 if (source == -1) 1131 if (source == -1)
1112 break; 1132 break;
1113 1133
1114 node_clear(source, tmp); 1134 node_clear(source, tmp);
1115 err = migrate_to_node(mm, source, dest, flags); 1135 err = migrate_to_node(mm, source, dest, flags);
1116 if (err > 0) 1136 if (err > 0)
1117 busy += err; 1137 busy += err;
1118 if (err < 0) 1138 if (err < 0)
1119 break; 1139 break;
1120 } 1140 }
1121 out: 1141 out:
1122 up_read(&mm->mmap_sem); 1142 up_read(&mm->mmap_sem);
1123 if (err < 0) 1143 if (err < 0)
1124 return err; 1144 return err;
1125 return busy; 1145 return busy;
1126 1146
1127 } 1147 }
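/*
 * Illustrative userspace-only sketch (not part of this file): the pair
 * selection above, modelled on plain 64-bit masks, with remap_node() as a
 * simplified stand-in for node_remap().  For the documented example
 * [0-7] -> [3,4,5] it reports migrations only for sources 0,1,2,6,7,
 * matching the comment above.
 */
#include <stdint.h>
#include <stdio.h>

static int ord_bit(uint64_t mask, int n)	/* n-th (0-based) set bit */
{
	for (int b = 0; b < 64; b++)
		if ((mask & (1ULL << b)) && n-- == 0)
			return b;
	return -1;
}

static int remap_node(int s, uint64_t from, uint64_t to)
{
	int ord = __builtin_popcountll(from & ((1ULL << s) - 1));
	int w = __builtin_popcountll(to);

	return w ? ord_bit(to, ord % w) : s;
}

static void pick_pairs(uint64_t from, uint64_t to)
{
	uint64_t tmp = from;

	while (tmp) {
		int source = -1, dest = 0;

		for (int s = 0; s < 64; s++) {
			if (!(tmp & (1ULL << s)))
				continue;
			/* unequal weights: skip sources already in 'to' */
			if (__builtin_popcountll(from) != __builtin_popcountll(to) &&
			    (to & (1ULL << s)))
				continue;
			int d = remap_node(s, from, to);
			if (s == d)
				continue;
			source = s;
			dest = d;
			/* best case: dest is not itself a pending source */
			if (!(tmp & (1ULL << d)))
				break;
		}
		if (source == -1)
			break;
		tmp &= ~(1ULL << source);
		printf("migrate node %d -> node %d\n", source, dest);
	}
}

int main(void)
{
	pick_pairs(0xffULL, 0x38ULL);	/* nodes 0-7 -> nodes 3,4,5 */
	return 0;
}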
1128 1148
1129 /* 1149 /*
1130 * Allocate a new page for page migration based on vma policy. 1150 * Allocate a new page for page migration based on vma policy.
1131 * Start assuming that page is mapped by vma pointed to by @private. 1151 * Start assuming that page is mapped by vma pointed to by @private.
1132 * Search forward from there, if not. N.B., this assumes that the 1152 * Search forward from there, if not. N.B., this assumes that the
1133 * list of pages handed to migrate_pages()--which is how we get here-- 1153 * list of pages handed to migrate_pages()--which is how we get here--
1134 * is in virtual address order. 1154 * is in virtual address order.
1135 */ 1155 */
1136 static struct page *new_vma_page(struct page *page, unsigned long private, int **x) 1156 static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
1137 { 1157 {
1138 struct vm_area_struct *vma = (struct vm_area_struct *)private; 1158 struct vm_area_struct *vma = (struct vm_area_struct *)private;
1139 unsigned long uninitialized_var(address); 1159 unsigned long uninitialized_var(address);
1140 1160
1141 while (vma) { 1161 while (vma) {
1142 address = page_address_in_vma(page, vma); 1162 address = page_address_in_vma(page, vma);
1143 if (address != -EFAULT) 1163 if (address != -EFAULT)
1144 break; 1164 break;
1145 vma = vma->vm_next; 1165 vma = vma->vm_next;
1146 } 1166 }
1147 1167
1148 /* 1168 /*
1149 * if !vma, alloc_page_vma() will use task or system default policy 1169 * if !vma, alloc_page_vma() will use task or system default policy
1150 */ 1170 */
1151 return alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address); 1171 return alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
1152 } 1172 }
1153 #else 1173 #else
1154 1174
1155 static void migrate_page_add(struct page *page, struct list_head *pagelist, 1175 static void migrate_page_add(struct page *page, struct list_head *pagelist,
1156 unsigned long flags) 1176 unsigned long flags)
1157 { 1177 {
1158 } 1178 }
1159 1179
1160 int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from, 1180 int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
1161 const nodemask_t *to, int flags) 1181 const nodemask_t *to, int flags)
1162 { 1182 {
1163 return -ENOSYS; 1183 return -ENOSYS;
1164 } 1184 }
1165 1185
1166 static struct page *new_vma_page(struct page *page, unsigned long private, int **x) 1186 static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
1167 { 1187 {
1168 return NULL; 1188 return NULL;
1169 } 1189 }
1170 #endif 1190 #endif
1171 1191
1172 static long do_mbind(unsigned long start, unsigned long len, 1192 static long do_mbind(unsigned long start, unsigned long len,
1173 unsigned short mode, unsigned short mode_flags, 1193 unsigned short mode, unsigned short mode_flags,
1174 nodemask_t *nmask, unsigned long flags) 1194 nodemask_t *nmask, unsigned long flags)
1175 { 1195 {
1176 struct vm_area_struct *vma; 1196 struct vm_area_struct *vma;
1177 struct mm_struct *mm = current->mm; 1197 struct mm_struct *mm = current->mm;
1178 struct mempolicy *new; 1198 struct mempolicy *new;
1179 unsigned long end; 1199 unsigned long end;
1180 int err; 1200 int err;
1181 LIST_HEAD(pagelist); 1201 LIST_HEAD(pagelist);
1182 1202
1183 if (flags & ~(unsigned long)MPOL_MF_VALID) 1203 if (flags & ~(unsigned long)MPOL_MF_VALID)
1184 return -EINVAL; 1204 return -EINVAL;
1185 if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE)) 1205 if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
1186 return -EPERM; 1206 return -EPERM;
1187 1207
1188 if (start & ~PAGE_MASK) 1208 if (start & ~PAGE_MASK)
1189 return -EINVAL; 1209 return -EINVAL;
1190 1210
1191 if (mode == MPOL_DEFAULT) 1211 if (mode == MPOL_DEFAULT)
1192 flags &= ~MPOL_MF_STRICT; 1212 flags &= ~MPOL_MF_STRICT;
1193 1213
1194 len = (len + PAGE_SIZE - 1) & PAGE_MASK; 1214 len = (len + PAGE_SIZE - 1) & PAGE_MASK;
1195 end = start + len; 1215 end = start + len;
1196 1216
1197 if (end < start) 1217 if (end < start)
1198 return -EINVAL; 1218 return -EINVAL;
1199 if (end == start) 1219 if (end == start)
1200 return 0; 1220 return 0;
1201 1221
1202 new = mpol_new(mode, mode_flags, nmask); 1222 new = mpol_new(mode, mode_flags, nmask);
1203 if (IS_ERR(new)) 1223 if (IS_ERR(new))
1204 return PTR_ERR(new); 1224 return PTR_ERR(new);
1205 1225
1206 if (flags & MPOL_MF_LAZY) 1226 if (flags & MPOL_MF_LAZY)
1207 new->flags |= MPOL_F_MOF; 1227 new->flags |= MPOL_F_MOF;
1208 1228
1209 /* 1229 /*
1210 * If we are using the default policy then operation 1230 * If we are using the default policy then operation
1211 * on discontinuous address spaces is okay after all 1231 * on discontinuous address spaces is okay after all
1212 */ 1232 */
1213 if (!new) 1233 if (!new)
1214 flags |= MPOL_MF_DISCONTIG_OK; 1234 flags |= MPOL_MF_DISCONTIG_OK;
1215 1235
1216 pr_debug("mbind %lx-%lx mode:%d flags:%d nodes:%lx\n", 1236 pr_debug("mbind %lx-%lx mode:%d flags:%d nodes:%lx\n",
1217 start, start + len, mode, mode_flags, 1237 start, start + len, mode, mode_flags,
1218 nmask ? nodes_addr(*nmask)[0] : -1); 1238 nmask ? nodes_addr(*nmask)[0] : -1);
1219 1239
1220 if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) { 1240 if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
1221 1241
1222 err = migrate_prep(); 1242 err = migrate_prep();
1223 if (err) 1243 if (err)
1224 goto mpol_out; 1244 goto mpol_out;
1225 } 1245 }
1226 { 1246 {
1227 NODEMASK_SCRATCH(scratch); 1247 NODEMASK_SCRATCH(scratch);
1228 if (scratch) { 1248 if (scratch) {
1229 down_write(&mm->mmap_sem); 1249 down_write(&mm->mmap_sem);
1230 task_lock(current); 1250 task_lock(current);
1231 err = mpol_set_nodemask(new, nmask, scratch); 1251 err = mpol_set_nodemask(new, nmask, scratch);
1232 task_unlock(current); 1252 task_unlock(current);
1233 if (err) 1253 if (err)
1234 up_write(&mm->mmap_sem); 1254 up_write(&mm->mmap_sem);
1235 } else 1255 } else
1236 err = -ENOMEM; 1256 err = -ENOMEM;
1237 NODEMASK_SCRATCH_FREE(scratch); 1257 NODEMASK_SCRATCH_FREE(scratch);
1238 } 1258 }
1239 if (err) 1259 if (err)
1240 goto mpol_out; 1260 goto mpol_out;
1241 1261
1242 vma = check_range(mm, start, end, nmask, 1262 vma = check_range(mm, start, end, nmask,
1243 flags | MPOL_MF_INVERT, &pagelist); 1263 flags | MPOL_MF_INVERT, &pagelist);
1244 1264
1245 err = PTR_ERR(vma); /* maybe ... */ 1265 err = PTR_ERR(vma); /* maybe ... */
1246 if (!IS_ERR(vma)) 1266 if (!IS_ERR(vma))
1247 err = mbind_range(mm, start, end, new); 1267 err = mbind_range(mm, start, end, new);
1248 1268
1249 if (!err) { 1269 if (!err) {
1250 int nr_failed = 0; 1270 int nr_failed = 0;
1251 1271
1252 if (!list_empty(&pagelist)) { 1272 if (!list_empty(&pagelist)) {
1253 WARN_ON_ONCE(flags & MPOL_MF_LAZY); 1273 WARN_ON_ONCE(flags & MPOL_MF_LAZY);
1254 nr_failed = migrate_pages(&pagelist, new_vma_page, 1274 nr_failed = migrate_pages(&pagelist, new_vma_page,
1255 (unsigned long)vma, 1275 (unsigned long)vma,
1256 false, MIGRATE_SYNC, 1276 false, MIGRATE_SYNC,
1257 MR_MEMPOLICY_MBIND); 1277 MR_MEMPOLICY_MBIND);
1258 if (nr_failed) 1278 if (nr_failed)
1259 putback_lru_pages(&pagelist); 1279 putback_lru_pages(&pagelist);
1260 } 1280 }
1261 1281
1262 if (nr_failed && (flags & MPOL_MF_STRICT)) 1282 if (nr_failed && (flags & MPOL_MF_STRICT))
1263 err = -EIO; 1283 err = -EIO;
1264 } else 1284 } else
1265 putback_lru_pages(&pagelist); 1285 putback_lru_pages(&pagelist);
1266 1286
1267 up_write(&mm->mmap_sem); 1287 up_write(&mm->mmap_sem);
1268 mpol_out: 1288 mpol_out:
1269 mpol_put(new); 1289 mpol_put(new);
1270 return err; 1290 return err;
1271 } 1291 }
1272 1292
1273 /* 1293 /*
1274 * User space interface with variable sized bitmaps for nodelists. 1294 * User space interface with variable sized bitmaps for nodelists.
1275 */ 1295 */
1276 1296
1277 /* Copy a node mask from user space. */ 1297 /* Copy a node mask from user space. */
1278 static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask, 1298 static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
1279 unsigned long maxnode) 1299 unsigned long maxnode)
1280 { 1300 {
1281 unsigned long k; 1301 unsigned long k;
1282 unsigned long nlongs; 1302 unsigned long nlongs;
1283 unsigned long endmask; 1303 unsigned long endmask;
1284 1304
1285 --maxnode; 1305 --maxnode;
1286 nodes_clear(*nodes); 1306 nodes_clear(*nodes);
1287 if (maxnode == 0 || !nmask) 1307 if (maxnode == 0 || !nmask)
1288 return 0; 1308 return 0;
1289 if (maxnode > PAGE_SIZE*BITS_PER_BYTE) 1309 if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
1290 return -EINVAL; 1310 return -EINVAL;
1291 1311
1292 nlongs = BITS_TO_LONGS(maxnode); 1312 nlongs = BITS_TO_LONGS(maxnode);
1293 if ((maxnode % BITS_PER_LONG) == 0) 1313 if ((maxnode % BITS_PER_LONG) == 0)
1294 endmask = ~0UL; 1314 endmask = ~0UL;
1295 else 1315 else
1296 endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1; 1316 endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
1297 1317
1298 /* When the user specified more nodes than supported, just check 1318 /* When the user specified more nodes than supported, just check
1299 that the unsupported part is all zero. */ 1319 that the unsupported part is all zero. */
1300 if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) { 1320 if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
1301 if (nlongs > PAGE_SIZE/sizeof(long)) 1321 if (nlongs > PAGE_SIZE/sizeof(long))
1302 return -EINVAL; 1322 return -EINVAL;
1303 for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) { 1323 for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
1304 unsigned long t; 1324 unsigned long t;
1305 if (get_user(t, nmask + k)) 1325 if (get_user(t, nmask + k))
1306 return -EFAULT; 1326 return -EFAULT;
1307 if (k == nlongs - 1) { 1327 if (k == nlongs - 1) {
1308 if (t & endmask) 1328 if (t & endmask)
1309 return -EINVAL; 1329 return -EINVAL;
1310 } else if (t) 1330 } else if (t)
1311 return -EINVAL; 1331 return -EINVAL;
1312 } 1332 }
1313 nlongs = BITS_TO_LONGS(MAX_NUMNODES); 1333 nlongs = BITS_TO_LONGS(MAX_NUMNODES);
1314 endmask = ~0UL; 1334 endmask = ~0UL;
1315 } 1335 }
1316 1336
1317 if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long))) 1337 if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
1318 return -EFAULT; 1338 return -EFAULT;
1319 nodes_addr(*nodes)[nlongs-1] &= endmask; 1339 nodes_addr(*nodes)[nlongs-1] &= endmask;
1320 return 0; 1340 return 0;
1321 } 1341 }
1322 1342
1323 /* Copy a kernel node mask to user space */ 1343 /* Copy a kernel node mask to user space */
1324 static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode, 1344 static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
1325 nodemask_t *nodes) 1345 nodemask_t *nodes)
1326 { 1346 {
1327 unsigned long copy = ALIGN(maxnode-1, 64) / 8; 1347 unsigned long copy = ALIGN(maxnode-1, 64) / 8;
1328 const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long); 1348 const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long);
1329 1349
1330 if (copy > nbytes) { 1350 if (copy > nbytes) {
1331 if (copy > PAGE_SIZE) 1351 if (copy > PAGE_SIZE)
1332 return -EINVAL; 1352 return -EINVAL;
1333 if (clear_user((char __user *)mask + nbytes, copy - nbytes)) 1353 if (clear_user((char __user *)mask + nbytes, copy - nbytes))
1334 return -EFAULT; 1354 return -EFAULT;
1335 copy = nbytes; 1355 copy = nbytes;
1336 } 1356 }
1337 return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0; 1357 return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
1338 } 1358 }
1339 1359
1340 SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len, 1360 SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len,
1341 unsigned long, mode, unsigned long __user *, nmask, 1361 unsigned long, mode, unsigned long __user *, nmask,
1342 unsigned long, maxnode, unsigned, flags) 1362 unsigned long, maxnode, unsigned, flags)
1343 { 1363 {
1344 nodemask_t nodes; 1364 nodemask_t nodes;
1345 int err; 1365 int err;
1346 unsigned short mode_flags; 1366 unsigned short mode_flags;
1347 1367
1348 mode_flags = mode & MPOL_MODE_FLAGS; 1368 mode_flags = mode & MPOL_MODE_FLAGS;
1349 mode &= ~MPOL_MODE_FLAGS; 1369 mode &= ~MPOL_MODE_FLAGS;
1350 if (mode >= MPOL_MAX) 1370 if (mode >= MPOL_MAX)
1351 return -EINVAL; 1371 return -EINVAL;
1352 if ((mode_flags & MPOL_F_STATIC_NODES) && 1372 if ((mode_flags & MPOL_F_STATIC_NODES) &&
1353 (mode_flags & MPOL_F_RELATIVE_NODES)) 1373 (mode_flags & MPOL_F_RELATIVE_NODES))
1354 return -EINVAL; 1374 return -EINVAL;
1355 err = get_nodes(&nodes, nmask, maxnode); 1375 err = get_nodes(&nodes, nmask, maxnode);
1356 if (err) 1376 if (err)
1357 return err; 1377 return err;
1358 return do_mbind(start, len, mode, mode_flags, &nodes, flags); 1378 return do_mbind(start, len, mode, mode_flags, &nodes, flags);
1359 } 1379 }
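/*
 * Illustrative userspace sketch (not part of this file), assuming libnuma's
 * <numaif.h> declaration of the mbind() syscall defined above (link with
 * -lnuma) and a machine that has a node 0: bind an anonymous mapping to
 * node 0 and ask the kernel to move any pages already allocated elsewhere.
 * maxnode counts the bits supplied in the mask; get_nodes() above parses
 * maxnode - 1 of them.
 */
#include <numaif.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/mman.h>

int main(void)
{
	size_t len = 4UL << 20;
	unsigned long nodemask = 1UL << 0;	/* node 0 only */
	void *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (buf == MAP_FAILED) {
		perror("mmap");
		return EXIT_FAILURE;
	}

	if (mbind(buf, len, MPOL_BIND, &nodemask, 8 * sizeof(nodemask),
		  MPOL_MF_MOVE | MPOL_MF_STRICT) < 0)
		perror("mbind");

	munmap(buf, len);
	return 0;
}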
1360 1380
1361 /* Set the process memory policy */ 1381 /* Set the process memory policy */
1362 SYSCALL_DEFINE3(set_mempolicy, int, mode, unsigned long __user *, nmask, 1382 SYSCALL_DEFINE3(set_mempolicy, int, mode, unsigned long __user *, nmask,
1363 unsigned long, maxnode) 1383 unsigned long, maxnode)
1364 { 1384 {
1365 int err; 1385 int err;
1366 nodemask_t nodes; 1386 nodemask_t nodes;
1367 unsigned short flags; 1387 unsigned short flags;
1368 1388
1369 flags = mode & MPOL_MODE_FLAGS; 1389 flags = mode & MPOL_MODE_FLAGS;
1370 mode &= ~MPOL_MODE_FLAGS; 1390 mode &= ~MPOL_MODE_FLAGS;
1371 if ((unsigned int)mode >= MPOL_MAX) 1391 if ((unsigned int)mode >= MPOL_MAX)
1372 return -EINVAL; 1392 return -EINVAL;
1373 if ((flags & MPOL_F_STATIC_NODES) && (flags & MPOL_F_RELATIVE_NODES)) 1393 if ((flags & MPOL_F_STATIC_NODES) && (flags & MPOL_F_RELATIVE_NODES))
1374 return -EINVAL; 1394 return -EINVAL;
1375 err = get_nodes(&nodes, nmask, maxnode); 1395 err = get_nodes(&nodes, nmask, maxnode);
1376 if (err) 1396 if (err)
1377 return err; 1397 return err;
1378 return do_set_mempolicy(mode, flags, &nodes); 1398 return do_set_mempolicy(mode, flags, &nodes);
1379 } 1399 }
1380 1400
1381 SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode, 1401 SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
1382 const unsigned long __user *, old_nodes, 1402 const unsigned long __user *, old_nodes,
1383 const unsigned long __user *, new_nodes) 1403 const unsigned long __user *, new_nodes)
1384 { 1404 {
1385 const struct cred *cred = current_cred(), *tcred; 1405 const struct cred *cred = current_cred(), *tcred;
1386 struct mm_struct *mm = NULL; 1406 struct mm_struct *mm = NULL;
1387 struct task_struct *task; 1407 struct task_struct *task;
1388 nodemask_t task_nodes; 1408 nodemask_t task_nodes;
1389 int err; 1409 int err;
1390 nodemask_t *old; 1410 nodemask_t *old;
1391 nodemask_t *new; 1411 nodemask_t *new;
1392 NODEMASK_SCRATCH(scratch); 1412 NODEMASK_SCRATCH(scratch);
1393 1413
1394 if (!scratch) 1414 if (!scratch)
1395 return -ENOMEM; 1415 return -ENOMEM;
1396 1416
1397 old = &scratch->mask1; 1417 old = &scratch->mask1;
1398 new = &scratch->mask2; 1418 new = &scratch->mask2;
1399 1419
1400 err = get_nodes(old, old_nodes, maxnode); 1420 err = get_nodes(old, old_nodes, maxnode);
1401 if (err) 1421 if (err)
1402 goto out; 1422 goto out;
1403 1423
1404 err = get_nodes(new, new_nodes, maxnode); 1424 err = get_nodes(new, new_nodes, maxnode);
1405 if (err) 1425 if (err)
1406 goto out; 1426 goto out;
1407 1427
1408 /* Find the mm_struct */ 1428 /* Find the mm_struct */
1409 rcu_read_lock(); 1429 rcu_read_lock();
1410 task = pid ? find_task_by_vpid(pid) : current; 1430 task = pid ? find_task_by_vpid(pid) : current;
1411 if (!task) { 1431 if (!task) {
1412 rcu_read_unlock(); 1432 rcu_read_unlock();
1413 err = -ESRCH; 1433 err = -ESRCH;
1414 goto out; 1434 goto out;
1415 } 1435 }
1416 get_task_struct(task); 1436 get_task_struct(task);
1417 1437
1418 err = -EINVAL; 1438 err = -EINVAL;
1419 1439
1420 /* 1440 /*
1421 * Check if this process has the right to modify the specified 1441 * Check if this process has the right to modify the specified
1422 * process. The right exists if the process has administrative 1442 * process. The right exists if the process has administrative
1423 * capabilities, superuser privileges or the same 1443 * capabilities, superuser privileges or the same
1424 * userid as the target process. 1444 * userid as the target process.
1425 */ 1445 */
1426 tcred = __task_cred(task); 1446 tcred = __task_cred(task);
1427 if (!uid_eq(cred->euid, tcred->suid) && !uid_eq(cred->euid, tcred->uid) && 1447 if (!uid_eq(cred->euid, tcred->suid) && !uid_eq(cred->euid, tcred->uid) &&
1428 !uid_eq(cred->uid, tcred->suid) && !uid_eq(cred->uid, tcred->uid) && 1448 !uid_eq(cred->uid, tcred->suid) && !uid_eq(cred->uid, tcred->uid) &&
1429 !capable(CAP_SYS_NICE)) { 1449 !capable(CAP_SYS_NICE)) {
1430 rcu_read_unlock(); 1450 rcu_read_unlock();
1431 err = -EPERM; 1451 err = -EPERM;
1432 goto out_put; 1452 goto out_put;
1433 } 1453 }
1434 rcu_read_unlock(); 1454 rcu_read_unlock();
1435 1455
1436 task_nodes = cpuset_mems_allowed(task); 1456 task_nodes = cpuset_mems_allowed(task);
1437 /* Is the user allowed to access the target nodes? */ 1457 /* Is the user allowed to access the target nodes? */
1438 if (!nodes_subset(*new, task_nodes) && !capable(CAP_SYS_NICE)) { 1458 if (!nodes_subset(*new, task_nodes) && !capable(CAP_SYS_NICE)) {
1439 err = -EPERM; 1459 err = -EPERM;
1440 goto out_put; 1460 goto out_put;
1441 } 1461 }
1442 1462
1443 if (!nodes_subset(*new, node_states[N_HIGH_MEMORY])) { 1463 if (!nodes_subset(*new, node_states[N_HIGH_MEMORY])) {
1444 err = -EINVAL; 1464 err = -EINVAL;
1445 goto out_put; 1465 goto out_put;
1446 } 1466 }
1447 1467
1448 err = security_task_movememory(task); 1468 err = security_task_movememory(task);
1449 if (err) 1469 if (err)
1450 goto out_put; 1470 goto out_put;
1451 1471
1452 mm = get_task_mm(task); 1472 mm = get_task_mm(task);
1453 put_task_struct(task); 1473 put_task_struct(task);
1454 1474
1455 if (!mm) { 1475 if (!mm) {
1456 err = -EINVAL; 1476 err = -EINVAL;
1457 goto out; 1477 goto out;
1458 } 1478 }
1459 1479
1460 err = do_migrate_pages(mm, old, new, 1480 err = do_migrate_pages(mm, old, new,
1461 capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE); 1481 capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
1462 1482
1463 mmput(mm); 1483 mmput(mm);
1464 out: 1484 out:
1465 NODEMASK_SCRATCH_FREE(scratch); 1485 NODEMASK_SCRATCH_FREE(scratch);
1466 1486
1467 return err; 1487 return err;
1468 1488
1469 out_put: 1489 out_put:
1470 put_task_struct(task); 1490 put_task_struct(task);
1471 goto out; 1491 goto out;
1472 1492
1473 } 1493 }
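/*
 * Illustrative userspace sketch (not part of this file), again assuming
 * libnuma's <numaif.h> wrapper and at least two online nodes: move the
 * calling process's node-0 pages to node 1.  Per the handler above, pid 0
 * targets the current task; moving another process's pages additionally
 * requires CAP_SYS_NICE or a matching uid.
 */
#include <numaif.h>
#include <stdio.h>

int main(void)
{
	unsigned long from = 1UL << 0;	/* source: node 0 */
	unsigned long to = 1UL << 1;	/* destination: node 1 */
	long left = migrate_pages(0, 8 * sizeof(unsigned long), &from, &to);

	if (left < 0)
		perror("migrate_pages");
	else
		printf("%ld pages could not be moved\n", left);
	return 0;
}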
1474 1494
1475 1495
1476 /* Retrieve NUMA policy */ 1496 /* Retrieve NUMA policy */
1477 SYSCALL_DEFINE5(get_mempolicy, int __user *, policy, 1497 SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
1478 unsigned long __user *, nmask, unsigned long, maxnode, 1498 unsigned long __user *, nmask, unsigned long, maxnode,
1479 unsigned long, addr, unsigned long, flags) 1499 unsigned long, addr, unsigned long, flags)
1480 { 1500 {
1481 int err; 1501 int err;
1482 int uninitialized_var(pval); 1502 int uninitialized_var(pval);
1483 nodemask_t nodes; 1503 nodemask_t nodes;
1484 1504
1485 if (nmask != NULL && maxnode < MAX_NUMNODES) 1505 if (nmask != NULL && maxnode < MAX_NUMNODES)
1486 return -EINVAL; 1506 return -EINVAL;
1487 1507
1488 err = do_get_mempolicy(&pval, &nodes, addr, flags); 1508 err = do_get_mempolicy(&pval, &nodes, addr, flags);
1489 1509
1490 if (err) 1510 if (err)
1491 return err; 1511 return err;
1492 1512
1493 if (policy && put_user(pval, policy)) 1513 if (policy && put_user(pval, policy))
1494 return -EFAULT; 1514 return -EFAULT;
1495 1515
1496 if (nmask) 1516 if (nmask)
1497 err = copy_nodes_to_user(nmask, maxnode, &nodes); 1517 err = copy_nodes_to_user(nmask, maxnode, &nodes);
1498 1518
1499 return err; 1519 return err;
1500 } 1520 }
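/*
 * Illustrative userspace sketch (not part of this file), assuming libnuma's
 * <numaif.h> wrapper: with MPOL_F_NODE | MPOL_F_ADDR the syscall takes the
 * lookup_node() path above and reports which node currently backs a given
 * page instead of returning a policy mode.
 */
#include <numaif.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

int main(void)
{
	int node = -1;
	char *buf = malloc(4096);

	if (!buf)
		return EXIT_FAILURE;
	memset(buf, 0, 4096);	/* fault the page in first */

	if (get_mempolicy(&node, NULL, 0, buf, MPOL_F_NODE | MPOL_F_ADDR) < 0)
		perror("get_mempolicy");
	else
		printf("page at %p resides on node %d\n", (void *)buf, node);

	free(buf);
	return 0;
}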
1501 1521
1502 #ifdef CONFIG_COMPAT 1522 #ifdef CONFIG_COMPAT
1503 1523
1504 asmlinkage long compat_sys_get_mempolicy(int __user *policy, 1524 asmlinkage long compat_sys_get_mempolicy(int __user *policy,
1505 compat_ulong_t __user *nmask, 1525 compat_ulong_t __user *nmask,
1506 compat_ulong_t maxnode, 1526 compat_ulong_t maxnode,
1507 compat_ulong_t addr, compat_ulong_t flags) 1527 compat_ulong_t addr, compat_ulong_t flags)
1508 { 1528 {
1509 long err; 1529 long err;
1510 unsigned long __user *nm = NULL; 1530 unsigned long __user *nm = NULL;
1511 unsigned long nr_bits, alloc_size; 1531 unsigned long nr_bits, alloc_size;
1512 DECLARE_BITMAP(bm, MAX_NUMNODES); 1532 DECLARE_BITMAP(bm, MAX_NUMNODES);
1513 1533
1514 nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES); 1534 nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1515 alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8; 1535 alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1516 1536
1517 if (nmask) 1537 if (nmask)
1518 nm = compat_alloc_user_space(alloc_size); 1538 nm = compat_alloc_user_space(alloc_size);
1519 1539
1520 err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags); 1540 err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags);
1521 1541
1522 if (!err && nmask) { 1542 if (!err && nmask) {
1523 unsigned long copy_size; 1543 unsigned long copy_size;
1524 copy_size = min_t(unsigned long, sizeof(bm), alloc_size); 1544 copy_size = min_t(unsigned long, sizeof(bm), alloc_size);
1525 err = copy_from_user(bm, nm, copy_size); 1545 err = copy_from_user(bm, nm, copy_size);
1526 /* ensure entire bitmap is zeroed */ 1546 /* ensure entire bitmap is zeroed */
1527 err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8); 1547 err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
1528 err |= compat_put_bitmap(nmask, bm, nr_bits); 1548 err |= compat_put_bitmap(nmask, bm, nr_bits);
1529 } 1549 }
1530 1550
1531 return err; 1551 return err;
1532 } 1552 }
1533 1553
1534 asmlinkage long compat_sys_set_mempolicy(int mode, compat_ulong_t __user *nmask, 1554 asmlinkage long compat_sys_set_mempolicy(int mode, compat_ulong_t __user *nmask,
1535 compat_ulong_t maxnode) 1555 compat_ulong_t maxnode)
1536 { 1556 {
1537 long err = 0; 1557 long err = 0;
1538 unsigned long __user *nm = NULL; 1558 unsigned long __user *nm = NULL;
1539 unsigned long nr_bits, alloc_size; 1559 unsigned long nr_bits, alloc_size;
1540 DECLARE_BITMAP(bm, MAX_NUMNODES); 1560 DECLARE_BITMAP(bm, MAX_NUMNODES);
1541 1561
1542 nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES); 1562 nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1543 alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8; 1563 alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1544 1564
1545 if (nmask) { 1565 if (nmask) {
1546 err = compat_get_bitmap(bm, nmask, nr_bits); 1566 err = compat_get_bitmap(bm, nmask, nr_bits);
1547 nm = compat_alloc_user_space(alloc_size); 1567 nm = compat_alloc_user_space(alloc_size);
1548 err |= copy_to_user(nm, bm, alloc_size); 1568 err |= copy_to_user(nm, bm, alloc_size);
1549 } 1569 }
1550 1570
1551 if (err) 1571 if (err)
1552 return -EFAULT; 1572 return -EFAULT;
1553 1573
1554 return sys_set_mempolicy(mode, nm, nr_bits+1); 1574 return sys_set_mempolicy(mode, nm, nr_bits+1);
1555 } 1575 }
1556 1576
1557 asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len, 1577 asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
1558 compat_ulong_t mode, compat_ulong_t __user *nmask, 1578 compat_ulong_t mode, compat_ulong_t __user *nmask,
1559 compat_ulong_t maxnode, compat_ulong_t flags) 1579 compat_ulong_t maxnode, compat_ulong_t flags)
1560 { 1580 {
1561 long err = 0; 1581 long err = 0;
1562 unsigned long __user *nm = NULL; 1582 unsigned long __user *nm = NULL;
1563 unsigned long nr_bits, alloc_size; 1583 unsigned long nr_bits, alloc_size;
1564 nodemask_t bm; 1584 nodemask_t bm;
1565 1585
1566 nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES); 1586 nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1567 alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8; 1587 alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1568 1588
1569 if (nmask) { 1589 if (nmask) {
1570 err = compat_get_bitmap(nodes_addr(bm), nmask, nr_bits); 1590 err = compat_get_bitmap(nodes_addr(bm), nmask, nr_bits);
1571 nm = compat_alloc_user_space(alloc_size); 1591 nm = compat_alloc_user_space(alloc_size);
1572 err |= copy_to_user(nm, nodes_addr(bm), alloc_size); 1592 err |= copy_to_user(nm, nodes_addr(bm), alloc_size);
1573 } 1593 }
1574 1594
1575 if (err) 1595 if (err)
1576 return -EFAULT; 1596 return -EFAULT;
1577 1597
1578 return sys_mbind(start, len, mode, nm, nr_bits+1, flags); 1598 return sys_mbind(start, len, mode, nm, nr_bits+1, flags);
1579 } 1599 }
1580 1600
1581 #endif 1601 #endif
1582 1602
1583 /* 1603 /*
1584 * get_vma_policy(@task, @vma, @addr) 1604 * get_vma_policy(@task, @vma, @addr)
1585 * @task - task for fallback if vma policy == default 1605 * @task - task for fallback if vma policy == default
1586 * @vma - virtual memory area whose policy is sought 1606 * @vma - virtual memory area whose policy is sought
1587 * @addr - address in @vma for shared policy lookup 1607 * @addr - address in @vma for shared policy lookup
1588 * 1608 *
1589 * Returns effective policy for a VMA at specified address. 1609 * Returns effective policy for a VMA at specified address.
1590 * Falls back to @task or system default policy, as necessary. 1610 * Falls back to @task or system default policy, as necessary.
1591 * Current or other task's task mempolicy and non-shared vma policies must be 1611 * Current or other task's task mempolicy and non-shared vma policies must be
1592 * protected by task_lock(task) by the caller. 1612 * protected by task_lock(task) by the caller.
1593 * Shared policies [those marked as MPOL_F_SHARED] require an extra reference 1613 * Shared policies [those marked as MPOL_F_SHARED] require an extra reference
1594 * count--added by the get_policy() vm_op, as appropriate--to protect against 1614 * count--added by the get_policy() vm_op, as appropriate--to protect against
1595 * freeing by another task. It is the caller's responsibility to free the 1615 * freeing by another task. It is the caller's responsibility to free the
1596 * extra reference for shared policies. 1616 * extra reference for shared policies.
1597 */ 1617 */
1598 struct mempolicy *get_vma_policy(struct task_struct *task, 1618 struct mempolicy *get_vma_policy(struct task_struct *task,
1599 struct vm_area_struct *vma, unsigned long addr) 1619 struct vm_area_struct *vma, unsigned long addr)
1600 { 1620 {
1601 struct mempolicy *pol = task->mempolicy; 1621 struct mempolicy *pol = get_task_policy(task);
1602 1622
1603 if (vma) { 1623 if (vma) {
1604 if (vma->vm_ops && vma->vm_ops->get_policy) { 1624 if (vma->vm_ops && vma->vm_ops->get_policy) {
1605 struct mempolicy *vpol = vma->vm_ops->get_policy(vma, 1625 struct mempolicy *vpol = vma->vm_ops->get_policy(vma,
1606 addr); 1626 addr);
1607 if (vpol) 1627 if (vpol)
1608 pol = vpol; 1628 pol = vpol;
1609 } else if (vma->vm_policy) { 1629 } else if (vma->vm_policy) {
1610 pol = vma->vm_policy; 1630 pol = vma->vm_policy;
1611 1631
1612 /* 1632 /*
1613 * shmem_alloc_page() passes MPOL_F_SHARED policy with 1633 * shmem_alloc_page() passes MPOL_F_SHARED policy with
1614 * a pseudo vma whose vma->vm_ops=NULL. Take a reference 1634 * a pseudo vma whose vma->vm_ops=NULL. Take a reference
1615 * count on these policies which will be dropped by 1635 * count on these policies which will be dropped by
1616 * mpol_cond_put() later 1636 * mpol_cond_put() later
1617 */ 1637 */
1618 if (mpol_needs_cond_ref(pol)) 1638 if (mpol_needs_cond_ref(pol))
1619 mpol_get(pol); 1639 mpol_get(pol);
1620 } 1640 }
1621 } 1641 }
1622 if (!pol) 1642 if (!pol)
1623 pol = &default_policy; 1643 pol = &default_policy;
1624 return pol; 1644 return pol;
1625 } 1645 }
1626 1646
1627 /* 1647 /*
1628 * Return a nodemask representing a mempolicy for filtering nodes for 1648 * Return a nodemask representing a mempolicy for filtering nodes for
1629 * page allocation 1649 * page allocation
1630 */ 1650 */
1631 static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy) 1651 static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
1632 { 1652 {
1633 /* Lower zones don't get a nodemask applied for MPOL_BIND */ 1653 /* Lower zones don't get a nodemask applied for MPOL_BIND */
1634 if (unlikely(policy->mode == MPOL_BIND) && 1654 if (unlikely(policy->mode == MPOL_BIND) &&
1635 gfp_zone(gfp) >= policy_zone && 1655 gfp_zone(gfp) >= policy_zone &&
1636 cpuset_nodemask_valid_mems_allowed(&policy->v.nodes)) 1656 cpuset_nodemask_valid_mems_allowed(&policy->v.nodes))
1637 return &policy->v.nodes; 1657 return &policy->v.nodes;
1638 1658
1639 return NULL; 1659 return NULL;
1640 } 1660 }
1641 1661
1642 /* Return a zonelist indicated by gfp for node representing a mempolicy */ 1662 /* Return a zonelist indicated by gfp for node representing a mempolicy */
1643 static struct zonelist *policy_zonelist(gfp_t gfp, struct mempolicy *policy, 1663 static struct zonelist *policy_zonelist(gfp_t gfp, struct mempolicy *policy,
1644 int nd) 1664 int nd)
1645 { 1665 {
1646 switch (policy->mode) { 1666 switch (policy->mode) {
1647 case MPOL_PREFERRED: 1667 case MPOL_PREFERRED:
1648 if (!(policy->flags & MPOL_F_LOCAL)) 1668 if (!(policy->flags & MPOL_F_LOCAL))
1649 nd = policy->v.preferred_node; 1669 nd = policy->v.preferred_node;
1650 break; 1670 break;
1651 case MPOL_BIND: 1671 case MPOL_BIND:
1652 /* 1672 /*
1653 * Normally, MPOL_BIND allocations are node-local within the 1673 * Normally, MPOL_BIND allocations are node-local within the
1654 * allowed nodemask. However, if __GFP_THISNODE is set and the 1674 * allowed nodemask. However, if __GFP_THISNODE is set and the
1655 * current node isn't part of the mask, we use the zonelist for 1675 * current node isn't part of the mask, we use the zonelist for
1656 * the first node in the mask instead. 1676 * the first node in the mask instead.
1657 */ 1677 */
1658 if (unlikely(gfp & __GFP_THISNODE) && 1678 if (unlikely(gfp & __GFP_THISNODE) &&
1659 unlikely(!node_isset(nd, policy->v.nodes))) 1679 unlikely(!node_isset(nd, policy->v.nodes)))
1660 nd = first_node(policy->v.nodes); 1680 nd = first_node(policy->v.nodes);
1661 break; 1681 break;
1662 default: 1682 default:
1663 BUG(); 1683 BUG();
1664 } 1684 }
1665 return node_zonelist(nd, gfp); 1685 return node_zonelist(nd, gfp);
1666 } 1686 }
1667 1687
1668 /* Do dynamic interleaving for a process */ 1688 /* Do dynamic interleaving for a process */
1669 static unsigned interleave_nodes(struct mempolicy *policy) 1689 static unsigned interleave_nodes(struct mempolicy *policy)
1670 { 1690 {
1671 unsigned nid, next; 1691 unsigned nid, next;
1672 struct task_struct *me = current; 1692 struct task_struct *me = current;
1673 1693
1674 nid = me->il_next; 1694 nid = me->il_next;
1675 next = next_node(nid, policy->v.nodes); 1695 next = next_node(nid, policy->v.nodes);
1676 if (next >= MAX_NUMNODES) 1696 if (next >= MAX_NUMNODES)
1677 next = first_node(policy->v.nodes); 1697 next = first_node(policy->v.nodes);
1678 if (next < MAX_NUMNODES) 1698 if (next < MAX_NUMNODES)
1679 me->il_next = next; 1699 me->il_next = next;
1680 return nid; 1700 return nid;
1681 } 1701 }
1682 1702
1683 /* 1703 /*
1684 * Depending on the memory policy provide a node from which to allocate the 1704 * Depending on the memory policy provide a node from which to allocate the
1685 * next slab entry. 1705 * next slab entry.
1686 * @policy must be protected from freeing by the caller. If @policy is 1706 * @policy must be protected from freeing by the caller. If @policy is
1687 * the current task's mempolicy, this protection is implicit, as only the 1707 * the current task's mempolicy, this protection is implicit, as only the
1688 * task can change its policy. The system default policy requires no 1708 * task can change its policy. The system default policy requires no
1689 * such protection. 1709 * such protection.
1690 */ 1710 */
1691 unsigned slab_node(void) 1711 unsigned slab_node(void)
1692 { 1712 {
1693 struct mempolicy *policy; 1713 struct mempolicy *policy;
1694 1714
1695 if (in_interrupt()) 1715 if (in_interrupt())
1696 return numa_node_id(); 1716 return numa_node_id();
1697 1717
1698 policy = current->mempolicy; 1718 policy = current->mempolicy;
1699 if (!policy || policy->flags & MPOL_F_LOCAL) 1719 if (!policy || policy->flags & MPOL_F_LOCAL)
1700 return numa_node_id(); 1720 return numa_node_id();
1701 1721
1702 switch (policy->mode) { 1722 switch (policy->mode) {
1703 case MPOL_PREFERRED: 1723 case MPOL_PREFERRED:
1704 /* 1724 /*
1705 * handled MPOL_F_LOCAL above 1725 * handled MPOL_F_LOCAL above
1706 */ 1726 */
1707 return policy->v.preferred_node; 1727 return policy->v.preferred_node;
1708 1728
1709 case MPOL_INTERLEAVE: 1729 case MPOL_INTERLEAVE:
1710 return interleave_nodes(policy); 1730 return interleave_nodes(policy);
1711 1731
1712 case MPOL_BIND: { 1732 case MPOL_BIND: {
1713 /* 1733 /*
1714 * Follow bind policy behavior and start allocation at the 1734 * Follow bind policy behavior and start allocation at the
1715 * first node. 1735 * first node.
1716 */ 1736 */
1717 struct zonelist *zonelist; 1737 struct zonelist *zonelist;
1718 struct zone *zone; 1738 struct zone *zone;
1719 enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL); 1739 enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL);
1720 zonelist = &NODE_DATA(numa_node_id())->node_zonelists[0]; 1740 zonelist = &NODE_DATA(numa_node_id())->node_zonelists[0];
1721 (void)first_zones_zonelist(zonelist, highest_zoneidx, 1741 (void)first_zones_zonelist(zonelist, highest_zoneidx,
1722 &policy->v.nodes, 1742 &policy->v.nodes,
1723 &zone); 1743 &zone);
1724 return zone ? zone->node : numa_node_id(); 1744 return zone ? zone->node : numa_node_id();
1725 } 1745 }
1726 1746
1727 default: 1747 default:
1728 BUG(); 1748 BUG();
1729 } 1749 }
1730 } 1750 }
1731 1751
1732 /* Do static interleaving for a VMA with known offset. */ 1752 /* Do static interleaving for a VMA with known offset. */
1733 static unsigned offset_il_node(struct mempolicy *pol, 1753 static unsigned offset_il_node(struct mempolicy *pol,
1734 struct vm_area_struct *vma, unsigned long off) 1754 struct vm_area_struct *vma, unsigned long off)
1735 { 1755 {
1736 unsigned nnodes = nodes_weight(pol->v.nodes); 1756 unsigned nnodes = nodes_weight(pol->v.nodes);
1737 unsigned target; 1757 unsigned target;
1738 int c; 1758 int c;
1739 int nid = -1; 1759 int nid = -1;
1740 1760
1741 if (!nnodes) 1761 if (!nnodes)
1742 return numa_node_id(); 1762 return numa_node_id();
1743 target = (unsigned int)off % nnodes; 1763 target = (unsigned int)off % nnodes;
1744 c = 0; 1764 c = 0;
1745 do { 1765 do {
1746 nid = next_node(nid, pol->v.nodes); 1766 nid = next_node(nid, pol->v.nodes);
1747 c++; 1767 c++;
1748 } while (c <= target); 1768 } while (c <= target);
1749 return nid; 1769 return nid;
1750 } 1770 }
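/*
 * Illustrative sketch (not part of this file) of the static interleave
 * arithmetic above: the offset is reduced modulo the number of nodes in
 * the mask and then mapped onto that many steps through the set nodes.
 * For the mask {0, 2, 3}, successive offsets land on nodes 0, 2, 3, 0, 2, 3.
 */
#include <stdint.h>
#include <stdio.h>

static int nth_node(uint64_t mask, unsigned int target)	/* target-th set bit */
{
	int nid = -1, c = 0;

	do {
		while (!(mask & (1ULL << ++nid)))
			;	/* advance to the next node present in the mask */
		c++;
	} while (c <= (int)target);
	return nid;
}

int main(void)
{
	uint64_t mask = (1ULL << 0) | (1ULL << 2) | (1ULL << 3);
	unsigned int nnodes = 3;

	for (unsigned long off = 0; off < 6; off++)
		printf("offset %lu -> node %d\n", off,
		       nth_node(mask, off % nnodes));
	return 0;
}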
1751 1771
1752 /* Determine a node number for interleave */ 1772 /* Determine a node number for interleave */
1753 static inline unsigned interleave_nid(struct mempolicy *pol, 1773 static inline unsigned interleave_nid(struct mempolicy *pol,
1754 struct vm_area_struct *vma, unsigned long addr, int shift) 1774 struct vm_area_struct *vma, unsigned long addr, int shift)
1755 { 1775 {
1756 if (vma) { 1776 if (vma) {
1757 unsigned long off; 1777 unsigned long off;
1758 1778
1759 /* 1779 /*
1760 * for small pages, there is no difference between 1780 * for small pages, there is no difference between
1761 * shift and PAGE_SHIFT, so the bit-shift is safe. 1781 * shift and PAGE_SHIFT, so the bit-shift is safe.
1762 * for huge pages, since vm_pgoff is in units of small 1782 * for huge pages, since vm_pgoff is in units of small
1763 * pages, we need to shift off the always 0 bits to get 1783 * pages, we need to shift off the always 0 bits to get
1764 * a useful offset. 1784 * a useful offset.
1765 */ 1785 */
1766 BUG_ON(shift < PAGE_SHIFT); 1786 BUG_ON(shift < PAGE_SHIFT);
1767 off = vma->vm_pgoff >> (shift - PAGE_SHIFT); 1787 off = vma->vm_pgoff >> (shift - PAGE_SHIFT);
1768 off += (addr - vma->vm_start) >> shift; 1788 off += (addr - vma->vm_start) >> shift;
1769 return offset_il_node(pol, vma, off); 1789 return offset_il_node(pol, vma, off);
1770 } else 1790 } else
1771 return interleave_nodes(pol); 1791 return interleave_nodes(pol);
1772 } 1792 }
1773 1793
1774 /* 1794 /*
1775 * Return the bit number of a random bit set in the nodemask. 1795 * Return the bit number of a random bit set in the nodemask.
1776 * (returns -1 if nodemask is empty) 1796 * (returns -1 if nodemask is empty)
1777 */ 1797 */
1778 int node_random(const nodemask_t *maskp) 1798 int node_random(const nodemask_t *maskp)
1779 { 1799 {
1780 int w, bit = -1; 1800 int w, bit = -1;
1781 1801
1782 w = nodes_weight(*maskp); 1802 w = nodes_weight(*maskp);
1783 if (w) 1803 if (w)
1784 bit = bitmap_ord_to_pos(maskp->bits, 1804 bit = bitmap_ord_to_pos(maskp->bits,
1785 get_random_int() % w, MAX_NUMNODES); 1805 get_random_int() % w, MAX_NUMNODES);
1786 return bit; 1806 return bit;
1787 } 1807 }
1788 1808
1789 #ifdef CONFIG_HUGETLBFS 1809 #ifdef CONFIG_HUGETLBFS
1790 /* 1810 /*
1791 * huge_zonelist(@vma, @addr, @gfp_flags, @mpol, @nodemask) 1811 * huge_zonelist(@vma, @addr, @gfp_flags, @mpol, @nodemask)
1792 * @vma = virtual memory area whose policy is sought 1812 * @vma = virtual memory area whose policy is sought
1793 * @addr = address in @vma for shared policy lookup and interleave policy 1813 * @addr = address in @vma for shared policy lookup and interleave policy
1794 * @gfp_flags = for requested zone 1814 * @gfp_flags = for requested zone
1795 * @mpol = pointer to mempolicy pointer for reference counted mempolicy 1815 * @mpol = pointer to mempolicy pointer for reference counted mempolicy
1796 * @nodemask = pointer to nodemask pointer for MPOL_BIND nodemask 1816 * @nodemask = pointer to nodemask pointer for MPOL_BIND nodemask
1797 * 1817 *
1798 * Returns a zonelist suitable for a huge page allocation and a pointer 1818 * Returns a zonelist suitable for a huge page allocation and a pointer
1799 * to the struct mempolicy for conditional unref after allocation. 1819 * to the struct mempolicy for conditional unref after allocation.
1800 * If the effective policy is 'BIND', returns a pointer to the mempolicy's 1820 * If the effective policy is 'BIND', returns a pointer to the mempolicy's
1801 * @nodemask for filtering the zonelist. 1821 * @nodemask for filtering the zonelist.
1802 * 1822 *
1803 * Must be protected by get_mems_allowed() 1823 * Must be protected by get_mems_allowed()
1804 */ 1824 */
1805 struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr, 1825 struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr,
1806 gfp_t gfp_flags, struct mempolicy **mpol, 1826 gfp_t gfp_flags, struct mempolicy **mpol,
1807 nodemask_t **nodemask) 1827 nodemask_t **nodemask)
1808 { 1828 {
1809 struct zonelist *zl; 1829 struct zonelist *zl;
1810 1830
1811 *mpol = get_vma_policy(current, vma, addr); 1831 *mpol = get_vma_policy(current, vma, addr);
1812 *nodemask = NULL; /* assume !MPOL_BIND */ 1832 *nodemask = NULL; /* assume !MPOL_BIND */
1813 1833
1814 if (unlikely((*mpol)->mode == MPOL_INTERLEAVE)) { 1834 if (unlikely((*mpol)->mode == MPOL_INTERLEAVE)) {
1815 zl = node_zonelist(interleave_nid(*mpol, vma, addr, 1835 zl = node_zonelist(interleave_nid(*mpol, vma, addr,
1816 huge_page_shift(hstate_vma(vma))), gfp_flags); 1836 huge_page_shift(hstate_vma(vma))), gfp_flags);
1817 } else { 1837 } else {
1818 zl = policy_zonelist(gfp_flags, *mpol, numa_node_id()); 1838 zl = policy_zonelist(gfp_flags, *mpol, numa_node_id());
1819 if ((*mpol)->mode == MPOL_BIND) 1839 if ((*mpol)->mode == MPOL_BIND)
1820 *nodemask = &(*mpol)->v.nodes; 1840 *nodemask = &(*mpol)->v.nodes;
1821 } 1841 }
1822 return zl; 1842 return zl;
1823 } 1843 }
1824 1844
1825 /* 1845 /*
1826 * init_nodemask_of_mempolicy 1846 * init_nodemask_of_mempolicy
1827 * 1847 *
1828 * If the current task's mempolicy is "default" [NULL], return 'false' 1848 * If the current task's mempolicy is "default" [NULL], return 'false'
1829 * to indicate default policy. Otherwise, extract the policy nodemask 1849 * to indicate default policy. Otherwise, extract the policy nodemask
1830 * for 'bind' or 'interleave' policy into the argument nodemask, or 1850 * for 'bind' or 'interleave' policy into the argument nodemask, or
1831 * initialize the argument nodemask to contain the single node for 1851 * initialize the argument nodemask to contain the single node for
1832 * 'preferred' or 'local' policy and return 'true' to indicate presence 1852 * 'preferred' or 'local' policy and return 'true' to indicate presence
1833 * of non-default mempolicy. 1853 * of non-default mempolicy.
1834 * 1854 *
1835 * We don't bother with reference counting the mempolicy [mpol_get/put] 1855 * We don't bother with reference counting the mempolicy [mpol_get/put]
1836 * because the current task is examining its own mempolicy and a task's 1856 * because the current task is examining its own mempolicy and a task's
1837 * mempolicy is only ever changed by the task itself. 1857 * mempolicy is only ever changed by the task itself.
1838 * 1858 *
1839 * N.B., it is the caller's responsibility to free a returned nodemask. 1859 * N.B., it is the caller's responsibility to free a returned nodemask.
1840 */ 1860 */
1841 bool init_nodemask_of_mempolicy(nodemask_t *mask) 1861 bool init_nodemask_of_mempolicy(nodemask_t *mask)
1842 { 1862 {
1843 struct mempolicy *mempolicy; 1863 struct mempolicy *mempolicy;
1844 int nid; 1864 int nid;
1845 1865
1846 if (!(mask && current->mempolicy)) 1866 if (!(mask && current->mempolicy))
1847 return false; 1867 return false;
1848 1868
1849 task_lock(current); 1869 task_lock(current);
1850 mempolicy = current->mempolicy; 1870 mempolicy = current->mempolicy;
1851 switch (mempolicy->mode) { 1871 switch (mempolicy->mode) {
1852 case MPOL_PREFERRED: 1872 case MPOL_PREFERRED:
1853 if (mempolicy->flags & MPOL_F_LOCAL) 1873 if (mempolicy->flags & MPOL_F_LOCAL)
1854 nid = numa_node_id(); 1874 nid = numa_node_id();
1855 else 1875 else
1856 nid = mempolicy->v.preferred_node; 1876 nid = mempolicy->v.preferred_node;
1857 init_nodemask_of_node(mask, nid); 1877 init_nodemask_of_node(mask, nid);
1858 break; 1878 break;
1859 1879
1860 case MPOL_BIND: 1880 case MPOL_BIND:
1861 /* Fall through */ 1881 /* Fall through */
1862 case MPOL_INTERLEAVE: 1882 case MPOL_INTERLEAVE:
1863 *mask = mempolicy->v.nodes; 1883 *mask = mempolicy->v.nodes;
1864 break; 1884 break;
1865 1885
1866 default: 1886 default:
1867 BUG(); 1887 BUG();
1868 } 1888 }
1869 task_unlock(current); 1889 task_unlock(current);
1870 1890
1871 return true; 1891 return true;
1872 } 1892 }
1873 #endif 1893 #endif
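
init_nodemask_of_mempolicy() is consumed by callers that want to restrict a per-node operation to the nodes the current task's policy names, falling back to every memory node under the default policy. A hedged sketch of that pattern; NODEMASK_ALLOC/NODEMASK_FREE and node_states[] are stock kernel helpers, the wrapper itself is illustrative.

/*
 * Hedged sketch: restrict a per-node operation to the nodes named by the
 * current task's mempolicy, or to every node with memory under the
 * default policy.
 */
static void sketch_walk_policy_nodes(void)
{
	NODEMASK_ALLOC(nodemask_t, nodes_allowed, GFP_KERNEL | __GFP_NORETRY);
	nodemask_t *nodes = nodes_allowed;
	int nid;

	if (!nodes || !init_nodemask_of_mempolicy(nodes))
		nodes = &node_states[N_HIGH_MEMORY];	/* default policy */

	for_each_node_mask(nid, *nodes) {
		/* ... per-node work honouring the task policy ... */
	}

	NODEMASK_FREE(nodes_allowed);	/* no-op or kfree(); NULL-safe */
}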
1874 1894
1875 /* 1895 /*
1876 * mempolicy_nodemask_intersects 1896 * mempolicy_nodemask_intersects
1877 * 1897 *
1878 * If tsk's mempolicy is "default" [NULL], return 'true' to indicate default 1898 * If tsk's mempolicy is "default" [NULL], return 'true' to indicate default
1879 * policy. Otherwise, check for intersection between mask and the policy 1899 * policy. Otherwise, check for intersection between mask and the policy
1880 * nodemask for 'bind' or 'interleave' policy. For 'preferred' or 'local' 1900 * nodemask for 'bind' or 'interleave' policy. For 'preferred' or 'local'
1881 * policy, always return true since it may allocate elsewhere on fallback. 1901 * policy, always return true since it may allocate elsewhere on fallback.
1882 * 1902 *
1883 * Takes task_lock(tsk) to prevent freeing of its mempolicy. 1903 * Takes task_lock(tsk) to prevent freeing of its mempolicy.
1884 */ 1904 */
1885 bool mempolicy_nodemask_intersects(struct task_struct *tsk, 1905 bool mempolicy_nodemask_intersects(struct task_struct *tsk,
1886 const nodemask_t *mask) 1906 const nodemask_t *mask)
1887 { 1907 {
1888 struct mempolicy *mempolicy; 1908 struct mempolicy *mempolicy;
1889 bool ret = true; 1909 bool ret = true;
1890 1910
1891 if (!mask) 1911 if (!mask)
1892 return ret; 1912 return ret;
1893 task_lock(tsk); 1913 task_lock(tsk);
1894 mempolicy = tsk->mempolicy; 1914 mempolicy = tsk->mempolicy;
1895 if (!mempolicy) 1915 if (!mempolicy)
1896 goto out; 1916 goto out;
1897 1917
1898 switch (mempolicy->mode) { 1918 switch (mempolicy->mode) {
1899 case MPOL_PREFERRED: 1919 case MPOL_PREFERRED:
1900 /* 1920 /*
1901 * MPOL_PREFERRED and MPOL_F_LOCAL are only preferred nodes to 1921 * MPOL_PREFERRED and MPOL_F_LOCAL are only preferred nodes to
1902 * allocate from; they may fall back to other nodes when oom. 1922 * allocate from; they may fall back to other nodes when oom.
1903 * Thus, it's possible for tsk to have allocated memory from 1923 * Thus, it's possible for tsk to have allocated memory from
1904 * nodes in mask. 1924 * nodes in mask.
1905 */ 1925 */
1906 break; 1926 break;
1907 case MPOL_BIND: 1927 case MPOL_BIND:
1908 case MPOL_INTERLEAVE: 1928 case MPOL_INTERLEAVE:
1909 ret = nodes_intersects(mempolicy->v.nodes, *mask); 1929 ret = nodes_intersects(mempolicy->v.nodes, *mask);
1910 break; 1930 break;
1911 default: 1931 default:
1912 BUG(); 1932 BUG();
1913 } 1933 }
1914 out: 1934 out:
1915 task_unlock(tsk); 1935 task_unlock(tsk);
1916 return ret; 1936 return ret;
1917 } 1937 }
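
The intended consumer is OOM-selection style code that must decide whether a candidate task could plausibly hold memory on the nodes under pressure; 'preferred'/'local' tasks always count because they may have fallen back elsewhere. A hedged, illustrative wrapper:

/*
 * Hedged sketch: a task whose policy cannot intersect the nodes under
 * pressure is not worth killing for this allocation.  The wrapper name
 * is illustrative.
 */
static bool sketch_task_worth_killing(struct task_struct *tsk,
				      const nodemask_t *pressured_nodes)
{
	return mempolicy_nodemask_intersects(tsk, pressured_nodes);
}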
1918 1938
1919 /* Allocate a page in interleaved policy. 1939 /* Allocate a page in interleaved policy.
1920 Own path because it needs to do special accounting. */ 1940 Own path because it needs to do special accounting. */
1921 static struct page *alloc_page_interleave(gfp_t gfp, unsigned order, 1941 static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
1922 unsigned nid) 1942 unsigned nid)
1923 { 1943 {
1924 struct zonelist *zl; 1944 struct zonelist *zl;
1925 struct page *page; 1945 struct page *page;
1926 1946
1927 zl = node_zonelist(nid, gfp); 1947 zl = node_zonelist(nid, gfp);
1928 page = __alloc_pages(gfp, order, zl); 1948 page = __alloc_pages(gfp, order, zl);
1929 if (page && page_zone(page) == zonelist_zone(&zl->_zonerefs[0])) 1949 if (page && page_zone(page) == zonelist_zone(&zl->_zonerefs[0]))
1930 inc_zone_page_state(page, NUMA_INTERLEAVE_HIT); 1950 inc_zone_page_state(page, NUMA_INTERLEAVE_HIT);
1931 return page; 1951 return page;
1932 } 1952 }
1933 1953
1934 /** 1954 /**
1935 * alloc_pages_vma - Allocate a page for a VMA. 1955 * alloc_pages_vma - Allocate a page for a VMA.
1936 * 1956 *
1937 * @gfp: 1957 * @gfp:
1938 * %GFP_USER user allocation. 1958 * %GFP_USER user allocation.
1939 * %GFP_KERNEL kernel allocations, 1959 * %GFP_KERNEL kernel allocations,
1940 * %GFP_HIGHMEM highmem/user allocations, 1960 * %GFP_HIGHMEM highmem/user allocations,
1941 * %GFP_FS allocation should not call back into a file system. 1961 * %GFP_FS allocation should not call back into a file system.
1942 * %GFP_ATOMIC don't sleep. 1962 * %GFP_ATOMIC don't sleep.
1943 * 1963 *
1944 * @order:Order of the GFP allocation. 1964 * @order:Order of the GFP allocation.
1945 * @vma: Pointer to VMA or NULL if not available. 1965 * @vma: Pointer to VMA or NULL if not available.
1946 * @addr: Virtual Address of the allocation. Must be inside the VMA. 1966 * @addr: Virtual Address of the allocation. Must be inside the VMA.
1947 * 1967 *
1948 * This function allocates a page from the kernel page pool and applies 1968 * This function allocates a page from the kernel page pool and applies
1949 * a NUMA policy associated with the VMA or the current process. 1969 * a NUMA policy associated with the VMA or the current process.
1950 * When VMA is not NULL, caller must hold down_read on the mmap_sem of the 1970 * When VMA is not NULL, caller must hold down_read on the mmap_sem of the
1951 * mm_struct of the VMA to prevent it from going away. Should be used for 1971 * mm_struct of the VMA to prevent it from going away. Should be used for
1952 * all allocations for pages that will be mapped into 1972 * all allocations for pages that will be mapped into
1953 * user space. Returns NULL when no page can be allocated. 1973 * user space. Returns NULL when no page can be allocated.
1954 * 1974 *
1955 * Should be called with the mmap_sem of the vma held. 1975 * Should be called with the mmap_sem of the vma held.
1956 */ 1976 */
1957 struct page * 1977 struct page *
1958 alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma, 1978 alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
1959 unsigned long addr, int node) 1979 unsigned long addr, int node)
1960 { 1980 {
1961 struct mempolicy *pol; 1981 struct mempolicy *pol;
1962 struct zonelist *zl; 1982 struct zonelist *zl;
1963 struct page *page; 1983 struct page *page;
1964 unsigned int cpuset_mems_cookie; 1984 unsigned int cpuset_mems_cookie;
1965 1985
1966 retry_cpuset: 1986 retry_cpuset:
1967 pol = get_vma_policy(current, vma, addr); 1987 pol = get_vma_policy(current, vma, addr);
1968 cpuset_mems_cookie = get_mems_allowed(); 1988 cpuset_mems_cookie = get_mems_allowed();
1969 1989
1970 if (unlikely(pol->mode == MPOL_INTERLEAVE)) { 1990 if (unlikely(pol->mode == MPOL_INTERLEAVE)) {
1971 unsigned nid; 1991 unsigned nid;
1972 1992
1973 nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order); 1993 nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order);
1974 mpol_cond_put(pol); 1994 mpol_cond_put(pol);
1975 page = alloc_page_interleave(gfp, order, nid); 1995 page = alloc_page_interleave(gfp, order, nid);
1976 if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page)) 1996 if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
1977 goto retry_cpuset; 1997 goto retry_cpuset;
1978 1998
1979 return page; 1999 return page;
1980 } 2000 }
1981 zl = policy_zonelist(gfp, pol, node); 2001 zl = policy_zonelist(gfp, pol, node);
1982 if (unlikely(mpol_needs_cond_ref(pol))) { 2002 if (unlikely(mpol_needs_cond_ref(pol))) {
1983 /* 2003 /*
1984 * slow path: ref counted shared policy 2004 * slow path: ref counted shared policy
1985 */ 2005 */
1986 struct page *page = __alloc_pages_nodemask(gfp, order, 2006 struct page *page = __alloc_pages_nodemask(gfp, order,
1987 zl, policy_nodemask(gfp, pol)); 2007 zl, policy_nodemask(gfp, pol));
1988 __mpol_put(pol); 2008 __mpol_put(pol);
1989 if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page)) 2009 if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
1990 goto retry_cpuset; 2010 goto retry_cpuset;
1991 return page; 2011 return page;
1992 } 2012 }
1993 /* 2013 /*
1994 * fast path: default or task policy 2014 * fast path: default or task policy
1995 */ 2015 */
1996 page = __alloc_pages_nodemask(gfp, order, zl, 2016 page = __alloc_pages_nodemask(gfp, order, zl,
1997 policy_nodemask(gfp, pol)); 2017 policy_nodemask(gfp, pol));
1998 if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page)) 2018 if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
1999 goto retry_cpuset; 2019 goto retry_cpuset;
2000 return page; 2020 return page;
2001 } 2021 }
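
Most callers want a single order-0 page for a faulting address; in the kernel of this era that is usually spelled through the alloc_page_vma() wrapper, which passes order 0 and the local node. A hedged sketch of the usual pattern; the helper name is illustrative.

/*
 * Hedged sketch: allocate one user page obeying the VMA/task policy.
 * Caller holds down_read(&vma->vm_mm->mmap_sem) as required above.
 */
static struct page *sketch_alloc_user_page(struct vm_area_struct *vma,
					   unsigned long addr)
{
	return alloc_pages_vma(GFP_HIGHUSER_MOVABLE, 0, vma, addr,
			       numa_node_id());
}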
2002 2022
2003 /** 2023 /**
2004 * alloc_pages_current - Allocate pages. 2024 * alloc_pages_current - Allocate pages.
2005 * 2025 *
2006 * @gfp: 2026 * @gfp:
2007 * %GFP_USER user allocation, 2027 * %GFP_USER user allocation,
2008 * %GFP_KERNEL kernel allocation, 2028 * %GFP_KERNEL kernel allocation,
2009 * %GFP_HIGHMEM highmem allocation, 2029 * %GFP_HIGHMEM highmem allocation,
2010 * %GFP_FS don't call back into a file system. 2030 * %GFP_FS don't call back into a file system.
2011 * %GFP_ATOMIC don't sleep. 2031 * %GFP_ATOMIC don't sleep.
2012 * @order: Power of two of allocation size in pages. 0 is a single page. 2032 * @order: Power of two of allocation size in pages. 0 is a single page.
2013 * 2033 *
2014 * Allocate a page from the kernel page pool. When not in 2034 * Allocate a page from the kernel page pool. When not in
2015 * interrupt context, apply the current process NUMA policy. 2035 * interrupt context, apply the current process NUMA policy.
2016 * Returns NULL when no page can be allocated. 2036 * Returns NULL when no page can be allocated.
2017 * 2037 *
2018 * Don't call cpuset_update_task_memory_state() unless 2038 * Don't call cpuset_update_task_memory_state() unless
2019 * 1) it's ok to take cpuset_sem (can WAIT), and 2039 * 1) it's ok to take cpuset_sem (can WAIT), and
2020 * 2) allocating for current task (not interrupt). 2040 * 2) allocating for current task (not interrupt).
2021 */ 2041 */
2022 struct page *alloc_pages_current(gfp_t gfp, unsigned order) 2042 struct page *alloc_pages_current(gfp_t gfp, unsigned order)
2023 { 2043 {
2024 struct mempolicy *pol = current->mempolicy; 2044 struct mempolicy *pol = get_task_policy(current);
2025 struct page *page; 2045 struct page *page;
2026 unsigned int cpuset_mems_cookie; 2046 unsigned int cpuset_mems_cookie;
2027 2047
2028 if (!pol || in_interrupt() || (gfp & __GFP_THISNODE)) 2048 if (!pol || in_interrupt() || (gfp & __GFP_THISNODE))
2029 pol = &default_policy; 2049 pol = &default_policy;
2030 2050
2031 retry_cpuset: 2051 retry_cpuset:
2032 cpuset_mems_cookie = get_mems_allowed(); 2052 cpuset_mems_cookie = get_mems_allowed();
2033 2053
2034 /* 2054 /*
2035 * No reference counting needed for current->mempolicy 2055 * No reference counting needed for current->mempolicy
2036 * nor system default_policy 2056 * nor system default_policy
2037 */ 2057 */
2038 if (pol->mode == MPOL_INTERLEAVE) 2058 if (pol->mode == MPOL_INTERLEAVE)
2039 page = alloc_page_interleave(gfp, order, interleave_nodes(pol)); 2059 page = alloc_page_interleave(gfp, order, interleave_nodes(pol));
2040 else 2060 else
2041 page = __alloc_pages_nodemask(gfp, order, 2061 page = __alloc_pages_nodemask(gfp, order,
2042 policy_zonelist(gfp, pol, numa_node_id()), 2062 policy_zonelist(gfp, pol, numa_node_id()),
2043 policy_nodemask(gfp, pol)); 2063 policy_nodemask(gfp, pol));
2044 2064
2045 if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page)) 2065 if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
2046 goto retry_cpuset; 2066 goto retry_cpuset;
2047 2067
2048 return page; 2068 return page;
2049 } 2069 }
2050 EXPORT_SYMBOL(alloc_pages_current); 2070 EXPORT_SYMBOL(alloc_pages_current);
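
Because alloc_pages_current() backs the generic alloc_pages()/alloc_page() helpers on NUMA builds, ordinary kernel allocations already follow the caller's task policy without extra plumbing; a hedged illustration:

/*
 * Hedged sketch: a plain kernel-page allocation.  On CONFIG_NUMA this
 * resolves to alloc_pages_current() above, so the current task's
 * interleave/bind/preferred policy is applied transparently.
 */
static void *sketch_grab_scratch_page(void)
{
	struct page *page = alloc_page(GFP_KERNEL);

	return page ? page_address(page) : NULL;
}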
2051 2071
2052 /* 2072 /*
2053 * If mpol_dup() sees current->cpuset == cpuset_being_rebound, then it 2073 * If mpol_dup() sees current->cpuset == cpuset_being_rebound, then it
2054 * rebinds the mempolicy it is copying by calling mpol_rebind_policy() 2074 * rebinds the mempolicy it is copying by calling mpol_rebind_policy()
2055 * with the mems_allowed returned by cpuset_mems_allowed(). This 2075 * with the mems_allowed returned by cpuset_mems_allowed(). This
2056 * keeps mempolicies cpuset relative after its cpuset moves. See 2076 * keeps mempolicies cpuset relative after its cpuset moves. See
2057 * further kernel/cpuset.c update_nodemask(). 2077 * further kernel/cpuset.c update_nodemask().
2058 * 2078 *
2059 * current's mempolicy may be rebound by the other task (the task that changes 2079 * current's mempolicy may be rebound by the other task (the task that changes
2060 * cpuset's mems), so we needn't do rebind work for current task. 2080 * cpuset's mems), so we needn't do rebind work for current task.
2061 */ 2081 */
2062 2082
2063 /* Slow path of a mempolicy duplicate */ 2083 /* Slow path of a mempolicy duplicate */
2064 struct mempolicy *__mpol_dup(struct mempolicy *old) 2084 struct mempolicy *__mpol_dup(struct mempolicy *old)
2065 { 2085 {
2066 struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL); 2086 struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
2067 2087
2068 if (!new) 2088 if (!new)
2069 return ERR_PTR(-ENOMEM); 2089 return ERR_PTR(-ENOMEM);
2070 2090
2071 /* task's mempolicy is protected by alloc_lock */ 2091 /* task's mempolicy is protected by alloc_lock */
2072 if (old == current->mempolicy) { 2092 if (old == current->mempolicy) {
2073 task_lock(current); 2093 task_lock(current);
2074 *new = *old; 2094 *new = *old;
2075 task_unlock(current); 2095 task_unlock(current);
2076 } else 2096 } else
2077 *new = *old; 2097 *new = *old;
2078 2098
2079 rcu_read_lock(); 2099 rcu_read_lock();
2080 if (current_cpuset_is_being_rebound()) { 2100 if (current_cpuset_is_being_rebound()) {
2081 nodemask_t mems = cpuset_mems_allowed(current); 2101 nodemask_t mems = cpuset_mems_allowed(current);
2082 if (new->flags & MPOL_F_REBINDING) 2102 if (new->flags & MPOL_F_REBINDING)
2083 mpol_rebind_policy(new, &mems, MPOL_REBIND_STEP2); 2103 mpol_rebind_policy(new, &mems, MPOL_REBIND_STEP2);
2084 else 2104 else
2085 mpol_rebind_policy(new, &mems, MPOL_REBIND_ONCE); 2105 mpol_rebind_policy(new, &mems, MPOL_REBIND_ONCE);
2086 } 2106 }
2087 rcu_read_unlock(); 2107 rcu_read_unlock();
2088 atomic_set(&new->refcnt, 1); 2108 atomic_set(&new->refcnt, 1);
2089 return new; 2109 return new;
2090 } 2110 }
2091 2111
2092 /* 2112 /*
2093 * If *frompol needs [has] an extra ref, copy *frompol to *tompol, 2113 * If *frompol needs [has] an extra ref, copy *frompol to *tompol,
2094 * eliminate the MPOL_F_* flags that require conditional ref and 2114 * eliminate the MPOL_F_* flags that require conditional ref and
2095 * [NOTE!!!] drop the extra ref. Not safe to reference *frompol directly 2115 * [NOTE!!!] drop the extra ref. Not safe to reference *frompol directly
2096 * after return. Use the returned value. 2116 * after return. Use the returned value.
2097 * 2117 *
2098 * Allows use of a mempolicy for, e.g., multiple allocations with a single 2118 * Allows use of a mempolicy for, e.g., multiple allocations with a single
2099 * policy lookup, even if the policy needs/has extra ref on lookup. 2119 * policy lookup, even if the policy needs/has extra ref on lookup.
2100 * shmem_readahead needs this. 2120 * shmem_readahead needs this.
2101 */ 2121 */
2102 struct mempolicy *__mpol_cond_copy(struct mempolicy *tompol, 2122 struct mempolicy *__mpol_cond_copy(struct mempolicy *tompol,
2103 struct mempolicy *frompol) 2123 struct mempolicy *frompol)
2104 { 2124 {
2105 if (!mpol_needs_cond_ref(frompol)) 2125 if (!mpol_needs_cond_ref(frompol))
2106 return frompol; 2126 return frompol;
2107 2127
2108 *tompol = *frompol; 2128 *tompol = *frompol;
2109 tompol->flags &= ~MPOL_F_SHARED; /* copy doesn't need unref */ 2129 tompol->flags &= ~MPOL_F_SHARED; /* copy doesn't need unref */
2110 __mpol_put(frompol); 2130 __mpol_put(frompol);
2111 return tompol; 2131 return tompol;
2112 } 2132 }
2113 2133
2114 /* Slow path of a mempolicy comparison */ 2134 /* Slow path of a mempolicy comparison */
2115 bool __mpol_equal(struct mempolicy *a, struct mempolicy *b) 2135 bool __mpol_equal(struct mempolicy *a, struct mempolicy *b)
2116 { 2136 {
2117 if (!a || !b) 2137 if (!a || !b)
2118 return false; 2138 return false;
2119 if (a->mode != b->mode) 2139 if (a->mode != b->mode)
2120 return false; 2140 return false;
2121 if (a->flags != b->flags) 2141 if (a->flags != b->flags)
2122 return false; 2142 return false;
2123 if (mpol_store_user_nodemask(a)) 2143 if (mpol_store_user_nodemask(a))
2124 if (!nodes_equal(a->w.user_nodemask, b->w.user_nodemask)) 2144 if (!nodes_equal(a->w.user_nodemask, b->w.user_nodemask))
2125 return false; 2145 return false;
2126 2146
2127 switch (a->mode) { 2147 switch (a->mode) {
2128 case MPOL_BIND: 2148 case MPOL_BIND:
2129 /* Fall through */ 2149 /* Fall through */
2130 case MPOL_INTERLEAVE: 2150 case MPOL_INTERLEAVE:
2131 return !!nodes_equal(a->v.nodes, b->v.nodes); 2151 return !!nodes_equal(a->v.nodes, b->v.nodes);
2132 case MPOL_PREFERRED: 2152 case MPOL_PREFERRED:
2133 return a->v.preferred_node == b->v.preferred_node; 2153 return a->v.preferred_node == b->v.preferred_node;
2134 default: 2154 default:
2135 BUG(); 2155 BUG();
2136 return false; 2156 return false;
2137 } 2157 }
2138 } 2158 }
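
__mpol_equal() sits behind the mpol_equal() wrapper, which VMA-merging style code uses to decide whether two adjacent mappings carry interchangeable policies. A hedged illustration; vma_policy() is the usual accessor for a VMA's policy and is assumed rather than shown here.

/*
 * Hedged sketch: only mappings with equal policies should be considered
 * for merging, otherwise one side would silently inherit the other's
 * placement rules.
 */
static bool sketch_policies_mergeable(struct vm_area_struct *a,
				      struct vm_area_struct *b)
{
	return mpol_equal(vma_policy(a), vma_policy(b));
}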
2139 2159
2140 /* 2160 /*
2141 * Shared memory backing store policy support. 2161 * Shared memory backing store policy support.
2142 * 2162 *
2143 * Remember policies even when nobody has shared memory mapped. 2163 * Remember policies even when nobody has shared memory mapped.
2144 * The policies are kept in Red-Black tree linked from the inode. 2164 * The policies are kept in Red-Black tree linked from the inode.
2145 * They are protected by the sp->lock spinlock, which should be held 2165 * They are protected by the sp->lock spinlock, which should be held
2146 * for any accesses to the tree. 2166 * for any accesses to the tree.
2147 */ 2167 */
2148 2168
2149 /* lookup first element intersecting start-end */ 2169 /* lookup first element intersecting start-end */
2150 /* Caller holds sp->mutex */ 2170 /* Caller holds sp->mutex */
2151 static struct sp_node * 2171 static struct sp_node *
2152 sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end) 2172 sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
2153 { 2173 {
2154 struct rb_node *n = sp->root.rb_node; 2174 struct rb_node *n = sp->root.rb_node;
2155 2175
2156 while (n) { 2176 while (n) {
2157 struct sp_node *p = rb_entry(n, struct sp_node, nd); 2177 struct sp_node *p = rb_entry(n, struct sp_node, nd);
2158 2178
2159 if (start >= p->end) 2179 if (start >= p->end)
2160 n = n->rb_right; 2180 n = n->rb_right;
2161 else if (end <= p->start) 2181 else if (end <= p->start)
2162 n = n->rb_left; 2182 n = n->rb_left;
2163 else 2183 else
2164 break; 2184 break;
2165 } 2185 }
2166 if (!n) 2186 if (!n)
2167 return NULL; 2187 return NULL;
2168 for (;;) { 2188 for (;;) {
2169 struct sp_node *w = NULL; 2189 struct sp_node *w = NULL;
2170 struct rb_node *prev = rb_prev(n); 2190 struct rb_node *prev = rb_prev(n);
2171 if (!prev) 2191 if (!prev)
2172 break; 2192 break;
2173 w = rb_entry(prev, struct sp_node, nd); 2193 w = rb_entry(prev, struct sp_node, nd);
2174 if (w->end <= start) 2194 if (w->end <= start)
2175 break; 2195 break;
2176 n = prev; 2196 n = prev;
2177 } 2197 }
2178 return rb_entry(n, struct sp_node, nd); 2198 return rb_entry(n, struct sp_node, nd);
2179 } 2199 }
2180 2200
2181 /* Insert a new shared policy into the list. */ 2201 /* Insert a new shared policy into the list. */
2182 /* Caller holds sp->lock */ 2202 /* Caller holds sp->lock */
2183 static void sp_insert(struct shared_policy *sp, struct sp_node *new) 2203 static void sp_insert(struct shared_policy *sp, struct sp_node *new)
2184 { 2204 {
2185 struct rb_node **p = &sp->root.rb_node; 2205 struct rb_node **p = &sp->root.rb_node;
2186 struct rb_node *parent = NULL; 2206 struct rb_node *parent = NULL;
2187 struct sp_node *nd; 2207 struct sp_node *nd;
2188 2208
2189 while (*p) { 2209 while (*p) {
2190 parent = *p; 2210 parent = *p;
2191 nd = rb_entry(parent, struct sp_node, nd); 2211 nd = rb_entry(parent, struct sp_node, nd);
2192 if (new->start < nd->start) 2212 if (new->start < nd->start)
2193 p = &(*p)->rb_left; 2213 p = &(*p)->rb_left;
2194 else if (new->end > nd->end) 2214 else if (new->end > nd->end)
2195 p = &(*p)->rb_right; 2215 p = &(*p)->rb_right;
2196 else 2216 else
2197 BUG(); 2217 BUG();
2198 } 2218 }
2199 rb_link_node(&new->nd, parent, p); 2219 rb_link_node(&new->nd, parent, p);
2200 rb_insert_color(&new->nd, &sp->root); 2220 rb_insert_color(&new->nd, &sp->root);
2201 pr_debug("inserting %lx-%lx: %d\n", new->start, new->end, 2221 pr_debug("inserting %lx-%lx: %d\n", new->start, new->end,
2202 new->policy ? new->policy->mode : 0); 2222 new->policy ? new->policy->mode : 0);
2203 } 2223 }
2204 2224
2205 /* Find shared policy intersecting idx */ 2225 /* Find shared policy intersecting idx */
2206 struct mempolicy * 2226 struct mempolicy *
2207 mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx) 2227 mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
2208 { 2228 {
2209 struct mempolicy *pol = NULL; 2229 struct mempolicy *pol = NULL;
2210 struct sp_node *sn; 2230 struct sp_node *sn;
2211 2231
2212 if (!sp->root.rb_node) 2232 if (!sp->root.rb_node)
2213 return NULL; 2233 return NULL;
2214 mutex_lock(&sp->mutex); 2234 mutex_lock(&sp->mutex);
2215 sn = sp_lookup(sp, idx, idx+1); 2235 sn = sp_lookup(sp, idx, idx+1);
2216 if (sn) { 2236 if (sn) {
2217 mpol_get(sn->policy); 2237 mpol_get(sn->policy);
2218 pol = sn->policy; 2238 pol = sn->policy;
2219 } 2239 }
2220 mutex_unlock(&sp->mutex); 2240 mutex_unlock(&sp->mutex);
2221 return pol; 2241 return pol;
2222 } 2242 }
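
mpol_shared_policy_lookup() is what a tmpfs/shmem ->get_policy() implementation calls for a given file index; the returned policy carries an extra reference that the caller must drop. A hedged sketch; SHMEM_I() and the inode layout belong to tmpfs and are assumptions here, not part of this diff.

/*
 * Hedged sketch: per-index shared-policy lookup, shmem style.  The caller
 * must mpol_cond_put()/mpol_put() the result when finished with it.
 */
static struct mempolicy *sketch_shmem_get_policy(struct inode *inode,
						 pgoff_t index)
{
	return mpol_shared_policy_lookup(&SHMEM_I(inode)->policy, index);
}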
2223 2243
2224 static void sp_free(struct sp_node *n) 2244 static void sp_free(struct sp_node *n)
2225 { 2245 {
2226 mpol_put(n->policy); 2246 mpol_put(n->policy);
2227 kmem_cache_free(sn_cache, n); 2247 kmem_cache_free(sn_cache, n);
2228 } 2248 }
2229 2249
2230 /** 2250 /**
2231 * mpol_misplaced - check whether current page node is valid in policy 2251 * mpol_misplaced - check whether current page node is valid in policy
2232 * 2252 *
2233 * @page - page to be checked 2253 * @page - page to be checked
2234 * @vma - vm area where page mapped 2254 * @vma - vm area where page mapped
2235 * @addr - virtual address where page mapped 2255 * @addr - virtual address where page mapped
2236 * 2256 *
2237 * Lookup current policy node id for vma,addr and "compare to" page's 2257 * Lookup current policy node id for vma,addr and "compare to" page's
2238 * node id. 2258 * node id.
2239 * 2259 *
2240 * Returns: 2260 * Returns:
2241 * -1 - not misplaced, page is in the right node 2261 * -1 - not misplaced, page is in the right node
2242 * node - node id where the page should be 2262 * node - node id where the page should be
2243 * 2263 *
2244 * Policy determination "mimics" alloc_page_vma(). 2264 * Policy determination "mimics" alloc_page_vma().
2245 * Called from fault path where we know the vma and faulting address. 2265 * Called from fault path where we know the vma and faulting address.
2246 */ 2266 */
2247 int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long addr) 2267 int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long addr)
2248 { 2268 {
2249 struct mempolicy *pol; 2269 struct mempolicy *pol;
2250 struct zone *zone; 2270 struct zone *zone;
2251 int curnid = page_to_nid(page); 2271 int curnid = page_to_nid(page);
2252 unsigned long pgoff; 2272 unsigned long pgoff;
2253 int polnid = -1; 2273 int polnid = -1;
2254 int ret = -1; 2274 int ret = -1;
2255 2275
2256 BUG_ON(!vma); 2276 BUG_ON(!vma);
2257 2277
2258 pol = get_vma_policy(current, vma, addr); 2278 pol = get_vma_policy(current, vma, addr);
2259 if (!(pol->flags & MPOL_F_MOF)) 2279 if (!(pol->flags & MPOL_F_MOF))
2260 goto out; 2280 goto out;
2261 2281
2262 switch (pol->mode) { 2282 switch (pol->mode) {
2263 case MPOL_INTERLEAVE: 2283 case MPOL_INTERLEAVE:
2264 BUG_ON(addr >= vma->vm_end); 2284 BUG_ON(addr >= vma->vm_end);
2265 BUG_ON(addr < vma->vm_start); 2285 BUG_ON(addr < vma->vm_start);
2266 2286
2267 pgoff = vma->vm_pgoff; 2287 pgoff = vma->vm_pgoff;
2268 pgoff += (addr - vma->vm_start) >> PAGE_SHIFT; 2288 pgoff += (addr - vma->vm_start) >> PAGE_SHIFT;
2269 polnid = offset_il_node(pol, vma, pgoff); 2289 polnid = offset_il_node(pol, vma, pgoff);
2270 break; 2290 break;
2271 2291
2272 case MPOL_PREFERRED: 2292 case MPOL_PREFERRED:
2273 if (pol->flags & MPOL_F_LOCAL) 2293 if (pol->flags & MPOL_F_LOCAL)
2274 polnid = numa_node_id(); 2294 polnid = numa_node_id();
2275 else 2295 else
2276 polnid = pol->v.preferred_node; 2296 polnid = pol->v.preferred_node;
2277 break; 2297 break;
2278 2298
2279 case MPOL_BIND: 2299 case MPOL_BIND:
2280 /* 2300 /*
2281 * allows binding to multiple nodes. 2301 * allows binding to multiple nodes.
2282 * use current page if in policy nodemask, 2302 * use current page if in policy nodemask,
2283 * else select nearest allowed node, if any. 2303 * else select nearest allowed node, if any.
2284 * If no allowed nodes, use current [!misplaced]. 2304 * If no allowed nodes, use current [!misplaced].
2285 */ 2305 */
2286 if (node_isset(curnid, pol->v.nodes)) 2306 if (node_isset(curnid, pol->v.nodes))
2287 goto out; 2307 goto out;
2288 (void)first_zones_zonelist( 2308 (void)first_zones_zonelist(
2289 node_zonelist(numa_node_id(), GFP_HIGHUSER), 2309 node_zonelist(numa_node_id(), GFP_HIGHUSER),
2290 gfp_zone(GFP_HIGHUSER), 2310 gfp_zone(GFP_HIGHUSER),
2291 &pol->v.nodes, &zone); 2311 &pol->v.nodes, &zone);
2292 polnid = zone->node; 2312 polnid = zone->node;
2293 break; 2313 break;
2294 2314
2295 default: 2315 default:
2296 BUG(); 2316 BUG();
2297 } 2317 }
2318
2319 /* Migrate the page towards the node whose CPU is referencing it */
2320 if (pol->flags & MPOL_F_MORON)
2321 polnid = numa_node_id();
2322
2298 if (curnid != polnid) 2323 if (curnid != polnid)
2299 ret = polnid; 2324 ret = polnid;
2300 out: 2325 out:
2301 mpol_cond_put(pol); 2326 mpol_cond_put(pol);
2302 2327
2303 return ret; 2328 return ret;
2304 } 2329 }
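
The new MPOL_F_MORON branch is what implements this commit's migrate-on-reference behaviour: whenever the flag is set, the "correct" node is simply the node of the CPU taking the fault. A hedged sketch of how a NUMA-hinting fault handler is expected to consume the return value; migrate_misplaced_page() comes from the surrounding patch series and the handler name is illustrative.

/*
 * Hedged sketch: NUMA hinting fault handling.  mpol_misplaced() returns -1
 * when the page is already where the policy wants it, otherwise the node
 * it should move to -- with MPOL_F_MORON, always the faulting CPU's node.
 */
static void sketch_numa_hinting_fault(struct page *page,
				      struct vm_area_struct *vma,
				      unsigned long addr)
{
	int target_nid = mpol_misplaced(page, vma, addr);

	if (target_nid == -1)
		return;			/* correctly placed, nothing to do */

	migrate_misplaced_page(page, target_nid);
}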
2305 2330
2306 static void sp_delete(struct shared_policy *sp, struct sp_node *n) 2331 static void sp_delete(struct shared_policy *sp, struct sp_node *n)
2307 { 2332 {
2308 pr_debug("deleting %lx-%lx\n", n->start, n->end); 2333 pr_debug("deleting %lx-%lx\n", n->start, n->end);
2309 rb_erase(&n->nd, &sp->root); 2334 rb_erase(&n->nd, &sp->root);
2310 sp_free(n); 2335 sp_free(n);
2311 } 2336 }
2312 2337
2313 static struct sp_node *sp_alloc(unsigned long start, unsigned long end, 2338 static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
2314 struct mempolicy *pol) 2339 struct mempolicy *pol)
2315 { 2340 {
2316 struct sp_node *n; 2341 struct sp_node *n;
2317 struct mempolicy *newpol; 2342 struct mempolicy *newpol;
2318 2343
2319 n = kmem_cache_alloc(sn_cache, GFP_KERNEL); 2344 n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
2320 if (!n) 2345 if (!n)
2321 return NULL; 2346 return NULL;
2322 2347
2323 newpol = mpol_dup(pol); 2348 newpol = mpol_dup(pol);
2324 if (IS_ERR(newpol)) { 2349 if (IS_ERR(newpol)) {
2325 kmem_cache_free(sn_cache, n); 2350 kmem_cache_free(sn_cache, n);
2326 return NULL; 2351 return NULL;
2327 } 2352 }
2328 newpol->flags |= MPOL_F_SHARED; 2353 newpol->flags |= MPOL_F_SHARED;
2329 2354
2330 n->start = start; 2355 n->start = start;
2331 n->end = end; 2356 n->end = end;
2332 n->policy = newpol; 2357 n->policy = newpol;
2333 2358
2334 return n; 2359 return n;
2335 } 2360 }
2336 2361
2337 /* Replace a policy range. */ 2362 /* Replace a policy range. */
2338 static int shared_policy_replace(struct shared_policy *sp, unsigned long start, 2363 static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
2339 unsigned long end, struct sp_node *new) 2364 unsigned long end, struct sp_node *new)
2340 { 2365 {
2341 struct sp_node *n; 2366 struct sp_node *n;
2342 int ret = 0; 2367 int ret = 0;
2343 2368
2344 mutex_lock(&sp->mutex); 2369 mutex_lock(&sp->mutex);
2345 n = sp_lookup(sp, start, end); 2370 n = sp_lookup(sp, start, end);
2346 /* Take care of old policies in the same range. */ 2371 /* Take care of old policies in the same range. */
2347 while (n && n->start < end) { 2372 while (n && n->start < end) {
2348 struct rb_node *next = rb_next(&n->nd); 2373 struct rb_node *next = rb_next(&n->nd);
2349 if (n->start >= start) { 2374 if (n->start >= start) {
2350 if (n->end <= end) 2375 if (n->end <= end)
2351 sp_delete(sp, n); 2376 sp_delete(sp, n);
2352 else 2377 else
2353 n->start = end; 2378 n->start = end;
2354 } else { 2379 } else {
2355 /* Old policy spanning whole new range. */ 2380 /* Old policy spanning whole new range. */
2356 if (n->end > end) { 2381 if (n->end > end) {
2357 struct sp_node *new2; 2382 struct sp_node *new2;
2358 new2 = sp_alloc(end, n->end, n->policy); 2383 new2 = sp_alloc(end, n->end, n->policy);
2359 if (!new2) { 2384 if (!new2) {
2360 ret = -ENOMEM; 2385 ret = -ENOMEM;
2361 goto out; 2386 goto out;
2362 } 2387 }
2363 n->end = start; 2388 n->end = start;
2364 sp_insert(sp, new2); 2389 sp_insert(sp, new2);
2365 break; 2390 break;
2366 } else 2391 } else
2367 n->end = start; 2392 n->end = start;
2368 } 2393 }
2369 if (!next) 2394 if (!next)
2370 break; 2395 break;
2371 n = rb_entry(next, struct sp_node, nd); 2396 n = rb_entry(next, struct sp_node, nd);
2372 } 2397 }
2373 if (new) 2398 if (new)
2374 sp_insert(sp, new); 2399 sp_insert(sp, new);
2375 out: 2400 out:
2376 mutex_unlock(&sp->mutex); 2401 mutex_unlock(&sp->mutex);
2377 return ret; 2402 return ret;
2378 } 2403 }
2379 2404
2380 /** 2405 /**
2381 * mpol_shared_policy_init - initialize shared policy for inode 2406 * mpol_shared_policy_init - initialize shared policy for inode
2382 * @sp: pointer to inode shared policy 2407 * @sp: pointer to inode shared policy
2383 * @mpol: struct mempolicy to install 2408 * @mpol: struct mempolicy to install
2384 * 2409 *
2385 * Install non-NULL @mpol in inode's shared policy rb-tree. 2410 * Install non-NULL @mpol in inode's shared policy rb-tree.
2386 * On entry, the current task has a reference on a non-NULL @mpol. 2411 * On entry, the current task has a reference on a non-NULL @mpol.
2387 * This must be released on exit. 2412 * This must be released on exit.
2388 * This is called at get_inode() time, so we can use GFP_KERNEL. 2413 * This is called at get_inode() time, so we can use GFP_KERNEL.
2389 */ 2414 */
2390 void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol) 2415 void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
2391 { 2416 {
2392 int ret; 2417 int ret;
2393 2418
2394 sp->root = RB_ROOT; /* empty tree == default mempolicy */ 2419 sp->root = RB_ROOT; /* empty tree == default mempolicy */
2395 mutex_init(&sp->mutex); 2420 mutex_init(&sp->mutex);
2396 2421
2397 if (mpol) { 2422 if (mpol) {
2398 struct vm_area_struct pvma; 2423 struct vm_area_struct pvma;
2399 struct mempolicy *new; 2424 struct mempolicy *new;
2400 NODEMASK_SCRATCH(scratch); 2425 NODEMASK_SCRATCH(scratch);
2401 2426
2402 if (!scratch) 2427 if (!scratch)
2403 goto put_mpol; 2428 goto put_mpol;
2404 /* contextualize the tmpfs mount point mempolicy */ 2429 /* contextualize the tmpfs mount point mempolicy */
2405 new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask); 2430 new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask);
2406 if (IS_ERR(new)) 2431 if (IS_ERR(new))
2407 goto free_scratch; /* no valid nodemask intersection */ 2432 goto free_scratch; /* no valid nodemask intersection */
2408 2433
2409 task_lock(current); 2434 task_lock(current);
2410 ret = mpol_set_nodemask(new, &mpol->w.user_nodemask, scratch); 2435 ret = mpol_set_nodemask(new, &mpol->w.user_nodemask, scratch);
2411 task_unlock(current); 2436 task_unlock(current);
2412 if (ret) 2437 if (ret)
2413 goto put_new; 2438 goto put_new;
2414 2439
2415 /* Create pseudo-vma that contains just the policy */ 2440 /* Create pseudo-vma that contains just the policy */
2416 memset(&pvma, 0, sizeof(struct vm_area_struct)); 2441 memset(&pvma, 0, sizeof(struct vm_area_struct));
2417 pvma.vm_end = TASK_SIZE; /* policy covers entire file */ 2442 pvma.vm_end = TASK_SIZE; /* policy covers entire file */
2418 mpol_set_shared_policy(sp, &pvma, new); /* adds ref */ 2443 mpol_set_shared_policy(sp, &pvma, new); /* adds ref */
2419 2444
2420 put_new: 2445 put_new:
2421 mpol_put(new); /* drop initial ref */ 2446 mpol_put(new); /* drop initial ref */
2422 free_scratch: 2447 free_scratch:
2423 NODEMASK_SCRATCH_FREE(scratch); 2448 NODEMASK_SCRATCH_FREE(scratch);
2424 put_mpol: 2449 put_mpol:
2425 mpol_put(mpol); /* drop our incoming ref on sb mpol */ 2450 mpol_put(mpol); /* drop our incoming ref on sb mpol */
2426 } 2451 }
2427 } 2452 }
2428 2453
2429 int mpol_set_shared_policy(struct shared_policy *info, 2454 int mpol_set_shared_policy(struct shared_policy *info,
2430 struct vm_area_struct *vma, struct mempolicy *npol) 2455 struct vm_area_struct *vma, struct mempolicy *npol)
2431 { 2456 {
2432 int err; 2457 int err;
2433 struct sp_node *new = NULL; 2458 struct sp_node *new = NULL;
2434 unsigned long sz = vma_pages(vma); 2459 unsigned long sz = vma_pages(vma);
2435 2460
2436 pr_debug("set_shared_policy %lx sz %lu %d %d %lx\n", 2461 pr_debug("set_shared_policy %lx sz %lu %d %d %lx\n",
2437 vma->vm_pgoff, 2462 vma->vm_pgoff,
2438 sz, npol ? npol->mode : -1, 2463 sz, npol ? npol->mode : -1,
2439 npol ? npol->flags : -1, 2464 npol ? npol->flags : -1,
2440 npol ? nodes_addr(npol->v.nodes)[0] : -1); 2465 npol ? nodes_addr(npol->v.nodes)[0] : -1);
2441 2466
2442 if (npol) { 2467 if (npol) {
2443 new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol); 2468 new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
2444 if (!new) 2469 if (!new)
2445 return -ENOMEM; 2470 return -ENOMEM;
2446 } 2471 }
2447 err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new); 2472 err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
2448 if (err && new) 2473 if (err && new)
2449 sp_free(new); 2474 sp_free(new);
2450 return err; 2475 return err;
2451 } 2476 }
2452 2477
2453 /* Free a backing policy store on inode delete. */ 2478 /* Free a backing policy store on inode delete. */
2454 void mpol_free_shared_policy(struct shared_policy *p) 2479 void mpol_free_shared_policy(struct shared_policy *p)
2455 { 2480 {
2456 struct sp_node *n; 2481 struct sp_node *n;
2457 struct rb_node *next; 2482 struct rb_node *next;
2458 2483
2459 if (!p->root.rb_node) 2484 if (!p->root.rb_node)
2460 return; 2485 return;
2461 mutex_lock(&p->mutex); 2486 mutex_lock(&p->mutex);
2462 next = rb_first(&p->root); 2487 next = rb_first(&p->root);
2463 while (next) { 2488 while (next) {
2464 n = rb_entry(next, struct sp_node, nd); 2489 n = rb_entry(next, struct sp_node, nd);
2465 next = rb_next(&n->nd); 2490 next = rb_next(&n->nd);
2466 sp_delete(p, n); 2491 sp_delete(p, n);
2467 } 2492 }
2468 mutex_unlock(&p->mutex); 2493 mutex_unlock(&p->mutex);
2469 } 2494 }
2470 2495
2471 /* assumes fs == KERNEL_DS */ 2496 /* assumes fs == KERNEL_DS */
2472 void __init numa_policy_init(void) 2497 void __init numa_policy_init(void)
2473 { 2498 {
2474 nodemask_t interleave_nodes; 2499 nodemask_t interleave_nodes;
2475 unsigned long largest = 0; 2500 unsigned long largest = 0;
2476 int nid, prefer = 0; 2501 int nid, prefer = 0;
2477 2502
2478 policy_cache = kmem_cache_create("numa_policy", 2503 policy_cache = kmem_cache_create("numa_policy",
2479 sizeof(struct mempolicy), 2504 sizeof(struct mempolicy),
2480 0, SLAB_PANIC, NULL); 2505 0, SLAB_PANIC, NULL);
2481 2506
2482 sn_cache = kmem_cache_create("shared_policy_node", 2507 sn_cache = kmem_cache_create("shared_policy_node",
2483 sizeof(struct sp_node), 2508 sizeof(struct sp_node),
2484 0, SLAB_PANIC, NULL); 2509 0, SLAB_PANIC, NULL);
2510
2511 for_each_node(nid) {
2512 preferred_node_policy[nid] = (struct mempolicy) {
2513 .refcnt = ATOMIC_INIT(1),
2514 .mode = MPOL_PREFERRED,
2515 .flags = MPOL_F_MOF | MPOL_F_MORON,
2516 .v = { .preferred_node = nid, },
2517 };
2518 }
2485 2519
2486 /* 2520 /*
2487 * Set interleaving policy for system init. Interleaving is only 2521 * Set interleaving policy for system init. Interleaving is only
2488 * enabled across suitably sized nodes (default is >= 16MB), or 2522 * enabled across suitably sized nodes (default is >= 16MB), or
2489 * fall back to the largest node if they're all smaller. 2523 * fall back to the largest node if they're all smaller.
2490 */ 2524 */
2491 nodes_clear(interleave_nodes); 2525 nodes_clear(interleave_nodes);
2492 for_each_node_state(nid, N_HIGH_MEMORY) { 2526 for_each_node_state(nid, N_HIGH_MEMORY) {
2493 unsigned long total_pages = node_present_pages(nid); 2527 unsigned long total_pages = node_present_pages(nid);
2494 2528
2495 /* Preserve the largest node */ 2529 /* Preserve the largest node */
2496 if (largest < total_pages) { 2530 if (largest < total_pages) {
2497 largest = total_pages; 2531 largest = total_pages;
2498 prefer = nid; 2532 prefer = nid;
2499 } 2533 }
2500 2534
2501 /* Interleave this node? */ 2535 /* Interleave this node? */
2502 if ((total_pages << PAGE_SHIFT) >= (16 << 20)) 2536 if ((total_pages << PAGE_SHIFT) >= (16 << 20))
2503 node_set(nid, interleave_nodes); 2537 node_set(nid, interleave_nodes);
2504 } 2538 }
2505 2539
2506 /* All too small, use the largest */ 2540 /* All too small, use the largest */
2507 if (unlikely(nodes_empty(interleave_nodes))) 2541 if (unlikely(nodes_empty(interleave_nodes)))
2508 node_set(prefer, interleave_nodes); 2542 node_set(prefer, interleave_nodes);
2509 2543
2510 if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes)) 2544 if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes))
2511 printk("numa_policy_init: interleaving failed\n"); 2545 printk("numa_policy_init: interleaving failed\n");
2512 } 2546 }
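
The per-node preferred_node_policy[] entries initialised above are what give tasks with no explicit policy a migrate-on-reference default: MPOL_F_MOF makes mpol_misplaced() consider the page at all, and MPOL_F_MORON makes it name the referencing CPU's node. A hedged, simplified sketch of the lookup this enables; it mirrors the get_task_policy() helper added elsewhere in this commit, minus its early-boot guard.

/*
 * Hedged sketch: default-policy selection for a task.  With no explicit
 * mempolicy, fall back to the per-node MPOL_PREFERRED policy whose
 * MPOL_F_MOF | MPOL_F_MORON flags drive migrate-on-reference.
 */
static struct mempolicy *sketch_task_policy(struct task_struct *p)
{
	struct mempolicy *pol = p->mempolicy;

	if (!pol)
		pol = &preferred_node_policy[numa_node_id()];
	return pol;
}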
2513 2547
2514 /* Reset policy of current process to default */ 2548 /* Reset policy of current process to default */
2515 void numa_default_policy(void) 2549 void numa_default_policy(void)
2516 { 2550 {
2517 do_set_mempolicy(MPOL_DEFAULT, 0, NULL); 2551 do_set_mempolicy(MPOL_DEFAULT, 0, NULL);
2518 } 2552 }
2519 2553
2520 /* 2554 /*
2521 * Parse and format mempolicy from/to strings 2555 * Parse and format mempolicy from/to strings
2522 */ 2556 */
2523 2557
2524 /* 2558 /*
2525 * "local" is pseudo-policy: MPOL_PREFERRED with MPOL_F_LOCAL flag 2559 * "local" is pseudo-policy: MPOL_PREFERRED with MPOL_F_LOCAL flag
2526 * Used only for mpol_parse_str() and mpol_to_str() 2560 * Used only for mpol_parse_str() and mpol_to_str()
2527 */ 2561 */
2528 static const char * const policy_modes[] = 2562 static const char * const policy_modes[] =
2529 { 2563 {
2530 [MPOL_DEFAULT] = "default", 2564 [MPOL_DEFAULT] = "default",
2531 [MPOL_PREFERRED] = "prefer", 2565 [MPOL_PREFERRED] = "prefer",
2532 [MPOL_BIND] = "bind", 2566 [MPOL_BIND] = "bind",
2533 [MPOL_INTERLEAVE] = "interleave", 2567 [MPOL_INTERLEAVE] = "interleave",
2534 [MPOL_LOCAL] = "local", 2568 [MPOL_LOCAL] = "local",
2535 }; 2569 };
2536 2570
2537 2571
2538 #ifdef CONFIG_TMPFS 2572 #ifdef CONFIG_TMPFS
2539 /** 2573 /**
2540 * mpol_parse_str - parse string to mempolicy 2574 * mpol_parse_str - parse string to mempolicy
2541 * @str: string containing mempolicy to parse 2575 * @str: string containing mempolicy to parse
2542 * @mpol: pointer to struct mempolicy pointer, returned on success. 2576 * @mpol: pointer to struct mempolicy pointer, returned on success.
2543 * @no_context: flag whether to "contextualize" the mempolicy 2577 * @no_context: flag whether to "contextualize" the mempolicy
2544 * 2578 *
2545 * Format of input: 2579 * Format of input:
2546 * <mode>[=<flags>][:<nodelist>] 2580 * <mode>[=<flags>][:<nodelist>]
2547 * 2581 *
2548 * if @no_context is true, save the input nodemask in w.user_nodemask in 2582 * if @no_context is true, save the input nodemask in w.user_nodemask in
2549 * the returned mempolicy. This will be used to "clone" the mempolicy in 2583 * the returned mempolicy. This will be used to "clone" the mempolicy in
2550 * a specific context [cpuset] at a later time. Used to parse tmpfs mpol 2584 * a specific context [cpuset] at a later time. Used to parse tmpfs mpol
2551 * mount option. Note that if 'static' or 'relative' mode flags were 2585 * mount option. Note that if 'static' or 'relative' mode flags were
2552 * specified, the input nodemask will already have been saved. Saving 2586 * specified, the input nodemask will already have been saved. Saving
2553 * it again is redundant, but safe. 2587 * it again is redundant, but safe.
2554 * 2588 *
2555 * On success, returns 0, else 1 2589 * On success, returns 0, else 1
2556 */ 2590 */
2557 int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context) 2591 int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context)
2558 { 2592 {
2559 struct mempolicy *new = NULL; 2593 struct mempolicy *new = NULL;
2560 unsigned short mode; 2594 unsigned short mode;
2561 unsigned short uninitialized_var(mode_flags); 2595 unsigned short uninitialized_var(mode_flags);
2562 nodemask_t nodes; 2596 nodemask_t nodes;
2563 char *nodelist = strchr(str, ':'); 2597 char *nodelist = strchr(str, ':');
2564 char *flags = strchr(str, '='); 2598 char *flags = strchr(str, '=');
2565 int err = 1; 2599 int err = 1;
2566 2600
2567 if (nodelist) { 2601 if (nodelist) {
2568 /* NUL-terminate mode or flags string */ 2602 /* NUL-terminate mode or flags string */
2569 *nodelist++ = '\0'; 2603 *nodelist++ = '\0';
2570 if (nodelist_parse(nodelist, nodes)) 2604 if (nodelist_parse(nodelist, nodes))
2571 goto out; 2605 goto out;
2572 if (!nodes_subset(nodes, node_states[N_HIGH_MEMORY])) 2606 if (!nodes_subset(nodes, node_states[N_HIGH_MEMORY]))
2573 goto out; 2607 goto out;
2574 } else 2608 } else
2575 nodes_clear(nodes); 2609 nodes_clear(nodes);
2576 2610
2577 if (flags) 2611 if (flags)
2578 *flags++ = '\0'; /* terminate mode string */ 2612 *flags++ = '\0'; /* terminate mode string */
2579 2613
2580 for (mode = 0; mode < MPOL_MAX; mode++) { 2614 for (mode = 0; mode < MPOL_MAX; mode++) {
2581 if (!strcmp(str, policy_modes[mode])) { 2615 if (!strcmp(str, policy_modes[mode])) {
2582 break; 2616 break;
2583 } 2617 }
2584 } 2618 }
2585 if (mode >= MPOL_MAX) 2619 if (mode >= MPOL_MAX)
2586 goto out; 2620 goto out;
2587 2621
2588 switch (mode) { 2622 switch (mode) {
2589 case MPOL_PREFERRED: 2623 case MPOL_PREFERRED:
2590 /* 2624 /*
2591 * Insist on a nodelist of one node only 2625 * Insist on a nodelist of one node only
2592 */ 2626 */
2593 if (nodelist) { 2627 if (nodelist) {
2594 char *rest = nodelist; 2628 char *rest = nodelist;
2595 while (isdigit(*rest)) 2629 while (isdigit(*rest))
2596 rest++; 2630 rest++;
2597 if (*rest) 2631 if (*rest)
2598 goto out; 2632 goto out;
2599 } 2633 }
2600 break; 2634 break;
2601 case MPOL_INTERLEAVE: 2635 case MPOL_INTERLEAVE:
2602 /* 2636 /*
2603 * Default to online nodes with memory if no nodelist 2637 * Default to online nodes with memory if no nodelist
2604 */ 2638 */
2605 if (!nodelist) 2639 if (!nodelist)
2606 nodes = node_states[N_HIGH_MEMORY]; 2640 nodes = node_states[N_HIGH_MEMORY];
2607 break; 2641 break;
2608 case MPOL_LOCAL: 2642 case MPOL_LOCAL:
2609 /* 2643 /*
2610 * Don't allow a nodelist; mpol_new() checks flags 2644 * Don't allow a nodelist; mpol_new() checks flags
2611 */ 2645 */
2612 if (nodelist) 2646 if (nodelist)
2613 goto out; 2647 goto out;
2614 mode = MPOL_PREFERRED; 2648 mode = MPOL_PREFERRED;
2615 break; 2649 break;
2616 case MPOL_DEFAULT: 2650 case MPOL_DEFAULT:
2617 /* 2651 /*
2618 * Insist on an empty nodelist 2652 * Insist on an empty nodelist
2619 */ 2653 */
2620 if (!nodelist) 2654 if (!nodelist)
2621 err = 0; 2655 err = 0;
2622 goto out; 2656 goto out;
2623 case MPOL_BIND: 2657 case MPOL_BIND:
2624 /* 2658 /*
2625 * Insist on a nodelist 2659 * Insist on a nodelist
2626 */ 2660 */
2627 if (!nodelist) 2661 if (!nodelist)
2628 goto out; 2662 goto out;
2629 } 2663 }
2630 2664
2631 mode_flags = 0; 2665 mode_flags = 0;
2632 if (flags) { 2666 if (flags) {
2633 /* 2667 /*
2634 * Currently, we only support two mutually exclusive 2668 * Currently, we only support two mutually exclusive
2635 * mode flags. 2669 * mode flags.
2636 */ 2670 */
2637 if (!strcmp(flags, "static")) 2671 if (!strcmp(flags, "static"))
2638 mode_flags |= MPOL_F_STATIC_NODES; 2672 mode_flags |= MPOL_F_STATIC_NODES;
2639 else if (!strcmp(flags, "relative")) 2673 else if (!strcmp(flags, "relative"))
2640 mode_flags |= MPOL_F_RELATIVE_NODES; 2674 mode_flags |= MPOL_F_RELATIVE_NODES;
2641 else 2675 else
2642 goto out; 2676 goto out;
2643 } 2677 }
2644 2678
2645 new = mpol_new(mode, mode_flags, &nodes); 2679 new = mpol_new(mode, mode_flags, &nodes);
2646 if (IS_ERR(new)) 2680 if (IS_ERR(new))
2647 goto out; 2681 goto out;
2648 2682
2649 if (no_context) { 2683 if (no_context) {
2650 /* save for contextualization */ 2684 /* save for contextualization */
2651 new->w.user_nodemask = nodes; 2685 new->w.user_nodemask = nodes;
2652 } else { 2686 } else {
2653 int ret; 2687 int ret;
2654 NODEMASK_SCRATCH(scratch); 2688 NODEMASK_SCRATCH(scratch);
2655 if (scratch) { 2689 if (scratch) {
2656 task_lock(current); 2690 task_lock(current);
2657 ret = mpol_set_nodemask(new, &nodes, scratch); 2691 ret = mpol_set_nodemask(new, &nodes, scratch);
2658 task_unlock(current); 2692 task_unlock(current);
2659 } else 2693 } else
2660 ret = -ENOMEM; 2694 ret = -ENOMEM;
2661 NODEMASK_SCRATCH_FREE(scratch); 2695 NODEMASK_SCRATCH_FREE(scratch);
2662 if (ret) { 2696 if (ret) {
2663 mpol_put(new); 2697 mpol_put(new);
2664 goto out; 2698 goto out;
2665 } 2699 }
2666 } 2700 }
2667 err = 0; 2701 err = 0;
2668 2702
2669 out: 2703 out:
2670 /* Restore string for error message */ 2704 /* Restore string for error message */
2671 if (nodelist) 2705 if (nodelist)
2672 *--nodelist = ':'; 2706 *--nodelist = ':';
2673 if (flags) 2707 if (flags)
2674 *--flags = '='; 2708 *--flags = '=';
2675 if (!err) 2709 if (!err)
2676 *mpol = new; 2710 *mpol = new;
2677 return err; 2711 return err;
2678 } 2712 }
2679 #endif /* CONFIG_TMPFS */ 2713 #endif /* CONFIG_TMPFS */
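
mpol_parse_str() exists for the tmpfs "mpol=" mount option, whose value follows the <mode>[=<flags>][:<nodelist>] grammar described above, e.g. "interleave=static:0-3". A hedged sketch of the mount-time call; the wrapper name is illustrative, and no_context=1 keeps the raw nodemask for later contextualization, as tmpfs does.

/*
 * Hedged sketch: parse a writable "mpol=" option value at mount time.
 * mpol_parse_str() temporarily NUL-terminates inside the buffer and
 * restores it before returning, so 'value' must not be read-only.
 */
static int sketch_parse_mount_mpol(char *value, struct mempolicy **mpol)
{
	return mpol_parse_str(value, mpol, 1);	/* 0 on success, 1 on error */
}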
2680 2714
2681 /** 2715 /**
2682 * mpol_to_str - format a mempolicy structure for printing 2716 * mpol_to_str - format a mempolicy structure for printing
2683 * @buffer: to contain formatted mempolicy string 2717 * @buffer: to contain formatted mempolicy string
2684 * @maxlen: length of @buffer 2718 * @maxlen: length of @buffer
2685 * @pol: pointer to mempolicy to be formatted 2719 * @pol: pointer to mempolicy to be formatted
2686 * @no_context: "context free" mempolicy - use nodemask in w.user_nodemask 2720 * @no_context: "context free" mempolicy - use nodemask in w.user_nodemask
2687 * 2721 *
2688 * Convert a mempolicy into a string. 2722 * Convert a mempolicy into a string.
2689 * Returns the number of characters in buffer (if positive) 2723 * Returns the number of characters in buffer (if positive)
2690 * or an error (negative) 2724 * or an error (negative)
2691 */ 2725 */
2692 int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol, int no_context) 2726 int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol, int no_context)
2693 { 2727 {
2694 char *p = buffer; 2728 char *p = buffer;
2695 int l; 2729 int l;
2696 nodemask_t nodes; 2730 nodemask_t nodes;
2697 unsigned short mode; 2731 unsigned short mode;
2698 unsigned short flags = pol ? pol->flags : 0; 2732 unsigned short flags = pol ? pol->flags : 0;
2699 2733
2700 /* 2734 /*
2701 * Sanity check: room for longest mode, flag and some nodes 2735 * Sanity check: room for longest mode, flag and some nodes
2702 */ 2736 */
2703 VM_BUG_ON(maxlen < strlen("interleave") + strlen("relative") + 16); 2737 VM_BUG_ON(maxlen < strlen("interleave") + strlen("relative") + 16);
2704 2738
2705 if (!pol || pol == &default_policy) 2739 if (!pol || pol == &default_policy)
2706 mode = MPOL_DEFAULT; 2740 mode = MPOL_DEFAULT;
2707 else 2741 else
2708 mode = pol->mode; 2742 mode = pol->mode;
2709 2743
2710 switch (mode) { 2744 switch (mode) {
2711 case MPOL_DEFAULT: 2745 case MPOL_DEFAULT:
2712 nodes_clear(nodes); 2746 nodes_clear(nodes);
2713 break; 2747 break;
2714 2748
2715 case MPOL_PREFERRED: 2749 case MPOL_PREFERRED:
2716 nodes_clear(nodes); 2750 nodes_clear(nodes);
2717 if (flags & MPOL_F_LOCAL) 2751 if (flags & MPOL_F_LOCAL)
2718 mode = MPOL_LOCAL; /* pseudo-policy */ 2752 mode = MPOL_LOCAL; /* pseudo-policy */
2719 else 2753 else
2720 node_set(pol->v.preferred_node, nodes); 2754 node_set(pol->v.preferred_node, nodes);
2721 break; 2755 break;
2722 2756
2723 case MPOL_BIND: 2757 case MPOL_BIND:
2724 /* Fall through */ 2758 /* Fall through */
2725 case MPOL_INTERLEAVE: 2759 case MPOL_INTERLEAVE:
2726 if (no_context) 2760 if (no_context)
2727 nodes = pol->w.user_nodemask; 2761 nodes = pol->w.user_nodemask;
2728 else 2762 else
2729 nodes = pol->v.nodes; 2763 nodes = pol->v.nodes;
2730 break; 2764 break;
2731 2765
2732 default: 2766 default:
2733 return -EINVAL; 2767 return -EINVAL;
2734 } 2768 }
2735 2769
2736 l = strlen(policy_modes[mode]); 2770 l = strlen(policy_modes[mode]);
2737 if (buffer + maxlen < p + l + 1) 2771 if (buffer + maxlen < p + l + 1)
2738 return -ENOSPC; 2772 return -ENOSPC;
2739 2773
2740 strcpy(p, policy_modes[mode]); 2774 strcpy(p, policy_modes[mode]);
2741 p += l; 2775 p += l;
2742 2776
2743 if (flags & MPOL_MODE_FLAGS) { 2777 if (flags & MPOL_MODE_FLAGS) {
2744 if (buffer + maxlen < p + 2) 2778 if (buffer + maxlen < p + 2)
2745 return -ENOSPC; 2779 return -ENOSPC;
2746 *p++ = '='; 2780 *p++ = '=';
2747 2781
2748 /* 2782 /*
2749 * Currently, the only defined flags are mutually exclusive 2783 * Currently, the only defined flags are mutually exclusive
2750 */ 2784 */
2751 if (flags & MPOL_F_STATIC_NODES) 2785 if (flags & MPOL_F_STATIC_NODES)
2752 p += snprintf(p, buffer + maxlen - p, "static"); 2786 p += snprintf(p, buffer + maxlen - p, "static");
2753 else if (flags & MPOL_F_RELATIVE_NODES) 2787 else if (flags & MPOL_F_RELATIVE_NODES)
2754 p += snprintf(p, buffer + maxlen - p, "relative"); 2788 p += snprintf(p, buffer + maxlen - p, "relative");
2755 } 2789 }
2756 2790
2757 if (!nodes_empty(nodes)) { 2791 if (!nodes_empty(nodes)) {
2758 if (buffer + maxlen < p + 2) 2792 if (buffer + maxlen < p + 2)
2759 return -ENOSPC; 2793 return -ENOSPC;
2760 *p++ = ':'; 2794 *p++ = ':';
2761 p += nodelist_scnprintf(p, buffer + maxlen - p, nodes); 2795 p += nodelist_scnprintf(p, buffer + maxlen - p, nodes);
2762 } 2796 }
2763 return p - buffer; 2797 return p - buffer;
2764 } 2798 }
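
mpol_to_str() is the inverse, used for things like the tmpfs show_options path or /proc/<pid>/numa_maps. A hedged sketch of a show_options-style caller; the seq_file plumbing is assumed, not part of this diff.

/*
 * Hedged sketch: print a non-default policy as ",mpol=<string>".  The
 * 64-byte buffer comfortably satisfies the VM_BUG_ON() size check above.
 */
static void sketch_show_mpol(struct seq_file *seq, struct mempolicy *mpol)
{
	char buffer[64];

	if (!mpol || mpol->mode == MPOL_DEFAULT)
		return;				/* nothing to report */

	if (mpol_to_str(buffer, sizeof(buffer), mpol, 1) < 0)
		return;				/* formatting error */

	seq_printf(seq, ",mpol=%s", buffer);
}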
2765 2799