mm/mempolicy.c

  /*
   * Simple NUMA memory policy for the Linux kernel.
   *
   * Copyright 2003,2004 Andi Kleen, SuSE Labs.
   * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
   * Subject to the GNU Public License, version 2.
   *
   * NUMA policy allows the user to give hints in which node(s) memory should
   * be allocated.
   *
   * Support four policies per VMA and per process:
   *
   * The VMA policy has priority over the process policy for a page fault.
   *
   * interleave     Allocate memory interleaved over a set of nodes,
   *                with normal fallback if it fails.
   *                For VMA based allocations this interleaves based on the
   *                offset into the backing object or offset into the mapping
   *                for anonymous memory. For process policy a process counter
   *                is used.
   *
   * bind           Only allocate memory on a specific set of nodes,
   *                no fallback.
   *                FIXME: memory is allocated starting with the first node
   *                to the last. It would be better if bind would truly restrict
   *                the allocation to memory nodes instead
   *
   * preferred       Try a specific node first before normal fallback.
   *                As a special case NUMA_NO_NODE here means do the allocation
   *                on the local CPU. This is normally identical to default,
   *                but useful to set in a VMA when you have a non default
   *                process policy.
   *
   * default        Allocate on the local node first, or when on a VMA
   *                use the process policy. This is what Linux always did
   *		  in a NUMA aware kernel and still does by, ahem, default.
   *
   * The process policy is applied for most non interrupt memory allocations
   * in that process' context. Interrupts ignore the policies and always
   * try to allocate on the local CPU. The VMA policy is only applied for memory
   * allocations for a VMA in the VM.
   *
   * Currently there are a few corner cases in swapping where the policy
   * is not applied, but the majority should be handled. When process policy
   * is used it is not remembered over swap outs/swap ins.
   *
   * Only the highest zone in the zone hierarchy gets policied. Allocations
   * requesting a lower zone just use default policy. This implies that
   * on systems with highmem kernel lowmem allocations don't get policied.
   * Same with GFP_DMA allocations.
   *
   * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
   * all users and remembered even when nobody has memory mapped.
   */
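  /*
   * Illustrative userspace sketch (not part of this file, guarded out of
   * the build): roughly how the policies above are selected from user
   * space via set_mempolicy(2) and mbind(2).  The node numbers, mapping
   * size and helper name are made up for the example; link against
   * libnuma or issue the raw syscalls.
   */
  #if 0
  #include <numaif.h>
  #include <sys/mman.h>

  static void mempolicy_example(void)
  {
  	unsigned long nodes = 0x3;	/* nodes 0 and 1 */
  	void *buf;

  	/* process policy: interleave future allocations over nodes 0-1 */
  	set_mempolicy(MPOL_INTERLEAVE, &nodes, sizeof(nodes) * 8);

  	/* VMA policy: bind one mapping to node 0 only, no fallback */
  	buf = mmap(NULL, 1 << 20, PROT_READ | PROT_WRITE,
  		   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
  	nodes = 0x1;
  	mbind(buf, 1 << 20, MPOL_BIND, &nodes, sizeof(nodes) * 8, 0);

  	/* back to the default policy: local allocation */
  	set_mempolicy(MPOL_DEFAULT, NULL, 0);
  }
  #endif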
  
  /* Notebook:
     fix mmap readahead to honour policy and enable policy for any page cache
     object
     statistics for bigpages
     global policy for page cache? currently it uses process policy. Requires
     first item above.
     handle mremap for shared memory (currently ignored for the policy)
     grows down?
     make bind policy root only? It can trigger oom much faster and the
     kernel is not always grateful with that.
  */
  
  #include <linux/mempolicy.h>
  #include <linux/mm.h>
  #include <linux/highmem.h>
  #include <linux/hugetlb.h>
  #include <linux/kernel.h>
  #include <linux/sched.h>
  #include <linux/nodemask.h>
  #include <linux/cpuset.h>
  #include <linux/slab.h>
  #include <linux/string.h>
  #include <linux/export.h>
  #include <linux/nsproxy.h>
  #include <linux/interrupt.h>
  #include <linux/init.h>
  #include <linux/compat.h>
  #include <linux/swap.h>
  #include <linux/seq_file.h>
  #include <linux/proc_fs.h>
  #include <linux/migrate.h>
  #include <linux/ksm.h>
  #include <linux/rmap.h>
  #include <linux/security.h>
  #include <linux/syscalls.h>
  #include <linux/ctype.h>
  #include <linux/mm_inline.h>
  #include <linux/mmu_notifier.h>

  #include <asm/tlbflush.h>
  #include <asm/uaccess.h>
  #include <linux/random.h>

  #include "internal.h"
  /* Internal flags */
  #define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0)	/* Skip checks for continuous vmas */
  #define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1)		/* Invert check for nodemask */

  static struct kmem_cache *policy_cache;
  static struct kmem_cache *sn_cache;

  /* Highest zone. A specific allocation for a zone below that is not
     policied. */
  enum zone_type policy_zone = 0;

  /*
   * run-time system-wide default policy => local allocation
   */
  static struct mempolicy default_policy = {
  	.refcnt = ATOMIC_INIT(1), /* never free it */
  	.mode = MPOL_PREFERRED,
  	.flags = MPOL_F_LOCAL,
  };
  static struct mempolicy preferred_node_policy[MAX_NUMNODES];
  
  static struct mempolicy *get_task_policy(struct task_struct *p)
  {
  	struct mempolicy *pol = p->mempolicy;
  
  	if (!pol) {
  		int node = numa_node_id();

  		if (node != NUMA_NO_NODE) {
  			pol = &preferred_node_policy[node];
  			/*
  			 * preferred_node_policy is not initialised early in
  			 * boot
  			 */
  			if (!pol->mode)
  				pol = NULL;
  		}
  	}
  
  	return pol;
  }
  static const struct mempolicy_operations {
  	int (*create)(struct mempolicy *pol, const nodemask_t *nodes);
  	/*
  	 * If read-side task has no lock to protect task->mempolicy, write-side
  	 * task will rebind the task->mempolicy in two steps. The first step is
  	 * setting all the newly allowed nodes, and the second step is clearing
  	 * all the disallowed nodes. In this way, we can avoid ending up with no
  	 * node to allocate a page from.
  	 * If we have a lock to protect task->mempolicy on the read side, we
  	 * rebind directly.
  	 *
  	 * step:
  	 * 	MPOL_REBIND_ONCE - do rebind work at once
  	 * 	MPOL_REBIND_STEP1 - set all the newly allowed nodes
  	 * 	MPOL_REBIND_STEP2 - clear all the disallowed nodes
  	 */
  	void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes,
  			enum mpol_rebind_step step);
  } mpol_ops[MPOL_MAX];
  /* Check that the nodemask contains at least one populated zone */
  static int is_valid_nodemask(const nodemask_t *nodemask)
  {
  	return nodes_intersects(*nodemask, node_states[N_MEMORY]);
  }
  static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
  {
  	return pol->flags & MPOL_MODE_FLAGS;
  }
  
  static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig,
  				   const nodemask_t *rel)
  {
  	nodemask_t tmp;
  	nodes_fold(tmp, *orig, nodes_weight(*rel));
  	nodes_onto(*ret, tmp, *rel);
  }
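  /*
   * Worked example (node numbers are made up; this assumes the usual
   * semantics of nodes_fold()/nodes_onto()): with *orig = {0,2} and
   * *rel = {4,5,6}, nodes_fold() wraps orig modulo nodes_weight(*rel) = 3,
   * giving tmp = {0,2}; nodes_onto() then maps those relative positions
   * onto the 0th and 2nd set bits of *rel, so *ret = {4,6}.  This is how
   * MPOL_F_RELATIVE_NODES masks are interpreted against the allowed set.
   */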
  static int mpol_new_interleave(struct mempolicy *pol, const nodemask_t *nodes)
  {
  	if (nodes_empty(*nodes))
  		return -EINVAL;
  	pol->v.nodes = *nodes;
  	return 0;
  }
  
  static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes)
  {
  	if (!nodes)
  		pol->flags |= MPOL_F_LOCAL;	/* local allocation */
  	else if (nodes_empty(*nodes))
  		return -EINVAL;			/*  no allowed nodes */
  	else
  		pol->v.preferred_node = first_node(*nodes);
  	return 0;
  }
  
  static int mpol_new_bind(struct mempolicy *pol, const nodemask_t *nodes)
  {
  	if (!is_valid_nodemask(nodes))
  		return -EINVAL;
  	pol->v.nodes = *nodes;
  	return 0;
  }
  /*
   * mpol_set_nodemask is called after mpol_new() to set up the nodemask, if
   * any, for the new policy.  mpol_new() has already validated the nodes
   * parameter with respect to the policy mode and flags.  But, we need to
   * handle an empty nodemask with MPOL_PREFERRED here.
   *
   * Must be called holding task's alloc_lock to protect task's mems_allowed
   * and mempolicy.  May also be called holding the mmap_semaphore for write.
   */
  static int mpol_set_nodemask(struct mempolicy *pol,
  		     const nodemask_t *nodes, struct nodemask_scratch *nsc)
  {
  	int ret;
  
  	/* if mode is MPOL_DEFAULT, pol is NULL. This is right. */
  	if (pol == NULL)
  		return 0;
  	/* Check N_MEMORY */
  	nodes_and(nsc->mask1,
  		  cpuset_current_mems_allowed, node_states[N_MEMORY]);
  
  	VM_BUG_ON(!nodes);
  	if (pol->mode == MPOL_PREFERRED && nodes_empty(*nodes))
  		nodes = NULL;	/* explicit local allocation */
  	else {
  		if (pol->flags & MPOL_F_RELATIVE_NODES)
  			mpol_relative_nodemask(&nsc->mask2, nodes, &nsc->mask1);
  		else
  			nodes_and(nsc->mask2, *nodes, nsc->mask1);
  		if (mpol_store_user_nodemask(pol))
  			pol->w.user_nodemask = *nodes;
  		else
  			pol->w.cpuset_mems_allowed =
  						cpuset_current_mems_allowed;
  	}
  	if (nodes)
  		ret = mpol_ops[pol->mode].create(pol, &nsc->mask2);
  	else
  		ret = mpol_ops[pol->mode].create(pol, NULL);
  	return ret;
  }
  
  /*
   * This function just creates a new policy, does some checks and simple
   * initialization. You must invoke mpol_set_nodemask() to set nodes.
   */
  static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
  				  nodemask_t *nodes)
  {
  	struct mempolicy *policy;
  	pr_debug("setting mode %d flags %d nodes[0] %lx
  ",
  		 mode, flags, nodes ? nodes_addr(*nodes)[0] : NUMA_NO_NODE);

  	if (mode == MPOL_DEFAULT) {
  		if (nodes && !nodes_empty(*nodes))
  			return ERR_PTR(-EINVAL);
  		return NULL;
  	}
  	VM_BUG_ON(!nodes);
  
  	/*
  	 * MPOL_PREFERRED cannot be used with MPOL_F_STATIC_NODES or
  	 * MPOL_F_RELATIVE_NODES if the nodemask is empty (local allocation).
  	 * All other modes require a valid pointer to a non-empty nodemask.
  	 */
  	if (mode == MPOL_PREFERRED) {
  		if (nodes_empty(*nodes)) {
  			if (((flags & MPOL_F_STATIC_NODES) ||
  			     (flags & MPOL_F_RELATIVE_NODES)))
  				return ERR_PTR(-EINVAL);
  		}
  	} else if (mode == MPOL_LOCAL) {
  		if (!nodes_empty(*nodes))
  			return ERR_PTR(-EINVAL);
  		mode = MPOL_PREFERRED;
  	} else if (nodes_empty(*nodes))
  		return ERR_PTR(-EINVAL);
  	policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
  	if (!policy)
  		return ERR_PTR(-ENOMEM);
  	atomic_set(&policy->refcnt, 1);
  	policy->mode = mode;
  	policy->flags = flags;

  	return policy;
  }
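  /*
   * Minimal sketch (not compiled) of how mpol_new() and mpol_set_nodemask()
   * are meant to be paired; do_set_mempolicy() further down is the real
   * thing, this only illustrates the two-step construction.  mode, flags
   * and nodes stand in for caller-supplied values; error handling elided.
   */
  #if 0
  	NODEMASK_SCRATCH(scratch);
  	struct mempolicy *new;

  	new = mpol_new(mode, flags, nodes);
  	if (!IS_ERR(new)) {
  		task_lock(current);
  		mpol_set_nodemask(new, nodes, scratch);
  		task_unlock(current);
  	}
  	NODEMASK_SCRATCH_FREE(scratch);
  #endif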
  /* Slow path of a mpol destructor. */
  void __mpol_put(struct mempolicy *p)
  {
  	if (!atomic_dec_and_test(&p->refcnt))
  		return;
  	kmem_cache_free(policy_cache, p);
  }
  static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes,
  				enum mpol_rebind_step step)
  {
  }
  /*
   * step:
   * 	MPOL_REBIND_ONCE  - do rebind work at once
   * 	MPOL_REBIND_STEP1 - set all the newly allowed nodes
   * 	MPOL_REBIND_STEP2 - clear all the disallowed nodes
   */
  static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes,
  				 enum mpol_rebind_step step)
  {
  	nodemask_t tmp;
  
  	if (pol->flags & MPOL_F_STATIC_NODES)
  		nodes_and(tmp, pol->w.user_nodemask, *nodes);
  	else if (pol->flags & MPOL_F_RELATIVE_NODES)
  		mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
  	else {
  		/*
  		 * if step == 1, we use ->w.cpuset_mems_allowed to cache the
  		 * result
  		 */
  		if (step == MPOL_REBIND_ONCE || step == MPOL_REBIND_STEP1) {
  			nodes_remap(tmp, pol->v.nodes,
  					pol->w.cpuset_mems_allowed, *nodes);
  			pol->w.cpuset_mems_allowed = step ? tmp : *nodes;
  		} else if (step == MPOL_REBIND_STEP2) {
  			tmp = pol->w.cpuset_mems_allowed;
  			pol->w.cpuset_mems_allowed = *nodes;
  		} else
  			BUG();
  	}

  	if (nodes_empty(tmp))
  		tmp = *nodes;
  
  	if (step == MPOL_REBIND_STEP1)
  		nodes_or(pol->v.nodes, pol->v.nodes, tmp);
  	else if (step == MPOL_REBIND_ONCE || step == MPOL_REBIND_STEP2)
  		pol->v.nodes = tmp;
  	else
  		BUG();
  	if (!node_isset(current->il_next, tmp)) {
  		current->il_next = next_node(current->il_next, tmp);
  		if (current->il_next >= MAX_NUMNODES)
  			current->il_next = first_node(tmp);
  		if (current->il_next >= MAX_NUMNODES)
  			current->il_next = numa_node_id();
  	}
  }
  
  static void mpol_rebind_preferred(struct mempolicy *pol,
  				  const nodemask_t *nodes,
  				  enum mpol_rebind_step step)
  {
  	nodemask_t tmp;
  	if (pol->flags & MPOL_F_STATIC_NODES) {
  		int node = first_node(pol->w.user_nodemask);
  		if (node_isset(node, *nodes)) {
  			pol->v.preferred_node = node;
  			pol->flags &= ~MPOL_F_LOCAL;
  		} else
  			pol->flags |= MPOL_F_LOCAL;
  	} else if (pol->flags & MPOL_F_RELATIVE_NODES) {
  		mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
  		pol->v.preferred_node = first_node(tmp);
  	} else if (!(pol->flags & MPOL_F_LOCAL)) {
  		pol->v.preferred_node = node_remap(pol->v.preferred_node,
  						   pol->w.cpuset_mems_allowed,
  						   *nodes);
  		pol->w.cpuset_mems_allowed = *nodes;
  	}
  }
  /*
   * mpol_rebind_policy - Migrate a policy to a different set of nodes
   *
   * If read-side task has no lock to protect task->mempolicy, write-side
   * task will rebind the task->mempolicy in two steps. The first step is
   * setting all the newly allowed nodes, and the second step is clearing
   * all the disallowed nodes. In this way, we can avoid ending up with no
   * node to allocate a page from.
   * If we have a lock to protect task->mempolicy on the read side, we
   * rebind directly.
   *
   * step:
   * 	MPOL_REBIND_ONCE  - do rebind work at once
   * 	MPOL_REBIND_STEP1 - set all the newly allowed nodes
   * 	MPOL_REBIND_STEP2 - clear all the disallowed nodes
   */
  static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask,
  				enum mpol_rebind_step step)
  {
  	if (!pol)
  		return;
  	if (!mpol_store_user_nodemask(pol) && step == MPOL_REBIND_ONCE &&
  	    nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
  		return;
  
  	if (step == MPOL_REBIND_STEP1 && (pol->flags & MPOL_F_REBINDING))
  		return;
  
  	if (step == MPOL_REBIND_STEP2 && !(pol->flags & MPOL_F_REBINDING))
  		BUG();
  
  	if (step == MPOL_REBIND_STEP1)
  		pol->flags |= MPOL_F_REBINDING;
  	else if (step == MPOL_REBIND_STEP2)
  		pol->flags &= ~MPOL_F_REBINDING;
  	else if (step >= MPOL_REBIND_NSTEP)
  		BUG();
  
  	mpol_ops[pol->mode].rebind(pol, newmask, step);
  }
  
  /*
   * Wrapper for mpol_rebind_policy() that just requires task
   * pointer, and updates task mempolicy.
   *
   * Called with task's alloc_lock held.
   */
  void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new,
  			enum mpol_rebind_step step)
  {
  	mpol_rebind_policy(tsk->mempolicy, new, step);
  }
  
  /*
   * Rebind each vma in mm to new nodemask.
   *
   * Call holding a reference to mm.  Takes mm->mmap_sem during call.
   */
  
  void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
  {
  	struct vm_area_struct *vma;
  
  	down_write(&mm->mmap_sem);
  	for (vma = mm->mmap; vma; vma = vma->vm_next)
  		mpol_rebind_policy(vma->vm_policy, new, MPOL_REBIND_ONCE);
  	up_write(&mm->mmap_sem);
  }
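  /*
   * Rough sketch (not compiled) of how a writer is expected to drive the
   * two-step rebind described above when readers take no lock; the cpuset
   * code does something along these lines when a cpuset's mems change.
   * tsk and newmems stand in for the caller's task and new nodemask.
   */
  #if 0
  	task_lock(tsk);
  	nodes_or(tsk->mems_allowed, tsk->mems_allowed, newmems);
  	mpol_rebind_task(tsk, &newmems, MPOL_REBIND_STEP1);	/* grow first */
  	tsk->mems_allowed = newmems;
  	mpol_rebind_task(tsk, &newmems, MPOL_REBIND_STEP2);	/* then shrink */
  	task_unlock(tsk);
  #endif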
  static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
  	[MPOL_DEFAULT] = {
  		.rebind = mpol_rebind_default,
  	},
  	[MPOL_INTERLEAVE] = {
  		.create = mpol_new_interleave,
  		.rebind = mpol_rebind_nodemask,
  	},
  	[MPOL_PREFERRED] = {
  		.create = mpol_new_preferred,
  		.rebind = mpol_rebind_preferred,
  	},
  	[MPOL_BIND] = {
  		.create = mpol_new_bind,
  		.rebind = mpol_rebind_nodemask,
  	},
  };
  static void migrate_page_add(struct page *page, struct list_head *pagelist,
  				unsigned long flags);

  /*
   * Scan through pages checking if pages follow certain conditions,
   * and move them to the pagelist if they do.
   */
  static int queue_pages_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
  		unsigned long addr, unsigned long end,
  		const nodemask_t *nodes, unsigned long flags,
  		void *private)
  {
  	pte_t *orig_pte;
  	pte_t *pte;
  	spinlock_t *ptl;

  	orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
  	do {
  		struct page *page;
  		int nid;
  
  		if (!pte_present(*pte))
  			continue;
  		page = vm_normal_page(vma, addr, *pte);
  		if (!page)
  			continue;
  		/*
  		 * vm_normal_page() filters out zero pages, but there might
  		 * still be PageReserved pages to skip, perhaps in a VDSO.
  		 */
  		if (PageReserved(page))
  			continue;
  		nid = page_to_nid(page);
  		if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT))
  			continue;
  		if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
  			migrate_page_add(page, private, flags);
  		else
  			break;
  	} while (pte++, addr += PAGE_SIZE, addr != end);
  	pte_unmap_unlock(orig_pte, ptl);
  	return addr != end;
  }
  static void queue_pages_hugetlb_pmd_range(struct vm_area_struct *vma,
  		pmd_t *pmd, const nodemask_t *nodes, unsigned long flags,
  				    void *private)
  {
  #ifdef CONFIG_HUGETLB_PAGE
  	int nid;
  	struct page *page;
  	spinlock_t *ptl;

  	ptl = huge_pte_lock(hstate_vma(vma), vma->vm_mm, (pte_t *)pmd);
  	page = pte_page(huge_ptep_get((pte_t *)pmd));
  	nid = page_to_nid(page);
  	if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT))
  		goto unlock;
  	/* With MPOL_MF_MOVE, we migrate only unshared hugepage. */
  	if (flags & (MPOL_MF_MOVE_ALL) ||
  	    (flags & MPOL_MF_MOVE && page_mapcount(page) == 1))
  		isolate_huge_page(page, private);
  unlock:
  	spin_unlock(ptl);
  #else
  	BUG();
  #endif
  }
  static inline int queue_pages_pmd_range(struct vm_area_struct *vma, pud_t *pud,
  		unsigned long addr, unsigned long end,
  		const nodemask_t *nodes, unsigned long flags,
  		void *private)
  {
  	pmd_t *pmd;
  	unsigned long next;
  
  	pmd = pmd_offset(pud, addr);
  	do {
  		next = pmd_addr_end(addr, end);
  		if (!pmd_present(*pmd))
  			continue;
  		if (pmd_huge(*pmd) && is_vm_hugetlb_page(vma)) {
  			queue_pages_hugetlb_pmd_range(vma, pmd, nodes,
  						flags, private);
  			continue;
  		}
  		split_huge_page_pmd(vma, addr, pmd);
  		if (pmd_none_or_trans_huge_or_clear_bad(pmd))
  			continue;
  		if (queue_pages_pte_range(vma, pmd, addr, next, nodes,
  				    flags, private))
  			return -EIO;
  	} while (pmd++, addr = next, addr != end);
  	return 0;
  }
  static inline int queue_pages_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
  		unsigned long addr, unsigned long end,
  		const nodemask_t *nodes, unsigned long flags,
  		void *private)
  {
  	pud_t *pud;
  	unsigned long next;
  
  	pud = pud_offset(pgd, addr);
  	do {
  		next = pud_addr_end(addr, end);
  		if (pud_huge(*pud) && is_vm_hugetlb_page(vma))
  			continue;
  		if (pud_none_or_clear_bad(pud))
  			continue;
  		if (queue_pages_pmd_range(vma, pud, addr, next, nodes,
  				    flags, private))
  			return -EIO;
  	} while (pud++, addr = next, addr != end);
  	return 0;
  }
  static inline int queue_pages_pgd_range(struct vm_area_struct *vma,
  		unsigned long addr, unsigned long end,
  		const nodemask_t *nodes, unsigned long flags,
  		void *private)
  {
  	pgd_t *pgd;
  	unsigned long next;
  	pgd = pgd_offset(vma->vm_mm, addr);
  	do {
  		next = pgd_addr_end(addr, end);
  		if (pgd_none_or_clear_bad(pgd))
  			continue;
  		if (queue_pages_pud_range(vma, pgd, addr, next, nodes,
  				    flags, private))
  			return -EIO;
  	} while (pgd++, addr = next, addr != end);
  	return 0;
  }
  #ifdef CONFIG_NUMA_BALANCING
  /*
   * This is used to mark a range of virtual addresses to be inaccessible.
   * These are later cleared by a NUMA hinting fault. Depending on these
   * faults, pages may be migrated for better NUMA placement.
   *
   * This is assuming that NUMA faults are handled using PROT_NONE. If
   * an architecture makes a different choice, it will need further
   * changes to the core.
   */
  unsigned long change_prot_numa(struct vm_area_struct *vma,
  			unsigned long addr, unsigned long end)
  {
  	int nr_updated;

  	nr_updated = change_protection(vma, addr, end, vma->vm_page_prot, 0, 1);
  	if (nr_updated)
  		count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated);

  	return nr_updated;
  }
  #else
  static unsigned long change_prot_numa(struct vm_area_struct *vma,
  			unsigned long addr, unsigned long end)
  {
  	return 0;
  }
  #endif /* CONFIG_NUMA_BALANCING */

  /*
   * Walk through page tables and collect pages to be migrated.
   *
   * If pages found in a given range are on a set of nodes (determined by
   * @nodes and @flags), they are isolated and queued to the pagelist which
   * is passed via @private.
   */
  static struct vm_area_struct *
  queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end,
  		const nodemask_t *nodes, unsigned long flags, void *private)
  {
  	int err;
  	struct vm_area_struct *first, *vma, *prev;

  	first = find_vma(mm, start);
  	if (!first)
  		return ERR_PTR(-EFAULT);
  	prev = NULL;
  	for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
  		unsigned long endvma = vma->vm_end;
  
  		if (endvma > end)
  			endvma = end;
  		if (vma->vm_start > start)
  			start = vma->vm_start;
  		if (!(flags & MPOL_MF_DISCONTIG_OK)) {
  			if (!vma->vm_next && vma->vm_end < end)
  				return ERR_PTR(-EFAULT);
  			if (prev && prev->vm_end < vma->vm_start)
  				return ERR_PTR(-EFAULT);
  		}

  		if (flags & MPOL_MF_LAZY) {
  			change_prot_numa(vma, start, endvma);
  			goto next;
  		}
  
  		if ((flags & MPOL_MF_STRICT) ||
  		     ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) &&
  		      vma_migratable(vma))) {

  			err = queue_pages_pgd_range(vma, start, endvma, nodes,
  						flags, private);
  			if (err) {
  				first = ERR_PTR(err);
  				break;
  			}
  		}
  next:
  		prev = vma;
  	}
  	return first;
  }
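  /*
   * Sketch (not compiled) of the typical calling pattern; compare
   * migrate_to_node() below.  Pages matching the nodemask are isolated
   * onto a private list and then handed to migrate_pages().  mm, start,
   * end, nmask, flags and dest stand in for caller state; error handling
   * is elided.
   */
  #if 0
  	LIST_HEAD(pagelist);

  	queue_pages_range(mm, start, end, &nmask,
  			flags | MPOL_MF_DISCONTIG_OK, &pagelist);
  	if (!list_empty(&pagelist))
  		migrate_pages(&pagelist, new_node_page, dest,
  				MIGRATE_SYNC, MR_SYSCALL);
  #endif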
  /*
   * Apply policy to a single VMA
   * This must be called with the mmap_sem held for writing.
   */
  static int vma_replace_policy(struct vm_area_struct *vma,
  						struct mempolicy *pol)
  {
  	int err;
  	struct mempolicy *old;
  	struct mempolicy *new;
  
  	pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p
  ",
  		 vma->vm_start, vma->vm_end, vma->vm_pgoff,
  		 vma->vm_ops, vma->vm_file,
  		 vma->vm_ops ? vma->vm_ops->set_policy : NULL);
  	new = mpol_dup(pol);
  	if (IS_ERR(new))
  		return PTR_ERR(new);
  
  	if (vma->vm_ops && vma->vm_ops->set_policy) {
  		err = vma->vm_ops->set_policy(vma, new);
  		if (err)
  			goto err_out;
  	}
  
  	old = vma->vm_policy;
  	vma->vm_policy = new; /* protected by mmap_sem */
  	mpol_put(old);
  
  	return 0;
   err_out:
  	mpol_put(new);
  	return err;
  }
  /* Step 2: apply policy to a range and do splits. */
  static int mbind_range(struct mm_struct *mm, unsigned long start,
  		       unsigned long end, struct mempolicy *new_pol)
  {
  	struct vm_area_struct *next;
  	struct vm_area_struct *prev;
  	struct vm_area_struct *vma;
  	int err = 0;
  	pgoff_t pgoff;
  	unsigned long vmstart;
  	unsigned long vmend;

  	vma = find_vma(mm, start);
  	if (!vma || vma->vm_start > start)
  		return -EFAULT;
  	prev = vma->vm_prev;
  	if (start > vma->vm_start)
  		prev = vma;
  	for (; vma && vma->vm_start < end; prev = vma, vma = next) {
  		next = vma->vm_next;
  		vmstart = max(start, vma->vm_start);
  		vmend   = min(end, vma->vm_end);
  		if (mpol_equal(vma_policy(vma), new_pol))
  			continue;
  
  		pgoff = vma->vm_pgoff +
  			((vmstart - vma->vm_start) >> PAGE_SHIFT);
  		prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags,
  				  vma->anon_vma, vma->vm_file, pgoff,
  				  new_pol);
  		if (prev) {
  			vma = prev;
  			next = vma->vm_next;
  			if (mpol_equal(vma_policy(vma), new_pol))
  				continue;
  			/* vma_merge() joined vma && vma->next, case 8 */
  			goto replace;
  		}
  		if (vma->vm_start != vmstart) {
  			err = split_vma(vma->vm_mm, vma, vmstart, 1);
  			if (err)
  				goto out;
  		}
  		if (vma->vm_end != vmend) {
  			err = split_vma(vma->vm_mm, vma, vmend, 0);
  			if (err)
  				goto out;
  		}
   replace:
  		err = vma_replace_policy(vma, new_pol);
  		if (err)
  			goto out;
  	}
  
   out:
  	return err;
  }
  /* Set the process memory policy */
  static long do_set_mempolicy(unsigned short mode, unsigned short flags,
  			     nodemask_t *nodes)
  {
  	struct mempolicy *new, *old;
  	struct mm_struct *mm = current->mm;
  	NODEMASK_SCRATCH(scratch);
  	int ret;

  	if (!scratch)
  		return -ENOMEM;

  	new = mpol_new(mode, flags, nodes);
  	if (IS_ERR(new)) {
  		ret = PTR_ERR(new);
  		goto out;
  	}
  	/*
  	 * prevent changing our mempolicy while show_numa_maps()
  	 * is using it.
  	 * Note:  do_set_mempolicy() can be called at init time
  	 * with no 'mm'.
  	 */
  	if (mm)
  		down_write(&mm->mmap_sem);
  	task_lock(current);
  	ret = mpol_set_nodemask(new, nodes, scratch);
  	if (ret) {
  		task_unlock(current);
  		if (mm)
  			up_write(&mm->mmap_sem);
  		mpol_put(new);
  		goto out;
  	}
  	old = current->mempolicy;
  	current->mempolicy = new;
  	if (new && new->mode == MPOL_INTERLEAVE &&
  	    nodes_weight(new->v.nodes))
  		current->il_next = first_node(new->v.nodes);
  	task_unlock(current);
  	if (mm)
  		up_write(&mm->mmap_sem);
  	mpol_put(old);
  	ret = 0;
  out:
  	NODEMASK_SCRATCH_FREE(scratch);
  	return ret;
  }
  /*
   * Return nodemask for policy for get_mempolicy() query
   *
   * Called with task's alloc_lock held
   */
  static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes)
  {
  	nodes_clear(*nodes);
  	if (p == &default_policy)
  		return;
  	switch (p->mode) {
  	case MPOL_BIND:
  		/* Fall through */
  	case MPOL_INTERLEAVE:
  		*nodes = p->v.nodes;
  		break;
  	case MPOL_PREFERRED:
  		if (!(p->flags & MPOL_F_LOCAL))
  			node_set(p->v.preferred_node, *nodes);
  		/* else return empty node mask for local allocation */
  		break;
  	default:
  		BUG();
  	}
  }
  
  static int lookup_node(struct mm_struct *mm, unsigned long addr)
  {
  	struct page *p;
  	int err;
  
  	err = get_user_pages(current, mm, addr & PAGE_MASK, 1, 0, 0, &p, NULL);
  	if (err >= 0) {
  		err = page_to_nid(p);
  		put_page(p);
  	}
  	return err;
  }
  /* Retrieve NUMA policy */
  static long do_get_mempolicy(int *policy, nodemask_t *nmask,
  			     unsigned long addr, unsigned long flags)
  {
  	int err;
  	struct mm_struct *mm = current->mm;
  	struct vm_area_struct *vma = NULL;
  	struct mempolicy *pol = current->mempolicy;
  	if (flags &
  		~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
  		return -EINVAL;
  
  	if (flags & MPOL_F_MEMS_ALLOWED) {
  		if (flags & (MPOL_F_NODE|MPOL_F_ADDR))
  			return -EINVAL;
  		*policy = 0;	/* just so it's initialized */
  		task_lock(current);
  		*nmask  = cpuset_current_mems_allowed;
  		task_unlock(current);
  		return 0;
  	}
  	if (flags & MPOL_F_ADDR) {
  		/*
  		 * Do NOT fall back to task policy if the
  		 * vma/shared policy at addr is NULL.  We
  		 * want to return MPOL_DEFAULT in this case.
  		 */
  		down_read(&mm->mmap_sem);
  		vma = find_vma_intersection(mm, addr, addr+1);
  		if (!vma) {
  			up_read(&mm->mmap_sem);
  			return -EFAULT;
  		}
  		if (vma->vm_ops && vma->vm_ops->get_policy)
  			pol = vma->vm_ops->get_policy(vma, addr);
  		else
  			pol = vma->vm_policy;
  	} else if (addr)
  		return -EINVAL;
  
  	if (!pol)
  		pol = &default_policy;	/* indicates default behavior */
  
  	if (flags & MPOL_F_NODE) {
  		if (flags & MPOL_F_ADDR) {
  			err = lookup_node(mm, addr);
  			if (err < 0)
  				goto out;
  			*policy = err;
  		} else if (pol == current->mempolicy &&
  				pol->mode == MPOL_INTERLEAVE) {
  			*policy = current->il_next;
  		} else {
  			err = -EINVAL;
  			goto out;
  		}
  	} else {
  		*policy = pol == &default_policy ? MPOL_DEFAULT :
  						pol->mode;
  		/*
  		 * Internal mempolicy flags must be masked off before exposing
  		 * the policy to userspace.
  		 */
  		*policy |= (pol->flags & MPOL_MODE_FLAGS);
  	}
  
  	if (vma) {
  		up_read(&current->mm->mmap_sem);
  		vma = NULL;
  	}
  	err = 0;
  	if (nmask) {
  		if (mpol_store_user_nodemask(pol)) {
  			*nmask = pol->w.user_nodemask;
  		} else {
  			task_lock(current);
  			get_policy_nodemask(pol, nmask);
  			task_unlock(current);
  		}
  	}
  
   out:
  	mpol_cond_put(pol);
  	if (vma)
  		up_read(&current->mm->mmap_sem);
  	return err;
  }
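  /*
   * Illustrative userspace counterpart (not part of this file): the
   * queries above are reached through get_mempolicy(2).  The helper name
   * and buffer sizes are made up for the example.
   */
  #if 0
  #include <numaif.h>

  static void mempolicy_query_example(void *addr)
  {
  	int mode;
  	unsigned long nodes = 0;

  	/* policy and nodemask of the VMA containing addr */
  	get_mempolicy(&mode, &nodes, sizeof(nodes) * 8, addr, MPOL_F_ADDR);

  	/* node that the page at addr is allocated on */
  	get_mempolicy(&mode, NULL, 0, addr, MPOL_F_ADDR | MPOL_F_NODE);

  	/* the calling task's own policy */
  	get_mempolicy(&mode, &nodes, sizeof(nodes) * 8, NULL, 0);
  }
  #endif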
  #ifdef CONFIG_MIGRATION
  /*
   * page migration
   */
  static void migrate_page_add(struct page *page, struct list_head *pagelist,
  				unsigned long flags)
  {
  	/*
  	 * Avoid migrating a page that is shared with others.
  	 */
  	if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1) {
  		if (!isolate_lru_page(page)) {
  			list_add_tail(&page->lru, pagelist);
  			inc_zone_page_state(page, NR_ISOLATED_ANON +
  					    page_is_file_cache(page));
  		}
  	}
  }

  static struct page *new_node_page(struct page *page, unsigned long node, int **x)
  {
  	if (PageHuge(page))
  		return alloc_huge_page_node(page_hstate(compound_head(page)),
  					node);
  	else
  		return alloc_pages_exact_node(node, GFP_HIGHUSER_MOVABLE, 0);
  }
  /*
   * Migrate pages from one node to a target node.
   * Returns error or the number of pages not migrated.
   */
  static int migrate_to_node(struct mm_struct *mm, int source, int dest,
  			   int flags)
  {
  	nodemask_t nmask;
  	LIST_HEAD(pagelist);
  	int err = 0;
  
  	nodes_clear(nmask);
  	node_set(source, nmask);

  	/*
  	 * This does not "check" the range but isolates all pages that
  	 * need migration.  Between passing in the full user address
  	 * space range and MPOL_MF_DISCONTIG_OK, this call can not fail.
  	 */
  	VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)));
  	queue_pages_range(mm, mm->mmap->vm_start, mm->task_size, &nmask,
  			flags | MPOL_MF_DISCONTIG_OK, &pagelist);
  	if (!list_empty(&pagelist)) {
  		err = migrate_pages(&pagelist, new_node_page, dest,
  					MIGRATE_SYNC, MR_SYSCALL);
  		if (err)
  			putback_movable_pages(&pagelist);
  	}

  	return err;
  }
  
  /*
   * Move pages between the two nodesets so as to preserve the physical
   * layout as much as possible.
   *
   * Returns the number of pages that could not be moved.
   */
  int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
  		     const nodemask_t *to, int flags)
  {
  	int busy = 0;
  	int err;
  	nodemask_t tmp;

  	err = migrate_prep();
  	if (err)
  		return err;
  	down_read(&mm->mmap_sem);

  	err = migrate_vmas(mm, from, to, flags);
  	if (err)
  		goto out;
  	/*
  	 * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
  	 * bit in 'to' is not also set in 'tmp'.  Clear the found 'source'
  	 * bit in 'tmp', and return that <source, dest> pair for migration.
  	 * The pair of nodemasks 'to' and 'from' define the map.
  	 *
  	 * If no pair of bits is found that way, fallback to picking some
  	 * pair of 'source' and 'dest' bits that are not the same.  If the
  	 * 'source' and 'dest' bits are the same, this represents a node
  	 * that will be migrating to itself, so no pages need move.
  	 *
  	 * If no bits are left in 'tmp', or if all remaining bits left
  	 * in 'tmp' correspond to the same bit in 'to', return false
  	 * (nothing left to migrate).
  	 *
  	 * This lets us pick a pair of nodes to migrate between, such that
  	 * if possible the dest node is not already occupied by some other
  	 * source node, minimizing the risk of overloading the memory on a
  	 * node that would happen if we migrated incoming memory to a node
  	 * before migrating outgoing memory source that same node.
  	 *
  	 * A single scan of tmp is sufficient.  As we go, we remember the
  	 * most recent <s, d> pair that moved (s != d).  If we find a pair
  	 * that not only moved, but what's better, moved to an empty slot
  	 * (d is not set in tmp), then we break out then, with that pair.
  	 * Otherwise when we finish scanning from_tmp, we at least have the
  	 * most recent <s, d> pair that moved.  If we get all the way through
  	 * the scan of tmp without finding any node that moved, much less
  	 * moved to an empty node, then there is nothing left worth migrating.
  	 */

  	tmp = *from;
  	while (!nodes_empty(tmp)) {
  		int s, d;
  		int source = NUMA_NO_NODE;
  		int dest = 0;
  
  		for_each_node_mask(s, tmp) {
  
  			/*
  			 * do_migrate_pages() tries to maintain the relative
  			 * node relationship of the pages established between
  			 * threads and memory areas.
  			 *
  			 * However if the number of source nodes is not equal to
  			 * the number of destination nodes we can not preserve
  			 * this node relative relationship.  In that case, skip
  			 * copying memory from a node that is in the destination
  			 * mask.
  			 *
  			 * Example: [2,3,4] -> [3,4,5] moves everything.
  			 *          [0-7] - > [3,4,5] moves only 0,1,2,6,7.
  			 */
  			if ((nodes_weight(*from) != nodes_weight(*to)) &&
  						(node_isset(s, *to)))
4a5b18cc1   Larry Woodman   mm: do_migrate_pa...
1072
  				continue;
0ce72d4f7   Andrew Morton   mm: do_migrate_pa...
1073
  			d = node_remap(s, *from, *to);
  			if (s == d)
  				continue;
  
  			source = s;	/* Node moved. Memorize */
  			dest = d;
  
  			/* dest not in remaining from nodes? */
  			if (!node_isset(dest, tmp))
  				break;
  		}
b76ac7e73   Jianguo Wu   mm/mempolicy: use...
1084
  		if (source == NUMA_NO_NODE)
  			break;
  
  		node_clear(source, tmp);
  		err = migrate_to_node(mm, source, dest, flags);
  		if (err > 0)
  			busy += err;
  		if (err < 0)
  			break;
39743889a   Christoph Lameter   [PATCH] Swap Migr...
1093
  	}
7b2259b3e   Christoph Lameter   [PATCH] page migr...
1094
  out:
39743889a   Christoph Lameter   [PATCH] Swap Migr...
1095
  	up_read(&mm->mmap_sem);
7e2ab150d   Christoph Lameter   [PATCH] Direct Mi...
1096
1097
1098
  	if (err < 0)
  		return err;
  	return busy;
b20a35035   Christoph Lameter   [PATCH] page migr...
1099
1100
  
  }
  /*
   * Allocate a new page for page migration based on vma policy.
   * Start assuming that page is mapped by vma pointed to by @private.
   * Search forward from there, if not.  N.B., this assumes that the
   * list of pages handed to migrate_pages()--which is how we get here--
   * is in virtual address order.
   */
742755a1d   Christoph Lameter   [PATCH] page migr...
1108
  static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
95a402c38   Christoph Lameter   [PATCH] page migr...
1109
1110
  {
  	struct vm_area_struct *vma = (struct vm_area_struct *)private;
3ad33b243   Lee Schermerhorn   Migration: find c...
1111
  	unsigned long uninitialized_var(address);
95a402c38   Christoph Lameter   [PATCH] page migr...
1112

  	while (vma) {
  		address = page_address_in_vma(page, vma);
  		if (address != -EFAULT)
  			break;
  		vma = vma->vm_next;
  	}
11c731e81   Wanpeng Li   mm/mempolicy: fix...
1119
1120
  
  	if (PageHuge(page)) {
cc81717ed   Michal Hocko   mm: new_vma_page(...
1121
1122
  		BUG_ON(!vma);
  		return alloc_huge_page_noerr(vma, address, 1);
11c731e81   Wanpeng Li   mm/mempolicy: fix...
1123
  	}
0bf598d86   Naoya Horiguchi   mbind: add BUG_ON...
1124
  	/*
11c731e81   Wanpeng Li   mm/mempolicy: fix...
1125
  	 * if !vma, alloc_page_vma() will use task or system default policy
0bf598d86   Naoya Horiguchi   mbind: add BUG_ON...
1126
  	 */
3ad33b243   Lee Schermerhorn   Migration: find c...
1127
  	return alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
95a402c38   Christoph Lameter   [PATCH] page migr...
1128
  }
  #else
  
  static void migrate_page_add(struct page *page, struct list_head *pagelist,
  				unsigned long flags)
  {
39743889a   Christoph Lameter   [PATCH] Swap Migr...
1134
  }
0ce72d4f7   Andrew Morton   mm: do_migrate_pa...
1135
1136
  int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
  		     const nodemask_t *to, int flags)
b20a35035   Christoph Lameter   [PATCH] page migr...
1137
1138
1139
  {
  	return -ENOSYS;
  }
95a402c38   Christoph Lameter   [PATCH] page migr...
1140

699397499   Keith Owens   [PATCH] Fix do_mb...
1141
  static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
95a402c38   Christoph Lameter   [PATCH] page migr...
1142
1143
1144
  {
  	return NULL;
  }
b20a35035   Christoph Lameter   [PATCH] page migr...
1145
  #endif
dbcb0f19c   Adrian Bunk   mm/mempolicy.c: c...
1146
  static long do_mbind(unsigned long start, unsigned long len,
028fec414   David Rientjes   mempolicy: suppor...
1147
1148
  		     unsigned short mode, unsigned short mode_flags,
  		     nodemask_t *nmask, unsigned long flags)
  {
  	struct vm_area_struct *vma;
  	struct mm_struct *mm = current->mm;
  	struct mempolicy *new;
  	unsigned long end;
  	int err;
  	LIST_HEAD(pagelist);
b24f53a0b   Lee Schermerhorn   mm: mempolicy: Ad...
1156
  	if (flags & ~(unsigned long)MPOL_MF_VALID)
6ce3c4c0f   Christoph Lameter   [PATCH] Move page...
1157
  		return -EINVAL;
74c002410   Christoph Lameter   [PATCH] Consisten...
1158
  	if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
  		return -EPERM;
  
  	if (start & ~PAGE_MASK)
  		return -EINVAL;
  
  	if (mode == MPOL_DEFAULT)
  		flags &= ~MPOL_MF_STRICT;
  
  	len = (len + PAGE_SIZE - 1) & PAGE_MASK;
  	end = start + len;
  
  	if (end < start)
  		return -EINVAL;
  	if (end == start)
  		return 0;
028fec414   David Rientjes   mempolicy: suppor...
1174
  	new = mpol_new(mode, mode_flags, nmask);
6ce3c4c0f   Christoph Lameter   [PATCH] Move page...
1175
1176
  	if (IS_ERR(new))
  		return PTR_ERR(new);
b24f53a0b   Lee Schermerhorn   mm: mempolicy: Ad...
1177
1178
  	if (flags & MPOL_MF_LAZY)
  		new->flags |= MPOL_F_MOF;
  	/*
  	 * If we are using the default policy then operation
  	 * on discontinuous address spaces is okay after all
  	 */
  	if (!new)
  		flags |= MPOL_MF_DISCONTIG_OK;
028fec414   David Rientjes   mempolicy: suppor...
1185
1186
1187
  	pr_debug("mbind %lx-%lx mode:%d flags:%d nodes:%lx
  ",
  		 start, start + len, mode, mode_flags,
00ef2d2f8   David Rientjes   mm: use NUMA_NO_NODE
1188
  		 nmask ? nodes_addr(*nmask)[0] : NUMA_NO_NODE);
6ce3c4c0f   Christoph Lameter   [PATCH] Move page...
1189

0aedadf91   Christoph Lameter   mm: move migrate_...
1190
1191
1192
1193
  	if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
  
  		err = migrate_prep();
  		if (err)
b05ca7385   KOSAKI Motohiro   do_mbind(): fix m...
1194
  			goto mpol_out;
0aedadf91   Christoph Lameter   mm: move migrate_...
1195
  	}
  	{
  		NODEMASK_SCRATCH(scratch);
  		if (scratch) {
  			down_write(&mm->mmap_sem);
  			task_lock(current);
  			err = mpol_set_nodemask(new, nmask, scratch);
  			task_unlock(current);
  			if (err)
  				up_write(&mm->mmap_sem);
  		} else
  			err = -ENOMEM;
  		NODEMASK_SCRATCH_FREE(scratch);
  	}
b05ca7385   KOSAKI Motohiro   do_mbind(): fix m...
1209
1210
  	if (err)
  		goto mpol_out;
980949457   Naoya Horiguchi   mm/mempolicy: ren...
1211
  	vma = queue_pages_range(mm, start, end, nmask,
6ce3c4c0f   Christoph Lameter   [PATCH] Move page...
1212
  			  flags | MPOL_MF_INVERT, &pagelist);
b24f53a0b   Lee Schermerhorn   mm: mempolicy: Ad...
1213
  	err = PTR_ERR(vma);	/* maybe ... */
a720094de   Mel Gorman   mm: mempolicy: Hi...
1214
  	if (!IS_ERR(vma))
9d8cebd4b   KOSAKI Motohiro   mm: fix mbind vma...
1215
  		err = mbind_range(mm, start, end, new);
7e2ab150d   Christoph Lameter   [PATCH] Direct Mi...
1216

b24f53a0b   Lee Schermerhorn   mm: mempolicy: Ad...
1217
1218
  	if (!err) {
  		int nr_failed = 0;
cf608ac19   Minchan Kim   mm: compaction: f...
1219
  		if (!list_empty(&pagelist)) {
b24f53a0b   Lee Schermerhorn   mm: mempolicy: Ad...
1220
  			WARN_ON_ONCE(flags & MPOL_MF_LAZY);
95a402c38   Christoph Lameter   [PATCH] page migr...
1221
  			nr_failed = migrate_pages(&pagelist, new_vma_page,
9c620e2bc   Hugh Dickins   mm: remove offlin...
1222
1223
  					(unsigned long)vma,
  					MIGRATE_SYNC, MR_MEMPOLICY_MBIND);
cf608ac19   Minchan Kim   mm: compaction: f...
1224
  			if (nr_failed)
74060e4d7   Naoya Horiguchi   mm: mbind: add hu...
1225
  				putback_movable_pages(&pagelist);
cf608ac19   Minchan Kim   mm: compaction: f...
1226
  		}
6ce3c4c0f   Christoph Lameter   [PATCH] Move page...
1227

b24f53a0b   Lee Schermerhorn   mm: mempolicy: Ad...
1228
  		if (nr_failed && (flags & MPOL_MF_STRICT))
6ce3c4c0f   Christoph Lameter   [PATCH] Move page...
1229
  			err = -EIO;
ab8a3e14e   KOSAKI Motohiro   mbind(): fix leak...
1230
  	} else
b0e5fd735   Joonsoo Kim   mm/mempolicy: cor...
1231
  		putback_movable_pages(&pagelist);
b20a35035   Christoph Lameter   [PATCH] page migr...
1232

6ce3c4c0f   Christoph Lameter   [PATCH] Move page...
1233
  	up_write(&mm->mmap_sem);
b05ca7385   KOSAKI Motohiro   do_mbind(): fix m...
1234
   mpol_out:
f0be3d32b   Lee Schermerhorn   mempolicy: rename...
1235
  	mpol_put(new);
6ce3c4c0f   Christoph Lameter   [PATCH] Move page...
1236
1237
  	return err;
  }
39743889a   Christoph Lameter   [PATCH] Swap Migr...
1238
  /*
   * User space interface with variable sized bitmaps for nodelists.
   */
  
  /* Copy a node mask from user space. */
39743889a   Christoph Lameter   [PATCH] Swap Migr...
1243
  static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
  		     unsigned long maxnode)
  {
  	unsigned long k;
  	unsigned long nlongs;
  	unsigned long endmask;
  
  	--maxnode;
  	nodes_clear(*nodes);
  	if (maxnode == 0 || !nmask)
  		return 0;
a9c930bac   Andi Kleen   [PATCH] Fix units...
1254
  	if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
636f13c17   Chris Wright   [PATCH] sys_mbind...
1255
  		return -EINVAL;
  
  	nlongs = BITS_TO_LONGS(maxnode);
  	if ((maxnode % BITS_PER_LONG) == 0)
  		endmask = ~0UL;
  	else
  		endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
  
  	/* When the user specified more nodes than supported, just check
  	   whether the unsupported part is all zero. */
  	if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
  		if (nlongs > PAGE_SIZE/sizeof(long))
  			return -EINVAL;
  		for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
  			unsigned long t;
  			if (get_user(t, nmask + k))
  				return -EFAULT;
  			if (k == nlongs - 1) {
  				if (t & endmask)
  					return -EINVAL;
  			} else if (t)
  				return -EINVAL;
  		}
  		nlongs = BITS_TO_LONGS(MAX_NUMNODES);
  		endmask = ~0UL;
  	}
  
  	if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
  		return -EFAULT;
  	nodes_addr(*nodes)[nlongs-1] &= endmask;
  	return 0;
  }
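  /*
   * Worked example (for illustration): a user-space maxnode of 17 is
   * decremented to 16 above, giving nlongs = 1 and endmask = 0xffff on a
   * 64-bit kernel, so only the low 16 bits of the first word are accepted
   * as node numbers.
   */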
  
  /* Copy a kernel node mask to user space */
  static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
  			      nodemask_t *nodes)
  {
  	unsigned long copy = ALIGN(maxnode-1, 64) / 8;
  	const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long);
  
  	if (copy > nbytes) {
  		if (copy > PAGE_SIZE)
  			return -EINVAL;
  		if (clear_user((char __user *)mask + nbytes, copy - nbytes))
  			return -EFAULT;
  		copy = nbytes;
  	}
  	return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
  }
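  /*
   * Worked example (for illustration, assuming a 64-bit kernel built with
   * MAX_NUMNODES = 64): maxnode = 1024 makes copy = ALIGN(1023, 64) / 8 =
   * 128 bytes while nbytes = 8, so the trailing 120 user bytes are cleared
   * and only the first 8 bytes are copied from the nodemask.
   */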
  SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len,
  		unsigned long, mode, unsigned long __user *, nmask,
  		unsigned long, maxnode, unsigned, flags)
  {
  	nodemask_t nodes;
  	int err;
028fec414   David Rientjes   mempolicy: suppor...
1310
  	unsigned short mode_flags;
8bccd85ff   Christoph Lameter   [PATCH] Implement...
1311

028fec414   David Rientjes   mempolicy: suppor...
1312
1313
  	mode_flags = mode & MPOL_MODE_FLAGS;
  	mode &= ~MPOL_MODE_FLAGS;
a3b51e014   David Rientjes   mempolicy: conver...
1314
1315
  	if (mode >= MPOL_MAX)
  		return -EINVAL;
4c50bc011   David Rientjes   mempolicy: add MP...
1316
1317
1318
  	if ((mode_flags & MPOL_F_STATIC_NODES) &&
  	    (mode_flags & MPOL_F_RELATIVE_NODES))
  		return -EINVAL;
8bccd85ff   Christoph Lameter   [PATCH] Implement...
1319
1320
1321
  	err = get_nodes(&nodes, nmask, maxnode);
  	if (err)
  		return err;
028fec414   David Rientjes   mempolicy: suppor...
1322
  	return do_mbind(start, len, mode, mode_flags, &nodes, flags);
8bccd85ff   Christoph Lameter   [PATCH] Implement...
1323
1324
1325
  }
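  /*
   * Illustrative user-space sketch (assumes the libnuma <numaif.h>
   * wrapper; error handling omitted):
   *
   *	unsigned long mask = 1UL << 0;		// node 0 only
   *	void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
   *		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
   *	mbind(p, len, MPOL_BIND, &mask, sizeof(mask) * 8 + 1,
   *	      MPOL_MF_MOVE | MPOL_MF_STRICT);
   */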
  
  /* Set the process memory policy */
938bb9f5e   Heiko Carstens   [CVE-2009-0029] S...
1326
1327
  SYSCALL_DEFINE3(set_mempolicy, int, mode, unsigned long __user *, nmask,
  		unsigned long, maxnode)
  {
  	int err;
  	nodemask_t nodes;
028fec414   David Rientjes   mempolicy: suppor...
1331
  	unsigned short flags;
8bccd85ff   Christoph Lameter   [PATCH] Implement...
1332

028fec414   David Rientjes   mempolicy: suppor...
1333
1334
1335
  	flags = mode & MPOL_MODE_FLAGS;
  	mode &= ~MPOL_MODE_FLAGS;
  	if ((unsigned int)mode >= MPOL_MAX)
8bccd85ff   Christoph Lameter   [PATCH] Implement...
1336
  		return -EINVAL;
4c50bc011   David Rientjes   mempolicy: add MP...
1337
1338
  	if ((flags & MPOL_F_STATIC_NODES) && (flags & MPOL_F_RELATIVE_NODES))
  		return -EINVAL;
8bccd85ff   Christoph Lameter   [PATCH] Implement...
1339
1340
1341
  	err = get_nodes(&nodes, nmask, maxnode);
  	if (err)
  		return err;
028fec414   David Rientjes   mempolicy: suppor...
1342
  	return do_set_mempolicy(mode, flags, &nodes);
8bccd85ff   Christoph Lameter   [PATCH] Implement...
1343
  }
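  /*
   * Illustrative user-space sketch (assumes the libnuma <numaif.h>
   * wrapper):
   *
   *	unsigned long mask = (1UL << 0) | (1UL << 1);
   *	set_mempolicy(MPOL_INTERLEAVE, &mask, sizeof(mask) * 8 + 1);
   */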
  SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
  		const unsigned long __user *, old_nodes,
  		const unsigned long __user *, new_nodes)
39743889a   Christoph Lameter   [PATCH] Swap Migr...
1347
  {
c69e8d9c0   David Howells   CRED: Use RCU to ...
1348
  	const struct cred *cred = current_cred(), *tcred;
596d7cfa2   KOSAKI Motohiro   mempolicy: reduce...
1349
  	struct mm_struct *mm = NULL;
39743889a   Christoph Lameter   [PATCH] Swap Migr...
1350
  	struct task_struct *task;
39743889a   Christoph Lameter   [PATCH] Swap Migr...
1351
1352
  	nodemask_t task_nodes;
  	int err;
  	nodemask_t *old;
  	nodemask_t *new;
  	NODEMASK_SCRATCH(scratch);
  
  	if (!scratch)
  		return -ENOMEM;
39743889a   Christoph Lameter   [PATCH] Swap Migr...
1359

  	old = &scratch->mask1;
  	new = &scratch->mask2;
  
  	err = get_nodes(old, old_nodes, maxnode);
39743889a   Christoph Lameter   [PATCH] Swap Migr...
1364
  	if (err)
596d7cfa2   KOSAKI Motohiro   mempolicy: reduce...
1365
  		goto out;
39743889a   Christoph Lameter   [PATCH] Swap Migr...
1366

596d7cfa2   KOSAKI Motohiro   mempolicy: reduce...
1367
  	err = get_nodes(new, new_nodes, maxnode);
39743889a   Christoph Lameter   [PATCH] Swap Migr...
1368
  	if (err)
596d7cfa2   KOSAKI Motohiro   mempolicy: reduce...
1369
  		goto out;
39743889a   Christoph Lameter   [PATCH] Swap Migr...
1370
1371
  
  	/* Find the mm_struct */
55cfaa3cb   Zeng Zhaoming   mm/mempolicy.c: a...
1372
  	rcu_read_lock();
228ebcbe6   Pavel Emelyanov   Uninline find_tas...
1373
  	task = pid ? find_task_by_vpid(pid) : current;
39743889a   Christoph Lameter   [PATCH] Swap Migr...
1374
  	if (!task) {
55cfaa3cb   Zeng Zhaoming   mm/mempolicy.c: a...
1375
  		rcu_read_unlock();
596d7cfa2   KOSAKI Motohiro   mempolicy: reduce...
1376
1377
  		err = -ESRCH;
  		goto out;
39743889a   Christoph Lameter   [PATCH] Swap Migr...
1378
  	}
3268c63ed   Christoph Lameter   mm: fix move/migr...
1379
  	get_task_struct(task);
39743889a   Christoph Lameter   [PATCH] Swap Migr...
1380

596d7cfa2   KOSAKI Motohiro   mempolicy: reduce...
1381
  	err = -EINVAL;
  
  	/*
  	 * Check if this process has the right to modify the specified
  	 * process. The right exists if the process has administrative
7f927fcc2   Alexey Dobriyan   [PATCH] Typo fixes
1386
  	 * capabilities, superuser privileges or the same
39743889a   Christoph Lameter   [PATCH] Swap Migr...
1387
1388
  	 * userid as the target process.
  	 */
c69e8d9c0   David Howells   CRED: Use RCU to ...
1389
  	tcred = __task_cred(task);
b38a86eb1   Eric W. Biederman   userns: Convert t...
1390
1391
  	if (!uid_eq(cred->euid, tcred->suid) && !uid_eq(cred->euid, tcred->uid) &&
  	    !uid_eq(cred->uid,  tcred->suid) && !uid_eq(cred->uid,  tcred->uid) &&
74c002410   Christoph Lameter   [PATCH] Consisten...
1392
  	    !capable(CAP_SYS_NICE)) {
c69e8d9c0   David Howells   CRED: Use RCU to ...
1393
  		rcu_read_unlock();
39743889a   Christoph Lameter   [PATCH] Swap Migr...
1394
  		err = -EPERM;
3268c63ed   Christoph Lameter   mm: fix move/migr...
1395
  		goto out_put;
39743889a   Christoph Lameter   [PATCH] Swap Migr...
1396
  	}
c69e8d9c0   David Howells   CRED: Use RCU to ...
1397
  	rcu_read_unlock();
39743889a   Christoph Lameter   [PATCH] Swap Migr...
1398
1399
1400
  
  	task_nodes = cpuset_mems_allowed(task);
  	/* Is the user allowed to access the target nodes? */
596d7cfa2   KOSAKI Motohiro   mempolicy: reduce...
1401
  	if (!nodes_subset(*new, task_nodes) && !capable(CAP_SYS_NICE)) {
39743889a   Christoph Lameter   [PATCH] Swap Migr...
1402
  		err = -EPERM;
3268c63ed   Christoph Lameter   mm: fix move/migr...
1403
  		goto out_put;
39743889a   Christoph Lameter   [PATCH] Swap Migr...
1404
  	}
01f13bd60   Lai Jiangshan   mempolicy: use N_...
1405
  	if (!nodes_subset(*new, node_states[N_MEMORY])) {
3b42d28b2   Christoph Lameter   Page migration: D...
1406
  		err = -EINVAL;
3268c63ed   Christoph Lameter   mm: fix move/migr...
1407
  		goto out_put;
3b42d28b2   Christoph Lameter   Page migration: D...
1408
  	}
86c3a7645   David Quigley   [PATCH] SELinux: ...
1409
1410
  	err = security_task_movememory(task);
  	if (err)
3268c63ed   Christoph Lameter   mm: fix move/migr...
1411
  		goto out_put;
86c3a7645   David Quigley   [PATCH] SELinux: ...
1412

3268c63ed   Christoph Lameter   mm: fix move/migr...
1413
1414
  	mm = get_task_mm(task);
  	put_task_struct(task);
f2a9ef880   Sasha Levin   mm: fix NULL ptr ...
1415
1416
  
  	if (!mm) {
3268c63ed   Christoph Lameter   mm: fix move/migr...
1417
  		err = -EINVAL;
  		goto out;
  	}
  
  	err = do_migrate_pages(mm, old, new,
  		capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
3268c63ed   Christoph Lameter   mm: fix move/migr...
1423
1424
1425
  
  	mmput(mm);
  out:
596d7cfa2   KOSAKI Motohiro   mempolicy: reduce...
1426
  	NODEMASK_SCRATCH_FREE(scratch);
39743889a   Christoph Lameter   [PATCH] Swap Migr...
1427
  	return err;
  
  out_put:
  	put_task_struct(task);
  	goto out;
39743889a   Christoph Lameter   [PATCH] Swap Migr...
1432
  }
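  /*
   * Illustrative user-space sketch (assumes the libnuma <numaif.h>
   * wrapper; error handling omitted):
   *
   *	unsigned long old = 1UL << 0, new = 1UL << 1;
   *	migrate_pages(pid, sizeof(old) * 8 + 1, &old, &new);
   *
   * moves pid's pages from node 0 to node 1, subject to the credential
   * and cpuset checks made above.
   */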
8bccd85ff   Christoph Lameter   [PATCH] Implement...
1433
  /* Retrieve NUMA policy */
938bb9f5e   Heiko Carstens   [CVE-2009-0029] S...
1434
1435
1436
  SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
  		unsigned long __user *, nmask, unsigned long, maxnode,
  		unsigned long, addr, unsigned long, flags)
8bccd85ff   Christoph Lameter   [PATCH] Implement...
1437
  {
dbcb0f19c   Adrian Bunk   mm/mempolicy.c: c...
1438
1439
  	int err;
  	int uninitialized_var(pval);
  	nodemask_t nodes;
  
  	if (nmask != NULL && maxnode < MAX_NUMNODES)
  		return -EINVAL;
  
  	err = do_get_mempolicy(&pval, &nodes, addr, flags);
  
  	if (err)
  		return err;
  
  	if (policy && put_user(pval, policy))
  		return -EFAULT;
  
  	if (nmask)
  		err = copy_nodes_to_user(nmask, maxnode, &nodes);
  
  	return err;
  }
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1458
  #ifdef CONFIG_COMPAT
  COMPAT_SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
  		       compat_ulong_t __user *, nmask,
  		       compat_ulong_t, maxnode,
  		       compat_ulong_t, addr, compat_ulong_t, flags)
  {
  	long err;
  	unsigned long __user *nm = NULL;
  	unsigned long nr_bits, alloc_size;
  	DECLARE_BITMAP(bm, MAX_NUMNODES);
  
  	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
  	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
  
  	if (nmask)
  		nm = compat_alloc_user_space(alloc_size);
  
  	err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags);
  
  	if (!err && nmask) {
2bbff6c76   KAMEZAWA Hiroyuki   mm/mempolicy.c: m...
1478
1479
1480
  		unsigned long copy_size;
  		copy_size = min_t(unsigned long, sizeof(bm), alloc_size);
  		err = copy_from_user(bm, nm, copy_size);
  		/* ensure entire bitmap is zeroed */
  		err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
  		err |= compat_put_bitmap(nmask, bm, nr_bits);
  	}
  
  	return err;
  }
c93e0f6c8   Heiko Carstens   mm/compat: conver...
1488
1489
  COMPAT_SYSCALL_DEFINE3(set_mempolicy, int, mode, compat_ulong_t __user *, nmask,
  		       compat_ulong_t, maxnode)
  {
  	long err = 0;
  	unsigned long __user *nm = NULL;
  	unsigned long nr_bits, alloc_size;
  	DECLARE_BITMAP(bm, MAX_NUMNODES);
  
  	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
  	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
  
  	if (nmask) {
  		err = compat_get_bitmap(bm, nmask, nr_bits);
  		nm = compat_alloc_user_space(alloc_size);
  		err |= copy_to_user(nm, bm, alloc_size);
  	}
  
  	if (err)
  		return -EFAULT;
  
  	return sys_set_mempolicy(mode, nm, nr_bits+1);
  }
c93e0f6c8   Heiko Carstens   mm/compat: conver...
1510
1511
1512
  COMPAT_SYSCALL_DEFINE6(mbind, compat_ulong_t, start, compat_ulong_t, len,
  		       compat_ulong_t, mode, compat_ulong_t __user *, nmask,
  		       compat_ulong_t, maxnode, compat_ulong_t, flags)
  {
  	long err = 0;
  	unsigned long __user *nm = NULL;
  	unsigned long nr_bits, alloc_size;
dfcd3c0dc   Andi Kleen   [PATCH] Convert m...
1517
  	nodemask_t bm;
  
  	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
  	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
  
  	if (nmask) {
dfcd3c0dc   Andi Kleen   [PATCH] Convert m...
1523
  		err = compat_get_bitmap(nodes_addr(bm), nmask, nr_bits);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1524
  		nm = compat_alloc_user_space(alloc_size);
dfcd3c0dc   Andi Kleen   [PATCH] Convert m...
1525
  		err |= copy_to_user(nm, nodes_addr(bm), alloc_size);
  	}
  
  	if (err)
  		return -EFAULT;
  
  	return sys_mbind(start, len, mode, nm, nr_bits+1, flags);
  }
  
  #endif
  /*
   * get_vma_policy(@task, @vma, @addr)
   * @task - task for fallback if vma policy == default
   * @vma   - virtual memory area whose policy is sought
   * @addr  - address in @vma for shared policy lookup
   *
   * Returns effective policy for a VMA at specified address.
   * Falls back to @task or system default policy, as necessary.
32f8516a8   David Rientjes   mm, mempolicy: fi...
1543
1544
   * Current or other task's task mempolicy and non-shared vma policies must be
   * protected by task_lock(task) by the caller.
   * Shared policies [those marked as MPOL_F_SHARED] require an extra reference
   * count--added by the get_policy() vm_op, as appropriate--to protect against
   * freeing by another task.  It is the caller's responsibility to free the
   * extra reference for shared policies.
480eccf9a   Lee Schermerhorn   Fix NUMA Memory P...
1549
   */
d98f6cb67   Stephen Wilson   mm: export get_vm...
1550
  struct mempolicy *get_vma_policy(struct task_struct *task,
48fce3429   Christoph Lameter   [PATCH] mempolici...
1551
  		struct vm_area_struct *vma, unsigned long addr)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1552
  {
5606e3877   Mel Gorman   mm: numa: Migrate...
1553
  	struct mempolicy *pol = get_task_policy(task);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1554
1555
  
  	if (vma) {
480eccf9a   Lee Schermerhorn   Fix NUMA Memory P...
1556
  		if (vma->vm_ops && vma->vm_ops->get_policy) {
  			struct mempolicy *vpol = vma->vm_ops->get_policy(vma,
  									addr);
  			if (vpol)
  				pol = vpol;
00442ad04   Mel Gorman   mempolicy: fix a ...
1561
  		} else if (vma->vm_policy) {
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1562
  			pol = vma->vm_policy;
  
  			/*
  			 * shmem_alloc_page() passes MPOL_F_SHARED policy with
  			 * a pseudo vma whose vma->vm_ops=NULL. Take a reference
  			 * count on these policies which will be dropped by
  			 * mpol_cond_put() later
  			 */
  			if (mpol_needs_cond_ref(pol))
  				mpol_get(pol);
  		}
  	}
  	if (!pol)
  		pol = &default_policy;
  	return pol;
  }
  bool vma_policy_mof(struct task_struct *task, struct vm_area_struct *vma)
  {
  	struct mempolicy *pol = get_task_policy(task);
  	if (vma) {
  		if (vma->vm_ops && vma->vm_ops->get_policy) {
  			bool ret = false;
  
  			pol = vma->vm_ops->get_policy(vma, vma->vm_start);
  			if (pol && (pol->flags & MPOL_F_MOF))
  				ret = true;
  			mpol_cond_put(pol);
  
  			return ret;
  		} else if (vma->vm_policy) {
  			pol = vma->vm_policy;
  		}
  	}
  
  	if (!pol)
  		return default_policy.flags & MPOL_F_MOF;
  
  	return pol->flags & MPOL_F_MOF;
  }
  static int apply_policy_zone(struct mempolicy *policy, enum zone_type zone)
  {
  	enum zone_type dynamic_policy_zone = policy_zone;
  
  	BUG_ON(dynamic_policy_zone == ZONE_MOVABLE);
  
  	/*
  	 * if policy->v.nodes has movable memory only,
  	 * we apply policy when gfp_zone(gfp) = ZONE_MOVABLE only.
  	 *
  	 * policy->v.nodes intersects with node_states[N_MEMORY],
  	 * so if the following test fails, it implies
  	 * policy->v.nodes has movable memory only.
  	 */
  	if (!nodes_intersects(policy->v.nodes, node_states[N_HIGH_MEMORY]))
  		dynamic_policy_zone = ZONE_MOVABLE;
  
  	return zone >= dynamic_policy_zone;
  }
  /*
   * Return a nodemask representing a mempolicy for filtering nodes for
   * page allocation
   */
  static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
19770b326   Mel Gorman   mm: filter based ...
1625
1626
  {
  	/* Lower zones don't get a nodemask applied for MPOL_BIND */
45c4745af   Lee Schermerhorn   mempolicy: rename...
1627
  	if (unlikely(policy->mode == MPOL_BIND) &&
d3eb1570a   Lai Jiangshan   mempolicy: fix is...
1628
  			apply_policy_zone(policy, gfp_zone(gfp)) &&
  			cpuset_nodemask_valid_mems_allowed(&policy->v.nodes))
  		return &policy->v.nodes;
  
  	return NULL;
  }
52cd3b074   Lee Schermerhorn   mempolicy: rework...
1634
  /* Return a zonelist indicated by gfp for node representing a mempolicy */
2f5f9486f   Andi Kleen   mm: change alloc_...
1635
1636
  static struct zonelist *policy_zonelist(gfp_t gfp, struct mempolicy *policy,
  	int nd)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1637
  {
45c4745af   Lee Schermerhorn   mempolicy: rename...
1638
  	switch (policy->mode) {
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1639
  	case MPOL_PREFERRED:
fc36b8d3d   Lee Schermerhorn   mempolicy: use MP...
1640
1641
  		if (!(policy->flags & MPOL_F_LOCAL))
  			nd = policy->v.preferred_node;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1642
1643
  		break;
  	case MPOL_BIND:
19770b326   Mel Gorman   mm: filter based ...
1644
  		/*
52cd3b074   Lee Schermerhorn   mempolicy: rework...
1645
1646
  		 * Normally, MPOL_BIND allocations are node-local within the
  		 * allowed nodemask.  However, if __GFP_THISNODE is set and the
6eb27e1fd   Bob Liu   mempolicy: remove...
1647
  		 * current node isn't part of the mask, we use the zonelist for
52cd3b074   Lee Schermerhorn   mempolicy: rework...
1648
  		 * the first node in the mask instead.
19770b326   Mel Gorman   mm: filter based ...
1649
  		 */
  		if (unlikely(gfp & __GFP_THISNODE) &&
  				unlikely(!node_isset(nd, policy->v.nodes)))
  			nd = first_node(policy->v.nodes);
  		break;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1654
  	default:
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1655
1656
  		BUG();
  	}
0e88460da   Mel Gorman   mm: introduce nod...
1657
  	return node_zonelist(nd, gfp);
  }
  
  /* Do dynamic interleaving for a process */
  static unsigned interleave_nodes(struct mempolicy *policy)
  {
  	unsigned nid, next;
  	struct task_struct *me = current;
  
  	nid = me->il_next;
dfcd3c0dc   Andi Kleen   [PATCH] Convert m...
1667
  	next = next_node(nid, policy->v.nodes);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1668
  	if (next >= MAX_NUMNODES)
dfcd3c0dc   Andi Kleen   [PATCH] Convert m...
1669
  		next = first_node(policy->v.nodes);
f5b087b52   David Rientjes   mempolicy: add MP...
1670
1671
  	if (next < MAX_NUMNODES)
  		me->il_next = next;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1672
1673
  	return nid;
  }
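  /*
   * Worked example (for illustration): with policy nodes {0,2,3} and
   * il_next == 2, this returns 2 and advances il_next to 3; the next
   * call returns 3 and wraps il_next back to 0.
   */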
  /*
   * Depending on the memory policy provide a node from which to allocate the
   * next slab entry.
   */
2a389610a   David Rientjes   mm, mempolicy: re...
1678
  unsigned int mempolicy_slab_node(void)
dc85da15d   Christoph Lameter   [PATCH] NUMA poli...
1679
  {
e7b691b08   Andi Kleen   slab/mempolicy: a...
1680
  	struct mempolicy *policy;
2a389610a   David Rientjes   mm, mempolicy: re...
1681
  	int node = numa_mem_id();
e7b691b08   Andi Kleen   slab/mempolicy: a...
1682
1683
  
  	if (in_interrupt())
2a389610a   David Rientjes   mm, mempolicy: re...
1684
  		return node;
e7b691b08   Andi Kleen   slab/mempolicy: a...
1685
1686
  
  	policy = current->mempolicy;
fc36b8d3d   Lee Schermerhorn   mempolicy: use MP...
1687
  	if (!policy || policy->flags & MPOL_F_LOCAL)
2a389610a   David Rientjes   mm, mempolicy: re...
1688
  		return node;
bea904d54   Lee Schermerhorn   mempolicy: use MP...
1689
1690
1691
  
  	switch (policy->mode) {
  	case MPOL_PREFERRED:
  		/*
  		 * handled MPOL_F_LOCAL above
  		 */
  		return policy->v.preferred_node;
765c4507a   Christoph Lameter   [PATCH] GFP_THISN...
1696

dc85da15d   Christoph Lameter   [PATCH] NUMA poli...
1697
1698
  	case MPOL_INTERLEAVE:
  		return interleave_nodes(policy);
dd1a239f6   Mel Gorman   mm: have zonelist...
1699
  	case MPOL_BIND: {
  		/*
  		 * Follow bind policy behavior and start allocation at the
  		 * first node.
  		 */
19770b326   Mel Gorman   mm: filter based ...
1704
1705
1706
  		struct zonelist *zonelist;
  		struct zone *zone;
  		enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL);
2a389610a   David Rientjes   mm, mempolicy: re...
1707
  		zonelist = &NODE_DATA(node)->node_zonelists[0];
19770b326   Mel Gorman   mm: filter based ...
1708
1709
1710
  		(void)first_zones_zonelist(zonelist, highest_zoneidx,
  							&policy->v.nodes,
  							&zone);
2a389610a   David Rientjes   mm, mempolicy: re...
1711
  		return zone ? zone->node : node;
dd1a239f6   Mel Gorman   mm: have zonelist...
1712
  	}
dc85da15d   Christoph Lameter   [PATCH] NUMA poli...
1713

dc85da15d   Christoph Lameter   [PATCH] NUMA poli...
1714
  	default:
bea904d54   Lee Schermerhorn   mempolicy: use MP...
1715
  		BUG();
dc85da15d   Christoph Lameter   [PATCH] NUMA poli...
1716
1717
  	}
  }
  /* Do static interleaving for a VMA with known offset. */
  static unsigned offset_il_node(struct mempolicy *pol,
  		struct vm_area_struct *vma, unsigned long off)
  {
dfcd3c0dc   Andi Kleen   [PATCH] Convert m...
1722
  	unsigned nnodes = nodes_weight(pol->v.nodes);
f5b087b52   David Rientjes   mempolicy: add MP...
1723
  	unsigned target;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1724
  	int c;
b76ac7e73   Jianguo Wu   mm/mempolicy: use...
1725
  	int nid = NUMA_NO_NODE;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1726

f5b087b52   David Rientjes   mempolicy: add MP...
1727
1728
1729
  	if (!nnodes)
  		return numa_node_id();
  	target = (unsigned int)off % nnodes;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1730
1731
  	c = 0;
  	do {
dfcd3c0dc   Andi Kleen   [PATCH] Convert m...
1732
  		nid = next_node(nid, pol->v.nodes);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1733
1734
  		c++;
  	} while (c <= target);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1735
1736
  	return nid;
  }
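  /*
   * Worked example (for illustration): with pol->v.nodes = {1,3,5} and
   * off = 7, nnodes = 3 and target = 7 % 3 = 1, so the walk stops on the
   * second node of the mask and returns node 3.
   */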
  /* Determine a node number for interleave */
  static inline unsigned interleave_nid(struct mempolicy *pol,
  		 struct vm_area_struct *vma, unsigned long addr, int shift)
  {
  	if (vma) {
  		unsigned long off;
  		/*
  		 * for small pages, there is no difference between
  		 * shift and PAGE_SHIFT, so the bit-shift is safe.
  		 * for huge pages, since vm_pgoff is in units of small
  		 * pages, we need to shift off the always 0 bits to get
  		 * a useful offset.
  		 */
  		BUG_ON(shift < PAGE_SHIFT);
  		off = vma->vm_pgoff >> (shift - PAGE_SHIFT);
  		off += (addr - vma->vm_start) >> shift;
  		return offset_il_node(pol, vma, off);
  	} else
  		return interleave_nodes(pol);
  }
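  /*
   * For example (illustration, assuming 4KB base pages): a 2MB huge page
   * VMA uses shift = 21, so off is vm_pgoff >> 9 plus the huge-page index
   * of addr within the VMA.
   */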
778d3b0ff   Michal Hocko   cpusets: randomiz...
1757
1758
  /*
   * Return the bit number of a random bit set in the nodemask.
b76ac7e73   Jianguo Wu   mm/mempolicy: use...
1759
   * (returns NUMA_NO_NODE if nodemask is empty)
778d3b0ff   Michal Hocko   cpusets: randomiz...
1760
1761
1762
   */
  int node_random(const nodemask_t *maskp)
  {
b76ac7e73   Jianguo Wu   mm/mempolicy: use...
1763
  	int w, bit = NUMA_NO_NODE;
  
  	w = nodes_weight(*maskp);
  	if (w)
  		bit = bitmap_ord_to_pos(maskp->bits,
  			get_random_int() % w, MAX_NUMNODES);
  	return bit;
  }
00ac59adf   Kenneth W Chen   [PATCH] x86_64: F...
1771
  #ifdef CONFIG_HUGETLBFS
  /*
   * huge_zonelist(@vma, @addr, @gfp_flags, @mpol)
   * @vma = virtual memory area whose policy is sought
   * @addr = address in @vma for shared policy lookup and interleave policy
   * @gfp_flags = for requested zone
19770b326   Mel Gorman   mm: filter based ...
1777
1778
   * @mpol = pointer to mempolicy pointer for reference counted mempolicy
   * @nodemask = pointer to nodemask pointer for MPOL_BIND nodemask
480eccf9a   Lee Schermerhorn   Fix NUMA Memory P...
1779
   *
   * Returns a zonelist suitable for a huge page allocation and a pointer
   * to the struct mempolicy for conditional unref after allocation.
   * If the effective policy is 'BIND, returns a pointer to the mempolicy's
   * @nodemask for filtering the zonelist.
c0ff7453b   Miao Xie   cpuset,mm: fix no...
1784
   *
d26914d11   Mel Gorman   mm: optimize put_...
1785
   * Must be protected by read_mems_allowed_begin()
480eccf9a   Lee Schermerhorn   Fix NUMA Memory P...
1786
   */
396faf030   Mel Gorman   Allow huge page a...
1787
  struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr,
19770b326   Mel Gorman   mm: filter based ...
1788
1789
  				gfp_t gfp_flags, struct mempolicy **mpol,
  				nodemask_t **nodemask)
5da7ca860   Christoph Lameter   [PATCH] Add NUMA ...
1790
  {
480eccf9a   Lee Schermerhorn   Fix NUMA Memory P...
1791
  	struct zonelist *zl;
5da7ca860   Christoph Lameter   [PATCH] Add NUMA ...
1792

52cd3b074   Lee Schermerhorn   mempolicy: rework...
1793
  	*mpol = get_vma_policy(current, vma, addr);
19770b326   Mel Gorman   mm: filter based ...
1794
  	*nodemask = NULL;	/* assume !MPOL_BIND */
5da7ca860   Christoph Lameter   [PATCH] Add NUMA ...
1795

52cd3b074   Lee Schermerhorn   mempolicy: rework...
1796
1797
  	if (unlikely((*mpol)->mode == MPOL_INTERLEAVE)) {
  		zl = node_zonelist(interleave_nid(*mpol, vma, addr,
a55164389   Andi Kleen   hugetlb: modular ...
1798
  				huge_page_shift(hstate_vma(vma))), gfp_flags);
52cd3b074   Lee Schermerhorn   mempolicy: rework...
1799
  	} else {
2f5f9486f   Andi Kleen   mm: change alloc_...
1800
  		zl = policy_zonelist(gfp_flags, *mpol, numa_node_id());
52cd3b074   Lee Schermerhorn   mempolicy: rework...
1801
1802
  		if ((*mpol)->mode == MPOL_BIND)
  			*nodemask = &(*mpol)->v.nodes;
480eccf9a   Lee Schermerhorn   Fix NUMA Memory P...
1803
1804
  	}
  	return zl;
5da7ca860   Christoph Lameter   [PATCH] Add NUMA ...
1805
  }
  
  /*
   * init_nodemask_of_mempolicy
   *
   * If the current task's mempolicy is "default" [NULL], return 'false'
   * to indicate default policy.  Otherwise, extract the policy nodemask
   * for 'bind' or 'interleave' policy into the argument nodemask, or
   * initialize the argument nodemask to contain the single node for
   * 'preferred' or 'local' policy and return 'true' to indicate presence
   * of non-default mempolicy.
   *
   * We don't bother with reference counting the mempolicy [mpol_get/put]
   * because the current task is examining its own mempolicy and a task's
   * mempolicy is only ever changed by the task itself.
   *
   * N.B., it is the caller's responsibility to free a returned nodemask.
   */
  bool init_nodemask_of_mempolicy(nodemask_t *mask)
  {
  	struct mempolicy *mempolicy;
  	int nid;
  
  	if (!(mask && current->mempolicy))
  		return false;
c0ff7453b   Miao Xie   cpuset,mm: fix no...
1830
  	task_lock(current);
  	mempolicy = current->mempolicy;
  	switch (mempolicy->mode) {
  	case MPOL_PREFERRED:
  		if (mempolicy->flags & MPOL_F_LOCAL)
  			nid = numa_node_id();
  		else
  			nid = mempolicy->v.preferred_node;
  		init_nodemask_of_node(mask, nid);
  		break;
  
  	case MPOL_BIND:
  		/* Fall through */
  	case MPOL_INTERLEAVE:
  		*mask =  mempolicy->v.nodes;
  		break;
  
  	default:
  		BUG();
  	}
c0ff7453b   Miao Xie   cpuset,mm: fix no...
1850
  	task_unlock(current);
06808b082   Lee Schermerhorn   hugetlb: derive h...
1851
1852
1853
  
  	return true;
  }
00ac59adf   Kenneth W Chen   [PATCH] x86_64: F...
1854
  #endif
5da7ca860   Christoph Lameter   [PATCH] Add NUMA ...
1855

  /*
   * mempolicy_nodemask_intersects
   *
   * If tsk's mempolicy is "default" [NULL], return 'true' to indicate default
   * policy.  Otherwise, check for intersection between mask and the policy
   * nodemask for 'bind' or 'interleave' policy.  For 'preferred' or 'local'
   * policy, always return true since it may allocate elsewhere on fallback.
   *
   * Takes task_lock(tsk) to prevent freeing of its mempolicy.
   */
  bool mempolicy_nodemask_intersects(struct task_struct *tsk,
  					const nodemask_t *mask)
  {
  	struct mempolicy *mempolicy;
  	bool ret = true;
  
  	if (!mask)
  		return ret;
  	task_lock(tsk);
  	mempolicy = tsk->mempolicy;
  	if (!mempolicy)
  		goto out;
  
  	switch (mempolicy->mode) {
  	case MPOL_PREFERRED:
  		/*
  		 * MPOL_PREFERRED and MPOL_F_LOCAL only specify preferred nodes
  		 * to allocate from; the task may fall back to other nodes when
  		 * OOM.  Thus, it's possible for tsk to have allocated memory
  		 * from nodes in mask.
  		 */
  		break;
  	case MPOL_BIND:
  	case MPOL_INTERLEAVE:
  		ret = nodes_intersects(mempolicy->v.nodes, *mask);
  		break;
  	default:
  		BUG();
  	}
  out:
  	task_unlock(tsk);
  	return ret;
  }
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1899
1900
  /* Allocate a page in interleaved policy.
     Own path because it needs to do special accounting. */
662f3a0b9   Andi Kleen   [PATCH] Remove ne...
1901
1902
  static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
  					unsigned nid)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1903
1904
1905
  {
  	struct zonelist *zl;
  	struct page *page;
0e88460da   Mel Gorman   mm: introduce nod...
1906
  	zl = node_zonelist(nid, gfp);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1907
  	page = __alloc_pages(gfp, order, zl);
dd1a239f6   Mel Gorman   mm: have zonelist...
1908
  	if (page && page_zone(page) == zonelist_zone(&zl->_zonerefs[0]))
ca889e6c4   Christoph Lameter   [PATCH] Use Zoned...
1909
  		inc_zone_page_state(page, NUMA_INTERLEAVE_HIT);
  	return page;
  }
  
  /**
0bbbc0b33   Andrea Arcangeli   thp: add numa awa...
1914
   * 	alloc_pages_vma	- Allocate a page for a VMA.
   *
   * 	@gfp:
   *      %GFP_USER    user allocation.
   *      %GFP_KERNEL  kernel allocations,
   *      %GFP_HIGHMEM highmem/user allocations,
   *      %GFP_FS      allocation should not call back into a file system.
   *      %GFP_ATOMIC  don't sleep.
   *
0bbbc0b33   Andrea Arcangeli   thp: add numa awa...
1923
   *	@order: Order of the GFP allocation.
   * 	@vma:  Pointer to VMA or NULL if not available.
   *	@addr: Virtual Address of the allocation. Must be inside the VMA.
   *
   * 	This function allocates a page from the kernel page pool and applies
   *	a NUMA policy associated with the VMA or the current process.
   *	When VMA is not NULL, the caller must hold down_read on the mmap_sem of the
   *	mm_struct of the VMA to prevent it from going away. Should be used for
   *	all allocations for pages that will be mapped into
   * 	user space. Returns NULL when no page can be allocated.
   *
   *	Should be called with the mmap_sem of the vma held.
   */
  struct page *
0bbbc0b33   Andrea Arcangeli   thp: add numa awa...
1937
  alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
2f5f9486f   Andi Kleen   mm: change alloc_...
1938
  		unsigned long addr, int node)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1939
  {
cc9a6c877   Mel Gorman   cpuset: mm: reduc...
1940
  	struct mempolicy *pol;
c0ff7453b   Miao Xie   cpuset,mm: fix no...
1941
  	struct page *page;
  	unsigned int cpuset_mems_cookie;
  
  retry_cpuset:
  	pol = get_vma_policy(current, vma, addr);
d26914d11   Mel Gorman   mm: optimize put_...
1946
  	cpuset_mems_cookie = read_mems_allowed_begin();
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1947

45c4745af   Lee Schermerhorn   mempolicy: rename...
1948
  	if (unlikely(pol->mode == MPOL_INTERLEAVE)) {
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1949
  		unsigned nid;
5da7ca860   Christoph Lameter   [PATCH] Add NUMA ...
1950

8eac563c1   Andi Kleen   thp: fix interlea...
1951
  		nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order);
52cd3b074   Lee Schermerhorn   mempolicy: rework...
1952
  		mpol_cond_put(pol);
0bbbc0b33   Andrea Arcangeli   thp: add numa awa...
1953
  		page = alloc_page_interleave(gfp, order, nid);
d26914d11   Mel Gorman   mm: optimize put_...
1954
  		if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie)))
cc9a6c877   Mel Gorman   cpuset: mm: reduc...
1955
  			goto retry_cpuset;
c0ff7453b   Miao Xie   cpuset,mm: fix no...
1956
  		return page;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1957
  	}
212a0a6f2   David Rientjes   mm, mempolicy: re...
1958
1959
  	page = __alloc_pages_nodemask(gfp, order,
  				      policy_zonelist(gfp, pol, node),
0bbbc0b33   Andrea Arcangeli   thp: add numa awa...
1960
  				      policy_nodemask(gfp, pol));
212a0a6f2   David Rientjes   mm, mempolicy: re...
1961
1962
  	if (unlikely(mpol_needs_cond_ref(pol)))
  		__mpol_put(pol);
d26914d11   Mel Gorman   mm: optimize put_...
1963
  	if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie)))
cc9a6c877   Mel Gorman   cpuset: mm: reduc...
1964
  		goto retry_cpuset;
c0ff7453b   Miao Xie   cpuset,mm: fix no...
1965
  	return page;
  }
  
  /**
   * 	alloc_pages_current - Allocate pages.
   *
   *	@gfp:
   *		%GFP_USER   user allocation,
   *      	%GFP_KERNEL kernel allocation,
   *      	%GFP_HIGHMEM highmem allocation,
   *      	%GFP_FS     don't call back into a file system.
   *      	%GFP_ATOMIC don't sleep.
   *	@order: Power of two of allocation size in pages. 0 is a single page.
   *
   *	Allocate a page from the kernel page pool and, when not in
   *	interrupt context, apply the current process NUMA policy.
   *	Returns NULL when no page can be allocated.
   *
cf2a473c4   Paul Jackson   [PATCH] cpuset: c...
1983
   *	Don't call cpuset_update_task_memory_state() unless
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1984
1985
1986
   *	1) it's ok to take cpuset_sem (can WAIT), and
   *	2) allocating for current task (not interrupt).
   */
dd0fc66fb   Al Viro   [PATCH] gfp flags...
1987
  struct page *alloc_pages_current(gfp_t gfp, unsigned order)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1988
  {
5606e3877   Mel Gorman   mm: numa: Migrate...
1989
  	struct mempolicy *pol = get_task_policy(current);
c0ff7453b   Miao Xie   cpuset,mm: fix no...
1990
  	struct page *page;
cc9a6c877   Mel Gorman   cpuset: mm: reduc...
1991
  	unsigned int cpuset_mems_cookie;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1992

9b819d204   Christoph Lameter   [PATCH] Add __GFP...
1993
  	if (!pol || in_interrupt() || (gfp & __GFP_THISNODE))
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1994
  		pol = &default_policy;
52cd3b074   Lee Schermerhorn   mempolicy: rework...
1995

cc9a6c877   Mel Gorman   cpuset: mm: reduc...
1996
  retry_cpuset:
d26914d11   Mel Gorman   mm: optimize put_...
1997
  	cpuset_mems_cookie = read_mems_allowed_begin();
cc9a6c877   Mel Gorman   cpuset: mm: reduc...
1998

  	/*
  	 * No reference counting needed for current->mempolicy
  	 * nor system default_policy
  	 */
45c4745af   Lee Schermerhorn   mempolicy: rename...
2003
  	if (pol->mode == MPOL_INTERLEAVE)
c0ff7453b   Miao Xie   cpuset,mm: fix no...
2004
2005
2006
  		page = alloc_page_interleave(gfp, order, interleave_nodes(pol));
  	else
  		page = __alloc_pages_nodemask(gfp, order,
5c4b4be3b   Andi Kleen   mm: use correct n...
2007
2008
  				policy_zonelist(gfp, pol, numa_node_id()),
  				policy_nodemask(gfp, pol));
cc9a6c877   Mel Gorman   cpuset: mm: reduc...
2009

d26914d11   Mel Gorman   mm: optimize put_...
2010
  	if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie)))
cc9a6c877   Mel Gorman   cpuset: mm: reduc...
2011
  		goto retry_cpuset;
c0ff7453b   Miao Xie   cpuset,mm: fix no...
2012
  	return page;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2013
2014
  }
  EXPORT_SYMBOL(alloc_pages_current);
  int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst)
  {
  	struct mempolicy *pol = mpol_dup(vma_policy(src));
  
  	if (IS_ERR(pol))
  		return PTR_ERR(pol);
  	dst->vm_policy = pol;
  	return 0;
  }
4225399a6   Paul Jackson   [PATCH] cpuset: r...
2024
  /*
846a16bf0   Lee Schermerhorn   mempolicy: rename...
2025
   * If mpol_dup() sees current->cpuset == cpuset_being_rebound, then it
   * rebinds the mempolicy it is copying by calling mpol_rebind_policy()
   * with the mems_allowed returned by cpuset_mems_allowed().  This
   * keeps mempolicies cpuset relative after its cpuset moves.  See
   * further kernel/cpuset.c update_nodemask().
708c1bbc9   Miao Xie   mempolicy: restru...
2030
2031
2032
   *
   * current's mempolicy may be rebound by the other task (the task that changes
   * cpuset's mems), so we needn't do rebind work for the current task.
4225399a6   Paul Jackson   [PATCH] cpuset: r...
2033
   */
4225399a6   Paul Jackson   [PATCH] cpuset: r...
2034

846a16bf0   Lee Schermerhorn   mempolicy: rename...
2035
2036
  /* Slow path of a mempolicy duplicate */
  struct mempolicy *__mpol_dup(struct mempolicy *old)
  {
  	struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
  
  	if (!new)
  		return ERR_PTR(-ENOMEM);
  
  	/* task's mempolicy is protected by alloc_lock */
  	if (old == current->mempolicy) {
  		task_lock(current);
  		*new = *old;
  		task_unlock(current);
  	} else
  		*new = *old;
99ee4ca74   Paul E. McKenney   rcu: Suppress __m...
2050
  	rcu_read_lock();
4225399a6   Paul Jackson   [PATCH] cpuset: r...
2051
2052
  	if (current_cpuset_is_being_rebound()) {
  		nodemask_t mems = cpuset_mems_allowed(current);
  		if (new->flags & MPOL_F_REBINDING)
  			mpol_rebind_policy(new, &mems, MPOL_REBIND_STEP2);
  		else
  			mpol_rebind_policy(new, &mems, MPOL_REBIND_ONCE);
4225399a6   Paul Jackson   [PATCH] cpuset: r...
2057
  	}
99ee4ca74   Paul E. McKenney   rcu: Suppress __m...
2058
  	rcu_read_unlock();
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2059
  	atomic_set(&new->refcnt, 1);
  	return new;
  }
  
  /* Slow path of a mempolicy comparison */
fcfb4dcc9   KOSAKI Motohiro   mm/mempolicy.c: m...
2064
  bool __mpol_equal(struct mempolicy *a, struct mempolicy *b)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2065
2066
  {
  	if (!a || !b)
fcfb4dcc9   KOSAKI Motohiro   mm/mempolicy.c: m...
2067
  		return false;
45c4745af   Lee Schermerhorn   mempolicy: rename...
2068
  	if (a->mode != b->mode)
fcfb4dcc9   KOSAKI Motohiro   mm/mempolicy.c: m...
2069
  		return false;
198005025   Bob Liu   mempolicy: remove...
2070
  	if (a->flags != b->flags)
fcfb4dcc9   KOSAKI Motohiro   mm/mempolicy.c: m...
2071
  		return false;
198005025   Bob Liu   mempolicy: remove...
2072
2073
  	if (mpol_store_user_nodemask(a))
  		if (!nodes_equal(a->w.user_nodemask, b->w.user_nodemask))
fcfb4dcc9   KOSAKI Motohiro   mm/mempolicy.c: m...
2074
  			return false;
198005025   Bob Liu   mempolicy: remove...
2075

45c4745af   Lee Schermerhorn   mempolicy: rename...
2076
  	switch (a->mode) {
19770b326   Mel Gorman   mm: filter based ...
2077
2078
  	case MPOL_BIND:
  		/* Fall through */
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2079
  	case MPOL_INTERLEAVE:
fcfb4dcc9   KOSAKI Motohiro   mm/mempolicy.c: m...
2080
  		return !!nodes_equal(a->v.nodes, b->v.nodes);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2081
  	case MPOL_PREFERRED:
757196618   Namhyung Kim   mempolicy: remove...
2082
  		return a->v.preferred_node == b->v.preferred_node;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2083
2084
  	default:
  		BUG();
fcfb4dcc9   KOSAKI Motohiro   mm/mempolicy.c: m...
2085
  		return false;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2086
2087
  	}
  }
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2088
  /*
   * Shared memory backing store policy support.
   *
   * Remember policies even when nobody has shared memory mapped.
   * The policies are kept in Red-Black tree linked from the inode.
   * They are protected by the sp->lock spinlock, which should be held
   * for any accesses to the tree.
   */
  
  /* lookup first element intersecting start-end */
42288fe36   Mel Gorman   mm: mempolicy: Co...
2098
  /* Caller holds sp->lock */
  static struct sp_node *
  sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
  {
  	struct rb_node *n = sp->root.rb_node;
  
  	while (n) {
  		struct sp_node *p = rb_entry(n, struct sp_node, nd);
  
  		if (start >= p->end)
  			n = n->rb_right;
  		else if (end <= p->start)
  			n = n->rb_left;
  		else
  			break;
  	}
  	if (!n)
  		return NULL;
  	for (;;) {
  		struct sp_node *w = NULL;
  		struct rb_node *prev = rb_prev(n);
  		if (!prev)
  			break;
  		w = rb_entry(prev, struct sp_node, nd);
  		if (w->end <= start)
  			break;
  		n = prev;
  	}
  	return rb_entry(n, struct sp_node, nd);
  }
  
  /* Insert a new shared policy into the list. */
  /* Caller holds sp->lock */
  static void sp_insert(struct shared_policy *sp, struct sp_node *new)
  {
  	struct rb_node **p = &sp->root.rb_node;
  	struct rb_node *parent = NULL;
  	struct sp_node *nd;
  
  	while (*p) {
  		parent = *p;
  		nd = rb_entry(parent, struct sp_node, nd);
  		if (new->start < nd->start)
  			p = &(*p)->rb_left;
  		else if (new->end > nd->end)
  			p = &(*p)->rb_right;
  		else
  			BUG();
  	}
  	rb_link_node(&new->nd, parent, p);
  	rb_insert_color(&new->nd, &sp->root);
140d5a490   Paul Mundt   numa: mempolicy: ...
2149
2150
  	pr_debug("inserting %lx-%lx: %d
  ", new->start, new->end,
45c4745af   Lee Schermerhorn   mempolicy: rename...
2151
  		 new->policy ? new->policy->mode : 0);
  }
  
  /* Find shared policy intersecting idx */
  struct mempolicy *
  mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
  {
  	struct mempolicy *pol = NULL;
  	struct sp_node *sn;
  
  	if (!sp->root.rb_node)
  		return NULL;
42288fe36   Mel Gorman   mm: mempolicy: Co...
2163
  	spin_lock(&sp->lock);
  	sn = sp_lookup(sp, idx, idx+1);
  	if (sn) {
  		mpol_get(sn->policy);
  		pol = sn->policy;
  	}
42288fe36   Mel Gorman   mm: mempolicy: Co...
2169
  	spin_unlock(&sp->lock);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2170
2171
  	return pol;
  }
  static void sp_free(struct sp_node *n)
  {
  	mpol_put(n->policy);
  	kmem_cache_free(sn_cache, n);
  }
  /**
   * mpol_misplaced - check whether current page node is valid in policy
   *
   * @page   - page to be checked
   * @vma    - vm area where page mapped
   * @addr   - virtual address where page mapped
   *
 * Look up the current policy node id for vma,addr and compare it to the
 * page's node id.
   *
   * Returns:
   *	-1	- not misplaced, page is in the right node
   *	node	- node id where the page should be
   *
   * Policy determination "mimics" alloc_page_vma().
   * Called from fault path where we know the vma and faulting address.
   */
  int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long addr)
  {
  	struct mempolicy *pol;
  	struct zone *zone;
  	int curnid = page_to_nid(page);
  	unsigned long pgoff;
  	int thiscpu = raw_smp_processor_id();
  	int thisnid = cpu_to_node(thiscpu);
  	int polnid = -1;
  	int ret = -1;
  
  	BUG_ON(!vma);
  
  	pol = get_vma_policy(current, vma, addr);
  	if (!(pol->flags & MPOL_F_MOF))
  		goto out;
  
  	switch (pol->mode) {
  	case MPOL_INTERLEAVE:
  		BUG_ON(addr >= vma->vm_end);
  		BUG_ON(addr < vma->vm_start);
  
  		pgoff = vma->vm_pgoff;
  		pgoff += (addr - vma->vm_start) >> PAGE_SHIFT;
  		polnid = offset_il_node(pol, vma, pgoff);
  		break;
  
  	case MPOL_PREFERRED:
  		if (pol->flags & MPOL_F_LOCAL)
  			polnid = numa_node_id();
  		else
  			polnid = pol->v.preferred_node;
  		break;
  
  	case MPOL_BIND:
  		/*
		 * MPOL_BIND allows binding to multiple nodes.
		 * Use the current page's node if it is in the policy nodemask,
		 * else select the nearest allowed node, if any.
		 * If there are no allowed nodes, use the current node [!misplaced].
  		 */
  		if (node_isset(curnid, pol->v.nodes))
  			goto out;
  		(void)first_zones_zonelist(
  				node_zonelist(numa_node_id(), GFP_HIGHUSER),
  				gfp_zone(GFP_HIGHUSER),
  				&pol->v.nodes, &zone);
  		polnid = zone->node;
  		break;
  
  	default:
  		BUG();
  	}
  
  	/* Migrate the page towards the node whose CPU is referencing it */
  	if (pol->flags & MPOL_F_MORON) {
  		polnid = thisnid;

  		if (!should_numa_migrate_memory(current, page, curnid, thiscpu))
  			goto out;
  	}
  	if (curnid != polnid)
  		ret = polnid;
  out:
  	mpol_cond_put(pol);
  
  	return ret;
  }
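
/*
 * Sketch of the usual NUMA hinting fault usage (caller names approximate,
 * not a quote of the fault path itself):
 *
 *	target_nid = mpol_misplaced(page, vma, addr);
 *	if (target_nid != -1)
 *		migrate_misplaced_page(page, vma, target_nid);
 */
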
  static void sp_delete(struct shared_policy *sp, struct sp_node *n)
  {
  	pr_debug("deleting %lx-l%lx
  ", n->start, n->end);
  	rb_erase(&n->nd, &sp->root);
  	sp_free(n);
  }
  static void sp_node_init(struct sp_node *node, unsigned long start,
  			unsigned long end, struct mempolicy *pol)
  {
  	node->start = start;
  	node->end = end;
  	node->policy = pol;
  }
  static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
  				struct mempolicy *pol)
  {
  	struct sp_node *n;
  	struct mempolicy *newpol;

  	n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
  	if (!n)
  		return NULL;
  
  	newpol = mpol_dup(pol);
  	if (IS_ERR(newpol)) {
  		kmem_cache_free(sn_cache, n);
  		return NULL;
  	}
  	newpol->flags |= MPOL_F_SHARED;
  	sp_node_init(n, start, end, newpol);

  	return n;
  }
  
  /* Replace a policy range. */
  static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
  				 unsigned long end, struct sp_node *new)
  {
  	struct sp_node *n;
  	struct sp_node *n_new = NULL;
  	struct mempolicy *mpol_new = NULL;
  	int ret = 0;

  restart:
  	spin_lock(&sp->lock);
  	n = sp_lookup(sp, start, end);
  	/* Take care of old policies in the same range. */
  	while (n && n->start < end) {
  		struct rb_node *next = rb_next(&n->nd);
  		if (n->start >= start) {
  			if (n->end <= end)
  				sp_delete(sp, n);
  			else
  				n->start = end;
  		} else {
  			/* Old policy spanning whole new range. */
  			if (n->end > end) {
  				if (!n_new)
  					goto alloc_new;
  
  				*mpol_new = *n->policy;
  				atomic_set(&mpol_new->refcnt, 1);
  				sp_node_init(n_new, end, n->end, mpol_new);
  				n->end = start;
  				sp_insert(sp, n_new);
  				n_new = NULL;
  				mpol_new = NULL;
  				break;
  			} else
  				n->end = start;
  		}
  		if (!next)
  			break;
  		n = rb_entry(next, struct sp_node, nd);
  	}
  	if (new)
  		sp_insert(sp, new);
  	spin_unlock(&sp->lock);
  	ret = 0;
  
  err_out:
  	if (mpol_new)
  		mpol_put(mpol_new);
  	if (n_new)
  		kmem_cache_free(sn_cache, n_new);
  	return ret;
  
  alloc_new:
  	spin_unlock(&sp->lock);
  	ret = -ENOMEM;
  	n_new = kmem_cache_alloc(sn_cache, GFP_KERNEL);
  	if (!n_new)
  		goto err_out;
  	mpol_new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
  	if (!mpol_new)
  		goto err_out;
  	goto restart;
  }
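
/*
 * Example (illustrative): replacing [3,6) inside an existing [0,10)
 * policy leaves three ranges: [0,3) keeps the old policy, [3,6) gets
 * the new one, and [6,10) gets the duplicate allocated via alloc_new
 * (the n_new/mpol_new pair above).
 */
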
  /**
   * mpol_shared_policy_init - initialize shared policy for inode
   * @sp: pointer to inode shared policy
   * @mpol:  struct mempolicy to install
   *
   * Install non-NULL @mpol in inode's shared policy rb-tree.
   * On entry, the current task has a reference on a non-NULL @mpol.
   * This must be released on exit.
 * This is called during get_inode(), so GFP_KERNEL allocations can be used.
   */
  void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
  {
  	int ret;
  	sp->root = RB_ROOT;		/* empty tree == default mempolicy */
  	spin_lock_init(&sp->lock);
  
  	if (mpol) {
  		struct vm_area_struct pvma;
  		struct mempolicy *new;
  		NODEMASK_SCRATCH(scratch);

  		if (!scratch)
  			goto put_mpol;
  		/* contextualize the tmpfs mount point mempolicy */
  		new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask);
  		if (IS_ERR(new))
  			goto free_scratch; /* no valid nodemask intersection */
  
  		task_lock(current);
  		ret = mpol_set_nodemask(new, &mpol->w.user_nodemask, scratch);
  		task_unlock(current);
  		if (ret)
  			goto put_new;
  
  		/* Create pseudo-vma that contains just the policy */
  		memset(&pvma, 0, sizeof(struct vm_area_struct));
  		pvma.vm_end = TASK_SIZE;	/* policy covers entire file */
  		mpol_set_shared_policy(sp, &pvma, new); /* adds ref */

  put_new:
  		mpol_put(new);			/* drop initial ref */
  free_scratch:
  		NODEMASK_SCRATCH_FREE(scratch);
  put_mpol:
  		mpol_put(mpol);	/* drop our incoming ref on sb mpol */
  	}
  }
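
/*
 * Typical use (sketch): tmpfs hands over the superblock's "mpol=" mount
 * policy when setting up a new inode, roughly:
 *
 *	mpol_shared_policy_init(&info->policy, shmem_get_sbmpol(sbinfo));
 *
 * where shmem_get_sbmpol() is the tmpfs helper that returns a referenced
 * copy of the mount mempolicy (or NULL).
 */
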
  int mpol_set_shared_policy(struct shared_policy *info,
  			struct vm_area_struct *vma, struct mempolicy *npol)
  {
  	int err;
  	struct sp_node *new = NULL;
  	unsigned long sz = vma_pages(vma);
  	pr_debug("set_shared_policy %lx sz %lu %d %d %lx
  ",
  		 vma->vm_pgoff,
  		 sz, npol ? npol->mode : -1,
  		 npol ? npol->flags : -1,
  		 npol ? nodes_addr(npol->v.nodes)[0] : NUMA_NO_NODE);
  
  	if (npol) {
  		new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
  		if (!new)
  			return -ENOMEM;
  	}
  	err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
  	if (err && new)
  		sp_free(new);
  	return err;
  }
  
  /* Free a backing policy store on inode delete. */
  void mpol_free_shared_policy(struct shared_policy *p)
  {
  	struct sp_node *n;
  	struct rb_node *next;
  
  	if (!p->root.rb_node)
  		return;
  	spin_lock(&p->lock);
  	next = rb_first(&p->root);
  	while (next) {
  		n = rb_entry(next, struct sp_node, nd);
  		next = rb_next(&n->nd);
  		sp_delete(p, n);
  	}
  	spin_unlock(&p->lock);
  }
  #ifdef CONFIG_NUMA_BALANCING
  static int __initdata numabalancing_override;
  
  static void __init check_numabalancing_enable(void)
  {
  	bool numabalancing_default = false;
  
  	if (IS_ENABLED(CONFIG_NUMA_BALANCING_DEFAULT_ENABLED))
  		numabalancing_default = true;
  	/* Parsed by setup_numabalancing. override == 1 enables, -1 disables */
  	if (numabalancing_override)
  		set_numabalancing_state(numabalancing_override == 1);
  	if (nr_node_ids > 1 && !numabalancing_override) {
  		pr_info("%s automatic NUMA balancing. "
  			"Configure with numa_balancing= or the "
  			"kernel.numa_balancing sysctl",
  			numabalancing_default ? "Enabling" : "Disabling");
  		set_numabalancing_state(numabalancing_default);
  	}
  }
  
  static int __init setup_numabalancing(char *str)
  {
  	int ret = 0;
  	if (!str)
  		goto out;
  
  	if (!strcmp(str, "enable")) {
  		numabalancing_override = 1;
  		ret = 1;
  	} else if (!strcmp(str, "disable")) {
  		numabalancing_override = -1;
  		ret = 1;
  	}
  out:
  	if (!ret)
  		pr_warn("Unable to parse numa_balancing=
  ");
  
  	return ret;
  }
  __setup("numa_balancing=", setup_numabalancing);
  #else
  static inline void __init check_numabalancing_enable(void)
  {
  }
  #endif /* CONFIG_NUMA_BALANCING */
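
/*
 * Boot-time example: passing "numa_balancing=disable" on the kernel
 * command line sets numabalancing_override to -1, and
 * check_numabalancing_enable() then switches the feature off regardless
 * of the CONFIG_NUMA_BALANCING_DEFAULT_ENABLED default.
 */
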
  /* assumes fs == KERNEL_DS */
  void __init numa_policy_init(void)
  {
  	nodemask_t interleave_nodes;
  	unsigned long largest = 0;
  	int nid, prefer = 0;
  	policy_cache = kmem_cache_create("numa_policy",
  					 sizeof(struct mempolicy),
  					 0, SLAB_PANIC, NULL);
  
  	sn_cache = kmem_cache_create("shared_policy_node",
  				     sizeof(struct sp_node),
  				     0, SLAB_PANIC, NULL);

  	for_each_node(nid) {
  		preferred_node_policy[nid] = (struct mempolicy) {
  			.refcnt = ATOMIC_INIT(1),
  			.mode = MPOL_PREFERRED,
  			.flags = MPOL_F_MOF | MPOL_F_MORON,
  			.v = { .preferred_node = nid, },
  		};
  	}
  	/*
  	 * Set interleaving policy for system init. Interleaving is only
  	 * enabled across suitably sized nodes (default is >= 16MB), or
  	 * fall back to the largest node if they're all smaller.
  	 */
  	nodes_clear(interleave_nodes);
  	for_each_node_state(nid, N_MEMORY) {
  		unsigned long total_pages = node_present_pages(nid);
  
  		/* Preserve the largest node */
  		if (largest < total_pages) {
  			largest = total_pages;
  			prefer = nid;
  		}
  
  		/* Interleave this node? */
  		if ((total_pages << PAGE_SHIFT) >= (16 << 20))
  			node_set(nid, interleave_nodes);
  	}
  
  	/* All too small, use the largest */
  	if (unlikely(nodes_empty(interleave_nodes)))
  		node_set(prefer, interleave_nodes);

  	if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes))
  		printk("numa_policy_init: interleaving failed
  ");
  
  	check_numabalancing_enable();
  }
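
/*
 * Note on the 16MB threshold above: with 4KB pages (the common case)
 * that is 4096 present pages per node; e.g. an 8MB node is left out of
 * the boot-time interleave set and only survives as the "prefer"
 * fallback if every node is that small.
 */
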
  /* Reset policy of current process to default */
  void numa_default_policy(void)
  {
  	do_set_mempolicy(MPOL_DEFAULT, 0, NULL);
  }

  /*
   * Parse and format mempolicy from/to strings
   */
  
  /*
   * "local" is implemented internally by MPOL_PREFERRED with MPOL_F_LOCAL flag.
   */
  static const char * const policy_modes[] =
  {
  	[MPOL_DEFAULT]    = "default",
  	[MPOL_PREFERRED]  = "prefer",
  	[MPOL_BIND]       = "bind",
  	[MPOL_INTERLEAVE] = "interleave",
  	[MPOL_LOCAL]      = "local",
  };

  
  #ifdef CONFIG_TMPFS
  /**
   * mpol_parse_str - parse string to mempolicy, for tmpfs mpol mount option.
   * @str:  string containing mempolicy to parse
   * @mpol:  pointer to struct mempolicy pointer, returned on success.
   *
   * Format of input:
   *	<mode>[=<flags>][:<nodelist>]
   *
   * On success, returns 0, else 1
   */
  int mpol_parse_str(char *str, struct mempolicy **mpol)
  {
  	struct mempolicy *new = NULL;
  	unsigned short mode;
  	unsigned short mode_flags;
  	nodemask_t nodes;
  	char *nodelist = strchr(str, ':');
  	char *flags = strchr(str, '=');
  	int err = 1;
  
  	if (nodelist) {
  		/* NUL-terminate mode or flags string */
  		*nodelist++ = '\0';
  		if (nodelist_parse(nodelist, nodes))
  			goto out;
  		if (!nodes_subset(nodes, node_states[N_MEMORY]))
  			goto out;
  	} else
  		nodes_clear(nodes);
  	if (flags)
  		*flags++ = '\0';	/* terminate mode string */
  	for (mode = 0; mode < MPOL_MAX; mode++) {
  		if (!strcmp(str, policy_modes[mode])) {
  			break;
  		}
  	}
  	if (mode >= MPOL_MAX)
  		goto out;
  	switch (mode) {
  	case MPOL_PREFERRED:
  		/*
  		 * Insist on a nodelist of one node only
  		 */
  		if (nodelist) {
  			char *rest = nodelist;
  			while (isdigit(*rest))
  				rest++;
  			if (*rest)
  				goto out;
  		}
  		break;
  	case MPOL_INTERLEAVE:
  		/*
  		 * Default to online nodes with memory if no nodelist
  		 */
  		if (!nodelist)
  			nodes = node_states[N_MEMORY];
  		break;
  	case MPOL_LOCAL:
  		/*
  		 * Don't allow a nodelist;  mpol_new() checks flags
  		 */
  		if (nodelist)
  			goto out;
  		mode = MPOL_PREFERRED;
  		break;
  	case MPOL_DEFAULT:
  		/*
		 * Insist on an empty nodelist
  		 */
  		if (!nodelist)
  			err = 0;
  		goto out;
  	case MPOL_BIND:
  		/*
  		 * Insist on a nodelist
  		 */
  		if (!nodelist)
  			goto out;
  	}
  	mode_flags = 0;
  	if (flags) {
  		/*
  		 * Currently, we only support two mutually exclusive
  		 * mode flags.
  		 */
  		if (!strcmp(flags, "static"))
  			mode_flags |= MPOL_F_STATIC_NODES;
  		else if (!strcmp(flags, "relative"))
  			mode_flags |= MPOL_F_RELATIVE_NODES;
  		else
  			goto out;
  	}
  
  	new = mpol_new(mode, mode_flags, &nodes);
  	if (IS_ERR(new))
  		goto out;
  	/*
  	 * Save nodes for mpol_to_str() to show the tmpfs mount options
  	 * for /proc/mounts, /proc/pid/mounts and /proc/pid/mountinfo.
  	 */
  	if (mode != MPOL_PREFERRED)
  		new->v.nodes = nodes;
  	else if (nodelist)
  		new->v.preferred_node = first_node(nodes);
  	else
  		new->flags |= MPOL_F_LOCAL;
  
  	/*
  	 * Save nodes for contextualization: this will be used to "clone"
  	 * the mempolicy in a specific context [cpuset] at a later time.
  	 */
  	new->w.user_nodemask = nodes;
  	err = 0;

  out:
  	/* Restore string for error message */
  	if (nodelist)
  		*--nodelist = ':';
  	if (flags)
  		*--flags = '=';
  	if (!err)
  		*mpol = new;
  	return err;
  }
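
/*
 * Example strings accepted above (tmpfs "mpol=" mount option):
 *	"interleave:0-3"	MPOL_INTERLEAVE across nodes 0-3
 *	"prefer=static:1"	MPOL_PREFERRED + MPOL_F_STATIC_NODES, node 1
 *	"bind=relative:0,2"	MPOL_BIND + MPOL_F_RELATIVE_NODES, nodes 0,2
 *	"local"			MPOL_PREFERRED with MPOL_F_LOCAL
 *	"default"		MPOL_DEFAULT (no nodelist allowed)
 */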
  #endif /* CONFIG_TMPFS */
  /**
   * mpol_to_str - format a mempolicy structure for printing
   * @buffer:  to contain formatted mempolicy string
   * @maxlen:  length of @buffer
   * @pol:  pointer to mempolicy to be formatted
   *
   * Convert @pol into a string.  If @buffer is too short, truncate the string.
   * Recommend a @maxlen of at least 32 for the longest mode, "interleave", the
   * longest flag, "relative", and to display at least a few node ids.
   */
  void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
  {
  	char *p = buffer;
  	nodemask_t nodes = NODE_MASK_NONE;
  	unsigned short mode = MPOL_DEFAULT;
  	unsigned short flags = 0;

  	if (pol && pol != &default_policy && !(pol->flags & MPOL_F_MORON)) {
  		mode = pol->mode;
  		flags = pol->flags;
  	}

  	switch (mode) {
  	case MPOL_DEFAULT:
  		break;
  	case MPOL_PREFERRED:
  		if (flags & MPOL_F_LOCAL)
  			mode = MPOL_LOCAL;
  		else
  			node_set(pol->v.preferred_node, nodes);
  		break;
  	case MPOL_BIND:
  	case MPOL_INTERLEAVE:
  		nodes = pol->v.nodes;
  		break;
  	default:
  		WARN_ON_ONCE(1);
  		snprintf(p, maxlen, "unknown");
  		return;
  	}
  	p += snprintf(p, maxlen, "%s", policy_modes[mode]);

  	if (flags & MPOL_MODE_FLAGS) {
  		p += snprintf(p, buffer + maxlen - p, "=");

  		/*
  		 * Currently, the only defined flags are mutually exclusive
  		 */
  		if (flags & MPOL_F_STATIC_NODES)
  			p += snprintf(p, buffer + maxlen - p, "static");
  		else if (flags & MPOL_F_RELATIVE_NODES)
  			p += snprintf(p, buffer + maxlen - p, "relative");
  	}
  	if (!nodes_empty(nodes)) {
  		p += snprintf(p, buffer + maxlen - p, ":");
		p += nodelist_scnprintf(p, buffer + maxlen - p, nodes);
  	}
  }
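
/*
 * Example output: an interleave policy over nodes 0-3 with the relative
 * flag formats as "interleave=relative:0-3"; a preferred policy with
 * MPOL_F_LOCAL formats as just "local".
 */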