mm/mempolicy.c

  /*
   * Simple NUMA memory policy for the Linux kernel.
   *
   * Copyright 2003,2004 Andi Kleen, SuSE Labs.
   * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
   * Subject to the GNU Public License, version 2.
   *
   * NUMA policy allows the user to give hints in which node(s) memory should
   * be allocated.
   *
   * Support four policies per VMA and per process:
   *
   * The VMA policy has priority over the process policy for a page fault.
   *
   * interleave     Allocate memory interleaved over a set of nodes,
   *                with normal fallback if it fails.
   *                For VMA based allocations this interleaves based on the
   *                offset into the backing object or offset into the mapping
   *                for anonymous memory. For process policy a process counter
   *                is used.
   *
   * bind           Only allocate memory on a specific set of nodes,
   *                no fallback.
   *                FIXME: memory is allocated starting with the first node
   *                to the last. It would be better if bind would truly restrict
   *                the allocation to memory nodes instead
   *
   * preferred       Try a specific node first before normal fallback.
   *                As a special case node -1 here means do the allocation
   *                on the local CPU. This is normally identical to default,
   *                but useful to set in a VMA when you have a non default
   *                process policy.
   *
   * default        Allocate on the local node first, or when on a VMA
   *                use the process policy. This is what Linux always did
   *		  in a NUMA aware kernel and still does by, ahem, default.
   *
   * The process policy is applied for most non interrupt memory allocations
   * in that process' context. Interrupts ignore the policies and always
   * try to allocate on the local CPU. The VMA policy is only applied for memory
   * allocations for a VMA in the VM.
   *
   * Currently there are a few corner cases in swapping where the policy
   * is not applied, but the majority should be handled. When process policy
   * is used it is not remembered over swap outs/swap ins.
   *
   * Only the highest zone in the zone hierarchy gets policied. Allocations
   * requesting a lower zone just use default policy. This implies that
   * on systems with highmem kernel lowmem allocations don't get policied.
   * Same with GFP_DMA allocations.
   *
   * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
   * all users and remembered even when nobody has memory mapped.
   */
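
  /*
   * For orientation, a rough user-space sketch of how these policies are
   * selected (illustrative only: it assumes the set_mempolicy(2)/mbind(2)
   * wrappers from <numaif.h> plus <sys/mman.h>, and the node numbers are
   * made up):
   *
   *	unsigned long both = (1UL << 0) | (1UL << 1);
   *	unsigned long one  = 1UL << 1;
   *	size_t len = 64 << 10;
   *
   *	// process policy: interleave across nodes 0 and 1
   *	set_mempolicy(MPOL_INTERLEAVE, &both, 8 * sizeof(both) + 1);
   *
   *	// VMA policy: bind one anonymous mapping to node 1 only
   *	void *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
   *			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
   *	mbind(buf, len, MPOL_BIND, &one, 8 * sizeof(one) + 1, 0);
   *
   *	// back to the default, local-node policy
   *	set_mempolicy(MPOL_DEFAULT, NULL, 0);
   *
   * (maxnode counts one bit more than the highest node used; see
   * get_nodes() further down.)
   */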
  
  /* Notebook:
     fix mmap readahead to honour policy and enable policy for any page cache
     object
     statistics for bigpages
     global policy for page cache? currently it uses process policy. Requires
     first item above.
     handle mremap for shared memory (currently ignored for the policy)
     grows down?
     make bind policy root only? It can trigger oom much faster and the
     kernel is not always grateful with that.
  */
  
  #include <linux/mempolicy.h>
  #include <linux/mm.h>
  #include <linux/highmem.h>
  #include <linux/hugetlb.h>
  #include <linux/kernel.h>
  #include <linux/sched.h>
  #include <linux/nodemask.h>
  #include <linux/cpuset.h>
  #include <linux/gfp.h>
  #include <linux/slab.h>
  #include <linux/string.h>
  #include <linux/module.h>
  #include <linux/nsproxy.h>
  #include <linux/interrupt.h>
  #include <linux/init.h>
  #include <linux/compat.h>
  #include <linux/swap.h>
  #include <linux/seq_file.h>
  #include <linux/proc_fs.h>
  #include <linux/migrate.h>
  #include <linux/rmap.h>
  #include <linux/security.h>
  #include <linux/syscalls.h>
  #include <linux/ctype.h>

  #include <asm/tlbflush.h>
  #include <asm/uaccess.h>
  #include "internal.h"
  /* Internal flags */
  #define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0)	/* Skip checks for continuous vmas */
  #define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1)		/* Invert check for nodemask */
  #define MPOL_MF_STATS (MPOL_MF_INTERNAL << 2)		/* Gather statistics */

  static struct kmem_cache *policy_cache;
  static struct kmem_cache *sn_cache;

  /* Highest zone. A specific allocation for a zone below that is not
     policied. */
  enum zone_type policy_zone = 0;

  /*
   * run-time system-wide default policy => local allocation
   */
  struct mempolicy default_policy = {
  	.refcnt = ATOMIC_INIT(1), /* never free it */
  	.mode = MPOL_PREFERRED,
  	.flags = MPOL_F_LOCAL,
  };
  static const struct mempolicy_operations {
  	int (*create)(struct mempolicy *pol, const nodemask_t *nodes);
  	void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes);
  } mpol_ops[MPOL_MAX];
  /* Check that the nodemask contains at least one populated zone */
  static int is_valid_nodemask(const nodemask_t *nodemask)
  {
  	int nd, k;

  	/* Check that there is something useful in this mask */
  	k = policy_zone;
  
  	for_each_node_mask(nd, *nodemask) {
  		struct zone *z;
  
  		for (k = 0; k <= policy_zone; k++) {
  			z = &NODE_DATA(nd)->node_zones[k];
  			if (z->present_pages > 0)
  				return 1;
  		}
  	}
  
  	return 0;
  }
  static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
  {
  	return pol->flags & (MPOL_F_STATIC_NODES | MPOL_F_RELATIVE_NODES);
  }
  
  static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig,
  				   const nodemask_t *rel)
  {
  	nodemask_t tmp;
  	nodes_fold(tmp, *orig, nodes_weight(*rel));
  	nodes_onto(*ret, tmp, *rel);
  }
  static int mpol_new_interleave(struct mempolicy *pol, const nodemask_t *nodes)
  {
  	if (nodes_empty(*nodes))
  		return -EINVAL;
  	pol->v.nodes = *nodes;
  	return 0;
  }
  
  static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes)
  {
  	if (!nodes)
  		pol->flags |= MPOL_F_LOCAL;	/* local allocation */
  	else if (nodes_empty(*nodes))
  		return -EINVAL;			/*  no allowed nodes */
  	else
  		pol->v.preferred_node = first_node(*nodes);
  	return 0;
  }
  
  static int mpol_new_bind(struct mempolicy *pol, const nodemask_t *nodes)
  {
  	if (!is_valid_nodemask(nodes))
  		return -EINVAL;
  	pol->v.nodes = *nodes;
  	return 0;
  }
  /* Create a new policy */
  static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
  				  nodemask_t *nodes)
  {
  	struct mempolicy *policy;
  	nodemask_t cpuset_context_nmask;
  	int ret;

  	pr_debug("setting mode %d flags %d nodes[0] %lx\n",
  		 mode, flags, nodes ? nodes_addr(*nodes)[0] : -1);

  	if (mode == MPOL_DEFAULT) {
  		if (nodes && !nodes_empty(*nodes))
  			return ERR_PTR(-EINVAL);
  		return NULL;	/* simply delete any existing policy */
  	}
  	VM_BUG_ON(!nodes);
  
  	/*
  	 * MPOL_PREFERRED cannot be used with MPOL_F_STATIC_NODES or
  	 * MPOL_F_RELATIVE_NODES if the nodemask is empty (local allocation).
  	 * All other modes require a valid pointer to a non-empty nodemask.
  	 */
  	if (mode == MPOL_PREFERRED) {
  		if (nodes_empty(*nodes)) {
  			if (((flags & MPOL_F_STATIC_NODES) ||
  			     (flags & MPOL_F_RELATIVE_NODES)))
  				return ERR_PTR(-EINVAL);
  			nodes = NULL;	/* flag local alloc */
  		}
  	} else if (nodes_empty(*nodes))
  		return ERR_PTR(-EINVAL);
  	policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
  	if (!policy)
  		return ERR_PTR(-ENOMEM);
  	atomic_set(&policy->refcnt, 1);
  	policy->mode = mode;
  	policy->flags = flags;

  	if (nodes) {
  		/*
  		 * cpuset related setup doesn't apply to local allocation
  		 */
  		cpuset_update_task_memory_state();
  		if (flags & MPOL_F_RELATIVE_NODES)
  			mpol_relative_nodemask(&cpuset_context_nmask, nodes,
  					       &cpuset_current_mems_allowed);
  		else
  			nodes_and(cpuset_context_nmask, *nodes,
  				  cpuset_current_mems_allowed);
  		if (mpol_store_user_nodemask(policy))
  			policy->w.user_nodemask = *nodes;
  		else
  			policy->w.cpuset_mems_allowed =
  						cpuset_mems_allowed(current);
  	}
  
  	ret = mpol_ops[mode].create(policy,
  				nodes ? &cpuset_context_nmask : NULL);
  	if (ret < 0) {
  		kmem_cache_free(policy_cache, policy);
  		return ERR_PTR(ret);
  	}
  	return policy;
  }
  /* Slow path of a mpol destructor. */
  void __mpol_put(struct mempolicy *p)
  {
  	if (!atomic_dec_and_test(&p->refcnt))
  		return;
  	kmem_cache_free(policy_cache, p);
  }
  static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes)
  {
  }
  
  static void mpol_rebind_nodemask(struct mempolicy *pol,
  				 const nodemask_t *nodes)
  {
  	nodemask_t tmp;
  
  	if (pol->flags & MPOL_F_STATIC_NODES)
  		nodes_and(tmp, pol->w.user_nodemask, *nodes);
  	else if (pol->flags & MPOL_F_RELATIVE_NODES)
  		mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
  	else {
  		nodes_remap(tmp, pol->v.nodes, pol->w.cpuset_mems_allowed,
  			    *nodes);
  		pol->w.cpuset_mems_allowed = *nodes;
  	}

  	pol->v.nodes = tmp;
  	if (!node_isset(current->il_next, tmp)) {
  		current->il_next = next_node(current->il_next, tmp);
  		if (current->il_next >= MAX_NUMNODES)
  			current->il_next = first_node(tmp);
  		if (current->il_next >= MAX_NUMNODES)
  			current->il_next = numa_node_id();
  	}
  }
  
  static void mpol_rebind_preferred(struct mempolicy *pol,
  				  const nodemask_t *nodes)
  {
  	nodemask_t tmp;
  	if (pol->flags & MPOL_F_STATIC_NODES) {
  		int node = first_node(pol->w.user_nodemask);
  		if (node_isset(node, *nodes)) {
  			pol->v.preferred_node = node;
  			pol->flags &= ~MPOL_F_LOCAL;
  		} else
  			pol->flags |= MPOL_F_LOCAL;
  	} else if (pol->flags & MPOL_F_RELATIVE_NODES) {
  		mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
  		pol->v.preferred_node = first_node(tmp);
  	} else if (!(pol->flags & MPOL_F_LOCAL)) {
  		pol->v.preferred_node = node_remap(pol->v.preferred_node,
  						   pol->w.cpuset_mems_allowed,
  						   *nodes);
  		pol->w.cpuset_mems_allowed = *nodes;
  	}
  }
  /* Migrate a policy to a different set of nodes */
  static void mpol_rebind_policy(struct mempolicy *pol,
  			       const nodemask_t *newmask)
  {
  	if (!pol)
  		return;
  	if (!mpol_store_user_nodemask(pol) &&
  	    nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
  		return;
  	mpol_ops[pol->mode].rebind(pol, newmask);
  }
  
  /*
   * Wrapper for mpol_rebind_policy() that just requires task
   * pointer, and updates task mempolicy.
   */
  
  void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new)
  {
  	mpol_rebind_policy(tsk->mempolicy, new);
  }
  
  /*
   * Rebind each vma in mm to new nodemask.
   *
   * Call holding a reference to mm.  Takes mm->mmap_sem during call.
   */
  
  void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
  {
  	struct vm_area_struct *vma;
  
  	down_write(&mm->mmap_sem);
  	for (vma = mm->mmap; vma; vma = vma->vm_next)
  		mpol_rebind_policy(vma->vm_policy, new);
  	up_write(&mm->mmap_sem);
  }
  static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
  	[MPOL_DEFAULT] = {
  		.rebind = mpol_rebind_default,
  	},
  	[MPOL_INTERLEAVE] = {
  		.create = mpol_new_interleave,
  		.rebind = mpol_rebind_nodemask,
  	},
  	[MPOL_PREFERRED] = {
  		.create = mpol_new_preferred,
  		.rebind = mpol_rebind_preferred,
  	},
  	[MPOL_BIND] = {
  		.create = mpol_new_bind,
  		.rebind = mpol_rebind_nodemask,
  	},
  };
  static void gather_stats(struct page *, void *, int pte_dirty);
  static void migrate_page_add(struct page *page, struct list_head *pagelist,
  				unsigned long flags);

  /* Scan through pages checking if pages follow certain conditions. */
  static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
  		unsigned long addr, unsigned long end,
  		const nodemask_t *nodes, unsigned long flags,
  		void *private)
  {
  	pte_t *orig_pte;
  	pte_t *pte;
  	spinlock_t *ptl;

  	orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
  	do {
  		struct page *page;
  		int nid;
  
  		if (!pte_present(*pte))
  			continue;
  		page = vm_normal_page(vma, addr, *pte);
  		if (!page)
  			continue;
  		/*
  		 * The check for PageReserved here is important to avoid
  		 * handling zero pages and other pages that may have been
  		 * marked special by the system.
  		 *
  		 * If the PageReserved would not be checked here then f.e.
  		 * the location of the zero page could have an influence
  		 * on MPOL_MF_STRICT, zero pages would be counted for
  		 * the per node stats, and there would be useless attempts
  		 * to put zero pages on the migration list.
  		 */
  		if (PageReserved(page))
  			continue;
  		nid = page_to_nid(page);
  		if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT))
  			continue;
  		if (flags & MPOL_MF_STATS)
  			gather_stats(page, private, pte_dirty(*pte));
  		else if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
  			migrate_page_add(page, private, flags);
  		else
  			break;
  	} while (pte++, addr += PAGE_SIZE, addr != end);
  	pte_unmap_unlock(orig_pte, ptl);
  	return addr != end;
  }
  static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
  		unsigned long addr, unsigned long end,
  		const nodemask_t *nodes, unsigned long flags,
  		void *private)
  {
  	pmd_t *pmd;
  	unsigned long next;
  
  	pmd = pmd_offset(pud, addr);
  	do {
  		next = pmd_addr_end(addr, end);
  		if (pmd_none_or_clear_bad(pmd))
  			continue;
  		if (check_pte_range(vma, pmd, addr, next, nodes,
  				    flags, private))
  			return -EIO;
  	} while (pmd++, addr = next, addr != end);
  	return 0;
  }
  static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
  		unsigned long addr, unsigned long end,
  		const nodemask_t *nodes, unsigned long flags,
  		void *private)
  {
  	pud_t *pud;
  	unsigned long next;
  
  	pud = pud_offset(pgd, addr);
  	do {
  		next = pud_addr_end(addr, end);
  		if (pud_none_or_clear_bad(pud))
  			continue;
  		if (check_pmd_range(vma, pud, addr, next, nodes,
  				    flags, private))
  			return -EIO;
  	} while (pud++, addr = next, addr != end);
  	return 0;
  }
  static inline int check_pgd_range(struct vm_area_struct *vma,
  		unsigned long addr, unsigned long end,
  		const nodemask_t *nodes, unsigned long flags,
  		void *private)
  {
  	pgd_t *pgd;
  	unsigned long next;
  	pgd = pgd_offset(vma->vm_mm, addr);
  	do {
  		next = pgd_addr_end(addr, end);
  		if (pgd_none_or_clear_bad(pgd))
  			continue;
  		if (check_pud_range(vma, pgd, addr, next, nodes,
  				    flags, private))
  			return -EIO;
  	} while (pgd++, addr = next, addr != end);
  	return 0;
  }
  /*
   * Check if all pages in a range are on a set of nodes.
   * If pagelist != NULL then isolate pages from the LRU and
   * put them on the pagelist.
   */
  static struct vm_area_struct *
  check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
  		const nodemask_t *nodes, unsigned long flags, void *private)
  {
  	int err;
  	struct vm_area_struct *first, *vma, *prev;

  	first = find_vma(mm, start);
  	if (!first)
  		return ERR_PTR(-EFAULT);
  	prev = NULL;
  	for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
  		if (!(flags & MPOL_MF_DISCONTIG_OK)) {
  			if (!vma->vm_next && vma->vm_end < end)
  				return ERR_PTR(-EFAULT);
  			if (prev && prev->vm_end < vma->vm_start)
  				return ERR_PTR(-EFAULT);
  		}
  		if (!is_vm_hugetlb_page(vma) &&
  		    ((flags & MPOL_MF_STRICT) ||
  		     ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) &&
  				vma_migratable(vma)))) {
  			unsigned long endvma = vma->vm_end;

  			if (endvma > end)
  				endvma = end;
  			if (vma->vm_start > start)
  				start = vma->vm_start;
  			err = check_pgd_range(vma, start, endvma, nodes,
  						flags, private);
  			if (err) {
  				first = ERR_PTR(err);
  				break;
  			}
  		}
  		prev = vma;
  	}
  	return first;
  }
  
  /* Apply policy to a single VMA */
  static int policy_vma(struct vm_area_struct *vma, struct mempolicy *new)
  {
  	int err = 0;
  	struct mempolicy *old = vma->vm_policy;
  	pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
  		 vma->vm_start, vma->vm_end, vma->vm_pgoff,
  		 vma->vm_ops, vma->vm_file,
  		 vma->vm_ops ? vma->vm_ops->set_policy : NULL);
  
  	if (vma->vm_ops && vma->vm_ops->set_policy)
  		err = vma->vm_ops->set_policy(vma, new);
  	if (!err) {
  		mpol_get(new);
  		vma->vm_policy = new;
  		mpol_put(old);
  	}
  	return err;
  }
  
  /* Step 2: apply policy to a range and do splits. */
  static int mbind_range(struct vm_area_struct *vma, unsigned long start,
  		       unsigned long end, struct mempolicy *new)
  {
  	struct vm_area_struct *next;
  	int err;
  
  	err = 0;
  	for (; vma && vma->vm_start < end; vma = next) {
  		next = vma->vm_next;
  		if (vma->vm_start < start)
  			err = split_vma(vma->vm_mm, vma, start, 1);
  		if (!err && vma->vm_end > end)
  			err = split_vma(vma->vm_mm, vma, end, 0);
  		if (!err)
  			err = policy_vma(vma, new);
  		if (err)
  			break;
  	}
  	return err;
  }
  /*
   * Update task->flags PF_MEMPOLICY bit: set iff non-default
   * mempolicy.  Allows more rapid checking of this (combined perhaps
   * with other PF_* flag bits) on memory allocation hot code paths.
   *
   * If called from outside this file, the task 'p' should -only- be
   * a newly forked child not yet visible on the task list, because
   * manipulating the task flags of a visible task is not safe.
   *
   * The above limitation is why this routine has the funny name
   * mpol_fix_fork_child_flag().
   *
   * It is also safe to call this with a task pointer of current,
   * which the static wrapper mpol_set_task_struct_flag() does,
   * for use within this file.
   */
  
  void mpol_fix_fork_child_flag(struct task_struct *p)
  {
  	if (p->mempolicy)
  		p->flags |= PF_MEMPOLICY;
  	else
  		p->flags &= ~PF_MEMPOLICY;
  }
  
  static void mpol_set_task_struct_flag(void)
  {
  	mpol_fix_fork_child_flag(current);
  }
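
  /*
   * Illustrative call site (roughly what the fork path does; the exact
   * sequence in kernel/fork.c's copy_process() is assumed here):
   *
   *	p->mempolicy = mpol_dup(p->mempolicy);
   *	mpol_fix_fork_child_flag(p);
   *
   * i.e. the child's PF_MEMPOLICY bit is recomputed right after its policy
   * is copied, while the child is not yet visible on the task list.
   */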
  /* Set the process memory policy */
  static long do_set_mempolicy(unsigned short mode, unsigned short flags,
  			     nodemask_t *nodes)
  {
  	struct mempolicy *new;
  	struct mm_struct *mm = current->mm;

  	new = mpol_new(mode, flags, nodes);
  	if (IS_ERR(new))
  		return PTR_ERR(new);
  
  	/*
  	 * prevent changing our mempolicy while show_numa_maps()
  	 * is using it.
  	 * Note:  do_set_mempolicy() can be called at init time
  	 * with no 'mm'.
  	 */
  	if (mm)
  		down_write(&mm->mmap_sem);
  	mpol_put(current->mempolicy);
  	current->mempolicy = new;
  	mpol_set_task_struct_flag();
  	if (new && new->mode == MPOL_INTERLEAVE &&
  	    nodes_weight(new->v.nodes))
  		current->il_next = first_node(new->v.nodes);
  	if (mm)
  		up_write(&mm->mmap_sem);
  	return 0;
  }
  /*
   * Return nodemask for policy for get_mempolicy() query
   */
  static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes)
  {
  	nodes_clear(*nodes);
  	if (p == &default_policy)
  		return;
  	switch (p->mode) {
  	case MPOL_BIND:
  		/* Fall through */
  	case MPOL_INTERLEAVE:
  		*nodes = p->v.nodes;
  		break;
  	case MPOL_PREFERRED:
  		if (!(p->flags & MPOL_F_LOCAL))
  			node_set(p->v.preferred_node, *nodes);
  		/* else return empty node mask for local allocation */
  		break;
  	default:
  		BUG();
  	}
  }
  
  static int lookup_node(struct mm_struct *mm, unsigned long addr)
  {
  	struct page *p;
  	int err;
  
  	err = get_user_pages(current, mm, addr & PAGE_MASK, 1, 0, 0, &p, NULL);
  	if (err >= 0) {
  		err = page_to_nid(p);
  		put_page(p);
  	}
  	return err;
  }
  /* Retrieve NUMA policy */
  static long do_get_mempolicy(int *policy, nodemask_t *nmask,
  			     unsigned long addr, unsigned long flags)
  {
  	int err;
  	struct mm_struct *mm = current->mm;
  	struct vm_area_struct *vma = NULL;
  	struct mempolicy *pol = current->mempolicy;
  	cpuset_update_task_memory_state();
  	if (flags &
  		~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
  		return -EINVAL;
  
  	if (flags & MPOL_F_MEMS_ALLOWED) {
  		if (flags & (MPOL_F_NODE|MPOL_F_ADDR))
  			return -EINVAL;
  		*policy = 0;	/* just so it's initialized */
  		*nmask  = cpuset_current_mems_allowed;
  		return 0;
  	}
  	if (flags & MPOL_F_ADDR) {
  		/*
  		 * Do NOT fall back to task policy if the
  		 * vma/shared policy at addr is NULL.  We
  		 * want to return MPOL_DEFAULT in this case.
  		 */
  		down_read(&mm->mmap_sem);
  		vma = find_vma_intersection(mm, addr, addr+1);
  		if (!vma) {
  			up_read(&mm->mmap_sem);
  			return -EFAULT;
  		}
  		if (vma->vm_ops && vma->vm_ops->get_policy)
  			pol = vma->vm_ops->get_policy(vma, addr);
  		else
  			pol = vma->vm_policy;
  	} else if (addr)
  		return -EINVAL;
  
  	if (!pol)
  		pol = &default_policy;	/* indicates default behavior */
  
  	if (flags & MPOL_F_NODE) {
  		if (flags & MPOL_F_ADDR) {
  			err = lookup_node(mm, addr);
  			if (err < 0)
  				goto out;
  			*policy = err;
  		} else if (pol == current->mempolicy &&
  				pol->mode == MPOL_INTERLEAVE) {
  			*policy = current->il_next;
  		} else {
  			err = -EINVAL;
  			goto out;
  		}
  	} else {
  		*policy = pol == &default_policy ? MPOL_DEFAULT :
  						pol->mode;
  		/*
  		 * Internal mempolicy flags must be masked off before exposing
  		 * the policy to userspace.
  		 */
  		*policy |= (pol->flags & MPOL_MODE_FLAGS);
  	}
  
  	if (vma) {
  		up_read(&current->mm->mmap_sem);
  		vma = NULL;
  	}
  	err = 0;
  	if (nmask)
  		get_policy_nodemask(pol, nmask);
  
   out:
  	mpol_cond_put(pol);
  	if (vma)
  		up_read(&current->mm->mmap_sem);
  	return err;
  }
  #ifdef CONFIG_MIGRATION
  /*
   * page migration
   */
  static void migrate_page_add(struct page *page, struct list_head *pagelist,
  				unsigned long flags)
  {
  	/*
  	 * Avoid migrating a page that is shared with others.
  	 */
  	if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1) {
  		if (!isolate_lru_page(page)) {
  			list_add_tail(&page->lru, pagelist);
  		}
  	}
  }

  static struct page *new_node_page(struct page *page, unsigned long node, int **x)
  {
  	return alloc_pages_node(node, GFP_HIGHUSER_MOVABLE, 0);
  }
  /*
   * Migrate pages from one node to a target node.
   * Returns error or the number of pages not migrated.
   */
  static int migrate_to_node(struct mm_struct *mm, int source, int dest,
  			   int flags)
  {
  	nodemask_t nmask;
  	LIST_HEAD(pagelist);
  	int err = 0;
  
  	nodes_clear(nmask);
  	node_set(source, nmask);

  	check_range(mm, mm->mmap->vm_start, TASK_SIZE, &nmask,
  			flags | MPOL_MF_DISCONTIG_OK, &pagelist);
  	if (!list_empty(&pagelist))
  		err = migrate_pages(&pagelist, new_node_page, dest);
  	return err;
  }
  
  /*
   * Move pages between the two nodesets so as to preserve the physical
   * layout as much as possible.
   *
   * Returns the number of page that could not be moved.
   */
  int do_migrate_pages(struct mm_struct *mm,
  	const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
  {
  	int busy = 0;
  	int err;
  	nodemask_t tmp;

  	err = migrate_prep();
  	if (err)
  		return err;
  	down_read(&mm->mmap_sem);

  	err = migrate_vmas(mm, from_nodes, to_nodes, flags);
  	if (err)
  		goto out;
  /*
   * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
   * bit in 'to' is not also set in 'tmp'.  Clear the found 'source'
   * bit in 'tmp', and return that <source, dest> pair for migration.
   * The pair of nodemasks 'to' and 'from' define the map.
   *
   * If no pair of bits is found that way, fallback to picking some
   * pair of 'source' and 'dest' bits that are not the same.  If the
   * 'source' and 'dest' bits are the same, this represents a node
   * that will be migrating to itself, so no pages need move.
   *
   * If no bits are left in 'tmp', or if all remaining bits left
   * in 'tmp' correspond to the same bit in 'to', return false
   * (nothing left to migrate).
   *
   * This lets us pick a pair of nodes to migrate between, such that
   * if possible the dest node is not already occupied by some other
   * source node, minimizing the risk of overloading the memory on a
   * node that would happen if we migrated incoming memory to a node
   * before migrating outgoing memory source that same node.
   *
   * A single scan of tmp is sufficient.  As we go, we remember the
   * most recent <s, d> pair that moved (s != d).  If we find a pair
   * that not only moved, but what's better, moved to an empty slot
   * (d is not set in tmp), then we break out then, with that pair.
   * Otherwise when we finish scanning from_tmp, we at least have the
   * most recent <s, d> pair that moved.  If we get all the way through
   * the scan of tmp without finding any node that moved, much less
   * moved to an empty node, then there is nothing left worth migrating.
   */
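
  /*
   * Worked example (node numbers purely illustrative): with from = {0,1}
   * and to = {1,2}, node_remap() yields 0 -> 1 and 1 -> 2.  Scanning
   * tmp = {0,1} first remembers the pair 0 -> 1 but keeps going, because
   * node 1 is still a pending source; it then finds 1 -> 2, whose
   * destination is not in tmp, and breaks out.  Node 1 is therefore drained
   * to node 2 before node 0's pages land on node 1, which is the overload
   * avoidance described above.
   */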

  	tmp = *from_nodes;
  	while (!nodes_empty(tmp)) {
  		int s,d;
  		int source = -1;
  		int dest = 0;
  
  		for_each_node_mask(s, tmp) {
  			d = node_remap(s, *from_nodes, *to_nodes);
  			if (s == d)
  				continue;
  
  			source = s;	/* Node moved. Memorize */
  			dest = d;
  
  			/* dest not in remaining from nodes? */
  			if (!node_isset(dest, tmp))
  				break;
  		}
  		if (source == -1)
  			break;
  
  		node_clear(source, tmp);
  		err = migrate_to_node(mm, source, dest, flags);
  		if (err > 0)
  			busy += err;
  		if (err < 0)
  			break;
  	}
  out:
  	up_read(&mm->mmap_sem);
  	if (err < 0)
  		return err;
  	return busy;
  
  }
  /*
   * Allocate a new page for page migration based on vma policy.
   * Start assuming that page is mapped by vma pointed to by @private.
   * Search forward from there, if not.  N.B., this assumes that the
   * list of pages handed to migrate_pages()--which is how we get here--
   * is in virtual address order.
   */
  static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
  {
  	struct vm_area_struct *vma = (struct vm_area_struct *)private;
  	unsigned long uninitialized_var(address);

  	while (vma) {
  		address = page_address_in_vma(page, vma);
  		if (address != -EFAULT)
  			break;
  		vma = vma->vm_next;
  	}
  
  	/*
  	 * if !vma, alloc_page_vma() will use task or system default policy
  	 */
  	return alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
  }
  #else
  
  static void migrate_page_add(struct page *page, struct list_head *pagelist,
  				unsigned long flags)
  {
  }
  int do_migrate_pages(struct mm_struct *mm,
  	const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
  {
  	return -ENOSYS;
  }

  static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
  {
  	return NULL;
  }
  #endif
  static long do_mbind(unsigned long start, unsigned long len,
  		     unsigned short mode, unsigned short mode_flags,
  		     nodemask_t *nmask, unsigned long flags)
  {
  	struct vm_area_struct *vma;
  	struct mm_struct *mm = current->mm;
  	struct mempolicy *new;
  	unsigned long end;
  	int err;
  	LIST_HEAD(pagelist);
  	if (flags & ~(unsigned long)(MPOL_MF_STRICT |
  				     MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
  		return -EINVAL;
  	if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
  		return -EPERM;
  
  	if (start & ~PAGE_MASK)
  		return -EINVAL;
  
  	if (mode == MPOL_DEFAULT)
  		flags &= ~MPOL_MF_STRICT;
  
  	len = (len + PAGE_SIZE - 1) & PAGE_MASK;
  	end = start + len;
  
  	if (end < start)
  		return -EINVAL;
  	if (end == start)
  		return 0;
  	new = mpol_new(mode, mode_flags, nmask);
  	if (IS_ERR(new))
  		return PTR_ERR(new);
  
  	/*
  	 * If we are using the default policy then operation
  	 * on discontinuous address spaces is okay after all
  	 */
  	if (!new)
  		flags |= MPOL_MF_DISCONTIG_OK;
  	pr_debug("mbind %lx-%lx mode:%d flags:%d nodes:%lx\n",
  		 start, start + len, mode, mode_flags,
  		 nmask ? nodes_addr(*nmask)[0] : -1);

  	if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
  
  		err = migrate_prep();
  		if (err)
  			return err;
  	}
  	down_write(&mm->mmap_sem);
  	vma = check_range(mm, start, end, nmask,
  			  flags | MPOL_MF_INVERT, &pagelist);
  
  	err = PTR_ERR(vma);
  	if (!IS_ERR(vma)) {
  		int nr_failed = 0;
  
  		err = mbind_range(vma, start, end, new);

  		if (!list_empty(&pagelist))
  			nr_failed = migrate_pages(&pagelist, new_vma_page,
  						(unsigned long)vma);
  
  		if (!err && nr_failed && (flags & MPOL_MF_STRICT))
  			err = -EIO;
  	}

  	up_write(&mm->mmap_sem);
  	mpol_put(new);
  	return err;
  }
  /*
   * User space interface with variable sized bitmaps for nodelists.
   */
  
  /* Copy a node mask from user space. */
  static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
  		     unsigned long maxnode)
  {
  	unsigned long k;
  	unsigned long nlongs;
  	unsigned long endmask;
  
  	--maxnode;
  	nodes_clear(*nodes);
  	if (maxnode == 0 || !nmask)
  		return 0;
  	if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
  		return -EINVAL;
  
  	nlongs = BITS_TO_LONGS(maxnode);
  	if ((maxnode % BITS_PER_LONG) == 0)
  		endmask = ~0UL;
  	else
  		endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
  
  	/* When the user specifies more nodes than supported, just check
  	   that the unsupported part is all zero. */
  	if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
  		if (nlongs > PAGE_SIZE/sizeof(long))
  			return -EINVAL;
  		for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
  			unsigned long t;
  			if (get_user(t, nmask + k))
  				return -EFAULT;
  			if (k == nlongs - 1) {
  				if (t & endmask)
  					return -EINVAL;
  			} else if (t)
  				return -EINVAL;
  		}
  		nlongs = BITS_TO_LONGS(MAX_NUMNODES);
  		endmask = ~0UL;
  	}
  
  	if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
  		return -EFAULT;
  	nodes_addr(*nodes)[nlongs-1] &= endmask;
  	return 0;
  }
  
  /* Copy a kernel node mask to user space */
  static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
  			      nodemask_t *nodes)
  {
  	unsigned long copy = ALIGN(maxnode-1, 64) / 8;
  	const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long);
  
  	if (copy > nbytes) {
  		if (copy > PAGE_SIZE)
  			return -EINVAL;
  		if (clear_user((char __user *)mask + nbytes, copy - nbytes))
  			return -EFAULT;
  		copy = nbytes;
  	}
  	return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
  }
  
  asmlinkage long sys_mbind(unsigned long start, unsigned long len,
  			unsigned long mode,
  			unsigned long __user *nmask, unsigned long maxnode,
  			unsigned flags)
  {
  	nodemask_t nodes;
  	int err;
  	unsigned short mode_flags;

  	mode_flags = mode & MPOL_MODE_FLAGS;
  	mode &= ~MPOL_MODE_FLAGS;
  	if (mode >= MPOL_MAX)
  		return -EINVAL;
  	if ((mode_flags & MPOL_F_STATIC_NODES) &&
  	    (mode_flags & MPOL_F_RELATIVE_NODES))
  		return -EINVAL;
  	err = get_nodes(&nodes, nmask, maxnode);
  	if (err)
  		return err;
  	return do_mbind(start, len, mode, mode_flags, &nodes, flags);
  }
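
  /*
   * A minimal user-space sketch of this bitmap convention (illustrative
   * only, assuming the <numaif.h> wrappers; addr and length stand for any
   * existing mapping).  get_nodes() treats maxnode - 1 as the number of
   * significant bits, so a caller working with a single unsigned long
   * passes one more than the word holds:
   *
   *	unsigned long mask = (1UL << 0) | (1UL << 2);	// nodes 0 and 2
   *	mbind(addr, length, MPOL_BIND, &mask,
   *	      8 * sizeof(mask) + 1, MPOL_MF_MOVE);
   *
   * Reading a policy back is stricter: sys_get_mempolicy() below insists
   * that the user buffer cover at least MAX_NUMNODES bits before it copies
   * the nodemask out.
   */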
  
  /* Set the process memory policy */
  asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask,
  		unsigned long maxnode)
  {
  	int err;
  	nodemask_t nodes;
  	unsigned short flags;

  	flags = mode & MPOL_MODE_FLAGS;
  	mode &= ~MPOL_MODE_FLAGS;
  	if ((unsigned int)mode >= MPOL_MAX)
  		return -EINVAL;
  	if ((flags & MPOL_F_STATIC_NODES) && (flags & MPOL_F_RELATIVE_NODES))
  		return -EINVAL;
  	err = get_nodes(&nodes, nmask, maxnode);
  	if (err)
  		return err;
  	return do_set_mempolicy(mode, flags, &nodes);
  }
  asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode,
  		const unsigned long __user *old_nodes,
  		const unsigned long __user *new_nodes)
  {
  	const struct cred *cred = current_cred(), *tcred;
  	struct mm_struct *mm;
  	struct task_struct *task;
  	nodemask_t old;
  	nodemask_t new;
  	nodemask_t task_nodes;
  	int err;
  
  	err = get_nodes(&old, old_nodes, maxnode);
  	if (err)
  		return err;
  
  	err = get_nodes(&new, new_nodes, maxnode);
  	if (err)
  		return err;
  
  	/* Find the mm_struct */
  	read_lock(&tasklist_lock);
  	task = pid ? find_task_by_vpid(pid) : current;
  	if (!task) {
  		read_unlock(&tasklist_lock);
  		return -ESRCH;
  	}
  	mm = get_task_mm(task);
  	read_unlock(&tasklist_lock);
  
  	if (!mm)
  		return -EINVAL;
  
  	/*
  	 * Check if this process has the right to modify the specified
  	 * process. The right exists if the process has administrative
  	 * capabilities, superuser privileges or the same
  	 * userid as the target process.
  	 */
  	rcu_read_lock();
  	tcred = __task_cred(task);
  	if (cred->euid != tcred->suid && cred->euid != tcred->uid &&
  	    cred->uid  != tcred->suid && cred->uid  != tcred->uid &&
  	    !capable(CAP_SYS_NICE)) {
  		rcu_read_unlock();
  		err = -EPERM;
  		goto out;
  	}
  	rcu_read_unlock();
  
  	task_nodes = cpuset_mems_allowed(task);
  	/* Is the user allowed to access the target nodes? */
  	if (!nodes_subset(new, task_nodes) && !capable(CAP_SYS_NICE)) {
  		err = -EPERM;
  		goto out;
  	}
  	if (!nodes_subset(new, node_states[N_HIGH_MEMORY])) {
  		err = -EINVAL;
  		goto out;
  	}
  	err = security_task_movememory(task);
  	if (err)
  		goto out;
  	err = do_migrate_pages(mm, &old, &new,
  		capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
  out:
  	mmput(mm);
  	return err;
  }
  /* Retrieve NUMA policy */
  asmlinkage long sys_get_mempolicy(int __user *policy,
  				unsigned long __user *nmask,
  				unsigned long maxnode,
  				unsigned long addr, unsigned long flags)
  {
  	int err;
  	int uninitialized_var(pval);
  	nodemask_t nodes;
  
  	if (nmask != NULL && maxnode < MAX_NUMNODES)
  		return -EINVAL;
  
  	err = do_get_mempolicy(&pval, &nodes, addr, flags);
  
  	if (err)
  		return err;
  
  	if (policy && put_user(pval, policy))
  		return -EFAULT;
  
  	if (nmask)
  		err = copy_nodes_to_user(nmask, maxnode, &nodes);
  
  	return err;
  }
  #ifdef CONFIG_COMPAT
  
  asmlinkage long compat_sys_get_mempolicy(int __user *policy,
  				     compat_ulong_t __user *nmask,
  				     compat_ulong_t maxnode,
  				     compat_ulong_t addr, compat_ulong_t flags)
  {
  	long err;
  	unsigned long __user *nm = NULL;
  	unsigned long nr_bits, alloc_size;
  	DECLARE_BITMAP(bm, MAX_NUMNODES);
  
  	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
  	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
  
  	if (nmask)
  		nm = compat_alloc_user_space(alloc_size);
  
  	err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags);
  
  	if (!err && nmask) {
  		err = copy_from_user(bm, nm, alloc_size);
  		/* ensure entire bitmap is zeroed */
  		err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
  		err |= compat_put_bitmap(nmask, bm, nr_bits);
  	}
  
  	return err;
  }
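/*
 * Worked example of the size calculation above (illustrative values,
 * assuming a 64-bit kernel with MAX_NUMNODES == 64): a caller passing
 * maxnode == 65 yields nr_bits == min(64, 64) == 64, so alloc_size ==
 * ALIGN(64, 64) / 8 == 8 bytes, i.e. one unsigned long of compat
 * user-space scratch area for the node bitmap.
 */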
  
  asmlinkage long compat_sys_set_mempolicy(int mode, compat_ulong_t __user *nmask,
  				     compat_ulong_t maxnode)
  {
  	long err = 0;
  	unsigned long __user *nm = NULL;
  	unsigned long nr_bits, alloc_size;
  	DECLARE_BITMAP(bm, MAX_NUMNODES);
  
  	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
  	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
  
  	if (nmask) {
  		err = compat_get_bitmap(bm, nmask, nr_bits);
  		nm = compat_alloc_user_space(alloc_size);
  		err |= copy_to_user(nm, bm, alloc_size);
  	}
  
  	if (err)
  		return -EFAULT;
  
  	return sys_set_mempolicy(mode, nm, nr_bits+1);
  }
  
  asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
  			     compat_ulong_t mode, compat_ulong_t __user *nmask,
  			     compat_ulong_t maxnode, compat_ulong_t flags)
  {
  	long err = 0;
  	unsigned long __user *nm = NULL;
  	unsigned long nr_bits, alloc_size;
	nodemask_t bm;
  
  	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
  	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
  
  	if (nmask) {
		err = compat_get_bitmap(nodes_addr(bm), nmask, nr_bits);
		nm = compat_alloc_user_space(alloc_size);
		err |= copy_to_user(nm, nodes_addr(bm), alloc_size);
  	}
  
  	if (err)
  		return -EFAULT;
  
  	return sys_mbind(start, len, mode, nm, nr_bits+1, flags);
  }
  
  #endif
/*
 * get_vma_policy(@task, @vma, @addr)
 * @task - task for fallback if vma policy == default
 * @vma   - virtual memory area whose policy is sought
 * @addr  - address in @vma for shared policy lookup
 *
 * Returns effective policy for a VMA at specified address.
 * Falls back to @task or system default policy, as necessary.
 * Current or other task's task mempolicy and non-shared vma policies
 * are protected by the task's mmap_sem, which must be held for read by
 * the caller.
 * Shared policies [those marked as MPOL_F_SHARED] require an extra reference
 * count--added by the get_policy() vm_op, as appropriate--to protect against
 * freeing by another task.  It is the caller's responsibility to free the
 * extra reference for shared policies.
 */
static struct mempolicy *get_vma_policy(struct task_struct *task,
		struct vm_area_struct *vma, unsigned long addr)
{
	struct mempolicy *pol = task->mempolicy;

	if (vma) {
		if (vma->vm_ops && vma->vm_ops->get_policy) {
			struct mempolicy *vpol = vma->vm_ops->get_policy(vma,
									addr);
			if (vpol)
				pol = vpol;
		} else if (vma->vm_policy)
			pol = vma->vm_policy;
	}
	if (!pol)
		pol = &default_policy;
	return pol;
}
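/*
 * Usage note (summarizing the rules in the comment above): the allocation
 * paths below look the policy up with get_vma_policy() and, when the result
 * is a shared policy (MPOL_F_SHARED, e.g. returned by a shmem/tmpfs
 * ->get_policy op), must drop the extra reference afterwards, typically via
 * mpol_cond_put().
 */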
/*
 * Return a nodemask representing a mempolicy for filtering nodes for
 * page allocation
 */
static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
{
	/* Lower zones don't get a nodemask applied for MPOL_BIND */
	if (unlikely(policy->mode == MPOL_BIND) &&
  			gfp_zone(gfp) >= policy_zone &&
  			cpuset_nodemask_valid_mems_allowed(&policy->v.nodes))
  		return &policy->v.nodes;
  
  	return NULL;
  }
/* Return a zonelist indicated by gfp for node representing a mempolicy */
static struct zonelist *policy_zonelist(gfp_t gfp, struct mempolicy *policy)
{
	int nd = numa_node_id();

  	switch (policy->mode) {
	case MPOL_PREFERRED:
		if (!(policy->flags & MPOL_F_LOCAL))
			nd = policy->v.preferred_node;
		break;
	case MPOL_BIND:
		/*
		 * Normally, MPOL_BIND allocations are node-local within the
		 * allowed nodemask.  However, if __GFP_THISNODE is set and the
		 * current node isn't part of the mask, we use the zonelist for
		 * the first node in the mask instead.
		 */
		if (unlikely(gfp & __GFP_THISNODE) &&
				unlikely(!node_isset(nd, policy->v.nodes)))
			nd = first_node(policy->v.nodes);
		break;
	case MPOL_INTERLEAVE: /* should not happen */
		break;
	default:
		BUG();
	}
	return node_zonelist(nd, gfp);
  }
  
  /* Do dynamic interleaving for a process */
  static unsigned interleave_nodes(struct mempolicy *policy)
  {
  	unsigned nid, next;
  	struct task_struct *me = current;
  
  	nid = me->il_next;
	next = next_node(nid, policy->v.nodes);
	if (next >= MAX_NUMNODES)
		next = first_node(policy->v.nodes);
	if (next < MAX_NUMNODES)
		me->il_next = next;
  	return nid;
  }
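/*
 * Example of the round-robin behaviour above (illustrative node numbers,
 * assuming il_next starts at node 0): with policy->v.nodes containing
 * {0, 2, 3}, successive calls from the same task return 0, 2, 3, 0, 2, 3,
 * ... with il_next always holding the node to hand out on the next call.
 */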
/*
 * Depending on the memory policy provide a node from which to allocate the
 * next slab entry.
 * @policy must be protected from freeing by the caller.  If @policy is
 * the current task's mempolicy, this protection is implicit, as only the
 * task can change its policy.  The system default policy requires no
 * such protection.
 */
unsigned slab_node(struct mempolicy *policy)
{
  	if (!policy || policy->flags & MPOL_F_LOCAL)
  		return numa_node_id();
  
  	switch (policy->mode) {
  	case MPOL_PREFERRED:
		/*
		 * handled MPOL_F_LOCAL above
		 */
		return policy->v.preferred_node;

  	case MPOL_INTERLEAVE:
  		return interleave_nodes(policy);
	case MPOL_BIND: {
		/*
		 * Follow bind policy behavior and start allocation at the
		 * first node.
		 */
  		struct zonelist *zonelist;
  		struct zone *zone;
  		enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL);
  		zonelist = &NODE_DATA(numa_node_id())->node_zonelists[0];
  		(void)first_zones_zonelist(zonelist, highest_zoneidx,
  							&policy->v.nodes,
  							&zone);
  		return zone->node;
	}

	default:
		BUG();
  	}
  }
  /* Do static interleaving for a VMA with known offset. */
  static unsigned offset_il_node(struct mempolicy *pol,
  		struct vm_area_struct *vma, unsigned long off)
  {
	unsigned nnodes = nodes_weight(pol->v.nodes);
	unsigned target;
	int c;
	int nid = -1;
	if (!nnodes)
		return numa_node_id();
	target = (unsigned int)off % nnodes;
	c = 0;
	do {
		nid = next_node(nid, pol->v.nodes);
		c++;
	} while (c <= target);
  	return nid;
  }
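/*
 * Worked example for the modulo walk above (illustrative values): with
 * pol->v.nodes = {1, 3} we have nnodes == 2, so an offset of 5 gives
 * target == 5 % 2 == 1; the do/while then steps to node 1 (c == 1) and
 * node 3 (c == 2), returning node 3.
 */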
  /* Determine a node number for interleave */
  static inline unsigned interleave_nid(struct mempolicy *pol,
  		 struct vm_area_struct *vma, unsigned long addr, int shift)
  {
  	if (vma) {
  		unsigned long off;
  		/*
  		 * for small pages, there is no difference between
  		 * shift and PAGE_SHIFT, so the bit-shift is safe.
  		 * for huge pages, since vm_pgoff is in units of small
  		 * pages, we need to shift off the always 0 bits to get
  		 * a useful offset.
  		 */
  		BUG_ON(shift < PAGE_SHIFT);
  		off = vma->vm_pgoff >> (shift - PAGE_SHIFT);
  		off += (addr - vma->vm_start) >> shift;
  		return offset_il_node(pol, vma, off);
  	} else
  		return interleave_nodes(pol);
  }
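/*
 * Example of the offset computation above (illustrative, assuming 4K base
 * pages and a 2M huge page, i.e. PAGE_SHIFT == 12 and shift == 21):
 * vm_pgoff is counted in 4K units, so off = vm_pgoff >> (21 - 12) converts
 * it to 2M units before the in-VMA offset ((addr - vm_start) >> 21) is
 * added and the result is interleaved across the allowed nodes.
 */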
  #ifdef CONFIG_HUGETLBFS
/*
 * huge_zonelist(@vma, @addr, @gfp_flags, @mpol)
 * @vma = virtual memory area whose policy is sought
 * @addr = address in @vma for shared policy lookup and interleave policy
 * @gfp_flags = for requested zone
 * @mpol = pointer to mempolicy pointer for reference counted mempolicy
 * @nodemask = pointer to nodemask pointer for MPOL_BIND nodemask
 *
 * Returns a zonelist suitable for a huge page allocation and a pointer
 * to the struct mempolicy for conditional unref after allocation.
 * If the effective policy is MPOL_BIND, returns a pointer to the mempolicy's
 * @nodemask for filtering the zonelist.
 */
  struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr,
				gfp_t gfp_flags, struct mempolicy **mpol,
				nodemask_t **nodemask)
{
	struct zonelist *zl;

	*mpol = get_vma_policy(current, vma, addr);
	*nodemask = NULL;	/* assume !MPOL_BIND */

	if (unlikely((*mpol)->mode == MPOL_INTERLEAVE)) {
		zl = node_zonelist(interleave_nid(*mpol, vma, addr,
				huge_page_shift(hstate_vma(vma))), gfp_flags);
	} else {
		zl = policy_zonelist(gfp_flags, *mpol);
		if ((*mpol)->mode == MPOL_BIND)
			*nodemask = &(*mpol)->v.nodes;
	}
	return zl;
  }
#endif

  /* Allocate a page in interleaved policy.
     Own path because it needs to do special accounting. */
static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
					unsigned nid)
{
	struct zonelist *zl;
	struct page *page;
	zl = node_zonelist(nid, gfp);
	page = __alloc_pages(gfp, order, zl);
	if (page && page_zone(page) == zonelist_zone(&zl->_zonerefs[0]))
		inc_zone_page_state(page, NUMA_INTERLEAVE_HIT);
  	return page;
  }
  
  /**
   * 	alloc_page_vma	- Allocate a page for a VMA.
   *
   * 	@gfp:
   *      %GFP_USER    user allocation.
   *      %GFP_KERNEL  kernel allocations,
   *      %GFP_HIGHMEM highmem/user allocations,
   *      %GFP_FS      allocation should not call back into a file system.
   *      %GFP_ATOMIC  don't sleep.
   *
   * 	@vma:  Pointer to VMA or NULL if not available.
   *	@addr: Virtual Address of the allocation. Must be inside the VMA.
   *
   * 	This function allocates a page from the kernel page pool and applies
   *	a NUMA policy associated with the VMA or the current process.
   *	When VMA is not NULL caller must hold down_read on the mmap_sem of the
   *	mm_struct of the VMA to prevent it from going away. Should be used for
   *	all allocations for pages that will be mapped into
   * 	user space. Returns NULL when no page can be allocated.
   *
 *	Should be called with the mmap_sem of the vma held.
   */
  struct page *
alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
{
	struct mempolicy *pol = get_vma_policy(current, vma, addr);
	struct zonelist *zl;

	cpuset_update_task_memory_state();

	if (unlikely(pol->mode == MPOL_INTERLEAVE)) {
		unsigned nid;

		nid = interleave_nid(pol, vma, addr, PAGE_SHIFT);
		mpol_cond_put(pol);
		return alloc_page_interleave(gfp, 0, nid);
	}
	zl = policy_zonelist(gfp, pol);
	if (unlikely(mpol_needs_cond_ref(pol))) {
		/*
		 * slow path: ref counted shared policy
		 */
		struct page *page =  __alloc_pages_nodemask(gfp, 0,
						zl, policy_nodemask(gfp, pol));
		__mpol_put(pol);
		return page;
	}
	/*
	 * fast path:  default or task policy
	 */
	return __alloc_pages_nodemask(gfp, 0, zl, policy_nodemask(gfp, pol));
  }
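/*
 * Typical use (illustrative): fault handlers that map a new page into a
 * task's address space allocate it through
 * alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address) while holding mmap_sem
 * for read, so the interleave/preferred/bind decision above is made per
 * faulting address rather than per task.
 */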
  
  /**
   * 	alloc_pages_current - Allocate pages.
   *
   *	@gfp:
   *		%GFP_USER   user allocation,
   *      	%GFP_KERNEL kernel allocation,
   *      	%GFP_HIGHMEM highmem allocation,
   *      	%GFP_FS     don't call back into a file system.
   *      	%GFP_ATOMIC don't sleep.
   *	@order: Power of two of allocation size in pages. 0 is a single page.
   *
 *	Allocate a page from the kernel page pool.  When not in
 *	interrupt context, apply the current process' NUMA policy.
   *	Returns NULL when no page can be allocated.
   *
 *	Don't call cpuset_update_task_memory_state() unless
   *	1) it's ok to take cpuset_sem (can WAIT), and
   *	2) allocating for current task (not interrupt).
   */
struct page *alloc_pages_current(gfp_t gfp, unsigned order)
{
	struct mempolicy *pol = current->mempolicy;

	if ((gfp & __GFP_WAIT) && !in_interrupt())
		cpuset_update_task_memory_state();
	if (!pol || in_interrupt() || (gfp & __GFP_THISNODE))
		pol = &default_policy;

	/*
	 * No reference counting needed for current->mempolicy
	 * nor system default_policy
	 */
	if (pol->mode == MPOL_INTERLEAVE)
		return alloc_page_interleave(gfp, order, interleave_nodes(pol));
	return __alloc_pages_nodemask(gfp, order,
			policy_zonelist(gfp, pol), policy_nodemask(gfp, pol));
  }
  EXPORT_SYMBOL(alloc_pages_current);
/*
 * If mpol_dup() sees current->cpuset == cpuset_being_rebound, then it
 * rebinds the mempolicy it is copying by calling mpol_rebind_policy()
 * with the mems_allowed returned by cpuset_mems_allowed().  This
 * keeps mempolicies cpuset relative after its cpuset moves.  See
 * further kernel/cpuset.c update_nodemask().
 */

/* Slow path of a mempolicy duplicate */
struct mempolicy *__mpol_dup(struct mempolicy *old)
{
	struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);

	if (!new)
		return ERR_PTR(-ENOMEM);
	if (current_cpuset_is_being_rebound()) {
		nodemask_t mems = cpuset_mems_allowed(current);
		mpol_rebind_policy(old, &mems);
	}
	*new = *old;
	atomic_set(&new->refcnt, 1);
	return new;
}
  /*
   * If *frompol needs [has] an extra ref, copy *frompol to *tompol ,
   * eliminate the * MPOL_F_* flags that require conditional ref and
   * [NOTE!!!] drop the extra ref.  Not safe to reference *frompol directly
   * after return.  Use the returned value.
   *
   * Allows use of a mempolicy for, e.g., multiple allocations with a single
   * policy lookup, even if the policy needs/has extra ref on lookup.
   * shmem_readahead needs this.
   */
  struct mempolicy *__mpol_cond_copy(struct mempolicy *tompol,
  						struct mempolicy *frompol)
  {
  	if (!mpol_needs_cond_ref(frompol))
  		return frompol;
  
  	*tompol = *frompol;
  	tompol->flags &= ~MPOL_F_SHARED;	/* copy doesn't need unref */
  	__mpol_put(frompol);
  	return tompol;
  }
  static int mpol_match_intent(const struct mempolicy *a,
  			     const struct mempolicy *b)
  {
  	if (a->flags != b->flags)
  		return 0;
  	if (!mpol_store_user_nodemask(a))
  		return 1;
  	return nodes_equal(a->w.user_nodemask, b->w.user_nodemask);
  }
  /* Slow path of a mempolicy comparison */
  int __mpol_equal(struct mempolicy *a, struct mempolicy *b)
  {
  	if (!a || !b)
  		return 0;
	if (a->mode != b->mode)
		return 0;
	if (a->mode != MPOL_DEFAULT && !mpol_match_intent(a, b))
		return 0;
	switch (a->mode) {
	case MPOL_BIND:
		/* Fall through */
	case MPOL_INTERLEAVE:
		return nodes_equal(a->v.nodes, b->v.nodes);
	case MPOL_PREFERRED:
		return a->v.preferred_node == b->v.preferred_node &&
			a->flags == b->flags;
  	default:
  		BUG();
  		return 0;
  	}
  }
/*
   * Shared memory backing store policy support.
   *
   * Remember policies even when nobody has shared memory mapped.
   * The policies are kept in Red-Black tree linked from the inode.
   * They are protected by the sp->lock spinlock, which should be held
   * for any accesses to the tree.
   */
  
  /* lookup first element intersecting start-end */
  /* Caller holds sp->lock */
  static struct sp_node *
  sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
  {
  	struct rb_node *n = sp->root.rb_node;
  
  	while (n) {
  		struct sp_node *p = rb_entry(n, struct sp_node, nd);
  
  		if (start >= p->end)
  			n = n->rb_right;
  		else if (end <= p->start)
  			n = n->rb_left;
  		else
  			break;
  	}
  	if (!n)
  		return NULL;
  	for (;;) {
  		struct sp_node *w = NULL;
  		struct rb_node *prev = rb_prev(n);
  		if (!prev)
  			break;
  		w = rb_entry(prev, struct sp_node, nd);
  		if (w->end <= start)
  			break;
  		n = prev;
  	}
  	return rb_entry(n, struct sp_node, nd);
  }
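/*
 * Example of the lookup semantics above (illustrative offsets): ranges are
 * stored as [start, end) in units of file pages, so with stored nodes
 * covering [0, 4) and [8, 16), sp_lookup(sp, 2, 3) returns the first node,
 * sp_lookup(sp, 4, 8) returns NULL, and sp_lookup(sp, 3, 9) returns the
 * [0, 4) node because it is the lowest range intersecting the request.
 */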
  
  /* Insert a new shared policy into the list. */
  /* Caller holds sp->lock */
  static void sp_insert(struct shared_policy *sp, struct sp_node *new)
  {
  	struct rb_node **p = &sp->root.rb_node;
  	struct rb_node *parent = NULL;
  	struct sp_node *nd;
  
  	while (*p) {
  		parent = *p;
  		nd = rb_entry(parent, struct sp_node, nd);
  		if (new->start < nd->start)
  			p = &(*p)->rb_left;
  		else if (new->end > nd->end)
  			p = &(*p)->rb_right;
  		else
  			BUG();
  	}
  	rb_link_node(&new->nd, parent, p);
  	rb_insert_color(&new->nd, &sp->root);
	pr_debug("inserting %lx-%lx: %d\n", new->start, new->end,
		 new->policy ? new->policy->mode : 0);
  }
  
  /* Find shared policy intersecting idx */
  struct mempolicy *
  mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
  {
  	struct mempolicy *pol = NULL;
  	struct sp_node *sn;
  
  	if (!sp->root.rb_node)
  		return NULL;
  	spin_lock(&sp->lock);
  	sn = sp_lookup(sp, idx, idx+1);
  	if (sn) {
  		mpol_get(sn->policy);
  		pol = sn->policy;
  	}
  	spin_unlock(&sp->lock);
  	return pol;
  }
  
  static void sp_delete(struct shared_policy *sp, struct sp_node *n)
  {
	pr_debug("deleting %lx-%lx\n", n->start, n->end);
	rb_erase(&n->nd, &sp->root);
	mpol_put(n->policy);
	kmem_cache_free(sn_cache, n);
}
static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
				struct mempolicy *pol)
{
	struct sp_node *n = kmem_cache_alloc(sn_cache, GFP_KERNEL);

	if (!n)
		return NULL;
	n->start = start;
	n->end = end;
	mpol_get(pol);
	pol->flags |= MPOL_F_SHARED;	/* for unref */
  	n->policy = pol;
  	return n;
  }
  
  /* Replace a policy range. */
  static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
  				 unsigned long end, struct sp_node *new)
  {
  	struct sp_node *n, *new2 = NULL;
  
  restart:
  	spin_lock(&sp->lock);
  	n = sp_lookup(sp, start, end);
  	/* Take care of old policies in the same range. */
  	while (n && n->start < end) {
  		struct rb_node *next = rb_next(&n->nd);
  		if (n->start >= start) {
  			if (n->end <= end)
  				sp_delete(sp, n);
  			else
  				n->start = end;
  		} else {
  			/* Old policy spanning whole new range. */
  			if (n->end > end) {
  				if (!new2) {
  					spin_unlock(&sp->lock);
  					new2 = sp_alloc(end, n->end, n->policy);
  					if (!new2)
  						return -ENOMEM;
  					goto restart;
  				}
  				n->end = start;
  				sp_insert(sp, new2);
  				new2 = NULL;
  				break;
  			} else
  				n->end = start;
  		}
  		if (!next)
  			break;
  		n = rb_entry(next, struct sp_node, nd);
  	}
  	if (new)
  		sp_insert(sp, new);
  	spin_unlock(&sp->lock);
  	if (new2) {
		mpol_put(new2->policy);
  		kmem_cache_free(sn_cache, new2);
  	}
  	return 0;
  }
  /**
   * mpol_shared_policy_init - initialize shared policy for inode
   * @sp: pointer to inode shared policy
   * @mpol:  struct mempolicy to install
   *
   * Install non-NULL @mpol in inode's shared policy rb-tree.
   * On entry, the current task has a reference on a non-NULL @mpol.
   * This must be released on exit.
   */
  void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
  {
  	sp->root = RB_ROOT;		/* empty tree == default mempolicy */
  	spin_lock_init(&sp->lock);
  
  	if (mpol) {
  		struct vm_area_struct pvma;
  		struct mempolicy *new;
  
  		/* contextualize the tmpfs mount point mempolicy */
  		new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask);
  		mpol_put(mpol);	/* drop our ref on sb mpol */
  		if (IS_ERR(new))
  			return;		/* no valid nodemask intersection */
  
  		/* Create pseudo-vma that contains just the policy */
  		memset(&pvma, 0, sizeof(struct vm_area_struct));
  		pvma.vm_end = TASK_SIZE;	/* policy covers entire file */
  		mpol_set_shared_policy(sp, &pvma, new); /* adds ref */
  		mpol_put(new);			/* drop initial ref */
	}
}
int mpol_set_shared_policy(struct shared_policy *info,
			struct vm_area_struct *vma, struct mempolicy *npol)
{
	int err;
	struct sp_node *new = NULL;
	unsigned long sz = vma_pages(vma);

	pr_debug("set_shared_policy %lx sz %lu %d %d %lx\n",
		 vma->vm_pgoff,
		 sz, npol ? npol->mode : -1,
		 npol ? npol->flags : -1,
		 npol ? nodes_addr(npol->v.nodes)[0] : -1);
  
  	if (npol) {
  		new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
  		if (!new)
  			return -ENOMEM;
  	}
  	err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
  	if (err && new)
  		kmem_cache_free(sn_cache, new);
  	return err;
  }
  
  /* Free a backing policy store on inode delete. */
  void mpol_free_shared_policy(struct shared_policy *p)
  {
  	struct sp_node *n;
  	struct rb_node *next;
  
  	if (!p->root.rb_node)
  		return;
  	spin_lock(&p->lock);
  	next = rb_first(&p->root);
  	while (next) {
  		n = rb_entry(next, struct sp_node, nd);
  		next = rb_next(&n->nd);
		rb_erase(&n->nd, &p->root);
		mpol_put(n->policy);
  		kmem_cache_free(sn_cache, n);
  	}
  	spin_unlock(&p->lock);
  }
  
  /* assumes fs == KERNEL_DS */
  void __init numa_policy_init(void)
  {
	nodemask_t interleave_nodes;
	unsigned long largest = 0;
	int nid, prefer = 0;
	policy_cache = kmem_cache_create("numa_policy",
					 sizeof(struct mempolicy),
					 0, SLAB_PANIC, NULL);

	sn_cache = kmem_cache_create("shared_policy_node",
				     sizeof(struct sp_node),
				     0, SLAB_PANIC, NULL);

  	/*
  	 * Set interleaving policy for system init. Interleaving is only
  	 * enabled across suitably sized nodes (default is >= 16MB), or
  	 * fall back to the largest node if they're all smaller.
  	 */
  	nodes_clear(interleave_nodes);
  	for_each_node_state(nid, N_HIGH_MEMORY) {
  		unsigned long total_pages = node_present_pages(nid);
  
  		/* Preserve the largest node */
  		if (largest < total_pages) {
  			largest = total_pages;
  			prefer = nid;
  		}
  
  		/* Interleave this node? */
  		if ((total_pages << PAGE_SHIFT) >= (16 << 20))
  			node_set(nid, interleave_nodes);
  	}
  
  	/* All too small, use the largest */
  	if (unlikely(nodes_empty(interleave_nodes)))
  		node_set(prefer, interleave_nodes);

	if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes))
		printk("numa_policy_init: interleaving failed\n");
  }
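/*
 * Example of the size check above (illustrative, assuming 4K pages):
 * (total_pages << PAGE_SHIFT) >= (16 << 20) means a node needs at least
 * 16MB, i.e. 4096 present pages, to be included in the boot-time interleave
 * set; smaller nodes are skipped unless every node is small, in which case
 * only the largest node is used.
 */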
/* Reset policy of current process to default */
void numa_default_policy(void)
{
	do_set_mempolicy(MPOL_DEFAULT, 0, NULL);
}

/*
 * Parse and format mempolicy from/to strings
 */

/*
 * "local" is pseudo-policy:  MPOL_PREFERRED with MPOL_F_LOCAL flag
 * Used only for mpol_parse_str() and mpol_to_str()
 */
#define MPOL_LOCAL (MPOL_INTERLEAVE + 1)
static const char * const policy_types[] =
	{ "default", "prefer", "bind", "interleave", "local" };


#ifdef CONFIG_TMPFS
/**
 * mpol_parse_str - parse string to mempolicy
 * @str:  string containing mempolicy to parse
 * @mpol:  pointer to struct mempolicy pointer, returned on success.
 * @no_context:  flag whether to "contextualize" the mempolicy
 *
 * Format of input:
 *	<mode>[=<flags>][:<nodelist>]
 *
   * if @no_context is true, save the input nodemask in w.user_nodemask in
   * the returned mempolicy.  This will be used to "clone" the mempolicy in
   * a specific context [cpuset] at a later time.  Used to parse tmpfs mpol
   * mount option.  Note that if 'static' or 'relative' mode flags were
   * specified, the input nodemask will already have been saved.  Saving
   * it again is redundant, but safe.
   *
   * On success, returns 0, else 1
 */
int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context)
{
	struct mempolicy *new = NULL;
	unsigned short uninitialized_var(mode);
	unsigned short uninitialized_var(mode_flags);
	nodemask_t nodes;
  	char *nodelist = strchr(str, ':');
  	char *flags = strchr(str, '=');
  	int i;
  	int err = 1;
  
  	if (nodelist) {
  		/* NUL-terminate mode or flags string */
  		*nodelist++ = '\0';
		if (nodelist_parse(nodelist, nodes))
			goto out;
		if (!nodes_subset(nodes, node_states[N_HIGH_MEMORY]))
			goto out;
	} else
		nodes_clear(nodes);
	if (flags)
		*flags++ = '\0';	/* terminate mode string */
	for (i = 0; i <= MPOL_LOCAL; i++) {
		if (!strcmp(str, policy_types[i])) {
			mode = i;
			break;
		}
	}
	if (i > MPOL_LOCAL)
		goto out;
	switch (mode) {
	case MPOL_PREFERRED:
		/*
		 * Insist on a nodelist of one node only
		 */
  		if (nodelist) {
  			char *rest = nodelist;
  			while (isdigit(*rest))
  				rest++;
  			if (!*rest)
  				err = 0;
  		}
  		break;
	case MPOL_INTERLEAVE:
		/*
		 * Default to online nodes with memory if no nodelist
		 */
		if (!nodelist)
			nodes = node_states[N_HIGH_MEMORY];
		err = 0;
		break;
	case MPOL_LOCAL:
		/*
		 * Don't allow a nodelist;  mpol_new() checks flags
		 */
		if (nodelist)
			goto out;
		mode = MPOL_PREFERRED;
		break;

	/*
	 * case MPOL_BIND:    mpol_new() enforces non-empty nodemask.
	 * case MPOL_DEFAULT: mpol_new() enforces empty nodemask, ignores flags.
	 */
	}
	mode_flags = 0;
	if (flags) {
		/*
		 * Currently, we only support two mutually exclusive
		 * mode flags.
		 */
		if (!strcmp(flags, "static"))
			mode_flags |= MPOL_F_STATIC_NODES;
		else if (!strcmp(flags, "relative"))
			mode_flags |= MPOL_F_RELATIVE_NODES;
		else
			err = 1;
	}

	new = mpol_new(mode, mode_flags, &nodes);
	if (IS_ERR(new))
		err = 1;
	else if (no_context)
		new->w.user_nodemask = nodes;	/* save for contextualization */
out:
	/* Restore string for error message */
	if (nodelist)
		*--nodelist = ':';
	if (flags)
		*--flags = '=';
	if (!err)
		*mpol = new;
  	return err;
  }
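/*
 * Example inputs accepted by the parser above (illustrative tmpfs mount
 * option values): "interleave:0-3" yields MPOL_INTERLEAVE over nodes 0-3,
 * and "prefer=static:1" yields MPOL_PREFERRED with MPOL_F_STATIC_NODES
 * and a nodelist of node 1.
 */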
  #endif /* CONFIG_TMPFS */
/**
 * mpol_to_str - format a mempolicy structure for printing
 * @buffer:  to contain formatted mempolicy string
 * @maxlen:  length of @buffer
 * @pol:  pointer to mempolicy to be formatted
 * @no_context:  "context free" mempolicy - use nodemask in w.user_nodemask
 *
 * Convert a mempolicy into a string.
 * Returns the number of characters in buffer (if positive)
 * or an error (negative)
 */
int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol, int no_context)
{
	char *p = buffer;
	int l;
	nodemask_t nodes;
	unsigned short mode;
	unsigned short flags = pol ? pol->flags : 0;

	/*
	 * Sanity check:  room for longest mode, flag and some nodes
	 */
	VM_BUG_ON(maxlen < strlen("interleave") + strlen("relative") + 16);
	if (!pol || pol == &default_policy)
		mode = MPOL_DEFAULT;
	else
		mode = pol->mode;
	switch (mode) {
	case MPOL_DEFAULT:
		nodes_clear(nodes);
		break;

	case MPOL_PREFERRED:
		nodes_clear(nodes);
		if (flags & MPOL_F_LOCAL)
			mode = MPOL_LOCAL;	/* pseudo-policy */
		else
			node_set(pol->v.preferred_node, nodes);
		break;

	case MPOL_BIND:
		/* Fall through */
	case MPOL_INTERLEAVE:
		if (no_context)
			nodes = pol->w.user_nodemask;
		else
			nodes = pol->v.nodes;
		break;

	default:
		BUG();
	}

	l = strlen(policy_types[mode]);
	if (buffer + maxlen < p + l + 1)
		return -ENOSPC;

	strcpy(p, policy_types[mode]);
	p += l;
	if (flags & MPOL_MODE_FLAGS) {
		if (buffer + maxlen < p + 2)
			return -ENOSPC;
		*p++ = '=';
		/*
		 * Currently, the only defined flags are mutually exclusive
		 */
		if (flags & MPOL_F_STATIC_NODES)
			p += snprintf(p, buffer + maxlen - p, "static");
		else if (flags & MPOL_F_RELATIVE_NODES)
			p += snprintf(p, buffer + maxlen - p, "relative");
	}
	if (!nodes_empty(nodes)) {
		if (buffer + maxlen < p + 2)
			return -ENOSPC;
		*p++ = ':';
  	 	p += nodelist_scnprintf(p, buffer + maxlen - p, nodes);
  	}
  	return p - buffer;
  }
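/*
 * Example outputs of the formatter above (illustrative): a policy
 * interleaving over nodes 0-3 is shown as "interleave:0-3", a static
 * preferred policy on node 2 as "prefer=static:2", and the
 * local-allocation pseudo-policy simply as "local".
 */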
  
  struct numa_maps {
  	unsigned long pages;
  	unsigned long anon;
	unsigned long active;
	unsigned long writeback;
	unsigned long mapcount_max;
	unsigned long dirty;
	unsigned long swapcache;
	unsigned long node[MAX_NUMNODES];
};
static void gather_stats(struct page *page, void *private, int pte_dirty)
{
	struct numa_maps *md = private;
	int count = page_mapcount(page);
	md->pages++;
	if (pte_dirty || PageDirty(page))
		md->dirty++;

	if (PageSwapCache(page))
		md->swapcache++;

	if (PageActive(page) || PageUnevictable(page))
		md->active++;

	if (PageWriteback(page))
		md->writeback++;

	if (PageAnon(page))
		md->anon++;
	if (count > md->mapcount_max)
		md->mapcount_max = count;
	md->node[page_to_nid(page)]++;
}
  #ifdef CONFIG_HUGETLB_PAGE
  static void check_huge_range(struct vm_area_struct *vma,
  		unsigned long start, unsigned long end,
  		struct numa_maps *md)
  {
  	unsigned long addr;
  	struct page *page;
  	struct hstate *h = hstate_vma(vma);
  	unsigned long sz = huge_page_size(h);

  	for (addr = start; addr < end; addr += sz) {
  		pte_t *ptep = huge_pte_offset(vma->vm_mm,
  						addr & huge_page_mask(h));
  		pte_t pte;
  
  		if (!ptep)
  			continue;
  
  		pte = *ptep;
  		if (pte_none(pte))
  			continue;
  
  		page = pte_page(pte);
  		if (!page)
  			continue;
  
  		gather_stats(page, md, pte_dirty(*ptep));
  	}
  }
  #else
  static inline void check_huge_range(struct vm_area_struct *vma,
  		unsigned long start, unsigned long end,
  		struct numa_maps *md)
  {
  }
  #endif

/*
 * Display pages allocated per node and memory policy via /proc.
 */
int show_numa_map(struct seq_file *m, void *v)
{
	struct proc_maps_private *priv = m->private;
	struct vm_area_struct *vma = v;
	struct numa_maps *md;
	struct file *file = vma->vm_file;
	struct mm_struct *mm = vma->vm_mm;
	struct mempolicy *pol;
	int n;
	char buffer[50];
	if (!mm)
		return 0;

	md = kzalloc(sizeof(struct numa_maps), GFP_KERNEL);
	if (!md)
		return 0;
	pol = get_vma_policy(priv->task, vma, vma->vm_start);
	mpol_to_str(buffer, sizeof(buffer), pol, 0);
	mpol_cond_put(pol);

	seq_printf(m, "%08lx %s", vma->vm_start, buffer);

	if (file) {
		seq_printf(m, " file=");
		seq_path(m, &file->f_path, "\n\t= ");
  	} else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) {
  		seq_printf(m, " heap");
  	} else if (vma->vm_start <= mm->start_stack &&
  			vma->vm_end >= mm->start_stack) {
  		seq_printf(m, " stack");
  	}
  
  	if (is_vm_hugetlb_page(vma)) {
  		check_huge_range(vma, vma->vm_start, vma->vm_end, md);
  		seq_printf(m, " huge");
  	} else {
		check_pgd_range(vma, vma->vm_start, vma->vm_end,
			&node_states[N_HIGH_MEMORY], MPOL_MF_STATS, md);
	}

	if (!md->pages)
		goto out;

	if (md->anon)
		seq_printf(m," anon=%lu",md->anon);

	if (md->dirty)
		seq_printf(m," dirty=%lu",md->dirty);

	if (md->pages != md->anon && md->pages != md->dirty)
		seq_printf(m, " mapped=%lu", md->pages);

	if (md->mapcount_max > 1)
		seq_printf(m, " mapmax=%lu", md->mapcount_max);

  	if (md->swapcache)
  		seq_printf(m," swapcache=%lu", md->swapcache);
  
  	if (md->active < md->pages && !is_vm_hugetlb_page(vma))
  		seq_printf(m," active=%lu", md->active);
  
  	if (md->writeback)
  		seq_printf(m," writeback=%lu", md->writeback);
	for_each_node_state(n, N_HIGH_MEMORY)
		if (md->node[n])
			seq_printf(m, " N%d=%lu", n, md->node[n]);
out:
	seq_putc(m, '\n');
  	kfree(md);
  
  	if (m->count < m->size)
		m->version = (vma != priv->tail_vma) ? vma->vm_start : 0;
  	return 0;
  }
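/*
 * Example of a resulting /proc/<pid>/numa_maps line (illustrative values):
 *
 *   2aaaaac000 interleave:0-3 file=/lib/libc-2.7.so mapped=12 mapmax=32 N0=4 N1=3 N2=2 N3=3
 *
 * i.e. the VMA start address, the effective policy string, optional
 * file/heap/stack annotations, the per-VMA counters gathered above and a
 * per-node page count for each node that has pages.
 */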