Commit d4c54919ed86302094c0ca7d48a8cbd4ee753e92

Authored by Naoya Horiguchi
Committed by Linus Torvalds
1 parent d54d14bfb4

mm: add !pte_present() check on existing hugetlb_entry callbacks

The page table walker doesn't check non-present hugetlb entries in the
common path, so hugetlb_entry() callbacks must check for them.  The
reason for this behavior is that some callers want to handle such
entries in their own way.

[ I think that reason is bogus, btw - it should just do what the regular
  code does, which is to call the "pte_hole()" function for such hugetlb
  entries  - Linus]

However, some callers don't check it now, which causes unpredictable
results, for example when we have a race between migrating a hugepage
and reading /proc/pid/numa_maps.  This patch fixes it by adding
!pte_present checks to the buggy callbacks.

This bug has existed for years and became visible with the introduction
of hugepage migration.

ChangeLog v2:
- fix if condition (check !pte_present() instead of pte_present())

Reported-by: Sasha Levin <sasha.levin@oracle.com>
Signed-off-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: <stable@vger.kernel.org> [3.12+]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
[ Backported to 3.15.  Signed-off-by: Josh Boyer <jwboyer@fedoraproject.org> ]
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
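
[ Editor's note: a minimal, illustrative sketch of the kind of guard the
  changelog describes.  This is NOT a hunk from this commit (the actual
  changed lines fall outside the portion of the file shown below); the
  callback name and body are hypothetical, but the signature follows the
  mm_walk hugetlb_entry() prototype of this kernel era. ]

	/*
	 * Illustrative sketch only -- not part of this commit.
	 * A hugetlb_entry() callback bails out on a non-present entry
	 * (e.g. one under migration) instead of dereferencing it.
	 */
	static int example_hugetlb_entry(pte_t *pte, unsigned long hmask,
			unsigned long addr, unsigned long end,
			struct mm_walk *walk)
	{
		pte_t huge_pte = huge_ptep_get(pte);
		struct page *page;

		/* Non-present entries have no stable page behind them;
		 * skip them rather than call pte_page() on them. */
		if (!pte_present(huge_pte))
			return 0;

		page = pte_page(huge_pte);
		/* ... gather per-page statistics from 'page' as before ... */
		return 0;
	}
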

Showing 2 changed files with 6 additions and 2 deletions

fs/proc/task_mmu.c

#include <linux/mm.h>
#include <linux/vmacache.h>
#include <linux/hugetlb.h>
#include <linux/huge_mm.h>
#include <linux/mount.h>
#include <linux/seq_file.h>
#include <linux/highmem.h>
#include <linux/ptrace.h>
#include <linux/slab.h>
#include <linux/pagemap.h>
#include <linux/mempolicy.h>
#include <linux/rmap.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/mmu_notifier.h>

#include <asm/elf.h>
#include <asm/uaccess.h>
#include <asm/tlbflush.h>
#include "internal.h"

void task_mem(struct seq_file *m, struct mm_struct *mm)
{
	unsigned long data, text, lib, swap;
	unsigned long hiwater_vm, total_vm, hiwater_rss, total_rss;

	/*
	 * Note: to minimize their overhead, mm maintains hiwater_vm and
	 * hiwater_rss only when about to *lower* total_vm or rss. Any
	 * collector of these hiwater stats must therefore get total_vm
	 * and rss too, which will usually be the higher. Barriers? not
	 * worth the effort, such snapshots can always be inconsistent.
	 */
	hiwater_vm = total_vm = mm->total_vm;
	if (hiwater_vm < mm->hiwater_vm)
		hiwater_vm = mm->hiwater_vm;
	hiwater_rss = total_rss = get_mm_rss(mm);
	if (hiwater_rss < mm->hiwater_rss)
		hiwater_rss = mm->hiwater_rss;

	data = mm->total_vm - mm->shared_vm - mm->stack_vm;
	text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK)) >> 10;
	lib = (mm->exec_vm << (PAGE_SHIFT-10)) - text;
	swap = get_mm_counter(mm, MM_SWAPENTS);
	seq_printf(m,
		"VmPeak:\t%8lu kB\n"
		"VmSize:\t%8lu kB\n"
		"VmLck:\t%8lu kB\n"
		"VmPin:\t%8lu kB\n"
		"VmHWM:\t%8lu kB\n"
		"VmRSS:\t%8lu kB\n"
		"VmData:\t%8lu kB\n"
		"VmStk:\t%8lu kB\n"
		"VmExe:\t%8lu kB\n"
		"VmLib:\t%8lu kB\n"
		"VmPTE:\t%8lu kB\n"
		"VmSwap:\t%8lu kB\n",
		hiwater_vm << (PAGE_SHIFT-10),
		total_vm << (PAGE_SHIFT-10),
		mm->locked_vm << (PAGE_SHIFT-10),
		mm->pinned_vm << (PAGE_SHIFT-10),
		hiwater_rss << (PAGE_SHIFT-10),
		total_rss << (PAGE_SHIFT-10),
		data << (PAGE_SHIFT-10),
		mm->stack_vm << (PAGE_SHIFT-10), text, lib,
		(PTRS_PER_PTE * sizeof(pte_t) *
		 atomic_long_read(&mm->nr_ptes)) >> 10,
		swap << (PAGE_SHIFT-10));
}

unsigned long task_vsize(struct mm_struct *mm)
{
	return PAGE_SIZE * mm->total_vm;
}

unsigned long task_statm(struct mm_struct *mm,
			 unsigned long *shared, unsigned long *text,
			 unsigned long *data, unsigned long *resident)
{
	*shared = get_mm_counter(mm, MM_FILEPAGES);
	*text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK))
								>> PAGE_SHIFT;
	*data = mm->total_vm - mm->shared_vm;
	*resident = *shared + get_mm_counter(mm, MM_ANONPAGES);
	return mm->total_vm;
}

#ifdef CONFIG_NUMA
/*
 * These functions are for numa_maps but called in generic **maps seq_file
 * ->start(), ->stop() ops.
 *
 * numa_maps scans all vmas under mmap_sem and checks their mempolicy.
 * Each mempolicy object is controlled by reference counting. The problem here
 * is how to avoid accessing dead mempolicy object.
 *
 * Because we're holding mmap_sem while reading seq_file, it's safe to access
 * each vma's mempolicy, no vma objects will never drop refs to mempolicy.
 *
 * A task's mempolicy (task->mempolicy) has different behavior. task->mempolicy
 * is set and replaced under mmap_sem but unrefed and cleared under task_lock().
 * So, without task_lock(), we cannot trust get_vma_policy() because we cannot
 * gurantee the task never exits under us. But taking task_lock() around
 * get_vma_plicy() causes lock order problem.
 *
 * To access task->mempolicy without lock, we hold a reference count of an
 * object pointed by task->mempolicy and remember it. This will guarantee
 * that task->mempolicy points to an alive object or NULL in numa_maps accesses.
 */
static void hold_task_mempolicy(struct proc_maps_private *priv)
{
	struct task_struct *task = priv->task;

	task_lock(task);
	priv->task_mempolicy = task->mempolicy;
	mpol_get(priv->task_mempolicy);
	task_unlock(task);
}
static void release_task_mempolicy(struct proc_maps_private *priv)
{
	mpol_put(priv->task_mempolicy);
}
#else
static void hold_task_mempolicy(struct proc_maps_private *priv)
{
}
static void release_task_mempolicy(struct proc_maps_private *priv)
{
}
#endif

static void vma_stop(struct proc_maps_private *priv, struct vm_area_struct *vma)
{
	if (vma && vma != priv->tail_vma) {
		struct mm_struct *mm = vma->vm_mm;
		release_task_mempolicy(priv);
		up_read(&mm->mmap_sem);
		mmput(mm);
	}
}

static void *m_start(struct seq_file *m, loff_t *pos)
{
	struct proc_maps_private *priv = m->private;
	unsigned long last_addr = m->version;
	struct mm_struct *mm;
	struct vm_area_struct *vma, *tail_vma = NULL;
	loff_t l = *pos;

	/* Clear the per syscall fields in priv */
	priv->task = NULL;
	priv->tail_vma = NULL;

	/*
	 * We remember last_addr rather than next_addr to hit with
	 * vmacache most of the time. We have zero last_addr at
	 * the beginning and also after lseek. We will have -1 last_addr
	 * after the end of the vmas.
	 */

	if (last_addr == -1UL)
		return NULL;

	priv->task = get_pid_task(priv->pid, PIDTYPE_PID);
	if (!priv->task)
		return ERR_PTR(-ESRCH);

	mm = mm_access(priv->task, PTRACE_MODE_READ);
	if (!mm || IS_ERR(mm))
		return mm;
	down_read(&mm->mmap_sem);

	tail_vma = get_gate_vma(priv->task->mm);
	priv->tail_vma = tail_vma;
	hold_task_mempolicy(priv);
	/* Start with last addr hint */
	vma = find_vma(mm, last_addr);
	if (last_addr && vma) {
		vma = vma->vm_next;
		goto out;
	}

	/*
	 * Check the vma index is within the range and do
	 * sequential scan until m_index.
	 */
	vma = NULL;
	if ((unsigned long)l < mm->map_count) {
		vma = mm->mmap;
		while (l-- && vma)
			vma = vma->vm_next;
		goto out;
	}

	if (l != mm->map_count)
		tail_vma = NULL; /* After gate vma */

out:
	if (vma)
		return vma;

	release_task_mempolicy(priv);
	/* End of vmas has been reached */
	m->version = (tail_vma != NULL)? 0: -1UL;
	up_read(&mm->mmap_sem);
	mmput(mm);
	return tail_vma;
}

static void *m_next(struct seq_file *m, void *v, loff_t *pos)
{
	struct proc_maps_private *priv = m->private;
	struct vm_area_struct *vma = v;
	struct vm_area_struct *tail_vma = priv->tail_vma;

	(*pos)++;
	if (vma && (vma != tail_vma) && vma->vm_next)
		return vma->vm_next;
	vma_stop(priv, vma);
	return (vma != tail_vma)? tail_vma: NULL;
}

static void m_stop(struct seq_file *m, void *v)
{
	struct proc_maps_private *priv = m->private;
	struct vm_area_struct *vma = v;

	if (!IS_ERR(vma))
		vma_stop(priv, vma);
	if (priv->task)
		put_task_struct(priv->task);
}

static int do_maps_open(struct inode *inode, struct file *file,
			const struct seq_operations *ops)
{
	struct proc_maps_private *priv;
	int ret = -ENOMEM;
	priv = kzalloc(sizeof(*priv), GFP_KERNEL);
	if (priv) {
		priv->pid = proc_pid(inode);
		ret = seq_open(file, ops);
		if (!ret) {
			struct seq_file *m = file->private_data;
			m->private = priv;
		} else {
			kfree(priv);
		}
	}
	return ret;
}

static void
show_map_vma(struct seq_file *m, struct vm_area_struct *vma, int is_pid)
{
	struct mm_struct *mm = vma->vm_mm;
	struct file *file = vma->vm_file;
	struct proc_maps_private *priv = m->private;
	struct task_struct *task = priv->task;
	vm_flags_t flags = vma->vm_flags;
	unsigned long ino = 0;
	unsigned long long pgoff = 0;
	unsigned long start, end;
	dev_t dev = 0;
	const char *name = NULL;

	if (file) {
		struct inode *inode = file_inode(vma->vm_file);
		dev = inode->i_sb->s_dev;
		ino = inode->i_ino;
		pgoff = ((loff_t)vma->vm_pgoff) << PAGE_SHIFT;
	}

	/* We don't show the stack guard page in /proc/maps */
	start = vma->vm_start;
	if (stack_guard_page_start(vma, start))
		start += PAGE_SIZE;
	end = vma->vm_end;
	if (stack_guard_page_end(vma, end))
		end -= PAGE_SIZE;

	seq_setwidth(m, 25 + sizeof(void *) * 6 - 1);
	seq_printf(m, "%08lx-%08lx %c%c%c%c %08llx %02x:%02x %lu ",
			start,
			end,
			flags & VM_READ ? 'r' : '-',
			flags & VM_WRITE ? 'w' : '-',
			flags & VM_EXEC ? 'x' : '-',
			flags & VM_MAYSHARE ? 's' : 'p',
			pgoff,
			MAJOR(dev), MINOR(dev), ino);

	/*
	 * Print the dentry name for named mappings, and a
	 * special [heap] marker for the heap:
	 */
	if (file) {
		seq_pad(m, ' ');
		seq_path(m, &file->f_path, "\n");
		goto done;
	}

	name = arch_vma_name(vma);
	if (!name) {
		pid_t tid;

		if (!mm) {
			name = "[vdso]";
			goto done;
		}

		if (vma->vm_start <= mm->brk &&
		    vma->vm_end >= mm->start_brk) {
			name = "[heap]";
			goto done;
		}

		tid = vm_is_stack(task, vma, is_pid);

		if (tid != 0) {
			/*
			 * Thread stack in /proc/PID/task/TID/maps or
			 * the main process stack.
			 */
			if (!is_pid || (vma->vm_start <= mm->start_stack &&
			    vma->vm_end >= mm->start_stack)) {
				name = "[stack]";
			} else {
				/* Thread stack in /proc/PID/maps */
				seq_pad(m, ' ');
				seq_printf(m, "[stack:%d]", tid);
			}
		}
	}

done:
	if (name) {
		seq_pad(m, ' ');
		seq_puts(m, name);
	}
	seq_putc(m, '\n');
}

static int show_map(struct seq_file *m, void *v, int is_pid)
{
	struct vm_area_struct *vma = v;
	struct proc_maps_private *priv = m->private;
	struct task_struct *task = priv->task;

	show_map_vma(m, vma, is_pid);

	if (m->count < m->size) /* vma is copied successfully */
		m->version = (vma != get_gate_vma(task->mm))
			? vma->vm_start : 0;
	return 0;
}

static int show_pid_map(struct seq_file *m, void *v)
{
	return show_map(m, v, 1);
}

static int show_tid_map(struct seq_file *m, void *v)
{
	return show_map(m, v, 0);
}

static const struct seq_operations proc_pid_maps_op = {
	.start	= m_start,
	.next	= m_next,
	.stop	= m_stop,
	.show	= show_pid_map
};

static const struct seq_operations proc_tid_maps_op = {
	.start	= m_start,
	.next	= m_next,
	.stop	= m_stop,
	.show	= show_tid_map
};

static int pid_maps_open(struct inode *inode, struct file *file)
{
	return do_maps_open(inode, file, &proc_pid_maps_op);
}

static int tid_maps_open(struct inode *inode, struct file *file)
{
	return do_maps_open(inode, file, &proc_tid_maps_op);
}

const struct file_operations proc_pid_maps_operations = {
	.open		= pid_maps_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= seq_release_private,
};

const struct file_operations proc_tid_maps_operations = {
	.open		= tid_maps_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= seq_release_private,
};

/*
 * Proportional Set Size(PSS): my share of RSS.
 *
 * PSS of a process is the count of pages it has in memory, where each
 * page is divided by the number of processes sharing it. So if a
 * process has 1000 pages all to itself, and 1000 shared with one other
 * process, its PSS will be 1500.
 *
 * To keep (accumulated) division errors low, we adopt a 64bit
 * fixed-point pss counter to minimize division errors. So (pss >>
 * PSS_SHIFT) would be the real byte count.
 *
 * A shift of 12 before division means (assuming 4K page size):
 * 	- 1M 3-user-pages add up to 8KB errors;
 * 	- supports mapcount up to 2^24, or 16M;
 * 	- supports PSS up to 2^52 bytes, or 4PB.
 */
#define PSS_SHIFT 12

#ifdef CONFIG_PROC_PAGE_MONITOR
struct mem_size_stats {
	struct vm_area_struct *vma;
	unsigned long resident;
	unsigned long shared_clean;
	unsigned long shared_dirty;
	unsigned long private_clean;
	unsigned long private_dirty;
	unsigned long referenced;
	unsigned long anonymous;
	unsigned long anonymous_thp;
	unsigned long swap;
	unsigned long nonlinear;
	u64 pss;
};


static void smaps_pte_entry(pte_t ptent, unsigned long addr,
		unsigned long ptent_size, struct mm_walk *walk)
{
	struct mem_size_stats *mss = walk->private;
	struct vm_area_struct *vma = mss->vma;
	pgoff_t pgoff = linear_page_index(vma, addr);
	struct page *page = NULL;
	int mapcount;

	if (pte_present(ptent)) {
		page = vm_normal_page(vma, addr, ptent);
	} else if (is_swap_pte(ptent)) {
		swp_entry_t swpent = pte_to_swp_entry(ptent);

		if (!non_swap_entry(swpent))
			mss->swap += ptent_size;
		else if (is_migration_entry(swpent))
			page = migration_entry_to_page(swpent);
	} else if (pte_file(ptent)) {
		if (pte_to_pgoff(ptent) != pgoff)
			mss->nonlinear += ptent_size;
	}

	if (!page)
		return;

	if (PageAnon(page))
		mss->anonymous += ptent_size;

	if (page->index != pgoff)
		mss->nonlinear += ptent_size;

	mss->resident += ptent_size;
	/* Accumulate the size in pages that have been accessed. */
	if (pte_young(ptent) || PageReferenced(page))
		mss->referenced += ptent_size;
	mapcount = page_mapcount(page);
	if (mapcount >= 2) {
		if (pte_dirty(ptent) || PageDirty(page))
			mss->shared_dirty += ptent_size;
		else
			mss->shared_clean += ptent_size;
		mss->pss += (ptent_size << PSS_SHIFT) / mapcount;
	} else {
		if (pte_dirty(ptent) || PageDirty(page))
			mss->private_dirty += ptent_size;
		else
			mss->private_clean += ptent_size;
		mss->pss += (ptent_size << PSS_SHIFT);
	}
}

static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
			   struct mm_walk *walk)
{
	struct mem_size_stats *mss = walk->private;
	struct vm_area_struct *vma = mss->vma;
	pte_t *pte;
	spinlock_t *ptl;

	if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
		smaps_pte_entry(*(pte_t *)pmd, addr, HPAGE_PMD_SIZE, walk);
		spin_unlock(ptl);
		mss->anonymous_thp += HPAGE_PMD_SIZE;
		return 0;
	}

	if (pmd_trans_unstable(pmd))
		return 0;
	/*
	 * The mmap_sem held all the way back in m_start() is what
	 * keeps khugepaged out of here and from collapsing things
	 * in here.
	 */
	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
	for (; addr != end; pte++, addr += PAGE_SIZE)
		smaps_pte_entry(*pte, addr, PAGE_SIZE, walk);
	pte_unmap_unlock(pte - 1, ptl);
	cond_resched();
	return 0;
}

static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma)
{
	/*
	 * Don't forget to update Documentation/ on changes.
	 */
	static const char mnemonics[BITS_PER_LONG][2] = {
		/*
		 * In case if we meet a flag we don't know about.
		 */
		[0 ... (BITS_PER_LONG-1)] = "??",

		[ilog2(VM_READ)]	= "rd",
		[ilog2(VM_WRITE)]	= "wr",
		[ilog2(VM_EXEC)]	= "ex",
		[ilog2(VM_SHARED)]	= "sh",
		[ilog2(VM_MAYREAD)]	= "mr",
		[ilog2(VM_MAYWRITE)]	= "mw",
		[ilog2(VM_MAYEXEC)]	= "me",
		[ilog2(VM_MAYSHARE)]	= "ms",
		[ilog2(VM_GROWSDOWN)]	= "gd",
		[ilog2(VM_PFNMAP)]	= "pf",
		[ilog2(VM_DENYWRITE)]	= "dw",
		[ilog2(VM_LOCKED)]	= "lo",
		[ilog2(VM_IO)]		= "io",
		[ilog2(VM_SEQ_READ)]	= "sr",
		[ilog2(VM_RAND_READ)]	= "rr",
		[ilog2(VM_DONTCOPY)]	= "dc",
		[ilog2(VM_DONTEXPAND)]	= "de",
		[ilog2(VM_ACCOUNT)]	= "ac",
		[ilog2(VM_NORESERVE)]	= "nr",
		[ilog2(VM_HUGETLB)]	= "ht",
		[ilog2(VM_NONLINEAR)]	= "nl",
		[ilog2(VM_ARCH_1)]	= "ar",
		[ilog2(VM_DONTDUMP)]	= "dd",
#ifdef CONFIG_MEM_SOFT_DIRTY
		[ilog2(VM_SOFTDIRTY)]	= "sd",
#endif
		[ilog2(VM_MIXEDMAP)]	= "mm",
		[ilog2(VM_HUGEPAGE)]	= "hg",
		[ilog2(VM_NOHUGEPAGE)]	= "nh",
		[ilog2(VM_MERGEABLE)]	= "mg",
	};
	size_t i;

	seq_puts(m, "VmFlags: ");
	for (i = 0; i < BITS_PER_LONG; i++) {
		if (vma->vm_flags & (1UL << i)) {
			seq_printf(m, "%c%c ",
				   mnemonics[i][0], mnemonics[i][1]);
		}
	}
	seq_putc(m, '\n');
}

static int show_smap(struct seq_file *m, void *v, int is_pid)
{
	struct proc_maps_private *priv = m->private;
	struct task_struct *task = priv->task;
	struct vm_area_struct *vma = v;
	struct mem_size_stats mss;
	struct mm_walk smaps_walk = {
		.pmd_entry = smaps_pte_range,
		.mm = vma->vm_mm,
		.private = &mss,
	};

	memset(&mss, 0, sizeof mss);
	mss.vma = vma;
	/* mmap_sem is held in m_start */
	if (vma->vm_mm && !is_vm_hugetlb_page(vma))
		walk_page_range(vma->vm_start, vma->vm_end, &smaps_walk);

	show_map_vma(m, vma, is_pid);

	seq_printf(m,
		   "Size: %8lu kB\n"
		   "Rss: %8lu kB\n"
		   "Pss: %8lu kB\n"
		   "Shared_Clean: %8lu kB\n"
		   "Shared_Dirty: %8lu kB\n"
		   "Private_Clean: %8lu kB\n"
		   "Private_Dirty: %8lu kB\n"
		   "Referenced: %8lu kB\n"
		   "Anonymous: %8lu kB\n"
		   "AnonHugePages: %8lu kB\n"
		   "Swap: %8lu kB\n"
		   "KernelPageSize: %8lu kB\n"
		   "MMUPageSize: %8lu kB\n"
		   "Locked: %8lu kB\n",
		   (vma->vm_end - vma->vm_start) >> 10,
		   mss.resident >> 10,
		   (unsigned long)(mss.pss >> (10 + PSS_SHIFT)),
		   mss.shared_clean >> 10,
		   mss.shared_dirty >> 10,
		   mss.private_clean >> 10,
		   mss.private_dirty >> 10,
		   mss.referenced >> 10,
		   mss.anonymous >> 10,
		   mss.anonymous_thp >> 10,
		   mss.swap >> 10,
		   vma_kernel_pagesize(vma) >> 10,
		   vma_mmu_pagesize(vma) >> 10,
		   (vma->vm_flags & VM_LOCKED) ?
			(unsigned long)(mss.pss >> (10 + PSS_SHIFT)) : 0);

	if (vma->vm_flags & VM_NONLINEAR)
		seq_printf(m, "Nonlinear: %8lu kB\n",
				mss.nonlinear >> 10);

	show_smap_vma_flags(m, vma);

	if (m->count < m->size) /* vma is copied successfully */
		m->version = (vma != get_gate_vma(task->mm))
			? vma->vm_start : 0;
	return 0;
}

static int show_pid_smap(struct seq_file *m, void *v)
{
	return show_smap(m, v, 1);
}

static int show_tid_smap(struct seq_file *m, void *v)
{
	return show_smap(m, v, 0);
}

static const struct seq_operations proc_pid_smaps_op = {
	.start	= m_start,
	.next	= m_next,
	.stop	= m_stop,
	.show	= show_pid_smap
};

static const struct seq_operations proc_tid_smaps_op = {
	.start	= m_start,
	.next	= m_next,
	.stop	= m_stop,
	.show	= show_tid_smap
};

static int pid_smaps_open(struct inode *inode, struct file *file)
{
	return do_maps_open(inode, file, &proc_pid_smaps_op);
}

static int tid_smaps_open(struct inode *inode, struct file *file)
{
	return do_maps_open(inode, file, &proc_tid_smaps_op);
}

const struct file_operations proc_pid_smaps_operations = {
	.open		= pid_smaps_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= seq_release_private,
};

const struct file_operations proc_tid_smaps_operations = {
	.open		= tid_smaps_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= seq_release_private,
};

/*
 * We do not want to have constant page-shift bits sitting in
 * pagemap entries and are about to reuse them some time soon.
 *
 * Here's the "migration strategy":
 * 1. when the system boots these bits remain what they are,
 *    but a warning about future change is printed in log;
 * 2. once anyone clears soft-dirty bits via clear_refs file,
 *    these flag is set to denote, that user is aware of the
 *    new API and those page-shift bits change their meaning.
 *    The respective warning is printed in dmesg;
 * 3. In a couple of releases we will remove all the mentions
 *    of page-shift in pagemap entries.
 */

static bool soft_dirty_cleared __read_mostly;

enum clear_refs_types {
	CLEAR_REFS_ALL = 1,
	CLEAR_REFS_ANON,
	CLEAR_REFS_MAPPED,
	CLEAR_REFS_SOFT_DIRTY,
	CLEAR_REFS_LAST,
};

struct clear_refs_private {
	struct vm_area_struct *vma;
	enum clear_refs_types type;
};

static inline void clear_soft_dirty(struct vm_area_struct *vma,
		unsigned long addr, pte_t *pte)
{
#ifdef CONFIG_MEM_SOFT_DIRTY
	/*
	 * The soft-dirty tracker uses #PF-s to catch writes
	 * to pages, so write-protect the pte as well. See the
	 * Documentation/vm/soft-dirty.txt for full description
	 * of how soft-dirty works.
	 */
	pte_t ptent = *pte;

	if (pte_present(ptent)) {
		ptent = pte_wrprotect(ptent);
		ptent = pte_clear_flags(ptent, _PAGE_SOFT_DIRTY);
	} else if (is_swap_pte(ptent)) {
		ptent = pte_swp_clear_soft_dirty(ptent);
	} else if (pte_file(ptent)) {
		ptent = pte_file_clear_soft_dirty(ptent);
	}

	if (vma->vm_flags & VM_SOFTDIRTY)
		vma->vm_flags &= ~VM_SOFTDIRTY;

	set_pte_at(vma->vm_mm, addr, pte, ptent);
#endif
}

static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr,
				unsigned long end, struct mm_walk *walk)
{
	struct clear_refs_private *cp = walk->private;
	struct vm_area_struct *vma = cp->vma;
	pte_t *pte, ptent;
	spinlock_t *ptl;
	struct page *page;

	split_huge_page_pmd(vma, addr, pmd);
	if (pmd_trans_unstable(pmd))
		return 0;

	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
	for (; addr != end; pte++, addr += PAGE_SIZE) {
		ptent = *pte;

		if (cp->type == CLEAR_REFS_SOFT_DIRTY) {
			clear_soft_dirty(vma, addr, pte);
			continue;
		}

		if (!pte_present(ptent))
			continue;

		page = vm_normal_page(vma, addr, ptent);
		if (!page)
			continue;

		/* Clear accessed and referenced bits. */
		ptep_test_and_clear_young(vma, addr, pte);
		ClearPageReferenced(page);
	}
	pte_unmap_unlock(pte - 1, ptl);
	cond_resched();
	return 0;
}

static ssize_t clear_refs_write(struct file *file, const char __user *buf,
				size_t count, loff_t *ppos)
{
	struct task_struct *task;
	char buffer[PROC_NUMBUF];
	struct mm_struct *mm;
	struct vm_area_struct *vma;
	enum clear_refs_types type;
	int itype;
	int rv;

	memset(buffer, 0, sizeof(buffer));
	if (count > sizeof(buffer) - 1)
		count = sizeof(buffer) - 1;
	if (copy_from_user(buffer, buf, count))
		return -EFAULT;
	rv = kstrtoint(strstrip(buffer), 10, &itype);
	if (rv < 0)
		return rv;
	type = (enum clear_refs_types)itype;
	if (type < CLEAR_REFS_ALL || type >= CLEAR_REFS_LAST)
		return -EINVAL;

	if (type == CLEAR_REFS_SOFT_DIRTY) {
		soft_dirty_cleared = true;
		pr_warn_once("The pagemap bits 55-60 has changed their meaning! "
			     "See the linux/Documentation/vm/pagemap.txt for details.\n");
	}

	task = get_proc_task(file_inode(file));
	if (!task)
		return -ESRCH;
	mm = get_task_mm(task);
	if (mm) {
		struct clear_refs_private cp = {
			.type = type,
		};
		struct mm_walk clear_refs_walk = {
			.pmd_entry = clear_refs_pte_range,
			.mm = mm,
			.private = &cp,
		};
		down_read(&mm->mmap_sem);
		if (type == CLEAR_REFS_SOFT_DIRTY)
			mmu_notifier_invalidate_range_start(mm, 0, -1);
		for (vma = mm->mmap; vma; vma = vma->vm_next) {
			cp.vma = vma;
			if (is_vm_hugetlb_page(vma))
				continue;
			/*
			 * Writing 1 to /proc/pid/clear_refs affects all pages.
			 *
			 * Writing 2 to /proc/pid/clear_refs only affects
			 * Anonymous pages.
			 *
			 * Writing 3 to /proc/pid/clear_refs only affects file
			 * mapped pages.
			 */
			if (type == CLEAR_REFS_ANON && vma->vm_file)
				continue;
			if (type == CLEAR_REFS_MAPPED && !vma->vm_file)
				continue;
			walk_page_range(vma->vm_start, vma->vm_end,
					&clear_refs_walk);
		}
		if (type == CLEAR_REFS_SOFT_DIRTY)
			mmu_notifier_invalidate_range_end(mm, 0, -1);
		flush_tlb_mm(mm);
		up_read(&mm->mmap_sem);
		mmput(mm);
	}
	put_task_struct(task);

	return count;
}

const struct file_operations proc_clear_refs_operations = {
	.write		= clear_refs_write,
	.llseek		= noop_llseek,
};

typedef struct {
	u64 pme;
} pagemap_entry_t;

struct pagemapread {
	int pos, len;	/* units: PM_ENTRY_BYTES, not bytes */
	pagemap_entry_t *buffer;
	bool v2;
};

#define PAGEMAP_WALK_SIZE	(PMD_SIZE)
#define PAGEMAP_WALK_MASK	(PMD_MASK)

#define PM_ENTRY_BYTES		sizeof(pagemap_entry_t)
#define PM_STATUS_BITS		3
#define PM_STATUS_OFFSET	(64 - PM_STATUS_BITS)
#define PM_STATUS_MASK		(((1LL << PM_STATUS_BITS) - 1) << PM_STATUS_OFFSET)
#define PM_STATUS(nr)		(((nr) << PM_STATUS_OFFSET) & PM_STATUS_MASK)
#define PM_PSHIFT_BITS		6
#define PM_PSHIFT_OFFSET	(PM_STATUS_OFFSET - PM_PSHIFT_BITS)
#define PM_PSHIFT_MASK		(((1LL << PM_PSHIFT_BITS) - 1) << PM_PSHIFT_OFFSET)
#define __PM_PSHIFT(x)		(((u64) (x) << PM_PSHIFT_OFFSET) & PM_PSHIFT_MASK)
#define PM_PFRAME_MASK		((1LL << PM_PSHIFT_OFFSET) - 1)
#define PM_PFRAME(x)		((x) & PM_PFRAME_MASK)
/* in "new" pagemap pshift bits are occupied with more status bits */
#define PM_STATUS2(v2, x)	(__PM_PSHIFT(v2 ? x : PAGE_SHIFT))

#define __PM_SOFT_DIRTY		(1LL)
#define PM_PRESENT		PM_STATUS(4LL)
#define PM_SWAP			PM_STATUS(2LL)
#define PM_FILE			PM_STATUS(1LL)
#define PM_NOT_PRESENT(v2)	PM_STATUS2(v2, 0)
#define PM_END_OF_BUFFER	1

static inline pagemap_entry_t make_pme(u64 val)
{
	return (pagemap_entry_t) { .pme = val };
}

static int add_to_pagemap(unsigned long addr, pagemap_entry_t *pme,
			  struct pagemapread *pm)
{
	pm->buffer[pm->pos++] = *pme;
	if (pm->pos >= pm->len)
		return PM_END_OF_BUFFER;
	return 0;
}

static int pagemap_pte_hole(unsigned long start, unsigned long end,
				struct mm_walk *walk)
{
	struct pagemapread *pm = walk->private;
	unsigned long addr;
	int err = 0;
	pagemap_entry_t pme = make_pme(PM_NOT_PRESENT(pm->v2));

	for (addr = start; addr < end; addr += PAGE_SIZE) {
		err = add_to_pagemap(addr, &pme, pm);
		if (err)
			break;
	}
	return err;
}

static void pte_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm,
		struct vm_area_struct *vma, unsigned long addr, pte_t pte)
{
	u64 frame, flags;
	struct page *page = NULL;
	int flags2 = 0;

	if (pte_present(pte)) {
		frame = pte_pfn(pte);
		flags = PM_PRESENT;
		page = vm_normal_page(vma, addr, pte);
		if (pte_soft_dirty(pte))
			flags2 |= __PM_SOFT_DIRTY;
	} else if (is_swap_pte(pte)) {
		swp_entry_t entry;
		if (pte_swp_soft_dirty(pte))
			flags2 |= __PM_SOFT_DIRTY;
		entry = pte_to_swp_entry(pte);
		frame = swp_type(entry) |
			(swp_offset(entry) << MAX_SWAPFILES_SHIFT);
		flags = PM_SWAP;
		if (is_migration_entry(entry))
			page = migration_entry_to_page(entry);
	} else {
		if (vma->vm_flags & VM_SOFTDIRTY)
			flags2 |= __PM_SOFT_DIRTY;
		*pme = make_pme(PM_NOT_PRESENT(pm->v2) | PM_STATUS2(pm->v2, flags2));
		return;
	}

	if (page && !PageAnon(page))
		flags |= PM_FILE;
	if ((vma->vm_flags & VM_SOFTDIRTY))
		flags2 |= __PM_SOFT_DIRTY;

	*pme = make_pme(PM_PFRAME(frame) | PM_STATUS2(pm->v2, flags2) | flags);
}

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
static void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm,
		pmd_t pmd, int offset, int pmd_flags2)
{
	/*
	 * Currently pmd for thp is always present because thp can not be
	 * swapped-out, migrated, or HWPOISONed (split in such cases instead.)
	 * This if-check is just to prepare for future implementation.
	 */
	if (pmd_present(pmd))
		*pme = make_pme(PM_PFRAME(pmd_pfn(pmd) + offset)
				| PM_STATUS2(pm->v2, pmd_flags2) | PM_PRESENT);
	else
		*pme = make_pme(PM_NOT_PRESENT(pm->v2) | PM_STATUS2(pm->v2, pmd_flags2));
}
#else
static inline void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm,
		pmd_t pmd, int offset, int pmd_flags2)
{
}
#endif

static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
			     struct mm_walk *walk)
{
	struct vm_area_struct *vma;
	struct pagemapread *pm = walk->private;
	spinlock_t *ptl;
	pte_t *pte;
	int err = 0;
	pagemap_entry_t pme = make_pme(PM_NOT_PRESENT(pm->v2));

	/* find the first VMA at or above 'addr' */
	vma = find_vma(walk->mm, addr);
	if (vma && pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
		int pmd_flags2;

		if ((vma->vm_flags & VM_SOFTDIRTY) || pmd_soft_dirty(*pmd))
			pmd_flags2 = __PM_SOFT_DIRTY;
		else
			pmd_flags2 = 0;

		for (; addr != end; addr += PAGE_SIZE) {
			unsigned long offset;

			offset = (addr & ~PAGEMAP_WALK_MASK) >>
					PAGE_SHIFT;
			thp_pmd_to_pagemap_entry(&pme, pm, *pmd, offset, pmd_flags2);
			err = add_to_pagemap(addr, &pme, pm);
			if (err)
				break;
		}
		spin_unlock(ptl);
		return err;
	}

	if (pmd_trans_unstable(pmd))
		return 0;
	for (; addr != end; addr += PAGE_SIZE) {
		int flags2;

		/* check to see if we've left 'vma' behind
		 * and need a new, higher one */
		if (vma && (addr >= vma->vm_end)) {
			vma = find_vma(walk->mm, addr);
			if (vma && (vma->vm_flags & VM_SOFTDIRTY))
				flags2 = __PM_SOFT_DIRTY;
1035 else 1035 else
1036 flags2 = 0; 1036 flags2 = 0;
1037 pme = make_pme(PM_NOT_PRESENT(pm->v2) | PM_STATUS2(pm->v2, flags2)); 1037 pme = make_pme(PM_NOT_PRESENT(pm->v2) | PM_STATUS2(pm->v2, flags2));
1038 } 1038 }
1039 1039
1040 /* check that 'vma' actually covers this address, 1040 /* check that 'vma' actually covers this address,
1041 * and that it isn't a huge page vma */ 1041 * and that it isn't a huge page vma */
1042 if (vma && (vma->vm_start <= addr) && 1042 if (vma && (vma->vm_start <= addr) &&
1043 !is_vm_hugetlb_page(vma)) { 1043 !is_vm_hugetlb_page(vma)) {
1044 pte = pte_offset_map(pmd, addr); 1044 pte = pte_offset_map(pmd, addr);
1045 pte_to_pagemap_entry(&pme, pm, vma, addr, *pte); 1045 pte_to_pagemap_entry(&pme, pm, vma, addr, *pte);
1046 /* unmap before userspace copy */ 1046 /* unmap before userspace copy */
1047 pte_unmap(pte); 1047 pte_unmap(pte);
1048 } 1048 }
1049 err = add_to_pagemap(addr, &pme, pm); 1049 err = add_to_pagemap(addr, &pme, pm);
1050 if (err) 1050 if (err)
1051 return err; 1051 return err;
1052 } 1052 }
1053 1053
1054 cond_resched(); 1054 cond_resched();
1055 1055
1056 return err; 1056 return err;
1057 } 1057 }
1058 1058
1059 #ifdef CONFIG_HUGETLB_PAGE 1059 #ifdef CONFIG_HUGETLB_PAGE
1060 static void huge_pte_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm, 1060 static void huge_pte_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm,
1061 pte_t pte, int offset, int flags2) 1061 pte_t pte, int offset, int flags2)
1062 { 1062 {
1063 if (pte_present(pte)) 1063 if (pte_present(pte))
1064 *pme = make_pme(PM_PFRAME(pte_pfn(pte) + offset) | 1064 *pme = make_pme(PM_PFRAME(pte_pfn(pte) + offset) |
1065 PM_STATUS2(pm->v2, flags2) | 1065 PM_STATUS2(pm->v2, flags2) |
1066 PM_PRESENT); 1066 PM_PRESENT);
1067 else 1067 else
1068 *pme = make_pme(PM_NOT_PRESENT(pm->v2) | 1068 *pme = make_pme(PM_NOT_PRESENT(pm->v2) |
1069 PM_STATUS2(pm->v2, flags2)); 1069 PM_STATUS2(pm->v2, flags2));
1070 } 1070 }
1071 1071
1072 /* This function walks within one hugetlb entry in a single call */ 1072 /* This function walks within one hugetlb entry in a single call */
1073 static int pagemap_hugetlb_range(pte_t *pte, unsigned long hmask, 1073 static int pagemap_hugetlb_range(pte_t *pte, unsigned long hmask,
1074 unsigned long addr, unsigned long end, 1074 unsigned long addr, unsigned long end,
1075 struct mm_walk *walk) 1075 struct mm_walk *walk)
1076 { 1076 {
1077 struct pagemapread *pm = walk->private; 1077 struct pagemapread *pm = walk->private;
1078 struct vm_area_struct *vma; 1078 struct vm_area_struct *vma;
1079 int err = 0; 1079 int err = 0;
1080 int flags2; 1080 int flags2;
1081 pagemap_entry_t pme; 1081 pagemap_entry_t pme;
1082 1082
1083 vma = find_vma(walk->mm, addr); 1083 vma = find_vma(walk->mm, addr);
1084 WARN_ON_ONCE(!vma); 1084 WARN_ON_ONCE(!vma);
1085 1085
1086 if (vma && (vma->vm_flags & VM_SOFTDIRTY)) 1086 if (vma && (vma->vm_flags & VM_SOFTDIRTY))
1087 flags2 = __PM_SOFT_DIRTY; 1087 flags2 = __PM_SOFT_DIRTY;
1088 else 1088 else
1089 flags2 = 0; 1089 flags2 = 0;
1090 1090
1091 for (; addr != end; addr += PAGE_SIZE) { 1091 for (; addr != end; addr += PAGE_SIZE) {
1092 int offset = (addr & ~hmask) >> PAGE_SHIFT; 1092 int offset = (addr & ~hmask) >> PAGE_SHIFT;
1093 huge_pte_to_pagemap_entry(&pme, pm, *pte, offset, flags2); 1093 huge_pte_to_pagemap_entry(&pme, pm, *pte, offset, flags2);
1094 err = add_to_pagemap(addr, &pme, pm); 1094 err = add_to_pagemap(addr, &pme, pm);
1095 if (err) 1095 if (err)
1096 return err; 1096 return err;
1097 } 1097 }
1098 1098
1099 cond_resched(); 1099 cond_resched();
1100 1100
1101 return err; 1101 return err;
1102 } 1102 }
1103 #endif /* HUGETLB_PAGE */ 1103 #endif /* HUGETLB_PAGE */
1104 1104
1105 /* 1105 /*
1106 * /proc/pid/pagemap - an array mapping virtual pages to pfns 1106 * /proc/pid/pagemap - an array mapping virtual pages to pfns
1107 * 1107 *
1108 * For each page in the address space, this file contains one 64-bit entry 1108 * For each page in the address space, this file contains one 64-bit entry
1109 * consisting of the following: 1109 * consisting of the following:
1110 * 1110 *
1111 * Bits 0-54 page frame number (PFN) if present 1111 * Bits 0-54 page frame number (PFN) if present
1112 * Bits 0-4 swap type if swapped 1112 * Bits 0-4 swap type if swapped
1113 * Bits 5-54 swap offset if swapped 1113 * Bits 5-54 swap offset if swapped
1114 * Bits 55-60 page shift (page size = 1<<page shift) 1114 * Bits 55-60 page shift (page size = 1<<page shift)
1115 * Bit 61 page is file-page or shared-anon 1115 * Bit 61 page is file-page or shared-anon
1116 * Bit 62 page swapped 1116 * Bit 62 page swapped
1117 * Bit 63 page present 1117 * Bit 63 page present
1118 * 1118 *
1119 * If the page is not present but in swap, then the PFN contains an 1119 * If the page is not present but in swap, then the PFN contains an
1120 * encoding of the swap file number and the page's offset into the 1120 * encoding of the swap file number and the page's offset into the
1121 * swap. Unmapped pages return a null PFN. This allows determining 1121 * swap. Unmapped pages return a null PFN. This allows determining
1122 * precisely which pages are mapped (or in swap) and comparing mapped 1122 * precisely which pages are mapped (or in swap) and comparing mapped
1123 * pages between processes. 1123 * pages between processes.
1124 * 1124 *
1125 * Efficient users of this interface will use /proc/pid/maps to 1125 * Efficient users of this interface will use /proc/pid/maps to
1126 * determine which areas of memory are actually mapped and llseek to 1126 * determine which areas of memory are actually mapped and llseek to
1127 * skip over unmapped regions. 1127 * skip over unmapped regions.
1128 */ 1128 */
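As a side note to the layout documented above, a user-space reader can decode one pagemap entry directly from /proc/<pid>/pagemap. The following is a minimal sketch, not kernel code; the helper name dump_pagemap_entry() and the PAGEMAP_* masks are made up here from the bit layout in the comment, and error handling is kept to a minimum.

#include <stdint.h>
#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/types.h>

#define PAGEMAP_PFN_MASK	((1ULL << 55) - 1)	/* bits 0-54: PFN (or swap bits) */
#define PAGEMAP_FILE_SHARED	(1ULL << 61)		/* file-page or shared-anon */
#define PAGEMAP_SWAPPED		(1ULL << 62)
#define PAGEMAP_PRESENT		(1ULL << 63)

/* Hypothetical helper: print the pagemap entry covering 'vaddr' of 'pid'. */
int dump_pagemap_entry(pid_t pid, unsigned long vaddr)
{
	char path[64];
	uint64_t pme;
	long psize = sysconf(_SC_PAGESIZE);
	off_t off = (off_t)(vaddr / psize) * sizeof(pme);
	int fd;

	snprintf(path, sizeof(path), "/proc/%d/pagemap", (int)pid);
	fd = open(path, O_RDONLY);
	if (fd < 0)
		return -1;
	if (pread(fd, &pme, sizeof(pme), off) != (ssize_t)sizeof(pme)) {
		close(fd);
		return -1;
	}
	close(fd);

	if (pme & PAGEMAP_PRESENT)
		printf("present pfn=0x%llx%s\n",
		       (unsigned long long)(pme & PAGEMAP_PFN_MASK),
		       (pme & PAGEMAP_FILE_SHARED) ? " (file/shared-anon)" : "");
	else if (pme & PAGEMAP_SWAPPED)
		printf("swapped type=%llu offset=0x%llx\n",
		       (unsigned long long)(pme & 0x1f),		/* bits 0-4 */
		       (unsigned long long)((pme >> 5) & ((1ULL << 50) - 1)));	/* bits 5-54 */
	else
		printf("not present\n");
	return 0;
}

Compiled into a small tool, a caller might pass getpid() and the address of a local variable to check whether its own stack page is resident, which mirrors the "use /proc/pid/maps, then llseek" usage pattern described in the comment.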
1129 static ssize_t pagemap_read(struct file *file, char __user *buf, 1129 static ssize_t pagemap_read(struct file *file, char __user *buf,
1130 size_t count, loff_t *ppos) 1130 size_t count, loff_t *ppos)
1131 { 1131 {
1132 struct task_struct *task = get_proc_task(file_inode(file)); 1132 struct task_struct *task = get_proc_task(file_inode(file));
1133 struct mm_struct *mm; 1133 struct mm_struct *mm;
1134 struct pagemapread pm; 1134 struct pagemapread pm;
1135 int ret = -ESRCH; 1135 int ret = -ESRCH;
1136 struct mm_walk pagemap_walk = {}; 1136 struct mm_walk pagemap_walk = {};
1137 unsigned long src; 1137 unsigned long src;
1138 unsigned long svpfn; 1138 unsigned long svpfn;
1139 unsigned long start_vaddr; 1139 unsigned long start_vaddr;
1140 unsigned long end_vaddr; 1140 unsigned long end_vaddr;
1141 int copied = 0; 1141 int copied = 0;
1142 1142
1143 if (!task) 1143 if (!task)
1144 goto out; 1144 goto out;
1145 1145
1146 ret = -EINVAL; 1146 ret = -EINVAL;
1147 /* file position must be aligned */ 1147 /* file position must be aligned */
1148 if ((*ppos % PM_ENTRY_BYTES) || (count % PM_ENTRY_BYTES)) 1148 if ((*ppos % PM_ENTRY_BYTES) || (count % PM_ENTRY_BYTES))
1149 goto out_task; 1149 goto out_task;
1150 1150
1151 ret = 0; 1151 ret = 0;
1152 if (!count) 1152 if (!count)
1153 goto out_task; 1153 goto out_task;
1154 1154
1155 pm.v2 = soft_dirty_cleared; 1155 pm.v2 = soft_dirty_cleared;
1156 pm.len = (PAGEMAP_WALK_SIZE >> PAGE_SHIFT); 1156 pm.len = (PAGEMAP_WALK_SIZE >> PAGE_SHIFT);
1157 pm.buffer = kmalloc(pm.len * PM_ENTRY_BYTES, GFP_TEMPORARY); 1157 pm.buffer = kmalloc(pm.len * PM_ENTRY_BYTES, GFP_TEMPORARY);
1158 ret = -ENOMEM; 1158 ret = -ENOMEM;
1159 if (!pm.buffer) 1159 if (!pm.buffer)
1160 goto out_task; 1160 goto out_task;
1161 1161
1162 mm = mm_access(task, PTRACE_MODE_READ); 1162 mm = mm_access(task, PTRACE_MODE_READ);
1163 ret = PTR_ERR(mm); 1163 ret = PTR_ERR(mm);
1164 if (!mm || IS_ERR(mm)) 1164 if (!mm || IS_ERR(mm))
1165 goto out_free; 1165 goto out_free;
1166 1166
1167 pagemap_walk.pmd_entry = pagemap_pte_range; 1167 pagemap_walk.pmd_entry = pagemap_pte_range;
1168 pagemap_walk.pte_hole = pagemap_pte_hole; 1168 pagemap_walk.pte_hole = pagemap_pte_hole;
1169 #ifdef CONFIG_HUGETLB_PAGE 1169 #ifdef CONFIG_HUGETLB_PAGE
1170 pagemap_walk.hugetlb_entry = pagemap_hugetlb_range; 1170 pagemap_walk.hugetlb_entry = pagemap_hugetlb_range;
1171 #endif 1171 #endif
1172 pagemap_walk.mm = mm; 1172 pagemap_walk.mm = mm;
1173 pagemap_walk.private = &pm; 1173 pagemap_walk.private = &pm;
1174 1174
1175 src = *ppos; 1175 src = *ppos;
1176 svpfn = src / PM_ENTRY_BYTES; 1176 svpfn = src / PM_ENTRY_BYTES;
1177 start_vaddr = svpfn << PAGE_SHIFT; 1177 start_vaddr = svpfn << PAGE_SHIFT;
1178 end_vaddr = TASK_SIZE_OF(task); 1178 end_vaddr = TASK_SIZE_OF(task);
1179 1179
1180 /* watch out for wraparound */ 1180 /* watch out for wraparound */
1181 if (svpfn > TASK_SIZE_OF(task) >> PAGE_SHIFT) 1181 if (svpfn > TASK_SIZE_OF(task) >> PAGE_SHIFT)
1182 start_vaddr = end_vaddr; 1182 start_vaddr = end_vaddr;
1183 1183
1184 /* 1184 /*
1185 * The odds are that this will stop walking way 1185 * The odds are that this will stop walking way
1186 * before end_vaddr, because the length of the 1186 * before end_vaddr, because the length of the
1187 * user buffer is tracked in "pm", and the walk 1187 * user buffer is tracked in "pm", and the walk
1188 * will stop when we hit the end of the buffer. 1188 * will stop when we hit the end of the buffer.
1189 */ 1189 */
1190 ret = 0; 1190 ret = 0;
1191 while (count && (start_vaddr < end_vaddr)) { 1191 while (count && (start_vaddr < end_vaddr)) {
1192 int len; 1192 int len;
1193 unsigned long end; 1193 unsigned long end;
1194 1194
1195 pm.pos = 0; 1195 pm.pos = 0;
1196 end = (start_vaddr + PAGEMAP_WALK_SIZE) & PAGEMAP_WALK_MASK; 1196 end = (start_vaddr + PAGEMAP_WALK_SIZE) & PAGEMAP_WALK_MASK;
1197 /* overflow ? */ 1197 /* overflow ? */
1198 if (end < start_vaddr || end > end_vaddr) 1198 if (end < start_vaddr || end > end_vaddr)
1199 end = end_vaddr; 1199 end = end_vaddr;
1200 down_read(&mm->mmap_sem); 1200 down_read(&mm->mmap_sem);
1201 ret = walk_page_range(start_vaddr, end, &pagemap_walk); 1201 ret = walk_page_range(start_vaddr, end, &pagemap_walk);
1202 up_read(&mm->mmap_sem); 1202 up_read(&mm->mmap_sem);
1203 start_vaddr = end; 1203 start_vaddr = end;
1204 1204
1205 len = min(count, PM_ENTRY_BYTES * pm.pos); 1205 len = min(count, PM_ENTRY_BYTES * pm.pos);
1206 if (copy_to_user(buf, pm.buffer, len)) { 1206 if (copy_to_user(buf, pm.buffer, len)) {
1207 ret = -EFAULT; 1207 ret = -EFAULT;
1208 goto out_mm; 1208 goto out_mm;
1209 } 1209 }
1210 copied += len; 1210 copied += len;
1211 buf += len; 1211 buf += len;
1212 count -= len; 1212 count -= len;
1213 } 1213 }
1214 *ppos += copied; 1214 *ppos += copied;
1215 if (!ret || ret == PM_END_OF_BUFFER) 1215 if (!ret || ret == PM_END_OF_BUFFER)
1216 ret = copied; 1216 ret = copied;
1217 1217
1218 out_mm: 1218 out_mm:
1219 mmput(mm); 1219 mmput(mm);
1220 out_free: 1220 out_free:
1221 kfree(pm.buffer); 1221 kfree(pm.buffer);
1222 out_task: 1222 out_task:
1223 put_task_struct(task); 1223 put_task_struct(task);
1224 out: 1224 out:
1225 return ret; 1225 return ret;
1226 } 1226 }
1227 1227
1228 static int pagemap_open(struct inode *inode, struct file *file) 1228 static int pagemap_open(struct inode *inode, struct file *file)
1229 { 1229 {
1230 pr_warn_once("Bits 55-60 of /proc/PID/pagemap entries are about " 1230 pr_warn_once("Bits 55-60 of /proc/PID/pagemap entries are about "
1231 "to stop being page-shift some time soon. See the " 1231 "to stop being page-shift some time soon. See the "
1232 "linux/Documentation/vm/pagemap.txt for details.\n"); 1232 "linux/Documentation/vm/pagemap.txt for details.\n");
1233 return 0; 1233 return 0;
1234 } 1234 }
1235 1235
1236 const struct file_operations proc_pagemap_operations = { 1236 const struct file_operations proc_pagemap_operations = {
1237 .llseek = mem_lseek, /* borrow this */ 1237 .llseek = mem_lseek, /* borrow this */
1238 .read = pagemap_read, 1238 .read = pagemap_read,
1239 .open = pagemap_open, 1239 .open = pagemap_open,
1240 }; 1240 };
1241 #endif /* CONFIG_PROC_PAGE_MONITOR */ 1241 #endif /* CONFIG_PROC_PAGE_MONITOR */
1242 1242
1243 #ifdef CONFIG_NUMA 1243 #ifdef CONFIG_NUMA
1244 1244
1245 struct numa_maps { 1245 struct numa_maps {
1246 struct vm_area_struct *vma; 1246 struct vm_area_struct *vma;
1247 unsigned long pages; 1247 unsigned long pages;
1248 unsigned long anon; 1248 unsigned long anon;
1249 unsigned long active; 1249 unsigned long active;
1250 unsigned long writeback; 1250 unsigned long writeback;
1251 unsigned long mapcount_max; 1251 unsigned long mapcount_max;
1252 unsigned long dirty; 1252 unsigned long dirty;
1253 unsigned long swapcache; 1253 unsigned long swapcache;
1254 unsigned long node[MAX_NUMNODES]; 1254 unsigned long node[MAX_NUMNODES];
1255 }; 1255 };
1256 1256
1257 struct numa_maps_private { 1257 struct numa_maps_private {
1258 struct proc_maps_private proc_maps; 1258 struct proc_maps_private proc_maps;
1259 struct numa_maps md; 1259 struct numa_maps md;
1260 }; 1260 };
1261 1261
1262 static void gather_stats(struct page *page, struct numa_maps *md, int pte_dirty, 1262 static void gather_stats(struct page *page, struct numa_maps *md, int pte_dirty,
1263 unsigned long nr_pages) 1263 unsigned long nr_pages)
1264 { 1264 {
1265 int count = page_mapcount(page); 1265 int count = page_mapcount(page);
1266 1266
1267 md->pages += nr_pages; 1267 md->pages += nr_pages;
1268 if (pte_dirty || PageDirty(page)) 1268 if (pte_dirty || PageDirty(page))
1269 md->dirty += nr_pages; 1269 md->dirty += nr_pages;
1270 1270
1271 if (PageSwapCache(page)) 1271 if (PageSwapCache(page))
1272 md->swapcache += nr_pages; 1272 md->swapcache += nr_pages;
1273 1273
1274 if (PageActive(page) || PageUnevictable(page)) 1274 if (PageActive(page) || PageUnevictable(page))
1275 md->active += nr_pages; 1275 md->active += nr_pages;
1276 1276
1277 if (PageWriteback(page)) 1277 if (PageWriteback(page))
1278 md->writeback += nr_pages; 1278 md->writeback += nr_pages;
1279 1279
1280 if (PageAnon(page)) 1280 if (PageAnon(page))
1281 md->anon += nr_pages; 1281 md->anon += nr_pages;
1282 1282
1283 if (count > md->mapcount_max) 1283 if (count > md->mapcount_max)
1284 md->mapcount_max = count; 1284 md->mapcount_max = count;
1285 1285
1286 md->node[page_to_nid(page)] += nr_pages; 1286 md->node[page_to_nid(page)] += nr_pages;
1287 } 1287 }
1288 1288
1289 static struct page *can_gather_numa_stats(pte_t pte, struct vm_area_struct *vma, 1289 static struct page *can_gather_numa_stats(pte_t pte, struct vm_area_struct *vma,
1290 unsigned long addr) 1290 unsigned long addr)
1291 { 1291 {
1292 struct page *page; 1292 struct page *page;
1293 int nid; 1293 int nid;
1294 1294
1295 if (!pte_present(pte)) 1295 if (!pte_present(pte))
1296 return NULL; 1296 return NULL;
1297 1297
1298 page = vm_normal_page(vma, addr, pte); 1298 page = vm_normal_page(vma, addr, pte);
1299 if (!page) 1299 if (!page)
1300 return NULL; 1300 return NULL;
1301 1301
1302 if (PageReserved(page)) 1302 if (PageReserved(page))
1303 return NULL; 1303 return NULL;
1304 1304
1305 nid = page_to_nid(page); 1305 nid = page_to_nid(page);
1306 if (!node_isset(nid, node_states[N_MEMORY])) 1306 if (!node_isset(nid, node_states[N_MEMORY]))
1307 return NULL; 1307 return NULL;
1308 1308
1309 return page; 1309 return page;
1310 } 1310 }
1311 1311
1312 static int gather_pte_stats(pmd_t *pmd, unsigned long addr, 1312 static int gather_pte_stats(pmd_t *pmd, unsigned long addr,
1313 unsigned long end, struct mm_walk *walk) 1313 unsigned long end, struct mm_walk *walk)
1314 { 1314 {
1315 struct numa_maps *md; 1315 struct numa_maps *md;
1316 spinlock_t *ptl; 1316 spinlock_t *ptl;
1317 pte_t *orig_pte; 1317 pte_t *orig_pte;
1318 pte_t *pte; 1318 pte_t *pte;
1319 1319
1320 md = walk->private; 1320 md = walk->private;
1321 1321
1322 if (pmd_trans_huge_lock(pmd, md->vma, &ptl) == 1) { 1322 if (pmd_trans_huge_lock(pmd, md->vma, &ptl) == 1) {
1323 pte_t huge_pte = *(pte_t *)pmd; 1323 pte_t huge_pte = *(pte_t *)pmd;
1324 struct page *page; 1324 struct page *page;
1325 1325
1326 page = can_gather_numa_stats(huge_pte, md->vma, addr); 1326 page = can_gather_numa_stats(huge_pte, md->vma, addr);
1327 if (page) 1327 if (page)
1328 gather_stats(page, md, pte_dirty(huge_pte), 1328 gather_stats(page, md, pte_dirty(huge_pte),
1329 HPAGE_PMD_SIZE/PAGE_SIZE); 1329 HPAGE_PMD_SIZE/PAGE_SIZE);
1330 spin_unlock(ptl); 1330 spin_unlock(ptl);
1331 return 0; 1331 return 0;
1332 } 1332 }
1333 1333
1334 if (pmd_trans_unstable(pmd)) 1334 if (pmd_trans_unstable(pmd))
1335 return 0; 1335 return 0;
1336 orig_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl); 1336 orig_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
1337 do { 1337 do {
1338 struct page *page = can_gather_numa_stats(*pte, md->vma, addr); 1338 struct page *page = can_gather_numa_stats(*pte, md->vma, addr);
1339 if (!page) 1339 if (!page)
1340 continue; 1340 continue;
1341 gather_stats(page, md, pte_dirty(*pte), 1); 1341 gather_stats(page, md, pte_dirty(*pte), 1);
1342 1342
1343 } while (pte++, addr += PAGE_SIZE, addr != end); 1343 } while (pte++, addr += PAGE_SIZE, addr != end);
1344 pte_unmap_unlock(orig_pte, ptl); 1344 pte_unmap_unlock(orig_pte, ptl);
1345 return 0; 1345 return 0;
1346 } 1346 }
1347 #ifdef CONFIG_HUGETLB_PAGE 1347 #ifdef CONFIG_HUGETLB_PAGE
1348 static int gather_hugetbl_stats(pte_t *pte, unsigned long hmask, 1348 static int gather_hugetbl_stats(pte_t *pte, unsigned long hmask,
1349 unsigned long addr, unsigned long end, struct mm_walk *walk) 1349 unsigned long addr, unsigned long end, struct mm_walk *walk)
1350 { 1350 {
1351 struct numa_maps *md; 1351 struct numa_maps *md;
1352 struct page *page; 1352 struct page *page;
1353 1353
1354 if (pte_none(*pte)) 1354 if (!pte_present(*pte))
1355 return 0; 1355 return 0;
1356 1356
1357 page = pte_page(*pte); 1357 page = pte_page(*pte);
1358 if (!page) 1358 if (!page)
1359 return 0; 1359 return 0;
1360 1360
1361 md = walk->private; 1361 md = walk->private;
1362 gather_stats(page, md, pte_dirty(*pte), 1); 1362 gather_stats(page, md, pte_dirty(*pte), 1);
1363 return 0; 1363 return 0;
1364 } 1364 }
1365 1365
1366 #else 1366 #else
1367 static int gather_hugetbl_stats(pte_t *pte, unsigned long hmask, 1367 static int gather_hugetbl_stats(pte_t *pte, unsigned long hmask,
1368 unsigned long addr, unsigned long end, struct mm_walk *walk) 1368 unsigned long addr, unsigned long end, struct mm_walk *walk)
1369 { 1369 {
1370 return 0; 1370 return 0;
1371 } 1371 }
1372 #endif 1372 #endif
1373 1373
1374 /* 1374 /*
1375 * Display pages allocated per node and memory policy via /proc. 1375 * Display pages allocated per node and memory policy via /proc.
1376 */ 1376 */
1377 static int show_numa_map(struct seq_file *m, void *v, int is_pid) 1377 static int show_numa_map(struct seq_file *m, void *v, int is_pid)
1378 { 1378 {
1379 struct numa_maps_private *numa_priv = m->private; 1379 struct numa_maps_private *numa_priv = m->private;
1380 struct proc_maps_private *proc_priv = &numa_priv->proc_maps; 1380 struct proc_maps_private *proc_priv = &numa_priv->proc_maps;
1381 struct vm_area_struct *vma = v; 1381 struct vm_area_struct *vma = v;
1382 struct numa_maps *md = &numa_priv->md; 1382 struct numa_maps *md = &numa_priv->md;
1383 struct file *file = vma->vm_file; 1383 struct file *file = vma->vm_file;
1384 struct task_struct *task = proc_priv->task; 1384 struct task_struct *task = proc_priv->task;
1385 struct mm_struct *mm = vma->vm_mm; 1385 struct mm_struct *mm = vma->vm_mm;
1386 struct mm_walk walk = {}; 1386 struct mm_walk walk = {};
1387 struct mempolicy *pol; 1387 struct mempolicy *pol;
1388 char buffer[64]; 1388 char buffer[64];
1389 int nid; 1389 int nid;
1390 1390
1391 if (!mm) 1391 if (!mm)
1392 return 0; 1392 return 0;
1393 1393
1394 /* Ensure we start with an empty set of numa_maps statistics. */ 1394 /* Ensure we start with an empty set of numa_maps statistics. */
1395 memset(md, 0, sizeof(*md)); 1395 memset(md, 0, sizeof(*md));
1396 1396
1397 md->vma = vma; 1397 md->vma = vma;
1398 1398
1399 walk.hugetlb_entry = gather_hugetbl_stats; 1399 walk.hugetlb_entry = gather_hugetbl_stats;
1400 walk.pmd_entry = gather_pte_stats; 1400 walk.pmd_entry = gather_pte_stats;
1401 walk.private = md; 1401 walk.private = md;
1402 walk.mm = mm; 1402 walk.mm = mm;
1403 1403
1404 pol = get_vma_policy(task, vma, vma->vm_start); 1404 pol = get_vma_policy(task, vma, vma->vm_start);
1405 mpol_to_str(buffer, sizeof(buffer), pol); 1405 mpol_to_str(buffer, sizeof(buffer), pol);
1406 mpol_cond_put(pol); 1406 mpol_cond_put(pol);
1407 1407
1408 seq_printf(m, "%08lx %s", vma->vm_start, buffer); 1408 seq_printf(m, "%08lx %s", vma->vm_start, buffer);
1409 1409
1410 if (file) { 1410 if (file) {
1411 seq_printf(m, " file="); 1411 seq_printf(m, " file=");
1412 seq_path(m, &file->f_path, "\n\t= "); 1412 seq_path(m, &file->f_path, "\n\t= ");
1413 } else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) { 1413 } else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) {
1414 seq_printf(m, " heap"); 1414 seq_printf(m, " heap");
1415 } else { 1415 } else {
1416 pid_t tid = vm_is_stack(task, vma, is_pid); 1416 pid_t tid = vm_is_stack(task, vma, is_pid);
1417 if (tid != 0) { 1417 if (tid != 0) {
1418 /* 1418 /*
1419 * Thread stack in /proc/PID/task/TID/maps or 1419 * Thread stack in /proc/PID/task/TID/maps or
1420 * the main process stack. 1420 * the main process stack.
1421 */ 1421 */
1422 if (!is_pid || (vma->vm_start <= mm->start_stack && 1422 if (!is_pid || (vma->vm_start <= mm->start_stack &&
1423 vma->vm_end >= mm->start_stack)) 1423 vma->vm_end >= mm->start_stack))
1424 seq_printf(m, " stack"); 1424 seq_printf(m, " stack");
1425 else 1425 else
1426 seq_printf(m, " stack:%d", tid); 1426 seq_printf(m, " stack:%d", tid);
1427 } 1427 }
1428 } 1428 }
1429 1429
1430 if (is_vm_hugetlb_page(vma)) 1430 if (is_vm_hugetlb_page(vma))
1431 seq_printf(m, " huge"); 1431 seq_printf(m, " huge");
1432 1432
1433 walk_page_range(vma->vm_start, vma->vm_end, &walk); 1433 walk_page_range(vma->vm_start, vma->vm_end, &walk);
1434 1434
1435 if (!md->pages) 1435 if (!md->pages)
1436 goto out; 1436 goto out;
1437 1437
1438 if (md->anon) 1438 if (md->anon)
1439 seq_printf(m, " anon=%lu", md->anon); 1439 seq_printf(m, " anon=%lu", md->anon);
1440 1440
1441 if (md->dirty) 1441 if (md->dirty)
1442 seq_printf(m, " dirty=%lu", md->dirty); 1442 seq_printf(m, " dirty=%lu", md->dirty);
1443 1443
1444 if (md->pages != md->anon && md->pages != md->dirty) 1444 if (md->pages != md->anon && md->pages != md->dirty)
1445 seq_printf(m, " mapped=%lu", md->pages); 1445 seq_printf(m, " mapped=%lu", md->pages);
1446 1446
1447 if (md->mapcount_max > 1) 1447 if (md->mapcount_max > 1)
1448 seq_printf(m, " mapmax=%lu", md->mapcount_max); 1448 seq_printf(m, " mapmax=%lu", md->mapcount_max);
1449 1449
1450 if (md->swapcache) 1450 if (md->swapcache)
1451 seq_printf(m, " swapcache=%lu", md->swapcache); 1451 seq_printf(m, " swapcache=%lu", md->swapcache);
1452 1452
1453 if (md->active < md->pages && !is_vm_hugetlb_page(vma)) 1453 if (md->active < md->pages && !is_vm_hugetlb_page(vma))
1454 seq_printf(m, " active=%lu", md->active); 1454 seq_printf(m, " active=%lu", md->active);
1455 1455
1456 if (md->writeback) 1456 if (md->writeback)
1457 seq_printf(m, " writeback=%lu", md->writeback); 1457 seq_printf(m, " writeback=%lu", md->writeback);
1458 1458
1459 for_each_node_state(nid, N_MEMORY) 1459 for_each_node_state(nid, N_MEMORY)
1460 if (md->node[nid]) 1460 if (md->node[nid])
1461 seq_printf(m, " N%d=%lu", nid, md->node[nid]); 1461 seq_printf(m, " N%d=%lu", nid, md->node[nid]);
1462 out: 1462 out:
1463 seq_putc(m, '\n'); 1463 seq_putc(m, '\n');
1464 1464
1465 if (m->count < m->size) 1465 if (m->count < m->size)
1466 m->version = (vma != proc_priv->tail_vma) ? vma->vm_start : 0; 1466 m->version = (vma != proc_priv->tail_vma) ? vma->vm_start : 0;
1467 return 0; 1467 return 0;
1468 } 1468 }
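For orientation, the seq_printf() calls above assemble one /proc/<pid>/numa_maps line per VMA. With purely illustrative numbers (not measured on any real system), such lines look roughly like:

7f63aa400000 default file=/usr/lib64/libc-2.18.so mapped=123 mapmax=35 active=110 N0=80 N1=43
7f63aab00000 interleave:0-1 anon=512 dirty=512 N0=256 N1=256

The leading policy string comes from mpol_to_str(), and the trailing N<nid>= counters are the per-node totals gathered into md->node[] by the page walk.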
1469 1469
1470 static int show_pid_numa_map(struct seq_file *m, void *v) 1470 static int show_pid_numa_map(struct seq_file *m, void *v)
1471 { 1471 {
1472 return show_numa_map(m, v, 1); 1472 return show_numa_map(m, v, 1);
1473 } 1473 }
1474 1474
1475 static int show_tid_numa_map(struct seq_file *m, void *v) 1475 static int show_tid_numa_map(struct seq_file *m, void *v)
1476 { 1476 {
1477 return show_numa_map(m, v, 0); 1477 return show_numa_map(m, v, 0);
1478 } 1478 }
1479 1479
1480 static const struct seq_operations proc_pid_numa_maps_op = { 1480 static const struct seq_operations proc_pid_numa_maps_op = {
1481 .start = m_start, 1481 .start = m_start,
1482 .next = m_next, 1482 .next = m_next,
1483 .stop = m_stop, 1483 .stop = m_stop,
1484 .show = show_pid_numa_map, 1484 .show = show_pid_numa_map,
1485 }; 1485 };
1486 1486
1487 static const struct seq_operations proc_tid_numa_maps_op = { 1487 static const struct seq_operations proc_tid_numa_maps_op = {
1488 .start = m_start, 1488 .start = m_start,
1489 .next = m_next, 1489 .next = m_next,
1490 .stop = m_stop, 1490 .stop = m_stop,
1491 .show = show_tid_numa_map, 1491 .show = show_tid_numa_map,
1492 }; 1492 };
1493 1493
1494 static int numa_maps_open(struct inode *inode, struct file *file, 1494 static int numa_maps_open(struct inode *inode, struct file *file,
1495 const struct seq_operations *ops) 1495 const struct seq_operations *ops)
1496 { 1496 {
1497 struct numa_maps_private *priv; 1497 struct numa_maps_private *priv;
1498 int ret = -ENOMEM; 1498 int ret = -ENOMEM;
1499 priv = kzalloc(sizeof(*priv), GFP_KERNEL); 1499 priv = kzalloc(sizeof(*priv), GFP_KERNEL);
1500 if (priv) { 1500 if (priv) {
1501 priv->proc_maps.pid = proc_pid(inode); 1501 priv->proc_maps.pid = proc_pid(inode);
1502 ret = seq_open(file, ops); 1502 ret = seq_open(file, ops);
1503 if (!ret) { 1503 if (!ret) {
1504 struct seq_file *m = file->private_data; 1504 struct seq_file *m = file->private_data;
1505 m->private = priv; 1505 m->private = priv;
1506 } else { 1506 } else {
1507 kfree(priv); 1507 kfree(priv);
1508 } 1508 }
1509 } 1509 }
1510 return ret; 1510 return ret;
1511 } 1511 }
1512 1512
1513 static int pid_numa_maps_open(struct inode *inode, struct file *file) 1513 static int pid_numa_maps_open(struct inode *inode, struct file *file)
1514 { 1514 {
1515 return numa_maps_open(inode, file, &proc_pid_numa_maps_op); 1515 return numa_maps_open(inode, file, &proc_pid_numa_maps_op);
1516 } 1516 }
1517 1517
1518 static int tid_numa_maps_open(struct inode *inode, struct file *file) 1518 static int tid_numa_maps_open(struct inode *inode, struct file *file)
1519 { 1519 {
1520 return numa_maps_open(inode, file, &proc_tid_numa_maps_op); 1520 return numa_maps_open(inode, file, &proc_tid_numa_maps_op);
1521 } 1521 }
1522 1522
1523 const struct file_operations proc_pid_numa_maps_operations = { 1523 const struct file_operations proc_pid_numa_maps_operations = {
1524 .open = pid_numa_maps_open, 1524 .open = pid_numa_maps_open,
1525 .read = seq_read, 1525 .read = seq_read,
1526 .llseek = seq_lseek, 1526 .llseek = seq_lseek,
1527 .release = seq_release_private, 1527 .release = seq_release_private,
1528 }; 1528 };
1529 1529
1530 const struct file_operations proc_tid_numa_maps_operations = { 1530 const struct file_operations proc_tid_numa_maps_operations = {
1531 .open = tid_numa_maps_open, 1531 .open = tid_numa_maps_open,
1532 .read = seq_read, 1532 .read = seq_read,
1533 .llseek = seq_lseek, 1533 .llseek = seq_lseek,
1534 .release = seq_release_private, 1534 .release = seq_release_private,
1535 }; 1535 };
1536 #endif /* CONFIG_NUMA */ 1536 #endif /* CONFIG_NUMA */
1537 1537
1 /* 1 /*
2 * Simple NUMA memory policy for the Linux kernel. 2 * Simple NUMA memory policy for the Linux kernel.
3 * 3 *
4 * Copyright 2003,2004 Andi Kleen, SuSE Labs. 4 * Copyright 2003,2004 Andi Kleen, SuSE Labs.
5 * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc. 5 * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
6 * Subject to the GNU Public License, version 2. 6 * Subject to the GNU Public License, version 2.
7 * 7 *
8 * NUMA policy allows the user to give hints in which node(s) memory should 8 * NUMA policy allows the user to give hints in which node(s) memory should
9 * be allocated. 9 * be allocated.
10 * 10 *
11 * Support four policies per VMA and per process: 11 * Support four policies per VMA and per process:
12 * 12 *
13 * The VMA policy has priority over the process policy for a page fault. 13 * The VMA policy has priority over the process policy for a page fault.
14 * 14 *
15 * interleave Allocate memory interleaved over a set of nodes, 15 * interleave Allocate memory interleaved over a set of nodes,
16 * with normal fallback if it fails. 16 * with normal fallback if it fails.
17 * For VMA based allocations this interleaves based on the 17 * For VMA based allocations this interleaves based on the
18 * offset into the backing object or offset into the mapping 18 * offset into the backing object or offset into the mapping
19 * for anonymous memory. For process policy a process counter 19 * for anonymous memory. For process policy a process counter
20 * is used. 20 * is used.
21 * 21 *
22 * bind Only allocate memory on a specific set of nodes, 22 * bind Only allocate memory on a specific set of nodes,
23 * no fallback. 23 * no fallback.
24 * FIXME: memory is allocated starting with the first node 24 * FIXME: memory is allocated starting with the first node
25 * to the last. It would be better if bind would truly restrict 25 * to the last. It would be better if bind would truly restrict
26 * the allocation to memory nodes instead 26 * the allocation to memory nodes instead
27 * 27 *
28 * preferred Try a specific node first before normal fallback. 28 * preferred Try a specific node first before normal fallback.
29 * As a special case NUMA_NO_NODE here means do the allocation 29 * As a special case NUMA_NO_NODE here means do the allocation
30 * on the local CPU. This is normally identical to default, 30 * on the local CPU. This is normally identical to default,
31 * but useful to set in a VMA when you have a non default 31 * but useful to set in a VMA when you have a non default
32 * process policy. 32 * process policy.
33 * 33 *
34 * default Allocate on the local node first, or when on a VMA 34 * default Allocate on the local node first, or when on a VMA
35 * use the process policy. This is what Linux always did 35 * use the process policy. This is what Linux always did
36 * in a NUMA aware kernel and still does by, ahem, default. 36 * in a NUMA aware kernel and still does by, ahem, default.
37 * 37 *
38 * The process policy is applied for most non interrupt memory allocations 38 * The process policy is applied for most non interrupt memory allocations
39 * in that process' context. Interrupts ignore the policies and always 39 * in that process' context. Interrupts ignore the policies and always
40 * try to allocate on the local CPU. The VMA policy is only applied for memory 40 * try to allocate on the local CPU. The VMA policy is only applied for memory
41 * allocations for a VMA in the VM. 41 * allocations for a VMA in the VM.
42 * 42 *
43 * Currently there are a few corner cases in swapping where the policy 43 * Currently there are a few corner cases in swapping where the policy
44 * is not applied, but the majority should be handled. When process policy 44 * is not applied, but the majority should be handled. When process policy
45 * is used it is not remembered over swap outs/swap ins. 45 * is used it is not remembered over swap outs/swap ins.
46 * 46 *
47 * Only the highest zone in the zone hierarchy gets policied. Allocations 47 * Only the highest zone in the zone hierarchy gets policied. Allocations
48 * requesting a lower zone just use default policy. This implies that 48 * requesting a lower zone just use default policy. This implies that
49 * on systems with highmem, kernel lowmem allocations don't get policied. 49 * on systems with highmem, kernel lowmem allocations don't get policied.
50 * Same with GFP_DMA allocations. 50 * Same with GFP_DMA allocations.
51 * 51 *
52 * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between 52 * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
53 * all users and remembered even when nobody has memory mapped. 53 * all users and remembered even when nobody has memory mapped.
54 */ 54 */
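As a user-space illustration of the policies described above (not part of this file), the set_mempolicy(2) wrapper from libnuma's <numaif.h> can request interleaving across nodes 0 and 1 for subsequent allocations. This is only a sketch and assumes a machine with at least two memory nodes; link with -lnuma.

#include <numaif.h>		/* set_mempolicy(), MPOL_INTERLEAVE */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

int main(void)
{
	/* Nodes are encoded as a bitmask; bit N selects node N. */
	unsigned long nodemask = (1UL << 0) | (1UL << 1);

	/* maxnode: number of bits in nodemask the kernel should examine. */
	if (set_mempolicy(MPOL_INTERLEAVE, &nodemask, 8 * sizeof(nodemask))) {
		perror("set_mempolicy");
		return 1;
	}

	/* Pages faulted in from here on are interleaved over nodes 0 and 1,
	 * which then shows up as "interleave:0-1" in /proc/self/numa_maps. */
	char *buf = malloc(16 << 20);
	if (buf)
		memset(buf, 0, 16 << 20);
	free(buf);
	return 0;
}

The same mask and mode arguments work with mbind(2) when the policy should apply to one mapping rather than to the whole task.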
55 55
56 /* Notebook: 56 /* Notebook:
57 fix mmap readahead to honour policy and enable policy for any page cache 57 fix mmap readahead to honour policy and enable policy for any page cache
58 object 58 object
59 statistics for bigpages 59 statistics for bigpages
60 global policy for page cache? currently it uses process policy. Requires 60 global policy for page cache? currently it uses process policy. Requires
61 first item above. 61 first item above.
62 handle mremap for shared memory (currently ignored for the policy) 62 handle mremap for shared memory (currently ignored for the policy)
63 grows down? 63 grows down?
64 make bind policy root only? It can trigger oom much faster and the 64 make bind policy root only? It can trigger oom much faster and the
65 kernel is not always grateful with that. 65 kernel is not always grateful with that.
66 */ 66 */
67 67
68 #include <linux/mempolicy.h> 68 #include <linux/mempolicy.h>
69 #include <linux/mm.h> 69 #include <linux/mm.h>
70 #include <linux/highmem.h> 70 #include <linux/highmem.h>
71 #include <linux/hugetlb.h> 71 #include <linux/hugetlb.h>
72 #include <linux/kernel.h> 72 #include <linux/kernel.h>
73 #include <linux/sched.h> 73 #include <linux/sched.h>
74 #include <linux/nodemask.h> 74 #include <linux/nodemask.h>
75 #include <linux/cpuset.h> 75 #include <linux/cpuset.h>
76 #include <linux/slab.h> 76 #include <linux/slab.h>
77 #include <linux/string.h> 77 #include <linux/string.h>
78 #include <linux/export.h> 78 #include <linux/export.h>
79 #include <linux/nsproxy.h> 79 #include <linux/nsproxy.h>
80 #include <linux/interrupt.h> 80 #include <linux/interrupt.h>
81 #include <linux/init.h> 81 #include <linux/init.h>
82 #include <linux/compat.h> 82 #include <linux/compat.h>
83 #include <linux/swap.h> 83 #include <linux/swap.h>
84 #include <linux/seq_file.h> 84 #include <linux/seq_file.h>
85 #include <linux/proc_fs.h> 85 #include <linux/proc_fs.h>
86 #include <linux/migrate.h> 86 #include <linux/migrate.h>
87 #include <linux/ksm.h> 87 #include <linux/ksm.h>
88 #include <linux/rmap.h> 88 #include <linux/rmap.h>
89 #include <linux/security.h> 89 #include <linux/security.h>
90 #include <linux/syscalls.h> 90 #include <linux/syscalls.h>
91 #include <linux/ctype.h> 91 #include <linux/ctype.h>
92 #include <linux/mm_inline.h> 92 #include <linux/mm_inline.h>
93 #include <linux/mmu_notifier.h> 93 #include <linux/mmu_notifier.h>
94 94
95 #include <asm/tlbflush.h> 95 #include <asm/tlbflush.h>
96 #include <asm/uaccess.h> 96 #include <asm/uaccess.h>
97 #include <linux/random.h> 97 #include <linux/random.h>
98 98
99 #include "internal.h" 99 #include "internal.h"
100 100
101 /* Internal flags */ 101 /* Internal flags */
102 #define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0) /* Skip checks for contiguous vmas */ 102 #define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0) /* Skip checks for contiguous vmas */
103 #define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1) /* Invert check for nodemask */ 103 #define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1) /* Invert check for nodemask */
104 104
105 static struct kmem_cache *policy_cache; 105 static struct kmem_cache *policy_cache;
106 static struct kmem_cache *sn_cache; 106 static struct kmem_cache *sn_cache;
107 107
108 /* Highest zone. A specific allocation for a zone below that is not 108 /* Highest zone. A specific allocation for a zone below that is not
109 policied. */ 109 policied. */
110 enum zone_type policy_zone = 0; 110 enum zone_type policy_zone = 0;
111 111
112 /* 112 /*
113 * run-time system-wide default policy => local allocation 113 * run-time system-wide default policy => local allocation
114 */ 114 */
115 static struct mempolicy default_policy = { 115 static struct mempolicy default_policy = {
116 .refcnt = ATOMIC_INIT(1), /* never free it */ 116 .refcnt = ATOMIC_INIT(1), /* never free it */
117 .mode = MPOL_PREFERRED, 117 .mode = MPOL_PREFERRED,
118 .flags = MPOL_F_LOCAL, 118 .flags = MPOL_F_LOCAL,
119 }; 119 };
120 120
121 static struct mempolicy preferred_node_policy[MAX_NUMNODES]; 121 static struct mempolicy preferred_node_policy[MAX_NUMNODES];
122 122
123 static struct mempolicy *get_task_policy(struct task_struct *p) 123 static struct mempolicy *get_task_policy(struct task_struct *p)
124 { 124 {
125 struct mempolicy *pol = p->mempolicy; 125 struct mempolicy *pol = p->mempolicy;
126 126
127 if (!pol) { 127 if (!pol) {
128 int node = numa_node_id(); 128 int node = numa_node_id();
129 129
130 if (node != NUMA_NO_NODE) { 130 if (node != NUMA_NO_NODE) {
131 pol = &preferred_node_policy[node]; 131 pol = &preferred_node_policy[node];
132 /* 132 /*
133 * preferred_node_policy is not initialised early in 133 * preferred_node_policy is not initialised early in
134 * boot 134 * boot
135 */ 135 */
136 if (!pol->mode) 136 if (!pol->mode)
137 pol = NULL; 137 pol = NULL;
138 } 138 }
139 } 139 }
140 140
141 return pol; 141 return pol;
142 } 142 }
143 143
144 static const struct mempolicy_operations { 144 static const struct mempolicy_operations {
145 int (*create)(struct mempolicy *pol, const nodemask_t *nodes); 145 int (*create)(struct mempolicy *pol, const nodemask_t *nodes);
146 /* 146 /*
147 * If the read-side task has no lock to protect task->mempolicy, the write-side 147 * If the read-side task has no lock to protect task->mempolicy, the write-side
148 * task will rebind task->mempolicy in two steps. The first step is 148 * task will rebind task->mempolicy in two steps. The first step is
149 * setting all the newly allowed nodes, and the second step is cleaning all the 149 * setting all the newly allowed nodes, and the second step is cleaning all the
150 * disallowed nodes. In this way, we can avoid finding no node to alloc 150 * disallowed nodes. In this way, we can avoid finding no node to alloc
151 * page. 151 * page.
152 * If we have a lock to protect task->mempolicy in read-side, we do 152 * If we have a lock to protect task->mempolicy in read-side, we do
153 * rebind directly. 153 * rebind directly.
154 * 154 *
155 * step: 155 * step:
156 * MPOL_REBIND_ONCE - do rebind work at once 156 * MPOL_REBIND_ONCE - do rebind work at once
157 * MPOL_REBIND_STEP1 - set all the newly allowed nodes 157 * MPOL_REBIND_STEP1 - set all the newly allowed nodes
158 * MPOL_REBIND_STEP2 - clean all the disallowed nodes 158 * MPOL_REBIND_STEP2 - clean all the disallowed nodes
159 */ 159 */
160 void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes, 160 void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes,
161 enum mpol_rebind_step step); 161 enum mpol_rebind_step step);
162 } mpol_ops[MPOL_MAX]; 162 } mpol_ops[MPOL_MAX];
163 163
164 /* Check that the nodemask contains at least one populated zone */ 164 /* Check that the nodemask contains at least one populated zone */
165 static int is_valid_nodemask(const nodemask_t *nodemask) 165 static int is_valid_nodemask(const nodemask_t *nodemask)
166 { 166 {
167 return nodes_intersects(*nodemask, node_states[N_MEMORY]); 167 return nodes_intersects(*nodemask, node_states[N_MEMORY]);
168 } 168 }
169 169
170 static inline int mpol_store_user_nodemask(const struct mempolicy *pol) 170 static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
171 { 171 {
172 return pol->flags & MPOL_MODE_FLAGS; 172 return pol->flags & MPOL_MODE_FLAGS;
173 } 173 }
174 174
175 static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig, 175 static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig,
176 const nodemask_t *rel) 176 const nodemask_t *rel)
177 { 177 {
178 nodemask_t tmp; 178 nodemask_t tmp;
179 nodes_fold(tmp, *orig, nodes_weight(*rel)); 179 nodes_fold(tmp, *orig, nodes_weight(*rel));
180 nodes_onto(*ret, tmp, *rel); 180 nodes_onto(*ret, tmp, *rel);
181 } 181 }
182 182
183 static int mpol_new_interleave(struct mempolicy *pol, const nodemask_t *nodes) 183 static int mpol_new_interleave(struct mempolicy *pol, const nodemask_t *nodes)
184 { 184 {
185 if (nodes_empty(*nodes)) 185 if (nodes_empty(*nodes))
186 return -EINVAL; 186 return -EINVAL;
187 pol->v.nodes = *nodes; 187 pol->v.nodes = *nodes;
188 return 0; 188 return 0;
189 } 189 }
190 190
191 static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes) 191 static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes)
192 { 192 {
193 if (!nodes) 193 if (!nodes)
194 pol->flags |= MPOL_F_LOCAL; /* local allocation */ 194 pol->flags |= MPOL_F_LOCAL; /* local allocation */
195 else if (nodes_empty(*nodes)) 195 else if (nodes_empty(*nodes))
196 return -EINVAL; /* no allowed nodes */ 196 return -EINVAL; /* no allowed nodes */
197 else 197 else
198 pol->v.preferred_node = first_node(*nodes); 198 pol->v.preferred_node = first_node(*nodes);
199 return 0; 199 return 0;
200 } 200 }
201 201
202 static int mpol_new_bind(struct mempolicy *pol, const nodemask_t *nodes) 202 static int mpol_new_bind(struct mempolicy *pol, const nodemask_t *nodes)
203 { 203 {
204 if (!is_valid_nodemask(nodes)) 204 if (!is_valid_nodemask(nodes))
205 return -EINVAL; 205 return -EINVAL;
206 pol->v.nodes = *nodes; 206 pol->v.nodes = *nodes;
207 return 0; 207 return 0;
208 } 208 }
209 209
210 /* 210 /*
211 * mpol_set_nodemask is called after mpol_new() to set up the nodemask, if 211 * mpol_set_nodemask is called after mpol_new() to set up the nodemask, if
212 * any, for the new policy. mpol_new() has already validated the nodes 212 * any, for the new policy. mpol_new() has already validated the nodes
213 * parameter with respect to the policy mode and flags. But, we need to 213 * parameter with respect to the policy mode and flags. But, we need to
214 * handle an empty nodemask with MPOL_PREFERRED here. 214 * handle an empty nodemask with MPOL_PREFERRED here.
215 * 215 *
216 * Must be called holding task's alloc_lock to protect task's mems_allowed 216 * Must be called holding task's alloc_lock to protect task's mems_allowed
217 * and mempolicy. May also be called holding the mmap_semaphore for write. 217 * and mempolicy. May also be called holding the mmap_semaphore for write.
218 */ 218 */
219 static int mpol_set_nodemask(struct mempolicy *pol, 219 static int mpol_set_nodemask(struct mempolicy *pol,
220 const nodemask_t *nodes, struct nodemask_scratch *nsc) 220 const nodemask_t *nodes, struct nodemask_scratch *nsc)
221 { 221 {
222 int ret; 222 int ret;
223 223
224 /* if mode is MPOL_DEFAULT, pol is NULL. This is right. */ 224 /* if mode is MPOL_DEFAULT, pol is NULL. This is right. */
225 if (pol == NULL) 225 if (pol == NULL)
226 return 0; 226 return 0;
227 /* Check N_MEMORY */ 227 /* Check N_MEMORY */
228 nodes_and(nsc->mask1, 228 nodes_and(nsc->mask1,
229 cpuset_current_mems_allowed, node_states[N_MEMORY]); 229 cpuset_current_mems_allowed, node_states[N_MEMORY]);
230 230
231 VM_BUG_ON(!nodes); 231 VM_BUG_ON(!nodes);
232 if (pol->mode == MPOL_PREFERRED && nodes_empty(*nodes)) 232 if (pol->mode == MPOL_PREFERRED && nodes_empty(*nodes))
233 nodes = NULL; /* explicit local allocation */ 233 nodes = NULL; /* explicit local allocation */
234 else { 234 else {
235 if (pol->flags & MPOL_F_RELATIVE_NODES) 235 if (pol->flags & MPOL_F_RELATIVE_NODES)
236 mpol_relative_nodemask(&nsc->mask2, nodes,&nsc->mask1); 236 mpol_relative_nodemask(&nsc->mask2, nodes,&nsc->mask1);
237 else 237 else
238 nodes_and(nsc->mask2, *nodes, nsc->mask1); 238 nodes_and(nsc->mask2, *nodes, nsc->mask1);
239 239
240 if (mpol_store_user_nodemask(pol)) 240 if (mpol_store_user_nodemask(pol))
241 pol->w.user_nodemask = *nodes; 241 pol->w.user_nodemask = *nodes;
242 else 242 else
243 pol->w.cpuset_mems_allowed = 243 pol->w.cpuset_mems_allowed =
244 cpuset_current_mems_allowed; 244 cpuset_current_mems_allowed;
245 } 245 }
246 246
247 if (nodes) 247 if (nodes)
248 ret = mpol_ops[pol->mode].create(pol, &nsc->mask2); 248 ret = mpol_ops[pol->mode].create(pol, &nsc->mask2);
249 else 249 else
250 ret = mpol_ops[pol->mode].create(pol, NULL); 250 ret = mpol_ops[pol->mode].create(pol, NULL);
251 return ret; 251 return ret;
252 } 252 }
253 253
254 /* 254 /*
255 * This function just creates a new policy, does some checks and simple 255 * This function just creates a new policy, does some checks and simple
256 * initialization. You must invoke mpol_set_nodemask() to set nodes. 256 * initialization. You must invoke mpol_set_nodemask() to set nodes.
257 */ 257 */
258 static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags, 258 static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
259 nodemask_t *nodes) 259 nodemask_t *nodes)
260 { 260 {
261 struct mempolicy *policy; 261 struct mempolicy *policy;
262 262
263 pr_debug("setting mode %d flags %d nodes[0] %lx\n", 263 pr_debug("setting mode %d flags %d nodes[0] %lx\n",
264 mode, flags, nodes ? nodes_addr(*nodes)[0] : NUMA_NO_NODE); 264 mode, flags, nodes ? nodes_addr(*nodes)[0] : NUMA_NO_NODE);
265 265
266 if (mode == MPOL_DEFAULT) { 266 if (mode == MPOL_DEFAULT) {
267 if (nodes && !nodes_empty(*nodes)) 267 if (nodes && !nodes_empty(*nodes))
268 return ERR_PTR(-EINVAL); 268 return ERR_PTR(-EINVAL);
269 return NULL; 269 return NULL;
270 } 270 }
271 VM_BUG_ON(!nodes); 271 VM_BUG_ON(!nodes);
272 272
273 /* 273 /*
274 * MPOL_PREFERRED cannot be used with MPOL_F_STATIC_NODES or 274 * MPOL_PREFERRED cannot be used with MPOL_F_STATIC_NODES or
275 * MPOL_F_RELATIVE_NODES if the nodemask is empty (local allocation). 275 * MPOL_F_RELATIVE_NODES if the nodemask is empty (local allocation).
276 * All other modes require a valid pointer to a non-empty nodemask. 276 * All other modes require a valid pointer to a non-empty nodemask.
277 */ 277 */
278 if (mode == MPOL_PREFERRED) { 278 if (mode == MPOL_PREFERRED) {
279 if (nodes_empty(*nodes)) { 279 if (nodes_empty(*nodes)) {
280 if (((flags & MPOL_F_STATIC_NODES) || 280 if (((flags & MPOL_F_STATIC_NODES) ||
281 (flags & MPOL_F_RELATIVE_NODES))) 281 (flags & MPOL_F_RELATIVE_NODES)))
282 return ERR_PTR(-EINVAL); 282 return ERR_PTR(-EINVAL);
283 } 283 }
284 } else if (mode == MPOL_LOCAL) { 284 } else if (mode == MPOL_LOCAL) {
285 if (!nodes_empty(*nodes)) 285 if (!nodes_empty(*nodes))
286 return ERR_PTR(-EINVAL); 286 return ERR_PTR(-EINVAL);
287 mode = MPOL_PREFERRED; 287 mode = MPOL_PREFERRED;
288 } else if (nodes_empty(*nodes)) 288 } else if (nodes_empty(*nodes))
289 return ERR_PTR(-EINVAL); 289 return ERR_PTR(-EINVAL);
290 policy = kmem_cache_alloc(policy_cache, GFP_KERNEL); 290 policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
291 if (!policy) 291 if (!policy)
292 return ERR_PTR(-ENOMEM); 292 return ERR_PTR(-ENOMEM);
293 atomic_set(&policy->refcnt, 1); 293 atomic_set(&policy->refcnt, 1);
294 policy->mode = mode; 294 policy->mode = mode;
295 policy->flags = flags; 295 policy->flags = flags;
296 296
297 return policy; 297 return policy;
298 } 298 }
299 299
300 /* Slow path of a mpol destructor. */ 300 /* Slow path of a mpol destructor. */
301 void __mpol_put(struct mempolicy *p) 301 void __mpol_put(struct mempolicy *p)
302 { 302 {
303 if (!atomic_dec_and_test(&p->refcnt)) 303 if (!atomic_dec_and_test(&p->refcnt))
304 return; 304 return;
305 kmem_cache_free(policy_cache, p); 305 kmem_cache_free(policy_cache, p);
306 } 306 }
307 307
308 static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes, 308 static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes,
309 enum mpol_rebind_step step) 309 enum mpol_rebind_step step)
310 { 310 {
311 } 311 }
312 312
313 /* 313 /*
314 * step: 314 * step:
315 * MPOL_REBIND_ONCE - do rebind work at once 315 * MPOL_REBIND_ONCE - do rebind work at once
316 * MPOL_REBIND_STEP1 - set all the newly allowed nodes 316 * MPOL_REBIND_STEP1 - set all the newly allowed nodes
317 * MPOL_REBIND_STEP2 - clean all the disallowed nodes 317 * MPOL_REBIND_STEP2 - clean all the disallowed nodes
318 */ 318 */
319 static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes, 319 static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes,
320 enum mpol_rebind_step step) 320 enum mpol_rebind_step step)
321 { 321 {
322 nodemask_t tmp; 322 nodemask_t tmp;
323 323
324 if (pol->flags & MPOL_F_STATIC_NODES) 324 if (pol->flags & MPOL_F_STATIC_NODES)
325 nodes_and(tmp, pol->w.user_nodemask, *nodes); 325 nodes_and(tmp, pol->w.user_nodemask, *nodes);
326 else if (pol->flags & MPOL_F_RELATIVE_NODES) 326 else if (pol->flags & MPOL_F_RELATIVE_NODES)
327 mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes); 327 mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
328 else { 328 else {
329 /* 329 /*
330 * if step == 1, we use ->w.cpuset_mems_allowed to cache the 330 * if step == 1, we use ->w.cpuset_mems_allowed to cache the
331 * result 331 * result
332 */ 332 */
333 if (step == MPOL_REBIND_ONCE || step == MPOL_REBIND_STEP1) { 333 if (step == MPOL_REBIND_ONCE || step == MPOL_REBIND_STEP1) {
334 nodes_remap(tmp, pol->v.nodes, 334 nodes_remap(tmp, pol->v.nodes,
335 pol->w.cpuset_mems_allowed, *nodes); 335 pol->w.cpuset_mems_allowed, *nodes);
336 pol->w.cpuset_mems_allowed = step ? tmp : *nodes; 336 pol->w.cpuset_mems_allowed = step ? tmp : *nodes;
337 } else if (step == MPOL_REBIND_STEP2) { 337 } else if (step == MPOL_REBIND_STEP2) {
338 tmp = pol->w.cpuset_mems_allowed; 338 tmp = pol->w.cpuset_mems_allowed;
339 pol->w.cpuset_mems_allowed = *nodes; 339 pol->w.cpuset_mems_allowed = *nodes;
340 } else 340 } else
341 BUG(); 341 BUG();
342 } 342 }
343 343
344 if (nodes_empty(tmp)) 344 if (nodes_empty(tmp))
345 tmp = *nodes; 345 tmp = *nodes;
346 346
347 if (step == MPOL_REBIND_STEP1) 347 if (step == MPOL_REBIND_STEP1)
348 nodes_or(pol->v.nodes, pol->v.nodes, tmp); 348 nodes_or(pol->v.nodes, pol->v.nodes, tmp);
349 else if (step == MPOL_REBIND_ONCE || step == MPOL_REBIND_STEP2) 349 else if (step == MPOL_REBIND_ONCE || step == MPOL_REBIND_STEP2)
350 pol->v.nodes = tmp; 350 pol->v.nodes = tmp;
351 else 351 else
352 BUG(); 352 BUG();
353 353
354 if (!node_isset(current->il_next, tmp)) { 354 if (!node_isset(current->il_next, tmp)) {
355 current->il_next = next_node(current->il_next, tmp); 355 current->il_next = next_node(current->il_next, tmp);
356 if (current->il_next >= MAX_NUMNODES) 356 if (current->il_next >= MAX_NUMNODES)
357 current->il_next = first_node(tmp); 357 current->il_next = first_node(tmp);
358 if (current->il_next >= MAX_NUMNODES) 358 if (current->il_next >= MAX_NUMNODES)
359 current->il_next = numa_node_id(); 359 current->il_next = numa_node_id();
360 } 360 }
361 } 361 }
362 362
363 static void mpol_rebind_preferred(struct mempolicy *pol, 363 static void mpol_rebind_preferred(struct mempolicy *pol,
364 const nodemask_t *nodes, 364 const nodemask_t *nodes,
365 enum mpol_rebind_step step) 365 enum mpol_rebind_step step)
366 { 366 {
367 nodemask_t tmp; 367 nodemask_t tmp;
368 368
369 if (pol->flags & MPOL_F_STATIC_NODES) { 369 if (pol->flags & MPOL_F_STATIC_NODES) {
370 int node = first_node(pol->w.user_nodemask); 370 int node = first_node(pol->w.user_nodemask);
371 371
372 if (node_isset(node, *nodes)) { 372 if (node_isset(node, *nodes)) {
373 pol->v.preferred_node = node; 373 pol->v.preferred_node = node;
374 pol->flags &= ~MPOL_F_LOCAL; 374 pol->flags &= ~MPOL_F_LOCAL;
375 } else 375 } else
376 pol->flags |= MPOL_F_LOCAL; 376 pol->flags |= MPOL_F_LOCAL;
377 } else if (pol->flags & MPOL_F_RELATIVE_NODES) { 377 } else if (pol->flags & MPOL_F_RELATIVE_NODES) {
378 mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes); 378 mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
379 pol->v.preferred_node = first_node(tmp); 379 pol->v.preferred_node = first_node(tmp);
380 } else if (!(pol->flags & MPOL_F_LOCAL)) { 380 } else if (!(pol->flags & MPOL_F_LOCAL)) {
381 pol->v.preferred_node = node_remap(pol->v.preferred_node, 381 pol->v.preferred_node = node_remap(pol->v.preferred_node,
382 pol->w.cpuset_mems_allowed, 382 pol->w.cpuset_mems_allowed,
383 *nodes); 383 *nodes);
384 pol->w.cpuset_mems_allowed = *nodes; 384 pol->w.cpuset_mems_allowed = *nodes;
385 } 385 }
386 } 386 }
387 387
388 /* 388 /*
389 * mpol_rebind_policy - Migrate a policy to a different set of nodes 389 * mpol_rebind_policy - Migrate a policy to a different set of nodes
390 * 390 *
391 * If the read-side task has no lock to protect task->mempolicy, the write-side 391 * If the read-side task has no lock to protect task->mempolicy, the write-side
392 * task will rebind task->mempolicy in two steps. The first step is 392 * task will rebind task->mempolicy in two steps. The first step is
393 * setting all the newly allowed nodes, and the second step is cleaning all the 393 * setting all the newly allowed nodes, and the second step is cleaning all the
394 * disallowed nodes. In this way, we can avoid finding no node to alloc 394 * disallowed nodes. In this way, we can avoid finding no node to alloc
395 * page. 395 * page.
396 * If we have a lock to protect task->mempolicy in read-side, we do 396 * If we have a lock to protect task->mempolicy in read-side, we do
397 * rebind directly. 397 * rebind directly.
398 * 398 *
399 * step: 399 * step:
400 * MPOL_REBIND_ONCE - do rebind work at once 400 * MPOL_REBIND_ONCE - do rebind work at once
401 * MPOL_REBIND_STEP1 - set all the newly allowed nodes 401 * MPOL_REBIND_STEP1 - set all the newly allowed nodes
402 * MPOL_REBIND_STEP2 - clear all the disallowed nodes 402 * MPOL_REBIND_STEP2 - clear all the disallowed nodes
403 */ 403 */
404 static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask, 404 static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask,
405 enum mpol_rebind_step step) 405 enum mpol_rebind_step step)
406 { 406 {
407 if (!pol) 407 if (!pol)
408 return; 408 return;
409 if (!mpol_store_user_nodemask(pol) && step == MPOL_REBIND_ONCE && 409 if (!mpol_store_user_nodemask(pol) && step == MPOL_REBIND_ONCE &&
410 nodes_equal(pol->w.cpuset_mems_allowed, *newmask)) 410 nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
411 return; 411 return;
412 412
413 if (step == MPOL_REBIND_STEP1 && (pol->flags & MPOL_F_REBINDING)) 413 if (step == MPOL_REBIND_STEP1 && (pol->flags & MPOL_F_REBINDING))
414 return; 414 return;
415 415
416 if (step == MPOL_REBIND_STEP2 && !(pol->flags & MPOL_F_REBINDING)) 416 if (step == MPOL_REBIND_STEP2 && !(pol->flags & MPOL_F_REBINDING))
417 BUG(); 417 BUG();
418 418
419 if (step == MPOL_REBIND_STEP1) 419 if (step == MPOL_REBIND_STEP1)
420 pol->flags |= MPOL_F_REBINDING; 420 pol->flags |= MPOL_F_REBINDING;
421 else if (step == MPOL_REBIND_STEP2) 421 else if (step == MPOL_REBIND_STEP2)
422 pol->flags &= ~MPOL_F_REBINDING; 422 pol->flags &= ~MPOL_F_REBINDING;
423 else if (step >= MPOL_REBIND_NSTEP) 423 else if (step >= MPOL_REBIND_NSTEP)
424 BUG(); 424 BUG();
425 425
426 mpol_ops[pol->mode].rebind(pol, newmask, step); 426 mpol_ops[pol->mode].rebind(pol, newmask, step);
427 } 427 }
428 428
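To make the two-step protocol described above concrete, here is a minimal stand-alone sketch (not part of this patch; plain unsigned long bitmasks and made-up node numbers stand in for nodemask_t and the real remapping) showing why a concurrent reader between the two steps never observes an empty mask:

#include <stdio.h>

int main(void)
{
	unsigned long policy  = 0x3;	/* policy currently covers nodes {0,1} */
	unsigned long newmask = 0xc;	/* the cpuset now allows nodes {2,3}   */

	/* MPOL_REBIND_STEP1: OR in the newly allowed nodes, cf. nodes_or()
	 * in mpol_rebind_nodemask().  A reader that looks at 'policy' here
	 * sees {0,1,2,3} - never an empty mask. */
	policy |= newmask;

	/* MPOL_REBIND_STEP2: clear the now-disallowed nodes. */
	policy &= newmask;

	printf("final mask: %#lx\n", policy);	/* 0xc, i.e. nodes {2,3} */
	return 0;
}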
429 /* 429 /*
430 * Wrapper for mpol_rebind_policy() that just requires task 430 * Wrapper for mpol_rebind_policy() that just requires task
431 * pointer, and updates task mempolicy. 431 * pointer, and updates task mempolicy.
432 * 432 *
433 * Called with task's alloc_lock held. 433 * Called with task's alloc_lock held.
434 */ 434 */
435 435
436 void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new, 436 void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new,
437 enum mpol_rebind_step step) 437 enum mpol_rebind_step step)
438 { 438 {
439 mpol_rebind_policy(tsk->mempolicy, new, step); 439 mpol_rebind_policy(tsk->mempolicy, new, step);
440 } 440 }
441 441
442 /* 442 /*
443 * Rebind each vma in mm to new nodemask. 443 * Rebind each vma in mm to new nodemask.
444 * 444 *
445 * Call holding a reference to mm. Takes mm->mmap_sem during call. 445 * Call holding a reference to mm. Takes mm->mmap_sem during call.
446 */ 446 */
447 447
448 void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new) 448 void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
449 { 449 {
450 struct vm_area_struct *vma; 450 struct vm_area_struct *vma;
451 451
452 down_write(&mm->mmap_sem); 452 down_write(&mm->mmap_sem);
453 for (vma = mm->mmap; vma; vma = vma->vm_next) 453 for (vma = mm->mmap; vma; vma = vma->vm_next)
454 mpol_rebind_policy(vma->vm_policy, new, MPOL_REBIND_ONCE); 454 mpol_rebind_policy(vma->vm_policy, new, MPOL_REBIND_ONCE);
455 up_write(&mm->mmap_sem); 455 up_write(&mm->mmap_sem);
456 } 456 }
457 457
458 static const struct mempolicy_operations mpol_ops[MPOL_MAX] = { 458 static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
459 [MPOL_DEFAULT] = { 459 [MPOL_DEFAULT] = {
460 .rebind = mpol_rebind_default, 460 .rebind = mpol_rebind_default,
461 }, 461 },
462 [MPOL_INTERLEAVE] = { 462 [MPOL_INTERLEAVE] = {
463 .create = mpol_new_interleave, 463 .create = mpol_new_interleave,
464 .rebind = mpol_rebind_nodemask, 464 .rebind = mpol_rebind_nodemask,
465 }, 465 },
466 [MPOL_PREFERRED] = { 466 [MPOL_PREFERRED] = {
467 .create = mpol_new_preferred, 467 .create = mpol_new_preferred,
468 .rebind = mpol_rebind_preferred, 468 .rebind = mpol_rebind_preferred,
469 }, 469 },
470 [MPOL_BIND] = { 470 [MPOL_BIND] = {
471 .create = mpol_new_bind, 471 .create = mpol_new_bind,
472 .rebind = mpol_rebind_nodemask, 472 .rebind = mpol_rebind_nodemask,
473 }, 473 },
474 }; 474 };
475 475
476 static void migrate_page_add(struct page *page, struct list_head *pagelist, 476 static void migrate_page_add(struct page *page, struct list_head *pagelist,
477 unsigned long flags); 477 unsigned long flags);
478 478
479 /* 479 /*
480 * Scan through the pages, checking whether they meet the given conditions, 480 * Scan through the pages, checking whether they meet the given conditions,
481 * and move them to the pagelist if they do. 481 * and move them to the pagelist if they do.
482 */ 482 */
483 static int queue_pages_pte_range(struct vm_area_struct *vma, pmd_t *pmd, 483 static int queue_pages_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
484 unsigned long addr, unsigned long end, 484 unsigned long addr, unsigned long end,
485 const nodemask_t *nodes, unsigned long flags, 485 const nodemask_t *nodes, unsigned long flags,
486 void *private) 486 void *private)
487 { 487 {
488 pte_t *orig_pte; 488 pte_t *orig_pte;
489 pte_t *pte; 489 pte_t *pte;
490 spinlock_t *ptl; 490 spinlock_t *ptl;
491 491
492 orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); 492 orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
493 do { 493 do {
494 struct page *page; 494 struct page *page;
495 int nid; 495 int nid;
496 496
497 if (!pte_present(*pte)) 497 if (!pte_present(*pte))
498 continue; 498 continue;
499 page = vm_normal_page(vma, addr, *pte); 499 page = vm_normal_page(vma, addr, *pte);
500 if (!page) 500 if (!page)
501 continue; 501 continue;
502 /* 502 /*
503 * vm_normal_page() filters out zero pages, but there might 503 * vm_normal_page() filters out zero pages, but there might
504 * still be PageReserved pages to skip, perhaps in a VDSO. 504 * still be PageReserved pages to skip, perhaps in a VDSO.
505 */ 505 */
506 if (PageReserved(page)) 506 if (PageReserved(page))
507 continue; 507 continue;
508 nid = page_to_nid(page); 508 nid = page_to_nid(page);
509 if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT)) 509 if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT))
510 continue; 510 continue;
511 511
512 if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) 512 if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
513 migrate_page_add(page, private, flags); 513 migrate_page_add(page, private, flags);
514 else 514 else
515 break; 515 break;
516 } while (pte++, addr += PAGE_SIZE, addr != end); 516 } while (pte++, addr += PAGE_SIZE, addr != end);
517 pte_unmap_unlock(orig_pte, ptl); 517 pte_unmap_unlock(orig_pte, ptl);
518 return addr != end; 518 return addr != end;
519 } 519 }
520 520
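One subtlety in the node check above: do_mbind() further down passes MPOL_MF_INVERT while migrate_to_node() does not, so the single comparison serves both callers. Spelled out as an annotation (not new kernel code):

	/*
	 * node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT)
	 *
	 * mbind path (MPOL_MF_INVERT set, *nodes = requested nodes):
	 *   page already on a requested node -> skip it
	 *   page on any other node           -> queue it for migration
	 * migrate_to_node path (flag clear, *nodes = the source node):
	 *   page on the source node          -> queue it for migration
	 *   page on any other node           -> skip it
	 */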
521 static void queue_pages_hugetlb_pmd_range(struct vm_area_struct *vma, 521 static void queue_pages_hugetlb_pmd_range(struct vm_area_struct *vma,
522 pmd_t *pmd, const nodemask_t *nodes, unsigned long flags, 522 pmd_t *pmd, const nodemask_t *nodes, unsigned long flags,
523 void *private) 523 void *private)
524 { 524 {
525 #ifdef CONFIG_HUGETLB_PAGE 525 #ifdef CONFIG_HUGETLB_PAGE
526 int nid; 526 int nid;
527 struct page *page; 527 struct page *page;
528 spinlock_t *ptl; 528 spinlock_t *ptl;
529 pte_t entry;
529 530
530 ptl = huge_pte_lock(hstate_vma(vma), vma->vm_mm, (pte_t *)pmd); 531 ptl = huge_pte_lock(hstate_vma(vma), vma->vm_mm, (pte_t *)pmd);
531 page = pte_page(huge_ptep_get((pte_t *)pmd)); 532 entry = huge_ptep_get((pte_t *)pmd);
533 if (!pte_present(entry))
534 goto unlock;
535 page = pte_page(entry);
532 nid = page_to_nid(page); 536 nid = page_to_nid(page);
533 if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT)) 537 if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT))
534 goto unlock; 538 goto unlock;
535 /* With MPOL_MF_MOVE, we migrate only unshared hugepage. */ 539 /* With MPOL_MF_MOVE, we migrate only unshared hugepage. */
536 if (flags & (MPOL_MF_MOVE_ALL) || 540 if (flags & (MPOL_MF_MOVE_ALL) ||
537 (flags & MPOL_MF_MOVE && page_mapcount(page) == 1)) 541 (flags & MPOL_MF_MOVE && page_mapcount(page) == 1))
538 isolate_huge_page(page, private); 542 isolate_huge_page(page, private);
539 unlock: 543 unlock:
540 spin_unlock(ptl); 544 spin_unlock(ptl);
541 #else 545 #else
542 BUG(); 546 BUG();
543 #endif 547 #endif
544 } 548 }
545 549
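The hunk above is the substance of this fix on the mempolicy side: when racing with hugepage migration the hugetlb entry can be non-present (for example a migration entry), and handing it to pte_page() produces an unpredictable result, as the commit message notes. Condensed from the new lines, the guarded pattern a hugetlb callback should follow is:

	ptl = huge_pte_lock(hstate_vma(vma), vma->vm_mm, (pte_t *)pmd);
	entry = huge_ptep_get((pte_t *)pmd);
	if (!pte_present(entry))	/* e.g. under migration: just skip it */
		goto unlock;
	page = pte_page(entry);		/* safe only for a present entry */
	/* ... per-page work ... */
unlock:
	spin_unlock(ptl);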
546 static inline int queue_pages_pmd_range(struct vm_area_struct *vma, pud_t *pud, 550 static inline int queue_pages_pmd_range(struct vm_area_struct *vma, pud_t *pud,
547 unsigned long addr, unsigned long end, 551 unsigned long addr, unsigned long end,
548 const nodemask_t *nodes, unsigned long flags, 552 const nodemask_t *nodes, unsigned long flags,
549 void *private) 553 void *private)
550 { 554 {
551 pmd_t *pmd; 555 pmd_t *pmd;
552 unsigned long next; 556 unsigned long next;
553 557
554 pmd = pmd_offset(pud, addr); 558 pmd = pmd_offset(pud, addr);
555 do { 559 do {
556 next = pmd_addr_end(addr, end); 560 next = pmd_addr_end(addr, end);
557 if (!pmd_present(*pmd)) 561 if (!pmd_present(*pmd))
558 continue; 562 continue;
559 if (pmd_huge(*pmd) && is_vm_hugetlb_page(vma)) { 563 if (pmd_huge(*pmd) && is_vm_hugetlb_page(vma)) {
560 queue_pages_hugetlb_pmd_range(vma, pmd, nodes, 564 queue_pages_hugetlb_pmd_range(vma, pmd, nodes,
561 flags, private); 565 flags, private);
562 continue; 566 continue;
563 } 567 }
564 split_huge_page_pmd(vma, addr, pmd); 568 split_huge_page_pmd(vma, addr, pmd);
565 if (pmd_none_or_trans_huge_or_clear_bad(pmd)) 569 if (pmd_none_or_trans_huge_or_clear_bad(pmd))
566 continue; 570 continue;
567 if (queue_pages_pte_range(vma, pmd, addr, next, nodes, 571 if (queue_pages_pte_range(vma, pmd, addr, next, nodes,
568 flags, private)) 572 flags, private))
569 return -EIO; 573 return -EIO;
570 } while (pmd++, addr = next, addr != end); 574 } while (pmd++, addr = next, addr != end);
571 return 0; 575 return 0;
572 } 576 }
573 577
574 static inline int queue_pages_pud_range(struct vm_area_struct *vma, pgd_t *pgd, 578 static inline int queue_pages_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
575 unsigned long addr, unsigned long end, 579 unsigned long addr, unsigned long end,
576 const nodemask_t *nodes, unsigned long flags, 580 const nodemask_t *nodes, unsigned long flags,
577 void *private) 581 void *private)
578 { 582 {
579 pud_t *pud; 583 pud_t *pud;
580 unsigned long next; 584 unsigned long next;
581 585
582 pud = pud_offset(pgd, addr); 586 pud = pud_offset(pgd, addr);
583 do { 587 do {
584 next = pud_addr_end(addr, end); 588 next = pud_addr_end(addr, end);
585 if (pud_huge(*pud) && is_vm_hugetlb_page(vma)) 589 if (pud_huge(*pud) && is_vm_hugetlb_page(vma))
586 continue; 590 continue;
587 if (pud_none_or_clear_bad(pud)) 591 if (pud_none_or_clear_bad(pud))
588 continue; 592 continue;
589 if (queue_pages_pmd_range(vma, pud, addr, next, nodes, 593 if (queue_pages_pmd_range(vma, pud, addr, next, nodes,
590 flags, private)) 594 flags, private))
591 return -EIO; 595 return -EIO;
592 } while (pud++, addr = next, addr != end); 596 } while (pud++, addr = next, addr != end);
593 return 0; 597 return 0;
594 } 598 }
595 599
596 static inline int queue_pages_pgd_range(struct vm_area_struct *vma, 600 static inline int queue_pages_pgd_range(struct vm_area_struct *vma,
597 unsigned long addr, unsigned long end, 601 unsigned long addr, unsigned long end,
598 const nodemask_t *nodes, unsigned long flags, 602 const nodemask_t *nodes, unsigned long flags,
599 void *private) 603 void *private)
600 { 604 {
601 pgd_t *pgd; 605 pgd_t *pgd;
602 unsigned long next; 606 unsigned long next;
603 607
604 pgd = pgd_offset(vma->vm_mm, addr); 608 pgd = pgd_offset(vma->vm_mm, addr);
605 do { 609 do {
606 next = pgd_addr_end(addr, end); 610 next = pgd_addr_end(addr, end);
607 if (pgd_none_or_clear_bad(pgd)) 611 if (pgd_none_or_clear_bad(pgd))
608 continue; 612 continue;
609 if (queue_pages_pud_range(vma, pgd, addr, next, nodes, 613 if (queue_pages_pud_range(vma, pgd, addr, next, nodes,
610 flags, private)) 614 flags, private))
611 return -EIO; 615 return -EIO;
612 } while (pgd++, addr = next, addr != end); 616 } while (pgd++, addr = next, addr != end);
613 return 0; 617 return 0;
614 } 618 }
615 619
616 #ifdef CONFIG_NUMA_BALANCING 620 #ifdef CONFIG_NUMA_BALANCING
617 /* 621 /*
618 * This is used to mark a range of virtual addresses as inaccessible. 622 * This is used to mark a range of virtual addresses as inaccessible.
619 * These are later cleared by a NUMA hinting fault. Depending on these 623 * These are later cleared by a NUMA hinting fault. Depending on these
620 * faults, pages may be migrated for better NUMA placement. 624 * faults, pages may be migrated for better NUMA placement.
621 * 625 *
622 * This is assuming that NUMA faults are handled using PROT_NONE. If 626 * This is assuming that NUMA faults are handled using PROT_NONE. If
623 * an architecture makes a different choice, it will need further 627 * an architecture makes a different choice, it will need further
624 * changes to the core. 628 * changes to the core.
625 */ 629 */
626 unsigned long change_prot_numa(struct vm_area_struct *vma, 630 unsigned long change_prot_numa(struct vm_area_struct *vma,
627 unsigned long addr, unsigned long end) 631 unsigned long addr, unsigned long end)
628 { 632 {
629 int nr_updated; 633 int nr_updated;
630 634
631 nr_updated = change_protection(vma, addr, end, vma->vm_page_prot, 0, 1); 635 nr_updated = change_protection(vma, addr, end, vma->vm_page_prot, 0, 1);
632 if (nr_updated) 636 if (nr_updated)
633 count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated); 637 count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated);
634 638
635 return nr_updated; 639 return nr_updated;
636 } 640 }
637 #else 641 #else
638 static unsigned long change_prot_numa(struct vm_area_struct *vma, 642 static unsigned long change_prot_numa(struct vm_area_struct *vma,
639 unsigned long addr, unsigned long end) 643 unsigned long addr, unsigned long end)
640 { 644 {
641 return 0; 645 return 0;
642 } 646 }
643 #endif /* CONFIG_NUMA_BALANCING */ 647 #endif /* CONFIG_NUMA_BALANCING */
644 648
645 /* 649 /*
646 * Walk through page tables and collect pages to be migrated. 650 * Walk through page tables and collect pages to be migrated.
647 * 651 *
648 * If the pages found in a given range are on a set of nodes (determined 652 * If the pages found in a given range are on a set of nodes (determined
649 * by @nodes and @flags), they are isolated and queued to the pagelist 653 * by @nodes and @flags), they are isolated and queued to the pagelist
650 * passed via @private. 654 * passed via @private.
651 */ 655 */
652 static struct vm_area_struct * 656 static struct vm_area_struct *
653 queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end, 657 queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end,
654 const nodemask_t *nodes, unsigned long flags, void *private) 658 const nodemask_t *nodes, unsigned long flags, void *private)
655 { 659 {
656 int err; 660 int err;
657 struct vm_area_struct *first, *vma, *prev; 661 struct vm_area_struct *first, *vma, *prev;
658 662
659 663
660 first = find_vma(mm, start); 664 first = find_vma(mm, start);
661 if (!first) 665 if (!first)
662 return ERR_PTR(-EFAULT); 666 return ERR_PTR(-EFAULT);
663 prev = NULL; 667 prev = NULL;
664 for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) { 668 for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
665 unsigned long endvma = vma->vm_end; 669 unsigned long endvma = vma->vm_end;
666 670
667 if (endvma > end) 671 if (endvma > end)
668 endvma = end; 672 endvma = end;
669 if (vma->vm_start > start) 673 if (vma->vm_start > start)
670 start = vma->vm_start; 674 start = vma->vm_start;
671 675
672 if (!(flags & MPOL_MF_DISCONTIG_OK)) { 676 if (!(flags & MPOL_MF_DISCONTIG_OK)) {
673 if (!vma->vm_next && vma->vm_end < end) 677 if (!vma->vm_next && vma->vm_end < end)
674 return ERR_PTR(-EFAULT); 678 return ERR_PTR(-EFAULT);
675 if (prev && prev->vm_end < vma->vm_start) 679 if (prev && prev->vm_end < vma->vm_start)
676 return ERR_PTR(-EFAULT); 680 return ERR_PTR(-EFAULT);
677 } 681 }
678 682
679 if (flags & MPOL_MF_LAZY) { 683 if (flags & MPOL_MF_LAZY) {
680 change_prot_numa(vma, start, endvma); 684 change_prot_numa(vma, start, endvma);
681 goto next; 685 goto next;
682 } 686 }
683 687
684 if ((flags & MPOL_MF_STRICT) || 688 if ((flags & MPOL_MF_STRICT) ||
685 ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) && 689 ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) &&
686 vma_migratable(vma))) { 690 vma_migratable(vma))) {
687 691
688 err = queue_pages_pgd_range(vma, start, endvma, nodes, 692 err = queue_pages_pgd_range(vma, start, endvma, nodes,
689 flags, private); 693 flags, private);
690 if (err) { 694 if (err) {
691 first = ERR_PTR(err); 695 first = ERR_PTR(err);
692 break; 696 break;
693 } 697 }
694 } 698 }
695 next: 699 next:
696 prev = vma; 700 prev = vma;
697 } 701 }
698 return first; 702 return first;
699 } 703 }
700 704
701 /* 705 /*
702 * Apply policy to a single VMA 706 * Apply policy to a single VMA
703 * This must be called with the mmap_sem held for writing. 707 * This must be called with the mmap_sem held for writing.
704 */ 708 */
705 static int vma_replace_policy(struct vm_area_struct *vma, 709 static int vma_replace_policy(struct vm_area_struct *vma,
706 struct mempolicy *pol) 710 struct mempolicy *pol)
707 { 711 {
708 int err; 712 int err;
709 struct mempolicy *old; 713 struct mempolicy *old;
710 struct mempolicy *new; 714 struct mempolicy *new;
711 715
712 pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n", 716 pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
713 vma->vm_start, vma->vm_end, vma->vm_pgoff, 717 vma->vm_start, vma->vm_end, vma->vm_pgoff,
714 vma->vm_ops, vma->vm_file, 718 vma->vm_ops, vma->vm_file,
715 vma->vm_ops ? vma->vm_ops->set_policy : NULL); 719 vma->vm_ops ? vma->vm_ops->set_policy : NULL);
716 720
717 new = mpol_dup(pol); 721 new = mpol_dup(pol);
718 if (IS_ERR(new)) 722 if (IS_ERR(new))
719 return PTR_ERR(new); 723 return PTR_ERR(new);
720 724
721 if (vma->vm_ops && vma->vm_ops->set_policy) { 725 if (vma->vm_ops && vma->vm_ops->set_policy) {
722 err = vma->vm_ops->set_policy(vma, new); 726 err = vma->vm_ops->set_policy(vma, new);
723 if (err) 727 if (err)
724 goto err_out; 728 goto err_out;
725 } 729 }
726 730
727 old = vma->vm_policy; 731 old = vma->vm_policy;
728 vma->vm_policy = new; /* protected by mmap_sem */ 732 vma->vm_policy = new; /* protected by mmap_sem */
729 mpol_put(old); 733 mpol_put(old);
730 734
731 return 0; 735 return 0;
732 err_out: 736 err_out:
733 mpol_put(new); 737 mpol_put(new);
734 return err; 738 return err;
735 } 739 }
736 740
737 /* Step 2: apply policy to a range and do splits. */ 741 /* Step 2: apply policy to a range and do splits. */
738 static int mbind_range(struct mm_struct *mm, unsigned long start, 742 static int mbind_range(struct mm_struct *mm, unsigned long start,
739 unsigned long end, struct mempolicy *new_pol) 743 unsigned long end, struct mempolicy *new_pol)
740 { 744 {
741 struct vm_area_struct *next; 745 struct vm_area_struct *next;
742 struct vm_area_struct *prev; 746 struct vm_area_struct *prev;
743 struct vm_area_struct *vma; 747 struct vm_area_struct *vma;
744 int err = 0; 748 int err = 0;
745 pgoff_t pgoff; 749 pgoff_t pgoff;
746 unsigned long vmstart; 750 unsigned long vmstart;
747 unsigned long vmend; 751 unsigned long vmend;
748 752
749 vma = find_vma(mm, start); 753 vma = find_vma(mm, start);
750 if (!vma || vma->vm_start > start) 754 if (!vma || vma->vm_start > start)
751 return -EFAULT; 755 return -EFAULT;
752 756
753 prev = vma->vm_prev; 757 prev = vma->vm_prev;
754 if (start > vma->vm_start) 758 if (start > vma->vm_start)
755 prev = vma; 759 prev = vma;
756 760
757 for (; vma && vma->vm_start < end; prev = vma, vma = next) { 761 for (; vma && vma->vm_start < end; prev = vma, vma = next) {
758 next = vma->vm_next; 762 next = vma->vm_next;
759 vmstart = max(start, vma->vm_start); 763 vmstart = max(start, vma->vm_start);
760 vmend = min(end, vma->vm_end); 764 vmend = min(end, vma->vm_end);
761 765
762 if (mpol_equal(vma_policy(vma), new_pol)) 766 if (mpol_equal(vma_policy(vma), new_pol))
763 continue; 767 continue;
764 768
765 pgoff = vma->vm_pgoff + 769 pgoff = vma->vm_pgoff +
766 ((vmstart - vma->vm_start) >> PAGE_SHIFT); 770 ((vmstart - vma->vm_start) >> PAGE_SHIFT);
767 prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags, 771 prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags,
768 vma->anon_vma, vma->vm_file, pgoff, 772 vma->anon_vma, vma->vm_file, pgoff,
769 new_pol); 773 new_pol);
770 if (prev) { 774 if (prev) {
771 vma = prev; 775 vma = prev;
772 next = vma->vm_next; 776 next = vma->vm_next;
773 if (mpol_equal(vma_policy(vma), new_pol)) 777 if (mpol_equal(vma_policy(vma), new_pol))
774 continue; 778 continue;
775 /* vma_merge() joined vma && vma->next, case 8 */ 779 /* vma_merge() joined vma && vma->next, case 8 */
776 goto replace; 780 goto replace;
777 } 781 }
778 if (vma->vm_start != vmstart) { 782 if (vma->vm_start != vmstart) {
779 err = split_vma(vma->vm_mm, vma, vmstart, 1); 783 err = split_vma(vma->vm_mm, vma, vmstart, 1);
780 if (err) 784 if (err)
781 goto out; 785 goto out;
782 } 786 }
783 if (vma->vm_end != vmend) { 787 if (vma->vm_end != vmend) {
784 err = split_vma(vma->vm_mm, vma, vmend, 0); 788 err = split_vma(vma->vm_mm, vma, vmend, 0);
785 if (err) 789 if (err)
786 goto out; 790 goto out;
787 } 791 }
788 replace: 792 replace:
789 err = vma_replace_policy(vma, new_pol); 793 err = vma_replace_policy(vma, new_pol);
790 if (err) 794 if (err)
791 goto out; 795 goto out;
792 } 796 }
793 797
794 out: 798 out:
795 return err; 799 return err;
796 } 800 }
797 801
798 /* Set the process memory policy */ 802 /* Set the process memory policy */
799 static long do_set_mempolicy(unsigned short mode, unsigned short flags, 803 static long do_set_mempolicy(unsigned short mode, unsigned short flags,
800 nodemask_t *nodes) 804 nodemask_t *nodes)
801 { 805 {
802 struct mempolicy *new, *old; 806 struct mempolicy *new, *old;
803 struct mm_struct *mm = current->mm; 807 struct mm_struct *mm = current->mm;
804 NODEMASK_SCRATCH(scratch); 808 NODEMASK_SCRATCH(scratch);
805 int ret; 809 int ret;
806 810
807 if (!scratch) 811 if (!scratch)
808 return -ENOMEM; 812 return -ENOMEM;
809 813
810 new = mpol_new(mode, flags, nodes); 814 new = mpol_new(mode, flags, nodes);
811 if (IS_ERR(new)) { 815 if (IS_ERR(new)) {
812 ret = PTR_ERR(new); 816 ret = PTR_ERR(new);
813 goto out; 817 goto out;
814 } 818 }
815 /* 819 /*
816 * prevent changing our mempolicy while show_numa_maps() 820 * prevent changing our mempolicy while show_numa_maps()
817 * is using it. 821 * is using it.
818 * Note: do_set_mempolicy() can be called at init time 822 * Note: do_set_mempolicy() can be called at init time
819 * with no 'mm'. 823 * with no 'mm'.
820 */ 824 */
821 if (mm) 825 if (mm)
822 down_write(&mm->mmap_sem); 826 down_write(&mm->mmap_sem);
823 task_lock(current); 827 task_lock(current);
824 ret = mpol_set_nodemask(new, nodes, scratch); 828 ret = mpol_set_nodemask(new, nodes, scratch);
825 if (ret) { 829 if (ret) {
826 task_unlock(current); 830 task_unlock(current);
827 if (mm) 831 if (mm)
828 up_write(&mm->mmap_sem); 832 up_write(&mm->mmap_sem);
829 mpol_put(new); 833 mpol_put(new);
830 goto out; 834 goto out;
831 } 835 }
832 old = current->mempolicy; 836 old = current->mempolicy;
833 current->mempolicy = new; 837 current->mempolicy = new;
834 if (new && new->mode == MPOL_INTERLEAVE && 838 if (new && new->mode == MPOL_INTERLEAVE &&
835 nodes_weight(new->v.nodes)) 839 nodes_weight(new->v.nodes))
836 current->il_next = first_node(new->v.nodes); 840 current->il_next = first_node(new->v.nodes);
837 task_unlock(current); 841 task_unlock(current);
838 if (mm) 842 if (mm)
839 up_write(&mm->mmap_sem); 843 up_write(&mm->mmap_sem);
840 844
841 mpol_put(old); 845 mpol_put(old);
842 ret = 0; 846 ret = 0;
843 out: 847 out:
844 NODEMASK_SCRATCH_FREE(scratch); 848 NODEMASK_SCRATCH_FREE(scratch);
845 return ret; 849 return ret;
846 } 850 }
847 851
848 /* 852 /*
849 * Return nodemask for policy for get_mempolicy() query 853 * Return nodemask for policy for get_mempolicy() query
850 * 854 *
851 * Called with task's alloc_lock held 855 * Called with task's alloc_lock held
852 */ 856 */
853 static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes) 857 static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes)
854 { 858 {
855 nodes_clear(*nodes); 859 nodes_clear(*nodes);
856 if (p == &default_policy) 860 if (p == &default_policy)
857 return; 861 return;
858 862
859 switch (p->mode) { 863 switch (p->mode) {
860 case MPOL_BIND: 864 case MPOL_BIND:
861 /* Fall through */ 865 /* Fall through */
862 case MPOL_INTERLEAVE: 866 case MPOL_INTERLEAVE:
863 *nodes = p->v.nodes; 867 *nodes = p->v.nodes;
864 break; 868 break;
865 case MPOL_PREFERRED: 869 case MPOL_PREFERRED:
866 if (!(p->flags & MPOL_F_LOCAL)) 870 if (!(p->flags & MPOL_F_LOCAL))
867 node_set(p->v.preferred_node, *nodes); 871 node_set(p->v.preferred_node, *nodes);
868 /* else return empty node mask for local allocation */ 872 /* else return empty node mask for local allocation */
869 break; 873 break;
870 default: 874 default:
871 BUG(); 875 BUG();
872 } 876 }
873 } 877 }
874 878
875 static int lookup_node(struct mm_struct *mm, unsigned long addr) 879 static int lookup_node(struct mm_struct *mm, unsigned long addr)
876 { 880 {
877 struct page *p; 881 struct page *p;
878 int err; 882 int err;
879 883
880 err = get_user_pages(current, mm, addr & PAGE_MASK, 1, 0, 0, &p, NULL); 884 err = get_user_pages(current, mm, addr & PAGE_MASK, 1, 0, 0, &p, NULL);
881 if (err >= 0) { 885 if (err >= 0) {
882 err = page_to_nid(p); 886 err = page_to_nid(p);
883 put_page(p); 887 put_page(p);
884 } 888 }
885 return err; 889 return err;
886 } 890 }
887 891
888 /* Retrieve NUMA policy */ 892 /* Retrieve NUMA policy */
889 static long do_get_mempolicy(int *policy, nodemask_t *nmask, 893 static long do_get_mempolicy(int *policy, nodemask_t *nmask,
890 unsigned long addr, unsigned long flags) 894 unsigned long addr, unsigned long flags)
891 { 895 {
892 int err; 896 int err;
893 struct mm_struct *mm = current->mm; 897 struct mm_struct *mm = current->mm;
894 struct vm_area_struct *vma = NULL; 898 struct vm_area_struct *vma = NULL;
895 struct mempolicy *pol = current->mempolicy; 899 struct mempolicy *pol = current->mempolicy;
896 900
897 if (flags & 901 if (flags &
898 ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED)) 902 ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
899 return -EINVAL; 903 return -EINVAL;
900 904
901 if (flags & MPOL_F_MEMS_ALLOWED) { 905 if (flags & MPOL_F_MEMS_ALLOWED) {
902 if (flags & (MPOL_F_NODE|MPOL_F_ADDR)) 906 if (flags & (MPOL_F_NODE|MPOL_F_ADDR))
903 return -EINVAL; 907 return -EINVAL;
904 *policy = 0; /* just so it's initialized */ 908 *policy = 0; /* just so it's initialized */
905 task_lock(current); 909 task_lock(current);
906 *nmask = cpuset_current_mems_allowed; 910 *nmask = cpuset_current_mems_allowed;
907 task_unlock(current); 911 task_unlock(current);
908 return 0; 912 return 0;
909 } 913 }
910 914
911 if (flags & MPOL_F_ADDR) { 915 if (flags & MPOL_F_ADDR) {
912 /* 916 /*
913 * Do NOT fall back to task policy if the 917 * Do NOT fall back to task policy if the
914 * vma/shared policy at addr is NULL. We 918 * vma/shared policy at addr is NULL. We
915 * want to return MPOL_DEFAULT in this case. 919 * want to return MPOL_DEFAULT in this case.
916 */ 920 */
917 down_read(&mm->mmap_sem); 921 down_read(&mm->mmap_sem);
918 vma = find_vma_intersection(mm, addr, addr+1); 922 vma = find_vma_intersection(mm, addr, addr+1);
919 if (!vma) { 923 if (!vma) {
920 up_read(&mm->mmap_sem); 924 up_read(&mm->mmap_sem);
921 return -EFAULT; 925 return -EFAULT;
922 } 926 }
923 if (vma->vm_ops && vma->vm_ops->get_policy) 927 if (vma->vm_ops && vma->vm_ops->get_policy)
924 pol = vma->vm_ops->get_policy(vma, addr); 928 pol = vma->vm_ops->get_policy(vma, addr);
925 else 929 else
926 pol = vma->vm_policy; 930 pol = vma->vm_policy;
927 } else if (addr) 931 } else if (addr)
928 return -EINVAL; 932 return -EINVAL;
929 933
930 if (!pol) 934 if (!pol)
931 pol = &default_policy; /* indicates default behavior */ 935 pol = &default_policy; /* indicates default behavior */
932 936
933 if (flags & MPOL_F_NODE) { 937 if (flags & MPOL_F_NODE) {
934 if (flags & MPOL_F_ADDR) { 938 if (flags & MPOL_F_ADDR) {
935 err = lookup_node(mm, addr); 939 err = lookup_node(mm, addr);
936 if (err < 0) 940 if (err < 0)
937 goto out; 941 goto out;
938 *policy = err; 942 *policy = err;
939 } else if (pol == current->mempolicy && 943 } else if (pol == current->mempolicy &&
940 pol->mode == MPOL_INTERLEAVE) { 944 pol->mode == MPOL_INTERLEAVE) {
941 *policy = current->il_next; 945 *policy = current->il_next;
942 } else { 946 } else {
943 err = -EINVAL; 947 err = -EINVAL;
944 goto out; 948 goto out;
945 } 949 }
946 } else { 950 } else {
947 *policy = pol == &default_policy ? MPOL_DEFAULT : 951 *policy = pol == &default_policy ? MPOL_DEFAULT :
948 pol->mode; 952 pol->mode;
949 /* 953 /*
950 * Internal mempolicy flags must be masked off before exposing 954 * Internal mempolicy flags must be masked off before exposing
951 * the policy to userspace. 955 * the policy to userspace.
952 */ 956 */
953 *policy |= (pol->flags & MPOL_MODE_FLAGS); 957 *policy |= (pol->flags & MPOL_MODE_FLAGS);
954 } 958 }
955 959
956 if (vma) { 960 if (vma) {
957 up_read(&current->mm->mmap_sem); 961 up_read(&current->mm->mmap_sem);
958 vma = NULL; 962 vma = NULL;
959 } 963 }
960 964
961 err = 0; 965 err = 0;
962 if (nmask) { 966 if (nmask) {
963 if (mpol_store_user_nodemask(pol)) { 967 if (mpol_store_user_nodemask(pol)) {
964 *nmask = pol->w.user_nodemask; 968 *nmask = pol->w.user_nodemask;
965 } else { 969 } else {
966 task_lock(current); 970 task_lock(current);
967 get_policy_nodemask(pol, nmask); 971 get_policy_nodemask(pol, nmask);
968 task_unlock(current); 972 task_unlock(current);
969 } 973 }
970 } 974 }
971 975
972 out: 976 out:
973 mpol_cond_put(pol); 977 mpol_cond_put(pol);
974 if (vma) 978 if (vma)
975 up_read(&current->mm->mmap_sem); 979 up_read(&current->mm->mmap_sem);
976 return err; 980 return err;
977 } 981 }
978 982
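do_get_mempolicy() above is what backs the get_mempolicy(2) system call. A minimal user-space sketch (using the libnuma <numaif.h> wrappers, link with -lnuma) that asks which node backs a freshly touched page:

#include <numaif.h>
#include <stdio.h>
#include <stdlib.h>

int main(void)
{
	int node = -1;
	char *p = malloc(4096);

	if (!p)
		return 1;
	p[0] = 1;	/* touch the page so it is actually allocated */

	/* MPOL_F_NODE | MPOL_F_ADDR: return the node holding 'addr' */
	if (get_mempolicy(&node, NULL, 0, p, MPOL_F_NODE | MPOL_F_ADDR))
		perror("get_mempolicy");
	else
		printf("page at %p sits on node %d\n", (void *)p, node);

	free(p);
	return 0;
}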
979 #ifdef CONFIG_MIGRATION 983 #ifdef CONFIG_MIGRATION
980 /* 984 /*
981 * page migration 985 * page migration
982 */ 986 */
983 static void migrate_page_add(struct page *page, struct list_head *pagelist, 987 static void migrate_page_add(struct page *page, struct list_head *pagelist,
984 unsigned long flags) 988 unsigned long flags)
985 { 989 {
986 /* 990 /*
987 * Avoid migrating a page that is shared with others. 991 * Avoid migrating a page that is shared with others.
988 */ 992 */
989 if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1) { 993 if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1) {
990 if (!isolate_lru_page(page)) { 994 if (!isolate_lru_page(page)) {
991 list_add_tail(&page->lru, pagelist); 995 list_add_tail(&page->lru, pagelist);
992 inc_zone_page_state(page, NR_ISOLATED_ANON + 996 inc_zone_page_state(page, NR_ISOLATED_ANON +
993 page_is_file_cache(page)); 997 page_is_file_cache(page));
994 } 998 }
995 } 999 }
996 } 1000 }
997 1001
998 static struct page *new_node_page(struct page *page, unsigned long node, int **x) 1002 static struct page *new_node_page(struct page *page, unsigned long node, int **x)
999 { 1003 {
1000 if (PageHuge(page)) 1004 if (PageHuge(page))
1001 return alloc_huge_page_node(page_hstate(compound_head(page)), 1005 return alloc_huge_page_node(page_hstate(compound_head(page)),
1002 node); 1006 node);
1003 else 1007 else
1004 return alloc_pages_exact_node(node, GFP_HIGHUSER_MOVABLE, 0); 1008 return alloc_pages_exact_node(node, GFP_HIGHUSER_MOVABLE, 0);
1005 } 1009 }
1006 1010
1007 /* 1011 /*
1008 * Migrate pages from one node to a target node. 1012 * Migrate pages from one node to a target node.
1009 * Returns error or the number of pages not migrated. 1013 * Returns error or the number of pages not migrated.
1010 */ 1014 */
1011 static int migrate_to_node(struct mm_struct *mm, int source, int dest, 1015 static int migrate_to_node(struct mm_struct *mm, int source, int dest,
1012 int flags) 1016 int flags)
1013 { 1017 {
1014 nodemask_t nmask; 1018 nodemask_t nmask;
1015 LIST_HEAD(pagelist); 1019 LIST_HEAD(pagelist);
1016 int err = 0; 1020 int err = 0;
1017 1021
1018 nodes_clear(nmask); 1022 nodes_clear(nmask);
1019 node_set(source, nmask); 1023 node_set(source, nmask);
1020 1024
1021 /* 1025 /*
1022 * This does not "check" the range but isolates all pages that 1026 * This does not "check" the range but isolates all pages that
1023 * need migration. Between passing in the full user address 1027 * need migration. Between passing in the full user address
1024 * space range and MPOL_MF_DISCONTIG_OK, this call can not fail. 1028 * space range and MPOL_MF_DISCONTIG_OK, this call can not fail.
1025 */ 1029 */
1026 VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))); 1030 VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)));
1027 queue_pages_range(mm, mm->mmap->vm_start, mm->task_size, &nmask, 1031 queue_pages_range(mm, mm->mmap->vm_start, mm->task_size, &nmask,
1028 flags | MPOL_MF_DISCONTIG_OK, &pagelist); 1032 flags | MPOL_MF_DISCONTIG_OK, &pagelist);
1029 1033
1030 if (!list_empty(&pagelist)) { 1034 if (!list_empty(&pagelist)) {
1031 err = migrate_pages(&pagelist, new_node_page, dest, 1035 err = migrate_pages(&pagelist, new_node_page, dest,
1032 MIGRATE_SYNC, MR_SYSCALL); 1036 MIGRATE_SYNC, MR_SYSCALL);
1033 if (err) 1037 if (err)
1034 putback_movable_pages(&pagelist); 1038 putback_movable_pages(&pagelist);
1035 } 1039 }
1036 1040
1037 return err; 1041 return err;
1038 } 1042 }
1039 1043
1040 /* 1044 /*
1041 * Move pages between the two nodesets so as to preserve the physical 1045 * Move pages between the two nodesets so as to preserve the physical
1042 * layout as much as possible. 1046 * layout as much as possible.
1043 * 1047 *
1044 * Returns the number of pages that could not be moved. 1048 * Returns the number of pages that could not be moved.
1045 */ 1049 */
1046 int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from, 1050 int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
1047 const nodemask_t *to, int flags) 1051 const nodemask_t *to, int flags)
1048 { 1052 {
1049 int busy = 0; 1053 int busy = 0;
1050 int err; 1054 int err;
1051 nodemask_t tmp; 1055 nodemask_t tmp;
1052 1056
1053 err = migrate_prep(); 1057 err = migrate_prep();
1054 if (err) 1058 if (err)
1055 return err; 1059 return err;
1056 1060
1057 down_read(&mm->mmap_sem); 1061 down_read(&mm->mmap_sem);
1058 1062
1059 err = migrate_vmas(mm, from, to, flags); 1063 err = migrate_vmas(mm, from, to, flags);
1060 if (err) 1064 if (err)
1061 goto out; 1065 goto out;
1062 1066
1063 /* 1067 /*
1064 * Find a 'source' bit set in 'tmp' whose corresponding 'dest' 1068 * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
1065 * bit in 'to' is not also set in 'tmp'. Clear the found 'source' 1069 * bit in 'to' is not also set in 'tmp'. Clear the found 'source'
1066 * bit in 'tmp', and return that <source, dest> pair for migration. 1070 * bit in 'tmp', and return that <source, dest> pair for migration.
1067 * The pair of nodemasks 'to' and 'from' define the map. 1071 * The pair of nodemasks 'to' and 'from' define the map.
1068 * 1072 *
1069 * If no pair of bits is found that way, fallback to picking some 1073 * If no pair of bits is found that way, fallback to picking some
1070 * pair of 'source' and 'dest' bits that are not the same. If the 1074 * pair of 'source' and 'dest' bits that are not the same. If the
1071 * 'source' and 'dest' bits are the same, this represents a node 1075 * 'source' and 'dest' bits are the same, this represents a node
1072 * that will be migrating to itself, so no pages need move. 1076 * that will be migrating to itself, so no pages need move.
1073 * 1077 *
1074 * If no bits are left in 'tmp', or if all remaining bits left 1078 * If no bits are left in 'tmp', or if all remaining bits left
1075 * in 'tmp' correspond to the same bit in 'to', return false 1079 * in 'tmp' correspond to the same bit in 'to', return false
1076 * (nothing left to migrate). 1080 * (nothing left to migrate).
1077 * 1081 *
1078 * This lets us pick a pair of nodes to migrate between, such that 1082 * This lets us pick a pair of nodes to migrate between, such that
1079 * if possible the dest node is not already occupied by some other 1083 * if possible the dest node is not already occupied by some other
1080 * source node, minimizing the risk of overloading the memory on a 1084 * source node, minimizing the risk of overloading the memory on a
1081 * node that would happen if we migrated incoming memory to a node 1085 * node that would happen if we migrated incoming memory to a node
1082 * before migrating outgoing memory sourced from that same node. 1086 * before migrating outgoing memory sourced from that same node.
1083 * 1087 *
1084 * A single scan of tmp is sufficient. As we go, we remember the 1088 * A single scan of tmp is sufficient. As we go, we remember the
1085 * most recent <s, d> pair that moved (s != d). If we find a pair 1089 * most recent <s, d> pair that moved (s != d). If we find a pair
1086 * that not only moved, but what's better, moved to an empty slot 1090 * that not only moved, but what's better, moved to an empty slot
1087 * (d is not set in tmp), then we break out then, with that pair. 1091 * (d is not set in tmp), then we break out then, with that pair.
1088 * Otherwise when we finish scanning from_tmp, we at least have the 1092 * Otherwise when we finish scanning from_tmp, we at least have the
1089 * most recent <s, d> pair that moved. If we get all the way through 1093 * most recent <s, d> pair that moved. If we get all the way through
1090 * the scan of tmp without finding any node that moved, much less 1094 * the scan of tmp without finding any node that moved, much less
1091 * moved to an empty node, then there is nothing left worth migrating. 1095 * moved to an empty node, then there is nothing left worth migrating.
1092 */ 1096 */
1093 1097
1094 tmp = *from; 1098 tmp = *from;
1095 while (!nodes_empty(tmp)) { 1099 while (!nodes_empty(tmp)) {
1096 int s,d; 1100 int s,d;
1097 int source = NUMA_NO_NODE; 1101 int source = NUMA_NO_NODE;
1098 int dest = 0; 1102 int dest = 0;
1099 1103
1100 for_each_node_mask(s, tmp) { 1104 for_each_node_mask(s, tmp) {
1101 1105
1102 /* 1106 /*
1103 * do_migrate_pages() tries to maintain the relative 1107 * do_migrate_pages() tries to maintain the relative
1104 * node relationship of the pages established between 1108 * node relationship of the pages established between
1105 * threads and memory areas. 1109 * threads and memory areas.
1106 * 1110 *
1107 * However, if the number of source nodes is not equal to 1111 * However, if the number of source nodes is not equal to
1108 * the number of destination nodes, we cannot preserve 1112 * the number of destination nodes, we cannot preserve
1109 * this node-relative relationship. In that case, skip 1113 * this node-relative relationship. In that case, skip
1110 * copying memory from a node that is in the destination 1114 * copying memory from a node that is in the destination
1111 * mask. 1115 * mask.
1112 * 1116 *
1113 * Example: [2,3,4] -> [3,4,5] moves everything. 1117 * Example: [2,3,4] -> [3,4,5] moves everything.
1114 * [0-7] -> [3,4,5] moves only 0,1,2,6,7. 1118 * [0-7] -> [3,4,5] moves only 0,1,2,6,7.
1115 */ 1119 */
1116 1120
1117 if ((nodes_weight(*from) != nodes_weight(*to)) && 1121 if ((nodes_weight(*from) != nodes_weight(*to)) &&
1118 (node_isset(s, *to))) 1122 (node_isset(s, *to)))
1119 continue; 1123 continue;
1120 1124
1121 d = node_remap(s, *from, *to); 1125 d = node_remap(s, *from, *to);
1122 if (s == d) 1126 if (s == d)
1123 continue; 1127 continue;
1124 1128
1125 source = s; /* Node moved. Memorize */ 1129 source = s; /* Node moved. Memorize */
1126 dest = d; 1130 dest = d;
1127 1131
1128 /* dest not in remaining from nodes? */ 1132 /* dest not in remaining from nodes? */
1129 if (!node_isset(dest, tmp)) 1133 if (!node_isset(dest, tmp))
1130 break; 1134 break;
1131 } 1135 }
1132 if (source == NUMA_NO_NODE) 1136 if (source == NUMA_NO_NODE)
1133 break; 1137 break;
1134 1138
1135 node_clear(source, tmp); 1139 node_clear(source, tmp);
1136 err = migrate_to_node(mm, source, dest, flags); 1140 err = migrate_to_node(mm, source, dest, flags);
1137 if (err > 0) 1141 if (err > 0)
1138 busy += err; 1142 busy += err;
1139 if (err < 0) 1143 if (err < 0)
1140 break; 1144 break;
1141 } 1145 }
1142 out: 1146 out:
1143 up_read(&mm->mmap_sem); 1147 up_read(&mm->mmap_sem);
1144 if (err < 0) 1148 if (err < 0)
1145 return err; 1149 return err;
1146 return busy; 1150 return busy;
1147 1151
1148 } 1152 }
1149 1153
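The pair-selection rule in the comment above is easiest to see with numbers. A small stand-alone sketch (plain bitmasks instead of nodemask_t, nothing kernel-specific) that reproduces the documented example [0-7] -> [3,4,5]:

#include <stdio.h>

int main(void)
{
	unsigned long from = 0xffUL;	/* source nodes 0-7        */
	unsigned long to   = 0x38UL;	/* destination nodes 3,4,5 */
	int wf = __builtin_popcountl(from);
	int wt = __builtin_popcountl(to);
	int s;

	for (s = 0; s < 8; s++) {
		if (!(from & (1UL << s)))
			continue;
		/* mirrors: weights differ && node_isset(s, *to) -> skip s */
		if (wf != wt && (to & (1UL << s)))
			continue;
		printf("node %d gets migrated\n", s);
	}
	return 0;	/* prints nodes 0, 1, 2, 6 and 7 */
}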
1150 /* 1154 /*
1151 * Allocate a new page for page migration based on vma policy. 1155 * Allocate a new page for page migration based on vma policy.
1152 * Start assuming that page is mapped by vma pointed to by @private. 1156 * Start assuming that page is mapped by vma pointed to by @private.
1153 * Search forward from there, if not. N.B., this assumes that the 1157 * Search forward from there, if not. N.B., this assumes that the
1154 * list of pages handed to migrate_pages()--which is how we get here-- 1158 * list of pages handed to migrate_pages()--which is how we get here--
1155 * is in virtual address order. 1159 * is in virtual address order.
1156 */ 1160 */
1157 static struct page *new_vma_page(struct page *page, unsigned long private, int **x) 1161 static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
1158 { 1162 {
1159 struct vm_area_struct *vma = (struct vm_area_struct *)private; 1163 struct vm_area_struct *vma = (struct vm_area_struct *)private;
1160 unsigned long uninitialized_var(address); 1164 unsigned long uninitialized_var(address);
1161 1165
1162 while (vma) { 1166 while (vma) {
1163 address = page_address_in_vma(page, vma); 1167 address = page_address_in_vma(page, vma);
1164 if (address != -EFAULT) 1168 if (address != -EFAULT)
1165 break; 1169 break;
1166 vma = vma->vm_next; 1170 vma = vma->vm_next;
1167 } 1171 }
1168 1172
1169 if (PageHuge(page)) { 1173 if (PageHuge(page)) {
1170 BUG_ON(!vma); 1174 BUG_ON(!vma);
1171 return alloc_huge_page_noerr(vma, address, 1); 1175 return alloc_huge_page_noerr(vma, address, 1);
1172 } 1176 }
1173 /* 1177 /*
1174 * if !vma, alloc_page_vma() will use task or system default policy 1178 * if !vma, alloc_page_vma() will use task or system default policy
1175 */ 1179 */
1176 return alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address); 1180 return alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
1177 } 1181 }
1178 #else 1182 #else
1179 1183
1180 static void migrate_page_add(struct page *page, struct list_head *pagelist, 1184 static void migrate_page_add(struct page *page, struct list_head *pagelist,
1181 unsigned long flags) 1185 unsigned long flags)
1182 { 1186 {
1183 } 1187 }
1184 1188
1185 int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from, 1189 int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
1186 const nodemask_t *to, int flags) 1190 const nodemask_t *to, int flags)
1187 { 1191 {
1188 return -ENOSYS; 1192 return -ENOSYS;
1189 } 1193 }
1190 1194
1191 static struct page *new_vma_page(struct page *page, unsigned long private, int **x) 1195 static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
1192 { 1196 {
1193 return NULL; 1197 return NULL;
1194 } 1198 }
1195 #endif 1199 #endif
1196 1200
1197 static long do_mbind(unsigned long start, unsigned long len, 1201 static long do_mbind(unsigned long start, unsigned long len,
1198 unsigned short mode, unsigned short mode_flags, 1202 unsigned short mode, unsigned short mode_flags,
1199 nodemask_t *nmask, unsigned long flags) 1203 nodemask_t *nmask, unsigned long flags)
1200 { 1204 {
1201 struct vm_area_struct *vma; 1205 struct vm_area_struct *vma;
1202 struct mm_struct *mm = current->mm; 1206 struct mm_struct *mm = current->mm;
1203 struct mempolicy *new; 1207 struct mempolicy *new;
1204 unsigned long end; 1208 unsigned long end;
1205 int err; 1209 int err;
1206 LIST_HEAD(pagelist); 1210 LIST_HEAD(pagelist);
1207 1211
1208 if (flags & ~(unsigned long)MPOL_MF_VALID) 1212 if (flags & ~(unsigned long)MPOL_MF_VALID)
1209 return -EINVAL; 1213 return -EINVAL;
1210 if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE)) 1214 if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
1211 return -EPERM; 1215 return -EPERM;
1212 1216
1213 if (start & ~PAGE_MASK) 1217 if (start & ~PAGE_MASK)
1214 return -EINVAL; 1218 return -EINVAL;
1215 1219
1216 if (mode == MPOL_DEFAULT) 1220 if (mode == MPOL_DEFAULT)
1217 flags &= ~MPOL_MF_STRICT; 1221 flags &= ~MPOL_MF_STRICT;
1218 1222
1219 len = (len + PAGE_SIZE - 1) & PAGE_MASK; 1223 len = (len + PAGE_SIZE - 1) & PAGE_MASK;
1220 end = start + len; 1224 end = start + len;
1221 1225
1222 if (end < start) 1226 if (end < start)
1223 return -EINVAL; 1227 return -EINVAL;
1224 if (end == start) 1228 if (end == start)
1225 return 0; 1229 return 0;
1226 1230
1227 new = mpol_new(mode, mode_flags, nmask); 1231 new = mpol_new(mode, mode_flags, nmask);
1228 if (IS_ERR(new)) 1232 if (IS_ERR(new))
1229 return PTR_ERR(new); 1233 return PTR_ERR(new);
1230 1234
1231 if (flags & MPOL_MF_LAZY) 1235 if (flags & MPOL_MF_LAZY)
1232 new->flags |= MPOL_F_MOF; 1236 new->flags |= MPOL_F_MOF;
1233 1237
1234 /* 1238 /*
1235 * If we are using the default policy then operation 1239 * If we are using the default policy then operation
1236 * on discontinuous address spaces is okay after all 1240 * on discontinuous address spaces is okay after all
1237 */ 1241 */
1238 if (!new) 1242 if (!new)
1239 flags |= MPOL_MF_DISCONTIG_OK; 1243 flags |= MPOL_MF_DISCONTIG_OK;
1240 1244
1241 pr_debug("mbind %lx-%lx mode:%d flags:%d nodes:%lx\n", 1245 pr_debug("mbind %lx-%lx mode:%d flags:%d nodes:%lx\n",
1242 start, start + len, mode, mode_flags, 1246 start, start + len, mode, mode_flags,
1243 nmask ? nodes_addr(*nmask)[0] : NUMA_NO_NODE); 1247 nmask ? nodes_addr(*nmask)[0] : NUMA_NO_NODE);
1244 1248
1245 if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) { 1249 if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
1246 1250
1247 err = migrate_prep(); 1251 err = migrate_prep();
1248 if (err) 1252 if (err)
1249 goto mpol_out; 1253 goto mpol_out;
1250 } 1254 }
1251 { 1255 {
1252 NODEMASK_SCRATCH(scratch); 1256 NODEMASK_SCRATCH(scratch);
1253 if (scratch) { 1257 if (scratch) {
1254 down_write(&mm->mmap_sem); 1258 down_write(&mm->mmap_sem);
1255 task_lock(current); 1259 task_lock(current);
1256 err = mpol_set_nodemask(new, nmask, scratch); 1260 err = mpol_set_nodemask(new, nmask, scratch);
1257 task_unlock(current); 1261 task_unlock(current);
1258 if (err) 1262 if (err)
1259 up_write(&mm->mmap_sem); 1263 up_write(&mm->mmap_sem);
1260 } else 1264 } else
1261 err = -ENOMEM; 1265 err = -ENOMEM;
1262 NODEMASK_SCRATCH_FREE(scratch); 1266 NODEMASK_SCRATCH_FREE(scratch);
1263 } 1267 }
1264 if (err) 1268 if (err)
1265 goto mpol_out; 1269 goto mpol_out;
1266 1270
1267 vma = queue_pages_range(mm, start, end, nmask, 1271 vma = queue_pages_range(mm, start, end, nmask,
1268 flags | MPOL_MF_INVERT, &pagelist); 1272 flags | MPOL_MF_INVERT, &pagelist);
1269 1273
1270 err = PTR_ERR(vma); /* maybe ... */ 1274 err = PTR_ERR(vma); /* maybe ... */
1271 if (!IS_ERR(vma)) 1275 if (!IS_ERR(vma))
1272 err = mbind_range(mm, start, end, new); 1276 err = mbind_range(mm, start, end, new);
1273 1277
1274 if (!err) { 1278 if (!err) {
1275 int nr_failed = 0; 1279 int nr_failed = 0;
1276 1280
1277 if (!list_empty(&pagelist)) { 1281 if (!list_empty(&pagelist)) {
1278 WARN_ON_ONCE(flags & MPOL_MF_LAZY); 1282 WARN_ON_ONCE(flags & MPOL_MF_LAZY);
1279 nr_failed = migrate_pages(&pagelist, new_vma_page, 1283 nr_failed = migrate_pages(&pagelist, new_vma_page,
1280 (unsigned long)vma, 1284 (unsigned long)vma,
1281 MIGRATE_SYNC, MR_MEMPOLICY_MBIND); 1285 MIGRATE_SYNC, MR_MEMPOLICY_MBIND);
1282 if (nr_failed) 1286 if (nr_failed)
1283 putback_movable_pages(&pagelist); 1287 putback_movable_pages(&pagelist);
1284 } 1288 }
1285 1289
1286 if (nr_failed && (flags & MPOL_MF_STRICT)) 1290 if (nr_failed && (flags & MPOL_MF_STRICT))
1287 err = -EIO; 1291 err = -EIO;
1288 } else 1292 } else
1289 putback_movable_pages(&pagelist); 1293 putback_movable_pages(&pagelist);
1290 1294
1291 up_write(&mm->mmap_sem); 1295 up_write(&mm->mmap_sem);
1292 mpol_out: 1296 mpol_out:
1293 mpol_put(new); 1297 mpol_put(new);
1294 return err; 1298 return err;
1295 } 1299 }
1296 1300
1297 /* 1301 /*
1298 * User space interface with variable sized bitmaps for nodelists. 1302 * User space interface with variable sized bitmaps for nodelists.
1299 */ 1303 */
1300 1304
1301 /* Copy a node mask from user space. */ 1305 /* Copy a node mask from user space. */
1302 static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask, 1306 static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
1303 unsigned long maxnode) 1307 unsigned long maxnode)
1304 { 1308 {
1305 unsigned long k; 1309 unsigned long k;
1306 unsigned long nlongs; 1310 unsigned long nlongs;
1307 unsigned long endmask; 1311 unsigned long endmask;
1308 1312
1309 --maxnode; 1313 --maxnode;
1310 nodes_clear(*nodes); 1314 nodes_clear(*nodes);
1311 if (maxnode == 0 || !nmask) 1315 if (maxnode == 0 || !nmask)
1312 return 0; 1316 return 0;
1313 if (maxnode > PAGE_SIZE*BITS_PER_BYTE) 1317 if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
1314 return -EINVAL; 1318 return -EINVAL;
1315 1319
1316 nlongs = BITS_TO_LONGS(maxnode); 1320 nlongs = BITS_TO_LONGS(maxnode);
1317 if ((maxnode % BITS_PER_LONG) == 0) 1321 if ((maxnode % BITS_PER_LONG) == 0)
1318 endmask = ~0UL; 1322 endmask = ~0UL;
1319 else 1323 else
1320 endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1; 1324 endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
1321 1325
1322 /* When the user specifies more nodes than supported, just check 1326 /* When the user specifies more nodes than supported, just check
1323 that the unsupported part is all zero. */ 1327 that the unsupported part is all zero. */
1324 if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) { 1328 if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
1325 if (nlongs > PAGE_SIZE/sizeof(long)) 1329 if (nlongs > PAGE_SIZE/sizeof(long))
1326 return -EINVAL; 1330 return -EINVAL;
1327 for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) { 1331 for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
1328 unsigned long t; 1332 unsigned long t;
1329 if (get_user(t, nmask + k)) 1333 if (get_user(t, nmask + k))
1330 return -EFAULT; 1334 return -EFAULT;
1331 if (k == nlongs - 1) { 1335 if (k == nlongs - 1) {
1332 if (t & endmask) 1336 if (t & endmask)
1333 return -EINVAL; 1337 return -EINVAL;
1334 } else if (t) 1338 } else if (t)
1335 return -EINVAL; 1339 return -EINVAL;
1336 } 1340 }
1337 nlongs = BITS_TO_LONGS(MAX_NUMNODES); 1341 nlongs = BITS_TO_LONGS(MAX_NUMNODES);
1338 endmask = ~0UL; 1342 endmask = ~0UL;
1339 } 1343 }
1340 1344
1341 if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long))) 1345 if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
1342 return -EFAULT; 1346 return -EFAULT;
1343 nodes_addr(*nodes)[nlongs-1] &= endmask; 1347 nodes_addr(*nodes)[nlongs-1] &= endmask;
1344 return 0; 1348 return 0;
1345 } 1349 }
1346 1350
1347 /* Copy a kernel node mask to user space */ 1351 /* Copy a kernel node mask to user space */
1348 static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode, 1352 static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
1349 nodemask_t *nodes) 1353 nodemask_t *nodes)
1350 { 1354 {
1351 unsigned long copy = ALIGN(maxnode-1, 64) / 8; 1355 unsigned long copy = ALIGN(maxnode-1, 64) / 8;
1352 const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long); 1356 const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long);
1353 1357
1354 if (copy > nbytes) { 1358 if (copy > nbytes) {
1355 if (copy > PAGE_SIZE) 1359 if (copy > PAGE_SIZE)
1356 return -EINVAL; 1360 return -EINVAL;
1357 if (clear_user((char __user *)mask + nbytes, copy - nbytes)) 1361 if (clear_user((char __user *)mask + nbytes, copy - nbytes))
1358 return -EFAULT; 1362 return -EFAULT;
1359 copy = nbytes; 1363 copy = nbytes;
1360 } 1364 }
1361 return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0; 1365 return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
1362 } 1366 }
1363 1367
1364 SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len, 1368 SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len,
1365 unsigned long, mode, unsigned long __user *, nmask, 1369 unsigned long, mode, unsigned long __user *, nmask,
1366 unsigned long, maxnode, unsigned, flags) 1370 unsigned long, maxnode, unsigned, flags)
1367 { 1371 {
1368 nodemask_t nodes; 1372 nodemask_t nodes;
1369 int err; 1373 int err;
1370 unsigned short mode_flags; 1374 unsigned short mode_flags;
1371 1375
1372 mode_flags = mode & MPOL_MODE_FLAGS; 1376 mode_flags = mode & MPOL_MODE_FLAGS;
1373 mode &= ~MPOL_MODE_FLAGS; 1377 mode &= ~MPOL_MODE_FLAGS;
1374 if (mode >= MPOL_MAX) 1378 if (mode >= MPOL_MAX)
1375 return -EINVAL; 1379 return -EINVAL;
1376 if ((mode_flags & MPOL_F_STATIC_NODES) && 1380 if ((mode_flags & MPOL_F_STATIC_NODES) &&
1377 (mode_flags & MPOL_F_RELATIVE_NODES)) 1381 (mode_flags & MPOL_F_RELATIVE_NODES))
1378 return -EINVAL; 1382 return -EINVAL;
1379 err = get_nodes(&nodes, nmask, maxnode); 1383 err = get_nodes(&nodes, nmask, maxnode);
1380 if (err) 1384 if (err)
1381 return err; 1385 return err;
1382 return do_mbind(start, len, mode, mode_flags, &nodes, flags); 1386 return do_mbind(start, len, mode, mode_flags, &nodes, flags);
1383 } 1387 }
1384 1388
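get_nodes() above parses exactly the raw bitmap that user space hands to these system calls. A minimal sketch of the calling side of mbind(2) (using the libnuma <numaif.h> wrapper, link with -lnuma; assumes nodes 0 and 1 exist):

#include <numaif.h>
#include <stdio.h>
#include <sys/mman.h>

int main(void)
{
	size_t len = 2UL << 20;				/* 2 MB */
	unsigned long nodes = (1UL << 0) | (1UL << 1);	/* nodes {0,1} */
	void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (p == MAP_FAILED)
		return 1;
	/* bind the range to nodes 0-1 and move any misplaced pages */
	if (mbind(p, len, MPOL_BIND, &nodes, 8 * sizeof(nodes), MPOL_MF_MOVE))
		perror("mbind");
	return 0;
}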
1385 /* Set the process memory policy */ 1389 /* Set the process memory policy */
1386 SYSCALL_DEFINE3(set_mempolicy, int, mode, unsigned long __user *, nmask, 1390 SYSCALL_DEFINE3(set_mempolicy, int, mode, unsigned long __user *, nmask,
1387 unsigned long, maxnode) 1391 unsigned long, maxnode)
1388 { 1392 {
1389 int err; 1393 int err;
1390 nodemask_t nodes; 1394 nodemask_t nodes;
1391 unsigned short flags; 1395 unsigned short flags;
1392 1396
1393 flags = mode & MPOL_MODE_FLAGS; 1397 flags = mode & MPOL_MODE_FLAGS;
1394 mode &= ~MPOL_MODE_FLAGS; 1398 mode &= ~MPOL_MODE_FLAGS;
1395 if ((unsigned int)mode >= MPOL_MAX) 1399 if ((unsigned int)mode >= MPOL_MAX)
1396 return -EINVAL; 1400 return -EINVAL;
1397 if ((flags & MPOL_F_STATIC_NODES) && (flags & MPOL_F_RELATIVE_NODES)) 1401 if ((flags & MPOL_F_STATIC_NODES) && (flags & MPOL_F_RELATIVE_NODES))
1398 return -EINVAL; 1402 return -EINVAL;
1399 err = get_nodes(&nodes, nmask, maxnode); 1403 err = get_nodes(&nodes, nmask, maxnode);
1400 if (err) 1404 if (err)
1401 return err; 1405 return err;
1402 return do_set_mempolicy(mode, flags, &nodes); 1406 return do_set_mempolicy(mode, flags, &nodes);
1403 } 1407 }
1404 1408
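The task-wide counterpart, again via <numaif.h> (assumes at least two nodes): interleave all future allocations of the calling task across nodes 0 and 1, which lands in do_set_mempolicy() above and primes current->il_next:

#include <numaif.h>
#include <stdio.h>

int main(void)
{
	unsigned long nodes = (1UL << 0) | (1UL << 1);

	if (set_mempolicy(MPOL_INTERLEAVE, &nodes, 8 * sizeof(nodes)))
		perror("set_mempolicy");
	return 0;
}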
1405 SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode, 1409 SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
1406 const unsigned long __user *, old_nodes, 1410 const unsigned long __user *, old_nodes,
1407 const unsigned long __user *, new_nodes) 1411 const unsigned long __user *, new_nodes)
1408 { 1412 {
1409 const struct cred *cred = current_cred(), *tcred; 1413 const struct cred *cred = current_cred(), *tcred;
1410 struct mm_struct *mm = NULL; 1414 struct mm_struct *mm = NULL;
1411 struct task_struct *task; 1415 struct task_struct *task;
1412 nodemask_t task_nodes; 1416 nodemask_t task_nodes;
1413 int err; 1417 int err;
1414 nodemask_t *old; 1418 nodemask_t *old;
1415 nodemask_t *new; 1419 nodemask_t *new;
1416 NODEMASK_SCRATCH(scratch); 1420 NODEMASK_SCRATCH(scratch);
1417 1421
1418 if (!scratch) 1422 if (!scratch)
1419 return -ENOMEM; 1423 return -ENOMEM;
1420 1424
1421 old = &scratch->mask1; 1425 old = &scratch->mask1;
1422 new = &scratch->mask2; 1426 new = &scratch->mask2;
1423 1427
1424 err = get_nodes(old, old_nodes, maxnode); 1428 err = get_nodes(old, old_nodes, maxnode);
1425 if (err) 1429 if (err)
1426 goto out; 1430 goto out;
1427 1431
1428 err = get_nodes(new, new_nodes, maxnode); 1432 err = get_nodes(new, new_nodes, maxnode);
1429 if (err) 1433 if (err)
1430 goto out; 1434 goto out;
1431 1435
1432 /* Find the mm_struct */ 1436 /* Find the mm_struct */
1433 rcu_read_lock(); 1437 rcu_read_lock();
1434 task = pid ? find_task_by_vpid(pid) : current; 1438 task = pid ? find_task_by_vpid(pid) : current;
1435 if (!task) { 1439 if (!task) {
1436 rcu_read_unlock(); 1440 rcu_read_unlock();
1437 err = -ESRCH; 1441 err = -ESRCH;
1438 goto out; 1442 goto out;
1439 } 1443 }
1440 get_task_struct(task); 1444 get_task_struct(task);
1441 1445
1442 err = -EINVAL; 1446 err = -EINVAL;
1443 1447
1444 /* 1448 /*
1445 * Check if this process has the right to modify the specified 1449 * Check if this process has the right to modify the specified
1446 * process. The right exists if the process has administrative 1450 * process. The right exists if the process has administrative
1447 * capabilities, superuser privileges or the same 1451 * capabilities, superuser privileges or the same
1448 * userid as the target process. 1452 * userid as the target process.
1449 */ 1453 */
1450 tcred = __task_cred(task); 1454 tcred = __task_cred(task);
1451 if (!uid_eq(cred->euid, tcred->suid) && !uid_eq(cred->euid, tcred->uid) && 1455 if (!uid_eq(cred->euid, tcred->suid) && !uid_eq(cred->euid, tcred->uid) &&
1452 !uid_eq(cred->uid, tcred->suid) && !uid_eq(cred->uid, tcred->uid) && 1456 !uid_eq(cred->uid, tcred->suid) && !uid_eq(cred->uid, tcred->uid) &&
1453 !capable(CAP_SYS_NICE)) { 1457 !capable(CAP_SYS_NICE)) {
1454 rcu_read_unlock(); 1458 rcu_read_unlock();
1455 err = -EPERM; 1459 err = -EPERM;
1456 goto out_put; 1460 goto out_put;
1457 } 1461 }
1458 rcu_read_unlock(); 1462 rcu_read_unlock();
1459 1463
1460 task_nodes = cpuset_mems_allowed(task); 1464 task_nodes = cpuset_mems_allowed(task);
1461 /* Is the user allowed to access the target nodes? */ 1465 /* Is the user allowed to access the target nodes? */
1462 if (!nodes_subset(*new, task_nodes) && !capable(CAP_SYS_NICE)) { 1466 if (!nodes_subset(*new, task_nodes) && !capable(CAP_SYS_NICE)) {
1463 err = -EPERM; 1467 err = -EPERM;
1464 goto out_put; 1468 goto out_put;
1465 } 1469 }
1466 1470
1467 if (!nodes_subset(*new, node_states[N_MEMORY])) { 1471 if (!nodes_subset(*new, node_states[N_MEMORY])) {
1468 err = -EINVAL; 1472 err = -EINVAL;
1469 goto out_put; 1473 goto out_put;
1470 } 1474 }
1471 1475
1472 err = security_task_movememory(task); 1476 err = security_task_movememory(task);
1473 if (err) 1477 if (err)
1474 goto out_put; 1478 goto out_put;
1475 1479
1476 mm = get_task_mm(task); 1480 mm = get_task_mm(task);
1477 put_task_struct(task); 1481 put_task_struct(task);
1478 1482
1479 if (!mm) { 1483 if (!mm) {
1480 err = -EINVAL; 1484 err = -EINVAL;
1481 goto out; 1485 goto out;
1482 } 1486 }
1483 1487
1484 err = do_migrate_pages(mm, old, new, 1488 err = do_migrate_pages(mm, old, new,
1485 capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE); 1489 capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
1486 1490
1487 mmput(mm); 1491 mmput(mm);
1488 out: 1492 out:
1489 NODEMASK_SCRATCH_FREE(scratch); 1493 NODEMASK_SCRATCH_FREE(scratch);
1490 1494
1491 return err; 1495 return err;
1492 1496
1493 out_put: 1497 out_put:
1494 put_task_struct(task); 1498 put_task_struct(task);
1495 goto out; 1499 goto out;
1496 1500
1497 } 1501 }
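/*
 * Illustrative userspace sketch (not part of this file): driving the
 * migrate_pages() handler above to move a target process's pages from
 * node 0 to node 1.  Assumes libnuma's <numaif.h> wrapper (link with
 * -lnuma); the caller needs CAP_SYS_NICE or a matching uid, as checked
 * in the handler.
 */
#include <numaif.h>
#include <stdio.h>
#include <sys/types.h>

static int move_to_node1(pid_t pid)
{
	unsigned long old_nodes = 1UL << 0;	/* source: node 0 */
	unsigned long new_nodes = 1UL << 1;	/* destination: node 1 */
	long rc;

	/* maxnode counts bits; one unsigned long covers nodes 0..62 here */
	rc = migrate_pages(pid, 8 * sizeof(unsigned long),
			   &old_nodes, &new_nodes);
	if (rc < 0)
		perror("migrate_pages");
	return rc < 0 ? -1 : 0;
}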
1498 1502
1499 1503
1500 /* Retrieve NUMA policy */ 1504 /* Retrieve NUMA policy */
1501 SYSCALL_DEFINE5(get_mempolicy, int __user *, policy, 1505 SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
1502 unsigned long __user *, nmask, unsigned long, maxnode, 1506 unsigned long __user *, nmask, unsigned long, maxnode,
1503 unsigned long, addr, unsigned long, flags) 1507 unsigned long, addr, unsigned long, flags)
1504 { 1508 {
1505 int err; 1509 int err;
1506 int uninitialized_var(pval); 1510 int uninitialized_var(pval);
1507 nodemask_t nodes; 1511 nodemask_t nodes;
1508 1512
1509 if (nmask != NULL && maxnode < MAX_NUMNODES) 1513 if (nmask != NULL && maxnode < MAX_NUMNODES)
1510 return -EINVAL; 1514 return -EINVAL;
1511 1515
1512 err = do_get_mempolicy(&pval, &nodes, addr, flags); 1516 err = do_get_mempolicy(&pval, &nodes, addr, flags);
1513 1517
1514 if (err) 1518 if (err)
1515 return err; 1519 return err;
1516 1520
1517 if (policy && put_user(pval, policy)) 1521 if (policy && put_user(pval, policy))
1518 return -EFAULT; 1522 return -EFAULT;
1519 1523
1520 if (nmask) 1524 if (nmask)
1521 err = copy_nodes_to_user(nmask, maxnode, &nodes); 1525 err = copy_nodes_to_user(nmask, maxnode, &nodes);
1522 1526
1523 return err; 1527 return err;
1524 } 1528 }
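/*
 * Illustrative userspace sketch (not part of this file): setting an
 * interleave policy across nodes 0 and 1 and reading the mode back
 * through the set_mempolicy()/get_mempolicy() handlers above.  Assumes
 * libnuma's <numaif.h> wrappers (link with -lnuma).
 */
#include <numaif.h>
#include <stdio.h>

static int interleave_nodes_0_1(void)
{
	unsigned long nodes = (1UL << 0) | (1UL << 1);
	unsigned long maxnode = 8 * sizeof(unsigned long);
	int mode = -1;

	if (set_mempolicy(MPOL_INTERLEAVE, &nodes, maxnode) < 0) {
		perror("set_mempolicy");
		return -1;
	}
	/* NULL nmask and flags of 0 query the calling task's policy mode */
	if (get_mempolicy(&mode, NULL, 0, NULL, 0) < 0) {
		perror("get_mempolicy");
		return -1;
	}
	printf("interleave active: %d\n", mode == MPOL_INTERLEAVE);
	return 0;
}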
1525 1529
1526 #ifdef CONFIG_COMPAT 1530 #ifdef CONFIG_COMPAT
1527 1531
1528 COMPAT_SYSCALL_DEFINE5(get_mempolicy, int __user *, policy, 1532 COMPAT_SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
1529 compat_ulong_t __user *, nmask, 1533 compat_ulong_t __user *, nmask,
1530 compat_ulong_t, maxnode, 1534 compat_ulong_t, maxnode,
1531 compat_ulong_t, addr, compat_ulong_t, flags) 1535 compat_ulong_t, addr, compat_ulong_t, flags)
1532 { 1536 {
1533 long err; 1537 long err;
1534 unsigned long __user *nm = NULL; 1538 unsigned long __user *nm = NULL;
1535 unsigned long nr_bits, alloc_size; 1539 unsigned long nr_bits, alloc_size;
1536 DECLARE_BITMAP(bm, MAX_NUMNODES); 1540 DECLARE_BITMAP(bm, MAX_NUMNODES);
1537 1541
1538 nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES); 1542 nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1539 alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8; 1543 alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1540 1544
1541 if (nmask) 1545 if (nmask)
1542 nm = compat_alloc_user_space(alloc_size); 1546 nm = compat_alloc_user_space(alloc_size);
1543 1547
1544 err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags); 1548 err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags);
1545 1549
1546 if (!err && nmask) { 1550 if (!err && nmask) {
1547 unsigned long copy_size; 1551 unsigned long copy_size;
1548 copy_size = min_t(unsigned long, sizeof(bm), alloc_size); 1552 copy_size = min_t(unsigned long, sizeof(bm), alloc_size);
1549 err = copy_from_user(bm, nm, copy_size); 1553 err = copy_from_user(bm, nm, copy_size);
1550 /* ensure entire bitmap is zeroed */ 1554 /* ensure entire bitmap is zeroed */
1551 err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8); 1555 err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
1552 err |= compat_put_bitmap(nmask, bm, nr_bits); 1556 err |= compat_put_bitmap(nmask, bm, nr_bits);
1553 } 1557 }
1554 1558
1555 return err; 1559 return err;
1556 } 1560 }
1557 1561
1558 COMPAT_SYSCALL_DEFINE3(set_mempolicy, int, mode, compat_ulong_t __user *, nmask, 1562 COMPAT_SYSCALL_DEFINE3(set_mempolicy, int, mode, compat_ulong_t __user *, nmask,
1559 compat_ulong_t, maxnode) 1563 compat_ulong_t, maxnode)
1560 { 1564 {
1561 long err = 0; 1565 long err = 0;
1562 unsigned long __user *nm = NULL; 1566 unsigned long __user *nm = NULL;
1563 unsigned long nr_bits, alloc_size; 1567 unsigned long nr_bits, alloc_size;
1564 DECLARE_BITMAP(bm, MAX_NUMNODES); 1568 DECLARE_BITMAP(bm, MAX_NUMNODES);
1565 1569
1566 nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES); 1570 nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1567 alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8; 1571 alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1568 1572
1569 if (nmask) { 1573 if (nmask) {
1570 err = compat_get_bitmap(bm, nmask, nr_bits); 1574 err = compat_get_bitmap(bm, nmask, nr_bits);
1571 nm = compat_alloc_user_space(alloc_size); 1575 nm = compat_alloc_user_space(alloc_size);
1572 err |= copy_to_user(nm, bm, alloc_size); 1576 err |= copy_to_user(nm, bm, alloc_size);
1573 } 1577 }
1574 1578
1575 if (err) 1579 if (err)
1576 return -EFAULT; 1580 return -EFAULT;
1577 1581
1578 return sys_set_mempolicy(mode, nm, nr_bits+1); 1582 return sys_set_mempolicy(mode, nm, nr_bits+1);
1579 } 1583 }
1580 1584
1581 COMPAT_SYSCALL_DEFINE6(mbind, compat_ulong_t, start, compat_ulong_t, len, 1585 COMPAT_SYSCALL_DEFINE6(mbind, compat_ulong_t, start, compat_ulong_t, len,
1582 compat_ulong_t, mode, compat_ulong_t __user *, nmask, 1586 compat_ulong_t, mode, compat_ulong_t __user *, nmask,
1583 compat_ulong_t, maxnode, compat_ulong_t, flags) 1587 compat_ulong_t, maxnode, compat_ulong_t, flags)
1584 { 1588 {
1585 long err = 0; 1589 long err = 0;
1586 unsigned long __user *nm = NULL; 1590 unsigned long __user *nm = NULL;
1587 unsigned long nr_bits, alloc_size; 1591 unsigned long nr_bits, alloc_size;
1588 nodemask_t bm; 1592 nodemask_t bm;
1589 1593
1590 nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES); 1594 nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1591 alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8; 1595 alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1592 1596
1593 if (nmask) { 1597 if (nmask) {
1594 err = compat_get_bitmap(nodes_addr(bm), nmask, nr_bits); 1598 err = compat_get_bitmap(nodes_addr(bm), nmask, nr_bits);
1595 nm = compat_alloc_user_space(alloc_size); 1599 nm = compat_alloc_user_space(alloc_size);
1596 err |= copy_to_user(nm, nodes_addr(bm), alloc_size); 1600 err |= copy_to_user(nm, nodes_addr(bm), alloc_size);
1597 } 1601 }
1598 1602
1599 if (err) 1603 if (err)
1600 return -EFAULT; 1604 return -EFAULT;
1601 1605
1602 return sys_mbind(start, len, mode, nm, nr_bits+1, flags); 1606 return sys_mbind(start, len, mode, nm, nr_bits+1, flags);
1603 } 1607 }
1604 1608
1605 #endif 1609 #endif
1606 1610
1607 /* 1611 /*
1608 * get_vma_policy(@task, @vma, @addr) 1612 * get_vma_policy(@task, @vma, @addr)
1609 * @task - task for fallback if vma policy == default 1613 * @task - task for fallback if vma policy == default
1610 * @vma - virtual memory area whose policy is sought 1614 * @vma - virtual memory area whose policy is sought
1611 * @addr - address in @vma for shared policy lookup 1615 * @addr - address in @vma for shared policy lookup
1612 * 1616 *
1613 * Returns effective policy for a VMA at specified address. 1617 * Returns effective policy for a VMA at specified address.
1614 * Falls back to @task or system default policy, as necessary. 1618 * Falls back to @task or system default policy, as necessary.
1615 * Current or other task's task mempolicy and non-shared vma policies must be 1619 * Current or other task's task mempolicy and non-shared vma policies must be
1616 * protected by task_lock(task) by the caller. 1620 * protected by task_lock(task) by the caller.
1617 * Shared policies [those marked as MPOL_F_SHARED] require an extra reference 1621 * Shared policies [those marked as MPOL_F_SHARED] require an extra reference
1618 * count--added by the get_policy() vm_op, as appropriate--to protect against 1622 * count--added by the get_policy() vm_op, as appropriate--to protect against
1619 * freeing by another task. It is the caller's responsibility to free the 1623 * freeing by another task. It is the caller's responsibility to free the
1620 * extra reference for shared policies. 1624 * extra reference for shared policies.
1621 */ 1625 */
1622 struct mempolicy *get_vma_policy(struct task_struct *task, 1626 struct mempolicy *get_vma_policy(struct task_struct *task,
1623 struct vm_area_struct *vma, unsigned long addr) 1627 struct vm_area_struct *vma, unsigned long addr)
1624 { 1628 {
1625 struct mempolicy *pol = get_task_policy(task); 1629 struct mempolicy *pol = get_task_policy(task);
1626 1630
1627 if (vma) { 1631 if (vma) {
1628 if (vma->vm_ops && vma->vm_ops->get_policy) { 1632 if (vma->vm_ops && vma->vm_ops->get_policy) {
1629 struct mempolicy *vpol = vma->vm_ops->get_policy(vma, 1633 struct mempolicy *vpol = vma->vm_ops->get_policy(vma,
1630 addr); 1634 addr);
1631 if (vpol) 1635 if (vpol)
1632 pol = vpol; 1636 pol = vpol;
1633 } else if (vma->vm_policy) { 1637 } else if (vma->vm_policy) {
1634 pol = vma->vm_policy; 1638 pol = vma->vm_policy;
1635 1639
1636 /* 1640 /*
1637 * shmem_alloc_page() passes MPOL_F_SHARED policy with 1641 * shmem_alloc_page() passes MPOL_F_SHARED policy with
1638 * a pseudo vma whose vma->vm_ops=NULL. Take a reference 1642 * a pseudo vma whose vma->vm_ops=NULL. Take a reference
1639 * count on these policies which will be dropped by 1643 * count on these policies which will be dropped by
1640 * mpol_cond_put() later 1644 * mpol_cond_put() later
1641 */ 1645 */
1642 if (mpol_needs_cond_ref(pol)) 1646 if (mpol_needs_cond_ref(pol))
1643 mpol_get(pol); 1647 mpol_get(pol);
1644 } 1648 }
1645 } 1649 }
1646 if (!pol) 1650 if (!pol)
1647 pol = &default_policy; 1651 pol = &default_policy;
1648 return pol; 1652 return pol;
1649 } 1653 }
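/*
 * Illustrative sketch (not part of this file) of the reference rule
 * documented above: a caller that may be handed a shared policy
 * (MPOL_F_SHARED, e.g. from shmem) must drop the extra reference with
 * mpol_cond_put() once it is done, as mpol_misplaced() below does.
 */
static unsigned short vma_policy_mode(struct vm_area_struct *vma,
				      unsigned long addr)
{
	struct mempolicy *pol = get_vma_policy(current, vma, addr);
	unsigned short mode = pol->mode;

	mpol_cond_put(pol);	/* no-op unless pol carries MPOL_F_SHARED */
	return mode;
}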
1650 1654
1651 bool vma_policy_mof(struct task_struct *task, struct vm_area_struct *vma) 1655 bool vma_policy_mof(struct task_struct *task, struct vm_area_struct *vma)
1652 { 1656 {
1653 struct mempolicy *pol = get_task_policy(task); 1657 struct mempolicy *pol = get_task_policy(task);
1654 if (vma) { 1658 if (vma) {
1655 if (vma->vm_ops && vma->vm_ops->get_policy) { 1659 if (vma->vm_ops && vma->vm_ops->get_policy) {
1656 bool ret = false; 1660 bool ret = false;
1657 1661
1658 pol = vma->vm_ops->get_policy(vma, vma->vm_start); 1662 pol = vma->vm_ops->get_policy(vma, vma->vm_start);
1659 if (pol && (pol->flags & MPOL_F_MOF)) 1663 if (pol && (pol->flags & MPOL_F_MOF))
1660 ret = true; 1664 ret = true;
1661 mpol_cond_put(pol); 1665 mpol_cond_put(pol);
1662 1666
1663 return ret; 1667 return ret;
1664 } else if (vma->vm_policy) { 1668 } else if (vma->vm_policy) {
1665 pol = vma->vm_policy; 1669 pol = vma->vm_policy;
1666 } 1670 }
1667 } 1671 }
1668 1672
1669 if (!pol) 1673 if (!pol)
1670 return default_policy.flags & MPOL_F_MOF; 1674 return default_policy.flags & MPOL_F_MOF;
1671 1675
1672 return pol->flags & MPOL_F_MOF; 1676 return pol->flags & MPOL_F_MOF;
1673 } 1677 }
1674 1678
1675 static int apply_policy_zone(struct mempolicy *policy, enum zone_type zone) 1679 static int apply_policy_zone(struct mempolicy *policy, enum zone_type zone)
1676 { 1680 {
1677 enum zone_type dynamic_policy_zone = policy_zone; 1681 enum zone_type dynamic_policy_zone = policy_zone;
1678 1682
1679 BUG_ON(dynamic_policy_zone == ZONE_MOVABLE); 1683 BUG_ON(dynamic_policy_zone == ZONE_MOVABLE);
1680 1684
1681 /* 1685 /*
1682 * if policy->v.nodes has movable memory only, 1686 * if policy->v.nodes has movable memory only,
1683 * we apply policy when gfp_zone(gfp) = ZONE_MOVABLE only. 1687 * we apply policy when gfp_zone(gfp) = ZONE_MOVABLE only.
1684 * 1688 *
1685 * policy->v.nodes intersects with node_states[N_MEMORY], 1689 * policy->v.nodes intersects with node_states[N_MEMORY],
1686 * so if the following test fails, it implies 1690 * so if the following test fails, it implies
1687 * policy->v.nodes has movable memory only. 1691 * policy->v.nodes has movable memory only.
1688 */ 1692 */
1689 if (!nodes_intersects(policy->v.nodes, node_states[N_HIGH_MEMORY])) 1693 if (!nodes_intersects(policy->v.nodes, node_states[N_HIGH_MEMORY]))
1690 dynamic_policy_zone = ZONE_MOVABLE; 1694 dynamic_policy_zone = ZONE_MOVABLE;
1691 1695
1692 return zone >= dynamic_policy_zone; 1696 return zone >= dynamic_policy_zone;
1693 } 1697 }
1694 1698
1695 /* 1699 /*
1696 * Return a nodemask representing a mempolicy for filtering nodes for 1700 * Return a nodemask representing a mempolicy for filtering nodes for
1697 * page allocation 1701 * page allocation
1698 */ 1702 */
1699 static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy) 1703 static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
1700 { 1704 {
1701 /* Lower zones don't get a nodemask applied for MPOL_BIND */ 1705 /* Lower zones don't get a nodemask applied for MPOL_BIND */
1702 if (unlikely(policy->mode == MPOL_BIND) && 1706 if (unlikely(policy->mode == MPOL_BIND) &&
1703 apply_policy_zone(policy, gfp_zone(gfp)) && 1707 apply_policy_zone(policy, gfp_zone(gfp)) &&
1704 cpuset_nodemask_valid_mems_allowed(&policy->v.nodes)) 1708 cpuset_nodemask_valid_mems_allowed(&policy->v.nodes))
1705 return &policy->v.nodes; 1709 return &policy->v.nodes;
1706 1710
1707 return NULL; 1711 return NULL;
1708 } 1712 }
1709 1713
1710 /* Return a zonelist indicated by gfp for node representing a mempolicy */ 1714 /* Return a zonelist indicated by gfp for node representing a mempolicy */
1711 static struct zonelist *policy_zonelist(gfp_t gfp, struct mempolicy *policy, 1715 static struct zonelist *policy_zonelist(gfp_t gfp, struct mempolicy *policy,
1712 int nd) 1716 int nd)
1713 { 1717 {
1714 switch (policy->mode) { 1718 switch (policy->mode) {
1715 case MPOL_PREFERRED: 1719 case MPOL_PREFERRED:
1716 if (!(policy->flags & MPOL_F_LOCAL)) 1720 if (!(policy->flags & MPOL_F_LOCAL))
1717 nd = policy->v.preferred_node; 1721 nd = policy->v.preferred_node;
1718 break; 1722 break;
1719 case MPOL_BIND: 1723 case MPOL_BIND:
1720 /* 1724 /*
1721 * Normally, MPOL_BIND allocations are node-local within the 1725 * Normally, MPOL_BIND allocations are node-local within the
1722 * allowed nodemask. However, if __GFP_THISNODE is set and the 1726 * allowed nodemask. However, if __GFP_THISNODE is set and the
1723 * current node isn't part of the mask, we use the zonelist for 1727 * current node isn't part of the mask, we use the zonelist for
1724 * the first node in the mask instead. 1728 * the first node in the mask instead.
1725 */ 1729 */
1726 if (unlikely(gfp & __GFP_THISNODE) && 1730 if (unlikely(gfp & __GFP_THISNODE) &&
1727 unlikely(!node_isset(nd, policy->v.nodes))) 1731 unlikely(!node_isset(nd, policy->v.nodes)))
1728 nd = first_node(policy->v.nodes); 1732 nd = first_node(policy->v.nodes);
1729 break; 1733 break;
1730 default: 1734 default:
1731 BUG(); 1735 BUG();
1732 } 1736 }
1733 return node_zonelist(nd, gfp); 1737 return node_zonelist(nd, gfp);
1734 } 1738 }
1735 1739
1736 /* Do dynamic interleaving for a process */ 1740 /* Do dynamic interleaving for a process */
1737 static unsigned interleave_nodes(struct mempolicy *policy) 1741 static unsigned interleave_nodes(struct mempolicy *policy)
1738 { 1742 {
1739 unsigned nid, next; 1743 unsigned nid, next;
1740 struct task_struct *me = current; 1744 struct task_struct *me = current;
1741 1745
1742 nid = me->il_next; 1746 nid = me->il_next;
1743 next = next_node(nid, policy->v.nodes); 1747 next = next_node(nid, policy->v.nodes);
1744 if (next >= MAX_NUMNODES) 1748 if (next >= MAX_NUMNODES)
1745 next = first_node(policy->v.nodes); 1749 next = first_node(policy->v.nodes);
1746 if (next < MAX_NUMNODES) 1750 if (next < MAX_NUMNODES)
1747 me->il_next = next; 1751 me->il_next = next;
1748 return nid; 1752 return nid;
1749 } 1753 }
1750 1754
1751 /* 1755 /*
1752 * Depending on the memory policy provide a node from which to allocate the 1756 * Depending on the memory policy provide a node from which to allocate the
1753 * next slab entry. 1757 * next slab entry.
1754 */ 1758 */
1755 unsigned int mempolicy_slab_node(void) 1759 unsigned int mempolicy_slab_node(void)
1756 { 1760 {
1757 struct mempolicy *policy; 1761 struct mempolicy *policy;
1758 int node = numa_mem_id(); 1762 int node = numa_mem_id();
1759 1763
1760 if (in_interrupt()) 1764 if (in_interrupt())
1761 return node; 1765 return node;
1762 1766
1763 policy = current->mempolicy; 1767 policy = current->mempolicy;
1764 if (!policy || policy->flags & MPOL_F_LOCAL) 1768 if (!policy || policy->flags & MPOL_F_LOCAL)
1765 return node; 1769 return node;
1766 1770
1767 switch (policy->mode) { 1771 switch (policy->mode) {
1768 case MPOL_PREFERRED: 1772 case MPOL_PREFERRED:
1769 /* 1773 /*
1770 * handled MPOL_F_LOCAL above 1774 * handled MPOL_F_LOCAL above
1771 */ 1775 */
1772 return policy->v.preferred_node; 1776 return policy->v.preferred_node;
1773 1777
1774 case MPOL_INTERLEAVE: 1778 case MPOL_INTERLEAVE:
1775 return interleave_nodes(policy); 1779 return interleave_nodes(policy);
1776 1780
1777 case MPOL_BIND: { 1781 case MPOL_BIND: {
1778 /* 1782 /*
1779 * Follow bind policy behavior and start allocation at the 1783 * Follow bind policy behavior and start allocation at the
1780 * first node. 1784 * first node.
1781 */ 1785 */
1782 struct zonelist *zonelist; 1786 struct zonelist *zonelist;
1783 struct zone *zone; 1787 struct zone *zone;
1784 enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL); 1788 enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL);
1785 zonelist = &NODE_DATA(node)->node_zonelists[0]; 1789 zonelist = &NODE_DATA(node)->node_zonelists[0];
1786 (void)first_zones_zonelist(zonelist, highest_zoneidx, 1790 (void)first_zones_zonelist(zonelist, highest_zoneidx,
1787 &policy->v.nodes, 1791 &policy->v.nodes,
1788 &zone); 1792 &zone);
1789 return zone ? zone->node : node; 1793 return zone ? zone->node : node;
1790 } 1794 }
1791 1795
1792 default: 1796 default:
1793 BUG(); 1797 BUG();
1794 } 1798 }
1795 } 1799 }
1796 1800
1797 /* Do static interleaving for a VMA with known offset. */ 1801 /* Do static interleaving for a VMA with known offset. */
1798 static unsigned offset_il_node(struct mempolicy *pol, 1802 static unsigned offset_il_node(struct mempolicy *pol,
1799 struct vm_area_struct *vma, unsigned long off) 1803 struct vm_area_struct *vma, unsigned long off)
1800 { 1804 {
1801 unsigned nnodes = nodes_weight(pol->v.nodes); 1805 unsigned nnodes = nodes_weight(pol->v.nodes);
1802 unsigned target; 1806 unsigned target;
1803 int c; 1807 int c;
1804 int nid = NUMA_NO_NODE; 1808 int nid = NUMA_NO_NODE;
1805 1809
1806 if (!nnodes) 1810 if (!nnodes)
1807 return numa_node_id(); 1811 return numa_node_id();
1808 target = (unsigned int)off % nnodes; 1812 target = (unsigned int)off % nnodes;
1809 c = 0; 1813 c = 0;
1810 do { 1814 do {
1811 nid = next_node(nid, pol->v.nodes); 1815 nid = next_node(nid, pol->v.nodes);
1812 c++; 1816 c++;
1813 } while (c <= target); 1817 } while (c <= target);
1814 return nid; 1818 return nid;
1815 } 1819 }
1816 1820
1817 /* Determine a node number for interleave */ 1821 /* Determine a node number for interleave */
1818 static inline unsigned interleave_nid(struct mempolicy *pol, 1822 static inline unsigned interleave_nid(struct mempolicy *pol,
1819 struct vm_area_struct *vma, unsigned long addr, int shift) 1823 struct vm_area_struct *vma, unsigned long addr, int shift)
1820 { 1824 {
1821 if (vma) { 1825 if (vma) {
1822 unsigned long off; 1826 unsigned long off;
1823 1827
1824 /* 1828 /*
1825 * for small pages, there is no difference between 1829 * for small pages, there is no difference between
1826 * shift and PAGE_SHIFT, so the bit-shift is safe. 1830 * shift and PAGE_SHIFT, so the bit-shift is safe.
1827 * for huge pages, since vm_pgoff is in units of small 1831 * for huge pages, since vm_pgoff is in units of small
1828 * pages, we need to shift off the always 0 bits to get 1832 * pages, we need to shift off the always 0 bits to get
1829 * a useful offset. 1833 * a useful offset.
1830 */ 1834 */
1831 BUG_ON(shift < PAGE_SHIFT); 1835 BUG_ON(shift < PAGE_SHIFT);
1832 off = vma->vm_pgoff >> (shift - PAGE_SHIFT); 1836 off = vma->vm_pgoff >> (shift - PAGE_SHIFT);
1833 off += (addr - vma->vm_start) >> shift; 1837 off += (addr - vma->vm_start) >> shift;
1834 return offset_il_node(pol, vma, off); 1838 return offset_il_node(pol, vma, off);
1835 } else 1839 } else
1836 return interleave_nodes(pol); 1840 return interleave_nodes(pol);
1837 } 1841 }
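/*
 * Illustrative sketch (not part of this file): the static interleave above
 * boils down to "pick the (off % nnodes)-th node of the allowed set", with
 * off measured in units of the mapping's page size.  For an allowed set of
 * {0, 2, 3}, offsets 0,1,2,3,4 map to nodes 0,2,3,0,2.  An array-based
 * version of the same selection:
 */
static unsigned pick_interleave_node(const unsigned *allowed, unsigned nnodes,
				     unsigned long off)
{
	/* offset_il_node() walks the nodemask instead of indexing an array */
	return allowed[off % nnodes];
}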
1838 1842
1839 /* 1843 /*
1840 * Return the bit number of a random bit set in the nodemask. 1844 * Return the bit number of a random bit set in the nodemask.
1841 * (returns NUMA_NO_NODE if nodemask is empty) 1845 * (returns NUMA_NO_NODE if nodemask is empty)
1842 */ 1846 */
1843 int node_random(const nodemask_t *maskp) 1847 int node_random(const nodemask_t *maskp)
1844 { 1848 {
1845 int w, bit = NUMA_NO_NODE; 1849 int w, bit = NUMA_NO_NODE;
1846 1850
1847 w = nodes_weight(*maskp); 1851 w = nodes_weight(*maskp);
1848 if (w) 1852 if (w)
1849 bit = bitmap_ord_to_pos(maskp->bits, 1853 bit = bitmap_ord_to_pos(maskp->bits,
1850 get_random_int() % w, MAX_NUMNODES); 1854 get_random_int() % w, MAX_NUMNODES);
1851 return bit; 1855 return bit;
1852 } 1856 }
1853 1857
1854 #ifdef CONFIG_HUGETLBFS 1858 #ifdef CONFIG_HUGETLBFS
1855 /* 1859 /*
1856 * huge_zonelist(@vma, @addr, @gfp_flags, @mpol) 1860 * huge_zonelist(@vma, @addr, @gfp_flags, @mpol)
1857 * @vma = virtual memory area whose policy is sought 1861 * @vma = virtual memory area whose policy is sought
1858 * @addr = address in @vma for shared policy lookup and interleave policy 1862 * @addr = address in @vma for shared policy lookup and interleave policy
1859 * @gfp_flags = for requested zone 1863 * @gfp_flags = for requested zone
1860 * @mpol = pointer to mempolicy pointer for reference counted mempolicy 1864 * @mpol = pointer to mempolicy pointer for reference counted mempolicy
1861 * @nodemask = pointer to nodemask pointer for MPOL_BIND nodemask 1865 * @nodemask = pointer to nodemask pointer for MPOL_BIND nodemask
1862 * 1866 *
1863 * Returns a zonelist suitable for a huge page allocation and a pointer 1867 * Returns a zonelist suitable for a huge page allocation and a pointer
1864 * to the struct mempolicy for conditional unref after allocation. 1868 * to the struct mempolicy for conditional unref after allocation.
1865 * If the effective policy is 'BIND', returns a pointer to the mempolicy's 1869 * If the effective policy is 'BIND', returns a pointer to the mempolicy's
1866 * @nodemask for filtering the zonelist. 1870 * @nodemask for filtering the zonelist.
1867 * 1871 *
1868 * Must be protected by read_mems_allowed_begin() 1872 * Must be protected by read_mems_allowed_begin()
1869 */ 1873 */
1870 struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr, 1874 struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr,
1871 gfp_t gfp_flags, struct mempolicy **mpol, 1875 gfp_t gfp_flags, struct mempolicy **mpol,
1872 nodemask_t **nodemask) 1876 nodemask_t **nodemask)
1873 { 1877 {
1874 struct zonelist *zl; 1878 struct zonelist *zl;
1875 1879
1876 *mpol = get_vma_policy(current, vma, addr); 1880 *mpol = get_vma_policy(current, vma, addr);
1877 *nodemask = NULL; /* assume !MPOL_BIND */ 1881 *nodemask = NULL; /* assume !MPOL_BIND */
1878 1882
1879 if (unlikely((*mpol)->mode == MPOL_INTERLEAVE)) { 1883 if (unlikely((*mpol)->mode == MPOL_INTERLEAVE)) {
1880 zl = node_zonelist(interleave_nid(*mpol, vma, addr, 1884 zl = node_zonelist(interleave_nid(*mpol, vma, addr,
1881 huge_page_shift(hstate_vma(vma))), gfp_flags); 1885 huge_page_shift(hstate_vma(vma))), gfp_flags);
1882 } else { 1886 } else {
1883 zl = policy_zonelist(gfp_flags, *mpol, numa_node_id()); 1887 zl = policy_zonelist(gfp_flags, *mpol, numa_node_id());
1884 if ((*mpol)->mode == MPOL_BIND) 1888 if ((*mpol)->mode == MPOL_BIND)
1885 *nodemask = &(*mpol)->v.nodes; 1889 *nodemask = &(*mpol)->v.nodes;
1886 } 1890 }
1887 return zl; 1891 return zl;
1888 } 1892 }
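/*
 * Illustrative sketch (not part of this file) of the contract documented
 * above, loosely modelled on the hugetlb fault path: walk the returned
 * zonelist, honour *nodemask when the policy was MPOL_BIND, and drop the
 * conditional policy reference when done.  The GFP flags are only an
 * example.
 */
static void huge_zonelist_usage(struct vm_area_struct *vma, unsigned long addr)
{
	struct mempolicy *mpol;
	nodemask_t *nodemask;
	struct zonelist *zl;

	zl = huge_zonelist(vma, addr, GFP_HIGHUSER_MOVABLE, &mpol, &nodemask);
	/* ... pick a zone from zl, skipping nodes outside *nodemask ... */
	mpol_cond_put(mpol);	/* pairs with the reference taken above */
}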
1889 1893
1890 /* 1894 /*
1891 * init_nodemask_of_mempolicy 1895 * init_nodemask_of_mempolicy
1892 * 1896 *
1893 * If the current task's mempolicy is "default" [NULL], return 'false' 1897 * If the current task's mempolicy is "default" [NULL], return 'false'
1894 * to indicate default policy. Otherwise, extract the policy nodemask 1898 * to indicate default policy. Otherwise, extract the policy nodemask
1895 * for 'bind' or 'interleave' policy into the argument nodemask, or 1899 * for 'bind' or 'interleave' policy into the argument nodemask, or
1896 * initialize the argument nodemask to contain the single node for 1900 * initialize the argument nodemask to contain the single node for
1897 * 'preferred' or 'local' policy and return 'true' to indicate presence 1901 * 'preferred' or 'local' policy and return 'true' to indicate presence
1898 * of non-default mempolicy. 1902 * of non-default mempolicy.
1899 * 1903 *
1900 * We don't bother with reference counting the mempolicy [mpol_get/put] 1904 * We don't bother with reference counting the mempolicy [mpol_get/put]
1901 * because the current task is examining its own mempolicy and a task's 1905 * because the current task is examining its own mempolicy and a task's
1902 * mempolicy is only ever changed by the task itself. 1906 * mempolicy is only ever changed by the task itself.
1903 * 1907 *
1904 * N.B., it is the caller's responsibility to free a returned nodemask. 1908 * N.B., it is the caller's responsibility to free a returned nodemask.
1905 */ 1909 */
1906 bool init_nodemask_of_mempolicy(nodemask_t *mask) 1910 bool init_nodemask_of_mempolicy(nodemask_t *mask)
1907 { 1911 {
1908 struct mempolicy *mempolicy; 1912 struct mempolicy *mempolicy;
1909 int nid; 1913 int nid;
1910 1914
1911 if (!(mask && current->mempolicy)) 1915 if (!(mask && current->mempolicy))
1912 return false; 1916 return false;
1913 1917
1914 task_lock(current); 1918 task_lock(current);
1915 mempolicy = current->mempolicy; 1919 mempolicy = current->mempolicy;
1916 switch (mempolicy->mode) { 1920 switch (mempolicy->mode) {
1917 case MPOL_PREFERRED: 1921 case MPOL_PREFERRED:
1918 if (mempolicy->flags & MPOL_F_LOCAL) 1922 if (mempolicy->flags & MPOL_F_LOCAL)
1919 nid = numa_node_id(); 1923 nid = numa_node_id();
1920 else 1924 else
1921 nid = mempolicy->v.preferred_node; 1925 nid = mempolicy->v.preferred_node;
1922 init_nodemask_of_node(mask, nid); 1926 init_nodemask_of_node(mask, nid);
1923 break; 1927 break;
1924 1928
1925 case MPOL_BIND: 1929 case MPOL_BIND:
1926 /* Fall through */ 1930 /* Fall through */
1927 case MPOL_INTERLEAVE: 1931 case MPOL_INTERLEAVE:
1928 *mask = mempolicy->v.nodes; 1932 *mask = mempolicy->v.nodes;
1929 break; 1933 break;
1930 1934
1931 default: 1935 default:
1932 BUG(); 1936 BUG();
1933 } 1937 }
1934 task_unlock(current); 1938 task_unlock(current);
1935 1939
1936 return true; 1940 return true;
1937 } 1941 }
1938 #endif 1942 #endif
1939 1943
1940 /* 1944 /*
1941 * mempolicy_nodemask_intersects 1945 * mempolicy_nodemask_intersects
1942 * 1946 *
1943 * If tsk's mempolicy is "default" [NULL], return 'true' to indicate default 1947 * If tsk's mempolicy is "default" [NULL], return 'true' to indicate default
1944 * policy. Otherwise, check for intersection between mask and the policy 1948 * policy. Otherwise, check for intersection between mask and the policy
1945 * nodemask for 'bind' or 'interleave' policy. For 'preferred' or 'local' 1949 * nodemask for 'bind' or 'interleave' policy. For 'preferred' or 'local'
1946 * policy, always return true since it may allocate elsewhere on fallback. 1950 * policy, always return true since it may allocate elsewhere on fallback.
1947 * 1951 *
1948 * Takes task_lock(tsk) to prevent freeing of its mempolicy. 1952 * Takes task_lock(tsk) to prevent freeing of its mempolicy.
1949 */ 1953 */
1950 bool mempolicy_nodemask_intersects(struct task_struct *tsk, 1954 bool mempolicy_nodemask_intersects(struct task_struct *tsk,
1951 const nodemask_t *mask) 1955 const nodemask_t *mask)
1952 { 1956 {
1953 struct mempolicy *mempolicy; 1957 struct mempolicy *mempolicy;
1954 bool ret = true; 1958 bool ret = true;
1955 1959
1956 if (!mask) 1960 if (!mask)
1957 return ret; 1961 return ret;
1958 task_lock(tsk); 1962 task_lock(tsk);
1959 mempolicy = tsk->mempolicy; 1963 mempolicy = tsk->mempolicy;
1960 if (!mempolicy) 1964 if (!mempolicy)
1961 goto out; 1965 goto out;
1962 1966
1963 switch (mempolicy->mode) { 1967 switch (mempolicy->mode) {
1964 case MPOL_PREFERRED: 1968 case MPOL_PREFERRED:
1965 /* 1969 /*
1966 * MPOL_PREFERRED and MPOL_F_LOCAL are only preferred nodes to 1970 * MPOL_PREFERRED and MPOL_F_LOCAL are only preferred nodes to
1967 * allocate from; they may fall back to other nodes when OOM. 1971 * allocate from; they may fall back to other nodes when OOM.
1968 * Thus, it's possible for tsk to have allocated memory from 1972 * Thus, it's possible for tsk to have allocated memory from
1969 * nodes in mask. 1973 * nodes in mask.
1970 */ 1974 */
1971 break; 1975 break;
1972 case MPOL_BIND: 1976 case MPOL_BIND:
1973 case MPOL_INTERLEAVE: 1977 case MPOL_INTERLEAVE:
1974 ret = nodes_intersects(mempolicy->v.nodes, *mask); 1978 ret = nodes_intersects(mempolicy->v.nodes, *mask);
1975 break; 1979 break;
1976 default: 1980 default:
1977 BUG(); 1981 BUG();
1978 } 1982 }
1979 out: 1983 out:
1980 task_unlock(tsk); 1984 task_unlock(tsk);
1981 return ret; 1985 return ret;
1982 } 1986 }
1983 1987
1984 /* Allocate a page in interleaved policy. 1988 /* Allocate a page in interleaved policy.
1985 Own path because it needs to do special accounting. */ 1989 Own path because it needs to do special accounting. */
1986 static struct page *alloc_page_interleave(gfp_t gfp, unsigned order, 1990 static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
1987 unsigned nid) 1991 unsigned nid)
1988 { 1992 {
1989 struct zonelist *zl; 1993 struct zonelist *zl;
1990 struct page *page; 1994 struct page *page;
1991 1995
1992 zl = node_zonelist(nid, gfp); 1996 zl = node_zonelist(nid, gfp);
1993 page = __alloc_pages(gfp, order, zl); 1997 page = __alloc_pages(gfp, order, zl);
1994 if (page && page_zone(page) == zonelist_zone(&zl->_zonerefs[0])) 1998 if (page && page_zone(page) == zonelist_zone(&zl->_zonerefs[0]))
1995 inc_zone_page_state(page, NUMA_INTERLEAVE_HIT); 1999 inc_zone_page_state(page, NUMA_INTERLEAVE_HIT);
1996 return page; 2000 return page;
1997 } 2001 }
1998 2002
1999 /** 2003 /**
2000 * alloc_pages_vma - Allocate a page for a VMA. 2004 * alloc_pages_vma - Allocate a page for a VMA.
2001 * 2005 *
2002 * @gfp: 2006 * @gfp:
2003 * %GFP_USER user allocation. 2007 * %GFP_USER user allocation.
2004 * %GFP_KERNEL kernel allocations, 2008 * %GFP_KERNEL kernel allocations,
2005 * %GFP_HIGHMEM highmem/user allocations, 2009 * %GFP_HIGHMEM highmem/user allocations,
2006 * %GFP_FS allocation should not call back into a file system. 2010 * %GFP_FS allocation should not call back into a file system.
2007 * %GFP_ATOMIC don't sleep. 2011 * %GFP_ATOMIC don't sleep.
2008 * 2012 *
2009 * @order:Order of the GFP allocation. 2013 * @order:Order of the GFP allocation.
2010 * @vma: Pointer to VMA or NULL if not available. 2014 * @vma: Pointer to VMA or NULL if not available.
2011 * @addr: Virtual Address of the allocation. Must be inside the VMA. 2015 * @addr: Virtual Address of the allocation. Must be inside the VMA.
2012 * 2016 *
2013 * This function allocates a page from the kernel page pool and applies 2017 * This function allocates a page from the kernel page pool and applies
2014 * a NUMA policy associated with the VMA or the current process. 2018 * a NUMA policy associated with the VMA or the current process.
2015 * When VMA is not NULL caller must hold down_read on the mmap_sem of the 2019 * When VMA is not NULL caller must hold down_read on the mmap_sem of the
2016 * mm_struct of the VMA to prevent it from going away. Should be used for 2020 * mm_struct of the VMA to prevent it from going away. Should be used for
2017 * all allocations for pages that will be mapped into 2021 * all allocations for pages that will be mapped into
2018 * user space. Returns NULL when no page can be allocated. 2022 * user space. Returns NULL when no page can be allocated.
2019 * 2023 *
2020 * Should be called with the mmap_sem of the vma held. 2024 * Should be called with the mmap_sem of the vma held.
2021 */ 2025 */
2022 struct page * 2026 struct page *
2023 alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma, 2027 alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
2024 unsigned long addr, int node) 2028 unsigned long addr, int node)
2025 { 2029 {
2026 struct mempolicy *pol; 2030 struct mempolicy *pol;
2027 struct page *page; 2031 struct page *page;
2028 unsigned int cpuset_mems_cookie; 2032 unsigned int cpuset_mems_cookie;
2029 2033
2030 retry_cpuset: 2034 retry_cpuset:
2031 pol = get_vma_policy(current, vma, addr); 2035 pol = get_vma_policy(current, vma, addr);
2032 cpuset_mems_cookie = read_mems_allowed_begin(); 2036 cpuset_mems_cookie = read_mems_allowed_begin();
2033 2037
2034 if (unlikely(pol->mode == MPOL_INTERLEAVE)) { 2038 if (unlikely(pol->mode == MPOL_INTERLEAVE)) {
2035 unsigned nid; 2039 unsigned nid;
2036 2040
2037 nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order); 2041 nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order);
2038 mpol_cond_put(pol); 2042 mpol_cond_put(pol);
2039 page = alloc_page_interleave(gfp, order, nid); 2043 page = alloc_page_interleave(gfp, order, nid);
2040 if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie))) 2044 if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie)))
2041 goto retry_cpuset; 2045 goto retry_cpuset;
2042 2046
2043 return page; 2047 return page;
2044 } 2048 }
2045 page = __alloc_pages_nodemask(gfp, order, 2049 page = __alloc_pages_nodemask(gfp, order,
2046 policy_zonelist(gfp, pol, node), 2050 policy_zonelist(gfp, pol, node),
2047 policy_nodemask(gfp, pol)); 2051 policy_nodemask(gfp, pol));
2048 if (unlikely(mpol_needs_cond_ref(pol))) 2052 if (unlikely(mpol_needs_cond_ref(pol)))
2049 __mpol_put(pol); 2053 __mpol_put(pol);
2050 if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie))) 2054 if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie)))
2051 goto retry_cpuset; 2055 goto retry_cpuset;
2052 return page; 2056 return page;
2053 } 2057 }
2054 2058
2055 /** 2059 /**
2056 * alloc_pages_current - Allocate pages. 2060 * alloc_pages_current - Allocate pages.
2057 * 2061 *
2058 * @gfp: 2062 * @gfp:
2059 * %GFP_USER user allocation, 2063 * %GFP_USER user allocation,
2060 * %GFP_KERNEL kernel allocation, 2064 * %GFP_KERNEL kernel allocation,
2061 * %GFP_HIGHMEM highmem allocation, 2065 * %GFP_HIGHMEM highmem allocation,
2062 * %GFP_FS don't call back into a file system. 2066 * %GFP_FS don't call back into a file system.
2063 * %GFP_ATOMIC don't sleep. 2067 * %GFP_ATOMIC don't sleep.
2064 * @order: Power of two of allocation size in pages. 0 is a single page. 2068 * @order: Power of two of allocation size in pages. 0 is a single page.
2065 * 2069 *
2066 * Allocate a page from the kernel page pool. When not in 2070 * Allocate a page from the kernel page pool. When not in
2067 * interrupt context, apply the current process's NUMA policy. 2071 * interrupt context, apply the current process's NUMA policy.
2068 * Returns NULL when no page can be allocated. 2072 * Returns NULL when no page can be allocated.
2069 * 2073 *
2070 * Don't call cpuset_update_task_memory_state() unless 2074 * Don't call cpuset_update_task_memory_state() unless
2071 * 1) it's ok to take cpuset_sem (can WAIT), and 2075 * 1) it's ok to take cpuset_sem (can WAIT), and
2072 * 2) allocating for current task (not interrupt). 2076 * 2) allocating for current task (not interrupt).
2073 */ 2077 */
2074 struct page *alloc_pages_current(gfp_t gfp, unsigned order) 2078 struct page *alloc_pages_current(gfp_t gfp, unsigned order)
2075 { 2079 {
2076 struct mempolicy *pol = get_task_policy(current); 2080 struct mempolicy *pol = get_task_policy(current);
2077 struct page *page; 2081 struct page *page;
2078 unsigned int cpuset_mems_cookie; 2082 unsigned int cpuset_mems_cookie;
2079 2083
2080 if (!pol || in_interrupt() || (gfp & __GFP_THISNODE)) 2084 if (!pol || in_interrupt() || (gfp & __GFP_THISNODE))
2081 pol = &default_policy; 2085 pol = &default_policy;
2082 2086
2083 retry_cpuset: 2087 retry_cpuset:
2084 cpuset_mems_cookie = read_mems_allowed_begin(); 2088 cpuset_mems_cookie = read_mems_allowed_begin();
2085 2089
2086 /* 2090 /*
2087 * No reference counting needed for current->mempolicy 2091 * No reference counting needed for current->mempolicy
2088 * nor system default_policy 2092 * nor system default_policy
2089 */ 2093 */
2090 if (pol->mode == MPOL_INTERLEAVE) 2094 if (pol->mode == MPOL_INTERLEAVE)
2091 page = alloc_page_interleave(gfp, order, interleave_nodes(pol)); 2095 page = alloc_page_interleave(gfp, order, interleave_nodes(pol));
2092 else 2096 else
2093 page = __alloc_pages_nodemask(gfp, order, 2097 page = __alloc_pages_nodemask(gfp, order,
2094 policy_zonelist(gfp, pol, numa_node_id()), 2098 policy_zonelist(gfp, pol, numa_node_id()),
2095 policy_nodemask(gfp, pol)); 2099 policy_nodemask(gfp, pol));
2096 2100
2097 if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie))) 2101 if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie)))
2098 goto retry_cpuset; 2102 goto retry_cpuset;
2099 2103
2100 return page; 2104 return page;
2101 } 2105 }
2102 EXPORT_SYMBOL(alloc_pages_current); 2106 EXPORT_SYMBOL(alloc_pages_current);
2103 2107
2104 int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst) 2108 int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst)
2105 { 2109 {
2106 struct mempolicy *pol = mpol_dup(vma_policy(src)); 2110 struct mempolicy *pol = mpol_dup(vma_policy(src));
2107 2111
2108 if (IS_ERR(pol)) 2112 if (IS_ERR(pol))
2109 return PTR_ERR(pol); 2113 return PTR_ERR(pol);
2110 dst->vm_policy = pol; 2114 dst->vm_policy = pol;
2111 return 0; 2115 return 0;
2112 } 2116 }
2113 2117
2114 /* 2118 /*
2115 * If mpol_dup() sees current->cpuset == cpuset_being_rebound, then it 2119 * If mpol_dup() sees current->cpuset == cpuset_being_rebound, then it
2116 * rebinds the mempolicy it is copying by calling mpol_rebind_policy() 2120 * rebinds the mempolicy it is copying by calling mpol_rebind_policy()
2117 * with the mems_allowed returned by cpuset_mems_allowed(). This 2121 * with the mems_allowed returned by cpuset_mems_allowed(). This
2118 * keeps mempolicies cpuset relative after its cpuset moves. See 2122 * keeps mempolicies cpuset relative after its cpuset moves. See
2119 * further kernel/cpuset.c update_nodemask(). 2123 * further kernel/cpuset.c update_nodemask().
2120 * 2124 *
2121 * current's mempolicy may be rebound by another task (the task that changes 2125 * current's mempolicy may be rebound by another task (the task that changes
2122 * the cpuset's mems), so we needn't do rebind work for the current task. 2126 * the cpuset's mems), so we needn't do rebind work for the current task.
2123 */ 2127 */
2124 2128
2125 /* Slow path of a mempolicy duplicate */ 2129 /* Slow path of a mempolicy duplicate */
2126 struct mempolicy *__mpol_dup(struct mempolicy *old) 2130 struct mempolicy *__mpol_dup(struct mempolicy *old)
2127 { 2131 {
2128 struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL); 2132 struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
2129 2133
2130 if (!new) 2134 if (!new)
2131 return ERR_PTR(-ENOMEM); 2135 return ERR_PTR(-ENOMEM);
2132 2136
2133 /* task's mempolicy is protected by alloc_lock */ 2137 /* task's mempolicy is protected by alloc_lock */
2134 if (old == current->mempolicy) { 2138 if (old == current->mempolicy) {
2135 task_lock(current); 2139 task_lock(current);
2136 *new = *old; 2140 *new = *old;
2137 task_unlock(current); 2141 task_unlock(current);
2138 } else 2142 } else
2139 *new = *old; 2143 *new = *old;
2140 2144
2141 rcu_read_lock(); 2145 rcu_read_lock();
2142 if (current_cpuset_is_being_rebound()) { 2146 if (current_cpuset_is_being_rebound()) {
2143 nodemask_t mems = cpuset_mems_allowed(current); 2147 nodemask_t mems = cpuset_mems_allowed(current);
2144 if (new->flags & MPOL_F_REBINDING) 2148 if (new->flags & MPOL_F_REBINDING)
2145 mpol_rebind_policy(new, &mems, MPOL_REBIND_STEP2); 2149 mpol_rebind_policy(new, &mems, MPOL_REBIND_STEP2);
2146 else 2150 else
2147 mpol_rebind_policy(new, &mems, MPOL_REBIND_ONCE); 2151 mpol_rebind_policy(new, &mems, MPOL_REBIND_ONCE);
2148 } 2152 }
2149 rcu_read_unlock(); 2153 rcu_read_unlock();
2150 atomic_set(&new->refcnt, 1); 2154 atomic_set(&new->refcnt, 1);
2151 return new; 2155 return new;
2152 } 2156 }
2153 2157
2154 /* Slow path of a mempolicy comparison */ 2158 /* Slow path of a mempolicy comparison */
2155 bool __mpol_equal(struct mempolicy *a, struct mempolicy *b) 2159 bool __mpol_equal(struct mempolicy *a, struct mempolicy *b)
2156 { 2160 {
2157 if (!a || !b) 2161 if (!a || !b)
2158 return false; 2162 return false;
2159 if (a->mode != b->mode) 2163 if (a->mode != b->mode)
2160 return false; 2164 return false;
2161 if (a->flags != b->flags) 2165 if (a->flags != b->flags)
2162 return false; 2166 return false;
2163 if (mpol_store_user_nodemask(a)) 2167 if (mpol_store_user_nodemask(a))
2164 if (!nodes_equal(a->w.user_nodemask, b->w.user_nodemask)) 2168 if (!nodes_equal(a->w.user_nodemask, b->w.user_nodemask))
2165 return false; 2169 return false;
2166 2170
2167 switch (a->mode) { 2171 switch (a->mode) {
2168 case MPOL_BIND: 2172 case MPOL_BIND:
2169 /* Fall through */ 2173 /* Fall through */
2170 case MPOL_INTERLEAVE: 2174 case MPOL_INTERLEAVE:
2171 return !!nodes_equal(a->v.nodes, b->v.nodes); 2175 return !!nodes_equal(a->v.nodes, b->v.nodes);
2172 case MPOL_PREFERRED: 2176 case MPOL_PREFERRED:
2173 return a->v.preferred_node == b->v.preferred_node; 2177 return a->v.preferred_node == b->v.preferred_node;
2174 default: 2178 default:
2175 BUG(); 2179 BUG();
2176 return false; 2180 return false;
2177 } 2181 }
2178 } 2182 }
2179 2183
2180 /* 2184 /*
2181 * Shared memory backing store policy support. 2185 * Shared memory backing store policy support.
2182 * 2186 *
2183 * Remember policies even when nobody has shared memory mapped. 2187 * Remember policies even when nobody has shared memory mapped.
2184 * The policies are kept in Red-Black tree linked from the inode. 2188 * The policies are kept in Red-Black tree linked from the inode.
2185 * They are protected by the sp->lock spinlock, which should be held 2189 * They are protected by the sp->lock spinlock, which should be held
2186 * for any accesses to the tree. 2190 * for any accesses to the tree.
2187 */ 2191 */
2188 2192
2189 /* lookup first element intersecting start-end */ 2193 /* lookup first element intersecting start-end */
2190 /* Caller holds sp->lock */ 2194 /* Caller holds sp->lock */
2191 static struct sp_node * 2195 static struct sp_node *
2192 sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end) 2196 sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
2193 { 2197 {
2194 struct rb_node *n = sp->root.rb_node; 2198 struct rb_node *n = sp->root.rb_node;
2195 2199
2196 while (n) { 2200 while (n) {
2197 struct sp_node *p = rb_entry(n, struct sp_node, nd); 2201 struct sp_node *p = rb_entry(n, struct sp_node, nd);
2198 2202
2199 if (start >= p->end) 2203 if (start >= p->end)
2200 n = n->rb_right; 2204 n = n->rb_right;
2201 else if (end <= p->start) 2205 else if (end <= p->start)
2202 n = n->rb_left; 2206 n = n->rb_left;
2203 else 2207 else
2204 break; 2208 break;
2205 } 2209 }
2206 if (!n) 2210 if (!n)
2207 return NULL; 2211 return NULL;
2208 for (;;) { 2212 for (;;) {
2209 struct sp_node *w = NULL; 2213 struct sp_node *w = NULL;
2210 struct rb_node *prev = rb_prev(n); 2214 struct rb_node *prev = rb_prev(n);
2211 if (!prev) 2215 if (!prev)
2212 break; 2216 break;
2213 w = rb_entry(prev, struct sp_node, nd); 2217 w = rb_entry(prev, struct sp_node, nd);
2214 if (w->end <= start) 2218 if (w->end <= start)
2215 break; 2219 break;
2216 n = prev; 2220 n = prev;
2217 } 2221 }
2218 return rb_entry(n, struct sp_node, nd); 2222 return rb_entry(n, struct sp_node, nd);
2219 } 2223 }
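/*
 * Illustrative sketch (not part of this file): the descent above uses the
 * standard half-open interval test.  [start, end) and [p->start, p->end)
 * intersect exactly when neither lies wholly before the other:
 */
static bool sp_ranges_overlap(unsigned long start, unsigned long end,
			      unsigned long p_start, unsigned long p_end)
{
	return start < p_end && end > p_start;
}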
2220 2224
2221 /* Insert a new shared policy into the list. */ 2225 /* Insert a new shared policy into the list. */
2222 /* Caller holds sp->lock */ 2226 /* Caller holds sp->lock */
2223 static void sp_insert(struct shared_policy *sp, struct sp_node *new) 2227 static void sp_insert(struct shared_policy *sp, struct sp_node *new)
2224 { 2228 {
2225 struct rb_node **p = &sp->root.rb_node; 2229 struct rb_node **p = &sp->root.rb_node;
2226 struct rb_node *parent = NULL; 2230 struct rb_node *parent = NULL;
2227 struct sp_node *nd; 2231 struct sp_node *nd;
2228 2232
2229 while (*p) { 2233 while (*p) {
2230 parent = *p; 2234 parent = *p;
2231 nd = rb_entry(parent, struct sp_node, nd); 2235 nd = rb_entry(parent, struct sp_node, nd);
2232 if (new->start < nd->start) 2236 if (new->start < nd->start)
2233 p = &(*p)->rb_left; 2237 p = &(*p)->rb_left;
2234 else if (new->end > nd->end) 2238 else if (new->end > nd->end)
2235 p = &(*p)->rb_right; 2239 p = &(*p)->rb_right;
2236 else 2240 else
2237 BUG(); 2241 BUG();
2238 } 2242 }
2239 rb_link_node(&new->nd, parent, p); 2243 rb_link_node(&new->nd, parent, p);
2240 rb_insert_color(&new->nd, &sp->root); 2244 rb_insert_color(&new->nd, &sp->root);
2241 pr_debug("inserting %lx-%lx: %d\n", new->start, new->end, 2245 pr_debug("inserting %lx-%lx: %d\n", new->start, new->end,
2242 new->policy ? new->policy->mode : 0); 2246 new->policy ? new->policy->mode : 0);
2243 } 2247 }
2244 2248
2245 /* Find shared policy intersecting idx */ 2249 /* Find shared policy intersecting idx */
2246 struct mempolicy * 2250 struct mempolicy *
2247 mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx) 2251 mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
2248 { 2252 {
2249 struct mempolicy *pol = NULL; 2253 struct mempolicy *pol = NULL;
2250 struct sp_node *sn; 2254 struct sp_node *sn;
2251 2255
2252 if (!sp->root.rb_node) 2256 if (!sp->root.rb_node)
2253 return NULL; 2257 return NULL;
2254 spin_lock(&sp->lock); 2258 spin_lock(&sp->lock);
2255 sn = sp_lookup(sp, idx, idx+1); 2259 sn = sp_lookup(sp, idx, idx+1);
2256 if (sn) { 2260 if (sn) {
2257 mpol_get(sn->policy); 2261 mpol_get(sn->policy);
2258 pol = sn->policy; 2262 pol = sn->policy;
2259 } 2263 }
2260 spin_unlock(&sp->lock); 2264 spin_unlock(&sp->lock);
2261 return pol; 2265 return pol;
2262 } 2266 }
2263 2267
2264 static void sp_free(struct sp_node *n) 2268 static void sp_free(struct sp_node *n)
2265 { 2269 {
2266 mpol_put(n->policy); 2270 mpol_put(n->policy);
2267 kmem_cache_free(sn_cache, n); 2271 kmem_cache_free(sn_cache, n);
2268 } 2272 }
2269 2273
2270 /** 2274 /**
2271 * mpol_misplaced - check whether current page node is valid in policy 2275 * mpol_misplaced - check whether current page node is valid in policy
2272 * 2276 *
2273 * @page - page to be checked 2277 * @page - page to be checked
2274 * @vma - vm area where page mapped 2278 * @vma - vm area where page mapped
2275 * @addr - virtual address where page mapped 2279 * @addr - virtual address where page mapped
2276 * 2280 *
2277 * Look up the current policy node id for vma,addr and compare it to the page's 2281 * Look up the current policy node id for vma,addr and compare it to the page's
2278 * node id. 2282 * node id.
2279 * 2283 *
2280 * Returns: 2284 * Returns:
2281 * -1 - not misplaced, page is in the right node 2285 * -1 - not misplaced, page is in the right node
2282 * node - node id where the page should be 2286 * node - node id where the page should be
2283 * 2287 *
2284 * Policy determination "mimics" alloc_page_vma(). 2288 * Policy determination "mimics" alloc_page_vma().
2285 * Called from fault path where we know the vma and faulting address. 2289 * Called from fault path where we know the vma and faulting address.
2286 */ 2290 */
2287 int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long addr) 2291 int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long addr)
2288 { 2292 {
2289 struct mempolicy *pol; 2293 struct mempolicy *pol;
2290 struct zone *zone; 2294 struct zone *zone;
2291 int curnid = page_to_nid(page); 2295 int curnid = page_to_nid(page);
2292 unsigned long pgoff; 2296 unsigned long pgoff;
2293 int thiscpu = raw_smp_processor_id(); 2297 int thiscpu = raw_smp_processor_id();
2294 int thisnid = cpu_to_node(thiscpu); 2298 int thisnid = cpu_to_node(thiscpu);
2295 int polnid = -1; 2299 int polnid = -1;
2296 int ret = -1; 2300 int ret = -1;
2297 2301
2298 BUG_ON(!vma); 2302 BUG_ON(!vma);
2299 2303
2300 pol = get_vma_policy(current, vma, addr); 2304 pol = get_vma_policy(current, vma, addr);
2301 if (!(pol->flags & MPOL_F_MOF)) 2305 if (!(pol->flags & MPOL_F_MOF))
2302 goto out; 2306 goto out;
2303 2307
2304 switch (pol->mode) { 2308 switch (pol->mode) {
2305 case MPOL_INTERLEAVE: 2309 case MPOL_INTERLEAVE:
2306 BUG_ON(addr >= vma->vm_end); 2310 BUG_ON(addr >= vma->vm_end);
2307 BUG_ON(addr < vma->vm_start); 2311 BUG_ON(addr < vma->vm_start);
2308 2312
2309 pgoff = vma->vm_pgoff; 2313 pgoff = vma->vm_pgoff;
2310 pgoff += (addr - vma->vm_start) >> PAGE_SHIFT; 2314 pgoff += (addr - vma->vm_start) >> PAGE_SHIFT;
2311 polnid = offset_il_node(pol, vma, pgoff); 2315 polnid = offset_il_node(pol, vma, pgoff);
2312 break; 2316 break;
2313 2317
2314 case MPOL_PREFERRED: 2318 case MPOL_PREFERRED:
2315 if (pol->flags & MPOL_F_LOCAL) 2319 if (pol->flags & MPOL_F_LOCAL)
2316 polnid = numa_node_id(); 2320 polnid = numa_node_id();
2317 else 2321 else
2318 polnid = pol->v.preferred_node; 2322 polnid = pol->v.preferred_node;
2319 break; 2323 break;
2320 2324
2321 case MPOL_BIND: 2325 case MPOL_BIND:
2322 /* 2326 /*
2323 * allows binding to multiple nodes. 2327 * allows binding to multiple nodes.
2324 * use current page if in policy nodemask, 2328 * use current page if in policy nodemask,
2325 * else select nearest allowed node, if any. 2329 * else select nearest allowed node, if any.
2326 * If no allowed nodes, use current [!misplaced]. 2330 * If no allowed nodes, use current [!misplaced].
2327 */ 2331 */
2328 if (node_isset(curnid, pol->v.nodes)) 2332 if (node_isset(curnid, pol->v.nodes))
2329 goto out; 2333 goto out;
2330 (void)first_zones_zonelist( 2334 (void)first_zones_zonelist(
2331 node_zonelist(numa_node_id(), GFP_HIGHUSER), 2335 node_zonelist(numa_node_id(), GFP_HIGHUSER),
2332 gfp_zone(GFP_HIGHUSER), 2336 gfp_zone(GFP_HIGHUSER),
2333 &pol->v.nodes, &zone); 2337 &pol->v.nodes, &zone);
2334 polnid = zone->node; 2338 polnid = zone->node;
2335 break; 2339 break;
2336 2340
2337 default: 2341 default:
2338 BUG(); 2342 BUG();
2339 } 2343 }
2340 2344
2341 /* Migrate the page towards the node whose CPU is referencing it */ 2345 /* Migrate the page towards the node whose CPU is referencing it */
2342 if (pol->flags & MPOL_F_MORON) { 2346 if (pol->flags & MPOL_F_MORON) {
2343 polnid = thisnid; 2347 polnid = thisnid;
2344 2348
2345 if (!should_numa_migrate_memory(current, page, curnid, thiscpu)) 2349 if (!should_numa_migrate_memory(current, page, curnid, thiscpu))
2346 goto out; 2350 goto out;
2347 } 2351 }
2348 2352
2349 if (curnid != polnid) 2353 if (curnid != polnid)
2350 ret = polnid; 2354 ret = polnid;
2351 out: 2355 out:
2352 mpol_cond_put(pol); 2356 mpol_cond_put(pol);
2353 2357
2354 return ret; 2358 return ret;
2355 } 2359 }
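/*
 * Illustrative sketch (not part of this file) of the return convention
 * documented above, loosely modelled on the NUMA hinting fault path in
 * mm/memory.c: -1 means the page may stay where it is, otherwise the
 * returned node is handed to the migration code.
 */
static int numa_hint_fault(struct page *page, struct vm_area_struct *vma,
			   unsigned long addr)
{
	int target_nid = mpol_misplaced(page, vma, addr);

	if (target_nid == -1)		/* already on an allowed node */
		return 0;
	return migrate_misplaced_page(page, vma, target_nid);
}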
2356 2360
2357 static void sp_delete(struct shared_policy *sp, struct sp_node *n) 2361 static void sp_delete(struct shared_policy *sp, struct sp_node *n)
2358 { 2362 {
2359 pr_debug("deleting %lx-l%lx\n", n->start, n->end); 2363 pr_debug("deleting %lx-l%lx\n", n->start, n->end);
2360 rb_erase(&n->nd, &sp->root); 2364 rb_erase(&n->nd, &sp->root);
2361 sp_free(n); 2365 sp_free(n);
2362 } 2366 }
2363 2367
2364 static void sp_node_init(struct sp_node *node, unsigned long start, 2368 static void sp_node_init(struct sp_node *node, unsigned long start,
2365 unsigned long end, struct mempolicy *pol) 2369 unsigned long end, struct mempolicy *pol)
2366 { 2370 {
2367 node->start = start; 2371 node->start = start;
2368 node->end = end; 2372 node->end = end;
2369 node->policy = pol; 2373 node->policy = pol;
2370 } 2374 }
2371 2375
2372 static struct sp_node *sp_alloc(unsigned long start, unsigned long end, 2376 static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
2373 struct mempolicy *pol) 2377 struct mempolicy *pol)
2374 { 2378 {
2375 struct sp_node *n; 2379 struct sp_node *n;
2376 struct mempolicy *newpol; 2380 struct mempolicy *newpol;
2377 2381
2378 n = kmem_cache_alloc(sn_cache, GFP_KERNEL); 2382 n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
2379 if (!n) 2383 if (!n)
2380 return NULL; 2384 return NULL;
2381 2385
2382 newpol = mpol_dup(pol); 2386 newpol = mpol_dup(pol);
2383 if (IS_ERR(newpol)) { 2387 if (IS_ERR(newpol)) {
2384 kmem_cache_free(sn_cache, n); 2388 kmem_cache_free(sn_cache, n);
2385 return NULL; 2389 return NULL;
2386 } 2390 }
2387 newpol->flags |= MPOL_F_SHARED; 2391 newpol->flags |= MPOL_F_SHARED;
2388 sp_node_init(n, start, end, newpol); 2392 sp_node_init(n, start, end, newpol);
2389 2393
2390 return n; 2394 return n;
2391 } 2395 }
2392 2396
2393 /* Replace a policy range. */ 2397 /* Replace a policy range. */
2394 static int shared_policy_replace(struct shared_policy *sp, unsigned long start, 2398 static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
2395 unsigned long end, struct sp_node *new) 2399 unsigned long end, struct sp_node *new)
2396 { 2400 {
2397 struct sp_node *n; 2401 struct sp_node *n;
2398 struct sp_node *n_new = NULL; 2402 struct sp_node *n_new = NULL;
2399 struct mempolicy *mpol_new = NULL; 2403 struct mempolicy *mpol_new = NULL;
2400 int ret = 0; 2404 int ret = 0;
2401 2405
2402 restart: 2406 restart:
2403 spin_lock(&sp->lock); 2407 spin_lock(&sp->lock);
2404 n = sp_lookup(sp, start, end); 2408 n = sp_lookup(sp, start, end);
2405 /* Take care of old policies in the same range. */ 2409 /* Take care of old policies in the same range. */
2406 while (n && n->start < end) { 2410 while (n && n->start < end) {
2407 struct rb_node *next = rb_next(&n->nd); 2411 struct rb_node *next = rb_next(&n->nd);
2408 if (n->start >= start) { 2412 if (n->start >= start) {
2409 if (n->end <= end) 2413 if (n->end <= end)
2410 sp_delete(sp, n); 2414 sp_delete(sp, n);
2411 else 2415 else
2412 n->start = end; 2416 n->start = end;
2413 } else { 2417 } else {
2414 /* Old policy spanning whole new range. */ 2418 /* Old policy spanning whole new range. */
2415 if (n->end > end) { 2419 if (n->end > end) {
2416 if (!n_new) 2420 if (!n_new)
2417 goto alloc_new; 2421 goto alloc_new;
2418 2422
2419 *mpol_new = *n->policy; 2423 *mpol_new = *n->policy;
2420 atomic_set(&mpol_new->refcnt, 1); 2424 atomic_set(&mpol_new->refcnt, 1);
2421 sp_node_init(n_new, end, n->end, mpol_new); 2425 sp_node_init(n_new, end, n->end, mpol_new);
2422 n->end = start; 2426 n->end = start;
2423 sp_insert(sp, n_new); 2427 sp_insert(sp, n_new);
2424 n_new = NULL; 2428 n_new = NULL;
2425 mpol_new = NULL; 2429 mpol_new = NULL;
2426 break; 2430 break;
2427 } else 2431 } else
2428 n->end = start; 2432 n->end = start;
2429 } 2433 }
2430 if (!next) 2434 if (!next)
2431 break; 2435 break;
2432 n = rb_entry(next, struct sp_node, nd); 2436 n = rb_entry(next, struct sp_node, nd);
2433 } 2437 }
2434 if (new) 2438 if (new)
2435 sp_insert(sp, new); 2439 sp_insert(sp, new);
2436 spin_unlock(&sp->lock); 2440 spin_unlock(&sp->lock);
2437 ret = 0; 2441 ret = 0;
2438 2442
2439 err_out: 2443 err_out:
2440 if (mpol_new) 2444 if (mpol_new)
2441 mpol_put(mpol_new); 2445 mpol_put(mpol_new);
2442 if (n_new) 2446 if (n_new)
2443 kmem_cache_free(sn_cache, n_new); 2447 kmem_cache_free(sn_cache, n_new);
2444 2448
2445 return ret; 2449 return ret;
2446 2450
2447 alloc_new: 2451 alloc_new:
2448 spin_unlock(&sp->lock); 2452 spin_unlock(&sp->lock);
2449 ret = -ENOMEM; 2453 ret = -ENOMEM;
2450 n_new = kmem_cache_alloc(sn_cache, GFP_KERNEL); 2454 n_new = kmem_cache_alloc(sn_cache, GFP_KERNEL);
2451 if (!n_new) 2455 if (!n_new)
2452 goto err_out; 2456 goto err_out;
2453 mpol_new = kmem_cache_alloc(policy_cache, GFP_KERNEL); 2457 mpol_new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
2454 if (!mpol_new) 2458 if (!mpol_new)
2455 goto err_out; 2459 goto err_out;
2456 goto restart; 2460 goto restart;
2457 } 2461 }
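
The restart/alloc_new dance in shared_policy_replace() exists because sp->lock is a spinlock while kmem_cache_alloc(GFP_KERNEL) may sleep: when a split node is needed, the lock is dropped, both objects are preallocated, and the whole lookup is redone because the tree may have changed in the meantime. Below is a minimal userspace sketch of the same drop-lock/allocate/retry shape, using pthreads and hypothetical names (need_spare, the 64-byte allocation) purely for illustration:

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

static pthread_mutex_t tree_lock = PTHREAD_MUTEX_INITIALIZER;
static int need_spare = 1;	/* pretend the first pass finds a range that must be split */

static int replace_range(long start, long end)
{
	void *spare = NULL;

restart:
	pthread_mutex_lock(&tree_lock);
	/* walk the tree; if an old range fully spans [start, end) we need one
	 * extra node to split it, and that must not be allocated under the lock */
	if (need_spare && !spare) {
		pthread_mutex_unlock(&tree_lock);
		spare = malloc(64);	/* may block: done with the lock dropped */
		if (!spare)
			return -1;	/* mirrors the -ENOMEM path via err_out */
		goto restart;		/* the tree may have changed meanwhile */
	}
	/* ... trim/split the old ranges and insert the new one here ... */
	pthread_mutex_unlock(&tree_lock);
	free(spare);			/* unused preallocations are freed, as err_out does */
	printf("replaced [%ld, %ld)\n", start, end);
	return 0;
}

int main(void)
{
	return replace_range(0, 4096) ? 1 : 0;
}

The property mirrored here is that nothing decided while the lock was dropped is trusted: the lookup restarts from scratch, and leftover preallocations are released on the way out.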
2458 2462
2459 /** 2463 /**
2460 * mpol_shared_policy_init - initialize shared policy for inode 2464 * mpol_shared_policy_init - initialize shared policy for inode
2461 * @sp: pointer to inode shared policy 2465 * @sp: pointer to inode shared policy
2462 * @mpol: struct mempolicy to install 2466 * @mpol: struct mempolicy to install
2463 * 2467 *
2464 * Install non-NULL @mpol in inode's shared policy rb-tree. 2468 * Install non-NULL @mpol in inode's shared policy rb-tree.
2465 * On entry, the current task has a reference on a non-NULL @mpol. 2469 * On entry, the current task has a reference on a non-NULL @mpol.
2466 * This must be released on exit. 2470 * This must be released on exit.
2467 * This is called during get_inode(), so we can use GFP_KERNEL. 2471 * This is called during get_inode(), so we can use GFP_KERNEL.
2468 */ 2472 */
2469 void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol) 2473 void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
2470 { 2474 {
2471 int ret; 2475 int ret;
2472 2476
2473 sp->root = RB_ROOT; /* empty tree == default mempolicy */ 2477 sp->root = RB_ROOT; /* empty tree == default mempolicy */
2474 spin_lock_init(&sp->lock); 2478 spin_lock_init(&sp->lock);
2475 2479
2476 if (mpol) { 2480 if (mpol) {
2477 struct vm_area_struct pvma; 2481 struct vm_area_struct pvma;
2478 struct mempolicy *new; 2482 struct mempolicy *new;
2479 NODEMASK_SCRATCH(scratch); 2483 NODEMASK_SCRATCH(scratch);
2480 2484
2481 if (!scratch) 2485 if (!scratch)
2482 goto put_mpol; 2486 goto put_mpol;
2483 /* contextualize the tmpfs mount point mempolicy */ 2487 /* contextualize the tmpfs mount point mempolicy */
2484 new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask); 2488 new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask);
2485 if (IS_ERR(new)) 2489 if (IS_ERR(new))
2486 goto free_scratch; /* no valid nodemask intersection */ 2490 goto free_scratch; /* no valid nodemask intersection */
2487 2491
2488 task_lock(current); 2492 task_lock(current);
2489 ret = mpol_set_nodemask(new, &mpol->w.user_nodemask, scratch); 2493 ret = mpol_set_nodemask(new, &mpol->w.user_nodemask, scratch);
2490 task_unlock(current); 2494 task_unlock(current);
2491 if (ret) 2495 if (ret)
2492 goto put_new; 2496 goto put_new;
2493 2497
2494 /* Create pseudo-vma that contains just the policy */ 2498 /* Create pseudo-vma that contains just the policy */
2495 memset(&pvma, 0, sizeof(struct vm_area_struct)); 2499 memset(&pvma, 0, sizeof(struct vm_area_struct));
2496 pvma.vm_end = TASK_SIZE; /* policy covers entire file */ 2500 pvma.vm_end = TASK_SIZE; /* policy covers entire file */
2497 mpol_set_shared_policy(sp, &pvma, new); /* adds ref */ 2501 mpol_set_shared_policy(sp, &pvma, new); /* adds ref */
2498 2502
2499 put_new: 2503 put_new:
2500 mpol_put(new); /* drop initial ref */ 2504 mpol_put(new); /* drop initial ref */
2501 free_scratch: 2505 free_scratch:
2502 NODEMASK_SCRATCH_FREE(scratch); 2506 NODEMASK_SCRATCH_FREE(scratch);
2503 put_mpol: 2507 put_mpol:
2504 mpol_put(mpol); /* drop our incoming ref on sb mpol */ 2508 mpol_put(mpol); /* drop our incoming ref on sb mpol */
2505 } 2509 }
2506 } 2510 }
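
A caller such as a tmpfs-style filesystem is expected to hand over its reference on the mount-time policy here; mpol_shared_policy_init() drops that reference itself (the put_mpol label above) and also accepts NULL, leaving the tree empty. The following is a hedged, kernel-context sketch of such a caller; my_inode_info and my_fs_init_inode_policy are hypothetical names:

/* Hypothetical caller sketch: a filesystem passes a reference on the
 * superblock's mempolicy to the new inode's shared-policy tree.  A NULL
 * policy is fine too: the rb-tree then stays empty (default policy). */
struct my_inode_info {
	struct shared_policy policy;	/* per-inode policy rb-tree */
	/* ... other per-inode state ... */
};

static void my_fs_init_inode_policy(struct my_inode_info *info,
				    struct mempolicy *mount_mpol)
{
	if (mount_mpol)
		mpol_get(mount_mpol);	/* take the reference that init will drop */
	mpol_shared_policy_init(&info->policy, mount_mpol);
}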
2507 2511
2508 int mpol_set_shared_policy(struct shared_policy *info, 2512 int mpol_set_shared_policy(struct shared_policy *info,
2509 struct vm_area_struct *vma, struct mempolicy *npol) 2513 struct vm_area_struct *vma, struct mempolicy *npol)
2510 { 2514 {
2511 int err; 2515 int err;
2512 struct sp_node *new = NULL; 2516 struct sp_node *new = NULL;
2513 unsigned long sz = vma_pages(vma); 2517 unsigned long sz = vma_pages(vma);
2514 2518
2515 pr_debug("set_shared_policy %lx sz %lu %d %d %lx\n", 2519 pr_debug("set_shared_policy %lx sz %lu %d %d %lx\n",
2516 vma->vm_pgoff, 2520 vma->vm_pgoff,
2517 sz, npol ? npol->mode : -1, 2521 sz, npol ? npol->mode : -1,
2518 npol ? npol->flags : -1, 2522 npol ? npol->flags : -1,
2519 npol ? nodes_addr(npol->v.nodes)[0] : NUMA_NO_NODE); 2523 npol ? nodes_addr(npol->v.nodes)[0] : NUMA_NO_NODE);
2520 2524
2521 if (npol) { 2525 if (npol) {
2522 new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol); 2526 new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
2523 if (!new) 2527 if (!new)
2524 return -ENOMEM; 2528 return -ENOMEM;
2525 } 2529 }
2526 err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new); 2530 err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
2527 if (err && new) 2531 if (err && new)
2528 sp_free(new); 2532 sp_free(new);
2529 return err; 2533 return err;
2530 } 2534 }
2531 2535
2532 /* Free a backing policy store on inode delete. */ 2536 /* Free a backing policy store on inode delete. */
2533 void mpol_free_shared_policy(struct shared_policy *p) 2537 void mpol_free_shared_policy(struct shared_policy *p)
2534 { 2538 {
2535 struct sp_node *n; 2539 struct sp_node *n;
2536 struct rb_node *next; 2540 struct rb_node *next;
2537 2541
2538 if (!p->root.rb_node) 2542 if (!p->root.rb_node)
2539 return; 2543 return;
2540 spin_lock(&p->lock); 2544 spin_lock(&p->lock);
2541 next = rb_first(&p->root); 2545 next = rb_first(&p->root);
2542 while (next) { 2546 while (next) {
2543 n = rb_entry(next, struct sp_node, nd); 2547 n = rb_entry(next, struct sp_node, nd);
2544 next = rb_next(&n->nd); 2548 next = rb_next(&n->nd);
2545 sp_delete(p, n); 2549 sp_delete(p, n);
2546 } 2550 }
2547 spin_unlock(&p->lock); 2551 spin_unlock(&p->lock);
2548 } 2552 }
2549 2553
2550 #ifdef CONFIG_NUMA_BALANCING 2554 #ifdef CONFIG_NUMA_BALANCING
2551 static int __initdata numabalancing_override; 2555 static int __initdata numabalancing_override;
2552 2556
2553 static void __init check_numabalancing_enable(void) 2557 static void __init check_numabalancing_enable(void)
2554 { 2558 {
2555 bool numabalancing_default = false; 2559 bool numabalancing_default = false;
2556 2560
2557 if (IS_ENABLED(CONFIG_NUMA_BALANCING_DEFAULT_ENABLED)) 2561 if (IS_ENABLED(CONFIG_NUMA_BALANCING_DEFAULT_ENABLED))
2558 numabalancing_default = true; 2562 numabalancing_default = true;
2559 2563
2560 /* Parsed by setup_numabalancing. override == 1 enables, -1 disables */ 2564 /* Parsed by setup_numabalancing. override == 1 enables, -1 disables */
2561 if (numabalancing_override) 2565 if (numabalancing_override)
2562 set_numabalancing_state(numabalancing_override == 1); 2566 set_numabalancing_state(numabalancing_override == 1);
2563 2567
2564 if (nr_node_ids > 1 && !numabalancing_override) { 2568 if (nr_node_ids > 1 && !numabalancing_override) {
2565 pr_info("%s automatic NUMA balancing. " 2569 pr_info("%s automatic NUMA balancing. "
2566 "Configure with numa_balancing= or the " 2570 "Configure with numa_balancing= or the "
2567 "kernel.numa_balancing sysctl", 2571 "kernel.numa_balancing sysctl",
2568 numabalancing_default ? "Enabling" : "Disabling"); 2572 numabalancing_default ? "Enabling" : "Disabling");
2569 set_numabalancing_state(numabalancing_default); 2573 set_numabalancing_state(numabalancing_default);
2570 } 2574 }
2571 } 2575 }
2572 2576
2573 static int __init setup_numabalancing(char *str) 2577 static int __init setup_numabalancing(char *str)
2574 { 2578 {
2575 int ret = 0; 2579 int ret = 0;
2576 if (!str) 2580 if (!str)
2577 goto out; 2581 goto out;
2578 2582
2579 if (!strcmp(str, "enable")) { 2583 if (!strcmp(str, "enable")) {
2580 numabalancing_override = 1; 2584 numabalancing_override = 1;
2581 ret = 1; 2585 ret = 1;
2582 } else if (!strcmp(str, "disable")) { 2586 } else if (!strcmp(str, "disable")) {
2583 numabalancing_override = -1; 2587 numabalancing_override = -1;
2584 ret = 1; 2588 ret = 1;
2585 } 2589 }
2586 out: 2590 out:
2587 if (!ret) 2591 if (!ret)
2588 pr_warn("Unable to parse numa_balancing=\n"); 2592 pr_warn("Unable to parse numa_balancing=\n");
2589 2593
2590 return ret; 2594 return ret;
2591 } 2595 }
2592 __setup("numa_balancing=", setup_numabalancing); 2596 __setup("numa_balancing=", setup_numabalancing);
2593 #else 2597 #else
2594 static inline void __init check_numabalancing_enable(void) 2598 static inline void __init check_numabalancing_enable(void)
2595 { 2599 {
2596 } 2600 }
2597 #endif /* CONFIG_NUMA_BALANCING */ 2601 #endif /* CONFIG_NUMA_BALANCING */
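
The boot parameter above is one of two ways to flip the same switch; the state it controls is also reachable through the kernel.numa_balancing sysctl that the pr_info message points at. As a small userspace illustration (the helper name is hypothetical), toggling it at runtime looks like this:

#include <stdio.h>

/* Toggle automatic NUMA balancing at runtime via the sysctl mentioned by
 * check_numabalancing_enable().  Returns 0 on success. */
static int set_numa_balancing(int enable)
{
	FILE *f = fopen("/proc/sys/kernel/numa_balancing", "w");

	if (!f)
		return -1;	/* no CONFIG_NUMA_BALANCING, or insufficient privileges */
	fprintf(f, "%d\n", enable ? 1 : 0);
	return fclose(f);
}

int main(void)
{
	return set_numa_balancing(0) ? 1 : 0;	/* disable, like numa_balancing=disable at boot */
}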
2598 2602
2599 /* assumes fs == KERNEL_DS */ 2603 /* assumes fs == KERNEL_DS */
2600 void __init numa_policy_init(void) 2604 void __init numa_policy_init(void)
2601 { 2605 {
2602 nodemask_t interleave_nodes; 2606 nodemask_t interleave_nodes;
2603 unsigned long largest = 0; 2607 unsigned long largest = 0;
2604 int nid, prefer = 0; 2608 int nid, prefer = 0;
2605 2609
2606 policy_cache = kmem_cache_create("numa_policy", 2610 policy_cache = kmem_cache_create("numa_policy",
2607 sizeof(struct mempolicy), 2611 sizeof(struct mempolicy),
2608 0, SLAB_PANIC, NULL); 2612 0, SLAB_PANIC, NULL);
2609 2613
2610 sn_cache = kmem_cache_create("shared_policy_node", 2614 sn_cache = kmem_cache_create("shared_policy_node",
2611 sizeof(struct sp_node), 2615 sizeof(struct sp_node),
2612 0, SLAB_PANIC, NULL); 2616 0, SLAB_PANIC, NULL);
2613 2617
2614 for_each_node(nid) { 2618 for_each_node(nid) {
2615 preferred_node_policy[nid] = (struct mempolicy) { 2619 preferred_node_policy[nid] = (struct mempolicy) {
2616 .refcnt = ATOMIC_INIT(1), 2620 .refcnt = ATOMIC_INIT(1),
2617 .mode = MPOL_PREFERRED, 2621 .mode = MPOL_PREFERRED,
2618 .flags = MPOL_F_MOF | MPOL_F_MORON, 2622 .flags = MPOL_F_MOF | MPOL_F_MORON,
2619 .v = { .preferred_node = nid, }, 2623 .v = { .preferred_node = nid, },
2620 }; 2624 };
2621 } 2625 }
2622 2626
2623 /* 2627 /*
2624 * Set interleaving policy for system init. Interleaving is only 2628 * Set interleaving policy for system init. Interleaving is only
2625 * enabled across suitably sized nodes (default is >= 16MB), or 2629 * enabled across suitably sized nodes (default is >= 16MB), or
2626 * fall back to the largest node if they're all smaller. 2630 * fall back to the largest node if they're all smaller.
2627 */ 2631 */
2628 nodes_clear(interleave_nodes); 2632 nodes_clear(interleave_nodes);
2629 for_each_node_state(nid, N_MEMORY) { 2633 for_each_node_state(nid, N_MEMORY) {
2630 unsigned long total_pages = node_present_pages(nid); 2634 unsigned long total_pages = node_present_pages(nid);
2631 2635
2632 /* Preserve the largest node */ 2636 /* Preserve the largest node */
2633 if (largest < total_pages) { 2637 if (largest < total_pages) {
2634 largest = total_pages; 2638 largest = total_pages;
2635 prefer = nid; 2639 prefer = nid;
2636 } 2640 }
2637 2641
2638 /* Interleave this node? */ 2642 /* Interleave this node? */
2639 if ((total_pages << PAGE_SHIFT) >= (16 << 20)) 2643 if ((total_pages << PAGE_SHIFT) >= (16 << 20))
2640 node_set(nid, interleave_nodes); 2644 node_set(nid, interleave_nodes);
2641 } 2645 }
2642 2646
2643 /* All too small, use the largest */ 2647 /* All too small, use the largest */
2644 if (unlikely(nodes_empty(interleave_nodes))) 2648 if (unlikely(nodes_empty(interleave_nodes)))
2645 node_set(prefer, interleave_nodes); 2649 node_set(prefer, interleave_nodes);
2646 2650
2647 if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes)) 2651 if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes))
2648 printk("numa_policy_init: interleaving failed\n"); 2652 printk("numa_policy_init: interleaving failed\n");
2649 2653
2650 check_numabalancing_enable(); 2654 check_numabalancing_enable();
2651 } 2655 }
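
The interleave threshold above is expressed in bytes: a node joins the boot-time interleave set only if node_present_pages(nid) << PAGE_SHIFT is at least 16 MiB. Assuming the common 4 KiB page size (PAGE_SHIFT == 12), that works out to 4096 present pages per node, as this trivial userspace check shows:

#include <stdio.h>

int main(void)
{
	const unsigned long page_shift = 12;		/* assumed 4 KiB pages */
	const unsigned long threshold = 16UL << 20;	/* 16 MiB, as in numa_policy_init() */

	/* minimum node_present_pages() for a node to be interleaved at boot */
	printf("min present pages per node: %lu\n", threshold >> page_shift);	/* 4096 */
	return 0;
}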
2652 2656
2653 /* Reset policy of current process to default */ 2657 /* Reset policy of current process to default */
2654 void numa_default_policy(void) 2658 void numa_default_policy(void)
2655 { 2659 {
2656 do_set_mempolicy(MPOL_DEFAULT, 0, NULL); 2660 do_set_mempolicy(MPOL_DEFAULT, 0, NULL);
2657 } 2661 }
2658 2662
2659 /* 2663 /*
2660 * Parse and format mempolicy from/to strings 2664 * Parse and format mempolicy from/to strings
2661 */ 2665 */
2662 2666
2663 /* 2667 /*
2664 * "local" is implemented internally by MPOL_PREFERRED with MPOL_F_LOCAL flag. 2668 * "local" is implemented internally by MPOL_PREFERRED with MPOL_F_LOCAL flag.
2665 */ 2669 */
2666 static const char * const policy_modes[] = 2670 static const char * const policy_modes[] =
2667 { 2671 {
2668 [MPOL_DEFAULT] = "default", 2672 [MPOL_DEFAULT] = "default",
2669 [MPOL_PREFERRED] = "prefer", 2673 [MPOL_PREFERRED] = "prefer",
2670 [MPOL_BIND] = "bind", 2674 [MPOL_BIND] = "bind",
2671 [MPOL_INTERLEAVE] = "interleave", 2675 [MPOL_INTERLEAVE] = "interleave",
2672 [MPOL_LOCAL] = "local", 2676 [MPOL_LOCAL] = "local",
2673 }; 2677 };
2674 2678
2675 2679
2676 #ifdef CONFIG_TMPFS 2680 #ifdef CONFIG_TMPFS
2677 /** 2681 /**
2678 * mpol_parse_str - parse string to mempolicy, for tmpfs mpol mount option. 2682 * mpol_parse_str - parse string to mempolicy, for tmpfs mpol mount option.
2679 * @str: string containing mempolicy to parse 2683 * @str: string containing mempolicy to parse
2680 * @mpol: pointer to struct mempolicy pointer, returned on success. 2684 * @mpol: pointer to struct mempolicy pointer, returned on success.
2681 * 2685 *
2682 * Format of input: 2686 * Format of input:
2683 * <mode>[=<flags>][:<nodelist>] 2687 * <mode>[=<flags>][:<nodelist>]
2684 * 2688 *
2685 * On success, returns 0, else 1 2689 * On success, returns 0, else 1
2686 */ 2690 */
2687 int mpol_parse_str(char *str, struct mempolicy **mpol) 2691 int mpol_parse_str(char *str, struct mempolicy **mpol)
2688 { 2692 {
2689 struct mempolicy *new = NULL; 2693 struct mempolicy *new = NULL;
2690 unsigned short mode; 2694 unsigned short mode;
2691 unsigned short mode_flags; 2695 unsigned short mode_flags;
2692 nodemask_t nodes; 2696 nodemask_t nodes;
2693 char *nodelist = strchr(str, ':'); 2697 char *nodelist = strchr(str, ':');
2694 char *flags = strchr(str, '='); 2698 char *flags = strchr(str, '=');
2695 int err = 1; 2699 int err = 1;
2696 2700
2697 if (nodelist) { 2701 if (nodelist) {
2698 /* NUL-terminate mode or flags string */ 2702 /* NUL-terminate mode or flags string */
2699 *nodelist++ = '\0'; 2703 *nodelist++ = '\0';
2700 if (nodelist_parse(nodelist, nodes)) 2704 if (nodelist_parse(nodelist, nodes))
2701 goto out; 2705 goto out;
2702 if (!nodes_subset(nodes, node_states[N_MEMORY])) 2706 if (!nodes_subset(nodes, node_states[N_MEMORY]))
2703 goto out; 2707 goto out;
2704 } else 2708 } else
2705 nodes_clear(nodes); 2709 nodes_clear(nodes);
2706 2710
2707 if (flags) 2711 if (flags)
2708 *flags++ = '\0'; /* terminate mode string */ 2712 *flags++ = '\0'; /* terminate mode string */
2709 2713
2710 for (mode = 0; mode < MPOL_MAX; mode++) { 2714 for (mode = 0; mode < MPOL_MAX; mode++) {
2711 if (!strcmp(str, policy_modes[mode])) { 2715 if (!strcmp(str, policy_modes[mode])) {
2712 break; 2716 break;
2713 } 2717 }
2714 } 2718 }
2715 if (mode >= MPOL_MAX) 2719 if (mode >= MPOL_MAX)
2716 goto out; 2720 goto out;
2717 2721
2718 switch (mode) { 2722 switch (mode) {
2719 case MPOL_PREFERRED: 2723 case MPOL_PREFERRED:
2720 /* 2724 /*
2721 * Insist on a nodelist of one node only 2725 * Insist on a nodelist of one node only
2722 */ 2726 */
2723 if (nodelist) { 2727 if (nodelist) {
2724 char *rest = nodelist; 2728 char *rest = nodelist;
2725 while (isdigit(*rest)) 2729 while (isdigit(*rest))
2726 rest++; 2730 rest++;
2727 if (*rest) 2731 if (*rest)
2728 goto out; 2732 goto out;
2729 } 2733 }
2730 break; 2734 break;
2731 case MPOL_INTERLEAVE: 2735 case MPOL_INTERLEAVE:
2732 /* 2736 /*
2733 * Default to online nodes with memory if no nodelist 2737 * Default to online nodes with memory if no nodelist
2734 */ 2738 */
2735 if (!nodelist) 2739 if (!nodelist)
2736 nodes = node_states[N_MEMORY]; 2740 nodes = node_states[N_MEMORY];
2737 break; 2741 break;
2738 case MPOL_LOCAL: 2742 case MPOL_LOCAL:
2739 /* 2743 /*
2740 * Don't allow a nodelist; mpol_new() checks flags 2744 * Don't allow a nodelist; mpol_new() checks flags
2741 */ 2745 */
2742 if (nodelist) 2746 if (nodelist)
2743 goto out; 2747 goto out;
2744 mode = MPOL_PREFERRED; 2748 mode = MPOL_PREFERRED;
2745 break; 2749 break;
2746 case MPOL_DEFAULT: 2750 case MPOL_DEFAULT:
2747 /* 2751 /*
2748 * Insist on an empty nodelist 2752 * Insist on an empty nodelist
2749 */ 2753 */
2750 if (!nodelist) 2754 if (!nodelist)
2751 err = 0; 2755 err = 0;
2752 goto out; 2756 goto out;
2753 case MPOL_BIND: 2757 case MPOL_BIND:
2754 /* 2758 /*
2755 * Insist on a nodelist 2759 * Insist on a nodelist
2756 */ 2760 */
2757 if (!nodelist) 2761 if (!nodelist)
2758 goto out; 2762 goto out;
2759 } 2763 }
2760 2764
2761 mode_flags = 0; 2765 mode_flags = 0;
2762 if (flags) { 2766 if (flags) {
2763 /* 2767 /*
2764 * Currently, we only support two mutually exclusive 2768 * Currently, we only support two mutually exclusive
2765 * mode flags. 2769 * mode flags.
2766 */ 2770 */
2767 if (!strcmp(flags, "static")) 2771 if (!strcmp(flags, "static"))
2768 mode_flags |= MPOL_F_STATIC_NODES; 2772 mode_flags |= MPOL_F_STATIC_NODES;
2769 else if (!strcmp(flags, "relative")) 2773 else if (!strcmp(flags, "relative"))
2770 mode_flags |= MPOL_F_RELATIVE_NODES; 2774 mode_flags |= MPOL_F_RELATIVE_NODES;
2771 else 2775 else
2772 goto out; 2776 goto out;
2773 } 2777 }
2774 2778
2775 new = mpol_new(mode, mode_flags, &nodes); 2779 new = mpol_new(mode, mode_flags, &nodes);
2776 if (IS_ERR(new)) 2780 if (IS_ERR(new))
2777 goto out; 2781 goto out;
2778 2782
2779 /* 2783 /*
2780 * Save nodes for mpol_to_str() to show the tmpfs mount options 2784 * Save nodes for mpol_to_str() to show the tmpfs mount options
2781 * for /proc/mounts, /proc/pid/mounts and /proc/pid/mountinfo. 2785 * for /proc/mounts, /proc/pid/mounts and /proc/pid/mountinfo.
2782 */ 2786 */
2783 if (mode != MPOL_PREFERRED) 2787 if (mode != MPOL_PREFERRED)
2784 new->v.nodes = nodes; 2788 new->v.nodes = nodes;
2785 else if (nodelist) 2789 else if (nodelist)
2786 new->v.preferred_node = first_node(nodes); 2790 new->v.preferred_node = first_node(nodes);
2787 else 2791 else
2788 new->flags |= MPOL_F_LOCAL; 2792 new->flags |= MPOL_F_LOCAL;
2789 2793
2790 /* 2794 /*
2791 * Save nodes for contextualization: this will be used to "clone" 2795 * Save nodes for contextualization: this will be used to "clone"
2792 * the mempolicy in a specific context [cpuset] at a later time. 2796 * the mempolicy in a specific context [cpuset] at a later time.
2793 */ 2797 */
2794 new->w.user_nodemask = nodes; 2798 new->w.user_nodemask = nodes;
2795 2799
2796 err = 0; 2800 err = 0;
2797 2801
2798 out: 2802 out:
2799 /* Restore string for error message */ 2803 /* Restore string for error message */
2800 if (nodelist) 2804 if (nodelist)
2801 *--nodelist = ':'; 2805 *--nodelist = ':';
2802 if (flags) 2806 if (flags)
2803 *--flags = '='; 2807 *--flags = '=';
2804 if (!err) 2808 if (!err)
2805 *mpol = new; 2809 *mpol = new;
2806 return err; 2810 return err;
2807 } 2811 }
2808 #endif /* CONFIG_TMPFS */ 2812 #endif /* CONFIG_TMPFS */
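
mpol_parse_str() first cuts the option string at ':' and '=' and only then matches the mode name against policy_modes[]. A self-contained userspace sketch of just that tokenization step (a toy: no nodemask parsing, no validation) makes the "<mode>[=<flags>][:<nodelist>]" format concrete:

#include <stdio.h>
#include <string.h>

/* Toy re-implementation of the "<mode>[=<flags>][:<nodelist>]" split done by
 * mpol_parse_str(): locate ':' and '=' first, then NUL-terminate the pieces. */
static void split_mpol(char *str)
{
	char *nodelist = strchr(str, ':');
	char *flags = strchr(str, '=');

	if (nodelist)
		*nodelist++ = '\0';	/* terminates the mode or flags part */
	if (flags)
		*flags++ = '\0';	/* terminates the mode part */

	printf("mode=%s flags=%s nodelist=%s\n",
	       str, flags ? flags : "-", nodelist ? nodelist : "-");
}

int main(void)
{
	char a[] = "interleave:0-3";
	char b[] = "bind=static:0,2";
	char c[] = "local";

	split_mpol(a);	/* mode=interleave flags=- nodelist=0-3 */
	split_mpol(b);	/* mode=bind flags=static nodelist=0,2 */
	split_mpol(c);	/* mode=local flags=- nodelist=- */
	return 0;
}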
2809 2813
2810 /** 2814 /**
2811 * mpol_to_str - format a mempolicy structure for printing 2815 * mpol_to_str - format a mempolicy structure for printing
2812 * @buffer: to contain formatted mempolicy string 2816 * @buffer: to contain formatted mempolicy string
2813 * @maxlen: length of @buffer 2817 * @maxlen: length of @buffer
2814 * @pol: pointer to mempolicy to be formatted 2818 * @pol: pointer to mempolicy to be formatted
2815 * 2819 *
2816 * Convert @pol into a string. If @buffer is too short, truncate the string. 2820 * Convert @pol into a string. If @buffer is too short, truncate the string.
2817 * Recommend a @maxlen of at least 32 for the longest mode, "interleave", the 2821 * Recommend a @maxlen of at least 32 for the longest mode, "interleave", the
2818 * longest flag, "relative", and to display at least a few node ids. 2822 * longest flag, "relative", and to display at least a few node ids.
2819 */ 2823 */
2820 void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol) 2824 void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
2821 { 2825 {
2822 char *p = buffer; 2826 char *p = buffer;
2823 nodemask_t nodes = NODE_MASK_NONE; 2827 nodemask_t nodes = NODE_MASK_NONE;
2824 unsigned short mode = MPOL_DEFAULT; 2828 unsigned short mode = MPOL_DEFAULT;
2825 unsigned short flags = 0; 2829 unsigned short flags = 0;
2826 2830
2827 if (pol && pol != &default_policy && !(pol->flags & MPOL_F_MORON)) { 2831 if (pol && pol != &default_policy && !(pol->flags & MPOL_F_MORON)) {
2828 mode = pol->mode; 2832 mode = pol->mode;
2829 flags = pol->flags; 2833 flags = pol->flags;
2830 } 2834 }
2831 2835
2832 switch (mode) { 2836 switch (mode) {
2833 case MPOL_DEFAULT: 2837 case MPOL_DEFAULT:
2834 break; 2838 break;
2835 case MPOL_PREFERRED: 2839 case MPOL_PREFERRED:
2836 if (flags & MPOL_F_LOCAL) 2840 if (flags & MPOL_F_LOCAL)
2837 mode = MPOL_LOCAL; 2841 mode = MPOL_LOCAL;
2838 else 2842 else
2839 node_set(pol->v.preferred_node, nodes); 2843 node_set(pol->v.preferred_node, nodes);
2840 break; 2844 break;
2841 case MPOL_BIND: 2845 case MPOL_BIND:
2842 case MPOL_INTERLEAVE: 2846 case MPOL_INTERLEAVE:
2843 nodes = pol->v.nodes; 2847 nodes = pol->v.nodes;
2844 break; 2848 break;
2845 default: 2849 default:
2846 WARN_ON_ONCE(1); 2850 WARN_ON_ONCE(1);
2847 snprintf(p, maxlen, "unknown"); 2851 snprintf(p, maxlen, "unknown");
2848 return; 2852 return;
2849 } 2853 }
2850 2854
2851 p += snprintf(p, maxlen, "%s", policy_modes[mode]); 2855 p += snprintf(p, maxlen, "%s", policy_modes[mode]);
2852 2856
2853 if (flags & MPOL_MODE_FLAGS) { 2857 if (flags & MPOL_MODE_FLAGS) {
2854 p += snprintf(p, buffer + maxlen - p, "="); 2858 p += snprintf(p, buffer + maxlen - p, "=");
2855 2859
2856 /* 2860 /*
2857 * Currently, the only defined flags are mutually exclusive 2861 * Currently, the only defined flags are mutually exclusive
2858 */ 2862 */
2859 if (flags & MPOL_F_STATIC_NODES) 2863 if (flags & MPOL_F_STATIC_NODES)
2860 p += snprintf(p, buffer + maxlen - p, "static"); 2864 p += snprintf(p, buffer + maxlen - p, "static");
2861 else if (flags & MPOL_F_RELATIVE_NODES) 2865 else if (flags & MPOL_F_RELATIVE_NODES)
2862 p += snprintf(p, buffer + maxlen - p, "relative"); 2866 p += snprintf(p, buffer + maxlen - p, "relative");
2863 } 2867 }
2864 2868
2865 if (!nodes_empty(nodes)) { 2869 if (!nodes_empty(nodes)) {
2866 p += snprintf(p, buffer + maxlen - p, ":"); 2870 p += snprintf(p, buffer + maxlen - p, ":");
2867 p += nodelist_scnprintf(p, buffer + maxlen - p, nodes); 2871 p += nodelist_scnprintf(p, buffer + maxlen - p, nodes);
2868 } 2872 }
2869 } 2873 }
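
Going the other way, mpol_to_str() emits the same "<mode>[=<flags>][:<nodelist>]" form, e.g. "default", "local", "prefer:1", "bind=static:0-3" or "interleave:0-7". A hedged kernel-context sketch of a typical caller follows; the policy pointer is assumed to come from wherever the caller already holds one, much as /proc/<pid>/numa_maps does:

/* Sketch only: format a mempolicy into a seq_file, in the spirit of
 * show_numa_map().  'pol' is whatever struct mempolicy * the caller holds;
 * 64 bytes comfortably exceeds the recommended minimum of 32. */
static void show_policy(struct seq_file *m, struct mempolicy *pol)
{
	char buffer[64];

	mpol_to_str(buffer, sizeof(buffer), pol);
	seq_printf(m, "policy=%s\n", buffer);
}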
2870 2874