Commit d4c54919ed86302094c0ca7d48a8cbd4ee753e92
Committed by Linus Torvalds
1 parent d54d14bfb4
Exists in ti-lsk-linux-4.1.y and in 12 other branches
mm: add !pte_present() check on existing hugetlb_entry callbacks
The page table walker doesn't check non-present hugetlb entries in the common path, so hugetlb_entry() callbacks must check for them. The reason for this behavior is that some callers want to handle such entries in their own way.

[ I think that reason is bogus, btw - it should just do what the regular code does, which is to call the "pte_hole()" function for such hugetlb entries - Linus ]

However, some callers don't check it now, which causes unpredictable results, for example when there is a race between migrating a hugepage and reading /proc/pid/numa_maps. This patch fixes it by adding !pte_present() checks to the buggy callbacks.

This bug has existed for years and became visible with the introduction of hugepage migration.

ChangeLog v2:
- fix if condition (check !pte_present() instead of pte_present())

Reported-by: Sasha Levin <sasha.levin@oracle.com>
Signed-off-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: <stable@vger.kernel.org> [3.12+]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
[ Backported to 3.15. Signed-off-by: Josh Boyer <jwboyer@fedoraproject.org> ]
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
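A minimal sketch of the check pattern the fix describes (illustrative only: the function and variable names below are hypothetical and this is not the literal hunk from this commit):

    /*
     * Illustrative sketch: a hugetlb_entry() callback must bail out when
     * the huge PTE is not present (for example while the hugepage is being
     * migrated), because the generic page table walker does not filter
     * such entries itself.
     */
    static int example_hugetlb_entry(pte_t *pte, unsigned long hmask,
                                     unsigned long addr, unsigned long end,
                                     struct mm_walk *walk)
    {
            pte_t huge_pte = huge_ptep_get(pte);
            struct page *page;

            if (!pte_present(huge_pte))     /* the added check */
                    return 0;

            page = pte_page(huge_pte);
            /* ... gather per-page information from 'page' as before ... */
            return 0;
    }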
Showing 2 changed files with 6 additions and 2 deletions
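For orientation before the diff, here is a hedged sketch of how a caller wires such callbacks into the page table walker through struct mm_walk (field names per the ~3.15-era interface used in the file below; the example_* names are the hypothetical ones from the sketch above):

    static int example_pte_hole(unsigned long addr, unsigned long end,
                                struct mm_walk *walk)
    {
            /*
             * Called for holes in the address range; Linus's note above
             * suggests non-present hugetlb entries could be routed here
             * instead of into hugetlb_entry().
             */
            return 0;
    }

    static void example_walk(struct vm_area_struct *vma)
    {
            struct mm_walk walk = {
                    .hugetlb_entry  = example_hugetlb_entry,
                    .pte_hole       = example_pte_hole,
                    .mm             = vma->vm_mm,
                    .private        = NULL,
            };

            walk_page_range(vma->vm_start, vma->vm_end, &walk);
    }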
fs/proc/task_mmu.c
1 | #include <linux/mm.h> | 1 | #include <linux/mm.h> |
2 | #include <linux/vmacache.h> | 2 | #include <linux/vmacache.h> |
3 | #include <linux/hugetlb.h> | 3 | #include <linux/hugetlb.h> |
4 | #include <linux/huge_mm.h> | 4 | #include <linux/huge_mm.h> |
5 | #include <linux/mount.h> | 5 | #include <linux/mount.h> |
6 | #include <linux/seq_file.h> | 6 | #include <linux/seq_file.h> |
7 | #include <linux/highmem.h> | 7 | #include <linux/highmem.h> |
8 | #include <linux/ptrace.h> | 8 | #include <linux/ptrace.h> |
9 | #include <linux/slab.h> | 9 | #include <linux/slab.h> |
10 | #include <linux/pagemap.h> | 10 | #include <linux/pagemap.h> |
11 | #include <linux/mempolicy.h> | 11 | #include <linux/mempolicy.h> |
12 | #include <linux/rmap.h> | 12 | #include <linux/rmap.h> |
13 | #include <linux/swap.h> | 13 | #include <linux/swap.h> |
14 | #include <linux/swapops.h> | 14 | #include <linux/swapops.h> |
15 | #include <linux/mmu_notifier.h> | 15 | #include <linux/mmu_notifier.h> |
16 | 16 | ||
17 | #include <asm/elf.h> | 17 | #include <asm/elf.h> |
18 | #include <asm/uaccess.h> | 18 | #include <asm/uaccess.h> |
19 | #include <asm/tlbflush.h> | 19 | #include <asm/tlbflush.h> |
20 | #include "internal.h" | 20 | #include "internal.h" |
21 | 21 | ||
22 | void task_mem(struct seq_file *m, struct mm_struct *mm) | 22 | void task_mem(struct seq_file *m, struct mm_struct *mm) |
23 | { | 23 | { |
24 | unsigned long data, text, lib, swap; | 24 | unsigned long data, text, lib, swap; |
25 | unsigned long hiwater_vm, total_vm, hiwater_rss, total_rss; | 25 | unsigned long hiwater_vm, total_vm, hiwater_rss, total_rss; |
26 | 26 | ||
27 | /* | 27 | /* |
28 | * Note: to minimize their overhead, mm maintains hiwater_vm and | 28 | * Note: to minimize their overhead, mm maintains hiwater_vm and |
29 | * hiwater_rss only when about to *lower* total_vm or rss. Any | 29 | * hiwater_rss only when about to *lower* total_vm or rss. Any |
30 | * collector of these hiwater stats must therefore get total_vm | 30 | * collector of these hiwater stats must therefore get total_vm |
31 | * and rss too, which will usually be the higher. Barriers? not | 31 | * and rss too, which will usually be the higher. Barriers? not |
32 | * worth the effort, such snapshots can always be inconsistent. | 32 | * worth the effort, such snapshots can always be inconsistent. |
33 | */ | 33 | */ |
34 | hiwater_vm = total_vm = mm->total_vm; | 34 | hiwater_vm = total_vm = mm->total_vm; |
35 | if (hiwater_vm < mm->hiwater_vm) | 35 | if (hiwater_vm < mm->hiwater_vm) |
36 | hiwater_vm = mm->hiwater_vm; | 36 | hiwater_vm = mm->hiwater_vm; |
37 | hiwater_rss = total_rss = get_mm_rss(mm); | 37 | hiwater_rss = total_rss = get_mm_rss(mm); |
38 | if (hiwater_rss < mm->hiwater_rss) | 38 | if (hiwater_rss < mm->hiwater_rss) |
39 | hiwater_rss = mm->hiwater_rss; | 39 | hiwater_rss = mm->hiwater_rss; |
40 | 40 | ||
41 | data = mm->total_vm - mm->shared_vm - mm->stack_vm; | 41 | data = mm->total_vm - mm->shared_vm - mm->stack_vm; |
42 | text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK)) >> 10; | 42 | text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK)) >> 10; |
43 | lib = (mm->exec_vm << (PAGE_SHIFT-10)) - text; | 43 | lib = (mm->exec_vm << (PAGE_SHIFT-10)) - text; |
44 | swap = get_mm_counter(mm, MM_SWAPENTS); | 44 | swap = get_mm_counter(mm, MM_SWAPENTS); |
45 | seq_printf(m, | 45 | seq_printf(m, |
46 | "VmPeak:\t%8lu kB\n" | 46 | "VmPeak:\t%8lu kB\n" |
47 | "VmSize:\t%8lu kB\n" | 47 | "VmSize:\t%8lu kB\n" |
48 | "VmLck:\t%8lu kB\n" | 48 | "VmLck:\t%8lu kB\n" |
49 | "VmPin:\t%8lu kB\n" | 49 | "VmPin:\t%8lu kB\n" |
50 | "VmHWM:\t%8lu kB\n" | 50 | "VmHWM:\t%8lu kB\n" |
51 | "VmRSS:\t%8lu kB\n" | 51 | "VmRSS:\t%8lu kB\n" |
52 | "VmData:\t%8lu kB\n" | 52 | "VmData:\t%8lu kB\n" |
53 | "VmStk:\t%8lu kB\n" | 53 | "VmStk:\t%8lu kB\n" |
54 | "VmExe:\t%8lu kB\n" | 54 | "VmExe:\t%8lu kB\n" |
55 | "VmLib:\t%8lu kB\n" | 55 | "VmLib:\t%8lu kB\n" |
56 | "VmPTE:\t%8lu kB\n" | 56 | "VmPTE:\t%8lu kB\n" |
57 | "VmSwap:\t%8lu kB\n", | 57 | "VmSwap:\t%8lu kB\n", |
58 | hiwater_vm << (PAGE_SHIFT-10), | 58 | hiwater_vm << (PAGE_SHIFT-10), |
59 | total_vm << (PAGE_SHIFT-10), | 59 | total_vm << (PAGE_SHIFT-10), |
60 | mm->locked_vm << (PAGE_SHIFT-10), | 60 | mm->locked_vm << (PAGE_SHIFT-10), |
61 | mm->pinned_vm << (PAGE_SHIFT-10), | 61 | mm->pinned_vm << (PAGE_SHIFT-10), |
62 | hiwater_rss << (PAGE_SHIFT-10), | 62 | hiwater_rss << (PAGE_SHIFT-10), |
63 | total_rss << (PAGE_SHIFT-10), | 63 | total_rss << (PAGE_SHIFT-10), |
64 | data << (PAGE_SHIFT-10), | 64 | data << (PAGE_SHIFT-10), |
65 | mm->stack_vm << (PAGE_SHIFT-10), text, lib, | 65 | mm->stack_vm << (PAGE_SHIFT-10), text, lib, |
66 | (PTRS_PER_PTE * sizeof(pte_t) * | 66 | (PTRS_PER_PTE * sizeof(pte_t) * |
67 | atomic_long_read(&mm->nr_ptes)) >> 10, | 67 | atomic_long_read(&mm->nr_ptes)) >> 10, |
68 | swap << (PAGE_SHIFT-10)); | 68 | swap << (PAGE_SHIFT-10)); |
69 | } | 69 | } |
70 | 70 | ||
71 | unsigned long task_vsize(struct mm_struct *mm) | 71 | unsigned long task_vsize(struct mm_struct *mm) |
72 | { | 72 | { |
73 | return PAGE_SIZE * mm->total_vm; | 73 | return PAGE_SIZE * mm->total_vm; |
74 | } | 74 | } |
75 | 75 | ||
76 | unsigned long task_statm(struct mm_struct *mm, | 76 | unsigned long task_statm(struct mm_struct *mm, |
77 | unsigned long *shared, unsigned long *text, | 77 | unsigned long *shared, unsigned long *text, |
78 | unsigned long *data, unsigned long *resident) | 78 | unsigned long *data, unsigned long *resident) |
79 | { | 79 | { |
80 | *shared = get_mm_counter(mm, MM_FILEPAGES); | 80 | *shared = get_mm_counter(mm, MM_FILEPAGES); |
81 | *text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK)) | 81 | *text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK)) |
82 | >> PAGE_SHIFT; | 82 | >> PAGE_SHIFT; |
83 | *data = mm->total_vm - mm->shared_vm; | 83 | *data = mm->total_vm - mm->shared_vm; |
84 | *resident = *shared + get_mm_counter(mm, MM_ANONPAGES); | 84 | *resident = *shared + get_mm_counter(mm, MM_ANONPAGES); |
85 | return mm->total_vm; | 85 | return mm->total_vm; |
86 | } | 86 | } |
87 | 87 | ||
88 | #ifdef CONFIG_NUMA | 88 | #ifdef CONFIG_NUMA |
89 | /* | 89 | /* |
90 | * These functions are for numa_maps but called in generic **maps seq_file | 90 | * These functions are for numa_maps but called in generic **maps seq_file |
91 | * ->start(), ->stop() ops. | 91 | * ->start(), ->stop() ops. |
92 | * | 92 | * |
93 | * numa_maps scans all vmas under mmap_sem and checks their mempolicy. | 93 | * numa_maps scans all vmas under mmap_sem and checks their mempolicy. |
94 | * Each mempolicy object is controlled by reference counting. The problem here | 94 | * Each mempolicy object is controlled by reference counting. The problem here |
95 | * is how to avoid accessing dead mempolicy object. | 95 | * is how to avoid accessing dead mempolicy object. |
96 | * | 96 | * |
97 | * Because we're holding mmap_sem while reading seq_file, it's safe to access | 97 | * Because we're holding mmap_sem while reading seq_file, it's safe to access |
98 | * each vma's mempolicy, no vma objects will never drop refs to mempolicy. | 98 | * each vma's mempolicy, no vma objects will never drop refs to mempolicy. |
99 | * | 99 | * |
100 | * A task's mempolicy (task->mempolicy) has different behavior. task->mempolicy | 100 | * A task's mempolicy (task->mempolicy) has different behavior. task->mempolicy |
101 | * is set and replaced under mmap_sem but unrefed and cleared under task_lock(). | 101 | * is set and replaced under mmap_sem but unrefed and cleared under task_lock(). |
102 | * So, without task_lock(), we cannot trust get_vma_policy() because we cannot | 102 | * So, without task_lock(), we cannot trust get_vma_policy() because we cannot |
103 | * gurantee the task never exits under us. But taking task_lock() around | 103 | * gurantee the task never exits under us. But taking task_lock() around |
104 | * get_vma_plicy() causes lock order problem. | 104 | * get_vma_plicy() causes lock order problem. |
105 | * | 105 | * |
106 | * To access task->mempolicy without lock, we hold a reference count of an | 106 | * To access task->mempolicy without lock, we hold a reference count of an |
107 | * object pointed by task->mempolicy and remember it. This will guarantee | 107 | * object pointed by task->mempolicy and remember it. This will guarantee |
108 | * that task->mempolicy points to an alive object or NULL in numa_maps accesses. | 108 | * that task->mempolicy points to an alive object or NULL in numa_maps accesses. |
109 | */ | 109 | */ |
110 | static void hold_task_mempolicy(struct proc_maps_private *priv) | 110 | static void hold_task_mempolicy(struct proc_maps_private *priv) |
111 | { | 111 | { |
112 | struct task_struct *task = priv->task; | 112 | struct task_struct *task = priv->task; |
113 | 113 | ||
114 | task_lock(task); | 114 | task_lock(task); |
115 | priv->task_mempolicy = task->mempolicy; | 115 | priv->task_mempolicy = task->mempolicy; |
116 | mpol_get(priv->task_mempolicy); | 116 | mpol_get(priv->task_mempolicy); |
117 | task_unlock(task); | 117 | task_unlock(task); |
118 | } | 118 | } |
119 | static void release_task_mempolicy(struct proc_maps_private *priv) | 119 | static void release_task_mempolicy(struct proc_maps_private *priv) |
120 | { | 120 | { |
121 | mpol_put(priv->task_mempolicy); | 121 | mpol_put(priv->task_mempolicy); |
122 | } | 122 | } |
123 | #else | 123 | #else |
124 | static void hold_task_mempolicy(struct proc_maps_private *priv) | 124 | static void hold_task_mempolicy(struct proc_maps_private *priv) |
125 | { | 125 | { |
126 | } | 126 | } |
127 | static void release_task_mempolicy(struct proc_maps_private *priv) | 127 | static void release_task_mempolicy(struct proc_maps_private *priv) |
128 | { | 128 | { |
129 | } | 129 | } |
130 | #endif | 130 | #endif |
131 | 131 | ||
132 | static void vma_stop(struct proc_maps_private *priv, struct vm_area_struct *vma) | 132 | static void vma_stop(struct proc_maps_private *priv, struct vm_area_struct *vma) |
133 | { | 133 | { |
134 | if (vma && vma != priv->tail_vma) { | 134 | if (vma && vma != priv->tail_vma) { |
135 | struct mm_struct *mm = vma->vm_mm; | 135 | struct mm_struct *mm = vma->vm_mm; |
136 | release_task_mempolicy(priv); | 136 | release_task_mempolicy(priv); |
137 | up_read(&mm->mmap_sem); | 137 | up_read(&mm->mmap_sem); |
138 | mmput(mm); | 138 | mmput(mm); |
139 | } | 139 | } |
140 | } | 140 | } |
141 | 141 | ||
142 | static void *m_start(struct seq_file *m, loff_t *pos) | 142 | static void *m_start(struct seq_file *m, loff_t *pos) |
143 | { | 143 | { |
144 | struct proc_maps_private *priv = m->private; | 144 | struct proc_maps_private *priv = m->private; |
145 | unsigned long last_addr = m->version; | 145 | unsigned long last_addr = m->version; |
146 | struct mm_struct *mm; | 146 | struct mm_struct *mm; |
147 | struct vm_area_struct *vma, *tail_vma = NULL; | 147 | struct vm_area_struct *vma, *tail_vma = NULL; |
148 | loff_t l = *pos; | 148 | loff_t l = *pos; |
149 | 149 | ||
150 | /* Clear the per syscall fields in priv */ | 150 | /* Clear the per syscall fields in priv */ |
151 | priv->task = NULL; | 151 | priv->task = NULL; |
152 | priv->tail_vma = NULL; | 152 | priv->tail_vma = NULL; |
153 | 153 | ||
154 | /* | 154 | /* |
155 | * We remember last_addr rather than next_addr to hit with | 155 | * We remember last_addr rather than next_addr to hit with |
156 | * vmacache most of the time. We have zero last_addr at | 156 | * vmacache most of the time. We have zero last_addr at |
157 | * the beginning and also after lseek. We will have -1 last_addr | 157 | * the beginning and also after lseek. We will have -1 last_addr |
158 | * after the end of the vmas. | 158 | * after the end of the vmas. |
159 | */ | 159 | */ |
160 | 160 | ||
161 | if (last_addr == -1UL) | 161 | if (last_addr == -1UL) |
162 | return NULL; | 162 | return NULL; |
163 | 163 | ||
164 | priv->task = get_pid_task(priv->pid, PIDTYPE_PID); | 164 | priv->task = get_pid_task(priv->pid, PIDTYPE_PID); |
165 | if (!priv->task) | 165 | if (!priv->task) |
166 | return ERR_PTR(-ESRCH); | 166 | return ERR_PTR(-ESRCH); |
167 | 167 | ||
168 | mm = mm_access(priv->task, PTRACE_MODE_READ); | 168 | mm = mm_access(priv->task, PTRACE_MODE_READ); |
169 | if (!mm || IS_ERR(mm)) | 169 | if (!mm || IS_ERR(mm)) |
170 | return mm; | 170 | return mm; |
171 | down_read(&mm->mmap_sem); | 171 | down_read(&mm->mmap_sem); |
172 | 172 | ||
173 | tail_vma = get_gate_vma(priv->task->mm); | 173 | tail_vma = get_gate_vma(priv->task->mm); |
174 | priv->tail_vma = tail_vma; | 174 | priv->tail_vma = tail_vma; |
175 | hold_task_mempolicy(priv); | 175 | hold_task_mempolicy(priv); |
176 | /* Start with last addr hint */ | 176 | /* Start with last addr hint */ |
177 | vma = find_vma(mm, last_addr); | 177 | vma = find_vma(mm, last_addr); |
178 | if (last_addr && vma) { | 178 | if (last_addr && vma) { |
179 | vma = vma->vm_next; | 179 | vma = vma->vm_next; |
180 | goto out; | 180 | goto out; |
181 | } | 181 | } |
182 | 182 | ||
183 | /* | 183 | /* |
184 | * Check the vma index is within the range and do | 184 | * Check the vma index is within the range and do |
185 | * sequential scan until m_index. | 185 | * sequential scan until m_index. |
186 | */ | 186 | */ |
187 | vma = NULL; | 187 | vma = NULL; |
188 | if ((unsigned long)l < mm->map_count) { | 188 | if ((unsigned long)l < mm->map_count) { |
189 | vma = mm->mmap; | 189 | vma = mm->mmap; |
190 | while (l-- && vma) | 190 | while (l-- && vma) |
191 | vma = vma->vm_next; | 191 | vma = vma->vm_next; |
192 | goto out; | 192 | goto out; |
193 | } | 193 | } |
194 | 194 | ||
195 | if (l != mm->map_count) | 195 | if (l != mm->map_count) |
196 | tail_vma = NULL; /* After gate vma */ | 196 | tail_vma = NULL; /* After gate vma */ |
197 | 197 | ||
198 | out: | 198 | out: |
199 | if (vma) | 199 | if (vma) |
200 | return vma; | 200 | return vma; |
201 | 201 | ||
202 | release_task_mempolicy(priv); | 202 | release_task_mempolicy(priv); |
203 | /* End of vmas has been reached */ | 203 | /* End of vmas has been reached */ |
204 | m->version = (tail_vma != NULL)? 0: -1UL; | 204 | m->version = (tail_vma != NULL)? 0: -1UL; |
205 | up_read(&mm->mmap_sem); | 205 | up_read(&mm->mmap_sem); |
206 | mmput(mm); | 206 | mmput(mm); |
207 | return tail_vma; | 207 | return tail_vma; |
208 | } | 208 | } |
209 | 209 | ||
210 | static void *m_next(struct seq_file *m, void *v, loff_t *pos) | 210 | static void *m_next(struct seq_file *m, void *v, loff_t *pos) |
211 | { | 211 | { |
212 | struct proc_maps_private *priv = m->private; | 212 | struct proc_maps_private *priv = m->private; |
213 | struct vm_area_struct *vma = v; | 213 | struct vm_area_struct *vma = v; |
214 | struct vm_area_struct *tail_vma = priv->tail_vma; | 214 | struct vm_area_struct *tail_vma = priv->tail_vma; |
215 | 215 | ||
216 | (*pos)++; | 216 | (*pos)++; |
217 | if (vma && (vma != tail_vma) && vma->vm_next) | 217 | if (vma && (vma != tail_vma) && vma->vm_next) |
218 | return vma->vm_next; | 218 | return vma->vm_next; |
219 | vma_stop(priv, vma); | 219 | vma_stop(priv, vma); |
220 | return (vma != tail_vma)? tail_vma: NULL; | 220 | return (vma != tail_vma)? tail_vma: NULL; |
221 | } | 221 | } |
222 | 222 | ||
223 | static void m_stop(struct seq_file *m, void *v) | 223 | static void m_stop(struct seq_file *m, void *v) |
224 | { | 224 | { |
225 | struct proc_maps_private *priv = m->private; | 225 | struct proc_maps_private *priv = m->private; |
226 | struct vm_area_struct *vma = v; | 226 | struct vm_area_struct *vma = v; |
227 | 227 | ||
228 | if (!IS_ERR(vma)) | 228 | if (!IS_ERR(vma)) |
229 | vma_stop(priv, vma); | 229 | vma_stop(priv, vma); |
230 | if (priv->task) | 230 | if (priv->task) |
231 | put_task_struct(priv->task); | 231 | put_task_struct(priv->task); |
232 | } | 232 | } |
233 | 233 | ||
234 | static int do_maps_open(struct inode *inode, struct file *file, | 234 | static int do_maps_open(struct inode *inode, struct file *file, |
235 | const struct seq_operations *ops) | 235 | const struct seq_operations *ops) |
236 | { | 236 | { |
237 | struct proc_maps_private *priv; | 237 | struct proc_maps_private *priv; |
238 | int ret = -ENOMEM; | 238 | int ret = -ENOMEM; |
239 | priv = kzalloc(sizeof(*priv), GFP_KERNEL); | 239 | priv = kzalloc(sizeof(*priv), GFP_KERNEL); |
240 | if (priv) { | 240 | if (priv) { |
241 | priv->pid = proc_pid(inode); | 241 | priv->pid = proc_pid(inode); |
242 | ret = seq_open(file, ops); | 242 | ret = seq_open(file, ops); |
243 | if (!ret) { | 243 | if (!ret) { |
244 | struct seq_file *m = file->private_data; | 244 | struct seq_file *m = file->private_data; |
245 | m->private = priv; | 245 | m->private = priv; |
246 | } else { | 246 | } else { |
247 | kfree(priv); | 247 | kfree(priv); |
248 | } | 248 | } |
249 | } | 249 | } |
250 | return ret; | 250 | return ret; |
251 | } | 251 | } |
252 | 252 | ||
253 | static void | 253 | static void |
254 | show_map_vma(struct seq_file *m, struct vm_area_struct *vma, int is_pid) | 254 | show_map_vma(struct seq_file *m, struct vm_area_struct *vma, int is_pid) |
255 | { | 255 | { |
256 | struct mm_struct *mm = vma->vm_mm; | 256 | struct mm_struct *mm = vma->vm_mm; |
257 | struct file *file = vma->vm_file; | 257 | struct file *file = vma->vm_file; |
258 | struct proc_maps_private *priv = m->private; | 258 | struct proc_maps_private *priv = m->private; |
259 | struct task_struct *task = priv->task; | 259 | struct task_struct *task = priv->task; |
260 | vm_flags_t flags = vma->vm_flags; | 260 | vm_flags_t flags = vma->vm_flags; |
261 | unsigned long ino = 0; | 261 | unsigned long ino = 0; |
262 | unsigned long long pgoff = 0; | 262 | unsigned long long pgoff = 0; |
263 | unsigned long start, end; | 263 | unsigned long start, end; |
264 | dev_t dev = 0; | 264 | dev_t dev = 0; |
265 | const char *name = NULL; | 265 | const char *name = NULL; |
266 | 266 | ||
267 | if (file) { | 267 | if (file) { |
268 | struct inode *inode = file_inode(vma->vm_file); | 268 | struct inode *inode = file_inode(vma->vm_file); |
269 | dev = inode->i_sb->s_dev; | 269 | dev = inode->i_sb->s_dev; |
270 | ino = inode->i_ino; | 270 | ino = inode->i_ino; |
271 | pgoff = ((loff_t)vma->vm_pgoff) << PAGE_SHIFT; | 271 | pgoff = ((loff_t)vma->vm_pgoff) << PAGE_SHIFT; |
272 | } | 272 | } |
273 | 273 | ||
274 | /* We don't show the stack guard page in /proc/maps */ | 274 | /* We don't show the stack guard page in /proc/maps */ |
275 | start = vma->vm_start; | 275 | start = vma->vm_start; |
276 | if (stack_guard_page_start(vma, start)) | 276 | if (stack_guard_page_start(vma, start)) |
277 | start += PAGE_SIZE; | 277 | start += PAGE_SIZE; |
278 | end = vma->vm_end; | 278 | end = vma->vm_end; |
279 | if (stack_guard_page_end(vma, end)) | 279 | if (stack_guard_page_end(vma, end)) |
280 | end -= PAGE_SIZE; | 280 | end -= PAGE_SIZE; |
281 | 281 | ||
282 | seq_setwidth(m, 25 + sizeof(void *) * 6 - 1); | 282 | seq_setwidth(m, 25 + sizeof(void *) * 6 - 1); |
283 | seq_printf(m, "%08lx-%08lx %c%c%c%c %08llx %02x:%02x %lu ", | 283 | seq_printf(m, "%08lx-%08lx %c%c%c%c %08llx %02x:%02x %lu ", |
284 | start, | 284 | start, |
285 | end, | 285 | end, |
286 | flags & VM_READ ? 'r' : '-', | 286 | flags & VM_READ ? 'r' : '-', |
287 | flags & VM_WRITE ? 'w' : '-', | 287 | flags & VM_WRITE ? 'w' : '-', |
288 | flags & VM_EXEC ? 'x' : '-', | 288 | flags & VM_EXEC ? 'x' : '-', |
289 | flags & VM_MAYSHARE ? 's' : 'p', | 289 | flags & VM_MAYSHARE ? 's' : 'p', |
290 | pgoff, | 290 | pgoff, |
291 | MAJOR(dev), MINOR(dev), ino); | 291 | MAJOR(dev), MINOR(dev), ino); |
292 | 292 | ||
293 | /* | 293 | /* |
294 | * Print the dentry name for named mappings, and a | 294 | * Print the dentry name for named mappings, and a |
295 | * special [heap] marker for the heap: | 295 | * special [heap] marker for the heap: |
296 | */ | 296 | */ |
297 | if (file) { | 297 | if (file) { |
298 | seq_pad(m, ' '); | 298 | seq_pad(m, ' '); |
299 | seq_path(m, &file->f_path, "\n"); | 299 | seq_path(m, &file->f_path, "\n"); |
300 | goto done; | 300 | goto done; |
301 | } | 301 | } |
302 | 302 | ||
303 | name = arch_vma_name(vma); | 303 | name = arch_vma_name(vma); |
304 | if (!name) { | 304 | if (!name) { |
305 | pid_t tid; | 305 | pid_t tid; |
306 | 306 | ||
307 | if (!mm) { | 307 | if (!mm) { |
308 | name = "[vdso]"; | 308 | name = "[vdso]"; |
309 | goto done; | 309 | goto done; |
310 | } | 310 | } |
311 | 311 | ||
312 | if (vma->vm_start <= mm->brk && | 312 | if (vma->vm_start <= mm->brk && |
313 | vma->vm_end >= mm->start_brk) { | 313 | vma->vm_end >= mm->start_brk) { |
314 | name = "[heap]"; | 314 | name = "[heap]"; |
315 | goto done; | 315 | goto done; |
316 | } | 316 | } |
317 | 317 | ||
318 | tid = vm_is_stack(task, vma, is_pid); | 318 | tid = vm_is_stack(task, vma, is_pid); |
319 | 319 | ||
320 | if (tid != 0) { | 320 | if (tid != 0) { |
321 | /* | 321 | /* |
322 | * Thread stack in /proc/PID/task/TID/maps or | 322 | * Thread stack in /proc/PID/task/TID/maps or |
323 | * the main process stack. | 323 | * the main process stack. |
324 | */ | 324 | */ |
325 | if (!is_pid || (vma->vm_start <= mm->start_stack && | 325 | if (!is_pid || (vma->vm_start <= mm->start_stack && |
326 | vma->vm_end >= mm->start_stack)) { | 326 | vma->vm_end >= mm->start_stack)) { |
327 | name = "[stack]"; | 327 | name = "[stack]"; |
328 | } else { | 328 | } else { |
329 | /* Thread stack in /proc/PID/maps */ | 329 | /* Thread stack in /proc/PID/maps */ |
330 | seq_pad(m, ' '); | 330 | seq_pad(m, ' '); |
331 | seq_printf(m, "[stack:%d]", tid); | 331 | seq_printf(m, "[stack:%d]", tid); |
332 | } | 332 | } |
333 | } | 333 | } |
334 | } | 334 | } |
335 | 335 | ||
336 | done: | 336 | done: |
337 | if (name) { | 337 | if (name) { |
338 | seq_pad(m, ' '); | 338 | seq_pad(m, ' '); |
339 | seq_puts(m, name); | 339 | seq_puts(m, name); |
340 | } | 340 | } |
341 | seq_putc(m, '\n'); | 341 | seq_putc(m, '\n'); |
342 | } | 342 | } |
343 | 343 | ||
344 | static int show_map(struct seq_file *m, void *v, int is_pid) | 344 | static int show_map(struct seq_file *m, void *v, int is_pid) |
345 | { | 345 | { |
346 | struct vm_area_struct *vma = v; | 346 | struct vm_area_struct *vma = v; |
347 | struct proc_maps_private *priv = m->private; | 347 | struct proc_maps_private *priv = m->private; |
348 | struct task_struct *task = priv->task; | 348 | struct task_struct *task = priv->task; |
349 | 349 | ||
350 | show_map_vma(m, vma, is_pid); | 350 | show_map_vma(m, vma, is_pid); |
351 | 351 | ||
352 | if (m->count < m->size) /* vma is copied successfully */ | 352 | if (m->count < m->size) /* vma is copied successfully */ |
353 | m->version = (vma != get_gate_vma(task->mm)) | 353 | m->version = (vma != get_gate_vma(task->mm)) |
354 | ? vma->vm_start : 0; | 354 | ? vma->vm_start : 0; |
355 | return 0; | 355 | return 0; |
356 | } | 356 | } |
357 | 357 | ||
358 | static int show_pid_map(struct seq_file *m, void *v) | 358 | static int show_pid_map(struct seq_file *m, void *v) |
359 | { | 359 | { |
360 | return show_map(m, v, 1); | 360 | return show_map(m, v, 1); |
361 | } | 361 | } |
362 | 362 | ||
363 | static int show_tid_map(struct seq_file *m, void *v) | 363 | static int show_tid_map(struct seq_file *m, void *v) |
364 | { | 364 | { |
365 | return show_map(m, v, 0); | 365 | return show_map(m, v, 0); |
366 | } | 366 | } |
367 | 367 | ||
368 | static const struct seq_operations proc_pid_maps_op = { | 368 | static const struct seq_operations proc_pid_maps_op = { |
369 | .start = m_start, | 369 | .start = m_start, |
370 | .next = m_next, | 370 | .next = m_next, |
371 | .stop = m_stop, | 371 | .stop = m_stop, |
372 | .show = show_pid_map | 372 | .show = show_pid_map |
373 | }; | 373 | }; |
374 | 374 | ||
375 | static const struct seq_operations proc_tid_maps_op = { | 375 | static const struct seq_operations proc_tid_maps_op = { |
376 | .start = m_start, | 376 | .start = m_start, |
377 | .next = m_next, | 377 | .next = m_next, |
378 | .stop = m_stop, | 378 | .stop = m_stop, |
379 | .show = show_tid_map | 379 | .show = show_tid_map |
380 | }; | 380 | }; |
381 | 381 | ||
382 | static int pid_maps_open(struct inode *inode, struct file *file) | 382 | static int pid_maps_open(struct inode *inode, struct file *file) |
383 | { | 383 | { |
384 | return do_maps_open(inode, file, &proc_pid_maps_op); | 384 | return do_maps_open(inode, file, &proc_pid_maps_op); |
385 | } | 385 | } |
386 | 386 | ||
387 | static int tid_maps_open(struct inode *inode, struct file *file) | 387 | static int tid_maps_open(struct inode *inode, struct file *file) |
388 | { | 388 | { |
389 | return do_maps_open(inode, file, &proc_tid_maps_op); | 389 | return do_maps_open(inode, file, &proc_tid_maps_op); |
390 | } | 390 | } |
391 | 391 | ||
392 | const struct file_operations proc_pid_maps_operations = { | 392 | const struct file_operations proc_pid_maps_operations = { |
393 | .open = pid_maps_open, | 393 | .open = pid_maps_open, |
394 | .read = seq_read, | 394 | .read = seq_read, |
395 | .llseek = seq_lseek, | 395 | .llseek = seq_lseek, |
396 | .release = seq_release_private, | 396 | .release = seq_release_private, |
397 | }; | 397 | }; |
398 | 398 | ||
399 | const struct file_operations proc_tid_maps_operations = { | 399 | const struct file_operations proc_tid_maps_operations = { |
400 | .open = tid_maps_open, | 400 | .open = tid_maps_open, |
401 | .read = seq_read, | 401 | .read = seq_read, |
402 | .llseek = seq_lseek, | 402 | .llseek = seq_lseek, |
403 | .release = seq_release_private, | 403 | .release = seq_release_private, |
404 | }; | 404 | }; |
405 | 405 | ||
406 | /* | 406 | /* |
407 | * Proportional Set Size(PSS): my share of RSS. | 407 | * Proportional Set Size(PSS): my share of RSS. |
408 | * | 408 | * |
409 | * PSS of a process is the count of pages it has in memory, where each | 409 | * PSS of a process is the count of pages it has in memory, where each |
410 | * page is divided by the number of processes sharing it. So if a | 410 | * page is divided by the number of processes sharing it. So if a |
411 | * process has 1000 pages all to itself, and 1000 shared with one other | 411 | * process has 1000 pages all to itself, and 1000 shared with one other |
412 | * process, its PSS will be 1500. | 412 | * process, its PSS will be 1500. |
413 | * | 413 | * |
414 | * To keep (accumulated) division errors low, we adopt a 64bit | 414 | * To keep (accumulated) division errors low, we adopt a 64bit |
415 | * fixed-point pss counter to minimize division errors. So (pss >> | 415 | * fixed-point pss counter to minimize division errors. So (pss >> |
416 | * PSS_SHIFT) would be the real byte count. | 416 | * PSS_SHIFT) would be the real byte count. |
417 | * | 417 | * |
418 | * A shift of 12 before division means (assuming 4K page size): | 418 | * A shift of 12 before division means (assuming 4K page size): |
419 | * - 1M 3-user-pages add up to 8KB errors; | 419 | * - 1M 3-user-pages add up to 8KB errors; |
420 | * - supports mapcount up to 2^24, or 16M; | 420 | * - supports mapcount up to 2^24, or 16M; |
421 | * - supports PSS up to 2^52 bytes, or 4PB. | 421 | * - supports PSS up to 2^52 bytes, or 4PB. |
422 | */ | 422 | */ |
423 | #define PSS_SHIFT 12 | 423 | #define PSS_SHIFT 12 |
424 | 424 | ||
425 | #ifdef CONFIG_PROC_PAGE_MONITOR | 425 | #ifdef CONFIG_PROC_PAGE_MONITOR |
426 | struct mem_size_stats { | 426 | struct mem_size_stats { |
427 | struct vm_area_struct *vma; | 427 | struct vm_area_struct *vma; |
428 | unsigned long resident; | 428 | unsigned long resident; |
429 | unsigned long shared_clean; | 429 | unsigned long shared_clean; |
430 | unsigned long shared_dirty; | 430 | unsigned long shared_dirty; |
431 | unsigned long private_clean; | 431 | unsigned long private_clean; |
432 | unsigned long private_dirty; | 432 | unsigned long private_dirty; |
433 | unsigned long referenced; | 433 | unsigned long referenced; |
434 | unsigned long anonymous; | 434 | unsigned long anonymous; |
435 | unsigned long anonymous_thp; | 435 | unsigned long anonymous_thp; |
436 | unsigned long swap; | 436 | unsigned long swap; |
437 | unsigned long nonlinear; | 437 | unsigned long nonlinear; |
438 | u64 pss; | 438 | u64 pss; |
439 | }; | 439 | }; |
440 | 440 | ||
441 | 441 | ||
442 | static void smaps_pte_entry(pte_t ptent, unsigned long addr, | 442 | static void smaps_pte_entry(pte_t ptent, unsigned long addr, |
443 | unsigned long ptent_size, struct mm_walk *walk) | 443 | unsigned long ptent_size, struct mm_walk *walk) |
444 | { | 444 | { |
445 | struct mem_size_stats *mss = walk->private; | 445 | struct mem_size_stats *mss = walk->private; |
446 | struct vm_area_struct *vma = mss->vma; | 446 | struct vm_area_struct *vma = mss->vma; |
447 | pgoff_t pgoff = linear_page_index(vma, addr); | 447 | pgoff_t pgoff = linear_page_index(vma, addr); |
448 | struct page *page = NULL; | 448 | struct page *page = NULL; |
449 | int mapcount; | 449 | int mapcount; |
450 | 450 | ||
451 | if (pte_present(ptent)) { | 451 | if (pte_present(ptent)) { |
452 | page = vm_normal_page(vma, addr, ptent); | 452 | page = vm_normal_page(vma, addr, ptent); |
453 | } else if (is_swap_pte(ptent)) { | 453 | } else if (is_swap_pte(ptent)) { |
454 | swp_entry_t swpent = pte_to_swp_entry(ptent); | 454 | swp_entry_t swpent = pte_to_swp_entry(ptent); |
455 | 455 | ||
456 | if (!non_swap_entry(swpent)) | 456 | if (!non_swap_entry(swpent)) |
457 | mss->swap += ptent_size; | 457 | mss->swap += ptent_size; |
458 | else if (is_migration_entry(swpent)) | 458 | else if (is_migration_entry(swpent)) |
459 | page = migration_entry_to_page(swpent); | 459 | page = migration_entry_to_page(swpent); |
460 | } else if (pte_file(ptent)) { | 460 | } else if (pte_file(ptent)) { |
461 | if (pte_to_pgoff(ptent) != pgoff) | 461 | if (pte_to_pgoff(ptent) != pgoff) |
462 | mss->nonlinear += ptent_size; | 462 | mss->nonlinear += ptent_size; |
463 | } | 463 | } |
464 | 464 | ||
465 | if (!page) | 465 | if (!page) |
466 | return; | 466 | return; |
467 | 467 | ||
468 | if (PageAnon(page)) | 468 | if (PageAnon(page)) |
469 | mss->anonymous += ptent_size; | 469 | mss->anonymous += ptent_size; |
470 | 470 | ||
471 | if (page->index != pgoff) | 471 | if (page->index != pgoff) |
472 | mss->nonlinear += ptent_size; | 472 | mss->nonlinear += ptent_size; |
473 | 473 | ||
474 | mss->resident += ptent_size; | 474 | mss->resident += ptent_size; |
475 | /* Accumulate the size in pages that have been accessed. */ | 475 | /* Accumulate the size in pages that have been accessed. */ |
476 | if (pte_young(ptent) || PageReferenced(page)) | 476 | if (pte_young(ptent) || PageReferenced(page)) |
477 | mss->referenced += ptent_size; | 477 | mss->referenced += ptent_size; |
478 | mapcount = page_mapcount(page); | 478 | mapcount = page_mapcount(page); |
479 | if (mapcount >= 2) { | 479 | if (mapcount >= 2) { |
480 | if (pte_dirty(ptent) || PageDirty(page)) | 480 | if (pte_dirty(ptent) || PageDirty(page)) |
481 | mss->shared_dirty += ptent_size; | 481 | mss->shared_dirty += ptent_size; |
482 | else | 482 | else |
483 | mss->shared_clean += ptent_size; | 483 | mss->shared_clean += ptent_size; |
484 | mss->pss += (ptent_size << PSS_SHIFT) / mapcount; | 484 | mss->pss += (ptent_size << PSS_SHIFT) / mapcount; |
485 | } else { | 485 | } else { |
486 | if (pte_dirty(ptent) || PageDirty(page)) | 486 | if (pte_dirty(ptent) || PageDirty(page)) |
487 | mss->private_dirty += ptent_size; | 487 | mss->private_dirty += ptent_size; |
488 | else | 488 | else |
489 | mss->private_clean += ptent_size; | 489 | mss->private_clean += ptent_size; |
490 | mss->pss += (ptent_size << PSS_SHIFT); | 490 | mss->pss += (ptent_size << PSS_SHIFT); |
491 | } | 491 | } |
492 | } | 492 | } |
493 | 493 | ||
494 | static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, | 494 | static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, |
495 | struct mm_walk *walk) | 495 | struct mm_walk *walk) |
496 | { | 496 | { |
497 | struct mem_size_stats *mss = walk->private; | 497 | struct mem_size_stats *mss = walk->private; |
498 | struct vm_area_struct *vma = mss->vma; | 498 | struct vm_area_struct *vma = mss->vma; |
499 | pte_t *pte; | 499 | pte_t *pte; |
500 | spinlock_t *ptl; | 500 | spinlock_t *ptl; |
501 | 501 | ||
502 | if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) { | 502 | if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) { |
503 | smaps_pte_entry(*(pte_t *)pmd, addr, HPAGE_PMD_SIZE, walk); | 503 | smaps_pte_entry(*(pte_t *)pmd, addr, HPAGE_PMD_SIZE, walk); |
504 | spin_unlock(ptl); | 504 | spin_unlock(ptl); |
505 | mss->anonymous_thp += HPAGE_PMD_SIZE; | 505 | mss->anonymous_thp += HPAGE_PMD_SIZE; |
506 | return 0; | 506 | return 0; |
507 | } | 507 | } |
508 | 508 | ||
509 | if (pmd_trans_unstable(pmd)) | 509 | if (pmd_trans_unstable(pmd)) |
510 | return 0; | 510 | return 0; |
511 | /* | 511 | /* |
512 | * The mmap_sem held all the way back in m_start() is what | 512 | * The mmap_sem held all the way back in m_start() is what |
513 | * keeps khugepaged out of here and from collapsing things | 513 | * keeps khugepaged out of here and from collapsing things |
514 | * in here. | 514 | * in here. |
515 | */ | 515 | */ |
516 | pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); | 516 | pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); |
517 | for (; addr != end; pte++, addr += PAGE_SIZE) | 517 | for (; addr != end; pte++, addr += PAGE_SIZE) |
518 | smaps_pte_entry(*pte, addr, PAGE_SIZE, walk); | 518 | smaps_pte_entry(*pte, addr, PAGE_SIZE, walk); |
519 | pte_unmap_unlock(pte - 1, ptl); | 519 | pte_unmap_unlock(pte - 1, ptl); |
520 | cond_resched(); | 520 | cond_resched(); |
521 | return 0; | 521 | return 0; |
522 | } | 522 | } |
523 | 523 | ||
524 | static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma) | 524 | static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma) |
525 | { | 525 | { |
526 | /* | 526 | /* |
527 | * Don't forget to update Documentation/ on changes. | 527 | * Don't forget to update Documentation/ on changes. |
528 | */ | 528 | */ |
529 | static const char mnemonics[BITS_PER_LONG][2] = { | 529 | static const char mnemonics[BITS_PER_LONG][2] = { |
530 | /* | 530 | /* |
531 | * In case if we meet a flag we don't know about. | 531 | * In case if we meet a flag we don't know about. |
532 | */ | 532 | */ |
533 | [0 ... (BITS_PER_LONG-1)] = "??", | 533 | [0 ... (BITS_PER_LONG-1)] = "??", |
534 | 534 | ||
535 | [ilog2(VM_READ)] = "rd", | 535 | [ilog2(VM_READ)] = "rd", |
536 | [ilog2(VM_WRITE)] = "wr", | 536 | [ilog2(VM_WRITE)] = "wr", |
537 | [ilog2(VM_EXEC)] = "ex", | 537 | [ilog2(VM_EXEC)] = "ex", |
538 | [ilog2(VM_SHARED)] = "sh", | 538 | [ilog2(VM_SHARED)] = "sh", |
539 | [ilog2(VM_MAYREAD)] = "mr", | 539 | [ilog2(VM_MAYREAD)] = "mr", |
540 | [ilog2(VM_MAYWRITE)] = "mw", | 540 | [ilog2(VM_MAYWRITE)] = "mw", |
541 | [ilog2(VM_MAYEXEC)] = "me", | 541 | [ilog2(VM_MAYEXEC)] = "me", |
542 | [ilog2(VM_MAYSHARE)] = "ms", | 542 | [ilog2(VM_MAYSHARE)] = "ms", |
543 | [ilog2(VM_GROWSDOWN)] = "gd", | 543 | [ilog2(VM_GROWSDOWN)] = "gd", |
544 | [ilog2(VM_PFNMAP)] = "pf", | 544 | [ilog2(VM_PFNMAP)] = "pf", |
545 | [ilog2(VM_DENYWRITE)] = "dw", | 545 | [ilog2(VM_DENYWRITE)] = "dw", |
546 | [ilog2(VM_LOCKED)] = "lo", | 546 | [ilog2(VM_LOCKED)] = "lo", |
547 | [ilog2(VM_IO)] = "io", | 547 | [ilog2(VM_IO)] = "io", |
548 | [ilog2(VM_SEQ_READ)] = "sr", | 548 | [ilog2(VM_SEQ_READ)] = "sr", |
549 | [ilog2(VM_RAND_READ)] = "rr", | 549 | [ilog2(VM_RAND_READ)] = "rr", |
550 | [ilog2(VM_DONTCOPY)] = "dc", | 550 | [ilog2(VM_DONTCOPY)] = "dc", |
551 | [ilog2(VM_DONTEXPAND)] = "de", | 551 | [ilog2(VM_DONTEXPAND)] = "de", |
552 | [ilog2(VM_ACCOUNT)] = "ac", | 552 | [ilog2(VM_ACCOUNT)] = "ac", |
553 | [ilog2(VM_NORESERVE)] = "nr", | 553 | [ilog2(VM_NORESERVE)] = "nr", |
554 | [ilog2(VM_HUGETLB)] = "ht", | 554 | [ilog2(VM_HUGETLB)] = "ht", |
555 | [ilog2(VM_NONLINEAR)] = "nl", | 555 | [ilog2(VM_NONLINEAR)] = "nl", |
556 | [ilog2(VM_ARCH_1)] = "ar", | 556 | [ilog2(VM_ARCH_1)] = "ar", |
557 | [ilog2(VM_DONTDUMP)] = "dd", | 557 | [ilog2(VM_DONTDUMP)] = "dd", |
558 | #ifdef CONFIG_MEM_SOFT_DIRTY | 558 | #ifdef CONFIG_MEM_SOFT_DIRTY |
559 | [ilog2(VM_SOFTDIRTY)] = "sd", | 559 | [ilog2(VM_SOFTDIRTY)] = "sd", |
560 | #endif | 560 | #endif |
561 | [ilog2(VM_MIXEDMAP)] = "mm", | 561 | [ilog2(VM_MIXEDMAP)] = "mm", |
562 | [ilog2(VM_HUGEPAGE)] = "hg", | 562 | [ilog2(VM_HUGEPAGE)] = "hg", |
563 | [ilog2(VM_NOHUGEPAGE)] = "nh", | 563 | [ilog2(VM_NOHUGEPAGE)] = "nh", |
564 | [ilog2(VM_MERGEABLE)] = "mg", | 564 | [ilog2(VM_MERGEABLE)] = "mg", |
565 | }; | 565 | }; |
566 | size_t i; | 566 | size_t i; |
567 | 567 | ||
568 | seq_puts(m, "VmFlags: "); | 568 | seq_puts(m, "VmFlags: "); |
569 | for (i = 0; i < BITS_PER_LONG; i++) { | 569 | for (i = 0; i < BITS_PER_LONG; i++) { |
570 | if (vma->vm_flags & (1UL << i)) { | 570 | if (vma->vm_flags & (1UL << i)) { |
571 | seq_printf(m, "%c%c ", | 571 | seq_printf(m, "%c%c ", |
572 | mnemonics[i][0], mnemonics[i][1]); | 572 | mnemonics[i][0], mnemonics[i][1]); |
573 | } | 573 | } |
574 | } | 574 | } |
575 | seq_putc(m, '\n'); | 575 | seq_putc(m, '\n'); |
576 | } | 576 | } |
577 | 577 | ||
578 | static int show_smap(struct seq_file *m, void *v, int is_pid) | 578 | static int show_smap(struct seq_file *m, void *v, int is_pid) |
579 | { | 579 | { |
580 | struct proc_maps_private *priv = m->private; | 580 | struct proc_maps_private *priv = m->private; |
581 | struct task_struct *task = priv->task; | 581 | struct task_struct *task = priv->task; |
582 | struct vm_area_struct *vma = v; | 582 | struct vm_area_struct *vma = v; |
583 | struct mem_size_stats mss; | 583 | struct mem_size_stats mss; |
584 | struct mm_walk smaps_walk = { | 584 | struct mm_walk smaps_walk = { |
585 | .pmd_entry = smaps_pte_range, | 585 | .pmd_entry = smaps_pte_range, |
586 | .mm = vma->vm_mm, | 586 | .mm = vma->vm_mm, |
587 | .private = &mss, | 587 | .private = &mss, |
588 | }; | 588 | }; |
589 | 589 | ||
590 | memset(&mss, 0, sizeof mss); | 590 | memset(&mss, 0, sizeof mss); |
591 | mss.vma = vma; | 591 | mss.vma = vma; |
592 | /* mmap_sem is held in m_start */ | 592 | /* mmap_sem is held in m_start */ |
593 | if (vma->vm_mm && !is_vm_hugetlb_page(vma)) | 593 | if (vma->vm_mm && !is_vm_hugetlb_page(vma)) |
594 | walk_page_range(vma->vm_start, vma->vm_end, &smaps_walk); | 594 | walk_page_range(vma->vm_start, vma->vm_end, &smaps_walk); |
595 | 595 | ||
596 | show_map_vma(m, vma, is_pid); | 596 | show_map_vma(m, vma, is_pid); |
597 | 597 | ||
598 | seq_printf(m, | 598 | seq_printf(m, |
599 | "Size: %8lu kB\n" | 599 | "Size: %8lu kB\n" |
600 | "Rss: %8lu kB\n" | 600 | "Rss: %8lu kB\n" |
601 | "Pss: %8lu kB\n" | 601 | "Pss: %8lu kB\n" |
602 | "Shared_Clean: %8lu kB\n" | 602 | "Shared_Clean: %8lu kB\n" |
603 | "Shared_Dirty: %8lu kB\n" | 603 | "Shared_Dirty: %8lu kB\n" |
604 | "Private_Clean: %8lu kB\n" | 604 | "Private_Clean: %8lu kB\n" |
605 | "Private_Dirty: %8lu kB\n" | 605 | "Private_Dirty: %8lu kB\n" |
606 | "Referenced: %8lu kB\n" | 606 | "Referenced: %8lu kB\n" |
607 | "Anonymous: %8lu kB\n" | 607 | "Anonymous: %8lu kB\n" |
608 | "AnonHugePages: %8lu kB\n" | 608 | "AnonHugePages: %8lu kB\n" |
609 | "Swap: %8lu kB\n" | 609 | "Swap: %8lu kB\n" |
610 | "KernelPageSize: %8lu kB\n" | 610 | "KernelPageSize: %8lu kB\n" |
611 | "MMUPageSize: %8lu kB\n" | 611 | "MMUPageSize: %8lu kB\n" |
612 | "Locked: %8lu kB\n", | 612 | "Locked: %8lu kB\n", |
613 | (vma->vm_end - vma->vm_start) >> 10, | 613 | (vma->vm_end - vma->vm_start) >> 10, |
614 | mss.resident >> 10, | 614 | mss.resident >> 10, |
615 | (unsigned long)(mss.pss >> (10 + PSS_SHIFT)), | 615 | (unsigned long)(mss.pss >> (10 + PSS_SHIFT)), |
616 | mss.shared_clean >> 10, | 616 | mss.shared_clean >> 10, |
617 | mss.shared_dirty >> 10, | 617 | mss.shared_dirty >> 10, |
618 | mss.private_clean >> 10, | 618 | mss.private_clean >> 10, |
619 | mss.private_dirty >> 10, | 619 | mss.private_dirty >> 10, |
620 | mss.referenced >> 10, | 620 | mss.referenced >> 10, |
621 | mss.anonymous >> 10, | 621 | mss.anonymous >> 10, |
622 | mss.anonymous_thp >> 10, | 622 | mss.anonymous_thp >> 10, |
623 | mss.swap >> 10, | 623 | mss.swap >> 10, |
624 | vma_kernel_pagesize(vma) >> 10, | 624 | vma_kernel_pagesize(vma) >> 10, |
625 | vma_mmu_pagesize(vma) >> 10, | 625 | vma_mmu_pagesize(vma) >> 10, |
626 | (vma->vm_flags & VM_LOCKED) ? | 626 | (vma->vm_flags & VM_LOCKED) ? |
627 | (unsigned long)(mss.pss >> (10 + PSS_SHIFT)) : 0); | 627 | (unsigned long)(mss.pss >> (10 + PSS_SHIFT)) : 0); |
628 | 628 | ||
629 | if (vma->vm_flags & VM_NONLINEAR) | 629 | if (vma->vm_flags & VM_NONLINEAR) |
630 | seq_printf(m, "Nonlinear: %8lu kB\n", | 630 | seq_printf(m, "Nonlinear: %8lu kB\n", |
631 | mss.nonlinear >> 10); | 631 | mss.nonlinear >> 10); |
632 | 632 | ||
633 | show_smap_vma_flags(m, vma); | 633 | show_smap_vma_flags(m, vma); |
634 | 634 | ||
635 | if (m->count < m->size) /* vma is copied successfully */ | 635 | if (m->count < m->size) /* vma is copied successfully */ |
636 | m->version = (vma != get_gate_vma(task->mm)) | 636 | m->version = (vma != get_gate_vma(task->mm)) |
637 | ? vma->vm_start : 0; | 637 | ? vma->vm_start : 0; |
638 | return 0; | 638 | return 0; |
639 | } | 639 | } |
640 | 640 | ||
641 | static int show_pid_smap(struct seq_file *m, void *v) | 641 | static int show_pid_smap(struct seq_file *m, void *v) |
642 | { | 642 | { |
643 | return show_smap(m, v, 1); | 643 | return show_smap(m, v, 1); |
644 | } | 644 | } |
645 | 645 | ||
646 | static int show_tid_smap(struct seq_file *m, void *v) | 646 | static int show_tid_smap(struct seq_file *m, void *v) |
647 | { | 647 | { |
648 | return show_smap(m, v, 0); | 648 | return show_smap(m, v, 0); |
649 | } | 649 | } |
650 | 650 | ||
651 | static const struct seq_operations proc_pid_smaps_op = { | 651 | static const struct seq_operations proc_pid_smaps_op = { |
652 | .start = m_start, | 652 | .start = m_start, |
653 | .next = m_next, | 653 | .next = m_next, |
654 | .stop = m_stop, | 654 | .stop = m_stop, |
655 | .show = show_pid_smap | 655 | .show = show_pid_smap |
656 | }; | 656 | }; |
657 | 657 | ||
658 | static const struct seq_operations proc_tid_smaps_op = { | 658 | static const struct seq_operations proc_tid_smaps_op = { |
659 | .start = m_start, | 659 | .start = m_start, |
660 | .next = m_next, | 660 | .next = m_next, |
661 | .stop = m_stop, | 661 | .stop = m_stop, |
662 | .show = show_tid_smap | 662 | .show = show_tid_smap |
663 | }; | 663 | }; |
664 | 664 | ||
665 | static int pid_smaps_open(struct inode *inode, struct file *file) | 665 | static int pid_smaps_open(struct inode *inode, struct file *file) |
666 | { | 666 | { |
667 | return do_maps_open(inode, file, &proc_pid_smaps_op); | 667 | return do_maps_open(inode, file, &proc_pid_smaps_op); |
668 | } | 668 | } |
669 | 669 | ||
670 | static int tid_smaps_open(struct inode *inode, struct file *file) | 670 | static int tid_smaps_open(struct inode *inode, struct file *file) |
671 | { | 671 | { |
672 | return do_maps_open(inode, file, &proc_tid_smaps_op); | 672 | return do_maps_open(inode, file, &proc_tid_smaps_op); |
673 | } | 673 | } |
674 | 674 | ||
675 | const struct file_operations proc_pid_smaps_operations = { | 675 | const struct file_operations proc_pid_smaps_operations = { |
676 | .open = pid_smaps_open, | 676 | .open = pid_smaps_open, |
677 | .read = seq_read, | 677 | .read = seq_read, |
678 | .llseek = seq_lseek, | 678 | .llseek = seq_lseek, |
679 | .release = seq_release_private, | 679 | .release = seq_release_private, |
680 | }; | 680 | }; |
681 | 681 | ||
682 | const struct file_operations proc_tid_smaps_operations = { | 682 | const struct file_operations proc_tid_smaps_operations = { |
683 | .open = tid_smaps_open, | 683 | .open = tid_smaps_open, |
684 | .read = seq_read, | 684 | .read = seq_read, |
685 | .llseek = seq_lseek, | 685 | .llseek = seq_lseek, |
686 | .release = seq_release_private, | 686 | .release = seq_release_private, |
687 | }; | 687 | }; |
688 | 688 | ||
689 | /* | 689 | /* |
690 | * We do not want to have constant page-shift bits sitting in | 690 | * We do not want to have constant page-shift bits sitting in |
691 | * pagemap entries and are about to reuse them some time soon. | 691 | * pagemap entries and are about to reuse them some time soon. |
692 | * | 692 | * |
693 | * Here's the "migration strategy": | 693 | * Here's the "migration strategy": |
694 | * 1. when the system boots these bits remain what they are, | 694 | * 1. when the system boots these bits remain what they are, |
695 | * but a warning about future change is printed in log; | 695 | * but a warning about future change is printed in log; |
696 | * 2. once anyone clears soft-dirty bits via clear_refs file, | 696 | * 2. once anyone clears soft-dirty bits via clear_refs file, |
697 | * these flag is set to denote, that user is aware of the | 697 | * these flag is set to denote, that user is aware of the |
698 | * new API and those page-shift bits change their meaning. | 698 | * new API and those page-shift bits change their meaning. |
699 | * The respective warning is printed in dmesg; | 699 | * The respective warning is printed in dmesg; |
700 | * 3. In a couple of releases we will remove all the mentions | 700 | * 3. In a couple of releases we will remove all the mentions |
701 | * of page-shift in pagemap entries. | 701 | * of page-shift in pagemap entries. |
702 | */ | 702 | */ |
703 | 703 | ||
704 | static bool soft_dirty_cleared __read_mostly; | 704 | static bool soft_dirty_cleared __read_mostly; |
705 | 705 | ||
706 | enum clear_refs_types { | 706 | enum clear_refs_types { |
707 | CLEAR_REFS_ALL = 1, | 707 | CLEAR_REFS_ALL = 1, |
708 | CLEAR_REFS_ANON, | 708 | CLEAR_REFS_ANON, |
709 | CLEAR_REFS_MAPPED, | 709 | CLEAR_REFS_MAPPED, |
710 | CLEAR_REFS_SOFT_DIRTY, | 710 | CLEAR_REFS_SOFT_DIRTY, |
711 | CLEAR_REFS_LAST, | 711 | CLEAR_REFS_LAST, |
712 | }; | 712 | }; |
713 | 713 | ||
714 | struct clear_refs_private { | 714 | struct clear_refs_private { |
715 | struct vm_area_struct *vma; | 715 | struct vm_area_struct *vma; |
716 | enum clear_refs_types type; | 716 | enum clear_refs_types type; |
717 | }; | 717 | }; |
718 | 718 | ||
719 | static inline void clear_soft_dirty(struct vm_area_struct *vma, | 719 | static inline void clear_soft_dirty(struct vm_area_struct *vma, |
720 | unsigned long addr, pte_t *pte) | 720 | unsigned long addr, pte_t *pte) |
721 | { | 721 | { |
722 | #ifdef CONFIG_MEM_SOFT_DIRTY | 722 | #ifdef CONFIG_MEM_SOFT_DIRTY |
723 | /* | 723 | /* |
724 | * The soft-dirty tracker uses #PF-s to catch writes | 724 | * The soft-dirty tracker uses #PF-s to catch writes |
725 | * to pages, so write-protect the pte as well. See the | 725 | * to pages, so write-protect the pte as well. See the |
726 | * Documentation/vm/soft-dirty.txt for full description | 726 | * Documentation/vm/soft-dirty.txt for full description |
727 | * of how soft-dirty works. | 727 | * of how soft-dirty works. |
728 | */ | 728 | */ |
729 | pte_t ptent = *pte; | 729 | pte_t ptent = *pte; |
730 | 730 | ||
731 | if (pte_present(ptent)) { | 731 | if (pte_present(ptent)) { |
732 | ptent = pte_wrprotect(ptent); | 732 | ptent = pte_wrprotect(ptent); |
733 | ptent = pte_clear_flags(ptent, _PAGE_SOFT_DIRTY); | 733 | ptent = pte_clear_flags(ptent, _PAGE_SOFT_DIRTY); |
734 | } else if (is_swap_pte(ptent)) { | 734 | } else if (is_swap_pte(ptent)) { |
735 | ptent = pte_swp_clear_soft_dirty(ptent); | 735 | ptent = pte_swp_clear_soft_dirty(ptent); |
736 | } else if (pte_file(ptent)) { | 736 | } else if (pte_file(ptent)) { |
737 | ptent = pte_file_clear_soft_dirty(ptent); | 737 | ptent = pte_file_clear_soft_dirty(ptent); |
738 | } | 738 | } |
739 | 739 | ||
740 | if (vma->vm_flags & VM_SOFTDIRTY) | 740 | if (vma->vm_flags & VM_SOFTDIRTY) |
741 | vma->vm_flags &= ~VM_SOFTDIRTY; | 741 | vma->vm_flags &= ~VM_SOFTDIRTY; |
742 | 742 | ||
743 | set_pte_at(vma->vm_mm, addr, pte, ptent); | 743 | set_pte_at(vma->vm_mm, addr, pte, ptent); |
744 | #endif | 744 | #endif |
745 | } | 745 | } |
746 | 746 | ||
747 | static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr, | 747 | static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr, |
748 | unsigned long end, struct mm_walk *walk) | 748 | unsigned long end, struct mm_walk *walk) |
749 | { | 749 | { |
750 | struct clear_refs_private *cp = walk->private; | 750 | struct clear_refs_private *cp = walk->private; |
751 | struct vm_area_struct *vma = cp->vma; | 751 | struct vm_area_struct *vma = cp->vma; |
752 | pte_t *pte, ptent; | 752 | pte_t *pte, ptent; |
753 | spinlock_t *ptl; | 753 | spinlock_t *ptl; |
754 | struct page *page; | 754 | struct page *page; |
755 | 755 | ||
756 | split_huge_page_pmd(vma, addr, pmd); | 756 | split_huge_page_pmd(vma, addr, pmd); |
757 | if (pmd_trans_unstable(pmd)) | 757 | if (pmd_trans_unstable(pmd)) |
758 | return 0; | 758 | return 0; |
759 | 759 | ||
760 | pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); | 760 | pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); |
761 | for (; addr != end; pte++, addr += PAGE_SIZE) { | 761 | for (; addr != end; pte++, addr += PAGE_SIZE) { |
762 | ptent = *pte; | 762 | ptent = *pte; |
763 | 763 | ||
764 | if (cp->type == CLEAR_REFS_SOFT_DIRTY) { | 764 | if (cp->type == CLEAR_REFS_SOFT_DIRTY) { |
765 | clear_soft_dirty(vma, addr, pte); | 765 | clear_soft_dirty(vma, addr, pte); |
766 | continue; | 766 | continue; |
767 | } | 767 | } |
768 | 768 | ||
769 | if (!pte_present(ptent)) | 769 | if (!pte_present(ptent)) |
770 | continue; | 770 | continue; |
771 | 771 | ||
772 | page = vm_normal_page(vma, addr, ptent); | 772 | page = vm_normal_page(vma, addr, ptent); |
773 | if (!page) | 773 | if (!page) |
774 | continue; | 774 | continue; |
775 | 775 | ||
776 | /* Clear accessed and referenced bits. */ | 776 | /* Clear accessed and referenced bits. */ |
777 | ptep_test_and_clear_young(vma, addr, pte); | 777 | ptep_test_and_clear_young(vma, addr, pte); |
778 | ClearPageReferenced(page); | 778 | ClearPageReferenced(page); |
779 | } | 779 | } |
780 | pte_unmap_unlock(pte - 1, ptl); | 780 | pte_unmap_unlock(pte - 1, ptl); |
781 | cond_resched(); | 781 | cond_resched(); |
782 | return 0; | 782 | return 0; |
783 | } | 783 | } |
784 | 784 | ||
785 | static ssize_t clear_refs_write(struct file *file, const char __user *buf, | 785 | static ssize_t clear_refs_write(struct file *file, const char __user *buf, |
786 | size_t count, loff_t *ppos) | 786 | size_t count, loff_t *ppos) |
787 | { | 787 | { |
788 | struct task_struct *task; | 788 | struct task_struct *task; |
789 | char buffer[PROC_NUMBUF]; | 789 | char buffer[PROC_NUMBUF]; |
790 | struct mm_struct *mm; | 790 | struct mm_struct *mm; |
791 | struct vm_area_struct *vma; | 791 | struct vm_area_struct *vma; |
792 | enum clear_refs_types type; | 792 | enum clear_refs_types type; |
793 | int itype; | 793 | int itype; |
794 | int rv; | 794 | int rv; |
795 | 795 | ||
796 | memset(buffer, 0, sizeof(buffer)); | 796 | memset(buffer, 0, sizeof(buffer)); |
797 | if (count > sizeof(buffer) - 1) | 797 | if (count > sizeof(buffer) - 1) |
798 | count = sizeof(buffer) - 1; | 798 | count = sizeof(buffer) - 1; |
799 | if (copy_from_user(buffer, buf, count)) | 799 | if (copy_from_user(buffer, buf, count)) |
800 | return -EFAULT; | 800 | return -EFAULT; |
801 | rv = kstrtoint(strstrip(buffer), 10, &itype); | 801 | rv = kstrtoint(strstrip(buffer), 10, &itype); |
802 | if (rv < 0) | 802 | if (rv < 0) |
803 | return rv; | 803 | return rv; |
804 | type = (enum clear_refs_types)itype; | 804 | type = (enum clear_refs_types)itype; |
805 | if (type < CLEAR_REFS_ALL || type >= CLEAR_REFS_LAST) | 805 | if (type < CLEAR_REFS_ALL || type >= CLEAR_REFS_LAST) |
806 | return -EINVAL; | 806 | return -EINVAL; |
807 | 807 | ||
808 | if (type == CLEAR_REFS_SOFT_DIRTY) { | 808 | if (type == CLEAR_REFS_SOFT_DIRTY) { |
809 | soft_dirty_cleared = true; | 809 | soft_dirty_cleared = true; |
810 | pr_warn_once("The pagemap bits 55-60 has changed their meaning! " | 810 | pr_warn_once("The pagemap bits 55-60 has changed their meaning! " |
811 | "See the linux/Documentation/vm/pagemap.txt for details.\n"); | 811 | "See the linux/Documentation/vm/pagemap.txt for details.\n"); |
812 | } | 812 | } |
813 | 813 | ||
814 | task = get_proc_task(file_inode(file)); | 814 | task = get_proc_task(file_inode(file)); |
815 | if (!task) | 815 | if (!task) |
816 | return -ESRCH; | 816 | return -ESRCH; |
817 | mm = get_task_mm(task); | 817 | mm = get_task_mm(task); |
818 | if (mm) { | 818 | if (mm) { |
819 | struct clear_refs_private cp = { | 819 | struct clear_refs_private cp = { |
820 | .type = type, | 820 | .type = type, |
821 | }; | 821 | }; |
822 | struct mm_walk clear_refs_walk = { | 822 | struct mm_walk clear_refs_walk = { |
823 | .pmd_entry = clear_refs_pte_range, | 823 | .pmd_entry = clear_refs_pte_range, |
824 | .mm = mm, | 824 | .mm = mm, |
825 | .private = &cp, | 825 | .private = &cp, |
826 | }; | 826 | }; |
827 | down_read(&mm->mmap_sem); | 827 | down_read(&mm->mmap_sem); |
828 | if (type == CLEAR_REFS_SOFT_DIRTY) | 828 | if (type == CLEAR_REFS_SOFT_DIRTY) |
829 | mmu_notifier_invalidate_range_start(mm, 0, -1); | 829 | mmu_notifier_invalidate_range_start(mm, 0, -1); |
830 | for (vma = mm->mmap; vma; vma = vma->vm_next) { | 830 | for (vma = mm->mmap; vma; vma = vma->vm_next) { |
831 | cp.vma = vma; | 831 | cp.vma = vma; |
832 | if (is_vm_hugetlb_page(vma)) | 832 | if (is_vm_hugetlb_page(vma)) |
833 | continue; | 833 | continue; |
834 | /* | 834 | /* |
835 | * Writing 1 to /proc/pid/clear_refs affects all pages. | 835 | * Writing 1 to /proc/pid/clear_refs affects all pages. |
836 | * | 836 | * |
837 | * Writing 2 to /proc/pid/clear_refs only affects | 837 | * Writing 2 to /proc/pid/clear_refs only affects |
838 | * Anonymous pages. | 838 | * Anonymous pages. |
839 | * | 839 | * |
840 | * Writing 3 to /proc/pid/clear_refs only affects file | 840 | * Writing 3 to /proc/pid/clear_refs only affects file |
841 | * mapped pages. | 841 | * mapped pages. |
842 | */ | 842 | */ |
843 | if (type == CLEAR_REFS_ANON && vma->vm_file) | 843 | if (type == CLEAR_REFS_ANON && vma->vm_file) |
844 | continue; | 844 | continue; |
845 | if (type == CLEAR_REFS_MAPPED && !vma->vm_file) | 845 | if (type == CLEAR_REFS_MAPPED && !vma->vm_file) |
846 | continue; | 846 | continue; |
847 | walk_page_range(vma->vm_start, vma->vm_end, | 847 | walk_page_range(vma->vm_start, vma->vm_end, |
848 | &clear_refs_walk); | 848 | &clear_refs_walk); |
849 | } | 849 | } |
850 | if (type == CLEAR_REFS_SOFT_DIRTY) | 850 | if (type == CLEAR_REFS_SOFT_DIRTY) |
851 | mmu_notifier_invalidate_range_end(mm, 0, -1); | 851 | mmu_notifier_invalidate_range_end(mm, 0, -1); |
852 | flush_tlb_mm(mm); | 852 | flush_tlb_mm(mm); |
853 | up_read(&mm->mmap_sem); | 853 | up_read(&mm->mmap_sem); |
854 | mmput(mm); | 854 | mmput(mm); |
855 | } | 855 | } |
856 | put_task_struct(task); | 856 | put_task_struct(task); |
857 | 857 | ||
858 | return count; | 858 | return count; |
859 | } | 859 | } |
860 | 860 | ||
861 | const struct file_operations proc_clear_refs_operations = { | 861 | const struct file_operations proc_clear_refs_operations = { |
862 | .write = clear_refs_write, | 862 | .write = clear_refs_write, |
863 | .llseek = noop_llseek, | 863 | .llseek = noop_llseek, |
864 | }; | 864 | }; |
865 | 865 | ||
866 | typedef struct { | 866 | typedef struct { |
867 | u64 pme; | 867 | u64 pme; |
868 | } pagemap_entry_t; | 868 | } pagemap_entry_t; |
869 | 869 | ||
870 | struct pagemapread { | 870 | struct pagemapread { |
871 | int pos, len; /* units: PM_ENTRY_BYTES, not bytes */ | 871 | int pos, len; /* units: PM_ENTRY_BYTES, not bytes */ |
872 | pagemap_entry_t *buffer; | 872 | pagemap_entry_t *buffer; |
873 | bool v2; | 873 | bool v2; |
874 | }; | 874 | }; |
875 | 875 | ||
876 | #define PAGEMAP_WALK_SIZE (PMD_SIZE) | 876 | #define PAGEMAP_WALK_SIZE (PMD_SIZE) |
877 | #define PAGEMAP_WALK_MASK (PMD_MASK) | 877 | #define PAGEMAP_WALK_MASK (PMD_MASK) |
878 | 878 | ||
879 | #define PM_ENTRY_BYTES sizeof(pagemap_entry_t) | 879 | #define PM_ENTRY_BYTES sizeof(pagemap_entry_t) |
880 | #define PM_STATUS_BITS 3 | 880 | #define PM_STATUS_BITS 3 |
881 | #define PM_STATUS_OFFSET (64 - PM_STATUS_BITS) | 881 | #define PM_STATUS_OFFSET (64 - PM_STATUS_BITS) |
882 | #define PM_STATUS_MASK (((1LL << PM_STATUS_BITS) - 1) << PM_STATUS_OFFSET) | 882 | #define PM_STATUS_MASK (((1LL << PM_STATUS_BITS) - 1) << PM_STATUS_OFFSET) |
883 | #define PM_STATUS(nr) (((nr) << PM_STATUS_OFFSET) & PM_STATUS_MASK) | 883 | #define PM_STATUS(nr) (((nr) << PM_STATUS_OFFSET) & PM_STATUS_MASK) |
884 | #define PM_PSHIFT_BITS 6 | 884 | #define PM_PSHIFT_BITS 6 |
885 | #define PM_PSHIFT_OFFSET (PM_STATUS_OFFSET - PM_PSHIFT_BITS) | 885 | #define PM_PSHIFT_OFFSET (PM_STATUS_OFFSET - PM_PSHIFT_BITS) |
886 | #define PM_PSHIFT_MASK (((1LL << PM_PSHIFT_BITS) - 1) << PM_PSHIFT_OFFSET) | 886 | #define PM_PSHIFT_MASK (((1LL << PM_PSHIFT_BITS) - 1) << PM_PSHIFT_OFFSET) |
887 | #define __PM_PSHIFT(x) (((u64) (x) << PM_PSHIFT_OFFSET) & PM_PSHIFT_MASK) | 887 | #define __PM_PSHIFT(x) (((u64) (x) << PM_PSHIFT_OFFSET) & PM_PSHIFT_MASK) |
888 | #define PM_PFRAME_MASK ((1LL << PM_PSHIFT_OFFSET) - 1) | 888 | #define PM_PFRAME_MASK ((1LL << PM_PSHIFT_OFFSET) - 1) |
889 | #define PM_PFRAME(x) ((x) & PM_PFRAME_MASK) | 889 | #define PM_PFRAME(x) ((x) & PM_PFRAME_MASK) |
890 | /* in "new" pagemap pshift bits are occupied with more status bits */ | 890 | /* in "new" pagemap pshift bits are occupied with more status bits */ |
891 | #define PM_STATUS2(v2, x) (__PM_PSHIFT(v2 ? x : PAGE_SHIFT)) | 891 | #define PM_STATUS2(v2, x) (__PM_PSHIFT(v2 ? x : PAGE_SHIFT)) |
892 | 892 | ||
893 | #define __PM_SOFT_DIRTY (1LL) | 893 | #define __PM_SOFT_DIRTY (1LL) |
894 | #define PM_PRESENT PM_STATUS(4LL) | 894 | #define PM_PRESENT PM_STATUS(4LL) |
895 | #define PM_SWAP PM_STATUS(2LL) | 895 | #define PM_SWAP PM_STATUS(2LL) |
896 | #define PM_FILE PM_STATUS(1LL) | 896 | #define PM_FILE PM_STATUS(1LL) |
897 | #define PM_NOT_PRESENT(v2) PM_STATUS2(v2, 0) | 897 | #define PM_NOT_PRESENT(v2) PM_STATUS2(v2, 0) |
898 | #define PM_END_OF_BUFFER 1 | 898 | #define PM_END_OF_BUFFER 1 |
899 | 899 | ||
900 | static inline pagemap_entry_t make_pme(u64 val) | 900 | static inline pagemap_entry_t make_pme(u64 val) |
901 | { | 901 | { |
902 | return (pagemap_entry_t) { .pme = val }; | 902 | return (pagemap_entry_t) { .pme = val }; |
903 | } | 903 | } |
904 | 904 | ||
905 | static int add_to_pagemap(unsigned long addr, pagemap_entry_t *pme, | 905 | static int add_to_pagemap(unsigned long addr, pagemap_entry_t *pme, |
906 | struct pagemapread *pm) | 906 | struct pagemapread *pm) |
907 | { | 907 | { |
908 | pm->buffer[pm->pos++] = *pme; | 908 | pm->buffer[pm->pos++] = *pme; |
909 | if (pm->pos >= pm->len) | 909 | if (pm->pos >= pm->len) |
910 | return PM_END_OF_BUFFER; | 910 | return PM_END_OF_BUFFER; |
911 | return 0; | 911 | return 0; |
912 | } | 912 | } |
913 | 913 | ||
914 | static int pagemap_pte_hole(unsigned long start, unsigned long end, | 914 | static int pagemap_pte_hole(unsigned long start, unsigned long end, |
915 | struct mm_walk *walk) | 915 | struct mm_walk *walk) |
916 | { | 916 | { |
917 | struct pagemapread *pm = walk->private; | 917 | struct pagemapread *pm = walk->private; |
918 | unsigned long addr; | 918 | unsigned long addr; |
919 | int err = 0; | 919 | int err = 0; |
920 | pagemap_entry_t pme = make_pme(PM_NOT_PRESENT(pm->v2)); | 920 | pagemap_entry_t pme = make_pme(PM_NOT_PRESENT(pm->v2)); |
921 | 921 | ||
922 | for (addr = start; addr < end; addr += PAGE_SIZE) { | 922 | for (addr = start; addr < end; addr += PAGE_SIZE) { |
923 | err = add_to_pagemap(addr, &pme, pm); | 923 | err = add_to_pagemap(addr, &pme, pm); |
924 | if (err) | 924 | if (err) |
925 | break; | 925 | break; |
926 | } | 926 | } |
927 | return err; | 927 | return err; |
928 | } | 928 | } |
929 | 929 | ||
930 | static void pte_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm, | 930 | static void pte_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm, |
931 | struct vm_area_struct *vma, unsigned long addr, pte_t pte) | 931 | struct vm_area_struct *vma, unsigned long addr, pte_t pte) |
932 | { | 932 | { |
933 | u64 frame, flags; | 933 | u64 frame, flags; |
934 | struct page *page = NULL; | 934 | struct page *page = NULL; |
935 | int flags2 = 0; | 935 | int flags2 = 0; |
936 | 936 | ||
937 | if (pte_present(pte)) { | 937 | if (pte_present(pte)) { |
938 | frame = pte_pfn(pte); | 938 | frame = pte_pfn(pte); |
939 | flags = PM_PRESENT; | 939 | flags = PM_PRESENT; |
940 | page = vm_normal_page(vma, addr, pte); | 940 | page = vm_normal_page(vma, addr, pte); |
941 | if (pte_soft_dirty(pte)) | 941 | if (pte_soft_dirty(pte)) |
942 | flags2 |= __PM_SOFT_DIRTY; | 942 | flags2 |= __PM_SOFT_DIRTY; |
943 | } else if (is_swap_pte(pte)) { | 943 | } else if (is_swap_pte(pte)) { |
944 | swp_entry_t entry; | 944 | swp_entry_t entry; |
945 | if (pte_swp_soft_dirty(pte)) | 945 | if (pte_swp_soft_dirty(pte)) |
946 | flags2 |= __PM_SOFT_DIRTY; | 946 | flags2 |= __PM_SOFT_DIRTY; |
947 | entry = pte_to_swp_entry(pte); | 947 | entry = pte_to_swp_entry(pte); |
948 | frame = swp_type(entry) | | 948 | frame = swp_type(entry) | |
949 | (swp_offset(entry) << MAX_SWAPFILES_SHIFT); | 949 | (swp_offset(entry) << MAX_SWAPFILES_SHIFT); |
950 | flags = PM_SWAP; | 950 | flags = PM_SWAP; |
951 | if (is_migration_entry(entry)) | 951 | if (is_migration_entry(entry)) |
952 | page = migration_entry_to_page(entry); | 952 | page = migration_entry_to_page(entry); |
953 | } else { | 953 | } else { |
954 | if (vma->vm_flags & VM_SOFTDIRTY) | 954 | if (vma->vm_flags & VM_SOFTDIRTY) |
955 | flags2 |= __PM_SOFT_DIRTY; | 955 | flags2 |= __PM_SOFT_DIRTY; |
956 | *pme = make_pme(PM_NOT_PRESENT(pm->v2) | PM_STATUS2(pm->v2, flags2)); | 956 | *pme = make_pme(PM_NOT_PRESENT(pm->v2) | PM_STATUS2(pm->v2, flags2)); |
957 | return; | 957 | return; |
958 | } | 958 | } |
959 | 959 | ||
960 | if (page && !PageAnon(page)) | 960 | if (page && !PageAnon(page)) |
961 | flags |= PM_FILE; | 961 | flags |= PM_FILE; |
962 | if ((vma->vm_flags & VM_SOFTDIRTY)) | 962 | if ((vma->vm_flags & VM_SOFTDIRTY)) |
963 | flags2 |= __PM_SOFT_DIRTY; | 963 | flags2 |= __PM_SOFT_DIRTY; |
964 | 964 | ||
965 | *pme = make_pme(PM_PFRAME(frame) | PM_STATUS2(pm->v2, flags2) | flags); | 965 | *pme = make_pme(PM_PFRAME(frame) | PM_STATUS2(pm->v2, flags2) | flags); |
966 | } | 966 | } |
967 | 967 | ||
968 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE | 968 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE |
969 | static void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm, | 969 | static void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm, |
970 | pmd_t pmd, int offset, int pmd_flags2) | 970 | pmd_t pmd, int offset, int pmd_flags2) |
971 | { | 971 | { |
972 | /* | 972 | /* |
973 | * Currently pmd for thp is always present because thp can not be | 973 | * Currently pmd for thp is always present because thp can not be |
974 | * swapped-out, migrated, or HWPOISONed (split in such cases instead.) | 974 | * swapped-out, migrated, or HWPOISONed (split in such cases instead.) |
975 | * This if-check is just to prepare for future implementation. | 975 | * This if-check is just to prepare for future implementation. |
976 | */ | 976 | */ |
977 | if (pmd_present(pmd)) | 977 | if (pmd_present(pmd)) |
978 | *pme = make_pme(PM_PFRAME(pmd_pfn(pmd) + offset) | 978 | *pme = make_pme(PM_PFRAME(pmd_pfn(pmd) + offset) |
979 | | PM_STATUS2(pm->v2, pmd_flags2) | PM_PRESENT); | 979 | | PM_STATUS2(pm->v2, pmd_flags2) | PM_PRESENT); |
980 | else | 980 | else |
981 | *pme = make_pme(PM_NOT_PRESENT(pm->v2) | PM_STATUS2(pm->v2, pmd_flags2)); | 981 | *pme = make_pme(PM_NOT_PRESENT(pm->v2) | PM_STATUS2(pm->v2, pmd_flags2)); |
982 | } | 982 | } |
983 | #else | 983 | #else |
984 | static inline void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm, | 984 | static inline void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm, |
985 | pmd_t pmd, int offset, int pmd_flags2) | 985 | pmd_t pmd, int offset, int pmd_flags2) |
986 | { | 986 | { |
987 | } | 987 | } |
988 | #endif | 988 | #endif |
989 | 989 | ||
990 | static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, | 990 | static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, |
991 | struct mm_walk *walk) | 991 | struct mm_walk *walk) |
992 | { | 992 | { |
993 | struct vm_area_struct *vma; | 993 | struct vm_area_struct *vma; |
994 | struct pagemapread *pm = walk->private; | 994 | struct pagemapread *pm = walk->private; |
995 | spinlock_t *ptl; | 995 | spinlock_t *ptl; |
996 | pte_t *pte; | 996 | pte_t *pte; |
997 | int err = 0; | 997 | int err = 0; |
998 | pagemap_entry_t pme = make_pme(PM_NOT_PRESENT(pm->v2)); | 998 | pagemap_entry_t pme = make_pme(PM_NOT_PRESENT(pm->v2)); |
999 | 999 | ||
1000 | /* find the first VMA at or above 'addr' */ | 1000 | /* find the first VMA at or above 'addr' */ |
1001 | vma = find_vma(walk->mm, addr); | 1001 | vma = find_vma(walk->mm, addr); |
1002 | if (vma && pmd_trans_huge_lock(pmd, vma, &ptl) == 1) { | 1002 | if (vma && pmd_trans_huge_lock(pmd, vma, &ptl) == 1) { |
1003 | int pmd_flags2; | 1003 | int pmd_flags2; |
1004 | 1004 | ||
1005 | if ((vma->vm_flags & VM_SOFTDIRTY) || pmd_soft_dirty(*pmd)) | 1005 | if ((vma->vm_flags & VM_SOFTDIRTY) || pmd_soft_dirty(*pmd)) |
1006 | pmd_flags2 = __PM_SOFT_DIRTY; | 1006 | pmd_flags2 = __PM_SOFT_DIRTY; |
1007 | else | 1007 | else |
1008 | pmd_flags2 = 0; | 1008 | pmd_flags2 = 0; |
1009 | 1009 | ||
1010 | for (; addr != end; addr += PAGE_SIZE) { | 1010 | for (; addr != end; addr += PAGE_SIZE) { |
1011 | unsigned long offset; | 1011 | unsigned long offset; |
1012 | 1012 | ||
1013 | offset = (addr & ~PAGEMAP_WALK_MASK) >> | 1013 | offset = (addr & ~PAGEMAP_WALK_MASK) >> |
1014 | PAGE_SHIFT; | 1014 | PAGE_SHIFT; |
1015 | thp_pmd_to_pagemap_entry(&pme, pm, *pmd, offset, pmd_flags2); | 1015 | thp_pmd_to_pagemap_entry(&pme, pm, *pmd, offset, pmd_flags2); |
1016 | err = add_to_pagemap(addr, &pme, pm); | 1016 | err = add_to_pagemap(addr, &pme, pm); |
1017 | if (err) | 1017 | if (err) |
1018 | break; | 1018 | break; |
1019 | } | 1019 | } |
1020 | spin_unlock(ptl); | 1020 | spin_unlock(ptl); |
1021 | return err; | 1021 | return err; |
1022 | } | 1022 | } |
1023 | 1023 | ||
1024 | if (pmd_trans_unstable(pmd)) | 1024 | if (pmd_trans_unstable(pmd)) |
1025 | return 0; | 1025 | return 0; |
1026 | for (; addr != end; addr += PAGE_SIZE) { | 1026 | for (; addr != end; addr += PAGE_SIZE) { |
1027 | int flags2; | 1027 | int flags2; |
1028 | 1028 | ||
1029 | /* check to see if we've left 'vma' behind | 1029 | /* check to see if we've left 'vma' behind |
1030 | * and need a new, higher one */ | 1030 | * and need a new, higher one */ |
1031 | if (vma && (addr >= vma->vm_end)) { | 1031 | if (vma && (addr >= vma->vm_end)) { |
1032 | vma = find_vma(walk->mm, addr); | 1032 | vma = find_vma(walk->mm, addr); |
1033 | if (vma && (vma->vm_flags & VM_SOFTDIRTY)) | 1033 | if (vma && (vma->vm_flags & VM_SOFTDIRTY)) |
1034 | flags2 = __PM_SOFT_DIRTY; | 1034 | flags2 = __PM_SOFT_DIRTY; |
1035 | else | 1035 | else |
1036 | flags2 = 0; | 1036 | flags2 = 0; |
1037 | pme = make_pme(PM_NOT_PRESENT(pm->v2) | PM_STATUS2(pm->v2, flags2)); | 1037 | pme = make_pme(PM_NOT_PRESENT(pm->v2) | PM_STATUS2(pm->v2, flags2)); |
1038 | } | 1038 | } |
1039 | 1039 | ||
1040 | /* check that 'vma' actually covers this address, | 1040 | /* check that 'vma' actually covers this address, |
1041 | * and that it isn't a huge page vma */ | 1041 | * and that it isn't a huge page vma */ |
1042 | if (vma && (vma->vm_start <= addr) && | 1042 | if (vma && (vma->vm_start <= addr) && |
1043 | !is_vm_hugetlb_page(vma)) { | 1043 | !is_vm_hugetlb_page(vma)) { |
1044 | pte = pte_offset_map(pmd, addr); | 1044 | pte = pte_offset_map(pmd, addr); |
1045 | pte_to_pagemap_entry(&pme, pm, vma, addr, *pte); | 1045 | pte_to_pagemap_entry(&pme, pm, vma, addr, *pte); |
1046 | /* unmap before userspace copy */ | 1046 | /* unmap before userspace copy */ |
1047 | pte_unmap(pte); | 1047 | pte_unmap(pte); |
1048 | } | 1048 | } |
1049 | err = add_to_pagemap(addr, &pme, pm); | 1049 | err = add_to_pagemap(addr, &pme, pm); |
1050 | if (err) | 1050 | if (err) |
1051 | return err; | 1051 | return err; |
1052 | } | 1052 | } |
1053 | 1053 | ||
1054 | cond_resched(); | 1054 | cond_resched(); |
1055 | 1055 | ||
1056 | return err; | 1056 | return err; |
1057 | } | 1057 | } |
1058 | 1058 | ||
1059 | #ifdef CONFIG_HUGETLB_PAGE | 1059 | #ifdef CONFIG_HUGETLB_PAGE |
1060 | static void huge_pte_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm, | 1060 | static void huge_pte_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm, |
1061 | pte_t pte, int offset, int flags2) | 1061 | pte_t pte, int offset, int flags2) |
1062 | { | 1062 | { |
1063 | if (pte_present(pte)) | 1063 | if (pte_present(pte)) |
1064 | *pme = make_pme(PM_PFRAME(pte_pfn(pte) + offset) | | 1064 | *pme = make_pme(PM_PFRAME(pte_pfn(pte) + offset) | |
1065 | PM_STATUS2(pm->v2, flags2) | | 1065 | PM_STATUS2(pm->v2, flags2) | |
1066 | PM_PRESENT); | 1066 | PM_PRESENT); |
1067 | else | 1067 | else |
1068 | *pme = make_pme(PM_NOT_PRESENT(pm->v2) | | 1068 | *pme = make_pme(PM_NOT_PRESENT(pm->v2) | |
1069 | PM_STATUS2(pm->v2, flags2)); | 1069 | PM_STATUS2(pm->v2, flags2)); |
1070 | } | 1070 | } |
1071 | 1071 | ||
1072 | /* This function walks within one hugetlb entry in a single call */ | 1072 | /* This function walks within one hugetlb entry in a single call */ |
1073 | static int pagemap_hugetlb_range(pte_t *pte, unsigned long hmask, | 1073 | static int pagemap_hugetlb_range(pte_t *pte, unsigned long hmask, |
1074 | unsigned long addr, unsigned long end, | 1074 | unsigned long addr, unsigned long end, |
1075 | struct mm_walk *walk) | 1075 | struct mm_walk *walk) |
1076 | { | 1076 | { |
1077 | struct pagemapread *pm = walk->private; | 1077 | struct pagemapread *pm = walk->private; |
1078 | struct vm_area_struct *vma; | 1078 | struct vm_area_struct *vma; |
1079 | int err = 0; | 1079 | int err = 0; |
1080 | int flags2; | 1080 | int flags2; |
1081 | pagemap_entry_t pme; | 1081 | pagemap_entry_t pme; |
1082 | 1082 | ||
1083 | vma = find_vma(walk->mm, addr); | 1083 | vma = find_vma(walk->mm, addr); |
1084 | WARN_ON_ONCE(!vma); | 1084 | WARN_ON_ONCE(!vma); |
1085 | 1085 | ||
1086 | if (vma && (vma->vm_flags & VM_SOFTDIRTY)) | 1086 | if (vma && (vma->vm_flags & VM_SOFTDIRTY)) |
1087 | flags2 = __PM_SOFT_DIRTY; | 1087 | flags2 = __PM_SOFT_DIRTY; |
1088 | else | 1088 | else |
1089 | flags2 = 0; | 1089 | flags2 = 0; |
1090 | 1090 | ||
1091 | for (; addr != end; addr += PAGE_SIZE) { | 1091 | for (; addr != end; addr += PAGE_SIZE) { |
1092 | int offset = (addr & ~hmask) >> PAGE_SHIFT; | 1092 | int offset = (addr & ~hmask) >> PAGE_SHIFT; |
1093 | huge_pte_to_pagemap_entry(&pme, pm, *pte, offset, flags2); | 1093 | huge_pte_to_pagemap_entry(&pme, pm, *pte, offset, flags2); |
1094 | err = add_to_pagemap(addr, &pme, pm); | 1094 | err = add_to_pagemap(addr, &pme, pm); |
1095 | if (err) | 1095 | if (err) |
1096 | return err; | 1096 | return err; |
1097 | } | 1097 | } |
1098 | 1098 | ||
1099 | cond_resched(); | 1099 | cond_resched(); |
1100 | 1100 | ||
1101 | return err; | 1101 | return err; |
1102 | } | 1102 | } |
1103 | #endif /* HUGETLB_PAGE */ | 1103 | #endif /* HUGETLB_PAGE */ |
1104 | 1104 | ||
1105 | /* | 1105 | /* |
1106 | * /proc/pid/pagemap - an array mapping virtual pages to pfns | 1106 | * /proc/pid/pagemap - an array mapping virtual pages to pfns |
1107 | * | 1107 | * |
1108 | * For each page in the address space, this file contains one 64-bit entry | 1108 | * For each page in the address space, this file contains one 64-bit entry |
1109 | * consisting of the following: | 1109 | * consisting of the following: |
1110 | * | 1110 | * |
1111 | * Bits 0-54 page frame number (PFN) if present | 1111 | * Bits 0-54 page frame number (PFN) if present |
1112 | * Bits 0-4 swap type if swapped | 1112 | * Bits 0-4 swap type if swapped |
1113 | * Bits 5-54 swap offset if swapped | 1113 | * Bits 5-54 swap offset if swapped |
1114 | * Bits 55-60 page shift (page size = 1<<page shift) | 1114 | * Bits 55-60 page shift (page size = 1<<page shift) |
1115 | * Bit 61 page is file-page or shared-anon | 1115 | * Bit 61 page is file-page or shared-anon |
1116 | * Bit 62 page swapped | 1116 | * Bit 62 page swapped |
1117 | * Bit 63 page present | 1117 | * Bit 63 page present |
1118 | * | 1118 | * |
1119 | * If the page is not present but in swap, then the PFN contains an | 1119 | * If the page is not present but in swap, then the PFN contains an |
1120 | * encoding of the swap file number and the page's offset into the | 1120 | * encoding of the swap file number and the page's offset into the |
1121 | * swap. Unmapped pages return a null PFN. This allows determining | 1121 | * swap. Unmapped pages return a null PFN. This allows determining |
1122 | * precisely which pages are mapped (or in swap) and comparing mapped | 1122 | * precisely which pages are mapped (or in swap) and comparing mapped |
1123 | * pages between processes. | 1123 | * pages between processes. |
1124 | * | 1124 | * |
1125 | * Efficient users of this interface will use /proc/pid/maps to | 1125 | * Efficient users of this interface will use /proc/pid/maps to |
1126 | * determine which areas of memory are actually mapped and llseek to | 1126 | * determine which areas of memory are actually mapped and llseek to |
1127 | * skip over unmapped regions. | 1127 | * skip over unmapped regions. |
1128 | */ | 1128 | */ |
1129 | static ssize_t pagemap_read(struct file *file, char __user *buf, | 1129 | static ssize_t pagemap_read(struct file *file, char __user *buf, |
1130 | size_t count, loff_t *ppos) | 1130 | size_t count, loff_t *ppos) |
1131 | { | 1131 | { |
1132 | struct task_struct *task = get_proc_task(file_inode(file)); | 1132 | struct task_struct *task = get_proc_task(file_inode(file)); |
1133 | struct mm_struct *mm; | 1133 | struct mm_struct *mm; |
1134 | struct pagemapread pm; | 1134 | struct pagemapread pm; |
1135 | int ret = -ESRCH; | 1135 | int ret = -ESRCH; |
1136 | struct mm_walk pagemap_walk = {}; | 1136 | struct mm_walk pagemap_walk = {}; |
1137 | unsigned long src; | 1137 | unsigned long src; |
1138 | unsigned long svpfn; | 1138 | unsigned long svpfn; |
1139 | unsigned long start_vaddr; | 1139 | unsigned long start_vaddr; |
1140 | unsigned long end_vaddr; | 1140 | unsigned long end_vaddr; |
1141 | int copied = 0; | 1141 | int copied = 0; |
1142 | 1142 | ||
1143 | if (!task) | 1143 | if (!task) |
1144 | goto out; | 1144 | goto out; |
1145 | 1145 | ||
1146 | ret = -EINVAL; | 1146 | ret = -EINVAL; |
1147 | /* file position must be aligned */ | 1147 | /* file position must be aligned */ |
1148 | if ((*ppos % PM_ENTRY_BYTES) || (count % PM_ENTRY_BYTES)) | 1148 | if ((*ppos % PM_ENTRY_BYTES) || (count % PM_ENTRY_BYTES)) |
1149 | goto out_task; | 1149 | goto out_task; |
1150 | 1150 | ||
1151 | ret = 0; | 1151 | ret = 0; |
1152 | if (!count) | 1152 | if (!count) |
1153 | goto out_task; | 1153 | goto out_task; |
1154 | 1154 | ||
1155 | pm.v2 = soft_dirty_cleared; | 1155 | pm.v2 = soft_dirty_cleared; |
1156 | pm.len = (PAGEMAP_WALK_SIZE >> PAGE_SHIFT); | 1156 | pm.len = (PAGEMAP_WALK_SIZE >> PAGE_SHIFT); |
1157 | pm.buffer = kmalloc(pm.len * PM_ENTRY_BYTES, GFP_TEMPORARY); | 1157 | pm.buffer = kmalloc(pm.len * PM_ENTRY_BYTES, GFP_TEMPORARY); |
1158 | ret = -ENOMEM; | 1158 | ret = -ENOMEM; |
1159 | if (!pm.buffer) | 1159 | if (!pm.buffer) |
1160 | goto out_task; | 1160 | goto out_task; |
1161 | 1161 | ||
1162 | mm = mm_access(task, PTRACE_MODE_READ); | 1162 | mm = mm_access(task, PTRACE_MODE_READ); |
1163 | ret = PTR_ERR(mm); | 1163 | ret = PTR_ERR(mm); |
1164 | if (!mm || IS_ERR(mm)) | 1164 | if (!mm || IS_ERR(mm)) |
1165 | goto out_free; | 1165 | goto out_free; |
1166 | 1166 | ||
1167 | pagemap_walk.pmd_entry = pagemap_pte_range; | 1167 | pagemap_walk.pmd_entry = pagemap_pte_range; |
1168 | pagemap_walk.pte_hole = pagemap_pte_hole; | 1168 | pagemap_walk.pte_hole = pagemap_pte_hole; |
1169 | #ifdef CONFIG_HUGETLB_PAGE | 1169 | #ifdef CONFIG_HUGETLB_PAGE |
1170 | pagemap_walk.hugetlb_entry = pagemap_hugetlb_range; | 1170 | pagemap_walk.hugetlb_entry = pagemap_hugetlb_range; |
1171 | #endif | 1171 | #endif |
1172 | pagemap_walk.mm = mm; | 1172 | pagemap_walk.mm = mm; |
1173 | pagemap_walk.private = &pm; | 1173 | pagemap_walk.private = &pm; |
1174 | 1174 | ||
1175 | src = *ppos; | 1175 | src = *ppos; |
1176 | svpfn = src / PM_ENTRY_BYTES; | 1176 | svpfn = src / PM_ENTRY_BYTES; |
1177 | start_vaddr = svpfn << PAGE_SHIFT; | 1177 | start_vaddr = svpfn << PAGE_SHIFT; |
1178 | end_vaddr = TASK_SIZE_OF(task); | 1178 | end_vaddr = TASK_SIZE_OF(task); |
1179 | 1179 | ||
1180 | /* watch out for wraparound */ | 1180 | /* watch out for wraparound */ |
1181 | if (svpfn > TASK_SIZE_OF(task) >> PAGE_SHIFT) | 1181 | if (svpfn > TASK_SIZE_OF(task) >> PAGE_SHIFT) |
1182 | start_vaddr = end_vaddr; | 1182 | start_vaddr = end_vaddr; |
1183 | 1183 | ||
1184 | /* | 1184 | /* |
1185 | * The odds are that this will stop walking way | 1185 | * The odds are that this will stop walking way |
1186 | * before end_vaddr, because the length of the | 1186 | * before end_vaddr, because the length of the |
1187 | * user buffer is tracked in "pm", and the walk | 1187 | * user buffer is tracked in "pm", and the walk |
1188 | * will stop when we hit the end of the buffer. | 1188 | * will stop when we hit the end of the buffer. |
1189 | */ | 1189 | */ |
1190 | ret = 0; | 1190 | ret = 0; |
1191 | while (count && (start_vaddr < end_vaddr)) { | 1191 | while (count && (start_vaddr < end_vaddr)) { |
1192 | int len; | 1192 | int len; |
1193 | unsigned long end; | 1193 | unsigned long end; |
1194 | 1194 | ||
1195 | pm.pos = 0; | 1195 | pm.pos = 0; |
1196 | end = (start_vaddr + PAGEMAP_WALK_SIZE) & PAGEMAP_WALK_MASK; | 1196 | end = (start_vaddr + PAGEMAP_WALK_SIZE) & PAGEMAP_WALK_MASK; |
1197 | /* overflow ? */ | 1197 | /* overflow ? */ |
1198 | if (end < start_vaddr || end > end_vaddr) | 1198 | if (end < start_vaddr || end > end_vaddr) |
1199 | end = end_vaddr; | 1199 | end = end_vaddr; |
1200 | down_read(&mm->mmap_sem); | 1200 | down_read(&mm->mmap_sem); |
1201 | ret = walk_page_range(start_vaddr, end, &pagemap_walk); | 1201 | ret = walk_page_range(start_vaddr, end, &pagemap_walk); |
1202 | up_read(&mm->mmap_sem); | 1202 | up_read(&mm->mmap_sem); |
1203 | start_vaddr = end; | 1203 | start_vaddr = end; |
1204 | 1204 | ||
1205 | len = min(count, PM_ENTRY_BYTES * pm.pos); | 1205 | len = min(count, PM_ENTRY_BYTES * pm.pos); |
1206 | if (copy_to_user(buf, pm.buffer, len)) { | 1206 | if (copy_to_user(buf, pm.buffer, len)) { |
1207 | ret = -EFAULT; | 1207 | ret = -EFAULT; |
1208 | goto out_mm; | 1208 | goto out_mm; |
1209 | } | 1209 | } |
1210 | copied += len; | 1210 | copied += len; |
1211 | buf += len; | 1211 | buf += len; |
1212 | count -= len; | 1212 | count -= len; |
1213 | } | 1213 | } |
1214 | *ppos += copied; | 1214 | *ppos += copied; |
1215 | if (!ret || ret == PM_END_OF_BUFFER) | 1215 | if (!ret || ret == PM_END_OF_BUFFER) |
1216 | ret = copied; | 1216 | ret = copied; |
1217 | 1217 | ||
1218 | out_mm: | 1218 | out_mm: |
1219 | mmput(mm); | 1219 | mmput(mm); |
1220 | out_free: | 1220 | out_free: |
1221 | kfree(pm.buffer); | 1221 | kfree(pm.buffer); |
1222 | out_task: | 1222 | out_task: |
1223 | put_task_struct(task); | 1223 | put_task_struct(task); |
1224 | out: | 1224 | out: |
1225 | return ret; | 1225 | return ret; |
1226 | } | 1226 | } |
1227 | 1227 | ||
1228 | static int pagemap_open(struct inode *inode, struct file *file) | 1228 | static int pagemap_open(struct inode *inode, struct file *file) |
1229 | { | 1229 | { |
1230 | pr_warn_once("Bits 55-60 of /proc/PID/pagemap entries are about " | 1230 | pr_warn_once("Bits 55-60 of /proc/PID/pagemap entries are about " |
1231 | "to stop being page-shift some time soon. See the " | 1231 | "to stop being page-shift some time soon. See the " |
1232 | "linux/Documentation/vm/pagemap.txt for details.\n"); | 1232 | "linux/Documentation/vm/pagemap.txt for details.\n"); |
1233 | return 0; | 1233 | return 0; |
1234 | } | 1234 | } |
1235 | 1235 | ||
1236 | const struct file_operations proc_pagemap_operations = { | 1236 | const struct file_operations proc_pagemap_operations = { |
1237 | .llseek = mem_lseek, /* borrow this */ | 1237 | .llseek = mem_lseek, /* borrow this */ |
1238 | .read = pagemap_read, | 1238 | .read = pagemap_read, |
1239 | .open = pagemap_open, | 1239 | .open = pagemap_open, |
1240 | }; | 1240 | }; |
1241 | #endif /* CONFIG_PROC_PAGE_MONITOR */ | 1241 | #endif /* CONFIG_PROC_PAGE_MONITOR */ |
1242 | 1242 | ||
1243 | #ifdef CONFIG_NUMA | 1243 | #ifdef CONFIG_NUMA |
1244 | 1244 | ||
1245 | struct numa_maps { | 1245 | struct numa_maps { |
1246 | struct vm_area_struct *vma; | 1246 | struct vm_area_struct *vma; |
1247 | unsigned long pages; | 1247 | unsigned long pages; |
1248 | unsigned long anon; | 1248 | unsigned long anon; |
1249 | unsigned long active; | 1249 | unsigned long active; |
1250 | unsigned long writeback; | 1250 | unsigned long writeback; |
1251 | unsigned long mapcount_max; | 1251 | unsigned long mapcount_max; |
1252 | unsigned long dirty; | 1252 | unsigned long dirty; |
1253 | unsigned long swapcache; | 1253 | unsigned long swapcache; |
1254 | unsigned long node[MAX_NUMNODES]; | 1254 | unsigned long node[MAX_NUMNODES]; |
1255 | }; | 1255 | }; |
1256 | 1256 | ||
1257 | struct numa_maps_private { | 1257 | struct numa_maps_private { |
1258 | struct proc_maps_private proc_maps; | 1258 | struct proc_maps_private proc_maps; |
1259 | struct numa_maps md; | 1259 | struct numa_maps md; |
1260 | }; | 1260 | }; |
1261 | 1261 | ||
1262 | static void gather_stats(struct page *page, struct numa_maps *md, int pte_dirty, | 1262 | static void gather_stats(struct page *page, struct numa_maps *md, int pte_dirty, |
1263 | unsigned long nr_pages) | 1263 | unsigned long nr_pages) |
1264 | { | 1264 | { |
1265 | int count = page_mapcount(page); | 1265 | int count = page_mapcount(page); |
1266 | 1266 | ||
1267 | md->pages += nr_pages; | 1267 | md->pages += nr_pages; |
1268 | if (pte_dirty || PageDirty(page)) | 1268 | if (pte_dirty || PageDirty(page)) |
1269 | md->dirty += nr_pages; | 1269 | md->dirty += nr_pages; |
1270 | 1270 | ||
1271 | if (PageSwapCache(page)) | 1271 | if (PageSwapCache(page)) |
1272 | md->swapcache += nr_pages; | 1272 | md->swapcache += nr_pages; |
1273 | 1273 | ||
1274 | if (PageActive(page) || PageUnevictable(page)) | 1274 | if (PageActive(page) || PageUnevictable(page)) |
1275 | md->active += nr_pages; | 1275 | md->active += nr_pages; |
1276 | 1276 | ||
1277 | if (PageWriteback(page)) | 1277 | if (PageWriteback(page)) |
1278 | md->writeback += nr_pages; | 1278 | md->writeback += nr_pages; |
1279 | 1279 | ||
1280 | if (PageAnon(page)) | 1280 | if (PageAnon(page)) |
1281 | md->anon += nr_pages; | 1281 | md->anon += nr_pages; |
1282 | 1282 | ||
1283 | if (count > md->mapcount_max) | 1283 | if (count > md->mapcount_max) |
1284 | md->mapcount_max = count; | 1284 | md->mapcount_max = count; |
1285 | 1285 | ||
1286 | md->node[page_to_nid(page)] += nr_pages; | 1286 | md->node[page_to_nid(page)] += nr_pages; |
1287 | } | 1287 | } |
1288 | 1288 | ||
1289 | static struct page *can_gather_numa_stats(pte_t pte, struct vm_area_struct *vma, | 1289 | static struct page *can_gather_numa_stats(pte_t pte, struct vm_area_struct *vma, |
1290 | unsigned long addr) | 1290 | unsigned long addr) |
1291 | { | 1291 | { |
1292 | struct page *page; | 1292 | struct page *page; |
1293 | int nid; | 1293 | int nid; |
1294 | 1294 | ||
1295 | if (!pte_present(pte)) | 1295 | if (!pte_present(pte)) |
1296 | return NULL; | 1296 | return NULL; |
1297 | 1297 | ||
1298 | page = vm_normal_page(vma, addr, pte); | 1298 | page = vm_normal_page(vma, addr, pte); |
1299 | if (!page) | 1299 | if (!page) |
1300 | return NULL; | 1300 | return NULL; |
1301 | 1301 | ||
1302 | if (PageReserved(page)) | 1302 | if (PageReserved(page)) |
1303 | return NULL; | 1303 | return NULL; |
1304 | 1304 | ||
1305 | nid = page_to_nid(page); | 1305 | nid = page_to_nid(page); |
1306 | if (!node_isset(nid, node_states[N_MEMORY])) | 1306 | if (!node_isset(nid, node_states[N_MEMORY])) |
1307 | return NULL; | 1307 | return NULL; |
1308 | 1308 | ||
1309 | return page; | 1309 | return page; |
1310 | } | 1310 | } |
1311 | 1311 | ||
1312 | static int gather_pte_stats(pmd_t *pmd, unsigned long addr, | 1312 | static int gather_pte_stats(pmd_t *pmd, unsigned long addr, |
1313 | unsigned long end, struct mm_walk *walk) | 1313 | unsigned long end, struct mm_walk *walk) |
1314 | { | 1314 | { |
1315 | struct numa_maps *md; | 1315 | struct numa_maps *md; |
1316 | spinlock_t *ptl; | 1316 | spinlock_t *ptl; |
1317 | pte_t *orig_pte; | 1317 | pte_t *orig_pte; |
1318 | pte_t *pte; | 1318 | pte_t *pte; |
1319 | 1319 | ||
1320 | md = walk->private; | 1320 | md = walk->private; |
1321 | 1321 | ||
1322 | if (pmd_trans_huge_lock(pmd, md->vma, &ptl) == 1) { | 1322 | if (pmd_trans_huge_lock(pmd, md->vma, &ptl) == 1) { |
1323 | pte_t huge_pte = *(pte_t *)pmd; | 1323 | pte_t huge_pte = *(pte_t *)pmd; |
1324 | struct page *page; | 1324 | struct page *page; |
1325 | 1325 | ||
1326 | page = can_gather_numa_stats(huge_pte, md->vma, addr); | 1326 | page = can_gather_numa_stats(huge_pte, md->vma, addr); |
1327 | if (page) | 1327 | if (page) |
1328 | gather_stats(page, md, pte_dirty(huge_pte), | 1328 | gather_stats(page, md, pte_dirty(huge_pte), |
1329 | HPAGE_PMD_SIZE/PAGE_SIZE); | 1329 | HPAGE_PMD_SIZE/PAGE_SIZE); |
1330 | spin_unlock(ptl); | 1330 | spin_unlock(ptl); |
1331 | return 0; | 1331 | return 0; |
1332 | } | 1332 | } |
1333 | 1333 | ||
1334 | if (pmd_trans_unstable(pmd)) | 1334 | if (pmd_trans_unstable(pmd)) |
1335 | return 0; | 1335 | return 0; |
1336 | orig_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl); | 1336 | orig_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl); |
1337 | do { | 1337 | do { |
1338 | struct page *page = can_gather_numa_stats(*pte, md->vma, addr); | 1338 | struct page *page = can_gather_numa_stats(*pte, md->vma, addr); |
1339 | if (!page) | 1339 | if (!page) |
1340 | continue; | 1340 | continue; |
1341 | gather_stats(page, md, pte_dirty(*pte), 1); | 1341 | gather_stats(page, md, pte_dirty(*pte), 1); |
1342 | 1342 | ||
1343 | } while (pte++, addr += PAGE_SIZE, addr != end); | 1343 | } while (pte++, addr += PAGE_SIZE, addr != end); |
1344 | pte_unmap_unlock(orig_pte, ptl); | 1344 | pte_unmap_unlock(orig_pte, ptl); |
1345 | return 0; | 1345 | return 0; |
1346 | } | 1346 | } |
1347 | #ifdef CONFIG_HUGETLB_PAGE | 1347 | #ifdef CONFIG_HUGETLB_PAGE |
1348 | static int gather_hugetbl_stats(pte_t *pte, unsigned long hmask, | 1348 | static int gather_hugetbl_stats(pte_t *pte, unsigned long hmask, |
1349 | unsigned long addr, unsigned long end, struct mm_walk *walk) | 1349 | unsigned long addr, unsigned long end, struct mm_walk *walk) |
1350 | { | 1350 | { |
1351 | struct numa_maps *md; | 1351 | struct numa_maps *md; |
1352 | struct page *page; | 1352 | struct page *page; |
1353 | 1353 | ||
1354 | if (pte_none(*pte)) | 1354 | if (!pte_present(*pte)) |
1355 | return 0; | 1355 | return 0; |
1356 | 1356 | ||
1357 | page = pte_page(*pte); | 1357 | page = pte_page(*pte); |
1358 | if (!page) | 1358 | if (!page) |
1359 | return 0; | 1359 | return 0; |
1360 | 1360 | ||
1361 | md = walk->private; | 1361 | md = walk->private; |
1362 | gather_stats(page, md, pte_dirty(*pte), 1); | 1362 | gather_stats(page, md, pte_dirty(*pte), 1); |
1363 | return 0; | 1363 | return 0; |
1364 | } | 1364 | } |
1365 | 1365 | ||
1366 | #else | 1366 | #else |
1367 | static int gather_hugetbl_stats(pte_t *pte, unsigned long hmask, | 1367 | static int gather_hugetbl_stats(pte_t *pte, unsigned long hmask, |
1368 | unsigned long addr, unsigned long end, struct mm_walk *walk) | 1368 | unsigned long addr, unsigned long end, struct mm_walk *walk) |
1369 | { | 1369 | { |
1370 | return 0; | 1370 | return 0; |
1371 | } | 1371 | } |
1372 | #endif | 1372 | #endif |
1373 | 1373 | ||
1374 | /* | 1374 | /* |
1375 | * Display pages allocated per node and memory policy via /proc. | 1375 | * Display pages allocated per node and memory policy via /proc. |
1376 | */ | 1376 | */ |
1377 | static int show_numa_map(struct seq_file *m, void *v, int is_pid) | 1377 | static int show_numa_map(struct seq_file *m, void *v, int is_pid) |
1378 | { | 1378 | { |
1379 | struct numa_maps_private *numa_priv = m->private; | 1379 | struct numa_maps_private *numa_priv = m->private; |
1380 | struct proc_maps_private *proc_priv = &numa_priv->proc_maps; | 1380 | struct proc_maps_private *proc_priv = &numa_priv->proc_maps; |
1381 | struct vm_area_struct *vma = v; | 1381 | struct vm_area_struct *vma = v; |
1382 | struct numa_maps *md = &numa_priv->md; | 1382 | struct numa_maps *md = &numa_priv->md; |
1383 | struct file *file = vma->vm_file; | 1383 | struct file *file = vma->vm_file; |
1384 | struct task_struct *task = proc_priv->task; | 1384 | struct task_struct *task = proc_priv->task; |
1385 | struct mm_struct *mm = vma->vm_mm; | 1385 | struct mm_struct *mm = vma->vm_mm; |
1386 | struct mm_walk walk = {}; | 1386 | struct mm_walk walk = {}; |
1387 | struct mempolicy *pol; | 1387 | struct mempolicy *pol; |
1388 | char buffer[64]; | 1388 | char buffer[64]; |
1389 | int nid; | 1389 | int nid; |
1390 | 1390 | ||
1391 | if (!mm) | 1391 | if (!mm) |
1392 | return 0; | 1392 | return 0; |
1393 | 1393 | ||
1394 | /* Ensure we start with an empty set of numa_maps statistics. */ | 1394 | /* Ensure we start with an empty set of numa_maps statistics. */ |
1395 | memset(md, 0, sizeof(*md)); | 1395 | memset(md, 0, sizeof(*md)); |
1396 | 1396 | ||
1397 | md->vma = vma; | 1397 | md->vma = vma; |
1398 | 1398 | ||
1399 | walk.hugetlb_entry = gather_hugetbl_stats; | 1399 | walk.hugetlb_entry = gather_hugetbl_stats; |
1400 | walk.pmd_entry = gather_pte_stats; | 1400 | walk.pmd_entry = gather_pte_stats; |
1401 | walk.private = md; | 1401 | walk.private = md; |
1402 | walk.mm = mm; | 1402 | walk.mm = mm; |
1403 | 1403 | ||
1404 | pol = get_vma_policy(task, vma, vma->vm_start); | 1404 | pol = get_vma_policy(task, vma, vma->vm_start); |
1405 | mpol_to_str(buffer, sizeof(buffer), pol); | 1405 | mpol_to_str(buffer, sizeof(buffer), pol); |
1406 | mpol_cond_put(pol); | 1406 | mpol_cond_put(pol); |
1407 | 1407 | ||
1408 | seq_printf(m, "%08lx %s", vma->vm_start, buffer); | 1408 | seq_printf(m, "%08lx %s", vma->vm_start, buffer); |
1409 | 1409 | ||
1410 | if (file) { | 1410 | if (file) { |
1411 | seq_printf(m, " file="); | 1411 | seq_printf(m, " file="); |
1412 | seq_path(m, &file->f_path, "\n\t= "); | 1412 | seq_path(m, &file->f_path, "\n\t= "); |
1413 | } else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) { | 1413 | } else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) { |
1414 | seq_printf(m, " heap"); | 1414 | seq_printf(m, " heap"); |
1415 | } else { | 1415 | } else { |
1416 | pid_t tid = vm_is_stack(task, vma, is_pid); | 1416 | pid_t tid = vm_is_stack(task, vma, is_pid); |
1417 | if (tid != 0) { | 1417 | if (tid != 0) { |
1418 | /* | 1418 | /* |
1419 | * Thread stack in /proc/PID/task/TID/maps or | 1419 | * Thread stack in /proc/PID/task/TID/maps or |
1420 | * the main process stack. | 1420 | * the main process stack. |
1421 | */ | 1421 | */ |
1422 | if (!is_pid || (vma->vm_start <= mm->start_stack && | 1422 | if (!is_pid || (vma->vm_start <= mm->start_stack && |
1423 | vma->vm_end >= mm->start_stack)) | 1423 | vma->vm_end >= mm->start_stack)) |
1424 | seq_printf(m, " stack"); | 1424 | seq_printf(m, " stack"); |
1425 | else | 1425 | else |
1426 | seq_printf(m, " stack:%d", tid); | 1426 | seq_printf(m, " stack:%d", tid); |
1427 | } | 1427 | } |
1428 | } | 1428 | } |
1429 | 1429 | ||
1430 | if (is_vm_hugetlb_page(vma)) | 1430 | if (is_vm_hugetlb_page(vma)) |
1431 | seq_printf(m, " huge"); | 1431 | seq_printf(m, " huge"); |
1432 | 1432 | ||
1433 | walk_page_range(vma->vm_start, vma->vm_end, &walk); | 1433 | walk_page_range(vma->vm_start, vma->vm_end, &walk); |
1434 | 1434 | ||
1435 | if (!md->pages) | 1435 | if (!md->pages) |
1436 | goto out; | 1436 | goto out; |
1437 | 1437 | ||
1438 | if (md->anon) | 1438 | if (md->anon) |
1439 | seq_printf(m, " anon=%lu", md->anon); | 1439 | seq_printf(m, " anon=%lu", md->anon); |
1440 | 1440 | ||
1441 | if (md->dirty) | 1441 | if (md->dirty) |
1442 | seq_printf(m, " dirty=%lu", md->dirty); | 1442 | seq_printf(m, " dirty=%lu", md->dirty); |
1443 | 1443 | ||
1444 | if (md->pages != md->anon && md->pages != md->dirty) | 1444 | if (md->pages != md->anon && md->pages != md->dirty) |
1445 | seq_printf(m, " mapped=%lu", md->pages); | 1445 | seq_printf(m, " mapped=%lu", md->pages); |
1446 | 1446 | ||
1447 | if (md->mapcount_max > 1) | 1447 | if (md->mapcount_max > 1) |
1448 | seq_printf(m, " mapmax=%lu", md->mapcount_max); | 1448 | seq_printf(m, " mapmax=%lu", md->mapcount_max); |
1449 | 1449 | ||
1450 | if (md->swapcache) | 1450 | if (md->swapcache) |
1451 | seq_printf(m, " swapcache=%lu", md->swapcache); | 1451 | seq_printf(m, " swapcache=%lu", md->swapcache); |
1452 | 1452 | ||
1453 | if (md->active < md->pages && !is_vm_hugetlb_page(vma)) | 1453 | if (md->active < md->pages && !is_vm_hugetlb_page(vma)) |
1454 | seq_printf(m, " active=%lu", md->active); | 1454 | seq_printf(m, " active=%lu", md->active); |
1455 | 1455 | ||
1456 | if (md->writeback) | 1456 | if (md->writeback) |
1457 | seq_printf(m, " writeback=%lu", md->writeback); | 1457 | seq_printf(m, " writeback=%lu", md->writeback); |
1458 | 1458 | ||
1459 | for_each_node_state(nid, N_MEMORY) | 1459 | for_each_node_state(nid, N_MEMORY) |
1460 | if (md->node[nid]) | 1460 | if (md->node[nid]) |
1461 | seq_printf(m, " N%d=%lu", nid, md->node[nid]); | 1461 | seq_printf(m, " N%d=%lu", nid, md->node[nid]); |
1462 | out: | 1462 | out: |
1463 | seq_putc(m, '\n'); | 1463 | seq_putc(m, '\n'); |
1464 | 1464 | ||
1465 | if (m->count < m->size) | 1465 | if (m->count < m->size) |
1466 | m->version = (vma != proc_priv->tail_vma) ? vma->vm_start : 0; | 1466 | m->version = (vma != proc_priv->tail_vma) ? vma->vm_start : 0; |
1467 | return 0; | 1467 | return 0; |
1468 | } | 1468 | } |
1469 | 1469 | ||
1470 | static int show_pid_numa_map(struct seq_file *m, void *v) | 1470 | static int show_pid_numa_map(struct seq_file *m, void *v) |
1471 | { | 1471 | { |
1472 | return show_numa_map(m, v, 1); | 1472 | return show_numa_map(m, v, 1); |
1473 | } | 1473 | } |
1474 | 1474 | ||
1475 | static int show_tid_numa_map(struct seq_file *m, void *v) | 1475 | static int show_tid_numa_map(struct seq_file *m, void *v) |
1476 | { | 1476 | { |
1477 | return show_numa_map(m, v, 0); | 1477 | return show_numa_map(m, v, 0); |
1478 | } | 1478 | } |
1479 | 1479 | ||
1480 | static const struct seq_operations proc_pid_numa_maps_op = { | 1480 | static const struct seq_operations proc_pid_numa_maps_op = { |
1481 | .start = m_start, | 1481 | .start = m_start, |
1482 | .next = m_next, | 1482 | .next = m_next, |
1483 | .stop = m_stop, | 1483 | .stop = m_stop, |
1484 | .show = show_pid_numa_map, | 1484 | .show = show_pid_numa_map, |
1485 | }; | 1485 | }; |
1486 | 1486 | ||
1487 | static const struct seq_operations proc_tid_numa_maps_op = { | 1487 | static const struct seq_operations proc_tid_numa_maps_op = { |
1488 | .start = m_start, | 1488 | .start = m_start, |
1489 | .next = m_next, | 1489 | .next = m_next, |
1490 | .stop = m_stop, | 1490 | .stop = m_stop, |
1491 | .show = show_tid_numa_map, | 1491 | .show = show_tid_numa_map, |
1492 | }; | 1492 | }; |
1493 | 1493 | ||
1494 | static int numa_maps_open(struct inode *inode, struct file *file, | 1494 | static int numa_maps_open(struct inode *inode, struct file *file, |
1495 | const struct seq_operations *ops) | 1495 | const struct seq_operations *ops) |
1496 | { | 1496 | { |
1497 | struct numa_maps_private *priv; | 1497 | struct numa_maps_private *priv; |
1498 | int ret = -ENOMEM; | 1498 | int ret = -ENOMEM; |
1499 | priv = kzalloc(sizeof(*priv), GFP_KERNEL); | 1499 | priv = kzalloc(sizeof(*priv), GFP_KERNEL); |
1500 | if (priv) { | 1500 | if (priv) { |
1501 | priv->proc_maps.pid = proc_pid(inode); | 1501 | priv->proc_maps.pid = proc_pid(inode); |
1502 | ret = seq_open(file, ops); | 1502 | ret = seq_open(file, ops); |
1503 | if (!ret) { | 1503 | if (!ret) { |
1504 | struct seq_file *m = file->private_data; | 1504 | struct seq_file *m = file->private_data; |
1505 | m->private = priv; | 1505 | m->private = priv; |
1506 | } else { | 1506 | } else { |
1507 | kfree(priv); | 1507 | kfree(priv); |
1508 | } | 1508 | } |
1509 | } | 1509 | } |
1510 | return ret; | 1510 | return ret; |
1511 | } | 1511 | } |
1512 | 1512 | ||
1513 | static int pid_numa_maps_open(struct inode *inode, struct file *file) | 1513 | static int pid_numa_maps_open(struct inode *inode, struct file *file) |
1514 | { | 1514 | { |
1515 | return numa_maps_open(inode, file, &proc_pid_numa_maps_op); | 1515 | return numa_maps_open(inode, file, &proc_pid_numa_maps_op); |
1516 | } | 1516 | } |
1517 | 1517 | ||
1518 | static int tid_numa_maps_open(struct inode *inode, struct file *file) | 1518 | static int tid_numa_maps_open(struct inode *inode, struct file *file) |
1519 | { | 1519 | { |
1520 | return numa_maps_open(inode, file, &proc_tid_numa_maps_op); | 1520 | return numa_maps_open(inode, file, &proc_tid_numa_maps_op); |
1521 | } | 1521 | } |
1522 | 1522 | ||
1523 | const struct file_operations proc_pid_numa_maps_operations = { | 1523 | const struct file_operations proc_pid_numa_maps_operations = { |
1524 | .open = pid_numa_maps_open, | 1524 | .open = pid_numa_maps_open, |
1525 | .read = seq_read, | 1525 | .read = seq_read, |
1526 | .llseek = seq_lseek, | 1526 | .llseek = seq_lseek, |
1527 | .release = seq_release_private, | 1527 | .release = seq_release_private, |
1528 | }; | 1528 | }; |
1529 | 1529 | ||
1530 | const struct file_operations proc_tid_numa_maps_operations = { | 1530 | const struct file_operations proc_tid_numa_maps_operations = { |
1531 | .open = tid_numa_maps_open, | 1531 | .open = tid_numa_maps_open, |
1532 | .read = seq_read, | 1532 | .read = seq_read, |
1533 | .llseek = seq_lseek, | 1533 | .llseek = seq_lseek, |
1534 | .release = seq_release_private, | 1534 | .release = seq_release_private, |
1535 | }; | 1535 | }; |
1536 | #endif /* CONFIG_NUMA */ | 1536 | #endif /* CONFIG_NUMA */ |
1537 | 1537 |
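For reference, a minimal userspace sketch (not part of this patch) of how the 64-bit pagemap entries documented above can be read and decoded: bit 63 present, bit 62 swapped, bits 0-54 the PFN when present. With the !pte_present() checks added by this commit, a hugetlb page under migration is reported as not present instead of exposing a stale frame. The pid/address arguments and error handling are illustrative only; build it as an ordinary C program.

#include <stdio.h>
#include <inttypes.h>
#include <stdlib.h>
#include <fcntl.h>
#include <unistd.h>

/* Read and decode the pagemap entry for one virtual address of a target pid. */
int main(int argc, char **argv)
{
	if (argc != 3) {
		fprintf(stderr, "usage: %s <pid> <hex-vaddr>\n", argv[0]);
		return 1;
	}
	long page_size = sysconf(_SC_PAGESIZE);
	uint64_t vaddr = strtoull(argv[2], NULL, 16);
	char path[64];
	snprintf(path, sizeof(path), "/proc/%s/pagemap", argv[1]);

	int fd = open(path, O_RDONLY);
	if (fd < 0) {
		perror("open");
		return 1;
	}

	/* One 64-bit entry per virtual page: seek to the entry for vaddr. */
	uint64_t entry;
	off_t off = (off_t)(vaddr / page_size) * sizeof(entry);
	if (pread(fd, &entry, sizeof(entry), off) != sizeof(entry)) {
		perror("pread");
		close(fd);
		return 1;
	}
	close(fd);

	int present = (int)((entry >> 63) & 1);		/* bit 63: page present */
	int swapped = (int)((entry >> 62) & 1);		/* bit 62: page swapped */
	uint64_t pfn = entry & ((1ULL << 55) - 1);	/* bits 0-54: PFN, valid if present */

	printf("entry=0x%016" PRIx64 " present=%d swapped=%d pfn=0x%" PRIx64 "\n",
	       entry, present, swapped, pfn);
	return 0;
}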
mm/mempolicy.c
1 | /* | 1 | /* |
2 | * Simple NUMA memory policy for the Linux kernel. | 2 | * Simple NUMA memory policy for the Linux kernel. |
3 | * | 3 | * |
4 | * Copyright 2003,2004 Andi Kleen, SuSE Labs. | 4 | * Copyright 2003,2004 Andi Kleen, SuSE Labs. |
5 | * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc. | 5 | * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc. |
6 | * Subject to the GNU Public License, version 2. | 6 | * Subject to the GNU Public License, version 2. |
7 | * | 7 | * |
8 | * NUMA policy allows the user to give hints in which node(s) memory should | 8 | * NUMA policy allows the user to give hints in which node(s) memory should |
9 | * be allocated. | 9 | * be allocated. |
10 | * | 10 | * |
11 | * Support four policies per VMA and per process: | 11 | * Support four policies per VMA and per process: |
12 | * | 12 | * |
13 | * The VMA policy has priority over the process policy for a page fault. | 13 | * The VMA policy has priority over the process policy for a page fault. |
14 | * | 14 | * |
15 | * interleave Allocate memory interleaved over a set of nodes, | 15 | * interleave Allocate memory interleaved over a set of nodes, |
16 | * with normal fallback if it fails. | 16 | * with normal fallback if it fails. |
17 | * For VMA based allocations this interleaves based on the | 17 | * For VMA based allocations this interleaves based on the |
18 | * offset into the backing object or offset into the mapping | 18 | * offset into the backing object or offset into the mapping |
19 | * for anonymous memory. For process policy a process counter | 19 | * for anonymous memory. For process policy a process counter |
20 | * is used. | 20 | * is used. |
21 | * | 21 | * |
22 | * bind Only allocate memory on a specific set of nodes, | 22 | * bind Only allocate memory on a specific set of nodes, |
23 | * no fallback. | 23 | * no fallback. |
24 | * FIXME: memory is allocated starting with the first node | 24 | * FIXME: memory is allocated starting with the first node |
25 | * to the last. It would be better if bind would truly restrict | 25 | * to the last. It would be better if bind would truly restrict |
26 | * the allocation to memory nodes instead | 26 | * the allocation to memory nodes instead |
27 | * | 27 | * |
28 | * preferred Try a specific node first before normal fallback. | 28 | * preferred Try a specific node first before normal fallback. |
29 | * As a special case NUMA_NO_NODE here means do the allocation | 29 | * As a special case NUMA_NO_NODE here means do the allocation |
30 | * on the local CPU. This is normally identical to default, | 30 | * on the local CPU. This is normally identical to default, |
31 | * but useful to set in a VMA when you have a non default | 31 | * but useful to set in a VMA when you have a non default |
32 | * process policy. | 32 | * process policy. |
33 | * | 33 | * |
34 | * default Allocate on the local node first, or when on a VMA | 34 | * default Allocate on the local node first, or when on a VMA |
35 | * use the process policy. This is what Linux always did | 35 | * use the process policy. This is what Linux always did |
36 | * in a NUMA aware kernel and still does by, ahem, default. | 36 | * in a NUMA aware kernel and still does by, ahem, default. |
37 | * | 37 | * |
38 | * The process policy is applied for most non interrupt memory allocations | 38 | * The process policy is applied for most non interrupt memory allocations |
39 | * in that process' context. Interrupts ignore the policies and always | 39 | * in that process' context. Interrupts ignore the policies and always |
40 | * try to allocate on the local CPU. The VMA policy is only applied for memory | 40 | * try to allocate on the local CPU. The VMA policy is only applied for memory |
41 | * allocations for a VMA in the VM. | 41 | * allocations for a VMA in the VM. |
42 | * | 42 | * |
43 | * Currently there are a few corner cases in swapping where the policy | 43 | * Currently there are a few corner cases in swapping where the policy |
44 | * is not applied, but the majority should be handled. When process policy | 44 | * is not applied, but the majority should be handled. When process policy |
45 | * is used it is not remembered over swap outs/swap ins. | 45 | * is used it is not remembered over swap outs/swap ins. |
46 | * | 46 | * |
47 | * Only the highest zone in the zone hierarchy gets policied. Allocations | 47 | * Only the highest zone in the zone hierarchy gets policied. Allocations |
48 | * requesting a lower zone just use default policy. This implies that | 48 | * requesting a lower zone just use default policy. This implies that |
49 | * on systems with highmem, kernel lowmem allocations don't get policied. | 49 | * on systems with highmem, kernel lowmem allocations don't get policied. |
50 | * Same with GFP_DMA allocations. | 50 | * Same with GFP_DMA allocations. |
51 | * | 51 | * |
52 | * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between | 52 | * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between |
53 | * all users and remembered even when nobody has memory mapped. | 53 | * all users and remembered even when nobody has memory mapped. |
54 | */ | 54 | */ |
55 | 55 | ||
56 | /* Notebook: | 56 | /* Notebook: |
57 | fix mmap readahead to honour policy and enable policy for any page cache | 57 | fix mmap readahead to honour policy and enable policy for any page cache |
58 | object | 58 | object |
59 | statistics for bigpages | 59 | statistics for bigpages |
60 | global policy for page cache? currently it uses process policy. Requires | 60 | global policy for page cache? currently it uses process policy. Requires |
61 | first item above. | 61 | first item above. |
62 | handle mremap for shared memory (currently ignored for the policy) | 62 | handle mremap for shared memory (currently ignored for the policy) |
63 | grows down? | 63 | grows down? |
64 | make bind policy root only? It can trigger oom much faster and the | 64 | make bind policy root only? It can trigger oom much faster and the |
65 | kernel is not always grateful with that. | 65 | kernel is not always grateful with that. |
66 | */ | 66 | */ |
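To make the policy modes described above concrete, here is a hedged userspace sketch (not part of this file) of how a policy is requested through the mbind(2)/set_mempolicy(2) interface declared in <numaif.h>. The two-node mask is an assumption for illustration; compile with -lnuma for the syscall wrappers.

#define _GNU_SOURCE
#include <numaif.h>		/* mbind, set_mempolicy, MPOL_* */
#include <sys/mman.h>
#include <stdio.h>

int main(void)
{
	size_t len = 4UL << 20;		/* 4 MiB anonymous region */
	void *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (buf == MAP_FAILED) {
		perror("mmap");
		return 1;
	}

	/* VMA policy: interleave this region's pages across nodes 0 and 1
	 * (assumes a machine with at least two memory nodes). */
	unsigned long nodemask = (1UL << 0) | (1UL << 1);
	if (mbind(buf, len, MPOL_INTERLEAVE, &nodemask,
		  sizeof(nodemask) * 8, 0) != 0)
		perror("mbind");

	/* Process policy: fall back to default local allocation. */
	if (set_mempolicy(MPOL_DEFAULT, NULL, 0) != 0)
		perror("set_mempolicy");

	munmap(buf, len);
	return 0;
}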
67 | 67 | ||
68 | #include <linux/mempolicy.h> | 68 | #include <linux/mempolicy.h> |
69 | #include <linux/mm.h> | 69 | #include <linux/mm.h> |
70 | #include <linux/highmem.h> | 70 | #include <linux/highmem.h> |
71 | #include <linux/hugetlb.h> | 71 | #include <linux/hugetlb.h> |
72 | #include <linux/kernel.h> | 72 | #include <linux/kernel.h> |
73 | #include <linux/sched.h> | 73 | #include <linux/sched.h> |
74 | #include <linux/nodemask.h> | 74 | #include <linux/nodemask.h> |
75 | #include <linux/cpuset.h> | 75 | #include <linux/cpuset.h> |
76 | #include <linux/slab.h> | 76 | #include <linux/slab.h> |
77 | #include <linux/string.h> | 77 | #include <linux/string.h> |
78 | #include <linux/export.h> | 78 | #include <linux/export.h> |
79 | #include <linux/nsproxy.h> | 79 | #include <linux/nsproxy.h> |
80 | #include <linux/interrupt.h> | 80 | #include <linux/interrupt.h> |
81 | #include <linux/init.h> | 81 | #include <linux/init.h> |
82 | #include <linux/compat.h> | 82 | #include <linux/compat.h> |
83 | #include <linux/swap.h> | 83 | #include <linux/swap.h> |
84 | #include <linux/seq_file.h> | 84 | #include <linux/seq_file.h> |
85 | #include <linux/proc_fs.h> | 85 | #include <linux/proc_fs.h> |
86 | #include <linux/migrate.h> | 86 | #include <linux/migrate.h> |
87 | #include <linux/ksm.h> | 87 | #include <linux/ksm.h> |
88 | #include <linux/rmap.h> | 88 | #include <linux/rmap.h> |
89 | #include <linux/security.h> | 89 | #include <linux/security.h> |
90 | #include <linux/syscalls.h> | 90 | #include <linux/syscalls.h> |
91 | #include <linux/ctype.h> | 91 | #include <linux/ctype.h> |
92 | #include <linux/mm_inline.h> | 92 | #include <linux/mm_inline.h> |
93 | #include <linux/mmu_notifier.h> | 93 | #include <linux/mmu_notifier.h> |
94 | 94 | ||
95 | #include <asm/tlbflush.h> | 95 | #include <asm/tlbflush.h> |
96 | #include <asm/uaccess.h> | 96 | #include <asm/uaccess.h> |
97 | #include <linux/random.h> | 97 | #include <linux/random.h> |
98 | 98 | ||
99 | #include "internal.h" | 99 | #include "internal.h" |
100 | 100 | ||
101 | /* Internal flags */ | 101 | /* Internal flags */ |
102 | #define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0) /* Skip checks for continuous vmas */ | 102 | #define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0) /* Skip checks for continuous vmas */ |
103 | #define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1) /* Invert check for nodemask */ | 103 | #define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1) /* Invert check for nodemask */ |
104 | 104 | ||
105 | static struct kmem_cache *policy_cache; | 105 | static struct kmem_cache *policy_cache; |
106 | static struct kmem_cache *sn_cache; | 106 | static struct kmem_cache *sn_cache; |
107 | 107 | ||
108 | /* Highest zone. A specific allocation for a zone below that is not | 108 | /* Highest zone. A specific allocation for a zone below that is not |
109 | policied. */ | 109 | policied. */ |
110 | enum zone_type policy_zone = 0; | 110 | enum zone_type policy_zone = 0; |
111 | 111 | ||
112 | /* | 112 | /* |
113 | * run-time system-wide default policy => local allocation | 113 | * run-time system-wide default policy => local allocation |
114 | */ | 114 | */ |
115 | static struct mempolicy default_policy = { | 115 | static struct mempolicy default_policy = { |
116 | .refcnt = ATOMIC_INIT(1), /* never free it */ | 116 | .refcnt = ATOMIC_INIT(1), /* never free it */ |
117 | .mode = MPOL_PREFERRED, | 117 | .mode = MPOL_PREFERRED, |
118 | .flags = MPOL_F_LOCAL, | 118 | .flags = MPOL_F_LOCAL, |
119 | }; | 119 | }; |
120 | 120 | ||
121 | static struct mempolicy preferred_node_policy[MAX_NUMNODES]; | 121 | static struct mempolicy preferred_node_policy[MAX_NUMNODES]; |
122 | 122 | ||
123 | static struct mempolicy *get_task_policy(struct task_struct *p) | 123 | static struct mempolicy *get_task_policy(struct task_struct *p) |
124 | { | 124 | { |
125 | struct mempolicy *pol = p->mempolicy; | 125 | struct mempolicy *pol = p->mempolicy; |
126 | 126 | ||
127 | if (!pol) { | 127 | if (!pol) { |
128 | int node = numa_node_id(); | 128 | int node = numa_node_id(); |
129 | 129 | ||
130 | if (node != NUMA_NO_NODE) { | 130 | if (node != NUMA_NO_NODE) { |
131 | pol = &preferred_node_policy[node]; | 131 | pol = &preferred_node_policy[node]; |
132 | /* | 132 | /* |
133 | * preferred_node_policy is not initialised early in | 133 | * preferred_node_policy is not initialised early in |
134 | * boot | 134 | * boot |
135 | */ | 135 | */ |
136 | if (!pol->mode) | 136 | if (!pol->mode) |
137 | pol = NULL; | 137 | pol = NULL; |
138 | } | 138 | } |
139 | } | 139 | } |
140 | 140 | ||
141 | return pol; | 141 | return pol; |
142 | } | 142 | } |
143 | 143 | ||
144 | static const struct mempolicy_operations { | 144 | static const struct mempolicy_operations { |
145 | int (*create)(struct mempolicy *pol, const nodemask_t *nodes); | 145 | int (*create)(struct mempolicy *pol, const nodemask_t *nodes); |
146 | /* | 146 | /* |
147 | * If read-side task has no lock to protect task->mempolicy, write-side | 147 | * If read-side task has no lock to protect task->mempolicy, write-side |
148 | * task will rebind the task->mempolicy in two steps. The first step is | 148 | * task will rebind the task->mempolicy in two steps. The first step is |
149 | * setting all the newly allowed nodes, and the second step is cleaning all the | 149 | * setting all the newly allowed nodes, and the second step is cleaning all the |
150 | * disallowed nodes. In this way, we can avoid finding no node to alloc | 150 | * disallowed nodes. In this way, we can avoid finding no node to alloc |
151 | * page. | 151 | * page. |
152 | * If we have a lock to protect task->mempolicy in read-side, we do | 152 | * If we have a lock to protect task->mempolicy in read-side, we do |
153 | * rebind directly. | 153 | * rebind directly. |
154 | * | 154 | * |
155 | * step: | 155 | * step: |
156 | * MPOL_REBIND_ONCE - do rebind work at once | 156 | * MPOL_REBIND_ONCE - do rebind work at once |
157 | * MPOL_REBIND_STEP1 - set all the newly allowed nodes | 157 | * MPOL_REBIND_STEP1 - set all the newly allowed nodes |
158 | * MPOL_REBIND_STEP2 - clean all the disallowed nodes | 158 | * MPOL_REBIND_STEP2 - clean all the disallowed nodes |
159 | */ | 159 | */ |
160 | void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes, | 160 | void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes, |
161 | enum mpol_rebind_step step); | 161 | enum mpol_rebind_step step); |
162 | } mpol_ops[MPOL_MAX]; | 162 | } mpol_ops[MPOL_MAX]; |
163 | 163 | ||
164 | /* Check that the nodemask contains at least one populated zone */ | 164 | /* Check that the nodemask contains at least one populated zone */ |
165 | static int is_valid_nodemask(const nodemask_t *nodemask) | 165 | static int is_valid_nodemask(const nodemask_t *nodemask) |
166 | { | 166 | { |
167 | return nodes_intersects(*nodemask, node_states[N_MEMORY]); | 167 | return nodes_intersects(*nodemask, node_states[N_MEMORY]); |
168 | } | 168 | } |
169 | 169 | ||
170 | static inline int mpol_store_user_nodemask(const struct mempolicy *pol) | 170 | static inline int mpol_store_user_nodemask(const struct mempolicy *pol) |
171 | { | 171 | { |
172 | return pol->flags & MPOL_MODE_FLAGS; | 172 | return pol->flags & MPOL_MODE_FLAGS; |
173 | } | 173 | } |
174 | 174 | ||
175 | static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig, | 175 | static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig, |
176 | const nodemask_t *rel) | 176 | const nodemask_t *rel) |
177 | { | 177 | { |
178 | nodemask_t tmp; | 178 | nodemask_t tmp; |
179 | nodes_fold(tmp, *orig, nodes_weight(*rel)); | 179 | nodes_fold(tmp, *orig, nodes_weight(*rel)); |
180 | nodes_onto(*ret, tmp, *rel); | 180 | nodes_onto(*ret, tmp, *rel); |
181 | } | 181 | } |
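As an illustration of the MPOL_F_RELATIVE_NODES remapping performed by mpol_relative_nodemask() above: the user's nodemask is folded modulo the number of allowed nodes and then mapped onto the allowed set, so relative node n means "the n-th node the cpuset allows". Below is a minimal userspace model of that fold/onto step, using plain 64-bit masks in place of nodemask_t; the helpers fold() and onto() are simplified stand-ins for the kernel's nodes_fold()/nodes_onto(), not the real implementations.

#include <stdint.h>
#include <stdio.h>

/* Like nodes_fold(): wrap the bits of orig modulo sz. */
static uint64_t fold(uint64_t orig, int sz)
{
	uint64_t dst = 0;

	for (int i = 0; i < 64; i++)
		if (orig & (1ULL << i))
			dst |= 1ULL << (i % sz);
	return dst;
}

/* Like nodes_onto(): bit n of orig selects the n-th set bit of relmap. */
static uint64_t onto(uint64_t orig, uint64_t relmap)
{
	uint64_t dst = 0;
	int pos = 0;

	for (int i = 0; i < 64; i++) {
		if (!(relmap & (1ULL << i)))
			continue;
		if (orig & (1ULL << pos))
			dst |= 1ULL << i;
		pos++;
	}
	return dst;
}

int main(void)
{
	uint64_t allowed = 0x2c;	/* cpuset allows nodes {2,3,5} */
	uint64_t user = 0x05;		/* relative nodes {0,2} */
	uint64_t tmp = fold(user, __builtin_popcountll(allowed));

	/* Prints 0x24, i.e. physical nodes {2,5}. */
	printf("0x%llx\n", (unsigned long long)onto(tmp, allowed));
	return 0;
}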
182 | 182 | ||
183 | static int mpol_new_interleave(struct mempolicy *pol, const nodemask_t *nodes) | 183 | static int mpol_new_interleave(struct mempolicy *pol, const nodemask_t *nodes) |
184 | { | 184 | { |
185 | if (nodes_empty(*nodes)) | 185 | if (nodes_empty(*nodes)) |
186 | return -EINVAL; | 186 | return -EINVAL; |
187 | pol->v.nodes = *nodes; | 187 | pol->v.nodes = *nodes; |
188 | return 0; | 188 | return 0; |
189 | } | 189 | } |
190 | 190 | ||
191 | static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes) | 191 | static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes) |
192 | { | 192 | { |
193 | if (!nodes) | 193 | if (!nodes) |
194 | pol->flags |= MPOL_F_LOCAL; /* local allocation */ | 194 | pol->flags |= MPOL_F_LOCAL; /* local allocation */ |
195 | else if (nodes_empty(*nodes)) | 195 | else if (nodes_empty(*nodes)) |
196 | return -EINVAL; /* no allowed nodes */ | 196 | return -EINVAL; /* no allowed nodes */ |
197 | else | 197 | else |
198 | pol->v.preferred_node = first_node(*nodes); | 198 | pol->v.preferred_node = first_node(*nodes); |
199 | return 0; | 199 | return 0; |
200 | } | 200 | } |
201 | 201 | ||
202 | static int mpol_new_bind(struct mempolicy *pol, const nodemask_t *nodes) | 202 | static int mpol_new_bind(struct mempolicy *pol, const nodemask_t *nodes) |
203 | { | 203 | { |
204 | if (!is_valid_nodemask(nodes)) | 204 | if (!is_valid_nodemask(nodes)) |
205 | return -EINVAL; | 205 | return -EINVAL; |
206 | pol->v.nodes = *nodes; | 206 | pol->v.nodes = *nodes; |
207 | return 0; | 207 | return 0; |
208 | } | 208 | } |
209 | 209 | ||
210 | /* | 210 | /* |
211 | * mpol_set_nodemask is called after mpol_new() to set up the nodemask, if | 211 | * mpol_set_nodemask is called after mpol_new() to set up the nodemask, if |
212 | * any, for the new policy. mpol_new() has already validated the nodes | 212 | * any, for the new policy. mpol_new() has already validated the nodes |
213 | * parameter with respect to the policy mode and flags. But, we need to | 213 | * parameter with respect to the policy mode and flags. But, we need to |
214 | * handle an empty nodemask with MPOL_PREFERRED here. | 214 | * handle an empty nodemask with MPOL_PREFERRED here. |
215 | * | 215 | * |
216 | * Must be called holding task's alloc_lock to protect task's mems_allowed | 216 | * Must be called holding task's alloc_lock to protect task's mems_allowed |
217 | * and mempolicy. May also be called holding the mmap_semaphore for write. | 217 | * and mempolicy. May also be called holding the mmap_semaphore for write. |
218 | */ | 218 | */ |
219 | static int mpol_set_nodemask(struct mempolicy *pol, | 219 | static int mpol_set_nodemask(struct mempolicy *pol, |
220 | const nodemask_t *nodes, struct nodemask_scratch *nsc) | 220 | const nodemask_t *nodes, struct nodemask_scratch *nsc) |
221 | { | 221 | { |
222 | int ret; | 222 | int ret; |
223 | 223 | ||
224 | /* if mode is MPOL_DEFAULT, pol is NULL. This is right. */ | 224 | /* if mode is MPOL_DEFAULT, pol is NULL. This is right. */ |
225 | if (pol == NULL) | 225 | if (pol == NULL) |
226 | return 0; | 226 | return 0; |
227 | /* Check N_MEMORY */ | 227 | /* Check N_MEMORY */ |
228 | nodes_and(nsc->mask1, | 228 | nodes_and(nsc->mask1, |
229 | cpuset_current_mems_allowed, node_states[N_MEMORY]); | 229 | cpuset_current_mems_allowed, node_states[N_MEMORY]); |
230 | 230 | ||
231 | VM_BUG_ON(!nodes); | 231 | VM_BUG_ON(!nodes); |
232 | if (pol->mode == MPOL_PREFERRED && nodes_empty(*nodes)) | 232 | if (pol->mode == MPOL_PREFERRED && nodes_empty(*nodes)) |
233 | nodes = NULL; /* explicit local allocation */ | 233 | nodes = NULL; /* explicit local allocation */ |
234 | else { | 234 | else { |
235 | if (pol->flags & MPOL_F_RELATIVE_NODES) | 235 | if (pol->flags & MPOL_F_RELATIVE_NODES) |
236 | mpol_relative_nodemask(&nsc->mask2, nodes,&nsc->mask1); | 236 | mpol_relative_nodemask(&nsc->mask2, nodes,&nsc->mask1); |
237 | else | 237 | else |
238 | nodes_and(nsc->mask2, *nodes, nsc->mask1); | 238 | nodes_and(nsc->mask2, *nodes, nsc->mask1); |
239 | 239 | ||
240 | if (mpol_store_user_nodemask(pol)) | 240 | if (mpol_store_user_nodemask(pol)) |
241 | pol->w.user_nodemask = *nodes; | 241 | pol->w.user_nodemask = *nodes; |
242 | else | 242 | else |
243 | pol->w.cpuset_mems_allowed = | 243 | pol->w.cpuset_mems_allowed = |
244 | cpuset_current_mems_allowed; | 244 | cpuset_current_mems_allowed; |
245 | } | 245 | } |
246 | 246 | ||
247 | if (nodes) | 247 | if (nodes) |
248 | ret = mpol_ops[pol->mode].create(pol, &nsc->mask2); | 248 | ret = mpol_ops[pol->mode].create(pol, &nsc->mask2); |
249 | else | 249 | else |
250 | ret = mpol_ops[pol->mode].create(pol, NULL); | 250 | ret = mpol_ops[pol->mode].create(pol, NULL); |
251 | return ret; | 251 | return ret; |
252 | } | 252 | } |
253 | 253 | ||
254 | /* | 254 | /* |
255 | * This function just creates a new policy, does some basic checks and simple | 255 | * This function just creates a new policy, does some basic checks and simple |
256 | * initialization. You must invoke mpol_set_nodemask() to set nodes. | 256 | * initialization. You must invoke mpol_set_nodemask() to set nodes. |
257 | */ | 257 | */ |
258 | static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags, | 258 | static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags, |
259 | nodemask_t *nodes) | 259 | nodemask_t *nodes) |
260 | { | 260 | { |
261 | struct mempolicy *policy; | 261 | struct mempolicy *policy; |
262 | 262 | ||
263 | pr_debug("setting mode %d flags %d nodes[0] %lx\n", | 263 | pr_debug("setting mode %d flags %d nodes[0] %lx\n", |
264 | mode, flags, nodes ? nodes_addr(*nodes)[0] : NUMA_NO_NODE); | 264 | mode, flags, nodes ? nodes_addr(*nodes)[0] : NUMA_NO_NODE); |
265 | 265 | ||
266 | if (mode == MPOL_DEFAULT) { | 266 | if (mode == MPOL_DEFAULT) { |
267 | if (nodes && !nodes_empty(*nodes)) | 267 | if (nodes && !nodes_empty(*nodes)) |
268 | return ERR_PTR(-EINVAL); | 268 | return ERR_PTR(-EINVAL); |
269 | return NULL; | 269 | return NULL; |
270 | } | 270 | } |
271 | VM_BUG_ON(!nodes); | 271 | VM_BUG_ON(!nodes); |
272 | 272 | ||
273 | /* | 273 | /* |
274 | * MPOL_PREFERRED cannot be used with MPOL_F_STATIC_NODES or | 274 | * MPOL_PREFERRED cannot be used with MPOL_F_STATIC_NODES or |
275 | * MPOL_F_RELATIVE_NODES if the nodemask is empty (local allocation). | 275 | * MPOL_F_RELATIVE_NODES if the nodemask is empty (local allocation). |
276 | * All other modes require a valid pointer to a non-empty nodemask. | 276 | * All other modes require a valid pointer to a non-empty nodemask. |
277 | */ | 277 | */ |
278 | if (mode == MPOL_PREFERRED) { | 278 | if (mode == MPOL_PREFERRED) { |
279 | if (nodes_empty(*nodes)) { | 279 | if (nodes_empty(*nodes)) { |
280 | if (((flags & MPOL_F_STATIC_NODES) || | 280 | if (((flags & MPOL_F_STATIC_NODES) || |
281 | (flags & MPOL_F_RELATIVE_NODES))) | 281 | (flags & MPOL_F_RELATIVE_NODES))) |
282 | return ERR_PTR(-EINVAL); | 282 | return ERR_PTR(-EINVAL); |
283 | } | 283 | } |
284 | } else if (mode == MPOL_LOCAL) { | 284 | } else if (mode == MPOL_LOCAL) { |
285 | if (!nodes_empty(*nodes)) | 285 | if (!nodes_empty(*nodes)) |
286 | return ERR_PTR(-EINVAL); | 286 | return ERR_PTR(-EINVAL); |
287 | mode = MPOL_PREFERRED; | 287 | mode = MPOL_PREFERRED; |
288 | } else if (nodes_empty(*nodes)) | 288 | } else if (nodes_empty(*nodes)) |
289 | return ERR_PTR(-EINVAL); | 289 | return ERR_PTR(-EINVAL); |
290 | policy = kmem_cache_alloc(policy_cache, GFP_KERNEL); | 290 | policy = kmem_cache_alloc(policy_cache, GFP_KERNEL); |
291 | if (!policy) | 291 | if (!policy) |
292 | return ERR_PTR(-ENOMEM); | 292 | return ERR_PTR(-ENOMEM); |
293 | atomic_set(&policy->refcnt, 1); | 293 | atomic_set(&policy->refcnt, 1); |
294 | policy->mode = mode; | 294 | policy->mode = mode; |
295 | policy->flags = flags; | 295 | policy->flags = flags; |
296 | 296 | ||
297 | return policy; | 297 | return policy; |
298 | } | 298 | } |
299 | 299 | ||
300 | /* Slow path of a mpol destructor. */ | 300 | /* Slow path of a mpol destructor. */ |
301 | void __mpol_put(struct mempolicy *p) | 301 | void __mpol_put(struct mempolicy *p) |
302 | { | 302 | { |
303 | if (!atomic_dec_and_test(&p->refcnt)) | 303 | if (!atomic_dec_and_test(&p->refcnt)) |
304 | return; | 304 | return; |
305 | kmem_cache_free(policy_cache, p); | 305 | kmem_cache_free(policy_cache, p); |
306 | } | 306 | } |
307 | 307 | ||
308 | static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes, | 308 | static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes, |
309 | enum mpol_rebind_step step) | 309 | enum mpol_rebind_step step) |
310 | { | 310 | { |
311 | } | 311 | } |
312 | 312 | ||
313 | /* | 313 | /* |
314 | * step: | 314 | * step: |
315 | * MPOL_REBIND_ONCE - do rebind work at once | 315 | * MPOL_REBIND_ONCE - do rebind work at once |
316 | * MPOL_REBIND_STEP1 - set all the new nodes | 316 | * MPOL_REBIND_STEP1 - set all the new nodes |
317 | * MPOL_REBIND_STEP2 - clean all the disallowed nodes | 317 | * MPOL_REBIND_STEP2 - clean all the disallowed nodes |
318 | */ | 318 | */ |
319 | static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes, | 319 | static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes, |
320 | enum mpol_rebind_step step) | 320 | enum mpol_rebind_step step) |
321 | { | 321 | { |
322 | nodemask_t tmp; | 322 | nodemask_t tmp; |
323 | 323 | ||
324 | if (pol->flags & MPOL_F_STATIC_NODES) | 324 | if (pol->flags & MPOL_F_STATIC_NODES) |
325 | nodes_and(tmp, pol->w.user_nodemask, *nodes); | 325 | nodes_and(tmp, pol->w.user_nodemask, *nodes); |
326 | else if (pol->flags & MPOL_F_RELATIVE_NODES) | 326 | else if (pol->flags & MPOL_F_RELATIVE_NODES) |
327 | mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes); | 327 | mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes); |
328 | else { | 328 | else { |
329 | /* | 329 | /* |
330 | * if step == 1, we use ->w.cpuset_mems_allowed to cache the | 330 | * if step == 1, we use ->w.cpuset_mems_allowed to cache the |
331 | * result | 331 | * result |
332 | */ | 332 | */ |
333 | if (step == MPOL_REBIND_ONCE || step == MPOL_REBIND_STEP1) { | 333 | if (step == MPOL_REBIND_ONCE || step == MPOL_REBIND_STEP1) { |
334 | nodes_remap(tmp, pol->v.nodes, | 334 | nodes_remap(tmp, pol->v.nodes, |
335 | pol->w.cpuset_mems_allowed, *nodes); | 335 | pol->w.cpuset_mems_allowed, *nodes); |
336 | pol->w.cpuset_mems_allowed = step ? tmp : *nodes; | 336 | pol->w.cpuset_mems_allowed = step ? tmp : *nodes; |
337 | } else if (step == MPOL_REBIND_STEP2) { | 337 | } else if (step == MPOL_REBIND_STEP2) { |
338 | tmp = pol->w.cpuset_mems_allowed; | 338 | tmp = pol->w.cpuset_mems_allowed; |
339 | pol->w.cpuset_mems_allowed = *nodes; | 339 | pol->w.cpuset_mems_allowed = *nodes; |
340 | } else | 340 | } else |
341 | BUG(); | 341 | BUG(); |
342 | } | 342 | } |
343 | 343 | ||
344 | if (nodes_empty(tmp)) | 344 | if (nodes_empty(tmp)) |
345 | tmp = *nodes; | 345 | tmp = *nodes; |
346 | 346 | ||
347 | if (step == MPOL_REBIND_STEP1) | 347 | if (step == MPOL_REBIND_STEP1) |
348 | nodes_or(pol->v.nodes, pol->v.nodes, tmp); | 348 | nodes_or(pol->v.nodes, pol->v.nodes, tmp); |
349 | else if (step == MPOL_REBIND_ONCE || step == MPOL_REBIND_STEP2) | 349 | else if (step == MPOL_REBIND_ONCE || step == MPOL_REBIND_STEP2) |
350 | pol->v.nodes = tmp; | 350 | pol->v.nodes = tmp; |
351 | else | 351 | else |
352 | BUG(); | 352 | BUG(); |
353 | 353 | ||
354 | if (!node_isset(current->il_next, tmp)) { | 354 | if (!node_isset(current->il_next, tmp)) { |
355 | current->il_next = next_node(current->il_next, tmp); | 355 | current->il_next = next_node(current->il_next, tmp); |
356 | if (current->il_next >= MAX_NUMNODES) | 356 | if (current->il_next >= MAX_NUMNODES) |
357 | current->il_next = first_node(tmp); | 357 | current->il_next = first_node(tmp); |
358 | if (current->il_next >= MAX_NUMNODES) | 358 | if (current->il_next >= MAX_NUMNODES) |
359 | current->il_next = numa_node_id(); | 359 | current->il_next = numa_node_id(); |
360 | } | 360 | } |
361 | } | 361 | } |
362 | 362 | ||
363 | static void mpol_rebind_preferred(struct mempolicy *pol, | 363 | static void mpol_rebind_preferred(struct mempolicy *pol, |
364 | const nodemask_t *nodes, | 364 | const nodemask_t *nodes, |
365 | enum mpol_rebind_step step) | 365 | enum mpol_rebind_step step) |
366 | { | 366 | { |
367 | nodemask_t tmp; | 367 | nodemask_t tmp; |
368 | 368 | ||
369 | if (pol->flags & MPOL_F_STATIC_NODES) { | 369 | if (pol->flags & MPOL_F_STATIC_NODES) { |
370 | int node = first_node(pol->w.user_nodemask); | 370 | int node = first_node(pol->w.user_nodemask); |
371 | 371 | ||
372 | if (node_isset(node, *nodes)) { | 372 | if (node_isset(node, *nodes)) { |
373 | pol->v.preferred_node = node; | 373 | pol->v.preferred_node = node; |
374 | pol->flags &= ~MPOL_F_LOCAL; | 374 | pol->flags &= ~MPOL_F_LOCAL; |
375 | } else | 375 | } else |
376 | pol->flags |= MPOL_F_LOCAL; | 376 | pol->flags |= MPOL_F_LOCAL; |
377 | } else if (pol->flags & MPOL_F_RELATIVE_NODES) { | 377 | } else if (pol->flags & MPOL_F_RELATIVE_NODES) { |
378 | mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes); | 378 | mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes); |
379 | pol->v.preferred_node = first_node(tmp); | 379 | pol->v.preferred_node = first_node(tmp); |
380 | } else if (!(pol->flags & MPOL_F_LOCAL)) { | 380 | } else if (!(pol->flags & MPOL_F_LOCAL)) { |
381 | pol->v.preferred_node = node_remap(pol->v.preferred_node, | 381 | pol->v.preferred_node = node_remap(pol->v.preferred_node, |
382 | pol->w.cpuset_mems_allowed, | 382 | pol->w.cpuset_mems_allowed, |
383 | *nodes); | 383 | *nodes); |
384 | pol->w.cpuset_mems_allowed = *nodes; | 384 | pol->w.cpuset_mems_allowed = *nodes; |
385 | } | 385 | } |
386 | } | 386 | } |
387 | 387 | ||
388 | /* | 388 | /* |
389 | * mpol_rebind_policy - Migrate a policy to a different set of nodes | 389 | * mpol_rebind_policy - Migrate a policy to a different set of nodes |
390 | * | 390 | * |
391 | * If the read-side task has no lock to protect task->mempolicy, the | 391 | * If the read-side task has no lock to protect task->mempolicy, the |
392 | * write-side task rebinds task->mempolicy in two steps. The first step | 392 | * write-side task rebinds task->mempolicy in two steps. The first step |
393 | * sets all the new nodes, and the second step clears all the | 393 | * sets all the new nodes, and the second step clears all the |
394 | * disallowed nodes. This way we never end up with no node left to | 394 | * disallowed nodes. This way we never end up with no node left to |
395 | * allocate pages from. | 395 | * allocate pages from. |
396 | * If the read side does hold a lock protecting task->mempolicy, we | 396 | * If the read side does hold a lock protecting task->mempolicy, we |
397 | * rebind directly. | 397 | * rebind directly. |
398 | * | 398 | * |
399 | * step: | 399 | * step: |
400 | * MPOL_REBIND_ONCE - do rebind work at once | 400 | * MPOL_REBIND_ONCE - do rebind work at once |
401 | * MPOL_REBIND_STEP1 - set all the new nodes | 401 | * MPOL_REBIND_STEP1 - set all the new nodes |
402 | * MPOL_REBIND_STEP2 - clean all the disallowed nodes | 402 | * MPOL_REBIND_STEP2 - clean all the disallowed nodes |
403 | */ | 403 | */ |
404 | static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask, | 404 | static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask, |
405 | enum mpol_rebind_step step) | 405 | enum mpol_rebind_step step) |
406 | { | 406 | { |
407 | if (!pol) | 407 | if (!pol) |
408 | return; | 408 | return; |
409 | if (!mpol_store_user_nodemask(pol) && step == MPOL_REBIND_ONCE && | 409 | if (!mpol_store_user_nodemask(pol) && step == MPOL_REBIND_ONCE && |
410 | nodes_equal(pol->w.cpuset_mems_allowed, *newmask)) | 410 | nodes_equal(pol->w.cpuset_mems_allowed, *newmask)) |
411 | return; | 411 | return; |
412 | 412 | ||
413 | if (step == MPOL_REBIND_STEP1 && (pol->flags & MPOL_F_REBINDING)) | 413 | if (step == MPOL_REBIND_STEP1 && (pol->flags & MPOL_F_REBINDING)) |
414 | return; | 414 | return; |
415 | 415 | ||
416 | if (step == MPOL_REBIND_STEP2 && !(pol->flags & MPOL_F_REBINDING)) | 416 | if (step == MPOL_REBIND_STEP2 && !(pol->flags & MPOL_F_REBINDING)) |
417 | BUG(); | 417 | BUG(); |
418 | 418 | ||
419 | if (step == MPOL_REBIND_STEP1) | 419 | if (step == MPOL_REBIND_STEP1) |
420 | pol->flags |= MPOL_F_REBINDING; | 420 | pol->flags |= MPOL_F_REBINDING; |
421 | else if (step == MPOL_REBIND_STEP2) | 421 | else if (step == MPOL_REBIND_STEP2) |
422 | pol->flags &= ~MPOL_F_REBINDING; | 422 | pol->flags &= ~MPOL_F_REBINDING; |
423 | else if (step >= MPOL_REBIND_NSTEP) | 423 | else if (step >= MPOL_REBIND_NSTEP) |
424 | BUG(); | 424 | BUG(); |
425 | 425 | ||
426 | mpol_ops[pol->mode].rebind(pol, newmask, step); | 426 | mpol_ops[pol->mode].rebind(pol, newmask, step); |
427 | } | 427 | } |
428 | 428 | ||
429 | /* | 429 | /* |
430 | * Wrapper for mpol_rebind_policy() that just requires task | 430 | * Wrapper for mpol_rebind_policy() that just requires task |
431 | * pointer, and updates task mempolicy. | 431 | * pointer, and updates task mempolicy. |
432 | * | 432 | * |
433 | * Called with task's alloc_lock held. | 433 | * Called with task's alloc_lock held. |
434 | */ | 434 | */ |
435 | 435 | ||
436 | void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new, | 436 | void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new, |
437 | enum mpol_rebind_step step) | 437 | enum mpol_rebind_step step) |
438 | { | 438 | { |
439 | mpol_rebind_policy(tsk->mempolicy, new, step); | 439 | mpol_rebind_policy(tsk->mempolicy, new, step); |
440 | } | 440 | } |
441 | 441 | ||
442 | /* | 442 | /* |
443 | * Rebind each vma in mm to new nodemask. | 443 | * Rebind each vma in mm to new nodemask. |
444 | * | 444 | * |
445 | * Call holding a reference to mm. Takes mm->mmap_sem during call. | 445 | * Call holding a reference to mm. Takes mm->mmap_sem during call. |
446 | */ | 446 | */ |
447 | 447 | ||
448 | void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new) | 448 | void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new) |
449 | { | 449 | { |
450 | struct vm_area_struct *vma; | 450 | struct vm_area_struct *vma; |
451 | 451 | ||
452 | down_write(&mm->mmap_sem); | 452 | down_write(&mm->mmap_sem); |
453 | for (vma = mm->mmap; vma; vma = vma->vm_next) | 453 | for (vma = mm->mmap; vma; vma = vma->vm_next) |
454 | mpol_rebind_policy(vma->vm_policy, new, MPOL_REBIND_ONCE); | 454 | mpol_rebind_policy(vma->vm_policy, new, MPOL_REBIND_ONCE); |
455 | up_write(&mm->mmap_sem); | 455 | up_write(&mm->mmap_sem); |
456 | } | 456 | } |
457 | 457 | ||
458 | static const struct mempolicy_operations mpol_ops[MPOL_MAX] = { | 458 | static const struct mempolicy_operations mpol_ops[MPOL_MAX] = { |
459 | [MPOL_DEFAULT] = { | 459 | [MPOL_DEFAULT] = { |
460 | .rebind = mpol_rebind_default, | 460 | .rebind = mpol_rebind_default, |
461 | }, | 461 | }, |
462 | [MPOL_INTERLEAVE] = { | 462 | [MPOL_INTERLEAVE] = { |
463 | .create = mpol_new_interleave, | 463 | .create = mpol_new_interleave, |
464 | .rebind = mpol_rebind_nodemask, | 464 | .rebind = mpol_rebind_nodemask, |
465 | }, | 465 | }, |
466 | [MPOL_PREFERRED] = { | 466 | [MPOL_PREFERRED] = { |
467 | .create = mpol_new_preferred, | 467 | .create = mpol_new_preferred, |
468 | .rebind = mpol_rebind_preferred, | 468 | .rebind = mpol_rebind_preferred, |
469 | }, | 469 | }, |
470 | [MPOL_BIND] = { | 470 | [MPOL_BIND] = { |
471 | .create = mpol_new_bind, | 471 | .create = mpol_new_bind, |
472 | .rebind = mpol_rebind_nodemask, | 472 | .rebind = mpol_rebind_nodemask, |
473 | }, | 473 | }, |
474 | }; | 474 | }; |
475 | 475 | ||
476 | static void migrate_page_add(struct page *page, struct list_head *pagelist, | 476 | static void migrate_page_add(struct page *page, struct list_head *pagelist, |
477 | unsigned long flags); | 477 | unsigned long flags); |
478 | 478 | ||
479 | /* | 479 | /* |
480 | * Scan through pages checking if pages follow certain conditions, | 480 | * Scan through pages checking if pages follow certain conditions, |
481 | * and move them to the pagelist if they do. | 481 | * and move them to the pagelist if they do. |
482 | */ | 482 | */ |
483 | static int queue_pages_pte_range(struct vm_area_struct *vma, pmd_t *pmd, | 483 | static int queue_pages_pte_range(struct vm_area_struct *vma, pmd_t *pmd, |
484 | unsigned long addr, unsigned long end, | 484 | unsigned long addr, unsigned long end, |
485 | const nodemask_t *nodes, unsigned long flags, | 485 | const nodemask_t *nodes, unsigned long flags, |
486 | void *private) | 486 | void *private) |
487 | { | 487 | { |
488 | pte_t *orig_pte; | 488 | pte_t *orig_pte; |
489 | pte_t *pte; | 489 | pte_t *pte; |
490 | spinlock_t *ptl; | 490 | spinlock_t *ptl; |
491 | 491 | ||
492 | orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); | 492 | orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); |
493 | do { | 493 | do { |
494 | struct page *page; | 494 | struct page *page; |
495 | int nid; | 495 | int nid; |
496 | 496 | ||
497 | if (!pte_present(*pte)) | 497 | if (!pte_present(*pte)) |
498 | continue; | 498 | continue; |
499 | page = vm_normal_page(vma, addr, *pte); | 499 | page = vm_normal_page(vma, addr, *pte); |
500 | if (!page) | 500 | if (!page) |
501 | continue; | 501 | continue; |
502 | /* | 502 | /* |
503 | * vm_normal_page() filters out zero pages, but there might | 503 | * vm_normal_page() filters out zero pages, but there might |
504 | * still be PageReserved pages to skip, perhaps in a VDSO. | 504 | * still be PageReserved pages to skip, perhaps in a VDSO. |
505 | */ | 505 | */ |
506 | if (PageReserved(page)) | 506 | if (PageReserved(page)) |
507 | continue; | 507 | continue; |
508 | nid = page_to_nid(page); | 508 | nid = page_to_nid(page); |
509 | if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT)) | 509 | if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT)) |
510 | continue; | 510 | continue; |
511 | 511 | ||
512 | if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) | 512 | if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) |
513 | migrate_page_add(page, private, flags); | 513 | migrate_page_add(page, private, flags); |
514 | else | 514 | else |
515 | break; | 515 | break; |
516 | } while (pte++, addr += PAGE_SIZE, addr != end); | 516 | } while (pte++, addr += PAGE_SIZE, addr != end); |
517 | pte_unmap_unlock(orig_pte, ptl); | 517 | pte_unmap_unlock(orig_pte, ptl); |
518 | return addr != end; | 518 | return addr != end; |
519 | } | 519 | } |
520 | 520 | ||
521 | static void queue_pages_hugetlb_pmd_range(struct vm_area_struct *vma, | 521 | static void queue_pages_hugetlb_pmd_range(struct vm_area_struct *vma, |
522 | pmd_t *pmd, const nodemask_t *nodes, unsigned long flags, | 522 | pmd_t *pmd, const nodemask_t *nodes, unsigned long flags, |
523 | void *private) | 523 | void *private) |
524 | { | 524 | { |
525 | #ifdef CONFIG_HUGETLB_PAGE | 525 | #ifdef CONFIG_HUGETLB_PAGE |
526 | int nid; | 526 | int nid; |
527 | struct page *page; | 527 | struct page *page; |
528 | spinlock_t *ptl; | 528 | spinlock_t *ptl; |
529 | pte_t entry; | ||
529 | 530 | ||
530 | ptl = huge_pte_lock(hstate_vma(vma), vma->vm_mm, (pte_t *)pmd); | 531 | ptl = huge_pte_lock(hstate_vma(vma), vma->vm_mm, (pte_t *)pmd); |
531 | page = pte_page(huge_ptep_get((pte_t *)pmd)); | 532 | entry = huge_ptep_get((pte_t *)pmd); |
533 | if (!pte_present(entry)) | ||
534 | goto unlock; | ||
535 | page = pte_page(entry); | ||
532 | nid = page_to_nid(page); | 536 | nid = page_to_nid(page); |
533 | if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT)) | 537 | if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT)) |
534 | goto unlock; | 538 | goto unlock; |
535 | /* With MPOL_MF_MOVE, we migrate only unshared hugepage. */ | 539 | /* With MPOL_MF_MOVE, we migrate only unshared hugepage. */ |
536 | if (flags & (MPOL_MF_MOVE_ALL) || | 540 | if (flags & (MPOL_MF_MOVE_ALL) || |
537 | (flags & MPOL_MF_MOVE && page_mapcount(page) == 1)) | 541 | (flags & MPOL_MF_MOVE && page_mapcount(page) == 1)) |
538 | isolate_huge_page(page, private); | 542 | isolate_huge_page(page, private); |
539 | unlock: | 543 | unlock: |
540 | spin_unlock(ptl); | 544 | spin_unlock(ptl); |
541 | #else | 545 | #else |
542 | BUG(); | 546 | BUG(); |
543 | #endif | 547 | #endif |
544 | } | 548 | } |
545 | 549 | ||
546 | static inline int queue_pages_pmd_range(struct vm_area_struct *vma, pud_t *pud, | 550 | static inline int queue_pages_pmd_range(struct vm_area_struct *vma, pud_t *pud, |
547 | unsigned long addr, unsigned long end, | 551 | unsigned long addr, unsigned long end, |
548 | const nodemask_t *nodes, unsigned long flags, | 552 | const nodemask_t *nodes, unsigned long flags, |
549 | void *private) | 553 | void *private) |
550 | { | 554 | { |
551 | pmd_t *pmd; | 555 | pmd_t *pmd; |
552 | unsigned long next; | 556 | unsigned long next; |
553 | 557 | ||
554 | pmd = pmd_offset(pud, addr); | 558 | pmd = pmd_offset(pud, addr); |
555 | do { | 559 | do { |
556 | next = pmd_addr_end(addr, end); | 560 | next = pmd_addr_end(addr, end); |
557 | if (!pmd_present(*pmd)) | 561 | if (!pmd_present(*pmd)) |
558 | continue; | 562 | continue; |
559 | if (pmd_huge(*pmd) && is_vm_hugetlb_page(vma)) { | 563 | if (pmd_huge(*pmd) && is_vm_hugetlb_page(vma)) { |
560 | queue_pages_hugetlb_pmd_range(vma, pmd, nodes, | 564 | queue_pages_hugetlb_pmd_range(vma, pmd, nodes, |
561 | flags, private); | 565 | flags, private); |
562 | continue; | 566 | continue; |
563 | } | 567 | } |
564 | split_huge_page_pmd(vma, addr, pmd); | 568 | split_huge_page_pmd(vma, addr, pmd); |
565 | if (pmd_none_or_trans_huge_or_clear_bad(pmd)) | 569 | if (pmd_none_or_trans_huge_or_clear_bad(pmd)) |
566 | continue; | 570 | continue; |
567 | if (queue_pages_pte_range(vma, pmd, addr, next, nodes, | 571 | if (queue_pages_pte_range(vma, pmd, addr, next, nodes, |
568 | flags, private)) | 572 | flags, private)) |
569 | return -EIO; | 573 | return -EIO; |
570 | } while (pmd++, addr = next, addr != end); | 574 | } while (pmd++, addr = next, addr != end); |
571 | return 0; | 575 | return 0; |
572 | } | 576 | } |
573 | 577 | ||
574 | static inline int queue_pages_pud_range(struct vm_area_struct *vma, pgd_t *pgd, | 578 | static inline int queue_pages_pud_range(struct vm_area_struct *vma, pgd_t *pgd, |
575 | unsigned long addr, unsigned long end, | 579 | unsigned long addr, unsigned long end, |
576 | const nodemask_t *nodes, unsigned long flags, | 580 | const nodemask_t *nodes, unsigned long flags, |
577 | void *private) | 581 | void *private) |
578 | { | 582 | { |
579 | pud_t *pud; | 583 | pud_t *pud; |
580 | unsigned long next; | 584 | unsigned long next; |
581 | 585 | ||
582 | pud = pud_offset(pgd, addr); | 586 | pud = pud_offset(pgd, addr); |
583 | do { | 587 | do { |
584 | next = pud_addr_end(addr, end); | 588 | next = pud_addr_end(addr, end); |
585 | if (pud_huge(*pud) && is_vm_hugetlb_page(vma)) | 589 | if (pud_huge(*pud) && is_vm_hugetlb_page(vma)) |
586 | continue; | 590 | continue; |
587 | if (pud_none_or_clear_bad(pud)) | 591 | if (pud_none_or_clear_bad(pud)) |
588 | continue; | 592 | continue; |
589 | if (queue_pages_pmd_range(vma, pud, addr, next, nodes, | 593 | if (queue_pages_pmd_range(vma, pud, addr, next, nodes, |
590 | flags, private)) | 594 | flags, private)) |
591 | return -EIO; | 595 | return -EIO; |
592 | } while (pud++, addr = next, addr != end); | 596 | } while (pud++, addr = next, addr != end); |
593 | return 0; | 597 | return 0; |
594 | } | 598 | } |
595 | 599 | ||
596 | static inline int queue_pages_pgd_range(struct vm_area_struct *vma, | 600 | static inline int queue_pages_pgd_range(struct vm_area_struct *vma, |
597 | unsigned long addr, unsigned long end, | 601 | unsigned long addr, unsigned long end, |
598 | const nodemask_t *nodes, unsigned long flags, | 602 | const nodemask_t *nodes, unsigned long flags, |
599 | void *private) | 603 | void *private) |
600 | { | 604 | { |
601 | pgd_t *pgd; | 605 | pgd_t *pgd; |
602 | unsigned long next; | 606 | unsigned long next; |
603 | 607 | ||
604 | pgd = pgd_offset(vma->vm_mm, addr); | 608 | pgd = pgd_offset(vma->vm_mm, addr); |
605 | do { | 609 | do { |
606 | next = pgd_addr_end(addr, end); | 610 | next = pgd_addr_end(addr, end); |
607 | if (pgd_none_or_clear_bad(pgd)) | 611 | if (pgd_none_or_clear_bad(pgd)) |
608 | continue; | 612 | continue; |
609 | if (queue_pages_pud_range(vma, pgd, addr, next, nodes, | 613 | if (queue_pages_pud_range(vma, pgd, addr, next, nodes, |
610 | flags, private)) | 614 | flags, private)) |
611 | return -EIO; | 615 | return -EIO; |
612 | } while (pgd++, addr = next, addr != end); | 616 | } while (pgd++, addr = next, addr != end); |
613 | return 0; | 617 | return 0; |
614 | } | 618 | } |
615 | 619 | ||
616 | #ifdef CONFIG_NUMA_BALANCING | 620 | #ifdef CONFIG_NUMA_BALANCING |
617 | /* | 621 | /* |
618 | * This is used to mark a range of virtual addresses to be inaccessible. | 622 | * This is used to mark a range of virtual addresses to be inaccessible. |
619 | * These are later cleared by a NUMA hinting fault. Depending on these | 623 | * These are later cleared by a NUMA hinting fault. Depending on these |
620 | * faults, pages may be migrated for better NUMA placement. | 624 | * faults, pages may be migrated for better NUMA placement. |
621 | * | 625 | * |
622 | * This is assuming that NUMA faults are handled using PROT_NONE. If | 626 | * This is assuming that NUMA faults are handled using PROT_NONE. If |
623 | * an architecture makes a different choice, it will need further | 627 | * an architecture makes a different choice, it will need further |
624 | * changes to the core. | 628 | * changes to the core. |
625 | */ | 629 | */ |
626 | unsigned long change_prot_numa(struct vm_area_struct *vma, | 630 | unsigned long change_prot_numa(struct vm_area_struct *vma, |
627 | unsigned long addr, unsigned long end) | 631 | unsigned long addr, unsigned long end) |
628 | { | 632 | { |
629 | int nr_updated; | 633 | int nr_updated; |
630 | 634 | ||
631 | nr_updated = change_protection(vma, addr, end, vma->vm_page_prot, 0, 1); | 635 | nr_updated = change_protection(vma, addr, end, vma->vm_page_prot, 0, 1); |
632 | if (nr_updated) | 636 | if (nr_updated) |
633 | count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated); | 637 | count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated); |
634 | 638 | ||
635 | return nr_updated; | 639 | return nr_updated; |
636 | } | 640 | } |
637 | #else | 641 | #else |
638 | static unsigned long change_prot_numa(struct vm_area_struct *vma, | 642 | static unsigned long change_prot_numa(struct vm_area_struct *vma, |
639 | unsigned long addr, unsigned long end) | 643 | unsigned long addr, unsigned long end) |
640 | { | 644 | { |
641 | return 0; | 645 | return 0; |
642 | } | 646 | } |
643 | #endif /* CONFIG_NUMA_BALANCING */ | 647 | #endif /* CONFIG_NUMA_BALANCING */ |
644 | 648 | ||
645 | /* | 649 | /* |
646 | * Walk through page tables and collect pages to be migrated. | 650 | * Walk through page tables and collect pages to be migrated. |
647 | * | 651 | * |
648 | * If pages found in a given range are on a set of nodes (determined by | 652 | * If pages found in a given range are on a set of nodes (determined by |
649 | * @nodes and @flags), they are isolated and queued to the pagelist | 653 | * @nodes and @flags), they are isolated and queued to the pagelist |
650 | * passed via @private. | 654 | * passed via @private. |
651 | */ | 655 | */ |
652 | static struct vm_area_struct * | 656 | static struct vm_area_struct * |
653 | queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end, | 657 | queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end, |
654 | const nodemask_t *nodes, unsigned long flags, void *private) | 658 | const nodemask_t *nodes, unsigned long flags, void *private) |
655 | { | 659 | { |
656 | int err; | 660 | int err; |
657 | struct vm_area_struct *first, *vma, *prev; | 661 | struct vm_area_struct *first, *vma, *prev; |
658 | 662 | ||
659 | 663 | ||
660 | first = find_vma(mm, start); | 664 | first = find_vma(mm, start); |
661 | if (!first) | 665 | if (!first) |
662 | return ERR_PTR(-EFAULT); | 666 | return ERR_PTR(-EFAULT); |
663 | prev = NULL; | 667 | prev = NULL; |
664 | for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) { | 668 | for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) { |
665 | unsigned long endvma = vma->vm_end; | 669 | unsigned long endvma = vma->vm_end; |
666 | 670 | ||
667 | if (endvma > end) | 671 | if (endvma > end) |
668 | endvma = end; | 672 | endvma = end; |
669 | if (vma->vm_start > start) | 673 | if (vma->vm_start > start) |
670 | start = vma->vm_start; | 674 | start = vma->vm_start; |
671 | 675 | ||
672 | if (!(flags & MPOL_MF_DISCONTIG_OK)) { | 676 | if (!(flags & MPOL_MF_DISCONTIG_OK)) { |
673 | if (!vma->vm_next && vma->vm_end < end) | 677 | if (!vma->vm_next && vma->vm_end < end) |
674 | return ERR_PTR(-EFAULT); | 678 | return ERR_PTR(-EFAULT); |
675 | if (prev && prev->vm_end < vma->vm_start) | 679 | if (prev && prev->vm_end < vma->vm_start) |
676 | return ERR_PTR(-EFAULT); | 680 | return ERR_PTR(-EFAULT); |
677 | } | 681 | } |
678 | 682 | ||
679 | if (flags & MPOL_MF_LAZY) { | 683 | if (flags & MPOL_MF_LAZY) { |
680 | change_prot_numa(vma, start, endvma); | 684 | change_prot_numa(vma, start, endvma); |
681 | goto next; | 685 | goto next; |
682 | } | 686 | } |
683 | 687 | ||
684 | if ((flags & MPOL_MF_STRICT) || | 688 | if ((flags & MPOL_MF_STRICT) || |
685 | ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) && | 689 | ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) && |
686 | vma_migratable(vma))) { | 690 | vma_migratable(vma))) { |
687 | 691 | ||
688 | err = queue_pages_pgd_range(vma, start, endvma, nodes, | 692 | err = queue_pages_pgd_range(vma, start, endvma, nodes, |
689 | flags, private); | 693 | flags, private); |
690 | if (err) { | 694 | if (err) { |
691 | first = ERR_PTR(err); | 695 | first = ERR_PTR(err); |
692 | break; | 696 | break; |
693 | } | 697 | } |
694 | } | 698 | } |
695 | next: | 699 | next: |
696 | prev = vma; | 700 | prev = vma; |
697 | } | 701 | } |
698 | return first; | 702 | return first; |
699 | } | 703 | } |
700 | 704 | ||
701 | /* | 705 | /* |
702 | * Apply policy to a single VMA | 706 | * Apply policy to a single VMA |
703 | * This must be called with the mmap_sem held for writing. | 707 | * This must be called with the mmap_sem held for writing. |
704 | */ | 708 | */ |
705 | static int vma_replace_policy(struct vm_area_struct *vma, | 709 | static int vma_replace_policy(struct vm_area_struct *vma, |
706 | struct mempolicy *pol) | 710 | struct mempolicy *pol) |
707 | { | 711 | { |
708 | int err; | 712 | int err; |
709 | struct mempolicy *old; | 713 | struct mempolicy *old; |
710 | struct mempolicy *new; | 714 | struct mempolicy *new; |
711 | 715 | ||
712 | pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n", | 716 | pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n", |
713 | vma->vm_start, vma->vm_end, vma->vm_pgoff, | 717 | vma->vm_start, vma->vm_end, vma->vm_pgoff, |
714 | vma->vm_ops, vma->vm_file, | 718 | vma->vm_ops, vma->vm_file, |
715 | vma->vm_ops ? vma->vm_ops->set_policy : NULL); | 719 | vma->vm_ops ? vma->vm_ops->set_policy : NULL); |
716 | 720 | ||
717 | new = mpol_dup(pol); | 721 | new = mpol_dup(pol); |
718 | if (IS_ERR(new)) | 722 | if (IS_ERR(new)) |
719 | return PTR_ERR(new); | 723 | return PTR_ERR(new); |
720 | 724 | ||
721 | if (vma->vm_ops && vma->vm_ops->set_policy) { | 725 | if (vma->vm_ops && vma->vm_ops->set_policy) { |
722 | err = vma->vm_ops->set_policy(vma, new); | 726 | err = vma->vm_ops->set_policy(vma, new); |
723 | if (err) | 727 | if (err) |
724 | goto err_out; | 728 | goto err_out; |
725 | } | 729 | } |
726 | 730 | ||
727 | old = vma->vm_policy; | 731 | old = vma->vm_policy; |
728 | vma->vm_policy = new; /* protected by mmap_sem */ | 732 | vma->vm_policy = new; /* protected by mmap_sem */ |
729 | mpol_put(old); | 733 | mpol_put(old); |
730 | 734 | ||
731 | return 0; | 735 | return 0; |
732 | err_out: | 736 | err_out: |
733 | mpol_put(new); | 737 | mpol_put(new); |
734 | return err; | 738 | return err; |
735 | } | 739 | } |
736 | 740 | ||
737 | /* Step 2: apply policy to a range and do splits. */ | 741 | /* Step 2: apply policy to a range and do splits. */ |
738 | static int mbind_range(struct mm_struct *mm, unsigned long start, | 742 | static int mbind_range(struct mm_struct *mm, unsigned long start, |
739 | unsigned long end, struct mempolicy *new_pol) | 743 | unsigned long end, struct mempolicy *new_pol) |
740 | { | 744 | { |
741 | struct vm_area_struct *next; | 745 | struct vm_area_struct *next; |
742 | struct vm_area_struct *prev; | 746 | struct vm_area_struct *prev; |
743 | struct vm_area_struct *vma; | 747 | struct vm_area_struct *vma; |
744 | int err = 0; | 748 | int err = 0; |
745 | pgoff_t pgoff; | 749 | pgoff_t pgoff; |
746 | unsigned long vmstart; | 750 | unsigned long vmstart; |
747 | unsigned long vmend; | 751 | unsigned long vmend; |
748 | 752 | ||
749 | vma = find_vma(mm, start); | 753 | vma = find_vma(mm, start); |
750 | if (!vma || vma->vm_start > start) | 754 | if (!vma || vma->vm_start > start) |
751 | return -EFAULT; | 755 | return -EFAULT; |
752 | 756 | ||
753 | prev = vma->vm_prev; | 757 | prev = vma->vm_prev; |
754 | if (start > vma->vm_start) | 758 | if (start > vma->vm_start) |
755 | prev = vma; | 759 | prev = vma; |
756 | 760 | ||
757 | for (; vma && vma->vm_start < end; prev = vma, vma = next) { | 761 | for (; vma && vma->vm_start < end; prev = vma, vma = next) { |
758 | next = vma->vm_next; | 762 | next = vma->vm_next; |
759 | vmstart = max(start, vma->vm_start); | 763 | vmstart = max(start, vma->vm_start); |
760 | vmend = min(end, vma->vm_end); | 764 | vmend = min(end, vma->vm_end); |
761 | 765 | ||
762 | if (mpol_equal(vma_policy(vma), new_pol)) | 766 | if (mpol_equal(vma_policy(vma), new_pol)) |
763 | continue; | 767 | continue; |
764 | 768 | ||
765 | pgoff = vma->vm_pgoff + | 769 | pgoff = vma->vm_pgoff + |
766 | ((vmstart - vma->vm_start) >> PAGE_SHIFT); | 770 | ((vmstart - vma->vm_start) >> PAGE_SHIFT); |
767 | prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags, | 771 | prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags, |
768 | vma->anon_vma, vma->vm_file, pgoff, | 772 | vma->anon_vma, vma->vm_file, pgoff, |
769 | new_pol); | 773 | new_pol); |
770 | if (prev) { | 774 | if (prev) { |
771 | vma = prev; | 775 | vma = prev; |
772 | next = vma->vm_next; | 776 | next = vma->vm_next; |
773 | if (mpol_equal(vma_policy(vma), new_pol)) | 777 | if (mpol_equal(vma_policy(vma), new_pol)) |
774 | continue; | 778 | continue; |
775 | /* vma_merge() joined vma && vma->next, case 8 */ | 779 | /* vma_merge() joined vma && vma->next, case 8 */ |
776 | goto replace; | 780 | goto replace; |
777 | } | 781 | } |
778 | if (vma->vm_start != vmstart) { | 782 | if (vma->vm_start != vmstart) { |
779 | err = split_vma(vma->vm_mm, vma, vmstart, 1); | 783 | err = split_vma(vma->vm_mm, vma, vmstart, 1); |
780 | if (err) | 784 | if (err) |
781 | goto out; | 785 | goto out; |
782 | } | 786 | } |
783 | if (vma->vm_end != vmend) { | 787 | if (vma->vm_end != vmend) { |
784 | err = split_vma(vma->vm_mm, vma, vmend, 0); | 788 | err = split_vma(vma->vm_mm, vma, vmend, 0); |
785 | if (err) | 789 | if (err) |
786 | goto out; | 790 | goto out; |
787 | } | 791 | } |
788 | replace: | 792 | replace: |
789 | err = vma_replace_policy(vma, new_pol); | 793 | err = vma_replace_policy(vma, new_pol); |
790 | if (err) | 794 | if (err) |
791 | goto out; | 795 | goto out; |
792 | } | 796 | } |
793 | 797 | ||
794 | out: | 798 | out: |
795 | return err; | 799 | return err; |
796 | } | 800 | } |
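queue_pages_range() and mbind_range() above are the kernel half of the mbind(2) path: the former walks the page tables and isolates misplaced pages, the latter splits VMAs as needed and installs the new policy. For context, a minimal userspace sketch of driving that path, assuming a Linux box with libnuma's <numaif.h> wrapper (link with -lnuma) and node 0 online:

#include <numaif.h>		/* mbind(), MPOL_BIND, MPOL_MF_MOVE */
#include <sys/mman.h>
#include <stdio.h>

int main(void)
{
	size_t len = 4UL << 20;			/* a 4 MiB anonymous mapping */
	unsigned long mask = 1UL << 0;		/* nodemask containing node 0 */
	void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (p == MAP_FAILED)
		return 1;

	/* Bind the range to node 0; MPOL_MF_MOVE also migrates pages already faulted in. */
	if (mbind(p, len, MPOL_BIND, &mask, 8 * sizeof(mask), MPOL_MF_MOVE) != 0)
		perror("mbind");
	return 0;
}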
797 | 801 | ||
798 | /* Set the process memory policy */ | 802 | /* Set the process memory policy */ |
799 | static long do_set_mempolicy(unsigned short mode, unsigned short flags, | 803 | static long do_set_mempolicy(unsigned short mode, unsigned short flags, |
800 | nodemask_t *nodes) | 804 | nodemask_t *nodes) |
801 | { | 805 | { |
802 | struct mempolicy *new, *old; | 806 | struct mempolicy *new, *old; |
803 | struct mm_struct *mm = current->mm; | 807 | struct mm_struct *mm = current->mm; |
804 | NODEMASK_SCRATCH(scratch); | 808 | NODEMASK_SCRATCH(scratch); |
805 | int ret; | 809 | int ret; |
806 | 810 | ||
807 | if (!scratch) | 811 | if (!scratch) |
808 | return -ENOMEM; | 812 | return -ENOMEM; |
809 | 813 | ||
810 | new = mpol_new(mode, flags, nodes); | 814 | new = mpol_new(mode, flags, nodes); |
811 | if (IS_ERR(new)) { | 815 | if (IS_ERR(new)) { |
812 | ret = PTR_ERR(new); | 816 | ret = PTR_ERR(new); |
813 | goto out; | 817 | goto out; |
814 | } | 818 | } |
815 | /* | 819 | /* |
816 | * prevent changing our mempolicy while show_numa_maps() | 820 | * prevent changing our mempolicy while show_numa_maps() |
817 | * is using it. | 821 | * is using it. |
818 | * Note: do_set_mempolicy() can be called at init time | 822 | * Note: do_set_mempolicy() can be called at init time |
819 | * with no 'mm'. | 823 | * with no 'mm'. |
820 | */ | 824 | */ |
821 | if (mm) | 825 | if (mm) |
822 | down_write(&mm->mmap_sem); | 826 | down_write(&mm->mmap_sem); |
823 | task_lock(current); | 827 | task_lock(current); |
824 | ret = mpol_set_nodemask(new, nodes, scratch); | 828 | ret = mpol_set_nodemask(new, nodes, scratch); |
825 | if (ret) { | 829 | if (ret) { |
826 | task_unlock(current); | 830 | task_unlock(current); |
827 | if (mm) | 831 | if (mm) |
828 | up_write(&mm->mmap_sem); | 832 | up_write(&mm->mmap_sem); |
829 | mpol_put(new); | 833 | mpol_put(new); |
830 | goto out; | 834 | goto out; |
831 | } | 835 | } |
832 | old = current->mempolicy; | 836 | old = current->mempolicy; |
833 | current->mempolicy = new; | 837 | current->mempolicy = new; |
834 | if (new && new->mode == MPOL_INTERLEAVE && | 838 | if (new && new->mode == MPOL_INTERLEAVE && |
835 | nodes_weight(new->v.nodes)) | 839 | nodes_weight(new->v.nodes)) |
836 | current->il_next = first_node(new->v.nodes); | 840 | current->il_next = first_node(new->v.nodes); |
837 | task_unlock(current); | 841 | task_unlock(current); |
838 | if (mm) | 842 | if (mm) |
839 | up_write(&mm->mmap_sem); | 843 | up_write(&mm->mmap_sem); |
840 | 844 | ||
841 | mpol_put(old); | 845 | mpol_put(old); |
842 | ret = 0; | 846 | ret = 0; |
843 | out: | 847 | out: |
844 | NODEMASK_SCRATCH_FREE(scratch); | 848 | NODEMASK_SCRATCH_FREE(scratch); |
845 | return ret; | 849 | return ret; |
846 | } | 850 | } |
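do_set_mempolicy() above backs the set_mempolicy(2) syscall, which sets the calling task's default policy. A minimal userspace sketch under the same assumptions (libnuma's <numaif.h>, link with -lnuma, nodes 0 and 1 online):

#include <numaif.h>		/* set_mempolicy(), MPOL_INTERLEAVE */
#include <stdio.h>

int main(void)
{
	/* Interleave this task's future page allocations across nodes 0 and 1. */
	unsigned long mask = (1UL << 0) | (1UL << 1);

	if (set_mempolicy(MPOL_INTERLEAVE, &mask, 8 * sizeof(mask)) != 0)
		perror("set_mempolicy");
	return 0;
}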
847 | 851 | ||
848 | /* | 852 | /* |
849 | * Return nodemask for policy for get_mempolicy() query | 853 | * Return nodemask for policy for get_mempolicy() query |
850 | * | 854 | * |
851 | * Called with task's alloc_lock held | 855 | * Called with task's alloc_lock held |
852 | */ | 856 | */ |
853 | static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes) | 857 | static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes) |
854 | { | 858 | { |
855 | nodes_clear(*nodes); | 859 | nodes_clear(*nodes); |
856 | if (p == &default_policy) | 860 | if (p == &default_policy) |
857 | return; | 861 | return; |
858 | 862 | ||
859 | switch (p->mode) { | 863 | switch (p->mode) { |
860 | case MPOL_BIND: | 864 | case MPOL_BIND: |
861 | /* Fall through */ | 865 | /* Fall through */ |
862 | case MPOL_INTERLEAVE: | 866 | case MPOL_INTERLEAVE: |
863 | *nodes = p->v.nodes; | 867 | *nodes = p->v.nodes; |
864 | break; | 868 | break; |
865 | case MPOL_PREFERRED: | 869 | case MPOL_PREFERRED: |
866 | if (!(p->flags & MPOL_F_LOCAL)) | 870 | if (!(p->flags & MPOL_F_LOCAL)) |
867 | node_set(p->v.preferred_node, *nodes); | 871 | node_set(p->v.preferred_node, *nodes); |
868 | /* else return empty node mask for local allocation */ | 872 | /* else return empty node mask for local allocation */ |
869 | break; | 873 | break; |
870 | default: | 874 | default: |
871 | BUG(); | 875 | BUG(); |
872 | } | 876 | } |
873 | } | 877 | } |
874 | 878 | ||
875 | static int lookup_node(struct mm_struct *mm, unsigned long addr) | 879 | static int lookup_node(struct mm_struct *mm, unsigned long addr) |
876 | { | 880 | { |
877 | struct page *p; | 881 | struct page *p; |
878 | int err; | 882 | int err; |
879 | 883 | ||
880 | err = get_user_pages(current, mm, addr & PAGE_MASK, 1, 0, 0, &p, NULL); | 884 | err = get_user_pages(current, mm, addr & PAGE_MASK, 1, 0, 0, &p, NULL); |
881 | if (err >= 0) { | 885 | if (err >= 0) { |
882 | err = page_to_nid(p); | 886 | err = page_to_nid(p); |
883 | put_page(p); | 887 | put_page(p); |
884 | } | 888 | } |
885 | return err; | 889 | return err; |
886 | } | 890 | } |
887 | 891 | ||
888 | /* Retrieve NUMA policy */ | 892 | /* Retrieve NUMA policy */ |
889 | static long do_get_mempolicy(int *policy, nodemask_t *nmask, | 893 | static long do_get_mempolicy(int *policy, nodemask_t *nmask, |
890 | unsigned long addr, unsigned long flags) | 894 | unsigned long addr, unsigned long flags) |
891 | { | 895 | { |
892 | int err; | 896 | int err; |
893 | struct mm_struct *mm = current->mm; | 897 | struct mm_struct *mm = current->mm; |
894 | struct vm_area_struct *vma = NULL; | 898 | struct vm_area_struct *vma = NULL; |
895 | struct mempolicy *pol = current->mempolicy; | 899 | struct mempolicy *pol = current->mempolicy; |
896 | 900 | ||
897 | if (flags & | 901 | if (flags & |
898 | ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED)) | 902 | ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED)) |
899 | return -EINVAL; | 903 | return -EINVAL; |
900 | 904 | ||
901 | if (flags & MPOL_F_MEMS_ALLOWED) { | 905 | if (flags & MPOL_F_MEMS_ALLOWED) { |
902 | if (flags & (MPOL_F_NODE|MPOL_F_ADDR)) | 906 | if (flags & (MPOL_F_NODE|MPOL_F_ADDR)) |
903 | return -EINVAL; | 907 | return -EINVAL; |
904 | *policy = 0; /* just so it's initialized */ | 908 | *policy = 0; /* just so it's initialized */ |
905 | task_lock(current); | 909 | task_lock(current); |
906 | *nmask = cpuset_current_mems_allowed; | 910 | *nmask = cpuset_current_mems_allowed; |
907 | task_unlock(current); | 911 | task_unlock(current); |
908 | return 0; | 912 | return 0; |
909 | } | 913 | } |
910 | 914 | ||
911 | if (flags & MPOL_F_ADDR) { | 915 | if (flags & MPOL_F_ADDR) { |
912 | /* | 916 | /* |
913 | * Do NOT fall back to task policy if the | 917 | * Do NOT fall back to task policy if the |
914 | * vma/shared policy at addr is NULL. We | 918 | * vma/shared policy at addr is NULL. We |
915 | * want to return MPOL_DEFAULT in this case. | 919 | * want to return MPOL_DEFAULT in this case. |
916 | */ | 920 | */ |
917 | down_read(&mm->mmap_sem); | 921 | down_read(&mm->mmap_sem); |
918 | vma = find_vma_intersection(mm, addr, addr+1); | 922 | vma = find_vma_intersection(mm, addr, addr+1); |
919 | if (!vma) { | 923 | if (!vma) { |
920 | up_read(&mm->mmap_sem); | 924 | up_read(&mm->mmap_sem); |
921 | return -EFAULT; | 925 | return -EFAULT; |
922 | } | 926 | } |
923 | if (vma->vm_ops && vma->vm_ops->get_policy) | 927 | if (vma->vm_ops && vma->vm_ops->get_policy) |
924 | pol = vma->vm_ops->get_policy(vma, addr); | 928 | pol = vma->vm_ops->get_policy(vma, addr); |
925 | else | 929 | else |
926 | pol = vma->vm_policy; | 930 | pol = vma->vm_policy; |
927 | } else if (addr) | 931 | } else if (addr) |
928 | return -EINVAL; | 932 | return -EINVAL; |
929 | 933 | ||
930 | if (!pol) | 934 | if (!pol) |
931 | pol = &default_policy; /* indicates default behavior */ | 935 | pol = &default_policy; /* indicates default behavior */ |
932 | 936 | ||
933 | if (flags & MPOL_F_NODE) { | 937 | if (flags & MPOL_F_NODE) { |
934 | if (flags & MPOL_F_ADDR) { | 938 | if (flags & MPOL_F_ADDR) { |
935 | err = lookup_node(mm, addr); | 939 | err = lookup_node(mm, addr); |
936 | if (err < 0) | 940 | if (err < 0) |
937 | goto out; | 941 | goto out; |
938 | *policy = err; | 942 | *policy = err; |
939 | } else if (pol == current->mempolicy && | 943 | } else if (pol == current->mempolicy && |
940 | pol->mode == MPOL_INTERLEAVE) { | 944 | pol->mode == MPOL_INTERLEAVE) { |
941 | *policy = current->il_next; | 945 | *policy = current->il_next; |
942 | } else { | 946 | } else { |
943 | err = -EINVAL; | 947 | err = -EINVAL; |
944 | goto out; | 948 | goto out; |
945 | } | 949 | } |
946 | } else { | 950 | } else { |
947 | *policy = pol == &default_policy ? MPOL_DEFAULT : | 951 | *policy = pol == &default_policy ? MPOL_DEFAULT : |
948 | pol->mode; | 952 | pol->mode; |
949 | /* | 953 | /* |
950 | * Internal mempolicy flags must be masked off before exposing | 954 | * Internal mempolicy flags must be masked off before exposing |
951 | * the policy to userspace. | 955 | * the policy to userspace. |
952 | */ | 956 | */ |
953 | *policy |= (pol->flags & MPOL_MODE_FLAGS); | 957 | *policy |= (pol->flags & MPOL_MODE_FLAGS); |
954 | } | 958 | } |
955 | 959 | ||
956 | if (vma) { | 960 | if (vma) { |
957 | up_read(¤t->mm->mmap_sem); | 961 | up_read(¤t->mm->mmap_sem); |
958 | vma = NULL; | 962 | vma = NULL; |
959 | } | 963 | } |
960 | 964 | ||
961 | err = 0; | 965 | err = 0; |
962 | if (nmask) { | 966 | if (nmask) { |
963 | if (mpol_store_user_nodemask(pol)) { | 967 | if (mpol_store_user_nodemask(pol)) { |
964 | *nmask = pol->w.user_nodemask; | 968 | *nmask = pol->w.user_nodemask; |
965 | } else { | 969 | } else { |
966 | task_lock(current); | 970 | task_lock(current); |
967 | get_policy_nodemask(pol, nmask); | 971 | get_policy_nodemask(pol, nmask); |
968 | task_unlock(current); | 972 | task_unlock(current); |
969 | } | 973 | } |
970 | } | 974 | } |
971 | 975 | ||
972 | out: | 976 | out: |
973 | mpol_cond_put(pol); | 977 | mpol_cond_put(pol); |
974 | if (vma) | 978 | if (vma) |
975 | up_read(¤t->mm->mmap_sem); | 979 | up_read(¤t->mm->mmap_sem); |
976 | return err; | 980 | return err; |
977 | } | 981 | } |
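The read side, do_get_mempolicy(), is reached through get_mempolicy(2). The sketch below (same libnuma assumptions as above) queries the task policy and then uses MPOL_F_NODE | MPOL_F_ADDR to ask which node backs a freshly touched page (the path that ends in lookup_node() above):

#include <numaif.h>		/* get_mempolicy(), MPOL_F_NODE, MPOL_F_ADDR */
#include <stdio.h>
#include <stdlib.h>

int main(void)
{
	int mode, node;
	char *buf = malloc(4096);

	if (!buf)
		return 1;
	buf[0] = 1;			/* fault the page in so it has a backing node */

	/* Task policy mode only; no nodemask requested. */
	if (get_mempolicy(&mode, NULL, 0, NULL, 0) == 0)
		printf("task policy mode: %d\n", mode);

	/* Which node holds the page at buf? */
	if (get_mempolicy(&node, NULL, 0, buf, MPOL_F_NODE | MPOL_F_ADDR) == 0)
		printf("page at %p is on node %d\n", (void *)buf, node);

	free(buf);
	return 0;
}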
978 | 982 | ||
979 | #ifdef CONFIG_MIGRATION | 983 | #ifdef CONFIG_MIGRATION |
980 | /* | 984 | /* |
981 | * page migration | 985 | * page migration |
982 | */ | 986 | */ |
983 | static void migrate_page_add(struct page *page, struct list_head *pagelist, | 987 | static void migrate_page_add(struct page *page, struct list_head *pagelist, |
984 | unsigned long flags) | 988 | unsigned long flags) |
985 | { | 989 | { |
986 | /* | 990 | /* |
987 | * Avoid migrating a page that is shared with others. | 991 | * Avoid migrating a page that is shared with others. |
988 | */ | 992 | */ |
989 | if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1) { | 993 | if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1) { |
990 | if (!isolate_lru_page(page)) { | 994 | if (!isolate_lru_page(page)) { |
991 | list_add_tail(&page->lru, pagelist); | 995 | list_add_tail(&page->lru, pagelist); |
992 | inc_zone_page_state(page, NR_ISOLATED_ANON + | 996 | inc_zone_page_state(page, NR_ISOLATED_ANON + |
993 | page_is_file_cache(page)); | 997 | page_is_file_cache(page)); |
994 | } | 998 | } |
995 | } | 999 | } |
996 | } | 1000 | } |
997 | 1001 | ||
998 | static struct page *new_node_page(struct page *page, unsigned long node, int **x) | 1002 | static struct page *new_node_page(struct page *page, unsigned long node, int **x) |
999 | { | 1003 | { |
1000 | if (PageHuge(page)) | 1004 | if (PageHuge(page)) |
1001 | return alloc_huge_page_node(page_hstate(compound_head(page)), | 1005 | return alloc_huge_page_node(page_hstate(compound_head(page)), |
1002 | node); | 1006 | node); |
1003 | else | 1007 | else |
1004 | return alloc_pages_exact_node(node, GFP_HIGHUSER_MOVABLE, 0); | 1008 | return alloc_pages_exact_node(node, GFP_HIGHUSER_MOVABLE, 0); |
1005 | } | 1009 | } |
1006 | 1010 | ||
1007 | /* | 1011 | /* |
1008 | * Migrate pages from one node to a target node. | 1012 | * Migrate pages from one node to a target node. |
1009 | * Returns error or the number of pages not migrated. | 1013 | * Returns error or the number of pages not migrated. |
1010 | */ | 1014 | */ |
1011 | static int migrate_to_node(struct mm_struct *mm, int source, int dest, | 1015 | static int migrate_to_node(struct mm_struct *mm, int source, int dest, |
1012 | int flags) | 1016 | int flags) |
1013 | { | 1017 | { |
1014 | nodemask_t nmask; | 1018 | nodemask_t nmask; |
1015 | LIST_HEAD(pagelist); | 1019 | LIST_HEAD(pagelist); |
1016 | int err = 0; | 1020 | int err = 0; |
1017 | 1021 | ||
1018 | nodes_clear(nmask); | 1022 | nodes_clear(nmask); |
1019 | node_set(source, nmask); | 1023 | node_set(source, nmask); |
1020 | 1024 | ||
1021 | /* | 1025 | /* |
1022 | * This does not "check" the range but isolates all pages that | 1026 | * This does not "check" the range but isolates all pages that |
1023 | * need migration. Between passing in the full user address | 1027 | * need migration. Between passing in the full user address |
1024 | * space range and MPOL_MF_DISCONTIG_OK, this call can not fail. | 1028 | * space range and MPOL_MF_DISCONTIG_OK, this call can not fail. |
1025 | */ | 1029 | */ |
1026 | VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))); | 1030 | VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))); |
1027 | queue_pages_range(mm, mm->mmap->vm_start, mm->task_size, &nmask, | 1031 | queue_pages_range(mm, mm->mmap->vm_start, mm->task_size, &nmask, |
1028 | flags | MPOL_MF_DISCONTIG_OK, &pagelist); | 1032 | flags | MPOL_MF_DISCONTIG_OK, &pagelist); |
1029 | 1033 | ||
1030 | if (!list_empty(&pagelist)) { | 1034 | if (!list_empty(&pagelist)) { |
1031 | err = migrate_pages(&pagelist, new_node_page, dest, | 1035 | err = migrate_pages(&pagelist, new_node_page, dest, |
1032 | MIGRATE_SYNC, MR_SYSCALL); | 1036 | MIGRATE_SYNC, MR_SYSCALL); |
1033 | if (err) | 1037 | if (err) |
1034 | putback_movable_pages(&pagelist); | 1038 | putback_movable_pages(&pagelist); |
1035 | } | 1039 | } |
1036 | 1040 | ||
1037 | return err; | 1041 | return err; |
1038 | } | 1042 | } |
1039 | 1043 | ||
1040 | /* | 1044 | /* |
1041 | * Move pages between the two nodesets so as to preserve the physical | 1045 | * Move pages between the two nodesets so as to preserve the physical |
1042 | * layout as much as possible. | 1046 | * layout as much as possible. |
1043 | * | 1047 | * |
1044 | * Returns the number of pages that could not be moved. | 1048 | * Returns the number of pages that could not be moved. |
1045 | */ | 1049 | */ |
1046 | int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from, | 1050 | int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from, |
1047 | const nodemask_t *to, int flags) | 1051 | const nodemask_t *to, int flags) |
1048 | { | 1052 | { |
1049 | int busy = 0; | 1053 | int busy = 0; |
1050 | int err; | 1054 | int err; |
1051 | nodemask_t tmp; | 1055 | nodemask_t tmp; |
1052 | 1056 | ||
1053 | err = migrate_prep(); | 1057 | err = migrate_prep(); |
1054 | if (err) | 1058 | if (err) |
1055 | return err; | 1059 | return err; |
1056 | 1060 | ||
1057 | down_read(&mm->mmap_sem); | 1061 | down_read(&mm->mmap_sem); |
1058 | 1062 | ||
1059 | err = migrate_vmas(mm, from, to, flags); | 1063 | err = migrate_vmas(mm, from, to, flags); |
1060 | if (err) | 1064 | if (err) |
1061 | goto out; | 1065 | goto out; |
1062 | 1066 | ||
1063 | /* | 1067 | /* |
1064 | * Find a 'source' bit set in 'tmp' whose corresponding 'dest' | 1068 | * Find a 'source' bit set in 'tmp' whose corresponding 'dest' |
1065 | * bit in 'to' is not also set in 'tmp'. Clear the found 'source' | 1069 | * bit in 'to' is not also set in 'tmp'. Clear the found 'source' |
1066 | * bit in 'tmp', and return that <source, dest> pair for migration. | 1070 | * bit in 'tmp', and return that <source, dest> pair for migration. |
1067 | * The pair of nodemasks 'to' and 'from' define the map. | 1071 | * The pair of nodemasks 'to' and 'from' define the map. |
1068 | * | 1072 | * |
1069 | * If no pair of bits is found that way, fallback to picking some | 1073 | * If no pair of bits is found that way, fallback to picking some |
1070 | * pair of 'source' and 'dest' bits that are not the same. If the | 1074 | * pair of 'source' and 'dest' bits that are not the same. If the |
1071 | * 'source' and 'dest' bits are the same, this represents a node | 1075 | * 'source' and 'dest' bits are the same, this represents a node |
1072 | * that will be migrating to itself, so no pages need move. | 1076 | * that will be migrating to itself, so no pages need move. |
1073 | * | 1077 | * |
1074 | * If no bits are left in 'tmp', or if all remaining bits left | 1078 | * If no bits are left in 'tmp', or if all remaining bits left |
1075 | * in 'tmp' correspond to the same bit in 'to', return false | 1079 | * in 'tmp' correspond to the same bit in 'to', return false |
1076 | * (nothing left to migrate). | 1080 | * (nothing left to migrate). |
1077 | * | 1081 | * |
1078 | * This lets us pick a pair of nodes to migrate between, such that | 1082 | * This lets us pick a pair of nodes to migrate between, such that |
1079 | * if possible the dest node is not already occupied by some other | 1083 | * if possible the dest node is not already occupied by some other |
1080 | * source node, minimizing the risk of overloading the memory on a | 1084 | * source node, minimizing the risk of overloading the memory on a |
1081 | * node that would happen if we migrated incoming memory to a node | 1085 | * node that would happen if we migrated incoming memory to a node |
1082 | * before migrating outgoing memory sourced from that same node. | 1086 | * before migrating outgoing memory sourced from that same node. |
1083 | * | 1087 | * |
1084 | * A single scan of tmp is sufficient. As we go, we remember the | 1088 | * A single scan of tmp is sufficient. As we go, we remember the |
1085 | * most recent <s, d> pair that moved (s != d). If we find a pair | 1089 | * most recent <s, d> pair that moved (s != d). If we find a pair |
1086 | * that not only moved, but what's better, moved to an empty slot | 1090 | * that not only moved, but what's better, moved to an empty slot |
1087 | * (d is not set in tmp), then we break out immediately with that pair. | 1091 | * (d is not set in tmp), then we break out immediately with that pair. |
1088 | * Otherwise when we finish scanning tmp, we at least have the | 1092 | * Otherwise when we finish scanning tmp, we at least have the |
1089 | * most recent <s, d> pair that moved. If we get all the way through | 1093 | * most recent <s, d> pair that moved. If we get all the way through |
1090 | * the scan of tmp without finding any node that moved, much less | 1094 | * the scan of tmp without finding any node that moved, much less |
1091 | * moved to an empty node, then there is nothing left worth migrating. | 1095 | * moved to an empty node, then there is nothing left worth migrating. |
1092 | */ | 1096 | */ |
1093 | 1097 | ||
1094 | tmp = *from; | 1098 | tmp = *from; |
1095 | while (!nodes_empty(tmp)) { | 1099 | while (!nodes_empty(tmp)) { |
1096 | int s,d; | 1100 | int s,d; |
1097 | int source = NUMA_NO_NODE; | 1101 | int source = NUMA_NO_NODE; |
1098 | int dest = 0; | 1102 | int dest = 0; |
1099 | 1103 | ||
1100 | for_each_node_mask(s, tmp) { | 1104 | for_each_node_mask(s, tmp) { |
1101 | 1105 | ||
1102 | /* | 1106 | /* |
1103 | * do_migrate_pages() tries to maintain the relative | 1107 | * do_migrate_pages() tries to maintain the relative |
1104 | * node relationship of the pages established between | 1108 | * node relationship of the pages established between |
1105 | * threads and memory areas. | 1109 | * threads and memory areas. |
1106 | * | 1110 | * |
1107 | * However if the number of source nodes is not equal to | 1111 | * However if the number of source nodes is not equal to |
1108 | * the number of destination nodes we can not preserve | 1112 | * the number of destination nodes we can not preserve |
1109 | * this node relative relationship. In that case, skip | 1113 | * this node relative relationship. In that case, skip |
1110 | * copying memory from a node that is in the destination | 1114 | * copying memory from a node that is in the destination |
1111 | * mask. | 1115 | * mask. |
1112 | * | 1116 | * |
1113 | * Example: [2,3,4] -> [3,4,5] moves everything. | 1117 | * Example: [2,3,4] -> [3,4,5] moves everything. |
1114 | * [0-7] -> [3,4,5] moves only 0,1,2,6,7. | 1118 | * [0-7] -> [3,4,5] moves only 0,1,2,6,7. |
1115 | */ | 1119 | */ |
1116 | 1120 | ||
1117 | if ((nodes_weight(*from) != nodes_weight(*to)) && | 1121 | if ((nodes_weight(*from) != nodes_weight(*to)) && |
1118 | (node_isset(s, *to))) | 1122 | (node_isset(s, *to))) |
1119 | continue; | 1123 | continue; |
1120 | 1124 | ||
1121 | d = node_remap(s, *from, *to); | 1125 | d = node_remap(s, *from, *to); |
1122 | if (s == d) | 1126 | if (s == d) |
1123 | continue; | 1127 | continue; |
1124 | 1128 | ||
1125 | source = s; /* Node moved. Memorize */ | 1129 | source = s; /* Node moved. Memorize */ |
1126 | dest = d; | 1130 | dest = d; |
1127 | 1131 | ||
1128 | /* dest not in remaining from nodes? */ | 1132 | /* dest not in remaining from nodes? */ |
1129 | if (!node_isset(dest, tmp)) | 1133 | if (!node_isset(dest, tmp)) |
1130 | break; | 1134 | break; |
1131 | } | 1135 | } |
1132 | if (source == NUMA_NO_NODE) | 1136 | if (source == NUMA_NO_NODE) |
1133 | break; | 1137 | break; |
1134 | 1138 | ||
1135 | node_clear(source, tmp); | 1139 | node_clear(source, tmp); |
1136 | err = migrate_to_node(mm, source, dest, flags); | 1140 | err = migrate_to_node(mm, source, dest, flags); |
1137 | if (err > 0) | 1141 | if (err > 0) |
1138 | busy += err; | 1142 | busy += err; |
1139 | if (err < 0) | 1143 | if (err < 0) |
1140 | break; | 1144 | break; |
1141 | } | 1145 | } |
1142 | out: | 1146 | out: |
1143 | up_read(&mm->mmap_sem); | 1147 | up_read(&mm->mmap_sem); |
1144 | if (err < 0) | 1148 | if (err < 0) |
1145 | return err; | 1149 | return err; |
1146 | return busy; | 1150 | return busy; |
1147 | 1151 | ||
1148 | } | 1152 | } |
1149 | 1153 | ||
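The pair-selection scan documented in the comment above can be modelled in ordinary user-space C. The sketch below is only an illustration of that scan, not kernel code: node masks are approximated by 64-bit words and remap() is a hypothetical stand-in for node_remap().

#include <stdio.h>
#include <stdint.h>

/* stand-in for node_remap(): position of s within 'from', wrapped onto 'to' */
static int remap(int s, uint64_t from, uint64_t to)
{
	int ord = 0, i, w = __builtin_popcountll(to);

	for (i = 0; i < s; i++)
		if (from & (1ULL << i))
			ord++;
	ord %= w;
	for (i = 0; i < 64; i++)
		if ((to & (1ULL << i)) && ord-- == 0)
			return i;
	return -1;
}

int main(void)
{
	uint64_t from = 0xffULL;	/* source nodes 0-7, as in the example above */
	uint64_t to   = 0x38ULL;	/* destination nodes 3,4,5 */
	uint64_t tmp  = from;

	while (tmp) {
		int s, d, source = -1, dest = 0;

		for (s = 0; s < 64; s++) {
			if (!(tmp & (1ULL << s)))
				continue;
			/* weights differ: skip sources already in the destination set */
			if (__builtin_popcountll(from) != __builtin_popcountll(to) &&
			    (to & (1ULL << s)))
				continue;
			d = remap(s, from, to);
			if (s == d)
				continue;
			source = s;	/* node moved, memorize */
			dest = d;
			if (!(tmp & (1ULL << d)))
				break;	/* dest not among remaining sources */
		}
		if (source < 0)
			break;		/* nothing left worth migrating */
		tmp &= ~(1ULL << source);
		printf("migrate node %d -> node %d\n", source, dest);
	}
	return 0;
}

With these masks the sketch reports moves only for nodes 0, 1, 2, 6 and 7, matching the second example in the comment above.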
1150 | /* | 1154 | /* |
1151 | * Allocate a new page for page migration based on vma policy. | 1155 | * Allocate a new page for page migration based on vma policy. |
1152 | * Start assuming that page is mapped by vma pointed to by @private. | 1156 | * Start assuming that page is mapped by vma pointed to by @private. |
1153 | * Search forward from there, if not. N.B., this assumes that the | 1157 | * Search forward from there, if not. N.B., this assumes that the |
1154 | * list of pages handed to migrate_pages()--which is how we get here-- | 1158 | * list of pages handed to migrate_pages()--which is how we get here-- |
1155 | * is in virtual address order. | 1159 | * is in virtual address order. |
1156 | */ | 1160 | */ |
1157 | static struct page *new_vma_page(struct page *page, unsigned long private, int **x) | 1161 | static struct page *new_vma_page(struct page *page, unsigned long private, int **x) |
1158 | { | 1162 | { |
1159 | struct vm_area_struct *vma = (struct vm_area_struct *)private; | 1163 | struct vm_area_struct *vma = (struct vm_area_struct *)private; |
1160 | unsigned long uninitialized_var(address); | 1164 | unsigned long uninitialized_var(address); |
1161 | 1165 | ||
1162 | while (vma) { | 1166 | while (vma) { |
1163 | address = page_address_in_vma(page, vma); | 1167 | address = page_address_in_vma(page, vma); |
1164 | if (address != -EFAULT) | 1168 | if (address != -EFAULT) |
1165 | break; | 1169 | break; |
1166 | vma = vma->vm_next; | 1170 | vma = vma->vm_next; |
1167 | } | 1171 | } |
1168 | 1172 | ||
1169 | if (PageHuge(page)) { | 1173 | if (PageHuge(page)) { |
1170 | BUG_ON(!vma); | 1174 | BUG_ON(!vma); |
1171 | return alloc_huge_page_noerr(vma, address, 1); | 1175 | return alloc_huge_page_noerr(vma, address, 1); |
1172 | } | 1176 | } |
1173 | /* | 1177 | /* |
1174 | * if !vma, alloc_page_vma() will use task or system default policy | 1178 | * if !vma, alloc_page_vma() will use task or system default policy |
1175 | */ | 1179 | */ |
1176 | return alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address); | 1180 | return alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address); |
1177 | } | 1181 | } |
1178 | #else | 1182 | #else |
1179 | 1183 | ||
1180 | static void migrate_page_add(struct page *page, struct list_head *pagelist, | 1184 | static void migrate_page_add(struct page *page, struct list_head *pagelist, |
1181 | unsigned long flags) | 1185 | unsigned long flags) |
1182 | { | 1186 | { |
1183 | } | 1187 | } |
1184 | 1188 | ||
1185 | int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from, | 1189 | int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from, |
1186 | const nodemask_t *to, int flags) | 1190 | const nodemask_t *to, int flags) |
1187 | { | 1191 | { |
1188 | return -ENOSYS; | 1192 | return -ENOSYS; |
1189 | } | 1193 | } |
1190 | 1194 | ||
1191 | static struct page *new_vma_page(struct page *page, unsigned long private, int **x) | 1195 | static struct page *new_vma_page(struct page *page, unsigned long private, int **x) |
1192 | { | 1196 | { |
1193 | return NULL; | 1197 | return NULL; |
1194 | } | 1198 | } |
1195 | #endif | 1199 | #endif |
1196 | 1200 | ||
1197 | static long do_mbind(unsigned long start, unsigned long len, | 1201 | static long do_mbind(unsigned long start, unsigned long len, |
1198 | unsigned short mode, unsigned short mode_flags, | 1202 | unsigned short mode, unsigned short mode_flags, |
1199 | nodemask_t *nmask, unsigned long flags) | 1203 | nodemask_t *nmask, unsigned long flags) |
1200 | { | 1204 | { |
1201 | struct vm_area_struct *vma; | 1205 | struct vm_area_struct *vma; |
1202 | struct mm_struct *mm = current->mm; | 1206 | struct mm_struct *mm = current->mm; |
1203 | struct mempolicy *new; | 1207 | struct mempolicy *new; |
1204 | unsigned long end; | 1208 | unsigned long end; |
1205 | int err; | 1209 | int err; |
1206 | LIST_HEAD(pagelist); | 1210 | LIST_HEAD(pagelist); |
1207 | 1211 | ||
1208 | if (flags & ~(unsigned long)MPOL_MF_VALID) | 1212 | if (flags & ~(unsigned long)MPOL_MF_VALID) |
1209 | return -EINVAL; | 1213 | return -EINVAL; |
1210 | if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE)) | 1214 | if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE)) |
1211 | return -EPERM; | 1215 | return -EPERM; |
1212 | 1216 | ||
1213 | if (start & ~PAGE_MASK) | 1217 | if (start & ~PAGE_MASK) |
1214 | return -EINVAL; | 1218 | return -EINVAL; |
1215 | 1219 | ||
1216 | if (mode == MPOL_DEFAULT) | 1220 | if (mode == MPOL_DEFAULT) |
1217 | flags &= ~MPOL_MF_STRICT; | 1221 | flags &= ~MPOL_MF_STRICT; |
1218 | 1222 | ||
1219 | len = (len + PAGE_SIZE - 1) & PAGE_MASK; | 1223 | len = (len + PAGE_SIZE - 1) & PAGE_MASK; |
1220 | end = start + len; | 1224 | end = start + len; |
1221 | 1225 | ||
1222 | if (end < start) | 1226 | if (end < start) |
1223 | return -EINVAL; | 1227 | return -EINVAL; |
1224 | if (end == start) | 1228 | if (end == start) |
1225 | return 0; | 1229 | return 0; |
1226 | 1230 | ||
1227 | new = mpol_new(mode, mode_flags, nmask); | 1231 | new = mpol_new(mode, mode_flags, nmask); |
1228 | if (IS_ERR(new)) | 1232 | if (IS_ERR(new)) |
1229 | return PTR_ERR(new); | 1233 | return PTR_ERR(new); |
1230 | 1234 | ||
1231 | if (flags & MPOL_MF_LAZY) | 1235 | if (flags & MPOL_MF_LAZY) |
1232 | new->flags |= MPOL_F_MOF; | 1236 | new->flags |= MPOL_F_MOF; |
1233 | 1237 | ||
1234 | /* | 1238 | /* |
1235 | * If we are using the default policy then operation | 1239 | * If we are using the default policy then operation |
1236 | * on discontinuous address spaces is okay after all | 1240 | * on discontinuous address spaces is okay after all |
1237 | */ | 1241 | */ |
1238 | if (!new) | 1242 | if (!new) |
1239 | flags |= MPOL_MF_DISCONTIG_OK; | 1243 | flags |= MPOL_MF_DISCONTIG_OK; |
1240 | 1244 | ||
1241 | pr_debug("mbind %lx-%lx mode:%d flags:%d nodes:%lx\n", | 1245 | pr_debug("mbind %lx-%lx mode:%d flags:%d nodes:%lx\n", |
1242 | start, start + len, mode, mode_flags, | 1246 | start, start + len, mode, mode_flags, |
1243 | nmask ? nodes_addr(*nmask)[0] : NUMA_NO_NODE); | 1247 | nmask ? nodes_addr(*nmask)[0] : NUMA_NO_NODE); |
1244 | 1248 | ||
1245 | if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) { | 1249 | if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) { |
1246 | 1250 | ||
1247 | err = migrate_prep(); | 1251 | err = migrate_prep(); |
1248 | if (err) | 1252 | if (err) |
1249 | goto mpol_out; | 1253 | goto mpol_out; |
1250 | } | 1254 | } |
1251 | { | 1255 | { |
1252 | NODEMASK_SCRATCH(scratch); | 1256 | NODEMASK_SCRATCH(scratch); |
1253 | if (scratch) { | 1257 | if (scratch) { |
1254 | down_write(&mm->mmap_sem); | 1258 | down_write(&mm->mmap_sem); |
1255 | task_lock(current); | 1259 | task_lock(current); |
1256 | err = mpol_set_nodemask(new, nmask, scratch); | 1260 | err = mpol_set_nodemask(new, nmask, scratch); |
1257 | task_unlock(current); | 1261 | task_unlock(current); |
1258 | if (err) | 1262 | if (err) |
1259 | up_write(&mm->mmap_sem); | 1263 | up_write(&mm->mmap_sem); |
1260 | } else | 1264 | } else |
1261 | err = -ENOMEM; | 1265 | err = -ENOMEM; |
1262 | NODEMASK_SCRATCH_FREE(scratch); | 1266 | NODEMASK_SCRATCH_FREE(scratch); |
1263 | } | 1267 | } |
1264 | if (err) | 1268 | if (err) |
1265 | goto mpol_out; | 1269 | goto mpol_out; |
1266 | 1270 | ||
1267 | vma = queue_pages_range(mm, start, end, nmask, | 1271 | vma = queue_pages_range(mm, start, end, nmask, |
1268 | flags | MPOL_MF_INVERT, &pagelist); | 1272 | flags | MPOL_MF_INVERT, &pagelist); |
1269 | 1273 | ||
1270 | err = PTR_ERR(vma); /* maybe ... */ | 1274 | err = PTR_ERR(vma); /* maybe ... */ |
1271 | if (!IS_ERR(vma)) | 1275 | if (!IS_ERR(vma)) |
1272 | err = mbind_range(mm, start, end, new); | 1276 | err = mbind_range(mm, start, end, new); |
1273 | 1277 | ||
1274 | if (!err) { | 1278 | if (!err) { |
1275 | int nr_failed = 0; | 1279 | int nr_failed = 0; |
1276 | 1280 | ||
1277 | if (!list_empty(&pagelist)) { | 1281 | if (!list_empty(&pagelist)) { |
1278 | WARN_ON_ONCE(flags & MPOL_MF_LAZY); | 1282 | WARN_ON_ONCE(flags & MPOL_MF_LAZY); |
1279 | nr_failed = migrate_pages(&pagelist, new_vma_page, | 1283 | nr_failed = migrate_pages(&pagelist, new_vma_page, |
1280 | (unsigned long)vma, | 1284 | (unsigned long)vma, |
1281 | MIGRATE_SYNC, MR_MEMPOLICY_MBIND); | 1285 | MIGRATE_SYNC, MR_MEMPOLICY_MBIND); |
1282 | if (nr_failed) | 1286 | if (nr_failed) |
1283 | putback_movable_pages(&pagelist); | 1287 | putback_movable_pages(&pagelist); |
1284 | } | 1288 | } |
1285 | 1289 | ||
1286 | if (nr_failed && (flags & MPOL_MF_STRICT)) | 1290 | if (nr_failed && (flags & MPOL_MF_STRICT)) |
1287 | err = -EIO; | 1291 | err = -EIO; |
1288 | } else | 1292 | } else |
1289 | putback_movable_pages(&pagelist); | 1293 | putback_movable_pages(&pagelist); |
1290 | 1294 | ||
1291 | up_write(&mm->mmap_sem); | 1295 | up_write(&mm->mmap_sem); |
1292 | mpol_out: | 1296 | mpol_out: |
1293 | mpol_put(new); | 1297 | mpol_put(new); |
1294 | return err; | 1298 | return err; |
1295 | } | 1299 | } |
1296 | 1300 | ||
1297 | /* | 1301 | /* |
1298 | * User space interface with variable sized bitmaps for nodelists. | 1302 | * User space interface with variable sized bitmaps for nodelists. |
1299 | */ | 1303 | */ |
1300 | 1304 | ||
1301 | /* Copy a node mask from user space. */ | 1305 | /* Copy a node mask from user space. */ |
1302 | static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask, | 1306 | static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask, |
1303 | unsigned long maxnode) | 1307 | unsigned long maxnode) |
1304 | { | 1308 | { |
1305 | unsigned long k; | 1309 | unsigned long k; |
1306 | unsigned long nlongs; | 1310 | unsigned long nlongs; |
1307 | unsigned long endmask; | 1311 | unsigned long endmask; |
1308 | 1312 | ||
1309 | --maxnode; | 1313 | --maxnode; |
1310 | nodes_clear(*nodes); | 1314 | nodes_clear(*nodes); |
1311 | if (maxnode == 0 || !nmask) | 1315 | if (maxnode == 0 || !nmask) |
1312 | return 0; | 1316 | return 0; |
1313 | if (maxnode > PAGE_SIZE*BITS_PER_BYTE) | 1317 | if (maxnode > PAGE_SIZE*BITS_PER_BYTE) |
1314 | return -EINVAL; | 1318 | return -EINVAL; |
1315 | 1319 | ||
1316 | nlongs = BITS_TO_LONGS(maxnode); | 1320 | nlongs = BITS_TO_LONGS(maxnode); |
1317 | if ((maxnode % BITS_PER_LONG) == 0) | 1321 | if ((maxnode % BITS_PER_LONG) == 0) |
1318 | endmask = ~0UL; | 1322 | endmask = ~0UL; |
1319 | else | 1323 | else |
1320 | endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1; | 1324 | endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1; |
1321 | 1325 | ||
1322 | /* When the user specified more nodes than supported, just check | 1326 | /* When the user specified more nodes than supported, just check |
1323 | that the unsupported part is all zero. */ | 1327 | that the unsupported part is all zero. */ |
1324 | if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) { | 1328 | if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) { |
1325 | if (nlongs > PAGE_SIZE/sizeof(long)) | 1329 | if (nlongs > PAGE_SIZE/sizeof(long)) |
1326 | return -EINVAL; | 1330 | return -EINVAL; |
1327 | for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) { | 1331 | for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) { |
1328 | unsigned long t; | 1332 | unsigned long t; |
1329 | if (get_user(t, nmask + k)) | 1333 | if (get_user(t, nmask + k)) |
1330 | return -EFAULT; | 1334 | return -EFAULT; |
1331 | if (k == nlongs - 1) { | 1335 | if (k == nlongs - 1) { |
1332 | if (t & endmask) | 1336 | if (t & endmask) |
1333 | return -EINVAL; | 1337 | return -EINVAL; |
1334 | } else if (t) | 1338 | } else if (t) |
1335 | return -EINVAL; | 1339 | return -EINVAL; |
1336 | } | 1340 | } |
1337 | nlongs = BITS_TO_LONGS(MAX_NUMNODES); | 1341 | nlongs = BITS_TO_LONGS(MAX_NUMNODES); |
1338 | endmask = ~0UL; | 1342 | endmask = ~0UL; |
1339 | } | 1343 | } |
1340 | 1344 | ||
1341 | if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long))) | 1345 | if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long))) |
1342 | return -EFAULT; | 1346 | return -EFAULT; |
1343 | nodes_addr(*nodes)[nlongs-1] &= endmask; | 1347 | nodes_addr(*nodes)[nlongs-1] &= endmask; |
1344 | return 0; | 1348 | return 0; |
1345 | } | 1349 | } |
1346 | 1350 | ||
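For context, a minimal user-space counterpart of get_nodes() might look like the sketch below: the caller builds the bitmap as an array of unsigned long and passes a maxnode one larger than the number of bits, so the --maxnode adjustment above still leaves every bit visible. This assumes libnuma's <numaif.h> is installed (link with -lnuma); the node numbers are invented for illustration.

#include <numaif.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
	unsigned long mask[1];
	/* one more than the number of bits, see the --maxnode above */
	unsigned long maxnode = 8 * sizeof(mask) + 1;

	memset(mask, 0, sizeof(mask));
	mask[0] = (1UL << 0) | (1UL << 1);	/* interleave over nodes 0 and 1 */

	if (set_mempolicy(MPOL_INTERLEAVE, mask, maxnode))
		perror("set_mempolicy");
	return 0;
}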
1347 | /* Copy a kernel node mask to user space */ | 1351 | /* Copy a kernel node mask to user space */ |
1348 | static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode, | 1352 | static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode, |
1349 | nodemask_t *nodes) | 1353 | nodemask_t *nodes) |
1350 | { | 1354 | { |
1351 | unsigned long copy = ALIGN(maxnode-1, 64) / 8; | 1355 | unsigned long copy = ALIGN(maxnode-1, 64) / 8; |
1352 | const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long); | 1356 | const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long); |
1353 | 1357 | ||
1354 | if (copy > nbytes) { | 1358 | if (copy > nbytes) { |
1355 | if (copy > PAGE_SIZE) | 1359 | if (copy > PAGE_SIZE) |
1356 | return -EINVAL; | 1360 | return -EINVAL; |
1357 | if (clear_user((char __user *)mask + nbytes, copy - nbytes)) | 1361 | if (clear_user((char __user *)mask + nbytes, copy - nbytes)) |
1358 | return -EFAULT; | 1362 | return -EFAULT; |
1359 | copy = nbytes; | 1363 | copy = nbytes; |
1360 | } | 1364 | } |
1361 | return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0; | 1365 | return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0; |
1362 | } | 1366 | } |
1363 | 1367 | ||
1364 | SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len, | 1368 | SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len, |
1365 | unsigned long, mode, unsigned long __user *, nmask, | 1369 | unsigned long, mode, unsigned long __user *, nmask, |
1366 | unsigned long, maxnode, unsigned, flags) | 1370 | unsigned long, maxnode, unsigned, flags) |
1367 | { | 1371 | { |
1368 | nodemask_t nodes; | 1372 | nodemask_t nodes; |
1369 | int err; | 1373 | int err; |
1370 | unsigned short mode_flags; | 1374 | unsigned short mode_flags; |
1371 | 1375 | ||
1372 | mode_flags = mode & MPOL_MODE_FLAGS; | 1376 | mode_flags = mode & MPOL_MODE_FLAGS; |
1373 | mode &= ~MPOL_MODE_FLAGS; | 1377 | mode &= ~MPOL_MODE_FLAGS; |
1374 | if (mode >= MPOL_MAX) | 1378 | if (mode >= MPOL_MAX) |
1375 | return -EINVAL; | 1379 | return -EINVAL; |
1376 | if ((mode_flags & MPOL_F_STATIC_NODES) && | 1380 | if ((mode_flags & MPOL_F_STATIC_NODES) && |
1377 | (mode_flags & MPOL_F_RELATIVE_NODES)) | 1381 | (mode_flags & MPOL_F_RELATIVE_NODES)) |
1378 | return -EINVAL; | 1382 | return -EINVAL; |
1379 | err = get_nodes(&nodes, nmask, maxnode); | 1383 | err = get_nodes(&nodes, nmask, maxnode); |
1380 | if (err) | 1384 | if (err) |
1381 | return err; | 1385 | return err; |
1382 | return do_mbind(start, len, mode, mode_flags, &nodes, flags); | 1386 | return do_mbind(start, len, mode, mode_flags, &nodes, flags); |
1383 | } | 1387 | } |
1384 | 1388 | ||
1385 | /* Set the process memory policy */ | 1389 | /* Set the process memory policy */ |
1386 | SYSCALL_DEFINE3(set_mempolicy, int, mode, unsigned long __user *, nmask, | 1390 | SYSCALL_DEFINE3(set_mempolicy, int, mode, unsigned long __user *, nmask, |
1387 | unsigned long, maxnode) | 1391 | unsigned long, maxnode) |
1388 | { | 1392 | { |
1389 | int err; | 1393 | int err; |
1390 | nodemask_t nodes; | 1394 | nodemask_t nodes; |
1391 | unsigned short flags; | 1395 | unsigned short flags; |
1392 | 1396 | ||
1393 | flags = mode & MPOL_MODE_FLAGS; | 1397 | flags = mode & MPOL_MODE_FLAGS; |
1394 | mode &= ~MPOL_MODE_FLAGS; | 1398 | mode &= ~MPOL_MODE_FLAGS; |
1395 | if ((unsigned int)mode >= MPOL_MAX) | 1399 | if ((unsigned int)mode >= MPOL_MAX) |
1396 | return -EINVAL; | 1400 | return -EINVAL; |
1397 | if ((flags & MPOL_F_STATIC_NODES) && (flags & MPOL_F_RELATIVE_NODES)) | 1401 | if ((flags & MPOL_F_STATIC_NODES) && (flags & MPOL_F_RELATIVE_NODES)) |
1398 | return -EINVAL; | 1402 | return -EINVAL; |
1399 | err = get_nodes(&nodes, nmask, maxnode); | 1403 | err = get_nodes(&nodes, nmask, maxnode); |
1400 | if (err) | 1404 | if (err) |
1401 | return err; | 1405 | return err; |
1402 | return do_set_mempolicy(mode, flags, &nodes); | 1406 | return do_set_mempolicy(mode, flags, &nodes); |
1403 | } | 1407 | } |
1404 | 1408 | ||
1405 | SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode, | 1409 | SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode, |
1406 | const unsigned long __user *, old_nodes, | 1410 | const unsigned long __user *, old_nodes, |
1407 | const unsigned long __user *, new_nodes) | 1411 | const unsigned long __user *, new_nodes) |
1408 | { | 1412 | { |
1409 | const struct cred *cred = current_cred(), *tcred; | 1413 | const struct cred *cred = current_cred(), *tcred; |
1410 | struct mm_struct *mm = NULL; | 1414 | struct mm_struct *mm = NULL; |
1411 | struct task_struct *task; | 1415 | struct task_struct *task; |
1412 | nodemask_t task_nodes; | 1416 | nodemask_t task_nodes; |
1413 | int err; | 1417 | int err; |
1414 | nodemask_t *old; | 1418 | nodemask_t *old; |
1415 | nodemask_t *new; | 1419 | nodemask_t *new; |
1416 | NODEMASK_SCRATCH(scratch); | 1420 | NODEMASK_SCRATCH(scratch); |
1417 | 1421 | ||
1418 | if (!scratch) | 1422 | if (!scratch) |
1419 | return -ENOMEM; | 1423 | return -ENOMEM; |
1420 | 1424 | ||
1421 | old = &scratch->mask1; | 1425 | old = &scratch->mask1; |
1422 | new = &scratch->mask2; | 1426 | new = &scratch->mask2; |
1423 | 1427 | ||
1424 | err = get_nodes(old, old_nodes, maxnode); | 1428 | err = get_nodes(old, old_nodes, maxnode); |
1425 | if (err) | 1429 | if (err) |
1426 | goto out; | 1430 | goto out; |
1427 | 1431 | ||
1428 | err = get_nodes(new, new_nodes, maxnode); | 1432 | err = get_nodes(new, new_nodes, maxnode); |
1429 | if (err) | 1433 | if (err) |
1430 | goto out; | 1434 | goto out; |
1431 | 1435 | ||
1432 | /* Find the mm_struct */ | 1436 | /* Find the mm_struct */ |
1433 | rcu_read_lock(); | 1437 | rcu_read_lock(); |
1434 | task = pid ? find_task_by_vpid(pid) : current; | 1438 | task = pid ? find_task_by_vpid(pid) : current; |
1435 | if (!task) { | 1439 | if (!task) { |
1436 | rcu_read_unlock(); | 1440 | rcu_read_unlock(); |
1437 | err = -ESRCH; | 1441 | err = -ESRCH; |
1438 | goto out; | 1442 | goto out; |
1439 | } | 1443 | } |
1440 | get_task_struct(task); | 1444 | get_task_struct(task); |
1441 | 1445 | ||
1442 | err = -EINVAL; | 1446 | err = -EINVAL; |
1443 | 1447 | ||
1444 | /* | 1448 | /* |
1445 | * Check if this process has the right to modify the specified | 1449 | * Check if this process has the right to modify the specified |
1446 | * process. The right exists if the process has administrative | 1450 | * process. The right exists if the process has administrative |
1447 | * capabilities, superuser privileges or the same | 1451 | * capabilities, superuser privileges or the same |
1448 | * userid as the target process. | 1452 | * userid as the target process. |
1449 | */ | 1453 | */ |
1450 | tcred = __task_cred(task); | 1454 | tcred = __task_cred(task); |
1451 | if (!uid_eq(cred->euid, tcred->suid) && !uid_eq(cred->euid, tcred->uid) && | 1455 | if (!uid_eq(cred->euid, tcred->suid) && !uid_eq(cred->euid, tcred->uid) && |
1452 | !uid_eq(cred->uid, tcred->suid) && !uid_eq(cred->uid, tcred->uid) && | 1456 | !uid_eq(cred->uid, tcred->suid) && !uid_eq(cred->uid, tcred->uid) && |
1453 | !capable(CAP_SYS_NICE)) { | 1457 | !capable(CAP_SYS_NICE)) { |
1454 | rcu_read_unlock(); | 1458 | rcu_read_unlock(); |
1455 | err = -EPERM; | 1459 | err = -EPERM; |
1456 | goto out_put; | 1460 | goto out_put; |
1457 | } | 1461 | } |
1458 | rcu_read_unlock(); | 1462 | rcu_read_unlock(); |
1459 | 1463 | ||
1460 | task_nodes = cpuset_mems_allowed(task); | 1464 | task_nodes = cpuset_mems_allowed(task); |
1461 | /* Is the user allowed to access the target nodes? */ | 1465 | /* Is the user allowed to access the target nodes? */ |
1462 | if (!nodes_subset(*new, task_nodes) && !capable(CAP_SYS_NICE)) { | 1466 | if (!nodes_subset(*new, task_nodes) && !capable(CAP_SYS_NICE)) { |
1463 | err = -EPERM; | 1467 | err = -EPERM; |
1464 | goto out_put; | 1468 | goto out_put; |
1465 | } | 1469 | } |
1466 | 1470 | ||
1467 | if (!nodes_subset(*new, node_states[N_MEMORY])) { | 1471 | if (!nodes_subset(*new, node_states[N_MEMORY])) { |
1468 | err = -EINVAL; | 1472 | err = -EINVAL; |
1469 | goto out_put; | 1473 | goto out_put; |
1470 | } | 1474 | } |
1471 | 1475 | ||
1472 | err = security_task_movememory(task); | 1476 | err = security_task_movememory(task); |
1473 | if (err) | 1477 | if (err) |
1474 | goto out_put; | 1478 | goto out_put; |
1475 | 1479 | ||
1476 | mm = get_task_mm(task); | 1480 | mm = get_task_mm(task); |
1477 | put_task_struct(task); | 1481 | put_task_struct(task); |
1478 | 1482 | ||
1479 | if (!mm) { | 1483 | if (!mm) { |
1480 | err = -EINVAL; | 1484 | err = -EINVAL; |
1481 | goto out; | 1485 | goto out; |
1482 | } | 1486 | } |
1483 | 1487 | ||
1484 | err = do_migrate_pages(mm, old, new, | 1488 | err = do_migrate_pages(mm, old, new, |
1485 | capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE); | 1489 | capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE); |
1486 | 1490 | ||
1487 | mmput(mm); | 1491 | mmput(mm); |
1488 | out: | 1492 | out: |
1489 | NODEMASK_SCRATCH_FREE(scratch); | 1493 | NODEMASK_SCRATCH_FREE(scratch); |
1490 | 1494 | ||
1491 | return err; | 1495 | return err; |
1492 | 1496 | ||
1493 | out_put: | 1497 | out_put: |
1494 | put_task_struct(task); | 1498 | put_task_struct(task); |
1495 | goto out; | 1499 | goto out; |
1496 | 1500 | ||
1497 | } | 1501 | } |
1498 | 1502 | ||
1499 | 1503 | ||
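The user-space side of this syscall looks roughly like the sketch below (an illustration assuming libnuma's <numaif.h>, linked with -lnuma). Passing pid 0 targets the calling process, mirroring the find_task_by_vpid() branch above.

#include <numaif.h>
#include <stdio.h>

int main(void)
{
	unsigned long old_nodes = 1UL << 0;	/* move pages off node 0... */
	unsigned long new_nodes = 1UL << 1;	/* ...onto node 1 */
	long left;

	left = migrate_pages(0 /* self */, 8 * sizeof(unsigned long) + 1,
			     &old_nodes, &new_nodes);
	if (left < 0)
		perror("migrate_pages");
	else
		printf("%ld pages could not be moved\n", left);
	return 0;
}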
1500 | /* Retrieve NUMA policy */ | 1504 | /* Retrieve NUMA policy */ |
1501 | SYSCALL_DEFINE5(get_mempolicy, int __user *, policy, | 1505 | SYSCALL_DEFINE5(get_mempolicy, int __user *, policy, |
1502 | unsigned long __user *, nmask, unsigned long, maxnode, | 1506 | unsigned long __user *, nmask, unsigned long, maxnode, |
1503 | unsigned long, addr, unsigned long, flags) | 1507 | unsigned long, addr, unsigned long, flags) |
1504 | { | 1508 | { |
1505 | int err; | 1509 | int err; |
1506 | int uninitialized_var(pval); | 1510 | int uninitialized_var(pval); |
1507 | nodemask_t nodes; | 1511 | nodemask_t nodes; |
1508 | 1512 | ||
1509 | if (nmask != NULL && maxnode < MAX_NUMNODES) | 1513 | if (nmask != NULL && maxnode < MAX_NUMNODES) |
1510 | return -EINVAL; | 1514 | return -EINVAL; |
1511 | 1515 | ||
1512 | err = do_get_mempolicy(&pval, &nodes, addr, flags); | 1516 | err = do_get_mempolicy(&pval, &nodes, addr, flags); |
1513 | 1517 | ||
1514 | if (err) | 1518 | if (err) |
1515 | return err; | 1519 | return err; |
1516 | 1520 | ||
1517 | if (policy && put_user(pval, policy)) | 1521 | if (policy && put_user(pval, policy)) |
1518 | return -EFAULT; | 1522 | return -EFAULT; |
1519 | 1523 | ||
1520 | if (nmask) | 1524 | if (nmask) |
1521 | err = copy_nodes_to_user(nmask, maxnode, &nodes); | 1525 | err = copy_nodes_to_user(nmask, maxnode, &nodes); |
1522 | 1526 | ||
1523 | return err; | 1527 | return err; |
1524 | } | 1528 | } |
1525 | 1529 | ||
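A matching user-space sketch for reading the policy back (again assuming libnuma's <numaif.h>): with addr == NULL and flags == 0 the calling thread's policy is returned, and the buffer is sized generously because the check above rejects maxnode < MAX_NUMNODES.

#include <numaif.h>
#include <stdio.h>

int main(void)
{
	int mode;
	/* 4096 bits, comfortably above MAX_NUMNODES on typical configs */
	unsigned long mask[64] = { 0 };

	if (get_mempolicy(&mode, mask, 8 * sizeof(mask), NULL, 0)) {
		perror("get_mempolicy");
		return 1;
	}
	printf("mode=%d first word of nodemask=0x%lx\n", mode, mask[0]);
	return 0;
}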
1526 | #ifdef CONFIG_COMPAT | 1530 | #ifdef CONFIG_COMPAT |
1527 | 1531 | ||
1528 | COMPAT_SYSCALL_DEFINE5(get_mempolicy, int __user *, policy, | 1532 | COMPAT_SYSCALL_DEFINE5(get_mempolicy, int __user *, policy, |
1529 | compat_ulong_t __user *, nmask, | 1533 | compat_ulong_t __user *, nmask, |
1530 | compat_ulong_t, maxnode, | 1534 | compat_ulong_t, maxnode, |
1531 | compat_ulong_t, addr, compat_ulong_t, flags) | 1535 | compat_ulong_t, addr, compat_ulong_t, flags) |
1532 | { | 1536 | { |
1533 | long err; | 1537 | long err; |
1534 | unsigned long __user *nm = NULL; | 1538 | unsigned long __user *nm = NULL; |
1535 | unsigned long nr_bits, alloc_size; | 1539 | unsigned long nr_bits, alloc_size; |
1536 | DECLARE_BITMAP(bm, MAX_NUMNODES); | 1540 | DECLARE_BITMAP(bm, MAX_NUMNODES); |
1537 | 1541 | ||
1538 | nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES); | 1542 | nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES); |
1539 | alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8; | 1543 | alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8; |
1540 | 1544 | ||
1541 | if (nmask) | 1545 | if (nmask) |
1542 | nm = compat_alloc_user_space(alloc_size); | 1546 | nm = compat_alloc_user_space(alloc_size); |
1543 | 1547 | ||
1544 | err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags); | 1548 | err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags); |
1545 | 1549 | ||
1546 | if (!err && nmask) { | 1550 | if (!err && nmask) { |
1547 | unsigned long copy_size; | 1551 | unsigned long copy_size; |
1548 | copy_size = min_t(unsigned long, sizeof(bm), alloc_size); | 1552 | copy_size = min_t(unsigned long, sizeof(bm), alloc_size); |
1549 | err = copy_from_user(bm, nm, copy_size); | 1553 | err = copy_from_user(bm, nm, copy_size); |
1550 | /* ensure entire bitmap is zeroed */ | 1554 | /* ensure entire bitmap is zeroed */ |
1551 | err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8); | 1555 | err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8); |
1552 | err |= compat_put_bitmap(nmask, bm, nr_bits); | 1556 | err |= compat_put_bitmap(nmask, bm, nr_bits); |
1553 | } | 1557 | } |
1554 | 1558 | ||
1555 | return err; | 1559 | return err; |
1556 | } | 1560 | } |
1557 | 1561 | ||
1558 | COMPAT_SYSCALL_DEFINE3(set_mempolicy, int, mode, compat_ulong_t __user *, nmask, | 1562 | COMPAT_SYSCALL_DEFINE3(set_mempolicy, int, mode, compat_ulong_t __user *, nmask, |
1559 | compat_ulong_t, maxnode) | 1563 | compat_ulong_t, maxnode) |
1560 | { | 1564 | { |
1561 | long err = 0; | 1565 | long err = 0; |
1562 | unsigned long __user *nm = NULL; | 1566 | unsigned long __user *nm = NULL; |
1563 | unsigned long nr_bits, alloc_size; | 1567 | unsigned long nr_bits, alloc_size; |
1564 | DECLARE_BITMAP(bm, MAX_NUMNODES); | 1568 | DECLARE_BITMAP(bm, MAX_NUMNODES); |
1565 | 1569 | ||
1566 | nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES); | 1570 | nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES); |
1567 | alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8; | 1571 | alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8; |
1568 | 1572 | ||
1569 | if (nmask) { | 1573 | if (nmask) { |
1570 | err = compat_get_bitmap(bm, nmask, nr_bits); | 1574 | err = compat_get_bitmap(bm, nmask, nr_bits); |
1571 | nm = compat_alloc_user_space(alloc_size); | 1575 | nm = compat_alloc_user_space(alloc_size); |
1572 | err |= copy_to_user(nm, bm, alloc_size); | 1576 | err |= copy_to_user(nm, bm, alloc_size); |
1573 | } | 1577 | } |
1574 | 1578 | ||
1575 | if (err) | 1579 | if (err) |
1576 | return -EFAULT; | 1580 | return -EFAULT; |
1577 | 1581 | ||
1578 | return sys_set_mempolicy(mode, nm, nr_bits+1); | 1582 | return sys_set_mempolicy(mode, nm, nr_bits+1); |
1579 | } | 1583 | } |
1580 | 1584 | ||
1581 | COMPAT_SYSCALL_DEFINE6(mbind, compat_ulong_t, start, compat_ulong_t, len, | 1585 | COMPAT_SYSCALL_DEFINE6(mbind, compat_ulong_t, start, compat_ulong_t, len, |
1582 | compat_ulong_t, mode, compat_ulong_t __user *, nmask, | 1586 | compat_ulong_t, mode, compat_ulong_t __user *, nmask, |
1583 | compat_ulong_t, maxnode, compat_ulong_t, flags) | 1587 | compat_ulong_t, maxnode, compat_ulong_t, flags) |
1584 | { | 1588 | { |
1585 | long err = 0; | 1589 | long err = 0; |
1586 | unsigned long __user *nm = NULL; | 1590 | unsigned long __user *nm = NULL; |
1587 | unsigned long nr_bits, alloc_size; | 1591 | unsigned long nr_bits, alloc_size; |
1588 | nodemask_t bm; | 1592 | nodemask_t bm; |
1589 | 1593 | ||
1590 | nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES); | 1594 | nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES); |
1591 | alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8; | 1595 | alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8; |
1592 | 1596 | ||
1593 | if (nmask) { | 1597 | if (nmask) { |
1594 | err = compat_get_bitmap(nodes_addr(bm), nmask, nr_bits); | 1598 | err = compat_get_bitmap(nodes_addr(bm), nmask, nr_bits); |
1595 | nm = compat_alloc_user_space(alloc_size); | 1599 | nm = compat_alloc_user_space(alloc_size); |
1596 | err |= copy_to_user(nm, nodes_addr(bm), alloc_size); | 1600 | err |= copy_to_user(nm, nodes_addr(bm), alloc_size); |
1597 | } | 1601 | } |
1598 | 1602 | ||
1599 | if (err) | 1603 | if (err) |
1600 | return -EFAULT; | 1604 | return -EFAULT; |
1601 | 1605 | ||
1602 | return sys_mbind(start, len, mode, nm, nr_bits+1, flags); | 1606 | return sys_mbind(start, len, mode, nm, nr_bits+1, flags); |
1603 | } | 1607 | } |
1604 | 1608 | ||
1605 | #endif | 1609 | #endif |
1606 | 1610 | ||
1607 | /* | 1611 | /* |
1608 | * get_vma_policy(@task, @vma, @addr) | 1612 | * get_vma_policy(@task, @vma, @addr) |
1609 | * @task - task for fallback if vma policy == default | 1613 | * @task - task for fallback if vma policy == default |
1610 | * @vma - virtual memory area whose policy is sought | 1614 | * @vma - virtual memory area whose policy is sought |
1611 | * @addr - address in @vma for shared policy lookup | 1615 | * @addr - address in @vma for shared policy lookup |
1612 | * | 1616 | * |
1613 | * Returns effective policy for a VMA at specified address. | 1617 | * Returns effective policy for a VMA at specified address. |
1614 | * Falls back to @task or system default policy, as necessary. | 1618 | * Falls back to @task or system default policy, as necessary. |
1615 | * Current or other task's task mempolicy and non-shared vma policies must be | 1619 | * Current or other task's task mempolicy and non-shared vma policies must be |
1616 | * protected by task_lock(task) by the caller. | 1620 | * protected by task_lock(task) by the caller. |
1617 | * Shared policies [those marked as MPOL_F_SHARED] require an extra reference | 1621 | * Shared policies [those marked as MPOL_F_SHARED] require an extra reference |
1618 | * count--added by the get_policy() vm_op, as appropriate--to protect against | 1622 | * count--added by the get_policy() vm_op, as appropriate--to protect against |
1619 | * freeing by another task. It is the caller's responsibility to free the | 1623 | * freeing by another task. It is the caller's responsibility to free the |
1620 | * extra reference for shared policies. | 1624 | * extra reference for shared policies. |
1621 | */ | 1625 | */ |
1622 | struct mempolicy *get_vma_policy(struct task_struct *task, | 1626 | struct mempolicy *get_vma_policy(struct task_struct *task, |
1623 | struct vm_area_struct *vma, unsigned long addr) | 1627 | struct vm_area_struct *vma, unsigned long addr) |
1624 | { | 1628 | { |
1625 | struct mempolicy *pol = get_task_policy(task); | 1629 | struct mempolicy *pol = get_task_policy(task); |
1626 | 1630 | ||
1627 | if (vma) { | 1631 | if (vma) { |
1628 | if (vma->vm_ops && vma->vm_ops->get_policy) { | 1632 | if (vma->vm_ops && vma->vm_ops->get_policy) { |
1629 | struct mempolicy *vpol = vma->vm_ops->get_policy(vma, | 1633 | struct mempolicy *vpol = vma->vm_ops->get_policy(vma, |
1630 | addr); | 1634 | addr); |
1631 | if (vpol) | 1635 | if (vpol) |
1632 | pol = vpol; | 1636 | pol = vpol; |
1633 | } else if (vma->vm_policy) { | 1637 | } else if (vma->vm_policy) { |
1634 | pol = vma->vm_policy; | 1638 | pol = vma->vm_policy; |
1635 | 1639 | ||
1636 | /* | 1640 | /* |
1637 | * shmem_alloc_page() passes MPOL_F_SHARED policy with | 1641 | * shmem_alloc_page() passes MPOL_F_SHARED policy with |
1638 | * a pseudo vma whose vma->vm_ops=NULL. Take a reference | 1642 | * a pseudo vma whose vma->vm_ops=NULL. Take a reference |
1639 | * count on these policies which will be dropped by | 1643 | * count on these policies which will be dropped by |
1640 | * mpol_cond_put() later | 1644 | * mpol_cond_put() later |
1641 | */ | 1645 | */ |
1642 | if (mpol_needs_cond_ref(pol)) | 1646 | if (mpol_needs_cond_ref(pol)) |
1643 | mpol_get(pol); | 1647 | mpol_get(pol); |
1644 | } | 1648 | } |
1645 | } | 1649 | } |
1646 | if (!pol) | 1650 | if (!pol) |
1647 | pol = &default_policy; | 1651 | pol = &default_policy; |
1648 | return pol; | 1652 | return pol; |
1649 | } | 1653 | } |
1650 | 1654 | ||
1651 | bool vma_policy_mof(struct task_struct *task, struct vm_area_struct *vma) | 1655 | bool vma_policy_mof(struct task_struct *task, struct vm_area_struct *vma) |
1652 | { | 1656 | { |
1653 | struct mempolicy *pol = get_task_policy(task); | 1657 | struct mempolicy *pol = get_task_policy(task); |
1654 | if (vma) { | 1658 | if (vma) { |
1655 | if (vma->vm_ops && vma->vm_ops->get_policy) { | 1659 | if (vma->vm_ops && vma->vm_ops->get_policy) { |
1656 | bool ret = false; | 1660 | bool ret = false; |
1657 | 1661 | ||
1658 | pol = vma->vm_ops->get_policy(vma, vma->vm_start); | 1662 | pol = vma->vm_ops->get_policy(vma, vma->vm_start); |
1659 | if (pol && (pol->flags & MPOL_F_MOF)) | 1663 | if (pol && (pol->flags & MPOL_F_MOF)) |
1660 | ret = true; | 1664 | ret = true; |
1661 | mpol_cond_put(pol); | 1665 | mpol_cond_put(pol); |
1662 | 1666 | ||
1663 | return ret; | 1667 | return ret; |
1664 | } else if (vma->vm_policy) { | 1668 | } else if (vma->vm_policy) { |
1665 | pol = vma->vm_policy; | 1669 | pol = vma->vm_policy; |
1666 | } | 1670 | } |
1667 | } | 1671 | } |
1668 | 1672 | ||
1669 | if (!pol) | 1673 | if (!pol) |
1670 | return default_policy.flags & MPOL_F_MOF; | 1674 | return default_policy.flags & MPOL_F_MOF; |
1671 | 1675 | ||
1672 | return pol->flags & MPOL_F_MOF; | 1676 | return pol->flags & MPOL_F_MOF; |
1673 | } | 1677 | } |
1674 | 1678 | ||
1675 | static int apply_policy_zone(struct mempolicy *policy, enum zone_type zone) | 1679 | static int apply_policy_zone(struct mempolicy *policy, enum zone_type zone) |
1676 | { | 1680 | { |
1677 | enum zone_type dynamic_policy_zone = policy_zone; | 1681 | enum zone_type dynamic_policy_zone = policy_zone; |
1678 | 1682 | ||
1679 | BUG_ON(dynamic_policy_zone == ZONE_MOVABLE); | 1683 | BUG_ON(dynamic_policy_zone == ZONE_MOVABLE); |
1680 | 1684 | ||
1681 | /* | 1685 | /* |
1682 | * if policy->v.nodes has movable memory only, | 1686 | * if policy->v.nodes has movable memory only, |
1683 | * we apply policy when gfp_zone(gfp) = ZONE_MOVABLE only. | 1687 | * we apply policy when gfp_zone(gfp) = ZONE_MOVABLE only. |
1684 | * | 1688 | * |
1685 | * policy->v.nodes is intersected with node_states[N_MEMORY], | 1689 | * policy->v.nodes is intersected with node_states[N_MEMORY], |
1686 | * so if the following test fails, it implies | 1690 | * so if the following test fails, it implies |
1687 | * policy->v.nodes has movable memory only. | 1691 | * policy->v.nodes has movable memory only. |
1688 | */ | 1692 | */ |
1689 | if (!nodes_intersects(policy->v.nodes, node_states[N_HIGH_MEMORY])) | 1693 | if (!nodes_intersects(policy->v.nodes, node_states[N_HIGH_MEMORY])) |
1690 | dynamic_policy_zone = ZONE_MOVABLE; | 1694 | dynamic_policy_zone = ZONE_MOVABLE; |
1691 | 1695 | ||
1692 | return zone >= dynamic_policy_zone; | 1696 | return zone >= dynamic_policy_zone; |
1693 | } | 1697 | } |
1694 | 1698 | ||
1695 | /* | 1699 | /* |
1696 | * Return a nodemask representing a mempolicy for filtering nodes for | 1700 | * Return a nodemask representing a mempolicy for filtering nodes for |
1697 | * page allocation | 1701 | * page allocation |
1698 | */ | 1702 | */ |
1699 | static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy) | 1703 | static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy) |
1700 | { | 1704 | { |
1701 | /* Lower zones don't get a nodemask applied for MPOL_BIND */ | 1705 | /* Lower zones don't get a nodemask applied for MPOL_BIND */ |
1702 | if (unlikely(policy->mode == MPOL_BIND) && | 1706 | if (unlikely(policy->mode == MPOL_BIND) && |
1703 | apply_policy_zone(policy, gfp_zone(gfp)) && | 1707 | apply_policy_zone(policy, gfp_zone(gfp)) && |
1704 | cpuset_nodemask_valid_mems_allowed(&policy->v.nodes)) | 1708 | cpuset_nodemask_valid_mems_allowed(&policy->v.nodes)) |
1705 | return &policy->v.nodes; | 1709 | return &policy->v.nodes; |
1706 | 1710 | ||
1707 | return NULL; | 1711 | return NULL; |
1708 | } | 1712 | } |
1709 | 1713 | ||
1710 | /* Return a zonelist indicated by gfp for node representing a mempolicy */ | 1714 | /* Return a zonelist indicated by gfp for node representing a mempolicy */ |
1711 | static struct zonelist *policy_zonelist(gfp_t gfp, struct mempolicy *policy, | 1715 | static struct zonelist *policy_zonelist(gfp_t gfp, struct mempolicy *policy, |
1712 | int nd) | 1716 | int nd) |
1713 | { | 1717 | { |
1714 | switch (policy->mode) { | 1718 | switch (policy->mode) { |
1715 | case MPOL_PREFERRED: | 1719 | case MPOL_PREFERRED: |
1716 | if (!(policy->flags & MPOL_F_LOCAL)) | 1720 | if (!(policy->flags & MPOL_F_LOCAL)) |
1717 | nd = policy->v.preferred_node; | 1721 | nd = policy->v.preferred_node; |
1718 | break; | 1722 | break; |
1719 | case MPOL_BIND: | 1723 | case MPOL_BIND: |
1720 | /* | 1724 | /* |
1721 | * Normally, MPOL_BIND allocations are node-local within the | 1725 | * Normally, MPOL_BIND allocations are node-local within the |
1722 | * allowed nodemask. However, if __GFP_THISNODE is set and the | 1726 | * allowed nodemask. However, if __GFP_THISNODE is set and the |
1723 | * current node isn't part of the mask, we use the zonelist for | 1727 | * current node isn't part of the mask, we use the zonelist for |
1724 | * the first node in the mask instead. | 1728 | * the first node in the mask instead. |
1725 | */ | 1729 | */ |
1726 | if (unlikely(gfp & __GFP_THISNODE) && | 1730 | if (unlikely(gfp & __GFP_THISNODE) && |
1727 | unlikely(!node_isset(nd, policy->v.nodes))) | 1731 | unlikely(!node_isset(nd, policy->v.nodes))) |
1728 | nd = first_node(policy->v.nodes); | 1732 | nd = first_node(policy->v.nodes); |
1729 | break; | 1733 | break; |
1730 | default: | 1734 | default: |
1731 | BUG(); | 1735 | BUG(); |
1732 | } | 1736 | } |
1733 | return node_zonelist(nd, gfp); | 1737 | return node_zonelist(nd, gfp); |
1734 | } | 1738 | } |
1735 | 1739 | ||
1736 | /* Do dynamic interleaving for a process */ | 1740 | /* Do dynamic interleaving for a process */ |
1737 | static unsigned interleave_nodes(struct mempolicy *policy) | 1741 | static unsigned interleave_nodes(struct mempolicy *policy) |
1738 | { | 1742 | { |
1739 | unsigned nid, next; | 1743 | unsigned nid, next; |
1740 | struct task_struct *me = current; | 1744 | struct task_struct *me = current; |
1741 | 1745 | ||
1742 | nid = me->il_next; | 1746 | nid = me->il_next; |
1743 | next = next_node(nid, policy->v.nodes); | 1747 | next = next_node(nid, policy->v.nodes); |
1744 | if (next >= MAX_NUMNODES) | 1748 | if (next >= MAX_NUMNODES) |
1745 | next = first_node(policy->v.nodes); | 1749 | next = first_node(policy->v.nodes); |
1746 | if (next < MAX_NUMNODES) | 1750 | if (next < MAX_NUMNODES) |
1747 | me->il_next = next; | 1751 | me->il_next = next; |
1748 | return nid; | 1752 | return nid; |
1749 | } | 1753 | } |
1750 | 1754 | ||
1751 | /* | 1755 | /* |
1752 | * Depending on the memory policy provide a node from which to allocate the | 1756 | * Depending on the memory policy provide a node from which to allocate the |
1753 | * next slab entry. | 1757 | * next slab entry. |
1754 | */ | 1758 | */ |
1755 | unsigned int mempolicy_slab_node(void) | 1759 | unsigned int mempolicy_slab_node(void) |
1756 | { | 1760 | { |
1757 | struct mempolicy *policy; | 1761 | struct mempolicy *policy; |
1758 | int node = numa_mem_id(); | 1762 | int node = numa_mem_id(); |
1759 | 1763 | ||
1760 | if (in_interrupt()) | 1764 | if (in_interrupt()) |
1761 | return node; | 1765 | return node; |
1762 | 1766 | ||
1763 | policy = current->mempolicy; | 1767 | policy = current->mempolicy; |
1764 | if (!policy || policy->flags & MPOL_F_LOCAL) | 1768 | if (!policy || policy->flags & MPOL_F_LOCAL) |
1765 | return node; | 1769 | return node; |
1766 | 1770 | ||
1767 | switch (policy->mode) { | 1771 | switch (policy->mode) { |
1768 | case MPOL_PREFERRED: | 1772 | case MPOL_PREFERRED: |
1769 | /* | 1773 | /* |
1770 | * handled MPOL_F_LOCAL above | 1774 | * handled MPOL_F_LOCAL above |
1771 | */ | 1775 | */ |
1772 | return policy->v.preferred_node; | 1776 | return policy->v.preferred_node; |
1773 | 1777 | ||
1774 | case MPOL_INTERLEAVE: | 1778 | case MPOL_INTERLEAVE: |
1775 | return interleave_nodes(policy); | 1779 | return interleave_nodes(policy); |
1776 | 1780 | ||
1777 | case MPOL_BIND: { | 1781 | case MPOL_BIND: { |
1778 | /* | 1782 | /* |
1779 | * Follow bind policy behavior and start allocation at the | 1783 | * Follow bind policy behavior and start allocation at the |
1780 | * first node. | 1784 | * first node. |
1781 | */ | 1785 | */ |
1782 | struct zonelist *zonelist; | 1786 | struct zonelist *zonelist; |
1783 | struct zone *zone; | 1787 | struct zone *zone; |
1784 | enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL); | 1788 | enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL); |
1785 | zonelist = &NODE_DATA(node)->node_zonelists[0]; | 1789 | zonelist = &NODE_DATA(node)->node_zonelists[0]; |
1786 | (void)first_zones_zonelist(zonelist, highest_zoneidx, | 1790 | (void)first_zones_zonelist(zonelist, highest_zoneidx, |
1787 | &policy->v.nodes, | 1791 | &policy->v.nodes, |
1788 | &zone); | 1792 | &zone); |
1789 | return zone ? zone->node : node; | 1793 | return zone ? zone->node : node; |
1790 | } | 1794 | } |
1791 | 1795 | ||
1792 | default: | 1796 | default: |
1793 | BUG(); | 1797 | BUG(); |
1794 | } | 1798 | } |
1795 | } | 1799 | } |
1796 | 1800 | ||
1797 | /* Do static interleaving for a VMA with known offset. */ | 1801 | /* Do static interleaving for a VMA with known offset. */ |
1798 | static unsigned offset_il_node(struct mempolicy *pol, | 1802 | static unsigned offset_il_node(struct mempolicy *pol, |
1799 | struct vm_area_struct *vma, unsigned long off) | 1803 | struct vm_area_struct *vma, unsigned long off) |
1800 | { | 1804 | { |
1801 | unsigned nnodes = nodes_weight(pol->v.nodes); | 1805 | unsigned nnodes = nodes_weight(pol->v.nodes); |
1802 | unsigned target; | 1806 | unsigned target; |
1803 | int c; | 1807 | int c; |
1804 | int nid = NUMA_NO_NODE; | 1808 | int nid = NUMA_NO_NODE; |
1805 | 1809 | ||
1806 | if (!nnodes) | 1810 | if (!nnodes) |
1807 | return numa_node_id(); | 1811 | return numa_node_id(); |
1808 | target = (unsigned int)off % nnodes; | 1812 | target = (unsigned int)off % nnodes; |
1809 | c = 0; | 1813 | c = 0; |
1810 | do { | 1814 | do { |
1811 | nid = next_node(nid, pol->v.nodes); | 1815 | nid = next_node(nid, pol->v.nodes); |
1812 | c++; | 1816 | c++; |
1813 | } while (c <= target); | 1817 | } while (c <= target); |
1814 | return nid; | 1818 | return nid; |
1815 | } | 1819 | } |
1816 | 1820 | ||
1817 | /* Determine a node number for interleave */ | 1821 | /* Determine a node number for interleave */ |
1818 | static inline unsigned interleave_nid(struct mempolicy *pol, | 1822 | static inline unsigned interleave_nid(struct mempolicy *pol, |
1819 | struct vm_area_struct *vma, unsigned long addr, int shift) | 1823 | struct vm_area_struct *vma, unsigned long addr, int shift) |
1820 | { | 1824 | { |
1821 | if (vma) { | 1825 | if (vma) { |
1822 | unsigned long off; | 1826 | unsigned long off; |
1823 | 1827 | ||
1824 | /* | 1828 | /* |
1825 | * for small pages, there is no difference between | 1829 | * for small pages, there is no difference between |
1826 | * shift and PAGE_SHIFT, so the bit-shift is safe. | 1830 | * shift and PAGE_SHIFT, so the bit-shift is safe. |
1827 | * for huge pages, since vm_pgoff is in units of small | 1831 | * for huge pages, since vm_pgoff is in units of small |
1828 | * pages, we need to shift off the always 0 bits to get | 1832 | * pages, we need to shift off the always 0 bits to get |
1829 | * a useful offset. | 1833 | * a useful offset. |
1830 | */ | 1834 | */ |
1831 | BUG_ON(shift < PAGE_SHIFT); | 1835 | BUG_ON(shift < PAGE_SHIFT); |
1832 | off = vma->vm_pgoff >> (shift - PAGE_SHIFT); | 1836 | off = vma->vm_pgoff >> (shift - PAGE_SHIFT); |
1833 | off += (addr - vma->vm_start) >> shift; | 1837 | off += (addr - vma->vm_start) >> shift; |
1834 | return offset_il_node(pol, vma, off); | 1838 | return offset_il_node(pol, vma, off); |
1835 | } else | 1839 | } else |
1836 | return interleave_nodes(pol); | 1840 | return interleave_nodes(pol); |
1837 | } | 1841 | } |
1838 | 1842 | ||
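A worked example of the offset arithmetic above, as a stand-alone user-space sketch (the VMA values and node mask are invented for illustration): the page offset of addr inside the VMA, in units of 1 << shift, is reduced modulo the number of nodes in the mask.

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t vm_start   = 0x40000000ULL;
	uint64_t vm_pgoff   = 0;		/* file offset, in 4KiB pages */
	unsigned shift      = 21;		/* 2MiB huge pages */
	unsigned page_shift = 12;
	uint64_t addr       = vm_start + 5 * (1ULL << shift);
	unsigned nodes[]    = { 0, 2, 3 };	/* set bits of pol->v.nodes */
	unsigned nnodes     = 3;

	/* same computation as interleave_nid() + offset_il_node() above */
	uint64_t off = (vm_pgoff >> (shift - page_shift)) +
		       ((addr - vm_start) >> shift);
	printf("huge page #%llu -> node %u\n",
	       (unsigned long long)off, nodes[off % nnodes]);
	return 0;
}

It prints "huge page #5 -> node 3": the offset wraps around the interleave mask as it grows.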
1839 | /* | 1843 | /* |
1840 | * Return the bit number of a random bit set in the nodemask. | 1844 | * Return the bit number of a random bit set in the nodemask. |
1841 | * (returns NUMA_NO_NODE if nodemask is empty) | 1845 | * (returns NUMA_NO_NODE if nodemask is empty) |
1842 | */ | 1846 | */ |
1843 | int node_random(const nodemask_t *maskp) | 1847 | int node_random(const nodemask_t *maskp) |
1844 | { | 1848 | { |
1845 | int w, bit = NUMA_NO_NODE; | 1849 | int w, bit = NUMA_NO_NODE; |
1846 | 1850 | ||
1847 | w = nodes_weight(*maskp); | 1851 | w = nodes_weight(*maskp); |
1848 | if (w) | 1852 | if (w) |
1849 | bit = bitmap_ord_to_pos(maskp->bits, | 1853 | bit = bitmap_ord_to_pos(maskp->bits, |
1850 | get_random_int() % w, MAX_NUMNODES); | 1854 | get_random_int() % w, MAX_NUMNODES); |
1851 | return bit; | 1855 | return bit; |
1852 | } | 1856 | } |
1853 | 1857 | ||
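node_random() above amounts to "pick the n-th set bit for a random n"; a tiny user-space model (the mask value is invented for illustration, and nth_set_bit() is a hypothetical stand-in for bitmap_ord_to_pos()):

#include <stdio.h>
#include <stdlib.h>
#include <time.h>

static int nth_set_bit(unsigned long mask, int n)
{
	int i;

	for (i = 0; i < (int)(8 * sizeof(mask)); i++)
		if ((mask & (1UL << i)) && n-- == 0)
			return i;
	return -1;				/* cf. NUMA_NO_NODE */
}

int main(void)
{
	unsigned long mask = (1UL << 0) | (1UL << 2) | (1UL << 5);
	int w = __builtin_popcountl(mask);

	srand(time(NULL));
	printf("random node: %d\n", w ? nth_set_bit(mask, rand() % w) : -1);
	return 0;
}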
1854 | #ifdef CONFIG_HUGETLBFS | 1858 | #ifdef CONFIG_HUGETLBFS |
1855 | /* | 1859 | /* |
1856 | * huge_zonelist(@vma, @addr, @gfp_flags, @mpol) | 1860 | * huge_zonelist(@vma, @addr, @gfp_flags, @mpol) |
1857 | * @vma = virtual memory area whose policy is sought | 1861 | * @vma = virtual memory area whose policy is sought |
1858 | * @addr = address in @vma for shared policy lookup and interleave policy | 1862 | * @addr = address in @vma for shared policy lookup and interleave policy |
1859 | * @gfp_flags = for requested zone | 1863 | * @gfp_flags = for requested zone |
1860 | * @mpol = pointer to mempolicy pointer for reference counted mempolicy | 1864 | * @mpol = pointer to mempolicy pointer for reference counted mempolicy |
1861 | * @nodemask = pointer to nodemask pointer for MPOL_BIND nodemask | 1865 | * @nodemask = pointer to nodemask pointer for MPOL_BIND nodemask |
1862 | * | 1866 | * |
1863 | * Returns a zonelist suitable for a huge page allocation and a pointer | 1867 | * Returns a zonelist suitable for a huge page allocation and a pointer |
1864 | * to the struct mempolicy for conditional unref after allocation. | 1868 | * to the struct mempolicy for conditional unref after allocation. |
1865 | * If the effective policy is 'BIND', returns a pointer to the mempolicy's | 1869 | * If the effective policy is 'BIND', returns a pointer to the mempolicy's |
1866 | * @nodemask for filtering the zonelist. | 1870 | * @nodemask for filtering the zonelist. |
1867 | * | 1871 | * |
1868 | * Must be protected by read_mems_allowed_begin() | 1872 | * Must be protected by read_mems_allowed_begin() |
1869 | */ | 1873 | */ |
1870 | struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr, | 1874 | struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr, |
1871 | gfp_t gfp_flags, struct mempolicy **mpol, | 1875 | gfp_t gfp_flags, struct mempolicy **mpol, |
1872 | nodemask_t **nodemask) | 1876 | nodemask_t **nodemask) |
1873 | { | 1877 | { |
1874 | struct zonelist *zl; | 1878 | struct zonelist *zl; |
1875 | 1879 | ||
1876 | *mpol = get_vma_policy(current, vma, addr); | 1880 | *mpol = get_vma_policy(current, vma, addr); |
1877 | *nodemask = NULL; /* assume !MPOL_BIND */ | 1881 | *nodemask = NULL; /* assume !MPOL_BIND */ |
1878 | 1882 | ||
1879 | if (unlikely((*mpol)->mode == MPOL_INTERLEAVE)) { | 1883 | if (unlikely((*mpol)->mode == MPOL_INTERLEAVE)) { |
1880 | zl = node_zonelist(interleave_nid(*mpol, vma, addr, | 1884 | zl = node_zonelist(interleave_nid(*mpol, vma, addr, |
1881 | huge_page_shift(hstate_vma(vma))), gfp_flags); | 1885 | huge_page_shift(hstate_vma(vma))), gfp_flags); |
1882 | } else { | 1886 | } else { |
1883 | zl = policy_zonelist(gfp_flags, *mpol, numa_node_id()); | 1887 | zl = policy_zonelist(gfp_flags, *mpol, numa_node_id()); |
1884 | if ((*mpol)->mode == MPOL_BIND) | 1888 | if ((*mpol)->mode == MPOL_BIND) |
1885 | *nodemask = &(*mpol)->v.nodes; | 1889 | *nodemask = &(*mpol)->v.nodes; |
1886 | } | 1890 | } |
1887 | return zl; | 1891 | return zl; |
1888 | } | 1892 | } |
1889 | 1893 | ||
1890 | /* | 1894 | /* |
1891 | * init_nodemask_of_mempolicy | 1895 | * init_nodemask_of_mempolicy |
1892 | * | 1896 | * |
1893 | * If the current task's mempolicy is "default" [NULL], return 'false' | 1897 | * If the current task's mempolicy is "default" [NULL], return 'false' |
1894 | * to indicate default policy. Otherwise, extract the policy nodemask | 1898 | * to indicate default policy. Otherwise, extract the policy nodemask |
1895 | * for 'bind' or 'interleave' policy into the argument nodemask, or | 1899 | * for 'bind' or 'interleave' policy into the argument nodemask, or |
1896 | * initialize the argument nodemask to contain the single node for | 1900 | * initialize the argument nodemask to contain the single node for |
1897 | * 'preferred' or 'local' policy and return 'true' to indicate presence | 1901 | * 'preferred' or 'local' policy and return 'true' to indicate presence |
1898 | * of non-default mempolicy. | 1902 | * of non-default mempolicy. |
1899 | * | 1903 | * |
1900 | * We don't bother with reference counting the mempolicy [mpol_get/put] | 1904 | * We don't bother with reference counting the mempolicy [mpol_get/put] |
1901 | * because the current task is examining its own mempolicy and a task's | 1905 | * because the current task is examining its own mempolicy and a task's |
1902 | * mempolicy is only ever changed by the task itself. | 1906 | * mempolicy is only ever changed by the task itself. |
1903 | * | 1907 | * |
1904 | * N.B., it is the caller's responsibility to free a returned nodemask. | 1908 | * N.B., it is the caller's responsibility to free a returned nodemask. |
1905 | */ | 1909 | */ |
1906 | bool init_nodemask_of_mempolicy(nodemask_t *mask) | 1910 | bool init_nodemask_of_mempolicy(nodemask_t *mask) |
1907 | { | 1911 | { |
1908 | struct mempolicy *mempolicy; | 1912 | struct mempolicy *mempolicy; |
1909 | int nid; | 1913 | int nid; |
1910 | 1914 | ||
1911 | if (!(mask && current->mempolicy)) | 1915 | if (!(mask && current->mempolicy)) |
1912 | return false; | 1916 | return false; |
1913 | 1917 | ||
1914 | task_lock(current); | 1918 | task_lock(current); |
1915 | mempolicy = current->mempolicy; | 1919 | mempolicy = current->mempolicy; |
1916 | switch (mempolicy->mode) { | 1920 | switch (mempolicy->mode) { |
1917 | case MPOL_PREFERRED: | 1921 | case MPOL_PREFERRED: |
1918 | if (mempolicy->flags & MPOL_F_LOCAL) | 1922 | if (mempolicy->flags & MPOL_F_LOCAL) |
1919 | nid = numa_node_id(); | 1923 | nid = numa_node_id(); |
1920 | else | 1924 | else |
1921 | nid = mempolicy->v.preferred_node; | 1925 | nid = mempolicy->v.preferred_node; |
1922 | init_nodemask_of_node(mask, nid); | 1926 | init_nodemask_of_node(mask, nid); |
1923 | break; | 1927 | break; |
1924 | 1928 | ||
1925 | case MPOL_BIND: | 1929 | case MPOL_BIND: |
1926 | /* Fall through */ | 1930 | /* Fall through */ |
1927 | case MPOL_INTERLEAVE: | 1931 | case MPOL_INTERLEAVE: |
1928 | *mask = mempolicy->v.nodes; | 1932 | *mask = mempolicy->v.nodes; |
1929 | break; | 1933 | break; |
1930 | 1934 | ||
1931 | default: | 1935 | default: |
1932 | BUG(); | 1936 | BUG(); |
1933 | } | 1937 | } |
1934 | task_unlock(current); | 1938 | task_unlock(current); |
1935 | 1939 | ||
1936 | return true; | 1940 | return true; |
1937 | } | 1941 | } |
1938 | #endif | 1942 | #endif |
1939 | 1943 | ||
1940 | /* | 1944 | /* |
1941 | * mempolicy_nodemask_intersects | 1945 | * mempolicy_nodemask_intersects |
1942 | * | 1946 | * |
1943 | * If tsk's mempolicy is "default" [NULL], return 'true' to indicate default | 1947 | * If tsk's mempolicy is "default" [NULL], return 'true' to indicate default |
1944 | * policy. Otherwise, check for intersection between mask and the policy | 1948 | * policy. Otherwise, check for intersection between mask and the policy |
1945 | * nodemask for 'bind' or 'interleave' policy. For 'preferred' or 'local' | 1949 | * nodemask for 'bind' or 'interleave' policy. For 'preferred' or 'local' |
1946 | * policy, always return true since it may allocate elsewhere on fallback. | 1950 | * policy, always return true since it may allocate elsewhere on fallback. |
1947 | * | 1951 | * |
1948 | * Takes task_lock(tsk) to prevent freeing of its mempolicy. | 1952 | * Takes task_lock(tsk) to prevent freeing of its mempolicy. |
1949 | */ | 1953 | */ |
1950 | bool mempolicy_nodemask_intersects(struct task_struct *tsk, | 1954 | bool mempolicy_nodemask_intersects(struct task_struct *tsk, |
1951 | const nodemask_t *mask) | 1955 | const nodemask_t *mask) |
1952 | { | 1956 | { |
1953 | struct mempolicy *mempolicy; | 1957 | struct mempolicy *mempolicy; |
1954 | bool ret = true; | 1958 | bool ret = true; |
1955 | 1959 | ||
1956 | if (!mask) | 1960 | if (!mask) |
1957 | return ret; | 1961 | return ret; |
1958 | task_lock(tsk); | 1962 | task_lock(tsk); |
1959 | mempolicy = tsk->mempolicy; | 1963 | mempolicy = tsk->mempolicy; |
1960 | if (!mempolicy) | 1964 | if (!mempolicy) |
1961 | goto out; | 1965 | goto out; |
1962 | 1966 | ||
1963 | switch (mempolicy->mode) { | 1967 | switch (mempolicy->mode) { |
1964 | case MPOL_PREFERRED: | 1968 | case MPOL_PREFERRED: |
1965 | /* | 1969 | /* |
1966 | * MPOL_PREFERRED and MPOL_F_LOCAL only express preferred nodes to | 1970 | * MPOL_PREFERRED and MPOL_F_LOCAL only express preferred nodes to |
1967 | * allocate from; they may fall back to other nodes when OOM. | 1971 | * allocate from; they may fall back to other nodes when OOM. |
1968 | * Thus, it's possible for tsk to have allocated memory from | 1972 | * Thus, it's possible for tsk to have allocated memory from |
1969 | * nodes in mask. | 1973 | * nodes in mask. |
1970 | */ | 1974 | */ |
1971 | break; | 1975 | break; |
1972 | case MPOL_BIND: | 1976 | case MPOL_BIND: |
1973 | case MPOL_INTERLEAVE: | 1977 | case MPOL_INTERLEAVE: |
1974 | ret = nodes_intersects(mempolicy->v.nodes, *mask); | 1978 | ret = nodes_intersects(mempolicy->v.nodes, *mask); |
1975 | break; | 1979 | break; |
1976 | default: | 1980 | default: |
1977 | BUG(); | 1981 | BUG(); |
1978 | } | 1982 | } |
1979 | out: | 1983 | out: |
1980 | task_unlock(tsk); | 1984 | task_unlock(tsk); |
1981 | return ret; | 1985 | return ret; |
1982 | } | 1986 | } |
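
Illustrative sketch (not part of this commit): the natural caller is a constrained-OOM style check asking whether a candidate task can plausibly hold memory on the nodes under pressure; the wrapper name below is hypothetical.

	/* Sketch: may @tsk hold memory on the nodes in @mask? */
	static bool task_may_hold_memory_on(struct task_struct *tsk,
					    const nodemask_t *mask)
	{
		/*
		 * 'bind'/'interleave' are checked for a real intersection;
		 * 'default', 'preferred' and 'local' conservatively report
		 * true because they may have fallen back to any node.
		 */
		return mempolicy_nodemask_intersects(tsk, mask);
	}
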
1983 | 1987 | ||
1984 | /* Allocate a page in interleaved policy. | 1988 | /* Allocate a page in interleaved policy. |
1985 | Own path because it needs to do special accounting. */ | 1989 | Own path because it needs to do special accounting. */ |
1986 | static struct page *alloc_page_interleave(gfp_t gfp, unsigned order, | 1990 | static struct page *alloc_page_interleave(gfp_t gfp, unsigned order, |
1987 | unsigned nid) | 1991 | unsigned nid) |
1988 | { | 1992 | { |
1989 | struct zonelist *zl; | 1993 | struct zonelist *zl; |
1990 | struct page *page; | 1994 | struct page *page; |
1991 | 1995 | ||
1992 | zl = node_zonelist(nid, gfp); | 1996 | zl = node_zonelist(nid, gfp); |
1993 | page = __alloc_pages(gfp, order, zl); | 1997 | page = __alloc_pages(gfp, order, zl); |
1994 | if (page && page_zone(page) == zonelist_zone(&zl->_zonerefs[0])) | 1998 | if (page && page_zone(page) == zonelist_zone(&zl->_zonerefs[0])) |
1995 | inc_zone_page_state(page, NUMA_INTERLEAVE_HIT); | 1999 | inc_zone_page_state(page, NUMA_INTERLEAVE_HIT); |
1996 | return page; | 2000 | return page; |
1997 | } | 2001 | } |
1998 | 2002 | ||
1999 | /** | 2003 | /** |
2000 | * alloc_pages_vma - Allocate a page for a VMA. | 2004 | * alloc_pages_vma - Allocate a page for a VMA. |
2001 | * | 2005 | * |
2002 | * @gfp: | 2006 | * @gfp: |
2003 | * %GFP_USER user allocation. | 2007 | * %GFP_USER user allocation. |
2004 | * %GFP_KERNEL kernel allocations, | 2008 | * %GFP_KERNEL kernel allocations, |
2005 | * %GFP_HIGHMEM highmem/user allocations, | 2009 | * %GFP_HIGHMEM highmem/user allocations, |
2006 | * %GFP_FS allocation should not call back into a file system. | 2010 | * %GFP_FS allocation should not call back into a file system. |
2007 | * %GFP_ATOMIC don't sleep. | 2011 | * %GFP_ATOMIC don't sleep. |
2008 | * | 2012 | * |
2009 | * @order: Order of the GFP allocation. | 2013 | * @order: Order of the GFP allocation. |
2010 | * @vma: Pointer to VMA or NULL if not available. | 2014 | * @vma: Pointer to VMA or NULL if not available. |
2011 | * @addr: Virtual Address of the allocation. Must be inside the VMA. | 2015 | * @addr: Virtual Address of the allocation. Must be inside the VMA. |
2012 | * | 2016 | * |
2013 | * This function allocates a page from the kernel page pool and applies | 2017 | * This function allocates a page from the kernel page pool and applies |
2014 | * a NUMA policy associated with the VMA or the current process. | 2018 | * a NUMA policy associated with the VMA or the current process. |
2015 | * When VMA is not NULL caller must hold down_read on the mmap_sem of the | 2019 | * When VMA is not NULL caller must hold down_read on the mmap_sem of the |
2016 | * mm_struct of the VMA to prevent it from going away. Should be used for | 2020 | * mm_struct of the VMA to prevent it from going away. Should be used for |
2017 | * all allocations for pages that will be mapped into | 2021 | * all allocations for pages that will be mapped into |
2018 | * user space. Returns NULL when no page can be allocated. | 2022 | * user space. Returns NULL when no page can be allocated. |
2019 | * | 2023 | * |
2020 | * Should be called with the mmap_sem of the vma held. | 2024 | * Should be called with the mmap_sem of the vma held. |
2021 | */ | 2025 | */ |
2022 | struct page * | 2026 | struct page * |
2023 | alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma, | 2027 | alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma, |
2024 | unsigned long addr, int node) | 2028 | unsigned long addr, int node) |
2025 | { | 2029 | { |
2026 | struct mempolicy *pol; | 2030 | struct mempolicy *pol; |
2027 | struct page *page; | 2031 | struct page *page; |
2028 | unsigned int cpuset_mems_cookie; | 2032 | unsigned int cpuset_mems_cookie; |
2029 | 2033 | ||
2030 | retry_cpuset: | 2034 | retry_cpuset: |
2031 | pol = get_vma_policy(current, vma, addr); | 2035 | pol = get_vma_policy(current, vma, addr); |
2032 | cpuset_mems_cookie = read_mems_allowed_begin(); | 2036 | cpuset_mems_cookie = read_mems_allowed_begin(); |
2033 | 2037 | ||
2034 | if (unlikely(pol->mode == MPOL_INTERLEAVE)) { | 2038 | if (unlikely(pol->mode == MPOL_INTERLEAVE)) { |
2035 | unsigned nid; | 2039 | unsigned nid; |
2036 | 2040 | ||
2037 | nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order); | 2041 | nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order); |
2038 | mpol_cond_put(pol); | 2042 | mpol_cond_put(pol); |
2039 | page = alloc_page_interleave(gfp, order, nid); | 2043 | page = alloc_page_interleave(gfp, order, nid); |
2040 | if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie))) | 2044 | if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie))) |
2041 | goto retry_cpuset; | 2045 | goto retry_cpuset; |
2042 | 2046 | ||
2043 | return page; | 2047 | return page; |
2044 | } | 2048 | } |
2045 | page = __alloc_pages_nodemask(gfp, order, | 2049 | page = __alloc_pages_nodemask(gfp, order, |
2046 | policy_zonelist(gfp, pol, node), | 2050 | policy_zonelist(gfp, pol, node), |
2047 | policy_nodemask(gfp, pol)); | 2051 | policy_nodemask(gfp, pol)); |
2048 | if (unlikely(mpol_needs_cond_ref(pol))) | 2052 | if (unlikely(mpol_needs_cond_ref(pol))) |
2049 | __mpol_put(pol); | 2053 | __mpol_put(pol); |
2050 | if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie))) | 2054 | if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie))) |
2051 | goto retry_cpuset; | 2055 | goto retry_cpuset; |
2052 | return page; | 2056 | return page; |
2053 | } | 2057 | } |
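
Illustrative sketch (not part of this commit): a fault-path style caller, assuming mmap_sem is already held for read; the helper name is hypothetical. For order-0 allocations the alloc_page_vma() wrapper in gfp.h expands to essentially this call.

	/* Sketch: allocate one user page at @address obeying the VMA's policy. */
	static struct page *alloc_one_user_page(struct vm_area_struct *vma,
						unsigned long address)
	{
		return alloc_pages_vma(GFP_HIGHUSER_MOVABLE, 0, vma, address,
				       numa_node_id());
	}
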
2054 | 2058 | ||
2055 | /** | 2059 | /** |
2056 | * alloc_pages_current - Allocate pages. | 2060 | * alloc_pages_current - Allocate pages. |
2057 | * | 2061 | * |
2058 | * @gfp: | 2062 | * @gfp: |
2059 | * %GFP_USER user allocation, | 2063 | * %GFP_USER user allocation, |
2060 | * %GFP_KERNEL kernel allocation, | 2064 | * %GFP_KERNEL kernel allocation, |
2061 | * %GFP_HIGHMEM highmem allocation, | 2065 | * %GFP_HIGHMEM highmem allocation, |
2062 | * %GFP_FS don't call back into a file system. | 2066 | * %GFP_FS don't call back into a file system. |
2063 | * %GFP_ATOMIC don't sleep. | 2067 | * %GFP_ATOMIC don't sleep. |
2064 | * @order: Power of two of allocation size in pages. 0 is a single page. | 2068 | * @order: Power of two of allocation size in pages. 0 is a single page. |
2065 | * | 2069 | * |
2066 | * Allocate a page from the kernel page pool. When not in | 2070 | * Allocate a page from the kernel page pool. When not in |
2067 | * interrupt context, apply the current process's NUMA policy. | 2071 | * interrupt context, apply the current process's NUMA policy. |
2068 | * Returns NULL when no page can be allocated. | 2072 | * Returns NULL when no page can be allocated. |
2069 | * | 2073 | * |
2070 | * Don't call cpuset_update_task_memory_state() unless | 2074 | * Don't call cpuset_update_task_memory_state() unless |
2071 | * 1) it's ok to take cpuset_sem (can WAIT), and | 2075 | * 1) it's ok to take cpuset_sem (can WAIT), and |
2072 | * 2) allocating for current task (not interrupt). | 2076 | * 2) allocating for current task (not interrupt). |
2073 | */ | 2077 | */ |
2074 | struct page *alloc_pages_current(gfp_t gfp, unsigned order) | 2078 | struct page *alloc_pages_current(gfp_t gfp, unsigned order) |
2075 | { | 2079 | { |
2076 | struct mempolicy *pol = get_task_policy(current); | 2080 | struct mempolicy *pol = get_task_policy(current); |
2077 | struct page *page; | 2081 | struct page *page; |
2078 | unsigned int cpuset_mems_cookie; | 2082 | unsigned int cpuset_mems_cookie; |
2079 | 2083 | ||
2080 | if (!pol || in_interrupt() || (gfp & __GFP_THISNODE)) | 2084 | if (!pol || in_interrupt() || (gfp & __GFP_THISNODE)) |
2081 | pol = &default_policy; | 2085 | pol = &default_policy; |
2082 | 2086 | ||
2083 | retry_cpuset: | 2087 | retry_cpuset: |
2084 | cpuset_mems_cookie = read_mems_allowed_begin(); | 2088 | cpuset_mems_cookie = read_mems_allowed_begin(); |
2085 | 2089 | ||
2086 | /* | 2090 | /* |
2087 | * No reference counting needed for current->mempolicy | 2091 | * No reference counting needed for current->mempolicy |
2088 | * nor system default_policy | 2092 | * nor system default_policy |
2089 | */ | 2093 | */ |
2090 | if (pol->mode == MPOL_INTERLEAVE) | 2094 | if (pol->mode == MPOL_INTERLEAVE) |
2091 | page = alloc_page_interleave(gfp, order, interleave_nodes(pol)); | 2095 | page = alloc_page_interleave(gfp, order, interleave_nodes(pol)); |
2092 | else | 2096 | else |
2093 | page = __alloc_pages_nodemask(gfp, order, | 2097 | page = __alloc_pages_nodemask(gfp, order, |
2094 | policy_zonelist(gfp, pol, numa_node_id()), | 2098 | policy_zonelist(gfp, pol, numa_node_id()), |
2095 | policy_nodemask(gfp, pol)); | 2099 | policy_nodemask(gfp, pol)); |
2096 | 2100 | ||
2097 | if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie))) | 2101 | if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie))) |
2098 | goto retry_cpuset; | 2102 | goto retry_cpuset; |
2099 | 2103 | ||
2100 | return page; | 2104 | return page; |
2101 | } | 2105 | } |
2102 | EXPORT_SYMBOL(alloc_pages_current); | 2106 | EXPORT_SYMBOL(alloc_pages_current); |
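
Illustrative sketch (not part of this commit): on NUMA kernels the generic alloc_pages() macro resolves to alloc_pages_current(), so an ordinary caller looks roughly like this; the function name is hypothetical.

	static void alloc_pages_current_example(void)
	{
		struct page *page = alloc_pages(GFP_KERNEL, 2);	/* four contiguous pages */

		if (!page)
			return;
		memset(page_address(page), 0, 4 * PAGE_SIZE);	/* ... use the pages ... */
		__free_pages(page, 2);
	}
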
2103 | 2107 | ||
2104 | int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst) | 2108 | int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst) |
2105 | { | 2109 | { |
2106 | struct mempolicy *pol = mpol_dup(vma_policy(src)); | 2110 | struct mempolicy *pol = mpol_dup(vma_policy(src)); |
2107 | 2111 | ||
2108 | if (IS_ERR(pol)) | 2112 | if (IS_ERR(pol)) |
2109 | return PTR_ERR(pol); | 2113 | return PTR_ERR(pol); |
2110 | dst->vm_policy = pol; | 2114 | dst->vm_policy = pol; |
2111 | return 0; | 2115 | return 0; |
2112 | } | 2116 | } |
2113 | 2117 | ||
2114 | /* | 2118 | /* |
2115 | * If mpol_dup() sees current->cpuset == cpuset_being_rebound, then it | 2119 | * If mpol_dup() sees current->cpuset == cpuset_being_rebound, then it |
2116 | * rebinds the mempolicy it is copying by calling mpol_rebind_policy() | 2120 | * rebinds the mempolicy it is copying by calling mpol_rebind_policy() |
2117 | * with the mems_allowed returned by cpuset_mems_allowed(). This | 2121 | * with the mems_allowed returned by cpuset_mems_allowed(). This |
2118 | * keeps mempolicies cpuset relative after its cpuset moves. See | 2122 | * keeps mempolicies cpuset relative after its cpuset moves. See |
2119 | * further kernel/cpuset.c update_nodemask(). | 2123 | * further kernel/cpuset.c update_nodemask(). |
2120 | * | 2124 | * |
2121 | * current's mempolicy may be rebound by another task (the task that changes | 2125 | * current's mempolicy may be rebound by another task (the task that changes |
2122 | * cpuset's mems), so we needn't do the rebind work for the current task. | 2126 | * cpuset's mems), so we needn't do the rebind work for the current task. |
2123 | */ | 2127 | */ |
2124 | 2128 | ||
2125 | /* Slow path of a mempolicy duplicate */ | 2129 | /* Slow path of a mempolicy duplicate */ |
2126 | struct mempolicy *__mpol_dup(struct mempolicy *old) | 2130 | struct mempolicy *__mpol_dup(struct mempolicy *old) |
2127 | { | 2131 | { |
2128 | struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL); | 2132 | struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL); |
2129 | 2133 | ||
2130 | if (!new) | 2134 | if (!new) |
2131 | return ERR_PTR(-ENOMEM); | 2135 | return ERR_PTR(-ENOMEM); |
2132 | 2136 | ||
2133 | /* task's mempolicy is protected by alloc_lock */ | 2137 | /* task's mempolicy is protected by alloc_lock */ |
2134 | if (old == current->mempolicy) { | 2138 | if (old == current->mempolicy) { |
2135 | task_lock(current); | 2139 | task_lock(current); |
2136 | *new = *old; | 2140 | *new = *old; |
2137 | task_unlock(current); | 2141 | task_unlock(current); |
2138 | } else | 2142 | } else |
2139 | *new = *old; | 2143 | *new = *old; |
2140 | 2144 | ||
2141 | rcu_read_lock(); | 2145 | rcu_read_lock(); |
2142 | if (current_cpuset_is_being_rebound()) { | 2146 | if (current_cpuset_is_being_rebound()) { |
2143 | nodemask_t mems = cpuset_mems_allowed(current); | 2147 | nodemask_t mems = cpuset_mems_allowed(current); |
2144 | if (new->flags & MPOL_F_REBINDING) | 2148 | if (new->flags & MPOL_F_REBINDING) |
2145 | mpol_rebind_policy(new, &mems, MPOL_REBIND_STEP2); | 2149 | mpol_rebind_policy(new, &mems, MPOL_REBIND_STEP2); |
2146 | else | 2150 | else |
2147 | mpol_rebind_policy(new, &mems, MPOL_REBIND_ONCE); | 2151 | mpol_rebind_policy(new, &mems, MPOL_REBIND_ONCE); |
2148 | } | 2152 | } |
2149 | rcu_read_unlock(); | 2153 | rcu_read_unlock(); |
2150 | atomic_set(&new->refcnt, 1); | 2154 | atomic_set(&new->refcnt, 1); |
2151 | return new; | 2155 | return new; |
2152 | } | 2156 | } |
2153 | 2157 | ||
2154 | /* Slow path of a mempolicy comparison */ | 2158 | /* Slow path of a mempolicy comparison */ |
2155 | bool __mpol_equal(struct mempolicy *a, struct mempolicy *b) | 2159 | bool __mpol_equal(struct mempolicy *a, struct mempolicy *b) |
2156 | { | 2160 | { |
2157 | if (!a || !b) | 2161 | if (!a || !b) |
2158 | return false; | 2162 | return false; |
2159 | if (a->mode != b->mode) | 2163 | if (a->mode != b->mode) |
2160 | return false; | 2164 | return false; |
2161 | if (a->flags != b->flags) | 2165 | if (a->flags != b->flags) |
2162 | return false; | 2166 | return false; |
2163 | if (mpol_store_user_nodemask(a)) | 2167 | if (mpol_store_user_nodemask(a)) |
2164 | if (!nodes_equal(a->w.user_nodemask, b->w.user_nodemask)) | 2168 | if (!nodes_equal(a->w.user_nodemask, b->w.user_nodemask)) |
2165 | return false; | 2169 | return false; |
2166 | 2170 | ||
2167 | switch (a->mode) { | 2171 | switch (a->mode) { |
2168 | case MPOL_BIND: | 2172 | case MPOL_BIND: |
2169 | /* Fall through */ | 2173 | /* Fall through */ |
2170 | case MPOL_INTERLEAVE: | 2174 | case MPOL_INTERLEAVE: |
2171 | return !!nodes_equal(a->v.nodes, b->v.nodes); | 2175 | return !!nodes_equal(a->v.nodes, b->v.nodes); |
2172 | case MPOL_PREFERRED: | 2176 | case MPOL_PREFERRED: |
2173 | return a->v.preferred_node == b->v.preferred_node; | 2177 | return a->v.preferred_node == b->v.preferred_node; |
2174 | default: | 2178 | default: |
2175 | BUG(); | 2179 | BUG(); |
2176 | return false; | 2180 | return false; |
2177 | } | 2181 | } |
2178 | } | 2182 | } |
2179 | 2183 | ||
2180 | /* | 2184 | /* |
2181 | * Shared memory backing store policy support. | 2185 | * Shared memory backing store policy support. |
2182 | * | 2186 | * |
2183 | * Remember policies even when nobody has shared memory mapped. | 2187 | * Remember policies even when nobody has shared memory mapped. |
2184 | * The policies are kept in Red-Black tree linked from the inode. | 2188 | * The policies are kept in Red-Black tree linked from the inode. |
2185 | * They are protected by the sp->lock spinlock, which should be held | 2189 | * They are protected by the sp->lock spinlock, which should be held |
2186 | * for any accesses to the tree. | 2190 | * for any accesses to the tree. |
2187 | */ | 2191 | */ |
2188 | 2192 | ||
2189 | /* lookup first element intersecting start-end */ | 2193 | /* lookup first element intersecting start-end */ |
2190 | /* Caller holds sp->lock */ | 2194 | /* Caller holds sp->lock */ |
2191 | static struct sp_node * | 2195 | static struct sp_node * |
2192 | sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end) | 2196 | sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end) |
2193 | { | 2197 | { |
2194 | struct rb_node *n = sp->root.rb_node; | 2198 | struct rb_node *n = sp->root.rb_node; |
2195 | 2199 | ||
2196 | while (n) { | 2200 | while (n) { |
2197 | struct sp_node *p = rb_entry(n, struct sp_node, nd); | 2201 | struct sp_node *p = rb_entry(n, struct sp_node, nd); |
2198 | 2202 | ||
2199 | if (start >= p->end) | 2203 | if (start >= p->end) |
2200 | n = n->rb_right; | 2204 | n = n->rb_right; |
2201 | else if (end <= p->start) | 2205 | else if (end <= p->start) |
2202 | n = n->rb_left; | 2206 | n = n->rb_left; |
2203 | else | 2207 | else |
2204 | break; | 2208 | break; |
2205 | } | 2209 | } |
2206 | if (!n) | 2210 | if (!n) |
2207 | return NULL; | 2211 | return NULL; |
2208 | for (;;) { | 2212 | for (;;) { |
2209 | struct sp_node *w = NULL; | 2213 | struct sp_node *w = NULL; |
2210 | struct rb_node *prev = rb_prev(n); | 2214 | struct rb_node *prev = rb_prev(n); |
2211 | if (!prev) | 2215 | if (!prev) |
2212 | break; | 2216 | break; |
2213 | w = rb_entry(prev, struct sp_node, nd); | 2217 | w = rb_entry(prev, struct sp_node, nd); |
2214 | if (w->end <= start) | 2218 | if (w->end <= start) |
2215 | break; | 2219 | break; |
2216 | n = prev; | 2220 | n = prev; |
2217 | } | 2221 | } |
2218 | return rb_entry(n, struct sp_node, nd); | 2222 | return rb_entry(n, struct sp_node, nd); |
2219 | } | 2223 | } |
2220 | 2224 | ||
2221 | /* Insert a new shared policy into the list. */ | 2225 | /* Insert a new shared policy into the list. */ |
2222 | /* Caller holds sp->lock */ | 2226 | /* Caller holds sp->lock */ |
2223 | static void sp_insert(struct shared_policy *sp, struct sp_node *new) | 2227 | static void sp_insert(struct shared_policy *sp, struct sp_node *new) |
2224 | { | 2228 | { |
2225 | struct rb_node **p = &sp->root.rb_node; | 2229 | struct rb_node **p = &sp->root.rb_node; |
2226 | struct rb_node *parent = NULL; | 2230 | struct rb_node *parent = NULL; |
2227 | struct sp_node *nd; | 2231 | struct sp_node *nd; |
2228 | 2232 | ||
2229 | while (*p) { | 2233 | while (*p) { |
2230 | parent = *p; | 2234 | parent = *p; |
2231 | nd = rb_entry(parent, struct sp_node, nd); | 2235 | nd = rb_entry(parent, struct sp_node, nd); |
2232 | if (new->start < nd->start) | 2236 | if (new->start < nd->start) |
2233 | p = &(*p)->rb_left; | 2237 | p = &(*p)->rb_left; |
2234 | else if (new->end > nd->end) | 2238 | else if (new->end > nd->end) |
2235 | p = &(*p)->rb_right; | 2239 | p = &(*p)->rb_right; |
2236 | else | 2240 | else |
2237 | BUG(); | 2241 | BUG(); |
2238 | } | 2242 | } |
2239 | rb_link_node(&new->nd, parent, p); | 2243 | rb_link_node(&new->nd, parent, p); |
2240 | rb_insert_color(&new->nd, &sp->root); | 2244 | rb_insert_color(&new->nd, &sp->root); |
2241 | pr_debug("inserting %lx-%lx: %d\n", new->start, new->end, | 2245 | pr_debug("inserting %lx-%lx: %d\n", new->start, new->end, |
2242 | new->policy ? new->policy->mode : 0); | 2246 | new->policy ? new->policy->mode : 0); |
2243 | } | 2247 | } |
2244 | 2248 | ||
2245 | /* Find shared policy intersecting idx */ | 2249 | /* Find shared policy intersecting idx */ |
2246 | struct mempolicy * | 2250 | struct mempolicy * |
2247 | mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx) | 2251 | mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx) |
2248 | { | 2252 | { |
2249 | struct mempolicy *pol = NULL; | 2253 | struct mempolicy *pol = NULL; |
2250 | struct sp_node *sn; | 2254 | struct sp_node *sn; |
2251 | 2255 | ||
2252 | if (!sp->root.rb_node) | 2256 | if (!sp->root.rb_node) |
2253 | return NULL; | 2257 | return NULL; |
2254 | spin_lock(&sp->lock); | 2258 | spin_lock(&sp->lock); |
2255 | sn = sp_lookup(sp, idx, idx+1); | 2259 | sn = sp_lookup(sp, idx, idx+1); |
2256 | if (sn) { | 2260 | if (sn) { |
2257 | mpol_get(sn->policy); | 2261 | mpol_get(sn->policy); |
2258 | pol = sn->policy; | 2262 | pol = sn->policy; |
2259 | } | 2263 | } |
2260 | spin_unlock(&sp->lock); | 2264 | spin_unlock(&sp->lock); |
2261 | return pol; | 2265 | return pol; |
2262 | } | 2266 | } |
2263 | 2267 | ||
2264 | static void sp_free(struct sp_node *n) | 2268 | static void sp_free(struct sp_node *n) |
2265 | { | 2269 | { |
2266 | mpol_put(n->policy); | 2270 | mpol_put(n->policy); |
2267 | kmem_cache_free(sn_cache, n); | 2271 | kmem_cache_free(sn_cache, n); |
2268 | } | 2272 | } |
2269 | 2273 | ||
2270 | /** | 2274 | /** |
2271 | * mpol_misplaced - check whether current page node is valid in policy | 2275 | * mpol_misplaced - check whether current page node is valid in policy |
2272 | * | 2276 | * |
2273 | * @page - page to be checked | 2277 | * @page - page to be checked |
2274 | * @vma - vm area where page mapped | 2278 | * @vma - vm area where page mapped |
2275 | * @addr - virtual address where page mapped | 2279 | * @addr - virtual address where page mapped |
2276 | * | 2280 | * |
2277 | * Look up the current policy node id for vma,addr and "compare to" the page's | 2281 | * Look up the current policy node id for vma,addr and "compare to" the page's |
2278 | * node id. | 2282 | * node id. |
2279 | * | 2283 | * |
2280 | * Returns: | 2284 | * Returns: |
2281 | * -1 - not misplaced, page is in the right node | 2285 | * -1 - not misplaced, page is in the right node |
2282 | * node - node id where the page should be | 2286 | * node - node id where the page should be |
2283 | * | 2287 | * |
2284 | * Policy determination "mimics" alloc_page_vma(). | 2288 | * Policy determination "mimics" alloc_page_vma(). |
2285 | * Called from fault path where we know the vma and faulting address. | 2289 | * Called from fault path where we know the vma and faulting address. |
2286 | */ | 2290 | */ |
2287 | int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long addr) | 2291 | int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long addr) |
2288 | { | 2292 | { |
2289 | struct mempolicy *pol; | 2293 | struct mempolicy *pol; |
2290 | struct zone *zone; | 2294 | struct zone *zone; |
2291 | int curnid = page_to_nid(page); | 2295 | int curnid = page_to_nid(page); |
2292 | unsigned long pgoff; | 2296 | unsigned long pgoff; |
2293 | int thiscpu = raw_smp_processor_id(); | 2297 | int thiscpu = raw_smp_processor_id(); |
2294 | int thisnid = cpu_to_node(thiscpu); | 2298 | int thisnid = cpu_to_node(thiscpu); |
2295 | int polnid = -1; | 2299 | int polnid = -1; |
2296 | int ret = -1; | 2300 | int ret = -1; |
2297 | 2301 | ||
2298 | BUG_ON(!vma); | 2302 | BUG_ON(!vma); |
2299 | 2303 | ||
2300 | pol = get_vma_policy(current, vma, addr); | 2304 | pol = get_vma_policy(current, vma, addr); |
2301 | if (!(pol->flags & MPOL_F_MOF)) | 2305 | if (!(pol->flags & MPOL_F_MOF)) |
2302 | goto out; | 2306 | goto out; |
2303 | 2307 | ||
2304 | switch (pol->mode) { | 2308 | switch (pol->mode) { |
2305 | case MPOL_INTERLEAVE: | 2309 | case MPOL_INTERLEAVE: |
2306 | BUG_ON(addr >= vma->vm_end); | 2310 | BUG_ON(addr >= vma->vm_end); |
2307 | BUG_ON(addr < vma->vm_start); | 2311 | BUG_ON(addr < vma->vm_start); |
2308 | 2312 | ||
2309 | pgoff = vma->vm_pgoff; | 2313 | pgoff = vma->vm_pgoff; |
2310 | pgoff += (addr - vma->vm_start) >> PAGE_SHIFT; | 2314 | pgoff += (addr - vma->vm_start) >> PAGE_SHIFT; |
2311 | polnid = offset_il_node(pol, vma, pgoff); | 2315 | polnid = offset_il_node(pol, vma, pgoff); |
2312 | break; | 2316 | break; |
2313 | 2317 | ||
2314 | case MPOL_PREFERRED: | 2318 | case MPOL_PREFERRED: |
2315 | if (pol->flags & MPOL_F_LOCAL) | 2319 | if (pol->flags & MPOL_F_LOCAL) |
2316 | polnid = numa_node_id(); | 2320 | polnid = numa_node_id(); |
2317 | else | 2321 | else |
2318 | polnid = pol->v.preferred_node; | 2322 | polnid = pol->v.preferred_node; |
2319 | break; | 2323 | break; |
2320 | 2324 | ||
2321 | case MPOL_BIND: | 2325 | case MPOL_BIND: |
2322 | /* | 2326 | /* |
2323 | * allows binding to multiple nodes. | 2327 | * allows binding to multiple nodes. |
2324 | * use current page if in policy nodemask, | 2328 | * use current page if in policy nodemask, |
2325 | * else select nearest allowed node, if any. | 2329 | * else select nearest allowed node, if any. |
2326 | * If no allowed nodes, use current [!misplaced]. | 2330 | * If no allowed nodes, use current [!misplaced]. |
2327 | */ | 2331 | */ |
2328 | if (node_isset(curnid, pol->v.nodes)) | 2332 | if (node_isset(curnid, pol->v.nodes)) |
2329 | goto out; | 2333 | goto out; |
2330 | (void)first_zones_zonelist( | 2334 | (void)first_zones_zonelist( |
2331 | node_zonelist(numa_node_id(), GFP_HIGHUSER), | 2335 | node_zonelist(numa_node_id(), GFP_HIGHUSER), |
2332 | gfp_zone(GFP_HIGHUSER), | 2336 | gfp_zone(GFP_HIGHUSER), |
2333 | &pol->v.nodes, &zone); | 2337 | &pol->v.nodes, &zone); |
2334 | polnid = zone->node; | 2338 | polnid = zone->node; |
2335 | break; | 2339 | break; |
2336 | 2340 | ||
2337 | default: | 2341 | default: |
2338 | BUG(); | 2342 | BUG(); |
2339 | } | 2343 | } |
2340 | 2344 | ||
2341 | /* Migrate the page towards the node whose CPU is referencing it */ | 2345 | /* Migrate the page towards the node whose CPU is referencing it */ |
2342 | if (pol->flags & MPOL_F_MORON) { | 2346 | if (pol->flags & MPOL_F_MORON) { |
2343 | polnid = thisnid; | 2347 | polnid = thisnid; |
2344 | 2348 | ||
2345 | if (!should_numa_migrate_memory(current, page, curnid, thiscpu)) | 2349 | if (!should_numa_migrate_memory(current, page, curnid, thiscpu)) |
2346 | goto out; | 2350 | goto out; |
2347 | } | 2351 | } |
2348 | 2352 | ||
2349 | if (curnid != polnid) | 2353 | if (curnid != polnid) |
2350 | ret = polnid; | 2354 | ret = polnid; |
2351 | out: | 2355 | out: |
2352 | mpol_cond_put(pol); | 2356 | mpol_cond_put(pol); |
2353 | 2357 | ||
2354 | return ret; | 2358 | return ret; |
2355 | } | 2359 | } |
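
Illustrative sketch (not part of this commit): mpol_misplaced() is consumed by the NUMA-hinting fault path; loosely modelled on do_numa_page(), the pattern is to migrate when a better node is reported. Page reference counting and fault statistics are deliberately omitted here.

	/* Sketch: move @page toward its policy node if it is misplaced. */
	static void maybe_migrate_misplaced(struct page *page,
					    struct vm_area_struct *vma,
					    unsigned long addr)
	{
		int target_nid = mpol_misplaced(page, vma, addr);

		if (target_nid != -1)
			migrate_misplaced_page(page, vma, target_nid);	/* best effort */
	}
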
2356 | 2360 | ||
2357 | static void sp_delete(struct shared_policy *sp, struct sp_node *n) | 2361 | static void sp_delete(struct shared_policy *sp, struct sp_node *n) |
2358 | { | 2362 | { |
2359 | pr_debug("deleting %lx-%lx\n", n->start, n->end); | 2363 | pr_debug("deleting %lx-%lx\n", n->start, n->end); |
2360 | rb_erase(&n->nd, &sp->root); | 2364 | rb_erase(&n->nd, &sp->root); |
2361 | sp_free(n); | 2365 | sp_free(n); |
2362 | } | 2366 | } |
2363 | 2367 | ||
2364 | static void sp_node_init(struct sp_node *node, unsigned long start, | 2368 | static void sp_node_init(struct sp_node *node, unsigned long start, |
2365 | unsigned long end, struct mempolicy *pol) | 2369 | unsigned long end, struct mempolicy *pol) |
2366 | { | 2370 | { |
2367 | node->start = start; | 2371 | node->start = start; |
2368 | node->end = end; | 2372 | node->end = end; |
2369 | node->policy = pol; | 2373 | node->policy = pol; |
2370 | } | 2374 | } |
2371 | 2375 | ||
2372 | static struct sp_node *sp_alloc(unsigned long start, unsigned long end, | 2376 | static struct sp_node *sp_alloc(unsigned long start, unsigned long end, |
2373 | struct mempolicy *pol) | 2377 | struct mempolicy *pol) |
2374 | { | 2378 | { |
2375 | struct sp_node *n; | 2379 | struct sp_node *n; |
2376 | struct mempolicy *newpol; | 2380 | struct mempolicy *newpol; |
2377 | 2381 | ||
2378 | n = kmem_cache_alloc(sn_cache, GFP_KERNEL); | 2382 | n = kmem_cache_alloc(sn_cache, GFP_KERNEL); |
2379 | if (!n) | 2383 | if (!n) |
2380 | return NULL; | 2384 | return NULL; |
2381 | 2385 | ||
2382 | newpol = mpol_dup(pol); | 2386 | newpol = mpol_dup(pol); |
2383 | if (IS_ERR(newpol)) { | 2387 | if (IS_ERR(newpol)) { |
2384 | kmem_cache_free(sn_cache, n); | 2388 | kmem_cache_free(sn_cache, n); |
2385 | return NULL; | 2389 | return NULL; |
2386 | } | 2390 | } |
2387 | newpol->flags |= MPOL_F_SHARED; | 2391 | newpol->flags |= MPOL_F_SHARED; |
2388 | sp_node_init(n, start, end, newpol); | 2392 | sp_node_init(n, start, end, newpol); |
2389 | 2393 | ||
2390 | return n; | 2394 | return n; |
2391 | } | 2395 | } |
2392 | 2396 | ||
2393 | /* Replace a policy range. */ | 2397 | /* Replace a policy range. */ |
2394 | static int shared_policy_replace(struct shared_policy *sp, unsigned long start, | 2398 | static int shared_policy_replace(struct shared_policy *sp, unsigned long start, |
2395 | unsigned long end, struct sp_node *new) | 2399 | unsigned long end, struct sp_node *new) |
2396 | { | 2400 | { |
2397 | struct sp_node *n; | 2401 | struct sp_node *n; |
2398 | struct sp_node *n_new = NULL; | 2402 | struct sp_node *n_new = NULL; |
2399 | struct mempolicy *mpol_new = NULL; | 2403 | struct mempolicy *mpol_new = NULL; |
2400 | int ret = 0; | 2404 | int ret = 0; |
2401 | 2405 | ||
2402 | restart: | 2406 | restart: |
2403 | spin_lock(&sp->lock); | 2407 | spin_lock(&sp->lock); |
2404 | n = sp_lookup(sp, start, end); | 2408 | n = sp_lookup(sp, start, end); |
2405 | /* Take care of old policies in the same range. */ | 2409 | /* Take care of old policies in the same range. */ |
2406 | while (n && n->start < end) { | 2410 | while (n && n->start < end) { |
2407 | struct rb_node *next = rb_next(&n->nd); | 2411 | struct rb_node *next = rb_next(&n->nd); |
2408 | if (n->start >= start) { | 2412 | if (n->start >= start) { |
2409 | if (n->end <= end) | 2413 | if (n->end <= end) |
2410 | sp_delete(sp, n); | 2414 | sp_delete(sp, n); |
2411 | else | 2415 | else |
2412 | n->start = end; | 2416 | n->start = end; |
2413 | } else { | 2417 | } else { |
2414 | /* Old policy spanning whole new range. */ | 2418 | /* Old policy spanning whole new range. */ |
2415 | if (n->end > end) { | 2419 | if (n->end > end) { |
2416 | if (!n_new) | 2420 | if (!n_new) |
2417 | goto alloc_new; | 2421 | goto alloc_new; |
2418 | 2422 | ||
2419 | *mpol_new = *n->policy; | 2423 | *mpol_new = *n->policy; |
2420 | atomic_set(&mpol_new->refcnt, 1); | 2424 | atomic_set(&mpol_new->refcnt, 1); |
2421 | sp_node_init(n_new, end, n->end, mpol_new); | 2425 | sp_node_init(n_new, end, n->end, mpol_new); |
2422 | n->end = start; | 2426 | n->end = start; |
2423 | sp_insert(sp, n_new); | 2427 | sp_insert(sp, n_new); |
2424 | n_new = NULL; | 2428 | n_new = NULL; |
2425 | mpol_new = NULL; | 2429 | mpol_new = NULL; |
2426 | break; | 2430 | break; |
2427 | } else | 2431 | } else |
2428 | n->end = start; | 2432 | n->end = start; |
2429 | } | 2433 | } |
2430 | if (!next) | 2434 | if (!next) |
2431 | break; | 2435 | break; |
2432 | n = rb_entry(next, struct sp_node, nd); | 2436 | n = rb_entry(next, struct sp_node, nd); |
2433 | } | 2437 | } |
2434 | if (new) | 2438 | if (new) |
2435 | sp_insert(sp, new); | 2439 | sp_insert(sp, new); |
2436 | spin_unlock(&sp->lock); | 2440 | spin_unlock(&sp->lock); |
2437 | ret = 0; | 2441 | ret = 0; |
2438 | 2442 | ||
2439 | err_out: | 2443 | err_out: |
2440 | if (mpol_new) | 2444 | if (mpol_new) |
2441 | mpol_put(mpol_new); | 2445 | mpol_put(mpol_new); |
2442 | if (n_new) | 2446 | if (n_new) |
2443 | kmem_cache_free(sn_cache, n_new); | 2447 | kmem_cache_free(sn_cache, n_new); |
2444 | 2448 | ||
2445 | return ret; | 2449 | return ret; |
2446 | 2450 | ||
2447 | alloc_new: | 2451 | alloc_new: |
2448 | spin_unlock(&sp->lock); | 2452 | spin_unlock(&sp->lock); |
2449 | ret = -ENOMEM; | 2453 | ret = -ENOMEM; |
2450 | n_new = kmem_cache_alloc(sn_cache, GFP_KERNEL); | 2454 | n_new = kmem_cache_alloc(sn_cache, GFP_KERNEL); |
2451 | if (!n_new) | 2455 | if (!n_new) |
2452 | goto err_out; | 2456 | goto err_out; |
2453 | mpol_new = kmem_cache_alloc(policy_cache, GFP_KERNEL); | 2457 | mpol_new = kmem_cache_alloc(policy_cache, GFP_KERNEL); |
2454 | if (!mpol_new) | 2458 | if (!mpol_new) |
2455 | goto err_out; | 2459 | goto err_out; |
2456 | goto restart; | 2460 | goto restart; |
2457 | } | 2461 | } |
2458 | 2462 | ||
2459 | /** | 2463 | /** |
2460 | * mpol_shared_policy_init - initialize shared policy for inode | 2464 | * mpol_shared_policy_init - initialize shared policy for inode |
2461 | * @sp: pointer to inode shared policy | 2465 | * @sp: pointer to inode shared policy |
2462 | * @mpol: struct mempolicy to install | 2466 | * @mpol: struct mempolicy to install |
2463 | * | 2467 | * |
2464 | * Install non-NULL @mpol in inode's shared policy rb-tree. | 2468 | * Install non-NULL @mpol in inode's shared policy rb-tree. |
2465 | * On entry, the current task has a reference on a non-NULL @mpol. | 2469 | * On entry, the current task has a reference on a non-NULL @mpol. |
2466 | * This must be released on exit. | 2470 | * This must be released on exit. |
2467 | * This is called from get_inode(), so GFP_KERNEL allocations are fine. | 2471 | * This is called from get_inode(), so GFP_KERNEL allocations are fine. |
2468 | */ | 2472 | */ |
2469 | void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol) | 2473 | void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol) |
2470 | { | 2474 | { |
2471 | int ret; | 2475 | int ret; |
2472 | 2476 | ||
2473 | sp->root = RB_ROOT; /* empty tree == default mempolicy */ | 2477 | sp->root = RB_ROOT; /* empty tree == default mempolicy */ |
2474 | spin_lock_init(&sp->lock); | 2478 | spin_lock_init(&sp->lock); |
2475 | 2479 | ||
2476 | if (mpol) { | 2480 | if (mpol) { |
2477 | struct vm_area_struct pvma; | 2481 | struct vm_area_struct pvma; |
2478 | struct mempolicy *new; | 2482 | struct mempolicy *new; |
2479 | NODEMASK_SCRATCH(scratch); | 2483 | NODEMASK_SCRATCH(scratch); |
2480 | 2484 | ||
2481 | if (!scratch) | 2485 | if (!scratch) |
2482 | goto put_mpol; | 2486 | goto put_mpol; |
2483 | /* contextualize the tmpfs mount point mempolicy */ | 2487 | /* contextualize the tmpfs mount point mempolicy */ |
2484 | new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask); | 2488 | new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask); |
2485 | if (IS_ERR(new)) | 2489 | if (IS_ERR(new)) |
2486 | goto free_scratch; /* no valid nodemask intersection */ | 2490 | goto free_scratch; /* no valid nodemask intersection */ |
2487 | 2491 | ||
2488 | task_lock(current); | 2492 | task_lock(current); |
2489 | ret = mpol_set_nodemask(new, &mpol->w.user_nodemask, scratch); | 2493 | ret = mpol_set_nodemask(new, &mpol->w.user_nodemask, scratch); |
2490 | task_unlock(current); | 2494 | task_unlock(current); |
2491 | if (ret) | 2495 | if (ret) |
2492 | goto put_new; | 2496 | goto put_new; |
2493 | 2497 | ||
2494 | /* Create pseudo-vma that contains just the policy */ | 2498 | /* Create pseudo-vma that contains just the policy */ |
2495 | memset(&pvma, 0, sizeof(struct vm_area_struct)); | 2499 | memset(&pvma, 0, sizeof(struct vm_area_struct)); |
2496 | pvma.vm_end = TASK_SIZE; /* policy covers entire file */ | 2500 | pvma.vm_end = TASK_SIZE; /* policy covers entire file */ |
2497 | mpol_set_shared_policy(sp, &pvma, new); /* adds ref */ | 2501 | mpol_set_shared_policy(sp, &pvma, new); /* adds ref */ |
2498 | 2502 | ||
2499 | put_new: | 2503 | put_new: |
2500 | mpol_put(new); /* drop initial ref */ | 2504 | mpol_put(new); /* drop initial ref */ |
2501 | free_scratch: | 2505 | free_scratch: |
2502 | NODEMASK_SCRATCH_FREE(scratch); | 2506 | NODEMASK_SCRATCH_FREE(scratch); |
2503 | put_mpol: | 2507 | put_mpol: |
2504 | mpol_put(mpol); /* drop our incoming ref on sb mpol */ | 2508 | mpol_put(mpol); /* drop our incoming ref on sb mpol */ |
2505 | } | 2509 | } |
2506 | } | 2510 | } |
2507 | 2511 | ||
2508 | int mpol_set_shared_policy(struct shared_policy *info, | 2512 | int mpol_set_shared_policy(struct shared_policy *info, |
2509 | struct vm_area_struct *vma, struct mempolicy *npol) | 2513 | struct vm_area_struct *vma, struct mempolicy *npol) |
2510 | { | 2514 | { |
2511 | int err; | 2515 | int err; |
2512 | struct sp_node *new = NULL; | 2516 | struct sp_node *new = NULL; |
2513 | unsigned long sz = vma_pages(vma); | 2517 | unsigned long sz = vma_pages(vma); |
2514 | 2518 | ||
2515 | pr_debug("set_shared_policy %lx sz %lu %d %d %lx\n", | 2519 | pr_debug("set_shared_policy %lx sz %lu %d %d %lx\n", |
2516 | vma->vm_pgoff, | 2520 | vma->vm_pgoff, |
2517 | sz, npol ? npol->mode : -1, | 2521 | sz, npol ? npol->mode : -1, |
2518 | npol ? npol->flags : -1, | 2522 | npol ? npol->flags : -1, |
2519 | npol ? nodes_addr(npol->v.nodes)[0] : NUMA_NO_NODE); | 2523 | npol ? nodes_addr(npol->v.nodes)[0] : NUMA_NO_NODE); |
2520 | 2524 | ||
2521 | if (npol) { | 2525 | if (npol) { |
2522 | new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol); | 2526 | new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol); |
2523 | if (!new) | 2527 | if (!new) |
2524 | return -ENOMEM; | 2528 | return -ENOMEM; |
2525 | } | 2529 | } |
2526 | err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new); | 2530 | err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new); |
2527 | if (err && new) | 2531 | if (err && new) |
2528 | sp_free(new); | 2532 | sp_free(new); |
2529 | return err; | 2533 | return err; |
2530 | } | 2534 | } |
2531 | 2535 | ||
2532 | /* Free a backing policy store on inode delete. */ | 2536 | /* Free a backing policy store on inode delete. */ |
2533 | void mpol_free_shared_policy(struct shared_policy *p) | 2537 | void mpol_free_shared_policy(struct shared_policy *p) |
2534 | { | 2538 | { |
2535 | struct sp_node *n; | 2539 | struct sp_node *n; |
2536 | struct rb_node *next; | 2540 | struct rb_node *next; |
2537 | 2541 | ||
2538 | if (!p->root.rb_node) | 2542 | if (!p->root.rb_node) |
2539 | return; | 2543 | return; |
2540 | spin_lock(&p->lock); | 2544 | spin_lock(&p->lock); |
2541 | next = rb_first(&p->root); | 2545 | next = rb_first(&p->root); |
2542 | while (next) { | 2546 | while (next) { |
2543 | n = rb_entry(next, struct sp_node, nd); | 2547 | n = rb_entry(next, struct sp_node, nd); |
2544 | next = rb_next(&n->nd); | 2548 | next = rb_next(&n->nd); |
2545 | sp_delete(p, n); | 2549 | sp_delete(p, n); |
2546 | } | 2550 | } |
2547 | spin_unlock(&p->lock); | 2551 | spin_unlock(&p->lock); |
2548 | } | 2552 | } |
2549 | 2553 | ||
2550 | #ifdef CONFIG_NUMA_BALANCING | 2554 | #ifdef CONFIG_NUMA_BALANCING |
2551 | static int __initdata numabalancing_override; | 2555 | static int __initdata numabalancing_override; |
2552 | 2556 | ||
2553 | static void __init check_numabalancing_enable(void) | 2557 | static void __init check_numabalancing_enable(void) |
2554 | { | 2558 | { |
2555 | bool numabalancing_default = false; | 2559 | bool numabalancing_default = false; |
2556 | 2560 | ||
2557 | if (IS_ENABLED(CONFIG_NUMA_BALANCING_DEFAULT_ENABLED)) | 2561 | if (IS_ENABLED(CONFIG_NUMA_BALANCING_DEFAULT_ENABLED)) |
2558 | numabalancing_default = true; | 2562 | numabalancing_default = true; |
2559 | 2563 | ||
2560 | /* Parsed by setup_numabalancing. override == 1 enables, -1 disables */ | 2564 | /* Parsed by setup_numabalancing. override == 1 enables, -1 disables */ |
2561 | if (numabalancing_override) | 2565 | if (numabalancing_override) |
2562 | set_numabalancing_state(numabalancing_override == 1); | 2566 | set_numabalancing_state(numabalancing_override == 1); |
2563 | 2567 | ||
2564 | if (nr_node_ids > 1 && !numabalancing_override) { | 2568 | if (nr_node_ids > 1 && !numabalancing_override) { |
2565 | pr_info("%s automatic NUMA balancing. " | 2569 | pr_info("%s automatic NUMA balancing. " |
2566 | "Configure with numa_balancing= or the " | 2570 | "Configure with numa_balancing= or the " |
2567 | "kernel.numa_balancing sysctl", | 2571 | "kernel.numa_balancing sysctl", |
2568 | numabalancing_default ? "Enabling" : "Disabling"); | 2572 | numabalancing_default ? "Enabling" : "Disabling"); |
2569 | set_numabalancing_state(numabalancing_default); | 2573 | set_numabalancing_state(numabalancing_default); |
2570 | } | 2574 | } |
2571 | } | 2575 | } |
2572 | 2576 | ||
2573 | static int __init setup_numabalancing(char *str) | 2577 | static int __init setup_numabalancing(char *str) |
2574 | { | 2578 | { |
2575 | int ret = 0; | 2579 | int ret = 0; |
2576 | if (!str) | 2580 | if (!str) |
2577 | goto out; | 2581 | goto out; |
2578 | 2582 | ||
2579 | if (!strcmp(str, "enable")) { | 2583 | if (!strcmp(str, "enable")) { |
2580 | numabalancing_override = 1; | 2584 | numabalancing_override = 1; |
2581 | ret = 1; | 2585 | ret = 1; |
2582 | } else if (!strcmp(str, "disable")) { | 2586 | } else if (!strcmp(str, "disable")) { |
2583 | numabalancing_override = -1; | 2587 | numabalancing_override = -1; |
2584 | ret = 1; | 2588 | ret = 1; |
2585 | } | 2589 | } |
2586 | out: | 2590 | out: |
2587 | if (!ret) | 2591 | if (!ret) |
2588 | pr_warn("Unable to parse numa_balancing=\n"); | 2592 | pr_warn("Unable to parse numa_balancing=\n"); |
2589 | 2593 | ||
2590 | return ret; | 2594 | return ret; |
2591 | } | 2595 | } |
2592 | __setup("numa_balancing=", setup_numabalancing); | 2596 | __setup("numa_balancing=", setup_numabalancing); |
2593 | #else | 2597 | #else |
2594 | static inline void __init check_numabalancing_enable(void) | 2598 | static inline void __init check_numabalancing_enable(void) |
2595 | { | 2599 | { |
2596 | } | 2600 | } |
2597 | #endif /* CONFIG_NUMA_BALANCING */ | 2601 | #endif /* CONFIG_NUMA_BALANCING */ |
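
For reference (illustrative, not part of this commit): the strings accepted by setup_numabalancing() correspond to the boot parameter below; the kernel.numa_balancing sysctl named in the pr_info above is the post-boot equivalent.

	numa_balancing=enable     (force automatic NUMA balancing on)
	numa_balancing=disable    (force it off)
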
2598 | 2602 | ||
2599 | /* assumes fs == KERNEL_DS */ | 2603 | /* assumes fs == KERNEL_DS */ |
2600 | void __init numa_policy_init(void) | 2604 | void __init numa_policy_init(void) |
2601 | { | 2605 | { |
2602 | nodemask_t interleave_nodes; | 2606 | nodemask_t interleave_nodes; |
2603 | unsigned long largest = 0; | 2607 | unsigned long largest = 0; |
2604 | int nid, prefer = 0; | 2608 | int nid, prefer = 0; |
2605 | 2609 | ||
2606 | policy_cache = kmem_cache_create("numa_policy", | 2610 | policy_cache = kmem_cache_create("numa_policy", |
2607 | sizeof(struct mempolicy), | 2611 | sizeof(struct mempolicy), |
2608 | 0, SLAB_PANIC, NULL); | 2612 | 0, SLAB_PANIC, NULL); |
2609 | 2613 | ||
2610 | sn_cache = kmem_cache_create("shared_policy_node", | 2614 | sn_cache = kmem_cache_create("shared_policy_node", |
2611 | sizeof(struct sp_node), | 2615 | sizeof(struct sp_node), |
2612 | 0, SLAB_PANIC, NULL); | 2616 | 0, SLAB_PANIC, NULL); |
2613 | 2617 | ||
2614 | for_each_node(nid) { | 2618 | for_each_node(nid) { |
2615 | preferred_node_policy[nid] = (struct mempolicy) { | 2619 | preferred_node_policy[nid] = (struct mempolicy) { |
2616 | .refcnt = ATOMIC_INIT(1), | 2620 | .refcnt = ATOMIC_INIT(1), |
2617 | .mode = MPOL_PREFERRED, | 2621 | .mode = MPOL_PREFERRED, |
2618 | .flags = MPOL_F_MOF | MPOL_F_MORON, | 2622 | .flags = MPOL_F_MOF | MPOL_F_MORON, |
2619 | .v = { .preferred_node = nid, }, | 2623 | .v = { .preferred_node = nid, }, |
2620 | }; | 2624 | }; |
2621 | } | 2625 | } |
2622 | 2626 | ||
2623 | /* | 2627 | /* |
2624 | * Set interleaving policy for system init. Interleaving is only | 2628 | * Set interleaving policy for system init. Interleaving is only |
2625 | * enabled across suitably sized nodes (default is >= 16MB), or | 2629 | * enabled across suitably sized nodes (default is >= 16MB), or |
2626 | * fall back to the largest node if they're all smaller. | 2630 | * fall back to the largest node if they're all smaller. |
2627 | */ | 2631 | */ |
2628 | nodes_clear(interleave_nodes); | 2632 | nodes_clear(interleave_nodes); |
2629 | for_each_node_state(nid, N_MEMORY) { | 2633 | for_each_node_state(nid, N_MEMORY) { |
2630 | unsigned long total_pages = node_present_pages(nid); | 2634 | unsigned long total_pages = node_present_pages(nid); |
2631 | 2635 | ||
2632 | /* Preserve the largest node */ | 2636 | /* Preserve the largest node */ |
2633 | if (largest < total_pages) { | 2637 | if (largest < total_pages) { |
2634 | largest = total_pages; | 2638 | largest = total_pages; |
2635 | prefer = nid; | 2639 | prefer = nid; |
2636 | } | 2640 | } |
2637 | 2641 | ||
2638 | /* Interleave this node? */ | 2642 | /* Interleave this node? */ |
2639 | if ((total_pages << PAGE_SHIFT) >= (16 << 20)) | 2643 | if ((total_pages << PAGE_SHIFT) >= (16 << 20)) |
2640 | node_set(nid, interleave_nodes); | 2644 | node_set(nid, interleave_nodes); |
2641 | } | 2645 | } |
2642 | 2646 | ||
2643 | /* All too small, use the largest */ | 2647 | /* All too small, use the largest */ |
2644 | if (unlikely(nodes_empty(interleave_nodes))) | 2648 | if (unlikely(nodes_empty(interleave_nodes))) |
2645 | node_set(prefer, interleave_nodes); | 2649 | node_set(prefer, interleave_nodes); |
2646 | 2650 | ||
2647 | if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes)) | 2651 | if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes)) |
2648 | printk(KERN_ERR "numa_policy_init: interleaving failed\n"); | 2652 | printk(KERN_ERR "numa_policy_init: interleaving failed\n"); |
2649 | 2653 | ||
2650 | check_numabalancing_enable(); | 2654 | check_numabalancing_enable(); |
2651 | } | 2655 | } |
2652 | 2656 | ||
2653 | /* Reset policy of current process to default */ | 2657 | /* Reset policy of current process to default */ |
2654 | void numa_default_policy(void) | 2658 | void numa_default_policy(void) |
2655 | { | 2659 | { |
2656 | do_set_mempolicy(MPOL_DEFAULT, 0, NULL); | 2660 | do_set_mempolicy(MPOL_DEFAULT, 0, NULL); |
2657 | } | 2661 | } |
2658 | 2662 | ||
2659 | /* | 2663 | /* |
2660 | * Parse and format mempolicy from/to strings | 2664 | * Parse and format mempolicy from/to strings |
2661 | */ | 2665 | */ |
2662 | 2666 | ||
2663 | /* | 2667 | /* |
2664 | * "local" is implemented internally by MPOL_PREFERRED with MPOL_F_LOCAL flag. | 2668 | * "local" is implemented internally by MPOL_PREFERRED with MPOL_F_LOCAL flag. |
2665 | */ | 2669 | */ |
2666 | static const char * const policy_modes[] = | 2670 | static const char * const policy_modes[] = |
2667 | { | 2671 | { |
2668 | [MPOL_DEFAULT] = "default", | 2672 | [MPOL_DEFAULT] = "default", |
2669 | [MPOL_PREFERRED] = "prefer", | 2673 | [MPOL_PREFERRED] = "prefer", |
2670 | [MPOL_BIND] = "bind", | 2674 | [MPOL_BIND] = "bind", |
2671 | [MPOL_INTERLEAVE] = "interleave", | 2675 | [MPOL_INTERLEAVE] = "interleave", |
2672 | [MPOL_LOCAL] = "local", | 2676 | [MPOL_LOCAL] = "local", |
2673 | }; | 2677 | }; |
2674 | 2678 | ||
2675 | 2679 | ||
2676 | #ifdef CONFIG_TMPFS | 2680 | #ifdef CONFIG_TMPFS |
2677 | /** | 2681 | /** |
2678 | * mpol_parse_str - parse string to mempolicy, for tmpfs mpol mount option. | 2682 | * mpol_parse_str - parse string to mempolicy, for tmpfs mpol mount option. |
2679 | * @str: string containing mempolicy to parse | 2683 | * @str: string containing mempolicy to parse |
2680 | * @mpol: pointer to struct mempolicy pointer, returned on success. | 2684 | * @mpol: pointer to struct mempolicy pointer, returned on success. |
2681 | * | 2685 | * |
2682 | * Format of input: | 2686 | * Format of input: |
2683 | * <mode>[=<flags>][:<nodelist>] | 2687 | * <mode>[=<flags>][:<nodelist>] |
2684 | * | 2688 | * |
2685 | * On success, returns 0, else 1 | 2689 | * On success, returns 0, else 1 |
2686 | */ | 2690 | */ |
2687 | int mpol_parse_str(char *str, struct mempolicy **mpol) | 2691 | int mpol_parse_str(char *str, struct mempolicy **mpol) |
2688 | { | 2692 | { |
2689 | struct mempolicy *new = NULL; | 2693 | struct mempolicy *new = NULL; |
2690 | unsigned short mode; | 2694 | unsigned short mode; |
2691 | unsigned short mode_flags; | 2695 | unsigned short mode_flags; |
2692 | nodemask_t nodes; | 2696 | nodemask_t nodes; |
2693 | char *nodelist = strchr(str, ':'); | 2697 | char *nodelist = strchr(str, ':'); |
2694 | char *flags = strchr(str, '='); | 2698 | char *flags = strchr(str, '='); |
2695 | int err = 1; | 2699 | int err = 1; |
2696 | 2700 | ||
2697 | if (nodelist) { | 2701 | if (nodelist) { |
2698 | /* NUL-terminate mode or flags string */ | 2702 | /* NUL-terminate mode or flags string */ |
2699 | *nodelist++ = '\0'; | 2703 | *nodelist++ = '\0'; |
2700 | if (nodelist_parse(nodelist, nodes)) | 2704 | if (nodelist_parse(nodelist, nodes)) |
2701 | goto out; | 2705 | goto out; |
2702 | if (!nodes_subset(nodes, node_states[N_MEMORY])) | 2706 | if (!nodes_subset(nodes, node_states[N_MEMORY])) |
2703 | goto out; | 2707 | goto out; |
2704 | } else | 2708 | } else |
2705 | nodes_clear(nodes); | 2709 | nodes_clear(nodes); |
2706 | 2710 | ||
2707 | if (flags) | 2711 | if (flags) |
2708 | *flags++ = '\0'; /* terminate mode string */ | 2712 | *flags++ = '\0'; /* terminate mode string */ |
2709 | 2713 | ||
2710 | for (mode = 0; mode < MPOL_MAX; mode++) { | 2714 | for (mode = 0; mode < MPOL_MAX; mode++) { |
2711 | if (!strcmp(str, policy_modes[mode])) { | 2715 | if (!strcmp(str, policy_modes[mode])) { |
2712 | break; | 2716 | break; |
2713 | } | 2717 | } |
2714 | } | 2718 | } |
2715 | if (mode >= MPOL_MAX) | 2719 | if (mode >= MPOL_MAX) |
2716 | goto out; | 2720 | goto out; |
2717 | 2721 | ||
2718 | switch (mode) { | 2722 | switch (mode) { |
2719 | case MPOL_PREFERRED: | 2723 | case MPOL_PREFERRED: |
2720 | /* | 2724 | /* |
2721 | * Insist on a nodelist of one node only | 2725 | * Insist on a nodelist of one node only |
2722 | */ | 2726 | */ |
2723 | if (nodelist) { | 2727 | if (nodelist) { |
2724 | char *rest = nodelist; | 2728 | char *rest = nodelist; |
2725 | while (isdigit(*rest)) | 2729 | while (isdigit(*rest)) |
2726 | rest++; | 2730 | rest++; |
2727 | if (*rest) | 2731 | if (*rest) |
2728 | goto out; | 2732 | goto out; |
2729 | } | 2733 | } |
2730 | break; | 2734 | break; |
2731 | case MPOL_INTERLEAVE: | 2735 | case MPOL_INTERLEAVE: |
2732 | /* | 2736 | /* |
2733 | * Default to online nodes with memory if no nodelist | 2737 | * Default to online nodes with memory if no nodelist |
2734 | */ | 2738 | */ |
2735 | if (!nodelist) | 2739 | if (!nodelist) |
2736 | nodes = node_states[N_MEMORY]; | 2740 | nodes = node_states[N_MEMORY]; |
2737 | break; | 2741 | break; |
2738 | case MPOL_LOCAL: | 2742 | case MPOL_LOCAL: |
2739 | /* | 2743 | /* |
2740 | * Don't allow a nodelist; mpol_new() checks flags | 2744 | * Don't allow a nodelist; mpol_new() checks flags |
2741 | */ | 2745 | */ |
2742 | if (nodelist) | 2746 | if (nodelist) |
2743 | goto out; | 2747 | goto out; |
2744 | mode = MPOL_PREFERRED; | 2748 | mode = MPOL_PREFERRED; |
2745 | break; | 2749 | break; |
2746 | case MPOL_DEFAULT: | 2750 | case MPOL_DEFAULT: |
2747 | /* | 2751 | /* |
2748 | * Insist on an empty nodelist | 2752 | * Insist on an empty nodelist |
2749 | */ | 2753 | */ |
2750 | if (!nodelist) | 2754 | if (!nodelist) |
2751 | err = 0; | 2755 | err = 0; |
2752 | goto out; | 2756 | goto out; |
2753 | case MPOL_BIND: | 2757 | case MPOL_BIND: |
2754 | /* | 2758 | /* |
2755 | * Insist on a nodelist | 2759 | * Insist on a nodelist |
2756 | */ | 2760 | */ |
2757 | if (!nodelist) | 2761 | if (!nodelist) |
2758 | goto out; | 2762 | goto out; |
2759 | } | 2763 | } |
2760 | 2764 | ||
2761 | mode_flags = 0; | 2765 | mode_flags = 0; |
2762 | if (flags) { | 2766 | if (flags) { |
2763 | /* | 2767 | /* |
2764 | * Currently, we only support two mutually exclusive | 2768 | * Currently, we only support two mutually exclusive |
2765 | * mode flags. | 2769 | * mode flags. |
2766 | */ | 2770 | */ |
2767 | if (!strcmp(flags, "static")) | 2771 | if (!strcmp(flags, "static")) |
2768 | mode_flags |= MPOL_F_STATIC_NODES; | 2772 | mode_flags |= MPOL_F_STATIC_NODES; |
2769 | else if (!strcmp(flags, "relative")) | 2773 | else if (!strcmp(flags, "relative")) |
2770 | mode_flags |= MPOL_F_RELATIVE_NODES; | 2774 | mode_flags |= MPOL_F_RELATIVE_NODES; |
2771 | else | 2775 | else |
2772 | goto out; | 2776 | goto out; |
2773 | } | 2777 | } |
2774 | 2778 | ||
2775 | new = mpol_new(mode, mode_flags, &nodes); | 2779 | new = mpol_new(mode, mode_flags, &nodes); |
2776 | if (IS_ERR(new)) | 2780 | if (IS_ERR(new)) |
2777 | goto out; | 2781 | goto out; |
2778 | 2782 | ||
2779 | /* | 2783 | /* |
2780 | * Save nodes for mpol_to_str() to show the tmpfs mount options | 2784 | * Save nodes for mpol_to_str() to show the tmpfs mount options |
2781 | * for /proc/mounts, /proc/pid/mounts and /proc/pid/mountinfo. | 2785 | * for /proc/mounts, /proc/pid/mounts and /proc/pid/mountinfo. |
2782 | */ | 2786 | */ |
2783 | if (mode != MPOL_PREFERRED) | 2787 | if (mode != MPOL_PREFERRED) |
2784 | new->v.nodes = nodes; | 2788 | new->v.nodes = nodes; |
2785 | else if (nodelist) | 2789 | else if (nodelist) |
2786 | new->v.preferred_node = first_node(nodes); | 2790 | new->v.preferred_node = first_node(nodes); |
2787 | else | 2791 | else |
2788 | new->flags |= MPOL_F_LOCAL; | 2792 | new->flags |= MPOL_F_LOCAL; |
2789 | 2793 | ||
2790 | /* | 2794 | /* |
2791 | * Save nodes for contextualization: this will be used to "clone" | 2795 | * Save nodes for contextualization: this will be used to "clone" |
2792 | * the mempolicy in a specific context [cpuset] at a later time. | 2796 | * the mempolicy in a specific context [cpuset] at a later time. |
2793 | */ | 2797 | */ |
2794 | new->w.user_nodemask = nodes; | 2798 | new->w.user_nodemask = nodes; |
2795 | 2799 | ||
2796 | err = 0; | 2800 | err = 0; |
2797 | 2801 | ||
2798 | out: | 2802 | out: |
2799 | /* Restore string for error message */ | 2803 | /* Restore string for error message */ |
2800 | if (nodelist) | 2804 | if (nodelist) |
2801 | *--nodelist = ':'; | 2805 | *--nodelist = ':'; |
2802 | if (flags) | 2806 | if (flags) |
2803 | *--flags = '='; | 2807 | *--flags = '='; |
2804 | if (!err) | 2808 | if (!err) |
2805 | *mpol = new; | 2809 | *mpol = new; |
2806 | return err; | 2810 | return err; |
2807 | } | 2811 | } |
2808 | #endif /* CONFIG_TMPFS */ | 2812 | #endif /* CONFIG_TMPFS */ |
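
Illustrative sketch (not part of this commit): the string format documented above is what tmpfs hands in from its "mpol=" mount option; a hypothetical in-kernel caller (CONFIG_TMPFS assumed) could parse one like this. The buffer must be writable because the parser temporarily NUL-terminates the mode and flags and then restores the string.

	/* Sketch: parse "<mode>[=<flags>][:<nodelist>]" into a mempolicy. */
	static struct mempolicy *parse_example_policy(void)
	{
		static char str[] = "interleave=static:0-3";
		struct mempolicy *mpol = NULL;

		if (mpol_parse_str(str, &mpol))
			return NULL;		/* returns 1 on any parse error */
		return mpol;			/* drop later with mpol_put() */
	}

From userspace the same policy would be requested with something like "mount -t tmpfs -o mpol=interleave=static:0-3 tmpfs /mnt".
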
2809 | 2813 | ||
2810 | /** | 2814 | /** |
2811 | * mpol_to_str - format a mempolicy structure for printing | 2815 | * mpol_to_str - format a mempolicy structure for printing |
2812 | * @buffer: to contain formatted mempolicy string | 2816 | * @buffer: to contain formatted mempolicy string |
2813 | * @maxlen: length of @buffer | 2817 | * @maxlen: length of @buffer |
2814 | * @pol: pointer to mempolicy to be formatted | 2818 | * @pol: pointer to mempolicy to be formatted |
2815 | * | 2819 | * |
2816 | * Convert @pol into a string. If @buffer is too short, truncate the string. | 2820 | * Convert @pol into a string. If @buffer is too short, truncate the string. |
2817 | * Recommend a @maxlen of at least 32 for the longest mode, "interleave", the | 2821 | * Recommend a @maxlen of at least 32 for the longest mode, "interleave", the |
2818 | * longest flag, "relative", and to display at least a few node ids. | 2822 | * longest flag, "relative", and to display at least a few node ids. |
2819 | */ | 2823 | */ |
2820 | void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol) | 2824 | void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol) |
2821 | { | 2825 | { |
2822 | char *p = buffer; | 2826 | char *p = buffer; |
2823 | nodemask_t nodes = NODE_MASK_NONE; | 2827 | nodemask_t nodes = NODE_MASK_NONE; |
2824 | unsigned short mode = MPOL_DEFAULT; | 2828 | unsigned short mode = MPOL_DEFAULT; |
2825 | unsigned short flags = 0; | 2829 | unsigned short flags = 0; |
2826 | 2830 | ||
2827 | if (pol && pol != &default_policy && !(pol->flags & MPOL_F_MORON)) { | 2831 | if (pol && pol != &default_policy && !(pol->flags & MPOL_F_MORON)) { |
2828 | mode = pol->mode; | 2832 | mode = pol->mode; |
2829 | flags = pol->flags; | 2833 | flags = pol->flags; |
2830 | } | 2834 | } |
2831 | 2835 | ||
2832 | switch (mode) { | 2836 | switch (mode) { |
2833 | case MPOL_DEFAULT: | 2837 | case MPOL_DEFAULT: |
2834 | break; | 2838 | break; |
2835 | case MPOL_PREFERRED: | 2839 | case MPOL_PREFERRED: |
2836 | if (flags & MPOL_F_LOCAL) | 2840 | if (flags & MPOL_F_LOCAL) |
2837 | mode = MPOL_LOCAL; | 2841 | mode = MPOL_LOCAL; |
2838 | else | 2842 | else |
2839 | node_set(pol->v.preferred_node, nodes); | 2843 | node_set(pol->v.preferred_node, nodes); |
2840 | break; | 2844 | break; |
2841 | case MPOL_BIND: | 2845 | case MPOL_BIND: |
2842 | case MPOL_INTERLEAVE: | 2846 | case MPOL_INTERLEAVE: |
2843 | nodes = pol->v.nodes; | 2847 | nodes = pol->v.nodes; |
2844 | break; | 2848 | break; |
2845 | default: | 2849 | default: |
2846 | WARN_ON_ONCE(1); | 2850 | WARN_ON_ONCE(1); |
2847 | snprintf(p, maxlen, "unknown"); | 2851 | snprintf(p, maxlen, "unknown"); |
2848 | return; | 2852 | return; |
2849 | } | 2853 | } |
2850 | 2854 | ||
2851 | p += snprintf(p, maxlen, "%s", policy_modes[mode]); | 2855 | p += snprintf(p, maxlen, "%s", policy_modes[mode]); |
2852 | 2856 | ||
2853 | if (flags & MPOL_MODE_FLAGS) { | 2857 | if (flags & MPOL_MODE_FLAGS) { |
2854 | p += snprintf(p, buffer + maxlen - p, "="); | 2858 | p += snprintf(p, buffer + maxlen - p, "="); |
2855 | 2859 | ||
2856 | /* | 2860 | /* |
2857 | * Currently, the only defined flags are mutually exclusive | 2861 | * Currently, the only defined flags are mutually exclusive |
2858 | */ | 2862 | */ |
2859 | if (flags & MPOL_F_STATIC_NODES) | 2863 | if (flags & MPOL_F_STATIC_NODES) |
2860 | p += snprintf(p, buffer + maxlen - p, "static"); | 2864 | p += snprintf(p, buffer + maxlen - p, "static"); |
2861 | else if (flags & MPOL_F_RELATIVE_NODES) | 2865 | else if (flags & MPOL_F_RELATIVE_NODES) |
2862 | p += snprintf(p, buffer + maxlen - p, "relative"); | 2866 | p += snprintf(p, buffer + maxlen - p, "relative"); |
2863 | } | 2867 | } |
2864 | 2868 | ||
2865 | if (!nodes_empty(nodes)) { | 2869 | if (!nodes_empty(nodes)) { |
2866 | p += snprintf(p, buffer + maxlen - p, ":"); | 2870 | p += snprintf(p, buffer + maxlen - p, ":"); |
2867 | p += nodelist_scnprintf(p, buffer + maxlen - p, nodes); | 2871 | p += nodelist_scnprintf(p, buffer + maxlen - p, nodes); |
2868 | } | 2872 | } |
2869 | } | 2873 | } |
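
Illustrative sketch (not part of this commit): callers format into a small stack buffer, much as /proc/<pid>/numa_maps does; the helper name and the seq_file use below are assumed for the example.

	/* Sketch: print a VMA's policy; a NULL vm_policy prints as "default". */
	static void show_vma_policy(struct seq_file *m, struct vm_area_struct *vma)
	{
		char buffer[64];	/* >= 32 recommended above */

		mpol_to_str(buffer, sizeof(buffer), vma_policy(vma));
		seq_printf(m, "mpol=%s\n", buffer);
	}
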
2870 | 2874 |