Commit 52f37629fd3c7b24e1e6c125e665454cd7ac1acb

Authored by Minchan Kim
Committed by Linus Torvalds
1 parent f1cb08798e

THP: fix comment about memory barrier

Currently the memory barrier assumed in __do_huge_pmd_anonymous_page doesn't
work: lru_cache_add_lru uses a pagevec, so it can easily skip taking the
spinlock, the ordering the comment relies on is broken, and userspace could
see inconsistent data.
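
For context: lru_cache_add_lru normally just stashes the page in a per-cpu
pagevec and only takes zone->lru_lock when that pagevec is drained, so most
calls never acquire the lock at all.  A simplified sketch of the mm/swap.c
fast path, written from memory rather than quoted from the tree:

	void __lru_cache_add(struct page *page, enum lru_list lru)
	{
		struct pagevec *pvec = &get_cpu_var(lru_add_pvecs)[lru];

		page_cache_get(page);
		if (!pagevec_add(pvec, page))
			/* pagevec now full: only this drain takes zone->lru_lock */
			__pagevec_lru_add(pvec, lru);
		put_cpu_var(lru_add_pvecs);
	}

So the "spinlock acts as a barrier" assumption only ever holds on the rare
drain, not on the common path.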

I was not the first person to point out the problem.  Mel and Peter pointed
it out a few months ago, and Peter further noted that even a
spin_lock/unlock pair cannot guarantee the ordering:

  http://marc.info/?t=134333512700004

	In particular:

        	*A = a;
        	LOCK
        	UNLOCK
        	*B = b;

	may occur as:

        	LOCK, STORE *B, STORE *A, UNLOCK

Finally, Hugh pointed out that we don't even need a memory barrier there,
because __SetPageUptodate has already provided one explicitly since Nick's
commit 0ed361dec369 ("mm: fix PageUptodate data race").

So this patch fixes the comment in the THP code and adds the same comment to
do_anonymous_page, too: everybody except Hugh had missed this, which is a
good indication that the ordering deserves a comment.

Signed-off-by: Minchan Kim <minchan@kernel.org>
Acked-by: Andrea Arcangeli <aarcange@redhat.com>
Acked-by: David Rientjes <rientjes@google.com>
Acked-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Hugh Dickins <hughd@google.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

Showing 2 changed files with 10 additions and 6 deletions

mm/huge_memory.c

1 /* 1 /*
2 * Copyright (C) 2009 Red Hat, Inc. 2 * Copyright (C) 2009 Red Hat, Inc.
3 * 3 *
4 * This work is licensed under the terms of the GNU GPL, version 2. See 4 * This work is licensed under the terms of the GNU GPL, version 2. See
5 * the COPYING file in the top-level directory. 5 * the COPYING file in the top-level directory.
6 */ 6 */
7 7
8 #include <linux/mm.h> 8 #include <linux/mm.h>
9 #include <linux/sched.h> 9 #include <linux/sched.h>
10 #include <linux/highmem.h> 10 #include <linux/highmem.h>
11 #include <linux/hugetlb.h> 11 #include <linux/hugetlb.h>
12 #include <linux/mmu_notifier.h> 12 #include <linux/mmu_notifier.h>
13 #include <linux/rmap.h> 13 #include <linux/rmap.h>
14 #include <linux/swap.h> 14 #include <linux/swap.h>
15 #include <linux/shrinker.h> 15 #include <linux/shrinker.h>
16 #include <linux/mm_inline.h> 16 #include <linux/mm_inline.h>
17 #include <linux/kthread.h> 17 #include <linux/kthread.h>
18 #include <linux/khugepaged.h> 18 #include <linux/khugepaged.h>
19 #include <linux/freezer.h> 19 #include <linux/freezer.h>
20 #include <linux/mman.h> 20 #include <linux/mman.h>
21 #include <linux/pagemap.h> 21 #include <linux/pagemap.h>
22 #include <linux/migrate.h> 22 #include <linux/migrate.h>
23 #include <linux/hashtable.h> 23 #include <linux/hashtable.h>
24 24
25 #include <asm/tlb.h> 25 #include <asm/tlb.h>
26 #include <asm/pgalloc.h> 26 #include <asm/pgalloc.h>
27 #include "internal.h" 27 #include "internal.h"
28 28
29 /* 29 /*
30 * By default transparent hugepage support is enabled for all mappings 30 * By default transparent hugepage support is enabled for all mappings
31 * and khugepaged scans all mappings. Defrag is only invoked by 31 * and khugepaged scans all mappings. Defrag is only invoked by
32 * khugepaged hugepage allocations and by page faults inside 32 * khugepaged hugepage allocations and by page faults inside
33 * MADV_HUGEPAGE regions to avoid the risk of slowing down short lived 33 * MADV_HUGEPAGE regions to avoid the risk of slowing down short lived
34 * allocations. 34 * allocations.
35 */ 35 */
36 unsigned long transparent_hugepage_flags __read_mostly = 36 unsigned long transparent_hugepage_flags __read_mostly =
37 #ifdef CONFIG_TRANSPARENT_HUGEPAGE_ALWAYS 37 #ifdef CONFIG_TRANSPARENT_HUGEPAGE_ALWAYS
38 (1<<TRANSPARENT_HUGEPAGE_FLAG)| 38 (1<<TRANSPARENT_HUGEPAGE_FLAG)|
39 #endif 39 #endif
40 #ifdef CONFIG_TRANSPARENT_HUGEPAGE_MADVISE 40 #ifdef CONFIG_TRANSPARENT_HUGEPAGE_MADVISE
41 (1<<TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG)| 41 (1<<TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG)|
42 #endif 42 #endif
43 (1<<TRANSPARENT_HUGEPAGE_DEFRAG_FLAG)| 43 (1<<TRANSPARENT_HUGEPAGE_DEFRAG_FLAG)|
44 (1<<TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG)| 44 (1<<TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG)|
45 (1<<TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG); 45 (1<<TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);
46 46
47 /* default scan 8*512 pte (or vmas) every 30 second */ 47 /* default scan 8*512 pte (or vmas) every 30 second */
48 static unsigned int khugepaged_pages_to_scan __read_mostly = HPAGE_PMD_NR*8; 48 static unsigned int khugepaged_pages_to_scan __read_mostly = HPAGE_PMD_NR*8;
49 static unsigned int khugepaged_pages_collapsed; 49 static unsigned int khugepaged_pages_collapsed;
50 static unsigned int khugepaged_full_scans; 50 static unsigned int khugepaged_full_scans;
51 static unsigned int khugepaged_scan_sleep_millisecs __read_mostly = 10000; 51 static unsigned int khugepaged_scan_sleep_millisecs __read_mostly = 10000;
52 /* during fragmentation poll the hugepage allocator once every minute */ 52 /* during fragmentation poll the hugepage allocator once every minute */
53 static unsigned int khugepaged_alloc_sleep_millisecs __read_mostly = 60000; 53 static unsigned int khugepaged_alloc_sleep_millisecs __read_mostly = 60000;
54 static struct task_struct *khugepaged_thread __read_mostly; 54 static struct task_struct *khugepaged_thread __read_mostly;
55 static DEFINE_MUTEX(khugepaged_mutex); 55 static DEFINE_MUTEX(khugepaged_mutex);
56 static DEFINE_SPINLOCK(khugepaged_mm_lock); 56 static DEFINE_SPINLOCK(khugepaged_mm_lock);
57 static DECLARE_WAIT_QUEUE_HEAD(khugepaged_wait); 57 static DECLARE_WAIT_QUEUE_HEAD(khugepaged_wait);
58 /* 58 /*
59 * default collapse hugepages if there is at least one pte mapped like 59 * default collapse hugepages if there is at least one pte mapped like
60 * it would have happened if the vma was large enough during page 60 * it would have happened if the vma was large enough during page
61 * fault. 61 * fault.
62 */ 62 */
63 static unsigned int khugepaged_max_ptes_none __read_mostly = HPAGE_PMD_NR-1; 63 static unsigned int khugepaged_max_ptes_none __read_mostly = HPAGE_PMD_NR-1;
64 64
65 static int khugepaged(void *none); 65 static int khugepaged(void *none);
66 static int khugepaged_slab_init(void); 66 static int khugepaged_slab_init(void);
67 67
68 #define MM_SLOTS_HASH_BITS 10 68 #define MM_SLOTS_HASH_BITS 10
69 static __read_mostly DEFINE_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS); 69 static __read_mostly DEFINE_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS);
70 70
71 static struct kmem_cache *mm_slot_cache __read_mostly; 71 static struct kmem_cache *mm_slot_cache __read_mostly;
72 72
73 /** 73 /**
74 * struct mm_slot - hash lookup from mm to mm_slot 74 * struct mm_slot - hash lookup from mm to mm_slot
75 * @hash: hash collision list 75 * @hash: hash collision list
76 * @mm_node: khugepaged scan list headed in khugepaged_scan.mm_head 76 * @mm_node: khugepaged scan list headed in khugepaged_scan.mm_head
77 * @mm: the mm that this information is valid for 77 * @mm: the mm that this information is valid for
78 */ 78 */
79 struct mm_slot { 79 struct mm_slot {
80 struct hlist_node hash; 80 struct hlist_node hash;
81 struct list_head mm_node; 81 struct list_head mm_node;
82 struct mm_struct *mm; 82 struct mm_struct *mm;
83 }; 83 };
84 84
85 /** 85 /**
86 * struct khugepaged_scan - cursor for scanning 86 * struct khugepaged_scan - cursor for scanning
87 * @mm_head: the head of the mm list to scan 87 * @mm_head: the head of the mm list to scan
88 * @mm_slot: the current mm_slot we are scanning 88 * @mm_slot: the current mm_slot we are scanning
89 * @address: the next address inside that to be scanned 89 * @address: the next address inside that to be scanned
90 * 90 *
91 * There is only the one khugepaged_scan instance of this cursor structure. 91 * There is only the one khugepaged_scan instance of this cursor structure.
92 */ 92 */
93 struct khugepaged_scan { 93 struct khugepaged_scan {
94 struct list_head mm_head; 94 struct list_head mm_head;
95 struct mm_slot *mm_slot; 95 struct mm_slot *mm_slot;
96 unsigned long address; 96 unsigned long address;
97 }; 97 };
98 static struct khugepaged_scan khugepaged_scan = { 98 static struct khugepaged_scan khugepaged_scan = {
99 .mm_head = LIST_HEAD_INIT(khugepaged_scan.mm_head), 99 .mm_head = LIST_HEAD_INIT(khugepaged_scan.mm_head),
100 }; 100 };
101 101
102 102
103 static int set_recommended_min_free_kbytes(void) 103 static int set_recommended_min_free_kbytes(void)
104 { 104 {
105 struct zone *zone; 105 struct zone *zone;
106 int nr_zones = 0; 106 int nr_zones = 0;
107 unsigned long recommended_min; 107 unsigned long recommended_min;
108 108
109 if (!khugepaged_enabled()) 109 if (!khugepaged_enabled())
110 return 0; 110 return 0;
111 111
112 for_each_populated_zone(zone) 112 for_each_populated_zone(zone)
113 nr_zones++; 113 nr_zones++;
114 114
115 /* Make sure at least 2 hugepages are free for MIGRATE_RESERVE */ 115 /* Make sure at least 2 hugepages are free for MIGRATE_RESERVE */
116 recommended_min = pageblock_nr_pages * nr_zones * 2; 116 recommended_min = pageblock_nr_pages * nr_zones * 2;
117 117
118 /* 118 /*
119 * Make sure that on average at least two pageblocks are almost free 119 * Make sure that on average at least two pageblocks are almost free
120 * of another type, one for a migratetype to fall back to and a 120 * of another type, one for a migratetype to fall back to and a
121 * second to avoid subsequent fallbacks of other types There are 3 121 * second to avoid subsequent fallbacks of other types There are 3
122 * MIGRATE_TYPES we care about. 122 * MIGRATE_TYPES we care about.
123 */ 123 */
124 recommended_min += pageblock_nr_pages * nr_zones * 124 recommended_min += pageblock_nr_pages * nr_zones *
125 MIGRATE_PCPTYPES * MIGRATE_PCPTYPES; 125 MIGRATE_PCPTYPES * MIGRATE_PCPTYPES;
126 126
127 /* don't ever allow to reserve more than 5% of the lowmem */ 127 /* don't ever allow to reserve more than 5% of the lowmem */
128 recommended_min = min(recommended_min, 128 recommended_min = min(recommended_min,
129 (unsigned long) nr_free_buffer_pages() / 20); 129 (unsigned long) nr_free_buffer_pages() / 20);
130 recommended_min <<= (PAGE_SHIFT-10); 130 recommended_min <<= (PAGE_SHIFT-10);
131 131
132 if (recommended_min > min_free_kbytes) 132 if (recommended_min > min_free_kbytes)
133 min_free_kbytes = recommended_min; 133 min_free_kbytes = recommended_min;
134 setup_per_zone_wmarks(); 134 setup_per_zone_wmarks();
135 return 0; 135 return 0;
136 } 136 }
137 late_initcall(set_recommended_min_free_kbytes); 137 late_initcall(set_recommended_min_free_kbytes);
138 138
139 static int start_khugepaged(void) 139 static int start_khugepaged(void)
140 { 140 {
141 int err = 0; 141 int err = 0;
142 if (khugepaged_enabled()) { 142 if (khugepaged_enabled()) {
143 if (!khugepaged_thread) 143 if (!khugepaged_thread)
144 khugepaged_thread = kthread_run(khugepaged, NULL, 144 khugepaged_thread = kthread_run(khugepaged, NULL,
145 "khugepaged"); 145 "khugepaged");
146 if (unlikely(IS_ERR(khugepaged_thread))) { 146 if (unlikely(IS_ERR(khugepaged_thread))) {
147 printk(KERN_ERR 147 printk(KERN_ERR
148 "khugepaged: kthread_run(khugepaged) failed\n"); 148 "khugepaged: kthread_run(khugepaged) failed\n");
149 err = PTR_ERR(khugepaged_thread); 149 err = PTR_ERR(khugepaged_thread);
150 khugepaged_thread = NULL; 150 khugepaged_thread = NULL;
151 } 151 }
152 152
153 if (!list_empty(&khugepaged_scan.mm_head)) 153 if (!list_empty(&khugepaged_scan.mm_head))
154 wake_up_interruptible(&khugepaged_wait); 154 wake_up_interruptible(&khugepaged_wait);
155 155
156 set_recommended_min_free_kbytes(); 156 set_recommended_min_free_kbytes();
157 } else if (khugepaged_thread) { 157 } else if (khugepaged_thread) {
158 kthread_stop(khugepaged_thread); 158 kthread_stop(khugepaged_thread);
159 khugepaged_thread = NULL; 159 khugepaged_thread = NULL;
160 } 160 }
161 161
162 return err; 162 return err;
163 } 163 }
164 164
165 static atomic_t huge_zero_refcount; 165 static atomic_t huge_zero_refcount;
166 static unsigned long huge_zero_pfn __read_mostly; 166 static unsigned long huge_zero_pfn __read_mostly;
167 167
168 static inline bool is_huge_zero_pfn(unsigned long pfn) 168 static inline bool is_huge_zero_pfn(unsigned long pfn)
169 { 169 {
170 unsigned long zero_pfn = ACCESS_ONCE(huge_zero_pfn); 170 unsigned long zero_pfn = ACCESS_ONCE(huge_zero_pfn);
171 return zero_pfn && pfn == zero_pfn; 171 return zero_pfn && pfn == zero_pfn;
172 } 172 }
173 173
174 static inline bool is_huge_zero_pmd(pmd_t pmd) 174 static inline bool is_huge_zero_pmd(pmd_t pmd)
175 { 175 {
176 return is_huge_zero_pfn(pmd_pfn(pmd)); 176 return is_huge_zero_pfn(pmd_pfn(pmd));
177 } 177 }
178 178
179 static unsigned long get_huge_zero_page(void) 179 static unsigned long get_huge_zero_page(void)
180 { 180 {
181 struct page *zero_page; 181 struct page *zero_page;
182 retry: 182 retry:
183 if (likely(atomic_inc_not_zero(&huge_zero_refcount))) 183 if (likely(atomic_inc_not_zero(&huge_zero_refcount)))
184 return ACCESS_ONCE(huge_zero_pfn); 184 return ACCESS_ONCE(huge_zero_pfn);
185 185
186 zero_page = alloc_pages((GFP_TRANSHUGE | __GFP_ZERO) & ~__GFP_MOVABLE, 186 zero_page = alloc_pages((GFP_TRANSHUGE | __GFP_ZERO) & ~__GFP_MOVABLE,
187 HPAGE_PMD_ORDER); 187 HPAGE_PMD_ORDER);
188 if (!zero_page) { 188 if (!zero_page) {
189 count_vm_event(THP_ZERO_PAGE_ALLOC_FAILED); 189 count_vm_event(THP_ZERO_PAGE_ALLOC_FAILED);
190 return 0; 190 return 0;
191 } 191 }
192 count_vm_event(THP_ZERO_PAGE_ALLOC); 192 count_vm_event(THP_ZERO_PAGE_ALLOC);
193 preempt_disable(); 193 preempt_disable();
194 if (cmpxchg(&huge_zero_pfn, 0, page_to_pfn(zero_page))) { 194 if (cmpxchg(&huge_zero_pfn, 0, page_to_pfn(zero_page))) {
195 preempt_enable(); 195 preempt_enable();
196 __free_page(zero_page); 196 __free_page(zero_page);
197 goto retry; 197 goto retry;
198 } 198 }
199 199
200 /* We take additional reference here. It will be put back by shrinker */ 200 /* We take additional reference here. It will be put back by shrinker */
201 atomic_set(&huge_zero_refcount, 2); 201 atomic_set(&huge_zero_refcount, 2);
202 preempt_enable(); 202 preempt_enable();
203 return ACCESS_ONCE(huge_zero_pfn); 203 return ACCESS_ONCE(huge_zero_pfn);
204 } 204 }
205 205
206 static void put_huge_zero_page(void) 206 static void put_huge_zero_page(void)
207 { 207 {
208 /* 208 /*
209 * Counter should never go to zero here. Only shrinker can put 209 * Counter should never go to zero here. Only shrinker can put
210 * last reference. 210 * last reference.
211 */ 211 */
212 BUG_ON(atomic_dec_and_test(&huge_zero_refcount)); 212 BUG_ON(atomic_dec_and_test(&huge_zero_refcount));
213 } 213 }
214 214
215 static int shrink_huge_zero_page(struct shrinker *shrink, 215 static int shrink_huge_zero_page(struct shrinker *shrink,
216 struct shrink_control *sc) 216 struct shrink_control *sc)
217 { 217 {
218 if (!sc->nr_to_scan) 218 if (!sc->nr_to_scan)
219 /* we can free zero page only if last reference remains */ 219 /* we can free zero page only if last reference remains */
220 return atomic_read(&huge_zero_refcount) == 1 ? HPAGE_PMD_NR : 0; 220 return atomic_read(&huge_zero_refcount) == 1 ? HPAGE_PMD_NR : 0;
221 221
222 if (atomic_cmpxchg(&huge_zero_refcount, 1, 0) == 1) { 222 if (atomic_cmpxchg(&huge_zero_refcount, 1, 0) == 1) {
223 unsigned long zero_pfn = xchg(&huge_zero_pfn, 0); 223 unsigned long zero_pfn = xchg(&huge_zero_pfn, 0);
224 BUG_ON(zero_pfn == 0); 224 BUG_ON(zero_pfn == 0);
225 __free_page(__pfn_to_page(zero_pfn)); 225 __free_page(__pfn_to_page(zero_pfn));
226 } 226 }
227 227
228 return 0; 228 return 0;
229 } 229 }
230 230
231 static struct shrinker huge_zero_page_shrinker = { 231 static struct shrinker huge_zero_page_shrinker = {
232 .shrink = shrink_huge_zero_page, 232 .shrink = shrink_huge_zero_page,
233 .seeks = DEFAULT_SEEKS, 233 .seeks = DEFAULT_SEEKS,
234 }; 234 };
235 235
236 #ifdef CONFIG_SYSFS 236 #ifdef CONFIG_SYSFS
237 237
238 static ssize_t double_flag_show(struct kobject *kobj, 238 static ssize_t double_flag_show(struct kobject *kobj,
239 struct kobj_attribute *attr, char *buf, 239 struct kobj_attribute *attr, char *buf,
240 enum transparent_hugepage_flag enabled, 240 enum transparent_hugepage_flag enabled,
241 enum transparent_hugepage_flag req_madv) 241 enum transparent_hugepage_flag req_madv)
242 { 242 {
243 if (test_bit(enabled, &transparent_hugepage_flags)) { 243 if (test_bit(enabled, &transparent_hugepage_flags)) {
244 VM_BUG_ON(test_bit(req_madv, &transparent_hugepage_flags)); 244 VM_BUG_ON(test_bit(req_madv, &transparent_hugepage_flags));
245 return sprintf(buf, "[always] madvise never\n"); 245 return sprintf(buf, "[always] madvise never\n");
246 } else if (test_bit(req_madv, &transparent_hugepage_flags)) 246 } else if (test_bit(req_madv, &transparent_hugepage_flags))
247 return sprintf(buf, "always [madvise] never\n"); 247 return sprintf(buf, "always [madvise] never\n");
248 else 248 else
249 return sprintf(buf, "always madvise [never]\n"); 249 return sprintf(buf, "always madvise [never]\n");
250 } 250 }
251 static ssize_t double_flag_store(struct kobject *kobj, 251 static ssize_t double_flag_store(struct kobject *kobj,
252 struct kobj_attribute *attr, 252 struct kobj_attribute *attr,
253 const char *buf, size_t count, 253 const char *buf, size_t count,
254 enum transparent_hugepage_flag enabled, 254 enum transparent_hugepage_flag enabled,
255 enum transparent_hugepage_flag req_madv) 255 enum transparent_hugepage_flag req_madv)
256 { 256 {
257 if (!memcmp("always", buf, 257 if (!memcmp("always", buf,
258 min(sizeof("always")-1, count))) { 258 min(sizeof("always")-1, count))) {
259 set_bit(enabled, &transparent_hugepage_flags); 259 set_bit(enabled, &transparent_hugepage_flags);
260 clear_bit(req_madv, &transparent_hugepage_flags); 260 clear_bit(req_madv, &transparent_hugepage_flags);
261 } else if (!memcmp("madvise", buf, 261 } else if (!memcmp("madvise", buf,
262 min(sizeof("madvise")-1, count))) { 262 min(sizeof("madvise")-1, count))) {
263 clear_bit(enabled, &transparent_hugepage_flags); 263 clear_bit(enabled, &transparent_hugepage_flags);
264 set_bit(req_madv, &transparent_hugepage_flags); 264 set_bit(req_madv, &transparent_hugepage_flags);
265 } else if (!memcmp("never", buf, 265 } else if (!memcmp("never", buf,
266 min(sizeof("never")-1, count))) { 266 min(sizeof("never")-1, count))) {
267 clear_bit(enabled, &transparent_hugepage_flags); 267 clear_bit(enabled, &transparent_hugepage_flags);
268 clear_bit(req_madv, &transparent_hugepage_flags); 268 clear_bit(req_madv, &transparent_hugepage_flags);
269 } else 269 } else
270 return -EINVAL; 270 return -EINVAL;
271 271
272 return count; 272 return count;
273 } 273 }
274 274
275 static ssize_t enabled_show(struct kobject *kobj, 275 static ssize_t enabled_show(struct kobject *kobj,
276 struct kobj_attribute *attr, char *buf) 276 struct kobj_attribute *attr, char *buf)
277 { 277 {
278 return double_flag_show(kobj, attr, buf, 278 return double_flag_show(kobj, attr, buf,
279 TRANSPARENT_HUGEPAGE_FLAG, 279 TRANSPARENT_HUGEPAGE_FLAG,
280 TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG); 280 TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG);
281 } 281 }
282 static ssize_t enabled_store(struct kobject *kobj, 282 static ssize_t enabled_store(struct kobject *kobj,
283 struct kobj_attribute *attr, 283 struct kobj_attribute *attr,
284 const char *buf, size_t count) 284 const char *buf, size_t count)
285 { 285 {
286 ssize_t ret; 286 ssize_t ret;
287 287
288 ret = double_flag_store(kobj, attr, buf, count, 288 ret = double_flag_store(kobj, attr, buf, count,
289 TRANSPARENT_HUGEPAGE_FLAG, 289 TRANSPARENT_HUGEPAGE_FLAG,
290 TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG); 290 TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG);
291 291
292 if (ret > 0) { 292 if (ret > 0) {
293 int err; 293 int err;
294 294
295 mutex_lock(&khugepaged_mutex); 295 mutex_lock(&khugepaged_mutex);
296 err = start_khugepaged(); 296 err = start_khugepaged();
297 mutex_unlock(&khugepaged_mutex); 297 mutex_unlock(&khugepaged_mutex);
298 298
299 if (err) 299 if (err)
300 ret = err; 300 ret = err;
301 } 301 }
302 302
303 return ret; 303 return ret;
304 } 304 }
305 static struct kobj_attribute enabled_attr = 305 static struct kobj_attribute enabled_attr =
306 __ATTR(enabled, 0644, enabled_show, enabled_store); 306 __ATTR(enabled, 0644, enabled_show, enabled_store);
307 307
308 static ssize_t single_flag_show(struct kobject *kobj, 308 static ssize_t single_flag_show(struct kobject *kobj,
309 struct kobj_attribute *attr, char *buf, 309 struct kobj_attribute *attr, char *buf,
310 enum transparent_hugepage_flag flag) 310 enum transparent_hugepage_flag flag)
311 { 311 {
312 return sprintf(buf, "%d\n", 312 return sprintf(buf, "%d\n",
313 !!test_bit(flag, &transparent_hugepage_flags)); 313 !!test_bit(flag, &transparent_hugepage_flags));
314 } 314 }
315 315
316 static ssize_t single_flag_store(struct kobject *kobj, 316 static ssize_t single_flag_store(struct kobject *kobj,
317 struct kobj_attribute *attr, 317 struct kobj_attribute *attr,
318 const char *buf, size_t count, 318 const char *buf, size_t count,
319 enum transparent_hugepage_flag flag) 319 enum transparent_hugepage_flag flag)
320 { 320 {
321 unsigned long value; 321 unsigned long value;
322 int ret; 322 int ret;
323 323
324 ret = kstrtoul(buf, 10, &value); 324 ret = kstrtoul(buf, 10, &value);
325 if (ret < 0) 325 if (ret < 0)
326 return ret; 326 return ret;
327 if (value > 1) 327 if (value > 1)
328 return -EINVAL; 328 return -EINVAL;
329 329
330 if (value) 330 if (value)
331 set_bit(flag, &transparent_hugepage_flags); 331 set_bit(flag, &transparent_hugepage_flags);
332 else 332 else
333 clear_bit(flag, &transparent_hugepage_flags); 333 clear_bit(flag, &transparent_hugepage_flags);
334 334
335 return count; 335 return count;
336 } 336 }
337 337
338 /* 338 /*
339 * Currently defrag only disables __GFP_NOWAIT for allocation. A blind 339 * Currently defrag only disables __GFP_NOWAIT for allocation. A blind
340 * __GFP_REPEAT is too aggressive, it's never worth swapping tons of 340 * __GFP_REPEAT is too aggressive, it's never worth swapping tons of
341 * memory just to allocate one more hugepage. 341 * memory just to allocate one more hugepage.
342 */ 342 */
343 static ssize_t defrag_show(struct kobject *kobj, 343 static ssize_t defrag_show(struct kobject *kobj,
344 struct kobj_attribute *attr, char *buf) 344 struct kobj_attribute *attr, char *buf)
345 { 345 {
346 return double_flag_show(kobj, attr, buf, 346 return double_flag_show(kobj, attr, buf,
347 TRANSPARENT_HUGEPAGE_DEFRAG_FLAG, 347 TRANSPARENT_HUGEPAGE_DEFRAG_FLAG,
348 TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG); 348 TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG);
349 } 349 }
350 static ssize_t defrag_store(struct kobject *kobj, 350 static ssize_t defrag_store(struct kobject *kobj,
351 struct kobj_attribute *attr, 351 struct kobj_attribute *attr,
352 const char *buf, size_t count) 352 const char *buf, size_t count)
353 { 353 {
354 return double_flag_store(kobj, attr, buf, count, 354 return double_flag_store(kobj, attr, buf, count,
355 TRANSPARENT_HUGEPAGE_DEFRAG_FLAG, 355 TRANSPARENT_HUGEPAGE_DEFRAG_FLAG,
356 TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG); 356 TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG);
357 } 357 }
358 static struct kobj_attribute defrag_attr = 358 static struct kobj_attribute defrag_attr =
359 __ATTR(defrag, 0644, defrag_show, defrag_store); 359 __ATTR(defrag, 0644, defrag_show, defrag_store);
360 360
361 static ssize_t use_zero_page_show(struct kobject *kobj, 361 static ssize_t use_zero_page_show(struct kobject *kobj,
362 struct kobj_attribute *attr, char *buf) 362 struct kobj_attribute *attr, char *buf)
363 { 363 {
364 return single_flag_show(kobj, attr, buf, 364 return single_flag_show(kobj, attr, buf,
365 TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG); 365 TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);
366 } 366 }
367 static ssize_t use_zero_page_store(struct kobject *kobj, 367 static ssize_t use_zero_page_store(struct kobject *kobj,
368 struct kobj_attribute *attr, const char *buf, size_t count) 368 struct kobj_attribute *attr, const char *buf, size_t count)
369 { 369 {
370 return single_flag_store(kobj, attr, buf, count, 370 return single_flag_store(kobj, attr, buf, count,
371 TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG); 371 TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);
372 } 372 }
373 static struct kobj_attribute use_zero_page_attr = 373 static struct kobj_attribute use_zero_page_attr =
374 __ATTR(use_zero_page, 0644, use_zero_page_show, use_zero_page_store); 374 __ATTR(use_zero_page, 0644, use_zero_page_show, use_zero_page_store);
375 #ifdef CONFIG_DEBUG_VM 375 #ifdef CONFIG_DEBUG_VM
376 static ssize_t debug_cow_show(struct kobject *kobj, 376 static ssize_t debug_cow_show(struct kobject *kobj,
377 struct kobj_attribute *attr, char *buf) 377 struct kobj_attribute *attr, char *buf)
378 { 378 {
379 return single_flag_show(kobj, attr, buf, 379 return single_flag_show(kobj, attr, buf,
380 TRANSPARENT_HUGEPAGE_DEBUG_COW_FLAG); 380 TRANSPARENT_HUGEPAGE_DEBUG_COW_FLAG);
381 } 381 }
382 static ssize_t debug_cow_store(struct kobject *kobj, 382 static ssize_t debug_cow_store(struct kobject *kobj,
383 struct kobj_attribute *attr, 383 struct kobj_attribute *attr,
384 const char *buf, size_t count) 384 const char *buf, size_t count)
385 { 385 {
386 return single_flag_store(kobj, attr, buf, count, 386 return single_flag_store(kobj, attr, buf, count,
387 TRANSPARENT_HUGEPAGE_DEBUG_COW_FLAG); 387 TRANSPARENT_HUGEPAGE_DEBUG_COW_FLAG);
388 } 388 }
389 static struct kobj_attribute debug_cow_attr = 389 static struct kobj_attribute debug_cow_attr =
390 __ATTR(debug_cow, 0644, debug_cow_show, debug_cow_store); 390 __ATTR(debug_cow, 0644, debug_cow_show, debug_cow_store);
391 #endif /* CONFIG_DEBUG_VM */ 391 #endif /* CONFIG_DEBUG_VM */
392 392
393 static struct attribute *hugepage_attr[] = { 393 static struct attribute *hugepage_attr[] = {
394 &enabled_attr.attr, 394 &enabled_attr.attr,
395 &defrag_attr.attr, 395 &defrag_attr.attr,
396 &use_zero_page_attr.attr, 396 &use_zero_page_attr.attr,
397 #ifdef CONFIG_DEBUG_VM 397 #ifdef CONFIG_DEBUG_VM
398 &debug_cow_attr.attr, 398 &debug_cow_attr.attr,
399 #endif 399 #endif
400 NULL, 400 NULL,
401 }; 401 };
402 402
403 static struct attribute_group hugepage_attr_group = { 403 static struct attribute_group hugepage_attr_group = {
404 .attrs = hugepage_attr, 404 .attrs = hugepage_attr,
405 }; 405 };
406 406
407 static ssize_t scan_sleep_millisecs_show(struct kobject *kobj, 407 static ssize_t scan_sleep_millisecs_show(struct kobject *kobj,
408 struct kobj_attribute *attr, 408 struct kobj_attribute *attr,
409 char *buf) 409 char *buf)
410 { 410 {
411 return sprintf(buf, "%u\n", khugepaged_scan_sleep_millisecs); 411 return sprintf(buf, "%u\n", khugepaged_scan_sleep_millisecs);
412 } 412 }
413 413
414 static ssize_t scan_sleep_millisecs_store(struct kobject *kobj, 414 static ssize_t scan_sleep_millisecs_store(struct kobject *kobj,
415 struct kobj_attribute *attr, 415 struct kobj_attribute *attr,
416 const char *buf, size_t count) 416 const char *buf, size_t count)
417 { 417 {
418 unsigned long msecs; 418 unsigned long msecs;
419 int err; 419 int err;
420 420
421 err = strict_strtoul(buf, 10, &msecs); 421 err = strict_strtoul(buf, 10, &msecs);
422 if (err || msecs > UINT_MAX) 422 if (err || msecs > UINT_MAX)
423 return -EINVAL; 423 return -EINVAL;
424 424
425 khugepaged_scan_sleep_millisecs = msecs; 425 khugepaged_scan_sleep_millisecs = msecs;
426 wake_up_interruptible(&khugepaged_wait); 426 wake_up_interruptible(&khugepaged_wait);
427 427
428 return count; 428 return count;
429 } 429 }
430 static struct kobj_attribute scan_sleep_millisecs_attr = 430 static struct kobj_attribute scan_sleep_millisecs_attr =
431 __ATTR(scan_sleep_millisecs, 0644, scan_sleep_millisecs_show, 431 __ATTR(scan_sleep_millisecs, 0644, scan_sleep_millisecs_show,
432 scan_sleep_millisecs_store); 432 scan_sleep_millisecs_store);
433 433
434 static ssize_t alloc_sleep_millisecs_show(struct kobject *kobj, 434 static ssize_t alloc_sleep_millisecs_show(struct kobject *kobj,
435 struct kobj_attribute *attr, 435 struct kobj_attribute *attr,
436 char *buf) 436 char *buf)
437 { 437 {
438 return sprintf(buf, "%u\n", khugepaged_alloc_sleep_millisecs); 438 return sprintf(buf, "%u\n", khugepaged_alloc_sleep_millisecs);
439 } 439 }
440 440
441 static ssize_t alloc_sleep_millisecs_store(struct kobject *kobj, 441 static ssize_t alloc_sleep_millisecs_store(struct kobject *kobj,
442 struct kobj_attribute *attr, 442 struct kobj_attribute *attr,
443 const char *buf, size_t count) 443 const char *buf, size_t count)
444 { 444 {
445 unsigned long msecs; 445 unsigned long msecs;
446 int err; 446 int err;
447 447
448 err = strict_strtoul(buf, 10, &msecs); 448 err = strict_strtoul(buf, 10, &msecs);
449 if (err || msecs > UINT_MAX) 449 if (err || msecs > UINT_MAX)
450 return -EINVAL; 450 return -EINVAL;
451 451
452 khugepaged_alloc_sleep_millisecs = msecs; 452 khugepaged_alloc_sleep_millisecs = msecs;
453 wake_up_interruptible(&khugepaged_wait); 453 wake_up_interruptible(&khugepaged_wait);
454 454
455 return count; 455 return count;
456 } 456 }
457 static struct kobj_attribute alloc_sleep_millisecs_attr = 457 static struct kobj_attribute alloc_sleep_millisecs_attr =
458 __ATTR(alloc_sleep_millisecs, 0644, alloc_sleep_millisecs_show, 458 __ATTR(alloc_sleep_millisecs, 0644, alloc_sleep_millisecs_show,
459 alloc_sleep_millisecs_store); 459 alloc_sleep_millisecs_store);
460 460
461 static ssize_t pages_to_scan_show(struct kobject *kobj, 461 static ssize_t pages_to_scan_show(struct kobject *kobj,
462 struct kobj_attribute *attr, 462 struct kobj_attribute *attr,
463 char *buf) 463 char *buf)
464 { 464 {
465 return sprintf(buf, "%u\n", khugepaged_pages_to_scan); 465 return sprintf(buf, "%u\n", khugepaged_pages_to_scan);
466 } 466 }
467 static ssize_t pages_to_scan_store(struct kobject *kobj, 467 static ssize_t pages_to_scan_store(struct kobject *kobj,
468 struct kobj_attribute *attr, 468 struct kobj_attribute *attr,
469 const char *buf, size_t count) 469 const char *buf, size_t count)
470 { 470 {
471 int err; 471 int err;
472 unsigned long pages; 472 unsigned long pages;
473 473
474 err = strict_strtoul(buf, 10, &pages); 474 err = strict_strtoul(buf, 10, &pages);
475 if (err || !pages || pages > UINT_MAX) 475 if (err || !pages || pages > UINT_MAX)
476 return -EINVAL; 476 return -EINVAL;
477 477
478 khugepaged_pages_to_scan = pages; 478 khugepaged_pages_to_scan = pages;
479 479
480 return count; 480 return count;
481 } 481 }
482 static struct kobj_attribute pages_to_scan_attr = 482 static struct kobj_attribute pages_to_scan_attr =
483 __ATTR(pages_to_scan, 0644, pages_to_scan_show, 483 __ATTR(pages_to_scan, 0644, pages_to_scan_show,
484 pages_to_scan_store); 484 pages_to_scan_store);
485 485
486 static ssize_t pages_collapsed_show(struct kobject *kobj, 486 static ssize_t pages_collapsed_show(struct kobject *kobj,
487 struct kobj_attribute *attr, 487 struct kobj_attribute *attr,
488 char *buf) 488 char *buf)
489 { 489 {
490 return sprintf(buf, "%u\n", khugepaged_pages_collapsed); 490 return sprintf(buf, "%u\n", khugepaged_pages_collapsed);
491 } 491 }
492 static struct kobj_attribute pages_collapsed_attr = 492 static struct kobj_attribute pages_collapsed_attr =
493 __ATTR_RO(pages_collapsed); 493 __ATTR_RO(pages_collapsed);
494 494
495 static ssize_t full_scans_show(struct kobject *kobj, 495 static ssize_t full_scans_show(struct kobject *kobj,
496 struct kobj_attribute *attr, 496 struct kobj_attribute *attr,
497 char *buf) 497 char *buf)
498 { 498 {
499 return sprintf(buf, "%u\n", khugepaged_full_scans); 499 return sprintf(buf, "%u\n", khugepaged_full_scans);
500 } 500 }
501 static struct kobj_attribute full_scans_attr = 501 static struct kobj_attribute full_scans_attr =
502 __ATTR_RO(full_scans); 502 __ATTR_RO(full_scans);
503 503
504 static ssize_t khugepaged_defrag_show(struct kobject *kobj, 504 static ssize_t khugepaged_defrag_show(struct kobject *kobj,
505 struct kobj_attribute *attr, char *buf) 505 struct kobj_attribute *attr, char *buf)
506 { 506 {
507 return single_flag_show(kobj, attr, buf, 507 return single_flag_show(kobj, attr, buf,
508 TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG); 508 TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG);
509 } 509 }
510 static ssize_t khugepaged_defrag_store(struct kobject *kobj, 510 static ssize_t khugepaged_defrag_store(struct kobject *kobj,
511 struct kobj_attribute *attr, 511 struct kobj_attribute *attr,
512 const char *buf, size_t count) 512 const char *buf, size_t count)
513 { 513 {
514 return single_flag_store(kobj, attr, buf, count, 514 return single_flag_store(kobj, attr, buf, count,
515 TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG); 515 TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG);
516 } 516 }
517 static struct kobj_attribute khugepaged_defrag_attr = 517 static struct kobj_attribute khugepaged_defrag_attr =
518 __ATTR(defrag, 0644, khugepaged_defrag_show, 518 __ATTR(defrag, 0644, khugepaged_defrag_show,
519 khugepaged_defrag_store); 519 khugepaged_defrag_store);
520 520
521 /* 521 /*
522 * max_ptes_none controls if khugepaged should collapse hugepages over 522 * max_ptes_none controls if khugepaged should collapse hugepages over
523 * any unmapped ptes in turn potentially increasing the memory 523 * any unmapped ptes in turn potentially increasing the memory
524 * footprint of the vmas. When max_ptes_none is 0 khugepaged will not 524 * footprint of the vmas. When max_ptes_none is 0 khugepaged will not
525 * reduce the available free memory in the system as it 525 * reduce the available free memory in the system as it
526 * runs. Increasing max_ptes_none will instead potentially reduce the 526 * runs. Increasing max_ptes_none will instead potentially reduce the
527 * free memory in the system during the khugepaged scan. 527 * free memory in the system during the khugepaged scan.
528 */ 528 */
529 static ssize_t khugepaged_max_ptes_none_show(struct kobject *kobj, 529 static ssize_t khugepaged_max_ptes_none_show(struct kobject *kobj,
530 struct kobj_attribute *attr, 530 struct kobj_attribute *attr,
531 char *buf) 531 char *buf)
532 { 532 {
533 return sprintf(buf, "%u\n", khugepaged_max_ptes_none); 533 return sprintf(buf, "%u\n", khugepaged_max_ptes_none);
534 } 534 }
535 static ssize_t khugepaged_max_ptes_none_store(struct kobject *kobj, 535 static ssize_t khugepaged_max_ptes_none_store(struct kobject *kobj,
536 struct kobj_attribute *attr, 536 struct kobj_attribute *attr,
537 const char *buf, size_t count) 537 const char *buf, size_t count)
538 { 538 {
539 int err; 539 int err;
540 unsigned long max_ptes_none; 540 unsigned long max_ptes_none;
541 541
542 err = strict_strtoul(buf, 10, &max_ptes_none); 542 err = strict_strtoul(buf, 10, &max_ptes_none);
543 if (err || max_ptes_none > HPAGE_PMD_NR-1) 543 if (err || max_ptes_none > HPAGE_PMD_NR-1)
544 return -EINVAL; 544 return -EINVAL;
545 545
546 khugepaged_max_ptes_none = max_ptes_none; 546 khugepaged_max_ptes_none = max_ptes_none;
547 547
548 return count; 548 return count;
549 } 549 }
550 static struct kobj_attribute khugepaged_max_ptes_none_attr = 550 static struct kobj_attribute khugepaged_max_ptes_none_attr =
551 __ATTR(max_ptes_none, 0644, khugepaged_max_ptes_none_show, 551 __ATTR(max_ptes_none, 0644, khugepaged_max_ptes_none_show,
552 khugepaged_max_ptes_none_store); 552 khugepaged_max_ptes_none_store);
553 553
554 static struct attribute *khugepaged_attr[] = { 554 static struct attribute *khugepaged_attr[] = {
555 &khugepaged_defrag_attr.attr, 555 &khugepaged_defrag_attr.attr,
556 &khugepaged_max_ptes_none_attr.attr, 556 &khugepaged_max_ptes_none_attr.attr,
557 &pages_to_scan_attr.attr, 557 &pages_to_scan_attr.attr,
558 &pages_collapsed_attr.attr, 558 &pages_collapsed_attr.attr,
559 &full_scans_attr.attr, 559 &full_scans_attr.attr,
560 &scan_sleep_millisecs_attr.attr, 560 &scan_sleep_millisecs_attr.attr,
561 &alloc_sleep_millisecs_attr.attr, 561 &alloc_sleep_millisecs_attr.attr,
562 NULL, 562 NULL,
563 }; 563 };
564 564
565 static struct attribute_group khugepaged_attr_group = { 565 static struct attribute_group khugepaged_attr_group = {
566 .attrs = khugepaged_attr, 566 .attrs = khugepaged_attr,
567 .name = "khugepaged", 567 .name = "khugepaged",
568 }; 568 };
569 569
570 static int __init hugepage_init_sysfs(struct kobject **hugepage_kobj) 570 static int __init hugepage_init_sysfs(struct kobject **hugepage_kobj)
571 { 571 {
572 int err; 572 int err;
573 573
574 *hugepage_kobj = kobject_create_and_add("transparent_hugepage", mm_kobj); 574 *hugepage_kobj = kobject_create_and_add("transparent_hugepage", mm_kobj);
575 if (unlikely(!*hugepage_kobj)) { 575 if (unlikely(!*hugepage_kobj)) {
576 printk(KERN_ERR "hugepage: failed to create transparent hugepage kobject\n"); 576 printk(KERN_ERR "hugepage: failed to create transparent hugepage kobject\n");
577 return -ENOMEM; 577 return -ENOMEM;
578 } 578 }
579 579
580 err = sysfs_create_group(*hugepage_kobj, &hugepage_attr_group); 580 err = sysfs_create_group(*hugepage_kobj, &hugepage_attr_group);
581 if (err) { 581 if (err) {
582 printk(KERN_ERR "hugepage: failed to register transparent hugepage group\n"); 582 printk(KERN_ERR "hugepage: failed to register transparent hugepage group\n");
583 goto delete_obj; 583 goto delete_obj;
584 } 584 }
585 585
586 err = sysfs_create_group(*hugepage_kobj, &khugepaged_attr_group); 586 err = sysfs_create_group(*hugepage_kobj, &khugepaged_attr_group);
587 if (err) { 587 if (err) {
588 printk(KERN_ERR "hugepage: failed to register transparent hugepage group\n"); 588 printk(KERN_ERR "hugepage: failed to register transparent hugepage group\n");
589 goto remove_hp_group; 589 goto remove_hp_group;
590 } 590 }
591 591
592 return 0; 592 return 0;
593 593
594 remove_hp_group: 594 remove_hp_group:
595 sysfs_remove_group(*hugepage_kobj, &hugepage_attr_group); 595 sysfs_remove_group(*hugepage_kobj, &hugepage_attr_group);
596 delete_obj: 596 delete_obj:
597 kobject_put(*hugepage_kobj); 597 kobject_put(*hugepage_kobj);
598 return err; 598 return err;
599 } 599 }
600 600
601 static void __init hugepage_exit_sysfs(struct kobject *hugepage_kobj) 601 static void __init hugepage_exit_sysfs(struct kobject *hugepage_kobj)
602 { 602 {
603 sysfs_remove_group(hugepage_kobj, &khugepaged_attr_group); 603 sysfs_remove_group(hugepage_kobj, &khugepaged_attr_group);
604 sysfs_remove_group(hugepage_kobj, &hugepage_attr_group); 604 sysfs_remove_group(hugepage_kobj, &hugepage_attr_group);
605 kobject_put(hugepage_kobj); 605 kobject_put(hugepage_kobj);
606 } 606 }
607 #else 607 #else
608 static inline int hugepage_init_sysfs(struct kobject **hugepage_kobj) 608 static inline int hugepage_init_sysfs(struct kobject **hugepage_kobj)
609 { 609 {
610 return 0; 610 return 0;
611 } 611 }
612 612
613 static inline void hugepage_exit_sysfs(struct kobject *hugepage_kobj) 613 static inline void hugepage_exit_sysfs(struct kobject *hugepage_kobj)
614 { 614 {
615 } 615 }
616 #endif /* CONFIG_SYSFS */ 616 #endif /* CONFIG_SYSFS */
617 617
618 static int __init hugepage_init(void) 618 static int __init hugepage_init(void)
619 { 619 {
620 int err; 620 int err;
621 struct kobject *hugepage_kobj; 621 struct kobject *hugepage_kobj;
622 622
623 if (!has_transparent_hugepage()) { 623 if (!has_transparent_hugepage()) {
624 transparent_hugepage_flags = 0; 624 transparent_hugepage_flags = 0;
625 return -EINVAL; 625 return -EINVAL;
626 } 626 }
627 627
628 err = hugepage_init_sysfs(&hugepage_kobj); 628 err = hugepage_init_sysfs(&hugepage_kobj);
629 if (err) 629 if (err)
630 return err; 630 return err;
631 631
632 err = khugepaged_slab_init(); 632 err = khugepaged_slab_init();
633 if (err) 633 if (err)
634 goto out; 634 goto out;
635 635
636 register_shrinker(&huge_zero_page_shrinker); 636 register_shrinker(&huge_zero_page_shrinker);
637 637
638 /* 638 /*
639 * By default disable transparent hugepages on smaller systems, 639 * By default disable transparent hugepages on smaller systems,
640 * where the extra memory used could hurt more than TLB overhead 640 * where the extra memory used could hurt more than TLB overhead
641 * is likely to save. The admin can still enable it through /sys. 641 * is likely to save. The admin can still enable it through /sys.
642 */ 642 */
643 if (totalram_pages < (512 << (20 - PAGE_SHIFT))) 643 if (totalram_pages < (512 << (20 - PAGE_SHIFT)))
644 transparent_hugepage_flags = 0; 644 transparent_hugepage_flags = 0;
645 645
646 start_khugepaged(); 646 start_khugepaged();
647 647
648 return 0; 648 return 0;
649 out: 649 out:
650 hugepage_exit_sysfs(hugepage_kobj); 650 hugepage_exit_sysfs(hugepage_kobj);
651 return err; 651 return err;
652 } 652 }
653 module_init(hugepage_init) 653 module_init(hugepage_init)
654 654
655 static int __init setup_transparent_hugepage(char *str) 655 static int __init setup_transparent_hugepage(char *str)
656 { 656 {
657 int ret = 0; 657 int ret = 0;
658 if (!str) 658 if (!str)
659 goto out; 659 goto out;
660 if (!strcmp(str, "always")) { 660 if (!strcmp(str, "always")) {
661 set_bit(TRANSPARENT_HUGEPAGE_FLAG, 661 set_bit(TRANSPARENT_HUGEPAGE_FLAG,
662 &transparent_hugepage_flags); 662 &transparent_hugepage_flags);
663 clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, 663 clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
664 &transparent_hugepage_flags); 664 &transparent_hugepage_flags);
665 ret = 1; 665 ret = 1;
666 } else if (!strcmp(str, "madvise")) { 666 } else if (!strcmp(str, "madvise")) {
667 clear_bit(TRANSPARENT_HUGEPAGE_FLAG, 667 clear_bit(TRANSPARENT_HUGEPAGE_FLAG,
668 &transparent_hugepage_flags); 668 &transparent_hugepage_flags);
669 set_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, 669 set_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
670 &transparent_hugepage_flags); 670 &transparent_hugepage_flags);
671 ret = 1; 671 ret = 1;
672 } else if (!strcmp(str, "never")) { 672 } else if (!strcmp(str, "never")) {
673 clear_bit(TRANSPARENT_HUGEPAGE_FLAG, 673 clear_bit(TRANSPARENT_HUGEPAGE_FLAG,
674 &transparent_hugepage_flags); 674 &transparent_hugepage_flags);
675 clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, 675 clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
676 &transparent_hugepage_flags); 676 &transparent_hugepage_flags);
677 ret = 1; 677 ret = 1;
678 } 678 }
679 out: 679 out:
680 if (!ret) 680 if (!ret)
681 printk(KERN_WARNING 681 printk(KERN_WARNING
682 "transparent_hugepage= cannot parse, ignored\n"); 682 "transparent_hugepage= cannot parse, ignored\n");
683 return ret; 683 return ret;
684 } 684 }
685 __setup("transparent_hugepage=", setup_transparent_hugepage); 685 __setup("transparent_hugepage=", setup_transparent_hugepage);
686 686
687 pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma) 687 pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma)
688 { 688 {
689 if (likely(vma->vm_flags & VM_WRITE)) 689 if (likely(vma->vm_flags & VM_WRITE))
690 pmd = pmd_mkwrite(pmd); 690 pmd = pmd_mkwrite(pmd);
691 return pmd; 691 return pmd;
692 } 692 }
693 693
694 static inline pmd_t mk_huge_pmd(struct page *page, struct vm_area_struct *vma) 694 static inline pmd_t mk_huge_pmd(struct page *page, struct vm_area_struct *vma)
695 { 695 {
696 pmd_t entry; 696 pmd_t entry;
697 entry = mk_pmd(page, vma->vm_page_prot); 697 entry = mk_pmd(page, vma->vm_page_prot);
698 entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); 698 entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
699 entry = pmd_mkhuge(entry); 699 entry = pmd_mkhuge(entry);
700 return entry; 700 return entry;
701 } 701 }
702 702
703 static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, 703 static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
704 struct vm_area_struct *vma, 704 struct vm_area_struct *vma,
705 unsigned long haddr, pmd_t *pmd, 705 unsigned long haddr, pmd_t *pmd,
706 struct page *page) 706 struct page *page)
707 { 707 {
708 pgtable_t pgtable; 708 pgtable_t pgtable;
709 709
710 VM_BUG_ON(!PageCompound(page)); 710 VM_BUG_ON(!PageCompound(page));
711 pgtable = pte_alloc_one(mm, haddr); 711 pgtable = pte_alloc_one(mm, haddr);
712 if (unlikely(!pgtable)) 712 if (unlikely(!pgtable))
713 return VM_FAULT_OOM; 713 return VM_FAULT_OOM;
714 714
715 clear_huge_page(page, haddr, HPAGE_PMD_NR); 715 clear_huge_page(page, haddr, HPAGE_PMD_NR);
716 /*
717 * The memory barrier inside __SetPageUptodate makes sure that
718 * clear_huge_page writes become visible before the set_pmd_at()
719 * write.
720 */
716 __SetPageUptodate(page); 721 __SetPageUptodate(page);
717 722
718 spin_lock(&mm->page_table_lock); 723 spin_lock(&mm->page_table_lock);
719 if (unlikely(!pmd_none(*pmd))) { 724 if (unlikely(!pmd_none(*pmd))) {
720 spin_unlock(&mm->page_table_lock); 725 spin_unlock(&mm->page_table_lock);
721 mem_cgroup_uncharge_page(page); 726 mem_cgroup_uncharge_page(page);
722 put_page(page); 727 put_page(page);
723 pte_free(mm, pgtable); 728 pte_free(mm, pgtable);
724 } else { 729 } else {
725 pmd_t entry; 730 pmd_t entry;
726 entry = mk_huge_pmd(page, vma); 731 entry = mk_huge_pmd(page, vma);
727 /*
728 * The spinlocking to take the lru_lock inside
729 * page_add_new_anon_rmap() acts as a full memory
730 * barrier to be sure clear_huge_page writes become
731 * visible after the set_pmd_at() write.
732 */
733 page_add_new_anon_rmap(page, vma, haddr); 732 page_add_new_anon_rmap(page, vma, haddr);
734 set_pmd_at(mm, haddr, pmd, entry); 733 set_pmd_at(mm, haddr, pmd, entry);
735 pgtable_trans_huge_deposit(mm, pgtable); 734 pgtable_trans_huge_deposit(mm, pgtable);
736 add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR); 735 add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR);
737 mm->nr_ptes++; 736 mm->nr_ptes++;
738 spin_unlock(&mm->page_table_lock); 737 spin_unlock(&mm->page_table_lock);
739 } 738 }
740 739
741 return 0; 740 return 0;
742 } 741 }
743 742
744 static inline gfp_t alloc_hugepage_gfpmask(int defrag, gfp_t extra_gfp) 743 static inline gfp_t alloc_hugepage_gfpmask(int defrag, gfp_t extra_gfp)
745 { 744 {
746 return (GFP_TRANSHUGE & ~(defrag ? 0 : __GFP_WAIT)) | extra_gfp; 745 return (GFP_TRANSHUGE & ~(defrag ? 0 : __GFP_WAIT)) | extra_gfp;
747 } 746 }
748 747
749 static inline struct page *alloc_hugepage_vma(int defrag, 748 static inline struct page *alloc_hugepage_vma(int defrag,
750 struct vm_area_struct *vma, 749 struct vm_area_struct *vma,
751 unsigned long haddr, int nd, 750 unsigned long haddr, int nd,
752 gfp_t extra_gfp) 751 gfp_t extra_gfp)
753 { 752 {
754 return alloc_pages_vma(alloc_hugepage_gfpmask(defrag, extra_gfp), 753 return alloc_pages_vma(alloc_hugepage_gfpmask(defrag, extra_gfp),
755 HPAGE_PMD_ORDER, vma, haddr, nd); 754 HPAGE_PMD_ORDER, vma, haddr, nd);
756 } 755 }
757 756
758 #ifndef CONFIG_NUMA 757 #ifndef CONFIG_NUMA
759 static inline struct page *alloc_hugepage(int defrag) 758 static inline struct page *alloc_hugepage(int defrag)
760 { 759 {
761 return alloc_pages(alloc_hugepage_gfpmask(defrag, 0), 760 return alloc_pages(alloc_hugepage_gfpmask(defrag, 0),
762 HPAGE_PMD_ORDER); 761 HPAGE_PMD_ORDER);
763 } 762 }
764 #endif 763 #endif
765 764
766 static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm, 765 static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm,
767 struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd, 766 struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd,
768 unsigned long zero_pfn) 767 unsigned long zero_pfn)
769 { 768 {
770 pmd_t entry; 769 pmd_t entry;
771 if (!pmd_none(*pmd)) 770 if (!pmd_none(*pmd))
772 return false; 771 return false;
773 entry = pfn_pmd(zero_pfn, vma->vm_page_prot); 772 entry = pfn_pmd(zero_pfn, vma->vm_page_prot);
774 entry = pmd_wrprotect(entry); 773 entry = pmd_wrprotect(entry);
775 entry = pmd_mkhuge(entry); 774 entry = pmd_mkhuge(entry);
776 set_pmd_at(mm, haddr, pmd, entry); 775 set_pmd_at(mm, haddr, pmd, entry);
777 pgtable_trans_huge_deposit(mm, pgtable); 776 pgtable_trans_huge_deposit(mm, pgtable);
778 mm->nr_ptes++; 777 mm->nr_ptes++;
779 return true; 778 return true;
780 } 779 }
781 780
782 int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, 781 int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
783 unsigned long address, pmd_t *pmd, 782 unsigned long address, pmd_t *pmd,
784 unsigned int flags) 783 unsigned int flags)
785 { 784 {
786 struct page *page; 785 struct page *page;
787 unsigned long haddr = address & HPAGE_PMD_MASK; 786 unsigned long haddr = address & HPAGE_PMD_MASK;
788 pte_t *pte; 787 pte_t *pte;
789 788
790 if (haddr >= vma->vm_start && haddr + HPAGE_PMD_SIZE <= vma->vm_end) { 789 if (haddr >= vma->vm_start && haddr + HPAGE_PMD_SIZE <= vma->vm_end) {
791 if (unlikely(anon_vma_prepare(vma))) 790 if (unlikely(anon_vma_prepare(vma)))
792 return VM_FAULT_OOM; 791 return VM_FAULT_OOM;
793 if (unlikely(khugepaged_enter(vma))) 792 if (unlikely(khugepaged_enter(vma)))
794 return VM_FAULT_OOM; 793 return VM_FAULT_OOM;
795 if (!(flags & FAULT_FLAG_WRITE) && 794 if (!(flags & FAULT_FLAG_WRITE) &&
796 transparent_hugepage_use_zero_page()) { 795 transparent_hugepage_use_zero_page()) {
797 pgtable_t pgtable; 796 pgtable_t pgtable;
798 unsigned long zero_pfn; 797 unsigned long zero_pfn;
799 bool set; 798 bool set;
800 pgtable = pte_alloc_one(mm, haddr); 799 pgtable = pte_alloc_one(mm, haddr);
801 if (unlikely(!pgtable)) 800 if (unlikely(!pgtable))
802 return VM_FAULT_OOM; 801 return VM_FAULT_OOM;
803 zero_pfn = get_huge_zero_page(); 802 zero_pfn = get_huge_zero_page();
804 if (unlikely(!zero_pfn)) { 803 if (unlikely(!zero_pfn)) {
805 pte_free(mm, pgtable); 804 pte_free(mm, pgtable);
806 count_vm_event(THP_FAULT_FALLBACK); 805 count_vm_event(THP_FAULT_FALLBACK);
807 goto out; 806 goto out;
808 } 807 }
809 spin_lock(&mm->page_table_lock); 808 spin_lock(&mm->page_table_lock);
810 set = set_huge_zero_page(pgtable, mm, vma, haddr, pmd, 809 set = set_huge_zero_page(pgtable, mm, vma, haddr, pmd,
811 zero_pfn); 810 zero_pfn);
812 spin_unlock(&mm->page_table_lock); 811 spin_unlock(&mm->page_table_lock);
813 if (!set) { 812 if (!set) {
814 pte_free(mm, pgtable); 813 pte_free(mm, pgtable);
815 put_huge_zero_page(); 814 put_huge_zero_page();
816 } 815 }
817 return 0; 816 return 0;
818 } 817 }
819 page = alloc_hugepage_vma(transparent_hugepage_defrag(vma), 818 page = alloc_hugepage_vma(transparent_hugepage_defrag(vma),
820 vma, haddr, numa_node_id(), 0); 819 vma, haddr, numa_node_id(), 0);
821 if (unlikely(!page)) { 820 if (unlikely(!page)) {
822 count_vm_event(THP_FAULT_FALLBACK); 821 count_vm_event(THP_FAULT_FALLBACK);
823 goto out; 822 goto out;
824 } 823 }
825 count_vm_event(THP_FAULT_ALLOC); 824 count_vm_event(THP_FAULT_ALLOC);
826 if (unlikely(mem_cgroup_newpage_charge(page, mm, GFP_KERNEL))) { 825 if (unlikely(mem_cgroup_newpage_charge(page, mm, GFP_KERNEL))) {
827 put_page(page); 826 put_page(page);
828 goto out; 827 goto out;
829 } 828 }
830 if (unlikely(__do_huge_pmd_anonymous_page(mm, vma, haddr, pmd, 829 if (unlikely(__do_huge_pmd_anonymous_page(mm, vma, haddr, pmd,
831 page))) { 830 page))) {
832 mem_cgroup_uncharge_page(page); 831 mem_cgroup_uncharge_page(page);
833 put_page(page); 832 put_page(page);
834 goto out; 833 goto out;
835 } 834 }
836 835
837 return 0; 836 return 0;
838 } 837 }
839 out: 838 out:
840 /* 839 /*
841 * Use __pte_alloc instead of pte_alloc_map, because we can't 840 * Use __pte_alloc instead of pte_alloc_map, because we can't
842 * run pte_offset_map on the pmd, if an huge pmd could 841 * run pte_offset_map on the pmd, if an huge pmd could
843 * materialize from under us from a different thread. 842 * materialize from under us from a different thread.
844 */ 843 */
845 if (unlikely(pmd_none(*pmd)) && 844 if (unlikely(pmd_none(*pmd)) &&
846 unlikely(__pte_alloc(mm, vma, pmd, address))) 845 unlikely(__pte_alloc(mm, vma, pmd, address)))
847 return VM_FAULT_OOM; 846 return VM_FAULT_OOM;
848 /* if an huge pmd materialized from under us just retry later */ 847 /* if an huge pmd materialized from under us just retry later */
849 if (unlikely(pmd_trans_huge(*pmd))) 848 if (unlikely(pmd_trans_huge(*pmd)))
850 return 0; 849 return 0;
851 /* 850 /*
852 * A regular pmd is established and it can't morph into a huge pmd 851 * A regular pmd is established and it can't morph into a huge pmd
853 * from under us anymore at this point because we hold the mmap_sem 852 * from under us anymore at this point because we hold the mmap_sem
854 * read mode and khugepaged takes it in write mode. So now it's 853 * read mode and khugepaged takes it in write mode. So now it's
855 * safe to run pte_offset_map(). 854 * safe to run pte_offset_map().
856 */ 855 */
857 pte = pte_offset_map(pmd, address); 856 pte = pte_offset_map(pmd, address);
858 return handle_pte_fault(mm, vma, address, pte, pmd, flags); 857 return handle_pte_fault(mm, vma, address, pte, pmd, flags);
859 } 858 }
860 859
861 int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, 860 int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
862 pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr, 861 pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr,
863 struct vm_area_struct *vma) 862 struct vm_area_struct *vma)
864 { 863 {
865 struct page *src_page; 864 struct page *src_page;
866 pmd_t pmd; 865 pmd_t pmd;
867 pgtable_t pgtable; 866 pgtable_t pgtable;
868 int ret; 867 int ret;
869 868
870 ret = -ENOMEM; 869 ret = -ENOMEM;
871 pgtable = pte_alloc_one(dst_mm, addr); 870 pgtable = pte_alloc_one(dst_mm, addr);
872 if (unlikely(!pgtable)) 871 if (unlikely(!pgtable))
873 goto out; 872 goto out;
874 873
875 spin_lock(&dst_mm->page_table_lock); 874 spin_lock(&dst_mm->page_table_lock);
876 spin_lock_nested(&src_mm->page_table_lock, SINGLE_DEPTH_NESTING); 875 spin_lock_nested(&src_mm->page_table_lock, SINGLE_DEPTH_NESTING);
877 876
878 ret = -EAGAIN; 877 ret = -EAGAIN;
879 pmd = *src_pmd; 878 pmd = *src_pmd;
880 if (unlikely(!pmd_trans_huge(pmd))) { 879 if (unlikely(!pmd_trans_huge(pmd))) {
881 pte_free(dst_mm, pgtable); 880 pte_free(dst_mm, pgtable);
882 goto out_unlock; 881 goto out_unlock;
883 } 882 }
884 /* 883 /*
885 * mm->page_table_lock is enough to be sure that huge zero pmd is not 884 * mm->page_table_lock is enough to be sure that huge zero pmd is not
886 * under splitting since we don't split the page itself, only pmd to 885 * under splitting since we don't split the page itself, only pmd to
887 * a page table. 886 * a page table.
888 */ 887 */
889 if (is_huge_zero_pmd(pmd)) { 888 if (is_huge_zero_pmd(pmd)) {
890 unsigned long zero_pfn; 889 unsigned long zero_pfn;
891 bool set; 890 bool set;
892 /* 891 /*
893 * get_huge_zero_page() will never allocate a new page here, 892 * get_huge_zero_page() will never allocate a new page here,
894 * since we already have a zero page to copy. It just takes a 893 * since we already have a zero page to copy. It just takes a
895 * reference. 894 * reference.
896 */ 895 */
897 zero_pfn = get_huge_zero_page(); 896 zero_pfn = get_huge_zero_page();
898 set = set_huge_zero_page(pgtable, dst_mm, vma, addr, dst_pmd, 897 set = set_huge_zero_page(pgtable, dst_mm, vma, addr, dst_pmd,
899 zero_pfn); 898 zero_pfn);
900 BUG_ON(!set); /* unexpected !pmd_none(dst_pmd) */ 899 BUG_ON(!set); /* unexpected !pmd_none(dst_pmd) */
901 ret = 0; 900 ret = 0;
902 goto out_unlock; 901 goto out_unlock;
903 } 902 }
904 if (unlikely(pmd_trans_splitting(pmd))) { 903 if (unlikely(pmd_trans_splitting(pmd))) {
905 /* split huge page running from under us */ 904 /* split huge page running from under us */
906 spin_unlock(&src_mm->page_table_lock); 905 spin_unlock(&src_mm->page_table_lock);
907 spin_unlock(&dst_mm->page_table_lock); 906 spin_unlock(&dst_mm->page_table_lock);
908 pte_free(dst_mm, pgtable); 907 pte_free(dst_mm, pgtable);
909 908
910 wait_split_huge_page(vma->anon_vma, src_pmd); /* src_vma */ 909 wait_split_huge_page(vma->anon_vma, src_pmd); /* src_vma */
911 goto out; 910 goto out;
912 } 911 }
913 src_page = pmd_page(pmd); 912 src_page = pmd_page(pmd);
914 VM_BUG_ON(!PageHead(src_page)); 913 VM_BUG_ON(!PageHead(src_page));
915 get_page(src_page); 914 get_page(src_page);
916 page_dup_rmap(src_page); 915 page_dup_rmap(src_page);
917 add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR); 916 add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR);
918 917
919 pmdp_set_wrprotect(src_mm, addr, src_pmd); 918 pmdp_set_wrprotect(src_mm, addr, src_pmd);
920 pmd = pmd_mkold(pmd_wrprotect(pmd)); 919 pmd = pmd_mkold(pmd_wrprotect(pmd));
921 set_pmd_at(dst_mm, addr, dst_pmd, pmd); 920 set_pmd_at(dst_mm, addr, dst_pmd, pmd);
922 pgtable_trans_huge_deposit(dst_mm, pgtable); 921 pgtable_trans_huge_deposit(dst_mm, pgtable);
923 dst_mm->nr_ptes++; 922 dst_mm->nr_ptes++;
924 923
925 ret = 0; 924 ret = 0;
926 out_unlock: 925 out_unlock:
927 spin_unlock(&src_mm->page_table_lock); 926 spin_unlock(&src_mm->page_table_lock);
928 spin_unlock(&dst_mm->page_table_lock); 927 spin_unlock(&dst_mm->page_table_lock);
929 out: 928 out:
930 return ret; 929 return ret;
931 } 930 }
932 931
933 void huge_pmd_set_accessed(struct mm_struct *mm, 932 void huge_pmd_set_accessed(struct mm_struct *mm,
934 struct vm_area_struct *vma, 933 struct vm_area_struct *vma,
935 unsigned long address, 934 unsigned long address,
936 pmd_t *pmd, pmd_t orig_pmd, 935 pmd_t *pmd, pmd_t orig_pmd,
937 int dirty) 936 int dirty)
938 { 937 {
939 pmd_t entry; 938 pmd_t entry;
940 unsigned long haddr; 939 unsigned long haddr;
941 940
942 spin_lock(&mm->page_table_lock); 941 spin_lock(&mm->page_table_lock);
943 if (unlikely(!pmd_same(*pmd, orig_pmd))) 942 if (unlikely(!pmd_same(*pmd, orig_pmd)))
944 goto unlock; 943 goto unlock;
945 944
946 entry = pmd_mkyoung(orig_pmd); 945 entry = pmd_mkyoung(orig_pmd);
947 haddr = address & HPAGE_PMD_MASK; 946 haddr = address & HPAGE_PMD_MASK;
948 if (pmdp_set_access_flags(vma, haddr, pmd, entry, dirty)) 947 if (pmdp_set_access_flags(vma, haddr, pmd, entry, dirty))
949 update_mmu_cache_pmd(vma, address, pmd); 948 update_mmu_cache_pmd(vma, address, pmd);
950 949
951 unlock: 950 unlock:
952 spin_unlock(&mm->page_table_lock); 951 spin_unlock(&mm->page_table_lock);
953 } 952 }
954 953
955 static int do_huge_pmd_wp_zero_page_fallback(struct mm_struct *mm, 954 static int do_huge_pmd_wp_zero_page_fallback(struct mm_struct *mm,
956 struct vm_area_struct *vma, unsigned long address, 955 struct vm_area_struct *vma, unsigned long address,
957 pmd_t *pmd, pmd_t orig_pmd, unsigned long haddr) 956 pmd_t *pmd, pmd_t orig_pmd, unsigned long haddr)
958 { 957 {
959 pgtable_t pgtable; 958 pgtable_t pgtable;
960 pmd_t _pmd; 959 pmd_t _pmd;
961 struct page *page; 960 struct page *page;
962 int i, ret = 0; 961 int i, ret = 0;
963 unsigned long mmun_start; /* For mmu_notifiers */ 962 unsigned long mmun_start; /* For mmu_notifiers */
964 unsigned long mmun_end; /* For mmu_notifiers */ 963 unsigned long mmun_end; /* For mmu_notifiers */
965 964
966 page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address); 965 page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
967 if (!page) { 966 if (!page) {
968 ret |= VM_FAULT_OOM; 967 ret |= VM_FAULT_OOM;
969 goto out; 968 goto out;
970 } 969 }
971 970
972 if (mem_cgroup_newpage_charge(page, mm, GFP_KERNEL)) { 971 if (mem_cgroup_newpage_charge(page, mm, GFP_KERNEL)) {
973 put_page(page); 972 put_page(page);
974 ret |= VM_FAULT_OOM; 973 ret |= VM_FAULT_OOM;
975 goto out; 974 goto out;
976 } 975 }
977 976
978 clear_user_highpage(page, address); 977 clear_user_highpage(page, address);
979 __SetPageUptodate(page); 978 __SetPageUptodate(page);
980 979
981 mmun_start = haddr; 980 mmun_start = haddr;
982 mmun_end = haddr + HPAGE_PMD_SIZE; 981 mmun_end = haddr + HPAGE_PMD_SIZE;
983 mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); 982 mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
984 983
985 spin_lock(&mm->page_table_lock); 984 spin_lock(&mm->page_table_lock);
986 if (unlikely(!pmd_same(*pmd, orig_pmd))) 985 if (unlikely(!pmd_same(*pmd, orig_pmd)))
987 goto out_free_page; 986 goto out_free_page;
988 987
989 pmdp_clear_flush(vma, haddr, pmd); 988 pmdp_clear_flush(vma, haddr, pmd);
990 /* leave pmd empty until pte is filled */ 989 /* leave pmd empty until pte is filled */
991 990
992 pgtable = pgtable_trans_huge_withdraw(mm); 991 pgtable = pgtable_trans_huge_withdraw(mm);
993 pmd_populate(mm, &_pmd, pgtable); 992 pmd_populate(mm, &_pmd, pgtable);
994 993
995 for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) { 994 for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
996 pte_t *pte, entry; 995 pte_t *pte, entry;
997 if (haddr == (address & PAGE_MASK)) { 996 if (haddr == (address & PAGE_MASK)) {
998 entry = mk_pte(page, vma->vm_page_prot); 997 entry = mk_pte(page, vma->vm_page_prot);
999 entry = maybe_mkwrite(pte_mkdirty(entry), vma); 998 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
1000 page_add_new_anon_rmap(page, vma, haddr); 999 page_add_new_anon_rmap(page, vma, haddr);
1001 } else { 1000 } else {
1002 entry = pfn_pte(my_zero_pfn(haddr), vma->vm_page_prot); 1001 entry = pfn_pte(my_zero_pfn(haddr), vma->vm_page_prot);
1003 entry = pte_mkspecial(entry); 1002 entry = pte_mkspecial(entry);
1004 } 1003 }
1005 pte = pte_offset_map(&_pmd, haddr); 1004 pte = pte_offset_map(&_pmd, haddr);
1006 VM_BUG_ON(!pte_none(*pte)); 1005 VM_BUG_ON(!pte_none(*pte));
1007 set_pte_at(mm, haddr, pte, entry); 1006 set_pte_at(mm, haddr, pte, entry);
1008 pte_unmap(pte); 1007 pte_unmap(pte);
1009 } 1008 }
1010 smp_wmb(); /* make pte visible before pmd */ 1009 smp_wmb(); /* make pte visible before pmd */
1011 pmd_populate(mm, pmd, pgtable); 1010 pmd_populate(mm, pmd, pgtable);
1012 spin_unlock(&mm->page_table_lock); 1011 spin_unlock(&mm->page_table_lock);
1013 put_huge_zero_page(); 1012 put_huge_zero_page();
1014 inc_mm_counter(mm, MM_ANONPAGES); 1013 inc_mm_counter(mm, MM_ANONPAGES);
1015 1014
1016 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); 1015 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
1017 1016
1018 ret |= VM_FAULT_WRITE; 1017 ret |= VM_FAULT_WRITE;
1019 out: 1018 out:
1020 return ret; 1019 return ret;
1021 out_free_page: 1020 out_free_page:
1022 spin_unlock(&mm->page_table_lock); 1021 spin_unlock(&mm->page_table_lock);
1023 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); 1022 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
1024 mem_cgroup_uncharge_page(page); 1023 mem_cgroup_uncharge_page(page);
1025 put_page(page); 1024 put_page(page);
1026 goto out; 1025 goto out;
1027 } 1026 }
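
The smp_wmb() just before pmd_populate() above is the usual publish pattern: every pte in the withdrawn page table is filled in first, and only then is the pmd that points at it made visible. A minimal userspace-flavoured sketch of the same ordering rule, with C11 atomics standing in for the kernel's smp_wmb()/pmd_populate() pair (the struct and function names here are illustrative, not kernel APIs):

#include <stdatomic.h>
#include <stdlib.h>

struct table { long slot[512]; };		/* stands in for a pte page */
static _Atomic(struct table *) published;	/* stands in for the pmd   */

static void publish_table(void)
{
	struct table *t = calloc(1, sizeof(*t));
	if (!t)
		return;
	for (int i = 0; i < 512; i++)		/* initialise every "pte" first */
		t->slot[i] = i;
	/* release store: a reader that sees the pointer also sees the slots */
	atomic_store_explicit(&published, t, memory_order_release);
}

int main(void)
{
	publish_table();
	return 0;
}
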
1028 1027
1029 static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm, 1028 static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
1030 struct vm_area_struct *vma, 1029 struct vm_area_struct *vma,
1031 unsigned long address, 1030 unsigned long address,
1032 pmd_t *pmd, pmd_t orig_pmd, 1031 pmd_t *pmd, pmd_t orig_pmd,
1033 struct page *page, 1032 struct page *page,
1034 unsigned long haddr) 1033 unsigned long haddr)
1035 { 1034 {
1036 pgtable_t pgtable; 1035 pgtable_t pgtable;
1037 pmd_t _pmd; 1036 pmd_t _pmd;
1038 int ret = 0, i; 1037 int ret = 0, i;
1039 struct page **pages; 1038 struct page **pages;
1040 unsigned long mmun_start; /* For mmu_notifiers */ 1039 unsigned long mmun_start; /* For mmu_notifiers */
1041 unsigned long mmun_end; /* For mmu_notifiers */ 1040 unsigned long mmun_end; /* For mmu_notifiers */
1042 1041
1043 pages = kmalloc(sizeof(struct page *) * HPAGE_PMD_NR, 1042 pages = kmalloc(sizeof(struct page *) * HPAGE_PMD_NR,
1044 GFP_KERNEL); 1043 GFP_KERNEL);
1045 if (unlikely(!pages)) { 1044 if (unlikely(!pages)) {
1046 ret |= VM_FAULT_OOM; 1045 ret |= VM_FAULT_OOM;
1047 goto out; 1046 goto out;
1048 } 1047 }
1049 1048
1050 for (i = 0; i < HPAGE_PMD_NR; i++) { 1049 for (i = 0; i < HPAGE_PMD_NR; i++) {
1051 pages[i] = alloc_page_vma_node(GFP_HIGHUSER_MOVABLE | 1050 pages[i] = alloc_page_vma_node(GFP_HIGHUSER_MOVABLE |
1052 __GFP_OTHER_NODE, 1051 __GFP_OTHER_NODE,
1053 vma, address, page_to_nid(page)); 1052 vma, address, page_to_nid(page));
1054 if (unlikely(!pages[i] || 1053 if (unlikely(!pages[i] ||
1055 mem_cgroup_newpage_charge(pages[i], mm, 1054 mem_cgroup_newpage_charge(pages[i], mm,
1056 GFP_KERNEL))) { 1055 GFP_KERNEL))) {
1057 if (pages[i]) 1056 if (pages[i])
1058 put_page(pages[i]); 1057 put_page(pages[i]);
1059 mem_cgroup_uncharge_start(); 1058 mem_cgroup_uncharge_start();
1060 while (--i >= 0) { 1059 while (--i >= 0) {
1061 mem_cgroup_uncharge_page(pages[i]); 1060 mem_cgroup_uncharge_page(pages[i]);
1062 put_page(pages[i]); 1061 put_page(pages[i]);
1063 } 1062 }
1064 mem_cgroup_uncharge_end(); 1063 mem_cgroup_uncharge_end();
1065 kfree(pages); 1064 kfree(pages);
1066 ret |= VM_FAULT_OOM; 1065 ret |= VM_FAULT_OOM;
1067 goto out; 1066 goto out;
1068 } 1067 }
1069 } 1068 }
1070 1069
1071 for (i = 0; i < HPAGE_PMD_NR; i++) { 1070 for (i = 0; i < HPAGE_PMD_NR; i++) {
1072 copy_user_highpage(pages[i], page + i, 1071 copy_user_highpage(pages[i], page + i,
1073 haddr + PAGE_SIZE * i, vma); 1072 haddr + PAGE_SIZE * i, vma);
1074 __SetPageUptodate(pages[i]); 1073 __SetPageUptodate(pages[i]);
1075 cond_resched(); 1074 cond_resched();
1076 } 1075 }
1077 1076
1078 mmun_start = haddr; 1077 mmun_start = haddr;
1079 mmun_end = haddr + HPAGE_PMD_SIZE; 1078 mmun_end = haddr + HPAGE_PMD_SIZE;
1080 mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); 1079 mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
1081 1080
1082 spin_lock(&mm->page_table_lock); 1081 spin_lock(&mm->page_table_lock);
1083 if (unlikely(!pmd_same(*pmd, orig_pmd))) 1082 if (unlikely(!pmd_same(*pmd, orig_pmd)))
1084 goto out_free_pages; 1083 goto out_free_pages;
1085 VM_BUG_ON(!PageHead(page)); 1084 VM_BUG_ON(!PageHead(page));
1086 1085
1087 pmdp_clear_flush(vma, haddr, pmd); 1086 pmdp_clear_flush(vma, haddr, pmd);
1088 /* leave pmd empty until pte is filled */ 1087 /* leave pmd empty until pte is filled */
1089 1088
1090 pgtable = pgtable_trans_huge_withdraw(mm); 1089 pgtable = pgtable_trans_huge_withdraw(mm);
1091 pmd_populate(mm, &_pmd, pgtable); 1090 pmd_populate(mm, &_pmd, pgtable);
1092 1091
1093 for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) { 1092 for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
1094 pte_t *pte, entry; 1093 pte_t *pte, entry;
1095 entry = mk_pte(pages[i], vma->vm_page_prot); 1094 entry = mk_pte(pages[i], vma->vm_page_prot);
1096 entry = maybe_mkwrite(pte_mkdirty(entry), vma); 1095 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
1097 page_add_new_anon_rmap(pages[i], vma, haddr); 1096 page_add_new_anon_rmap(pages[i], vma, haddr);
1098 pte = pte_offset_map(&_pmd, haddr); 1097 pte = pte_offset_map(&_pmd, haddr);
1099 VM_BUG_ON(!pte_none(*pte)); 1098 VM_BUG_ON(!pte_none(*pte));
1100 set_pte_at(mm, haddr, pte, entry); 1099 set_pte_at(mm, haddr, pte, entry);
1101 pte_unmap(pte); 1100 pte_unmap(pte);
1102 } 1101 }
1103 kfree(pages); 1102 kfree(pages);
1104 1103
1105 smp_wmb(); /* make pte visible before pmd */ 1104 smp_wmb(); /* make pte visible before pmd */
1106 pmd_populate(mm, pmd, pgtable); 1105 pmd_populate(mm, pmd, pgtable);
1107 page_remove_rmap(page); 1106 page_remove_rmap(page);
1108 spin_unlock(&mm->page_table_lock); 1107 spin_unlock(&mm->page_table_lock);
1109 1108
1110 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); 1109 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
1111 1110
1112 ret |= VM_FAULT_WRITE; 1111 ret |= VM_FAULT_WRITE;
1113 put_page(page); 1112 put_page(page);
1114 1113
1115 out: 1114 out:
1116 return ret; 1115 return ret;
1117 1116
1118 out_free_pages: 1117 out_free_pages:
1119 spin_unlock(&mm->page_table_lock); 1118 spin_unlock(&mm->page_table_lock);
1120 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); 1119 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
1121 mem_cgroup_uncharge_start(); 1120 mem_cgroup_uncharge_start();
1122 for (i = 0; i < HPAGE_PMD_NR; i++) { 1121 for (i = 0; i < HPAGE_PMD_NR; i++) {
1123 mem_cgroup_uncharge_page(pages[i]); 1122 mem_cgroup_uncharge_page(pages[i]);
1124 put_page(pages[i]); 1123 put_page(pages[i]);
1125 } 1124 }
1126 mem_cgroup_uncharge_end(); 1125 mem_cgroup_uncharge_end();
1127 kfree(pages); 1126 kfree(pages);
1128 goto out; 1127 goto out;
1129 } 1128 }
1130 1129
1131 int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, 1130 int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
1132 unsigned long address, pmd_t *pmd, pmd_t orig_pmd) 1131 unsigned long address, pmd_t *pmd, pmd_t orig_pmd)
1133 { 1132 {
1134 int ret = 0; 1133 int ret = 0;
1135 struct page *page = NULL, *new_page; 1134 struct page *page = NULL, *new_page;
1136 unsigned long haddr; 1135 unsigned long haddr;
1137 unsigned long mmun_start; /* For mmu_notifiers */ 1136 unsigned long mmun_start; /* For mmu_notifiers */
1138 unsigned long mmun_end; /* For mmu_notifiers */ 1137 unsigned long mmun_end; /* For mmu_notifiers */
1139 1138
1140 VM_BUG_ON(!vma->anon_vma); 1139 VM_BUG_ON(!vma->anon_vma);
1141 haddr = address & HPAGE_PMD_MASK; 1140 haddr = address & HPAGE_PMD_MASK;
1142 if (is_huge_zero_pmd(orig_pmd)) 1141 if (is_huge_zero_pmd(orig_pmd))
1143 goto alloc; 1142 goto alloc;
1144 spin_lock(&mm->page_table_lock); 1143 spin_lock(&mm->page_table_lock);
1145 if (unlikely(!pmd_same(*pmd, orig_pmd))) 1144 if (unlikely(!pmd_same(*pmd, orig_pmd)))
1146 goto out_unlock; 1145 goto out_unlock;
1147 1146
1148 page = pmd_page(orig_pmd); 1147 page = pmd_page(orig_pmd);
1149 VM_BUG_ON(!PageCompound(page) || !PageHead(page)); 1148 VM_BUG_ON(!PageCompound(page) || !PageHead(page));
1150 if (page_mapcount(page) == 1) { 1149 if (page_mapcount(page) == 1) {
1151 pmd_t entry; 1150 pmd_t entry;
1152 entry = pmd_mkyoung(orig_pmd); 1151 entry = pmd_mkyoung(orig_pmd);
1153 entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); 1152 entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
1154 if (pmdp_set_access_flags(vma, haddr, pmd, entry, 1)) 1153 if (pmdp_set_access_flags(vma, haddr, pmd, entry, 1))
1155 update_mmu_cache_pmd(vma, address, pmd); 1154 update_mmu_cache_pmd(vma, address, pmd);
1156 ret |= VM_FAULT_WRITE; 1155 ret |= VM_FAULT_WRITE;
1157 goto out_unlock; 1156 goto out_unlock;
1158 } 1157 }
1159 get_page(page); 1158 get_page(page);
1160 spin_unlock(&mm->page_table_lock); 1159 spin_unlock(&mm->page_table_lock);
1161 alloc: 1160 alloc:
1162 if (transparent_hugepage_enabled(vma) && 1161 if (transparent_hugepage_enabled(vma) &&
1163 !transparent_hugepage_debug_cow()) 1162 !transparent_hugepage_debug_cow())
1164 new_page = alloc_hugepage_vma(transparent_hugepage_defrag(vma), 1163 new_page = alloc_hugepage_vma(transparent_hugepage_defrag(vma),
1165 vma, haddr, numa_node_id(), 0); 1164 vma, haddr, numa_node_id(), 0);
1166 else 1165 else
1167 new_page = NULL; 1166 new_page = NULL;
1168 1167
1169 if (unlikely(!new_page)) { 1168 if (unlikely(!new_page)) {
1170 count_vm_event(THP_FAULT_FALLBACK); 1169 count_vm_event(THP_FAULT_FALLBACK);
1171 if (is_huge_zero_pmd(orig_pmd)) { 1170 if (is_huge_zero_pmd(orig_pmd)) {
1172 ret = do_huge_pmd_wp_zero_page_fallback(mm, vma, 1171 ret = do_huge_pmd_wp_zero_page_fallback(mm, vma,
1173 address, pmd, orig_pmd, haddr); 1172 address, pmd, orig_pmd, haddr);
1174 } else { 1173 } else {
1175 ret = do_huge_pmd_wp_page_fallback(mm, vma, address, 1174 ret = do_huge_pmd_wp_page_fallback(mm, vma, address,
1176 pmd, orig_pmd, page, haddr); 1175 pmd, orig_pmd, page, haddr);
1177 if (ret & VM_FAULT_OOM) 1176 if (ret & VM_FAULT_OOM)
1178 split_huge_page(page); 1177 split_huge_page(page);
1179 put_page(page); 1178 put_page(page);
1180 } 1179 }
1181 goto out; 1180 goto out;
1182 } 1181 }
1183 count_vm_event(THP_FAULT_ALLOC); 1182 count_vm_event(THP_FAULT_ALLOC);
1184 1183
1185 if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) { 1184 if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) {
1186 put_page(new_page); 1185 put_page(new_page);
1187 if (page) { 1186 if (page) {
1188 split_huge_page(page); 1187 split_huge_page(page);
1189 put_page(page); 1188 put_page(page);
1190 } 1189 }
1191 ret |= VM_FAULT_OOM; 1190 ret |= VM_FAULT_OOM;
1192 goto out; 1191 goto out;
1193 } 1192 }
1194 1193
1195 if (is_huge_zero_pmd(orig_pmd)) 1194 if (is_huge_zero_pmd(orig_pmd))
1196 clear_huge_page(new_page, haddr, HPAGE_PMD_NR); 1195 clear_huge_page(new_page, haddr, HPAGE_PMD_NR);
1197 else 1196 else
1198 copy_user_huge_page(new_page, page, haddr, vma, HPAGE_PMD_NR); 1197 copy_user_huge_page(new_page, page, haddr, vma, HPAGE_PMD_NR);
1199 __SetPageUptodate(new_page); 1198 __SetPageUptodate(new_page);
1200 1199
1201 mmun_start = haddr; 1200 mmun_start = haddr;
1202 mmun_end = haddr + HPAGE_PMD_SIZE; 1201 mmun_end = haddr + HPAGE_PMD_SIZE;
1203 mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); 1202 mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
1204 1203
1205 spin_lock(&mm->page_table_lock); 1204 spin_lock(&mm->page_table_lock);
1206 if (page) 1205 if (page)
1207 put_page(page); 1206 put_page(page);
1208 if (unlikely(!pmd_same(*pmd, orig_pmd))) { 1207 if (unlikely(!pmd_same(*pmd, orig_pmd))) {
1209 spin_unlock(&mm->page_table_lock); 1208 spin_unlock(&mm->page_table_lock);
1210 mem_cgroup_uncharge_page(new_page); 1209 mem_cgroup_uncharge_page(new_page);
1211 put_page(new_page); 1210 put_page(new_page);
1212 goto out_mn; 1211 goto out_mn;
1213 } else { 1212 } else {
1214 pmd_t entry; 1213 pmd_t entry;
1215 entry = mk_huge_pmd(new_page, vma); 1214 entry = mk_huge_pmd(new_page, vma);
1216 pmdp_clear_flush(vma, haddr, pmd); 1215 pmdp_clear_flush(vma, haddr, pmd);
1217 page_add_new_anon_rmap(new_page, vma, haddr); 1216 page_add_new_anon_rmap(new_page, vma, haddr);
1218 set_pmd_at(mm, haddr, pmd, entry); 1217 set_pmd_at(mm, haddr, pmd, entry);
1219 update_mmu_cache_pmd(vma, address, pmd); 1218 update_mmu_cache_pmd(vma, address, pmd);
1220 if (is_huge_zero_pmd(orig_pmd)) { 1219 if (is_huge_zero_pmd(orig_pmd)) {
1221 add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR); 1220 add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR);
1222 put_huge_zero_page(); 1221 put_huge_zero_page();
1223 } else { 1222 } else {
1224 VM_BUG_ON(!PageHead(page)); 1223 VM_BUG_ON(!PageHead(page));
1225 page_remove_rmap(page); 1224 page_remove_rmap(page);
1226 put_page(page); 1225 put_page(page);
1227 } 1226 }
1228 ret |= VM_FAULT_WRITE; 1227 ret |= VM_FAULT_WRITE;
1229 } 1228 }
1230 spin_unlock(&mm->page_table_lock); 1229 spin_unlock(&mm->page_table_lock);
1231 out_mn: 1230 out_mn:
1232 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); 1231 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
1233 out: 1232 out:
1234 return ret; 1233 return ret;
1235 out_unlock: 1234 out_unlock:
1236 spin_unlock(&mm->page_table_lock); 1235 spin_unlock(&mm->page_table_lock);
1237 return ret; 1236 return ret;
1238 } 1237 }
1239 1238
1240 struct page *follow_trans_huge_pmd(struct vm_area_struct *vma, 1239 struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
1241 unsigned long addr, 1240 unsigned long addr,
1242 pmd_t *pmd, 1241 pmd_t *pmd,
1243 unsigned int flags) 1242 unsigned int flags)
1244 { 1243 {
1245 struct mm_struct *mm = vma->vm_mm; 1244 struct mm_struct *mm = vma->vm_mm;
1246 struct page *page = NULL; 1245 struct page *page = NULL;
1247 1246
1248 assert_spin_locked(&mm->page_table_lock); 1247 assert_spin_locked(&mm->page_table_lock);
1249 1248
1250 if (flags & FOLL_WRITE && !pmd_write(*pmd)) 1249 if (flags & FOLL_WRITE && !pmd_write(*pmd))
1251 goto out; 1250 goto out;
1252 1251
1253 /* Avoid dumping huge zero page */ 1252 /* Avoid dumping huge zero page */
1254 if ((flags & FOLL_DUMP) && is_huge_zero_pmd(*pmd)) 1253 if ((flags & FOLL_DUMP) && is_huge_zero_pmd(*pmd))
1255 return ERR_PTR(-EFAULT); 1254 return ERR_PTR(-EFAULT);
1256 1255
1257 page = pmd_page(*pmd); 1256 page = pmd_page(*pmd);
1258 VM_BUG_ON(!PageHead(page)); 1257 VM_BUG_ON(!PageHead(page));
1259 if (flags & FOLL_TOUCH) { 1258 if (flags & FOLL_TOUCH) {
1260 pmd_t _pmd; 1259 pmd_t _pmd;
1261 /* 1260 /*
1262 * We should set the dirty bit only for FOLL_WRITE but 1261 * We should set the dirty bit only for FOLL_WRITE but
1263 * for now the dirty bit in the pmd is meaningless. 1262 * for now the dirty bit in the pmd is meaningless.
1264 * And if the dirty bit ever becomes meaningful and 1263 * And if the dirty bit ever becomes meaningful and
1265 * we only set it with FOLL_WRITE, an atomic 1264 * we only set it with FOLL_WRITE, an atomic
1266 * set_bit will be required on the pmd to set the 1265 * set_bit will be required on the pmd to set the
1267 * young bit, instead of the current set_pmd_at. 1266 * young bit, instead of the current set_pmd_at.
1268 */ 1267 */
1269 _pmd = pmd_mkyoung(pmd_mkdirty(*pmd)); 1268 _pmd = pmd_mkyoung(pmd_mkdirty(*pmd));
1270 set_pmd_at(mm, addr & HPAGE_PMD_MASK, pmd, _pmd); 1269 set_pmd_at(mm, addr & HPAGE_PMD_MASK, pmd, _pmd);
1271 } 1270 }
1272 if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) { 1271 if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) {
1273 if (page->mapping && trylock_page(page)) { 1272 if (page->mapping && trylock_page(page)) {
1274 lru_add_drain(); 1273 lru_add_drain();
1275 if (page->mapping) 1274 if (page->mapping)
1276 mlock_vma_page(page); 1275 mlock_vma_page(page);
1277 unlock_page(page); 1276 unlock_page(page);
1278 } 1277 }
1279 } 1278 }
1280 page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT; 1279 page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT;
1281 VM_BUG_ON(!PageCompound(page)); 1280 VM_BUG_ON(!PageCompound(page));
1282 if (flags & FOLL_GET) 1281 if (flags & FOLL_GET)
1283 get_page_foll(page); 1282 get_page_foll(page);
1284 1283
1285 out: 1284 out:
1286 return page; 1285 return page;
1287 } 1286 }
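
The last step above, page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT, turns the offset of addr inside the huge page into an index into the compound page, so the correct subpage is returned. A standalone sketch of that arithmetic, assuming the common x86_64 values of 4 KiB base pages and 2 MiB PMD-sized huge pages:

#include <stdio.h>

#define PAGE_SHIFT	12			/* 4 KiB base pages (assumed) */
#define HPAGE_PMD_SHIFT	21			/* 2 MiB huge pages (assumed) */
#define HPAGE_PMD_MASK	(~((1UL << HPAGE_PMD_SHIFT) - 1))

int main(void)
{
	unsigned long addr = 0x7f0000345678UL;		/* arbitrary address */
	unsigned long offset = addr & ~HPAGE_PMD_MASK;	/* offset inside the huge page */
	unsigned long index = offset >> PAGE_SHIFT;	/* which subpage that is */

	/* 0x145678 >> 12 == 0x145 == 325, i.e. head page + 325 */
	printf("subpage index = %lu\n", index);
	return 0;
}
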
1288 1287
1289 /* NUMA hinting page fault entry point for trans huge pmds */ 1288 /* NUMA hinting page fault entry point for trans huge pmds */
1290 int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, 1289 int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
1291 unsigned long addr, pmd_t pmd, pmd_t *pmdp) 1290 unsigned long addr, pmd_t pmd, pmd_t *pmdp)
1292 { 1291 {
1293 struct page *page; 1292 struct page *page;
1294 unsigned long haddr = addr & HPAGE_PMD_MASK; 1293 unsigned long haddr = addr & HPAGE_PMD_MASK;
1295 int target_nid; 1294 int target_nid;
1296 int current_nid = -1; 1295 int current_nid = -1;
1297 bool migrated; 1296 bool migrated;
1298 1297
1299 spin_lock(&mm->page_table_lock); 1298 spin_lock(&mm->page_table_lock);
1300 if (unlikely(!pmd_same(pmd, *pmdp))) 1299 if (unlikely(!pmd_same(pmd, *pmdp)))
1301 goto out_unlock; 1300 goto out_unlock;
1302 1301
1303 page = pmd_page(pmd); 1302 page = pmd_page(pmd);
1304 get_page(page); 1303 get_page(page);
1305 current_nid = page_to_nid(page); 1304 current_nid = page_to_nid(page);
1306 count_vm_numa_event(NUMA_HINT_FAULTS); 1305 count_vm_numa_event(NUMA_HINT_FAULTS);
1307 if (current_nid == numa_node_id()) 1306 if (current_nid == numa_node_id())
1308 count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL); 1307 count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL);
1309 1308
1310 target_nid = mpol_misplaced(page, vma, haddr); 1309 target_nid = mpol_misplaced(page, vma, haddr);
1311 if (target_nid == -1) { 1310 if (target_nid == -1) {
1312 put_page(page); 1311 put_page(page);
1313 goto clear_pmdnuma; 1312 goto clear_pmdnuma;
1314 } 1313 }
1315 1314
1316 /* Acquire the page lock to serialise THP migrations */ 1315 /* Acquire the page lock to serialise THP migrations */
1317 spin_unlock(&mm->page_table_lock); 1316 spin_unlock(&mm->page_table_lock);
1318 lock_page(page); 1317 lock_page(page);
1319 1318
1320 /* Confirm the PMD did not change while the page table lock was dropped */ 1319 /* Confirm the PMD did not change while the page table lock was dropped */
1321 spin_lock(&mm->page_table_lock); 1320 spin_lock(&mm->page_table_lock);
1322 if (unlikely(!pmd_same(pmd, *pmdp))) { 1321 if (unlikely(!pmd_same(pmd, *pmdp))) {
1323 unlock_page(page); 1322 unlock_page(page);
1324 put_page(page); 1323 put_page(page);
1325 goto out_unlock; 1324 goto out_unlock;
1326 } 1325 }
1327 spin_unlock(&mm->page_table_lock); 1326 spin_unlock(&mm->page_table_lock);
1328 1327
1329 /* Migrate the THP to the requested node */ 1328 /* Migrate the THP to the requested node */
1330 migrated = migrate_misplaced_transhuge_page(mm, vma, 1329 migrated = migrate_misplaced_transhuge_page(mm, vma,
1331 pmdp, pmd, addr, page, target_nid); 1330 pmdp, pmd, addr, page, target_nid);
1332 if (!migrated) 1331 if (!migrated)
1333 goto check_same; 1332 goto check_same;
1334 1333
1335 task_numa_fault(target_nid, HPAGE_PMD_NR, true); 1334 task_numa_fault(target_nid, HPAGE_PMD_NR, true);
1336 return 0; 1335 return 0;
1337 1336
1338 check_same: 1337 check_same:
1339 spin_lock(&mm->page_table_lock); 1338 spin_lock(&mm->page_table_lock);
1340 if (unlikely(!pmd_same(pmd, *pmdp))) 1339 if (unlikely(!pmd_same(pmd, *pmdp)))
1341 goto out_unlock; 1340 goto out_unlock;
1342 clear_pmdnuma: 1341 clear_pmdnuma:
1343 pmd = pmd_mknonnuma(pmd); 1342 pmd = pmd_mknonnuma(pmd);
1344 set_pmd_at(mm, haddr, pmdp, pmd); 1343 set_pmd_at(mm, haddr, pmdp, pmd);
1345 VM_BUG_ON(pmd_numa(*pmdp)); 1344 VM_BUG_ON(pmd_numa(*pmdp));
1346 update_mmu_cache_pmd(vma, addr, pmdp); 1345 update_mmu_cache_pmd(vma, addr, pmdp);
1347 out_unlock: 1346 out_unlock:
1348 spin_unlock(&mm->page_table_lock); 1347 spin_unlock(&mm->page_table_lock);
1349 if (current_nid != -1) 1348 if (current_nid != -1)
1350 task_numa_fault(current_nid, HPAGE_PMD_NR, false); 1349 task_numa_fault(current_nid, HPAGE_PMD_NR, false);
1351 return 0; 1350 return 0;
1352 } 1351 }
1353 1352
1354 int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, 1353 int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
1355 pmd_t *pmd, unsigned long addr) 1354 pmd_t *pmd, unsigned long addr)
1356 { 1355 {
1357 int ret = 0; 1356 int ret = 0;
1358 1357
1359 if (__pmd_trans_huge_lock(pmd, vma) == 1) { 1358 if (__pmd_trans_huge_lock(pmd, vma) == 1) {
1360 struct page *page; 1359 struct page *page;
1361 pgtable_t pgtable; 1360 pgtable_t pgtable;
1362 pmd_t orig_pmd; 1361 pmd_t orig_pmd;
1363 pgtable = pgtable_trans_huge_withdraw(tlb->mm); 1362 pgtable = pgtable_trans_huge_withdraw(tlb->mm);
1364 orig_pmd = pmdp_get_and_clear(tlb->mm, addr, pmd); 1363 orig_pmd = pmdp_get_and_clear(tlb->mm, addr, pmd);
1365 tlb_remove_pmd_tlb_entry(tlb, pmd, addr); 1364 tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
1366 if (is_huge_zero_pmd(orig_pmd)) { 1365 if (is_huge_zero_pmd(orig_pmd)) {
1367 tlb->mm->nr_ptes--; 1366 tlb->mm->nr_ptes--;
1368 spin_unlock(&tlb->mm->page_table_lock); 1367 spin_unlock(&tlb->mm->page_table_lock);
1369 put_huge_zero_page(); 1368 put_huge_zero_page();
1370 } else { 1369 } else {
1371 page = pmd_page(orig_pmd); 1370 page = pmd_page(orig_pmd);
1372 page_remove_rmap(page); 1371 page_remove_rmap(page);
1373 VM_BUG_ON(page_mapcount(page) < 0); 1372 VM_BUG_ON(page_mapcount(page) < 0);
1374 add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR); 1373 add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR);
1375 VM_BUG_ON(!PageHead(page)); 1374 VM_BUG_ON(!PageHead(page));
1376 tlb->mm->nr_ptes--; 1375 tlb->mm->nr_ptes--;
1377 spin_unlock(&tlb->mm->page_table_lock); 1376 spin_unlock(&tlb->mm->page_table_lock);
1378 tlb_remove_page(tlb, page); 1377 tlb_remove_page(tlb, page);
1379 } 1378 }
1380 pte_free(tlb->mm, pgtable); 1379 pte_free(tlb->mm, pgtable);
1381 ret = 1; 1380 ret = 1;
1382 } 1381 }
1383 return ret; 1382 return ret;
1384 } 1383 }
1385 1384
1386 int mincore_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, 1385 int mincore_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
1387 unsigned long addr, unsigned long end, 1386 unsigned long addr, unsigned long end,
1388 unsigned char *vec) 1387 unsigned char *vec)
1389 { 1388 {
1390 int ret = 0; 1389 int ret = 0;
1391 1390
1392 if (__pmd_trans_huge_lock(pmd, vma) == 1) { 1391 if (__pmd_trans_huge_lock(pmd, vma) == 1) {
1393 /* 1392 /*
1394 * All logical pages in the range are present 1393 * All logical pages in the range are present
1395 * if backed by a huge page. 1394 * if backed by a huge page.
1396 */ 1395 */
1397 spin_unlock(&vma->vm_mm->page_table_lock); 1396 spin_unlock(&vma->vm_mm->page_table_lock);
1398 memset(vec, 1, (end - addr) >> PAGE_SHIFT); 1397 memset(vec, 1, (end - addr) >> PAGE_SHIFT);
1399 ret = 1; 1398 ret = 1;
1400 } 1399 }
1401 1400
1402 return ret; 1401 return ret;
1403 } 1402 }
1404 1403
1405 int move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma, 1404 int move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma,
1406 unsigned long old_addr, 1405 unsigned long old_addr,
1407 unsigned long new_addr, unsigned long old_end, 1406 unsigned long new_addr, unsigned long old_end,
1408 pmd_t *old_pmd, pmd_t *new_pmd) 1407 pmd_t *old_pmd, pmd_t *new_pmd)
1409 { 1408 {
1410 int ret = 0; 1409 int ret = 0;
1411 pmd_t pmd; 1410 pmd_t pmd;
1412 1411
1413 struct mm_struct *mm = vma->vm_mm; 1412 struct mm_struct *mm = vma->vm_mm;
1414 1413
1415 if ((old_addr & ~HPAGE_PMD_MASK) || 1414 if ((old_addr & ~HPAGE_PMD_MASK) ||
1416 (new_addr & ~HPAGE_PMD_MASK) || 1415 (new_addr & ~HPAGE_PMD_MASK) ||
1417 old_end - old_addr < HPAGE_PMD_SIZE || 1416 old_end - old_addr < HPAGE_PMD_SIZE ||
1418 (new_vma->vm_flags & VM_NOHUGEPAGE)) 1417 (new_vma->vm_flags & VM_NOHUGEPAGE))
1419 goto out; 1418 goto out;
1420 1419
1421 /* 1420 /*
1422 * The destination pmd shouldn't be established; free_pgtables() 1421 * The destination pmd shouldn't be established; free_pgtables()
1423 * should have released it. 1422 * should have released it.
1424 */ 1423 */
1425 if (WARN_ON(!pmd_none(*new_pmd))) { 1424 if (WARN_ON(!pmd_none(*new_pmd))) {
1426 VM_BUG_ON(pmd_trans_huge(*new_pmd)); 1425 VM_BUG_ON(pmd_trans_huge(*new_pmd));
1427 goto out; 1426 goto out;
1428 } 1427 }
1429 1428
1430 ret = __pmd_trans_huge_lock(old_pmd, vma); 1429 ret = __pmd_trans_huge_lock(old_pmd, vma);
1431 if (ret == 1) { 1430 if (ret == 1) {
1432 pmd = pmdp_get_and_clear(mm, old_addr, old_pmd); 1431 pmd = pmdp_get_and_clear(mm, old_addr, old_pmd);
1433 VM_BUG_ON(!pmd_none(*new_pmd)); 1432 VM_BUG_ON(!pmd_none(*new_pmd));
1434 set_pmd_at(mm, new_addr, new_pmd, pmd); 1433 set_pmd_at(mm, new_addr, new_pmd, pmd);
1435 spin_unlock(&mm->page_table_lock); 1434 spin_unlock(&mm->page_table_lock);
1436 } 1435 }
1437 out: 1436 out:
1438 return ret; 1437 return ret;
1439 } 1438 }
1440 1439
1441 int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, 1440 int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
1442 unsigned long addr, pgprot_t newprot, int prot_numa) 1441 unsigned long addr, pgprot_t newprot, int prot_numa)
1443 { 1442 {
1444 struct mm_struct *mm = vma->vm_mm; 1443 struct mm_struct *mm = vma->vm_mm;
1445 int ret = 0; 1444 int ret = 0;
1446 1445
1447 if (__pmd_trans_huge_lock(pmd, vma) == 1) { 1446 if (__pmd_trans_huge_lock(pmd, vma) == 1) {
1448 pmd_t entry; 1447 pmd_t entry;
1449 entry = pmdp_get_and_clear(mm, addr, pmd); 1448 entry = pmdp_get_and_clear(mm, addr, pmd);
1450 if (!prot_numa) { 1449 if (!prot_numa) {
1451 entry = pmd_modify(entry, newprot); 1450 entry = pmd_modify(entry, newprot);
1452 BUG_ON(pmd_write(entry)); 1451 BUG_ON(pmd_write(entry));
1453 } else { 1452 } else {
1454 struct page *page = pmd_page(*pmd); 1453 struct page *page = pmd_page(*pmd);
1455 1454
1456 /* only check non-shared pages */ 1455 /* only check non-shared pages */
1457 if (page_mapcount(page) == 1 && 1456 if (page_mapcount(page) == 1 &&
1458 !pmd_numa(*pmd)) { 1457 !pmd_numa(*pmd)) {
1459 entry = pmd_mknuma(entry); 1458 entry = pmd_mknuma(entry);
1460 } 1459 }
1461 } 1460 }
1462 set_pmd_at(mm, addr, pmd, entry); 1461 set_pmd_at(mm, addr, pmd, entry);
1463 spin_unlock(&vma->vm_mm->page_table_lock); 1462 spin_unlock(&vma->vm_mm->page_table_lock);
1464 ret = 1; 1463 ret = 1;
1465 } 1464 }
1466 1465
1467 return ret; 1466 return ret;
1468 } 1467 }
1469 1468
1470 /* 1469 /*
1471 * Returns 1 if a given pmd maps a stable (not under splitting) thp. 1470 * Returns 1 if a given pmd maps a stable (not under splitting) thp.
1472 * Returns -1 if it maps a thp under splitting. Returns 0 otherwise. 1471 * Returns -1 if it maps a thp under splitting. Returns 0 otherwise.
1473 * 1472 *
1474 * Note that if it returns 1, this routine returns without unlocking page 1473 * Note that if it returns 1, this routine returns without unlocking page
1475 * table locks. So callers must unlock them. 1474 * table locks. So callers must unlock them.
1476 */ 1475 */
1477 int __pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma) 1476 int __pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma)
1478 { 1477 {
1479 spin_lock(&vma->vm_mm->page_table_lock); 1478 spin_lock(&vma->vm_mm->page_table_lock);
1480 if (likely(pmd_trans_huge(*pmd))) { 1479 if (likely(pmd_trans_huge(*pmd))) {
1481 if (unlikely(pmd_trans_splitting(*pmd))) { 1480 if (unlikely(pmd_trans_splitting(*pmd))) {
1482 spin_unlock(&vma->vm_mm->page_table_lock); 1481 spin_unlock(&vma->vm_mm->page_table_lock);
1483 wait_split_huge_page(vma->anon_vma, pmd); 1482 wait_split_huge_page(vma->anon_vma, pmd);
1484 return -1; 1483 return -1;
1485 } else { 1484 } else {
1486 /* Thp mapped by 'pmd' is stable, so we can 1485 /* Thp mapped by 'pmd' is stable, so we can
1487 * handle it as it is. */ 1486 * handle it as it is. */
1488 return 1; 1487 return 1;
1489 } 1488 }
1490 } 1489 }
1491 spin_unlock(&vma->vm_mm->page_table_lock); 1490 spin_unlock(&vma->vm_mm->page_table_lock);
1492 return 0; 1491 return 0;
1493 } 1492 }
1494 1493
1495 pmd_t *page_check_address_pmd(struct page *page, 1494 pmd_t *page_check_address_pmd(struct page *page,
1496 struct mm_struct *mm, 1495 struct mm_struct *mm,
1497 unsigned long address, 1496 unsigned long address,
1498 enum page_check_address_pmd_flag flag) 1497 enum page_check_address_pmd_flag flag)
1499 { 1498 {
1500 pmd_t *pmd, *ret = NULL; 1499 pmd_t *pmd, *ret = NULL;
1501 1500
1502 if (address & ~HPAGE_PMD_MASK) 1501 if (address & ~HPAGE_PMD_MASK)
1503 goto out; 1502 goto out;
1504 1503
1505 pmd = mm_find_pmd(mm, address); 1504 pmd = mm_find_pmd(mm, address);
1506 if (!pmd) 1505 if (!pmd)
1507 goto out; 1506 goto out;
1508 if (pmd_none(*pmd)) 1507 if (pmd_none(*pmd))
1509 goto out; 1508 goto out;
1510 if (pmd_page(*pmd) != page) 1509 if (pmd_page(*pmd) != page)
1511 goto out; 1510 goto out;
1512 /* 1511 /*
1513 * split_vma() may create temporary aliased mappings. There is 1512 * split_vma() may create temporary aliased mappings. There is
1514 * no risk as long as all huge pmds are found and have their 1513 * no risk as long as all huge pmds are found and have their
1515 * splitting bit set before __split_huge_page_refcount 1514 * splitting bit set before __split_huge_page_refcount
1516 * runs. Finding the same huge pmd more than once during the 1515 * runs. Finding the same huge pmd more than once during the
1517 * same rmap walk is not a problem. 1516 * same rmap walk is not a problem.
1518 */ 1517 */
1519 if (flag == PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG && 1518 if (flag == PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG &&
1520 pmd_trans_splitting(*pmd)) 1519 pmd_trans_splitting(*pmd))
1521 goto out; 1520 goto out;
1522 if (pmd_trans_huge(*pmd)) { 1521 if (pmd_trans_huge(*pmd)) {
1523 VM_BUG_ON(flag == PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG && 1522 VM_BUG_ON(flag == PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG &&
1524 !pmd_trans_splitting(*pmd)); 1523 !pmd_trans_splitting(*pmd));
1525 ret = pmd; 1524 ret = pmd;
1526 } 1525 }
1527 out: 1526 out:
1528 return ret; 1527 return ret;
1529 } 1528 }
1530 1529
1531 static int __split_huge_page_splitting(struct page *page, 1530 static int __split_huge_page_splitting(struct page *page,
1532 struct vm_area_struct *vma, 1531 struct vm_area_struct *vma,
1533 unsigned long address) 1532 unsigned long address)
1534 { 1533 {
1535 struct mm_struct *mm = vma->vm_mm; 1534 struct mm_struct *mm = vma->vm_mm;
1536 pmd_t *pmd; 1535 pmd_t *pmd;
1537 int ret = 0; 1536 int ret = 0;
1538 /* For mmu_notifiers */ 1537 /* For mmu_notifiers */
1539 const unsigned long mmun_start = address; 1538 const unsigned long mmun_start = address;
1540 const unsigned long mmun_end = address + HPAGE_PMD_SIZE; 1539 const unsigned long mmun_end = address + HPAGE_PMD_SIZE;
1541 1540
1542 mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); 1541 mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
1543 spin_lock(&mm->page_table_lock); 1542 spin_lock(&mm->page_table_lock);
1544 pmd = page_check_address_pmd(page, mm, address, 1543 pmd = page_check_address_pmd(page, mm, address,
1545 PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG); 1544 PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG);
1546 if (pmd) { 1545 if (pmd) {
1547 /* 1546 /*
1548 * We can't temporarily set the pmd to null in order 1547 * We can't temporarily set the pmd to null in order
1549 * to split it; the pmd must remain marked huge at all 1548 * to split it; the pmd must remain marked huge at all
1550 * times or the VM won't take the pmd_trans_huge paths 1549 * times or the VM won't take the pmd_trans_huge paths
1551 * and it won't wait on the anon_vma->root->rwsem to 1550 * and it won't wait on the anon_vma->root->rwsem to
1552 * serialize against split_huge_page*. 1551 * serialize against split_huge_page*.
1553 */ 1552 */
1554 pmdp_splitting_flush(vma, address, pmd); 1553 pmdp_splitting_flush(vma, address, pmd);
1555 ret = 1; 1554 ret = 1;
1556 } 1555 }
1557 spin_unlock(&mm->page_table_lock); 1556 spin_unlock(&mm->page_table_lock);
1558 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); 1557 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
1559 1558
1560 return ret; 1559 return ret;
1561 } 1560 }
1562 1561
1563 static void __split_huge_page_refcount(struct page *page) 1562 static void __split_huge_page_refcount(struct page *page)
1564 { 1563 {
1565 int i; 1564 int i;
1566 struct zone *zone = page_zone(page); 1565 struct zone *zone = page_zone(page);
1567 struct lruvec *lruvec; 1566 struct lruvec *lruvec;
1568 int tail_count = 0; 1567 int tail_count = 0;
1569 1568
1570 /* prevent PageLRU from going away from under us, and freeze lru stats */ 1569 /* prevent PageLRU from going away from under us, and freeze lru stats */
1571 spin_lock_irq(&zone->lru_lock); 1570 spin_lock_irq(&zone->lru_lock);
1572 lruvec = mem_cgroup_page_lruvec(page, zone); 1571 lruvec = mem_cgroup_page_lruvec(page, zone);
1573 1572
1574 compound_lock(page); 1573 compound_lock(page);
1575 /* complete memcg work before adding pages to LRU */ 1574 /* complete memcg work before adding pages to LRU */
1576 mem_cgroup_split_huge_fixup(page); 1575 mem_cgroup_split_huge_fixup(page);
1577 1576
1578 for (i = HPAGE_PMD_NR - 1; i >= 1; i--) { 1577 for (i = HPAGE_PMD_NR - 1; i >= 1; i--) {
1579 struct page *page_tail = page + i; 1578 struct page *page_tail = page + i;
1580 1579
1581 /* tail_page->_mapcount cannot change */ 1580 /* tail_page->_mapcount cannot change */
1582 BUG_ON(page_mapcount(page_tail) < 0); 1581 BUG_ON(page_mapcount(page_tail) < 0);
1583 tail_count += page_mapcount(page_tail); 1582 tail_count += page_mapcount(page_tail);
1584 /* check for overflow */ 1583 /* check for overflow */
1585 BUG_ON(tail_count < 0); 1584 BUG_ON(tail_count < 0);
1586 BUG_ON(atomic_read(&page_tail->_count) != 0); 1585 BUG_ON(atomic_read(&page_tail->_count) != 0);
1587 /* 1586 /*
1588 * tail_page->_count is zero and not changing from 1587 * tail_page->_count is zero and not changing from
1589 * under us. But get_page_unless_zero() may be running 1588 * under us. But get_page_unless_zero() may be running
1590 * from under us on the tail_page. If we used 1589 * from under us on the tail_page. If we used
1591 * atomic_set() below instead of atomic_add(), we 1590 * atomic_set() below instead of atomic_add(), we
1592 * would then run atomic_set() concurrently with 1591 * would then run atomic_set() concurrently with
1593 * get_page_unless_zero(), and atomic_set() is 1592 * get_page_unless_zero(), and atomic_set() is
1594 * implemented in C not using locked ops. spin_unlock 1593 * implemented in C not using locked ops. spin_unlock
1595 * on x86 sometimes uses locked ops because of PPro 1594 * on x86 sometimes uses locked ops because of PPro
1596 * errata 66, 92, so unless somebody can guarantee 1595 * errata 66, 92, so unless somebody can guarantee
1597 * atomic_set() here would be safe on all archs (and 1596 * atomic_set() here would be safe on all archs (and
1598 * not only on x86), it's safer to use atomic_add(). 1597 * not only on x86), it's safer to use atomic_add().
1599 */ 1598 */
1600 atomic_add(page_mapcount(page) + page_mapcount(page_tail) + 1, 1599 atomic_add(page_mapcount(page) + page_mapcount(page_tail) + 1,
1601 &page_tail->_count); 1600 &page_tail->_count);
1602 1601
1603 /* after clearing PageTail the gup refcount can be released */ 1602 /* after clearing PageTail the gup refcount can be released */
1604 smp_mb(); 1603 smp_mb();
1605 1604
1606 /* 1605 /*
1607 * retain hwpoison flag of the poisoned tail page: 1606 * retain hwpoison flag of the poisoned tail page:
1608 * fix for an unsuitable process being killed on the guest machine (KVM) 1607 * fix for an unsuitable process being killed on the guest machine (KVM)
1609 * by memory-failure handling. 1608 * by memory-failure handling.
1610 */ 1609 */
1611 page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP | __PG_HWPOISON; 1610 page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP | __PG_HWPOISON;
1612 page_tail->flags |= (page->flags & 1611 page_tail->flags |= (page->flags &
1613 ((1L << PG_referenced) | 1612 ((1L << PG_referenced) |
1614 (1L << PG_swapbacked) | 1613 (1L << PG_swapbacked) |
1615 (1L << PG_mlocked) | 1614 (1L << PG_mlocked) |
1616 (1L << PG_uptodate))); 1615 (1L << PG_uptodate)));
1617 page_tail->flags |= (1L << PG_dirty); 1616 page_tail->flags |= (1L << PG_dirty);
1618 1617
1619 /* clear PageTail before overwriting first_page */ 1618 /* clear PageTail before overwriting first_page */
1620 smp_wmb(); 1619 smp_wmb();
1621 1620
1622 /* 1621 /*
1623 * __split_huge_page_splitting() already set the 1622 * __split_huge_page_splitting() already set the
1624 * splitting bit in all pmds that could map this 1623 * splitting bit in all pmds that could map this
1625 * hugepage, which will ensure no CPU can alter the 1624 * hugepage, which will ensure no CPU can alter the
1626 * mapcount on the head page. The mapcount is only 1625 * mapcount on the head page. The mapcount is only
1627 * accounted in the head page and it has to be 1626 * accounted in the head page and it has to be
1628 * transferred to all tail pages in the below code. So 1627 * transferred to all tail pages in the below code. So
1629 * for this code to be safe, during the split the mapcount 1628 * for this code to be safe, during the split the mapcount
1630 * can't change. But that doesn't mean userland can't 1629 * can't change. But that doesn't mean userland can't
1631 * keep changing and reading the page contents while 1630 * keep changing and reading the page contents while
1632 * we transfer the mapcount, so the pmd splitting 1631 * we transfer the mapcount, so the pmd splitting
1633 * status is achieved by setting a reserved bit in the 1632 * status is achieved by setting a reserved bit in the
1634 * pmd, not by clearing the present bit. 1633 * pmd, not by clearing the present bit.
1635 */ 1634 */
1636 page_tail->_mapcount = page->_mapcount; 1635 page_tail->_mapcount = page->_mapcount;
1637 1636
1638 BUG_ON(page_tail->mapping); 1637 BUG_ON(page_tail->mapping);
1639 page_tail->mapping = page->mapping; 1638 page_tail->mapping = page->mapping;
1640 1639
1641 page_tail->index = page->index + i; 1640 page_tail->index = page->index + i;
1642 page_nid_xchg_last(page_tail, page_nid_last(page)); 1641 page_nid_xchg_last(page_tail, page_nid_last(page));
1643 1642
1644 BUG_ON(!PageAnon(page_tail)); 1643 BUG_ON(!PageAnon(page_tail));
1645 BUG_ON(!PageUptodate(page_tail)); 1644 BUG_ON(!PageUptodate(page_tail));
1646 BUG_ON(!PageDirty(page_tail)); 1645 BUG_ON(!PageDirty(page_tail));
1647 BUG_ON(!PageSwapBacked(page_tail)); 1646 BUG_ON(!PageSwapBacked(page_tail));
1648 1647
1649 lru_add_page_tail(page, page_tail, lruvec); 1648 lru_add_page_tail(page, page_tail, lruvec);
1650 } 1649 }
1651 atomic_sub(tail_count, &page->_count); 1650 atomic_sub(tail_count, &page->_count);
1652 BUG_ON(atomic_read(&page->_count) <= 0); 1651 BUG_ON(atomic_read(&page->_count) <= 0);
1653 1652
1654 __mod_zone_page_state(zone, NR_ANON_TRANSPARENT_HUGEPAGES, -1); 1653 __mod_zone_page_state(zone, NR_ANON_TRANSPARENT_HUGEPAGES, -1);
1655 __mod_zone_page_state(zone, NR_ANON_PAGES, HPAGE_PMD_NR); 1654 __mod_zone_page_state(zone, NR_ANON_PAGES, HPAGE_PMD_NR);
1656 1655
1657 ClearPageCompound(page); 1656 ClearPageCompound(page);
1658 compound_unlock(page); 1657 compound_unlock(page);
1659 spin_unlock_irq(&zone->lru_lock); 1658 spin_unlock_irq(&zone->lru_lock);
1660 1659
1661 for (i = 1; i < HPAGE_PMD_NR; i++) { 1660 for (i = 1; i < HPAGE_PMD_NR; i++) {
1662 struct page *page_tail = page + i; 1661 struct page *page_tail = page + i;
1663 BUG_ON(page_count(page_tail) <= 0); 1662 BUG_ON(page_count(page_tail) <= 0);
1664 /* 1663 /*
1665 * Tail pages may be freed if there wasn't any mapping 1664 * Tail pages may be freed if there wasn't any mapping
1666 * left, for example if add_to_swap() is running on an lru page that 1665 * left, for example if add_to_swap() is running on an lru page that
1667 * had its mapping zapped. And freeing these pages 1666 * had its mapping zapped. And freeing these pages
1668 * requires taking the lru_lock so we do the put_page 1667 * requires taking the lru_lock so we do the put_page
1669 * of the tail pages after the split is complete. 1668 * of the tail pages after the split is complete.
1670 */ 1669 */
1671 put_page(page_tail); 1670 put_page(page_tail);
1672 } 1671 }
1673 1672
1674 /* 1673 /*
1675 * Only the head page (which has now become a regular page) is required 1674 * Only the head page (which has now become a regular page) is required
1676 * to be pinned by the caller. 1675 * to be pinned by the caller.
1677 */ 1676 */
1678 BUG_ON(page_count(page) <= 0); 1677 BUG_ON(page_count(page) <= 0);
1679 } 1678 }
1680 1679
1681 static int __split_huge_page_map(struct page *page, 1680 static int __split_huge_page_map(struct page *page,
1682 struct vm_area_struct *vma, 1681 struct vm_area_struct *vma,
1683 unsigned long address) 1682 unsigned long address)
1684 { 1683 {
1685 struct mm_struct *mm = vma->vm_mm; 1684 struct mm_struct *mm = vma->vm_mm;
1686 pmd_t *pmd, _pmd; 1685 pmd_t *pmd, _pmd;
1687 int ret = 0, i; 1686 int ret = 0, i;
1688 pgtable_t pgtable; 1687 pgtable_t pgtable;
1689 unsigned long haddr; 1688 unsigned long haddr;
1690 1689
1691 spin_lock(&mm->page_table_lock); 1690 spin_lock(&mm->page_table_lock);
1692 pmd = page_check_address_pmd(page, mm, address, 1691 pmd = page_check_address_pmd(page, mm, address,
1693 PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG); 1692 PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG);
1694 if (pmd) { 1693 if (pmd) {
1695 pgtable = pgtable_trans_huge_withdraw(mm); 1694 pgtable = pgtable_trans_huge_withdraw(mm);
1696 pmd_populate(mm, &_pmd, pgtable); 1695 pmd_populate(mm, &_pmd, pgtable);
1697 1696
1698 haddr = address; 1697 haddr = address;
1699 for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) { 1698 for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
1700 pte_t *pte, entry; 1699 pte_t *pte, entry;
1701 BUG_ON(PageCompound(page+i)); 1700 BUG_ON(PageCompound(page+i));
1702 entry = mk_pte(page + i, vma->vm_page_prot); 1701 entry = mk_pte(page + i, vma->vm_page_prot);
1703 entry = maybe_mkwrite(pte_mkdirty(entry), vma); 1702 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
1704 if (!pmd_write(*pmd)) 1703 if (!pmd_write(*pmd))
1705 entry = pte_wrprotect(entry); 1704 entry = pte_wrprotect(entry);
1706 else 1705 else
1707 BUG_ON(page_mapcount(page) != 1); 1706 BUG_ON(page_mapcount(page) != 1);
1708 if (!pmd_young(*pmd)) 1707 if (!pmd_young(*pmd))
1709 entry = pte_mkold(entry); 1708 entry = pte_mkold(entry);
1710 if (pmd_numa(*pmd)) 1709 if (pmd_numa(*pmd))
1711 entry = pte_mknuma(entry); 1710 entry = pte_mknuma(entry);
1712 pte = pte_offset_map(&_pmd, haddr); 1711 pte = pte_offset_map(&_pmd, haddr);
1713 BUG_ON(!pte_none(*pte)); 1712 BUG_ON(!pte_none(*pte));
1714 set_pte_at(mm, haddr, pte, entry); 1713 set_pte_at(mm, haddr, pte, entry);
1715 pte_unmap(pte); 1714 pte_unmap(pte);
1716 } 1715 }
1717 1716
1718 smp_wmb(); /* make pte visible before pmd */ 1717 smp_wmb(); /* make pte visible before pmd */
1719 /* 1718 /*
1720 * Up to this point the pmd is present and huge and 1719 * Up to this point the pmd is present and huge and
1721 * userland has the whole access to the hugepage 1720 * userland has the whole access to the hugepage
1722 * during the split (which happens in place). If we 1721 * during the split (which happens in place). If we
1723 * overwrite the pmd with the not-huge version 1722 * overwrite the pmd with the not-huge version
1724 * pointing to the pte here (which of course we could 1723 * pointing to the pte here (which of course we could
1725 * if all CPUs were bug free), userland could trigger 1724 * if all CPUs were bug free), userland could trigger
1726 * a small page size TLB miss on the small sized TLB 1725 * a small page size TLB miss on the small sized TLB
1727 * while the hugepage TLB entry is still established 1726 * while the hugepage TLB entry is still established
1727 * in the huge TLB. Some CPUs don't like that. See 1726 * in the huge TLB. Some CPUs don't like that. See
1729 * http://support.amd.com/us/Processor_TechDocs/41322.pdf, 1728 * http://support.amd.com/us/Processor_TechDocs/41322.pdf,
1729 * Erratum 383 on page 93. Intel should be safe but 1728 * Erratum 383 on page 93. Intel should be safe but
1731 * also warns that it's only safe if the permission 1730 * also warns that it's only safe if the permission
1732 * and cache attributes of the two entries loaded in 1731 * and cache attributes of the two entries loaded in
1732 * the two TLBs are identical (which should be the case 1731 * the two TLBs are identical (which should be the case
1734 * here). But it is generally safer to never allow 1733 * here). But it is generally safer to never allow
1735 * small and huge TLB entries for the same virtual 1734 * small and huge TLB entries for the same virtual
1736 * address to be loaded simultaneously. So instead of 1735 * address to be loaded simultaneously. So instead of
1737 * doing "pmd_populate(); flush_tlb_range();" we first 1736 * doing "pmd_populate(); flush_tlb_range();" we first
1738 * mark the current pmd notpresent (atomically because 1737 * mark the current pmd notpresent (atomically because
1739 * here the pmd_trans_huge and pmd_trans_splitting 1738 * here the pmd_trans_huge and pmd_trans_splitting
1740 * must remain set at all times on the pmd until the 1739 * must remain set at all times on the pmd until the
1741 * split is complete for this pmd), then we flush the 1740 * split is complete for this pmd), then we flush the
1742 * SMP TLB and finally we write the non-huge version 1741 * SMP TLB and finally we write the non-huge version
1743 * of the pmd entry with pmd_populate. 1742 * of the pmd entry with pmd_populate.
1744 */ 1743 */
1745 pmdp_invalidate(vma, address, pmd); 1744 pmdp_invalidate(vma, address, pmd);
1746 pmd_populate(mm, pmd, pgtable); 1745 pmd_populate(mm, pmd, pgtable);
1747 ret = 1; 1746 ret = 1;
1748 } 1747 }
1749 spin_unlock(&mm->page_table_lock); 1748 spin_unlock(&mm->page_table_lock);
1750 1749
1751 return ret; 1750 return ret;
1752 } 1751 }
1753 1752
1754 /* must be called with anon_vma->root->rwsem held */ 1753 /* must be called with anon_vma->root->rwsem held */
1755 static void __split_huge_page(struct page *page, 1754 static void __split_huge_page(struct page *page,
1756 struct anon_vma *anon_vma) 1755 struct anon_vma *anon_vma)
1757 { 1756 {
1758 int mapcount, mapcount2; 1757 int mapcount, mapcount2;
1759 pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); 1758 pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
1760 struct anon_vma_chain *avc; 1759 struct anon_vma_chain *avc;
1761 1760
1762 BUG_ON(!PageHead(page)); 1761 BUG_ON(!PageHead(page));
1763 BUG_ON(PageTail(page)); 1762 BUG_ON(PageTail(page));
1764 1763
1765 mapcount = 0; 1764 mapcount = 0;
1766 anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) { 1765 anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) {
1767 struct vm_area_struct *vma = avc->vma; 1766 struct vm_area_struct *vma = avc->vma;
1768 unsigned long addr = vma_address(page, vma); 1767 unsigned long addr = vma_address(page, vma);
1769 BUG_ON(is_vma_temporary_stack(vma)); 1768 BUG_ON(is_vma_temporary_stack(vma));
1770 mapcount += __split_huge_page_splitting(page, vma, addr); 1769 mapcount += __split_huge_page_splitting(page, vma, addr);
1771 } 1770 }
1772 /* 1771 /*
1773 * It is critical that new vmas are added to the tail of the 1772 * It is critical that new vmas are added to the tail of the
1774 * anon_vma list. This guarantees that if copy_huge_pmd() runs 1773 * anon_vma list. This guarantees that if copy_huge_pmd() runs
1775 * and establishes a child pmd before 1774 * and establishes a child pmd before
1776 * __split_huge_page_splitting() freezes the parent pmd (so if 1775 * __split_huge_page_splitting() freezes the parent pmd (so if
1777 * we fail to prevent copy_huge_pmd() from running until the 1776 * we fail to prevent copy_huge_pmd() from running until the
1778 * whole __split_huge_page() is complete), we will still see 1777 * whole __split_huge_page() is complete), we will still see
1779 * the newly established pmd of the child later during the 1778 * the newly established pmd of the child later during the
1780 * walk, to be able to set it as pmd_trans_splitting too. 1779 * walk, to be able to set it as pmd_trans_splitting too.
1781 */ 1780 */
1782 if (mapcount != page_mapcount(page)) 1781 if (mapcount != page_mapcount(page))
1783 printk(KERN_ERR "mapcount %d page_mapcount %d\n", 1782 printk(KERN_ERR "mapcount %d page_mapcount %d\n",
1784 mapcount, page_mapcount(page)); 1783 mapcount, page_mapcount(page));
1785 BUG_ON(mapcount != page_mapcount(page)); 1784 BUG_ON(mapcount != page_mapcount(page));
1786 1785
1787 __split_huge_page_refcount(page); 1786 __split_huge_page_refcount(page);
1788 1787
1789 mapcount2 = 0; 1788 mapcount2 = 0;
1790 anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) { 1789 anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) {
1791 struct vm_area_struct *vma = avc->vma; 1790 struct vm_area_struct *vma = avc->vma;
1792 unsigned long addr = vma_address(page, vma); 1791 unsigned long addr = vma_address(page, vma);
1793 BUG_ON(is_vma_temporary_stack(vma)); 1792 BUG_ON(is_vma_temporary_stack(vma));
1794 mapcount2 += __split_huge_page_map(page, vma, addr); 1793 mapcount2 += __split_huge_page_map(page, vma, addr);
1795 } 1794 }
1796 if (mapcount != mapcount2) 1795 if (mapcount != mapcount2)
1797 printk(KERN_ERR "mapcount %d mapcount2 %d page_mapcount %d\n", 1796 printk(KERN_ERR "mapcount %d mapcount2 %d page_mapcount %d\n",
1798 mapcount, mapcount2, page_mapcount(page)); 1797 mapcount, mapcount2, page_mapcount(page));
1799 BUG_ON(mapcount != mapcount2); 1798 BUG_ON(mapcount != mapcount2);
1800 } 1799 }
1801 1800
1802 int split_huge_page(struct page *page) 1801 int split_huge_page(struct page *page)
1803 { 1802 {
1804 struct anon_vma *anon_vma; 1803 struct anon_vma *anon_vma;
1805 int ret = 1; 1804 int ret = 1;
1806 1805
1807 BUG_ON(is_huge_zero_pfn(page_to_pfn(page))); 1806 BUG_ON(is_huge_zero_pfn(page_to_pfn(page)));
1808 BUG_ON(!PageAnon(page)); 1807 BUG_ON(!PageAnon(page));
1809 1808
1810 /* 1809 /*
1811 * The caller does not necessarily hold an mmap_sem that would prevent 1810 * The caller does not necessarily hold an mmap_sem that would prevent
1812 * the anon_vma from disappearing, so we first take a reference to it 1811 * the anon_vma from disappearing, so we first take a reference to it
1813 * and then lock the anon_vma for write. This is similar to 1812 * and then lock the anon_vma for write. This is similar to
1814 * page_lock_anon_vma_read except the write lock is taken to serialise 1813 * page_lock_anon_vma_read except the write lock is taken to serialise
1815 * against parallel split or collapse operations. 1814 * against parallel split or collapse operations.
1816 */ 1815 */
1817 anon_vma = page_get_anon_vma(page); 1816 anon_vma = page_get_anon_vma(page);
1818 if (!anon_vma) 1817 if (!anon_vma)
1819 goto out; 1818 goto out;
1820 anon_vma_lock_write(anon_vma); 1819 anon_vma_lock_write(anon_vma);
1821 1820
1822 ret = 0; 1821 ret = 0;
1823 if (!PageCompound(page)) 1822 if (!PageCompound(page))
1824 goto out_unlock; 1823 goto out_unlock;
1825 1824
1826 BUG_ON(!PageSwapBacked(page)); 1825 BUG_ON(!PageSwapBacked(page));
1827 __split_huge_page(page, anon_vma); 1826 __split_huge_page(page, anon_vma);
1828 count_vm_event(THP_SPLIT); 1827 count_vm_event(THP_SPLIT);
1829 1828
1830 BUG_ON(PageCompound(page)); 1829 BUG_ON(PageCompound(page));
1831 out_unlock: 1830 out_unlock:
1832 anon_vma_unlock_write(anon_vma); 1831 anon_vma_unlock_write(anon_vma);
1833 put_anon_vma(anon_vma); 1832 put_anon_vma(anon_vma);
1834 out: 1833 out:
1835 return ret; 1834 return ret;
1836 } 1835 }
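
split_huge_page() is normally reached indirectly, when an operation can no longer be described by a single huge pmd; each successful split bumps the THP_SPLIT counter seen above. One common way to provoke it from userspace is changing protection on a sub-range of a THP-backed mapping. A hedged sketch (whether a huge page is actually faulted in, and therefore split, depends on alignment and on the system's THP configuration):

#define _GNU_SOURCE
#include <sys/mman.h>

int main(void)
{
	size_t huge = 2UL << 20;		/* one PMD's worth on x86_64 */
	char *p = mmap(NULL, huge, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (p == MAP_FAILED)
		return 1;

	madvise(p, huge, MADV_HUGEPAGE);	/* ask for THP on this vma */
	p[0] = 1;				/* may fault in a huge page */

	/* A 4 KiB protection change cannot be expressed with a huge pmd,
	 * so the kernel has to split the huge page first. */
	mprotect(p, 4096, PROT_READ);

	munmap(p, huge);
	return 0;
}
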
1837 1836
1838 #define VM_NO_THP (VM_SPECIAL|VM_MIXEDMAP|VM_HUGETLB|VM_SHARED|VM_MAYSHARE) 1837 #define VM_NO_THP (VM_SPECIAL|VM_MIXEDMAP|VM_HUGETLB|VM_SHARED|VM_MAYSHARE)
1839 1838
1840 int hugepage_madvise(struct vm_area_struct *vma, 1839 int hugepage_madvise(struct vm_area_struct *vma,
1841 unsigned long *vm_flags, int advice) 1840 unsigned long *vm_flags, int advice)
1842 { 1841 {
1843 struct mm_struct *mm = vma->vm_mm; 1842 struct mm_struct *mm = vma->vm_mm;
1844 1843
1845 switch (advice) { 1844 switch (advice) {
1846 case MADV_HUGEPAGE: 1845 case MADV_HUGEPAGE:
1847 /* 1846 /*
1848 * Be somewhat over-protective like KSM for now! 1847 * Be somewhat over-protective like KSM for now!
1849 */ 1848 */
1850 if (*vm_flags & (VM_HUGEPAGE | VM_NO_THP)) 1849 if (*vm_flags & (VM_HUGEPAGE | VM_NO_THP))
1851 return -EINVAL; 1850 return -EINVAL;
1852 if (mm->def_flags & VM_NOHUGEPAGE) 1851 if (mm->def_flags & VM_NOHUGEPAGE)
1853 return -EINVAL; 1852 return -EINVAL;
1854 *vm_flags &= ~VM_NOHUGEPAGE; 1853 *vm_flags &= ~VM_NOHUGEPAGE;
1855 *vm_flags |= VM_HUGEPAGE; 1854 *vm_flags |= VM_HUGEPAGE;
1856 /* 1855 /*
1857 * If the vma becomes good for khugepaged to scan, 1856 * If the vma becomes good for khugepaged to scan,
1858 * register it here without waiting for a page fault that 1857 * register it here without waiting for a page fault that
1859 * may not happen any time soon. 1858 * may not happen any time soon.
1860 */ 1859 */
1861 if (unlikely(khugepaged_enter_vma_merge(vma))) 1860 if (unlikely(khugepaged_enter_vma_merge(vma)))
1862 return -ENOMEM; 1861 return -ENOMEM;
1863 break; 1862 break;
1864 case MADV_NOHUGEPAGE: 1863 case MADV_NOHUGEPAGE:
1865 /* 1864 /*
1866 * Be somewhat over-protective like KSM for now! 1865 * Be somewhat over-protective like KSM for now!
1867 */ 1866 */
1868 if (*vm_flags & (VM_NOHUGEPAGE | VM_NO_THP)) 1867 if (*vm_flags & (VM_NOHUGEPAGE | VM_NO_THP))
1869 return -EINVAL; 1868 return -EINVAL;
1870 *vm_flags &= ~VM_HUGEPAGE; 1869 *vm_flags &= ~VM_HUGEPAGE;
1871 *vm_flags |= VM_NOHUGEPAGE; 1870 *vm_flags |= VM_NOHUGEPAGE;
1872 /* 1871 /*
1873 * Setting VM_NOHUGEPAGE will prevent khugepaged from scanning 1872 * Setting VM_NOHUGEPAGE will prevent khugepaged from scanning
1874 * this vma even if we leave the mm registered in khugepaged if 1873 * this vma even if we leave the mm registered in khugepaged if
1875 * it got registered before VM_NOHUGEPAGE was set. 1874 * it got registered before VM_NOHUGEPAGE was set.
1876 */ 1875 */
1877 break; 1876 break;
1878 } 1877 }
1879 1878
1880 return 0; 1879 return 0;
1881 } 1880 }
1882 1881
1883 static int __init khugepaged_slab_init(void) 1882 static int __init khugepaged_slab_init(void)
1884 { 1883 {
1885 mm_slot_cache = kmem_cache_create("khugepaged_mm_slot", 1884 mm_slot_cache = kmem_cache_create("khugepaged_mm_slot",
1886 sizeof(struct mm_slot), 1885 sizeof(struct mm_slot),
1887 __alignof__(struct mm_slot), 0, NULL); 1886 __alignof__(struct mm_slot), 0, NULL);
1888 if (!mm_slot_cache) 1887 if (!mm_slot_cache)
1889 return -ENOMEM; 1888 return -ENOMEM;
1890 1889
1891 return 0; 1890 return 0;
1892 } 1891 }
1893 1892
1894 static inline struct mm_slot *alloc_mm_slot(void) 1893 static inline struct mm_slot *alloc_mm_slot(void)
1895 { 1894 {
1896 if (!mm_slot_cache) /* initialization failed */ 1895 if (!mm_slot_cache) /* initialization failed */
1897 return NULL; 1896 return NULL;
1898 return kmem_cache_zalloc(mm_slot_cache, GFP_KERNEL); 1897 return kmem_cache_zalloc(mm_slot_cache, GFP_KERNEL);
1899 } 1898 }
1900 1899
1901 static inline void free_mm_slot(struct mm_slot *mm_slot) 1900 static inline void free_mm_slot(struct mm_slot *mm_slot)
1902 { 1901 {
1903 kmem_cache_free(mm_slot_cache, mm_slot); 1902 kmem_cache_free(mm_slot_cache, mm_slot);
1904 } 1903 }
1905 1904
1906 static struct mm_slot *get_mm_slot(struct mm_struct *mm) 1905 static struct mm_slot *get_mm_slot(struct mm_struct *mm)
1907 { 1906 {
1908 struct mm_slot *mm_slot; 1907 struct mm_slot *mm_slot;
1909 1908
1910 hash_for_each_possible(mm_slots_hash, mm_slot, hash, (unsigned long)mm) 1909 hash_for_each_possible(mm_slots_hash, mm_slot, hash, (unsigned long)mm)
1911 if (mm == mm_slot->mm) 1910 if (mm == mm_slot->mm)
1912 return mm_slot; 1911 return mm_slot;
1913 1912
1914 return NULL; 1913 return NULL;
1915 } 1914 }
1916 1915
1917 static void insert_to_mm_slots_hash(struct mm_struct *mm, 1916 static void insert_to_mm_slots_hash(struct mm_struct *mm,
1918 struct mm_slot *mm_slot) 1917 struct mm_slot *mm_slot)
1919 { 1918 {
1920 mm_slot->mm = mm; 1919 mm_slot->mm = mm;
1921 hash_add(mm_slots_hash, &mm_slot->hash, (long)mm); 1920 hash_add(mm_slots_hash, &mm_slot->hash, (long)mm);
1922 } 1921 }
1923 1922
1924 static inline int khugepaged_test_exit(struct mm_struct *mm) 1923 static inline int khugepaged_test_exit(struct mm_struct *mm)
1925 { 1924 {
1926 return atomic_read(&mm->mm_users) == 0; 1925 return atomic_read(&mm->mm_users) == 0;
1927 } 1926 }
1928 1927
1929 int __khugepaged_enter(struct mm_struct *mm) 1928 int __khugepaged_enter(struct mm_struct *mm)
1930 { 1929 {
1931 struct mm_slot *mm_slot; 1930 struct mm_slot *mm_slot;
1932 int wakeup; 1931 int wakeup;
1933 1932
1934 mm_slot = alloc_mm_slot(); 1933 mm_slot = alloc_mm_slot();
1935 if (!mm_slot) 1934 if (!mm_slot)
1936 return -ENOMEM; 1935 return -ENOMEM;
1937 1936
1938 /* __khugepaged_exit() must not run from under us */ 1937 /* __khugepaged_exit() must not run from under us */
1939 VM_BUG_ON(khugepaged_test_exit(mm)); 1938 VM_BUG_ON(khugepaged_test_exit(mm));
1940 if (unlikely(test_and_set_bit(MMF_VM_HUGEPAGE, &mm->flags))) { 1939 if (unlikely(test_and_set_bit(MMF_VM_HUGEPAGE, &mm->flags))) {
1941 free_mm_slot(mm_slot); 1940 free_mm_slot(mm_slot);
1942 return 0; 1941 return 0;
1943 } 1942 }
1944 1943
1945 spin_lock(&khugepaged_mm_lock); 1944 spin_lock(&khugepaged_mm_lock);
1946 insert_to_mm_slots_hash(mm, mm_slot); 1945 insert_to_mm_slots_hash(mm, mm_slot);
1947 /* 1946 /*
1948 * Insert just behind the scanning cursor, to let the area settle 1947 * Insert just behind the scanning cursor, to let the area settle
1949 * down a little. 1948 * down a little.
1950 */ 1949 */
1951 wakeup = list_empty(&khugepaged_scan.mm_head); 1950 wakeup = list_empty(&khugepaged_scan.mm_head);
1952 list_add_tail(&mm_slot->mm_node, &khugepaged_scan.mm_head); 1951 list_add_tail(&mm_slot->mm_node, &khugepaged_scan.mm_head);
1953 spin_unlock(&khugepaged_mm_lock); 1952 spin_unlock(&khugepaged_mm_lock);
1954 1953
1955 atomic_inc(&mm->mm_count); 1954 atomic_inc(&mm->mm_count);
1956 if (wakeup) 1955 if (wakeup)
1957 wake_up_interruptible(&khugepaged_wait); 1956 wake_up_interruptible(&khugepaged_wait);
1958 1957
1959 return 0; 1958 return 0;
1960 } 1959 }
1961 1960
1962 int khugepaged_enter_vma_merge(struct vm_area_struct *vma) 1961 int khugepaged_enter_vma_merge(struct vm_area_struct *vma)
1963 { 1962 {
1964 unsigned long hstart, hend; 1963 unsigned long hstart, hend;
1965 if (!vma->anon_vma) 1964 if (!vma->anon_vma)
1966 /* 1965 /*
1967 * Not yet faulted in so we will register later in the 1966 * Not yet faulted in so we will register later in the
1968 * page fault if needed. 1967 * page fault if needed.
1969 */ 1968 */
1970 return 0; 1969 return 0;
1971 if (vma->vm_ops) 1970 if (vma->vm_ops)
1972 /* khugepaged not yet working on file or special mappings */ 1971 /* khugepaged not yet working on file or special mappings */
1973 return 0; 1972 return 0;
1974 VM_BUG_ON(vma->vm_flags & VM_NO_THP); 1973 VM_BUG_ON(vma->vm_flags & VM_NO_THP);
1975 hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; 1974 hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
1976 hend = vma->vm_end & HPAGE_PMD_MASK; 1975 hend = vma->vm_end & HPAGE_PMD_MASK;
1977 if (hstart < hend) 1976 if (hstart < hend)
1978 return khugepaged_enter(vma); 1977 return khugepaged_enter(vma);
1979 return 0; 1978 return 0;
1980 } 1979 }
1981 1980
1982 void __khugepaged_exit(struct mm_struct *mm) 1981 void __khugepaged_exit(struct mm_struct *mm)
1983 { 1982 {
1984 struct mm_slot *mm_slot; 1983 struct mm_slot *mm_slot;
1985 int free = 0; 1984 int free = 0;
1986 1985
1987 spin_lock(&khugepaged_mm_lock); 1986 spin_lock(&khugepaged_mm_lock);
1988 mm_slot = get_mm_slot(mm); 1987 mm_slot = get_mm_slot(mm);
1989 if (mm_slot && khugepaged_scan.mm_slot != mm_slot) { 1988 if (mm_slot && khugepaged_scan.mm_slot != mm_slot) {
1990 hash_del(&mm_slot->hash); 1989 hash_del(&mm_slot->hash);
1991 list_del(&mm_slot->mm_node); 1990 list_del(&mm_slot->mm_node);
1992 free = 1; 1991 free = 1;
1993 } 1992 }
1994 spin_unlock(&khugepaged_mm_lock); 1993 spin_unlock(&khugepaged_mm_lock);
1995 1994
1996 if (free) { 1995 if (free) {
1997 clear_bit(MMF_VM_HUGEPAGE, &mm->flags); 1996 clear_bit(MMF_VM_HUGEPAGE, &mm->flags);
1998 free_mm_slot(mm_slot); 1997 free_mm_slot(mm_slot);
1999 mmdrop(mm); 1998 mmdrop(mm);
2000 } else if (mm_slot) { 1999 } else if (mm_slot) {
2001 /* 2000 /*
2002 * This is required to serialize against 2001 * This is required to serialize against
2003 * khugepaged_test_exit() (which is guaranteed to run 2002 * khugepaged_test_exit() (which is guaranteed to run
2004 * under mmap_sem read mode). Stop here (after we 2003 * under mmap_sem read mode). Stop here (after we
2005 * return, all pagetables will be destroyed) until 2004 * return, all pagetables will be destroyed) until
2006 * khugepaged has finished working on the pagetables 2005 * khugepaged has finished working on the pagetables
2007 * under the mmap_sem. 2006 * under the mmap_sem.
2008 */ 2007 */
2009 down_write(&mm->mmap_sem); 2008 down_write(&mm->mmap_sem);
2010 up_write(&mm->mmap_sem); 2009 up_write(&mm->mmap_sem);
2011 } 2010 }
2012 } 2011 }
2013 2012
2014 static void release_pte_page(struct page *page) 2013 static void release_pte_page(struct page *page)
2015 { 2014 {
2016 /* 0 stands for page_is_file_cache(page) == false */ 2015 /* 0 stands for page_is_file_cache(page) == false */
2017 dec_zone_page_state(page, NR_ISOLATED_ANON + 0); 2016 dec_zone_page_state(page, NR_ISOLATED_ANON + 0);
2018 unlock_page(page); 2017 unlock_page(page);
2019 putback_lru_page(page); 2018 putback_lru_page(page);
2020 } 2019 }
2021 2020
2022 static void release_pte_pages(pte_t *pte, pte_t *_pte) 2021 static void release_pte_pages(pte_t *pte, pte_t *_pte)
2023 { 2022 {
2024 while (--_pte >= pte) { 2023 while (--_pte >= pte) {
2025 pte_t pteval = *_pte; 2024 pte_t pteval = *_pte;
2026 if (!pte_none(pteval)) 2025 if (!pte_none(pteval))
2027 release_pte_page(pte_page(pteval)); 2026 release_pte_page(pte_page(pteval));
2028 } 2027 }
2029 } 2028 }
2030 2029
2031 static int __collapse_huge_page_isolate(struct vm_area_struct *vma, 2030 static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
2032 unsigned long address, 2031 unsigned long address,
2033 pte_t *pte) 2032 pte_t *pte)
2034 { 2033 {
2035 struct page *page; 2034 struct page *page;
2036 pte_t *_pte; 2035 pte_t *_pte;
2037 int referenced = 0, none = 0; 2036 int referenced = 0, none = 0;
2038 for (_pte = pte; _pte < pte+HPAGE_PMD_NR; 2037 for (_pte = pte; _pte < pte+HPAGE_PMD_NR;
2039 _pte++, address += PAGE_SIZE) { 2038 _pte++, address += PAGE_SIZE) {
2040 pte_t pteval = *_pte; 2039 pte_t pteval = *_pte;
2041 if (pte_none(pteval)) { 2040 if (pte_none(pteval)) {
2042 if (++none <= khugepaged_max_ptes_none) 2041 if (++none <= khugepaged_max_ptes_none)
2043 continue; 2042 continue;
2044 else 2043 else
2045 goto out; 2044 goto out;
2046 } 2045 }
2047 if (!pte_present(pteval) || !pte_write(pteval)) 2046 if (!pte_present(pteval) || !pte_write(pteval))
2048 goto out; 2047 goto out;
2049 page = vm_normal_page(vma, address, pteval); 2048 page = vm_normal_page(vma, address, pteval);
2050 if (unlikely(!page)) 2049 if (unlikely(!page))
2051 goto out; 2050 goto out;
2052 2051
2053 VM_BUG_ON(PageCompound(page)); 2052 VM_BUG_ON(PageCompound(page));
2054 BUG_ON(!PageAnon(page)); 2053 BUG_ON(!PageAnon(page));
2055 VM_BUG_ON(!PageSwapBacked(page)); 2054 VM_BUG_ON(!PageSwapBacked(page));
2056 2055
2057 /* cannot use mapcount: can't collapse if there's a gup pin */ 2056 /* cannot use mapcount: can't collapse if there's a gup pin */
2058 if (page_count(page) != 1) 2057 if (page_count(page) != 1)
2059 goto out; 2058 goto out;
2060 /* 2059 /*
2061 * We can do it before isolate_lru_page because the 2060 * We can do it before isolate_lru_page because the
2062 * page can't be freed from under us. NOTE: PG_lock 2061 * page can't be freed from under us. NOTE: PG_lock
2063 * is needed to serialize against split_huge_page 2062 * is needed to serialize against split_huge_page
2064 * when invoked from the VM. 2063 * when invoked from the VM.
2065 */ 2064 */
2066 if (!trylock_page(page)) 2065 if (!trylock_page(page))
2067 goto out; 2066 goto out;
2068 /* 2067 /*
2069 * Isolate the page to avoid collapsing a hugepage 2068 * Isolate the page to avoid collapsing a hugepage
2070 * currently in use by the VM. 2069 * currently in use by the VM.
2071 */ 2070 */
2072 if (isolate_lru_page(page)) { 2071 if (isolate_lru_page(page)) {
2073 unlock_page(page); 2072 unlock_page(page);
2074 goto out; 2073 goto out;
2075 } 2074 }
2076 /* 0 stands for page_is_file_cache(page) == false */ 2075 /* 0 stands for page_is_file_cache(page) == false */
2077 inc_zone_page_state(page, NR_ISOLATED_ANON + 0); 2076 inc_zone_page_state(page, NR_ISOLATED_ANON + 0);
2078 VM_BUG_ON(!PageLocked(page)); 2077 VM_BUG_ON(!PageLocked(page));
2079 VM_BUG_ON(PageLRU(page)); 2078 VM_BUG_ON(PageLRU(page));
2080 2079
2081 /* If no mapped pte is young, don't collapse the page */ 2080 /* If no mapped pte is young, don't collapse the page */
2082 if (pte_young(pteval) || PageReferenced(page) || 2081 if (pte_young(pteval) || PageReferenced(page) ||
2083 mmu_notifier_test_young(vma->vm_mm, address)) 2082 mmu_notifier_test_young(vma->vm_mm, address))
2084 referenced = 1; 2083 referenced = 1;
2085 } 2084 }
2086 if (likely(referenced)) 2085 if (likely(referenced))
2087 return 1; 2086 return 1;
2088 out: 2087 out:
2089 release_pte_pages(pte, _pte); 2088 release_pte_pages(pte, _pte);
2090 return 0; 2089 return 0;
2091 } 2090 }
2092 2091
2093 static void __collapse_huge_page_copy(pte_t *pte, struct page *page, 2092 static void __collapse_huge_page_copy(pte_t *pte, struct page *page,
2094 struct vm_area_struct *vma, 2093 struct vm_area_struct *vma,
2095 unsigned long address, 2094 unsigned long address,
2096 spinlock_t *ptl) 2095 spinlock_t *ptl)
2097 { 2096 {
2098 pte_t *_pte; 2097 pte_t *_pte;
2099 for (_pte = pte; _pte < pte+HPAGE_PMD_NR; _pte++) { 2098 for (_pte = pte; _pte < pte+HPAGE_PMD_NR; _pte++) {
2100 pte_t pteval = *_pte; 2099 pte_t pteval = *_pte;
2101 struct page *src_page; 2100 struct page *src_page;
2102 2101
2103 if (pte_none(pteval)) { 2102 if (pte_none(pteval)) {
2104 clear_user_highpage(page, address); 2103 clear_user_highpage(page, address);
2105 add_mm_counter(vma->vm_mm, MM_ANONPAGES, 1); 2104 add_mm_counter(vma->vm_mm, MM_ANONPAGES, 1);
2106 } else { 2105 } else {
2107 src_page = pte_page(pteval); 2106 src_page = pte_page(pteval);
2108 copy_user_highpage(page, src_page, address, vma); 2107 copy_user_highpage(page, src_page, address, vma);
2109 VM_BUG_ON(page_mapcount(src_page) != 1); 2108 VM_BUG_ON(page_mapcount(src_page) != 1);
2110 release_pte_page(src_page); 2109 release_pte_page(src_page);
2111 /* 2110 /*
2112 * ptl mostly unnecessary, but preempt has to 2111 * ptl mostly unnecessary, but preempt has to
2113 * be disabled to update the per-cpu stats 2112 * be disabled to update the per-cpu stats
2114 * inside page_remove_rmap(). 2113 * inside page_remove_rmap().
2115 */ 2114 */
2116 spin_lock(ptl); 2115 spin_lock(ptl);
2117 /* 2116 /*
2118 * paravirt calls inside pte_clear here are 2117 * paravirt calls inside pte_clear here are
2119 * superfluous. 2118 * superfluous.
2120 */ 2119 */
2121 pte_clear(vma->vm_mm, address, _pte); 2120 pte_clear(vma->vm_mm, address, _pte);
2122 page_remove_rmap(src_page); 2121 page_remove_rmap(src_page);
2123 spin_unlock(ptl); 2122 spin_unlock(ptl);
2124 free_page_and_swap_cache(src_page); 2123 free_page_and_swap_cache(src_page);
2125 } 2124 }
2126 2125
2127 address += PAGE_SIZE; 2126 address += PAGE_SIZE;
2128 page++; 2127 page++;
2129 } 2128 }
2130 } 2129 }
2131 2130
2132 static void khugepaged_alloc_sleep(void) 2131 static void khugepaged_alloc_sleep(void)
2133 { 2132 {
2134 wait_event_freezable_timeout(khugepaged_wait, false, 2133 wait_event_freezable_timeout(khugepaged_wait, false,
2135 msecs_to_jiffies(khugepaged_alloc_sleep_millisecs)); 2134 msecs_to_jiffies(khugepaged_alloc_sleep_millisecs));
2136 } 2135 }
2137 2136
2138 #ifdef CONFIG_NUMA 2137 #ifdef CONFIG_NUMA
2139 static bool khugepaged_prealloc_page(struct page **hpage, bool *wait) 2138 static bool khugepaged_prealloc_page(struct page **hpage, bool *wait)
2140 { 2139 {
2141 if (IS_ERR(*hpage)) { 2140 if (IS_ERR(*hpage)) {
2142 if (!*wait) 2141 if (!*wait)
2143 return false; 2142 return false;
2144 2143
2145 *wait = false; 2144 *wait = false;
2146 *hpage = NULL; 2145 *hpage = NULL;
2147 khugepaged_alloc_sleep(); 2146 khugepaged_alloc_sleep();
2148 } else if (*hpage) { 2147 } else if (*hpage) {
2149 put_page(*hpage); 2148 put_page(*hpage);
2150 *hpage = NULL; 2149 *hpage = NULL;
2151 } 2150 }
2152 2151
2153 return true; 2152 return true;
2154 } 2153 }
2155 2154
2156 static struct page 2155 static struct page
2157 *khugepaged_alloc_page(struct page **hpage, struct mm_struct *mm, 2156 *khugepaged_alloc_page(struct page **hpage, struct mm_struct *mm,
2158 struct vm_area_struct *vma, unsigned long address, 2157 struct vm_area_struct *vma, unsigned long address,
2159 int node) 2158 int node)
2160 { 2159 {
2161 VM_BUG_ON(*hpage); 2160 VM_BUG_ON(*hpage);
2162 /* 2161 /*
2163 * Allocate the page while the vma is still valid and under 2162 * Allocate the page while the vma is still valid and under
2164 * the mmap_sem read mode so there is no memory allocation 2163 * the mmap_sem read mode so there is no memory allocation
2165 * later when we take the mmap_sem in write mode. This is more 2164 * later when we take the mmap_sem in write mode. This is more
2166 * friendly behavior (OTOH it may actually hide bugs) to 2165 * friendly behavior (OTOH it may actually hide bugs) to
2167 * filesystems in userland with daemons allocating memory in 2166 * filesystems in userland with daemons allocating memory in
2168 * the userland I/O paths. Allocating memory with the 2167 * the userland I/O paths. Allocating memory with the
2169 * mmap_sem in read mode is also a good idea to allow greater 2168 * mmap_sem in read mode is also a good idea to allow greater
2170 * scalability. 2169 * scalability.
2171 */ 2170 */
2172 *hpage = alloc_hugepage_vma(khugepaged_defrag(), vma, address, 2171 *hpage = alloc_hugepage_vma(khugepaged_defrag(), vma, address,
2173 node, __GFP_OTHER_NODE); 2172 node, __GFP_OTHER_NODE);
2174 2173
2175 /* 2174 /*
2176 * After allocating the hugepage, release the mmap_sem read lock in 2175 * After allocating the hugepage, release the mmap_sem read lock in
2177 * preparation for taking it in write mode. 2176 * preparation for taking it in write mode.
2178 */ 2177 */
2179 up_read(&mm->mmap_sem); 2178 up_read(&mm->mmap_sem);
2180 if (unlikely(!*hpage)) { 2179 if (unlikely(!*hpage)) {
2181 count_vm_event(THP_COLLAPSE_ALLOC_FAILED); 2180 count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
2182 *hpage = ERR_PTR(-ENOMEM); 2181 *hpage = ERR_PTR(-ENOMEM);
2183 return NULL; 2182 return NULL;
2184 } 2183 }
2185 2184
2186 count_vm_event(THP_COLLAPSE_ALLOC); 2185 count_vm_event(THP_COLLAPSE_ALLOC);
2187 return *hpage; 2186 return *hpage;
2188 } 2187 }
2189 #else 2188 #else
2190 static struct page *khugepaged_alloc_hugepage(bool *wait) 2189 static struct page *khugepaged_alloc_hugepage(bool *wait)
2191 { 2190 {
2192 struct page *hpage; 2191 struct page *hpage;
2193 2192
2194 do { 2193 do {
2195 hpage = alloc_hugepage(khugepaged_defrag()); 2194 hpage = alloc_hugepage(khugepaged_defrag());
2196 if (!hpage) { 2195 if (!hpage) {
2197 count_vm_event(THP_COLLAPSE_ALLOC_FAILED); 2196 count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
2198 if (!*wait) 2197 if (!*wait)
2199 return NULL; 2198 return NULL;
2200 2199
2201 *wait = false; 2200 *wait = false;
2202 khugepaged_alloc_sleep(); 2201 khugepaged_alloc_sleep();
2203 } else 2202 } else
2204 count_vm_event(THP_COLLAPSE_ALLOC); 2203 count_vm_event(THP_COLLAPSE_ALLOC);
2205 } while (unlikely(!hpage) && likely(khugepaged_enabled())); 2204 } while (unlikely(!hpage) && likely(khugepaged_enabled()));
2206 2205
2207 return hpage; 2206 return hpage;
2208 } 2207 }
2209 2208
2210 static bool khugepaged_prealloc_page(struct page **hpage, bool *wait) 2209 static bool khugepaged_prealloc_page(struct page **hpage, bool *wait)
2211 { 2210 {
2212 if (!*hpage) 2211 if (!*hpage)
2213 *hpage = khugepaged_alloc_hugepage(wait); 2212 *hpage = khugepaged_alloc_hugepage(wait);
2214 2213
2215 if (unlikely(!*hpage)) 2214 if (unlikely(!*hpage))
2216 return false; 2215 return false;
2217 2216
2218 return true; 2217 return true;
2219 } 2218 }
2220 2219
2221 static struct page 2220 static struct page
2222 *khugepaged_alloc_page(struct page **hpage, struct mm_struct *mm, 2221 *khugepaged_alloc_page(struct page **hpage, struct mm_struct *mm,
2223 struct vm_area_struct *vma, unsigned long address, 2222 struct vm_area_struct *vma, unsigned long address,
2224 int node) 2223 int node)
2225 { 2224 {
2226 up_read(&mm->mmap_sem); 2225 up_read(&mm->mmap_sem);
2227 VM_BUG_ON(!*hpage); 2226 VM_BUG_ON(!*hpage);
2228 return *hpage; 2227 return *hpage;
2229 } 2228 }
2230 #endif 2229 #endif
2231 2230
2232 static bool hugepage_vma_check(struct vm_area_struct *vma) 2231 static bool hugepage_vma_check(struct vm_area_struct *vma)
2233 { 2232 {
2234 if ((!(vma->vm_flags & VM_HUGEPAGE) && !khugepaged_always()) || 2233 if ((!(vma->vm_flags & VM_HUGEPAGE) && !khugepaged_always()) ||
2235 (vma->vm_flags & VM_NOHUGEPAGE)) 2234 (vma->vm_flags & VM_NOHUGEPAGE))
2236 return false; 2235 return false;
2237 2236
2238 if (!vma->anon_vma || vma->vm_ops) 2237 if (!vma->anon_vma || vma->vm_ops)
2239 return false; 2238 return false;
2240 if (is_vma_temporary_stack(vma)) 2239 if (is_vma_temporary_stack(vma))
2241 return false; 2240 return false;
2242 VM_BUG_ON(vma->vm_flags & VM_NO_THP); 2241 VM_BUG_ON(vma->vm_flags & VM_NO_THP);
2243 return true; 2242 return true;
2244 } 2243 }
2245 2244
2246 static void collapse_huge_page(struct mm_struct *mm, 2245 static void collapse_huge_page(struct mm_struct *mm,
2247 unsigned long address, 2246 unsigned long address,
2248 struct page **hpage, 2247 struct page **hpage,
2249 struct vm_area_struct *vma, 2248 struct vm_area_struct *vma,
2250 int node) 2249 int node)
2251 { 2250 {
2252 pmd_t *pmd, _pmd; 2251 pmd_t *pmd, _pmd;
2253 pte_t *pte; 2252 pte_t *pte;
2254 pgtable_t pgtable; 2253 pgtable_t pgtable;
2255 struct page *new_page; 2254 struct page *new_page;
2256 spinlock_t *ptl; 2255 spinlock_t *ptl;
2257 int isolated; 2256 int isolated;
2258 unsigned long hstart, hend; 2257 unsigned long hstart, hend;
2259 unsigned long mmun_start; /* For mmu_notifiers */ 2258 unsigned long mmun_start; /* For mmu_notifiers */
2260 unsigned long mmun_end; /* For mmu_notifiers */ 2259 unsigned long mmun_end; /* For mmu_notifiers */
2261 2260
2262 VM_BUG_ON(address & ~HPAGE_PMD_MASK); 2261 VM_BUG_ON(address & ~HPAGE_PMD_MASK);
2263 2262
2264 /* release the mmap_sem read lock. */ 2263 /* release the mmap_sem read lock. */
2265 new_page = khugepaged_alloc_page(hpage, mm, vma, address, node); 2264 new_page = khugepaged_alloc_page(hpage, mm, vma, address, node);
2266 if (!new_page) 2265 if (!new_page)
2267 return; 2266 return;
2268 2267
2269 if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) 2268 if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL)))
2270 return; 2269 return;
2271 2270
2272 /* 2271 /*
2273 * Prevent all access to pagetables with the exception of 2272 * Prevent all access to pagetables with the exception of
2274 * gup_fast later handled by the ptep_clear_flush and the VM 2273 * gup_fast later handled by the ptep_clear_flush and the VM
2275 * handled by the anon_vma lock + PG_lock. 2274 * handled by the anon_vma lock + PG_lock.
2276 */ 2275 */
2277 down_write(&mm->mmap_sem); 2276 down_write(&mm->mmap_sem);
2278 if (unlikely(khugepaged_test_exit(mm))) 2277 if (unlikely(khugepaged_test_exit(mm)))
2279 goto out; 2278 goto out;
2280 2279
2281 vma = find_vma(mm, address); 2280 vma = find_vma(mm, address);
2282 hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; 2281 hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
2283 hend = vma->vm_end & HPAGE_PMD_MASK; 2282 hend = vma->vm_end & HPAGE_PMD_MASK;
2284 if (address < hstart || address + HPAGE_PMD_SIZE > hend) 2283 if (address < hstart || address + HPAGE_PMD_SIZE > hend)
2285 goto out; 2284 goto out;
2286 if (!hugepage_vma_check(vma)) 2285 if (!hugepage_vma_check(vma))
2287 goto out; 2286 goto out;
2288 pmd = mm_find_pmd(mm, address); 2287 pmd = mm_find_pmd(mm, address);
2289 if (!pmd) 2288 if (!pmd)
2290 goto out; 2289 goto out;
2291 if (pmd_trans_huge(*pmd)) 2290 if (pmd_trans_huge(*pmd))
2292 goto out; 2291 goto out;
2293 2292
2294 anon_vma_lock_write(vma->anon_vma); 2293 anon_vma_lock_write(vma->anon_vma);
2295 2294
2296 pte = pte_offset_map(pmd, address); 2295 pte = pte_offset_map(pmd, address);
2297 ptl = pte_lockptr(mm, pmd); 2296 ptl = pte_lockptr(mm, pmd);
2298 2297
2299 mmun_start = address; 2298 mmun_start = address;
2300 mmun_end = address + HPAGE_PMD_SIZE; 2299 mmun_end = address + HPAGE_PMD_SIZE;
2301 mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); 2300 mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
2302 spin_lock(&mm->page_table_lock); /* probably unnecessary */ 2301 spin_lock(&mm->page_table_lock); /* probably unnecessary */
2303 /* 2302 /*
2304 * After this gup_fast can't run anymore. This also removes 2303 * After this gup_fast can't run anymore. This also removes
2305 * any huge TLB entry from the CPU so we won't allow 2304 * any huge TLB entry from the CPU so we won't allow
2306 * huge and small TLB entries for the same virtual address 2305 * huge and small TLB entries for the same virtual address
2307 * to avoid the risk of CPU bugs in that area. 2306 * to avoid the risk of CPU bugs in that area.
2308 */ 2307 */
2309 _pmd = pmdp_clear_flush(vma, address, pmd); 2308 _pmd = pmdp_clear_flush(vma, address, pmd);
2310 spin_unlock(&mm->page_table_lock); 2309 spin_unlock(&mm->page_table_lock);
2311 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); 2310 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
2312 2311
2313 spin_lock(ptl); 2312 spin_lock(ptl);
2314 isolated = __collapse_huge_page_isolate(vma, address, pte); 2313 isolated = __collapse_huge_page_isolate(vma, address, pte);
2315 spin_unlock(ptl); 2314 spin_unlock(ptl);
2316 2315
2317 if (unlikely(!isolated)) { 2316 if (unlikely(!isolated)) {
2318 pte_unmap(pte); 2317 pte_unmap(pte);
2319 spin_lock(&mm->page_table_lock); 2318 spin_lock(&mm->page_table_lock);
2320 BUG_ON(!pmd_none(*pmd)); 2319 BUG_ON(!pmd_none(*pmd));
2321 set_pmd_at(mm, address, pmd, _pmd); 2320 set_pmd_at(mm, address, pmd, _pmd);
2322 spin_unlock(&mm->page_table_lock); 2321 spin_unlock(&mm->page_table_lock);
2323 anon_vma_unlock_write(vma->anon_vma); 2322 anon_vma_unlock_write(vma->anon_vma);
2324 goto out; 2323 goto out;
2325 } 2324 }
2326 2325
2327 /* 2326 /*
2328 * All pages are isolated and locked so anon_vma rmap 2327 * All pages are isolated and locked so anon_vma rmap
2329 * can't run anymore. 2328 * can't run anymore.
2330 */ 2329 */
2331 anon_vma_unlock_write(vma->anon_vma); 2330 anon_vma_unlock_write(vma->anon_vma);
2332 2331
2333 __collapse_huge_page_copy(pte, new_page, vma, address, ptl); 2332 __collapse_huge_page_copy(pte, new_page, vma, address, ptl);
2334 pte_unmap(pte); 2333 pte_unmap(pte);
2335 __SetPageUptodate(new_page); 2334 __SetPageUptodate(new_page);
2336 pgtable = pmd_pgtable(_pmd); 2335 pgtable = pmd_pgtable(_pmd);
2337 2336
2338 _pmd = mk_huge_pmd(new_page, vma); 2337 _pmd = mk_huge_pmd(new_page, vma);
2339 2338
2340 /* 2339 /*
2341 * spin_lock() below is not the equivalent of smp_wmb(), so 2340 * spin_lock() below is not the equivalent of smp_wmb(), so
2342 * this is needed to prevent the copy_huge_page writes from 2341 * this is needed to prevent the copy_huge_page writes from
2343 * becoming visible after the set_pmd_at() write. 2342 * becoming visible after the set_pmd_at() write.
2344 */ 2343 */
2345 smp_wmb(); 2344 smp_wmb();
2346 2345
2347 spin_lock(&mm->page_table_lock); 2346 spin_lock(&mm->page_table_lock);
2348 BUG_ON(!pmd_none(*pmd)); 2347 BUG_ON(!pmd_none(*pmd));
2349 page_add_new_anon_rmap(new_page, vma, address); 2348 page_add_new_anon_rmap(new_page, vma, address);
2350 set_pmd_at(mm, address, pmd, _pmd); 2349 set_pmd_at(mm, address, pmd, _pmd);
2351 update_mmu_cache_pmd(vma, address, pmd); 2350 update_mmu_cache_pmd(vma, address, pmd);
2352 pgtable_trans_huge_deposit(mm, pgtable); 2351 pgtable_trans_huge_deposit(mm, pgtable);
2353 spin_unlock(&mm->page_table_lock); 2352 spin_unlock(&mm->page_table_lock);
2354 2353
2355 *hpage = NULL; 2354 *hpage = NULL;
2356 2355
2357 khugepaged_pages_collapsed++; 2356 khugepaged_pages_collapsed++;
2358 out_up_write: 2357 out_up_write:
2359 up_write(&mm->mmap_sem); 2358 up_write(&mm->mmap_sem);
2360 return; 2359 return;
2361 2360
2362 out: 2361 out:
2363 mem_cgroup_uncharge_page(new_page); 2362 mem_cgroup_uncharge_page(new_page);
2364 goto out_up_write; 2363 goto out_up_write;
2365 } 2364 }
2366 2365
2367 static int khugepaged_scan_pmd(struct mm_struct *mm, 2366 static int khugepaged_scan_pmd(struct mm_struct *mm,
2368 struct vm_area_struct *vma, 2367 struct vm_area_struct *vma,
2369 unsigned long address, 2368 unsigned long address,
2370 struct page **hpage) 2369 struct page **hpage)
2371 { 2370 {
2372 pmd_t *pmd; 2371 pmd_t *pmd;
2373 pte_t *pte, *_pte; 2372 pte_t *pte, *_pte;
2374 int ret = 0, referenced = 0, none = 0; 2373 int ret = 0, referenced = 0, none = 0;
2375 struct page *page; 2374 struct page *page;
2376 unsigned long _address; 2375 unsigned long _address;
2377 spinlock_t *ptl; 2376 spinlock_t *ptl;
2378 int node = NUMA_NO_NODE; 2377 int node = NUMA_NO_NODE;
2379 2378
2380 VM_BUG_ON(address & ~HPAGE_PMD_MASK); 2379 VM_BUG_ON(address & ~HPAGE_PMD_MASK);
2381 2380
2382 pmd = mm_find_pmd(mm, address); 2381 pmd = mm_find_pmd(mm, address);
2383 if (!pmd) 2382 if (!pmd)
2384 goto out; 2383 goto out;
2385 if (pmd_trans_huge(*pmd)) 2384 if (pmd_trans_huge(*pmd))
2386 goto out; 2385 goto out;
2387 2386
2388 pte = pte_offset_map_lock(mm, pmd, address, &ptl); 2387 pte = pte_offset_map_lock(mm, pmd, address, &ptl);
2389 for (_address = address, _pte = pte; _pte < pte+HPAGE_PMD_NR; 2388 for (_address = address, _pte = pte; _pte < pte+HPAGE_PMD_NR;
2390 _pte++, _address += PAGE_SIZE) { 2389 _pte++, _address += PAGE_SIZE) {
2391 pte_t pteval = *_pte; 2390 pte_t pteval = *_pte;
2392 if (pte_none(pteval)) { 2391 if (pte_none(pteval)) {
2393 if (++none <= khugepaged_max_ptes_none) 2392 if (++none <= khugepaged_max_ptes_none)
2394 continue; 2393 continue;
2395 else 2394 else
2396 goto out_unmap; 2395 goto out_unmap;
2397 } 2396 }
2398 if (!pte_present(pteval) || !pte_write(pteval)) 2397 if (!pte_present(pteval) || !pte_write(pteval))
2399 goto out_unmap; 2398 goto out_unmap;
2400 page = vm_normal_page(vma, _address, pteval); 2399 page = vm_normal_page(vma, _address, pteval);
2401 if (unlikely(!page)) 2400 if (unlikely(!page))
2402 goto out_unmap; 2401 goto out_unmap;
2403 /* 2402 /*
2404 * Choose the node of the first page. This could 2403 * Choose the node of the first page. This could
2405 * be more sophisticated and look at more pages, 2404 * be more sophisticated and look at more pages,
2406 * but isn't for now. 2405 * but isn't for now.
2407 */ 2406 */
2408 if (node == NUMA_NO_NODE) 2407 if (node == NUMA_NO_NODE)
2409 node = page_to_nid(page); 2408 node = page_to_nid(page);
2410 VM_BUG_ON(PageCompound(page)); 2409 VM_BUG_ON(PageCompound(page));
2411 if (!PageLRU(page) || PageLocked(page) || !PageAnon(page)) 2410 if (!PageLRU(page) || PageLocked(page) || !PageAnon(page))
2412 goto out_unmap; 2411 goto out_unmap;
2413 /* cannot use mapcount: can't collapse if there's a gup pin */ 2412 /* cannot use mapcount: can't collapse if there's a gup pin */
2414 if (page_count(page) != 1) 2413 if (page_count(page) != 1)
2415 goto out_unmap; 2414 goto out_unmap;
2416 if (pte_young(pteval) || PageReferenced(page) || 2415 if (pte_young(pteval) || PageReferenced(page) ||
2417 mmu_notifier_test_young(vma->vm_mm, address)) 2416 mmu_notifier_test_young(vma->vm_mm, address))
2418 referenced = 1; 2417 referenced = 1;
2419 } 2418 }
2420 if (referenced) 2419 if (referenced)
2421 ret = 1; 2420 ret = 1;
2422 out_unmap: 2421 out_unmap:
2423 pte_unmap_unlock(pte, ptl); 2422 pte_unmap_unlock(pte, ptl);
2424 if (ret) 2423 if (ret)
2425 /* collapse_huge_page will return with the mmap_sem released */ 2424 /* collapse_huge_page will return with the mmap_sem released */
2426 collapse_huge_page(mm, address, hpage, vma, node); 2425 collapse_huge_page(mm, address, hpage, vma, node);
2427 out: 2426 out:
2428 return ret; 2427 return ret;
2429 } 2428 }
2430 2429
2431 static void collect_mm_slot(struct mm_slot *mm_slot) 2430 static void collect_mm_slot(struct mm_slot *mm_slot)
2432 { 2431 {
2433 struct mm_struct *mm = mm_slot->mm; 2432 struct mm_struct *mm = mm_slot->mm;
2434 2433
2435 VM_BUG_ON(NR_CPUS != 1 && !spin_is_locked(&khugepaged_mm_lock)); 2434 VM_BUG_ON(NR_CPUS != 1 && !spin_is_locked(&khugepaged_mm_lock));
2436 2435
2437 if (khugepaged_test_exit(mm)) { 2436 if (khugepaged_test_exit(mm)) {
2438 /* free mm_slot */ 2437 /* free mm_slot */
2439 hash_del(&mm_slot->hash); 2438 hash_del(&mm_slot->hash);
2440 list_del(&mm_slot->mm_node); 2439 list_del(&mm_slot->mm_node);
2441 2440
2442 /* 2441 /*
2443 * Not strictly needed because the mm exited already. 2442 * Not strictly needed because the mm exited already.
2444 * 2443 *
2445 * clear_bit(MMF_VM_HUGEPAGE, &mm->flags); 2444 * clear_bit(MMF_VM_HUGEPAGE, &mm->flags);
2446 */ 2445 */
2447 2446
2448 /* khugepaged_mm_lock actually not necessary for the below */ 2447 /* khugepaged_mm_lock actually not necessary for the below */
2449 free_mm_slot(mm_slot); 2448 free_mm_slot(mm_slot);
2450 mmdrop(mm); 2449 mmdrop(mm);
2451 } 2450 }
2452 } 2451 }
2453 2452
2454 static unsigned int khugepaged_scan_mm_slot(unsigned int pages, 2453 static unsigned int khugepaged_scan_mm_slot(unsigned int pages,
2455 struct page **hpage) 2454 struct page **hpage)
2456 __releases(&khugepaged_mm_lock) 2455 __releases(&khugepaged_mm_lock)
2457 __acquires(&khugepaged_mm_lock) 2456 __acquires(&khugepaged_mm_lock)
2458 { 2457 {
2459 struct mm_slot *mm_slot; 2458 struct mm_slot *mm_slot;
2460 struct mm_struct *mm; 2459 struct mm_struct *mm;
2461 struct vm_area_struct *vma; 2460 struct vm_area_struct *vma;
2462 int progress = 0; 2461 int progress = 0;
2463 2462
2464 VM_BUG_ON(!pages); 2463 VM_BUG_ON(!pages);
2465 VM_BUG_ON(NR_CPUS != 1 && !spin_is_locked(&khugepaged_mm_lock)); 2464 VM_BUG_ON(NR_CPUS != 1 && !spin_is_locked(&khugepaged_mm_lock));
2466 2465
2467 if (khugepaged_scan.mm_slot) 2466 if (khugepaged_scan.mm_slot)
2468 mm_slot = khugepaged_scan.mm_slot; 2467 mm_slot = khugepaged_scan.mm_slot;
2469 else { 2468 else {
2470 mm_slot = list_entry(khugepaged_scan.mm_head.next, 2469 mm_slot = list_entry(khugepaged_scan.mm_head.next,
2471 struct mm_slot, mm_node); 2470 struct mm_slot, mm_node);
2472 khugepaged_scan.address = 0; 2471 khugepaged_scan.address = 0;
2473 khugepaged_scan.mm_slot = mm_slot; 2472 khugepaged_scan.mm_slot = mm_slot;
2474 } 2473 }
2475 spin_unlock(&khugepaged_mm_lock); 2474 spin_unlock(&khugepaged_mm_lock);
2476 2475
2477 mm = mm_slot->mm; 2476 mm = mm_slot->mm;
2478 down_read(&mm->mmap_sem); 2477 down_read(&mm->mmap_sem);
2479 if (unlikely(khugepaged_test_exit(mm))) 2478 if (unlikely(khugepaged_test_exit(mm)))
2480 vma = NULL; 2479 vma = NULL;
2481 else 2480 else
2482 vma = find_vma(mm, khugepaged_scan.address); 2481 vma = find_vma(mm, khugepaged_scan.address);
2483 2482
2484 progress++; 2483 progress++;
2485 for (; vma; vma = vma->vm_next) { 2484 for (; vma; vma = vma->vm_next) {
2486 unsigned long hstart, hend; 2485 unsigned long hstart, hend;
2487 2486
2488 cond_resched(); 2487 cond_resched();
2489 if (unlikely(khugepaged_test_exit(mm))) { 2488 if (unlikely(khugepaged_test_exit(mm))) {
2490 progress++; 2489 progress++;
2491 break; 2490 break;
2492 } 2491 }
2493 if (!hugepage_vma_check(vma)) { 2492 if (!hugepage_vma_check(vma)) {
2494 skip: 2493 skip:
2495 progress++; 2494 progress++;
2496 continue; 2495 continue;
2497 } 2496 }
2498 hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; 2497 hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
2499 hend = vma->vm_end & HPAGE_PMD_MASK; 2498 hend = vma->vm_end & HPAGE_PMD_MASK;
2500 if (hstart >= hend) 2499 if (hstart >= hend)
2501 goto skip; 2500 goto skip;
2502 if (khugepaged_scan.address > hend) 2501 if (khugepaged_scan.address > hend)
2503 goto skip; 2502 goto skip;
2504 if (khugepaged_scan.address < hstart) 2503 if (khugepaged_scan.address < hstart)
2505 khugepaged_scan.address = hstart; 2504 khugepaged_scan.address = hstart;
2506 VM_BUG_ON(khugepaged_scan.address & ~HPAGE_PMD_MASK); 2505 VM_BUG_ON(khugepaged_scan.address & ~HPAGE_PMD_MASK);
2507 2506
2508 while (khugepaged_scan.address < hend) { 2507 while (khugepaged_scan.address < hend) {
2509 int ret; 2508 int ret;
2510 cond_resched(); 2509 cond_resched();
2511 if (unlikely(khugepaged_test_exit(mm))) 2510 if (unlikely(khugepaged_test_exit(mm)))
2512 goto breakouterloop; 2511 goto breakouterloop;
2513 2512
2514 VM_BUG_ON(khugepaged_scan.address < hstart || 2513 VM_BUG_ON(khugepaged_scan.address < hstart ||
2515 khugepaged_scan.address + HPAGE_PMD_SIZE > 2514 khugepaged_scan.address + HPAGE_PMD_SIZE >
2516 hend); 2515 hend);
2517 ret = khugepaged_scan_pmd(mm, vma, 2516 ret = khugepaged_scan_pmd(mm, vma,
2518 khugepaged_scan.address, 2517 khugepaged_scan.address,
2519 hpage); 2518 hpage);
2520 /* move to next address */ 2519 /* move to next address */
2521 khugepaged_scan.address += HPAGE_PMD_SIZE; 2520 khugepaged_scan.address += HPAGE_PMD_SIZE;
2522 progress += HPAGE_PMD_NR; 2521 progress += HPAGE_PMD_NR;
2523 if (ret) 2522 if (ret)
2524 /* we released mmap_sem so break loop */ 2523 /* we released mmap_sem so break loop */
2525 goto breakouterloop_mmap_sem; 2524 goto breakouterloop_mmap_sem;
2526 if (progress >= pages) 2525 if (progress >= pages)
2527 goto breakouterloop; 2526 goto breakouterloop;
2528 } 2527 }
2529 } 2528 }
2530 breakouterloop: 2529 breakouterloop:
2531 up_read(&mm->mmap_sem); /* exit_mmap will destroy ptes after this */ 2530 up_read(&mm->mmap_sem); /* exit_mmap will destroy ptes after this */
2532 breakouterloop_mmap_sem: 2531 breakouterloop_mmap_sem:
2533 2532
2534 spin_lock(&khugepaged_mm_lock); 2533 spin_lock(&khugepaged_mm_lock);
2535 VM_BUG_ON(khugepaged_scan.mm_slot != mm_slot); 2534 VM_BUG_ON(khugepaged_scan.mm_slot != mm_slot);
2536 /* 2535 /*
2537 * Release the current mm_slot if this mm is about to die, or 2536 * Release the current mm_slot if this mm is about to die, or
2538 * if we scanned all vmas of this mm. 2537 * if we scanned all vmas of this mm.
2539 */ 2538 */
2540 if (khugepaged_test_exit(mm) || !vma) { 2539 if (khugepaged_test_exit(mm) || !vma) {
2541 /* 2540 /*
2542 * Make sure that if mm_users is reaching zero while 2541 * Make sure that if mm_users is reaching zero while
2543 * khugepaged runs here, khugepaged_exit will find 2542 * khugepaged runs here, khugepaged_exit will find
2544 * mm_slot not pointing to the exiting mm. 2543 * mm_slot not pointing to the exiting mm.
2545 */ 2544 */
2546 if (mm_slot->mm_node.next != &khugepaged_scan.mm_head) { 2545 if (mm_slot->mm_node.next != &khugepaged_scan.mm_head) {
2547 khugepaged_scan.mm_slot = list_entry( 2546 khugepaged_scan.mm_slot = list_entry(
2548 mm_slot->mm_node.next, 2547 mm_slot->mm_node.next,
2549 struct mm_slot, mm_node); 2548 struct mm_slot, mm_node);
2550 khugepaged_scan.address = 0; 2549 khugepaged_scan.address = 0;
2551 } else { 2550 } else {
2552 khugepaged_scan.mm_slot = NULL; 2551 khugepaged_scan.mm_slot = NULL;
2553 khugepaged_full_scans++; 2552 khugepaged_full_scans++;
2554 } 2553 }
2555 2554
2556 collect_mm_slot(mm_slot); 2555 collect_mm_slot(mm_slot);
2557 } 2556 }
2558 2557
2559 return progress; 2558 return progress;
2560 } 2559 }
2561 2560
2562 static int khugepaged_has_work(void) 2561 static int khugepaged_has_work(void)
2563 { 2562 {
2564 return !list_empty(&khugepaged_scan.mm_head) && 2563 return !list_empty(&khugepaged_scan.mm_head) &&
2565 khugepaged_enabled(); 2564 khugepaged_enabled();
2566 } 2565 }
2567 2566
2568 static int khugepaged_wait_event(void) 2567 static int khugepaged_wait_event(void)
2569 { 2568 {
2570 return !list_empty(&khugepaged_scan.mm_head) || 2569 return !list_empty(&khugepaged_scan.mm_head) ||
2571 kthread_should_stop(); 2570 kthread_should_stop();
2572 } 2571 }
2573 2572
2574 static void khugepaged_do_scan(void) 2573 static void khugepaged_do_scan(void)
2575 { 2574 {
2576 struct page *hpage = NULL; 2575 struct page *hpage = NULL;
2577 unsigned int progress = 0, pass_through_head = 0; 2576 unsigned int progress = 0, pass_through_head = 0;
2578 unsigned int pages = khugepaged_pages_to_scan; 2577 unsigned int pages = khugepaged_pages_to_scan;
2579 bool wait = true; 2578 bool wait = true;
2580 2579
2581 barrier(); /* write khugepaged_pages_to_scan to local stack */ 2580 barrier(); /* write khugepaged_pages_to_scan to local stack */
2582 2581
2583 while (progress < pages) { 2582 while (progress < pages) {
2584 if (!khugepaged_prealloc_page(&hpage, &wait)) 2583 if (!khugepaged_prealloc_page(&hpage, &wait))
2585 break; 2584 break;
2586 2585
2587 cond_resched(); 2586 cond_resched();
2588 2587
2589 if (unlikely(kthread_should_stop() || freezing(current))) 2588 if (unlikely(kthread_should_stop() || freezing(current)))
2590 break; 2589 break;
2591 2590
2592 spin_lock(&khugepaged_mm_lock); 2591 spin_lock(&khugepaged_mm_lock);
2593 if (!khugepaged_scan.mm_slot) 2592 if (!khugepaged_scan.mm_slot)
2594 pass_through_head++; 2593 pass_through_head++;
2595 if (khugepaged_has_work() && 2594 if (khugepaged_has_work() &&
2596 pass_through_head < 2) 2595 pass_through_head < 2)
2597 progress += khugepaged_scan_mm_slot(pages - progress, 2596 progress += khugepaged_scan_mm_slot(pages - progress,
2598 &hpage); 2597 &hpage);
2599 else 2598 else
2600 progress = pages; 2599 progress = pages;
2601 spin_unlock(&khugepaged_mm_lock); 2600 spin_unlock(&khugepaged_mm_lock);
2602 } 2601 }
2603 2602
2604 if (!IS_ERR_OR_NULL(hpage)) 2603 if (!IS_ERR_OR_NULL(hpage))
2605 put_page(hpage); 2604 put_page(hpage);
2606 } 2605 }
2607 2606
2608 static void khugepaged_wait_work(void) 2607 static void khugepaged_wait_work(void)
2609 { 2608 {
2610 try_to_freeze(); 2609 try_to_freeze();
2611 2610
2612 if (khugepaged_has_work()) { 2611 if (khugepaged_has_work()) {
2613 if (!khugepaged_scan_sleep_millisecs) 2612 if (!khugepaged_scan_sleep_millisecs)
2614 return; 2613 return;
2615 2614
2616 wait_event_freezable_timeout(khugepaged_wait, 2615 wait_event_freezable_timeout(khugepaged_wait,
2617 kthread_should_stop(), 2616 kthread_should_stop(),
2618 msecs_to_jiffies(khugepaged_scan_sleep_millisecs)); 2617 msecs_to_jiffies(khugepaged_scan_sleep_millisecs));
2619 return; 2618 return;
2620 } 2619 }
2621 2620
2622 if (khugepaged_enabled()) 2621 if (khugepaged_enabled())
2623 wait_event_freezable(khugepaged_wait, khugepaged_wait_event()); 2622 wait_event_freezable(khugepaged_wait, khugepaged_wait_event());
2624 } 2623 }
2625 2624
2626 static int khugepaged(void *none) 2625 static int khugepaged(void *none)
2627 { 2626 {
2628 struct mm_slot *mm_slot; 2627 struct mm_slot *mm_slot;
2629 2628
2630 set_freezable(); 2629 set_freezable();
2631 set_user_nice(current, 19); 2630 set_user_nice(current, 19);
2632 2631
2633 while (!kthread_should_stop()) { 2632 while (!kthread_should_stop()) {
2634 khugepaged_do_scan(); 2633 khugepaged_do_scan();
2635 khugepaged_wait_work(); 2634 khugepaged_wait_work();
2636 } 2635 }
2637 2636
2638 spin_lock(&khugepaged_mm_lock); 2637 spin_lock(&khugepaged_mm_lock);
2639 mm_slot = khugepaged_scan.mm_slot; 2638 mm_slot = khugepaged_scan.mm_slot;
2640 khugepaged_scan.mm_slot = NULL; 2639 khugepaged_scan.mm_slot = NULL;
2641 if (mm_slot) 2640 if (mm_slot)
2642 collect_mm_slot(mm_slot); 2641 collect_mm_slot(mm_slot);
2643 spin_unlock(&khugepaged_mm_lock); 2642 spin_unlock(&khugepaged_mm_lock);
2644 return 0; 2643 return 0;
2645 } 2644 }
2646 2645
2647 static void __split_huge_zero_page_pmd(struct vm_area_struct *vma, 2646 static void __split_huge_zero_page_pmd(struct vm_area_struct *vma,
2648 unsigned long haddr, pmd_t *pmd) 2647 unsigned long haddr, pmd_t *pmd)
2649 { 2648 {
2650 struct mm_struct *mm = vma->vm_mm; 2649 struct mm_struct *mm = vma->vm_mm;
2651 pgtable_t pgtable; 2650 pgtable_t pgtable;
2652 pmd_t _pmd; 2651 pmd_t _pmd;
2653 int i; 2652 int i;
2654 2653
2655 pmdp_clear_flush(vma, haddr, pmd); 2654 pmdp_clear_flush(vma, haddr, pmd);
2656 /* leave pmd empty until pte is filled */ 2655 /* leave pmd empty until pte is filled */
2657 2656
2658 pgtable = pgtable_trans_huge_withdraw(mm); 2657 pgtable = pgtable_trans_huge_withdraw(mm);
2659 pmd_populate(mm, &_pmd, pgtable); 2658 pmd_populate(mm, &_pmd, pgtable);
2660 2659
2661 for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) { 2660 for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
2662 pte_t *pte, entry; 2661 pte_t *pte, entry;
2663 entry = pfn_pte(my_zero_pfn(haddr), vma->vm_page_prot); 2662 entry = pfn_pte(my_zero_pfn(haddr), vma->vm_page_prot);
2664 entry = pte_mkspecial(entry); 2663 entry = pte_mkspecial(entry);
2665 pte = pte_offset_map(&_pmd, haddr); 2664 pte = pte_offset_map(&_pmd, haddr);
2666 VM_BUG_ON(!pte_none(*pte)); 2665 VM_BUG_ON(!pte_none(*pte));
2667 set_pte_at(mm, haddr, pte, entry); 2666 set_pte_at(mm, haddr, pte, entry);
2668 pte_unmap(pte); 2667 pte_unmap(pte);
2669 } 2668 }
2670 smp_wmb(); /* make pte visible before pmd */ 2669 smp_wmb(); /* make pte visible before pmd */
2671 pmd_populate(mm, pmd, pgtable); 2670 pmd_populate(mm, pmd, pgtable);
2672 put_huge_zero_page(); 2671 put_huge_zero_page();
2673 } 2672 }
2674 2673
2675 void __split_huge_page_pmd(struct vm_area_struct *vma, unsigned long address, 2674 void __split_huge_page_pmd(struct vm_area_struct *vma, unsigned long address,
2676 pmd_t *pmd) 2675 pmd_t *pmd)
2677 { 2676 {
2678 struct page *page; 2677 struct page *page;
2679 struct mm_struct *mm = vma->vm_mm; 2678 struct mm_struct *mm = vma->vm_mm;
2680 unsigned long haddr = address & HPAGE_PMD_MASK; 2679 unsigned long haddr = address & HPAGE_PMD_MASK;
2681 unsigned long mmun_start; /* For mmu_notifiers */ 2680 unsigned long mmun_start; /* For mmu_notifiers */
2682 unsigned long mmun_end; /* For mmu_notifiers */ 2681 unsigned long mmun_end; /* For mmu_notifiers */
2683 2682
2684 BUG_ON(vma->vm_start > haddr || vma->vm_end < haddr + HPAGE_PMD_SIZE); 2683 BUG_ON(vma->vm_start > haddr || vma->vm_end < haddr + HPAGE_PMD_SIZE);
2685 2684
2686 mmun_start = haddr; 2685 mmun_start = haddr;
2687 mmun_end = haddr + HPAGE_PMD_SIZE; 2686 mmun_end = haddr + HPAGE_PMD_SIZE;
2688 mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); 2687 mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
2689 spin_lock(&mm->page_table_lock); 2688 spin_lock(&mm->page_table_lock);
2690 if (unlikely(!pmd_trans_huge(*pmd))) { 2689 if (unlikely(!pmd_trans_huge(*pmd))) {
2691 spin_unlock(&mm->page_table_lock); 2690 spin_unlock(&mm->page_table_lock);
2692 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); 2691 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
2693 return; 2692 return;
2694 } 2693 }
2695 if (is_huge_zero_pmd(*pmd)) { 2694 if (is_huge_zero_pmd(*pmd)) {
2696 __split_huge_zero_page_pmd(vma, haddr, pmd); 2695 __split_huge_zero_page_pmd(vma, haddr, pmd);
2697 spin_unlock(&mm->page_table_lock); 2696 spin_unlock(&mm->page_table_lock);
2698 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); 2697 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
2699 return; 2698 return;
2700 } 2699 }
2701 page = pmd_page(*pmd); 2700 page = pmd_page(*pmd);
2702 VM_BUG_ON(!page_count(page)); 2701 VM_BUG_ON(!page_count(page));
2703 get_page(page); 2702 get_page(page);
2704 spin_unlock(&mm->page_table_lock); 2703 spin_unlock(&mm->page_table_lock);
2705 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); 2704 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
2706 2705
2707 split_huge_page(page); 2706 split_huge_page(page);
2708 2707
2709 put_page(page); 2708 put_page(page);
2710 BUG_ON(pmd_trans_huge(*pmd)); 2709 BUG_ON(pmd_trans_huge(*pmd));
2711 } 2710 }
2712 2711
2713 void split_huge_page_pmd_mm(struct mm_struct *mm, unsigned long address, 2712 void split_huge_page_pmd_mm(struct mm_struct *mm, unsigned long address,
2714 pmd_t *pmd) 2713 pmd_t *pmd)
2715 { 2714 {
2716 struct vm_area_struct *vma; 2715 struct vm_area_struct *vma;
2717 2716
2718 vma = find_vma(mm, address); 2717 vma = find_vma(mm, address);
2719 BUG_ON(vma == NULL); 2718 BUG_ON(vma == NULL);
2720 split_huge_page_pmd(vma, address, pmd); 2719 split_huge_page_pmd(vma, address, pmd);
2721 } 2720 }
2722 2721
2723 static void split_huge_page_address(struct mm_struct *mm, 2722 static void split_huge_page_address(struct mm_struct *mm,
2724 unsigned long address) 2723 unsigned long address)
2725 { 2724 {
2726 pmd_t *pmd; 2725 pmd_t *pmd;
2727 2726
2728 VM_BUG_ON(!(address & ~HPAGE_PMD_MASK)); 2727 VM_BUG_ON(!(address & ~HPAGE_PMD_MASK));
2729 2728
2730 pmd = mm_find_pmd(mm, address); 2729 pmd = mm_find_pmd(mm, address);
2731 if (!pmd) 2730 if (!pmd)
2732 return; 2731 return;
2733 /* 2732 /*
2734 * Caller holds the mmap_sem in write mode, so a huge pmd cannot 2733 * Caller holds the mmap_sem in write mode, so a huge pmd cannot
2735 * materialize from under us. 2734 * materialize from under us.
2736 */ 2735 */
2737 split_huge_page_pmd_mm(mm, address, pmd); 2736 split_huge_page_pmd_mm(mm, address, pmd);
2738 } 2737 }
2739 2738
2740 void __vma_adjust_trans_huge(struct vm_area_struct *vma, 2739 void __vma_adjust_trans_huge(struct vm_area_struct *vma,
2741 unsigned long start, 2740 unsigned long start,
2742 unsigned long end, 2741 unsigned long end,
2743 long adjust_next) 2742 long adjust_next)
2744 { 2743 {
2745 /* 2744 /*
2746 * If the new start address isn't hpage aligned and it could 2745 * If the new start address isn't hpage aligned and it could
2747 * previously contain a hugepage: check if we need to split 2746 * previously contain a hugepage: check if we need to split
2748 * a huge pmd. 2747 * a huge pmd.
2749 */ 2748 */
2750 if (start & ~HPAGE_PMD_MASK && 2749 if (start & ~HPAGE_PMD_MASK &&
2751 (start & HPAGE_PMD_MASK) >= vma->vm_start && 2750 (start & HPAGE_PMD_MASK) >= vma->vm_start &&
2752 (start & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= vma->vm_end) 2751 (start & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= vma->vm_end)
2753 split_huge_page_address(vma->vm_mm, start); 2752 split_huge_page_address(vma->vm_mm, start);
2754 2753
2755 /* 2754 /*
2756 * If the new end address isn't hpage aligned and it could 2755 * If the new end address isn't hpage aligned and it could
2757 * previously contain a hugepage: check if we need to split 2756 * previously contain a hugepage: check if we need to split
2758 * a huge pmd. 2757 * a huge pmd.
2759 */ 2758 */
2760 if (end & ~HPAGE_PMD_MASK && 2759 if (end & ~HPAGE_PMD_MASK &&
2761 (end & HPAGE_PMD_MASK) >= vma->vm_start && 2760 (end & HPAGE_PMD_MASK) >= vma->vm_start &&
2762 (end & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= vma->vm_end) 2761 (end & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= vma->vm_end)
2763 split_huge_page_address(vma->vm_mm, end); 2762 split_huge_page_address(vma->vm_mm, end);
2764 2763
2765 /* 2764 /*
2766 * If we're also updating the vma->vm_next->vm_start, if the new 2765 * If we're also updating the vma->vm_next->vm_start, if the new
2767 * vm_next->vm_start isn't page aligned and it could previously 2766 * vm_next->vm_start isn't page aligned and it could previously
2768 * contain a hugepage: check if we need to split a huge pmd. 2767 * contain a hugepage: check if we need to split a huge pmd.
2769 */ 2768 */
2770 if (adjust_next > 0) { 2769 if (adjust_next > 0) {
2771 struct vm_area_struct *next = vma->vm_next; 2770 struct vm_area_struct *next = vma->vm_next;
2772 unsigned long nstart = next->vm_start; 2771 unsigned long nstart = next->vm_start;
2773 nstart += adjust_next << PAGE_SHIFT; 2772 nstart += adjust_next << PAGE_SHIFT;
2774 if (nstart & ~HPAGE_PMD_MASK && 2773 if (nstart & ~HPAGE_PMD_MASK &&
2775 (nstart & HPAGE_PMD_MASK) >= next->vm_start && 2774 (nstart & HPAGE_PMD_MASK) >= next->vm_start &&
2776 (nstart & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= next->vm_end) 2775 (nstart & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= next->vm_end)
2777 split_huge_page_address(next->vm_mm, nstart); 2776 split_huge_page_address(next->vm_mm, nstart);
2778 } 2777 }
2779 } 2778 }
1 /* 1 /*
2 * linux/mm/memory.c 2 * linux/mm/memory.c
3 * 3 *
4 * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds 4 * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
5 */ 5 */
6 6
7 /* 7 /*
8 * demand-loading started 01.12.91 - seems it is high on the list of 8 * demand-loading started 01.12.91 - seems it is high on the list of
9 * things wanted, and it should be easy to implement. - Linus 9 * things wanted, and it should be easy to implement. - Linus
10 */ 10 */
11 11
12 /* 12 /*
13 * Ok, demand-loading was easy, shared pages a little bit trickier. Shared 13 * Ok, demand-loading was easy, shared pages a little bit trickier. Shared
14 * pages started 02.12.91, seems to work. - Linus. 14 * pages started 02.12.91, seems to work. - Linus.
15 * 15 *
16 * Tested sharing by executing about 30 /bin/sh: under the old kernel it 16 * Tested sharing by executing about 30 /bin/sh: under the old kernel it
17 * would have taken more than the 6M I have free, but it worked well as 17 * would have taken more than the 6M I have free, but it worked well as
18 * far as I could see. 18 * far as I could see.
19 * 19 *
20 * Also corrected some "invalidate()"s - I wasn't doing enough of them. 20 * Also corrected some "invalidate()"s - I wasn't doing enough of them.
21 */ 21 */
22 22
23 /* 23 /*
24 * Real VM (paging to/from disk) started 18.12.91. Much more work and 24 * Real VM (paging to/from disk) started 18.12.91. Much more work and
25 * thought has to go into this. Oh, well.. 25 * thought has to go into this. Oh, well..
26 * 19.12.91 - works, somewhat. Sometimes I get faults, don't know why. 26 * 19.12.91 - works, somewhat. Sometimes I get faults, don't know why.
27 * Found it. Everything seems to work now. 27 * Found it. Everything seems to work now.
28 * 20.12.91 - Ok, making the swap-device changeable like the root. 28 * 20.12.91 - Ok, making the swap-device changeable like the root.
29 */ 29 */
30 30
31 /* 31 /*
32 * 05.04.94 - Multi-page memory management added for v1.1. 32 * 05.04.94 - Multi-page memory management added for v1.1.
33 * Idea by Alex Bligh (alex@cconcepts.co.uk) 33 * Idea by Alex Bligh (alex@cconcepts.co.uk)
34 * 34 *
35 * 16.07.99 - Support of BIGMEM added by Gerhard Wichert, Siemens AG 35 * 16.07.99 - Support of BIGMEM added by Gerhard Wichert, Siemens AG
36 * (Gerhard.Wichert@pdb.siemens.de) 36 * (Gerhard.Wichert@pdb.siemens.de)
37 * 37 *
38 * Aug/Sep 2004 Changed to four level page tables (Andi Kleen) 38 * Aug/Sep 2004 Changed to four level page tables (Andi Kleen)
39 */ 39 */
40 40
41 #include <linux/kernel_stat.h> 41 #include <linux/kernel_stat.h>
42 #include <linux/mm.h> 42 #include <linux/mm.h>
43 #include <linux/hugetlb.h> 43 #include <linux/hugetlb.h>
44 #include <linux/mman.h> 44 #include <linux/mman.h>
45 #include <linux/swap.h> 45 #include <linux/swap.h>
46 #include <linux/highmem.h> 46 #include <linux/highmem.h>
47 #include <linux/pagemap.h> 47 #include <linux/pagemap.h>
48 #include <linux/ksm.h> 48 #include <linux/ksm.h>
49 #include <linux/rmap.h> 49 #include <linux/rmap.h>
50 #include <linux/export.h> 50 #include <linux/export.h>
51 #include <linux/delayacct.h> 51 #include <linux/delayacct.h>
52 #include <linux/init.h> 52 #include <linux/init.h>
53 #include <linux/writeback.h> 53 #include <linux/writeback.h>
54 #include <linux/memcontrol.h> 54 #include <linux/memcontrol.h>
55 #include <linux/mmu_notifier.h> 55 #include <linux/mmu_notifier.h>
56 #include <linux/kallsyms.h> 56 #include <linux/kallsyms.h>
57 #include <linux/swapops.h> 57 #include <linux/swapops.h>
58 #include <linux/elf.h> 58 #include <linux/elf.h>
59 #include <linux/gfp.h> 59 #include <linux/gfp.h>
60 #include <linux/migrate.h> 60 #include <linux/migrate.h>
61 #include <linux/string.h> 61 #include <linux/string.h>
62 62
63 #include <asm/io.h> 63 #include <asm/io.h>
64 #include <asm/pgalloc.h> 64 #include <asm/pgalloc.h>
65 #include <asm/uaccess.h> 65 #include <asm/uaccess.h>
66 #include <asm/tlb.h> 66 #include <asm/tlb.h>
67 #include <asm/tlbflush.h> 67 #include <asm/tlbflush.h>
68 #include <asm/pgtable.h> 68 #include <asm/pgtable.h>
69 69
70 #include "internal.h" 70 #include "internal.h"
71 71
72 #ifdef LAST_NID_NOT_IN_PAGE_FLAGS 72 #ifdef LAST_NID_NOT_IN_PAGE_FLAGS
73 #warning Unfortunate NUMA and NUMA Balancing config, growing page-frame for last_nid. 73 #warning Unfortunate NUMA and NUMA Balancing config, growing page-frame for last_nid.
74 #endif 74 #endif
75 75
76 #ifndef CONFIG_NEED_MULTIPLE_NODES 76 #ifndef CONFIG_NEED_MULTIPLE_NODES
77 /* use the per-pgdat data instead for discontigmem - mbligh */ 77 /* use the per-pgdat data instead for discontigmem - mbligh */
78 unsigned long max_mapnr; 78 unsigned long max_mapnr;
79 struct page *mem_map; 79 struct page *mem_map;
80 80
81 EXPORT_SYMBOL(max_mapnr); 81 EXPORT_SYMBOL(max_mapnr);
82 EXPORT_SYMBOL(mem_map); 82 EXPORT_SYMBOL(mem_map);
83 #endif 83 #endif
84 84
85 unsigned long num_physpages; 85 unsigned long num_physpages;
86 /* 86 /*
87 * A number of key systems in x86 including ioremap() rely on the assumption 87 * A number of key systems in x86 including ioremap() rely on the assumption
88 * that high_memory defines the upper bound on direct map memory, then end 88 * that high_memory defines the upper bound on direct map memory, then end
89 * of ZONE_NORMAL. Under CONFIG_DISCONTIG this means that max_low_pfn and 89 * of ZONE_NORMAL. Under CONFIG_DISCONTIG this means that max_low_pfn and
90 * highstart_pfn must be the same; there must be no gap between ZONE_NORMAL 90 * highstart_pfn must be the same; there must be no gap between ZONE_NORMAL
91 * and ZONE_HIGHMEM. 91 * and ZONE_HIGHMEM.
92 */ 92 */
93 void * high_memory; 93 void * high_memory;
94 94
95 EXPORT_SYMBOL(num_physpages); 95 EXPORT_SYMBOL(num_physpages);
96 EXPORT_SYMBOL(high_memory); 96 EXPORT_SYMBOL(high_memory);
97 97
98 /* 98 /*
99 * Randomize the address space (stacks, mmaps, brk, etc.). 99 * Randomize the address space (stacks, mmaps, brk, etc.).
100 * 100 *
101 * ( When CONFIG_COMPAT_BRK=y we exclude brk from randomization, 101 * ( When CONFIG_COMPAT_BRK=y we exclude brk from randomization,
102 * as ancient (libc5 based) binaries can segfault. ) 102 * as ancient (libc5 based) binaries can segfault. )
103 */ 103 */
104 int randomize_va_space __read_mostly = 104 int randomize_va_space __read_mostly =
105 #ifdef CONFIG_COMPAT_BRK 105 #ifdef CONFIG_COMPAT_BRK
106 1; 106 1;
107 #else 107 #else
108 2; 108 2;
109 #endif 109 #endif
110 110
111 static int __init disable_randmaps(char *s) 111 static int __init disable_randmaps(char *s)
112 { 112 {
113 randomize_va_space = 0; 113 randomize_va_space = 0;
114 return 1; 114 return 1;
115 } 115 }
116 __setup("norandmaps", disable_randmaps); 116 __setup("norandmaps", disable_randmaps);
117 117
118 unsigned long zero_pfn __read_mostly; 118 unsigned long zero_pfn __read_mostly;
119 unsigned long highest_memmap_pfn __read_mostly; 119 unsigned long highest_memmap_pfn __read_mostly;
120 120
121 /* 121 /*
122 * CONFIG_MMU architectures set up ZERO_PAGE in their paging_init() 122 * CONFIG_MMU architectures set up ZERO_PAGE in their paging_init()
123 */ 123 */
124 static int __init init_zero_pfn(void) 124 static int __init init_zero_pfn(void)
125 { 125 {
126 zero_pfn = page_to_pfn(ZERO_PAGE(0)); 126 zero_pfn = page_to_pfn(ZERO_PAGE(0));
127 return 0; 127 return 0;
128 } 128 }
129 core_initcall(init_zero_pfn); 129 core_initcall(init_zero_pfn);
130 130
131 131
132 #if defined(SPLIT_RSS_COUNTING) 132 #if defined(SPLIT_RSS_COUNTING)
133 133
134 void sync_mm_rss(struct mm_struct *mm) 134 void sync_mm_rss(struct mm_struct *mm)
135 { 135 {
136 int i; 136 int i;
137 137
138 for (i = 0; i < NR_MM_COUNTERS; i++) { 138 for (i = 0; i < NR_MM_COUNTERS; i++) {
139 if (current->rss_stat.count[i]) { 139 if (current->rss_stat.count[i]) {
140 add_mm_counter(mm, i, current->rss_stat.count[i]); 140 add_mm_counter(mm, i, current->rss_stat.count[i]);
141 current->rss_stat.count[i] = 0; 141 current->rss_stat.count[i] = 0;
142 } 142 }
143 } 143 }
144 current->rss_stat.events = 0; 144 current->rss_stat.events = 0;
145 } 145 }
146 146
147 static void add_mm_counter_fast(struct mm_struct *mm, int member, int val) 147 static void add_mm_counter_fast(struct mm_struct *mm, int member, int val)
148 { 148 {
149 struct task_struct *task = current; 149 struct task_struct *task = current;
150 150
151 if (likely(task->mm == mm)) 151 if (likely(task->mm == mm))
152 task->rss_stat.count[member] += val; 152 task->rss_stat.count[member] += val;
153 else 153 else
154 add_mm_counter(mm, member, val); 154 add_mm_counter(mm, member, val);
155 } 155 }
156 #define inc_mm_counter_fast(mm, member) add_mm_counter_fast(mm, member, 1) 156 #define inc_mm_counter_fast(mm, member) add_mm_counter_fast(mm, member, 1)
157 #define dec_mm_counter_fast(mm, member) add_mm_counter_fast(mm, member, -1) 157 #define dec_mm_counter_fast(mm, member) add_mm_counter_fast(mm, member, -1)
158 158
159 /* sync counter once per 64 page faults */ 159 /* sync counter once per 64 page faults */
160 #define TASK_RSS_EVENTS_THRESH (64) 160 #define TASK_RSS_EVENTS_THRESH (64)
161 static void check_sync_rss_stat(struct task_struct *task) 161 static void check_sync_rss_stat(struct task_struct *task)
162 { 162 {
163 if (unlikely(task != current)) 163 if (unlikely(task != current))
164 return; 164 return;
165 if (unlikely(task->rss_stat.events++ > TASK_RSS_EVENTS_THRESH)) 165 if (unlikely(task->rss_stat.events++ > TASK_RSS_EVENTS_THRESH))
166 sync_mm_rss(task->mm); 166 sync_mm_rss(task->mm);
167 } 167 }
168 #else /* SPLIT_RSS_COUNTING */ 168 #else /* SPLIT_RSS_COUNTING */
169 169
170 #define inc_mm_counter_fast(mm, member) inc_mm_counter(mm, member) 170 #define inc_mm_counter_fast(mm, member) inc_mm_counter(mm, member)
171 #define dec_mm_counter_fast(mm, member) dec_mm_counter(mm, member) 171 #define dec_mm_counter_fast(mm, member) dec_mm_counter(mm, member)
172 172
173 static void check_sync_rss_stat(struct task_struct *task) 173 static void check_sync_rss_stat(struct task_struct *task)
174 { 174 {
175 } 175 }
176 176
177 #endif /* SPLIT_RSS_COUNTING */ 177 #endif /* SPLIT_RSS_COUNTING */
178 178
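A minimal usage sketch may help here (editor's addition, not part of this file): a fault-handling path bumps the cheap task-local counter and lets check_sync_rss_stat() fold it back into the mm_struct once TASK_RSS_EVENTS_THRESH events have accumulated. The helper name below is hypothetical.

/* Editor's sketch: hypothetical caller of the fast RSS helpers above. */
static void demo_account_anon_fault(struct mm_struct *mm)
{
        inc_mm_counter_fast(mm, MM_ANONPAGES);  /* cheap, task-local update */
        check_sync_rss_stat(current);           /* occasional global sync   */
}
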
179 #ifdef HAVE_GENERIC_MMU_GATHER 179 #ifdef HAVE_GENERIC_MMU_GATHER
180 180
181 static int tlb_next_batch(struct mmu_gather *tlb) 181 static int tlb_next_batch(struct mmu_gather *tlb)
182 { 182 {
183 struct mmu_gather_batch *batch; 183 struct mmu_gather_batch *batch;
184 184
185 batch = tlb->active; 185 batch = tlb->active;
186 if (batch->next) { 186 if (batch->next) {
187 tlb->active = batch->next; 187 tlb->active = batch->next;
188 return 1; 188 return 1;
189 } 189 }
190 190
191 if (tlb->batch_count == MAX_GATHER_BATCH_COUNT) 191 if (tlb->batch_count == MAX_GATHER_BATCH_COUNT)
192 return 0; 192 return 0;
193 193
194 batch = (void *)__get_free_pages(GFP_NOWAIT | __GFP_NOWARN, 0); 194 batch = (void *)__get_free_pages(GFP_NOWAIT | __GFP_NOWARN, 0);
195 if (!batch) 195 if (!batch)
196 return 0; 196 return 0;
197 197
198 tlb->batch_count++; 198 tlb->batch_count++;
199 batch->next = NULL; 199 batch->next = NULL;
200 batch->nr = 0; 200 batch->nr = 0;
201 batch->max = MAX_GATHER_BATCH; 201 batch->max = MAX_GATHER_BATCH;
202 202
203 tlb->active->next = batch; 203 tlb->active->next = batch;
204 tlb->active = batch; 204 tlb->active = batch;
205 205
206 return 1; 206 return 1;
207 } 207 }
208 208
209 /* tlb_gather_mmu 209 /* tlb_gather_mmu
210 * Called to initialize an (on-stack) mmu_gather structure for page-table 210 * Called to initialize an (on-stack) mmu_gather structure for page-table
211 * tear-down from @mm. The @fullmm argument is used when @mm is without 211 * tear-down from @mm. The @fullmm argument is used when @mm is without
212 * users and we're going to destroy the full address space (exit/execve). 212 * users and we're going to destroy the full address space (exit/execve).
213 */ 213 */
214 void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, bool fullmm) 214 void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, bool fullmm)
215 { 215 {
216 tlb->mm = mm; 216 tlb->mm = mm;
217 217
218 tlb->fullmm = fullmm; 218 tlb->fullmm = fullmm;
219 tlb->need_flush_all = 0; 219 tlb->need_flush_all = 0;
220 tlb->start = -1UL; 220 tlb->start = -1UL;
221 tlb->end = 0; 221 tlb->end = 0;
222 tlb->need_flush = 0; 222 tlb->need_flush = 0;
223 tlb->fast_mode = (num_possible_cpus() == 1); 223 tlb->fast_mode = (num_possible_cpus() == 1);
224 tlb->local.next = NULL; 224 tlb->local.next = NULL;
225 tlb->local.nr = 0; 225 tlb->local.nr = 0;
226 tlb->local.max = ARRAY_SIZE(tlb->__pages); 226 tlb->local.max = ARRAY_SIZE(tlb->__pages);
227 tlb->active = &tlb->local; 227 tlb->active = &tlb->local;
228 tlb->batch_count = 0; 228 tlb->batch_count = 0;
229 229
230 #ifdef CONFIG_HAVE_RCU_TABLE_FREE 230 #ifdef CONFIG_HAVE_RCU_TABLE_FREE
231 tlb->batch = NULL; 231 tlb->batch = NULL;
232 #endif 232 #endif
233 } 233 }
234 234
235 void tlb_flush_mmu(struct mmu_gather *tlb) 235 void tlb_flush_mmu(struct mmu_gather *tlb)
236 { 236 {
237 struct mmu_gather_batch *batch; 237 struct mmu_gather_batch *batch;
238 238
239 if (!tlb->need_flush) 239 if (!tlb->need_flush)
240 return; 240 return;
241 tlb->need_flush = 0; 241 tlb->need_flush = 0;
242 tlb_flush(tlb); 242 tlb_flush(tlb);
243 #ifdef CONFIG_HAVE_RCU_TABLE_FREE 243 #ifdef CONFIG_HAVE_RCU_TABLE_FREE
244 tlb_table_flush(tlb); 244 tlb_table_flush(tlb);
245 #endif 245 #endif
246 246
247 if (tlb_fast_mode(tlb)) 247 if (tlb_fast_mode(tlb))
248 return; 248 return;
249 249
250 for (batch = &tlb->local; batch; batch = batch->next) { 250 for (batch = &tlb->local; batch; batch = batch->next) {
251 free_pages_and_swap_cache(batch->pages, batch->nr); 251 free_pages_and_swap_cache(batch->pages, batch->nr);
252 batch->nr = 0; 252 batch->nr = 0;
253 } 253 }
254 tlb->active = &tlb->local; 254 tlb->active = &tlb->local;
255 } 255 }
256 256
257 /* tlb_finish_mmu 257 /* tlb_finish_mmu
258 * Called at the end of the shootdown operation to free up any resources 258 * Called at the end of the shootdown operation to free up any resources
259 * that were required. 259 * that were required.
260 */ 260 */
261 void tlb_finish_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long end) 261 void tlb_finish_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long end)
262 { 262 {
263 struct mmu_gather_batch *batch, *next; 263 struct mmu_gather_batch *batch, *next;
264 264
265 tlb->start = start; 265 tlb->start = start;
266 tlb->end = end; 266 tlb->end = end;
267 tlb_flush_mmu(tlb); 267 tlb_flush_mmu(tlb);
268 268
269 /* keep the page table cache within bounds */ 269 /* keep the page table cache within bounds */
270 check_pgt_cache(); 270 check_pgt_cache();
271 271
272 for (batch = tlb->local.next; batch; batch = next) { 272 for (batch = tlb->local.next; batch; batch = next) {
273 next = batch->next; 273 next = batch->next;
274 free_pages((unsigned long)batch, 0); 274 free_pages((unsigned long)batch, 0);
275 } 275 }
276 tlb->local.next = NULL; 276 tlb->local.next = NULL;
277 } 277 }
278 278
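Taken together, tlb_gather_mmu(), tlb_flush_mmu() and tlb_finish_mmu() form a bracketed protocol around a page-table teardown. The fragment below is an editor's illustration of a typical caller; the function name and the elided unmap step are placeholders, not code from this file.

/* Editor's sketch: the mmu_gather lifecycle around a hypothetical unmap step. */
static void demo_teardown_range(struct mm_struct *mm,
                                unsigned long start, unsigned long end)
{
        struct mmu_gather tlb;

        tlb_gather_mmu(&tlb, mm, false);        /* on-stack gather, not a full-mm teardown */
        /* ... clear ptes and queue their pages via __tlb_remove_page() ... */
        tlb_finish_mmu(&tlb, start, end);       /* final TLB flush, then free the batches  */
}
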
279 /* __tlb_remove_page 279 /* __tlb_remove_page
280 * Must perform the equivalent to __free_pte(pte_get_and_clear(ptep)), while 280 * Must perform the equivalent to __free_pte(pte_get_and_clear(ptep)), while
281 * handling the additional races in SMP caused by other CPUs caching valid 281 * handling the additional races in SMP caused by other CPUs caching valid
282 * mappings in their TLBs. Returns the number of free page slots left. 282 * mappings in their TLBs. Returns the number of free page slots left.
283 * When out of page slots we must call tlb_flush_mmu(). 283 * When out of page slots we must call tlb_flush_mmu().
284 */ 284 */
285 int __tlb_remove_page(struct mmu_gather *tlb, struct page *page) 285 int __tlb_remove_page(struct mmu_gather *tlb, struct page *page)
286 { 286 {
287 struct mmu_gather_batch *batch; 287 struct mmu_gather_batch *batch;
288 288
289 VM_BUG_ON(!tlb->need_flush); 289 VM_BUG_ON(!tlb->need_flush);
290 290
291 if (tlb_fast_mode(tlb)) { 291 if (tlb_fast_mode(tlb)) {
292 free_page_and_swap_cache(page); 292 free_page_and_swap_cache(page);
293 return 1; /* avoid calling tlb_flush_mmu() */ 293 return 1; /* avoid calling tlb_flush_mmu() */
294 } 294 }
295 295
296 batch = tlb->active; 296 batch = tlb->active;
297 batch->pages[batch->nr++] = page; 297 batch->pages[batch->nr++] = page;
298 if (batch->nr == batch->max) { 298 if (batch->nr == batch->max) {
299 if (!tlb_next_batch(tlb)) 299 if (!tlb_next_batch(tlb))
300 return 0; 300 return 0;
301 batch = tlb->active; 301 batch = tlb->active;
302 } 302 }
303 VM_BUG_ON(batch->nr > batch->max); 303 VM_BUG_ON(batch->nr > batch->max);
304 304
305 return batch->max - batch->nr; 305 return batch->max - batch->nr;
306 } 306 }
307 307
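The return-value contract above ("returns the number of free page slots left; when out of slots, call tlb_flush_mmu()") is easiest to see as a loop. The sketch below is an editor's illustration only; real callers such as zap_pte_range() set need_flush through tlb_remove_tlb_entry() rather than by hand.

/* Editor's sketch: hypothetical batched release driven by the API above. */
static void demo_batch_release(struct mmu_gather *tlb,
                               struct page **pages, int nr)
{
        int i;

        for (i = 0; i < nr; i++) {
                tlb->need_flush = 1;                    /* simplification, see lead-in */
                if (!__tlb_remove_page(tlb, pages[i]))  /* 0 means the batch is full   */
                        tlb_flush_mmu(tlb);             /* drain it, then keep going   */
        }
}
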
308 #endif /* HAVE_GENERIC_MMU_GATHER */ 308 #endif /* HAVE_GENERIC_MMU_GATHER */
309 309
310 #ifdef CONFIG_HAVE_RCU_TABLE_FREE 310 #ifdef CONFIG_HAVE_RCU_TABLE_FREE
311 311
312 /* 312 /*
313 * See the comment near struct mmu_table_batch. 313 * See the comment near struct mmu_table_batch.
314 */ 314 */
315 315
316 static void tlb_remove_table_smp_sync(void *arg) 316 static void tlb_remove_table_smp_sync(void *arg)
317 { 317 {
318 /* Simply deliver the interrupt */ 318 /* Simply deliver the interrupt */
319 } 319 }
320 320
321 static void tlb_remove_table_one(void *table) 321 static void tlb_remove_table_one(void *table)
322 { 322 {
323 /* 323 /*
324 * This isn't an RCU grace period and hence the page-tables cannot be 324 * This isn't an RCU grace period and hence the page-tables cannot be
325 * assumed to be actually RCU-freed. 325 * assumed to be actually RCU-freed.
326 * 326 *
327 * It is however sufficient for software page-table walkers that rely on 327 * It is however sufficient for software page-table walkers that rely on
328 * IRQ disabling. See the comment near struct mmu_table_batch. 328 * IRQ disabling. See the comment near struct mmu_table_batch.
329 */ 329 */
330 smp_call_function(tlb_remove_table_smp_sync, NULL, 1); 330 smp_call_function(tlb_remove_table_smp_sync, NULL, 1);
331 __tlb_remove_table(table); 331 __tlb_remove_table(table);
332 } 332 }
333 333
334 static void tlb_remove_table_rcu(struct rcu_head *head) 334 static void tlb_remove_table_rcu(struct rcu_head *head)
335 { 335 {
336 struct mmu_table_batch *batch; 336 struct mmu_table_batch *batch;
337 int i; 337 int i;
338 338
339 batch = container_of(head, struct mmu_table_batch, rcu); 339 batch = container_of(head, struct mmu_table_batch, rcu);
340 340
341 for (i = 0; i < batch->nr; i++) 341 for (i = 0; i < batch->nr; i++)
342 __tlb_remove_table(batch->tables[i]); 342 __tlb_remove_table(batch->tables[i]);
343 343
344 free_page((unsigned long)batch); 344 free_page((unsigned long)batch);
345 } 345 }
346 346
347 void tlb_table_flush(struct mmu_gather *tlb) 347 void tlb_table_flush(struct mmu_gather *tlb)
348 { 348 {
349 struct mmu_table_batch **batch = &tlb->batch; 349 struct mmu_table_batch **batch = &tlb->batch;
350 350
351 if (*batch) { 351 if (*batch) {
352 call_rcu_sched(&(*batch)->rcu, tlb_remove_table_rcu); 352 call_rcu_sched(&(*batch)->rcu, tlb_remove_table_rcu);
353 *batch = NULL; 353 *batch = NULL;
354 } 354 }
355 } 355 }
356 356
357 void tlb_remove_table(struct mmu_gather *tlb, void *table) 357 void tlb_remove_table(struct mmu_gather *tlb, void *table)
358 { 358 {
359 struct mmu_table_batch **batch = &tlb->batch; 359 struct mmu_table_batch **batch = &tlb->batch;
360 360
361 tlb->need_flush = 1; 361 tlb->need_flush = 1;
362 362
363 /* 363 /*
364 * When there's less than two users of this mm there cannot be a 364 * When there's less than two users of this mm there cannot be a
365 * concurrent page-table walk. 365 * concurrent page-table walk.
366 */ 366 */
367 if (atomic_read(&tlb->mm->mm_users) < 2) { 367 if (atomic_read(&tlb->mm->mm_users) < 2) {
368 __tlb_remove_table(table); 368 __tlb_remove_table(table);
369 return; 369 return;
370 } 370 }
371 371
372 if (*batch == NULL) { 372 if (*batch == NULL) {
373 *batch = (struct mmu_table_batch *)__get_free_page(GFP_NOWAIT | __GFP_NOWARN); 373 *batch = (struct mmu_table_batch *)__get_free_page(GFP_NOWAIT | __GFP_NOWARN);
374 if (*batch == NULL) { 374 if (*batch == NULL) {
375 tlb_remove_table_one(table); 375 tlb_remove_table_one(table);
376 return; 376 return;
377 } 377 }
378 (*batch)->nr = 0; 378 (*batch)->nr = 0;
379 } 379 }
380 (*batch)->tables[(*batch)->nr++] = table; 380 (*batch)->tables[(*batch)->nr++] = table;
381 if ((*batch)->nr == MAX_TABLE_BATCH) 381 if ((*batch)->nr == MAX_TABLE_BATCH)
382 tlb_table_flush(tlb); 382 tlb_table_flush(tlb);
383 } 383 }
384 384
385 #endif /* CONFIG_HAVE_RCU_TABLE_FREE */ 385 #endif /* CONFIG_HAVE_RCU_TABLE_FREE */
386 386
387 /* 387 /*
388 * If a p?d_bad entry is found while walking page tables, report 388 * If a p?d_bad entry is found while walking page tables, report
389 * the error, before resetting entry to p?d_none. Usually (but 389 * the error, before resetting entry to p?d_none. Usually (but
390 * very seldom) called out from the p?d_none_or_clear_bad macros. 390 * very seldom) called out from the p?d_none_or_clear_bad macros.
391 */ 391 */
392 392
393 void pgd_clear_bad(pgd_t *pgd) 393 void pgd_clear_bad(pgd_t *pgd)
394 { 394 {
395 pgd_ERROR(*pgd); 395 pgd_ERROR(*pgd);
396 pgd_clear(pgd); 396 pgd_clear(pgd);
397 } 397 }
398 398
399 void pud_clear_bad(pud_t *pud) 399 void pud_clear_bad(pud_t *pud)
400 { 400 {
401 pud_ERROR(*pud); 401 pud_ERROR(*pud);
402 pud_clear(pud); 402 pud_clear(pud);
403 } 403 }
404 404
405 void pmd_clear_bad(pmd_t *pmd) 405 void pmd_clear_bad(pmd_t *pmd)
406 { 406 {
407 pmd_ERROR(*pmd); 407 pmd_ERROR(*pmd);
408 pmd_clear(pmd); 408 pmd_clear(pmd);
409 } 409 }
410 410
411 /* 411 /*
412 * Note: this doesn't free the actual pages themselves. That 412 * Note: this doesn't free the actual pages themselves. That
413 * has been handled earlier when unmapping all the memory regions. 413 * has been handled earlier when unmapping all the memory regions.
414 */ 414 */
415 static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd, 415 static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd,
416 unsigned long addr) 416 unsigned long addr)
417 { 417 {
418 pgtable_t token = pmd_pgtable(*pmd); 418 pgtable_t token = pmd_pgtable(*pmd);
419 pmd_clear(pmd); 419 pmd_clear(pmd);
420 pte_free_tlb(tlb, token, addr); 420 pte_free_tlb(tlb, token, addr);
421 tlb->mm->nr_ptes--; 421 tlb->mm->nr_ptes--;
422 } 422 }
423 423
424 static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud, 424 static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
425 unsigned long addr, unsigned long end, 425 unsigned long addr, unsigned long end,
426 unsigned long floor, unsigned long ceiling) 426 unsigned long floor, unsigned long ceiling)
427 { 427 {
428 pmd_t *pmd; 428 pmd_t *pmd;
429 unsigned long next; 429 unsigned long next;
430 unsigned long start; 430 unsigned long start;
431 431
432 start = addr; 432 start = addr;
433 pmd = pmd_offset(pud, addr); 433 pmd = pmd_offset(pud, addr);
434 do { 434 do {
435 next = pmd_addr_end(addr, end); 435 next = pmd_addr_end(addr, end);
436 if (pmd_none_or_clear_bad(pmd)) 436 if (pmd_none_or_clear_bad(pmd))
437 continue; 437 continue;
438 free_pte_range(tlb, pmd, addr); 438 free_pte_range(tlb, pmd, addr);
439 } while (pmd++, addr = next, addr != end); 439 } while (pmd++, addr = next, addr != end);
440 440
441 start &= PUD_MASK; 441 start &= PUD_MASK;
442 if (start < floor) 442 if (start < floor)
443 return; 443 return;
444 if (ceiling) { 444 if (ceiling) {
445 ceiling &= PUD_MASK; 445 ceiling &= PUD_MASK;
446 if (!ceiling) 446 if (!ceiling)
447 return; 447 return;
448 } 448 }
449 if (end - 1 > ceiling - 1) 449 if (end - 1 > ceiling - 1)
450 return; 450 return;
451 451
452 pmd = pmd_offset(pud, start); 452 pmd = pmd_offset(pud, start);
453 pud_clear(pud); 453 pud_clear(pud);
454 pmd_free_tlb(tlb, pmd, start); 454 pmd_free_tlb(tlb, pmd, start);
455 } 455 }
456 456
457 static inline void free_pud_range(struct mmu_gather *tlb, pgd_t *pgd, 457 static inline void free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
458 unsigned long addr, unsigned long end, 458 unsigned long addr, unsigned long end,
459 unsigned long floor, unsigned long ceiling) 459 unsigned long floor, unsigned long ceiling)
460 { 460 {
461 pud_t *pud; 461 pud_t *pud;
462 unsigned long next; 462 unsigned long next;
463 unsigned long start; 463 unsigned long start;
464 464
465 start = addr; 465 start = addr;
466 pud = pud_offset(pgd, addr); 466 pud = pud_offset(pgd, addr);
467 do { 467 do {
468 next = pud_addr_end(addr, end); 468 next = pud_addr_end(addr, end);
469 if (pud_none_or_clear_bad(pud)) 469 if (pud_none_or_clear_bad(pud))
470 continue; 470 continue;
471 free_pmd_range(tlb, pud, addr, next, floor, ceiling); 471 free_pmd_range(tlb, pud, addr, next, floor, ceiling);
472 } while (pud++, addr = next, addr != end); 472 } while (pud++, addr = next, addr != end);
473 473
474 start &= PGDIR_MASK; 474 start &= PGDIR_MASK;
475 if (start < floor) 475 if (start < floor)
476 return; 476 return;
477 if (ceiling) { 477 if (ceiling) {
478 ceiling &= PGDIR_MASK; 478 ceiling &= PGDIR_MASK;
479 if (!ceiling) 479 if (!ceiling)
480 return; 480 return;
481 } 481 }
482 if (end - 1 > ceiling - 1) 482 if (end - 1 > ceiling - 1)
483 return; 483 return;
484 484
485 pud = pud_offset(pgd, start); 485 pud = pud_offset(pgd, start);
486 pgd_clear(pgd); 486 pgd_clear(pgd);
487 pud_free_tlb(tlb, pud, start); 487 pud_free_tlb(tlb, pud, start);
488 } 488 }
489 489
490 /* 490 /*
491 * This function frees user-level page tables of a process. 491 * This function frees user-level page tables of a process.
492 * 492 *
493 * Must be called with pagetable lock held. 493 * Must be called with pagetable lock held.
494 */ 494 */
495 void free_pgd_range(struct mmu_gather *tlb, 495 void free_pgd_range(struct mmu_gather *tlb,
496 unsigned long addr, unsigned long end, 496 unsigned long addr, unsigned long end,
497 unsigned long floor, unsigned long ceiling) 497 unsigned long floor, unsigned long ceiling)
498 { 498 {
499 pgd_t *pgd; 499 pgd_t *pgd;
500 unsigned long next; 500 unsigned long next;
501 501
502 /* 502 /*
503 * The next few lines have given us lots of grief... 503 * The next few lines have given us lots of grief...
504 * 504 *
505 * Why are we testing PMD* at this top level? Because often 505 * Why are we testing PMD* at this top level? Because often
506 * there will be no work to do at all, and we'd prefer not to 506 * there will be no work to do at all, and we'd prefer not to
507 * go all the way down to the bottom just to discover that. 507 * go all the way down to the bottom just to discover that.
508 * 508 *
509 * Why all these "- 1"s? Because 0 represents both the bottom 509 * Why all these "- 1"s? Because 0 represents both the bottom
510 * of the address space and the top of it (using -1 for the 510 * of the address space and the top of it (using -1 for the
511 * top wouldn't help much: the masks would do the wrong thing). 511 * top wouldn't help much: the masks would do the wrong thing).
512 * The rule is that addr 0 and floor 0 refer to the bottom of 512 * The rule is that addr 0 and floor 0 refer to the bottom of
513 * the address space, but end 0 and ceiling 0 refer to the top 513 * the address space, but end 0 and ceiling 0 refer to the top
514 * Comparisons need to use "end - 1" and "ceiling - 1" (though 514 * Comparisons need to use "end - 1" and "ceiling - 1" (though
515 * that end 0 case should be mythical). 515 * that end 0 case should be mythical).
516 * 516 *
517 * Wherever addr is brought up or ceiling brought down, we must 517 * Wherever addr is brought up or ceiling brought down, we must
518 * be careful to reject "the opposite 0" before it confuses the 518 * be careful to reject "the opposite 0" before it confuses the
519 * subsequent tests. But what about where end is brought down 519 * subsequent tests. But what about where end is brought down
520 * by PMD_SIZE below? no, end can't go down to 0 there. 520 * by PMD_SIZE below? no, end can't go down to 0 there.
521 * 521 *
522 * Whereas we round start (addr) and ceiling down, by different 522 * Whereas we round start (addr) and ceiling down, by different
523 * masks at different levels, in order to test whether a table 523 * masks at different levels, in order to test whether a table
524 * now has no other vmas using it, so can be freed, we don't 524 * now has no other vmas using it, so can be freed, we don't
525 * bother to round floor or end up - the tests don't need that. 525 * bother to round floor or end up - the tests don't need that.
526 */ 526 */
527 527
528 addr &= PMD_MASK; 528 addr &= PMD_MASK;
529 if (addr < floor) { 529 if (addr < floor) {
530 addr += PMD_SIZE; 530 addr += PMD_SIZE;
531 if (!addr) 531 if (!addr)
532 return; 532 return;
533 } 533 }
534 if (ceiling) { 534 if (ceiling) {
535 ceiling &= PMD_MASK; 535 ceiling &= PMD_MASK;
536 if (!ceiling) 536 if (!ceiling)
537 return; 537 return;
538 } 538 }
539 if (end - 1 > ceiling - 1) 539 if (end - 1 > ceiling - 1)
540 end -= PMD_SIZE; 540 end -= PMD_SIZE;
541 if (addr > end - 1) 541 if (addr > end - 1)
542 return; 542 return;
543 543
544 pgd = pgd_offset(tlb->mm, addr); 544 pgd = pgd_offset(tlb->mm, addr);
545 do { 545 do {
546 next = pgd_addr_end(addr, end); 546 next = pgd_addr_end(addr, end);
547 if (pgd_none_or_clear_bad(pgd)) 547 if (pgd_none_or_clear_bad(pgd))
548 continue; 548 continue;
549 free_pud_range(tlb, pgd, addr, next, floor, ceiling); 549 free_pud_range(tlb, pgd, addr, next, floor, ceiling);
550 } while (pgd++, addr = next, addr != end); 550 } while (pgd++, addr = next, addr != end);
551 } 551 }
552 552
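The "- 1" trick described in the comment above is worth seeing in isolation. The helper below is an editor's sketch, not part of the file: because floor/ceiling use 0 for the very top of the address space, subtracting 1 makes ceiling == 0 wrap to ULONG_MAX, so a "top" ceiling never rejects anything, while a genuine low ceiling still rejects ranges that reach past it.

/* Editor's sketch: the unsigned-wraparound comparison relied on by the
 * free_p?d_range() helpers above, with 0 meaning "top of address space". */
static inline bool demo_end_within_ceiling(unsigned long end,
                                           unsigned long ceiling)
{
        return end - 1 <= ceiling - 1;
}
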
553 void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma, 553 void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma,
554 unsigned long floor, unsigned long ceiling) 554 unsigned long floor, unsigned long ceiling)
555 { 555 {
556 while (vma) { 556 while (vma) {
557 struct vm_area_struct *next = vma->vm_next; 557 struct vm_area_struct *next = vma->vm_next;
558 unsigned long addr = vma->vm_start; 558 unsigned long addr = vma->vm_start;
559 559
560 /* 560 /*
561 * Hide vma from rmap and truncate_pagecache before freeing 561 * Hide vma from rmap and truncate_pagecache before freeing
562 * pgtables 562 * pgtables
563 */ 563 */
564 unlink_anon_vmas(vma); 564 unlink_anon_vmas(vma);
565 unlink_file_vma(vma); 565 unlink_file_vma(vma);
566 566
567 if (is_vm_hugetlb_page(vma)) { 567 if (is_vm_hugetlb_page(vma)) {
568 hugetlb_free_pgd_range(tlb, addr, vma->vm_end, 568 hugetlb_free_pgd_range(tlb, addr, vma->vm_end,
569 floor, next? next->vm_start: ceiling); 569 floor, next? next->vm_start: ceiling);
570 } else { 570 } else {
571 /* 571 /*
572 * Optimization: gather nearby vmas into one call down 572 * Optimization: gather nearby vmas into one call down
573 */ 573 */
574 while (next && next->vm_start <= vma->vm_end + PMD_SIZE 574 while (next && next->vm_start <= vma->vm_end + PMD_SIZE
575 && !is_vm_hugetlb_page(next)) { 575 && !is_vm_hugetlb_page(next)) {
576 vma = next; 576 vma = next;
577 next = vma->vm_next; 577 next = vma->vm_next;
578 unlink_anon_vmas(vma); 578 unlink_anon_vmas(vma);
579 unlink_file_vma(vma); 579 unlink_file_vma(vma);
580 } 580 }
581 free_pgd_range(tlb, addr, vma->vm_end, 581 free_pgd_range(tlb, addr, vma->vm_end,
582 floor, next? next->vm_start: ceiling); 582 floor, next? next->vm_start: ceiling);
583 } 583 }
584 vma = next; 584 vma = next;
585 } 585 }
586 } 586 }
587 587
588 int __pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma, 588 int __pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
589 pmd_t *pmd, unsigned long address) 589 pmd_t *pmd, unsigned long address)
590 { 590 {
591 pgtable_t new = pte_alloc_one(mm, address); 591 pgtable_t new = pte_alloc_one(mm, address);
592 int wait_split_huge_page; 592 int wait_split_huge_page;
593 if (!new) 593 if (!new)
594 return -ENOMEM; 594 return -ENOMEM;
595 595
596 /* 596 /*
597 * Ensure all pte setup (eg. pte page lock and page clearing) is 597 * Ensure all pte setup (eg. pte page lock and page clearing) is
598 * visible before the pte is made visible to other CPUs by being 598 * visible before the pte is made visible to other CPUs by being
599 * put into page tables. 599 * put into page tables.
600 * 600 *
601 * The other side of the story is the pointer chasing in the page 601 * The other side of the story is the pointer chasing in the page
602 * table walking code (when walking the page table without locking; 602 * table walking code (when walking the page table without locking;
603 * ie. most of the time). Fortunately, these data accesses consist 603 * ie. most of the time). Fortunately, these data accesses consist
604 * of a chain of data-dependent loads, meaning most CPUs (alpha 604 * of a chain of data-dependent loads, meaning most CPUs (alpha
605 * being the notable exception) will already guarantee loads are 605 * being the notable exception) will already guarantee loads are
606 * seen in-order. See the alpha page table accessors for the 606 * seen in-order. See the alpha page table accessors for the
607 * smp_read_barrier_depends() barriers in page table walking code. 607 * smp_read_barrier_depends() barriers in page table walking code.
608 */ 608 */
609 smp_wmb(); /* Could be smp_wmb__xxx(before|after)_spin_lock */ 609 smp_wmb(); /* Could be smp_wmb__xxx(before|after)_spin_lock */
610 610
611 spin_lock(&mm->page_table_lock); 611 spin_lock(&mm->page_table_lock);
612 wait_split_huge_page = 0; 612 wait_split_huge_page = 0;
613 if (likely(pmd_none(*pmd))) { /* Has another populated it ? */ 613 if (likely(pmd_none(*pmd))) { /* Has another populated it ? */
614 mm->nr_ptes++; 614 mm->nr_ptes++;
615 pmd_populate(mm, pmd, new); 615 pmd_populate(mm, pmd, new);
616 new = NULL; 616 new = NULL;
617 } else if (unlikely(pmd_trans_splitting(*pmd))) 617 } else if (unlikely(pmd_trans_splitting(*pmd)))
618 wait_split_huge_page = 1; 618 wait_split_huge_page = 1;
619 spin_unlock(&mm->page_table_lock); 619 spin_unlock(&mm->page_table_lock);
620 if (new) 620 if (new)
621 pte_free(mm, new); 621 pte_free(mm, new);
622 if (wait_split_huge_page) 622 if (wait_split_huge_page)
623 wait_split_huge_page(vma->anon_vma, pmd); 623 wait_split_huge_page(vma->anon_vma, pmd);
624 return 0; 624 return 0;
625 } 625 }
626 626
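The comment above describes a classic publish pattern, so a stripped-down sketch may help. Everything below is an editor's illustration with invented names (demo_obj, demo_publish(), demo_read()), not kernel code: the writer initialises the object completely, orders that initialisation against publication with smp_wmb(), and only then stores the pointer; the reader's accesses form a data-dependent chain, which every architecture except alpha already keeps ordered (alpha needs smp_read_barrier_depends()).

/* Editor's sketch: the smp_wmb() publish pattern described above. */
struct demo_obj {
        int payload;
};
static struct demo_obj *demo_shared;            /* hypothetical shared pointer */

static void demo_publish(struct demo_obj *obj)
{
        obj->payload = 42;              /* 1: initialise everything          */
        smp_wmb();                      /* 2: order the init before publish  */
        demo_shared = obj;              /* 3: make the pointer visible       */
}

static int demo_read(void)
{
        struct demo_obj *obj = demo_shared;     /* data-dependent chain starts here */

        smp_read_barrier_depends();             /* no-op everywhere except alpha    */
        return obj ? obj->payload : -1;
}
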
627 int __pte_alloc_kernel(pmd_t *pmd, unsigned long address) 627 int __pte_alloc_kernel(pmd_t *pmd, unsigned long address)
628 { 628 {
629 pte_t *new = pte_alloc_one_kernel(&init_mm, address); 629 pte_t *new = pte_alloc_one_kernel(&init_mm, address);
630 if (!new) 630 if (!new)
631 return -ENOMEM; 631 return -ENOMEM;
632 632
633 smp_wmb(); /* See comment in __pte_alloc */ 633 smp_wmb(); /* See comment in __pte_alloc */
634 634
635 spin_lock(&init_mm.page_table_lock); 635 spin_lock(&init_mm.page_table_lock);
636 if (likely(pmd_none(*pmd))) { /* Has another populated it ? */ 636 if (likely(pmd_none(*pmd))) { /* Has another populated it ? */
637 pmd_populate_kernel(&init_mm, pmd, new); 637 pmd_populate_kernel(&init_mm, pmd, new);
638 new = NULL; 638 new = NULL;
639 } else 639 } else
640 VM_BUG_ON(pmd_trans_splitting(*pmd)); 640 VM_BUG_ON(pmd_trans_splitting(*pmd));
641 spin_unlock(&init_mm.page_table_lock); 641 spin_unlock(&init_mm.page_table_lock);
642 if (new) 642 if (new)
643 pte_free_kernel(&init_mm, new); 643 pte_free_kernel(&init_mm, new);
644 return 0; 644 return 0;
645 } 645 }
646 646
647 static inline void init_rss_vec(int *rss) 647 static inline void init_rss_vec(int *rss)
648 { 648 {
649 memset(rss, 0, sizeof(int) * NR_MM_COUNTERS); 649 memset(rss, 0, sizeof(int) * NR_MM_COUNTERS);
650 } 650 }
651 651
652 static inline void add_mm_rss_vec(struct mm_struct *mm, int *rss) 652 static inline void add_mm_rss_vec(struct mm_struct *mm, int *rss)
653 { 653 {
654 int i; 654 int i;
655 655
656 if (current->mm == mm) 656 if (current->mm == mm)
657 sync_mm_rss(mm); 657 sync_mm_rss(mm);
658 for (i = 0; i < NR_MM_COUNTERS; i++) 658 for (i = 0; i < NR_MM_COUNTERS; i++)
659 if (rss[i]) 659 if (rss[i])
660 add_mm_counter(mm, i, rss[i]); 660 add_mm_counter(mm, i, rss[i]);
661 } 661 }
662 662
663 /* 663 /*
664 * This function is called to print an error when a bad pte 664 * This function is called to print an error when a bad pte
665 * is found. For example, we might have a PFN-mapped pte in 665 * is found. For example, we might have a PFN-mapped pte in
666 * a region that doesn't allow it. 666 * a region that doesn't allow it.
667 * 667 *
668 * The calling function must still handle the error. 668 * The calling function must still handle the error.
669 */ 669 */
670 static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr, 670 static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr,
671 pte_t pte, struct page *page) 671 pte_t pte, struct page *page)
672 { 672 {
673 pgd_t *pgd = pgd_offset(vma->vm_mm, addr); 673 pgd_t *pgd = pgd_offset(vma->vm_mm, addr);
674 pud_t *pud = pud_offset(pgd, addr); 674 pud_t *pud = pud_offset(pgd, addr);
675 pmd_t *pmd = pmd_offset(pud, addr); 675 pmd_t *pmd = pmd_offset(pud, addr);
676 struct address_space *mapping; 676 struct address_space *mapping;
677 pgoff_t index; 677 pgoff_t index;
678 static unsigned long resume; 678 static unsigned long resume;
679 static unsigned long nr_shown; 679 static unsigned long nr_shown;
680 static unsigned long nr_unshown; 680 static unsigned long nr_unshown;
681 681
682 /* 682 /*
683 * Allow a burst of 60 reports, then keep quiet for that minute; 683 * Allow a burst of 60 reports, then keep quiet for that minute;
684 * or allow a steady drip of one report per second. 684 * or allow a steady drip of one report per second.
685 */ 685 */
686 if (nr_shown == 60) { 686 if (nr_shown == 60) {
687 if (time_before(jiffies, resume)) { 687 if (time_before(jiffies, resume)) {
688 nr_unshown++; 688 nr_unshown++;
689 return; 689 return;
690 } 690 }
691 if (nr_unshown) { 691 if (nr_unshown) {
692 printk(KERN_ALERT 692 printk(KERN_ALERT
693 "BUG: Bad page map: %lu messages suppressed\n", 693 "BUG: Bad page map: %lu messages suppressed\n",
694 nr_unshown); 694 nr_unshown);
695 nr_unshown = 0; 695 nr_unshown = 0;
696 } 696 }
697 nr_shown = 0; 697 nr_shown = 0;
698 } 698 }
699 if (nr_shown++ == 0) 699 if (nr_shown++ == 0)
700 resume = jiffies + 60 * HZ; 700 resume = jiffies + 60 * HZ;
701 701
702 mapping = vma->vm_file ? vma->vm_file->f_mapping : NULL; 702 mapping = vma->vm_file ? vma->vm_file->f_mapping : NULL;
703 index = linear_page_index(vma, addr); 703 index = linear_page_index(vma, addr);
704 704
705 printk(KERN_ALERT 705 printk(KERN_ALERT
706 "BUG: Bad page map in process %s pte:%08llx pmd:%08llx\n", 706 "BUG: Bad page map in process %s pte:%08llx pmd:%08llx\n",
707 current->comm, 707 current->comm,
708 (long long)pte_val(pte), (long long)pmd_val(*pmd)); 708 (long long)pte_val(pte), (long long)pmd_val(*pmd));
709 if (page) 709 if (page)
710 dump_page(page); 710 dump_page(page);
711 printk(KERN_ALERT 711 printk(KERN_ALERT
712 "addr:%p vm_flags:%08lx anon_vma:%p mapping:%p index:%lx\n", 712 "addr:%p vm_flags:%08lx anon_vma:%p mapping:%p index:%lx\n",
713 (void *)addr, vma->vm_flags, vma->anon_vma, mapping, index); 713 (void *)addr, vma->vm_flags, vma->anon_vma, mapping, index);
714 /* 714 /*
715 * Choose text because data symbols depend on CONFIG_KALLSYMS_ALL=y 715 * Choose text because data symbols depend on CONFIG_KALLSYMS_ALL=y
716 */ 716 */
717 if (vma->vm_ops) 717 if (vma->vm_ops)
718 print_symbol(KERN_ALERT "vma->vm_ops->fault: %s\n", 718 print_symbol(KERN_ALERT "vma->vm_ops->fault: %s\n",
719 (unsigned long)vma->vm_ops->fault); 719 (unsigned long)vma->vm_ops->fault);
720 if (vma->vm_file && vma->vm_file->f_op) 720 if (vma->vm_file && vma->vm_file->f_op)
721 print_symbol(KERN_ALERT "vma->vm_file->f_op->mmap: %s\n", 721 print_symbol(KERN_ALERT "vma->vm_file->f_op->mmap: %s\n",
722 (unsigned long)vma->vm_file->f_op->mmap); 722 (unsigned long)vma->vm_file->f_op->mmap);
723 dump_stack(); 723 dump_stack();
724 add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); 724 add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
725 } 725 }
726 726
727 static inline bool is_cow_mapping(vm_flags_t flags) 727 static inline bool is_cow_mapping(vm_flags_t flags)
728 { 728 {
729 return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; 729 return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
730 } 730 }
731 731
732 /* 732 /*
733 * vm_normal_page -- This function gets the "struct page" associated with a pte. 733 * vm_normal_page -- This function gets the "struct page" associated with a pte.
734 * 734 *
735 * "Special" mappings do not wish to be associated with a "struct page" (either 735 * "Special" mappings do not wish to be associated with a "struct page" (either
736 * it doesn't exist, or it exists but they don't want to touch it). In this 736 * it doesn't exist, or it exists but they don't want to touch it). In this
737 * case, NULL is returned here. "Normal" mappings do have a struct page. 737 * case, NULL is returned here. "Normal" mappings do have a struct page.
738 * 738 *
739 * There are 2 broad cases. Firstly, an architecture may define a pte_special() 739 * There are 2 broad cases. Firstly, an architecture may define a pte_special()
740 * pte bit, in which case this function is trivial. Secondly, an architecture 740 * pte bit, in which case this function is trivial. Secondly, an architecture
741 * may not have a spare pte bit, which requires a more complicated scheme, 741 * may not have a spare pte bit, which requires a more complicated scheme,
742 * described below. 742 * described below.
743 * 743 *
744 * A raw VM_PFNMAP mapping (ie. one that is not COWed) is always considered a 744 * A raw VM_PFNMAP mapping (ie. one that is not COWed) is always considered a
745 * special mapping (even if there are underlying and valid "struct pages"). 745 * special mapping (even if there are underlying and valid "struct pages").
746 * COWed pages of a VM_PFNMAP are always normal. 746 * COWed pages of a VM_PFNMAP are always normal.
747 * 747 *
748 * The way we recognize COWed pages within VM_PFNMAP mappings is through the 748 * The way we recognize COWed pages within VM_PFNMAP mappings is through the
749 * rules set up by "remap_pfn_range()": the vma will have the VM_PFNMAP bit 749 * rules set up by "remap_pfn_range()": the vma will have the VM_PFNMAP bit
750 * set, and the vm_pgoff will point to the first PFN mapped: thus every special 750 * set, and the vm_pgoff will point to the first PFN mapped: thus every special
751 * mapping will always honor the rule 751 * mapping will always honor the rule
752 * 752 *
753 * pfn_of_page == vma->vm_pgoff + ((addr - vma->vm_start) >> PAGE_SHIFT) 753 * pfn_of_page == vma->vm_pgoff + ((addr - vma->vm_start) >> PAGE_SHIFT)
754 * 754 *
755 * And for normal mappings this is false. 755 * And for normal mappings this is false.
756 * 756 *
757 * This restricts such mappings to be a linear translation from virtual address 757 * This restricts such mappings to be a linear translation from virtual address
758 * to pfn. To get around this restriction, we allow arbitrary mappings so long 758 * to pfn. To get around this restriction, we allow arbitrary mappings so long
759 * as the vma is not a COW mapping; in that case, we know that all ptes are 759 * as the vma is not a COW mapping; in that case, we know that all ptes are
760 * special (because none can have been COWed). 760 * special (because none can have been COWed).
761 * 761 *
762 * 762 *
763 * In order to support COW of arbitrary special mappings, we have VM_MIXEDMAP. 763 * In order to support COW of arbitrary special mappings, we have VM_MIXEDMAP.
764 * 764 *
765 * VM_MIXEDMAP mappings can likewise contain memory with or without "struct 765 * VM_MIXEDMAP mappings can likewise contain memory with or without "struct
766 * page" backing, however the difference is that _all_ pages with a struct 766 * page" backing, however the difference is that _all_ pages with a struct
767 * page (that is, those where pfn_valid is true) are refcounted and considered 767 * page (that is, those where pfn_valid is true) are refcounted and considered
768 * normal pages by the VM. The disadvantage is that pages are refcounted 768 * normal pages by the VM. The disadvantage is that pages are refcounted
769 * (which can be slower and simply not an option for some PFNMAP users). The 769 * (which can be slower and simply not an option for some PFNMAP users). The
770 * advantage is that we don't have to follow the strict linearity rule of 770 * advantage is that we don't have to follow the strict linearity rule of
771 * PFNMAP mappings in order to support COWable mappings. 771 * PFNMAP mappings in order to support COWable mappings.
772 * 772 *
773 */ 773 */
774 #ifdef __HAVE_ARCH_PTE_SPECIAL 774 #ifdef __HAVE_ARCH_PTE_SPECIAL
775 # define HAVE_PTE_SPECIAL 1 775 # define HAVE_PTE_SPECIAL 1
776 #else 776 #else
777 # define HAVE_PTE_SPECIAL 0 777 # define HAVE_PTE_SPECIAL 0
778 #endif 778 #endif
779 struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, 779 struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
780 pte_t pte) 780 pte_t pte)
781 { 781 {
782 unsigned long pfn = pte_pfn(pte); 782 unsigned long pfn = pte_pfn(pte);
783 783
784 if (HAVE_PTE_SPECIAL) { 784 if (HAVE_PTE_SPECIAL) {
785 if (likely(!pte_special(pte))) 785 if (likely(!pte_special(pte)))
786 goto check_pfn; 786 goto check_pfn;
787 if (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP)) 787 if (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP))
788 return NULL; 788 return NULL;
789 if (!is_zero_pfn(pfn)) 789 if (!is_zero_pfn(pfn))
790 print_bad_pte(vma, addr, pte, NULL); 790 print_bad_pte(vma, addr, pte, NULL);
791 return NULL; 791 return NULL;
792 } 792 }
793 793
794 /* !HAVE_PTE_SPECIAL case follows: */ 794 /* !HAVE_PTE_SPECIAL case follows: */
795 795
796 if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) { 796 if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) {
797 if (vma->vm_flags & VM_MIXEDMAP) { 797 if (vma->vm_flags & VM_MIXEDMAP) {
798 if (!pfn_valid(pfn)) 798 if (!pfn_valid(pfn))
799 return NULL; 799 return NULL;
800 goto out; 800 goto out;
801 } else { 801 } else {
802 unsigned long off; 802 unsigned long off;
803 off = (addr - vma->vm_start) >> PAGE_SHIFT; 803 off = (addr - vma->vm_start) >> PAGE_SHIFT;
804 if (pfn == vma->vm_pgoff + off) 804 if (pfn == vma->vm_pgoff + off)
805 return NULL; 805 return NULL;
806 if (!is_cow_mapping(vma->vm_flags)) 806 if (!is_cow_mapping(vma->vm_flags))
807 return NULL; 807 return NULL;
808 } 808 }
809 } 809 }
810 810
811 if (is_zero_pfn(pfn)) 811 if (is_zero_pfn(pfn))
812 return NULL; 812 return NULL;
813 check_pfn: 813 check_pfn:
814 if (unlikely(pfn > highest_memmap_pfn)) { 814 if (unlikely(pfn > highest_memmap_pfn)) {
815 print_bad_pte(vma, addr, pte, NULL); 815 print_bad_pte(vma, addr, pte, NULL);
816 return NULL; 816 return NULL;
817 } 817 }
818 818
819 /* 819 /*
820 * NOTE! We still have PageReserved() pages in the page tables. 820 * NOTE! We still have PageReserved() pages in the page tables.
821 * eg. VDSO mappings can cause them to exist. 821 * eg. VDSO mappings can cause them to exist.
822 */ 822 */
823 out: 823 out:
824 return pfn_to_page(pfn); 824 return pfn_to_page(pfn);
825 } 825 }
826 826
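The remap_pfn_range() rule quoted in the comment before vm_normal_page() is exactly what the !HAVE_PTE_SPECIAL branch checks. Factored out as an editor's sketch (the helper name is invented; vm_normal_page() open-codes this test):

/* Editor's sketch: the linearity rule a raw VM_PFNMAP mapping must obey;
 * a pfn that matches is special, a COWed page cannot match and is normal. */
static inline bool demo_pfnmap_pfn_is_linear(struct vm_area_struct *vma,
                                             unsigned long addr,
                                             unsigned long pfn)
{
        unsigned long off = (addr - vma->vm_start) >> PAGE_SHIFT;

        return pfn == vma->vm_pgoff + off;
}
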
827 /* 827 /*
828 * copy one vm_area from one task to the other. Assumes the page tables 828 * copy one vm_area from one task to the other. Assumes the page tables
829 * already present in the new task to be cleared in the whole range 829 * already present in the new task to be cleared in the whole range
830 * covered by this vma. 830 * covered by this vma.
831 */ 831 */
832 832
833 static inline unsigned long 833 static inline unsigned long
834 copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, 834 copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
835 pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *vma, 835 pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *vma,
836 unsigned long addr, int *rss) 836 unsigned long addr, int *rss)
837 { 837 {
838 unsigned long vm_flags = vma->vm_flags; 838 unsigned long vm_flags = vma->vm_flags;
839 pte_t pte = *src_pte; 839 pte_t pte = *src_pte;
840 struct page *page; 840 struct page *page;
841 841
842 /* pte contains position in swap or file, so copy. */ 842 /* pte contains position in swap or file, so copy. */
843 if (unlikely(!pte_present(pte))) { 843 if (unlikely(!pte_present(pte))) {
844 if (!pte_file(pte)) { 844 if (!pte_file(pte)) {
845 swp_entry_t entry = pte_to_swp_entry(pte); 845 swp_entry_t entry = pte_to_swp_entry(pte);
846 846
847 if (swap_duplicate(entry) < 0) 847 if (swap_duplicate(entry) < 0)
848 return entry.val; 848 return entry.val;
849 849
850 /* make sure dst_mm is on swapoff's mmlist. */ 850 /* make sure dst_mm is on swapoff's mmlist. */
851 if (unlikely(list_empty(&dst_mm->mmlist))) { 851 if (unlikely(list_empty(&dst_mm->mmlist))) {
852 spin_lock(&mmlist_lock); 852 spin_lock(&mmlist_lock);
853 if (list_empty(&dst_mm->mmlist)) 853 if (list_empty(&dst_mm->mmlist))
854 list_add(&dst_mm->mmlist, 854 list_add(&dst_mm->mmlist,
855 &src_mm->mmlist); 855 &src_mm->mmlist);
856 spin_unlock(&mmlist_lock); 856 spin_unlock(&mmlist_lock);
857 } 857 }
858 if (likely(!non_swap_entry(entry))) 858 if (likely(!non_swap_entry(entry)))
859 rss[MM_SWAPENTS]++; 859 rss[MM_SWAPENTS]++;
860 else if (is_migration_entry(entry)) { 860 else if (is_migration_entry(entry)) {
861 page = migration_entry_to_page(entry); 861 page = migration_entry_to_page(entry);
862 862
863 if (PageAnon(page)) 863 if (PageAnon(page))
864 rss[MM_ANONPAGES]++; 864 rss[MM_ANONPAGES]++;
865 else 865 else
866 rss[MM_FILEPAGES]++; 866 rss[MM_FILEPAGES]++;
867 867
868 if (is_write_migration_entry(entry) && 868 if (is_write_migration_entry(entry) &&
869 is_cow_mapping(vm_flags)) { 869 is_cow_mapping(vm_flags)) {
870 /* 870 /*
871 * COW mappings require pages in both 871 * COW mappings require pages in both
872 * parent and child to be set to read. 872 * parent and child to be set to read.
873 */ 873 */
874 make_migration_entry_read(&entry); 874 make_migration_entry_read(&entry);
875 pte = swp_entry_to_pte(entry); 875 pte = swp_entry_to_pte(entry);
876 set_pte_at(src_mm, addr, src_pte, pte); 876 set_pte_at(src_mm, addr, src_pte, pte);
877 } 877 }
878 } 878 }
879 } 879 }
880 goto out_set_pte; 880 goto out_set_pte;
881 } 881 }
882 882
883 /* 883 /*
884 * If it's a COW mapping, write protect it both 884 * If it's a COW mapping, write protect it both
885 * in the parent and the child 885 * in the parent and the child
886 */ 886 */
887 if (is_cow_mapping(vm_flags)) { 887 if (is_cow_mapping(vm_flags)) {
888 ptep_set_wrprotect(src_mm, addr, src_pte); 888 ptep_set_wrprotect(src_mm, addr, src_pte);
889 pte = pte_wrprotect(pte); 889 pte = pte_wrprotect(pte);
890 } 890 }
891 891
892 /* 892 /*
893 * If it's a shared mapping, mark it clean in 893 * If it's a shared mapping, mark it clean in
894 * the child 894 * the child
895 */ 895 */
896 if (vm_flags & VM_SHARED) 896 if (vm_flags & VM_SHARED)
897 pte = pte_mkclean(pte); 897 pte = pte_mkclean(pte);
898 pte = pte_mkold(pte); 898 pte = pte_mkold(pte);
899 899
900 page = vm_normal_page(vma, addr, pte); 900 page = vm_normal_page(vma, addr, pte);
901 if (page) { 901 if (page) {
902 get_page(page); 902 get_page(page);
903 page_dup_rmap(page); 903 page_dup_rmap(page);
904 if (PageAnon(page)) 904 if (PageAnon(page))
905 rss[MM_ANONPAGES]++; 905 rss[MM_ANONPAGES]++;
906 else 906 else
907 rss[MM_FILEPAGES]++; 907 rss[MM_FILEPAGES]++;
908 } 908 }
909 909
910 out_set_pte: 910 out_set_pte:
911 set_pte_at(dst_mm, addr, dst_pte, pte); 911 set_pte_at(dst_mm, addr, dst_pte, pte);
912 return 0; 912 return 0;
913 } 913 }
914 914
915 int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, 915 int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
916 pmd_t *dst_pmd, pmd_t *src_pmd, struct vm_area_struct *vma, 916 pmd_t *dst_pmd, pmd_t *src_pmd, struct vm_area_struct *vma,
917 unsigned long addr, unsigned long end) 917 unsigned long addr, unsigned long end)
918 { 918 {
919 pte_t *orig_src_pte, *orig_dst_pte; 919 pte_t *orig_src_pte, *orig_dst_pte;
920 pte_t *src_pte, *dst_pte; 920 pte_t *src_pte, *dst_pte;
921 spinlock_t *src_ptl, *dst_ptl; 921 spinlock_t *src_ptl, *dst_ptl;
922 int progress = 0; 922 int progress = 0;
923 int rss[NR_MM_COUNTERS]; 923 int rss[NR_MM_COUNTERS];
924 swp_entry_t entry = (swp_entry_t){0}; 924 swp_entry_t entry = (swp_entry_t){0};
925 925
926 again: 926 again:
927 init_rss_vec(rss); 927 init_rss_vec(rss);
928 928
929 dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl); 929 dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl);
930 if (!dst_pte) 930 if (!dst_pte)
931 return -ENOMEM; 931 return -ENOMEM;
932 src_pte = pte_offset_map(src_pmd, addr); 932 src_pte = pte_offset_map(src_pmd, addr);
933 src_ptl = pte_lockptr(src_mm, src_pmd); 933 src_ptl = pte_lockptr(src_mm, src_pmd);
934 spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING); 934 spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
935 orig_src_pte = src_pte; 935 orig_src_pte = src_pte;
936 orig_dst_pte = dst_pte; 936 orig_dst_pte = dst_pte;
937 arch_enter_lazy_mmu_mode(); 937 arch_enter_lazy_mmu_mode();
938 938
939 do { 939 do {
940 /* 940 /*
941 * We are holding two locks at this point - either of them 941 * We are holding two locks at this point - either of them
942 * could generate latencies in another task on another CPU. 942 * could generate latencies in another task on another CPU.
943 */ 943 */
944 if (progress >= 32) { 944 if (progress >= 32) {
945 progress = 0; 945 progress = 0;
946 if (need_resched() || 946 if (need_resched() ||
947 spin_needbreak(src_ptl) || spin_needbreak(dst_ptl)) 947 spin_needbreak(src_ptl) || spin_needbreak(dst_ptl))
948 break; 948 break;
949 } 949 }
950 if (pte_none(*src_pte)) { 950 if (pte_none(*src_pte)) {
951 progress++; 951 progress++;
952 continue; 952 continue;
953 } 953 }
954 entry.val = copy_one_pte(dst_mm, src_mm, dst_pte, src_pte, 954 entry.val = copy_one_pte(dst_mm, src_mm, dst_pte, src_pte,
955 vma, addr, rss); 955 vma, addr, rss);
956 if (entry.val) 956 if (entry.val)
957 break; 957 break;
958 progress += 8; 958 progress += 8;
959 } while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end); 959 } while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end);
960 960
961 arch_leave_lazy_mmu_mode(); 961 arch_leave_lazy_mmu_mode();
962 spin_unlock(src_ptl); 962 spin_unlock(src_ptl);
963 pte_unmap(orig_src_pte); 963 pte_unmap(orig_src_pte);
964 add_mm_rss_vec(dst_mm, rss); 964 add_mm_rss_vec(dst_mm, rss);
965 pte_unmap_unlock(orig_dst_pte, dst_ptl); 965 pte_unmap_unlock(orig_dst_pte, dst_ptl);
966 cond_resched(); 966 cond_resched();
967 967
968 if (entry.val) { 968 if (entry.val) {
969 if (add_swap_count_continuation(entry, GFP_KERNEL) < 0) 969 if (add_swap_count_continuation(entry, GFP_KERNEL) < 0)
970 return -ENOMEM; 970 return -ENOMEM;
971 progress = 0; 971 progress = 0;
972 } 972 }
973 if (addr != end) 973 if (addr != end)
974 goto again; 974 goto again;
975 return 0; 975 return 0;
976 } 976 }
977 977
978 static inline int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, 978 static inline int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
979 pud_t *dst_pud, pud_t *src_pud, struct vm_area_struct *vma, 979 pud_t *dst_pud, pud_t *src_pud, struct vm_area_struct *vma,
980 unsigned long addr, unsigned long end) 980 unsigned long addr, unsigned long end)
981 { 981 {
982 pmd_t *src_pmd, *dst_pmd; 982 pmd_t *src_pmd, *dst_pmd;
983 unsigned long next; 983 unsigned long next;
984 984
985 dst_pmd = pmd_alloc(dst_mm, dst_pud, addr); 985 dst_pmd = pmd_alloc(dst_mm, dst_pud, addr);
986 if (!dst_pmd) 986 if (!dst_pmd)
987 return -ENOMEM; 987 return -ENOMEM;
988 src_pmd = pmd_offset(src_pud, addr); 988 src_pmd = pmd_offset(src_pud, addr);
989 do { 989 do {
990 next = pmd_addr_end(addr, end); 990 next = pmd_addr_end(addr, end);
991 if (pmd_trans_huge(*src_pmd)) { 991 if (pmd_trans_huge(*src_pmd)) {
992 int err; 992 int err;
993 VM_BUG_ON(next-addr != HPAGE_PMD_SIZE); 993 VM_BUG_ON(next-addr != HPAGE_PMD_SIZE);
994 err = copy_huge_pmd(dst_mm, src_mm, 994 err = copy_huge_pmd(dst_mm, src_mm,
995 dst_pmd, src_pmd, addr, vma); 995 dst_pmd, src_pmd, addr, vma);
996 if (err == -ENOMEM) 996 if (err == -ENOMEM)
997 return -ENOMEM; 997 return -ENOMEM;
998 if (!err) 998 if (!err)
999 continue; 999 continue;
1000 /* fall through */ 1000 /* fall through */
1001 } 1001 }
1002 if (pmd_none_or_clear_bad(src_pmd)) 1002 if (pmd_none_or_clear_bad(src_pmd))
1003 continue; 1003 continue;
1004 if (copy_pte_range(dst_mm, src_mm, dst_pmd, src_pmd, 1004 if (copy_pte_range(dst_mm, src_mm, dst_pmd, src_pmd,
1005 vma, addr, next)) 1005 vma, addr, next))
1006 return -ENOMEM; 1006 return -ENOMEM;
1007 } while (dst_pmd++, src_pmd++, addr = next, addr != end); 1007 } while (dst_pmd++, src_pmd++, addr = next, addr != end);
1008 return 0; 1008 return 0;
1009 } 1009 }
1010 1010
1011 static inline int copy_pud_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, 1011 static inline int copy_pud_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
1012 pgd_t *dst_pgd, pgd_t *src_pgd, struct vm_area_struct *vma, 1012 pgd_t *dst_pgd, pgd_t *src_pgd, struct vm_area_struct *vma,
1013 unsigned long addr, unsigned long end) 1013 unsigned long addr, unsigned long end)
1014 { 1014 {
1015 pud_t *src_pud, *dst_pud; 1015 pud_t *src_pud, *dst_pud;
1016 unsigned long next; 1016 unsigned long next;
1017 1017
1018 dst_pud = pud_alloc(dst_mm, dst_pgd, addr); 1018 dst_pud = pud_alloc(dst_mm, dst_pgd, addr);
1019 if (!dst_pud) 1019 if (!dst_pud)
1020 return -ENOMEM; 1020 return -ENOMEM;
1021 src_pud = pud_offset(src_pgd, addr); 1021 src_pud = pud_offset(src_pgd, addr);
1022 do { 1022 do {
1023 next = pud_addr_end(addr, end); 1023 next = pud_addr_end(addr, end);
1024 if (pud_none_or_clear_bad(src_pud)) 1024 if (pud_none_or_clear_bad(src_pud))
1025 continue; 1025 continue;
1026 if (copy_pmd_range(dst_mm, src_mm, dst_pud, src_pud, 1026 if (copy_pmd_range(dst_mm, src_mm, dst_pud, src_pud,
1027 vma, addr, next)) 1027 vma, addr, next))
1028 return -ENOMEM; 1028 return -ENOMEM;
1029 } while (dst_pud++, src_pud++, addr = next, addr != end); 1029 } while (dst_pud++, src_pud++, addr = next, addr != end);
1030 return 0; 1030 return 0;
1031 } 1031 }
1032 1032
1033 int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, 1033 int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
1034 struct vm_area_struct *vma) 1034 struct vm_area_struct *vma)
1035 { 1035 {
1036 pgd_t *src_pgd, *dst_pgd; 1036 pgd_t *src_pgd, *dst_pgd;
1037 unsigned long next; 1037 unsigned long next;
1038 unsigned long addr = vma->vm_start; 1038 unsigned long addr = vma->vm_start;
1039 unsigned long end = vma->vm_end; 1039 unsigned long end = vma->vm_end;
1040 unsigned long mmun_start; /* For mmu_notifiers */ 1040 unsigned long mmun_start; /* For mmu_notifiers */
1041 unsigned long mmun_end; /* For mmu_notifiers */ 1041 unsigned long mmun_end; /* For mmu_notifiers */
1042 bool is_cow; 1042 bool is_cow;
1043 int ret; 1043 int ret;
1044 1044
1045 /* 1045 /*
1046 * Don't copy ptes where a page fault will fill them correctly. 1046 * Don't copy ptes where a page fault will fill them correctly.
1047 * Fork becomes much lighter when there are big shared or private 1047 * Fork becomes much lighter when there are big shared or private
1048 * readonly mappings. The tradeoff is that copy_page_range is more 1048 * readonly mappings. The tradeoff is that copy_page_range is more
1049 * efficient than faulting. 1049 * efficient than faulting.
1050 */ 1050 */
1051 if (!(vma->vm_flags & (VM_HUGETLB | VM_NONLINEAR | 1051 if (!(vma->vm_flags & (VM_HUGETLB | VM_NONLINEAR |
1052 VM_PFNMAP | VM_MIXEDMAP))) { 1052 VM_PFNMAP | VM_MIXEDMAP))) {
1053 if (!vma->anon_vma) 1053 if (!vma->anon_vma)
1054 return 0; 1054 return 0;
1055 } 1055 }
1056 1056
1057 if (is_vm_hugetlb_page(vma)) 1057 if (is_vm_hugetlb_page(vma))
1058 return copy_hugetlb_page_range(dst_mm, src_mm, vma); 1058 return copy_hugetlb_page_range(dst_mm, src_mm, vma);
1059 1059
1060 if (unlikely(vma->vm_flags & VM_PFNMAP)) { 1060 if (unlikely(vma->vm_flags & VM_PFNMAP)) {
1061 /* 1061 /*
1062 * We do not free on error cases below as remove_vma 1062 * We do not free on error cases below as remove_vma
1063 * gets called on error from higher level routine 1063 * gets called on error from higher level routine
1064 */ 1064 */
1065 ret = track_pfn_copy(vma); 1065 ret = track_pfn_copy(vma);
1066 if (ret) 1066 if (ret)
1067 return ret; 1067 return ret;
1068 } 1068 }
1069 1069
1070 /* 1070 /*
1071 * We need to invalidate the secondary MMU mappings only when 1071 * We need to invalidate the secondary MMU mappings only when
1072 * there could be a permission downgrade on the ptes of the 1072 * there could be a permission downgrade on the ptes of the
1073 * parent mm. And a permission downgrade will only happen if 1073 * parent mm. And a permission downgrade will only happen if
1074 * is_cow_mapping() returns true. 1074 * is_cow_mapping() returns true.
1075 */ 1075 */
1076 is_cow = is_cow_mapping(vma->vm_flags); 1076 is_cow = is_cow_mapping(vma->vm_flags);
1077 mmun_start = addr; 1077 mmun_start = addr;
1078 mmun_end = end; 1078 mmun_end = end;
1079 if (is_cow) 1079 if (is_cow)
1080 mmu_notifier_invalidate_range_start(src_mm, mmun_start, 1080 mmu_notifier_invalidate_range_start(src_mm, mmun_start,
1081 mmun_end); 1081 mmun_end);
1082 1082
1083 ret = 0; 1083 ret = 0;
1084 dst_pgd = pgd_offset(dst_mm, addr); 1084 dst_pgd = pgd_offset(dst_mm, addr);
1085 src_pgd = pgd_offset(src_mm, addr); 1085 src_pgd = pgd_offset(src_mm, addr);
1086 do { 1086 do {
1087 next = pgd_addr_end(addr, end); 1087 next = pgd_addr_end(addr, end);
1088 if (pgd_none_or_clear_bad(src_pgd)) 1088 if (pgd_none_or_clear_bad(src_pgd))
1089 continue; 1089 continue;
1090 if (unlikely(copy_pud_range(dst_mm, src_mm, dst_pgd, src_pgd, 1090 if (unlikely(copy_pud_range(dst_mm, src_mm, dst_pgd, src_pgd,
1091 vma, addr, next))) { 1091 vma, addr, next))) {
1092 ret = -ENOMEM; 1092 ret = -ENOMEM;
1093 break; 1093 break;
1094 } 1094 }
1095 } while (dst_pgd++, src_pgd++, addr = next, addr != end); 1095 } while (dst_pgd++, src_pgd++, addr = next, addr != end);
1096 1096
1097 if (is_cow) 1097 if (is_cow)
1098 mmu_notifier_invalidate_range_end(src_mm, mmun_start, mmun_end); 1098 mmu_notifier_invalidate_range_end(src_mm, mmun_start, mmun_end);
1099 return ret; 1099 return ret;
1100 } 1100 }
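
The function above is the fork-time page-table copy; for COW-able mappings it write-protects the parent's ptes rather than duplicating the pages, and the comments explain when copying is skipped entirely. As a purely illustrative userspace sketch (not part of this patch), the program below shows the resulting user-visible contract: after fork(), writes on either side of a MAP_PRIVATE anonymous mapping stay private to that process.

#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void)
{
        /* One private anonymous page; fork() shares it copy-on-write. */
        char *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
                       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        if (p == MAP_FAILED) {
                perror("mmap");
                return 1;
        }
        strcpy(p, "parent data");

        pid_t pid = fork();
        if (pid == 0) {
                /* Child's write triggers a COW fault; parent is unaffected. */
                strcpy(p, "child data");
                printf("child sees : %s\n", p);
                _exit(0);
        }
        waitpid(pid, NULL, 0);
        printf("parent sees: %s\n", p);   /* still "parent data" */
        munmap(p, 4096);
        return 0;
}
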
1101 1101
1102 static unsigned long zap_pte_range(struct mmu_gather *tlb, 1102 static unsigned long zap_pte_range(struct mmu_gather *tlb,
1103 struct vm_area_struct *vma, pmd_t *pmd, 1103 struct vm_area_struct *vma, pmd_t *pmd,
1104 unsigned long addr, unsigned long end, 1104 unsigned long addr, unsigned long end,
1105 struct zap_details *details) 1105 struct zap_details *details)
1106 { 1106 {
1107 struct mm_struct *mm = tlb->mm; 1107 struct mm_struct *mm = tlb->mm;
1108 int force_flush = 0; 1108 int force_flush = 0;
1109 int rss[NR_MM_COUNTERS]; 1109 int rss[NR_MM_COUNTERS];
1110 spinlock_t *ptl; 1110 spinlock_t *ptl;
1111 pte_t *start_pte; 1111 pte_t *start_pte;
1112 pte_t *pte; 1112 pte_t *pte;
1113 1113
1114 again: 1114 again:
1115 init_rss_vec(rss); 1115 init_rss_vec(rss);
1116 start_pte = pte_offset_map_lock(mm, pmd, addr, &ptl); 1116 start_pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
1117 pte = start_pte; 1117 pte = start_pte;
1118 arch_enter_lazy_mmu_mode(); 1118 arch_enter_lazy_mmu_mode();
1119 do { 1119 do {
1120 pte_t ptent = *pte; 1120 pte_t ptent = *pte;
1121 if (pte_none(ptent)) { 1121 if (pte_none(ptent)) {
1122 continue; 1122 continue;
1123 } 1123 }
1124 1124
1125 if (pte_present(ptent)) { 1125 if (pte_present(ptent)) {
1126 struct page *page; 1126 struct page *page;
1127 1127
1128 page = vm_normal_page(vma, addr, ptent); 1128 page = vm_normal_page(vma, addr, ptent);
1129 if (unlikely(details) && page) { 1129 if (unlikely(details) && page) {
1130 /* 1130 /*
1131 * unmap_shared_mapping_pages() wants to 1131 * unmap_shared_mapping_pages() wants to
1132 * invalidate cache without truncating: 1132 * invalidate cache without truncating:
1133 * unmap shared but keep private pages. 1133 * unmap shared but keep private pages.
1134 */ 1134 */
1135 if (details->check_mapping && 1135 if (details->check_mapping &&
1136 details->check_mapping != page->mapping) 1136 details->check_mapping != page->mapping)
1137 continue; 1137 continue;
1138 /* 1138 /*
1139 * Each page->index must be checked when 1139 * Each page->index must be checked when
1140 * invalidating or truncating nonlinear. 1140 * invalidating or truncating nonlinear.
1141 */ 1141 */
1142 if (details->nonlinear_vma && 1142 if (details->nonlinear_vma &&
1143 (page->index < details->first_index || 1143 (page->index < details->first_index ||
1144 page->index > details->last_index)) 1144 page->index > details->last_index))
1145 continue; 1145 continue;
1146 } 1146 }
1147 ptent = ptep_get_and_clear_full(mm, addr, pte, 1147 ptent = ptep_get_and_clear_full(mm, addr, pte,
1148 tlb->fullmm); 1148 tlb->fullmm);
1149 tlb_remove_tlb_entry(tlb, pte, addr); 1149 tlb_remove_tlb_entry(tlb, pte, addr);
1150 if (unlikely(!page)) 1150 if (unlikely(!page))
1151 continue; 1151 continue;
1152 if (unlikely(details) && details->nonlinear_vma 1152 if (unlikely(details) && details->nonlinear_vma
1153 && linear_page_index(details->nonlinear_vma, 1153 && linear_page_index(details->nonlinear_vma,
1154 addr) != page->index) 1154 addr) != page->index)
1155 set_pte_at(mm, addr, pte, 1155 set_pte_at(mm, addr, pte,
1156 pgoff_to_pte(page->index)); 1156 pgoff_to_pte(page->index));
1157 if (PageAnon(page)) 1157 if (PageAnon(page))
1158 rss[MM_ANONPAGES]--; 1158 rss[MM_ANONPAGES]--;
1159 else { 1159 else {
1160 if (pte_dirty(ptent)) 1160 if (pte_dirty(ptent))
1161 set_page_dirty(page); 1161 set_page_dirty(page);
1162 if (pte_young(ptent) && 1162 if (pte_young(ptent) &&
1163 likely(!VM_SequentialReadHint(vma))) 1163 likely(!VM_SequentialReadHint(vma)))
1164 mark_page_accessed(page); 1164 mark_page_accessed(page);
1165 rss[MM_FILEPAGES]--; 1165 rss[MM_FILEPAGES]--;
1166 } 1166 }
1167 page_remove_rmap(page); 1167 page_remove_rmap(page);
1168 if (unlikely(page_mapcount(page) < 0)) 1168 if (unlikely(page_mapcount(page) < 0))
1169 print_bad_pte(vma, addr, ptent, page); 1169 print_bad_pte(vma, addr, ptent, page);
1170 force_flush = !__tlb_remove_page(tlb, page); 1170 force_flush = !__tlb_remove_page(tlb, page);
1171 if (force_flush) 1171 if (force_flush)
1172 break; 1172 break;
1173 continue; 1173 continue;
1174 } 1174 }
1175 /* 1175 /*
1176 * If details->check_mapping, we leave swap entries; 1176 * If details->check_mapping, we leave swap entries;
1177 * if details->nonlinear_vma, we leave file entries. 1177 * if details->nonlinear_vma, we leave file entries.
1178 */ 1178 */
1179 if (unlikely(details)) 1179 if (unlikely(details))
1180 continue; 1180 continue;
1181 if (pte_file(ptent)) { 1181 if (pte_file(ptent)) {
1182 if (unlikely(!(vma->vm_flags & VM_NONLINEAR))) 1182 if (unlikely(!(vma->vm_flags & VM_NONLINEAR)))
1183 print_bad_pte(vma, addr, ptent, NULL); 1183 print_bad_pte(vma, addr, ptent, NULL);
1184 } else { 1184 } else {
1185 swp_entry_t entry = pte_to_swp_entry(ptent); 1185 swp_entry_t entry = pte_to_swp_entry(ptent);
1186 1186
1187 if (!non_swap_entry(entry)) 1187 if (!non_swap_entry(entry))
1188 rss[MM_SWAPENTS]--; 1188 rss[MM_SWAPENTS]--;
1189 else if (is_migration_entry(entry)) { 1189 else if (is_migration_entry(entry)) {
1190 struct page *page; 1190 struct page *page;
1191 1191
1192 page = migration_entry_to_page(entry); 1192 page = migration_entry_to_page(entry);
1193 1193
1194 if (PageAnon(page)) 1194 if (PageAnon(page))
1195 rss[MM_ANONPAGES]--; 1195 rss[MM_ANONPAGES]--;
1196 else 1196 else
1197 rss[MM_FILEPAGES]--; 1197 rss[MM_FILEPAGES]--;
1198 } 1198 }
1199 if (unlikely(!free_swap_and_cache(entry))) 1199 if (unlikely(!free_swap_and_cache(entry)))
1200 print_bad_pte(vma, addr, ptent, NULL); 1200 print_bad_pte(vma, addr, ptent, NULL);
1201 } 1201 }
1202 pte_clear_not_present_full(mm, addr, pte, tlb->fullmm); 1202 pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
1203 } while (pte++, addr += PAGE_SIZE, addr != end); 1203 } while (pte++, addr += PAGE_SIZE, addr != end);
1204 1204
1205 add_mm_rss_vec(mm, rss); 1205 add_mm_rss_vec(mm, rss);
1206 arch_leave_lazy_mmu_mode(); 1206 arch_leave_lazy_mmu_mode();
1207 pte_unmap_unlock(start_pte, ptl); 1207 pte_unmap_unlock(start_pte, ptl);
1208 1208
1209 /* 1209 /*
1210 * mmu_gather ran out of room to batch pages, we break out of 1210 * mmu_gather ran out of room to batch pages, we break out of
1211 * the PTE lock to avoid doing the potential expensive TLB invalidate 1211 * the PTE lock to avoid doing the potential expensive TLB invalidate
1212 * and page-free while holding it. 1212 * and page-free while holding it.
1213 */ 1213 */
1214 if (force_flush) { 1214 if (force_flush) {
1215 force_flush = 0; 1215 force_flush = 0;
1216 1216
1217 #ifdef HAVE_GENERIC_MMU_GATHER 1217 #ifdef HAVE_GENERIC_MMU_GATHER
1218 tlb->start = addr; 1218 tlb->start = addr;
1219 tlb->end = end; 1219 tlb->end = end;
1220 #endif 1220 #endif
1221 tlb_flush_mmu(tlb); 1221 tlb_flush_mmu(tlb);
1222 if (addr != end) 1222 if (addr != end)
1223 goto again; 1223 goto again;
1224 } 1224 }
1225 1225
1226 return addr; 1226 return addr;
1227 } 1227 }
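
zap_pte_range() is where the per-pte teardown actually happens: rss counters are decremented, dirty and referenced state is transferred to the struct page, and the mmu_gather batch is filled. A small userspace sketch of the observable effect (illustrative only; exact numbers depend on page size and kernel behaviour): after madvise(MADV_DONTNEED) the resident set shrinks and the anonymous pages read back as zeroes on the next fault.

#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

static long resident_pages(void)
{
        long size = 0, resident = -1;
        FILE *f = fopen("/proc/self/statm", "r");

        if (f && fscanf(f, "%ld %ld", &size, &resident) != 2)
                resident = -1;
        if (f)
                fclose(f);
        return resident;
}

int main(void)
{
        size_t len = 64UL << 20;          /* 64 MiB of private anonymous memory */
        char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
                       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        if (p == MAP_FAILED)
                return 1;

        memset(p, 0xaa, len);             /* fault everything in */
        printf("resident before: %ld pages\n", resident_pages());

        madvise(p, len, MADV_DONTNEED);   /* ends up in the zap path */
        printf("resident after : %ld pages\n", resident_pages());
        printf("first byte now : %d (zero-filled on the next fault)\n", p[0]);

        munmap(p, len);
        return 0;
}
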
1228 1228
1229 static inline unsigned long zap_pmd_range(struct mmu_gather *tlb, 1229 static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
1230 struct vm_area_struct *vma, pud_t *pud, 1230 struct vm_area_struct *vma, pud_t *pud,
1231 unsigned long addr, unsigned long end, 1231 unsigned long addr, unsigned long end,
1232 struct zap_details *details) 1232 struct zap_details *details)
1233 { 1233 {
1234 pmd_t *pmd; 1234 pmd_t *pmd;
1235 unsigned long next; 1235 unsigned long next;
1236 1236
1237 pmd = pmd_offset(pud, addr); 1237 pmd = pmd_offset(pud, addr);
1238 do { 1238 do {
1239 next = pmd_addr_end(addr, end); 1239 next = pmd_addr_end(addr, end);
1240 if (pmd_trans_huge(*pmd)) { 1240 if (pmd_trans_huge(*pmd)) {
1241 if (next - addr != HPAGE_PMD_SIZE) { 1241 if (next - addr != HPAGE_PMD_SIZE) {
1242 #ifdef CONFIG_DEBUG_VM 1242 #ifdef CONFIG_DEBUG_VM
1243 if (!rwsem_is_locked(&tlb->mm->mmap_sem)) { 1243 if (!rwsem_is_locked(&tlb->mm->mmap_sem)) {
1244 pr_err("%s: mmap_sem is unlocked! addr=0x%lx end=0x%lx vma->vm_start=0x%lx vma->vm_end=0x%lx\n", 1244 pr_err("%s: mmap_sem is unlocked! addr=0x%lx end=0x%lx vma->vm_start=0x%lx vma->vm_end=0x%lx\n",
1245 __func__, addr, end, 1245 __func__, addr, end,
1246 vma->vm_start, 1246 vma->vm_start,
1247 vma->vm_end); 1247 vma->vm_end);
1248 BUG(); 1248 BUG();
1249 } 1249 }
1250 #endif 1250 #endif
1251 split_huge_page_pmd(vma, addr, pmd); 1251 split_huge_page_pmd(vma, addr, pmd);
1252 } else if (zap_huge_pmd(tlb, vma, pmd, addr)) 1252 } else if (zap_huge_pmd(tlb, vma, pmd, addr))
1253 goto next; 1253 goto next;
1254 /* fall through */ 1254 /* fall through */
1255 } 1255 }
1256 /* 1256 /*
1257 * Here there can be other concurrent MADV_DONTNEED or 1257 * Here there can be other concurrent MADV_DONTNEED or
1258 * trans huge page faults running, and if the pmd is 1258 * trans huge page faults running, and if the pmd is
1259 * none or trans huge it can change under us. This is 1259 * none or trans huge it can change under us. This is
1260 * because MADV_DONTNEED holds the mmap_sem in read 1260 * because MADV_DONTNEED holds the mmap_sem in read
1261 * mode. 1261 * mode.
1262 */ 1262 */
1263 if (pmd_none_or_trans_huge_or_clear_bad(pmd)) 1263 if (pmd_none_or_trans_huge_or_clear_bad(pmd))
1264 goto next; 1264 goto next;
1265 next = zap_pte_range(tlb, vma, pmd, addr, next, details); 1265 next = zap_pte_range(tlb, vma, pmd, addr, next, details);
1266 next: 1266 next:
1267 cond_resched(); 1267 cond_resched();
1268 } while (pmd++, addr = next, addr != end); 1268 } while (pmd++, addr = next, addr != end);
1269 1269
1270 return addr; 1270 return addr;
1271 } 1271 }
1272 1272
1273 static inline unsigned long zap_pud_range(struct mmu_gather *tlb, 1273 static inline unsigned long zap_pud_range(struct mmu_gather *tlb,
1274 struct vm_area_struct *vma, pgd_t *pgd, 1274 struct vm_area_struct *vma, pgd_t *pgd,
1275 unsigned long addr, unsigned long end, 1275 unsigned long addr, unsigned long end,
1276 struct zap_details *details) 1276 struct zap_details *details)
1277 { 1277 {
1278 pud_t *pud; 1278 pud_t *pud;
1279 unsigned long next; 1279 unsigned long next;
1280 1280
1281 pud = pud_offset(pgd, addr); 1281 pud = pud_offset(pgd, addr);
1282 do { 1282 do {
1283 next = pud_addr_end(addr, end); 1283 next = pud_addr_end(addr, end);
1284 if (pud_none_or_clear_bad(pud)) 1284 if (pud_none_or_clear_bad(pud))
1285 continue; 1285 continue;
1286 next = zap_pmd_range(tlb, vma, pud, addr, next, details); 1286 next = zap_pmd_range(tlb, vma, pud, addr, next, details);
1287 } while (pud++, addr = next, addr != end); 1287 } while (pud++, addr = next, addr != end);
1288 1288
1289 return addr; 1289 return addr;
1290 } 1290 }
1291 1291
1292 static void unmap_page_range(struct mmu_gather *tlb, 1292 static void unmap_page_range(struct mmu_gather *tlb,
1293 struct vm_area_struct *vma, 1293 struct vm_area_struct *vma,
1294 unsigned long addr, unsigned long end, 1294 unsigned long addr, unsigned long end,
1295 struct zap_details *details) 1295 struct zap_details *details)
1296 { 1296 {
1297 pgd_t *pgd; 1297 pgd_t *pgd;
1298 unsigned long next; 1298 unsigned long next;
1299 1299
1300 if (details && !details->check_mapping && !details->nonlinear_vma) 1300 if (details && !details->check_mapping && !details->nonlinear_vma)
1301 details = NULL; 1301 details = NULL;
1302 1302
1303 BUG_ON(addr >= end); 1303 BUG_ON(addr >= end);
1304 mem_cgroup_uncharge_start(); 1304 mem_cgroup_uncharge_start();
1305 tlb_start_vma(tlb, vma); 1305 tlb_start_vma(tlb, vma);
1306 pgd = pgd_offset(vma->vm_mm, addr); 1306 pgd = pgd_offset(vma->vm_mm, addr);
1307 do { 1307 do {
1308 next = pgd_addr_end(addr, end); 1308 next = pgd_addr_end(addr, end);
1309 if (pgd_none_or_clear_bad(pgd)) 1309 if (pgd_none_or_clear_bad(pgd))
1310 continue; 1310 continue;
1311 next = zap_pud_range(tlb, vma, pgd, addr, next, details); 1311 next = zap_pud_range(tlb, vma, pgd, addr, next, details);
1312 } while (pgd++, addr = next, addr != end); 1312 } while (pgd++, addr = next, addr != end);
1313 tlb_end_vma(tlb, vma); 1313 tlb_end_vma(tlb, vma);
1314 mem_cgroup_uncharge_end(); 1314 mem_cgroup_uncharge_end();
1315 } 1315 }
1316 1316
1317 1317
1318 static void unmap_single_vma(struct mmu_gather *tlb, 1318 static void unmap_single_vma(struct mmu_gather *tlb,
1319 struct vm_area_struct *vma, unsigned long start_addr, 1319 struct vm_area_struct *vma, unsigned long start_addr,
1320 unsigned long end_addr, 1320 unsigned long end_addr,
1321 struct zap_details *details) 1321 struct zap_details *details)
1322 { 1322 {
1323 unsigned long start = max(vma->vm_start, start_addr); 1323 unsigned long start = max(vma->vm_start, start_addr);
1324 unsigned long end; 1324 unsigned long end;
1325 1325
1326 if (start >= vma->vm_end) 1326 if (start >= vma->vm_end)
1327 return; 1327 return;
1328 end = min(vma->vm_end, end_addr); 1328 end = min(vma->vm_end, end_addr);
1329 if (end <= vma->vm_start) 1329 if (end <= vma->vm_start)
1330 return; 1330 return;
1331 1331
1332 if (vma->vm_file) 1332 if (vma->vm_file)
1333 uprobe_munmap(vma, start, end); 1333 uprobe_munmap(vma, start, end);
1334 1334
1335 if (unlikely(vma->vm_flags & VM_PFNMAP)) 1335 if (unlikely(vma->vm_flags & VM_PFNMAP))
1336 untrack_pfn(vma, 0, 0); 1336 untrack_pfn(vma, 0, 0);
1337 1337
1338 if (start != end) { 1338 if (start != end) {
1339 if (unlikely(is_vm_hugetlb_page(vma))) { 1339 if (unlikely(is_vm_hugetlb_page(vma))) {
1340 /* 1340 /*
1341 * It is undesirable to test vma->vm_file as it 1341 * It is undesirable to test vma->vm_file as it
1342 * should be non-null for valid hugetlb area. 1342 * should be non-null for valid hugetlb area.
1343 * However, vm_file will be NULL in the error 1343 * However, vm_file will be NULL in the error
1344 * cleanup path of do_mmap_pgoff. When 1344 * cleanup path of do_mmap_pgoff. When
1345 * hugetlbfs ->mmap method fails, 1345 * hugetlbfs ->mmap method fails,
1346 * do_mmap_pgoff() nullifies vma->vm_file 1346 * do_mmap_pgoff() nullifies vma->vm_file
1347 * before calling this function to clean up. 1347 * before calling this function to clean up.
1348 * Since no pte has actually been setup, it is 1348 * Since no pte has actually been setup, it is
1349 * safe to do nothing in this case. 1349 * safe to do nothing in this case.
1350 */ 1350 */
1351 if (vma->vm_file) { 1351 if (vma->vm_file) {
1352 mutex_lock(&vma->vm_file->f_mapping->i_mmap_mutex); 1352 mutex_lock(&vma->vm_file->f_mapping->i_mmap_mutex);
1353 __unmap_hugepage_range_final(tlb, vma, start, end, NULL); 1353 __unmap_hugepage_range_final(tlb, vma, start, end, NULL);
1354 mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex); 1354 mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex);
1355 } 1355 }
1356 } else 1356 } else
1357 unmap_page_range(tlb, vma, start, end, details); 1357 unmap_page_range(tlb, vma, start, end, details);
1358 } 1358 }
1359 } 1359 }
1360 1360
1361 /** 1361 /**
1362 * unmap_vmas - unmap a range of memory covered by a list of vma's 1362 * unmap_vmas - unmap a range of memory covered by a list of vma's
1363 * @tlb: address of the caller's struct mmu_gather 1363 * @tlb: address of the caller's struct mmu_gather
1364 * @vma: the starting vma 1364 * @vma: the starting vma
1365 * @start_addr: virtual address at which to start unmapping 1365 * @start_addr: virtual address at which to start unmapping
1366 * @end_addr: virtual address at which to end unmapping 1366 * @end_addr: virtual address at which to end unmapping
1367 * 1367 *
1368 * Unmap all pages in the vma list. 1368 * Unmap all pages in the vma list.
1369 * 1369 *
1370 * Only addresses between `start' and `end' will be unmapped. 1370 * Only addresses between `start' and `end' will be unmapped.
1371 * 1371 *
1372 * The VMA list must be sorted in ascending virtual address order. 1372 * The VMA list must be sorted in ascending virtual address order.
1373 * 1373 *
1374 * unmap_vmas() assumes that the caller will flush the whole unmapped address 1374 * unmap_vmas() assumes that the caller will flush the whole unmapped address
1375 * range after unmap_vmas() returns. So the only responsibility here is to 1375 * range after unmap_vmas() returns. So the only responsibility here is to
1376 * ensure that any thus-far unmapped pages are flushed before unmap_vmas() 1376 * ensure that any thus-far unmapped pages are flushed before unmap_vmas()
1377 * drops the lock and schedules. 1377 * drops the lock and schedules.
1378 */ 1378 */
1379 void unmap_vmas(struct mmu_gather *tlb, 1379 void unmap_vmas(struct mmu_gather *tlb,
1380 struct vm_area_struct *vma, unsigned long start_addr, 1380 struct vm_area_struct *vma, unsigned long start_addr,
1381 unsigned long end_addr) 1381 unsigned long end_addr)
1382 { 1382 {
1383 struct mm_struct *mm = vma->vm_mm; 1383 struct mm_struct *mm = vma->vm_mm;
1384 1384
1385 mmu_notifier_invalidate_range_start(mm, start_addr, end_addr); 1385 mmu_notifier_invalidate_range_start(mm, start_addr, end_addr);
1386 for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next) 1386 for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next)
1387 unmap_single_vma(tlb, vma, start_addr, end_addr, NULL); 1387 unmap_single_vma(tlb, vma, start_addr, end_addr, NULL);
1388 mmu_notifier_invalidate_range_end(mm, start_addr, end_addr); 1388 mmu_notifier_invalidate_range_end(mm, start_addr, end_addr);
1389 } 1389 }
1390 1390
1391 /** 1391 /**
1392 * zap_page_range - remove user pages in a given range 1392 * zap_page_range - remove user pages in a given range
1393 * @vma: vm_area_struct holding the applicable pages 1393 * @vma: vm_area_struct holding the applicable pages
1394 * @start: starting address of pages to zap 1394 * @start: starting address of pages to zap
1395 * @size: number of bytes to zap 1395 * @size: number of bytes to zap
1396 * @details: details of nonlinear truncation or shared cache invalidation 1396 * @details: details of nonlinear truncation or shared cache invalidation
1397 * 1397 *
1398 * Caller must protect the VMA list 1398 * Caller must protect the VMA list
1399 */ 1399 */
1400 void zap_page_range(struct vm_area_struct *vma, unsigned long start, 1400 void zap_page_range(struct vm_area_struct *vma, unsigned long start,
1401 unsigned long size, struct zap_details *details) 1401 unsigned long size, struct zap_details *details)
1402 { 1402 {
1403 struct mm_struct *mm = vma->vm_mm; 1403 struct mm_struct *mm = vma->vm_mm;
1404 struct mmu_gather tlb; 1404 struct mmu_gather tlb;
1405 unsigned long end = start + size; 1405 unsigned long end = start + size;
1406 1406
1407 lru_add_drain(); 1407 lru_add_drain();
1408 tlb_gather_mmu(&tlb, mm, 0); 1408 tlb_gather_mmu(&tlb, mm, 0);
1409 update_hiwater_rss(mm); 1409 update_hiwater_rss(mm);
1410 mmu_notifier_invalidate_range_start(mm, start, end); 1410 mmu_notifier_invalidate_range_start(mm, start, end);
1411 for ( ; vma && vma->vm_start < end; vma = vma->vm_next) 1411 for ( ; vma && vma->vm_start < end; vma = vma->vm_next)
1412 unmap_single_vma(&tlb, vma, start, end, details); 1412 unmap_single_vma(&tlb, vma, start, end, details);
1413 mmu_notifier_invalidate_range_end(mm, start, end); 1413 mmu_notifier_invalidate_range_end(mm, start, end);
1414 tlb_finish_mmu(&tlb, start, end); 1414 tlb_finish_mmu(&tlb, start, end);
1415 } 1415 }
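
At this point in the tree, zap_page_range() is the back end of madvise(MADV_DONTNEED). A stripped-down, hypothetical sketch of such a caller (the name example_dontneed is invented here; the real madvise_dontneed() in mm/madvise.c additionally rejects locked, hugetlb and PFN-mapped vmas and handles nonlinear mappings):

#include <linux/mm.h>

/*
 * Sketch only: MADV_DONTNEED simply zaps the whole range and lets later
 * faults repopulate it.
 */
static long example_dontneed(struct vm_area_struct *vma,
                             unsigned long start, unsigned long end)
{
        zap_page_range(vma, start, end - start, NULL);
        return 0;
}

Passing NULL details means "zap everything", including swap entries, which is exactly what a caller discarding the range wants.
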
1416 1416
1417 /** 1417 /**
1418 * zap_page_range_single - remove user pages in a given range 1418 * zap_page_range_single - remove user pages in a given range
1419 * @vma: vm_area_struct holding the applicable pages 1419 * @vma: vm_area_struct holding the applicable pages
1420 * @address: starting address of pages to zap 1420 * @address: starting address of pages to zap
1421 * @size: number of bytes to zap 1421 * @size: number of bytes to zap
1422 * @details: details of nonlinear truncation or shared cache invalidation 1422 * @details: details of nonlinear truncation or shared cache invalidation
1423 * 1423 *
1424 * The range must fit into one VMA. 1424 * The range must fit into one VMA.
1425 */ 1425 */
1426 static void zap_page_range_single(struct vm_area_struct *vma, unsigned long address, 1426 static void zap_page_range_single(struct vm_area_struct *vma, unsigned long address,
1427 unsigned long size, struct zap_details *details) 1427 unsigned long size, struct zap_details *details)
1428 { 1428 {
1429 struct mm_struct *mm = vma->vm_mm; 1429 struct mm_struct *mm = vma->vm_mm;
1430 struct mmu_gather tlb; 1430 struct mmu_gather tlb;
1431 unsigned long end = address + size; 1431 unsigned long end = address + size;
1432 1432
1433 lru_add_drain(); 1433 lru_add_drain();
1434 tlb_gather_mmu(&tlb, mm, 0); 1434 tlb_gather_mmu(&tlb, mm, 0);
1435 update_hiwater_rss(mm); 1435 update_hiwater_rss(mm);
1436 mmu_notifier_invalidate_range_start(mm, address, end); 1436 mmu_notifier_invalidate_range_start(mm, address, end);
1437 unmap_single_vma(&tlb, vma, address, end, details); 1437 unmap_single_vma(&tlb, vma, address, end, details);
1438 mmu_notifier_invalidate_range_end(mm, address, end); 1438 mmu_notifier_invalidate_range_end(mm, address, end);
1439 tlb_finish_mmu(&tlb, address, end); 1439 tlb_finish_mmu(&tlb, address, end);
1440 } 1440 }
1441 1441
1442 /** 1442 /**
1443 * zap_vma_ptes - remove ptes mapping the vma 1443 * zap_vma_ptes - remove ptes mapping the vma
1444 * @vma: vm_area_struct holding ptes to be zapped 1444 * @vma: vm_area_struct holding ptes to be zapped
1445 * @address: starting address of pages to zap 1445 * @address: starting address of pages to zap
1446 * @size: number of bytes to zap 1446 * @size: number of bytes to zap
1447 * 1447 *
1448 * This function only unmaps ptes assigned to VM_PFNMAP vmas. 1448 * This function only unmaps ptes assigned to VM_PFNMAP vmas.
1449 * 1449 *
1450 * The entire address range must be fully contained within the vma. 1450 * The entire address range must be fully contained within the vma.
1451 * 1451 *
1452 * Returns 0 if successful. 1452 * Returns 0 if successful.
1453 */ 1453 */
1454 int zap_vma_ptes(struct vm_area_struct *vma, unsigned long address, 1454 int zap_vma_ptes(struct vm_area_struct *vma, unsigned long address,
1455 unsigned long size) 1455 unsigned long size)
1456 { 1456 {
1457 if (address < vma->vm_start || address + size > vma->vm_end || 1457 if (address < vma->vm_start || address + size > vma->vm_end ||
1458 !(vma->vm_flags & VM_PFNMAP)) 1458 !(vma->vm_flags & VM_PFNMAP))
1459 return -1; 1459 return -1;
1460 zap_page_range_single(vma, address, size, NULL); 1460 zap_page_range_single(vma, address, size, NULL);
1461 return 0; 1461 return 0;
1462 } 1462 }
1463 EXPORT_SYMBOL_GPL(zap_vma_ptes); 1463 EXPORT_SYMBOL_GPL(zap_vma_ptes);
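
zap_vma_ptes() is the exported helper a driver can use to revoke PFN mappings it previously inserted into a VM_PFNMAP vma, for example when the backing device resource goes away. A minimal, hypothetical driver-side sketch (example_revoke_mapping is not from any real driver):

#include <linux/mm.h>

/* Hypothetical helper: tear down every pte in a vma the driver owns. */
static int example_revoke_mapping(struct vm_area_struct *vma)
{
        /* Only legal for VM_PFNMAP vmas; returns 0 on success, -1 otherwise. */
        return zap_vma_ptes(vma, vma->vm_start, vma->vm_end - vma->vm_start);
}
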
1464 1464
1465 /** 1465 /**
1466 * follow_page_mask - look up a page descriptor from a user-virtual address 1466 * follow_page_mask - look up a page descriptor from a user-virtual address
1467 * @vma: vm_area_struct mapping @address 1467 * @vma: vm_area_struct mapping @address
1468 * @address: virtual address to look up 1468 * @address: virtual address to look up
1469 * @flags: flags modifying lookup behaviour 1469 * @flags: flags modifying lookup behaviour
1470 * @page_mask: on output, *page_mask is set according to the size of the page 1470 * @page_mask: on output, *page_mask is set according to the size of the page
1471 * 1471 *
1472 * @flags can have FOLL_ flags set, defined in <linux/mm.h> 1472 * @flags can have FOLL_ flags set, defined in <linux/mm.h>
1473 * 1473 *
1474 * Returns the mapped (struct page *), %NULL if no mapping exists, or 1474 * Returns the mapped (struct page *), %NULL if no mapping exists, or
1475 * an error pointer if there is a mapping to something not represented 1475 * an error pointer if there is a mapping to something not represented
1476 * by a page descriptor (see also vm_normal_page()). 1476 * by a page descriptor (see also vm_normal_page()).
1477 */ 1477 */
1478 struct page *follow_page_mask(struct vm_area_struct *vma, 1478 struct page *follow_page_mask(struct vm_area_struct *vma,
1479 unsigned long address, unsigned int flags, 1479 unsigned long address, unsigned int flags,
1480 unsigned int *page_mask) 1480 unsigned int *page_mask)
1481 { 1481 {
1482 pgd_t *pgd; 1482 pgd_t *pgd;
1483 pud_t *pud; 1483 pud_t *pud;
1484 pmd_t *pmd; 1484 pmd_t *pmd;
1485 pte_t *ptep, pte; 1485 pte_t *ptep, pte;
1486 spinlock_t *ptl; 1486 spinlock_t *ptl;
1487 struct page *page; 1487 struct page *page;
1488 struct mm_struct *mm = vma->vm_mm; 1488 struct mm_struct *mm = vma->vm_mm;
1489 1489
1490 *page_mask = 0; 1490 *page_mask = 0;
1491 1491
1492 page = follow_huge_addr(mm, address, flags & FOLL_WRITE); 1492 page = follow_huge_addr(mm, address, flags & FOLL_WRITE);
1493 if (!IS_ERR(page)) { 1493 if (!IS_ERR(page)) {
1494 BUG_ON(flags & FOLL_GET); 1494 BUG_ON(flags & FOLL_GET);
1495 goto out; 1495 goto out;
1496 } 1496 }
1497 1497
1498 page = NULL; 1498 page = NULL;
1499 pgd = pgd_offset(mm, address); 1499 pgd = pgd_offset(mm, address);
1500 if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd))) 1500 if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
1501 goto no_page_table; 1501 goto no_page_table;
1502 1502
1503 pud = pud_offset(pgd, address); 1503 pud = pud_offset(pgd, address);
1504 if (pud_none(*pud)) 1504 if (pud_none(*pud))
1505 goto no_page_table; 1505 goto no_page_table;
1506 if (pud_huge(*pud) && vma->vm_flags & VM_HUGETLB) { 1506 if (pud_huge(*pud) && vma->vm_flags & VM_HUGETLB) {
1507 BUG_ON(flags & FOLL_GET); 1507 BUG_ON(flags & FOLL_GET);
1508 page = follow_huge_pud(mm, address, pud, flags & FOLL_WRITE); 1508 page = follow_huge_pud(mm, address, pud, flags & FOLL_WRITE);
1509 goto out; 1509 goto out;
1510 } 1510 }
1511 if (unlikely(pud_bad(*pud))) 1511 if (unlikely(pud_bad(*pud)))
1512 goto no_page_table; 1512 goto no_page_table;
1513 1513
1514 pmd = pmd_offset(pud, address); 1514 pmd = pmd_offset(pud, address);
1515 if (pmd_none(*pmd)) 1515 if (pmd_none(*pmd))
1516 goto no_page_table; 1516 goto no_page_table;
1517 if (pmd_huge(*pmd) && vma->vm_flags & VM_HUGETLB) { 1517 if (pmd_huge(*pmd) && vma->vm_flags & VM_HUGETLB) {
1518 BUG_ON(flags & FOLL_GET); 1518 BUG_ON(flags & FOLL_GET);
1519 page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE); 1519 page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE);
1520 goto out; 1520 goto out;
1521 } 1521 }
1522 if ((flags & FOLL_NUMA) && pmd_numa(*pmd)) 1522 if ((flags & FOLL_NUMA) && pmd_numa(*pmd))
1523 goto no_page_table; 1523 goto no_page_table;
1524 if (pmd_trans_huge(*pmd)) { 1524 if (pmd_trans_huge(*pmd)) {
1525 if (flags & FOLL_SPLIT) { 1525 if (flags & FOLL_SPLIT) {
1526 split_huge_page_pmd(vma, address, pmd); 1526 split_huge_page_pmd(vma, address, pmd);
1527 goto split_fallthrough; 1527 goto split_fallthrough;
1528 } 1528 }
1529 spin_lock(&mm->page_table_lock); 1529 spin_lock(&mm->page_table_lock);
1530 if (likely(pmd_trans_huge(*pmd))) { 1530 if (likely(pmd_trans_huge(*pmd))) {
1531 if (unlikely(pmd_trans_splitting(*pmd))) { 1531 if (unlikely(pmd_trans_splitting(*pmd))) {
1532 spin_unlock(&mm->page_table_lock); 1532 spin_unlock(&mm->page_table_lock);
1533 wait_split_huge_page(vma->anon_vma, pmd); 1533 wait_split_huge_page(vma->anon_vma, pmd);
1534 } else { 1534 } else {
1535 page = follow_trans_huge_pmd(vma, address, 1535 page = follow_trans_huge_pmd(vma, address,
1536 pmd, flags); 1536 pmd, flags);
1537 spin_unlock(&mm->page_table_lock); 1537 spin_unlock(&mm->page_table_lock);
1538 *page_mask = HPAGE_PMD_NR - 1; 1538 *page_mask = HPAGE_PMD_NR - 1;
1539 goto out; 1539 goto out;
1540 } 1540 }
1541 } else 1541 } else
1542 spin_unlock(&mm->page_table_lock); 1542 spin_unlock(&mm->page_table_lock);
1543 /* fall through */ 1543 /* fall through */
1544 } 1544 }
1545 split_fallthrough: 1545 split_fallthrough:
1546 if (unlikely(pmd_bad(*pmd))) 1546 if (unlikely(pmd_bad(*pmd)))
1547 goto no_page_table; 1547 goto no_page_table;
1548 1548
1549 ptep = pte_offset_map_lock(mm, pmd, address, &ptl); 1549 ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
1550 1550
1551 pte = *ptep; 1551 pte = *ptep;
1552 if (!pte_present(pte)) { 1552 if (!pte_present(pte)) {
1553 swp_entry_t entry; 1553 swp_entry_t entry;
1554 /* 1554 /*
1555 * KSM's break_ksm() relies upon recognizing a ksm page 1555 * KSM's break_ksm() relies upon recognizing a ksm page
1556 * even while it is being migrated, so for that case we 1556 * even while it is being migrated, so for that case we
1557 * need migration_entry_wait(). 1557 * need migration_entry_wait().
1558 */ 1558 */
1559 if (likely(!(flags & FOLL_MIGRATION))) 1559 if (likely(!(flags & FOLL_MIGRATION)))
1560 goto no_page; 1560 goto no_page;
1561 if (pte_none(pte) || pte_file(pte)) 1561 if (pte_none(pte) || pte_file(pte))
1562 goto no_page; 1562 goto no_page;
1563 entry = pte_to_swp_entry(pte); 1563 entry = pte_to_swp_entry(pte);
1564 if (!is_migration_entry(entry)) 1564 if (!is_migration_entry(entry))
1565 goto no_page; 1565 goto no_page;
1566 pte_unmap_unlock(ptep, ptl); 1566 pte_unmap_unlock(ptep, ptl);
1567 migration_entry_wait(mm, pmd, address); 1567 migration_entry_wait(mm, pmd, address);
1568 goto split_fallthrough; 1568 goto split_fallthrough;
1569 } 1569 }
1570 if ((flags & FOLL_NUMA) && pte_numa(pte)) 1570 if ((flags & FOLL_NUMA) && pte_numa(pte))
1571 goto no_page; 1571 goto no_page;
1572 if ((flags & FOLL_WRITE) && !pte_write(pte)) 1572 if ((flags & FOLL_WRITE) && !pte_write(pte))
1573 goto unlock; 1573 goto unlock;
1574 1574
1575 page = vm_normal_page(vma, address, pte); 1575 page = vm_normal_page(vma, address, pte);
1576 if (unlikely(!page)) { 1576 if (unlikely(!page)) {
1577 if ((flags & FOLL_DUMP) || 1577 if ((flags & FOLL_DUMP) ||
1578 !is_zero_pfn(pte_pfn(pte))) 1578 !is_zero_pfn(pte_pfn(pte)))
1579 goto bad_page; 1579 goto bad_page;
1580 page = pte_page(pte); 1580 page = pte_page(pte);
1581 } 1581 }
1582 1582
1583 if (flags & FOLL_GET) 1583 if (flags & FOLL_GET)
1584 get_page_foll(page); 1584 get_page_foll(page);
1585 if (flags & FOLL_TOUCH) { 1585 if (flags & FOLL_TOUCH) {
1586 if ((flags & FOLL_WRITE) && 1586 if ((flags & FOLL_WRITE) &&
1587 !pte_dirty(pte) && !PageDirty(page)) 1587 !pte_dirty(pte) && !PageDirty(page))
1588 set_page_dirty(page); 1588 set_page_dirty(page);
1589 /* 1589 /*
1590 * pte_mkyoung() would be more correct here, but atomic care 1590 * pte_mkyoung() would be more correct here, but atomic care
1591 * is needed to avoid losing the dirty bit: it is easier to use 1591 * is needed to avoid losing the dirty bit: it is easier to use
1592 * mark_page_accessed(). 1592 * mark_page_accessed().
1593 */ 1593 */
1594 mark_page_accessed(page); 1594 mark_page_accessed(page);
1595 } 1595 }
1596 if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) { 1596 if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) {
1597 /* 1597 /*
1598 * The preliminary mapping check is mainly to avoid the 1598 * The preliminary mapping check is mainly to avoid the
1599 * pointless overhead of lock_page on the ZERO_PAGE 1599 * pointless overhead of lock_page on the ZERO_PAGE
1600 * which might bounce very badly if there is contention. 1600 * which might bounce very badly if there is contention.
1601 * 1601 *
1602 * If the page is already locked, we don't need to 1602 * If the page is already locked, we don't need to
1603 * handle it now - vmscan will handle it later if and 1603 * handle it now - vmscan will handle it later if and
1604 * when it attempts to reclaim the page. 1604 * when it attempts to reclaim the page.
1605 */ 1605 */
1606 if (page->mapping && trylock_page(page)) { 1606 if (page->mapping && trylock_page(page)) {
1607 lru_add_drain(); /* push cached pages to LRU */ 1607 lru_add_drain(); /* push cached pages to LRU */
1608 /* 1608 /*
1609 * Because we lock page here, and migration is 1609 * Because we lock page here, and migration is
1610 * blocked by the pte's page reference, and we 1610 * blocked by the pte's page reference, and we
1611 * know the page is still mapped, we don't even 1611 * know the page is still mapped, we don't even
1612 * need to check for file-cache page truncation. 1612 * need to check for file-cache page truncation.
1613 */ 1613 */
1614 mlock_vma_page(page); 1614 mlock_vma_page(page);
1615 unlock_page(page); 1615 unlock_page(page);
1616 } 1616 }
1617 } 1617 }
1618 unlock: 1618 unlock:
1619 pte_unmap_unlock(ptep, ptl); 1619 pte_unmap_unlock(ptep, ptl);
1620 out: 1620 out:
1621 return page; 1621 return page;
1622 1622
1623 bad_page: 1623 bad_page:
1624 pte_unmap_unlock(ptep, ptl); 1624 pte_unmap_unlock(ptep, ptl);
1625 return ERR_PTR(-EFAULT); 1625 return ERR_PTR(-EFAULT);
1626 1626
1627 no_page: 1627 no_page:
1628 pte_unmap_unlock(ptep, ptl); 1628 pte_unmap_unlock(ptep, ptl);
1629 if (!pte_none(pte)) 1629 if (!pte_none(pte))
1630 return page; 1630 return page;
1631 1631
1632 no_page_table: 1632 no_page_table:
1633 /* 1633 /*
1634 * When core dumping an enormous anonymous area that nobody 1634 * When core dumping an enormous anonymous area that nobody
1635 * has touched so far, we don't want to allocate unnecessary pages or 1635 * has touched so far, we don't want to allocate unnecessary pages or
1636 * page tables. Return error instead of NULL to skip handle_mm_fault, 1636 * page tables. Return error instead of NULL to skip handle_mm_fault,
1637 * then get_dump_page() will return NULL to leave a hole in the dump. 1637 * then get_dump_page() will return NULL to leave a hole in the dump.
1638 * But we can only make this optimization where a hole would surely 1638 * But we can only make this optimization where a hole would surely
1639 * be zero-filled if handle_mm_fault() actually did handle it. 1639 * be zero-filled if handle_mm_fault() actually did handle it.
1640 */ 1640 */
1641 if ((flags & FOLL_DUMP) && 1641 if ((flags & FOLL_DUMP) &&
1642 (!vma->vm_ops || !vma->vm_ops->fault)) 1642 (!vma->vm_ops || !vma->vm_ops->fault))
1643 return ERR_PTR(-EFAULT); 1643 return ERR_PTR(-EFAULT);
1644 return page; 1644 return page;
1645 } 1645 }
1646 1646
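
follow_page_mask() is an internal lookup used by __get_user_pages() below; the *page_mask output lets the caller step over all the subpages of a transparent huge page in one go. A hedged sketch of a direct lookup with this interface (example_lookup is invented here; the caller is assumed to hold mmap_sem for read, and hugetlb vmas are skipped because they are served by follow_hugetlb_page() instead):

#include <linux/err.h>
#include <linux/hugetlb.h>
#include <linux/mm.h>

/*
 * Illustrative only: translate one user address in 'mm' into a pinned
 * struct page. Caller must hold mm->mmap_sem for read.
 */
static struct page *example_lookup(struct mm_struct *mm, unsigned long addr)
{
        struct vm_area_struct *vma;
        struct page *page;
        unsigned int page_mask;

        vma = find_vma(mm, addr);
        if (!vma || addr < vma->vm_start || is_vm_hugetlb_page(vma))
                return NULL;

        page = follow_page_mask(vma, addr, FOLL_GET, &page_mask);
        if (IS_ERR(page))
                return NULL;
        /* caller must put_page(page) when done, if non-NULL */
        return page;
}
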
1647 static inline int stack_guard_page(struct vm_area_struct *vma, unsigned long addr) 1647 static inline int stack_guard_page(struct vm_area_struct *vma, unsigned long addr)
1648 { 1648 {
1649 return stack_guard_page_start(vma, addr) || 1649 return stack_guard_page_start(vma, addr) ||
1650 stack_guard_page_end(vma, addr+PAGE_SIZE); 1650 stack_guard_page_end(vma, addr+PAGE_SIZE);
1651 } 1651 }
1652 1652
1653 /** 1653 /**
1654 * __get_user_pages() - pin user pages in memory 1654 * __get_user_pages() - pin user pages in memory
1655 * @tsk: task_struct of target task 1655 * @tsk: task_struct of target task
1656 * @mm: mm_struct of target mm 1656 * @mm: mm_struct of target mm
1657 * @start: starting user address 1657 * @start: starting user address
1658 * @nr_pages: number of pages from start to pin 1658 * @nr_pages: number of pages from start to pin
1659 * @gup_flags: flags modifying pin behaviour 1659 * @gup_flags: flags modifying pin behaviour
1660 * @pages: array that receives pointers to the pages pinned. 1660 * @pages: array that receives pointers to the pages pinned.
1661 * Should be at least nr_pages long. Or NULL, if caller 1661 * Should be at least nr_pages long. Or NULL, if caller
1662 * only intends to ensure the pages are faulted in. 1662 * only intends to ensure the pages are faulted in.
1663 * @vmas: array of pointers to vmas corresponding to each page. 1663 * @vmas: array of pointers to vmas corresponding to each page.
1664 * Or NULL if the caller does not require them. 1664 * Or NULL if the caller does not require them.
1665 * @nonblocking: whether waiting for disk IO or mmap_sem contention 1665 * @nonblocking: whether waiting for disk IO or mmap_sem contention
1666 * 1666 *
1667 * Returns number of pages pinned. This may be fewer than the number 1667 * Returns number of pages pinned. This may be fewer than the number
1668 * requested. If nr_pages is 0 or negative, returns 0. If no pages 1668 * requested. If nr_pages is 0 or negative, returns 0. If no pages
1669 * were pinned, returns -errno. Each page returned must be released 1669 * were pinned, returns -errno. Each page returned must be released
1670 * with a put_page() call when it is finished with. vmas will only 1670 * with a put_page() call when it is finished with. vmas will only
1671 * remain valid while mmap_sem is held. 1671 * remain valid while mmap_sem is held.
1672 * 1672 *
1673 * Must be called with mmap_sem held for read or write. 1673 * Must be called with mmap_sem held for read or write.
1674 * 1674 *
1675 * __get_user_pages walks a process's page tables and takes a reference to 1675 * __get_user_pages walks a process's page tables and takes a reference to
1676 * each struct page that each user address corresponds to at a given 1676 * each struct page that each user address corresponds to at a given
1677 * instant. That is, it takes the page that would be accessed if a user 1677 * instant. That is, it takes the page that would be accessed if a user
1678 * thread accesses the given user virtual address at that instant. 1678 * thread accesses the given user virtual address at that instant.
1679 * 1679 *
1680 * This does not guarantee that the page exists in the user mappings when 1680 * This does not guarantee that the page exists in the user mappings when
1681 * __get_user_pages returns, and there may even be a completely different 1681 * __get_user_pages returns, and there may even be a completely different
1682 * page there in some cases (eg. if mmapped pagecache has been invalidated 1682 * page there in some cases (eg. if mmapped pagecache has been invalidated
1683 * and subsequently re-faulted). However it does guarantee that the page 1683 * and subsequently re-faulted). However it does guarantee that the page

1684 * won't be freed completely. And mostly callers simply care that the page 1684 * won't be freed completely. And mostly callers simply care that the page
1685 * contains data that was valid *at some point in time*. Typically, an IO 1685 * contains data that was valid *at some point in time*. Typically, an IO
1686 * or similar operation cannot guarantee anything stronger anyway because 1686 * or similar operation cannot guarantee anything stronger anyway because
1687 * locks can't be held over the syscall boundary. 1687 * locks can't be held over the syscall boundary.
1688 * 1688 *
1689 * If @gup_flags & FOLL_WRITE == 0, the page must not be written to. If 1689 * If @gup_flags & FOLL_WRITE == 0, the page must not be written to. If
1690 * the page is written to, set_page_dirty (or set_page_dirty_lock, as 1690 * the page is written to, set_page_dirty (or set_page_dirty_lock, as
1691 * appropriate) must be called after the page is finished with, and 1691 * appropriate) must be called after the page is finished with, and
1692 * before put_page is called. 1692 * before put_page is called.
1693 * 1693 *
1694 * If @nonblocking != NULL, __get_user_pages will not wait for disk IO 1694 * If @nonblocking != NULL, __get_user_pages will not wait for disk IO
1695 * or mmap_sem contention, and if waiting is needed to pin all pages, 1695 * or mmap_sem contention, and if waiting is needed to pin all pages,
1696 * *@nonblocking will be set to 0. 1696 * *@nonblocking will be set to 0.
1697 * 1697 *
1698 * In most cases, get_user_pages or get_user_pages_fast should be used 1698 * In most cases, get_user_pages or get_user_pages_fast should be used
1699 * instead of __get_user_pages. __get_user_pages should be used only if 1699 * instead of __get_user_pages. __get_user_pages should be used only if
1700 * you need some special @gup_flags. 1700 * you need some special @gup_flags.
1701 */ 1701 */
1702 long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, 1702 long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1703 unsigned long start, unsigned long nr_pages, 1703 unsigned long start, unsigned long nr_pages,
1704 unsigned int gup_flags, struct page **pages, 1704 unsigned int gup_flags, struct page **pages,
1705 struct vm_area_struct **vmas, int *nonblocking) 1705 struct vm_area_struct **vmas, int *nonblocking)
1706 { 1706 {
1707 long i; 1707 long i;
1708 unsigned long vm_flags; 1708 unsigned long vm_flags;
1709 unsigned int page_mask; 1709 unsigned int page_mask;
1710 1710
1711 if (!nr_pages) 1711 if (!nr_pages)
1712 return 0; 1712 return 0;
1713 1713
1714 VM_BUG_ON(!!pages != !!(gup_flags & FOLL_GET)); 1714 VM_BUG_ON(!!pages != !!(gup_flags & FOLL_GET));
1715 1715
1716 /* 1716 /*
1717 * Require read or write permissions. 1717 * Require read or write permissions.
1718 * If FOLL_FORCE is set, we only require the "MAY" flags. 1718 * If FOLL_FORCE is set, we only require the "MAY" flags.
1719 */ 1719 */
1720 vm_flags = (gup_flags & FOLL_WRITE) ? 1720 vm_flags = (gup_flags & FOLL_WRITE) ?
1721 (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD); 1721 (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD);
1722 vm_flags &= (gup_flags & FOLL_FORCE) ? 1722 vm_flags &= (gup_flags & FOLL_FORCE) ?
1723 (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE); 1723 (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE);
1724 1724
1725 /* 1725 /*
1726 * If FOLL_FORCE and FOLL_NUMA are both set, handle_mm_fault 1726 * If FOLL_FORCE and FOLL_NUMA are both set, handle_mm_fault
1727 * would be called on PROT_NONE ranges. We must never invoke 1727 * would be called on PROT_NONE ranges. We must never invoke
1728 * handle_mm_fault on PROT_NONE ranges or the NUMA hinting 1728 * handle_mm_fault on PROT_NONE ranges or the NUMA hinting
1729 * page faults would unprotect the PROT_NONE ranges if 1729 * page faults would unprotect the PROT_NONE ranges if
1730 * _PAGE_NUMA and _PAGE_PROTNONE are sharing the same pte/pmd 1730 * _PAGE_NUMA and _PAGE_PROTNONE are sharing the same pte/pmd
1731 * bitflag. So to avoid that, don't set FOLL_NUMA if 1731 * bitflag. So to avoid that, don't set FOLL_NUMA if
1732 * FOLL_FORCE is set. 1732 * FOLL_FORCE is set.
1733 */ 1733 */
1734 if (!(gup_flags & FOLL_FORCE)) 1734 if (!(gup_flags & FOLL_FORCE))
1735 gup_flags |= FOLL_NUMA; 1735 gup_flags |= FOLL_NUMA;
1736 1736
1737 i = 0; 1737 i = 0;
1738 1738
1739 do { 1739 do {
1740 struct vm_area_struct *vma; 1740 struct vm_area_struct *vma;
1741 1741
1742 vma = find_extend_vma(mm, start); 1742 vma = find_extend_vma(mm, start);
1743 if (!vma && in_gate_area(mm, start)) { 1743 if (!vma && in_gate_area(mm, start)) {
1744 unsigned long pg = start & PAGE_MASK; 1744 unsigned long pg = start & PAGE_MASK;
1745 pgd_t *pgd; 1745 pgd_t *pgd;
1746 pud_t *pud; 1746 pud_t *pud;
1747 pmd_t *pmd; 1747 pmd_t *pmd;
1748 pte_t *pte; 1748 pte_t *pte;
1749 1749
1750 /* user gate pages are read-only */ 1750 /* user gate pages are read-only */
1751 if (gup_flags & FOLL_WRITE) 1751 if (gup_flags & FOLL_WRITE)
1752 return i ? : -EFAULT; 1752 return i ? : -EFAULT;
1753 if (pg > TASK_SIZE) 1753 if (pg > TASK_SIZE)
1754 pgd = pgd_offset_k(pg); 1754 pgd = pgd_offset_k(pg);
1755 else 1755 else
1756 pgd = pgd_offset_gate(mm, pg); 1756 pgd = pgd_offset_gate(mm, pg);
1757 BUG_ON(pgd_none(*pgd)); 1757 BUG_ON(pgd_none(*pgd));
1758 pud = pud_offset(pgd, pg); 1758 pud = pud_offset(pgd, pg);
1759 BUG_ON(pud_none(*pud)); 1759 BUG_ON(pud_none(*pud));
1760 pmd = pmd_offset(pud, pg); 1760 pmd = pmd_offset(pud, pg);
1761 if (pmd_none(*pmd)) 1761 if (pmd_none(*pmd))
1762 return i ? : -EFAULT; 1762 return i ? : -EFAULT;
1763 VM_BUG_ON(pmd_trans_huge(*pmd)); 1763 VM_BUG_ON(pmd_trans_huge(*pmd));
1764 pte = pte_offset_map(pmd, pg); 1764 pte = pte_offset_map(pmd, pg);
1765 if (pte_none(*pte)) { 1765 if (pte_none(*pte)) {
1766 pte_unmap(pte); 1766 pte_unmap(pte);
1767 return i ? : -EFAULT; 1767 return i ? : -EFAULT;
1768 } 1768 }
1769 vma = get_gate_vma(mm); 1769 vma = get_gate_vma(mm);
1770 if (pages) { 1770 if (pages) {
1771 struct page *page; 1771 struct page *page;
1772 1772
1773 page = vm_normal_page(vma, start, *pte); 1773 page = vm_normal_page(vma, start, *pte);
1774 if (!page) { 1774 if (!page) {
1775 if (!(gup_flags & FOLL_DUMP) && 1775 if (!(gup_flags & FOLL_DUMP) &&
1776 is_zero_pfn(pte_pfn(*pte))) 1776 is_zero_pfn(pte_pfn(*pte)))
1777 page = pte_page(*pte); 1777 page = pte_page(*pte);
1778 else { 1778 else {
1779 pte_unmap(pte); 1779 pte_unmap(pte);
1780 return i ? : -EFAULT; 1780 return i ? : -EFAULT;
1781 } 1781 }
1782 } 1782 }
1783 pages[i] = page; 1783 pages[i] = page;
1784 get_page(page); 1784 get_page(page);
1785 } 1785 }
1786 pte_unmap(pte); 1786 pte_unmap(pte);
1787 page_mask = 0; 1787 page_mask = 0;
1788 goto next_page; 1788 goto next_page;
1789 } 1789 }
1790 1790
1791 if (!vma || 1791 if (!vma ||
1792 (vma->vm_flags & (VM_IO | VM_PFNMAP)) || 1792 (vma->vm_flags & (VM_IO | VM_PFNMAP)) ||
1793 !(vm_flags & vma->vm_flags)) 1793 !(vm_flags & vma->vm_flags))
1794 return i ? : -EFAULT; 1794 return i ? : -EFAULT;
1795 1795
1796 if (is_vm_hugetlb_page(vma)) { 1796 if (is_vm_hugetlb_page(vma)) {
1797 i = follow_hugetlb_page(mm, vma, pages, vmas, 1797 i = follow_hugetlb_page(mm, vma, pages, vmas,
1798 &start, &nr_pages, i, gup_flags); 1798 &start, &nr_pages, i, gup_flags);
1799 continue; 1799 continue;
1800 } 1800 }
1801 1801
1802 do { 1802 do {
1803 struct page *page; 1803 struct page *page;
1804 unsigned int foll_flags = gup_flags; 1804 unsigned int foll_flags = gup_flags;
1805 unsigned int page_increm; 1805 unsigned int page_increm;
1806 1806
1807 /* 1807 /*
1808 * If we have a pending SIGKILL, don't keep faulting 1808 * If we have a pending SIGKILL, don't keep faulting
1809 * pages and potentially allocating memory. 1809 * pages and potentially allocating memory.
1810 */ 1810 */
1811 if (unlikely(fatal_signal_pending(current))) 1811 if (unlikely(fatal_signal_pending(current)))
1812 return i ? i : -ERESTARTSYS; 1812 return i ? i : -ERESTARTSYS;
1813 1813
1814 cond_resched(); 1814 cond_resched();
1815 while (!(page = follow_page_mask(vma, start, 1815 while (!(page = follow_page_mask(vma, start,
1816 foll_flags, &page_mask))) { 1816 foll_flags, &page_mask))) {
1817 int ret; 1817 int ret;
1818 unsigned int fault_flags = 0; 1818 unsigned int fault_flags = 0;
1819 1819
1820 /* For mlock, just skip the stack guard page. */ 1820 /* For mlock, just skip the stack guard page. */
1821 if (foll_flags & FOLL_MLOCK) { 1821 if (foll_flags & FOLL_MLOCK) {
1822 if (stack_guard_page(vma, start)) 1822 if (stack_guard_page(vma, start))
1823 goto next_page; 1823 goto next_page;
1824 } 1824 }
1825 if (foll_flags & FOLL_WRITE) 1825 if (foll_flags & FOLL_WRITE)
1826 fault_flags |= FAULT_FLAG_WRITE; 1826 fault_flags |= FAULT_FLAG_WRITE;
1827 if (nonblocking) 1827 if (nonblocking)
1828 fault_flags |= FAULT_FLAG_ALLOW_RETRY; 1828 fault_flags |= FAULT_FLAG_ALLOW_RETRY;
1829 if (foll_flags & FOLL_NOWAIT) 1829 if (foll_flags & FOLL_NOWAIT)
1830 fault_flags |= (FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_RETRY_NOWAIT); 1830 fault_flags |= (FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_RETRY_NOWAIT);
1831 1831
1832 ret = handle_mm_fault(mm, vma, start, 1832 ret = handle_mm_fault(mm, vma, start,
1833 fault_flags); 1833 fault_flags);
1834 1834
1835 if (ret & VM_FAULT_ERROR) { 1835 if (ret & VM_FAULT_ERROR) {
1836 if (ret & VM_FAULT_OOM) 1836 if (ret & VM_FAULT_OOM)
1837 return i ? i : -ENOMEM; 1837 return i ? i : -ENOMEM;
1838 if (ret & (VM_FAULT_HWPOISON | 1838 if (ret & (VM_FAULT_HWPOISON |
1839 VM_FAULT_HWPOISON_LARGE)) { 1839 VM_FAULT_HWPOISON_LARGE)) {
1840 if (i) 1840 if (i)
1841 return i; 1841 return i;
1842 else if (gup_flags & FOLL_HWPOISON) 1842 else if (gup_flags & FOLL_HWPOISON)
1843 return -EHWPOISON; 1843 return -EHWPOISON;
1844 else 1844 else
1845 return -EFAULT; 1845 return -EFAULT;
1846 } 1846 }
1847 if (ret & VM_FAULT_SIGBUS) 1847 if (ret & VM_FAULT_SIGBUS)
1848 return i ? i : -EFAULT; 1848 return i ? i : -EFAULT;
1849 BUG(); 1849 BUG();
1850 } 1850 }
1851 1851
1852 if (tsk) { 1852 if (tsk) {
1853 if (ret & VM_FAULT_MAJOR) 1853 if (ret & VM_FAULT_MAJOR)
1854 tsk->maj_flt++; 1854 tsk->maj_flt++;
1855 else 1855 else
1856 tsk->min_flt++; 1856 tsk->min_flt++;
1857 } 1857 }
1858 1858
1859 if (ret & VM_FAULT_RETRY) { 1859 if (ret & VM_FAULT_RETRY) {
1860 if (nonblocking) 1860 if (nonblocking)
1861 *nonblocking = 0; 1861 *nonblocking = 0;
1862 return i; 1862 return i;
1863 } 1863 }
1864 1864
1865 /* 1865 /*
1866 * The VM_FAULT_WRITE bit tells us that 1866 * The VM_FAULT_WRITE bit tells us that
1867 * do_wp_page has broken COW when necessary, 1867 * do_wp_page has broken COW when necessary,
1868 * even if maybe_mkwrite decided not to set 1868 * even if maybe_mkwrite decided not to set
1869 * pte_write. We can thus safely do subsequent 1869 * pte_write. We can thus safely do subsequent
1870 * page lookups as if they were reads. But only 1870 * page lookups as if they were reads. But only
1871 * do so when looping for pte_write is futile: 1871 * do so when looping for pte_write is futile:
1872 * in some cases userspace may also be wanting 1872 * in some cases userspace may also be wanting
1873 * to write to the gotten user page, which a 1873 * to write to the gotten user page, which a
1874 * read fault here might prevent (a readonly 1874 * read fault here might prevent (a readonly
1875 * page might get reCOWed by userspace write). 1875 * page might get reCOWed by userspace write).
1876 */ 1876 */
1877 if ((ret & VM_FAULT_WRITE) && 1877 if ((ret & VM_FAULT_WRITE) &&
1878 !(vma->vm_flags & VM_WRITE)) 1878 !(vma->vm_flags & VM_WRITE))
1879 foll_flags &= ~FOLL_WRITE; 1879 foll_flags &= ~FOLL_WRITE;
1880 1880
1881 cond_resched(); 1881 cond_resched();
1882 } 1882 }
1883 if (IS_ERR(page)) 1883 if (IS_ERR(page))
1884 return i ? i : PTR_ERR(page); 1884 return i ? i : PTR_ERR(page);
1885 if (pages) { 1885 if (pages) {
1886 pages[i] = page; 1886 pages[i] = page;
1887 1887
1888 flush_anon_page(vma, page, start); 1888 flush_anon_page(vma, page, start);
1889 flush_dcache_page(page); 1889 flush_dcache_page(page);
1890 page_mask = 0; 1890 page_mask = 0;
1891 } 1891 }
1892 next_page: 1892 next_page:
1893 if (vmas) { 1893 if (vmas) {
1894 vmas[i] = vma; 1894 vmas[i] = vma;
1895 page_mask = 0; 1895 page_mask = 0;
1896 } 1896 }
1897 page_increm = 1 + (~(start >> PAGE_SHIFT) & page_mask); 1897 page_increm = 1 + (~(start >> PAGE_SHIFT) & page_mask);
1898 if (page_increm > nr_pages) 1898 if (page_increm > nr_pages)
1899 page_increm = nr_pages; 1899 page_increm = nr_pages;
1900 i += page_increm; 1900 i += page_increm;
1901 start += page_increm * PAGE_SIZE; 1901 start += page_increm * PAGE_SIZE;
1902 nr_pages -= page_increm; 1902 nr_pages -= page_increm;
1903 } while (nr_pages && start < vma->vm_end); 1903 } while (nr_pages && start < vma->vm_end);
1904 } while (nr_pages); 1904 } while (nr_pages);
1905 return i; 1905 return i;
1906 } 1906 }
1907 EXPORT_SYMBOL(__get_user_pages); 1907 EXPORT_SYMBOL(__get_user_pages);
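
The rules spelled out in the comment above (hold mmap_sem, release every page with put_page(), and mark pages dirty before releasing them if they were written) translate into the classic pinning pattern below. This is an illustrative driver-style sketch using get_user_pages_fast(); the helper name and flow are assumptions, not code from this file:

#include <linux/mm.h>

/*
 * Illustrative only: pin 'nr' user pages that will be written on the user's
 * behalf, then dirty and release them. Error handling is minimal.
 */
static int example_pin_and_release(unsigned long uaddr, int nr,
                                   struct page **pages)
{
        int i, pinned;

        /* write=1: we intend to modify the pages */
        pinned = get_user_pages_fast(uaddr, nr, 1, pages);
        if (pinned <= 0)
                return pinned ? pinned : -EFAULT;

        /* ... DMA or memcpy into the pinned pages would happen here ... */

        for (i = 0; i < pinned; i++) {
                set_page_dirty_lock(pages[i]);  /* dirty before put_page() */
                put_page(pages[i]);
        }
        return pinned;
}
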
1908 1908
1909 /* 1909 /*
1910 * fixup_user_fault() - manually resolve a user page fault 1910 * fixup_user_fault() - manually resolve a user page fault
1911 * @tsk: the task_struct to use for page fault accounting, or 1911 * @tsk: the task_struct to use for page fault accounting, or
1912 * NULL if faults are not to be recorded. 1912 * NULL if faults are not to be recorded.
1913 * @mm: mm_struct of target mm 1913 * @mm: mm_struct of target mm
1914 * @address: user address 1914 * @address: user address
1915 * @fault_flags:flags to pass down to handle_mm_fault() 1915 * @fault_flags:flags to pass down to handle_mm_fault()
1916 * 1916 *
1917 * This is meant to be called in the specific scenario where, for locking 1917 * This is meant to be called in the specific scenario where, for locking
1918 * reasons, we try to access user memory in atomic context (within a 1918 * reasons, we try to access user memory in atomic context (within a
1919 * pagefault_disable() section), the access fails with -EFAULT, and we want 1919 * pagefault_disable() section), the access fails with -EFAULT, and we want
1920 * to resolve the user fault before trying again. 1920 * to resolve the user fault before trying again.
1921 * 1921 *
1922 * Typically this is meant to be used by the futex code. 1922 * Typically this is meant to be used by the futex code.
1923 * 1923 *
1924 * The main difference with get_user_pages() is that this function will 1924 * The main difference with get_user_pages() is that this function will
1925 * unconditionally call handle_mm_fault() which will in turn perform all the 1925 * unconditionally call handle_mm_fault() which will in turn perform all the
1926 * necessary SW fixup of the dirty and young bits in the PTE, while 1926 * necessary SW fixup of the dirty and young bits in the PTE, while
1927 * get_user_pages() only guarantees to update these in the struct page. 1927 * get_user_pages() only guarantees to update these in the struct page.
1928 * 1928 *
1929 * This is important for some architectures where those bits also gate the 1929 * This is important for some architectures where those bits also gate the
1930 * access permission to the page because they are maintained in software. On 1930 * access permission to the page because they are maintained in software. On
1931 * such architectures, gup() will not be enough to make a subsequent access 1931 * such architectures, gup() will not be enough to make a subsequent access
1932 * succeed. 1932 * succeed.
1933 * 1933 *
1934 * This should be called with the mmap_sem held for read. 1934 * This should be called with the mmap_sem held for read.
1935 */ 1935 */
1936 int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm, 1936 int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm,
1937 unsigned long address, unsigned int fault_flags) 1937 unsigned long address, unsigned int fault_flags)
1938 { 1938 {
1939 struct vm_area_struct *vma; 1939 struct vm_area_struct *vma;
1940 int ret; 1940 int ret;
1941 1941
1942 vma = find_extend_vma(mm, address); 1942 vma = find_extend_vma(mm, address);
1943 if (!vma || address < vma->vm_start) 1943 if (!vma || address < vma->vm_start)
1944 return -EFAULT; 1944 return -EFAULT;
1945 1945
1946 ret = handle_mm_fault(mm, vma, address, fault_flags); 1946 ret = handle_mm_fault(mm, vma, address, fault_flags);
1947 if (ret & VM_FAULT_ERROR) { 1947 if (ret & VM_FAULT_ERROR) {
1948 if (ret & VM_FAULT_OOM) 1948 if (ret & VM_FAULT_OOM)
1949 return -ENOMEM; 1949 return -ENOMEM;
1950 if (ret & (VM_FAULT_HWPOISON | VM_FAULT_HWPOISON_LARGE)) 1950 if (ret & (VM_FAULT_HWPOISON | VM_FAULT_HWPOISON_LARGE))
1951 return -EHWPOISON; 1951 return -EHWPOISON;
1952 if (ret & VM_FAULT_SIGBUS) 1952 if (ret & VM_FAULT_SIGBUS)
1953 return -EFAULT; 1953 return -EFAULT;
1954 BUG(); 1954 BUG();
1955 } 1955 }
1956 if (tsk) { 1956 if (tsk) {
1957 if (ret & VM_FAULT_MAJOR) 1957 if (ret & VM_FAULT_MAJOR)
1958 tsk->maj_flt++; 1958 tsk->maj_flt++;
1959 else 1959 else
1960 tsk->min_flt++; 1960 tsk->min_flt++;
1961 } 1961 }
1962 return 0; 1962 return 0;
1963 } 1963 }
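
The typical consumer is the futex code: an atomic user access inside a pagefault_disable() section fails with -EFAULT, and the slow path then resolves the fault explicitly before retrying. A sketch in that spirit (modelled on, but not copied from, kernel/futex.c; the helper name is invented):

#include <linux/mm.h>
#include <linux/sched.h>

/*
 * Illustrative only: make one user word writable again after an atomic
 * access failed, so the caller can retry its pagefault_disable()d access.
 */
static int example_fault_in_writeable(u32 __user *uaddr)
{
        struct mm_struct *mm = current->mm;
        int ret;

        down_read(&mm->mmap_sem);
        ret = fixup_user_fault(current, mm, (unsigned long)uaddr,
                               FAULT_FLAG_WRITE);
        up_read(&mm->mmap_sem);

        return ret < 0 ? ret : 0;
}
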
1964 1964
1965 /* 1965 /*
1966 * get_user_pages() - pin user pages in memory 1966 * get_user_pages() - pin user pages in memory
1967 * @tsk: the task_struct to use for page fault accounting, or 1967 * @tsk: the task_struct to use for page fault accounting, or
1968 * NULL if faults are not to be recorded. 1968 * NULL if faults are not to be recorded.
1969 * @mm: mm_struct of target mm 1969 * @mm: mm_struct of target mm
1970 * @start: starting user address 1970 * @start: starting user address
1971 * @nr_pages: number of pages from start to pin 1971 * @nr_pages: number of pages from start to pin
1972 * @write: whether pages will be written to by the caller 1972 * @write: whether pages will be written to by the caller
1973 * @force: whether to force write access even if user mapping is 1973 * @force: whether to force write access even if user mapping is
1974 * readonly. This will result in the page being COWed even 1974 * readonly. This will result in the page being COWed even
1975 * in MAP_SHARED mappings. You do not want this. 1975 * in MAP_SHARED mappings. You do not want this.
1976 * @pages: array that receives pointers to the pages pinned. 1976 * @pages: array that receives pointers to the pages pinned.
1977 * Should be at least nr_pages long. Or NULL, if caller 1977 * Should be at least nr_pages long. Or NULL, if caller
1978 * only intends to ensure the pages are faulted in. 1978 * only intends to ensure the pages are faulted in.
1979 * @vmas: array of pointers to vmas corresponding to each page. 1979 * @vmas: array of pointers to vmas corresponding to each page.
1980 * Or NULL if the caller does not require them. 1980 * Or NULL if the caller does not require them.
1981 * 1981 *
1982 * Returns number of pages pinned. This may be fewer than the number 1982 * Returns number of pages pinned. This may be fewer than the number
1983 * requested. If nr_pages is 0 or negative, returns 0. If no pages 1983 * requested. If nr_pages is 0 or negative, returns 0. If no pages
1984 * were pinned, returns -errno. Each page returned must be released 1984 * were pinned, returns -errno. Each page returned must be released
1985 * with a put_page() call when it is finished with. vmas will only 1985 * with a put_page() call when it is finished with. vmas will only
1986 * remain valid while mmap_sem is held. 1986 * remain valid while mmap_sem is held.
1987 * 1987 *
1988 * Must be called with mmap_sem held for read or write. 1988 * Must be called with mmap_sem held for read or write.
1989 * 1989 *
1990 * get_user_pages walks a process's page tables and takes a reference to 1990 * get_user_pages walks a process's page tables and takes a reference to
1991 * each struct page that each user address corresponds to at a given 1991 * each struct page that each user address corresponds to at a given
1992 * instant. That is, it takes the page that would be accessed if a user 1992 * instant. That is, it takes the page that would be accessed if a user
1993 * thread accesses the given user virtual address at that instant. 1993 * thread accesses the given user virtual address at that instant.
1994 * 1994 *
1995 * This does not guarantee that the page exists in the user mappings when 1995 * This does not guarantee that the page exists in the user mappings when
1996 * get_user_pages returns, and there may even be a completely different 1996 * get_user_pages returns, and there may even be a completely different
1997 * page there in some cases (eg. if mmapped pagecache has been invalidated 1997 * page there in some cases (eg. if mmapped pagecache has been invalidated
1998 * and subsequently re-faulted). However it does guarantee that the page 1998 * and subsequently re-faulted). However it does guarantee that the page
1999 * won't be freed completely. And mostly callers simply care that the page 1999 * won't be freed completely. And mostly callers simply care that the page
2000 * contains data that was valid *at some point in time*. Typically, an IO 2000 * contains data that was valid *at some point in time*. Typically, an IO
2001 * or similar operation cannot guarantee anything stronger anyway because 2001 * or similar operation cannot guarantee anything stronger anyway because
2002 * locks can't be held over the syscall boundary. 2002 * locks can't be held over the syscall boundary.
2003 * 2003 *
2004 * If write=0, the page must not be written to. If the page is written to, 2004 * If write=0, the page must not be written to. If the page is written to,
2005 * set_page_dirty (or set_page_dirty_lock, as appropriate) must be called 2005 * set_page_dirty (or set_page_dirty_lock, as appropriate) must be called
2006 * after the page is finished with, and before put_page is called. 2006 * after the page is finished with, and before put_page is called.
2007 * 2007 *
2008 * get_user_pages is typically used for fewer-copy IO operations, to get a 2008 * get_user_pages is typically used for fewer-copy IO operations, to get a
2009 * handle on the memory by some means other than accesses via the user virtual 2009 * handle on the memory by some means other than accesses via the user virtual
2010 * addresses. The pages may be submitted for DMA to devices or accessed via 2010 * addresses. The pages may be submitted for DMA to devices or accessed via
2011 * their kernel linear mapping (via the kmap APIs). Care should be taken to 2011 * their kernel linear mapping (via the kmap APIs). Care should be taken to
2012 * use the correct cache flushing APIs. 2012 * use the correct cache flushing APIs.
2013 * 2013 *
2014 * See also get_user_pages_fast, for performance critical applications. 2014 * See also get_user_pages_fast, for performance critical applications.
2015 */ 2015 */
2016 long get_user_pages(struct task_struct *tsk, struct mm_struct *mm, 2016 long get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
2017 unsigned long start, unsigned long nr_pages, int write, 2017 unsigned long start, unsigned long nr_pages, int write,
2018 int force, struct page **pages, struct vm_area_struct **vmas) 2018 int force, struct page **pages, struct vm_area_struct **vmas)
2019 { 2019 {
2020 int flags = FOLL_TOUCH; 2020 int flags = FOLL_TOUCH;
2021 2021
2022 if (pages) 2022 if (pages)
2023 flags |= FOLL_GET; 2023 flags |= FOLL_GET;
2024 if (write) 2024 if (write)
2025 flags |= FOLL_WRITE; 2025 flags |= FOLL_WRITE;
2026 if (force) 2026 if (force)
2027 flags |= FOLL_FORCE; 2027 flags |= FOLL_FORCE;
2028 2028
2029 return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas, 2029 return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas,
2030 NULL); 2030 NULL);
2031 } 2031 }
2032 EXPORT_SYMBOL(get_user_pages); 2032 EXPORT_SYMBOL(get_user_pages);
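/*
 * A minimal usage sketch of get_user_pages(): pin a user buffer, touch it
 * through the kernel mapping, then dirty and release every page as the
 * comment above requires.  Purely illustrative; example_pin_and_zero() is
 * a placeholder name and error handling is kept to the bare minimum.
 */
#include <linux/highmem.h>
#include <linux/mm.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/string.h>

static long example_pin_and_zero(unsigned long start, unsigned long nr_pages)
{
	struct page **pages;
	long pinned, i;

	pages = kcalloc(nr_pages, sizeof(*pages), GFP_KERNEL);
	if (!pages)
		return -ENOMEM;

	down_read(&current->mm->mmap_sem);
	pinned = get_user_pages(current, current->mm, start, nr_pages,
				1 /* write */, 0 /* force */, pages, NULL);
	up_read(&current->mm->mmap_sem);

	for (i = 0; i < pinned; i++) {
		void *kaddr = kmap(pages[i]);

		memset(kaddr, 0, PAGE_SIZE);	/* access via kernel mapping */
		kunmap(pages[i]);
		flush_dcache_page(pages[i]);
		set_page_dirty_lock(pages[i]);	/* we wrote to the page */
		put_page(pages[i]);		/* drop the pin */
	}
	kfree(pages);
	return pinned < 0 ? pinned : 0;
}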
2033 2033
2034 /** 2034 /**
2035 * get_dump_page() - pin user page in memory while writing it to core dump 2035 * get_dump_page() - pin user page in memory while writing it to core dump
2036 * @addr: user address 2036 * @addr: user address
2037 * 2037 *
2038 * Returns struct page pointer of user page pinned for dump, 2038 * Returns struct page pointer of user page pinned for dump,
2039 * to be freed afterwards by page_cache_release() or put_page(). 2039 * to be freed afterwards by page_cache_release() or put_page().
2040 * 2040 *
2041 * Returns NULL on any kind of failure - a hole must then be inserted into 2041 * Returns NULL on any kind of failure - a hole must then be inserted into
2042 * the corefile, to preserve alignment with its headers; and also returns 2042 * the corefile, to preserve alignment with its headers; and also returns
2043 * NULL wherever the ZERO_PAGE, or an anonymous pte_none, has been found - 2043 * NULL wherever the ZERO_PAGE, or an anonymous pte_none, has been found -
2044 * allowing a hole to be left in the corefile to save diskspace. 2044 * allowing a hole to be left in the corefile to save diskspace.
2045 * 2045 *
2046 * Called without mmap_sem, but after all other threads have been killed. 2046 * Called without mmap_sem, but after all other threads have been killed.
2047 */ 2047 */
2048 #ifdef CONFIG_ELF_CORE 2048 #ifdef CONFIG_ELF_CORE
2049 struct page *get_dump_page(unsigned long addr) 2049 struct page *get_dump_page(unsigned long addr)
2050 { 2050 {
2051 struct vm_area_struct *vma; 2051 struct vm_area_struct *vma;
2052 struct page *page; 2052 struct page *page;
2053 2053
2054 if (__get_user_pages(current, current->mm, addr, 1, 2054 if (__get_user_pages(current, current->mm, addr, 1,
2055 FOLL_FORCE | FOLL_DUMP | FOLL_GET, &page, &vma, 2055 FOLL_FORCE | FOLL_DUMP | FOLL_GET, &page, &vma,
2056 NULL) < 1) 2056 NULL) < 1)
2057 return NULL; 2057 return NULL;
2058 flush_cache_page(vma, addr, page_to_pfn(page)); 2058 flush_cache_page(vma, addr, page_to_pfn(page));
2059 return page; 2059 return page;
2060 } 2060 }
2061 #endif /* CONFIG_ELF_CORE */ 2061 #endif /* CONFIG_ELF_CORE */
2062 2062
2063 pte_t *__get_locked_pte(struct mm_struct *mm, unsigned long addr, 2063 pte_t *__get_locked_pte(struct mm_struct *mm, unsigned long addr,
2064 spinlock_t **ptl) 2064 spinlock_t **ptl)
2065 { 2065 {
2066 pgd_t * pgd = pgd_offset(mm, addr); 2066 pgd_t * pgd = pgd_offset(mm, addr);
2067 pud_t * pud = pud_alloc(mm, pgd, addr); 2067 pud_t * pud = pud_alloc(mm, pgd, addr);
2068 if (pud) { 2068 if (pud) {
2069 pmd_t * pmd = pmd_alloc(mm, pud, addr); 2069 pmd_t * pmd = pmd_alloc(mm, pud, addr);
2070 if (pmd) { 2070 if (pmd) {
2071 VM_BUG_ON(pmd_trans_huge(*pmd)); 2071 VM_BUG_ON(pmd_trans_huge(*pmd));
2072 return pte_alloc_map_lock(mm, pmd, addr, ptl); 2072 return pte_alloc_map_lock(mm, pmd, addr, ptl);
2073 } 2073 }
2074 } 2074 }
2075 return NULL; 2075 return NULL;
2076 } 2076 }
2077 2077
2078 /* 2078 /*
2079 * This is the old fallback for page remapping. 2079 * This is the old fallback for page remapping.
2080 * 2080 *
2081 * For historical reasons, it only allows reserved pages. Only 2081 * For historical reasons, it only allows reserved pages. Only
2082 * old drivers should use this, and they needed to mark their 2082 * old drivers should use this, and they needed to mark their
2083 * pages reserved for the old functions anyway. 2083 * pages reserved for the old functions anyway.
2084 */ 2084 */
2085 static int insert_page(struct vm_area_struct *vma, unsigned long addr, 2085 static int insert_page(struct vm_area_struct *vma, unsigned long addr,
2086 struct page *page, pgprot_t prot) 2086 struct page *page, pgprot_t prot)
2087 { 2087 {
2088 struct mm_struct *mm = vma->vm_mm; 2088 struct mm_struct *mm = vma->vm_mm;
2089 int retval; 2089 int retval;
2090 pte_t *pte; 2090 pte_t *pte;
2091 spinlock_t *ptl; 2091 spinlock_t *ptl;
2092 2092
2093 retval = -EINVAL; 2093 retval = -EINVAL;
2094 if (PageAnon(page)) 2094 if (PageAnon(page))
2095 goto out; 2095 goto out;
2096 retval = -ENOMEM; 2096 retval = -ENOMEM;
2097 flush_dcache_page(page); 2097 flush_dcache_page(page);
2098 pte = get_locked_pte(mm, addr, &ptl); 2098 pte = get_locked_pte(mm, addr, &ptl);
2099 if (!pte) 2099 if (!pte)
2100 goto out; 2100 goto out;
2101 retval = -EBUSY; 2101 retval = -EBUSY;
2102 if (!pte_none(*pte)) 2102 if (!pte_none(*pte))
2103 goto out_unlock; 2103 goto out_unlock;
2104 2104
2105 /* Ok, finally just insert the thing.. */ 2105 /* Ok, finally just insert the thing.. */
2106 get_page(page); 2106 get_page(page);
2107 inc_mm_counter_fast(mm, MM_FILEPAGES); 2107 inc_mm_counter_fast(mm, MM_FILEPAGES);
2108 page_add_file_rmap(page); 2108 page_add_file_rmap(page);
2109 set_pte_at(mm, addr, pte, mk_pte(page, prot)); 2109 set_pte_at(mm, addr, pte, mk_pte(page, prot));
2110 2110
2111 retval = 0; 2111 retval = 0;
2112 pte_unmap_unlock(pte, ptl); 2112 pte_unmap_unlock(pte, ptl);
2113 return retval; 2113 return retval;
2114 out_unlock: 2114 out_unlock:
2115 pte_unmap_unlock(pte, ptl); 2115 pte_unmap_unlock(pte, ptl);
2116 out: 2116 out:
2117 return retval; 2117 return retval;
2118 } 2118 }
2119 2119
2120 /** 2120 /**
2121 * vm_insert_page - insert single page into user vma 2121 * vm_insert_page - insert single page into user vma
2122 * @vma: user vma to map to 2122 * @vma: user vma to map to
2123 * @addr: target user address of this page 2123 * @addr: target user address of this page
2124 * @page: source kernel page 2124 * @page: source kernel page
2125 * 2125 *
2126 * This allows drivers to insert individual pages they've allocated 2126 * This allows drivers to insert individual pages they've allocated
2127 * into a user vma. 2127 * into a user vma.
2128 * 2128 *
2129 * The page has to be a nice clean _individual_ kernel allocation. 2129 * The page has to be a nice clean _individual_ kernel allocation.
2130 * If you allocate a compound page, you need to have marked it as 2130 * If you allocate a compound page, you need to have marked it as
2131 * such (__GFP_COMP), or manually just split the page up yourself 2131 * such (__GFP_COMP), or manually just split the page up yourself
2132 * (see split_page()). 2132 * (see split_page()).
2133 * 2133 *
2134 * NOTE! Traditionally this was done with "remap_pfn_range()" which 2134 * NOTE! Traditionally this was done with "remap_pfn_range()" which
2135 * took an arbitrary page protection parameter. This doesn't allow 2135 * took an arbitrary page protection parameter. This doesn't allow
2136 * that. Your vma protection will have to be set up correctly, which 2136 * that. Your vma protection will have to be set up correctly, which
2137 * means that if you want a shared writable mapping, you'd better 2137 * means that if you want a shared writable mapping, you'd better
2138 * ask for a shared writable mapping! 2138 * ask for a shared writable mapping!
2139 * 2139 *
2140 * The page does not need to be reserved. 2140 * The page does not need to be reserved.
2141 * 2141 *
2142 * Usually this function is called from f_op->mmap() handler 2142 * Usually this function is called from f_op->mmap() handler
2143 * under mm->mmap_sem write-lock, so it can change vma->vm_flags. 2143 * under mm->mmap_sem write-lock, so it can change vma->vm_flags.
2144 * Caller must set VM_MIXEDMAP on vma if it wants to call this 2144 * Caller must set VM_MIXEDMAP on vma if it wants to call this
2145 * function from other places, for example from page-fault handler. 2145 * function from other places, for example from page-fault handler.
2146 */ 2146 */
2147 int vm_insert_page(struct vm_area_struct *vma, unsigned long addr, 2147 int vm_insert_page(struct vm_area_struct *vma, unsigned long addr,
2148 struct page *page) 2148 struct page *page)
2149 { 2149 {
2150 if (addr < vma->vm_start || addr >= vma->vm_end) 2150 if (addr < vma->vm_start || addr >= vma->vm_end)
2151 return -EFAULT; 2151 return -EFAULT;
2152 if (!page_count(page)) 2152 if (!page_count(page))
2153 return -EINVAL; 2153 return -EINVAL;
2154 if (!(vma->vm_flags & VM_MIXEDMAP)) { 2154 if (!(vma->vm_flags & VM_MIXEDMAP)) {
2155 BUG_ON(down_read_trylock(&vma->vm_mm->mmap_sem)); 2155 BUG_ON(down_read_trylock(&vma->vm_mm->mmap_sem));
2156 BUG_ON(vma->vm_flags & VM_PFNMAP); 2156 BUG_ON(vma->vm_flags & VM_PFNMAP);
2157 vma->vm_flags |= VM_MIXEDMAP; 2157 vma->vm_flags |= VM_MIXEDMAP;
2158 } 2158 }
2159 return insert_page(vma, addr, page, vma->vm_page_prot); 2159 return insert_page(vma, addr, page, vma->vm_page_prot);
2160 } 2160 }
2161 EXPORT_SYMBOL(vm_insert_page); 2161 EXPORT_SYMBOL(vm_insert_page);
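/*
 * A minimal usage sketch of vm_insert_page() from an f_op->mmap handler,
 * as described above.  Illustrative only: my_buf_page is assumed to be a
 * single page obtained with alloc_page() at init time, and my_dev_mmap is
 * a placeholder name.
 */
#include <linux/fs.h>
#include <linux/mm.h>

static struct page *my_buf_page;	/* assumed: alloc_page() at init */

static int my_dev_mmap(struct file *file, struct vm_area_struct *vma)
{
	if (vma->vm_end - vma->vm_start < PAGE_SIZE)
		return -EINVAL;
	/* Called under mmap_sem for write, so vm_flags may be updated. */
	return vm_insert_page(vma, vma->vm_start, my_buf_page);
}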
2162 2162
2163 static int insert_pfn(struct vm_area_struct *vma, unsigned long addr, 2163 static int insert_pfn(struct vm_area_struct *vma, unsigned long addr,
2164 unsigned long pfn, pgprot_t prot) 2164 unsigned long pfn, pgprot_t prot)
2165 { 2165 {
2166 struct mm_struct *mm = vma->vm_mm; 2166 struct mm_struct *mm = vma->vm_mm;
2167 int retval; 2167 int retval;
2168 pte_t *pte, entry; 2168 pte_t *pte, entry;
2169 spinlock_t *ptl; 2169 spinlock_t *ptl;
2170 2170
2171 retval = -ENOMEM; 2171 retval = -ENOMEM;
2172 pte = get_locked_pte(mm, addr, &ptl); 2172 pte = get_locked_pte(mm, addr, &ptl);
2173 if (!pte) 2173 if (!pte)
2174 goto out; 2174 goto out;
2175 retval = -EBUSY; 2175 retval = -EBUSY;
2176 if (!pte_none(*pte)) 2176 if (!pte_none(*pte))
2177 goto out_unlock; 2177 goto out_unlock;
2178 2178
2179 /* Ok, finally just insert the thing.. */ 2179 /* Ok, finally just insert the thing.. */
2180 entry = pte_mkspecial(pfn_pte(pfn, prot)); 2180 entry = pte_mkspecial(pfn_pte(pfn, prot));
2181 set_pte_at(mm, addr, pte, entry); 2181 set_pte_at(mm, addr, pte, entry);
2182 update_mmu_cache(vma, addr, pte); /* XXX: why not for insert_page? */ 2182 update_mmu_cache(vma, addr, pte); /* XXX: why not for insert_page? */
2183 2183
2184 retval = 0; 2184 retval = 0;
2185 out_unlock: 2185 out_unlock:
2186 pte_unmap_unlock(pte, ptl); 2186 pte_unmap_unlock(pte, ptl);
2187 out: 2187 out:
2188 return retval; 2188 return retval;
2189 } 2189 }
2190 2190
2191 /** 2191 /**
2192 * vm_insert_pfn - insert single pfn into user vma 2192 * vm_insert_pfn - insert single pfn into user vma
2193 * @vma: user vma to map to 2193 * @vma: user vma to map to
2194 * @addr: target user address of this page 2194 * @addr: target user address of this page
2195 * @pfn: source kernel pfn 2195 * @pfn: source kernel pfn
2196 * 2196 *
2197 * Similar to vm_insert_page, this allows drivers to insert individual pages 2197 * Similar to vm_insert_page, this allows drivers to insert individual pages
2198 * they've allocated into a user vma. Same comments apply. 2198 * they've allocated into a user vma. Same comments apply.
2199 * 2199 *
2200 * This function should only be called from a vm_ops->fault handler, and 2200 * This function should only be called from a vm_ops->fault handler, and
2201 * in that case the handler should return NULL. 2201 * in that case the handler should return NULL.
2202 * 2202 *
2203 * vma cannot be a COW mapping. 2203 * vma cannot be a COW mapping.
2204 * 2204 *
2205 * As this is called only for pages that do not currently exist, we 2205 * As this is called only for pages that do not currently exist, we
2206 * do not need to flush old virtual caches or the TLB. 2206 * do not need to flush old virtual caches or the TLB.
2207 */ 2207 */
2208 int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr, 2208 int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr,
2209 unsigned long pfn) 2209 unsigned long pfn)
2210 { 2210 {
2211 int ret; 2211 int ret;
2212 pgprot_t pgprot = vma->vm_page_prot; 2212 pgprot_t pgprot = vma->vm_page_prot;
2213 /* 2213 /*
2214 * Technically, architectures with pte_special can avoid all these 2214 * Technically, architectures with pte_special can avoid all these
2215 * restrictions (same for remap_pfn_range). However we would like 2215 * restrictions (same for remap_pfn_range). However we would like
2216 * consistency in testing and feature parity among all, so we should 2216 * consistency in testing and feature parity among all, so we should
2217 * try to keep these invariants in place for everybody. 2217 * try to keep these invariants in place for everybody.
2218 */ 2218 */
2219 BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))); 2219 BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)));
2220 BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) == 2220 BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) ==
2221 (VM_PFNMAP|VM_MIXEDMAP)); 2221 (VM_PFNMAP|VM_MIXEDMAP));
2222 BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags)); 2222 BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags));
2223 BUG_ON((vma->vm_flags & VM_MIXEDMAP) && pfn_valid(pfn)); 2223 BUG_ON((vma->vm_flags & VM_MIXEDMAP) && pfn_valid(pfn));
2224 2224
2225 if (addr < vma->vm_start || addr >= vma->vm_end) 2225 if (addr < vma->vm_start || addr >= vma->vm_end)
2226 return -EFAULT; 2226 return -EFAULT;
2227 if (track_pfn_insert(vma, &pgprot, pfn)) 2227 if (track_pfn_insert(vma, &pgprot, pfn))
2228 return -EINVAL; 2228 return -EINVAL;
2229 2229
2230 ret = insert_pfn(vma, addr, pfn, pgprot); 2230 ret = insert_pfn(vma, addr, pfn, pgprot);
2231 2231
2232 return ret; 2232 return ret;
2233 } 2233 }
2234 EXPORT_SYMBOL(vm_insert_pfn); 2234 EXPORT_SYMBOL(vm_insert_pfn);
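/*
 * A minimal usage sketch of vm_insert_pfn() from a vm_ops->fault handler,
 * as suggested above.  Illustrative only: my_dev_phys is an assumed device
 * base address, range checking is omitted, and the vma is assumed to have
 * been set up with VM_IO | VM_PFNMAP at mmap time.
 */
#include <linux/mm.h>

static phys_addr_t my_dev_phys;		/* assumed device base address */

static int my_dev_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
	unsigned long pfn = (my_dev_phys >> PAGE_SHIFT) + vmf->pgoff;
	int ret;

	ret = vm_insert_pfn(vma, (unsigned long)vmf->virtual_address, pfn);
	switch (ret) {
	case 0:
	case -EBUSY:			/* raced with another fault: fine */
		return VM_FAULT_NOPAGE;	/* pte installed, no struct page */
	case -ENOMEM:
		return VM_FAULT_OOM;
	default:
		return VM_FAULT_SIGBUS;
	}
}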
2235 2235
2236 int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr, 2236 int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
2237 unsigned long pfn) 2237 unsigned long pfn)
2238 { 2238 {
2239 BUG_ON(!(vma->vm_flags & VM_MIXEDMAP)); 2239 BUG_ON(!(vma->vm_flags & VM_MIXEDMAP));
2240 2240
2241 if (addr < vma->vm_start || addr >= vma->vm_end) 2241 if (addr < vma->vm_start || addr >= vma->vm_end)
2242 return -EFAULT; 2242 return -EFAULT;
2243 2243
2244 /* 2244 /*
2245 * If we don't have pte special, then we have to use the pfn_valid() 2245 * If we don't have pte special, then we have to use the pfn_valid()
2246 * based VM_MIXEDMAP scheme (see vm_normal_page), and thus we *must* 2246 * based VM_MIXEDMAP scheme (see vm_normal_page), and thus we *must*
2247 * refcount the page if pfn_valid is true (hence insert_page rather 2247 * refcount the page if pfn_valid is true (hence insert_page rather
2248 * than insert_pfn). If a zero_pfn were inserted into a VM_MIXEDMAP 2248 * than insert_pfn). If a zero_pfn were inserted into a VM_MIXEDMAP
2249 * without pte special, it would then be refcounted as a normal page. 2249 * without pte special, it would then be refcounted as a normal page.
2250 */ 2250 */
2251 if (!HAVE_PTE_SPECIAL && pfn_valid(pfn)) { 2251 if (!HAVE_PTE_SPECIAL && pfn_valid(pfn)) {
2252 struct page *page; 2252 struct page *page;
2253 2253
2254 page = pfn_to_page(pfn); 2254 page = pfn_to_page(pfn);
2255 return insert_page(vma, addr, page, vma->vm_page_prot); 2255 return insert_page(vma, addr, page, vma->vm_page_prot);
2256 } 2256 }
2257 return insert_pfn(vma, addr, pfn, vma->vm_page_prot); 2257 return insert_pfn(vma, addr, pfn, vma->vm_page_prot);
2258 } 2258 }
2259 EXPORT_SYMBOL(vm_insert_mixed); 2259 EXPORT_SYMBOL(vm_insert_mixed);
2260 2260
2261 /* 2261 /*
2262 * maps a range of physical memory into the requested pages. the old 2262 * maps a range of physical memory into the requested pages. the old
2263 * mappings are removed. any references to nonexistent pages results 2263 * mappings are removed. any references to nonexistent pages results
2264 * in null mappings (currently treated as "copy-on-access") 2264 * in null mappings (currently treated as "copy-on-access")
2265 */ 2265 */
2266 static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd, 2266 static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd,
2267 unsigned long addr, unsigned long end, 2267 unsigned long addr, unsigned long end,
2268 unsigned long pfn, pgprot_t prot) 2268 unsigned long pfn, pgprot_t prot)
2269 { 2269 {
2270 pte_t *pte; 2270 pte_t *pte;
2271 spinlock_t *ptl; 2271 spinlock_t *ptl;
2272 2272
2273 pte = pte_alloc_map_lock(mm, pmd, addr, &ptl); 2273 pte = pte_alloc_map_lock(mm, pmd, addr, &ptl);
2274 if (!pte) 2274 if (!pte)
2275 return -ENOMEM; 2275 return -ENOMEM;
2276 arch_enter_lazy_mmu_mode(); 2276 arch_enter_lazy_mmu_mode();
2277 do { 2277 do {
2278 BUG_ON(!pte_none(*pte)); 2278 BUG_ON(!pte_none(*pte));
2279 set_pte_at(mm, addr, pte, pte_mkspecial(pfn_pte(pfn, prot))); 2279 set_pte_at(mm, addr, pte, pte_mkspecial(pfn_pte(pfn, prot)));
2280 pfn++; 2280 pfn++;
2281 } while (pte++, addr += PAGE_SIZE, addr != end); 2281 } while (pte++, addr += PAGE_SIZE, addr != end);
2282 arch_leave_lazy_mmu_mode(); 2282 arch_leave_lazy_mmu_mode();
2283 pte_unmap_unlock(pte - 1, ptl); 2283 pte_unmap_unlock(pte - 1, ptl);
2284 return 0; 2284 return 0;
2285 } 2285 }
2286 2286
2287 static inline int remap_pmd_range(struct mm_struct *mm, pud_t *pud, 2287 static inline int remap_pmd_range(struct mm_struct *mm, pud_t *pud,
2288 unsigned long addr, unsigned long end, 2288 unsigned long addr, unsigned long end,
2289 unsigned long pfn, pgprot_t prot) 2289 unsigned long pfn, pgprot_t prot)
2290 { 2290 {
2291 pmd_t *pmd; 2291 pmd_t *pmd;
2292 unsigned long next; 2292 unsigned long next;
2293 2293
2294 pfn -= addr >> PAGE_SHIFT; 2294 pfn -= addr >> PAGE_SHIFT;
2295 pmd = pmd_alloc(mm, pud, addr); 2295 pmd = pmd_alloc(mm, pud, addr);
2296 if (!pmd) 2296 if (!pmd)
2297 return -ENOMEM; 2297 return -ENOMEM;
2298 VM_BUG_ON(pmd_trans_huge(*pmd)); 2298 VM_BUG_ON(pmd_trans_huge(*pmd));
2299 do { 2299 do {
2300 next = pmd_addr_end(addr, end); 2300 next = pmd_addr_end(addr, end);
2301 if (remap_pte_range(mm, pmd, addr, next, 2301 if (remap_pte_range(mm, pmd, addr, next,
2302 pfn + (addr >> PAGE_SHIFT), prot)) 2302 pfn + (addr >> PAGE_SHIFT), prot))
2303 return -ENOMEM; 2303 return -ENOMEM;
2304 } while (pmd++, addr = next, addr != end); 2304 } while (pmd++, addr = next, addr != end);
2305 return 0; 2305 return 0;
2306 } 2306 }
2307 2307
2308 static inline int remap_pud_range(struct mm_struct *mm, pgd_t *pgd, 2308 static inline int remap_pud_range(struct mm_struct *mm, pgd_t *pgd,
2309 unsigned long addr, unsigned long end, 2309 unsigned long addr, unsigned long end,
2310 unsigned long pfn, pgprot_t prot) 2310 unsigned long pfn, pgprot_t prot)
2311 { 2311 {
2312 pud_t *pud; 2312 pud_t *pud;
2313 unsigned long next; 2313 unsigned long next;
2314 2314
2315 pfn -= addr >> PAGE_SHIFT; 2315 pfn -= addr >> PAGE_SHIFT;
2316 pud = pud_alloc(mm, pgd, addr); 2316 pud = pud_alloc(mm, pgd, addr);
2317 if (!pud) 2317 if (!pud)
2318 return -ENOMEM; 2318 return -ENOMEM;
2319 do { 2319 do {
2320 next = pud_addr_end(addr, end); 2320 next = pud_addr_end(addr, end);
2321 if (remap_pmd_range(mm, pud, addr, next, 2321 if (remap_pmd_range(mm, pud, addr, next,
2322 pfn + (addr >> PAGE_SHIFT), prot)) 2322 pfn + (addr >> PAGE_SHIFT), prot))
2323 return -ENOMEM; 2323 return -ENOMEM;
2324 } while (pud++, addr = next, addr != end); 2324 } while (pud++, addr = next, addr != end);
2325 return 0; 2325 return 0;
2326 } 2326 }
2327 2327
2328 /** 2328 /**
2329 * remap_pfn_range - remap kernel memory to userspace 2329 * remap_pfn_range - remap kernel memory to userspace
2330 * @vma: user vma to map to 2330 * @vma: user vma to map to
2331 * @addr: target user address to start at 2331 * @addr: target user address to start at
2332 * @pfn: physical address of kernel memory 2332 * @pfn: physical address of kernel memory
2333 * @size: size of map area 2333 * @size: size of map area
2334 * @prot: page protection flags for this mapping 2334 * @prot: page protection flags for this mapping
2335 * 2335 *
2336 * Note: this is only safe if the mm semaphore is held when called. 2336 * Note: this is only safe if the mm semaphore is held when called.
2337 */ 2337 */
2338 int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, 2338 int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
2339 unsigned long pfn, unsigned long size, pgprot_t prot) 2339 unsigned long pfn, unsigned long size, pgprot_t prot)
2340 { 2340 {
2341 pgd_t *pgd; 2341 pgd_t *pgd;
2342 unsigned long next; 2342 unsigned long next;
2343 unsigned long end = addr + PAGE_ALIGN(size); 2343 unsigned long end = addr + PAGE_ALIGN(size);
2344 struct mm_struct *mm = vma->vm_mm; 2344 struct mm_struct *mm = vma->vm_mm;
2345 int err; 2345 int err;
2346 2346
2347 /* 2347 /*
2348 * Physically remapped pages are special. Tell the 2348 * Physically remapped pages are special. Tell the
2349 * rest of the world about it: 2349 * rest of the world about it:
2350 * VM_IO tells people not to look at these pages 2350 * VM_IO tells people not to look at these pages
2351 * (accesses can have side effects). 2351 * (accesses can have side effects).
2352 * VM_PFNMAP tells the core MM that the base pages are just 2352 * VM_PFNMAP tells the core MM that the base pages are just
2353 * raw PFN mappings, and do not have a "struct page" associated 2353 * raw PFN mappings, and do not have a "struct page" associated
2354 * with them. 2354 * with them.
2355 * VM_DONTEXPAND 2355 * VM_DONTEXPAND
2356 * Disable vma merging and expanding with mremap(). 2356 * Disable vma merging and expanding with mremap().
2357 * VM_DONTDUMP 2357 * VM_DONTDUMP
2358 * Omit vma from core dump, even when VM_IO turned off. 2358 * Omit vma from core dump, even when VM_IO turned off.
2359 * 2359 *
2360 * There's a horrible special case to handle copy-on-write 2360 * There's a horrible special case to handle copy-on-write
2361 * behaviour that some programs depend on. We mark the "original" 2361 * behaviour that some programs depend on. We mark the "original"
2362 * un-COW'ed pages by matching them up with "vma->vm_pgoff". 2362 * un-COW'ed pages by matching them up with "vma->vm_pgoff".
2363 * See vm_normal_page() for details. 2363 * See vm_normal_page() for details.
2364 */ 2364 */
2365 if (is_cow_mapping(vma->vm_flags)) { 2365 if (is_cow_mapping(vma->vm_flags)) {
2366 if (addr != vma->vm_start || end != vma->vm_end) 2366 if (addr != vma->vm_start || end != vma->vm_end)
2367 return -EINVAL; 2367 return -EINVAL;
2368 vma->vm_pgoff = pfn; 2368 vma->vm_pgoff = pfn;
2369 } 2369 }
2370 2370
2371 err = track_pfn_remap(vma, &prot, pfn, addr, PAGE_ALIGN(size)); 2371 err = track_pfn_remap(vma, &prot, pfn, addr, PAGE_ALIGN(size));
2372 if (err) 2372 if (err)
2373 return -EINVAL; 2373 return -EINVAL;
2374 2374
2375 vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP; 2375 vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP;
2376 2376
2377 BUG_ON(addr >= end); 2377 BUG_ON(addr >= end);
2378 pfn -= addr >> PAGE_SHIFT; 2378 pfn -= addr >> PAGE_SHIFT;
2379 pgd = pgd_offset(mm, addr); 2379 pgd = pgd_offset(mm, addr);
2380 flush_cache_range(vma, addr, end); 2380 flush_cache_range(vma, addr, end);
2381 do { 2381 do {
2382 next = pgd_addr_end(addr, end); 2382 next = pgd_addr_end(addr, end);
2383 err = remap_pud_range(mm, pgd, addr, next, 2383 err = remap_pud_range(mm, pgd, addr, next,
2384 pfn + (addr >> PAGE_SHIFT), prot); 2384 pfn + (addr >> PAGE_SHIFT), prot);
2385 if (err) 2385 if (err)
2386 break; 2386 break;
2387 } while (pgd++, addr = next, addr != end); 2387 } while (pgd++, addr = next, addr != end);
2388 2388
2389 if (err) 2389 if (err)
2390 untrack_pfn(vma, pfn, PAGE_ALIGN(size)); 2390 untrack_pfn(vma, pfn, PAGE_ALIGN(size));
2391 2391
2392 return err; 2392 return err;
2393 } 2393 }
2394 EXPORT_SYMBOL(remap_pfn_range); 2394 EXPORT_SYMBOL(remap_pfn_range);
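/*
 * A minimal usage sketch of remap_pfn_range() from an f_op->mmap handler,
 * remapping a physical region at the assumed address my_regs_phys over the
 * whole vma.  Illustrative only; real drivers usually also adjust
 * vma->vm_page_prot (e.g. for uncached access) before calling this.
 */
#include <linux/fs.h>
#include <linux/mm.h>

static phys_addr_t my_regs_phys;	/* assumed register block address */

static int my_regs_mmap(struct file *file, struct vm_area_struct *vma)
{
	unsigned long size = vma->vm_end - vma->vm_start;

	return remap_pfn_range(vma, vma->vm_start,
			       (my_regs_phys >> PAGE_SHIFT) + vma->vm_pgoff,
			       size, vma->vm_page_prot);
}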
2395 2395
2396 /** 2396 /**
2397 * vm_iomap_memory - remap memory to userspace 2397 * vm_iomap_memory - remap memory to userspace
2398 * @vma: user vma to map to 2398 * @vma: user vma to map to
2399 * @start: start of area 2399 * @start: start of area
2400 * @len: size of area 2400 * @len: size of area
2401 * 2401 *
2402 * This is a simplified io_remap_pfn_range() for common driver use. The 2402 * This is a simplified io_remap_pfn_range() for common driver use. The
2403 * driver just needs to give us the physical memory range to be mapped, 2403 * driver just needs to give us the physical memory range to be mapped,
2404 * we'll figure out the rest from the vma information. 2404 * we'll figure out the rest from the vma information.
2405 * 2405 *
2406 * NOTE! Some drivers might want to tweak vma->vm_page_prot first to get 2406 * NOTE! Some drivers might want to tweak vma->vm_page_prot first to get
2407 * whatever write-combining details or similar. 2407 * whatever write-combining details or similar.
2408 */ 2408 */
2409 int vm_iomap_memory(struct vm_area_struct *vma, phys_addr_t start, unsigned long len) 2409 int vm_iomap_memory(struct vm_area_struct *vma, phys_addr_t start, unsigned long len)
2410 { 2410 {
2411 unsigned long vm_len, pfn, pages; 2411 unsigned long vm_len, pfn, pages;
2412 2412
2413 /* Check that the physical memory area passed in looks valid */ 2413 /* Check that the physical memory area passed in looks valid */
2414 if (start + len < start) 2414 if (start + len < start)
2415 return -EINVAL; 2415 return -EINVAL;
2416 /* 2416 /*
2417 * You *really* shouldn't map things that aren't page-aligned, 2417 * You *really* shouldn't map things that aren't page-aligned,
2418 * but we've historically allowed it because IO memory might 2418 * but we've historically allowed it because IO memory might
2419 * just have smaller alignment. 2419 * just have smaller alignment.
2420 */ 2420 */
2421 len += start & ~PAGE_MASK; 2421 len += start & ~PAGE_MASK;
2422 pfn = start >> PAGE_SHIFT; 2422 pfn = start >> PAGE_SHIFT;
2423 pages = (len + ~PAGE_MASK) >> PAGE_SHIFT; 2423 pages = (len + ~PAGE_MASK) >> PAGE_SHIFT;
2424 if (pfn + pages < pfn) 2424 if (pfn + pages < pfn)
2425 return -EINVAL; 2425 return -EINVAL;
2426 2426
2427 /* We start the mapping 'vm_pgoff' pages into the area */ 2427 /* We start the mapping 'vm_pgoff' pages into the area */
2428 if (vma->vm_pgoff > pages) 2428 if (vma->vm_pgoff > pages)
2429 return -EINVAL; 2429 return -EINVAL;
2430 pfn += vma->vm_pgoff; 2430 pfn += vma->vm_pgoff;
2431 pages -= vma->vm_pgoff; 2431 pages -= vma->vm_pgoff;
2432 2432
2433 /* Can we fit all of the mapping? */ 2433 /* Can we fit all of the mapping? */
2434 vm_len = vma->vm_end - vma->vm_start; 2434 vm_len = vma->vm_end - vma->vm_start;
2435 if (vm_len >> PAGE_SHIFT > pages) 2435 if (vm_len >> PAGE_SHIFT > pages)
2436 return -EINVAL; 2436 return -EINVAL;
2437 2437
2438 /* Ok, let it rip */ 2438 /* Ok, let it rip */
2439 return io_remap_pfn_range(vma, vma->vm_start, pfn, vm_len, vma->vm_page_prot); 2439 return io_remap_pfn_range(vma, vma->vm_start, pfn, vm_len, vma->vm_page_prot);
2440 } 2440 }
2441 EXPORT_SYMBOL(vm_iomap_memory); 2441 EXPORT_SYMBOL(vm_iomap_memory);
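/*
 * A minimal usage sketch of vm_iomap_memory(): the simplified form of a
 * driver mmap handler, letting the helper derive pfn, offset and length
 * checks from the vma.  Illustrative only; my_fb_phys and my_fb_len are
 * assumed to describe the device memory being exported.
 */
#include <linux/fs.h>
#include <linux/mm.h>

static phys_addr_t my_fb_phys;		/* assumed framebuffer base */
static unsigned long my_fb_len;		/* assumed framebuffer length */

static int my_fb_mmap(struct file *file, struct vm_area_struct *vma)
{
	return vm_iomap_memory(vma, my_fb_phys, my_fb_len);
}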
2442 2442
2443 static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd, 2443 static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd,
2444 unsigned long addr, unsigned long end, 2444 unsigned long addr, unsigned long end,
2445 pte_fn_t fn, void *data) 2445 pte_fn_t fn, void *data)
2446 { 2446 {
2447 pte_t *pte; 2447 pte_t *pte;
2448 int err; 2448 int err;
2449 pgtable_t token; 2449 pgtable_t token;
2450 spinlock_t *uninitialized_var(ptl); 2450 spinlock_t *uninitialized_var(ptl);
2451 2451
2452 pte = (mm == &init_mm) ? 2452 pte = (mm == &init_mm) ?
2453 pte_alloc_kernel(pmd, addr) : 2453 pte_alloc_kernel(pmd, addr) :
2454 pte_alloc_map_lock(mm, pmd, addr, &ptl); 2454 pte_alloc_map_lock(mm, pmd, addr, &ptl);
2455 if (!pte) 2455 if (!pte)
2456 return -ENOMEM; 2456 return -ENOMEM;
2457 2457
2458 BUG_ON(pmd_huge(*pmd)); 2458 BUG_ON(pmd_huge(*pmd));
2459 2459
2460 arch_enter_lazy_mmu_mode(); 2460 arch_enter_lazy_mmu_mode();
2461 2461
2462 token = pmd_pgtable(*pmd); 2462 token = pmd_pgtable(*pmd);
2463 2463
2464 do { 2464 do {
2465 err = fn(pte++, token, addr, data); 2465 err = fn(pte++, token, addr, data);
2466 if (err) 2466 if (err)
2467 break; 2467 break;
2468 } while (addr += PAGE_SIZE, addr != end); 2468 } while (addr += PAGE_SIZE, addr != end);
2469 2469
2470 arch_leave_lazy_mmu_mode(); 2470 arch_leave_lazy_mmu_mode();
2471 2471
2472 if (mm != &init_mm) 2472 if (mm != &init_mm)
2473 pte_unmap_unlock(pte-1, ptl); 2473 pte_unmap_unlock(pte-1, ptl);
2474 return err; 2474 return err;
2475 } 2475 }
2476 2476
2477 static int apply_to_pmd_range(struct mm_struct *mm, pud_t *pud, 2477 static int apply_to_pmd_range(struct mm_struct *mm, pud_t *pud,
2478 unsigned long addr, unsigned long end, 2478 unsigned long addr, unsigned long end,
2479 pte_fn_t fn, void *data) 2479 pte_fn_t fn, void *data)
2480 { 2480 {
2481 pmd_t *pmd; 2481 pmd_t *pmd;
2482 unsigned long next; 2482 unsigned long next;
2483 int err; 2483 int err;
2484 2484
2485 BUG_ON(pud_huge(*pud)); 2485 BUG_ON(pud_huge(*pud));
2486 2486
2487 pmd = pmd_alloc(mm, pud, addr); 2487 pmd = pmd_alloc(mm, pud, addr);
2488 if (!pmd) 2488 if (!pmd)
2489 return -ENOMEM; 2489 return -ENOMEM;
2490 do { 2490 do {
2491 next = pmd_addr_end(addr, end); 2491 next = pmd_addr_end(addr, end);
2492 err = apply_to_pte_range(mm, pmd, addr, next, fn, data); 2492 err = apply_to_pte_range(mm, pmd, addr, next, fn, data);
2493 if (err) 2493 if (err)
2494 break; 2494 break;
2495 } while (pmd++, addr = next, addr != end); 2495 } while (pmd++, addr = next, addr != end);
2496 return err; 2496 return err;
2497 } 2497 }
2498 2498
2499 static int apply_to_pud_range(struct mm_struct *mm, pgd_t *pgd, 2499 static int apply_to_pud_range(struct mm_struct *mm, pgd_t *pgd,
2500 unsigned long addr, unsigned long end, 2500 unsigned long addr, unsigned long end,
2501 pte_fn_t fn, void *data) 2501 pte_fn_t fn, void *data)
2502 { 2502 {
2503 pud_t *pud; 2503 pud_t *pud;
2504 unsigned long next; 2504 unsigned long next;
2505 int err; 2505 int err;
2506 2506
2507 pud = pud_alloc(mm, pgd, addr); 2507 pud = pud_alloc(mm, pgd, addr);
2508 if (!pud) 2508 if (!pud)
2509 return -ENOMEM; 2509 return -ENOMEM;
2510 do { 2510 do {
2511 next = pud_addr_end(addr, end); 2511 next = pud_addr_end(addr, end);
2512 err = apply_to_pmd_range(mm, pud, addr, next, fn, data); 2512 err = apply_to_pmd_range(mm, pud, addr, next, fn, data);
2513 if (err) 2513 if (err)
2514 break; 2514 break;
2515 } while (pud++, addr = next, addr != end); 2515 } while (pud++, addr = next, addr != end);
2516 return err; 2516 return err;
2517 } 2517 }
2518 2518
2519 /* 2519 /*
2520 * Scan a region of virtual memory, filling in page tables as necessary 2520 * Scan a region of virtual memory, filling in page tables as necessary
2521 * and calling a provided function on each leaf page table. 2521 * and calling a provided function on each leaf page table.
2522 */ 2522 */
2523 int apply_to_page_range(struct mm_struct *mm, unsigned long addr, 2523 int apply_to_page_range(struct mm_struct *mm, unsigned long addr,
2524 unsigned long size, pte_fn_t fn, void *data) 2524 unsigned long size, pte_fn_t fn, void *data)
2525 { 2525 {
2526 pgd_t *pgd; 2526 pgd_t *pgd;
2527 unsigned long next; 2527 unsigned long next;
2528 unsigned long end = addr + size; 2528 unsigned long end = addr + size;
2529 int err; 2529 int err;
2530 2530
2531 BUG_ON(addr >= end); 2531 BUG_ON(addr >= end);
2532 pgd = pgd_offset(mm, addr); 2532 pgd = pgd_offset(mm, addr);
2533 do { 2533 do {
2534 next = pgd_addr_end(addr, end); 2534 next = pgd_addr_end(addr, end);
2535 err = apply_to_pud_range(mm, pgd, addr, next, fn, data); 2535 err = apply_to_pud_range(mm, pgd, addr, next, fn, data);
2536 if (err) 2536 if (err)
2537 break; 2537 break;
2538 } while (pgd++, addr = next, addr != end); 2538 } while (pgd++, addr = next, addr != end);
2539 2539
2540 return err; 2540 return err;
2541 } 2541 }
2542 EXPORT_SYMBOL_GPL(apply_to_page_range); 2542 EXPORT_SYMBOL_GPL(apply_to_page_range);
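/*
 * A minimal usage sketch of apply_to_page_range(): count the present leaf
 * entries covering a kernel virtual range (say, a vmalloc area the caller
 * owns).  Illustrative only; note that the walk allocates intermediate
 * page tables for the range as documented above.
 */
#include <linux/mm.h>

static int example_count_pte(pte_t *pte, pgtable_t token,
			     unsigned long addr, void *data)
{
	unsigned long *count = data;

	if (pte_present(*pte))
		(*count)++;
	return 0;			/* non-zero would abort the walk */
}

static unsigned long example_count_present(unsigned long addr,
					   unsigned long size)
{
	unsigned long count = 0;

	apply_to_page_range(&init_mm, addr, size, example_count_pte, &count);
	return count;
}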
2543 2543
2544 /* 2544 /*
2545 * handle_pte_fault chooses page fault handler according to an entry 2545 * handle_pte_fault chooses page fault handler according to an entry
2546 * which was read non-atomically. Before making any commitment, on 2546 * which was read non-atomically. Before making any commitment, on
2547 * those architectures or configurations (e.g. i386 with PAE) which 2547 * those architectures or configurations (e.g. i386 with PAE) which
2548 * might give a mix of unmatched parts, do_swap_page and do_nonlinear_fault 2548 * might give a mix of unmatched parts, do_swap_page and do_nonlinear_fault
2549 * must check under lock before unmapping the pte and proceeding 2549 * must check under lock before unmapping the pte and proceeding
2550 * (but do_wp_page is only called after already making such a check; 2550 * (but do_wp_page is only called after already making such a check;
2551 * and do_anonymous_page can safely check later on). 2551 * and do_anonymous_page can safely check later on).
2552 */ 2552 */
2553 static inline int pte_unmap_same(struct mm_struct *mm, pmd_t *pmd, 2553 static inline int pte_unmap_same(struct mm_struct *mm, pmd_t *pmd,
2554 pte_t *page_table, pte_t orig_pte) 2554 pte_t *page_table, pte_t orig_pte)
2555 { 2555 {
2556 int same = 1; 2556 int same = 1;
2557 #if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT) 2557 #if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT)
2558 if (sizeof(pte_t) > sizeof(unsigned long)) { 2558 if (sizeof(pte_t) > sizeof(unsigned long)) {
2559 spinlock_t *ptl = pte_lockptr(mm, pmd); 2559 spinlock_t *ptl = pte_lockptr(mm, pmd);
2560 spin_lock(ptl); 2560 spin_lock(ptl);
2561 same = pte_same(*page_table, orig_pte); 2561 same = pte_same(*page_table, orig_pte);
2562 spin_unlock(ptl); 2562 spin_unlock(ptl);
2563 } 2563 }
2564 #endif 2564 #endif
2565 pte_unmap(page_table); 2565 pte_unmap(page_table);
2566 return same; 2566 return same;
2567 } 2567 }
2568 2568
2569 static inline void cow_user_page(struct page *dst, struct page *src, unsigned long va, struct vm_area_struct *vma) 2569 static inline void cow_user_page(struct page *dst, struct page *src, unsigned long va, struct vm_area_struct *vma)
2570 { 2570 {
2571 /* 2571 /*
2572 * If the source page was a PFN mapping, we don't have 2572 * If the source page was a PFN mapping, we don't have
2573 * a "struct page" for it. We do a best-effort copy by 2573 * a "struct page" for it. We do a best-effort copy by
2574 * just copying from the original user address. If that 2574 * just copying from the original user address. If that
2575 * fails, we just zero-fill it. Live with it. 2575 * fails, we just zero-fill it. Live with it.
2576 */ 2576 */
2577 if (unlikely(!src)) { 2577 if (unlikely(!src)) {
2578 void *kaddr = kmap_atomic(dst); 2578 void *kaddr = kmap_atomic(dst);
2579 void __user *uaddr = (void __user *)(va & PAGE_MASK); 2579 void __user *uaddr = (void __user *)(va & PAGE_MASK);
2580 2580
2581 /* 2581 /*
2582 * This really shouldn't fail, because the page is there 2582 * This really shouldn't fail, because the page is there
2583 * in the page tables. But it might just be unreadable, 2583 * in the page tables. But it might just be unreadable,
2584 * in which case we just give up and fill the result with 2584 * in which case we just give up and fill the result with
2585 * zeroes. 2585 * zeroes.
2586 */ 2586 */
2587 if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE)) 2587 if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE))
2588 clear_page(kaddr); 2588 clear_page(kaddr);
2589 kunmap_atomic(kaddr); 2589 kunmap_atomic(kaddr);
2590 flush_dcache_page(dst); 2590 flush_dcache_page(dst);
2591 } else 2591 } else
2592 copy_user_highpage(dst, src, va, vma); 2592 copy_user_highpage(dst, src, va, vma);
2593 } 2593 }
2594 2594
2595 /* 2595 /*
2596 * This routine handles present pages, when users try to write 2596 * This routine handles present pages, when users try to write
2597 * to a shared page. It is done by copying the page to a new address 2597 * to a shared page. It is done by copying the page to a new address
2598 * and decrementing the shared-page counter for the old page. 2598 * and decrementing the shared-page counter for the old page.
2599 * 2599 *
2600 * Note that this routine assumes that the protection checks have been 2600 * Note that this routine assumes that the protection checks have been
2601 * done by the caller (the low-level page fault routine in most cases). 2601 * done by the caller (the low-level page fault routine in most cases).
2602 * Thus we can safely just mark it writable once we've done any necessary 2602 * Thus we can safely just mark it writable once we've done any necessary
2603 * COW. 2603 * COW.
2604 * 2604 *
2605 * We also mark the page dirty at this point even though the page will 2605 * We also mark the page dirty at this point even though the page will
2606 * change only once the write actually happens. This avoids a few races, 2606 * change only once the write actually happens. This avoids a few races,
2607 * and potentially makes it more efficient. 2607 * and potentially makes it more efficient.
2608 * 2608 *
2609 * We enter with non-exclusive mmap_sem (to exclude vma changes, 2609 * We enter with non-exclusive mmap_sem (to exclude vma changes,
2610 * but allow concurrent faults), with pte both mapped and locked. 2610 * but allow concurrent faults), with pte both mapped and locked.
2611 * We return with mmap_sem still held, but pte unmapped and unlocked. 2611 * We return with mmap_sem still held, but pte unmapped and unlocked.
2612 */ 2612 */
2613 static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, 2613 static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
2614 unsigned long address, pte_t *page_table, pmd_t *pmd, 2614 unsigned long address, pte_t *page_table, pmd_t *pmd,
2615 spinlock_t *ptl, pte_t orig_pte) 2615 spinlock_t *ptl, pte_t orig_pte)
2616 __releases(ptl) 2616 __releases(ptl)
2617 { 2617 {
2618 struct page *old_page, *new_page = NULL; 2618 struct page *old_page, *new_page = NULL;
2619 pte_t entry; 2619 pte_t entry;
2620 int ret = 0; 2620 int ret = 0;
2621 int page_mkwrite = 0; 2621 int page_mkwrite = 0;
2622 struct page *dirty_page = NULL; 2622 struct page *dirty_page = NULL;
2623 unsigned long mmun_start = 0; /* For mmu_notifiers */ 2623 unsigned long mmun_start = 0; /* For mmu_notifiers */
2624 unsigned long mmun_end = 0; /* For mmu_notifiers */ 2624 unsigned long mmun_end = 0; /* For mmu_notifiers */
2625 2625
2626 old_page = vm_normal_page(vma, address, orig_pte); 2626 old_page = vm_normal_page(vma, address, orig_pte);
2627 if (!old_page) { 2627 if (!old_page) {
2628 /* 2628 /*
2629 * VM_MIXEDMAP !pfn_valid() case 2629 * VM_MIXEDMAP !pfn_valid() case
2630 * 2630 *
2631 * We should not cow pages in a shared writeable mapping. 2631 * We should not cow pages in a shared writeable mapping.
2632 * Just mark the pages writable as we can't do any dirty 2632 * Just mark the pages writable as we can't do any dirty
2633 * accounting on raw pfn maps. 2633 * accounting on raw pfn maps.
2634 */ 2634 */
2635 if ((vma->vm_flags & (VM_WRITE|VM_SHARED)) == 2635 if ((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
2636 (VM_WRITE|VM_SHARED)) 2636 (VM_WRITE|VM_SHARED))
2637 goto reuse; 2637 goto reuse;
2638 goto gotten; 2638 goto gotten;
2639 } 2639 }
2640 2640
2641 /* 2641 /*
2642 * Take out anonymous pages first, anonymous shared vmas are 2642 * Take out anonymous pages first, anonymous shared vmas are
2643 * not dirty accountable. 2643 * not dirty accountable.
2644 */ 2644 */
2645 if (PageAnon(old_page) && !PageKsm(old_page)) { 2645 if (PageAnon(old_page) && !PageKsm(old_page)) {
2646 if (!trylock_page(old_page)) { 2646 if (!trylock_page(old_page)) {
2647 page_cache_get(old_page); 2647 page_cache_get(old_page);
2648 pte_unmap_unlock(page_table, ptl); 2648 pte_unmap_unlock(page_table, ptl);
2649 lock_page(old_page); 2649 lock_page(old_page);
2650 page_table = pte_offset_map_lock(mm, pmd, address, 2650 page_table = pte_offset_map_lock(mm, pmd, address,
2651 &ptl); 2651 &ptl);
2652 if (!pte_same(*page_table, orig_pte)) { 2652 if (!pte_same(*page_table, orig_pte)) {
2653 unlock_page(old_page); 2653 unlock_page(old_page);
2654 goto unlock; 2654 goto unlock;
2655 } 2655 }
2656 page_cache_release(old_page); 2656 page_cache_release(old_page);
2657 } 2657 }
2658 if (reuse_swap_page(old_page)) { 2658 if (reuse_swap_page(old_page)) {
2659 /* 2659 /*
2660 * The page is all ours. Move it to our anon_vma so 2660 * The page is all ours. Move it to our anon_vma so
2661 * the rmap code will not search our parent or siblings. 2661 * the rmap code will not search our parent or siblings.
2662 * Protected against the rmap code by the page lock. 2662 * Protected against the rmap code by the page lock.
2663 */ 2663 */
2664 page_move_anon_rmap(old_page, vma, address); 2664 page_move_anon_rmap(old_page, vma, address);
2665 unlock_page(old_page); 2665 unlock_page(old_page);
2666 goto reuse; 2666 goto reuse;
2667 } 2667 }
2668 unlock_page(old_page); 2668 unlock_page(old_page);
2669 } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) == 2669 } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
2670 (VM_WRITE|VM_SHARED))) { 2670 (VM_WRITE|VM_SHARED))) {
2671 /* 2671 /*
2672 * Only catch write-faults on shared writable pages, 2672 * Only catch write-faults on shared writable pages,
2673 * read-only shared pages can get COWed by 2673 * read-only shared pages can get COWed by
2674 * get_user_pages(.write=1, .force=1). 2674 * get_user_pages(.write=1, .force=1).
2675 */ 2675 */
2676 if (vma->vm_ops && vma->vm_ops->page_mkwrite) { 2676 if (vma->vm_ops && vma->vm_ops->page_mkwrite) {
2677 struct vm_fault vmf; 2677 struct vm_fault vmf;
2678 int tmp; 2678 int tmp;
2679 2679
2680 vmf.virtual_address = (void __user *)(address & 2680 vmf.virtual_address = (void __user *)(address &
2681 PAGE_MASK); 2681 PAGE_MASK);
2682 vmf.pgoff = old_page->index; 2682 vmf.pgoff = old_page->index;
2683 vmf.flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE; 2683 vmf.flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE;
2684 vmf.page = old_page; 2684 vmf.page = old_page;
2685 2685
2686 /* 2686 /*
2687 * Notify the address space that the page is about to 2687 * Notify the address space that the page is about to
2688 * become writable so that it can prohibit this or wait 2688 * become writable so that it can prohibit this or wait
2689 * for the page to get into an appropriate state. 2689 * for the page to get into an appropriate state.
2690 * 2690 *
2691 * We do this without the lock held, so that it can 2691 * We do this without the lock held, so that it can
2692 * sleep if it needs to. 2692 * sleep if it needs to.
2693 */ 2693 */
2694 page_cache_get(old_page); 2694 page_cache_get(old_page);
2695 pte_unmap_unlock(page_table, ptl); 2695 pte_unmap_unlock(page_table, ptl);
2696 2696
2697 tmp = vma->vm_ops->page_mkwrite(vma, &vmf); 2697 tmp = vma->vm_ops->page_mkwrite(vma, &vmf);
2698 if (unlikely(tmp & 2698 if (unlikely(tmp &
2699 (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) { 2699 (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) {
2700 ret = tmp; 2700 ret = tmp;
2701 goto unwritable_page; 2701 goto unwritable_page;
2702 } 2702 }
2703 if (unlikely(!(tmp & VM_FAULT_LOCKED))) { 2703 if (unlikely(!(tmp & VM_FAULT_LOCKED))) {
2704 lock_page(old_page); 2704 lock_page(old_page);
2705 if (!old_page->mapping) { 2705 if (!old_page->mapping) {
2706 ret = 0; /* retry the fault */ 2706 ret = 0; /* retry the fault */
2707 unlock_page(old_page); 2707 unlock_page(old_page);
2708 goto unwritable_page; 2708 goto unwritable_page;
2709 } 2709 }
2710 } else 2710 } else
2711 VM_BUG_ON(!PageLocked(old_page)); 2711 VM_BUG_ON(!PageLocked(old_page));
2712 2712
2713 /* 2713 /*
2714 * Since we dropped the lock we need to revalidate 2714 * Since we dropped the lock we need to revalidate
2715 * the PTE as someone else may have changed it. If 2715 * the PTE as someone else may have changed it. If
2716 * they did, we just return, as we can count on the 2716 * they did, we just return, as we can count on the
2717 * MMU to tell us if they didn't also make it writable. 2717 * MMU to tell us if they didn't also make it writable.
2718 */ 2718 */
2719 page_table = pte_offset_map_lock(mm, pmd, address, 2719 page_table = pte_offset_map_lock(mm, pmd, address,
2720 &ptl); 2720 &ptl);
2721 if (!pte_same(*page_table, orig_pte)) { 2721 if (!pte_same(*page_table, orig_pte)) {
2722 unlock_page(old_page); 2722 unlock_page(old_page);
2723 goto unlock; 2723 goto unlock;
2724 } 2724 }
2725 2725
2726 page_mkwrite = 1; 2726 page_mkwrite = 1;
2727 } 2727 }
2728 dirty_page = old_page; 2728 dirty_page = old_page;
2729 get_page(dirty_page); 2729 get_page(dirty_page);
2730 2730
2731 reuse: 2731 reuse:
2732 flush_cache_page(vma, address, pte_pfn(orig_pte)); 2732 flush_cache_page(vma, address, pte_pfn(orig_pte));
2733 entry = pte_mkyoung(orig_pte); 2733 entry = pte_mkyoung(orig_pte);
2734 entry = maybe_mkwrite(pte_mkdirty(entry), vma); 2734 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
2735 if (ptep_set_access_flags(vma, address, page_table, entry,1)) 2735 if (ptep_set_access_flags(vma, address, page_table, entry,1))
2736 update_mmu_cache(vma, address, page_table); 2736 update_mmu_cache(vma, address, page_table);
2737 pte_unmap_unlock(page_table, ptl); 2737 pte_unmap_unlock(page_table, ptl);
2738 ret |= VM_FAULT_WRITE; 2738 ret |= VM_FAULT_WRITE;
2739 2739
2740 if (!dirty_page) 2740 if (!dirty_page)
2741 return ret; 2741 return ret;
2742 2742
2743 /* 2743 /*
2744 * Yes, Virginia, this is actually required to prevent a race 2744 * Yes, Virginia, this is actually required to prevent a race
2745 * with clear_page_dirty_for_io() from clearing the page dirty 2745 * with clear_page_dirty_for_io() from clearing the page dirty
2746 * bit after it clears all dirty ptes, but before a racing 2746 * bit after it clears all dirty ptes, but before a racing
2747 * do_wp_page installs a dirty pte. 2747 * do_wp_page installs a dirty pte.
2748 * 2748 *
2749 * __do_fault is protected similarly. 2749 * __do_fault is protected similarly.
2750 */ 2750 */
2751 if (!page_mkwrite) { 2751 if (!page_mkwrite) {
2752 wait_on_page_locked(dirty_page); 2752 wait_on_page_locked(dirty_page);
2753 set_page_dirty_balance(dirty_page, page_mkwrite); 2753 set_page_dirty_balance(dirty_page, page_mkwrite);
2754 /* file_update_time outside page_lock */ 2754 /* file_update_time outside page_lock */
2755 if (vma->vm_file) 2755 if (vma->vm_file)
2756 file_update_time(vma->vm_file); 2756 file_update_time(vma->vm_file);
2757 } 2757 }
2758 put_page(dirty_page); 2758 put_page(dirty_page);
2759 if (page_mkwrite) { 2759 if (page_mkwrite) {
2760 struct address_space *mapping = dirty_page->mapping; 2760 struct address_space *mapping = dirty_page->mapping;
2761 2761
2762 set_page_dirty(dirty_page); 2762 set_page_dirty(dirty_page);
2763 unlock_page(dirty_page); 2763 unlock_page(dirty_page);
2764 page_cache_release(dirty_page); 2764 page_cache_release(dirty_page);
2765 if (mapping) { 2765 if (mapping) {
2766 /* 2766 /*
2767 * Some device drivers do not set page.mapping 2767 * Some device drivers do not set page.mapping
2768 * but still dirty their pages 2768 * but still dirty their pages
2769 */ 2769 */
2770 balance_dirty_pages_ratelimited(mapping); 2770 balance_dirty_pages_ratelimited(mapping);
2771 } 2771 }
2772 } 2772 }
2773 2773
2774 return ret; 2774 return ret;
2775 } 2775 }
2776 2776
2777 /* 2777 /*
2778 * Ok, we need to copy. Oh, well.. 2778 * Ok, we need to copy. Oh, well..
2779 */ 2779 */
2780 page_cache_get(old_page); 2780 page_cache_get(old_page);
2781 gotten: 2781 gotten:
2782 pte_unmap_unlock(page_table, ptl); 2782 pte_unmap_unlock(page_table, ptl);
2783 2783
2784 if (unlikely(anon_vma_prepare(vma))) 2784 if (unlikely(anon_vma_prepare(vma)))
2785 goto oom; 2785 goto oom;
2786 2786
2787 if (is_zero_pfn(pte_pfn(orig_pte))) { 2787 if (is_zero_pfn(pte_pfn(orig_pte))) {
2788 new_page = alloc_zeroed_user_highpage_movable(vma, address); 2788 new_page = alloc_zeroed_user_highpage_movable(vma, address);
2789 if (!new_page) 2789 if (!new_page)
2790 goto oom; 2790 goto oom;
2791 } else { 2791 } else {
2792 new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address); 2792 new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
2793 if (!new_page) 2793 if (!new_page)
2794 goto oom; 2794 goto oom;
2795 cow_user_page(new_page, old_page, address, vma); 2795 cow_user_page(new_page, old_page, address, vma);
2796 } 2796 }
2797 __SetPageUptodate(new_page); 2797 __SetPageUptodate(new_page);
2798 2798
2799 if (mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL)) 2799 if (mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))
2800 goto oom_free_new; 2800 goto oom_free_new;
2801 2801
2802 mmun_start = address & PAGE_MASK; 2802 mmun_start = address & PAGE_MASK;
2803 mmun_end = mmun_start + PAGE_SIZE; 2803 mmun_end = mmun_start + PAGE_SIZE;
2804 mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); 2804 mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
2805 2805
2806 /* 2806 /*
2807 * Re-check the pte - we dropped the lock 2807 * Re-check the pte - we dropped the lock
2808 */ 2808 */
2809 page_table = pte_offset_map_lock(mm, pmd, address, &ptl); 2809 page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
2810 if (likely(pte_same(*page_table, orig_pte))) { 2810 if (likely(pte_same(*page_table, orig_pte))) {
2811 if (old_page) { 2811 if (old_page) {
2812 if (!PageAnon(old_page)) { 2812 if (!PageAnon(old_page)) {
2813 dec_mm_counter_fast(mm, MM_FILEPAGES); 2813 dec_mm_counter_fast(mm, MM_FILEPAGES);
2814 inc_mm_counter_fast(mm, MM_ANONPAGES); 2814 inc_mm_counter_fast(mm, MM_ANONPAGES);
2815 } 2815 }
2816 } else 2816 } else
2817 inc_mm_counter_fast(mm, MM_ANONPAGES); 2817 inc_mm_counter_fast(mm, MM_ANONPAGES);
2818 flush_cache_page(vma, address, pte_pfn(orig_pte)); 2818 flush_cache_page(vma, address, pte_pfn(orig_pte));
2819 entry = mk_pte(new_page, vma->vm_page_prot); 2819 entry = mk_pte(new_page, vma->vm_page_prot);
2820 entry = maybe_mkwrite(pte_mkdirty(entry), vma); 2820 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
2821 /* 2821 /*
2822 * Clear the pte entry and flush it first, before updating the 2822 * Clear the pte entry and flush it first, before updating the
2823 * pte with the new entry. This will avoid a race condition 2823 * pte with the new entry. This will avoid a race condition
2824 * seen in the presence of one thread doing SMC and another 2824 * seen in the presence of one thread doing SMC and another
2825 * thread doing COW. 2825 * thread doing COW.
2826 */ 2826 */
2827 ptep_clear_flush(vma, address, page_table); 2827 ptep_clear_flush(vma, address, page_table);
2828 page_add_new_anon_rmap(new_page, vma, address); 2828 page_add_new_anon_rmap(new_page, vma, address);
2829 /* 2829 /*
2830 * We call the notify macro here because, when using secondary 2830 * We call the notify macro here because, when using secondary
2831 * mmu page tables (such as kvm shadow page tables), we want the 2831 * mmu page tables (such as kvm shadow page tables), we want the
2832 * new page to be mapped directly into the secondary page table. 2832 * new page to be mapped directly into the secondary page table.
2833 */ 2833 */
2834 set_pte_at_notify(mm, address, page_table, entry); 2834 set_pte_at_notify(mm, address, page_table, entry);
2835 update_mmu_cache(vma, address, page_table); 2835 update_mmu_cache(vma, address, page_table);
2836 if (old_page) { 2836 if (old_page) {
2837 /* 2837 /*
2838 * Only after switching the pte to the new page may 2838 * Only after switching the pte to the new page may
2839 * we remove the mapcount here. Otherwise another 2839 * we remove the mapcount here. Otherwise another
2840 * process may come and find the rmap count decremented 2840 * process may come and find the rmap count decremented
2841 * before the pte is switched to the new page, and 2841 * before the pte is switched to the new page, and
2842 * "reuse" the old page writing into it while our pte 2842 * "reuse" the old page writing into it while our pte
2843 * here still points into it and can be read by other 2843 * here still points into it and can be read by other
2844 * threads. 2844 * threads.
2845 * 2845 *
2846 * The critical issue is to order this 2846 * The critical issue is to order this
2847 * page_remove_rmap with the ptep_clear_flush above. 2847 * page_remove_rmap with the ptep_clear_flush above.
2848 * Those stores are ordered by (if nothing else,) 2848 * Those stores are ordered by (if nothing else,)
2849 * the barrier present in the atomic_add_negative 2849 * the barrier present in the atomic_add_negative
2850 * in page_remove_rmap. 2850 * in page_remove_rmap.
2851 * 2851 *
2852 * Then the TLB flush in ptep_clear_flush ensures that 2852 * Then the TLB flush in ptep_clear_flush ensures that
2853 * no process can access the old page before the 2853 * no process can access the old page before the
2854 * decremented mapcount is visible. And the old page 2854 * decremented mapcount is visible. And the old page
2855 * cannot be reused until after the decremented 2855 * cannot be reused until after the decremented
2856 * mapcount is visible. So transitively, TLBs to 2856 * mapcount is visible. So transitively, TLBs to
2857 * old page will be flushed before it can be reused. 2857 * old page will be flushed before it can be reused.
2858 */ 2858 */
2859 page_remove_rmap(old_page); 2859 page_remove_rmap(old_page);
2860 } 2860 }
2861 2861
2862 /* Free the old page.. */ 2862 /* Free the old page.. */
2863 new_page = old_page; 2863 new_page = old_page;
2864 ret |= VM_FAULT_WRITE; 2864 ret |= VM_FAULT_WRITE;
2865 } else 2865 } else
2866 mem_cgroup_uncharge_page(new_page); 2866 mem_cgroup_uncharge_page(new_page);
2867 2867
2868 if (new_page) 2868 if (new_page)
2869 page_cache_release(new_page); 2869 page_cache_release(new_page);
2870 unlock: 2870 unlock:
2871 pte_unmap_unlock(page_table, ptl); 2871 pte_unmap_unlock(page_table, ptl);
2872 if (mmun_end > mmun_start) 2872 if (mmun_end > mmun_start)
2873 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); 2873 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
2874 if (old_page) { 2874 if (old_page) {
2875 /* 2875 /*
2876 * Don't let another task, with possibly unlocked vma, 2876 * Don't let another task, with possibly unlocked vma,
2877 * keep the mlocked page. 2877 * keep the mlocked page.
2878 */ 2878 */
2879 if ((ret & VM_FAULT_WRITE) && (vma->vm_flags & VM_LOCKED)) { 2879 if ((ret & VM_FAULT_WRITE) && (vma->vm_flags & VM_LOCKED)) {
2880 lock_page(old_page); /* LRU manipulation */ 2880 lock_page(old_page); /* LRU manipulation */
2881 munlock_vma_page(old_page); 2881 munlock_vma_page(old_page);
2882 unlock_page(old_page); 2882 unlock_page(old_page);
2883 } 2883 }
2884 page_cache_release(old_page); 2884 page_cache_release(old_page);
2885 } 2885 }
2886 return ret; 2886 return ret;
2887 oom_free_new: 2887 oom_free_new:
2888 page_cache_release(new_page); 2888 page_cache_release(new_page);
2889 oom: 2889 oom:
2890 if (old_page) 2890 if (old_page)
2891 page_cache_release(old_page); 2891 page_cache_release(old_page);
2892 return VM_FAULT_OOM; 2892 return VM_FAULT_OOM;
2893 2893
2894 unwritable_page: 2894 unwritable_page:
2895 page_cache_release(old_page); 2895 page_cache_release(old_page);
2896 return ret; 2896 return ret;
2897 } 2897 }
2898 2898
2899 static void unmap_mapping_range_vma(struct vm_area_struct *vma, 2899 static void unmap_mapping_range_vma(struct vm_area_struct *vma,
2900 unsigned long start_addr, unsigned long end_addr, 2900 unsigned long start_addr, unsigned long end_addr,
2901 struct zap_details *details) 2901 struct zap_details *details)
2902 { 2902 {
2903 zap_page_range_single(vma, start_addr, end_addr - start_addr, details); 2903 zap_page_range_single(vma, start_addr, end_addr - start_addr, details);
2904 } 2904 }
2905 2905
2906 static inline void unmap_mapping_range_tree(struct rb_root *root, 2906 static inline void unmap_mapping_range_tree(struct rb_root *root,
2907 struct zap_details *details) 2907 struct zap_details *details)
2908 { 2908 {
2909 struct vm_area_struct *vma; 2909 struct vm_area_struct *vma;
2910 pgoff_t vba, vea, zba, zea; 2910 pgoff_t vba, vea, zba, zea;
2911 2911
2912 vma_interval_tree_foreach(vma, root, 2912 vma_interval_tree_foreach(vma, root,
2913 details->first_index, details->last_index) { 2913 details->first_index, details->last_index) {
2914 2914
2915 vba = vma->vm_pgoff; 2915 vba = vma->vm_pgoff;
2916 vea = vba + ((vma->vm_end - vma->vm_start) >> PAGE_SHIFT) - 1; 2916 vea = vba + ((vma->vm_end - vma->vm_start) >> PAGE_SHIFT) - 1;
2917 /* Assume for now that PAGE_CACHE_SHIFT == PAGE_SHIFT */ 2917 /* Assume for now that PAGE_CACHE_SHIFT == PAGE_SHIFT */
2918 zba = details->first_index; 2918 zba = details->first_index;
2919 if (zba < vba) 2919 if (zba < vba)
2920 zba = vba; 2920 zba = vba;
2921 zea = details->last_index; 2921 zea = details->last_index;
2922 if (zea > vea) 2922 if (zea > vea)
2923 zea = vea; 2923 zea = vea;
2924 2924
2925 unmap_mapping_range_vma(vma, 2925 unmap_mapping_range_vma(vma,
2926 ((zba - vba) << PAGE_SHIFT) + vma->vm_start, 2926 ((zba - vba) << PAGE_SHIFT) + vma->vm_start,
2927 ((zea - vba + 1) << PAGE_SHIFT) + vma->vm_start, 2927 ((zea - vba + 1) << PAGE_SHIFT) + vma->vm_start,
2928 details); 2928 details);
2929 } 2929 }
2930 } 2930 }
2931 2931
2932 static inline void unmap_mapping_range_list(struct list_head *head, 2932 static inline void unmap_mapping_range_list(struct list_head *head,
2933 struct zap_details *details) 2933 struct zap_details *details)
2934 { 2934 {
2935 struct vm_area_struct *vma; 2935 struct vm_area_struct *vma;
2936 2936
2937 /* 2937 /*
2938 * In nonlinear VMAs there is no correspondence between virtual address 2938 * In nonlinear VMAs there is no correspondence between virtual address
2939 * offset and file offset. So we must perform an exhaustive search 2939 * offset and file offset. So we must perform an exhaustive search
2940 * across *all* the pages in each nonlinear VMA, not just the pages 2940 * across *all* the pages in each nonlinear VMA, not just the pages
2941 * whose virtual address lies outside the file truncation point. 2941 * whose virtual address lies outside the file truncation point.
2942 */ 2942 */
2943 list_for_each_entry(vma, head, shared.nonlinear) { 2943 list_for_each_entry(vma, head, shared.nonlinear) {
2944 details->nonlinear_vma = vma; 2944 details->nonlinear_vma = vma;
2945 unmap_mapping_range_vma(vma, vma->vm_start, vma->vm_end, details); 2945 unmap_mapping_range_vma(vma, vma->vm_start, vma->vm_end, details);
2946 } 2946 }
2947 } 2947 }
2948 2948
2949 /** 2949 /**
2950 * unmap_mapping_range - unmap the portion of all mmaps in the specified address_space corresponding to the specified page range in the underlying file. 2950 * unmap_mapping_range - unmap the portion of all mmaps in the specified address_space corresponding to the specified page range in the underlying file.
2951 * @mapping: the address space containing mmaps to be unmapped. 2951 * @mapping: the address space containing mmaps to be unmapped.
2952 * @holebegin: byte in first page to unmap, relative to the start of 2952 * @holebegin: byte in first page to unmap, relative to the start of
2953 * the underlying file. This will be rounded down to a PAGE_SIZE 2953 * the underlying file. This will be rounded down to a PAGE_SIZE
2954 * boundary. Note that this is different from truncate_pagecache(), which 2954 * boundary. Note that this is different from truncate_pagecache(), which
2955 * must keep the partial page. In contrast, we must get rid of 2955 * must keep the partial page. In contrast, we must get rid of
2956 * partial pages. 2956 * partial pages.
2957 * @holelen: size of prospective hole in bytes. This will be rounded 2957 * @holelen: size of prospective hole in bytes. This will be rounded
2958 * up to a PAGE_SIZE boundary. A holelen of zero truncates to the 2958 * up to a PAGE_SIZE boundary. A holelen of zero truncates to the
2959 * end of the file. 2959 * end of the file.
2960 * @even_cows: 1 when truncating a file, unmap even private COWed pages; 2960 * @even_cows: 1 when truncating a file, unmap even private COWed pages;
2961 * but 0 when invalidating pagecache, don't throw away private data. 2961 * but 0 when invalidating pagecache, don't throw away private data.
2962 */ 2962 */
2963 void unmap_mapping_range(struct address_space *mapping, 2963 void unmap_mapping_range(struct address_space *mapping,
2964 loff_t const holebegin, loff_t const holelen, int even_cows) 2964 loff_t const holebegin, loff_t const holelen, int even_cows)
2965 { 2965 {
2966 struct zap_details details; 2966 struct zap_details details;
2967 pgoff_t hba = holebegin >> PAGE_SHIFT; 2967 pgoff_t hba = holebegin >> PAGE_SHIFT;
2968 pgoff_t hlen = (holelen + PAGE_SIZE - 1) >> PAGE_SHIFT; 2968 pgoff_t hlen = (holelen + PAGE_SIZE - 1) >> PAGE_SHIFT;
2969 2969
2970 /* Check for overflow. */ 2970 /* Check for overflow. */
2971 if (sizeof(holelen) > sizeof(hlen)) { 2971 if (sizeof(holelen) > sizeof(hlen)) {
2972 long long holeend = 2972 long long holeend =
2973 (holebegin + holelen + PAGE_SIZE - 1) >> PAGE_SHIFT; 2973 (holebegin + holelen + PAGE_SIZE - 1) >> PAGE_SHIFT;
2974 if (holeend & ~(long long)ULONG_MAX) 2974 if (holeend & ~(long long)ULONG_MAX)
2975 hlen = ULONG_MAX - hba + 1; 2975 hlen = ULONG_MAX - hba + 1;
2976 } 2976 }
2977 2977
2978 details.check_mapping = even_cows? NULL: mapping; 2978 details.check_mapping = even_cows? NULL: mapping;
2979 details.nonlinear_vma = NULL; 2979 details.nonlinear_vma = NULL;
2980 details.first_index = hba; 2980 details.first_index = hba;
2981 details.last_index = hba + hlen - 1; 2981 details.last_index = hba + hlen - 1;
2982 if (details.last_index < details.first_index) 2982 if (details.last_index < details.first_index)
2983 details.last_index = ULONG_MAX; 2983 details.last_index = ULONG_MAX;
2984 2984
2985 2985
2986 mutex_lock(&mapping->i_mmap_mutex); 2986 mutex_lock(&mapping->i_mmap_mutex);
2987 if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap))) 2987 if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap)))
2988 unmap_mapping_range_tree(&mapping->i_mmap, &details); 2988 unmap_mapping_range_tree(&mapping->i_mmap, &details);
2989 if (unlikely(!list_empty(&mapping->i_mmap_nonlinear))) 2989 if (unlikely(!list_empty(&mapping->i_mmap_nonlinear)))
2990 unmap_mapping_range_list(&mapping->i_mmap_nonlinear, &details); 2990 unmap_mapping_range_list(&mapping->i_mmap_nonlinear, &details);
2991 mutex_unlock(&mapping->i_mmap_mutex); 2991 mutex_unlock(&mapping->i_mmap_mutex);
2992 } 2992 }
2993 EXPORT_SYMBOL(unmap_mapping_range); 2993 EXPORT_SYMBOL(unmap_mapping_range);
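
As a hedged illustration (not part of this commit), here is how a caller such as a filesystem's truncation path might use the unmap_mapping_range() helper exported above; example_truncate_mappings() is an invented name, and per the kernel-doc above a holelen of 0 means "through the end of the file" while even_cows == 1 also drops private COWed copies:

	static void example_truncate_mappings(struct inode *inode, loff_t newsize)
	{
		struct address_space *mapping = inode->i_mapping;
		/* First whole page beyond the new size. */
		loff_t holebegin = round_up(newsize, PAGE_SIZE);

		/*
		 * Zap every pte mapping the truncated range in all mms that
		 * mmap this file; even_cows = 1 because truncation must also
		 * discard private COWed copies of those pages.
		 */
		unmap_mapping_range(mapping, holebegin, 0, 1);
	}
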
2994 2994
2995 /* 2995 /*
2996 * We enter with non-exclusive mmap_sem (to exclude vma changes, 2996 * We enter with non-exclusive mmap_sem (to exclude vma changes,
2997 * but allow concurrent faults), and pte mapped but not yet locked. 2997 * but allow concurrent faults), and pte mapped but not yet locked.
2998 * We return with mmap_sem still held, but pte unmapped and unlocked. 2998 * We return with mmap_sem still held, but pte unmapped and unlocked.
2999 */ 2999 */
3000 static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, 3000 static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
3001 unsigned long address, pte_t *page_table, pmd_t *pmd, 3001 unsigned long address, pte_t *page_table, pmd_t *pmd,
3002 unsigned int flags, pte_t orig_pte) 3002 unsigned int flags, pte_t orig_pte)
3003 { 3003 {
3004 spinlock_t *ptl; 3004 spinlock_t *ptl;
3005 struct page *page, *swapcache; 3005 struct page *page, *swapcache;
3006 swp_entry_t entry; 3006 swp_entry_t entry;
3007 pte_t pte; 3007 pte_t pte;
3008 int locked; 3008 int locked;
3009 struct mem_cgroup *ptr; 3009 struct mem_cgroup *ptr;
3010 int exclusive = 0; 3010 int exclusive = 0;
3011 int ret = 0; 3011 int ret = 0;
3012 3012
3013 if (!pte_unmap_same(mm, pmd, page_table, orig_pte)) 3013 if (!pte_unmap_same(mm, pmd, page_table, orig_pte))
3014 goto out; 3014 goto out;
3015 3015
3016 entry = pte_to_swp_entry(orig_pte); 3016 entry = pte_to_swp_entry(orig_pte);
3017 if (unlikely(non_swap_entry(entry))) { 3017 if (unlikely(non_swap_entry(entry))) {
3018 if (is_migration_entry(entry)) { 3018 if (is_migration_entry(entry)) {
3019 migration_entry_wait(mm, pmd, address); 3019 migration_entry_wait(mm, pmd, address);
3020 } else if (is_hwpoison_entry(entry)) { 3020 } else if (is_hwpoison_entry(entry)) {
3021 ret = VM_FAULT_HWPOISON; 3021 ret = VM_FAULT_HWPOISON;
3022 } else { 3022 } else {
3023 print_bad_pte(vma, address, orig_pte, NULL); 3023 print_bad_pte(vma, address, orig_pte, NULL);
3024 ret = VM_FAULT_SIGBUS; 3024 ret = VM_FAULT_SIGBUS;
3025 } 3025 }
3026 goto out; 3026 goto out;
3027 } 3027 }
3028 delayacct_set_flag(DELAYACCT_PF_SWAPIN); 3028 delayacct_set_flag(DELAYACCT_PF_SWAPIN);
3029 page = lookup_swap_cache(entry); 3029 page = lookup_swap_cache(entry);
3030 if (!page) { 3030 if (!page) {
3031 page = swapin_readahead(entry, 3031 page = swapin_readahead(entry,
3032 GFP_HIGHUSER_MOVABLE, vma, address); 3032 GFP_HIGHUSER_MOVABLE, vma, address);
3033 if (!page) { 3033 if (!page) {
3034 /* 3034 /*
3035 * Back out if somebody else faulted in this pte 3035 * Back out if somebody else faulted in this pte
3036 * while we released the pte lock. 3036 * while we released the pte lock.
3037 */ 3037 */
3038 page_table = pte_offset_map_lock(mm, pmd, address, &ptl); 3038 page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
3039 if (likely(pte_same(*page_table, orig_pte))) 3039 if (likely(pte_same(*page_table, orig_pte)))
3040 ret = VM_FAULT_OOM; 3040 ret = VM_FAULT_OOM;
3041 delayacct_clear_flag(DELAYACCT_PF_SWAPIN); 3041 delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
3042 goto unlock; 3042 goto unlock;
3043 } 3043 }
3044 3044
3045 /* Had to read the page from swap area: Major fault */ 3045 /* Had to read the page from swap area: Major fault */
3046 ret = VM_FAULT_MAJOR; 3046 ret = VM_FAULT_MAJOR;
3047 count_vm_event(PGMAJFAULT); 3047 count_vm_event(PGMAJFAULT);
3048 mem_cgroup_count_vm_event(mm, PGMAJFAULT); 3048 mem_cgroup_count_vm_event(mm, PGMAJFAULT);
3049 } else if (PageHWPoison(page)) { 3049 } else if (PageHWPoison(page)) {
3050 /* 3050 /*
3051 * hwpoisoned dirty swapcache pages are kept for killing 3051 * hwpoisoned dirty swapcache pages are kept for killing
3052 * owner processes (which may be unknown at hwpoison time) 3052 * owner processes (which may be unknown at hwpoison time)
3053 */ 3053 */
3054 ret = VM_FAULT_HWPOISON; 3054 ret = VM_FAULT_HWPOISON;
3055 delayacct_clear_flag(DELAYACCT_PF_SWAPIN); 3055 delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
3056 swapcache = page; 3056 swapcache = page;
3057 goto out_release; 3057 goto out_release;
3058 } 3058 }
3059 3059
3060 swapcache = page; 3060 swapcache = page;
3061 locked = lock_page_or_retry(page, mm, flags); 3061 locked = lock_page_or_retry(page, mm, flags);
3062 3062
3063 delayacct_clear_flag(DELAYACCT_PF_SWAPIN); 3063 delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
3064 if (!locked) { 3064 if (!locked) {
3065 ret |= VM_FAULT_RETRY; 3065 ret |= VM_FAULT_RETRY;
3066 goto out_release; 3066 goto out_release;
3067 } 3067 }
3068 3068
3069 /* 3069 /*
3070 * Make sure try_to_free_swap or reuse_swap_page or swapoff did not 3070 * Make sure try_to_free_swap or reuse_swap_page or swapoff did not
3071 * release the swapcache from under us. The page pin, and pte_same 3071 * release the swapcache from under us. The page pin, and pte_same
3072 * test below, are not enough to exclude that. Even if it is still 3072 * test below, are not enough to exclude that. Even if it is still
3073 * swapcache, we need to check that the page's swap has not changed. 3073 * swapcache, we need to check that the page's swap has not changed.
3074 */ 3074 */
3075 if (unlikely(!PageSwapCache(page) || page_private(page) != entry.val)) 3075 if (unlikely(!PageSwapCache(page) || page_private(page) != entry.val))
3076 goto out_page; 3076 goto out_page;
3077 3077
3078 page = ksm_might_need_to_copy(page, vma, address); 3078 page = ksm_might_need_to_copy(page, vma, address);
3079 if (unlikely(!page)) { 3079 if (unlikely(!page)) {
3080 ret = VM_FAULT_OOM; 3080 ret = VM_FAULT_OOM;
3081 page = swapcache; 3081 page = swapcache;
3082 goto out_page; 3082 goto out_page;
3083 } 3083 }
3084 3084
3085 if (mem_cgroup_try_charge_swapin(mm, page, GFP_KERNEL, &ptr)) { 3085 if (mem_cgroup_try_charge_swapin(mm, page, GFP_KERNEL, &ptr)) {
3086 ret = VM_FAULT_OOM; 3086 ret = VM_FAULT_OOM;
3087 goto out_page; 3087 goto out_page;
3088 } 3088 }
3089 3089
3090 /* 3090 /*
3091 * Back out if somebody else already faulted in this pte. 3091 * Back out if somebody else already faulted in this pte.
3092 */ 3092 */
3093 page_table = pte_offset_map_lock(mm, pmd, address, &ptl); 3093 page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
3094 if (unlikely(!pte_same(*page_table, orig_pte))) 3094 if (unlikely(!pte_same(*page_table, orig_pte)))
3095 goto out_nomap; 3095 goto out_nomap;
3096 3096
3097 if (unlikely(!PageUptodate(page))) { 3097 if (unlikely(!PageUptodate(page))) {
3098 ret = VM_FAULT_SIGBUS; 3098 ret = VM_FAULT_SIGBUS;
3099 goto out_nomap; 3099 goto out_nomap;
3100 } 3100 }
3101 3101
3102 /* 3102 /*
3103 * The page isn't present yet, go ahead with the fault. 3103 * The page isn't present yet, go ahead with the fault.
3104 * 3104 *
3105 * Be careful about the sequence of operations here. 3105 * Be careful about the sequence of operations here.
3106 * To get its accounting right, reuse_swap_page() must be called 3106 * To get its accounting right, reuse_swap_page() must be called
3107 * while the page is counted on swap but not yet in mapcount i.e. 3107 * while the page is counted on swap but not yet in mapcount i.e.
3108 * before page_add_anon_rmap() and swap_free(); try_to_free_swap() 3108 * before page_add_anon_rmap() and swap_free(); try_to_free_swap()
3109 * must be called after the swap_free(), or it will never succeed. 3109 * must be called after the swap_free(), or it will never succeed.
3110 * Because delete_from_swap_cache() may be called by reuse_swap_page(), 3110 * Because delete_from_swap_cache() may be called by reuse_swap_page(),
3111 * mem_cgroup_commit_charge_swapin() may not be able to find swp_entry 3111 * mem_cgroup_commit_charge_swapin() may not be able to find swp_entry
3112 * in page->private. In this case, a record in swap_cgroup is silently 3112 * in page->private. In this case, a record in swap_cgroup is silently
3113 * discarded at swap_free(). 3113 * discarded at swap_free().
3114 */ 3114 */
3115 3115
3116 inc_mm_counter_fast(mm, MM_ANONPAGES); 3116 inc_mm_counter_fast(mm, MM_ANONPAGES);
3117 dec_mm_counter_fast(mm, MM_SWAPENTS); 3117 dec_mm_counter_fast(mm, MM_SWAPENTS);
3118 pte = mk_pte(page, vma->vm_page_prot); 3118 pte = mk_pte(page, vma->vm_page_prot);
3119 if ((flags & FAULT_FLAG_WRITE) && reuse_swap_page(page)) { 3119 if ((flags & FAULT_FLAG_WRITE) && reuse_swap_page(page)) {
3120 pte = maybe_mkwrite(pte_mkdirty(pte), vma); 3120 pte = maybe_mkwrite(pte_mkdirty(pte), vma);
3121 flags &= ~FAULT_FLAG_WRITE; 3121 flags &= ~FAULT_FLAG_WRITE;
3122 ret |= VM_FAULT_WRITE; 3122 ret |= VM_FAULT_WRITE;
3123 exclusive = 1; 3123 exclusive = 1;
3124 } 3124 }
3125 flush_icache_page(vma, page); 3125 flush_icache_page(vma, page);
3126 set_pte_at(mm, address, page_table, pte); 3126 set_pte_at(mm, address, page_table, pte);
3127 if (page == swapcache) 3127 if (page == swapcache)
3128 do_page_add_anon_rmap(page, vma, address, exclusive); 3128 do_page_add_anon_rmap(page, vma, address, exclusive);
3129 else /* ksm created a completely new copy */ 3129 else /* ksm created a completely new copy */
3130 page_add_new_anon_rmap(page, vma, address); 3130 page_add_new_anon_rmap(page, vma, address);
3131 /* It's better to call commit-charge after rmap is established */ 3131 /* It's better to call commit-charge after rmap is established */
3132 mem_cgroup_commit_charge_swapin(page, ptr); 3132 mem_cgroup_commit_charge_swapin(page, ptr);
3133 3133
3134 swap_free(entry); 3134 swap_free(entry);
3135 if (vm_swap_full() || (vma->vm_flags & VM_LOCKED) || PageMlocked(page)) 3135 if (vm_swap_full() || (vma->vm_flags & VM_LOCKED) || PageMlocked(page))
3136 try_to_free_swap(page); 3136 try_to_free_swap(page);
3137 unlock_page(page); 3137 unlock_page(page);
3138 if (page != swapcache) { 3138 if (page != swapcache) {
3139 /* 3139 /*
3140 * Hold the lock to avoid the swap entry being reused 3140 * Hold the lock to avoid the swap entry being reused
3141 * until we take the PT lock for the pte_same() check 3141 * until we take the PT lock for the pte_same() check
3142 * (to avoid false positives from pte_same). For 3142 * (to avoid false positives from pte_same). For
3143 * further safety release the lock after the swap_free 3143 * further safety release the lock after the swap_free
3144 * so that the swap count won't change under a 3144 * so that the swap count won't change under a
3145 * parallel locked swapcache. 3145 * parallel locked swapcache.
3146 */ 3146 */
3147 unlock_page(swapcache); 3147 unlock_page(swapcache);
3148 page_cache_release(swapcache); 3148 page_cache_release(swapcache);
3149 } 3149 }
3150 3150
3151 if (flags & FAULT_FLAG_WRITE) { 3151 if (flags & FAULT_FLAG_WRITE) {
3152 ret |= do_wp_page(mm, vma, address, page_table, pmd, ptl, pte); 3152 ret |= do_wp_page(mm, vma, address, page_table, pmd, ptl, pte);
3153 if (ret & VM_FAULT_ERROR) 3153 if (ret & VM_FAULT_ERROR)
3154 ret &= VM_FAULT_ERROR; 3154 ret &= VM_FAULT_ERROR;
3155 goto out; 3155 goto out;
3156 } 3156 }
3157 3157
3158 /* No need to invalidate - it was non-present before */ 3158 /* No need to invalidate - it was non-present before */
3159 update_mmu_cache(vma, address, page_table); 3159 update_mmu_cache(vma, address, page_table);
3160 unlock: 3160 unlock:
3161 pte_unmap_unlock(page_table, ptl); 3161 pte_unmap_unlock(page_table, ptl);
3162 out: 3162 out:
3163 return ret; 3163 return ret;
3164 out_nomap: 3164 out_nomap:
3165 mem_cgroup_cancel_charge_swapin(ptr); 3165 mem_cgroup_cancel_charge_swapin(ptr);
3166 pte_unmap_unlock(page_table, ptl); 3166 pte_unmap_unlock(page_table, ptl);
3167 out_page: 3167 out_page:
3168 unlock_page(page); 3168 unlock_page(page);
3169 out_release: 3169 out_release:
3170 page_cache_release(page); 3170 page_cache_release(page);
3171 if (page != swapcache) { 3171 if (page != swapcache) {
3172 unlock_page(swapcache); 3172 unlock_page(swapcache);
3173 page_cache_release(swapcache); 3173 page_cache_release(swapcache);
3174 } 3174 }
3175 return ret; 3175 return ret;
3176 } 3176 }
3177 3177
3178 /* 3178 /*
3179 * This is like a special single-page "expand_{down|up}wards()", 3179 * This is like a special single-page "expand_{down|up}wards()",
3180 * except we must first make sure that 'address{-|+}PAGE_SIZE' 3180 * except we must first make sure that 'address{-|+}PAGE_SIZE'
3181 * doesn't hit another vma. 3181 * doesn't hit another vma.
3182 */ 3182 */
3183 static inline int check_stack_guard_page(struct vm_area_struct *vma, unsigned long address) 3183 static inline int check_stack_guard_page(struct vm_area_struct *vma, unsigned long address)
3184 { 3184 {
3185 address &= PAGE_MASK; 3185 address &= PAGE_MASK;
3186 if ((vma->vm_flags & VM_GROWSDOWN) && address == vma->vm_start) { 3186 if ((vma->vm_flags & VM_GROWSDOWN) && address == vma->vm_start) {
3187 struct vm_area_struct *prev = vma->vm_prev; 3187 struct vm_area_struct *prev = vma->vm_prev;
3188 3188
3189 /* 3189 /*
3190 * Is there a mapping abutting this one below? 3190 * Is there a mapping abutting this one below?
3191 * 3191 *
3192 * That's only ok if it's the same stack mapping 3192 * That's only ok if it's the same stack mapping
3193 * that has gotten split.. 3193 * that has gotten split..
3194 */ 3194 */
3195 if (prev && prev->vm_end == address) 3195 if (prev && prev->vm_end == address)
3196 return prev->vm_flags & VM_GROWSDOWN ? 0 : -ENOMEM; 3196 return prev->vm_flags & VM_GROWSDOWN ? 0 : -ENOMEM;
3197 3197
3198 expand_downwards(vma, address - PAGE_SIZE); 3198 expand_downwards(vma, address - PAGE_SIZE);
3199 } 3199 }
3200 if ((vma->vm_flags & VM_GROWSUP) && address + PAGE_SIZE == vma->vm_end) { 3200 if ((vma->vm_flags & VM_GROWSUP) && address + PAGE_SIZE == vma->vm_end) {
3201 struct vm_area_struct *next = vma->vm_next; 3201 struct vm_area_struct *next = vma->vm_next;
3202 3202
3203 /* As VM_GROWSDOWN but s/below/above/ */ 3203 /* As VM_GROWSDOWN but s/below/above/ */
3204 if (next && next->vm_start == address + PAGE_SIZE) 3204 if (next && next->vm_start == address + PAGE_SIZE)
3205 return next->vm_flags & VM_GROWSUP ? 0 : -ENOMEM; 3205 return next->vm_flags & VM_GROWSUP ? 0 : -ENOMEM;
3206 3206
3207 expand_upwards(vma, address + PAGE_SIZE); 3207 expand_upwards(vma, address + PAGE_SIZE);
3208 } 3208 }
3209 return 0; 3209 return 0;
3210 } 3210 }
3211 3211
3212 /* 3212 /*
3213 * We enter with non-exclusive mmap_sem (to exclude vma changes, 3213 * We enter with non-exclusive mmap_sem (to exclude vma changes,
3214 * but allow concurrent faults), and pte mapped but not yet locked. 3214 * but allow concurrent faults), and pte mapped but not yet locked.
3215 * We return with mmap_sem still held, but pte unmapped and unlocked. 3215 * We return with mmap_sem still held, but pte unmapped and unlocked.
3216 */ 3216 */
3217 static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, 3217 static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
3218 unsigned long address, pte_t *page_table, pmd_t *pmd, 3218 unsigned long address, pte_t *page_table, pmd_t *pmd,
3219 unsigned int flags) 3219 unsigned int flags)
3220 { 3220 {
3221 struct page *page; 3221 struct page *page;
3222 spinlock_t *ptl; 3222 spinlock_t *ptl;
3223 pte_t entry; 3223 pte_t entry;
3224 3224
3225 pte_unmap(page_table); 3225 pte_unmap(page_table);
3226 3226
3227 /* Check if we need to add a guard page to the stack */ 3227 /* Check if we need to add a guard page to the stack */
3228 if (check_stack_guard_page(vma, address) < 0) 3228 if (check_stack_guard_page(vma, address) < 0)
3229 return VM_FAULT_SIGBUS; 3229 return VM_FAULT_SIGBUS;
3230 3230
3231 /* Use the zero-page for reads */ 3231 /* Use the zero-page for reads */
3232 if (!(flags & FAULT_FLAG_WRITE)) { 3232 if (!(flags & FAULT_FLAG_WRITE)) {
3233 entry = pte_mkspecial(pfn_pte(my_zero_pfn(address), 3233 entry = pte_mkspecial(pfn_pte(my_zero_pfn(address),
3234 vma->vm_page_prot)); 3234 vma->vm_page_prot));
3235 page_table = pte_offset_map_lock(mm, pmd, address, &ptl); 3235 page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
3236 if (!pte_none(*page_table)) 3236 if (!pte_none(*page_table))
3237 goto unlock; 3237 goto unlock;
3238 goto setpte; 3238 goto setpte;
3239 } 3239 }
3240 3240
3241 /* Allocate our own private page. */ 3241 /* Allocate our own private page. */
3242 if (unlikely(anon_vma_prepare(vma))) 3242 if (unlikely(anon_vma_prepare(vma)))
3243 goto oom; 3243 goto oom;
3244 page = alloc_zeroed_user_highpage_movable(vma, address); 3244 page = alloc_zeroed_user_highpage_movable(vma, address);
3245 if (!page) 3245 if (!page)
3246 goto oom; 3246 goto oom;
3247 /*
3248 * The memory barrier inside __SetPageUptodate makes sure that
3249 * preceding stores to the page contents become visible before
3250 * the set_pte_at() write.
3251 */
3247 __SetPageUptodate(page); 3252 __SetPageUptodate(page);
3248 3253
3249 if (mem_cgroup_newpage_charge(page, mm, GFP_KERNEL)) 3254 if (mem_cgroup_newpage_charge(page, mm, GFP_KERNEL))
3250 goto oom_free_page; 3255 goto oom_free_page;
3251 3256
3252 entry = mk_pte(page, vma->vm_page_prot); 3257 entry = mk_pte(page, vma->vm_page_prot);
3253 if (vma->vm_flags & VM_WRITE) 3258 if (vma->vm_flags & VM_WRITE)
3254 entry = pte_mkwrite(pte_mkdirty(entry)); 3259 entry = pte_mkwrite(pte_mkdirty(entry));
3255 3260
3256 page_table = pte_offset_map_lock(mm, pmd, address, &ptl); 3261 page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
3257 if (!pte_none(*page_table)) 3262 if (!pte_none(*page_table))
3258 goto release; 3263 goto release;
3259 3264
3260 inc_mm_counter_fast(mm, MM_ANONPAGES); 3265 inc_mm_counter_fast(mm, MM_ANONPAGES);
3261 page_add_new_anon_rmap(page, vma, address); 3266 page_add_new_anon_rmap(page, vma, address);
3262 setpte: 3267 setpte:
3263 set_pte_at(mm, address, page_table, entry); 3268 set_pte_at(mm, address, page_table, entry);
3264 3269
3265 /* No need to invalidate - it was non-present before */ 3270 /* No need to invalidate - it was non-present before */
3266 update_mmu_cache(vma, address, page_table); 3271 update_mmu_cache(vma, address, page_table);
3267 unlock: 3272 unlock:
3268 pte_unmap_unlock(page_table, ptl); 3273 pte_unmap_unlock(page_table, ptl);
3269 return 0; 3274 return 0;
3270 release: 3275 release:
3271 mem_cgroup_uncharge_page(page); 3276 mem_cgroup_uncharge_page(page);
3272 page_cache_release(page); 3277 page_cache_release(page);
3273 goto unlock; 3278 goto unlock;
3274 oom_free_page: 3279 oom_free_page:
3275 page_cache_release(page); 3280 page_cache_release(page);
3276 oom: 3281 oom:
3277 return VM_FAULT_OOM; 3282 return VM_FAULT_OOM;
3278 } 3283 }
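
The comment added to do_anonymous_page() above leans on the smp_wmb() inside __SetPageUptodate(). As a hedged illustration (not code from this commit), the ordering it provides once set_pte_at() publishes the mapping looks roughly like this:

	/*
	 * CPU 0 (faulting task)                CPU 1 (another thread of the mm)
	 * ---------------------                --------------------------------
	 * ...zero/initialize page contents...
	 * __SetPageUptodate(page);
	 *   -> smp_wmb(), then PG_uptodate
	 * set_pte_at(mm, addr, pte, entry);    walks the page table, sees the
	 *                                      new pte, and loads from the page;
	 *                                      the wmb keeps it from observing
	 *                                      the pte without also observing
	 *                                      the initialized contents.
	 */

Without that barrier, the store installing the pte could become visible before the stores that initialized the page, and another thread could read stale data through the freshly mapped address.
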
3279 3284
3280 /* 3285 /*
3281 * __do_fault() tries to create a new page mapping. It aggressively 3286 * __do_fault() tries to create a new page mapping. It aggressively
3282 * tries to share with existing pages, but makes a separate copy if 3287 * tries to share with existing pages, but makes a separate copy if
3283 * the FAULT_FLAG_WRITE is set in the flags parameter in order to avoid 3288 * the FAULT_FLAG_WRITE is set in the flags parameter in order to avoid
3284 * the next page fault. 3289 * the next page fault.
3285 * 3290 *
3286 * As this is called only for pages that do not currently exist, we 3291 * As this is called only for pages that do not currently exist, we
3287 * do not need to flush old virtual caches or the TLB. 3292 * do not need to flush old virtual caches or the TLB.
3288 * 3293 *
3289 * We enter with non-exclusive mmap_sem (to exclude vma changes, 3294 * We enter with non-exclusive mmap_sem (to exclude vma changes,
3290 * but allow concurrent faults), and pte neither mapped nor locked. 3295 * but allow concurrent faults), and pte neither mapped nor locked.
3291 * We return with mmap_sem still held, but pte unmapped and unlocked. 3296 * We return with mmap_sem still held, but pte unmapped and unlocked.
3292 */ 3297 */
3293 static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, 3298 static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
3294 unsigned long address, pmd_t *pmd, 3299 unsigned long address, pmd_t *pmd,
3295 pgoff_t pgoff, unsigned int flags, pte_t orig_pte) 3300 pgoff_t pgoff, unsigned int flags, pte_t orig_pte)
3296 { 3301 {
3297 pte_t *page_table; 3302 pte_t *page_table;
3298 spinlock_t *ptl; 3303 spinlock_t *ptl;
3299 struct page *page; 3304 struct page *page;
3300 struct page *cow_page; 3305 struct page *cow_page;
3301 pte_t entry; 3306 pte_t entry;
3302 int anon = 0; 3307 int anon = 0;
3303 struct page *dirty_page = NULL; 3308 struct page *dirty_page = NULL;
3304 struct vm_fault vmf; 3309 struct vm_fault vmf;
3305 int ret; 3310 int ret;
3306 int page_mkwrite = 0; 3311 int page_mkwrite = 0;
3307 3312
3308 /* 3313 /*
3309 * If we do COW later, allocate page before taking lock_page() 3314 * If we do COW later, allocate page before taking lock_page()
3310 * on the file cache page. This will reduce lock holding time. 3315 * on the file cache page. This will reduce lock holding time.
3311 */ 3316 */
3312 if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) { 3317 if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) {
3313 3318
3314 if (unlikely(anon_vma_prepare(vma))) 3319 if (unlikely(anon_vma_prepare(vma)))
3315 return VM_FAULT_OOM; 3320 return VM_FAULT_OOM;
3316 3321
3317 cow_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address); 3322 cow_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
3318 if (!cow_page) 3323 if (!cow_page)
3319 return VM_FAULT_OOM; 3324 return VM_FAULT_OOM;
3320 3325
3321 if (mem_cgroup_newpage_charge(cow_page, mm, GFP_KERNEL)) { 3326 if (mem_cgroup_newpage_charge(cow_page, mm, GFP_KERNEL)) {
3322 page_cache_release(cow_page); 3327 page_cache_release(cow_page);
3323 return VM_FAULT_OOM; 3328 return VM_FAULT_OOM;
3324 } 3329 }
3325 } else 3330 } else
3326 cow_page = NULL; 3331 cow_page = NULL;
3327 3332
3328 vmf.virtual_address = (void __user *)(address & PAGE_MASK); 3333 vmf.virtual_address = (void __user *)(address & PAGE_MASK);
3329 vmf.pgoff = pgoff; 3334 vmf.pgoff = pgoff;
3330 vmf.flags = flags; 3335 vmf.flags = flags;
3331 vmf.page = NULL; 3336 vmf.page = NULL;
3332 3337
3333 ret = vma->vm_ops->fault(vma, &vmf); 3338 ret = vma->vm_ops->fault(vma, &vmf);
3334 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | 3339 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE |
3335 VM_FAULT_RETRY))) 3340 VM_FAULT_RETRY)))
3336 goto uncharge_out; 3341 goto uncharge_out;
3337 3342
3338 if (unlikely(PageHWPoison(vmf.page))) { 3343 if (unlikely(PageHWPoison(vmf.page))) {
3339 if (ret & VM_FAULT_LOCKED) 3344 if (ret & VM_FAULT_LOCKED)
3340 unlock_page(vmf.page); 3345 unlock_page(vmf.page);
3341 ret = VM_FAULT_HWPOISON; 3346 ret = VM_FAULT_HWPOISON;
3342 goto uncharge_out; 3347 goto uncharge_out;
3343 } 3348 }
3344 3349
3345 /* 3350 /*
3346 * For consistency in subsequent calls, make the faulted page always 3351 * For consistency in subsequent calls, make the faulted page always
3347 * locked. 3352 * locked.
3348 */ 3353 */
3349 if (unlikely(!(ret & VM_FAULT_LOCKED))) 3354 if (unlikely(!(ret & VM_FAULT_LOCKED)))
3350 lock_page(vmf.page); 3355 lock_page(vmf.page);
3351 else 3356 else
3352 VM_BUG_ON(!PageLocked(vmf.page)); 3357 VM_BUG_ON(!PageLocked(vmf.page));
3353 3358
3354 /* 3359 /*
3355 * Should we do an early C-O-W break? 3360 * Should we do an early C-O-W break?
3356 */ 3361 */
3357 page = vmf.page; 3362 page = vmf.page;
3358 if (flags & FAULT_FLAG_WRITE) { 3363 if (flags & FAULT_FLAG_WRITE) {
3359 if (!(vma->vm_flags & VM_SHARED)) { 3364 if (!(vma->vm_flags & VM_SHARED)) {
3360 page = cow_page; 3365 page = cow_page;
3361 anon = 1; 3366 anon = 1;
3362 copy_user_highpage(page, vmf.page, address, vma); 3367 copy_user_highpage(page, vmf.page, address, vma);
3363 __SetPageUptodate(page); 3368 __SetPageUptodate(page);
3364 } else { 3369 } else {
3365 /* 3370 /*
3366 * If the page will be shareable, see if the backing 3371 * If the page will be shareable, see if the backing
3367 * address space wants to know that the page is about 3372 * address space wants to know that the page is about
3368 * to become writable 3373 * to become writable
3369 */ 3374 */
3370 if (vma->vm_ops->page_mkwrite) { 3375 if (vma->vm_ops->page_mkwrite) {
3371 int tmp; 3376 int tmp;
3372 3377
3373 unlock_page(page); 3378 unlock_page(page);
3374 vmf.flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE; 3379 vmf.flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE;
3375 tmp = vma->vm_ops->page_mkwrite(vma, &vmf); 3380 tmp = vma->vm_ops->page_mkwrite(vma, &vmf);
3376 if (unlikely(tmp & 3381 if (unlikely(tmp &
3377 (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) { 3382 (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) {
3378 ret = tmp; 3383 ret = tmp;
3379 goto unwritable_page; 3384 goto unwritable_page;
3380 } 3385 }
3381 if (unlikely(!(tmp & VM_FAULT_LOCKED))) { 3386 if (unlikely(!(tmp & VM_FAULT_LOCKED))) {
3382 lock_page(page); 3387 lock_page(page);
3383 if (!page->mapping) { 3388 if (!page->mapping) {
3384 ret = 0; /* retry the fault */ 3389 ret = 0; /* retry the fault */
3385 unlock_page(page); 3390 unlock_page(page);
3386 goto unwritable_page; 3391 goto unwritable_page;
3387 } 3392 }
3388 } else 3393 } else
3389 VM_BUG_ON(!PageLocked(page)); 3394 VM_BUG_ON(!PageLocked(page));
3390 page_mkwrite = 1; 3395 page_mkwrite = 1;
3391 } 3396 }
3392 } 3397 }
3393 3398
3394 } 3399 }
3395 3400
3396 page_table = pte_offset_map_lock(mm, pmd, address, &ptl); 3401 page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
3397 3402
3398 /* 3403 /*
3399 * This silly early PAGE_DIRTY setting removes a race 3404 * This silly early PAGE_DIRTY setting removes a race
3400 * due to the bad i386 page protection. But it's valid 3405 * due to the bad i386 page protection. But it's valid
3401 * for other architectures too. 3406 * for other architectures too.
3402 * 3407 *
3403 * Note that if FAULT_FLAG_WRITE is set, we either now have 3408 * Note that if FAULT_FLAG_WRITE is set, we either now have
3404 * an exclusive copy of the page, or this is a shared mapping, 3409 * an exclusive copy of the page, or this is a shared mapping,
3405 * so we can make it writable and dirty to avoid having to 3410 * so we can make it writable and dirty to avoid having to
3406 * handle that later. 3411 * handle that later.
3407 */ 3412 */
3408 /* Only go through if we didn't race with anybody else... */ 3413 /* Only go through if we didn't race with anybody else... */
3409 if (likely(pte_same(*page_table, orig_pte))) { 3414 if (likely(pte_same(*page_table, orig_pte))) {
3410 flush_icache_page(vma, page); 3415 flush_icache_page(vma, page);
3411 entry = mk_pte(page, vma->vm_page_prot); 3416 entry = mk_pte(page, vma->vm_page_prot);
3412 if (flags & FAULT_FLAG_WRITE) 3417 if (flags & FAULT_FLAG_WRITE)
3413 entry = maybe_mkwrite(pte_mkdirty(entry), vma); 3418 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
3414 if (anon) { 3419 if (anon) {
3415 inc_mm_counter_fast(mm, MM_ANONPAGES); 3420 inc_mm_counter_fast(mm, MM_ANONPAGES);
3416 page_add_new_anon_rmap(page, vma, address); 3421 page_add_new_anon_rmap(page, vma, address);
3417 } else { 3422 } else {
3418 inc_mm_counter_fast(mm, MM_FILEPAGES); 3423 inc_mm_counter_fast(mm, MM_FILEPAGES);
3419 page_add_file_rmap(page); 3424 page_add_file_rmap(page);
3420 if (flags & FAULT_FLAG_WRITE) { 3425 if (flags & FAULT_FLAG_WRITE) {
3421 dirty_page = page; 3426 dirty_page = page;
3422 get_page(dirty_page); 3427 get_page(dirty_page);
3423 } 3428 }
3424 } 3429 }
3425 set_pte_at(mm, address, page_table, entry); 3430 set_pte_at(mm, address, page_table, entry);
3426 3431
3427 /* no need to invalidate: a not-present page won't be cached */ 3432 /* no need to invalidate: a not-present page won't be cached */
3428 update_mmu_cache(vma, address, page_table); 3433 update_mmu_cache(vma, address, page_table);
3429 } else { 3434 } else {
3430 if (cow_page) 3435 if (cow_page)
3431 mem_cgroup_uncharge_page(cow_page); 3436 mem_cgroup_uncharge_page(cow_page);
3432 if (anon) 3437 if (anon)
3433 page_cache_release(page); 3438 page_cache_release(page);
3434 else 3439 else
3435 anon = 1; /* no anon but release faulted_page */ 3440 anon = 1; /* no anon but release faulted_page */
3436 } 3441 }
3437 3442
3438 pte_unmap_unlock(page_table, ptl); 3443 pte_unmap_unlock(page_table, ptl);
3439 3444
3440 if (dirty_page) { 3445 if (dirty_page) {
3441 struct address_space *mapping = page->mapping; 3446 struct address_space *mapping = page->mapping;
3442 int dirtied = 0; 3447 int dirtied = 0;
3443 3448
3444 if (set_page_dirty(dirty_page)) 3449 if (set_page_dirty(dirty_page))
3445 dirtied = 1; 3450 dirtied = 1;
3446 unlock_page(dirty_page); 3451 unlock_page(dirty_page);
3447 put_page(dirty_page); 3452 put_page(dirty_page);
3448 if ((dirtied || page_mkwrite) && mapping) { 3453 if ((dirtied || page_mkwrite) && mapping) {
3449 /* 3454 /*
3450 * Some device drivers do not set page.mapping but still 3455 * Some device drivers do not set page.mapping but still
3451 * dirty their pages 3456 * dirty their pages
3452 */ 3457 */
3453 balance_dirty_pages_ratelimited(mapping); 3458 balance_dirty_pages_ratelimited(mapping);
3454 } 3459 }
3455 3460
3456 /* file_update_time outside page_lock */ 3461 /* file_update_time outside page_lock */
3457 if (vma->vm_file && !page_mkwrite) 3462 if (vma->vm_file && !page_mkwrite)
3458 file_update_time(vma->vm_file); 3463 file_update_time(vma->vm_file);
3459 } else { 3464 } else {
3460 unlock_page(vmf.page); 3465 unlock_page(vmf.page);
3461 if (anon) 3466 if (anon)
3462 page_cache_release(vmf.page); 3467 page_cache_release(vmf.page);
3463 } 3468 }
3464 3469
3465 return ret; 3470 return ret;
3466 3471
3467 unwritable_page: 3472 unwritable_page:
3468 page_cache_release(page); 3473 page_cache_release(page);
3469 return ret; 3474 return ret;
3470 uncharge_out: 3475 uncharge_out:
3471 /* fs's fault handler got an error */ 3476 /* fs's fault handler got an error */
3472 if (cow_page) { 3477 if (cow_page) {
3473 mem_cgroup_uncharge_page(cow_page); 3478 mem_cgroup_uncharge_page(cow_page);
3474 page_cache_release(cow_page); 3479 page_cache_release(cow_page);
3475 } 3480 }
3476 return ret; 3481 return ret;
3477 } 3482 }
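
__do_fault() above calls vma->vm_ops->fault() and maps whatever page the callback hands back. The sketch below is a hypothetical minimal handler (all example_* names are invented, not from this commit) showing the contract __do_fault() relies on: return a referenced page in vmf->page, and set VM_FAULT_LOCKED only if that page is returned locked:

	/* Invented driver state used only for this sketch. */
	struct example_dev {
		struct page **pages;		/* pre-allocated, initialized pages */
		unsigned long nr_pages;
	};

	static int example_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
	{
		struct example_dev *dev = vma->vm_private_data;

		if (vmf->pgoff >= dev->nr_pages)
			return VM_FAULT_SIGBUS;

		/* __do_fault() expects a referenced page in vmf->page. */
		vmf->page = dev->pages[vmf->pgoff];
		get_page(vmf->page);

		/*
		 * Returning 0 without VM_FAULT_LOCKED is fine: __do_fault()
		 * will lock_page() the page itself, as the "make the faulted
		 * page always locked" comment above shows.
		 */
		return 0;
	}
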
3478 3483
3479 static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma, 3484 static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
3480 unsigned long address, pte_t *page_table, pmd_t *pmd, 3485 unsigned long address, pte_t *page_table, pmd_t *pmd,
3481 unsigned int flags, pte_t orig_pte) 3486 unsigned int flags, pte_t orig_pte)
3482 { 3487 {
3483 pgoff_t pgoff = (((address & PAGE_MASK) 3488 pgoff_t pgoff = (((address & PAGE_MASK)
3484 - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; 3489 - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
3485 3490
3486 pte_unmap(page_table); 3491 pte_unmap(page_table);
3487 return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte); 3492 return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte);
3488 } 3493 }
3489 3494
3490 /* 3495 /*
3491 * Fault of a previously existing named mapping. Repopulate the pte 3496 * Fault of a previously existing named mapping. Repopulate the pte
3492 * from the encoded file_pte if possible. This enables swappable 3497 * from the encoded file_pte if possible. This enables swappable
3493 * nonlinear vmas. 3498 * nonlinear vmas.
3494 * 3499 *
3495 * We enter with non-exclusive mmap_sem (to exclude vma changes, 3500 * We enter with non-exclusive mmap_sem (to exclude vma changes,
3496 * but allow concurrent faults), and pte mapped but not yet locked. 3501 * but allow concurrent faults), and pte mapped but not yet locked.
3497 * We return with mmap_sem still held, but pte unmapped and unlocked. 3502 * We return with mmap_sem still held, but pte unmapped and unlocked.
3498 */ 3503 */
3499 static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma, 3504 static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
3500 unsigned long address, pte_t *page_table, pmd_t *pmd, 3505 unsigned long address, pte_t *page_table, pmd_t *pmd,
3501 unsigned int flags, pte_t orig_pte) 3506 unsigned int flags, pte_t orig_pte)
3502 { 3507 {
3503 pgoff_t pgoff; 3508 pgoff_t pgoff;
3504 3509
3505 flags |= FAULT_FLAG_NONLINEAR; 3510 flags |= FAULT_FLAG_NONLINEAR;
3506 3511
3507 if (!pte_unmap_same(mm, pmd, page_table, orig_pte)) 3512 if (!pte_unmap_same(mm, pmd, page_table, orig_pte))
3508 return 0; 3513 return 0;
3509 3514
3510 if (unlikely(!(vma->vm_flags & VM_NONLINEAR))) { 3515 if (unlikely(!(vma->vm_flags & VM_NONLINEAR))) {
3511 /* 3516 /*
3512 * Page table corrupted: show pte and kill process. 3517 * Page table corrupted: show pte and kill process.
3513 */ 3518 */
3514 print_bad_pte(vma, address, orig_pte, NULL); 3519 print_bad_pte(vma, address, orig_pte, NULL);
3515 return VM_FAULT_SIGBUS; 3520 return VM_FAULT_SIGBUS;
3516 } 3521 }
3517 3522
3518 pgoff = pte_to_pgoff(orig_pte); 3523 pgoff = pte_to_pgoff(orig_pte);
3519 return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte); 3524 return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte);
3520 } 3525 }
3521 3526
3522 int numa_migrate_prep(struct page *page, struct vm_area_struct *vma, 3527 int numa_migrate_prep(struct page *page, struct vm_area_struct *vma,
3523 unsigned long addr, int current_nid) 3528 unsigned long addr, int current_nid)
3524 { 3529 {
3525 get_page(page); 3530 get_page(page);
3526 3531
3527 count_vm_numa_event(NUMA_HINT_FAULTS); 3532 count_vm_numa_event(NUMA_HINT_FAULTS);
3528 if (current_nid == numa_node_id()) 3533 if (current_nid == numa_node_id())
3529 count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL); 3534 count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL);
3530 3535
3531 return mpol_misplaced(page, vma, addr); 3536 return mpol_misplaced(page, vma, addr);
3532 } 3537 }
3533 3538
3534 int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, 3539 int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
3535 unsigned long addr, pte_t pte, pte_t *ptep, pmd_t *pmd) 3540 unsigned long addr, pte_t pte, pte_t *ptep, pmd_t *pmd)
3536 { 3541 {
3537 struct page *page = NULL; 3542 struct page *page = NULL;
3538 spinlock_t *ptl; 3543 spinlock_t *ptl;
3539 int current_nid = -1; 3544 int current_nid = -1;
3540 int target_nid; 3545 int target_nid;
3541 bool migrated = false; 3546 bool migrated = false;
3542 3547
3543 /* 3548 /*
3544 * The "pte" at this point cannot be used safely without 3549 * The "pte" at this point cannot be used safely without
3545 * validation through pte_unmap_same(). It's of NUMA type but 3550 * validation through pte_unmap_same(). It's of NUMA type but
3546 * the pfn may be screwed if the read is non atomic. 3551 * the pfn may be screwed if the read is non atomic.
3547 * 3552 *
3548 * ptep_modify_prot_start is not called as this is clearing 3553 * ptep_modify_prot_start is not called as this is clearing
3549 * the _PAGE_NUMA bit and it is not really expected that there 3554 * the _PAGE_NUMA bit and it is not really expected that there
3550 * would be concurrent hardware modifications to the PTE. 3555 * would be concurrent hardware modifications to the PTE.
3551 */ 3556 */
3552 ptl = pte_lockptr(mm, pmd); 3557 ptl = pte_lockptr(mm, pmd);
3553 spin_lock(ptl); 3558 spin_lock(ptl);
3554 if (unlikely(!pte_same(*ptep, pte))) { 3559 if (unlikely(!pte_same(*ptep, pte))) {
3555 pte_unmap_unlock(ptep, ptl); 3560 pte_unmap_unlock(ptep, ptl);
3556 goto out; 3561 goto out;
3557 } 3562 }
3558 3563
3559 pte = pte_mknonnuma(pte); 3564 pte = pte_mknonnuma(pte);
3560 set_pte_at(mm, addr, ptep, pte); 3565 set_pte_at(mm, addr, ptep, pte);
3561 update_mmu_cache(vma, addr, ptep); 3566 update_mmu_cache(vma, addr, ptep);
3562 3567
3563 page = vm_normal_page(vma, addr, pte); 3568 page = vm_normal_page(vma, addr, pte);
3564 if (!page) { 3569 if (!page) {
3565 pte_unmap_unlock(ptep, ptl); 3570 pte_unmap_unlock(ptep, ptl);
3566 return 0; 3571 return 0;
3567 } 3572 }
3568 3573
3569 current_nid = page_to_nid(page); 3574 current_nid = page_to_nid(page);
3570 target_nid = numa_migrate_prep(page, vma, addr, current_nid); 3575 target_nid = numa_migrate_prep(page, vma, addr, current_nid);
3571 pte_unmap_unlock(ptep, ptl); 3576 pte_unmap_unlock(ptep, ptl);
3572 if (target_nid == -1) { 3577 if (target_nid == -1) {
3573 /* 3578 /*
3574 * Account for the fault against the current node if it is not 3579 * Account for the fault against the current node if it is not
3575 * being replaced regardless of where the page is located. 3580 * being replaced regardless of where the page is located.
3576 */ 3581 */
3577 current_nid = numa_node_id(); 3582 current_nid = numa_node_id();
3578 put_page(page); 3583 put_page(page);
3579 goto out; 3584 goto out;
3580 } 3585 }
3581 3586
3582 /* Migrate to the requested node */ 3587 /* Migrate to the requested node */
3583 migrated = migrate_misplaced_page(page, target_nid); 3588 migrated = migrate_misplaced_page(page, target_nid);
3584 if (migrated) 3589 if (migrated)
3585 current_nid = target_nid; 3590 current_nid = target_nid;
3586 3591
3587 out: 3592 out:
3588 if (current_nid != -1) 3593 if (current_nid != -1)
3589 task_numa_fault(current_nid, 1, migrated); 3594 task_numa_fault(current_nid, 1, migrated);
3590 return 0; 3595 return 0;
3591 } 3596 }
3592 3597
3593 /* NUMA hinting page fault entry point for regular pmds */ 3598 /* NUMA hinting page fault entry point for regular pmds */
3594 #ifdef CONFIG_NUMA_BALANCING 3599 #ifdef CONFIG_NUMA_BALANCING
3595 static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, 3600 static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
3596 unsigned long addr, pmd_t *pmdp) 3601 unsigned long addr, pmd_t *pmdp)
3597 { 3602 {
3598 pmd_t pmd; 3603 pmd_t pmd;
3599 pte_t *pte, *orig_pte; 3604 pte_t *pte, *orig_pte;
3600 unsigned long _addr = addr & PMD_MASK; 3605 unsigned long _addr = addr & PMD_MASK;
3601 unsigned long offset; 3606 unsigned long offset;
3602 spinlock_t *ptl; 3607 spinlock_t *ptl;
3603 bool numa = false; 3608 bool numa = false;
3604 int local_nid = numa_node_id(); 3609 int local_nid = numa_node_id();
3605 3610
3606 spin_lock(&mm->page_table_lock); 3611 spin_lock(&mm->page_table_lock);
3607 pmd = *pmdp; 3612 pmd = *pmdp;
3608 if (pmd_numa(pmd)) { 3613 if (pmd_numa(pmd)) {
3609 set_pmd_at(mm, _addr, pmdp, pmd_mknonnuma(pmd)); 3614 set_pmd_at(mm, _addr, pmdp, pmd_mknonnuma(pmd));
3610 numa = true; 3615 numa = true;
3611 } 3616 }
3612 spin_unlock(&mm->page_table_lock); 3617 spin_unlock(&mm->page_table_lock);
3613 3618
3614 if (!numa) 3619 if (!numa)
3615 return 0; 3620 return 0;
3616 3621
3617 /* we're in a page fault so some vma must be in the range */ 3622 /* we're in a page fault so some vma must be in the range */
3618 BUG_ON(!vma); 3623 BUG_ON(!vma);
3619 BUG_ON(vma->vm_start >= _addr + PMD_SIZE); 3624 BUG_ON(vma->vm_start >= _addr + PMD_SIZE);
3620 offset = max(_addr, vma->vm_start) & ~PMD_MASK; 3625 offset = max(_addr, vma->vm_start) & ~PMD_MASK;
3621 VM_BUG_ON(offset >= PMD_SIZE); 3626 VM_BUG_ON(offset >= PMD_SIZE);
3622 orig_pte = pte = pte_offset_map_lock(mm, pmdp, _addr, &ptl); 3627 orig_pte = pte = pte_offset_map_lock(mm, pmdp, _addr, &ptl);
3623 pte += offset >> PAGE_SHIFT; 3628 pte += offset >> PAGE_SHIFT;
3624 for (addr = _addr + offset; addr < _addr + PMD_SIZE; pte++, addr += PAGE_SIZE) { 3629 for (addr = _addr + offset; addr < _addr + PMD_SIZE; pte++, addr += PAGE_SIZE) {
3625 pte_t pteval = *pte; 3630 pte_t pteval = *pte;
3626 struct page *page; 3631 struct page *page;
3627 int curr_nid = local_nid; 3632 int curr_nid = local_nid;
3628 int target_nid; 3633 int target_nid;
3629 bool migrated; 3634 bool migrated;
3630 if (!pte_present(pteval)) 3635 if (!pte_present(pteval))
3631 continue; 3636 continue;
3632 if (!pte_numa(pteval)) 3637 if (!pte_numa(pteval))
3633 continue; 3638 continue;
3634 if (addr >= vma->vm_end) { 3639 if (addr >= vma->vm_end) {
3635 vma = find_vma(mm, addr); 3640 vma = find_vma(mm, addr);
3636 /* there's a pte present so there must be a vma */ 3641 /* there's a pte present so there must be a vma */
3637 BUG_ON(!vma); 3642 BUG_ON(!vma);
3638 BUG_ON(addr < vma->vm_start); 3643 BUG_ON(addr < vma->vm_start);
3639 } 3644 }
3640 if (pte_numa(pteval)) { 3645 if (pte_numa(pteval)) {
3641 pteval = pte_mknonnuma(pteval); 3646 pteval = pte_mknonnuma(pteval);
3642 set_pte_at(mm, addr, pte, pteval); 3647 set_pte_at(mm, addr, pte, pteval);
3643 } 3648 }
3644 page = vm_normal_page(vma, addr, pteval); 3649 page = vm_normal_page(vma, addr, pteval);
3645 if (unlikely(!page)) 3650 if (unlikely(!page))
3646 continue; 3651 continue;
3647 /* only check non-shared pages */ 3652 /* only check non-shared pages */
3648 if (unlikely(page_mapcount(page) != 1)) 3653 if (unlikely(page_mapcount(page) != 1))
3649 continue; 3654 continue;
3650 3655
3651 /* 3656 /*
3652 * Note that the NUMA fault is later accounted to either 3657 * Note that the NUMA fault is later accounted to either
3653 * the node that is currently running or where the page is 3658 * the node that is currently running or where the page is
3654 * migrated to. 3659 * migrated to.
3655 */ 3660 */
3656 curr_nid = local_nid; 3661 curr_nid = local_nid;
3657 target_nid = numa_migrate_prep(page, vma, addr, 3662 target_nid = numa_migrate_prep(page, vma, addr,
3658 page_to_nid(page)); 3663 page_to_nid(page));
3659 if (target_nid == -1) { 3664 if (target_nid == -1) {
3660 put_page(page); 3665 put_page(page);
3661 continue; 3666 continue;
3662 } 3667 }
3663 3668
3664 /* Migrate to the requested node */ 3669 /* Migrate to the requested node */
3665 pte_unmap_unlock(pte, ptl); 3670 pte_unmap_unlock(pte, ptl);
3666 migrated = migrate_misplaced_page(page, target_nid); 3671 migrated = migrate_misplaced_page(page, target_nid);
3667 if (migrated) 3672 if (migrated)
3668 curr_nid = target_nid; 3673 curr_nid = target_nid;
3669 task_numa_fault(curr_nid, 1, migrated); 3674 task_numa_fault(curr_nid, 1, migrated);
3670 3675
3671 pte = pte_offset_map_lock(mm, pmdp, addr, &ptl); 3676 pte = pte_offset_map_lock(mm, pmdp, addr, &ptl);
3672 } 3677 }
3673 pte_unmap_unlock(orig_pte, ptl); 3678 pte_unmap_unlock(orig_pte, ptl);
3674 3679
3675 return 0; 3680 return 0;
3676 } 3681 }
3677 #else 3682 #else
3678 static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, 3683 static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
3679 unsigned long addr, pmd_t *pmdp) 3684 unsigned long addr, pmd_t *pmdp)
3680 { 3685 {
3681 BUG(); 3686 BUG();
3682 return 0; 3687 return 0;
3683 } 3688 }
3684 #endif /* CONFIG_NUMA_BALANCING */ 3689 #endif /* CONFIG_NUMA_BALANCING */
3685 3690
3686 /* 3691 /*
3687 * These routines also need to handle stuff like marking pages dirty 3692 * These routines also need to handle stuff like marking pages dirty
3688 * and/or accessed for architectures that don't do it in hardware (most 3693 * and/or accessed for architectures that don't do it in hardware (most
3689 * RISC architectures). The early dirtying is also good on the i386. 3694 * RISC architectures). The early dirtying is also good on the i386.
3690 * 3695 *
3691 * There is also a hook called "update_mmu_cache()" that architectures 3696 * There is also a hook called "update_mmu_cache()" that architectures
3692 * with external mmu caches can use to update those (ie the Sparc or 3697 * with external mmu caches can use to update those (ie the Sparc or
3693 * PowerPC hashed page tables that act as extended TLBs). 3698 * PowerPC hashed page tables that act as extended TLBs).
3694 * 3699 *
3695 * We enter with non-exclusive mmap_sem (to exclude vma changes, 3700 * We enter with non-exclusive mmap_sem (to exclude vma changes,
3696 * but allow concurrent faults), and pte mapped but not yet locked. 3701 * but allow concurrent faults), and pte mapped but not yet locked.
3697 * We return with mmap_sem still held, but pte unmapped and unlocked. 3702 * We return with mmap_sem still held, but pte unmapped and unlocked.
3698 */ 3703 */
3699 int handle_pte_fault(struct mm_struct *mm, 3704 int handle_pte_fault(struct mm_struct *mm,
3700 struct vm_area_struct *vma, unsigned long address, 3705 struct vm_area_struct *vma, unsigned long address,
3701 pte_t *pte, pmd_t *pmd, unsigned int flags) 3706 pte_t *pte, pmd_t *pmd, unsigned int flags)
3702 { 3707 {
3703 pte_t entry; 3708 pte_t entry;
3704 spinlock_t *ptl; 3709 spinlock_t *ptl;
3705 3710
3706 entry = *pte; 3711 entry = *pte;
3707 if (!pte_present(entry)) { 3712 if (!pte_present(entry)) {
3708 if (pte_none(entry)) { 3713 if (pte_none(entry)) {
3709 if (vma->vm_ops) { 3714 if (vma->vm_ops) {
3710 if (likely(vma->vm_ops->fault)) 3715 if (likely(vma->vm_ops->fault))
3711 return do_linear_fault(mm, vma, address, 3716 return do_linear_fault(mm, vma, address,
3712 pte, pmd, flags, entry); 3717 pte, pmd, flags, entry);
3713 } 3718 }
3714 return do_anonymous_page(mm, vma, address, 3719 return do_anonymous_page(mm, vma, address,
3715 pte, pmd, flags); 3720 pte, pmd, flags);
3716 } 3721 }
3717 if (pte_file(entry)) 3722 if (pte_file(entry))
3718 return do_nonlinear_fault(mm, vma, address, 3723 return do_nonlinear_fault(mm, vma, address,
3719 pte, pmd, flags, entry); 3724 pte, pmd, flags, entry);
3720 return do_swap_page(mm, vma, address, 3725 return do_swap_page(mm, vma, address,
3721 pte, pmd, flags, entry); 3726 pte, pmd, flags, entry);
3722 } 3727 }
3723 3728
3724 if (pte_numa(entry)) 3729 if (pte_numa(entry))
3725 return do_numa_page(mm, vma, address, entry, pte, pmd); 3730 return do_numa_page(mm, vma, address, entry, pte, pmd);
3726 3731
3727 ptl = pte_lockptr(mm, pmd); 3732 ptl = pte_lockptr(mm, pmd);
3728 spin_lock(ptl); 3733 spin_lock(ptl);
3729 if (unlikely(!pte_same(*pte, entry))) 3734 if (unlikely(!pte_same(*pte, entry)))
3730 goto unlock; 3735 goto unlock;
3731 if (flags & FAULT_FLAG_WRITE) { 3736 if (flags & FAULT_FLAG_WRITE) {
3732 if (!pte_write(entry)) 3737 if (!pte_write(entry))
3733 return do_wp_page(mm, vma, address, 3738 return do_wp_page(mm, vma, address,
3734 pte, pmd, ptl, entry); 3739 pte, pmd, ptl, entry);
3735 entry = pte_mkdirty(entry); 3740 entry = pte_mkdirty(entry);
3736 } 3741 }
3737 entry = pte_mkyoung(entry); 3742 entry = pte_mkyoung(entry);
3738 if (ptep_set_access_flags(vma, address, pte, entry, flags & FAULT_FLAG_WRITE)) { 3743 if (ptep_set_access_flags(vma, address, pte, entry, flags & FAULT_FLAG_WRITE)) {
3739 update_mmu_cache(vma, address, pte); 3744 update_mmu_cache(vma, address, pte);
3740 } else { 3745 } else {
3741 /* 3746 /*
3742 * This is needed only for protection faults but the arch code 3747 * This is needed only for protection faults but the arch code
3743 * is not yet telling us if this is a protection fault or not. 3748 * is not yet telling us if this is a protection fault or not.
3744 * This still avoids useless tlb flushes for .text page faults 3749 * This still avoids useless tlb flushes for .text page faults
3745 * with threads. 3750 * with threads.
3746 */ 3751 */
3747 if (flags & FAULT_FLAG_WRITE) 3752 if (flags & FAULT_FLAG_WRITE)
3748 flush_tlb_fix_spurious_fault(vma, address); 3753 flush_tlb_fix_spurious_fault(vma, address);
3749 } 3754 }
3750 unlock: 3755 unlock:
3751 pte_unmap_unlock(pte, ptl); 3756 pte_unmap_unlock(pte, ptl);
3752 return 0; 3757 return 0;
3753 } 3758 }
3754 3759
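The dispatch in handle_pte_fault above sends a not-present, pte_none fault in an anonymous VMA to do_anonymous_page, and a write to a read-only present pte to do_wp_page. As a rough userspace illustration (not part of this diff; file and variable names are arbitrary), the sketch below maps anonymous memory and touches it, then reads the minor-fault counter to confirm that first-touch faults were taken:

	/* Illustrative only: first touch of an anonymous MAP_PRIVATE mapping
	 * is handled by the do_anonymous_page path dispatched above. */
	#define _GNU_SOURCE
	#include <stdio.h>
	#include <string.h>
	#include <sys/mman.h>
	#include <sys/resource.h>

	int main(void)
	{
		struct rusage before, after;
		size_t len = 16 * 4096;
		char *p;

		p = mmap(NULL, len, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
		if (p == MAP_FAILED) {
			perror("mmap");
			return 1;
		}

		getrusage(RUSAGE_SELF, &before);
		memset(p, 0xaa, len);		/* first touch faults each page in */
		getrusage(RUSAGE_SELF, &after);

		printf("minor faults taken: %ld\n",
		       after.ru_minflt - before.ru_minflt);
		munmap(p, len);
		return 0;
	}

Whether a read-before-write would first map the shared zero page depends on kernel version and access type; the point is only that the anonymous path above is what services the touch.
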
3755 /* 3760 /*
3756 * By the time we get here, we already hold the mm semaphore 3761 * By the time we get here, we already hold the mm semaphore
3757 */ 3762 */
3758 int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, 3763 int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
3759 unsigned long address, unsigned int flags) 3764 unsigned long address, unsigned int flags)
3760 { 3765 {
3761 pgd_t *pgd; 3766 pgd_t *pgd;
3762 pud_t *pud; 3767 pud_t *pud;
3763 pmd_t *pmd; 3768 pmd_t *pmd;
3764 pte_t *pte; 3769 pte_t *pte;
3765 3770
3766 __set_current_state(TASK_RUNNING); 3771 __set_current_state(TASK_RUNNING);
3767 3772
3768 count_vm_event(PGFAULT); 3773 count_vm_event(PGFAULT);
3769 mem_cgroup_count_vm_event(mm, PGFAULT); 3774 mem_cgroup_count_vm_event(mm, PGFAULT);
3770 3775
3771 /* do counter updates before entering really critical section. */ 3776 /* do counter updates before entering really critical section. */
3772 check_sync_rss_stat(current); 3777 check_sync_rss_stat(current);
3773 3778
3774 if (unlikely(is_vm_hugetlb_page(vma))) 3779 if (unlikely(is_vm_hugetlb_page(vma)))
3775 return hugetlb_fault(mm, vma, address, flags); 3780 return hugetlb_fault(mm, vma, address, flags);
3776 3781
3777 retry: 3782 retry:
3778 pgd = pgd_offset(mm, address); 3783 pgd = pgd_offset(mm, address);
3779 pud = pud_alloc(mm, pgd, address); 3784 pud = pud_alloc(mm, pgd, address);
3780 if (!pud) 3785 if (!pud)
3781 return VM_FAULT_OOM; 3786 return VM_FAULT_OOM;
3782 pmd = pmd_alloc(mm, pud, address); 3787 pmd = pmd_alloc(mm, pud, address);
3783 if (!pmd) 3788 if (!pmd)
3784 return VM_FAULT_OOM; 3789 return VM_FAULT_OOM;
3785 if (pmd_none(*pmd) && transparent_hugepage_enabled(vma)) { 3790 if (pmd_none(*pmd) && transparent_hugepage_enabled(vma)) {
3786 if (!vma->vm_ops) 3791 if (!vma->vm_ops)
3787 return do_huge_pmd_anonymous_page(mm, vma, address, 3792 return do_huge_pmd_anonymous_page(mm, vma, address,
3788 pmd, flags); 3793 pmd, flags);
3789 } else { 3794 } else {
3790 pmd_t orig_pmd = *pmd; 3795 pmd_t orig_pmd = *pmd;
3791 int ret; 3796 int ret;
3792 3797
3793 barrier(); 3798 barrier();
3794 if (pmd_trans_huge(orig_pmd)) { 3799 if (pmd_trans_huge(orig_pmd)) {
3795 unsigned int dirty = flags & FAULT_FLAG_WRITE; 3800 unsigned int dirty = flags & FAULT_FLAG_WRITE;
3796 3801
3797 /* 3802 /*
3798 * If the pmd is splitting, return and retry the 3803 * If the pmd is splitting, return and retry the
3799 * the fault. Alternative: wait until the split 3804 * the fault. Alternative: wait until the split
3800 * is done, and goto retry. 3805 * is done, and goto retry.
3801 */ 3806 */
3802 if (pmd_trans_splitting(orig_pmd)) 3807 if (pmd_trans_splitting(orig_pmd))
3803 return 0; 3808 return 0;
3804 3809
3805 if (pmd_numa(orig_pmd)) 3810 if (pmd_numa(orig_pmd))
3806 return do_huge_pmd_numa_page(mm, vma, address, 3811 return do_huge_pmd_numa_page(mm, vma, address,
3807 orig_pmd, pmd); 3812 orig_pmd, pmd);
3808 3813
3809 if (dirty && !pmd_write(orig_pmd)) { 3814 if (dirty && !pmd_write(orig_pmd)) {
3810 ret = do_huge_pmd_wp_page(mm, vma, address, pmd, 3815 ret = do_huge_pmd_wp_page(mm, vma, address, pmd,
3811 orig_pmd); 3816 orig_pmd);
3812 /* 3817 /*
3813 * If COW results in an oom, the huge pmd will 3818 * If COW results in an oom, the huge pmd will
3814 * have been split, so retry the fault on the 3819 * have been split, so retry the fault on the
3815 * pte for a smaller charge. 3820 * pte for a smaller charge.
3816 */ 3821 */
3817 if (unlikely(ret & VM_FAULT_OOM)) 3822 if (unlikely(ret & VM_FAULT_OOM))
3818 goto retry; 3823 goto retry;
3819 return ret; 3824 return ret;
3820 } else { 3825 } else {
3821 huge_pmd_set_accessed(mm, vma, address, pmd, 3826 huge_pmd_set_accessed(mm, vma, address, pmd,
3822 orig_pmd, dirty); 3827 orig_pmd, dirty);
3823 } 3828 }
3824 3829
3825 return 0; 3830 return 0;
3826 } 3831 }
3827 } 3832 }
3828 3833
3829 if (pmd_numa(*pmd)) 3834 if (pmd_numa(*pmd))
3830 return do_pmd_numa_page(mm, vma, address, pmd); 3835 return do_pmd_numa_page(mm, vma, address, pmd);
3831 3836
3832 /* 3837 /*
3833 * Use __pte_alloc instead of pte_alloc_map, because we can't 3838 * Use __pte_alloc instead of pte_alloc_map, because we can't
3834 * run pte_offset_map on the pmd, if an huge pmd could 3839 * run pte_offset_map on the pmd, if an huge pmd could
3835 * materialize from under us from a different thread. 3840 * materialize from under us from a different thread.
3836 */ 3841 */
3837 if (unlikely(pmd_none(*pmd)) && 3842 if (unlikely(pmd_none(*pmd)) &&
3838 unlikely(__pte_alloc(mm, vma, pmd, address))) 3843 unlikely(__pte_alloc(mm, vma, pmd, address)))
3839 return VM_FAULT_OOM; 3844 return VM_FAULT_OOM;
3840 /* if an huge pmd materialized from under us just retry later */ 3845 /* if an huge pmd materialized from under us just retry later */
3841 if (unlikely(pmd_trans_huge(*pmd))) 3846 if (unlikely(pmd_trans_huge(*pmd)))
3842 return 0; 3847 return 0;
3843 /* 3848 /*
3844 * A regular pmd is established and it can't morph into a huge pmd 3849 * A regular pmd is established and it can't morph into a huge pmd
3845 * from under us anymore at this point because we hold the mmap_sem 3850 * from under us anymore at this point because we hold the mmap_sem
3846 * read mode and khugepaged takes it in write mode. So now it's 3851 * read mode and khugepaged takes it in write mode. So now it's
3847 * safe to run pte_offset_map(). 3852 * safe to run pte_offset_map().
3848 */ 3853 */
3849 pte = pte_offset_map(pmd, address); 3854 pte = pte_offset_map(pmd, address);
3850 3855
3851 return handle_pte_fault(mm, vma, address, pte, pmd, flags); 3856 return handle_pte_fault(mm, vma, address, pte, pmd, flags);
3852 } 3857 }
3853 3858
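When the faulting pmd is empty and transparent_hugepage_enabled(vma) is true, handle_mm_fault above hands an anonymous fault to do_huge_pmd_anonymous_page instead of dropping to the pte level. A hedged userspace sketch of how that path gets exercised follows; HPAGE_SIZE = 2MB is an assumption about the architecture, and whether a huge page is really installed still depends on the transparent_hugepage sysfs settings and free memory:

	/* Illustrative only: MADV_HUGEPAGE plus a first touch of a 2MB-aligned
	 * anonymous region lets the huge-pmd fault path above be taken. */
	#define _GNU_SOURCE
	#include <stdio.h>
	#include <stdint.h>
	#include <string.h>
	#include <sys/mman.h>

	#define HPAGE_SIZE	(2UL * 1024 * 1024)	/* assumed huge page size */

	int main(void)
	{
		size_t len = 2 * HPAGE_SIZE;
		char *raw, *aligned;

		raw = mmap(NULL, len + HPAGE_SIZE, PROT_READ | PROT_WRITE,
			   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
		if (raw == MAP_FAILED) {
			perror("mmap");
			return 1;
		}
		aligned = (char *)(((uintptr_t)raw + HPAGE_SIZE - 1) &
				   ~(HPAGE_SIZE - 1));

		if (madvise(aligned, len, MADV_HUGEPAGE))	/* a hint, not a guarantee */
			perror("madvise");
		memset(aligned, 0, len);			/* fault the region in */

		printf("touched %zu bytes at %p\n", len, (void *)aligned);
		return 0;
	}

After running it, the AnonHugePages field for that mapping in /proc/self/smaps shows whether the fault was in fact served with a huge page.
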
3854 #ifndef __PAGETABLE_PUD_FOLDED 3859 #ifndef __PAGETABLE_PUD_FOLDED
3855 /* 3860 /*
3856 * Allocate page upper directory. 3861 * Allocate page upper directory.
3857 * We've already handled the fast-path in-line. 3862 * We've already handled the fast-path in-line.
3858 */ 3863 */
3859 int __pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address) 3864 int __pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address)
3860 { 3865 {
3861 pud_t *new = pud_alloc_one(mm, address); 3866 pud_t *new = pud_alloc_one(mm, address);
3862 if (!new) 3867 if (!new)
3863 return -ENOMEM; 3868 return -ENOMEM;
3864 3869
3865 smp_wmb(); /* See comment in __pte_alloc */ 3870 smp_wmb(); /* See comment in __pte_alloc */
3866 3871
3867 spin_lock(&mm->page_table_lock); 3872 spin_lock(&mm->page_table_lock);
3868 if (pgd_present(*pgd)) /* Another has populated it */ 3873 if (pgd_present(*pgd)) /* Another has populated it */
3869 pud_free(mm, new); 3874 pud_free(mm, new);
3870 else 3875 else
3871 pgd_populate(mm, pgd, new); 3876 pgd_populate(mm, pgd, new);
3872 spin_unlock(&mm->page_table_lock); 3877 spin_unlock(&mm->page_table_lock);
3873 return 0; 3878 return 0;
3874 } 3879 }
3875 #endif /* __PAGETABLE_PUD_FOLDED */ 3880 #endif /* __PAGETABLE_PUD_FOLDED */
3876 3881
3877 #ifndef __PAGETABLE_PMD_FOLDED 3882 #ifndef __PAGETABLE_PMD_FOLDED
3878 /* 3883 /*
3879 * Allocate page middle directory. 3884 * Allocate page middle directory.
3880 * We've already handled the fast-path in-line. 3885 * We've already handled the fast-path in-line.
3881 */ 3886 */
3882 int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address) 3887 int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
3883 { 3888 {
3884 pmd_t *new = pmd_alloc_one(mm, address); 3889 pmd_t *new = pmd_alloc_one(mm, address);
3885 if (!new) 3890 if (!new)
3886 return -ENOMEM; 3891 return -ENOMEM;
3887 3892
3888 smp_wmb(); /* See comment in __pte_alloc */ 3893 smp_wmb(); /* See comment in __pte_alloc */
3889 3894
3890 spin_lock(&mm->page_table_lock); 3895 spin_lock(&mm->page_table_lock);
3891 #ifndef __ARCH_HAS_4LEVEL_HACK 3896 #ifndef __ARCH_HAS_4LEVEL_HACK
3892 if (pud_present(*pud)) /* Another has populated it */ 3897 if (pud_present(*pud)) /* Another has populated it */
3893 pmd_free(mm, new); 3898 pmd_free(mm, new);
3894 else 3899 else
3895 pud_populate(mm, pud, new); 3900 pud_populate(mm, pud, new);
3896 #else 3901 #else
3897 if (pgd_present(*pud)) /* Another has populated it */ 3902 if (pgd_present(*pud)) /* Another has populated it */
3898 pmd_free(mm, new); 3903 pmd_free(mm, new);
3899 else 3904 else
3900 pgd_populate(mm, pud, new); 3905 pgd_populate(mm, pud, new);
3901 #endif /* __ARCH_HAS_4LEVEL_HACK */ 3906 #endif /* __ARCH_HAS_4LEVEL_HACK */
3902 spin_unlock(&mm->page_table_lock); 3907 spin_unlock(&mm->page_table_lock);
3903 return 0; 3908 return 0;
3904 } 3909 }
3905 #endif /* __PAGETABLE_PMD_FOLDED */ 3910 #endif /* __PAGETABLE_PMD_FOLDED */
3906 3911
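__pud_alloc and __pmd_alloc above fully construct the new page-table page, issue smp_wmb() (the "See comment in __pte_alloc" barrier), and only then link it in under page_table_lock, so a lock-free walker that observes the new entry also observes initialized contents. That is the same publish-ordering concern the commit message raises for page contents versus mapping visibility. Below is a userspace analogue of the pattern using C11 release/acquire atomics in place of the kernel barriers; the types and names are illustrative assumptions, not kernel code:

	/* Userspace analogue of the "initialize fully, barrier, then publish"
	 * pattern used by __pte_alloc/__pud_alloc/__pmd_alloc above. */
	#include <pthread.h>
	#include <stdatomic.h>
	#include <stdio.h>

	struct table {
		int entries[512];		/* stands in for a page-table page */
	};

	static struct table the_table;
	static _Atomic(struct table *) published;	/* stands in for the pud/pmd slot */

	static void *producer(void *arg)
	{
		struct table *t = &the_table;

		(void)arg;
		t->entries[0] = 42;		/* initialize first ... */
		/* ... then publish: release ordering plays the role of smp_wmb() */
		atomic_store_explicit(&published, t, memory_order_release);
		return NULL;
	}

	static void *consumer(void *arg)
	{
		struct table *t;

		(void)arg;
		/* acquire pairs with the release above: once the pointer is
		 * visible, so are the initialized contents */
		while (!(t = atomic_load_explicit(&published, memory_order_acquire)))
			;
		printf("entry 0 = %d\n", t->entries[0]);	/* always 42 */
		return NULL;
	}

	int main(void)
	{
		pthread_t a, b;

		pthread_create(&a, NULL, consumer, NULL);
		pthread_create(&b, NULL, producer, NULL);
		pthread_join(a, NULL);
		pthread_join(b, NULL);
		return 0;
	}

The release store plays the role of smp_wmb() on the producer side; the acquire load stands in for the dependency ordering that a hardware page-table walk or a dependent load provides on the consumer side.
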
3907 #if !defined(__HAVE_ARCH_GATE_AREA) 3912 #if !defined(__HAVE_ARCH_GATE_AREA)
3908 3913
3909 #if defined(AT_SYSINFO_EHDR) 3914 #if defined(AT_SYSINFO_EHDR)
3910 static struct vm_area_struct gate_vma; 3915 static struct vm_area_struct gate_vma;
3911 3916
3912 static int __init gate_vma_init(void) 3917 static int __init gate_vma_init(void)
3913 { 3918 {
3914 gate_vma.vm_mm = NULL; 3919 gate_vma.vm_mm = NULL;
3915 gate_vma.vm_start = FIXADDR_USER_START; 3920 gate_vma.vm_start = FIXADDR_USER_START;
3916 gate_vma.vm_end = FIXADDR_USER_END; 3921 gate_vma.vm_end = FIXADDR_USER_END;
3917 gate_vma.vm_flags = VM_READ | VM_MAYREAD | VM_EXEC | VM_MAYEXEC; 3922 gate_vma.vm_flags = VM_READ | VM_MAYREAD | VM_EXEC | VM_MAYEXEC;
3918 gate_vma.vm_page_prot = __P101; 3923 gate_vma.vm_page_prot = __P101;
3919 3924
3920 return 0; 3925 return 0;
3921 } 3926 }
3922 __initcall(gate_vma_init); 3927 __initcall(gate_vma_init);
3923 #endif 3928 #endif
3924 3929
3925 struct vm_area_struct *get_gate_vma(struct mm_struct *mm) 3930 struct vm_area_struct *get_gate_vma(struct mm_struct *mm)
3926 { 3931 {
3927 #ifdef AT_SYSINFO_EHDR 3932 #ifdef AT_SYSINFO_EHDR
3928 return &gate_vma; 3933 return &gate_vma;
3929 #else 3934 #else
3930 return NULL; 3935 return NULL;
3931 #endif 3936 #endif
3932 } 3937 }
3933 3938
3934 int in_gate_area_no_mm(unsigned long addr) 3939 int in_gate_area_no_mm(unsigned long addr)
3935 { 3940 {
3936 #ifdef AT_SYSINFO_EHDR 3941 #ifdef AT_SYSINFO_EHDR
3937 if ((addr >= FIXADDR_USER_START) && (addr < FIXADDR_USER_END)) 3942 if ((addr >= FIXADDR_USER_START) && (addr < FIXADDR_USER_END))
3938 return 1; 3943 return 1;
3939 #endif 3944 #endif
3940 return 0; 3945 return 0;
3941 } 3946 }
3942 3947
3943 #endif /* __HAVE_ARCH_GATE_AREA */ 3948 #endif /* __HAVE_ARCH_GATE_AREA */
3944 3949
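The gate-area helpers above are compiled in only when the architecture defines AT_SYSINFO_EHDR, the auxiliary-vector tag through which the kernel also tells userspace where the vDSO ELF image was mapped. A small illustrative check from userspace (getauxval needs glibc 2.16 or later):

	/* Illustrative only: AT_SYSINFO_EHDR is also how userspace learns
	 * the vDSO base address. */
	#include <stdio.h>
	#include <sys/auxv.h>

	int main(void)
	{
		unsigned long vdso = getauxval(AT_SYSINFO_EHDR);

		printf("AT_SYSINFO_EHDR (vDSO ELF header): 0x%lx\n", vdso);
		return 0;
	}
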
3945 static int __follow_pte(struct mm_struct *mm, unsigned long address, 3950 static int __follow_pte(struct mm_struct *mm, unsigned long address,
3946 pte_t **ptepp, spinlock_t **ptlp) 3951 pte_t **ptepp, spinlock_t **ptlp)
3947 { 3952 {
3948 pgd_t *pgd; 3953 pgd_t *pgd;
3949 pud_t *pud; 3954 pud_t *pud;
3950 pmd_t *pmd; 3955 pmd_t *pmd;
3951 pte_t *ptep; 3956 pte_t *ptep;
3952 3957
3953 pgd = pgd_offset(mm, address); 3958 pgd = pgd_offset(mm, address);
3954 if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd))) 3959 if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
3955 goto out; 3960 goto out;
3956 3961
3957 pud = pud_offset(pgd, address); 3962 pud = pud_offset(pgd, address);
3958 if (pud_none(*pud) || unlikely(pud_bad(*pud))) 3963 if (pud_none(*pud) || unlikely(pud_bad(*pud)))
3959 goto out; 3964 goto out;
3960 3965
3961 pmd = pmd_offset(pud, address); 3966 pmd = pmd_offset(pud, address);
3962 VM_BUG_ON(pmd_trans_huge(*pmd)); 3967 VM_BUG_ON(pmd_trans_huge(*pmd));
3963 if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd))) 3968 if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd)))
3964 goto out; 3969 goto out;
3965 3970
3966 /* We cannot handle huge page PFN maps. Luckily they don't exist. */ 3971 /* We cannot handle huge page PFN maps. Luckily they don't exist. */
3967 if (pmd_huge(*pmd)) 3972 if (pmd_huge(*pmd))
3968 goto out; 3973 goto out;
3969 3974
3970 ptep = pte_offset_map_lock(mm, pmd, address, ptlp); 3975 ptep = pte_offset_map_lock(mm, pmd, address, ptlp);
3971 if (!ptep) 3976 if (!ptep)
3972 goto out; 3977 goto out;
3973 if (!pte_present(*ptep)) 3978 if (!pte_present(*ptep))
3974 goto unlock; 3979 goto unlock;
3975 *ptepp = ptep; 3980 *ptepp = ptep;
3976 return 0; 3981 return 0;
3977 unlock: 3982 unlock:
3978 pte_unmap_unlock(ptep, *ptlp); 3983 pte_unmap_unlock(ptep, *ptlp);
3979 out: 3984 out:
3980 return -EINVAL; 3985 return -EINVAL;
3981 } 3986 }
3982 3987
3983 static inline int follow_pte(struct mm_struct *mm, unsigned long address, 3988 static inline int follow_pte(struct mm_struct *mm, unsigned long address,
3984 pte_t **ptepp, spinlock_t **ptlp) 3989 pte_t **ptepp, spinlock_t **ptlp)
3985 { 3990 {
3986 int res; 3991 int res;
3987 3992
3988 /* (void) is needed to make gcc happy */ 3993 /* (void) is needed to make gcc happy */
3989 (void) __cond_lock(*ptlp, 3994 (void) __cond_lock(*ptlp,
3990 !(res = __follow_pte(mm, address, ptepp, ptlp))); 3995 !(res = __follow_pte(mm, address, ptepp, ptlp)));
3991 return res; 3996 return res;
3992 } 3997 }
3993 3998
3994 /** 3999 /**
3995 * follow_pfn - look up PFN at a user virtual address 4000 * follow_pfn - look up PFN at a user virtual address
3996 * @vma: memory mapping 4001 * @vma: memory mapping
3997 * @address: user virtual address 4002 * @address: user virtual address
3998 * @pfn: location to store found PFN 4003 * @pfn: location to store found PFN
3999 * 4004 *
4000 * Only IO mappings and raw PFN mappings are allowed. 4005 * Only IO mappings and raw PFN mappings are allowed.
4001 * 4006 *
4002 * Returns zero and the pfn at @pfn on success, -ve otherwise. 4007 * Returns zero and the pfn at @pfn on success, -ve otherwise.
4003 */ 4008 */
4004 int follow_pfn(struct vm_area_struct *vma, unsigned long address, 4009 int follow_pfn(struct vm_area_struct *vma, unsigned long address,
4005 unsigned long *pfn) 4010 unsigned long *pfn)
4006 { 4011 {
4007 int ret = -EINVAL; 4012 int ret = -EINVAL;
4008 spinlock_t *ptl; 4013 spinlock_t *ptl;
4009 pte_t *ptep; 4014 pte_t *ptep;
4010 4015
4011 if (!(vma->vm_flags & (VM_IO | VM_PFNMAP))) 4016 if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
4012 return ret; 4017 return ret;
4013 4018
4014 ret = follow_pte(vma->vm_mm, address, &ptep, &ptl); 4019 ret = follow_pte(vma->vm_mm, address, &ptep, &ptl);
4015 if (ret) 4020 if (ret)
4016 return ret; 4021 return ret;
4017 *pfn = pte_pfn(*ptep); 4022 *pfn = pte_pfn(*ptep);
4018 pte_unmap_unlock(ptep, ptl); 4023 pte_unmap_unlock(ptep, ptl);
4019 return 0; 4024 return 0;
4020 } 4025 }
4021 EXPORT_SYMBOL(follow_pfn); 4026 EXPORT_SYMBOL(follow_pfn);
4022 4027
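follow_pfn above resolves a user virtual address to a raw page frame number, but only for VM_IO/VM_PFNMAP mappings and only for in-kernel callers. For ordinary mappings, userspace can see the corresponding information through /proc/self/pagemap; the sketch below is illustrative only (on recent kernels the PFN bits read back as zero without CAP_SYS_ADMIN) and looks up the entry for one freshly touched page:

	/* Illustrative only: read the pagemap entry for one page.
	 * Bit 63 = present, bits 0-54 = PFN (privileged). */
	#include <fcntl.h>
	#include <stdint.h>
	#include <stdio.h>
	#include <stdlib.h>
	#include <string.h>
	#include <sys/types.h>
	#include <unistd.h>

	int main(void)
	{
		long psize = sysconf(_SC_PAGESIZE);
		uint64_t entry;
		char *page;
		off_t off;
		int fd;

		page = aligned_alloc(psize, psize);
		if (!page)
			return 1;
		memset(page, 1, psize);			/* make sure it is faulted in */

		fd = open("/proc/self/pagemap", O_RDONLY);
		if (fd < 0) {
			perror("open");
			return 1;
		}
		off = (off_t)((uintptr_t)page / psize) * sizeof(entry);
		if (pread(fd, &entry, sizeof(entry), off) != (ssize_t)sizeof(entry)) {
			perror("pread");
			return 1;
		}
		printf("present=%llu pfn=0x%llx\n",
		       (unsigned long long)(entry >> 63),
		       (unsigned long long)(entry & ((1ULL << 55) - 1)));
		close(fd);
		free(page);
		return 0;
	}
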
4023 #ifdef CONFIG_HAVE_IOREMAP_PROT 4028 #ifdef CONFIG_HAVE_IOREMAP_PROT
4024 int follow_phys(struct vm_area_struct *vma, 4029 int follow_phys(struct vm_area_struct *vma,
4025 unsigned long address, unsigned int flags, 4030 unsigned long address, unsigned int flags,
4026 unsigned long *prot, resource_size_t *phys) 4031 unsigned long *prot, resource_size_t *phys)
4027 { 4032 {
4028 int ret = -EINVAL; 4033 int ret = -EINVAL;
4029 pte_t *ptep, pte; 4034 pte_t *ptep, pte;
4030 spinlock_t *ptl; 4035 spinlock_t *ptl;
4031 4036
4032 if (!(vma->vm_flags & (VM_IO | VM_PFNMAP))) 4037 if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
4033 goto out; 4038 goto out;
4034 4039
4035 if (follow_pte(vma->vm_mm, address, &ptep, &ptl)) 4040 if (follow_pte(vma->vm_mm, address, &ptep, &ptl))
4036 goto out; 4041 goto out;
4037 pte = *ptep; 4042 pte = *ptep;
4038 4043
4039 if ((flags & FOLL_WRITE) && !pte_write(pte)) 4044 if ((flags & FOLL_WRITE) && !pte_write(pte))
4040 goto unlock; 4045 goto unlock;
4041 4046
4042 *prot = pgprot_val(pte_pgprot(pte)); 4047 *prot = pgprot_val(pte_pgprot(pte));
4043 *phys = (resource_size_t)pte_pfn(pte) << PAGE_SHIFT; 4048 *phys = (resource_size_t)pte_pfn(pte) << PAGE_SHIFT;
4044 4049
4045 ret = 0; 4050 ret = 0;
4046 unlock: 4051 unlock:
4047 pte_unmap_unlock(ptep, ptl); 4052 pte_unmap_unlock(ptep, ptl);
4048 out: 4053 out:
4049 return ret; 4054 return ret;
4050 } 4055 }
4051 4056
4052 int generic_access_phys(struct vm_area_struct *vma, unsigned long addr, 4057 int generic_access_phys(struct vm_area_struct *vma, unsigned long addr,
4053 void *buf, int len, int write) 4058 void *buf, int len, int write)
4054 { 4059 {
4055 resource_size_t phys_addr; 4060 resource_size_t phys_addr;
4056 unsigned long prot = 0; 4061 unsigned long prot = 0;
4057 void __iomem *maddr; 4062 void __iomem *maddr;
4058 int offset = addr & (PAGE_SIZE-1); 4063 int offset = addr & (PAGE_SIZE-1);
4059 4064
4060 if (follow_phys(vma, addr, write, &prot, &phys_addr)) 4065 if (follow_phys(vma, addr, write, &prot, &phys_addr))
4061 return -EINVAL; 4066 return -EINVAL;
4062 4067
4063 maddr = ioremap_prot(phys_addr, PAGE_SIZE, prot); 4068 maddr = ioremap_prot(phys_addr, PAGE_SIZE, prot);
4064 if (write) 4069 if (write)
4065 memcpy_toio(maddr + offset, buf, len); 4070 memcpy_toio(maddr + offset, buf, len);
4066 else 4071 else
4067 memcpy_fromio(buf, maddr + offset, len); 4072 memcpy_fromio(buf, maddr + offset, len);
4068 iounmap(maddr); 4073 iounmap(maddr);
4069 4074
4070 return len; 4075 return len;
4071 } 4076 }
4072 #endif 4077 #endif
4073 4078
4074 /* 4079 /*
4075 * Access another process' address space as given in mm. If non-NULL, use the 4080 * Access another process' address space as given in mm. If non-NULL, use the
4076 * given task for page fault accounting. 4081 * given task for page fault accounting.
4077 */ 4082 */
4078 static int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm, 4083 static int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm,
4079 unsigned long addr, void *buf, int len, int write) 4084 unsigned long addr, void *buf, int len, int write)
4080 { 4085 {
4081 struct vm_area_struct *vma; 4086 struct vm_area_struct *vma;
4082 void *old_buf = buf; 4087 void *old_buf = buf;
4083 4088
4084 down_read(&mm->mmap_sem); 4089 down_read(&mm->mmap_sem);
4085 /* ignore errors, just check how much was successfully transferred */ 4090 /* ignore errors, just check how much was successfully transferred */
4086 while (len) { 4091 while (len) {
4087 int bytes, ret, offset; 4092 int bytes, ret, offset;
4088 void *maddr; 4093 void *maddr;
4089 struct page *page = NULL; 4094 struct page *page = NULL;
4090 4095
4091 ret = get_user_pages(tsk, mm, addr, 1, 4096 ret = get_user_pages(tsk, mm, addr, 1,
4092 write, 1, &page, &vma); 4097 write, 1, &page, &vma);
4093 if (ret <= 0) { 4098 if (ret <= 0) {
4094 /* 4099 /*
4095 * Check if this is a VM_IO | VM_PFNMAP VMA, which 4100 * Check if this is a VM_IO | VM_PFNMAP VMA, which
4096 * we can access using slightly different code. 4101 * we can access using slightly different code.
4097 */ 4102 */
4098 #ifdef CONFIG_HAVE_IOREMAP_PROT 4103 #ifdef CONFIG_HAVE_IOREMAP_PROT
4099 vma = find_vma(mm, addr); 4104 vma = find_vma(mm, addr);
4100 if (!vma || vma->vm_start > addr) 4105 if (!vma || vma->vm_start > addr)
4101 break; 4106 break;
4102 if (vma->vm_ops && vma->vm_ops->access) 4107 if (vma->vm_ops && vma->vm_ops->access)
4103 ret = vma->vm_ops->access(vma, addr, buf, 4108 ret = vma->vm_ops->access(vma, addr, buf,
4104 len, write); 4109 len, write);
4105 if (ret <= 0) 4110 if (ret <= 0)
4106 #endif 4111 #endif
4107 break; 4112 break;
4108 bytes = ret; 4113 bytes = ret;
4109 } else { 4114 } else {
4110 bytes = len; 4115 bytes = len;
4111 offset = addr & (PAGE_SIZE-1); 4116 offset = addr & (PAGE_SIZE-1);
4112 if (bytes > PAGE_SIZE-offset) 4117 if (bytes > PAGE_SIZE-offset)
4113 bytes = PAGE_SIZE-offset; 4118 bytes = PAGE_SIZE-offset;
4114 4119
4115 maddr = kmap(page); 4120 maddr = kmap(page);
4116 if (write) { 4121 if (write) {
4117 copy_to_user_page(vma, page, addr, 4122 copy_to_user_page(vma, page, addr,
4118 maddr + offset, buf, bytes); 4123 maddr + offset, buf, bytes);
4119 set_page_dirty_lock(page); 4124 set_page_dirty_lock(page);
4120 } else { 4125 } else {
4121 copy_from_user_page(vma, page, addr, 4126 copy_from_user_page(vma, page, addr,
4122 buf, maddr + offset, bytes); 4127 buf, maddr + offset, bytes);
4123 } 4128 }
4124 kunmap(page); 4129 kunmap(page);
4125 page_cache_release(page); 4130 page_cache_release(page);
4126 } 4131 }
4127 len -= bytes; 4132 len -= bytes;
4128 buf += bytes; 4133 buf += bytes;
4129 addr += bytes; 4134 addr += bytes;
4130 } 4135 }
4131 up_read(&mm->mmap_sem); 4136 up_read(&mm->mmap_sem);
4132 4137
4133 return buf - old_buf; 4138 return buf - old_buf;
4134 } 4139 }
4135 4140
4136 /** 4141 /**
4137 * access_remote_vm - access another process' address space 4142 * access_remote_vm - access another process' address space
4138 * @mm: the mm_struct of the target address space 4143 * @mm: the mm_struct of the target address space
4139 * @addr: start address to access 4144 * @addr: start address to access
4140 * @buf: source or destination buffer 4145 * @buf: source or destination buffer
4141 * @len: number of bytes to transfer 4146 * @len: number of bytes to transfer
4142 * @write: whether the access is a write 4147 * @write: whether the access is a write
4143 * 4148 *
4144 * The caller must hold a reference on @mm. 4149 * The caller must hold a reference on @mm.
4145 */ 4150 */
4146 int access_remote_vm(struct mm_struct *mm, unsigned long addr, 4151 int access_remote_vm(struct mm_struct *mm, unsigned long addr,
4147 void *buf, int len, int write) 4152 void *buf, int len, int write)
4148 { 4153 {
4149 return __access_remote_vm(NULL, mm, addr, buf, len, write); 4154 return __access_remote_vm(NULL, mm, addr, buf, len, write);
4150 } 4155 }
4151 4156
4152 /* 4157 /*
4153 * Access another process' address space. 4158 * Access another process' address space.
4154 * Source/target buffer must be kernel space, 4159 * Source/target buffer must be kernel space,
4155 * Do not walk the page table directly, use get_user_pages 4160 * Do not walk the page table directly, use get_user_pages
4156 */ 4161 */
4157 int access_process_vm(struct task_struct *tsk, unsigned long addr, 4162 int access_process_vm(struct task_struct *tsk, unsigned long addr,
4158 void *buf, int len, int write) 4163 void *buf, int len, int write)
4159 { 4164 {
4160 struct mm_struct *mm; 4165 struct mm_struct *mm;
4161 int ret; 4166 int ret;
4162 4167
4163 mm = get_task_mm(tsk); 4168 mm = get_task_mm(tsk);
4164 if (!mm) 4169 if (!mm)
4165 return 0; 4170 return 0;
4166 4171
4167 ret = __access_remote_vm(tsk, mm, addr, buf, len, write); 4172 ret = __access_remote_vm(tsk, mm, addr, buf, len, write);
4168 mmput(mm); 4173 mmput(mm);
4169 4174
4170 return ret; 4175 return ret;
4171 } 4176 }
4172 4177
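access_process_vm and __access_remote_vm above are the machinery behind ptrace(PTRACE_PEEKDATA/PTRACE_POKEDATA) and /proc/<pid>/mem: get_user_pages pins the target page, and kmap plus copy_to_user_page/copy_from_user_page move the bytes. As a minimal self-referential illustration (a real debugger would attach to another task and be subject to ptrace access-mode checks), the program below reads one of its own buffers back through /proc/self/mem, which lands in this same path:

	/* Illustrative only: read one of our own buffers back through
	 * /proc/self/mem, which is serviced by access_remote_vm(). */
	#include <fcntl.h>
	#include <stdint.h>
	#include <stdio.h>
	#include <sys/types.h>
	#include <unistd.h>

	int main(void)
	{
		char secret[] = "hello via /proc/self/mem";
		char copy[sizeof(secret)];
		int fd;

		fd = open("/proc/self/mem", O_RDONLY);
		if (fd < 0) {
			perror("open");
			return 1;
		}
		if (pread(fd, copy, sizeof(copy), (off_t)(uintptr_t)secret) !=
		    (ssize_t)sizeof(copy)) {
			perror("pread");
			return 1;
		}
		printf("read back: %s\n", copy);
		close(fd);
		return 0;
	}
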
4173 /* 4178 /*
4174 * Print the name of a VMA. 4179 * Print the name of a VMA.
4175 */ 4180 */
4176 void print_vma_addr(char *prefix, unsigned long ip) 4181 void print_vma_addr(char *prefix, unsigned long ip)
4177 { 4182 {
4178 struct mm_struct *mm = current->mm; 4183 struct mm_struct *mm = current->mm;
4179 struct vm_area_struct *vma; 4184 struct vm_area_struct *vma;
4180 4185
4181 /* 4186 /*
4182 * Do not print if we are in atomic 4187 * Do not print if we are in atomic
4183 * contexts (in exception stacks, etc.): 4188 * contexts (in exception stacks, etc.):
4184 */ 4189 */
4185 if (preempt_count()) 4190 if (preempt_count())
4186 return; 4191 return;
4187 4192
4188 down_read(&mm->mmap_sem); 4193 down_read(&mm->mmap_sem);
4189 vma = find_vma(mm, ip); 4194 vma = find_vma(mm, ip);
4190 if (vma && vma->vm_file) { 4195 if (vma && vma->vm_file) {
4191 struct file *f = vma->vm_file; 4196 struct file *f = vma->vm_file;
4192 char *buf = (char *)__get_free_page(GFP_KERNEL); 4197 char *buf = (char *)__get_free_page(GFP_KERNEL);
4193 if (buf) { 4198 if (buf) {
4194 char *p; 4199 char *p;
4195 4200
4196 p = d_path(&f->f_path, buf, PAGE_SIZE); 4201 p = d_path(&f->f_path, buf, PAGE_SIZE);
4197 if (IS_ERR(p)) 4202 if (IS_ERR(p))
4198 p = "?"; 4203 p = "?";
4199 printk("%s%s[%lx+%lx]", prefix, kbasename(p), 4204 printk("%s%s[%lx+%lx]", prefix, kbasename(p),
4200 vma->vm_start, 4205 vma->vm_start,
4201 vma->vm_end - vma->vm_start); 4206 vma->vm_end - vma->vm_start);
4202 free_page((unsigned long)buf); 4207 free_page((unsigned long)buf);
4203 } 4208 }
4204 } 4209 }
4205 up_read(&mm->mmap_sem); 4210 up_read(&mm->mmap_sem);
4206 } 4211 }
4207 4212
4208 #ifdef CONFIG_PROVE_LOCKING 4213 #ifdef CONFIG_PROVE_LOCKING
4209 void might_fault(void) 4214 void might_fault(void)
4210 { 4215 {
4211 /* 4216 /*
4212 * Some code (nfs/sunrpc) uses socket ops on kernel memory while 4217 * Some code (nfs/sunrpc) uses socket ops on kernel memory while
4213 * holding the mmap_sem, this is safe because kernel memory doesn't 4218 * holding the mmap_sem, this is safe because kernel memory doesn't
4214 * get paged out, therefore we'll never actually fault, and the 4219 * get paged out, therefore we'll never actually fault, and the
4215 * below annotations will generate false positives. 4220 * below annotations will generate false positives.
4216 */ 4221 */
4217 if (segment_eq(get_fs(), KERNEL_DS)) 4222 if (segment_eq(get_fs(), KERNEL_DS))
4218 return; 4223 return;
4219 4224
4220 might_sleep(); 4225 might_sleep();
4221 /* 4226 /*
4222 * it would be nicer only to annotate paths which are not under 4227 * it would be nicer only to annotate paths which are not under
4223 * pagefault_disable, however that requires a larger audit and 4228 * pagefault_disable, however that requires a larger audit and
4224 * providing helpers like get_user_atomic. 4229 * providing helpers like get_user_atomic.
4225 */ 4230 */
4226 if (!in_atomic() && current->mm) 4231 if (!in_atomic() && current->mm)
4227 might_lock_read(&current->mm->mmap_sem); 4232 might_lock_read(&current->mm->mmap_sem);
4228 } 4233 }
4229 EXPORT_SYMBOL(might_fault); 4234 EXPORT_SYMBOL(might_fault);
4230 #endif 4235 #endif
4231 4236
4232 #if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLBFS) 4237 #if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLBFS)
4233 static void clear_gigantic_page(struct page *page, 4238 static void clear_gigantic_page(struct page *page,
4234 unsigned long addr, 4239 unsigned long addr,
4235 unsigned int pages_per_huge_page) 4240 unsigned int pages_per_huge_page)
4236 { 4241 {
4237 int i; 4242 int i;
4238 struct page *p = page; 4243 struct page *p = page;
4239 4244
4240 might_sleep(); 4245 might_sleep();
4241 for (i = 0; i < pages_per_huge_page; 4246 for (i = 0; i < pages_per_huge_page;
4242 i++, p = mem_map_next(p, page, i)) { 4247 i++, p = mem_map_next(p, page, i)) {
4243 cond_resched(); 4248 cond_resched();
4244 clear_user_highpage(p, addr + i * PAGE_SIZE); 4249 clear_user_highpage(p, addr + i * PAGE_SIZE);
4245 } 4250 }
4246 } 4251 }
4247 void clear_huge_page(struct page *page, 4252 void clear_huge_page(struct page *page,
4248 unsigned long addr, unsigned int pages_per_huge_page) 4253 unsigned long addr, unsigned int pages_per_huge_page)
4249 { 4254 {
4250 int i; 4255 int i;
4251 4256
4252 if (unlikely(pages_per_huge_page > MAX_ORDER_NR_PAGES)) { 4257 if (unlikely(pages_per_huge_page > MAX_ORDER_NR_PAGES)) {
4253 clear_gigantic_page(page, addr, pages_per_huge_page); 4258 clear_gigantic_page(page, addr, pages_per_huge_page);
4254 return; 4259 return;
4255 } 4260 }
4256 4261
4257 might_sleep(); 4262 might_sleep();
4258 for (i = 0; i < pages_per_huge_page; i++) { 4263 for (i = 0; i < pages_per_huge_page; i++) {
4259 cond_resched(); 4264 cond_resched();
4260 clear_user_highpage(page + i, addr + i * PAGE_SIZE); 4265 clear_user_highpage(page + i, addr + i * PAGE_SIZE);
4261 } 4266 }
4262 } 4267 }
4263 4268
4264 static void copy_user_gigantic_page(struct page *dst, struct page *src, 4269 static void copy_user_gigantic_page(struct page *dst, struct page *src,
4265 unsigned long addr, 4270 unsigned long addr,
4266 struct vm_area_struct *vma, 4271 struct vm_area_struct *vma,
4267 unsigned int pages_per_huge_page) 4272 unsigned int pages_per_huge_page)
4268 { 4273 {
4269 int i; 4274 int i;
4270 struct page *dst_base = dst; 4275 struct page *dst_base = dst;
4271 struct page *src_base = src; 4276 struct page *src_base = src;
4272 4277
4273 for (i = 0; i < pages_per_huge_page; ) { 4278 for (i = 0; i < pages_per_huge_page; ) {
4274 cond_resched(); 4279 cond_resched();
4275 copy_user_highpage(dst, src, addr + i*PAGE_SIZE, vma); 4280 copy_user_highpage(dst, src, addr + i*PAGE_SIZE, vma);
4276 4281
4277 i++; 4282 i++;
4278 dst = mem_map_next(dst, dst_base, i); 4283 dst = mem_map_next(dst, dst_base, i);
4279 src = mem_map_next(src, src_base, i); 4284 src = mem_map_next(src, src_base, i);
4280 } 4285 }
4281 } 4286 }
4282 4287
4283 void copy_user_huge_page(struct page *dst, struct page *src, 4288 void copy_user_huge_page(struct page *dst, struct page *src,
4284 unsigned long addr, struct vm_area_struct *vma, 4289 unsigned long addr, struct vm_area_struct *vma,
4285 unsigned int pages_per_huge_page) 4290 unsigned int pages_per_huge_page)
4286 { 4291 {
4287 int i; 4292 int i;
4288 4293
4289 if (unlikely(pages_per_huge_page > MAX_ORDER_NR_PAGES)) { 4294 if (unlikely(pages_per_huge_page > MAX_ORDER_NR_PAGES)) {
4290 copy_user_gigantic_page(dst, src, addr, vma, 4295 copy_user_gigantic_page(dst, src, addr, vma,
4291 pages_per_huge_page); 4296 pages_per_huge_page);
4292 return; 4297 return;
4293 } 4298 }
4294 4299
4295 might_sleep(); 4300 might_sleep();
4296 for (i = 0; i < pages_per_huge_page; i++) { 4301 for (i = 0; i < pages_per_huge_page; i++) {
4297 cond_resched(); 4302 cond_resched();
4298 copy_user_highpage(dst + i, src + i, addr + i*PAGE_SIZE, vma); 4303 copy_user_highpage(dst + i, src + i, addr + i*PAGE_SIZE, vma);
4299 } 4304 }
4300 } 4305 }
4301 #endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */ 4306 #endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */
4302 4307