Commit 6a01f8dd2508cf79abbdccc44a6a41b2e17fb3cb

Authored by David Rientjes
Committed by Jiri Slaby
1 parent 1d08848674

mm, thp: only collapse hugepages to nodes with affinity for zone_reclaim_mode

commit 14a4e2141e24304fff2c697be6382ffb83888185 upstream.

Commit 9f1b868a13ac ("mm: thp: khugepaged: add policy for finding target
node") improved the previous khugepaged logic which allocated a
transparent hugepages from the node of the first page being collapsed.

However, it is still possible to collapse pages to remote memory which
may suffer from additional access latency.  With the current policy, it
is possible that 255 pages (with PAGE_SHIFT == 12) will be collapsed
remotely if the majority are allocated from that node.

When zone_reclaim_mode is enabled, it means the VM should make every
attempt to allocate locally to prevent NUMA performance degradation.  In
this case, we do not want to collapse hugepages to remote nodes that
would suffer from increased access latency.  Thus, when
zone_reclaim_mode is enabled, only allow collapsing to nodes with
RECLAIM_DISTANCE or less.

There is no functional change for systems that disable
zone_reclaim_mode.

Signed-off-by: David Rientjes <rientjes@google.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Acked-by: Mel Gorman <mgorman@suse.de>
Cc: Rik van Riel <riel@redhat.com>
Cc: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Cc: Bob Liu <bob.liu@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Mel Gorman <mgorman@suse.de>
Signed-off-by: Jiri Slaby <jslaby@suse.cz>

Showing 1 changed file, mm/huge_memory.c, with 26 additions and 0 deletions (inline diff)
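
The 26 added lines live in the khugepaged code further down the file and are not part of the context reproduced below. As a rough, non-verbatim illustration of the policy described above: with 4 KB pages (PAGE_SHIFT == 12) a PMD covers 512 PTEs, so a bare remote majority of 257 pages can pull the remaining 255 locally allocated pages onto the remote node. A minimal sketch of the kind of check the patch description implies is shown here; the helper name khugepaged_scan_abort() and the use of the khugepaged_node_load[] tally introduced by commit 9f1b868a13ac are assumptions for illustration, not a copy of the patch.

/*
 * Illustrative sketch only: when zone_reclaim_mode requests strictly local
 * allocation, abort the scan if the candidate node is further than
 * RECLAIM_DISTANCE from any node that already has pages counted towards
 * this prospective hugepage.
 */
static bool khugepaged_scan_abort(int nid)
{
	int i;

	/* Without zone_reclaim_mode, no extra effort is made to stay local. */
	if (!zone_reclaim_mode)
		return false;

	/* A node that already has pages counted for it is acceptable. */
	if (khugepaged_node_load[nid])
		return false;

	for (i = 0; i < MAX_NUMNODES; i++) {
		if (!khugepaged_node_load[i])
			continue;
		if (node_distance(nid, i) > RECLAIM_DISTANCE)
			return true;
	}
	return false;
}

A check of this shape would be consulted from the khugepaged PMD scan while the per-node counts are tallied, so a scan bails out before committing to a target node outside RECLAIM_DISTANCE; systems with zone_reclaim_mode disabled take the early return and behave exactly as before.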

/*
 * Copyright (C) 2009 Red Hat, Inc.
 *
 * This work is licensed under the terms of the GNU GPL, version 2. See
 * the COPYING file in the top-level directory.
 */

#include <linux/mm.h>
#include <linux/sched.h>
#include <linux/highmem.h>
#include <linux/hugetlb.h>
#include <linux/mmu_notifier.h>
#include <linux/rmap.h>
#include <linux/swap.h>
#include <linux/shrinker.h>
#include <linux/mm_inline.h>
#include <linux/kthread.h>
#include <linux/khugepaged.h>
#include <linux/freezer.h>
#include <linux/mman.h>
#include <linux/pagemap.h>
#include <linux/migrate.h>
#include <linux/hashtable.h>

#include <asm/tlb.h>
#include <asm/pgalloc.h>
#include "internal.h"

/*
 * By default transparent hugepage support is enabled for all mappings
 * and khugepaged scans all mappings. Defrag is only invoked by
 * khugepaged hugepage allocations and by page faults inside
 * MADV_HUGEPAGE regions to avoid the risk of slowing down short lived
 * allocations.
 */
unsigned long transparent_hugepage_flags __read_mostly =
#ifdef CONFIG_TRANSPARENT_HUGEPAGE_ALWAYS
	(1<<TRANSPARENT_HUGEPAGE_FLAG)|
#endif
#ifdef CONFIG_TRANSPARENT_HUGEPAGE_MADVISE
	(1<<TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG)|
#endif
	(1<<TRANSPARENT_HUGEPAGE_DEFRAG_FLAG)|
	(1<<TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG)|
	(1<<TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);

/* default scan 8*512 pte (or vmas) every 30 second */
static unsigned int khugepaged_pages_to_scan __read_mostly = HPAGE_PMD_NR*8;
static unsigned int khugepaged_pages_collapsed;
static unsigned int khugepaged_full_scans;
static unsigned int khugepaged_scan_sleep_millisecs __read_mostly = 10000;
/* during fragmentation poll the hugepage allocator once every minute */
static unsigned int khugepaged_alloc_sleep_millisecs __read_mostly = 60000;
static struct task_struct *khugepaged_thread __read_mostly;
static DEFINE_MUTEX(khugepaged_mutex);
static DEFINE_SPINLOCK(khugepaged_mm_lock);
static DECLARE_WAIT_QUEUE_HEAD(khugepaged_wait);
/*
 * default collapse hugepages if there is at least one pte mapped like
 * it would have happened if the vma was large enough during page
 * fault.
 */
static unsigned int khugepaged_max_ptes_none __read_mostly = HPAGE_PMD_NR-1;

static int khugepaged(void *none);
static int khugepaged_slab_init(void);

#define MM_SLOTS_HASH_BITS 10
static __read_mostly DEFINE_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS);

static struct kmem_cache *mm_slot_cache __read_mostly;

/**
 * struct mm_slot - hash lookup from mm to mm_slot
 * @hash: hash collision list
 * @mm_node: khugepaged scan list headed in khugepaged_scan.mm_head
 * @mm: the mm that this information is valid for
 */
struct mm_slot {
	struct hlist_node hash;
	struct list_head mm_node;
	struct mm_struct *mm;
};

/**
 * struct khugepaged_scan - cursor for scanning
 * @mm_head: the head of the mm list to scan
 * @mm_slot: the current mm_slot we are scanning
 * @address: the next address inside that to be scanned
 *
 * There is only the one khugepaged_scan instance of this cursor structure.
 */
struct khugepaged_scan {
	struct list_head mm_head;
	struct mm_slot *mm_slot;
	unsigned long address;
};
static struct khugepaged_scan khugepaged_scan = {
	.mm_head = LIST_HEAD_INIT(khugepaged_scan.mm_head),
};


static int set_recommended_min_free_kbytes(void)
{
	struct zone *zone;
	int nr_zones = 0;
	unsigned long recommended_min;

	if (!khugepaged_enabled())
		return 0;

	for_each_populated_zone(zone)
		nr_zones++;

	/* Make sure at least 2 hugepages are free for MIGRATE_RESERVE */
	recommended_min = pageblock_nr_pages * nr_zones * 2;

	/*
	 * Make sure that on average at least two pageblocks are almost free
	 * of another type, one for a migratetype to fall back to and a
	 * second to avoid subsequent fallbacks of other types There are 3
	 * MIGRATE_TYPES we care about.
	 */
	recommended_min += pageblock_nr_pages * nr_zones *
			   MIGRATE_PCPTYPES * MIGRATE_PCPTYPES;

	/* don't ever allow to reserve more than 5% of the lowmem */
	recommended_min = min(recommended_min,
			      (unsigned long) nr_free_buffer_pages() / 20);
	recommended_min <<= (PAGE_SHIFT-10);

	if (recommended_min > min_free_kbytes)
		min_free_kbytes = recommended_min;
	setup_per_zone_wmarks();
	return 0;
}
late_initcall(set_recommended_min_free_kbytes);

static int start_khugepaged(void)
{
	int err = 0;
	if (khugepaged_enabled()) {
		if (!khugepaged_thread)
			khugepaged_thread = kthread_run(khugepaged, NULL,
							"khugepaged");
		if (unlikely(IS_ERR(khugepaged_thread))) {
			printk(KERN_ERR
			       "khugepaged: kthread_run(khugepaged) failed\n");
			err = PTR_ERR(khugepaged_thread);
			khugepaged_thread = NULL;
		}

		if (!list_empty(&khugepaged_scan.mm_head))
			wake_up_interruptible(&khugepaged_wait);

		set_recommended_min_free_kbytes();
	} else if (khugepaged_thread) {
		kthread_stop(khugepaged_thread);
		khugepaged_thread = NULL;
	}

	return err;
}

static atomic_t huge_zero_refcount;
static struct page *huge_zero_page __read_mostly;

static inline bool is_huge_zero_page(struct page *page)
{
	return ACCESS_ONCE(huge_zero_page) == page;
}

static inline bool is_huge_zero_pmd(pmd_t pmd)
{
	return is_huge_zero_page(pmd_page(pmd));
}

static struct page *get_huge_zero_page(void)
{
	struct page *zero_page;
retry:
	if (likely(atomic_inc_not_zero(&huge_zero_refcount)))
		return ACCESS_ONCE(huge_zero_page);

	zero_page = alloc_pages((GFP_TRANSHUGE | __GFP_ZERO) & ~__GFP_MOVABLE,
			HPAGE_PMD_ORDER);
	if (!zero_page) {
		count_vm_event(THP_ZERO_PAGE_ALLOC_FAILED);
		return NULL;
	}
	count_vm_event(THP_ZERO_PAGE_ALLOC);
	preempt_disable();
	if (cmpxchg(&huge_zero_page, NULL, zero_page)) {
		preempt_enable();
		__free_page(zero_page);
		goto retry;
	}

	/* We take additional reference here. It will be put back by shrinker */
	atomic_set(&huge_zero_refcount, 2);
	preempt_enable();
	return ACCESS_ONCE(huge_zero_page);
}

static void put_huge_zero_page(void)
{
	/*
	 * Counter should never go to zero here. Only shrinker can put
	 * last reference.
	 */
	BUG_ON(atomic_dec_and_test(&huge_zero_refcount));
}

static unsigned long shrink_huge_zero_page_count(struct shrinker *shrink,
					struct shrink_control *sc)
{
	/* we can free zero page only if last reference remains */
	return atomic_read(&huge_zero_refcount) == 1 ? HPAGE_PMD_NR : 0;
}

static unsigned long shrink_huge_zero_page_scan(struct shrinker *shrink,
				       struct shrink_control *sc)
{
	if (atomic_cmpxchg(&huge_zero_refcount, 1, 0) == 1) {
		struct page *zero_page = xchg(&huge_zero_page, NULL);
		BUG_ON(zero_page == NULL);
		__free_page(zero_page);
		return HPAGE_PMD_NR;
	}

	return 0;
}

static struct shrinker huge_zero_page_shrinker = {
	.count_objects = shrink_huge_zero_page_count,
	.scan_objects = shrink_huge_zero_page_scan,
	.seeks = DEFAULT_SEEKS,
};

#ifdef CONFIG_SYSFS

static ssize_t double_flag_show(struct kobject *kobj,
				struct kobj_attribute *attr, char *buf,
				enum transparent_hugepage_flag enabled,
				enum transparent_hugepage_flag req_madv)
{
	if (test_bit(enabled, &transparent_hugepage_flags)) {
		VM_BUG_ON(test_bit(req_madv, &transparent_hugepage_flags));
		return sprintf(buf, "[always] madvise never\n");
	} else if (test_bit(req_madv, &transparent_hugepage_flags))
		return sprintf(buf, "always [madvise] never\n");
	else
		return sprintf(buf, "always madvise [never]\n");
}
static ssize_t double_flag_store(struct kobject *kobj,
				 struct kobj_attribute *attr,
				 const char *buf, size_t count,
				 enum transparent_hugepage_flag enabled,
				 enum transparent_hugepage_flag req_madv)
{
	if (!memcmp("always", buf,
		    min(sizeof("always")-1, count))) {
		set_bit(enabled, &transparent_hugepage_flags);
		clear_bit(req_madv, &transparent_hugepage_flags);
	} else if (!memcmp("madvise", buf,
			   min(sizeof("madvise")-1, count))) {
		clear_bit(enabled, &transparent_hugepage_flags);
		set_bit(req_madv, &transparent_hugepage_flags);
	} else if (!memcmp("never", buf,
			   min(sizeof("never")-1, count))) {
		clear_bit(enabled, &transparent_hugepage_flags);
		clear_bit(req_madv, &transparent_hugepage_flags);
	} else
		return -EINVAL;

	return count;
}

static ssize_t enabled_show(struct kobject *kobj,
			    struct kobj_attribute *attr, char *buf)
{
	return double_flag_show(kobj, attr, buf,
				TRANSPARENT_HUGEPAGE_FLAG,
				TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG);
}
static ssize_t enabled_store(struct kobject *kobj,
			     struct kobj_attribute *attr,
			     const char *buf, size_t count)
{
	ssize_t ret;

	ret = double_flag_store(kobj, attr, buf, count,
				TRANSPARENT_HUGEPAGE_FLAG,
				TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG);

	if (ret > 0) {
		int err;

		mutex_lock(&khugepaged_mutex);
		err = start_khugepaged();
		mutex_unlock(&khugepaged_mutex);

		if (err)
			ret = err;
	}

	return ret;
}
static struct kobj_attribute enabled_attr =
	__ATTR(enabled, 0644, enabled_show, enabled_store);

static ssize_t single_flag_show(struct kobject *kobj,
				struct kobj_attribute *attr, char *buf,
				enum transparent_hugepage_flag flag)
{
	return sprintf(buf, "%d\n",
		       !!test_bit(flag, &transparent_hugepage_flags));
}

static ssize_t single_flag_store(struct kobject *kobj,
				 struct kobj_attribute *attr,
				 const char *buf, size_t count,
				 enum transparent_hugepage_flag flag)
{
	unsigned long value;
	int ret;

	ret = kstrtoul(buf, 10, &value);
	if (ret < 0)
		return ret;
	if (value > 1)
		return -EINVAL;

	if (value)
		set_bit(flag, &transparent_hugepage_flags);
	else
		clear_bit(flag, &transparent_hugepage_flags);

	return count;
}

/*
 * Currently defrag only disables __GFP_NOWAIT for allocation. A blind
 * __GFP_REPEAT is too aggressive, it's never worth swapping tons of
 * memory just to allocate one more hugepage.
 */
static ssize_t defrag_show(struct kobject *kobj,
			   struct kobj_attribute *attr, char *buf)
{
	return double_flag_show(kobj, attr, buf,
				TRANSPARENT_HUGEPAGE_DEFRAG_FLAG,
				TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG);
}
static ssize_t defrag_store(struct kobject *kobj,
			    struct kobj_attribute *attr,
			    const char *buf, size_t count)
{
	return double_flag_store(kobj, attr, buf, count,
				 TRANSPARENT_HUGEPAGE_DEFRAG_FLAG,
				 TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG);
}
static struct kobj_attribute defrag_attr =
	__ATTR(defrag, 0644, defrag_show, defrag_store);

static ssize_t use_zero_page_show(struct kobject *kobj,
		struct kobj_attribute *attr, char *buf)
{
	return single_flag_show(kobj, attr, buf,
				TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);
}
static ssize_t use_zero_page_store(struct kobject *kobj,
		struct kobj_attribute *attr, const char *buf, size_t count)
{
	return single_flag_store(kobj, attr, buf, count,
				 TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);
}
static struct kobj_attribute use_zero_page_attr =
	__ATTR(use_zero_page, 0644, use_zero_page_show, use_zero_page_store);
#ifdef CONFIG_DEBUG_VM
static ssize_t debug_cow_show(struct kobject *kobj,
				struct kobj_attribute *attr, char *buf)
{
	return single_flag_show(kobj, attr, buf,
				TRANSPARENT_HUGEPAGE_DEBUG_COW_FLAG);
}
static ssize_t debug_cow_store(struct kobject *kobj,
			       struct kobj_attribute *attr,
			       const char *buf, size_t count)
{
	return single_flag_store(kobj, attr, buf, count,
				 TRANSPARENT_HUGEPAGE_DEBUG_COW_FLAG);
}
static struct kobj_attribute debug_cow_attr =
	__ATTR(debug_cow, 0644, debug_cow_show, debug_cow_store);
#endif /* CONFIG_DEBUG_VM */

static struct attribute *hugepage_attr[] = {
	&enabled_attr.attr,
	&defrag_attr.attr,
	&use_zero_page_attr.attr,
#ifdef CONFIG_DEBUG_VM
	&debug_cow_attr.attr,
#endif
	NULL,
};

static struct attribute_group hugepage_attr_group = {
	.attrs = hugepage_attr,
};

static ssize_t scan_sleep_millisecs_show(struct kobject *kobj,
					 struct kobj_attribute *attr,
					 char *buf)
{
	return sprintf(buf, "%u\n", khugepaged_scan_sleep_millisecs);
}

static ssize_t scan_sleep_millisecs_store(struct kobject *kobj,
					  struct kobj_attribute *attr,
					  const char *buf, size_t count)
{
	unsigned long msecs;
	int err;

	err = kstrtoul(buf, 10, &msecs);
	if (err || msecs > UINT_MAX)
		return -EINVAL;

	khugepaged_scan_sleep_millisecs = msecs;
	wake_up_interruptible(&khugepaged_wait);

	return count;
}
static struct kobj_attribute scan_sleep_millisecs_attr =
	__ATTR(scan_sleep_millisecs, 0644, scan_sleep_millisecs_show,
	       scan_sleep_millisecs_store);

static ssize_t alloc_sleep_millisecs_show(struct kobject *kobj,
					  struct kobj_attribute *attr,
					  char *buf)
{
	return sprintf(buf, "%u\n", khugepaged_alloc_sleep_millisecs);
}

static ssize_t alloc_sleep_millisecs_store(struct kobject *kobj,
					   struct kobj_attribute *attr,
					   const char *buf, size_t count)
{
	unsigned long msecs;
	int err;

	err = kstrtoul(buf, 10, &msecs);
	if (err || msecs > UINT_MAX)
		return -EINVAL;

	khugepaged_alloc_sleep_millisecs = msecs;
	wake_up_interruptible(&khugepaged_wait);

	return count;
}
static struct kobj_attribute alloc_sleep_millisecs_attr =
	__ATTR(alloc_sleep_millisecs, 0644, alloc_sleep_millisecs_show,
	       alloc_sleep_millisecs_store);

static ssize_t pages_to_scan_show(struct kobject *kobj,
				  struct kobj_attribute *attr,
				  char *buf)
{
	return sprintf(buf, "%u\n", khugepaged_pages_to_scan);
}
static ssize_t pages_to_scan_store(struct kobject *kobj,
				   struct kobj_attribute *attr,
				   const char *buf, size_t count)
{
	int err;
	unsigned long pages;

	err = kstrtoul(buf, 10, &pages);
	if (err || !pages || pages > UINT_MAX)
		return -EINVAL;

	khugepaged_pages_to_scan = pages;

	return count;
}
static struct kobj_attribute pages_to_scan_attr =
	__ATTR(pages_to_scan, 0644, pages_to_scan_show,
	       pages_to_scan_store);

static ssize_t pages_collapsed_show(struct kobject *kobj,
				    struct kobj_attribute *attr,
				    char *buf)
{
	return sprintf(buf, "%u\n", khugepaged_pages_collapsed);
}
static struct kobj_attribute pages_collapsed_attr =
	__ATTR_RO(pages_collapsed);

static ssize_t full_scans_show(struct kobject *kobj,
			       struct kobj_attribute *attr,
			       char *buf)
{
	return sprintf(buf, "%u\n", khugepaged_full_scans);
}
static struct kobj_attribute full_scans_attr =
	__ATTR_RO(full_scans);

static ssize_t khugepaged_defrag_show(struct kobject *kobj,
				      struct kobj_attribute *attr, char *buf)
{
	return single_flag_show(kobj, attr, buf,
				TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG);
}
static ssize_t khugepaged_defrag_store(struct kobject *kobj,
				       struct kobj_attribute *attr,
				       const char *buf, size_t count)
{
	return single_flag_store(kobj, attr, buf, count,
				 TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG);
}
static struct kobj_attribute khugepaged_defrag_attr =
	__ATTR(defrag, 0644, khugepaged_defrag_show,
	       khugepaged_defrag_store);

/*
 * max_ptes_none controls if khugepaged should collapse hugepages over
 * any unmapped ptes in turn potentially increasing the memory
 * footprint of the vmas. When max_ptes_none is 0 khugepaged will not
 * reduce the available free memory in the system as it
 * runs. Increasing max_ptes_none will instead potentially reduce the
 * free memory in the system during the khugepaged scan.
 */
static ssize_t khugepaged_max_ptes_none_show(struct kobject *kobj,
					     struct kobj_attribute *attr,
					     char *buf)
{
	return sprintf(buf, "%u\n", khugepaged_max_ptes_none);
}
static ssize_t khugepaged_max_ptes_none_store(struct kobject *kobj,
					      struct kobj_attribute *attr,
					      const char *buf, size_t count)
{
	int err;
	unsigned long max_ptes_none;

	err = kstrtoul(buf, 10, &max_ptes_none);
	if (err || max_ptes_none > HPAGE_PMD_NR-1)
		return -EINVAL;

	khugepaged_max_ptes_none = max_ptes_none;

	return count;
}
static struct kobj_attribute khugepaged_max_ptes_none_attr =
	__ATTR(max_ptes_none, 0644, khugepaged_max_ptes_none_show,
	       khugepaged_max_ptes_none_store);

static struct attribute *khugepaged_attr[] = {
	&khugepaged_defrag_attr.attr,
	&khugepaged_max_ptes_none_attr.attr,
	&pages_to_scan_attr.attr,
	&pages_collapsed_attr.attr,
	&full_scans_attr.attr,
	&scan_sleep_millisecs_attr.attr,
	&alloc_sleep_millisecs_attr.attr,
	NULL,
};

static struct attribute_group khugepaged_attr_group = {
	.attrs = khugepaged_attr,
	.name = "khugepaged",
};

static int __init hugepage_init_sysfs(struct kobject **hugepage_kobj)
{
	int err;

	*hugepage_kobj = kobject_create_and_add("transparent_hugepage", mm_kobj);
	if (unlikely(!*hugepage_kobj)) {
		printk(KERN_ERR "hugepage: failed to create transparent hugepage kobject\n");
		return -ENOMEM;
	}

	err = sysfs_create_group(*hugepage_kobj, &hugepage_attr_group);
	if (err) {
		printk(KERN_ERR "hugepage: failed to register transparent hugepage group\n");
		goto delete_obj;
	}

	err = sysfs_create_group(*hugepage_kobj, &khugepaged_attr_group);
	if (err) {
		printk(KERN_ERR "hugepage: failed to register transparent hugepage group\n");
		goto remove_hp_group;
	}

	return 0;

remove_hp_group:
	sysfs_remove_group(*hugepage_kobj, &hugepage_attr_group);
delete_obj:
	kobject_put(*hugepage_kobj);
	return err;
}

static void __init hugepage_exit_sysfs(struct kobject *hugepage_kobj)
{
	sysfs_remove_group(hugepage_kobj, &khugepaged_attr_group);
	sysfs_remove_group(hugepage_kobj, &hugepage_attr_group);
	kobject_put(hugepage_kobj);
}
#else
static inline int hugepage_init_sysfs(struct kobject **hugepage_kobj)
{
	return 0;
}

static inline void hugepage_exit_sysfs(struct kobject *hugepage_kobj)
{
}
#endif /* CONFIG_SYSFS */

static int __init hugepage_init(void)
{
	int err;
	struct kobject *hugepage_kobj;

	if (!has_transparent_hugepage()) {
		transparent_hugepage_flags = 0;
		return -EINVAL;
	}

	err = hugepage_init_sysfs(&hugepage_kobj);
	if (err)
		return err;

	err = khugepaged_slab_init();
	if (err)
		goto out;

	register_shrinker(&huge_zero_page_shrinker);

	/*
	 * By default disable transparent hugepages on smaller systems,
	 * where the extra memory used could hurt more than TLB overhead
	 * is likely to save. The admin can still enable it through /sys.
	 */
	if (totalram_pages < (512 << (20 - PAGE_SHIFT)))
		transparent_hugepage_flags = 0;

	start_khugepaged();

	return 0;
out:
	hugepage_exit_sysfs(hugepage_kobj);
	return err;
}
module_init(hugepage_init)

static int __init setup_transparent_hugepage(char *str)
{
	int ret = 0;
	if (!str)
		goto out;
	if (!strcmp(str, "always")) {
		set_bit(TRANSPARENT_HUGEPAGE_FLAG,
			&transparent_hugepage_flags);
		clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
			  &transparent_hugepage_flags);
		ret = 1;
	} else if (!strcmp(str, "madvise")) {
		clear_bit(TRANSPARENT_HUGEPAGE_FLAG,
			  &transparent_hugepage_flags);
		set_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
			&transparent_hugepage_flags);
		ret = 1;
	} else if (!strcmp(str, "never")) {
		clear_bit(TRANSPARENT_HUGEPAGE_FLAG,
			  &transparent_hugepage_flags);
		clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
			  &transparent_hugepage_flags);
		ret = 1;
	}
out:
	if (!ret)
		printk(KERN_WARNING
		       "transparent_hugepage= cannot parse, ignored\n");
	return ret;
}
__setup("transparent_hugepage=", setup_transparent_hugepage);

pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma)
{
	if (likely(vma->vm_flags & VM_WRITE))
		pmd = pmd_mkwrite(pmd);
	return pmd;
}

static inline pmd_t mk_huge_pmd(struct page *page, pgprot_t prot)
{
	pmd_t entry;
	entry = mk_pmd(page, prot);
	entry = pmd_mkhuge(entry);
	return entry;
}

static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
					struct vm_area_struct *vma,
					unsigned long haddr, pmd_t *pmd,
					struct page *page)
{
	pgtable_t pgtable;

	VM_BUG_ON(!PageCompound(page));
	pgtable = pte_alloc_one(mm, haddr);
	if (unlikely(!pgtable))
		return VM_FAULT_OOM;

	clear_huge_page(page, haddr, HPAGE_PMD_NR);
	/*
	 * The memory barrier inside __SetPageUptodate makes sure that
	 * clear_huge_page writes become visible before the set_pmd_at()
	 * write.
	 */
	__SetPageUptodate(page);

	spin_lock(&mm->page_table_lock);
	if (unlikely(!pmd_none(*pmd))) {
		spin_unlock(&mm->page_table_lock);
		mem_cgroup_uncharge_page(page);
		put_page(page);
		pte_free(mm, pgtable);
	} else {
		pmd_t entry;
		entry = mk_huge_pmd(page, vma->vm_page_prot);
		entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
		page_add_new_anon_rmap(page, vma, haddr);
		pgtable_trans_huge_deposit(mm, pmd, pgtable);
		set_pmd_at(mm, haddr, pmd, entry);
		add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR);
		mm->nr_ptes++;
		spin_unlock(&mm->page_table_lock);
	}

	return 0;
}

static inline gfp_t alloc_hugepage_gfpmask(int defrag, gfp_t extra_gfp)
{
	return (GFP_TRANSHUGE & ~(defrag ? 0 : __GFP_WAIT)) | extra_gfp;
}

static inline struct page *alloc_hugepage_vma(int defrag,
					      struct vm_area_struct *vma,
					      unsigned long haddr, int nd,
					      gfp_t extra_gfp)
{
	return alloc_pages_vma(alloc_hugepage_gfpmask(defrag, extra_gfp),
			       HPAGE_PMD_ORDER, vma, haddr, nd);
}

static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm,
		struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd,
		struct page *zero_page)
{
	pmd_t entry;
	if (!pmd_none(*pmd))
		return false;
	entry = mk_pmd(zero_page, vma->vm_page_prot);
	entry = pmd_wrprotect(entry);
	entry = pmd_mkhuge(entry);
	pgtable_trans_huge_deposit(mm, pmd, pgtable);
	set_pmd_at(mm, haddr, pmd, entry);
	mm->nr_ptes++;
	return true;
}

int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
			       unsigned long address, pmd_t *pmd,
			       unsigned int flags)
{
	struct page *page;
	unsigned long haddr = address & HPAGE_PMD_MASK;

	if (haddr < vma->vm_start || haddr + HPAGE_PMD_SIZE > vma->vm_end)
		return VM_FAULT_FALLBACK;
	if (unlikely(anon_vma_prepare(vma)))
		return VM_FAULT_OOM;
	if (unlikely(khugepaged_enter(vma)))
		return VM_FAULT_OOM;
	if (!(flags & FAULT_FLAG_WRITE) &&
			transparent_hugepage_use_zero_page()) {
		pgtable_t pgtable;
		struct page *zero_page;
		bool set;
		pgtable = pte_alloc_one(mm, haddr);
		if (unlikely(!pgtable))
			return VM_FAULT_OOM;
		zero_page = get_huge_zero_page();
		if (unlikely(!zero_page)) {
			pte_free(mm, pgtable);
			count_vm_event(THP_FAULT_FALLBACK);
			return VM_FAULT_FALLBACK;
		}
		spin_lock(&mm->page_table_lock);
		set = set_huge_zero_page(pgtable, mm, vma, haddr, pmd,
				zero_page);
		spin_unlock(&mm->page_table_lock);
		if (!set) {
			pte_free(mm, pgtable);
			put_huge_zero_page();
		}
		return 0;
	}
	page = alloc_hugepage_vma(transparent_hugepage_defrag(vma),
			vma, haddr, numa_node_id(), 0);
	if (unlikely(!page)) {
		count_vm_event(THP_FAULT_FALLBACK);
		return VM_FAULT_FALLBACK;
	}
	if (unlikely(mem_cgroup_newpage_charge(page, mm, GFP_KERNEL))) {
		put_page(page);
		count_vm_event(THP_FAULT_FALLBACK);
		return VM_FAULT_FALLBACK;
	}
	if (unlikely(__do_huge_pmd_anonymous_page(mm, vma, haddr, pmd, page))) {
		mem_cgroup_uncharge_page(page);
		put_page(page);
		count_vm_event(THP_FAULT_FALLBACK);
		return VM_FAULT_FALLBACK;
	}

	count_vm_event(THP_FAULT_ALLOC);
	return 0;
}

int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
		  pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr,
		  struct vm_area_struct *vma)
{
	struct page *src_page;
	pmd_t pmd;
	pgtable_t pgtable;
	int ret;

	ret = -ENOMEM;
	pgtable = pte_alloc_one(dst_mm, addr);
	if (unlikely(!pgtable))
		goto out;

	spin_lock(&dst_mm->page_table_lock);
	spin_lock_nested(&src_mm->page_table_lock, SINGLE_DEPTH_NESTING);

	ret = -EAGAIN;
	pmd = *src_pmd;
	if (unlikely(!pmd_trans_huge(pmd))) {
		pte_free(dst_mm, pgtable);
		goto out_unlock;
	}
	/*
	 * mm->page_table_lock is enough to be sure that huge zero pmd is not
	 * under splitting since we don't split the page itself, only pmd to
	 * a page table.
	 */
	if (is_huge_zero_pmd(pmd)) {
		struct page *zero_page;
		bool set;
		/*
		 * get_huge_zero_page() will never allocate a new page here,
		 * since we already have a zero page to copy. It just takes a
		 * reference.
		 */
		zero_page = get_huge_zero_page();
		set = set_huge_zero_page(pgtable, dst_mm, vma, addr, dst_pmd,
				zero_page);
		BUG_ON(!set); /* unexpected !pmd_none(dst_pmd) */
		ret = 0;
		goto out_unlock;
	}

	/* mmap_sem prevents this happening but warn if that changes */
	WARN_ON(pmd_trans_migrating(pmd));

	if (unlikely(pmd_trans_splitting(pmd))) {
		/* split huge page running from under us */
		spin_unlock(&src_mm->page_table_lock);
		spin_unlock(&dst_mm->page_table_lock);
		pte_free(dst_mm, pgtable);

		wait_split_huge_page(vma->anon_vma, src_pmd); /* src_vma */
		goto out;
	}
	src_page = pmd_page(pmd);
	VM_BUG_ON(!PageHead(src_page));
	get_page(src_page);
	page_dup_rmap(src_page);
	add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR);

	pmdp_set_wrprotect(src_mm, addr, src_pmd);
	pmd = pmd_mkold(pmd_wrprotect(pmd));
	pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable);
	set_pmd_at(dst_mm, addr, dst_pmd, pmd);
	dst_mm->nr_ptes++;

	ret = 0;
out_unlock:
	spin_unlock(&src_mm->page_table_lock);
	spin_unlock(&dst_mm->page_table_lock);
out:
	return ret;
}

void huge_pmd_set_accessed(struct mm_struct *mm,
			   struct vm_area_struct *vma,
			   unsigned long address,
			   pmd_t *pmd, pmd_t orig_pmd,
			   int dirty)
{
	pmd_t entry;
	unsigned long haddr;

	spin_lock(&mm->page_table_lock);
	if (unlikely(!pmd_same(*pmd, orig_pmd)))
		goto unlock;

	entry = pmd_mkyoung(orig_pmd);
	haddr = address & HPAGE_PMD_MASK;
	if (pmdp_set_access_flags(vma, haddr, pmd, entry, dirty))
		update_mmu_cache_pmd(vma, address, pmd);

unlock:
	spin_unlock(&mm->page_table_lock);
}

static int do_huge_pmd_wp_zero_page_fallback(struct mm_struct *mm,
		struct vm_area_struct *vma, unsigned long address,
		pmd_t *pmd, pmd_t orig_pmd, unsigned long haddr)
{
	pgtable_t pgtable;
	pmd_t _pmd;
	struct page *page;
	int i, ret = 0;
	unsigned long mmun_start;	/* For mmu_notifiers */
	unsigned long mmun_end;		/* For mmu_notifiers */

	page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
	if (!page) {
		ret |= VM_FAULT_OOM;
		goto out;
	}

	if (mem_cgroup_newpage_charge(page, mm, GFP_KERNEL)) {
		put_page(page);
		ret |= VM_FAULT_OOM;
		goto out;
	}

	clear_user_highpage(page, address);
	__SetPageUptodate(page);

	mmun_start = haddr;
	mmun_end = haddr + HPAGE_PMD_SIZE;
	mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);

	spin_lock(&mm->page_table_lock);
	if (unlikely(!pmd_same(*pmd, orig_pmd)))
		goto out_free_page;

	pmdp_clear_flush(vma, haddr, pmd);
	/* leave pmd empty until pte is filled */

	pgtable = pgtable_trans_huge_withdraw(mm, pmd);
	pmd_populate(mm, &_pmd, pgtable);

	for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
		pte_t *pte, entry;
		if (haddr == (address & PAGE_MASK)) {
			entry = mk_pte(page, vma->vm_page_prot);
			entry = maybe_mkwrite(pte_mkdirty(entry), vma);
			page_add_new_anon_rmap(page, vma, haddr);
		} else {
			entry = pfn_pte(my_zero_pfn(haddr), vma->vm_page_prot);
			entry = pte_mkspecial(entry);
		}
		pte = pte_offset_map(&_pmd, haddr);
		VM_BUG_ON(!pte_none(*pte));
		set_pte_at(mm, haddr, pte, entry);
987 pte_unmap(pte); 987 pte_unmap(pte);
988 } 988 }
989 smp_wmb(); /* make pte visible before pmd */ 989 smp_wmb(); /* make pte visible before pmd */
990 pmd_populate(mm, pmd, pgtable); 990 pmd_populate(mm, pmd, pgtable);
991 spin_unlock(&mm->page_table_lock); 991 spin_unlock(&mm->page_table_lock);
992 put_huge_zero_page(); 992 put_huge_zero_page();
993 inc_mm_counter(mm, MM_ANONPAGES); 993 inc_mm_counter(mm, MM_ANONPAGES);
994 994
995 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); 995 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
996 996
997 ret |= VM_FAULT_WRITE; 997 ret |= VM_FAULT_WRITE;
998 out: 998 out:
999 return ret; 999 return ret;
1000 out_free_page: 1000 out_free_page:
1001 spin_unlock(&mm->page_table_lock); 1001 spin_unlock(&mm->page_table_lock);
1002 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); 1002 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
1003 mem_cgroup_uncharge_page(page); 1003 mem_cgroup_uncharge_page(page);
1004 put_page(page); 1004 put_page(page);
1005 goto out; 1005 goto out;
1006 } 1006 }
1007 1007
1008 static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm, 1008 static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
1009 struct vm_area_struct *vma, 1009 struct vm_area_struct *vma,
1010 unsigned long address, 1010 unsigned long address,
1011 pmd_t *pmd, pmd_t orig_pmd, 1011 pmd_t *pmd, pmd_t orig_pmd,
1012 struct page *page, 1012 struct page *page,
1013 unsigned long haddr) 1013 unsigned long haddr)
1014 { 1014 {
1015 pgtable_t pgtable; 1015 pgtable_t pgtable;
1016 pmd_t _pmd; 1016 pmd_t _pmd;
1017 int ret = 0, i; 1017 int ret = 0, i;
1018 struct page **pages; 1018 struct page **pages;
1019 unsigned long mmun_start; /* For mmu_notifiers */ 1019 unsigned long mmun_start; /* For mmu_notifiers */
1020 unsigned long mmun_end; /* For mmu_notifiers */ 1020 unsigned long mmun_end; /* For mmu_notifiers */
1021 1021
1022 pages = kmalloc(sizeof(struct page *) * HPAGE_PMD_NR, 1022 pages = kmalloc(sizeof(struct page *) * HPAGE_PMD_NR,
1023 GFP_KERNEL); 1023 GFP_KERNEL);
1024 if (unlikely(!pages)) { 1024 if (unlikely(!pages)) {
1025 ret |= VM_FAULT_OOM; 1025 ret |= VM_FAULT_OOM;
1026 goto out; 1026 goto out;
1027 } 1027 }
1028 1028
1029 for (i = 0; i < HPAGE_PMD_NR; i++) { 1029 for (i = 0; i < HPAGE_PMD_NR; i++) {
1030 pages[i] = alloc_page_vma_node(GFP_HIGHUSER_MOVABLE | 1030 pages[i] = alloc_page_vma_node(GFP_HIGHUSER_MOVABLE |
1031 __GFP_OTHER_NODE, 1031 __GFP_OTHER_NODE,
1032 vma, address, page_to_nid(page)); 1032 vma, address, page_to_nid(page));
1033 if (unlikely(!pages[i] || 1033 if (unlikely(!pages[i] ||
1034 mem_cgroup_newpage_charge(pages[i], mm, 1034 mem_cgroup_newpage_charge(pages[i], mm,
1035 GFP_KERNEL))) { 1035 GFP_KERNEL))) {
1036 if (pages[i]) 1036 if (pages[i])
1037 put_page(pages[i]); 1037 put_page(pages[i]);
1038 mem_cgroup_uncharge_start(); 1038 mem_cgroup_uncharge_start();
1039 while (--i >= 0) { 1039 while (--i >= 0) {
1040 mem_cgroup_uncharge_page(pages[i]); 1040 mem_cgroup_uncharge_page(pages[i]);
1041 put_page(pages[i]); 1041 put_page(pages[i]);
1042 } 1042 }
1043 mem_cgroup_uncharge_end(); 1043 mem_cgroup_uncharge_end();
1044 kfree(pages); 1044 kfree(pages);
1045 ret |= VM_FAULT_OOM; 1045 ret |= VM_FAULT_OOM;
1046 goto out; 1046 goto out;
1047 } 1047 }
1048 } 1048 }
1049 1049
1050 for (i = 0; i < HPAGE_PMD_NR; i++) { 1050 for (i = 0; i < HPAGE_PMD_NR; i++) {
1051 copy_user_highpage(pages[i], page + i, 1051 copy_user_highpage(pages[i], page + i,
1052 haddr + PAGE_SIZE * i, vma); 1052 haddr + PAGE_SIZE * i, vma);
1053 __SetPageUptodate(pages[i]); 1053 __SetPageUptodate(pages[i]);
1054 cond_resched(); 1054 cond_resched();
1055 } 1055 }
1056 1056
1057 mmun_start = haddr; 1057 mmun_start = haddr;
1058 mmun_end = haddr + HPAGE_PMD_SIZE; 1058 mmun_end = haddr + HPAGE_PMD_SIZE;
1059 mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); 1059 mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
1060 1060
1061 spin_lock(&mm->page_table_lock); 1061 spin_lock(&mm->page_table_lock);
1062 if (unlikely(!pmd_same(*pmd, orig_pmd))) 1062 if (unlikely(!pmd_same(*pmd, orig_pmd)))
1063 goto out_free_pages; 1063 goto out_free_pages;
1064 VM_BUG_ON(!PageHead(page)); 1064 VM_BUG_ON(!PageHead(page));
1065 1065
1066 pmdp_clear_flush(vma, haddr, pmd); 1066 pmdp_clear_flush(vma, haddr, pmd);
1067 /* leave pmd empty until pte is filled */ 1067 /* leave pmd empty until pte is filled */
1068 1068
1069 pgtable = pgtable_trans_huge_withdraw(mm, pmd); 1069 pgtable = pgtable_trans_huge_withdraw(mm, pmd);
1070 pmd_populate(mm, &_pmd, pgtable); 1070 pmd_populate(mm, &_pmd, pgtable);
1071 1071
1072 for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) { 1072 for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
1073 pte_t *pte, entry; 1073 pte_t *pte, entry;
1074 entry = mk_pte(pages[i], vma->vm_page_prot); 1074 entry = mk_pte(pages[i], vma->vm_page_prot);
1075 entry = maybe_mkwrite(pte_mkdirty(entry), vma); 1075 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
1076 page_add_new_anon_rmap(pages[i], vma, haddr); 1076 page_add_new_anon_rmap(pages[i], vma, haddr);
1077 pte = pte_offset_map(&_pmd, haddr); 1077 pte = pte_offset_map(&_pmd, haddr);
1078 VM_BUG_ON(!pte_none(*pte)); 1078 VM_BUG_ON(!pte_none(*pte));
1079 set_pte_at(mm, haddr, pte, entry); 1079 set_pte_at(mm, haddr, pte, entry);
1080 pte_unmap(pte); 1080 pte_unmap(pte);
1081 } 1081 }
1082 kfree(pages); 1082 kfree(pages);
1083 1083
1084 smp_wmb(); /* make pte visible before pmd */ 1084 smp_wmb(); /* make pte visible before pmd */
1085 pmd_populate(mm, pmd, pgtable); 1085 pmd_populate(mm, pmd, pgtable);
1086 page_remove_rmap(page); 1086 page_remove_rmap(page);
1087 spin_unlock(&mm->page_table_lock); 1087 spin_unlock(&mm->page_table_lock);
1088 1088
1089 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); 1089 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
1090 1090
1091 ret |= VM_FAULT_WRITE; 1091 ret |= VM_FAULT_WRITE;
1092 put_page(page); 1092 put_page(page);
1093 1093
1094 out: 1094 out:
1095 return ret; 1095 return ret;
1096 1096
1097 out_free_pages: 1097 out_free_pages:
1098 spin_unlock(&mm->page_table_lock); 1098 spin_unlock(&mm->page_table_lock);
1099 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); 1099 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
1100 mem_cgroup_uncharge_start(); 1100 mem_cgroup_uncharge_start();
1101 for (i = 0; i < HPAGE_PMD_NR; i++) { 1101 for (i = 0; i < HPAGE_PMD_NR; i++) {
1102 mem_cgroup_uncharge_page(pages[i]); 1102 mem_cgroup_uncharge_page(pages[i]);
1103 put_page(pages[i]); 1103 put_page(pages[i]);
1104 } 1104 }
1105 mem_cgroup_uncharge_end(); 1105 mem_cgroup_uncharge_end();
1106 kfree(pages); 1106 kfree(pages);
1107 goto out; 1107 goto out;
1108 } 1108 }
1109 1109
1110 int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, 1110 int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
1111 unsigned long address, pmd_t *pmd, pmd_t orig_pmd) 1111 unsigned long address, pmd_t *pmd, pmd_t orig_pmd)
1112 { 1112 {
1113 int ret = 0; 1113 int ret = 0;
1114 struct page *page = NULL, *new_page; 1114 struct page *page = NULL, *new_page;
1115 unsigned long haddr; 1115 unsigned long haddr;
1116 unsigned long mmun_start; /* For mmu_notifiers */ 1116 unsigned long mmun_start; /* For mmu_notifiers */
1117 unsigned long mmun_end; /* For mmu_notifiers */ 1117 unsigned long mmun_end; /* For mmu_notifiers */
1118 1118
1119 VM_BUG_ON(!vma->anon_vma); 1119 VM_BUG_ON(!vma->anon_vma);
1120 haddr = address & HPAGE_PMD_MASK; 1120 haddr = address & HPAGE_PMD_MASK;
1121 if (is_huge_zero_pmd(orig_pmd)) 1121 if (is_huge_zero_pmd(orig_pmd))
1122 goto alloc; 1122 goto alloc;
1123 spin_lock(&mm->page_table_lock); 1123 spin_lock(&mm->page_table_lock);
1124 if (unlikely(!pmd_same(*pmd, orig_pmd))) 1124 if (unlikely(!pmd_same(*pmd, orig_pmd)))
1125 goto out_unlock; 1125 goto out_unlock;
1126 1126
1127 page = pmd_page(orig_pmd); 1127 page = pmd_page(orig_pmd);
1128 VM_BUG_ON(!PageCompound(page) || !PageHead(page)); 1128 VM_BUG_ON(!PageCompound(page) || !PageHead(page));
1129 if (page_mapcount(page) == 1) { 1129 if (page_mapcount(page) == 1) {
1130 pmd_t entry; 1130 pmd_t entry;
1131 entry = pmd_mkyoung(orig_pmd); 1131 entry = pmd_mkyoung(orig_pmd);
1132 entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); 1132 entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
1133 if (pmdp_set_access_flags(vma, haddr, pmd, entry, 1)) 1133 if (pmdp_set_access_flags(vma, haddr, pmd, entry, 1))
1134 update_mmu_cache_pmd(vma, address, pmd); 1134 update_mmu_cache_pmd(vma, address, pmd);
1135 ret |= VM_FAULT_WRITE; 1135 ret |= VM_FAULT_WRITE;
1136 goto out_unlock; 1136 goto out_unlock;
1137 } 1137 }
1138 get_page(page); 1138 get_page(page);
1139 spin_unlock(&mm->page_table_lock); 1139 spin_unlock(&mm->page_table_lock);
1140 alloc: 1140 alloc:
1141 if (transparent_hugepage_enabled(vma) && 1141 if (transparent_hugepage_enabled(vma) &&
1142 !transparent_hugepage_debug_cow()) 1142 !transparent_hugepage_debug_cow())
1143 new_page = alloc_hugepage_vma(transparent_hugepage_defrag(vma), 1143 new_page = alloc_hugepage_vma(transparent_hugepage_defrag(vma),
1144 vma, haddr, numa_node_id(), 0); 1144 vma, haddr, numa_node_id(), 0);
1145 else 1145 else
1146 new_page = NULL; 1146 new_page = NULL;
1147 1147
1148 if (unlikely(!new_page)) { 1148 if (unlikely(!new_page)) {
1149 if (!page) { 1149 if (!page) {
1150 ret = do_huge_pmd_wp_zero_page_fallback(mm, vma, 1150 ret = do_huge_pmd_wp_zero_page_fallback(mm, vma,
1151 address, pmd, orig_pmd, haddr); 1151 address, pmd, orig_pmd, haddr);
1152 } else { 1152 } else {
1153 ret = do_huge_pmd_wp_page_fallback(mm, vma, address, 1153 ret = do_huge_pmd_wp_page_fallback(mm, vma, address,
1154 pmd, orig_pmd, page, haddr); 1154 pmd, orig_pmd, page, haddr);
1155 if (ret & VM_FAULT_OOM) { 1155 if (ret & VM_FAULT_OOM) {
1156 split_huge_page(page); 1156 split_huge_page(page);
1157 ret |= VM_FAULT_FALLBACK; 1157 ret |= VM_FAULT_FALLBACK;
1158 } 1158 }
1159 put_page(page); 1159 put_page(page);
1160 } 1160 }
1161 count_vm_event(THP_FAULT_FALLBACK); 1161 count_vm_event(THP_FAULT_FALLBACK);
1162 goto out; 1162 goto out;
1163 } 1163 }
1164 1164
1165 if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) { 1165 if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) {
1166 put_page(new_page); 1166 put_page(new_page);
1167 if (page) { 1167 if (page) {
1168 split_huge_page(page); 1168 split_huge_page(page);
1169 put_page(page); 1169 put_page(page);
1170 } else 1170 } else
1171 split_huge_page_pmd(vma, address, pmd); 1171 split_huge_page_pmd(vma, address, pmd);
1172 ret |= VM_FAULT_FALLBACK; 1172 ret |= VM_FAULT_FALLBACK;
1173 count_vm_event(THP_FAULT_FALLBACK); 1173 count_vm_event(THP_FAULT_FALLBACK);
1174 goto out; 1174 goto out;
1175 } 1175 }
1176 1176
1177 count_vm_event(THP_FAULT_ALLOC); 1177 count_vm_event(THP_FAULT_ALLOC);
1178 1178
1179 if (!page) 1179 if (!page)
1180 clear_huge_page(new_page, haddr, HPAGE_PMD_NR); 1180 clear_huge_page(new_page, haddr, HPAGE_PMD_NR);
1181 else 1181 else
1182 copy_user_huge_page(new_page, page, haddr, vma, HPAGE_PMD_NR); 1182 copy_user_huge_page(new_page, page, haddr, vma, HPAGE_PMD_NR);
1183 __SetPageUptodate(new_page); 1183 __SetPageUptodate(new_page);
1184 1184
1185 mmun_start = haddr; 1185 mmun_start = haddr;
1186 mmun_end = haddr + HPAGE_PMD_SIZE; 1186 mmun_end = haddr + HPAGE_PMD_SIZE;
1187 mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); 1187 mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
1188 1188
1189 spin_lock(&mm->page_table_lock); 1189 spin_lock(&mm->page_table_lock);
1190 if (page) 1190 if (page)
1191 put_page(page); 1191 put_page(page);
1192 if (unlikely(!pmd_same(*pmd, orig_pmd))) { 1192 if (unlikely(!pmd_same(*pmd, orig_pmd))) {
1193 spin_unlock(&mm->page_table_lock); 1193 spin_unlock(&mm->page_table_lock);
1194 mem_cgroup_uncharge_page(new_page); 1194 mem_cgroup_uncharge_page(new_page);
1195 put_page(new_page); 1195 put_page(new_page);
1196 goto out_mn; 1196 goto out_mn;
1197 } else { 1197 } else {
1198 pmd_t entry; 1198 pmd_t entry;
1199 entry = mk_huge_pmd(new_page, vma->vm_page_prot); 1199 entry = mk_huge_pmd(new_page, vma->vm_page_prot);
1200 entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); 1200 entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
1201 pmdp_clear_flush(vma, haddr, pmd); 1201 pmdp_clear_flush(vma, haddr, pmd);
1202 page_add_new_anon_rmap(new_page, vma, haddr); 1202 page_add_new_anon_rmap(new_page, vma, haddr);
1203 set_pmd_at(mm, haddr, pmd, entry); 1203 set_pmd_at(mm, haddr, pmd, entry);
1204 update_mmu_cache_pmd(vma, address, pmd); 1204 update_mmu_cache_pmd(vma, address, pmd);
1205 if (!page) { 1205 if (!page) {
1206 add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR); 1206 add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR);
1207 put_huge_zero_page(); 1207 put_huge_zero_page();
1208 } else { 1208 } else {
1209 VM_BUG_ON(!PageHead(page)); 1209 VM_BUG_ON(!PageHead(page));
1210 page_remove_rmap(page); 1210 page_remove_rmap(page);
1211 put_page(page); 1211 put_page(page);
1212 } 1212 }
1213 ret |= VM_FAULT_WRITE; 1213 ret |= VM_FAULT_WRITE;
1214 } 1214 }
1215 spin_unlock(&mm->page_table_lock); 1215 spin_unlock(&mm->page_table_lock);
1216 out_mn: 1216 out_mn:
1217 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); 1217 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
1218 out: 1218 out:
1219 return ret; 1219 return ret;
1220 out_unlock: 1220 out_unlock:
1221 spin_unlock(&mm->page_table_lock); 1221 spin_unlock(&mm->page_table_lock);
1222 return ret; 1222 return ret;
1223 } 1223 }
1224 1224
1225 struct page *follow_trans_huge_pmd(struct vm_area_struct *vma, 1225 struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
1226 unsigned long addr, 1226 unsigned long addr,
1227 pmd_t *pmd, 1227 pmd_t *pmd,
1228 unsigned int flags) 1228 unsigned int flags)
1229 { 1229 {
1230 struct mm_struct *mm = vma->vm_mm; 1230 struct mm_struct *mm = vma->vm_mm;
1231 struct page *page = NULL; 1231 struct page *page = NULL;
1232 1232
1233 assert_spin_locked(&mm->page_table_lock); 1233 assert_spin_locked(&mm->page_table_lock);
1234 1234
1235 if (flags & FOLL_WRITE && !pmd_write(*pmd)) 1235 if (flags & FOLL_WRITE && !pmd_write(*pmd))
1236 goto out; 1236 goto out;
1237 1237
1238 /* Avoid dumping huge zero page */ 1238 /* Avoid dumping huge zero page */
1239 if ((flags & FOLL_DUMP) && is_huge_zero_pmd(*pmd)) 1239 if ((flags & FOLL_DUMP) && is_huge_zero_pmd(*pmd))
1240 return ERR_PTR(-EFAULT); 1240 return ERR_PTR(-EFAULT);
1241 1241
1242 /* Full NUMA hinting faults to serialise migration in fault paths */ 1242 /* Full NUMA hinting faults to serialise migration in fault paths */
1243 if ((flags & FOLL_NUMA) && pmd_numa(*pmd)) 1243 if ((flags & FOLL_NUMA) && pmd_numa(*pmd))
1244 goto out; 1244 goto out;
1245 1245
1246 page = pmd_page(*pmd); 1246 page = pmd_page(*pmd);
1247 VM_BUG_ON(!PageHead(page)); 1247 VM_BUG_ON(!PageHead(page));
1248 if (flags & FOLL_TOUCH) { 1248 if (flags & FOLL_TOUCH) {
1249 pmd_t _pmd; 1249 pmd_t _pmd;
1250 /* 1250 /*
1251 * We should set the dirty bit only for FOLL_WRITE but 1251 * We should set the dirty bit only for FOLL_WRITE but
1252 * for now the dirty bit in the pmd is meaningless. 1252 * for now the dirty bit in the pmd is meaningless.
1253 * And if the dirty bit will become meaningful and 1253 * And if the dirty bit will become meaningful and
1254 * we'll only set it with FOLL_WRITE, an atomic 1254 * we'll only set it with FOLL_WRITE, an atomic
1255 * set_bit will be required on the pmd to set the 1255 * set_bit will be required on the pmd to set the
1256 * young bit, instead of the current set_pmd_at. 1256 * young bit, instead of the current set_pmd_at.
1257 */ 1257 */
1258 _pmd = pmd_mkyoung(pmd_mkdirty(*pmd)); 1258 _pmd = pmd_mkyoung(pmd_mkdirty(*pmd));
1259 if (pmdp_set_access_flags(vma, addr & HPAGE_PMD_MASK, 1259 if (pmdp_set_access_flags(vma, addr & HPAGE_PMD_MASK,
1260 pmd, _pmd, 1)) 1260 pmd, _pmd, 1))
1261 update_mmu_cache_pmd(vma, addr, pmd); 1261 update_mmu_cache_pmd(vma, addr, pmd);
1262 } 1262 }
1263 if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) { 1263 if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) {
1264 if (page->mapping && trylock_page(page)) { 1264 if (page->mapping && trylock_page(page)) {
1265 lru_add_drain(); 1265 lru_add_drain();
1266 if (page->mapping) 1266 if (page->mapping)
1267 mlock_vma_page(page); 1267 mlock_vma_page(page);
1268 unlock_page(page); 1268 unlock_page(page);
1269 } 1269 }
1270 } 1270 }
1271 page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT; 1271 page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT;
1272 VM_BUG_ON(!PageCompound(page)); 1272 VM_BUG_ON(!PageCompound(page));
1273 if (flags & FOLL_GET) 1273 if (flags & FOLL_GET)
1274 get_page_foll(page); 1274 get_page_foll(page);
1275 1275
1276 out: 1276 out:
1277 return page; 1277 return page;
1278 } 1278 }
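
The pointer arithmetic at line 1271 above selects the 4 KiB subpage of the hugepage that backs the faulting address before it is handed back to the get_user_pages() caller. A standalone sketch of that calculation, illustrative only and not part of mm/huge_memory.c, assuming the common x86-64 values PAGE_SHIFT == 12 and HPAGE_PMD_SHIFT == 21:

/* Illustration only -- not part of this file. */
#include <stdio.h>

#define PAGE_SHIFT      12                      /* 4 KiB base pages */
#define HPAGE_PMD_SHIFT 21                      /* 2 MiB PMD-mapped THPs */
#define HPAGE_PMD_MASK  (~((1UL << HPAGE_PMD_SHIFT) - 1))

int main(void)
{
	unsigned long addr = 0x7f2a40203000UL;  /* address inside a THP */
	unsigned long subpage = (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT;

	/* follow_trans_huge_pmd() returns pmd_page(*pmd) + subpage */
	printf("subpage index = %lu of %lu\n", subpage,
	       1UL << (HPAGE_PMD_SHIFT - PAGE_SHIFT));  /* prints "3 of 512" */
	return 0;
}

With these example values the address falls in the fourth 4 KiB slice of the hugepage, so the struct page returned to the caller is the head page plus three.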
1279 1279
1280 /* NUMA hinting page fault entry point for trans huge pmds */ 1280 /* NUMA hinting page fault entry point for trans huge pmds */
1281 int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, 1281 int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
1282 unsigned long addr, pmd_t pmd, pmd_t *pmdp) 1282 unsigned long addr, pmd_t pmd, pmd_t *pmdp)
1283 { 1283 {
1284 struct anon_vma *anon_vma = NULL; 1284 struct anon_vma *anon_vma = NULL;
1285 struct page *page; 1285 struct page *page;
1286 unsigned long haddr = addr & HPAGE_PMD_MASK; 1286 unsigned long haddr = addr & HPAGE_PMD_MASK;
1287 int page_nid = -1, this_nid = numa_node_id(); 1287 int page_nid = -1, this_nid = numa_node_id();
1288 int target_nid; 1288 int target_nid;
1289 bool page_locked; 1289 bool page_locked;
1290 bool migrated = false; 1290 bool migrated = false;
1291 1291
1292 spin_lock(&mm->page_table_lock); 1292 spin_lock(&mm->page_table_lock);
1293 if (unlikely(!pmd_same(pmd, *pmdp))) 1293 if (unlikely(!pmd_same(pmd, *pmdp)))
1294 goto out_unlock; 1294 goto out_unlock;
1295 1295
1296 /* 1296 /*
1297 * If there are potential migrations, wait for completion and retry 1297 * If there are potential migrations, wait for completion and retry
1298 * without disrupting NUMA hinting information. Do not relock and 1298 * without disrupting NUMA hinting information. Do not relock and
1299 * check_same as the page may no longer be mapped. 1299 * check_same as the page may no longer be mapped.
1300 */ 1300 */
1301 if (unlikely(pmd_trans_migrating(*pmdp))) { 1301 if (unlikely(pmd_trans_migrating(*pmdp))) {
1302 spin_unlock(&mm->page_table_lock); 1302 spin_unlock(&mm->page_table_lock);
1303 wait_migrate_huge_page(vma->anon_vma, pmdp); 1303 wait_migrate_huge_page(vma->anon_vma, pmdp);
1304 goto out; 1304 goto out;
1305 } 1305 }
1306 1306
1307 page = pmd_page(pmd); 1307 page = pmd_page(pmd);
1308 page_nid = page_to_nid(page); 1308 page_nid = page_to_nid(page);
1309 count_vm_numa_event(NUMA_HINT_FAULTS); 1309 count_vm_numa_event(NUMA_HINT_FAULTS);
1310 if (page_nid == this_nid) 1310 if (page_nid == this_nid)
1311 count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL); 1311 count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL);
1312 1312
1313 /* 1313 /*
1314 * Acquire the page lock to serialise THP migrations but avoid dropping 1314 * Acquire the page lock to serialise THP migrations but avoid dropping
1315 * page_table_lock if at all possible 1315 * page_table_lock if at all possible
1316 */ 1316 */
1317 page_locked = trylock_page(page); 1317 page_locked = trylock_page(page);
1318 target_nid = mpol_misplaced(page, vma, haddr); 1318 target_nid = mpol_misplaced(page, vma, haddr);
1319 if (target_nid == -1) { 1319 if (target_nid == -1) {
1320 /* If the page was locked, there are no parallel migrations */ 1320 /* If the page was locked, there are no parallel migrations */
1321 if (page_locked) 1321 if (page_locked)
1322 goto clear_pmdnuma; 1322 goto clear_pmdnuma;
1323 } 1323 }
1324 1324
1325 /* Migration could have started since the pmd_trans_migrating check */ 1325 /* Migration could have started since the pmd_trans_migrating check */
1326 if (!page_locked) { 1326 if (!page_locked) {
1327 spin_unlock(&mm->page_table_lock); 1327 spin_unlock(&mm->page_table_lock);
1328 wait_on_page_locked(page); 1328 wait_on_page_locked(page);
1329 page_nid = -1; 1329 page_nid = -1;
1330 goto out; 1330 goto out;
1331 } 1331 }
1332 1332
1333 /* 1333 /*
1334 * Page is misplaced. Page lock serialises migrations. Acquire anon_vma 1334 * Page is misplaced. Page lock serialises migrations. Acquire anon_vma
1335 * to serialise splits 1335 * to serialise splits
1336 */ 1336 */
1337 get_page(page); 1337 get_page(page);
1338 spin_unlock(&mm->page_table_lock); 1338 spin_unlock(&mm->page_table_lock);
1339 anon_vma = page_lock_anon_vma_read(page); 1339 anon_vma = page_lock_anon_vma_read(page);
1340 1340
1341 /* Confirm the PTE did not change while locked */ 1341 /* Confirm the PTE did not change while locked */
1342 spin_lock(&mm->page_table_lock); 1342 spin_lock(&mm->page_table_lock);
1343 if (unlikely(!pmd_same(pmd, *pmdp))) { 1343 if (unlikely(!pmd_same(pmd, *pmdp))) {
1344 unlock_page(page); 1344 unlock_page(page);
1345 put_page(page); 1345 put_page(page);
1346 page_nid = -1; 1346 page_nid = -1;
1347 goto out_unlock; 1347 goto out_unlock;
1348 } 1348 }
1349 1349
1350 /* Bail if we fail to protect against THP splits for any reason */ 1350 /* Bail if we fail to protect against THP splits for any reason */
1351 if (unlikely(!anon_vma)) { 1351 if (unlikely(!anon_vma)) {
1352 put_page(page); 1352 put_page(page);
1353 page_nid = -1; 1353 page_nid = -1;
1354 goto clear_pmdnuma; 1354 goto clear_pmdnuma;
1355 } 1355 }
1356 1356
1357 /* 1357 /*
1358 * Migrate the THP to the requested node, returns with page unlocked 1358 * Migrate the THP to the requested node, returns with page unlocked
1359 * and pmd_numa cleared. 1359 * and pmd_numa cleared.
1360 */ 1360 */
1361 spin_unlock(&mm->page_table_lock); 1361 spin_unlock(&mm->page_table_lock);
1362 migrated = migrate_misplaced_transhuge_page(mm, vma, 1362 migrated = migrate_misplaced_transhuge_page(mm, vma,
1363 pmdp, pmd, addr, page, target_nid); 1363 pmdp, pmd, addr, page, target_nid);
1364 if (migrated) 1364 if (migrated)
1365 page_nid = target_nid; 1365 page_nid = target_nid;
1366 1366
1367 goto out; 1367 goto out;
1368 clear_pmdnuma: 1368 clear_pmdnuma:
1369 BUG_ON(!PageLocked(page)); 1369 BUG_ON(!PageLocked(page));
1370 pmd = pmd_mknonnuma(pmd); 1370 pmd = pmd_mknonnuma(pmd);
1371 set_pmd_at(mm, haddr, pmdp, pmd); 1371 set_pmd_at(mm, haddr, pmdp, pmd);
1372 VM_BUG_ON(pmd_numa(*pmdp)); 1372 VM_BUG_ON(pmd_numa(*pmdp));
1373 update_mmu_cache_pmd(vma, addr, pmdp); 1373 update_mmu_cache_pmd(vma, addr, pmdp);
1374 unlock_page(page); 1374 unlock_page(page);
1375 out_unlock: 1375 out_unlock:
1376 spin_unlock(&mm->page_table_lock); 1376 spin_unlock(&mm->page_table_lock);
1377 1377
1378 out: 1378 out:
1379 if (anon_vma) 1379 if (anon_vma)
1380 page_unlock_anon_vma_read(anon_vma); 1380 page_unlock_anon_vma_read(anon_vma);
1381 1381
1382 if (page_nid != -1) 1382 if (page_nid != -1)
1383 task_numa_fault(page_nid, HPAGE_PMD_NR, migrated); 1383 task_numa_fault(page_nid, HPAGE_PMD_NR, migrated);
1384 1384
1385 return 0; 1385 return 0;
1386 } 1386 }
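
Note that task_numa_fault() above is credited with HPAGE_PMD_NR base pages at once, so a single hinting fault on a THP feeds the NUMA balancer as much locality information as 512 small-page faults would on a stock x86-64 build. The constant is pure page-size arithmetic; a minimal sketch mirroring the usual huge-page header definitions (assumed x86-64 values, shown for illustration only):

/* Illustration only: where HPAGE_PMD_NR comes from on x86-64. */
#define PAGE_SHIFT      12                              /* 4 KiB base pages */
#define HPAGE_PMD_SHIFT 21                              /* 2 MiB PMD-mapped THPs */
#define HPAGE_PMD_ORDER (HPAGE_PMD_SHIFT - PAGE_SHIFT)  /* 9 */
#define HPAGE_PMD_NR    (1UL << HPAGE_PMD_ORDER)        /* 512 */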
1387 1387
1388 int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, 1388 int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
1389 pmd_t *pmd, unsigned long addr) 1389 pmd_t *pmd, unsigned long addr)
1390 { 1390 {
1391 int ret = 0; 1391 int ret = 0;
1392 1392
1393 if (__pmd_trans_huge_lock(pmd, vma) == 1) { 1393 if (__pmd_trans_huge_lock(pmd, vma) == 1) {
1394 struct page *page; 1394 struct page *page;
1395 pgtable_t pgtable; 1395 pgtable_t pgtable;
1396 pmd_t orig_pmd; 1396 pmd_t orig_pmd;
1397 /* 1397 /*
1398 * For architectures like ppc64 we look at deposited pgtable 1398 * For architectures like ppc64 we look at deposited pgtable
1399 * when calling pmdp_get_and_clear. So do the 1399 * when calling pmdp_get_and_clear. So do the
1400 * pgtable_trans_huge_withdraw after finishing pmdp related 1400 * pgtable_trans_huge_withdraw after finishing pmdp related
1401 * operations. 1401 * operations.
1402 */ 1402 */
1403 orig_pmd = pmdp_get_and_clear(tlb->mm, addr, pmd); 1403 orig_pmd = pmdp_get_and_clear(tlb->mm, addr, pmd);
1404 tlb_remove_pmd_tlb_entry(tlb, pmd, addr); 1404 tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
1405 pgtable = pgtable_trans_huge_withdraw(tlb->mm, pmd); 1405 pgtable = pgtable_trans_huge_withdraw(tlb->mm, pmd);
1406 if (is_huge_zero_pmd(orig_pmd)) { 1406 if (is_huge_zero_pmd(orig_pmd)) {
1407 tlb->mm->nr_ptes--; 1407 tlb->mm->nr_ptes--;
1408 spin_unlock(&tlb->mm->page_table_lock); 1408 spin_unlock(&tlb->mm->page_table_lock);
1409 put_huge_zero_page(); 1409 put_huge_zero_page();
1410 } else { 1410 } else {
1411 page = pmd_page(orig_pmd); 1411 page = pmd_page(orig_pmd);
1412 page_remove_rmap(page); 1412 page_remove_rmap(page);
1413 VM_BUG_ON(page_mapcount(page) < 0); 1413 VM_BUG_ON(page_mapcount(page) < 0);
1414 add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR); 1414 add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR);
1415 VM_BUG_ON(!PageHead(page)); 1415 VM_BUG_ON(!PageHead(page));
1416 tlb->mm->nr_ptes--; 1416 tlb->mm->nr_ptes--;
1417 spin_unlock(&tlb->mm->page_table_lock); 1417 spin_unlock(&tlb->mm->page_table_lock);
1418 tlb_remove_page(tlb, page); 1418 tlb_remove_page(tlb, page);
1419 } 1419 }
1420 pte_free(tlb->mm, pgtable); 1420 pte_free(tlb->mm, pgtable);
1421 ret = 1; 1421 ret = 1;
1422 } 1422 }
1423 return ret; 1423 return ret;
1424 } 1424 }
1425 1425
1426 int mincore_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, 1426 int mincore_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
1427 unsigned long addr, unsigned long end, 1427 unsigned long addr, unsigned long end,
1428 unsigned char *vec) 1428 unsigned char *vec)
1429 { 1429 {
1430 int ret = 0; 1430 int ret = 0;
1431 1431
1432 if (__pmd_trans_huge_lock(pmd, vma) == 1) { 1432 if (__pmd_trans_huge_lock(pmd, vma) == 1) {
1433 /* 1433 /*
1434 * All logical pages in the range are present 1434 * All logical pages in the range are present
1435 * if backed by a huge page. 1435 * if backed by a huge page.
1436 */ 1436 */
1437 spin_unlock(&vma->vm_mm->page_table_lock); 1437 spin_unlock(&vma->vm_mm->page_table_lock);
1438 memset(vec, 1, (end - addr) >> PAGE_SHIFT); 1438 memset(vec, 1, (end - addr) >> PAGE_SHIFT);
1439 ret = 1; 1439 ret = 1;
1440 } 1440 }
1441 1441
1442 return ret; 1442 return ret;
1443 } 1443 }
1444 1444
1445 int move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma, 1445 int move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma,
1446 unsigned long old_addr, 1446 unsigned long old_addr,
1447 unsigned long new_addr, unsigned long old_end, 1447 unsigned long new_addr, unsigned long old_end,
1448 pmd_t *old_pmd, pmd_t *new_pmd) 1448 pmd_t *old_pmd, pmd_t *new_pmd)
1449 { 1449 {
1450 int ret = 0; 1450 int ret = 0;
1451 pmd_t pmd; 1451 pmd_t pmd;
1452 1452
1453 struct mm_struct *mm = vma->vm_mm; 1453 struct mm_struct *mm = vma->vm_mm;
1454 1454
1455 if ((old_addr & ~HPAGE_PMD_MASK) || 1455 if ((old_addr & ~HPAGE_PMD_MASK) ||
1456 (new_addr & ~HPAGE_PMD_MASK) || 1456 (new_addr & ~HPAGE_PMD_MASK) ||
1457 old_end - old_addr < HPAGE_PMD_SIZE || 1457 old_end - old_addr < HPAGE_PMD_SIZE ||
1458 (new_vma->vm_flags & VM_NOHUGEPAGE)) 1458 (new_vma->vm_flags & VM_NOHUGEPAGE))
1459 goto out; 1459 goto out;
1460 1460
1461 /* 1461 /*
1462 * The destination pmd shouldn't be established, free_pgtables() 1462 * The destination pmd shouldn't be established, free_pgtables()
1463 * should have released it. 1463 * should have released it.
1464 */ 1464 */
1465 if (WARN_ON(!pmd_none(*new_pmd))) { 1465 if (WARN_ON(!pmd_none(*new_pmd))) {
1466 VM_BUG_ON(pmd_trans_huge(*new_pmd)); 1466 VM_BUG_ON(pmd_trans_huge(*new_pmd));
1467 goto out; 1467 goto out;
1468 } 1468 }
1469 1469
1470 ret = __pmd_trans_huge_lock(old_pmd, vma); 1470 ret = __pmd_trans_huge_lock(old_pmd, vma);
1471 if (ret == 1) { 1471 if (ret == 1) {
1472 pmd = pmdp_get_and_clear(mm, old_addr, old_pmd); 1472 pmd = pmdp_get_and_clear(mm, old_addr, old_pmd);
1473 VM_BUG_ON(!pmd_none(*new_pmd)); 1473 VM_BUG_ON(!pmd_none(*new_pmd));
1474 set_pmd_at(mm, new_addr, new_pmd, pmd_mksoft_dirty(pmd)); 1474 set_pmd_at(mm, new_addr, new_pmd, pmd_mksoft_dirty(pmd));
1475 spin_unlock(&mm->page_table_lock); 1475 spin_unlock(&mm->page_table_lock);
1476 } 1476 }
1477 out: 1477 out:
1478 return ret; 1478 return ret;
1479 } 1479 }
1480 1480
1481 int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, 1481 int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
1482 unsigned long addr, pgprot_t newprot, int prot_numa) 1482 unsigned long addr, pgprot_t newprot, int prot_numa)
1483 { 1483 {
1484 struct mm_struct *mm = vma->vm_mm; 1484 struct mm_struct *mm = vma->vm_mm;
1485 int ret = 0; 1485 int ret = 0;
1486 1486
1487 if (__pmd_trans_huge_lock(pmd, vma) == 1) { 1487 if (__pmd_trans_huge_lock(pmd, vma) == 1) {
1488 pmd_t entry; 1488 pmd_t entry;
1489 if (!prot_numa) { 1489 if (!prot_numa) {
1490 entry = pmdp_get_and_clear(mm, addr, pmd); 1490 entry = pmdp_get_and_clear(mm, addr, pmd);
1491 if (pmd_numa(entry)) 1491 if (pmd_numa(entry))
1492 entry = pmd_mknonnuma(entry); 1492 entry = pmd_mknonnuma(entry);
1493 entry = pmd_modify(entry, newprot); 1493 entry = pmd_modify(entry, newprot);
1494 BUG_ON(pmd_write(entry)); 1494 BUG_ON(pmd_write(entry));
1495 set_pmd_at(mm, addr, pmd, entry); 1495 set_pmd_at(mm, addr, pmd, entry);
1496 } else { 1496 } else {
1497 struct page *page = pmd_page(*pmd); 1497 struct page *page = pmd_page(*pmd);
1498 entry = *pmd; 1498 entry = *pmd;
1499 1499
1500 /* only check non-shared pages */ 1500 /* only check non-shared pages */
1501 if (page_mapcount(page) == 1 && 1501 if (page_mapcount(page) == 1 &&
1502 !pmd_numa(*pmd)) { 1502 !pmd_numa(*pmd)) {
1503 entry = pmd_mknuma(entry); 1503 entry = pmd_mknuma(entry);
1504 set_pmd_at(mm, addr, pmd, entry); 1504 set_pmd_at(mm, addr, pmd, entry);
1505 } 1505 }
1506 } 1506 }
1507 spin_unlock(&vma->vm_mm->page_table_lock); 1507 spin_unlock(&vma->vm_mm->page_table_lock);
1508 ret = 1; 1508 ret = 1;
1509 } 1509 }
1510 1510
1511 return ret; 1511 return ret;
1512 } 1512 }
1513 1513
1514 /* 1514 /*
1515 * Returns 1 if a given pmd maps a stable (not under splitting) thp. 1515 * Returns 1 if a given pmd maps a stable (not under splitting) thp.
1516 * Returns -1 if it maps a thp under splitting. Returns 0 otherwise. 1516 * Returns -1 if it maps a thp under splitting. Returns 0 otherwise.
1517 * 1517 *
1518 * Note that if it returns 1, this routine returns without unlocking page 1518 * Note that if it returns 1, this routine returns without unlocking page
1519 * table locks. So callers must unlock them. 1519 * table locks. So callers must unlock them.
1520 */ 1520 */
1521 int __pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma) 1521 int __pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma)
1522 { 1522 {
1523 spin_lock(&vma->vm_mm->page_table_lock); 1523 spin_lock(&vma->vm_mm->page_table_lock);
1524 if (likely(pmd_trans_huge(*pmd))) { 1524 if (likely(pmd_trans_huge(*pmd))) {
1525 if (unlikely(pmd_trans_splitting(*pmd))) { 1525 if (unlikely(pmd_trans_splitting(*pmd))) {
1526 spin_unlock(&vma->vm_mm->page_table_lock); 1526 spin_unlock(&vma->vm_mm->page_table_lock);
1527 wait_split_huge_page(vma->anon_vma, pmd); 1527 wait_split_huge_page(vma->anon_vma, pmd);
1528 return -1; 1528 return -1;
1529 } else { 1529 } else {
1530 /* Thp mapped by 'pmd' is stable, so we can 1530 /* Thp mapped by 'pmd' is stable, so we can
1531 * handle it as it is. */ 1531 * handle it as it is. */
1532 return 1; 1532 return 1;
1533 } 1533 }
1534 } 1534 }
1535 spin_unlock(&vma->vm_mm->page_table_lock); 1535 spin_unlock(&vma->vm_mm->page_table_lock);
1536 return 0; 1536 return 0;
1537 } 1537 }
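
The 1 / -1 / 0 convention documented above is what every caller in this file (zap_huge_pmd(), mincore_huge_pmd(), move_huge_pmd() and change_huge_pmd()) relies on: only a return of 1 leaves page_table_lock held. A minimal caller sketch of that pattern, illustrative pseudo-kernel code rather than part of this diff, where do_huge_work() is a hypothetical placeholder:

/* Sketch of the __pmd_trans_huge_lock() calling convention.
 * Illustration only; do_huge_work() is a hypothetical placeholder. */
static int example_huge_pmd_op(struct vm_area_struct *vma, pmd_t *pmd)
{
	int ret = 0;

	if (__pmd_trans_huge_lock(pmd, vma) == 1) {
		/* 1: a stable THP is mapped and page_table_lock is held */
		do_huge_work(vma, pmd);
		spin_unlock(&vma->vm_mm->page_table_lock);
		ret = 1;
	}
	/*
	 * -1: the THP was splitting; the helper waited for the split and
	 *     dropped the lock itself.
	 *  0: no THP mapped here; the lock has been dropped as well.
	 * In both cases the caller falls back to the regular pte path.
	 */
	return ret;
}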
1538 1538
1539 pmd_t *page_check_address_pmd(struct page *page, 1539 pmd_t *page_check_address_pmd(struct page *page,
1540 struct mm_struct *mm, 1540 struct mm_struct *mm,
1541 unsigned long address, 1541 unsigned long address,
1542 enum page_check_address_pmd_flag flag) 1542 enum page_check_address_pmd_flag flag)
1543 { 1543 {
1544 pmd_t *pmd, *ret = NULL; 1544 pmd_t *pmd, *ret = NULL;
1545 1545
1546 if (address & ~HPAGE_PMD_MASK) 1546 if (address & ~HPAGE_PMD_MASK)
1547 goto out; 1547 goto out;
1548 1548
1549 pmd = mm_find_pmd(mm, address); 1549 pmd = mm_find_pmd(mm, address);
1550 if (!pmd) 1550 if (!pmd)
1551 goto out; 1551 goto out;
1552 if (pmd_none(*pmd)) 1552 if (pmd_none(*pmd))
1553 goto out; 1553 goto out;
1554 if (pmd_page(*pmd) != page) 1554 if (pmd_page(*pmd) != page)
1555 goto out; 1555 goto out;
1556 /* 1556 /*
1557 * split_vma() may create temporary aliased mappings. There is 1557 * split_vma() may create temporary aliased mappings. There is
1558 * no risk as long as all huge pmd are found and have their 1558 * no risk as long as all huge pmd are found and have their
1559 * splitting bit set before __split_huge_page_refcount 1559 * splitting bit set before __split_huge_page_refcount
1560 * runs. Finding the same huge pmd more than once during the 1560 * runs. Finding the same huge pmd more than once during the
1561 * same rmap walk is not a problem. 1561 * same rmap walk is not a problem.
1562 */ 1562 */
1563 if (flag == PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG && 1563 if (flag == PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG &&
1564 pmd_trans_splitting(*pmd)) 1564 pmd_trans_splitting(*pmd))
1565 goto out; 1565 goto out;
1566 if (pmd_trans_huge(*pmd)) { 1566 if (pmd_trans_huge(*pmd)) {
1567 VM_BUG_ON(flag == PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG && 1567 VM_BUG_ON(flag == PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG &&
1568 !pmd_trans_splitting(*pmd)); 1568 !pmd_trans_splitting(*pmd));
1569 ret = pmd; 1569 ret = pmd;
1570 } 1570 }
1571 out: 1571 out:
1572 return ret; 1572 return ret;
1573 } 1573 }
1574 1574
1575 static int __split_huge_page_splitting(struct page *page, 1575 static int __split_huge_page_splitting(struct page *page,
1576 struct vm_area_struct *vma, 1576 struct vm_area_struct *vma,
1577 unsigned long address) 1577 unsigned long address)
1578 { 1578 {
1579 struct mm_struct *mm = vma->vm_mm; 1579 struct mm_struct *mm = vma->vm_mm;
1580 pmd_t *pmd; 1580 pmd_t *pmd;
1581 int ret = 0; 1581 int ret = 0;
1582 /* For mmu_notifiers */ 1582 /* For mmu_notifiers */
1583 const unsigned long mmun_start = address; 1583 const unsigned long mmun_start = address;
1584 const unsigned long mmun_end = address + HPAGE_PMD_SIZE; 1584 const unsigned long mmun_end = address + HPAGE_PMD_SIZE;
1585 1585
1586 mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); 1586 mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
1587 spin_lock(&mm->page_table_lock); 1587 spin_lock(&mm->page_table_lock);
1588 pmd = page_check_address_pmd(page, mm, address, 1588 pmd = page_check_address_pmd(page, mm, address,
1589 PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG); 1589 PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG);
1590 if (pmd) { 1590 if (pmd) {
1591 /* 1591 /*
1592 * We can't temporarily set the pmd to null in order 1592 * We can't temporarily set the pmd to null in order
1593 * to split it, the pmd must remain marked huge at all 1593 * to split it, the pmd must remain marked huge at all
1594 * times or the VM won't take the pmd_trans_huge paths 1594 * times or the VM won't take the pmd_trans_huge paths
1595 * and it won't wait on the anon_vma->root->rwsem to 1595 * and it won't wait on the anon_vma->root->rwsem to
1596 * serialize against split_huge_page*. 1596 * serialize against split_huge_page*.
1597 */ 1597 */
1598 pmdp_splitting_flush(vma, address, pmd); 1598 pmdp_splitting_flush(vma, address, pmd);
1599 ret = 1; 1599 ret = 1;
1600 } 1600 }
1601 spin_unlock(&mm->page_table_lock); 1601 spin_unlock(&mm->page_table_lock);
1602 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); 1602 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
1603 1603
1604 return ret; 1604 return ret;
1605 } 1605 }
1606 1606
1607 static void __split_huge_page_refcount(struct page *page, 1607 static void __split_huge_page_refcount(struct page *page,
1608 struct list_head *list) 1608 struct list_head *list)
1609 { 1609 {
1610 int i; 1610 int i;
1611 struct zone *zone = page_zone(page); 1611 struct zone *zone = page_zone(page);
1612 struct lruvec *lruvec; 1612 struct lruvec *lruvec;
1613 int tail_count = 0; 1613 int tail_count = 0;
1614 1614
1615 /* prevent PageLRU from going away from under us, and freeze lru stats */ 1615 /* prevent PageLRU from going away from under us, and freeze lru stats */
1616 spin_lock_irq(&zone->lru_lock); 1616 spin_lock_irq(&zone->lru_lock);
1617 lruvec = mem_cgroup_page_lruvec(page, zone); 1617 lruvec = mem_cgroup_page_lruvec(page, zone);
1618 1618
1619 compound_lock(page); 1619 compound_lock(page);
1620 /* complete memcg works before add pages to LRU */ 1620 /* complete memcg works before add pages to LRU */
1621 mem_cgroup_split_huge_fixup(page); 1621 mem_cgroup_split_huge_fixup(page);
1622 1622
1623 for (i = HPAGE_PMD_NR - 1; i >= 1; i--) { 1623 for (i = HPAGE_PMD_NR - 1; i >= 1; i--) {
1624 struct page *page_tail = page + i; 1624 struct page *page_tail = page + i;
1625 1625
1626 /* tail_page->_mapcount cannot change */ 1626 /* tail_page->_mapcount cannot change */
1627 BUG_ON(page_mapcount(page_tail) < 0); 1627 BUG_ON(page_mapcount(page_tail) < 0);
1628 tail_count += page_mapcount(page_tail); 1628 tail_count += page_mapcount(page_tail);
1629 /* check for overflow */ 1629 /* check for overflow */
1630 BUG_ON(tail_count < 0); 1630 BUG_ON(tail_count < 0);
1631 BUG_ON(atomic_read(&page_tail->_count) != 0); 1631 BUG_ON(atomic_read(&page_tail->_count) != 0);
1632 /* 1632 /*
1633 * tail_page->_count is zero and not changing from 1633 * tail_page->_count is zero and not changing from
1634 * under us. But get_page_unless_zero() may be running 1634 * under us. But get_page_unless_zero() may be running
1635 * from under us on the tail_page. If we used 1635 * from under us on the tail_page. If we used
1636 * atomic_set() below instead of atomic_add(), we 1636 * atomic_set() below instead of atomic_add(), we
1637 * would then run atomic_set() concurrently with 1637 * would then run atomic_set() concurrently with
1638 * get_page_unless_zero(), and atomic_set() is 1638 * get_page_unless_zero(), and atomic_set() is
1639 * implemented in C not using locked ops. spin_unlock 1639 * implemented in C not using locked ops. spin_unlock
1640 * on x86 sometimes uses locked ops because of PPro 1640 * on x86 sometimes uses locked ops because of PPro
1641 * errata 66, 92, so unless somebody can guarantee 1641 * errata 66, 92, so unless somebody can guarantee
1642 * atomic_set() here would be safe on all archs (and 1642 * atomic_set() here would be safe on all archs (and
1643 * not only on x86), it's safer to use atomic_add(). 1643 * not only on x86), it's safer to use atomic_add().
1644 */ 1644 */
1645 atomic_add(page_mapcount(page) + page_mapcount(page_tail) + 1, 1645 atomic_add(page_mapcount(page) + page_mapcount(page_tail) + 1,
1646 &page_tail->_count); 1646 &page_tail->_count);
1647 1647
1648 /* after clearing PageTail the gup refcount can be released */ 1648 /* after clearing PageTail the gup refcount can be released */
1649 smp_mb(); 1649 smp_mb();
1650 1650
1651 /* 1651 /*
1652 * retain hwpoison flag of the poisoned tail page: 1652 * retain hwpoison flag of the poisoned tail page:
1653 * fix for the unsuitable process killed on Guest Machine(KVM) 1653 * fix for the unsuitable process killed on Guest Machine(KVM)
1654 * by the memory-failure. 1654 * by the memory-failure.
1655 */ 1655 */
1656 page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP | __PG_HWPOISON; 1656 page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP | __PG_HWPOISON;
1657 page_tail->flags |= (page->flags & 1657 page_tail->flags |= (page->flags &
1658 ((1L << PG_referenced) | 1658 ((1L << PG_referenced) |
1659 (1L << PG_swapbacked) | 1659 (1L << PG_swapbacked) |
1660 (1L << PG_mlocked) | 1660 (1L << PG_mlocked) |
1661 (1L << PG_uptodate) | 1661 (1L << PG_uptodate) |
1662 (1L << PG_active) | 1662 (1L << PG_active) |
1663 (1L << PG_unevictable))); 1663 (1L << PG_unevictable)));
1664 page_tail->flags |= (1L << PG_dirty); 1664 page_tail->flags |= (1L << PG_dirty);
1665 1665
1666 /* clear PageTail before overwriting first_page */ 1666 /* clear PageTail before overwriting first_page */
1667 smp_wmb(); 1667 smp_wmb();
1668 1668
1669 /* 1669 /*
1670 * __split_huge_page_splitting() already set the 1670 * __split_huge_page_splitting() already set the
1671 * splitting bit in all pmd that could map this 1671 * splitting bit in all pmd that could map this
1672 * hugepage, that will ensure no CPU can alter the 1672 * hugepage, that will ensure no CPU can alter the
1673 * mapcount on the head page. The mapcount is only 1673 * mapcount on the head page. The mapcount is only
1674 * accounted in the head page and it has to be 1674 * accounted in the head page and it has to be
1675 * transferred to all tail pages in the below code. So 1675 * transferred to all tail pages in the below code. So
1676 * for this code to be safe, during the split the mapcount 1676 * for this code to be safe, during the split the mapcount
1677 * can't change. But that doesn't mean userland can't 1677 * can't change. But that doesn't mean userland can't
1678 * keep changing and reading the page contents while 1678 * keep changing and reading the page contents while
1679 * we transfer the mapcount, so the pmd splitting 1679 * we transfer the mapcount, so the pmd splitting
1680 * status is achieved setting a reserved bit in the 1680 * status is achieved setting a reserved bit in the
1681 * pmd, not by clearing the present bit. 1681 * pmd, not by clearing the present bit.
1682 */ 1682 */
1683 page_tail->_mapcount = page->_mapcount; 1683 page_tail->_mapcount = page->_mapcount;
1684 1684
1685 BUG_ON(page_tail->mapping); 1685 BUG_ON(page_tail->mapping);
1686 page_tail->mapping = page->mapping; 1686 page_tail->mapping = page->mapping;
1687 1687
1688 page_tail->index = page->index + i; 1688 page_tail->index = page->index + i;
1689 page_nid_xchg_last(page_tail, page_nid_last(page)); 1689 page_nid_xchg_last(page_tail, page_nid_last(page));
1690 1690
1691 BUG_ON(!PageAnon(page_tail)); 1691 BUG_ON(!PageAnon(page_tail));
1692 BUG_ON(!PageUptodate(page_tail)); 1692 BUG_ON(!PageUptodate(page_tail));
1693 BUG_ON(!PageDirty(page_tail)); 1693 BUG_ON(!PageDirty(page_tail));
1694 BUG_ON(!PageSwapBacked(page_tail)); 1694 BUG_ON(!PageSwapBacked(page_tail));
1695 1695
1696 lru_add_page_tail(page, page_tail, lruvec, list); 1696 lru_add_page_tail(page, page_tail, lruvec, list);
1697 } 1697 }
1698 atomic_sub(tail_count, &page->_count); 1698 atomic_sub(tail_count, &page->_count);
1699 BUG_ON(atomic_read(&page->_count) <= 0); 1699 BUG_ON(atomic_read(&page->_count) <= 0);
1700 1700
1701 __mod_zone_page_state(zone, NR_ANON_TRANSPARENT_HUGEPAGES, -1); 1701 __mod_zone_page_state(zone, NR_ANON_TRANSPARENT_HUGEPAGES, -1);
1702 1702
1703 ClearPageCompound(page); 1703 ClearPageCompound(page);
1704 compound_unlock(page); 1704 compound_unlock(page);
1705 spin_unlock_irq(&zone->lru_lock); 1705 spin_unlock_irq(&zone->lru_lock);
1706 1706
1707 for (i = 1; i < HPAGE_PMD_NR; i++) { 1707 for (i = 1; i < HPAGE_PMD_NR; i++) {
1708 struct page *page_tail = page + i; 1708 struct page *page_tail = page + i;
1709 BUG_ON(page_count(page_tail) <= 0); 1709 BUG_ON(page_count(page_tail) <= 0);
1710 /* 1710 /*
1711 * Tail pages may be freed if there wasn't any mapping 1711 * Tail pages may be freed if there wasn't any mapping
1712 * like if add_to_swap() is running on a lru page that 1712 * like if add_to_swap() is running on a lru page that
1713 * had its mapping zapped. And freeing these pages 1713 * had its mapping zapped. And freeing these pages
1714 * requires taking the lru_lock so we do the put_page 1714 * requires taking the lru_lock so we do the put_page
1715 * of the tail pages after the split is complete. 1715 * of the tail pages after the split is complete.
1716 */ 1716 */
1717 put_page(page_tail); 1717 put_page(page_tail);
1718 } 1718 }
1719 1719
1720 /* 1720 /*
1721 * Only the head page (now a regular page) is required 1721 * Only the head page (now a regular page) is required
1722 * to be pinned by the caller. 1722 * to be pinned by the caller.
1723 */ 1723 */
1724 BUG_ON(page_count(page) <= 0); 1724 BUG_ON(page_count(page) <= 0);
1725 } 1725 }
1726 1726
1727 static int __split_huge_page_map(struct page *page, 1727 static int __split_huge_page_map(struct page *page,
1728 struct vm_area_struct *vma, 1728 struct vm_area_struct *vma,
1729 unsigned long address) 1729 unsigned long address)
1730 { 1730 {
1731 struct mm_struct *mm = vma->vm_mm; 1731 struct mm_struct *mm = vma->vm_mm;
1732 pmd_t *pmd, _pmd; 1732 pmd_t *pmd, _pmd;
1733 int ret = 0, i; 1733 int ret = 0, i;
1734 pgtable_t pgtable; 1734 pgtable_t pgtable;
1735 unsigned long haddr; 1735 unsigned long haddr;
1736 1736
1737 spin_lock(&mm->page_table_lock); 1737 spin_lock(&mm->page_table_lock);
1738 pmd = page_check_address_pmd(page, mm, address, 1738 pmd = page_check_address_pmd(page, mm, address,
1739 PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG); 1739 PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG);
1740 if (pmd) { 1740 if (pmd) {
1741 pgtable = pgtable_trans_huge_withdraw(mm, pmd); 1741 pgtable = pgtable_trans_huge_withdraw(mm, pmd);
1742 pmd_populate(mm, &_pmd, pgtable); 1742 pmd_populate(mm, &_pmd, pgtable);
1743 1743
1744 haddr = address; 1744 haddr = address;
1745 for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) { 1745 for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
1746 pte_t *pte, entry; 1746 pte_t *pte, entry;
1747 BUG_ON(PageCompound(page+i)); 1747 BUG_ON(PageCompound(page+i));
1748 entry = mk_pte(page + i, vma->vm_page_prot); 1748 entry = mk_pte(page + i, vma->vm_page_prot);
1749 entry = maybe_mkwrite(pte_mkdirty(entry), vma); 1749 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
1750 if (!pmd_write(*pmd)) 1750 if (!pmd_write(*pmd))
1751 entry = pte_wrprotect(entry); 1751 entry = pte_wrprotect(entry);
1752 else 1752 else
1753 BUG_ON(page_mapcount(page) != 1); 1753 BUG_ON(page_mapcount(page) != 1);
1754 if (!pmd_young(*pmd)) 1754 if (!pmd_young(*pmd))
1755 entry = pte_mkold(entry); 1755 entry = pte_mkold(entry);
1756 if (pmd_numa(*pmd)) 1756 if (pmd_numa(*pmd))
1757 entry = pte_mknuma(entry); 1757 entry = pte_mknuma(entry);
1758 pte = pte_offset_map(&_pmd, haddr); 1758 pte = pte_offset_map(&_pmd, haddr);
1759 BUG_ON(!pte_none(*pte)); 1759 BUG_ON(!pte_none(*pte));
1760 set_pte_at(mm, haddr, pte, entry); 1760 set_pte_at(mm, haddr, pte, entry);
1761 pte_unmap(pte); 1761 pte_unmap(pte);
1762 } 1762 }
1763 1763
1764 smp_wmb(); /* make pte visible before pmd */ 1764 smp_wmb(); /* make pte visible before pmd */
1765 /* 1765 /*
1766 * Up to this point the pmd is present and huge and 1766 * Up to this point the pmd is present and huge and
1767 * userland has the whole access to the hugepage 1767 * userland has the whole access to the hugepage
1768 * during the split (which happens in place). If we 1768 * during the split (which happens in place). If we
1769 * overwrite the pmd with the not-huge version 1769 * overwrite the pmd with the not-huge version
1770 * pointing to the pte here (which of course we could 1770 * pointing to the pte here (which of course we could
1771 * if all CPUs were bug free), userland could trigger 1771 * if all CPUs were bug free), userland could trigger
1772 * a small page size TLB miss on the small sized TLB 1772 * a small page size TLB miss on the small sized TLB
1773 * while the hugepage TLB entry is still established 1773 * while the hugepage TLB entry is still established
1774 * in the huge TLB. Some CPU doesn't like that. See 1774 * in the huge TLB. Some CPU doesn't like that. See
1775 * http://support.amd.com/us/Processor_TechDocs/41322.pdf, 1775 * http://support.amd.com/us/Processor_TechDocs/41322.pdf,
1776 * Erratum 383 on page 93. Intel should be safe but 1776 * Erratum 383 on page 93. Intel should be safe but
1777 * also warns that it's only safe if the permission 1777 * also warns that it's only safe if the permission
1778 * and cache attributes of the two entries loaded in 1778 * and cache attributes of the two entries loaded in
1779 * the two TLBs are identical (which should be the case 1779 * the two TLBs are identical (which should be the case
1780 * here). But it is generally safer to never allow 1780 * here). But it is generally safer to never allow
1781 * small and huge TLB entries for the same virtual 1781 * small and huge TLB entries for the same virtual
1782 * address to be loaded simultaneously. So instead of 1782 * address to be loaded simultaneously. So instead of
1783 * doing "pmd_populate(); flush_tlb_range();" we first 1783 * doing "pmd_populate(); flush_tlb_range();" we first
1784 * mark the current pmd notpresent (atomically because 1784 * mark the current pmd notpresent (atomically because
1785 * here the pmd_trans_huge and pmd_trans_splitting 1785 * here the pmd_trans_huge and pmd_trans_splitting
1786 * must remain set at all times on the pmd until the 1786 * must remain set at all times on the pmd until the
1787 * split is complete for this pmd), then we flush the 1787 * split is complete for this pmd), then we flush the
1788 * SMP TLB and finally we write the non-huge version 1788 * SMP TLB and finally we write the non-huge version
1789 * of the pmd entry with pmd_populate. 1789 * of the pmd entry with pmd_populate.
1790 */ 1790 */
1791 pmdp_invalidate(vma, address, pmd); 1791 pmdp_invalidate(vma, address, pmd);
1792 pmd_populate(mm, pmd, pgtable); 1792 pmd_populate(mm, pmd, pgtable);
1793 ret = 1; 1793 ret = 1;
1794 } 1794 }
1795 spin_unlock(&mm->page_table_lock); 1795 spin_unlock(&mm->page_table_lock);
1796 1796
1797 return ret; 1797 return ret;
1798 } 1798 }
1799 1799
1800 /* must be called with anon_vma->root->rwsem held */ 1800 /* must be called with anon_vma->root->rwsem held */
1801 static void __split_huge_page(struct page *page, 1801 static void __split_huge_page(struct page *page,
1802 struct anon_vma *anon_vma, 1802 struct anon_vma *anon_vma,
1803 struct list_head *list) 1803 struct list_head *list)
1804 { 1804 {
1805 int mapcount, mapcount2; 1805 int mapcount, mapcount2;
1806 pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); 1806 pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
1807 struct anon_vma_chain *avc; 1807 struct anon_vma_chain *avc;
1808 1808
1809 BUG_ON(!PageHead(page)); 1809 BUG_ON(!PageHead(page));
1810 BUG_ON(PageTail(page)); 1810 BUG_ON(PageTail(page));
1811 1811
1812 mapcount = 0; 1812 mapcount = 0;
1813 anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) { 1813 anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) {
1814 struct vm_area_struct *vma = avc->vma; 1814 struct vm_area_struct *vma = avc->vma;
1815 unsigned long addr = vma_address(page, vma); 1815 unsigned long addr = vma_address(page, vma);
1816 BUG_ON(is_vma_temporary_stack(vma)); 1816 BUG_ON(is_vma_temporary_stack(vma));
1817 mapcount += __split_huge_page_splitting(page, vma, addr); 1817 mapcount += __split_huge_page_splitting(page, vma, addr);
1818 } 1818 }
1819 /* 1819 /*
1820 * It is critical that new vmas are added to the tail of the 1820 * It is critical that new vmas are added to the tail of the
1821 * anon_vma list. This guarantees that if copy_huge_pmd() runs 1821 * anon_vma list. This guarantees that if copy_huge_pmd() runs
1822 * and establishes a child pmd before 1822 * and establishes a child pmd before
1823 * __split_huge_page_splitting() freezes the parent pmd (so if 1823 * __split_huge_page_splitting() freezes the parent pmd (so if
1824 * we fail to prevent copy_huge_pmd() from running until the 1824 * we fail to prevent copy_huge_pmd() from running until the
1825 * whole __split_huge_page() is complete), we will still see 1825 * whole __split_huge_page() is complete), we will still see
1826 * the newly established pmd of the child later during the 1826 * the newly established pmd of the child later during the
1827 * walk, to be able to set it as pmd_trans_splitting too. 1827 * walk, to be able to set it as pmd_trans_splitting too.
1828 */ 1828 */
1829 if (mapcount != page_mapcount(page)) 1829 if (mapcount != page_mapcount(page))
1830 printk(KERN_ERR "mapcount %d page_mapcount %d\n", 1830 printk(KERN_ERR "mapcount %d page_mapcount %d\n",
1831 mapcount, page_mapcount(page)); 1831 mapcount, page_mapcount(page));
1832 BUG_ON(mapcount != page_mapcount(page)); 1832 BUG_ON(mapcount != page_mapcount(page));
1833 1833
1834 __split_huge_page_refcount(page, list); 1834 __split_huge_page_refcount(page, list);
1835 1835
1836 mapcount2 = 0; 1836 mapcount2 = 0;
1837 anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) { 1837 anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) {
1838 struct vm_area_struct *vma = avc->vma; 1838 struct vm_area_struct *vma = avc->vma;
1839 unsigned long addr = vma_address(page, vma); 1839 unsigned long addr = vma_address(page, vma);
1840 BUG_ON(is_vma_temporary_stack(vma)); 1840 BUG_ON(is_vma_temporary_stack(vma));
1841 mapcount2 += __split_huge_page_map(page, vma, addr); 1841 mapcount2 += __split_huge_page_map(page, vma, addr);
1842 } 1842 }
1843 if (mapcount != mapcount2) 1843 if (mapcount != mapcount2)
1844 printk(KERN_ERR "mapcount %d mapcount2 %d page_mapcount %d\n", 1844 printk(KERN_ERR "mapcount %d mapcount2 %d page_mapcount %d\n",
1845 mapcount, mapcount2, page_mapcount(page)); 1845 mapcount, mapcount2, page_mapcount(page));
1846 BUG_ON(mapcount != mapcount2); 1846 BUG_ON(mapcount != mapcount2);
1847 } 1847 }
1848 1848
1849 /* 1849 /*
1850 * Split a hugepage into normal pages. This doesn't change the position of head 1850 * Split a hugepage into normal pages. This doesn't change the position of head
1851 * page. If @list is null, tail pages will be added to the LRU list; otherwise, to 1851 * page. If @list is null, tail pages will be added to the LRU list; otherwise, to
1852 * @list. Both head page and tail pages will inherit mapping, flags, and so on 1852 * @list. Both head page and tail pages will inherit mapping, flags, and so on
1853 * from the hugepage. 1853 * from the hugepage.
1854 * Return 0 if the hugepage is split successfully otherwise return 1. 1854 * Return 0 if the hugepage is split successfully otherwise return 1.
1855 */ 1855 */
1856 int split_huge_page_to_list(struct page *page, struct list_head *list) 1856 int split_huge_page_to_list(struct page *page, struct list_head *list)
1857 { 1857 {
1858 struct anon_vma *anon_vma; 1858 struct anon_vma *anon_vma;
1859 int ret = 1; 1859 int ret = 1;
1860 1860
1861 BUG_ON(is_huge_zero_page(page)); 1861 BUG_ON(is_huge_zero_page(page));
1862 BUG_ON(!PageAnon(page)); 1862 BUG_ON(!PageAnon(page));
1863 1863
1864 /* 1864 /*
1865 * The caller does not necessarily hold an mmap_sem that would prevent 1865 * The caller does not necessarily hold an mmap_sem that would prevent
1866 * the anon_vma disappearing so we first take a reference to it 1866 * the anon_vma disappearing so we first take a reference to it
1867 * and then lock the anon_vma for write. This is similar to 1867 * and then lock the anon_vma for write. This is similar to
1868 * page_lock_anon_vma_read except the write lock is taken to serialise 1868 * page_lock_anon_vma_read except the write lock is taken to serialise
1869 * against parallel split or collapse operations. 1869 * against parallel split or collapse operations.
1870 */ 1870 */
1871 anon_vma = page_get_anon_vma(page); 1871 anon_vma = page_get_anon_vma(page);
1872 if (!anon_vma) 1872 if (!anon_vma)
1873 goto out; 1873 goto out;
1874 anon_vma_lock_write(anon_vma); 1874 anon_vma_lock_write(anon_vma);
1875 1875
1876 ret = 0; 1876 ret = 0;
1877 if (!PageCompound(page)) 1877 if (!PageCompound(page))
1878 goto out_unlock; 1878 goto out_unlock;
1879 1879
1880 BUG_ON(!PageSwapBacked(page)); 1880 BUG_ON(!PageSwapBacked(page));
1881 __split_huge_page(page, anon_vma, list); 1881 __split_huge_page(page, anon_vma, list);
1882 count_vm_event(THP_SPLIT); 1882 count_vm_event(THP_SPLIT);
1883 1883
1884 BUG_ON(PageCompound(page)); 1884 BUG_ON(PageCompound(page));
1885 out_unlock: 1885 out_unlock:
1886 anon_vma_unlock_write(anon_vma); 1886 anon_vma_unlock_write(anon_vma);
1887 put_anon_vma(anon_vma); 1887 put_anon_vma(anon_vma);
1888 out: 1888 out:
1889 return ret; 1889 return ret;
1890 } 1890 }
1891 1891
1892 #define VM_NO_THP (VM_SPECIAL | VM_HUGETLB | VM_SHARED | VM_MAYSHARE) 1892 #define VM_NO_THP (VM_SPECIAL | VM_HUGETLB | VM_SHARED | VM_MAYSHARE)
1893 1893
1894 int hugepage_madvise(struct vm_area_struct *vma, 1894 int hugepage_madvise(struct vm_area_struct *vma,
1895 unsigned long *vm_flags, int advice) 1895 unsigned long *vm_flags, int advice)
1896 { 1896 {
1897 struct mm_struct *mm = vma->vm_mm; 1897 struct mm_struct *mm = vma->vm_mm;
1898 1898
1899 switch (advice) { 1899 switch (advice) {
1900 case MADV_HUGEPAGE: 1900 case MADV_HUGEPAGE:
1901 /* 1901 /*
1902 * Be somewhat over-protective like KSM for now! 1902 * Be somewhat over-protective like KSM for now!
1903 */ 1903 */
1904 if (*vm_flags & (VM_HUGEPAGE | VM_NO_THP)) 1904 if (*vm_flags & (VM_HUGEPAGE | VM_NO_THP))
1905 return -EINVAL; 1905 return -EINVAL;
1906 if (mm->def_flags & VM_NOHUGEPAGE) 1906 if (mm->def_flags & VM_NOHUGEPAGE)
1907 return -EINVAL; 1907 return -EINVAL;
1908 *vm_flags &= ~VM_NOHUGEPAGE; 1908 *vm_flags &= ~VM_NOHUGEPAGE;
1909 *vm_flags |= VM_HUGEPAGE; 1909 *vm_flags |= VM_HUGEPAGE;
1910 /* 1910 /*
1911 * If the vma becomes good for khugepaged to scan, 1911 * If the vma becomes good for khugepaged to scan,
1912 * register it here without waiting for a page fault that 1912 * register it here without waiting for a page fault that
1913 * may not happen any time soon. 1913 * may not happen any time soon.
1914 */ 1914 */
1915 if (unlikely(khugepaged_enter_vma_merge(vma))) 1915 if (unlikely(khugepaged_enter_vma_merge(vma)))
1916 return -ENOMEM; 1916 return -ENOMEM;
1917 break; 1917 break;
1918 case MADV_NOHUGEPAGE: 1918 case MADV_NOHUGEPAGE:
1919 /* 1919 /*
1920 * Be somewhat over-protective like KSM for now! 1920 * Be somewhat over-protective like KSM for now!
1921 */ 1921 */
1922 if (*vm_flags & (VM_NOHUGEPAGE | VM_NO_THP)) 1922 if (*vm_flags & (VM_NOHUGEPAGE | VM_NO_THP))
1923 return -EINVAL; 1923 return -EINVAL;
1924 *vm_flags &= ~VM_HUGEPAGE; 1924 *vm_flags &= ~VM_HUGEPAGE;
1925 *vm_flags |= VM_NOHUGEPAGE; 1925 *vm_flags |= VM_NOHUGEPAGE;
1926 /* 1926 /*
1927 * Setting VM_NOHUGEPAGE will prevent khugepaged from scanning 1927 * Setting VM_NOHUGEPAGE will prevent khugepaged from scanning
1928 * this vma even if we leave the mm registered in khugepaged if 1928 * this vma even if we leave the mm registered in khugepaged if
1929 * it got registered before VM_NOHUGEPAGE was set. 1929 * it got registered before VM_NOHUGEPAGE was set.
1930 */ 1930 */
1931 break; 1931 break;
1932 } 1932 }
1933 1933
1934 return 0; 1934 return 0;
1935 } 1935 }
1936 1936
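For reference, the userspace counterpart of hugepage_madvise() is a plain madvise(2) call. The sketch below is a hedged illustration, assuming an x86_64 system with 2 MB PMD-sized huge pages and a libc that exposes MADV_HUGEPAGE/MADV_NOHUGEPAGE; it is not part of this patch.

#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

int main(void)
{
        size_t len = 2UL << 20;         /* assume one 2 MB PMD-sized range */
        void *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
                         MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

        if (buf == MAP_FAILED)
                return 1;

        /* sets VM_HUGEPAGE and may register the mm with khugepaged */
        if (madvise(buf, len, MADV_HUGEPAGE))
                perror("MADV_HUGEPAGE");

        memset(buf, 1, len);            /* fault the range in so it can be collapsed */

        /* sets VM_NOHUGEPAGE, so khugepaged will skip this vma from now on */
        if (madvise(buf, len, MADV_NOHUGEPAGE))
                perror("MADV_NOHUGEPAGE");

        munmap(buf, len);
        return 0;
}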
1937 static int __init khugepaged_slab_init(void) 1937 static int __init khugepaged_slab_init(void)
1938 { 1938 {
1939 mm_slot_cache = kmem_cache_create("khugepaged_mm_slot", 1939 mm_slot_cache = kmem_cache_create("khugepaged_mm_slot",
1940 sizeof(struct mm_slot), 1940 sizeof(struct mm_slot),
1941 __alignof__(struct mm_slot), 0, NULL); 1941 __alignof__(struct mm_slot), 0, NULL);
1942 if (!mm_slot_cache) 1942 if (!mm_slot_cache)
1943 return -ENOMEM; 1943 return -ENOMEM;
1944 1944
1945 return 0; 1945 return 0;
1946 } 1946 }
1947 1947
1948 static inline struct mm_slot *alloc_mm_slot(void) 1948 static inline struct mm_slot *alloc_mm_slot(void)
1949 { 1949 {
1950 if (!mm_slot_cache) /* initialization failed */ 1950 if (!mm_slot_cache) /* initialization failed */
1951 return NULL; 1951 return NULL;
1952 return kmem_cache_zalloc(mm_slot_cache, GFP_KERNEL); 1952 return kmem_cache_zalloc(mm_slot_cache, GFP_KERNEL);
1953 } 1953 }
1954 1954
1955 static inline void free_mm_slot(struct mm_slot *mm_slot) 1955 static inline void free_mm_slot(struct mm_slot *mm_slot)
1956 { 1956 {
1957 kmem_cache_free(mm_slot_cache, mm_slot); 1957 kmem_cache_free(mm_slot_cache, mm_slot);
1958 } 1958 }
1959 1959
1960 static struct mm_slot *get_mm_slot(struct mm_struct *mm) 1960 static struct mm_slot *get_mm_slot(struct mm_struct *mm)
1961 { 1961 {
1962 struct mm_slot *mm_slot; 1962 struct mm_slot *mm_slot;
1963 1963
1964 hash_for_each_possible(mm_slots_hash, mm_slot, hash, (unsigned long)mm) 1964 hash_for_each_possible(mm_slots_hash, mm_slot, hash, (unsigned long)mm)
1965 if (mm == mm_slot->mm) 1965 if (mm == mm_slot->mm)
1966 return mm_slot; 1966 return mm_slot;
1967 1967
1968 return NULL; 1968 return NULL;
1969 } 1969 }
1970 1970
1971 static void insert_to_mm_slots_hash(struct mm_struct *mm, 1971 static void insert_to_mm_slots_hash(struct mm_struct *mm,
1972 struct mm_slot *mm_slot) 1972 struct mm_slot *mm_slot)
1973 { 1973 {
1974 mm_slot->mm = mm; 1974 mm_slot->mm = mm;
1975 hash_add(mm_slots_hash, &mm_slot->hash, (long)mm); 1975 hash_add(mm_slots_hash, &mm_slot->hash, (long)mm);
1976 } 1976 }
1977 1977
1978 static inline int khugepaged_test_exit(struct mm_struct *mm) 1978 static inline int khugepaged_test_exit(struct mm_struct *mm)
1979 { 1979 {
1980 return atomic_read(&mm->mm_users) == 0; 1980 return atomic_read(&mm->mm_users) == 0;
1981 } 1981 }
1982 1982
1983 int __khugepaged_enter(struct mm_struct *mm) 1983 int __khugepaged_enter(struct mm_struct *mm)
1984 { 1984 {
1985 struct mm_slot *mm_slot; 1985 struct mm_slot *mm_slot;
1986 int wakeup; 1986 int wakeup;
1987 1987
1988 mm_slot = alloc_mm_slot(); 1988 mm_slot = alloc_mm_slot();
1989 if (!mm_slot) 1989 if (!mm_slot)
1990 return -ENOMEM; 1990 return -ENOMEM;
1991 1991
1992 /* __khugepaged_exit() must not run from under us */ 1992 /* __khugepaged_exit() must not run from under us */
1993 VM_BUG_ON(khugepaged_test_exit(mm)); 1993 VM_BUG_ON(khugepaged_test_exit(mm));
1994 if (unlikely(test_and_set_bit(MMF_VM_HUGEPAGE, &mm->flags))) { 1994 if (unlikely(test_and_set_bit(MMF_VM_HUGEPAGE, &mm->flags))) {
1995 free_mm_slot(mm_slot); 1995 free_mm_slot(mm_slot);
1996 return 0; 1996 return 0;
1997 } 1997 }
1998 1998
1999 spin_lock(&khugepaged_mm_lock); 1999 spin_lock(&khugepaged_mm_lock);
2000 insert_to_mm_slots_hash(mm, mm_slot); 2000 insert_to_mm_slots_hash(mm, mm_slot);
2001 /* 2001 /*
2002 * Insert just behind the scanning cursor, to let the area settle 2002 * Insert just behind the scanning cursor, to let the area settle
2003 * down a little. 2003 * down a little.
2004 */ 2004 */
2005 wakeup = list_empty(&khugepaged_scan.mm_head); 2005 wakeup = list_empty(&khugepaged_scan.mm_head);
2006 list_add_tail(&mm_slot->mm_node, &khugepaged_scan.mm_head); 2006 list_add_tail(&mm_slot->mm_node, &khugepaged_scan.mm_head);
2007 spin_unlock(&khugepaged_mm_lock); 2007 spin_unlock(&khugepaged_mm_lock);
2008 2008
2009 atomic_inc(&mm->mm_count); 2009 atomic_inc(&mm->mm_count);
2010 if (wakeup) 2010 if (wakeup)
2011 wake_up_interruptible(&khugepaged_wait); 2011 wake_up_interruptible(&khugepaged_wait);
2012 2012
2013 return 0; 2013 return 0;
2014 } 2014 }
2015 2015
2016 int khugepaged_enter_vma_merge(struct vm_area_struct *vma) 2016 int khugepaged_enter_vma_merge(struct vm_area_struct *vma)
2017 { 2017 {
2018 unsigned long hstart, hend; 2018 unsigned long hstart, hend;
2019 if (!vma->anon_vma) 2019 if (!vma->anon_vma)
2020 /* 2020 /*
2021 * Not yet faulted in so we will register later in the 2021 * Not yet faulted in so we will register later in the
2022 * page fault if needed. 2022 * page fault if needed.
2023 */ 2023 */
2024 return 0; 2024 return 0;
2025 if (vma->vm_ops) 2025 if (vma->vm_ops)
2026 /* khugepaged not yet working on file or special mappings */ 2026 /* khugepaged not yet working on file or special mappings */
2027 return 0; 2027 return 0;
2028 VM_BUG_ON(vma->vm_flags & VM_NO_THP); 2028 VM_BUG_ON(vma->vm_flags & VM_NO_THP);
2029 hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; 2029 hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
2030 hend = vma->vm_end & HPAGE_PMD_MASK; 2030 hend = vma->vm_end & HPAGE_PMD_MASK;
2031 if (hstart < hend) 2031 if (hstart < hend)
2032 return khugepaged_enter(vma); 2032 return khugepaged_enter(vma);
2033 return 0; 2033 return 0;
2034 } 2034 }
2035 2035
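The hstart/hend arithmetic above rounds vm_start up and vm_end down to the PMD size, so a vma is only registered when at least one fully aligned huge-page slot fits inside it. A hedged standalone sketch of that computation follows; the 2 MB HPAGE_PMD_SIZE and the example addresses are assumptions for illustration.

#include <stdbool.h>
#include <stdio.h>

/* assumed x86_64 values: 2 MB huge pages on 4 KB base pages */
#define HPAGE_PMD_SIZE  (2UL << 20)
#define HPAGE_PMD_MASK  (~(HPAGE_PMD_SIZE - 1))

static bool vma_has_hugepage_slot(unsigned long vm_start, unsigned long vm_end)
{
        /* round vm_start up and vm_end down to a PMD boundary */
        unsigned long hstart = (vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
        unsigned long hend = vm_end & HPAGE_PMD_MASK;

        return hstart < hend;   /* true iff an aligned 2 MB slot fits entirely */
}

int main(void)
{
        /* [0x100000, 0x500000) rounds to [0x200000, 0x400000): one slot */
        printf("%d\n", vma_has_hugepage_slot(0x100000, 0x500000));
        /* [0x100000, 0x300000) rounds to [0x200000, 0x200000): no slot */
        printf("%d\n", vma_has_hugepage_slot(0x100000, 0x300000));
        return 0;
}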
2036 void __khugepaged_exit(struct mm_struct *mm) 2036 void __khugepaged_exit(struct mm_struct *mm)
2037 { 2037 {
2038 struct mm_slot *mm_slot; 2038 struct mm_slot *mm_slot;
2039 int free = 0; 2039 int free = 0;
2040 2040
2041 spin_lock(&khugepaged_mm_lock); 2041 spin_lock(&khugepaged_mm_lock);
2042 mm_slot = get_mm_slot(mm); 2042 mm_slot = get_mm_slot(mm);
2043 if (mm_slot && khugepaged_scan.mm_slot != mm_slot) { 2043 if (mm_slot && khugepaged_scan.mm_slot != mm_slot) {
2044 hash_del(&mm_slot->hash); 2044 hash_del(&mm_slot->hash);
2045 list_del(&mm_slot->mm_node); 2045 list_del(&mm_slot->mm_node);
2046 free = 1; 2046 free = 1;
2047 } 2047 }
2048 spin_unlock(&khugepaged_mm_lock); 2048 spin_unlock(&khugepaged_mm_lock);
2049 2049
2050 if (free) { 2050 if (free) {
2051 clear_bit(MMF_VM_HUGEPAGE, &mm->flags); 2051 clear_bit(MMF_VM_HUGEPAGE, &mm->flags);
2052 free_mm_slot(mm_slot); 2052 free_mm_slot(mm_slot);
2053 mmdrop(mm); 2053 mmdrop(mm);
2054 } else if (mm_slot) { 2054 } else if (mm_slot) {
2055 /* 2055 /*
2056 * This is required to serialize against 2056 * This is required to serialize against
2057 * khugepaged_test_exit() (which is guaranteed to run 2057 * khugepaged_test_exit() (which is guaranteed to run
2058 * under mmap sem read mode). Stop here (after we 2058 * under mmap sem read mode). Stop here (after we
2059 * return, all pagetables will be destroyed) until 2059 * return, all pagetables will be destroyed) until
2060 * khugepaged has finished working on the pagetables 2060 * khugepaged has finished working on the pagetables
2061 * under the mmap_sem. 2061 * under the mmap_sem.
2062 */ 2062 */
2063 down_write(&mm->mmap_sem); 2063 down_write(&mm->mmap_sem);
2064 up_write(&mm->mmap_sem); 2064 up_write(&mm->mmap_sem);
2065 } 2065 }
2066 } 2066 }
2067 2067
2068 static void release_pte_page(struct page *page) 2068 static void release_pte_page(struct page *page)
2069 { 2069 {
2070 /* 0 stands for page_is_file_cache(page) == false */ 2070 /* 0 stands for page_is_file_cache(page) == false */
2071 dec_zone_page_state(page, NR_ISOLATED_ANON + 0); 2071 dec_zone_page_state(page, NR_ISOLATED_ANON + 0);
2072 unlock_page(page); 2072 unlock_page(page);
2073 putback_lru_page(page); 2073 putback_lru_page(page);
2074 } 2074 }
2075 2075
2076 static void release_pte_pages(pte_t *pte, pte_t *_pte) 2076 static void release_pte_pages(pte_t *pte, pte_t *_pte)
2077 { 2077 {
2078 while (--_pte >= pte) { 2078 while (--_pte >= pte) {
2079 pte_t pteval = *_pte; 2079 pte_t pteval = *_pte;
2080 if (!pte_none(pteval)) 2080 if (!pte_none(pteval))
2081 release_pte_page(pte_page(pteval)); 2081 release_pte_page(pte_page(pteval));
2082 } 2082 }
2083 } 2083 }
2084 2084
2085 static int __collapse_huge_page_isolate(struct vm_area_struct *vma, 2085 static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
2086 unsigned long address, 2086 unsigned long address,
2087 pte_t *pte) 2087 pte_t *pte)
2088 { 2088 {
2089 struct page *page; 2089 struct page *page;
2090 pte_t *_pte; 2090 pte_t *_pte;
2091 int referenced = 0, none = 0; 2091 int referenced = 0, none = 0;
2092 for (_pte = pte; _pte < pte+HPAGE_PMD_NR; 2092 for (_pte = pte; _pte < pte+HPAGE_PMD_NR;
2093 _pte++, address += PAGE_SIZE) { 2093 _pte++, address += PAGE_SIZE) {
2094 pte_t pteval = *_pte; 2094 pte_t pteval = *_pte;
2095 if (pte_none(pteval)) { 2095 if (pte_none(pteval)) {
2096 if (++none <= khugepaged_max_ptes_none) 2096 if (++none <= khugepaged_max_ptes_none)
2097 continue; 2097 continue;
2098 else 2098 else
2099 goto out; 2099 goto out;
2100 } 2100 }
2101 if (!pte_present(pteval) || !pte_write(pteval)) 2101 if (!pte_present(pteval) || !pte_write(pteval))
2102 goto out; 2102 goto out;
2103 page = vm_normal_page(vma, address, pteval); 2103 page = vm_normal_page(vma, address, pteval);
2104 if (unlikely(!page)) 2104 if (unlikely(!page))
2105 goto out; 2105 goto out;
2106 2106
2107 VM_BUG_ON(PageCompound(page)); 2107 VM_BUG_ON(PageCompound(page));
2108 BUG_ON(!PageAnon(page)); 2108 BUG_ON(!PageAnon(page));
2109 VM_BUG_ON(!PageSwapBacked(page)); 2109 VM_BUG_ON(!PageSwapBacked(page));
2110 2110
2111 /* cannot use mapcount: can't collapse if there's a gup pin */ 2111 /* cannot use mapcount: can't collapse if there's a gup pin */
2112 if (page_count(page) != 1) 2112 if (page_count(page) != 1)
2113 goto out; 2113 goto out;
2114 /* 2114 /*
2115 * We can do it before isolate_lru_page because the 2115 * We can do it before isolate_lru_page because the
2116 * page can't be freed from under us. NOTE: PG_lock 2116 * page can't be freed from under us. NOTE: PG_lock
2117 * is needed to serialize against split_huge_page 2117 * is needed to serialize against split_huge_page
2118 * when invoked from the VM. 2118 * when invoked from the VM.
2119 */ 2119 */
2120 if (!trylock_page(page)) 2120 if (!trylock_page(page))
2121 goto out; 2121 goto out;
2122 /* 2122 /*
2123 * Isolate the page to avoid collapsing a hugepage 2123 * Isolate the page to avoid collapsing a hugepage
2124 * currently in use by the VM. 2124 * currently in use by the VM.
2125 */ 2125 */
2126 if (isolate_lru_page(page)) { 2126 if (isolate_lru_page(page)) {
2127 unlock_page(page); 2127 unlock_page(page);
2128 goto out; 2128 goto out;
2129 } 2129 }
2130 /* 0 stands for page_is_file_cache(page) == false */ 2130 /* 0 stands for page_is_file_cache(page) == false */
2131 inc_zone_page_state(page, NR_ISOLATED_ANON + 0); 2131 inc_zone_page_state(page, NR_ISOLATED_ANON + 0);
2132 VM_BUG_ON(!PageLocked(page)); 2132 VM_BUG_ON(!PageLocked(page));
2133 VM_BUG_ON(PageLRU(page)); 2133 VM_BUG_ON(PageLRU(page));
2134 2134
2135 /* If no mapped pte is young, don't collapse the page */ 2135 /* If no mapped pte is young, don't collapse the page */
2136 if (pte_young(pteval) || PageReferenced(page) || 2136 if (pte_young(pteval) || PageReferenced(page) ||
2137 mmu_notifier_test_young(vma->vm_mm, address)) 2137 mmu_notifier_test_young(vma->vm_mm, address))
2138 referenced = 1; 2138 referenced = 1;
2139 } 2139 }
2140 if (likely(referenced)) 2140 if (likely(referenced))
2141 return 1; 2141 return 1;
2142 out: 2142 out:
2143 release_pte_pages(pte, _pte); 2143 release_pte_pages(pte, _pte);
2144 return 0; 2144 return 0;
2145 } 2145 }
2146 2146
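The none counter above gives up once more than khugepaged_max_ptes_none of the HPAGE_PMD_NR pte slots are empty, bounding how much previously unallocated memory a collapse may commit. A hedged toy model of that budget, assuming the common x86_64 values of 512 PTEs per PMD and a default limit of HPAGE_PMD_NR - 1:

#include <stdbool.h>
#include <stdio.h>

#define HPAGE_PMD_NR                    512     /* assumed: 2 MB / 4 KB */
#define KHUGEPAGED_MAX_PTES_NONE        (HPAGE_PMD_NR - 1)      /* assumed default */

/*
 * Would a PMD range with 'mapped' present ptes (the rest empty) still be
 * considered for collapse under the max_ptes_none budget?
 */
static bool collapse_allowed(int mapped)
{
        int none = HPAGE_PMD_NR - mapped;

        return none <= KHUGEPAGED_MAX_PTES_NONE;
}

int main(void)
{
        printf("1 mapped pte:  %s\n", collapse_allowed(1) ? "collapse" : "skip");
        printf("0 mapped ptes: %s\n", collapse_allowed(0) ? "collapse" : "skip");
        return 0;
}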
2147 static void __collapse_huge_page_copy(pte_t *pte, struct page *page, 2147 static void __collapse_huge_page_copy(pte_t *pte, struct page *page,
2148 struct vm_area_struct *vma, 2148 struct vm_area_struct *vma,
2149 unsigned long address, 2149 unsigned long address,
2150 spinlock_t *ptl) 2150 spinlock_t *ptl)
2151 { 2151 {
2152 pte_t *_pte; 2152 pte_t *_pte;
2153 for (_pte = pte; _pte < pte+HPAGE_PMD_NR; _pte++) { 2153 for (_pte = pte; _pte < pte+HPAGE_PMD_NR; _pte++) {
2154 pte_t pteval = *_pte; 2154 pte_t pteval = *_pte;
2155 struct page *src_page; 2155 struct page *src_page;
2156 2156
2157 if (pte_none(pteval)) { 2157 if (pte_none(pteval)) {
2158 clear_user_highpage(page, address); 2158 clear_user_highpage(page, address);
2159 add_mm_counter(vma->vm_mm, MM_ANONPAGES, 1); 2159 add_mm_counter(vma->vm_mm, MM_ANONPAGES, 1);
2160 } else { 2160 } else {
2161 src_page = pte_page(pteval); 2161 src_page = pte_page(pteval);
2162 copy_user_highpage(page, src_page, address, vma); 2162 copy_user_highpage(page, src_page, address, vma);
2163 VM_BUG_ON(page_mapcount(src_page) != 1); 2163 VM_BUG_ON(page_mapcount(src_page) != 1);
2164 release_pte_page(src_page); 2164 release_pte_page(src_page);
2165 /* 2165 /*
2166 * ptl mostly unnecessary, but preempt has to 2166 * ptl mostly unnecessary, but preempt has to
2167 * be disabled to update the per-cpu stats 2167 * be disabled to update the per-cpu stats
2168 * inside page_remove_rmap(). 2168 * inside page_remove_rmap().
2169 */ 2169 */
2170 spin_lock(ptl); 2170 spin_lock(ptl);
2171 /* 2171 /*
2172 * paravirt calls inside pte_clear here are 2172 * paravirt calls inside pte_clear here are
2173 * superfluous. 2173 * superfluous.
2174 */ 2174 */
2175 pte_clear(vma->vm_mm, address, _pte); 2175 pte_clear(vma->vm_mm, address, _pte);
2176 page_remove_rmap(src_page); 2176 page_remove_rmap(src_page);
2177 spin_unlock(ptl); 2177 spin_unlock(ptl);
2178 free_page_and_swap_cache(src_page); 2178 free_page_and_swap_cache(src_page);
2179 } 2179 }
2180 2180
2181 address += PAGE_SIZE; 2181 address += PAGE_SIZE;
2182 page++; 2182 page++;
2183 } 2183 }
2184 } 2184 }
2185 2185
2186 static void khugepaged_alloc_sleep(void) 2186 static void khugepaged_alloc_sleep(void)
2187 { 2187 {
2188 wait_event_freezable_timeout(khugepaged_wait, false, 2188 wait_event_freezable_timeout(khugepaged_wait, false,
2189 msecs_to_jiffies(khugepaged_alloc_sleep_millisecs)); 2189 msecs_to_jiffies(khugepaged_alloc_sleep_millisecs));
2190 } 2190 }
2191 2191
2192 static int khugepaged_node_load[MAX_NUMNODES]; 2192 static int khugepaged_node_load[MAX_NUMNODES];
2193 2193
2194 static bool khugepaged_scan_abort(int nid)
2195 {
2196 int i;
2197
2198 /*
2199 * If zone_reclaim_mode is disabled, then no extra effort is made to
2200 * allocate memory locally.
2201 */
2202 if (!zone_reclaim_mode)
2203 return false;
2204
2205 /* If there is a count for this node already, it must be acceptable */
2206 if (khugepaged_node_load[nid])
2207 return false;
2208
2209 for (i = 0; i < MAX_NUMNODES; i++) {
2210 if (!khugepaged_node_load[i])
2211 continue;
2212 if (node_distance(nid, i) > RECLAIM_DISTANCE)
2213 return true;
2214 }
2215 return false;
2216 }
2217
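khugepaged_scan_abort() above is the core of this patch: once pages in the candidate PMD range have been counted on some node, a page from a node farther than RECLAIM_DISTANCE away aborts the scan rather than risk a remote collapse. The sketch below is a hedged userspace mock of that decision; the 4-node distance table, the RECLAIM_DISTANCE value of 30 and the assumption that zone_reclaim_mode is enabled are illustrative, not taken from the patch.

#include <stdbool.h>
#include <stdio.h>

#define MAX_NUMNODES            4
#define RECLAIM_DISTANCE        30      /* assumed cutoff */

/* assumed two-socket topology: nodes {0,1} and {2,3} form close pairs */
static const int distance[MAX_NUMNODES][MAX_NUMNODES] = {
        { 10, 21, 40, 40 },
        { 21, 10, 40, 40 },
        { 40, 40, 10, 21 },
        { 40, 40, 21, 10 },
};

static int node_load[MAX_NUMNODES];

/*
 * Mock of khugepaged_scan_abort(), assuming zone_reclaim_mode is enabled:
 * stop scanning once a page comes from a node farther than RECLAIM_DISTANCE
 * from any node that already contributed pages to this PMD range.
 */
static bool scan_abort(int nid)
{
        int i;

        if (node_load[nid])
                return false;
        for (i = 0; i < MAX_NUMNODES; i++) {
                if (!node_load[i])
                        continue;
                if (distance[nid][i] > RECLAIM_DISTANCE)
                        return true;
        }
        return false;
}

int main(void)
{
        node_load[0] = 8;       /* pages already counted on node 0 */
        printf("node 1 -> %s\n", scan_abort(1) ? "abort" : "keep scanning");
        printf("node 2 -> %s\n", scan_abort(2) ? "abort" : "keep scanning");
        return 0;
}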
2194 #ifdef CONFIG_NUMA 2218 #ifdef CONFIG_NUMA
2195 static int khugepaged_find_target_node(void) 2219 static int khugepaged_find_target_node(void)
2196 { 2220 {
2197 static int last_khugepaged_target_node = NUMA_NO_NODE; 2221 static int last_khugepaged_target_node = NUMA_NO_NODE;
2198 int nid, target_node = 0, max_value = 0; 2222 int nid, target_node = 0, max_value = 0;
2199 2223
2200 /* find first node with max normal pages hit */ 2224 /* find first node with max normal pages hit */
2201 for (nid = 0; nid < MAX_NUMNODES; nid++) 2225 for (nid = 0; nid < MAX_NUMNODES; nid++)
2202 if (khugepaged_node_load[nid] > max_value) { 2226 if (khugepaged_node_load[nid] > max_value) {
2203 max_value = khugepaged_node_load[nid]; 2227 max_value = khugepaged_node_load[nid];
2204 target_node = nid; 2228 target_node = nid;
2205 } 2229 }
2206 2230
2207 /* do some balancing if several nodes have the same hit record */ 2231 /* do some balancing if several nodes have the same hit record */
2208 if (target_node <= last_khugepaged_target_node) 2232 if (target_node <= last_khugepaged_target_node)
2209 for (nid = last_khugepaged_target_node + 1; nid < MAX_NUMNODES; 2233 for (nid = last_khugepaged_target_node + 1; nid < MAX_NUMNODES;
2210 nid++) 2234 nid++)
2211 if (max_value == khugepaged_node_load[nid]) { 2235 if (max_value == khugepaged_node_load[nid]) {
2212 target_node = nid; 2236 target_node = nid;
2213 break; 2237 break;
2214 } 2238 }
2215 2239
2216 last_khugepaged_target_node = target_node; 2240 last_khugepaged_target_node = target_node;
2217 return target_node; 2241 return target_node;
2218 } 2242 }
2219 2243
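khugepaged_find_target_node() picks the node that contributed the most ptes to the range and rotates among equally loaded nodes across calls, so ties do not always land on the lowest node id. A hedged userspace mock of that selection (the 4-node setup and the example loads are assumptions):

#include <stdio.h>

#define MAX_NUMNODES    4
#define NUMA_NO_NODE    (-1)

/*
 * Mock of khugepaged_find_target_node(): pick the node with the highest
 * hit count, rotating among tied nodes across successive calls.
 */
static int find_target_node(const int load[MAX_NUMNODES])
{
        static int last_target = NUMA_NO_NODE;
        int nid, target = 0, max_value = 0;

        /* find the first node with the maximum hit count */
        for (nid = 0; nid < MAX_NUMNODES; nid++)
                if (load[nid] > max_value) {
                        max_value = load[nid];
                        target = nid;
                }

        /* balance between nodes that share the same hit count */
        if (target <= last_target)
                for (nid = last_target + 1; nid < MAX_NUMNODES; nid++)
                        if (load[nid] == max_value) {
                                target = nid;
                                break;
                        }

        last_target = target;
        return target;
}

int main(void)
{
        const int load[MAX_NUMNODES] = { 256, 0, 256, 0 };      /* tie: nodes 0 and 2 */

        printf("%d\n", find_target_node(load));  /* 0 */
        printf("%d\n", find_target_node(load));  /* 2: rotated to the other tied node */
        return 0;
}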
2220 static bool khugepaged_prealloc_page(struct page **hpage, bool *wait) 2244 static bool khugepaged_prealloc_page(struct page **hpage, bool *wait)
2221 { 2245 {
2222 if (IS_ERR(*hpage)) { 2246 if (IS_ERR(*hpage)) {
2223 if (!*wait) 2247 if (!*wait)
2224 return false; 2248 return false;
2225 2249
2226 *wait = false; 2250 *wait = false;
2227 *hpage = NULL; 2251 *hpage = NULL;
2228 khugepaged_alloc_sleep(); 2252 khugepaged_alloc_sleep();
2229 } else if (*hpage) { 2253 } else if (*hpage) {
2230 put_page(*hpage); 2254 put_page(*hpage);
2231 *hpage = NULL; 2255 *hpage = NULL;
2232 } 2256 }
2233 2257
2234 return true; 2258 return true;
2235 } 2259 }
2236 2260
2237 static struct page 2261 static struct page
2238 *khugepaged_alloc_page(struct page **hpage, struct mm_struct *mm, 2262 *khugepaged_alloc_page(struct page **hpage, struct mm_struct *mm,
2239 struct vm_area_struct *vma, unsigned long address, 2263 struct vm_area_struct *vma, unsigned long address,
2240 int node) 2264 int node)
2241 { 2265 {
2242 VM_BUG_ON(*hpage); 2266 VM_BUG_ON(*hpage);
2243 /* 2267 /*
2244 * Allocate the page while the vma is still valid and under 2268 * Allocate the page while the vma is still valid and under
2245 * the mmap_sem read mode so there is no memory allocation 2269 * the mmap_sem read mode so there is no memory allocation
2246 * later when we take the mmap_sem in write mode. This is more 2270 * later when we take the mmap_sem in write mode. This is more
2247 * friendly behavior (OTOH it may actually hide bugs) to 2271 * friendly behavior (OTOH it may actually hide bugs) to
2248 * filesystems in userland with daemons allocating memory in 2272 * filesystems in userland with daemons allocating memory in
2249 * the userland I/O paths. Allocating memory with the 2273 * the userland I/O paths. Allocating memory with the
2250 * mmap_sem in read mode is also a good idea to allow greater 2274 * mmap_sem in read mode is also a good idea to allow greater
2251 * scalability. 2275 * scalability.
2252 */ 2276 */
2253 *hpage = alloc_pages_exact_node(node, alloc_hugepage_gfpmask( 2277 *hpage = alloc_pages_exact_node(node, alloc_hugepage_gfpmask(
2254 khugepaged_defrag(), __GFP_OTHER_NODE), HPAGE_PMD_ORDER); 2278 khugepaged_defrag(), __GFP_OTHER_NODE), HPAGE_PMD_ORDER);
2255 /* 2279 /*
2256 * After allocating the hugepage, release the mmap_sem read lock in 2280 * After allocating the hugepage, release the mmap_sem read lock in
2257 * preparation for taking it in write mode. 2281 * preparation for taking it in write mode.
2258 */ 2282 */
2259 up_read(&mm->mmap_sem); 2283 up_read(&mm->mmap_sem);
2260 if (unlikely(!*hpage)) { 2284 if (unlikely(!*hpage)) {
2261 count_vm_event(THP_COLLAPSE_ALLOC_FAILED); 2285 count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
2262 *hpage = ERR_PTR(-ENOMEM); 2286 *hpage = ERR_PTR(-ENOMEM);
2263 return NULL; 2287 return NULL;
2264 } 2288 }
2265 2289
2266 count_vm_event(THP_COLLAPSE_ALLOC); 2290 count_vm_event(THP_COLLAPSE_ALLOC);
2267 return *hpage; 2291 return *hpage;
2268 } 2292 }
2269 #else 2293 #else
2270 static int khugepaged_find_target_node(void) 2294 static int khugepaged_find_target_node(void)
2271 { 2295 {
2272 return 0; 2296 return 0;
2273 } 2297 }
2274 2298
2275 static inline struct page *alloc_hugepage(int defrag) 2299 static inline struct page *alloc_hugepage(int defrag)
2276 { 2300 {
2277 return alloc_pages(alloc_hugepage_gfpmask(defrag, 0), 2301 return alloc_pages(alloc_hugepage_gfpmask(defrag, 0),
2278 HPAGE_PMD_ORDER); 2302 HPAGE_PMD_ORDER);
2279 } 2303 }
2280 2304
2281 static struct page *khugepaged_alloc_hugepage(bool *wait) 2305 static struct page *khugepaged_alloc_hugepage(bool *wait)
2282 { 2306 {
2283 struct page *hpage; 2307 struct page *hpage;
2284 2308
2285 do { 2309 do {
2286 hpage = alloc_hugepage(khugepaged_defrag()); 2310 hpage = alloc_hugepage(khugepaged_defrag());
2287 if (!hpage) { 2311 if (!hpage) {
2288 count_vm_event(THP_COLLAPSE_ALLOC_FAILED); 2312 count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
2289 if (!*wait) 2313 if (!*wait)
2290 return NULL; 2314 return NULL;
2291 2315
2292 *wait = false; 2316 *wait = false;
2293 khugepaged_alloc_sleep(); 2317 khugepaged_alloc_sleep();
2294 } else 2318 } else
2295 count_vm_event(THP_COLLAPSE_ALLOC); 2319 count_vm_event(THP_COLLAPSE_ALLOC);
2296 } while (unlikely(!hpage) && likely(khugepaged_enabled())); 2320 } while (unlikely(!hpage) && likely(khugepaged_enabled()));
2297 2321
2298 return hpage; 2322 return hpage;
2299 } 2323 }
2300 2324
2301 static bool khugepaged_prealloc_page(struct page **hpage, bool *wait) 2325 static bool khugepaged_prealloc_page(struct page **hpage, bool *wait)
2302 { 2326 {
2303 if (!*hpage) 2327 if (!*hpage)
2304 *hpage = khugepaged_alloc_hugepage(wait); 2328 *hpage = khugepaged_alloc_hugepage(wait);
2305 2329
2306 if (unlikely(!*hpage)) 2330 if (unlikely(!*hpage))
2307 return false; 2331 return false;
2308 2332
2309 return true; 2333 return true;
2310 } 2334 }
2311 2335
2312 static struct page 2336 static struct page
2313 *khugepaged_alloc_page(struct page **hpage, struct mm_struct *mm, 2337 *khugepaged_alloc_page(struct page **hpage, struct mm_struct *mm,
2314 struct vm_area_struct *vma, unsigned long address, 2338 struct vm_area_struct *vma, unsigned long address,
2315 int node) 2339 int node)
2316 { 2340 {
2317 up_read(&mm->mmap_sem); 2341 up_read(&mm->mmap_sem);
2318 VM_BUG_ON(!*hpage); 2342 VM_BUG_ON(!*hpage);
2319 return *hpage; 2343 return *hpage;
2320 } 2344 }
2321 #endif 2345 #endif
2322 2346
2323 static bool hugepage_vma_check(struct vm_area_struct *vma) 2347 static bool hugepage_vma_check(struct vm_area_struct *vma)
2324 { 2348 {
2325 if ((!(vma->vm_flags & VM_HUGEPAGE) && !khugepaged_always()) || 2349 if ((!(vma->vm_flags & VM_HUGEPAGE) && !khugepaged_always()) ||
2326 (vma->vm_flags & VM_NOHUGEPAGE)) 2350 (vma->vm_flags & VM_NOHUGEPAGE))
2327 return false; 2351 return false;
2328 2352
2329 if (!vma->anon_vma || vma->vm_ops) 2353 if (!vma->anon_vma || vma->vm_ops)
2330 return false; 2354 return false;
2331 if (is_vma_temporary_stack(vma)) 2355 if (is_vma_temporary_stack(vma))
2332 return false; 2356 return false;
2333 VM_BUG_ON(vma->vm_flags & VM_NO_THP); 2357 VM_BUG_ON(vma->vm_flags & VM_NO_THP);
2334 return true; 2358 return true;
2335 } 2359 }
2336 2360
2337 static void collapse_huge_page(struct mm_struct *mm, 2361 static void collapse_huge_page(struct mm_struct *mm,
2338 unsigned long address, 2362 unsigned long address,
2339 struct page **hpage, 2363 struct page **hpage,
2340 struct vm_area_struct *vma, 2364 struct vm_area_struct *vma,
2341 int node) 2365 int node)
2342 { 2366 {
2343 pmd_t *pmd, _pmd; 2367 pmd_t *pmd, _pmd;
2344 pte_t *pte; 2368 pte_t *pte;
2345 pgtable_t pgtable; 2369 pgtable_t pgtable;
2346 struct page *new_page; 2370 struct page *new_page;
2347 spinlock_t *ptl; 2371 spinlock_t *ptl;
2348 int isolated; 2372 int isolated;
2349 unsigned long hstart, hend; 2373 unsigned long hstart, hend;
2350 unsigned long mmun_start; /* For mmu_notifiers */ 2374 unsigned long mmun_start; /* For mmu_notifiers */
2351 unsigned long mmun_end; /* For mmu_notifiers */ 2375 unsigned long mmun_end; /* For mmu_notifiers */
2352 2376
2353 VM_BUG_ON(address & ~HPAGE_PMD_MASK); 2377 VM_BUG_ON(address & ~HPAGE_PMD_MASK);
2354 2378
2355 /* release the mmap_sem read lock. */ 2379 /* release the mmap_sem read lock. */
2356 new_page = khugepaged_alloc_page(hpage, mm, vma, address, node); 2380 new_page = khugepaged_alloc_page(hpage, mm, vma, address, node);
2357 if (!new_page) 2381 if (!new_page)
2358 return; 2382 return;
2359 2383
2360 if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) 2384 if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL)))
2361 return; 2385 return;
2362 2386
2363 /* 2387 /*
2364 * Prevent all access to pagetables with the exception of 2388 * Prevent all access to pagetables with the exception of
2365 * gup_fast later handled by the ptep_clear_flush and the VM 2389 * gup_fast later handled by the ptep_clear_flush and the VM
2366 * handled by the anon_vma lock + PG_lock. 2390 * handled by the anon_vma lock + PG_lock.
2367 */ 2391 */
2368 down_write(&mm->mmap_sem); 2392 down_write(&mm->mmap_sem);
2369 if (unlikely(khugepaged_test_exit(mm))) 2393 if (unlikely(khugepaged_test_exit(mm)))
2370 goto out; 2394 goto out;
2371 2395
2372 vma = find_vma(mm, address); 2396 vma = find_vma(mm, address);
2373 if (!vma) 2397 if (!vma)
2374 goto out; 2398 goto out;
2375 hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; 2399 hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
2376 hend = vma->vm_end & HPAGE_PMD_MASK; 2400 hend = vma->vm_end & HPAGE_PMD_MASK;
2377 if (address < hstart || address + HPAGE_PMD_SIZE > hend) 2401 if (address < hstart || address + HPAGE_PMD_SIZE > hend)
2378 goto out; 2402 goto out;
2379 if (!hugepage_vma_check(vma)) 2403 if (!hugepage_vma_check(vma))
2380 goto out; 2404 goto out;
2381 pmd = mm_find_pmd(mm, address); 2405 pmd = mm_find_pmd(mm, address);
2382 if (!pmd) 2406 if (!pmd)
2383 goto out; 2407 goto out;
2384 if (pmd_trans_huge(*pmd)) 2408 if (pmd_trans_huge(*pmd))
2385 goto out; 2409 goto out;
2386 2410
2387 anon_vma_lock_write(vma->anon_vma); 2411 anon_vma_lock_write(vma->anon_vma);
2388 2412
2389 pte = pte_offset_map(pmd, address); 2413 pte = pte_offset_map(pmd, address);
2390 ptl = pte_lockptr(mm, pmd); 2414 ptl = pte_lockptr(mm, pmd);
2391 2415
2392 mmun_start = address; 2416 mmun_start = address;
2393 mmun_end = address + HPAGE_PMD_SIZE; 2417 mmun_end = address + HPAGE_PMD_SIZE;
2394 mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); 2418 mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
2395 spin_lock(&mm->page_table_lock); /* probably unnecessary */ 2419 spin_lock(&mm->page_table_lock); /* probably unnecessary */
2396 /* 2420 /*
2397 * After this gup_fast can't run anymore. This also removes 2421 * After this gup_fast can't run anymore. This also removes
2398 * any huge TLB entry from the CPU so we won't allow 2422 * any huge TLB entry from the CPU so we won't allow
2399 * huge and small TLB entries for the same virtual address 2423 * huge and small TLB entries for the same virtual address
2400 * to avoid the risk of CPU bugs in that area. 2424 * to avoid the risk of CPU bugs in that area.
2401 */ 2425 */
2402 _pmd = pmdp_clear_flush(vma, address, pmd); 2426 _pmd = pmdp_clear_flush(vma, address, pmd);
2403 spin_unlock(&mm->page_table_lock); 2427 spin_unlock(&mm->page_table_lock);
2404 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); 2428 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
2405 2429
2406 spin_lock(ptl); 2430 spin_lock(ptl);
2407 isolated = __collapse_huge_page_isolate(vma, address, pte); 2431 isolated = __collapse_huge_page_isolate(vma, address, pte);
2408 spin_unlock(ptl); 2432 spin_unlock(ptl);
2409 2433
2410 if (unlikely(!isolated)) { 2434 if (unlikely(!isolated)) {
2411 pte_unmap(pte); 2435 pte_unmap(pte);
2412 spin_lock(&mm->page_table_lock); 2436 spin_lock(&mm->page_table_lock);
2413 BUG_ON(!pmd_none(*pmd)); 2437 BUG_ON(!pmd_none(*pmd));
2414 /* 2438 /*
2415 * We can only use set_pmd_at when establishing 2439 * We can only use set_pmd_at when establishing
2416 * hugepmds and never for establishing regular pmds that 2440 * hugepmds and never for establishing regular pmds that
2417 * point to regular pagetables. Use pmd_populate for that 2441 * point to regular pagetables. Use pmd_populate for that
2418 */ 2442 */
2419 pmd_populate(mm, pmd, pmd_pgtable(_pmd)); 2443 pmd_populate(mm, pmd, pmd_pgtable(_pmd));
2420 spin_unlock(&mm->page_table_lock); 2444 spin_unlock(&mm->page_table_lock);
2421 anon_vma_unlock_write(vma->anon_vma); 2445 anon_vma_unlock_write(vma->anon_vma);
2422 goto out; 2446 goto out;
2423 } 2447 }
2424 2448
2425 /* 2449 /*
2426 * All pages are isolated and locked so anon_vma rmap 2450 * All pages are isolated and locked so anon_vma rmap
2427 * can't run anymore. 2451 * can't run anymore.
2428 */ 2452 */
2429 anon_vma_unlock_write(vma->anon_vma); 2453 anon_vma_unlock_write(vma->anon_vma);
2430 2454
2431 __collapse_huge_page_copy(pte, new_page, vma, address, ptl); 2455 __collapse_huge_page_copy(pte, new_page, vma, address, ptl);
2432 pte_unmap(pte); 2456 pte_unmap(pte);
2433 __SetPageUptodate(new_page); 2457 __SetPageUptodate(new_page);
2434 pgtable = pmd_pgtable(_pmd); 2458 pgtable = pmd_pgtable(_pmd);
2435 2459
2436 _pmd = mk_huge_pmd(new_page, vma->vm_page_prot); 2460 _pmd = mk_huge_pmd(new_page, vma->vm_page_prot);
2437 _pmd = maybe_pmd_mkwrite(pmd_mkdirty(_pmd), vma); 2461 _pmd = maybe_pmd_mkwrite(pmd_mkdirty(_pmd), vma);
2438 2462
2439 /* 2463 /*
2440 * spin_lock() below is not the equivalent of smp_wmb(), so 2464 * spin_lock() below is not the equivalent of smp_wmb(), so
2441 * this is needed to avoid the copy_huge_page writes becoming 2465 * this is needed to avoid the copy_huge_page writes becoming
2442 * visible after the set_pmd_at() write. 2466 * visible after the set_pmd_at() write.
2443 */ 2467 */
2444 smp_wmb(); 2468 smp_wmb();
2445 2469
2446 spin_lock(&mm->page_table_lock); 2470 spin_lock(&mm->page_table_lock);
2447 BUG_ON(!pmd_none(*pmd)); 2471 BUG_ON(!pmd_none(*pmd));
2448 page_add_new_anon_rmap(new_page, vma, address); 2472 page_add_new_anon_rmap(new_page, vma, address);
2449 pgtable_trans_huge_deposit(mm, pmd, pgtable); 2473 pgtable_trans_huge_deposit(mm, pmd, pgtable);
2450 set_pmd_at(mm, address, pmd, _pmd); 2474 set_pmd_at(mm, address, pmd, _pmd);
2451 update_mmu_cache_pmd(vma, address, pmd); 2475 update_mmu_cache_pmd(vma, address, pmd);
2452 spin_unlock(&mm->page_table_lock); 2476 spin_unlock(&mm->page_table_lock);
2453 2477
2454 *hpage = NULL; 2478 *hpage = NULL;
2455 2479
2456 khugepaged_pages_collapsed++; 2480 khugepaged_pages_collapsed++;
2457 out_up_write: 2481 out_up_write:
2458 up_write(&mm->mmap_sem); 2482 up_write(&mm->mmap_sem);
2459 return; 2483 return;
2460 2484
2461 out: 2485 out:
2462 mem_cgroup_uncharge_page(new_page); 2486 mem_cgroup_uncharge_page(new_page);
2463 goto out_up_write; 2487 goto out_up_write;
2464 } 2488 }
2465 2489
2466 static int khugepaged_scan_pmd(struct mm_struct *mm, 2490 static int khugepaged_scan_pmd(struct mm_struct *mm,
2467 struct vm_area_struct *vma, 2491 struct vm_area_struct *vma,
2468 unsigned long address, 2492 unsigned long address,
2469 struct page **hpage) 2493 struct page **hpage)
2470 { 2494 {
2471 pmd_t *pmd; 2495 pmd_t *pmd;
2472 pte_t *pte, *_pte; 2496 pte_t *pte, *_pte;
2473 int ret = 0, referenced = 0, none = 0; 2497 int ret = 0, referenced = 0, none = 0;
2474 struct page *page; 2498 struct page *page;
2475 unsigned long _address; 2499 unsigned long _address;
2476 spinlock_t *ptl; 2500 spinlock_t *ptl;
2477 int node = NUMA_NO_NODE; 2501 int node = NUMA_NO_NODE;
2478 2502
2479 VM_BUG_ON(address & ~HPAGE_PMD_MASK); 2503 VM_BUG_ON(address & ~HPAGE_PMD_MASK);
2480 2504
2481 pmd = mm_find_pmd(mm, address); 2505 pmd = mm_find_pmd(mm, address);
2482 if (!pmd) 2506 if (!pmd)
2483 goto out; 2507 goto out;
2484 if (pmd_trans_huge(*pmd)) 2508 if (pmd_trans_huge(*pmd))
2485 goto out; 2509 goto out;
2486 2510
2487 memset(khugepaged_node_load, 0, sizeof(khugepaged_node_load)); 2511 memset(khugepaged_node_load, 0, sizeof(khugepaged_node_load));
2488 pte = pte_offset_map_lock(mm, pmd, address, &ptl); 2512 pte = pte_offset_map_lock(mm, pmd, address, &ptl);
2489 for (_address = address, _pte = pte; _pte < pte+HPAGE_PMD_NR; 2513 for (_address = address, _pte = pte; _pte < pte+HPAGE_PMD_NR;
2490 _pte++, _address += PAGE_SIZE) { 2514 _pte++, _address += PAGE_SIZE) {
2491 pte_t pteval = *_pte; 2515 pte_t pteval = *_pte;
2492 if (pte_none(pteval)) { 2516 if (pte_none(pteval)) {
2493 if (++none <= khugepaged_max_ptes_none) 2517 if (++none <= khugepaged_max_ptes_none)
2494 continue; 2518 continue;
2495 else 2519 else
2496 goto out_unmap; 2520 goto out_unmap;
2497 } 2521 }
2498 if (!pte_present(pteval) || !pte_write(pteval)) 2522 if (!pte_present(pteval) || !pte_write(pteval))
2499 goto out_unmap; 2523 goto out_unmap;
2500 page = vm_normal_page(vma, _address, pteval); 2524 page = vm_normal_page(vma, _address, pteval);
2501 if (unlikely(!page)) 2525 if (unlikely(!page))
2502 goto out_unmap; 2526 goto out_unmap;
2503 /* 2527 /*
2504 * Record which node the original page is from and save this 2528 * Record which node the original page is from and save this
2505 * information to khugepaged_node_load[]. 2529 * information to khugepaged_node_load[].
2506 * Khugepaged will allocate the hugepage from the node with the 2530 * Khugepaged will allocate the hugepage from the node with the
2507 * max hit record. 2531 * max hit record.
2508 */ 2532 */
2509 node = page_to_nid(page); 2533 node = page_to_nid(page);
2534 if (khugepaged_scan_abort(node))
2535 goto out_unmap;
2510 khugepaged_node_load[node]++; 2536 khugepaged_node_load[node]++;
2511 VM_BUG_ON(PageCompound(page)); 2537 VM_BUG_ON(PageCompound(page));
2512 if (!PageLRU(page) || PageLocked(page) || !PageAnon(page)) 2538 if (!PageLRU(page) || PageLocked(page) || !PageAnon(page))
2513 goto out_unmap; 2539 goto out_unmap;
2514 /* cannot use mapcount: can't collapse if there's a gup pin */ 2540 /* cannot use mapcount: can't collapse if there's a gup pin */
2515 if (page_count(page) != 1) 2541 if (page_count(page) != 1)
2516 goto out_unmap; 2542 goto out_unmap;
2517 if (pte_young(pteval) || PageReferenced(page) || 2543 if (pte_young(pteval) || PageReferenced(page) ||
2518 mmu_notifier_test_young(vma->vm_mm, address)) 2544 mmu_notifier_test_young(vma->vm_mm, address))
2519 referenced = 1; 2545 referenced = 1;
2520 } 2546 }
2521 if (referenced) 2547 if (referenced)
2522 ret = 1; 2548 ret = 1;
2523 out_unmap: 2549 out_unmap:
2524 pte_unmap_unlock(pte, ptl); 2550 pte_unmap_unlock(pte, ptl);
2525 if (ret) { 2551 if (ret) {
2526 node = khugepaged_find_target_node(); 2552 node = khugepaged_find_target_node();
2527 /* collapse_huge_page will return with the mmap_sem released */ 2553 /* collapse_huge_page will return with the mmap_sem released */
2528 collapse_huge_page(mm, address, hpage, vma, node); 2554 collapse_huge_page(mm, address, hpage, vma, node);
2529 } 2555 }
2530 out: 2556 out:
2531 return ret; 2557 return ret;
2532 } 2558 }
2533 2559
2534 static void collect_mm_slot(struct mm_slot *mm_slot) 2560 static void collect_mm_slot(struct mm_slot *mm_slot)
2535 { 2561 {
2536 struct mm_struct *mm = mm_slot->mm; 2562 struct mm_struct *mm = mm_slot->mm;
2537 2563
2538 VM_BUG_ON(NR_CPUS != 1 && !spin_is_locked(&khugepaged_mm_lock)); 2564 VM_BUG_ON(NR_CPUS != 1 && !spin_is_locked(&khugepaged_mm_lock));
2539 2565
2540 if (khugepaged_test_exit(mm)) { 2566 if (khugepaged_test_exit(mm)) {
2541 /* free mm_slot */ 2567 /* free mm_slot */
2542 hash_del(&mm_slot->hash); 2568 hash_del(&mm_slot->hash);
2543 list_del(&mm_slot->mm_node); 2569 list_del(&mm_slot->mm_node);
2544 2570
2545 /* 2571 /*
2546 * Not strictly needed because the mm exited already. 2572 * Not strictly needed because the mm exited already.
2547 * 2573 *
2548 * clear_bit(MMF_VM_HUGEPAGE, &mm->flags); 2574 * clear_bit(MMF_VM_HUGEPAGE, &mm->flags);
2549 */ 2575 */
2550 2576
2551 /* khugepaged_mm_lock actually not necessary for the below */ 2577 /* khugepaged_mm_lock actually not necessary for the below */
2552 free_mm_slot(mm_slot); 2578 free_mm_slot(mm_slot);
2553 mmdrop(mm); 2579 mmdrop(mm);
2554 } 2580 }
2555 } 2581 }
2556 2582
2557 static unsigned int khugepaged_scan_mm_slot(unsigned int pages, 2583 static unsigned int khugepaged_scan_mm_slot(unsigned int pages,
2558 struct page **hpage) 2584 struct page **hpage)
2559 __releases(&khugepaged_mm_lock) 2585 __releases(&khugepaged_mm_lock)
2560 __acquires(&khugepaged_mm_lock) 2586 __acquires(&khugepaged_mm_lock)
2561 { 2587 {
2562 struct mm_slot *mm_slot; 2588 struct mm_slot *mm_slot;
2563 struct mm_struct *mm; 2589 struct mm_struct *mm;
2564 struct vm_area_struct *vma; 2590 struct vm_area_struct *vma;
2565 int progress = 0; 2591 int progress = 0;
2566 2592
2567 VM_BUG_ON(!pages); 2593 VM_BUG_ON(!pages);
2568 VM_BUG_ON(NR_CPUS != 1 && !spin_is_locked(&khugepaged_mm_lock)); 2594 VM_BUG_ON(NR_CPUS != 1 && !spin_is_locked(&khugepaged_mm_lock));
2569 2595
2570 if (khugepaged_scan.mm_slot) 2596 if (khugepaged_scan.mm_slot)
2571 mm_slot = khugepaged_scan.mm_slot; 2597 mm_slot = khugepaged_scan.mm_slot;
2572 else { 2598 else {
2573 mm_slot = list_entry(khugepaged_scan.mm_head.next, 2599 mm_slot = list_entry(khugepaged_scan.mm_head.next,
2574 struct mm_slot, mm_node); 2600 struct mm_slot, mm_node);
2575 khugepaged_scan.address = 0; 2601 khugepaged_scan.address = 0;
2576 khugepaged_scan.mm_slot = mm_slot; 2602 khugepaged_scan.mm_slot = mm_slot;
2577 } 2603 }
2578 spin_unlock(&khugepaged_mm_lock); 2604 spin_unlock(&khugepaged_mm_lock);
2579 2605
2580 mm = mm_slot->mm; 2606 mm = mm_slot->mm;
2581 down_read(&mm->mmap_sem); 2607 down_read(&mm->mmap_sem);
2582 if (unlikely(khugepaged_test_exit(mm))) 2608 if (unlikely(khugepaged_test_exit(mm)))
2583 vma = NULL; 2609 vma = NULL;
2584 else 2610 else
2585 vma = find_vma(mm, khugepaged_scan.address); 2611 vma = find_vma(mm, khugepaged_scan.address);
2586 2612
2587 progress++; 2613 progress++;
2588 for (; vma; vma = vma->vm_next) { 2614 for (; vma; vma = vma->vm_next) {
2589 unsigned long hstart, hend; 2615 unsigned long hstart, hend;
2590 2616
2591 cond_resched(); 2617 cond_resched();
2592 if (unlikely(khugepaged_test_exit(mm))) { 2618 if (unlikely(khugepaged_test_exit(mm))) {
2593 progress++; 2619 progress++;
2594 break; 2620 break;
2595 } 2621 }
2596 if (!hugepage_vma_check(vma)) { 2622 if (!hugepage_vma_check(vma)) {
2597 skip: 2623 skip:
2598 progress++; 2624 progress++;
2599 continue; 2625 continue;
2600 } 2626 }
2601 hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; 2627 hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
2602 hend = vma->vm_end & HPAGE_PMD_MASK; 2628 hend = vma->vm_end & HPAGE_PMD_MASK;
2603 if (hstart >= hend) 2629 if (hstart >= hend)
2604 goto skip; 2630 goto skip;
2605 if (khugepaged_scan.address > hend) 2631 if (khugepaged_scan.address > hend)
2606 goto skip; 2632 goto skip;
2607 if (khugepaged_scan.address < hstart) 2633 if (khugepaged_scan.address < hstart)
2608 khugepaged_scan.address = hstart; 2634 khugepaged_scan.address = hstart;
2609 VM_BUG_ON(khugepaged_scan.address & ~HPAGE_PMD_MASK); 2635 VM_BUG_ON(khugepaged_scan.address & ~HPAGE_PMD_MASK);
2610 2636
2611 while (khugepaged_scan.address < hend) { 2637 while (khugepaged_scan.address < hend) {
2612 int ret; 2638 int ret;
2613 cond_resched(); 2639 cond_resched();
2614 if (unlikely(khugepaged_test_exit(mm))) 2640 if (unlikely(khugepaged_test_exit(mm)))
2615 goto breakouterloop; 2641 goto breakouterloop;
2616 2642
2617 VM_BUG_ON(khugepaged_scan.address < hstart || 2643 VM_BUG_ON(khugepaged_scan.address < hstart ||
2618 khugepaged_scan.address + HPAGE_PMD_SIZE > 2644 khugepaged_scan.address + HPAGE_PMD_SIZE >
2619 hend); 2645 hend);
2620 ret = khugepaged_scan_pmd(mm, vma, 2646 ret = khugepaged_scan_pmd(mm, vma,
2621 khugepaged_scan.address, 2647 khugepaged_scan.address,
2622 hpage); 2648 hpage);
2623 /* move to next address */ 2649 /* move to next address */
2624 khugepaged_scan.address += HPAGE_PMD_SIZE; 2650 khugepaged_scan.address += HPAGE_PMD_SIZE;
2625 progress += HPAGE_PMD_NR; 2651 progress += HPAGE_PMD_NR;
2626 if (ret) 2652 if (ret)
2627 /* we released mmap_sem so break the loop */ 2653 /* we released mmap_sem so break the loop */
2628 goto breakouterloop_mmap_sem; 2654 goto breakouterloop_mmap_sem;
2629 if (progress >= pages) 2655 if (progress >= pages)
2630 goto breakouterloop; 2656 goto breakouterloop;
2631 } 2657 }
2632 } 2658 }
2633 breakouterloop: 2659 breakouterloop:
2634 up_read(&mm->mmap_sem); /* exit_mmap will destroy ptes after this */ 2660 up_read(&mm->mmap_sem); /* exit_mmap will destroy ptes after this */
2635 breakouterloop_mmap_sem: 2661 breakouterloop_mmap_sem:
2636 2662
2637 spin_lock(&khugepaged_mm_lock); 2663 spin_lock(&khugepaged_mm_lock);
2638 VM_BUG_ON(khugepaged_scan.mm_slot != mm_slot); 2664 VM_BUG_ON(khugepaged_scan.mm_slot != mm_slot);
2639 /* 2665 /*
2640 * Release the current mm_slot if this mm is about to die, or 2666 * Release the current mm_slot if this mm is about to die, or
2641 * if we scanned all vmas of this mm. 2667 * if we scanned all vmas of this mm.
2642 */ 2668 */
2643 if (khugepaged_test_exit(mm) || !vma) { 2669 if (khugepaged_test_exit(mm) || !vma) {
2644 /* 2670 /*
2645 * Make sure that if mm_users is reaching zero while 2671 * Make sure that if mm_users is reaching zero while
2646 * khugepaged runs here, khugepaged_exit will find 2672 * khugepaged runs here, khugepaged_exit will find
2647 * mm_slot not pointing to the exiting mm. 2673 * mm_slot not pointing to the exiting mm.
2648 */ 2674 */
2649 if (mm_slot->mm_node.next != &khugepaged_scan.mm_head) { 2675 if (mm_slot->mm_node.next != &khugepaged_scan.mm_head) {
2650 khugepaged_scan.mm_slot = list_entry( 2676 khugepaged_scan.mm_slot = list_entry(
2651 mm_slot->mm_node.next, 2677 mm_slot->mm_node.next,
2652 struct mm_slot, mm_node); 2678 struct mm_slot, mm_node);
2653 khugepaged_scan.address = 0; 2679 khugepaged_scan.address = 0;
2654 } else { 2680 } else {
2655 khugepaged_scan.mm_slot = NULL; 2681 khugepaged_scan.mm_slot = NULL;
2656 khugepaged_full_scans++; 2682 khugepaged_full_scans++;
2657 } 2683 }
2658 2684
2659 collect_mm_slot(mm_slot); 2685 collect_mm_slot(mm_slot);
2660 } 2686 }
2661 2687
2662 return progress; 2688 return progress;
2663 } 2689 }
2664 2690
2665 static int khugepaged_has_work(void) 2691 static int khugepaged_has_work(void)
2666 { 2692 {
2667 return !list_empty(&khugepaged_scan.mm_head) && 2693 return !list_empty(&khugepaged_scan.mm_head) &&
2668 khugepaged_enabled(); 2694 khugepaged_enabled();
2669 } 2695 }
2670 2696
2671 static int khugepaged_wait_event(void) 2697 static int khugepaged_wait_event(void)
2672 { 2698 {
2673 return !list_empty(&khugepaged_scan.mm_head) || 2699 return !list_empty(&khugepaged_scan.mm_head) ||
2674 kthread_should_stop(); 2700 kthread_should_stop();
2675 } 2701 }
2676 2702
2677 static void khugepaged_do_scan(void) 2703 static void khugepaged_do_scan(void)
2678 { 2704 {
2679 struct page *hpage = NULL; 2705 struct page *hpage = NULL;
2680 unsigned int progress = 0, pass_through_head = 0; 2706 unsigned int progress = 0, pass_through_head = 0;
2681 unsigned int pages = khugepaged_pages_to_scan; 2707 unsigned int pages = khugepaged_pages_to_scan;
2682 bool wait = true; 2708 bool wait = true;
2683 2709
2684 barrier(); /* write khugepaged_pages_to_scan to local stack */ 2710 barrier(); /* write khugepaged_pages_to_scan to local stack */
2685 2711
2686 while (progress < pages) { 2712 while (progress < pages) {
2687 if (!khugepaged_prealloc_page(&hpage, &wait)) 2713 if (!khugepaged_prealloc_page(&hpage, &wait))
2688 break; 2714 break;
2689 2715
2690 cond_resched(); 2716 cond_resched();
2691 2717
2692 if (unlikely(kthread_should_stop() || freezing(current))) 2718 if (unlikely(kthread_should_stop() || freezing(current)))
2693 break; 2719 break;
2694 2720
2695 spin_lock(&khugepaged_mm_lock); 2721 spin_lock(&khugepaged_mm_lock);
2696 if (!khugepaged_scan.mm_slot) 2722 if (!khugepaged_scan.mm_slot)
2697 pass_through_head++; 2723 pass_through_head++;
2698 if (khugepaged_has_work() && 2724 if (khugepaged_has_work() &&
2699 pass_through_head < 2) 2725 pass_through_head < 2)
2700 progress += khugepaged_scan_mm_slot(pages - progress, 2726 progress += khugepaged_scan_mm_slot(pages - progress,
2701 &hpage); 2727 &hpage);
2702 else 2728 else
2703 progress = pages; 2729 progress = pages;
2704 spin_unlock(&khugepaged_mm_lock); 2730 spin_unlock(&khugepaged_mm_lock);
2705 } 2731 }
2706 2732
2707 if (!IS_ERR_OR_NULL(hpage)) 2733 if (!IS_ERR_OR_NULL(hpage))
2708 put_page(hpage); 2734 put_page(hpage);
2709 } 2735 }
2710 2736
2711 static void khugepaged_wait_work(void) 2737 static void khugepaged_wait_work(void)
2712 { 2738 {
2713 try_to_freeze(); 2739 try_to_freeze();
2714 2740
2715 if (khugepaged_has_work()) { 2741 if (khugepaged_has_work()) {
2716 if (!khugepaged_scan_sleep_millisecs) 2742 if (!khugepaged_scan_sleep_millisecs)
2717 return; 2743 return;
2718 2744
2719 wait_event_freezable_timeout(khugepaged_wait, 2745 wait_event_freezable_timeout(khugepaged_wait,
2720 kthread_should_stop(), 2746 kthread_should_stop(),
2721 msecs_to_jiffies(khugepaged_scan_sleep_millisecs)); 2747 msecs_to_jiffies(khugepaged_scan_sleep_millisecs));
2722 return; 2748 return;
2723 } 2749 }
2724 2750
2725 if (khugepaged_enabled()) 2751 if (khugepaged_enabled())
2726 wait_event_freezable(khugepaged_wait, khugepaged_wait_event()); 2752 wait_event_freezable(khugepaged_wait, khugepaged_wait_event());
2727 } 2753 }
2728 2754
2729 static int khugepaged(void *none) 2755 static int khugepaged(void *none)
2730 { 2756 {
2731 struct mm_slot *mm_slot; 2757 struct mm_slot *mm_slot;
2732 2758
2733 set_freezable(); 2759 set_freezable();
2734 set_user_nice(current, 19); 2760 set_user_nice(current, 19);
2735 2761
2736 while (!kthread_should_stop()) { 2762 while (!kthread_should_stop()) {
2737 khugepaged_do_scan(); 2763 khugepaged_do_scan();
2738 khugepaged_wait_work(); 2764 khugepaged_wait_work();
2739 } 2765 }
2740 2766
2741 spin_lock(&khugepaged_mm_lock); 2767 spin_lock(&khugepaged_mm_lock);
2742 mm_slot = khugepaged_scan.mm_slot; 2768 mm_slot = khugepaged_scan.mm_slot;
2743 khugepaged_scan.mm_slot = NULL; 2769 khugepaged_scan.mm_slot = NULL;
2744 if (mm_slot) 2770 if (mm_slot)
2745 collect_mm_slot(mm_slot); 2771 collect_mm_slot(mm_slot);
2746 spin_unlock(&khugepaged_mm_lock); 2772 spin_unlock(&khugepaged_mm_lock);
2747 return 0; 2773 return 0;
2748 } 2774 }
2749 2775
2750 static void __split_huge_zero_page_pmd(struct vm_area_struct *vma, 2776 static void __split_huge_zero_page_pmd(struct vm_area_struct *vma,
2751 unsigned long haddr, pmd_t *pmd) 2777 unsigned long haddr, pmd_t *pmd)
2752 { 2778 {
2753 struct mm_struct *mm = vma->vm_mm; 2779 struct mm_struct *mm = vma->vm_mm;
2754 pgtable_t pgtable; 2780 pgtable_t pgtable;
2755 pmd_t _pmd; 2781 pmd_t _pmd;
2756 int i; 2782 int i;
2757 2783
2758 pmdp_clear_flush(vma, haddr, pmd); 2784 pmdp_clear_flush(vma, haddr, pmd);
2759 /* leave pmd empty until pte is filled */ 2785 /* leave pmd empty until pte is filled */
2760 2786
2761 pgtable = pgtable_trans_huge_withdraw(mm, pmd); 2787 pgtable = pgtable_trans_huge_withdraw(mm, pmd);
2762 pmd_populate(mm, &_pmd, pgtable); 2788 pmd_populate(mm, &_pmd, pgtable);
2763 2789
2764 for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) { 2790 for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
2765 pte_t *pte, entry; 2791 pte_t *pte, entry;
2766 entry = pfn_pte(my_zero_pfn(haddr), vma->vm_page_prot); 2792 entry = pfn_pte(my_zero_pfn(haddr), vma->vm_page_prot);
2767 entry = pte_mkspecial(entry); 2793 entry = pte_mkspecial(entry);
2768 pte = pte_offset_map(&_pmd, haddr); 2794 pte = pte_offset_map(&_pmd, haddr);
2769 VM_BUG_ON(!pte_none(*pte)); 2795 VM_BUG_ON(!pte_none(*pte));
2770 set_pte_at(mm, haddr, pte, entry); 2796 set_pte_at(mm, haddr, pte, entry);
2771 pte_unmap(pte); 2797 pte_unmap(pte);
2772 } 2798 }
2773 smp_wmb(); /* make pte visible before pmd */ 2799 smp_wmb(); /* make pte visible before pmd */
2774 pmd_populate(mm, pmd, pgtable); 2800 pmd_populate(mm, pmd, pgtable);
2775 put_huge_zero_page(); 2801 put_huge_zero_page();
2776 } 2802 }
2777 2803
2778 void __split_huge_page_pmd(struct vm_area_struct *vma, unsigned long address, 2804 void __split_huge_page_pmd(struct vm_area_struct *vma, unsigned long address,
2779 pmd_t *pmd) 2805 pmd_t *pmd)
2780 { 2806 {
2781 struct page *page; 2807 struct page *page;
2782 struct mm_struct *mm = vma->vm_mm; 2808 struct mm_struct *mm = vma->vm_mm;
2783 unsigned long haddr = address & HPAGE_PMD_MASK; 2809 unsigned long haddr = address & HPAGE_PMD_MASK;
2784 unsigned long mmun_start; /* For mmu_notifiers */ 2810 unsigned long mmun_start; /* For mmu_notifiers */
2785 unsigned long mmun_end; /* For mmu_notifiers */ 2811 unsigned long mmun_end; /* For mmu_notifiers */
2786 2812
2787 BUG_ON(vma->vm_start > haddr || vma->vm_end < haddr + HPAGE_PMD_SIZE); 2813 BUG_ON(vma->vm_start > haddr || vma->vm_end < haddr + HPAGE_PMD_SIZE);
2788 2814
2789 mmun_start = haddr; 2815 mmun_start = haddr;
2790 mmun_end = haddr + HPAGE_PMD_SIZE; 2816 mmun_end = haddr + HPAGE_PMD_SIZE;
2791 again: 2817 again:
2792 mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); 2818 mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
2793 spin_lock(&mm->page_table_lock); 2819 spin_lock(&mm->page_table_lock);
2794 if (unlikely(!pmd_trans_huge(*pmd))) { 2820 if (unlikely(!pmd_trans_huge(*pmd))) {
2795 spin_unlock(&mm->page_table_lock); 2821 spin_unlock(&mm->page_table_lock);
2796 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); 2822 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
2797 return; 2823 return;
2798 } 2824 }
2799 if (is_huge_zero_pmd(*pmd)) { 2825 if (is_huge_zero_pmd(*pmd)) {
2800 __split_huge_zero_page_pmd(vma, haddr, pmd); 2826 __split_huge_zero_page_pmd(vma, haddr, pmd);
2801 spin_unlock(&mm->page_table_lock); 2827 spin_unlock(&mm->page_table_lock);
2802 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); 2828 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
2803 return; 2829 return;
2804 } 2830 }
2805 page = pmd_page(*pmd); 2831 page = pmd_page(*pmd);
2806 VM_BUG_ON(!page_count(page)); 2832 VM_BUG_ON(!page_count(page));
2807 get_page(page); 2833 get_page(page);
2808 spin_unlock(&mm->page_table_lock); 2834 spin_unlock(&mm->page_table_lock);
2809 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); 2835 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
2810 2836
2811 split_huge_page(page); 2837 split_huge_page(page);
2812 2838
2813 put_page(page); 2839 put_page(page);
2814 2840
2815 /* 2841 /*
2816 * We don't always have down_write of mmap_sem here: a racing 2842 * We don't always have down_write of mmap_sem here: a racing
2817 * do_huge_pmd_wp_page() might have copied-on-write to another 2843 * do_huge_pmd_wp_page() might have copied-on-write to another
2818 * huge page before our split_huge_page() got the anon_vma lock. 2844 * huge page before our split_huge_page() got the anon_vma lock.
2819 */ 2845 */
2820 if (unlikely(pmd_trans_huge(*pmd))) 2846 if (unlikely(pmd_trans_huge(*pmd)))
2821 goto again; 2847 goto again;
2822 } 2848 }
2823 2849
2824 void split_huge_page_pmd_mm(struct mm_struct *mm, unsigned long address, 2850 void split_huge_page_pmd_mm(struct mm_struct *mm, unsigned long address,
2825 pmd_t *pmd) 2851 pmd_t *pmd)
2826 { 2852 {
2827 struct vm_area_struct *vma; 2853 struct vm_area_struct *vma;
2828 2854
2829 vma = find_vma(mm, address); 2855 vma = find_vma(mm, address);
2830 BUG_ON(vma == NULL); 2856 BUG_ON(vma == NULL);
2831 split_huge_page_pmd(vma, address, pmd); 2857 split_huge_page_pmd(vma, address, pmd);
2832 } 2858 }
2833 2859
2834 static void split_huge_page_address(struct mm_struct *mm, 2860 static void split_huge_page_address(struct mm_struct *mm,
2835 unsigned long address) 2861 unsigned long address)
2836 { 2862 {
2837 pmd_t *pmd; 2863 pmd_t *pmd;
2838 2864
2839 VM_BUG_ON(!(address & ~HPAGE_PMD_MASK)); 2865 VM_BUG_ON(!(address & ~HPAGE_PMD_MASK));
2840 2866
2841 pmd = mm_find_pmd(mm, address); 2867 pmd = mm_find_pmd(mm, address);
2842 if (!pmd) 2868 if (!pmd)
2843 return; 2869 return;
2844 /* 2870 /*
2845 * Caller holds the mmap_sem write mode, so a huge pmd cannot 2871 * Caller holds the mmap_sem write mode, so a huge pmd cannot
2846 * materialize from under us. 2872 * materialize from under us.
2847 */ 2873 */
2848 split_huge_page_pmd_mm(mm, address, pmd); 2874 split_huge_page_pmd_mm(mm, address, pmd);
2849 } 2875 }
2850 2876
2851 void __vma_adjust_trans_huge(struct vm_area_struct *vma, 2877 void __vma_adjust_trans_huge(struct vm_area_struct *vma,
2852 unsigned long start, 2878 unsigned long start,
2853 unsigned long end, 2879 unsigned long end,
2854 long adjust_next) 2880 long adjust_next)
2855 { 2881 {
2856 /* 2882 /*
2857 * If the new start address isn't hpage aligned and it could 2883 * If the new start address isn't hpage aligned and it could
2858 * previously contain an hugepage: check if we need to split 2884 * previously contain an hugepage: check if we need to split
2859 * an huge pmd. 2885 * an huge pmd.
2860 */ 2886 */
2861 if (start & ~HPAGE_PMD_MASK && 2887 if (start & ~HPAGE_PMD_MASK &&
2862 (start & HPAGE_PMD_MASK) >= vma->vm_start && 2888 (start & HPAGE_PMD_MASK) >= vma->vm_start &&
2863 (start & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= vma->vm_end) 2889 (start & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= vma->vm_end)
2864 split_huge_page_address(vma->vm_mm, start); 2890 split_huge_page_address(vma->vm_mm, start);
2865 2891
2866 /* 2892 /*
2867 * If the new end address isn't hpage aligned and it could 2893 * If the new end address isn't hpage aligned and it could
2868 * previously contain an hugepage: check if we need to split 2894 * previously contain an hugepage: check if we need to split
2869 * an huge pmd. 2895 * an huge pmd.
2870 */ 2896 */
2871 if (end & ~HPAGE_PMD_MASK && 2897 if (end & ~HPAGE_PMD_MASK &&
2872 (end & HPAGE_PMD_MASK) >= vma->vm_start && 2898 (end & HPAGE_PMD_MASK) >= vma->vm_start &&
2873 (end & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= vma->vm_end) 2899 (end & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= vma->vm_end)
2874 split_huge_page_address(vma->vm_mm, end); 2900 split_huge_page_address(vma->vm_mm, end);
2875 2901
2876 /* 2902 /*
2877 * If we're also updating the vma->vm_next->vm_start, if the new 2903 * If we're also updating the vma->vm_next->vm_start, if the new
2878 * vm_next->vm_start isn't page aligned and it could previously 2904 * vm_next->vm_start isn't page aligned and it could previously
2879 * contain an hugepage: check if we need to split an huge pmd. 2905 * contain an hugepage: check if we need to split an huge pmd.
2880 */ 2906 */
2881 if (adjust_next > 0) { 2907 if (adjust_next > 0) {
2882 struct vm_area_struct *next = vma->vm_next; 2908 struct vm_area_struct *next = vma->vm_next;
2883 unsigned long nstart = next->vm_start; 2909 unsigned long nstart = next->vm_start;
2884 nstart += adjust_next << PAGE_SHIFT; 2910 nstart += adjust_next << PAGE_SHIFT;
2885 if (nstart & ~HPAGE_PMD_MASK && 2911 if (nstart & ~HPAGE_PMD_MASK &&
2886 (nstart & HPAGE_PMD_MASK) >= next->vm_start && 2912 (nstart & HPAGE_PMD_MASK) >= next->vm_start &&
2887 (nstart & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= next->vm_end) 2913 (nstart & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= next->vm_end)
2888 split_huge_page_address(next->vm_mm, nstart); 2914 split_huge_page_address(next->vm_mm, nstart);
2889 } 2915 }
2890 } 2916 }
2891 2917