Commit 062f1af2170afe817133d358d900a5f33e3856e4

Authored by Mel Gorman
Committed by Linus Torvalds
1 parent 572043c90d

mm: thp: acquire the anon_vma rwsem for write during split

Zhouping Liu reported the following against 3.8-rc1 when running a mmap
testcase from LTP.

  mapcount 0 page_mapcount 3
  ------------[ cut here ]------------
  kernel BUG at mm/huge_memory.c:1798!
  invalid opcode: 0000 [#1] SMP
  Modules linked in: ip6table_filter ip6_tables ebtable_nat ebtables bnep bluetooth rfkill iptable_mangle ipt_REJECT nf_conntrack_ipv4 nf_defrag_ipv4 xt_conntrack nf_conntrack iptable_filter ip_tables be2iscsi iscsi_boot_sysfs bnx2i cnic uio cxgb4i cxgb4 cxgb3i cxgb3 mdio libcxgbi ib_iser rdma_cm ib_addr iw_cm ib_cm ib_sa ib_mad ib_core iscsi_tcp libiscsi_tcp libiscsi scsi_transport_iscsi vfat fat dm_mirror dm_region_hash dm_log dm_mod cdc_ether iTCO_wdt i7core_edac coretemp usbnet iTCO_vendor_support mii crc32c_intel edac_core lpc_ich shpchp ioatdma mfd_core i2c_i801 pcspkr serio_raw bnx2 microcode dca vhost_net tun macvtap macvlan kvm_intel kvm uinput mgag200 sr_mod cdrom i2c_algo_bit sd_mod drm_kms_helper crc_t10dif ata_generic pata_acpi ttm ata_piix drm libata i2c_core megaraid_sas
  CPU 1
  Pid: 23217, comm: mmap10 Not tainted 3.8.0-rc1mainline+ #17 IBM IBM System x3400 M3 Server -[7379I08]-/69Y4356
  RIP: __split_huge_page+0x677/0x6d0
  RSP: 0000:ffff88017a03fc08  EFLAGS: 00010293
  RAX: 0000000000000003 RBX: ffff88027a6c22e0 RCX: 00000000000034d2
  RDX: 000000000000748b RSI: 0000000000000046 RDI: 0000000000000246
  RBP: ffff88017a03fcb8 R08: ffffffff819d2440 R09: 000000000000054a
  R10: 0000000000aaaaaa R11: 00000000ffffffff R12: 0000000000000000
  R13: 00007f4f11a00000 R14: ffff880179e96e00 R15: ffffea0005c08000
  FS:  00007f4f11f4a740(0000) GS:ffff88017bc20000(0000) knlGS:0000000000000000
  CS:  0010 DS: 0000 ES: 0000 CR0: 000000008005003b
  CR2: 00000037e9ebb404 CR3: 000000017a436000 CR4: 00000000000007e0
  DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
  DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
  Process mmap10 (pid: 23217, threadinfo ffff88017a03e000, task ffff880172dd32e0)
  Stack:
   ffff88017a540ec8 ffff88017a03fc20 ffffffff816017b5 ffff88017a03fc88
   ffffffff812fa014 0000000000000000 ffff880279ebd5c0 00000000f4f11a4c
   00000007f4f11f49 00000007f4f11a00 ffff88017a540ef0 ffff88017a540ee8
  Call Trace:
    split_huge_page+0x68/0xb0
    __split_huge_page_pmd+0x134/0x330
    split_huge_page_pmd_mm+0x51/0x60
    split_huge_page_address+0x3b/0x50
    __vma_adjust_trans_huge+0x9c/0xf0
    vma_adjust+0x684/0x750
    __split_vma.isra.28+0x1fa/0x220
    do_munmap+0xf9/0x420
    vm_munmap+0x4e/0x70
    sys_munmap+0x2b/0x40
    system_call_fastpath+0x16/0x1b

Alexander Beregalov and Alex Xu reported similar bugs and Hillf Danton
identified that commit 5a505085f043 ("mm/rmap: Convert the struct
anon_vma::mutex to an rwsem") and commit 4fc3f1d66b1e ("mm/rmap,
migration: Make rmap_walk_anon() and try_to_unmap_anon() more scalable")
were likely the problem.  Reverting these commits was reported to solve
the problem for Alexander.

Although those commits were made in preparation for NUMA balancing, NUMA
balancing itself is not the direct source of the problem.
split_huge_page() expects the anon_vma lock to be held exclusively to
serialise the whole split operation.  Ordinarily the anon_vma lock is
only required when updating the anon_vma chains (avcs), but THP also
uses the anon_vma rwsem for collapse and split operations, where the
page lock or compound lock cannot be used (the page is changing between
a base page and a THP) and the page table locks are insufficient.

This patch takes the anon_vma lock for write to serialise against a
parallel split_huge_page(), as THP expected before the conversion to an rwsem.
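
In concrete terms, the locking change lands in split_huge_page().  The
following is a minimal sketch of the shape of that change rather than the
exact hunk; it assumes the 3.8-rc-era helpers page_get_anon_vma(),
anon_vma_lock_write() and anon_vma_unlock() introduced or renamed by the
commits above:

  int split_huge_page(struct page *page)
  {
  	struct anon_vma *anon_vma;
  	int ret = 1;	/* 1 means the page was not split */

  	BUG_ON(!PageAnon(page));
  	/*
  	 * Pin the anon_vma so it cannot vanish under us, then take its
  	 * rwsem for write (not read) so that no rmap walk, collapse or
  	 * second split can run while the compound page is torn down.
  	 */
  	anon_vma = page_get_anon_vma(page);
  	if (!anon_vma)
  		goto out;
  	anon_vma_lock_write(anon_vma);

  	/* ... split the compound page, setting ret = 0 on success ... */

  	anon_vma_unlock(anon_vma);
  	put_anon_vma(anon_vma);
  out:
  	return ret;
  }

Compared with page_lock_anon_vma_read(), which only takes the rwsem shared,
the write lock restores the exclusion the old anon_vma mutex provided, at
the cost of serialising splits against rmap walks on the same anon_vma.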

Reported-and-tested-by: Zhouping Liu <zliu@redhat.com>
Reported-by: Alexander Beregalov <a.beregalov@gmail.com>
Reported-by: Alex Xu <alex_y_xu@yahoo.ca>
Signed-off-by: Mel Gorman <mgorman@suse.de>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

Showing 1 changed file (mm/huge_memory.c) with 13 additions and 2 deletions

1 /* 1 /*
2 * Copyright (C) 2009 Red Hat, Inc. 2 * Copyright (C) 2009 Red Hat, Inc.
3 * 3 *
4 * This work is licensed under the terms of the GNU GPL, version 2. See 4 * This work is licensed under the terms of the GNU GPL, version 2. See
5 * the COPYING file in the top-level directory. 5 * the COPYING file in the top-level directory.
6 */ 6 */
7 7
8 #include <linux/mm.h> 8 #include <linux/mm.h>
9 #include <linux/sched.h> 9 #include <linux/sched.h>
10 #include <linux/highmem.h> 10 #include <linux/highmem.h>
11 #include <linux/hugetlb.h> 11 #include <linux/hugetlb.h>
12 #include <linux/mmu_notifier.h> 12 #include <linux/mmu_notifier.h>
13 #include <linux/rmap.h> 13 #include <linux/rmap.h>
14 #include <linux/swap.h> 14 #include <linux/swap.h>
15 #include <linux/shrinker.h> 15 #include <linux/shrinker.h>
16 #include <linux/mm_inline.h> 16 #include <linux/mm_inline.h>
17 #include <linux/kthread.h> 17 #include <linux/kthread.h>
18 #include <linux/khugepaged.h> 18 #include <linux/khugepaged.h>
19 #include <linux/freezer.h> 19 #include <linux/freezer.h>
20 #include <linux/mman.h> 20 #include <linux/mman.h>
21 #include <linux/pagemap.h> 21 #include <linux/pagemap.h>
22 #include <linux/migrate.h> 22 #include <linux/migrate.h>
23 23
24 #include <asm/tlb.h> 24 #include <asm/tlb.h>
25 #include <asm/pgalloc.h> 25 #include <asm/pgalloc.h>
26 #include "internal.h" 26 #include "internal.h"
27 27
28 /* 28 /*
29 * By default transparent hugepage support is enabled for all mappings 29 * By default transparent hugepage support is enabled for all mappings
30 * and khugepaged scans all mappings. Defrag is only invoked by 30 * and khugepaged scans all mappings. Defrag is only invoked by
31 * khugepaged hugepage allocations and by page faults inside 31 * khugepaged hugepage allocations and by page faults inside
32 * MADV_HUGEPAGE regions to avoid the risk of slowing down short lived 32 * MADV_HUGEPAGE regions to avoid the risk of slowing down short lived
33 * allocations. 33 * allocations.
34 */ 34 */
35 unsigned long transparent_hugepage_flags __read_mostly = 35 unsigned long transparent_hugepage_flags __read_mostly =
36 #ifdef CONFIG_TRANSPARENT_HUGEPAGE_ALWAYS 36 #ifdef CONFIG_TRANSPARENT_HUGEPAGE_ALWAYS
37 (1<<TRANSPARENT_HUGEPAGE_FLAG)| 37 (1<<TRANSPARENT_HUGEPAGE_FLAG)|
38 #endif 38 #endif
39 #ifdef CONFIG_TRANSPARENT_HUGEPAGE_MADVISE 39 #ifdef CONFIG_TRANSPARENT_HUGEPAGE_MADVISE
40 (1<<TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG)| 40 (1<<TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG)|
41 #endif 41 #endif
42 (1<<TRANSPARENT_HUGEPAGE_DEFRAG_FLAG)| 42 (1<<TRANSPARENT_HUGEPAGE_DEFRAG_FLAG)|
43 (1<<TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG)| 43 (1<<TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG)|
44 (1<<TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG); 44 (1<<TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);
45 45
46 /* default scan 8*512 pte (or vmas) every 30 second */ 46 /* default scan 8*512 pte (or vmas) every 30 second */
47 static unsigned int khugepaged_pages_to_scan __read_mostly = HPAGE_PMD_NR*8; 47 static unsigned int khugepaged_pages_to_scan __read_mostly = HPAGE_PMD_NR*8;
48 static unsigned int khugepaged_pages_collapsed; 48 static unsigned int khugepaged_pages_collapsed;
49 static unsigned int khugepaged_full_scans; 49 static unsigned int khugepaged_full_scans;
50 static unsigned int khugepaged_scan_sleep_millisecs __read_mostly = 10000; 50 static unsigned int khugepaged_scan_sleep_millisecs __read_mostly = 10000;
51 /* during fragmentation poll the hugepage allocator once every minute */ 51 /* during fragmentation poll the hugepage allocator once every minute */
52 static unsigned int khugepaged_alloc_sleep_millisecs __read_mostly = 60000; 52 static unsigned int khugepaged_alloc_sleep_millisecs __read_mostly = 60000;
53 static struct task_struct *khugepaged_thread __read_mostly; 53 static struct task_struct *khugepaged_thread __read_mostly;
54 static DEFINE_MUTEX(khugepaged_mutex); 54 static DEFINE_MUTEX(khugepaged_mutex);
55 static DEFINE_SPINLOCK(khugepaged_mm_lock); 55 static DEFINE_SPINLOCK(khugepaged_mm_lock);
56 static DECLARE_WAIT_QUEUE_HEAD(khugepaged_wait); 56 static DECLARE_WAIT_QUEUE_HEAD(khugepaged_wait);
57 /* 57 /*
58 * default collapse hugepages if there is at least one pte mapped like 58 * default collapse hugepages if there is at least one pte mapped like
59 * it would have happened if the vma was large enough during page 59 * it would have happened if the vma was large enough during page
60 * fault. 60 * fault.
61 */ 61 */
62 static unsigned int khugepaged_max_ptes_none __read_mostly = HPAGE_PMD_NR-1; 62 static unsigned int khugepaged_max_ptes_none __read_mostly = HPAGE_PMD_NR-1;
63 63
64 static int khugepaged(void *none); 64 static int khugepaged(void *none);
65 static int mm_slots_hash_init(void); 65 static int mm_slots_hash_init(void);
66 static int khugepaged_slab_init(void); 66 static int khugepaged_slab_init(void);
67 static void khugepaged_slab_free(void); 67 static void khugepaged_slab_free(void);
68 68
69 #define MM_SLOTS_HASH_HEADS 1024 69 #define MM_SLOTS_HASH_HEADS 1024
70 static struct hlist_head *mm_slots_hash __read_mostly; 70 static struct hlist_head *mm_slots_hash __read_mostly;
71 static struct kmem_cache *mm_slot_cache __read_mostly; 71 static struct kmem_cache *mm_slot_cache __read_mostly;
72 72
73 /** 73 /**
74 * struct mm_slot - hash lookup from mm to mm_slot 74 * struct mm_slot - hash lookup from mm to mm_slot
75 * @hash: hash collision list 75 * @hash: hash collision list
76 * @mm_node: khugepaged scan list headed in khugepaged_scan.mm_head 76 * @mm_node: khugepaged scan list headed in khugepaged_scan.mm_head
77 * @mm: the mm that this information is valid for 77 * @mm: the mm that this information is valid for
78 */ 78 */
79 struct mm_slot { 79 struct mm_slot {
80 struct hlist_node hash; 80 struct hlist_node hash;
81 struct list_head mm_node; 81 struct list_head mm_node;
82 struct mm_struct *mm; 82 struct mm_struct *mm;
83 }; 83 };
84 84
85 /** 85 /**
86 * struct khugepaged_scan - cursor for scanning 86 * struct khugepaged_scan - cursor for scanning
87 * @mm_head: the head of the mm list to scan 87 * @mm_head: the head of the mm list to scan
88 * @mm_slot: the current mm_slot we are scanning 88 * @mm_slot: the current mm_slot we are scanning
89 * @address: the next address inside that to be scanned 89 * @address: the next address inside that to be scanned
90 * 90 *
91 * There is only the one khugepaged_scan instance of this cursor structure. 91 * There is only the one khugepaged_scan instance of this cursor structure.
92 */ 92 */
93 struct khugepaged_scan { 93 struct khugepaged_scan {
94 struct list_head mm_head; 94 struct list_head mm_head;
95 struct mm_slot *mm_slot; 95 struct mm_slot *mm_slot;
96 unsigned long address; 96 unsigned long address;
97 }; 97 };
98 static struct khugepaged_scan khugepaged_scan = { 98 static struct khugepaged_scan khugepaged_scan = {
99 .mm_head = LIST_HEAD_INIT(khugepaged_scan.mm_head), 99 .mm_head = LIST_HEAD_INIT(khugepaged_scan.mm_head),
100 }; 100 };
101 101
102 102
103 static int set_recommended_min_free_kbytes(void) 103 static int set_recommended_min_free_kbytes(void)
104 { 104 {
105 struct zone *zone; 105 struct zone *zone;
106 int nr_zones = 0; 106 int nr_zones = 0;
107 unsigned long recommended_min; 107 unsigned long recommended_min;
108 extern int min_free_kbytes; 108 extern int min_free_kbytes;
109 109
110 if (!khugepaged_enabled()) 110 if (!khugepaged_enabled())
111 return 0; 111 return 0;
112 112
113 for_each_populated_zone(zone) 113 for_each_populated_zone(zone)
114 nr_zones++; 114 nr_zones++;
115 115
116 /* Make sure at least 2 hugepages are free for MIGRATE_RESERVE */ 116 /* Make sure at least 2 hugepages are free for MIGRATE_RESERVE */
117 recommended_min = pageblock_nr_pages * nr_zones * 2; 117 recommended_min = pageblock_nr_pages * nr_zones * 2;
118 118
119 /* 119 /*
120 * Make sure that on average at least two pageblocks are almost free 120 * Make sure that on average at least two pageblocks are almost free
121 * of another type, one for a migratetype to fall back to and a 121 * of another type, one for a migratetype to fall back to and a
122 * second to avoid subsequent fallbacks of other types There are 3 122 * second to avoid subsequent fallbacks of other types There are 3
123 * MIGRATE_TYPES we care about. 123 * MIGRATE_TYPES we care about.
124 */ 124 */
125 recommended_min += pageblock_nr_pages * nr_zones * 125 recommended_min += pageblock_nr_pages * nr_zones *
126 MIGRATE_PCPTYPES * MIGRATE_PCPTYPES; 126 MIGRATE_PCPTYPES * MIGRATE_PCPTYPES;
127 127
128 /* don't ever allow to reserve more than 5% of the lowmem */ 128 /* don't ever allow to reserve more than 5% of the lowmem */
129 recommended_min = min(recommended_min, 129 recommended_min = min(recommended_min,
130 (unsigned long) nr_free_buffer_pages() / 20); 130 (unsigned long) nr_free_buffer_pages() / 20);
131 recommended_min <<= (PAGE_SHIFT-10); 131 recommended_min <<= (PAGE_SHIFT-10);
132 132
133 if (recommended_min > min_free_kbytes) 133 if (recommended_min > min_free_kbytes)
134 min_free_kbytes = recommended_min; 134 min_free_kbytes = recommended_min;
135 setup_per_zone_wmarks(); 135 setup_per_zone_wmarks();
136 return 0; 136 return 0;
137 } 137 }
138 late_initcall(set_recommended_min_free_kbytes); 138 late_initcall(set_recommended_min_free_kbytes);
139 139
140 static int start_khugepaged(void) 140 static int start_khugepaged(void)
141 { 141 {
142 int err = 0; 142 int err = 0;
143 if (khugepaged_enabled()) { 143 if (khugepaged_enabled()) {
144 if (!khugepaged_thread) 144 if (!khugepaged_thread)
145 khugepaged_thread = kthread_run(khugepaged, NULL, 145 khugepaged_thread = kthread_run(khugepaged, NULL,
146 "khugepaged"); 146 "khugepaged");
147 if (unlikely(IS_ERR(khugepaged_thread))) { 147 if (unlikely(IS_ERR(khugepaged_thread))) {
148 printk(KERN_ERR 148 printk(KERN_ERR
149 "khugepaged: kthread_run(khugepaged) failed\n"); 149 "khugepaged: kthread_run(khugepaged) failed\n");
150 err = PTR_ERR(khugepaged_thread); 150 err = PTR_ERR(khugepaged_thread);
151 khugepaged_thread = NULL; 151 khugepaged_thread = NULL;
152 } 152 }
153 153
154 if (!list_empty(&khugepaged_scan.mm_head)) 154 if (!list_empty(&khugepaged_scan.mm_head))
155 wake_up_interruptible(&khugepaged_wait); 155 wake_up_interruptible(&khugepaged_wait);
156 156
157 set_recommended_min_free_kbytes(); 157 set_recommended_min_free_kbytes();
158 } else if (khugepaged_thread) { 158 } else if (khugepaged_thread) {
159 kthread_stop(khugepaged_thread); 159 kthread_stop(khugepaged_thread);
160 khugepaged_thread = NULL; 160 khugepaged_thread = NULL;
161 } 161 }
162 162
163 return err; 163 return err;
164 } 164 }
165 165
166 static atomic_t huge_zero_refcount; 166 static atomic_t huge_zero_refcount;
167 static unsigned long huge_zero_pfn __read_mostly; 167 static unsigned long huge_zero_pfn __read_mostly;
168 168
169 static inline bool is_huge_zero_pfn(unsigned long pfn) 169 static inline bool is_huge_zero_pfn(unsigned long pfn)
170 { 170 {
171 unsigned long zero_pfn = ACCESS_ONCE(huge_zero_pfn); 171 unsigned long zero_pfn = ACCESS_ONCE(huge_zero_pfn);
172 return zero_pfn && pfn == zero_pfn; 172 return zero_pfn && pfn == zero_pfn;
173 } 173 }
174 174
175 static inline bool is_huge_zero_pmd(pmd_t pmd) 175 static inline bool is_huge_zero_pmd(pmd_t pmd)
176 { 176 {
177 return is_huge_zero_pfn(pmd_pfn(pmd)); 177 return is_huge_zero_pfn(pmd_pfn(pmd));
178 } 178 }
179 179
180 static unsigned long get_huge_zero_page(void) 180 static unsigned long get_huge_zero_page(void)
181 { 181 {
182 struct page *zero_page; 182 struct page *zero_page;
183 retry: 183 retry:
184 if (likely(atomic_inc_not_zero(&huge_zero_refcount))) 184 if (likely(atomic_inc_not_zero(&huge_zero_refcount)))
185 return ACCESS_ONCE(huge_zero_pfn); 185 return ACCESS_ONCE(huge_zero_pfn);
186 186
187 zero_page = alloc_pages((GFP_TRANSHUGE | __GFP_ZERO) & ~__GFP_MOVABLE, 187 zero_page = alloc_pages((GFP_TRANSHUGE | __GFP_ZERO) & ~__GFP_MOVABLE,
188 HPAGE_PMD_ORDER); 188 HPAGE_PMD_ORDER);
189 if (!zero_page) { 189 if (!zero_page) {
190 count_vm_event(THP_ZERO_PAGE_ALLOC_FAILED); 190 count_vm_event(THP_ZERO_PAGE_ALLOC_FAILED);
191 return 0; 191 return 0;
192 } 192 }
193 count_vm_event(THP_ZERO_PAGE_ALLOC); 193 count_vm_event(THP_ZERO_PAGE_ALLOC);
194 preempt_disable(); 194 preempt_disable();
195 if (cmpxchg(&huge_zero_pfn, 0, page_to_pfn(zero_page))) { 195 if (cmpxchg(&huge_zero_pfn, 0, page_to_pfn(zero_page))) {
196 preempt_enable(); 196 preempt_enable();
197 __free_page(zero_page); 197 __free_page(zero_page);
198 goto retry; 198 goto retry;
199 } 199 }
200 200
201 /* We take additional reference here. It will be put back by shrinker */ 201 /* We take additional reference here. It will be put back by shrinker */
202 atomic_set(&huge_zero_refcount, 2); 202 atomic_set(&huge_zero_refcount, 2);
203 preempt_enable(); 203 preempt_enable();
204 return ACCESS_ONCE(huge_zero_pfn); 204 return ACCESS_ONCE(huge_zero_pfn);
205 } 205 }
206 206
207 static void put_huge_zero_page(void) 207 static void put_huge_zero_page(void)
208 { 208 {
209 /* 209 /*
210 * Counter should never go to zero here. Only shrinker can put 210 * Counter should never go to zero here. Only shrinker can put
211 * last reference. 211 * last reference.
212 */ 212 */
213 BUG_ON(atomic_dec_and_test(&huge_zero_refcount)); 213 BUG_ON(atomic_dec_and_test(&huge_zero_refcount));
214 } 214 }
215 215
216 static int shrink_huge_zero_page(struct shrinker *shrink, 216 static int shrink_huge_zero_page(struct shrinker *shrink,
217 struct shrink_control *sc) 217 struct shrink_control *sc)
218 { 218 {
219 if (!sc->nr_to_scan) 219 if (!sc->nr_to_scan)
220 /* we can free zero page only if last reference remains */ 220 /* we can free zero page only if last reference remains */
221 return atomic_read(&huge_zero_refcount) == 1 ? HPAGE_PMD_NR : 0; 221 return atomic_read(&huge_zero_refcount) == 1 ? HPAGE_PMD_NR : 0;
222 222
223 if (atomic_cmpxchg(&huge_zero_refcount, 1, 0) == 1) { 223 if (atomic_cmpxchg(&huge_zero_refcount, 1, 0) == 1) {
224 unsigned long zero_pfn = xchg(&huge_zero_pfn, 0); 224 unsigned long zero_pfn = xchg(&huge_zero_pfn, 0);
225 BUG_ON(zero_pfn == 0); 225 BUG_ON(zero_pfn == 0);
226 __free_page(__pfn_to_page(zero_pfn)); 226 __free_page(__pfn_to_page(zero_pfn));
227 } 227 }
228 228
229 return 0; 229 return 0;
230 } 230 }
231 231
232 static struct shrinker huge_zero_page_shrinker = { 232 static struct shrinker huge_zero_page_shrinker = {
233 .shrink = shrink_huge_zero_page, 233 .shrink = shrink_huge_zero_page,
234 .seeks = DEFAULT_SEEKS, 234 .seeks = DEFAULT_SEEKS,
235 }; 235 };
236 236
237 #ifdef CONFIG_SYSFS 237 #ifdef CONFIG_SYSFS
238 238
239 static ssize_t double_flag_show(struct kobject *kobj, 239 static ssize_t double_flag_show(struct kobject *kobj,
240 struct kobj_attribute *attr, char *buf, 240 struct kobj_attribute *attr, char *buf,
241 enum transparent_hugepage_flag enabled, 241 enum transparent_hugepage_flag enabled,
242 enum transparent_hugepage_flag req_madv) 242 enum transparent_hugepage_flag req_madv)
243 { 243 {
244 if (test_bit(enabled, &transparent_hugepage_flags)) { 244 if (test_bit(enabled, &transparent_hugepage_flags)) {
245 VM_BUG_ON(test_bit(req_madv, &transparent_hugepage_flags)); 245 VM_BUG_ON(test_bit(req_madv, &transparent_hugepage_flags));
246 return sprintf(buf, "[always] madvise never\n"); 246 return sprintf(buf, "[always] madvise never\n");
247 } else if (test_bit(req_madv, &transparent_hugepage_flags)) 247 } else if (test_bit(req_madv, &transparent_hugepage_flags))
248 return sprintf(buf, "always [madvise] never\n"); 248 return sprintf(buf, "always [madvise] never\n");
249 else 249 else
250 return sprintf(buf, "always madvise [never]\n"); 250 return sprintf(buf, "always madvise [never]\n");
251 } 251 }
252 static ssize_t double_flag_store(struct kobject *kobj, 252 static ssize_t double_flag_store(struct kobject *kobj,
253 struct kobj_attribute *attr, 253 struct kobj_attribute *attr,
254 const char *buf, size_t count, 254 const char *buf, size_t count,
255 enum transparent_hugepage_flag enabled, 255 enum transparent_hugepage_flag enabled,
256 enum transparent_hugepage_flag req_madv) 256 enum transparent_hugepage_flag req_madv)
257 { 257 {
258 if (!memcmp("always", buf, 258 if (!memcmp("always", buf,
259 min(sizeof("always")-1, count))) { 259 min(sizeof("always")-1, count))) {
260 set_bit(enabled, &transparent_hugepage_flags); 260 set_bit(enabled, &transparent_hugepage_flags);
261 clear_bit(req_madv, &transparent_hugepage_flags); 261 clear_bit(req_madv, &transparent_hugepage_flags);
262 } else if (!memcmp("madvise", buf, 262 } else if (!memcmp("madvise", buf,
263 min(sizeof("madvise")-1, count))) { 263 min(sizeof("madvise")-1, count))) {
264 clear_bit(enabled, &transparent_hugepage_flags); 264 clear_bit(enabled, &transparent_hugepage_flags);
265 set_bit(req_madv, &transparent_hugepage_flags); 265 set_bit(req_madv, &transparent_hugepage_flags);
266 } else if (!memcmp("never", buf, 266 } else if (!memcmp("never", buf,
267 min(sizeof("never")-1, count))) { 267 min(sizeof("never")-1, count))) {
268 clear_bit(enabled, &transparent_hugepage_flags); 268 clear_bit(enabled, &transparent_hugepage_flags);
269 clear_bit(req_madv, &transparent_hugepage_flags); 269 clear_bit(req_madv, &transparent_hugepage_flags);
270 } else 270 } else
271 return -EINVAL; 271 return -EINVAL;
272 272
273 return count; 273 return count;
274 } 274 }
275 275
276 static ssize_t enabled_show(struct kobject *kobj, 276 static ssize_t enabled_show(struct kobject *kobj,
277 struct kobj_attribute *attr, char *buf) 277 struct kobj_attribute *attr, char *buf)
278 { 278 {
279 return double_flag_show(kobj, attr, buf, 279 return double_flag_show(kobj, attr, buf,
280 TRANSPARENT_HUGEPAGE_FLAG, 280 TRANSPARENT_HUGEPAGE_FLAG,
281 TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG); 281 TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG);
282 } 282 }
283 static ssize_t enabled_store(struct kobject *kobj, 283 static ssize_t enabled_store(struct kobject *kobj,
284 struct kobj_attribute *attr, 284 struct kobj_attribute *attr,
285 const char *buf, size_t count) 285 const char *buf, size_t count)
286 { 286 {
287 ssize_t ret; 287 ssize_t ret;
288 288
289 ret = double_flag_store(kobj, attr, buf, count, 289 ret = double_flag_store(kobj, attr, buf, count,
290 TRANSPARENT_HUGEPAGE_FLAG, 290 TRANSPARENT_HUGEPAGE_FLAG,
291 TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG); 291 TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG);
292 292
293 if (ret > 0) { 293 if (ret > 0) {
294 int err; 294 int err;
295 295
296 mutex_lock(&khugepaged_mutex); 296 mutex_lock(&khugepaged_mutex);
297 err = start_khugepaged(); 297 err = start_khugepaged();
298 mutex_unlock(&khugepaged_mutex); 298 mutex_unlock(&khugepaged_mutex);
299 299
300 if (err) 300 if (err)
301 ret = err; 301 ret = err;
302 } 302 }
303 303
304 return ret; 304 return ret;
305 } 305 }
306 static struct kobj_attribute enabled_attr = 306 static struct kobj_attribute enabled_attr =
307 __ATTR(enabled, 0644, enabled_show, enabled_store); 307 __ATTR(enabled, 0644, enabled_show, enabled_store);
308 308
309 static ssize_t single_flag_show(struct kobject *kobj, 309 static ssize_t single_flag_show(struct kobject *kobj,
310 struct kobj_attribute *attr, char *buf, 310 struct kobj_attribute *attr, char *buf,
311 enum transparent_hugepage_flag flag) 311 enum transparent_hugepage_flag flag)
312 { 312 {
313 return sprintf(buf, "%d\n", 313 return sprintf(buf, "%d\n",
314 !!test_bit(flag, &transparent_hugepage_flags)); 314 !!test_bit(flag, &transparent_hugepage_flags));
315 } 315 }
316 316
317 static ssize_t single_flag_store(struct kobject *kobj, 317 static ssize_t single_flag_store(struct kobject *kobj,
318 struct kobj_attribute *attr, 318 struct kobj_attribute *attr,
319 const char *buf, size_t count, 319 const char *buf, size_t count,
320 enum transparent_hugepage_flag flag) 320 enum transparent_hugepage_flag flag)
321 { 321 {
322 unsigned long value; 322 unsigned long value;
323 int ret; 323 int ret;
324 324
325 ret = kstrtoul(buf, 10, &value); 325 ret = kstrtoul(buf, 10, &value);
326 if (ret < 0) 326 if (ret < 0)
327 return ret; 327 return ret;
328 if (value > 1) 328 if (value > 1)
329 return -EINVAL; 329 return -EINVAL;
330 330
331 if (value) 331 if (value)
332 set_bit(flag, &transparent_hugepage_flags); 332 set_bit(flag, &transparent_hugepage_flags);
333 else 333 else
334 clear_bit(flag, &transparent_hugepage_flags); 334 clear_bit(flag, &transparent_hugepage_flags);
335 335
336 return count; 336 return count;
337 } 337 }
338 338
339 /* 339 /*
340 * Currently defrag only disables __GFP_NOWAIT for allocation. A blind 340 * Currently defrag only disables __GFP_NOWAIT for allocation. A blind
341 * __GFP_REPEAT is too aggressive, it's never worth swapping tons of 341 * __GFP_REPEAT is too aggressive, it's never worth swapping tons of
342 * memory just to allocate one more hugepage. 342 * memory just to allocate one more hugepage.
343 */ 343 */
344 static ssize_t defrag_show(struct kobject *kobj, 344 static ssize_t defrag_show(struct kobject *kobj,
345 struct kobj_attribute *attr, char *buf) 345 struct kobj_attribute *attr, char *buf)
346 { 346 {
347 return double_flag_show(kobj, attr, buf, 347 return double_flag_show(kobj, attr, buf,
348 TRANSPARENT_HUGEPAGE_DEFRAG_FLAG, 348 TRANSPARENT_HUGEPAGE_DEFRAG_FLAG,
349 TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG); 349 TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG);
350 } 350 }
351 static ssize_t defrag_store(struct kobject *kobj, 351 static ssize_t defrag_store(struct kobject *kobj,
352 struct kobj_attribute *attr, 352 struct kobj_attribute *attr,
353 const char *buf, size_t count) 353 const char *buf, size_t count)
354 { 354 {
355 return double_flag_store(kobj, attr, buf, count, 355 return double_flag_store(kobj, attr, buf, count,
356 TRANSPARENT_HUGEPAGE_DEFRAG_FLAG, 356 TRANSPARENT_HUGEPAGE_DEFRAG_FLAG,
357 TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG); 357 TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG);
358 } 358 }
359 static struct kobj_attribute defrag_attr = 359 static struct kobj_attribute defrag_attr =
360 __ATTR(defrag, 0644, defrag_show, defrag_store); 360 __ATTR(defrag, 0644, defrag_show, defrag_store);
361 361
362 static ssize_t use_zero_page_show(struct kobject *kobj, 362 static ssize_t use_zero_page_show(struct kobject *kobj,
363 struct kobj_attribute *attr, char *buf) 363 struct kobj_attribute *attr, char *buf)
364 { 364 {
365 return single_flag_show(kobj, attr, buf, 365 return single_flag_show(kobj, attr, buf,
366 TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG); 366 TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);
367 } 367 }
368 static ssize_t use_zero_page_store(struct kobject *kobj, 368 static ssize_t use_zero_page_store(struct kobject *kobj,
369 struct kobj_attribute *attr, const char *buf, size_t count) 369 struct kobj_attribute *attr, const char *buf, size_t count)
370 { 370 {
371 return single_flag_store(kobj, attr, buf, count, 371 return single_flag_store(kobj, attr, buf, count,
372 TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG); 372 TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);
373 } 373 }
374 static struct kobj_attribute use_zero_page_attr = 374 static struct kobj_attribute use_zero_page_attr =
375 __ATTR(use_zero_page, 0644, use_zero_page_show, use_zero_page_store); 375 __ATTR(use_zero_page, 0644, use_zero_page_show, use_zero_page_store);
376 #ifdef CONFIG_DEBUG_VM 376 #ifdef CONFIG_DEBUG_VM
377 static ssize_t debug_cow_show(struct kobject *kobj, 377 static ssize_t debug_cow_show(struct kobject *kobj,
378 struct kobj_attribute *attr, char *buf) 378 struct kobj_attribute *attr, char *buf)
379 { 379 {
380 return single_flag_show(kobj, attr, buf, 380 return single_flag_show(kobj, attr, buf,
381 TRANSPARENT_HUGEPAGE_DEBUG_COW_FLAG); 381 TRANSPARENT_HUGEPAGE_DEBUG_COW_FLAG);
382 } 382 }
383 static ssize_t debug_cow_store(struct kobject *kobj, 383 static ssize_t debug_cow_store(struct kobject *kobj,
384 struct kobj_attribute *attr, 384 struct kobj_attribute *attr,
385 const char *buf, size_t count) 385 const char *buf, size_t count)
386 { 386 {
387 return single_flag_store(kobj, attr, buf, count, 387 return single_flag_store(kobj, attr, buf, count,
388 TRANSPARENT_HUGEPAGE_DEBUG_COW_FLAG); 388 TRANSPARENT_HUGEPAGE_DEBUG_COW_FLAG);
389 } 389 }
390 static struct kobj_attribute debug_cow_attr = 390 static struct kobj_attribute debug_cow_attr =
391 __ATTR(debug_cow, 0644, debug_cow_show, debug_cow_store); 391 __ATTR(debug_cow, 0644, debug_cow_show, debug_cow_store);
392 #endif /* CONFIG_DEBUG_VM */ 392 #endif /* CONFIG_DEBUG_VM */
393 393
394 static struct attribute *hugepage_attr[] = { 394 static struct attribute *hugepage_attr[] = {
395 &enabled_attr.attr, 395 &enabled_attr.attr,
396 &defrag_attr.attr, 396 &defrag_attr.attr,
397 &use_zero_page_attr.attr, 397 &use_zero_page_attr.attr,
398 #ifdef CONFIG_DEBUG_VM 398 #ifdef CONFIG_DEBUG_VM
399 &debug_cow_attr.attr, 399 &debug_cow_attr.attr,
400 #endif 400 #endif
401 NULL, 401 NULL,
402 }; 402 };
403 403
404 static struct attribute_group hugepage_attr_group = { 404 static struct attribute_group hugepage_attr_group = {
405 .attrs = hugepage_attr, 405 .attrs = hugepage_attr,
406 }; 406 };
407 407
408 static ssize_t scan_sleep_millisecs_show(struct kobject *kobj, 408 static ssize_t scan_sleep_millisecs_show(struct kobject *kobj,
409 struct kobj_attribute *attr, 409 struct kobj_attribute *attr,
410 char *buf) 410 char *buf)
411 { 411 {
412 return sprintf(buf, "%u\n", khugepaged_scan_sleep_millisecs); 412 return sprintf(buf, "%u\n", khugepaged_scan_sleep_millisecs);
413 } 413 }
414 414
415 static ssize_t scan_sleep_millisecs_store(struct kobject *kobj, 415 static ssize_t scan_sleep_millisecs_store(struct kobject *kobj,
416 struct kobj_attribute *attr, 416 struct kobj_attribute *attr,
417 const char *buf, size_t count) 417 const char *buf, size_t count)
418 { 418 {
419 unsigned long msecs; 419 unsigned long msecs;
420 int err; 420 int err;
421 421
422 err = strict_strtoul(buf, 10, &msecs); 422 err = strict_strtoul(buf, 10, &msecs);
423 if (err || msecs > UINT_MAX) 423 if (err || msecs > UINT_MAX)
424 return -EINVAL; 424 return -EINVAL;
425 425
426 khugepaged_scan_sleep_millisecs = msecs; 426 khugepaged_scan_sleep_millisecs = msecs;
427 wake_up_interruptible(&khugepaged_wait); 427 wake_up_interruptible(&khugepaged_wait);
428 428
429 return count; 429 return count;
430 } 430 }
431 static struct kobj_attribute scan_sleep_millisecs_attr = 431 static struct kobj_attribute scan_sleep_millisecs_attr =
432 __ATTR(scan_sleep_millisecs, 0644, scan_sleep_millisecs_show, 432 __ATTR(scan_sleep_millisecs, 0644, scan_sleep_millisecs_show,
433 scan_sleep_millisecs_store); 433 scan_sleep_millisecs_store);
434 434
435 static ssize_t alloc_sleep_millisecs_show(struct kobject *kobj, 435 static ssize_t alloc_sleep_millisecs_show(struct kobject *kobj,
436 struct kobj_attribute *attr, 436 struct kobj_attribute *attr,
437 char *buf) 437 char *buf)
438 { 438 {
439 return sprintf(buf, "%u\n", khugepaged_alloc_sleep_millisecs); 439 return sprintf(buf, "%u\n", khugepaged_alloc_sleep_millisecs);
440 } 440 }
441 441
442 static ssize_t alloc_sleep_millisecs_store(struct kobject *kobj, 442 static ssize_t alloc_sleep_millisecs_store(struct kobject *kobj,
443 struct kobj_attribute *attr, 443 struct kobj_attribute *attr,
444 const char *buf, size_t count) 444 const char *buf, size_t count)
445 { 445 {
446 unsigned long msecs; 446 unsigned long msecs;
447 int err; 447 int err;
448 448
449 err = strict_strtoul(buf, 10, &msecs); 449 err = strict_strtoul(buf, 10, &msecs);
450 if (err || msecs > UINT_MAX) 450 if (err || msecs > UINT_MAX)
451 return -EINVAL; 451 return -EINVAL;
452 452
453 khugepaged_alloc_sleep_millisecs = msecs; 453 khugepaged_alloc_sleep_millisecs = msecs;
454 wake_up_interruptible(&khugepaged_wait); 454 wake_up_interruptible(&khugepaged_wait);
455 455
456 return count; 456 return count;
457 } 457 }
458 static struct kobj_attribute alloc_sleep_millisecs_attr = 458 static struct kobj_attribute alloc_sleep_millisecs_attr =
459 __ATTR(alloc_sleep_millisecs, 0644, alloc_sleep_millisecs_show, 459 __ATTR(alloc_sleep_millisecs, 0644, alloc_sleep_millisecs_show,
460 alloc_sleep_millisecs_store); 460 alloc_sleep_millisecs_store);
461 461
462 static ssize_t pages_to_scan_show(struct kobject *kobj, 462 static ssize_t pages_to_scan_show(struct kobject *kobj,
463 struct kobj_attribute *attr, 463 struct kobj_attribute *attr,
464 char *buf) 464 char *buf)
465 { 465 {
466 return sprintf(buf, "%u\n", khugepaged_pages_to_scan); 466 return sprintf(buf, "%u\n", khugepaged_pages_to_scan);
467 } 467 }
468 static ssize_t pages_to_scan_store(struct kobject *kobj, 468 static ssize_t pages_to_scan_store(struct kobject *kobj,
469 struct kobj_attribute *attr, 469 struct kobj_attribute *attr,
470 const char *buf, size_t count) 470 const char *buf, size_t count)
471 { 471 {
472 int err; 472 int err;
473 unsigned long pages; 473 unsigned long pages;
474 474
475 err = strict_strtoul(buf, 10, &pages); 475 err = strict_strtoul(buf, 10, &pages);
476 if (err || !pages || pages > UINT_MAX) 476 if (err || !pages || pages > UINT_MAX)
477 return -EINVAL; 477 return -EINVAL;
478 478
479 khugepaged_pages_to_scan = pages; 479 khugepaged_pages_to_scan = pages;
480 480
481 return count; 481 return count;
482 } 482 }
483 static struct kobj_attribute pages_to_scan_attr = 483 static struct kobj_attribute pages_to_scan_attr =
484 __ATTR(pages_to_scan, 0644, pages_to_scan_show, 484 __ATTR(pages_to_scan, 0644, pages_to_scan_show,
485 pages_to_scan_store); 485 pages_to_scan_store);
486 486
487 static ssize_t pages_collapsed_show(struct kobject *kobj, 487 static ssize_t pages_collapsed_show(struct kobject *kobj,
488 struct kobj_attribute *attr, 488 struct kobj_attribute *attr,
489 char *buf) 489 char *buf)
490 { 490 {
491 return sprintf(buf, "%u\n", khugepaged_pages_collapsed); 491 return sprintf(buf, "%u\n", khugepaged_pages_collapsed);
492 } 492 }
493 static struct kobj_attribute pages_collapsed_attr = 493 static struct kobj_attribute pages_collapsed_attr =
494 __ATTR_RO(pages_collapsed); 494 __ATTR_RO(pages_collapsed);
495 495
496 static ssize_t full_scans_show(struct kobject *kobj, 496 static ssize_t full_scans_show(struct kobject *kobj,
497 struct kobj_attribute *attr, 497 struct kobj_attribute *attr,
498 char *buf) 498 char *buf)
499 { 499 {
500 return sprintf(buf, "%u\n", khugepaged_full_scans); 500 return sprintf(buf, "%u\n", khugepaged_full_scans);
501 } 501 }
502 static struct kobj_attribute full_scans_attr = 502 static struct kobj_attribute full_scans_attr =
503 __ATTR_RO(full_scans); 503 __ATTR_RO(full_scans);
504 504
505 static ssize_t khugepaged_defrag_show(struct kobject *kobj, 505 static ssize_t khugepaged_defrag_show(struct kobject *kobj,
506 struct kobj_attribute *attr, char *buf) 506 struct kobj_attribute *attr, char *buf)
507 { 507 {
508 return single_flag_show(kobj, attr, buf, 508 return single_flag_show(kobj, attr, buf,
509 TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG); 509 TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG);
510 } 510 }
511 static ssize_t khugepaged_defrag_store(struct kobject *kobj, 511 static ssize_t khugepaged_defrag_store(struct kobject *kobj,
512 struct kobj_attribute *attr, 512 struct kobj_attribute *attr,
513 const char *buf, size_t count) 513 const char *buf, size_t count)
514 { 514 {
515 return single_flag_store(kobj, attr, buf, count, 515 return single_flag_store(kobj, attr, buf, count,
516 TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG); 516 TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG);
517 } 517 }
518 static struct kobj_attribute khugepaged_defrag_attr = 518 static struct kobj_attribute khugepaged_defrag_attr =
519 __ATTR(defrag, 0644, khugepaged_defrag_show, 519 __ATTR(defrag, 0644, khugepaged_defrag_show,
520 khugepaged_defrag_store); 520 khugepaged_defrag_store);
521 521
522 /* 522 /*
523 * max_ptes_none controls if khugepaged should collapse hugepages over 523 * max_ptes_none controls if khugepaged should collapse hugepages over
524 * any unmapped ptes in turn potentially increasing the memory 524 * any unmapped ptes in turn potentially increasing the memory
525 * footprint of the vmas. When max_ptes_none is 0 khugepaged will not 525 * footprint of the vmas. When max_ptes_none is 0 khugepaged will not
526 * reduce the available free memory in the system as it 526 * reduce the available free memory in the system as it
527 * runs. Increasing max_ptes_none will instead potentially reduce the 527 * runs. Increasing max_ptes_none will instead potentially reduce the
528 * free memory in the system during the khugepaged scan. 528 * free memory in the system during the khugepaged scan.
529 */ 529 */
530 static ssize_t khugepaged_max_ptes_none_show(struct kobject *kobj, 530 static ssize_t khugepaged_max_ptes_none_show(struct kobject *kobj,
531 struct kobj_attribute *attr, 531 struct kobj_attribute *attr,
532 char *buf) 532 char *buf)
533 { 533 {
534 return sprintf(buf, "%u\n", khugepaged_max_ptes_none); 534 return sprintf(buf, "%u\n", khugepaged_max_ptes_none);
535 } 535 }
536 static ssize_t khugepaged_max_ptes_none_store(struct kobject *kobj, 536 static ssize_t khugepaged_max_ptes_none_store(struct kobject *kobj,
537 struct kobj_attribute *attr, 537 struct kobj_attribute *attr,
538 const char *buf, size_t count) 538 const char *buf, size_t count)
539 { 539 {
540 int err; 540 int err;
541 unsigned long max_ptes_none; 541 unsigned long max_ptes_none;
542 542
543 err = strict_strtoul(buf, 10, &max_ptes_none); 543 err = strict_strtoul(buf, 10, &max_ptes_none);
544 if (err || max_ptes_none > HPAGE_PMD_NR-1) 544 if (err || max_ptes_none > HPAGE_PMD_NR-1)
545 return -EINVAL; 545 return -EINVAL;
546 546
547 khugepaged_max_ptes_none = max_ptes_none; 547 khugepaged_max_ptes_none = max_ptes_none;
548 548
549 return count; 549 return count;
550 } 550 }
551 static struct kobj_attribute khugepaged_max_ptes_none_attr = 551 static struct kobj_attribute khugepaged_max_ptes_none_attr =
552 __ATTR(max_ptes_none, 0644, khugepaged_max_ptes_none_show, 552 __ATTR(max_ptes_none, 0644, khugepaged_max_ptes_none_show,
553 khugepaged_max_ptes_none_store); 553 khugepaged_max_ptes_none_store);
554 554
555 static struct attribute *khugepaged_attr[] = { 555 static struct attribute *khugepaged_attr[] = {
556 &khugepaged_defrag_attr.attr, 556 &khugepaged_defrag_attr.attr,
557 &khugepaged_max_ptes_none_attr.attr, 557 &khugepaged_max_ptes_none_attr.attr,
558 &pages_to_scan_attr.attr, 558 &pages_to_scan_attr.attr,
559 &pages_collapsed_attr.attr, 559 &pages_collapsed_attr.attr,
560 &full_scans_attr.attr, 560 &full_scans_attr.attr,
561 &scan_sleep_millisecs_attr.attr, 561 &scan_sleep_millisecs_attr.attr,
562 &alloc_sleep_millisecs_attr.attr, 562 &alloc_sleep_millisecs_attr.attr,
563 NULL, 563 NULL,
564 }; 564 };
565 565
566 static struct attribute_group khugepaged_attr_group = { 566 static struct attribute_group khugepaged_attr_group = {
567 .attrs = khugepaged_attr, 567 .attrs = khugepaged_attr,
568 .name = "khugepaged", 568 .name = "khugepaged",
569 }; 569 };
570 570
571 static int __init hugepage_init_sysfs(struct kobject **hugepage_kobj) 571 static int __init hugepage_init_sysfs(struct kobject **hugepage_kobj)
572 { 572 {
573 int err; 573 int err;
574 574
575 *hugepage_kobj = kobject_create_and_add("transparent_hugepage", mm_kobj); 575 *hugepage_kobj = kobject_create_and_add("transparent_hugepage", mm_kobj);
576 if (unlikely(!*hugepage_kobj)) { 576 if (unlikely(!*hugepage_kobj)) {
577 printk(KERN_ERR "hugepage: failed to create transparent hugepage kobject\n"); 577 printk(KERN_ERR "hugepage: failed to create transparent hugepage kobject\n");
578 return -ENOMEM; 578 return -ENOMEM;
579 } 579 }
580 580
581 err = sysfs_create_group(*hugepage_kobj, &hugepage_attr_group); 581 err = sysfs_create_group(*hugepage_kobj, &hugepage_attr_group);
582 if (err) { 582 if (err) {
583 printk(KERN_ERR "hugepage: failed to register transparent hugepage group\n"); 583 printk(KERN_ERR "hugepage: failed to register transparent hugepage group\n");
584 goto delete_obj; 584 goto delete_obj;
585 } 585 }
586 586
587 err = sysfs_create_group(*hugepage_kobj, &khugepaged_attr_group); 587 err = sysfs_create_group(*hugepage_kobj, &khugepaged_attr_group);
588 if (err) { 588 if (err) {
589 printk(KERN_ERR "hugepage: failed to register transparent hugepage group\n"); 589 printk(KERN_ERR "hugepage: failed to register transparent hugepage group\n");
590 goto remove_hp_group; 590 goto remove_hp_group;
591 } 591 }
592 592
593 return 0; 593 return 0;
594 594
595 remove_hp_group: 595 remove_hp_group:
596 sysfs_remove_group(*hugepage_kobj, &hugepage_attr_group); 596 sysfs_remove_group(*hugepage_kobj, &hugepage_attr_group);
597 delete_obj: 597 delete_obj:
598 kobject_put(*hugepage_kobj); 598 kobject_put(*hugepage_kobj);
599 return err; 599 return err;
600 } 600 }
601 601
602 static void __init hugepage_exit_sysfs(struct kobject *hugepage_kobj) 602 static void __init hugepage_exit_sysfs(struct kobject *hugepage_kobj)
603 { 603 {
604 sysfs_remove_group(hugepage_kobj, &khugepaged_attr_group); 604 sysfs_remove_group(hugepage_kobj, &khugepaged_attr_group);
605 sysfs_remove_group(hugepage_kobj, &hugepage_attr_group); 605 sysfs_remove_group(hugepage_kobj, &hugepage_attr_group);
606 kobject_put(hugepage_kobj); 606 kobject_put(hugepage_kobj);
607 } 607 }
608 #else 608 #else
609 static inline int hugepage_init_sysfs(struct kobject **hugepage_kobj) 609 static inline int hugepage_init_sysfs(struct kobject **hugepage_kobj)
610 { 610 {
611 return 0; 611 return 0;
612 } 612 }
613 613
614 static inline void hugepage_exit_sysfs(struct kobject *hugepage_kobj) 614 static inline void hugepage_exit_sysfs(struct kobject *hugepage_kobj)
615 { 615 {
616 } 616 }
617 #endif /* CONFIG_SYSFS */ 617 #endif /* CONFIG_SYSFS */
618 618
619 static int __init hugepage_init(void) 619 static int __init hugepage_init(void)
620 { 620 {
621 int err; 621 int err;
622 struct kobject *hugepage_kobj; 622 struct kobject *hugepage_kobj;
623 623
624 if (!has_transparent_hugepage()) { 624 if (!has_transparent_hugepage()) {
625 transparent_hugepage_flags = 0; 625 transparent_hugepage_flags = 0;
626 return -EINVAL; 626 return -EINVAL;
627 } 627 }
628 628
629 err = hugepage_init_sysfs(&hugepage_kobj); 629 err = hugepage_init_sysfs(&hugepage_kobj);
630 if (err) 630 if (err)
631 return err; 631 return err;
632 632
633 err = khugepaged_slab_init(); 633 err = khugepaged_slab_init();
634 if (err) 634 if (err)
635 goto out; 635 goto out;
636 636
637 err = mm_slots_hash_init(); 637 err = mm_slots_hash_init();
638 if (err) { 638 if (err) {
639 khugepaged_slab_free(); 639 khugepaged_slab_free();
640 goto out; 640 goto out;
641 } 641 }
642 642
643 register_shrinker(&huge_zero_page_shrinker); 643 register_shrinker(&huge_zero_page_shrinker);
644 644
645 /* 645 /*
646 * By default disable transparent hugepages on smaller systems, 646 * By default disable transparent hugepages on smaller systems,
647 * where the extra memory used could hurt more than TLB overhead 647 * where the extra memory used could hurt more than TLB overhead
648 * is likely to save. The admin can still enable it through /sys. 648 * is likely to save. The admin can still enable it through /sys.
649 */ 649 */
650 if (totalram_pages < (512 << (20 - PAGE_SHIFT))) 650 if (totalram_pages < (512 << (20 - PAGE_SHIFT)))
651 transparent_hugepage_flags = 0; 651 transparent_hugepage_flags = 0;
652 652
653 start_khugepaged(); 653 start_khugepaged();
654 654
655 return 0; 655 return 0;
656 out: 656 out:
657 hugepage_exit_sysfs(hugepage_kobj); 657 hugepage_exit_sysfs(hugepage_kobj);
658 return err; 658 return err;
659 } 659 }
660 module_init(hugepage_init) 660 module_init(hugepage_init)
661 661
662 static int __init setup_transparent_hugepage(char *str) 662 static int __init setup_transparent_hugepage(char *str)
663 { 663 {
664 int ret = 0; 664 int ret = 0;
665 if (!str) 665 if (!str)
666 goto out; 666 goto out;
667 if (!strcmp(str, "always")) { 667 if (!strcmp(str, "always")) {
668 set_bit(TRANSPARENT_HUGEPAGE_FLAG, 668 set_bit(TRANSPARENT_HUGEPAGE_FLAG,
669 &transparent_hugepage_flags); 669 &transparent_hugepage_flags);
670 clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, 670 clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
671 &transparent_hugepage_flags); 671 &transparent_hugepage_flags);
672 ret = 1; 672 ret = 1;
673 } else if (!strcmp(str, "madvise")) { 673 } else if (!strcmp(str, "madvise")) {
674 clear_bit(TRANSPARENT_HUGEPAGE_FLAG, 674 clear_bit(TRANSPARENT_HUGEPAGE_FLAG,
675 &transparent_hugepage_flags); 675 &transparent_hugepage_flags);
676 set_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, 676 set_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
677 &transparent_hugepage_flags); 677 &transparent_hugepage_flags);
678 ret = 1; 678 ret = 1;
679 } else if (!strcmp(str, "never")) { 679 } else if (!strcmp(str, "never")) {
680 clear_bit(TRANSPARENT_HUGEPAGE_FLAG, 680 clear_bit(TRANSPARENT_HUGEPAGE_FLAG,
681 &transparent_hugepage_flags); 681 &transparent_hugepage_flags);
682 clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, 682 clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
683 &transparent_hugepage_flags); 683 &transparent_hugepage_flags);
684 ret = 1; 684 ret = 1;
685 } 685 }
686 out: 686 out:
687 if (!ret) 687 if (!ret)
688 printk(KERN_WARNING 688 printk(KERN_WARNING
689 "transparent_hugepage= cannot parse, ignored\n"); 689 "transparent_hugepage= cannot parse, ignored\n");
690 return ret; 690 return ret;
691 } 691 }
692 __setup("transparent_hugepage=", setup_transparent_hugepage); 692 __setup("transparent_hugepage=", setup_transparent_hugepage);
693 693
694 pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma) 694 pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma)
695 { 695 {
696 if (likely(vma->vm_flags & VM_WRITE)) 696 if (likely(vma->vm_flags & VM_WRITE))
697 pmd = pmd_mkwrite(pmd); 697 pmd = pmd_mkwrite(pmd);
698 return pmd; 698 return pmd;
699 } 699 }
700 700
701 static inline pmd_t mk_huge_pmd(struct page *page, struct vm_area_struct *vma) 701 static inline pmd_t mk_huge_pmd(struct page *page, struct vm_area_struct *vma)
702 { 702 {
703 pmd_t entry; 703 pmd_t entry;
704 entry = mk_pmd(page, vma->vm_page_prot); 704 entry = mk_pmd(page, vma->vm_page_prot);
705 entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); 705 entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
706 entry = pmd_mkhuge(entry); 706 entry = pmd_mkhuge(entry);
707 return entry; 707 return entry;
708 } 708 }
709 709
710 static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, 710 static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
711 struct vm_area_struct *vma, 711 struct vm_area_struct *vma,
712 unsigned long haddr, pmd_t *pmd, 712 unsigned long haddr, pmd_t *pmd,
713 struct page *page) 713 struct page *page)
714 { 714 {
715 pgtable_t pgtable; 715 pgtable_t pgtable;
716 716
717 VM_BUG_ON(!PageCompound(page)); 717 VM_BUG_ON(!PageCompound(page));
718 pgtable = pte_alloc_one(mm, haddr); 718 pgtable = pte_alloc_one(mm, haddr);
719 if (unlikely(!pgtable)) 719 if (unlikely(!pgtable))
720 return VM_FAULT_OOM; 720 return VM_FAULT_OOM;
721 721
722 clear_huge_page(page, haddr, HPAGE_PMD_NR); 722 clear_huge_page(page, haddr, HPAGE_PMD_NR);
723 __SetPageUptodate(page); 723 __SetPageUptodate(page);
724 724
725 spin_lock(&mm->page_table_lock); 725 spin_lock(&mm->page_table_lock);
726 if (unlikely(!pmd_none(*pmd))) { 726 if (unlikely(!pmd_none(*pmd))) {
727 spin_unlock(&mm->page_table_lock); 727 spin_unlock(&mm->page_table_lock);
728 mem_cgroup_uncharge_page(page); 728 mem_cgroup_uncharge_page(page);
729 put_page(page); 729 put_page(page);
730 pte_free(mm, pgtable); 730 pte_free(mm, pgtable);
731 } else { 731 } else {
732 pmd_t entry; 732 pmd_t entry;
733 entry = mk_huge_pmd(page, vma); 733 entry = mk_huge_pmd(page, vma);
734 /* 734 /*
735 * The spinlocking to take the lru_lock inside 735 * The spinlocking to take the lru_lock inside
736 * page_add_new_anon_rmap() acts as a full memory 736 * page_add_new_anon_rmap() acts as a full memory
737 * barrier to be sure clear_huge_page writes become 737 * barrier to be sure clear_huge_page writes become
738 * visible after the set_pmd_at() write. 738 * visible after the set_pmd_at() write.
739 */ 739 */
740 page_add_new_anon_rmap(page, vma, haddr); 740 page_add_new_anon_rmap(page, vma, haddr);
741 set_pmd_at(mm, haddr, pmd, entry); 741 set_pmd_at(mm, haddr, pmd, entry);
742 pgtable_trans_huge_deposit(mm, pgtable); 742 pgtable_trans_huge_deposit(mm, pgtable);
743 add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR); 743 add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR);
744 mm->nr_ptes++; 744 mm->nr_ptes++;
745 spin_unlock(&mm->page_table_lock); 745 spin_unlock(&mm->page_table_lock);
746 } 746 }
747 747
748 return 0; 748 return 0;
749 } 749 }
750 750
751 static inline gfp_t alloc_hugepage_gfpmask(int defrag, gfp_t extra_gfp) 751 static inline gfp_t alloc_hugepage_gfpmask(int defrag, gfp_t extra_gfp)
752 { 752 {
753 return (GFP_TRANSHUGE & ~(defrag ? 0 : __GFP_WAIT)) | extra_gfp; 753 return (GFP_TRANSHUGE & ~(defrag ? 0 : __GFP_WAIT)) | extra_gfp;
754 } 754 }
755 755
756 static inline struct page *alloc_hugepage_vma(int defrag, 756 static inline struct page *alloc_hugepage_vma(int defrag,
757 struct vm_area_struct *vma, 757 struct vm_area_struct *vma,
758 unsigned long haddr, int nd, 758 unsigned long haddr, int nd,
759 gfp_t extra_gfp) 759 gfp_t extra_gfp)
760 { 760 {
761 return alloc_pages_vma(alloc_hugepage_gfpmask(defrag, extra_gfp), 761 return alloc_pages_vma(alloc_hugepage_gfpmask(defrag, extra_gfp),
762 HPAGE_PMD_ORDER, vma, haddr, nd); 762 HPAGE_PMD_ORDER, vma, haddr, nd);
763 } 763 }
764 764
765 #ifndef CONFIG_NUMA 765 #ifndef CONFIG_NUMA
766 static inline struct page *alloc_hugepage(int defrag) 766 static inline struct page *alloc_hugepage(int defrag)
767 { 767 {
768 return alloc_pages(alloc_hugepage_gfpmask(defrag, 0), 768 return alloc_pages(alloc_hugepage_gfpmask(defrag, 0),
769 HPAGE_PMD_ORDER); 769 HPAGE_PMD_ORDER);
770 } 770 }
771 #endif 771 #endif
772 772
773 static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm, 773 static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm,
774 struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd, 774 struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd,
775 unsigned long zero_pfn) 775 unsigned long zero_pfn)
776 { 776 {
777 pmd_t entry; 777 pmd_t entry;
778 if (!pmd_none(*pmd)) 778 if (!pmd_none(*pmd))
779 return false; 779 return false;
780 entry = pfn_pmd(zero_pfn, vma->vm_page_prot); 780 entry = pfn_pmd(zero_pfn, vma->vm_page_prot);
781 entry = pmd_wrprotect(entry); 781 entry = pmd_wrprotect(entry);
782 entry = pmd_mkhuge(entry); 782 entry = pmd_mkhuge(entry);
783 set_pmd_at(mm, haddr, pmd, entry); 783 set_pmd_at(mm, haddr, pmd, entry);
784 pgtable_trans_huge_deposit(mm, pgtable); 784 pgtable_trans_huge_deposit(mm, pgtable);
785 mm->nr_ptes++; 785 mm->nr_ptes++;
786 return true; 786 return true;
787 } 787 }
788 788
789 int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, 789 int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
790 unsigned long address, pmd_t *pmd, 790 unsigned long address, pmd_t *pmd,
791 unsigned int flags) 791 unsigned int flags)
792 { 792 {
793 struct page *page; 793 struct page *page;
794 unsigned long haddr = address & HPAGE_PMD_MASK; 794 unsigned long haddr = address & HPAGE_PMD_MASK;
795 pte_t *pte; 795 pte_t *pte;
796 796
797 if (haddr >= vma->vm_start && haddr + HPAGE_PMD_SIZE <= vma->vm_end) { 797 if (haddr >= vma->vm_start && haddr + HPAGE_PMD_SIZE <= vma->vm_end) {
798 if (unlikely(anon_vma_prepare(vma))) 798 if (unlikely(anon_vma_prepare(vma)))
799 return VM_FAULT_OOM; 799 return VM_FAULT_OOM;
800 if (unlikely(khugepaged_enter(vma))) 800 if (unlikely(khugepaged_enter(vma)))
801 return VM_FAULT_OOM; 801 return VM_FAULT_OOM;
802 if (!(flags & FAULT_FLAG_WRITE) && 802 if (!(flags & FAULT_FLAG_WRITE) &&
803 transparent_hugepage_use_zero_page()) { 803 transparent_hugepage_use_zero_page()) {
804 pgtable_t pgtable; 804 pgtable_t pgtable;
805 unsigned long zero_pfn; 805 unsigned long zero_pfn;
806 bool set; 806 bool set;
807 pgtable = pte_alloc_one(mm, haddr); 807 pgtable = pte_alloc_one(mm, haddr);
808 if (unlikely(!pgtable)) 808 if (unlikely(!pgtable))
809 return VM_FAULT_OOM; 809 return VM_FAULT_OOM;
810 zero_pfn = get_huge_zero_page(); 810 zero_pfn = get_huge_zero_page();
811 if (unlikely(!zero_pfn)) { 811 if (unlikely(!zero_pfn)) {
812 pte_free(mm, pgtable); 812 pte_free(mm, pgtable);
813 count_vm_event(THP_FAULT_FALLBACK); 813 count_vm_event(THP_FAULT_FALLBACK);
814 goto out; 814 goto out;
815 } 815 }
816 spin_lock(&mm->page_table_lock); 816 spin_lock(&mm->page_table_lock);
817 set = set_huge_zero_page(pgtable, mm, vma, haddr, pmd, 817 set = set_huge_zero_page(pgtable, mm, vma, haddr, pmd,
818 zero_pfn); 818 zero_pfn);
819 spin_unlock(&mm->page_table_lock); 819 spin_unlock(&mm->page_table_lock);
820 if (!set) { 820 if (!set) {
821 pte_free(mm, pgtable); 821 pte_free(mm, pgtable);
822 put_huge_zero_page(); 822 put_huge_zero_page();
823 } 823 }
824 return 0; 824 return 0;
825 } 825 }
826 page = alloc_hugepage_vma(transparent_hugepage_defrag(vma), 826 page = alloc_hugepage_vma(transparent_hugepage_defrag(vma),
827 vma, haddr, numa_node_id(), 0); 827 vma, haddr, numa_node_id(), 0);
828 if (unlikely(!page)) { 828 if (unlikely(!page)) {
829 count_vm_event(THP_FAULT_FALLBACK); 829 count_vm_event(THP_FAULT_FALLBACK);
830 goto out; 830 goto out;
831 } 831 }
832 count_vm_event(THP_FAULT_ALLOC); 832 count_vm_event(THP_FAULT_ALLOC);
833 if (unlikely(mem_cgroup_newpage_charge(page, mm, GFP_KERNEL))) { 833 if (unlikely(mem_cgroup_newpage_charge(page, mm, GFP_KERNEL))) {
834 put_page(page); 834 put_page(page);
835 goto out; 835 goto out;
836 } 836 }
837 if (unlikely(__do_huge_pmd_anonymous_page(mm, vma, haddr, pmd, 837 if (unlikely(__do_huge_pmd_anonymous_page(mm, vma, haddr, pmd,
838 page))) { 838 page))) {
839 mem_cgroup_uncharge_page(page); 839 mem_cgroup_uncharge_page(page);
840 put_page(page); 840 put_page(page);
841 goto out; 841 goto out;
842 } 842 }
843 843
844 return 0; 844 return 0;
845 } 845 }
846 out: 846 out:
847 /* 847 /*
848 * Use __pte_alloc instead of pte_alloc_map, because we can't 848 * Use __pte_alloc instead of pte_alloc_map, because we can't
849 * run pte_offset_map on the pmd, if an huge pmd could 849 * run pte_offset_map on the pmd, if an huge pmd could
850 * materialize from under us from a different thread. 850 * materialize from under us from a different thread.
851 */ 851 */
852 if (unlikely(pmd_none(*pmd)) && 852 if (unlikely(pmd_none(*pmd)) &&
853 unlikely(__pte_alloc(mm, vma, pmd, address))) 853 unlikely(__pte_alloc(mm, vma, pmd, address)))
854 return VM_FAULT_OOM; 854 return VM_FAULT_OOM;
855 /* if an huge pmd materialized from under us just retry later */ 855 /* if an huge pmd materialized from under us just retry later */
856 if (unlikely(pmd_trans_huge(*pmd))) 856 if (unlikely(pmd_trans_huge(*pmd)))
857 return 0; 857 return 0;
858 /* 858 /*
859 * A regular pmd is established and it can't morph into a huge pmd 859 * A regular pmd is established and it can't morph into a huge pmd
860 * from under us anymore at this point because we hold the mmap_sem 860 * from under us anymore at this point because we hold the mmap_sem
861 * read mode and khugepaged takes it in write mode. So now it's 861 * read mode and khugepaged takes it in write mode. So now it's
862 * safe to run pte_offset_map(). 862 * safe to run pte_offset_map().
863 */ 863 */
864 pte = pte_offset_map(pmd, address); 864 pte = pte_offset_map(pmd, address);
865 return handle_pte_fault(mm, vma, address, pte, pmd, flags); 865 return handle_pte_fault(mm, vma, address, pte, pmd, flags);
866 } 866 }

int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
		  pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr,
		  struct vm_area_struct *vma)
{
	struct page *src_page;
	pmd_t pmd;
	pgtable_t pgtable;
	int ret;

	ret = -ENOMEM;
	pgtable = pte_alloc_one(dst_mm, addr);
	if (unlikely(!pgtable))
		goto out;

	spin_lock(&dst_mm->page_table_lock);
	spin_lock_nested(&src_mm->page_table_lock, SINGLE_DEPTH_NESTING);

	ret = -EAGAIN;
	pmd = *src_pmd;
	if (unlikely(!pmd_trans_huge(pmd))) {
		pte_free(dst_mm, pgtable);
		goto out_unlock;
	}
	/*
	 * mm->page_table_lock is enough to be sure that huge zero pmd is not
	 * under splitting since we don't split the page itself, only pmd to
	 * a page table.
	 */
	if (is_huge_zero_pmd(pmd)) {
		unsigned long zero_pfn;
		bool set;
		/*
		 * get_huge_zero_page() will never allocate a new page here,
		 * since we already have a zero page to copy. It just takes a
		 * reference.
		 */
		zero_pfn = get_huge_zero_page();
		set = set_huge_zero_page(pgtable, dst_mm, vma, addr, dst_pmd,
				zero_pfn);
		BUG_ON(!set); /* unexpected !pmd_none(dst_pmd) */
		ret = 0;
		goto out_unlock;
	}
	if (unlikely(pmd_trans_splitting(pmd))) {
		/* split huge page running from under us */
		spin_unlock(&src_mm->page_table_lock);
		spin_unlock(&dst_mm->page_table_lock);
		pte_free(dst_mm, pgtable);

		wait_split_huge_page(vma->anon_vma, src_pmd); /* src_vma */
		goto out;
	}
	src_page = pmd_page(pmd);
	VM_BUG_ON(!PageHead(src_page));
	get_page(src_page);
	page_dup_rmap(src_page);
	add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR);

	pmdp_set_wrprotect(src_mm, addr, src_pmd);
	pmd = pmd_mkold(pmd_wrprotect(pmd));
	set_pmd_at(dst_mm, addr, dst_pmd, pmd);
	pgtable_trans_huge_deposit(dst_mm, pgtable);
	dst_mm->nr_ptes++;

	ret = 0;
out_unlock:
	spin_unlock(&src_mm->page_table_lock);
	spin_unlock(&dst_mm->page_table_lock);
out:
	return ret;
}

void huge_pmd_set_accessed(struct mm_struct *mm,
			   struct vm_area_struct *vma,
			   unsigned long address,
			   pmd_t *pmd, pmd_t orig_pmd,
			   int dirty)
{
	pmd_t entry;
	unsigned long haddr;

	spin_lock(&mm->page_table_lock);
	if (unlikely(!pmd_same(*pmd, orig_pmd)))
		goto unlock;

	entry = pmd_mkyoung(orig_pmd);
	haddr = address & HPAGE_PMD_MASK;
	if (pmdp_set_access_flags(vma, haddr, pmd, entry, dirty))
		update_mmu_cache_pmd(vma, address, pmd);

unlock:
	spin_unlock(&mm->page_table_lock);
}

static int do_huge_pmd_wp_zero_page_fallback(struct mm_struct *mm,
		struct vm_area_struct *vma, unsigned long address,
		pmd_t *pmd, pmd_t orig_pmd, unsigned long haddr)
{
	pgtable_t pgtable;
	pmd_t _pmd;
	struct page *page;
	int i, ret = 0;
	unsigned long mmun_start;	/* For mmu_notifiers */
	unsigned long mmun_end;		/* For mmu_notifiers */

	page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
	if (!page) {
		ret |= VM_FAULT_OOM;
		goto out;
	}

	if (mem_cgroup_newpage_charge(page, mm, GFP_KERNEL)) {
		put_page(page);
		ret |= VM_FAULT_OOM;
		goto out;
	}

	clear_user_highpage(page, address);
	__SetPageUptodate(page);

	mmun_start = haddr;
	mmun_end = haddr + HPAGE_PMD_SIZE;
	mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);

	spin_lock(&mm->page_table_lock);
	if (unlikely(!pmd_same(*pmd, orig_pmd)))
		goto out_free_page;

	pmdp_clear_flush(vma, haddr, pmd);
	/* leave pmd empty until pte is filled */

	pgtable = pgtable_trans_huge_withdraw(mm);
	pmd_populate(mm, &_pmd, pgtable);

	for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
		pte_t *pte, entry;
		if (haddr == (address & PAGE_MASK)) {
			entry = mk_pte(page, vma->vm_page_prot);
			entry = maybe_mkwrite(pte_mkdirty(entry), vma);
			page_add_new_anon_rmap(page, vma, haddr);
		} else {
			entry = pfn_pte(my_zero_pfn(haddr), vma->vm_page_prot);
			entry = pte_mkspecial(entry);
		}
		pte = pte_offset_map(&_pmd, haddr);
		VM_BUG_ON(!pte_none(*pte));
		set_pte_at(mm, haddr, pte, entry);
		pte_unmap(pte);
	}
	smp_wmb(); /* make pte visible before pmd */
	pmd_populate(mm, pmd, pgtable);
	spin_unlock(&mm->page_table_lock);
	put_huge_zero_page();
	inc_mm_counter(mm, MM_ANONPAGES);

	mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);

	ret |= VM_FAULT_WRITE;
out:
	return ret;
out_free_page:
	spin_unlock(&mm->page_table_lock);
	mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
	mem_cgroup_uncharge_page(page);
	put_page(page);
	goto out;
}

static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
					struct vm_area_struct *vma,
					unsigned long address,
					pmd_t *pmd, pmd_t orig_pmd,
					struct page *page,
					unsigned long haddr)
{
	pgtable_t pgtable;
	pmd_t _pmd;
	int ret = 0, i;
	struct page **pages;
	unsigned long mmun_start;	/* For mmu_notifiers */
	unsigned long mmun_end;		/* For mmu_notifiers */

	pages = kmalloc(sizeof(struct page *) * HPAGE_PMD_NR,
			GFP_KERNEL);
	if (unlikely(!pages)) {
		ret |= VM_FAULT_OOM;
		goto out;
	}

	for (i = 0; i < HPAGE_PMD_NR; i++) {
		pages[i] = alloc_page_vma_node(GFP_HIGHUSER_MOVABLE |
					       __GFP_OTHER_NODE,
					       vma, address, page_to_nid(page));
		if (unlikely(!pages[i] ||
			     mem_cgroup_newpage_charge(pages[i], mm,
						       GFP_KERNEL))) {
			if (pages[i])
				put_page(pages[i]);
			mem_cgroup_uncharge_start();
			while (--i >= 0) {
				mem_cgroup_uncharge_page(pages[i]);
				put_page(pages[i]);
			}
			mem_cgroup_uncharge_end();
			kfree(pages);
			ret |= VM_FAULT_OOM;
			goto out;
		}
	}

	for (i = 0; i < HPAGE_PMD_NR; i++) {
		copy_user_highpage(pages[i], page + i,
				   haddr + PAGE_SIZE * i, vma);
		__SetPageUptodate(pages[i]);
		cond_resched();
	}

	mmun_start = haddr;
	mmun_end = haddr + HPAGE_PMD_SIZE;
	mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);

	spin_lock(&mm->page_table_lock);
	if (unlikely(!pmd_same(*pmd, orig_pmd)))
		goto out_free_pages;
	VM_BUG_ON(!PageHead(page));

	pmdp_clear_flush(vma, haddr, pmd);
	/* leave pmd empty until pte is filled */

	pgtable = pgtable_trans_huge_withdraw(mm);
	pmd_populate(mm, &_pmd, pgtable);

	for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
		pte_t *pte, entry;
		entry = mk_pte(pages[i], vma->vm_page_prot);
		entry = maybe_mkwrite(pte_mkdirty(entry), vma);
		page_add_new_anon_rmap(pages[i], vma, haddr);
		pte = pte_offset_map(&_pmd, haddr);
		VM_BUG_ON(!pte_none(*pte));
		set_pte_at(mm, haddr, pte, entry);
		pte_unmap(pte);
	}
	kfree(pages);

	smp_wmb(); /* make pte visible before pmd */
	pmd_populate(mm, pmd, pgtable);
	page_remove_rmap(page);
	spin_unlock(&mm->page_table_lock);

	mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);

	ret |= VM_FAULT_WRITE;
	put_page(page);

out:
	return ret;

out_free_pages:
	spin_unlock(&mm->page_table_lock);
	mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
	mem_cgroup_uncharge_start();
	for (i = 0; i < HPAGE_PMD_NR; i++) {
		mem_cgroup_uncharge_page(pages[i]);
		put_page(pages[i]);
	}
	mem_cgroup_uncharge_end();
	kfree(pages);
	goto out;
}

int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
			unsigned long address, pmd_t *pmd, pmd_t orig_pmd)
{
	int ret = 0;
	struct page *page = NULL, *new_page;
	unsigned long haddr;
	unsigned long mmun_start;	/* For mmu_notifiers */
	unsigned long mmun_end;		/* For mmu_notifiers */

	VM_BUG_ON(!vma->anon_vma);
	haddr = address & HPAGE_PMD_MASK;
	if (is_huge_zero_pmd(orig_pmd))
		goto alloc;
	spin_lock(&mm->page_table_lock);
	if (unlikely(!pmd_same(*pmd, orig_pmd)))
		goto out_unlock;

	page = pmd_page(orig_pmd);
	VM_BUG_ON(!PageCompound(page) || !PageHead(page));
	if (page_mapcount(page) == 1) {
		pmd_t entry;
		entry = pmd_mkyoung(orig_pmd);
		entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
		if (pmdp_set_access_flags(vma, haddr, pmd, entry, 1))
			update_mmu_cache_pmd(vma, address, pmd);
		ret |= VM_FAULT_WRITE;
		goto out_unlock;
	}
	get_page(page);
	spin_unlock(&mm->page_table_lock);
alloc:
	if (transparent_hugepage_enabled(vma) &&
	    !transparent_hugepage_debug_cow())
		new_page = alloc_hugepage_vma(transparent_hugepage_defrag(vma),
					      vma, haddr, numa_node_id(), 0);
	else
		new_page = NULL;

	if (unlikely(!new_page)) {
		count_vm_event(THP_FAULT_FALLBACK);
		if (is_huge_zero_pmd(orig_pmd)) {
			ret = do_huge_pmd_wp_zero_page_fallback(mm, vma,
					address, pmd, orig_pmd, haddr);
		} else {
			ret = do_huge_pmd_wp_page_fallback(mm, vma, address,
					pmd, orig_pmd, page, haddr);
			if (ret & VM_FAULT_OOM)
				split_huge_page(page);
			put_page(page);
		}
		goto out;
	}
	count_vm_event(THP_FAULT_ALLOC);

	if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) {
		put_page(new_page);
		if (page) {
			split_huge_page(page);
			put_page(page);
		}
		ret |= VM_FAULT_OOM;
		goto out;
	}

	if (is_huge_zero_pmd(orig_pmd))
		clear_huge_page(new_page, haddr, HPAGE_PMD_NR);
	else
		copy_user_huge_page(new_page, page, haddr, vma, HPAGE_PMD_NR);
	__SetPageUptodate(new_page);

	mmun_start = haddr;
	mmun_end = haddr + HPAGE_PMD_SIZE;
	mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);

	spin_lock(&mm->page_table_lock);
	if (page)
		put_page(page);
	if (unlikely(!pmd_same(*pmd, orig_pmd))) {
		spin_unlock(&mm->page_table_lock);
		mem_cgroup_uncharge_page(new_page);
		put_page(new_page);
		goto out_mn;
	} else {
		pmd_t entry;
		entry = mk_huge_pmd(new_page, vma);
		pmdp_clear_flush(vma, haddr, pmd);
		page_add_new_anon_rmap(new_page, vma, haddr);
		set_pmd_at(mm, haddr, pmd, entry);
		update_mmu_cache_pmd(vma, address, pmd);
		if (is_huge_zero_pmd(orig_pmd)) {
			add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR);
			put_huge_zero_page();
		} else {
			VM_BUG_ON(!PageHead(page));
			page_remove_rmap(page);
			put_page(page);
		}
		ret |= VM_FAULT_WRITE;
	}
	spin_unlock(&mm->page_table_lock);
out_mn:
	mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
out:
	return ret;
out_unlock:
	spin_unlock(&mm->page_table_lock);
	return ret;
}

struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
				   unsigned long addr,
				   pmd_t *pmd,
				   unsigned int flags)
{
	struct mm_struct *mm = vma->vm_mm;
	struct page *page = NULL;

	assert_spin_locked(&mm->page_table_lock);

	if (flags & FOLL_WRITE && !pmd_write(*pmd))
		goto out;

	page = pmd_page(*pmd);
	VM_BUG_ON(!PageHead(page));
	if (flags & FOLL_TOUCH) {
		pmd_t _pmd;
		/*
		 * We should set the dirty bit only for FOLL_WRITE but
		 * for now the dirty bit in the pmd is meaningless.
		 * And if the dirty bit will become meaningful and
		 * we'll only set it with FOLL_WRITE, an atomic
		 * set_bit will be required on the pmd to set the
		 * young bit, instead of the current set_pmd_at.
		 */
		_pmd = pmd_mkyoung(pmd_mkdirty(*pmd));
		set_pmd_at(mm, addr & HPAGE_PMD_MASK, pmd, _pmd);
	}
	if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) {
		if (page->mapping && trylock_page(page)) {
			lru_add_drain();
			if (page->mapping)
				mlock_vma_page(page);
			unlock_page(page);
		}
	}
	page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT;
	VM_BUG_ON(!PageCompound(page));
	if (flags & FOLL_GET)
		get_page_foll(page);

out:
	return page;
}

/* NUMA hinting page fault entry point for trans huge pmds */
int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
				unsigned long addr, pmd_t pmd, pmd_t *pmdp)
{
	struct page *page;
	unsigned long haddr = addr & HPAGE_PMD_MASK;
	int target_nid;
	int current_nid = -1;
	bool migrated;
	bool page_locked = false;

	spin_lock(&mm->page_table_lock);
	if (unlikely(!pmd_same(pmd, *pmdp)))
		goto out_unlock;

	page = pmd_page(pmd);
	get_page(page);
	current_nid = page_to_nid(page);
	count_vm_numa_event(NUMA_HINT_FAULTS);
	if (current_nid == numa_node_id())
		count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL);

	target_nid = mpol_misplaced(page, vma, haddr);
	if (target_nid == -1) {
		put_page(page);
		goto clear_pmdnuma;
	}

	/* Acquire the page lock to serialise THP migrations */
	spin_unlock(&mm->page_table_lock);
	lock_page(page);
	page_locked = true;

	/* Confirm the PTE did not change while the page was locked */
	spin_lock(&mm->page_table_lock);
	if (unlikely(!pmd_same(pmd, *pmdp))) {
		unlock_page(page);
		put_page(page);
		goto out_unlock;
	}
	spin_unlock(&mm->page_table_lock);

	/* Migrate the THP to the requested node */
	migrated = migrate_misplaced_transhuge_page(mm, vma,
				pmdp, pmd, addr,
				page, target_nid);
	if (migrated)
		current_nid = target_nid;
	else {
		spin_lock(&mm->page_table_lock);
		if (unlikely(!pmd_same(pmd, *pmdp))) {
			unlock_page(page);
			goto out_unlock;
		}
		goto clear_pmdnuma;
	}

	task_numa_fault(current_nid, HPAGE_PMD_NR, migrated);
	return 0;

clear_pmdnuma:
	pmd = pmd_mknonnuma(pmd);
	set_pmd_at(mm, haddr, pmdp, pmd);
	VM_BUG_ON(pmd_numa(*pmdp));
	update_mmu_cache_pmd(vma, addr, pmdp);
	if (page_locked)
		unlock_page(page);

out_unlock:
	spin_unlock(&mm->page_table_lock);
	if (current_nid != -1)
		task_numa_fault(current_nid, HPAGE_PMD_NR, migrated);
	return 0;
}

int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
		 pmd_t *pmd, unsigned long addr)
{
	int ret = 0;

	if (__pmd_trans_huge_lock(pmd, vma) == 1) {
		struct page *page;
		pgtable_t pgtable;
		pmd_t orig_pmd;
		pgtable = pgtable_trans_huge_withdraw(tlb->mm);
		orig_pmd = pmdp_get_and_clear(tlb->mm, addr, pmd);
		tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
		if (is_huge_zero_pmd(orig_pmd)) {
			tlb->mm->nr_ptes--;
			spin_unlock(&tlb->mm->page_table_lock);
			put_huge_zero_page();
		} else {
			page = pmd_page(orig_pmd);
			page_remove_rmap(page);
			VM_BUG_ON(page_mapcount(page) < 0);
			add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR);
			VM_BUG_ON(!PageHead(page));
			tlb->mm->nr_ptes--;
			spin_unlock(&tlb->mm->page_table_lock);
			tlb_remove_page(tlb, page);
		}
		pte_free(tlb->mm, pgtable);
		ret = 1;
	}
	return ret;
}

int mincore_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
		unsigned long addr, unsigned long end,
		unsigned char *vec)
{
	int ret = 0;

	if (__pmd_trans_huge_lock(pmd, vma) == 1) {
		/*
		 * All logical pages in the range are present
		 * if backed by a huge page.
		 */
		spin_unlock(&vma->vm_mm->page_table_lock);
		memset(vec, 1, (end - addr) >> PAGE_SHIFT);
		ret = 1;
	}

	return ret;
}

int move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma,
		  unsigned long old_addr,
		  unsigned long new_addr, unsigned long old_end,
		  pmd_t *old_pmd, pmd_t *new_pmd)
{
	int ret = 0;
	pmd_t pmd;

	struct mm_struct *mm = vma->vm_mm;

	if ((old_addr & ~HPAGE_PMD_MASK) ||
	    (new_addr & ~HPAGE_PMD_MASK) ||
	    old_end - old_addr < HPAGE_PMD_SIZE ||
	    (new_vma->vm_flags & VM_NOHUGEPAGE))
		goto out;

	/*
	 * The destination pmd shouldn't be established, free_pgtables()
	 * should have released it.
	 */
	if (WARN_ON(!pmd_none(*new_pmd))) {
		VM_BUG_ON(pmd_trans_huge(*new_pmd));
		goto out;
	}

	ret = __pmd_trans_huge_lock(old_pmd, vma);
	if (ret == 1) {
		pmd = pmdp_get_and_clear(mm, old_addr, old_pmd);
		VM_BUG_ON(!pmd_none(*new_pmd));
		set_pmd_at(mm, new_addr, new_pmd, pmd);
		spin_unlock(&mm->page_table_lock);
	}
out:
	return ret;
}

int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
		unsigned long addr, pgprot_t newprot, int prot_numa)
{
	struct mm_struct *mm = vma->vm_mm;
	int ret = 0;

	if (__pmd_trans_huge_lock(pmd, vma) == 1) {
		pmd_t entry;
		entry = pmdp_get_and_clear(mm, addr, pmd);
		if (!prot_numa) {
			entry = pmd_modify(entry, newprot);
			BUG_ON(pmd_write(entry));
		} else {
			struct page *page = pmd_page(*pmd);

			/* only check non-shared pages */
			if (page_mapcount(page) == 1 &&
			    !pmd_numa(*pmd)) {
				entry = pmd_mknuma(entry);
			}
		}
		set_pmd_at(mm, addr, pmd, entry);
		spin_unlock(&vma->vm_mm->page_table_lock);
		ret = 1;
	}

	return ret;
}

/*
 * Returns 1 if a given pmd maps a stable (not under splitting) thp.
 * Returns -1 if it maps a thp under splitting. Returns 0 otherwise.
 *
 * Note that if it returns 1, this routine returns without unlocking page
 * table locks. So callers must unlock them.
 */
int __pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma)
{
	spin_lock(&vma->vm_mm->page_table_lock);
	if (likely(pmd_trans_huge(*pmd))) {
		if (unlikely(pmd_trans_splitting(*pmd))) {
			spin_unlock(&vma->vm_mm->page_table_lock);
			wait_split_huge_page(vma->anon_vma, pmd);
			return -1;
		} else {
			/* Thp mapped by 'pmd' is stable, so we can
			 * handle it as it is. */
			return 1;
		}
	}
	spin_unlock(&vma->vm_mm->page_table_lock);
	return 0;
}

pmd_t *page_check_address_pmd(struct page *page,
			      struct mm_struct *mm,
			      unsigned long address,
			      enum page_check_address_pmd_flag flag)
{
	pmd_t *pmd, *ret = NULL;

	if (address & ~HPAGE_PMD_MASK)
		goto out;

	pmd = mm_find_pmd(mm, address);
	if (!pmd)
		goto out;
	if (pmd_none(*pmd))
		goto out;
	if (pmd_page(*pmd) != page)
		goto out;
	/*
	 * split_vma() may create temporary aliased mappings. There is
	 * no risk as long as all huge pmd are found and have their
	 * splitting bit set before __split_huge_page_refcount
	 * runs. Finding the same huge pmd more than once during the
	 * same rmap walk is not a problem.
	 */
	if (flag == PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG &&
	    pmd_trans_splitting(*pmd))
		goto out;
	if (pmd_trans_huge(*pmd)) {
		VM_BUG_ON(flag == PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG &&
			  !pmd_trans_splitting(*pmd));
		ret = pmd;
	}
out:
	return ret;
}

static int __split_huge_page_splitting(struct page *page,
				       struct vm_area_struct *vma,
				       unsigned long address)
{
	struct mm_struct *mm = vma->vm_mm;
	pmd_t *pmd;
	int ret = 0;
	/* For mmu_notifiers */
	const unsigned long mmun_start = address;
	const unsigned long mmun_end = address + HPAGE_PMD_SIZE;

	mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
	spin_lock(&mm->page_table_lock);
	pmd = page_check_address_pmd(page, mm, address,
				     PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG);
	if (pmd) {
		/*
		 * We can't temporarily set the pmd to null in order
		 * to split it, the pmd must remain marked huge at all
		 * times or the VM won't take the pmd_trans_huge paths
		 * and it won't wait on the anon_vma->root->rwsem to
		 * serialize against split_huge_page*.
		 */
		pmdp_splitting_flush(vma, address, pmd);
		ret = 1;
	}
	spin_unlock(&mm->page_table_lock);
	mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);

	return ret;
}

static void __split_huge_page_refcount(struct page *page)
{
	int i;
	struct zone *zone = page_zone(page);
	struct lruvec *lruvec;
	int tail_count = 0;

	/* prevent PageLRU from going away from under us, and freeze lru stats */
	spin_lock_irq(&zone->lru_lock);
	lruvec = mem_cgroup_page_lruvec(page, zone);

	compound_lock(page);
	/* complete memcg works before add pages to LRU */
	mem_cgroup_split_huge_fixup(page);

	for (i = HPAGE_PMD_NR - 1; i >= 1; i--) {
		struct page *page_tail = page + i;

		/* tail_page->_mapcount cannot change */
		BUG_ON(page_mapcount(page_tail) < 0);
		tail_count += page_mapcount(page_tail);
		/* check for overflow */
		BUG_ON(tail_count < 0);
		BUG_ON(atomic_read(&page_tail->_count) != 0);
		/*
		 * tail_page->_count is zero and not changing from
		 * under us. But get_page_unless_zero() may be running
		 * from under us on the tail_page. If we used
		 * atomic_set() below instead of atomic_add(), we
		 * would then run atomic_set() concurrently with
		 * get_page_unless_zero(), and atomic_set() is
		 * implemented in C not using locked ops. spin_unlock
		 * on x86 sometimes uses locked ops because of PPro
		 * errata 66, 92, so unless somebody can guarantee
		 * atomic_set() here would be safe on all archs (and
		 * not only on x86), it's safer to use atomic_add().
		 */
		atomic_add(page_mapcount(page) + page_mapcount(page_tail) + 1,
			   &page_tail->_count);

		/* after clearing PageTail the gup refcount can be released */
		smp_mb();

		/*
		 * retain hwpoison flag of the poisoned tail page:
		 * fix for the unsuitable process killed on Guest Machine(KVM)
		 * by the memory-failure.
		 */
		page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP | __PG_HWPOISON;
		page_tail->flags |= (page->flags &
				     ((1L << PG_referenced) |
				      (1L << PG_swapbacked) |
				      (1L << PG_mlocked) |
				      (1L << PG_uptodate)));
		page_tail->flags |= (1L << PG_dirty);

		/* clear PageTail before overwriting first_page */
		smp_wmb();

		/*
		 * __split_huge_page_splitting() already set the
		 * splitting bit in all pmd that could map this
		 * hugepage, that will ensure no CPU can alter the
		 * mapcount on the head page. The mapcount is only
		 * accounted in the head page and it has to be
		 * transferred to all tail pages in the below code. So
		 * for this code to be safe, the mapcount can't change
		 * during the split. But that doesn't mean userland can't
		 * keep changing and reading the page contents while
		 * we transfer the mapcount, so the pmd splitting
		 * status is achieved setting a reserved bit in the
		 * pmd, not by clearing the present bit.
		 */
		page_tail->_mapcount = page->_mapcount;

		BUG_ON(page_tail->mapping);
		page_tail->mapping = page->mapping;

		page_tail->index = page->index + i;
		page_xchg_last_nid(page_tail, page_last_nid(page));

		BUG_ON(!PageAnon(page_tail));
		BUG_ON(!PageUptodate(page_tail));
		BUG_ON(!PageDirty(page_tail));
		BUG_ON(!PageSwapBacked(page_tail));

		lru_add_page_tail(page, page_tail, lruvec);
	}
	atomic_sub(tail_count, &page->_count);
	BUG_ON(atomic_read(&page->_count) <= 0);

	__mod_zone_page_state(zone, NR_ANON_TRANSPARENT_HUGEPAGES, -1);
	__mod_zone_page_state(zone, NR_ANON_PAGES, HPAGE_PMD_NR);

	ClearPageCompound(page);
	compound_unlock(page);
	spin_unlock_irq(&zone->lru_lock);

	for (i = 1; i < HPAGE_PMD_NR; i++) {
		struct page *page_tail = page + i;
		BUG_ON(page_count(page_tail) <= 0);
		/*
		 * Tail pages may be freed if there wasn't any mapping
		 * like if add_to_swap() is running on a lru page that
		 * had its mapping zapped. And freeing these pages
		 * requires taking the lru_lock so we do the put_page
		 * of the tail pages after the split is complete.
		 */
		put_page(page_tail);
	}

	/*
	 * Only the head page (now a regular page) is required
	 * to be pinned by the caller.
	 */
	BUG_ON(page_count(page) <= 0);
}

static int __split_huge_page_map(struct page *page,
				 struct vm_area_struct *vma,
				 unsigned long address)
{
	struct mm_struct *mm = vma->vm_mm;
	pmd_t *pmd, _pmd;
	int ret = 0, i;
	pgtable_t pgtable;
	unsigned long haddr;

	spin_lock(&mm->page_table_lock);
	pmd = page_check_address_pmd(page, mm, address,
				     PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG);
	if (pmd) {
		pgtable = pgtable_trans_huge_withdraw(mm);
		pmd_populate(mm, &_pmd, pgtable);

		haddr = address;
		for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
			pte_t *pte, entry;
			BUG_ON(PageCompound(page+i));
			entry = mk_pte(page + i, vma->vm_page_prot);
			entry = maybe_mkwrite(pte_mkdirty(entry), vma);
			if (!pmd_write(*pmd))
				entry = pte_wrprotect(entry);
			else
				BUG_ON(page_mapcount(page) != 1);
			if (!pmd_young(*pmd))
				entry = pte_mkold(entry);
			if (pmd_numa(*pmd))
				entry = pte_mknuma(entry);
			pte = pte_offset_map(&_pmd, haddr);
			BUG_ON(!pte_none(*pte));
			set_pte_at(mm, haddr, pte, entry);
			pte_unmap(pte);
		}

		smp_wmb(); /* make pte visible before pmd */
		/*
		 * Up to this point the pmd is present and huge and
		 * userland has full access to the hugepage
		 * during the split (which happens in place). If we
		 * overwrite the pmd with the not-huge version
		 * pointing to the pte here (which of course we could
		 * if all CPUs were bug free), userland could trigger
		 * a small page size TLB miss on the small sized TLB
		 * while the hugepage TLB entry is still established
		 * in the huge TLB. Some CPUs don't like that. See
		 * http://support.amd.com/us/Processor_TechDocs/41322.pdf,
		 * Erratum 383 on page 93. Intel should be safe but
		 * also warns that it's only safe if the permission
		 * and cache attributes of the two entries loaded in
		 * the two TLBs are identical (which should be the case
		 * here). But it is generally safer to never allow
		 * small and huge TLB entries for the same virtual
		 * address to be loaded simultaneously. So instead of
		 * doing "pmd_populate(); flush_tlb_range();" we first
		 * mark the current pmd notpresent (atomically because
		 * here the pmd_trans_huge and pmd_trans_splitting
		 * must remain set at all times on the pmd until the
		 * split is complete for this pmd), then we flush the
		 * SMP TLB and finally we write the non-huge version
		 * of the pmd entry with pmd_populate.
		 */
		pmdp_invalidate(vma, address, pmd);
		pmd_populate(mm, pmd, pgtable);
		ret = 1;
	}
	spin_unlock(&mm->page_table_lock);

	return ret;
}

/* must be called with anon_vma->root->rwsem held */
static void __split_huge_page(struct page *page,
			      struct anon_vma *anon_vma)
{
	int mapcount, mapcount2;
	pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
	struct anon_vma_chain *avc;

	BUG_ON(!PageHead(page));
	BUG_ON(PageTail(page));

	mapcount = 0;
	anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) {
		struct vm_area_struct *vma = avc->vma;
		unsigned long addr = vma_address(page, vma);
		BUG_ON(is_vma_temporary_stack(vma));
		mapcount += __split_huge_page_splitting(page, vma, addr);
	}
	/*
	 * It is critical that new vmas are added to the tail of the
	 * anon_vma list. This guarantees that if copy_huge_pmd() runs
	 * and establishes a child pmd before
	 * __split_huge_page_splitting() freezes the parent pmd (so if
	 * we fail to prevent copy_huge_pmd() from running until the
	 * whole __split_huge_page() is complete), we will still see
	 * the newly established pmd of the child later during the
	 * walk, to be able to set it as pmd_trans_splitting too.
	 */
	if (mapcount != page_mapcount(page))
		printk(KERN_ERR "mapcount %d page_mapcount %d\n",
		       mapcount, page_mapcount(page));
	BUG_ON(mapcount != page_mapcount(page));

	__split_huge_page_refcount(page);

	mapcount2 = 0;
	anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) {
		struct vm_area_struct *vma = avc->vma;
		unsigned long addr = vma_address(page, vma);
		BUG_ON(is_vma_temporary_stack(vma));
		mapcount2 += __split_huge_page_map(page, vma, addr);
	}
	if (mapcount != mapcount2)
		printk(KERN_ERR "mapcount %d mapcount2 %d page_mapcount %d\n",
		       mapcount, mapcount2, page_mapcount(page));
	BUG_ON(mapcount != mapcount2);
}

1815 int split_huge_page(struct page *page) 1815 int split_huge_page(struct page *page)
1816 { 1816 {
1817 struct anon_vma *anon_vma; 1817 struct anon_vma *anon_vma;
1818 int ret = 1; 1818 int ret = 1;
1819 1819
1820 BUG_ON(is_huge_zero_pfn(page_to_pfn(page))); 1820 BUG_ON(is_huge_zero_pfn(page_to_pfn(page)));
1821 BUG_ON(!PageAnon(page)); 1821 BUG_ON(!PageAnon(page));
1822 anon_vma = page_lock_anon_vma_read(page); 1822
1823 /*
1824 * The caller does not necessarily hold an mmap_sem that would prevent
1825 * the anon_vma disappearing so we first take a reference to it
1826 * and then lock the anon_vma for write. This is similar to
1827 * page_lock_anon_vma_read except the write lock is taken to serialise
1828 * against parallel split or collapse operations.
1829 */
1830 anon_vma = page_get_anon_vma(page);
1823 if (!anon_vma) 1831 if (!anon_vma)
1824 goto out; 1832 goto out;
1833 anon_vma_lock_write(anon_vma);
1834
1825 ret = 0; 1835 ret = 0;
1826 if (!PageCompound(page)) 1836 if (!PageCompound(page))
1827 goto out_unlock; 1837 goto out_unlock;
1828 1838
1829 BUG_ON(!PageSwapBacked(page)); 1839 BUG_ON(!PageSwapBacked(page));
1830 __split_huge_page(page, anon_vma); 1840 __split_huge_page(page, anon_vma);
1831 count_vm_event(THP_SPLIT); 1841 count_vm_event(THP_SPLIT);
1832 1842
1833 BUG_ON(PageCompound(page)); 1843 BUG_ON(PageCompound(page));
1834 out_unlock: 1844 out_unlock:
1835 page_unlock_anon_vma_read(anon_vma); 1845 anon_vma_unlock(anon_vma);
1846 put_anon_vma(anon_vma);
1836 out: 1847 out:
1837 return ret; 1848 return ret;
1838 } 1849 }
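
The hunk above switches split_huge_page() from page_lock_anon_vma_read() to a pin-then-write-lock sequence: page_get_anon_vma() followed by anon_vma_lock_write(), with anon_vma_unlock() and put_anon_vma() on the way out. As a rough userspace analogy only (not kernel code), the ordering can be modelled with an atomic refcount and a pthread rwlock; every name below (toy_anon_vma, toy_rmap_walk, toy_split) is invented for illustration.

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

/* All names here are invented for the analogy; this is not kernel code. */
struct toy_anon_vma {
	atomic_int refcount;		/* stands in for anon_vma's refcount   */
	pthread_rwlock_t rwsem;		/* stands in for anon_vma->root->rwsem */
	int nr_small_mappings;		/* stands in for the post-split state  */
};

/* rmap-style walkers would take the lock for read. */
static void toy_rmap_walk(struct toy_anon_vma *av)
{
	pthread_rwlock_rdlock(&av->rwsem);
	printf("walker sees %d small mappings\n", av->nr_small_mappings);
	pthread_rwlock_unlock(&av->rwsem);
}

/* The splitter pins the object first, then takes the lock for write. */
static void toy_split(struct toy_anon_vma *av)
{
	atomic_fetch_add(&av->refcount, 1);	/* cf. page_get_anon_vma()   */
	pthread_rwlock_wrlock(&av->rwsem);	/* cf. anon_vma_lock_write() */
	av->nr_small_mappings = 512;		/* the whole split happens here */
	pthread_rwlock_unlock(&av->rwsem);	/* cf. anon_vma_unlock()     */
	atomic_fetch_sub(&av->refcount, 1);	/* cf. put_anon_vma()        */
}

int main(void)
{
	struct toy_anon_vma av = { .nr_small_mappings = 0 };

	atomic_init(&av.refcount, 1);
	pthread_rwlock_init(&av.rwsem, NULL);

	toy_rmap_walk(&av);	/* in the kernel, readers run concurrently...    */
	toy_split(&av);		/* ...but never while the write lock is held     */
	toy_rmap_walk(&av);

	pthread_rwlock_destroy(&av.rwsem);
	return 0;
}

Compile with -pthread; the only point is the ordering: pin the object so it cannot disappear, then take the write side of the lock so no reader can observe a half-split page.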
1839 1850
1840 #define VM_NO_THP (VM_SPECIAL|VM_MIXEDMAP|VM_HUGETLB|VM_SHARED|VM_MAYSHARE) 1851 #define VM_NO_THP (VM_SPECIAL|VM_MIXEDMAP|VM_HUGETLB|VM_SHARED|VM_MAYSHARE)
1841 1852
1842 int hugepage_madvise(struct vm_area_struct *vma, 1853 int hugepage_madvise(struct vm_area_struct *vma,
1843 unsigned long *vm_flags, int advice) 1854 unsigned long *vm_flags, int advice)
1844 { 1855 {
1845 struct mm_struct *mm = vma->vm_mm; 1856 struct mm_struct *mm = vma->vm_mm;
1846 1857
1847 switch (advice) { 1858 switch (advice) {
1848 case MADV_HUGEPAGE: 1859 case MADV_HUGEPAGE:
1849 /* 1860 /*
1850 * Be somewhat over-protective like KSM for now! 1861 * Be somewhat over-protective like KSM for now!
1851 */ 1862 */
1852 if (*vm_flags & (VM_HUGEPAGE | VM_NO_THP)) 1863 if (*vm_flags & (VM_HUGEPAGE | VM_NO_THP))
1853 return -EINVAL; 1864 return -EINVAL;
1854 if (mm->def_flags & VM_NOHUGEPAGE) 1865 if (mm->def_flags & VM_NOHUGEPAGE)
1855 return -EINVAL; 1866 return -EINVAL;
1856 *vm_flags &= ~VM_NOHUGEPAGE; 1867 *vm_flags &= ~VM_NOHUGEPAGE;
1857 *vm_flags |= VM_HUGEPAGE; 1868 *vm_flags |= VM_HUGEPAGE;
1858 /* 1869 /*
1859 * If the vma becomes good for khugepaged to scan, 1870 * If the vma becomes good for khugepaged to scan,
1860 * register it here without waiting for a page fault that 1871 * register it here without waiting for a page fault that
1861 * may not happen any time soon. 1872 * may not happen any time soon.
1862 */ 1873 */
1863 if (unlikely(khugepaged_enter_vma_merge(vma))) 1874 if (unlikely(khugepaged_enter_vma_merge(vma)))
1864 return -ENOMEM; 1875 return -ENOMEM;
1865 break; 1876 break;
1866 case MADV_NOHUGEPAGE: 1877 case MADV_NOHUGEPAGE:
1867 /* 1878 /*
1868 * Be somewhat over-protective like KSM for now! 1879 * Be somewhat over-protective like KSM for now!
1869 */ 1880 */
1870 if (*vm_flags & (VM_NOHUGEPAGE | VM_NO_THP)) 1881 if (*vm_flags & (VM_NOHUGEPAGE | VM_NO_THP))
1871 return -EINVAL; 1882 return -EINVAL;
1872 *vm_flags &= ~VM_HUGEPAGE; 1883 *vm_flags &= ~VM_HUGEPAGE;
1873 *vm_flags |= VM_NOHUGEPAGE; 1884 *vm_flags |= VM_NOHUGEPAGE;
1874 /* 1885 /*
1875 * Setting VM_NOHUGEPAGE will prevent khugepaged from scanning 1886 * Setting VM_NOHUGEPAGE will prevent khugepaged from scanning
1876 * this vma even if we leave the mm registered in khugepaged if 1887 * this vma even if we leave the mm registered in khugepaged if
1877 * it got registered before VM_NOHUGEPAGE was set. 1888 * it got registered before VM_NOHUGEPAGE was set.
1878 */ 1889 */
1879 break; 1890 break;
1880 } 1891 }
1881 1892
1882 return 0; 1893 return 0;
1883 } 1894 }
1884 1895
1885 static int __init khugepaged_slab_init(void) 1896 static int __init khugepaged_slab_init(void)
1886 { 1897 {
1887 mm_slot_cache = kmem_cache_create("khugepaged_mm_slot", 1898 mm_slot_cache = kmem_cache_create("khugepaged_mm_slot",
1888 sizeof(struct mm_slot), 1899 sizeof(struct mm_slot),
1889 __alignof__(struct mm_slot), 0, NULL); 1900 __alignof__(struct mm_slot), 0, NULL);
1890 if (!mm_slot_cache) 1901 if (!mm_slot_cache)
1891 return -ENOMEM; 1902 return -ENOMEM;
1892 1903
1893 return 0; 1904 return 0;
1894 } 1905 }
1895 1906
1896 static void __init khugepaged_slab_free(void) 1907 static void __init khugepaged_slab_free(void)
1897 { 1908 {
1898 kmem_cache_destroy(mm_slot_cache); 1909 kmem_cache_destroy(mm_slot_cache);
1899 mm_slot_cache = NULL; 1910 mm_slot_cache = NULL;
1900 } 1911 }
1901 1912
1902 static inline struct mm_slot *alloc_mm_slot(void) 1913 static inline struct mm_slot *alloc_mm_slot(void)
1903 { 1914 {
1904 if (!mm_slot_cache) /* initialization failed */ 1915 if (!mm_slot_cache) /* initialization failed */
1905 return NULL; 1916 return NULL;
1906 return kmem_cache_zalloc(mm_slot_cache, GFP_KERNEL); 1917 return kmem_cache_zalloc(mm_slot_cache, GFP_KERNEL);
1907 } 1918 }
1908 1919
1909 static inline void free_mm_slot(struct mm_slot *mm_slot) 1920 static inline void free_mm_slot(struct mm_slot *mm_slot)
1910 { 1921 {
1911 kmem_cache_free(mm_slot_cache, mm_slot); 1922 kmem_cache_free(mm_slot_cache, mm_slot);
1912 } 1923 }
1913 1924
1914 static int __init mm_slots_hash_init(void) 1925 static int __init mm_slots_hash_init(void)
1915 { 1926 {
1916 mm_slots_hash = kzalloc(MM_SLOTS_HASH_HEADS * sizeof(struct hlist_head), 1927 mm_slots_hash = kzalloc(MM_SLOTS_HASH_HEADS * sizeof(struct hlist_head),
1917 GFP_KERNEL); 1928 GFP_KERNEL);
1918 if (!mm_slots_hash) 1929 if (!mm_slots_hash)
1919 return -ENOMEM; 1930 return -ENOMEM;
1920 return 0; 1931 return 0;
1921 } 1932 }
1922 1933
1923 #if 0 1934 #if 0
1924 static void __init mm_slots_hash_free(void) 1935 static void __init mm_slots_hash_free(void)
1925 { 1936 {
1926 kfree(mm_slots_hash); 1937 kfree(mm_slots_hash);
1927 mm_slots_hash = NULL; 1938 mm_slots_hash = NULL;
1928 } 1939 }
1929 #endif 1940 #endif
1930 1941
1931 static struct mm_slot *get_mm_slot(struct mm_struct *mm) 1942 static struct mm_slot *get_mm_slot(struct mm_struct *mm)
1932 { 1943 {
1933 struct mm_slot *mm_slot; 1944 struct mm_slot *mm_slot;
1934 struct hlist_head *bucket; 1945 struct hlist_head *bucket;
1935 struct hlist_node *node; 1946 struct hlist_node *node;
1936 1947
1937 bucket = &mm_slots_hash[((unsigned long)mm / sizeof(struct mm_struct)) 1948 bucket = &mm_slots_hash[((unsigned long)mm / sizeof(struct mm_struct))
1938 % MM_SLOTS_HASH_HEADS]; 1949 % MM_SLOTS_HASH_HEADS];
1939 hlist_for_each_entry(mm_slot, node, bucket, hash) { 1950 hlist_for_each_entry(mm_slot, node, bucket, hash) {
1940 if (mm == mm_slot->mm) 1951 if (mm == mm_slot->mm)
1941 return mm_slot; 1952 return mm_slot;
1942 } 1953 }
1943 return NULL; 1954 return NULL;
1944 } 1955 }
1945 1956
1946 static void insert_to_mm_slots_hash(struct mm_struct *mm, 1957 static void insert_to_mm_slots_hash(struct mm_struct *mm,
1947 struct mm_slot *mm_slot) 1958 struct mm_slot *mm_slot)
1948 { 1959 {
1949 struct hlist_head *bucket; 1960 struct hlist_head *bucket;
1950 1961
1951 bucket = &mm_slots_hash[((unsigned long)mm / sizeof(struct mm_struct)) 1962 bucket = &mm_slots_hash[((unsigned long)mm / sizeof(struct mm_struct))
1952 % MM_SLOTS_HASH_HEADS]; 1963 % MM_SLOTS_HASH_HEADS];
1953 mm_slot->mm = mm; 1964 mm_slot->mm = mm;
1954 hlist_add_head(&mm_slot->hash, bucket); 1965 hlist_add_head(&mm_slot->hash, bucket);
1955 } 1966 }
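
As an aside, the bucket computation shared by get_mm_slot() and insert_to_mm_slots_hash() divides the mm_struct pointer by the structure size and reduces it modulo the number of hash heads. A minimal standalone sketch follows; MM_SLOTS_HASH_HEADS and the structure size are assumed purely for illustration (the real constant is defined earlier in this file and not shown here).

#include <stdio.h>
#include <stdlib.h>

/* Assumed for illustration; the real value is defined elsewhere in the file. */
#define MM_SLOTS_HASH_HEADS 1024

/* Placeholder with a roughly mm_struct-like size; only sizeof() matters here. */
struct fake_mm_struct { char pad[896]; };

/* Same bucket index expression as get_mm_slot()/insert_to_mm_slots_hash(). */
static unsigned long mm_slots_hash_index(struct fake_mm_struct *mm)
{
	return ((unsigned long)mm / sizeof(struct fake_mm_struct))
		% MM_SLOTS_HASH_HEADS;
}

int main(void)
{
	for (int i = 0; i < 4; i++) {
		struct fake_mm_struct *mm = malloc(sizeof(*mm));
		printf("mm %p -> bucket %lu\n", (void *)mm,
		       mm_slots_hash_index(mm));
		free(mm);
	}
	return 0;
}

Dividing by the object size before taking the modulus is what spreads the allocated mm_structs across the buckets instead of clustering them.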
1956 1967
1957 static inline int khugepaged_test_exit(struct mm_struct *mm) 1968 static inline int khugepaged_test_exit(struct mm_struct *mm)
1958 { 1969 {
1959 return atomic_read(&mm->mm_users) == 0; 1970 return atomic_read(&mm->mm_users) == 0;
1960 } 1971 }
1961 1972
1962 int __khugepaged_enter(struct mm_struct *mm) 1973 int __khugepaged_enter(struct mm_struct *mm)
1963 { 1974 {
1964 struct mm_slot *mm_slot; 1975 struct mm_slot *mm_slot;
1965 int wakeup; 1976 int wakeup;
1966 1977
1967 mm_slot = alloc_mm_slot(); 1978 mm_slot = alloc_mm_slot();
1968 if (!mm_slot) 1979 if (!mm_slot)
1969 return -ENOMEM; 1980 return -ENOMEM;
1970 1981
1971 /* __khugepaged_exit() must not run from under us */ 1982 /* __khugepaged_exit() must not run from under us */
1972 VM_BUG_ON(khugepaged_test_exit(mm)); 1983 VM_BUG_ON(khugepaged_test_exit(mm));
1973 if (unlikely(test_and_set_bit(MMF_VM_HUGEPAGE, &mm->flags))) { 1984 if (unlikely(test_and_set_bit(MMF_VM_HUGEPAGE, &mm->flags))) {
1974 free_mm_slot(mm_slot); 1985 free_mm_slot(mm_slot);
1975 return 0; 1986 return 0;
1976 } 1987 }
1977 1988
1978 spin_lock(&khugepaged_mm_lock); 1989 spin_lock(&khugepaged_mm_lock);
1979 insert_to_mm_slots_hash(mm, mm_slot); 1990 insert_to_mm_slots_hash(mm, mm_slot);
1980 /* 1991 /*
1981 * Insert just behind the scanning cursor, to let the area settle 1992 * Insert just behind the scanning cursor, to let the area settle
1982 * down a little. 1993 * down a little.
1983 */ 1994 */
1984 wakeup = list_empty(&khugepaged_scan.mm_head); 1995 wakeup = list_empty(&khugepaged_scan.mm_head);
1985 list_add_tail(&mm_slot->mm_node, &khugepaged_scan.mm_head); 1996 list_add_tail(&mm_slot->mm_node, &khugepaged_scan.mm_head);
1986 spin_unlock(&khugepaged_mm_lock); 1997 spin_unlock(&khugepaged_mm_lock);
1987 1998
1988 atomic_inc(&mm->mm_count); 1999 atomic_inc(&mm->mm_count);
1989 if (wakeup) 2000 if (wakeup)
1990 wake_up_interruptible(&khugepaged_wait); 2001 wake_up_interruptible(&khugepaged_wait);
1991 2002
1992 return 0; 2003 return 0;
1993 } 2004 }
1994 2005
1995 int khugepaged_enter_vma_merge(struct vm_area_struct *vma) 2006 int khugepaged_enter_vma_merge(struct vm_area_struct *vma)
1996 { 2007 {
1997 unsigned long hstart, hend; 2008 unsigned long hstart, hend;
1998 if (!vma->anon_vma) 2009 if (!vma->anon_vma)
1999 /* 2010 /*
2000 * Not yet faulted in so we will register later in the 2011 * Not yet faulted in so we will register later in the
2001 * page fault if needed. 2012 * page fault if needed.
2002 */ 2013 */
2003 return 0; 2014 return 0;
2004 if (vma->vm_ops) 2015 if (vma->vm_ops)
2005 /* khugepaged not yet working on file or special mappings */ 2016 /* khugepaged not yet working on file or special mappings */
2006 return 0; 2017 return 0;
2007 VM_BUG_ON(vma->vm_flags & VM_NO_THP); 2018 VM_BUG_ON(vma->vm_flags & VM_NO_THP);
2008 hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; 2019 hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
2009 hend = vma->vm_end & HPAGE_PMD_MASK; 2020 hend = vma->vm_end & HPAGE_PMD_MASK;
2010 if (hstart < hend) 2021 if (hstart < hend)
2011 return khugepaged_enter(vma); 2022 return khugepaged_enter(vma);
2012 return 0; 2023 return 0;
2013 } 2024 }
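
khugepaged_enter_vma_merge() uses the usual round-up/round-down idiom: (vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK rounds the start up to the next huge-page boundary and vm_end & HPAGE_PMD_MASK rounds the end down, so hstart < hend holds only when at least one whole huge page fits inside the vma. A small standalone sketch of that arithmetic, assuming 2MB huge pages (the common x86-64 HPAGE_PMD_SIZE) just for the printed numbers:

#include <stdio.h>

/* Assume 2MB huge pages for the example; this matches x86-64's HPAGE_PMD_SIZE. */
#define HPAGE_PMD_SIZE (2UL << 20)
#define HPAGE_PMD_MASK (~(HPAGE_PMD_SIZE - 1))

int main(void)
{
	/* A vma that is not hugepage aligned at either end. */
	unsigned long vm_start = 0x00601000UL;
	unsigned long vm_end   = 0x00a7f000UL;

	/* Round the start up and the end down, as khugepaged_enter_vma_merge() does. */
	unsigned long hstart = (vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
	unsigned long hend   = vm_end & HPAGE_PMD_MASK;

	printf("vma     [%#lx, %#lx)\n", vm_start, vm_end);
	printf("rounded [%#lx, %#lx)%s\n", hstart, hend,
	       hstart < hend ? " -> at least one full huge page fits"
			     : " -> too small for a huge page");
	return 0;
}

With these sample values the rounded range is [0x800000, 0xa00000), i.e. exactly one 2MB page, so khugepaged_enter() would be called.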
2014 2025
2015 void __khugepaged_exit(struct mm_struct *mm) 2026 void __khugepaged_exit(struct mm_struct *mm)
2016 { 2027 {
2017 struct mm_slot *mm_slot; 2028 struct mm_slot *mm_slot;
2018 int free = 0; 2029 int free = 0;
2019 2030
2020 spin_lock(&khugepaged_mm_lock); 2031 spin_lock(&khugepaged_mm_lock);
2021 mm_slot = get_mm_slot(mm); 2032 mm_slot = get_mm_slot(mm);
2022 if (mm_slot && khugepaged_scan.mm_slot != mm_slot) { 2033 if (mm_slot && khugepaged_scan.mm_slot != mm_slot) {
2023 hlist_del(&mm_slot->hash); 2034 hlist_del(&mm_slot->hash);
2024 list_del(&mm_slot->mm_node); 2035 list_del(&mm_slot->mm_node);
2025 free = 1; 2036 free = 1;
2026 } 2037 }
2027 spin_unlock(&khugepaged_mm_lock); 2038 spin_unlock(&khugepaged_mm_lock);
2028 2039
2029 if (free) { 2040 if (free) {
2030 clear_bit(MMF_VM_HUGEPAGE, &mm->flags); 2041 clear_bit(MMF_VM_HUGEPAGE, &mm->flags);
2031 free_mm_slot(mm_slot); 2042 free_mm_slot(mm_slot);
2032 mmdrop(mm); 2043 mmdrop(mm);
2033 } else if (mm_slot) { 2044 } else if (mm_slot) {
2034 /* 2045 /*
2035 * This is required to serialize against 2046 * This is required to serialize against
2036 * khugepaged_test_exit() (which is guaranteed to run 2047 * khugepaged_test_exit() (which is guaranteed to run
2037 * under mmap sem read mode). Stop here (after we 2048 * under mmap sem read mode). Stop here (after we
2038 * return all pagetables will be destroyed) until 2049 * return all pagetables will be destroyed) until
2039 * khugepaged has finished working on the pagetables 2050 * khugepaged has finished working on the pagetables
2040 * under the mmap_sem. 2051 * under the mmap_sem.
2041 */ 2052 */
2042 down_write(&mm->mmap_sem); 2053 down_write(&mm->mmap_sem);
2043 up_write(&mm->mmap_sem); 2054 up_write(&mm->mmap_sem);
2044 } 2055 }
2045 } 2056 }
2046 2057
2047 static void release_pte_page(struct page *page) 2058 static void release_pte_page(struct page *page)
2048 { 2059 {
2049 /* 0 stands for page_is_file_cache(page) == false */ 2060 /* 0 stands for page_is_file_cache(page) == false */
2050 dec_zone_page_state(page, NR_ISOLATED_ANON + 0); 2061 dec_zone_page_state(page, NR_ISOLATED_ANON + 0);
2051 unlock_page(page); 2062 unlock_page(page);
2052 putback_lru_page(page); 2063 putback_lru_page(page);
2053 } 2064 }
2054 2065
2055 static void release_pte_pages(pte_t *pte, pte_t *_pte) 2066 static void release_pte_pages(pte_t *pte, pte_t *_pte)
2056 { 2067 {
2057 while (--_pte >= pte) { 2068 while (--_pte >= pte) {
2058 pte_t pteval = *_pte; 2069 pte_t pteval = *_pte;
2059 if (!pte_none(pteval)) 2070 if (!pte_none(pteval))
2060 release_pte_page(pte_page(pteval)); 2071 release_pte_page(pte_page(pteval));
2061 } 2072 }
2062 } 2073 }
2063 2074
2064 static int __collapse_huge_page_isolate(struct vm_area_struct *vma, 2075 static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
2065 unsigned long address, 2076 unsigned long address,
2066 pte_t *pte) 2077 pte_t *pte)
2067 { 2078 {
2068 struct page *page; 2079 struct page *page;
2069 pte_t *_pte; 2080 pte_t *_pte;
2070 int referenced = 0, none = 0; 2081 int referenced = 0, none = 0;
2071 for (_pte = pte; _pte < pte+HPAGE_PMD_NR; 2082 for (_pte = pte; _pte < pte+HPAGE_PMD_NR;
2072 _pte++, address += PAGE_SIZE) { 2083 _pte++, address += PAGE_SIZE) {
2073 pte_t pteval = *_pte; 2084 pte_t pteval = *_pte;
2074 if (pte_none(pteval)) { 2085 if (pte_none(pteval)) {
2075 if (++none <= khugepaged_max_ptes_none) 2086 if (++none <= khugepaged_max_ptes_none)
2076 continue; 2087 continue;
2077 else 2088 else
2078 goto out; 2089 goto out;
2079 } 2090 }
2080 if (!pte_present(pteval) || !pte_write(pteval)) 2091 if (!pte_present(pteval) || !pte_write(pteval))
2081 goto out; 2092 goto out;
2082 page = vm_normal_page(vma, address, pteval); 2093 page = vm_normal_page(vma, address, pteval);
2083 if (unlikely(!page)) 2094 if (unlikely(!page))
2084 goto out; 2095 goto out;
2085 2096
2086 VM_BUG_ON(PageCompound(page)); 2097 VM_BUG_ON(PageCompound(page));
2087 BUG_ON(!PageAnon(page)); 2098 BUG_ON(!PageAnon(page));
2088 VM_BUG_ON(!PageSwapBacked(page)); 2099 VM_BUG_ON(!PageSwapBacked(page));
2089 2100
2090 /* cannot use mapcount: can't collapse if there's a gup pin */ 2101 /* cannot use mapcount: can't collapse if there's a gup pin */
2091 if (page_count(page) != 1) 2102 if (page_count(page) != 1)
2092 goto out; 2103 goto out;
2093 /* 2104 /*
2094 * We can do it before isolate_lru_page because the 2105 * We can do it before isolate_lru_page because the
2095 * page can't be freed from under us. NOTE: PG_lock 2106 * page can't be freed from under us. NOTE: PG_lock
2096 * is needed to serialize against split_huge_page 2107 * is needed to serialize against split_huge_page
2097 * when invoked from the VM. 2108 * when invoked from the VM.
2098 */ 2109 */
2099 if (!trylock_page(page)) 2110 if (!trylock_page(page))
2100 goto out; 2111 goto out;
2101 /* 2112 /*
2102 * Isolate the page to avoid collapsing a hugepage 2113 * Isolate the page to avoid collapsing a hugepage
2103 * currently in use by the VM. 2114 * currently in use by the VM.
2104 */ 2115 */
2105 if (isolate_lru_page(page)) { 2116 if (isolate_lru_page(page)) {
2106 unlock_page(page); 2117 unlock_page(page);
2107 goto out; 2118 goto out;
2108 } 2119 }
2109 /* 0 stands for page_is_file_cache(page) == false */ 2120 /* 0 stands for page_is_file_cache(page) == false */
2110 inc_zone_page_state(page, NR_ISOLATED_ANON + 0); 2121 inc_zone_page_state(page, NR_ISOLATED_ANON + 0);
2111 VM_BUG_ON(!PageLocked(page)); 2122 VM_BUG_ON(!PageLocked(page));
2112 VM_BUG_ON(PageLRU(page)); 2123 VM_BUG_ON(PageLRU(page));
2113 2124
2114 /* If no mapped pte is young, don't collapse the page */ 2125 /* If no mapped pte is young, don't collapse the page */
2115 if (pte_young(pteval) || PageReferenced(page) || 2126 if (pte_young(pteval) || PageReferenced(page) ||
2116 mmu_notifier_test_young(vma->vm_mm, address)) 2127 mmu_notifier_test_young(vma->vm_mm, address))
2117 referenced = 1; 2128 referenced = 1;
2118 } 2129 }
2119 if (likely(referenced)) 2130 if (likely(referenced))
2120 return 1; 2131 return 1;
2121 out: 2132 out:
2122 release_pte_pages(pte, _pte); 2133 release_pte_pages(pte, _pte);
2123 return 0; 2134 return 0;
2124 } 2135 }
2125 2136
2126 static void __collapse_huge_page_copy(pte_t *pte, struct page *page, 2137 static void __collapse_huge_page_copy(pte_t *pte, struct page *page,
2127 struct vm_area_struct *vma, 2138 struct vm_area_struct *vma,
2128 unsigned long address, 2139 unsigned long address,
2129 spinlock_t *ptl) 2140 spinlock_t *ptl)
2130 { 2141 {
2131 pte_t *_pte; 2142 pte_t *_pte;
2132 for (_pte = pte; _pte < pte+HPAGE_PMD_NR; _pte++) { 2143 for (_pte = pte; _pte < pte+HPAGE_PMD_NR; _pte++) {
2133 pte_t pteval = *_pte; 2144 pte_t pteval = *_pte;
2134 struct page *src_page; 2145 struct page *src_page;
2135 2146
2136 if (pte_none(pteval)) { 2147 if (pte_none(pteval)) {
2137 clear_user_highpage(page, address); 2148 clear_user_highpage(page, address);
2138 add_mm_counter(vma->vm_mm, MM_ANONPAGES, 1); 2149 add_mm_counter(vma->vm_mm, MM_ANONPAGES, 1);
2139 } else { 2150 } else {
2140 src_page = pte_page(pteval); 2151 src_page = pte_page(pteval);
2141 copy_user_highpage(page, src_page, address, vma); 2152 copy_user_highpage(page, src_page, address, vma);
2142 VM_BUG_ON(page_mapcount(src_page) != 1); 2153 VM_BUG_ON(page_mapcount(src_page) != 1);
2143 release_pte_page(src_page); 2154 release_pte_page(src_page);
2144 /* 2155 /*
2145 * ptl mostly unnecessary, but preempt has to 2156 * ptl mostly unnecessary, but preempt has to
2146 * be disabled to update the per-cpu stats 2157 * be disabled to update the per-cpu stats
2147 * inside page_remove_rmap(). 2158 * inside page_remove_rmap().
2148 */ 2159 */
2149 spin_lock(ptl); 2160 spin_lock(ptl);
2150 /* 2161 /*
2151 * paravirt calls inside pte_clear here are 2162 * paravirt calls inside pte_clear here are
2152 * superfluous. 2163 * superfluous.
2153 */ 2164 */
2154 pte_clear(vma->vm_mm, address, _pte); 2165 pte_clear(vma->vm_mm, address, _pte);
2155 page_remove_rmap(src_page); 2166 page_remove_rmap(src_page);
2156 spin_unlock(ptl); 2167 spin_unlock(ptl);
2157 free_page_and_swap_cache(src_page); 2168 free_page_and_swap_cache(src_page);
2158 } 2169 }
2159 2170
2160 address += PAGE_SIZE; 2171 address += PAGE_SIZE;
2161 page++; 2172 page++;
2162 } 2173 }
2163 } 2174 }
2164 2175
2165 static void khugepaged_alloc_sleep(void) 2176 static void khugepaged_alloc_sleep(void)
2166 { 2177 {
2167 wait_event_freezable_timeout(khugepaged_wait, false, 2178 wait_event_freezable_timeout(khugepaged_wait, false,
2168 msecs_to_jiffies(khugepaged_alloc_sleep_millisecs)); 2179 msecs_to_jiffies(khugepaged_alloc_sleep_millisecs));
2169 } 2180 }
2170 2181
2171 #ifdef CONFIG_NUMA 2182 #ifdef CONFIG_NUMA
2172 static bool khugepaged_prealloc_page(struct page **hpage, bool *wait) 2183 static bool khugepaged_prealloc_page(struct page **hpage, bool *wait)
2173 { 2184 {
2174 if (IS_ERR(*hpage)) { 2185 if (IS_ERR(*hpage)) {
2175 if (!*wait) 2186 if (!*wait)
2176 return false; 2187 return false;
2177 2188
2178 *wait = false; 2189 *wait = false;
2179 *hpage = NULL; 2190 *hpage = NULL;
2180 khugepaged_alloc_sleep(); 2191 khugepaged_alloc_sleep();
2181 } else if (*hpage) { 2192 } else if (*hpage) {
2182 put_page(*hpage); 2193 put_page(*hpage);
2183 *hpage = NULL; 2194 *hpage = NULL;
2184 } 2195 }
2185 2196
2186 return true; 2197 return true;
2187 } 2198 }
2188 2199
2189 static struct page 2200 static struct page
2190 *khugepaged_alloc_page(struct page **hpage, struct mm_struct *mm, 2201 *khugepaged_alloc_page(struct page **hpage, struct mm_struct *mm,
2191 struct vm_area_struct *vma, unsigned long address, 2202 struct vm_area_struct *vma, unsigned long address,
2192 int node) 2203 int node)
2193 { 2204 {
2194 VM_BUG_ON(*hpage); 2205 VM_BUG_ON(*hpage);
2195 /* 2206 /*
2196 * Allocate the page while the vma is still valid and under 2207 * Allocate the page while the vma is still valid and under
2197 * the mmap_sem read mode so there is no memory allocation 2208 * the mmap_sem read mode so there is no memory allocation
2198 * later when we take the mmap_sem in write mode. This is more 2209 * later when we take the mmap_sem in write mode. This is more
2199 * friendly behavior (OTOH it may actually hide bugs) to 2210 * friendly behavior (OTOH it may actually hide bugs) to
2200 * filesystems in userland with daemons allocating memory in 2211 * filesystems in userland with daemons allocating memory in
2201 * the userland I/O paths. Allocating memory with the 2212 * the userland I/O paths. Allocating memory with the
2202 * mmap_sem in read mode is also a good idea to allow greater 2213 * mmap_sem in read mode is also a good idea to allow greater
2203 * scalability. 2214 * scalability.
2204 */ 2215 */
2205 *hpage = alloc_hugepage_vma(khugepaged_defrag(), vma, address, 2216 *hpage = alloc_hugepage_vma(khugepaged_defrag(), vma, address,
2206 node, __GFP_OTHER_NODE); 2217 node, __GFP_OTHER_NODE);
2207 2218
2208 /* 2219 /*
2209 * After allocating the hugepage, release the mmap_sem read lock in 2220 * After allocating the hugepage, release the mmap_sem read lock in
2210 * preparation for taking it in write mode. 2221 * preparation for taking it in write mode.
2211 */ 2222 */
2212 up_read(&mm->mmap_sem); 2223 up_read(&mm->mmap_sem);
2213 if (unlikely(!*hpage)) { 2224 if (unlikely(!*hpage)) {
2214 count_vm_event(THP_COLLAPSE_ALLOC_FAILED); 2225 count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
2215 *hpage = ERR_PTR(-ENOMEM); 2226 *hpage = ERR_PTR(-ENOMEM);
2216 return NULL; 2227 return NULL;
2217 } 2228 }
2218 2229
2219 count_vm_event(THP_COLLAPSE_ALLOC); 2230 count_vm_event(THP_COLLAPSE_ALLOC);
2220 return *hpage; 2231 return *hpage;
2221 } 2232 }
2222 #else 2233 #else
2223 static struct page *khugepaged_alloc_hugepage(bool *wait) 2234 static struct page *khugepaged_alloc_hugepage(bool *wait)
2224 { 2235 {
2225 struct page *hpage; 2236 struct page *hpage;
2226 2237
2227 do { 2238 do {
2228 hpage = alloc_hugepage(khugepaged_defrag()); 2239 hpage = alloc_hugepage(khugepaged_defrag());
2229 if (!hpage) { 2240 if (!hpage) {
2230 count_vm_event(THP_COLLAPSE_ALLOC_FAILED); 2241 count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
2231 if (!*wait) 2242 if (!*wait)
2232 return NULL; 2243 return NULL;
2233 2244
2234 *wait = false; 2245 *wait = false;
2235 khugepaged_alloc_sleep(); 2246 khugepaged_alloc_sleep();
2236 } else 2247 } else
2237 count_vm_event(THP_COLLAPSE_ALLOC); 2248 count_vm_event(THP_COLLAPSE_ALLOC);
2238 } while (unlikely(!hpage) && likely(khugepaged_enabled())); 2249 } while (unlikely(!hpage) && likely(khugepaged_enabled()));
2239 2250
2240 return hpage; 2251 return hpage;
2241 } 2252 }
2242 2253
2243 static bool khugepaged_prealloc_page(struct page **hpage, bool *wait) 2254 static bool khugepaged_prealloc_page(struct page **hpage, bool *wait)
2244 { 2255 {
2245 if (!*hpage) 2256 if (!*hpage)
2246 *hpage = khugepaged_alloc_hugepage(wait); 2257 *hpage = khugepaged_alloc_hugepage(wait);
2247 2258
2248 if (unlikely(!*hpage)) 2259 if (unlikely(!*hpage))
2249 return false; 2260 return false;
2250 2261
2251 return true; 2262 return true;
2252 } 2263 }
2253 2264
2254 static struct page 2265 static struct page
2255 *khugepaged_alloc_page(struct page **hpage, struct mm_struct *mm, 2266 *khugepaged_alloc_page(struct page **hpage, struct mm_struct *mm,
2256 struct vm_area_struct *vma, unsigned long address, 2267 struct vm_area_struct *vma, unsigned long address,
2257 int node) 2268 int node)
2258 { 2269 {
2259 up_read(&mm->mmap_sem); 2270 up_read(&mm->mmap_sem);
2260 VM_BUG_ON(!*hpage); 2271 VM_BUG_ON(!*hpage);
2261 return *hpage; 2272 return *hpage;
2262 } 2273 }
2263 #endif 2274 #endif
2264 2275
2265 static bool hugepage_vma_check(struct vm_area_struct *vma) 2276 static bool hugepage_vma_check(struct vm_area_struct *vma)
2266 { 2277 {
2267 if ((!(vma->vm_flags & VM_HUGEPAGE) && !khugepaged_always()) || 2278 if ((!(vma->vm_flags & VM_HUGEPAGE) && !khugepaged_always()) ||
2268 (vma->vm_flags & VM_NOHUGEPAGE)) 2279 (vma->vm_flags & VM_NOHUGEPAGE))
2269 return false; 2280 return false;
2270 2281
2271 if (!vma->anon_vma || vma->vm_ops) 2282 if (!vma->anon_vma || vma->vm_ops)
2272 return false; 2283 return false;
2273 if (is_vma_temporary_stack(vma)) 2284 if (is_vma_temporary_stack(vma))
2274 return false; 2285 return false;
2275 VM_BUG_ON(vma->vm_flags & VM_NO_THP); 2286 VM_BUG_ON(vma->vm_flags & VM_NO_THP);
2276 return true; 2287 return true;
2277 } 2288 }
2278 2289
2279 static void collapse_huge_page(struct mm_struct *mm, 2290 static void collapse_huge_page(struct mm_struct *mm,
2280 unsigned long address, 2291 unsigned long address,
2281 struct page **hpage, 2292 struct page **hpage,
2282 struct vm_area_struct *vma, 2293 struct vm_area_struct *vma,
2283 int node) 2294 int node)
2284 { 2295 {
2285 pmd_t *pmd, _pmd; 2296 pmd_t *pmd, _pmd;
2286 pte_t *pte; 2297 pte_t *pte;
2287 pgtable_t pgtable; 2298 pgtable_t pgtable;
2288 struct page *new_page; 2299 struct page *new_page;
2289 spinlock_t *ptl; 2300 spinlock_t *ptl;
2290 int isolated; 2301 int isolated;
2291 unsigned long hstart, hend; 2302 unsigned long hstart, hend;
2292 unsigned long mmun_start; /* For mmu_notifiers */ 2303 unsigned long mmun_start; /* For mmu_notifiers */
2293 unsigned long mmun_end; /* For mmu_notifiers */ 2304 unsigned long mmun_end; /* For mmu_notifiers */
2294 2305
2295 VM_BUG_ON(address & ~HPAGE_PMD_MASK); 2306 VM_BUG_ON(address & ~HPAGE_PMD_MASK);
2296 2307
2297 /* release the mmap_sem read lock. */ 2308 /* release the mmap_sem read lock. */
2298 new_page = khugepaged_alloc_page(hpage, mm, vma, address, node); 2309 new_page = khugepaged_alloc_page(hpage, mm, vma, address, node);
2299 if (!new_page) 2310 if (!new_page)
2300 return; 2311 return;
2301 2312
2302 if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) 2313 if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL)))
2303 return; 2314 return;
2304 2315
2305 /* 2316 /*
2306 * Prevent all access to pagetables with the exception of 2317 * Prevent all access to pagetables with the exception of
2307 * gup_fast later handled by the ptep_clear_flush and the VM 2318 * gup_fast later handled by the ptep_clear_flush and the VM
2308 * handled by the anon_vma lock + PG_lock. 2319 * handled by the anon_vma lock + PG_lock.
2309 */ 2320 */
2310 down_write(&mm->mmap_sem); 2321 down_write(&mm->mmap_sem);
2311 if (unlikely(khugepaged_test_exit(mm))) 2322 if (unlikely(khugepaged_test_exit(mm)))
2312 goto out; 2323 goto out;
2313 2324
2314 vma = find_vma(mm, address); 2325 vma = find_vma(mm, address);
2315 hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; 2326 hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
2316 hend = vma->vm_end & HPAGE_PMD_MASK; 2327 hend = vma->vm_end & HPAGE_PMD_MASK;
2317 if (address < hstart || address + HPAGE_PMD_SIZE > hend) 2328 if (address < hstart || address + HPAGE_PMD_SIZE > hend)
2318 goto out; 2329 goto out;
2319 if (!hugepage_vma_check(vma)) 2330 if (!hugepage_vma_check(vma))
2320 goto out; 2331 goto out;
2321 pmd = mm_find_pmd(mm, address); 2332 pmd = mm_find_pmd(mm, address);
2322 if (!pmd) 2333 if (!pmd)
2323 goto out; 2334 goto out;
2324 if (pmd_trans_huge(*pmd)) 2335 if (pmd_trans_huge(*pmd))
2325 goto out; 2336 goto out;
2326 2337
2327 anon_vma_lock_write(vma->anon_vma); 2338 anon_vma_lock_write(vma->anon_vma);
2328 2339
2329 pte = pte_offset_map(pmd, address); 2340 pte = pte_offset_map(pmd, address);
2330 ptl = pte_lockptr(mm, pmd); 2341 ptl = pte_lockptr(mm, pmd);
2331 2342
2332 mmun_start = address; 2343 mmun_start = address;
2333 mmun_end = address + HPAGE_PMD_SIZE; 2344 mmun_end = address + HPAGE_PMD_SIZE;
2334 mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); 2345 mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
2335 spin_lock(&mm->page_table_lock); /* probably unnecessary */ 2346 spin_lock(&mm->page_table_lock); /* probably unnecessary */
2336 /* 2347 /*
2337 * After this gup_fast can't run anymore. This also removes 2348 * After this gup_fast can't run anymore. This also removes
2338 * any huge TLB entry from the CPU so we won't allow 2349 * any huge TLB entry from the CPU so we won't allow
2339 * huge and small TLB entries for the same virtual address 2350 * huge and small TLB entries for the same virtual address
2340 * to avoid the risk of CPU bugs in that area. 2351 * to avoid the risk of CPU bugs in that area.
2341 */ 2352 */
2342 _pmd = pmdp_clear_flush(vma, address, pmd); 2353 _pmd = pmdp_clear_flush(vma, address, pmd);
2343 spin_unlock(&mm->page_table_lock); 2354 spin_unlock(&mm->page_table_lock);
2344 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); 2355 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
2345 2356
2346 spin_lock(ptl); 2357 spin_lock(ptl);
2347 isolated = __collapse_huge_page_isolate(vma, address, pte); 2358 isolated = __collapse_huge_page_isolate(vma, address, pte);
2348 spin_unlock(ptl); 2359 spin_unlock(ptl);
2349 2360
2350 if (unlikely(!isolated)) { 2361 if (unlikely(!isolated)) {
2351 pte_unmap(pte); 2362 pte_unmap(pte);
2352 spin_lock(&mm->page_table_lock); 2363 spin_lock(&mm->page_table_lock);
2353 BUG_ON(!pmd_none(*pmd)); 2364 BUG_ON(!pmd_none(*pmd));
2354 set_pmd_at(mm, address, pmd, _pmd); 2365 set_pmd_at(mm, address, pmd, _pmd);
2355 spin_unlock(&mm->page_table_lock); 2366 spin_unlock(&mm->page_table_lock);
2356 anon_vma_unlock(vma->anon_vma); 2367 anon_vma_unlock(vma->anon_vma);
2357 goto out; 2368 goto out;
2358 } 2369 }
2359 2370
2360 /* 2371 /*
2361 * All pages are isolated and locked so anon_vma rmap 2372 * All pages are isolated and locked so anon_vma rmap
2362 * can't run anymore. 2373 * can't run anymore.
2363 */ 2374 */
2364 anon_vma_unlock(vma->anon_vma); 2375 anon_vma_unlock(vma->anon_vma);
2365 2376
2366 __collapse_huge_page_copy(pte, new_page, vma, address, ptl); 2377 __collapse_huge_page_copy(pte, new_page, vma, address, ptl);
2367 pte_unmap(pte); 2378 pte_unmap(pte);
2368 __SetPageUptodate(new_page); 2379 __SetPageUptodate(new_page);
2369 pgtable = pmd_pgtable(_pmd); 2380 pgtable = pmd_pgtable(_pmd);
2370 2381
2371 _pmd = mk_huge_pmd(new_page, vma); 2382 _pmd = mk_huge_pmd(new_page, vma);
2372 2383
2373 /* 2384 /*
2374 * spin_lock() below is not the equivalent of smp_wmb(), so 2385 * spin_lock() below is not the equivalent of smp_wmb(), so
2375 * this is needed to prevent the copy_huge_page writes from becoming 2386 * this is needed to prevent the copy_huge_page writes from becoming
2376 * visible after the set_pmd_at() write. 2387 * visible after the set_pmd_at() write.
2377 */ 2388 */
2378 smp_wmb(); 2389 smp_wmb();
2379 2390
2380 spin_lock(&mm->page_table_lock); 2391 spin_lock(&mm->page_table_lock);
2381 BUG_ON(!pmd_none(*pmd)); 2392 BUG_ON(!pmd_none(*pmd));
2382 page_add_new_anon_rmap(new_page, vma, address); 2393 page_add_new_anon_rmap(new_page, vma, address);
2383 set_pmd_at(mm, address, pmd, _pmd); 2394 set_pmd_at(mm, address, pmd, _pmd);
2384 update_mmu_cache_pmd(vma, address, pmd); 2395 update_mmu_cache_pmd(vma, address, pmd);
2385 pgtable_trans_huge_deposit(mm, pgtable); 2396 pgtable_trans_huge_deposit(mm, pgtable);
2386 spin_unlock(&mm->page_table_lock); 2397 spin_unlock(&mm->page_table_lock);
2387 2398
2388 *hpage = NULL; 2399 *hpage = NULL;
2389 2400
2390 khugepaged_pages_collapsed++; 2401 khugepaged_pages_collapsed++;
2391 out_up_write: 2402 out_up_write:
2392 up_write(&mm->mmap_sem); 2403 up_write(&mm->mmap_sem);
2393 return; 2404 return;
2394 2405
2395 out: 2406 out:
2396 mem_cgroup_uncharge_page(new_page); 2407 mem_cgroup_uncharge_page(new_page);
2397 goto out_up_write; 2408 goto out_up_write;
2398 } 2409 }
2399 2410
2400 static int khugepaged_scan_pmd(struct mm_struct *mm, 2411 static int khugepaged_scan_pmd(struct mm_struct *mm,
2401 struct vm_area_struct *vma, 2412 struct vm_area_struct *vma,
2402 unsigned long address, 2413 unsigned long address,
2403 struct page **hpage) 2414 struct page **hpage)
2404 { 2415 {
2405 pmd_t *pmd; 2416 pmd_t *pmd;
2406 pte_t *pte, *_pte; 2417 pte_t *pte, *_pte;
2407 int ret = 0, referenced = 0, none = 0; 2418 int ret = 0, referenced = 0, none = 0;
2408 struct page *page; 2419 struct page *page;
2409 unsigned long _address; 2420 unsigned long _address;
2410 spinlock_t *ptl; 2421 spinlock_t *ptl;
2411 int node = -1; 2422 int node = -1;
2412 2423
2413 VM_BUG_ON(address & ~HPAGE_PMD_MASK); 2424 VM_BUG_ON(address & ~HPAGE_PMD_MASK);
2414 2425
2415 pmd = mm_find_pmd(mm, address); 2426 pmd = mm_find_pmd(mm, address);
2416 if (!pmd) 2427 if (!pmd)
2417 goto out; 2428 goto out;
2418 if (pmd_trans_huge(*pmd)) 2429 if (pmd_trans_huge(*pmd))
2419 goto out; 2430 goto out;
2420 2431
2421 pte = pte_offset_map_lock(mm, pmd, address, &ptl); 2432 pte = pte_offset_map_lock(mm, pmd, address, &ptl);
2422 for (_address = address, _pte = pte; _pte < pte+HPAGE_PMD_NR; 2433 for (_address = address, _pte = pte; _pte < pte+HPAGE_PMD_NR;
2423 _pte++, _address += PAGE_SIZE) { 2434 _pte++, _address += PAGE_SIZE) {
2424 pte_t pteval = *_pte; 2435 pte_t pteval = *_pte;
2425 if (pte_none(pteval)) { 2436 if (pte_none(pteval)) {
2426 if (++none <= khugepaged_max_ptes_none) 2437 if (++none <= khugepaged_max_ptes_none)
2427 continue; 2438 continue;
2428 else 2439 else
2429 goto out_unmap; 2440 goto out_unmap;
2430 } 2441 }
2431 if (!pte_present(pteval) || !pte_write(pteval)) 2442 if (!pte_present(pteval) || !pte_write(pteval))
2432 goto out_unmap; 2443 goto out_unmap;
2433 page = vm_normal_page(vma, _address, pteval); 2444 page = vm_normal_page(vma, _address, pteval);
2434 if (unlikely(!page)) 2445 if (unlikely(!page))
2435 goto out_unmap; 2446 goto out_unmap;
2436 /* 2447 /*
2437 * Choose the node of the first page. This could 2448 * Choose the node of the first page. This could
2438 * be more sophisticated and look at more pages, 2449 * be more sophisticated and look at more pages,
2439 * but isn't for now. 2450 * but isn't for now.
2440 */ 2451 */
2441 if (node == -1) 2452 if (node == -1)
2442 node = page_to_nid(page); 2453 node = page_to_nid(page);
2443 VM_BUG_ON(PageCompound(page)); 2454 VM_BUG_ON(PageCompound(page));
2444 if (!PageLRU(page) || PageLocked(page) || !PageAnon(page)) 2455 if (!PageLRU(page) || PageLocked(page) || !PageAnon(page))
2445 goto out_unmap; 2456 goto out_unmap;
2446 /* cannot use mapcount: can't collapse if there's a gup pin */ 2457 /* cannot use mapcount: can't collapse if there's a gup pin */
2447 if (page_count(page) != 1) 2458 if (page_count(page) != 1)
2448 goto out_unmap; 2459 goto out_unmap;
2449 if (pte_young(pteval) || PageReferenced(page) || 2460 if (pte_young(pteval) || PageReferenced(page) ||
2450 mmu_notifier_test_young(vma->vm_mm, address)) 2461 mmu_notifier_test_young(vma->vm_mm, address))
2451 referenced = 1; 2462 referenced = 1;
2452 } 2463 }
2453 if (referenced) 2464 if (referenced)
2454 ret = 1; 2465 ret = 1;
2455 out_unmap: 2466 out_unmap:
2456 pte_unmap_unlock(pte, ptl); 2467 pte_unmap_unlock(pte, ptl);
2457 if (ret) 2468 if (ret)
2458 /* collapse_huge_page will return with the mmap_sem released */ 2469 /* collapse_huge_page will return with the mmap_sem released */
2459 collapse_huge_page(mm, address, hpage, vma, node); 2470 collapse_huge_page(mm, address, hpage, vma, node);
2460 out: 2471 out:
2461 return ret; 2472 return ret;
2462 } 2473 }
2463 2474
2464 static void collect_mm_slot(struct mm_slot *mm_slot) 2475 static void collect_mm_slot(struct mm_slot *mm_slot)
2465 { 2476 {
2466 struct mm_struct *mm = mm_slot->mm; 2477 struct mm_struct *mm = mm_slot->mm;
2467 2478
2468 VM_BUG_ON(NR_CPUS != 1 && !spin_is_locked(&khugepaged_mm_lock)); 2479 VM_BUG_ON(NR_CPUS != 1 && !spin_is_locked(&khugepaged_mm_lock));
2469 2480
2470 if (khugepaged_test_exit(mm)) { 2481 if (khugepaged_test_exit(mm)) {
2471 /* free mm_slot */ 2482 /* free mm_slot */
2472 hlist_del(&mm_slot->hash); 2483 hlist_del(&mm_slot->hash);
2473 list_del(&mm_slot->mm_node); 2484 list_del(&mm_slot->mm_node);
2474 2485
2475 /* 2486 /*
2476 * Not strictly needed because the mm exited already. 2487 * Not strictly needed because the mm exited already.
2477 * 2488 *
2478 * clear_bit(MMF_VM_HUGEPAGE, &mm->flags); 2489 * clear_bit(MMF_VM_HUGEPAGE, &mm->flags);
2479 */ 2490 */
2480 2491
2481 /* khugepaged_mm_lock actually not necessary for the below */ 2492 /* khugepaged_mm_lock actually not necessary for the below */
2482 free_mm_slot(mm_slot); 2493 free_mm_slot(mm_slot);
2483 mmdrop(mm); 2494 mmdrop(mm);
2484 } 2495 }
2485 } 2496 }
2486 2497
2487 static unsigned int khugepaged_scan_mm_slot(unsigned int pages, 2498 static unsigned int khugepaged_scan_mm_slot(unsigned int pages,
2488 struct page **hpage) 2499 struct page **hpage)
2489 __releases(&khugepaged_mm_lock) 2500 __releases(&khugepaged_mm_lock)
2490 __acquires(&khugepaged_mm_lock) 2501 __acquires(&khugepaged_mm_lock)
2491 { 2502 {
2492 struct mm_slot *mm_slot; 2503 struct mm_slot *mm_slot;
2493 struct mm_struct *mm; 2504 struct mm_struct *mm;
2494 struct vm_area_struct *vma; 2505 struct vm_area_struct *vma;
2495 int progress = 0; 2506 int progress = 0;
2496 2507
2497 VM_BUG_ON(!pages); 2508 VM_BUG_ON(!pages);
2498 VM_BUG_ON(NR_CPUS != 1 && !spin_is_locked(&khugepaged_mm_lock)); 2509 VM_BUG_ON(NR_CPUS != 1 && !spin_is_locked(&khugepaged_mm_lock));
2499 2510
2500 if (khugepaged_scan.mm_slot) 2511 if (khugepaged_scan.mm_slot)
2501 mm_slot = khugepaged_scan.mm_slot; 2512 mm_slot = khugepaged_scan.mm_slot;
2502 else { 2513 else {
2503 mm_slot = list_entry(khugepaged_scan.mm_head.next, 2514 mm_slot = list_entry(khugepaged_scan.mm_head.next,
2504 struct mm_slot, mm_node); 2515 struct mm_slot, mm_node);
2505 khugepaged_scan.address = 0; 2516 khugepaged_scan.address = 0;
2506 khugepaged_scan.mm_slot = mm_slot; 2517 khugepaged_scan.mm_slot = mm_slot;
2507 } 2518 }
2508 spin_unlock(&khugepaged_mm_lock); 2519 spin_unlock(&khugepaged_mm_lock);
2509 2520
2510 mm = mm_slot->mm; 2521 mm = mm_slot->mm;
2511 down_read(&mm->mmap_sem); 2522 down_read(&mm->mmap_sem);
2512 if (unlikely(khugepaged_test_exit(mm))) 2523 if (unlikely(khugepaged_test_exit(mm)))
2513 vma = NULL; 2524 vma = NULL;
2514 else 2525 else
2515 vma = find_vma(mm, khugepaged_scan.address); 2526 vma = find_vma(mm, khugepaged_scan.address);
2516 2527
2517 progress++; 2528 progress++;
2518 for (; vma; vma = vma->vm_next) { 2529 for (; vma; vma = vma->vm_next) {
2519 unsigned long hstart, hend; 2530 unsigned long hstart, hend;
2520 2531
2521 cond_resched(); 2532 cond_resched();
2522 if (unlikely(khugepaged_test_exit(mm))) { 2533 if (unlikely(khugepaged_test_exit(mm))) {
2523 progress++; 2534 progress++;
2524 break; 2535 break;
2525 } 2536 }
2526 if (!hugepage_vma_check(vma)) { 2537 if (!hugepage_vma_check(vma)) {
2527 skip: 2538 skip:
2528 progress++; 2539 progress++;
2529 continue; 2540 continue;
2530 } 2541 }
2531 hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; 2542 hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
2532 hend = vma->vm_end & HPAGE_PMD_MASK; 2543 hend = vma->vm_end & HPAGE_PMD_MASK;
2533 if (hstart >= hend) 2544 if (hstart >= hend)
2534 goto skip; 2545 goto skip;
2535 if (khugepaged_scan.address > hend) 2546 if (khugepaged_scan.address > hend)
2536 goto skip; 2547 goto skip;
2537 if (khugepaged_scan.address < hstart) 2548 if (khugepaged_scan.address < hstart)
2538 khugepaged_scan.address = hstart; 2549 khugepaged_scan.address = hstart;
2539 VM_BUG_ON(khugepaged_scan.address & ~HPAGE_PMD_MASK); 2550 VM_BUG_ON(khugepaged_scan.address & ~HPAGE_PMD_MASK);
2540 2551
2541 while (khugepaged_scan.address < hend) { 2552 while (khugepaged_scan.address < hend) {
2542 int ret; 2553 int ret;
2543 cond_resched(); 2554 cond_resched();
2544 if (unlikely(khugepaged_test_exit(mm))) 2555 if (unlikely(khugepaged_test_exit(mm)))
2545 goto breakouterloop; 2556 goto breakouterloop;
2546 2557
2547 VM_BUG_ON(khugepaged_scan.address < hstart || 2558 VM_BUG_ON(khugepaged_scan.address < hstart ||
2548 khugepaged_scan.address + HPAGE_PMD_SIZE > 2559 khugepaged_scan.address + HPAGE_PMD_SIZE >
2549 hend); 2560 hend);
2550 ret = khugepaged_scan_pmd(mm, vma, 2561 ret = khugepaged_scan_pmd(mm, vma,
2551 khugepaged_scan.address, 2562 khugepaged_scan.address,
2552 hpage); 2563 hpage);
2553 /* move to next address */ 2564 /* move to next address */
2554 khugepaged_scan.address += HPAGE_PMD_SIZE; 2565 khugepaged_scan.address += HPAGE_PMD_SIZE;
2555 progress += HPAGE_PMD_NR; 2566 progress += HPAGE_PMD_NR;
2556 if (ret) 2567 if (ret)
2557 /* we released mmap_sem so break loop */ 2568 /* we released mmap_sem so break loop */
2558 goto breakouterloop_mmap_sem; 2569 goto breakouterloop_mmap_sem;
2559 if (progress >= pages) 2570 if (progress >= pages)
2560 goto breakouterloop; 2571 goto breakouterloop;
2561 } 2572 }
2562 } 2573 }
2563 breakouterloop: 2574 breakouterloop:
2564 up_read(&mm->mmap_sem); /* exit_mmap will destroy ptes after this */ 2575 up_read(&mm->mmap_sem); /* exit_mmap will destroy ptes after this */
2565 breakouterloop_mmap_sem: 2576 breakouterloop_mmap_sem:
2566 2577
2567 spin_lock(&khugepaged_mm_lock); 2578 spin_lock(&khugepaged_mm_lock);
2568 VM_BUG_ON(khugepaged_scan.mm_slot != mm_slot); 2579 VM_BUG_ON(khugepaged_scan.mm_slot != mm_slot);
2569 /* 2580 /*
2570 * Release the current mm_slot if this mm is about to die, or 2581 * Release the current mm_slot if this mm is about to die, or
2571 * if we scanned all vmas of this mm. 2582 * if we scanned all vmas of this mm.
2572 */ 2583 */
2573 if (khugepaged_test_exit(mm) || !vma) { 2584 if (khugepaged_test_exit(mm) || !vma) {
2574 /* 2585 /*
2575 * Make sure that if mm_users is reaching zero while 2586 * Make sure that if mm_users is reaching zero while
2576 * khugepaged runs here, khugepaged_exit will find 2587 * khugepaged runs here, khugepaged_exit will find
2577 * mm_slot not pointing to the exiting mm. 2588 * mm_slot not pointing to the exiting mm.
2578 */ 2589 */
2579 if (mm_slot->mm_node.next != &khugepaged_scan.mm_head) { 2590 if (mm_slot->mm_node.next != &khugepaged_scan.mm_head) {
2580 khugepaged_scan.mm_slot = list_entry( 2591 khugepaged_scan.mm_slot = list_entry(
2581 mm_slot->mm_node.next, 2592 mm_slot->mm_node.next,
2582 struct mm_slot, mm_node); 2593 struct mm_slot, mm_node);
2583 khugepaged_scan.address = 0; 2594 khugepaged_scan.address = 0;
2584 } else { 2595 } else {
2585 khugepaged_scan.mm_slot = NULL; 2596 khugepaged_scan.mm_slot = NULL;
2586 khugepaged_full_scans++; 2597 khugepaged_full_scans++;
2587 } 2598 }
2588 2599
2589 collect_mm_slot(mm_slot); 2600 collect_mm_slot(mm_slot);
2590 } 2601 }
2591 2602
2592 return progress; 2603 return progress;
2593 } 2604 }
2594 2605
2595 static int khugepaged_has_work(void) 2606 static int khugepaged_has_work(void)
2596 { 2607 {
2597 return !list_empty(&khugepaged_scan.mm_head) && 2608 return !list_empty(&khugepaged_scan.mm_head) &&
2598 khugepaged_enabled(); 2609 khugepaged_enabled();
2599 } 2610 }
2600 2611
2601 static int khugepaged_wait_event(void) 2612 static int khugepaged_wait_event(void)
2602 { 2613 {
2603 return !list_empty(&khugepaged_scan.mm_head) || 2614 return !list_empty(&khugepaged_scan.mm_head) ||
2604 kthread_should_stop(); 2615 kthread_should_stop();
2605 } 2616 }
2606 2617
2607 static void khugepaged_do_scan(void) 2618 static void khugepaged_do_scan(void)
2608 { 2619 {
2609 struct page *hpage = NULL; 2620 struct page *hpage = NULL;
2610 unsigned int progress = 0, pass_through_head = 0; 2621 unsigned int progress = 0, pass_through_head = 0;
2611 unsigned int pages = khugepaged_pages_to_scan; 2622 unsigned int pages = khugepaged_pages_to_scan;
2612 bool wait = true; 2623 bool wait = true;
2613 2624
2614 barrier(); /* write khugepaged_pages_to_scan to local stack */ 2625 barrier(); /* write khugepaged_pages_to_scan to local stack */
2615 2626
2616 while (progress < pages) { 2627 while (progress < pages) {
2617 if (!khugepaged_prealloc_page(&hpage, &wait)) 2628 if (!khugepaged_prealloc_page(&hpage, &wait))
2618 break; 2629 break;
2619 2630
2620 cond_resched(); 2631 cond_resched();
2621 2632
2622 if (unlikely(kthread_should_stop() || freezing(current))) 2633 if (unlikely(kthread_should_stop() || freezing(current)))
2623 break; 2634 break;
2624 2635
2625 spin_lock(&khugepaged_mm_lock); 2636 spin_lock(&khugepaged_mm_lock);
2626 if (!khugepaged_scan.mm_slot) 2637 if (!khugepaged_scan.mm_slot)
2627 pass_through_head++; 2638 pass_through_head++;
2628 if (khugepaged_has_work() && 2639 if (khugepaged_has_work() &&
2629 pass_through_head < 2) 2640 pass_through_head < 2)
2630 progress += khugepaged_scan_mm_slot(pages - progress, 2641 progress += khugepaged_scan_mm_slot(pages - progress,
2631 &hpage); 2642 &hpage);
2632 else 2643 else
2633 progress = pages; 2644 progress = pages;
2634 spin_unlock(&khugepaged_mm_lock); 2645 spin_unlock(&khugepaged_mm_lock);
2635 } 2646 }
2636 2647
2637 if (!IS_ERR_OR_NULL(hpage)) 2648 if (!IS_ERR_OR_NULL(hpage))
2638 put_page(hpage); 2649 put_page(hpage);
2639 } 2650 }
2640 2651
2641 static void khugepaged_wait_work(void) 2652 static void khugepaged_wait_work(void)
2642 { 2653 {
2643 try_to_freeze(); 2654 try_to_freeze();
2644 2655
2645 if (khugepaged_has_work()) { 2656 if (khugepaged_has_work()) {
2646 if (!khugepaged_scan_sleep_millisecs) 2657 if (!khugepaged_scan_sleep_millisecs)
2647 return; 2658 return;
2648 2659
2649 wait_event_freezable_timeout(khugepaged_wait, 2660 wait_event_freezable_timeout(khugepaged_wait,
2650 kthread_should_stop(), 2661 kthread_should_stop(),
2651 msecs_to_jiffies(khugepaged_scan_sleep_millisecs)); 2662 msecs_to_jiffies(khugepaged_scan_sleep_millisecs));
2652 return; 2663 return;
2653 } 2664 }
2654 2665
2655 if (khugepaged_enabled()) 2666 if (khugepaged_enabled())
2656 wait_event_freezable(khugepaged_wait, khugepaged_wait_event()); 2667 wait_event_freezable(khugepaged_wait, khugepaged_wait_event());
2657 } 2668 }
2658 2669
2659 static int khugepaged(void *none) 2670 static int khugepaged(void *none)
2660 { 2671 {
2661 struct mm_slot *mm_slot; 2672 struct mm_slot *mm_slot;
2662 2673
2663 set_freezable(); 2674 set_freezable();
2664 set_user_nice(current, 19); 2675 set_user_nice(current, 19);
2665 2676
2666 while (!kthread_should_stop()) { 2677 while (!kthread_should_stop()) {
2667 khugepaged_do_scan(); 2678 khugepaged_do_scan();
2668 khugepaged_wait_work(); 2679 khugepaged_wait_work();
2669 } 2680 }
2670 2681
2671 spin_lock(&khugepaged_mm_lock); 2682 spin_lock(&khugepaged_mm_lock);
2672 mm_slot = khugepaged_scan.mm_slot; 2683 mm_slot = khugepaged_scan.mm_slot;
2673 khugepaged_scan.mm_slot = NULL; 2684 khugepaged_scan.mm_slot = NULL;
2674 if (mm_slot) 2685 if (mm_slot)
2675 collect_mm_slot(mm_slot); 2686 collect_mm_slot(mm_slot);
2676 spin_unlock(&khugepaged_mm_lock); 2687 spin_unlock(&khugepaged_mm_lock);
2677 return 0; 2688 return 0;
2678 } 2689 }
2679 2690
2680 static void __split_huge_zero_page_pmd(struct vm_area_struct *vma, 2691 static void __split_huge_zero_page_pmd(struct vm_area_struct *vma,
2681 unsigned long haddr, pmd_t *pmd) 2692 unsigned long haddr, pmd_t *pmd)
2682 { 2693 {
2683 struct mm_struct *mm = vma->vm_mm; 2694 struct mm_struct *mm = vma->vm_mm;
2684 pgtable_t pgtable; 2695 pgtable_t pgtable;
2685 pmd_t _pmd; 2696 pmd_t _pmd;
2686 int i; 2697 int i;
2687 2698
2688 pmdp_clear_flush(vma, haddr, pmd); 2699 pmdp_clear_flush(vma, haddr, pmd);
2689 /* leave pmd empty until pte is filled */ 2700 /* leave pmd empty until pte is filled */
2690 2701
2691 pgtable = pgtable_trans_huge_withdraw(mm); 2702 pgtable = pgtable_trans_huge_withdraw(mm);
2692 pmd_populate(mm, &_pmd, pgtable); 2703 pmd_populate(mm, &_pmd, pgtable);
2693 2704
2694 for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) { 2705 for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
2695 pte_t *pte, entry; 2706 pte_t *pte, entry;
2696 entry = pfn_pte(my_zero_pfn(haddr), vma->vm_page_prot); 2707 entry = pfn_pte(my_zero_pfn(haddr), vma->vm_page_prot);
2697 entry = pte_mkspecial(entry); 2708 entry = pte_mkspecial(entry);
2698 pte = pte_offset_map(&_pmd, haddr); 2709 pte = pte_offset_map(&_pmd, haddr);
2699 VM_BUG_ON(!pte_none(*pte)); 2710 VM_BUG_ON(!pte_none(*pte));
2700 set_pte_at(mm, haddr, pte, entry); 2711 set_pte_at(mm, haddr, pte, entry);
2701 pte_unmap(pte); 2712 pte_unmap(pte);
2702 } 2713 }
2703 smp_wmb(); /* make pte visible before pmd */ 2714 smp_wmb(); /* make pte visible before pmd */
2704 pmd_populate(mm, pmd, pgtable); 2715 pmd_populate(mm, pmd, pgtable);
2705 put_huge_zero_page(); 2716 put_huge_zero_page();
2706 } 2717 }
2707 2718
2708 void __split_huge_page_pmd(struct vm_area_struct *vma, unsigned long address, 2719 void __split_huge_page_pmd(struct vm_area_struct *vma, unsigned long address,
2709 pmd_t *pmd) 2720 pmd_t *pmd)
2710 { 2721 {
2711 struct page *page; 2722 struct page *page;
2712 struct mm_struct *mm = vma->vm_mm; 2723 struct mm_struct *mm = vma->vm_mm;
2713 unsigned long haddr = address & HPAGE_PMD_MASK; 2724 unsigned long haddr = address & HPAGE_PMD_MASK;
2714 unsigned long mmun_start; /* For mmu_notifiers */ 2725 unsigned long mmun_start; /* For mmu_notifiers */
2715 unsigned long mmun_end; /* For mmu_notifiers */ 2726 unsigned long mmun_end; /* For mmu_notifiers */
2716 2727
2717 BUG_ON(vma->vm_start > haddr || vma->vm_end < haddr + HPAGE_PMD_SIZE); 2728 BUG_ON(vma->vm_start > haddr || vma->vm_end < haddr + HPAGE_PMD_SIZE);
2718 2729
2719 mmun_start = haddr; 2730 mmun_start = haddr;
2720 mmun_end = haddr + HPAGE_PMD_SIZE; 2731 mmun_end = haddr + HPAGE_PMD_SIZE;
2721 mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); 2732 mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
2722 spin_lock(&mm->page_table_lock); 2733 spin_lock(&mm->page_table_lock);
2723 if (unlikely(!pmd_trans_huge(*pmd))) { 2734 if (unlikely(!pmd_trans_huge(*pmd))) {
2724 spin_unlock(&mm->page_table_lock); 2735 spin_unlock(&mm->page_table_lock);
2725 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); 2736 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
2726 return; 2737 return;
2727 } 2738 }
2728 if (is_huge_zero_pmd(*pmd)) { 2739 if (is_huge_zero_pmd(*pmd)) {
2729 __split_huge_zero_page_pmd(vma, haddr, pmd); 2740 __split_huge_zero_page_pmd(vma, haddr, pmd);
2730 spin_unlock(&mm->page_table_lock); 2741 spin_unlock(&mm->page_table_lock);
2731 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); 2742 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
2732 return; 2743 return;
2733 } 2744 }
2734 page = pmd_page(*pmd); 2745 page = pmd_page(*pmd);
2735 VM_BUG_ON(!page_count(page)); 2746 VM_BUG_ON(!page_count(page));
2736 get_page(page); 2747 get_page(page);
2737 spin_unlock(&mm->page_table_lock); 2748 spin_unlock(&mm->page_table_lock);
2738 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); 2749 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
2739 2750
2740 split_huge_page(page); 2751 split_huge_page(page);
2741 2752
2742 put_page(page); 2753 put_page(page);
2743 BUG_ON(pmd_trans_huge(*pmd)); 2754 BUG_ON(pmd_trans_huge(*pmd));
2744 } 2755 }
2745 2756
2746 void split_huge_page_pmd_mm(struct mm_struct *mm, unsigned long address, 2757 void split_huge_page_pmd_mm(struct mm_struct *mm, unsigned long address,
2747 pmd_t *pmd) 2758 pmd_t *pmd)
2748 { 2759 {
2749 struct vm_area_struct *vma; 2760 struct vm_area_struct *vma;
2750 2761
2751 vma = find_vma(mm, address); 2762 vma = find_vma(mm, address);
2752 BUG_ON(vma == NULL); 2763 BUG_ON(vma == NULL);
2753 split_huge_page_pmd(vma, address, pmd); 2764 split_huge_page_pmd(vma, address, pmd);
2754 } 2765 }
2755 2766
2756 static void split_huge_page_address(struct mm_struct *mm, 2767 static void split_huge_page_address(struct mm_struct *mm,
2757 unsigned long address) 2768 unsigned long address)
2758 { 2769 {
2759 pmd_t *pmd; 2770 pmd_t *pmd;
2760 2771
2761 VM_BUG_ON(!(address & ~HPAGE_PMD_MASK)); 2772 VM_BUG_ON(!(address & ~HPAGE_PMD_MASK));
2762 2773
2763 pmd = mm_find_pmd(mm, address); 2774 pmd = mm_find_pmd(mm, address);
2764 if (!pmd) 2775 if (!pmd)
2765 return; 2776 return;
2766 /* 2777 /*
2767 * Caller holds the mmap_sem in write mode, so a huge pmd cannot 2778 * Caller holds the mmap_sem in write mode, so a huge pmd cannot
2768 * materialize from under us. 2779 * materialize from under us.
2769 */ 2780 */
2770 split_huge_page_pmd_mm(mm, address, pmd); 2781 split_huge_page_pmd_mm(mm, address, pmd);
2771 } 2782 }
2772 2783
2773 void __vma_adjust_trans_huge(struct vm_area_struct *vma, 2784 void __vma_adjust_trans_huge(struct vm_area_struct *vma,
2774 unsigned long start, 2785 unsigned long start,
2775 unsigned long end, 2786 unsigned long end,
2776 long adjust_next) 2787 long adjust_next)
2777 { 2788 {
2778 /* 2789 /*
2779 * If the new start address isn't hpage aligned and it could 2790 * If the new start address isn't hpage aligned and it could
2780 * previously contain a hugepage: check if we need to split 2791 * previously contain a hugepage: check if we need to split
2781 * a huge pmd. 2792 * a huge pmd.
2782 */ 2793 */
2783 if (start & ~HPAGE_PMD_MASK && 2794 if (start & ~HPAGE_PMD_MASK &&
2784 (start & HPAGE_PMD_MASK) >= vma->vm_start && 2795 (start & HPAGE_PMD_MASK) >= vma->vm_start &&
2785 (start & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= vma->vm_end) 2796 (start & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= vma->vm_end)
2786 split_huge_page_address(vma->vm_mm, start); 2797 split_huge_page_address(vma->vm_mm, start);
2787 2798
2788 /* 2799 /*
2789 * If the new end address isn't hpage aligned and it could 2800 * If the new end address isn't hpage aligned and it could
2790 * previously contain a hugepage: check if we need to split 2801 * previously contain a hugepage: check if we need to split
2791 * a huge pmd. 2802 * a huge pmd.
2792 */ 2803 */
2793 if (end & ~HPAGE_PMD_MASK && 2804 if (end & ~HPAGE_PMD_MASK &&
2794 (end & HPAGE_PMD_MASK) >= vma->vm_start && 2805 (end & HPAGE_PMD_MASK) >= vma->vm_start &&
2795 (end & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= vma->vm_end) 2806 (end & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= vma->vm_end)
2796 split_huge_page_address(vma->vm_mm, end); 2807 split_huge_page_address(vma->vm_mm, end);
2797 2808
2798 /* 2809 /*
2799 * If we're also updating the vma->vm_next->vm_start, if the new 2810 * If we're also updating the vma->vm_next->vm_start, if the new
2800 * vm_next->vm_start isn't page aligned and it could previously 2811 * vm_next->vm_start isn't page aligned and it could previously
2801 * contain a hugepage: check if we need to split a huge pmd. 2812 * contain a hugepage: check if we need to split a huge pmd.
2802 */ 2813 */
2803 if (adjust_next > 0) { 2814 if (adjust_next > 0) {
2804 struct vm_area_struct *next = vma->vm_next; 2815 struct vm_area_struct *next = vma->vm_next;
2805 unsigned long nstart = next->vm_start; 2816 unsigned long nstart = next->vm_start;
2806 nstart += adjust_next << PAGE_SHIFT; 2817 nstart += adjust_next << PAGE_SHIFT;
2807 if (nstart & ~HPAGE_PMD_MASK && 2818 if (nstart & ~HPAGE_PMD_MASK &&
2808 (nstart & HPAGE_PMD_MASK) >= next->vm_start && 2819 (nstart & HPAGE_PMD_MASK) >= next->vm_start &&
2809 (nstart & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= next->vm_end) 2820 (nstart & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= next->vm_end)
2810 split_huge_page_address(next->vm_mm, nstart); 2821 split_huge_page_address(next->vm_mm, nstart);
2811 } 2822 }
2812 } 2823 }
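
__vma_adjust_trans_huge() applies the same test to the new start, the new end and, when adjust_next is positive, the next vma's new start: split only when the address is not huge-page aligned and the huge page containing it still lies entirely inside the vma. A standalone sketch of that predicate, again assuming 2MB huge pages purely for illustration, with sample boundary values:

#include <stdbool.h>
#include <stdio.h>

/* 2MB huge pages assumed for the example, as in the earlier sketch. */
#define HPAGE_PMD_SIZE (2UL << 20)
#define HPAGE_PMD_MASK (~(HPAGE_PMD_SIZE - 1))

/*
 * Same test __vma_adjust_trans_huge() applies to start/end/next-start:
 * split only when the address is not hugepage aligned and the huge page
 * it falls in lies entirely inside the vma.
 */
static bool needs_pmd_split(unsigned long addr,
			    unsigned long vm_start, unsigned long vm_end)
{
	return (addr & ~HPAGE_PMD_MASK) &&
	       (addr & HPAGE_PMD_MASK) >= vm_start &&
	       (addr & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= vm_end;
}

int main(void)
{
	unsigned long vm_start = 0x00600000UL;	/* hugepage-aligned vma */
	unsigned long vm_end   = 0x00c00000UL;

	unsigned long addrs[] = { 0x00800000UL, 0x00801000UL, 0x00bff000UL };
	for (unsigned i = 0; i < sizeof(addrs) / sizeof(addrs[0]); i++)
		printf("%#lx -> %s\n", addrs[i],
		       needs_pmd_split(addrs[i], vm_start, vm_end) ?
		       "split the pmd" : "no split needed");
	return 0;
}

Here 0x00800000 is already aligned so nothing happens, while the two unaligned sample boundaries fall inside huge pages wholly contained in the vma and would lead to split_huge_page_address() being called.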
2813 2824