Commit 062f1af2170afe817133d358d900a5f33e3856e4
Committed by
Linus Torvalds
1 parent
572043c90d
Exists in
master
and in
20 other branches
mm: thp: acquire the anon_vma rwsem for write during split
Zhouping Liu reported the following against 3.8-rc1 when running a mmap testcase from LTP.

  mapcount 0 page_mapcount 3
  ------------[ cut here ]------------
  kernel BUG at mm/huge_memory.c:1798!
  invalid opcode: 0000 [#1] SMP
  Modules linked in: ip6table_filter ip6_tables ebtable_nat ebtables bnep bluetooth rfkill iptable_mangle ipt_REJECT nf_conntrack_ipv4 nf_defrag_ipv4 xt_conntrack nf_conntrack iptable_filter ip_tables be2iscsi iscsi_boot_sysfs bnx2i cnic uio cxgb4i cxgb4 cxgb3i cxgb3 mdio libcxgbi ib_iser rdma_cm ib_addr iw_cm ib_cm ib_sa ib_mad ib_core iscsi_tcp libiscsi_tcp libiscsi scsi_transport_iscsi vfat fat dm_mirror dm_region_hash dm_log dm_mod cdc_ether iTCO_wdt i7core_edac coretemp usbnet iTCO_vendor_support mii crc32c_intel edac_core lpc_ich shpchp ioatdma mfd_core i2c_i801 pcspkr serio_raw bnx2 microcode dca vhost_net tun macvtap macvlan kvm_intel kvm uinput mgag200 sr_mod cdrom i2c_algo_bit sd_mod drm_kms_helper crc_t10dif ata_generic pata_acpi ttm ata_piix drm libata i2c_core megaraid_sas
  CPU 1
  Pid: 23217, comm: mmap10 Not tainted 3.8.0-rc1mainline+ #17 IBM IBM System x3400 M3 Server -[7379I08]-/69Y4356
  RIP: __split_huge_page+0x677/0x6d0
  RSP: 0000:ffff88017a03fc08 EFLAGS: 00010293
  RAX: 0000000000000003 RBX: ffff88027a6c22e0 RCX: 00000000000034d2
  RDX: 000000000000748b RSI: 0000000000000046 RDI: 0000000000000246
  RBP: ffff88017a03fcb8 R08: ffffffff819d2440 R09: 000000000000054a
  R10: 0000000000aaaaaa R11: 00000000ffffffff R12: 0000000000000000
  R13: 00007f4f11a00000 R14: ffff880179e96e00 R15: ffffea0005c08000
  FS: 00007f4f11f4a740(0000) GS:ffff88017bc20000(0000) knlGS:0000000000000000
  CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b
  CR2: 00000037e9ebb404 CR3: 000000017a436000 CR4: 00000000000007e0
  DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
  DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
  Process mmap10 (pid: 23217, threadinfo ffff88017a03e000, task ffff880172dd32e0)
  Stack:
   ffff88017a540ec8 ffff88017a03fc20 ffffffff816017b5 ffff88017a03fc88
   ffffffff812fa014 0000000000000000 ffff880279ebd5c0 00000000f4f11a4c
   00000007f4f11f49 00000007f4f11a00 ffff88017a540ef0 ffff88017a540ee8
  Call Trace:
   split_huge_page+0x68/0xb0
   __split_huge_page_pmd+0x134/0x330
   split_huge_page_pmd_mm+0x51/0x60
   split_huge_page_address+0x3b/0x50
   __vma_adjust_trans_huge+0x9c/0xf0
   vma_adjust+0x684/0x750
   __split_vma.isra.28+0x1fa/0x220
   do_munmap+0xf9/0x420
   vm_munmap+0x4e/0x70
   sys_munmap+0x2b/0x40
   system_call_fastpath+0x16/0x1b

Alexander Beregalov and Alex Xu reported similar bugs and Hillf Danton identified that commit 5a505085f043 ("mm/rmap: Convert the struct anon_vma::mutex to an rwsem") and commit 4fc3f1d66b1e ("mm/rmap, migration: Make rmap_walk_anon() and try_to_unmap_anon() more scalable") were likely the problem. Reverting these commits was reported to solve the problem for Alexander.

Despite the reason for these commits, NUMA balancing is not the direct source of the problem. split_huge_page() expects the anon_vma lock to be exclusive to serialise the whole split operation. Ordinarily it is expected that the anon_vma lock would only be required when updating the avcs but THP also uses the anon_vma rwsem for collapse and split operations where the page lock or compound lock cannot be used (as the page is changing from base to THP or vice versa) and the page table locks are insufficient.

This patch takes the anon_vma lock for write to serialise against parallel split_huge_page as THP expected before the conversion to rwsem.

Reported-and-tested-by: Zhouping Liu <zliu@redhat.com>
Reported-by: Alexander Beregalov <a.beregalov@gmail.com>
Reported-by: Alex Xu <alex_y_xu@yahoo.ca>
Signed-off-by: Mel Gorman <mgorman@suse.de>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Showing 1 changed file with 13 additions and 2 deletions Inline Diff
mm/huge_memory.c
1 | /* | 1 | /* |
2 | * Copyright (C) 2009 Red Hat, Inc. | 2 | * Copyright (C) 2009 Red Hat, Inc. |
3 | * | 3 | * |
4 | * This work is licensed under the terms of the GNU GPL, version 2. See | 4 | * This work is licensed under the terms of the GNU GPL, version 2. See |
5 | * the COPYING file in the top-level directory. | 5 | * the COPYING file in the top-level directory. |
6 | */ | 6 | */ |
7 | 7 | ||
8 | #include <linux/mm.h> | 8 | #include <linux/mm.h> |
9 | #include <linux/sched.h> | 9 | #include <linux/sched.h> |
10 | #include <linux/highmem.h> | 10 | #include <linux/highmem.h> |
11 | #include <linux/hugetlb.h> | 11 | #include <linux/hugetlb.h> |
12 | #include <linux/mmu_notifier.h> | 12 | #include <linux/mmu_notifier.h> |
13 | #include <linux/rmap.h> | 13 | #include <linux/rmap.h> |
14 | #include <linux/swap.h> | 14 | #include <linux/swap.h> |
15 | #include <linux/shrinker.h> | 15 | #include <linux/shrinker.h> |
16 | #include <linux/mm_inline.h> | 16 | #include <linux/mm_inline.h> |
17 | #include <linux/kthread.h> | 17 | #include <linux/kthread.h> |
18 | #include <linux/khugepaged.h> | 18 | #include <linux/khugepaged.h> |
19 | #include <linux/freezer.h> | 19 | #include <linux/freezer.h> |
20 | #include <linux/mman.h> | 20 | #include <linux/mman.h> |
21 | #include <linux/pagemap.h> | 21 | #include <linux/pagemap.h> |
22 | #include <linux/migrate.h> | 22 | #include <linux/migrate.h> |
23 | 23 | ||
24 | #include <asm/tlb.h> | 24 | #include <asm/tlb.h> |
25 | #include <asm/pgalloc.h> | 25 | #include <asm/pgalloc.h> |
26 | #include "internal.h" | 26 | #include "internal.h" |
27 | 27 | ||
28 | /* | 28 | /* |
29 | * By default transparent hugepage support is enabled for all mappings | 29 | * By default transparent hugepage support is enabled for all mappings |
30 | * and khugepaged scans all mappings. Defrag is only invoked by | 30 | * and khugepaged scans all mappings. Defrag is only invoked by |
31 | * khugepaged hugepage allocations and by page faults inside | 31 | * khugepaged hugepage allocations and by page faults inside |
32 | * MADV_HUGEPAGE regions to avoid the risk of slowing down short lived | 32 | * MADV_HUGEPAGE regions to avoid the risk of slowing down short lived |
33 | * allocations. | 33 | * allocations. |
34 | */ | 34 | */ |
35 | unsigned long transparent_hugepage_flags __read_mostly = | 35 | unsigned long transparent_hugepage_flags __read_mostly = |
36 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE_ALWAYS | 36 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE_ALWAYS |
37 | (1<<TRANSPARENT_HUGEPAGE_FLAG)| | 37 | (1<<TRANSPARENT_HUGEPAGE_FLAG)| |
38 | #endif | 38 | #endif |
39 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE_MADVISE | 39 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE_MADVISE |
40 | (1<<TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG)| | 40 | (1<<TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG)| |
41 | #endif | 41 | #endif |
42 | (1<<TRANSPARENT_HUGEPAGE_DEFRAG_FLAG)| | 42 | (1<<TRANSPARENT_HUGEPAGE_DEFRAG_FLAG)| |
43 | (1<<TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG)| | 43 | (1<<TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG)| |
44 | (1<<TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG); | 44 | (1<<TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG); |
45 | 45 | ||
46 | /* default scan 8*512 pte (or vmas) every 30 second */ | 46 | /* default scan 8*512 pte (or vmas) every 30 second */ |
47 | static unsigned int khugepaged_pages_to_scan __read_mostly = HPAGE_PMD_NR*8; | 47 | static unsigned int khugepaged_pages_to_scan __read_mostly = HPAGE_PMD_NR*8; |
48 | static unsigned int khugepaged_pages_collapsed; | 48 | static unsigned int khugepaged_pages_collapsed; |
49 | static unsigned int khugepaged_full_scans; | 49 | static unsigned int khugepaged_full_scans; |
50 | static unsigned int khugepaged_scan_sleep_millisecs __read_mostly = 10000; | 50 | static unsigned int khugepaged_scan_sleep_millisecs __read_mostly = 10000; |
51 | /* during fragmentation poll the hugepage allocator once every minute */ | 51 | /* during fragmentation poll the hugepage allocator once every minute */ |
52 | static unsigned int khugepaged_alloc_sleep_millisecs __read_mostly = 60000; | 52 | static unsigned int khugepaged_alloc_sleep_millisecs __read_mostly = 60000; |
53 | static struct task_struct *khugepaged_thread __read_mostly; | 53 | static struct task_struct *khugepaged_thread __read_mostly; |
54 | static DEFINE_MUTEX(khugepaged_mutex); | 54 | static DEFINE_MUTEX(khugepaged_mutex); |
55 | static DEFINE_SPINLOCK(khugepaged_mm_lock); | 55 | static DEFINE_SPINLOCK(khugepaged_mm_lock); |
56 | static DECLARE_WAIT_QUEUE_HEAD(khugepaged_wait); | 56 | static DECLARE_WAIT_QUEUE_HEAD(khugepaged_wait); |
57 | /* | 57 | /* |
58 | * default collapse hugepages if there is at least one pte mapped like | 58 | * default collapse hugepages if there is at least one pte mapped like |
59 | * it would have happened if the vma was large enough during page | 59 | * it would have happened if the vma was large enough during page |
60 | * fault. | 60 | * fault. |
61 | */ | 61 | */ |
62 | static unsigned int khugepaged_max_ptes_none __read_mostly = HPAGE_PMD_NR-1; | 62 | static unsigned int khugepaged_max_ptes_none __read_mostly = HPAGE_PMD_NR-1; |
63 | 63 | ||
64 | static int khugepaged(void *none); | 64 | static int khugepaged(void *none); |
65 | static int mm_slots_hash_init(void); | 65 | static int mm_slots_hash_init(void); |
66 | static int khugepaged_slab_init(void); | 66 | static int khugepaged_slab_init(void); |
67 | static void khugepaged_slab_free(void); | 67 | static void khugepaged_slab_free(void); |
68 | 68 | ||
69 | #define MM_SLOTS_HASH_HEADS 1024 | 69 | #define MM_SLOTS_HASH_HEADS 1024 |
70 | static struct hlist_head *mm_slots_hash __read_mostly; | 70 | static struct hlist_head *mm_slots_hash __read_mostly; |
71 | static struct kmem_cache *mm_slot_cache __read_mostly; | 71 | static struct kmem_cache *mm_slot_cache __read_mostly; |
72 | 72 | ||
73 | /** | 73 | /** |
74 | * struct mm_slot - hash lookup from mm to mm_slot | 74 | * struct mm_slot - hash lookup from mm to mm_slot |
75 | * @hash: hash collision list | 75 | * @hash: hash collision list |
76 | * @mm_node: khugepaged scan list headed in khugepaged_scan.mm_head | 76 | * @mm_node: khugepaged scan list headed in khugepaged_scan.mm_head |
77 | * @mm: the mm that this information is valid for | 77 | * @mm: the mm that this information is valid for |
78 | */ | 78 | */ |
79 | struct mm_slot { | 79 | struct mm_slot { |
80 | struct hlist_node hash; | 80 | struct hlist_node hash; |
81 | struct list_head mm_node; | 81 | struct list_head mm_node; |
82 | struct mm_struct *mm; | 82 | struct mm_struct *mm; |
83 | }; | 83 | }; |
84 | 84 | ||
85 | /** | 85 | /** |
86 | * struct khugepaged_scan - cursor for scanning | 86 | * struct khugepaged_scan - cursor for scanning |
87 | * @mm_head: the head of the mm list to scan | 87 | * @mm_head: the head of the mm list to scan |
88 | * @mm_slot: the current mm_slot we are scanning | 88 | * @mm_slot: the current mm_slot we are scanning |
89 | * @address: the next address inside that to be scanned | 89 | * @address: the next address inside that to be scanned |
90 | * | 90 | * |
91 | * There is only the one khugepaged_scan instance of this cursor structure. | 91 | * There is only the one khugepaged_scan instance of this cursor structure. |
92 | */ | 92 | */ |
93 | struct khugepaged_scan { | 93 | struct khugepaged_scan { |
94 | struct list_head mm_head; | 94 | struct list_head mm_head; |
95 | struct mm_slot *mm_slot; | 95 | struct mm_slot *mm_slot; |
96 | unsigned long address; | 96 | unsigned long address; |
97 | }; | 97 | }; |
98 | static struct khugepaged_scan khugepaged_scan = { | 98 | static struct khugepaged_scan khugepaged_scan = { |
99 | .mm_head = LIST_HEAD_INIT(khugepaged_scan.mm_head), | 99 | .mm_head = LIST_HEAD_INIT(khugepaged_scan.mm_head), |
100 | }; | 100 | }; |
101 | 101 | ||
102 | 102 | ||
103 | static int set_recommended_min_free_kbytes(void) | 103 | static int set_recommended_min_free_kbytes(void) |
104 | { | 104 | { |
105 | struct zone *zone; | 105 | struct zone *zone; |
106 | int nr_zones = 0; | 106 | int nr_zones = 0; |
107 | unsigned long recommended_min; | 107 | unsigned long recommended_min; |
108 | extern int min_free_kbytes; | 108 | extern int min_free_kbytes; |
109 | 109 | ||
110 | if (!khugepaged_enabled()) | 110 | if (!khugepaged_enabled()) |
111 | return 0; | 111 | return 0; |
112 | 112 | ||
113 | for_each_populated_zone(zone) | 113 | for_each_populated_zone(zone) |
114 | nr_zones++; | 114 | nr_zones++; |
115 | 115 | ||
116 | /* Make sure at least 2 hugepages are free for MIGRATE_RESERVE */ | 116 | /* Make sure at least 2 hugepages are free for MIGRATE_RESERVE */ |
117 | recommended_min = pageblock_nr_pages * nr_zones * 2; | 117 | recommended_min = pageblock_nr_pages * nr_zones * 2; |
118 | 118 | ||
119 | /* | 119 | /* |
120 | * Make sure that on average at least two pageblocks are almost free | 120 | * Make sure that on average at least two pageblocks are almost free |
121 | * of another type, one for a migratetype to fall back to and a | 121 | * of another type, one for a migratetype to fall back to and a |
122 | * second to avoid subsequent fallbacks of other types There are 3 | 122 | * second to avoid subsequent fallbacks of other types There are 3 |
123 | * MIGRATE_TYPES we care about. | 123 | * MIGRATE_TYPES we care about. |
124 | */ | 124 | */ |
125 | recommended_min += pageblock_nr_pages * nr_zones * | 125 | recommended_min += pageblock_nr_pages * nr_zones * |
126 | MIGRATE_PCPTYPES * MIGRATE_PCPTYPES; | 126 | MIGRATE_PCPTYPES * MIGRATE_PCPTYPES; |
127 | 127 | ||
128 | /* don't ever allow to reserve more than 5% of the lowmem */ | 128 | /* don't ever allow to reserve more than 5% of the lowmem */ |
129 | recommended_min = min(recommended_min, | 129 | recommended_min = min(recommended_min, |
130 | (unsigned long) nr_free_buffer_pages() / 20); | 130 | (unsigned long) nr_free_buffer_pages() / 20); |
131 | recommended_min <<= (PAGE_SHIFT-10); | 131 | recommended_min <<= (PAGE_SHIFT-10); |
132 | 132 | ||
133 | if (recommended_min > min_free_kbytes) | 133 | if (recommended_min > min_free_kbytes) |
134 | min_free_kbytes = recommended_min; | 134 | min_free_kbytes = recommended_min; |
135 | setup_per_zone_wmarks(); | 135 | setup_per_zone_wmarks(); |
136 | return 0; | 136 | return 0; |
137 | } | 137 | } |
138 | late_initcall(set_recommended_min_free_kbytes); | 138 | late_initcall(set_recommended_min_free_kbytes); |
139 | 139 | ||
140 | static int start_khugepaged(void) | 140 | static int start_khugepaged(void) |
141 | { | 141 | { |
142 | int err = 0; | 142 | int err = 0; |
143 | if (khugepaged_enabled()) { | 143 | if (khugepaged_enabled()) { |
144 | if (!khugepaged_thread) | 144 | if (!khugepaged_thread) |
145 | khugepaged_thread = kthread_run(khugepaged, NULL, | 145 | khugepaged_thread = kthread_run(khugepaged, NULL, |
146 | "khugepaged"); | 146 | "khugepaged"); |
147 | if (unlikely(IS_ERR(khugepaged_thread))) { | 147 | if (unlikely(IS_ERR(khugepaged_thread))) { |
148 | printk(KERN_ERR | 148 | printk(KERN_ERR |
149 | "khugepaged: kthread_run(khugepaged) failed\n"); | 149 | "khugepaged: kthread_run(khugepaged) failed\n"); |
150 | err = PTR_ERR(khugepaged_thread); | 150 | err = PTR_ERR(khugepaged_thread); |
151 | khugepaged_thread = NULL; | 151 | khugepaged_thread = NULL; |
152 | } | 152 | } |
153 | 153 | ||
154 | if (!list_empty(&khugepaged_scan.mm_head)) | 154 | if (!list_empty(&khugepaged_scan.mm_head)) |
155 | wake_up_interruptible(&khugepaged_wait); | 155 | wake_up_interruptible(&khugepaged_wait); |
156 | 156 | ||
157 | set_recommended_min_free_kbytes(); | 157 | set_recommended_min_free_kbytes(); |
158 | } else if (khugepaged_thread) { | 158 | } else if (khugepaged_thread) { |
159 | kthread_stop(khugepaged_thread); | 159 | kthread_stop(khugepaged_thread); |
160 | khugepaged_thread = NULL; | 160 | khugepaged_thread = NULL; |
161 | } | 161 | } |
162 | 162 | ||
163 | return err; | 163 | return err; |
164 | } | 164 | } |
165 | 165 | ||
166 | static atomic_t huge_zero_refcount; | 166 | static atomic_t huge_zero_refcount; |
167 | static unsigned long huge_zero_pfn __read_mostly; | 167 | static unsigned long huge_zero_pfn __read_mostly; |
168 | 168 | ||
169 | static inline bool is_huge_zero_pfn(unsigned long pfn) | 169 | static inline bool is_huge_zero_pfn(unsigned long pfn) |
170 | { | 170 | { |
171 | unsigned long zero_pfn = ACCESS_ONCE(huge_zero_pfn); | 171 | unsigned long zero_pfn = ACCESS_ONCE(huge_zero_pfn); |
172 | return zero_pfn && pfn == zero_pfn; | 172 | return zero_pfn && pfn == zero_pfn; |
173 | } | 173 | } |
174 | 174 | ||
175 | static inline bool is_huge_zero_pmd(pmd_t pmd) | 175 | static inline bool is_huge_zero_pmd(pmd_t pmd) |
176 | { | 176 | { |
177 | return is_huge_zero_pfn(pmd_pfn(pmd)); | 177 | return is_huge_zero_pfn(pmd_pfn(pmd)); |
178 | } | 178 | } |
179 | 179 | ||
180 | static unsigned long get_huge_zero_page(void) | 180 | static unsigned long get_huge_zero_page(void) |
181 | { | 181 | { |
182 | struct page *zero_page; | 182 | struct page *zero_page; |
183 | retry: | 183 | retry: |
184 | if (likely(atomic_inc_not_zero(&huge_zero_refcount))) | 184 | if (likely(atomic_inc_not_zero(&huge_zero_refcount))) |
185 | return ACCESS_ONCE(huge_zero_pfn); | 185 | return ACCESS_ONCE(huge_zero_pfn); |
186 | 186 | ||
187 | zero_page = alloc_pages((GFP_TRANSHUGE | __GFP_ZERO) & ~__GFP_MOVABLE, | 187 | zero_page = alloc_pages((GFP_TRANSHUGE | __GFP_ZERO) & ~__GFP_MOVABLE, |
188 | HPAGE_PMD_ORDER); | 188 | HPAGE_PMD_ORDER); |
189 | if (!zero_page) { | 189 | if (!zero_page) { |
190 | count_vm_event(THP_ZERO_PAGE_ALLOC_FAILED); | 190 | count_vm_event(THP_ZERO_PAGE_ALLOC_FAILED); |
191 | return 0; | 191 | return 0; |
192 | } | 192 | } |
193 | count_vm_event(THP_ZERO_PAGE_ALLOC); | 193 | count_vm_event(THP_ZERO_PAGE_ALLOC); |
194 | preempt_disable(); | 194 | preempt_disable(); |
195 | if (cmpxchg(&huge_zero_pfn, 0, page_to_pfn(zero_page))) { | 195 | if (cmpxchg(&huge_zero_pfn, 0, page_to_pfn(zero_page))) { |
196 | preempt_enable(); | 196 | preempt_enable(); |
197 | __free_page(zero_page); | 197 | __free_page(zero_page); |
198 | goto retry; | 198 | goto retry; |
199 | } | 199 | } |
200 | 200 | ||
201 | /* We take additional reference here. It will be put back by shrinker */ | 201 | /* We take additional reference here. It will be put back by shrinker */ |
202 | atomic_set(&huge_zero_refcount, 2); | 202 | atomic_set(&huge_zero_refcount, 2); |
203 | preempt_enable(); | 203 | preempt_enable(); |
204 | return ACCESS_ONCE(huge_zero_pfn); | 204 | return ACCESS_ONCE(huge_zero_pfn); |
205 | } | 205 | } |
206 | 206 | ||
207 | static void put_huge_zero_page(void) | 207 | static void put_huge_zero_page(void) |
208 | { | 208 | { |
209 | /* | 209 | /* |
210 | * Counter should never go to zero here. Only shrinker can put | 210 | * Counter should never go to zero here. Only shrinker can put |
211 | * last reference. | 211 | * last reference. |
212 | */ | 212 | */ |
213 | BUG_ON(atomic_dec_and_test(&huge_zero_refcount)); | 213 | BUG_ON(atomic_dec_and_test(&huge_zero_refcount)); |
214 | } | 214 | } |
215 | 215 | ||
216 | static int shrink_huge_zero_page(struct shrinker *shrink, | 216 | static int shrink_huge_zero_page(struct shrinker *shrink, |
217 | struct shrink_control *sc) | 217 | struct shrink_control *sc) |
218 | { | 218 | { |
219 | if (!sc->nr_to_scan) | 219 | if (!sc->nr_to_scan) |
220 | /* we can free zero page only if last reference remains */ | 220 | /* we can free zero page only if last reference remains */ |
221 | return atomic_read(&huge_zero_refcount) == 1 ? HPAGE_PMD_NR : 0; | 221 | return atomic_read(&huge_zero_refcount) == 1 ? HPAGE_PMD_NR : 0; |
222 | 222 | ||
223 | if (atomic_cmpxchg(&huge_zero_refcount, 1, 0) == 1) { | 223 | if (atomic_cmpxchg(&huge_zero_refcount, 1, 0) == 1) { |
224 | unsigned long zero_pfn = xchg(&huge_zero_pfn, 0); | 224 | unsigned long zero_pfn = xchg(&huge_zero_pfn, 0); |
225 | BUG_ON(zero_pfn == 0); | 225 | BUG_ON(zero_pfn == 0); |
226 | __free_page(__pfn_to_page(zero_pfn)); | 226 | __free_page(__pfn_to_page(zero_pfn)); |
227 | } | 227 | } |
228 | 228 | ||
229 | return 0; | 229 | return 0; |
230 | } | 230 | } |
231 | 231 | ||
232 | static struct shrinker huge_zero_page_shrinker = { | 232 | static struct shrinker huge_zero_page_shrinker = { |
233 | .shrink = shrink_huge_zero_page, | 233 | .shrink = shrink_huge_zero_page, |
234 | .seeks = DEFAULT_SEEKS, | 234 | .seeks = DEFAULT_SEEKS, |
235 | }; | 235 | }; |
236 | 236 | ||
237 | #ifdef CONFIG_SYSFS | 237 | #ifdef CONFIG_SYSFS |
238 | 238 | ||
239 | static ssize_t double_flag_show(struct kobject *kobj, | 239 | static ssize_t double_flag_show(struct kobject *kobj, |
240 | struct kobj_attribute *attr, char *buf, | 240 | struct kobj_attribute *attr, char *buf, |
241 | enum transparent_hugepage_flag enabled, | 241 | enum transparent_hugepage_flag enabled, |
242 | enum transparent_hugepage_flag req_madv) | 242 | enum transparent_hugepage_flag req_madv) |
243 | { | 243 | { |
244 | if (test_bit(enabled, &transparent_hugepage_flags)) { | 244 | if (test_bit(enabled, &transparent_hugepage_flags)) { |
245 | VM_BUG_ON(test_bit(req_madv, &transparent_hugepage_flags)); | 245 | VM_BUG_ON(test_bit(req_madv, &transparent_hugepage_flags)); |
246 | return sprintf(buf, "[always] madvise never\n"); | 246 | return sprintf(buf, "[always] madvise never\n"); |
247 | } else if (test_bit(req_madv, &transparent_hugepage_flags)) | 247 | } else if (test_bit(req_madv, &transparent_hugepage_flags)) |
248 | return sprintf(buf, "always [madvise] never\n"); | 248 | return sprintf(buf, "always [madvise] never\n"); |
249 | else | 249 | else |
250 | return sprintf(buf, "always madvise [never]\n"); | 250 | return sprintf(buf, "always madvise [never]\n"); |
251 | } | 251 | } |
252 | static ssize_t double_flag_store(struct kobject *kobj, | 252 | static ssize_t double_flag_store(struct kobject *kobj, |
253 | struct kobj_attribute *attr, | 253 | struct kobj_attribute *attr, |
254 | const char *buf, size_t count, | 254 | const char *buf, size_t count, |
255 | enum transparent_hugepage_flag enabled, | 255 | enum transparent_hugepage_flag enabled, |
256 | enum transparent_hugepage_flag req_madv) | 256 | enum transparent_hugepage_flag req_madv) |
257 | { | 257 | { |
258 | if (!memcmp("always", buf, | 258 | if (!memcmp("always", buf, |
259 | min(sizeof("always")-1, count))) { | 259 | min(sizeof("always")-1, count))) { |
260 | set_bit(enabled, &transparent_hugepage_flags); | 260 | set_bit(enabled, &transparent_hugepage_flags); |
261 | clear_bit(req_madv, &transparent_hugepage_flags); | 261 | clear_bit(req_madv, &transparent_hugepage_flags); |
262 | } else if (!memcmp("madvise", buf, | 262 | } else if (!memcmp("madvise", buf, |
263 | min(sizeof("madvise")-1, count))) { | 263 | min(sizeof("madvise")-1, count))) { |
264 | clear_bit(enabled, &transparent_hugepage_flags); | 264 | clear_bit(enabled, &transparent_hugepage_flags); |
265 | set_bit(req_madv, &transparent_hugepage_flags); | 265 | set_bit(req_madv, &transparent_hugepage_flags); |
266 | } else if (!memcmp("never", buf, | 266 | } else if (!memcmp("never", buf, |
267 | min(sizeof("never")-1, count))) { | 267 | min(sizeof("never")-1, count))) { |
268 | clear_bit(enabled, &transparent_hugepage_flags); | 268 | clear_bit(enabled, &transparent_hugepage_flags); |
269 | clear_bit(req_madv, &transparent_hugepage_flags); | 269 | clear_bit(req_madv, &transparent_hugepage_flags); |
270 | } else | 270 | } else |
271 | return -EINVAL; | 271 | return -EINVAL; |
272 | 272 | ||
273 | return count; | 273 | return count; |
274 | } | 274 | } |
275 | 275 | ||
276 | static ssize_t enabled_show(struct kobject *kobj, | 276 | static ssize_t enabled_show(struct kobject *kobj, |
277 | struct kobj_attribute *attr, char *buf) | 277 | struct kobj_attribute *attr, char *buf) |
278 | { | 278 | { |
279 | return double_flag_show(kobj, attr, buf, | 279 | return double_flag_show(kobj, attr, buf, |
280 | TRANSPARENT_HUGEPAGE_FLAG, | 280 | TRANSPARENT_HUGEPAGE_FLAG, |
281 | TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG); | 281 | TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG); |
282 | } | 282 | } |
283 | static ssize_t enabled_store(struct kobject *kobj, | 283 | static ssize_t enabled_store(struct kobject *kobj, |
284 | struct kobj_attribute *attr, | 284 | struct kobj_attribute *attr, |
285 | const char *buf, size_t count) | 285 | const char *buf, size_t count) |
286 | { | 286 | { |
287 | ssize_t ret; | 287 | ssize_t ret; |
288 | 288 | ||
289 | ret = double_flag_store(kobj, attr, buf, count, | 289 | ret = double_flag_store(kobj, attr, buf, count, |
290 | TRANSPARENT_HUGEPAGE_FLAG, | 290 | TRANSPARENT_HUGEPAGE_FLAG, |
291 | TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG); | 291 | TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG); |
292 | 292 | ||
293 | if (ret > 0) { | 293 | if (ret > 0) { |
294 | int err; | 294 | int err; |
295 | 295 | ||
296 | mutex_lock(&khugepaged_mutex); | 296 | mutex_lock(&khugepaged_mutex); |
297 | err = start_khugepaged(); | 297 | err = start_khugepaged(); |
298 | mutex_unlock(&khugepaged_mutex); | 298 | mutex_unlock(&khugepaged_mutex); |
299 | 299 | ||
300 | if (err) | 300 | if (err) |
301 | ret = err; | 301 | ret = err; |
302 | } | 302 | } |
303 | 303 | ||
304 | return ret; | 304 | return ret; |
305 | } | 305 | } |
306 | static struct kobj_attribute enabled_attr = | 306 | static struct kobj_attribute enabled_attr = |
307 | __ATTR(enabled, 0644, enabled_show, enabled_store); | 307 | __ATTR(enabled, 0644, enabled_show, enabled_store); |
308 | 308 | ||
309 | static ssize_t single_flag_show(struct kobject *kobj, | 309 | static ssize_t single_flag_show(struct kobject *kobj, |
310 | struct kobj_attribute *attr, char *buf, | 310 | struct kobj_attribute *attr, char *buf, |
311 | enum transparent_hugepage_flag flag) | 311 | enum transparent_hugepage_flag flag) |
312 | { | 312 | { |
313 | return sprintf(buf, "%d\n", | 313 | return sprintf(buf, "%d\n", |
314 | !!test_bit(flag, &transparent_hugepage_flags)); | 314 | !!test_bit(flag, &transparent_hugepage_flags)); |
315 | } | 315 | } |
316 | 316 | ||
317 | static ssize_t single_flag_store(struct kobject *kobj, | 317 | static ssize_t single_flag_store(struct kobject *kobj, |
318 | struct kobj_attribute *attr, | 318 | struct kobj_attribute *attr, |
319 | const char *buf, size_t count, | 319 | const char *buf, size_t count, |
320 | enum transparent_hugepage_flag flag) | 320 | enum transparent_hugepage_flag flag) |
321 | { | 321 | { |
322 | unsigned long value; | 322 | unsigned long value; |
323 | int ret; | 323 | int ret; |
324 | 324 | ||
325 | ret = kstrtoul(buf, 10, &value); | 325 | ret = kstrtoul(buf, 10, &value); |
326 | if (ret < 0) | 326 | if (ret < 0) |
327 | return ret; | 327 | return ret; |
328 | if (value > 1) | 328 | if (value > 1) |
329 | return -EINVAL; | 329 | return -EINVAL; |
330 | 330 | ||
331 | if (value) | 331 | if (value) |
332 | set_bit(flag, &transparent_hugepage_flags); | 332 | set_bit(flag, &transparent_hugepage_flags); |
333 | else | 333 | else |
334 | clear_bit(flag, &transparent_hugepage_flags); | 334 | clear_bit(flag, &transparent_hugepage_flags); |
335 | 335 | ||
336 | return count; | 336 | return count; |
337 | } | 337 | } |
338 | 338 | ||
339 | /* | 339 | /* |
340 | * Currently defrag only disables __GFP_NOWAIT for allocation. A blind | 340 | * Currently defrag only disables __GFP_NOWAIT for allocation. A blind |
341 | * __GFP_REPEAT is too aggressive, it's never worth swapping tons of | 341 | * __GFP_REPEAT is too aggressive, it's never worth swapping tons of |
342 | * memory just to allocate one more hugepage. | 342 | * memory just to allocate one more hugepage. |
343 | */ | 343 | */ |
344 | static ssize_t defrag_show(struct kobject *kobj, | 344 | static ssize_t defrag_show(struct kobject *kobj, |
345 | struct kobj_attribute *attr, char *buf) | 345 | struct kobj_attribute *attr, char *buf) |
346 | { | 346 | { |
347 | return double_flag_show(kobj, attr, buf, | 347 | return double_flag_show(kobj, attr, buf, |
348 | TRANSPARENT_HUGEPAGE_DEFRAG_FLAG, | 348 | TRANSPARENT_HUGEPAGE_DEFRAG_FLAG, |
349 | TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG); | 349 | TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG); |
350 | } | 350 | } |
351 | static ssize_t defrag_store(struct kobject *kobj, | 351 | static ssize_t defrag_store(struct kobject *kobj, |
352 | struct kobj_attribute *attr, | 352 | struct kobj_attribute *attr, |
353 | const char *buf, size_t count) | 353 | const char *buf, size_t count) |
354 | { | 354 | { |
355 | return double_flag_store(kobj, attr, buf, count, | 355 | return double_flag_store(kobj, attr, buf, count, |
356 | TRANSPARENT_HUGEPAGE_DEFRAG_FLAG, | 356 | TRANSPARENT_HUGEPAGE_DEFRAG_FLAG, |
357 | TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG); | 357 | TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG); |
358 | } | 358 | } |
359 | static struct kobj_attribute defrag_attr = | 359 | static struct kobj_attribute defrag_attr = |
360 | __ATTR(defrag, 0644, defrag_show, defrag_store); | 360 | __ATTR(defrag, 0644, defrag_show, defrag_store); |
361 | 361 | ||
362 | static ssize_t use_zero_page_show(struct kobject *kobj, | 362 | static ssize_t use_zero_page_show(struct kobject *kobj, |
363 | struct kobj_attribute *attr, char *buf) | 363 | struct kobj_attribute *attr, char *buf) |
364 | { | 364 | { |
365 | return single_flag_show(kobj, attr, buf, | 365 | return single_flag_show(kobj, attr, buf, |
366 | TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG); | 366 | TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG); |
367 | } | 367 | } |
368 | static ssize_t use_zero_page_store(struct kobject *kobj, | 368 | static ssize_t use_zero_page_store(struct kobject *kobj, |
369 | struct kobj_attribute *attr, const char *buf, size_t count) | 369 | struct kobj_attribute *attr, const char *buf, size_t count) |
370 | { | 370 | { |
371 | return single_flag_store(kobj, attr, buf, count, | 371 | return single_flag_store(kobj, attr, buf, count, |
372 | TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG); | 372 | TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG); |
373 | } | 373 | } |
374 | static struct kobj_attribute use_zero_page_attr = | 374 | static struct kobj_attribute use_zero_page_attr = |
375 | __ATTR(use_zero_page, 0644, use_zero_page_show, use_zero_page_store); | 375 | __ATTR(use_zero_page, 0644, use_zero_page_show, use_zero_page_store); |
#ifdef CONFIG_DEBUG_VM
/* sysfs show for "debug_cow" (CONFIG_DEBUG_VM only): prints 0 or 1. */
static ssize_t debug_cow_show(struct kobject *kobj,
			      struct kobj_attribute *attr, char *buf)
{
	ssize_t len;

	len = single_flag_show(kobj, attr, buf,
			       TRANSPARENT_HUGEPAGE_DEBUG_COW_FLAG);
	return len;
}

/* sysfs store for "debug_cow": accepts 0 or 1 and updates the flag. */
static ssize_t debug_cow_store(struct kobject *kobj,
			       struct kobj_attribute *attr,
			       const char *buf, size_t count)
{
	ssize_t consumed;

	consumed = single_flag_store(kobj, attr, buf, count,
				     TRANSPARENT_HUGEPAGE_DEBUG_COW_FLAG);
	return consumed;
}
static struct kobj_attribute debug_cow_attr =
	__ATTR(debug_cow, 0644, debug_cow_show, debug_cow_store);
#endif /* CONFIG_DEBUG_VM */
393 | 393 | ||
394 | static struct attribute *hugepage_attr[] = { | 394 | static struct attribute *hugepage_attr[] = { |
395 | &enabled_attr.attr, | 395 | &enabled_attr.attr, |
396 | &defrag_attr.attr, | 396 | &defrag_attr.attr, |
397 | &use_zero_page_attr.attr, | 397 | &use_zero_page_attr.attr, |
398 | #ifdef CONFIG_DEBUG_VM | 398 | #ifdef CONFIG_DEBUG_VM |
399 | &debug_cow_attr.attr, | 399 | &debug_cow_attr.attr, |
400 | #endif | 400 | #endif |
401 | NULL, | 401 | NULL, |
402 | }; | 402 | }; |
403 | 403 | ||
404 | static struct attribute_group hugepage_attr_group = { | 404 | static struct attribute_group hugepage_attr_group = { |
405 | .attrs = hugepage_attr, | 405 | .attrs = hugepage_attr, |
406 | }; | 406 | }; |
407 | 407 | ||
408 | static ssize_t scan_sleep_millisecs_show(struct kobject *kobj, | 408 | static ssize_t scan_sleep_millisecs_show(struct kobject *kobj, |
409 | struct kobj_attribute *attr, | 409 | struct kobj_attribute *attr, |
410 | char *buf) | 410 | char *buf) |
411 | { | 411 | { |
412 | return sprintf(buf, "%u\n", khugepaged_scan_sleep_millisecs); | 412 | return sprintf(buf, "%u\n", khugepaged_scan_sleep_millisecs); |
413 | } | 413 | } |
414 | 414 | ||
415 | static ssize_t scan_sleep_millisecs_store(struct kobject *kobj, | 415 | static ssize_t scan_sleep_millisecs_store(struct kobject *kobj, |
416 | struct kobj_attribute *attr, | 416 | struct kobj_attribute *attr, |
417 | const char *buf, size_t count) | 417 | const char *buf, size_t count) |
418 | { | 418 | { |
419 | unsigned long msecs; | 419 | unsigned long msecs; |
420 | int err; | 420 | int err; |
421 | 421 | ||
422 | err = strict_strtoul(buf, 10, &msecs); | 422 | err = strict_strtoul(buf, 10, &msecs); |
423 | if (err || msecs > UINT_MAX) | 423 | if (err || msecs > UINT_MAX) |
424 | return -EINVAL; | 424 | return -EINVAL; |
425 | 425 | ||
426 | khugepaged_scan_sleep_millisecs = msecs; | 426 | khugepaged_scan_sleep_millisecs = msecs; |
427 | wake_up_interruptible(&khugepaged_wait); | 427 | wake_up_interruptible(&khugepaged_wait); |
428 | 428 | ||
429 | return count; | 429 | return count; |
430 | } | 430 | } |
431 | static struct kobj_attribute scan_sleep_millisecs_attr = | 431 | static struct kobj_attribute scan_sleep_millisecs_attr = |
432 | __ATTR(scan_sleep_millisecs, 0644, scan_sleep_millisecs_show, | 432 | __ATTR(scan_sleep_millisecs, 0644, scan_sleep_millisecs_show, |
433 | scan_sleep_millisecs_store); | 433 | scan_sleep_millisecs_store); |
434 | 434 | ||
435 | static ssize_t alloc_sleep_millisecs_show(struct kobject *kobj, | 435 | static ssize_t alloc_sleep_millisecs_show(struct kobject *kobj, |
436 | struct kobj_attribute *attr, | 436 | struct kobj_attribute *attr, |
437 | char *buf) | 437 | char *buf) |
438 | { | 438 | { |
439 | return sprintf(buf, "%u\n", khugepaged_alloc_sleep_millisecs); | 439 | return sprintf(buf, "%u\n", khugepaged_alloc_sleep_millisecs); |
440 | } | 440 | } |
441 | 441 | ||
442 | static ssize_t alloc_sleep_millisecs_store(struct kobject *kobj, | 442 | static ssize_t alloc_sleep_millisecs_store(struct kobject *kobj, |
443 | struct kobj_attribute *attr, | 443 | struct kobj_attribute *attr, |
444 | const char *buf, size_t count) | 444 | const char *buf, size_t count) |
445 | { | 445 | { |
446 | unsigned long msecs; | 446 | unsigned long msecs; |
447 | int err; | 447 | int err; |
448 | 448 | ||
449 | err = strict_strtoul(buf, 10, &msecs); | 449 | err = strict_strtoul(buf, 10, &msecs); |
450 | if (err || msecs > UINT_MAX) | 450 | if (err || msecs > UINT_MAX) |
451 | return -EINVAL; | 451 | return -EINVAL; |
452 | 452 | ||
453 | khugepaged_alloc_sleep_millisecs = msecs; | 453 | khugepaged_alloc_sleep_millisecs = msecs; |
454 | wake_up_interruptible(&khugepaged_wait); | 454 | wake_up_interruptible(&khugepaged_wait); |
455 | 455 | ||
456 | return count; | 456 | return count; |
457 | } | 457 | } |
458 | static struct kobj_attribute alloc_sleep_millisecs_attr = | 458 | static struct kobj_attribute alloc_sleep_millisecs_attr = |
459 | __ATTR(alloc_sleep_millisecs, 0644, alloc_sleep_millisecs_show, | 459 | __ATTR(alloc_sleep_millisecs, 0644, alloc_sleep_millisecs_show, |
460 | alloc_sleep_millisecs_store); | 460 | alloc_sleep_millisecs_store); |
461 | 461 | ||
462 | static ssize_t pages_to_scan_show(struct kobject *kobj, | 462 | static ssize_t pages_to_scan_show(struct kobject *kobj, |
463 | struct kobj_attribute *attr, | 463 | struct kobj_attribute *attr, |
464 | char *buf) | 464 | char *buf) |
465 | { | 465 | { |
466 | return sprintf(buf, "%u\n", khugepaged_pages_to_scan); | 466 | return sprintf(buf, "%u\n", khugepaged_pages_to_scan); |
467 | } | 467 | } |
468 | static ssize_t pages_to_scan_store(struct kobject *kobj, | 468 | static ssize_t pages_to_scan_store(struct kobject *kobj, |
469 | struct kobj_attribute *attr, | 469 | struct kobj_attribute *attr, |
470 | const char *buf, size_t count) | 470 | const char *buf, size_t count) |
471 | { | 471 | { |
472 | int err; | 472 | int err; |
473 | unsigned long pages; | 473 | unsigned long pages; |
474 | 474 | ||
475 | err = strict_strtoul(buf, 10, &pages); | 475 | err = strict_strtoul(buf, 10, &pages); |
476 | if (err || !pages || pages > UINT_MAX) | 476 | if (err || !pages || pages > UINT_MAX) |
477 | return -EINVAL; | 477 | return -EINVAL; |
478 | 478 | ||
479 | khugepaged_pages_to_scan = pages; | 479 | khugepaged_pages_to_scan = pages; |
480 | 480 | ||
481 | return count; | 481 | return count; |
482 | } | 482 | } |
483 | static struct kobj_attribute pages_to_scan_attr = | 483 | static struct kobj_attribute pages_to_scan_attr = |
484 | __ATTR(pages_to_scan, 0644, pages_to_scan_show, | 484 | __ATTR(pages_to_scan, 0644, pages_to_scan_show, |
485 | pages_to_scan_store); | 485 | pages_to_scan_store); |
486 | 486 | ||
487 | static ssize_t pages_collapsed_show(struct kobject *kobj, | 487 | static ssize_t pages_collapsed_show(struct kobject *kobj, |
488 | struct kobj_attribute *attr, | 488 | struct kobj_attribute *attr, |
489 | char *buf) | 489 | char *buf) |
490 | { | 490 | { |
491 | return sprintf(buf, "%u\n", khugepaged_pages_collapsed); | 491 | return sprintf(buf, "%u\n", khugepaged_pages_collapsed); |
492 | } | 492 | } |
493 | static struct kobj_attribute pages_collapsed_attr = | 493 | static struct kobj_attribute pages_collapsed_attr = |
494 | __ATTR_RO(pages_collapsed); | 494 | __ATTR_RO(pages_collapsed); |
495 | 495 | ||
496 | static ssize_t full_scans_show(struct kobject *kobj, | 496 | static ssize_t full_scans_show(struct kobject *kobj, |
497 | struct kobj_attribute *attr, | 497 | struct kobj_attribute *attr, |
498 | char *buf) | 498 | char *buf) |
499 | { | 499 | { |
500 | return sprintf(buf, "%u\n", khugepaged_full_scans); | 500 | return sprintf(buf, "%u\n", khugepaged_full_scans); |
501 | } | 501 | } |
502 | static struct kobj_attribute full_scans_attr = | 502 | static struct kobj_attribute full_scans_attr = |
503 | __ATTR_RO(full_scans); | 503 | __ATTR_RO(full_scans); |
504 | 504 | ||
505 | static ssize_t khugepaged_defrag_show(struct kobject *kobj, | 505 | static ssize_t khugepaged_defrag_show(struct kobject *kobj, |
506 | struct kobj_attribute *attr, char *buf) | 506 | struct kobj_attribute *attr, char *buf) |
507 | { | 507 | { |
508 | return single_flag_show(kobj, attr, buf, | 508 | return single_flag_show(kobj, attr, buf, |
509 | TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG); | 509 | TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG); |
510 | } | 510 | } |
511 | static ssize_t khugepaged_defrag_store(struct kobject *kobj, | 511 | static ssize_t khugepaged_defrag_store(struct kobject *kobj, |
512 | struct kobj_attribute *attr, | 512 | struct kobj_attribute *attr, |
513 | const char *buf, size_t count) | 513 | const char *buf, size_t count) |
514 | { | 514 | { |
515 | return single_flag_store(kobj, attr, buf, count, | 515 | return single_flag_store(kobj, attr, buf, count, |
516 | TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG); | 516 | TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG); |
517 | } | 517 | } |
518 | static struct kobj_attribute khugepaged_defrag_attr = | 518 | static struct kobj_attribute khugepaged_defrag_attr = |
519 | __ATTR(defrag, 0644, khugepaged_defrag_show, | 519 | __ATTR(defrag, 0644, khugepaged_defrag_show, |
520 | khugepaged_defrag_store); | 520 | khugepaged_defrag_store); |
521 | 521 | ||
522 | /* | 522 | /* |
523 | * max_ptes_none controls if khugepaged should collapse hugepages over | 523 | * max_ptes_none controls if khugepaged should collapse hugepages over |
524 | * any unmapped ptes in turn potentially increasing the memory | 524 | * any unmapped ptes in turn potentially increasing the memory |
525 | * footprint of the vmas. When max_ptes_none is 0 khugepaged will not | 525 | * footprint of the vmas. When max_ptes_none is 0 khugepaged will not |
526 | * reduce the available free memory in the system as it | 526 | * reduce the available free memory in the system as it |
527 | * runs. Increasing max_ptes_none will instead potentially reduce the | 527 | * runs. Increasing max_ptes_none will instead potentially reduce the |
528 | * free memory in the system during the khugepaged scan. | 528 | * free memory in the system during the khugepaged scan. |
529 | */ | 529 | */ |
530 | static ssize_t khugepaged_max_ptes_none_show(struct kobject *kobj, | 530 | static ssize_t khugepaged_max_ptes_none_show(struct kobject *kobj, |
531 | struct kobj_attribute *attr, | 531 | struct kobj_attribute *attr, |
532 | char *buf) | 532 | char *buf) |
533 | { | 533 | { |
534 | return sprintf(buf, "%u\n", khugepaged_max_ptes_none); | 534 | return sprintf(buf, "%u\n", khugepaged_max_ptes_none); |
535 | } | 535 | } |
536 | static ssize_t khugepaged_max_ptes_none_store(struct kobject *kobj, | 536 | static ssize_t khugepaged_max_ptes_none_store(struct kobject *kobj, |
537 | struct kobj_attribute *attr, | 537 | struct kobj_attribute *attr, |
538 | const char *buf, size_t count) | 538 | const char *buf, size_t count) |
539 | { | 539 | { |
540 | int err; | 540 | int err; |
541 | unsigned long max_ptes_none; | 541 | unsigned long max_ptes_none; |
542 | 542 | ||
543 | err = strict_strtoul(buf, 10, &max_ptes_none); | 543 | err = strict_strtoul(buf, 10, &max_ptes_none); |
544 | if (err || max_ptes_none > HPAGE_PMD_NR-1) | 544 | if (err || max_ptes_none > HPAGE_PMD_NR-1) |
545 | return -EINVAL; | 545 | return -EINVAL; |
546 | 546 | ||
547 | khugepaged_max_ptes_none = max_ptes_none; | 547 | khugepaged_max_ptes_none = max_ptes_none; |
548 | 548 | ||
549 | return count; | 549 | return count; |
550 | } | 550 | } |
551 | static struct kobj_attribute khugepaged_max_ptes_none_attr = | 551 | static struct kobj_attribute khugepaged_max_ptes_none_attr = |
552 | __ATTR(max_ptes_none, 0644, khugepaged_max_ptes_none_show, | 552 | __ATTR(max_ptes_none, 0644, khugepaged_max_ptes_none_show, |
553 | khugepaged_max_ptes_none_store); | 553 | khugepaged_max_ptes_none_store); |
554 | 554 | ||
555 | static struct attribute *khugepaged_attr[] = { | 555 | static struct attribute *khugepaged_attr[] = { |
556 | &khugepaged_defrag_attr.attr, | 556 | &khugepaged_defrag_attr.attr, |
557 | &khugepaged_max_ptes_none_attr.attr, | 557 | &khugepaged_max_ptes_none_attr.attr, |
558 | &pages_to_scan_attr.attr, | 558 | &pages_to_scan_attr.attr, |
559 | &pages_collapsed_attr.attr, | 559 | &pages_collapsed_attr.attr, |
560 | &full_scans_attr.attr, | 560 | &full_scans_attr.attr, |
561 | &scan_sleep_millisecs_attr.attr, | 561 | &scan_sleep_millisecs_attr.attr, |
562 | &alloc_sleep_millisecs_attr.attr, | 562 | &alloc_sleep_millisecs_attr.attr, |
563 | NULL, | 563 | NULL, |
564 | }; | 564 | }; |
565 | 565 | ||
566 | static struct attribute_group khugepaged_attr_group = { | 566 | static struct attribute_group khugepaged_attr_group = { |
567 | .attrs = khugepaged_attr, | 567 | .attrs = khugepaged_attr, |
568 | .name = "khugepaged", | 568 | .name = "khugepaged", |
569 | }; | 569 | }; |
570 | 570 | ||
571 | static int __init hugepage_init_sysfs(struct kobject **hugepage_kobj) | 571 | static int __init hugepage_init_sysfs(struct kobject **hugepage_kobj) |
572 | { | 572 | { |
573 | int err; | 573 | int err; |
574 | 574 | ||
575 | *hugepage_kobj = kobject_create_and_add("transparent_hugepage", mm_kobj); | 575 | *hugepage_kobj = kobject_create_and_add("transparent_hugepage", mm_kobj); |
576 | if (unlikely(!*hugepage_kobj)) { | 576 | if (unlikely(!*hugepage_kobj)) { |
577 | printk(KERN_ERR "hugepage: failed to create transparent hugepage kobject\n"); | 577 | printk(KERN_ERR "hugepage: failed to create transparent hugepage kobject\n"); |
578 | return -ENOMEM; | 578 | return -ENOMEM; |
579 | } | 579 | } |
580 | 580 | ||
581 | err = sysfs_create_group(*hugepage_kobj, &hugepage_attr_group); | 581 | err = sysfs_create_group(*hugepage_kobj, &hugepage_attr_group); |
582 | if (err) { | 582 | if (err) { |
583 | printk(KERN_ERR "hugepage: failed to register transparent hugepage group\n"); | 583 | printk(KERN_ERR "hugepage: failed to register transparent hugepage group\n"); |
584 | goto delete_obj; | 584 | goto delete_obj; |
585 | } | 585 | } |
586 | 586 | ||
587 | err = sysfs_create_group(*hugepage_kobj, &khugepaged_attr_group); | 587 | err = sysfs_create_group(*hugepage_kobj, &khugepaged_attr_group); |
588 | if (err) { | 588 | if (err) { |
589 | printk(KERN_ERR "hugepage: failed to register transparent hugepage group\n"); | 589 | printk(KERN_ERR "hugepage: failed to register transparent hugepage group\n"); |
590 | goto remove_hp_group; | 590 | goto remove_hp_group; |
591 | } | 591 | } |
592 | 592 | ||
593 | return 0; | 593 | return 0; |
594 | 594 | ||
595 | remove_hp_group: | 595 | remove_hp_group: |
596 | sysfs_remove_group(*hugepage_kobj, &hugepage_attr_group); | 596 | sysfs_remove_group(*hugepage_kobj, &hugepage_attr_group); |
597 | delete_obj: | 597 | delete_obj: |
598 | kobject_put(*hugepage_kobj); | 598 | kobject_put(*hugepage_kobj); |
599 | return err; | 599 | return err; |
600 | } | 600 | } |
601 | 601 | ||
602 | static void __init hugepage_exit_sysfs(struct kobject *hugepage_kobj) | 602 | static void __init hugepage_exit_sysfs(struct kobject *hugepage_kobj) |
603 | { | 603 | { |
604 | sysfs_remove_group(hugepage_kobj, &khugepaged_attr_group); | 604 | sysfs_remove_group(hugepage_kobj, &khugepaged_attr_group); |
605 | sysfs_remove_group(hugepage_kobj, &hugepage_attr_group); | 605 | sysfs_remove_group(hugepage_kobj, &hugepage_attr_group); |
606 | kobject_put(hugepage_kobj); | 606 | kobject_put(hugepage_kobj); |
607 | } | 607 | } |
608 | #else | 608 | #else |
/* Without sysfs there is nothing to register: report success, leave the out pointer untouched. */
static inline int hugepage_init_sysfs(struct kobject **hugepage_kobj)
{
	return 0;
}

/* Without sysfs there is nothing to tear down. */
static inline void hugepage_exit_sysfs(struct kobject *hugepage_kobj)
{
}
617 | #endif /* CONFIG_SYSFS */ | 617 | #endif /* CONFIG_SYSFS */ |
618 | 618 | ||
619 | static int __init hugepage_init(void) | 619 | static int __init hugepage_init(void) |
620 | { | 620 | { |
621 | int err; | 621 | int err; |
622 | struct kobject *hugepage_kobj; | 622 | struct kobject *hugepage_kobj; |
623 | 623 | ||
624 | if (!has_transparent_hugepage()) { | 624 | if (!has_transparent_hugepage()) { |
625 | transparent_hugepage_flags = 0; | 625 | transparent_hugepage_flags = 0; |
626 | return -EINVAL; | 626 | return -EINVAL; |
627 | } | 627 | } |
628 | 628 | ||
629 | err = hugepage_init_sysfs(&hugepage_kobj); | 629 | err = hugepage_init_sysfs(&hugepage_kobj); |
630 | if (err) | 630 | if (err) |
631 | return err; | 631 | return err; |
632 | 632 | ||
633 | err = khugepaged_slab_init(); | 633 | err = khugepaged_slab_init(); |
634 | if (err) | 634 | if (err) |
635 | goto out; | 635 | goto out; |
636 | 636 | ||
637 | err = mm_slots_hash_init(); | 637 | err = mm_slots_hash_init(); |
638 | if (err) { | 638 | if (err) { |
639 | khugepaged_slab_free(); | 639 | khugepaged_slab_free(); |
640 | goto out; | 640 | goto out; |
641 | } | 641 | } |
642 | 642 | ||
643 | register_shrinker(&huge_zero_page_shrinker); | 643 | register_shrinker(&huge_zero_page_shrinker); |
644 | 644 | ||
645 | /* | 645 | /* |
646 | * By default disable transparent hugepages on smaller systems, | 646 | * By default disable transparent hugepages on smaller systems, |
647 | * where the extra memory used could hurt more than TLB overhead | 647 | * where the extra memory used could hurt more than TLB overhead |
648 | * is likely to save. The admin can still enable it through /sys. | 648 | * is likely to save. The admin can still enable it through /sys. |
649 | */ | 649 | */ |
650 | if (totalram_pages < (512 << (20 - PAGE_SHIFT))) | 650 | if (totalram_pages < (512 << (20 - PAGE_SHIFT))) |
651 | transparent_hugepage_flags = 0; | 651 | transparent_hugepage_flags = 0; |
652 | 652 | ||
653 | start_khugepaged(); | 653 | start_khugepaged(); |
654 | 654 | ||
655 | return 0; | 655 | return 0; |
656 | out: | 656 | out: |
657 | hugepage_exit_sysfs(hugepage_kobj); | 657 | hugepage_exit_sysfs(hugepage_kobj); |
658 | return err; | 658 | return err; |
659 | } | 659 | } |
660 | module_init(hugepage_init) | 660 | module_init(hugepage_init) |
661 | 661 | ||
662 | static int __init setup_transparent_hugepage(char *str) | 662 | static int __init setup_transparent_hugepage(char *str) |
663 | { | 663 | { |
664 | int ret = 0; | 664 | int ret = 0; |
665 | if (!str) | 665 | if (!str) |
666 | goto out; | 666 | goto out; |
667 | if (!strcmp(str, "always")) { | 667 | if (!strcmp(str, "always")) { |
668 | set_bit(TRANSPARENT_HUGEPAGE_FLAG, | 668 | set_bit(TRANSPARENT_HUGEPAGE_FLAG, |
669 | &transparent_hugepage_flags); | 669 | &transparent_hugepage_flags); |
670 | clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, | 670 | clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, |
671 | &transparent_hugepage_flags); | 671 | &transparent_hugepage_flags); |
672 | ret = 1; | 672 | ret = 1; |
673 | } else if (!strcmp(str, "madvise")) { | 673 | } else if (!strcmp(str, "madvise")) { |
674 | clear_bit(TRANSPARENT_HUGEPAGE_FLAG, | 674 | clear_bit(TRANSPARENT_HUGEPAGE_FLAG, |
675 | &transparent_hugepage_flags); | 675 | &transparent_hugepage_flags); |
676 | set_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, | 676 | set_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, |
677 | &transparent_hugepage_flags); | 677 | &transparent_hugepage_flags); |
678 | ret = 1; | 678 | ret = 1; |
679 | } else if (!strcmp(str, "never")) { | 679 | } else if (!strcmp(str, "never")) { |
680 | clear_bit(TRANSPARENT_HUGEPAGE_FLAG, | 680 | clear_bit(TRANSPARENT_HUGEPAGE_FLAG, |
681 | &transparent_hugepage_flags); | 681 | &transparent_hugepage_flags); |
682 | clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, | 682 | clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, |
683 | &transparent_hugepage_flags); | 683 | &transparent_hugepage_flags); |
684 | ret = 1; | 684 | ret = 1; |
685 | } | 685 | } |
686 | out: | 686 | out: |
687 | if (!ret) | 687 | if (!ret) |
688 | printk(KERN_WARNING | 688 | printk(KERN_WARNING |
689 | "transparent_hugepage= cannot parse, ignored\n"); | 689 | "transparent_hugepage= cannot parse, ignored\n"); |
690 | return ret; | 690 | return ret; |
691 | } | 691 | } |
692 | __setup("transparent_hugepage=", setup_transparent_hugepage); | 692 | __setup("transparent_hugepage=", setup_transparent_hugepage); |
693 | 693 | ||
694 | pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma) | 694 | pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma) |
695 | { | 695 | { |
696 | if (likely(vma->vm_flags & VM_WRITE)) | 696 | if (likely(vma->vm_flags & VM_WRITE)) |
697 | pmd = pmd_mkwrite(pmd); | 697 | pmd = pmd_mkwrite(pmd); |
698 | return pmd; | 698 | return pmd; |
699 | } | 699 | } |
700 | 700 | ||
701 | static inline pmd_t mk_huge_pmd(struct page *page, struct vm_area_struct *vma) | 701 | static inline pmd_t mk_huge_pmd(struct page *page, struct vm_area_struct *vma) |
702 | { | 702 | { |
703 | pmd_t entry; | 703 | pmd_t entry; |
704 | entry = mk_pmd(page, vma->vm_page_prot); | 704 | entry = mk_pmd(page, vma->vm_page_prot); |
705 | entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); | 705 | entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); |
706 | entry = pmd_mkhuge(entry); | 706 | entry = pmd_mkhuge(entry); |
707 | return entry; | 707 | return entry; |
708 | } | 708 | } |
709 | 709 | ||
710 | static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, | 710 | static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, |
711 | struct vm_area_struct *vma, | 711 | struct vm_area_struct *vma, |
712 | unsigned long haddr, pmd_t *pmd, | 712 | unsigned long haddr, pmd_t *pmd, |
713 | struct page *page) | 713 | struct page *page) |
714 | { | 714 | { |
715 | pgtable_t pgtable; | 715 | pgtable_t pgtable; |
716 | 716 | ||
717 | VM_BUG_ON(!PageCompound(page)); | 717 | VM_BUG_ON(!PageCompound(page)); |
718 | pgtable = pte_alloc_one(mm, haddr); | 718 | pgtable = pte_alloc_one(mm, haddr); |
719 | if (unlikely(!pgtable)) | 719 | if (unlikely(!pgtable)) |
720 | return VM_FAULT_OOM; | 720 | return VM_FAULT_OOM; |
721 | 721 | ||
722 | clear_huge_page(page, haddr, HPAGE_PMD_NR); | 722 | clear_huge_page(page, haddr, HPAGE_PMD_NR); |
723 | __SetPageUptodate(page); | 723 | __SetPageUptodate(page); |
724 | 724 | ||
725 | spin_lock(&mm->page_table_lock); | 725 | spin_lock(&mm->page_table_lock); |
726 | if (unlikely(!pmd_none(*pmd))) { | 726 | if (unlikely(!pmd_none(*pmd))) { |
727 | spin_unlock(&mm->page_table_lock); | 727 | spin_unlock(&mm->page_table_lock); |
728 | mem_cgroup_uncharge_page(page); | 728 | mem_cgroup_uncharge_page(page); |
729 | put_page(page); | 729 | put_page(page); |
730 | pte_free(mm, pgtable); | 730 | pte_free(mm, pgtable); |
731 | } else { | 731 | } else { |
732 | pmd_t entry; | 732 | pmd_t entry; |
733 | entry = mk_huge_pmd(page, vma); | 733 | entry = mk_huge_pmd(page, vma); |
734 | /* | 734 | /* |
735 | * The spinlocking to take the lru_lock inside | 735 | * The spinlocking to take the lru_lock inside |
736 | * page_add_new_anon_rmap() acts as a full memory | 736 | * page_add_new_anon_rmap() acts as a full memory |
737 | * barrier to be sure clear_huge_page writes become | 737 | * barrier to be sure clear_huge_page writes become |
738 | * visible after the set_pmd_at() write. | 738 | * visible after the set_pmd_at() write. |
739 | */ | 739 | */ |
740 | page_add_new_anon_rmap(page, vma, haddr); | 740 | page_add_new_anon_rmap(page, vma, haddr); |
741 | set_pmd_at(mm, haddr, pmd, entry); | 741 | set_pmd_at(mm, haddr, pmd, entry); |
742 | pgtable_trans_huge_deposit(mm, pgtable); | 742 | pgtable_trans_huge_deposit(mm, pgtable); |
743 | add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR); | 743 | add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR); |
744 | mm->nr_ptes++; | 744 | mm->nr_ptes++; |
745 | spin_unlock(&mm->page_table_lock); | 745 | spin_unlock(&mm->page_table_lock); |
746 | } | 746 | } |
747 | 747 | ||
748 | return 0; | 748 | return 0; |
749 | } | 749 | } |
750 | 750 | ||
751 | static inline gfp_t alloc_hugepage_gfpmask(int defrag, gfp_t extra_gfp) | 751 | static inline gfp_t alloc_hugepage_gfpmask(int defrag, gfp_t extra_gfp) |
752 | { | 752 | { |
753 | return (GFP_TRANSHUGE & ~(defrag ? 0 : __GFP_WAIT)) | extra_gfp; | 753 | return (GFP_TRANSHUGE & ~(defrag ? 0 : __GFP_WAIT)) | extra_gfp; |
754 | } | 754 | } |
755 | 755 | ||
756 | static inline struct page *alloc_hugepage_vma(int defrag, | 756 | static inline struct page *alloc_hugepage_vma(int defrag, |
757 | struct vm_area_struct *vma, | 757 | struct vm_area_struct *vma, |
758 | unsigned long haddr, int nd, | 758 | unsigned long haddr, int nd, |
759 | gfp_t extra_gfp) | 759 | gfp_t extra_gfp) |
760 | { | 760 | { |
761 | return alloc_pages_vma(alloc_hugepage_gfpmask(defrag, extra_gfp), | 761 | return alloc_pages_vma(alloc_hugepage_gfpmask(defrag, extra_gfp), |
762 | HPAGE_PMD_ORDER, vma, haddr, nd); | 762 | HPAGE_PMD_ORDER, vma, haddr, nd); |
763 | } | 763 | } |
764 | 764 | ||
765 | #ifndef CONFIG_NUMA | 765 | #ifndef CONFIG_NUMA |
766 | static inline struct page *alloc_hugepage(int defrag) | 766 | static inline struct page *alloc_hugepage(int defrag) |
767 | { | 767 | { |
768 | return alloc_pages(alloc_hugepage_gfpmask(defrag, 0), | 768 | return alloc_pages(alloc_hugepage_gfpmask(defrag, 0), |
769 | HPAGE_PMD_ORDER); | 769 | HPAGE_PMD_ORDER); |
770 | } | 770 | } |
771 | #endif | 771 | #endif |
772 | 772 | ||
773 | static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm, | 773 | static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm, |
774 | struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd, | 774 | struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd, |
775 | unsigned long zero_pfn) | 775 | unsigned long zero_pfn) |
776 | { | 776 | { |
777 | pmd_t entry; | 777 | pmd_t entry; |
778 | if (!pmd_none(*pmd)) | 778 | if (!pmd_none(*pmd)) |
779 | return false; | 779 | return false; |
780 | entry = pfn_pmd(zero_pfn, vma->vm_page_prot); | 780 | entry = pfn_pmd(zero_pfn, vma->vm_page_prot); |
781 | entry = pmd_wrprotect(entry); | 781 | entry = pmd_wrprotect(entry); |
782 | entry = pmd_mkhuge(entry); | 782 | entry = pmd_mkhuge(entry); |
783 | set_pmd_at(mm, haddr, pmd, entry); | 783 | set_pmd_at(mm, haddr, pmd, entry); |
784 | pgtable_trans_huge_deposit(mm, pgtable); | 784 | pgtable_trans_huge_deposit(mm, pgtable); |
785 | mm->nr_ptes++; | 785 | mm->nr_ptes++; |
786 | return true; | 786 | return true; |
787 | } | 787 | } |
788 | 788 | ||
789 | int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, | 789 | int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, |
790 | unsigned long address, pmd_t *pmd, | 790 | unsigned long address, pmd_t *pmd, |
791 | unsigned int flags) | 791 | unsigned int flags) |
792 | { | 792 | { |
793 | struct page *page; | 793 | struct page *page; |
794 | unsigned long haddr = address & HPAGE_PMD_MASK; | 794 | unsigned long haddr = address & HPAGE_PMD_MASK; |
795 | pte_t *pte; | 795 | pte_t *pte; |
796 | 796 | ||
797 | if (haddr >= vma->vm_start && haddr + HPAGE_PMD_SIZE <= vma->vm_end) { | 797 | if (haddr >= vma->vm_start && haddr + HPAGE_PMD_SIZE <= vma->vm_end) { |
798 | if (unlikely(anon_vma_prepare(vma))) | 798 | if (unlikely(anon_vma_prepare(vma))) |
799 | return VM_FAULT_OOM; | 799 | return VM_FAULT_OOM; |
800 | if (unlikely(khugepaged_enter(vma))) | 800 | if (unlikely(khugepaged_enter(vma))) |
801 | return VM_FAULT_OOM; | 801 | return VM_FAULT_OOM; |
802 | if (!(flags & FAULT_FLAG_WRITE) && | 802 | if (!(flags & FAULT_FLAG_WRITE) && |
803 | transparent_hugepage_use_zero_page()) { | 803 | transparent_hugepage_use_zero_page()) { |
804 | pgtable_t pgtable; | 804 | pgtable_t pgtable; |
805 | unsigned long zero_pfn; | 805 | unsigned long zero_pfn; |
806 | bool set; | 806 | bool set; |
807 | pgtable = pte_alloc_one(mm, haddr); | 807 | pgtable = pte_alloc_one(mm, haddr); |
808 | if (unlikely(!pgtable)) | 808 | if (unlikely(!pgtable)) |
809 | return VM_FAULT_OOM; | 809 | return VM_FAULT_OOM; |
810 | zero_pfn = get_huge_zero_page(); | 810 | zero_pfn = get_huge_zero_page(); |
811 | if (unlikely(!zero_pfn)) { | 811 | if (unlikely(!zero_pfn)) { |
812 | pte_free(mm, pgtable); | 812 | pte_free(mm, pgtable); |
813 | count_vm_event(THP_FAULT_FALLBACK); | 813 | count_vm_event(THP_FAULT_FALLBACK); |
814 | goto out; | 814 | goto out; |
815 | } | 815 | } |
816 | spin_lock(&mm->page_table_lock); | 816 | spin_lock(&mm->page_table_lock); |
817 | set = set_huge_zero_page(pgtable, mm, vma, haddr, pmd, | 817 | set = set_huge_zero_page(pgtable, mm, vma, haddr, pmd, |
818 | zero_pfn); | 818 | zero_pfn); |
819 | spin_unlock(&mm->page_table_lock); | 819 | spin_unlock(&mm->page_table_lock); |
820 | if (!set) { | 820 | if (!set) { |
821 | pte_free(mm, pgtable); | 821 | pte_free(mm, pgtable); |
822 | put_huge_zero_page(); | 822 | put_huge_zero_page(); |
823 | } | 823 | } |
824 | return 0; | 824 | return 0; |
825 | } | 825 | } |
826 | page = alloc_hugepage_vma(transparent_hugepage_defrag(vma), | 826 | page = alloc_hugepage_vma(transparent_hugepage_defrag(vma), |
827 | vma, haddr, numa_node_id(), 0); | 827 | vma, haddr, numa_node_id(), 0); |
828 | if (unlikely(!page)) { | 828 | if (unlikely(!page)) { |
829 | count_vm_event(THP_FAULT_FALLBACK); | 829 | count_vm_event(THP_FAULT_FALLBACK); |
830 | goto out; | 830 | goto out; |
831 | } | 831 | } |
832 | count_vm_event(THP_FAULT_ALLOC); | 832 | count_vm_event(THP_FAULT_ALLOC); |
833 | if (unlikely(mem_cgroup_newpage_charge(page, mm, GFP_KERNEL))) { | 833 | if (unlikely(mem_cgroup_newpage_charge(page, mm, GFP_KERNEL))) { |
834 | put_page(page); | 834 | put_page(page); |
835 | goto out; | 835 | goto out; |
836 | } | 836 | } |
837 | if (unlikely(__do_huge_pmd_anonymous_page(mm, vma, haddr, pmd, | 837 | if (unlikely(__do_huge_pmd_anonymous_page(mm, vma, haddr, pmd, |
838 | page))) { | 838 | page))) { |
839 | mem_cgroup_uncharge_page(page); | 839 | mem_cgroup_uncharge_page(page); |
840 | put_page(page); | 840 | put_page(page); |
841 | goto out; | 841 | goto out; |
842 | } | 842 | } |
843 | 843 | ||
844 | return 0; | 844 | return 0; |
845 | } | 845 | } |
846 | out: | 846 | out: |
847 | /* | 847 | /* |
848 | * Use __pte_alloc instead of pte_alloc_map, because we can't | 848 | * Use __pte_alloc instead of pte_alloc_map, because we can't |
849 | * run pte_offset_map on the pmd, if an huge pmd could | 849 | * run pte_offset_map on the pmd, if an huge pmd could |
850 | * materialize from under us from a different thread. | 850 | * materialize from under us from a different thread. |
851 | */ | 851 | */ |
852 | if (unlikely(pmd_none(*pmd)) && | 852 | if (unlikely(pmd_none(*pmd)) && |
853 | unlikely(__pte_alloc(mm, vma, pmd, address))) | 853 | unlikely(__pte_alloc(mm, vma, pmd, address))) |
854 | return VM_FAULT_OOM; | 854 | return VM_FAULT_OOM; |
855 | /* if an huge pmd materialized from under us just retry later */ | 855 | /* if an huge pmd materialized from under us just retry later */ |
856 | if (unlikely(pmd_trans_huge(*pmd))) | 856 | if (unlikely(pmd_trans_huge(*pmd))) |
857 | return 0; | 857 | return 0; |
858 | /* | 858 | /* |
859 | * A regular pmd is established and it can't morph into a huge pmd | 859 | * A regular pmd is established and it can't morph into a huge pmd |
860 | * from under us anymore at this point because we hold the mmap_sem | 860 | * from under us anymore at this point because we hold the mmap_sem |
861 | * read mode and khugepaged takes it in write mode. So now it's | 861 | * read mode and khugepaged takes it in write mode. So now it's |
862 | * safe to run pte_offset_map(). | 862 | * safe to run pte_offset_map(). |
863 | */ | 863 | */ |
864 | pte = pte_offset_map(pmd, address); | 864 | pte = pte_offset_map(pmd, address); |
865 | return handle_pte_fault(mm, vma, address, pte, pmd, flags); | 865 | return handle_pte_fault(mm, vma, address, pte, pmd, flags); |
866 | } | 866 | } |
867 | 867 | ||
/*
 * copy_huge_pmd - duplicate the huge pmd at @src_pmd of @src_mm into
 * @dst_pmd of @dst_mm.
 *
 * A page table is allocated up front for the destination; it is either
 * handed over to the destination mm or freed again on the paths that do
 * not install a huge pmd.
 *
 * Returns:
 *   0        the destination pmd was populated (huge zero page or a
 *            shared, write-protected mapping of the source huge page);
 *   -ENOMEM  the page table allocation failed;
 *   -EAGAIN  *src_pmd is not a huge pmd, or it was found under splitting
 *            (in which case the split has been waited for before
 *            returning).
 */
int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
		  pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr,
		  struct vm_area_struct *vma)
{
	struct page *src_page;
	pmd_t pmd;
	pgtable_t pgtable;
	int ret;

	/* Allocate before taking any spinlock. */
	ret = -ENOMEM;
	pgtable = pte_alloc_one(dst_mm, addr);
	if (unlikely(!pgtable))
		goto out;

	/* Destination lock first, source lock nested inside it. */
	spin_lock(&dst_mm->page_table_lock);
	spin_lock_nested(&src_mm->page_table_lock, SINGLE_DEPTH_NESTING);

	ret = -EAGAIN;
	/* Snapshot the source pmd now that both locks are held. */
	pmd = *src_pmd;
	if (unlikely(!pmd_trans_huge(pmd))) {
		pte_free(dst_mm, pgtable);
		goto out_unlock;
	}
	/*
	 * mm->page_table_lock is enough to be sure that huge zero pmd is not
	 * under splitting since we don't split the page itself, only pmd to
	 * a page table.
	 */
	if (is_huge_zero_pmd(pmd)) {
		unsigned long zero_pfn;
		bool set;
		/*
		 * get_huge_zero_page() will never allocate a new page here,
		 * since we already have a zero page to copy.  It just takes a
		 * reference.
		 */
		zero_pfn = get_huge_zero_page();
		set = set_huge_zero_page(pgtable, dst_mm, vma, addr, dst_pmd,
				zero_pfn);
		BUG_ON(!set); /* unexpected !pmd_none(dst_pmd) */
		ret = 0;
		goto out_unlock;
	}
	if (unlikely(pmd_trans_splitting(pmd))) {
		/* split huge page running from under us */
		/* Drop both locks and the unused page table before waiting. */
		spin_unlock(&src_mm->page_table_lock);
		spin_unlock(&dst_mm->page_table_lock);
		pte_free(dst_mm, pgtable);

		wait_split_huge_page(vma->anon_vma, src_pmd); /* src_vma */
		/* Locks are already released: skip out_unlock, ret is -EAGAIN. */
		goto out;
	}
	src_page = pmd_page(pmd);
	VM_BUG_ON(!PageHead(src_page));
	/* The destination mapping gets its own page reference and rmap count. */
	get_page(src_page);
	page_dup_rmap(src_page);
	add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR);

	/* Write-protect the source, then install a write-protected, old copy. */
	pmdp_set_wrprotect(src_mm, addr, src_pmd);
	pmd = pmd_mkold(pmd_wrprotect(pmd));
	set_pmd_at(dst_mm, addr, dst_pmd, pmd);
	/* Hand the preallocated page table over to the destination mm. */
	pgtable_trans_huge_deposit(dst_mm, pgtable);
	dst_mm->nr_ptes++;

	ret = 0;
out_unlock:
	spin_unlock(&src_mm->page_table_lock);
	spin_unlock(&dst_mm->page_table_lock);
out:
	return ret;
}
939 | 939 | ||
940 | void huge_pmd_set_accessed(struct mm_struct *mm, | 940 | void huge_pmd_set_accessed(struct mm_struct *mm, |
941 | struct vm_area_struct *vma, | 941 | struct vm_area_struct *vma, |
942 | unsigned long address, | 942 | unsigned long address, |
943 | pmd_t *pmd, pmd_t orig_pmd, | 943 | pmd_t *pmd, pmd_t orig_pmd, |
944 | int dirty) | 944 | int dirty) |
945 | { | 945 | { |
946 | pmd_t entry; | 946 | pmd_t entry; |
947 | unsigned long haddr; | 947 | unsigned long haddr; |
948 | 948 | ||
949 | spin_lock(&mm->page_table_lock); | 949 | spin_lock(&mm->page_table_lock); |
950 | if (unlikely(!pmd_same(*pmd, orig_pmd))) | 950 | if (unlikely(!pmd_same(*pmd, orig_pmd))) |
951 | goto unlock; | 951 | goto unlock; |
952 | 952 | ||
953 | entry = pmd_mkyoung(orig_pmd); | 953 | entry = pmd_mkyoung(orig_pmd); |
954 | haddr = address & HPAGE_PMD_MASK; | 954 | haddr = address & HPAGE_PMD_MASK; |
955 | if (pmdp_set_access_flags(vma, haddr, pmd, entry, dirty)) | 955 | if (pmdp_set_access_flags(vma, haddr, pmd, entry, dirty)) |
956 | update_mmu_cache_pmd(vma, address, pmd); | 956 | update_mmu_cache_pmd(vma, address, pmd); |
957 | 957 | ||
958 | unlock: | 958 | unlock: |
959 | spin_unlock(&mm->page_table_lock); | 959 | spin_unlock(&mm->page_table_lock); |
960 | } | 960 | } |
961 | 961 | ||
/*
 * do_huge_pmd_wp_zero_page_fallback - write fault on the huge zero pmd
 * when no new huge page is available (see the caller, which uses this
 * when its huge page allocation yields NULL).
 *
 * One small page is allocated, charged and cleared for the faulting
 * @address.  The huge pmd is then replaced by a regular page table in
 * which the faulting address maps that new page and every other entry
 * of the range maps the small zero page as a special pte.
 *
 * Returns VM_FAULT_WRITE on success, VM_FAULT_OOM if the page could not
 * be allocated or charged, and 0 if *pmd no longer matches @orig_pmd.
 */
static int do_huge_pmd_wp_zero_page_fallback(struct mm_struct *mm,
		struct vm_area_struct *vma, unsigned long address,
		pmd_t *pmd, pmd_t orig_pmd, unsigned long haddr)
{
	pgtable_t pgtable;
	pmd_t _pmd;
	struct page *page;
	int i, ret = 0;
	unsigned long mmun_start;	/* For mmu_notifiers */
	unsigned long mmun_end;		/* For mmu_notifiers */

	page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
	if (!page) {
		ret |= VM_FAULT_OOM;
		goto out;
	}

	if (mem_cgroup_newpage_charge(page, mm, GFP_KERNEL)) {
		put_page(page);
		ret |= VM_FAULT_OOM;
		goto out;
	}

	clear_user_highpage(page, address);
	__SetPageUptodate(page);

	/*
	 * Remember the range now: haddr is advanced by the fill loop below
	 * and no longer points at the start of the huge page afterwards.
	 */
	mmun_start = haddr;
	mmun_end   = haddr + HPAGE_PMD_SIZE;
	mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);

	spin_lock(&mm->page_table_lock);
	/* Bail out if the pmd changed since the caller sampled it. */
	if (unlikely(!pmd_same(*pmd, orig_pmd)))
		goto out_free_page;

	pmdp_clear_flush(vma, haddr, pmd);
	/* leave pmd empty until pte is filled */

	/*
	 * Fill the withdrawn page table through a local, temporary pmd so
	 * the real pmd stays empty until every pte has been written.
	 */
	pgtable = pgtable_trans_huge_withdraw(mm);
	pmd_populate(mm, &_pmd, pgtable);

	for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
		pte_t *pte, entry;
		if (haddr == (address & PAGE_MASK)) {
			/* The faulting small page gets the freshly cleared page. */
			entry = mk_pte(page, vma->vm_page_prot);
			entry = maybe_mkwrite(pte_mkdirty(entry), vma);
			page_add_new_anon_rmap(page, vma, haddr);
		} else {
			/* Everything else maps the small zero page. */
			entry = pfn_pte(my_zero_pfn(haddr), vma->vm_page_prot);
			entry = pte_mkspecial(entry);
		}
		pte = pte_offset_map(&_pmd, haddr);
		VM_BUG_ON(!pte_none(*pte));
		set_pte_at(mm, haddr, pte, entry);
		pte_unmap(pte);
	}
	smp_wmb(); /* make pte visible before pmd */
	pmd_populate(mm, pmd, pgtable);
	spin_unlock(&mm->page_table_lock);
	/* This mapping no longer uses the huge zero page. */
	put_huge_zero_page();
	/* Exactly one anonymous page was added to the mm. */
	inc_mm_counter(mm, MM_ANONPAGES);

	mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);

	ret |= VM_FAULT_WRITE;
out:
	return ret;
out_free_page:
	/* The pmd changed under us: undo the charge and free the page. */
	spin_unlock(&mm->page_table_lock);
	mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
	mem_cgroup_uncharge_page(page);
	put_page(page);
	goto out;
}
1035 | 1035 | ||
/*
 * do_huge_pmd_wp_page_fallback - copy-on-write of a huge page using
 * HPAGE_PMD_NR individually allocated small pages (the caller uses this
 * when it has no new huge page to copy into).
 *
 * Every small page is allocated and charged first, then filled from the
 * matching subpage of @page.  Under mm->page_table_lock the huge pmd is
 * replaced by a regular page table that maps the copies.
 *
 * Returns VM_FAULT_WRITE on success, VM_FAULT_OOM if the array or any
 * small page could not be allocated or charged, and 0 if *pmd no longer
 * matches @orig_pmd (all copies are released in that case).
 */
static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
					struct vm_area_struct *vma,
					unsigned long address,
					pmd_t *pmd, pmd_t orig_pmd,
					struct page *page,
					unsigned long haddr)
{
	pgtable_t pgtable;
	pmd_t _pmd;
	int ret = 0, i;
	struct page **pages;
	unsigned long mmun_start;	/* For mmu_notifiers */
	unsigned long mmun_end;		/* For mmu_notifiers */

	/* Temporary array holding the replacement small pages. */
	pages = kmalloc(sizeof(struct page *) * HPAGE_PMD_NR,
			GFP_KERNEL);
	if (unlikely(!pages)) {
		ret |= VM_FAULT_OOM;
		goto out;
	}

	for (i = 0; i < HPAGE_PMD_NR; i++) {
		/* Allocation is steered by the node id of the original page. */
		pages[i] = alloc_page_vma_node(GFP_HIGHUSER_MOVABLE |
					       __GFP_OTHER_NODE,
					       vma, address, page_to_nid(page));
		if (unlikely(!pages[i] ||
			     mem_cgroup_newpage_charge(pages[i], mm,
						       GFP_KERNEL))) {
			/* pages[i] was allocated but not charged: just free it. */
			if (pages[i])
				put_page(pages[i]);
			/* Unwind every page that was allocated and charged. */
			mem_cgroup_uncharge_start();
			while (--i >= 0) {
				mem_cgroup_uncharge_page(pages[i]);
				put_page(pages[i]);
			}
			mem_cgroup_uncharge_end();
			kfree(pages);
			ret |= VM_FAULT_OOM;
			goto out;
		}
	}

	/* Copy the data without holding the page table lock. */
	for (i = 0; i < HPAGE_PMD_NR; i++) {
		copy_user_highpage(pages[i], page + i,
				   haddr + PAGE_SIZE * i, vma);
		__SetPageUptodate(pages[i]);
		cond_resched();
	}

	/*
	 * Remember the range now: haddr is advanced by the fill loop below.
	 */
	mmun_start = haddr;
	mmun_end   = haddr + HPAGE_PMD_SIZE;
	mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);

	spin_lock(&mm->page_table_lock);
	/* Bail out if the pmd changed since the caller sampled it. */
	if (unlikely(!pmd_same(*pmd, orig_pmd)))
		goto out_free_pages;
	VM_BUG_ON(!PageHead(page));

	pmdp_clear_flush(vma, haddr, pmd);
	/* leave pmd empty until pte is filled */

	/*
	 * Fill the withdrawn page table through a local, temporary pmd so
	 * the real pmd stays empty until every pte has been written.
	 */
	pgtable = pgtable_trans_huge_withdraw(mm);
	pmd_populate(mm, &_pmd, pgtable);

	for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
		pte_t *pte, entry;
		entry = mk_pte(pages[i], vma->vm_page_prot);
		entry = maybe_mkwrite(pte_mkdirty(entry), vma);
		page_add_new_anon_rmap(pages[i], vma, haddr);
		pte = pte_offset_map(&_pmd, haddr);
		VM_BUG_ON(!pte_none(*pte));
		set_pte_at(mm, haddr, pte, entry);
		pte_unmap(pte);
	}
	/* Only the array is freed; the pages themselves are now mapped. */
	kfree(pages);

	smp_wmb(); /* make pte visible before pmd */
	pmd_populate(mm, pmd, pgtable);
	/* The old huge page is no longer mapped by this pmd. */
	page_remove_rmap(page);
	spin_unlock(&mm->page_table_lock);

	mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);

	ret |= VM_FAULT_WRITE;
	/*
	 * NOTE(review): presumably this drops the reference that belonged to
	 * the mapping just removed; the caller does its own put_page() after
	 * this function returns - confirm.
	 */
	put_page(page);

out:
	return ret;

out_free_pages:
	/* The pmd changed under us: release every copy we prepared. */
	spin_unlock(&mm->page_table_lock);
	mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
	mem_cgroup_uncharge_start();
	for (i = 0; i < HPAGE_PMD_NR; i++) {
		mem_cgroup_uncharge_page(pages[i]);
		put_page(pages[i]);
	}
	mem_cgroup_uncharge_end();
	kfree(pages);
	goto out;
}
1137 | 1137 | ||
/*
 * do_huge_pmd_wp_page - write fault on a huge pmd that does not allow
 * the write as it stands.  @orig_pmd is the pmd value the caller
 * observed; it is re-checked against *pmd whenever the page table lock
 * is (re)taken.
 *
 * Outline:
 *  - huge zero pmd: there is no old page, go straight to allocation;
 *  - the huge page is mapped exactly once: make the existing pmd young,
 *    dirty and (if the vma allows) writable in place;
 *  - otherwise allocate a new huge page, fill it (cleared for the zero
 *    pmd, copied from the old page otherwise) and install it;
 *  - if no huge page can be allocated, use the small-page fallbacks.
 *
 * Returns a mask of VM_FAULT_* bits: VM_FAULT_WRITE when the write can
 * proceed, VM_FAULT_OOM on allocation or charge failure, 0 when the pmd
 * changed under us.
 */
int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
			unsigned long address, pmd_t *pmd, pmd_t orig_pmd)
{
	int ret = 0;
	struct page *page = NULL, *new_page;
	unsigned long haddr;
	unsigned long mmun_start;	/* For mmu_notifiers */
	unsigned long mmun_end;		/* For mmu_notifiers */

	VM_BUG_ON(!vma->anon_vma);
	haddr = address & HPAGE_PMD_MASK;
	/* Huge zero page: nothing to reuse, page stays NULL. */
	if (is_huge_zero_pmd(orig_pmd))
		goto alloc;
	spin_lock(&mm->page_table_lock);
	if (unlikely(!pmd_same(*pmd, orig_pmd)))
		goto out_unlock;

	page = pmd_page(orig_pmd);
	VM_BUG_ON(!PageCompound(page) || !PageHead(page));
	if (page_mapcount(page) == 1) {
		/* Sole mapping: reuse the page, no copy needed. */
		pmd_t entry;
		entry = pmd_mkyoung(orig_pmd);
		entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
		if (pmdp_set_access_flags(vma, haddr, pmd, entry,  1))
			update_mmu_cache_pmd(vma, address, pmd);
		ret |= VM_FAULT_WRITE;
		goto out_unlock;
	}
	/* Pin the old page; it is used after the lock is dropped. */
	get_page(page);
	spin_unlock(&mm->page_table_lock);
alloc:
	if (transparent_hugepage_enabled(vma) &&
	    !transparent_hugepage_debug_cow())
		new_page = alloc_hugepage_vma(transparent_hugepage_defrag(vma),
					      vma, haddr, numa_node_id(), 0);
	else
		new_page = NULL;

	if (unlikely(!new_page)) {
		/* No huge page available: fall back to small pages. */
		count_vm_event(THP_FAULT_FALLBACK);
		if (is_huge_zero_pmd(orig_pmd)) {
			ret = do_huge_pmd_wp_zero_page_fallback(mm, vma,
					address, pmd, orig_pmd, haddr);
		} else {
			ret = do_huge_pmd_wp_page_fallback(mm, vma, address,
					pmd, orig_pmd, page, haddr);
			/* Even the small-page copy failed: split the old page. */
			if (ret & VM_FAULT_OOM)
				split_huge_page(page);
			/* Balances the get_page() above. */
			put_page(page);
		}
		goto out;
	}
	count_vm_event(THP_FAULT_ALLOC);

	if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) {
		put_page(new_page);
		/* page is NULL on the huge zero pmd path. */
		if (page) {
			split_huge_page(page);
			put_page(page);
		}
		ret |= VM_FAULT_OOM;
		goto out;
	}

	/* Fill the new huge page before taking the page table lock. */
	if (is_huge_zero_pmd(orig_pmd))
		clear_huge_page(new_page, haddr, HPAGE_PMD_NR);
	else
		copy_user_huge_page(new_page, page, haddr, vma, HPAGE_PMD_NR);
	__SetPageUptodate(new_page);

	mmun_start = haddr;
	mmun_end   = haddr + HPAGE_PMD_SIZE;
	mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);

	spin_lock(&mm->page_table_lock);
	/*
	 * Balances the get_page() taken before the lock was dropped.
	 * NOTE(review): page is still used below when the pmd is unchanged,
	 * presumably kept alive by the mapping's own reference - confirm.
	 */
	if (page)
		put_page(page);
	if (unlikely(!pmd_same(*pmd, orig_pmd))) {
		/* The pmd changed under us: discard the new page. */
		spin_unlock(&mm->page_table_lock);
		mem_cgroup_uncharge_page(new_page);
		put_page(new_page);
		goto out_mn;
	} else {
		pmd_t entry;
		entry = mk_huge_pmd(new_page, vma);
		pmdp_clear_flush(vma, haddr, pmd);
		page_add_new_anon_rmap(new_page, vma, haddr);
		set_pmd_at(mm, haddr, pmd, entry);
		update_mmu_cache_pmd(vma, address, pmd);
		if (is_huge_zero_pmd(orig_pmd)) {
			/* A whole huge page of anonymous memory was added. */
			add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR);
			put_huge_zero_page();
		} else {
			/* Unmap the old huge page and drop another reference. */
			VM_BUG_ON(!PageHead(page));
			page_remove_rmap(page);
			put_page(page);
		}
		ret |= VM_FAULT_WRITE;
	}
	spin_unlock(&mm->page_table_lock);
out_mn:
	mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
out:
	return ret;
out_unlock:
	spin_unlock(&mm->page_table_lock);
	return ret;
}
1246 | 1246 | ||
1247 | struct page *follow_trans_huge_pmd(struct vm_area_struct *vma, | 1247 | struct page *follow_trans_huge_pmd(struct vm_area_struct *vma, |
1248 | unsigned long addr, | 1248 | unsigned long addr, |
1249 | pmd_t *pmd, | 1249 | pmd_t *pmd, |
1250 | unsigned int flags) | 1250 | unsigned int flags) |
1251 | { | 1251 | { |
1252 | struct mm_struct *mm = vma->vm_mm; | 1252 | struct mm_struct *mm = vma->vm_mm; |
1253 | struct page *page = NULL; | 1253 | struct page *page = NULL; |
1254 | 1254 | ||
1255 | assert_spin_locked(&mm->page_table_lock); | 1255 | assert_spin_locked(&mm->page_table_lock); |
1256 | 1256 | ||
1257 | if (flags & FOLL_WRITE && !pmd_write(*pmd)) | 1257 | if (flags & FOLL_WRITE && !pmd_write(*pmd)) |
1258 | goto out; | 1258 | goto out; |
1259 | 1259 | ||
1260 | page = pmd_page(*pmd); | 1260 | page = pmd_page(*pmd); |
1261 | VM_BUG_ON(!PageHead(page)); | 1261 | VM_BUG_ON(!PageHead(page)); |
1262 | if (flags & FOLL_TOUCH) { | 1262 | if (flags & FOLL_TOUCH) { |
1263 | pmd_t _pmd; | 1263 | pmd_t _pmd; |
1264 | /* | 1264 | /* |
1265 | * We should set the dirty bit only for FOLL_WRITE but | 1265 | * We should set the dirty bit only for FOLL_WRITE but |
1266 | * for now the dirty bit in the pmd is meaningless. | 1266 | * for now the dirty bit in the pmd is meaningless. |
1267 | * And if the dirty bit will become meaningful and | 1267 | * And if the dirty bit will become meaningful and |
1268 | * we'll only set it with FOLL_WRITE, an atomic | 1268 | * we'll only set it with FOLL_WRITE, an atomic |
1269 | * set_bit will be required on the pmd to set the | 1269 | * set_bit will be required on the pmd to set the |
1270 | * young bit, instead of the current set_pmd_at. | 1270 | * young bit, instead of the current set_pmd_at. |
1271 | */ | 1271 | */ |
1272 | _pmd = pmd_mkyoung(pmd_mkdirty(*pmd)); | 1272 | _pmd = pmd_mkyoung(pmd_mkdirty(*pmd)); |
1273 | set_pmd_at(mm, addr & HPAGE_PMD_MASK, pmd, _pmd); | 1273 | set_pmd_at(mm, addr & HPAGE_PMD_MASK, pmd, _pmd); |
1274 | } | 1274 | } |
1275 | if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) { | 1275 | if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) { |
1276 | if (page->mapping && trylock_page(page)) { | 1276 | if (page->mapping && trylock_page(page)) { |
1277 | lru_add_drain(); | 1277 | lru_add_drain(); |
1278 | if (page->mapping) | 1278 | if (page->mapping) |
1279 | mlock_vma_page(page); | 1279 | mlock_vma_page(page); |
1280 | unlock_page(page); | 1280 | unlock_page(page); |
1281 | } | 1281 | } |
1282 | } | 1282 | } |
1283 | page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT; | 1283 | page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT; |
1284 | VM_BUG_ON(!PageCompound(page)); | 1284 | VM_BUG_ON(!PageCompound(page)); |
1285 | if (flags & FOLL_GET) | 1285 | if (flags & FOLL_GET) |
1286 | get_page_foll(page); | 1286 | get_page_foll(page); |
1287 | 1287 | ||
1288 | out: | 1288 | out: |
1289 | return page; | 1289 | return page; |
1290 | } | 1290 | } |
1291 | 1291 | ||
1292 | /* NUMA hinting page fault entry point for trans huge pmds */ | 1292 | /* NUMA hinting page fault entry point for trans huge pmds */ |
1293 | int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, | 1293 | int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, |
1294 | unsigned long addr, pmd_t pmd, pmd_t *pmdp) | 1294 | unsigned long addr, pmd_t pmd, pmd_t *pmdp) |
1295 | { | 1295 | { |
1296 | struct page *page; | 1296 | struct page *page; |
1297 | unsigned long haddr = addr & HPAGE_PMD_MASK; | 1297 | unsigned long haddr = addr & HPAGE_PMD_MASK; |
1298 | int target_nid; | 1298 | int target_nid; |
1299 | int current_nid = -1; | 1299 | int current_nid = -1; |
1300 | bool migrated; | 1300 | bool migrated; |
1301 | bool page_locked = false; | 1301 | bool page_locked = false; |
1302 | 1302 | ||
1303 | spin_lock(&mm->page_table_lock); | 1303 | spin_lock(&mm->page_table_lock); |
1304 | if (unlikely(!pmd_same(pmd, *pmdp))) | 1304 | if (unlikely(!pmd_same(pmd, *pmdp))) |
1305 | goto out_unlock; | 1305 | goto out_unlock; |
1306 | 1306 | ||
1307 | page = pmd_page(pmd); | 1307 | page = pmd_page(pmd); |
1308 | get_page(page); | 1308 | get_page(page); |
1309 | current_nid = page_to_nid(page); | 1309 | current_nid = page_to_nid(page); |
1310 | count_vm_numa_event(NUMA_HINT_FAULTS); | 1310 | count_vm_numa_event(NUMA_HINT_FAULTS); |
1311 | if (current_nid == numa_node_id()) | 1311 | if (current_nid == numa_node_id()) |
1312 | count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL); | 1312 | count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL); |
1313 | 1313 | ||
1314 | target_nid = mpol_misplaced(page, vma, haddr); | 1314 | target_nid = mpol_misplaced(page, vma, haddr); |
1315 | if (target_nid == -1) { | 1315 | if (target_nid == -1) { |
1316 | put_page(page); | 1316 | put_page(page); |
1317 | goto clear_pmdnuma; | 1317 | goto clear_pmdnuma; |
1318 | } | 1318 | } |
1319 | 1319 | ||
1320 | /* Acquire the page lock to serialise THP migrations */ | 1320 | /* Acquire the page lock to serialise THP migrations */ |
1321 | spin_unlock(&mm->page_table_lock); | 1321 | spin_unlock(&mm->page_table_lock); |
1322 | lock_page(page); | 1322 | lock_page(page); |
1323 | page_locked = true; | 1323 | page_locked = true; |
1324 | 1324 | ||
1325 | /* Confirm the PTE did not while locked */ | 1325 | /* Confirm the PTE did not while locked */ |
1326 | spin_lock(&mm->page_table_lock); | 1326 | spin_lock(&mm->page_table_lock); |
1327 | if (unlikely(!pmd_same(pmd, *pmdp))) { | 1327 | if (unlikely(!pmd_same(pmd, *pmdp))) { |
1328 | unlock_page(page); | 1328 | unlock_page(page); |
1329 | put_page(page); | 1329 | put_page(page); |
1330 | goto out_unlock; | 1330 | goto out_unlock; |
1331 | } | 1331 | } |
1332 | spin_unlock(&mm->page_table_lock); | 1332 | spin_unlock(&mm->page_table_lock); |
1333 | 1333 | ||
1334 | /* Migrate the THP to the requested node */ | 1334 | /* Migrate the THP to the requested node */ |
1335 | migrated = migrate_misplaced_transhuge_page(mm, vma, | 1335 | migrated = migrate_misplaced_transhuge_page(mm, vma, |
1336 | pmdp, pmd, addr, | 1336 | pmdp, pmd, addr, |
1337 | page, target_nid); | 1337 | page, target_nid); |
1338 | if (migrated) | 1338 | if (migrated) |
1339 | current_nid = target_nid; | 1339 | current_nid = target_nid; |
1340 | else { | 1340 | else { |
1341 | spin_lock(&mm->page_table_lock); | 1341 | spin_lock(&mm->page_table_lock); |
1342 | if (unlikely(!pmd_same(pmd, *pmdp))) { | 1342 | if (unlikely(!pmd_same(pmd, *pmdp))) { |
1343 | unlock_page(page); | 1343 | unlock_page(page); |
1344 | goto out_unlock; | 1344 | goto out_unlock; |
1345 | } | 1345 | } |
1346 | goto clear_pmdnuma; | 1346 | goto clear_pmdnuma; |
1347 | } | 1347 | } |
1348 | 1348 | ||
1349 | task_numa_fault(current_nid, HPAGE_PMD_NR, migrated); | 1349 | task_numa_fault(current_nid, HPAGE_PMD_NR, migrated); |
1350 | return 0; | 1350 | return 0; |
1351 | 1351 | ||
1352 | clear_pmdnuma: | 1352 | clear_pmdnuma: |
1353 | pmd = pmd_mknonnuma(pmd); | 1353 | pmd = pmd_mknonnuma(pmd); |
1354 | set_pmd_at(mm, haddr, pmdp, pmd); | 1354 | set_pmd_at(mm, haddr, pmdp, pmd); |
1355 | VM_BUG_ON(pmd_numa(*pmdp)); | 1355 | VM_BUG_ON(pmd_numa(*pmdp)); |
1356 | update_mmu_cache_pmd(vma, addr, pmdp); | 1356 | update_mmu_cache_pmd(vma, addr, pmdp); |
1357 | if (page_locked) | 1357 | if (page_locked) |
1358 | unlock_page(page); | 1358 | unlock_page(page); |
1359 | 1359 | ||
1360 | out_unlock: | 1360 | out_unlock: |
1361 | spin_unlock(&mm->page_table_lock); | 1361 | spin_unlock(&mm->page_table_lock); |
1362 | if (current_nid != -1) | 1362 | if (current_nid != -1) |
1363 | task_numa_fault(current_nid, HPAGE_PMD_NR, migrated); | 1363 | task_numa_fault(current_nid, HPAGE_PMD_NR, migrated); |
1364 | return 0; | 1364 | return 0; |
1365 | } | 1365 | } |
1366 | 1366 | ||
1367 | int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, | 1367 | int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, |
1368 | pmd_t *pmd, unsigned long addr) | 1368 | pmd_t *pmd, unsigned long addr) |
1369 | { | 1369 | { |
1370 | int ret = 0; | 1370 | int ret = 0; |
1371 | 1371 | ||
1372 | if (__pmd_trans_huge_lock(pmd, vma) == 1) { | 1372 | if (__pmd_trans_huge_lock(pmd, vma) == 1) { |
1373 | struct page *page; | 1373 | struct page *page; |
1374 | pgtable_t pgtable; | 1374 | pgtable_t pgtable; |
1375 | pmd_t orig_pmd; | 1375 | pmd_t orig_pmd; |
1376 | pgtable = pgtable_trans_huge_withdraw(tlb->mm); | 1376 | pgtable = pgtable_trans_huge_withdraw(tlb->mm); |
1377 | orig_pmd = pmdp_get_and_clear(tlb->mm, addr, pmd); | 1377 | orig_pmd = pmdp_get_and_clear(tlb->mm, addr, pmd); |
1378 | tlb_remove_pmd_tlb_entry(tlb, pmd, addr); | 1378 | tlb_remove_pmd_tlb_entry(tlb, pmd, addr); |
1379 | if (is_huge_zero_pmd(orig_pmd)) { | 1379 | if (is_huge_zero_pmd(orig_pmd)) { |
1380 | tlb->mm->nr_ptes--; | 1380 | tlb->mm->nr_ptes--; |
1381 | spin_unlock(&tlb->mm->page_table_lock); | 1381 | spin_unlock(&tlb->mm->page_table_lock); |
1382 | put_huge_zero_page(); | 1382 | put_huge_zero_page(); |
1383 | } else { | 1383 | } else { |
1384 | page = pmd_page(orig_pmd); | 1384 | page = pmd_page(orig_pmd); |
1385 | page_remove_rmap(page); | 1385 | page_remove_rmap(page); |
1386 | VM_BUG_ON(page_mapcount(page) < 0); | 1386 | VM_BUG_ON(page_mapcount(page) < 0); |
1387 | add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR); | 1387 | add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR); |
1388 | VM_BUG_ON(!PageHead(page)); | 1388 | VM_BUG_ON(!PageHead(page)); |
1389 | tlb->mm->nr_ptes--; | 1389 | tlb->mm->nr_ptes--; |
1390 | spin_unlock(&tlb->mm->page_table_lock); | 1390 | spin_unlock(&tlb->mm->page_table_lock); |
1391 | tlb_remove_page(tlb, page); | 1391 | tlb_remove_page(tlb, page); |
1392 | } | 1392 | } |
1393 | pte_free(tlb->mm, pgtable); | 1393 | pte_free(tlb->mm, pgtable); |
1394 | ret = 1; | 1394 | ret = 1; |
1395 | } | 1395 | } |
1396 | return ret; | 1396 | return ret; |
1397 | } | 1397 | } |
1398 | 1398 | ||
1399 | int mincore_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, | 1399 | int mincore_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, |
1400 | unsigned long addr, unsigned long end, | 1400 | unsigned long addr, unsigned long end, |
1401 | unsigned char *vec) | 1401 | unsigned char *vec) |
1402 | { | 1402 | { |
1403 | int ret = 0; | 1403 | int ret = 0; |
1404 | 1404 | ||
1405 | if (__pmd_trans_huge_lock(pmd, vma) == 1) { | 1405 | if (__pmd_trans_huge_lock(pmd, vma) == 1) { |
1406 | /* | 1406 | /* |
1407 | * All logical pages in the range are present | 1407 | * All logical pages in the range are present |
1408 | * if backed by a huge page. | 1408 | * if backed by a huge page. |
1409 | */ | 1409 | */ |
1410 | spin_unlock(&vma->vm_mm->page_table_lock); | 1410 | spin_unlock(&vma->vm_mm->page_table_lock); |
1411 | memset(vec, 1, (end - addr) >> PAGE_SHIFT); | 1411 | memset(vec, 1, (end - addr) >> PAGE_SHIFT); |
1412 | ret = 1; | 1412 | ret = 1; |
1413 | } | 1413 | } |
1414 | 1414 | ||
1415 | return ret; | 1415 | return ret; |
1416 | } | 1416 | } |
1417 | 1417 | ||
1418 | int move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma, | 1418 | int move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma, |
1419 | unsigned long old_addr, | 1419 | unsigned long old_addr, |
1420 | unsigned long new_addr, unsigned long old_end, | 1420 | unsigned long new_addr, unsigned long old_end, |
1421 | pmd_t *old_pmd, pmd_t *new_pmd) | 1421 | pmd_t *old_pmd, pmd_t *new_pmd) |
1422 | { | 1422 | { |
1423 | int ret = 0; | 1423 | int ret = 0; |
1424 | pmd_t pmd; | 1424 | pmd_t pmd; |
1425 | 1425 | ||
1426 | struct mm_struct *mm = vma->vm_mm; | 1426 | struct mm_struct *mm = vma->vm_mm; |
1427 | 1427 | ||
1428 | if ((old_addr & ~HPAGE_PMD_MASK) || | 1428 | if ((old_addr & ~HPAGE_PMD_MASK) || |
1429 | (new_addr & ~HPAGE_PMD_MASK) || | 1429 | (new_addr & ~HPAGE_PMD_MASK) || |
1430 | old_end - old_addr < HPAGE_PMD_SIZE || | 1430 | old_end - old_addr < HPAGE_PMD_SIZE || |
1431 | (new_vma->vm_flags & VM_NOHUGEPAGE)) | 1431 | (new_vma->vm_flags & VM_NOHUGEPAGE)) |
1432 | goto out; | 1432 | goto out; |
1433 | 1433 | ||
1434 | /* | 1434 | /* |
1435 | * The destination pmd shouldn't be established, free_pgtables() | 1435 | * The destination pmd shouldn't be established, free_pgtables() |
1436 | * should have release it. | 1436 | * should have release it. |
1437 | */ | 1437 | */ |
1438 | if (WARN_ON(!pmd_none(*new_pmd))) { | 1438 | if (WARN_ON(!pmd_none(*new_pmd))) { |
1439 | VM_BUG_ON(pmd_trans_huge(*new_pmd)); | 1439 | VM_BUG_ON(pmd_trans_huge(*new_pmd)); |
1440 | goto out; | 1440 | goto out; |
1441 | } | 1441 | } |
1442 | 1442 | ||
1443 | ret = __pmd_trans_huge_lock(old_pmd, vma); | 1443 | ret = __pmd_trans_huge_lock(old_pmd, vma); |
1444 | if (ret == 1) { | 1444 | if (ret == 1) { |
1445 | pmd = pmdp_get_and_clear(mm, old_addr, old_pmd); | 1445 | pmd = pmdp_get_and_clear(mm, old_addr, old_pmd); |
1446 | VM_BUG_ON(!pmd_none(*new_pmd)); | 1446 | VM_BUG_ON(!pmd_none(*new_pmd)); |
1447 | set_pmd_at(mm, new_addr, new_pmd, pmd); | 1447 | set_pmd_at(mm, new_addr, new_pmd, pmd); |
1448 | spin_unlock(&mm->page_table_lock); | 1448 | spin_unlock(&mm->page_table_lock); |
1449 | } | 1449 | } |
1450 | out: | 1450 | out: |
1451 | return ret; | 1451 | return ret; |
1452 | } | 1452 | } |
1453 | 1453 | ||
1454 | int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, | 1454 | int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, |
1455 | unsigned long addr, pgprot_t newprot, int prot_numa) | 1455 | unsigned long addr, pgprot_t newprot, int prot_numa) |
1456 | { | 1456 | { |
1457 | struct mm_struct *mm = vma->vm_mm; | 1457 | struct mm_struct *mm = vma->vm_mm; |
1458 | int ret = 0; | 1458 | int ret = 0; |
1459 | 1459 | ||
1460 | if (__pmd_trans_huge_lock(pmd, vma) == 1) { | 1460 | if (__pmd_trans_huge_lock(pmd, vma) == 1) { |
1461 | pmd_t entry; | 1461 | pmd_t entry; |
1462 | entry = pmdp_get_and_clear(mm, addr, pmd); | 1462 | entry = pmdp_get_and_clear(mm, addr, pmd); |
1463 | if (!prot_numa) { | 1463 | if (!prot_numa) { |
1464 | entry = pmd_modify(entry, newprot); | 1464 | entry = pmd_modify(entry, newprot); |
1465 | BUG_ON(pmd_write(entry)); | 1465 | BUG_ON(pmd_write(entry)); |
1466 | } else { | 1466 | } else { |
1467 | struct page *page = pmd_page(*pmd); | 1467 | struct page *page = pmd_page(*pmd); |
1468 | 1468 | ||
1469 | /* only check non-shared pages */ | 1469 | /* only check non-shared pages */ |
1470 | if (page_mapcount(page) == 1 && | 1470 | if (page_mapcount(page) == 1 && |
1471 | !pmd_numa(*pmd)) { | 1471 | !pmd_numa(*pmd)) { |
1472 | entry = pmd_mknuma(entry); | 1472 | entry = pmd_mknuma(entry); |
1473 | } | 1473 | } |
1474 | } | 1474 | } |
1475 | set_pmd_at(mm, addr, pmd, entry); | 1475 | set_pmd_at(mm, addr, pmd, entry); |
1476 | spin_unlock(&vma->vm_mm->page_table_lock); | 1476 | spin_unlock(&vma->vm_mm->page_table_lock); |
1477 | ret = 1; | 1477 | ret = 1; |
1478 | } | 1478 | } |
1479 | 1479 | ||
1480 | return ret; | 1480 | return ret; |
1481 | } | 1481 | } |
1482 | 1482 | ||
1483 | /* | 1483 | /* |
1484 | * Returns 1 if a given pmd maps a stable (not under splitting) thp. | 1484 | * Returns 1 if a given pmd maps a stable (not under splitting) thp. |
1485 | * Returns -1 if it maps a thp under splitting. Returns 0 otherwise. | 1485 | * Returns -1 if it maps a thp under splitting. Returns 0 otherwise. |
1486 | * | 1486 | * |
1487 | * Note that if it returns 1, this routine returns without unlocking page | 1487 | * Note that if it returns 1, this routine returns without unlocking page |
1488 | * table locks. So callers must unlock them. | 1488 | * table locks. So callers must unlock them. |
1489 | */ | 1489 | */ |
1490 | int __pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma) | 1490 | int __pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma) |
1491 | { | 1491 | { |
1492 | spin_lock(&vma->vm_mm->page_table_lock); | 1492 | spin_lock(&vma->vm_mm->page_table_lock); |
1493 | if (likely(pmd_trans_huge(*pmd))) { | 1493 | if (likely(pmd_trans_huge(*pmd))) { |
1494 | if (unlikely(pmd_trans_splitting(*pmd))) { | 1494 | if (unlikely(pmd_trans_splitting(*pmd))) { |
1495 | spin_unlock(&vma->vm_mm->page_table_lock); | 1495 | spin_unlock(&vma->vm_mm->page_table_lock); |
1496 | wait_split_huge_page(vma->anon_vma, pmd); | 1496 | wait_split_huge_page(vma->anon_vma, pmd); |
1497 | return -1; | 1497 | return -1; |
1498 | } else { | 1498 | } else { |
1499 | /* Thp mapped by 'pmd' is stable, so we can | 1499 | /* Thp mapped by 'pmd' is stable, so we can |
1500 | * handle it as it is. */ | 1500 | * handle it as it is. */ |
1501 | return 1; | 1501 | return 1; |
1502 | } | 1502 | } |
1503 | } | 1503 | } |
1504 | spin_unlock(&vma->vm_mm->page_table_lock); | 1504 | spin_unlock(&vma->vm_mm->page_table_lock); |
1505 | return 0; | 1505 | return 0; |
1506 | } | 1506 | } |
1507 | 1507 | ||
1508 | pmd_t *page_check_address_pmd(struct page *page, | 1508 | pmd_t *page_check_address_pmd(struct page *page, |
1509 | struct mm_struct *mm, | 1509 | struct mm_struct *mm, |
1510 | unsigned long address, | 1510 | unsigned long address, |
1511 | enum page_check_address_pmd_flag flag) | 1511 | enum page_check_address_pmd_flag flag) |
1512 | { | 1512 | { |
1513 | pmd_t *pmd, *ret = NULL; | 1513 | pmd_t *pmd, *ret = NULL; |
1514 | 1514 | ||
1515 | if (address & ~HPAGE_PMD_MASK) | 1515 | if (address & ~HPAGE_PMD_MASK) |
1516 | goto out; | 1516 | goto out; |
1517 | 1517 | ||
1518 | pmd = mm_find_pmd(mm, address); | 1518 | pmd = mm_find_pmd(mm, address); |
1519 | if (!pmd) | 1519 | if (!pmd) |
1520 | goto out; | 1520 | goto out; |
1521 | if (pmd_none(*pmd)) | 1521 | if (pmd_none(*pmd)) |
1522 | goto out; | 1522 | goto out; |
1523 | if (pmd_page(*pmd) != page) | 1523 | if (pmd_page(*pmd) != page) |
1524 | goto out; | 1524 | goto out; |
1525 | /* | 1525 | /* |
1526 | * split_vma() may create temporary aliased mappings. There is | 1526 | * split_vma() may create temporary aliased mappings. There is |
1527 | * no risk as long as all huge pmd are found and have their | 1527 | * no risk as long as all huge pmd are found and have their |
1528 | * splitting bit set before __split_huge_page_refcount | 1528 | * splitting bit set before __split_huge_page_refcount |
1529 | * runs. Finding the same huge pmd more than once during the | 1529 | * runs. Finding the same huge pmd more than once during the |
1530 | * same rmap walk is not a problem. | 1530 | * same rmap walk is not a problem. |
1531 | */ | 1531 | */ |
1532 | if (flag == PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG && | 1532 | if (flag == PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG && |
1533 | pmd_trans_splitting(*pmd)) | 1533 | pmd_trans_splitting(*pmd)) |
1534 | goto out; | 1534 | goto out; |
1535 | if (pmd_trans_huge(*pmd)) { | 1535 | if (pmd_trans_huge(*pmd)) { |
1536 | VM_BUG_ON(flag == PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG && | 1536 | VM_BUG_ON(flag == PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG && |
1537 | !pmd_trans_splitting(*pmd)); | 1537 | !pmd_trans_splitting(*pmd)); |
1538 | ret = pmd; | 1538 | ret = pmd; |
1539 | } | 1539 | } |
1540 | out: | 1540 | out: |
1541 | return ret; | 1541 | return ret; |
1542 | } | 1542 | } |
1543 | 1543 | ||
1544 | static int __split_huge_page_splitting(struct page *page, | 1544 | static int __split_huge_page_splitting(struct page *page, |
1545 | struct vm_area_struct *vma, | 1545 | struct vm_area_struct *vma, |
1546 | unsigned long address) | 1546 | unsigned long address) |
1547 | { | 1547 | { |
1548 | struct mm_struct *mm = vma->vm_mm; | 1548 | struct mm_struct *mm = vma->vm_mm; |
1549 | pmd_t *pmd; | 1549 | pmd_t *pmd; |
1550 | int ret = 0; | 1550 | int ret = 0; |
1551 | /* For mmu_notifiers */ | 1551 | /* For mmu_notifiers */ |
1552 | const unsigned long mmun_start = address; | 1552 | const unsigned long mmun_start = address; |
1553 | const unsigned long mmun_end = address + HPAGE_PMD_SIZE; | 1553 | const unsigned long mmun_end = address + HPAGE_PMD_SIZE; |
1554 | 1554 | ||
1555 | mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); | 1555 | mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); |
1556 | spin_lock(&mm->page_table_lock); | 1556 | spin_lock(&mm->page_table_lock); |
1557 | pmd = page_check_address_pmd(page, mm, address, | 1557 | pmd = page_check_address_pmd(page, mm, address, |
1558 | PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG); | 1558 | PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG); |
1559 | if (pmd) { | 1559 | if (pmd) { |
1560 | /* | 1560 | /* |
1561 | * We can't temporarily set the pmd to null in order | 1561 | * We can't temporarily set the pmd to null in order |
1562 | * to split it, the pmd must remain marked huge at all | 1562 | * to split it, the pmd must remain marked huge at all |
1563 | * times or the VM won't take the pmd_trans_huge paths | 1563 | * times or the VM won't take the pmd_trans_huge paths |
1564 | * and it won't wait on the anon_vma->root->rwsem to | 1564 | * and it won't wait on the anon_vma->root->rwsem to |
1565 | * serialize against split_huge_page*. | 1565 | * serialize against split_huge_page*. |
1566 | */ | 1566 | */ |
1567 | pmdp_splitting_flush(vma, address, pmd); | 1567 | pmdp_splitting_flush(vma, address, pmd); |
1568 | ret = 1; | 1568 | ret = 1; |
1569 | } | 1569 | } |
1570 | spin_unlock(&mm->page_table_lock); | 1570 | spin_unlock(&mm->page_table_lock); |
1571 | mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); | 1571 | mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); |
1572 | 1572 | ||
1573 | return ret; | 1573 | return ret; |
1574 | } | 1574 | } |
1575 | 1575 | ||
/*
 * Convert the compound huge page headed by @page into HPAGE_PMD_NR
 * independent pages.
 *
 * With zone->lru_lock held (interrupts disabled) and the compound lock
 * taken, every tail page, walked from the last one down to the first,
 * gets its reference count raised, its flags, _mapcount, mapping, index
 * and last nid set up from the head page, and is added to the LRU through
 * lru_add_page_tail(). The tail mapcounts accumulated on the way are then
 * subtracted from the head's _count, the zone statistics are updated and
 * the compound state of the head is cleared. Once the locks are released,
 * one reference is dropped on each tail page.
 */
static void __split_huge_page_refcount(struct page *page)
{
	int i;
	struct zone *zone = page_zone(page);
	struct lruvec *lruvec;
	/* sum of the tail pages' mapcounts, removed from the head below */
	int tail_count = 0;

	/* prevent PageLRU to go away from under us, and freeze lru stats */
	spin_lock_irq(&zone->lru_lock);
	lruvec = mem_cgroup_page_lruvec(page, zone);

	compound_lock(page);
	/* complete memcg works before add pages to LRU */
	mem_cgroup_split_huge_fixup(page);

	for (i = HPAGE_PMD_NR - 1; i >= 1; i--) {
		struct page *page_tail = page + i;

		/* tail_page->_mapcount cannot change */
		BUG_ON(page_mapcount(page_tail) < 0);
		tail_count += page_mapcount(page_tail);
		/* check for overflow */
		BUG_ON(tail_count < 0);
		BUG_ON(atomic_read(&page_tail->_count) != 0);
		/*
		 * tail_page->_count is zero and not changing from
		 * under us. But get_page_unless_zero() may be running
		 * from under us on the tail_page. If we used
		 * atomic_set() below instead of atomic_add(), we
		 * would then run atomic_set() concurrently with
		 * get_page_unless_zero(), and atomic_set() is
		 * implemented in C not using locked ops. spin_unlock
		 * on x86 sometime uses locked ops because of PPro
		 * errata 66, 92, so unless somebody can guarantee
		 * atomic_set() here would be safe on all archs (and
		 * not only on x86), it's safer to use atomic_add().
		 */
		atomic_add(page_mapcount(page) + page_mapcount(page_tail) + 1,
			   &page_tail->_count);

		/* after clearing PageTail the gup refcount can be released */
		smp_mb();

		/*
		 * retain hwpoison flag of the poisoned tail page:
		 * fix for the unsuitable process killed on Guest Machine(KVM)
		 * by the memory-failure.
		 *
		 * Note on precedence: '~' binds tighter than '|', so the mask
		 * is (~PAGE_FLAGS_CHECK_AT_PREP) | __PG_HWPOISON, i.e. the
		 * PAGE_FLAGS_CHECK_AT_PREP bits are cleared except for
		 * __PG_HWPOISON.
		 */
		page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP | __PG_HWPOISON;
		/* inherit these four flags from the head page */
		page_tail->flags |= (page->flags &
				     ((1L << PG_referenced) |
				      (1L << PG_swapbacked) |
				      (1L << PG_mlocked) |
				      (1L << PG_uptodate)));
		/* tail pages are unconditionally marked dirty */
		page_tail->flags |= (1L << PG_dirty);

		/* clear PageTail before overwriting first_page */
		smp_wmb();

		/*
		 * __split_huge_page_splitting() already set the
		 * splitting bit in all pmd that could map this
		 * hugepage, that will ensure no CPU can alter the
		 * mapcount on the head page. The mapcount is only
		 * accounted in the head page and it has to be
		 * transferred to all tail pages in the below code. So
		 * for this code to be safe, during the split the
		 * mapcount can't change. But that doesn't mean userland
		 * can't keep changing and reading the page contents
		 * while we transfer the mapcount, so the pmd splitting
		 * status is achieved setting a reserved bit in the
		 * pmd, not by clearing the present bit.
		 */
		page_tail->_mapcount = page->_mapcount;

		BUG_ON(page_tail->mapping);
		page_tail->mapping = page->mapping;

		page_tail->index = page->index + i;
		page_xchg_last_nid(page_tail, page_last_nid(page));

		/* sanity checks on the state the tail page has just been given */
		BUG_ON(!PageAnon(page_tail));
		BUG_ON(!PageUptodate(page_tail));
		BUG_ON(!PageDirty(page_tail));
		BUG_ON(!PageSwapBacked(page_tail));

		lru_add_page_tail(page, page_tail, lruvec);
	}
	/* remove the accumulated tail mapcounts from the head's refcount */
	atomic_sub(tail_count, &page->_count);
	BUG_ON(atomic_read(&page->_count) <= 0);

	/* one huge page less, HPAGE_PMD_NR more anonymous pages accounted */
	__mod_zone_page_state(zone, NR_ANON_TRANSPARENT_HUGEPAGES, -1);
	__mod_zone_page_state(zone, NR_ANON_PAGES, HPAGE_PMD_NR);

	ClearPageCompound(page);
	compound_unlock(page);
	spin_unlock_irq(&zone->lru_lock);

	for (i = 1; i < HPAGE_PMD_NR; i++) {
		struct page *page_tail = page + i;
		BUG_ON(page_count(page_tail) <= 0);
		/*
		 * Tail pages may be freed if there wasn't any mapping
		 * like if add_to_swap() is running on a lru page that
		 * had its mapping zapped. And freeing these pages
		 * requires taking the lru_lock so we do the put_page
		 * of the tail pages after the split is complete.
		 */
		put_page(page_tail);
	}

	/*
	 * Only the head page (now become a regular page) is required
	 * to be pinned by the caller.
	 */
	BUG_ON(page_count(page) <= 0);
}
1693 | 1693 | ||
/*
 * Replace the huge pmd mapping @page at @address in @vma's mm with a page
 * table holding HPAGE_PMD_NR ptes, one per subpage.
 *
 * The lookup uses PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG, so the huge pmd
 * is expected to be marked splitting already. The ptes are filled in
 * through a local pmd (_pmd) pointing at the withdrawn page table, and the
 * real pmd is only switched over to that table at the very end.
 *
 * Returns 1 when a huge pmd for @page was found and converted, 0 when
 * there is none at @address. Runs under mm->page_table_lock.
 */
static int __split_huge_page_map(struct page *page,
				 struct vm_area_struct *vma,
				 unsigned long address)
{
	struct mm_struct *mm = vma->vm_mm;
	pmd_t *pmd, _pmd;
	int ret = 0, i;
	pgtable_t pgtable;
	unsigned long haddr;

	spin_lock(&mm->page_table_lock);
	pmd = page_check_address_pmd(page, mm, address,
				     PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG);
	if (pmd) {
		pgtable = pgtable_trans_huge_withdraw(mm);
		/* build the new page table behind a private, on-stack pmd */
		pmd_populate(mm, &_pmd, pgtable);

		haddr = address;
		for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
			pte_t *pte, entry;
			BUG_ON(PageCompound(page+i));
			entry = mk_pte(page + i, vma->vm_page_prot);
			entry = maybe_mkwrite(pte_mkdirty(entry), vma);
			/* carry the huge pmd's write/young/numa state over to each pte */
			if (!pmd_write(*pmd))
				entry = pte_wrprotect(entry);
			else
				BUG_ON(page_mapcount(page) != 1);
			if (!pmd_young(*pmd))
				entry = pte_mkold(entry);
			if (pmd_numa(*pmd))
				entry = pte_mknuma(entry);
			pte = pte_offset_map(&_pmd, haddr);
			BUG_ON(!pte_none(*pte));
			set_pte_at(mm, haddr, pte, entry);
			pte_unmap(pte);
		}

		smp_wmb(); /* make pte visible before pmd */
		/*
		 * Up to this point the pmd is present and huge and
		 * userland has the whole access to the hugepage
		 * during the split (which happens in place). If we
		 * overwrite the pmd with the not-huge version
		 * pointing to the pte here (which of course we could
		 * if all CPUs were bug free), userland could trigger
		 * a small page size TLB miss on the small sized TLB
		 * while the hugepage TLB entry is still established
		 * in the huge TLB. Some CPU doesn't like that. See
		 * http://support.amd.com/us/Processor_TechDocs/41322.pdf,
		 * Erratum 383 on page 93. Intel should be safe but is
		 * also warns that it's only safe if the permission
		 * and cache attributes of the two entries loaded in
		 * the two TLB is identical (which should be the case
		 * here). But it is generally safer to never allow
		 * small and huge TLB entries for the same virtual
		 * address to be loaded simultaneously. So instead of
		 * doing "pmd_populate(); flush_tlb_range();" we first
		 * mark the current pmd notpresent (atomically because
		 * here the pmd_trans_huge and pmd_trans_splitting
		 * must remain set at all times on the pmd until the
		 * split is complete for this pmd), then we flush the
		 * SMP TLB and finally we write the non-huge version
		 * of the pmd entry with pmd_populate.
		 */
		pmdp_invalidate(vma, address, pmd);
		pmd_populate(mm, pmd, pgtable);
		ret = 1;
	}
	spin_unlock(&mm->page_table_lock);

	return ret;
}
1766 | 1766 | ||
/*
 * Split the huge page @page in every vma reachable through @anon_vma.
 *
 * Three phases, in this order:
 *   1. mark every huge pmd mapping the page as splitting and count them;
 *   2. split the compound page itself (__split_huge_page_refcount());
 *   3. replace every splitting huge pmd by a regular page table and
 *      count them again.
 * The number of pmds found in phase 1 must equal page_mapcount(page), and
 * phase 3 must find the same number of pmds; a mismatch is logged and
 * then triggers BUG_ON().
 *
 * must be called with anon_vma->root->rwsem held
 */
static void __split_huge_page(struct page *page,
			      struct anon_vma *anon_vma)
{
	int mapcount, mapcount2;
	pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
	struct anon_vma_chain *avc;

	BUG_ON(!PageHead(page));
	BUG_ON(PageTail(page));

	/* phase 1: set the splitting bit on every huge pmd mapping the page */
	mapcount = 0;
	anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) {
		struct vm_area_struct *vma = avc->vma;
		unsigned long addr = vma_address(page, vma);
		BUG_ON(is_vma_temporary_stack(vma));
		mapcount += __split_huge_page_splitting(page, vma, addr);
	}
	/*
	 * It is critical that new vmas are added to the tail of the
	 * anon_vma list. This guarantees that if copy_huge_pmd() runs
	 * and establishes a child pmd before
	 * __split_huge_page_splitting() freezes the parent pmd (so if
	 * we fail to prevent copy_huge_pmd() from running until the
	 * whole __split_huge_page() is complete), we will still see
	 * the newly established pmd of the child later during the
	 * walk, to be able to set it as pmd_trans_splitting too.
	 */
	if (mapcount != page_mapcount(page))
		printk(KERN_ERR "mapcount %d page_mapcount %d\n",
		       mapcount, page_mapcount(page));
	BUG_ON(mapcount != page_mapcount(page));

	/* phase 2: split the compound page into individual pages */
	__split_huge_page_refcount(page);

	/* phase 3: convert every splitting huge pmd into a pte page table */
	mapcount2 = 0;
	anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) {
		struct vm_area_struct *vma = avc->vma;
		unsigned long addr = vma_address(page, vma);
		BUG_ON(is_vma_temporary_stack(vma));
		mapcount2 += __split_huge_page_map(page, vma, addr);
	}
	if (mapcount != mapcount2)
		printk(KERN_ERR "mapcount %d mapcount2 %d page_mapcount %d\n",
		       mapcount, mapcount2, page_mapcount(page));
	BUG_ON(mapcount != mapcount2);
}
1814 | 1814 | ||
1815 | int split_huge_page(struct page *page) | 1815 | int split_huge_page(struct page *page) |
1816 | { | 1816 | { |
1817 | struct anon_vma *anon_vma; | 1817 | struct anon_vma *anon_vma; |
1818 | int ret = 1; | 1818 | int ret = 1; |
1819 | 1819 | ||
1820 | BUG_ON(is_huge_zero_pfn(page_to_pfn(page))); | 1820 | BUG_ON(is_huge_zero_pfn(page_to_pfn(page))); |
1821 | BUG_ON(!PageAnon(page)); | 1821 | BUG_ON(!PageAnon(page)); |
1822 | anon_vma = page_lock_anon_vma_read(page); | 1822 | |
1823 | /* | ||
1824 | * The caller does not necessarily hold an mmap_sem that would prevent | ||
1825 | * the anon_vma disappearing so we first take a reference to it | ||
1826 | * and then lock the anon_vma for write. This is similar to | ||
1827 | * page_lock_anon_vma_read except the write lock is taken to serialise | ||
1828 | * against parallel split or collapse operations. | ||
1829 | */ | ||
1830 | anon_vma = page_get_anon_vma(page); | ||
1823 | if (!anon_vma) | 1831 | if (!anon_vma) |
1824 | goto out; | 1832 | goto out; |
1833 | anon_vma_lock_write(anon_vma); | ||
1834 | |||
1825 | ret = 0; | 1835 | ret = 0; |
1826 | if (!PageCompound(page)) | 1836 | if (!PageCompound(page)) |
1827 | goto out_unlock; | 1837 | goto out_unlock; |
1828 | 1838 | ||
1829 | BUG_ON(!PageSwapBacked(page)); | 1839 | BUG_ON(!PageSwapBacked(page)); |
1830 | __split_huge_page(page, anon_vma); | 1840 | __split_huge_page(page, anon_vma); |
1831 | count_vm_event(THP_SPLIT); | 1841 | count_vm_event(THP_SPLIT); |
1832 | 1842 | ||
1833 | BUG_ON(PageCompound(page)); | 1843 | BUG_ON(PageCompound(page)); |
1834 | out_unlock: | 1844 | out_unlock: |
1835 | page_unlock_anon_vma_read(anon_vma); | 1845 | anon_vma_unlock(anon_vma); |
1846 | put_anon_vma(anon_vma); | ||
1836 | out: | 1847 | out: |
1837 | return ret; | 1848 | return ret; |
1838 | } | 1849 | } |
1839 | 1850 | ||
1840 | #define VM_NO_THP (VM_SPECIAL|VM_MIXEDMAP|VM_HUGETLB|VM_SHARED|VM_MAYSHARE) | 1851 | #define VM_NO_THP (VM_SPECIAL|VM_MIXEDMAP|VM_HUGETLB|VM_SHARED|VM_MAYSHARE) |
1841 | 1852 | ||
1842 | int hugepage_madvise(struct vm_area_struct *vma, | 1853 | int hugepage_madvise(struct vm_area_struct *vma, |
1843 | unsigned long *vm_flags, int advice) | 1854 | unsigned long *vm_flags, int advice) |
1844 | { | 1855 | { |
1845 | struct mm_struct *mm = vma->vm_mm; | 1856 | struct mm_struct *mm = vma->vm_mm; |
1846 | 1857 | ||
1847 | switch (advice) { | 1858 | switch (advice) { |
1848 | case MADV_HUGEPAGE: | 1859 | case MADV_HUGEPAGE: |
1849 | /* | 1860 | /* |
1850 | * Be somewhat over-protective like KSM for now! | 1861 | * Be somewhat over-protective like KSM for now! |
1851 | */ | 1862 | */ |
1852 | if (*vm_flags & (VM_HUGEPAGE | VM_NO_THP)) | 1863 | if (*vm_flags & (VM_HUGEPAGE | VM_NO_THP)) |
1853 | return -EINVAL; | 1864 | return -EINVAL; |
1854 | if (mm->def_flags & VM_NOHUGEPAGE) | 1865 | if (mm->def_flags & VM_NOHUGEPAGE) |
1855 | return -EINVAL; | 1866 | return -EINVAL; |
1856 | *vm_flags &= ~VM_NOHUGEPAGE; | 1867 | *vm_flags &= ~VM_NOHUGEPAGE; |
1857 | *vm_flags |= VM_HUGEPAGE; | 1868 | *vm_flags |= VM_HUGEPAGE; |
1858 | /* | 1869 | /* |
1859 | * If the vma becomes good for khugepaged to scan, | 1870 | * If the vma becomes good for khugepaged to scan, |
1860 | * register it here without waiting for a page fault that | 1871 | * register it here without waiting for a page fault that |
1861 | * may not happen any time soon. | 1872 | * may not happen any time soon. |
1862 | */ | 1873 | */ |
1863 | if (unlikely(khugepaged_enter_vma_merge(vma))) | 1874 | if (unlikely(khugepaged_enter_vma_merge(vma))) |
1864 | return -ENOMEM; | 1875 | return -ENOMEM; |
1865 | break; | 1876 | break; |
1866 | case MADV_NOHUGEPAGE: | 1877 | case MADV_NOHUGEPAGE: |
1867 | /* | 1878 | /* |
1868 | * Be somewhat over-protective like KSM for now! | 1879 | * Be somewhat over-protective like KSM for now! |
1869 | */ | 1880 | */ |
1870 | if (*vm_flags & (VM_NOHUGEPAGE | VM_NO_THP)) | 1881 | if (*vm_flags & (VM_NOHUGEPAGE | VM_NO_THP)) |
1871 | return -EINVAL; | 1882 | return -EINVAL; |
1872 | *vm_flags &= ~VM_HUGEPAGE; | 1883 | *vm_flags &= ~VM_HUGEPAGE; |
1873 | *vm_flags |= VM_NOHUGEPAGE; | 1884 | *vm_flags |= VM_NOHUGEPAGE; |
1874 | /* | 1885 | /* |
1875 | * Setting VM_NOHUGEPAGE will prevent khugepaged from scanning | 1886 | * Setting VM_NOHUGEPAGE will prevent khugepaged from scanning |
1876 | * this vma even if we leave the mm registered in khugepaged if | 1887 | * this vma even if we leave the mm registered in khugepaged if |
1877 | * it got registered before VM_NOHUGEPAGE was set. | 1888 | * it got registered before VM_NOHUGEPAGE was set. |
1878 | */ | 1889 | */ |
1879 | break; | 1890 | break; |
1880 | } | 1891 | } |
1881 | 1892 | ||
1882 | return 0; | 1893 | return 0; |
1883 | } | 1894 | } |
1884 | 1895 | ||
1885 | static int __init khugepaged_slab_init(void) | 1896 | static int __init khugepaged_slab_init(void) |
1886 | { | 1897 | { |
1887 | mm_slot_cache = kmem_cache_create("khugepaged_mm_slot", | 1898 | mm_slot_cache = kmem_cache_create("khugepaged_mm_slot", |
1888 | sizeof(struct mm_slot), | 1899 | sizeof(struct mm_slot), |
1889 | __alignof__(struct mm_slot), 0, NULL); | 1900 | __alignof__(struct mm_slot), 0, NULL); |
1890 | if (!mm_slot_cache) | 1901 | if (!mm_slot_cache) |
1891 | return -ENOMEM; | 1902 | return -ENOMEM; |
1892 | 1903 | ||
1893 | return 0; | 1904 | return 0; |
1894 | } | 1905 | } |
1895 | 1906 | ||
1896 | static void __init khugepaged_slab_free(void) | 1907 | static void __init khugepaged_slab_free(void) |
1897 | { | 1908 | { |
1898 | kmem_cache_destroy(mm_slot_cache); | 1909 | kmem_cache_destroy(mm_slot_cache); |
1899 | mm_slot_cache = NULL; | 1910 | mm_slot_cache = NULL; |
1900 | } | 1911 | } |
1901 | 1912 | ||
1902 | static inline struct mm_slot *alloc_mm_slot(void) | 1913 | static inline struct mm_slot *alloc_mm_slot(void) |
1903 | { | 1914 | { |
1904 | if (!mm_slot_cache) /* initialization failed */ | 1915 | if (!mm_slot_cache) /* initialization failed */ |
1905 | return NULL; | 1916 | return NULL; |
1906 | return kmem_cache_zalloc(mm_slot_cache, GFP_KERNEL); | 1917 | return kmem_cache_zalloc(mm_slot_cache, GFP_KERNEL); |
1907 | } | 1918 | } |
1908 | 1919 | ||
1909 | static inline void free_mm_slot(struct mm_slot *mm_slot) | 1920 | static inline void free_mm_slot(struct mm_slot *mm_slot) |
1910 | { | 1921 | { |
1911 | kmem_cache_free(mm_slot_cache, mm_slot); | 1922 | kmem_cache_free(mm_slot_cache, mm_slot); |
1912 | } | 1923 | } |
1913 | 1924 | ||
1914 | static int __init mm_slots_hash_init(void) | 1925 | static int __init mm_slots_hash_init(void) |
1915 | { | 1926 | { |
1916 | mm_slots_hash = kzalloc(MM_SLOTS_HASH_HEADS * sizeof(struct hlist_head), | 1927 | mm_slots_hash = kzalloc(MM_SLOTS_HASH_HEADS * sizeof(struct hlist_head), |
1917 | GFP_KERNEL); | 1928 | GFP_KERNEL); |
1918 | if (!mm_slots_hash) | 1929 | if (!mm_slots_hash) |
1919 | return -ENOMEM; | 1930 | return -ENOMEM; |
1920 | return 0; | 1931 | return 0; |
1921 | } | 1932 | } |
1922 | 1933 | ||
1923 | #if 0 | 1934 | #if 0 |
1924 | static void __init mm_slots_hash_free(void) | 1935 | static void __init mm_slots_hash_free(void) |
1925 | { | 1936 | { |
1926 | kfree(mm_slots_hash); | 1937 | kfree(mm_slots_hash); |
1927 | mm_slots_hash = NULL; | 1938 | mm_slots_hash = NULL; |
1928 | } | 1939 | } |
1929 | #endif | 1940 | #endif |
1930 | 1941 | ||
1931 | static struct mm_slot *get_mm_slot(struct mm_struct *mm) | 1942 | static struct mm_slot *get_mm_slot(struct mm_struct *mm) |
1932 | { | 1943 | { |
1933 | struct mm_slot *mm_slot; | 1944 | struct mm_slot *mm_slot; |
1934 | struct hlist_head *bucket; | 1945 | struct hlist_head *bucket; |
1935 | struct hlist_node *node; | 1946 | struct hlist_node *node; |
1936 | 1947 | ||
1937 | bucket = &mm_slots_hash[((unsigned long)mm / sizeof(struct mm_struct)) | 1948 | bucket = &mm_slots_hash[((unsigned long)mm / sizeof(struct mm_struct)) |
1938 | % MM_SLOTS_HASH_HEADS]; | 1949 | % MM_SLOTS_HASH_HEADS]; |
1939 | hlist_for_each_entry(mm_slot, node, bucket, hash) { | 1950 | hlist_for_each_entry(mm_slot, node, bucket, hash) { |
1940 | if (mm == mm_slot->mm) | 1951 | if (mm == mm_slot->mm) |
1941 | return mm_slot; | 1952 | return mm_slot; |
1942 | } | 1953 | } |
1943 | return NULL; | 1954 | return NULL; |
1944 | } | 1955 | } |
1945 | 1956 | ||
1946 | static void insert_to_mm_slots_hash(struct mm_struct *mm, | 1957 | static void insert_to_mm_slots_hash(struct mm_struct *mm, |
1947 | struct mm_slot *mm_slot) | 1958 | struct mm_slot *mm_slot) |
1948 | { | 1959 | { |
1949 | struct hlist_head *bucket; | 1960 | struct hlist_head *bucket; |
1950 | 1961 | ||
1951 | bucket = &mm_slots_hash[((unsigned long)mm / sizeof(struct mm_struct)) | 1962 | bucket = &mm_slots_hash[((unsigned long)mm / sizeof(struct mm_struct)) |
1952 | % MM_SLOTS_HASH_HEADS]; | 1963 | % MM_SLOTS_HASH_HEADS]; |
1953 | mm_slot->mm = mm; | 1964 | mm_slot->mm = mm; |
1954 | hlist_add_head(&mm_slot->hash, bucket); | 1965 | hlist_add_head(&mm_slot->hash, bucket); |
1955 | } | 1966 | } |
1956 | 1967 | ||
1957 | static inline int khugepaged_test_exit(struct mm_struct *mm) | 1968 | static inline int khugepaged_test_exit(struct mm_struct *mm) |
1958 | { | 1969 | { |
1959 | return atomic_read(&mm->mm_users) == 0; | 1970 | return atomic_read(&mm->mm_users) == 0; |
1960 | } | 1971 | } |
1961 | 1972 | ||
1962 | int __khugepaged_enter(struct mm_struct *mm) | 1973 | int __khugepaged_enter(struct mm_struct *mm) |
1963 | { | 1974 | { |
1964 | struct mm_slot *mm_slot; | 1975 | struct mm_slot *mm_slot; |
1965 | int wakeup; | 1976 | int wakeup; |
1966 | 1977 | ||
1967 | mm_slot = alloc_mm_slot(); | 1978 | mm_slot = alloc_mm_slot(); |
1968 | if (!mm_slot) | 1979 | if (!mm_slot) |
1969 | return -ENOMEM; | 1980 | return -ENOMEM; |
1970 | 1981 | ||
1971 | /* __khugepaged_exit() must not run from under us */ | 1982 | /* __khugepaged_exit() must not run from under us */ |
1972 | VM_BUG_ON(khugepaged_test_exit(mm)); | 1983 | VM_BUG_ON(khugepaged_test_exit(mm)); |
1973 | if (unlikely(test_and_set_bit(MMF_VM_HUGEPAGE, &mm->flags))) { | 1984 | if (unlikely(test_and_set_bit(MMF_VM_HUGEPAGE, &mm->flags))) { |
1974 | free_mm_slot(mm_slot); | 1985 | free_mm_slot(mm_slot); |
1975 | return 0; | 1986 | return 0; |
1976 | } | 1987 | } |
1977 | 1988 | ||
1978 | spin_lock(&khugepaged_mm_lock); | 1989 | spin_lock(&khugepaged_mm_lock); |
1979 | insert_to_mm_slots_hash(mm, mm_slot); | 1990 | insert_to_mm_slots_hash(mm, mm_slot); |
1980 | /* | 1991 | /* |
1981 | * Insert just behind the scanning cursor, to let the area settle | 1992 | * Insert just behind the scanning cursor, to let the area settle |
1982 | * down a little. | 1993 | * down a little. |
1983 | */ | 1994 | */ |
1984 | wakeup = list_empty(&khugepaged_scan.mm_head); | 1995 | wakeup = list_empty(&khugepaged_scan.mm_head); |
1985 | list_add_tail(&mm_slot->mm_node, &khugepaged_scan.mm_head); | 1996 | list_add_tail(&mm_slot->mm_node, &khugepaged_scan.mm_head); |
1986 | spin_unlock(&khugepaged_mm_lock); | 1997 | spin_unlock(&khugepaged_mm_lock); |
1987 | 1998 | ||
1988 | atomic_inc(&mm->mm_count); | 1999 | atomic_inc(&mm->mm_count); |
1989 | if (wakeup) | 2000 | if (wakeup) |
1990 | wake_up_interruptible(&khugepaged_wait); | 2001 | wake_up_interruptible(&khugepaged_wait); |
1991 | 2002 | ||
1992 | return 0; | 2003 | return 0; |
1993 | } | 2004 | } |
1994 | 2005 | ||
1995 | int khugepaged_enter_vma_merge(struct vm_area_struct *vma) | 2006 | int khugepaged_enter_vma_merge(struct vm_area_struct *vma) |
1996 | { | 2007 | { |
1997 | unsigned long hstart, hend; | 2008 | unsigned long hstart, hend; |
1998 | if (!vma->anon_vma) | 2009 | if (!vma->anon_vma) |
1999 | /* | 2010 | /* |
2000 | * Not yet faulted in so we will register later in the | 2011 | * Not yet faulted in so we will register later in the |
2001 | * page fault if needed. | 2012 | * page fault if needed. |
2002 | */ | 2013 | */ |
2003 | return 0; | 2014 | return 0; |
2004 | if (vma->vm_ops) | 2015 | if (vma->vm_ops) |
2005 | /* khugepaged not yet working on file or special mappings */ | 2016 | /* khugepaged not yet working on file or special mappings */ |
2006 | return 0; | 2017 | return 0; |
2007 | VM_BUG_ON(vma->vm_flags & VM_NO_THP); | 2018 | VM_BUG_ON(vma->vm_flags & VM_NO_THP); |
2008 | hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; | 2019 | hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; |
2009 | hend = vma->vm_end & HPAGE_PMD_MASK; | 2020 | hend = vma->vm_end & HPAGE_PMD_MASK; |
2010 | if (hstart < hend) | 2021 | if (hstart < hend) |
2011 | return khugepaged_enter(vma); | 2022 | return khugepaged_enter(vma); |
2012 | return 0; | 2023 | return 0; |
2013 | } | 2024 | } |
2014 | 2025 | ||
2015 | void __khugepaged_exit(struct mm_struct *mm) | 2026 | void __khugepaged_exit(struct mm_struct *mm) |
2016 | { | 2027 | { |
2017 | struct mm_slot *mm_slot; | 2028 | struct mm_slot *mm_slot; |
2018 | int free = 0; | 2029 | int free = 0; |
2019 | 2030 | ||
2020 | spin_lock(&khugepaged_mm_lock); | 2031 | spin_lock(&khugepaged_mm_lock); |
2021 | mm_slot = get_mm_slot(mm); | 2032 | mm_slot = get_mm_slot(mm); |
2022 | if (mm_slot && khugepaged_scan.mm_slot != mm_slot) { | 2033 | if (mm_slot && khugepaged_scan.mm_slot != mm_slot) { |
2023 | hlist_del(&mm_slot->hash); | 2034 | hlist_del(&mm_slot->hash); |
2024 | list_del(&mm_slot->mm_node); | 2035 | list_del(&mm_slot->mm_node); |
2025 | free = 1; | 2036 | free = 1; |
2026 | } | 2037 | } |
2027 | spin_unlock(&khugepaged_mm_lock); | 2038 | spin_unlock(&khugepaged_mm_lock); |
2028 | 2039 | ||
2029 | if (free) { | 2040 | if (free) { |
2030 | clear_bit(MMF_VM_HUGEPAGE, &mm->flags); | 2041 | clear_bit(MMF_VM_HUGEPAGE, &mm->flags); |
2031 | free_mm_slot(mm_slot); | 2042 | free_mm_slot(mm_slot); |
2032 | mmdrop(mm); | 2043 | mmdrop(mm); |
2033 | } else if (mm_slot) { | 2044 | } else if (mm_slot) { |
2034 | /* | 2045 | /* |
2035 | * This is required to serialize against | 2046 | * This is required to serialize against |
2036 | * khugepaged_test_exit() (which is guaranteed to run | 2047 | * khugepaged_test_exit() (which is guaranteed to run |
2037 | * under mmap sem read mode). Stop here (after we | 2048 | * under mmap sem read mode). Stop here (after we |
2038 | * return all pagetables will be destroyed) until | 2049 | * return all pagetables will be destroyed) until |
2039 | * khugepaged has finished working on the pagetables | 2050 | * khugepaged has finished working on the pagetables |
2040 | * under the mmap_sem. | 2051 | * under the mmap_sem. |
2041 | */ | 2052 | */ |
2042 | down_write(&mm->mmap_sem); | 2053 | down_write(&mm->mmap_sem); |
2043 | up_write(&mm->mmap_sem); | 2054 | up_write(&mm->mmap_sem); |
2044 | } | 2055 | } |
2045 | } | 2056 | } |
2046 | 2057 | ||
2047 | static void release_pte_page(struct page *page) | 2058 | static void release_pte_page(struct page *page) |
2048 | { | 2059 | { |
2049 | /* 0 stands for page_is_file_cache(page) == false */ | 2060 | /* 0 stands for page_is_file_cache(page) == false */ |
2050 | dec_zone_page_state(page, NR_ISOLATED_ANON + 0); | 2061 | dec_zone_page_state(page, NR_ISOLATED_ANON + 0); |
2051 | unlock_page(page); | 2062 | unlock_page(page); |
2052 | putback_lru_page(page); | 2063 | putback_lru_page(page); |
2053 | } | 2064 | } |
2054 | 2065 | ||
2055 | static void release_pte_pages(pte_t *pte, pte_t *_pte) | 2066 | static void release_pte_pages(pte_t *pte, pte_t *_pte) |
2056 | { | 2067 | { |
2057 | while (--_pte >= pte) { | 2068 | while (--_pte >= pte) { |
2058 | pte_t pteval = *_pte; | 2069 | pte_t pteval = *_pte; |
2059 | if (!pte_none(pteval)) | 2070 | if (!pte_none(pteval)) |
2060 | release_pte_page(pte_page(pteval)); | 2071 | release_pte_page(pte_page(pteval)); |
2061 | } | 2072 | } |
2062 | } | 2073 | } |
2063 | 2074 | ||
2064 | static int __collapse_huge_page_isolate(struct vm_area_struct *vma, | 2075 | static int __collapse_huge_page_isolate(struct vm_area_struct *vma, |
2065 | unsigned long address, | 2076 | unsigned long address, |
2066 | pte_t *pte) | 2077 | pte_t *pte) |
2067 | { | 2078 | { |
2068 | struct page *page; | 2079 | struct page *page; |
2069 | pte_t *_pte; | 2080 | pte_t *_pte; |
2070 | int referenced = 0, none = 0; | 2081 | int referenced = 0, none = 0; |
2071 | for (_pte = pte; _pte < pte+HPAGE_PMD_NR; | 2082 | for (_pte = pte; _pte < pte+HPAGE_PMD_NR; |
2072 | _pte++, address += PAGE_SIZE) { | 2083 | _pte++, address += PAGE_SIZE) { |
2073 | pte_t pteval = *_pte; | 2084 | pte_t pteval = *_pte; |
2074 | if (pte_none(pteval)) { | 2085 | if (pte_none(pteval)) { |
2075 | if (++none <= khugepaged_max_ptes_none) | 2086 | if (++none <= khugepaged_max_ptes_none) |
2076 | continue; | 2087 | continue; |
2077 | else | 2088 | else |
2078 | goto out; | 2089 | goto out; |
2079 | } | 2090 | } |
2080 | if (!pte_present(pteval) || !pte_write(pteval)) | 2091 | if (!pte_present(pteval) || !pte_write(pteval)) |
2081 | goto out; | 2092 | goto out; |
2082 | page = vm_normal_page(vma, address, pteval); | 2093 | page = vm_normal_page(vma, address, pteval); |
2083 | if (unlikely(!page)) | 2094 | if (unlikely(!page)) |
2084 | goto out; | 2095 | goto out; |
2085 | 2096 | ||
2086 | VM_BUG_ON(PageCompound(page)); | 2097 | VM_BUG_ON(PageCompound(page)); |
2087 | BUG_ON(!PageAnon(page)); | 2098 | BUG_ON(!PageAnon(page)); |
2088 | VM_BUG_ON(!PageSwapBacked(page)); | 2099 | VM_BUG_ON(!PageSwapBacked(page)); |
2089 | 2100 | ||
2090 | /* cannot use mapcount: can't collapse if there's a gup pin */ | 2101 | /* cannot use mapcount: can't collapse if there's a gup pin */ |
2091 | if (page_count(page) != 1) | 2102 | if (page_count(page) != 1) |
2092 | goto out; | 2103 | goto out; |
2093 | /* | 2104 | /* |
2094 | * We can do it before isolate_lru_page because the | 2105 | * We can do it before isolate_lru_page because the |
2095 | * page can't be freed from under us. NOTE: PG_lock | 2106 | * page can't be freed from under us. NOTE: PG_lock |
2096 | * is needed to serialize against split_huge_page | 2107 | * is needed to serialize against split_huge_page |
2097 | * when invoked from the VM. | 2108 | * when invoked from the VM. |
2098 | */ | 2109 | */ |
2099 | if (!trylock_page(page)) | 2110 | if (!trylock_page(page)) |
2100 | goto out; | 2111 | goto out; |
2101 | /* | 2112 | /* |
2102 | * Isolate the page to avoid collapsing a hugepage | 2113 | * Isolate the page to avoid collapsing a hugepage |
2103 | * currently in use by the VM. | 2114 | * currently in use by the VM. |
2104 | */ | 2115 | */ |
2105 | if (isolate_lru_page(page)) { | 2116 | if (isolate_lru_page(page)) { |
2106 | unlock_page(page); | 2117 | unlock_page(page); |
2107 | goto out; | 2118 | goto out; |
2108 | } | 2119 | } |
2109 | /* 0 stands for page_is_file_cache(page) == false */ | 2120 | /* 0 stands for page_is_file_cache(page) == false */ |
2110 | inc_zone_page_state(page, NR_ISOLATED_ANON + 0); | 2121 | inc_zone_page_state(page, NR_ISOLATED_ANON + 0); |
2111 | VM_BUG_ON(!PageLocked(page)); | 2122 | VM_BUG_ON(!PageLocked(page)); |
2112 | VM_BUG_ON(PageLRU(page)); | 2123 | VM_BUG_ON(PageLRU(page)); |
2113 | 2124 | ||
2114 | /* If there is no mapped pte young don't collapse the page */ | 2125 | /* If there is no mapped pte young don't collapse the page */ |
2115 | if (pte_young(pteval) || PageReferenced(page) || | 2126 | if (pte_young(pteval) || PageReferenced(page) || |
2116 | mmu_notifier_test_young(vma->vm_mm, address)) | 2127 | mmu_notifier_test_young(vma->vm_mm, address)) |
2117 | referenced = 1; | 2128 | referenced = 1; |
2118 | } | 2129 | } |
2119 | if (likely(referenced)) | 2130 | if (likely(referenced)) |
2120 | return 1; | 2131 | return 1; |
2121 | out: | 2132 | out: |
2122 | release_pte_pages(pte, _pte); | 2133 | release_pte_pages(pte, _pte); |
2123 | return 0; | 2134 | return 0; |
2124 | } | 2135 | } |
2125 | 2136 | ||
2126 | static void __collapse_huge_page_copy(pte_t *pte, struct page *page, | 2137 | static void __collapse_huge_page_copy(pte_t *pte, struct page *page, |
2127 | struct vm_area_struct *vma, | 2138 | struct vm_area_struct *vma, |
2128 | unsigned long address, | 2139 | unsigned long address, |
2129 | spinlock_t *ptl) | 2140 | spinlock_t *ptl) |
2130 | { | 2141 | { |
2131 | pte_t *_pte; | 2142 | pte_t *_pte; |
2132 | for (_pte = pte; _pte < pte+HPAGE_PMD_NR; _pte++) { | 2143 | for (_pte = pte; _pte < pte+HPAGE_PMD_NR; _pte++) { |
2133 | pte_t pteval = *_pte; | 2144 | pte_t pteval = *_pte; |
2134 | struct page *src_page; | 2145 | struct page *src_page; |
2135 | 2146 | ||
2136 | if (pte_none(pteval)) { | 2147 | if (pte_none(pteval)) { |
2137 | clear_user_highpage(page, address); | 2148 | clear_user_highpage(page, address); |
2138 | add_mm_counter(vma->vm_mm, MM_ANONPAGES, 1); | 2149 | add_mm_counter(vma->vm_mm, MM_ANONPAGES, 1); |
2139 | } else { | 2150 | } else { |
2140 | src_page = pte_page(pteval); | 2151 | src_page = pte_page(pteval); |
2141 | copy_user_highpage(page, src_page, address, vma); | 2152 | copy_user_highpage(page, src_page, address, vma); |
2142 | VM_BUG_ON(page_mapcount(src_page) != 1); | 2153 | VM_BUG_ON(page_mapcount(src_page) != 1); |
2143 | release_pte_page(src_page); | 2154 | release_pte_page(src_page); |
2144 | /* | 2155 | /* |
2145 | * ptl mostly unnecessary, but preempt has to | 2156 | * ptl mostly unnecessary, but preempt has to |
2146 | * be disabled to update the per-cpu stats | 2157 | * be disabled to update the per-cpu stats |
2147 | * inside page_remove_rmap(). | 2158 | * inside page_remove_rmap(). |
2148 | */ | 2159 | */ |
2149 | spin_lock(ptl); | 2160 | spin_lock(ptl); |
2150 | /* | 2161 | /* |
2151 | * paravirt calls inside pte_clear here are | 2162 | * paravirt calls inside pte_clear here are |
2152 | * superfluous. | 2163 | * superfluous. |
2153 | */ | 2164 | */ |
2154 | pte_clear(vma->vm_mm, address, _pte); | 2165 | pte_clear(vma->vm_mm, address, _pte); |
2155 | page_remove_rmap(src_page); | 2166 | page_remove_rmap(src_page); |
2156 | spin_unlock(ptl); | 2167 | spin_unlock(ptl); |
2157 | free_page_and_swap_cache(src_page); | 2168 | free_page_and_swap_cache(src_page); |
2158 | } | 2169 | } |
2159 | 2170 | ||
2160 | address += PAGE_SIZE; | 2171 | address += PAGE_SIZE; |
2161 | page++; | 2172 | page++; |
2162 | } | 2173 | } |
2163 | } | 2174 | } |
2164 | 2175 | ||
2165 | static void khugepaged_alloc_sleep(void) | 2176 | static void khugepaged_alloc_sleep(void) |
2166 | { | 2177 | { |
2167 | wait_event_freezable_timeout(khugepaged_wait, false, | 2178 | wait_event_freezable_timeout(khugepaged_wait, false, |
2168 | msecs_to_jiffies(khugepaged_alloc_sleep_millisecs)); | 2179 | msecs_to_jiffies(khugepaged_alloc_sleep_millisecs)); |
2169 | } | 2180 | } |
2170 | 2181 | ||
2171 | #ifdef CONFIG_NUMA | 2182 | #ifdef CONFIG_NUMA |
2172 | static bool khugepaged_prealloc_page(struct page **hpage, bool *wait) | 2183 | static bool khugepaged_prealloc_page(struct page **hpage, bool *wait) |
2173 | { | 2184 | { |
2174 | if (IS_ERR(*hpage)) { | 2185 | if (IS_ERR(*hpage)) { |
2175 | if (!*wait) | 2186 | if (!*wait) |
2176 | return false; | 2187 | return false; |
2177 | 2188 | ||
2178 | *wait = false; | 2189 | *wait = false; |
2179 | *hpage = NULL; | 2190 | *hpage = NULL; |
2180 | khugepaged_alloc_sleep(); | 2191 | khugepaged_alloc_sleep(); |
2181 | } else if (*hpage) { | 2192 | } else if (*hpage) { |
2182 | put_page(*hpage); | 2193 | put_page(*hpage); |
2183 | *hpage = NULL; | 2194 | *hpage = NULL; |
2184 | } | 2195 | } |
2185 | 2196 | ||
2186 | return true; | 2197 | return true; |
2187 | } | 2198 | } |
2188 | 2199 | ||
2189 | static struct page | 2200 | static struct page |
2190 | *khugepaged_alloc_page(struct page **hpage, struct mm_struct *mm, | 2201 | *khugepaged_alloc_page(struct page **hpage, struct mm_struct *mm, |
2191 | struct vm_area_struct *vma, unsigned long address, | 2202 | struct vm_area_struct *vma, unsigned long address, |
2192 | int node) | 2203 | int node) |
2193 | { | 2204 | { |
2194 | VM_BUG_ON(*hpage); | 2205 | VM_BUG_ON(*hpage); |
2195 | /* | 2206 | /* |
2196 | * Allocate the page while the vma is still valid and under | 2207 | * Allocate the page while the vma is still valid and under |
2197 | * the mmap_sem read mode so there is no memory allocation | 2208 | * the mmap_sem read mode so there is no memory allocation |
2198 | * later when we take the mmap_sem in write mode. This is more | 2209 | * later when we take the mmap_sem in write mode. This is more |
2199 | * friendly behavior (OTOH it may actually hide bugs) to | 2210 | * friendly behavior (OTOH it may actually hide bugs) to |
2200 | * filesystems in userland with daemons allocating memory in | 2211 | * filesystems in userland with daemons allocating memory in |
2201 | * the userland I/O paths. Allocating memory with the | 2212 | * the userland I/O paths. Allocating memory with the |
2202 | * mmap_sem in read mode is a good idea also to allow greater | 2213 | * mmap_sem in read mode is a good idea also to allow greater |
2203 | * scalability. | 2214 | * scalability. |
2204 | */ | 2215 | */ |
2205 | *hpage = alloc_hugepage_vma(khugepaged_defrag(), vma, address, | 2216 | *hpage = alloc_hugepage_vma(khugepaged_defrag(), vma, address, |
2206 | node, __GFP_OTHER_NODE); | 2217 | node, __GFP_OTHER_NODE); |
2207 | 2218 | ||
2208 | /* | 2219 | /* |
2209 | * After allocating the hugepage, release the mmap_sem read lock in | 2220 | * After allocating the hugepage, release the mmap_sem read lock in |
2210 | * preparation for taking it in write mode. | 2221 | * preparation for taking it in write mode. |
2211 | */ | 2222 | */ |
2212 | up_read(&mm->mmap_sem); | 2223 | up_read(&mm->mmap_sem); |
2213 | if (unlikely(!*hpage)) { | 2224 | if (unlikely(!*hpage)) { |
2214 | count_vm_event(THP_COLLAPSE_ALLOC_FAILED); | 2225 | count_vm_event(THP_COLLAPSE_ALLOC_FAILED); |
2215 | *hpage = ERR_PTR(-ENOMEM); | 2226 | *hpage = ERR_PTR(-ENOMEM); |
2216 | return NULL; | 2227 | return NULL; |
2217 | } | 2228 | } |
2218 | 2229 | ||
2219 | count_vm_event(THP_COLLAPSE_ALLOC); | 2230 | count_vm_event(THP_COLLAPSE_ALLOC); |
2220 | return *hpage; | 2231 | return *hpage; |
2221 | } | 2232 | } |
2222 | #else | 2233 | #else |
2223 | static struct page *khugepaged_alloc_hugepage(bool *wait) | 2234 | static struct page *khugepaged_alloc_hugepage(bool *wait) |
2224 | { | 2235 | { |
2225 | struct page *hpage; | 2236 | struct page *hpage; |
2226 | 2237 | ||
2227 | do { | 2238 | do { |
2228 | hpage = alloc_hugepage(khugepaged_defrag()); | 2239 | hpage = alloc_hugepage(khugepaged_defrag()); |
2229 | if (!hpage) { | 2240 | if (!hpage) { |
2230 | count_vm_event(THP_COLLAPSE_ALLOC_FAILED); | 2241 | count_vm_event(THP_COLLAPSE_ALLOC_FAILED); |
2231 | if (!*wait) | 2242 | if (!*wait) |
2232 | return NULL; | 2243 | return NULL; |
2233 | 2244 | ||
2234 | *wait = false; | 2245 | *wait = false; |
2235 | khugepaged_alloc_sleep(); | 2246 | khugepaged_alloc_sleep(); |
2236 | } else | 2247 | } else |
2237 | count_vm_event(THP_COLLAPSE_ALLOC); | 2248 | count_vm_event(THP_COLLAPSE_ALLOC); |
2238 | } while (unlikely(!hpage) && likely(khugepaged_enabled())); | 2249 | } while (unlikely(!hpage) && likely(khugepaged_enabled())); |
2239 | 2250 | ||
2240 | return hpage; | 2251 | return hpage; |
2241 | } | 2252 | } |
2242 | 2253 | ||
2243 | static bool khugepaged_prealloc_page(struct page **hpage, bool *wait) | 2254 | static bool khugepaged_prealloc_page(struct page **hpage, bool *wait) |
2244 | { | 2255 | { |
2245 | if (!*hpage) | 2256 | if (!*hpage) |
2246 | *hpage = khugepaged_alloc_hugepage(wait); | 2257 | *hpage = khugepaged_alloc_hugepage(wait); |
2247 | 2258 | ||
2248 | if (unlikely(!*hpage)) | 2259 | if (unlikely(!*hpage)) |
2249 | return false; | 2260 | return false; |
2250 | 2261 | ||
2251 | return true; | 2262 | return true; |
2252 | } | 2263 | } |
2253 | 2264 | ||
2254 | static struct page | 2265 | static struct page |
2255 | *khugepaged_alloc_page(struct page **hpage, struct mm_struct *mm, | 2266 | *khugepaged_alloc_page(struct page **hpage, struct mm_struct *mm, |
2256 | struct vm_area_struct *vma, unsigned long address, | 2267 | struct vm_area_struct *vma, unsigned long address, |
2257 | int node) | 2268 | int node) |
2258 | { | 2269 | { |
2259 | up_read(&mm->mmap_sem); | 2270 | up_read(&mm->mmap_sem); |
2260 | VM_BUG_ON(!*hpage); | 2271 | VM_BUG_ON(!*hpage); |
2261 | return *hpage; | 2272 | return *hpage; |
2262 | } | 2273 | } |
2263 | #endif | 2274 | #endif |
2264 | 2275 | ||
2265 | static bool hugepage_vma_check(struct vm_area_struct *vma) | 2276 | static bool hugepage_vma_check(struct vm_area_struct *vma) |
2266 | { | 2277 | { |
2267 | if ((!(vma->vm_flags & VM_HUGEPAGE) && !khugepaged_always()) || | 2278 | if ((!(vma->vm_flags & VM_HUGEPAGE) && !khugepaged_always()) || |
2268 | (vma->vm_flags & VM_NOHUGEPAGE)) | 2279 | (vma->vm_flags & VM_NOHUGEPAGE)) |
2269 | return false; | 2280 | return false; |
2270 | 2281 | ||
2271 | if (!vma->anon_vma || vma->vm_ops) | 2282 | if (!vma->anon_vma || vma->vm_ops) |
2272 | return false; | 2283 | return false; |
2273 | if (is_vma_temporary_stack(vma)) | 2284 | if (is_vma_temporary_stack(vma)) |
2274 | return false; | 2285 | return false; |
2275 | VM_BUG_ON(vma->vm_flags & VM_NO_THP); | 2286 | VM_BUG_ON(vma->vm_flags & VM_NO_THP); |
2276 | return true; | 2287 | return true; |
2277 | } | 2288 | } |
2278 | 2289 | ||
2279 | static void collapse_huge_page(struct mm_struct *mm, | 2290 | static void collapse_huge_page(struct mm_struct *mm, |
2280 | unsigned long address, | 2291 | unsigned long address, |
2281 | struct page **hpage, | 2292 | struct page **hpage, |
2282 | struct vm_area_struct *vma, | 2293 | struct vm_area_struct *vma, |
2283 | int node) | 2294 | int node) |
2284 | { | 2295 | { |
2285 | pmd_t *pmd, _pmd; | 2296 | pmd_t *pmd, _pmd; |
2286 | pte_t *pte; | 2297 | pte_t *pte; |
2287 | pgtable_t pgtable; | 2298 | pgtable_t pgtable; |
2288 | struct page *new_page; | 2299 | struct page *new_page; |
2289 | spinlock_t *ptl; | 2300 | spinlock_t *ptl; |
2290 | int isolated; | 2301 | int isolated; |
2291 | unsigned long hstart, hend; | 2302 | unsigned long hstart, hend; |
2292 | unsigned long mmun_start; /* For mmu_notifiers */ | 2303 | unsigned long mmun_start; /* For mmu_notifiers */ |
2293 | unsigned long mmun_end; /* For mmu_notifiers */ | 2304 | unsigned long mmun_end; /* For mmu_notifiers */ |
2294 | 2305 | ||
2295 | VM_BUG_ON(address & ~HPAGE_PMD_MASK); | 2306 | VM_BUG_ON(address & ~HPAGE_PMD_MASK); |
2296 | 2307 | ||
2297 | /* release the mmap_sem read lock. */ | 2308 | /* release the mmap_sem read lock. */ |
2298 | new_page = khugepaged_alloc_page(hpage, mm, vma, address, node); | 2309 | new_page = khugepaged_alloc_page(hpage, mm, vma, address, node); |
2299 | if (!new_page) | 2310 | if (!new_page) |
2300 | return; | 2311 | return; |
2301 | 2312 | ||
2302 | if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) | 2313 | if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) |
2303 | return; | 2314 | return; |
2304 | 2315 | ||
2305 | /* | 2316 | /* |
2306 | * Prevent all access to pagetables with the exception of | 2317 | * Prevent all access to pagetables with the exception of |
2307 | * gup_fast later handled by the ptep_clear_flush and the VM | 2318 | * gup_fast later handled by the ptep_clear_flush and the VM |
2308 | * handled by the anon_vma lock + PG_lock. | 2319 | * handled by the anon_vma lock + PG_lock. |
2309 | */ | 2320 | */ |
2310 | down_write(&mm->mmap_sem); | 2321 | down_write(&mm->mmap_sem); |
2311 | if (unlikely(khugepaged_test_exit(mm))) | 2322 | if (unlikely(khugepaged_test_exit(mm))) |
2312 | goto out; | 2323 | goto out; |
2313 | 2324 | ||
2314 | vma = find_vma(mm, address); | 2325 | vma = find_vma(mm, address); |
2315 | hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; | 2326 | hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; |
2316 | hend = vma->vm_end & HPAGE_PMD_MASK; | 2327 | hend = vma->vm_end & HPAGE_PMD_MASK; |
2317 | if (address < hstart || address + HPAGE_PMD_SIZE > hend) | 2328 | if (address < hstart || address + HPAGE_PMD_SIZE > hend) |
2318 | goto out; | 2329 | goto out; |
2319 | if (!hugepage_vma_check(vma)) | 2330 | if (!hugepage_vma_check(vma)) |
2320 | goto out; | 2331 | goto out; |
2321 | pmd = mm_find_pmd(mm, address); | 2332 | pmd = mm_find_pmd(mm, address); |
2322 | if (!pmd) | 2333 | if (!pmd) |
2323 | goto out; | 2334 | goto out; |
2324 | if (pmd_trans_huge(*pmd)) | 2335 | if (pmd_trans_huge(*pmd)) |
2325 | goto out; | 2336 | goto out; |
2326 | 2337 | ||
2327 | anon_vma_lock_write(vma->anon_vma); | 2338 | anon_vma_lock_write(vma->anon_vma); |
2328 | 2339 | ||
2329 | pte = pte_offset_map(pmd, address); | 2340 | pte = pte_offset_map(pmd, address); |
2330 | ptl = pte_lockptr(mm, pmd); | 2341 | ptl = pte_lockptr(mm, pmd); |
2331 | 2342 | ||
2332 | mmun_start = address; | 2343 | mmun_start = address; |
2333 | mmun_end = address + HPAGE_PMD_SIZE; | 2344 | mmun_end = address + HPAGE_PMD_SIZE; |
2334 | mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); | 2345 | mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); |
2335 | spin_lock(&mm->page_table_lock); /* probably unnecessary */ | 2346 | spin_lock(&mm->page_table_lock); /* probably unnecessary */ |
2336 | /* | 2347 | /* |
2337 | * After this gup_fast can't run anymore. This also removes | 2348 | * After this gup_fast can't run anymore. This also removes |
2338 | * any huge TLB entry from the CPU so we won't allow | 2349 | * any huge TLB entry from the CPU so we won't allow |
2339 | * huge and small TLB entries for the same virtual address | 2350 | * huge and small TLB entries for the same virtual address |
2340 | * to avoid the risk of CPU bugs in that area. | 2351 | * to avoid the risk of CPU bugs in that area. |
2341 | */ | 2352 | */ |
2342 | _pmd = pmdp_clear_flush(vma, address, pmd); | 2353 | _pmd = pmdp_clear_flush(vma, address, pmd); |
2343 | spin_unlock(&mm->page_table_lock); | 2354 | spin_unlock(&mm->page_table_lock); |
2344 | mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); | 2355 | mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); |
2345 | 2356 | ||
2346 | spin_lock(ptl); | 2357 | spin_lock(ptl); |
2347 | isolated = __collapse_huge_page_isolate(vma, address, pte); | 2358 | isolated = __collapse_huge_page_isolate(vma, address, pte); |
2348 | spin_unlock(ptl); | 2359 | spin_unlock(ptl); |
2349 | 2360 | ||
2350 | if (unlikely(!isolated)) { | 2361 | if (unlikely(!isolated)) { |
2351 | pte_unmap(pte); | 2362 | pte_unmap(pte); |
2352 | spin_lock(&mm->page_table_lock); | 2363 | spin_lock(&mm->page_table_lock); |
2353 | BUG_ON(!pmd_none(*pmd)); | 2364 | BUG_ON(!pmd_none(*pmd)); |
2354 | set_pmd_at(mm, address, pmd, _pmd); | 2365 | set_pmd_at(mm, address, pmd, _pmd); |
2355 | spin_unlock(&mm->page_table_lock); | 2366 | spin_unlock(&mm->page_table_lock); |
2356 | anon_vma_unlock(vma->anon_vma); | 2367 | anon_vma_unlock(vma->anon_vma); |
2357 | goto out; | 2368 | goto out; |
2358 | } | 2369 | } |
2359 | 2370 | ||
2360 | /* | 2371 | /* |
2361 | * All pages are isolated and locked so anon_vma rmap | 2372 | * All pages are isolated and locked so anon_vma rmap |
2362 | * can't run anymore. | 2373 | * can't run anymore. |
2363 | */ | 2374 | */ |
2364 | anon_vma_unlock(vma->anon_vma); | 2375 | anon_vma_unlock(vma->anon_vma); |
2365 | 2376 | ||
2366 | __collapse_huge_page_copy(pte, new_page, vma, address, ptl); | 2377 | __collapse_huge_page_copy(pte, new_page, vma, address, ptl); |
2367 | pte_unmap(pte); | 2378 | pte_unmap(pte); |
2368 | __SetPageUptodate(new_page); | 2379 | __SetPageUptodate(new_page); |
2369 | pgtable = pmd_pgtable(_pmd); | 2380 | pgtable = pmd_pgtable(_pmd); |
2370 | 2381 | ||
2371 | _pmd = mk_huge_pmd(new_page, vma); | 2382 | _pmd = mk_huge_pmd(new_page, vma); |
2372 | 2383 | ||
2373 | /* | 2384 | /* |
2374 | * spin_lock() below is not the equivalent of smp_wmb(), so | 2385 | * spin_lock() below is not the equivalent of smp_wmb(), so |
2375 | * this is needed to prevent the copy_huge_page writes from becoming | 2386 | * this is needed to prevent the copy_huge_page writes from becoming |
2376 | * visible after the set_pmd_at() write. | 2387 | * visible after the set_pmd_at() write. |
2377 | */ | 2388 | */ |
2378 | smp_wmb(); | 2389 | smp_wmb(); |
2379 | 2390 | ||
2380 | spin_lock(&mm->page_table_lock); | 2391 | spin_lock(&mm->page_table_lock); |
2381 | BUG_ON(!pmd_none(*pmd)); | 2392 | BUG_ON(!pmd_none(*pmd)); |
2382 | page_add_new_anon_rmap(new_page, vma, address); | 2393 | page_add_new_anon_rmap(new_page, vma, address); |
2383 | set_pmd_at(mm, address, pmd, _pmd); | 2394 | set_pmd_at(mm, address, pmd, _pmd); |
2384 | update_mmu_cache_pmd(vma, address, pmd); | 2395 | update_mmu_cache_pmd(vma, address, pmd); |
2385 | pgtable_trans_huge_deposit(mm, pgtable); | 2396 | pgtable_trans_huge_deposit(mm, pgtable); |
2386 | spin_unlock(&mm->page_table_lock); | 2397 | spin_unlock(&mm->page_table_lock); |
2387 | 2398 | ||
2388 | *hpage = NULL; | 2399 | *hpage = NULL; |
2389 | 2400 | ||
2390 | khugepaged_pages_collapsed++; | 2401 | khugepaged_pages_collapsed++; |
2391 | out_up_write: | 2402 | out_up_write: |
2392 | up_write(&mm->mmap_sem); | 2403 | up_write(&mm->mmap_sem); |
2393 | return; | 2404 | return; |
2394 | 2405 | ||
2395 | out: | 2406 | out: |
2396 | mem_cgroup_uncharge_page(new_page); | 2407 | mem_cgroup_uncharge_page(new_page); |
2397 | goto out_up_write; | 2408 | goto out_up_write; |
2398 | } | 2409 | } |
2399 | 2410 | ||
2400 | static int khugepaged_scan_pmd(struct mm_struct *mm, | 2411 | static int khugepaged_scan_pmd(struct mm_struct *mm, |
2401 | struct vm_area_struct *vma, | 2412 | struct vm_area_struct *vma, |
2402 | unsigned long address, | 2413 | unsigned long address, |
2403 | struct page **hpage) | 2414 | struct page **hpage) |
2404 | { | 2415 | { |
2405 | pmd_t *pmd; | 2416 | pmd_t *pmd; |
2406 | pte_t *pte, *_pte; | 2417 | pte_t *pte, *_pte; |
2407 | int ret = 0, referenced = 0, none = 0; | 2418 | int ret = 0, referenced = 0, none = 0; |
2408 | struct page *page; | 2419 | struct page *page; |
2409 | unsigned long _address; | 2420 | unsigned long _address; |
2410 | spinlock_t *ptl; | 2421 | spinlock_t *ptl; |
2411 | int node = -1; | 2422 | int node = -1; |
2412 | 2423 | ||
2413 | VM_BUG_ON(address & ~HPAGE_PMD_MASK); | 2424 | VM_BUG_ON(address & ~HPAGE_PMD_MASK); |
2414 | 2425 | ||
2415 | pmd = mm_find_pmd(mm, address); | 2426 | pmd = mm_find_pmd(mm, address); |
2416 | if (!pmd) | 2427 | if (!pmd) |
2417 | goto out; | 2428 | goto out; |
2418 | if (pmd_trans_huge(*pmd)) | 2429 | if (pmd_trans_huge(*pmd)) |
2419 | goto out; | 2430 | goto out; |
2420 | 2431 | ||
2421 | pte = pte_offset_map_lock(mm, pmd, address, &ptl); | 2432 | pte = pte_offset_map_lock(mm, pmd, address, &ptl); |
2422 | for (_address = address, _pte = pte; _pte < pte+HPAGE_PMD_NR; | 2433 | for (_address = address, _pte = pte; _pte < pte+HPAGE_PMD_NR; |
2423 | _pte++, _address += PAGE_SIZE) { | 2434 | _pte++, _address += PAGE_SIZE) { |
2424 | pte_t pteval = *_pte; | 2435 | pte_t pteval = *_pte; |
2425 | if (pte_none(pteval)) { | 2436 | if (pte_none(pteval)) { |
2426 | if (++none <= khugepaged_max_ptes_none) | 2437 | if (++none <= khugepaged_max_ptes_none) |
2427 | continue; | 2438 | continue; |
2428 | else | 2439 | else |
2429 | goto out_unmap; | 2440 | goto out_unmap; |
2430 | } | 2441 | } |
2431 | if (!pte_present(pteval) || !pte_write(pteval)) | 2442 | if (!pte_present(pteval) || !pte_write(pteval)) |
2432 | goto out_unmap; | 2443 | goto out_unmap; |
2433 | page = vm_normal_page(vma, _address, pteval); | 2444 | page = vm_normal_page(vma, _address, pteval); |
2434 | if (unlikely(!page)) | 2445 | if (unlikely(!page)) |
2435 | goto out_unmap; | 2446 | goto out_unmap; |
2436 | /* | 2447 | /* |
2437 | * Choose the node of the first page. This could | 2448 | * Choose the node of the first page. This could |
2438 | * be more sophisticated and look at more pages, | 2449 | * be more sophisticated and look at more pages, |
2439 | * but isn't for now. | 2450 | * but isn't for now. |
2440 | */ | 2451 | */ |
2441 | if (node == -1) | 2452 | if (node == -1) |
2442 | node = page_to_nid(page); | 2453 | node = page_to_nid(page); |
2443 | VM_BUG_ON(PageCompound(page)); | 2454 | VM_BUG_ON(PageCompound(page)); |
2444 | if (!PageLRU(page) || PageLocked(page) || !PageAnon(page)) | 2455 | if (!PageLRU(page) || PageLocked(page) || !PageAnon(page)) |
2445 | goto out_unmap; | 2456 | goto out_unmap; |
2446 | /* cannot use mapcount: can't collapse if there's a gup pin */ | 2457 | /* cannot use mapcount: can't collapse if there's a gup pin */ |
2447 | if (page_count(page) != 1) | 2458 | if (page_count(page) != 1) |
2448 | goto out_unmap; | 2459 | goto out_unmap; |
2449 | if (pte_young(pteval) || PageReferenced(page) || | 2460 | if (pte_young(pteval) || PageReferenced(page) || |
2450 | mmu_notifier_test_young(vma->vm_mm, address)) | 2461 | mmu_notifier_test_young(vma->vm_mm, address)) |
2451 | referenced = 1; | 2462 | referenced = 1; |
2452 | } | 2463 | } |
2453 | if (referenced) | 2464 | if (referenced) |
2454 | ret = 1; | 2465 | ret = 1; |
2455 | out_unmap: | 2466 | out_unmap: |
2456 | pte_unmap_unlock(pte, ptl); | 2467 | pte_unmap_unlock(pte, ptl); |
2457 | if (ret) | 2468 | if (ret) |
2458 | /* collapse_huge_page will return with the mmap_sem released */ | 2469 | /* collapse_huge_page will return with the mmap_sem released */ |
2459 | collapse_huge_page(mm, address, hpage, vma, node); | 2470 | collapse_huge_page(mm, address, hpage, vma, node); |
2460 | out: | 2471 | out: |
2461 | return ret; | 2472 | return ret; |
2462 | } | 2473 | } |
2463 | 2474 | ||
2464 | static void collect_mm_slot(struct mm_slot *mm_slot) | 2475 | static void collect_mm_slot(struct mm_slot *mm_slot) |
2465 | { | 2476 | { |
2466 | struct mm_struct *mm = mm_slot->mm; | 2477 | struct mm_struct *mm = mm_slot->mm; |
2467 | 2478 | ||
2468 | VM_BUG_ON(NR_CPUS != 1 && !spin_is_locked(&khugepaged_mm_lock)); | 2479 | VM_BUG_ON(NR_CPUS != 1 && !spin_is_locked(&khugepaged_mm_lock)); |
2469 | 2480 | ||
2470 | if (khugepaged_test_exit(mm)) { | 2481 | if (khugepaged_test_exit(mm)) { |
2471 | /* free mm_slot */ | 2482 | /* free mm_slot */ |
2472 | hlist_del(&mm_slot->hash); | 2483 | hlist_del(&mm_slot->hash); |
2473 | list_del(&mm_slot->mm_node); | 2484 | list_del(&mm_slot->mm_node); |
2474 | 2485 | ||
2475 | /* | 2486 | /* |
2476 | * Not strictly needed because the mm exited already. | 2487 | * Not strictly needed because the mm exited already. |
2477 | * | 2488 | * |
2478 | * clear_bit(MMF_VM_HUGEPAGE, &mm->flags); | 2489 | * clear_bit(MMF_VM_HUGEPAGE, &mm->flags); |
2479 | */ | 2490 | */ |
2480 | 2491 | ||
2481 | /* khugepaged_mm_lock is actually not necessary for the below */ | 2492 | /* khugepaged_mm_lock is actually not necessary for the below */ |
2482 | free_mm_slot(mm_slot); | 2493 | free_mm_slot(mm_slot); |
2483 | mmdrop(mm); | 2494 | mmdrop(mm); |
2484 | } | 2495 | } |
2485 | } | 2496 | } |
2486 | 2497 | ||
2487 | static unsigned int khugepaged_scan_mm_slot(unsigned int pages, | 2498 | static unsigned int khugepaged_scan_mm_slot(unsigned int pages, |
2488 | struct page **hpage) | 2499 | struct page **hpage) |
2489 | __releases(&khugepaged_mm_lock) | 2500 | __releases(&khugepaged_mm_lock) |
2490 | __acquires(&khugepaged_mm_lock) | 2501 | __acquires(&khugepaged_mm_lock) |
2491 | { | 2502 | { |
2492 | struct mm_slot *mm_slot; | 2503 | struct mm_slot *mm_slot; |
2493 | struct mm_struct *mm; | 2504 | struct mm_struct *mm; |
2494 | struct vm_area_struct *vma; | 2505 | struct vm_area_struct *vma; |
2495 | int progress = 0; | 2506 | int progress = 0; |
2496 | 2507 | ||
2497 | VM_BUG_ON(!pages); | 2508 | VM_BUG_ON(!pages); |
2498 | VM_BUG_ON(NR_CPUS != 1 && !spin_is_locked(&khugepaged_mm_lock)); | 2509 | VM_BUG_ON(NR_CPUS != 1 && !spin_is_locked(&khugepaged_mm_lock)); |
2499 | 2510 | ||
2500 | if (khugepaged_scan.mm_slot) | 2511 | if (khugepaged_scan.mm_slot) |
2501 | mm_slot = khugepaged_scan.mm_slot; | 2512 | mm_slot = khugepaged_scan.mm_slot; |
2502 | else { | 2513 | else { |
2503 | mm_slot = list_entry(khugepaged_scan.mm_head.next, | 2514 | mm_slot = list_entry(khugepaged_scan.mm_head.next, |
2504 | struct mm_slot, mm_node); | 2515 | struct mm_slot, mm_node); |
2505 | khugepaged_scan.address = 0; | 2516 | khugepaged_scan.address = 0; |
2506 | khugepaged_scan.mm_slot = mm_slot; | 2517 | khugepaged_scan.mm_slot = mm_slot; |
2507 | } | 2518 | } |
2508 | spin_unlock(&khugepaged_mm_lock); | 2519 | spin_unlock(&khugepaged_mm_lock); |
2509 | 2520 | ||
2510 | mm = mm_slot->mm; | 2521 | mm = mm_slot->mm; |
2511 | down_read(&mm->mmap_sem); | 2522 | down_read(&mm->mmap_sem); |
2512 | if (unlikely(khugepaged_test_exit(mm))) | 2523 | if (unlikely(khugepaged_test_exit(mm))) |
2513 | vma = NULL; | 2524 | vma = NULL; |
2514 | else | 2525 | else |
2515 | vma = find_vma(mm, khugepaged_scan.address); | 2526 | vma = find_vma(mm, khugepaged_scan.address); |
2516 | 2527 | ||
2517 | progress++; | 2528 | progress++; |
2518 | for (; vma; vma = vma->vm_next) { | 2529 | for (; vma; vma = vma->vm_next) { |
2519 | unsigned long hstart, hend; | 2530 | unsigned long hstart, hend; |
2520 | 2531 | ||
2521 | cond_resched(); | 2532 | cond_resched(); |
2522 | if (unlikely(khugepaged_test_exit(mm))) { | 2533 | if (unlikely(khugepaged_test_exit(mm))) { |
2523 | progress++; | 2534 | progress++; |
2524 | break; | 2535 | break; |
2525 | } | 2536 | } |
2526 | if (!hugepage_vma_check(vma)) { | 2537 | if (!hugepage_vma_check(vma)) { |
2527 | skip: | 2538 | skip: |
2528 | progress++; | 2539 | progress++; |
2529 | continue; | 2540 | continue; |
2530 | } | 2541 | } |
2531 | hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; | 2542 | hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; |
2532 | hend = vma->vm_end & HPAGE_PMD_MASK; | 2543 | hend = vma->vm_end & HPAGE_PMD_MASK; |
2533 | if (hstart >= hend) | 2544 | if (hstart >= hend) |
2534 | goto skip; | 2545 | goto skip; |
2535 | if (khugepaged_scan.address > hend) | 2546 | if (khugepaged_scan.address > hend) |
2536 | goto skip; | 2547 | goto skip; |
2537 | if (khugepaged_scan.address < hstart) | 2548 | if (khugepaged_scan.address < hstart) |
2538 | khugepaged_scan.address = hstart; | 2549 | khugepaged_scan.address = hstart; |
2539 | VM_BUG_ON(khugepaged_scan.address & ~HPAGE_PMD_MASK); | 2550 | VM_BUG_ON(khugepaged_scan.address & ~HPAGE_PMD_MASK); |
2540 | 2551 | ||
2541 | while (khugepaged_scan.address < hend) { | 2552 | while (khugepaged_scan.address < hend) { |
2542 | int ret; | 2553 | int ret; |
2543 | cond_resched(); | 2554 | cond_resched(); |
2544 | if (unlikely(khugepaged_test_exit(mm))) | 2555 | if (unlikely(khugepaged_test_exit(mm))) |
2545 | goto breakouterloop; | 2556 | goto breakouterloop; |
2546 | 2557 | ||
2547 | VM_BUG_ON(khugepaged_scan.address < hstart || | 2558 | VM_BUG_ON(khugepaged_scan.address < hstart || |
2548 | khugepaged_scan.address + HPAGE_PMD_SIZE > | 2559 | khugepaged_scan.address + HPAGE_PMD_SIZE > |
2549 | hend); | 2560 | hend); |
2550 | ret = khugepaged_scan_pmd(mm, vma, | 2561 | ret = khugepaged_scan_pmd(mm, vma, |
2551 | khugepaged_scan.address, | 2562 | khugepaged_scan.address, |
2552 | hpage); | 2563 | hpage); |
2553 | /* move to next address */ | 2564 | /* move to next address */ |
2554 | khugepaged_scan.address += HPAGE_PMD_SIZE; | 2565 | khugepaged_scan.address += HPAGE_PMD_SIZE; |
2555 | progress += HPAGE_PMD_NR; | 2566 | progress += HPAGE_PMD_NR; |
2556 | if (ret) | 2567 | if (ret) |
2557 | /* we released mmap_sem so break loop */ | 2568 | /* we released mmap_sem so break loop */ |
2558 | goto breakouterloop_mmap_sem; | 2569 | goto breakouterloop_mmap_sem; |
2559 | if (progress >= pages) | 2570 | if (progress >= pages) |
2560 | goto breakouterloop; | 2571 | goto breakouterloop; |
2561 | } | 2572 | } |
2562 | } | 2573 | } |
2563 | breakouterloop: | 2574 | breakouterloop: |
2564 | up_read(&mm->mmap_sem); /* exit_mmap will destroy ptes after this */ | 2575 | up_read(&mm->mmap_sem); /* exit_mmap will destroy ptes after this */ |
2565 | breakouterloop_mmap_sem: | 2576 | breakouterloop_mmap_sem: |
2566 | 2577 | ||
2567 | spin_lock(&khugepaged_mm_lock); | 2578 | spin_lock(&khugepaged_mm_lock); |
2568 | VM_BUG_ON(khugepaged_scan.mm_slot != mm_slot); | 2579 | VM_BUG_ON(khugepaged_scan.mm_slot != mm_slot); |
2569 | /* | 2580 | /* |
2570 | * Release the current mm_slot if this mm is about to die, or | 2581 | * Release the current mm_slot if this mm is about to die, or |
2571 | * if we scanned all vmas of this mm. | 2582 | * if we scanned all vmas of this mm. |
2572 | */ | 2583 | */ |
2573 | if (khugepaged_test_exit(mm) || !vma) { | 2584 | if (khugepaged_test_exit(mm) || !vma) { |
2574 | /* | 2585 | /* |
2575 | * Make sure that if mm_users is reaching zero while | 2586 | * Make sure that if mm_users is reaching zero while |
2576 | * khugepaged runs here, khugepaged_exit will find | 2587 | * khugepaged runs here, khugepaged_exit will find |
2577 | * mm_slot not pointing to the exiting mm. | 2588 | * mm_slot not pointing to the exiting mm. |
2578 | */ | 2589 | */ |
2579 | if (mm_slot->mm_node.next != &khugepaged_scan.mm_head) { | 2590 | if (mm_slot->mm_node.next != &khugepaged_scan.mm_head) { |
2580 | khugepaged_scan.mm_slot = list_entry( | 2591 | khugepaged_scan.mm_slot = list_entry( |
2581 | mm_slot->mm_node.next, | 2592 | mm_slot->mm_node.next, |
2582 | struct mm_slot, mm_node); | 2593 | struct mm_slot, mm_node); |
2583 | khugepaged_scan.address = 0; | 2594 | khugepaged_scan.address = 0; |
2584 | } else { | 2595 | } else { |
2585 | khugepaged_scan.mm_slot = NULL; | 2596 | khugepaged_scan.mm_slot = NULL; |
2586 | khugepaged_full_scans++; | 2597 | khugepaged_full_scans++; |
2587 | } | 2598 | } |
2588 | 2599 | ||
2589 | collect_mm_slot(mm_slot); | 2600 | collect_mm_slot(mm_slot); |
2590 | } | 2601 | } |
2591 | 2602 | ||
2592 | return progress; | 2603 | return progress; |
2593 | } | 2604 | } |
2594 | 2605 | ||
2595 | static int khugepaged_has_work(void) | 2606 | static int khugepaged_has_work(void) |
2596 | { | 2607 | { |
2597 | return !list_empty(&khugepaged_scan.mm_head) && | 2608 | return !list_empty(&khugepaged_scan.mm_head) && |
2598 | khugepaged_enabled(); | 2609 | khugepaged_enabled(); |
2599 | } | 2610 | } |
2600 | 2611 | ||
2601 | static int khugepaged_wait_event(void) | 2612 | static int khugepaged_wait_event(void) |
2602 | { | 2613 | { |
2603 | return !list_empty(&khugepaged_scan.mm_head) || | 2614 | return !list_empty(&khugepaged_scan.mm_head) || |
2604 | kthread_should_stop(); | 2615 | kthread_should_stop(); |
2605 | } | 2616 | } |
2606 | 2617 | ||
2607 | static void khugepaged_do_scan(void) | 2618 | static void khugepaged_do_scan(void) |
2608 | { | 2619 | { |
2609 | struct page *hpage = NULL; | 2620 | struct page *hpage = NULL; |
2610 | unsigned int progress = 0, pass_through_head = 0; | 2621 | unsigned int progress = 0, pass_through_head = 0; |
2611 | unsigned int pages = khugepaged_pages_to_scan; | 2622 | unsigned int pages = khugepaged_pages_to_scan; |
2612 | bool wait = true; | 2623 | bool wait = true; |
2613 | 2624 | ||
2614 | barrier(); /* write khugepaged_pages_to_scan to local stack */ | 2625 | barrier(); /* write khugepaged_pages_to_scan to local stack */ |
2615 | 2626 | ||
2616 | while (progress < pages) { | 2627 | while (progress < pages) { |
2617 | if (!khugepaged_prealloc_page(&hpage, &wait)) | 2628 | if (!khugepaged_prealloc_page(&hpage, &wait)) |
2618 | break; | 2629 | break; |
2619 | 2630 | ||
2620 | cond_resched(); | 2631 | cond_resched(); |
2621 | 2632 | ||
2622 | if (unlikely(kthread_should_stop() || freezing(current))) | 2633 | if (unlikely(kthread_should_stop() || freezing(current))) |
2623 | break; | 2634 | break; |
2624 | 2635 | ||
2625 | spin_lock(&khugepaged_mm_lock); | 2636 | spin_lock(&khugepaged_mm_lock); |
2626 | if (!khugepaged_scan.mm_slot) | 2637 | if (!khugepaged_scan.mm_slot) |
2627 | pass_through_head++; | 2638 | pass_through_head++; |
2628 | if (khugepaged_has_work() && | 2639 | if (khugepaged_has_work() && |
2629 | pass_through_head < 2) | 2640 | pass_through_head < 2) |
2630 | progress += khugepaged_scan_mm_slot(pages - progress, | 2641 | progress += khugepaged_scan_mm_slot(pages - progress, |
2631 | &hpage); | 2642 | &hpage); |
2632 | else | 2643 | else |
2633 | progress = pages; | 2644 | progress = pages; |
2634 | spin_unlock(&khugepaged_mm_lock); | 2645 | spin_unlock(&khugepaged_mm_lock); |
2635 | } | 2646 | } |
2636 | 2647 | ||
2637 | if (!IS_ERR_OR_NULL(hpage)) | 2648 | if (!IS_ERR_OR_NULL(hpage)) |
2638 | put_page(hpage); | 2649 | put_page(hpage); |
2639 | } | 2650 | } |
2640 | 2651 | ||
2641 | static void khugepaged_wait_work(void) | 2652 | static void khugepaged_wait_work(void) |
2642 | { | 2653 | { |
2643 | try_to_freeze(); | 2654 | try_to_freeze(); |
2644 | 2655 | ||
2645 | if (khugepaged_has_work()) { | 2656 | if (khugepaged_has_work()) { |
2646 | if (!khugepaged_scan_sleep_millisecs) | 2657 | if (!khugepaged_scan_sleep_millisecs) |
2647 | return; | 2658 | return; |
2648 | 2659 | ||
2649 | wait_event_freezable_timeout(khugepaged_wait, | 2660 | wait_event_freezable_timeout(khugepaged_wait, |
2650 | kthread_should_stop(), | 2661 | kthread_should_stop(), |
2651 | msecs_to_jiffies(khugepaged_scan_sleep_millisecs)); | 2662 | msecs_to_jiffies(khugepaged_scan_sleep_millisecs)); |
2652 | return; | 2663 | return; |
2653 | } | 2664 | } |
2654 | 2665 | ||
2655 | if (khugepaged_enabled()) | 2666 | if (khugepaged_enabled()) |
2656 | wait_event_freezable(khugepaged_wait, khugepaged_wait_event()); | 2667 | wait_event_freezable(khugepaged_wait, khugepaged_wait_event()); |
2657 | } | 2668 | } |
2658 | 2669 | ||
2659 | static int khugepaged(void *none) | 2670 | static int khugepaged(void *none) |
2660 | { | 2671 | { |
2661 | struct mm_slot *mm_slot; | 2672 | struct mm_slot *mm_slot; |
2662 | 2673 | ||
2663 | set_freezable(); | 2674 | set_freezable(); |
2664 | set_user_nice(current, 19); | 2675 | set_user_nice(current, 19); |
2665 | 2676 | ||
2666 | while (!kthread_should_stop()) { | 2677 | while (!kthread_should_stop()) { |
2667 | khugepaged_do_scan(); | 2678 | khugepaged_do_scan(); |
2668 | khugepaged_wait_work(); | 2679 | khugepaged_wait_work(); |
2669 | } | 2680 | } |
2670 | 2681 | ||
2671 | spin_lock(&khugepaged_mm_lock); | 2682 | spin_lock(&khugepaged_mm_lock); |
2672 | mm_slot = khugepaged_scan.mm_slot; | 2683 | mm_slot = khugepaged_scan.mm_slot; |
2673 | khugepaged_scan.mm_slot = NULL; | 2684 | khugepaged_scan.mm_slot = NULL; |
2674 | if (mm_slot) | 2685 | if (mm_slot) |
2675 | collect_mm_slot(mm_slot); | 2686 | collect_mm_slot(mm_slot); |
2676 | spin_unlock(&khugepaged_mm_lock); | 2687 | spin_unlock(&khugepaged_mm_lock); |
2677 | return 0; | 2688 | return 0; |
2678 | } | 2689 | } |
2679 | 2690 | ||
2680 | static void __split_huge_zero_page_pmd(struct vm_area_struct *vma, | 2691 | static void __split_huge_zero_page_pmd(struct vm_area_struct *vma, |
2681 | unsigned long haddr, pmd_t *pmd) | 2692 | unsigned long haddr, pmd_t *pmd) |
2682 | { | 2693 | { |
2683 | struct mm_struct *mm = vma->vm_mm; | 2694 | struct mm_struct *mm = vma->vm_mm; |
2684 | pgtable_t pgtable; | 2695 | pgtable_t pgtable; |
2685 | pmd_t _pmd; | 2696 | pmd_t _pmd; |
2686 | int i; | 2697 | int i; |
2687 | 2698 | ||
2688 | pmdp_clear_flush(vma, haddr, pmd); | 2699 | pmdp_clear_flush(vma, haddr, pmd); |
2689 | /* leave pmd empty until pte is filled */ | 2700 | /* leave pmd empty until pte is filled */ |
2690 | 2701 | ||
2691 | pgtable = pgtable_trans_huge_withdraw(mm); | 2702 | pgtable = pgtable_trans_huge_withdraw(mm); |
2692 | pmd_populate(mm, &_pmd, pgtable); | 2703 | pmd_populate(mm, &_pmd, pgtable); |
2693 | 2704 | ||
2694 | for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) { | 2705 | for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) { |
2695 | pte_t *pte, entry; | 2706 | pte_t *pte, entry; |
2696 | entry = pfn_pte(my_zero_pfn(haddr), vma->vm_page_prot); | 2707 | entry = pfn_pte(my_zero_pfn(haddr), vma->vm_page_prot); |
2697 | entry = pte_mkspecial(entry); | 2708 | entry = pte_mkspecial(entry); |
2698 | pte = pte_offset_map(&_pmd, haddr); | 2709 | pte = pte_offset_map(&_pmd, haddr); |
2699 | VM_BUG_ON(!pte_none(*pte)); | 2710 | VM_BUG_ON(!pte_none(*pte)); |
2700 | set_pte_at(mm, haddr, pte, entry); | 2711 | set_pte_at(mm, haddr, pte, entry); |
2701 | pte_unmap(pte); | 2712 | pte_unmap(pte); |
2702 | } | 2713 | } |
2703 | smp_wmb(); /* make pte visible before pmd */ | 2714 | smp_wmb(); /* make pte visible before pmd */ |
2704 | pmd_populate(mm, pmd, pgtable); | 2715 | pmd_populate(mm, pmd, pgtable); |
2705 | put_huge_zero_page(); | 2716 | put_huge_zero_page(); |
2706 | } | 2717 | } |
2707 | 2718 | ||
2708 | void __split_huge_page_pmd(struct vm_area_struct *vma, unsigned long address, | 2719 | void __split_huge_page_pmd(struct vm_area_struct *vma, unsigned long address, |
2709 | pmd_t *pmd) | 2720 | pmd_t *pmd) |
2710 | { | 2721 | { |
2711 | struct page *page; | 2722 | struct page *page; |
2712 | struct mm_struct *mm = vma->vm_mm; | 2723 | struct mm_struct *mm = vma->vm_mm; |
2713 | unsigned long haddr = address & HPAGE_PMD_MASK; | 2724 | unsigned long haddr = address & HPAGE_PMD_MASK; |
2714 | unsigned long mmun_start; /* For mmu_notifiers */ | 2725 | unsigned long mmun_start; /* For mmu_notifiers */ |
2715 | unsigned long mmun_end; /* For mmu_notifiers */ | 2726 | unsigned long mmun_end; /* For mmu_notifiers */ |
2716 | 2727 | ||
2717 | BUG_ON(vma->vm_start > haddr || vma->vm_end < haddr + HPAGE_PMD_SIZE); | 2728 | BUG_ON(vma->vm_start > haddr || vma->vm_end < haddr + HPAGE_PMD_SIZE); |
2718 | 2729 | ||
2719 | mmun_start = haddr; | 2730 | mmun_start = haddr; |
2720 | mmun_end = haddr + HPAGE_PMD_SIZE; | 2731 | mmun_end = haddr + HPAGE_PMD_SIZE; |
2721 | mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); | 2732 | mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); |
2722 | spin_lock(&mm->page_table_lock); | 2733 | spin_lock(&mm->page_table_lock); |
2723 | if (unlikely(!pmd_trans_huge(*pmd))) { | 2734 | if (unlikely(!pmd_trans_huge(*pmd))) { |
2724 | spin_unlock(&mm->page_table_lock); | 2735 | spin_unlock(&mm->page_table_lock); |
2725 | mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); | 2736 | mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); |
2726 | return; | 2737 | return; |
2727 | } | 2738 | } |
2728 | if (is_huge_zero_pmd(*pmd)) { | 2739 | if (is_huge_zero_pmd(*pmd)) { |
2729 | __split_huge_zero_page_pmd(vma, haddr, pmd); | 2740 | __split_huge_zero_page_pmd(vma, haddr, pmd); |
2730 | spin_unlock(&mm->page_table_lock); | 2741 | spin_unlock(&mm->page_table_lock); |
2731 | mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); | 2742 | mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); |
2732 | return; | 2743 | return; |
2733 | } | 2744 | } |
2734 | page = pmd_page(*pmd); | 2745 | page = pmd_page(*pmd); |
2735 | VM_BUG_ON(!page_count(page)); | 2746 | VM_BUG_ON(!page_count(page)); |
2736 | get_page(page); | 2747 | get_page(page); |
2737 | spin_unlock(&mm->page_table_lock); | 2748 | spin_unlock(&mm->page_table_lock); |
2738 | mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); | 2749 | mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); |
2739 | 2750 | ||
2740 | split_huge_page(page); | 2751 | split_huge_page(page); |
2741 | 2752 | ||
2742 | put_page(page); | 2753 | put_page(page); |
2743 | BUG_ON(pmd_trans_huge(*pmd)); | 2754 | BUG_ON(pmd_trans_huge(*pmd)); |
2744 | } | 2755 | } |
2745 | 2756 | ||
2746 | void split_huge_page_pmd_mm(struct mm_struct *mm, unsigned long address, | 2757 | void split_huge_page_pmd_mm(struct mm_struct *mm, unsigned long address, |
2747 | pmd_t *pmd) | 2758 | pmd_t *pmd) |
2748 | { | 2759 | { |
2749 | struct vm_area_struct *vma; | 2760 | struct vm_area_struct *vma; |
2750 | 2761 | ||
2751 | vma = find_vma(mm, address); | 2762 | vma = find_vma(mm, address); |
2752 | BUG_ON(vma == NULL); | 2763 | BUG_ON(vma == NULL); |
2753 | split_huge_page_pmd(vma, address, pmd); | 2764 | split_huge_page_pmd(vma, address, pmd); |
2754 | } | 2765 | } |
2755 | 2766 | ||
2756 | static void split_huge_page_address(struct mm_struct *mm, | 2767 | static void split_huge_page_address(struct mm_struct *mm, |
2757 | unsigned long address) | 2768 | unsigned long address) |
2758 | { | 2769 | { |
2759 | pmd_t *pmd; | 2770 | pmd_t *pmd; |
2760 | 2771 | ||
2761 | VM_BUG_ON(!(address & ~HPAGE_PMD_MASK)); | 2772 | VM_BUG_ON(!(address & ~HPAGE_PMD_MASK)); |
2762 | 2773 | ||
2763 | pmd = mm_find_pmd(mm, address); | 2774 | pmd = mm_find_pmd(mm, address); |
2764 | if (!pmd) | 2775 | if (!pmd) |
2765 | return; | 2776 | return; |
2766 | /* | 2777 | /* |
2767 | * Caller holds the mmap_sem write mode, so a huge pmd cannot | 2778 | * Caller holds the mmap_sem write mode, so a huge pmd cannot |
2768 | * materialize from under us. | 2779 | * materialize from under us. |
2769 | */ | 2780 | */ |
2770 | split_huge_page_pmd_mm(mm, address, pmd); | 2781 | split_huge_page_pmd_mm(mm, address, pmd); |
2771 | } | 2782 | } |
2772 | 2783 | ||
2773 | void __vma_adjust_trans_huge(struct vm_area_struct *vma, | 2784 | void __vma_adjust_trans_huge(struct vm_area_struct *vma, |
2774 | unsigned long start, | 2785 | unsigned long start, |
2775 | unsigned long end, | 2786 | unsigned long end, |
2776 | long adjust_next) | 2787 | long adjust_next) |
2777 | { | 2788 | { |
2778 | /* | 2789 | /* |
2779 | * If the new start address isn't hpage aligned and it could | 2790 | * If the new start address isn't hpage aligned and it could |
2780 | * previously contain a hugepage: check if we need to split | 2791 | * previously contain a hugepage: check if we need to split |
2781 | * a huge pmd. | 2792 | * a huge pmd. |
2782 | */ | 2793 | */ |
2783 | if (start & ~HPAGE_PMD_MASK && | 2794 | if (start & ~HPAGE_PMD_MASK && |
2784 | (start & HPAGE_PMD_MASK) >= vma->vm_start && | 2795 | (start & HPAGE_PMD_MASK) >= vma->vm_start && |
2785 | (start & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= vma->vm_end) | 2796 | (start & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= vma->vm_end) |
2786 | split_huge_page_address(vma->vm_mm, start); | 2797 | split_huge_page_address(vma->vm_mm, start); |
2787 | 2798 | ||
2788 | /* | 2799 | /* |
2789 | * If the new end address isn't hpage aligned and it could | 2800 | * If the new end address isn't hpage aligned and it could |
2790 | * previously contain a hugepage: check if we need to split | 2801 | * previously contain a hugepage: check if we need to split |
2791 | * a huge pmd. | 2802 | * a huge pmd. |
2792 | */ | 2803 | */ |
2793 | if (end & ~HPAGE_PMD_MASK && | 2804 | if (end & ~HPAGE_PMD_MASK && |
2794 | (end & HPAGE_PMD_MASK) >= vma->vm_start && | 2805 | (end & HPAGE_PMD_MASK) >= vma->vm_start && |
2795 | (end & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= vma->vm_end) | 2806 | (end & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= vma->vm_end) |
2796 | split_huge_page_address(vma->vm_mm, end); | 2807 | split_huge_page_address(vma->vm_mm, end); |
2797 | 2808 | ||
2798 | /* | 2809 | /* |
2799 | * If we're also updating the vma->vm_next->vm_start and the new | 2810 | * If we're also updating the vma->vm_next->vm_start and the new |
2800 | * vm_next->vm_start isn't hpage aligned and it could previously | 2811 | * vm_next->vm_start isn't hpage aligned and it could previously |
2801 | * contain a hugepage: check if we need to split a huge pmd. | 2812 | * contain a hugepage: check if we need to split a huge pmd. |
2802 | */ | 2813 | */ |
2803 | if (adjust_next > 0) { | 2814 | if (adjust_next > 0) { |
2804 | struct vm_area_struct *next = vma->vm_next; | 2815 | struct vm_area_struct *next = vma->vm_next; |
2805 | unsigned long nstart = next->vm_start; | 2816 | unsigned long nstart = next->vm_start; |
2806 | nstart += adjust_next << PAGE_SHIFT; | 2817 | nstart += adjust_next << PAGE_SHIFT; |
2807 | if (nstart & ~HPAGE_PMD_MASK && | 2818 | if (nstart & ~HPAGE_PMD_MASK && |
2808 | (nstart & HPAGE_PMD_MASK) >= next->vm_start && | 2819 | (nstart & HPAGE_PMD_MASK) >= next->vm_start && |
2809 | (nstart & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= next->vm_end) | 2820 | (nstart & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= next->vm_end) |
2810 | split_huge_page_address(next->vm_mm, nstart); | 2821 | split_huge_page_address(next->vm_mm, nstart); |
2811 | } | 2822 | } |
2812 | } | 2823 | } |
2813 | 2824 |