Commit 6a01f8dd2508cf79abbdccc44a6a41b2e17fb3cb
Committed by: Jiri Slaby
1 parent: 1d08848674
Exists in: ti-linux-3.12.y and in 2 other branches

mm, thp: only collapse hugepages to nodes with affinity for zone_reclaim_mode
commit 14a4e2141e24304fff2c697be6382ffb83888185 upstream.

Commit 9f1b868a13ac ("mm: thp: khugepaged: add policy for finding target node") improved the previous khugepaged logic, which allocated a transparent hugepage from the node of the first page being collapsed.

However, it is still possible to collapse pages to remote memory, which may suffer from additional access latency. With the current policy, it is possible that 255 pages (with PAGE_SHIFT == 12) will be collapsed remotely if the majority are allocated from that node.

When zone_reclaim_mode is enabled, the VM should make every attempt to allocate locally to prevent NUMA performance degradation. In this case, we do not want to collapse hugepages to remote nodes that would suffer from increased access latency. Thus, when zone_reclaim_mode is enabled, only allow collapsing to nodes at RECLAIM_DISTANCE or less.

There is no functional change for systems that disable zone_reclaim_mode.

Signed-off-by: David Rientjes <rientjes@google.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Acked-by: Mel Gorman <mgorman@suse.de>
Cc: Rik van Riel <riel@redhat.com>
Cc: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Cc: Bob Liu <bob.liu@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Mel Gorman <mgorman@suse.de>
Signed-off-by: Jiri Slaby <jslaby@suse.cz>
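For reference, the upstream change centres on a khugepaged helper that vetoes a collapse when the target node is too far from the nodes already backing the range. The sketch below illustrates that policy check only; it assumes the khugepaged_node_load[] per-node counters introduced by commit 9f1b868a13ac and the existing zone_reclaim_mode, node_distance() and RECLAIM_DISTANCE kernel symbols, and is a simplified rendering rather than the exact hunk applied to this branch.

static bool khugepaged_scan_abort(int nid)
{
        int i;

        /*
         * If zone_reclaim_mode is disabled, there is no requirement to
         * allocate locally, so never abort the collapse.
         */
        if (!zone_reclaim_mode)
                return false;

        /* Pages from this node are already part of the candidate range. */
        if (khugepaged_node_load[nid])
                return false;

        /*
         * Abort if any node already contributing pages is further away
         * than RECLAIM_DISTANCE: collapsing there would trade a TLB win
         * for permanently slower remote accesses.
         */
        for (i = 0; i < MAX_NUMNODES; i++) {
                if (!khugepaged_node_load[i])
                        continue;
                if (node_distance(nid, i) > RECLAIM_DISTANCE)
                        return true;
        }
        return false;
}

The scan path (khugepaged_scan_pmd()) would consult such a check while tallying the source nodes of the PTEs, so a scan is abandoned before a remote hugepage is ever allocated; systems with zone_reclaim_mode disabled see no functional change.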
Showing 1 changed file with 26 additions and 0 deletions (inline diff)
mm/huge_memory.c
1 | /* | 1 | /* |
2 | * Copyright (C) 2009 Red Hat, Inc. | 2 | * Copyright (C) 2009 Red Hat, Inc. |
3 | * | 3 | * |
4 | * This work is licensed under the terms of the GNU GPL, version 2. See | 4 | * This work is licensed under the terms of the GNU GPL, version 2. See |
5 | * the COPYING file in the top-level directory. | 5 | * the COPYING file in the top-level directory. |
6 | */ | 6 | */ |
7 | 7 | ||
8 | #include <linux/mm.h> | 8 | #include <linux/mm.h> |
9 | #include <linux/sched.h> | 9 | #include <linux/sched.h> |
10 | #include <linux/highmem.h> | 10 | #include <linux/highmem.h> |
11 | #include <linux/hugetlb.h> | 11 | #include <linux/hugetlb.h> |
12 | #include <linux/mmu_notifier.h> | 12 | #include <linux/mmu_notifier.h> |
13 | #include <linux/rmap.h> | 13 | #include <linux/rmap.h> |
14 | #include <linux/swap.h> | 14 | #include <linux/swap.h> |
15 | #include <linux/shrinker.h> | 15 | #include <linux/shrinker.h> |
16 | #include <linux/mm_inline.h> | 16 | #include <linux/mm_inline.h> |
17 | #include <linux/kthread.h> | 17 | #include <linux/kthread.h> |
18 | #include <linux/khugepaged.h> | 18 | #include <linux/khugepaged.h> |
19 | #include <linux/freezer.h> | 19 | #include <linux/freezer.h> |
20 | #include <linux/mman.h> | 20 | #include <linux/mman.h> |
21 | #include <linux/pagemap.h> | 21 | #include <linux/pagemap.h> |
22 | #include <linux/migrate.h> | 22 | #include <linux/migrate.h> |
23 | #include <linux/hashtable.h> | 23 | #include <linux/hashtable.h> |
24 | 24 | ||
25 | #include <asm/tlb.h> | 25 | #include <asm/tlb.h> |
26 | #include <asm/pgalloc.h> | 26 | #include <asm/pgalloc.h> |
27 | #include "internal.h" | 27 | #include "internal.h" |
28 | 28 | ||
29 | /* | 29 | /* |
30 | * By default transparent hugepage support is enabled for all mappings | 30 | * By default transparent hugepage support is enabled for all mappings |
31 | * and khugepaged scans all mappings. Defrag is only invoked by | 31 | * and khugepaged scans all mappings. Defrag is only invoked by |
32 | * khugepaged hugepage allocations and by page faults inside | 32 | * khugepaged hugepage allocations and by page faults inside |
33 | * MADV_HUGEPAGE regions to avoid the risk of slowing down short lived | 33 | * MADV_HUGEPAGE regions to avoid the risk of slowing down short lived |
34 | * allocations. | 34 | * allocations. |
35 | */ | 35 | */ |
36 | unsigned long transparent_hugepage_flags __read_mostly = | 36 | unsigned long transparent_hugepage_flags __read_mostly = |
37 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE_ALWAYS | 37 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE_ALWAYS |
38 | (1<<TRANSPARENT_HUGEPAGE_FLAG)| | 38 | (1<<TRANSPARENT_HUGEPAGE_FLAG)| |
39 | #endif | 39 | #endif |
40 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE_MADVISE | 40 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE_MADVISE |
41 | (1<<TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG)| | 41 | (1<<TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG)| |
42 | #endif | 42 | #endif |
43 | (1<<TRANSPARENT_HUGEPAGE_DEFRAG_FLAG)| | 43 | (1<<TRANSPARENT_HUGEPAGE_DEFRAG_FLAG)| |
44 | (1<<TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG)| | 44 | (1<<TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG)| |
45 | (1<<TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG); | 45 | (1<<TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG); |
46 | 46 | ||
47 | /* default scan 8*512 pte (or vmas) every 30 second */ | 47 | /* default scan 8*512 pte (or vmas) every 30 second */ |
48 | static unsigned int khugepaged_pages_to_scan __read_mostly = HPAGE_PMD_NR*8; | 48 | static unsigned int khugepaged_pages_to_scan __read_mostly = HPAGE_PMD_NR*8; |
49 | static unsigned int khugepaged_pages_collapsed; | 49 | static unsigned int khugepaged_pages_collapsed; |
50 | static unsigned int khugepaged_full_scans; | 50 | static unsigned int khugepaged_full_scans; |
51 | static unsigned int khugepaged_scan_sleep_millisecs __read_mostly = 10000; | 51 | static unsigned int khugepaged_scan_sleep_millisecs __read_mostly = 10000; |
52 | /* during fragmentation poll the hugepage allocator once every minute */ | 52 | /* during fragmentation poll the hugepage allocator once every minute */ |
53 | static unsigned int khugepaged_alloc_sleep_millisecs __read_mostly = 60000; | 53 | static unsigned int khugepaged_alloc_sleep_millisecs __read_mostly = 60000; |
54 | static struct task_struct *khugepaged_thread __read_mostly; | 54 | static struct task_struct *khugepaged_thread __read_mostly; |
55 | static DEFINE_MUTEX(khugepaged_mutex); | 55 | static DEFINE_MUTEX(khugepaged_mutex); |
56 | static DEFINE_SPINLOCK(khugepaged_mm_lock); | 56 | static DEFINE_SPINLOCK(khugepaged_mm_lock); |
57 | static DECLARE_WAIT_QUEUE_HEAD(khugepaged_wait); | 57 | static DECLARE_WAIT_QUEUE_HEAD(khugepaged_wait); |
58 | /* | 58 | /* |
59 | * default collapse hugepages if there is at least one pte mapped like | 59 | * default collapse hugepages if there is at least one pte mapped like |
60 | * it would have happened if the vma was large enough during page | 60 | * it would have happened if the vma was large enough during page |
61 | * fault. | 61 | * fault. |
62 | */ | 62 | */ |
63 | static unsigned int khugepaged_max_ptes_none __read_mostly = HPAGE_PMD_NR-1; | 63 | static unsigned int khugepaged_max_ptes_none __read_mostly = HPAGE_PMD_NR-1; |
64 | 64 | ||
65 | static int khugepaged(void *none); | 65 | static int khugepaged(void *none); |
66 | static int khugepaged_slab_init(void); | 66 | static int khugepaged_slab_init(void); |
67 | 67 | ||
68 | #define MM_SLOTS_HASH_BITS 10 | 68 | #define MM_SLOTS_HASH_BITS 10 |
69 | static __read_mostly DEFINE_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS); | 69 | static __read_mostly DEFINE_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS); |
70 | 70 | ||
71 | static struct kmem_cache *mm_slot_cache __read_mostly; | 71 | static struct kmem_cache *mm_slot_cache __read_mostly; |
72 | 72 | ||
73 | /** | 73 | /** |
74 | * struct mm_slot - hash lookup from mm to mm_slot | 74 | * struct mm_slot - hash lookup from mm to mm_slot |
75 | * @hash: hash collision list | 75 | * @hash: hash collision list |
76 | * @mm_node: khugepaged scan list headed in khugepaged_scan.mm_head | 76 | * @mm_node: khugepaged scan list headed in khugepaged_scan.mm_head |
77 | * @mm: the mm that this information is valid for | 77 | * @mm: the mm that this information is valid for |
78 | */ | 78 | */ |
79 | struct mm_slot { | 79 | struct mm_slot { |
80 | struct hlist_node hash; | 80 | struct hlist_node hash; |
81 | struct list_head mm_node; | 81 | struct list_head mm_node; |
82 | struct mm_struct *mm; | 82 | struct mm_struct *mm; |
83 | }; | 83 | }; |
84 | 84 | ||
85 | /** | 85 | /** |
86 | * struct khugepaged_scan - cursor for scanning | 86 | * struct khugepaged_scan - cursor for scanning |
87 | * @mm_head: the head of the mm list to scan | 87 | * @mm_head: the head of the mm list to scan |
88 | * @mm_slot: the current mm_slot we are scanning | 88 | * @mm_slot: the current mm_slot we are scanning |
89 | * @address: the next address inside that to be scanned | 89 | * @address: the next address inside that to be scanned |
90 | * | 90 | * |
91 | * There is only the one khugepaged_scan instance of this cursor structure. | 91 | * There is only the one khugepaged_scan instance of this cursor structure. |
92 | */ | 92 | */ |
93 | struct khugepaged_scan { | 93 | struct khugepaged_scan { |
94 | struct list_head mm_head; | 94 | struct list_head mm_head; |
95 | struct mm_slot *mm_slot; | 95 | struct mm_slot *mm_slot; |
96 | unsigned long address; | 96 | unsigned long address; |
97 | }; | 97 | }; |
98 | static struct khugepaged_scan khugepaged_scan = { | 98 | static struct khugepaged_scan khugepaged_scan = { |
99 | .mm_head = LIST_HEAD_INIT(khugepaged_scan.mm_head), | 99 | .mm_head = LIST_HEAD_INIT(khugepaged_scan.mm_head), |
100 | }; | 100 | }; |
101 | 101 | ||
102 | 102 | ||
103 | static int set_recommended_min_free_kbytes(void) | 103 | static int set_recommended_min_free_kbytes(void) |
104 | { | 104 | { |
105 | struct zone *zone; | 105 | struct zone *zone; |
106 | int nr_zones = 0; | 106 | int nr_zones = 0; |
107 | unsigned long recommended_min; | 107 | unsigned long recommended_min; |
108 | 108 | ||
109 | if (!khugepaged_enabled()) | 109 | if (!khugepaged_enabled()) |
110 | return 0; | 110 | return 0; |
111 | 111 | ||
112 | for_each_populated_zone(zone) | 112 | for_each_populated_zone(zone) |
113 | nr_zones++; | 113 | nr_zones++; |
114 | 114 | ||
115 | /* Make sure at least 2 hugepages are free for MIGRATE_RESERVE */ | 115 | /* Make sure at least 2 hugepages are free for MIGRATE_RESERVE */ |
116 | recommended_min = pageblock_nr_pages * nr_zones * 2; | 116 | recommended_min = pageblock_nr_pages * nr_zones * 2; |
117 | 117 | ||
118 | /* | 118 | /* |
119 | * Make sure that on average at least two pageblocks are almost free | 119 | * Make sure that on average at least two pageblocks are almost free |
120 | * of another type, one for a migratetype to fall back to and a | 120 | * of another type, one for a migratetype to fall back to and a |
121 | * second to avoid subsequent fallbacks of other types There are 3 | 121 | * second to avoid subsequent fallbacks of other types There are 3 |
122 | * MIGRATE_TYPES we care about. | 122 | * MIGRATE_TYPES we care about. |
123 | */ | 123 | */ |
124 | recommended_min += pageblock_nr_pages * nr_zones * | 124 | recommended_min += pageblock_nr_pages * nr_zones * |
125 | MIGRATE_PCPTYPES * MIGRATE_PCPTYPES; | 125 | MIGRATE_PCPTYPES * MIGRATE_PCPTYPES; |
126 | 126 | ||
127 | /* don't ever allow to reserve more than 5% of the lowmem */ | 127 | /* don't ever allow to reserve more than 5% of the lowmem */ |
128 | recommended_min = min(recommended_min, | 128 | recommended_min = min(recommended_min, |
129 | (unsigned long) nr_free_buffer_pages() / 20); | 129 | (unsigned long) nr_free_buffer_pages() / 20); |
130 | recommended_min <<= (PAGE_SHIFT-10); | 130 | recommended_min <<= (PAGE_SHIFT-10); |
131 | 131 | ||
132 | if (recommended_min > min_free_kbytes) | 132 | if (recommended_min > min_free_kbytes) |
133 | min_free_kbytes = recommended_min; | 133 | min_free_kbytes = recommended_min; |
134 | setup_per_zone_wmarks(); | 134 | setup_per_zone_wmarks(); |
135 | return 0; | 135 | return 0; |
136 | } | 136 | } |
137 | late_initcall(set_recommended_min_free_kbytes); | 137 | late_initcall(set_recommended_min_free_kbytes); |
138 | 138 | ||
139 | static int start_khugepaged(void) | 139 | static int start_khugepaged(void) |
140 | { | 140 | { |
141 | int err = 0; | 141 | int err = 0; |
142 | if (khugepaged_enabled()) { | 142 | if (khugepaged_enabled()) { |
143 | if (!khugepaged_thread) | 143 | if (!khugepaged_thread) |
144 | khugepaged_thread = kthread_run(khugepaged, NULL, | 144 | khugepaged_thread = kthread_run(khugepaged, NULL, |
145 | "khugepaged"); | 145 | "khugepaged"); |
146 | if (unlikely(IS_ERR(khugepaged_thread))) { | 146 | if (unlikely(IS_ERR(khugepaged_thread))) { |
147 | printk(KERN_ERR | 147 | printk(KERN_ERR |
148 | "khugepaged: kthread_run(khugepaged) failed\n"); | 148 | "khugepaged: kthread_run(khugepaged) failed\n"); |
149 | err = PTR_ERR(khugepaged_thread); | 149 | err = PTR_ERR(khugepaged_thread); |
150 | khugepaged_thread = NULL; | 150 | khugepaged_thread = NULL; |
151 | } | 151 | } |
152 | 152 | ||
153 | if (!list_empty(&khugepaged_scan.mm_head)) | 153 | if (!list_empty(&khugepaged_scan.mm_head)) |
154 | wake_up_interruptible(&khugepaged_wait); | 154 | wake_up_interruptible(&khugepaged_wait); |
155 | 155 | ||
156 | set_recommended_min_free_kbytes(); | 156 | set_recommended_min_free_kbytes(); |
157 | } else if (khugepaged_thread) { | 157 | } else if (khugepaged_thread) { |
158 | kthread_stop(khugepaged_thread); | 158 | kthread_stop(khugepaged_thread); |
159 | khugepaged_thread = NULL; | 159 | khugepaged_thread = NULL; |
160 | } | 160 | } |
161 | 161 | ||
162 | return err; | 162 | return err; |
163 | } | 163 | } |
164 | 164 | ||
165 | static atomic_t huge_zero_refcount; | 165 | static atomic_t huge_zero_refcount; |
166 | static struct page *huge_zero_page __read_mostly; | 166 | static struct page *huge_zero_page __read_mostly; |
167 | 167 | ||
168 | static inline bool is_huge_zero_page(struct page *page) | 168 | static inline bool is_huge_zero_page(struct page *page) |
169 | { | 169 | { |
170 | return ACCESS_ONCE(huge_zero_page) == page; | 170 | return ACCESS_ONCE(huge_zero_page) == page; |
171 | } | 171 | } |
172 | 172 | ||
173 | static inline bool is_huge_zero_pmd(pmd_t pmd) | 173 | static inline bool is_huge_zero_pmd(pmd_t pmd) |
174 | { | 174 | { |
175 | return is_huge_zero_page(pmd_page(pmd)); | 175 | return is_huge_zero_page(pmd_page(pmd)); |
176 | } | 176 | } |
177 | 177 | ||
178 | static struct page *get_huge_zero_page(void) | 178 | static struct page *get_huge_zero_page(void) |
179 | { | 179 | { |
180 | struct page *zero_page; | 180 | struct page *zero_page; |
181 | retry: | 181 | retry: |
182 | if (likely(atomic_inc_not_zero(&huge_zero_refcount))) | 182 | if (likely(atomic_inc_not_zero(&huge_zero_refcount))) |
183 | return ACCESS_ONCE(huge_zero_page); | 183 | return ACCESS_ONCE(huge_zero_page); |
184 | 184 | ||
185 | zero_page = alloc_pages((GFP_TRANSHUGE | __GFP_ZERO) & ~__GFP_MOVABLE, | 185 | zero_page = alloc_pages((GFP_TRANSHUGE | __GFP_ZERO) & ~__GFP_MOVABLE, |
186 | HPAGE_PMD_ORDER); | 186 | HPAGE_PMD_ORDER); |
187 | if (!zero_page) { | 187 | if (!zero_page) { |
188 | count_vm_event(THP_ZERO_PAGE_ALLOC_FAILED); | 188 | count_vm_event(THP_ZERO_PAGE_ALLOC_FAILED); |
189 | return NULL; | 189 | return NULL; |
190 | } | 190 | } |
191 | count_vm_event(THP_ZERO_PAGE_ALLOC); | 191 | count_vm_event(THP_ZERO_PAGE_ALLOC); |
192 | preempt_disable(); | 192 | preempt_disable(); |
193 | if (cmpxchg(&huge_zero_page, NULL, zero_page)) { | 193 | if (cmpxchg(&huge_zero_page, NULL, zero_page)) { |
194 | preempt_enable(); | 194 | preempt_enable(); |
195 | __free_page(zero_page); | 195 | __free_page(zero_page); |
196 | goto retry; | 196 | goto retry; |
197 | } | 197 | } |
198 | 198 | ||
199 | /* We take additional reference here. It will be put back by shrinker */ | 199 | /* We take additional reference here. It will be put back by shrinker */ |
200 | atomic_set(&huge_zero_refcount, 2); | 200 | atomic_set(&huge_zero_refcount, 2); |
201 | preempt_enable(); | 201 | preempt_enable(); |
202 | return ACCESS_ONCE(huge_zero_page); | 202 | return ACCESS_ONCE(huge_zero_page); |
203 | } | 203 | } |
204 | 204 | ||
205 | static void put_huge_zero_page(void) | 205 | static void put_huge_zero_page(void) |
206 | { | 206 | { |
207 | /* | 207 | /* |
208 | * Counter should never go to zero here. Only shrinker can put | 208 | * Counter should never go to zero here. Only shrinker can put |
209 | * last reference. | 209 | * last reference. |
210 | */ | 210 | */ |
211 | BUG_ON(atomic_dec_and_test(&huge_zero_refcount)); | 211 | BUG_ON(atomic_dec_and_test(&huge_zero_refcount)); |
212 | } | 212 | } |
213 | 213 | ||
214 | static unsigned long shrink_huge_zero_page_count(struct shrinker *shrink, | 214 | static unsigned long shrink_huge_zero_page_count(struct shrinker *shrink, |
215 | struct shrink_control *sc) | 215 | struct shrink_control *sc) |
216 | { | 216 | { |
217 | /* we can free zero page only if last reference remains */ | 217 | /* we can free zero page only if last reference remains */ |
218 | return atomic_read(&huge_zero_refcount) == 1 ? HPAGE_PMD_NR : 0; | 218 | return atomic_read(&huge_zero_refcount) == 1 ? HPAGE_PMD_NR : 0; |
219 | } | 219 | } |
220 | 220 | ||
221 | static unsigned long shrink_huge_zero_page_scan(struct shrinker *shrink, | 221 | static unsigned long shrink_huge_zero_page_scan(struct shrinker *shrink, |
222 | struct shrink_control *sc) | 222 | struct shrink_control *sc) |
223 | { | 223 | { |
224 | if (atomic_cmpxchg(&huge_zero_refcount, 1, 0) == 1) { | 224 | if (atomic_cmpxchg(&huge_zero_refcount, 1, 0) == 1) { |
225 | struct page *zero_page = xchg(&huge_zero_page, NULL); | 225 | struct page *zero_page = xchg(&huge_zero_page, NULL); |
226 | BUG_ON(zero_page == NULL); | 226 | BUG_ON(zero_page == NULL); |
227 | __free_page(zero_page); | 227 | __free_page(zero_page); |
228 | return HPAGE_PMD_NR; | 228 | return HPAGE_PMD_NR; |
229 | } | 229 | } |
230 | 230 | ||
231 | return 0; | 231 | return 0; |
232 | } | 232 | } |
233 | 233 | ||
234 | static struct shrinker huge_zero_page_shrinker = { | 234 | static struct shrinker huge_zero_page_shrinker = { |
235 | .count_objects = shrink_huge_zero_page_count, | 235 | .count_objects = shrink_huge_zero_page_count, |
236 | .scan_objects = shrink_huge_zero_page_scan, | 236 | .scan_objects = shrink_huge_zero_page_scan, |
237 | .seeks = DEFAULT_SEEKS, | 237 | .seeks = DEFAULT_SEEKS, |
238 | }; | 238 | }; |
239 | 239 | ||
240 | #ifdef CONFIG_SYSFS | 240 | #ifdef CONFIG_SYSFS |
241 | 241 | ||
242 | static ssize_t double_flag_show(struct kobject *kobj, | 242 | static ssize_t double_flag_show(struct kobject *kobj, |
243 | struct kobj_attribute *attr, char *buf, | 243 | struct kobj_attribute *attr, char *buf, |
244 | enum transparent_hugepage_flag enabled, | 244 | enum transparent_hugepage_flag enabled, |
245 | enum transparent_hugepage_flag req_madv) | 245 | enum transparent_hugepage_flag req_madv) |
246 | { | 246 | { |
247 | if (test_bit(enabled, &transparent_hugepage_flags)) { | 247 | if (test_bit(enabled, &transparent_hugepage_flags)) { |
248 | VM_BUG_ON(test_bit(req_madv, &transparent_hugepage_flags)); | 248 | VM_BUG_ON(test_bit(req_madv, &transparent_hugepage_flags)); |
249 | return sprintf(buf, "[always] madvise never\n"); | 249 | return sprintf(buf, "[always] madvise never\n"); |
250 | } else if (test_bit(req_madv, &transparent_hugepage_flags)) | 250 | } else if (test_bit(req_madv, &transparent_hugepage_flags)) |
251 | return sprintf(buf, "always [madvise] never\n"); | 251 | return sprintf(buf, "always [madvise] never\n"); |
252 | else | 252 | else |
253 | return sprintf(buf, "always madvise [never]\n"); | 253 | return sprintf(buf, "always madvise [never]\n"); |
254 | } | 254 | } |
255 | static ssize_t double_flag_store(struct kobject *kobj, | 255 | static ssize_t double_flag_store(struct kobject *kobj, |
256 | struct kobj_attribute *attr, | 256 | struct kobj_attribute *attr, |
257 | const char *buf, size_t count, | 257 | const char *buf, size_t count, |
258 | enum transparent_hugepage_flag enabled, | 258 | enum transparent_hugepage_flag enabled, |
259 | enum transparent_hugepage_flag req_madv) | 259 | enum transparent_hugepage_flag req_madv) |
260 | { | 260 | { |
261 | if (!memcmp("always", buf, | 261 | if (!memcmp("always", buf, |
262 | min(sizeof("always")-1, count))) { | 262 | min(sizeof("always")-1, count))) { |
263 | set_bit(enabled, &transparent_hugepage_flags); | 263 | set_bit(enabled, &transparent_hugepage_flags); |
264 | clear_bit(req_madv, &transparent_hugepage_flags); | 264 | clear_bit(req_madv, &transparent_hugepage_flags); |
265 | } else if (!memcmp("madvise", buf, | 265 | } else if (!memcmp("madvise", buf, |
266 | min(sizeof("madvise")-1, count))) { | 266 | min(sizeof("madvise")-1, count))) { |
267 | clear_bit(enabled, &transparent_hugepage_flags); | 267 | clear_bit(enabled, &transparent_hugepage_flags); |
268 | set_bit(req_madv, &transparent_hugepage_flags); | 268 | set_bit(req_madv, &transparent_hugepage_flags); |
269 | } else if (!memcmp("never", buf, | 269 | } else if (!memcmp("never", buf, |
270 | min(sizeof("never")-1, count))) { | 270 | min(sizeof("never")-1, count))) { |
271 | clear_bit(enabled, &transparent_hugepage_flags); | 271 | clear_bit(enabled, &transparent_hugepage_flags); |
272 | clear_bit(req_madv, &transparent_hugepage_flags); | 272 | clear_bit(req_madv, &transparent_hugepage_flags); |
273 | } else | 273 | } else |
274 | return -EINVAL; | 274 | return -EINVAL; |
275 | 275 | ||
276 | return count; | 276 | return count; |
277 | } | 277 | } |
278 | 278 | ||
279 | static ssize_t enabled_show(struct kobject *kobj, | 279 | static ssize_t enabled_show(struct kobject *kobj, |
280 | struct kobj_attribute *attr, char *buf) | 280 | struct kobj_attribute *attr, char *buf) |
281 | { | 281 | { |
282 | return double_flag_show(kobj, attr, buf, | 282 | return double_flag_show(kobj, attr, buf, |
283 | TRANSPARENT_HUGEPAGE_FLAG, | 283 | TRANSPARENT_HUGEPAGE_FLAG, |
284 | TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG); | 284 | TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG); |
285 | } | 285 | } |
286 | static ssize_t enabled_store(struct kobject *kobj, | 286 | static ssize_t enabled_store(struct kobject *kobj, |
287 | struct kobj_attribute *attr, | 287 | struct kobj_attribute *attr, |
288 | const char *buf, size_t count) | 288 | const char *buf, size_t count) |
289 | { | 289 | { |
290 | ssize_t ret; | 290 | ssize_t ret; |
291 | 291 | ||
292 | ret = double_flag_store(kobj, attr, buf, count, | 292 | ret = double_flag_store(kobj, attr, buf, count, |
293 | TRANSPARENT_HUGEPAGE_FLAG, | 293 | TRANSPARENT_HUGEPAGE_FLAG, |
294 | TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG); | 294 | TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG); |
295 | 295 | ||
296 | if (ret > 0) { | 296 | if (ret > 0) { |
297 | int err; | 297 | int err; |
298 | 298 | ||
299 | mutex_lock(&khugepaged_mutex); | 299 | mutex_lock(&khugepaged_mutex); |
300 | err = start_khugepaged(); | 300 | err = start_khugepaged(); |
301 | mutex_unlock(&khugepaged_mutex); | 301 | mutex_unlock(&khugepaged_mutex); |
302 | 302 | ||
303 | if (err) | 303 | if (err) |
304 | ret = err; | 304 | ret = err; |
305 | } | 305 | } |
306 | 306 | ||
307 | return ret; | 307 | return ret; |
308 | } | 308 | } |
309 | static struct kobj_attribute enabled_attr = | 309 | static struct kobj_attribute enabled_attr = |
310 | __ATTR(enabled, 0644, enabled_show, enabled_store); | 310 | __ATTR(enabled, 0644, enabled_show, enabled_store); |
311 | 311 | ||
312 | static ssize_t single_flag_show(struct kobject *kobj, | 312 | static ssize_t single_flag_show(struct kobject *kobj, |
313 | struct kobj_attribute *attr, char *buf, | 313 | struct kobj_attribute *attr, char *buf, |
314 | enum transparent_hugepage_flag flag) | 314 | enum transparent_hugepage_flag flag) |
315 | { | 315 | { |
316 | return sprintf(buf, "%d\n", | 316 | return sprintf(buf, "%d\n", |
317 | !!test_bit(flag, &transparent_hugepage_flags)); | 317 | !!test_bit(flag, &transparent_hugepage_flags)); |
318 | } | 318 | } |
319 | 319 | ||
320 | static ssize_t single_flag_store(struct kobject *kobj, | 320 | static ssize_t single_flag_store(struct kobject *kobj, |
321 | struct kobj_attribute *attr, | 321 | struct kobj_attribute *attr, |
322 | const char *buf, size_t count, | 322 | const char *buf, size_t count, |
323 | enum transparent_hugepage_flag flag) | 323 | enum transparent_hugepage_flag flag) |
324 | { | 324 | { |
325 | unsigned long value; | 325 | unsigned long value; |
326 | int ret; | 326 | int ret; |
327 | 327 | ||
328 | ret = kstrtoul(buf, 10, &value); | 328 | ret = kstrtoul(buf, 10, &value); |
329 | if (ret < 0) | 329 | if (ret < 0) |
330 | return ret; | 330 | return ret; |
331 | if (value > 1) | 331 | if (value > 1) |
332 | return -EINVAL; | 332 | return -EINVAL; |
333 | 333 | ||
334 | if (value) | 334 | if (value) |
335 | set_bit(flag, &transparent_hugepage_flags); | 335 | set_bit(flag, &transparent_hugepage_flags); |
336 | else | 336 | else |
337 | clear_bit(flag, &transparent_hugepage_flags); | 337 | clear_bit(flag, &transparent_hugepage_flags); |
338 | 338 | ||
339 | return count; | 339 | return count; |
340 | } | 340 | } |
341 | 341 | ||
342 | /* | 342 | /* |
343 | * Currently defrag only disables __GFP_NOWAIT for allocation. A blind | 343 | * Currently defrag only disables __GFP_NOWAIT for allocation. A blind |
344 | * __GFP_REPEAT is too aggressive, it's never worth swapping tons of | 344 | * __GFP_REPEAT is too aggressive, it's never worth swapping tons of |
345 | * memory just to allocate one more hugepage. | 345 | * memory just to allocate one more hugepage. |
346 | */ | 346 | */ |
347 | static ssize_t defrag_show(struct kobject *kobj, | 347 | static ssize_t defrag_show(struct kobject *kobj, |
348 | struct kobj_attribute *attr, char *buf) | 348 | struct kobj_attribute *attr, char *buf) |
349 | { | 349 | { |
350 | return double_flag_show(kobj, attr, buf, | 350 | return double_flag_show(kobj, attr, buf, |
351 | TRANSPARENT_HUGEPAGE_DEFRAG_FLAG, | 351 | TRANSPARENT_HUGEPAGE_DEFRAG_FLAG, |
352 | TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG); | 352 | TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG); |
353 | } | 353 | } |
354 | static ssize_t defrag_store(struct kobject *kobj, | 354 | static ssize_t defrag_store(struct kobject *kobj, |
355 | struct kobj_attribute *attr, | 355 | struct kobj_attribute *attr, |
356 | const char *buf, size_t count) | 356 | const char *buf, size_t count) |
357 | { | 357 | { |
358 | return double_flag_store(kobj, attr, buf, count, | 358 | return double_flag_store(kobj, attr, buf, count, |
359 | TRANSPARENT_HUGEPAGE_DEFRAG_FLAG, | 359 | TRANSPARENT_HUGEPAGE_DEFRAG_FLAG, |
360 | TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG); | 360 | TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG); |
361 | } | 361 | } |
362 | static struct kobj_attribute defrag_attr = | 362 | static struct kobj_attribute defrag_attr = |
363 | __ATTR(defrag, 0644, defrag_show, defrag_store); | 363 | __ATTR(defrag, 0644, defrag_show, defrag_store); |
364 | 364 | ||
365 | static ssize_t use_zero_page_show(struct kobject *kobj, | 365 | static ssize_t use_zero_page_show(struct kobject *kobj, |
366 | struct kobj_attribute *attr, char *buf) | 366 | struct kobj_attribute *attr, char *buf) |
367 | { | 367 | { |
368 | return single_flag_show(kobj, attr, buf, | 368 | return single_flag_show(kobj, attr, buf, |
369 | TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG); | 369 | TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG); |
370 | } | 370 | } |
371 | static ssize_t use_zero_page_store(struct kobject *kobj, | 371 | static ssize_t use_zero_page_store(struct kobject *kobj, |
372 | struct kobj_attribute *attr, const char *buf, size_t count) | 372 | struct kobj_attribute *attr, const char *buf, size_t count) |
373 | { | 373 | { |
374 | return single_flag_store(kobj, attr, buf, count, | 374 | return single_flag_store(kobj, attr, buf, count, |
375 | TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG); | 375 | TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG); |
376 | } | 376 | } |
377 | static struct kobj_attribute use_zero_page_attr = | 377 | static struct kobj_attribute use_zero_page_attr = |
378 | __ATTR(use_zero_page, 0644, use_zero_page_show, use_zero_page_store); | 378 | __ATTR(use_zero_page, 0644, use_zero_page_show, use_zero_page_store); |
379 | #ifdef CONFIG_DEBUG_VM | 379 | #ifdef CONFIG_DEBUG_VM |
380 | static ssize_t debug_cow_show(struct kobject *kobj, | 380 | static ssize_t debug_cow_show(struct kobject *kobj, |
381 | struct kobj_attribute *attr, char *buf) | 381 | struct kobj_attribute *attr, char *buf) |
382 | { | 382 | { |
383 | return single_flag_show(kobj, attr, buf, | 383 | return single_flag_show(kobj, attr, buf, |
384 | TRANSPARENT_HUGEPAGE_DEBUG_COW_FLAG); | 384 | TRANSPARENT_HUGEPAGE_DEBUG_COW_FLAG); |
385 | } | 385 | } |
386 | static ssize_t debug_cow_store(struct kobject *kobj, | 386 | static ssize_t debug_cow_store(struct kobject *kobj, |
387 | struct kobj_attribute *attr, | 387 | struct kobj_attribute *attr, |
388 | const char *buf, size_t count) | 388 | const char *buf, size_t count) |
389 | { | 389 | { |
390 | return single_flag_store(kobj, attr, buf, count, | 390 | return single_flag_store(kobj, attr, buf, count, |
391 | TRANSPARENT_HUGEPAGE_DEBUG_COW_FLAG); | 391 | TRANSPARENT_HUGEPAGE_DEBUG_COW_FLAG); |
392 | } | 392 | } |
393 | static struct kobj_attribute debug_cow_attr = | 393 | static struct kobj_attribute debug_cow_attr = |
394 | __ATTR(debug_cow, 0644, debug_cow_show, debug_cow_store); | 394 | __ATTR(debug_cow, 0644, debug_cow_show, debug_cow_store); |
395 | #endif /* CONFIG_DEBUG_VM */ | 395 | #endif /* CONFIG_DEBUG_VM */ |
396 | 396 | ||
397 | static struct attribute *hugepage_attr[] = { | 397 | static struct attribute *hugepage_attr[] = { |
398 | &enabled_attr.attr, | 398 | &enabled_attr.attr, |
399 | &defrag_attr.attr, | 399 | &defrag_attr.attr, |
400 | &use_zero_page_attr.attr, | 400 | &use_zero_page_attr.attr, |
401 | #ifdef CONFIG_DEBUG_VM | 401 | #ifdef CONFIG_DEBUG_VM |
402 | &debug_cow_attr.attr, | 402 | &debug_cow_attr.attr, |
403 | #endif | 403 | #endif |
404 | NULL, | 404 | NULL, |
405 | }; | 405 | }; |
406 | 406 | ||
407 | static struct attribute_group hugepage_attr_group = { | 407 | static struct attribute_group hugepage_attr_group = { |
408 | .attrs = hugepage_attr, | 408 | .attrs = hugepage_attr, |
409 | }; | 409 | }; |
410 | 410 | ||
411 | static ssize_t scan_sleep_millisecs_show(struct kobject *kobj, | 411 | static ssize_t scan_sleep_millisecs_show(struct kobject *kobj, |
412 | struct kobj_attribute *attr, | 412 | struct kobj_attribute *attr, |
413 | char *buf) | 413 | char *buf) |
414 | { | 414 | { |
415 | return sprintf(buf, "%u\n", khugepaged_scan_sleep_millisecs); | 415 | return sprintf(buf, "%u\n", khugepaged_scan_sleep_millisecs); |
416 | } | 416 | } |
417 | 417 | ||
418 | static ssize_t scan_sleep_millisecs_store(struct kobject *kobj, | 418 | static ssize_t scan_sleep_millisecs_store(struct kobject *kobj, |
419 | struct kobj_attribute *attr, | 419 | struct kobj_attribute *attr, |
420 | const char *buf, size_t count) | 420 | const char *buf, size_t count) |
421 | { | 421 | { |
422 | unsigned long msecs; | 422 | unsigned long msecs; |
423 | int err; | 423 | int err; |
424 | 424 | ||
425 | err = kstrtoul(buf, 10, &msecs); | 425 | err = kstrtoul(buf, 10, &msecs); |
426 | if (err || msecs > UINT_MAX) | 426 | if (err || msecs > UINT_MAX) |
427 | return -EINVAL; | 427 | return -EINVAL; |
428 | 428 | ||
429 | khugepaged_scan_sleep_millisecs = msecs; | 429 | khugepaged_scan_sleep_millisecs = msecs; |
430 | wake_up_interruptible(&khugepaged_wait); | 430 | wake_up_interruptible(&khugepaged_wait); |
431 | 431 | ||
432 | return count; | 432 | return count; |
433 | } | 433 | } |
434 | static struct kobj_attribute scan_sleep_millisecs_attr = | 434 | static struct kobj_attribute scan_sleep_millisecs_attr = |
435 | __ATTR(scan_sleep_millisecs, 0644, scan_sleep_millisecs_show, | 435 | __ATTR(scan_sleep_millisecs, 0644, scan_sleep_millisecs_show, |
436 | scan_sleep_millisecs_store); | 436 | scan_sleep_millisecs_store); |
437 | 437 | ||
438 | static ssize_t alloc_sleep_millisecs_show(struct kobject *kobj, | 438 | static ssize_t alloc_sleep_millisecs_show(struct kobject *kobj, |
439 | struct kobj_attribute *attr, | 439 | struct kobj_attribute *attr, |
440 | char *buf) | 440 | char *buf) |
441 | { | 441 | { |
442 | return sprintf(buf, "%u\n", khugepaged_alloc_sleep_millisecs); | 442 | return sprintf(buf, "%u\n", khugepaged_alloc_sleep_millisecs); |
443 | } | 443 | } |
444 | 444 | ||
445 | static ssize_t alloc_sleep_millisecs_store(struct kobject *kobj, | 445 | static ssize_t alloc_sleep_millisecs_store(struct kobject *kobj, |
446 | struct kobj_attribute *attr, | 446 | struct kobj_attribute *attr, |
447 | const char *buf, size_t count) | 447 | const char *buf, size_t count) |
448 | { | 448 | { |
449 | unsigned long msecs; | 449 | unsigned long msecs; |
450 | int err; | 450 | int err; |
451 | 451 | ||
452 | err = kstrtoul(buf, 10, &msecs); | 452 | err = kstrtoul(buf, 10, &msecs); |
453 | if (err || msecs > UINT_MAX) | 453 | if (err || msecs > UINT_MAX) |
454 | return -EINVAL; | 454 | return -EINVAL; |
455 | 455 | ||
456 | khugepaged_alloc_sleep_millisecs = msecs; | 456 | khugepaged_alloc_sleep_millisecs = msecs; |
457 | wake_up_interruptible(&khugepaged_wait); | 457 | wake_up_interruptible(&khugepaged_wait); |
458 | 458 | ||
459 | return count; | 459 | return count; |
460 | } | 460 | } |
461 | static struct kobj_attribute alloc_sleep_millisecs_attr = | 461 | static struct kobj_attribute alloc_sleep_millisecs_attr = |
462 | __ATTR(alloc_sleep_millisecs, 0644, alloc_sleep_millisecs_show, | 462 | __ATTR(alloc_sleep_millisecs, 0644, alloc_sleep_millisecs_show, |
463 | alloc_sleep_millisecs_store); | 463 | alloc_sleep_millisecs_store); |
464 | 464 | ||
465 | static ssize_t pages_to_scan_show(struct kobject *kobj, | 465 | static ssize_t pages_to_scan_show(struct kobject *kobj, |
466 | struct kobj_attribute *attr, | 466 | struct kobj_attribute *attr, |
467 | char *buf) | 467 | char *buf) |
468 | { | 468 | { |
469 | return sprintf(buf, "%u\n", khugepaged_pages_to_scan); | 469 | return sprintf(buf, "%u\n", khugepaged_pages_to_scan); |
470 | } | 470 | } |
471 | static ssize_t pages_to_scan_store(struct kobject *kobj, | 471 | static ssize_t pages_to_scan_store(struct kobject *kobj, |
472 | struct kobj_attribute *attr, | 472 | struct kobj_attribute *attr, |
473 | const char *buf, size_t count) | 473 | const char *buf, size_t count) |
474 | { | 474 | { |
475 | int err; | 475 | int err; |
476 | unsigned long pages; | 476 | unsigned long pages; |
477 | 477 | ||
478 | err = kstrtoul(buf, 10, &pages); | 478 | err = kstrtoul(buf, 10, &pages); |
479 | if (err || !pages || pages > UINT_MAX) | 479 | if (err || !pages || pages > UINT_MAX) |
480 | return -EINVAL; | 480 | return -EINVAL; |
481 | 481 | ||
482 | khugepaged_pages_to_scan = pages; | 482 | khugepaged_pages_to_scan = pages; |
483 | 483 | ||
484 | return count; | 484 | return count; |
485 | } | 485 | } |
486 | static struct kobj_attribute pages_to_scan_attr = | 486 | static struct kobj_attribute pages_to_scan_attr = |
487 | __ATTR(pages_to_scan, 0644, pages_to_scan_show, | 487 | __ATTR(pages_to_scan, 0644, pages_to_scan_show, |
488 | pages_to_scan_store); | 488 | pages_to_scan_store); |
489 | 489 | ||
490 | static ssize_t pages_collapsed_show(struct kobject *kobj, | 490 | static ssize_t pages_collapsed_show(struct kobject *kobj, |
491 | struct kobj_attribute *attr, | 491 | struct kobj_attribute *attr, |
492 | char *buf) | 492 | char *buf) |
493 | { | 493 | { |
494 | return sprintf(buf, "%u\n", khugepaged_pages_collapsed); | 494 | return sprintf(buf, "%u\n", khugepaged_pages_collapsed); |
495 | } | 495 | } |
496 | static struct kobj_attribute pages_collapsed_attr = | 496 | static struct kobj_attribute pages_collapsed_attr = |
497 | __ATTR_RO(pages_collapsed); | 497 | __ATTR_RO(pages_collapsed); |
498 | 498 | ||
499 | static ssize_t full_scans_show(struct kobject *kobj, | 499 | static ssize_t full_scans_show(struct kobject *kobj, |
500 | struct kobj_attribute *attr, | 500 | struct kobj_attribute *attr, |
501 | char *buf) | 501 | char *buf) |
502 | { | 502 | { |
503 | return sprintf(buf, "%u\n", khugepaged_full_scans); | 503 | return sprintf(buf, "%u\n", khugepaged_full_scans); |
504 | } | 504 | } |
505 | static struct kobj_attribute full_scans_attr = | 505 | static struct kobj_attribute full_scans_attr = |
506 | __ATTR_RO(full_scans); | 506 | __ATTR_RO(full_scans); |
507 | 507 | ||
508 | static ssize_t khugepaged_defrag_show(struct kobject *kobj, | 508 | static ssize_t khugepaged_defrag_show(struct kobject *kobj, |
509 | struct kobj_attribute *attr, char *buf) | 509 | struct kobj_attribute *attr, char *buf) |
510 | { | 510 | { |
511 | return single_flag_show(kobj, attr, buf, | 511 | return single_flag_show(kobj, attr, buf, |
512 | TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG); | 512 | TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG); |
513 | } | 513 | } |
514 | static ssize_t khugepaged_defrag_store(struct kobject *kobj, | 514 | static ssize_t khugepaged_defrag_store(struct kobject *kobj, |
515 | struct kobj_attribute *attr, | 515 | struct kobj_attribute *attr, |
516 | const char *buf, size_t count) | 516 | const char *buf, size_t count) |
517 | { | 517 | { |
518 | return single_flag_store(kobj, attr, buf, count, | 518 | return single_flag_store(kobj, attr, buf, count, |
519 | TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG); | 519 | TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG); |
520 | } | 520 | } |
521 | static struct kobj_attribute khugepaged_defrag_attr = | 521 | static struct kobj_attribute khugepaged_defrag_attr = |
522 | __ATTR(defrag, 0644, khugepaged_defrag_show, | 522 | __ATTR(defrag, 0644, khugepaged_defrag_show, |
523 | khugepaged_defrag_store); | 523 | khugepaged_defrag_store); |
524 | 524 | ||
525 | /* | 525 | /* |
526 | * max_ptes_none controls if khugepaged should collapse hugepages over | 526 | * max_ptes_none controls if khugepaged should collapse hugepages over |
527 | * any unmapped ptes in turn potentially increasing the memory | 527 | * any unmapped ptes in turn potentially increasing the memory |
528 | * footprint of the vmas. When max_ptes_none is 0 khugepaged will not | 528 | * footprint of the vmas. When max_ptes_none is 0 khugepaged will not |
529 | * reduce the available free memory in the system as it | 529 | * reduce the available free memory in the system as it |
530 | * runs. Increasing max_ptes_none will instead potentially reduce the | 530 | * runs. Increasing max_ptes_none will instead potentially reduce the |
531 | * free memory in the system during the khugepaged scan. | 531 | * free memory in the system during the khugepaged scan. |
532 | */ | 532 | */ |
533 | static ssize_t khugepaged_max_ptes_none_show(struct kobject *kobj, | 533 | static ssize_t khugepaged_max_ptes_none_show(struct kobject *kobj, |
534 | struct kobj_attribute *attr, | 534 | struct kobj_attribute *attr, |
535 | char *buf) | 535 | char *buf) |
536 | { | 536 | { |
537 | return sprintf(buf, "%u\n", khugepaged_max_ptes_none); | 537 | return sprintf(buf, "%u\n", khugepaged_max_ptes_none); |
538 | } | 538 | } |
539 | static ssize_t khugepaged_max_ptes_none_store(struct kobject *kobj, | 539 | static ssize_t khugepaged_max_ptes_none_store(struct kobject *kobj, |
540 | struct kobj_attribute *attr, | 540 | struct kobj_attribute *attr, |
541 | const char *buf, size_t count) | 541 | const char *buf, size_t count) |
542 | { | 542 | { |
543 | int err; | 543 | int err; |
544 | unsigned long max_ptes_none; | 544 | unsigned long max_ptes_none; |
545 | 545 | ||
546 | err = kstrtoul(buf, 10, &max_ptes_none); | 546 | err = kstrtoul(buf, 10, &max_ptes_none); |
547 | if (err || max_ptes_none > HPAGE_PMD_NR-1) | 547 | if (err || max_ptes_none > HPAGE_PMD_NR-1) |
548 | return -EINVAL; | 548 | return -EINVAL; |
549 | 549 | ||
550 | khugepaged_max_ptes_none = max_ptes_none; | 550 | khugepaged_max_ptes_none = max_ptes_none; |
551 | 551 | ||
552 | return count; | 552 | return count; |
553 | } | 553 | } |
554 | static struct kobj_attribute khugepaged_max_ptes_none_attr = | 554 | static struct kobj_attribute khugepaged_max_ptes_none_attr = |
555 | __ATTR(max_ptes_none, 0644, khugepaged_max_ptes_none_show, | 555 | __ATTR(max_ptes_none, 0644, khugepaged_max_ptes_none_show, |
556 | khugepaged_max_ptes_none_store); | 556 | khugepaged_max_ptes_none_store); |
557 | 557 | ||
558 | static struct attribute *khugepaged_attr[] = { | 558 | static struct attribute *khugepaged_attr[] = { |
559 | &khugepaged_defrag_attr.attr, | 559 | &khugepaged_defrag_attr.attr, |
560 | &khugepaged_max_ptes_none_attr.attr, | 560 | &khugepaged_max_ptes_none_attr.attr, |
561 | &pages_to_scan_attr.attr, | 561 | &pages_to_scan_attr.attr, |
562 | &pages_collapsed_attr.attr, | 562 | &pages_collapsed_attr.attr, |
563 | &full_scans_attr.attr, | 563 | &full_scans_attr.attr, |
564 | &scan_sleep_millisecs_attr.attr, | 564 | &scan_sleep_millisecs_attr.attr, |
565 | &alloc_sleep_millisecs_attr.attr, | 565 | &alloc_sleep_millisecs_attr.attr, |
566 | NULL, | 566 | NULL, |
567 | }; | 567 | }; |
568 | 568 | ||
569 | static struct attribute_group khugepaged_attr_group = { | 569 | static struct attribute_group khugepaged_attr_group = { |
570 | .attrs = khugepaged_attr, | 570 | .attrs = khugepaged_attr, |
571 | .name = "khugepaged", | 571 | .name = "khugepaged", |
572 | }; | 572 | }; |
573 | 573 | ||
574 | static int __init hugepage_init_sysfs(struct kobject **hugepage_kobj) | 574 | static int __init hugepage_init_sysfs(struct kobject **hugepage_kobj) |
575 | { | 575 | { |
576 | int err; | 576 | int err; |
577 | 577 | ||
578 | *hugepage_kobj = kobject_create_and_add("transparent_hugepage", mm_kobj); | 578 | *hugepage_kobj = kobject_create_and_add("transparent_hugepage", mm_kobj); |
579 | if (unlikely(!*hugepage_kobj)) { | 579 | if (unlikely(!*hugepage_kobj)) { |
580 | printk(KERN_ERR "hugepage: failed to create transparent hugepage kobject\n"); | 580 | printk(KERN_ERR "hugepage: failed to create transparent hugepage kobject\n"); |
581 | return -ENOMEM; | 581 | return -ENOMEM; |
582 | } | 582 | } |
583 | 583 | ||
584 | err = sysfs_create_group(*hugepage_kobj, &hugepage_attr_group); | 584 | err = sysfs_create_group(*hugepage_kobj, &hugepage_attr_group); |
585 | if (err) { | 585 | if (err) { |
586 | printk(KERN_ERR "hugepage: failed to register transparent hugepage group\n"); | 586 | printk(KERN_ERR "hugepage: failed to register transparent hugepage group\n"); |
587 | goto delete_obj; | 587 | goto delete_obj; |
588 | } | 588 | } |
589 | 589 | ||
590 | err = sysfs_create_group(*hugepage_kobj, &khugepaged_attr_group); | 590 | err = sysfs_create_group(*hugepage_kobj, &khugepaged_attr_group); |
591 | if (err) { | 591 | if (err) { |
592 | printk(KERN_ERR "hugepage: failed to register transparent hugepage group\n"); | 592 | printk(KERN_ERR "hugepage: failed to register transparent hugepage group\n"); |
593 | goto remove_hp_group; | 593 | goto remove_hp_group; |
594 | } | 594 | } |
595 | 595 | ||
596 | return 0; | 596 | return 0; |
597 | 597 | ||
598 | remove_hp_group: | 598 | remove_hp_group: |
599 | sysfs_remove_group(*hugepage_kobj, &hugepage_attr_group); | 599 | sysfs_remove_group(*hugepage_kobj, &hugepage_attr_group); |
600 | delete_obj: | 600 | delete_obj: |
601 | kobject_put(*hugepage_kobj); | 601 | kobject_put(*hugepage_kobj); |
602 | return err; | 602 | return err; |
603 | } | 603 | } |
604 | 604 | ||
605 | static void __init hugepage_exit_sysfs(struct kobject *hugepage_kobj) | 605 | static void __init hugepage_exit_sysfs(struct kobject *hugepage_kobj) |
606 | { | 606 | { |
607 | sysfs_remove_group(hugepage_kobj, &khugepaged_attr_group); | 607 | sysfs_remove_group(hugepage_kobj, &khugepaged_attr_group); |
608 | sysfs_remove_group(hugepage_kobj, &hugepage_attr_group); | 608 | sysfs_remove_group(hugepage_kobj, &hugepage_attr_group); |
609 | kobject_put(hugepage_kobj); | 609 | kobject_put(hugepage_kobj); |
610 | } | 610 | } |
611 | #else | 611 | #else |
612 | static inline int hugepage_init_sysfs(struct kobject **hugepage_kobj) | 612 | static inline int hugepage_init_sysfs(struct kobject **hugepage_kobj) |
613 | { | 613 | { |
614 | return 0; | 614 | return 0; |
615 | } | 615 | } |
616 | 616 | ||
617 | static inline void hugepage_exit_sysfs(struct kobject *hugepage_kobj) | 617 | static inline void hugepage_exit_sysfs(struct kobject *hugepage_kobj) |
618 | { | 618 | { |
619 | } | 619 | } |
620 | #endif /* CONFIG_SYSFS */ | 620 | #endif /* CONFIG_SYSFS */ |
621 | 621 | ||
622 | static int __init hugepage_init(void) | 622 | static int __init hugepage_init(void) |
623 | { | 623 | { |
624 | int err; | 624 | int err; |
625 | struct kobject *hugepage_kobj; | 625 | struct kobject *hugepage_kobj; |
626 | 626 | ||
627 | if (!has_transparent_hugepage()) { | 627 | if (!has_transparent_hugepage()) { |
628 | transparent_hugepage_flags = 0; | 628 | transparent_hugepage_flags = 0; |
629 | return -EINVAL; | 629 | return -EINVAL; |
630 | } | 630 | } |
631 | 631 | ||
632 | err = hugepage_init_sysfs(&hugepage_kobj); | 632 | err = hugepage_init_sysfs(&hugepage_kobj); |
633 | if (err) | 633 | if (err) |
634 | return err; | 634 | return err; |
635 | 635 | ||
636 | err = khugepaged_slab_init(); | 636 | err = khugepaged_slab_init(); |
637 | if (err) | 637 | if (err) |
638 | goto out; | 638 | goto out; |
639 | 639 | ||
640 | register_shrinker(&huge_zero_page_shrinker); | 640 | register_shrinker(&huge_zero_page_shrinker); |
641 | 641 | ||
642 | /* | 642 | /* |
643 | * By default disable transparent hugepages on smaller systems, | 643 | * By default disable transparent hugepages on smaller systems, |
644 | * where the extra memory used could hurt more than TLB overhead | 644 | * where the extra memory used could hurt more than TLB overhead |
645 | * is likely to save. The admin can still enable it through /sys. | 645 | * is likely to save. The admin can still enable it through /sys. |
646 | */ | 646 | */ |
647 | if (totalram_pages < (512 << (20 - PAGE_SHIFT))) | 647 | if (totalram_pages < (512 << (20 - PAGE_SHIFT))) |
648 | transparent_hugepage_flags = 0; | 648 | transparent_hugepage_flags = 0; |
649 | 649 | ||
650 | start_khugepaged(); | 650 | start_khugepaged(); |
651 | 651 | ||
652 | return 0; | 652 | return 0; |
653 | out: | 653 | out: |
654 | hugepage_exit_sysfs(hugepage_kobj); | 654 | hugepage_exit_sysfs(hugepage_kobj); |
655 | return err; | 655 | return err; |
656 | } | 656 | } |
657 | module_init(hugepage_init) | 657 | module_init(hugepage_init) |
658 | 658 | ||
659 | static int __init setup_transparent_hugepage(char *str) | 659 | static int __init setup_transparent_hugepage(char *str) |
660 | { | 660 | { |
661 | int ret = 0; | 661 | int ret = 0; |
662 | if (!str) | 662 | if (!str) |
663 | goto out; | 663 | goto out; |
664 | if (!strcmp(str, "always")) { | 664 | if (!strcmp(str, "always")) { |
665 | set_bit(TRANSPARENT_HUGEPAGE_FLAG, | 665 | set_bit(TRANSPARENT_HUGEPAGE_FLAG, |
666 | &transparent_hugepage_flags); | 666 | &transparent_hugepage_flags); |
667 | clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, | 667 | clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, |
668 | &transparent_hugepage_flags); | 668 | &transparent_hugepage_flags); |
669 | ret = 1; | 669 | ret = 1; |
670 | } else if (!strcmp(str, "madvise")) { | 670 | } else if (!strcmp(str, "madvise")) { |
671 | clear_bit(TRANSPARENT_HUGEPAGE_FLAG, | 671 | clear_bit(TRANSPARENT_HUGEPAGE_FLAG, |
672 | &transparent_hugepage_flags); | 672 | &transparent_hugepage_flags); |
673 | set_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, | 673 | set_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, |
674 | &transparent_hugepage_flags); | 674 | &transparent_hugepage_flags); |
675 | ret = 1; | 675 | ret = 1; |
676 | } else if (!strcmp(str, "never")) { | 676 | } else if (!strcmp(str, "never")) { |
677 | clear_bit(TRANSPARENT_HUGEPAGE_FLAG, | 677 | clear_bit(TRANSPARENT_HUGEPAGE_FLAG, |
678 | &transparent_hugepage_flags); | 678 | &transparent_hugepage_flags); |
679 | clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, | 679 | clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, |
680 | &transparent_hugepage_flags); | 680 | &transparent_hugepage_flags); |
681 | ret = 1; | 681 | ret = 1; |
682 | } | 682 | } |
683 | out: | 683 | out: |
684 | if (!ret) | 684 | if (!ret) |
685 | printk(KERN_WARNING | 685 | printk(KERN_WARNING |
686 | "transparent_hugepage= cannot parse, ignored\n"); | 686 | "transparent_hugepage= cannot parse, ignored\n"); |
687 | return ret; | 687 | return ret; |
688 | } | 688 | } |
689 | __setup("transparent_hugepage=", setup_transparent_hugepage); | 689 | __setup("transparent_hugepage=", setup_transparent_hugepage); |
690 | 690 | ||
691 | pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma) | 691 | pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma) |
692 | { | 692 | { |
693 | if (likely(vma->vm_flags & VM_WRITE)) | 693 | if (likely(vma->vm_flags & VM_WRITE)) |
694 | pmd = pmd_mkwrite(pmd); | 694 | pmd = pmd_mkwrite(pmd); |
695 | return pmd; | 695 | return pmd; |
696 | } | 696 | } |
697 | 697 | ||
698 | static inline pmd_t mk_huge_pmd(struct page *page, pgprot_t prot) | 698 | static inline pmd_t mk_huge_pmd(struct page *page, pgprot_t prot) |
699 | { | 699 | { |
700 | pmd_t entry; | 700 | pmd_t entry; |
701 | entry = mk_pmd(page, prot); | 701 | entry = mk_pmd(page, prot); |
702 | entry = pmd_mkhuge(entry); | 702 | entry = pmd_mkhuge(entry); |
703 | return entry; | 703 | return entry; |
704 | } | 704 | } |
705 | 705 | ||
706 | static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, | 706 | static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, |
707 | struct vm_area_struct *vma, | 707 | struct vm_area_struct *vma, |
708 | unsigned long haddr, pmd_t *pmd, | 708 | unsigned long haddr, pmd_t *pmd, |
709 | struct page *page) | 709 | struct page *page) |
710 | { | 710 | { |
711 | pgtable_t pgtable; | 711 | pgtable_t pgtable; |
712 | 712 | ||
713 | VM_BUG_ON(!PageCompound(page)); | 713 | VM_BUG_ON(!PageCompound(page)); |
714 | pgtable = pte_alloc_one(mm, haddr); | 714 | pgtable = pte_alloc_one(mm, haddr); |
715 | if (unlikely(!pgtable)) | 715 | if (unlikely(!pgtable)) |
716 | return VM_FAULT_OOM; | 716 | return VM_FAULT_OOM; |
717 | 717 | ||
718 | clear_huge_page(page, haddr, HPAGE_PMD_NR); | 718 | clear_huge_page(page, haddr, HPAGE_PMD_NR); |
719 | /* | 719 | /* |
720 | * The memory barrier inside __SetPageUptodate makes sure that | 720 | * The memory barrier inside __SetPageUptodate makes sure that |
721 | * clear_huge_page writes become visible before the set_pmd_at() | 721 | * clear_huge_page writes become visible before the set_pmd_at() |
722 | * write. | 722 | * write. |
723 | */ | 723 | */ |
724 | __SetPageUptodate(page); | 724 | __SetPageUptodate(page); |
725 | 725 | ||
726 | spin_lock(&mm->page_table_lock); | 726 | spin_lock(&mm->page_table_lock); |
727 | if (unlikely(!pmd_none(*pmd))) { | 727 | if (unlikely(!pmd_none(*pmd))) { |
728 | spin_unlock(&mm->page_table_lock); | 728 | spin_unlock(&mm->page_table_lock); |
729 | mem_cgroup_uncharge_page(page); | 729 | mem_cgroup_uncharge_page(page); |
730 | put_page(page); | 730 | put_page(page); |
731 | pte_free(mm, pgtable); | 731 | pte_free(mm, pgtable); |
732 | } else { | 732 | } else { |
733 | pmd_t entry; | 733 | pmd_t entry; |
734 | entry = mk_huge_pmd(page, vma->vm_page_prot); | 734 | entry = mk_huge_pmd(page, vma->vm_page_prot); |
735 | entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); | 735 | entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); |
736 | page_add_new_anon_rmap(page, vma, haddr); | 736 | page_add_new_anon_rmap(page, vma, haddr); |
737 | pgtable_trans_huge_deposit(mm, pmd, pgtable); | 737 | pgtable_trans_huge_deposit(mm, pmd, pgtable); |
738 | set_pmd_at(mm, haddr, pmd, entry); | 738 | set_pmd_at(mm, haddr, pmd, entry); |
739 | add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR); | 739 | add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR); |
740 | mm->nr_ptes++; | 740 | mm->nr_ptes++; |
741 | spin_unlock(&mm->page_table_lock); | 741 | spin_unlock(&mm->page_table_lock); |
742 | } | 742 | } |
743 | 743 | ||
744 | return 0; | 744 | return 0; |
745 | } | 745 | } |
746 | 746 | ||
747 | static inline gfp_t alloc_hugepage_gfpmask(int defrag, gfp_t extra_gfp) | 747 | static inline gfp_t alloc_hugepage_gfpmask(int defrag, gfp_t extra_gfp) |
748 | { | 748 | { |
749 | return (GFP_TRANSHUGE & ~(defrag ? 0 : __GFP_WAIT)) | extra_gfp; | 749 | return (GFP_TRANSHUGE & ~(defrag ? 0 : __GFP_WAIT)) | extra_gfp; |
750 | } | 750 | } |
751 | 751 | ||
752 | static inline struct page *alloc_hugepage_vma(int defrag, | 752 | static inline struct page *alloc_hugepage_vma(int defrag, |
753 | struct vm_area_struct *vma, | 753 | struct vm_area_struct *vma, |
754 | unsigned long haddr, int nd, | 754 | unsigned long haddr, int nd, |
755 | gfp_t extra_gfp) | 755 | gfp_t extra_gfp) |
756 | { | 756 | { |
757 | return alloc_pages_vma(alloc_hugepage_gfpmask(defrag, extra_gfp), | 757 | return alloc_pages_vma(alloc_hugepage_gfpmask(defrag, extra_gfp), |
758 | HPAGE_PMD_ORDER, vma, haddr, nd); | 758 | HPAGE_PMD_ORDER, vma, haddr, nd); |
759 | } | 759 | } |
760 | 760 | ||
761 | static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm, | 761 | static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm, |
762 | struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd, | 762 | struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd, |
763 | struct page *zero_page) | 763 | struct page *zero_page) |
764 | { | 764 | { |
765 | pmd_t entry; | 765 | pmd_t entry; |
766 | if (!pmd_none(*pmd)) | 766 | if (!pmd_none(*pmd)) |
767 | return false; | 767 | return false; |
768 | entry = mk_pmd(zero_page, vma->vm_page_prot); | 768 | entry = mk_pmd(zero_page, vma->vm_page_prot); |
769 | entry = pmd_wrprotect(entry); | 769 | entry = pmd_wrprotect(entry); |
770 | entry = pmd_mkhuge(entry); | 770 | entry = pmd_mkhuge(entry); |
771 | pgtable_trans_huge_deposit(mm, pmd, pgtable); | 771 | pgtable_trans_huge_deposit(mm, pmd, pgtable); |
772 | set_pmd_at(mm, haddr, pmd, entry); | 772 | set_pmd_at(mm, haddr, pmd, entry); |
773 | mm->nr_ptes++; | 773 | mm->nr_ptes++; |
774 | return true; | 774 | return true; |
775 | } | 775 | } |
776 | 776 | ||
777 | int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, | 777 | int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, |
778 | unsigned long address, pmd_t *pmd, | 778 | unsigned long address, pmd_t *pmd, |
779 | unsigned int flags) | 779 | unsigned int flags) |
780 | { | 780 | { |
781 | struct page *page; | 781 | struct page *page; |
782 | unsigned long haddr = address & HPAGE_PMD_MASK; | 782 | unsigned long haddr = address & HPAGE_PMD_MASK; |
783 | 783 | ||
784 | if (haddr < vma->vm_start || haddr + HPAGE_PMD_SIZE > vma->vm_end) | 784 | if (haddr < vma->vm_start || haddr + HPAGE_PMD_SIZE > vma->vm_end) |
785 | return VM_FAULT_FALLBACK; | 785 | return VM_FAULT_FALLBACK; |
786 | if (unlikely(anon_vma_prepare(vma))) | 786 | if (unlikely(anon_vma_prepare(vma))) |
787 | return VM_FAULT_OOM; | 787 | return VM_FAULT_OOM; |
788 | if (unlikely(khugepaged_enter(vma))) | 788 | if (unlikely(khugepaged_enter(vma))) |
789 | return VM_FAULT_OOM; | 789 | return VM_FAULT_OOM; |
790 | if (!(flags & FAULT_FLAG_WRITE) && | 790 | if (!(flags & FAULT_FLAG_WRITE) && |
791 | transparent_hugepage_use_zero_page()) { | 791 | transparent_hugepage_use_zero_page()) { |
792 | pgtable_t pgtable; | 792 | pgtable_t pgtable; |
793 | struct page *zero_page; | 793 | struct page *zero_page; |
794 | bool set; | 794 | bool set; |
795 | pgtable = pte_alloc_one(mm, haddr); | 795 | pgtable = pte_alloc_one(mm, haddr); |
796 | if (unlikely(!pgtable)) | 796 | if (unlikely(!pgtable)) |
797 | return VM_FAULT_OOM; | 797 | return VM_FAULT_OOM; |
798 | zero_page = get_huge_zero_page(); | 798 | zero_page = get_huge_zero_page(); |
799 | if (unlikely(!zero_page)) { | 799 | if (unlikely(!zero_page)) { |
800 | pte_free(mm, pgtable); | 800 | pte_free(mm, pgtable); |
801 | count_vm_event(THP_FAULT_FALLBACK); | 801 | count_vm_event(THP_FAULT_FALLBACK); |
802 | return VM_FAULT_FALLBACK; | 802 | return VM_FAULT_FALLBACK; |
803 | } | 803 | } |
804 | spin_lock(&mm->page_table_lock); | 804 | spin_lock(&mm->page_table_lock); |
805 | set = set_huge_zero_page(pgtable, mm, vma, haddr, pmd, | 805 | set = set_huge_zero_page(pgtable, mm, vma, haddr, pmd, |
806 | zero_page); | 806 | zero_page); |
807 | spin_unlock(&mm->page_table_lock); | 807 | spin_unlock(&mm->page_table_lock); |
808 | if (!set) { | 808 | if (!set) { |
809 | pte_free(mm, pgtable); | 809 | pte_free(mm, pgtable); |
810 | put_huge_zero_page(); | 810 | put_huge_zero_page(); |
811 | } | 811 | } |
812 | return 0; | 812 | return 0; |
813 | } | 813 | } |
814 | page = alloc_hugepage_vma(transparent_hugepage_defrag(vma), | 814 | page = alloc_hugepage_vma(transparent_hugepage_defrag(vma), |
815 | vma, haddr, numa_node_id(), 0); | 815 | vma, haddr, numa_node_id(), 0); |
816 | if (unlikely(!page)) { | 816 | if (unlikely(!page)) { |
817 | count_vm_event(THP_FAULT_FALLBACK); | 817 | count_vm_event(THP_FAULT_FALLBACK); |
818 | return VM_FAULT_FALLBACK; | 818 | return VM_FAULT_FALLBACK; |
819 | } | 819 | } |
820 | if (unlikely(mem_cgroup_newpage_charge(page, mm, GFP_KERNEL))) { | 820 | if (unlikely(mem_cgroup_newpage_charge(page, mm, GFP_KERNEL))) { |
821 | put_page(page); | 821 | put_page(page); |
822 | count_vm_event(THP_FAULT_FALLBACK); | 822 | count_vm_event(THP_FAULT_FALLBACK); |
823 | return VM_FAULT_FALLBACK; | 823 | return VM_FAULT_FALLBACK; |
824 | } | 824 | } |
825 | if (unlikely(__do_huge_pmd_anonymous_page(mm, vma, haddr, pmd, page))) { | 825 | if (unlikely(__do_huge_pmd_anonymous_page(mm, vma, haddr, pmd, page))) { |
826 | mem_cgroup_uncharge_page(page); | 826 | mem_cgroup_uncharge_page(page); |
827 | put_page(page); | 827 | put_page(page); |
828 | count_vm_event(THP_FAULT_FALLBACK); | 828 | count_vm_event(THP_FAULT_FALLBACK); |
829 | return VM_FAULT_FALLBACK; | 829 | return VM_FAULT_FALLBACK; |
830 | } | 830 | } |
831 | 831 | ||
832 | count_vm_event(THP_FAULT_ALLOC); | 832 | count_vm_event(THP_FAULT_ALLOC); |
833 | return 0; | 833 | return 0; |
834 | } | 834 | } |
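
For orientation, a minimal userspace sketch (not part of mm/huge_memory.c) of how this fault handler is typically reached: an anonymous mapping marked MADV_HUGEPAGE whose first write touches a PMD-aligned address while THP is enabled. The 2MB PMD size is an assumption (x86_64), and the kernel may still fall back to small pages.

#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

#define HPAGE_SIZE (2UL << 20)	/* assumed PMD size (x86_64) */

int main(void)
{
	/* Over-allocate so a PMD-aligned address can be picked inside. */
	size_t len = 4 * HPAGE_SIZE;
	char *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (buf == MAP_FAILED)
		return 1;

	char *aligned = (char *)(((uintptr_t)buf + HPAGE_SIZE - 1) &
				 ~(HPAGE_SIZE - 1));

	/* Request THP; the first write fault on the aligned region may then
	 * be served with a huge pmd by do_huge_pmd_anonymous_page(). */
	madvise(aligned, 2 * HPAGE_SIZE, MADV_HUGEPAGE);
	memset(aligned, 0x5a, HPAGE_SIZE);

	printf("touched %p\n", (void *)aligned);
	return 0;
}
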
835 | 835 | ||
836 | int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, | 836 | int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, |
837 | pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr, | 837 | pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr, |
838 | struct vm_area_struct *vma) | 838 | struct vm_area_struct *vma) |
839 | { | 839 | { |
840 | struct page *src_page; | 840 | struct page *src_page; |
841 | pmd_t pmd; | 841 | pmd_t pmd; |
842 | pgtable_t pgtable; | 842 | pgtable_t pgtable; |
843 | int ret; | 843 | int ret; |
844 | 844 | ||
845 | ret = -ENOMEM; | 845 | ret = -ENOMEM; |
846 | pgtable = pte_alloc_one(dst_mm, addr); | 846 | pgtable = pte_alloc_one(dst_mm, addr); |
847 | if (unlikely(!pgtable)) | 847 | if (unlikely(!pgtable)) |
848 | goto out; | 848 | goto out; |
849 | 849 | ||
850 | spin_lock(&dst_mm->page_table_lock); | 850 | spin_lock(&dst_mm->page_table_lock); |
851 | spin_lock_nested(&src_mm->page_table_lock, SINGLE_DEPTH_NESTING); | 851 | spin_lock_nested(&src_mm->page_table_lock, SINGLE_DEPTH_NESTING); |
852 | 852 | ||
853 | ret = -EAGAIN; | 853 | ret = -EAGAIN; |
854 | pmd = *src_pmd; | 854 | pmd = *src_pmd; |
855 | if (unlikely(!pmd_trans_huge(pmd))) { | 855 | if (unlikely(!pmd_trans_huge(pmd))) { |
856 | pte_free(dst_mm, pgtable); | 856 | pte_free(dst_mm, pgtable); |
857 | goto out_unlock; | 857 | goto out_unlock; |
858 | } | 858 | } |
859 | /* | 859 | /* |
860 | * mm->page_table_lock is enough to be sure that huge zero pmd is not | 860 | * mm->page_table_lock is enough to be sure that huge zero pmd is not |
861 | * under splitting, since we don't split the page itself, only the pmd | 861 | * under splitting, since we don't split the page itself, only the pmd |
862 | * into a page table. | 862 | * into a page table. |
863 | */ | 863 | */ |
864 | if (is_huge_zero_pmd(pmd)) { | 864 | if (is_huge_zero_pmd(pmd)) { |
865 | struct page *zero_page; | 865 | struct page *zero_page; |
866 | bool set; | 866 | bool set; |
867 | /* | 867 | /* |
868 | * get_huge_zero_page() will never allocate a new page here, | 868 | * get_huge_zero_page() will never allocate a new page here, |
869 | * since we already have a zero page to copy. It just takes a | 869 | * since we already have a zero page to copy. It just takes a |
870 | * reference. | 870 | * reference. |
871 | */ | 871 | */ |
872 | zero_page = get_huge_zero_page(); | 872 | zero_page = get_huge_zero_page(); |
873 | set = set_huge_zero_page(pgtable, dst_mm, vma, addr, dst_pmd, | 873 | set = set_huge_zero_page(pgtable, dst_mm, vma, addr, dst_pmd, |
874 | zero_page); | 874 | zero_page); |
875 | BUG_ON(!set); /* unexpected !pmd_none(dst_pmd) */ | 875 | BUG_ON(!set); /* unexpected !pmd_none(dst_pmd) */ |
876 | ret = 0; | 876 | ret = 0; |
877 | goto out_unlock; | 877 | goto out_unlock; |
878 | } | 878 | } |
879 | 879 | ||
880 | /* mmap_sem prevents this happening but warn if that changes */ | 880 | /* mmap_sem prevents this happening but warn if that changes */ |
881 | WARN_ON(pmd_trans_migrating(pmd)); | 881 | WARN_ON(pmd_trans_migrating(pmd)); |
882 | 882 | ||
883 | if (unlikely(pmd_trans_splitting(pmd))) { | 883 | if (unlikely(pmd_trans_splitting(pmd))) { |
884 | /* split huge page running from under us */ | 884 | /* split huge page running from under us */ |
885 | spin_unlock(&src_mm->page_table_lock); | 885 | spin_unlock(&src_mm->page_table_lock); |
886 | spin_unlock(&dst_mm->page_table_lock); | 886 | spin_unlock(&dst_mm->page_table_lock); |
887 | pte_free(dst_mm, pgtable); | 887 | pte_free(dst_mm, pgtable); |
888 | 888 | ||
889 | wait_split_huge_page(vma->anon_vma, src_pmd); /* src_vma */ | 889 | wait_split_huge_page(vma->anon_vma, src_pmd); /* src_vma */ |
890 | goto out; | 890 | goto out; |
891 | } | 891 | } |
892 | src_page = pmd_page(pmd); | 892 | src_page = pmd_page(pmd); |
893 | VM_BUG_ON(!PageHead(src_page)); | 893 | VM_BUG_ON(!PageHead(src_page)); |
894 | get_page(src_page); | 894 | get_page(src_page); |
895 | page_dup_rmap(src_page); | 895 | page_dup_rmap(src_page); |
896 | add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR); | 896 | add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR); |
897 | 897 | ||
898 | pmdp_set_wrprotect(src_mm, addr, src_pmd); | 898 | pmdp_set_wrprotect(src_mm, addr, src_pmd); |
899 | pmd = pmd_mkold(pmd_wrprotect(pmd)); | 899 | pmd = pmd_mkold(pmd_wrprotect(pmd)); |
900 | pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable); | 900 | pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable); |
901 | set_pmd_at(dst_mm, addr, dst_pmd, pmd); | 901 | set_pmd_at(dst_mm, addr, dst_pmd, pmd); |
902 | dst_mm->nr_ptes++; | 902 | dst_mm->nr_ptes++; |
903 | 903 | ||
904 | ret = 0; | 904 | ret = 0; |
905 | out_unlock: | 905 | out_unlock: |
906 | spin_unlock(&src_mm->page_table_lock); | 906 | spin_unlock(&src_mm->page_table_lock); |
907 | spin_unlock(&dst_mm->page_table_lock); | 907 | spin_unlock(&dst_mm->page_table_lock); |
908 | out: | 908 | out: |
909 | return ret; | 909 | return ret; |
910 | } | 910 | } |
911 | 911 | ||
912 | void huge_pmd_set_accessed(struct mm_struct *mm, | 912 | void huge_pmd_set_accessed(struct mm_struct *mm, |
913 | struct vm_area_struct *vma, | 913 | struct vm_area_struct *vma, |
914 | unsigned long address, | 914 | unsigned long address, |
915 | pmd_t *pmd, pmd_t orig_pmd, | 915 | pmd_t *pmd, pmd_t orig_pmd, |
916 | int dirty) | 916 | int dirty) |
917 | { | 917 | { |
918 | pmd_t entry; | 918 | pmd_t entry; |
919 | unsigned long haddr; | 919 | unsigned long haddr; |
920 | 920 | ||
921 | spin_lock(&mm->page_table_lock); | 921 | spin_lock(&mm->page_table_lock); |
922 | if (unlikely(!pmd_same(*pmd, orig_pmd))) | 922 | if (unlikely(!pmd_same(*pmd, orig_pmd))) |
923 | goto unlock; | 923 | goto unlock; |
924 | 924 | ||
925 | entry = pmd_mkyoung(orig_pmd); | 925 | entry = pmd_mkyoung(orig_pmd); |
926 | haddr = address & HPAGE_PMD_MASK; | 926 | haddr = address & HPAGE_PMD_MASK; |
927 | if (pmdp_set_access_flags(vma, haddr, pmd, entry, dirty)) | 927 | if (pmdp_set_access_flags(vma, haddr, pmd, entry, dirty)) |
928 | update_mmu_cache_pmd(vma, address, pmd); | 928 | update_mmu_cache_pmd(vma, address, pmd); |
929 | 929 | ||
930 | unlock: | 930 | unlock: |
931 | spin_unlock(&mm->page_table_lock); | 931 | spin_unlock(&mm->page_table_lock); |
932 | } | 932 | } |
933 | 933 | ||
934 | static int do_huge_pmd_wp_zero_page_fallback(struct mm_struct *mm, | 934 | static int do_huge_pmd_wp_zero_page_fallback(struct mm_struct *mm, |
935 | struct vm_area_struct *vma, unsigned long address, | 935 | struct vm_area_struct *vma, unsigned long address, |
936 | pmd_t *pmd, pmd_t orig_pmd, unsigned long haddr) | 936 | pmd_t *pmd, pmd_t orig_pmd, unsigned long haddr) |
937 | { | 937 | { |
938 | pgtable_t pgtable; | 938 | pgtable_t pgtable; |
939 | pmd_t _pmd; | 939 | pmd_t _pmd; |
940 | struct page *page; | 940 | struct page *page; |
941 | int i, ret = 0; | 941 | int i, ret = 0; |
942 | unsigned long mmun_start; /* For mmu_notifiers */ | 942 | unsigned long mmun_start; /* For mmu_notifiers */ |
943 | unsigned long mmun_end; /* For mmu_notifiers */ | 943 | unsigned long mmun_end; /* For mmu_notifiers */ |
944 | 944 | ||
945 | page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address); | 945 | page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address); |
946 | if (!page) { | 946 | if (!page) { |
947 | ret |= VM_FAULT_OOM; | 947 | ret |= VM_FAULT_OOM; |
948 | goto out; | 948 | goto out; |
949 | } | 949 | } |
950 | 950 | ||
951 | if (mem_cgroup_newpage_charge(page, mm, GFP_KERNEL)) { | 951 | if (mem_cgroup_newpage_charge(page, mm, GFP_KERNEL)) { |
952 | put_page(page); | 952 | put_page(page); |
953 | ret |= VM_FAULT_OOM; | 953 | ret |= VM_FAULT_OOM; |
954 | goto out; | 954 | goto out; |
955 | } | 955 | } |
956 | 956 | ||
957 | clear_user_highpage(page, address); | 957 | clear_user_highpage(page, address); |
958 | __SetPageUptodate(page); | 958 | __SetPageUptodate(page); |
959 | 959 | ||
960 | mmun_start = haddr; | 960 | mmun_start = haddr; |
961 | mmun_end = haddr + HPAGE_PMD_SIZE; | 961 | mmun_end = haddr + HPAGE_PMD_SIZE; |
962 | mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); | 962 | mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); |
963 | 963 | ||
964 | spin_lock(&mm->page_table_lock); | 964 | spin_lock(&mm->page_table_lock); |
965 | if (unlikely(!pmd_same(*pmd, orig_pmd))) | 965 | if (unlikely(!pmd_same(*pmd, orig_pmd))) |
966 | goto out_free_page; | 966 | goto out_free_page; |
967 | 967 | ||
968 | pmdp_clear_flush(vma, haddr, pmd); | 968 | pmdp_clear_flush(vma, haddr, pmd); |
969 | /* leave pmd empty until pte is filled */ | 969 | /* leave pmd empty until pte is filled */ |
970 | 970 | ||
971 | pgtable = pgtable_trans_huge_withdraw(mm, pmd); | 971 | pgtable = pgtable_trans_huge_withdraw(mm, pmd); |
972 | pmd_populate(mm, &_pmd, pgtable); | 972 | pmd_populate(mm, &_pmd, pgtable); |
973 | 973 | ||
974 | for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) { | 974 | for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) { |
975 | pte_t *pte, entry; | 975 | pte_t *pte, entry; |
976 | if (haddr == (address & PAGE_MASK)) { | 976 | if (haddr == (address & PAGE_MASK)) { |
977 | entry = mk_pte(page, vma->vm_page_prot); | 977 | entry = mk_pte(page, vma->vm_page_prot); |
978 | entry = maybe_mkwrite(pte_mkdirty(entry), vma); | 978 | entry = maybe_mkwrite(pte_mkdirty(entry), vma); |
979 | page_add_new_anon_rmap(page, vma, haddr); | 979 | page_add_new_anon_rmap(page, vma, haddr); |
980 | } else { | 980 | } else { |
981 | entry = pfn_pte(my_zero_pfn(haddr), vma->vm_page_prot); | 981 | entry = pfn_pte(my_zero_pfn(haddr), vma->vm_page_prot); |
982 | entry = pte_mkspecial(entry); | 982 | entry = pte_mkspecial(entry); |
983 | } | 983 | } |
984 | pte = pte_offset_map(&_pmd, haddr); | 984 | pte = pte_offset_map(&_pmd, haddr); |
985 | VM_BUG_ON(!pte_none(*pte)); | 985 | VM_BUG_ON(!pte_none(*pte)); |
986 | set_pte_at(mm, haddr, pte, entry); | 986 | set_pte_at(mm, haddr, pte, entry); |
987 | pte_unmap(pte); | 987 | pte_unmap(pte); |
988 | } | 988 | } |
989 | smp_wmb(); /* make pte visible before pmd */ | 989 | smp_wmb(); /* make pte visible before pmd */ |
990 | pmd_populate(mm, pmd, pgtable); | 990 | pmd_populate(mm, pmd, pgtable); |
991 | spin_unlock(&mm->page_table_lock); | 991 | spin_unlock(&mm->page_table_lock); |
992 | put_huge_zero_page(); | 992 | put_huge_zero_page(); |
993 | inc_mm_counter(mm, MM_ANONPAGES); | 993 | inc_mm_counter(mm, MM_ANONPAGES); |
994 | 994 | ||
995 | mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); | 995 | mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); |
996 | 996 | ||
997 | ret |= VM_FAULT_WRITE; | 997 | ret |= VM_FAULT_WRITE; |
998 | out: | 998 | out: |
999 | return ret; | 999 | return ret; |
1000 | out_free_page: | 1000 | out_free_page: |
1001 | spin_unlock(&mm->page_table_lock); | 1001 | spin_unlock(&mm->page_table_lock); |
1002 | mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); | 1002 | mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); |
1003 | mem_cgroup_uncharge_page(page); | 1003 | mem_cgroup_uncharge_page(page); |
1004 | put_page(page); | 1004 | put_page(page); |
1005 | goto out; | 1005 | goto out; |
1006 | } | 1006 | } |
1007 | 1007 | ||
1008 | static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm, | 1008 | static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm, |
1009 | struct vm_area_struct *vma, | 1009 | struct vm_area_struct *vma, |
1010 | unsigned long address, | 1010 | unsigned long address, |
1011 | pmd_t *pmd, pmd_t orig_pmd, | 1011 | pmd_t *pmd, pmd_t orig_pmd, |
1012 | struct page *page, | 1012 | struct page *page, |
1013 | unsigned long haddr) | 1013 | unsigned long haddr) |
1014 | { | 1014 | { |
1015 | pgtable_t pgtable; | 1015 | pgtable_t pgtable; |
1016 | pmd_t _pmd; | 1016 | pmd_t _pmd; |
1017 | int ret = 0, i; | 1017 | int ret = 0, i; |
1018 | struct page **pages; | 1018 | struct page **pages; |
1019 | unsigned long mmun_start; /* For mmu_notifiers */ | 1019 | unsigned long mmun_start; /* For mmu_notifiers */ |
1020 | unsigned long mmun_end; /* For mmu_notifiers */ | 1020 | unsigned long mmun_end; /* For mmu_notifiers */ |
1021 | 1021 | ||
1022 | pages = kmalloc(sizeof(struct page *) * HPAGE_PMD_NR, | 1022 | pages = kmalloc(sizeof(struct page *) * HPAGE_PMD_NR, |
1023 | GFP_KERNEL); | 1023 | GFP_KERNEL); |
1024 | if (unlikely(!pages)) { | 1024 | if (unlikely(!pages)) { |
1025 | ret |= VM_FAULT_OOM; | 1025 | ret |= VM_FAULT_OOM; |
1026 | goto out; | 1026 | goto out; |
1027 | } | 1027 | } |
1028 | 1028 | ||
1029 | for (i = 0; i < HPAGE_PMD_NR; i++) { | 1029 | for (i = 0; i < HPAGE_PMD_NR; i++) { |
1030 | pages[i] = alloc_page_vma_node(GFP_HIGHUSER_MOVABLE | | 1030 | pages[i] = alloc_page_vma_node(GFP_HIGHUSER_MOVABLE | |
1031 | __GFP_OTHER_NODE, | 1031 | __GFP_OTHER_NODE, |
1032 | vma, address, page_to_nid(page)); | 1032 | vma, address, page_to_nid(page)); |
1033 | if (unlikely(!pages[i] || | 1033 | if (unlikely(!pages[i] || |
1034 | mem_cgroup_newpage_charge(pages[i], mm, | 1034 | mem_cgroup_newpage_charge(pages[i], mm, |
1035 | GFP_KERNEL))) { | 1035 | GFP_KERNEL))) { |
1036 | if (pages[i]) | 1036 | if (pages[i]) |
1037 | put_page(pages[i]); | 1037 | put_page(pages[i]); |
1038 | mem_cgroup_uncharge_start(); | 1038 | mem_cgroup_uncharge_start(); |
1039 | while (--i >= 0) { | 1039 | while (--i >= 0) { |
1040 | mem_cgroup_uncharge_page(pages[i]); | 1040 | mem_cgroup_uncharge_page(pages[i]); |
1041 | put_page(pages[i]); | 1041 | put_page(pages[i]); |
1042 | } | 1042 | } |
1043 | mem_cgroup_uncharge_end(); | 1043 | mem_cgroup_uncharge_end(); |
1044 | kfree(pages); | 1044 | kfree(pages); |
1045 | ret |= VM_FAULT_OOM; | 1045 | ret |= VM_FAULT_OOM; |
1046 | goto out; | 1046 | goto out; |
1047 | } | 1047 | } |
1048 | } | 1048 | } |
1049 | 1049 | ||
1050 | for (i = 0; i < HPAGE_PMD_NR; i++) { | 1050 | for (i = 0; i < HPAGE_PMD_NR; i++) { |
1051 | copy_user_highpage(pages[i], page + i, | 1051 | copy_user_highpage(pages[i], page + i, |
1052 | haddr + PAGE_SIZE * i, vma); | 1052 | haddr + PAGE_SIZE * i, vma); |
1053 | __SetPageUptodate(pages[i]); | 1053 | __SetPageUptodate(pages[i]); |
1054 | cond_resched(); | 1054 | cond_resched(); |
1055 | } | 1055 | } |
1056 | 1056 | ||
1057 | mmun_start = haddr; | 1057 | mmun_start = haddr; |
1058 | mmun_end = haddr + HPAGE_PMD_SIZE; | 1058 | mmun_end = haddr + HPAGE_PMD_SIZE; |
1059 | mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); | 1059 | mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); |
1060 | 1060 | ||
1061 | spin_lock(&mm->page_table_lock); | 1061 | spin_lock(&mm->page_table_lock); |
1062 | if (unlikely(!pmd_same(*pmd, orig_pmd))) | 1062 | if (unlikely(!pmd_same(*pmd, orig_pmd))) |
1063 | goto out_free_pages; | 1063 | goto out_free_pages; |
1064 | VM_BUG_ON(!PageHead(page)); | 1064 | VM_BUG_ON(!PageHead(page)); |
1065 | 1065 | ||
1066 | pmdp_clear_flush(vma, haddr, pmd); | 1066 | pmdp_clear_flush(vma, haddr, pmd); |
1067 | /* leave pmd empty until pte is filled */ | 1067 | /* leave pmd empty until pte is filled */ |
1068 | 1068 | ||
1069 | pgtable = pgtable_trans_huge_withdraw(mm, pmd); | 1069 | pgtable = pgtable_trans_huge_withdraw(mm, pmd); |
1070 | pmd_populate(mm, &_pmd, pgtable); | 1070 | pmd_populate(mm, &_pmd, pgtable); |
1071 | 1071 | ||
1072 | for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) { | 1072 | for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) { |
1073 | pte_t *pte, entry; | 1073 | pte_t *pte, entry; |
1074 | entry = mk_pte(pages[i], vma->vm_page_prot); | 1074 | entry = mk_pte(pages[i], vma->vm_page_prot); |
1075 | entry = maybe_mkwrite(pte_mkdirty(entry), vma); | 1075 | entry = maybe_mkwrite(pte_mkdirty(entry), vma); |
1076 | page_add_new_anon_rmap(pages[i], vma, haddr); | 1076 | page_add_new_anon_rmap(pages[i], vma, haddr); |
1077 | pte = pte_offset_map(&_pmd, haddr); | 1077 | pte = pte_offset_map(&_pmd, haddr); |
1078 | VM_BUG_ON(!pte_none(*pte)); | 1078 | VM_BUG_ON(!pte_none(*pte)); |
1079 | set_pte_at(mm, haddr, pte, entry); | 1079 | set_pte_at(mm, haddr, pte, entry); |
1080 | pte_unmap(pte); | 1080 | pte_unmap(pte); |
1081 | } | 1081 | } |
1082 | kfree(pages); | 1082 | kfree(pages); |
1083 | 1083 | ||
1084 | smp_wmb(); /* make pte visible before pmd */ | 1084 | smp_wmb(); /* make pte visible before pmd */ |
1085 | pmd_populate(mm, pmd, pgtable); | 1085 | pmd_populate(mm, pmd, pgtable); |
1086 | page_remove_rmap(page); | 1086 | page_remove_rmap(page); |
1087 | spin_unlock(&mm->page_table_lock); | 1087 | spin_unlock(&mm->page_table_lock); |
1088 | 1088 | ||
1089 | mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); | 1089 | mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); |
1090 | 1090 | ||
1091 | ret |= VM_FAULT_WRITE; | 1091 | ret |= VM_FAULT_WRITE; |
1092 | put_page(page); | 1092 | put_page(page); |
1093 | 1093 | ||
1094 | out: | 1094 | out: |
1095 | return ret; | 1095 | return ret; |
1096 | 1096 | ||
1097 | out_free_pages: | 1097 | out_free_pages: |
1098 | spin_unlock(&mm->page_table_lock); | 1098 | spin_unlock(&mm->page_table_lock); |
1099 | mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); | 1099 | mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); |
1100 | mem_cgroup_uncharge_start(); | 1100 | mem_cgroup_uncharge_start(); |
1101 | for (i = 0; i < HPAGE_PMD_NR; i++) { | 1101 | for (i = 0; i < HPAGE_PMD_NR; i++) { |
1102 | mem_cgroup_uncharge_page(pages[i]); | 1102 | mem_cgroup_uncharge_page(pages[i]); |
1103 | put_page(pages[i]); | 1103 | put_page(pages[i]); |
1104 | } | 1104 | } |
1105 | mem_cgroup_uncharge_end(); | 1105 | mem_cgroup_uncharge_end(); |
1106 | kfree(pages); | 1106 | kfree(pages); |
1107 | goto out; | 1107 | goto out; |
1108 | } | 1108 | } |
1109 | 1109 | ||
1110 | int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, | 1110 | int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, |
1111 | unsigned long address, pmd_t *pmd, pmd_t orig_pmd) | 1111 | unsigned long address, pmd_t *pmd, pmd_t orig_pmd) |
1112 | { | 1112 | { |
1113 | int ret = 0; | 1113 | int ret = 0; |
1114 | struct page *page = NULL, *new_page; | 1114 | struct page *page = NULL, *new_page; |
1115 | unsigned long haddr; | 1115 | unsigned long haddr; |
1116 | unsigned long mmun_start; /* For mmu_notifiers */ | 1116 | unsigned long mmun_start; /* For mmu_notifiers */ |
1117 | unsigned long mmun_end; /* For mmu_notifiers */ | 1117 | unsigned long mmun_end; /* For mmu_notifiers */ |
1118 | 1118 | ||
1119 | VM_BUG_ON(!vma->anon_vma); | 1119 | VM_BUG_ON(!vma->anon_vma); |
1120 | haddr = address & HPAGE_PMD_MASK; | 1120 | haddr = address & HPAGE_PMD_MASK; |
1121 | if (is_huge_zero_pmd(orig_pmd)) | 1121 | if (is_huge_zero_pmd(orig_pmd)) |
1122 | goto alloc; | 1122 | goto alloc; |
1123 | spin_lock(&mm->page_table_lock); | 1123 | spin_lock(&mm->page_table_lock); |
1124 | if (unlikely(!pmd_same(*pmd, orig_pmd))) | 1124 | if (unlikely(!pmd_same(*pmd, orig_pmd))) |
1125 | goto out_unlock; | 1125 | goto out_unlock; |
1126 | 1126 | ||
1127 | page = pmd_page(orig_pmd); | 1127 | page = pmd_page(orig_pmd); |
1128 | VM_BUG_ON(!PageCompound(page) || !PageHead(page)); | 1128 | VM_BUG_ON(!PageCompound(page) || !PageHead(page)); |
1129 | if (page_mapcount(page) == 1) { | 1129 | if (page_mapcount(page) == 1) { |
1130 | pmd_t entry; | 1130 | pmd_t entry; |
1131 | entry = pmd_mkyoung(orig_pmd); | 1131 | entry = pmd_mkyoung(orig_pmd); |
1132 | entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); | 1132 | entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); |
1133 | if (pmdp_set_access_flags(vma, haddr, pmd, entry, 1)) | 1133 | if (pmdp_set_access_flags(vma, haddr, pmd, entry, 1)) |
1134 | update_mmu_cache_pmd(vma, address, pmd); | 1134 | update_mmu_cache_pmd(vma, address, pmd); |
1135 | ret |= VM_FAULT_WRITE; | 1135 | ret |= VM_FAULT_WRITE; |
1136 | goto out_unlock; | 1136 | goto out_unlock; |
1137 | } | 1137 | } |
1138 | get_page(page); | 1138 | get_page(page); |
1139 | spin_unlock(&mm->page_table_lock); | 1139 | spin_unlock(&mm->page_table_lock); |
1140 | alloc: | 1140 | alloc: |
1141 | if (transparent_hugepage_enabled(vma) && | 1141 | if (transparent_hugepage_enabled(vma) && |
1142 | !transparent_hugepage_debug_cow()) | 1142 | !transparent_hugepage_debug_cow()) |
1143 | new_page = alloc_hugepage_vma(transparent_hugepage_defrag(vma), | 1143 | new_page = alloc_hugepage_vma(transparent_hugepage_defrag(vma), |
1144 | vma, haddr, numa_node_id(), 0); | 1144 | vma, haddr, numa_node_id(), 0); |
1145 | else | 1145 | else |
1146 | new_page = NULL; | 1146 | new_page = NULL; |
1147 | 1147 | ||
1148 | if (unlikely(!new_page)) { | 1148 | if (unlikely(!new_page)) { |
1149 | if (!page) { | 1149 | if (!page) { |
1150 | ret = do_huge_pmd_wp_zero_page_fallback(mm, vma, | 1150 | ret = do_huge_pmd_wp_zero_page_fallback(mm, vma, |
1151 | address, pmd, orig_pmd, haddr); | 1151 | address, pmd, orig_pmd, haddr); |
1152 | } else { | 1152 | } else { |
1153 | ret = do_huge_pmd_wp_page_fallback(mm, vma, address, | 1153 | ret = do_huge_pmd_wp_page_fallback(mm, vma, address, |
1154 | pmd, orig_pmd, page, haddr); | 1154 | pmd, orig_pmd, page, haddr); |
1155 | if (ret & VM_FAULT_OOM) { | 1155 | if (ret & VM_FAULT_OOM) { |
1156 | split_huge_page(page); | 1156 | split_huge_page(page); |
1157 | ret |= VM_FAULT_FALLBACK; | 1157 | ret |= VM_FAULT_FALLBACK; |
1158 | } | 1158 | } |
1159 | put_page(page); | 1159 | put_page(page); |
1160 | } | 1160 | } |
1161 | count_vm_event(THP_FAULT_FALLBACK); | 1161 | count_vm_event(THP_FAULT_FALLBACK); |
1162 | goto out; | 1162 | goto out; |
1163 | } | 1163 | } |
1164 | 1164 | ||
1165 | if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) { | 1165 | if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) { |
1166 | put_page(new_page); | 1166 | put_page(new_page); |
1167 | if (page) { | 1167 | if (page) { |
1168 | split_huge_page(page); | 1168 | split_huge_page(page); |
1169 | put_page(page); | 1169 | put_page(page); |
1170 | } else | 1170 | } else |
1171 | split_huge_page_pmd(vma, address, pmd); | 1171 | split_huge_page_pmd(vma, address, pmd); |
1172 | ret |= VM_FAULT_FALLBACK; | 1172 | ret |= VM_FAULT_FALLBACK; |
1173 | count_vm_event(THP_FAULT_FALLBACK); | 1173 | count_vm_event(THP_FAULT_FALLBACK); |
1174 | goto out; | 1174 | goto out; |
1175 | } | 1175 | } |
1176 | 1176 | ||
1177 | count_vm_event(THP_FAULT_ALLOC); | 1177 | count_vm_event(THP_FAULT_ALLOC); |
1178 | 1178 | ||
1179 | if (!page) | 1179 | if (!page) |
1180 | clear_huge_page(new_page, haddr, HPAGE_PMD_NR); | 1180 | clear_huge_page(new_page, haddr, HPAGE_PMD_NR); |
1181 | else | 1181 | else |
1182 | copy_user_huge_page(new_page, page, haddr, vma, HPAGE_PMD_NR); | 1182 | copy_user_huge_page(new_page, page, haddr, vma, HPAGE_PMD_NR); |
1183 | __SetPageUptodate(new_page); | 1183 | __SetPageUptodate(new_page); |
1184 | 1184 | ||
1185 | mmun_start = haddr; | 1185 | mmun_start = haddr; |
1186 | mmun_end = haddr + HPAGE_PMD_SIZE; | 1186 | mmun_end = haddr + HPAGE_PMD_SIZE; |
1187 | mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); | 1187 | mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); |
1188 | 1188 | ||
1189 | spin_lock(&mm->page_table_lock); | 1189 | spin_lock(&mm->page_table_lock); |
1190 | if (page) | 1190 | if (page) |
1191 | put_page(page); | 1191 | put_page(page); |
1192 | if (unlikely(!pmd_same(*pmd, orig_pmd))) { | 1192 | if (unlikely(!pmd_same(*pmd, orig_pmd))) { |
1193 | spin_unlock(&mm->page_table_lock); | 1193 | spin_unlock(&mm->page_table_lock); |
1194 | mem_cgroup_uncharge_page(new_page); | 1194 | mem_cgroup_uncharge_page(new_page); |
1195 | put_page(new_page); | 1195 | put_page(new_page); |
1196 | goto out_mn; | 1196 | goto out_mn; |
1197 | } else { | 1197 | } else { |
1198 | pmd_t entry; | 1198 | pmd_t entry; |
1199 | entry = mk_huge_pmd(new_page, vma->vm_page_prot); | 1199 | entry = mk_huge_pmd(new_page, vma->vm_page_prot); |
1200 | entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); | 1200 | entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); |
1201 | pmdp_clear_flush(vma, haddr, pmd); | 1201 | pmdp_clear_flush(vma, haddr, pmd); |
1202 | page_add_new_anon_rmap(new_page, vma, haddr); | 1202 | page_add_new_anon_rmap(new_page, vma, haddr); |
1203 | set_pmd_at(mm, haddr, pmd, entry); | 1203 | set_pmd_at(mm, haddr, pmd, entry); |
1204 | update_mmu_cache_pmd(vma, address, pmd); | 1204 | update_mmu_cache_pmd(vma, address, pmd); |
1205 | if (!page) { | 1205 | if (!page) { |
1206 | add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR); | 1206 | add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR); |
1207 | put_huge_zero_page(); | 1207 | put_huge_zero_page(); |
1208 | } else { | 1208 | } else { |
1209 | VM_BUG_ON(!PageHead(page)); | 1209 | VM_BUG_ON(!PageHead(page)); |
1210 | page_remove_rmap(page); | 1210 | page_remove_rmap(page); |
1211 | put_page(page); | 1211 | put_page(page); |
1212 | } | 1212 | } |
1213 | ret |= VM_FAULT_WRITE; | 1213 | ret |= VM_FAULT_WRITE; |
1214 | } | 1214 | } |
1215 | spin_unlock(&mm->page_table_lock); | 1215 | spin_unlock(&mm->page_table_lock); |
1216 | out_mn: | 1216 | out_mn: |
1217 | mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); | 1217 | mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); |
1218 | out: | 1218 | out: |
1219 | return ret; | 1219 | return ret; |
1220 | out_unlock: | 1220 | out_unlock: |
1221 | spin_unlock(&mm->page_table_lock); | 1221 | spin_unlock(&mm->page_table_lock); |
1222 | return ret; | 1222 | return ret; |
1223 | } | 1223 | } |
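
A similarly hedged userspace sketch of the copy-on-write case above: after fork(), copy_huge_pmd() has write-protected the shared huge pmd, so the first write in parent or child takes this wp fault, and do_huge_pmd_wp_page() either copies into a freshly allocated huge page or falls back to the per-page paths. Whether a THP actually backs the region depends on alignment and memory pressure.

#include <string.h>
#include <sys/mman.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void)
{
	size_t len = 2UL << 20;		/* one assumed PMD-sized region */
	char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (p == MAP_FAILED)
		return 1;

	madvise(p, len, MADV_HUGEPAGE);
	memset(p, 1, len);		/* populate, possibly as a THP */

	if (fork() == 0) {
		p[0] = 2;		/* COW write: huge pmd wp fault */
		_exit(0);
	}
	wait(NULL);
	return 0;
}
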
1224 | 1224 | ||
1225 | struct page *follow_trans_huge_pmd(struct vm_area_struct *vma, | 1225 | struct page *follow_trans_huge_pmd(struct vm_area_struct *vma, |
1226 | unsigned long addr, | 1226 | unsigned long addr, |
1227 | pmd_t *pmd, | 1227 | pmd_t *pmd, |
1228 | unsigned int flags) | 1228 | unsigned int flags) |
1229 | { | 1229 | { |
1230 | struct mm_struct *mm = vma->vm_mm; | 1230 | struct mm_struct *mm = vma->vm_mm; |
1231 | struct page *page = NULL; | 1231 | struct page *page = NULL; |
1232 | 1232 | ||
1233 | assert_spin_locked(&mm->page_table_lock); | 1233 | assert_spin_locked(&mm->page_table_lock); |
1234 | 1234 | ||
1235 | if (flags & FOLL_WRITE && !pmd_write(*pmd)) | 1235 | if (flags & FOLL_WRITE && !pmd_write(*pmd)) |
1236 | goto out; | 1236 | goto out; |
1237 | 1237 | ||
1238 | /* Avoid dumping huge zero page */ | 1238 | /* Avoid dumping huge zero page */ |
1239 | if ((flags & FOLL_DUMP) && is_huge_zero_pmd(*pmd)) | 1239 | if ((flags & FOLL_DUMP) && is_huge_zero_pmd(*pmd)) |
1240 | return ERR_PTR(-EFAULT); | 1240 | return ERR_PTR(-EFAULT); |
1241 | 1241 | ||
1242 | /* Full NUMA hinting faults to serialise migration in fault paths */ | 1242 | /* Full NUMA hinting faults to serialise migration in fault paths */ |
1243 | if ((flags & FOLL_NUMA) && pmd_numa(*pmd)) | 1243 | if ((flags & FOLL_NUMA) && pmd_numa(*pmd)) |
1244 | goto out; | 1244 | goto out; |
1245 | 1245 | ||
1246 | page = pmd_page(*pmd); | 1246 | page = pmd_page(*pmd); |
1247 | VM_BUG_ON(!PageHead(page)); | 1247 | VM_BUG_ON(!PageHead(page)); |
1248 | if (flags & FOLL_TOUCH) { | 1248 | if (flags & FOLL_TOUCH) { |
1249 | pmd_t _pmd; | 1249 | pmd_t _pmd; |
1250 | /* | 1250 | /* |
1251 | * We should set the dirty bit only for FOLL_WRITE but | 1251 | * We should set the dirty bit only for FOLL_WRITE but |
1252 | * for now the dirty bit in the pmd is meaningless. | 1252 | * for now the dirty bit in the pmd is meaningless. |
1253 | * If the dirty bit ever becomes meaningful and we | 1253 | * If the dirty bit ever becomes meaningful and we |
1254 | * only set it with FOLL_WRITE, an atomic | 1254 | * only set it with FOLL_WRITE, an atomic |
1255 | * set_bit will be required on the pmd to set the | 1255 | * set_bit will be required on the pmd to set the |
1256 | * young bit, instead of the current set_pmd_at. | 1256 | * young bit, instead of the current set_pmd_at. |
1257 | */ | 1257 | */ |
1258 | _pmd = pmd_mkyoung(pmd_mkdirty(*pmd)); | 1258 | _pmd = pmd_mkyoung(pmd_mkdirty(*pmd)); |
1259 | if (pmdp_set_access_flags(vma, addr & HPAGE_PMD_MASK, | 1259 | if (pmdp_set_access_flags(vma, addr & HPAGE_PMD_MASK, |
1260 | pmd, _pmd, 1)) | 1260 | pmd, _pmd, 1)) |
1261 | update_mmu_cache_pmd(vma, addr, pmd); | 1261 | update_mmu_cache_pmd(vma, addr, pmd); |
1262 | } | 1262 | } |
1263 | if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) { | 1263 | if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) { |
1264 | if (page->mapping && trylock_page(page)) { | 1264 | if (page->mapping && trylock_page(page)) { |
1265 | lru_add_drain(); | 1265 | lru_add_drain(); |
1266 | if (page->mapping) | 1266 | if (page->mapping) |
1267 | mlock_vma_page(page); | 1267 | mlock_vma_page(page); |
1268 | unlock_page(page); | 1268 | unlock_page(page); |
1269 | } | 1269 | } |
1270 | } | 1270 | } |
1271 | page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT; | 1271 | page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT; |
1272 | VM_BUG_ON(!PageCompound(page)); | 1272 | VM_BUG_ON(!PageCompound(page)); |
1273 | if (flags & FOLL_GET) | 1273 | if (flags & FOLL_GET) |
1274 | get_page_foll(page); | 1274 | get_page_foll(page); |
1275 | 1275 | ||
1276 | out: | 1276 | out: |
1277 | return page; | 1277 | return page; |
1278 | } | 1278 | } |
1279 | 1279 | ||
1280 | /* NUMA hinting page fault entry point for trans huge pmds */ | 1280 | /* NUMA hinting page fault entry point for trans huge pmds */ |
1281 | int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, | 1281 | int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, |
1282 | unsigned long addr, pmd_t pmd, pmd_t *pmdp) | 1282 | unsigned long addr, pmd_t pmd, pmd_t *pmdp) |
1283 | { | 1283 | { |
1284 | struct anon_vma *anon_vma = NULL; | 1284 | struct anon_vma *anon_vma = NULL; |
1285 | struct page *page; | 1285 | struct page *page; |
1286 | unsigned long haddr = addr & HPAGE_PMD_MASK; | 1286 | unsigned long haddr = addr & HPAGE_PMD_MASK; |
1287 | int page_nid = -1, this_nid = numa_node_id(); | 1287 | int page_nid = -1, this_nid = numa_node_id(); |
1288 | int target_nid; | 1288 | int target_nid; |
1289 | bool page_locked; | 1289 | bool page_locked; |
1290 | bool migrated = false; | 1290 | bool migrated = false; |
1291 | 1291 | ||
1292 | spin_lock(&mm->page_table_lock); | 1292 | spin_lock(&mm->page_table_lock); |
1293 | if (unlikely(!pmd_same(pmd, *pmdp))) | 1293 | if (unlikely(!pmd_same(pmd, *pmdp))) |
1294 | goto out_unlock; | 1294 | goto out_unlock; |
1295 | 1295 | ||
1296 | /* | 1296 | /* |
1297 | * If there are potential migrations, wait for completion and retry | 1297 | * If there are potential migrations, wait for completion and retry |
1298 | * without disrupting NUMA hinting information. Do not relock and | 1298 | * without disrupting NUMA hinting information. Do not relock and |
1299 | * check_same as the page may no longer be mapped. | 1299 | * check_same as the page may no longer be mapped. |
1300 | */ | 1300 | */ |
1301 | if (unlikely(pmd_trans_migrating(*pmdp))) { | 1301 | if (unlikely(pmd_trans_migrating(*pmdp))) { |
1302 | spin_unlock(&mm->page_table_lock); | 1302 | spin_unlock(&mm->page_table_lock); |
1303 | wait_migrate_huge_page(vma->anon_vma, pmdp); | 1303 | wait_migrate_huge_page(vma->anon_vma, pmdp); |
1304 | goto out; | 1304 | goto out; |
1305 | } | 1305 | } |
1306 | 1306 | ||
1307 | page = pmd_page(pmd); | 1307 | page = pmd_page(pmd); |
1308 | page_nid = page_to_nid(page); | 1308 | page_nid = page_to_nid(page); |
1309 | count_vm_numa_event(NUMA_HINT_FAULTS); | 1309 | count_vm_numa_event(NUMA_HINT_FAULTS); |
1310 | if (page_nid == this_nid) | 1310 | if (page_nid == this_nid) |
1311 | count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL); | 1311 | count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL); |
1312 | 1312 | ||
1313 | /* | 1313 | /* |
1314 | * Acquire the page lock to serialise THP migrations but avoid dropping | 1314 | * Acquire the page lock to serialise THP migrations but avoid dropping |
1315 | * page_table_lock if at all possible | 1315 | * page_table_lock if at all possible |
1316 | */ | 1316 | */ |
1317 | page_locked = trylock_page(page); | 1317 | page_locked = trylock_page(page); |
1318 | target_nid = mpol_misplaced(page, vma, haddr); | 1318 | target_nid = mpol_misplaced(page, vma, haddr); |
1319 | if (target_nid == -1) { | 1319 | if (target_nid == -1) { |
1320 | /* If the page was locked, there are no parallel migrations */ | 1320 | /* If the page was locked, there are no parallel migrations */ |
1321 | if (page_locked) | 1321 | if (page_locked) |
1322 | goto clear_pmdnuma; | 1322 | goto clear_pmdnuma; |
1323 | } | 1323 | } |
1324 | 1324 | ||
1325 | /* Migration could have started since the pmd_trans_migrating check */ | 1325 | /* Migration could have started since the pmd_trans_migrating check */ |
1326 | if (!page_locked) { | 1326 | if (!page_locked) { |
1327 | spin_unlock(&mm->page_table_lock); | 1327 | spin_unlock(&mm->page_table_lock); |
1328 | wait_on_page_locked(page); | 1328 | wait_on_page_locked(page); |
1329 | page_nid = -1; | 1329 | page_nid = -1; |
1330 | goto out; | 1330 | goto out; |
1331 | } | 1331 | } |
1332 | 1332 | ||
1333 | /* | 1333 | /* |
1334 | * Page is misplaced. Page lock serialises migrations. Acquire anon_vma | 1334 | * Page is misplaced. Page lock serialises migrations. Acquire anon_vma |
1335 | * to serialise splits | 1335 | * to serialise splits |
1336 | */ | 1336 | */ |
1337 | get_page(page); | 1337 | get_page(page); |
1338 | spin_unlock(&mm->page_table_lock); | 1338 | spin_unlock(&mm->page_table_lock); |
1339 | anon_vma = page_lock_anon_vma_read(page); | 1339 | anon_vma = page_lock_anon_vma_read(page); |
1340 | 1340 | ||
1341 | /* Confirm the PMD did not change while the lock was dropped */ | 1341 | /* Confirm the PMD did not change while the lock was dropped */ |
1342 | spin_lock(&mm->page_table_lock); | 1342 | spin_lock(&mm->page_table_lock); |
1343 | if (unlikely(!pmd_same(pmd, *pmdp))) { | 1343 | if (unlikely(!pmd_same(pmd, *pmdp))) { |
1344 | unlock_page(page); | 1344 | unlock_page(page); |
1345 | put_page(page); | 1345 | put_page(page); |
1346 | page_nid = -1; | 1346 | page_nid = -1; |
1347 | goto out_unlock; | 1347 | goto out_unlock; |
1348 | } | 1348 | } |
1349 | 1349 | ||
1350 | /* Bail if we fail to protect against THP splits for any reason */ | 1350 | /* Bail if we fail to protect against THP splits for any reason */ |
1351 | if (unlikely(!anon_vma)) { | 1351 | if (unlikely(!anon_vma)) { |
1352 | put_page(page); | 1352 | put_page(page); |
1353 | page_nid = -1; | 1353 | page_nid = -1; |
1354 | goto clear_pmdnuma; | 1354 | goto clear_pmdnuma; |
1355 | } | 1355 | } |
1356 | 1356 | ||
1357 | /* | 1357 | /* |
1358 | * Migrate the THP to the requested node, returns with page unlocked | 1358 | * Migrate the THP to the requested node, returns with page unlocked |
1359 | * and pmd_numa cleared. | 1359 | * and pmd_numa cleared. |
1360 | */ | 1360 | */ |
1361 | spin_unlock(&mm->page_table_lock); | 1361 | spin_unlock(&mm->page_table_lock); |
1362 | migrated = migrate_misplaced_transhuge_page(mm, vma, | 1362 | migrated = migrate_misplaced_transhuge_page(mm, vma, |
1363 | pmdp, pmd, addr, page, target_nid); | 1363 | pmdp, pmd, addr, page, target_nid); |
1364 | if (migrated) | 1364 | if (migrated) |
1365 | page_nid = target_nid; | 1365 | page_nid = target_nid; |
1366 | 1366 | ||
1367 | goto out; | 1367 | goto out; |
1368 | clear_pmdnuma: | 1368 | clear_pmdnuma: |
1369 | BUG_ON(!PageLocked(page)); | 1369 | BUG_ON(!PageLocked(page)); |
1370 | pmd = pmd_mknonnuma(pmd); | 1370 | pmd = pmd_mknonnuma(pmd); |
1371 | set_pmd_at(mm, haddr, pmdp, pmd); | 1371 | set_pmd_at(mm, haddr, pmdp, pmd); |
1372 | VM_BUG_ON(pmd_numa(*pmdp)); | 1372 | VM_BUG_ON(pmd_numa(*pmdp)); |
1373 | update_mmu_cache_pmd(vma, addr, pmdp); | 1373 | update_mmu_cache_pmd(vma, addr, pmdp); |
1374 | unlock_page(page); | 1374 | unlock_page(page); |
1375 | out_unlock: | 1375 | out_unlock: |
1376 | spin_unlock(&mm->page_table_lock); | 1376 | spin_unlock(&mm->page_table_lock); |
1377 | 1377 | ||
1378 | out: | 1378 | out: |
1379 | if (anon_vma) | 1379 | if (anon_vma) |
1380 | page_unlock_anon_vma_read(anon_vma); | 1380 | page_unlock_anon_vma_read(anon_vma); |
1381 | 1381 | ||
1382 | if (page_nid != -1) | 1382 | if (page_nid != -1) |
1383 | task_numa_fault(page_nid, HPAGE_PMD_NR, migrated); | 1383 | task_numa_fault(page_nid, HPAGE_PMD_NR, migrated); |
1384 | 1384 | ||
1385 | return 0; | 1385 | return 0; |
1386 | } | 1386 | } |
1387 | 1387 | ||
1388 | int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, | 1388 | int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, |
1389 | pmd_t *pmd, unsigned long addr) | 1389 | pmd_t *pmd, unsigned long addr) |
1390 | { | 1390 | { |
1391 | int ret = 0; | 1391 | int ret = 0; |
1392 | 1392 | ||
1393 | if (__pmd_trans_huge_lock(pmd, vma) == 1) { | 1393 | if (__pmd_trans_huge_lock(pmd, vma) == 1) { |
1394 | struct page *page; | 1394 | struct page *page; |
1395 | pgtable_t pgtable; | 1395 | pgtable_t pgtable; |
1396 | pmd_t orig_pmd; | 1396 | pmd_t orig_pmd; |
1397 | /* | 1397 | /* |
1398 | * For architectures like ppc64 we look at deposited pgtable | 1398 | * For architectures like ppc64 we look at deposited pgtable |
1399 | * when calling pmdp_get_and_clear. So do the | 1399 | * when calling pmdp_get_and_clear. So do the |
1400 | * pgtable_trans_huge_withdraw after finishing pmdp related | 1400 | * pgtable_trans_huge_withdraw after finishing pmdp related |
1401 | * operations. | 1401 | * operations. |
1402 | */ | 1402 | */ |
1403 | orig_pmd = pmdp_get_and_clear(tlb->mm, addr, pmd); | 1403 | orig_pmd = pmdp_get_and_clear(tlb->mm, addr, pmd); |
1404 | tlb_remove_pmd_tlb_entry(tlb, pmd, addr); | 1404 | tlb_remove_pmd_tlb_entry(tlb, pmd, addr); |
1405 | pgtable = pgtable_trans_huge_withdraw(tlb->mm, pmd); | 1405 | pgtable = pgtable_trans_huge_withdraw(tlb->mm, pmd); |
1406 | if (is_huge_zero_pmd(orig_pmd)) { | 1406 | if (is_huge_zero_pmd(orig_pmd)) { |
1407 | tlb->mm->nr_ptes--; | 1407 | tlb->mm->nr_ptes--; |
1408 | spin_unlock(&tlb->mm->page_table_lock); | 1408 | spin_unlock(&tlb->mm->page_table_lock); |
1409 | put_huge_zero_page(); | 1409 | put_huge_zero_page(); |
1410 | } else { | 1410 | } else { |
1411 | page = pmd_page(orig_pmd); | 1411 | page = pmd_page(orig_pmd); |
1412 | page_remove_rmap(page); | 1412 | page_remove_rmap(page); |
1413 | VM_BUG_ON(page_mapcount(page) < 0); | 1413 | VM_BUG_ON(page_mapcount(page) < 0); |
1414 | add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR); | 1414 | add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR); |
1415 | VM_BUG_ON(!PageHead(page)); | 1415 | VM_BUG_ON(!PageHead(page)); |
1416 | tlb->mm->nr_ptes--; | 1416 | tlb->mm->nr_ptes--; |
1417 | spin_unlock(&tlb->mm->page_table_lock); | 1417 | spin_unlock(&tlb->mm->page_table_lock); |
1418 | tlb_remove_page(tlb, page); | 1418 | tlb_remove_page(tlb, page); |
1419 | } | 1419 | } |
1420 | pte_free(tlb->mm, pgtable); | 1420 | pte_free(tlb->mm, pgtable); |
1421 | ret = 1; | 1421 | ret = 1; |
1422 | } | 1422 | } |
1423 | return ret; | 1423 | return ret; |
1424 | } | 1424 | } |
1425 | 1425 | ||
1426 | int mincore_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, | 1426 | int mincore_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, |
1427 | unsigned long addr, unsigned long end, | 1427 | unsigned long addr, unsigned long end, |
1428 | unsigned char *vec) | 1428 | unsigned char *vec) |
1429 | { | 1429 | { |
1430 | int ret = 0; | 1430 | int ret = 0; |
1431 | 1431 | ||
1432 | if (__pmd_trans_huge_lock(pmd, vma) == 1) { | 1432 | if (__pmd_trans_huge_lock(pmd, vma) == 1) { |
1433 | /* | 1433 | /* |
1434 | * All logical pages in the range are present | 1434 | * All logical pages in the range are present |
1435 | * if backed by a huge page. | 1435 | * if backed by a huge page. |
1436 | */ | 1436 | */ |
1437 | spin_unlock(&vma->vm_mm->page_table_lock); | 1437 | spin_unlock(&vma->vm_mm->page_table_lock); |
1438 | memset(vec, 1, (end - addr) >> PAGE_SHIFT); | 1438 | memset(vec, 1, (end - addr) >> PAGE_SHIFT); |
1439 | ret = 1; | 1439 | ret = 1; |
1440 | } | 1440 | } |
1441 | 1441 | ||
1442 | return ret; | 1442 | return ret; |
1443 | } | 1443 | } |
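
A usage illustration for the shortcut above (assuming 4KB base pages and a 2MB huge page size): mincore() on a populated, THP-backed range is answered by mincore_huge_pmd() filling the whole vector at once instead of walking individual ptes.

#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	size_t len = 2UL << 20;
	unsigned char vec[512];		/* len / 4096 entries */
	char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (p == MAP_FAILED)
		return 1;

	madvise(p, len, MADV_HUGEPAGE);
	memset(p, 0, len);
	if (mincore(p, len, vec) == 0)
		printf("first 4KB page resident: %d\n", vec[0] & 1);
	return 0;
}
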
1444 | 1444 | ||
1445 | int move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma, | 1445 | int move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma, |
1446 | unsigned long old_addr, | 1446 | unsigned long old_addr, |
1447 | unsigned long new_addr, unsigned long old_end, | 1447 | unsigned long new_addr, unsigned long old_end, |
1448 | pmd_t *old_pmd, pmd_t *new_pmd) | 1448 | pmd_t *old_pmd, pmd_t *new_pmd) |
1449 | { | 1449 | { |
1450 | int ret = 0; | 1450 | int ret = 0; |
1451 | pmd_t pmd; | 1451 | pmd_t pmd; |
1452 | 1452 | ||
1453 | struct mm_struct *mm = vma->vm_mm; | 1453 | struct mm_struct *mm = vma->vm_mm; |
1454 | 1454 | ||
1455 | if ((old_addr & ~HPAGE_PMD_MASK) || | 1455 | if ((old_addr & ~HPAGE_PMD_MASK) || |
1456 | (new_addr & ~HPAGE_PMD_MASK) || | 1456 | (new_addr & ~HPAGE_PMD_MASK) || |
1457 | old_end - old_addr < HPAGE_PMD_SIZE || | 1457 | old_end - old_addr < HPAGE_PMD_SIZE || |
1458 | (new_vma->vm_flags & VM_NOHUGEPAGE)) | 1458 | (new_vma->vm_flags & VM_NOHUGEPAGE)) |
1459 | goto out; | 1459 | goto out; |
1460 | 1460 | ||
1461 | /* | 1461 | /* |
1462 | * The destination pmd shouldn't be established, free_pgtables() | 1462 | * The destination pmd shouldn't be established, free_pgtables() |
1463 | * should have release it. | 1463 | * should have release it. |
1464 | */ | 1464 | */ |
1465 | if (WARN_ON(!pmd_none(*new_pmd))) { | 1465 | if (WARN_ON(!pmd_none(*new_pmd))) { |
1466 | VM_BUG_ON(pmd_trans_huge(*new_pmd)); | 1466 | VM_BUG_ON(pmd_trans_huge(*new_pmd)); |
1467 | goto out; | 1467 | goto out; |
1468 | } | 1468 | } |
1469 | 1469 | ||
1470 | ret = __pmd_trans_huge_lock(old_pmd, vma); | 1470 | ret = __pmd_trans_huge_lock(old_pmd, vma); |
1471 | if (ret == 1) { | 1471 | if (ret == 1) { |
1472 | pmd = pmdp_get_and_clear(mm, old_addr, old_pmd); | 1472 | pmd = pmdp_get_and_clear(mm, old_addr, old_pmd); |
1473 | VM_BUG_ON(!pmd_none(*new_pmd)); | 1473 | VM_BUG_ON(!pmd_none(*new_pmd)); |
1474 | set_pmd_at(mm, new_addr, new_pmd, pmd_mksoft_dirty(pmd)); | 1474 | set_pmd_at(mm, new_addr, new_pmd, pmd_mksoft_dirty(pmd)); |
1475 | spin_unlock(&mm->page_table_lock); | 1475 | spin_unlock(&mm->page_table_lock); |
1476 | } | 1476 | } |
1477 | out: | 1477 | out: |
1478 | return ret; | 1478 | return ret; |
1479 | } | 1479 | } |
1480 | 1480 | ||
1481 | int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, | 1481 | int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, |
1482 | unsigned long addr, pgprot_t newprot, int prot_numa) | 1482 | unsigned long addr, pgprot_t newprot, int prot_numa) |
1483 | { | 1483 | { |
1484 | struct mm_struct *mm = vma->vm_mm; | 1484 | struct mm_struct *mm = vma->vm_mm; |
1485 | int ret = 0; | 1485 | int ret = 0; |
1486 | 1486 | ||
1487 | if (__pmd_trans_huge_lock(pmd, vma) == 1) { | 1487 | if (__pmd_trans_huge_lock(pmd, vma) == 1) { |
1488 | pmd_t entry; | 1488 | pmd_t entry; |
1489 | if (!prot_numa) { | 1489 | if (!prot_numa) { |
1490 | entry = pmdp_get_and_clear(mm, addr, pmd); | 1490 | entry = pmdp_get_and_clear(mm, addr, pmd); |
1491 | if (pmd_numa(entry)) | 1491 | if (pmd_numa(entry)) |
1492 | entry = pmd_mknonnuma(entry); | 1492 | entry = pmd_mknonnuma(entry); |
1493 | entry = pmd_modify(entry, newprot); | 1493 | entry = pmd_modify(entry, newprot); |
1494 | BUG_ON(pmd_write(entry)); | 1494 | BUG_ON(pmd_write(entry)); |
1495 | set_pmd_at(mm, addr, pmd, entry); | 1495 | set_pmd_at(mm, addr, pmd, entry); |
1496 | } else { | 1496 | } else { |
1497 | struct page *page = pmd_page(*pmd); | 1497 | struct page *page = pmd_page(*pmd); |
1498 | entry = *pmd; | 1498 | entry = *pmd; |
1499 | 1499 | ||
1500 | /* only check non-shared pages */ | 1500 | /* only check non-shared pages */ |
1501 | if (page_mapcount(page) == 1 && | 1501 | if (page_mapcount(page) == 1 && |
1502 | !pmd_numa(*pmd)) { | 1502 | !pmd_numa(*pmd)) { |
1503 | entry = pmd_mknuma(entry); | 1503 | entry = pmd_mknuma(entry); |
1504 | set_pmd_at(mm, addr, pmd, entry); | 1504 | set_pmd_at(mm, addr, pmd, entry); |
1505 | } | 1505 | } |
1506 | } | 1506 | } |
1507 | spin_unlock(&vma->vm_mm->page_table_lock); | 1507 | spin_unlock(&vma->vm_mm->page_table_lock); |
1508 | ret = 1; | 1508 | ret = 1; |
1509 | } | 1509 | } |
1510 | 1510 | ||
1511 | return ret; | 1511 | return ret; |
1512 | } | 1512 | } |
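
change_huge_pmd() is reached through mprotect() on a range mapped by a huge pmd; a hedged userspace sketch under the same 2MB assumption as the earlier examples:

#include <string.h>
#include <sys/mman.h>

int main(void)
{
	size_t len = 2UL << 20;
	char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (p == MAP_FAILED)
		return 1;

	madvise(p, len, MADV_HUGEPAGE);
	memset(p, 0, len);

	/* If the range is backed by a huge pmd, this ends up in
	 * change_huge_pmd() rather than the pte-by-pte path. */
	return mprotect(p, len, PROT_READ);
}
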
1513 | 1513 | ||
1514 | /* | 1514 | /* |
1515 | * Returns 1 if a given pmd maps a stable (not under splitting) thp. | 1515 | * Returns 1 if a given pmd maps a stable (not under splitting) thp. |
1516 | * Returns -1 if it maps a thp under splitting. Returns 0 otherwise. | 1516 | * Returns -1 if it maps a thp under splitting. Returns 0 otherwise. |
1517 | * | 1517 | * |
1518 | * Note that if it returns 1, this routine returns without unlocking page | 1518 | * Note that if it returns 1, this routine returns without unlocking page |
1519 | * table locks. So callers must unlock them. | 1519 | * table locks. So callers must unlock them. |
1520 | */ | 1520 | */ |
1521 | int __pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma) | 1521 | int __pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma) |
1522 | { | 1522 | { |
1523 | spin_lock(&vma->vm_mm->page_table_lock); | 1523 | spin_lock(&vma->vm_mm->page_table_lock); |
1524 | if (likely(pmd_trans_huge(*pmd))) { | 1524 | if (likely(pmd_trans_huge(*pmd))) { |
1525 | if (unlikely(pmd_trans_splitting(*pmd))) { | 1525 | if (unlikely(pmd_trans_splitting(*pmd))) { |
1526 | spin_unlock(&vma->vm_mm->page_table_lock); | 1526 | spin_unlock(&vma->vm_mm->page_table_lock); |
1527 | wait_split_huge_page(vma->anon_vma, pmd); | 1527 | wait_split_huge_page(vma->anon_vma, pmd); |
1528 | return -1; | 1528 | return -1; |
1529 | } else { | 1529 | } else { |
1530 | /* Thp mapped by 'pmd' is stable, so we can | 1530 | /* Thp mapped by 'pmd' is stable, so we can |
1531 | * handle it as it is. */ | 1531 | * handle it as it is. */ |
1532 | return 1; | 1532 | return 1; |
1533 | } | 1533 | } |
1534 | } | 1534 | } |
1535 | spin_unlock(&vma->vm_mm->page_table_lock); | 1535 | spin_unlock(&vma->vm_mm->page_table_lock); |
1536 | return 0; | 1536 | return 0; |
1537 | } | 1537 | } |
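
The callers earlier in this file (zap_huge_pmd, mincore_huge_pmd, move_huge_pmd, change_huge_pmd) all use the same pattern around __pmd_trans_huge_lock(); a condensed sketch of the expected usage, with the caller owning the unlock only in the stable case:

	if (__pmd_trans_huge_lock(pmd, vma) == 1) {
		/* pmd maps a stable THP; page_table_lock is still held */
		/* ... operate on the huge pmd here ... */
		spin_unlock(&vma->vm_mm->page_table_lock);
	}
	/* A return of 0 (no THP) or -1 (it was splitting) means the lock
	 * has already been dropped, so there is nothing to unlock. */
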
1538 | 1538 | ||
1539 | pmd_t *page_check_address_pmd(struct page *page, | 1539 | pmd_t *page_check_address_pmd(struct page *page, |
1540 | struct mm_struct *mm, | 1540 | struct mm_struct *mm, |
1541 | unsigned long address, | 1541 | unsigned long address, |
1542 | enum page_check_address_pmd_flag flag) | 1542 | enum page_check_address_pmd_flag flag) |
1543 | { | 1543 | { |
1544 | pmd_t *pmd, *ret = NULL; | 1544 | pmd_t *pmd, *ret = NULL; |
1545 | 1545 | ||
1546 | if (address & ~HPAGE_PMD_MASK) | 1546 | if (address & ~HPAGE_PMD_MASK) |
1547 | goto out; | 1547 | goto out; |
1548 | 1548 | ||
1549 | pmd = mm_find_pmd(mm, address); | 1549 | pmd = mm_find_pmd(mm, address); |
1550 | if (!pmd) | 1550 | if (!pmd) |
1551 | goto out; | 1551 | goto out; |
1552 | if (pmd_none(*pmd)) | 1552 | if (pmd_none(*pmd)) |
1553 | goto out; | 1553 | goto out; |
1554 | if (pmd_page(*pmd) != page) | 1554 | if (pmd_page(*pmd) != page) |
1555 | goto out; | 1555 | goto out; |
1556 | /* | 1556 | /* |
1557 | * split_vma() may create temporary aliased mappings. There is | 1557 | * split_vma() may create temporary aliased mappings. There is |
1558 | * no risk as long as all huge pmd are found and have their | 1558 | * no risk as long as all huge pmd are found and have their |
1559 | * splitting bit set before __split_huge_page_refcount | 1559 | * splitting bit set before __split_huge_page_refcount |
1560 | * runs. Finding the same huge pmd more than once during the | 1560 | * runs. Finding the same huge pmd more than once during the |
1561 | * same rmap walk is not a problem. | 1561 | * same rmap walk is not a problem. |
1562 | */ | 1562 | */ |
1563 | if (flag == PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG && | 1563 | if (flag == PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG && |
1564 | pmd_trans_splitting(*pmd)) | 1564 | pmd_trans_splitting(*pmd)) |
1565 | goto out; | 1565 | goto out; |
1566 | if (pmd_trans_huge(*pmd)) { | 1566 | if (pmd_trans_huge(*pmd)) { |
1567 | VM_BUG_ON(flag == PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG && | 1567 | VM_BUG_ON(flag == PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG && |
1568 | !pmd_trans_splitting(*pmd)); | 1568 | !pmd_trans_splitting(*pmd)); |
1569 | ret = pmd; | 1569 | ret = pmd; |
1570 | } | 1570 | } |
1571 | out: | 1571 | out: |
1572 | return ret; | 1572 | return ret; |
1573 | } | 1573 | } |
1574 | 1574 | ||
1575 | static int __split_huge_page_splitting(struct page *page, | 1575 | static int __split_huge_page_splitting(struct page *page, |
1576 | struct vm_area_struct *vma, | 1576 | struct vm_area_struct *vma, |
1577 | unsigned long address) | 1577 | unsigned long address) |
1578 | { | 1578 | { |
1579 | struct mm_struct *mm = vma->vm_mm; | 1579 | struct mm_struct *mm = vma->vm_mm; |
1580 | pmd_t *pmd; | 1580 | pmd_t *pmd; |
1581 | int ret = 0; | 1581 | int ret = 0; |
1582 | /* For mmu_notifiers */ | 1582 | /* For mmu_notifiers */ |
1583 | const unsigned long mmun_start = address; | 1583 | const unsigned long mmun_start = address; |
1584 | const unsigned long mmun_end = address + HPAGE_PMD_SIZE; | 1584 | const unsigned long mmun_end = address + HPAGE_PMD_SIZE; |
1585 | 1585 | ||
1586 | mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); | 1586 | mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); |
1587 | spin_lock(&mm->page_table_lock); | 1587 | spin_lock(&mm->page_table_lock); |
1588 | pmd = page_check_address_pmd(page, mm, address, | 1588 | pmd = page_check_address_pmd(page, mm, address, |
1589 | PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG); | 1589 | PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG); |
1590 | if (pmd) { | 1590 | if (pmd) { |
1591 | /* | 1591 | /* |
1592 | * We can't temporarily set the pmd to null in order | 1592 | * We can't temporarily set the pmd to null in order |
1593 | * to split it, the pmd must remain marked huge at all | 1593 | * to split it, the pmd must remain marked huge at all |
1594 | * times or the VM won't take the pmd_trans_huge paths | 1594 | * times or the VM won't take the pmd_trans_huge paths |
1595 | * and it won't wait on the anon_vma->root->rwsem to | 1595 | * and it won't wait on the anon_vma->root->rwsem to |
1596 | * serialize against split_huge_page*. | 1596 | * serialize against split_huge_page*. |
1597 | */ | 1597 | */ |
1598 | pmdp_splitting_flush(vma, address, pmd); | 1598 | pmdp_splitting_flush(vma, address, pmd); |
1599 | ret = 1; | 1599 | ret = 1; |
1600 | } | 1600 | } |
1601 | spin_unlock(&mm->page_table_lock); | 1601 | spin_unlock(&mm->page_table_lock); |
1602 | mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); | 1602 | mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); |
1603 | 1603 | ||
1604 | return ret; | 1604 | return ret; |
1605 | } | 1605 | } |
1606 | 1606 | ||
1607 | static void __split_huge_page_refcount(struct page *page, | 1607 | static void __split_huge_page_refcount(struct page *page, |
1608 | struct list_head *list) | 1608 | struct list_head *list) |
1609 | { | 1609 | { |
1610 | int i; | 1610 | int i; |
1611 | struct zone *zone = page_zone(page); | 1611 | struct zone *zone = page_zone(page); |
1612 | struct lruvec *lruvec; | 1612 | struct lruvec *lruvec; |
1613 | int tail_count = 0; | 1613 | int tail_count = 0; |
1614 | 1614 | ||
1615 | /* prevent PageLRU to go away from under us, and freeze lru stats */ | 1615 | /* prevent PageLRU to go away from under us, and freeze lru stats */ |
1616 | spin_lock_irq(&zone->lru_lock); | 1616 | spin_lock_irq(&zone->lru_lock); |
1617 | lruvec = mem_cgroup_page_lruvec(page, zone); | 1617 | lruvec = mem_cgroup_page_lruvec(page, zone); |
1618 | 1618 | ||
1619 | compound_lock(page); | 1619 | compound_lock(page); |
1620 | /* complete memcg works before add pages to LRU */ | 1620 | /* complete memcg works before add pages to LRU */ |
1621 | mem_cgroup_split_huge_fixup(page); | 1621 | mem_cgroup_split_huge_fixup(page); |
1622 | 1622 | ||
1623 | for (i = HPAGE_PMD_NR - 1; i >= 1; i--) { | 1623 | for (i = HPAGE_PMD_NR - 1; i >= 1; i--) { |
1624 | struct page *page_tail = page + i; | 1624 | struct page *page_tail = page + i; |
1625 | 1625 | ||
1626 | /* tail_page->_mapcount cannot change */ | 1626 | /* tail_page->_mapcount cannot change */ |
1627 | BUG_ON(page_mapcount(page_tail) < 0); | 1627 | BUG_ON(page_mapcount(page_tail) < 0); |
1628 | tail_count += page_mapcount(page_tail); | 1628 | tail_count += page_mapcount(page_tail); |
1629 | /* check for overflow */ | 1629 | /* check for overflow */ |
1630 | BUG_ON(tail_count < 0); | 1630 | BUG_ON(tail_count < 0); |
1631 | BUG_ON(atomic_read(&page_tail->_count) != 0); | 1631 | BUG_ON(atomic_read(&page_tail->_count) != 0); |
1632 | /* | 1632 | /* |
1633 | * tail_page->_count is zero and not changing from | 1633 | * tail_page->_count is zero and not changing from |
1634 | * under us. But get_page_unless_zero() may be running | 1634 | * under us. But get_page_unless_zero() may be running |
1635 | * from under us on the tail_page. If we used | 1635 | * from under us on the tail_page. If we used |
1636 | * atomic_set() below instead of atomic_add(), we | 1636 | * atomic_set() below instead of atomic_add(), we |
1637 | * would then run atomic_set() concurrently with | 1637 | * would then run atomic_set() concurrently with |
1638 | * get_page_unless_zero(), and atomic_set() is | 1638 | * get_page_unless_zero(), and atomic_set() is |
1639 | * implemented in C not using locked ops. spin_unlock | 1639 | * implemented in C not using locked ops. spin_unlock |
1640 | * on x86 sometimes uses locked ops because of PPro | 1640 | * on x86 sometimes uses locked ops because of PPro |
1641 | * errata 66, 92, so unless somebody can guarantee | 1641 | * errata 66, 92, so unless somebody can guarantee |
1642 | * atomic_set() here would be safe on all archs (and | 1642 | * atomic_set() here would be safe on all archs (and |
1643 | * not only on x86), it's safer to use atomic_add(). | 1643 | * not only on x86), it's safer to use atomic_add(). |
1644 | */ | 1644 | */ |
1645 | atomic_add(page_mapcount(page) + page_mapcount(page_tail) + 1, | 1645 | atomic_add(page_mapcount(page) + page_mapcount(page_tail) + 1, |
1646 | &page_tail->_count); | 1646 | &page_tail->_count); |
1647 | 1647 | ||
1648 | /* after clearing PageTail the gup refcount can be released */ | 1648 | /* after clearing PageTail the gup refcount can be released */ |
1649 | smp_mb(); | 1649 | smp_mb(); |
1650 | 1650 | ||
1651 | /* | 1651 | /* |
1652 | * retain hwpoison flag of the poisoned tail page: | 1652 | * retain hwpoison flag of the poisoned tail page: |
1653 | * fix for an unsuitable process being killed on the guest (KVM) | 1653 | * fix for an unsuitable process being killed on the guest (KVM) |
1654 | * by memory-failure. | 1654 | * by memory-failure. |
1655 | */ | 1655 | */ |
1656 | page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP | __PG_HWPOISON; | 1656 | page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP | __PG_HWPOISON; |
1657 | page_tail->flags |= (page->flags & | 1657 | page_tail->flags |= (page->flags & |
1658 | ((1L << PG_referenced) | | 1658 | ((1L << PG_referenced) | |
1659 | (1L << PG_swapbacked) | | 1659 | (1L << PG_swapbacked) | |
1660 | (1L << PG_mlocked) | | 1660 | (1L << PG_mlocked) | |
1661 | (1L << PG_uptodate) | | 1661 | (1L << PG_uptodate) | |
1662 | (1L << PG_active) | | 1662 | (1L << PG_active) | |
1663 | (1L << PG_unevictable))); | 1663 | (1L << PG_unevictable))); |
1664 | page_tail->flags |= (1L << PG_dirty); | 1664 | page_tail->flags |= (1L << PG_dirty); |
1665 | 1665 | ||
1666 | /* clear PageTail before overwriting first_page */ | 1666 | /* clear PageTail before overwriting first_page */ |
1667 | smp_wmb(); | 1667 | smp_wmb(); |
1668 | 1668 | ||
1669 | /* | 1669 | /* |
1670 | * __split_huge_page_splitting() already set the | 1670 | * __split_huge_page_splitting() already set the |
1671 | * splitting bit in all pmd that could map this | 1671 | * splitting bit in all pmd that could map this |
1672 | * hugepage, that will ensure no CPU can alter the | 1672 | * hugepage, that will ensure no CPU can alter the |
1673 | * mapcount on the head page. The mapcount is only | 1673 | * mapcount on the head page. The mapcount is only |
1674 | * accounted in the head page and it has to be | 1674 | * accounted in the head page and it has to be |
1675 | * transferred to all tail pages in the below code. So | 1675 | * transferred to all tail pages in the below code. So |
1676 | * for this code to be safe, the mapcount can't change | 1676 | * for this code to be safe, the mapcount can't change |
1677 | * during the split. But that doesn't mean userland can't | 1677 | * during the split. But that doesn't mean userland can't |
1678 | * keep changing and reading the page contents while | 1678 | * keep changing and reading the page contents while |
1679 | * we transfer the mapcount, so the pmd splitting | 1679 | * we transfer the mapcount, so the pmd splitting |
1680 | * status is achieved setting a reserved bit in the | 1680 | * status is achieved setting a reserved bit in the |
1681 | * pmd, not by clearing the present bit. | 1681 | * pmd, not by clearing the present bit. |
1682 | */ | 1682 | */ |
1683 | page_tail->_mapcount = page->_mapcount; | 1683 | page_tail->_mapcount = page->_mapcount; |
1684 | 1684 | ||
1685 | BUG_ON(page_tail->mapping); | 1685 | BUG_ON(page_tail->mapping); |
1686 | page_tail->mapping = page->mapping; | 1686 | page_tail->mapping = page->mapping; |
1687 | 1687 | ||
1688 | page_tail->index = page->index + i; | 1688 | page_tail->index = page->index + i; |
1689 | page_nid_xchg_last(page_tail, page_nid_last(page)); | 1689 | page_nid_xchg_last(page_tail, page_nid_last(page)); |
1690 | 1690 | ||
1691 | BUG_ON(!PageAnon(page_tail)); | 1691 | BUG_ON(!PageAnon(page_tail)); |
1692 | BUG_ON(!PageUptodate(page_tail)); | 1692 | BUG_ON(!PageUptodate(page_tail)); |
1693 | BUG_ON(!PageDirty(page_tail)); | 1693 | BUG_ON(!PageDirty(page_tail)); |
1694 | BUG_ON(!PageSwapBacked(page_tail)); | 1694 | BUG_ON(!PageSwapBacked(page_tail)); |
1695 | 1695 | ||
1696 | lru_add_page_tail(page, page_tail, lruvec, list); | 1696 | lru_add_page_tail(page, page_tail, lruvec, list); |
1697 | } | 1697 | } |
1698 | atomic_sub(tail_count, &page->_count); | 1698 | atomic_sub(tail_count, &page->_count); |
1699 | BUG_ON(atomic_read(&page->_count) <= 0); | 1699 | BUG_ON(atomic_read(&page->_count) <= 0); |
1700 | 1700 | ||
1701 | __mod_zone_page_state(zone, NR_ANON_TRANSPARENT_HUGEPAGES, -1); | 1701 | __mod_zone_page_state(zone, NR_ANON_TRANSPARENT_HUGEPAGES, -1); |
1702 | 1702 | ||
1703 | ClearPageCompound(page); | 1703 | ClearPageCompound(page); |
1704 | compound_unlock(page); | 1704 | compound_unlock(page); |
1705 | spin_unlock_irq(&zone->lru_lock); | 1705 | spin_unlock_irq(&zone->lru_lock); |
1706 | 1706 | ||
1707 | for (i = 1; i < HPAGE_PMD_NR; i++) { | 1707 | for (i = 1; i < HPAGE_PMD_NR; i++) { |
1708 | struct page *page_tail = page + i; | 1708 | struct page *page_tail = page + i; |
1709 | BUG_ON(page_count(page_tail) <= 0); | 1709 | BUG_ON(page_count(page_tail) <= 0); |
1710 | /* | 1710 | /* |
1711 | * Tail pages may be freed if there wasn't any mapping, | 1711 | * Tail pages may be freed if there wasn't any mapping, |
1712 | * e.g. if add_to_swap() is running on an lru page that | 1712 | * e.g. if add_to_swap() is running on an lru page that |
1713 | * had its mapping zapped. And freeing these pages | 1713 | * had its mapping zapped. And freeing these pages |
1714 | * requires taking the lru_lock so we do the put_page | 1714 | * requires taking the lru_lock so we do the put_page |
1715 | * of the tail pages after the split is complete. | 1715 | * of the tail pages after the split is complete. |
1716 | */ | 1716 | */ |
1717 | put_page(page_tail); | 1717 | put_page(page_tail); |
1718 | } | 1718 | } |
1719 | 1719 | ||
1720 | /* | 1720 | /* |
1721 | * Only the head page (now a regular page) is required | 1721 | * Only the head page (now a regular page) is required |
1722 | * to be pinned by the caller. | 1722 | * to be pinned by the caller. |
1723 | */ | 1723 | */ |
1724 | BUG_ON(page_count(page) <= 0); | 1724 | BUG_ON(page_count(page) <= 0); |
1725 | } | 1725 | } |
1726 | 1726 | ||
1727 | static int __split_huge_page_map(struct page *page, | 1727 | static int __split_huge_page_map(struct page *page, |
1728 | struct vm_area_struct *vma, | 1728 | struct vm_area_struct *vma, |
1729 | unsigned long address) | 1729 | unsigned long address) |
1730 | { | 1730 | { |
1731 | struct mm_struct *mm = vma->vm_mm; | 1731 | struct mm_struct *mm = vma->vm_mm; |
1732 | pmd_t *pmd, _pmd; | 1732 | pmd_t *pmd, _pmd; |
1733 | int ret = 0, i; | 1733 | int ret = 0, i; |
1734 | pgtable_t pgtable; | 1734 | pgtable_t pgtable; |
1735 | unsigned long haddr; | 1735 | unsigned long haddr; |
1736 | 1736 | ||
1737 | spin_lock(&mm->page_table_lock); | 1737 | spin_lock(&mm->page_table_lock); |
1738 | pmd = page_check_address_pmd(page, mm, address, | 1738 | pmd = page_check_address_pmd(page, mm, address, |
1739 | PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG); | 1739 | PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG); |
1740 | if (pmd) { | 1740 | if (pmd) { |
1741 | pgtable = pgtable_trans_huge_withdraw(mm, pmd); | 1741 | pgtable = pgtable_trans_huge_withdraw(mm, pmd); |
1742 | pmd_populate(mm, &_pmd, pgtable); | 1742 | pmd_populate(mm, &_pmd, pgtable); |
1743 | 1743 | ||
1744 | haddr = address; | 1744 | haddr = address; |
1745 | for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) { | 1745 | for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) { |
1746 | pte_t *pte, entry; | 1746 | pte_t *pte, entry; |
1747 | BUG_ON(PageCompound(page+i)); | 1747 | BUG_ON(PageCompound(page+i)); |
1748 | entry = mk_pte(page + i, vma->vm_page_prot); | 1748 | entry = mk_pte(page + i, vma->vm_page_prot); |
1749 | entry = maybe_mkwrite(pte_mkdirty(entry), vma); | 1749 | entry = maybe_mkwrite(pte_mkdirty(entry), vma); |
1750 | if (!pmd_write(*pmd)) | 1750 | if (!pmd_write(*pmd)) |
1751 | entry = pte_wrprotect(entry); | 1751 | entry = pte_wrprotect(entry); |
1752 | else | 1752 | else |
1753 | BUG_ON(page_mapcount(page) != 1); | 1753 | BUG_ON(page_mapcount(page) != 1); |
1754 | if (!pmd_young(*pmd)) | 1754 | if (!pmd_young(*pmd)) |
1755 | entry = pte_mkold(entry); | 1755 | entry = pte_mkold(entry); |
1756 | if (pmd_numa(*pmd)) | 1756 | if (pmd_numa(*pmd)) |
1757 | entry = pte_mknuma(entry); | 1757 | entry = pte_mknuma(entry); |
1758 | pte = pte_offset_map(&_pmd, haddr); | 1758 | pte = pte_offset_map(&_pmd, haddr); |
1759 | BUG_ON(!pte_none(*pte)); | 1759 | BUG_ON(!pte_none(*pte)); |
1760 | set_pte_at(mm, haddr, pte, entry); | 1760 | set_pte_at(mm, haddr, pte, entry); |
1761 | pte_unmap(pte); | 1761 | pte_unmap(pte); |
1762 | } | 1762 | } |
1763 | 1763 | ||
1764 | smp_wmb(); /* make pte visible before pmd */ | 1764 | smp_wmb(); /* make pte visible before pmd */ |
1765 | /* | 1765 | /* |
1766 | * Up to this point the pmd is present and huge and | 1766 | * Up to this point the pmd is present and huge and |
1767 | * userland has full access to the hugepage | 1767 | * userland has full access to the hugepage |
1768 | * during the split (which happens in place). If we | 1768 | * during the split (which happens in place). If we |
1769 | * overwrite the pmd with the not-huge version | 1769 | * overwrite the pmd with the not-huge version |
1770 | * pointing to the pte here (which of course we could | 1770 | * pointing to the pte here (which of course we could |
1771 | * if all CPUs were bug free), userland could trigger | 1771 | * if all CPUs were bug free), userland could trigger |
1772 | * a small page size TLB miss on the small sized TLB | 1772 | * a small page size TLB miss on the small sized TLB |
1773 | * while the hugepage TLB entry is still established | 1773 | * while the hugepage TLB entry is still established |
1774 | * in the huge TLB. Some CPUs don't like that. See | 1774 | * in the huge TLB. Some CPUs don't like that. See |
1775 | * http://support.amd.com/us/Processor_TechDocs/41322.pdf, | 1775 | * http://support.amd.com/us/Processor_TechDocs/41322.pdf, |
1776 | * Erratum 383 on page 93. Intel should be safe but | 1776 | * Erratum 383 on page 93. Intel should be safe but |
1777 | * also warns that it's only safe if the permission | 1777 | * also warns that it's only safe if the permission |
1778 | * and cache attributes of the two entries loaded in | 1778 | * and cache attributes of the two entries loaded in |
1779 | * the two TLBs are identical (which should be the case | 1779 | * the two TLBs are identical (which should be the case |
1780 | * here). But it is generally safer to never allow | 1780 | * here). But it is generally safer to never allow |
1781 | * small and huge TLB entries for the same virtual | 1781 | * small and huge TLB entries for the same virtual |
1782 | * address to be loaded simultaneously. So instead of | 1782 | * address to be loaded simultaneously. So instead of |
1783 | * doing "pmd_populate(); flush_tlb_range();" we first | 1783 | * doing "pmd_populate(); flush_tlb_range();" we first |
1784 | * mark the current pmd notpresent (atomically because | 1784 | * mark the current pmd notpresent (atomically because |
1785 | * here the pmd_trans_huge and pmd_trans_splitting | 1785 | * here the pmd_trans_huge and pmd_trans_splitting |
1786 | * must remain set at all times on the pmd until the | 1786 | * must remain set at all times on the pmd until the |
1787 | * split is complete for this pmd), then we flush the | 1787 | * split is complete for this pmd), then we flush the |
1788 | * SMP TLB and finally we write the non-huge version | 1788 | * SMP TLB and finally we write the non-huge version |
1789 | * of the pmd entry with pmd_populate. | 1789 | * of the pmd entry with pmd_populate. |
1790 | */ | 1790 | */ |
1791 | pmdp_invalidate(vma, address, pmd); | 1791 | pmdp_invalidate(vma, address, pmd); |
1792 | pmd_populate(mm, pmd, pgtable); | 1792 | pmd_populate(mm, pmd, pgtable); |
1793 | ret = 1; | 1793 | ret = 1; |
1794 | } | 1794 | } |
1795 | spin_unlock(&mm->page_table_lock); | 1795 | spin_unlock(&mm->page_table_lock); |
1796 | 1796 | ||
1797 | return ret; | 1797 | return ret; |
1798 | } | 1798 | } |
1799 | 1799 | ||
1800 | /* must be called with anon_vma->root->rwsem held */ | 1800 | /* must be called with anon_vma->root->rwsem held */ |
1801 | static void __split_huge_page(struct page *page, | 1801 | static void __split_huge_page(struct page *page, |
1802 | struct anon_vma *anon_vma, | 1802 | struct anon_vma *anon_vma, |
1803 | struct list_head *list) | 1803 | struct list_head *list) |
1804 | { | 1804 | { |
1805 | int mapcount, mapcount2; | 1805 | int mapcount, mapcount2; |
1806 | pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); | 1806 | pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); |
1807 | struct anon_vma_chain *avc; | 1807 | struct anon_vma_chain *avc; |
1808 | 1808 | ||
1809 | BUG_ON(!PageHead(page)); | 1809 | BUG_ON(!PageHead(page)); |
1810 | BUG_ON(PageTail(page)); | 1810 | BUG_ON(PageTail(page)); |
1811 | 1811 | ||
1812 | mapcount = 0; | 1812 | mapcount = 0; |
1813 | anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) { | 1813 | anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) { |
1814 | struct vm_area_struct *vma = avc->vma; | 1814 | struct vm_area_struct *vma = avc->vma; |
1815 | unsigned long addr = vma_address(page, vma); | 1815 | unsigned long addr = vma_address(page, vma); |
1816 | BUG_ON(is_vma_temporary_stack(vma)); | 1816 | BUG_ON(is_vma_temporary_stack(vma)); |
1817 | mapcount += __split_huge_page_splitting(page, vma, addr); | 1817 | mapcount += __split_huge_page_splitting(page, vma, addr); |
1818 | } | 1818 | } |
1819 | /* | 1819 | /* |
1820 | * It is critical that new vmas are added to the tail of the | 1820 | * It is critical that new vmas are added to the tail of the |
1821 | * anon_vma list. This guarantees that if copy_huge_pmd() runs | 1821 | * anon_vma list. This guarantees that if copy_huge_pmd() runs |
1822 | * and establishes a child pmd before | 1822 | * and establishes a child pmd before |
1823 | * __split_huge_page_splitting() freezes the parent pmd (so if | 1823 | * __split_huge_page_splitting() freezes the parent pmd (so if |
1824 | * we fail to prevent copy_huge_pmd() from running until the | 1824 | * we fail to prevent copy_huge_pmd() from running until the |
1825 | * whole __split_huge_page() is complete), we will still see | 1825 | * whole __split_huge_page() is complete), we will still see |
1826 | * the newly established pmd of the child later during the | 1826 | * the newly established pmd of the child later during the |
1827 | * walk, to be able to set it as pmd_trans_splitting too. | 1827 | * walk, to be able to set it as pmd_trans_splitting too. |
1828 | */ | 1828 | */ |
1829 | if (mapcount != page_mapcount(page)) | 1829 | if (mapcount != page_mapcount(page)) |
1830 | printk(KERN_ERR "mapcount %d page_mapcount %d\n", | 1830 | printk(KERN_ERR "mapcount %d page_mapcount %d\n", |
1831 | mapcount, page_mapcount(page)); | 1831 | mapcount, page_mapcount(page)); |
1832 | BUG_ON(mapcount != page_mapcount(page)); | 1832 | BUG_ON(mapcount != page_mapcount(page)); |
1833 | 1833 | ||
1834 | __split_huge_page_refcount(page, list); | 1834 | __split_huge_page_refcount(page, list); |
1835 | 1835 | ||
1836 | mapcount2 = 0; | 1836 | mapcount2 = 0; |
1837 | anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) { | 1837 | anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) { |
1838 | struct vm_area_struct *vma = avc->vma; | 1838 | struct vm_area_struct *vma = avc->vma; |
1839 | unsigned long addr = vma_address(page, vma); | 1839 | unsigned long addr = vma_address(page, vma); |
1840 | BUG_ON(is_vma_temporary_stack(vma)); | 1840 | BUG_ON(is_vma_temporary_stack(vma)); |
1841 | mapcount2 += __split_huge_page_map(page, vma, addr); | 1841 | mapcount2 += __split_huge_page_map(page, vma, addr); |
1842 | } | 1842 | } |
1843 | if (mapcount != mapcount2) | 1843 | if (mapcount != mapcount2) |
1844 | printk(KERN_ERR "mapcount %d mapcount2 %d page_mapcount %d\n", | 1844 | printk(KERN_ERR "mapcount %d mapcount2 %d page_mapcount %d\n", |
1845 | mapcount, mapcount2, page_mapcount(page)); | 1845 | mapcount, mapcount2, page_mapcount(page)); |
1846 | BUG_ON(mapcount != mapcount2); | 1846 | BUG_ON(mapcount != mapcount2); |
1847 | } | 1847 | } |
1848 | 1848 | ||
1849 | /* | 1849 | /* |
1850 | * Split a hugepage into normal pages. This doesn't change the position of head | 1850 | * Split a hugepage into normal pages. This doesn't change the position of head |
1851 | * page. If @list is null, tail pages will be added to LRU list, otherwise, to | 1851 | * page. If @list is null, tail pages will be added to LRU list, otherwise, to |
1852 | * @list. Both head page and tail pages will inherit mapping, flags, and so on | 1852 | * @list. Both head page and tail pages will inherit mapping, flags, and so on |
1853 | * from the hugepage. | 1853 | * from the hugepage. |
1854 | * Return 0 if the hugepage is split successfully otherwise return 1. | 1854 | * Return 0 if the hugepage is split successfully otherwise return 1. |
1855 | */ | 1855 | */ |
1856 | int split_huge_page_to_list(struct page *page, struct list_head *list) | 1856 | int split_huge_page_to_list(struct page *page, struct list_head *list) |
1857 | { | 1857 | { |
1858 | struct anon_vma *anon_vma; | 1858 | struct anon_vma *anon_vma; |
1859 | int ret = 1; | 1859 | int ret = 1; |
1860 | 1860 | ||
1861 | BUG_ON(is_huge_zero_page(page)); | 1861 | BUG_ON(is_huge_zero_page(page)); |
1862 | BUG_ON(!PageAnon(page)); | 1862 | BUG_ON(!PageAnon(page)); |
1863 | 1863 | ||
1864 | /* | 1864 | /* |
1865 | * The caller does not necessarily hold an mmap_sem that would prevent | 1865 | * The caller does not necessarily hold an mmap_sem that would prevent |
1866 | * the anon_vma disappearing so we first take a reference to it | 1866 | * the anon_vma disappearing so we first take a reference to it |
1867 | * and then lock the anon_vma for write. This is similar to | 1867 | * and then lock the anon_vma for write. This is similar to |
1868 | * page_lock_anon_vma_read except the write lock is taken to serialise | 1868 | * page_lock_anon_vma_read except the write lock is taken to serialise |
1869 | * against parallel split or collapse operations. | 1869 | * against parallel split or collapse operations. |
1870 | */ | 1870 | */ |
1871 | anon_vma = page_get_anon_vma(page); | 1871 | anon_vma = page_get_anon_vma(page); |
1872 | if (!anon_vma) | 1872 | if (!anon_vma) |
1873 | goto out; | 1873 | goto out; |
1874 | anon_vma_lock_write(anon_vma); | 1874 | anon_vma_lock_write(anon_vma); |
1875 | 1875 | ||
1876 | ret = 0; | 1876 | ret = 0; |
1877 | if (!PageCompound(page)) | 1877 | if (!PageCompound(page)) |
1878 | goto out_unlock; | 1878 | goto out_unlock; |
1879 | 1879 | ||
1880 | BUG_ON(!PageSwapBacked(page)); | 1880 | BUG_ON(!PageSwapBacked(page)); |
1881 | __split_huge_page(page, anon_vma, list); | 1881 | __split_huge_page(page, anon_vma, list); |
1882 | count_vm_event(THP_SPLIT); | 1882 | count_vm_event(THP_SPLIT); |
1883 | 1883 | ||
1884 | BUG_ON(PageCompound(page)); | 1884 | BUG_ON(PageCompound(page)); |
1885 | out_unlock: | 1885 | out_unlock: |
1886 | anon_vma_unlock_write(anon_vma); | 1886 | anon_vma_unlock_write(anon_vma); |
1887 | put_anon_vma(anon_vma); | 1887 | put_anon_vma(anon_vma); |
1888 | out: | 1888 | out: |
1889 | return ret; | 1889 | return ret; |
1890 | } | 1890 | } |
1891 | 1891 | ||
1892 | #define VM_NO_THP (VM_SPECIAL | VM_HUGETLB | VM_SHARED | VM_MAYSHARE) | 1892 | #define VM_NO_THP (VM_SPECIAL | VM_HUGETLB | VM_SHARED | VM_MAYSHARE) |
1893 | 1893 | ||
1894 | int hugepage_madvise(struct vm_area_struct *vma, | 1894 | int hugepage_madvise(struct vm_area_struct *vma, |
1895 | unsigned long *vm_flags, int advice) | 1895 | unsigned long *vm_flags, int advice) |
1896 | { | 1896 | { |
1897 | struct mm_struct *mm = vma->vm_mm; | 1897 | struct mm_struct *mm = vma->vm_mm; |
1898 | 1898 | ||
1899 | switch (advice) { | 1899 | switch (advice) { |
1900 | case MADV_HUGEPAGE: | 1900 | case MADV_HUGEPAGE: |
1901 | /* | 1901 | /* |
1902 | * Be somewhat over-protective like KSM for now! | 1902 | * Be somewhat over-protective like KSM for now! |
1903 | */ | 1903 | */ |
1904 | if (*vm_flags & (VM_HUGEPAGE | VM_NO_THP)) | 1904 | if (*vm_flags & (VM_HUGEPAGE | VM_NO_THP)) |
1905 | return -EINVAL; | 1905 | return -EINVAL; |
1906 | if (mm->def_flags & VM_NOHUGEPAGE) | 1906 | if (mm->def_flags & VM_NOHUGEPAGE) |
1907 | return -EINVAL; | 1907 | return -EINVAL; |
1908 | *vm_flags &= ~VM_NOHUGEPAGE; | 1908 | *vm_flags &= ~VM_NOHUGEPAGE; |
1909 | *vm_flags |= VM_HUGEPAGE; | 1909 | *vm_flags |= VM_HUGEPAGE; |
1910 | /* | 1910 | /* |
1911 | * If the vma becomes good for khugepaged to scan, | 1911 | * If the vma becomes good for khugepaged to scan, |
1912 | * register it here without waiting for a page fault that | 1912 | * register it here without waiting for a page fault that |
1913 | * may not happen any time soon. | 1913 | * may not happen any time soon. |
1914 | */ | 1914 | */ |
1915 | if (unlikely(khugepaged_enter_vma_merge(vma))) | 1915 | if (unlikely(khugepaged_enter_vma_merge(vma))) |
1916 | return -ENOMEM; | 1916 | return -ENOMEM; |
1917 | break; | 1917 | break; |
1918 | case MADV_NOHUGEPAGE: | 1918 | case MADV_NOHUGEPAGE: |
1919 | /* | 1919 | /* |
1920 | * Be somewhat over-protective like KSM for now! | 1920 | * Be somewhat over-protective like KSM for now! |
1921 | */ | 1921 | */ |
1922 | if (*vm_flags & (VM_NOHUGEPAGE | VM_NO_THP)) | 1922 | if (*vm_flags & (VM_NOHUGEPAGE | VM_NO_THP)) |
1923 | return -EINVAL; | 1923 | return -EINVAL; |
1924 | *vm_flags &= ~VM_HUGEPAGE; | 1924 | *vm_flags &= ~VM_HUGEPAGE; |
1925 | *vm_flags |= VM_NOHUGEPAGE; | 1925 | *vm_flags |= VM_NOHUGEPAGE; |
1926 | /* | 1926 | /* |
1927 | * Setting VM_NOHUGEPAGE will prevent khugepaged from scanning | 1927 | * Setting VM_NOHUGEPAGE will prevent khugepaged from scanning |
1928 | * this vma even if we leave the mm registered in khugepaged if | 1928 | * this vma even if we leave the mm registered in khugepaged if |
1929 | * it got registered before VM_NOHUGEPAGE was set. | 1929 | * it got registered before VM_NOHUGEPAGE was set. |
1930 | */ | 1930 | */ |
1931 | break; | 1931 | break; |
1932 | } | 1932 | } |
1933 | 1933 | ||
1934 | return 0; | 1934 | return 0; |
1935 | } | 1935 | } |
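As a rough sketch of the flag handling in hugepage_madvise() above, the standalone program below models the reject-then-flip behaviour: a vma that can never be THP-backed (or already carries the requested hint) is rejected, otherwise the opposite hint is cleared and the requested one set. The flag bit values are invented for illustration and do not match the kernel's definitions.

#include <stdio.h>

#define VM_HUGEPAGE   0x01UL
#define VM_NOHUGEPAGE 0x02UL
#define VM_NO_THP     0x04UL   /* stand-in for the VM_SPECIAL|VM_HUGETLB|... mask */

static int madvise_hugepage(unsigned long *vm_flags, int enable)
{
    unsigned long forbidden = enable ? (VM_HUGEPAGE | VM_NO_THP)
                                     : (VM_NOHUGEPAGE | VM_NO_THP);
    if (*vm_flags & forbidden)
        return -1;                       /* -EINVAL in the kernel */
    if (enable) {
        *vm_flags &= ~VM_NOHUGEPAGE;
        *vm_flags |= VM_HUGEPAGE;
    } else {
        *vm_flags &= ~VM_HUGEPAGE;
        *vm_flags |= VM_NOHUGEPAGE;
    }
    return 0;
}

int main(void)
{
    unsigned long flags = VM_NOHUGEPAGE; /* vma previously marked MADV_NOHUGEPAGE */
    madvise_hugepage(&flags, 1);         /* MADV_HUGEPAGE flips the hint */
    printf("flags after MADV_HUGEPAGE: %#lx\n", flags);
    return 0;
}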
1936 | 1936 | ||
1937 | static int __init khugepaged_slab_init(void) | 1937 | static int __init khugepaged_slab_init(void) |
1938 | { | 1938 | { |
1939 | mm_slot_cache = kmem_cache_create("khugepaged_mm_slot", | 1939 | mm_slot_cache = kmem_cache_create("khugepaged_mm_slot", |
1940 | sizeof(struct mm_slot), | 1940 | sizeof(struct mm_slot), |
1941 | __alignof__(struct mm_slot), 0, NULL); | 1941 | __alignof__(struct mm_slot), 0, NULL); |
1942 | if (!mm_slot_cache) | 1942 | if (!mm_slot_cache) |
1943 | return -ENOMEM; | 1943 | return -ENOMEM; |
1944 | 1944 | ||
1945 | return 0; | 1945 | return 0; |
1946 | } | 1946 | } |
1947 | 1947 | ||
1948 | static inline struct mm_slot *alloc_mm_slot(void) | 1948 | static inline struct mm_slot *alloc_mm_slot(void) |
1949 | { | 1949 | { |
1950 | if (!mm_slot_cache) /* initialization failed */ | 1950 | if (!mm_slot_cache) /* initialization failed */ |
1951 | return NULL; | 1951 | return NULL; |
1952 | return kmem_cache_zalloc(mm_slot_cache, GFP_KERNEL); | 1952 | return kmem_cache_zalloc(mm_slot_cache, GFP_KERNEL); |
1953 | } | 1953 | } |
1954 | 1954 | ||
1955 | static inline void free_mm_slot(struct mm_slot *mm_slot) | 1955 | static inline void free_mm_slot(struct mm_slot *mm_slot) |
1956 | { | 1956 | { |
1957 | kmem_cache_free(mm_slot_cache, mm_slot); | 1957 | kmem_cache_free(mm_slot_cache, mm_slot); |
1958 | } | 1958 | } |
1959 | 1959 | ||
1960 | static struct mm_slot *get_mm_slot(struct mm_struct *mm) | 1960 | static struct mm_slot *get_mm_slot(struct mm_struct *mm) |
1961 | { | 1961 | { |
1962 | struct mm_slot *mm_slot; | 1962 | struct mm_slot *mm_slot; |
1963 | 1963 | ||
1964 | hash_for_each_possible(mm_slots_hash, mm_slot, hash, (unsigned long)mm) | 1964 | hash_for_each_possible(mm_slots_hash, mm_slot, hash, (unsigned long)mm) |
1965 | if (mm == mm_slot->mm) | 1965 | if (mm == mm_slot->mm) |
1966 | return mm_slot; | 1966 | return mm_slot; |
1967 | 1967 | ||
1968 | return NULL; | 1968 | return NULL; |
1969 | } | 1969 | } |
1970 | 1970 | ||
1971 | static void insert_to_mm_slots_hash(struct mm_struct *mm, | 1971 | static void insert_to_mm_slots_hash(struct mm_struct *mm, |
1972 | struct mm_slot *mm_slot) | 1972 | struct mm_slot *mm_slot) |
1973 | { | 1973 | { |
1974 | mm_slot->mm = mm; | 1974 | mm_slot->mm = mm; |
1975 | hash_add(mm_slots_hash, &mm_slot->hash, (long)mm); | 1975 | hash_add(mm_slots_hash, &mm_slot->hash, (long)mm); |
1976 | } | 1976 | } |
1977 | 1977 | ||
1978 | static inline int khugepaged_test_exit(struct mm_struct *mm) | 1978 | static inline int khugepaged_test_exit(struct mm_struct *mm) |
1979 | { | 1979 | { |
1980 | return atomic_read(&mm->mm_users) == 0; | 1980 | return atomic_read(&mm->mm_users) == 0; |
1981 | } | 1981 | } |
1982 | 1982 | ||
1983 | int __khugepaged_enter(struct mm_struct *mm) | 1983 | int __khugepaged_enter(struct mm_struct *mm) |
1984 | { | 1984 | { |
1985 | struct mm_slot *mm_slot; | 1985 | struct mm_slot *mm_slot; |
1986 | int wakeup; | 1986 | int wakeup; |
1987 | 1987 | ||
1988 | mm_slot = alloc_mm_slot(); | 1988 | mm_slot = alloc_mm_slot(); |
1989 | if (!mm_slot) | 1989 | if (!mm_slot) |
1990 | return -ENOMEM; | 1990 | return -ENOMEM; |
1991 | 1991 | ||
1992 | /* __khugepaged_exit() must not run from under us */ | 1992 | /* __khugepaged_exit() must not run from under us */ |
1993 | VM_BUG_ON(khugepaged_test_exit(mm)); | 1993 | VM_BUG_ON(khugepaged_test_exit(mm)); |
1994 | if (unlikely(test_and_set_bit(MMF_VM_HUGEPAGE, &mm->flags))) { | 1994 | if (unlikely(test_and_set_bit(MMF_VM_HUGEPAGE, &mm->flags))) { |
1995 | free_mm_slot(mm_slot); | 1995 | free_mm_slot(mm_slot); |
1996 | return 0; | 1996 | return 0; |
1997 | } | 1997 | } |
1998 | 1998 | ||
1999 | spin_lock(&khugepaged_mm_lock); | 1999 | spin_lock(&khugepaged_mm_lock); |
2000 | insert_to_mm_slots_hash(mm, mm_slot); | 2000 | insert_to_mm_slots_hash(mm, mm_slot); |
2001 | /* | 2001 | /* |
2002 | * Insert just behind the scanning cursor, to let the area settle | 2002 | * Insert just behind the scanning cursor, to let the area settle |
2003 | * down a little. | 2003 | * down a little. |
2004 | */ | 2004 | */ |
2005 | wakeup = list_empty(&khugepaged_scan.mm_head); | 2005 | wakeup = list_empty(&khugepaged_scan.mm_head); |
2006 | list_add_tail(&mm_slot->mm_node, &khugepaged_scan.mm_head); | 2006 | list_add_tail(&mm_slot->mm_node, &khugepaged_scan.mm_head); |
2007 | spin_unlock(&khugepaged_mm_lock); | 2007 | spin_unlock(&khugepaged_mm_lock); |
2008 | 2008 | ||
2009 | atomic_inc(&mm->mm_count); | 2009 | atomic_inc(&mm->mm_count); |
2010 | if (wakeup) | 2010 | if (wakeup) |
2011 | wake_up_interruptible(&khugepaged_wait); | 2011 | wake_up_interruptible(&khugepaged_wait); |
2012 | 2012 | ||
2013 | return 0; | 2013 | return 0; |
2014 | } | 2014 | } |
2015 | 2015 | ||
2016 | int khugepaged_enter_vma_merge(struct vm_area_struct *vma) | 2016 | int khugepaged_enter_vma_merge(struct vm_area_struct *vma) |
2017 | { | 2017 | { |
2018 | unsigned long hstart, hend; | 2018 | unsigned long hstart, hend; |
2019 | if (!vma->anon_vma) | 2019 | if (!vma->anon_vma) |
2020 | /* | 2020 | /* |
2021 | * Not yet faulted in so we will register later in the | 2021 | * Not yet faulted in so we will register later in the |
2022 | * page fault if needed. | 2022 | * page fault if needed. |
2023 | */ | 2023 | */ |
2024 | return 0; | 2024 | return 0; |
2025 | if (vma->vm_ops) | 2025 | if (vma->vm_ops) |
2026 | /* khugepaged not yet working on file or special mappings */ | 2026 | /* khugepaged not yet working on file or special mappings */ |
2027 | return 0; | 2027 | return 0; |
2028 | VM_BUG_ON(vma->vm_flags & VM_NO_THP); | 2028 | VM_BUG_ON(vma->vm_flags & VM_NO_THP); |
2029 | hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; | 2029 | hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; |
2030 | hend = vma->vm_end & HPAGE_PMD_MASK; | 2030 | hend = vma->vm_end & HPAGE_PMD_MASK; |
2031 | if (hstart < hend) | 2031 | if (hstart < hend) |
2032 | return khugepaged_enter(vma); | 2032 | return khugepaged_enter(vma); |
2033 | return 0; | 2033 | return 0; |
2034 | } | 2034 | } |
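The hstart/hend computation in khugepaged_enter_vma_merge() rounds the vma inward to huge-page boundaries and registers the mm only if a whole aligned huge page fits. A standalone illustration of that arithmetic, assuming 2MB huge pages and made-up addresses:

#include <stdio.h>

#define HPAGE_PMD_SIZE (2UL << 20)
#define HPAGE_PMD_MASK (~(HPAGE_PMD_SIZE - 1))

int main(void)
{
    unsigned long vm_start = 0x00601000;   /* not 2MB aligned (example) */
    unsigned long vm_end   = 0x00a00000;

    /* round vm_start up, vm_end down, to a huge-page boundary */
    unsigned long hstart = (vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
    unsigned long hend   = vm_end & HPAGE_PMD_MASK;

    printf("hstart=%#lx hend=%#lx -> %s\n", hstart, hend,
           hstart < hend ? "register with khugepaged" : "no room, skip");
    return 0;
}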
2035 | 2035 | ||
2036 | void __khugepaged_exit(struct mm_struct *mm) | 2036 | void __khugepaged_exit(struct mm_struct *mm) |
2037 | { | 2037 | { |
2038 | struct mm_slot *mm_slot; | 2038 | struct mm_slot *mm_slot; |
2039 | int free = 0; | 2039 | int free = 0; |
2040 | 2040 | ||
2041 | spin_lock(&khugepaged_mm_lock); | 2041 | spin_lock(&khugepaged_mm_lock); |
2042 | mm_slot = get_mm_slot(mm); | 2042 | mm_slot = get_mm_slot(mm); |
2043 | if (mm_slot && khugepaged_scan.mm_slot != mm_slot) { | 2043 | if (mm_slot && khugepaged_scan.mm_slot != mm_slot) { |
2044 | hash_del(&mm_slot->hash); | 2044 | hash_del(&mm_slot->hash); |
2045 | list_del(&mm_slot->mm_node); | 2045 | list_del(&mm_slot->mm_node); |
2046 | free = 1; | 2046 | free = 1; |
2047 | } | 2047 | } |
2048 | spin_unlock(&khugepaged_mm_lock); | 2048 | spin_unlock(&khugepaged_mm_lock); |
2049 | 2049 | ||
2050 | if (free) { | 2050 | if (free) { |
2051 | clear_bit(MMF_VM_HUGEPAGE, &mm->flags); | 2051 | clear_bit(MMF_VM_HUGEPAGE, &mm->flags); |
2052 | free_mm_slot(mm_slot); | 2052 | free_mm_slot(mm_slot); |
2053 | mmdrop(mm); | 2053 | mmdrop(mm); |
2054 | } else if (mm_slot) { | 2054 | } else if (mm_slot) { |
2055 | /* | 2055 | /* |
2056 | * This is required to serialize against | 2056 | * This is required to serialize against |
2057 | * khugepaged_test_exit() (which is guaranteed to run | 2057 | * khugepaged_test_exit() (which is guaranteed to run |
2058 | * under mmap sem read mode). Stop here (after we | 2058 | * under mmap sem read mode). Stop here (after we |
2059 | * return all pagetables will be destroyed) until | 2059 | * return all pagetables will be destroyed) until |
2060 | * khugepaged has finished working on the pagetables | 2060 | * khugepaged has finished working on the pagetables |
2061 | * under the mmap_sem. | 2061 | * under the mmap_sem. |
2062 | */ | 2062 | */ |
2063 | down_write(&mm->mmap_sem); | 2063 | down_write(&mm->mmap_sem); |
2064 | up_write(&mm->mmap_sem); | 2064 | up_write(&mm->mmap_sem); |
2065 | } | 2065 | } |
2066 | } | 2066 | } |
2067 | 2067 | ||
2068 | static void release_pte_page(struct page *page) | 2068 | static void release_pte_page(struct page *page) |
2069 | { | 2069 | { |
2070 | /* 0 stands for page_is_file_cache(page) == false */ | 2070 | /* 0 stands for page_is_file_cache(page) == false */ |
2071 | dec_zone_page_state(page, NR_ISOLATED_ANON + 0); | 2071 | dec_zone_page_state(page, NR_ISOLATED_ANON + 0); |
2072 | unlock_page(page); | 2072 | unlock_page(page); |
2073 | putback_lru_page(page); | 2073 | putback_lru_page(page); |
2074 | } | 2074 | } |
2075 | 2075 | ||
2076 | static void release_pte_pages(pte_t *pte, pte_t *_pte) | 2076 | static void release_pte_pages(pte_t *pte, pte_t *_pte) |
2077 | { | 2077 | { |
2078 | while (--_pte >= pte) { | 2078 | while (--_pte >= pte) { |
2079 | pte_t pteval = *_pte; | 2079 | pte_t pteval = *_pte; |
2080 | if (!pte_none(pteval)) | 2080 | if (!pte_none(pteval)) |
2081 | release_pte_page(pte_page(pteval)); | 2081 | release_pte_page(pte_page(pteval)); |
2082 | } | 2082 | } |
2083 | } | 2083 | } |
2084 | 2084 | ||
2085 | static int __collapse_huge_page_isolate(struct vm_area_struct *vma, | 2085 | static int __collapse_huge_page_isolate(struct vm_area_struct *vma, |
2086 | unsigned long address, | 2086 | unsigned long address, |
2087 | pte_t *pte) | 2087 | pte_t *pte) |
2088 | { | 2088 | { |
2089 | struct page *page; | 2089 | struct page *page; |
2090 | pte_t *_pte; | 2090 | pte_t *_pte; |
2091 | int referenced = 0, none = 0; | 2091 | int referenced = 0, none = 0; |
2092 | for (_pte = pte; _pte < pte+HPAGE_PMD_NR; | 2092 | for (_pte = pte; _pte < pte+HPAGE_PMD_NR; |
2093 | _pte++, address += PAGE_SIZE) { | 2093 | _pte++, address += PAGE_SIZE) { |
2094 | pte_t pteval = *_pte; | 2094 | pte_t pteval = *_pte; |
2095 | if (pte_none(pteval)) { | 2095 | if (pte_none(pteval)) { |
2096 | if (++none <= khugepaged_max_ptes_none) | 2096 | if (++none <= khugepaged_max_ptes_none) |
2097 | continue; | 2097 | continue; |
2098 | else | 2098 | else |
2099 | goto out; | 2099 | goto out; |
2100 | } | 2100 | } |
2101 | if (!pte_present(pteval) || !pte_write(pteval)) | 2101 | if (!pte_present(pteval) || !pte_write(pteval)) |
2102 | goto out; | 2102 | goto out; |
2103 | page = vm_normal_page(vma, address, pteval); | 2103 | page = vm_normal_page(vma, address, pteval); |
2104 | if (unlikely(!page)) | 2104 | if (unlikely(!page)) |
2105 | goto out; | 2105 | goto out; |
2106 | 2106 | ||
2107 | VM_BUG_ON(PageCompound(page)); | 2107 | VM_BUG_ON(PageCompound(page)); |
2108 | BUG_ON(!PageAnon(page)); | 2108 | BUG_ON(!PageAnon(page)); |
2109 | VM_BUG_ON(!PageSwapBacked(page)); | 2109 | VM_BUG_ON(!PageSwapBacked(page)); |
2110 | 2110 | ||
2111 | /* cannot use mapcount: can't collapse if there's a gup pin */ | 2111 | /* cannot use mapcount: can't collapse if there's a gup pin */ |
2112 | if (page_count(page) != 1) | 2112 | if (page_count(page) != 1) |
2113 | goto out; | 2113 | goto out; |
2114 | /* | 2114 | /* |
2115 | * We can do it before isolate_lru_page because the | 2115 | * We can do it before isolate_lru_page because the |
2116 | * page can't be freed from under us. NOTE: PG_lock | 2116 | * page can't be freed from under us. NOTE: PG_lock |
2117 | * is needed to serialize against split_huge_page | 2117 | * is needed to serialize against split_huge_page |
2118 | * when invoked from the VM. | 2118 | * when invoked from the VM. |
2119 | */ | 2119 | */ |
2120 | if (!trylock_page(page)) | 2120 | if (!trylock_page(page)) |
2121 | goto out; | 2121 | goto out; |
2122 | /* | 2122 | /* |
2123 | * Isolate the page to avoid collapsing an hugepage | 2123 | * Isolate the page to avoid collapsing an hugepage |
2124 | * currently in use by the VM. | 2124 | * currently in use by the VM. |
2125 | */ | 2125 | */ |
2126 | if (isolate_lru_page(page)) { | 2126 | if (isolate_lru_page(page)) { |
2127 | unlock_page(page); | 2127 | unlock_page(page); |
2128 | goto out; | 2128 | goto out; |
2129 | } | 2129 | } |
2130 | /* 0 stands for page_is_file_cache(page) == false */ | 2130 | /* 0 stands for page_is_file_cache(page) == false */ |
2131 | inc_zone_page_state(page, NR_ISOLATED_ANON + 0); | 2131 | inc_zone_page_state(page, NR_ISOLATED_ANON + 0); |
2132 | VM_BUG_ON(!PageLocked(page)); | 2132 | VM_BUG_ON(!PageLocked(page)); |
2133 | VM_BUG_ON(PageLRU(page)); | 2133 | VM_BUG_ON(PageLRU(page)); |
2134 | 2134 | ||
2135 | /* If there is no mapped pte young don't collapse the page */ | 2135 | /* If there is no mapped pte young don't collapse the page */ |
2136 | if (pte_young(pteval) || PageReferenced(page) || | 2136 | if (pte_young(pteval) || PageReferenced(page) || |
2137 | mmu_notifier_test_young(vma->vm_mm, address)) | 2137 | mmu_notifier_test_young(vma->vm_mm, address)) |
2138 | referenced = 1; | 2138 | referenced = 1; |
2139 | } | 2139 | } |
2140 | if (likely(referenced)) | 2140 | if (likely(referenced)) |
2141 | return 1; | 2141 | return 1; |
2142 | out: | 2142 | out: |
2143 | release_pte_pages(pte, _pte); | 2143 | release_pte_pages(pte, _pte); |
2144 | return 0; | 2144 | return 0; |
2145 | } | 2145 | } |
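The none-pte count checked in __collapse_huge_page_isolate() (and mirrored later in khugepaged_scan_pmd()) caps how much previously unallocated memory a collapse may instantiate. A standalone sketch of that counting follows; treating HPAGE_PMD_NR - 1 as the khugepaged_max_ptes_none value is an assumption about the usual default.

#include <stdbool.h>
#include <stdio.h>

#define HPAGE_PMD_NR  512
#define MAX_PTES_NONE (HPAGE_PMD_NR - 1)

static bool range_collapsible(const bool pte_present[HPAGE_PMD_NR])
{
    int none = 0;
    for (int i = 0; i < HPAGE_PMD_NR; i++)
        if (!pte_present[i] && ++none > MAX_PTES_NONE)
            return false;    /* too much never-touched memory would be allocated */
    return true;
}

int main(void)
{
    bool present[HPAGE_PMD_NR] = { false };
    printf("all ptes empty: %s\n",
           range_collapsible(present) ? "collapse" : "skip");
    present[0] = true;
    printf("one pte populated: %s\n",
           range_collapsible(present) ? "collapse" : "skip");
    return 0;
}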
2146 | 2146 | ||
2147 | static void __collapse_huge_page_copy(pte_t *pte, struct page *page, | 2147 | static void __collapse_huge_page_copy(pte_t *pte, struct page *page, |
2148 | struct vm_area_struct *vma, | 2148 | struct vm_area_struct *vma, |
2149 | unsigned long address, | 2149 | unsigned long address, |
2150 | spinlock_t *ptl) | 2150 | spinlock_t *ptl) |
2151 | { | 2151 | { |
2152 | pte_t *_pte; | 2152 | pte_t *_pte; |
2153 | for (_pte = pte; _pte < pte+HPAGE_PMD_NR; _pte++) { | 2153 | for (_pte = pte; _pte < pte+HPAGE_PMD_NR; _pte++) { |
2154 | pte_t pteval = *_pte; | 2154 | pte_t pteval = *_pte; |
2155 | struct page *src_page; | 2155 | struct page *src_page; |
2156 | 2156 | ||
2157 | if (pte_none(pteval)) { | 2157 | if (pte_none(pteval)) { |
2158 | clear_user_highpage(page, address); | 2158 | clear_user_highpage(page, address); |
2159 | add_mm_counter(vma->vm_mm, MM_ANONPAGES, 1); | 2159 | add_mm_counter(vma->vm_mm, MM_ANONPAGES, 1); |
2160 | } else { | 2160 | } else { |
2161 | src_page = pte_page(pteval); | 2161 | src_page = pte_page(pteval); |
2162 | copy_user_highpage(page, src_page, address, vma); | 2162 | copy_user_highpage(page, src_page, address, vma); |
2163 | VM_BUG_ON(page_mapcount(src_page) != 1); | 2163 | VM_BUG_ON(page_mapcount(src_page) != 1); |
2164 | release_pte_page(src_page); | 2164 | release_pte_page(src_page); |
2165 | /* | 2165 | /* |
2166 | * ptl mostly unnecessary, but preempt has to | 2166 | * ptl mostly unnecessary, but preempt has to |
2167 | * be disabled to update the per-cpu stats | 2167 | * be disabled to update the per-cpu stats |
2168 | * inside page_remove_rmap(). | 2168 | * inside page_remove_rmap(). |
2169 | */ | 2169 | */ |
2170 | spin_lock(ptl); | 2170 | spin_lock(ptl); |
2171 | /* | 2171 | /* |
2172 | * paravirt calls inside pte_clear here are | 2172 | * paravirt calls inside pte_clear here are |
2173 | * superfluous. | 2173 | * superfluous. |
2174 | */ | 2174 | */ |
2175 | pte_clear(vma->vm_mm, address, _pte); | 2175 | pte_clear(vma->vm_mm, address, _pte); |
2176 | page_remove_rmap(src_page); | 2176 | page_remove_rmap(src_page); |
2177 | spin_unlock(ptl); | 2177 | spin_unlock(ptl); |
2178 | free_page_and_swap_cache(src_page); | 2178 | free_page_and_swap_cache(src_page); |
2179 | } | 2179 | } |
2180 | 2180 | ||
2181 | address += PAGE_SIZE; | 2181 | address += PAGE_SIZE; |
2182 | page++; | 2182 | page++; |
2183 | } | 2183 | } |
2184 | } | 2184 | } |
2185 | 2185 | ||
2186 | static void khugepaged_alloc_sleep(void) | 2186 | static void khugepaged_alloc_sleep(void) |
2187 | { | 2187 | { |
2188 | wait_event_freezable_timeout(khugepaged_wait, false, | 2188 | wait_event_freezable_timeout(khugepaged_wait, false, |
2189 | msecs_to_jiffies(khugepaged_alloc_sleep_millisecs)); | 2189 | msecs_to_jiffies(khugepaged_alloc_sleep_millisecs)); |
2190 | } | 2190 | } |
2191 | 2191 | ||
2192 | static int khugepaged_node_load[MAX_NUMNODES]; | 2192 | static int khugepaged_node_load[MAX_NUMNODES]; |
2193 | 2193 | ||
2194 | static bool khugepaged_scan_abort(int nid) | ||
2195 | { | ||
2196 | int i; | ||
2197 | |||
2198 | /* | ||
2199 | * If zone_reclaim_mode is disabled, then no extra effort is made to | ||
2200 | * allocate memory locally. | ||
2201 | */ | ||
2202 | if (!zone_reclaim_mode) | ||
2203 | return false; | ||
2204 | |||
2205 | /* If there is a count for this node already, it must be acceptable */ | ||
2206 | if (khugepaged_node_load[nid]) | ||
2207 | return false; | ||
2208 | |||
2209 | for (i = 0; i < MAX_NUMNODES; i++) { | ||
2210 | if (!khugepaged_node_load[i]) | ||
2211 | continue; | ||
2212 | if (node_distance(nid, i) > RECLAIM_DISTANCE) | ||
2213 | return true; | ||
2214 | } | ||
2215 | return false; | ||
2216 | } | ||
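khugepaged_scan_abort() is the core of this commit: when zone_reclaim_mode is set, a PMD range is abandoned as soon as it mixes pages from nodes further apart than RECLAIM_DISTANCE. The standalone model below reproduces that decision; the distance matrix, node count and RECLAIM_DISTANCE value are invented for illustration, not taken from the kernel.

#include <stdbool.h>
#include <stdio.h>
#include <string.h>

#define MAX_NUMNODES     4
#define RECLAIM_DISTANCE 30

static int zone_reclaim_mode = 1;
static int node_load[MAX_NUMNODES];

/* example SLIT-style distance matrix: nodes 0/1 are close, 2/3 are remote */
static const int distance[MAX_NUMNODES][MAX_NUMNODES] = {
    { 10, 20, 40, 40 },
    { 20, 10, 40, 40 },
    { 40, 40, 10, 20 },
    { 40, 40, 20, 10 },
};

static bool scan_abort(int nid)
{
    if (!zone_reclaim_mode)
        return false;                       /* no locality constraint */
    if (node_load[nid])
        return false;                       /* node already accepted */
    for (int i = 0; i < MAX_NUMNODES; i++) {
        if (!node_load[i])
            continue;
        if (distance[nid][i] > RECLAIM_DISTANCE)
            return true;                    /* too far from a node already seen */
    }
    return false;
}

int main(void)
{
    int pages[] = { 0, 0, 1, 2 };           /* node of each scanned page (example) */
    memset(node_load, 0, sizeof(node_load));
    for (size_t i = 0; i < sizeof(pages) / sizeof(pages[0]); i++) {
        int nid = pages[i];
        if (scan_abort(nid)) {
            printf("abort at page %zu (node %d too distant)\n", i, nid);
            return 0;
        }
        node_load[nid]++;
    }
    printf("range acceptable, proceed to collapse\n");
    return 0;
}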
2217 | |||
2194 | #ifdef CONFIG_NUMA | 2218 | #ifdef CONFIG_NUMA |
2195 | static int khugepaged_find_target_node(void) | 2219 | static int khugepaged_find_target_node(void) |
2196 | { | 2220 | { |
2197 | static int last_khugepaged_target_node = NUMA_NO_NODE; | 2221 | static int last_khugepaged_target_node = NUMA_NO_NODE; |
2198 | int nid, target_node = 0, max_value = 0; | 2222 | int nid, target_node = 0, max_value = 0; |
2199 | 2223 | ||
2200 | /* find first node with max normal pages hit */ | 2224 | /* find first node with max normal pages hit */ |
2201 | for (nid = 0; nid < MAX_NUMNODES; nid++) | 2225 | for (nid = 0; nid < MAX_NUMNODES; nid++) |
2202 | if (khugepaged_node_load[nid] > max_value) { | 2226 | if (khugepaged_node_load[nid] > max_value) { |
2203 | max_value = khugepaged_node_load[nid]; | 2227 | max_value = khugepaged_node_load[nid]; |
2204 | target_node = nid; | 2228 | target_node = nid; |
2205 | } | 2229 | } |
2206 | 2230 | ||
2207 | /* do some balance if several nodes have the same hit record */ | 2231 | /* do some balance if several nodes have the same hit record */ |
2208 | if (target_node <= last_khugepaged_target_node) | 2232 | if (target_node <= last_khugepaged_target_node) |
2209 | for (nid = last_khugepaged_target_node + 1; nid < MAX_NUMNODES; | 2233 | for (nid = last_khugepaged_target_node + 1; nid < MAX_NUMNODES; |
2210 | nid++) | 2234 | nid++) |
2211 | if (max_value == khugepaged_node_load[nid]) { | 2235 | if (max_value == khugepaged_node_load[nid]) { |
2212 | target_node = nid; | 2236 | target_node = nid; |
2213 | break; | 2237 | break; |
2214 | } | 2238 | } |
2215 | 2239 | ||
2216 | last_khugepaged_target_node = target_node; | 2240 | last_khugepaged_target_node = target_node; |
2217 | return target_node; | 2241 | return target_node; |
2218 | } | 2242 | } |
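For reference, khugepaged_find_target_node() picks the node with the most hits and rotates between nodes that tie, so repeated collapses with equal loads do not all land on node 0. A standalone sketch with made-up load values:

#include <stdio.h>

#define MAX_NUMNODES 4
#define NUMA_NO_NODE (-1)

static int find_target_node(const int load[MAX_NUMNODES])
{
    static int last_target = NUMA_NO_NODE;
    int nid, target = 0, max_value = 0;

    /* first node with the maximum hit count */
    for (nid = 0; nid < MAX_NUMNODES; nid++)
        if (load[nid] > max_value) {
            max_value = load[nid];
            target = nid;
        }

    /* on a tie, move forward from the previous choice */
    if (target <= last_target)
        for (nid = last_target + 1; nid < MAX_NUMNODES; nid++)
            if (load[nid] == max_value) {
                target = nid;
                break;
            }

    last_target = target;
    return target;
}

int main(void)
{
    int load[MAX_NUMNODES] = { 256, 256, 0, 0 };   /* two nodes tie (example) */
    printf("first pick: node %d\n", find_target_node(load));   /* node 0 */
    printf("second pick: node %d\n", find_target_node(load));  /* node 1 */
    return 0;
}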
2219 | 2243 | ||
2220 | static bool khugepaged_prealloc_page(struct page **hpage, bool *wait) | 2244 | static bool khugepaged_prealloc_page(struct page **hpage, bool *wait) |
2221 | { | 2245 | { |
2222 | if (IS_ERR(*hpage)) { | 2246 | if (IS_ERR(*hpage)) { |
2223 | if (!*wait) | 2247 | if (!*wait) |
2224 | return false; | 2248 | return false; |
2225 | 2249 | ||
2226 | *wait = false; | 2250 | *wait = false; |
2227 | *hpage = NULL; | 2251 | *hpage = NULL; |
2228 | khugepaged_alloc_sleep(); | 2252 | khugepaged_alloc_sleep(); |
2229 | } else if (*hpage) { | 2253 | } else if (*hpage) { |
2230 | put_page(*hpage); | 2254 | put_page(*hpage); |
2231 | *hpage = NULL; | 2255 | *hpage = NULL; |
2232 | } | 2256 | } |
2233 | 2257 | ||
2234 | return true; | 2258 | return true; |
2235 | } | 2259 | } |
2236 | 2260 | ||
2237 | static struct page | 2261 | static struct page |
2238 | *khugepaged_alloc_page(struct page **hpage, struct mm_struct *mm, | 2262 | *khugepaged_alloc_page(struct page **hpage, struct mm_struct *mm, |
2239 | struct vm_area_struct *vma, unsigned long address, | 2263 | struct vm_area_struct *vma, unsigned long address, |
2240 | int node) | 2264 | int node) |
2241 | { | 2265 | { |
2242 | VM_BUG_ON(*hpage); | 2266 | VM_BUG_ON(*hpage); |
2243 | /* | 2267 | /* |
2244 | * Allocate the page while the vma is still valid and under | 2268 | * Allocate the page while the vma is still valid and under |
2245 | * the mmap_sem read mode so there is no memory allocation | 2269 | * the mmap_sem read mode so there is no memory allocation |
2246 | * later when we take the mmap_sem in write mode. This is more | 2270 | * later when we take the mmap_sem in write mode. This is more |
2247 | * friendly behavior (OTOH it may actually hide bugs) to | 2271 | * friendly behavior (OTOH it may actually hide bugs) to |
2248 | * filesystems in userland with daemons allocating memory in | 2272 | * filesystems in userland with daemons allocating memory in |
2249 | * the userland I/O paths. Allocating memory with the | 2273 | * the userland I/O paths. Allocating memory with the |
2250 | * mmap_sem in read mode is a good idea also to allow greater | 2274 | * mmap_sem in read mode is a good idea also to allow greater |
2251 | * scalability. | 2275 | * scalability. |
2252 | */ | 2276 | */ |
2253 | *hpage = alloc_pages_exact_node(node, alloc_hugepage_gfpmask( | 2277 | *hpage = alloc_pages_exact_node(node, alloc_hugepage_gfpmask( |
2254 | khugepaged_defrag(), __GFP_OTHER_NODE), HPAGE_PMD_ORDER); | 2278 | khugepaged_defrag(), __GFP_OTHER_NODE), HPAGE_PMD_ORDER); |
2255 | /* | 2279 | /* |
2256 | * After allocating the hugepage, release the mmap_sem read lock in | 2280 | * After allocating the hugepage, release the mmap_sem read lock in |
2257 | * preparation for taking it in write mode. | 2281 | * preparation for taking it in write mode. |
2258 | */ | 2282 | */ |
2259 | up_read(&mm->mmap_sem); | 2283 | up_read(&mm->mmap_sem); |
2260 | if (unlikely(!*hpage)) { | 2284 | if (unlikely(!*hpage)) { |
2261 | count_vm_event(THP_COLLAPSE_ALLOC_FAILED); | 2285 | count_vm_event(THP_COLLAPSE_ALLOC_FAILED); |
2262 | *hpage = ERR_PTR(-ENOMEM); | 2286 | *hpage = ERR_PTR(-ENOMEM); |
2263 | return NULL; | 2287 | return NULL; |
2264 | } | 2288 | } |
2265 | 2289 | ||
2266 | count_vm_event(THP_COLLAPSE_ALLOC); | 2290 | count_vm_event(THP_COLLAPSE_ALLOC); |
2267 | return *hpage; | 2291 | return *hpage; |
2268 | } | 2292 | } |
2269 | #else | 2293 | #else |
2270 | static int khugepaged_find_target_node(void) | 2294 | static int khugepaged_find_target_node(void) |
2271 | { | 2295 | { |
2272 | return 0; | 2296 | return 0; |
2273 | } | 2297 | } |
2274 | 2298 | ||
2275 | static inline struct page *alloc_hugepage(int defrag) | 2299 | static inline struct page *alloc_hugepage(int defrag) |
2276 | { | 2300 | { |
2277 | return alloc_pages(alloc_hugepage_gfpmask(defrag, 0), | 2301 | return alloc_pages(alloc_hugepage_gfpmask(defrag, 0), |
2278 | HPAGE_PMD_ORDER); | 2302 | HPAGE_PMD_ORDER); |
2279 | } | 2303 | } |
2280 | 2304 | ||
2281 | static struct page *khugepaged_alloc_hugepage(bool *wait) | 2305 | static struct page *khugepaged_alloc_hugepage(bool *wait) |
2282 | { | 2306 | { |
2283 | struct page *hpage; | 2307 | struct page *hpage; |
2284 | 2308 | ||
2285 | do { | 2309 | do { |
2286 | hpage = alloc_hugepage(khugepaged_defrag()); | 2310 | hpage = alloc_hugepage(khugepaged_defrag()); |
2287 | if (!hpage) { | 2311 | if (!hpage) { |
2288 | count_vm_event(THP_COLLAPSE_ALLOC_FAILED); | 2312 | count_vm_event(THP_COLLAPSE_ALLOC_FAILED); |
2289 | if (!*wait) | 2313 | if (!*wait) |
2290 | return NULL; | 2314 | return NULL; |
2291 | 2315 | ||
2292 | *wait = false; | 2316 | *wait = false; |
2293 | khugepaged_alloc_sleep(); | 2317 | khugepaged_alloc_sleep(); |
2294 | } else | 2318 | } else |
2295 | count_vm_event(THP_COLLAPSE_ALLOC); | 2319 | count_vm_event(THP_COLLAPSE_ALLOC); |
2296 | } while (unlikely(!hpage) && likely(khugepaged_enabled())); | 2320 | } while (unlikely(!hpage) && likely(khugepaged_enabled())); |
2297 | 2321 | ||
2298 | return hpage; | 2322 | return hpage; |
2299 | } | 2323 | } |
2300 | 2324 | ||
2301 | static bool khugepaged_prealloc_page(struct page **hpage, bool *wait) | 2325 | static bool khugepaged_prealloc_page(struct page **hpage, bool *wait) |
2302 | { | 2326 | { |
2303 | if (!*hpage) | 2327 | if (!*hpage) |
2304 | *hpage = khugepaged_alloc_hugepage(wait); | 2328 | *hpage = khugepaged_alloc_hugepage(wait); |
2305 | 2329 | ||
2306 | if (unlikely(!*hpage)) | 2330 | if (unlikely(!*hpage)) |
2307 | return false; | 2331 | return false; |
2308 | 2332 | ||
2309 | return true; | 2333 | return true; |
2310 | } | 2334 | } |
2311 | 2335 | ||
2312 | static struct page | 2336 | static struct page |
2313 | *khugepaged_alloc_page(struct page **hpage, struct mm_struct *mm, | 2337 | *khugepaged_alloc_page(struct page **hpage, struct mm_struct *mm, |
2314 | struct vm_area_struct *vma, unsigned long address, | 2338 | struct vm_area_struct *vma, unsigned long address, |
2315 | int node) | 2339 | int node) |
2316 | { | 2340 | { |
2317 | up_read(&mm->mmap_sem); | 2341 | up_read(&mm->mmap_sem); |
2318 | VM_BUG_ON(!*hpage); | 2342 | VM_BUG_ON(!*hpage); |
2319 | return *hpage; | 2343 | return *hpage; |
2320 | } | 2344 | } |
2321 | #endif | 2345 | #endif |
2322 | 2346 | ||
2323 | static bool hugepage_vma_check(struct vm_area_struct *vma) | 2347 | static bool hugepage_vma_check(struct vm_area_struct *vma) |
2324 | { | 2348 | { |
2325 | if ((!(vma->vm_flags & VM_HUGEPAGE) && !khugepaged_always()) || | 2349 | if ((!(vma->vm_flags & VM_HUGEPAGE) && !khugepaged_always()) || |
2326 | (vma->vm_flags & VM_NOHUGEPAGE)) | 2350 | (vma->vm_flags & VM_NOHUGEPAGE)) |
2327 | return false; | 2351 | return false; |
2328 | 2352 | ||
2329 | if (!vma->anon_vma || vma->vm_ops) | 2353 | if (!vma->anon_vma || vma->vm_ops) |
2330 | return false; | 2354 | return false; |
2331 | if (is_vma_temporary_stack(vma)) | 2355 | if (is_vma_temporary_stack(vma)) |
2332 | return false; | 2356 | return false; |
2333 | VM_BUG_ON(vma->vm_flags & VM_NO_THP); | 2357 | VM_BUG_ON(vma->vm_flags & VM_NO_THP); |
2334 | return true; | 2358 | return true; |
2335 | } | 2359 | } |
2336 | 2360 | ||
2337 | static void collapse_huge_page(struct mm_struct *mm, | 2361 | static void collapse_huge_page(struct mm_struct *mm, |
2338 | unsigned long address, | 2362 | unsigned long address, |
2339 | struct page **hpage, | 2363 | struct page **hpage, |
2340 | struct vm_area_struct *vma, | 2364 | struct vm_area_struct *vma, |
2341 | int node) | 2365 | int node) |
2342 | { | 2366 | { |
2343 | pmd_t *pmd, _pmd; | 2367 | pmd_t *pmd, _pmd; |
2344 | pte_t *pte; | 2368 | pte_t *pte; |
2345 | pgtable_t pgtable; | 2369 | pgtable_t pgtable; |
2346 | struct page *new_page; | 2370 | struct page *new_page; |
2347 | spinlock_t *ptl; | 2371 | spinlock_t *ptl; |
2348 | int isolated; | 2372 | int isolated; |
2349 | unsigned long hstart, hend; | 2373 | unsigned long hstart, hend; |
2350 | unsigned long mmun_start; /* For mmu_notifiers */ | 2374 | unsigned long mmun_start; /* For mmu_notifiers */ |
2351 | unsigned long mmun_end; /* For mmu_notifiers */ | 2375 | unsigned long mmun_end; /* For mmu_notifiers */ |
2352 | 2376 | ||
2353 | VM_BUG_ON(address & ~HPAGE_PMD_MASK); | 2377 | VM_BUG_ON(address & ~HPAGE_PMD_MASK); |
2354 | 2378 | ||
2355 | /* release the mmap_sem read lock. */ | 2379 | /* release the mmap_sem read lock. */ |
2356 | new_page = khugepaged_alloc_page(hpage, mm, vma, address, node); | 2380 | new_page = khugepaged_alloc_page(hpage, mm, vma, address, node); |
2357 | if (!new_page) | 2381 | if (!new_page) |
2358 | return; | 2382 | return; |
2359 | 2383 | ||
2360 | if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) | 2384 | if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) |
2361 | return; | 2385 | return; |
2362 | 2386 | ||
2363 | /* | 2387 | /* |
2364 | * Prevent all access to pagetables with the exception of | 2388 | * Prevent all access to pagetables with the exception of |
2365 | * gup_fast later handled by the ptep_clear_flush and the VM | 2389 | * gup_fast later handled by the ptep_clear_flush and the VM |
2366 | * handled by the anon_vma lock + PG_lock. | 2390 | * handled by the anon_vma lock + PG_lock. |
2367 | */ | 2391 | */ |
2368 | down_write(&mm->mmap_sem); | 2392 | down_write(&mm->mmap_sem); |
2369 | if (unlikely(khugepaged_test_exit(mm))) | 2393 | if (unlikely(khugepaged_test_exit(mm))) |
2370 | goto out; | 2394 | goto out; |
2371 | 2395 | ||
2372 | vma = find_vma(mm, address); | 2396 | vma = find_vma(mm, address); |
2373 | if (!vma) | 2397 | if (!vma) |
2374 | goto out; | 2398 | goto out; |
2375 | hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; | 2399 | hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; |
2376 | hend = vma->vm_end & HPAGE_PMD_MASK; | 2400 | hend = vma->vm_end & HPAGE_PMD_MASK; |
2377 | if (address < hstart || address + HPAGE_PMD_SIZE > hend) | 2401 | if (address < hstart || address + HPAGE_PMD_SIZE > hend) |
2378 | goto out; | 2402 | goto out; |
2379 | if (!hugepage_vma_check(vma)) | 2403 | if (!hugepage_vma_check(vma)) |
2380 | goto out; | 2404 | goto out; |
2381 | pmd = mm_find_pmd(mm, address); | 2405 | pmd = mm_find_pmd(mm, address); |
2382 | if (!pmd) | 2406 | if (!pmd) |
2383 | goto out; | 2407 | goto out; |
2384 | if (pmd_trans_huge(*pmd)) | 2408 | if (pmd_trans_huge(*pmd)) |
2385 | goto out; | 2409 | goto out; |
2386 | 2410 | ||
2387 | anon_vma_lock_write(vma->anon_vma); | 2411 | anon_vma_lock_write(vma->anon_vma); |
2388 | 2412 | ||
2389 | pte = pte_offset_map(pmd, address); | 2413 | pte = pte_offset_map(pmd, address); |
2390 | ptl = pte_lockptr(mm, pmd); | 2414 | ptl = pte_lockptr(mm, pmd); |
2391 | 2415 | ||
2392 | mmun_start = address; | 2416 | mmun_start = address; |
2393 | mmun_end = address + HPAGE_PMD_SIZE; | 2417 | mmun_end = address + HPAGE_PMD_SIZE; |
2394 | mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); | 2418 | mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); |
2395 | spin_lock(&mm->page_table_lock); /* probably unnecessary */ | 2419 | spin_lock(&mm->page_table_lock); /* probably unnecessary */ |
2396 | /* | 2420 | /* |
2397 | * After this gup_fast can't run anymore. This also removes | 2421 | * After this gup_fast can't run anymore. This also removes |
2398 | * any huge TLB entry from the CPU so we won't allow | 2422 | * any huge TLB entry from the CPU so we won't allow |
2399 | * huge and small TLB entries for the same virtual address | 2423 | * huge and small TLB entries for the same virtual address |
2400 | * to avoid the risk of CPU bugs in that area. | 2424 | * to avoid the risk of CPU bugs in that area. |
2401 | */ | 2425 | */ |
2402 | _pmd = pmdp_clear_flush(vma, address, pmd); | 2426 | _pmd = pmdp_clear_flush(vma, address, pmd); |
2403 | spin_unlock(&mm->page_table_lock); | 2427 | spin_unlock(&mm->page_table_lock); |
2404 | mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); | 2428 | mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); |
2405 | 2429 | ||
2406 | spin_lock(ptl); | 2430 | spin_lock(ptl); |
2407 | isolated = __collapse_huge_page_isolate(vma, address, pte); | 2431 | isolated = __collapse_huge_page_isolate(vma, address, pte); |
2408 | spin_unlock(ptl); | 2432 | spin_unlock(ptl); |
2409 | 2433 | ||
2410 | if (unlikely(!isolated)) { | 2434 | if (unlikely(!isolated)) { |
2411 | pte_unmap(pte); | 2435 | pte_unmap(pte); |
2412 | spin_lock(&mm->page_table_lock); | 2436 | spin_lock(&mm->page_table_lock); |
2413 | BUG_ON(!pmd_none(*pmd)); | 2437 | BUG_ON(!pmd_none(*pmd)); |
2414 | /* | 2438 | /* |
2415 | * We can only use set_pmd_at when establishing | 2439 | * We can only use set_pmd_at when establishing |
2416 | * hugepmds and never for establishing regular pmds that | 2440 | * hugepmds and never for establishing regular pmds that |
2417 | * point to regular pagetables. Use pmd_populate for that | 2441 | * point to regular pagetables. Use pmd_populate for that |
2418 | */ | 2442 | */ |
2419 | pmd_populate(mm, pmd, pmd_pgtable(_pmd)); | 2443 | pmd_populate(mm, pmd, pmd_pgtable(_pmd)); |
2420 | spin_unlock(&mm->page_table_lock); | 2444 | spin_unlock(&mm->page_table_lock); |
2421 | anon_vma_unlock_write(vma->anon_vma); | 2445 | anon_vma_unlock_write(vma->anon_vma); |
2422 | goto out; | 2446 | goto out; |
2423 | } | 2447 | } |
2424 | 2448 | ||
2425 | /* | 2449 | /* |
2426 | * All pages are isolated and locked so anon_vma rmap | 2450 | * All pages are isolated and locked so anon_vma rmap |
2427 | * can't run anymore. | 2451 | * can't run anymore. |
2428 | */ | 2452 | */ |
2429 | anon_vma_unlock_write(vma->anon_vma); | 2453 | anon_vma_unlock_write(vma->anon_vma); |
2430 | 2454 | ||
2431 | __collapse_huge_page_copy(pte, new_page, vma, address, ptl); | 2455 | __collapse_huge_page_copy(pte, new_page, vma, address, ptl); |
2432 | pte_unmap(pte); | 2456 | pte_unmap(pte); |
2433 | __SetPageUptodate(new_page); | 2457 | __SetPageUptodate(new_page); |
2434 | pgtable = pmd_pgtable(_pmd); | 2458 | pgtable = pmd_pgtable(_pmd); |
2435 | 2459 | ||
2436 | _pmd = mk_huge_pmd(new_page, vma->vm_page_prot); | 2460 | _pmd = mk_huge_pmd(new_page, vma->vm_page_prot); |
2437 | _pmd = maybe_pmd_mkwrite(pmd_mkdirty(_pmd), vma); | 2461 | _pmd = maybe_pmd_mkwrite(pmd_mkdirty(_pmd), vma); |
2438 | 2462 | ||
2439 | /* | 2463 | /* |
2440 | * spin_lock() below is not the equivalent of smp_wmb(), so | 2464 | * spin_lock() below is not the equivalent of smp_wmb(), so |
2441 | * this is needed to avoid the copy_huge_page writes to become | 2465 | * this is needed to avoid the copy_huge_page writes to become |
2442 | * visible after the set_pmd_at() write. | 2466 | * visible after the set_pmd_at() write. |
2443 | */ | 2467 | */ |
2444 | smp_wmb(); | 2468 | smp_wmb(); |
2445 | 2469 | ||
2446 | spin_lock(&mm->page_table_lock); | 2470 | spin_lock(&mm->page_table_lock); |
2447 | BUG_ON(!pmd_none(*pmd)); | 2471 | BUG_ON(!pmd_none(*pmd)); |
2448 | page_add_new_anon_rmap(new_page, vma, address); | 2472 | page_add_new_anon_rmap(new_page, vma, address); |
2449 | pgtable_trans_huge_deposit(mm, pmd, pgtable); | 2473 | pgtable_trans_huge_deposit(mm, pmd, pgtable); |
2450 | set_pmd_at(mm, address, pmd, _pmd); | 2474 | set_pmd_at(mm, address, pmd, _pmd); |
2451 | update_mmu_cache_pmd(vma, address, pmd); | 2475 | update_mmu_cache_pmd(vma, address, pmd); |
2452 | spin_unlock(&mm->page_table_lock); | 2476 | spin_unlock(&mm->page_table_lock); |
2453 | 2477 | ||
2454 | *hpage = NULL; | 2478 | *hpage = NULL; |
2455 | 2479 | ||
2456 | khugepaged_pages_collapsed++; | 2480 | khugepaged_pages_collapsed++; |
2457 | out_up_write: | 2481 | out_up_write: |
2458 | up_write(&mm->mmap_sem); | 2482 | up_write(&mm->mmap_sem); |
2459 | return; | 2483 | return; |
2460 | 2484 | ||
2461 | out: | 2485 | out: |
2462 | mem_cgroup_uncharge_page(new_page); | 2486 | mem_cgroup_uncharge_page(new_page); |
2463 | goto out_up_write; | 2487 | goto out_up_write; |
2464 | } | 2488 | } |
2465 | 2489 | ||
2466 | static int khugepaged_scan_pmd(struct mm_struct *mm, | 2490 | static int khugepaged_scan_pmd(struct mm_struct *mm, |
2467 | struct vm_area_struct *vma, | 2491 | struct vm_area_struct *vma, |
2468 | unsigned long address, | 2492 | unsigned long address, |
2469 | struct page **hpage) | 2493 | struct page **hpage) |
2470 | { | 2494 | { |
2471 | pmd_t *pmd; | 2495 | pmd_t *pmd; |
2472 | pte_t *pte, *_pte; | 2496 | pte_t *pte, *_pte; |
2473 | int ret = 0, referenced = 0, none = 0; | 2497 | int ret = 0, referenced = 0, none = 0; |
2474 | struct page *page; | 2498 | struct page *page; |
2475 | unsigned long _address; | 2499 | unsigned long _address; |
2476 | spinlock_t *ptl; | 2500 | spinlock_t *ptl; |
2477 | int node = NUMA_NO_NODE; | 2501 | int node = NUMA_NO_NODE; |
2478 | 2502 | ||
2479 | VM_BUG_ON(address & ~HPAGE_PMD_MASK); | 2503 | VM_BUG_ON(address & ~HPAGE_PMD_MASK); |
2480 | 2504 | ||
2481 | pmd = mm_find_pmd(mm, address); | 2505 | pmd = mm_find_pmd(mm, address); |
2482 | if (!pmd) | 2506 | if (!pmd) |
2483 | goto out; | 2507 | goto out; |
2484 | if (pmd_trans_huge(*pmd)) | 2508 | if (pmd_trans_huge(*pmd)) |
2485 | goto out; | 2509 | goto out; |
2486 | 2510 | ||
2487 | memset(khugepaged_node_load, 0, sizeof(khugepaged_node_load)); | 2511 | memset(khugepaged_node_load, 0, sizeof(khugepaged_node_load)); |
2488 | pte = pte_offset_map_lock(mm, pmd, address, &ptl); | 2512 | pte = pte_offset_map_lock(mm, pmd, address, &ptl); |
2489 | for (_address = address, _pte = pte; _pte < pte+HPAGE_PMD_NR; | 2513 | for (_address = address, _pte = pte; _pte < pte+HPAGE_PMD_NR; |
2490 | _pte++, _address += PAGE_SIZE) { | 2514 | _pte++, _address += PAGE_SIZE) { |
2491 | pte_t pteval = *_pte; | 2515 | pte_t pteval = *_pte; |
2492 | if (pte_none(pteval)) { | 2516 | if (pte_none(pteval)) { |
2493 | if (++none <= khugepaged_max_ptes_none) | 2517 | if (++none <= khugepaged_max_ptes_none) |
2494 | continue; | 2518 | continue; |
2495 | else | 2519 | else |
2496 | goto out_unmap; | 2520 | goto out_unmap; |
2497 | } | 2521 | } |
2498 | if (!pte_present(pteval) || !pte_write(pteval)) | 2522 | if (!pte_present(pteval) || !pte_write(pteval)) |
2499 | goto out_unmap; | 2523 | goto out_unmap; |
2500 | page = vm_normal_page(vma, _address, pteval); | 2524 | page = vm_normal_page(vma, _address, pteval); |
2501 | if (unlikely(!page)) | 2525 | if (unlikely(!page)) |
2502 | goto out_unmap; | 2526 | goto out_unmap; |
2503 | /* | 2527 | /* |
2504 | * Record which node the original page is from and save this | 2528 | * Record which node the original page is from and save this |
2505 | * information to khugepaged_node_load[]. | 2529 | * information to khugepaged_node_load[]. |
2506 | * Khugepaged will allocate the hugepage from the node that has the max | 2530 | * Khugepaged will allocate the hugepage from the node that has the max
2507 | * hit record. | 2531 | * hit record. |
2508 | */ | 2532 | */ |
2509 | node = page_to_nid(page); | 2533 | node = page_to_nid(page); |
2534 | if (khugepaged_scan_abort(node)) | ||
2535 | goto out_unmap; | ||
2510 | khugepaged_node_load[node]++; | 2536 | khugepaged_node_load[node]++; |
2511 | VM_BUG_ON(PageCompound(page)); | 2537 | VM_BUG_ON(PageCompound(page)); |
2512 | if (!PageLRU(page) || PageLocked(page) || !PageAnon(page)) | 2538 | if (!PageLRU(page) || PageLocked(page) || !PageAnon(page)) |
2513 | goto out_unmap; | 2539 | goto out_unmap; |
2514 | /* cannot use mapcount: can't collapse if there's a gup pin */ | 2540 | /* cannot use mapcount: can't collapse if there's a gup pin */ |
2515 | if (page_count(page) != 1) | 2541 | if (page_count(page) != 1) |
2516 | goto out_unmap; | 2542 | goto out_unmap; |
2517 | if (pte_young(pteval) || PageReferenced(page) || | 2543 | if (pte_young(pteval) || PageReferenced(page) || |
2518 | mmu_notifier_test_young(vma->vm_mm, address)) | 2544 | mmu_notifier_test_young(vma->vm_mm, address)) |
2519 | referenced = 1; | 2545 | referenced = 1; |
2520 | } | 2546 | } |
2521 | if (referenced) | 2547 | if (referenced) |
2522 | ret = 1; | 2548 | ret = 1; |
2523 | out_unmap: | 2549 | out_unmap: |
2524 | pte_unmap_unlock(pte, ptl); | 2550 | pte_unmap_unlock(pte, ptl); |
2525 | if (ret) { | 2551 | if (ret) { |
2526 | node = khugepaged_find_target_node(); | 2552 | node = khugepaged_find_target_node(); |
2527 | /* collapse_huge_page will return with the mmap_sem released */ | 2553 | /* collapse_huge_page will return with the mmap_sem released */ |
2528 | collapse_huge_page(mm, address, hpage, vma, node); | 2554 | collapse_huge_page(mm, address, hpage, vma, node); |
2529 | } | 2555 | } |
2530 | out: | 2556 | out: |
2531 | return ret; | 2557 | return ret; |
2532 | } | 2558 | } |
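The khugepaged_scan_abort(node) check added in the hunk above calls a helper that this patch introduces earlier in the file, outside the hunk shown here. The sketch below paraphrases its logic and assumes the standard kernel symbols zone_reclaim_mode, node_distance(), RECLAIM_DISTANCE and MAX_NUMNODES; refer to the full diff for the exact definition:

static bool khugepaged_scan_abort(int nid)
{
        int i;

        /*
         * If zone_reclaim_mode is disabled, no extra effort is made to
         * allocate memory locally, so any node is acceptable.
         */
        if (!zone_reclaim_mode)
                return false;

        /* A node that already has a hit count is already acceptable. */
        if (khugepaged_node_load[nid])
                return false;

        /* Abort if nid is too far from any node seen so far in this pmd. */
        for (i = 0; i < MAX_NUMNODES; i++) {
                if (!khugepaged_node_load[i])
                        continue;
                if (node_distance(nid, i) > RECLAIM_DISTANCE)
                        return true;
        }
        return false;
}

When zone_reclaim_mode is 0 the helper bails out immediately and the scan is unaffected; when it is enabled, each node counted in khugepaged_node_load[] has already been checked against the ones counted before it, so khugepaged_find_target_node() (called below once the scan succeeds) can only select an allocation target within RECLAIM_DISTANCE of every node the existing pages came from.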
2533 | 2559 | ||
2534 | static void collect_mm_slot(struct mm_slot *mm_slot) | 2560 | static void collect_mm_slot(struct mm_slot *mm_slot) |
2535 | { | 2561 | { |
2536 | struct mm_struct *mm = mm_slot->mm; | 2562 | struct mm_struct *mm = mm_slot->mm; |
2537 | 2563 | ||
2538 | VM_BUG_ON(NR_CPUS != 1 && !spin_is_locked(&khugepaged_mm_lock)); | 2564 | VM_BUG_ON(NR_CPUS != 1 && !spin_is_locked(&khugepaged_mm_lock)); |
2539 | 2565 | ||
2540 | if (khugepaged_test_exit(mm)) { | 2566 | if (khugepaged_test_exit(mm)) { |
2541 | /* free mm_slot */ | 2567 | /* free mm_slot */ |
2542 | hash_del(&mm_slot->hash); | 2568 | hash_del(&mm_slot->hash); |
2543 | list_del(&mm_slot->mm_node); | 2569 | list_del(&mm_slot->mm_node); |
2544 | 2570 | ||
2545 | /* | 2571 | /* |
2546 | * Not strictly needed because the mm exited already. | 2572 | * Not strictly needed because the mm exited already. |
2547 | * | 2573 | * |
2548 | * clear_bit(MMF_VM_HUGEPAGE, &mm->flags); | 2574 | * clear_bit(MMF_VM_HUGEPAGE, &mm->flags); |
2549 | */ | 2575 | */ |
2550 | 2576 | ||
2551 | /* khugepaged_mm_lock actually not necessary for the below */ | 2577 | /* khugepaged_mm_lock actually not necessary for the below */ |
2552 | free_mm_slot(mm_slot); | 2578 | free_mm_slot(mm_slot); |
2553 | mmdrop(mm); | 2579 | mmdrop(mm); |
2554 | } | 2580 | } |
2555 | } | 2581 | } |
2556 | 2582 | ||
2557 | static unsigned int khugepaged_scan_mm_slot(unsigned int pages, | 2583 | static unsigned int khugepaged_scan_mm_slot(unsigned int pages, |
2558 | struct page **hpage) | 2584 | struct page **hpage) |
2559 | __releases(&khugepaged_mm_lock) | 2585 | __releases(&khugepaged_mm_lock) |
2560 | __acquires(&khugepaged_mm_lock) | 2586 | __acquires(&khugepaged_mm_lock) |
2561 | { | 2587 | { |
2562 | struct mm_slot *mm_slot; | 2588 | struct mm_slot *mm_slot; |
2563 | struct mm_struct *mm; | 2589 | struct mm_struct *mm; |
2564 | struct vm_area_struct *vma; | 2590 | struct vm_area_struct *vma; |
2565 | int progress = 0; | 2591 | int progress = 0; |
2566 | 2592 | ||
2567 | VM_BUG_ON(!pages); | 2593 | VM_BUG_ON(!pages); |
2568 | VM_BUG_ON(NR_CPUS != 1 && !spin_is_locked(&khugepaged_mm_lock)); | 2594 | VM_BUG_ON(NR_CPUS != 1 && !spin_is_locked(&khugepaged_mm_lock)); |
2569 | 2595 | ||
2570 | if (khugepaged_scan.mm_slot) | 2596 | if (khugepaged_scan.mm_slot) |
2571 | mm_slot = khugepaged_scan.mm_slot; | 2597 | mm_slot = khugepaged_scan.mm_slot; |
2572 | else { | 2598 | else { |
2573 | mm_slot = list_entry(khugepaged_scan.mm_head.next, | 2599 | mm_slot = list_entry(khugepaged_scan.mm_head.next, |
2574 | struct mm_slot, mm_node); | 2600 | struct mm_slot, mm_node); |
2575 | khugepaged_scan.address = 0; | 2601 | khugepaged_scan.address = 0; |
2576 | khugepaged_scan.mm_slot = mm_slot; | 2602 | khugepaged_scan.mm_slot = mm_slot; |
2577 | } | 2603 | } |
2578 | spin_unlock(&khugepaged_mm_lock); | 2604 | spin_unlock(&khugepaged_mm_lock); |
2579 | 2605 | ||
2580 | mm = mm_slot->mm; | 2606 | mm = mm_slot->mm; |
2581 | down_read(&mm->mmap_sem); | 2607 | down_read(&mm->mmap_sem); |
2582 | if (unlikely(khugepaged_test_exit(mm))) | 2608 | if (unlikely(khugepaged_test_exit(mm))) |
2583 | vma = NULL; | 2609 | vma = NULL; |
2584 | else | 2610 | else |
2585 | vma = find_vma(mm, khugepaged_scan.address); | 2611 | vma = find_vma(mm, khugepaged_scan.address); |
2586 | 2612 | ||
2587 | progress++; | 2613 | progress++; |
2588 | for (; vma; vma = vma->vm_next) { | 2614 | for (; vma; vma = vma->vm_next) { |
2589 | unsigned long hstart, hend; | 2615 | unsigned long hstart, hend; |
2590 | 2616 | ||
2591 | cond_resched(); | 2617 | cond_resched(); |
2592 | if (unlikely(khugepaged_test_exit(mm))) { | 2618 | if (unlikely(khugepaged_test_exit(mm))) { |
2593 | progress++; | 2619 | progress++; |
2594 | break; | 2620 | break; |
2595 | } | 2621 | } |
2596 | if (!hugepage_vma_check(vma)) { | 2622 | if (!hugepage_vma_check(vma)) { |
2597 | skip: | 2623 | skip: |
2598 | progress++; | 2624 | progress++; |
2599 | continue; | 2625 | continue; |
2600 | } | 2626 | } |
2601 | hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; | 2627 | hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; |
2602 | hend = vma->vm_end & HPAGE_PMD_MASK; | 2628 | hend = vma->vm_end & HPAGE_PMD_MASK; |
2603 | if (hstart >= hend) | 2629 | if (hstart >= hend) |
2604 | goto skip; | 2630 | goto skip; |
2605 | if (khugepaged_scan.address > hend) | 2631 | if (khugepaged_scan.address > hend) |
2606 | goto skip; | 2632 | goto skip; |
2607 | if (khugepaged_scan.address < hstart) | 2633 | if (khugepaged_scan.address < hstart) |
2608 | khugepaged_scan.address = hstart; | 2634 | khugepaged_scan.address = hstart; |
2609 | VM_BUG_ON(khugepaged_scan.address & ~HPAGE_PMD_MASK); | 2635 | VM_BUG_ON(khugepaged_scan.address & ~HPAGE_PMD_MASK); |
2610 | 2636 | ||
2611 | while (khugepaged_scan.address < hend) { | 2637 | while (khugepaged_scan.address < hend) { |
2612 | int ret; | 2638 | int ret; |
2613 | cond_resched(); | 2639 | cond_resched(); |
2614 | if (unlikely(khugepaged_test_exit(mm))) | 2640 | if (unlikely(khugepaged_test_exit(mm))) |
2615 | goto breakouterloop; | 2641 | goto breakouterloop; |
2616 | 2642 | ||
2617 | VM_BUG_ON(khugepaged_scan.address < hstart || | 2643 | VM_BUG_ON(khugepaged_scan.address < hstart || |
2618 | khugepaged_scan.address + HPAGE_PMD_SIZE > | 2644 | khugepaged_scan.address + HPAGE_PMD_SIZE > |
2619 | hend); | 2645 | hend); |
2620 | ret = khugepaged_scan_pmd(mm, vma, | 2646 | ret = khugepaged_scan_pmd(mm, vma, |
2621 | khugepaged_scan.address, | 2647 | khugepaged_scan.address, |
2622 | hpage); | 2648 | hpage); |
2623 | /* move to next address */ | 2649 | /* move to next address */ |
2624 | khugepaged_scan.address += HPAGE_PMD_SIZE; | 2650 | khugepaged_scan.address += HPAGE_PMD_SIZE; |
2625 | progress += HPAGE_PMD_NR; | 2651 | progress += HPAGE_PMD_NR; |
2626 | if (ret) | 2652 | if (ret) |
2627 | /* we released mmap_sem so break loop */ | 2653 | /* we released mmap_sem so break loop */ |
2628 | goto breakouterloop_mmap_sem; | 2654 | goto breakouterloop_mmap_sem; |
2629 | if (progress >= pages) | 2655 | if (progress >= pages) |
2630 | goto breakouterloop; | 2656 | goto breakouterloop; |
2631 | } | 2657 | } |
2632 | } | 2658 | } |
2633 | breakouterloop: | 2659 | breakouterloop: |
2634 | up_read(&mm->mmap_sem); /* exit_mmap will destroy ptes after this */ | 2660 | up_read(&mm->mmap_sem); /* exit_mmap will destroy ptes after this */ |
2635 | breakouterloop_mmap_sem: | 2661 | breakouterloop_mmap_sem: |
2636 | 2662 | ||
2637 | spin_lock(&khugepaged_mm_lock); | 2663 | spin_lock(&khugepaged_mm_lock); |
2638 | VM_BUG_ON(khugepaged_scan.mm_slot != mm_slot); | 2664 | VM_BUG_ON(khugepaged_scan.mm_slot != mm_slot); |
2639 | /* | 2665 | /* |
2640 | * Release the current mm_slot if this mm is about to die, or | 2666 | * Release the current mm_slot if this mm is about to die, or |
2641 | * if we scanned all vmas of this mm. | 2667 | * if we scanned all vmas of this mm. |
2642 | */ | 2668 | */ |
2643 | if (khugepaged_test_exit(mm) || !vma) { | 2669 | if (khugepaged_test_exit(mm) || !vma) { |
2644 | /* | 2670 | /* |
2645 | * Make sure that if mm_users is reaching zero while | 2671 | * Make sure that if mm_users is reaching zero while |
2646 | * khugepaged runs here, khugepaged_exit will find | 2672 | * khugepaged runs here, khugepaged_exit will find |
2647 | * mm_slot not pointing to the exiting mm. | 2673 | * mm_slot not pointing to the exiting mm. |
2648 | */ | 2674 | */ |
2649 | if (mm_slot->mm_node.next != &khugepaged_scan.mm_head) { | 2675 | if (mm_slot->mm_node.next != &khugepaged_scan.mm_head) { |
2650 | khugepaged_scan.mm_slot = list_entry( | 2676 | khugepaged_scan.mm_slot = list_entry( |
2651 | mm_slot->mm_node.next, | 2677 | mm_slot->mm_node.next, |
2652 | struct mm_slot, mm_node); | 2678 | struct mm_slot, mm_node); |
2653 | khugepaged_scan.address = 0; | 2679 | khugepaged_scan.address = 0; |
2654 | } else { | 2680 | } else { |
2655 | khugepaged_scan.mm_slot = NULL; | 2681 | khugepaged_scan.mm_slot = NULL; |
2656 | khugepaged_full_scans++; | 2682 | khugepaged_full_scans++; |
2657 | } | 2683 | } |
2658 | 2684 | ||
2659 | collect_mm_slot(mm_slot); | 2685 | collect_mm_slot(mm_slot); |
2660 | } | 2686 | } |
2661 | 2687 | ||
2662 | return progress; | 2688 | return progress; |
2663 | } | 2689 | } |
2664 | 2690 | ||
2665 | static int khugepaged_has_work(void) | 2691 | static int khugepaged_has_work(void) |
2666 | { | 2692 | { |
2667 | return !list_empty(&khugepaged_scan.mm_head) && | 2693 | return !list_empty(&khugepaged_scan.mm_head) && |
2668 | khugepaged_enabled(); | 2694 | khugepaged_enabled(); |
2669 | } | 2695 | } |
2670 | 2696 | ||
2671 | static int khugepaged_wait_event(void) | 2697 | static int khugepaged_wait_event(void) |
2672 | { | 2698 | { |
2673 | return !list_empty(&khugepaged_scan.mm_head) || | 2699 | return !list_empty(&khugepaged_scan.mm_head) || |
2674 | kthread_should_stop(); | 2700 | kthread_should_stop(); |
2675 | } | 2701 | } |
2676 | 2702 | ||
2677 | static void khugepaged_do_scan(void) | 2703 | static void khugepaged_do_scan(void) |
2678 | { | 2704 | { |
2679 | struct page *hpage = NULL; | 2705 | struct page *hpage = NULL; |
2680 | unsigned int progress = 0, pass_through_head = 0; | 2706 | unsigned int progress = 0, pass_through_head = 0; |
2681 | unsigned int pages = khugepaged_pages_to_scan; | 2707 | unsigned int pages = khugepaged_pages_to_scan; |
2682 | bool wait = true; | 2708 | bool wait = true; |
2683 | 2709 | ||
2684 | barrier(); /* write khugepaged_pages_to_scan to local stack */ | 2710 | barrier(); /* write khugepaged_pages_to_scan to local stack */ |
2685 | 2711 | ||
2686 | while (progress < pages) { | 2712 | while (progress < pages) { |
2687 | if (!khugepaged_prealloc_page(&hpage, &wait)) | 2713 | if (!khugepaged_prealloc_page(&hpage, &wait)) |
2688 | break; | 2714 | break; |
2689 | 2715 | ||
2690 | cond_resched(); | 2716 | cond_resched(); |
2691 | 2717 | ||
2692 | if (unlikely(kthread_should_stop() || freezing(current))) | 2718 | if (unlikely(kthread_should_stop() || freezing(current))) |
2693 | break; | 2719 | break; |
2694 | 2720 | ||
2695 | spin_lock(&khugepaged_mm_lock); | 2721 | spin_lock(&khugepaged_mm_lock); |
2696 | if (!khugepaged_scan.mm_slot) | 2722 | if (!khugepaged_scan.mm_slot) |
2697 | pass_through_head++; | 2723 | pass_through_head++; |
2698 | if (khugepaged_has_work() && | 2724 | if (khugepaged_has_work() && |
2699 | pass_through_head < 2) | 2725 | pass_through_head < 2) |
2700 | progress += khugepaged_scan_mm_slot(pages - progress, | 2726 | progress += khugepaged_scan_mm_slot(pages - progress, |
2701 | &hpage); | 2727 | &hpage); |
2702 | else | 2728 | else |
2703 | progress = pages; | 2729 | progress = pages; |
2704 | spin_unlock(&khugepaged_mm_lock); | 2730 | spin_unlock(&khugepaged_mm_lock); |
2705 | } | 2731 | } |
2706 | 2732 | ||
2707 | if (!IS_ERR_OR_NULL(hpage)) | 2733 | if (!IS_ERR_OR_NULL(hpage)) |
2708 | put_page(hpage); | 2734 | put_page(hpage); |
2709 | } | 2735 | } |
2710 | 2736 | ||
2711 | static void khugepaged_wait_work(void) | 2737 | static void khugepaged_wait_work(void) |
2712 | { | 2738 | { |
2713 | try_to_freeze(); | 2739 | try_to_freeze(); |
2714 | 2740 | ||
2715 | if (khugepaged_has_work()) { | 2741 | if (khugepaged_has_work()) { |
2716 | if (!khugepaged_scan_sleep_millisecs) | 2742 | if (!khugepaged_scan_sleep_millisecs) |
2717 | return; | 2743 | return; |
2718 | 2744 | ||
2719 | wait_event_freezable_timeout(khugepaged_wait, | 2745 | wait_event_freezable_timeout(khugepaged_wait, |
2720 | kthread_should_stop(), | 2746 | kthread_should_stop(), |
2721 | msecs_to_jiffies(khugepaged_scan_sleep_millisecs)); | 2747 | msecs_to_jiffies(khugepaged_scan_sleep_millisecs)); |
2722 | return; | 2748 | return; |
2723 | } | 2749 | } |
2724 | 2750 | ||
2725 | if (khugepaged_enabled()) | 2751 | if (khugepaged_enabled()) |
2726 | wait_event_freezable(khugepaged_wait, khugepaged_wait_event()); | 2752 | wait_event_freezable(khugepaged_wait, khugepaged_wait_event()); |
2727 | } | 2753 | } |
2728 | 2754 | ||
2729 | static int khugepaged(void *none) | 2755 | static int khugepaged(void *none) |
2730 | { | 2756 | { |
2731 | struct mm_slot *mm_slot; | 2757 | struct mm_slot *mm_slot; |
2732 | 2758 | ||
2733 | set_freezable(); | 2759 | set_freezable(); |
2734 | set_user_nice(current, 19); | 2760 | set_user_nice(current, 19); |
2735 | 2761 | ||
2736 | while (!kthread_should_stop()) { | 2762 | while (!kthread_should_stop()) { |
2737 | khugepaged_do_scan(); | 2763 | khugepaged_do_scan(); |
2738 | khugepaged_wait_work(); | 2764 | khugepaged_wait_work(); |
2739 | } | 2765 | } |
2740 | 2766 | ||
2741 | spin_lock(&khugepaged_mm_lock); | 2767 | spin_lock(&khugepaged_mm_lock); |
2742 | mm_slot = khugepaged_scan.mm_slot; | 2768 | mm_slot = khugepaged_scan.mm_slot; |
2743 | khugepaged_scan.mm_slot = NULL; | 2769 | khugepaged_scan.mm_slot = NULL; |
2744 | if (mm_slot) | 2770 | if (mm_slot) |
2745 | collect_mm_slot(mm_slot); | 2771 | collect_mm_slot(mm_slot); |
2746 | spin_unlock(&khugepaged_mm_lock); | 2772 | spin_unlock(&khugepaged_mm_lock); |
2747 | return 0; | 2773 | return 0; |
2748 | } | 2774 | } |
2749 | 2775 | ||
2750 | static void __split_huge_zero_page_pmd(struct vm_area_struct *vma, | 2776 | static void __split_huge_zero_page_pmd(struct vm_area_struct *vma, |
2751 | unsigned long haddr, pmd_t *pmd) | 2777 | unsigned long haddr, pmd_t *pmd) |
2752 | { | 2778 | { |
2753 | struct mm_struct *mm = vma->vm_mm; | 2779 | struct mm_struct *mm = vma->vm_mm; |
2754 | pgtable_t pgtable; | 2780 | pgtable_t pgtable; |
2755 | pmd_t _pmd; | 2781 | pmd_t _pmd; |
2756 | int i; | 2782 | int i; |
2757 | 2783 | ||
2758 | pmdp_clear_flush(vma, haddr, pmd); | 2784 | pmdp_clear_flush(vma, haddr, pmd); |
2759 | /* leave pmd empty until pte is filled */ | 2785 | /* leave pmd empty until pte is filled */ |
2760 | 2786 | ||
2761 | pgtable = pgtable_trans_huge_withdraw(mm, pmd); | 2787 | pgtable = pgtable_trans_huge_withdraw(mm, pmd); |
2762 | pmd_populate(mm, &_pmd, pgtable); | 2788 | pmd_populate(mm, &_pmd, pgtable); |
2763 | 2789 | ||
2764 | for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) { | 2790 | for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) { |
2765 | pte_t *pte, entry; | 2791 | pte_t *pte, entry; |
2766 | entry = pfn_pte(my_zero_pfn(haddr), vma->vm_page_prot); | 2792 | entry = pfn_pte(my_zero_pfn(haddr), vma->vm_page_prot); |
2767 | entry = pte_mkspecial(entry); | 2793 | entry = pte_mkspecial(entry); |
2768 | pte = pte_offset_map(&_pmd, haddr); | 2794 | pte = pte_offset_map(&_pmd, haddr); |
2769 | VM_BUG_ON(!pte_none(*pte)); | 2795 | VM_BUG_ON(!pte_none(*pte)); |
2770 | set_pte_at(mm, haddr, pte, entry); | 2796 | set_pte_at(mm, haddr, pte, entry); |
2771 | pte_unmap(pte); | 2797 | pte_unmap(pte); |
2772 | } | 2798 | } |
2773 | smp_wmb(); /* make pte visible before pmd */ | 2799 | smp_wmb(); /* make pte visible before pmd */ |
2774 | pmd_populate(mm, pmd, pgtable); | 2800 | pmd_populate(mm, pmd, pgtable); |
2775 | put_huge_zero_page(); | 2801 | put_huge_zero_page(); |
2776 | } | 2802 | } |
2777 | 2803 | ||
2778 | void __split_huge_page_pmd(struct vm_area_struct *vma, unsigned long address, | 2804 | void __split_huge_page_pmd(struct vm_area_struct *vma, unsigned long address, |
2779 | pmd_t *pmd) | 2805 | pmd_t *pmd) |
2780 | { | 2806 | { |
2781 | struct page *page; | 2807 | struct page *page; |
2782 | struct mm_struct *mm = vma->vm_mm; | 2808 | struct mm_struct *mm = vma->vm_mm; |
2783 | unsigned long haddr = address & HPAGE_PMD_MASK; | 2809 | unsigned long haddr = address & HPAGE_PMD_MASK; |
2784 | unsigned long mmun_start; /* For mmu_notifiers */ | 2810 | unsigned long mmun_start; /* For mmu_notifiers */ |
2785 | unsigned long mmun_end; /* For mmu_notifiers */ | 2811 | unsigned long mmun_end; /* For mmu_notifiers */ |
2786 | 2812 | ||
2787 | BUG_ON(vma->vm_start > haddr || vma->vm_end < haddr + HPAGE_PMD_SIZE); | 2813 | BUG_ON(vma->vm_start > haddr || vma->vm_end < haddr + HPAGE_PMD_SIZE); |
2788 | 2814 | ||
2789 | mmun_start = haddr; | 2815 | mmun_start = haddr; |
2790 | mmun_end = haddr + HPAGE_PMD_SIZE; | 2816 | mmun_end = haddr + HPAGE_PMD_SIZE; |
2791 | again: | 2817 | again: |
2792 | mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); | 2818 | mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); |
2793 | spin_lock(&mm->page_table_lock); | 2819 | spin_lock(&mm->page_table_lock); |
2794 | if (unlikely(!pmd_trans_huge(*pmd))) { | 2820 | if (unlikely(!pmd_trans_huge(*pmd))) { |
2795 | spin_unlock(&mm->page_table_lock); | 2821 | spin_unlock(&mm->page_table_lock); |
2796 | mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); | 2822 | mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); |
2797 | return; | 2823 | return; |
2798 | } | 2824 | } |
2799 | if (is_huge_zero_pmd(*pmd)) { | 2825 | if (is_huge_zero_pmd(*pmd)) { |
2800 | __split_huge_zero_page_pmd(vma, haddr, pmd); | 2826 | __split_huge_zero_page_pmd(vma, haddr, pmd); |
2801 | spin_unlock(&mm->page_table_lock); | 2827 | spin_unlock(&mm->page_table_lock); |
2802 | mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); | 2828 | mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); |
2803 | return; | 2829 | return; |
2804 | } | 2830 | } |
2805 | page = pmd_page(*pmd); | 2831 | page = pmd_page(*pmd); |
2806 | VM_BUG_ON(!page_count(page)); | 2832 | VM_BUG_ON(!page_count(page)); |
2807 | get_page(page); | 2833 | get_page(page); |
2808 | spin_unlock(&mm->page_table_lock); | 2834 | spin_unlock(&mm->page_table_lock); |
2809 | mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); | 2835 | mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); |
2810 | 2836 | ||
2811 | split_huge_page(page); | 2837 | split_huge_page(page); |
2812 | 2838 | ||
2813 | put_page(page); | 2839 | put_page(page); |
2814 | 2840 | ||
2815 | /* | 2841 | /* |
2816 | * We don't always have down_write of mmap_sem here: a racing | 2842 | * We don't always have down_write of mmap_sem here: a racing |
2817 | * do_huge_pmd_wp_page() might have copied-on-write to another | 2843 | * do_huge_pmd_wp_page() might have copied-on-write to another |
2818 | * huge page before our split_huge_page() got the anon_vma lock. | 2844 | * huge page before our split_huge_page() got the anon_vma lock. |
2819 | */ | 2845 | */ |
2820 | if (unlikely(pmd_trans_huge(*pmd))) | 2846 | if (unlikely(pmd_trans_huge(*pmd))) |
2821 | goto again; | 2847 | goto again; |
2822 | } | 2848 | } |
2823 | 2849 | ||
2824 | void split_huge_page_pmd_mm(struct mm_struct *mm, unsigned long address, | 2850 | void split_huge_page_pmd_mm(struct mm_struct *mm, unsigned long address, |
2825 | pmd_t *pmd) | 2851 | pmd_t *pmd) |
2826 | { | 2852 | { |
2827 | struct vm_area_struct *vma; | 2853 | struct vm_area_struct *vma; |
2828 | 2854 | ||
2829 | vma = find_vma(mm, address); | 2855 | vma = find_vma(mm, address); |
2830 | BUG_ON(vma == NULL); | 2856 | BUG_ON(vma == NULL); |
2831 | split_huge_page_pmd(vma, address, pmd); | 2857 | split_huge_page_pmd(vma, address, pmd); |
2832 | } | 2858 | } |
2833 | 2859 | ||
2834 | static void split_huge_page_address(struct mm_struct *mm, | 2860 | static void split_huge_page_address(struct mm_struct *mm, |
2835 | unsigned long address) | 2861 | unsigned long address) |
2836 | { | 2862 | { |
2837 | pmd_t *pmd; | 2863 | pmd_t *pmd; |
2838 | 2864 | ||
2839 | VM_BUG_ON(!(address & ~HPAGE_PMD_MASK)); | 2865 | VM_BUG_ON(!(address & ~HPAGE_PMD_MASK)); |
2840 | 2866 | ||
2841 | pmd = mm_find_pmd(mm, address); | 2867 | pmd = mm_find_pmd(mm, address); |
2842 | if (!pmd) | 2868 | if (!pmd) |
2843 | return; | 2869 | return; |
2844 | /* | 2870 | /* |
2845 | * Caller holds the mmap_sem write mode, so a huge pmd cannot | 2871 | * Caller holds the mmap_sem write mode, so a huge pmd cannot |
2846 | * materialize from under us. | 2872 | * materialize from under us. |
2847 | */ | 2873 | */ |
2848 | split_huge_page_pmd_mm(mm, address, pmd); | 2874 | split_huge_page_pmd_mm(mm, address, pmd); |
2849 | } | 2875 | } |
2850 | 2876 | ||
2851 | void __vma_adjust_trans_huge(struct vm_area_struct *vma, | 2877 | void __vma_adjust_trans_huge(struct vm_area_struct *vma, |
2852 | unsigned long start, | 2878 | unsigned long start, |
2853 | unsigned long end, | 2879 | unsigned long end, |
2854 | long adjust_next) | 2880 | long adjust_next) |
2855 | { | 2881 | { |
2856 | /* | 2882 | /* |
2857 | * If the new start address isn't hpage aligned and it could | 2883 | * If the new start address isn't hpage aligned and it could |
2858 | * previously contain a hugepage: check if we need to split | 2884 | * previously contain a hugepage: check if we need to split
2859 | * a huge pmd. | 2885 | * a huge pmd.
2860 | */ | 2886 | */ |
2861 | if (start & ~HPAGE_PMD_MASK && | 2887 | if (start & ~HPAGE_PMD_MASK && |
2862 | (start & HPAGE_PMD_MASK) >= vma->vm_start && | 2888 | (start & HPAGE_PMD_MASK) >= vma->vm_start && |
2863 | (start & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= vma->vm_end) | 2889 | (start & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= vma->vm_end) |
2864 | split_huge_page_address(vma->vm_mm, start); | 2890 | split_huge_page_address(vma->vm_mm, start); |
2865 | 2891 | ||
2866 | /* | 2892 | /* |
2867 | * If the new end address isn't hpage aligned and it could | 2893 | * If the new end address isn't hpage aligned and it could |
2868 | * previously contain a hugepage: check if we need to split | 2894 | * previously contain a hugepage: check if we need to split
2869 | * a huge pmd. | 2895 | * a huge pmd.
2870 | */ | 2896 | */ |
2871 | if (end & ~HPAGE_PMD_MASK && | 2897 | if (end & ~HPAGE_PMD_MASK && |
2872 | (end & HPAGE_PMD_MASK) >= vma->vm_start && | 2898 | (end & HPAGE_PMD_MASK) >= vma->vm_start && |
2873 | (end & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= vma->vm_end) | 2899 | (end & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= vma->vm_end) |
2874 | split_huge_page_address(vma->vm_mm, end); | 2900 | split_huge_page_address(vma->vm_mm, end); |
2875 | 2901 | ||
2876 | /* | 2902 | /* |
2877 | * If we're also updating the vma->vm_next->vm_start, if the new | 2903 | * If we're also updating the vma->vm_next->vm_start, if the new |
2878 | * vm_next->vm_start isn't page aligned and it could previously | 2904 | * vm_next->vm_start isn't page aligned and it could previously |
2879 | * contain a hugepage: check if we need to split a huge pmd. | 2905 | * contain a hugepage: check if we need to split a huge pmd.
2880 | */ | 2906 | */ |
2881 | if (adjust_next > 0) { | 2907 | if (adjust_next > 0) { |
2882 | struct vm_area_struct *next = vma->vm_next; | 2908 | struct vm_area_struct *next = vma->vm_next; |
2883 | unsigned long nstart = next->vm_start; | 2909 | unsigned long nstart = next->vm_start; |
2884 | nstart += adjust_next << PAGE_SHIFT; | 2910 | nstart += adjust_next << PAGE_SHIFT; |
2885 | if (nstart & ~HPAGE_PMD_MASK && | 2911 | if (nstart & ~HPAGE_PMD_MASK && |
2886 | (nstart & HPAGE_PMD_MASK) >= next->vm_start && | 2912 | (nstart & HPAGE_PMD_MASK) >= next->vm_start && |
2887 | (nstart & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= next->vm_end) | 2913 | (nstart & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= next->vm_end) |
2888 | split_huge_page_address(next->vm_mm, nstart); | 2914 | split_huge_page_address(next->vm_mm, nstart); |
2889 | } | 2915 | } |
2890 | } | 2916 | } |
2891 | 2917 |