mm/huge_memory.c

/*
 *  Copyright (C) 2009  Red Hat, Inc.
 *
 *  This work is licensed under the terms of the GNU GPL, version 2. See
 *  the COPYING file in the top-level directory.
 */
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/mm.h>
#include <linux/sched.h>
#include <linux/highmem.h>
#include <linux/hugetlb.h>
#include <linux/mmu_notifier.h>
#include <linux/rmap.h>
#include <linux/swap.h>
#include <linux/shrinker.h>
#include <linux/mm_inline.h>
#include <linux/swapops.h>
#include <linux/dax.h>
#include <linux/khugepaged.h>
#include <linux/freezer.h>
#include <linux/pfn_t.h>
#include <linux/mman.h>
#include <linux/memremap.h>
#include <linux/pagemap.h>
#include <linux/debugfs.h>
#include <linux/migrate.h>
#include <linux/hashtable.h>
#include <linux/userfaultfd_k.h>
#include <linux/page_idle.h>
#include <linux/shmem_fs.h>

#include <asm/tlb.h>
#include <asm/pgalloc.h>
#include "internal.h"
/*
 * By default, transparent hugepage support is disabled in order to avoid
 * risking an increase in the memory footprint of applications without a
 * guaranteed benefit. When transparent hugepage support is enabled, it is
 * enabled for all mappings, and khugepaged scans all mappings.
 * Defrag is invoked by khugepaged hugepage allocations and by page faults
 * for all hugepage allocations.
 */
unsigned long transparent_hugepage_flags __read_mostly =
#ifdef CONFIG_TRANSPARENT_HUGEPAGE_ALWAYS
	(1<<TRANSPARENT_HUGEPAGE_FLAG)|
#endif
#ifdef CONFIG_TRANSPARENT_HUGEPAGE_MADVISE
	(1<<TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG)|
#endif
	(1<<TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG)|
	(1<<TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG)|
	(1<<TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);

static struct shrinker deferred_split_shrinker;

static atomic_t huge_zero_refcount;
struct page *huge_zero_page __read_mostly;
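/*
 * Lazily allocate the shared huge zero page and take a reference on it.
 * The refcount is primed to 2 so the page stays alive until the shrinker
 * callbacks further below release the final reference under memory
 * pressure.
 */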

static struct page *get_huge_zero_page(void)
{
	struct page *zero_page;
retry:
	if (likely(atomic_inc_not_zero(&huge_zero_refcount)))
		return READ_ONCE(huge_zero_page);

	zero_page = alloc_pages((GFP_TRANSHUGE | __GFP_ZERO) & ~__GFP_MOVABLE,
			HPAGE_PMD_ORDER);
	if (!zero_page) {
		count_vm_event(THP_ZERO_PAGE_ALLOC_FAILED);
		return NULL;
	}
	count_vm_event(THP_ZERO_PAGE_ALLOC);
	preempt_disable();
	if (cmpxchg(&huge_zero_page, NULL, zero_page)) {
		preempt_enable();
		__free_pages(zero_page, compound_order(zero_page));
		goto retry;
	}

	/* We take additional reference here. It will be put back by shrinker */
	atomic_set(&huge_zero_refcount, 2);
	preempt_enable();
	return READ_ONCE(huge_zero_page);
}
static void put_huge_zero_page(void)
{
	/*
	 * Counter should never go to zero here. Only shrinker can put
	 * last reference.
	 */
	BUG_ON(atomic_dec_and_test(&huge_zero_refcount));
}
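
/*
 * Per-mm users of the huge zero page: the first caller in an mm takes one
 * reference and marks the mm with MMF_HUGE_ZERO_PAGE, so each mm holds at
 * most one reference, dropped again by mm_put_huge_zero_page().
 */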
struct page *mm_get_huge_zero_page(struct mm_struct *mm)
{
	if (test_bit(MMF_HUGE_ZERO_PAGE, &mm->flags))
		return READ_ONCE(huge_zero_page);

	if (!get_huge_zero_page())
		return NULL;

	if (test_and_set_bit(MMF_HUGE_ZERO_PAGE, &mm->flags))
		put_huge_zero_page();

	return READ_ONCE(huge_zero_page);
}

void mm_put_huge_zero_page(struct mm_struct *mm)
{
	if (test_bit(MMF_HUGE_ZERO_PAGE, &mm->flags))
		put_huge_zero_page();
}
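
/*
 * Shrinker callbacks for the huge zero page: report it as reclaimable only
 * while the primed reference is the last one left, and actually free the
 * page when the shrinker scans.
 */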
static unsigned long shrink_huge_zero_page_count(struct shrinker *shrink,
					struct shrink_control *sc)
{
	/* we can free zero page only if last reference remains */
	return atomic_read(&huge_zero_refcount) == 1 ? HPAGE_PMD_NR : 0;
}

static unsigned long shrink_huge_zero_page_scan(struct shrinker *shrink,
				       struct shrink_control *sc)
{
	if (atomic_cmpxchg(&huge_zero_refcount, 1, 0) == 1) {
		struct page *zero_page = xchg(&huge_zero_page, NULL);
		BUG_ON(zero_page == NULL);
		__free_pages(zero_page, compound_order(zero_page));
		return HPAGE_PMD_NR;
	}

	return 0;
}
static struct shrinker huge_zero_page_shrinker = {
	.count_objects = shrink_huge_zero_page_count,
	.scan_objects = shrink_huge_zero_page_scan,
	.seeks = DEFAULT_SEEKS,
};
#ifdef CONFIG_SYSFS
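/*
 * sysfs interface under /sys/kernel/mm/transparent_hugepage/ (the kobject
 * created in hugepage_init_sysfs() below).  The "enabled" and "defrag"
 * attributes accept one of "always", "defer" (defrag only), "madvise" or
 * "never", e.g.:
 *
 *	echo madvise > /sys/kernel/mm/transparent_hugepage/enabled
 *
 * triple_flag_store() parses such a string and updates the corresponding
 * transparent_hugepage_flags bits.
 */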

static ssize_t triple_flag_store(struct kobject *kobj,
				 struct kobj_attribute *attr,
				 const char *buf, size_t count,
				 enum transparent_hugepage_flag enabled,
				 enum transparent_hugepage_flag deferred,
				 enum transparent_hugepage_flag req_madv)
{
	if (!memcmp("defer", buf,
		    min(sizeof("defer")-1, count))) {
		if (enabled == deferred)
			return -EINVAL;
		clear_bit(enabled, &transparent_hugepage_flags);
		clear_bit(req_madv, &transparent_hugepage_flags);
		set_bit(deferred, &transparent_hugepage_flags);
	} else if (!memcmp("always", buf,
		    min(sizeof("always")-1, count))) {
		clear_bit(deferred, &transparent_hugepage_flags);
		clear_bit(req_madv, &transparent_hugepage_flags);
		set_bit(enabled, &transparent_hugepage_flags);
	} else if (!memcmp("madvise", buf,
			   min(sizeof("madvise")-1, count))) {
		clear_bit(enabled, &transparent_hugepage_flags);
		clear_bit(deferred, &transparent_hugepage_flags);
		set_bit(req_madv, &transparent_hugepage_flags);
	} else if (!memcmp("never", buf,
			   min(sizeof("never")-1, count))) {
		clear_bit(enabled, &transparent_hugepage_flags);
		clear_bit(req_madv, &transparent_hugepage_flags);
		clear_bit(deferred, &transparent_hugepage_flags);
	} else
		return -EINVAL;

	return count;
}

static ssize_t enabled_show(struct kobject *kobj,
			    struct kobj_attribute *attr, char *buf)
{
	if (test_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags))
		return sprintf(buf, "[always] madvise never\n");
	else if (test_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags))
		return sprintf(buf, "always [madvise] never\n");
	else
		return sprintf(buf, "always madvise [never]\n");
}

static ssize_t enabled_store(struct kobject *kobj,
			     struct kobj_attribute *attr,
			     const char *buf, size_t count)
{
	ssize_t ret;
	ret = triple_flag_store(kobj, attr, buf, count,
				TRANSPARENT_HUGEPAGE_FLAG,
				TRANSPARENT_HUGEPAGE_FLAG,
				TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG);

	if (ret > 0) {
		int err = start_stop_khugepaged();
		if (err)
			ret = err;
	}

	return ret;
}
static struct kobj_attribute enabled_attr =
	__ATTR(enabled, 0644, enabled_show, enabled_store);
ssize_t single_hugepage_flag_show(struct kobject *kobj,
				struct kobj_attribute *attr, char *buf,
				enum transparent_hugepage_flag flag)
{
	return sprintf(buf, "%d\n",
		       !!test_bit(flag, &transparent_hugepage_flags));
}

ssize_t single_hugepage_flag_store(struct kobject *kobj,
				 struct kobj_attribute *attr,
				 const char *buf, size_t count,
				 enum transparent_hugepage_flag flag)
{
	unsigned long value;
	int ret;

	ret = kstrtoul(buf, 10, &value);
	if (ret < 0)
		return ret;
	if (value > 1)
		return -EINVAL;

	if (value)
		set_bit(flag, &transparent_hugepage_flags);
	else
		clear_bit(flag, &transparent_hugepage_flags);

	return count;
}

/*
 * Currently defrag only disables __GFP_NOWAIT for allocation. A blind
 * __GFP_REPEAT is too aggressive, it's never worth swapping tons of
 * memory just to allocate one more hugepage.
 */
static ssize_t defrag_show(struct kobject *kobj,
			   struct kobj_attribute *attr, char *buf)
{
	if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags))
		return sprintf(buf, "[always] defer madvise never\n");
	if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags))
		return sprintf(buf, "always [defer] madvise never\n");
	else if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags))
		return sprintf(buf, "always defer [madvise] never\n");
	else
		return sprintf(buf, "always defer madvise [never]\n");
}
static ssize_t defrag_store(struct kobject *kobj,
			    struct kobj_attribute *attr,
			    const char *buf, size_t count)
{
	return triple_flag_store(kobj, attr, buf, count,
				 TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG,
				 TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG,
				 TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG);
}
static struct kobj_attribute defrag_attr =
	__ATTR(defrag, 0644, defrag_show, defrag_store);
static ssize_t use_zero_page_show(struct kobject *kobj,
		struct kobj_attribute *attr, char *buf)
{
	return single_hugepage_flag_show(kobj, attr, buf,
				TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);
}
static ssize_t use_zero_page_store(struct kobject *kobj,
		struct kobj_attribute *attr, const char *buf, size_t count)
{
	return single_hugepage_flag_store(kobj, attr, buf, count,
				 TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);
}
static struct kobj_attribute use_zero_page_attr =
	__ATTR(use_zero_page, 0644, use_zero_page_show, use_zero_page_store);
#ifdef CONFIG_DEBUG_VM
static ssize_t debug_cow_show(struct kobject *kobj,
				struct kobj_attribute *attr, char *buf)
{
	return single_hugepage_flag_show(kobj, attr, buf,
				TRANSPARENT_HUGEPAGE_DEBUG_COW_FLAG);
}
static ssize_t debug_cow_store(struct kobject *kobj,
			       struct kobj_attribute *attr,
			       const char *buf, size_t count)
{
	return single_hugepage_flag_store(kobj, attr, buf, count,
				 TRANSPARENT_HUGEPAGE_DEBUG_COW_FLAG);
}
static struct kobj_attribute debug_cow_attr =
	__ATTR(debug_cow, 0644, debug_cow_show, debug_cow_store);
#endif /* CONFIG_DEBUG_VM */

static struct attribute *hugepage_attr[] = {
	&enabled_attr.attr,
	&defrag_attr.attr,
	&use_zero_page_attr.attr,
#if defined(CONFIG_SHMEM) && defined(CONFIG_TRANSPARENT_HUGE_PAGECACHE)
	&shmem_enabled_attr.attr,
#endif
#ifdef CONFIG_DEBUG_VM
	&debug_cow_attr.attr,
#endif
	NULL,
};

static struct attribute_group hugepage_attr_group = {
	.attrs = hugepage_attr,
};
static int __init hugepage_init_sysfs(struct kobject **hugepage_kobj)
{
	int err;
	*hugepage_kobj = kobject_create_and_add("transparent_hugepage", mm_kobj);
	if (unlikely(!*hugepage_kobj)) {
		pr_err("failed to create transparent hugepage kobject\n");
		return -ENOMEM;
	}
	err = sysfs_create_group(*hugepage_kobj, &hugepage_attr_group);
	if (err) {
		pr_err("failed to register transparent hugepage group\n");
		goto delete_obj;
	}
	err = sysfs_create_group(*hugepage_kobj, &khugepaged_attr_group);
	if (err) {
		pr_err("failed to register transparent hugepage group\n");
		goto remove_hp_group;
	}

	return 0;

remove_hp_group:
	sysfs_remove_group(*hugepage_kobj, &hugepage_attr_group);
delete_obj:
	kobject_put(*hugepage_kobj);
	return err;
}

static void __init hugepage_exit_sysfs(struct kobject *hugepage_kobj)
{
	sysfs_remove_group(hugepage_kobj, &khugepaged_attr_group);
	sysfs_remove_group(hugepage_kobj, &hugepage_attr_group);
	kobject_put(hugepage_kobj);
}
#else
static inline int hugepage_init_sysfs(struct kobject **hugepage_kobj)
{
	return 0;
}

static inline void hugepage_exit_sysfs(struct kobject *hugepage_kobj)
{
}
#endif /* CONFIG_SYSFS */

static int __init hugepage_init(void)
{
	int err;
	struct kobject *hugepage_kobj;

	if (!has_transparent_hugepage()) {
		transparent_hugepage_flags = 0;
		return -EINVAL;
	}
	/*
	 * hugepages can't be allocated by the buddy allocator
	 */
	MAYBE_BUILD_BUG_ON(HPAGE_PMD_ORDER >= MAX_ORDER);
	/*
	 * we use page->mapping and page->index in second tail page
	 * as list_head: assuming THP order >= 2
	 */
	MAYBE_BUILD_BUG_ON(HPAGE_PMD_ORDER < 2);
	err = hugepage_init_sysfs(&hugepage_kobj);
	if (err)
		goto err_sysfs;

	err = khugepaged_init();
	if (err)
		goto err_slab;

	err = register_shrinker(&huge_zero_page_shrinker);
	if (err)
		goto err_hzp_shrinker;
	err = register_shrinker(&deferred_split_shrinker);
	if (err)
		goto err_split_shrinker;

	/*
	 * By default disable transparent hugepages on smaller systems,
	 * where the extra memory used could hurt more than TLB overhead
	 * is likely to save.  The admin can still enable it through /sys.
	 */
	if (totalram_pages < (512 << (20 - PAGE_SHIFT))) {
		transparent_hugepage_flags = 0;
		return 0;
	}

	err = start_stop_khugepaged();
	if (err)
		goto err_khugepaged;

	return 0;
err_khugepaged:
	unregister_shrinker(&deferred_split_shrinker);
err_split_shrinker:
	unregister_shrinker(&huge_zero_page_shrinker);
err_hzp_shrinker:
	khugepaged_destroy();
err_slab:
	hugepage_exit_sysfs(hugepage_kobj);
err_sysfs:
	return err;
}
subsys_initcall(hugepage_init);

static int __init setup_transparent_hugepage(char *str)
{
	int ret = 0;
	if (!str)
		goto out;
	if (!strcmp(str, "always")) {
		set_bit(TRANSPARENT_HUGEPAGE_FLAG,
			&transparent_hugepage_flags);
		clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
			  &transparent_hugepage_flags);
		ret = 1;
	} else if (!strcmp(str, "madvise")) {
		clear_bit(TRANSPARENT_HUGEPAGE_FLAG,
			  &transparent_hugepage_flags);
		set_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
			&transparent_hugepage_flags);
		ret = 1;
	} else if (!strcmp(str, "never")) {
		clear_bit(TRANSPARENT_HUGEPAGE_FLAG,
			  &transparent_hugepage_flags);
		clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
			  &transparent_hugepage_flags);
		ret = 1;
	}
out:
	if (!ret)
		pr_warn("transparent_hugepage= cannot parse, ignored\n");
	return ret;
}
__setup("transparent_hugepage=", setup_transparent_hugepage);
pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma)
{
	if (likely(vma->vm_flags & VM_WRITE))
		pmd = pmd_mkwrite(pmd);
	return pmd;
}
static inline struct list_head *page_deferred_list(struct page *page)
{
	/*
	 * ->lru in the tail pages is occupied by compound_head.
	 * Let's use ->mapping + ->index in the second tail page as list_head.
	 */
	return (struct list_head *)&page[2].mapping;
}

void prep_transhuge_page(struct page *page)
{
	/*
	 * we use page->mapping and page->index in second tail page
	 * as list_head: assuming THP order >= 2
	 */

	INIT_LIST_HEAD(page_deferred_list(page));
	set_compound_page_dtor(page, TRANSHUGE_PAGE_DTOR);
}
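
/*
 * thp_get_unmapped_area() tries to return a PMD-size aligned mapping for
 * DAX files so that faults can install huge page mappings: the requested
 * length is padded by one PMD and the returned address is shifted so that
 * it shares its offset within a PMD-sized area with the file offset.
 */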
unsigned long __thp_get_unmapped_area(struct file *filp, unsigned long len,
		loff_t off, unsigned long flags, unsigned long size)
{
	unsigned long addr;
	loff_t off_end = off + len;
	loff_t off_align = round_up(off, size);
	unsigned long len_pad;

	if (off_end <= off_align || (off_end - off_align) < size)
		return 0;

	len_pad = len + size;
	if (len_pad < len || (off + len_pad) < off)
		return 0;

	addr = current->mm->get_unmapped_area(filp, 0, len_pad,
					      off >> PAGE_SHIFT, flags);
	if (IS_ERR_VALUE(addr))
		return 0;

	addr += (off - addr) & (size - 1);
	return addr;
}

unsigned long thp_get_unmapped_area(struct file *filp, unsigned long addr,
		unsigned long len, unsigned long pgoff, unsigned long flags)
{
	loff_t off = (loff_t)pgoff << PAGE_SHIFT;

	if (addr)
		goto out;
	if (!IS_DAX(filp->f_mapping->host) || !IS_ENABLED(CONFIG_FS_DAX_PMD))
		goto out;

	addr = __thp_get_unmapped_area(filp, len, off, flags, PMD_SIZE);
	if (addr)
		return addr;

 out:
	return current->mm->get_unmapped_area(filp, addr, len, pgoff, flags);
}
EXPORT_SYMBOL_GPL(thp_get_unmapped_area);
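
/*
 * Charge, clear and map a freshly allocated huge page at an anonymous fault
 * address.  If the VMA is registered with userfaultfd (VM_UFFD_MISSING),
 * the fault is handed to userland instead of being filled here.
 */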
static int __do_huge_pmd_anonymous_page(struct fault_env *fe, struct page *page,
		gfp_t gfp)
{
	struct vm_area_struct *vma = fe->vma;
	struct mem_cgroup *memcg;
	pgtable_t pgtable;
	unsigned long haddr = fe->address & HPAGE_PMD_MASK;

	VM_BUG_ON_PAGE(!PageCompound(page), page);

	if (mem_cgroup_try_charge(page, vma->vm_mm, gfp, &memcg, true)) {
		put_page(page);
		count_vm_event(THP_FAULT_FALLBACK);
		return VM_FAULT_FALLBACK;
	}

	pgtable = pte_alloc_one(vma->vm_mm, haddr);
	if (unlikely(!pgtable)) {
		mem_cgroup_cancel_charge(page, memcg, true);
		put_page(page);
		return VM_FAULT_OOM;
	}

	clear_huge_page(page, haddr, HPAGE_PMD_NR);
	/*
	 * The memory barrier inside __SetPageUptodate makes sure that
	 * clear_huge_page writes become visible before the set_pmd_at()
	 * write.
	 */
	__SetPageUptodate(page);
	fe->ptl = pmd_lock(vma->vm_mm, fe->pmd);
	if (unlikely(!pmd_none(*fe->pmd))) {
		spin_unlock(fe->ptl);
		mem_cgroup_cancel_charge(page, memcg, true);
		put_page(page);
		pte_free(vma->vm_mm, pgtable);
	} else {
		pmd_t entry;

		/* Deliver the page fault to userland */
		if (userfaultfd_missing(vma)) {
			int ret;
			spin_unlock(fe->ptl);
			mem_cgroup_cancel_charge(page, memcg, true);
			put_page(page);
			pte_free(vma->vm_mm, pgtable);
			ret = handle_userfault(fe, VM_UFFD_MISSING);
			VM_BUG_ON(ret & VM_FAULT_FALLBACK);
			return ret;
		}
		entry = mk_huge_pmd(page, vma->vm_page_prot);
		entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
		page_add_new_anon_rmap(page, vma, haddr, true);
		mem_cgroup_commit_charge(page, memcg, false, true);
		lru_cache_add_active_or_unevictable(page, vma);
		pgtable_trans_huge_deposit(vma->vm_mm, fe->pmd, pgtable);
		set_pmd_at(vma->vm_mm, haddr, fe->pmd, entry);
		add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR);
		atomic_long_inc(&vma->vm_mm->nr_ptes);
		spin_unlock(fe->ptl);
		count_vm_event(THP_FAULT_ALLOC);
	}
	return 0;
}
/*
 * If THP defrag is set to always then directly reclaim/compact as necessary
 * If set to defer then do only background reclaim/compact and defer to khugepaged
 * If set to madvise and the VMA is flagged then directly reclaim/compact
 * When direct reclaim/compact is allowed, don't retry except for flagged VMA's
 */
static inline gfp_t alloc_hugepage_direct_gfpmask(struct vm_area_struct *vma)
{
	bool vma_madvised = !!(vma->vm_flags & VM_HUGEPAGE);

	if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG,
				&transparent_hugepage_flags) && vma_madvised)
		return GFP_TRANSHUGE;
	else if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG,
						&transparent_hugepage_flags))
		return GFP_TRANSHUGE_LIGHT | __GFP_KSWAPD_RECLAIM;
	else if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG,
						&transparent_hugepage_flags))
		return GFP_TRANSHUGE | (vma_madvised ? 0 : __GFP_NORETRY);

	return GFP_TRANSHUGE_LIGHT;
}
/* Caller must hold page table lock. */
static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm,
		struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd,
		struct page *zero_page)
{
	pmd_t entry;
	if (!pmd_none(*pmd))
		return false;
	entry = mk_pmd(zero_page, vma->vm_page_prot);
	entry = pmd_mkhuge(entry);
	if (pgtable)
		pgtable_trans_huge_deposit(mm, pmd, pgtable);
	set_pmd_at(mm, haddr, pmd, entry);
	atomic_long_inc(&mm->nr_ptes);
	return true;
}
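
/*
 * Anonymous huge page fault entry point: for read faults the shared huge
 * zero page is mapped when allowed, otherwise a huge page is allocated and
 * handed to __do_huge_pmd_anonymous_page().
 */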
int do_huge_pmd_anonymous_page(struct fault_env *fe)
{
	struct vm_area_struct *vma = fe->vma;
	gfp_t gfp;
	struct page *page;
	unsigned long haddr = fe->address & HPAGE_PMD_MASK;

	if (haddr < vma->vm_start || haddr + HPAGE_PMD_SIZE > vma->vm_end)
		return VM_FAULT_FALLBACK;
	if (unlikely(anon_vma_prepare(vma)))
		return VM_FAULT_OOM;
	if (unlikely(khugepaged_enter(vma, vma->vm_flags)))
		return VM_FAULT_OOM;
	if (!(fe->flags & FAULT_FLAG_WRITE) &&
			!mm_forbids_zeropage(vma->vm_mm) &&
			transparent_hugepage_use_zero_page()) {
		pgtable_t pgtable;
		struct page *zero_page;
		bool set;
		int ret;
		pgtable = pte_alloc_one(vma->vm_mm, haddr);
		if (unlikely(!pgtable))
			return VM_FAULT_OOM;
		zero_page = mm_get_huge_zero_page(vma->vm_mm);
		if (unlikely(!zero_page)) {
			pte_free(vma->vm_mm, pgtable);
			count_vm_event(THP_FAULT_FALLBACK);
			return VM_FAULT_FALLBACK;
		}
		fe->ptl = pmd_lock(vma->vm_mm, fe->pmd);
		ret = 0;
		set = false;
		if (pmd_none(*fe->pmd)) {
			if (userfaultfd_missing(vma)) {
				spin_unlock(fe->ptl);
				ret = handle_userfault(fe, VM_UFFD_MISSING);
				VM_BUG_ON(ret & VM_FAULT_FALLBACK);
			} else {
				set_huge_zero_page(pgtable, vma->vm_mm, vma,
						   haddr, fe->pmd, zero_page);
				spin_unlock(fe->ptl);
				set = true;
			}
		} else
			spin_unlock(fe->ptl);
		if (!set)
			pte_free(vma->vm_mm, pgtable);
		return ret;
	}
	gfp = alloc_hugepage_direct_gfpmask(vma);
	page = alloc_hugepage_vma(gfp, vma, haddr, HPAGE_PMD_ORDER);
	if (unlikely(!page)) {
		count_vm_event(THP_FAULT_FALLBACK);
		return VM_FAULT_FALLBACK;
	}
	prep_transhuge_page(page);
	return __do_huge_pmd_anonymous_page(fe, page, gfp);
}
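
/*
 * Install a huge pmd for a pfn-based mapping (e.g. DAX): vmf_insert_pfn_pmd()
 * sanity-checks the VMA and pfn, then insert_pfn_pmd() builds and sets the
 * entry under the pmd lock.
 */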
static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
		pmd_t *pmd, pfn_t pfn, pgprot_t prot, bool write)
{
	struct mm_struct *mm = vma->vm_mm;
	pmd_t entry;
	spinlock_t *ptl;

	ptl = pmd_lock(mm, pmd);
	entry = pmd_mkhuge(pfn_t_pmd(pfn, prot));
	if (pfn_t_devmap(pfn))
		entry = pmd_mkdevmap(entry);
	if (write) {
		entry = pmd_mkyoung(pmd_mkdirty(entry));
		entry = maybe_pmd_mkwrite(entry, vma);
	}
	set_pmd_at(mm, addr, pmd, entry);
	update_mmu_cache_pmd(vma, addr, pmd);
	spin_unlock(ptl);
}

int vmf_insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
			pmd_t *pmd, pfn_t pfn, bool write)
{
	pgprot_t pgprot = vma->vm_page_prot;
	/*
	 * If we had pmd_special, we could avoid all these restrictions,
	 * but we need to be consistent with PTEs and architectures that
	 * can't support a 'special' bit.
	 */
	BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)));
	BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) ==
						(VM_PFNMAP|VM_MIXEDMAP));
	BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags));
	BUG_ON(!pfn_t_devmap(pfn));

	if (addr < vma->vm_start || addr >= vma->vm_end)
		return VM_FAULT_SIGBUS;
	if (track_pfn_insert(vma, &pgprot, pfn))
		return VM_FAULT_SIGBUS;
	insert_pfn_pmd(vma, addr, pmd, pfn, pgprot, write);
	return VM_FAULT_NOPAGE;
}
EXPORT_SYMBOL_GPL(vmf_insert_pfn_pmd);
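/*
 * get_user_pages() support for device-mapped (pmd_devmap) huge pages; the
 * caller holds the pmd lock and, with FOLL_GET, takes a page reference that
 * it must manage itself.
 */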

static void touch_pmd(struct vm_area_struct *vma, unsigned long addr,
		pmd_t *pmd)
{
	pmd_t _pmd;

	/*
	 * We should set the dirty bit only for FOLL_WRITE but for now
	 * the dirty bit in the pmd is meaningless.  And if the dirty
	 * bit will become meaningful and we'll only set it with
	 * FOLL_WRITE, an atomic set_bit will be required on the pmd to
	 * set the young bit, instead of the current set_pmd_at.
	 */
	_pmd = pmd_mkyoung(pmd_mkdirty(*pmd));
	if (pmdp_set_access_flags(vma, addr & HPAGE_PMD_MASK,
				pmd, _pmd,  1))
		update_mmu_cache_pmd(vma, addr, pmd);
}

struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr,
		pmd_t *pmd, int flags)
{
	unsigned long pfn = pmd_pfn(*pmd);
	struct mm_struct *mm = vma->vm_mm;
	struct dev_pagemap *pgmap;
	struct page *page;

	assert_spin_locked(pmd_lockptr(mm, pmd));
	/*
	 * When we COW a devmap PMD entry, we split it into PTEs, so we should
	 * not be in this function with `flags & FOLL_COW` set.
	 */
	WARN_ONCE(flags & FOLL_COW, "mm: In follow_devmap_pmd with FOLL_COW set");
	if (flags & FOLL_WRITE && !pmd_write(*pmd))
		return NULL;

	if (pmd_present(*pmd) && pmd_devmap(*pmd))
		/* pass */;
	else
		return NULL;

	if (flags & FOLL_TOUCH)
		touch_pmd(vma, addr, pmd);

	/*
	 * device mapped pages can only be returned if the
	 * caller will manage the page reference count.
	 */
	if (!(flags & FOLL_GET))
		return ERR_PTR(-EEXIST);

	pfn += (addr & ~PMD_MASK) >> PAGE_SHIFT;
	pgmap = get_dev_pagemap(pfn, NULL);
	if (!pgmap)
		return ERR_PTR(-EFAULT);
	page = pfn_to_page(pfn);
	get_page(page);
	put_dev_pagemap(pgmap);

	return page;
}
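
/*
 * Copy a huge pmd at fork(): only anonymous mappings are copied (file
 * mappings are skipped and refilled on fault), the huge zero page just
 * takes another reference, and the mapping is made old and write-protected
 * in both mms.
 */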
int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
		  pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr,
		  struct vm_area_struct *vma)
{
	spinlock_t *dst_ptl, *src_ptl;
	struct page *src_page;
	pmd_t pmd;
	pgtable_t pgtable = NULL;
	int ret = -ENOMEM;

	/* Skip if can be re-fill on fault */
	if (!vma_is_anonymous(vma))
		return 0;

	pgtable = pte_alloc_one(dst_mm, addr);
	if (unlikely(!pgtable))
		goto out;

	dst_ptl = pmd_lock(dst_mm, dst_pmd);
	src_ptl = pmd_lockptr(src_mm, src_pmd);
	spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);

	ret = -EAGAIN;
	pmd = *src_pmd;
	if (unlikely(!pmd_trans_huge(pmd))) {
		pte_free(dst_mm, pgtable);
		goto out_unlock;
	}
	/*
	 * When page table lock is held, the huge zero pmd should not be
	 * under splitting since we don't split the page itself, only pmd to
	 * a page table.
	 */
	if (is_huge_zero_pmd(pmd)) {
		struct page *zero_page;
		/*
		 * get_huge_zero_page() will never allocate a new page here,
		 * since we already have a zero page to copy. It just takes a
		 * reference.
		 */
		zero_page = mm_get_huge_zero_page(dst_mm);
		set_huge_zero_page(pgtable, dst_mm, vma, addr, dst_pmd,
				zero_page);
		ret = 0;
		goto out_unlock;
	}

	src_page = pmd_page(pmd);
	VM_BUG_ON_PAGE(!PageHead(src_page), src_page);
	get_page(src_page);
	page_dup_rmap(src_page, true);
	add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR);
	atomic_long_inc(&dst_mm->nr_ptes);
	pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable);

	pmdp_set_wrprotect(src_mm, addr, src_pmd);
	pmd = pmd_mkold(pmd_wrprotect(pmd));
	set_pmd_at(dst_mm, addr, dst_pmd, pmd);

	ret = 0;
out_unlock:
	spin_unlock(src_ptl);
	spin_unlock(dst_ptl);
out:
	return ret;
}
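
/*
 * A huge pmd was referenced but its access bit was clear: mark it young
 * (and dirty for a write fault) and update the MMU cache if the entry
 * changed.
 */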
void huge_pmd_set_accessed(struct fault_env *fe, pmd_t orig_pmd)
{
	pmd_t entry;
	unsigned long haddr;
	bool write = fe->flags & FAULT_FLAG_WRITE;

	fe->ptl = pmd_lock(fe->vma->vm_mm, fe->pmd);
	if (unlikely(!pmd_same(*fe->pmd, orig_pmd)))
		goto unlock;

	entry = pmd_mkyoung(orig_pmd);
	if (write)
		entry = pmd_mkdirty(entry);
	haddr = fe->address & HPAGE_PMD_MASK;
	if (pmdp_set_access_flags(fe->vma, haddr, fe->pmd, entry, write))
		update_mmu_cache_pmd(fe->vma, fe->address, fe->pmd);

unlock:
	spin_unlock(fe->ptl);
}
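
/*
 * Copy-on-write fallback when a huge page cannot be allocated: copy the
 * faulting THP into HPAGE_PMD_NR order-0 pages and replace the huge pmd
 * with a regular page table mapping them.
 */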
static int do_huge_pmd_wp_page_fallback(struct fault_env *fe, pmd_t orig_pmd,
		struct page *page)
{
	struct vm_area_struct *vma = fe->vma;
	unsigned long haddr = fe->address & HPAGE_PMD_MASK;
	struct mem_cgroup *memcg;
	pgtable_t pgtable;
	pmd_t _pmd;
	int ret = 0, i;
	struct page **pages;
	unsigned long mmun_start;	/* For mmu_notifiers */
	unsigned long mmun_end;		/* For mmu_notifiers */

	pages = kmalloc(sizeof(struct page *) * HPAGE_PMD_NR,
			GFP_KERNEL);
	if (unlikely(!pages)) {
		ret |= VM_FAULT_OOM;
		goto out;
	}

	for (i = 0; i < HPAGE_PMD_NR; i++) {
		pages[i] = alloc_page_vma_node(GFP_HIGHUSER_MOVABLE |
					       __GFP_OTHER_NODE, vma,
					       fe->address, page_to_nid(page));
		if (unlikely(!pages[i] ||
			     mem_cgroup_try_charge(pages[i], vma->vm_mm,
				     GFP_KERNEL, &memcg, false))) {
			if (pages[i])
				put_page(pages[i]);
			while (--i >= 0) {
				memcg = (void *)page_private(pages[i]);
				set_page_private(pages[i], 0);
				mem_cgroup_cancel_charge(pages[i], memcg,
						false);
				put_page(pages[i]);
			}
			kfree(pages);
			ret |= VM_FAULT_OOM;
			goto out;
		}
		set_page_private(pages[i], (unsigned long)memcg);
	}

	for (i = 0; i < HPAGE_PMD_NR; i++) {
		copy_user_highpage(pages[i], page + i,
				   haddr + PAGE_SIZE * i, vma);
		__SetPageUptodate(pages[i]);
		cond_resched();
	}
	mmun_start = haddr;
	mmun_end   = haddr + HPAGE_PMD_SIZE;
	mmu_notifier_invalidate_range_start(vma->vm_mm, mmun_start, mmun_end);

	fe->ptl = pmd_lock(vma->vm_mm, fe->pmd);
	if (unlikely(!pmd_same(*fe->pmd, orig_pmd)))
		goto out_free_pages;
	VM_BUG_ON_PAGE(!PageHead(page), page);

	pmdp_huge_clear_flush_notify(vma, haddr, fe->pmd);
	/* leave pmd empty until pte is filled */
	pgtable = pgtable_trans_huge_withdraw(vma->vm_mm, fe->pmd);
	pmd_populate(vma->vm_mm, &_pmd, pgtable);

	for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
		pte_t entry;
		entry = mk_pte(pages[i], vma->vm_page_prot);
		entry = maybe_mkwrite(pte_mkdirty(entry), vma);
		memcg = (void *)page_private(pages[i]);
		set_page_private(pages[i], 0);
		page_add_new_anon_rmap(pages[i], fe->vma, haddr, false);
		mem_cgroup_commit_charge(pages[i], memcg, false, false);
		lru_cache_add_active_or_unevictable(pages[i], vma);
		fe->pte = pte_offset_map(&_pmd, haddr);
		VM_BUG_ON(!pte_none(*fe->pte));
		set_pte_at(vma->vm_mm, haddr, fe->pte, entry);
		pte_unmap(fe->pte);
	}
	kfree(pages);
	smp_wmb(); /* make pte visible before pmd */
	pmd_populate(vma->vm_mm, fe->pmd, pgtable);
	page_remove_rmap(page, true);
	spin_unlock(fe->ptl);

	mmu_notifier_invalidate_range_end(vma->vm_mm, mmun_start, mmun_end);

	ret |= VM_FAULT_WRITE;
	put_page(page);

out:
	return ret;

out_free_pages:
	spin_unlock(fe->ptl);
	mmu_notifier_invalidate_range_end(vma->vm_mm, mmun_start, mmun_end);
	for (i = 0; i < HPAGE_PMD_NR; i++) {
		memcg = (void *)page_private(pages[i]);
		set_page_private(pages[i], 0);
		mem_cgroup_cancel_charge(pages[i], memcg, false);
		put_page(pages[i]);
	}
	kfree(pages);
	goto out;
}
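
/*
 * Write fault on a huge pmd: reuse the page in place when this mapping is
 * the only user, otherwise allocate a fresh huge page and copy into it,
 * falling back to splitting the pmd if the allocation or charge fails.
 */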
int do_huge_pmd_wp_page(struct fault_env *fe, pmd_t orig_pmd)
{
	struct vm_area_struct *vma = fe->vma;
	struct page *page = NULL, *new_page;
	struct mem_cgroup *memcg;
	unsigned long haddr = fe->address & HPAGE_PMD_MASK;
	unsigned long mmun_start;	/* For mmu_notifiers */
	unsigned long mmun_end;		/* For mmu_notifiers */
	gfp_t huge_gfp;			/* for allocation and charge */
	int ret = 0;

	fe->ptl = pmd_lockptr(vma->vm_mm, fe->pmd);
	VM_BUG_ON_VMA(!vma->anon_vma, vma);
	if (is_huge_zero_pmd(orig_pmd))
		goto alloc;
	spin_lock(fe->ptl);
	if (unlikely(!pmd_same(*fe->pmd, orig_pmd)))
		goto out_unlock;

	page = pmd_page(orig_pmd);
	VM_BUG_ON_PAGE(!PageCompound(page) || !PageHead(page), page);
	/*
	 * We can only reuse the page if nobody else maps the huge page or it's
	 * part.
	 */
	if (page_trans_huge_mapcount(page, NULL) == 1) {
		pmd_t entry;
		entry = pmd_mkyoung(orig_pmd);
		entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
		if (pmdp_set_access_flags(vma, haddr, fe->pmd, entry,  1))
			update_mmu_cache_pmd(vma, fe->address, fe->pmd);
		ret |= VM_FAULT_WRITE;
		goto out_unlock;
	}
	get_page(page);
	spin_unlock(fe->ptl);
alloc:
	if (transparent_hugepage_enabled(vma) &&
	    !transparent_hugepage_debug_cow()) {
		huge_gfp = alloc_hugepage_direct_gfpmask(vma);
		new_page = alloc_hugepage_vma(huge_gfp, vma, haddr, HPAGE_PMD_ORDER);
	} else
		new_page = NULL;
	if (likely(new_page)) {
		prep_transhuge_page(new_page);
	} else {
		if (!page) {
			split_huge_pmd(vma, fe->pmd, fe->address);
			ret |= VM_FAULT_FALLBACK;
		} else {
			ret = do_huge_pmd_wp_page_fallback(fe, orig_pmd, page);
			if (ret & VM_FAULT_OOM) {
				split_huge_pmd(vma, fe->pmd, fe->address);
				ret |= VM_FAULT_FALLBACK;
			}
			put_page(page);
		}
		count_vm_event(THP_FAULT_FALLBACK);
		goto out;
	}
	if (unlikely(mem_cgroup_try_charge(new_page, vma->vm_mm,
					huge_gfp, &memcg, true))) {
		put_page(new_page);
		split_huge_pmd(vma, fe->pmd, fe->address);
		if (page)
			put_page(page);
		ret |= VM_FAULT_FALLBACK;
		count_vm_event(THP_FAULT_FALLBACK);
		goto out;
	}
	count_vm_event(THP_FAULT_ALLOC);
	if (!page)
		clear_huge_page(new_page, haddr, HPAGE_PMD_NR);
	else
		copy_user_huge_page(new_page, page, haddr, vma, HPAGE_PMD_NR);
	__SetPageUptodate(new_page);
	mmun_start = haddr;
	mmun_end   = haddr + HPAGE_PMD_SIZE;
	mmu_notifier_invalidate_range_start(vma->vm_mm, mmun_start, mmun_end);

	spin_lock(fe->ptl);
	if (page)
1060
  		put_page(page);
bae473a42   Kirill A. Shutemov   mm: introduce fau...
1061
1062
  	if (unlikely(!pmd_same(*fe->pmd, orig_pmd))) {
  		spin_unlock(fe->ptl);
f627c2f53   Kirill A. Shutemov   memcg: adjust to ...
1063
  		mem_cgroup_cancel_charge(new_page, memcg, true);
71e3aac07   Andrea Arcangeli   thp: transparent ...
1064
  		put_page(new_page);
2ec74c3ef   Sagi Grimberg   mm: move all mmu ...
1065
  		goto out_mn;
b9bbfbe30   Andrea Arcangeli   thp: memcg huge m...
1066
  	} else {
71e3aac07   Andrea Arcangeli   thp: transparent ...
1067
  		pmd_t entry;
3122359a6   Kirill A. Shutemov   thp: move maybe_p...
1068
1069
  		entry = mk_huge_pmd(new_page, vma->vm_page_prot);
  		entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
bae473a42   Kirill A. Shutemov   mm: introduce fau...
1070
  		pmdp_huge_clear_flush_notify(vma, haddr, fe->pmd);
d281ee614   Kirill A. Shutemov   rmap: add argumen...
1071
  		page_add_new_anon_rmap(new_page, vma, haddr, true);
f627c2f53   Kirill A. Shutemov   memcg: adjust to ...
1072
  		mem_cgroup_commit_charge(new_page, memcg, false, true);
00501b531   Johannes Weiner   mm: memcontrol: r...
1073
  		lru_cache_add_active_or_unevictable(new_page, vma);
bae473a42   Kirill A. Shutemov   mm: introduce fau...
1074
1075
  		set_pmd_at(vma->vm_mm, haddr, fe->pmd, entry);
  		update_mmu_cache_pmd(vma, fe->address, fe->pmd);
eecc1e426   Hugh Dickins   thp: fix copy_pag...
1076
  		if (!page) {
bae473a42   Kirill A. Shutemov   mm: introduce fau...
1077
  			add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR);
97ae17497   Kirill A. Shutemov   thp: implement re...
1078
  		} else {
309381fea   Sasha Levin   mm: dump page whe...
1079
  			VM_BUG_ON_PAGE(!PageHead(page), page);
d281ee614   Kirill A. Shutemov   rmap: add argumen...
1080
  			page_remove_rmap(page, true);
93b4796de   Kirill A. Shutemov   thp: do_huge_pmd_...
1081
1082
  			put_page(page);
  		}
71e3aac07   Andrea Arcangeli   thp: transparent ...
1083
1084
  		ret |= VM_FAULT_WRITE;
  	}
bae473a42   Kirill A. Shutemov   mm: introduce fau...
1085
  	spin_unlock(fe->ptl);
2ec74c3ef   Sagi Grimberg   mm: move all mmu ...
1086
  out_mn:
bae473a42   Kirill A. Shutemov   mm: introduce fau...
1087
  	mmu_notifier_invalidate_range_end(vma->vm_mm, mmun_start, mmun_end);
71e3aac07   Andrea Arcangeli   thp: transparent ...
1088
1089
  out:
  	return ret;
2ec74c3ef   Sagi Grimberg   mm: move all mmu ...
1090
  out_unlock:
bae473a42   Kirill A. Shutemov   mm: introduce fau...
1091
  	spin_unlock(fe->ptl);
2ec74c3ef   Sagi Grimberg   mm: move all mmu ...
1092
  	return ret;
71e3aac07   Andrea Arcangeli   thp: transparent ...
1093
  }
6676aa654   Keno Fischer   mm/huge_memory.c:...
1094
1095
1096
1097
1098
1099
1100
1101
1102
  /*
 * FOLL_FORCE can write to even unwritable pmds, but only
   * after we've gone through a COW cycle and they are dirty.
   */
  static inline bool can_follow_write_pmd(pmd_t pmd, unsigned int flags)
  {
  	return pmd_write(pmd) ||
  	       ((flags & FOLL_FORCE) && (flags & FOLL_COW) && pmd_dirty(pmd));
  }
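/*
 * Illustrative sketch of the other half of this handshake, assuming the gup
 * side (faultin_page() in mm/gup.c) sets FOLL_COW roughly like this after a
 * write fault has broken COW on a VMA that is not VM_WRITE:
 *
 *	if ((ret & VM_FAULT_WRITE) && !(vma->vm_flags & VM_WRITE))
 *		*flags |= FOLL_COW;
 *
 * Only then can a later FOLL_FORCE lookup pass the pmd_dirty() test above
 * instead of looping on the write fault forever. The exact caller and
 * condition are assumptions; only can_follow_write_pmd() comes from this file.
 */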
b676b293f   David Rientjes   mm, thp: fix mapp...
1103
  struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
71e3aac07   Andrea Arcangeli   thp: transparent ...
1104
1105
1106
1107
  				   unsigned long addr,
  				   pmd_t *pmd,
  				   unsigned int flags)
  {
b676b293f   David Rientjes   mm, thp: fix mapp...
1108
  	struct mm_struct *mm = vma->vm_mm;
71e3aac07   Andrea Arcangeli   thp: transparent ...
1109
  	struct page *page = NULL;
c4088ebdc   Kirill A. Shutemov   mm: convert the r...
1110
  	assert_spin_locked(pmd_lockptr(mm, pmd));
71e3aac07   Andrea Arcangeli   thp: transparent ...
1111

6676aa654   Keno Fischer   mm/huge_memory.c:...
1112
  	if (flags & FOLL_WRITE && !can_follow_write_pmd(*pmd, flags))
71e3aac07   Andrea Arcangeli   thp: transparent ...
1113
  		goto out;
85facf257   Kirill A. Shutemov   thp: avoid dumpin...
1114
1115
1116
  	/* Avoid dumping huge zero page */
  	if ((flags & FOLL_DUMP) && is_huge_zero_pmd(*pmd))
  		return ERR_PTR(-EFAULT);
2b4847e73   Mel Gorman   mm: numa: seriali...
1117
  	/* Full NUMA hinting faults to serialise migration in fault paths */
8a0516ed8   Mel Gorman   mm: convert p[te|...
1118
  	if ((flags & FOLL_NUMA) && pmd_protnone(*pmd))
2b4847e73   Mel Gorman   mm: numa: seriali...
1119
  		goto out;
71e3aac07   Andrea Arcangeli   thp: transparent ...
1120
  	page = pmd_page(*pmd);
ca120cf68   Dan Williams   mm: fix show_smap...
1121
  	VM_BUG_ON_PAGE(!PageHead(page) && !is_zone_device_page(page), page);
3565fce3a   Dan Williams   mm, x86: get_user...
1122
1123
  	if (flags & FOLL_TOUCH)
  		touch_pmd(vma, addr, pmd);
de60f5f10   Eric B Munson   mm: introduce VM_...
1124
  	if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) {
e90309c9f   Kirill A. Shutemov   thp: allow mlocke...
1125
1126
1127
1128
  		/*
  		 * We don't mlock() pte-mapped THPs. This way we can avoid
  		 * leaking mlocked pages into non-VM_LOCKED VMAs.
  		 *
9a73f61bd   Kirill A. Shutemov   thp, mlock: do no...
1129
1130
  		 * For anon THP:
  		 *
e90309c9f   Kirill A. Shutemov   thp: allow mlocke...
1131
1132
1133
1134
1135
1136
1137
  		 * In most cases the pmd is the only mapping of the page as we
  		 * break COW for the mlock() -- see gup_flags |= FOLL_WRITE for
  		 * writable private mappings in populate_vma_page_range().
  		 *
		 * The only scenario when we have the page shared here is if we're
		 * mlocking a read-only mapping shared over fork(). We skip
  		 * mlocking such pages.
9a73f61bd   Kirill A. Shutemov   thp, mlock: do no...
1138
1139
1140
1141
1142
1143
  		 *
  		 * For file THP:
  		 *
  		 * We can expect PageDoubleMap() to be stable under page lock:
  		 * for file pages we set it in page_add_file_rmap(), which
  		 * requires page to be locked.
e90309c9f   Kirill A. Shutemov   thp: allow mlocke...
1144
  		 */
9a73f61bd   Kirill A. Shutemov   thp, mlock: do no...
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
  
  		if (PageAnon(page) && compound_mapcount(page) != 1)
  			goto skip_mlock;
  		if (PageDoubleMap(page) || !page->mapping)
  			goto skip_mlock;
  		if (!trylock_page(page))
  			goto skip_mlock;
  		lru_add_drain();
  		if (page->mapping && !PageDoubleMap(page))
  			mlock_vma_page(page);
  		unlock_page(page);
b676b293f   David Rientjes   mm, thp: fix mapp...
1156
  	}
9a73f61bd   Kirill A. Shutemov   thp, mlock: do no...
1157
  skip_mlock:
71e3aac07   Andrea Arcangeli   thp: transparent ...
1158
  	page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT;
ca120cf68   Dan Williams   mm: fix show_smap...
1159
  	VM_BUG_ON_PAGE(!PageCompound(page) && !is_zone_device_page(page), page);
71e3aac07   Andrea Arcangeli   thp: transparent ...
1160
  	if (flags & FOLL_GET)
ddc58f27f   Kirill A. Shutemov   mm: drop tail pag...
1161
  		get_page(page);
71e3aac07   Andrea Arcangeli   thp: transparent ...
1162
1163
1164
1165
  
  out:
  	return page;
  }
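/*
 * Usage sketch for the locking contract asserted above: the caller is
 * expected to hold the pmd spinlock across the lookup, roughly (assumed
 * shape of the follow_page_mask() side, not taken from this file):
 *
 *	ptl = pmd_lock(mm, pmd);
 *	if (pmd_trans_huge(*pmd))
 *		page = follow_trans_huge_pmd(vma, address, pmd, flags);
 *	spin_unlock(ptl);
 */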
d10e63f29   Mel Gorman   mm: numa: Create ...
1166
  /* NUMA hinting page fault entry point for trans huge pmds */
bae473a42   Kirill A. Shutemov   mm: introduce fau...
1167
  int do_huge_pmd_numa_page(struct fault_env *fe, pmd_t pmd)
d10e63f29   Mel Gorman   mm: numa: Create ...
1168
  {
bae473a42   Kirill A. Shutemov   mm: introduce fau...
1169
  	struct vm_area_struct *vma = fe->vma;
b8916634b   Mel Gorman   mm: Prevent paral...
1170
  	struct anon_vma *anon_vma = NULL;
b32967ff1   Mel Gorman   mm: numa: Add THP...
1171
  	struct page *page;
bae473a42   Kirill A. Shutemov   mm: introduce fau...
1172
  	unsigned long haddr = fe->address & HPAGE_PMD_MASK;
8191acbd3   Mel Gorman   mm: numa: Sanitiz...
1173
  	int page_nid = -1, this_nid = numa_node_id();
90572890d   Peter Zijlstra   mm: numa: Change ...
1174
  	int target_nid, last_cpupid = -1;
8191acbd3   Mel Gorman   mm: numa: Sanitiz...
1175
1176
  	bool page_locked;
  	bool migrated = false;
b191f9b10   Mel Gorman   mm: numa: preserv...
1177
  	bool was_writable;
6688cc054   Peter Zijlstra   mm: numa: Do not ...
1178
  	int flags = 0;
d10e63f29   Mel Gorman   mm: numa: Create ...
1179

bae473a42   Kirill A. Shutemov   mm: introduce fau...
1180
1181
  	fe->ptl = pmd_lock(vma->vm_mm, fe->pmd);
  	if (unlikely(!pmd_same(pmd, *fe->pmd)))
d10e63f29   Mel Gorman   mm: numa: Create ...
1182
  		goto out_unlock;
de466bd62   Mel Gorman   mm: numa: avoid u...
1183
1184
1185
1186
1187
  	/*
  	 * If there are potential migrations, wait for completion and retry
  	 * without disrupting NUMA hinting information. Do not relock and
  	 * check_same as the page may no longer be mapped.
  	 */
bae473a42   Kirill A. Shutemov   mm: introduce fau...
1188
1189
1190
  	if (unlikely(pmd_trans_migrating(*fe->pmd))) {
  		page = pmd_page(*fe->pmd);
  		spin_unlock(fe->ptl);
5d8330621   Mel Gorman   mm: numa: do not ...
1191
  		wait_on_page_locked(page);
de466bd62   Mel Gorman   mm: numa: avoid u...
1192
1193
  		goto out;
  	}
d10e63f29   Mel Gorman   mm: numa: Create ...
1194
  	page = pmd_page(pmd);
a1a46184e   Mel Gorman   mm: numa: Do not ...
1195
  	BUG_ON(is_huge_zero_page(page));
8191acbd3   Mel Gorman   mm: numa: Sanitiz...
1196
  	page_nid = page_to_nid(page);
90572890d   Peter Zijlstra   mm: numa: Change ...
1197
  	last_cpupid = page_cpupid_last(page);
03c5a6e16   Mel Gorman   mm: numa: Add pte...
1198
  	count_vm_numa_event(NUMA_HINT_FAULTS);
04bb2f947   Rik van Riel   sched/numa: Adjus...
1199
  	if (page_nid == this_nid) {
03c5a6e16   Mel Gorman   mm: numa: Add pte...
1200
  		count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL);
04bb2f947   Rik van Riel   sched/numa: Adjus...
1201
1202
  		flags |= TNF_FAULT_LOCAL;
  	}
4daae3b4b   Mel Gorman   mm: mempolicy: Us...
1203

bea66fbd1   Mel Gorman   mm: numa: group r...
1204
  	/* See similar comment in do_numa_page for explanation */
d59dc7bcf   Rik van Riel   sched/numa, mm: R...
1205
  	if (!pmd_write(pmd))
6688cc054   Peter Zijlstra   mm: numa: Do not ...
1206
1207
1208
  		flags |= TNF_NO_GROUP;
  
  	/*
ff9042b11   Mel Gorman   mm: Wait for THP ...
1209
1210
1211
  	 * Acquire the page lock to serialise THP migrations but avoid dropping
  	 * page_table_lock if at all possible
  	 */
b8916634b   Mel Gorman   mm: Prevent paral...
1212
1213
1214
1215
  	page_locked = trylock_page(page);
  	target_nid = mpol_misplaced(page, vma, haddr);
  	if (target_nid == -1) {
  		/* If the page was locked, there are no parallel migrations */
a54a407fb   Mel Gorman   mm: Close races b...
1216
  		if (page_locked)
b8916634b   Mel Gorman   mm: Prevent paral...
1217
  			goto clear_pmdnuma;
2b4847e73   Mel Gorman   mm: numa: seriali...
1218
  	}
4daae3b4b   Mel Gorman   mm: mempolicy: Us...
1219

de466bd62   Mel Gorman   mm: numa: avoid u...
1220
  	/* Migration could have started since the pmd_trans_migrating check */
2b4847e73   Mel Gorman   mm: numa: seriali...
1221
  	if (!page_locked) {
bae473a42   Kirill A. Shutemov   mm: introduce fau...
1222
  		spin_unlock(fe->ptl);
b8916634b   Mel Gorman   mm: Prevent paral...
1223
  		wait_on_page_locked(page);
a54a407fb   Mel Gorman   mm: Close races b...
1224
  		page_nid = -1;
b8916634b   Mel Gorman   mm: Prevent paral...
1225
1226
  		goto out;
  	}
2b4847e73   Mel Gorman   mm: numa: seriali...
1227
1228
1229
1230
  	/*
  	 * Page is misplaced. Page lock serialises migrations. Acquire anon_vma
	 * to serialise splits
  	 */
b8916634b   Mel Gorman   mm: Prevent paral...
1231
  	get_page(page);
bae473a42   Kirill A. Shutemov   mm: introduce fau...
1232
  	spin_unlock(fe->ptl);
b8916634b   Mel Gorman   mm: Prevent paral...
1233
  	anon_vma = page_lock_anon_vma_read(page);
4daae3b4b   Mel Gorman   mm: mempolicy: Us...
1234

c69307d53   Peter Zijlstra   sched/numa: Fix c...
1235
  	/* Confirm the PMD did not change while page_table_lock was released */
bae473a42   Kirill A. Shutemov   mm: introduce fau...
1236
1237
  	spin_lock(fe->ptl);
  	if (unlikely(!pmd_same(pmd, *fe->pmd))) {
b32967ff1   Mel Gorman   mm: numa: Add THP...
1238
1239
  		unlock_page(page);
  		put_page(page);
a54a407fb   Mel Gorman   mm: Close races b...
1240
  		page_nid = -1;
4daae3b4b   Mel Gorman   mm: mempolicy: Us...
1241
  		goto out_unlock;
b32967ff1   Mel Gorman   mm: numa: Add THP...
1242
  	}
ff9042b11   Mel Gorman   mm: Wait for THP ...
1243

c3a489cac   Mel Gorman   mm: numa: ensure ...
1244
1245
1246
1247
1248
1249
  	/* Bail if we fail to protect against THP splits for any reason */
  	if (unlikely(!anon_vma)) {
  		put_page(page);
  		page_nid = -1;
  		goto clear_pmdnuma;
  	}
a54a407fb   Mel Gorman   mm: Close races b...
1250
1251
  	/*
  	 * Migrate the THP to the requested node, returns with page unlocked
8a0516ed8   Mel Gorman   mm: convert p[te|...
1252
  	 * and access rights restored.
a54a407fb   Mel Gorman   mm: Close races b...
1253
  	 */
bae473a42   Kirill A. Shutemov   mm: introduce fau...
1254
1255
1256
  	spin_unlock(fe->ptl);
  	migrated = migrate_misplaced_transhuge_page(vma->vm_mm, vma,
  				fe->pmd, pmd, fe->address, page, target_nid);
6688cc054   Peter Zijlstra   mm: numa: Do not ...
1257
1258
  	if (migrated) {
  		flags |= TNF_MIGRATED;
8191acbd3   Mel Gorman   mm: numa: Sanitiz...
1259
  		page_nid = target_nid;
074c23817   Mel Gorman   mm: numa: slow PT...
1260
1261
  	} else
  		flags |= TNF_MIGRATE_FAIL;
b32967ff1   Mel Gorman   mm: numa: Add THP...
1262

8191acbd3   Mel Gorman   mm: numa: Sanitiz...
1263
  	goto out;
b32967ff1   Mel Gorman   mm: numa: Add THP...
1264
  clear_pmdnuma:
a54a407fb   Mel Gorman   mm: Close races b...
1265
  	BUG_ON(!PageLocked(page));
b191f9b10   Mel Gorman   mm: numa: preserv...
1266
  	was_writable = pmd_write(pmd);
4d9424669   Mel Gorman   mm: convert p[te|...
1267
  	pmd = pmd_modify(pmd, vma->vm_page_prot);
b7b04004e   Mel Gorman   mm: numa: mark hu...
1268
  	pmd = pmd_mkyoung(pmd);
b191f9b10   Mel Gorman   mm: numa: preserv...
1269
1270
  	if (was_writable)
  		pmd = pmd_mkwrite(pmd);
bae473a42   Kirill A. Shutemov   mm: introduce fau...
1271
1272
  	set_pmd_at(vma->vm_mm, haddr, fe->pmd, pmd);
  	update_mmu_cache_pmd(vma, fe->address, fe->pmd);
a54a407fb   Mel Gorman   mm: Close races b...
1273
  	unlock_page(page);
d10e63f29   Mel Gorman   mm: numa: Create ...
1274
  out_unlock:
bae473a42   Kirill A. Shutemov   mm: introduce fau...
1275
  	spin_unlock(fe->ptl);
b8916634b   Mel Gorman   mm: Prevent paral...
1276
1277
1278
1279
  
  out:
  	if (anon_vma)
  		page_unlock_anon_vma_read(anon_vma);
8191acbd3   Mel Gorman   mm: numa: Sanitiz...
1280
  	if (page_nid != -1)
bae473a42   Kirill A. Shutemov   mm: introduce fau...
1281
  		task_numa_fault(last_cpupid, page_nid, HPAGE_PMD_NR, fe->flags);
8191acbd3   Mel Gorman   mm: numa: Sanitiz...
1282

d10e63f29   Mel Gorman   mm: numa: Create ...
1283
1284
  	return 0;
  }
319904ad4   Huang Ying   mm, THP: clean up...
1285
1286
1287
1288
1289
  /*
 * Return true if we do MADV_FREE successfully on the entire pmd page.
   * Otherwise, return false.
   */
  bool madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
b8d3c4c30   Minchan Kim   mm/huge_memory.c:...
1290
  		pmd_t *pmd, unsigned long addr, unsigned long next)
b8d3c4c30   Minchan Kim   mm/huge_memory.c:...
1291
1292
1293
1294
1295
  {
  	spinlock_t *ptl;
  	pmd_t orig_pmd;
  	struct page *page;
  	struct mm_struct *mm = tlb->mm;
319904ad4   Huang Ying   mm, THP: clean up...
1296
  	bool ret = false;
b8d3c4c30   Minchan Kim   mm/huge_memory.c:...
1297

b6ec57f4b   Kirill A. Shutemov   thp: change pmd_t...
1298
1299
  	ptl = pmd_trans_huge_lock(pmd, vma);
  	if (!ptl)
25eedabe0   Linus Torvalds   vm: fix incorrect...
1300
  		goto out_unlocked;
b8d3c4c30   Minchan Kim   mm/huge_memory.c:...
1301
1302
  
  	orig_pmd = *pmd;
319904ad4   Huang Ying   mm, THP: clean up...
1303
  	if (is_huge_zero_pmd(orig_pmd))
b8d3c4c30   Minchan Kim   mm/huge_memory.c:...
1304
  		goto out;
b8d3c4c30   Minchan Kim   mm/huge_memory.c:...
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
  
  	page = pmd_page(orig_pmd);
  	/*
	 * If other processes are mapping this page, we can't discard
  	 * the page unless they all do MADV_FREE so let's skip the page.
  	 */
  	if (page_mapcount(page) != 1)
  		goto out;
  
  	if (!trylock_page(page))
  		goto out;
  
  	/*
	 * If the user wants to discard only part of the THP, split it so
	 * MADV_FREE will deactivate just those pages.
  	 */
  	if (next - addr != HPAGE_PMD_SIZE) {
  		get_page(page);
  		spin_unlock(ptl);
9818b8cde   Huang Ying   madvise_free, thp...
1324
  		split_huge_page(page);
b8d3c4c30   Minchan Kim   mm/huge_memory.c:...
1325
1326
  		put_page(page);
  		unlock_page(page);
b8d3c4c30   Minchan Kim   mm/huge_memory.c:...
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
  		goto out_unlocked;
  	}
  
  	if (PageDirty(page))
  		ClearPageDirty(page);
  	unlock_page(page);
  
  	if (PageActive(page))
  		deactivate_page(page);
  
  	if (pmd_young(orig_pmd) || pmd_dirty(orig_pmd)) {
  		orig_pmd = pmdp_huge_get_and_clear_full(tlb->mm, addr, pmd,
  			tlb->fullmm);
  		orig_pmd = pmd_mkold(orig_pmd);
  		orig_pmd = pmd_mkclean(orig_pmd);
  
  		set_pmd_at(mm, addr, pmd, orig_pmd);
  		tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
  	}
319904ad4   Huang Ying   mm, THP: clean up...
1346
  	ret = true;
b8d3c4c30   Minchan Kim   mm/huge_memory.c:...
1347
1348
1349
1350
1351
  out:
  	spin_unlock(ptl);
  out_unlocked:
  	return ret;
  }
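/*
 * Usage sketch (assumed caller): madvise_free_pte_range() in mm/madvise.c is
 * expected to skip the pte walk when the whole pmd page was handled, e.g.:
 *
 *	if (pmd_trans_huge(*pmd))
 *		if (madvise_free_huge_pmd(tlb, vma, pmd, addr, next))
 *			goto next;
 *
 * Only the true/false contract above comes from this file; the caller shown
 * here is an assumption based on that contract.
 */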
71e3aac07   Andrea Arcangeli   thp: transparent ...
1352
  int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
f21760b15   Shaohua Li   thp: add tlb_remo...
1353
  		 pmd_t *pmd, unsigned long addr)
71e3aac07   Andrea Arcangeli   thp: transparent ...
1354
  {
da1467690   Kirill A. Shutemov   thp: fix zap_huge...
1355
  	pmd_t orig_pmd;
bf929152e   Kirill A. Shutemov   mm, thp: change p...
1356
  	spinlock_t *ptl;
71e3aac07   Andrea Arcangeli   thp: transparent ...
1357

b6ec57f4b   Kirill A. Shutemov   thp: change pmd_t...
1358
1359
  	ptl = __pmd_trans_huge_lock(pmd, vma);
  	if (!ptl)
da1467690   Kirill A. Shutemov   thp: fix zap_huge...
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
  		return 0;
  	/*
	 * For architectures like ppc64 we look at the deposited pgtable
	 * when calling pmdp_huge_get_and_clear. So do the
	 * pgtable_trans_huge_withdraw after finishing the pmdp-related
	 * operations.
  	 */
  	orig_pmd = pmdp_huge_get_and_clear_full(tlb->mm, addr, pmd,
  			tlb->fullmm);
  	tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
  	if (vma_is_dax(vma)) {
  		spin_unlock(ptl);
  		if (is_huge_zero_pmd(orig_pmd))
aa88b68c3   Kirill A. Shutemov   thp: keep huge ze...
1373
  			tlb_remove_page(tlb, pmd_page(orig_pmd));
da1467690   Kirill A. Shutemov   thp: fix zap_huge...
1374
1375
1376
1377
  	} else if (is_huge_zero_pmd(orig_pmd)) {
  		pte_free(tlb->mm, pgtable_trans_huge_withdraw(tlb->mm, pmd));
  		atomic_long_dec(&tlb->mm->nr_ptes);
  		spin_unlock(ptl);
aa88b68c3   Kirill A. Shutemov   thp: keep huge ze...
1378
  		tlb_remove_page(tlb, pmd_page(orig_pmd));
da1467690   Kirill A. Shutemov   thp: fix zap_huge...
1379
1380
  	} else {
  		struct page *page = pmd_page(orig_pmd);
d281ee614   Kirill A. Shutemov   rmap: add argumen...
1381
  		page_remove_rmap(page, true);
da1467690   Kirill A. Shutemov   thp: fix zap_huge...
1382
  		VM_BUG_ON_PAGE(page_mapcount(page) < 0, page);
da1467690   Kirill A. Shutemov   thp: fix zap_huge...
1383
  		VM_BUG_ON_PAGE(!PageHead(page), page);
b5072380e   Kirill A. Shutemov   thp: support file...
1384
1385
1386
1387
1388
1389
1390
1391
1392
  		if (PageAnon(page)) {
  			pgtable_t pgtable;
  			pgtable = pgtable_trans_huge_withdraw(tlb->mm, pmd);
  			pte_free(tlb->mm, pgtable);
  			atomic_long_dec(&tlb->mm->nr_ptes);
  			add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR);
  		} else {
  			add_mm_counter(tlb->mm, MM_FILEPAGES, -HPAGE_PMD_NR);
  		}
da1467690   Kirill A. Shutemov   thp: fix zap_huge...
1393
  		spin_unlock(ptl);
e77b0852b   Aneesh Kumar K.V   mm/mmu_gather: tr...
1394
  		tlb_remove_page_size(tlb, page, HPAGE_PMD_SIZE);
025c5b245   Naoya Horiguchi   thp: optimize awa...
1395
  	}
da1467690   Kirill A. Shutemov   thp: fix zap_huge...
1396
  	return 1;
71e3aac07   Andrea Arcangeli   thp: transparent ...
1397
  }
bf8616d5f   Hugh Dickins   huge mm: move_hug...
1398
  bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr,
37a1c49a9   Andrea Arcangeli   thp: mremap suppo...
1399
  		  unsigned long new_addr, unsigned long old_end,
5d1904204   Aaron Lu   mremap: fix race ...
1400
  		  pmd_t *old_pmd, pmd_t *new_pmd, bool *need_flush)
37a1c49a9   Andrea Arcangeli   thp: mremap suppo...
1401
  {
bf929152e   Kirill A. Shutemov   mm, thp: change p...
1402
  	spinlock_t *old_ptl, *new_ptl;
37a1c49a9   Andrea Arcangeli   thp: mremap suppo...
1403
  	pmd_t pmd;
37a1c49a9   Andrea Arcangeli   thp: mremap suppo...
1404
  	struct mm_struct *mm = vma->vm_mm;
5d1904204   Aaron Lu   mremap: fix race ...
1405
  	bool force_flush = false;
37a1c49a9   Andrea Arcangeli   thp: mremap suppo...
1406
1407
1408
  
  	if ((old_addr & ~HPAGE_PMD_MASK) ||
  	    (new_addr & ~HPAGE_PMD_MASK) ||
bf8616d5f   Hugh Dickins   huge mm: move_hug...
1409
  	    old_end - old_addr < HPAGE_PMD_SIZE)
4b471e889   Kirill A. Shutemov   mm, thp: remove i...
1410
  		return false;
37a1c49a9   Andrea Arcangeli   thp: mremap suppo...
1411
1412
1413
1414
1415
1416
1417
  
  	/*
  	 * The destination pmd shouldn't be established, free_pgtables()
	 * should have released it.
  	 */
  	if (WARN_ON(!pmd_none(*new_pmd))) {
  		VM_BUG_ON(pmd_trans_huge(*new_pmd));
4b471e889   Kirill A. Shutemov   mm, thp: remove i...
1418
  		return false;
37a1c49a9   Andrea Arcangeli   thp: mremap suppo...
1419
  	}
bf929152e   Kirill A. Shutemov   mm, thp: change p...
1420
1421
1422
1423
  	/*
  	 * We don't have to worry about the ordering of src and dst
  	 * ptlocks because exclusive mmap_sem prevents deadlock.
  	 */
b6ec57f4b   Kirill A. Shutemov   thp: change pmd_t...
1424
1425
  	old_ptl = __pmd_trans_huge_lock(old_pmd, vma);
  	if (old_ptl) {
bf929152e   Kirill A. Shutemov   mm, thp: change p...
1426
1427
1428
  		new_ptl = pmd_lockptr(mm, new_pmd);
  		if (new_ptl != old_ptl)
  			spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
8809aa2d2   Aneesh Kumar K.V   mm: clarify that ...
1429
  		pmd = pmdp_huge_get_and_clear(mm, old_addr, old_pmd);
a2ce2666a   Aaron Lu   mremap: move_ptes...
1430
1431
  		if (pmd_present(pmd) && pmd_dirty(pmd))
  			force_flush = true;
025c5b245   Naoya Horiguchi   thp: optimize awa...
1432
  		VM_BUG_ON(!pmd_none(*new_pmd));
3592806cf   Kirill A. Shutemov   thp: move preallo...
1433

69a8ec2d8   Kirill A. Shutemov   thp, dax: do not ...
1434
1435
  		if (pmd_move_must_withdraw(new_ptl, old_ptl) &&
  				vma_is_anonymous(vma)) {
b3084f4db   Aneesh Kumar K.V   powerpc/thp: Fix ...
1436
  			pgtable_t pgtable;
3592806cf   Kirill A. Shutemov   thp: move preallo...
1437
1438
  			pgtable = pgtable_trans_huge_withdraw(mm, old_pmd);
  			pgtable_trans_huge_deposit(mm, new_pmd, pgtable);
3592806cf   Kirill A. Shutemov   thp: move preallo...
1439
  		}
b3084f4db   Aneesh Kumar K.V   powerpc/thp: Fix ...
1440
1441
1442
  		set_pmd_at(mm, new_addr, new_pmd, pmd_mksoft_dirty(pmd));
  		if (new_ptl != old_ptl)
  			spin_unlock(new_ptl);
5d1904204   Aaron Lu   mremap: fix race ...
1443
1444
1445
1446
  		if (force_flush)
  			flush_tlb_range(vma, old_addr, old_addr + PMD_SIZE);
  		else
  			*need_flush = true;
bf929152e   Kirill A. Shutemov   mm, thp: change p...
1447
  		spin_unlock(old_ptl);
4b471e889   Kirill A. Shutemov   mm, thp: remove i...
1448
  		return true;
37a1c49a9   Andrea Arcangeli   thp: mremap suppo...
1449
  	}
4b471e889   Kirill A. Shutemov   mm, thp: remove i...
1450
  	return false;
37a1c49a9   Andrea Arcangeli   thp: mremap suppo...
1451
  }
f123d74ab   Mel Gorman   mm: Only flush TL...
1452
1453
1454
1455
1456
1457
  /*
   * Returns
   *  - 0 if PMD could not be locked
 *  - 1 if PMD was locked but protections are unchanged and no TLB flush is needed
 *  - HPAGE_PMD_NR if protections changed and a TLB flush is necessary
   */
cd7548ab3   Johannes Weiner   thp: mprotect: tr...
1458
  int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
e944fd67b   Mel Gorman   mm: numa: do not ...
1459
  		unsigned long addr, pgprot_t newprot, int prot_numa)
cd7548ab3   Johannes Weiner   thp: mprotect: tr...
1460
1461
  {
  	struct mm_struct *mm = vma->vm_mm;
bf929152e   Kirill A. Shutemov   mm, thp: change p...
1462
  	spinlock_t *ptl;
cd7548ab3   Johannes Weiner   thp: mprotect: tr...
1463
  	int ret = 0;
b6ec57f4b   Kirill A. Shutemov   thp: change pmd_t...
1464
1465
  	ptl = __pmd_trans_huge_lock(pmd, vma);
  	if (ptl) {
025c5b245   Naoya Horiguchi   thp: optimize awa...
1466
  		pmd_t entry;
b191f9b10   Mel Gorman   mm: numa: preserv...
1467
  		bool preserve_write = prot_numa && pmd_write(*pmd);
ba68bc011   Mel Gorman   mm: thp: Return t...
1468
  		ret = 1;
e944fd67b   Mel Gorman   mm: numa: do not ...
1469
1470
1471
1472
1473
1474
1475
1476
  
  		/*
  		 * Avoid trapping faults against the zero page. The read-only
  		 * data is likely to be read-cached on the local CPU and
  		 * local/remote hits to the zero page are not interesting.
  		 */
  		if (prot_numa && is_huge_zero_pmd(*pmd)) {
  			spin_unlock(ptl);
ba68bc011   Mel Gorman   mm: thp: Return t...
1477
  			return ret;
e944fd67b   Mel Gorman   mm: numa: do not ...
1478
  		}
10c1045f2   Mel Gorman   mm: numa: avoid u...
1479
  		if (!prot_numa || !pmd_protnone(*pmd)) {
8809aa2d2   Aneesh Kumar K.V   mm: clarify that ...
1480
  			entry = pmdp_huge_get_and_clear_notify(mm, addr, pmd);
10c1045f2   Mel Gorman   mm: numa: avoid u...
1481
  			entry = pmd_modify(entry, newprot);
b191f9b10   Mel Gorman   mm: numa: preserv...
1482
1483
  			if (preserve_write)
  				entry = pmd_mkwrite(entry);
10c1045f2   Mel Gorman   mm: numa: avoid u...
1484
1485
  			ret = HPAGE_PMD_NR;
  			set_pmd_at(mm, addr, pmd, entry);
b237aded4   Kirill A. Shutemov   thp: prepare chan...
1486
1487
  			BUG_ON(vma_is_anonymous(vma) && !preserve_write &&
  					pmd_write(entry));
10c1045f2   Mel Gorman   mm: numa: avoid u...
1488
  		}
bf929152e   Kirill A. Shutemov   mm, thp: change p...
1489
  		spin_unlock(ptl);
025c5b245   Naoya Horiguchi   thp: optimize awa...
1490
1491
1492
1493
1494
1495
  	}
  
  	return ret;
  }
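/*
 * Usage sketch of the 0 / 1 / HPAGE_PMD_NR contract above, as a caller such
 * as change_pmd_range() in mm/mprotect.c is assumed to consume it:
 *
 *	int nr_ptes = change_huge_pmd(vma, pmd, addr, newprot, prot_numa);
 *
 *	if (nr_ptes) {
 *		if (nr_ptes == HPAGE_PMD_NR)
 *			pages += HPAGE_PMD_NR;	(protections changed, flush TLB)
 *		continue;			(huge pmd fully handled)
 *	}
 *	(0: the pmd was not locked, e.g. it was split, so fall back to the pte walk)
 *
 * The caller shape is an assumption; only the return-value contract is from
 * this file.
 */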
  
  /*
8f19b0c05   Huang Ying   thp: fix comments...
1496
   * Returns page table lock pointer if a given pmd maps a thp, NULL otherwise.
025c5b245   Naoya Horiguchi   thp: optimize awa...
1497
   *
8f19b0c05   Huang Ying   thp: fix comments...
1498
1499
 * Note that if it returns the page table lock pointer, this routine returns
 * without unlocking it. The caller must unlock the page table lock.
025c5b245   Naoya Horiguchi   thp: optimize awa...
1500
   */
b6ec57f4b   Kirill A. Shutemov   thp: change pmd_t...
1501
  spinlock_t *__pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma)
025c5b245   Naoya Horiguchi   thp: optimize awa...
1502
  {
b6ec57f4b   Kirill A. Shutemov   thp: change pmd_t...
1503
1504
  	spinlock_t *ptl;
  	ptl = pmd_lock(vma->vm_mm, pmd);
5c7fb56e5   Dan Williams   mm, dax: dax-pmd ...
1505
  	if (likely(pmd_trans_huge(*pmd) || pmd_devmap(*pmd)))
b6ec57f4b   Kirill A. Shutemov   thp: change pmd_t...
1506
1507
1508
  		return ptl;
  	spin_unlock(ptl);
  	return NULL;
cd7548ab3   Johannes Weiner   thp: mprotect: tr...
1509
  }
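/*
 * Usage sketch of the contract above, matching how callers in this file
 * (e.g. zap_huge_pmd() and madvise_free_huge_pmd()) already use it:
 *
 *	ptl = __pmd_trans_huge_lock(pmd, vma);
 *	if (!ptl)
 *		return 0;	(not a huge pmd, fall back to the pte path)
 *	... operate on the now-stable huge pmd ...
 *	spin_unlock(ptl);	(the caller must drop the lock itself)
 */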
eef1b3ba0   Kirill A. Shutemov   thp: implement sp...
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
  static void __split_huge_zero_page_pmd(struct vm_area_struct *vma,
  		unsigned long haddr, pmd_t *pmd)
  {
  	struct mm_struct *mm = vma->vm_mm;
  	pgtable_t pgtable;
  	pmd_t _pmd;
  	int i;
  
  	/* leave pmd empty until pte is filled */
  	pmdp_huge_clear_flush_notify(vma, haddr, pmd);
  
  	pgtable = pgtable_trans_huge_withdraw(mm, pmd);
  	pmd_populate(mm, &_pmd, pgtable);
  
  	for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
  		pte_t *pte, entry;
  		entry = pfn_pte(my_zero_pfn(haddr), vma->vm_page_prot);
  		entry = pte_mkspecial(entry);
  		pte = pte_offset_map(&_pmd, haddr);
  		VM_BUG_ON(!pte_none(*pte));
  		set_pte_at(mm, haddr, pte, entry);
  		pte_unmap(pte);
  	}
  	smp_wmb(); /* make pte visible before pmd */
  	pmd_populate(mm, pmd, pgtable);
eef1b3ba0   Kirill A. Shutemov   thp: implement sp...
1535
1536
1537
  }
  
  static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
ba9882808   Kirill A. Shutemov   thp: add option t...
1538
  		unsigned long haddr, bool freeze)
eef1b3ba0   Kirill A. Shutemov   thp: implement sp...
1539
1540
1541
1542
1543
  {
  	struct mm_struct *mm = vma->vm_mm;
  	struct page *page;
  	pgtable_t pgtable;
  	pmd_t _pmd;
804dd1504   Andrea Arcangeli   soft_dirty: fix s...
1544
  	bool young, write, dirty, soft_dirty;
2ac015e29   Kirill A. Shutemov   thp: call pmdp_in...
1545
  	unsigned long addr;
eef1b3ba0   Kirill A. Shutemov   thp: implement sp...
1546
1547
1548
1549
1550
  	int i;
  
  	VM_BUG_ON(haddr & ~HPAGE_PMD_MASK);
  	VM_BUG_ON_VMA(vma->vm_start > haddr, vma);
  	VM_BUG_ON_VMA(vma->vm_end < haddr + HPAGE_PMD_SIZE, vma);
5c7fb56e5   Dan Williams   mm, dax: dax-pmd ...
1551
  	VM_BUG_ON(!pmd_trans_huge(*pmd) && !pmd_devmap(*pmd));
eef1b3ba0   Kirill A. Shutemov   thp: implement sp...
1552
1553
  
  	count_vm_event(THP_SPLIT_PMD);
d21b9e57c   Kirill A. Shutemov   thp: handle file ...
1554
1555
  	if (!vma_is_anonymous(vma)) {
  		_pmd = pmdp_huge_clear_flush_notify(vma, haddr, pmd);
d21b9e57c   Kirill A. Shutemov   thp: handle file ...
1556
1557
1558
1559
1560
1561
1562
1563
  		if (vma_is_dax(vma))
  			return;
  		page = pmd_page(_pmd);
  		if (!PageReferenced(page) && pmd_young(_pmd))
  			SetPageReferenced(page);
  		page_remove_rmap(page, true);
  		put_page(page);
  		add_mm_counter(mm, MM_FILEPAGES, -HPAGE_PMD_NR);
eef1b3ba0   Kirill A. Shutemov   thp: implement sp...
1564
1565
1566
1567
1568
1569
1570
  		return;
  	} else if (is_huge_zero_pmd(*pmd)) {
  		return __split_huge_zero_page_pmd(vma, haddr, pmd);
  	}
  
  	page = pmd_page(*pmd);
  	VM_BUG_ON_PAGE(!page_count(page), page);
fe896d187   Joonsoo Kim   mm: introduce pag...
1571
  	page_ref_add(page, HPAGE_PMD_NR - 1);
eef1b3ba0   Kirill A. Shutemov   thp: implement sp...
1572
1573
  	write = pmd_write(*pmd);
  	young = pmd_young(*pmd);
b8d3c4c30   Minchan Kim   mm/huge_memory.c:...
1574
  	dirty = pmd_dirty(*pmd);
804dd1504   Andrea Arcangeli   soft_dirty: fix s...
1575
  	soft_dirty = pmd_soft_dirty(*pmd);
eef1b3ba0   Kirill A. Shutemov   thp: implement sp...
1576

c777e2a8b   Aneesh Kumar K.V   powerpc/mm: Fix M...
1577
  	pmdp_huge_split_prepare(vma, haddr, pmd);
eef1b3ba0   Kirill A. Shutemov   thp: implement sp...
1578
1579
  	pgtable = pgtable_trans_huge_withdraw(mm, pmd);
  	pmd_populate(mm, &_pmd, pgtable);
2ac015e29   Kirill A. Shutemov   thp: call pmdp_in...
1580
  	for (i = 0, addr = haddr; i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE) {
eef1b3ba0   Kirill A. Shutemov   thp: implement sp...
1581
1582
1583
1584
1585
1586
  		pte_t entry, *pte;
  		/*
  		 * Note that NUMA hinting access restrictions are not
  		 * transferred to avoid any possibility of altering
  		 * permissions across VMAs.
  		 */
ba9882808   Kirill A. Shutemov   thp: add option t...
1587
1588
1589
1590
  		if (freeze) {
  			swp_entry_t swp_entry;
  			swp_entry = make_migration_entry(page + i, write);
  			entry = swp_entry_to_pte(swp_entry);
804dd1504   Andrea Arcangeli   soft_dirty: fix s...
1591
1592
  			if (soft_dirty)
  				entry = pte_swp_mksoft_dirty(entry);
ba9882808   Kirill A. Shutemov   thp: add option t...
1593
  		} else {
6d2329f88   Andrea Arcangeli   mm: vm_page_prot:...
1594
  			entry = mk_pte(page + i, READ_ONCE(vma->vm_page_prot));
b8d3c4c30   Minchan Kim   mm/huge_memory.c:...
1595
  			entry = maybe_mkwrite(entry, vma);
ba9882808   Kirill A. Shutemov   thp: add option t...
1596
1597
1598
1599
  			if (!write)
  				entry = pte_wrprotect(entry);
  			if (!young)
  				entry = pte_mkold(entry);
804dd1504   Andrea Arcangeli   soft_dirty: fix s...
1600
1601
  			if (soft_dirty)
  				entry = pte_mksoft_dirty(entry);
ba9882808   Kirill A. Shutemov   thp: add option t...
1602
  		}
b8d3c4c30   Minchan Kim   mm/huge_memory.c:...
1603
1604
  		if (dirty)
  			SetPageDirty(page + i);
2ac015e29   Kirill A. Shutemov   thp: call pmdp_in...
1605
  		pte = pte_offset_map(&_pmd, addr);
eef1b3ba0   Kirill A. Shutemov   thp: implement sp...
1606
  		BUG_ON(!pte_none(*pte));
2ac015e29   Kirill A. Shutemov   thp: call pmdp_in...
1607
  		set_pte_at(mm, addr, pte, entry);
eef1b3ba0   Kirill A. Shutemov   thp: implement sp...
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
  		atomic_inc(&page[i]._mapcount);
  		pte_unmap(pte);
  	}
  
  	/*
  	 * Set PG_double_map before dropping compound_mapcount to avoid
  	 * false-negative page_mapped().
  	 */
  	if (compound_mapcount(page) > 1 && !TestSetPageDoubleMap(page)) {
  		for (i = 0; i < HPAGE_PMD_NR; i++)
  			atomic_inc(&page[i]._mapcount);
  	}
  
  	if (atomic_add_negative(-1, compound_mapcount_ptr(page))) {
  		/* Last compound_mapcount is gone. */
11fb99898   Mel Gorman   mm: move most fil...
1623
  		__dec_node_page_state(page, NR_ANON_THPS);
eef1b3ba0   Kirill A. Shutemov   thp: implement sp...
1624
1625
1626
1627
1628
1629
1630
1631
  		if (TestClearPageDoubleMap(page)) {
  			/* No need in mapcount reference anymore */
  			for (i = 0; i < HPAGE_PMD_NR; i++)
  				atomic_dec(&page[i]._mapcount);
  		}
  	}
  
  	smp_wmb(); /* make pte visible before pmd */
e9b61f198   Kirill A. Shutemov   thp: reintroduce ...
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
  	/*
  	 * Up to this point the pmd is present and huge and userland has the
  	 * whole access to the hugepage during the split (which happens in
  	 * place). If we overwrite the pmd with the not-huge version pointing
  	 * to the pte here (which of course we could if all CPUs were bug
  	 * free), userland could trigger a small page size TLB miss on the
  	 * small sized TLB while the hugepage TLB entry is still established in
	 * the huge TLB. Some CPUs don't like that.
	 * See http://support.amd.com/us/Processor_TechDocs/41322.pdf, Erratum
	 * 383 on page 93. Intel should be safe but also warns that it's
	 * only safe if the permission and cache attributes of the two entries
	 * loaded in the two TLBs are identical (which should be the case here).
  	 * But it is generally safer to never allow small and huge TLB entries
  	 * for the same virtual address to be loaded simultaneously. So instead
  	 * of doing "pmd_populate(); flush_pmd_tlb_range();" we first mark the
  	 * current pmd notpresent (atomically because here the pmd_trans_huge
  	 * and pmd_trans_splitting must remain set at all times on the pmd
  	 * until the split is complete for this pmd), then we flush the SMP TLB
  	 * and finally we write the non-huge version of the pmd entry with
  	 * pmd_populate.
  	 */
  	pmdp_invalidate(vma, haddr, pmd);
eef1b3ba0   Kirill A. Shutemov   thp: implement sp...
1654
  	pmd_populate(mm, pmd, pgtable);
e9b61f198   Kirill A. Shutemov   thp: reintroduce ...
1655
1656
  
  	if (freeze) {
2ac015e29   Kirill A. Shutemov   thp: call pmdp_in...
1657
  		for (i = 0; i < HPAGE_PMD_NR; i++) {
e9b61f198   Kirill A. Shutemov   thp: reintroduce ...
1658
1659
1660
1661
  			page_remove_rmap(page + i, false);
  			put_page(page + i);
  		}
  	}
eef1b3ba0   Kirill A. Shutemov   thp: implement sp...
1662
1663
1664
  }
  
  void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
33f4751e9   Naoya Horiguchi   mm: thp: move pmd...
1665
  		unsigned long address, bool freeze, struct page *page)
eef1b3ba0   Kirill A. Shutemov   thp: implement sp...
1666
1667
1668
1669
1670
1671
1672
  {
  	spinlock_t *ptl;
  	struct mm_struct *mm = vma->vm_mm;
  	unsigned long haddr = address & HPAGE_PMD_MASK;
  
  	mmu_notifier_invalidate_range_start(mm, haddr, haddr + HPAGE_PMD_SIZE);
  	ptl = pmd_lock(mm, pmd);
33f4751e9   Naoya Horiguchi   mm: thp: move pmd...
1673
1674
1675
1676
1677
1678
1679
1680
  
  	/*
	 * If the caller asks to set up migration entries, we need a page to
	 * check the pmd against. Otherwise we can end up replacing the wrong page.
  	 */
  	VM_BUG_ON(freeze && !page);
  	if (page && page != pmd_page(*pmd))
		goto out;
5c7fb56e5   Dan Williams   mm, dax: dax-pmd ...
1681
  	if (pmd_trans_huge(*pmd)) {
33f4751e9   Naoya Horiguchi   mm: thp: move pmd...
1682
  		page = pmd_page(*pmd);
5c7fb56e5   Dan Williams   mm, dax: dax-pmd ...
1683
  		if (PageMlocked(page))
5f7377147   Kirill A. Shutemov   thp: fix deadlock...
1684
  			clear_page_mlock(page);
5c7fb56e5   Dan Williams   mm, dax: dax-pmd ...
1685
  	} else if (!pmd_devmap(*pmd))
e90309c9f   Kirill A. Shutemov   thp: allow mlocke...
1686
  		goto out;
fec89c109   Kirill A. Shutemov   thp: rewrite free...
1687
  	__split_huge_pmd_locked(vma, pmd, haddr, freeze);
e90309c9f   Kirill A. Shutemov   thp: allow mlocke...
1688
  out:
eef1b3ba0   Kirill A. Shutemov   thp: implement sp...
1689
1690
1691
  	spin_unlock(ptl);
  	mmu_notifier_invalidate_range_end(mm, haddr, haddr + HPAGE_PMD_SIZE);
  }
fec89c109   Kirill A. Shutemov   thp: rewrite free...
1692
1693
  void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address,
  		bool freeze, struct page *page)
94fcc585f   Andrea Arcangeli   thp: avoid breaki...
1694
  {
f72e7dcdd   Hugh Dickins   mm: let mm_find_p...
1695
1696
  	pgd_t *pgd;
  	pud_t *pud;
94fcc585f   Andrea Arcangeli   thp: avoid breaki...
1697
  	pmd_t *pmd;
78ddc5347   Kirill A. Shutemov   thp: rename split...
1698
  	pgd = pgd_offset(vma->vm_mm, address);
f72e7dcdd   Hugh Dickins   mm: let mm_find_p...
1699
1700
1701
1702
1703
1704
1705
1706
  	if (!pgd_present(*pgd))
  		return;
  
  	pud = pud_offset(pgd, address);
  	if (!pud_present(*pud))
  		return;
  
  	pmd = pmd_offset(pud, address);
fec89c109   Kirill A. Shutemov   thp: rewrite free...
1707

33f4751e9   Naoya Horiguchi   mm: thp: move pmd...
1708
  	__split_huge_pmd(vma, pmd, address, freeze, page);
94fcc585f   Andrea Arcangeli   thp: avoid breaki...
1709
  }
e1b9996b8   Kirill A. Shutemov   thp: vma_adjust_t...
1710
  void vma_adjust_trans_huge(struct vm_area_struct *vma,
94fcc585f   Andrea Arcangeli   thp: avoid breaki...
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
  			     unsigned long start,
  			     unsigned long end,
  			     long adjust_next)
  {
  	/*
  	 * If the new start address isn't hpage aligned and it could
	 * previously contain a hugepage: check if we need to split
	 * a huge pmd.
  	 */
  	if (start & ~HPAGE_PMD_MASK &&
  	    (start & HPAGE_PMD_MASK) >= vma->vm_start &&
  	    (start & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= vma->vm_end)
fec89c109   Kirill A. Shutemov   thp: rewrite free...
1723
  		split_huge_pmd_address(vma, start, false, NULL);
94fcc585f   Andrea Arcangeli   thp: avoid breaki...
1724
1725
1726
1727
1728
1729
1730
1731
1732
  
  	/*
  	 * If the new end address isn't hpage aligned and it could
	 * previously contain a hugepage: check if we need to split
	 * a huge pmd.
  	 */
  	if (end & ~HPAGE_PMD_MASK &&
  	    (end & HPAGE_PMD_MASK) >= vma->vm_start &&
  	    (end & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= vma->vm_end)
fec89c109   Kirill A. Shutemov   thp: rewrite free...
1733
  		split_huge_pmd_address(vma, end, false, NULL);
94fcc585f   Andrea Arcangeli   thp: avoid breaki...
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
  
  	/*
	 * If we're also updating vma->vm_next->vm_start, and the new
	 * vm_next->vm_start isn't page aligned and it could previously
	 * contain a hugepage: check if we need to split a huge pmd.
  	 */
  	if (adjust_next > 0) {
  		struct vm_area_struct *next = vma->vm_next;
  		unsigned long nstart = next->vm_start;
  		nstart += adjust_next << PAGE_SHIFT;
  		if (nstart & ~HPAGE_PMD_MASK &&
  		    (nstart & HPAGE_PMD_MASK) >= next->vm_start &&
  		    (nstart & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= next->vm_end)
fec89c109   Kirill A. Shutemov   thp: rewrite free...
1747
  			split_huge_pmd_address(next, nstart, false, NULL);
94fcc585f   Andrea Arcangeli   thp: avoid breaki...
1748
1749
  	}
  }
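/*
 * Worked example of the alignment checks above, assuming 2MB huge pages
 * (HPAGE_PMD_SIZE = 0x200000): for start = 0x1234000,
 * start & ~HPAGE_PMD_MASK = 0x34000 is non-zero, so the new start is not
 * hugepage aligned, and start & HPAGE_PMD_MASK = 0x1200000. If the range
 * [0x1200000, 0x1400000) still lies within the VMA, a huge pmd mapping it
 * would straddle the new boundary, so split_huge_pmd_address() splits it
 * back to ptes. The addresses are illustrative only.
 */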
e9b61f198   Kirill A. Shutemov   thp: reintroduce ...
1750

fec89c109   Kirill A. Shutemov   thp: rewrite free...
1751
  static void freeze_page(struct page *page)
e9b61f198   Kirill A. Shutemov   thp: reintroduce ...
1752
  {
baa355fd3   Kirill A. Shutemov   thp: file pages s...
1753
1754
  	enum ttu_flags ttu_flags = TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS |
  		TTU_RMAP_LOCKED;
fec89c109   Kirill A. Shutemov   thp: rewrite free...
1755
  	int i, ret;
e9b61f198   Kirill A. Shutemov   thp: reintroduce ...
1756
1757
  
  	VM_BUG_ON_PAGE(!PageHead(page), page);
baa355fd3   Kirill A. Shutemov   thp: file pages s...
1758
1759
  	if (PageAnon(page))
  		ttu_flags |= TTU_MIGRATION;
fec89c109   Kirill A. Shutemov   thp: rewrite free...
1760
1761
1762
1763
1764
1765
  	/* We only need TTU_SPLIT_HUGE_PMD once */
  	ret = try_to_unmap(page, ttu_flags | TTU_SPLIT_HUGE_PMD);
  	for (i = 1; !ret && i < HPAGE_PMD_NR; i++) {
  		/* Cut short if the page is unmapped */
  		if (page_count(page) == 1)
  			return;
e9b61f198   Kirill A. Shutemov   thp: reintroduce ...
1766

fec89c109   Kirill A. Shutemov   thp: rewrite free...
1767
  		ret = try_to_unmap(page + i, ttu_flags);
e9b61f198   Kirill A. Shutemov   thp: reintroduce ...
1768
  	}
baa355fd3   Kirill A. Shutemov   thp: file pages s...
1769
  	VM_BUG_ON_PAGE(ret, page + i - 1);
e9b61f198   Kirill A. Shutemov   thp: reintroduce ...
1770
  }
fec89c109   Kirill A. Shutemov   thp: rewrite free...
1771
  static void unfreeze_page(struct page *page)
e9b61f198   Kirill A. Shutemov   thp: reintroduce ...
1772
  {
fec89c109   Kirill A. Shutemov   thp: rewrite free...
1773
  	int i;
e9b61f198   Kirill A. Shutemov   thp: reintroduce ...
1774

fec89c109   Kirill A. Shutemov   thp: rewrite free...
1775
1776
  	for (i = 0; i < HPAGE_PMD_NR; i++)
  		remove_migration_ptes(page + i, page + i, true);
e9b61f198   Kirill A. Shutemov   thp: reintroduce ...
1777
  }
8df651c70   Kirill A. Shutemov   thp: cleanup spli...
1778
  static void __split_huge_page_tail(struct page *head, int tail,
e9b61f198   Kirill A. Shutemov   thp: reintroduce ...
1779
1780
  		struct lruvec *lruvec, struct list_head *list)
  {
e9b61f198   Kirill A. Shutemov   thp: reintroduce ...
1781
  	struct page *page_tail = head + tail;
8df651c70   Kirill A. Shutemov   thp: cleanup spli...
1782
  	VM_BUG_ON_PAGE(atomic_read(&page_tail->_mapcount) != -1, page_tail);
fe896d187   Joonsoo Kim   mm: introduce pag...
1783
  	VM_BUG_ON_PAGE(page_ref_count(page_tail) != 0, page_tail);
e9b61f198   Kirill A. Shutemov   thp: reintroduce ...
1784
1785
  
  	/*
0139aa7b7   Joonsoo Kim   mm: rename _count...
1786
  	 * tail_page->_refcount is zero and not changing from under us. But
e9b61f198   Kirill A. Shutemov   thp: reintroduce ...
1787
  	 * get_page_unless_zero() may be running from under us on the
baa355fd3   Kirill A. Shutemov   thp: file pages s...
1788
1789
  	 * tail_page. If we used atomic_set() below instead of atomic_inc() or
  	 * atomic_add(), we would then run atomic_set() concurrently with
e9b61f198   Kirill A. Shutemov   thp: reintroduce ...
1790
1791
1792
1793
  	 * get_page_unless_zero(), and atomic_set() is implemented in C not
	 * using locked ops. spin_unlock on x86 sometimes uses locked ops
  	 * because of PPro errata 66, 92, so unless somebody can guarantee
  	 * atomic_set() here would be safe on all archs (and not only on x86),
baa355fd3   Kirill A. Shutemov   thp: file pages s...
1794
  	 * it's safer to use atomic_inc()/atomic_add().
e9b61f198   Kirill A. Shutemov   thp: reintroduce ...
1795
  	 */
baa355fd3   Kirill A. Shutemov   thp: file pages s...
1796
1797
1798
1799
1800
1801
  	if (PageAnon(head)) {
  		page_ref_inc(page_tail);
  	} else {
  		/* Additional pin to radix tree */
  		page_ref_add(page_tail, 2);
  	}
e9b61f198   Kirill A. Shutemov   thp: reintroduce ...
1802
1803
1804
1805
1806
1807
1808
1809
1810
  
  	page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
  	page_tail->flags |= (head->flags &
  			((1L << PG_referenced) |
  			 (1L << PG_swapbacked) |
  			 (1L << PG_mlocked) |
  			 (1L << PG_uptodate) |
  			 (1L << PG_active) |
  			 (1L << PG_locked) |
b8d3c4c30   Minchan Kim   mm/huge_memory.c:...
1811
1812
  			 (1L << PG_unevictable) |
  			 (1L << PG_dirty)));
e9b61f198   Kirill A. Shutemov   thp: reintroduce ...
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
  
  	/*
  	 * After clearing PageTail the gup refcount can be released.
  	 * Page flags also must be visible before we make the page non-compound.
  	 */
  	smp_wmb();
  
  	clear_compound_head(page_tail);
  
  	if (page_is_young(head))
  		set_page_young(page_tail);
  	if (page_is_idle(head))
  		set_page_idle(page_tail);
  
  	/* ->mapping in first tail page is compound_mapcount */
9a982250f   Kirill A. Shutemov   thp: introduce de...
1828
  	VM_BUG_ON_PAGE(tail > 2 && page_tail->mapping != TAIL_MAPPING,
e9b61f198   Kirill A. Shutemov   thp: reintroduce ...
1829
1830
1831
1832
1833
1834
  			page_tail);
  	page_tail->mapping = head->mapping;
  
  	page_tail->index = head->index + tail;
  	page_cpupid_xchg_last(page_tail, page_cpupid_last(head));
  	lru_add_page_tail(head, page_tail, lruvec, list);
e9b61f198   Kirill A. Shutemov   thp: reintroduce ...
1835
  }
baa355fd3   Kirill A. Shutemov   thp: file pages s...
1836
1837
  static void __split_huge_page(struct page *page, struct list_head *list,
  		unsigned long flags)
e9b61f198   Kirill A. Shutemov   thp: reintroduce ...
1838
1839
1840
1841
  {
  	struct page *head = compound_head(page);
  	struct zone *zone = page_zone(head);
  	struct lruvec *lruvec;
baa355fd3   Kirill A. Shutemov   thp: file pages s...
1842
  	pgoff_t end = -1;
8df651c70   Kirill A. Shutemov   thp: cleanup spli...
1843
  	int i;
e9b61f198   Kirill A. Shutemov   thp: reintroduce ...
1844

599d0c954   Mel Gorman   mm, vmscan: move ...
1845
  	lruvec = mem_cgroup_page_lruvec(head, zone->zone_pgdat);
e9b61f198   Kirill A. Shutemov   thp: reintroduce ...
1846
1847
1848
  
	/* complete memcg work before adding pages to the LRU */
  	mem_cgroup_split_huge_fixup(head);
baa355fd3   Kirill A. Shutemov   thp: file pages s...
1849
1850
1851
1852
  	if (!PageAnon(page))
  		end = DIV_ROUND_UP(i_size_read(head->mapping->host), PAGE_SIZE);
  
  	for (i = HPAGE_PMD_NR - 1; i >= 1; i--) {
8df651c70   Kirill A. Shutemov   thp: cleanup spli...
1853
  		__split_huge_page_tail(head, i, lruvec, list);
baa355fd3   Kirill A. Shutemov   thp: file pages s...
1854
1855
1856
1857
  		/* Some pages can be beyond i_size: drop them from page cache */
  		if (head[i].index >= end) {
  			__ClearPageDirty(head + i);
  			__delete_from_page_cache(head + i, NULL);
800d8c63b   Kirill A. Shutemov   shmem: add huge p...
1858
1859
  			if (IS_ENABLED(CONFIG_SHMEM) && PageSwapBacked(head))
  				shmem_uncharge(head->mapping->host, 1);
baa355fd3   Kirill A. Shutemov   thp: file pages s...
1860
1861
1862
  			put_page(head + i);
  		}
  	}
e9b61f198   Kirill A. Shutemov   thp: reintroduce ...
1863
1864
  
  	ClearPageCompound(head);
baa355fd3   Kirill A. Shutemov   thp: file pages s...
1865
1866
1867
1868
1869
1870
1871
1872
  	/* See comment in __split_huge_page_tail() */
  	if (PageAnon(head)) {
  		page_ref_inc(head);
  	} else {
  		/* Additional pin to radix tree */
  		page_ref_add(head, 2);
  		spin_unlock(&head->mapping->tree_lock);
  	}
a52633d8e   Mel Gorman   mm, vmscan: move ...
1873
  	spin_unlock_irqrestore(zone_lru_lock(page_zone(head)), flags);
e9b61f198   Kirill A. Shutemov   thp: reintroduce ...
1874

fec89c109   Kirill A. Shutemov   thp: rewrite free...
1875
  	unfreeze_page(head);
e9b61f198   Kirill A. Shutemov   thp: reintroduce ...
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
  
  	for (i = 0; i < HPAGE_PMD_NR; i++) {
  		struct page *subpage = head + i;
  		if (subpage == page)
  			continue;
  		unlock_page(subpage);
  
  		/*
		 * Subpages may be freed if there wasn't any mapping left,
		 * e.g. if add_to_swap() is running on an LRU page that
  		 * had its mapping zapped. And freeing these pages
  		 * requires taking the lru_lock so we do the put_page
  		 * of the tail pages after the split is complete.
  		 */
  		put_page(subpage);
  	}
  }
b20ce5e03   Kirill A. Shutemov   mm: prepare page_...
1893
1894
  int total_mapcount(struct page *page)
  {
dd78fedde   Kirill A. Shutemov   rmap: support fil...
1895
  	int i, compound, ret;
b20ce5e03   Kirill A. Shutemov   mm: prepare page_...
1896
1897
1898
1899
1900
  
  	VM_BUG_ON_PAGE(PageTail(page), page);
  
  	if (likely(!PageCompound(page)))
  		return atomic_read(&page->_mapcount) + 1;
dd78fedde   Kirill A. Shutemov   rmap: support fil...
1901
  	compound = compound_mapcount(page);
b20ce5e03   Kirill A. Shutemov   mm: prepare page_...
1902
  	if (PageHuge(page))
dd78fedde   Kirill A. Shutemov   rmap: support fil...
1903
1904
  		return compound;
  	ret = compound;
b20ce5e03   Kirill A. Shutemov   mm: prepare page_...
1905
1906
  	for (i = 0; i < HPAGE_PMD_NR; i++)
  		ret += atomic_read(&page[i]._mapcount) + 1;
dd78fedde   Kirill A. Shutemov   rmap: support fil...
1907
1908
1909
	/* File pages have compound_mapcount included in _mapcount */
  	if (!PageAnon(page))
  		return ret - compound * HPAGE_PMD_NR;
b20ce5e03   Kirill A. Shutemov   mm: prepare page_...
1910
1911
1912
1913
  	if (PageDoubleMap(page))
  		ret -= HPAGE_PMD_NR;
  	return ret;
  }
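/*
 * Worked example, assuming HPAGE_PMD_NR = 512: for an anon THP PMD-mapped in
 * exactly one mm and not pte-mapped anywhere, compound_mapcount() is 1 and
 * every subpage _mapcount is -1, so the loop above adds nothing and the
 * function returns 1. If the same THP is additionally pte-mapped in full by
 * a second mm, the result counts one mapping for the pmd plus one per
 * pte-mapped subpage: 1 + 512 = 513.
 */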
e9b61f198   Kirill A. Shutemov   thp: reintroduce ...
1914
  /*
6d0a07edd   Andrea Arcangeli   mm: thp: calculat...
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
   * This calculates accurately how many mappings a transparent hugepage
   * has (unlike page_mapcount() which isn't fully accurate). This full
   * accuracy is primarily needed to know if copy-on-write faults can
   * reuse the page and change the mapping to read-write instead of
 * copying it. At the same time this returns the total_mapcount too.
   *
   * The function returns the highest mapcount any one of the subpages
   * has. If the return value is one, even if different processes are
   * mapping different subpages of the transparent hugepage, they can
   * all reuse it, because each process is reusing a different subpage.
   *
   * The total_mapcount is instead counting all virtual mappings of the
   * subpages. If the total_mapcount is equal to "one", it tells the
   * caller all mappings belong to the same "mm" and in turn the
   * anon_vma of the transparent hugepage can become the vma->anon_vma
   * local one as no other process may be mapping any of the subpages.
   *
   * It would be more accurate to replace page_mapcount() with
   * page_trans_huge_mapcount(), however we only use
   * page_trans_huge_mapcount() in the copy-on-write faults where we
   * need full accuracy to avoid breaking page pinning, because
   * page_trans_huge_mapcount() is slower than page_mapcount().
   */
  int page_trans_huge_mapcount(struct page *page, int *total_mapcount)
  {
  	int i, ret, _total_mapcount, mapcount;
  
  	/* hugetlbfs shouldn't call it */
  	VM_BUG_ON_PAGE(PageHuge(page), page);
  
  	if (likely(!PageTransCompound(page))) {
  		mapcount = atomic_read(&page->_mapcount) + 1;
  		if (total_mapcount)
  			*total_mapcount = mapcount;
  		return mapcount;
  	}
  
  	page = compound_head(page);
  
  	_total_mapcount = ret = 0;
  	for (i = 0; i < HPAGE_PMD_NR; i++) {
  		mapcount = atomic_read(&page[i]._mapcount) + 1;
  		ret = max(ret, mapcount);
  		_total_mapcount += mapcount;
  	}
  	if (PageDoubleMap(page)) {
  		ret -= 1;
  		_total_mapcount -= HPAGE_PMD_NR;
  	}
  	mapcount = compound_mapcount(page);
  	ret += mapcount;
  	_total_mapcount += mapcount;
  	if (total_mapcount)
  		*total_mapcount = _total_mapcount;
  	return ret;
  }
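/*
 * Example of the distinction described above, assuming HPAGE_PMD_NR = 512:
 * if two processes each pte-map a different half of the THP (256 subpages
 * each) and nobody maps it with a pmd, every mapped subpage has mapcount 1,
 * so this function returns 1 and a copy-on-write fault may reuse the page,
 * while *total_mapcount is reported as 512.
 */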
  
  /*
e9b61f198   Kirill A. Shutemov   thp: reintroduce ...
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
 * This function splits a huge page into normal pages. @page can point to any
 * subpage of the huge page to split. The split doesn't change the position of @page.
 *
 * The caller must hold the only pin on @page, otherwise the split fails with -EBUSY.
   * The huge page must be locked.
   *
   * If @list is null, tail pages will be added to LRU list, otherwise, to @list.
   *
   * Both head page and tail pages will inherit mapping, flags, and so on from
   * the hugepage.
   *
 * The GUP pin and PG_locked are transferred to @page. The rest of the subpages
 * can be freed if they are not mapped.
   *
   * Returns 0 if the hugepage is split successfully.
   * Returns -EBUSY if the page is pinned or if anon_vma disappeared from under
   * us.
   */
  int split_huge_page_to_list(struct page *page, struct list_head *list)
  {
  	struct page *head = compound_head(page);
a3d0a9185   Kirill A. Shutemov   thp: make split_q...
1994
  	struct pglist_data *pgdata = NODE_DATA(page_to_nid(head));
baa355fd3   Kirill A. Shutemov   thp: file pages s...
1995
1996
1997
  	struct anon_vma *anon_vma = NULL;
  	struct address_space *mapping = NULL;
  	int count, mapcount, extra_pins, ret;
d96543223   Kirill A. Shutemov   thp: increase spl...
1998
  	bool mlocked;
0b9b6fff7   Kirill A. Shutemov   thp: fix interrup...
1999
  	unsigned long flags;
e9b61f198   Kirill A. Shutemov   thp: reintroduce ...
2000
2001
  
  	VM_BUG_ON_PAGE(is_huge_zero_page(page), page);
e9b61f198   Kirill A. Shutemov   thp: reintroduce ...
2002
2003
2004
  	VM_BUG_ON_PAGE(!PageLocked(page), page);
  	VM_BUG_ON_PAGE(!PageSwapBacked(page), page);
  	VM_BUG_ON_PAGE(!PageCompound(page), page);
baa355fd3   Kirill A. Shutemov   thp: file pages s...
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
  	if (PageAnon(head)) {
  		/*
  		 * The caller does not necessarily hold an mmap_sem that would
  		 * prevent the anon_vma disappearing, so we first take a
  		 * reference to it and then lock the anon_vma for write. This
  		 * is similar to page_lock_anon_vma_read except the write lock
  		 * is taken to serialise against parallel split or collapse
  		 * operations.
  		 */
  		anon_vma = page_get_anon_vma(head);
  		if (!anon_vma) {
  			ret = -EBUSY;
  			goto out;
  		}
  		extra_pins = 0;
  		mapping = NULL;
  		anon_vma_lock_write(anon_vma);
  	} else {
  		mapping = head->mapping;
  
  		/* Truncated? */
  		if (!mapping) {
  			ret = -EBUSY;
  			goto out;
  		}
  
  		/* Additional pins from radix tree */
  		extra_pins = HPAGE_PMD_NR;
  		anon_vma = NULL;
  		i_mmap_lock_read(mapping);
e9b61f198   Kirill A. Shutemov   thp: reintroduce ...
2035
  	}
e9b61f198   Kirill A. Shutemov   thp: reintroduce ...
2036
2037
2038
2039
2040
  
  	/*
  	 * Racy check whether we can split the page, before freeze_page()
  	 * splits the PMDs
  	 */
baa355fd3   Kirill A. Shutemov   thp: file pages s...
2041
  	if (total_mapcount(head) != page_count(head) - extra_pins - 1) {
e9b61f198   Kirill A. Shutemov   thp: reintroduce ...
2042
2043
2044
  		ret = -EBUSY;
  		goto out_unlock;
  	}
d96543223   Kirill A. Shutemov   thp: increase spl...
2045
  	mlocked = PageMlocked(page);
fec89c109   Kirill A. Shutemov   thp: rewrite free...
2046
  	freeze_page(head);
e9b61f198   Kirill A. Shutemov   thp: reintroduce ...
2047
  	VM_BUG_ON_PAGE(compound_mapcount(head), head);
d96543223   Kirill A. Shutemov   thp: increase spl...
2048
2049
2050
  	/* Make sure the page is not on a per-CPU pagevec, as that takes a pin */
  	if (mlocked)
  		lru_add_drain();
baa355fd3   Kirill A. Shutemov   thp: file pages s...
2051
  	/* prevent PageLRU from going away from under us, and freeze LRU stats */
a52633d8e   Mel Gorman   mm, vmscan: move ...
2052
  	spin_lock_irqsave(zone_lru_lock(page_zone(head)), flags);
baa355fd3   Kirill A. Shutemov   thp: file pages s...
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
  
  	if (mapping) {
  		void **pslot;
  
  		spin_lock(&mapping->tree_lock);
  		pslot = radix_tree_lookup_slot(&mapping->page_tree,
  				page_index(head));
  		/*
  		 * Check that the head page is present in the radix tree.
  		 * We assume all tail pages are present too, if the head is there.
  		 */
  		if (radix_tree_deref_slot_protected(pslot,
  					&mapping->tree_lock) != head)
  			goto fail;
  	}
0139aa7b7   Joonsoo Kim   mm: rename _count...
2068
  	/* Prevent deferred_split_scan() from touching ->_refcount */
baa355fd3   Kirill A. Shutemov   thp: file pages s...
2069
  	spin_lock(&pgdata->split_queue_lock);
e9b61f198   Kirill A. Shutemov   thp: reintroduce ...
2070
2071
  	count = page_count(head);
  	mapcount = total_mapcount(head);
baa355fd3   Kirill A. Shutemov   thp: file pages s...
2072
  	if (!mapcount && page_ref_freeze(head, 1 + extra_pins)) {
9a982250f   Kirill A. Shutemov   thp: introduce de...
2073
  		if (!list_empty(page_deferred_list(head))) {
a3d0a9185   Kirill A. Shutemov   thp: make split_q...
2074
  			pgdata->split_queue_len--;
9a982250f   Kirill A. Shutemov   thp: introduce de...
2075
2076
  			list_del(page_deferred_list(head));
  		}
65c453778   Kirill A. Shutemov   mm, rmap: account...
2077
  		if (mapping)
11fb99898   Mel Gorman   mm: move most fil...
2078
  			__dec_node_page_state(page, NR_SHMEM_THPS);
baa355fd3   Kirill A. Shutemov   thp: file pages s...
2079
2080
  		spin_unlock(&pgdata->split_queue_lock);
  		__split_huge_page(page, list, flags);
e9b61f198   Kirill A. Shutemov   thp: reintroduce ...
2081
  		ret = 0;
e9b61f198   Kirill A. Shutemov   thp: reintroduce ...
2082
  	} else {
baa355fd3   Kirill A. Shutemov   thp: file pages s...
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
  		if (IS_ENABLED(CONFIG_DEBUG_VM) && mapcount) {
  			pr_alert("total_mapcount: %u, page_count(): %u
  ",
  					mapcount, count);
  			if (PageTail(page))
  				dump_page(head, NULL);
  			dump_page(page, "total_mapcount(head) > 0");
  			BUG();
  		}
  		spin_unlock(&pgdata->split_queue_lock);
  fail:		if (mapping)
  			spin_unlock(&mapping->tree_lock);
a52633d8e   Mel Gorman   mm, vmscan: move ...
2095
  		spin_unlock_irqrestore(zone_lru_lock(page_zone(head)), flags);
fec89c109   Kirill A. Shutemov   thp: rewrite free...
2096
  		unfreeze_page(head);
e9b61f198   Kirill A. Shutemov   thp: reintroduce ...
2097
2098
2099
2100
  		ret = -EBUSY;
  	}
  
  out_unlock:
baa355fd3   Kirill A. Shutemov   thp: file pages s...
2101
2102
2103
2104
2105
2106
  	if (anon_vma) {
  		anon_vma_unlock_write(anon_vma);
  		put_anon_vma(anon_vma);
  	}
  	if (mapping)
  		i_mmap_unlock_read(mapping);
e9b61f198   Kirill A. Shutemov   thp: reintroduce ...
2107
2108
2109
2110
  out:
  	count_vm_event(!ret ? THP_SPLIT_PAGE : THP_SPLIT_PAGE_FAILED);
  	return ret;
  }
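
A minimal usage sketch, assuming the caller already holds a pin (a reference) on the page, as required above; try_split_thp() is a hypothetical wrapper, and split_huge_page() is the NULL-list convenience form of split_huge_page_to_list():

/* Hypothetical wrapper; the caller is assumed to hold a reference. */
static int try_split_thp(struct page *page)
{
	int ret;

	lock_page(page);		/* the huge page must be locked */
	ret = split_huge_page(page);	/* tail pages go to the LRU list */
	unlock_page(page);		/* PG_locked stays on @page either way */
	return ret;			/* 0 on success, -EBUSY otherwise */
}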
9a982250f   Kirill A. Shutemov   thp: introduce de...
2111
2112
2113
  
  void free_transhuge_page(struct page *page)
  {
a3d0a9185   Kirill A. Shutemov   thp: make split_q...
2114
  	struct pglist_data *pgdata = NODE_DATA(page_to_nid(page));
9a982250f   Kirill A. Shutemov   thp: introduce de...
2115
  	unsigned long flags;
a3d0a9185   Kirill A. Shutemov   thp: make split_q...
2116
  	spin_lock_irqsave(&pgdata->split_queue_lock, flags);
9a982250f   Kirill A. Shutemov   thp: introduce de...
2117
  	if (!list_empty(page_deferred_list(page))) {
a3d0a9185   Kirill A. Shutemov   thp: make split_q...
2118
  		pgdata->split_queue_len--;
9a982250f   Kirill A. Shutemov   thp: introduce de...
2119
2120
  		list_del(page_deferred_list(page));
  	}
a3d0a9185   Kirill A. Shutemov   thp: make split_q...
2121
  	spin_unlock_irqrestore(&pgdata->split_queue_lock, flags);
9a982250f   Kirill A. Shutemov   thp: introduce de...
2122
2123
2124
2125
2126
  	free_compound_page(page);
  }
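
free_transhuge_page() only runs because each THP is tagged with the transparent-hugepage compound destructor at allocation time. The sketch below mirrors the preparation side (prep_transhuge_page(), earlier in this file) and is shown here only for orientation:

/* Sketch of the preparation side that pairs with free_transhuge_page(). */
static void prep_transhuge_page_sketch(struct page *page)
{
	/* the deferred list reuses ->mapping/->index of a tail page */
	INIT_LIST_HEAD(page_deferred_list(page));
	set_compound_page_dtor(page, TRANSHUGE_PAGE_DTOR);
}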
  
  void deferred_split_huge_page(struct page *page)
  {
a3d0a9185   Kirill A. Shutemov   thp: make split_q...
2127
  	struct pglist_data *pgdata = NODE_DATA(page_to_nid(page));
9a982250f   Kirill A. Shutemov   thp: introduce de...
2128
2129
2130
  	unsigned long flags;
  
  	VM_BUG_ON_PAGE(!PageTransHuge(page), page);
a3d0a9185   Kirill A. Shutemov   thp: make split_q...
2131
  	spin_lock_irqsave(&pgdata->split_queue_lock, flags);
9a982250f   Kirill A. Shutemov   thp: introduce de...
2132
  	if (list_empty(page_deferred_list(page))) {
f9719a03d   Kirill A. Shutemov   thp, vmstats: cou...
2133
  		count_vm_event(THP_DEFERRED_SPLIT_PAGE);
a3d0a9185   Kirill A. Shutemov   thp: make split_q...
2134
2135
  		list_add_tail(page_deferred_list(page), &pgdata->split_queue);
  		pgdata->split_queue_len++;
9a982250f   Kirill A. Shutemov   thp: introduce de...
2136
  	}
a3d0a9185   Kirill A. Shutemov   thp: make split_q...
2137
  	spin_unlock_irqrestore(&pgdata->split_queue_lock, flags);
9a982250f   Kirill A. Shutemov   thp: introduce de...
2138
2139
2140
2141
2142
  }
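
An illustrative, hypothetical call-site sketch: the typical trigger for deferring a split is a THP that lost its PMD mapping while some subpages are still mapped by PTEs, where an immediate split would be wasted work. on_thp_partially_unmapped() is invented for this example:

/* Hypothetical call site, for illustration only. */
static void on_thp_partially_unmapped(struct page *head)
{
	/* still mapped somewhere: queue it and let the shrinker split it later */
	if (PageTransHuge(head) && total_mapcount(head))
		deferred_split_huge_page(head);
}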
  
  static unsigned long deferred_split_count(struct shrinker *shrink,
  		struct shrink_control *sc)
  {
a3d0a9185   Kirill A. Shutemov   thp: make split_q...
2143
  	struct pglist_data *pgdata = NODE_DATA(sc->nid);
cb8d68ec1   Kirill A. Shutemov   thp: change defer...
2144
  	return ACCESS_ONCE(pgdata->split_queue_len);
9a982250f   Kirill A. Shutemov   thp: introduce de...
2145
2146
2147
2148
2149
  }
  
  static unsigned long deferred_split_scan(struct shrinker *shrink,
  		struct shrink_control *sc)
  {
a3d0a9185   Kirill A. Shutemov   thp: make split_q...
2150
  	struct pglist_data *pgdata = NODE_DATA(sc->nid);
9a982250f   Kirill A. Shutemov   thp: introduce de...
2151
2152
2153
2154
  	unsigned long flags;
  	LIST_HEAD(list), *pos, *next;
  	struct page *page;
  	int split = 0;
a3d0a9185   Kirill A. Shutemov   thp: make split_q...
2155
  	spin_lock_irqsave(&pgdata->split_queue_lock, flags);
9a982250f   Kirill A. Shutemov   thp: introduce de...
2156
  	/* Take a pin on all head pages to avoid freeing them under us */
ae026204a   Kirill A. Shutemov   thp: make deferre...
2157
  	list_for_each_safe(pos, next, &pgdata->split_queue) {
9a982250f   Kirill A. Shutemov   thp: introduce de...
2158
2159
  		page = list_entry((void *)pos, struct page, mapping);
  		page = compound_head(page);
e3ae19535   Kirill A. Shutemov   thp: limit number...
2160
2161
2162
2163
  		if (get_page_unless_zero(page)) {
  			list_move(page_deferred_list(page), &list);
  		} else {
  			/* We lost race with put_compound_page() */
9a982250f   Kirill A. Shutemov   thp: introduce de...
2164
  			list_del_init(page_deferred_list(page));
a3d0a9185   Kirill A. Shutemov   thp: make split_q...
2165
  			pgdata->split_queue_len--;
9a982250f   Kirill A. Shutemov   thp: introduce de...
2166
  		}
e3ae19535   Kirill A. Shutemov   thp: limit number...
2167
2168
  		if (!--sc->nr_to_scan)
  			break;
9a982250f   Kirill A. Shutemov   thp: introduce de...
2169
  	}
a3d0a9185   Kirill A. Shutemov   thp: make split_q...
2170
  	spin_unlock_irqrestore(&pgdata->split_queue_lock, flags);
9a982250f   Kirill A. Shutemov   thp: introduce de...
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
  
  	list_for_each_safe(pos, next, &list) {
  		page = list_entry((void *)pos, struct page, mapping);
  		lock_page(page);
  		/* split_huge_page() removes page from list on success */
  		if (!split_huge_page(page))
  			split++;
  		unlock_page(page);
  		put_page(page);
  	}
a3d0a9185   Kirill A. Shutemov   thp: make split_q...
2181
2182
2183
  	spin_lock_irqsave(&pgdata->split_queue_lock, flags);
  	list_splice_tail(&list, &pgdata->split_queue);
  	spin_unlock_irqrestore(&pgdata->split_queue_lock, flags);
9a982250f   Kirill A. Shutemov   thp: introduce de...
2184

cb8d68ec1   Kirill A. Shutemov   thp: change defer...
2185
2186
2187
2188
2189
2190
2191
  	/*
  	 * Stop the shrinker if we didn't split any page but the queue is empty.
  	 * This can happen if pages were freed under us.
  	 */
  	if (!split && list_empty(&pgdata->split_queue))
  		return SHRINK_STOP;
  	return split;
9a982250f   Kirill A. Shutemov   thp: introduce de...
2192
2193
2194
2195
2196
2197
  }
  
  static struct shrinker deferred_split_shrinker = {
  	.count_objects = deferred_split_count,
  	.scan_objects = deferred_split_scan,
  	.seeks = DEFAULT_SEEKS,
a3d0a9185   Kirill A. Shutemov   thp: make split_q...
2198
  	.flags = SHRINKER_NUMA_AWARE,
9a982250f   Kirill A. Shutemov   thp: introduce de...
2199
  };
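
The shrinker only takes effect once it is registered with the VM. A sketch of that registration, mirroring what the THP init path in this file is assumed to do, shown only for orientation:

/* Sketch: registration makes deferred_split_count/scan callable per node. */
static int __init deferred_split_shrinker_register_sketch(void)
{
	return register_shrinker(&deferred_split_shrinker);
}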
49071d436   Kirill A. Shutemov   thp: add debugfs ...
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
  
  #ifdef CONFIG_DEBUG_FS
  static int split_huge_pages_set(void *data, u64 val)
  {
  	struct zone *zone;
  	struct page *page;
  	unsigned long pfn, max_zone_pfn;
  	unsigned long total = 0, split = 0;
  
  	if (val != 1)
  		return -EINVAL;
  
  	for_each_populated_zone(zone) {
  		max_zone_pfn = zone_end_pfn(zone);
  		for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) {
  			if (!pfn_valid(pfn))
  				continue;
  
  			page = pfn_to_page(pfn);
  			if (!get_page_unless_zero(page))
  				continue;
  
  			if (zone != page_zone(page))
  				goto next;
baa355fd3   Kirill A. Shutemov   thp: file pages s...
2224
  			if (!PageHead(page) || PageHuge(page) || !PageLRU(page))
49071d436   Kirill A. Shutemov   thp: add debugfs ...
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
  				goto next;
  
  			total++;
  			lock_page(page);
  			if (!split_huge_page(page))
  				split++;
  			unlock_page(page);
  next:
  			put_page(page);
  		}
  	}
145bdaa15   Yang Shi   mm: thp: correct ...
2236
2237
  	pr_info("%lu of %lu THP split
  ", split, total);
49071d436   Kirill A. Shutemov   thp: add debugfs ...
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
  
  	return 0;
  }
  DEFINE_SIMPLE_ATTRIBUTE(split_huge_pages_fops, NULL, split_huge_pages_set,
  		"%llu
  ");
  
  static int __init split_huge_pages_debugfs(void)
  {
  	void *ret;
145bdaa15   Yang Shi   mm: thp: correct ...
2248
  	ret = debugfs_create_file("split_huge_pages", 0200, NULL, NULL,
49071d436   Kirill A. Shutemov   thp: add debugfs ...
2249
2250
2251
2252
2253
2254
2255
  			&split_huge_pages_fops);
  	if (!ret)
  		pr_warn("Failed to create split_huge_pages in debugfs");
  	return 0;
  }
  late_initcall(split_huge_pages_debugfs);
  #endif
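
Userspace usage sketch for the debugfs knob above, assuming debugfs is mounted at /sys/kernel/debug and CONFIG_DEBUG_FS is enabled; writing any value other than 1 returns -EINVAL, as enforced by split_huge_pages_set():

/* Standalone userspace example (not kernel code). */
#include <fcntl.h>
#include <unistd.h>

int request_global_thp_split(void)
{
	int fd = open("/sys/kernel/debug/split_huge_pages", O_WRONLY);

	if (fd < 0)
		return -1;
	if (write(fd, "1", 1) != 1) {
		close(fd);
		return -1;
	}
	return close(fd);
}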