Blame view: mm/huge_memory.c (84.9 KB)
20c8ccb19   Thomas Gleixner   treewide: Replace...
1
  // SPDX-License-Identifier: GPL-2.0-only
71e3aac07   Andrea Arcangeli   thp: transparent ...
2
3
  /*
   *  Copyright (C) 2009  Red Hat, Inc.
71e3aac07   Andrea Arcangeli   thp: transparent ...
4
   */
ae3a8c1c2   Andrew Morton   mm/huge_memory.c:...
5
  #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
71e3aac07   Andrea Arcangeli   thp: transparent ...
6
7
  #include <linux/mm.h>
  #include <linux/sched.h>
f7ccbae45   Ingo Molnar   sched/headers: Pr...
8
  #include <linux/sched/coredump.h>
6a3827d75   Ingo Molnar   sched/headers: Pr...
9
  #include <linux/sched/numa_balancing.h>
71e3aac07   Andrea Arcangeli   thp: transparent ...
10
11
12
13
14
  #include <linux/highmem.h>
  #include <linux/hugetlb.h>
  #include <linux/mmu_notifier.h>
  #include <linux/rmap.h>
  #include <linux/swap.h>
97ae17497   Kirill A. Shutemov   thp: implement re...
15
  #include <linux/shrinker.h>
ba76149f4   Andrea Arcangeli   thp: khugepaged
16
  #include <linux/mm_inline.h>
e9b61f198   Kirill A. Shutemov   thp: reintroduce ...
17
  #include <linux/swapops.h>
4897c7655   Matthew Wilcox   thp: prepare for ...
18
  #include <linux/dax.h>
ba76149f4   Andrea Arcangeli   thp: khugepaged
19
  #include <linux/khugepaged.h>
878aee7d6   Andrea Arcangeli   thp: freeze khuge...
20
  #include <linux/freezer.h>
f25748e3c   Dan Williams   mm, dax: convert ...
21
  #include <linux/pfn_t.h>
a664b2d85   Andrea Arcangeli   thp: madvise(MADV...
22
  #include <linux/mman.h>
3565fce3a   Dan Williams   mm, x86: get_user...
23
  #include <linux/memremap.h>
325adeb55   Ralf Baechle   mm: huge_memory: ...
24
  #include <linux/pagemap.h>
49071d436   Kirill A. Shutemov   thp: add debugfs ...
25
  #include <linux/debugfs.h>
4daae3b4b   Mel Gorman   mm: mempolicy: Us...
26
  #include <linux/migrate.h>
43b5fbbd2   Sasha Levin   mm/huge_memory.c:...
27
  #include <linux/hashtable.h>
6b251fc96   Andrea Arcangeli   userfaultfd: call...
28
  #include <linux/userfaultfd_k.h>
33c3fc71c   Vladimir Davydov   mm: introduce idl...
29
  #include <linux/page_idle.h>
baa355fd3   Kirill A. Shutemov   thp: file pages s...
30
  #include <linux/shmem_fs.h>
6b31d5955   Michal Hocko   mm, oom: fix pote...
31
  #include <linux/oom.h>
98fa15f34   Anshuman Khandual   mm: replace all o...
32
  #include <linux/numa.h>
f7da677bc   Vlastimil Babka   mm, page_owner: h...
33
  #include <linux/page_owner.h>
97ae17497   Kirill A. Shutemov   thp: implement re...
34

71e3aac07   Andrea Arcangeli   thp: transparent ...
35
36
37
  #include <asm/tlb.h>
  #include <asm/pgalloc.h>
  #include "internal.h"
ba76149f4   Andrea Arcangeli   thp: khugepaged
38
  /*
b14d595aa   Michael DeGuzis   mm/huge_memory.c:...
39
40
41
42
   * By default, transparent hugepage support is disabled in order to avoid
   * risking an increased memory footprint for applications that are not
   * guaranteed to benefit from it. When transparent hugepage support is
   * enabled, it is for all mappings, and khugepaged scans all mappings.
8bfa3f9a0   Jianguo Wu   mm/huge_memory.c:...
43
44
   * Defrag is invoked by khugepaged hugepage allocations and by page faults
   * for all hugepage allocations.
ba76149f4   Andrea Arcangeli   thp: khugepaged
45
   */
71e3aac07   Andrea Arcangeli   thp: transparent ...
46
  unsigned long transparent_hugepage_flags __read_mostly =
13ece886d   Andrea Arcangeli   thp: transparent ...
47
  #ifdef CONFIG_TRANSPARENT_HUGEPAGE_ALWAYS
ba76149f4   Andrea Arcangeli   thp: khugepaged
48
  	(1<<TRANSPARENT_HUGEPAGE_FLAG)|
13ece886d   Andrea Arcangeli   thp: transparent ...
49
50
51
52
  #endif
  #ifdef CONFIG_TRANSPARENT_HUGEPAGE_MADVISE
  	(1<<TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG)|
  #endif
444eb2a44   Mel Gorman   mm: thp: set THP ...
53
  	(1<<TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG)|
79da5407e   Kirill A. Shutemov   thp: introduce sy...
54
55
  	(1<<TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG)|
  	(1<<TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);
ba76149f4   Andrea Arcangeli   thp: khugepaged
56

9a982250f   Kirill A. Shutemov   thp: introduce de...
57
  static struct shrinker deferred_split_shrinker;
f000565ad   Andrea Arcangeli   thp: set recommen...
58

97ae17497   Kirill A. Shutemov   thp: implement re...
59
  static atomic_t huge_zero_refcount;
56873f43a   Wang, Yalin   mm:add KPF_ZERO_P...
60
  struct page *huge_zero_page __read_mostly;
4a6c12972   Kirill A. Shutemov   thp: huge zero pa...
61

7635d9cbe   Michal Hocko   mm, thp, proc: re...
62
63
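/*
 * Used by /proc/<pid>/smaps to report whether THP is eligible for a vma;
 * the size/alignment check below avoids claiming eligibility for vmas that
 * cannot hold even one huge page.
 */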
  bool transparent_hugepage_enabled(struct vm_area_struct *vma)
  {
c06306696   Yang Shi   mm: thp: fix fals...
64
65
66
67
68
  	/* The addr is used to check if the vma size fits */
  	unsigned long addr = (vma->vm_end & HPAGE_PMD_MASK) - HPAGE_PMD_SIZE;
  
  	if (!transhuge_vma_suitable(vma, addr))
  		return false;
7635d9cbe   Michal Hocko   mm, thp, proc: re...
69
70
  	if (vma_is_anonymous(vma))
  		return __transparent_hugepage_enabled(vma);
c06306696   Yang Shi   mm: thp: fix fals...
71
72
  	if (vma_is_shmem(vma))
  		return shmem_huge_enabled(vma);
7635d9cbe   Michal Hocko   mm, thp, proc: re...
73
74
75
  
  	return false;
  }
6fcb52a56   Aaron Lu   thp: reduce usage...
76
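/*
 * Huge zero page lifetime: huge_zero_refcount == 0 means no page is
 * allocated.  The first successful allocation sets the count to 2: one
 * reference for the caller and one that is only dropped by the shrinker,
 * which frees the page once no other users remain.
 */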
  static struct page *get_huge_zero_page(void)
97ae17497   Kirill A. Shutemov   thp: implement re...
77
78
79
80
  {
  	struct page *zero_page;
  retry:
  	if (likely(atomic_inc_not_zero(&huge_zero_refcount)))
4db0c3c29   Jason Low   mm: remove rest o...
81
  		return READ_ONCE(huge_zero_page);
97ae17497   Kirill A. Shutemov   thp: implement re...
82
83
  
  	zero_page = alloc_pages((GFP_TRANSHUGE | __GFP_ZERO) & ~__GFP_MOVABLE,
4a6c12972   Kirill A. Shutemov   thp: huge zero pa...
84
  			HPAGE_PMD_ORDER);
d8a8e1f0d   Kirill A. Shutemov   thp, vmstat: impl...
85
86
  	if (!zero_page) {
  		count_vm_event(THP_ZERO_PAGE_ALLOC_FAILED);
5918d10a4   Kirill A. Shutemov   thp: fix huge zer...
87
  		return NULL;
d8a8e1f0d   Kirill A. Shutemov   thp, vmstat: impl...
88
89
  	}
  	count_vm_event(THP_ZERO_PAGE_ALLOC);
97ae17497   Kirill A. Shutemov   thp: implement re...
90
  	preempt_disable();
5918d10a4   Kirill A. Shutemov   thp: fix huge zer...
91
  	if (cmpxchg(&huge_zero_page, NULL, zero_page)) {
97ae17497   Kirill A. Shutemov   thp: implement re...
92
  		preempt_enable();
5ddacbe92   Yu Zhao   mm: free compound...
93
  		__free_pages(zero_page, compound_order(zero_page));
97ae17497   Kirill A. Shutemov   thp: implement re...
94
95
96
97
98
99
  		goto retry;
  	}
  
  	/* We take additional reference here. It will be put back by shrinker */
  	atomic_set(&huge_zero_refcount, 2);
  	preempt_enable();
4db0c3c29   Jason Low   mm: remove rest o...
100
  	return READ_ONCE(huge_zero_page);
4a6c12972   Kirill A. Shutemov   thp: huge zero pa...
101
  }
6fcb52a56   Aaron Lu   thp: reduce usage...
102
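/*
 * Drop a reference taken by get_huge_zero_page().  The count must never
 * reach zero here; the final reference is always released by the shrinker.
 */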
  static void put_huge_zero_page(void)
4a6c12972   Kirill A. Shutemov   thp: huge zero pa...
103
  {
97ae17497   Kirill A. Shutemov   thp: implement re...
104
105
106
107
108
  	/*
  	 * Counter should never go to zero here. Only shrinker can put
  	 * last reference.
  	 */
  	BUG_ON(atomic_dec_and_test(&huge_zero_refcount));
4a6c12972   Kirill A. Shutemov   thp: huge zero pa...
109
  }
6fcb52a56   Aaron Lu   thp: reduce usage...
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
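/*
 * Per-mm wrapper: MMF_HUGE_ZERO_PAGE records that this mm already holds a
 * reference, so each mm contributes at most one refcount.  The reference is
 * dropped by mm_put_huge_zero_page() when the mm is torn down.
 */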
  struct page *mm_get_huge_zero_page(struct mm_struct *mm)
  {
  	if (test_bit(MMF_HUGE_ZERO_PAGE, &mm->flags))
  		return READ_ONCE(huge_zero_page);
  
  	if (!get_huge_zero_page())
  		return NULL;
  
  	if (test_and_set_bit(MMF_HUGE_ZERO_PAGE, &mm->flags))
  		put_huge_zero_page();
  
  	return READ_ONCE(huge_zero_page);
  }
  
  void mm_put_huge_zero_page(struct mm_struct *mm)
  {
  	if (test_bit(MMF_HUGE_ZERO_PAGE, &mm->flags))
  		put_huge_zero_page();
  }
488964666   Glauber Costa   hugepage: convert...
129
130
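/*
 * Shrinker callbacks: count_objects reports HPAGE_PMD_NR only when the
 * shrinker holds the last reference; scan_objects then races the refcount
 * from 1 to 0 and frees the huge zero page.
 */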
  static unsigned long shrink_huge_zero_page_count(struct shrinker *shrink,
  					struct shrink_control *sc)
4a6c12972   Kirill A. Shutemov   thp: huge zero pa...
131
  {
488964666   Glauber Costa   hugepage: convert...
132
133
134
  	/* we can free zero page only if last reference remains */
  	return atomic_read(&huge_zero_refcount) == 1 ? HPAGE_PMD_NR : 0;
  }
97ae17497   Kirill A. Shutemov   thp: implement re...
135

488964666   Glauber Costa   hugepage: convert...
136
137
138
  static unsigned long shrink_huge_zero_page_scan(struct shrinker *shrink,
  				       struct shrink_control *sc)
  {
97ae17497   Kirill A. Shutemov   thp: implement re...
139
  	if (atomic_cmpxchg(&huge_zero_refcount, 1, 0) == 1) {
5918d10a4   Kirill A. Shutemov   thp: fix huge zer...
140
141
  		struct page *zero_page = xchg(&huge_zero_page, NULL);
  		BUG_ON(zero_page == NULL);
5ddacbe92   Yu Zhao   mm: free compound...
142
  		__free_pages(zero_page, compound_order(zero_page));
488964666   Glauber Costa   hugepage: convert...
143
  		return HPAGE_PMD_NR;
97ae17497   Kirill A. Shutemov   thp: implement re...
144
145
146
  	}
  
  	return 0;
4a6c12972   Kirill A. Shutemov   thp: huge zero pa...
147
  }
97ae17497   Kirill A. Shutemov   thp: implement re...
148
  static struct shrinker huge_zero_page_shrinker = {
488964666   Glauber Costa   hugepage: convert...
149
150
  	.count_objects = shrink_huge_zero_page_count,
  	.scan_objects = shrink_huge_zero_page_scan,
97ae17497   Kirill A. Shutemov   thp: implement re...
151
152
  	.seeks = DEFAULT_SEEKS,
  };
71e3aac07   Andrea Arcangeli   thp: transparent ...
153
  #ifdef CONFIG_SYSFS
71e3aac07   Andrea Arcangeli   thp: transparent ...
154
155
156
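/*
 * sysfs interface under /sys/kernel/mm/transparent_hugepage/.  "enabled"
 * selects always/madvise/never; a write also starts or stops khugepaged,
 * e.g. "echo madvise > /sys/kernel/mm/transparent_hugepage/enabled".
 */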
  static ssize_t enabled_show(struct kobject *kobj,
  			    struct kobj_attribute *attr, char *buf)
  {
444eb2a44   Mel Gorman   mm: thp: set THP ...
157
158
159
160
161
162
163
164
165
  	if (test_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags))
  		return sprintf(buf, "[always] madvise never
  ");
  	else if (test_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags))
  		return sprintf(buf, "always [madvise] never
  ");
  	else
  		return sprintf(buf, "always madvise [never]
  ");
71e3aac07   Andrea Arcangeli   thp: transparent ...
166
  }
444eb2a44   Mel Gorman   mm: thp: set THP ...
167

71e3aac07   Andrea Arcangeli   thp: transparent ...
168
169
170
171
  static ssize_t enabled_store(struct kobject *kobj,
  			     struct kobj_attribute *attr,
  			     const char *buf, size_t count)
  {
21440d7eb   David Rientjes   mm, thp: add new ...
172
  	ssize_t ret = count;
ba76149f4   Andrea Arcangeli   thp: khugepaged
173

0bae7e7fa   David Rientjes   mm, thp: fix defr...
174
  	if (sysfs_streq(buf, "always")) {
21440d7eb   David Rientjes   mm, thp: add new ...
175
176
  		clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags);
  		set_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags);
0bae7e7fa   David Rientjes   mm, thp: fix defr...
177
  	} else if (sysfs_streq(buf, "madvise")) {
21440d7eb   David Rientjes   mm, thp: add new ...
178
179
  		clear_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags);
  		set_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags);
0bae7e7fa   David Rientjes   mm, thp: fix defr...
180
  	} else if (sysfs_streq(buf, "never")) {
21440d7eb   David Rientjes   mm, thp: add new ...
181
182
183
184
  		clear_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags);
  		clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags);
  	} else
  		ret = -EINVAL;
ba76149f4   Andrea Arcangeli   thp: khugepaged
185
186
  
  	if (ret > 0) {
b46e756f5   Kirill A. Shutemov   thp: extract khug...
187
  		int err = start_stop_khugepaged();
ba76149f4   Andrea Arcangeli   thp: khugepaged
188
189
190
  		if (err)
  			ret = err;
  	}
ba76149f4   Andrea Arcangeli   thp: khugepaged
191
  	return ret;
71e3aac07   Andrea Arcangeli   thp: transparent ...
192
193
194
  }
  static struct kobj_attribute enabled_attr =
  	__ATTR(enabled, 0644, enabled_show, enabled_store);
b46e756f5   Kirill A. Shutemov   thp: extract khug...
195
  ssize_t single_hugepage_flag_show(struct kobject *kobj,
71e3aac07   Andrea Arcangeli   thp: transparent ...
196
197
198
  				struct kobj_attribute *attr, char *buf,
  				enum transparent_hugepage_flag flag)
  {
e27e6151b   Ben Hutchings   mm/thp: use conve...
199
200
201
  	return sprintf(buf, "%d
  ",
  		       !!test_bit(flag, &transparent_hugepage_flags));
71e3aac07   Andrea Arcangeli   thp: transparent ...
202
  }
e27e6151b   Ben Hutchings   mm/thp: use conve...
203

b46e756f5   Kirill A. Shutemov   thp: extract khug...
204
  ssize_t single_hugepage_flag_store(struct kobject *kobj,
71e3aac07   Andrea Arcangeli   thp: transparent ...
205
206
207
208
  				 struct kobj_attribute *attr,
  				 const char *buf, size_t count,
  				 enum transparent_hugepage_flag flag)
  {
e27e6151b   Ben Hutchings   mm/thp: use conve...
209
210
211
212
213
214
215
216
217
218
  	unsigned long value;
  	int ret;
  
  	ret = kstrtoul(buf, 10, &value);
  	if (ret < 0)
  		return ret;
  	if (value > 1)
  		return -EINVAL;
  
  	if (value)
71e3aac07   Andrea Arcangeli   thp: transparent ...
219
  		set_bit(flag, &transparent_hugepage_flags);
e27e6151b   Ben Hutchings   mm/thp: use conve...
220
  	else
71e3aac07   Andrea Arcangeli   thp: transparent ...
221
  		clear_bit(flag, &transparent_hugepage_flags);
71e3aac07   Andrea Arcangeli   thp: transparent ...
222
223
224
  
  	return count;
  }
71e3aac07   Andrea Arcangeli   thp: transparent ...
225
226
227
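/*
 * "defrag" selects the allocation policy used when a huge page is not
 * immediately available; the flags are mutually exclusive and are mapped
 * to gfp masks in alloc_hugepage_direct_gfpmask().
 */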
  static ssize_t defrag_show(struct kobject *kobj,
  			   struct kobj_attribute *attr, char *buf)
  {
444eb2a44   Mel Gorman   mm: thp: set THP ...
228
  	if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags))
21440d7eb   David Rientjes   mm, thp: add new ...
229
230
  		return sprintf(buf, "[always] defer defer+madvise madvise never
  ");
444eb2a44   Mel Gorman   mm: thp: set THP ...
231
  	if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags))
21440d7eb   David Rientjes   mm, thp: add new ...
232
233
234
235
236
237
238
239
240
241
  		return sprintf(buf, "always [defer] defer+madvise madvise never
  ");
  	if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags))
  		return sprintf(buf, "always defer [defer+madvise] madvise never
  ");
  	if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags))
  		return sprintf(buf, "always defer defer+madvise [madvise] never
  ");
  	return sprintf(buf, "always defer defer+madvise madvise [never]
  ");
71e3aac07   Andrea Arcangeli   thp: transparent ...
242
  }
21440d7eb   David Rientjes   mm, thp: add new ...
243

71e3aac07   Andrea Arcangeli   thp: transparent ...
244
245
246
247
  static ssize_t defrag_store(struct kobject *kobj,
  			    struct kobj_attribute *attr,
  			    const char *buf, size_t count)
  {
0bae7e7fa   David Rientjes   mm, thp: fix defr...
248
  	if (sysfs_streq(buf, "always")) {
21440d7eb   David Rientjes   mm, thp: add new ...
249
250
251
252
  		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
  		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
  		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
  		set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
0bae7e7fa   David Rientjes   mm, thp: fix defr...
253
  	} else if (sysfs_streq(buf, "defer+madvise")) {
21440d7eb   David Rientjes   mm, thp: add new ...
254
255
256
257
  		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
  		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
  		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
  		set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
0bae7e7fa   David Rientjes   mm, thp: fix defr...
258
  	} else if (sysfs_streq(buf, "defer")) {
4fad7fb6b   David Rientjes   mm, thp: fix sett...
259
260
261
262
  		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
  		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
  		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
  		set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
0bae7e7fa   David Rientjes   mm, thp: fix defr...
263
  	} else if (sysfs_streq(buf, "madvise")) {
21440d7eb   David Rientjes   mm, thp: add new ...
264
265
266
267
  		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
  		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
  		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
  		set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
0bae7e7fa   David Rientjes   mm, thp: fix defr...
268
  	} else if (sysfs_streq(buf, "never")) {
21440d7eb   David Rientjes   mm, thp: add new ...
269
270
271
272
273
274
275
276
  		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
  		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
  		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
  		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
  	} else
  		return -EINVAL;
  
  	return count;
71e3aac07   Andrea Arcangeli   thp: transparent ...
277
278
279
  }
  static struct kobj_attribute defrag_attr =
  	__ATTR(defrag, 0644, defrag_show, defrag_store);
79da5407e   Kirill A. Shutemov   thp: introduce sy...
280
281
282
  static ssize_t use_zero_page_show(struct kobject *kobj,
  		struct kobj_attribute *attr, char *buf)
  {
b46e756f5   Kirill A. Shutemov   thp: extract khug...
283
  	return single_hugepage_flag_show(kobj, attr, buf,
79da5407e   Kirill A. Shutemov   thp: introduce sy...
284
285
286
287
288
  				TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);
  }
  static ssize_t use_zero_page_store(struct kobject *kobj,
  		struct kobj_attribute *attr, const char *buf, size_t count)
  {
b46e756f5   Kirill A. Shutemov   thp: extract khug...
289
  	return single_hugepage_flag_store(kobj, attr, buf, count,
79da5407e   Kirill A. Shutemov   thp: introduce sy...
290
291
292
293
  				 TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);
  }
  static struct kobj_attribute use_zero_page_attr =
  	__ATTR(use_zero_page, 0644, use_zero_page_show, use_zero_page_store);
49920d287   Hugh Dickins   mm: make transpar...
294
295
296
297
298
299
300
301
302
  
  static ssize_t hpage_pmd_size_show(struct kobject *kobj,
  		struct kobj_attribute *attr, char *buf)
  {
  	return sprintf(buf, "%lu
  ", HPAGE_PMD_SIZE);
  }
  static struct kobj_attribute hpage_pmd_size_attr =
  	__ATTR_RO(hpage_pmd_size);
71e3aac07   Andrea Arcangeli   thp: transparent ...
303
304
305
306
  #ifdef CONFIG_DEBUG_VM
  static ssize_t debug_cow_show(struct kobject *kobj,
  				struct kobj_attribute *attr, char *buf)
  {
b46e756f5   Kirill A. Shutemov   thp: extract khug...
307
  	return single_hugepage_flag_show(kobj, attr, buf,
71e3aac07   Andrea Arcangeli   thp: transparent ...
308
309
310
311
312
313
  				TRANSPARENT_HUGEPAGE_DEBUG_COW_FLAG);
  }
  static ssize_t debug_cow_store(struct kobject *kobj,
  			       struct kobj_attribute *attr,
  			       const char *buf, size_t count)
  {
b46e756f5   Kirill A. Shutemov   thp: extract khug...
314
  	return single_hugepage_flag_store(kobj, attr, buf, count,
71e3aac07   Andrea Arcangeli   thp: transparent ...
315
316
317
318
319
320
321
322
323
  				 TRANSPARENT_HUGEPAGE_DEBUG_COW_FLAG);
  }
  static struct kobj_attribute debug_cow_attr =
  	__ATTR(debug_cow, 0644, debug_cow_show, debug_cow_store);
  #endif /* CONFIG_DEBUG_VM */
  
  static struct attribute *hugepage_attr[] = {
  	&enabled_attr.attr,
  	&defrag_attr.attr,
79da5407e   Kirill A. Shutemov   thp: introduce sy...
324
  	&use_zero_page_attr.attr,
49920d287   Hugh Dickins   mm: make transpar...
325
  	&hpage_pmd_size_attr.attr,
e496cf3d7   Kirill A. Shutemov   thp: introduce CO...
326
  #if defined(CONFIG_SHMEM) && defined(CONFIG_TRANSPARENT_HUGE_PAGECACHE)
5a6e75f81   Kirill A. Shutemov   shmem: prepare hu...
327
328
  	&shmem_enabled_attr.attr,
  #endif
71e3aac07   Andrea Arcangeli   thp: transparent ...
329
330
331
332
333
  #ifdef CONFIG_DEBUG_VM
  	&debug_cow_attr.attr,
  #endif
  	NULL,
  };
8aa95a21b   Arvind Yadav   mm/huge_memory.c:...
334
  static const struct attribute_group hugepage_attr_group = {
71e3aac07   Andrea Arcangeli   thp: transparent ...
335
  	.attrs = hugepage_attr,
ba76149f4   Andrea Arcangeli   thp: khugepaged
336
  };
569e55900   Shaohua Li   thp: improve the ...
337
  static int __init hugepage_init_sysfs(struct kobject **hugepage_kobj)
71e3aac07   Andrea Arcangeli   thp: transparent ...
338
  {
71e3aac07   Andrea Arcangeli   thp: transparent ...
339
  	int err;
569e55900   Shaohua Li   thp: improve the ...
340
341
  	*hugepage_kobj = kobject_create_and_add("transparent_hugepage", mm_kobj);
  	if (unlikely(!*hugepage_kobj)) {
ae3a8c1c2   Andrew Morton   mm/huge_memory.c:...
342
343
  		pr_err("failed to create transparent hugepage kobject
  ");
569e55900   Shaohua Li   thp: improve the ...
344
  		return -ENOMEM;
ba76149f4   Andrea Arcangeli   thp: khugepaged
345
  	}
569e55900   Shaohua Li   thp: improve the ...
346
  	err = sysfs_create_group(*hugepage_kobj, &hugepage_attr_group);
ba76149f4   Andrea Arcangeli   thp: khugepaged
347
  	if (err) {
ae3a8c1c2   Andrew Morton   mm/huge_memory.c:...
348
349
  		pr_err("failed to register transparent hugepage group
  ");
569e55900   Shaohua Li   thp: improve the ...
350
  		goto delete_obj;
ba76149f4   Andrea Arcangeli   thp: khugepaged
351
  	}
569e55900   Shaohua Li   thp: improve the ...
352
  	err = sysfs_create_group(*hugepage_kobj, &khugepaged_attr_group);
ba76149f4   Andrea Arcangeli   thp: khugepaged
353
  	if (err) {
ae3a8c1c2   Andrew Morton   mm/huge_memory.c:...
354
355
  		pr_err("failed to register transparent hugepage group
  ");
569e55900   Shaohua Li   thp: improve the ...
356
  		goto remove_hp_group;
ba76149f4   Andrea Arcangeli   thp: khugepaged
357
  	}
569e55900   Shaohua Li   thp: improve the ...
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
  
  	return 0;
  
  remove_hp_group:
  	sysfs_remove_group(*hugepage_kobj, &hugepage_attr_group);
  delete_obj:
  	kobject_put(*hugepage_kobj);
  	return err;
  }
  
  static void __init hugepage_exit_sysfs(struct kobject *hugepage_kobj)
  {
  	sysfs_remove_group(hugepage_kobj, &khugepaged_attr_group);
  	sysfs_remove_group(hugepage_kobj, &hugepage_attr_group);
  	kobject_put(hugepage_kobj);
  }
  #else
  static inline int hugepage_init_sysfs(struct kobject **hugepage_kobj)
  {
  	return 0;
  }
  
  static inline void hugepage_exit_sysfs(struct kobject *hugepage_kobj)
  {
  }
  #endif /* CONFIG_SYSFS */
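
/*
 * Module init: bail out early if the hardware cannot do huge pages, wire up
 * sysfs, khugepaged and the two shrinkers, and leave THP disabled by default
 * on systems with less than 512MB of RAM.
 */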
  
  static int __init hugepage_init(void)
  {
  	int err;
  	struct kobject *hugepage_kobj;
  
  	if (!has_transparent_hugepage()) {
  		transparent_hugepage_flags = 0;
  		return -EINVAL;
  	}
ff20c2e0a   Kirill A. Shutemov   mm: Some arch may...
394
395
396
397
398
399
400
401
402
  	/*
  	 * hugepages can't be allocated by the buddy allocator
  	 */
  	MAYBE_BUILD_BUG_ON(HPAGE_PMD_ORDER >= MAX_ORDER);
  	/*
  	 * we use page->mapping and page->index in second tail page
  	 * as list_head: assuming THP order >= 2
  	 */
  	MAYBE_BUILD_BUG_ON(HPAGE_PMD_ORDER < 2);
569e55900   Shaohua Li   thp: improve the ...
403
404
  	err = hugepage_init_sysfs(&hugepage_kobj);
  	if (err)
65ebb64f4   Kirill A. Shutemov   thp: handle error...
405
  		goto err_sysfs;
ba76149f4   Andrea Arcangeli   thp: khugepaged
406

b46e756f5   Kirill A. Shutemov   thp: extract khug...
407
  	err = khugepaged_init();
ba76149f4   Andrea Arcangeli   thp: khugepaged
408
  	if (err)
65ebb64f4   Kirill A. Shutemov   thp: handle error...
409
  		goto err_slab;
ba76149f4   Andrea Arcangeli   thp: khugepaged
410

65ebb64f4   Kirill A. Shutemov   thp: handle error...
411
412
413
  	err = register_shrinker(&huge_zero_page_shrinker);
  	if (err)
  		goto err_hzp_shrinker;
9a982250f   Kirill A. Shutemov   thp: introduce de...
414
415
416
  	err = register_shrinker(&deferred_split_shrinker);
  	if (err)
  		goto err_split_shrinker;
97ae17497   Kirill A. Shutemov   thp: implement re...
417

97562cd24   Rik van Riel   thp: disable tran...
418
419
420
421
422
  	/*
  	 * By default disable transparent hugepages on smaller systems,
  	 * where the extra memory used could hurt more than TLB overhead
  	 * is likely to save.  The admin can still enable it through /sys.
  	 */
ca79b0c21   Arun KS   mm: convert total...
423
  	if (totalram_pages() < (512 << (20 - PAGE_SHIFT))) {
97562cd24   Rik van Riel   thp: disable tran...
424
  		transparent_hugepage_flags = 0;
79553da29   Kirill A. Shutemov   thp: cleanup khug...
425
426
  		return 0;
  	}
97562cd24   Rik van Riel   thp: disable tran...
427

79553da29   Kirill A. Shutemov   thp: cleanup khug...
428
  	err = start_stop_khugepaged();
65ebb64f4   Kirill A. Shutemov   thp: handle error...
429
430
  	if (err)
  		goto err_khugepaged;
ba76149f4   Andrea Arcangeli   thp: khugepaged
431

569e55900   Shaohua Li   thp: improve the ...
432
  	return 0;
65ebb64f4   Kirill A. Shutemov   thp: handle error...
433
  err_khugepaged:
9a982250f   Kirill A. Shutemov   thp: introduce de...
434
435
  	unregister_shrinker(&deferred_split_shrinker);
  err_split_shrinker:
65ebb64f4   Kirill A. Shutemov   thp: handle error...
436
437
  	unregister_shrinker(&huge_zero_page_shrinker);
  err_hzp_shrinker:
b46e756f5   Kirill A. Shutemov   thp: extract khug...
438
  	khugepaged_destroy();
65ebb64f4   Kirill A. Shutemov   thp: handle error...
439
  err_slab:
569e55900   Shaohua Li   thp: improve the ...
440
  	hugepage_exit_sysfs(hugepage_kobj);
65ebb64f4   Kirill A. Shutemov   thp: handle error...
441
  err_sysfs:
ba76149f4   Andrea Arcangeli   thp: khugepaged
442
  	return err;
71e3aac07   Andrea Arcangeli   thp: transparent ...
443
  }
a64fb3cd6   Paul Gortmaker   mm: audit/fix non...
444
  subsys_initcall(hugepage_init);
71e3aac07   Andrea Arcangeli   thp: transparent ...
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
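/*
 * Boot-time override, e.g. "transparent_hugepage=never" on the kernel
 * command line; accepts the same always/madvise/never values as sysfs.
 */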
  
  static int __init setup_transparent_hugepage(char *str)
  {
  	int ret = 0;
  	if (!str)
  		goto out;
  	if (!strcmp(str, "always")) {
  		set_bit(TRANSPARENT_HUGEPAGE_FLAG,
  			&transparent_hugepage_flags);
  		clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
  			  &transparent_hugepage_flags);
  		ret = 1;
  	} else if (!strcmp(str, "madvise")) {
  		clear_bit(TRANSPARENT_HUGEPAGE_FLAG,
  			  &transparent_hugepage_flags);
  		set_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
  			&transparent_hugepage_flags);
  		ret = 1;
  	} else if (!strcmp(str, "never")) {
  		clear_bit(TRANSPARENT_HUGEPAGE_FLAG,
  			  &transparent_hugepage_flags);
  		clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
  			  &transparent_hugepage_flags);
  		ret = 1;
  	}
  out:
  	if (!ret)
ae3a8c1c2   Andrew Morton   mm/huge_memory.c:...
472
473
  		pr_warn("transparent_hugepage= cannot parse, ignored
  ");
71e3aac07   Andrea Arcangeli   thp: transparent ...
474
475
476
  	return ret;
  }
  __setup("transparent_hugepage=", setup_transparent_hugepage);
f55e1014f   Linus Torvalds   Revert "mm, thp: ...
477
  pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma)
71e3aac07   Andrea Arcangeli   thp: transparent ...
478
  {
f55e1014f   Linus Torvalds   Revert "mm, thp: ...
479
  	if (likely(vma->vm_flags & VM_WRITE))
71e3aac07   Andrea Arcangeli   thp: transparent ...
480
481
482
  		pmd = pmd_mkwrite(pmd);
  	return pmd;
  }
87eaceb3f   Yang Shi   mm: thp: make def...
483
484
  #ifdef CONFIG_MEMCG
  static inline struct deferred_split *get_deferred_split_queue(struct page *page)
9a982250f   Kirill A. Shutemov   thp: introduce de...
485
  {
87eaceb3f   Yang Shi   mm: thp: make def...
486
487
488
489
490
491
492
  	struct mem_cgroup *memcg = compound_head(page)->mem_cgroup;
  	struct pglist_data *pgdat = NODE_DATA(page_to_nid(page));
  
  	if (memcg)
  		return &memcg->deferred_split_queue;
  	else
  		return &pgdat->deferred_split_queue;
9a982250f   Kirill A. Shutemov   thp: introduce de...
493
  }
87eaceb3f   Yang Shi   mm: thp: make def...
494
495
496
497
498
499
500
501
  #else
  static inline struct deferred_split *get_deferred_split_queue(struct page *page)
  {
  	struct pglist_data *pgdat = NODE_DATA(page_to_nid(page));
  
  	return &pgdat->deferred_split_queue;
  }
  #endif
9a982250f   Kirill A. Shutemov   thp: introduce de...
502
503
504
505
506
507
508
  
  void prep_transhuge_page(struct page *page)
  {
  	/*
	 * we use page->mapping and page->index in second tail page
  	 * as list_head: assuming THP order >= 2
  	 */
9a982250f   Kirill A. Shutemov   thp: introduce de...
509
510
511
512
  
  	INIT_LIST_HEAD(page_deferred_list(page));
  	set_compound_page_dtor(page, TRANSHUGE_PAGE_DTOR);
  }
194e7fcd1   Kirill A. Shutemov   mm/huge_memory.c:...
513
514
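/*
 * Ask for a mapping padded by one extra "size" (PMD_SIZE for DAX) so that a
 * subrange exists whose virtual address and file offset share the same
 * alignment, then shift the returned address to that subrange.
 */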
  static unsigned long __thp_get_unmapped_area(struct file *filp,
  		unsigned long addr, unsigned long len,
74d2fad13   Toshi Kani   thp, dax: add thp...
515
516
  		loff_t off, unsigned long flags, unsigned long size)
  {
74d2fad13   Toshi Kani   thp, dax: add thp...
517
518
  	loff_t off_end = off + len;
  	loff_t off_align = round_up(off, size);
194e7fcd1   Kirill A. Shutemov   mm/huge_memory.c:...
519
  	unsigned long len_pad, ret;
74d2fad13   Toshi Kani   thp, dax: add thp...
520
521
522
523
524
525
526
  
  	if (off_end <= off_align || (off_end - off_align) < size)
  		return 0;
  
  	len_pad = len + size;
  	if (len_pad < len || (off + len_pad) < off)
  		return 0;
194e7fcd1   Kirill A. Shutemov   mm/huge_memory.c:...
527
  	ret = current->mm->get_unmapped_area(filp, addr, len_pad,
74d2fad13   Toshi Kani   thp, dax: add thp...
528
  					      off >> PAGE_SHIFT, flags);
194e7fcd1   Kirill A. Shutemov   mm/huge_memory.c:...
529
530
531
532
533
534
  
  	/*
  	 * The failure might be due to length padding. The caller will retry
  	 * without the padding.
  	 */
  	if (IS_ERR_VALUE(ret))
74d2fad13   Toshi Kani   thp, dax: add thp...
535
  		return 0;
194e7fcd1   Kirill A. Shutemov   mm/huge_memory.c:...
536
537
538
539
540
541
542
543
544
  	/*
  	 * Do not try to align to THP boundary if allocation at the address
  	 * hint succeeds.
  	 */
  	if (ret == addr)
  		return addr;
  
  	ret += (off - ret) & (size - 1);
  	return ret;
74d2fad13   Toshi Kani   thp, dax: add thp...
545
546
547
548
549
  }
  
  unsigned long thp_get_unmapped_area(struct file *filp, unsigned long addr,
  		unsigned long len, unsigned long pgoff, unsigned long flags)
  {
194e7fcd1   Kirill A. Shutemov   mm/huge_memory.c:...
550
  	unsigned long ret;
74d2fad13   Toshi Kani   thp, dax: add thp...
551
  	loff_t off = (loff_t)pgoff << PAGE_SHIFT;
74d2fad13   Toshi Kani   thp, dax: add thp...
552
553
  	if (!IS_DAX(filp->f_mapping->host) || !IS_ENABLED(CONFIG_FS_DAX_PMD))
  		goto out;
194e7fcd1   Kirill A. Shutemov   mm/huge_memory.c:...
554
555
556
557
  	ret = __thp_get_unmapped_area(filp, addr, len, off, flags, PMD_SIZE);
  	if (ret)
  		return ret;
  out:
74d2fad13   Toshi Kani   thp, dax: add thp...
558
559
560
  	return current->mm->get_unmapped_area(filp, addr, len, pgoff, flags);
  }
  EXPORT_SYMBOL_GPL(thp_get_unmapped_area);
2b7403035   Souptick Joarder   mm: Change return...
561
562
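/*
 * Fault in a freshly allocated huge page: charge it to the memcg, clear it,
 * and install it under the pmd lock.  A failed charge returns
 * VM_FAULT_FALLBACK so the fault is retried with base pages, and userfaultfd
 * "missing" events are delivered before anything is mapped.
 */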
  static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf,
  			struct page *page, gfp_t gfp)
71e3aac07   Andrea Arcangeli   thp: transparent ...
563
  {
82b0f8c39   Jan Kara   mm: join struct f...
564
  	struct vm_area_struct *vma = vmf->vma;
00501b531   Johannes Weiner   mm: memcontrol: r...
565
  	struct mem_cgroup *memcg;
71e3aac07   Andrea Arcangeli   thp: transparent ...
566
  	pgtable_t pgtable;
82b0f8c39   Jan Kara   mm: join struct f...
567
  	unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
2b7403035   Souptick Joarder   mm: Change return...
568
  	vm_fault_t ret = 0;
71e3aac07   Andrea Arcangeli   thp: transparent ...
569

309381fea   Sasha Levin   mm: dump page whe...
570
  	VM_BUG_ON_PAGE(!PageCompound(page), page);
00501b531   Johannes Weiner   mm: memcontrol: r...
571

2cf855837   Tejun Heo   memcontrol: sched...
572
  	if (mem_cgroup_try_charge_delay(page, vma->vm_mm, gfp, &memcg, true)) {
6b251fc96   Andrea Arcangeli   userfaultfd: call...
573
574
575
576
  		put_page(page);
  		count_vm_event(THP_FAULT_FALLBACK);
  		return VM_FAULT_FALLBACK;
  	}
00501b531   Johannes Weiner   mm: memcontrol: r...
577

4cf589249   Joel Fernandes (Google)   mm: treewide: rem...
578
  	pgtable = pte_alloc_one(vma->vm_mm);
00501b531   Johannes Weiner   mm: memcontrol: r...
579
  	if (unlikely(!pgtable)) {
6b31d5955   Michal Hocko   mm, oom: fix pote...
580
581
  		ret = VM_FAULT_OOM;
  		goto release;
00501b531   Johannes Weiner   mm: memcontrol: r...
582
  	}
71e3aac07   Andrea Arcangeli   thp: transparent ...
583

c79b57e46   Huang Ying   mm: hugetlb: clea...
584
  	clear_huge_page(page, vmf->address, HPAGE_PMD_NR);
52f37629f   Minchan Kim   THP: fix comment ...
585
586
587
588
589
  	/*
  	 * The memory barrier inside __SetPageUptodate makes sure that
  	 * clear_huge_page writes become visible before the set_pmd_at()
  	 * write.
  	 */
71e3aac07   Andrea Arcangeli   thp: transparent ...
590
  	__SetPageUptodate(page);
82b0f8c39   Jan Kara   mm: join struct f...
591
592
  	vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
  	if (unlikely(!pmd_none(*vmf->pmd))) {
6b31d5955   Michal Hocko   mm, oom: fix pote...
593
  		goto unlock_release;
71e3aac07   Andrea Arcangeli   thp: transparent ...
594
595
  	} else {
  		pmd_t entry;
6b251fc96   Andrea Arcangeli   userfaultfd: call...
596

6b31d5955   Michal Hocko   mm, oom: fix pote...
597
598
599
  		ret = check_stable_address_space(vma->vm_mm);
  		if (ret)
  			goto unlock_release;
6b251fc96   Andrea Arcangeli   userfaultfd: call...
600
601
  		/* Deliver the page fault to userland */
  		if (userfaultfd_missing(vma)) {
2b7403035   Souptick Joarder   mm: Change return...
602
  			vm_fault_t ret2;
6b251fc96   Andrea Arcangeli   userfaultfd: call...
603

82b0f8c39   Jan Kara   mm: join struct f...
604
  			spin_unlock(vmf->ptl);
f627c2f53   Kirill A. Shutemov   memcg: adjust to ...
605
  			mem_cgroup_cancel_charge(page, memcg, true);
6b251fc96   Andrea Arcangeli   userfaultfd: call...
606
  			put_page(page);
bae473a42   Kirill A. Shutemov   mm: introduce fau...
607
  			pte_free(vma->vm_mm, pgtable);
2b7403035   Souptick Joarder   mm: Change return...
608
609
610
  			ret2 = handle_userfault(vmf, VM_UFFD_MISSING);
  			VM_BUG_ON(ret2 & VM_FAULT_FALLBACK);
  			return ret2;
6b251fc96   Andrea Arcangeli   userfaultfd: call...
611
  		}
3122359a6   Kirill A. Shutemov   thp: move maybe_p...
612
  		entry = mk_huge_pmd(page, vma->vm_page_prot);
f55e1014f   Linus Torvalds   Revert "mm, thp: ...
613
  		entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
d281ee614   Kirill A. Shutemov   rmap: add argumen...
614
  		page_add_new_anon_rmap(page, vma, haddr, true);
f627c2f53   Kirill A. Shutemov   memcg: adjust to ...
615
  		mem_cgroup_commit_charge(page, memcg, false, true);
00501b531   Johannes Weiner   mm: memcontrol: r...
616
  		lru_cache_add_active_or_unevictable(page, vma);
82b0f8c39   Jan Kara   mm: join struct f...
617
618
  		pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable);
  		set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry);
bae473a42   Kirill A. Shutemov   mm: introduce fau...
619
  		add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR);
c4812909f   Kirill A. Shutemov   mm: introduce wra...
620
  		mm_inc_nr_ptes(vma->vm_mm);
82b0f8c39   Jan Kara   mm: join struct f...
621
  		spin_unlock(vmf->ptl);
6b251fc96   Andrea Arcangeli   userfaultfd: call...
622
  		count_vm_event(THP_FAULT_ALLOC);
1ff9e6e17   Chris Down   mm: memcontrol: e...
623
  		count_memcg_events(memcg, THP_FAULT_ALLOC, 1);
71e3aac07   Andrea Arcangeli   thp: transparent ...
624
  	}
aa2e878ef   David Rientjes   mm, thp: remove u...
625
  	return 0;
6b31d5955   Michal Hocko   mm, oom: fix pote...
626
627
628
629
630
631
632
633
  unlock_release:
  	spin_unlock(vmf->ptl);
  release:
  	if (pgtable)
  		pte_free(vma->vm_mm, pgtable);
  	mem_cgroup_cancel_charge(page, memcg, true);
  	put_page(page);
  	return ret;
71e3aac07   Andrea Arcangeli   thp: transparent ...
634
  }
444eb2a44   Mel Gorman   mm: thp: set THP ...
635
  /*
21440d7eb   David Rientjes   mm, thp: add new ...
636
637
638
639
640
641
642
   * always: directly stall for all thp allocations
   * defer: wake kswapd and fail if not immediately available
   * defer+madvise: wake kswapd and directly stall for MADV_HUGEPAGE, otherwise
   *		  fail if not immediately available
   * madvise: directly stall for MADV_HUGEPAGE, otherwise fail if not immediately
   *	    available
   * never: never stall for any thp allocation
444eb2a44   Mel Gorman   mm: thp: set THP ...
643
   */
19deb7695   David Rientjes   Revert "Revert "R...
644
  static inline gfp_t alloc_hugepage_direct_gfpmask(struct vm_area_struct *vma)
444eb2a44   Mel Gorman   mm: thp: set THP ...
645
  {
21440d7eb   David Rientjes   mm, thp: add new ...
646
  	const bool vma_madvised = !!(vma->vm_flags & VM_HUGEPAGE);
2f0799a0f   David Rientjes   mm, thp: restore ...
647

ac79f78da   David Rientjes   Revert "Revert "m...
648
  	/* Always do synchronous compaction */
a8282608c   Andrea Arcangeli   Revert "mm, thp: ...
649
650
  	if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags))
  		return GFP_TRANSHUGE | (vma_madvised ? 0 : __GFP_NORETRY);
ac79f78da   David Rientjes   Revert "Revert "m...
651
652
  
  	/* Kick kcompactd and fail quickly */
21440d7eb   David Rientjes   mm, thp: add new ...
653
  	if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags))
19deb7695   David Rientjes   Revert "Revert "R...
654
  		return GFP_TRANSHUGE_LIGHT | __GFP_KSWAPD_RECLAIM;
ac79f78da   David Rientjes   Revert "Revert "m...
655
656
  
  	/* Synchronous compaction if madvised, otherwise kick kcompactd */
21440d7eb   David Rientjes   mm, thp: add new ...
657
  	if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags))
19deb7695   David Rientjes   Revert "Revert "R...
658
659
660
  		return GFP_TRANSHUGE_LIGHT |
  			(vma_madvised ? __GFP_DIRECT_RECLAIM :
  					__GFP_KSWAPD_RECLAIM);
ac79f78da   David Rientjes   Revert "Revert "m...
661
662
  
  	/* Only do synchronous compaction if madvised */
21440d7eb   David Rientjes   mm, thp: add new ...
663
  	if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags))
19deb7695   David Rientjes   Revert "Revert "R...
664
665
  		return GFP_TRANSHUGE_LIGHT |
  		       (vma_madvised ? __GFP_DIRECT_RECLAIM : 0);
ac79f78da   David Rientjes   Revert "Revert "m...
666

19deb7695   David Rientjes   Revert "Revert "R...
667
  	return GFP_TRANSHUGE_LIGHT;
444eb2a44   Mel Gorman   mm: thp: set THP ...
668
  }
c4088ebdc   Kirill A. Shutemov   mm: convert the r...
669
  /* Caller must hold page table lock. */
d295e3415   Kirill A. Shutemov   dax: don't use se...
670
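/*
 * Map the global huge zero page read-only at @haddr.  Returns false if some
 * other thread already populated the pmd.
 */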
  static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm,
97ae17497   Kirill A. Shutemov   thp: implement re...
671
  		struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd,
5918d10a4   Kirill A. Shutemov   thp: fix huge zer...
672
  		struct page *zero_page)
fc9fe822f   Kirill A. Shutemov   thp: copy_huge_pm...
673
674
  {
  	pmd_t entry;
7c4141645   Andrew Morton   dax: revert userf...
675
676
  	if (!pmd_none(*pmd))
  		return false;
5918d10a4   Kirill A. Shutemov   thp: fix huge zer...
677
  	entry = mk_pmd(zero_page, vma->vm_page_prot);
fc9fe822f   Kirill A. Shutemov   thp: copy_huge_pm...
678
  	entry = pmd_mkhuge(entry);
12c9d70bd   Matthew Wilcox   mm: fix memory le...
679
680
  	if (pgtable)
  		pgtable_trans_huge_deposit(mm, pmd, pgtable);
fc9fe822f   Kirill A. Shutemov   thp: copy_huge_pm...
681
  	set_pmd_at(mm, haddr, pmd, entry);
c4812909f   Kirill A. Shutemov   mm: introduce wra...
682
  	mm_inc_nr_ptes(mm);
7c4141645   Andrew Morton   dax: revert userf...
683
  	return true;
fc9fe822f   Kirill A. Shutemov   thp: copy_huge_pm...
684
  }
2b7403035   Souptick Joarder   mm: Change return...
685
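/*
 * Anonymous huge page fault entry point: read faults may map the shared huge
 * zero page when use_zero_page is enabled; otherwise a THP is allocated with
 * the gfp mask chosen by the defrag policy and handed to
 * __do_huge_pmd_anonymous_page().
 */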
  vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf)
71e3aac07   Andrea Arcangeli   thp: transparent ...
686
  {
82b0f8c39   Jan Kara   mm: join struct f...
687
  	struct vm_area_struct *vma = vmf->vma;
077fcf116   Aneesh Kumar K.V   mm/thp: allocate ...
688
  	gfp_t gfp;
71e3aac07   Andrea Arcangeli   thp: transparent ...
689
  	struct page *page;
82b0f8c39   Jan Kara   mm: join struct f...
690
  	unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
71e3aac07   Andrea Arcangeli   thp: transparent ...
691

43675e6fb   Yang Shi   mm: thp: make tra...
692
  	if (!transhuge_vma_suitable(vma, haddr))
c02925540   Kirill A. Shutemov   thp: consolidate ...
693
  		return VM_FAULT_FALLBACK;
128ec037b   Kirill A. Shutemov   thp: do_huge_pmd_...
694
695
  	if (unlikely(anon_vma_prepare(vma)))
  		return VM_FAULT_OOM;
6d50e60cd   David Rientjes   mm, thp: fix coll...
696
  	if (unlikely(khugepaged_enter(vma, vma->vm_flags)))
128ec037b   Kirill A. Shutemov   thp: do_huge_pmd_...
697
  		return VM_FAULT_OOM;
82b0f8c39   Jan Kara   mm: join struct f...
698
  	if (!(vmf->flags & FAULT_FLAG_WRITE) &&
bae473a42   Kirill A. Shutemov   mm: introduce fau...
699
  			!mm_forbids_zeropage(vma->vm_mm) &&
128ec037b   Kirill A. Shutemov   thp: do_huge_pmd_...
700
701
702
703
  			transparent_hugepage_use_zero_page()) {
  		pgtable_t pgtable;
  		struct page *zero_page;
  		bool set;
2b7403035   Souptick Joarder   mm: Change return...
704
  		vm_fault_t ret;
4cf589249   Joel Fernandes (Google)   mm: treewide: rem...
705
  		pgtable = pte_alloc_one(vma->vm_mm);
128ec037b   Kirill A. Shutemov   thp: do_huge_pmd_...
706
  		if (unlikely(!pgtable))
ba76149f4   Andrea Arcangeli   thp: khugepaged
707
  			return VM_FAULT_OOM;
6fcb52a56   Aaron Lu   thp: reduce usage...
708
  		zero_page = mm_get_huge_zero_page(vma->vm_mm);
128ec037b   Kirill A. Shutemov   thp: do_huge_pmd_...
709
  		if (unlikely(!zero_page)) {
bae473a42   Kirill A. Shutemov   mm: introduce fau...
710
  			pte_free(vma->vm_mm, pgtable);
81ab4201f   Andi Kleen   mm: add VM counte...
711
  			count_vm_event(THP_FAULT_FALLBACK);
c02925540   Kirill A. Shutemov   thp: consolidate ...
712
  			return VM_FAULT_FALLBACK;
b9bbfbe30   Andrea Arcangeli   thp: memcg huge m...
713
  		}
82b0f8c39   Jan Kara   mm: join struct f...
714
  		vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
6b251fc96   Andrea Arcangeli   userfaultfd: call...
715
716
  		ret = 0;
  		set = false;
82b0f8c39   Jan Kara   mm: join struct f...
717
  		if (pmd_none(*vmf->pmd)) {
6b31d5955   Michal Hocko   mm, oom: fix pote...
718
719
720
721
  			ret = check_stable_address_space(vma->vm_mm);
  			if (ret) {
  				spin_unlock(vmf->ptl);
  			} else if (userfaultfd_missing(vma)) {
82b0f8c39   Jan Kara   mm: join struct f...
722
723
  				spin_unlock(vmf->ptl);
  				ret = handle_userfault(vmf, VM_UFFD_MISSING);
6b251fc96   Andrea Arcangeli   userfaultfd: call...
724
725
  				VM_BUG_ON(ret & VM_FAULT_FALLBACK);
  			} else {
bae473a42   Kirill A. Shutemov   mm: introduce fau...
726
  				set_huge_zero_page(pgtable, vma->vm_mm, vma,
82b0f8c39   Jan Kara   mm: join struct f...
727
728
  						   haddr, vmf->pmd, zero_page);
  				spin_unlock(vmf->ptl);
6b251fc96   Andrea Arcangeli   userfaultfd: call...
729
730
731
  				set = true;
  			}
  		} else
82b0f8c39   Jan Kara   mm: join struct f...
732
  			spin_unlock(vmf->ptl);
6fcb52a56   Aaron Lu   thp: reduce usage...
733
  		if (!set)
bae473a42   Kirill A. Shutemov   mm: introduce fau...
734
  			pte_free(vma->vm_mm, pgtable);
6b251fc96   Andrea Arcangeli   userfaultfd: call...
735
  		return ret;
71e3aac07   Andrea Arcangeli   thp: transparent ...
736
  	}
19deb7695   David Rientjes   Revert "Revert "R...
737
738
  	gfp = alloc_hugepage_direct_gfpmask(vma);
  	page = alloc_hugepage_vma(gfp, vma, haddr, HPAGE_PMD_ORDER);
128ec037b   Kirill A. Shutemov   thp: do_huge_pmd_...
739
740
  	if (unlikely(!page)) {
  		count_vm_event(THP_FAULT_FALLBACK);
c02925540   Kirill A. Shutemov   thp: consolidate ...
741
  		return VM_FAULT_FALLBACK;
128ec037b   Kirill A. Shutemov   thp: do_huge_pmd_...
742
  	}
9a982250f   Kirill A. Shutemov   thp: introduce de...
743
  	prep_transhuge_page(page);
82b0f8c39   Jan Kara   mm: join struct f...
744
  	return __do_huge_pmd_anonymous_page(vmf, page, gfp);
71e3aac07   Andrea Arcangeli   thp: transparent ...
745
  }
ae18d6dcf   Matthew Wilcox   thp: change inser...
746
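/*
 * Install a pfn-based huge pmd for vmf_insert_pfn_pmd() (DAX and device
 * drivers).  If the pmd is already populated with the same pfn, a write
 * fault only refreshes the young/dirty bits.
 */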
  static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
3b6521f53   Oliver O'Halloran   mm/huge_memory.c:...
747
748
  		pmd_t *pmd, pfn_t pfn, pgprot_t prot, bool write,
  		pgtable_t pgtable)
5cad465d7   Matthew Wilcox   mm: add vmf_inser...
749
750
751
752
753
754
  {
  	struct mm_struct *mm = vma->vm_mm;
  	pmd_t entry;
  	spinlock_t *ptl;
  
  	ptl = pmd_lock(mm, pmd);
c6f3c5ee4   Aneesh Kumar K.V   mm/huge_memory.c:...
755
756
757
758
759
760
761
762
763
764
765
766
767
768
  	if (!pmd_none(*pmd)) {
  		if (write) {
  			if (pmd_pfn(*pmd) != pfn_t_to_pfn(pfn)) {
  				WARN_ON_ONCE(!is_huge_zero_pmd(*pmd));
  				goto out_unlock;
  			}
  			entry = pmd_mkyoung(*pmd);
  			entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
  			if (pmdp_set_access_flags(vma, addr, pmd, entry, 1))
  				update_mmu_cache_pmd(vma, addr, pmd);
  		}
  
  		goto out_unlock;
  	}
f25748e3c   Dan Williams   mm, dax: convert ...
769
770
771
  	entry = pmd_mkhuge(pfn_t_pmd(pfn, prot));
  	if (pfn_t_devmap(pfn))
  		entry = pmd_mkdevmap(entry);
01871e59a   Ross Zwisler   mm, dax: fix live...
772
  	if (write) {
f55e1014f   Linus Torvalds   Revert "mm, thp: ...
773
774
  		entry = pmd_mkyoung(pmd_mkdirty(entry));
  		entry = maybe_pmd_mkwrite(entry, vma);
5cad465d7   Matthew Wilcox   mm: add vmf_inser...
775
  	}
3b6521f53   Oliver O'Halloran   mm/huge_memory.c:...
776
777
778
  
  	if (pgtable) {
  		pgtable_trans_huge_deposit(mm, pmd, pgtable);
c4812909f   Kirill A. Shutemov   mm: introduce wra...
779
  		mm_inc_nr_ptes(mm);
c6f3c5ee4   Aneesh Kumar K.V   mm/huge_memory.c:...
780
  		pgtable = NULL;
3b6521f53   Oliver O'Halloran   mm/huge_memory.c:...
781
  	}
01871e59a   Ross Zwisler   mm, dax: fix live...
782
783
  	set_pmd_at(mm, addr, pmd, entry);
  	update_mmu_cache_pmd(vma, addr, pmd);
c6f3c5ee4   Aneesh Kumar K.V   mm/huge_memory.c:...
784
785
  
  out_unlock:
5cad465d7   Matthew Wilcox   mm: add vmf_inser...
786
  	spin_unlock(ptl);
c6f3c5ee4   Aneesh Kumar K.V   mm/huge_memory.c:...
787
788
  	if (pgtable)
  		pte_free(mm, pgtable);
5cad465d7   Matthew Wilcox   mm: add vmf_inser...
789
  }
fce86ff58   Dan Williams   mm/huge_memory: f...
790
  vm_fault_t vmf_insert_pfn_pmd(struct vm_fault *vmf, pfn_t pfn, bool write)
5cad465d7   Matthew Wilcox   mm: add vmf_inser...
791
  {
fce86ff58   Dan Williams   mm/huge_memory: f...
792
793
  	unsigned long addr = vmf->address & PMD_MASK;
  	struct vm_area_struct *vma = vmf->vma;
5cad465d7   Matthew Wilcox   mm: add vmf_inser...
794
  	pgprot_t pgprot = vma->vm_page_prot;
3b6521f53   Oliver O'Halloran   mm/huge_memory.c:...
795
  	pgtable_t pgtable = NULL;
fce86ff58   Dan Williams   mm/huge_memory: f...
796

5cad465d7   Matthew Wilcox   mm: add vmf_inser...
797
798
799
800
801
  	/*
  	 * If we had pmd_special, we could avoid all these restrictions,
  	 * but we need to be consistent with PTEs and architectures that
  	 * can't support a 'special' bit.
  	 */
e1fb4a086   Dave Jiang   dax: remove VM_MI...
802
803
  	BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) &&
  			!pfn_t_devmap(pfn));
5cad465d7   Matthew Wilcox   mm: add vmf_inser...
804
805
806
  	BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) ==
  						(VM_PFNMAP|VM_MIXEDMAP));
  	BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags));
5cad465d7   Matthew Wilcox   mm: add vmf_inser...
807
808
809
  
  	if (addr < vma->vm_start || addr >= vma->vm_end)
  		return VM_FAULT_SIGBUS;
308a047c3   Borislav Petkov   x86/pat, mm: Make...
810

3b6521f53   Oliver O'Halloran   mm/huge_memory.c:...
811
  	if (arch_needs_pgtable_deposit()) {
4cf589249   Joel Fernandes (Google)   mm: treewide: rem...
812
  		pgtable = pte_alloc_one(vma->vm_mm);
3b6521f53   Oliver O'Halloran   mm/huge_memory.c:...
813
814
815
  		if (!pgtable)
  			return VM_FAULT_OOM;
  	}
308a047c3   Borislav Petkov   x86/pat, mm: Make...
816
  	track_pfn_insert(vma, &pgprot, pfn);
fce86ff58   Dan Williams   mm/huge_memory: f...
817
  	insert_pfn_pmd(vma, addr, vmf->pmd, pfn, pgprot, write, pgtable);
ae18d6dcf   Matthew Wilcox   thp: change inser...
818
  	return VM_FAULT_NOPAGE;
5cad465d7   Matthew Wilcox   mm: add vmf_inser...
819
  }
dee410792   Dan Williams   /dev/dax, core: f...
820
  EXPORT_SYMBOL_GPL(vmf_insert_pfn_pmd);
5cad465d7   Matthew Wilcox   mm: add vmf_inser...
821

a00cc7d9d   Matthew Wilcox   mm, x86: add supp...
822
  #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
f55e1014f   Linus Torvalds   Revert "mm, thp: ...
823
  static pud_t maybe_pud_mkwrite(pud_t pud, struct vm_area_struct *vma)
a00cc7d9d   Matthew Wilcox   mm, x86: add supp...
824
  {
f55e1014f   Linus Torvalds   Revert "mm, thp: ...
825
  	if (likely(vma->vm_flags & VM_WRITE))
a00cc7d9d   Matthew Wilcox   mm, x86: add supp...
826
827
828
829
830
831
832
833
834
835
836
837
  		pud = pud_mkwrite(pud);
  	return pud;
  }
  
  static void insert_pfn_pud(struct vm_area_struct *vma, unsigned long addr,
  		pud_t *pud, pfn_t pfn, pgprot_t prot, bool write)
  {
  	struct mm_struct *mm = vma->vm_mm;
  	pud_t entry;
  	spinlock_t *ptl;
  
  	ptl = pud_lock(mm, pud);
c6f3c5ee4   Aneesh Kumar K.V   mm/huge_memory.c:...
838
839
840
841
842
843
844
845
846
847
848
849
850
  	if (!pud_none(*pud)) {
  		if (write) {
  			if (pud_pfn(*pud) != pfn_t_to_pfn(pfn)) {
  				WARN_ON_ONCE(!is_huge_zero_pud(*pud));
  				goto out_unlock;
  			}
  			entry = pud_mkyoung(*pud);
  			entry = maybe_pud_mkwrite(pud_mkdirty(entry), vma);
  			if (pudp_set_access_flags(vma, addr, pud, entry, 1))
  				update_mmu_cache_pud(vma, addr, pud);
  		}
  		goto out_unlock;
  	}
a00cc7d9d   Matthew Wilcox   mm, x86: add supp...
851
852
853
854
  	entry = pud_mkhuge(pfn_t_pud(pfn, prot));
  	if (pfn_t_devmap(pfn))
  		entry = pud_mkdevmap(entry);
  	if (write) {
f55e1014f   Linus Torvalds   Revert "mm, thp: ...
855
856
  		entry = pud_mkyoung(pud_mkdirty(entry));
  		entry = maybe_pud_mkwrite(entry, vma);
a00cc7d9d   Matthew Wilcox   mm, x86: add supp...
857
858
859
  	}
  	set_pud_at(mm, addr, pud, entry);
  	update_mmu_cache_pud(vma, addr, pud);
c6f3c5ee4   Aneesh Kumar K.V   mm/huge_memory.c:...
860
861
  
  out_unlock:
a00cc7d9d   Matthew Wilcox   mm, x86: add supp...
862
863
  	spin_unlock(ptl);
  }
fce86ff58   Dan Williams   mm/huge_memory: f...
864
  vm_fault_t vmf_insert_pfn_pud(struct vm_fault *vmf, pfn_t pfn, bool write)
a00cc7d9d   Matthew Wilcox   mm, x86: add supp...
865
  {
fce86ff58   Dan Williams   mm/huge_memory: f...
866
867
  	unsigned long addr = vmf->address & PUD_MASK;
  	struct vm_area_struct *vma = vmf->vma;
a00cc7d9d   Matthew Wilcox   mm, x86: add supp...
868
  	pgprot_t pgprot = vma->vm_page_prot;
fce86ff58   Dan Williams   mm/huge_memory: f...
869

a00cc7d9d   Matthew Wilcox   mm, x86: add supp...
870
871
872
873
874
  	/*
  	 * If we had pud_special, we could avoid all these restrictions,
  	 * but we need to be consistent with PTEs and architectures that
  	 * can't support a 'special' bit.
  	 */
62ec0d8c4   Dave Jiang   mm: fix BUG_ON() ...
875
876
  	BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) &&
  			!pfn_t_devmap(pfn));
a00cc7d9d   Matthew Wilcox   mm, x86: add supp...
877
878
879
  	BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) ==
  						(VM_PFNMAP|VM_MIXEDMAP));
  	BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags));
a00cc7d9d   Matthew Wilcox   mm, x86: add supp...
880
881
882
883
884
  
  	if (addr < vma->vm_start || addr >= vma->vm_end)
  		return VM_FAULT_SIGBUS;
  
  	track_pfn_insert(vma, &pgprot, pfn);
fce86ff58   Dan Williams   mm/huge_memory: f...
885
  	insert_pfn_pud(vma, addr, vmf->pud, pfn, pgprot, write);
a00cc7d9d   Matthew Wilcox   mm, x86: add supp...
886
887
888
889
  	return VM_FAULT_NOPAGE;
  }
  EXPORT_SYMBOL_GPL(vmf_insert_pfn_pud);
  #endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
3565fce3a   Dan Williams   mm, x86: get_user...
890
  static void touch_pmd(struct vm_area_struct *vma, unsigned long addr,
a8f973664   Kirill A. Shutemov   mm, thp: Do not m...
891
  		pmd_t *pmd, int flags)
3565fce3a   Dan Williams   mm, x86: get_user...
892
893
  {
  	pmd_t _pmd;
a8f973664   Kirill A. Shutemov   mm, thp: Do not m...
894
895
896
  	_pmd = pmd_mkyoung(*pmd);
  	if (flags & FOLL_WRITE)
  		_pmd = pmd_mkdirty(_pmd);
3565fce3a   Dan Williams   mm, x86: get_user...
897
  	if (pmdp_set_access_flags(vma, addr & HPAGE_PMD_MASK,
a8f973664   Kirill A. Shutemov   mm, thp: Do not m...
898
  				pmd, _pmd, flags & FOLL_WRITE))
3565fce3a   Dan Williams   mm, x86: get_user...
899
900
901
902
  		update_mmu_cache_pmd(vma, addr, pmd);
  }
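
/*
 * GUP helper for device-mapped (devmap) huge pmds.  The caller must pass
 * FOLL_GET so it also manages the dev_pagemap reference, and FOLL_COW never
 * reaches here because COW of a devmap pmd splits it into ptes first.
 */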
  
  struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr,
df06b37ff   Keith Busch   mm/gup: cache dev...
903
  		pmd_t *pmd, int flags, struct dev_pagemap **pgmap)
3565fce3a   Dan Williams   mm, x86: get_user...
904
905
906
  {
  	unsigned long pfn = pmd_pfn(*pmd);
  	struct mm_struct *mm = vma->vm_mm;
3565fce3a   Dan Williams   mm, x86: get_user...
907
908
909
  	struct page *page;
  
  	assert_spin_locked(pmd_lockptr(mm, pmd));
8310d48b1   Keno Fischer   mm/huge_memory.c:...
910
911
912
913
914
  	/*
  	 * When we COW a devmap PMD entry, we split it into PTEs, so we should
  	 * not be in this function with `flags & FOLL_COW` set.
  	 */
  	WARN_ONCE(flags & FOLL_COW, "mm: In follow_devmap_pmd with FOLL_COW set");
f6f373216   Linus Torvalds   Revert "mm: repla...
915
  	if (flags & FOLL_WRITE && !pmd_write(*pmd))
3565fce3a   Dan Williams   mm, x86: get_user...
916
917
918
919
920
921
922
923
  		return NULL;
  
  	if (pmd_present(*pmd) && pmd_devmap(*pmd))
  		/* pass */;
  	else
  		return NULL;
  
  	if (flags & FOLL_TOUCH)
a8f973664   Kirill A. Shutemov   mm, thp: Do not m...
924
  		touch_pmd(vma, addr, pmd, flags);
3565fce3a   Dan Williams   mm, x86: get_user...
925
926
927
928
929
930
931
932
933
  
  	/*
  	 * device mapped pages can only be returned if the
  	 * caller will manage the page reference count.
  	 */
  	if (!(flags & FOLL_GET))
  		return ERR_PTR(-EEXIST);
  
  	pfn += (addr & ~PMD_MASK) >> PAGE_SHIFT;
df06b37ff   Keith Busch   mm/gup: cache dev...
934
935
  	*pgmap = get_dev_pagemap(pfn, *pgmap);
  	if (!*pgmap)
3565fce3a   Dan Williams   mm, x86: get_user...
936
937
938
  		return ERR_PTR(-EFAULT);
  	page = pfn_to_page(pfn);
  	get_page(page);
3565fce3a   Dan Williams   mm, x86: get_user...
939
940
941
  
  	return page;
  }
71e3aac07   Andrea Arcangeli   thp: transparent ...
942
943
944
945
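/*
 * fork() path: copy one huge pmd from parent to child.  File-backed pmds are
 * skipped and simply refaulted, pmd migration entries and the huge zero page
 * get special handling, and normal anonymous THPs are write-protected in
 * both mms to force copy-on-write.
 */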
  int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
  		  pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr,
  		  struct vm_area_struct *vma)
  {
c4088ebdc   Kirill A. Shutemov   mm: convert the r...
946
  	spinlock_t *dst_ptl, *src_ptl;
71e3aac07   Andrea Arcangeli   thp: transparent ...
947
948
  	struct page *src_page;
  	pmd_t pmd;
12c9d70bd   Matthew Wilcox   mm: fix memory le...
949
  	pgtable_t pgtable = NULL;
628d47ce9   Kirill A. Shutemov   thp: skip file hu...
950
  	int ret = -ENOMEM;
71e3aac07   Andrea Arcangeli   thp: transparent ...
951

628d47ce9   Kirill A. Shutemov   thp: skip file hu...
952
953
954
  	/* Skip if can be re-fill on fault */
  	if (!vma_is_anonymous(vma))
  		return 0;
4cf589249   Joel Fernandes (Google)   mm: treewide: rem...
955
  	pgtable = pte_alloc_one(dst_mm);
628d47ce9   Kirill A. Shutemov   thp: skip file hu...
956
957
  	if (unlikely(!pgtable))
  		goto out;
71e3aac07   Andrea Arcangeli   thp: transparent ...
958

c4088ebdc   Kirill A. Shutemov   mm: convert the r...
959
960
961
  	dst_ptl = pmd_lock(dst_mm, dst_pmd);
  	src_ptl = pmd_lockptr(src_mm, src_pmd);
  	spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
71e3aac07   Andrea Arcangeli   thp: transparent ...
962
963
964
  
  	ret = -EAGAIN;
  	pmd = *src_pmd;
84c3fc4e9   Zi Yan   mm: thp: check pm...
965
966
967
968
969
970
971
972
973
  
  #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
  	if (unlikely(is_swap_pmd(pmd))) {
  		swp_entry_t entry = pmd_to_swp_entry(pmd);
  
  		VM_BUG_ON(!is_pmd_migration_entry(pmd));
  		if (is_write_migration_entry(entry)) {
  			make_migration_entry_read(&entry);
  			pmd = swp_entry_to_pmd(entry);
ab6e3d093   Naoya Horiguchi   mm: soft-dirty: k...
974
975
  			if (pmd_swp_soft_dirty(*src_pmd))
  				pmd = pmd_swp_mksoft_dirty(pmd);
84c3fc4e9   Zi Yan   mm: thp: check pm...
976
977
  			set_pmd_at(src_mm, addr, src_pmd, pmd);
  		}
dd8a67f9a   Zi Yan   mm/huge_memory.c:...
978
  		add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR);
af5b0f6a0   Kirill A. Shutemov   mm: consolidate p...
979
  		mm_inc_nr_ptes(dst_mm);
dd8a67f9a   Zi Yan   mm/huge_memory.c:...
980
  		pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable);
84c3fc4e9   Zi Yan   mm: thp: check pm...
981
982
983
984
985
  		set_pmd_at(dst_mm, addr, dst_pmd, pmd);
  		ret = 0;
  		goto out_unlock;
  	}
  #endif
628d47ce9   Kirill A. Shutemov   thp: skip file hu...
986
  	if (unlikely(!pmd_trans_huge(pmd))) {
71e3aac07   Andrea Arcangeli   thp: transparent ...
987
988
989
  		pte_free(dst_mm, pgtable);
  		goto out_unlock;
  	}
fc9fe822f   Kirill A. Shutemov   thp: copy_huge_pm...
990
  	/*
c4088ebdc   Kirill A. Shutemov   mm: convert the r...
991
  	 * When page table lock is held, the huge zero pmd should not be
fc9fe822f   Kirill A. Shutemov   thp: copy_huge_pm...
992
993
994
995
  	 * under splitting, since we don't split the page itself, only the pmd
  	 * into a page table.
  	 */
  	if (is_huge_zero_pmd(pmd)) {
5918d10a4   Kirill A. Shutemov   thp: fix huge zer...
996
  		struct page *zero_page;
97ae17497   Kirill A. Shutemov   thp: implement re...
997
998
999
1000
1001
  		/*
  		 * mm_get_huge_zero_page() will never allocate a new page here,
  		 * since we already have a zero page to copy. It just takes a
  		 * reference.
  		 */
6fcb52a56   Aaron Lu   thp: reduce usage...
1002
  		zero_page = mm_get_huge_zero_page(dst_mm);
6b251fc96   Andrea Arcangeli   userfaultfd: call...
1003
  		set_huge_zero_page(pgtable, dst_mm, vma, addr, dst_pmd,
5918d10a4   Kirill A. Shutemov   thp: fix huge zer...
1004
  				zero_page);
fc9fe822f   Kirill A. Shutemov   thp: copy_huge_pm...
1005
1006
1007
  		ret = 0;
  		goto out_unlock;
  	}
de466bd62   Mel Gorman   mm: numa: avoid u...
1008

628d47ce9   Kirill A. Shutemov   thp: skip file hu...
1009
1010
1011
1012
1013
  	src_page = pmd_page(pmd);
  	VM_BUG_ON_PAGE(!PageHead(src_page), src_page);
  	get_page(src_page);
  	page_dup_rmap(src_page, true);
  	add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR);
c4812909f   Kirill A. Shutemov   mm: introduce wra...
1014
  	mm_inc_nr_ptes(dst_mm);
628d47ce9   Kirill A. Shutemov   thp: skip file hu...
1015
  	pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable);
71e3aac07   Andrea Arcangeli   thp: transparent ...
1016
1017
1018
1019
  
  	pmdp_set_wrprotect(src_mm, addr, src_pmd);
  	pmd = pmd_mkold(pmd_wrprotect(pmd));
  	set_pmd_at(dst_mm, addr, dst_pmd, pmd);
71e3aac07   Andrea Arcangeli   thp: transparent ...
1020
1021
1022
  
  	ret = 0;
  out_unlock:
c4088ebdc   Kirill A. Shutemov   mm: convert the r...
1023
1024
  	spin_unlock(src_ptl);
  	spin_unlock(dst_ptl);
71e3aac07   Andrea Arcangeli   thp: transparent ...
1025
1026
1027
  out:
  	return ret;
  }
a00cc7d9d   Matthew Wilcox   mm, x86: add supp...
1028
1029
  #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
  static void touch_pud(struct vm_area_struct *vma, unsigned long addr,
a8f973664   Kirill A. Shutemov   mm, thp: Do not m...
1030
  		pud_t *pud, int flags)
a00cc7d9d   Matthew Wilcox   mm, x86: add supp...
1031
1032
  {
  	pud_t _pud;
a8f973664   Kirill A. Shutemov   mm, thp: Do not m...
1033
1034
1035
  	_pud = pud_mkyoung(*pud);
  	if (flags & FOLL_WRITE)
  		_pud = pud_mkdirty(_pud);
a00cc7d9d   Matthew Wilcox   mm, x86: add supp...
1036
  	if (pudp_set_access_flags(vma, addr & HPAGE_PUD_MASK,
a8f973664   Kirill A. Shutemov   mm, thp: Do not m...
1037
  				pud, _pud, flags & FOLL_WRITE))
a00cc7d9d   Matthew Wilcox   mm, x86: add supp...
1038
1039
1040
1041
  		update_mmu_cache_pud(vma, addr, pud);
  }
  
  struct page *follow_devmap_pud(struct vm_area_struct *vma, unsigned long addr,
df06b37ff   Keith Busch   mm/gup: cache dev...
1042
  		pud_t *pud, int flags, struct dev_pagemap **pgmap)
a00cc7d9d   Matthew Wilcox   mm, x86: add supp...
1043
1044
1045
  {
  	unsigned long pfn = pud_pfn(*pud);
  	struct mm_struct *mm = vma->vm_mm;
a00cc7d9d   Matthew Wilcox   mm, x86: add supp...
1046
1047
1048
  	struct page *page;
  
  	assert_spin_locked(pud_lockptr(mm, pud));
f6f373216   Linus Torvalds   Revert "mm: repla...
1049
  	if (flags & FOLL_WRITE && !pud_write(*pud))
a00cc7d9d   Matthew Wilcox   mm, x86: add supp...
1050
1051
1052
1053
1054
1055
1056
1057
  		return NULL;
  
  	if (pud_present(*pud) && pud_devmap(*pud))
  		/* pass */;
  	else
  		return NULL;
  
  	if (flags & FOLL_TOUCH)
a8f973664   Kirill A. Shutemov   mm, thp: Do not m...
1058
  		touch_pud(vma, addr, pud, flags);
a00cc7d9d   Matthew Wilcox   mm, x86: add supp...
1059
1060
1061
1062
1063
1064
1065
1066
1067
  
  	/*
  	 * device mapped pages can only be returned if the
  	 * caller will manage the page reference count.
  	 */
  	if (!(flags & FOLL_GET))
  		return ERR_PTR(-EEXIST);
  
  	pfn += (addr & ~PUD_MASK) >> PAGE_SHIFT;
df06b37ff   Keith Busch   mm/gup: cache dev...
1068
1069
  	*pgmap = get_dev_pagemap(pfn, *pgmap);
  	if (!*pgmap)
a00cc7d9d   Matthew Wilcox   mm, x86: add supp...
1070
1071
1072
  		return ERR_PTR(-EFAULT);
  	page = pfn_to_page(pfn);
  	get_page(page);
a00cc7d9d   Matthew Wilcox   mm, x86: add supp...
  
  	return page;
  }
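
  /*
   * Copy one huge pud from the parent to the child mm, write-protecting the
   * source entry. Only pud_trans_huge/pud_devmap entries are handled, and
   * there is no huge zero pud yet.
   */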
  
  int copy_huge_pud(struct mm_struct *dst_mm, struct mm_struct *src_mm,
  		  pud_t *dst_pud, pud_t *src_pud, unsigned long addr,
  		  struct vm_area_struct *vma)
  {
  	spinlock_t *dst_ptl, *src_ptl;
  	pud_t pud;
  	int ret;
  
  	dst_ptl = pud_lock(dst_mm, dst_pud);
  	src_ptl = pud_lockptr(src_mm, src_pud);
  	spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
  
  	ret = -EAGAIN;
  	pud = *src_pud;
  	if (unlikely(!pud_trans_huge(pud) && !pud_devmap(pud)))
  		goto out_unlock;
  
  	/*
  	 * When page table lock is held, the huge zero pud should not be
  	 * under splitting, since we don't split the page itself, only the pud
  	 * into a page table.
  	 */
  	if (is_huge_zero_pud(pud)) {
  		/* No huge zero pud yet */
  	}
  
  	pudp_set_wrprotect(src_mm, addr, src_pud);
  	pud = pud_mkold(pud_wrprotect(pud));
  	set_pud_at(dst_mm, addr, dst_pud, pud);
  
  	ret = 0;
  out_unlock:
  	spin_unlock(src_ptl);
  	spin_unlock(dst_ptl);
  	return ret;
  }
  
  void huge_pud_set_accessed(struct vm_fault *vmf, pud_t orig_pud)
  {
  	pud_t entry;
  	unsigned long haddr;
  	bool write = vmf->flags & FAULT_FLAG_WRITE;
  
  	vmf->ptl = pud_lock(vmf->vma->vm_mm, vmf->pud);
  	if (unlikely(!pud_same(*vmf->pud, orig_pud)))
  		goto unlock;
  
  	entry = pud_mkyoung(orig_pud);
  	if (write)
  		entry = pud_mkdirty(entry);
  	haddr = vmf->address & HPAGE_PUD_MASK;
  	if (pudp_set_access_flags(vmf->vma, haddr, vmf->pud, entry, write))
  		update_mmu_cache_pud(vmf->vma, vmf->address, vmf->pud);
  
  unlock:
  	spin_unlock(vmf->ptl);
  }
  #endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
82b0f8c39   Jan Kara   mm: join struct f...
1135
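  /*
   * Re-validate the pmd under its lock and mark it young (and dirty for a
   * write fault) after a fault on an already-present huge pmd.
   */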
  void huge_pmd_set_accessed(struct vm_fault *vmf, pmd_t orig_pmd)
a1dd450bc   Will Deacon   mm: thp: set the ...
1136
1137
1138
  {
  	pmd_t entry;
  	unsigned long haddr;
20f664aab   Minchan Kim   mm: pmd dirty emu...
1139
  	bool write = vmf->flags & FAULT_FLAG_WRITE;
a1dd450bc   Will Deacon   mm: thp: set the ...
1140

82b0f8c39   Jan Kara   mm: join struct f...
1141
1142
  	vmf->ptl = pmd_lock(vmf->vma->vm_mm, vmf->pmd);
  	if (unlikely(!pmd_same(*vmf->pmd, orig_pmd)))
a1dd450bc   Will Deacon   mm: thp: set the ...
1143
1144
1145
  		goto unlock;
  
  	entry = pmd_mkyoung(orig_pmd);
20f664aab   Minchan Kim   mm: pmd dirty emu...
1146
1147
  	if (write)
  		entry = pmd_mkdirty(entry);
82b0f8c39   Jan Kara   mm: join struct f...
1148
  	haddr = vmf->address & HPAGE_PMD_MASK;
20f664aab   Minchan Kim   mm: pmd dirty emu...
1149
  	if (pmdp_set_access_flags(vmf->vma, haddr, vmf->pmd, entry, write))
82b0f8c39   Jan Kara   mm: join struct f...
1150
  		update_mmu_cache_pmd(vmf->vma, vmf->address, vmf->pmd);
a1dd450bc   Will Deacon   mm: thp: set the ...
1151
1152
  
  unlock:
82b0f8c39   Jan Kara   mm: join struct f...
1153
  	spin_unlock(vmf->ptl);
a1dd450bc   Will Deacon   mm: thp: set the ...
1154
  }
2b7403035   Souptick Joarder   mm: Change return...
1155
1156
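  /*
   * COW fallback for a write fault on a huge pmd when no new huge page is
   * used: copy the data into HPAGE_PMD_NR small pages, remap the range with
   * ordinary ptes and drop the mapping of the original THP.
   */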
  static vm_fault_t do_huge_pmd_wp_page_fallback(struct vm_fault *vmf,
  			pmd_t orig_pmd, struct page *page)
71e3aac07   Andrea Arcangeli   thp: transparent ...
1157
  {
82b0f8c39   Jan Kara   mm: join struct f...
1158
1159
  	struct vm_area_struct *vma = vmf->vma;
  	unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
00501b531   Johannes Weiner   mm: memcontrol: r...
1160
  	struct mem_cgroup *memcg;
71e3aac07   Andrea Arcangeli   thp: transparent ...
1161
1162
  	pgtable_t pgtable;
  	pmd_t _pmd;
2b7403035   Souptick Joarder   mm: Change return...
1163
1164
  	int i;
  	vm_fault_t ret = 0;
71e3aac07   Andrea Arcangeli   thp: transparent ...
1165
  	struct page **pages;
ac46d4f3c   Jérôme Glisse   mm/mmu_notifier: ...
1166
  	struct mmu_notifier_range range;
71e3aac07   Andrea Arcangeli   thp: transparent ...
1167

6da2ec560   Kees Cook   treewide: kmalloc...
1168
1169
  	pages = kmalloc_array(HPAGE_PMD_NR, sizeof(struct page *),
  			      GFP_KERNEL);
71e3aac07   Andrea Arcangeli   thp: transparent ...
1170
1171
1172
1173
1174
1175
  	if (unlikely(!pages)) {
  		ret |= VM_FAULT_OOM;
  		goto out;
  	}
  
  	for (i = 0; i < HPAGE_PMD_NR; i++) {
41b6167e8   Michal Hocko   mm: get rid of __...
1176
  		pages[i] = alloc_page_vma_node(GFP_HIGHUSER_MOVABLE, vma,
82b0f8c39   Jan Kara   mm: join struct f...
1177
  					       vmf->address, page_to_nid(page));
b9bbfbe30   Andrea Arcangeli   thp: memcg huge m...
1178
  		if (unlikely(!pages[i] ||
2cf855837   Tejun Heo   memcontrol: sched...
1179
  			     mem_cgroup_try_charge_delay(pages[i], vma->vm_mm,
bae473a42   Kirill A. Shutemov   mm: introduce fau...
1180
  				     GFP_KERNEL, &memcg, false))) {
b9bbfbe30   Andrea Arcangeli   thp: memcg huge m...
1181
  			if (pages[i])
71e3aac07   Andrea Arcangeli   thp: transparent ...
1182
  				put_page(pages[i]);
b9bbfbe30   Andrea Arcangeli   thp: memcg huge m...
1183
  			while (--i >= 0) {
00501b531   Johannes Weiner   mm: memcontrol: r...
1184
1185
  				memcg = (void *)page_private(pages[i]);
  				set_page_private(pages[i], 0);
f627c2f53   Kirill A. Shutemov   memcg: adjust to ...
1186
1187
  				mem_cgroup_cancel_charge(pages[i], memcg,
  						false);
b9bbfbe30   Andrea Arcangeli   thp: memcg huge m...
1188
1189
  				put_page(pages[i]);
  			}
71e3aac07   Andrea Arcangeli   thp: transparent ...
1190
1191
1192
1193
  			kfree(pages);
  			ret |= VM_FAULT_OOM;
  			goto out;
  		}
00501b531   Johannes Weiner   mm: memcontrol: r...
1194
  		set_page_private(pages[i], (unsigned long)memcg);
71e3aac07   Andrea Arcangeli   thp: transparent ...
1195
1196
1197
1198
  	}
  
  	for (i = 0; i < HPAGE_PMD_NR; i++) {
  		copy_user_highpage(pages[i], page + i,
0089e4853   Hillf Danton   mm/huge_memory: f...
1199
  				   haddr + PAGE_SIZE * i, vma);
71e3aac07   Andrea Arcangeli   thp: transparent ...
1200
1201
1202
  		__SetPageUptodate(pages[i]);
  		cond_resched();
  	}
7269f9999   Jérôme Glisse   mm/mmu_notifier: ...
1203
1204
  	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
  				haddr, haddr + HPAGE_PMD_SIZE);
ac46d4f3c   Jérôme Glisse   mm/mmu_notifier: ...
1205
  	mmu_notifier_invalidate_range_start(&range);
2ec74c3ef   Sagi Grimberg   mm: move all mmu ...
1206

82b0f8c39   Jan Kara   mm: join struct f...
1207
1208
  	vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
  	if (unlikely(!pmd_same(*vmf->pmd, orig_pmd)))
71e3aac07   Andrea Arcangeli   thp: transparent ...
1209
  		goto out_free_pages;
309381fea   Sasha Levin   mm: dump page whe...
1210
  	VM_BUG_ON_PAGE(!PageHead(page), page);
71e3aac07   Andrea Arcangeli   thp: transparent ...
1211

0f10851ea   Jérôme Glisse   mm/mmu_notifier: ...
1212
1213
1214
1215
1216
1217
  	/*
  	 * Leave pmd empty until the ptes are filled. Note that we must notify
  	 * here, as a concurrent CPU thread might write to the new pages before
  	 * mmu_notifier_invalidate_range_end() is called, which can lead to a
  	 * device seeing memory writes in a different order than the CPU.
  	 *
ad56b738c   Mike Rapoport   docs/vm: rename d...
1218
  	 * See Documentation/vm/mmu_notifier.rst
0f10851ea   Jérôme Glisse   mm/mmu_notifier: ...
1219
  	 */
82b0f8c39   Jan Kara   mm: join struct f...
1220
  	pmdp_huge_clear_flush_notify(vma, haddr, vmf->pmd);
71e3aac07   Andrea Arcangeli   thp: transparent ...
1221

82b0f8c39   Jan Kara   mm: join struct f...
1222
  	pgtable = pgtable_trans_huge_withdraw(vma->vm_mm, vmf->pmd);
bae473a42   Kirill A. Shutemov   mm: introduce fau...
1223
  	pmd_populate(vma->vm_mm, &_pmd, pgtable);
71e3aac07   Andrea Arcangeli   thp: transparent ...
1224
1225
  
  	for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
bae473a42   Kirill A. Shutemov   mm: introduce fau...
1226
  		pte_t entry;
71e3aac07   Andrea Arcangeli   thp: transparent ...
1227
1228
  		entry = mk_pte(pages[i], vma->vm_page_prot);
  		entry = maybe_mkwrite(pte_mkdirty(entry), vma);
00501b531   Johannes Weiner   mm: memcontrol: r...
1229
1230
  		memcg = (void *)page_private(pages[i]);
  		set_page_private(pages[i], 0);
82b0f8c39   Jan Kara   mm: join struct f...
1231
  		page_add_new_anon_rmap(pages[i], vmf->vma, haddr, false);
f627c2f53   Kirill A. Shutemov   memcg: adjust to ...
1232
  		mem_cgroup_commit_charge(pages[i], memcg, false, false);
00501b531   Johannes Weiner   mm: memcontrol: r...
1233
  		lru_cache_add_active_or_unevictable(pages[i], vma);
82b0f8c39   Jan Kara   mm: join struct f...
1234
1235
1236
1237
  		vmf->pte = pte_offset_map(&_pmd, haddr);
  		VM_BUG_ON(!pte_none(*vmf->pte));
  		set_pte_at(vma->vm_mm, haddr, vmf->pte, entry);
  		pte_unmap(vmf->pte);
71e3aac07   Andrea Arcangeli   thp: transparent ...
1238
1239
  	}
  	kfree(pages);
71e3aac07   Andrea Arcangeli   thp: transparent ...
1240
  	smp_wmb(); /* make pte visible before pmd */
82b0f8c39   Jan Kara   mm: join struct f...
1241
  	pmd_populate(vma->vm_mm, vmf->pmd, pgtable);
d281ee614   Kirill A. Shutemov   rmap: add argumen...
1242
  	page_remove_rmap(page, true);
82b0f8c39   Jan Kara   mm: join struct f...
1243
  	spin_unlock(vmf->ptl);
71e3aac07   Andrea Arcangeli   thp: transparent ...
1244

4645b9fe8   Jérôme Glisse   mm/mmu_notifier: ...
1245
1246
1247
1248
  	/*
  	 * No need to double call mmu_notifier->invalidate_range() callback as
  	 * the above pmdp_huge_clear_flush_notify() did already call it.
  	 */
ac46d4f3c   Jérôme Glisse   mm/mmu_notifier: ...
1249
  	mmu_notifier_invalidate_range_only_end(&range);
2ec74c3ef   Sagi Grimberg   mm: move all mmu ...
1250

71e3aac07   Andrea Arcangeli   thp: transparent ...
1251
1252
1253
1254
1255
1256
1257
  	ret |= VM_FAULT_WRITE;
  	put_page(page);
  
  out:
  	return ret;
  
  out_free_pages:
82b0f8c39   Jan Kara   mm: join struct f...
1258
  	spin_unlock(vmf->ptl);
ac46d4f3c   Jérôme Glisse   mm/mmu_notifier: ...
1259
  	mmu_notifier_invalidate_range_end(&range);
b9bbfbe30   Andrea Arcangeli   thp: memcg huge m...
1260
  	for (i = 0; i < HPAGE_PMD_NR; i++) {
00501b531   Johannes Weiner   mm: memcontrol: r...
1261
1262
  		memcg = (void *)page_private(pages[i]);
  		set_page_private(pages[i], 0);
f627c2f53   Kirill A. Shutemov   memcg: adjust to ...
1263
  		mem_cgroup_cancel_charge(pages[i], memcg, false);
71e3aac07   Andrea Arcangeli   thp: transparent ...
1264
  		put_page(pages[i]);
b9bbfbe30   Andrea Arcangeli   thp: memcg huge m...
1265
  	}
71e3aac07   Andrea Arcangeli   thp: transparent ...
1266
1267
1268
  	kfree(pages);
  	goto out;
  }
2b7403035   Souptick Joarder   mm: Change return...
1269
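  /*
   * Handle a write (COW) fault on a huge pmd: reuse the existing page when
   * it is not shared, otherwise try to allocate and charge a fresh THP and
   * copy into it, falling back to splitting the pmd when that fails.
   */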
  vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf, pmd_t orig_pmd)
71e3aac07   Andrea Arcangeli   thp: transparent ...
1270
  {
82b0f8c39   Jan Kara   mm: join struct f...
1271
  	struct vm_area_struct *vma = vmf->vma;
93b4796de   Kirill A. Shutemov   thp: do_huge_pmd_...
1272
  	struct page *page = NULL, *new_page;
00501b531   Johannes Weiner   mm: memcontrol: r...
1273
  	struct mem_cgroup *memcg;
82b0f8c39   Jan Kara   mm: join struct f...
1274
  	unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
ac46d4f3c   Jérôme Glisse   mm/mmu_notifier: ...
1275
  	struct mmu_notifier_range range;
3b3636924   Michal Hocko   mm, memcg: sync a...
1276
  	gfp_t huge_gfp;			/* for allocation and charge */
2b7403035   Souptick Joarder   mm: Change return...
1277
  	vm_fault_t ret = 0;
71e3aac07   Andrea Arcangeli   thp: transparent ...
1278

82b0f8c39   Jan Kara   mm: join struct f...
1279
  	vmf->ptl = pmd_lockptr(vma->vm_mm, vmf->pmd);
81d1b09c6   Sasha Levin   mm: convert a few...
1280
  	VM_BUG_ON_VMA(!vma->anon_vma, vma);
93b4796de   Kirill A. Shutemov   thp: do_huge_pmd_...
1281
1282
  	if (is_huge_zero_pmd(orig_pmd))
  		goto alloc;
82b0f8c39   Jan Kara   mm: join struct f...
1283
1284
  	spin_lock(vmf->ptl);
  	if (unlikely(!pmd_same(*vmf->pmd, orig_pmd)))
71e3aac07   Andrea Arcangeli   thp: transparent ...
1285
1286
1287
  		goto out_unlock;
  
  	page = pmd_page(orig_pmd);
309381fea   Sasha Levin   mm: dump page whe...
1288
  	VM_BUG_ON_PAGE(!PageCompound(page) || !PageHead(page), page);
1f25fe20a   Kirill A. Shutemov   mm, thp: adjust c...
1289
1290
  	/*
  	 * We can only reuse the page if nobody else maps the huge page or its
6d0a07edd   Andrea Arcangeli   mm: thp: calculat...
1291
  	 * part.
1f25fe20a   Kirill A. Shutemov   mm, thp: adjust c...
1292
  	 */
ba3c4ce6d   Huang Ying   mm, THP, swap: ma...
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
  	if (!trylock_page(page)) {
  		get_page(page);
  		spin_unlock(vmf->ptl);
  		lock_page(page);
  		spin_lock(vmf->ptl);
  		if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) {
  			unlock_page(page);
  			put_page(page);
  			goto out_unlock;
  		}
  		put_page(page);
  	}
  	if (reuse_swap_page(page, NULL)) {
71e3aac07   Andrea Arcangeli   thp: transparent ...
1306
1307
  		pmd_t entry;
  		entry = pmd_mkyoung(orig_pmd);
f55e1014f   Linus Torvalds   Revert "mm, thp: ...
1308
  		entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
82b0f8c39   Jan Kara   mm: join struct f...
1309
1310
  		if (pmdp_set_access_flags(vma, haddr, vmf->pmd, entry,  1))
  			update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
71e3aac07   Andrea Arcangeli   thp: transparent ...
1311
  		ret |= VM_FAULT_WRITE;
ba3c4ce6d   Huang Ying   mm, THP, swap: ma...
1312
  		unlock_page(page);
71e3aac07   Andrea Arcangeli   thp: transparent ...
1313
1314
  		goto out_unlock;
  	}
ba3c4ce6d   Huang Ying   mm, THP, swap: ma...
1315
  	unlock_page(page);
ddc58f27f   Kirill A. Shutemov   mm: drop tail pag...
1316
  	get_page(page);
82b0f8c39   Jan Kara   mm: join struct f...
1317
  	spin_unlock(vmf->ptl);
93b4796de   Kirill A. Shutemov   thp: do_huge_pmd_...
1318
  alloc:
7635d9cbe   Michal Hocko   mm, thp, proc: re...
1319
  	if (__transparent_hugepage_enabled(vma) &&
077fcf116   Aneesh Kumar K.V   mm/thp: allocate ...
1320
  	    !transparent_hugepage_debug_cow()) {
19deb7695   David Rientjes   Revert "Revert "R...
1321
1322
  		huge_gfp = alloc_hugepage_direct_gfpmask(vma);
  		new_page = alloc_hugepage_vma(huge_gfp, vma, haddr, HPAGE_PMD_ORDER);
077fcf116   Aneesh Kumar K.V   mm/thp: allocate ...
1323
  	} else
71e3aac07   Andrea Arcangeli   thp: transparent ...
1324
  		new_page = NULL;
9a982250f   Kirill A. Shutemov   thp: introduce de...
1325
1326
1327
  	if (likely(new_page)) {
  		prep_transhuge_page(new_page);
  	} else {
eecc1e426   Hugh Dickins   thp: fix copy_pag...
1328
  		if (!page) {
82b0f8c39   Jan Kara   mm: join struct f...
1329
  			split_huge_pmd(vma, vmf->pmd, vmf->address);
e9b71ca91   Kirill A. Shutemov   mm, thp: drop do_...
1330
  			ret |= VM_FAULT_FALLBACK;
93b4796de   Kirill A. Shutemov   thp: do_huge_pmd_...
1331
  		} else {
82b0f8c39   Jan Kara   mm: join struct f...
1332
  			ret = do_huge_pmd_wp_page_fallback(vmf, orig_pmd, page);
9845cbbd1   Kirill A. Shutemov   mm, thp: fix infi...
1333
  			if (ret & VM_FAULT_OOM) {
82b0f8c39   Jan Kara   mm: join struct f...
1334
  				split_huge_pmd(vma, vmf->pmd, vmf->address);
9845cbbd1   Kirill A. Shutemov   mm, thp: fix infi...
1335
1336
  				ret |= VM_FAULT_FALLBACK;
  			}
ddc58f27f   Kirill A. Shutemov   mm: drop tail pag...
1337
  			put_page(page);
93b4796de   Kirill A. Shutemov   thp: do_huge_pmd_...
1338
  		}
17766dde3   David Rientjes   mm, thp: count th...
1339
  		count_vm_event(THP_FAULT_FALLBACK);
71e3aac07   Andrea Arcangeli   thp: transparent ...
1340
1341
  		goto out;
  	}
2cf855837   Tejun Heo   memcontrol: sched...
1342
  	if (unlikely(mem_cgroup_try_charge_delay(new_page, vma->vm_mm,
2a70f6a76   Michal Hocko   memcg, thp: do no...
1343
  					huge_gfp, &memcg, true))) {
b9bbfbe30   Andrea Arcangeli   thp: memcg huge m...
1344
  		put_page(new_page);
82b0f8c39   Jan Kara   mm: join struct f...
1345
  		split_huge_pmd(vma, vmf->pmd, vmf->address);
bae473a42   Kirill A. Shutemov   mm: introduce fau...
1346
  		if (page)
ddc58f27f   Kirill A. Shutemov   mm: drop tail pag...
1347
  			put_page(page);
9845cbbd1   Kirill A. Shutemov   mm, thp: fix infi...
1348
  		ret |= VM_FAULT_FALLBACK;
17766dde3   David Rientjes   mm, thp: count th...
1349
  		count_vm_event(THP_FAULT_FALLBACK);
b9bbfbe30   Andrea Arcangeli   thp: memcg huge m...
1350
1351
  		goto out;
  	}
17766dde3   David Rientjes   mm, thp: count th...
1352
  	count_vm_event(THP_FAULT_ALLOC);
1ff9e6e17   Chris Down   mm: memcontrol: e...
1353
  	count_memcg_events(memcg, THP_FAULT_ALLOC, 1);
17766dde3   David Rientjes   mm, thp: count th...
1354

eecc1e426   Hugh Dickins   thp: fix copy_pag...
1355
  	if (!page)
c79b57e46   Huang Ying   mm: hugetlb: clea...
1356
  		clear_huge_page(new_page, vmf->address, HPAGE_PMD_NR);
93b4796de   Kirill A. Shutemov   thp: do_huge_pmd_...
1357
  	else
c9f4cd713   Huang Ying   mm, huge page: co...
1358
1359
  		copy_user_huge_page(new_page, page, vmf->address,
  				    vma, HPAGE_PMD_NR);
71e3aac07   Andrea Arcangeli   thp: transparent ...
1360
  	__SetPageUptodate(new_page);
7269f9999   Jérôme Glisse   mm/mmu_notifier: ...
1361
1362
  	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
  				haddr, haddr + HPAGE_PMD_SIZE);
ac46d4f3c   Jérôme Glisse   mm/mmu_notifier: ...
1363
  	mmu_notifier_invalidate_range_start(&range);
2ec74c3ef   Sagi Grimberg   mm: move all mmu ...
1364

82b0f8c39   Jan Kara   mm: join struct f...
1365
  	spin_lock(vmf->ptl);
93b4796de   Kirill A. Shutemov   thp: do_huge_pmd_...
1366
  	if (page)
ddc58f27f   Kirill A. Shutemov   mm: drop tail pag...
1367
  		put_page(page);
82b0f8c39   Jan Kara   mm: join struct f...
1368
1369
  	if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) {
  		spin_unlock(vmf->ptl);
f627c2f53   Kirill A. Shutemov   memcg: adjust to ...
1370
  		mem_cgroup_cancel_charge(new_page, memcg, true);
71e3aac07   Andrea Arcangeli   thp: transparent ...
1371
  		put_page(new_page);
2ec74c3ef   Sagi Grimberg   mm: move all mmu ...
1372
  		goto out_mn;
b9bbfbe30   Andrea Arcangeli   thp: memcg huge m...
1373
  	} else {
71e3aac07   Andrea Arcangeli   thp: transparent ...
1374
  		pmd_t entry;
3122359a6   Kirill A. Shutemov   thp: move maybe_p...
1375
  		entry = mk_huge_pmd(new_page, vma->vm_page_prot);
f55e1014f   Linus Torvalds   Revert "mm, thp: ...
1376
  		entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
82b0f8c39   Jan Kara   mm: join struct f...
1377
  		pmdp_huge_clear_flush_notify(vma, haddr, vmf->pmd);
d281ee614   Kirill A. Shutemov   rmap: add argumen...
1378
  		page_add_new_anon_rmap(new_page, vma, haddr, true);
f627c2f53   Kirill A. Shutemov   memcg: adjust to ...
1379
  		mem_cgroup_commit_charge(new_page, memcg, false, true);
00501b531   Johannes Weiner   mm: memcontrol: r...
1380
  		lru_cache_add_active_or_unevictable(new_page, vma);
82b0f8c39   Jan Kara   mm: join struct f...
1381
1382
  		set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry);
  		update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
eecc1e426   Hugh Dickins   thp: fix copy_pag...
1383
  		if (!page) {
bae473a42   Kirill A. Shutemov   mm: introduce fau...
1384
  			add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR);
97ae17497   Kirill A. Shutemov   thp: implement re...
1385
  		} else {
309381fea   Sasha Levin   mm: dump page whe...
1386
  			VM_BUG_ON_PAGE(!PageHead(page), page);
d281ee614   Kirill A. Shutemov   rmap: add argumen...
1387
  			page_remove_rmap(page, true);
93b4796de   Kirill A. Shutemov   thp: do_huge_pmd_...
1388
1389
  			put_page(page);
  		}
71e3aac07   Andrea Arcangeli   thp: transparent ...
1390
1391
  		ret |= VM_FAULT_WRITE;
  	}
82b0f8c39   Jan Kara   mm: join struct f...
1392
  	spin_unlock(vmf->ptl);
2ec74c3ef   Sagi Grimberg   mm: move all mmu ...
1393
  out_mn:
4645b9fe8   Jérôme Glisse   mm/mmu_notifier: ...
1394
1395
1396
1397
  	/*
  	 * No need to double call mmu_notifier->invalidate_range() callback as
  	 * the above pmdp_huge_clear_flush_notify() did already call it.
  	 */
ac46d4f3c   Jérôme Glisse   mm/mmu_notifier: ...
1398
  	mmu_notifier_invalidate_range_only_end(&range);
71e3aac07   Andrea Arcangeli   thp: transparent ...
1399
1400
  out:
  	return ret;
2ec74c3ef   Sagi Grimberg   mm: move all mmu ...
1401
  out_unlock:
82b0f8c39   Jan Kara   mm: join struct f...
1402
  	spin_unlock(vmf->ptl);
2ec74c3ef   Sagi Grimberg   mm: move all mmu ...
1403
  	return ret;
71e3aac07   Andrea Arcangeli   thp: transparent ...
1404
  }
8310d48b1   Keno Fischer   mm/huge_memory.c:...
1405
1406
1407
1408
1409
1410
  /*
   * FOLL_FORCE can write to even unwritable pmds, but only
   * after we've gone through a COW cycle and they are dirty.
   */
  static inline bool can_follow_write_pmd(pmd_t pmd, unsigned int flags)
  {
f6f373216   Linus Torvalds   Revert "mm: repla...
1411
  	return pmd_write(pmd) ||
8310d48b1   Keno Fischer   mm/huge_memory.c:...
1412
1413
  	       ((flags & FOLL_FORCE) && (flags & FOLL_COW) && pmd_dirty(pmd));
  }
b676b293f   David Rientjes   mm, thp: fix mapp...
1414
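  /*
   * GUP lookup for a regular transparent huge pmd. Called with the pmd lock
   * held; handles the FOLL_WRITE/FOLL_FORCE, FOLL_NUMA, FOLL_TOUCH and
   * FOLL_MLOCK cases and returns the subpage that maps @addr.
   */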
  struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
71e3aac07   Andrea Arcangeli   thp: transparent ...
1415
1416
1417
1418
  				   unsigned long addr,
  				   pmd_t *pmd,
  				   unsigned int flags)
  {
b676b293f   David Rientjes   mm, thp: fix mapp...
1419
  	struct mm_struct *mm = vma->vm_mm;
71e3aac07   Andrea Arcangeli   thp: transparent ...
1420
  	struct page *page = NULL;
c4088ebdc   Kirill A. Shutemov   mm: convert the r...
1421
  	assert_spin_locked(pmd_lockptr(mm, pmd));
71e3aac07   Andrea Arcangeli   thp: transparent ...
1422

8310d48b1   Keno Fischer   mm/huge_memory.c:...
1423
  	if (flags & FOLL_WRITE && !can_follow_write_pmd(*pmd, flags))
71e3aac07   Andrea Arcangeli   thp: transparent ...
1424
  		goto out;
85facf257   Kirill A. Shutemov   thp: avoid dumpin...
1425
1426
1427
  	/* Avoid dumping huge zero page */
  	if ((flags & FOLL_DUMP) && is_huge_zero_pmd(*pmd))
  		return ERR_PTR(-EFAULT);
2b4847e73   Mel Gorman   mm: numa: seriali...
1428
  	/* Full NUMA hinting faults to serialise migration in fault paths */
8a0516ed8   Mel Gorman   mm: convert p[te|...
1429
  	if ((flags & FOLL_NUMA) && pmd_protnone(*pmd))
2b4847e73   Mel Gorman   mm: numa: seriali...
1430
  		goto out;
71e3aac07   Andrea Arcangeli   thp: transparent ...
1431
  	page = pmd_page(*pmd);
ca120cf68   Dan Williams   mm: fix show_smap...
1432
  	VM_BUG_ON_PAGE(!PageHead(page) && !is_zone_device_page(page), page);
3565fce3a   Dan Williams   mm, x86: get_user...
1433
  	if (flags & FOLL_TOUCH)
a8f973664   Kirill A. Shutemov   mm, thp: Do not m...
1434
  		touch_pmd(vma, addr, pmd, flags);
de60f5f10   Eric B Munson   mm: introduce VM_...
1435
  	if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) {
e90309c9f   Kirill A. Shutemov   thp: allow mlocke...
1436
1437
1438
1439
  		/*
  		 * We don't mlock() pte-mapped THPs. This way we can avoid
  		 * leaking mlocked pages into non-VM_LOCKED VMAs.
  		 *
9a73f61bd   Kirill A. Shutemov   thp, mlock: do no...
1440
1441
  		 * For anon THP:
  		 *
e90309c9f   Kirill A. Shutemov   thp: allow mlocke...
1442
1443
1444
1445
1446
1447
1448
  		 * In most cases the pmd is the only mapping of the page as we
  		 * break COW for the mlock() -- see gup_flags |= FOLL_WRITE for
  		 * writable private mappings in populate_vma_page_range().
  		 *
  		 * The only scenario in which the page is shared here is when we
  		 * are mlocking a read-only mapping shared over fork(). We skip
  		 * mlocking such pages.
9a73f61bd   Kirill A. Shutemov   thp, mlock: do no...
1449
1450
1451
1452
1453
1454
  		 *
  		 * For file THP:
  		 *
  		 * We can expect PageDoubleMap() to be stable under page lock:
  		 * for file pages we set it in page_add_file_rmap(), which
  		 * requires page to be locked.
e90309c9f   Kirill A. Shutemov   thp: allow mlocke...
1455
  		 */
9a73f61bd   Kirill A. Shutemov   thp, mlock: do no...
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
  
  		if (PageAnon(page) && compound_mapcount(page) != 1)
  			goto skip_mlock;
  		if (PageDoubleMap(page) || !page->mapping)
  			goto skip_mlock;
  		if (!trylock_page(page))
  			goto skip_mlock;
  		lru_add_drain();
  		if (page->mapping && !PageDoubleMap(page))
  			mlock_vma_page(page);
  		unlock_page(page);
b676b293f   David Rientjes   mm, thp: fix mapp...
1467
  	}
9a73f61bd   Kirill A. Shutemov   thp, mlock: do no...
1468
  skip_mlock:
71e3aac07   Andrea Arcangeli   thp: transparent ...
1469
  	page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT;
ca120cf68   Dan Williams   mm: fix show_smap...
1470
  	VM_BUG_ON_PAGE(!PageCompound(page) && !is_zone_device_page(page), page);
71e3aac07   Andrea Arcangeli   thp: transparent ...
1471
  	if (flags & FOLL_GET)
ddc58f27f   Kirill A. Shutemov   mm: drop tail pag...
1472
  		get_page(page);
71e3aac07   Andrea Arcangeli   thp: transparent ...
1473
1474
1475
1476
  
  out:
  	return page;
  }
d10e63f29   Mel Gorman   mm: numa: Create ...
1477
  /* NUMA hinting page fault entry point for trans huge pmds */
2b7403035   Souptick Joarder   mm: Change return...
1478
  vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t pmd)
d10e63f29   Mel Gorman   mm: numa: Create ...
1479
  {
82b0f8c39   Jan Kara   mm: join struct f...
1480
  	struct vm_area_struct *vma = vmf->vma;
b8916634b   Mel Gorman   mm: Prevent paral...
1481
  	struct anon_vma *anon_vma = NULL;
b32967ff1   Mel Gorman   mm: numa: Add THP...
1482
  	struct page *page;
82b0f8c39   Jan Kara   mm: join struct f...
1483
  	unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
98fa15f34   Anshuman Khandual   mm: replace all o...
1484
  	int page_nid = NUMA_NO_NODE, this_nid = numa_node_id();
90572890d   Peter Zijlstra   mm: numa: Change ...
1485
  	int target_nid, last_cpupid = -1;
8191acbd3   Mel Gorman   mm: numa: Sanitiz...
1486
1487
  	bool page_locked;
  	bool migrated = false;
b191f9b10   Mel Gorman   mm: numa: preserv...
1488
  	bool was_writable;
6688cc054   Peter Zijlstra   mm: numa: Do not ...
1489
  	int flags = 0;
d10e63f29   Mel Gorman   mm: numa: Create ...
1490

82b0f8c39   Jan Kara   mm: join struct f...
1491
1492
  	vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
  	if (unlikely(!pmd_same(pmd, *vmf->pmd)))
d10e63f29   Mel Gorman   mm: numa: Create ...
1493
  		goto out_unlock;
de466bd62   Mel Gorman   mm: numa: avoid u...
1494
1495
1496
1497
1498
  	/*
  	 * If there are potential migrations, wait for completion and retry
  	 * without disrupting NUMA hinting information. Do not relock and
  	 * check_same as the page may no longer be mapped.
  	 */
82b0f8c39   Jan Kara   mm: join struct f...
1499
1500
  	if (unlikely(pmd_trans_migrating(*vmf->pmd))) {
  		page = pmd_page(*vmf->pmd);
3c226c637   Mark Rutland   mm: numa: avoid w...
1501
1502
  		if (!get_page_unless_zero(page))
  			goto out_unlock;
82b0f8c39   Jan Kara   mm: join struct f...
1503
  		spin_unlock(vmf->ptl);
9a1ea439b   Hugh Dickins   mm: put_and_wait_...
1504
  		put_and_wait_on_page_locked(page);
de466bd62   Mel Gorman   mm: numa: avoid u...
1505
1506
  		goto out;
  	}
d10e63f29   Mel Gorman   mm: numa: Create ...
1507
  	page = pmd_page(pmd);
a1a46184e   Mel Gorman   mm: numa: Do not ...
1508
  	BUG_ON(is_huge_zero_page(page));
8191acbd3   Mel Gorman   mm: numa: Sanitiz...
1509
  	page_nid = page_to_nid(page);
90572890d   Peter Zijlstra   mm: numa: Change ...
1510
  	last_cpupid = page_cpupid_last(page);
03c5a6e16   Mel Gorman   mm: numa: Add pte...
1511
  	count_vm_numa_event(NUMA_HINT_FAULTS);
04bb2f947   Rik van Riel   sched/numa: Adjus...
1512
  	if (page_nid == this_nid) {
03c5a6e16   Mel Gorman   mm: numa: Add pte...
1513
  		count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL);
04bb2f947   Rik van Riel   sched/numa: Adjus...
1514
1515
  		flags |= TNF_FAULT_LOCAL;
  	}
4daae3b4b   Mel Gorman   mm: mempolicy: Us...
1516

bea66fbd1   Mel Gorman   mm: numa: group r...
1517
  	/* See similar comment in do_numa_page for explanation */
288bc5494   Aneesh Kumar K.V   mm/autonuma: let ...
1518
  	if (!pmd_savedwrite(pmd))
6688cc054   Peter Zijlstra   mm: numa: Do not ...
1519
1520
1521
  		flags |= TNF_NO_GROUP;
  
  	/*
ff9042b11   Mel Gorman   mm: Wait for THP ...
1522
1523
1524
  	 * Acquire the page lock to serialise THP migrations but avoid dropping
  	 * page_table_lock if at all possible
  	 */
b8916634b   Mel Gorman   mm: Prevent paral...
1525
1526
  	page_locked = trylock_page(page);
  	target_nid = mpol_misplaced(page, vma, haddr);
98fa15f34   Anshuman Khandual   mm: replace all o...
1527
  	if (target_nid == NUMA_NO_NODE) {
b8916634b   Mel Gorman   mm: Prevent paral...
1528
  		/* If the page was locked, there are no parallel migrations */
a54a407fb   Mel Gorman   mm: Close races b...
1529
  		if (page_locked)
b8916634b   Mel Gorman   mm: Prevent paral...
1530
  			goto clear_pmdnuma;
2b4847e73   Mel Gorman   mm: numa: seriali...
1531
  	}
4daae3b4b   Mel Gorman   mm: mempolicy: Us...
1532

de466bd62   Mel Gorman   mm: numa: avoid u...
1533
  	/* Migration could have started since the pmd_trans_migrating check */
2b4847e73   Mel Gorman   mm: numa: seriali...
1534
  	if (!page_locked) {
98fa15f34   Anshuman Khandual   mm: replace all o...
1535
  		page_nid = NUMA_NO_NODE;
3c226c637   Mark Rutland   mm: numa: avoid w...
1536
1537
  		if (!get_page_unless_zero(page))
  			goto out_unlock;
82b0f8c39   Jan Kara   mm: join struct f...
1538
  		spin_unlock(vmf->ptl);
9a1ea439b   Hugh Dickins   mm: put_and_wait_...
1539
  		put_and_wait_on_page_locked(page);
b8916634b   Mel Gorman   mm: Prevent paral...
1540
1541
  		goto out;
  	}
2b4847e73   Mel Gorman   mm: numa: seriali...
1542
1543
1544
1545
  	/*
  	 * Page is misplaced. Page lock serialises migrations. Acquire anon_vma
  	 * to serialise splits
  	 */
b8916634b   Mel Gorman   mm: Prevent paral...
1546
  	get_page(page);
82b0f8c39   Jan Kara   mm: join struct f...
1547
  	spin_unlock(vmf->ptl);
b8916634b   Mel Gorman   mm: Prevent paral...
1548
  	anon_vma = page_lock_anon_vma_read(page);
4daae3b4b   Mel Gorman   mm: mempolicy: Us...
1549

c69307d53   Peter Zijlstra   sched/numa: Fix c...
1550
  	/* Confirm the PMD did not change while page_table_lock was released */
82b0f8c39   Jan Kara   mm: join struct f...
1551
1552
  	spin_lock(vmf->ptl);
  	if (unlikely(!pmd_same(pmd, *vmf->pmd))) {
b32967ff1   Mel Gorman   mm: numa: Add THP...
1553
1554
  		unlock_page(page);
  		put_page(page);
98fa15f34   Anshuman Khandual   mm: replace all o...
1555
  		page_nid = NUMA_NO_NODE;
4daae3b4b   Mel Gorman   mm: mempolicy: Us...
1556
  		goto out_unlock;
b32967ff1   Mel Gorman   mm: numa: Add THP...
1557
  	}
ff9042b11   Mel Gorman   mm: Wait for THP ...
1558

c3a489cac   Mel Gorman   mm: numa: ensure ...
1559
1560
1561
  	/* Bail if we fail to protect against THP splits for any reason */
  	if (unlikely(!anon_vma)) {
  		put_page(page);
98fa15f34   Anshuman Khandual   mm: replace all o...
1562
  		page_nid = NUMA_NO_NODE;
c3a489cac   Mel Gorman   mm: numa: ensure ...
1563
1564
  		goto clear_pmdnuma;
  	}
a54a407fb   Mel Gorman   mm: Close races b...
1565
  	/*
8b1b436dd   Peter Zijlstra   mm, locking: Rewo...
1566
1567
1568
1569
1570
  	 * Since we took the NUMA fault, we must have observed the !accessible
  	 * bit. Make sure all other CPUs agree with that, to avoid them
  	 * modifying the page we're about to migrate.
  	 *
  	 * Must be done under PTL such that we'll observe the relevant
ccde85ba0   Peter Zijlstra   mm, locking: Fix ...
1571
1572
1573
1574
  	 * inc_tlb_flush_pending().
  	 *
  	 * We are not sure a pending tlb flush here is for a huge page
  	 * mapping or not. Hence use the tlb range variant
8b1b436dd   Peter Zijlstra   mm, locking: Rewo...
1575
  	 */
7066f0f93   Andrea Arcangeli   mm: thp: fix mmu_...
1576
  	if (mm_tlb_flush_pending(vma->vm_mm)) {
ccde85ba0   Peter Zijlstra   mm, locking: Fix ...
1577
  		flush_tlb_range(vma, haddr, haddr + HPAGE_PMD_SIZE);
7066f0f93   Andrea Arcangeli   mm: thp: fix mmu_...
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
  		/*
  		 * change_huge_pmd() released the pmd lock before
  		 * invalidating the secondary MMUs sharing the primary
  		 * MMU pagetables (with ->invalidate_range()). The
  		 * mmu_notifier_invalidate_range_end() (which
  		 * internally calls ->invalidate_range()) in
  		 * change_pmd_range() will run after us, so we can't
  		 * rely on it here and we need an explicit invalidate.
  		 */
  		mmu_notifier_invalidate_range(vma->vm_mm, haddr,
  					      haddr + HPAGE_PMD_SIZE);
  	}
8b1b436dd   Peter Zijlstra   mm, locking: Rewo...
1590
1591
  
  	/*
a54a407fb   Mel Gorman   mm: Close races b...
1592
  	 * Migrate the THP to the requested node, returns with page unlocked
8a0516ed8   Mel Gorman   mm: convert p[te|...
1593
  	 * and access rights restored.
a54a407fb   Mel Gorman   mm: Close races b...
1594
  	 */
82b0f8c39   Jan Kara   mm: join struct f...
1595
  	spin_unlock(vmf->ptl);
8b1b436dd   Peter Zijlstra   mm, locking: Rewo...
1596

bae473a42   Kirill A. Shutemov   mm: introduce fau...
1597
  	migrated = migrate_misplaced_transhuge_page(vma->vm_mm, vma,
82b0f8c39   Jan Kara   mm: join struct f...
1598
  				vmf->pmd, pmd, vmf->address, page, target_nid);
6688cc054   Peter Zijlstra   mm: numa: Do not ...
1599
1600
  	if (migrated) {
  		flags |= TNF_MIGRATED;
8191acbd3   Mel Gorman   mm: numa: Sanitiz...
1601
  		page_nid = target_nid;
074c23817   Mel Gorman   mm: numa: slow PT...
1602
1603
  	} else
  		flags |= TNF_MIGRATE_FAIL;
b32967ff1   Mel Gorman   mm: numa: Add THP...
1604

8191acbd3   Mel Gorman   mm: numa: Sanitiz...
1605
  	goto out;
b32967ff1   Mel Gorman   mm: numa: Add THP...
1606
  clear_pmdnuma:
a54a407fb   Mel Gorman   mm: Close races b...
1607
  	BUG_ON(!PageLocked(page));
288bc5494   Aneesh Kumar K.V   mm/autonuma: let ...
1608
  	was_writable = pmd_savedwrite(pmd);
4d9424669   Mel Gorman   mm: convert p[te|...
1609
  	pmd = pmd_modify(pmd, vma->vm_page_prot);
b7b04004e   Mel Gorman   mm: numa: mark hu...
1610
  	pmd = pmd_mkyoung(pmd);
b191f9b10   Mel Gorman   mm: numa: preserv...
1611
1612
  	if (was_writable)
  		pmd = pmd_mkwrite(pmd);
82b0f8c39   Jan Kara   mm: join struct f...
1613
1614
  	set_pmd_at(vma->vm_mm, haddr, vmf->pmd, pmd);
  	update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
a54a407fb   Mel Gorman   mm: Close races b...
1615
  	unlock_page(page);
d10e63f29   Mel Gorman   mm: numa: Create ...
1616
  out_unlock:
82b0f8c39   Jan Kara   mm: join struct f...
1617
  	spin_unlock(vmf->ptl);
b8916634b   Mel Gorman   mm: Prevent paral...
1618
1619
1620
1621
  
  out:
  	if (anon_vma)
  		page_unlock_anon_vma_read(anon_vma);
98fa15f34   Anshuman Khandual   mm: replace all o...
1622
  	if (page_nid != NUMA_NO_NODE)
82b0f8c39   Jan Kara   mm: join struct f...
1623
  		task_numa_fault(last_cpupid, page_nid, HPAGE_PMD_NR,
9a8b300f2   Aneesh Kumar K.V   mm/thp/autonuma: ...
1624
  				flags);
8191acbd3   Mel Gorman   mm: numa: Sanitiz...
1625

d10e63f29   Mel Gorman   mm: numa: Create ...
1626
1627
  	return 0;
  }
319904ad4   Huang Ying   mm, THP: clean up...
1628
1629
1630
1631
1632
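  /*
   * Illustrative usage sketch: this path is typically reached when
   * userspace calls madvise(MADV_FREE) on a THP-backed anonymous range
   * covering a whole huge pmd, e.g. (buf being a hypothetical 2MB-aligned
   * anonymous mapping, with HPAGE_PMD_SIZE == 2MB on x86-64):
   *
   *	madvise(buf, 2UL << 20, MADV_FREE);
   */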
  /*
   * Return true if we do MADV_FREE successfully on entire pmd page.
   * Otherwise, return false.
   */
  bool madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
b8d3c4c30   Minchan Kim   mm/huge_memory.c:...
1633
  		pmd_t *pmd, unsigned long addr, unsigned long next)
b8d3c4c30   Minchan Kim   mm/huge_memory.c:...
1634
1635
1636
1637
1638
  {
  	spinlock_t *ptl;
  	pmd_t orig_pmd;
  	struct page *page;
  	struct mm_struct *mm = tlb->mm;
319904ad4   Huang Ying   mm, THP: clean up...
1639
  	bool ret = false;
b8d3c4c30   Minchan Kim   mm/huge_memory.c:...
1640

ed6a79352   Peter Zijlstra   asm-generic/tlb, ...
1641
  	tlb_change_page_size(tlb, HPAGE_PMD_SIZE);
07e326610   Aneesh Kumar K.V   mm: add tlb_remov...
1642

b6ec57f4b   Kirill A. Shutemov   thp: change pmd_t...
1643
1644
  	ptl = pmd_trans_huge_lock(pmd, vma);
  	if (!ptl)
25eedabe0   Linus Torvalds   vm: fix incorrect...
1645
  		goto out_unlocked;
b8d3c4c30   Minchan Kim   mm/huge_memory.c:...
1646
1647
  
  	orig_pmd = *pmd;
319904ad4   Huang Ying   mm, THP: clean up...
1648
  	if (is_huge_zero_pmd(orig_pmd))
b8d3c4c30   Minchan Kim   mm/huge_memory.c:...
1649
  		goto out;
b8d3c4c30   Minchan Kim   mm/huge_memory.c:...
1650

84c3fc4e9   Zi Yan   mm: thp: check pm...
1651
1652
1653
1654
1655
  	if (unlikely(!pmd_present(orig_pmd))) {
  		VM_BUG_ON(thp_migration_supported() &&
  				  !is_pmd_migration_entry(orig_pmd));
  		goto out;
  	}
b8d3c4c30   Minchan Kim   mm/huge_memory.c:...
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
  	page = pmd_page(orig_pmd);
  	/*
  	 * If other processes are mapping this page, we can't discard
  	 * the page unless they all do MADV_FREE, so let's skip the page.
  	 */
  	if (page_mapcount(page) != 1)
  		goto out;
  
  	if (!trylock_page(page))
  		goto out;
  
  	/*
  	 * If the user wants to discard only part of the THP, split it so
  	 * MADV_FREE will deactivate just those pages.
  	 */
  	if (next - addr != HPAGE_PMD_SIZE) {
  		get_page(page);
  		spin_unlock(ptl);
9818b8cde   Huang Ying   madvise_free, thp...
1674
  		split_huge_page(page);
b8d3c4c30   Minchan Kim   mm/huge_memory.c:...
1675
  		unlock_page(page);
bbf29ffc7   Kirill A. Shutemov   thp, mm: fix cras...
1676
  		put_page(page);
b8d3c4c30   Minchan Kim   mm/huge_memory.c:...
1677
1678
1679
1680
1681
1682
  		goto out_unlocked;
  	}
  
  	if (PageDirty(page))
  		ClearPageDirty(page);
  	unlock_page(page);
b8d3c4c30   Minchan Kim   mm/huge_memory.c:...
1683
  	if (pmd_young(orig_pmd) || pmd_dirty(orig_pmd)) {
58ceeb6be   Kirill A. Shutemov   thp: fix MADV_DON...
1684
  		pmdp_invalidate(vma, addr, pmd);
b8d3c4c30   Minchan Kim   mm/huge_memory.c:...
1685
1686
1687
1688
1689
1690
  		orig_pmd = pmd_mkold(orig_pmd);
  		orig_pmd = pmd_mkclean(orig_pmd);
  
  		set_pmd_at(mm, addr, pmd, orig_pmd);
  		tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
  	}
802a3a92a   Shaohua Li   mm: reclaim MADV_...
1691
1692
  
  	mark_page_lazyfree(page);
319904ad4   Huang Ying   mm, THP: clean up...
1693
  	ret = true;
b8d3c4c30   Minchan Kim   mm/huge_memory.c:...
1694
1695
1696
1697
1698
  out:
  	spin_unlock(ptl);
  out_unlocked:
  	return ret;
  }
953c66c2b   Aneesh Kumar K.V   mm: THP page cach...
1699
1700
1701
1702
1703
1704
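  /*
   * Free the page table page that was deposited for this huge pmd and drop
   * the corresponding nr_ptes accounting.
   */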
  static inline void zap_deposited_table(struct mm_struct *mm, pmd_t *pmd)
  {
  	pgtable_t pgtable;
  
  	pgtable = pgtable_trans_huge_withdraw(mm, pmd);
  	pte_free(mm, pgtable);
c4812909f   Kirill A. Shutemov   mm: introduce wra...
1705
  	mm_dec_nr_ptes(mm);
953c66c2b   Aneesh Kumar K.V   mm: THP page cach...
1706
  }
71e3aac07   Andrea Arcangeli   thp: transparent ...
1707
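  /*
   * Tear down one huge pmd while unmapping a range: handles DAX pmds, the
   * huge zero page, regular anonymous/file THPs and pmd migration entries,
   * updating rmap, counters and the deposited page table as needed.
   */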
  int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
f21760b15   Shaohua Li   thp: add tlb_remo...
1708
  		 pmd_t *pmd, unsigned long addr)
71e3aac07   Andrea Arcangeli   thp: transparent ...
1709
  {
da1467690   Kirill A. Shutemov   thp: fix zap_huge...
1710
  	pmd_t orig_pmd;
bf929152e   Kirill A. Shutemov   mm, thp: change p...
1711
  	spinlock_t *ptl;
71e3aac07   Andrea Arcangeli   thp: transparent ...
1712

ed6a79352   Peter Zijlstra   asm-generic/tlb, ...
1713
  	tlb_change_page_size(tlb, HPAGE_PMD_SIZE);
07e326610   Aneesh Kumar K.V   mm: add tlb_remov...
1714

b6ec57f4b   Kirill A. Shutemov   thp: change pmd_t...
1715
1716
  	ptl = __pmd_trans_huge_lock(pmd, vma);
  	if (!ptl)
da1467690   Kirill A. Shutemov   thp: fix zap_huge...
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
  		return 0;
  	/*
  	 * For architectures like ppc64 we look at deposited pgtable
  	 * when calling pmdp_huge_get_and_clear. So do the
  	 * pgtable_trans_huge_withdraw after finishing pmdp related
  	 * operations.
  	 */
  	orig_pmd = pmdp_huge_get_and_clear_full(tlb->mm, addr, pmd,
  			tlb->fullmm);
  	tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
  	if (vma_is_dax(vma)) {
3b6521f53   Oliver O'Halloran   mm/huge_memory.c:...
1728
1729
  		if (arch_needs_pgtable_deposit())
  			zap_deposited_table(tlb->mm, pmd);
da1467690   Kirill A. Shutemov   thp: fix zap_huge...
1730
1731
  		spin_unlock(ptl);
  		if (is_huge_zero_pmd(orig_pmd))
c0f2e176f   Aneesh Kumar K.V   mm: use the corre...
1732
  			tlb_remove_page_size(tlb, pmd_page(orig_pmd), HPAGE_PMD_SIZE);
da1467690   Kirill A. Shutemov   thp: fix zap_huge...
1733
  	} else if (is_huge_zero_pmd(orig_pmd)) {
c14a6eb44   Oliver O'Halloran   mm/huge_memory.c:...
1734
  		zap_deposited_table(tlb->mm, pmd);
da1467690   Kirill A. Shutemov   thp: fix zap_huge...
1735
  		spin_unlock(ptl);
c0f2e176f   Aneesh Kumar K.V   mm: use the corre...
1736
  		tlb_remove_page_size(tlb, pmd_page(orig_pmd), HPAGE_PMD_SIZE);
da1467690   Kirill A. Shutemov   thp: fix zap_huge...
1737
  	} else {
616b83715   Zi Yan   mm: thp: enable t...
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
  		struct page *page = NULL;
  		int flush_needed = 1;
  
  		if (pmd_present(orig_pmd)) {
  			page = pmd_page(orig_pmd);
  			page_remove_rmap(page, true);
  			VM_BUG_ON_PAGE(page_mapcount(page) < 0, page);
  			VM_BUG_ON_PAGE(!PageHead(page), page);
  		} else if (thp_migration_supported()) {
  			swp_entry_t entry;
  
  			VM_BUG_ON(!is_pmd_migration_entry(orig_pmd));
  			entry = pmd_to_swp_entry(orig_pmd);
  			page = pfn_to_page(swp_offset(entry));
  			flush_needed = 0;
  		} else
  			WARN_ONCE(1, "Non present huge pmd without pmd migration enabled!");
b5072380e   Kirill A. Shutemov   thp: support file...
1755
  		if (PageAnon(page)) {
c14a6eb44   Oliver O'Halloran   mm/huge_memory.c:...
1756
  			zap_deposited_table(tlb->mm, pmd);
b5072380e   Kirill A. Shutemov   thp: support file...
1757
1758
  			add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR);
  		} else {
953c66c2b   Aneesh Kumar K.V   mm: THP page cach...
1759
1760
  			if (arch_needs_pgtable_deposit())
  				zap_deposited_table(tlb->mm, pmd);
fadae2953   Yang Shi   thp: use mm_file_...
1761
  			add_mm_counter(tlb->mm, mm_counter_file(page), -HPAGE_PMD_NR);
b5072380e   Kirill A. Shutemov   thp: support file...
1762
  		}
616b83715   Zi Yan   mm: thp: enable t...
1763

da1467690   Kirill A. Shutemov   thp: fix zap_huge...
1764
  		spin_unlock(ptl);
616b83715   Zi Yan   mm: thp: enable t...
1765
1766
  		if (flush_needed)
  			tlb_remove_page_size(tlb, page, HPAGE_PMD_SIZE);
025c5b245   Naoya Horiguchi   thp: optimize awa...
1767
  	}
da1467690   Kirill A. Shutemov   thp: fix zap_huge...
1768
  	return 1;
71e3aac07   Andrea Arcangeli   thp: transparent ...
1769
  }
1dd38b6c2   Aneesh Kumar K.V   mm: move vma_is_a...
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
  #ifndef pmd_move_must_withdraw
  static inline int pmd_move_must_withdraw(spinlock_t *new_pmd_ptl,
  					 spinlock_t *old_pmd_ptl,
  					 struct vm_area_struct *vma)
  {
  	/*
  	 * With the split pmd lock we also need to move the preallocated
  	 * PTE page table if new_pmd is on a different PMD page table.
  	 *
  	 * We also don't deposit and withdraw tables for file pages.
  	 */
  	return (new_pmd_ptl != old_pmd_ptl) && vma_is_anonymous(vma);
  }
  #endif
ab6e3d093   Naoya Horiguchi   mm: soft-dirty: k...
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
  static pmd_t move_soft_dirty_pmd(pmd_t pmd)
  {
  #ifdef CONFIG_MEM_SOFT_DIRTY
  	if (unlikely(is_pmd_migration_entry(pmd)))
  		pmd = pmd_swp_mksoft_dirty(pmd);
  	else if (pmd_present(pmd))
  		pmd = pmd_mksoft_dirty(pmd);
  #endif
  	return pmd;
  }
bf8616d5f   Hugh Dickins   huge mm: move_hug...
1794
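  /*
   * Move a huge pmd to a new address, typically on behalf of mremap() when
   * both the old and new ranges are huge-page aligned. Takes both pmd
   * locks, moves the deposited page table if required and flushes the TLB
   * when the old entry was present.
   */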
  bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr,
37a1c49a9   Andrea Arcangeli   thp: mremap suppo...
1795
  		  unsigned long new_addr, unsigned long old_end,
eb66ae030   Linus Torvalds   mremap: properly ...
1796
  		  pmd_t *old_pmd, pmd_t *new_pmd)
37a1c49a9   Andrea Arcangeli   thp: mremap suppo...
1797
  {
bf929152e   Kirill A. Shutemov   mm, thp: change p...
1798
  	spinlock_t *old_ptl, *new_ptl;
37a1c49a9   Andrea Arcangeli   thp: mremap suppo...
1799
  	pmd_t pmd;
37a1c49a9   Andrea Arcangeli   thp: mremap suppo...
1800
  	struct mm_struct *mm = vma->vm_mm;
5d1904204   Aaron Lu   mremap: fix race ...
1801
  	bool force_flush = false;
37a1c49a9   Andrea Arcangeli   thp: mremap suppo...
1802
1803
1804
  
  	if ((old_addr & ~HPAGE_PMD_MASK) ||
  	    (new_addr & ~HPAGE_PMD_MASK) ||
bf8616d5f   Hugh Dickins   huge mm: move_hug...
1805
  	    old_end - old_addr < HPAGE_PMD_SIZE)
4b471e889   Kirill A. Shutemov   mm, thp: remove i...
1806
  		return false;
37a1c49a9   Andrea Arcangeli   thp: mremap suppo...
1807
1808
1809
1810
1811
1812
1813
  
  	/*
  	 * The destination pmd shouldn't be established, free_pgtables()
  	 * should have released it.
  	 */
  	if (WARN_ON(!pmd_none(*new_pmd))) {
  		VM_BUG_ON(pmd_trans_huge(*new_pmd));
4b471e889   Kirill A. Shutemov   mm, thp: remove i...
1814
  		return false;
37a1c49a9   Andrea Arcangeli   thp: mremap suppo...
1815
  	}
bf929152e   Kirill A. Shutemov   mm, thp: change p...
1816
1817
1818
1819
  	/*
  	 * We don't have to worry about the ordering of src and dst
  	 * ptlocks because exclusive mmap_sem prevents deadlock.
  	 */
b6ec57f4b   Kirill A. Shutemov   thp: change pmd_t...
1820
1821
  	old_ptl = __pmd_trans_huge_lock(old_pmd, vma);
  	if (old_ptl) {
bf929152e   Kirill A. Shutemov   mm, thp: change p...
1822
1823
1824
  		new_ptl = pmd_lockptr(mm, new_pmd);
  		if (new_ptl != old_ptl)
  			spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
8809aa2d2   Aneesh Kumar K.V   mm: clarify that ...
1825
  		pmd = pmdp_huge_get_and_clear(mm, old_addr, old_pmd);
eb66ae030   Linus Torvalds   mremap: properly ...
1826
  		if (pmd_present(pmd))
a2ce2666a   Aaron Lu   mremap: move_ptes...
1827
  			force_flush = true;
025c5b245   Naoya Horiguchi   thp: optimize awa...
1828
  		VM_BUG_ON(!pmd_none(*new_pmd));
3592806cf   Kirill A. Shutemov   thp: move preallo...
1829

1dd38b6c2   Aneesh Kumar K.V   mm: move vma_is_a...
1830
  		if (pmd_move_must_withdraw(new_ptl, old_ptl, vma)) {
b3084f4db   Aneesh Kumar K.V   powerpc/thp: Fix ...
1831
  			pgtable_t pgtable;
3592806cf   Kirill A. Shutemov   thp: move preallo...
1832
1833
  			pgtable = pgtable_trans_huge_withdraw(mm, old_pmd);
  			pgtable_trans_huge_deposit(mm, new_pmd, pgtable);
3592806cf   Kirill A. Shutemov   thp: move preallo...
1834
  		}
ab6e3d093   Naoya Horiguchi   mm: soft-dirty: k...
1835
1836
  		pmd = move_soft_dirty_pmd(pmd);
  		set_pmd_at(mm, new_addr, new_pmd, pmd);
5d1904204   Aaron Lu   mremap: fix race ...
1837
1838
  		if (force_flush)
  			flush_tlb_range(vma, old_addr, old_addr + PMD_SIZE);
eb66ae030   Linus Torvalds   mremap: properly ...
1839
1840
  		if (new_ptl != old_ptl)
  			spin_unlock(new_ptl);
bf929152e   Kirill A. Shutemov   mm, thp: change p...
1841
  		spin_unlock(old_ptl);
4b471e889   Kirill A. Shutemov   mm, thp: remove i...
1842
  		return true;
37a1c49a9   Andrea Arcangeli   thp: mremap suppo...
1843
  	}
4b471e889   Kirill A. Shutemov   mm, thp: remove i...
1844
  	return false;
37a1c49a9   Andrea Arcangeli   thp: mremap suppo...
1845
  }
f123d74ab   Mel Gorman   mm: Only flush TL...
1846
1847
1848
1849
1850
1851
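  /*
   * Illustrative usage sketch: protections on a huge pmd are changed here
   * both for mprotect() and for NUMA-hinting (prot_numa) updates; e.g.
   * (buf being a hypothetical 2MB-aligned THP-backed mapping)
   *
   *	mprotect(buf, 2UL << 20, PROT_READ);
   *
   * typically reaches this function via change_pmd_range().
   */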
  /*
   * Returns
   *  - 0 if PMD could not be locked
   *  - 1 if PMD was locked but protections unchanged and TLB flush unnecessary
   *  - HPAGE_PMD_NR if protections changed and TLB flush necessary
   */
cd7548ab3   Johannes Weiner   thp: mprotect: tr...
1852
  int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
e944fd67b   Mel Gorman   mm: numa: do not ...
1853
  		unsigned long addr, pgprot_t newprot, int prot_numa)
cd7548ab3   Johannes Weiner   thp: mprotect: tr...
1854
1855
  {
  	struct mm_struct *mm = vma->vm_mm;
bf929152e   Kirill A. Shutemov   mm, thp: change p...
1856
  	spinlock_t *ptl;
0a85e51d3   Kirill A. Shutemov   thp: reduce inden...
1857
1858
1859
  	pmd_t entry;
  	bool preserve_write;
  	int ret;
cd7548ab3   Johannes Weiner   thp: mprotect: tr...
1860

b6ec57f4b   Kirill A. Shutemov   thp: change pmd_t...
1861
  	ptl = __pmd_trans_huge_lock(pmd, vma);
0a85e51d3   Kirill A. Shutemov   thp: reduce inden...
1862
1863
  	if (!ptl)
  		return 0;
e944fd67b   Mel Gorman   mm: numa: do not ...
1864

0a85e51d3   Kirill A. Shutemov   thp: reduce inden...
1865
1866
  	preserve_write = prot_numa && pmd_write(*pmd);
  	ret = 1;
e944fd67b   Mel Gorman   mm: numa: do not ...
1867

84c3fc4e9   Zi Yan   mm: thp: check pm...
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
  #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
  	if (is_swap_pmd(*pmd)) {
  		swp_entry_t entry = pmd_to_swp_entry(*pmd);
  
  		VM_BUG_ON(!is_pmd_migration_entry(*pmd));
  		if (is_write_migration_entry(entry)) {
  			pmd_t newpmd;
  			/*
  			 * A protection check is difficult so
  			 * just be safe and disable write
  			 */
  			make_migration_entry_read(&entry);
  			newpmd = swp_entry_to_pmd(entry);
ab6e3d093   Naoya Horiguchi   mm: soft-dirty: k...
1881
1882
  			if (pmd_swp_soft_dirty(*pmd))
  				newpmd = pmd_swp_mksoft_dirty(newpmd);
84c3fc4e9   Zi Yan   mm: thp: check pm...
1883
1884
1885
1886
1887
  			set_pmd_at(mm, addr, pmd, newpmd);
  		}
  		goto unlock;
  	}
  #endif
0a85e51d3   Kirill A. Shutemov   thp: reduce inden...
1888
1889
1890
1891
1892
1893
1894
  	/*
  	 * Avoid trapping faults against the zero page. The read-only
  	 * data is likely to be read-cached on the local CPU and
  	 * local/remote hits to the zero page are not interesting.
  	 */
  	if (prot_numa && is_huge_zero_pmd(*pmd))
  		goto unlock;
025c5b245   Naoya Horiguchi   thp: optimize awa...
1895

0a85e51d3   Kirill A. Shutemov   thp: reduce inden...
1896
1897
  	if (prot_numa && pmd_protnone(*pmd))
  		goto unlock;
ced108037   Kirill A. Shutemov   thp: fix MADV_DON...
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
  	/*
  	 * In the prot_numa case, we are under down_read(mmap_sem). It's critical
  	 * to not clear pmd intermittently to avoid race with MADV_DONTNEED
  	 * which is also under down_read(mmap_sem):
  	 *
  	 *	CPU0:				CPU1:
  	 *				change_huge_pmd(prot_numa=1)
  	 *				 pmdp_huge_get_and_clear_notify()
  	 * madvise_dontneed()
  	 *  zap_pmd_range()
  	 *   pmd_trans_huge(*pmd) == 0 (without ptl)
  	 *   // skip the pmd
  	 *				 set_pmd_at();
  	 *				 // pmd is re-established
  	 *
  	 * This race makes MADV_DONTNEED miss the huge pmd and not clear it,
  	 * which may break userspace.
  	 *
  	 * pmdp_invalidate() is required to make sure we don't miss
  	 * dirty/young flags set by hardware.
  	 */
a3cf988fc   Kirill A. Shutemov   mm: use updated p...
1919
  	entry = pmdp_invalidate(vma, addr, pmd);
ced108037   Kirill A. Shutemov   thp: fix MADV_DON...
1920

0a85e51d3   Kirill A. Shutemov   thp: reduce inden...
1921
1922
1923
1924
1925
1926
1927
1928
  	entry = pmd_modify(entry, newprot);
  	if (preserve_write)
  		entry = pmd_mk_savedwrite(entry);
  	ret = HPAGE_PMD_NR;
  	set_pmd_at(mm, addr, pmd, entry);
  	BUG_ON(vma_is_anonymous(vma) && !preserve_write && pmd_write(entry));
  unlock:
  	spin_unlock(ptl);
025c5b245   Naoya Horiguchi   thp: optimize awa...
1929
1930
1931
1932
  	return ret;
  }
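
A minimal sketch (not taken from mm/huge_memory.c) of how a caller such as change_pmd_range() in mm/mprotect.c is expected to consume the return value above: 0 means the pmd was not huge under the lock check and the caller falls back to the pte level, while any non-zero value means the huge pmd was handled, with HPAGE_PMD_NR indicating all entries were updated. The wrapper name and locals below are illustrative.

  static unsigned long change_pmd_range_sketch(struct vm_area_struct *vma,
  		pmd_t *pmd, unsigned long addr, pgprot_t newprot, int prot_numa)
  {
  	unsigned long pages = 0;
  
  	if (is_swap_pmd(*pmd) || pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) {
  		int nr_ptes = change_huge_pmd(vma, pmd, addr, newprot, prot_numa);
  
  		if (nr_ptes) {
  			if (nr_ptes == HPAGE_PMD_NR)
  				pages += HPAGE_PMD_NR;
  			return pages;	/* huge pmd handled (or skipped) */
  		}
  		/* fall through: the huge pmd was split, continue at pte level */
  	}
  	/* ... pte-level change_pte_range() would run here ... */
  	return pages;
  }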
  
  /*
8f19b0c05   Huang Ying   thp: fix comments...
1933
   * Returns page table lock pointer if a given pmd maps a thp, NULL otherwise.
025c5b245   Naoya Horiguchi   thp: optimize awa...
1934
   *
8f19b0c05   Huang Ying   thp: fix comments...
1935
1936
   * Note that if it returns the page table lock pointer, this routine returns
   * without unlocking the page table lock, so callers must unlock it.
025c5b245   Naoya Horiguchi   thp: optimize awa...
1937
   */
b6ec57f4b   Kirill A. Shutemov   thp: change pmd_t...
1938
  spinlock_t *__pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma)
025c5b245   Naoya Horiguchi   thp: optimize awa...
1939
  {
b6ec57f4b   Kirill A. Shutemov   thp: change pmd_t...
1940
1941
  	spinlock_t *ptl;
  	ptl = pmd_lock(vma->vm_mm, pmd);
84c3fc4e9   Zi Yan   mm: thp: check pm...
1942
1943
  	if (likely(is_swap_pmd(*pmd) || pmd_trans_huge(*pmd) ||
  			pmd_devmap(*pmd)))
b6ec57f4b   Kirill A. Shutemov   thp: change pmd_t...
1944
1945
1946
  		return ptl;
  	spin_unlock(ptl);
  	return NULL;
cd7548ab3   Johannes Weiner   thp: mprotect: tr...
1947
  }
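
The comment above stresses that the page table lock is returned still held; below is a minimal sketch of the expected calling pattern, the same shape used by change_huge_pmd() above and zap_huge_pud() below. The function name is illustrative.

  static void pmd_op_sketch(struct vm_area_struct *vma, pmd_t *pmd)
  {
  	spinlock_t *ptl;
  
  	ptl = __pmd_trans_huge_lock(pmd, vma);
  	if (!ptl)
  		return;		/* not a huge/devmap/migration pmd, nothing to do */
  	/* *pmd is stable here; operate on the huge pmd under ptl */
  	spin_unlock(ptl);	/* the caller must drop the lock it was handed */
  }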
a00cc7d9d   Matthew Wilcox   mm, x86: add supp...
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
  /*
   * Returns the page table lock pointer if a given pud maps a thp, NULL
   * otherwise.
   *
   * Note that if it returns the page table lock pointer, this routine returns
   * without unlocking the page table lock, so callers must unlock it.
   */
  spinlock_t *__pud_trans_huge_lock(pud_t *pud, struct vm_area_struct *vma)
  {
  	spinlock_t *ptl;
  
  	ptl = pud_lock(vma->vm_mm, pud);
  	if (likely(pud_trans_huge(*pud) || pud_devmap(*pud)))
  		return ptl;
  	spin_unlock(ptl);
  	return NULL;
  }
  
  #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
  int zap_huge_pud(struct mmu_gather *tlb, struct vm_area_struct *vma,
  		 pud_t *pud, unsigned long addr)
  {
a00cc7d9d   Matthew Wilcox   mm, x86: add supp...
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
  	spinlock_t *ptl;
  
  	ptl = __pud_trans_huge_lock(pud, vma);
  	if (!ptl)
  		return 0;
  	/*
  	 * For architectures like ppc64 we look at deposited pgtable
  	 * when calling pudp_huge_get_and_clear. So do the
  	 * pgtable_trans_huge_withdraw after finishing pudp related
  	 * operations.
  	 */
70516b936   Qian Cai   mm/huge_memory.c:...
1980
  	pudp_huge_get_and_clear_full(tlb->mm, addr, pud, tlb->fullmm);
a00cc7d9d   Matthew Wilcox   mm, x86: add supp...
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
  	tlb_remove_pud_tlb_entry(tlb, pud, addr);
  	if (vma_is_dax(vma)) {
  		spin_unlock(ptl);
  		/* No zero page support yet */
  	} else {
  		/* No support for anonymous PUD pages yet */
  		BUG();
  	}
  	return 1;
  }
  
  static void __split_huge_pud_locked(struct vm_area_struct *vma, pud_t *pud,
  		unsigned long haddr)
  {
  	VM_BUG_ON(haddr & ~HPAGE_PUD_MASK);
  	VM_BUG_ON_VMA(vma->vm_start > haddr, vma);
  	VM_BUG_ON_VMA(vma->vm_end < haddr + HPAGE_PUD_SIZE, vma);
  	VM_BUG_ON(!pud_trans_huge(*pud) && !pud_devmap(*pud));
ce9311cf9   Yisheng Xie   mm/vmstats: add t...
1999
  	count_vm_event(THP_SPLIT_PUD);
a00cc7d9d   Matthew Wilcox   mm, x86: add supp...
2000
2001
2002
2003
2004
2005
2006
2007
  
  	pudp_huge_clear_flush_notify(vma, haddr, pud);
  }
  
  void __split_huge_pud(struct vm_area_struct *vma, pud_t *pud,
  		unsigned long address)
  {
  	spinlock_t *ptl;
ac46d4f3c   Jérôme Glisse   mm/mmu_notifier: ...
2008
  	struct mmu_notifier_range range;
a00cc7d9d   Matthew Wilcox   mm, x86: add supp...
2009

7269f9999   Jérôme Glisse   mm/mmu_notifier: ...
2010
  	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
6f4f13e8d   Jérôme Glisse   mm/mmu_notifier: ...
2011
  				address & HPAGE_PUD_MASK,
ac46d4f3c   Jérôme Glisse   mm/mmu_notifier: ...
2012
2013
2014
  				(address & HPAGE_PUD_MASK) + HPAGE_PUD_SIZE);
  	mmu_notifier_invalidate_range_start(&range);
  	ptl = pud_lock(vma->vm_mm, pud);
a00cc7d9d   Matthew Wilcox   mm, x86: add supp...
2015
2016
  	if (unlikely(!pud_trans_huge(*pud) && !pud_devmap(*pud)))
  		goto out;
ac46d4f3c   Jérôme Glisse   mm/mmu_notifier: ...
2017
  	__split_huge_pud_locked(vma, pud, range.start);
a00cc7d9d   Matthew Wilcox   mm, x86: add supp...
2018
2019
2020
  
  out:
  	spin_unlock(ptl);
4645b9fe8   Jérôme Glisse   mm/mmu_notifier: ...
2021
2022
2023
2024
  	/*
  	 * No need to double call mmu_notifier->invalidate_range() callback as
  	 * the above pudp_huge_clear_flush_notify() already called it.
  	 */
ac46d4f3c   Jérôme Glisse   mm/mmu_notifier: ...
2025
  	mmu_notifier_invalidate_range_only_end(&range);
a00cc7d9d   Matthew Wilcox   mm, x86: add supp...
2026
2027
  }
  #endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
eef1b3ba0   Kirill A. Shutemov   thp: implement sp...
2028
2029
2030
2031
2032
2033
2034
  static void __split_huge_zero_page_pmd(struct vm_area_struct *vma,
  		unsigned long haddr, pmd_t *pmd)
  {
  	struct mm_struct *mm = vma->vm_mm;
  	pgtable_t pgtable;
  	pmd_t _pmd;
  	int i;
0f10851ea   Jérôme Glisse   mm/mmu_notifier: ...
2035
2036
2037
2038
2039
2040
  	/*
  	 * Leave pmd empty until pte is filled. Note that it is fine to delay
  	 * notification until mmu_notifier_invalidate_range_end() as we are
  	 * replacing a zero pmd write protected page with a zero pte write
  	 * protected page.
  	 *
ad56b738c   Mike Rapoport   docs/vm: rename d...
2041
  	 * See Documentation/vm/mmu_notifier.rst
0f10851ea   Jérôme Glisse   mm/mmu_notifier: ...
2042
2043
  	 */
  	pmdp_huge_clear_flush(vma, haddr, pmd);
eef1b3ba0   Kirill A. Shutemov   thp: implement sp...
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
  
  	pgtable = pgtable_trans_huge_withdraw(mm, pmd);
  	pmd_populate(mm, &_pmd, pgtable);
  
  	for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
  		pte_t *pte, entry;
  		entry = pfn_pte(my_zero_pfn(haddr), vma->vm_page_prot);
  		entry = pte_mkspecial(entry);
  		pte = pte_offset_map(&_pmd, haddr);
  		VM_BUG_ON(!pte_none(*pte));
  		set_pte_at(mm, haddr, pte, entry);
  		pte_unmap(pte);
  	}
  	smp_wmb(); /* make pte visible before pmd */
  	pmd_populate(mm, pmd, pgtable);
eef1b3ba0   Kirill A. Shutemov   thp: implement sp...
2059
2060
2061
  }
  
  static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
ba9882808   Kirill A. Shutemov   thp: add option t...
2062
  		unsigned long haddr, bool freeze)
eef1b3ba0   Kirill A. Shutemov   thp: implement sp...
2063
2064
2065
2066
  {
  	struct mm_struct *mm = vma->vm_mm;
  	struct page *page;
  	pgtable_t pgtable;
423ac9af3   Aneesh Kumar K.V   mm/thp: remove pm...
2067
  	pmd_t old_pmd, _pmd;
a3cf988fc   Kirill A. Shutemov   mm: use updated p...
2068
  	bool young, write, soft_dirty, pmd_migration = false;
2ac015e29   Kirill A. Shutemov   thp: call pmdp_in...
2069
  	unsigned long addr;
eef1b3ba0   Kirill A. Shutemov   thp: implement sp...
2070
2071
2072
2073
2074
  	int i;
  
  	VM_BUG_ON(haddr & ~HPAGE_PMD_MASK);
  	VM_BUG_ON_VMA(vma->vm_start > haddr, vma);
  	VM_BUG_ON_VMA(vma->vm_end < haddr + HPAGE_PMD_SIZE, vma);
84c3fc4e9   Zi Yan   mm: thp: check pm...
2075
2076
  	VM_BUG_ON(!is_pmd_migration_entry(*pmd) && !pmd_trans_huge(*pmd)
  				&& !pmd_devmap(*pmd));
eef1b3ba0   Kirill A. Shutemov   thp: implement sp...
2077
2078
  
  	count_vm_event(THP_SPLIT_PMD);
d21b9e57c   Kirill A. Shutemov   thp: handle file ...
2079
2080
  	if (!vma_is_anonymous(vma)) {
  		_pmd = pmdp_huge_clear_flush_notify(vma, haddr, pmd);
953c66c2b   Aneesh Kumar K.V   mm: THP page cach...
2081
2082
2083
2084
2085
2086
  		/*
  		 * We are going to unmap this huge page. So
  		 * just go ahead and zap it
  		 */
  		if (arch_needs_pgtable_deposit())
  			zap_deposited_table(mm, pmd);
d21b9e57c   Kirill A. Shutemov   thp: handle file ...
2087
2088
2089
  		if (vma_is_dax(vma))
  			return;
  		page = pmd_page(_pmd);
e1f1b1572   Hugh Dickins   mm/huge_memory.c:...
2090
2091
  		if (!PageDirty(page) && pmd_dirty(_pmd))
  			set_page_dirty(page);
d21b9e57c   Kirill A. Shutemov   thp: handle file ...
2092
2093
2094
2095
  		if (!PageReferenced(page) && pmd_young(_pmd))
  			SetPageReferenced(page);
  		page_remove_rmap(page, true);
  		put_page(page);
fadae2953   Yang Shi   thp: use mm_file_...
2096
  		add_mm_counter(mm, mm_counter_file(page), -HPAGE_PMD_NR);
eef1b3ba0   Kirill A. Shutemov   thp: implement sp...
2097
2098
  		return;
  	} else if (is_huge_zero_pmd(*pmd)) {
4645b9fe8   Jérôme Glisse   mm/mmu_notifier: ...
2099
2100
2101
2102
2103
2104
2105
2106
2107
  		/*
  		 * FIXME: Do we want to invalidate the secondary mmu by calling
  		 * mmu_notifier_invalidate_range()? See the comments below inside
  		 * __split_huge_pmd().
  		 *
  		 * We are going from a write-protected huge zero page to a
  		 * write-protected small zero page, so it does not seem useful
  		 * to invalidate the secondary mmu at this time.
  		 */
eef1b3ba0   Kirill A. Shutemov   thp: implement sp...
2108
2109
  		return __split_huge_zero_page_pmd(vma, haddr, pmd);
  	}
423ac9af3   Aneesh Kumar K.V   mm/thp: remove pm...
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
  	/*
  	 * Up to this point the pmd is present and huge and userland has full
  	 * access to the hugepage during the split (which happens in place).
  	 * If we overwrite the pmd with the not-huge version pointing to the
  	 * pte here (which of course we could if all CPUs were bug free),
  	 * userland could trigger a small page size TLB miss on the small
  	 * sized TLB while the hugepage TLB entry is still established in
  	 * the huge TLB. Some CPUs don't like that. See
  	 * http://support.amd.com/us/Processor_TechDocs/41322.pdf, Erratum
  	 * 383 on page 93. Intel should be safe but it also warns that it's
  	 * only safe if the permission and cache attributes of the two entries
  	 * loaded in the two TLBs are identical (which should be the case here).
  	 * But it is generally safer to never allow small and huge TLB entries
  	 * for the same virtual address to be loaded simultaneously. So instead
  	 * of doing "pmd_populate(); flush_pmd_tlb_range();" we first mark the
  	 * current pmd notpresent (atomically because here the pmd_trans_huge
  	 * must remain set at all times on the pmd until the split is complete
  	 * for this pmd), then we flush the SMP TLB and finally we write the
  	 * non-huge version of the pmd entry with pmd_populate.
  	 */
  	old_pmd = pmdp_invalidate(vma, haddr, pmd);
423ac9af3   Aneesh Kumar K.V   mm/thp: remove pm...
2131
  	pmd_migration = is_pmd_migration_entry(old_pmd);
2e83ee1d8   Peter Xu   mm: thp: fix flag...
2132
  	if (unlikely(pmd_migration)) {
84c3fc4e9   Zi Yan   mm: thp: check pm...
2133
  		swp_entry_t entry;
423ac9af3   Aneesh Kumar K.V   mm/thp: remove pm...
2134
  		entry = pmd_to_swp_entry(old_pmd);
84c3fc4e9   Zi Yan   mm: thp: check pm...
2135
  		page = pfn_to_page(swp_offset(entry));
2e83ee1d8   Peter Xu   mm: thp: fix flag...
2136
2137
2138
2139
  		write = is_write_migration_entry(entry);
  		young = false;
  		soft_dirty = pmd_swp_soft_dirty(old_pmd);
  	} else {
423ac9af3   Aneesh Kumar K.V   mm/thp: remove pm...
2140
  		page = pmd_page(old_pmd);
2e83ee1d8   Peter Xu   mm: thp: fix flag...
2141
2142
2143
2144
2145
2146
  		if (pmd_dirty(old_pmd))
  			SetPageDirty(page);
  		write = pmd_write(old_pmd);
  		young = pmd_young(old_pmd);
  		soft_dirty = pmd_soft_dirty(old_pmd);
  	}
eef1b3ba0   Kirill A. Shutemov   thp: implement sp...
2147
  	VM_BUG_ON_PAGE(!page_count(page), page);
fe896d187   Joonsoo Kim   mm: introduce pag...
2148
  	page_ref_add(page, HPAGE_PMD_NR - 1);
eef1b3ba0   Kirill A. Shutemov   thp: implement sp...
2149

423ac9af3   Aneesh Kumar K.V   mm/thp: remove pm...
2150
2151
2152
2153
  	/*
  	 * Withdraw the table only after we mark the pmd entry invalid.
  	 * This is critical for some architectures (Power).
  	 */
eef1b3ba0   Kirill A. Shutemov   thp: implement sp...
2154
2155
  	pgtable = pgtable_trans_huge_withdraw(mm, pmd);
  	pmd_populate(mm, &_pmd, pgtable);
2ac015e29   Kirill A. Shutemov   thp: call pmdp_in...
2156
  	for (i = 0, addr = haddr; i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE) {
eef1b3ba0   Kirill A. Shutemov   thp: implement sp...
2157
2158
2159
2160
2161
2162
  		pte_t entry, *pte;
  		/*
  		 * Note that NUMA hinting access restrictions are not
  		 * transferred to avoid any possibility of altering
  		 * permissions across VMAs.
  		 */
84c3fc4e9   Zi Yan   mm: thp: check pm...
2163
  		if (freeze || pmd_migration) {
ba9882808   Kirill A. Shutemov   thp: add option t...
2164
2165
2166
  			swp_entry_t swp_entry;
  			swp_entry = make_migration_entry(page + i, write);
  			entry = swp_entry_to_pte(swp_entry);
804dd1504   Andrea Arcangeli   soft_dirty: fix s...
2167
2168
  			if (soft_dirty)
  				entry = pte_swp_mksoft_dirty(entry);
ba9882808   Kirill A. Shutemov   thp: add option t...
2169
  		} else {
6d2329f88   Andrea Arcangeli   mm: vm_page_prot:...
2170
  			entry = mk_pte(page + i, READ_ONCE(vma->vm_page_prot));
b8d3c4c30   Minchan Kim   mm/huge_memory.c:...
2171
  			entry = maybe_mkwrite(entry, vma);
ba9882808   Kirill A. Shutemov   thp: add option t...
2172
2173
2174
2175
  			if (!write)
  				entry = pte_wrprotect(entry);
  			if (!young)
  				entry = pte_mkold(entry);
804dd1504   Andrea Arcangeli   soft_dirty: fix s...
2176
2177
  			if (soft_dirty)
  				entry = pte_mksoft_dirty(entry);
ba9882808   Kirill A. Shutemov   thp: add option t...
2178
  		}
2ac015e29   Kirill A. Shutemov   thp: call pmdp_in...
2179
  		pte = pte_offset_map(&_pmd, addr);
eef1b3ba0   Kirill A. Shutemov   thp: implement sp...
2180
  		BUG_ON(!pte_none(*pte));
2ac015e29   Kirill A. Shutemov   thp: call pmdp_in...
2181
  		set_pte_at(mm, addr, pte, entry);
eef1b3ba0   Kirill A. Shutemov   thp: implement sp...
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
  		atomic_inc(&page[i]._mapcount);
  		pte_unmap(pte);
  	}
  
  	/*
  	 * Set PG_double_map before dropping compound_mapcount to avoid
  	 * false-negative page_mapped().
  	 */
  	if (compound_mapcount(page) > 1 && !TestSetPageDoubleMap(page)) {
  		for (i = 0; i < HPAGE_PMD_NR; i++)
  			atomic_inc(&page[i]._mapcount);
  	}
  
  	if (atomic_add_negative(-1, compound_mapcount_ptr(page))) {
  		/* Last compound_mapcount is gone. */
11fb99898   Mel Gorman   mm: move most fil...
2197
  		__dec_node_page_state(page, NR_ANON_THPS);
eef1b3ba0   Kirill A. Shutemov   thp: implement sp...
2198
2199
2200
2201
2202
2203
2204
2205
2206
  		if (TestClearPageDoubleMap(page)) {
  			/* No need for a mapcount reference anymore */
  			for (i = 0; i < HPAGE_PMD_NR; i++)
  				atomic_dec(&page[i]._mapcount);
  		}
  	}
  
  	smp_wmb(); /* make pte visible before pmd */
  	pmd_populate(mm, pmd, pgtable);
e9b61f198   Kirill A. Shutemov   thp: reintroduce ...
2207
2208
  
  	if (freeze) {
2ac015e29   Kirill A. Shutemov   thp: call pmdp_in...
2209
  		for (i = 0; i < HPAGE_PMD_NR; i++) {
e9b61f198   Kirill A. Shutemov   thp: reintroduce ...
2210
2211
2212
2213
  			page_remove_rmap(page + i, false);
  			put_page(page + i);
  		}
  	}
eef1b3ba0   Kirill A. Shutemov   thp: implement sp...
2214
2215
2216
  }
  
  void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
33f4751e9   Naoya Horiguchi   mm: thp: move pmd...
2217
  		unsigned long address, bool freeze, struct page *page)
eef1b3ba0   Kirill A. Shutemov   thp: implement sp...
2218
2219
  {
  	spinlock_t *ptl;
ac46d4f3c   Jérôme Glisse   mm/mmu_notifier: ...
2220
  	struct mmu_notifier_range range;
eef1b3ba0   Kirill A. Shutemov   thp: implement sp...
2221

7269f9999   Jérôme Glisse   mm/mmu_notifier: ...
2222
  	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
6f4f13e8d   Jérôme Glisse   mm/mmu_notifier: ...
2223
  				address & HPAGE_PMD_MASK,
ac46d4f3c   Jérôme Glisse   mm/mmu_notifier: ...
2224
2225
2226
  				(address & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE);
  	mmu_notifier_invalidate_range_start(&range);
  	ptl = pmd_lock(vma->vm_mm, pmd);
33f4751e9   Naoya Horiguchi   mm: thp: move pmd...
2227
2228
2229
2230
2231
2232
2233
2234
  
  	/*
  	 * If the caller asks to set up migration entries, we need a page to
  	 * check the pmd against. Otherwise we can end up replacing the wrong
  	 * page.
  	 */
  	VM_BUG_ON(freeze && !page);
  	if (page && page != pmd_page(*pmd))
  		goto out;
5c7fb56e5   Dan Williams   mm, dax: dax-pmd ...
2235
  	if (pmd_trans_huge(*pmd)) {
33f4751e9   Naoya Horiguchi   mm: thp: move pmd...
2236
  		page = pmd_page(*pmd);
5c7fb56e5   Dan Williams   mm, dax: dax-pmd ...
2237
  		if (PageMlocked(page))
5f7377147   Kirill A. Shutemov   thp: fix deadlock...
2238
  			clear_page_mlock(page);
84c3fc4e9   Zi Yan   mm: thp: check pm...
2239
  	} else if (!(pmd_devmap(*pmd) || is_pmd_migration_entry(*pmd)))
e90309c9f   Kirill A. Shutemov   thp: allow mlocke...
2240
  		goto out;
ac46d4f3c   Jérôme Glisse   mm/mmu_notifier: ...
2241
  	__split_huge_pmd_locked(vma, pmd, range.start, freeze);
e90309c9f   Kirill A. Shutemov   thp: allow mlocke...
2242
  out:
eef1b3ba0   Kirill A. Shutemov   thp: implement sp...
2243
  	spin_unlock(ptl);
4645b9fe8   Jérôme Glisse   mm/mmu_notifier: ...
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
  	/*
  	 * No need to double call mmu_notifier->invalidate_range() callback.
  	 * There are 3 cases to consider inside __split_huge_pmd_locked():
  	 *  1) pmdp_huge_clear_flush_notify() calls invalidate_range(), which
  	 *     is the obvious case.
  	 *  2) __split_huge_zero_page_pmd() replaces a read-only zero page and
  	 *     any write fault will trigger a flush_notify before pointing to
  	 *     a new page (it is fine if the secondary mmu keeps pointing to
  	 *     the old zero page in the meantime).
  	 *  3) Split a huge pmd into ptes pointing to the same page. No need
  	 *     to invalidate secondary tlb entries, they are all still valid.
  	 *     Any further changes to individual ptes will notify, so no need
  	 *     to call mmu_notifier->invalidate_range().
  	 */
ac46d4f3c   Jérôme Glisse   mm/mmu_notifier: ...
2257
  	mmu_notifier_invalidate_range_only_end(&range);
eef1b3ba0   Kirill A. Shutemov   thp: implement sp...
2258
  }
fec89c109   Kirill A. Shutemov   thp: rewrite free...
2259
2260
  void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address,
  		bool freeze, struct page *page)
94fcc585f   Andrea Arcangeli   thp: avoid breaki...
2261
  {
f72e7dcdd   Hugh Dickins   mm: let mm_find_p...
2262
  	pgd_t *pgd;
c2febafc6   Kirill A. Shutemov   mm: convert gener...
2263
  	p4d_t *p4d;
f72e7dcdd   Hugh Dickins   mm: let mm_find_p...
2264
  	pud_t *pud;
94fcc585f   Andrea Arcangeli   thp: avoid breaki...
2265
  	pmd_t *pmd;
78ddc5347   Kirill A. Shutemov   thp: rename split...
2266
  	pgd = pgd_offset(vma->vm_mm, address);
f72e7dcdd   Hugh Dickins   mm: let mm_find_p...
2267
2268
  	if (!pgd_present(*pgd))
  		return;
c2febafc6   Kirill A. Shutemov   mm: convert gener...
2269
2270
2271
2272
2273
  	p4d = p4d_offset(pgd, address);
  	if (!p4d_present(*p4d))
  		return;
  
  	pud = pud_offset(p4d, address);
f72e7dcdd   Hugh Dickins   mm: let mm_find_p...
2274
2275
2276
2277
  	if (!pud_present(*pud))
  		return;
  
  	pmd = pmd_offset(pud, address);
fec89c109   Kirill A. Shutemov   thp: rewrite free...
2278

33f4751e9   Naoya Horiguchi   mm: thp: move pmd...
2279
  	__split_huge_pmd(vma, pmd, address, freeze, page);
94fcc585f   Andrea Arcangeli   thp: avoid breaki...
2280
  }
e1b9996b8   Kirill A. Shutemov   thp: vma_adjust_t...
2281
  void vma_adjust_trans_huge(struct vm_area_struct *vma,
94fcc585f   Andrea Arcangeli   thp: avoid breaki...
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
  			     unsigned long start,
  			     unsigned long end,
  			     long adjust_next)
  {
  	/*
  	 * If the new start address isn't hpage aligned and it could
  	 * previously contain a hugepage: check if we need to split
  	 * a huge pmd.
  	 */
  	if (start & ~HPAGE_PMD_MASK &&
  	    (start & HPAGE_PMD_MASK) >= vma->vm_start &&
  	    (start & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= vma->vm_end)
fec89c109   Kirill A. Shutemov   thp: rewrite free...
2294
  		split_huge_pmd_address(vma, start, false, NULL);
94fcc585f   Andrea Arcangeli   thp: avoid breaki...
2295
2296
2297
2298
2299
2300
2301
2302
2303
  
  	/*
  	 * If the new end address isn't hpage aligned and it could
  	 * previously contain a hugepage: check if we need to split
  	 * a huge pmd.
  	 */
  	if (end & ~HPAGE_PMD_MASK &&
  	    (end & HPAGE_PMD_MASK) >= vma->vm_start &&
  	    (end & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= vma->vm_end)
fec89c109   Kirill A. Shutemov   thp: rewrite free...
2304
  		split_huge_pmd_address(vma, end, false, NULL);
94fcc585f   Andrea Arcangeli   thp: avoid breaki...
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
  
  	/*
  	 * If we're also updating the vma->vm_next->vm_start, if the new
  	 * vm_next->vm_start isn't page aligned and it could previously
  	 * contain a hugepage: check if we need to split a huge pmd.
  	 */
  	if (adjust_next > 0) {
  		struct vm_area_struct *next = vma->vm_next;
  		unsigned long nstart = next->vm_start;
  		nstart += adjust_next << PAGE_SHIFT;
  		if (nstart & ~HPAGE_PMD_MASK &&
  		    (nstart & HPAGE_PMD_MASK) >= next->vm_start &&
  		    (nstart & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= next->vm_end)
fec89c109   Kirill A. Shutemov   thp: rewrite free...
2318
  			split_huge_pmd_address(next, nstart, false, NULL);
94fcc585f   Andrea Arcangeli   thp: avoid breaki...
2319
2320
  	}
  }
e9b61f198   Kirill A. Shutemov   thp: reintroduce ...
2321

906f9cdfc   Hugh Dickins   mm/huge_memory: r...
2322
  static void unmap_page(struct page *page)
e9b61f198   Kirill A. Shutemov   thp: reintroduce ...
2323
  {
baa355fd3   Kirill A. Shutemov   thp: file pages s...
2324
  	enum ttu_flags ttu_flags = TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS |
c7ab0d2fd   Kirill A. Shutemov   mm: convert try_t...
2325
  		TTU_RMAP_LOCKED | TTU_SPLIT_HUGE_PMD;
666e5a406   Minchan Kim   mm: make ttu's re...
2326
  	bool unmap_success;
e9b61f198   Kirill A. Shutemov   thp: reintroduce ...
2327
2328
  
  	VM_BUG_ON_PAGE(!PageHead(page), page);
baa355fd3   Kirill A. Shutemov   thp: file pages s...
2329
  	if (PageAnon(page))
b5ff8161e   Naoya Horiguchi   mm: thp: introduc...
2330
  		ttu_flags |= TTU_SPLIT_FREEZE;
baa355fd3   Kirill A. Shutemov   thp: file pages s...
2331

666e5a406   Minchan Kim   mm: make ttu's re...
2332
2333
  	unmap_success = try_to_unmap(page, ttu_flags);
  	VM_BUG_ON_PAGE(!unmap_success, page);
e9b61f198   Kirill A. Shutemov   thp: reintroduce ...
2334
  }
906f9cdfc   Hugh Dickins   mm/huge_memory: r...
2335
  static void remap_page(struct page *page)
e9b61f198   Kirill A. Shutemov   thp: reintroduce ...
2336
  {
fec89c109   Kirill A. Shutemov   thp: rewrite free...
2337
  	int i;
ace71a19c   Kirill A. Shutemov   mm: introduce pag...
2338
2339
2340
2341
2342
2343
  	if (PageTransHuge(page)) {
  		remove_migration_ptes(page, page, true);
  	} else {
  		for (i = 0; i < HPAGE_PMD_NR; i++)
  			remove_migration_ptes(page + i, page + i, true);
  	}
e9b61f198   Kirill A. Shutemov   thp: reintroduce ...
2344
  }
8df651c70   Kirill A. Shutemov   thp: cleanup spli...
2345
  static void __split_huge_page_tail(struct page *head, int tail,
e9b61f198   Kirill A. Shutemov   thp: reintroduce ...
2346
2347
  		struct lruvec *lruvec, struct list_head *list)
  {
e9b61f198   Kirill A. Shutemov   thp: reintroduce ...
2348
  	struct page *page_tail = head + tail;
8df651c70   Kirill A. Shutemov   thp: cleanup spli...
2349
  	VM_BUG_ON_PAGE(atomic_read(&page_tail->_mapcount) != -1, page_tail);
e9b61f198   Kirill A. Shutemov   thp: reintroduce ...
2350
2351
  
  	/*
605ca5ede   Konstantin Khlebnikov   mm/huge_memory.c:...
2352
2353
2354
2355
  	 * Clone page flags before unfreezing refcount.
  	 *
  	 * After a successful get_page_unless_zero(), a flags change might
  	 * follow, for example lock_page() which sets PG_waiters.
e9b61f198   Kirill A. Shutemov   thp: reintroduce ...
2356
  	 */
e9b61f198   Kirill A. Shutemov   thp: reintroduce ...
2357
2358
2359
2360
  	page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
  	page_tail->flags |= (head->flags &
  			((1L << PG_referenced) |
  			 (1L << PG_swapbacked) |
38d8b4e6b   Huang Ying   mm, THP, swap: de...
2361
  			 (1L << PG_swapcache) |
e9b61f198   Kirill A. Shutemov   thp: reintroduce ...
2362
2363
2364
  			 (1L << PG_mlocked) |
  			 (1L << PG_uptodate) |
  			 (1L << PG_active) |
1899ad18c   Johannes Weiner   mm: workingset: t...
2365
  			 (1L << PG_workingset) |
e9b61f198   Kirill A. Shutemov   thp: reintroduce ...
2366
  			 (1L << PG_locked) |
b8d3c4c30   Minchan Kim   mm/huge_memory.c:...
2367
2368
  			 (1L << PG_unevictable) |
  			 (1L << PG_dirty)));
e9b61f198   Kirill A. Shutemov   thp: reintroduce ...
2369

173d9d9fd   Hugh Dickins   mm/huge_memory: s...
2370
2371
2372
2373
2374
  	/* ->mapping in first tail page is compound_mapcount */
  	VM_BUG_ON_PAGE(tail > 2 && page_tail->mapping != TAIL_MAPPING,
  			page_tail);
  	page_tail->mapping = head->mapping;
  	page_tail->index = head->index + tail;
605ca5ede   Konstantin Khlebnikov   mm/huge_memory.c:...
2375
  	/* Page flags must be visible before we make the page non-compound. */
e9b61f198   Kirill A. Shutemov   thp: reintroduce ...
2376
  	smp_wmb();
605ca5ede   Konstantin Khlebnikov   mm/huge_memory.c:...
2377
2378
2379
2380
2381
2382
  	/*
  	 * Clear PageTail before unfreezing page refcount.
  	 *
  	 * After a successful get_page_unless_zero(), a put_page() might
  	 * follow, which needs a correct compound_head().
  	 */
e9b61f198   Kirill A. Shutemov   thp: reintroduce ...
2383
  	clear_compound_head(page_tail);
605ca5ede   Konstantin Khlebnikov   mm/huge_memory.c:...
2384
2385
2386
  	/* Finally unfreeze refcount. Additional reference from page cache. */
  	page_ref_unfreeze(page_tail, 1 + (!PageAnon(head) ||
  					  PageSwapCache(head)));
e9b61f198   Kirill A. Shutemov   thp: reintroduce ...
2387
2388
2389
2390
  	if (page_is_young(head))
  		set_page_young(page_tail);
  	if (page_is_idle(head))
  		set_page_idle(page_tail);
e9b61f198   Kirill A. Shutemov   thp: reintroduce ...
2391
  	page_cpupid_xchg_last(page_tail, page_cpupid_last(head));
94723aafb   Michal Hocko   mm: unclutter THP...
2392
2393
2394
2395
2396
2397
  
  	/*
  	 * always add to the tail because some iterators expect new
  	 * pages to show after the currently processed elements - e.g.
  	 * migrate_pages
  	 */
e9b61f198   Kirill A. Shutemov   thp: reintroduce ...
2398
  	lru_add_page_tail(head, page_tail, lruvec, list);
e9b61f198   Kirill A. Shutemov   thp: reintroduce ...
2399
  }
baa355fd3   Kirill A. Shutemov   thp: file pages s...
2400
  static void __split_huge_page(struct page *page, struct list_head *list,
006d3ff27   Hugh Dickins   mm/huge_memory: f...
2401
  		pgoff_t end, unsigned long flags)
e9b61f198   Kirill A. Shutemov   thp: reintroduce ...
2402
2403
  {
  	struct page *head = compound_head(page);
f4b7e272b   Andrey Ryabinin   mm: remove zone_l...
2404
  	pg_data_t *pgdat = page_pgdat(head);
e9b61f198   Kirill A. Shutemov   thp: reintroduce ...
2405
  	struct lruvec *lruvec;
4101196b1   Matthew Wilcox (Oracle)   mm: page cache: s...
2406
2407
  	struct address_space *swap_cache = NULL;
  	unsigned long offset = 0;
8df651c70   Kirill A. Shutemov   thp: cleanup spli...
2408
  	int i;
e9b61f198   Kirill A. Shutemov   thp: reintroduce ...
2409

f4b7e272b   Andrey Ryabinin   mm: remove zone_l...
2410
  	lruvec = mem_cgroup_page_lruvec(head, pgdat);
e9b61f198   Kirill A. Shutemov   thp: reintroduce ...
2411
2412
2413
  
  	/* complete memcg works before add pages to LRU */
  	mem_cgroup_split_huge_fixup(head);
4101196b1   Matthew Wilcox (Oracle)   mm: page cache: s...
2414
2415
2416
2417
2418
2419
2420
  	if (PageAnon(head) && PageSwapCache(head)) {
  		swp_entry_t entry = { .val = page_private(head) };
  
  		offset = swp_offset(entry);
  		swap_cache = swap_address_space(entry);
  		xa_lock(&swap_cache->i_pages);
  	}
baa355fd3   Kirill A. Shutemov   thp: file pages s...
2421
  	for (i = HPAGE_PMD_NR - 1; i >= 1; i--) {
8df651c70   Kirill A. Shutemov   thp: cleanup spli...
2422
  		__split_huge_page_tail(head, i, lruvec, list);
baa355fd3   Kirill A. Shutemov   thp: file pages s...
2423
2424
  		/* Some pages can be beyond i_size: drop them from page cache */
  		if (head[i].index >= end) {
2d077d4b5   Hugh Dickins   mm/huge_memory.c:...
2425
  			ClearPageDirty(head + i);
baa355fd3   Kirill A. Shutemov   thp: file pages s...
2426
  			__delete_from_page_cache(head + i, NULL);
800d8c63b   Kirill A. Shutemov   shmem: add huge p...
2427
2428
  			if (IS_ENABLED(CONFIG_SHMEM) && PageSwapBacked(head))
  				shmem_uncharge(head->mapping->host, 1);
baa355fd3   Kirill A. Shutemov   thp: file pages s...
2429
  			put_page(head + i);
4101196b1   Matthew Wilcox (Oracle)   mm: page cache: s...
2430
2431
2432
2433
2434
2435
  		} else if (!PageAnon(page)) {
  			__xa_store(&head->mapping->i_pages, head[i].index,
  					head + i, 0);
  		} else if (swap_cache) {
  			__xa_store(&swap_cache->i_pages, offset + i,
  					head + i, 0);
baa355fd3   Kirill A. Shutemov   thp: file pages s...
2436
2437
  		}
  	}
e9b61f198   Kirill A. Shutemov   thp: reintroduce ...
2438
2439
  
  	ClearPageCompound(head);
f7da677bc   Vlastimil Babka   mm, page_owner: h...
2440
2441
  
  	split_page_owner(head, HPAGE_PMD_ORDER);
baa355fd3   Kirill A. Shutemov   thp: file pages s...
2442
2443
  	/* See comment in __split_huge_page_tail() */
  	if (PageAnon(head)) {
aa5dc07f7   Matthew Wilcox   mm: Convert huge_...
2444
  		/* Additional pin to swap cache */
4101196b1   Matthew Wilcox (Oracle)   mm: page cache: s...
2445
  		if (PageSwapCache(head)) {
38d8b4e6b   Huang Ying   mm, THP, swap: de...
2446
  			page_ref_add(head, 2);
4101196b1   Matthew Wilcox (Oracle)   mm: page cache: s...
2447
2448
  			xa_unlock(&swap_cache->i_pages);
  		} else {
38d8b4e6b   Huang Ying   mm, THP, swap: de...
2449
  			page_ref_inc(head);
4101196b1   Matthew Wilcox (Oracle)   mm: page cache: s...
2450
  		}
baa355fd3   Kirill A. Shutemov   thp: file pages s...
2451
  	} else {
aa5dc07f7   Matthew Wilcox   mm: Convert huge_...
2452
  		/* Additional pin to page cache */
baa355fd3   Kirill A. Shutemov   thp: file pages s...
2453
  		page_ref_add(head, 2);
b93b01631   Matthew Wilcox   page cache: use x...
2454
  		xa_unlock(&head->mapping->i_pages);
baa355fd3   Kirill A. Shutemov   thp: file pages s...
2455
  	}
f4b7e272b   Andrey Ryabinin   mm: remove zone_l...
2456
  	spin_unlock_irqrestore(&pgdat->lru_lock, flags);
e9b61f198   Kirill A. Shutemov   thp: reintroduce ...
2457

906f9cdfc   Hugh Dickins   mm/huge_memory: r...
2458
  	remap_page(head);
e9b61f198   Kirill A. Shutemov   thp: reintroduce ...
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
  
  	for (i = 0; i < HPAGE_PMD_NR; i++) {
  		struct page *subpage = head + i;
  		if (subpage == page)
  			continue;
  		unlock_page(subpage);
  
  		/*
  		 * Subpages may be freed if there wasn't any mapping,
  		 * e.g. if add_to_swap() is running on an lru page that
  		 * had its mapping zapped. And freeing these pages
  		 * requires taking the lru_lock, so we do the put_page
  		 * of the tail pages after the split is complete.
  		 */
  		put_page(subpage);
  	}
  }
b20ce5e03   Kirill A. Shutemov   mm: prepare page_...
2476
2477
  int total_mapcount(struct page *page)
  {
dd78fedde   Kirill A. Shutemov   rmap: support fil...
2478
  	int i, compound, ret;
b20ce5e03   Kirill A. Shutemov   mm: prepare page_...
2479
2480
2481
2482
2483
  
  	VM_BUG_ON_PAGE(PageTail(page), page);
  
  	if (likely(!PageCompound(page)))
  		return atomic_read(&page->_mapcount) + 1;
dd78fedde   Kirill A. Shutemov   rmap: support fil...
2484
  	compound = compound_mapcount(page);
b20ce5e03   Kirill A. Shutemov   mm: prepare page_...
2485
  	if (PageHuge(page))
dd78fedde   Kirill A. Shutemov   rmap: support fil...
2486
2487
  		return compound;
  	ret = compound;
b20ce5e03   Kirill A. Shutemov   mm: prepare page_...
2488
2489
  	for (i = 0; i < HPAGE_PMD_NR; i++)
  		ret += atomic_read(&page[i]._mapcount) + 1;
dd78fedde   Kirill A. Shutemov   rmap: support fil...
2490
2491
2492
  	/* File pages have compound_mapcount included in _mapcount */
  	if (!PageAnon(page))
  		return ret - compound * HPAGE_PMD_NR;
b20ce5e03   Kirill A. Shutemov   mm: prepare page_...
2493
2494
2495
2496
  	if (PageDoubleMap(page))
  		ret -= HPAGE_PMD_NR;
  	return ret;
  }
e9b61f198   Kirill A. Shutemov   thp: reintroduce ...
2497
  /*
6d0a07edd   Andrea Arcangeli   mm: thp: calculat...
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
   * This calculates accurately how many mappings a transparent hugepage
   * has (unlike page_mapcount() which isn't fully accurate). This full
   * accuracy is primarily needed to know if copy-on-write faults can
   * reuse the page and change the mapping to read-write instead of
   * copying it. At the same time this returns the total_mapcount too.
   *
   * The function returns the highest mapcount any one of the subpages
   * has. If the return value is one, even if different processes are
   * mapping different subpages of the transparent hugepage, they can
   * all reuse it, because each process is reusing a different subpage.
   *
   * The total_mapcount is instead counting all virtual mappings of the
   * subpages. If the total_mapcount is equal to "one", it tells the
   * caller all mappings belong to the same "mm" and in turn the
   * anon_vma of the transparent hugepage can become the vma->anon_vma
   * local one as no other process may be mapping any of the subpages.
   *
   * It would be more accurate to replace page_mapcount() with
   * page_trans_huge_mapcount(), however we only use
   * page_trans_huge_mapcount() in the copy-on-write faults where we
   * need full accuracy to avoid breaking page pinning, because
   * page_trans_huge_mapcount() is slower than page_mapcount().
   */
  int page_trans_huge_mapcount(struct page *page, int *total_mapcount)
  {
  	int i, ret, _total_mapcount, mapcount;
  
  	/* hugetlbfs shouldn't call it */
  	VM_BUG_ON_PAGE(PageHuge(page), page);
  
  	if (likely(!PageTransCompound(page))) {
  		mapcount = atomic_read(&page->_mapcount) + 1;
  		if (total_mapcount)
  			*total_mapcount = mapcount;
  		return mapcount;
  	}
  
  	page = compound_head(page);
  
  	_total_mapcount = ret = 0;
  	for (i = 0; i < HPAGE_PMD_NR; i++) {
  		mapcount = atomic_read(&page[i]._mapcount) + 1;
  		ret = max(ret, mapcount);
  		_total_mapcount += mapcount;
  	}
  	if (PageDoubleMap(page)) {
  		ret -= 1;
  		_total_mapcount -= HPAGE_PMD_NR;
  	}
  	mapcount = compound_mapcount(page);
  	ret += mapcount;
  	_total_mapcount += mapcount;
  	if (total_mapcount)
  		*total_mapcount = _total_mapcount;
  	return ret;
  }
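
A sketch of the copy-on-write use described in the comment above: the write-fault path may reuse the huge page only when the highest per-subpage mapcount is one. This is a simplified illustration of the check done by do_huge_pmd_wp_page(), not the full reuse logic; the helper name is illustrative.

  static bool thp_wp_can_reuse_sketch(struct page *page)
  {
  	int total;
  
  	/* highest mapcount of any subpage; also reports the total */
  	if (page_trans_huge_mapcount(page, &total) == 1) {
  		/*
  		 * Each mapper uses a different subpage (or there is a single
  		 * mapper), so the fault can make the mapping writable instead
  		 * of copying the page.
  		 */
  		return true;
  	}
  	return false;
  }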
b8f593cd0   Huang Ying   mm, THP, swap: ch...
2554
2555
2556
2557
  /* Racy check whether the huge page can be split */
  bool can_split_huge_page(struct page *page, int *pextra_pins)
  {
  	int extra_pins;
aa5dc07f7   Matthew Wilcox   mm: Convert huge_...
2558
  	/* Additional pins from page cache */
b8f593cd0   Huang Ying   mm, THP, swap: ch...
2559
2560
2561
2562
2563
2564
2565
2566
  	if (PageAnon(page))
  		extra_pins = PageSwapCache(page) ? HPAGE_PMD_NR : 0;
  	else
  		extra_pins = HPAGE_PMD_NR;
  	if (pextra_pins)
  		*pextra_pins = extra_pins;
  	return total_mapcount(page) == page_count(page) - extra_pins - 1;
  }
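
As a worked example of the arithmetic above (numbers assumed for illustration): an anon THP that sits in the swap cache and is mapped by a single pmd has total_mapcount == 1 and extra_pins == HPAGE_PMD_NR, so with the caller's own pin the check only passes when page_count == 1 + HPAGE_PMD_NR + 1; any additional reference (for example a GUP pin) makes the split bail out with -EBUSY.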
6d0a07edd   Andrea Arcangeli   mm: thp: calculat...
2567
  /*
e9b61f198   Kirill A. Shutemov   thp: reintroduce ...
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
   * This function splits huge page into normal pages. @page can point to any
   * subpage of huge page to split. Split doesn't change the position of @page.
   *
   * The caller must hold a pin on the @page, otherwise split fails with -EBUSY.
   * The huge page must be locked.
   *
   * If @list is null, tail pages will be added to LRU list, otherwise, to @list.
   *
   * Both head page and tail pages will inherit mapping, flags, and so on from
   * the hugepage.
   *
   * The GUP pin and PG_locked are transferred to @page. The rest of the
   * subpages can be freed if they are not mapped.
   *
   * Returns 0 if the hugepage is split successfully.
   * Returns -EBUSY if the page is pinned or if anon_vma disappeared from under
   * us.
   */
  int split_huge_page_to_list(struct page *page, struct list_head *list)
  {
  	struct page *head = compound_head(page);
a3d0a9185   Kirill A. Shutemov   thp: make split_q...
2589
  	struct pglist_data *pgdata = NODE_DATA(page_to_nid(head));
87eaceb3f   Yang Shi   mm: thp: make def...
2590
  	struct deferred_split *ds_queue = get_deferred_split_queue(page);
baa355fd3   Kirill A. Shutemov   thp: file pages s...
2591
2592
2593
  	struct anon_vma *anon_vma = NULL;
  	struct address_space *mapping = NULL;
  	int count, mapcount, extra_pins, ret;
d96543223   Kirill A. Shutemov   thp: increase spl...
2594
  	bool mlocked;
0b9b6fff7   Kirill A. Shutemov   thp: fix interrup...
2595
  	unsigned long flags;
006d3ff27   Hugh Dickins   mm/huge_memory: f...
2596
  	pgoff_t end;
e9b61f198   Kirill A. Shutemov   thp: reintroduce ...
2597

0eb282cb1   Wei Yang   mm/huge_memory.c:...
2598
  	VM_BUG_ON_PAGE(is_huge_zero_page(head), head);
e9b61f198   Kirill A. Shutemov   thp: reintroduce ...
2599
  	VM_BUG_ON_PAGE(!PageLocked(page), page);
e9b61f198   Kirill A. Shutemov   thp: reintroduce ...
2600
  	VM_BUG_ON_PAGE(!PageCompound(page), page);
59807685a   Huang Ying   mm, THP, swap: su...
2601
2602
  	if (PageWriteback(page))
  		return -EBUSY;
baa355fd3   Kirill A. Shutemov   thp: file pages s...
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
  	if (PageAnon(head)) {
  		/*
  		 * The caller does not necessarily hold an mmap_sem that would
  		 * prevent the anon_vma disappearing so we first take a
  		 * reference to it and then lock the anon_vma for write. This
  		 * is similar to page_lock_anon_vma_read except the write lock
  		 * is taken to serialise against parallel split or collapse
  		 * operations.
  		 */
  		anon_vma = page_get_anon_vma(head);
  		if (!anon_vma) {
  			ret = -EBUSY;
  			goto out;
  		}
006d3ff27   Hugh Dickins   mm/huge_memory: f...
2617
  		end = -1;
baa355fd3   Kirill A. Shutemov   thp: file pages s...
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
  		mapping = NULL;
  		anon_vma_lock_write(anon_vma);
  	} else {
  		mapping = head->mapping;
  
  		/* Truncated ? */
  		if (!mapping) {
  			ret = -EBUSY;
  			goto out;
  		}
baa355fd3   Kirill A. Shutemov   thp: file pages s...
2628
2629
  		anon_vma = NULL;
  		i_mmap_lock_read(mapping);
006d3ff27   Hugh Dickins   mm/huge_memory: f...
2630
2631
2632
2633
2634
2635
2636
2637
2638
  
  		/*
  		 * __split_huge_page() may need to trim off pages beyond EOF:
  		 * but on 32-bit, i_size_read() takes an irq-unsafe seqlock,
  		 * which cannot be nested inside the page tree lock. So note
  		 * end now: i_size itself may be changed at any moment, but
  		 * head page lock is good enough to serialize the trimming.
  		 */
  		end = DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE);
e9b61f198   Kirill A. Shutemov   thp: reintroduce ...
2639
  	}
e9b61f198   Kirill A. Shutemov   thp: reintroduce ...
2640
2641
  
  	/*
906f9cdfc   Hugh Dickins   mm/huge_memory: r...
2642
  	 * Racy check if we can split the page, before unmap_page() will
e9b61f198   Kirill A. Shutemov   thp: reintroduce ...
2643
2644
  	 * split PMDs
  	 */
b8f593cd0   Huang Ying   mm, THP, swap: ch...
2645
  	if (!can_split_huge_page(head, &extra_pins)) {
e9b61f198   Kirill A. Shutemov   thp: reintroduce ...
2646
2647
2648
  		ret = -EBUSY;
  		goto out_unlock;
  	}
d96543223   Kirill A. Shutemov   thp: increase spl...
2649
  	mlocked = PageMlocked(page);
906f9cdfc   Hugh Dickins   mm/huge_memory: r...
2650
  	unmap_page(head);
e9b61f198   Kirill A. Shutemov   thp: reintroduce ...
2651
  	VM_BUG_ON_PAGE(compound_mapcount(head), head);
d96543223   Kirill A. Shutemov   thp: increase spl...
2652
2653
2654
  	/* Make sure the page is not on per-CPU pagevec as it takes pin */
  	if (mlocked)
  		lru_add_drain();
baa355fd3   Kirill A. Shutemov   thp: file pages s...
2655
  	/* prevent PageLRU from going away from under us, and freeze lru stats */
f4b7e272b   Andrey Ryabinin   mm: remove zone_l...
2656
  	spin_lock_irqsave(&pgdata->lru_lock, flags);
baa355fd3   Kirill A. Shutemov   thp: file pages s...
2657
2658
  
  	if (mapping) {
aa5dc07f7   Matthew Wilcox   mm: Convert huge_...
2659
  		XA_STATE(xas, &mapping->i_pages, page_index(head));
baa355fd3   Kirill A. Shutemov   thp: file pages s...
2660

baa355fd3   Kirill A. Shutemov   thp: file pages s...
2661
  		/*
aa5dc07f7   Matthew Wilcox   mm: Convert huge_...
2662
  		 * Check if the head page is present in page cache.
baa355fd3   Kirill A. Shutemov   thp: file pages s...
2663
2664
  		 * We assume all tail pages are present too, if the head is there.
  		 */
aa5dc07f7   Matthew Wilcox   mm: Convert huge_...
2665
2666
  		xa_lock(&mapping->i_pages);
  		if (xas_load(&xas) != head)
baa355fd3   Kirill A. Shutemov   thp: file pages s...
2667
2668
  			goto fail;
  	}
0139aa7b7   Joonsoo Kim   mm: rename _count...
2669
  	/* Prevent deferred_split_scan() touching ->_refcount */
364c1eebe   Yang Shi   mm: thp: extract ...
2670
  	spin_lock(&ds_queue->split_queue_lock);
e9b61f198   Kirill A. Shutemov   thp: reintroduce ...
2671
2672
  	count = page_count(head);
  	mapcount = total_mapcount(head);
baa355fd3   Kirill A. Shutemov   thp: file pages s...
2673
  	if (!mapcount && page_ref_freeze(head, 1 + extra_pins)) {
9a982250f   Kirill A. Shutemov   thp: introduce de...
2674
  		if (!list_empty(page_deferred_list(head))) {
364c1eebe   Yang Shi   mm: thp: extract ...
2675
  			ds_queue->split_queue_len--;
9a982250f   Kirill A. Shutemov   thp: introduce de...
2676
2677
  			list_del(page_deferred_list(head));
  		}
06d3eff62   Kirill A. Shutemov   mm/thp: fix node ...
2678
2679
2680
2681
2682
2683
  		if (mapping) {
  			if (PageSwapBacked(page))
  				__dec_node_page_state(page, NR_SHMEM_THPS);
  			else
  				__dec_node_page_state(page, NR_FILE_THPS);
  		}
364c1eebe   Yang Shi   mm: thp: extract ...
2684
  		spin_unlock(&ds_queue->split_queue_lock);
006d3ff27   Hugh Dickins   mm/huge_memory: f...
2685
  		__split_huge_page(page, list, end, flags);
59807685a   Huang Ying   mm, THP, swap: su...
2686
2687
2688
2689
2690
2691
  		if (PageSwapCache(head)) {
  			swp_entry_t entry = { .val = page_private(head) };
  
  			ret = split_swap_cluster(entry);
  		} else
  			ret = 0;
e9b61f198   Kirill A. Shutemov   thp: reintroduce ...
2692
  	} else {
baa355fd3   Kirill A. Shutemov   thp: file pages s...
2693
2694
2695
2696
2697
2698
2699
2700
2701
  		if (IS_ENABLED(CONFIG_DEBUG_VM) && mapcount) {
  			pr_alert("total_mapcount: %u, page_count(): %u
  ",
  					mapcount, count);
  			if (PageTail(page))
  				dump_page(head, NULL);
  			dump_page(page, "total_mapcount(head) > 0");
  			BUG();
  		}
364c1eebe   Yang Shi   mm: thp: extract ...
2702
  		spin_unlock(&ds_queue->split_queue_lock);
baa355fd3   Kirill A. Shutemov   thp: file pages s...
2703
  fail:		if (mapping)
b93b01631   Matthew Wilcox   page cache: use x...
2704
  			xa_unlock(&mapping->i_pages);
f4b7e272b   Andrey Ryabinin   mm: remove zone_l...
2705
  		spin_unlock_irqrestore(&pgdata->lru_lock, flags);
906f9cdfc   Hugh Dickins   mm/huge_memory: r...
2706
  		remap_page(head);
e9b61f198   Kirill A. Shutemov   thp: reintroduce ...
2707
2708
2709
2710
  		ret = -EBUSY;
  	}
  
  out_unlock:
baa355fd3   Kirill A. Shutemov   thp: file pages s...
2711
2712
2713
2714
2715
2716
  	if (anon_vma) {
  		anon_vma_unlock_write(anon_vma);
  		put_anon_vma(anon_vma);
  	}
  	if (mapping)
  		i_mmap_unlock_read(mapping);
e9b61f198   Kirill A. Shutemov   thp: reintroduce ...
2717
2718
2719
2720
  out:
  	count_vm_event(!ret ? THP_SPLIT_PAGE : THP_SPLIT_PAGE_FAILED);
  	return ret;
  }
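
A minimal sketch of the calling convention documented above, mirroring the pattern used by deferred_split_scan() and the debugfs helper later in this file; split_huge_page() is the list == NULL wrapper declared in include/linux/huge_mm.h, and the helper name below is illustrative.

  static int try_split_one_page_sketch(struct page *page)
  {
  	int ret = -EBUSY;
  
  	/* the caller needs its own pin and the page lock on some subpage */
  	if (!get_page_unless_zero(page))
  		return ret;
  	if (trylock_page(page)) {
  		ret = split_huge_page(page);	/* 0 on success, -EBUSY otherwise */
  		unlock_page(page);
  	}
  	put_page(page);
  	return ret;
  }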
9a982250f   Kirill A. Shutemov   thp: introduce de...
2721
2722
2723
  
  void free_transhuge_page(struct page *page)
  {
87eaceb3f   Yang Shi   mm: thp: make def...
2724
  	struct deferred_split *ds_queue = get_deferred_split_queue(page);
9a982250f   Kirill A. Shutemov   thp: introduce de...
2725
  	unsigned long flags;
364c1eebe   Yang Shi   mm: thp: extract ...
2726
  	spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
9a982250f   Kirill A. Shutemov   thp: introduce de...
2727
  	if (!list_empty(page_deferred_list(page))) {
364c1eebe   Yang Shi   mm: thp: extract ...
2728
  		ds_queue->split_queue_len--;
9a982250f   Kirill A. Shutemov   thp: introduce de...
2729
2730
  		list_del(page_deferred_list(page));
  	}
364c1eebe   Yang Shi   mm: thp: extract ...
2731
  	spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
9a982250f   Kirill A. Shutemov   thp: introduce de...
2732
2733
2734
2735
2736
  	free_compound_page(page);
  }
  
  void deferred_split_huge_page(struct page *page)
  {
87eaceb3f   Yang Shi   mm: thp: make def...
2737
2738
2739
2740
  	struct deferred_split *ds_queue = get_deferred_split_queue(page);
  #ifdef CONFIG_MEMCG
  	struct mem_cgroup *memcg = compound_head(page)->mem_cgroup;
  #endif
9a982250f   Kirill A. Shutemov   thp: introduce de...
2741
2742
2743
  	unsigned long flags;
  
  	VM_BUG_ON_PAGE(!PageTransHuge(page), page);
87eaceb3f   Yang Shi   mm: thp: make def...
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
  	/*
  	 * The try_to_unmap() in the page reclaim path might reach here too;
  	 * this may cause a race condition that corrupts the deferred split
  	 * queue.
  	 * And, if page reclaim is already handling the same page, it is
  	 * unnecessary to handle it again in shrinker.
  	 *
  	 * Check PageSwapCache to determine if the page is being
  	 * handled by page reclaim since THP swap would add the page into
  	 * swap cache before calling try_to_unmap().
  	 */
  	if (PageSwapCache(page))
  		return;
364c1eebe   Yang Shi   mm: thp: extract ...
2756
  	spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
9a982250f   Kirill A. Shutemov   thp: introduce de...
2757
  	if (list_empty(page_deferred_list(page))) {
f9719a03d   Kirill A. Shutemov   thp, vmstats: cou...
2758
  		count_vm_event(THP_DEFERRED_SPLIT_PAGE);
364c1eebe   Yang Shi   mm: thp: extract ...
2759
2760
  		list_add_tail(page_deferred_list(page), &ds_queue->split_queue);
  		ds_queue->split_queue_len++;
87eaceb3f   Yang Shi   mm: thp: make def...
2761
2762
2763
2764
2765
  #ifdef CONFIG_MEMCG
  		if (memcg)
  			memcg_set_shrinker_bit(memcg, page_to_nid(page),
  					       deferred_split_shrinker.id);
  #endif
9a982250f   Kirill A. Shutemov   thp: introduce de...
2766
  	}
364c1eebe   Yang Shi   mm: thp: extract ...
2767
  	spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
9a982250f   Kirill A. Shutemov   thp: introduce de...
2768
2769
2770
2771
2772
  }
  
  static unsigned long deferred_split_count(struct shrinker *shrink,
  		struct shrink_control *sc)
  {
a3d0a9185   Kirill A. Shutemov   thp: make split_q...
2773
  	struct pglist_data *pgdata = NODE_DATA(sc->nid);
364c1eebe   Yang Shi   mm: thp: extract ...
2774
  	struct deferred_split *ds_queue = &pgdata->deferred_split_queue;
87eaceb3f   Yang Shi   mm: thp: make def...
2775
2776
2777
2778
2779
  
  #ifdef CONFIG_MEMCG
  	if (sc->memcg)
  		ds_queue = &sc->memcg->deferred_split_queue;
  #endif
364c1eebe   Yang Shi   mm: thp: extract ...
2780
  	return READ_ONCE(ds_queue->split_queue_len);
9a982250f   Kirill A. Shutemov   thp: introduce de...
2781
2782
2783
2784
2785
  }
  
  static unsigned long deferred_split_scan(struct shrinker *shrink,
  		struct shrink_control *sc)
  {
a3d0a9185   Kirill A. Shutemov   thp: make split_q...
2786
  	struct pglist_data *pgdata = NODE_DATA(sc->nid);
364c1eebe   Yang Shi   mm: thp: extract ...
2787
  	struct deferred_split *ds_queue = &pgdata->deferred_split_queue;
9a982250f   Kirill A. Shutemov   thp: introduce de...
2788
2789
2790
2791
  	unsigned long flags;
  	LIST_HEAD(list), *pos, *next;
  	struct page *page;
  	int split = 0;
87eaceb3f   Yang Shi   mm: thp: make def...
2792
2793
2794
2795
  #ifdef CONFIG_MEMCG
  	if (sc->memcg)
  		ds_queue = &sc->memcg->deferred_split_queue;
  #endif
364c1eebe   Yang Shi   mm: thp: extract ...
2796
  	spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
9a982250f   Kirill A. Shutemov   thp: introduce de...
2797
  	/* Take pin on all head pages to avoid freeing them under us */
364c1eebe   Yang Shi   mm: thp: extract ...
2798
  	list_for_each_safe(pos, next, &ds_queue->split_queue) {
9a982250f   Kirill A. Shutemov   thp: introduce de...
2799
2800
  		page = list_entry((void *)pos, struct page, mapping);
  		page = compound_head(page);
e3ae19535   Kirill A. Shutemov   thp: limit number...
2801
2802
2803
2804
  		if (get_page_unless_zero(page)) {
  			list_move(page_deferred_list(page), &list);
  		} else {
  			/* We lost race with put_compound_page() */
9a982250f   Kirill A. Shutemov   thp: introduce de...
2805
  			list_del_init(page_deferred_list(page));
364c1eebe   Yang Shi   mm: thp: extract ...
2806
  			ds_queue->split_queue_len--;
9a982250f   Kirill A. Shutemov   thp: introduce de...
2807
  		}
e3ae19535   Kirill A. Shutemov   thp: limit number...
2808
2809
  		if (!--sc->nr_to_scan)
  			break;
9a982250f   Kirill A. Shutemov   thp: introduce de...
2810
  	}
364c1eebe   Yang Shi   mm: thp: extract ...
2811
  	spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
9a982250f   Kirill A. Shutemov   thp: introduce de...
2812
2813
2814
  
  	list_for_each_safe(pos, next, &list) {
  		page = list_entry((void *)pos, struct page, mapping);
fa41b900c   Kirill A. Shutemov   mm/thp: do not wa...
2815
2816
  		if (!trylock_page(page))
  			goto next;
9a982250f   Kirill A. Shutemov   thp: introduce de...
2817
2818
2819
2820
  		/* split_huge_page() removes page from list on success */
  		if (!split_huge_page(page))
  			split++;
  		unlock_page(page);
fa41b900c   Kirill A. Shutemov   mm/thp: do not wa...
2821
  next:
9a982250f   Kirill A. Shutemov   thp: introduce de...
2822
2823
  		put_page(page);
  	}
364c1eebe   Yang Shi   mm: thp: extract ...
2824
2825
2826
  	spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
  	list_splice_tail(&list, &ds_queue->split_queue);
  	spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
9a982250f   Kirill A. Shutemov   thp: introduce de...
2827

cb8d68ec1   Kirill A. Shutemov   thp: change defer...
2828
2829
2830
2831
  	/*
  	 * Stop the shrinker if we didn't split any page but the queue is now
  	 * empty. This can happen if the pages were freed under us.
  	 */
364c1eebe   Yang Shi   mm: thp: extract ...
2832
  	if (!split && list_empty(&ds_queue->split_queue))
cb8d68ec1   Kirill A. Shutemov   thp: change defer...
2833
2834
  		return SHRINK_STOP;
  	return split;
9a982250f   Kirill A. Shutemov   thp: introduce de...
2835
2836
2837
2838
2839
2840
  }
  
  static struct shrinker deferred_split_shrinker = {
  	.count_objects = deferred_split_count,
  	.scan_objects = deferred_split_scan,
  	.seeks = DEFAULT_SEEKS,
87eaceb3f   Yang Shi   mm: thp: make def...
2841
2842
  	.flags = SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE |
  		 SHRINKER_NONSLAB,
9a982250f   Kirill A. Shutemov   thp: introduce de...
2843
  };
49071d436   Kirill A. Shutemov   thp: add debugfs ...
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
  
  #ifdef CONFIG_DEBUG_FS
  static int split_huge_pages_set(void *data, u64 val)
  {
  	struct zone *zone;
  	struct page *page;
  	unsigned long pfn, max_zone_pfn;
  	unsigned long total = 0, split = 0;
  
  	if (val != 1)
  		return -EINVAL;
  
  	for_each_populated_zone(zone) {
  		max_zone_pfn = zone_end_pfn(zone);
  		for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) {
  			if (!pfn_valid(pfn))
  				continue;
  
  			page = pfn_to_page(pfn);
  			if (!get_page_unless_zero(page))
  				continue;
  
  			if (zone != page_zone(page))
  				goto next;
baa355fd3   Kirill A. Shutemov   thp: file pages s...
2868
  			if (!PageHead(page) || PageHuge(page) || !PageLRU(page))
49071d436   Kirill A. Shutemov   thp: add debugfs ...
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
  				goto next;
  
  			total++;
  			lock_page(page);
  			if (!split_huge_page(page))
  				split++;
  			unlock_page(page);
  next:
  			put_page(page);
  		}
  	}
145bdaa15   Yang Shi   mm: thp: correct ...
2880
2881
  	pr_info("%lu of %lu THP split
  ", split, total);
49071d436   Kirill A. Shutemov   thp: add debugfs ...
2882
2883
2884
2885
2886
2887
2888
2889
2890
  
  	return 0;
  }
  DEFINE_SIMPLE_ATTRIBUTE(split_huge_pages_fops, NULL, split_huge_pages_set,
  		"%llu
  ");
  
  static int __init split_huge_pages_debugfs(void)
  {
d9f7979c9   Greg Kroah-Hartman   mm: no need to ch...
2891
2892
  	debugfs_create_file("split_huge_pages", 0200, NULL, NULL,
  			    &split_huge_pages_fops);
49071d436   Kirill A. Shutemov   thp: add debugfs ...
2893
2894
2895
2896
  	return 0;
  }
  late_initcall(split_huge_pages_debugfs);
  #endif
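
For reference, the debugfs knob above is write-only (mode 0200) and only accepts the value 1; assuming debugfs is mounted at its usual location, the scan over all populated zones can be triggered with `echo 1 > /sys/kernel/debug/split_huge_pages`, and the outcome is reported through the pr_info() above.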
616b83715   Zi Yan   mm: thp: enable t...
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
  
  #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
  void set_pmd_migration_entry(struct page_vma_mapped_walk *pvmw,
  		struct page *page)
  {
  	struct vm_area_struct *vma = pvmw->vma;
  	struct mm_struct *mm = vma->vm_mm;
  	unsigned long address = pvmw->address;
  	pmd_t pmdval;
  	swp_entry_t entry;
ab6e3d093   Naoya Horiguchi   mm: soft-dirty: k...
2907
  	pmd_t pmdswp;
616b83715   Zi Yan   mm: thp: enable t...
2908
2909
2910
  
  	if (!(pvmw->pmd && !pvmw->pte))
  		return;
616b83715   Zi Yan   mm: thp: enable t...
2911
2912
2913
2914
2915
2916
  	flush_cache_range(vma, address, address + HPAGE_PMD_SIZE);
  	pmdval = *pvmw->pmd;
  	pmdp_invalidate(vma, address, pvmw->pmd);
  	if (pmd_dirty(pmdval))
  		set_page_dirty(page);
  	entry = make_migration_entry(page, pmd_write(pmdval));
ab6e3d093   Naoya Horiguchi   mm: soft-dirty: k...
2917
2918
2919
2920
  	pmdswp = swp_entry_to_pmd(entry);
  	if (pmd_soft_dirty(pmdval))
  		pmdswp = pmd_swp_mksoft_dirty(pmdswp);
  	set_pmd_at(mm, address, pvmw->pmd, pmdswp);
616b83715   Zi Yan   mm: thp: enable t...
2921
2922
  	page_remove_rmap(page, true);
  	put_page(page);
616b83715   Zi Yan   mm: thp: enable t...
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
  }
  
  void remove_migration_pmd(struct page_vma_mapped_walk *pvmw, struct page *new)
  {
  	struct vm_area_struct *vma = pvmw->vma;
  	struct mm_struct *mm = vma->vm_mm;
  	unsigned long address = pvmw->address;
  	unsigned long mmun_start = address & HPAGE_PMD_MASK;
  	pmd_t pmde;
  	swp_entry_t entry;
  
  	if (!(pvmw->pmd && !pvmw->pte))
  		return;
  
  	entry = pmd_to_swp_entry(*pvmw->pmd);
  	get_page(new);
  	pmde = pmd_mkold(mk_huge_pmd(new, vma->vm_page_prot));
ab6e3d093   Naoya Horiguchi   mm: soft-dirty: k...
2940
2941
  	if (pmd_swp_soft_dirty(*pvmw->pmd))
  		pmde = pmd_mksoft_dirty(pmde);
616b83715   Zi Yan   mm: thp: enable t...
2942
  	if (is_write_migration_entry(entry))
f55e1014f   Linus Torvalds   Revert "mm, thp: ...
2943
  		pmde = maybe_pmd_mkwrite(pmde, vma);
616b83715   Zi Yan   mm: thp: enable t...
2944
2945
  
  	flush_cache_range(vma, mmun_start, mmun_start + HPAGE_PMD_SIZE);
e71769ae5   Naoya Horiguchi   mm: enable thp mi...
2946
2947
2948
2949
  	if (PageAnon(new))
  		page_add_anon_rmap(new, vma, mmun_start, true);
  	else
  		page_add_file_rmap(new, true);
616b83715   Zi Yan   mm: thp: enable t...
2950
  	set_pmd_at(mm, mmun_start, pvmw->pmd, pmde);
e125fe405   Kirill A. Shutemov   mm, thp: fix mloc...
2951
  	if ((vma->vm_flags & VM_LOCKED) && !PageDoubleMap(new))
616b83715   Zi Yan   mm: thp: enable t...
2952
2953
2954
2955
  		mlock_vma_page(new);
  	update_mmu_cache_pmd(vma, address, pvmw->pmd);
  }
  #endif