mm/huge_memory.c

// SPDX-License-Identifier: GPL-2.0-only
/*
 *  Copyright (C) 2009  Red Hat, Inc.
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/mm.h>
#include <linux/sched.h>
#include <linux/sched/coredump.h>
#include <linux/sched/numa_balancing.h>
#include <linux/highmem.h>
#include <linux/hugetlb.h>
#include <linux/mmu_notifier.h>
#include <linux/rmap.h>
#include <linux/swap.h>
#include <linux/shrinker.h>
#include <linux/mm_inline.h>
#include <linux/swapops.h>
#include <linux/dax.h>
#include <linux/khugepaged.h>
#include <linux/freezer.h>
#include <linux/pfn_t.h>
#include <linux/mman.h>
#include <linux/memremap.h>
#include <linux/pagemap.h>
#include <linux/debugfs.h>
#include <linux/migrate.h>
#include <linux/hashtable.h>
#include <linux/userfaultfd_k.h>
#include <linux/page_idle.h>
#include <linux/shmem_fs.h>
#include <linux/oom.h>
#include <linux/numa.h>
#include <linux/page_owner.h>

#include <asm/tlb.h>
#include <asm/pgalloc.h>
#include "internal.h"
/*
 * By default, transparent hugepage support is disabled in order to avoid
 * risking an increased memory footprint for applications that are not
 * guaranteed to benefit from it. When transparent hugepage support is
 * enabled, it is for all mappings, and khugepaged scans all mappings.
 * Defrag is invoked by khugepaged hugepage allocations and by page faults
 * for all hugepage allocations.
 */
unsigned long transparent_hugepage_flags __read_mostly =
#ifdef CONFIG_TRANSPARENT_HUGEPAGE_ALWAYS
	(1<<TRANSPARENT_HUGEPAGE_FLAG)|
#endif
#ifdef CONFIG_TRANSPARENT_HUGEPAGE_MADVISE
	(1<<TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG)|
#endif
	(1<<TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG)|
	(1<<TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG)|
	(1<<TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);

static struct shrinker deferred_split_shrinker;

static atomic_t huge_zero_refcount;
struct page *huge_zero_page __read_mostly;

bool transparent_hugepage_enabled(struct vm_area_struct *vma)
{
	/* The addr is used to check if the vma size fits */
	unsigned long addr = (vma->vm_end & HPAGE_PMD_MASK) - HPAGE_PMD_SIZE;

	if (!transhuge_vma_suitable(vma, addr))
		return false;
	if (vma_is_anonymous(vma))
		return __transparent_hugepage_enabled(vma);
	if (vma_is_shmem(vma))
		return shmem_huge_enabled(vma);

	return false;
}

static struct page *get_huge_zero_page(void)
{
	struct page *zero_page;
retry:
	if (likely(atomic_inc_not_zero(&huge_zero_refcount)))
		return READ_ONCE(huge_zero_page);

	zero_page = alloc_pages((GFP_TRANSHUGE | __GFP_ZERO) & ~__GFP_MOVABLE,
			HPAGE_PMD_ORDER);
	if (!zero_page) {
		count_vm_event(THP_ZERO_PAGE_ALLOC_FAILED);
		return NULL;
	}
	count_vm_event(THP_ZERO_PAGE_ALLOC);
	preempt_disable();
	if (cmpxchg(&huge_zero_page, NULL, zero_page)) {
		preempt_enable();
		__free_pages(zero_page, compound_order(zero_page));
		goto retry;
	}

	/* We take additional reference here. It will be put back by shrinker */
	atomic_set(&huge_zero_refcount, 2);
	preempt_enable();
	return READ_ONCE(huge_zero_page);
}

static void put_huge_zero_page(void)
{
	/*
	 * Counter should never go to zero here. Only shrinker can put
	 * last reference.
	 */
	BUG_ON(atomic_dec_and_test(&huge_zero_refcount));
}
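
/*
 * Per-mm interface to the huge zero page: each mm_struct takes at most one
 * reference, tracked by the MMF_HUGE_ZERO_PAGE flag, and drops it again via
 * mm_put_huge_zero_page() when the mm is torn down.
 */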
  struct page *mm_get_huge_zero_page(struct mm_struct *mm)
  {
  	if (test_bit(MMF_HUGE_ZERO_PAGE, &mm->flags))
  		return READ_ONCE(huge_zero_page);
  
  	if (!get_huge_zero_page())
  		return NULL;
  
  	if (test_and_set_bit(MMF_HUGE_ZERO_PAGE, &mm->flags))
  		put_huge_zero_page();
  
  	return READ_ONCE(huge_zero_page);
  }
  
  void mm_put_huge_zero_page(struct mm_struct *mm)
  {
  	if (test_bit(MMF_HUGE_ZERO_PAGE, &mm->flags))
  		put_huge_zero_page();
  }

static unsigned long shrink_huge_zero_page_count(struct shrinker *shrink,
					struct shrink_control *sc)
{
	/* we can free zero page only if last reference remains */
	return atomic_read(&huge_zero_refcount) == 1 ? HPAGE_PMD_NR : 0;
}

static unsigned long shrink_huge_zero_page_scan(struct shrinker *shrink,
				       struct shrink_control *sc)
{
	if (atomic_cmpxchg(&huge_zero_refcount, 1, 0) == 1) {
		struct page *zero_page = xchg(&huge_zero_page, NULL);
		BUG_ON(zero_page == NULL);
		__free_pages(zero_page, compound_order(zero_page));
		return HPAGE_PMD_NR;
	}

	return 0;
}
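
/*
 * Registered in hugepage_init(); under memory pressure this shrinker drops
 * the cached extra reference taken in get_huge_zero_page() and frees the
 * huge zero page once no mm is using it any more.
 */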
static struct shrinker huge_zero_page_shrinker = {
	.count_objects = shrink_huge_zero_page_count,
	.scan_objects = shrink_huge_zero_page_scan,
	.seeks = DEFAULT_SEEKS,
};

#ifdef CONFIG_SYSFS
static ssize_t enabled_show(struct kobject *kobj,
			    struct kobj_attribute *attr, char *buf)
{
	if (test_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags))
		return sprintf(buf, "[always] madvise never\n");
	else if (test_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags))
		return sprintf(buf, "always [madvise] never\n");
	else
		return sprintf(buf, "always madvise [never]\n");
}

static ssize_t enabled_store(struct kobject *kobj,
			     struct kobj_attribute *attr,
			     const char *buf, size_t count)
{
	ssize_t ret = count;

	if (sysfs_streq(buf, "always")) {
		clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags);
		set_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags);
	} else if (sysfs_streq(buf, "madvise")) {
		clear_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags);
		set_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags);
	} else if (sysfs_streq(buf, "never")) {
		clear_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags);
		clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags);
	} else
		ret = -EINVAL;

	if (ret > 0) {
		int err = start_stop_khugepaged();

		if (err)
			ret = err;
	}
	return ret;
}
static struct kobj_attribute enabled_attr =
	__ATTR(enabled, 0644, enabled_show, enabled_store);
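
/*
 * Example: switching the "enabled" attribute from user space (see
 * Documentation/admin-guide/mm/transhuge.rst):
 *
 *   echo always  > /sys/kernel/mm/transparent_hugepage/enabled
 *   echo madvise > /sys/kernel/mm/transparent_hugepage/enabled
 *   echo never   > /sys/kernel/mm/transparent_hugepage/enabled
 */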
ssize_t single_hugepage_flag_show(struct kobject *kobj,
				struct kobj_attribute *attr, char *buf,
				enum transparent_hugepage_flag flag)
{
	return sprintf(buf, "%d\n",
		       !!test_bit(flag, &transparent_hugepage_flags));
}

ssize_t single_hugepage_flag_store(struct kobject *kobj,
				 struct kobj_attribute *attr,
				 const char *buf, size_t count,
				 enum transparent_hugepage_flag flag)
{
	unsigned long value;
	int ret;

	ret = kstrtoul(buf, 10, &value);
	if (ret < 0)
		return ret;
	if (value > 1)
		return -EINVAL;

	if (value)
		set_bit(flag, &transparent_hugepage_flags);
	else
		clear_bit(flag, &transparent_hugepage_flags);

	return count;
}

static ssize_t defrag_show(struct kobject *kobj,
			   struct kobj_attribute *attr, char *buf)
{
	if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags))
		return sprintf(buf, "[always] defer defer+madvise madvise never\n");
	if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags))
		return sprintf(buf, "always [defer] defer+madvise madvise never\n");
	if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags))
		return sprintf(buf, "always defer [defer+madvise] madvise never\n");
	if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags))
		return sprintf(buf, "always defer defer+madvise [madvise] never\n");
	return sprintf(buf, "always defer defer+madvise madvise [never]\n");
}

static ssize_t defrag_store(struct kobject *kobj,
			    struct kobj_attribute *attr,
			    const char *buf, size_t count)
{
	if (sysfs_streq(buf, "always")) {
		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
		set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
	} else if (sysfs_streq(buf, "defer+madvise")) {
		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
		set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
	} else if (sysfs_streq(buf, "defer")) {
		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
		set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
	} else if (sysfs_streq(buf, "madvise")) {
		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
		set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
	} else if (sysfs_streq(buf, "never")) {
		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
	} else
		return -EINVAL;

	return count;
}
static struct kobj_attribute defrag_attr =
	__ATTR(defrag, 0644, defrag_show, defrag_store);

static ssize_t use_zero_page_show(struct kobject *kobj,
		struct kobj_attribute *attr, char *buf)
{
	return single_hugepage_flag_show(kobj, attr, buf,
				TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);
}
static ssize_t use_zero_page_store(struct kobject *kobj,
		struct kobj_attribute *attr, const char *buf, size_t count)
{
	return single_hugepage_flag_store(kobj, attr, buf, count,
				 TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);
}
static struct kobj_attribute use_zero_page_attr =
	__ATTR(use_zero_page, 0644, use_zero_page_show, use_zero_page_store);

static ssize_t hpage_pmd_size_show(struct kobject *kobj,
		struct kobj_attribute *attr, char *buf)
{
	return sprintf(buf, "%lu\n", HPAGE_PMD_SIZE);
}
static struct kobj_attribute hpage_pmd_size_attr =
	__ATTR_RO(hpage_pmd_size);

static struct attribute *hugepage_attr[] = {
	&enabled_attr.attr,
	&defrag_attr.attr,
	&use_zero_page_attr.attr,
	&hpage_pmd_size_attr.attr,
#ifdef CONFIG_SHMEM
	&shmem_enabled_attr.attr,
#endif
	NULL,
};

static const struct attribute_group hugepage_attr_group = {
	.attrs = hugepage_attr,
};
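
/*
 * The attributes above are exposed under /sys/kernel/mm/transparent_hugepage/
 * once hugepage_init_sysfs() (below) attaches hugepage_attr_group to mm_kobj.
 */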
static int __init hugepage_init_sysfs(struct kobject **hugepage_kobj)
{
	int err;

	*hugepage_kobj = kobject_create_and_add("transparent_hugepage", mm_kobj);
	if (unlikely(!*hugepage_kobj)) {
		pr_err("failed to create transparent hugepage kobject\n");
		return -ENOMEM;
	}

	err = sysfs_create_group(*hugepage_kobj, &hugepage_attr_group);
	if (err) {
		pr_err("failed to register transparent hugepage group\n");
		goto delete_obj;
	}

	err = sysfs_create_group(*hugepage_kobj, &khugepaged_attr_group);
	if (err) {
		pr_err("failed to register transparent hugepage group\n");
		goto remove_hp_group;
	}
  
  	return 0;
  
  remove_hp_group:
  	sysfs_remove_group(*hugepage_kobj, &hugepage_attr_group);
  delete_obj:
  	kobject_put(*hugepage_kobj);
  	return err;
  }
  
  static void __init hugepage_exit_sysfs(struct kobject *hugepage_kobj)
  {
  	sysfs_remove_group(hugepage_kobj, &khugepaged_attr_group);
  	sysfs_remove_group(hugepage_kobj, &hugepage_attr_group);
  	kobject_put(hugepage_kobj);
  }
  #else
  static inline int hugepage_init_sysfs(struct kobject **hugepage_kobj)
  {
  	return 0;
  }
  
  static inline void hugepage_exit_sysfs(struct kobject *hugepage_kobj)
  {
  }
  #endif /* CONFIG_SYSFS */
  
  static int __init hugepage_init(void)
  {
  	int err;
  	struct kobject *hugepage_kobj;
  
  	if (!has_transparent_hugepage()) {
  		transparent_hugepage_flags = 0;
  		return -EINVAL;
  	}
  	/*
  	 * hugepages can't be allocated by the buddy allocator
  	 */
  	MAYBE_BUILD_BUG_ON(HPAGE_PMD_ORDER >= MAX_ORDER);
  	/*
  	 * we use page->mapping and page->index in second tail page
  	 * as list_head: assuming THP order >= 2
  	 */
  	MAYBE_BUILD_BUG_ON(HPAGE_PMD_ORDER < 2);

	err = hugepage_init_sysfs(&hugepage_kobj);
	if (err)
		goto err_sysfs;

	err = khugepaged_init();
	if (err)
		goto err_slab;

	err = register_shrinker(&huge_zero_page_shrinker);
	if (err)
		goto err_hzp_shrinker;
	err = register_shrinker(&deferred_split_shrinker);
	if (err)
		goto err_split_shrinker;

	/*
	 * By default disable transparent hugepages on smaller systems,
	 * where the extra memory used could hurt more than TLB overhead
	 * is likely to save.  The admin can still enable it through /sys.
	 */
	if (totalram_pages() < (512 << (20 - PAGE_SHIFT))) {
		transparent_hugepage_flags = 0;
		return 0;
	}

	err = start_stop_khugepaged();
	if (err)
		goto err_khugepaged;

	return 0;
err_khugepaged:
	unregister_shrinker(&deferred_split_shrinker);
err_split_shrinker:
	unregister_shrinker(&huge_zero_page_shrinker);
err_hzp_shrinker:
	khugepaged_destroy();
err_slab:
	hugepage_exit_sysfs(hugepage_kobj);
err_sysfs:
	return err;
}
subsys_initcall(hugepage_init);
  
  static int __init setup_transparent_hugepage(char *str)
  {
  	int ret = 0;
  	if (!str)
  		goto out;
  	if (!strcmp(str, "always")) {
  		set_bit(TRANSPARENT_HUGEPAGE_FLAG,
  			&transparent_hugepage_flags);
  		clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
  			  &transparent_hugepage_flags);
  		ret = 1;
  	} else if (!strcmp(str, "madvise")) {
  		clear_bit(TRANSPARENT_HUGEPAGE_FLAG,
  			  &transparent_hugepage_flags);
  		set_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
  			&transparent_hugepage_flags);
  		ret = 1;
  	} else if (!strcmp(str, "never")) {
  		clear_bit(TRANSPARENT_HUGEPAGE_FLAG,
  			  &transparent_hugepage_flags);
  		clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
  			  &transparent_hugepage_flags);
  		ret = 1;
  	}
  out:
  	if (!ret)
		pr_warn("transparent_hugepage= cannot parse, ignored\n");
	return ret;
}
__setup("transparent_hugepage=", setup_transparent_hugepage);
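
/*
 * The __setup handler above makes "transparent_hugepage=always|madvise|never"
 * on the kernel command line the boot-time equivalent of writing to the
 * sysfs "enabled" file.
 */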

pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma)
{
	if (likely(vma->vm_flags & VM_WRITE))
		pmd = pmd_mkwrite(pmd);
	return pmd;
}

#ifdef CONFIG_MEMCG
static inline struct deferred_split *get_deferred_split_queue(struct page *page)
{
	struct mem_cgroup *memcg = compound_head(page)->mem_cgroup;
	struct pglist_data *pgdat = NODE_DATA(page_to_nid(page));

	if (memcg)
		return &memcg->deferred_split_queue;
	else
		return &pgdat->deferred_split_queue;
}
#else
static inline struct deferred_split *get_deferred_split_queue(struct page *page)
{
	struct pglist_data *pgdat = NODE_DATA(page_to_nid(page));

	return &pgdat->deferred_split_queue;
}
#endif

void prep_transhuge_page(struct page *page)
{
	/*
	 * we use page->mapping and page->index in second tail page
	 * as list_head: assuming THP order >= 2
	 */

	INIT_LIST_HEAD(page_deferred_list(page));
	set_compound_page_dtor(page, TRANSHUGE_PAGE_DTOR);
}

bool is_transparent_hugepage(struct page *page)
{
	if (!PageCompound(page))
		return false;

	page = compound_head(page);
	return is_huge_zero_page(page) ||
	       page[1].compound_dtor == TRANSHUGE_PAGE_DTOR;
}
EXPORT_SYMBOL_GPL(is_transparent_hugepage);

static unsigned long __thp_get_unmapped_area(struct file *filp,
		unsigned long addr, unsigned long len,
		loff_t off, unsigned long flags, unsigned long size)
{
	loff_t off_end = off + len;
	loff_t off_align = round_up(off, size);
	unsigned long len_pad, ret;

	if (off_end <= off_align || (off_end - off_align) < size)
		return 0;

	len_pad = len + size;
	if (len_pad < len || (off + len_pad) < off)
		return 0;

	ret = current->mm->get_unmapped_area(filp, addr, len_pad,
					      off >> PAGE_SHIFT, flags);

	/*
	 * The failure might be due to length padding. The caller will retry
	 * without the padding.
	 */
	if (IS_ERR_VALUE(ret))
		return 0;

	/*
	 * Do not try to align to THP boundary if allocation at the address
	 * hint succeeds.
	 */
	if (ret == addr)
		return addr;

	ret += (off - ret) & (size - 1);
	return ret;
  }
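
/*
 * thp_get_unmapped_area(): for DAX-backed mappings, try to return an address
 * that is PMD-size aligned relative to the file offset, so that faults can
 * install PMD entries; otherwise fall back to the regular get_unmapped_area().
 */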
  
  unsigned long thp_get_unmapped_area(struct file *filp, unsigned long addr,
  		unsigned long len, unsigned long pgoff, unsigned long flags)
  {
	unsigned long ret;
	loff_t off = (loff_t)pgoff << PAGE_SHIFT;

	if (!IS_DAX(filp->f_mapping->host) || !IS_ENABLED(CONFIG_FS_DAX_PMD))
		goto out;

	ret = __thp_get_unmapped_area(filp, addr, len, off, flags, PMD_SIZE);
	if (ret)
		return ret;
out:
	return current->mm->get_unmapped_area(filp, addr, len, pgoff, flags);
}
EXPORT_SYMBOL_GPL(thp_get_unmapped_area);

static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf,
			struct page *page, gfp_t gfp)
{
	struct vm_area_struct *vma = vmf->vma;
	pgtable_t pgtable;
	unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
	vm_fault_t ret = 0;

	VM_BUG_ON_PAGE(!PageCompound(page), page);

	if (mem_cgroup_charge(page, vma->vm_mm, gfp)) {
		put_page(page);
		count_vm_event(THP_FAULT_FALLBACK);
		count_vm_event(THP_FAULT_FALLBACK_CHARGE);
		return VM_FAULT_FALLBACK;
	}
	cgroup_throttle_swaprate(page, gfp);

	pgtable = pte_alloc_one(vma->vm_mm);
	if (unlikely(!pgtable)) {
		ret = VM_FAULT_OOM;
		goto release;
	}

	clear_huge_page(page, vmf->address, HPAGE_PMD_NR);
	/*
	 * The memory barrier inside __SetPageUptodate makes sure that
	 * clear_huge_page writes become visible before the set_pmd_at()
	 * write.
	 */
	__SetPageUptodate(page);

	vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
	if (unlikely(!pmd_none(*vmf->pmd))) {
		goto unlock_release;
	} else {
		pmd_t entry;

		ret = check_stable_address_space(vma->vm_mm);
		if (ret)
			goto unlock_release;

		/* Deliver the page fault to userland */
		if (userfaultfd_missing(vma)) {
			vm_fault_t ret2;

			spin_unlock(vmf->ptl);
			put_page(page);
			pte_free(vma->vm_mm, pgtable);
			ret2 = handle_userfault(vmf, VM_UFFD_MISSING);
			VM_BUG_ON(ret2 & VM_FAULT_FALLBACK);
			return ret2;
		}

		entry = mk_huge_pmd(page, vma->vm_page_prot);
		entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
		page_add_new_anon_rmap(page, vma, haddr, true);
		lru_cache_add_inactive_or_unevictable(page, vma);
		pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable);
		set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry);
		add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR);
		mm_inc_nr_ptes(vma->vm_mm);
		spin_unlock(vmf->ptl);
		count_vm_event(THP_FAULT_ALLOC);
		count_memcg_event_mm(vma->vm_mm, THP_FAULT_ALLOC);
	}

	return 0;
unlock_release:
	spin_unlock(vmf->ptl);
release:
	if (pgtable)
		pte_free(vma->vm_mm, pgtable);
	put_page(page);
	return ret;
}

/*
 * always: directly stall for all thp allocations
 * defer: wake kswapd and fail if not immediately available
 * defer+madvise: wake kswapd and directly stall for MADV_HUGEPAGE, otherwise
 *		  fail if not immediately available
 * madvise: directly stall for MADV_HUGEPAGE, otherwise fail if not immediately
 *	    available
 * never: never stall for any thp allocation
 */
static inline gfp_t alloc_hugepage_direct_gfpmask(struct vm_area_struct *vma)
{
	const bool vma_madvised = !!(vma->vm_flags & VM_HUGEPAGE);

	/* Always do synchronous compaction */
	if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags))
		return GFP_TRANSHUGE | (vma_madvised ? 0 : __GFP_NORETRY);

	/* Kick kcompactd and fail quickly */
	if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags))
		return GFP_TRANSHUGE_LIGHT | __GFP_KSWAPD_RECLAIM;

	/* Synchronous compaction if madvised, otherwise kick kcompactd */
	if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags))
		return GFP_TRANSHUGE_LIGHT |
			(vma_madvised ? __GFP_DIRECT_RECLAIM :
					__GFP_KSWAPD_RECLAIM);

	/* Only do synchronous compaction if madvised */
	if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags))
		return GFP_TRANSHUGE_LIGHT |
		       (vma_madvised ? __GFP_DIRECT_RECLAIM : 0);

	return GFP_TRANSHUGE_LIGHT;
}

/* Caller must hold page table lock. */
static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm,
		struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd,
		struct page *zero_page)
{
	pmd_t entry;
	if (!pmd_none(*pmd))
		return false;
	entry = mk_pmd(zero_page, vma->vm_page_prot);
	entry = pmd_mkhuge(entry);
	if (pgtable)
		pgtable_trans_huge_deposit(mm, pmd, pgtable);
	set_pmd_at(mm, haddr, pmd, entry);
	mm_inc_nr_ptes(mm);
	return true;
}
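
/*
 * Anonymous PMD fault path: a read fault may install the shared huge zero
 * page read-only (when use_zero_page is enabled); otherwise a huge page is
 * allocated and charged, and any failure returns VM_FAULT_FALLBACK so the
 * fault is retried with base pages.
 */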
vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf)
{
	struct vm_area_struct *vma = vmf->vma;
	gfp_t gfp;
	struct page *page;
	unsigned long haddr = vmf->address & HPAGE_PMD_MASK;

	if (!transhuge_vma_suitable(vma, haddr))
		return VM_FAULT_FALLBACK;
	if (unlikely(anon_vma_prepare(vma)))
		return VM_FAULT_OOM;
	if (unlikely(khugepaged_enter(vma, vma->vm_flags)))
		return VM_FAULT_OOM;
	if (!(vmf->flags & FAULT_FLAG_WRITE) &&
			!mm_forbids_zeropage(vma->vm_mm) &&
			transparent_hugepage_use_zero_page()) {
		pgtable_t pgtable;
		struct page *zero_page;
		vm_fault_t ret;
		pgtable = pte_alloc_one(vma->vm_mm);
		if (unlikely(!pgtable))
			return VM_FAULT_OOM;
		zero_page = mm_get_huge_zero_page(vma->vm_mm);
		if (unlikely(!zero_page)) {
			pte_free(vma->vm_mm, pgtable);
			count_vm_event(THP_FAULT_FALLBACK);
			return VM_FAULT_FALLBACK;
		}
		vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
		ret = 0;
		if (pmd_none(*vmf->pmd)) {
			ret = check_stable_address_space(vma->vm_mm);
			if (ret) {
				spin_unlock(vmf->ptl);
				pte_free(vma->vm_mm, pgtable);
			} else if (userfaultfd_missing(vma)) {
				spin_unlock(vmf->ptl);
				pte_free(vma->vm_mm, pgtable);
				ret = handle_userfault(vmf, VM_UFFD_MISSING);
				VM_BUG_ON(ret & VM_FAULT_FALLBACK);
			} else {
				set_huge_zero_page(pgtable, vma->vm_mm, vma,
						   haddr, vmf->pmd, zero_page);
				spin_unlock(vmf->ptl);
			}
		} else {
			spin_unlock(vmf->ptl);
			pte_free(vma->vm_mm, pgtable);
		}
		return ret;
	}
	gfp = alloc_hugepage_direct_gfpmask(vma);
	page = alloc_hugepage_vma(gfp, vma, haddr, HPAGE_PMD_ORDER);
	if (unlikely(!page)) {
		count_vm_event(THP_FAULT_FALLBACK);
		return VM_FAULT_FALLBACK;
	}
	prep_transhuge_page(page);
	return __do_huge_pmd_anonymous_page(vmf, page, gfp);
}

static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
		pmd_t *pmd, pfn_t pfn, pgprot_t prot, bool write,
		pgtable_t pgtable)
{
	struct mm_struct *mm = vma->vm_mm;
	pmd_t entry;
	spinlock_t *ptl;

	ptl = pmd_lock(mm, pmd);
	if (!pmd_none(*pmd)) {
		if (write) {
			if (pmd_pfn(*pmd) != pfn_t_to_pfn(pfn)) {
				WARN_ON_ONCE(!is_huge_zero_pmd(*pmd));
				goto out_unlock;
			}
			entry = pmd_mkyoung(*pmd);
			entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
			if (pmdp_set_access_flags(vma, addr, pmd, entry, 1))
				update_mmu_cache_pmd(vma, addr, pmd);
		}

		goto out_unlock;
	}

	entry = pmd_mkhuge(pfn_t_pmd(pfn, prot));
	if (pfn_t_devmap(pfn))
		entry = pmd_mkdevmap(entry);
	if (write) {
		entry = pmd_mkyoung(pmd_mkdirty(entry));
		entry = maybe_pmd_mkwrite(entry, vma);
	}

	if (pgtable) {
		pgtable_trans_huge_deposit(mm, pmd, pgtable);
		mm_inc_nr_ptes(mm);
		pgtable = NULL;
	}

	set_pmd_at(mm, addr, pmd, entry);
	update_mmu_cache_pmd(vma, addr, pmd);

out_unlock:
	spin_unlock(ptl);
	if (pgtable)
		pte_free(mm, pgtable);
}
  /**
   * vmf_insert_pfn_pmd_prot - insert a pmd size pfn
   * @vmf: Structure describing the fault
   * @pfn: pfn to insert
   * @pgprot: page protection to use
   * @write: whether it's a write fault
   *
   * Insert a pmd size pfn. See vmf_insert_pfn() for additional info and
   * also consult the vmf_insert_mixed_prot() documentation when
   * @pgprot != @vmf->vma->vm_page_prot.
   *
   * Return: vm_fault_t value.
   */
  vm_fault_t vmf_insert_pfn_pmd_prot(struct vm_fault *vmf, pfn_t pfn,
  				   pgprot_t pgprot, bool write)
{
	unsigned long addr = vmf->address & PMD_MASK;
	struct vm_area_struct *vma = vmf->vma;
	pgtable_t pgtable = NULL;

	/*
	 * If we had pmd_special, we could avoid all these restrictions,
	 * but we need to be consistent with PTEs and architectures that
	 * can't support a 'special' bit.
	 */
	BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) &&
			!pfn_t_devmap(pfn));
	BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) ==
						(VM_PFNMAP|VM_MIXEDMAP));
	BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags));

	if (addr < vma->vm_start || addr >= vma->vm_end)
		return VM_FAULT_SIGBUS;

	if (arch_needs_pgtable_deposit()) {
		pgtable = pte_alloc_one(vma->vm_mm);
		if (!pgtable)
			return VM_FAULT_OOM;
	}

	track_pfn_insert(vma, &pgprot, pfn);
	insert_pfn_pmd(vma, addr, vmf->pmd, pfn, pgprot, write, pgtable);
	return VM_FAULT_NOPAGE;
}
EXPORT_SYMBOL_GPL(vmf_insert_pfn_pmd_prot);

#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
static pud_t maybe_pud_mkwrite(pud_t pud, struct vm_area_struct *vma)
{
	if (likely(vma->vm_flags & VM_WRITE))
  		pud = pud_mkwrite(pud);
  	return pud;
  }
  
  static void insert_pfn_pud(struct vm_area_struct *vma, unsigned long addr,
  		pud_t *pud, pfn_t pfn, pgprot_t prot, bool write)
  {
  	struct mm_struct *mm = vma->vm_mm;
  	pud_t entry;
  	spinlock_t *ptl;
  
  	ptl = pud_lock(mm, pud);
  	if (!pud_none(*pud)) {
  		if (write) {
  			if (pud_pfn(*pud) != pfn_t_to_pfn(pfn)) {
  				WARN_ON_ONCE(!is_huge_zero_pud(*pud));
  				goto out_unlock;
  			}
  			entry = pud_mkyoung(*pud);
  			entry = maybe_pud_mkwrite(pud_mkdirty(entry), vma);
  			if (pudp_set_access_flags(vma, addr, pud, entry, 1))
  				update_mmu_cache_pud(vma, addr, pud);
  		}
  		goto out_unlock;
  	}

	entry = pud_mkhuge(pfn_t_pud(pfn, prot));
	if (pfn_t_devmap(pfn))
		entry = pud_mkdevmap(entry);
	if (write) {
		entry = pud_mkyoung(pud_mkdirty(entry));
		entry = maybe_pud_mkwrite(entry, vma);
	}
	set_pud_at(mm, addr, pud, entry);
	update_mmu_cache_pud(vma, addr, pud);

out_unlock:
	spin_unlock(ptl);
}
  /**
   * vmf_insert_pfn_pud_prot - insert a pud size pfn
   * @vmf: Structure describing the fault
   * @pfn: pfn to insert
   * @pgprot: page protection to use
   * @write: whether it's a write fault
   *
   * Insert a pud size pfn. See vmf_insert_pfn() for additional info and
   * also consult the vmf_insert_mixed_prot() documentation when
   * @pgprot != @vmf->vma->vm_page_prot.
   *
   * Return: vm_fault_t value.
   */
  vm_fault_t vmf_insert_pfn_pud_prot(struct vm_fault *vmf, pfn_t pfn,
  				   pgprot_t pgprot, bool write)
{
	unsigned long addr = vmf->address & PUD_MASK;
	struct vm_area_struct *vma = vmf->vma;

	/*
	 * If we had pud_special, we could avoid all these restrictions,
	 * but we need to be consistent with PTEs and architectures that
	 * can't support a 'special' bit.
	 */
	BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) &&
			!pfn_t_devmap(pfn));
	BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) ==
						(VM_PFNMAP|VM_MIXEDMAP));
	BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags));

	if (addr < vma->vm_start || addr >= vma->vm_end)
		return VM_FAULT_SIGBUS;

	track_pfn_insert(vma, &pgprot, pfn);
	insert_pfn_pud(vma, addr, vmf->pud, pfn, pgprot, write);
	return VM_FAULT_NOPAGE;
}
EXPORT_SYMBOL_GPL(vmf_insert_pfn_pud_prot);
#endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */

static void touch_pmd(struct vm_area_struct *vma, unsigned long addr,
		pmd_t *pmd, int flags)
{
	pmd_t _pmd;

	_pmd = pmd_mkyoung(*pmd);
	if (flags & FOLL_WRITE)
		_pmd = pmd_mkdirty(_pmd);
	if (pmdp_set_access_flags(vma, addr & HPAGE_PMD_MASK,
				pmd, _pmd, flags & FOLL_WRITE))
  		update_mmu_cache_pmd(vma, addr, pmd);
  }
  
  struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr,
		pmd_t *pmd, int flags, struct dev_pagemap **pgmap)
{
	unsigned long pfn = pmd_pfn(*pmd);
	struct mm_struct *mm = vma->vm_mm;
	struct page *page;

	assert_spin_locked(pmd_lockptr(mm, pmd));

	/*
	 * When we COW a devmap PMD entry, we split it into PTEs, so we should
	 * not be in this function with `flags & FOLL_COW` set.
	 */
	WARN_ONCE(flags & FOLL_COW, "mm: In follow_devmap_pmd with FOLL_COW set");

	/* FOLL_GET and FOLL_PIN are mutually exclusive. */
	if (WARN_ON_ONCE((flags & (FOLL_PIN | FOLL_GET)) ==
			 (FOLL_PIN | FOLL_GET)))
		return NULL;
	if (flags & FOLL_WRITE && !pmd_write(*pmd))
		return NULL;

	if (pmd_present(*pmd) && pmd_devmap(*pmd))
		/* pass */;
	else
		return NULL;

	if (flags & FOLL_TOUCH)
		touch_pmd(vma, addr, pmd, flags);

	/*
	 * device mapped pages can only be returned if the
	 * caller will manage the page reference count.
	 */
	if (!(flags & (FOLL_GET | FOLL_PIN)))
		return ERR_PTR(-EEXIST);

	pfn += (addr & ~PMD_MASK) >> PAGE_SHIFT;
	*pgmap = get_dev_pagemap(pfn, *pgmap);
	if (!*pgmap)
		return ERR_PTR(-EFAULT);
	page = pfn_to_page(pfn);
	if (!try_grab_page(page, flags))
		page = ERR_PTR(-ENOMEM);

	return page;
}
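
/*
 * Fork-time copy of a single huge PMD. Only anonymous mappings are copied
 * here; file-backed huge mappings are skipped and simply re-faulted in the
 * child (see the "Skip if can be re-fill on fault" check below).
 */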
71e3aac07   Andrea Arcangeli   thp: transparent ...
959
960
961
962
  int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
  		  pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr,
  		  struct vm_area_struct *vma)
  {
c4088ebdc   Kirill A. Shutemov   mm: convert the r...
963
  	spinlock_t *dst_ptl, *src_ptl;
71e3aac07   Andrea Arcangeli   thp: transparent ...
964
965
  	struct page *src_page;
  	pmd_t pmd;
12c9d70bd   Matthew Wilcox   mm: fix memory le...
966
  	pgtable_t pgtable = NULL;
628d47ce9   Kirill A. Shutemov   thp: skip file hu...
967
  	int ret = -ENOMEM;
71e3aac07   Andrea Arcangeli   thp: transparent ...
968

628d47ce9   Kirill A. Shutemov   thp: skip file hu...
969
970
971
  	/* Skip if it can be re-filled on fault */
  	if (!vma_is_anonymous(vma))
  		return 0;
4cf589249   Joel Fernandes (Google)   mm: treewide: rem...
972
  	pgtable = pte_alloc_one(dst_mm);
628d47ce9   Kirill A. Shutemov   thp: skip file hu...
973
974
  	if (unlikely(!pgtable))
  		goto out;
71e3aac07   Andrea Arcangeli   thp: transparent ...
975

c4088ebdc   Kirill A. Shutemov   mm: convert the r...
976
977
978
  	dst_ptl = pmd_lock(dst_mm, dst_pmd);
  	src_ptl = pmd_lockptr(src_mm, src_pmd);
  	spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
71e3aac07   Andrea Arcangeli   thp: transparent ...
979
980
981
  
  	ret = -EAGAIN;
  	pmd = *src_pmd;
84c3fc4e9   Zi Yan   mm: thp: check pm...
982

b569a1760   Peter Xu   userfaultfd: wp: ...
983
984
985
986
987
988
989
  	/*
  	 * Make sure the _PAGE_UFFD_WP bit is cleared if the new VMA
  	 * does not have VM_UFFD_WP set, which means that the uffd
  	 * fork event is not enabled.
  	 */
  	if (!(vma->vm_flags & VM_UFFD_WP))
  		pmd = pmd_clear_uffd_wp(pmd);
84c3fc4e9   Zi Yan   mm: thp: check pm...
990
991
992
993
994
995
996
997
  #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
  	if (unlikely(is_swap_pmd(pmd))) {
  		swp_entry_t entry = pmd_to_swp_entry(pmd);
  
  		VM_BUG_ON(!is_pmd_migration_entry(pmd));
  		if (is_write_migration_entry(entry)) {
  			make_migration_entry_read(&entry);
  			pmd = swp_entry_to_pmd(entry);
ab6e3d093   Naoya Horiguchi   mm: soft-dirty: k...
998
999
  			if (pmd_swp_soft_dirty(*src_pmd))
  				pmd = pmd_swp_mksoft_dirty(pmd);
84c3fc4e9   Zi Yan   mm: thp: check pm...
1000
1001
  			set_pmd_at(src_mm, addr, src_pmd, pmd);
  		}
dd8a67f9a   Zi Yan   mm/huge_memory.c:...
1002
  		add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR);
af5b0f6a0   Kirill A. Shutemov   mm: consolidate p...
1003
  		mm_inc_nr_ptes(dst_mm);
dd8a67f9a   Zi Yan   mm/huge_memory.c:...
1004
  		pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable);
84c3fc4e9   Zi Yan   mm: thp: check pm...
1005
1006
1007
1008
1009
  		set_pmd_at(dst_mm, addr, dst_pmd, pmd);
  		ret = 0;
  		goto out_unlock;
  	}
  #endif
628d47ce9   Kirill A. Shutemov   thp: skip file hu...
1010
  	if (unlikely(!pmd_trans_huge(pmd))) {
71e3aac07   Andrea Arcangeli   thp: transparent ...
1011
1012
1013
  		pte_free(dst_mm, pgtable);
  		goto out_unlock;
  	}
fc9fe822f   Kirill A. Shutemov   thp: copy_huge_pm...
1014
  	/*
c4088ebdc   Kirill A. Shutemov   mm: convert the r...
1015
  	 * When page table lock is held, the huge zero pmd should not be
fc9fe822f   Kirill A. Shutemov   thp: copy_huge_pm...
1016
1017
1018
1019
  	 * under splitting since we don't split the page itself, only the
  	 * pmd to a page table.
  	 */
  	if (is_huge_zero_pmd(pmd)) {
5918d10a4   Kirill A. Shutemov   thp: fix huge zer...
1020
  		struct page *zero_page;
97ae17497   Kirill A. Shutemov   thp: implement re...
1021
1022
1023
1024
1025
  		/*
  		 * get_huge_zero_page() will never allocate a new page here,
  		 * since we already have a zero page to copy. It just takes a
  		 * reference.
  		 */
6fcb52a56   Aaron Lu   thp: reduce usage...
1026
  		zero_page = mm_get_huge_zero_page(dst_mm);
6b251fc96   Andrea Arcangeli   userfaultfd: call...
1027
  		set_huge_zero_page(pgtable, dst_mm, vma, addr, dst_pmd,
5918d10a4   Kirill A. Shutemov   thp: fix huge zer...
1028
  				zero_page);
fc9fe822f   Kirill A. Shutemov   thp: copy_huge_pm...
1029
1030
1031
  		ret = 0;
  		goto out_unlock;
  	}
de466bd62   Mel Gorman   mm: numa: avoid u...
1032

628d47ce9   Kirill A. Shutemov   thp: skip file hu...
1033
1034
  	src_page = pmd_page(pmd);
  	VM_BUG_ON_PAGE(!PageHead(src_page), src_page);
d042035ea   Peter Xu   mm/thp: Split hug...
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
  
  	/*
  	 * If this page is a potentially pinned page, split and retry the fault
  	 * with a smaller page size.  Normally this should not happen because
  	 * userspace should use MADV_DONTFORK upon pinned regions.  This is a
  	 * best-effort attempt to ensure the pinned pages won't be replaced by
  	 * another random page during the coming copy-on-write.
  	 */
  	if (unlikely(is_cow_mapping(vma->vm_flags) &&
  		     atomic_read(&src_mm->has_pinned) &&
  		     page_maybe_dma_pinned(src_page))) {
  		pte_free(dst_mm, pgtable);
  		spin_unlock(src_ptl);
  		spin_unlock(dst_ptl);
  		__split_huge_pmd(vma, src_pmd, addr, false, NULL);
  		return -EAGAIN;
  	}
628d47ce9   Kirill A. Shutemov   thp: skip file hu...
1052
1053
1054
  	get_page(src_page);
  	page_dup_rmap(src_page, true);
  	add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR);
c4812909f   Kirill A. Shutemov   mm: introduce wra...
1055
  	mm_inc_nr_ptes(dst_mm);
628d47ce9   Kirill A. Shutemov   thp: skip file hu...
1056
  	pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable);
71e3aac07   Andrea Arcangeli   thp: transparent ...
1057
1058
1059
1060
  
  	pmdp_set_wrprotect(src_mm, addr, src_pmd);
  	pmd = pmd_mkold(pmd_wrprotect(pmd));
  	set_pmd_at(dst_mm, addr, dst_pmd, pmd);
71e3aac07   Andrea Arcangeli   thp: transparent ...
1061
1062
1063
  
  	ret = 0;
  out_unlock:
c4088ebdc   Kirill A. Shutemov   mm: convert the r...
1064
1065
  	spin_unlock(src_ptl);
  	spin_unlock(dst_ptl);
71e3aac07   Andrea Arcangeli   thp: transparent ...
1066
1067
1068
  out:
  	return ret;
  }
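  /*
   * Illustrative userspace sketch, not part of the original file, of the
   * MADV_DONTFORK convention referenced in the pinned-page comment above:
   * a buffer registered for DMA/RDMA (and therefore pinned) is marked so
   * that fork() does not make it copy-on-write in the child.  "len" and
   * register_dma_buffer() are assumptions made for the example only.
   *
   *	#include <sys/mman.h>
   *	#include <unistd.h>
   *
   *	void *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
   *			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
   *	register_dma_buffer(buf, len);		// hypothetical helper that pins buf
   *	madvise(buf, len, MADV_DONTFORK);	// the child will not map this range
   *	if (fork() == 0)
   *		_exit(0);			// child runs without the pinned buffer
   */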
a00cc7d9d   Matthew Wilcox   mm, x86: add supp...
1069
1070
  #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
  static void touch_pud(struct vm_area_struct *vma, unsigned long addr,
a8f973664   Kirill A. Shutemov   mm, thp: Do not m...
1071
  		pud_t *pud, int flags)
a00cc7d9d   Matthew Wilcox   mm, x86: add supp...
1072
1073
  {
  	pud_t _pud;
a8f973664   Kirill A. Shutemov   mm, thp: Do not m...
1074
1075
1076
  	_pud = pud_mkyoung(*pud);
  	if (flags & FOLL_WRITE)
  		_pud = pud_mkdirty(_pud);
a00cc7d9d   Matthew Wilcox   mm, x86: add supp...
1077
  	if (pudp_set_access_flags(vma, addr & HPAGE_PUD_MASK,
a8f973664   Kirill A. Shutemov   mm, thp: Do not m...
1078
  				pud, _pud, flags & FOLL_WRITE))
a00cc7d9d   Matthew Wilcox   mm, x86: add supp...
1079
1080
1081
1082
  		update_mmu_cache_pud(vma, addr, pud);
  }
  
  struct page *follow_devmap_pud(struct vm_area_struct *vma, unsigned long addr,
df06b37ff   Keith Busch   mm/gup: cache dev...
1083
  		pud_t *pud, int flags, struct dev_pagemap **pgmap)
a00cc7d9d   Matthew Wilcox   mm, x86: add supp...
1084
1085
1086
  {
  	unsigned long pfn = pud_pfn(*pud);
  	struct mm_struct *mm = vma->vm_mm;
a00cc7d9d   Matthew Wilcox   mm, x86: add supp...
1087
1088
1089
  	struct page *page;
  
  	assert_spin_locked(pud_lockptr(mm, pud));
f6f373216   Linus Torvalds   Revert "mm: repla...
1090
  	if (flags & FOLL_WRITE && !pud_write(*pud))
a00cc7d9d   Matthew Wilcox   mm, x86: add supp...
1091
  		return NULL;
3faa52c03   John Hubbard   mm/gup: track FOL...
1092
1093
1094
1095
  	/* FOLL_GET and FOLL_PIN are mutually exclusive. */
  	if (WARN_ON_ONCE((flags & (FOLL_PIN | FOLL_GET)) ==
  			 (FOLL_PIN | FOLL_GET)))
  		return NULL;
a00cc7d9d   Matthew Wilcox   mm, x86: add supp...
1096
1097
1098
1099
1100
1101
  	if (pud_present(*pud) && pud_devmap(*pud))
  		/* pass */;
  	else
  		return NULL;
  
  	if (flags & FOLL_TOUCH)
a8f973664   Kirill A. Shutemov   mm, thp: Do not m...
1102
  		touch_pud(vma, addr, pud, flags);
a00cc7d9d   Matthew Wilcox   mm, x86: add supp...
1103
1104
1105
1106
  
  	/*
  	 * device mapped pages can only be returned if the
  	 * caller will manage the page reference count.
3faa52c03   John Hubbard   mm/gup: track FOL...
1107
1108
  	 *
  	 * At least one of FOLL_GET | FOLL_PIN must be set, so assert that here:
a00cc7d9d   Matthew Wilcox   mm, x86: add supp...
1109
  	 */
3faa52c03   John Hubbard   mm/gup: track FOL...
1110
  	if (!(flags & (FOLL_GET | FOLL_PIN)))
a00cc7d9d   Matthew Wilcox   mm, x86: add supp...
1111
1112
1113
  		return ERR_PTR(-EEXIST);
  
  	pfn += (addr & ~PUD_MASK) >> PAGE_SHIFT;
df06b37ff   Keith Busch   mm/gup: cache dev...
1114
1115
  	*pgmap = get_dev_pagemap(pfn, *pgmap);
  	if (!*pgmap)
a00cc7d9d   Matthew Wilcox   mm, x86: add supp...
1116
1117
  		return ERR_PTR(-EFAULT);
  	page = pfn_to_page(pfn);
3faa52c03   John Hubbard   mm/gup: track FOL...
1118
1119
  	if (!try_grab_page(page, flags))
  		page = ERR_PTR(-ENOMEM);
a00cc7d9d   Matthew Wilcox   mm, x86: add supp...
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
  
  	return page;
  }
  
  int copy_huge_pud(struct mm_struct *dst_mm, struct mm_struct *src_mm,
  		  pud_t *dst_pud, pud_t *src_pud, unsigned long addr,
  		  struct vm_area_struct *vma)
  {
  	spinlock_t *dst_ptl, *src_ptl;
  	pud_t pud;
  	int ret;
  
  	dst_ptl = pud_lock(dst_mm, dst_pud);
  	src_ptl = pud_lockptr(src_mm, src_pud);
  	spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
  
  	ret = -EAGAIN;
  	pud = *src_pud;
  	if (unlikely(!pud_trans_huge(pud) && !pud_devmap(pud)))
  		goto out_unlock;
  
  	/*
  	 * When page table lock is held, the huge zero pud should not be
  	 * under splitting since we don't split the page itself, only the
  	 * pud to a page table.
  	 */
  	if (is_huge_zero_pud(pud)) {
  		/* No huge zero pud yet */
  	}
d042035ea   Peter Xu   mm/thp: Split hug...
1149
1150
1151
1152
1153
1154
1155
1156
1157
  	/* Please refer to comments in copy_huge_pmd() */
  	if (unlikely(is_cow_mapping(vma->vm_flags) &&
  		     atomic_read(&src_mm->has_pinned) &&
  		     page_maybe_dma_pinned(pud_page(pud)))) {
  		spin_unlock(src_ptl);
  		spin_unlock(dst_ptl);
  		__split_huge_pud(vma, src_pud, addr);
  		return -EAGAIN;
  	}
a00cc7d9d   Matthew Wilcox   mm, x86: add supp...
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
  	pudp_set_wrprotect(src_mm, addr, src_pud);
  	pud = pud_mkold(pud_wrprotect(pud));
  	set_pud_at(dst_mm, addr, dst_pud, pud);
  
  	ret = 0;
  out_unlock:
  	spin_unlock(src_ptl);
  	spin_unlock(dst_ptl);
  	return ret;
  }
  
  void huge_pud_set_accessed(struct vm_fault *vmf, pud_t orig_pud)
  {
  	pud_t entry;
  	unsigned long haddr;
  	bool write = vmf->flags & FAULT_FLAG_WRITE;
  
  	vmf->ptl = pud_lock(vmf->vma->vm_mm, vmf->pud);
  	if (unlikely(!pud_same(*vmf->pud, orig_pud)))
  		goto unlock;
  
  	entry = pud_mkyoung(orig_pud);
  	if (write)
  		entry = pud_mkdirty(entry);
  	haddr = vmf->address & HPAGE_PUD_MASK;
  	if (pudp_set_access_flags(vmf->vma, haddr, vmf->pud, entry, write))
  		update_mmu_cache_pud(vmf->vma, vmf->address, vmf->pud);
  
  unlock:
  	spin_unlock(vmf->ptl);
  }
  #endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
82b0f8c39   Jan Kara   mm: join struct f...
1190
  void huge_pmd_set_accessed(struct vm_fault *vmf, pmd_t orig_pmd)
a1dd450bc   Will Deacon   mm: thp: set the ...
1191
1192
1193
  {
  	pmd_t entry;
  	unsigned long haddr;
20f664aab   Minchan Kim   mm: pmd dirty emu...
1194
  	bool write = vmf->flags & FAULT_FLAG_WRITE;
a1dd450bc   Will Deacon   mm: thp: set the ...
1195

82b0f8c39   Jan Kara   mm: join struct f...
1196
1197
  	vmf->ptl = pmd_lock(vmf->vma->vm_mm, vmf->pmd);
  	if (unlikely(!pmd_same(*vmf->pmd, orig_pmd)))
a1dd450bc   Will Deacon   mm: thp: set the ...
1198
1199
1200
  		goto unlock;
  
  	entry = pmd_mkyoung(orig_pmd);
20f664aab   Minchan Kim   mm: pmd dirty emu...
1201
1202
  	if (write)
  		entry = pmd_mkdirty(entry);
82b0f8c39   Jan Kara   mm: join struct f...
1203
  	haddr = vmf->address & HPAGE_PMD_MASK;
20f664aab   Minchan Kim   mm: pmd dirty emu...
1204
  	if (pmdp_set_access_flags(vmf->vma, haddr, vmf->pmd, entry, write))
82b0f8c39   Jan Kara   mm: join struct f...
1205
  		update_mmu_cache_pmd(vmf->vma, vmf->address, vmf->pmd);
a1dd450bc   Will Deacon   mm: thp: set the ...
1206
1207
  
  unlock:
82b0f8c39   Jan Kara   mm: join struct f...
1208
  	spin_unlock(vmf->ptl);
a1dd450bc   Will Deacon   mm: thp: set the ...
1209
  }
2b7403035   Souptick Joarder   mm: Change return...
1210
  vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf, pmd_t orig_pmd)
71e3aac07   Andrea Arcangeli   thp: transparent ...
1211
  {
82b0f8c39   Jan Kara   mm: join struct f...
1212
  	struct vm_area_struct *vma = vmf->vma;
3917c8028   Kirill A. Shutemov   thp: change CoW s...
1213
  	struct page *page;
82b0f8c39   Jan Kara   mm: join struct f...
1214
  	unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
71e3aac07   Andrea Arcangeli   thp: transparent ...
1215

82b0f8c39   Jan Kara   mm: join struct f...
1216
  	vmf->ptl = pmd_lockptr(vma->vm_mm, vmf->pmd);
81d1b09c6   Sasha Levin   mm: convert a few...
1217
  	VM_BUG_ON_VMA(!vma->anon_vma, vma);
3917c8028   Kirill A. Shutemov   thp: change CoW s...
1218

93b4796de   Kirill A. Shutemov   thp: do_huge_pmd_...
1219
  	if (is_huge_zero_pmd(orig_pmd))
3917c8028   Kirill A. Shutemov   thp: change CoW s...
1220
  		goto fallback;
82b0f8c39   Jan Kara   mm: join struct f...
1221
  	spin_lock(vmf->ptl);
3917c8028   Kirill A. Shutemov   thp: change CoW s...
1222
1223
1224
1225
1226
  
  	if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) {
  		spin_unlock(vmf->ptl);
  		return 0;
  	}
71e3aac07   Andrea Arcangeli   thp: transparent ...
1227
1228
  
  	page = pmd_page(orig_pmd);
309381fea   Sasha Levin   mm: dump page whe...
1229
  	VM_BUG_ON_PAGE(!PageCompound(page) || !PageHead(page), page);
3917c8028   Kirill A. Shutemov   thp: change CoW s...
1230
1231
  
  	/* Lock page for reuse_swap_page() */
ba3c4ce6d   Huang Ying   mm, THP, swap: ma...
1232
1233
1234
1235
1236
1237
  	if (!trylock_page(page)) {
  		get_page(page);
  		spin_unlock(vmf->ptl);
  		lock_page(page);
  		spin_lock(vmf->ptl);
  		if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) {
3917c8028   Kirill A. Shutemov   thp: change CoW s...
1238
  			spin_unlock(vmf->ptl);
ba3c4ce6d   Huang Ying   mm, THP, swap: ma...
1239
1240
  			unlock_page(page);
  			put_page(page);
3917c8028   Kirill A. Shutemov   thp: change CoW s...
1241
  			return 0;
ba3c4ce6d   Huang Ying   mm, THP, swap: ma...
1242
1243
1244
  		}
  		put_page(page);
  	}
3917c8028   Kirill A. Shutemov   thp: change CoW s...
1245
1246
1247
1248
1249
  
  	/*
  	 * We can only reuse the page if nobody else maps the huge page or its
  	 * part.
  	 */
ba3c4ce6d   Huang Ying   mm, THP, swap: ma...
1250
  	if (reuse_swap_page(page, NULL)) {
71e3aac07   Andrea Arcangeli   thp: transparent ...
1251
1252
  		pmd_t entry;
  		entry = pmd_mkyoung(orig_pmd);
f55e1014f   Linus Torvalds   Revert "mm, thp: ...
1253
  		entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
3917c8028   Kirill A. Shutemov   thp: change CoW s...
1254
  		if (pmdp_set_access_flags(vma, haddr, vmf->pmd, entry, 1))
82b0f8c39   Jan Kara   mm: join struct f...
1255
  			update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
ba3c4ce6d   Huang Ying   mm, THP, swap: ma...
1256
  		unlock_page(page);
82b0f8c39   Jan Kara   mm: join struct f...
1257
  		spin_unlock(vmf->ptl);
3917c8028   Kirill A. Shutemov   thp: change CoW s...
1258
  		return VM_FAULT_WRITE;
71e3aac07   Andrea Arcangeli   thp: transparent ...
1259
  	}
3917c8028   Kirill A. Shutemov   thp: change CoW s...
1260
1261
  
  	unlock_page(page);
82b0f8c39   Jan Kara   mm: join struct f...
1262
  	spin_unlock(vmf->ptl);
3917c8028   Kirill A. Shutemov   thp: change CoW s...
1263
1264
1265
  fallback:
  	__split_huge_pmd(vma, vmf->pmd, vmf->address, false, NULL);
  	return VM_FAULT_FALLBACK;
71e3aac07   Andrea Arcangeli   thp: transparent ...
1266
  }
8310d48b1   Keno Fischer   mm/huge_memory.c:...
1267
  /*
a308c71bf   Peter Xu   mm/gup: Remove en...
1268
1269
   * FOLL_FORCE can write to even unwritable PMDs, but only
   * after we've gone through a COW cycle and they are dirty.
8310d48b1   Keno Fischer   mm/huge_memory.c:...
1270
1271
1272
   */
  static inline bool can_follow_write_pmd(pmd_t pmd, unsigned int flags)
  {
a308c71bf   Peter Xu   mm/gup: Remove en...
1273
1274
  	return pmd_write(pmd) ||
  	       ((flags & FOLL_FORCE) && (flags & FOLL_COW) && pmd_dirty(pmd));
8310d48b1   Keno Fischer   mm/huge_memory.c:...
1275
  }
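  /*
   * Illustrative sketch, not part of the original file: FOLL_FORCE is the
   * path a debugger takes when it patches an otherwise read-only mapping
   * in a tracee, e.g. to plant a breakpoint; that write is what eventually
   * leaves the pmd dirty after a COW cycle as described above.  "pid",
   * "addr" and "word" are assumptions made for the example only.
   *
   *	#include <sys/ptrace.h>
   *	#include <sys/wait.h>
   *
   *	ptrace(PTRACE_ATTACH, pid, NULL, NULL);
   *	waitpid(pid, NULL, 0);
   *	ptrace(PTRACE_POKETEXT, pid, (void *)addr, (void *)word);
   *	ptrace(PTRACE_DETACH, pid, NULL, NULL);
   */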
b676b293f   David Rientjes   mm, thp: fix mapp...
1276
  struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
71e3aac07   Andrea Arcangeli   thp: transparent ...
1277
1278
1279
1280
  				   unsigned long addr,
  				   pmd_t *pmd,
  				   unsigned int flags)
  {
b676b293f   David Rientjes   mm, thp: fix mapp...
1281
  	struct mm_struct *mm = vma->vm_mm;
71e3aac07   Andrea Arcangeli   thp: transparent ...
1282
  	struct page *page = NULL;
c4088ebdc   Kirill A. Shutemov   mm: convert the r...
1283
  	assert_spin_locked(pmd_lockptr(mm, pmd));
71e3aac07   Andrea Arcangeli   thp: transparent ...
1284

8310d48b1   Keno Fischer   mm/huge_memory.c:...
1285
  	if (flags & FOLL_WRITE && !can_follow_write_pmd(*pmd, flags))
71e3aac07   Andrea Arcangeli   thp: transparent ...
1286
  		goto out;
85facf257   Kirill A. Shutemov   thp: avoid dumpin...
1287
1288
1289
  	/* Avoid dumping huge zero page */
  	if ((flags & FOLL_DUMP) && is_huge_zero_pmd(*pmd))
  		return ERR_PTR(-EFAULT);
2b4847e73   Mel Gorman   mm: numa: seriali...
1290
  	/* Full NUMA hinting faults to serialise migration in fault paths */
8a0516ed8   Mel Gorman   mm: convert p[te|...
1291
  	if ((flags & FOLL_NUMA) && pmd_protnone(*pmd))
2b4847e73   Mel Gorman   mm: numa: seriali...
1292
  		goto out;
71e3aac07   Andrea Arcangeli   thp: transparent ...
1293
  	page = pmd_page(*pmd);
ca120cf68   Dan Williams   mm: fix show_smap...
1294
  	VM_BUG_ON_PAGE(!PageHead(page) && !is_zone_device_page(page), page);
3faa52c03   John Hubbard   mm/gup: track FOL...
1295
1296
1297
  
  	if (!try_grab_page(page, flags))
  		return ERR_PTR(-ENOMEM);
3565fce3a   Dan Williams   mm, x86: get_user...
1298
  	if (flags & FOLL_TOUCH)
a8f973664   Kirill A. Shutemov   mm, thp: Do not m...
1299
  		touch_pmd(vma, addr, pmd, flags);
3faa52c03   John Hubbard   mm/gup: track FOL...
1300

de60f5f10   Eric B Munson   mm: introduce VM_...
1301
  	if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) {
e90309c9f   Kirill A. Shutemov   thp: allow mlocke...
1302
1303
1304
1305
  		/*
  		 * We don't mlock() pte-mapped THPs. This way we can avoid
  		 * leaking mlocked pages into non-VM_LOCKED VMAs.
  		 *
9a73f61bd   Kirill A. Shutemov   thp, mlock: do no...
1306
1307
  		 * For anon THP:
  		 *
e90309c9f   Kirill A. Shutemov   thp: allow mlocke...
1308
1309
1310
1311
1312
1313
1314
  		 * In most cases the pmd is the only mapping of the page as we
  		 * break COW for the mlock() -- see gup_flags |= FOLL_WRITE for
  		 * writable private mappings in populate_vma_page_range().
  		 *
  		 * The only scenario where we have the page shared here is when we
  		 * are mlocking a read-only mapping shared over fork(). We skip
  		 * mlocking such pages.
9a73f61bd   Kirill A. Shutemov   thp, mlock: do no...
1315
1316
1317
1318
1319
1320
  		 *
  		 * For file THP:
  		 *
  		 * We can expect PageDoubleMap() to be stable under page lock:
  		 * for file pages we set it in page_add_file_rmap(), which
  		 * requires the page to be locked.
e90309c9f   Kirill A. Shutemov   thp: allow mlocke...
1321
  		 */
9a73f61bd   Kirill A. Shutemov   thp, mlock: do no...
1322
1323
1324
1325
1326
1327
1328
  
  		if (PageAnon(page) && compound_mapcount(page) != 1)
  			goto skip_mlock;
  		if (PageDoubleMap(page) || !page->mapping)
  			goto skip_mlock;
  		if (!trylock_page(page))
  			goto skip_mlock;
9a73f61bd   Kirill A. Shutemov   thp, mlock: do no...
1329
1330
1331
  		if (page->mapping && !PageDoubleMap(page))
  			mlock_vma_page(page);
  		unlock_page(page);
b676b293f   David Rientjes   mm, thp: fix mapp...
1332
  	}
9a73f61bd   Kirill A. Shutemov   thp, mlock: do no...
1333
  skip_mlock:
71e3aac07   Andrea Arcangeli   thp: transparent ...
1334
  	page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT;
ca120cf68   Dan Williams   mm: fix show_smap...
1335
  	VM_BUG_ON_PAGE(!PageCompound(page) && !is_zone_device_page(page), page);
71e3aac07   Andrea Arcangeli   thp: transparent ...
1336
1337
1338
1339
  
  out:
  	return page;
  }
d10e63f29   Mel Gorman   mm: numa: Create ...
1340
  /* NUMA hinting page fault entry point for trans huge pmds */
2b7403035   Souptick Joarder   mm: Change return...
1341
  vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t pmd)
d10e63f29   Mel Gorman   mm: numa: Create ...
1342
  {
82b0f8c39   Jan Kara   mm: join struct f...
1343
  	struct vm_area_struct *vma = vmf->vma;
b8916634b   Mel Gorman   mm: Prevent paral...
1344
  	struct anon_vma *anon_vma = NULL;
b32967ff1   Mel Gorman   mm: numa: Add THP...
1345
  	struct page *page;
82b0f8c39   Jan Kara   mm: join struct f...
1346
  	unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
98fa15f34   Anshuman Khandual   mm: replace all o...
1347
  	int page_nid = NUMA_NO_NODE, this_nid = numa_node_id();
90572890d   Peter Zijlstra   mm: numa: Change ...
1348
  	int target_nid, last_cpupid = -1;
8191acbd3   Mel Gorman   mm: numa: Sanitiz...
1349
1350
  	bool page_locked;
  	bool migrated = false;
b191f9b10   Mel Gorman   mm: numa: preserv...
1351
  	bool was_writable;
6688cc054   Peter Zijlstra   mm: numa: Do not ...
1352
  	int flags = 0;
d10e63f29   Mel Gorman   mm: numa: Create ...
1353

82b0f8c39   Jan Kara   mm: join struct f...
1354
1355
  	vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
  	if (unlikely(!pmd_same(pmd, *vmf->pmd)))
d10e63f29   Mel Gorman   mm: numa: Create ...
1356
  		goto out_unlock;
de466bd62   Mel Gorman   mm: numa: avoid u...
1357
1358
1359
1360
1361
  	/*
  	 * If there are potential migrations, wait for completion and retry
  	 * without disrupting NUMA hinting information. Do not relock and
  	 * check_same as the page may no longer be mapped.
  	 */
82b0f8c39   Jan Kara   mm: join struct f...
1362
1363
  	if (unlikely(pmd_trans_migrating(*vmf->pmd))) {
  		page = pmd_page(*vmf->pmd);
3c226c637   Mark Rutland   mm: numa: avoid w...
1364
1365
  		if (!get_page_unless_zero(page))
  			goto out_unlock;
82b0f8c39   Jan Kara   mm: join struct f...
1366
  		spin_unlock(vmf->ptl);
9a1ea439b   Hugh Dickins   mm: put_and_wait_...
1367
  		put_and_wait_on_page_locked(page);
de466bd62   Mel Gorman   mm: numa: avoid u...
1368
1369
  		goto out;
  	}
d10e63f29   Mel Gorman   mm: numa: Create ...
1370
  	page = pmd_page(pmd);
a1a46184e   Mel Gorman   mm: numa: Do not ...
1371
  	BUG_ON(is_huge_zero_page(page));
8191acbd3   Mel Gorman   mm: numa: Sanitiz...
1372
  	page_nid = page_to_nid(page);
90572890d   Peter Zijlstra   mm: numa: Change ...
1373
  	last_cpupid = page_cpupid_last(page);
03c5a6e16   Mel Gorman   mm: numa: Add pte...
1374
  	count_vm_numa_event(NUMA_HINT_FAULTS);
04bb2f947   Rik van Riel   sched/numa: Adjus...
1375
  	if (page_nid == this_nid) {
03c5a6e16   Mel Gorman   mm: numa: Add pte...
1376
  		count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL);
04bb2f947   Rik van Riel   sched/numa: Adjus...
1377
1378
  		flags |= TNF_FAULT_LOCAL;
  	}
4daae3b4b   Mel Gorman   mm: mempolicy: Us...
1379

bea66fbd1   Mel Gorman   mm: numa: group r...
1380
  	/* See similar comment in do_numa_page for explanation */
288bc5494   Aneesh Kumar K.V   mm/autonuma: let ...
1381
  	if (!pmd_savedwrite(pmd))
6688cc054   Peter Zijlstra   mm: numa: Do not ...
1382
1383
1384
  		flags |= TNF_NO_GROUP;
  
  	/*
ff9042b11   Mel Gorman   mm: Wait for THP ...
1385
1386
1387
  	 * Acquire the page lock to serialise THP migrations but avoid dropping
  	 * page_table_lock if at all possible
  	 */
b8916634b   Mel Gorman   mm: Prevent paral...
1388
1389
  	page_locked = trylock_page(page);
  	target_nid = mpol_misplaced(page, vma, haddr);
98fa15f34   Anshuman Khandual   mm: replace all o...
1390
  	if (target_nid == NUMA_NO_NODE) {
b8916634b   Mel Gorman   mm: Prevent paral...
1391
  		/* If the page was locked, there are no parallel migrations */
a54a407fb   Mel Gorman   mm: Close races b...
1392
  		if (page_locked)
b8916634b   Mel Gorman   mm: Prevent paral...
1393
  			goto clear_pmdnuma;
2b4847e73   Mel Gorman   mm: numa: seriali...
1394
  	}
4daae3b4b   Mel Gorman   mm: mempolicy: Us...
1395

de466bd62   Mel Gorman   mm: numa: avoid u...
1396
  	/* Migration could have started since the pmd_trans_migrating check */
2b4847e73   Mel Gorman   mm: numa: seriali...
1397
  	if (!page_locked) {
98fa15f34   Anshuman Khandual   mm: replace all o...
1398
  		page_nid = NUMA_NO_NODE;
3c226c637   Mark Rutland   mm: numa: avoid w...
1399
1400
  		if (!get_page_unless_zero(page))
  			goto out_unlock;
82b0f8c39   Jan Kara   mm: join struct f...
1401
  		spin_unlock(vmf->ptl);
9a1ea439b   Hugh Dickins   mm: put_and_wait_...
1402
  		put_and_wait_on_page_locked(page);
b8916634b   Mel Gorman   mm: Prevent paral...
1403
1404
  		goto out;
  	}
2b4847e73   Mel Gorman   mm: numa: seriali...
1405
1406
1407
1408
  	/*
  	 * Page is misplaced. Page lock serialises migrations. Acquire anon_vma
  	 * to serialise splits.
  	 */
b8916634b   Mel Gorman   mm: Prevent paral...
1409
  	get_page(page);
82b0f8c39   Jan Kara   mm: join struct f...
1410
  	spin_unlock(vmf->ptl);
b8916634b   Mel Gorman   mm: Prevent paral...
1411
  	anon_vma = page_lock_anon_vma_read(page);
4daae3b4b   Mel Gorman   mm: mempolicy: Us...
1412

c69307d53   Peter Zijlstra   sched/numa: Fix c...
1413
  	/* Confirm the PMD did not change while page_table_lock was released */
82b0f8c39   Jan Kara   mm: join struct f...
1414
1415
  	spin_lock(vmf->ptl);
  	if (unlikely(!pmd_same(pmd, *vmf->pmd))) {
b32967ff1   Mel Gorman   mm: numa: Add THP...
1416
1417
  		unlock_page(page);
  		put_page(page);
98fa15f34   Anshuman Khandual   mm: replace all o...
1418
  		page_nid = NUMA_NO_NODE;
4daae3b4b   Mel Gorman   mm: mempolicy: Us...
1419
  		goto out_unlock;
b32967ff1   Mel Gorman   mm: numa: Add THP...
1420
  	}
ff9042b11   Mel Gorman   mm: Wait for THP ...
1421

c3a489cac   Mel Gorman   mm: numa: ensure ...
1422
1423
1424
  	/* Bail if we fail to protect against THP splits for any reason */
  	if (unlikely(!anon_vma)) {
  		put_page(page);
98fa15f34   Anshuman Khandual   mm: replace all o...
1425
  		page_nid = NUMA_NO_NODE;
c3a489cac   Mel Gorman   mm: numa: ensure ...
1426
1427
  		goto clear_pmdnuma;
  	}
a54a407fb   Mel Gorman   mm: Close races b...
1428
  	/*
8b1b436dd   Peter Zijlstra   mm, locking: Rewo...
1429
1430
1431
1432
1433
  	 * Since we took the NUMA fault, we must have observed the !accessible
  	 * bit. Make sure all other CPUs agree with that, to avoid them
  	 * modifying the page we're about to migrate.
  	 *
  	 * Must be done under PTL such that we'll observe the relevant
ccde85ba0   Peter Zijlstra   mm, locking: Fix ...
1434
1435
1436
1437
  	 * inc_tlb_flush_pending().
  	 *
  	 * We are not sure a pending tlb flush here is for a huge page
  	 * mapping or not. Hence use the tlb range variant
8b1b436dd   Peter Zijlstra   mm, locking: Rewo...
1438
  	 */
7066f0f93   Andrea Arcangeli   mm: thp: fix mmu_...
1439
  	if (mm_tlb_flush_pending(vma->vm_mm)) {
ccde85ba0   Peter Zijlstra   mm, locking: Fix ...
1440
  		flush_tlb_range(vma, haddr, haddr + HPAGE_PMD_SIZE);
7066f0f93   Andrea Arcangeli   mm: thp: fix mmu_...
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
  		/*
  		 * change_huge_pmd() released the pmd lock before
  		 * invalidating the secondary MMUs sharing the primary
  		 * MMU pagetables (with ->invalidate_range()). The
  		 * mmu_notifier_invalidate_range_end() (which
  		 * internally calls ->invalidate_range()) in
  		 * change_pmd_range() will run after us, so we can't
  		 * rely on it here and we need an explicit invalidate.
  		 */
  		mmu_notifier_invalidate_range(vma->vm_mm, haddr,
  					      haddr + HPAGE_PMD_SIZE);
  	}
8b1b436dd   Peter Zijlstra   mm, locking: Rewo...
1453
1454
  
  	/*
a54a407fb   Mel Gorman   mm: Close races b...
1455
  	 * Migrate the THP to the requested node, returns with page unlocked
8a0516ed8   Mel Gorman   mm: convert p[te|...
1456
  	 * and access rights restored.
a54a407fb   Mel Gorman   mm: Close races b...
1457
  	 */
82b0f8c39   Jan Kara   mm: join struct f...
1458
  	spin_unlock(vmf->ptl);
8b1b436dd   Peter Zijlstra   mm, locking: Rewo...
1459

bae473a42   Kirill A. Shutemov   mm: introduce fau...
1460
  	migrated = migrate_misplaced_transhuge_page(vma->vm_mm, vma,
82b0f8c39   Jan Kara   mm: join struct f...
1461
  				vmf->pmd, pmd, vmf->address, page, target_nid);
6688cc054   Peter Zijlstra   mm: numa: Do not ...
1462
1463
  	if (migrated) {
  		flags |= TNF_MIGRATED;
8191acbd3   Mel Gorman   mm: numa: Sanitiz...
1464
  		page_nid = target_nid;
074c23817   Mel Gorman   mm: numa: slow PT...
1465
1466
  	} else
  		flags |= TNF_MIGRATE_FAIL;
b32967ff1   Mel Gorman   mm: numa: Add THP...
1467

8191acbd3   Mel Gorman   mm: numa: Sanitiz...
1468
  	goto out;
b32967ff1   Mel Gorman   mm: numa: Add THP...
1469
  clear_pmdnuma:
a54a407fb   Mel Gorman   mm: Close races b...
1470
  	BUG_ON(!PageLocked(page));
288bc5494   Aneesh Kumar K.V   mm/autonuma: let ...
1471
  	was_writable = pmd_savedwrite(pmd);
4d9424669   Mel Gorman   mm: convert p[te|...
1472
  	pmd = pmd_modify(pmd, vma->vm_page_prot);
b7b04004e   Mel Gorman   mm: numa: mark hu...
1473
  	pmd = pmd_mkyoung(pmd);
b191f9b10   Mel Gorman   mm: numa: preserv...
1474
1475
  	if (was_writable)
  		pmd = pmd_mkwrite(pmd);
82b0f8c39   Jan Kara   mm: join struct f...
1476
1477
  	set_pmd_at(vma->vm_mm, haddr, vmf->pmd, pmd);
  	update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
a54a407fb   Mel Gorman   mm: Close races b...
1478
  	unlock_page(page);
d10e63f29   Mel Gorman   mm: numa: Create ...
1479
  out_unlock:
82b0f8c39   Jan Kara   mm: join struct f...
1480
  	spin_unlock(vmf->ptl);
b8916634b   Mel Gorman   mm: Prevent paral...
1481
1482
1483
1484
  
  out:
  	if (anon_vma)
  		page_unlock_anon_vma_read(anon_vma);
98fa15f34   Anshuman Khandual   mm: replace all o...
1485
  	if (page_nid != NUMA_NO_NODE)
82b0f8c39   Jan Kara   mm: join struct f...
1486
  		task_numa_fault(last_cpupid, page_nid, HPAGE_PMD_NR,
9a8b300f2   Aneesh Kumar K.V   mm/thp/autonuma: ...
1487
  				flags);
8191acbd3   Mel Gorman   mm: numa: Sanitiz...
1488

d10e63f29   Mel Gorman   mm: numa: Create ...
1489
1490
  	return 0;
  }
319904ad4   Huang Ying   mm, THP: clean up...
1491
1492
1493
1494
1495
  /*
   * Return true if we did MADV_FREE successfully on the entire pmd page.
   * Otherwise, return false.
   */
  bool madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
b8d3c4c30   Minchan Kim   mm/huge_memory.c:...
1496
  		pmd_t *pmd, unsigned long addr, unsigned long next)
b8d3c4c30   Minchan Kim   mm/huge_memory.c:...
1497
1498
1499
1500
1501
  {
  	spinlock_t *ptl;
  	pmd_t orig_pmd;
  	struct page *page;
  	struct mm_struct *mm = tlb->mm;
319904ad4   Huang Ying   mm, THP: clean up...
1502
  	bool ret = false;
b8d3c4c30   Minchan Kim   mm/huge_memory.c:...
1503

ed6a79352   Peter Zijlstra   asm-generic/tlb, ...
1504
  	tlb_change_page_size(tlb, HPAGE_PMD_SIZE);
07e326610   Aneesh Kumar K.V   mm: add tlb_remov...
1505

b6ec57f4b   Kirill A. Shutemov   thp: change pmd_t...
1506
1507
  	ptl = pmd_trans_huge_lock(pmd, vma);
  	if (!ptl)
25eedabe0   Linus Torvalds   vm: fix incorrect...
1508
  		goto out_unlocked;
b8d3c4c30   Minchan Kim   mm/huge_memory.c:...
1509
1510
  
  	orig_pmd = *pmd;
319904ad4   Huang Ying   mm, THP: clean up...
1511
  	if (is_huge_zero_pmd(orig_pmd))
b8d3c4c30   Minchan Kim   mm/huge_memory.c:...
1512
  		goto out;
b8d3c4c30   Minchan Kim   mm/huge_memory.c:...
1513

84c3fc4e9   Zi Yan   mm: thp: check pm...
1514
1515
1516
1517
1518
  	if (unlikely(!pmd_present(orig_pmd))) {
  		VM_BUG_ON(thp_migration_supported() &&
  				  !is_pmd_migration_entry(orig_pmd));
  		goto out;
  	}
b8d3c4c30   Minchan Kim   mm/huge_memory.c:...
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
  	page = pmd_page(orig_pmd);
  	/*
  	 * If other processes are mapping this page, we can't discard
  	 * the page unless they all do MADV_FREE, so let's skip the page.
  	 */
  	if (page_mapcount(page) != 1)
  		goto out;
  
  	if (!trylock_page(page))
  		goto out;
  
  	/*
  	 * If the user wants to discard only part of the THP's pages, split it
  	 * so MADV_FREE will deactivate only those pages.
  	 */
  	if (next - addr != HPAGE_PMD_SIZE) {
  		get_page(page);
  		spin_unlock(ptl);
9818b8cde   Huang Ying   madvise_free, thp...
1537
  		split_huge_page(page);
b8d3c4c30   Minchan Kim   mm/huge_memory.c:...
1538
  		unlock_page(page);
bbf29ffc7   Kirill A. Shutemov   thp, mm: fix cras...
1539
  		put_page(page);
b8d3c4c30   Minchan Kim   mm/huge_memory.c:...
1540
1541
1542
1543
1544
1545
  		goto out_unlocked;
  	}
  
  	if (PageDirty(page))
  		ClearPageDirty(page);
  	unlock_page(page);
b8d3c4c30   Minchan Kim   mm/huge_memory.c:...
1546
  	if (pmd_young(orig_pmd) || pmd_dirty(orig_pmd)) {
58ceeb6be   Kirill A. Shutemov   thp: fix MADV_DON...
1547
  		pmdp_invalidate(vma, addr, pmd);
b8d3c4c30   Minchan Kim   mm/huge_memory.c:...
1548
1549
1550
1551
1552
1553
  		orig_pmd = pmd_mkold(orig_pmd);
  		orig_pmd = pmd_mkclean(orig_pmd);
  
  		set_pmd_at(mm, addr, pmd, orig_pmd);
  		tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
  	}
802a3a92a   Shaohua Li   mm: reclaim MADV_...
1554
1555
  
  	mark_page_lazyfree(page);
319904ad4   Huang Ying   mm, THP: clean up...
1556
  	ret = true;
b8d3c4c30   Minchan Kim   mm/huge_memory.c:...
1557
1558
1559
1560
1561
  out:
  	spin_unlock(ptl);
  out_unlocked:
  	return ret;
  }
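  /*
   * Illustrative userspace sketch, not part of the original file: passing
   * a PMD-sized, PMD-aligned range to madvise(MADV_FREE) lets the code
   * above lazily free the whole THP without splitting it, while a smaller
   * range takes the split_huge_page() path instead.  The 2 MB size below
   * assumes an x86-64 style HPAGE_PMD_SIZE.
   *
   *	#include <stdlib.h>
   *	#include <string.h>
   *	#include <sys/mman.h>
   *
   *	size_t sz = 2UL << 20;
   *	void *p;
   *
   *	if (posix_memalign(&p, sz, sz) == 0) {
   *		memset(p, 0xaa, sz);		// fault the range in (possibly as a THP)
   *		madvise(p, sz, MADV_FREE);	// whole-PMD lazy free
   *	}
   */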
953c66c2b   Aneesh Kumar K.V   mm: THP page cach...
1562
1563
1564
1565
1566
1567
  static inline void zap_deposited_table(struct mm_struct *mm, pmd_t *pmd)
  {
  	pgtable_t pgtable;
  
  	pgtable = pgtable_trans_huge_withdraw(mm, pmd);
  	pte_free(mm, pgtable);
c4812909f   Kirill A. Shutemov   mm: introduce wra...
1568
  	mm_dec_nr_ptes(mm);
953c66c2b   Aneesh Kumar K.V   mm: THP page cach...
1569
  }
71e3aac07   Andrea Arcangeli   thp: transparent ...
1570
  int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
f21760b15   Shaohua Li   thp: add tlb_remo...
1571
  		 pmd_t *pmd, unsigned long addr)
71e3aac07   Andrea Arcangeli   thp: transparent ...
1572
  {
da1467690   Kirill A. Shutemov   thp: fix zap_huge...
1573
  	pmd_t orig_pmd;
bf929152e   Kirill A. Shutemov   mm, thp: change p...
1574
  	spinlock_t *ptl;
71e3aac07   Andrea Arcangeli   thp: transparent ...
1575

ed6a79352   Peter Zijlstra   asm-generic/tlb, ...
1576
  	tlb_change_page_size(tlb, HPAGE_PMD_SIZE);
07e326610   Aneesh Kumar K.V   mm: add tlb_remov...
1577

b6ec57f4b   Kirill A. Shutemov   thp: change pmd_t...
1578
1579
  	ptl = __pmd_trans_huge_lock(pmd, vma);
  	if (!ptl)
da1467690   Kirill A. Shutemov   thp: fix zap_huge...
1580
1581
1582
1583
1584
1585
1586
  		return 0;
  	/*
  	 * For architectures like ppc64 we look at deposited pgtable
  	 * when calling pmdp_huge_get_and_clear. So do the
  	 * pgtable_trans_huge_withdraw after finishing pmdp related
  	 * operations.
  	 */
93a98695f   Aneesh Kumar K.V   mm: change pmdp_h...
1587
1588
  	orig_pmd = pmdp_huge_get_and_clear_full(vma, addr, pmd,
  						tlb->fullmm);
da1467690   Kirill A. Shutemov   thp: fix zap_huge...
1589
  	tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
2484ca9b6   Thomas Hellstrom (VMware)   mm: Introduce vma...
1590
  	if (vma_is_special_huge(vma)) {
3b6521f53   Oliver O'Halloran   mm/huge_memory.c:...
1591
1592
  		if (arch_needs_pgtable_deposit())
  			zap_deposited_table(tlb->mm, pmd);
da1467690   Kirill A. Shutemov   thp: fix zap_huge...
1593
1594
  		spin_unlock(ptl);
  		if (is_huge_zero_pmd(orig_pmd))
c0f2e176f   Aneesh Kumar K.V   mm: use the corre...
1595
  			tlb_remove_page_size(tlb, pmd_page(orig_pmd), HPAGE_PMD_SIZE);
da1467690   Kirill A. Shutemov   thp: fix zap_huge...
1596
  	} else if (is_huge_zero_pmd(orig_pmd)) {
c14a6eb44   Oliver O'Halloran   mm/huge_memory.c:...
1597
  		zap_deposited_table(tlb->mm, pmd);
da1467690   Kirill A. Shutemov   thp: fix zap_huge...
1598
  		spin_unlock(ptl);
c0f2e176f   Aneesh Kumar K.V   mm: use the corre...
1599
  		tlb_remove_page_size(tlb, pmd_page(orig_pmd), HPAGE_PMD_SIZE);
da1467690   Kirill A. Shutemov   thp: fix zap_huge...
1600
  	} else {
616b83715   Zi Yan   mm: thp: enable t...
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
  		struct page *page = NULL;
  		int flush_needed = 1;
  
  		if (pmd_present(orig_pmd)) {
  			page = pmd_page(orig_pmd);
  			page_remove_rmap(page, true);
  			VM_BUG_ON_PAGE(page_mapcount(page) < 0, page);
  			VM_BUG_ON_PAGE(!PageHead(page), page);
  		} else if (thp_migration_supported()) {
  			swp_entry_t entry;
  
  			VM_BUG_ON(!is_pmd_migration_entry(orig_pmd));
  			entry = pmd_to_swp_entry(orig_pmd);
  			page = pfn_to_page(swp_offset(entry));
  			flush_needed = 0;
  		} else
  			WARN_ONCE(1, "Non present huge pmd without pmd migration enabled!");
b5072380e   Kirill A. Shutemov   thp: support file...
1618
  		if (PageAnon(page)) {
c14a6eb44   Oliver O'Halloran   mm/huge_memory.c:...
1619
  			zap_deposited_table(tlb->mm, pmd);
b5072380e   Kirill A. Shutemov   thp: support file...
1620
1621
  			add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR);
  		} else {
953c66c2b   Aneesh Kumar K.V   mm: THP page cach...
1622
1623
  			if (arch_needs_pgtable_deposit())
  				zap_deposited_table(tlb->mm, pmd);
fadae2953   Yang Shi   thp: use mm_file_...
1624
  			add_mm_counter(tlb->mm, mm_counter_file(page), -HPAGE_PMD_NR);
b5072380e   Kirill A. Shutemov   thp: support file...
1625
  		}
616b83715   Zi Yan   mm: thp: enable t...
1626

da1467690   Kirill A. Shutemov   thp: fix zap_huge...
1627
  		spin_unlock(ptl);
616b83715   Zi Yan   mm: thp: enable t...
1628
1629
  		if (flush_needed)
  			tlb_remove_page_size(tlb, page, HPAGE_PMD_SIZE);
025c5b245   Naoya Horiguchi   thp: optimize awa...
1630
  	}
da1467690   Kirill A. Shutemov   thp: fix zap_huge...
1631
  	return 1;
71e3aac07   Andrea Arcangeli   thp: transparent ...
1632
  }
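  /*
   * Illustrative userspace sketch, not part of the original file:
   * zap_huge_pmd() is what ends up tearing the mapping down when a
   * THP-backed range is dropped or unmapped, e.g.:
   *
   *	madvise(p, 2UL << 20, MADV_DONTNEED);	// zap in place, keep the VMA
   *	munmap(p, 2UL << 20);			// or remove the mapping entirely
   *
   * "p" (a PMD-aligned, THP-backed address) and the 2 MB size are
   * assumptions made for the example only.
   */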
1dd38b6c2   Aneesh Kumar K.V   mm: move vma_is_a...
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
  #ifndef pmd_move_must_withdraw
  static inline int pmd_move_must_withdraw(spinlock_t *new_pmd_ptl,
  					 spinlock_t *old_pmd_ptl,
  					 struct vm_area_struct *vma)
  {
  	/*
  	 * With the split pmd lock we also need to move the preallocated
  	 * PTE page table if new_pmd is on a different PMD page table.
  	 *
  	 * We also don't deposit and withdraw tables for file pages.
  	 */
  	return (new_pmd_ptl != old_pmd_ptl) && vma_is_anonymous(vma);
  }
  #endif
ab6e3d093   Naoya Horiguchi   mm: soft-dirty: k...
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
  static pmd_t move_soft_dirty_pmd(pmd_t pmd)
  {
  #ifdef CONFIG_MEM_SOFT_DIRTY
  	if (unlikely(is_pmd_migration_entry(pmd)))
  		pmd = pmd_swp_mksoft_dirty(pmd);
  	else if (pmd_present(pmd))
  		pmd = pmd_mksoft_dirty(pmd);
  #endif
  	return pmd;
  }
bf8616d5f   Hugh Dickins   huge mm: move_hug...
1657
  bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr,
b8aa9d9d9   Wei Yang   mm/mremap: it is ...
1658
  		  unsigned long new_addr, pmd_t *old_pmd, pmd_t *new_pmd)
37a1c49a9   Andrea Arcangeli   thp: mremap suppo...
1659
  {
bf929152e   Kirill A. Shutemov   mm, thp: change p...
1660
  	spinlock_t *old_ptl, *new_ptl;
37a1c49a9   Andrea Arcangeli   thp: mremap suppo...
1661
  	pmd_t pmd;
37a1c49a9   Andrea Arcangeli   thp: mremap suppo...
1662
  	struct mm_struct *mm = vma->vm_mm;
5d1904204   Aaron Lu   mremap: fix race ...
1663
  	bool force_flush = false;
37a1c49a9   Andrea Arcangeli   thp: mremap suppo...
1664

37a1c49a9   Andrea Arcangeli   thp: mremap suppo...
1665
1666
1667
1668
1669
1670
  	/*
  	 * The destination pmd shouldn't be established, free_pgtables()
  	 * should have released it.
  	 */
  	if (WARN_ON(!pmd_none(*new_pmd))) {
  		VM_BUG_ON(pmd_trans_huge(*new_pmd));
4b471e889   Kirill A. Shutemov   mm, thp: remove i...
1671
  		return false;
37a1c49a9   Andrea Arcangeli   thp: mremap suppo...
1672
  	}
bf929152e   Kirill A. Shutemov   mm, thp: change p...
1673
1674
  	/*
  	 * We don't have to worry about the ordering of src and dst
c1e8d7c6a   Michel Lespinasse   mmap locking API:...
1675
  	 * ptlocks because exclusive mmap_lock prevents deadlock.
bf929152e   Kirill A. Shutemov   mm, thp: change p...
1676
  	 */
b6ec57f4b   Kirill A. Shutemov   thp: change pmd_t...
1677
1678
  	old_ptl = __pmd_trans_huge_lock(old_pmd, vma);
  	if (old_ptl) {
bf929152e   Kirill A. Shutemov   mm, thp: change p...
1679
1680
1681
  		new_ptl = pmd_lockptr(mm, new_pmd);
  		if (new_ptl != old_ptl)
  			spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
8809aa2d2   Aneesh Kumar K.V   mm: clarify that ...
1682
  		pmd = pmdp_huge_get_and_clear(mm, old_addr, old_pmd);
eb66ae030   Linus Torvalds   mremap: properly ...
1683
  		if (pmd_present(pmd))
a2ce2666a   Aaron Lu   mremap: move_ptes...
1684
  			force_flush = true;
025c5b245   Naoya Horiguchi   thp: optimize awa...
1685
  		VM_BUG_ON(!pmd_none(*new_pmd));
3592806cf   Kirill A. Shutemov   thp: move preallo...
1686

1dd38b6c2   Aneesh Kumar K.V   mm: move vma_is_a...
1687
  		if (pmd_move_must_withdraw(new_ptl, old_ptl, vma)) {
b3084f4db   Aneesh Kumar K.V   powerpc/thp: Fix ...
1688
  			pgtable_t pgtable;
3592806cf   Kirill A. Shutemov   thp: move preallo...
1689
1690
  			pgtable = pgtable_trans_huge_withdraw(mm, old_pmd);
  			pgtable_trans_huge_deposit(mm, new_pmd, pgtable);
3592806cf   Kirill A. Shutemov   thp: move preallo...
1691
  		}
ab6e3d093   Naoya Horiguchi   mm: soft-dirty: k...
1692
1693
  		pmd = move_soft_dirty_pmd(pmd);
  		set_pmd_at(mm, new_addr, new_pmd, pmd);
5d1904204   Aaron Lu   mremap: fix race ...
1694
1695
  		if (force_flush)
  			flush_tlb_range(vma, old_addr, old_addr + PMD_SIZE);
eb66ae030   Linus Torvalds   mremap: properly ...
1696
1697
  		if (new_ptl != old_ptl)
  			spin_unlock(new_ptl);
bf929152e   Kirill A. Shutemov   mm, thp: change p...
1698
  		spin_unlock(old_ptl);
4b471e889   Kirill A. Shutemov   mm, thp: remove i...
1699
  		return true;
37a1c49a9   Andrea Arcangeli   thp: mremap suppo...
1700
  	}
4b471e889   Kirill A. Shutemov   mm, thp: remove i...
1701
  	return false;
37a1c49a9   Andrea Arcangeli   thp: mremap suppo...
1702
  }
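  /*
   * Illustrative userspace sketch, not part of the original file: mremap()
   * of a PMD-aligned range is the usual way to exercise move_huge_pmd(),
   * which relocates whole huge pmds instead of copying page by page.
   * "old" and "sz" are assumptions made for the example only.
   *
   *	#define _GNU_SOURCE
   *	#include <stdio.h>
   *	#include <sys/mman.h>
   *
   *	void *new = mremap(old, sz, sz, MREMAP_MAYMOVE);
   *	if (new == MAP_FAILED)
   *		perror("mremap");
   */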
f123d74ab   Mel Gorman   mm: Only flush TL...
1703
1704
1705
1706
1707
1708
  /*
   * Returns
   *  - 0 if PMD could not be locked
   *  - 1 if PMD was locked but protections are unchanged and TLB flush is unnecessary
   *  - HPAGE_PMD_NR if protections changed and TLB flush is necessary
   */
cd7548ab3   Johannes Weiner   thp: mprotect: tr...
1709
  int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
58705444c   Peter Xu   mm: merge paramet...
1710
  		unsigned long addr, pgprot_t newprot, unsigned long cp_flags)
cd7548ab3   Johannes Weiner   thp: mprotect: tr...
1711
1712
  {
  	struct mm_struct *mm = vma->vm_mm;
bf929152e   Kirill A. Shutemov   mm, thp: change p...
1713
  	spinlock_t *ptl;
0a85e51d3   Kirill A. Shutemov   thp: reduce inden...
1714
1715
1716
  	pmd_t entry;
  	bool preserve_write;
  	int ret;
58705444c   Peter Xu   mm: merge paramet...
1717
  	bool prot_numa = cp_flags & MM_CP_PROT_NUMA;
292924b26   Peter Xu   userfaultfd: wp: ...
1718
1719
  	bool uffd_wp = cp_flags & MM_CP_UFFD_WP;
  	bool uffd_wp_resolve = cp_flags & MM_CP_UFFD_WP_RESOLVE;
cd7548ab3   Johannes Weiner   thp: mprotect: tr...
1720

b6ec57f4b   Kirill A. Shutemov   thp: change pmd_t...
1721
  	ptl = __pmd_trans_huge_lock(pmd, vma);
0a85e51d3   Kirill A. Shutemov   thp: reduce inden...
1722
1723
  	if (!ptl)
  		return 0;
e944fd67b   Mel Gorman   mm: numa: do not ...
1724

0a85e51d3   Kirill A. Shutemov   thp: reduce inden...
1725
1726
  	preserve_write = prot_numa && pmd_write(*pmd);
  	ret = 1;
e944fd67b   Mel Gorman   mm: numa: do not ...
1727

84c3fc4e9   Zi Yan   mm: thp: check pm...
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
  #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
  	if (is_swap_pmd(*pmd)) {
  		swp_entry_t entry = pmd_to_swp_entry(*pmd);
  
  		VM_BUG_ON(!is_pmd_migration_entry(*pmd));
  		if (is_write_migration_entry(entry)) {
  			pmd_t newpmd;
  			/*
  			 * A protection check is difficult so
  			 * just be safe and disable write
  			 */
  			make_migration_entry_read(&entry);
  			newpmd = swp_entry_to_pmd(entry);
ab6e3d093   Naoya Horiguchi   mm: soft-dirty: k...
1741
1742
  			if (pmd_swp_soft_dirty(*pmd))
  				newpmd = pmd_swp_mksoft_dirty(newpmd);
84c3fc4e9   Zi Yan   mm: thp: check pm...
1743
1744
1745
1746
1747
  			set_pmd_at(mm, addr, pmd, newpmd);
  		}
  		goto unlock;
  	}
  #endif
0a85e51d3   Kirill A. Shutemov   thp: reduce inden...
1748
1749
1750
1751
1752
1753
1754
  	/*
  	 * Avoid trapping faults against the zero page. The read-only
  	 * data is likely to be read-cached on the local CPU and
  	 * local/remote hits to the zero page are not interesting.
  	 */
  	if (prot_numa && is_huge_zero_pmd(*pmd))
  		goto unlock;
025c5b245   Naoya Horiguchi   thp: optimize awa...
1755

0a85e51d3   Kirill A. Shutemov   thp: reduce inden...
1756
1757
  	if (prot_numa && pmd_protnone(*pmd))
  		goto unlock;
ced108037   Kirill A. Shutemov   thp: fix MADV_DON...
1758
  	/*
3e4e28c5a   Michel Lespinasse   mmap locking API:...
1759
  	 * In the prot_numa case, we are under mmap_read_lock(mm). It's critical
ced108037   Kirill A. Shutemov   thp: fix MADV_DON...
1760
  	 * to not clear pmd intermittently to avoid race with MADV_DONTNEED
3e4e28c5a   Michel Lespinasse   mmap locking API:...
1761
  	 * which is also under mmap_read_lock(mm):
ced108037   Kirill A. Shutemov   thp: fix MADV_DON...
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
  	 *
  	 *	CPU0:				CPU1:
  	 *				change_huge_pmd(prot_numa=1)
  	 *				 pmdp_huge_get_and_clear_notify()
  	 * madvise_dontneed()
  	 *  zap_pmd_range()
  	 *   pmd_trans_huge(*pmd) == 0 (without ptl)
  	 *   // skip the pmd
  	 *				 set_pmd_at();
  	 *				 // pmd is re-established
  	 *
  	 * The race makes MADV_DONTNEED miss the huge pmd and not clear it,
  	 * which may break userspace.
  	 *
  	 * pmdp_invalidate() is required to make sure we don't miss
  	 * dirty/young flags set by hardware.
  	 */
a3cf988fc   Kirill A. Shutemov   mm: use updated p...
1779
  	entry = pmdp_invalidate(vma, addr, pmd);
ced108037   Kirill A. Shutemov   thp: fix MADV_DON...
1780

0a85e51d3   Kirill A. Shutemov   thp: reduce inden...
1781
1782
1783
  	entry = pmd_modify(entry, newprot);
  	if (preserve_write)
  		entry = pmd_mk_savedwrite(entry);
292924b26   Peter Xu   userfaultfd: wp: ...
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
  	if (uffd_wp) {
  		entry = pmd_wrprotect(entry);
  		entry = pmd_mkuffd_wp(entry);
  	} else if (uffd_wp_resolve) {
  		/*
  		 * Leave the write bit to be handled by the page
  		 * fault handler, so that things like COW can be
  		 * properly handled.
  		 */
  		entry = pmd_clear_uffd_wp(entry);
  	}
0a85e51d3   Kirill A. Shutemov   thp: reduce inden...
1795
1796
1797
1798
1799
  	ret = HPAGE_PMD_NR;
  	set_pmd_at(mm, addr, pmd, entry);
  	BUG_ON(vma_is_anonymous(vma) && !preserve_write && pmd_write(entry));
  unlock:
  	spin_unlock(ptl);
025c5b245   Naoya Horiguchi   thp: optimize awa...
1800
1801
1802
1803
  	return ret;
  }
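  /*
   * Illustrative userspace sketch, not part of the original file:
   * change_huge_pmd() is reached via mprotect() on a THP-backed range (and
   * internally by NUMA balancing and userfaultfd write-protect).  With a
   * PMD-aligned, PMD-sized range the whole huge pmd is updated at once.
   * "p" and the 2 MB size are assumptions made for the example only.
   *
   *	#include <sys/mman.h>
   *
   *	mprotect(p, 2UL << 20, PROT_READ);			// write-protect the THP
   *	mprotect(p, 2UL << 20, PROT_READ | PROT_WRITE);		// make it writable again
   */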
  
  /*
8f19b0c05   Huang Ying   thp: fix comments...
1804
   * Returns the page table lock pointer if a given pmd maps a thp, NULL otherwise.
025c5b245   Naoya Horiguchi   thp: optimize awa...
1805
   *
8f19b0c05   Huang Ying   thp: fix comments...
1806
1807
   * Note that if it returns the page table lock pointer, this routine returns
   * without unlocking the page table lock, so callers must unlock it.
025c5b245   Naoya Horiguchi   thp: optimize awa...
1808
   */
b6ec57f4b   Kirill A. Shutemov   thp: change pmd_t...
1809
  spinlock_t *__pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma)
025c5b245   Naoya Horiguchi   thp: optimize awa...
1810
  {
b6ec57f4b   Kirill A. Shutemov   thp: change pmd_t...
1811
1812
  	spinlock_t *ptl;
  	ptl = pmd_lock(vma->vm_mm, pmd);
84c3fc4e9   Zi Yan   mm: thp: check pm...
1813
1814
  	if (likely(is_swap_pmd(*pmd) || pmd_trans_huge(*pmd) ||
  			pmd_devmap(*pmd)))
b6ec57f4b   Kirill A. Shutemov   thp: change pmd_t...
1815
1816
1817
  		return ptl;
  	spin_unlock(ptl);
  	return NULL;
cd7548ab3   Johannes Weiner   thp: mprotect: tr...
1818
  }
a00cc7d9d   Matthew Wilcox   mm, x86: add supp...
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
  /*
   * Returns the page table lock pointer if a given pud maps a thp, NULL otherwise.
   *
   * Note that if it returns the page table lock pointer, this routine returns
   * without unlocking the page table lock, so callers must unlock it.
   */
  spinlock_t *__pud_trans_huge_lock(pud_t *pud, struct vm_area_struct *vma)
  {
  	spinlock_t *ptl;
  
  	ptl = pud_lock(vma->vm_mm, pud);
  	if (likely(pud_trans_huge(*pud) || pud_devmap(*pud)))
  		return ptl;
  	spin_unlock(ptl);
  	return NULL;
  }
  
  #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
  int zap_huge_pud(struct mmu_gather *tlb, struct vm_area_struct *vma,
  		 pud_t *pud, unsigned long addr)
  {
a00cc7d9d   Matthew Wilcox   mm, x86: add supp...
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
  	spinlock_t *ptl;
  
  	ptl = __pud_trans_huge_lock(pud, vma);
  	if (!ptl)
  		return 0;
  	/*
  	 * For architectures like ppc64 we look at deposited pgtable
  	 * when calling pudp_huge_get_and_clear. So do the
  	 * pgtable_trans_huge_withdraw after finishing pudp related
  	 * operations.
  	 */
70516b936   Qian Cai   mm/huge_memory.c:...
1851
  	pudp_huge_get_and_clear_full(tlb->mm, addr, pud, tlb->fullmm);
a00cc7d9d   Matthew Wilcox   mm, x86: add supp...
1852
  	tlb_remove_pud_tlb_entry(tlb, pud, addr);
2484ca9b6   Thomas Hellstrom (VMware)   mm: Introduce vma...
1853
  	if (vma_is_special_huge(vma)) {
a00cc7d9d   Matthew Wilcox   mm, x86: add supp...
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
  		spin_unlock(ptl);
  		/* No zero page support yet */
  	} else {
  		/* No support for anonymous PUD pages yet */
  		BUG();
  	}
  	return 1;
  }
  
  static void __split_huge_pud_locked(struct vm_area_struct *vma, pud_t *pud,
  		unsigned long haddr)
  {
  	VM_BUG_ON(haddr & ~HPAGE_PUD_MASK);
  	VM_BUG_ON_VMA(vma->vm_start > haddr, vma);
  	VM_BUG_ON_VMA(vma->vm_end < haddr + HPAGE_PUD_SIZE, vma);
  	VM_BUG_ON(!pud_trans_huge(*pud) && !pud_devmap(*pud));
ce9311cf9   Yisheng Xie   mm/vmstats: add t...
1870
  	count_vm_event(THP_SPLIT_PUD);
a00cc7d9d   Matthew Wilcox   mm, x86: add supp...
1871
1872
1873
1874
1875
1876
1877
1878
  
  	pudp_huge_clear_flush_notify(vma, haddr, pud);
  }
  
  void __split_huge_pud(struct vm_area_struct *vma, pud_t *pud,
  		unsigned long address)
  {
  	spinlock_t *ptl;
ac46d4f3c   Jérôme Glisse   mm/mmu_notifier: ...
1879
  	struct mmu_notifier_range range;
a00cc7d9d   Matthew Wilcox   mm, x86: add supp...
1880

7269f9999   Jérôme Glisse   mm/mmu_notifier: ...
1881
  	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
6f4f13e8d   Jérôme Glisse   mm/mmu_notifier: ...
1882
  				address & HPAGE_PUD_MASK,
ac46d4f3c   Jérôme Glisse   mm/mmu_notifier: ...
1883
1884
1885
  				(address & HPAGE_PUD_MASK) + HPAGE_PUD_SIZE);
  	mmu_notifier_invalidate_range_start(&range);
  	ptl = pud_lock(vma->vm_mm, pud);
a00cc7d9d   Matthew Wilcox   mm, x86: add supp...
1886
1887
  	if (unlikely(!pud_trans_huge(*pud) && !pud_devmap(*pud)))
  		goto out;
ac46d4f3c   Jérôme Glisse   mm/mmu_notifier: ...
1888
  	__split_huge_pud_locked(vma, pud, range.start);
a00cc7d9d   Matthew Wilcox   mm, x86: add supp...
1889
1890
1891
  
  out:
  	spin_unlock(ptl);
4645b9fe8   Jérôme Glisse   mm/mmu_notifier: ...
1892
1893
1894
1895
  	/*
  	 * No need to double call mmu_notifier->invalidate_range() callback as
  	 * the above pudp_huge_clear_flush_notify() already called it.
  	 */
ac46d4f3c   Jérôme Glisse   mm/mmu_notifier: ...
1896
  	mmu_notifier_invalidate_range_only_end(&range);
a00cc7d9d   Matthew Wilcox   mm, x86: add supp...
1897
1898
  }
  #endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
eef1b3ba0   Kirill A. Shutemov   thp: implement sp...
1899
1900
1901
1902
1903
1904
1905
  static void __split_huge_zero_page_pmd(struct vm_area_struct *vma,
  		unsigned long haddr, pmd_t *pmd)
  {
  	struct mm_struct *mm = vma->vm_mm;
  	pgtable_t pgtable;
  	pmd_t _pmd;
  	int i;
0f10851ea   Jérôme Glisse   mm/mmu_notifier: ...
1906
1907
1908
1909
1910
1911
  	/*
  	 * Leave the pmd empty until the pte is filled. Note that it is fine to delay
  	 * notification until mmu_notifier_invalidate_range_end() as we are
  	 * replacing a zero pmd write protected page with a zero pte write
  	 * protected page.
  	 *
ad56b738c   Mike Rapoport   docs/vm: rename d...
1912
  	 * See Documentation/vm/mmu_notifier.rst
0f10851ea   Jérôme Glisse   mm/mmu_notifier: ...
1913
1914
  	 */
  	pmdp_huge_clear_flush(vma, haddr, pmd);
eef1b3ba0   Kirill A. Shutemov   thp: implement sp...
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
  
  	pgtable = pgtable_trans_huge_withdraw(mm, pmd);
  	pmd_populate(mm, &_pmd, pgtable);
  
  	for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
  		pte_t *pte, entry;
  		entry = pfn_pte(my_zero_pfn(haddr), vma->vm_page_prot);
  		entry = pte_mkspecial(entry);
  		pte = pte_offset_map(&_pmd, haddr);
  		VM_BUG_ON(!pte_none(*pte));
  		set_pte_at(mm, haddr, pte, entry);
  		pte_unmap(pte);
  	}
  	smp_wmb(); /* make pte visible before pmd */
  	pmd_populate(mm, pmd, pgtable);
eef1b3ba0   Kirill A. Shutemov   thp: implement sp...
1930
1931
1932
  }
  
  static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
ba9882808   Kirill A. Shutemov   thp: add option t...
1933
  		unsigned long haddr, bool freeze)
eef1b3ba0   Kirill A. Shutemov   thp: implement sp...
1934
1935
1936
1937
  {
  	struct mm_struct *mm = vma->vm_mm;
  	struct page *page;
  	pgtable_t pgtable;
423ac9af3   Aneesh Kumar K.V   mm/thp: remove pm...
1938
  	pmd_t old_pmd, _pmd;
292924b26   Peter Xu   userfaultfd: wp: ...
1939
  	bool young, write, soft_dirty, pmd_migration = false, uffd_wp = false;
2ac015e29   Kirill A. Shutemov   thp: call pmdp_in...
1940
  	unsigned long addr;
eef1b3ba0   Kirill A. Shutemov   thp: implement sp...
1941
1942
1943
1944
1945
  	int i;
  
  	VM_BUG_ON(haddr & ~HPAGE_PMD_MASK);
  	VM_BUG_ON_VMA(vma->vm_start > haddr, vma);
  	VM_BUG_ON_VMA(vma->vm_end < haddr + HPAGE_PMD_SIZE, vma);
84c3fc4e9   Zi Yan   mm: thp: check pm...
1946
1947
  	VM_BUG_ON(!is_pmd_migration_entry(*pmd) && !pmd_trans_huge(*pmd)
  				&& !pmd_devmap(*pmd));
eef1b3ba0   Kirill A. Shutemov   thp: implement sp...
1948
1949
  
  	count_vm_event(THP_SPLIT_PMD);
d21b9e57c   Kirill A. Shutemov   thp: handle file ...
1950
1951
  	if (!vma_is_anonymous(vma)) {
  		_pmd = pmdp_huge_clear_flush_notify(vma, haddr, pmd);
953c66c2b   Aneesh Kumar K.V   mm: THP page cach...
1952
1953
1954
1955
1956
1957
  		/*
  		 * We are going to unmap this huge page. So
  		 * just go ahead and zap it
  		 */
  		if (arch_needs_pgtable_deposit())
  			zap_deposited_table(mm, pmd);
2484ca9b6   Thomas Hellstrom (VMware)   mm: Introduce vma...
1958
  		if (vma_is_special_huge(vma))
d21b9e57c   Kirill A. Shutemov   thp: handle file ...
1959
1960
  			return;
  		page = pmd_page(_pmd);
e1f1b1572   Hugh Dickins   mm/huge_memory.c:...
1961
1962
  		if (!PageDirty(page) && pmd_dirty(_pmd))
  			set_page_dirty(page);
d21b9e57c   Kirill A. Shutemov   thp: handle file ...
1963
1964
1965
1966
  		if (!PageReferenced(page) && pmd_young(_pmd))
  			SetPageReferenced(page);
  		page_remove_rmap(page, true);
  		put_page(page);
fadae2953   Yang Shi   thp: use mm_file_...
1967
  		add_mm_counter(mm, mm_counter_file(page), -HPAGE_PMD_NR);
eef1b3ba0   Kirill A. Shutemov   thp: implement sp...
1968
  		return;
ec0abae6d   Ralph Campbell   mm/thp: fix __spl...
1969
  	} else if (pmd_trans_huge(*pmd) && is_huge_zero_pmd(*pmd)) {
4645b9fe8   Jérôme Glisse   mm/mmu_notifier: ...
1970
1971
1972
1973
1974
1975
1976
1977
1978
  		/*
  		 * FIXME: Do we want to invalidate the secondary mmu by calling
  		 * mmu_notifier_invalidate_range()? See the comments below inside
  		 * __split_huge_pmd().
  		 *
  		 * We are going from a write-protected zero huge page to
  		 * write-protected zero small pages, so it does not seem useful
  		 * to invalidate the secondary mmu at this time.
  		 */
eef1b3ba0   Kirill A. Shutemov   thp: implement sp...
1979
1980
  		return __split_huge_zero_page_pmd(vma, haddr, pmd);
  	}
423ac9af3   Aneesh Kumar K.V   mm/thp: remove pm...
1981
1982
1983
1984
1985
1986
1987
1988
  	/*
  	 * Up to this point the pmd is present and huge and userland has the
  	 * whole access to the hugepage during the split (which happens in
  	 * place). If we overwrite the pmd with the not-huge version pointing
  	 * to the pte here (which of course we could if all CPUs were bug
  	 * free), userland could trigger a small page size TLB miss on the
  	 * small sized TLB while the hugepage TLB entry is still established in
  	 * the huge TLB. Some CPUs don't like that.
42742d9bd   Alexander A. Klimov   mm: thp: replace ...
1989
1990
  	 * See http://support.amd.com/TechDocs/41322_10h_Rev_Gd.pdf, Erratum
  	 * 383 on page 105. Intel should be safe but also warns that it's
423ac9af3   Aneesh Kumar K.V   mm/thp: remove pm...
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
  	 * only safe if the permission and cache attributes of the two entries
  	 * loaded in the two TLBs are identical (which should be the case here).
  	 * But it is generally safer to never allow small and huge TLB entries
  	 * for the same virtual address to be loaded simultaneously. So instead
  	 * of doing "pmd_populate(); flush_pmd_tlb_range();" we first mark the
  	 * current pmd notpresent (atomically because here the pmd_trans_huge
  	 * must remain set at all times on the pmd until the split is complete
  	 * for this pmd), then we flush the SMP TLB and finally we write the
  	 * non-huge version of the pmd entry with pmd_populate.
  	 */
  	old_pmd = pmdp_invalidate(vma, haddr, pmd);
423ac9af3   Aneesh Kumar K.V   mm/thp: remove pm...
2002
  	pmd_migration = is_pmd_migration_entry(old_pmd);
2e83ee1d8   Peter Xu   mm: thp: fix flag...
2003
  	if (unlikely(pmd_migration)) {
84c3fc4e9   Zi Yan   mm: thp: check pm...
2004
  		swp_entry_t entry;
423ac9af3   Aneesh Kumar K.V   mm/thp: remove pm...
2005
  		entry = pmd_to_swp_entry(old_pmd);
84c3fc4e9   Zi Yan   mm: thp: check pm...
2006
  		page = pfn_to_page(swp_offset(entry));
2e83ee1d8   Peter Xu   mm: thp: fix flag...
2007
2008
2009
  		write = is_write_migration_entry(entry);
  		young = false;
  		soft_dirty = pmd_swp_soft_dirty(old_pmd);
f45ec5ff1   Peter Xu   userfaultfd: wp: ...
2010
  		uffd_wp = pmd_swp_uffd_wp(old_pmd);
2e83ee1d8   Peter Xu   mm: thp: fix flag...
2011
  	} else {
423ac9af3   Aneesh Kumar K.V   mm/thp: remove pm...
2012
  		page = pmd_page(old_pmd);
2e83ee1d8   Peter Xu   mm: thp: fix flag...
2013
2014
2015
2016
2017
  		if (pmd_dirty(old_pmd))
  			SetPageDirty(page);
  		write = pmd_write(old_pmd);
  		young = pmd_young(old_pmd);
  		soft_dirty = pmd_soft_dirty(old_pmd);
292924b26   Peter Xu   userfaultfd: wp: ...
2018
  		uffd_wp = pmd_uffd_wp(old_pmd);
2e83ee1d8   Peter Xu   mm: thp: fix flag...
2019
  	}
eef1b3ba0   Kirill A. Shutemov   thp: implement sp...
2020
  	VM_BUG_ON_PAGE(!page_count(page), page);
fe896d187   Joonsoo Kim   mm: introduce pag...
2021
  	page_ref_add(page, HPAGE_PMD_NR - 1);
eef1b3ba0   Kirill A. Shutemov   thp: implement sp...
2022

423ac9af3   Aneesh Kumar K.V   mm/thp: remove pm...
2023
2024
2025
2026
  	/*
  	 * Withdraw the table only after we mark the pmd entry invalid.
  	 * This's critical for some architectures (Power).
  	 */
eef1b3ba0   Kirill A. Shutemov   thp: implement sp...
2027
2028
  	pgtable = pgtable_trans_huge_withdraw(mm, pmd);
  	pmd_populate(mm, &_pmd, pgtable);
2ac015e29   Kirill A. Shutemov   thp: call pmdp_in...
2029
  	for (i = 0, addr = haddr; i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE) {
eef1b3ba0   Kirill A. Shutemov   thp: implement sp...
2030
2031
2032
2033
2034
2035
  		pte_t entry, *pte;
  		/*
  		 * Note that NUMA hinting access restrictions are not
  		 * transferred to avoid any possibility of altering
  		 * permissions across VMAs.
  		 */
84c3fc4e9   Zi Yan   mm: thp: check pm...
2036
  		if (freeze || pmd_migration) {
ba9882808   Kirill A. Shutemov   thp: add option t...
2037
2038
2039
  			swp_entry_t swp_entry;
  			swp_entry = make_migration_entry(page + i, write);
  			entry = swp_entry_to_pte(swp_entry);
804dd1504   Andrea Arcangeli   soft_dirty: fix s...
2040
2041
  			if (soft_dirty)
  				entry = pte_swp_mksoft_dirty(entry);
f45ec5ff1   Peter Xu   userfaultfd: wp: ...
2042
2043
  			if (uffd_wp)
  				entry = pte_swp_mkuffd_wp(entry);
ba9882808   Kirill A. Shutemov   thp: add option t...
2044
  		} else {
6d2329f88   Andrea Arcangeli   mm: vm_page_prot:...
2045
  			entry = mk_pte(page + i, READ_ONCE(vma->vm_page_prot));
b8d3c4c30   Minchan Kim   mm/huge_memory.c:...
2046
  			entry = maybe_mkwrite(entry, vma);
ba9882808   Kirill A. Shutemov   thp: add option t...
2047
2048
2049
2050
  			if (!write)
  				entry = pte_wrprotect(entry);
  			if (!young)
  				entry = pte_mkold(entry);
804dd1504   Andrea Arcangeli   soft_dirty: fix s...
2051
2052
  			if (soft_dirty)
  				entry = pte_mksoft_dirty(entry);
292924b26   Peter Xu   userfaultfd: wp: ...
2053
2054
  			if (uffd_wp)
  				entry = pte_mkuffd_wp(entry);
ba9882808   Kirill A. Shutemov   thp: add option t...
2055
  		}
2ac015e29   Kirill A. Shutemov   thp: call pmdp_in...
2056
  		pte = pte_offset_map(&_pmd, addr);
eef1b3ba0   Kirill A. Shutemov   thp: implement sp...
2057
  		BUG_ON(!pte_none(*pte));
2ac015e29   Kirill A. Shutemov   thp: call pmdp_in...
2058
  		set_pte_at(mm, addr, pte, entry);
ec0abae6d   Ralph Campbell   mm/thp: fix __spl...
2059
  		if (!pmd_migration)
eef1b3ba0   Kirill A. Shutemov   thp: implement sp...
2060
  			atomic_inc(&page[i]._mapcount);
ec0abae6d   Ralph Campbell   mm/thp: fix __spl...
2061
  		pte_unmap(pte);
eef1b3ba0   Kirill A. Shutemov   thp: implement sp...
2062
  	}
ec0abae6d   Ralph Campbell   mm/thp: fix __spl...
2063
2064
2065
2066
2067
2068
2069
  	if (!pmd_migration) {
  		/*
  		 * Set PG_double_map before dropping compound_mapcount to avoid
  		 * false-negative page_mapped().
  		 */
  		if (compound_mapcount(page) > 1 &&
  		    !TestSetPageDoubleMap(page)) {
eef1b3ba0   Kirill A. Shutemov   thp: implement sp...
2070
  			for (i = 0; i < HPAGE_PMD_NR; i++)
ec0abae6d   Ralph Campbell   mm/thp: fix __spl...
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
  				atomic_inc(&page[i]._mapcount);
  		}
  
  		lock_page_memcg(page);
  		if (atomic_add_negative(-1, compound_mapcount_ptr(page))) {
  			/* Last compound_mapcount is gone. */
  			__dec_lruvec_page_state(page, NR_ANON_THPS);
  			if (TestClearPageDoubleMap(page)) {
  				/* No need in mapcount reference anymore */
  				for (i = 0; i < HPAGE_PMD_NR; i++)
  					atomic_dec(&page[i]._mapcount);
  			}
eef1b3ba0   Kirill A. Shutemov   thp: implement sp...
2083
  		}
ec0abae6d   Ralph Campbell   mm/thp: fix __spl...
2084
  		unlock_page_memcg(page);
eef1b3ba0   Kirill A. Shutemov   thp: implement sp...
2085
2086
2087
2088
  	}
  
  	smp_wmb(); /* make pte visible before pmd */
  	pmd_populate(mm, pmd, pgtable);
e9b61f198   Kirill A. Shutemov   thp: reintroduce ...
2089
2090
  
  	if (freeze) {
2ac015e29   Kirill A. Shutemov   thp: call pmdp_in...
2091
  		for (i = 0; i < HPAGE_PMD_NR; i++) {
e9b61f198   Kirill A. Shutemov   thp: reintroduce ...
2092
2093
2094
2095
  			page_remove_rmap(page + i, false);
  			put_page(page + i);
  		}
  	}
eef1b3ba0   Kirill A. Shutemov   thp: implement sp...
2096
2097
2098
  }
  
  void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
33f4751e9   Naoya Horiguchi   mm: thp: move pmd...
2099
  		unsigned long address, bool freeze, struct page *page)
eef1b3ba0   Kirill A. Shutemov   thp: implement sp...
2100
2101
  {
  	spinlock_t *ptl;
ac46d4f3c   Jérôme Glisse   mm/mmu_notifier: ...
2102
  	struct mmu_notifier_range range;
c444eb564   Andrea Arcangeli   mm: thp: make the...
2103
2104
  	bool was_locked = false;
  	pmd_t _pmd;
eef1b3ba0   Kirill A. Shutemov   thp: implement sp...
2105

7269f9999   Jérôme Glisse   mm/mmu_notifier: ...
2106
  	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
6f4f13e8d   Jérôme Glisse   mm/mmu_notifier: ...
2107
  				address & HPAGE_PMD_MASK,
ac46d4f3c   Jérôme Glisse   mm/mmu_notifier: ...
2108
2109
2110
  				(address & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE);
  	mmu_notifier_invalidate_range_start(&range);
  	ptl = pmd_lock(vma->vm_mm, pmd);
33f4751e9   Naoya Horiguchi   mm: thp: move pmd...
2111
2112
2113
2114
2115
2116
  
  	/*
  	 * If the caller asks us to set up migration entries, we need a page
  	 * to check the pmd against. Otherwise we can end up replacing the
  	 * wrong page.
  	 */
  	VM_BUG_ON(freeze && !page);
c444eb564   Andrea Arcangeli   mm: thp: make the...
2117
2118
2119
2120
2121
2122
  	if (page) {
  		VM_WARN_ON_ONCE(!PageLocked(page));
  		was_locked = true;
  		if (page != pmd_page(*pmd))
  			goto out;
  	}
33f4751e9   Naoya Horiguchi   mm: thp: move pmd...
2123

c444eb564   Andrea Arcangeli   mm: thp: make the...
2124
  repeat:
5c7fb56e5   Dan Williams   mm, dax: dax-pmd ...
2125
  	if (pmd_trans_huge(*pmd)) {
c444eb564   Andrea Arcangeli   mm: thp: make the...
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
  		if (!page) {
  			page = pmd_page(*pmd);
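  			/*
  			 * Lock ordering does not allow sleeping on the page
  			 * lock while holding the pmd lock: if trylock fails,
  			 * drop ptl, take the page lock, retake ptl and check
  			 * that the pmd was not changed underneath us.
  			 */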
  			if (unlikely(!trylock_page(page))) {
  				get_page(page);
  				_pmd = *pmd;
  				spin_unlock(ptl);
  				lock_page(page);
  				spin_lock(ptl);
  				if (unlikely(!pmd_same(*pmd, _pmd))) {
  					unlock_page(page);
  					put_page(page);
  					page = NULL;
  					goto repeat;
  				}
  				put_page(page);
  			}
  		}
5c7fb56e5   Dan Williams   mm, dax: dax-pmd ...
2143
  		if (PageMlocked(page))
5f7377147   Kirill A. Shutemov   thp: fix deadlock...
2144
  			clear_page_mlock(page);
84c3fc4e9   Zi Yan   mm: thp: check pm...
2145
  	} else if (!(pmd_devmap(*pmd) || is_pmd_migration_entry(*pmd)))
e90309c9f   Kirill A. Shutemov   thp: allow mlocke...
2146
  		goto out;
ac46d4f3c   Jérôme Glisse   mm/mmu_notifier: ...
2147
  	__split_huge_pmd_locked(vma, pmd, range.start, freeze);
e90309c9f   Kirill A. Shutemov   thp: allow mlocke...
2148
  out:
eef1b3ba0   Kirill A. Shutemov   thp: implement sp...
2149
  	spin_unlock(ptl);
c444eb564   Andrea Arcangeli   mm: thp: make the...
2150
2151
  	if (!was_locked && page)
  		unlock_page(page);
4645b9fe8   Jérôme Glisse   mm/mmu_notifier: ...
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
  	/*
  	 * No need to double call the mmu_notifier->invalidate_range() callback.
  	 * There are 3 cases to consider inside __split_huge_pmd_locked():
  	 *  1) pmdp_huge_clear_flush_notify() calls invalidate_range(), which is
  	 *     the obvious case.
  	 *  2) __split_huge_zero_page_pmd() leaves the read-only zero page
  	 *     mapped; any write fault will trigger a flush_notify before
  	 *     pointing to a new page (it is fine if the secondary mmu keeps
  	 *     pointing to the old zero page in the meantime).
  	 *  3) Splitting a huge pmd into ptes pointing to the same page. There
  	 *     is no need to invalidate the secondary TLB entries, they are all
  	 *     still valid, and any further change to an individual pte will
  	 *     notify. So no need to call mmu_notifier->invalidate_range().
  	 */
ac46d4f3c   Jérôme Glisse   mm/mmu_notifier: ...
2165
  	mmu_notifier_invalidate_range_only_end(&range);
eef1b3ba0   Kirill A. Shutemov   thp: implement sp...
2166
  }
fec89c109   Kirill A. Shutemov   thp: rewrite free...
2167
2168
  void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address,
  		bool freeze, struct page *page)
94fcc585f   Andrea Arcangeli   thp: avoid breaki...
2169
  {
f72e7dcdd   Hugh Dickins   mm: let mm_find_p...
2170
  	pgd_t *pgd;
c2febafc6   Kirill A. Shutemov   mm: convert gener...
2171
  	p4d_t *p4d;
f72e7dcdd   Hugh Dickins   mm: let mm_find_p...
2172
  	pud_t *pud;
94fcc585f   Andrea Arcangeli   thp: avoid breaki...
2173
  	pmd_t *pmd;
78ddc5347   Kirill A. Shutemov   thp: rename split...
2174
  	pgd = pgd_offset(vma->vm_mm, address);
f72e7dcdd   Hugh Dickins   mm: let mm_find_p...
2175
2176
  	if (!pgd_present(*pgd))
  		return;
c2febafc6   Kirill A. Shutemov   mm: convert gener...
2177
2178
2179
2180
2181
  	p4d = p4d_offset(pgd, address);
  	if (!p4d_present(*p4d))
  		return;
  
  	pud = pud_offset(p4d, address);
f72e7dcdd   Hugh Dickins   mm: let mm_find_p...
2182
2183
2184
2185
  	if (!pud_present(*pud))
  		return;
  
  	pmd = pmd_offset(pud, address);
fec89c109   Kirill A. Shutemov   thp: rewrite free...
2186

33f4751e9   Naoya Horiguchi   mm: thp: move pmd...
2187
  	__split_huge_pmd(vma, pmd, address, freeze, page);
94fcc585f   Andrea Arcangeli   thp: avoid breaki...
2188
  }
e1b9996b8   Kirill A. Shutemov   thp: vma_adjust_t...
2189
  void vma_adjust_trans_huge(struct vm_area_struct *vma,
94fcc585f   Andrea Arcangeli   thp: avoid breaki...
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
  			     unsigned long start,
  			     unsigned long end,
  			     long adjust_next)
  {
  	/*
  	 * If the new start address isn't hpage aligned and it could
  	 * previously contain a hugepage: check if we need to split
  	 * a huge pmd.
  	 */
  	if (start & ~HPAGE_PMD_MASK &&
  	    (start & HPAGE_PMD_MASK) >= vma->vm_start &&
  	    (start & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= vma->vm_end)
fec89c109   Kirill A. Shutemov   thp: rewrite free...
2202
  		split_huge_pmd_address(vma, start, false, NULL);
94fcc585f   Andrea Arcangeli   thp: avoid breaki...
2203
2204
2205
2206
2207
2208
2209
2210
2211
  
  	/*
  	 * If the new end address isn't hpage aligned and it could
  	 * previously contain a hugepage: check if we need to split
  	 * a huge pmd.
  	 */
  	if (end & ~HPAGE_PMD_MASK &&
  	    (end & HPAGE_PMD_MASK) >= vma->vm_start &&
  	    (end & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= vma->vm_end)
fec89c109   Kirill A. Shutemov   thp: rewrite free...
2212
  		split_huge_pmd_address(vma, end, false, NULL);
94fcc585f   Andrea Arcangeli   thp: avoid breaki...
2213
2214
2215
  
  	/*
  	 * If we're also updating vma->vm_next->vm_start, and the new
f9d86a605   Wei Yang   mm/mmap: leave ad...
2216
  	 * vm_next->vm_start isn't hpage aligned and it could previously
94fcc585f   Andrea Arcangeli   thp: avoid breaki...
2217
2218
2219
2220
2221
  	 * contain a hugepage: check if we need to split a huge pmd.
  	 */
  	if (adjust_next > 0) {
  		struct vm_area_struct *next = vma->vm_next;
  		unsigned long nstart = next->vm_start;
f9d86a605   Wei Yang   mm/mmap: leave ad...
2222
  		nstart += adjust_next;
94fcc585f   Andrea Arcangeli   thp: avoid breaki...
2223
2224
2225
  		if (nstart & ~HPAGE_PMD_MASK &&
  		    (nstart & HPAGE_PMD_MASK) >= next->vm_start &&
  		    (nstart & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= next->vm_end)
fec89c109   Kirill A. Shutemov   thp: rewrite free...
2226
  			split_huge_pmd_address(next, nstart, false, NULL);
94fcc585f   Andrea Arcangeli   thp: avoid breaki...
2227
2228
  	}
  }
e9b61f198   Kirill A. Shutemov   thp: reintroduce ...
2229

906f9cdfc   Hugh Dickins   mm/huge_memory: r...
2230
  static void unmap_page(struct page *page)
e9b61f198   Kirill A. Shutemov   thp: reintroduce ...
2231
  {
dd156e3fc   Shakeel Butt   mm/rmap: always d...
2232
  	enum ttu_flags ttu_flags = TTU_IGNORE_MLOCK |
c7ab0d2fd   Kirill A. Shutemov   mm: convert try_t...
2233
  		TTU_RMAP_LOCKED | TTU_SPLIT_HUGE_PMD;
666e5a406   Minchan Kim   mm: make ttu's re...
2234
  	bool unmap_success;
e9b61f198   Kirill A. Shutemov   thp: reintroduce ...
2235
2236
  
  	VM_BUG_ON_PAGE(!PageHead(page), page);
baa355fd3   Kirill A. Shutemov   thp: file pages s...
2237
  	if (PageAnon(page))
b5ff8161e   Naoya Horiguchi   mm: thp: introduc...
2238
  		ttu_flags |= TTU_SPLIT_FREEZE;
baa355fd3   Kirill A. Shutemov   thp: file pages s...
2239

666e5a406   Minchan Kim   mm: make ttu's re...
2240
2241
  	unmap_success = try_to_unmap(page, ttu_flags);
  	VM_BUG_ON_PAGE(!unmap_success, page);
e9b61f198   Kirill A. Shutemov   thp: reintroduce ...
2242
  }
8cce54756   Kirill A. Shutemov   mm/huge_memory: f...
2243
  static void remap_page(struct page *page, unsigned int nr)
e9b61f198   Kirill A. Shutemov   thp: reintroduce ...
2244
  {
fec89c109   Kirill A. Shutemov   thp: rewrite free...
2245
  	int i;
ace71a19c   Kirill A. Shutemov   mm: introduce pag...
2246
2247
2248
  	if (PageTransHuge(page)) {
  		remove_migration_ptes(page, page, true);
  	} else {
8cce54756   Kirill A. Shutemov   mm/huge_memory: f...
2249
  		for (i = 0; i < nr; i++)
ace71a19c   Kirill A. Shutemov   mm: introduce pag...
2250
2251
  			remove_migration_ptes(page + i, page + i, true);
  	}
e9b61f198   Kirill A. Shutemov   thp: reintroduce ...
2252
  }
8df651c70   Kirill A. Shutemov   thp: cleanup spli...
2253
  static void __split_huge_page_tail(struct page *head, int tail,
e9b61f198   Kirill A. Shutemov   thp: reintroduce ...
2254
2255
  		struct lruvec *lruvec, struct list_head *list)
  {
e9b61f198   Kirill A. Shutemov   thp: reintroduce ...
2256
  	struct page *page_tail = head + tail;
8df651c70   Kirill A. Shutemov   thp: cleanup spli...
2257
  	VM_BUG_ON_PAGE(atomic_read(&page_tail->_mapcount) != -1, page_tail);
e9b61f198   Kirill A. Shutemov   thp: reintroduce ...
2258
2259
  
  	/*
605ca5ede   Konstantin Khlebnikov   mm/huge_memory.c:...
2260
2261
2262
2263
  	 * Clone page flags before unfreezing refcount.
  	 *
  	 * A successful get_page_unless_zero() might be followed by a flags
  	 * change, for example lock_page(), which sets PG_waiters.
e9b61f198   Kirill A. Shutemov   thp: reintroduce ...
2264
  	 */
e9b61f198   Kirill A. Shutemov   thp: reintroduce ...
2265
2266
2267
2268
  	page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
  	page_tail->flags |= (head->flags &
  			((1L << PG_referenced) |
  			 (1L << PG_swapbacked) |
38d8b4e6b   Huang Ying   mm, THP, swap: de...
2269
  			 (1L << PG_swapcache) |
e9b61f198   Kirill A. Shutemov   thp: reintroduce ...
2270
2271
2272
  			 (1L << PG_mlocked) |
  			 (1L << PG_uptodate) |
  			 (1L << PG_active) |
1899ad18c   Johannes Weiner   mm: workingset: t...
2273
  			 (1L << PG_workingset) |
e9b61f198   Kirill A. Shutemov   thp: reintroduce ...
2274
  			 (1L << PG_locked) |
b8d3c4c30   Minchan Kim   mm/huge_memory.c:...
2275
  			 (1L << PG_unevictable) |
72e6afa08   Catalin Marinas   mm: Preserve the ...
2276
2277
2278
  #ifdef CONFIG_64BIT
  			 (1L << PG_arch_2) |
  #endif
b8d3c4c30   Minchan Kim   mm/huge_memory.c:...
2279
  			 (1L << PG_dirty)));
e9b61f198   Kirill A. Shutemov   thp: reintroduce ...
2280

173d9d9fd   Hugh Dickins   mm/huge_memory: s...
2281
2282
2283
2284
2285
  	/* ->mapping in first tail page is compound_mapcount */
  	VM_BUG_ON_PAGE(tail > 2 && page_tail->mapping != TAIL_MAPPING,
  			page_tail);
  	page_tail->mapping = head->mapping;
  	page_tail->index = head->index + tail;
605ca5ede   Konstantin Khlebnikov   mm/huge_memory.c:...
2286
  	/* Page flags must be visible before we make the page non-compound. */
e9b61f198   Kirill A. Shutemov   thp: reintroduce ...
2287
  	smp_wmb();
605ca5ede   Konstantin Khlebnikov   mm/huge_memory.c:...
2288
2289
2290
2291
2292
2293
  	/*
  	 * Clear PageTail before unfreezing page refcount.
  	 *
  	 * A successful get_page_unless_zero() might be followed by put_page(),
  	 * which needs a correct compound_head().
  	 */
e9b61f198   Kirill A. Shutemov   thp: reintroduce ...
2294
  	clear_compound_head(page_tail);
605ca5ede   Konstantin Khlebnikov   mm/huge_memory.c:...
2295
2296
2297
  	/* Finally unfreeze refcount. Additional reference from page cache. */
  	page_ref_unfreeze(page_tail, 1 + (!PageAnon(head) ||
  					  PageSwapCache(head)));
e9b61f198   Kirill A. Shutemov   thp: reintroduce ...
2298
2299
2300
2301
  	if (page_is_young(head))
  		set_page_young(page_tail);
  	if (page_is_idle(head))
  		set_page_idle(page_tail);
e9b61f198   Kirill A. Shutemov   thp: reintroduce ...
2302
  	page_cpupid_xchg_last(page_tail, page_cpupid_last(head));
94723aafb   Michal Hocko   mm: unclutter THP...
2303
2304
2305
2306
2307
2308
  
  	/*
  	 * always add to the tail because some iterators expect new
  	 * pages to show after the currently processed elements - e.g.
  	 * migrate_pages
  	 */
e9b61f198   Kirill A. Shutemov   thp: reintroduce ...
2309
  	lru_add_page_tail(head, page_tail, lruvec, list);
e9b61f198   Kirill A. Shutemov   thp: reintroduce ...
2310
  }
baa355fd3   Kirill A. Shutemov   thp: file pages s...
2311
  static void __split_huge_page(struct page *page, struct list_head *list,
006d3ff27   Hugh Dickins   mm/huge_memory: f...
2312
  		pgoff_t end, unsigned long flags)
e9b61f198   Kirill A. Shutemov   thp: reintroduce ...
2313
2314
  {
  	struct page *head = compound_head(page);
f4b7e272b   Andrey Ryabinin   mm: remove zone_l...
2315
  	pg_data_t *pgdat = page_pgdat(head);
e9b61f198   Kirill A. Shutemov   thp: reintroduce ...
2316
  	struct lruvec *lruvec;
4101196b1   Matthew Wilcox (Oracle)   mm: page cache: s...
2317
2318
  	struct address_space *swap_cache = NULL;
  	unsigned long offset = 0;
8cce54756   Kirill A. Shutemov   mm/huge_memory: f...
2319
  	unsigned int nr = thp_nr_pages(head);
8df651c70   Kirill A. Shutemov   thp: cleanup spli...
2320
  	int i;
e9b61f198   Kirill A. Shutemov   thp: reintroduce ...
2321

f4b7e272b   Andrey Ryabinin   mm: remove zone_l...
2322
  	lruvec = mem_cgroup_page_lruvec(head, pgdat);
e9b61f198   Kirill A. Shutemov   thp: reintroduce ...
2323
2324
2325
  
  	/* complete memcg works before add pages to LRU */
  	mem_cgroup_split_huge_fixup(head);
4101196b1   Matthew Wilcox (Oracle)   mm: page cache: s...
2326
2327
2328
2329
2330
2331
2332
  	if (PageAnon(head) && PageSwapCache(head)) {
  		swp_entry_t entry = { .val = page_private(head) };
  
  		offset = swp_offset(entry);
  		swap_cache = swap_address_space(entry);
  		xa_lock(&swap_cache->i_pages);
  	}
8cce54756   Kirill A. Shutemov   mm/huge_memory: f...
2333
  	for (i = nr - 1; i >= 1; i--) {
8df651c70   Kirill A. Shutemov   thp: cleanup spli...
2334
  		__split_huge_page_tail(head, i, lruvec, list);
baa355fd3   Kirill A. Shutemov   thp: file pages s...
2335
2336
  		/* Some pages can be beyond i_size: drop them from page cache */
  		if (head[i].index >= end) {
2d077d4b5   Hugh Dickins   mm/huge_memory.c:...
2337
  			ClearPageDirty(head + i);
baa355fd3   Kirill A. Shutemov   thp: file pages s...
2338
  			__delete_from_page_cache(head + i, NULL);
800d8c63b   Kirill A. Shutemov   shmem: add huge p...
2339
2340
  			if (IS_ENABLED(CONFIG_SHMEM) && PageSwapBacked(head))
  				shmem_uncharge(head->mapping->host, 1);
baa355fd3   Kirill A. Shutemov   thp: file pages s...
2341
  			put_page(head + i);
4101196b1   Matthew Wilcox (Oracle)   mm: page cache: s...
2342
2343
2344
2345
2346
2347
  		} else if (!PageAnon(page)) {
  			__xa_store(&head->mapping->i_pages, head[i].index,
  					head + i, 0);
  		} else if (swap_cache) {
  			__xa_store(&swap_cache->i_pages, offset + i,
  					head + i, 0);
baa355fd3   Kirill A. Shutemov   thp: file pages s...
2348
2349
  		}
  	}
e9b61f198   Kirill A. Shutemov   thp: reintroduce ...
2350
2351
  
  	ClearPageCompound(head);
f7da677bc   Vlastimil Babka   mm, page_owner: h...
2352

8cce54756   Kirill A. Shutemov   mm/huge_memory: f...
2353
  	split_page_owner(head, nr);
f7da677bc   Vlastimil Babka   mm, page_owner: h...
2354

baa355fd3   Kirill A. Shutemov   thp: file pages s...
2355
2356
  	/* See comment in __split_huge_page_tail() */
  	if (PageAnon(head)) {
aa5dc07f7   Matthew Wilcox   mm: Convert huge_...
2357
  		/* Additional pin to swap cache */
4101196b1   Matthew Wilcox (Oracle)   mm: page cache: s...
2358
  		if (PageSwapCache(head)) {
38d8b4e6b   Huang Ying   mm, THP, swap: de...
2359
  			page_ref_add(head, 2);
4101196b1   Matthew Wilcox (Oracle)   mm: page cache: s...
2360
2361
  			xa_unlock(&swap_cache->i_pages);
  		} else {
38d8b4e6b   Huang Ying   mm, THP, swap: de...
2362
  			page_ref_inc(head);
4101196b1   Matthew Wilcox (Oracle)   mm: page cache: s...
2363
  		}
baa355fd3   Kirill A. Shutemov   thp: file pages s...
2364
  	} else {
aa5dc07f7   Matthew Wilcox   mm: Convert huge_...
2365
  		/* Additional pin to page cache */
baa355fd3   Kirill A. Shutemov   thp: file pages s...
2366
  		page_ref_add(head, 2);
b93b01631   Matthew Wilcox   page cache: use x...
2367
  		xa_unlock(&head->mapping->i_pages);
baa355fd3   Kirill A. Shutemov   thp: file pages s...
2368
  	}
f4b7e272b   Andrey Ryabinin   mm: remove zone_l...
2369
  	spin_unlock_irqrestore(&pgdat->lru_lock, flags);
e9b61f198   Kirill A. Shutemov   thp: reintroduce ...
2370

8cce54756   Kirill A. Shutemov   mm/huge_memory: f...
2371
  	remap_page(head, nr);
e9b61f198   Kirill A. Shutemov   thp: reintroduce ...
2372

c4f9c701f   Huang Ying   mm: fix a race du...
2373
2374
2375
2376
2377
  	if (PageSwapCache(head)) {
  		swp_entry_t entry = { .val = page_private(head) };
  
  		split_swap_cluster(entry);
  	}
8cce54756   Kirill A. Shutemov   mm/huge_memory: f...
2378
  	for (i = 0; i < nr; i++) {
e9b61f198   Kirill A. Shutemov   thp: reintroduce ...
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
  		struct page *subpage = head + i;
  		if (subpage == page)
  			continue;
  		unlock_page(subpage);
  
  		/*
  		 * Subpages may be freed if there wasn't any mapping left,
  		 * e.g. if add_to_swap() is running on an lru page that
  		 * had its mapping zapped. And freeing these pages
  		 * requires taking the lru_lock so we do the put_page
  		 * of the tail pages after the split is complete.
  		 */
  		put_page(subpage);
  	}
  }
b20ce5e03   Kirill A. Shutemov   mm: prepare page_...
2394
2395
  int total_mapcount(struct page *page)
  {
86b562b62   Kirill A. Shutemov   mm/huge_memory: f...
2396
  	int i, compound, nr, ret;
b20ce5e03   Kirill A. Shutemov   mm: prepare page_...
2397
2398
2399
2400
2401
  
  	VM_BUG_ON_PAGE(PageTail(page), page);
  
  	if (likely(!PageCompound(page)))
  		return atomic_read(&page->_mapcount) + 1;
dd78fedde   Kirill A. Shutemov   rmap: support fil...
2402
  	compound = compound_mapcount(page);
86b562b62   Kirill A. Shutemov   mm/huge_memory: f...
2403
  	nr = compound_nr(page);
b20ce5e03   Kirill A. Shutemov   mm: prepare page_...
2404
  	if (PageHuge(page))
dd78fedde   Kirill A. Shutemov   rmap: support fil...
2405
2406
  		return compound;
  	ret = compound;
86b562b62   Kirill A. Shutemov   mm/huge_memory: f...
2407
  	for (i = 0; i < nr; i++)
b20ce5e03   Kirill A. Shutemov   mm: prepare page_...
2408
  		ret += atomic_read(&page[i]._mapcount) + 1;
dd78fedde   Kirill A. Shutemov   rmap: support fil...
2409
2410
  	/* File pages have compound_mapcount included in _mapcount */
  	if (!PageAnon(page))
86b562b62   Kirill A. Shutemov   mm/huge_memory: f...
2411
  		return ret - compound * nr;
b20ce5e03   Kirill A. Shutemov   mm: prepare page_...
2412
  	if (PageDoubleMap(page))
86b562b62   Kirill A. Shutemov   mm/huge_memory: f...
2413
  		ret -= nr;
b20ce5e03   Kirill A. Shutemov   mm: prepare page_...
2414
2415
  	return ret;
  }
e9b61f198   Kirill A. Shutemov   thp: reintroduce ...
2416
  /*
6d0a07edd   Andrea Arcangeli   mm: thp: calculat...
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
   * This calculates accurately how many mappings a transparent hugepage
   * has (unlike page_mapcount() which isn't fully accurate). This full
   * accuracy is primarily needed to know if copy-on-write faults can
   * reuse the page and change the mapping to read-write instead of
   * copying it. At the same time this returns the total_mapcount too.
   *
   * The function returns the highest mapcount any one of the subpages
   * has. If the return value is one, even if different processes are
   * mapping different subpages of the transparent hugepage, they can
   * all reuse it, because each process is reusing a different subpage.
   *
   * The total_mapcount is instead counting all virtual mappings of the
   * subpages. If the total_mapcount is equal to "one", it tells the
   * caller all mappings belong to the same "mm" and in turn the
   * anon_vma of the transparent hugepage can become the vma->anon_vma
   * local one as no other process may be mapping any of the subpages.
   *
   * It would be more accurate to replace page_mapcount() with
   * page_trans_huge_mapcount(), however we only use
   * page_trans_huge_mapcount() in the copy-on-write faults where we
   * need full accuracy to avoid breaking page pinning, because
   * page_trans_huge_mapcount() is slower than page_mapcount().
   */
  int page_trans_huge_mapcount(struct page *page, int *total_mapcount)
  {
  	int i, ret, _total_mapcount, mapcount;
  
  	/* hugetlbfs shouldn't call it */
  	VM_BUG_ON_PAGE(PageHuge(page), page);
  
  	if (likely(!PageTransCompound(page))) {
  		mapcount = atomic_read(&page->_mapcount) + 1;
  		if (total_mapcount)
  			*total_mapcount = mapcount;
  		return mapcount;
  	}
  
  	page = compound_head(page);
  
  	_total_mapcount = ret = 0;
65dfe3c3b   Matthew Wilcox (Oracle)   mm/huge_memory: f...
2457
  	for (i = 0; i < thp_nr_pages(page); i++) {
6d0a07edd   Andrea Arcangeli   mm: thp: calculat...
2458
2459
2460
2461
2462
2463
  		mapcount = atomic_read(&page[i]._mapcount) + 1;
  		ret = max(ret, mapcount);
  		_total_mapcount += mapcount;
  	}
  	if (PageDoubleMap(page)) {
  		ret -= 1;
65dfe3c3b   Matthew Wilcox (Oracle)   mm/huge_memory: f...
2464
  		_total_mapcount -= thp_nr_pages(page);
6d0a07edd   Andrea Arcangeli   mm: thp: calculat...
2465
2466
2467
2468
2469
2470
2471
2472
  	}
  	mapcount = compound_mapcount(page);
  	ret += mapcount;
  	_total_mapcount += mapcount;
  	if (total_mapcount)
  		*total_mapcount = _total_mapcount;
  	return ret;
  }
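
  /*
   * Illustrative sketch only, not part of this file: a hypothetical helper
   * showing the reuse decision described above. If no subpage is mapped more
   * than once, every mapper owns a distinct subpage and a copy-on-write fault
   * can reuse the THP in place instead of copying it.
   */
  static inline bool thp_cow_can_reuse_sketch(struct page *page)
  {
  	int total_mapcount;
  
  	return page_trans_huge_mapcount(page, &total_mapcount) == 1;
  }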
b8f593cd0   Huang Ying   mm, THP, swap: ch...
2473
2474
2475
2476
  /* Racy check whether the huge page can be split */
  bool can_split_huge_page(struct page *page, int *pextra_pins)
  {
  	int extra_pins;
aa5dc07f7   Matthew Wilcox   mm: Convert huge_...
2477
  	/* Additional pins from page cache */
b8f593cd0   Huang Ying   mm, THP, swap: ch...
2478
  	if (PageAnon(page))
e2333dad2   Matthew Wilcox (Oracle)   mm/huge_memory: f...
2479
  		extra_pins = PageSwapCache(page) ? thp_nr_pages(page) : 0;
b8f593cd0   Huang Ying   mm, THP, swap: ch...
2480
  	else
e2333dad2   Matthew Wilcox (Oracle)   mm/huge_memory: f...
2481
  		extra_pins = thp_nr_pages(page);
b8f593cd0   Huang Ying   mm, THP, swap: ch...
2482
2483
2484
2485
  	if (pextra_pins)
  		*pextra_pins = extra_pins;
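  	/*
  	 * The caller's own reference accounts for the "- 1" below: if the
  	 * remaining references are exactly the mappings plus the page cache
  	 * or swap cache pins, nothing else (e.g. GUP) holds the page and the
  	 * split may proceed. The check is racy by design.
  	 */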
  	return total_mapcount(page) == page_count(page) - extra_pins - 1;
  }
6d0a07edd   Andrea Arcangeli   mm: thp: calculat...
2486
  /*
e9b61f198   Kirill A. Shutemov   thp: reintroduce ...
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
   * This function splits huge page into normal pages. @page can point to any
   * subpage of huge page to split. Split doesn't change the position of @page.
   *
   * The caller must hold a pin on the @page; the split fails with -EBUSY
   * if anyone else holds an extra pin on it.
   * The huge page must be locked.
   *
   * If @list is null, tail pages will be added to LRU list, otherwise, to @list.
   *
   * Both head page and tail pages will inherit mapping, flags, and so on from
   * the hugepage.
   *
   * The GUP pin and PG_locked are transferred to @page. The rest of the
   * subpages can be freed if they are not mapped.
   *
   * Returns 0 if the hugepage is split successfully.
   * Returns -EBUSY if the page is pinned or if anon_vma disappeared from under
   * us.
   */
  int split_huge_page_to_list(struct page *page, struct list_head *list)
  {
  	struct page *head = compound_head(page);
a3d0a9185   Kirill A. Shutemov   thp: make split_q...
2508
  	struct pglist_data *pgdata = NODE_DATA(page_to_nid(head));
a8803e6c1   Wei Yang   mm/huge_memory.c:...
2509
  	struct deferred_split *ds_queue = get_deferred_split_queue(head);
baa355fd3   Kirill A. Shutemov   thp: file pages s...
2510
2511
2512
  	struct anon_vma *anon_vma = NULL;
  	struct address_space *mapping = NULL;
  	int count, mapcount, extra_pins, ret;
0b9b6fff7   Kirill A. Shutemov   thp: fix interrup...
2513
  	unsigned long flags;
006d3ff27   Hugh Dickins   mm/huge_memory: f...
2514
  	pgoff_t end;
e9b61f198   Kirill A. Shutemov   thp: reintroduce ...
2515

cb8296248   Wei Yang   mm/huge_memory.c:...
2516
  	VM_BUG_ON_PAGE(is_huge_zero_page(head), head);
a8803e6c1   Wei Yang   mm/huge_memory.c:...
2517
2518
  	VM_BUG_ON_PAGE(!PageLocked(head), head);
  	VM_BUG_ON_PAGE(!PageCompound(head), head);
e9b61f198   Kirill A. Shutemov   thp: reintroduce ...
2519

a8803e6c1   Wei Yang   mm/huge_memory.c:...
2520
  	if (PageWriteback(head))
59807685a   Huang Ying   mm, THP, swap: su...
2521
  		return -EBUSY;
baa355fd3   Kirill A. Shutemov   thp: file pages s...
2522
2523
  	if (PageAnon(head)) {
  		/*
c1e8d7c6a   Michel Lespinasse   mmap locking API:...
2524
  		 * The caller does not necessarily hold an mmap_lock that would
baa355fd3   Kirill A. Shutemov   thp: file pages s...
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
  		 * prevent the anon_vma disappearing so we first take a
  		 * reference to it and then lock the anon_vma for write. This
  		 * is similar to page_lock_anon_vma_read except the write lock
  		 * is taken to serialise against parallel split or collapse
  		 * operations.
  		 */
  		anon_vma = page_get_anon_vma(head);
  		if (!anon_vma) {
  			ret = -EBUSY;
  			goto out;
  		}
006d3ff27   Hugh Dickins   mm/huge_memory: f...
2536
  		end = -1;
baa355fd3   Kirill A. Shutemov   thp: file pages s...
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
  		mapping = NULL;
  		anon_vma_lock_write(anon_vma);
  	} else {
  		mapping = head->mapping;
  
  		/* Truncated ? */
  		if (!mapping) {
  			ret = -EBUSY;
  			goto out;
  		}
baa355fd3   Kirill A. Shutemov   thp: file pages s...
2547
2548
  		anon_vma = NULL;
  		i_mmap_lock_read(mapping);
006d3ff27   Hugh Dickins   mm/huge_memory: f...
2549
2550
2551
2552
2553
2554
2555
2556
2557
  
  		/*
  		 * __split_huge_page() may need to trim off pages beyond EOF:
  		 * but on 32-bit, i_size_read() takes an irq-unsafe seqlock,
  		 * which cannot be nested inside the page tree lock. So note
  		 * end now: i_size itself may be changed at any moment, but
  		 * head page lock is good enough to serialize the trimming.
  		 */
  		end = DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE);
e9b61f198   Kirill A. Shutemov   thp: reintroduce ...
2558
  	}
e9b61f198   Kirill A. Shutemov   thp: reintroduce ...
2559
2560
  
  	/*
906f9cdfc   Hugh Dickins   mm/huge_memory: r...
2561
  	 * Racy check if we can split the page, before unmap_page() will
e9b61f198   Kirill A. Shutemov   thp: reintroduce ...
2562
2563
  	 * split PMDs
  	 */
b8f593cd0   Huang Ying   mm, THP, swap: ch...
2564
  	if (!can_split_huge_page(head, &extra_pins)) {
e9b61f198   Kirill A. Shutemov   thp: reintroduce ...
2565
2566
2567
  		ret = -EBUSY;
  		goto out_unlock;
  	}
906f9cdfc   Hugh Dickins   mm/huge_memory: r...
2568
  	unmap_page(head);
e9b61f198   Kirill A. Shutemov   thp: reintroduce ...
2569
  	VM_BUG_ON_PAGE(compound_mapcount(head), head);
baa355fd3   Kirill A. Shutemov   thp: file pages s...
2570
  	/* prevent PageLRU from going away from under us, and freeze lru stats */
f4b7e272b   Andrey Ryabinin   mm: remove zone_l...
2571
  	spin_lock_irqsave(&pgdata->lru_lock, flags);
baa355fd3   Kirill A. Shutemov   thp: file pages s...
2572
2573
  
  	if (mapping) {
aa5dc07f7   Matthew Wilcox   mm: Convert huge_...
2574
  		XA_STATE(xas, &mapping->i_pages, page_index(head));
baa355fd3   Kirill A. Shutemov   thp: file pages s...
2575

baa355fd3   Kirill A. Shutemov   thp: file pages s...
2576
  		/*
aa5dc07f7   Matthew Wilcox   mm: Convert huge_...
2577
  		 * Check if the head page is present in page cache.
baa355fd3   Kirill A. Shutemov   thp: file pages s...
2578
2579
  		 * We assume all tail pages are present too, if the head is there.
  		 */
aa5dc07f7   Matthew Wilcox   mm: Convert huge_...
2580
2581
  		xa_lock(&mapping->i_pages);
  		if (xas_load(&xas) != head)
baa355fd3   Kirill A. Shutemov   thp: file pages s...
2582
2583
  			goto fail;
  	}
0139aa7b7   Joonsoo Kim   mm: rename _count...
2584
  	/* Prevent deferred_split_scan() touching ->_refcount */
364c1eebe   Yang Shi   mm: thp: extract ...
2585
  	spin_lock(&ds_queue->split_queue_lock);
e9b61f198   Kirill A. Shutemov   thp: reintroduce ...
2586
2587
  	count = page_count(head);
  	mapcount = total_mapcount(head);
baa355fd3   Kirill A. Shutemov   thp: file pages s...
2588
  	if (!mapcount && page_ref_freeze(head, 1 + extra_pins)) {
9a982250f   Kirill A. Shutemov   thp: introduce de...
2589
  		if (!list_empty(page_deferred_list(head))) {
364c1eebe   Yang Shi   mm: thp: extract ...
2590
  			ds_queue->split_queue_len--;
9a982250f   Kirill A. Shutemov   thp: introduce de...
2591
2592
  			list_del(page_deferred_list(head));
  		}
afb971729   Wei Yang   mm/huge_memory.c:...
2593
  		spin_unlock(&ds_queue->split_queue_lock);
06d3eff62   Kirill A. Shutemov   mm/thp: fix node ...
2594
  		if (mapping) {
a8803e6c1   Wei Yang   mm/huge_memory.c:...
2595
2596
  			if (PageSwapBacked(head))
  				__dec_node_page_state(head, NR_SHMEM_THPS);
06d3eff62   Kirill A. Shutemov   mm/thp: fix node ...
2597
  			else
a8803e6c1   Wei Yang   mm/huge_memory.c:...
2598
  				__dec_node_page_state(head, NR_FILE_THPS);
06d3eff62   Kirill A. Shutemov   mm/thp: fix node ...
2599
  		}
006d3ff27   Hugh Dickins   mm/huge_memory: f...
2600
  		__split_huge_page(page, list, end, flags);
c4f9c701f   Huang Ying   mm: fix a race du...
2601
  		ret = 0;
e9b61f198   Kirill A. Shutemov   thp: reintroduce ...
2602
  	} else {
baa355fd3   Kirill A. Shutemov   thp: file pages s...
2603
2604
2605
2606
2607
2608
2609
2610
2611
  		if (IS_ENABLED(CONFIG_DEBUG_VM) && mapcount) {
  			pr_alert("total_mapcount: %u, page_count(): %u\n",
  					mapcount, count);
  			if (PageTail(page))
  				dump_page(head, NULL);
  			dump_page(page, "total_mapcount(head) > 0");
  			BUG();
  		}
364c1eebe   Yang Shi   mm: thp: extract ...
2612
  		spin_unlock(&ds_queue->split_queue_lock);
baa355fd3   Kirill A. Shutemov   thp: file pages s...
2613
  fail:		if (mapping)
b93b01631   Matthew Wilcox   page cache: use x...
2614
  			xa_unlock(&mapping->i_pages);
f4b7e272b   Andrey Ryabinin   mm: remove zone_l...
2615
  		spin_unlock_irqrestore(&pgdata->lru_lock, flags);
8cce54756   Kirill A. Shutemov   mm/huge_memory: f...
2616
  		remap_page(head, thp_nr_pages(head));
e9b61f198   Kirill A. Shutemov   thp: reintroduce ...
2617
2618
2619
2620
  		ret = -EBUSY;
  	}
  
  out_unlock:
baa355fd3   Kirill A. Shutemov   thp: file pages s...
2621
2622
2623
2624
2625
2626
  	if (anon_vma) {
  		anon_vma_unlock_write(anon_vma);
  		put_anon_vma(anon_vma);
  	}
  	if (mapping)
  		i_mmap_unlock_read(mapping);
e9b61f198   Kirill A. Shutemov   thp: reintroduce ...
2627
2628
2629
2630
  out:
  	count_vm_event(!ret ? THP_SPLIT_PAGE : THP_SPLIT_PAGE_FAILED);
  	return ret;
  }
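
  /*
   * Illustrative sketch only, not part of this file: a hypothetical caller
   * following the contract documented above split_huge_page_to_list(). It
   * assumes @head is the head page of a THP: take a reference, lock it, try
   * the split, then drop both. With a NULL list the tail pages go back to
   * the LRU.
   */
  static int try_split_thp_sketch(struct page *head)
  {
  	int ret = -EBUSY;
  
  	if (!get_page_unless_zero(head))
  		return ret;
  	lock_page(head);
  	if (PageTransHuge(head))
  		ret = split_huge_page(head);
  	unlock_page(head);
  	put_page(head);
  
  	return ret;
  }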
9a982250f   Kirill A. Shutemov   thp: introduce de...
2631
2632
2633
  
  void free_transhuge_page(struct page *page)
  {
87eaceb3f   Yang Shi   mm: thp: make def...
2634
  	struct deferred_split *ds_queue = get_deferred_split_queue(page);
9a982250f   Kirill A. Shutemov   thp: introduce de...
2635
  	unsigned long flags;
364c1eebe   Yang Shi   mm: thp: extract ...
2636
  	spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
9a982250f   Kirill A. Shutemov   thp: introduce de...
2637
  	if (!list_empty(page_deferred_list(page))) {
364c1eebe   Yang Shi   mm: thp: extract ...
2638
  		ds_queue->split_queue_len--;
9a982250f   Kirill A. Shutemov   thp: introduce de...
2639
2640
  		list_del(page_deferred_list(page));
  	}
364c1eebe   Yang Shi   mm: thp: extract ...
2641
  	spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
9a982250f   Kirill A. Shutemov   thp: introduce de...
2642
2643
2644
2645
2646
  	free_compound_page(page);
  }
  
  void deferred_split_huge_page(struct page *page)
  {
87eaceb3f   Yang Shi   mm: thp: make def...
2647
2648
2649
2650
  	struct deferred_split *ds_queue = get_deferred_split_queue(page);
  #ifdef CONFIG_MEMCG
  	struct mem_cgroup *memcg = compound_head(page)->mem_cgroup;
  #endif
9a982250f   Kirill A. Shutemov   thp: introduce de...
2651
2652
2653
  	unsigned long flags;
  
  	VM_BUG_ON_PAGE(!PageTransHuge(page), page);
87eaceb3f   Yang Shi   mm: thp: make def...
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
  	/*
  	 * The try_to_unmap() in the page reclaim path might reach here too;
  	 * this may cause a race condition that corrupts the deferred split
  	 * queue. And, if page reclaim is already handling the same page, it
  	 * is unnecessary to handle it again in the shrinker.
  	 *
  	 * Check PageSwapCache to determine if the page is being
  	 * handled by page reclaim since THP swap would add the page into
  	 * swap cache before calling try_to_unmap().
  	 */
  	if (PageSwapCache(page))
  		return;
364c1eebe   Yang Shi   mm: thp: extract ...
2666
  	spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
9a982250f   Kirill A. Shutemov   thp: introduce de...
2667
  	if (list_empty(page_deferred_list(page))) {
f9719a03d   Kirill A. Shutemov   thp, vmstats: cou...
2668
  		count_vm_event(THP_DEFERRED_SPLIT_PAGE);
364c1eebe   Yang Shi   mm: thp: extract ...
2669
2670
  		list_add_tail(page_deferred_list(page), &ds_queue->split_queue);
  		ds_queue->split_queue_len++;
87eaceb3f   Yang Shi   mm: thp: make def...
2671
2672
2673
2674
2675
  #ifdef CONFIG_MEMCG
  		if (memcg)
  			memcg_set_shrinker_bit(memcg, page_to_nid(page),
  					       deferred_split_shrinker.id);
  #endif
9a982250f   Kirill A. Shutemov   thp: introduce de...
2676
  	}
364c1eebe   Yang Shi   mm: thp: extract ...
2677
  	spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
9a982250f   Kirill A. Shutemov   thp: introduce de...
2678
2679
2680
2681
2682
  }
  
  static unsigned long deferred_split_count(struct shrinker *shrink,
  		struct shrink_control *sc)
  {
a3d0a9185   Kirill A. Shutemov   thp: make split_q...
2683
  	struct pglist_data *pgdata = NODE_DATA(sc->nid);
364c1eebe   Yang Shi   mm: thp: extract ...
2684
  	struct deferred_split *ds_queue = &pgdata->deferred_split_queue;
87eaceb3f   Yang Shi   mm: thp: make def...
2685
2686
2687
2688
2689
  
  #ifdef CONFIG_MEMCG
  	if (sc->memcg)
  		ds_queue = &sc->memcg->deferred_split_queue;
  #endif
364c1eebe   Yang Shi   mm: thp: extract ...
2690
  	return READ_ONCE(ds_queue->split_queue_len);
9a982250f   Kirill A. Shutemov   thp: introduce de...
2691
2692
2693
2694
2695
  }
  
  static unsigned long deferred_split_scan(struct shrinker *shrink,
  		struct shrink_control *sc)
  {
a3d0a9185   Kirill A. Shutemov   thp: make split_q...
2696
  	struct pglist_data *pgdata = NODE_DATA(sc->nid);
364c1eebe   Yang Shi   mm: thp: extract ...
2697
  	struct deferred_split *ds_queue = &pgdata->deferred_split_queue;
9a982250f   Kirill A. Shutemov   thp: introduce de...
2698
2699
2700
2701
  	unsigned long flags;
  	LIST_HEAD(list), *pos, *next;
  	struct page *page;
  	int split = 0;
87eaceb3f   Yang Shi   mm: thp: make def...
2702
2703
2704
2705
  #ifdef CONFIG_MEMCG
  	if (sc->memcg)
  		ds_queue = &sc->memcg->deferred_split_queue;
  #endif
364c1eebe   Yang Shi   mm: thp: extract ...
2706
  	spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
9a982250f   Kirill A. Shutemov   thp: introduce de...
2707
  	/* Take pin on all head pages to avoid freeing them under us */
364c1eebe   Yang Shi   mm: thp: extract ...
2708
  	list_for_each_safe(pos, next, &ds_queue->split_queue) {
9a982250f   Kirill A. Shutemov   thp: introduce de...
2709
2710
  		page = list_entry((void *)pos, struct page, mapping);
  		page = compound_head(page);
e3ae19535   Kirill A. Shutemov   thp: limit number...
2711
2712
2713
2714
  		if (get_page_unless_zero(page)) {
  			list_move(page_deferred_list(page), &list);
  		} else {
  			/* We lost race with put_compound_page() */
9a982250f   Kirill A. Shutemov   thp: introduce de...
2715
  			list_del_init(page_deferred_list(page));
364c1eebe   Yang Shi   mm: thp: extract ...
2716
  			ds_queue->split_queue_len--;
9a982250f   Kirill A. Shutemov   thp: introduce de...
2717
  		}
e3ae19535   Kirill A. Shutemov   thp: limit number...
2718
2719
  		if (!--sc->nr_to_scan)
  			break;
9a982250f   Kirill A. Shutemov   thp: introduce de...
2720
  	}
364c1eebe   Yang Shi   mm: thp: extract ...
2721
  	spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
9a982250f   Kirill A. Shutemov   thp: introduce de...
2722
2723
2724
  
  	list_for_each_safe(pos, next, &list) {
  		page = list_entry((void *)pos, struct page, mapping);
fa41b900c   Kirill A. Shutemov   mm/thp: do not wa...
2725
2726
  		if (!trylock_page(page))
  			goto next;
9a982250f   Kirill A. Shutemov   thp: introduce de...
2727
2728
2729
2730
  		/* split_huge_page() removes page from list on success */
  		if (!split_huge_page(page))
  			split++;
  		unlock_page(page);
fa41b900c   Kirill A. Shutemov   mm/thp: do not wa...
2731
  next:
9a982250f   Kirill A. Shutemov   thp: introduce de...
2732
2733
  		put_page(page);
  	}
364c1eebe   Yang Shi   mm: thp: extract ...
2734
2735
2736
  	spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
  	list_splice_tail(&list, &ds_queue->split_queue);
  	spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
9a982250f   Kirill A. Shutemov   thp: introduce de...
2737

cb8d68ec1   Kirill A. Shutemov   thp: change defer...
2738
2739
2740
2741
  	/*
  	 * Stop the shrinker if we didn't split any page and the queue is empty.
  	 * This can happen if pages were freed under us.
  	 */
364c1eebe   Yang Shi   mm: thp: extract ...
2742
  	if (!split && list_empty(&ds_queue->split_queue))
cb8d68ec1   Kirill A. Shutemov   thp: change defer...
2743
2744
  		return SHRINK_STOP;
  	return split;
9a982250f   Kirill A. Shutemov   thp: introduce de...
2745
2746
2747
2748
2749
2750
  }
  
  static struct shrinker deferred_split_shrinker = {
  	.count_objects = deferred_split_count,
  	.scan_objects = deferred_split_scan,
  	.seeks = DEFAULT_SEEKS,
87eaceb3f   Yang Shi   mm: thp: make def...
2751
2752
  	.flags = SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE |
  		 SHRINKER_NONSLAB,
9a982250f   Kirill A. Shutemov   thp: introduce de...
2753
  };
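
  /*
   * The shrinker core invokes the ->count_objects()/->scan_objects()
   * callbacks above under memory pressure, per NUMA node and (per the
   * SHRINKER_MEMCG_AWARE flag) per memcg, so partially unmapped THPs queued
   * by deferred_split_huge_page() are eventually split and their unused
   * subpages freed.
   */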
49071d436   Kirill A. Shutemov   thp: add debugfs ...
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
  
  #ifdef CONFIG_DEBUG_FS
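  /*
   * Debugfs knob: writing "1" to <debugfs>/split_huge_pages walks every
   * populated zone and tries to split each THP found on the LRU; the
   * outcome is reported via the pr_info() at the end.
   */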
  static int split_huge_pages_set(void *data, u64 val)
  {
  	struct zone *zone;
  	struct page *page;
  	unsigned long pfn, max_zone_pfn;
  	unsigned long total = 0, split = 0;
  
  	if (val != 1)
  		return -EINVAL;
  
  	for_each_populated_zone(zone) {
  		max_zone_pfn = zone_end_pfn(zone);
  		for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) {
  			if (!pfn_valid(pfn))
  				continue;
  
  			page = pfn_to_page(pfn);
  			if (!get_page_unless_zero(page))
  				continue;
  
  			if (zone != page_zone(page))
  				goto next;
baa355fd3   Kirill A. Shutemov   thp: file pages s...
2778
  			if (!PageHead(page) || PageHuge(page) || !PageLRU(page))
49071d436   Kirill A. Shutemov   thp: add debugfs ...
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
  				goto next;
  
  			total++;
  			lock_page(page);
  			if (!split_huge_page(page))
  				split++;
  			unlock_page(page);
  next:
  			put_page(page);
  		}
  	}
145bdaa15   Yang Shi   mm: thp: correct ...
2790
2791
  	pr_info("%lu of %lu THP split\n", split, total);
49071d436   Kirill A. Shutemov   thp: add debugfs ...
2792
2793
2794
  
  	return 0;
  }
f1287869e   zhong jiang   mm/huge_memory.c:...
2795
  DEFINE_DEBUGFS_ATTRIBUTE(split_huge_pages_fops, NULL, split_huge_pages_set,
49071d436   Kirill A. Shutemov   thp: add debugfs ...
2796
2797
2798
2799
2800
  		"%llu\n");
  
  static int __init split_huge_pages_debugfs(void)
  {
d9f7979c9   Greg Kroah-Hartman   mm: no need to ch...
2801
2802
  	debugfs_create_file("split_huge_pages", 0200, NULL, NULL,
  			    &split_huge_pages_fops);
49071d436   Kirill A. Shutemov   thp: add debugfs ...
2803
2804
2805
2806
  	return 0;
  }
  late_initcall(split_huge_pages_debugfs);
  #endif
616b83715   Zi Yan   mm: thp: enable t...
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
  
  #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
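  /*
   * THP migration support: set_pmd_migration_entry() replaces a mapped huge
   * pmd with a pmd-level migration entry, and remove_migration_pmd() installs
   * a huge pmd pointing at the new page once the contents have been copied.
   */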
  void set_pmd_migration_entry(struct page_vma_mapped_walk *pvmw,
  		struct page *page)
  {
  	struct vm_area_struct *vma = pvmw->vma;
  	struct mm_struct *mm = vma->vm_mm;
  	unsigned long address = pvmw->address;
  	pmd_t pmdval;
  	swp_entry_t entry;
ab6e3d093   Naoya Horiguchi   mm: soft-dirty: k...
2817
  	pmd_t pmdswp;
616b83715   Zi Yan   mm: thp: enable t...
2818
2819
2820
  
  	if (!(pvmw->pmd && !pvmw->pte))
  		return;
616b83715   Zi Yan   mm: thp: enable t...
2821
  	flush_cache_range(vma, address, address + HPAGE_PMD_SIZE);
8a8683ad9   Huang Ying   mm: fix possible ...
2822
  	pmdval = pmdp_invalidate(vma, address, pvmw->pmd);
616b83715   Zi Yan   mm: thp: enable t...
2823
2824
2825
  	if (pmd_dirty(pmdval))
  		set_page_dirty(page);
  	entry = make_migration_entry(page, pmd_write(pmdval));
ab6e3d093   Naoya Horiguchi   mm: soft-dirty: k...
2826
2827
2828
2829
  	pmdswp = swp_entry_to_pmd(entry);
  	if (pmd_soft_dirty(pmdval))
  		pmdswp = pmd_swp_mksoft_dirty(pmdswp);
  	set_pmd_at(mm, address, pvmw->pmd, pmdswp);
616b83715   Zi Yan   mm: thp: enable t...
2830
2831
  	page_remove_rmap(page, true);
  	put_page(page);
616b83715   Zi Yan   mm: thp: enable t...
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
  }
  
  void remove_migration_pmd(struct page_vma_mapped_walk *pvmw, struct page *new)
  {
  	struct vm_area_struct *vma = pvmw->vma;
  	struct mm_struct *mm = vma->vm_mm;
  	unsigned long address = pvmw->address;
  	unsigned long mmun_start = address & HPAGE_PMD_MASK;
  	pmd_t pmde;
  	swp_entry_t entry;
  
  	if (!(pvmw->pmd && !pvmw->pte))
  		return;
  
  	entry = pmd_to_swp_entry(*pvmw->pmd);
  	get_page(new);
  	pmde = pmd_mkold(mk_huge_pmd(new, vma->vm_page_prot));
ab6e3d093   Naoya Horiguchi   mm: soft-dirty: k...
2849
2850
  	if (pmd_swp_soft_dirty(*pvmw->pmd))
  		pmde = pmd_mksoft_dirty(pmde);
616b83715   Zi Yan   mm: thp: enable t...
2851
  	if (is_write_migration_entry(entry))
f55e1014f   Linus Torvalds   Revert "mm, thp: ...
2852
  		pmde = maybe_pmd_mkwrite(pmde, vma);
616b83715   Zi Yan   mm: thp: enable t...
2853
2854
  
  	flush_cache_range(vma, mmun_start, mmun_start + HPAGE_PMD_SIZE);
e71769ae5   Naoya Horiguchi   mm: enable thp mi...
2855
2856
2857
2858
  	if (PageAnon(new))
  		page_add_anon_rmap(new, vma, mmun_start, true);
  	else
  		page_add_file_rmap(new, true);
616b83715   Zi Yan   mm: thp: enable t...
2859
  	set_pmd_at(mm, mmun_start, pvmw->pmd, pmde);
e125fe405   Kirill A. Shutemov   mm, thp: fix mloc...
2860
  	if ((vma->vm_flags & VM_LOCKED) && !PageDoubleMap(new))
616b83715   Zi Yan   mm: thp: enable t...
2861
2862
2863
2864
  		mlock_vma_page(new);
  	update_mmu_cache_pmd(vma, address, pvmw->pmd);
  }
  #endif