mm/huge_memory.c
  /*
   *  Copyright (C) 2009  Red Hat, Inc.
   *
   *  This work is licensed under the terms of the GNU GPL, version 2. See
   *  the COPYING file in the top-level directory.
   */
  #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
  #include <linux/mm.h>
  #include <linux/sched.h>
  #include <linux/sched/coredump.h>
  #include <linux/sched/numa_balancing.h>
  #include <linux/highmem.h>
  #include <linux/hugetlb.h>
  #include <linux/mmu_notifier.h>
  #include <linux/rmap.h>
  #include <linux/swap.h>
  #include <linux/shrinker.h>
  #include <linux/mm_inline.h>
  #include <linux/swapops.h>
  #include <linux/dax.h>
  #include <linux/khugepaged.h>
  #include <linux/freezer.h>
  #include <linux/pfn_t.h>
  #include <linux/mman.h>
  #include <linux/memremap.h>
  #include <linux/pagemap.h>
  #include <linux/debugfs.h>
  #include <linux/migrate.h>
  #include <linux/hashtable.h>
  #include <linux/userfaultfd_k.h>
  #include <linux/page_idle.h>
  #include <linux/shmem_fs.h>
  #include <linux/oom.h>

  #include <asm/tlb.h>
  #include <asm/pgalloc.h>
  #include "internal.h"
  /*
   * By default, transparent hugepage support is disabled in order to avoid
   * risking an increase in the memory footprint of applications without a
   * guaranteed benefit. When transparent hugepage support is enabled, it is
   * enabled for all mappings, and khugepaged scans all mappings.
   * Defrag is invoked by khugepaged hugepage allocations and by page faults
   * for all hugepage allocations.
   */
  unsigned long transparent_hugepage_flags __read_mostly =
  #ifdef CONFIG_TRANSPARENT_HUGEPAGE_ALWAYS
  	(1<<TRANSPARENT_HUGEPAGE_FLAG)|
  #endif
  #ifdef CONFIG_TRANSPARENT_HUGEPAGE_MADVISE
  	(1<<TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG)|
  #endif
  	(1<<TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG)|
  	(1<<TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG)|
  	(1<<TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);

  static struct shrinker deferred_split_shrinker;

  static atomic_t huge_zero_refcount;
  struct page *huge_zero_page __read_mostly;

  static struct page *get_huge_zero_page(void)
  {
  	struct page *zero_page;
  retry:
  	if (likely(atomic_inc_not_zero(&huge_zero_refcount)))
  		return READ_ONCE(huge_zero_page);
  
  	zero_page = alloc_pages((GFP_TRANSHUGE | __GFP_ZERO) & ~__GFP_MOVABLE,
  			HPAGE_PMD_ORDER);
  	if (!zero_page) {
  		count_vm_event(THP_ZERO_PAGE_ALLOC_FAILED);
  		return NULL;
  	}
  	count_vm_event(THP_ZERO_PAGE_ALLOC);
  	preempt_disable();
  	if (cmpxchg(&huge_zero_page, NULL, zero_page)) {
  		preempt_enable();
  		__free_pages(zero_page, compound_order(zero_page));
  		goto retry;
  	}
  
  	/* We take additional reference here. It will be put back by shrinker */
  	atomic_set(&huge_zero_refcount, 2);
  	preempt_enable();
  	return READ_ONCE(huge_zero_page);
  }
  static void put_huge_zero_page(void)
  {
  	/*
  	 * Counter should never go to zero here. Only shrinker can put
  	 * last reference.
  	 */
  	BUG_ON(atomic_dec_and_test(&huge_zero_refcount));
  }
  struct page *mm_get_huge_zero_page(struct mm_struct *mm)
  {
  	if (test_bit(MMF_HUGE_ZERO_PAGE, &mm->flags))
  		return READ_ONCE(huge_zero_page);
  
  	if (!get_huge_zero_page())
  		return NULL;
  
  	if (test_and_set_bit(MMF_HUGE_ZERO_PAGE, &mm->flags))
  		put_huge_zero_page();
  
  	return READ_ONCE(huge_zero_page);
  }
  
  void mm_put_huge_zero_page(struct mm_struct *mm)
  {
  	if (test_bit(MMF_HUGE_ZERO_PAGE, &mm->flags))
  		put_huge_zero_page();
  }
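  
  /*
   * Illustrative usage sketch (not part of the original file): callers such
   * as the anonymous read-fault path later in this file take a per-mm
   * reference with
   *
   *	zero_page = mm_get_huge_zero_page(vma->vm_mm);
   *	if (!zero_page)
   *		... fall back to allocating a real huge page ...
   *
   * and the per-mm reference is dropped via mm_put_huge_zero_page() when the
   * mm is torn down, leaving the shrinker below to free the page itself.
   */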
  static unsigned long shrink_huge_zero_page_count(struct shrinker *shrink,
  					struct shrink_control *sc)
  {
  	/* we can free zero page only if last reference remains */
  	return atomic_read(&huge_zero_refcount) == 1 ? HPAGE_PMD_NR : 0;
  }

  static unsigned long shrink_huge_zero_page_scan(struct shrinker *shrink,
  				       struct shrink_control *sc)
  {
  	if (atomic_cmpxchg(&huge_zero_refcount, 1, 0) == 1) {
  		struct page *zero_page = xchg(&huge_zero_page, NULL);
  		BUG_ON(zero_page == NULL);
  		__free_pages(zero_page, compound_order(zero_page));
  		return HPAGE_PMD_NR;
  	}
  
  	return 0;
  }
  static struct shrinker huge_zero_page_shrinker = {
  	.count_objects = shrink_huge_zero_page_count,
  	.scan_objects = shrink_huge_zero_page_scan,
  	.seeks = DEFAULT_SEEKS,
  };
  #ifdef CONFIG_SYSFS
  static ssize_t enabled_show(struct kobject *kobj,
  			    struct kobj_attribute *attr, char *buf)
  {
  	if (test_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags))
  		return sprintf(buf, "[always] madvise never\n");
  	else if (test_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags))
  		return sprintf(buf, "always [madvise] never\n");
  	else
  		return sprintf(buf, "always madvise [never]\n");
  }

  static ssize_t enabled_store(struct kobject *kobj,
  			     struct kobj_attribute *attr,
  			     const char *buf, size_t count)
  {
  	ssize_t ret = count;

  	if (!memcmp("always", buf,
  		    min(sizeof("always")-1, count))) {
  		clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags);
  		set_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags);
  	} else if (!memcmp("madvise", buf,
  			   min(sizeof("madvise")-1, count))) {
  		clear_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags);
  		set_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags);
  	} else if (!memcmp("never", buf,
  			   min(sizeof("never")-1, count))) {
  		clear_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags);
  		clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags);
  	} else
  		ret = -EINVAL;
  
  	if (ret > 0) {
  		int err = start_stop_khugepaged();
  		if (err)
  			ret = err;
  	}
  	return ret;
  }
  static struct kobj_attribute enabled_attr =
  	__ATTR(enabled, 0644, enabled_show, enabled_store);
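  
  /*
   * Example (illustrative, assuming the usual sysfs mount point): the policy
   * above is switched at runtime with
   *
   *	echo madvise > /sys/kernel/mm/transparent_hugepage/enabled
   *
   * which flips the TRANSPARENT_HUGEPAGE_*_FLAG bits handled in
   * enabled_store() and starts or stops khugepaged as needed.
   */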
  ssize_t single_hugepage_flag_show(struct kobject *kobj,
  				struct kobj_attribute *attr, char *buf,
  				enum transparent_hugepage_flag flag)
  {
  	return sprintf(buf, "%d\n",
  		       !!test_bit(flag, &transparent_hugepage_flags));
  }

  ssize_t single_hugepage_flag_store(struct kobject *kobj,
  				 struct kobj_attribute *attr,
  				 const char *buf, size_t count,
  				 enum transparent_hugepage_flag flag)
  {
  	unsigned long value;
  	int ret;
  
  	ret = kstrtoul(buf, 10, &value);
  	if (ret < 0)
  		return ret;
  	if (value > 1)
  		return -EINVAL;
  
  	if (value)
  		set_bit(flag, &transparent_hugepage_flags);
  	else
  		clear_bit(flag, &transparent_hugepage_flags);
  
  	return count;
  }
  static ssize_t defrag_show(struct kobject *kobj,
  			   struct kobj_attribute *attr, char *buf)
  {
  	if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags))
  		return sprintf(buf, "[always] defer defer+madvise madvise never\n");
  	if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags))
  		return sprintf(buf, "always [defer] defer+madvise madvise never\n");
  	if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags))
  		return sprintf(buf, "always defer [defer+madvise] madvise never\n");
  	if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags))
  		return sprintf(buf, "always defer defer+madvise [madvise] never\n");
  	return sprintf(buf, "always defer defer+madvise madvise [never]\n");
  }

  static ssize_t defrag_store(struct kobject *kobj,
  			    struct kobj_attribute *attr,
  			    const char *buf, size_t count)
  {
  	if (!memcmp("always", buf,
  		    min(sizeof("always")-1, count))) {
  		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
  		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
  		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
  		set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
  	} else if (!memcmp("defer+madvise", buf,
  		    min(sizeof("defer+madvise")-1, count))) {
  		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
  		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
  		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
  		set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
  	} else if (!memcmp("defer", buf,
  		    min(sizeof("defer")-1, count))) {
  		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
  		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
  		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
  		set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
  	} else if (!memcmp("madvise", buf,
  			   min(sizeof("madvise")-1, count))) {
  		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
  		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
  		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
  		set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
  	} else if (!memcmp("never", buf,
  			   min(sizeof("never")-1, count))) {
  		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
  		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
  		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
  		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
  	} else
  		return -EINVAL;
  
  	return count;
  }
  static struct kobj_attribute defrag_attr =
  	__ATTR(defrag, 0644, defrag_show, defrag_store);
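  
  /*
   * Example (illustrative, assuming the usual sysfs mount point):
   *
   *	echo defer+madvise > /sys/kernel/mm/transparent_hugepage/defrag
   *
   * selects TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, which
   * alloc_hugepage_direct_gfpmask() below translates into a gfp mask at
   * fault time.
   */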
  static ssize_t use_zero_page_show(struct kobject *kobj,
  		struct kobj_attribute *attr, char *buf)
  {
  	return single_hugepage_flag_show(kobj, attr, buf,
  				TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);
  }
  static ssize_t use_zero_page_store(struct kobject *kobj,
  		struct kobj_attribute *attr, const char *buf, size_t count)
  {
  	return single_hugepage_flag_store(kobj, attr, buf, count,
  				 TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);
  }
  static struct kobj_attribute use_zero_page_attr =
  	__ATTR(use_zero_page, 0644, use_zero_page_show, use_zero_page_store);
  
  static ssize_t hpage_pmd_size_show(struct kobject *kobj,
  		struct kobj_attribute *attr, char *buf)
  {
  	return sprintf(buf, "%lu\n", HPAGE_PMD_SIZE);
  }
  static struct kobj_attribute hpage_pmd_size_attr =
  	__ATTR_RO(hpage_pmd_size);
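  
  /*
   * Reading this attribute reports HPAGE_PMD_SIZE in bytes; for example
   * (illustrative, architecture dependent), with 4KiB base pages on x86_64:
   *
   *	$ cat /sys/kernel/mm/transparent_hugepage/hpage_pmd_size
   *	2097152
   */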
  #ifdef CONFIG_DEBUG_VM
  static ssize_t debug_cow_show(struct kobject *kobj,
  				struct kobj_attribute *attr, char *buf)
  {
  	return single_hugepage_flag_show(kobj, attr, buf,
  				TRANSPARENT_HUGEPAGE_DEBUG_COW_FLAG);
  }
  static ssize_t debug_cow_store(struct kobject *kobj,
  			       struct kobj_attribute *attr,
  			       const char *buf, size_t count)
  {
  	return single_hugepage_flag_store(kobj, attr, buf, count,
  				 TRANSPARENT_HUGEPAGE_DEBUG_COW_FLAG);
  }
  static struct kobj_attribute debug_cow_attr =
  	__ATTR(debug_cow, 0644, debug_cow_show, debug_cow_store);
  #endif /* CONFIG_DEBUG_VM */
  
  static struct attribute *hugepage_attr[] = {
  	&enabled_attr.attr,
  	&defrag_attr.attr,
  	&use_zero_page_attr.attr,
  	&hpage_pmd_size_attr.attr,
  #if defined(CONFIG_SHMEM) && defined(CONFIG_TRANSPARENT_HUGE_PAGECACHE)
  	&shmem_enabled_attr.attr,
  #endif
  #ifdef CONFIG_DEBUG_VM
  	&debug_cow_attr.attr,
  #endif
  	NULL,
  };
  static const struct attribute_group hugepage_attr_group = {
  	.attrs = hugepage_attr,
  };
  static int __init hugepage_init_sysfs(struct kobject **hugepage_kobj)
  {
  	int err;
  	*hugepage_kobj = kobject_create_and_add("transparent_hugepage", mm_kobj);
  	if (unlikely(!*hugepage_kobj)) {
  		pr_err("failed to create transparent hugepage kobject\n");
  		return -ENOMEM;
  	}
  	err = sysfs_create_group(*hugepage_kobj, &hugepage_attr_group);
  	if (err) {
  		pr_err("failed to register transparent hugepage group\n");
  		goto delete_obj;
  	}
  	err = sysfs_create_group(*hugepage_kobj, &khugepaged_attr_group);
  	if (err) {
  		pr_err("failed to register transparent hugepage group\n");
  		goto remove_hp_group;
  	}
  
  	return 0;
  
  remove_hp_group:
  	sysfs_remove_group(*hugepage_kobj, &hugepage_attr_group);
  delete_obj:
  	kobject_put(*hugepage_kobj);
  	return err;
  }
  
  static void __init hugepage_exit_sysfs(struct kobject *hugepage_kobj)
  {
  	sysfs_remove_group(hugepage_kobj, &khugepaged_attr_group);
  	sysfs_remove_group(hugepage_kobj, &hugepage_attr_group);
  	kobject_put(hugepage_kobj);
  }
  #else
  static inline int hugepage_init_sysfs(struct kobject **hugepage_kobj)
  {
  	return 0;
  }
  
  static inline void hugepage_exit_sysfs(struct kobject *hugepage_kobj)
  {
  }
  #endif /* CONFIG_SYSFS */
  
  static int __init hugepage_init(void)
  {
  	int err;
  	struct kobject *hugepage_kobj;
  
  	if (!has_transparent_hugepage()) {
  		transparent_hugepage_flags = 0;
  		return -EINVAL;
  	}
  	/*
  	 * hugepages can't be allocated by the buddy allocator
  	 */
  	MAYBE_BUILD_BUG_ON(HPAGE_PMD_ORDER >= MAX_ORDER);
  	/*
  	 * we use page->mapping and page->index in second tail page
  	 * as list_head: assuming THP order >= 2
  	 */
  	MAYBE_BUILD_BUG_ON(HPAGE_PMD_ORDER < 2);
  	err = hugepage_init_sysfs(&hugepage_kobj);
  	if (err)
  		goto err_sysfs;

  	err = khugepaged_init();
  	if (err)
  		goto err_slab;

  	err = register_shrinker(&huge_zero_page_shrinker);
  	if (err)
  		goto err_hzp_shrinker;
  	err = register_shrinker(&deferred_split_shrinker);
  	if (err)
  		goto err_split_shrinker;

  	/*
  	 * By default disable transparent hugepages on smaller systems,
  	 * where the extra memory used could hurt more than TLB overhead
  	 * is likely to save.  The admin can still enable it through /sys.
  	 */
  	if (totalram_pages < (512 << (20 - PAGE_SHIFT))) {
  		transparent_hugepage_flags = 0;
  		return 0;
  	}

  	err = start_stop_khugepaged();
  	if (err)
  		goto err_khugepaged;

  	return 0;
  err_khugepaged:
  	unregister_shrinker(&deferred_split_shrinker);
  err_split_shrinker:
  	unregister_shrinker(&huge_zero_page_shrinker);
  err_hzp_shrinker:
  	khugepaged_destroy();
  err_slab:
  	hugepage_exit_sysfs(hugepage_kobj);
  err_sysfs:
  	return err;
  }
  subsys_initcall(hugepage_init);
  
  static int __init setup_transparent_hugepage(char *str)
  {
  	int ret = 0;
  	if (!str)
  		goto out;
  	if (!strcmp(str, "always")) {
  		set_bit(TRANSPARENT_HUGEPAGE_FLAG,
  			&transparent_hugepage_flags);
  		clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
  			  &transparent_hugepage_flags);
  		ret = 1;
  	} else if (!strcmp(str, "madvise")) {
  		clear_bit(TRANSPARENT_HUGEPAGE_FLAG,
  			  &transparent_hugepage_flags);
  		set_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
  			&transparent_hugepage_flags);
  		ret = 1;
  	} else if (!strcmp(str, "never")) {
  		clear_bit(TRANSPARENT_HUGEPAGE_FLAG,
  			  &transparent_hugepage_flags);
  		clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
  			  &transparent_hugepage_flags);
  		ret = 1;
  	}
  out:
  	if (!ret)
  		pr_warn("transparent_hugepage= cannot parse, ignored\n");
  	return ret;
  }
  __setup("transparent_hugepage=", setup_transparent_hugepage);
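  
  /*
   * Example (illustrative): the same three policies can be chosen at boot by
   * appending one of
   *
   *	transparent_hugepage=always
   *	transparent_hugepage=madvise
   *	transparent_hugepage=never
   *
   * to the kernel command line, parsed by setup_transparent_hugepage() above.
   */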
  pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma)
  {
  	if (likely(vma->vm_flags & VM_WRITE))
  		pmd = pmd_mkwrite(pmd);
  	return pmd;
  }
  static inline struct list_head *page_deferred_list(struct page *page)
  {
  	/*
  	 * ->lru in the tail pages is occupied by compound_head.
  	 * Let's use ->mapping + ->index in the second tail page as list_head.
  	 */
  	return (struct list_head *)&page[2].mapping;
  }
  
  void prep_transhuge_page(struct page *page)
  {
  	/*
  	 * we use page->mapping and page->index in second tail page
  	 * as list_head: assuming THP order >= 2
  	 */
  
  	INIT_LIST_HEAD(page_deferred_list(page));
  	set_compound_page_dtor(page, TRANSHUGE_PAGE_DTOR);
  }
  unsigned long __thp_get_unmapped_area(struct file *filp, unsigned long len,
  		loff_t off, unsigned long flags, unsigned long size)
  {
  	unsigned long addr;
  	loff_t off_end = off + len;
  	loff_t off_align = round_up(off, size);
  	unsigned long len_pad;
  
  	if (off_end <= off_align || (off_end - off_align) < size)
  		return 0;
  
  	len_pad = len + size;
  	if (len_pad < len || (off + len_pad) < off)
  		return 0;
  
  	addr = current->mm->get_unmapped_area(filp, 0, len_pad,
  					      off >> PAGE_SHIFT, flags);
  	if (IS_ERR_VALUE(addr))
  		return 0;
  
  	addr += (off - addr) & (size - 1);
  	return addr;
  }
  
  unsigned long thp_get_unmapped_area(struct file *filp, unsigned long addr,
  		unsigned long len, unsigned long pgoff, unsigned long flags)
  {
  	loff_t off = (loff_t)pgoff << PAGE_SHIFT;
  
  	if (addr)
  		goto out;
  	if (!IS_DAX(filp->f_mapping->host) || !IS_ENABLED(CONFIG_FS_DAX_PMD))
  		goto out;
  
  	addr = __thp_get_unmapped_area(filp, len, off, flags, PMD_SIZE);
  	if (addr)
  		return addr;
  
   out:
  	return current->mm->get_unmapped_area(filp, addr, len, pgoff, flags);
  }
  EXPORT_SYMBOL_GPL(thp_get_unmapped_area);
  static int __do_huge_pmd_anonymous_page(struct vm_fault *vmf, struct page *page,
  		gfp_t gfp)
  {
  	struct vm_area_struct *vma = vmf->vma;
  	struct mem_cgroup *memcg;
  	pgtable_t pgtable;
  	unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
  	int ret = 0;

  	VM_BUG_ON_PAGE(!PageCompound(page), page);

  	if (mem_cgroup_try_charge(page, vma->vm_mm, gfp | __GFP_NORETRY, &memcg,
  				  true)) {
  		put_page(page);
  		count_vm_event(THP_FAULT_FALLBACK);
  		return VM_FAULT_FALLBACK;
  	}

  	pgtable = pte_alloc_one(vma->vm_mm, haddr);
  	if (unlikely(!pgtable)) {
  		ret = VM_FAULT_OOM;
  		goto release;
  	}

  	clear_huge_page(page, vmf->address, HPAGE_PMD_NR);
  	/*
  	 * The memory barrier inside __SetPageUptodate makes sure that
  	 * clear_huge_page writes become visible before the set_pmd_at()
  	 * write.
  	 */
  	__SetPageUptodate(page);
  	vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
  	if (unlikely(!pmd_none(*vmf->pmd))) {
  		goto unlock_release;
  	} else {
  		pmd_t entry;

  		ret = check_stable_address_space(vma->vm_mm);
  		if (ret)
  			goto unlock_release;
  		/* Deliver the page fault to userland */
  		if (userfaultfd_missing(vma)) {
  			int ret;
  			spin_unlock(vmf->ptl);
  			mem_cgroup_cancel_charge(page, memcg, true);
  			put_page(page);
  			pte_free(vma->vm_mm, pgtable);
  			ret = handle_userfault(vmf, VM_UFFD_MISSING);
  			VM_BUG_ON(ret & VM_FAULT_FALLBACK);
  			return ret;
  		}
  		entry = mk_huge_pmd(page, vma->vm_page_prot);
  		entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
  		page_add_new_anon_rmap(page, vma, haddr, true);
  		mem_cgroup_commit_charge(page, memcg, false, true);
  		lru_cache_add_active_or_unevictable(page, vma);
  		pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable);
  		set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry);
  		add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR);
  		atomic_long_inc(&vma->vm_mm->nr_ptes);
  		spin_unlock(vmf->ptl);
  		count_vm_event(THP_FAULT_ALLOC);
  	}
  	return 0;
  unlock_release:
  	spin_unlock(vmf->ptl);
  release:
  	if (pgtable)
  		pte_free(vma->vm_mm, pgtable);
  	mem_cgroup_cancel_charge(page, memcg, true);
  	put_page(page);
  	return ret;
  }
  /*
   * always: directly stall for all thp allocations
   * defer: wake kswapd and fail if not immediately available
   * defer+madvise: wake kswapd and directly stall for MADV_HUGEPAGE, otherwise
   *		  fail if not immediately available
   * madvise: directly stall for MADV_HUGEPAGE, otherwise fail if not immediately
   *	    available
   * never: never stall for any thp allocation
   */
  static inline gfp_t alloc_hugepage_direct_gfpmask(struct vm_area_struct *vma)
  {
  	const bool vma_madvised = !!(vma->vm_flags & VM_HUGEPAGE);

  	if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags))
  		return GFP_TRANSHUGE | (vma_madvised ? 0 : __GFP_NORETRY);
  	if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags))
  		return GFP_TRANSHUGE_LIGHT | __GFP_KSWAPD_RECLAIM;
  	if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags))
  		return GFP_TRANSHUGE_LIGHT | (vma_madvised ? __GFP_DIRECT_RECLAIM :
  							     __GFP_KSWAPD_RECLAIM);
  	if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags))
  		return GFP_TRANSHUGE_LIGHT | (vma_madvised ? __GFP_DIRECT_RECLAIM :
  							     0);
  	return GFP_TRANSHUGE_LIGHT;
  }
  /* Caller must hold page table lock. */
  static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm,
  		struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd,
  		struct page *zero_page)
  {
  	pmd_t entry;
  	if (!pmd_none(*pmd))
  		return false;
  	entry = mk_pmd(zero_page, vma->vm_page_prot);
  	entry = pmd_mkhuge(entry);
  	if (pgtable)
  		pgtable_trans_huge_deposit(mm, pmd, pgtable);
  	set_pmd_at(mm, haddr, pmd, entry);
  	atomic_long_inc(&mm->nr_ptes);
  	return true;
  }
  int do_huge_pmd_anonymous_page(struct vm_fault *vmf)
  {
  	struct vm_area_struct *vma = vmf->vma;
  	gfp_t gfp;
  	struct page *page;
  	unsigned long haddr = vmf->address & HPAGE_PMD_MASK;

  	if (haddr < vma->vm_start || haddr + HPAGE_PMD_SIZE > vma->vm_end)
  		return VM_FAULT_FALLBACK;
  	if (unlikely(anon_vma_prepare(vma)))
  		return VM_FAULT_OOM;
  	if (unlikely(khugepaged_enter(vma, vma->vm_flags)))
  		return VM_FAULT_OOM;
  	if (!(vmf->flags & FAULT_FLAG_WRITE) &&
  			!mm_forbids_zeropage(vma->vm_mm) &&
  			transparent_hugepage_use_zero_page()) {
  		pgtable_t pgtable;
  		struct page *zero_page;
  		bool set;
  		int ret;
  		pgtable = pte_alloc_one(vma->vm_mm, haddr);
  		if (unlikely(!pgtable))
  			return VM_FAULT_OOM;
  		zero_page = mm_get_huge_zero_page(vma->vm_mm);
  		if (unlikely(!zero_page)) {
  			pte_free(vma->vm_mm, pgtable);
  			count_vm_event(THP_FAULT_FALLBACK);
  			return VM_FAULT_FALLBACK;
  		}
  		vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
  		ret = 0;
  		set = false;
  		if (pmd_none(*vmf->pmd)) {
  			ret = check_stable_address_space(vma->vm_mm);
  			if (ret) {
  				spin_unlock(vmf->ptl);
  			} else if (userfaultfd_missing(vma)) {
  				spin_unlock(vmf->ptl);
  				ret = handle_userfault(vmf, VM_UFFD_MISSING);
  				VM_BUG_ON(ret & VM_FAULT_FALLBACK);
  			} else {
  				set_huge_zero_page(pgtable, vma->vm_mm, vma,
  						   haddr, vmf->pmd, zero_page);
  				spin_unlock(vmf->ptl);
  				set = true;
  			}
  		} else
  			spin_unlock(vmf->ptl);
  		if (!set)
  			pte_free(vma->vm_mm, pgtable);
  		return ret;
  	}
  	gfp = alloc_hugepage_direct_gfpmask(vma);
  	page = alloc_hugepage_vma(gfp, vma, haddr, HPAGE_PMD_ORDER);
  	if (unlikely(!page)) {
  		count_vm_event(THP_FAULT_FALLBACK);
  		return VM_FAULT_FALLBACK;
  	}
  	prep_transhuge_page(page);
  	return __do_huge_pmd_anonymous_page(vmf, page, gfp);
  }
  static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
  		pmd_t *pmd, pfn_t pfn, pgprot_t prot, bool write,
  		pgtable_t pgtable)
  {
  	struct mm_struct *mm = vma->vm_mm;
  	pmd_t entry;
  	spinlock_t *ptl;
  
  	ptl = pmd_lock(mm, pmd);
  	entry = pmd_mkhuge(pfn_t_pmd(pfn, prot));
  	if (pfn_t_devmap(pfn))
  		entry = pmd_mkdevmap(entry);
  	if (write) {
  		entry = pmd_mkyoung(pmd_mkdirty(entry));
  		entry = maybe_pmd_mkwrite(entry, vma);
  	}
  
  	if (pgtable) {
  		pgtable_trans_huge_deposit(mm, pmd, pgtable);
  		atomic_long_inc(&mm->nr_ptes);
  	}
  	set_pmd_at(mm, addr, pmd, entry);
  	update_mmu_cache_pmd(vma, addr, pmd);
  	spin_unlock(ptl);
  }
  
  int vmf_insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
  			pmd_t *pmd, pfn_t pfn, bool write)
  {
  	pgprot_t pgprot = vma->vm_page_prot;
  	pgtable_t pgtable = NULL;
  	/*
  	 * If we had pmd_special, we could avoid all these restrictions,
  	 * but we need to be consistent with PTEs and architectures that
  	 * can't support a 'special' bit.
  	 */
  	BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)));
  	BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) ==
  						(VM_PFNMAP|VM_MIXEDMAP));
  	BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags));
  	BUG_ON(!pfn_t_devmap(pfn));
  
  	if (addr < vma->vm_start || addr >= vma->vm_end)
  		return VM_FAULT_SIGBUS;

  	if (arch_needs_pgtable_deposit()) {
  		pgtable = pte_alloc_one(vma->vm_mm, addr);
  		if (!pgtable)
  			return VM_FAULT_OOM;
  	}
  	track_pfn_insert(vma, &pgprot, pfn);
  	insert_pfn_pmd(vma, addr, pmd, pfn, pgprot, write, pgtable);
  	return VM_FAULT_NOPAGE;
  }
  EXPORT_SYMBOL_GPL(vmf_insert_pfn_pmd);

  #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
  static pud_t maybe_pud_mkwrite(pud_t pud, struct vm_area_struct *vma)
  {
  	if (likely(vma->vm_flags & VM_WRITE))
  		pud = pud_mkwrite(pud);
  	return pud;
  }
  
  static void insert_pfn_pud(struct vm_area_struct *vma, unsigned long addr,
  		pud_t *pud, pfn_t pfn, pgprot_t prot, bool write)
  {
  	struct mm_struct *mm = vma->vm_mm;
  	pud_t entry;
  	spinlock_t *ptl;
  
  	ptl = pud_lock(mm, pud);
  	entry = pud_mkhuge(pfn_t_pud(pfn, prot));
  	if (pfn_t_devmap(pfn))
  		entry = pud_mkdevmap(entry);
  	if (write) {
  		entry = pud_mkyoung(pud_mkdirty(entry));
  		entry = maybe_pud_mkwrite(entry, vma);
  	}
  	set_pud_at(mm, addr, pud, entry);
  	update_mmu_cache_pud(vma, addr, pud);
  	spin_unlock(ptl);
  }
  
  int vmf_insert_pfn_pud(struct vm_area_struct *vma, unsigned long addr,
  			pud_t *pud, pfn_t pfn, bool write)
  {
  	pgprot_t pgprot = vma->vm_page_prot;
  	/*
  	 * If we had pud_special, we could avoid all these restrictions,
  	 * but we need to be consistent with PTEs and architectures that
  	 * can't support a 'special' bit.
  	 */
  	BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)));
  	BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) ==
  						(VM_PFNMAP|VM_MIXEDMAP));
  	BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags));
  	BUG_ON(!pfn_t_devmap(pfn));
  
  	if (addr < vma->vm_start || addr >= vma->vm_end)
  		return VM_FAULT_SIGBUS;
  
  	track_pfn_insert(vma, &pgprot, pfn);
  
  	insert_pfn_pud(vma, addr, pud, pfn, pgprot, write);
  	return VM_FAULT_NOPAGE;
  }
  EXPORT_SYMBOL_GPL(vmf_insert_pfn_pud);
  #endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
  static void touch_pmd(struct vm_area_struct *vma, unsigned long addr,
  		pmd_t *pmd, int flags)
  {
  	pmd_t _pmd;
  	_pmd = pmd_mkyoung(*pmd);
  	if (flags & FOLL_WRITE)
  		_pmd = pmd_mkdirty(_pmd);
  	if (pmdp_set_access_flags(vma, addr & HPAGE_PMD_MASK,
  				pmd, _pmd, flags & FOLL_WRITE))
  		update_mmu_cache_pmd(vma, addr, pmd);
  }
  
  struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr,
  		pmd_t *pmd, int flags)
  {
  	unsigned long pfn = pmd_pfn(*pmd);
  	struct mm_struct *mm = vma->vm_mm;
  	struct dev_pagemap *pgmap;
  	struct page *page;
  
  	assert_spin_locked(pmd_lockptr(mm, pmd));
  	/*
  	 * When we COW a devmap PMD entry, we split it into PTEs, so we should
  	 * not be in this function with `flags & FOLL_COW` set.
  	 */
  	WARN_ONCE(flags & FOLL_COW, "mm: In follow_devmap_pmd with FOLL_COW set");
  	if (flags & FOLL_WRITE && !pmd_write(*pmd))
  		return NULL;
  
  	if (pmd_present(*pmd) && pmd_devmap(*pmd))
  		/* pass */;
  	else
  		return NULL;
  
  	if (flags & FOLL_TOUCH)
  		touch_pmd(vma, addr, pmd, flags);
  
  	/*
  	 * device mapped pages can only be returned if the
  	 * caller will manage the page reference count.
  	 */
  	if (!(flags & FOLL_GET))
  		return ERR_PTR(-EEXIST);
  
  	pfn += (addr & ~PMD_MASK) >> PAGE_SHIFT;
  	pgmap = get_dev_pagemap(pfn, NULL);
  	if (!pgmap)
  		return ERR_PTR(-EFAULT);
  	page = pfn_to_page(pfn);
  	get_page(page);
  	put_dev_pagemap(pgmap);
  
  	return page;
  }
  int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
  		  pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr,
  		  struct vm_area_struct *vma)
  {
  	spinlock_t *dst_ptl, *src_ptl;
  	struct page *src_page;
  	pmd_t pmd;
  	pgtable_t pgtable = NULL;
  	int ret = -ENOMEM;

  	/* Skip if can be re-fill on fault */
  	if (!vma_is_anonymous(vma))
  		return 0;
  
  	pgtable = pte_alloc_one(dst_mm, addr);
  	if (unlikely(!pgtable))
  		goto out;

  	dst_ptl = pmd_lock(dst_mm, dst_pmd);
  	src_ptl = pmd_lockptr(src_mm, src_pmd);
  	spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
  
  	ret = -EAGAIN;
  	pmd = *src_pmd;
  
  #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
  	if (unlikely(is_swap_pmd(pmd))) {
  		swp_entry_t entry = pmd_to_swp_entry(pmd);
  
  		VM_BUG_ON(!is_pmd_migration_entry(pmd));
  		if (is_write_migration_entry(entry)) {
  			make_migration_entry_read(&entry);
  			pmd = swp_entry_to_pmd(entry);
  			if (pmd_swp_soft_dirty(*src_pmd))
  				pmd = pmd_swp_mksoft_dirty(pmd);
  			set_pmd_at(src_mm, addr, src_pmd, pmd);
  		}
  		add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR);
  		atomic_long_inc(&dst_mm->nr_ptes);
  		pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable);
  		set_pmd_at(dst_mm, addr, dst_pmd, pmd);
  		ret = 0;
  		goto out_unlock;
  	}
  #endif
  	if (unlikely(!pmd_trans_huge(pmd))) {
  		pte_free(dst_mm, pgtable);
  		goto out_unlock;
  	}
  	/*
  	 * When page table lock is held, the huge zero pmd should not be
  	 * under splitting since we don't split the page itself, only pmd to
  	 * a page table.
  	 */
  	if (is_huge_zero_pmd(pmd)) {
  		struct page *zero_page;
  		/*
  		 * get_huge_zero_page() will never allocate a new page here,
  		 * since we already have a zero page to copy. It just takes a
  		 * reference.
  		 */
  		zero_page = mm_get_huge_zero_page(dst_mm);
  		set_huge_zero_page(pgtable, dst_mm, vma, addr, dst_pmd,
  				zero_page);
  		ret = 0;
  		goto out_unlock;
  	}

  	src_page = pmd_page(pmd);
  	VM_BUG_ON_PAGE(!PageHead(src_page), src_page);
  	get_page(src_page);
  	page_dup_rmap(src_page, true);
  	add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR);
  	atomic_long_inc(&dst_mm->nr_ptes);
  	pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable);
  
  	pmdp_set_wrprotect(src_mm, addr, src_pmd);
  	pmd = pmd_mkold(pmd_wrprotect(pmd));
  	set_pmd_at(dst_mm, addr, dst_pmd, pmd);
  
  	ret = 0;
  out_unlock:
  	spin_unlock(src_ptl);
  	spin_unlock(dst_ptl);
  out:
  	return ret;
  }
  #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
  static void touch_pud(struct vm_area_struct *vma, unsigned long addr,
  		pud_t *pud, int flags)
  {
  	pud_t _pud;
  	_pud = pud_mkyoung(*pud);
  	if (flags & FOLL_WRITE)
  		_pud = pud_mkdirty(_pud);
  	if (pudp_set_access_flags(vma, addr & HPAGE_PUD_MASK,
  				pud, _pud, flags & FOLL_WRITE))
  		update_mmu_cache_pud(vma, addr, pud);
  }
  
  struct page *follow_devmap_pud(struct vm_area_struct *vma, unsigned long addr,
  		pud_t *pud, int flags)
  {
  	unsigned long pfn = pud_pfn(*pud);
  	struct mm_struct *mm = vma->vm_mm;
  	struct dev_pagemap *pgmap;
  	struct page *page;
  
  	assert_spin_locked(pud_lockptr(mm, pud));
  
  	if (flags & FOLL_WRITE && !pud_write(*pud))
  		return NULL;
  
  	if (pud_present(*pud) && pud_devmap(*pud))
  		/* pass */;
  	else
  		return NULL;
  
  	if (flags & FOLL_TOUCH)
  		touch_pud(vma, addr, pud, flags);
  
  	/*
  	 * device mapped pages can only be returned if the
  	 * caller will manage the page reference count.
  	 */
  	if (!(flags & FOLL_GET))
  		return ERR_PTR(-EEXIST);
  
  	pfn += (addr & ~PUD_MASK) >> PAGE_SHIFT;
  	pgmap = get_dev_pagemap(pfn, NULL);
  	if (!pgmap)
  		return ERR_PTR(-EFAULT);
  	page = pfn_to_page(pfn);
  	get_page(page);
  	put_dev_pagemap(pgmap);
  
  	return page;
  }
  
  int copy_huge_pud(struct mm_struct *dst_mm, struct mm_struct *src_mm,
  		  pud_t *dst_pud, pud_t *src_pud, unsigned long addr,
  		  struct vm_area_struct *vma)
  {
  	spinlock_t *dst_ptl, *src_ptl;
  	pud_t pud;
  	int ret;
  
  	dst_ptl = pud_lock(dst_mm, dst_pud);
  	src_ptl = pud_lockptr(src_mm, src_pud);
  	spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
  
  	ret = -EAGAIN;
  	pud = *src_pud;
  	if (unlikely(!pud_trans_huge(pud) && !pud_devmap(pud)))
  		goto out_unlock;
  
  	/*
  	 * When page table lock is held, the huge zero pud should not be
  	 * under splitting since we don't split the page itself, only pud to
  	 * a page table.
  	 */
  	if (is_huge_zero_pud(pud)) {
  		/* No huge zero pud yet */
  	}
  
  	pudp_set_wrprotect(src_mm, addr, src_pud);
  	pud = pud_mkold(pud_wrprotect(pud));
  	set_pud_at(dst_mm, addr, dst_pud, pud);
  
  	ret = 0;
  out_unlock:
  	spin_unlock(src_ptl);
  	spin_unlock(dst_ptl);
  	return ret;
  }
  
  void huge_pud_set_accessed(struct vm_fault *vmf, pud_t orig_pud)
  {
  	pud_t entry;
  	unsigned long haddr;
  	bool write = vmf->flags & FAULT_FLAG_WRITE;
  
  	vmf->ptl = pud_lock(vmf->vma->vm_mm, vmf->pud);
  	if (unlikely(!pud_same(*vmf->pud, orig_pud)))
  		goto unlock;
  
  	entry = pud_mkyoung(orig_pud);
  	if (write)
  		entry = pud_mkdirty(entry);
  	haddr = vmf->address & HPAGE_PUD_MASK;
  	if (pudp_set_access_flags(vmf->vma, haddr, vmf->pud, entry, write))
  		update_mmu_cache_pud(vmf->vma, vmf->address, vmf->pud);
  
  unlock:
  	spin_unlock(vmf->ptl);
  }
  #endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
82b0f8c39   Jan Kara   mm: join struct f...
1073
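  /*
   * Access fault on a huge pmd whose young/dirty bits are not yet set:
   * re-validate the pmd under its lock and mark it young (and dirty for
   * a write fault) so the hardware fault is not retaken.
   */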
  void huge_pmd_set_accessed(struct vm_fault *vmf, pmd_t orig_pmd)
a1dd450bc   Will Deacon   mm: thp: set the ...
1074
1075
1076
  {
  	pmd_t entry;
  	unsigned long haddr;
20f664aab   Minchan Kim   mm: pmd dirty emu...
1077
  	bool write = vmf->flags & FAULT_FLAG_WRITE;
a1dd450bc   Will Deacon   mm: thp: set the ...
1078

82b0f8c39   Jan Kara   mm: join struct f...
1079
1080
  	vmf->ptl = pmd_lock(vmf->vma->vm_mm, vmf->pmd);
  	if (unlikely(!pmd_same(*vmf->pmd, orig_pmd)))
a1dd450bc   Will Deacon   mm: thp: set the ...
1081
1082
1083
  		goto unlock;
  
  	entry = pmd_mkyoung(orig_pmd);
20f664aab   Minchan Kim   mm: pmd dirty emu...
1084
1085
  	if (write)
  		entry = pmd_mkdirty(entry);
82b0f8c39   Jan Kara   mm: join struct f...
1086
  	haddr = vmf->address & HPAGE_PMD_MASK;
20f664aab   Minchan Kim   mm: pmd dirty emu...
1087
  	if (pmdp_set_access_flags(vmf->vma, haddr, vmf->pmd, entry, write))
82b0f8c39   Jan Kara   mm: join struct f...
1088
  		update_mmu_cache_pmd(vmf->vma, vmf->address, vmf->pmd);
a1dd450bc   Will Deacon   mm: thp: set the ...
1089
1090
  
  unlock:
82b0f8c39   Jan Kara   mm: join struct f...
1091
  	spin_unlock(vmf->ptl);
a1dd450bc   Will Deacon   mm: thp: set the ...
1092
  }
82b0f8c39   Jan Kara   mm: join struct f...
1093
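  /*
   * COW fallback when a fresh huge page cannot be allocated: copy the THP
   * into HPAGE_PMD_NR small pages, charge them to memcg, and replace the
   * huge pmd with a regular page table mapping those pages.
   */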
  static int do_huge_pmd_wp_page_fallback(struct vm_fault *vmf, pmd_t orig_pmd,
bae473a42   Kirill A. Shutemov   mm: introduce fau...
1094
  		struct page *page)
71e3aac07   Andrea Arcangeli   thp: transparent ...
1095
  {
82b0f8c39   Jan Kara   mm: join struct f...
1096
1097
  	struct vm_area_struct *vma = vmf->vma;
  	unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
00501b531   Johannes Weiner   mm: memcontrol: r...
1098
  	struct mem_cgroup *memcg;
71e3aac07   Andrea Arcangeli   thp: transparent ...
1099
1100
1101
1102
  	pgtable_t pgtable;
  	pmd_t _pmd;
  	int ret = 0, i;
  	struct page **pages;
2ec74c3ef   Sagi Grimberg   mm: move all mmu ...
1103
1104
  	unsigned long mmun_start;	/* For mmu_notifiers */
  	unsigned long mmun_end;		/* For mmu_notifiers */
71e3aac07   Andrea Arcangeli   thp: transparent ...
1105
1106
1107
1108
1109
1110
1111
1112
1113
  
  	pages = kmalloc(sizeof(struct page *) * HPAGE_PMD_NR,
  			GFP_KERNEL);
  	if (unlikely(!pages)) {
  		ret |= VM_FAULT_OOM;
  		goto out;
  	}
  
  	for (i = 0; i < HPAGE_PMD_NR; i++) {
41b6167e8   Michal Hocko   mm: get rid of __...
1114
  		pages[i] = alloc_page_vma_node(GFP_HIGHUSER_MOVABLE, vma,
82b0f8c39   Jan Kara   mm: join struct f...
1115
  					       vmf->address, page_to_nid(page));
b9bbfbe30   Andrea Arcangeli   thp: memcg huge m...
1116
  		if (unlikely(!pages[i] ||
bae473a42   Kirill A. Shutemov   mm: introduce fau...
1117
1118
  			     mem_cgroup_try_charge(pages[i], vma->vm_mm,
  				     GFP_KERNEL, &memcg, false))) {
b9bbfbe30   Andrea Arcangeli   thp: memcg huge m...
1119
  			if (pages[i])
71e3aac07   Andrea Arcangeli   thp: transparent ...
1120
  				put_page(pages[i]);
b9bbfbe30   Andrea Arcangeli   thp: memcg huge m...
1121
  			while (--i >= 0) {
00501b531   Johannes Weiner   mm: memcontrol: r...
1122
1123
  				memcg = (void *)page_private(pages[i]);
  				set_page_private(pages[i], 0);
f627c2f53   Kirill A. Shutemov   memcg: adjust to ...
1124
1125
  				mem_cgroup_cancel_charge(pages[i], memcg,
  						false);
b9bbfbe30   Andrea Arcangeli   thp: memcg huge m...
1126
1127
  				put_page(pages[i]);
  			}
71e3aac07   Andrea Arcangeli   thp: transparent ...
1128
1129
1130
1131
  			kfree(pages);
  			ret |= VM_FAULT_OOM;
  			goto out;
  		}
00501b531   Johannes Weiner   mm: memcontrol: r...
1132
  		set_page_private(pages[i], (unsigned long)memcg);
71e3aac07   Andrea Arcangeli   thp: transparent ...
1133
1134
1135
1136
  	}
  
  	for (i = 0; i < HPAGE_PMD_NR; i++) {
  		copy_user_highpage(pages[i], page + i,
0089e4853   Hillf Danton   mm/huge_memory: f...
1137
  				   haddr + PAGE_SIZE * i, vma);
71e3aac07   Andrea Arcangeli   thp: transparent ...
1138
1139
1140
  		__SetPageUptodate(pages[i]);
  		cond_resched();
  	}
2ec74c3ef   Sagi Grimberg   mm: move all mmu ...
1141
1142
  	mmun_start = haddr;
  	mmun_end   = haddr + HPAGE_PMD_SIZE;
bae473a42   Kirill A. Shutemov   mm: introduce fau...
1143
  	mmu_notifier_invalidate_range_start(vma->vm_mm, mmun_start, mmun_end);
2ec74c3ef   Sagi Grimberg   mm: move all mmu ...
1144

82b0f8c39   Jan Kara   mm: join struct f...
1145
1146
  	vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
  	if (unlikely(!pmd_same(*vmf->pmd, orig_pmd)))
71e3aac07   Andrea Arcangeli   thp: transparent ...
1147
  		goto out_free_pages;
309381fea   Sasha Levin   mm: dump page whe...
1148
  	VM_BUG_ON_PAGE(!PageHead(page), page);
71e3aac07   Andrea Arcangeli   thp: transparent ...
1149

82b0f8c39   Jan Kara   mm: join struct f...
1150
  	pmdp_huge_clear_flush_notify(vma, haddr, vmf->pmd);
71e3aac07   Andrea Arcangeli   thp: transparent ...
1151
  	/* leave pmd empty until pte is filled */
82b0f8c39   Jan Kara   mm: join struct f...
1152
  	pgtable = pgtable_trans_huge_withdraw(vma->vm_mm, vmf->pmd);
bae473a42   Kirill A. Shutemov   mm: introduce fau...
1153
  	pmd_populate(vma->vm_mm, &_pmd, pgtable);
71e3aac07   Andrea Arcangeli   thp: transparent ...
1154
1155
  
  	for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
bae473a42   Kirill A. Shutemov   mm: introduce fau...
1156
  		pte_t entry;
71e3aac07   Andrea Arcangeli   thp: transparent ...
1157
1158
  		entry = mk_pte(pages[i], vma->vm_page_prot);
  		entry = maybe_mkwrite(pte_mkdirty(entry), vma);
00501b531   Johannes Weiner   mm: memcontrol: r...
1159
1160
  		memcg = (void *)page_private(pages[i]);
  		set_page_private(pages[i], 0);
82b0f8c39   Jan Kara   mm: join struct f...
1161
  		page_add_new_anon_rmap(pages[i], vmf->vma, haddr, false);
f627c2f53   Kirill A. Shutemov   memcg: adjust to ...
1162
  		mem_cgroup_commit_charge(pages[i], memcg, false, false);
00501b531   Johannes Weiner   mm: memcontrol: r...
1163
  		lru_cache_add_active_or_unevictable(pages[i], vma);
82b0f8c39   Jan Kara   mm: join struct f...
1164
1165
1166
1167
  		vmf->pte = pte_offset_map(&_pmd, haddr);
  		VM_BUG_ON(!pte_none(*vmf->pte));
  		set_pte_at(vma->vm_mm, haddr, vmf->pte, entry);
  		pte_unmap(vmf->pte);
71e3aac07   Andrea Arcangeli   thp: transparent ...
1168
1169
  	}
  	kfree(pages);
71e3aac07   Andrea Arcangeli   thp: transparent ...
1170
  	smp_wmb(); /* make pte visible before pmd */
82b0f8c39   Jan Kara   mm: join struct f...
1171
  	pmd_populate(vma->vm_mm, vmf->pmd, pgtable);
d281ee614   Kirill A. Shutemov   rmap: add argumen...
1172
  	page_remove_rmap(page, true);
82b0f8c39   Jan Kara   mm: join struct f...
1173
  	spin_unlock(vmf->ptl);
71e3aac07   Andrea Arcangeli   thp: transparent ...
1174

bae473a42   Kirill A. Shutemov   mm: introduce fau...
1175
  	mmu_notifier_invalidate_range_end(vma->vm_mm, mmun_start, mmun_end);
2ec74c3ef   Sagi Grimberg   mm: move all mmu ...
1176

71e3aac07   Andrea Arcangeli   thp: transparent ...
1177
1178
1179
1180
1181
1182
1183
  	ret |= VM_FAULT_WRITE;
  	put_page(page);
  
  out:
  	return ret;
  
  out_free_pages:
82b0f8c39   Jan Kara   mm: join struct f...
1184
  	spin_unlock(vmf->ptl);
bae473a42   Kirill A. Shutemov   mm: introduce fau...
1185
  	mmu_notifier_invalidate_range_end(vma->vm_mm, mmun_start, mmun_end);
b9bbfbe30   Andrea Arcangeli   thp: memcg huge m...
1186
  	for (i = 0; i < HPAGE_PMD_NR; i++) {
00501b531   Johannes Weiner   mm: memcontrol: r...
1187
1188
  		memcg = (void *)page_private(pages[i]);
  		set_page_private(pages[i], 0);
f627c2f53   Kirill A. Shutemov   memcg: adjust to ...
1189
  		mem_cgroup_cancel_charge(pages[i], memcg, false);
71e3aac07   Andrea Arcangeli   thp: transparent ...
1190
  		put_page(pages[i]);
b9bbfbe30   Andrea Arcangeli   thp: memcg huge m...
1191
  	}
71e3aac07   Andrea Arcangeli   thp: transparent ...
1192
1193
1194
  	kfree(pages);
  	goto out;
  }
82b0f8c39   Jan Kara   mm: join struct f...
1195
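  /*
   * Write-protection fault on a huge pmd: reuse the page in place when we
   * are the only mapper, otherwise allocate a new THP and copy into it,
   * falling back to splitting the pmd if the allocation or charge fails.
   */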
  int do_huge_pmd_wp_page(struct vm_fault *vmf, pmd_t orig_pmd)
71e3aac07   Andrea Arcangeli   thp: transparent ...
1196
  {
82b0f8c39   Jan Kara   mm: join struct f...
1197
  	struct vm_area_struct *vma = vmf->vma;
93b4796de   Kirill A. Shutemov   thp: do_huge_pmd_...
1198
  	struct page *page = NULL, *new_page;
00501b531   Johannes Weiner   mm: memcontrol: r...
1199
  	struct mem_cgroup *memcg;
82b0f8c39   Jan Kara   mm: join struct f...
1200
  	unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
2ec74c3ef   Sagi Grimberg   mm: move all mmu ...
1201
1202
  	unsigned long mmun_start;	/* For mmu_notifiers */
  	unsigned long mmun_end;		/* For mmu_notifiers */
3b3636924   Michal Hocko   mm, memcg: sync a...
1203
  	gfp_t huge_gfp;			/* for allocation and charge */
bae473a42   Kirill A. Shutemov   mm: introduce fau...
1204
  	int ret = 0;
71e3aac07   Andrea Arcangeli   thp: transparent ...
1205

82b0f8c39   Jan Kara   mm: join struct f...
1206
  	vmf->ptl = pmd_lockptr(vma->vm_mm, vmf->pmd);
81d1b09c6   Sasha Levin   mm: convert a few...
1207
  	VM_BUG_ON_VMA(!vma->anon_vma, vma);
93b4796de   Kirill A. Shutemov   thp: do_huge_pmd_...
1208
1209
  	if (is_huge_zero_pmd(orig_pmd))
  		goto alloc;
82b0f8c39   Jan Kara   mm: join struct f...
1210
1211
  	spin_lock(vmf->ptl);
  	if (unlikely(!pmd_same(*vmf->pmd, orig_pmd)))
71e3aac07   Andrea Arcangeli   thp: transparent ...
1212
1213
1214
  		goto out_unlock;
  
  	page = pmd_page(orig_pmd);
309381fea   Sasha Levin   mm: dump page whe...
1215
  	VM_BUG_ON_PAGE(!PageCompound(page) || !PageHead(page), page);
1f25fe20a   Kirill A. Shutemov   mm, thp: adjust c...
1216
1217
  	/*
  	 * We can only reuse the page if nobody else maps the huge page or any
6d0a07edd   Andrea Arcangeli   mm: thp: calculat...
1218
  	 * part of it.
1f25fe20a   Kirill A. Shutemov   mm, thp: adjust c...
1219
  	 */
ba3c4ce6d   Huang Ying   mm, THP, swap: ma...
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
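  	/*
  	 * Take the page lock before calling reuse_swap_page() so the map and
  	 * swap state it inspects stays stable; the pmd lock is dropped while
  	 * sleeping for the page lock, so pmd_same() is re-checked afterwards.
  	 */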
  	if (!trylock_page(page)) {
  		get_page(page);
  		spin_unlock(vmf->ptl);
  		lock_page(page);
  		spin_lock(vmf->ptl);
  		if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) {
  			unlock_page(page);
  			put_page(page);
  			goto out_unlock;
  		}
  		put_page(page);
  	}
  	if (reuse_swap_page(page, NULL)) {
71e3aac07   Andrea Arcangeli   thp: transparent ...
1233
1234
1235
  		pmd_t entry;
  		entry = pmd_mkyoung(orig_pmd);
  		entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
82b0f8c39   Jan Kara   mm: join struct f...
1236
1237
  		if (pmdp_set_access_flags(vma, haddr, vmf->pmd, entry,  1))
  			update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
71e3aac07   Andrea Arcangeli   thp: transparent ...
1238
  		ret |= VM_FAULT_WRITE;
ba3c4ce6d   Huang Ying   mm, THP, swap: ma...
1239
  		unlock_page(page);
71e3aac07   Andrea Arcangeli   thp: transparent ...
1240
1241
  		goto out_unlock;
  	}
ba3c4ce6d   Huang Ying   mm, THP, swap: ma...
1242
  	unlock_page(page);
ddc58f27f   Kirill A. Shutemov   mm: drop tail pag...
1243
  	get_page(page);
82b0f8c39   Jan Kara   mm: join struct f...
1244
  	spin_unlock(vmf->ptl);
93b4796de   Kirill A. Shutemov   thp: do_huge_pmd_...
1245
  alloc:
71e3aac07   Andrea Arcangeli   thp: transparent ...
1246
  	if (transparent_hugepage_enabled(vma) &&
077fcf116   Aneesh Kumar K.V   mm/thp: allocate ...
1247
  	    !transparent_hugepage_debug_cow()) {
444eb2a44   Mel Gorman   mm: thp: set THP ...
1248
  		huge_gfp = alloc_hugepage_direct_gfpmask(vma);
3b3636924   Michal Hocko   mm, memcg: sync a...
1249
  		new_page = alloc_hugepage_vma(huge_gfp, vma, haddr, HPAGE_PMD_ORDER);
077fcf116   Aneesh Kumar K.V   mm/thp: allocate ...
1250
  	} else
71e3aac07   Andrea Arcangeli   thp: transparent ...
1251
  		new_page = NULL;
9a982250f   Kirill A. Shutemov   thp: introduce de...
1252
1253
1254
  	if (likely(new_page)) {
  		prep_transhuge_page(new_page);
  	} else {
eecc1e426   Hugh Dickins   thp: fix copy_pag...
1255
  		if (!page) {
82b0f8c39   Jan Kara   mm: join struct f...
1256
  			split_huge_pmd(vma, vmf->pmd, vmf->address);
e9b71ca91   Kirill A. Shutemov   mm, thp: drop do_...
1257
  			ret |= VM_FAULT_FALLBACK;
93b4796de   Kirill A. Shutemov   thp: do_huge_pmd_...
1258
  		} else {
82b0f8c39   Jan Kara   mm: join struct f...
1259
  			ret = do_huge_pmd_wp_page_fallback(vmf, orig_pmd, page);
9845cbbd1   Kirill A. Shutemov   mm, thp: fix infi...
1260
  			if (ret & VM_FAULT_OOM) {
82b0f8c39   Jan Kara   mm: join struct f...
1261
  				split_huge_pmd(vma, vmf->pmd, vmf->address);
9845cbbd1   Kirill A. Shutemov   mm, thp: fix infi...
1262
1263
  				ret |= VM_FAULT_FALLBACK;
  			}
ddc58f27f   Kirill A. Shutemov   mm: drop tail pag...
1264
  			put_page(page);
93b4796de   Kirill A. Shutemov   thp: do_huge_pmd_...
1265
  		}
17766dde3   David Rientjes   mm, thp: count th...
1266
  		count_vm_event(THP_FAULT_FALLBACK);
71e3aac07   Andrea Arcangeli   thp: transparent ...
1267
1268
  		goto out;
  	}
bae473a42   Kirill A. Shutemov   mm: introduce fau...
1269
  	if (unlikely(mem_cgroup_try_charge(new_page, vma->vm_mm,
49f4a8c52   David Rientjes   mm, thp: do not c...
1270
  				huge_gfp | __GFP_NORETRY, &memcg, true))) {
b9bbfbe30   Andrea Arcangeli   thp: memcg huge m...
1271
  		put_page(new_page);
82b0f8c39   Jan Kara   mm: join struct f...
1272
  		split_huge_pmd(vma, vmf->pmd, vmf->address);
bae473a42   Kirill A. Shutemov   mm: introduce fau...
1273
  		if (page)
ddc58f27f   Kirill A. Shutemov   mm: drop tail pag...
1274
  			put_page(page);
9845cbbd1   Kirill A. Shutemov   mm, thp: fix infi...
1275
  		ret |= VM_FAULT_FALLBACK;
17766dde3   David Rientjes   mm, thp: count th...
1276
  		count_vm_event(THP_FAULT_FALLBACK);
b9bbfbe30   Andrea Arcangeli   thp: memcg huge m...
1277
1278
  		goto out;
  	}
17766dde3   David Rientjes   mm, thp: count th...
1279
  	count_vm_event(THP_FAULT_ALLOC);
eecc1e426   Hugh Dickins   thp: fix copy_pag...
1280
  	if (!page)
c79b57e46   Huang Ying   mm: hugetlb: clea...
1281
  		clear_huge_page(new_page, vmf->address, HPAGE_PMD_NR);
93b4796de   Kirill A. Shutemov   thp: do_huge_pmd_...
1282
1283
  	else
  		copy_user_huge_page(new_page, page, haddr, vma, HPAGE_PMD_NR);
71e3aac07   Andrea Arcangeli   thp: transparent ...
1284
  	__SetPageUptodate(new_page);
2ec74c3ef   Sagi Grimberg   mm: move all mmu ...
1285
1286
  	mmun_start = haddr;
  	mmun_end   = haddr + HPAGE_PMD_SIZE;
bae473a42   Kirill A. Shutemov   mm: introduce fau...
1287
  	mmu_notifier_invalidate_range_start(vma->vm_mm, mmun_start, mmun_end);
2ec74c3ef   Sagi Grimberg   mm: move all mmu ...
1288

82b0f8c39   Jan Kara   mm: join struct f...
1289
  	spin_lock(vmf->ptl);
93b4796de   Kirill A. Shutemov   thp: do_huge_pmd_...
1290
  	if (page)
ddc58f27f   Kirill A. Shutemov   mm: drop tail pag...
1291
  		put_page(page);
82b0f8c39   Jan Kara   mm: join struct f...
1292
1293
  	if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) {
  		spin_unlock(vmf->ptl);
f627c2f53   Kirill A. Shutemov   memcg: adjust to ...
1294
  		mem_cgroup_cancel_charge(new_page, memcg, true);
71e3aac07   Andrea Arcangeli   thp: transparent ...
1295
  		put_page(new_page);
2ec74c3ef   Sagi Grimberg   mm: move all mmu ...
1296
  		goto out_mn;
b9bbfbe30   Andrea Arcangeli   thp: memcg huge m...
1297
  	} else {
71e3aac07   Andrea Arcangeli   thp: transparent ...
1298
  		pmd_t entry;
3122359a6   Kirill A. Shutemov   thp: move maybe_p...
1299
1300
  		entry = mk_huge_pmd(new_page, vma->vm_page_prot);
  		entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
82b0f8c39   Jan Kara   mm: join struct f...
1301
  		pmdp_huge_clear_flush_notify(vma, haddr, vmf->pmd);
d281ee614   Kirill A. Shutemov   rmap: add argumen...
1302
  		page_add_new_anon_rmap(new_page, vma, haddr, true);
f627c2f53   Kirill A. Shutemov   memcg: adjust to ...
1303
  		mem_cgroup_commit_charge(new_page, memcg, false, true);
00501b531   Johannes Weiner   mm: memcontrol: r...
1304
  		lru_cache_add_active_or_unevictable(new_page, vma);
82b0f8c39   Jan Kara   mm: join struct f...
1305
1306
  		set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry);
  		update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
eecc1e426   Hugh Dickins   thp: fix copy_pag...
1307
  		if (!page) {
bae473a42   Kirill A. Shutemov   mm: introduce fau...
1308
  			add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR);
97ae17497   Kirill A. Shutemov   thp: implement re...
1309
  		} else {
309381fea   Sasha Levin   mm: dump page whe...
1310
  			VM_BUG_ON_PAGE(!PageHead(page), page);
d281ee614   Kirill A. Shutemov   rmap: add argumen...
1311
  			page_remove_rmap(page, true);
93b4796de   Kirill A. Shutemov   thp: do_huge_pmd_...
1312
1313
  			put_page(page);
  		}
71e3aac07   Andrea Arcangeli   thp: transparent ...
1314
1315
  		ret |= VM_FAULT_WRITE;
  	}
82b0f8c39   Jan Kara   mm: join struct f...
1316
  	spin_unlock(vmf->ptl);
2ec74c3ef   Sagi Grimberg   mm: move all mmu ...
1317
  out_mn:
bae473a42   Kirill A. Shutemov   mm: introduce fau...
1318
  	mmu_notifier_invalidate_range_end(vma->vm_mm, mmun_start, mmun_end);
71e3aac07   Andrea Arcangeli   thp: transparent ...
1319
1320
  out:
  	return ret;
2ec74c3ef   Sagi Grimberg   mm: move all mmu ...
1321
  out_unlock:
82b0f8c39   Jan Kara   mm: join struct f...
1322
  	spin_unlock(vmf->ptl);
2ec74c3ef   Sagi Grimberg   mm: move all mmu ...
1323
  	return ret;
71e3aac07   Andrea Arcangeli   thp: transparent ...
1324
  }
8310d48b1   Keno Fischer   mm/huge_memory.c:...
1325
1326
1327
1328
1329
1330
1331
1332
1333
  /*
   * FOLL_FORCE can write to even unwritable pmd's, but only
   * after we've gone through a COW cycle and they are dirty.
   */
  static inline bool can_follow_write_pmd(pmd_t pmd, unsigned int flags)
  {
  	return pmd_write(pmd) ||
  	       ((flags & FOLL_FORCE) && (flags & FOLL_COW) && pmd_dirty(pmd));
  }
b676b293f   David Rientjes   mm, thp: fix mapp...
1334
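  /*
   * follow_page() helper for a mapped huge pmd: returns the page for @addr,
   * honouring FOLL_WRITE/FOLL_NUMA/FOLL_TOUCH/FOLL_MLOCK/FOLL_GET. The
   * caller must already hold the pmd lock (see assert_spin_locked below).
   */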
  struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
71e3aac07   Andrea Arcangeli   thp: transparent ...
1335
1336
1337
1338
  				   unsigned long addr,
  				   pmd_t *pmd,
  				   unsigned int flags)
  {
b676b293f   David Rientjes   mm, thp: fix mapp...
1339
  	struct mm_struct *mm = vma->vm_mm;
71e3aac07   Andrea Arcangeli   thp: transparent ...
1340
  	struct page *page = NULL;
c4088ebdc   Kirill A. Shutemov   mm: convert the r...
1341
  	assert_spin_locked(pmd_lockptr(mm, pmd));
71e3aac07   Andrea Arcangeli   thp: transparent ...
1342

8310d48b1   Keno Fischer   mm/huge_memory.c:...
1343
  	if (flags & FOLL_WRITE && !can_follow_write_pmd(*pmd, flags))
71e3aac07   Andrea Arcangeli   thp: transparent ...
1344
  		goto out;
85facf257   Kirill A. Shutemov   thp: avoid dumpin...
1345
1346
1347
  	/* Avoid dumping huge zero page */
  	if ((flags & FOLL_DUMP) && is_huge_zero_pmd(*pmd))
  		return ERR_PTR(-EFAULT);
2b4847e73   Mel Gorman   mm: numa: seriali...
1348
  	/* Full NUMA hinting faults to serialise migration in fault paths */
8a0516ed8   Mel Gorman   mm: convert p[te|...
1349
  	if ((flags & FOLL_NUMA) && pmd_protnone(*pmd))
2b4847e73   Mel Gorman   mm: numa: seriali...
1350
  		goto out;
71e3aac07   Andrea Arcangeli   thp: transparent ...
1351
  	page = pmd_page(*pmd);
ca120cf68   Dan Williams   mm: fix show_smap...
1352
  	VM_BUG_ON_PAGE(!PageHead(page) && !is_zone_device_page(page), page);
3565fce3a   Dan Williams   mm, x86: get_user...
1353
  	if (flags & FOLL_TOUCH)
01ca97274   Kirill A. Shutemov   mm, thp: Do not m...
1354
  		touch_pmd(vma, addr, pmd, flags);
de60f5f10   Eric B Munson   mm: introduce VM_...
1355
  	if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) {
e90309c9f   Kirill A. Shutemov   thp: allow mlocke...
1356
1357
1358
1359
  		/*
  		 * We don't mlock() pte-mapped THPs. This way we can avoid
  		 * leaking mlocked pages into non-VM_LOCKED VMAs.
  		 *
9a73f61bd   Kirill A. Shutemov   thp, mlock: do no...
1360
1361
  		 * For anon THP:
  		 *
e90309c9f   Kirill A. Shutemov   thp: allow mlocke...
1362
1363
1364
1365
1366
1367
1368
  		 * In most cases the pmd is the only mapping of the page as we
  		 * break COW for the mlock() -- see gup_flags |= FOLL_WRITE for
  		 * writable private mappings in populate_vma_page_range().
  		 *
  		 * The only scenario when we have the page shared here is if we
  		 * mlocking read-only mapping shared over fork(). We skip
  		 * mlocking such pages.
9a73f61bd   Kirill A. Shutemov   thp, mlock: do no...
1369
1370
1371
1372
1373
1374
  		 *
  		 * For file THP:
  		 *
  		 * We can expect PageDoubleMap() to be stable under page lock:
  		 * for file pages we set it in page_add_file_rmap(), which
  		 * requires page to be locked.
e90309c9f   Kirill A. Shutemov   thp: allow mlocke...
1375
  		 */
9a73f61bd   Kirill A. Shutemov   thp, mlock: do no...
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
  
  		if (PageAnon(page) && compound_mapcount(page) != 1)
  			goto skip_mlock;
  		if (PageDoubleMap(page) || !page->mapping)
  			goto skip_mlock;
  		if (!trylock_page(page))
  			goto skip_mlock;
  		lru_add_drain();
  		if (page->mapping && !PageDoubleMap(page))
  			mlock_vma_page(page);
  		unlock_page(page);
b676b293f   David Rientjes   mm, thp: fix mapp...
1387
  	}
9a73f61bd   Kirill A. Shutemov   thp, mlock: do no...
1388
  skip_mlock:
71e3aac07   Andrea Arcangeli   thp: transparent ...
1389
  	page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT;
ca120cf68   Dan Williams   mm: fix show_smap...
1390
  	VM_BUG_ON_PAGE(!PageCompound(page) && !is_zone_device_page(page), page);
71e3aac07   Andrea Arcangeli   thp: transparent ...
1391
  	if (flags & FOLL_GET)
ddc58f27f   Kirill A. Shutemov   mm: drop tail pag...
1392
  		get_page(page);
71e3aac07   Andrea Arcangeli   thp: transparent ...
1393
1394
1395
1396
  
  out:
  	return page;
  }
d10e63f29   Mel Gorman   mm: numa: Create ...
1397
  /* NUMA hinting page fault entry point for trans huge pmds */
82b0f8c39   Jan Kara   mm: join struct f...
1398
  int do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t pmd)
d10e63f29   Mel Gorman   mm: numa: Create ...
1399
  {
82b0f8c39   Jan Kara   mm: join struct f...
1400
  	struct vm_area_struct *vma = vmf->vma;
b8916634b   Mel Gorman   mm: Prevent paral...
1401
  	struct anon_vma *anon_vma = NULL;
b32967ff1   Mel Gorman   mm: numa: Add THP...
1402
  	struct page *page;
82b0f8c39   Jan Kara   mm: join struct f...
1403
  	unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
8191acbd3   Mel Gorman   mm: numa: Sanitiz...
1404
  	int page_nid = -1, this_nid = numa_node_id();
90572890d   Peter Zijlstra   mm: numa: Change ...
1405
  	int target_nid, last_cpupid = -1;
8191acbd3   Mel Gorman   mm: numa: Sanitiz...
1406
1407
  	bool page_locked;
  	bool migrated = false;
b191f9b10   Mel Gorman   mm: numa: preserv...
1408
  	bool was_writable;
6688cc054   Peter Zijlstra   mm: numa: Do not ...
1409
  	int flags = 0;
d10e63f29   Mel Gorman   mm: numa: Create ...
1410

82b0f8c39   Jan Kara   mm: join struct f...
1411
1412
  	vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
  	if (unlikely(!pmd_same(pmd, *vmf->pmd)))
d10e63f29   Mel Gorman   mm: numa: Create ...
1413
  		goto out_unlock;
de466bd62   Mel Gorman   mm: numa: avoid u...
1414
1415
1416
1417
1418
  	/*
  	 * If there are potential migrations, wait for completion and retry
  	 * without disrupting NUMA hinting information. Do not relock and
  	 * check_same as the page may no longer be mapped.
  	 */
82b0f8c39   Jan Kara   mm: join struct f...
1419
1420
  	if (unlikely(pmd_trans_migrating(*vmf->pmd))) {
  		page = pmd_page(*vmf->pmd);
3c226c637   Mark Rutland   mm: numa: avoid w...
1421
1422
  		if (!get_page_unless_zero(page))
  			goto out_unlock;
82b0f8c39   Jan Kara   mm: join struct f...
1423
  		spin_unlock(vmf->ptl);
5d8330621   Mel Gorman   mm: numa: do not ...
1424
  		wait_on_page_locked(page);
3c226c637   Mark Rutland   mm: numa: avoid w...
1425
  		put_page(page);
de466bd62   Mel Gorman   mm: numa: avoid u...
1426
1427
  		goto out;
  	}
d10e63f29   Mel Gorman   mm: numa: Create ...
1428
  	page = pmd_page(pmd);
a1a46184e   Mel Gorman   mm: numa: Do not ...
1429
  	BUG_ON(is_huge_zero_page(page));
8191acbd3   Mel Gorman   mm: numa: Sanitiz...
1430
  	page_nid = page_to_nid(page);
90572890d   Peter Zijlstra   mm: numa: Change ...
1431
  	last_cpupid = page_cpupid_last(page);
03c5a6e16   Mel Gorman   mm: numa: Add pte...
1432
  	count_vm_numa_event(NUMA_HINT_FAULTS);
04bb2f947   Rik van Riel   sched/numa: Adjus...
1433
  	if (page_nid == this_nid) {
03c5a6e16   Mel Gorman   mm: numa: Add pte...
1434
  		count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL);
04bb2f947   Rik van Riel   sched/numa: Adjus...
1435
1436
  		flags |= TNF_FAULT_LOCAL;
  	}
4daae3b4b   Mel Gorman   mm: mempolicy: Us...
1437

bea66fbd1   Mel Gorman   mm: numa: group r...
1438
  	/* See similar comment in do_numa_page for explanation */
288bc5494   Aneesh Kumar K.V   mm/autonuma: let ...
1439
  	if (!pmd_savedwrite(pmd))
6688cc054   Peter Zijlstra   mm: numa: Do not ...
1440
1441
1442
  		flags |= TNF_NO_GROUP;
  
  	/*
ff9042b11   Mel Gorman   mm: Wait for THP ...
1443
1444
1445
  	 * Acquire the page lock to serialise THP migrations but avoid dropping
  	 * page_table_lock if at all possible
  	 */
b8916634b   Mel Gorman   mm: Prevent paral...
1446
1447
1448
1449
  	page_locked = trylock_page(page);
  	target_nid = mpol_misplaced(page, vma, haddr);
  	if (target_nid == -1) {
  		/* If the page was locked, there are no parallel migrations */
a54a407fb   Mel Gorman   mm: Close races b...
1450
  		if (page_locked)
b8916634b   Mel Gorman   mm: Prevent paral...
1451
  			goto clear_pmdnuma;
2b4847e73   Mel Gorman   mm: numa: seriali...
1452
  	}
4daae3b4b   Mel Gorman   mm: mempolicy: Us...
1453

de466bd62   Mel Gorman   mm: numa: avoid u...
1454
  	/* Migration could have started since the pmd_trans_migrating check */
2b4847e73   Mel Gorman   mm: numa: seriali...
1455
  	if (!page_locked) {
3c226c637   Mark Rutland   mm: numa: avoid w...
1456
1457
1458
  		page_nid = -1;
  		if (!get_page_unless_zero(page))
  			goto out_unlock;
82b0f8c39   Jan Kara   mm: join struct f...
1459
  		spin_unlock(vmf->ptl);
b8916634b   Mel Gorman   mm: Prevent paral...
1460
  		wait_on_page_locked(page);
3c226c637   Mark Rutland   mm: numa: avoid w...
1461
  		put_page(page);
b8916634b   Mel Gorman   mm: Prevent paral...
1462
1463
  		goto out;
  	}
2b4847e73   Mel Gorman   mm: numa: seriali...
1464
1465
1466
1467
  	/*
  	 * Page is misplaced. Page lock serialises migrations. Acquire anon_vma
  	 * to serialise splits
  	 */
b8916634b   Mel Gorman   mm: Prevent paral...
1468
  	get_page(page);
82b0f8c39   Jan Kara   mm: join struct f...
1469
  	spin_unlock(vmf->ptl);
b8916634b   Mel Gorman   mm: Prevent paral...
1470
  	anon_vma = page_lock_anon_vma_read(page);
4daae3b4b   Mel Gorman   mm: mempolicy: Us...
1471

c69307d53   Peter Zijlstra   sched/numa: Fix c...
1472
  	/* Confirm the PMD did not change while page_table_lock was released */
82b0f8c39   Jan Kara   mm: join struct f...
1473
1474
  	spin_lock(vmf->ptl);
  	if (unlikely(!pmd_same(pmd, *vmf->pmd))) {
b32967ff1   Mel Gorman   mm: numa: Add THP...
1475
1476
  		unlock_page(page);
  		put_page(page);
a54a407fb   Mel Gorman   mm: Close races b...
1477
  		page_nid = -1;
4daae3b4b   Mel Gorman   mm: mempolicy: Us...
1478
  		goto out_unlock;
b32967ff1   Mel Gorman   mm: numa: Add THP...
1479
  	}
ff9042b11   Mel Gorman   mm: Wait for THP ...
1480

c3a489cac   Mel Gorman   mm: numa: ensure ...
1481
1482
1483
1484
1485
1486
  	/* Bail if we fail to protect against THP splits for any reason */
  	if (unlikely(!anon_vma)) {
  		put_page(page);
  		page_nid = -1;
  		goto clear_pmdnuma;
  	}
a54a407fb   Mel Gorman   mm: Close races b...
1487
  	/*
8b1b436dd   Peter Zijlstra   mm, locking: Rewo...
1488
1489
1490
1491
1492
  	 * Since we took the NUMA fault, we must have observed the !accessible
  	 * bit. Make sure all other CPUs agree with that, to avoid them
  	 * modifying the page we're about to migrate.
  	 *
  	 * Must be done under PTL such that we'll observe the relevant
ccde85ba0   Peter Zijlstra   mm, locking: Fix ...
1493
1494
1495
1496
  	 * inc_tlb_flush_pending().
  	 *
  	 * We are not sure whether a pending TLB flush here is for a huge page
  	 * mapping or not, hence use the TLB range variant.
8b1b436dd   Peter Zijlstra   mm, locking: Rewo...
1497
1498
  	 */
  	if (mm_tlb_flush_pending(vma->vm_mm))
ccde85ba0   Peter Zijlstra   mm, locking: Fix ...
1499
  		flush_tlb_range(vma, haddr, haddr + HPAGE_PMD_SIZE);
8b1b436dd   Peter Zijlstra   mm, locking: Rewo...
1500
1501
  
  	/*
a54a407fb   Mel Gorman   mm: Close races b...
1502
  	 * Migrate the THP to the requested node, returns with page unlocked
8a0516ed8   Mel Gorman   mm: convert p[te|...
1503
  	 * and access rights restored.
a54a407fb   Mel Gorman   mm: Close races b...
1504
  	 */
82b0f8c39   Jan Kara   mm: join struct f...
1505
  	spin_unlock(vmf->ptl);
8b1b436dd   Peter Zijlstra   mm, locking: Rewo...
1506

bae473a42   Kirill A. Shutemov   mm: introduce fau...
1507
  	migrated = migrate_misplaced_transhuge_page(vma->vm_mm, vma,
82b0f8c39   Jan Kara   mm: join struct f...
1508
  				vmf->pmd, pmd, vmf->address, page, target_nid);
6688cc054   Peter Zijlstra   mm: numa: Do not ...
1509
1510
  	if (migrated) {
  		flags |= TNF_MIGRATED;
8191acbd3   Mel Gorman   mm: numa: Sanitiz...
1511
  		page_nid = target_nid;
074c23817   Mel Gorman   mm: numa: slow PT...
1512
1513
  	} else
  		flags |= TNF_MIGRATE_FAIL;
b32967ff1   Mel Gorman   mm: numa: Add THP...
1514

8191acbd3   Mel Gorman   mm: numa: Sanitiz...
1515
  	goto out;
b32967ff1   Mel Gorman   mm: numa: Add THP...
1516
  clear_pmdnuma:
a54a407fb   Mel Gorman   mm: Close races b...
1517
  	BUG_ON(!PageLocked(page));
288bc5494   Aneesh Kumar K.V   mm/autonuma: let ...
1518
  	was_writable = pmd_savedwrite(pmd);
4d9424669   Mel Gorman   mm: convert p[te|...
1519
  	pmd = pmd_modify(pmd, vma->vm_page_prot);
b7b04004e   Mel Gorman   mm: numa: mark hu...
1520
  	pmd = pmd_mkyoung(pmd);
b191f9b10   Mel Gorman   mm: numa: preserv...
1521
1522
  	if (was_writable)
  		pmd = pmd_mkwrite(pmd);
82b0f8c39   Jan Kara   mm: join struct f...
1523
1524
  	set_pmd_at(vma->vm_mm, haddr, vmf->pmd, pmd);
  	update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
a54a407fb   Mel Gorman   mm: Close races b...
1525
  	unlock_page(page);
d10e63f29   Mel Gorman   mm: numa: Create ...
1526
  out_unlock:
82b0f8c39   Jan Kara   mm: join struct f...
1527
  	spin_unlock(vmf->ptl);
b8916634b   Mel Gorman   mm: Prevent paral...
1528
1529
1530
1531
  
  out:
  	if (anon_vma)
  		page_unlock_anon_vma_read(anon_vma);
8191acbd3   Mel Gorman   mm: numa: Sanitiz...
1532
  	if (page_nid != -1)
82b0f8c39   Jan Kara   mm: join struct f...
1533
  		task_numa_fault(last_cpupid, page_nid, HPAGE_PMD_NR,
9a8b300f2   Aneesh Kumar K.V   mm/thp/autonuma: ...
1534
  				flags);
8191acbd3   Mel Gorman   mm: numa: Sanitiz...
1535

d10e63f29   Mel Gorman   mm: numa: Create ...
1536
1537
  	return 0;
  }
319904ad4   Huang Ying   mm, THP: clean up...
1538
1539
1540
1541
1542
  /*
   * Return true if we do MADV_FREE successfully on entire pmd page.
   * Otherwise, return false.
   */
  bool madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
b8d3c4c30   Minchan Kim   mm/huge_memory.c:...
1543
  		pmd_t *pmd, unsigned long addr, unsigned long next)
b8d3c4c30   Minchan Kim   mm/huge_memory.c:...
1544
1545
1546
1547
1548
  {
  	spinlock_t *ptl;
  	pmd_t orig_pmd;
  	struct page *page;
  	struct mm_struct *mm = tlb->mm;
319904ad4   Huang Ying   mm, THP: clean up...
1549
  	bool ret = false;
b8d3c4c30   Minchan Kim   mm/huge_memory.c:...
1550

07e326610   Aneesh Kumar K.V   mm: add tlb_remov...
1551
  	tlb_remove_check_page_size_change(tlb, HPAGE_PMD_SIZE);
b6ec57f4b   Kirill A. Shutemov   thp: change pmd_t...
1552
1553
  	ptl = pmd_trans_huge_lock(pmd, vma);
  	if (!ptl)
25eedabe0   Linus Torvalds   vm: fix incorrect...
1554
  		goto out_unlocked;
b8d3c4c30   Minchan Kim   mm/huge_memory.c:...
1555
1556
  
  	orig_pmd = *pmd;
319904ad4   Huang Ying   mm, THP: clean up...
1557
  	if (is_huge_zero_pmd(orig_pmd))
b8d3c4c30   Minchan Kim   mm/huge_memory.c:...
1558
  		goto out;
b8d3c4c30   Minchan Kim   mm/huge_memory.c:...
1559

84c3fc4e9   Zi Yan   mm: thp: check pm...
1560
1561
1562
1563
1564
  	if (unlikely(!pmd_present(orig_pmd))) {
  		VM_BUG_ON(thp_migration_supported() &&
  				  !is_pmd_migration_entry(orig_pmd));
  		goto out;
  	}
b8d3c4c30   Minchan Kim   mm/huge_memory.c:...
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
  	page = pmd_page(orig_pmd);
  	/*
  	 * If other processes are mapping this page, we can't discard
  	 * the page unless they all do MADV_FREE, so let's skip the page.
  	 */
  	if (page_mapcount(page) != 1)
  		goto out;
  
  	if (!trylock_page(page))
  		goto out;
  
  	/*
  	 * If the user wants to discard part of the THP's pages, split it so MADV_FREE
  	 * will deactivate only them.
  	 */
  	if (next - addr != HPAGE_PMD_SIZE) {
  		get_page(page);
  		spin_unlock(ptl);
9818b8cde   Huang Ying   madvise_free, thp...
1583
  		split_huge_page(page);
b8d3c4c30   Minchan Kim   mm/huge_memory.c:...
1584
  		unlock_page(page);
bbf29ffc7   Kirill A. Shutemov   thp, mm: fix cras...
1585
  		put_page(page);
b8d3c4c30   Minchan Kim   mm/huge_memory.c:...
1586
1587
1588
1589
1590
1591
  		goto out_unlocked;
  	}
  
  	if (PageDirty(page))
  		ClearPageDirty(page);
  	unlock_page(page);
b8d3c4c30   Minchan Kim   mm/huge_memory.c:...
1592
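  	/*
  	 * Clear the young and dirty bits in the pmd so reclaim will treat
  	 * the page as clean and can discard it without writeback, which is
  	 * the point of MADV_FREE.
  	 */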
  	if (pmd_young(orig_pmd) || pmd_dirty(orig_pmd)) {
58ceeb6be   Kirill A. Shutemov   thp: fix MADV_DON...
1593
  		pmdp_invalidate(vma, addr, pmd);
b8d3c4c30   Minchan Kim   mm/huge_memory.c:...
1594
1595
1596
1597
1598
1599
  		orig_pmd = pmd_mkold(orig_pmd);
  		orig_pmd = pmd_mkclean(orig_pmd);
  
  		set_pmd_at(mm, addr, pmd, orig_pmd);
  		tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
  	}
802a3a92a   Shaohua Li   mm: reclaim MADV_...
1600
1601
  
  	mark_page_lazyfree(page);
319904ad4   Huang Ying   mm, THP: clean up...
1602
  	ret = true;
b8d3c4c30   Minchan Kim   mm/huge_memory.c:...
1603
1604
1605
1606
1607
  out:
  	spin_unlock(ptl);
  out_unlocked:
  	return ret;
  }
953c66c2b   Aneesh Kumar K.V   mm: THP page cach...
1608
1609
1610
1611
1612
1613
1614
1615
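  /*
   * Free the page table page deposited for a huge pmd and drop it from the
   * mm's page table accounting.
   */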
  static inline void zap_deposited_table(struct mm_struct *mm, pmd_t *pmd)
  {
  	pgtable_t pgtable;
  
  	pgtable = pgtable_trans_huge_withdraw(mm, pmd);
  	pte_free(mm, pgtable);
  	atomic_long_dec(&mm->nr_ptes);
  }
71e3aac07   Andrea Arcangeli   thp: transparent ...
1616
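  /*
   * Tear down one huge pmd during zap: clear the pmd, update rmap and the
   * mm counters for the mapped page (or migration entry), free any
   * deposited page table, and queue the TLB flush through @tlb.
   */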
  int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
f21760b15   Shaohua Li   thp: add tlb_remo...
1617
  		 pmd_t *pmd, unsigned long addr)
71e3aac07   Andrea Arcangeli   thp: transparent ...
1618
  {
da1467690   Kirill A. Shutemov   thp: fix zap_huge...
1619
  	pmd_t orig_pmd;
bf929152e   Kirill A. Shutemov   mm, thp: change p...
1620
  	spinlock_t *ptl;
71e3aac07   Andrea Arcangeli   thp: transparent ...
1621

07e326610   Aneesh Kumar K.V   mm: add tlb_remov...
1622
  	tlb_remove_check_page_size_change(tlb, HPAGE_PMD_SIZE);
b6ec57f4b   Kirill A. Shutemov   thp: change pmd_t...
1623
1624
  	ptl = __pmd_trans_huge_lock(pmd, vma);
  	if (!ptl)
da1467690   Kirill A. Shutemov   thp: fix zap_huge...
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
  		return 0;
  	/*
  	 * For architectures like ppc64 we look at deposited pgtable
  	 * when calling pmdp_huge_get_and_clear. So do the
  	 * pgtable_trans_huge_withdraw after finishing pmdp related
  	 * operations.
  	 */
  	orig_pmd = pmdp_huge_get_and_clear_full(tlb->mm, addr, pmd,
  			tlb->fullmm);
  	tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
  	if (vma_is_dax(vma)) {
3b6521f53   Oliver O'Halloran   mm/huge_memory.c:...
1636
1637
  		if (arch_needs_pgtable_deposit())
  			zap_deposited_table(tlb->mm, pmd);
da1467690   Kirill A. Shutemov   thp: fix zap_huge...
1638
1639
  		spin_unlock(ptl);
  		if (is_huge_zero_pmd(orig_pmd))
c0f2e176f   Aneesh Kumar K.V   mm: use the corre...
1640
  			tlb_remove_page_size(tlb, pmd_page(orig_pmd), HPAGE_PMD_SIZE);
da1467690   Kirill A. Shutemov   thp: fix zap_huge...
1641
  	} else if (is_huge_zero_pmd(orig_pmd)) {
c14a6eb44   Oliver O'Halloran   mm/huge_memory.c:...
1642
  		zap_deposited_table(tlb->mm, pmd);
da1467690   Kirill A. Shutemov   thp: fix zap_huge...
1643
  		spin_unlock(ptl);
c0f2e176f   Aneesh Kumar K.V   mm: use the corre...
1644
  		tlb_remove_page_size(tlb, pmd_page(orig_pmd), HPAGE_PMD_SIZE);
da1467690   Kirill A. Shutemov   thp: fix zap_huge...
1645
  	} else {
616b83715   Zi Yan   mm: thp: enable t...
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
  		struct page *page = NULL;
  		int flush_needed = 1;
  
  		if (pmd_present(orig_pmd)) {
  			page = pmd_page(orig_pmd);
  			page_remove_rmap(page, true);
  			VM_BUG_ON_PAGE(page_mapcount(page) < 0, page);
  			VM_BUG_ON_PAGE(!PageHead(page), page);
  		} else if (thp_migration_supported()) {
  			swp_entry_t entry;
  
  			VM_BUG_ON(!is_pmd_migration_entry(orig_pmd));
  			entry = pmd_to_swp_entry(orig_pmd);
  			page = pfn_to_page(swp_offset(entry));
  			flush_needed = 0;
  		} else
  			WARN_ONCE(1, "Non present huge pmd without pmd migration enabled!");
b5072380e   Kirill A. Shutemov   thp: support file...
1663
  		if (PageAnon(page)) {
c14a6eb44   Oliver O'Halloran   mm/huge_memory.c:...
1664
  			zap_deposited_table(tlb->mm, pmd);
b5072380e   Kirill A. Shutemov   thp: support file...
1665
1666
  			add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR);
  		} else {
953c66c2b   Aneesh Kumar K.V   mm: THP page cach...
1667
1668
  			if (arch_needs_pgtable_deposit())
  				zap_deposited_table(tlb->mm, pmd);
b5072380e   Kirill A. Shutemov   thp: support file...
1669
1670
  			add_mm_counter(tlb->mm, MM_FILEPAGES, -HPAGE_PMD_NR);
  		}
616b83715   Zi Yan   mm: thp: enable t...
1671

da1467690   Kirill A. Shutemov   thp: fix zap_huge...
1672
  		spin_unlock(ptl);
616b83715   Zi Yan   mm: thp: enable t...
1673
1674
  		if (flush_needed)
  			tlb_remove_page_size(tlb, page, HPAGE_PMD_SIZE);
025c5b245   Naoya Horiguchi   thp: optimize awa...
1675
  	}
da1467690   Kirill A. Shutemov   thp: fix zap_huge...
1676
  	return 1;
71e3aac07   Andrea Arcangeli   thp: transparent ...
1677
  }
1dd38b6c2   Aneesh Kumar K.V   mm: move vma_is_a...
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
  #ifndef pmd_move_must_withdraw
  static inline int pmd_move_must_withdraw(spinlock_t *new_pmd_ptl,
  					 spinlock_t *old_pmd_ptl,
  					 struct vm_area_struct *vma)
  {
  	/*
  	 * With split pmd lock we also need to move preallocated
  	 * PTE page table if new_pmd is on a different PMD page table.
  	 *
  	 * We also don't deposit and withdraw tables for file pages.
  	 */
  	return (new_pmd_ptl != old_pmd_ptl) && vma_is_anonymous(vma);
  }
  #endif
ab6e3d093   Naoya Horiguchi   mm: soft-dirty: k...
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
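  /*
   * Mark a pmd moved by mremap() soft-dirty so userspace soft-dirty
   * trackers do not lose the move; migration entries are handled too.
   */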
  static pmd_t move_soft_dirty_pmd(pmd_t pmd)
  {
  #ifdef CONFIG_MEM_SOFT_DIRTY
  	if (unlikely(is_pmd_migration_entry(pmd)))
  		pmd = pmd_swp_mksoft_dirty(pmd);
  	else if (pmd_present(pmd))
  		pmd = pmd_mksoft_dirty(pmd);
  #endif
  	return pmd;
  }
bf8616d5f   Hugh Dickins   huge mm: move_hug...
1702
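  /*
   * mremap() support: move a huge pmd (and any deposited page table) from
   * @old_pmd to @new_pmd without splitting; returns false if the range is
   * not suitably aligned or the pmd is no longer huge.
   */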
  bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr,
37a1c49a9   Andrea Arcangeli   thp: mremap suppo...
1703
  		  unsigned long new_addr, unsigned long old_end,
541500abf   Linus Torvalds   mremap: properly ...
1704
  		  pmd_t *old_pmd, pmd_t *new_pmd)
37a1c49a9   Andrea Arcangeli   thp: mremap suppo...
1705
  {
bf929152e   Kirill A. Shutemov   mm, thp: change p...
1706
  	spinlock_t *old_ptl, *new_ptl;
37a1c49a9   Andrea Arcangeli   thp: mremap suppo...
1707
  	pmd_t pmd;
37a1c49a9   Andrea Arcangeli   thp: mremap suppo...
1708
  	struct mm_struct *mm = vma->vm_mm;
5d1904204   Aaron Lu   mremap: fix race ...
1709
  	bool force_flush = false;
37a1c49a9   Andrea Arcangeli   thp: mremap suppo...
1710
1711
1712
  
  	if ((old_addr & ~HPAGE_PMD_MASK) ||
  	    (new_addr & ~HPAGE_PMD_MASK) ||
bf8616d5f   Hugh Dickins   huge mm: move_hug...
1713
  	    old_end - old_addr < HPAGE_PMD_SIZE)
4b471e889   Kirill A. Shutemov   mm, thp: remove i...
1714
  		return false;
37a1c49a9   Andrea Arcangeli   thp: mremap suppo...
1715
1716
1717
1718
1719
1720
1721
  
  	/*
  	 * The destination pmd shouldn't be established, free_pgtables()
  	 * should have released it.
  	 */
  	if (WARN_ON(!pmd_none(*new_pmd))) {
  		VM_BUG_ON(pmd_trans_huge(*new_pmd));
4b471e889   Kirill A. Shutemov   mm, thp: remove i...
1722
  		return false;
37a1c49a9   Andrea Arcangeli   thp: mremap suppo...
1723
  	}
bf929152e   Kirill A. Shutemov   mm, thp: change p...
1724
1725
1726
1727
  	/*
  	 * We don't have to worry about the ordering of src and dst
  	 * ptlocks because exclusive mmap_sem prevents deadlock.
  	 */
b6ec57f4b   Kirill A. Shutemov   thp: change pmd_t...
1728
1729
  	old_ptl = __pmd_trans_huge_lock(old_pmd, vma);
  	if (old_ptl) {
bf929152e   Kirill A. Shutemov   mm, thp: change p...
1730
1731
1732
  		new_ptl = pmd_lockptr(mm, new_pmd);
  		if (new_ptl != old_ptl)
  			spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
8809aa2d2   Aneesh Kumar K.V   mm: clarify that ...
1733
  		pmd = pmdp_huge_get_and_clear(mm, old_addr, old_pmd);
541500abf   Linus Torvalds   mremap: properly ...
1734
  		if (pmd_present(pmd))
a2ce2666a   Aaron Lu   mremap: move_ptes...
1735
  			force_flush = true;
025c5b245   Naoya Horiguchi   thp: optimize awa...
1736
  		VM_BUG_ON(!pmd_none(*new_pmd));
3592806cf   Kirill A. Shutemov   thp: move preallo...
1737

1dd38b6c2   Aneesh Kumar K.V   mm: move vma_is_a...
1738
  		if (pmd_move_must_withdraw(new_ptl, old_ptl, vma)) {
b3084f4db   Aneesh Kumar K.V   powerpc/thp: Fix ...
1739
  			pgtable_t pgtable;
3592806cf   Kirill A. Shutemov   thp: move preallo...
1740
1741
  			pgtable = pgtable_trans_huge_withdraw(mm, old_pmd);
  			pgtable_trans_huge_deposit(mm, new_pmd, pgtable);
3592806cf   Kirill A. Shutemov   thp: move preallo...
1742
  		}
ab6e3d093   Naoya Horiguchi   mm: soft-dirty: k...
1743
1744
  		pmd = move_soft_dirty_pmd(pmd);
  		set_pmd_at(mm, new_addr, new_pmd, pmd);
5d1904204   Aaron Lu   mremap: fix race ...
1745
1746
  		if (force_flush)
  			flush_tlb_range(vma, old_addr, old_addr + PMD_SIZE);
541500abf   Linus Torvalds   mremap: properly ...
1747
1748
  		if (new_ptl != old_ptl)
  			spin_unlock(new_ptl);
bf929152e   Kirill A. Shutemov   mm, thp: change p...
1749
  		spin_unlock(old_ptl);
4b471e889   Kirill A. Shutemov   mm, thp: remove i...
1750
  		return true;
37a1c49a9   Andrea Arcangeli   thp: mremap suppo...
1751
  	}
4b471e889   Kirill A. Shutemov   mm, thp: remove i...
1752
  	return false;
37a1c49a9   Andrea Arcangeli   thp: mremap suppo...
1753
  }
f123d74ab   Mel Gorman   mm: Only flush TL...
1754
1755
1756
1757
1758
1759
  /*
   * Returns
   *  - 0 if PMD could not be locked
   *  - 1 if PMD was locked but protections unchanged and TLB flush unnecessary
   *  - HPAGE_PMD_NR if protections changed and TLB flush necessary
   */
cd7548ab3   Johannes Weiner   thp: mprotect: tr...
1760
  int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
e944fd67b   Mel Gorman   mm: numa: do not ...
1761
  		unsigned long addr, pgprot_t newprot, int prot_numa)
cd7548ab3   Johannes Weiner   thp: mprotect: tr...
1762
1763
  {
  	struct mm_struct *mm = vma->vm_mm;
bf929152e   Kirill A. Shutemov   mm, thp: change p...
1764
  	spinlock_t *ptl;
0a85e51d3   Kirill A. Shutemov   thp: reduce inden...
1765
1766
1767
  	pmd_t entry;
  	bool preserve_write;
  	int ret;
cd7548ab3   Johannes Weiner   thp: mprotect: tr...
1768

b6ec57f4b   Kirill A. Shutemov   thp: change pmd_t...
1769
  	ptl = __pmd_trans_huge_lock(pmd, vma);
0a85e51d3   Kirill A. Shutemov   thp: reduce inden...
1770
1771
  	if (!ptl)
  		return 0;
e944fd67b   Mel Gorman   mm: numa: do not ...
1772

0a85e51d3   Kirill A. Shutemov   thp: reduce inden...
1773
1774
  	preserve_write = prot_numa && pmd_write(*pmd);
  	ret = 1;
e944fd67b   Mel Gorman   mm: numa: do not ...
1775

84c3fc4e9   Zi Yan   mm: thp: check pm...
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
  #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
  	if (is_swap_pmd(*pmd)) {
  		swp_entry_t entry = pmd_to_swp_entry(*pmd);
  
  		VM_BUG_ON(!is_pmd_migration_entry(*pmd));
  		if (is_write_migration_entry(entry)) {
  			pmd_t newpmd;
  			/*
  			 * A protection check is difficult so
  			 * just be safe and disable write
  			 */
  			make_migration_entry_read(&entry);
  			newpmd = swp_entry_to_pmd(entry);
ab6e3d093   Naoya Horiguchi   mm: soft-dirty: k...
1789
1790
  			if (pmd_swp_soft_dirty(*pmd))
  				newpmd = pmd_swp_mksoft_dirty(newpmd);
84c3fc4e9   Zi Yan   mm: thp: check pm...
1791
1792
1793
1794
1795
  			set_pmd_at(mm, addr, pmd, newpmd);
  		}
  		goto unlock;
  	}
  #endif
0a85e51d3   Kirill A. Shutemov   thp: reduce inden...
1796
1797
1798
1799
1800
1801
1802
  	/*
  	 * Avoid trapping faults against the zero page. The read-only
  	 * data is likely to be read-cached on the local CPU and
  	 * local/remote hits to the zero page are not interesting.
  	 */
  	if (prot_numa && is_huge_zero_pmd(*pmd))
  		goto unlock;
025c5b245   Naoya Horiguchi   thp: optimize awa...
1803

0a85e51d3   Kirill A. Shutemov   thp: reduce inden...
1804
1805
  	if (prot_numa && pmd_protnone(*pmd))
  		goto unlock;
ced108037   Kirill A. Shutemov   thp: fix MADV_DON...
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
  	/*
  	 * In the prot_numa case, we are under down_read(mmap_sem). It's critical
  	 * to not clear pmd intermittently to avoid race with MADV_DONTNEED
  	 * which is also under down_read(mmap_sem):
  	 *
  	 *	CPU0:				CPU1:
  	 *				change_huge_pmd(prot_numa=1)
  	 *				 pmdp_huge_get_and_clear_notify()
  	 * madvise_dontneed()
  	 *  zap_pmd_range()
  	 *   pmd_trans_huge(*pmd) == 0 (without ptl)
  	 *   // skip the pmd
  	 *				 set_pmd_at();
  	 *				 // pmd is re-established
  	 *
  	 * The race makes MADV_DONTNEED miss the huge pmd and not clear it,
  	 * which may break userspace.
  	 *
  	 * pmdp_invalidate() is required to make sure we don't miss
  	 * dirty/young flags set by hardware.
  	 */
  	entry = *pmd;
  	pmdp_invalidate(vma, addr, pmd);
  
  	/*
  	 * Recover dirty/young flags.  It relies on pmdp_invalidate to not
  	 * corrupt them.
  	 */
  	if (pmd_dirty(*pmd))
  		entry = pmd_mkdirty(entry);
  	if (pmd_young(*pmd))
  		entry = pmd_mkyoung(entry);
0a85e51d3   Kirill A. Shutemov   thp: reduce inden...
1838
1839
1840
1841
1842
1843
1844
1845
  	entry = pmd_modify(entry, newprot);
  	if (preserve_write)
  		entry = pmd_mk_savedwrite(entry);
  	ret = HPAGE_PMD_NR;
  	set_pmd_at(mm, addr, pmd, entry);
  	BUG_ON(vma_is_anonymous(vma) && !preserve_write && pmd_write(entry));
  unlock:
  	spin_unlock(ptl);
025c5b245   Naoya Horiguchi   thp: optimize awa...
1846
1847
1848
1849
  	return ret;
  }
  
  /*
8f19b0c05   Huang Ying   thp: fix comments...
1850
   * Returns page table lock pointer if a given pmd maps a thp, NULL otherwise.
025c5b245   Naoya Horiguchi   thp: optimize awa...
1851
   *
8f19b0c05   Huang Ying   thp: fix comments...
1852
1853
   * Note that if it returns page table lock pointer, this routine returns without
   * unlocking page table lock. So callers must unlock it.
025c5b245   Naoya Horiguchi   thp: optimize awa...
1854
   */
b6ec57f4b   Kirill A. Shutemov   thp: change pmd_t...
1855
  spinlock_t *__pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma)
025c5b245   Naoya Horiguchi   thp: optimize awa...
1856
  {
b6ec57f4b   Kirill A. Shutemov   thp: change pmd_t...
1857
1858
  	spinlock_t *ptl;
  	ptl = pmd_lock(vma->vm_mm, pmd);
84c3fc4e9   Zi Yan   mm: thp: check pm...
1859
1860
  	if (likely(is_swap_pmd(*pmd) || pmd_trans_huge(*pmd) ||
  			pmd_devmap(*pmd)))
b6ec57f4b   Kirill A. Shutemov   thp: change pmd_t...
1861
1862
1863
  		return ptl;
  	spin_unlock(ptl);
  	return NULL;
cd7548ab3   Johannes Weiner   thp: mprotect: tr...
1864
  }
a00cc7d9d   Matthew Wilcox   mm, x86: add supp...
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
  /*
   * Returns true if a given pud maps a thp, false otherwise.
   *
   * Note that if it returns true, this routine returns without unlocking page
   * table lock. So callers must unlock it.
   */
  spinlock_t *__pud_trans_huge_lock(pud_t *pud, struct vm_area_struct *vma)
  {
  	spinlock_t *ptl;
  
  	ptl = pud_lock(vma->vm_mm, pud);
  	if (likely(pud_trans_huge(*pud) || pud_devmap(*pud)))
  		return ptl;
  	spin_unlock(ptl);
  	return NULL;
  }
  
  #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
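  /*
   * Tear down one huge pud. Only DAX puds exist so far; hitting an
   * anonymous huge pud here is a bug.
   */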
  int zap_huge_pud(struct mmu_gather *tlb, struct vm_area_struct *vma,
  		 pud_t *pud, unsigned long addr)
  {
  	pud_t orig_pud;
  	spinlock_t *ptl;
  
  	ptl = __pud_trans_huge_lock(pud, vma);
  	if (!ptl)
  		return 0;
  	/*
  	 * For architectures like ppc64 we look at deposited pgtable
  	 * when calling pudp_huge_get_and_clear. So do the
  	 * pgtable_trans_huge_withdraw after finishing pudp related
  	 * operations.
  	 */
  	orig_pud = pudp_huge_get_and_clear_full(tlb->mm, addr, pud,
  			tlb->fullmm);
  	tlb_remove_pud_tlb_entry(tlb, pud, addr);
  	if (vma_is_dax(vma)) {
  		spin_unlock(ptl);
  		/* No zero page support yet */
  	} else {
  		/* No support for anonymous PUD pages yet */
  		BUG();
  	}
  	return 1;
  }
  
  static void __split_huge_pud_locked(struct vm_area_struct *vma, pud_t *pud,
  		unsigned long haddr)
  {
  	VM_BUG_ON(haddr & ~HPAGE_PUD_MASK);
  	VM_BUG_ON_VMA(vma->vm_start > haddr, vma);
  	VM_BUG_ON_VMA(vma->vm_end < haddr + HPAGE_PUD_SIZE, vma);
  	VM_BUG_ON(!pud_trans_huge(*pud) && !pud_devmap(*pud));
ce9311cf9   Yisheng Xie   mm/vmstats: add t...
1918
  	count_vm_event(THP_SPLIT_PUD);
a00cc7d9d   Matthew Wilcox   mm, x86: add supp...
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
  
  	pudp_huge_clear_flush_notify(vma, haddr, pud);
  }
  
  void __split_huge_pud(struct vm_area_struct *vma, pud_t *pud,
  		unsigned long address)
  {
  	spinlock_t *ptl;
  	struct mm_struct *mm = vma->vm_mm;
  	unsigned long haddr = address & HPAGE_PUD_MASK;
  
  	mmu_notifier_invalidate_range_start(mm, haddr, haddr + HPAGE_PUD_SIZE);
  	ptl = pud_lock(mm, pud);
  	if (unlikely(!pud_trans_huge(*pud) && !pud_devmap(*pud)))
  		goto out;
  	__split_huge_pud_locked(vma, pud, haddr);
  
  out:
  	spin_unlock(ptl);
  	mmu_notifier_invalidate_range_end(mm, haddr, haddr + HPAGE_PUD_SIZE);
  }
  #endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
eef1b3ba0   Kirill A. Shutemov   thp: implement sp...
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
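  /*
   * Split a pmd mapping the huge zero page: repopulate the range with a
   * page table of pte_special() entries that point at the small zero page.
   */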
  static void __split_huge_zero_page_pmd(struct vm_area_struct *vma,
  		unsigned long haddr, pmd_t *pmd)
  {
  	struct mm_struct *mm = vma->vm_mm;
  	pgtable_t pgtable;
  	pmd_t _pmd;
  	int i;
  
  	/* leave pmd empty until pte is filled */
  	pmdp_huge_clear_flush_notify(vma, haddr, pmd);
  
  	pgtable = pgtable_trans_huge_withdraw(mm, pmd);
  	pmd_populate(mm, &_pmd, pgtable);
  
  	for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
  		pte_t *pte, entry;
  		entry = pfn_pte(my_zero_pfn(haddr), vma->vm_page_prot);
  		entry = pte_mkspecial(entry);
  		pte = pte_offset_map(&_pmd, haddr);
  		VM_BUG_ON(!pte_none(*pte));
  		set_pte_at(mm, haddr, pte, entry);
  		pte_unmap(pte);
  	}
  	smp_wmb(); /* make pte visible before pmd */
  	pmd_populate(mm, pmd, pgtable);
eef1b3ba0   Kirill A. Shutemov   thp: implement sp...
1966
1967
1968
  }
  
  static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
ba9882808   Kirill A. Shutemov   thp: add option t...
1969
  		unsigned long haddr, bool freeze)
eef1b3ba0   Kirill A. Shutemov   thp: implement sp...
1970
1971
1972
1973
1974
  {
  	struct mm_struct *mm = vma->vm_mm;
  	struct page *page;
  	pgtable_t pgtable;
  	pmd_t _pmd;
84c3fc4e9   Zi Yan   mm: thp: check pm...
1975
  	bool young, write, dirty, soft_dirty, pmd_migration = false;
2ac015e29   Kirill A. Shutemov   thp: call pmdp_in...
1976
  	unsigned long addr;
eef1b3ba0   Kirill A. Shutemov   thp: implement sp...
1977
1978
1979
1980
1981
  	int i;
  
  	VM_BUG_ON(haddr & ~HPAGE_PMD_MASK);
  	VM_BUG_ON_VMA(vma->vm_start > haddr, vma);
  	VM_BUG_ON_VMA(vma->vm_end < haddr + HPAGE_PMD_SIZE, vma);
84c3fc4e9   Zi Yan   mm: thp: check pm...
1982
1983
  	VM_BUG_ON(!is_pmd_migration_entry(*pmd) && !pmd_trans_huge(*pmd)
  				&& !pmd_devmap(*pmd));
eef1b3ba0   Kirill A. Shutemov   thp: implement sp...
1984
1985
  
  	count_vm_event(THP_SPLIT_PMD);
d21b9e57c   Kirill A. Shutemov   thp: handle file ...
1986
1987
  	if (!vma_is_anonymous(vma)) {
  		_pmd = pmdp_huge_clear_flush_notify(vma, haddr, pmd);
953c66c2b   Aneesh Kumar K.V   mm: THP page cach...
1988
1989
1990
1991
1992
1993
  		/*
  		 * We are going to unmap this huge page. So
  		 * just go ahead and zap it
  		 */
  		if (arch_needs_pgtable_deposit())
  			zap_deposited_table(mm, pmd);
d21b9e57c   Kirill A. Shutemov   thp: handle file ...
1994
1995
1996
  		if (vma_is_dax(vma))
  			return;
  		page = pmd_page(_pmd);
70ef1db1f   Hugh Dickins   mm/huge_memory.c:...
1997
1998
  		if (!PageDirty(page) && pmd_dirty(_pmd))
  			set_page_dirty(page);
d21b9e57c   Kirill A. Shutemov   thp: handle file ...
1999
2000
2001
2002
2003
  		if (!PageReferenced(page) && pmd_young(_pmd))
  			SetPageReferenced(page);
  		page_remove_rmap(page, true);
  		put_page(page);
  		add_mm_counter(mm, MM_FILEPAGES, -HPAGE_PMD_NR);
eef1b3ba0   Kirill A. Shutemov   thp: implement sp...
2004
2005
2006
2007
  		return;
  	} else if (is_huge_zero_pmd(*pmd)) {
  		return __split_huge_zero_page_pmd(vma, haddr, pmd);
  	}
84c3fc4e9   Zi Yan   mm: thp: check pm...
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
  #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
  	pmd_migration = is_pmd_migration_entry(*pmd);
  	if (pmd_migration) {
  		swp_entry_t entry;
  
  		entry = pmd_to_swp_entry(*pmd);
  		page = pfn_to_page(swp_offset(entry));
  	} else
  #endif
  		page = pmd_page(*pmd);
eef1b3ba0   Kirill A. Shutemov   thp: implement sp...
2018
  	VM_BUG_ON_PAGE(!page_count(page), page);
fe896d187   Joonsoo Kim   mm: introduce pag...
2019
  	page_ref_add(page, HPAGE_PMD_NR - 1);
eef1b3ba0   Kirill A. Shutemov   thp: implement sp...
2020
2021
  	write = pmd_write(*pmd);
  	young = pmd_young(*pmd);
b8d3c4c30   Minchan Kim   mm/huge_memory.c:...
2022
  	dirty = pmd_dirty(*pmd);
804dd1504   Andrea Arcangeli   soft_dirty: fix s...
2023
  	soft_dirty = pmd_soft_dirty(*pmd);
eef1b3ba0   Kirill A. Shutemov   thp: implement sp...
2024

c777e2a8b   Aneesh Kumar K.V   powerpc/mm: Fix M...
2025
  	pmdp_huge_split_prepare(vma, haddr, pmd);
eef1b3ba0   Kirill A. Shutemov   thp: implement sp...
2026
2027
  	pgtable = pgtable_trans_huge_withdraw(mm, pmd);
  	pmd_populate(mm, &_pmd, pgtable);
2ac015e29   Kirill A. Shutemov   thp: call pmdp_in...
2028
  	for (i = 0, addr = haddr; i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE) {
eef1b3ba0   Kirill A. Shutemov   thp: implement sp...
2029
2030
2031
2032
2033
2034
  		pte_t entry, *pte;
  		/*
  		 * Note that NUMA hinting access restrictions are not
  		 * transferred to avoid any possibility of altering
  		 * permissions across VMAs.
  		 */
84c3fc4e9   Zi Yan   mm: thp: check pm...
2035
  		if (freeze || pmd_migration) {
ba9882808   Kirill A. Shutemov   thp: add option t...
2036
2037
2038
  			swp_entry_t swp_entry;
  			swp_entry = make_migration_entry(page + i, write);
  			entry = swp_entry_to_pte(swp_entry);
804dd1504   Andrea Arcangeli   soft_dirty: fix s...
2039
2040
  			if (soft_dirty)
  				entry = pte_swp_mksoft_dirty(entry);
ba9882808   Kirill A. Shutemov   thp: add option t...
2041
  		} else {
6d2329f88   Andrea Arcangeli   mm: vm_page_prot:...
2042
  			entry = mk_pte(page + i, READ_ONCE(vma->vm_page_prot));
b8d3c4c30   Minchan Kim   mm/huge_memory.c:...
2043
  			entry = maybe_mkwrite(entry, vma);
ba9882808   Kirill A. Shutemov   thp: add option t...
2044
2045
2046
2047
  			if (!write)
  				entry = pte_wrprotect(entry);
  			if (!young)
  				entry = pte_mkold(entry);
804dd1504   Andrea Arcangeli   soft_dirty: fix s...
2048
2049
  			if (soft_dirty)
  				entry = pte_mksoft_dirty(entry);
ba9882808   Kirill A. Shutemov   thp: add option t...
2050
  		}
b8d3c4c30   Minchan Kim   mm/huge_memory.c:...
2051
2052
  		if (dirty)
  			SetPageDirty(page + i);
2ac015e29   Kirill A. Shutemov   thp: call pmdp_in...
2053
  		pte = pte_offset_map(&_pmd, addr);
eef1b3ba0   Kirill A. Shutemov   thp: implement sp...
2054
  		BUG_ON(!pte_none(*pte));
2ac015e29   Kirill A. Shutemov   thp: call pmdp_in...
2055
  		set_pte_at(mm, addr, pte, entry);
eef1b3ba0   Kirill A. Shutemov   thp: implement sp...
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
  		atomic_inc(&page[i]._mapcount);
  		pte_unmap(pte);
  	}
  
  	/*
  	 * Set PG_double_map before dropping compound_mapcount to avoid
  	 * false-negative page_mapped().
  	 */
  	if (compound_mapcount(page) > 1 && !TestSetPageDoubleMap(page)) {
  		for (i = 0; i < HPAGE_PMD_NR; i++)
  			atomic_inc(&page[i]._mapcount);
  	}
  
  	if (atomic_add_negative(-1, compound_mapcount_ptr(page))) {
  		/* Last compound_mapcount is gone. */
11fb99898   Mel Gorman   mm: move most fil...
2071
  		__dec_node_page_state(page, NR_ANON_THPS);
eef1b3ba0   Kirill A. Shutemov   thp: implement sp...
2072
2073
2074
2075
2076
2077
2078
2079
  		if (TestClearPageDoubleMap(page)) {
  			/* No need for the mapcount reference anymore */
  			for (i = 0; i < HPAGE_PMD_NR; i++)
  				atomic_dec(&page[i]._mapcount);
  		}
  	}
  
  	smp_wmb(); /* make pte visible before pmd */
e9b61f198   Kirill A. Shutemov   thp: reintroduce ...
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
  	/*
  	 * Up to this point the pmd is present and huge and userland has the
  	 * whole access to the hugepage during the split (which happens in
  	 * place). If we overwrite the pmd with the not-huge version pointing
  	 * to the pte here (which of course we could if all CPUs were bug
  	 * free), userland could trigger a small page size TLB miss on the
  	 * small sized TLB while the hugepage TLB entry is still established in
  	 * the huge TLB. Some CPUs don't like that.
  	 * See http://support.amd.com/us/Processor_TechDocs/41322.pdf, Erratum
  	 * 383 on page 93. Intel should be safe but also warns that it's
  	 * only safe if the permission and cache attributes of the two entries
  	 * loaded in the two TLBs are identical (which should be the case here).
  	 * But it is generally safer to never allow small and huge TLB entries
  	 * for the same virtual address to be loaded simultaneously. So instead
  	 * of doing "pmd_populate(); flush_pmd_tlb_range();" we first mark the
  	 * current pmd notpresent (atomically because here the pmd_trans_huge
  	 * and pmd_trans_splitting must remain set at all times on the pmd
  	 * until the split is complete for this pmd), then we flush the SMP TLB
  	 * and finally we write the non-huge version of the pmd entry with
  	 * pmd_populate.
  	 */
  	pmdp_invalidate(vma, haddr, pmd);
eef1b3ba0   Kirill A. Shutemov   thp: implement sp...
2102
  	pmd_populate(mm, pmd, pgtable);
e9b61f198   Kirill A. Shutemov   thp: reintroduce ...
2103
2104
  
  	if (freeze) {
2ac015e29   Kirill A. Shutemov   thp: call pmdp_in...
2105
  		for (i = 0; i < HPAGE_PMD_NR; i++) {
e9b61f198   Kirill A. Shutemov   thp: reintroduce ...
2106
2107
2108
2109
  			page_remove_rmap(page + i, false);
  			put_page(page + i);
  		}
  	}
eef1b3ba0   Kirill A. Shutemov   thp: implement sp...
2110
2111
2112
  }
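
  /*
   * A minimal sketch of the pmd split ordering described in the comment in
   * __split_huge_pmd_locked() above, assuming a hypothetical helper name; the
   * real work is done by that function. The only point is the sequence: the
   * ptes are written and made visible first, the huge pmd is made not-present
   * (with a TLB flush in the generic pmdp_invalidate()), and only then is the
   * pte table exposed through the pmd, so small and huge TLB entries for the
   * same virtual address never coexist.
   */
  static void __maybe_unused pmd_split_order_sketch(struct vm_area_struct *vma,
  		unsigned long haddr, pmd_t *pmd, pgtable_t pgtable)
  {
  	smp_wmb();				/* ptes visible before the pmd */
  	pmdp_invalidate(vma, haddr, pmd);	/* huge pmd not-present + TLB flush */
  	pmd_populate(vma->vm_mm, pmd, pgtable);	/* now install the pte table */
  }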
  
  void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
33f4751e9   Naoya Horiguchi   mm: thp: move pmd...
2113
  		unsigned long address, bool freeze, struct page *page)
eef1b3ba0   Kirill A. Shutemov   thp: implement sp...
2114
2115
2116
2117
2118
2119
2120
  {
  	spinlock_t *ptl;
  	struct mm_struct *mm = vma->vm_mm;
  	unsigned long haddr = address & HPAGE_PMD_MASK;
  
  	mmu_notifier_invalidate_range_start(mm, haddr, haddr + HPAGE_PMD_SIZE);
  	ptl = pmd_lock(mm, pmd);
33f4751e9   Naoya Horiguchi   mm: thp: move pmd...
2121
2122
2123
2124
2125
2126
2127
2128
  
  	/*
  	 * If the caller asks to set up migration entries, we need a page to
  	 * check the pmd against. Otherwise we can end up replacing the wrong page.
  	 */
  	VM_BUG_ON(freeze && !page);
  	if (page && page != pmd_page(*pmd))
  		goto out;
5c7fb56e5   Dan Williams   mm, dax: dax-pmd ...
2129
  	if (pmd_trans_huge(*pmd)) {
33f4751e9   Naoya Horiguchi   mm: thp: move pmd...
2130
  		page = pmd_page(*pmd);
5c7fb56e5   Dan Williams   mm, dax: dax-pmd ...
2131
  		if (PageMlocked(page))
5f7377147   Kirill A. Shutemov   thp: fix deadlock...
2132
  			clear_page_mlock(page);
84c3fc4e9   Zi Yan   mm: thp: check pm...
2133
  	} else if (!(pmd_devmap(*pmd) || is_pmd_migration_entry(*pmd)))
e90309c9f   Kirill A. Shutemov   thp: allow mlocke...
2134
  		goto out;
fec89c109   Kirill A. Shutemov   thp: rewrite free...
2135
  	__split_huge_pmd_locked(vma, pmd, haddr, freeze);
e90309c9f   Kirill A. Shutemov   thp: allow mlocke...
2136
  out:
eef1b3ba0   Kirill A. Shutemov   thp: implement sp...
2137
2138
2139
  	spin_unlock(ptl);
  	mmu_notifier_invalidate_range_end(mm, haddr, haddr + HPAGE_PMD_SIZE);
  }
fec89c109   Kirill A. Shutemov   thp: rewrite free...
2140
2141
  void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address,
  		bool freeze, struct page *page)
94fcc585f   Andrea Arcangeli   thp: avoid breaki...
2142
  {
f72e7dcdd   Hugh Dickins   mm: let mm_find_p...
2143
  	pgd_t *pgd;
c2febafc6   Kirill A. Shutemov   mm: convert gener...
2144
  	p4d_t *p4d;
f72e7dcdd   Hugh Dickins   mm: let mm_find_p...
2145
  	pud_t *pud;
94fcc585f   Andrea Arcangeli   thp: avoid breaki...
2146
  	pmd_t *pmd;
78ddc5347   Kirill A. Shutemov   thp: rename split...
2147
  	pgd = pgd_offset(vma->vm_mm, address);
f72e7dcdd   Hugh Dickins   mm: let mm_find_p...
2148
2149
  	if (!pgd_present(*pgd))
  		return;
c2febafc6   Kirill A. Shutemov   mm: convert gener...
2150
2151
2152
2153
2154
  	p4d = p4d_offset(pgd, address);
  	if (!p4d_present(*p4d))
  		return;
  
  	pud = pud_offset(p4d, address);
f72e7dcdd   Hugh Dickins   mm: let mm_find_p...
2155
2156
2157
2158
  	if (!pud_present(*pud))
  		return;
  
  	pmd = pmd_offset(pud, address);
fec89c109   Kirill A. Shutemov   thp: rewrite free...
2159

33f4751e9   Naoya Horiguchi   mm: thp: move pmd...
2160
  	__split_huge_pmd(vma, pmd, address, freeze, page);
94fcc585f   Andrea Arcangeli   thp: avoid breaki...
2161
  }
e1b9996b8   Kirill A. Shutemov   thp: vma_adjust_t...
2162
  void vma_adjust_trans_huge(struct vm_area_struct *vma,
94fcc585f   Andrea Arcangeli   thp: avoid breaki...
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
  			     unsigned long start,
  			     unsigned long end,
  			     long adjust_next)
  {
  	/*
  	 * If the new start address isn't hpage aligned and it could
  	 * previously contain a hugepage: check if we need to split
  	 * a huge pmd.
  	 */
  	if (start & ~HPAGE_PMD_MASK &&
  	    (start & HPAGE_PMD_MASK) >= vma->vm_start &&
  	    (start & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= vma->vm_end)
fec89c109   Kirill A. Shutemov   thp: rewrite free...
2175
  		split_huge_pmd_address(vma, start, false, NULL);
94fcc585f   Andrea Arcangeli   thp: avoid breaki...
2176
2177
2178
2179
2180
2181
2182
2183
2184
  
  	/*
  	 * If the new end address isn't hpage aligned and it could
  	 * previously contain a hugepage: check if we need to split
  	 * a huge pmd.
  	 */
  	if (end & ~HPAGE_PMD_MASK &&
  	    (end & HPAGE_PMD_MASK) >= vma->vm_start &&
  	    (end & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= vma->vm_end)
fec89c109   Kirill A. Shutemov   thp: rewrite free...
2185
  		split_huge_pmd_address(vma, end, false, NULL);
94fcc585f   Andrea Arcangeli   thp: avoid breaki...
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
  
  	/*
  	 * If we're also updating the vma->vm_next->vm_start, if the new
  	 * vm_next->vm_start isn't page aligned and it could previously
  	 * contain a hugepage: check if we need to split a huge pmd.
  	 */
  	if (adjust_next > 0) {
  		struct vm_area_struct *next = vma->vm_next;
  		unsigned long nstart = next->vm_start;
  		nstart += adjust_next << PAGE_SHIFT;
  		if (nstart & ~HPAGE_PMD_MASK &&
  		    (nstart & HPAGE_PMD_MASK) >= next->vm_start &&
  		    (nstart & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= next->vm_end)
fec89c109   Kirill A. Shutemov   thp: rewrite free...
2199
  			split_huge_pmd_address(next, nstart, false, NULL);
94fcc585f   Andrea Arcangeli   thp: avoid breaki...
2200
2201
  	}
  }
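
  /*
   * Worked example for the alignment checks in vma_adjust_trans_huge() above,
   * assuming the common HPAGE_PMD_SIZE of 2MB (0x200000): for start ==
   * 0x700123 the low bits (start & ~HPAGE_PMD_MASK) are non-zero, so start is
   * not hugepage aligned; the containing huge range is [0x600000, 0x800000),
   * and the pmd there is split only if that whole range lies inside the VMA.
   * An already aligned start (e.g. 0x600000) needs no split.
   */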
e9b61f198   Kirill A. Shutemov   thp: reintroduce ...
2202

e12b67d81   Hugh Dickins   mm/huge_memory: r...
2203
  static void unmap_page(struct page *page)
e9b61f198   Kirill A. Shutemov   thp: reintroduce ...
2204
  {
baa355fd3   Kirill A. Shutemov   thp: file pages s...
2205
  	enum ttu_flags ttu_flags = TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS |
c7ab0d2fd   Kirill A. Shutemov   mm: convert try_t...
2206
  		TTU_RMAP_LOCKED | TTU_SPLIT_HUGE_PMD;
666e5a406   Minchan Kim   mm: make ttu's re...
2207
  	bool unmap_success;
e9b61f198   Kirill A. Shutemov   thp: reintroduce ...
2208
2209
  
  	VM_BUG_ON_PAGE(!PageHead(page), page);
baa355fd3   Kirill A. Shutemov   thp: file pages s...
2210
  	if (PageAnon(page))
b5ff8161e   Naoya Horiguchi   mm: thp: introduc...
2211
  		ttu_flags |= TTU_SPLIT_FREEZE;
baa355fd3   Kirill A. Shutemov   thp: file pages s...
2212

666e5a406   Minchan Kim   mm: make ttu's re...
2213
2214
  	unmap_success = try_to_unmap(page, ttu_flags);
  	VM_BUG_ON_PAGE(!unmap_success, page);
e9b61f198   Kirill A. Shutemov   thp: reintroduce ...
2215
  }
e12b67d81   Hugh Dickins   mm/huge_memory: r...
2216
  static void remap_page(struct page *page)
e9b61f198   Kirill A. Shutemov   thp: reintroduce ...
2217
  {
fec89c109   Kirill A. Shutemov   thp: rewrite free...
2218
  	int i;
ace71a19c   Kirill A. Shutemov   mm: introduce pag...
2219
2220
2221
2222
2223
2224
  	if (PageTransHuge(page)) {
  		remove_migration_ptes(page, page, true);
  	} else {
  		for (i = 0; i < HPAGE_PMD_NR; i++)
  			remove_migration_ptes(page + i, page + i, true);
  	}
e9b61f198   Kirill A. Shutemov   thp: reintroduce ...
2225
  }
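
  /*
   * unmap_page() and remap_page() above bracket a split attempt: unmap_page()
   * uses try_to_unmap() to take the compound page out of every page table
   * (TTU_SPLIT_HUGE_PMD splits any remaining pmd mapping first, and for anon
   * pages TTU_SPLIT_FREEZE leaves migration entries behind), and remap_page()
   * removes those migration entries again, for the whole page if the split
   * failed or per subpage once it has succeeded.
   */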
8df651c70   Kirill A. Shutemov   thp: cleanup spli...
2226
  static void __split_huge_page_tail(struct page *head, int tail,
e9b61f198   Kirill A. Shutemov   thp: reintroduce ...
2227
2228
  		struct lruvec *lruvec, struct list_head *list)
  {
e9b61f198   Kirill A. Shutemov   thp: reintroduce ...
2229
  	struct page *page_tail = head + tail;
8df651c70   Kirill A. Shutemov   thp: cleanup spli...
2230
  	VM_BUG_ON_PAGE(atomic_read(&page_tail->_mapcount) != -1, page_tail);
e9b61f198   Kirill A. Shutemov   thp: reintroduce ...
2231
2232
  
  	/*
30241d721   Konstantin Khlebnikov   mm/huge_memory.c:...
2233
2234
2235
2236
  	 * Clone page flags before unfreezing refcount.
  	 *
  	 * After a successful get_page_unless_zero() the page flags may change,
  	 * for example lock_page() may set PG_waiters.
e9b61f198   Kirill A. Shutemov   thp: reintroduce ...
2237
  	 */
e9b61f198   Kirill A. Shutemov   thp: reintroduce ...
2238
2239
2240
2241
  	page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
  	page_tail->flags |= (head->flags &
  			((1L << PG_referenced) |
  			 (1L << PG_swapbacked) |
38d8b4e6b   Huang Ying   mm, THP, swap: de...
2242
  			 (1L << PG_swapcache) |
e9b61f198   Kirill A. Shutemov   thp: reintroduce ...
2243
2244
2245
2246
  			 (1L << PG_mlocked) |
  			 (1L << PG_uptodate) |
  			 (1L << PG_active) |
  			 (1L << PG_locked) |
b8d3c4c30   Minchan Kim   mm/huge_memory.c:...
2247
2248
  			 (1L << PG_unevictable) |
  			 (1L << PG_dirty)));
e9b61f198   Kirill A. Shutemov   thp: reintroduce ...
2249

16d07443b   Hugh Dickins   mm/huge_memory: s...
2250
2251
2252
2253
2254
  	/* ->mapping in first tail page is compound_mapcount */
  	VM_BUG_ON_PAGE(tail > 2 && page_tail->mapping != TAIL_MAPPING,
  			page_tail);
  	page_tail->mapping = head->mapping;
  	page_tail->index = head->index + tail;
30241d721   Konstantin Khlebnikov   mm/huge_memory.c:...
2255
  	/* Page flags must be visible before we make the page non-compound. */
e9b61f198   Kirill A. Shutemov   thp: reintroduce ...
2256
  	smp_wmb();
30241d721   Konstantin Khlebnikov   mm/huge_memory.c:...
2257
2258
2259
2260
2261
2262
  	/*
  	 * Clear PageTail before unfreezing page refcount.
  	 *
  	 * After successful get_page_unless_zero() might follow put_page()
  	 * which needs correct compound_head().
  	 */
e9b61f198   Kirill A. Shutemov   thp: reintroduce ...
2263
  	clear_compound_head(page_tail);
30241d721   Konstantin Khlebnikov   mm/huge_memory.c:...
2264
2265
2266
  	/* Finally unfreeze refcount. Additional reference from page cache. */
  	page_ref_unfreeze(page_tail, 1 + (!PageAnon(head) ||
  					  PageSwapCache(head)));
e9b61f198   Kirill A. Shutemov   thp: reintroduce ...
2267
2268
2269
2270
  	if (page_is_young(head))
  		set_page_young(page_tail);
  	if (page_is_idle(head))
  		set_page_idle(page_tail);
e9b61f198   Kirill A. Shutemov   thp: reintroduce ...
2271
2272
  	page_cpupid_xchg_last(page_tail, page_cpupid_last(head));
  	lru_add_page_tail(head, page_tail, lruvec, list);
e9b61f198   Kirill A. Shutemov   thp: reintroduce ...
2273
  }
baa355fd3   Kirill A. Shutemov   thp: file pages s...
2274
  static void __split_huge_page(struct page *page, struct list_head *list,
6f75a0983   Hugh Dickins   mm/huge_memory: f...
2275
  		pgoff_t end, unsigned long flags)
e9b61f198   Kirill A. Shutemov   thp: reintroduce ...
2276
2277
2278
2279
  {
  	struct page *head = compound_head(page);
  	struct zone *zone = page_zone(head);
  	struct lruvec *lruvec;
8df651c70   Kirill A. Shutemov   thp: cleanup spli...
2280
  	int i;
e9b61f198   Kirill A. Shutemov   thp: reintroduce ...
2281

599d0c954   Mel Gorman   mm, vmscan: move ...
2282
  	lruvec = mem_cgroup_page_lruvec(head, zone->zone_pgdat);
e9b61f198   Kirill A. Shutemov   thp: reintroduce ...
2283
2284
2285
  
  	/* complete memcg works before add pages to LRU */
  	mem_cgroup_split_huge_fixup(head);
baa355fd3   Kirill A. Shutemov   thp: file pages s...
2286
  	for (i = HPAGE_PMD_NR - 1; i >= 1; i--) {
8df651c70   Kirill A. Shutemov   thp: cleanup spli...
2287
  		__split_huge_page_tail(head, i, lruvec, list);
baa355fd3   Kirill A. Shutemov   thp: file pages s...
2288
2289
  		/* Some pages can be beyond i_size: drop them from page cache */
  		if (head[i].index >= end) {
a7027b7d6   Hugh Dickins   mm/huge_memory.c:...
2290
  			ClearPageDirty(head + i);
baa355fd3   Kirill A. Shutemov   thp: file pages s...
2291
  			__delete_from_page_cache(head + i, NULL);
800d8c63b   Kirill A. Shutemov   shmem: add huge p...
2292
2293
  			if (IS_ENABLED(CONFIG_SHMEM) && PageSwapBacked(head))
  				shmem_uncharge(head->mapping->host, 1);
baa355fd3   Kirill A. Shutemov   thp: file pages s...
2294
2295
2296
  			put_page(head + i);
  		}
  	}
e9b61f198   Kirill A. Shutemov   thp: reintroduce ...
2297
2298
  
  	ClearPageCompound(head);
baa355fd3   Kirill A. Shutemov   thp: file pages s...
2299
2300
  	/* See comment in __split_huge_page_tail() */
  	if (PageAnon(head)) {
38d8b4e6b   Huang Ying   mm, THP, swap: de...
2301
2302
2303
2304
2305
  		/* Additional pin to radix tree of swap cache */
  		if (PageSwapCache(head))
  			page_ref_add(head, 2);
  		else
  			page_ref_inc(head);
baa355fd3   Kirill A. Shutemov   thp: file pages s...
2306
2307
2308
2309
2310
  	} else {
  		/* Additional pin to radix tree */
  		page_ref_add(head, 2);
  		spin_unlock(&head->mapping->tree_lock);
  	}
a52633d8e   Mel Gorman   mm, vmscan: move ...
2311
  	spin_unlock_irqrestore(zone_lru_lock(page_zone(head)), flags);
e9b61f198   Kirill A. Shutemov   thp: reintroduce ...
2312

e12b67d81   Hugh Dickins   mm/huge_memory: r...
2313
  	remap_page(head);
e9b61f198   Kirill A. Shutemov   thp: reintroduce ...
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
  
  	for (i = 0; i < HPAGE_PMD_NR; i++) {
  		struct page *subpage = head + i;
  		if (subpage == page)
  			continue;
  		unlock_page(subpage);
  
  		/*
  		 * Subpages may be freed if there wasn't any mapping,
  		 * for example if add_to_swap() is running on an LRU page
  		 * that had its mapping zapped. Freeing these pages
  		 * requires taking the lru_lock so we do the put_page
  		 * of the tail pages after the split is complete.
  		 */
  		put_page(subpage);
  	}
  }
b20ce5e03   Kirill A. Shutemov   mm: prepare page_...
2331
2332
  int total_mapcount(struct page *page)
  {
dd78fedde   Kirill A. Shutemov   rmap: support fil...
2333
  	int i, compound, ret;
b20ce5e03   Kirill A. Shutemov   mm: prepare page_...
2334
2335
2336
2337
2338
  
  	VM_BUG_ON_PAGE(PageTail(page), page);
  
  	if (likely(!PageCompound(page)))
  		return atomic_read(&page->_mapcount) + 1;
dd78fedde   Kirill A. Shutemov   rmap: support fil...
2339
  	compound = compound_mapcount(page);
b20ce5e03   Kirill A. Shutemov   mm: prepare page_...
2340
  	if (PageHuge(page))
dd78fedde   Kirill A. Shutemov   rmap: support fil...
2341
2342
  		return compound;
  	ret = compound;
b20ce5e03   Kirill A. Shutemov   mm: prepare page_...
2343
2344
  	for (i = 0; i < HPAGE_PMD_NR; i++)
  		ret += atomic_read(&page[i]._mapcount) + 1;
dd78fedde   Kirill A. Shutemov   rmap: support fil...
2345
2346
2347
  	/* File pages have compound_mapcount included in _mapcount */
  	if (!PageAnon(page))
  		return ret - compound * HPAGE_PMD_NR;
b20ce5e03   Kirill A. Shutemov   mm: prepare page_...
2348
2349
2350
2351
  	if (PageDoubleMap(page))
  		ret -= HPAGE_PMD_NR;
  	return ret;
  }
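
  /*
   * Example of the accounting above, assuming an anon THP that is mapped by a
   * single PMD and nothing else: compound_mapcount() is 1, every subpage
   * _mapcount is still -1 so each contributes 0 to the sum, PageDoubleMap is
   * clear, and total_mapcount() returns 1.
   */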
e9b61f198   Kirill A. Shutemov   thp: reintroduce ...
2352
  /*
6d0a07edd   Andrea Arcangeli   mm: thp: calculat...
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
   * This calculates accurately how many mappings a transparent hugepage
   * has (unlike page_mapcount() which isn't fully accurate). This full
   * accuracy is primarily needed to know if copy-on-write faults can
   * reuse the page and change the mapping to read-write instead of
   * copying them. At the same time this returns the total_mapcount too.
   *
   * The function returns the highest mapcount any one of the subpages
   * has. If the return value is one, even if different processes are
   * mapping different subpages of the transparent hugepage, they can
   * all reuse it, because each process is reusing a different subpage.
   *
   * The total_mapcount is instead counting all virtual mappings of the
   * subpages. If the total_mapcount is equal to "one", it tells the
   * caller all mappings belong to the same "mm" and in turn the
   * anon_vma of the transparent hugepage can become the vma->anon_vma
   * local one as no other process may be mapping any of the subpages.
   *
   * It would be more accurate to replace page_mapcount() with
   * page_trans_huge_mapcount(), however we only use
   * page_trans_huge_mapcount() in the copy-on-write faults where we
   * need full accuracy to avoid breaking page pinning, because
   * page_trans_huge_mapcount() is slower than page_mapcount().
   */
  int page_trans_huge_mapcount(struct page *page, int *total_mapcount)
  {
  	int i, ret, _total_mapcount, mapcount;
  
  	/* hugetlbfs shouldn't call it */
  	VM_BUG_ON_PAGE(PageHuge(page), page);
  
  	if (likely(!PageTransCompound(page))) {
  		mapcount = atomic_read(&page->_mapcount) + 1;
  		if (total_mapcount)
  			*total_mapcount = mapcount;
  		return mapcount;
  	}
  
  	page = compound_head(page);
  
  	_total_mapcount = ret = 0;
  	for (i = 0; i < HPAGE_PMD_NR; i++) {
  		mapcount = atomic_read(&page[i]._mapcount) + 1;
  		ret = max(ret, mapcount);
  		_total_mapcount += mapcount;
  	}
  	if (PageDoubleMap(page)) {
  		ret -= 1;
  		_total_mapcount -= HPAGE_PMD_NR;
  	}
  	mapcount = compound_mapcount(page);
  	ret += mapcount;
  	_total_mapcount += mapcount;
  	if (total_mapcount)
  		*total_mapcount = _total_mapcount;
  	return ret;
  }
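
  /*
   * Example for the comment above, assuming a pte-mapped anon THP with no
   * remaining PMD mapping: if process A maps only the first half of the
   * subpages and process B only the second half, every mapped subpage has a
   * mapcount of 1, so page_trans_huge_mapcount() returns 1 (each process may
   * reuse its own subpages on COW) while *total_mapcount is HPAGE_PMD_NR.
   * If both processes mapped the same subpage, the return value would be 2.
   */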
b8f593cd0   Huang Ying   mm, THP, swap: ch...
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
  /* Racy check whether the huge page can be split */
  bool can_split_huge_page(struct page *page, int *pextra_pins)
  {
  	int extra_pins;
  
  	/* Additional pins from radix tree */
  	if (PageAnon(page))
  		extra_pins = PageSwapCache(page) ? HPAGE_PMD_NR : 0;
  	else
  		extra_pins = HPAGE_PMD_NR;
  	if (pextra_pins)
  		*pextra_pins = extra_pins;
  	return total_mapcount(page) == page_count(page) - extra_pins - 1;
  }
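
  /*
   * Example for the check above, assuming an anon THP that is not in the swap
   * cache: extra_pins is 0 and the "- 1" accounts for the reference the caller
   * is required to hold, so the page is considered splittable only when every
   * remaining reference comes from a mapping, i.e.
   * page_count() == total_mapcount() + 1. Any extra pin (for example from
   * get_user_pages()) makes the check fail and the split returns -EBUSY.
   */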
6d0a07edd   Andrea Arcangeli   mm: thp: calculat...
2423
  /*
e9b61f198   Kirill A. Shutemov   thp: reintroduce ...
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
   * This function splits a huge page into normal pages. @page can point to
   * any subpage of the huge page to split. The split doesn't change the
   * position of @page.
   *
   * The caller must hold a pin on the @page, otherwise split fails with -EBUSY.
   * The huge page must be locked.
   *
   * If @list is null, tail pages will be added to LRU list, otherwise, to @list.
   *
   * Both head page and tail pages will inherit mapping, flags, and so on from
   * the hugepage.
   *
   * The GUP pin and PG_locked are transferred to @page. The rest of the
   * subpages can be freed if they are not mapped.
   *
   * Returns 0 if the hugepage is split successfully.
   * Returns -EBUSY if the page is pinned or if anon_vma disappeared from under
   * us.
   */
  int split_huge_page_to_list(struct page *page, struct list_head *list)
  {
  	struct page *head = compound_head(page);
a3d0a9185   Kirill A. Shutemov   thp: make split_q...
2445
  	struct pglist_data *pgdata = NODE_DATA(page_to_nid(head));
baa355fd3   Kirill A. Shutemov   thp: file pages s...
2446
2447
2448
  	struct anon_vma *anon_vma = NULL;
  	struct address_space *mapping = NULL;
  	int count, mapcount, extra_pins, ret;
d96543223   Kirill A. Shutemov   thp: increase spl...
2449
  	bool mlocked;
0b9b6fff7   Kirill A. Shutemov   thp: fix interrup...
2450
  	unsigned long flags;
6f75a0983   Hugh Dickins   mm/huge_memory: f...
2451
  	pgoff_t end;
e9b61f198   Kirill A. Shutemov   thp: reintroduce ...
2452
2453
  
  	VM_BUG_ON_PAGE(is_huge_zero_page(page), page);
e9b61f198   Kirill A. Shutemov   thp: reintroduce ...
2454
  	VM_BUG_ON_PAGE(!PageLocked(page), page);
e9b61f198   Kirill A. Shutemov   thp: reintroduce ...
2455
  	VM_BUG_ON_PAGE(!PageCompound(page), page);
59807685a   Huang Ying   mm, THP, swap: su...
2456
2457
  	if (PageWriteback(page))
  		return -EBUSY;
baa355fd3   Kirill A. Shutemov   thp: file pages s...
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
  	if (PageAnon(head)) {
  		/*
  		 * The caller does not necessarily hold an mmap_sem that would
  		 * prevent the anon_vma from disappearing, so we first take a
  		 * reference to it and then lock the anon_vma for write. This
  		 * is similar to page_lock_anon_vma_read except the write lock
  		 * is taken to serialise against parallel split or collapse
  		 * operations.
  		 */
  		anon_vma = page_get_anon_vma(head);
  		if (!anon_vma) {
  			ret = -EBUSY;
  			goto out;
  		}
6f75a0983   Hugh Dickins   mm/huge_memory: f...
2472
  		end = -1;
baa355fd3   Kirill A. Shutemov   thp: file pages s...
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
  		mapping = NULL;
  		anon_vma_lock_write(anon_vma);
  	} else {
  		mapping = head->mapping;
  
  		/* Truncated ? */
  		if (!mapping) {
  			ret = -EBUSY;
  			goto out;
  		}
baa355fd3   Kirill A. Shutemov   thp: file pages s...
2483
2484
  		anon_vma = NULL;
  		i_mmap_lock_read(mapping);
6f75a0983   Hugh Dickins   mm/huge_memory: f...
2485
2486
2487
2488
2489
2490
2491
2492
2493
  
  		/*
  		 * __split_huge_page() may need to trim off pages beyond EOF:
  		 * but on 32-bit, i_size_read() takes an irq-unsafe seqlock,
  		 * which cannot be nested inside the page tree lock. So note
  		 * end now: i_size itself may be changed at any moment, but
  		 * head page lock is good enough to serialize the trimming.
  		 */
  		end = DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE);
e9b61f198   Kirill A. Shutemov   thp: reintroduce ...
2494
  	}
e9b61f198   Kirill A. Shutemov   thp: reintroduce ...
2495
2496
  
  	/*
e12b67d81   Hugh Dickins   mm/huge_memory: r...
2497
  	 * Racy check if we can split the page, before unmap_page()
e9b61f198   Kirill A. Shutemov   thp: reintroduce ...
2498
2499
  	 * splits the PMDs
  	 */
b8f593cd0   Huang Ying   mm, THP, swap: ch...
2500
  	if (!can_split_huge_page(head, &extra_pins)) {
e9b61f198   Kirill A. Shutemov   thp: reintroduce ...
2501
2502
2503
  		ret = -EBUSY;
  		goto out_unlock;
  	}
d96543223   Kirill A. Shutemov   thp: increase spl...
2504
  	mlocked = PageMlocked(page);
e12b67d81   Hugh Dickins   mm/huge_memory: r...
2505
  	unmap_page(head);
e9b61f198   Kirill A. Shutemov   thp: reintroduce ...
2506
  	VM_BUG_ON_PAGE(compound_mapcount(head), head);
d96543223   Kirill A. Shutemov   thp: increase spl...
2507
2508
2509
  	/* Make sure the page is not on per-CPU pagevec as it takes pin */
  	if (mlocked)
  		lru_add_drain();
baa355fd3   Kirill A. Shutemov   thp: file pages s...
2510
  	/* prevent PageLRU from going away from under us, and freeze lru stats */
a52633d8e   Mel Gorman   mm, vmscan: move ...
2511
  	spin_lock_irqsave(zone_lru_lock(page_zone(head)), flags);
baa355fd3   Kirill A. Shutemov   thp: file pages s...
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
  
  	if (mapping) {
  		void **pslot;
  
  		spin_lock(&mapping->tree_lock);
  		pslot = radix_tree_lookup_slot(&mapping->page_tree,
  				page_index(head));
  		/*
  		 * Check if the head page is present in radix tree.
  		 * We assume all tail pages are present too, if the head is there.
  		 */
  		if (radix_tree_deref_slot_protected(pslot,
  					&mapping->tree_lock) != head)
  			goto fail;
  	}
0139aa7b7   Joonsoo Kim   mm: rename _count...
2527
  	/* Prevent deferred_split_scan() touching ->_refcount */
baa355fd3   Kirill A. Shutemov   thp: file pages s...
2528
  	spin_lock(&pgdata->split_queue_lock);
e9b61f198   Kirill A. Shutemov   thp: reintroduce ...
2529
2530
  	count = page_count(head);
  	mapcount = total_mapcount(head);
baa355fd3   Kirill A. Shutemov   thp: file pages s...
2531
  	if (!mapcount && page_ref_freeze(head, 1 + extra_pins)) {
9a982250f   Kirill A. Shutemov   thp: introduce de...
2532
  		if (!list_empty(page_deferred_list(head))) {
a3d0a9185   Kirill A. Shutemov   thp: make split_q...
2533
  			pgdata->split_queue_len--;
9a982250f   Kirill A. Shutemov   thp: introduce de...
2534
2535
  			list_del(page_deferred_list(head));
  		}
65c453778   Kirill A. Shutemov   mm, rmap: account...
2536
  		if (mapping)
11fb99898   Mel Gorman   mm: move most fil...
2537
  			__dec_node_page_state(page, NR_SHMEM_THPS);
baa355fd3   Kirill A. Shutemov   thp: file pages s...
2538
  		spin_unlock(&pgdata->split_queue_lock);
6f75a0983   Hugh Dickins   mm/huge_memory: f...
2539
  		__split_huge_page(page, list, end, flags);
59807685a   Huang Ying   mm, THP, swap: su...
2540
2541
2542
2543
2544
2545
  		if (PageSwapCache(head)) {
  			swp_entry_t entry = { .val = page_private(head) };
  
  			ret = split_swap_cluster(entry);
  		} else
  			ret = 0;
e9b61f198   Kirill A. Shutemov   thp: reintroduce ...
2546
  	} else {
baa355fd3   Kirill A. Shutemov   thp: file pages s...
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
  		if (IS_ENABLED(CONFIG_DEBUG_VM) && mapcount) {
  			pr_alert("total_mapcount: %u, page_count(): %u
  ",
  					mapcount, count);
  			if (PageTail(page))
  				dump_page(head, NULL);
  			dump_page(page, "total_mapcount(head) > 0");
  			BUG();
  		}
  		spin_unlock(&pgdata->split_queue_lock);
  fail:		if (mapping)
  			spin_unlock(&mapping->tree_lock);
a52633d8e   Mel Gorman   mm, vmscan: move ...
2559
  		spin_unlock_irqrestore(zone_lru_lock(page_zone(head)), flags);
e12b67d81   Hugh Dickins   mm/huge_memory: r...
2560
  		remap_page(head);
e9b61f198   Kirill A. Shutemov   thp: reintroduce ...
2561
2562
2563
2564
  		ret = -EBUSY;
  	}
  
  out_unlock:
baa355fd3   Kirill A. Shutemov   thp: file pages s...
2565
2566
2567
2568
2569
2570
  	if (anon_vma) {
  		anon_vma_unlock_write(anon_vma);
  		put_anon_vma(anon_vma);
  	}
  	if (mapping)
  		i_mmap_unlock_read(mapping);
e9b61f198   Kirill A. Shutemov   thp: reintroduce ...
2571
2572
2573
2574
  out:
  	count_vm_event(!ret ? THP_SPLIT_PAGE : THP_SPLIT_PAGE_FAILED);
  	return ret;
  }
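
  /*
   * A minimal usage sketch for the interface above, assuming the caller
   * already holds a reference on @page; the helper name is illustrative only.
   * It mirrors what the deferred-split shrinker and the debugfs knob below
   * do: take the page lock, try the split, drop the lock.
   */
  static int __maybe_unused try_split_thp_sketch(struct page *page)
  {
  	int ret;
  
  	page = compound_head(page);
  	if (!PageTransHuge(page))
  		return 0;		/* nothing to split */
  
  	lock_page(page);		/* the split requires the page lock */
  	ret = split_huge_page(page);	/* 0 on success, -EBUSY if pinned */
  	unlock_page(page);
  	return ret;
  }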
9a982250f   Kirill A. Shutemov   thp: introduce de...
2575
2576
2577
  
  void free_transhuge_page(struct page *page)
  {
a3d0a9185   Kirill A. Shutemov   thp: make split_q...
2578
  	struct pglist_data *pgdata = NODE_DATA(page_to_nid(page));
9a982250f   Kirill A. Shutemov   thp: introduce de...
2579
  	unsigned long flags;
a3d0a9185   Kirill A. Shutemov   thp: make split_q...
2580
  	spin_lock_irqsave(&pgdata->split_queue_lock, flags);
9a982250f   Kirill A. Shutemov   thp: introduce de...
2581
  	if (!list_empty(page_deferred_list(page))) {
a3d0a9185   Kirill A. Shutemov   thp: make split_q...
2582
  		pgdata->split_queue_len--;
9a982250f   Kirill A. Shutemov   thp: introduce de...
2583
2584
  		list_del(page_deferred_list(page));
  	}
a3d0a9185   Kirill A. Shutemov   thp: make split_q...
2585
  	spin_unlock_irqrestore(&pgdata->split_queue_lock, flags);
9a982250f   Kirill A. Shutemov   thp: introduce de...
2586
2587
2588
2589
2590
  	free_compound_page(page);
  }
  
  void deferred_split_huge_page(struct page *page)
  {
a3d0a9185   Kirill A. Shutemov   thp: make split_q...
2591
  	struct pglist_data *pgdata = NODE_DATA(page_to_nid(page));
9a982250f   Kirill A. Shutemov   thp: introduce de...
2592
2593
2594
  	unsigned long flags;
  
  	VM_BUG_ON_PAGE(!PageTransHuge(page), page);
a3d0a9185   Kirill A. Shutemov   thp: make split_q...
2595
  	spin_lock_irqsave(&pgdata->split_queue_lock, flags);
9a982250f   Kirill A. Shutemov   thp: introduce de...
2596
  	if (list_empty(page_deferred_list(page))) {
f9719a03d   Kirill A. Shutemov   thp, vmstats: cou...
2597
  		count_vm_event(THP_DEFERRED_SPLIT_PAGE);
a3d0a9185   Kirill A. Shutemov   thp: make split_q...
2598
2599
  		list_add_tail(page_deferred_list(page), &pgdata->split_queue);
  		pgdata->split_queue_len++;
9a982250f   Kirill A. Shutemov   thp: introduce de...
2600
  	}
a3d0a9185   Kirill A. Shutemov   thp: make split_q...
2601
  	spin_unlock_irqrestore(&pgdata->split_queue_lock, flags);
9a982250f   Kirill A. Shutemov   thp: introduce de...
2602
2603
2604
2605
2606
  }
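
  /*
   * Note on the deferred split queue: deferred_split_huge_page() only queues
   * the page (typically when part of a THP has been unmapped while the rest
   * is still in use); the actual split is attempted later, under memory
   * pressure, by the shrinker callbacks below.
   */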
  
  static unsigned long deferred_split_count(struct shrinker *shrink,
  		struct shrink_control *sc)
  {
a3d0a9185   Kirill A. Shutemov   thp: make split_q...
2607
  	struct pglist_data *pgdata = NODE_DATA(sc->nid);
cb8d68ec1   Kirill A. Shutemov   thp: change defer...
2608
  	return ACCESS_ONCE(pgdata->split_queue_len);
9a982250f   Kirill A. Shutemov   thp: introduce de...
2609
2610
2611
2612
2613
  }
  
  static unsigned long deferred_split_scan(struct shrinker *shrink,
  		struct shrink_control *sc)
  {
a3d0a9185   Kirill A. Shutemov   thp: make split_q...
2614
  	struct pglist_data *pgdata = NODE_DATA(sc->nid);
9a982250f   Kirill A. Shutemov   thp: introduce de...
2615
2616
2617
2618
  	unsigned long flags;
  	LIST_HEAD(list), *pos, *next;
  	struct page *page;
  	int split = 0;
a3d0a9185   Kirill A. Shutemov   thp: make split_q...
2619
  	spin_lock_irqsave(&pgdata->split_queue_lock, flags);
9a982250f   Kirill A. Shutemov   thp: introduce de...
2620
  	/* Take pin on all head pages to avoid freeing them under us */
ae026204a   Kirill A. Shutemov   thp: make deferre...
2621
  	list_for_each_safe(pos, next, &pgdata->split_queue) {
9a982250f   Kirill A. Shutemov   thp: introduce de...
2622
2623
  		page = list_entry((void *)pos, struct page, mapping);
  		page = compound_head(page);
e3ae19535   Kirill A. Shutemov   thp: limit number...
2624
2625
2626
2627
  		if (get_page_unless_zero(page)) {
  			list_move(page_deferred_list(page), &list);
  		} else {
  			/* We lost race with put_compound_page() */
9a982250f   Kirill A. Shutemov   thp: introduce de...
2628
  			list_del_init(page_deferred_list(page));
a3d0a9185   Kirill A. Shutemov   thp: make split_q...
2629
  			pgdata->split_queue_len--;
9a982250f   Kirill A. Shutemov   thp: introduce de...
2630
  		}
e3ae19535   Kirill A. Shutemov   thp: limit number...
2631
2632
  		if (!--sc->nr_to_scan)
  			break;
9a982250f   Kirill A. Shutemov   thp: introduce de...
2633
  	}
a3d0a9185   Kirill A. Shutemov   thp: make split_q...
2634
  	spin_unlock_irqrestore(&pgdata->split_queue_lock, flags);
9a982250f   Kirill A. Shutemov   thp: introduce de...
2635
2636
2637
  
  	list_for_each_safe(pos, next, &list) {
  		page = list_entry((void *)pos, struct page, mapping);
b6b6783c8   Kirill A. Shutemov   mm/thp: do not wa...
2638
2639
  		if (!trylock_page(page))
  			goto next;
9a982250f   Kirill A. Shutemov   thp: introduce de...
2640
2641
2642
2643
  		/* split_huge_page() removes page from list on success */
  		if (!split_huge_page(page))
  			split++;
  		unlock_page(page);
b6b6783c8   Kirill A. Shutemov   mm/thp: do not wa...
2644
  next:
9a982250f   Kirill A. Shutemov   thp: introduce de...
2645
2646
  		put_page(page);
  	}
a3d0a9185   Kirill A. Shutemov   thp: make split_q...
2647
2648
2649
  	spin_lock_irqsave(&pgdata->split_queue_lock, flags);
  	list_splice_tail(&list, &pgdata->split_queue);
  	spin_unlock_irqrestore(&pgdata->split_queue_lock, flags);
9a982250f   Kirill A. Shutemov   thp: introduce de...
2650

cb8d68ec1   Kirill A. Shutemov   thp: change defer...
2651
2652
2653
2654
2655
2656
2657
  	/*
  	 * Stop the shrinker if we didn't split any page and the queue is empty.
  	 * This can happen if pages were freed under us.
  	 */
  	if (!split && list_empty(&pgdata->split_queue))
  		return SHRINK_STOP;
  	return split;
9a982250f   Kirill A. Shutemov   thp: introduce de...
2658
2659
2660
2661
2662
2663
  }
  
  static struct shrinker deferred_split_shrinker = {
  	.count_objects = deferred_split_count,
  	.scan_objects = deferred_split_scan,
  	.seeks = DEFAULT_SEEKS,
a3d0a9185   Kirill A. Shutemov   thp: make split_q...
2664
  	.flags = SHRINKER_NUMA_AWARE,
9a982250f   Kirill A. Shutemov   thp: introduce de...
2665
  };
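
  /*
   * The shrinker core first calls ->count_objects to learn how much work is
   * queued and then ->scan_objects with sc->nr_to_scan; because of
   * SHRINKER_NUMA_AWARE, sc->nid selects which node's split_queue is scanned.
   * The shrinker itself is registered from the THP init code earlier in this
   * file.
   */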
49071d436   Kirill A. Shutemov   thp: add debugfs ...
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
  
  #ifdef CONFIG_DEBUG_FS
  static int split_huge_pages_set(void *data, u64 val)
  {
  	struct zone *zone;
  	struct page *page;
  	unsigned long pfn, max_zone_pfn;
  	unsigned long total = 0, split = 0;
  
  	if (val != 1)
  		return -EINVAL;
  
  	for_each_populated_zone(zone) {
  		max_zone_pfn = zone_end_pfn(zone);
  		for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) {
  			if (!pfn_valid(pfn))
  				continue;
  
  			page = pfn_to_page(pfn);
  			if (!get_page_unless_zero(page))
  				continue;
  
  			if (zone != page_zone(page))
  				goto next;
baa355fd3   Kirill A. Shutemov   thp: file pages s...
2690
  			if (!PageHead(page) || PageHuge(page) || !PageLRU(page))
49071d436   Kirill A. Shutemov   thp: add debugfs ...
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
  				goto next;
  
  			total++;
  			lock_page(page);
  			if (!split_huge_page(page))
  				split++;
  			unlock_page(page);
  next:
  			put_page(page);
  		}
  	}
145bdaa15   Yang Shi   mm: thp: correct ...
2702
2703
  	pr_info("%lu of %lu THP split
  ", split, total);
49071d436   Kirill A. Shutemov   thp: add debugfs ...
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
  
  	return 0;
  }
  DEFINE_SIMPLE_ATTRIBUTE(split_huge_pages_fops, NULL, split_huge_pages_set,
  		"%llu
  ");
  
  static int __init split_huge_pages_debugfs(void)
  {
  	void *ret;
145bdaa15   Yang Shi   mm: thp: correct ...
2714
  	ret = debugfs_create_file("split_huge_pages", 0200, NULL, NULL,
49071d436   Kirill A. Shutemov   thp: add debugfs ...
2715
2716
2717
2718
2719
2720
2721
  			&split_huge_pages_fops);
  	if (!ret)
  		pr_warn("Failed to create split_huge_pages in debugfs");
  	return 0;
  }
  late_initcall(split_huge_pages_debugfs);
  #endif
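
  /*
   * Usage note for the debugfs knob above, assuming debugfs is mounted at the
   * usual /sys/kernel/debug: writing 1 to /sys/kernel/debug/split_huge_pages
   * walks every populated zone and tries to split each THP found on the LRU,
   * then reports the totals via pr_info(). Any value other than 1 is rejected
   * with -EINVAL.
   */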
616b83715   Zi Yan   mm: thp: enable t...
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
  
  #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
  void set_pmd_migration_entry(struct page_vma_mapped_walk *pvmw,
  		struct page *page)
  {
  	struct vm_area_struct *vma = pvmw->vma;
  	struct mm_struct *mm = vma->vm_mm;
  	unsigned long address = pvmw->address;
  	pmd_t pmdval;
  	swp_entry_t entry;
ab6e3d093   Naoya Horiguchi   mm: soft-dirty: k...
2732
  	pmd_t pmdswp;
616b83715   Zi Yan   mm: thp: enable t...
2733
2734
2735
  
  	if (!(pvmw->pmd && !pvmw->pte))
  		return;
616b83715   Zi Yan   mm: thp: enable t...
2736
2737
2738
2739
2740
2741
  	flush_cache_range(vma, address, address + HPAGE_PMD_SIZE);
  	pmdval = *pvmw->pmd;
  	pmdp_invalidate(vma, address, pvmw->pmd);
  	if (pmd_dirty(pmdval))
  		set_page_dirty(page);
  	entry = make_migration_entry(page, pmd_write(pmdval));
ab6e3d093   Naoya Horiguchi   mm: soft-dirty: k...
2742
2743
2744
2745
  	pmdswp = swp_entry_to_pmd(entry);
  	if (pmd_soft_dirty(pmdval))
  		pmdswp = pmd_swp_mksoft_dirty(pmdswp);
  	set_pmd_at(mm, address, pvmw->pmd, pmdswp);
616b83715   Zi Yan   mm: thp: enable t...
2746
2747
  	page_remove_rmap(page, true);
  	put_page(page);
616b83715   Zi Yan   mm: thp: enable t...
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
  }
  
  void remove_migration_pmd(struct page_vma_mapped_walk *pvmw, struct page *new)
  {
  	struct vm_area_struct *vma = pvmw->vma;
  	struct mm_struct *mm = vma->vm_mm;
  	unsigned long address = pvmw->address;
  	unsigned long mmun_start = address & HPAGE_PMD_MASK;
  	pmd_t pmde;
  	swp_entry_t entry;
  
  	if (!(pvmw->pmd && !pvmw->pte))
  		return;
  
  	entry = pmd_to_swp_entry(*pvmw->pmd);
  	get_page(new);
  	pmde = pmd_mkold(mk_huge_pmd(new, vma->vm_page_prot));
ab6e3d093   Naoya Horiguchi   mm: soft-dirty: k...
2765
2766
  	if (pmd_swp_soft_dirty(*pvmw->pmd))
  		pmde = pmd_mksoft_dirty(pmde);
616b83715   Zi Yan   mm: thp: enable t...
2767
2768
2769
2770
2771
2772
  	if (is_write_migration_entry(entry))
  		pmde = maybe_pmd_mkwrite(pmde, vma);
  
  	flush_cache_range(vma, mmun_start, mmun_start + HPAGE_PMD_SIZE);
  	page_add_anon_rmap(new, vma, mmun_start, true);
  	set_pmd_at(mm, mmun_start, pvmw->pmd, pmde);
a2e0493f9   Kirill A. Shutemov   mm, thp: fix mloc...
2773
  	if ((vma->vm_flags & VM_LOCKED) && !PageDoubleMap(new))
616b83715   Zi Yan   mm: thp: enable t...
2774
2775
2776
2777
  		mlock_vma_page(new);
  	update_mmu_cache_pmd(vma, address, pvmw->pmd);
  }
  #endif
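
  /*
   * The two helpers above are the THP half of page migration: on unmap,
   * set_pmd_migration_entry() turns a mapped huge pmd into a pmd-level
   * migration swap entry (preserving dirty and soft-dirty state), and once
   * the page has been copied, remove_migration_pmd() installs a fresh huge
   * pmd pointing at the new page, restoring write permission and mlock state
   * as needed.
   */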