mm/huge_memory.c

  /*
   *  Copyright (C) 2009  Red Hat, Inc.
   *
   *  This work is licensed under the terms of the GNU GPL, version 2. See
   *  the COPYING file in the top-level directory.
   */
  #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
  #include <linux/mm.h>
  #include <linux/sched.h>
  #include <linux/highmem.h>
  #include <linux/hugetlb.h>
  #include <linux/mmu_notifier.h>
  #include <linux/rmap.h>
  #include <linux/swap.h>
  #include <linux/shrinker.h>
  #include <linux/mm_inline.h>
  #include <linux/swapops.h>
  #include <linux/dax.h>
  #include <linux/kthread.h>
  #include <linux/khugepaged.h>
  #include <linux/freezer.h>
  #include <linux/pfn_t.h>
  #include <linux/mman.h>
  #include <linux/memremap.h>
  #include <linux/pagemap.h>
  #include <linux/debugfs.h>
  #include <linux/migrate.h>
  #include <linux/hashtable.h>
  #include <linux/userfaultfd_k.h>
  #include <linux/page_idle.h>

  #include <asm/tlb.h>
  #include <asm/pgalloc.h>
  #include "internal.h"
  enum scan_result {
  	SCAN_FAIL,
  	SCAN_SUCCEED,
  	SCAN_PMD_NULL,
  	SCAN_EXCEED_NONE_PTE,
  	SCAN_PTE_NON_PRESENT,
  	SCAN_PAGE_RO,
  	SCAN_NO_REFERENCED_PAGE,
  	SCAN_PAGE_NULL,
  	SCAN_SCAN_ABORT,
  	SCAN_PAGE_COUNT,
  	SCAN_PAGE_LRU,
  	SCAN_PAGE_LOCK,
  	SCAN_PAGE_ANON,
  	SCAN_PAGE_COMPOUND,
  	SCAN_ANY_PROCESS,
  	SCAN_VMA_NULL,
  	SCAN_VMA_CHECK,
  	SCAN_ADDRESS_RANGE,
  	SCAN_SWAP_CACHE_PAGE,
  	SCAN_DEL_PAGE_LRU,
  	SCAN_ALLOC_HUGE_PAGE_FAIL,
  	SCAN_CGROUP_CHARGE_FAIL
  };
  
  #define CREATE_TRACE_POINTS
  #include <trace/events/huge_memory.h>
  /*
 * By default, transparent hugepage support is disabled in order to avoid
 * risking an increased memory footprint for applications without a
 * guaranteed benefit. When transparent hugepage support is enabled, it is
 * enabled for all mappings, and khugepaged scans all mappings.
   * Defrag is invoked by khugepaged hugepage allocations and by page faults
   * for all hugepage allocations.
   */
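/*
 * Illustrative userspace usage (a sketch, not part of this file): under the
 * "madvise" policy a process opts a mapping in explicitly with
 *
 *	madvise(buf, len, MADV_HUGEPAGE);
 *
 * where buf/len describe the caller's anonymous mapping.  That sets
 * VM_HUGEPAGE on the vma, so both the page fault path and khugepaged may
 * back it with huge pages.
 */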
  unsigned long transparent_hugepage_flags __read_mostly =
  #ifdef CONFIG_TRANSPARENT_HUGEPAGE_ALWAYS
  	(1<<TRANSPARENT_HUGEPAGE_FLAG)|
  #endif
  #ifdef CONFIG_TRANSPARENT_HUGEPAGE_MADVISE
  	(1<<TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG)|
  #endif
  	(1<<TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG)|
  	(1<<TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG)|
  	(1<<TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);
  
  /* default scan 8*512 pte (or vmas) every 30 second */
  static unsigned int khugepaged_pages_to_scan __read_mostly;
  static unsigned int khugepaged_pages_collapsed;
  static unsigned int khugepaged_full_scans;
  static unsigned int khugepaged_scan_sleep_millisecs __read_mostly = 10000;
  /* during fragmentation poll the hugepage allocator once every minute */
  static unsigned int khugepaged_alloc_sleep_millisecs __read_mostly = 60000;
  static struct task_struct *khugepaged_thread __read_mostly;
  static DEFINE_MUTEX(khugepaged_mutex);
  static DEFINE_SPINLOCK(khugepaged_mm_lock);
  static DECLARE_WAIT_QUEUE_HEAD(khugepaged_wait);
  /*
 * By default, collapse hugepages if there is at least one pte mapped,
 * as would have happened if the vma had been large enough during the
 * page fault.
   */
  static unsigned int khugepaged_max_ptes_none __read_mostly;
  
  static int khugepaged(void *none);
  static int khugepaged_slab_init(void);
  static void khugepaged_slab_exit(void);

  #define MM_SLOTS_HASH_BITS 10
  static __read_mostly DEFINE_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS);
  static struct kmem_cache *mm_slot_cache __read_mostly;
  
  /**
   * struct mm_slot - hash lookup from mm to mm_slot
   * @hash: hash collision list
   * @mm_node: khugepaged scan list headed in khugepaged_scan.mm_head
   * @mm: the mm that this information is valid for
   */
  struct mm_slot {
  	struct hlist_node hash;
  	struct list_head mm_node;
  	struct mm_struct *mm;
  };
  
  /**
   * struct khugepaged_scan - cursor for scanning
   * @mm_head: the head of the mm list to scan
   * @mm_slot: the current mm_slot we are scanning
   * @address: the next address inside that to be scanned
   *
   * There is only the one khugepaged_scan instance of this cursor structure.
   */
  struct khugepaged_scan {
  	struct list_head mm_head;
  	struct mm_slot *mm_slot;
  	unsigned long address;
  };
  static struct khugepaged_scan khugepaged_scan = {
  	.mm_head = LIST_HEAD_INIT(khugepaged_scan.mm_head),
  };
  static struct shrinker deferred_split_shrinker;

  static void set_recommended_min_free_kbytes(void)
  {
  	struct zone *zone;
  	int nr_zones = 0;
  	unsigned long recommended_min;

  	for_each_populated_zone(zone)
  		nr_zones++;
  	/* Ensure 2 pageblocks are free to assist fragmentation avoidance */
  	recommended_min = pageblock_nr_pages * nr_zones * 2;
  
  	/*
  	 * Make sure that on average at least two pageblocks are almost free
  	 * of another type, one for a migratetype to fall back to and a
	 * second to avoid subsequent fallbacks of other types.  There are 3
  	 * MIGRATE_TYPES we care about.
  	 */
  	recommended_min += pageblock_nr_pages * nr_zones *
  			   MIGRATE_PCPTYPES * MIGRATE_PCPTYPES;
  
	/* don't ever allow reserving more than 5% of the lowmem */
  	recommended_min = min(recommended_min,
  			      (unsigned long) nr_free_buffer_pages() / 20);
  	recommended_min <<= (PAGE_SHIFT-10);
  	if (recommended_min > min_free_kbytes) {
  		if (user_min_free_kbytes >= 0)
			pr_info("raising min_free_kbytes from %d to %lu to help transparent hugepage allocations\n",
  				min_free_kbytes, recommended_min);
  		min_free_kbytes = recommended_min;
  	}
  	setup_per_zone_wmarks();
  }
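/*
 * Worked example (illustrative; the numbers depend on the configuration):
 * on x86-64 with 4KiB pages and 2MiB pageblocks, pageblock_nr_pages is 512
 * and MIGRATE_PCPTYPES is 3, so a single populated zone gives
 *
 *	recommended_min = 512 * 1 * 2 + 512 * 1 * 3 * 3 = 5632 pages
 *
 * which is 5632 << (PAGE_SHIFT - 10) = 22528 kB before the 5%-of-lowmem cap.
 */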

  static int start_stop_khugepaged(void)
  {
  	int err = 0;
  	if (khugepaged_enabled()) {
  		if (!khugepaged_thread)
  			khugepaged_thread = kthread_run(khugepaged, NULL,
  							"khugepaged");
  		if (IS_ERR(khugepaged_thread)) {
			pr_err("khugepaged: kthread_run(khugepaged) failed\n");
  			err = PTR_ERR(khugepaged_thread);
  			khugepaged_thread = NULL;
  			goto fail;
  		}
  
  		if (!list_empty(&khugepaged_scan.mm_head))
  			wake_up_interruptible(&khugepaged_wait);
  
  		set_recommended_min_free_kbytes();
  	} else if (khugepaged_thread) {
  		kthread_stop(khugepaged_thread);
  		khugepaged_thread = NULL;
  	}
  fail:
  	return err;
  }

  static atomic_t huge_zero_refcount;
  struct page *huge_zero_page __read_mostly;

  struct page *get_huge_zero_page(void)
  {
  	struct page *zero_page;
  retry:
  	if (likely(atomic_inc_not_zero(&huge_zero_refcount)))
  		return READ_ONCE(huge_zero_page);
  
  	zero_page = alloc_pages((GFP_TRANSHUGE | __GFP_ZERO) & ~__GFP_MOVABLE,
  			HPAGE_PMD_ORDER);
  	if (!zero_page) {
  		count_vm_event(THP_ZERO_PAGE_ALLOC_FAILED);
  		return NULL;
  	}
  	count_vm_event(THP_ZERO_PAGE_ALLOC);
  	preempt_disable();
  	if (cmpxchg(&huge_zero_page, NULL, zero_page)) {
  		preempt_enable();
  		__free_pages(zero_page, compound_order(zero_page));
  		goto retry;
  	}
  
  	/* We take additional reference here. It will be put back by shrinker */
  	atomic_set(&huge_zero_refcount, 2);
  	preempt_enable();
  	return READ_ONCE(huge_zero_page);
  }
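/*
 * Reference scheme, summarising the code above: huge_zero_refcount sits at 2
 * while the huge zero page is cached - one reference on behalf of the cache
 * itself and one for the current user.  Users drop theirs through
 * put_huge_zero_page(); only the shrinker below can take the count from 1 to
 * 0 and actually free the page.
 */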
  void put_huge_zero_page(void)
  {
  	/*
  	 * Counter should never go to zero here. Only shrinker can put
  	 * last reference.
  	 */
  	BUG_ON(atomic_dec_and_test(&huge_zero_refcount));
  }
  static unsigned long shrink_huge_zero_page_count(struct shrinker *shrink,
  					struct shrink_control *sc)
  {
  	/* we can free zero page only if last reference remains */
  	return atomic_read(&huge_zero_refcount) == 1 ? HPAGE_PMD_NR : 0;
  }

  static unsigned long shrink_huge_zero_page_scan(struct shrinker *shrink,
  				       struct shrink_control *sc)
  {
  	if (atomic_cmpxchg(&huge_zero_refcount, 1, 0) == 1) {
  		struct page *zero_page = xchg(&huge_zero_page, NULL);
  		BUG_ON(zero_page == NULL);
  		__free_pages(zero_page, compound_order(zero_page));
  		return HPAGE_PMD_NR;
  	}
  
  	return 0;
  }
  static struct shrinker huge_zero_page_shrinker = {
  	.count_objects = shrink_huge_zero_page_count,
  	.scan_objects = shrink_huge_zero_page_scan,
  	.seeks = DEFAULT_SEEKS,
  };
  #ifdef CONFIG_SYSFS

  static ssize_t triple_flag_store(struct kobject *kobj,
  				 struct kobj_attribute *attr,
  				 const char *buf, size_t count,
  				 enum transparent_hugepage_flag enabled,
  				 enum transparent_hugepage_flag deferred,
  				 enum transparent_hugepage_flag req_madv)
  {
  	if (!memcmp("defer", buf,
  		    min(sizeof("defer")-1, count))) {
  		if (enabled == deferred)
  			return -EINVAL;
  		clear_bit(enabled, &transparent_hugepage_flags);
  		clear_bit(req_madv, &transparent_hugepage_flags);
  		set_bit(deferred, &transparent_hugepage_flags);
  	} else if (!memcmp("always", buf,
  		    min(sizeof("always")-1, count))) {
  		clear_bit(deferred, &transparent_hugepage_flags);
  		clear_bit(req_madv, &transparent_hugepage_flags);
  		set_bit(enabled, &transparent_hugepage_flags);
  	} else if (!memcmp("madvise", buf,
  			   min(sizeof("madvise")-1, count))) {
  		clear_bit(enabled, &transparent_hugepage_flags);
  		clear_bit(deferred, &transparent_hugepage_flags);
  		set_bit(req_madv, &transparent_hugepage_flags);
  	} else if (!memcmp("never", buf,
  			   min(sizeof("never")-1, count))) {
  		clear_bit(enabled, &transparent_hugepage_flags);
  		clear_bit(req_madv, &transparent_hugepage_flags);
  		clear_bit(deferred, &transparent_hugepage_flags);
  	} else
  		return -EINVAL;
  
  	return count;
  }
  
  static ssize_t enabled_show(struct kobject *kobj,
  			    struct kobj_attribute *attr, char *buf)
  {
	if (test_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags))
		return sprintf(buf, "[always] madvise never\n");
	else if (test_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags))
		return sprintf(buf, "always [madvise] never\n");
	else
		return sprintf(buf, "always madvise [never]\n");
  }

  static ssize_t enabled_store(struct kobject *kobj,
  			     struct kobj_attribute *attr,
  			     const char *buf, size_t count)
  {
  	ssize_t ret;
  	ret = triple_flag_store(kobj, attr, buf, count,
  				TRANSPARENT_HUGEPAGE_FLAG,
  				TRANSPARENT_HUGEPAGE_FLAG,
  				TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG);
  
  	if (ret > 0) {
  		int err;
  
  		mutex_lock(&khugepaged_mutex);
  		err = start_stop_khugepaged();
  		mutex_unlock(&khugepaged_mutex);
  		if (err)
  			ret = err;
  	}
  
  	return ret;
  }
  static struct kobj_attribute enabled_attr =
  	__ATTR(enabled, 0644, enabled_show, enabled_store);
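/*
 * This attribute ends up at /sys/kernel/mm/transparent_hugepage/enabled (the
 * "transparent_hugepage" kobject is created under mm_kobj in
 * hugepage_init_sysfs() below).  Typical administrator usage, shown purely
 * for illustration:
 *
 *	echo always  > /sys/kernel/mm/transparent_hugepage/enabled
 *	echo madvise > /sys/kernel/mm/transparent_hugepage/enabled
 *	echo never   > /sys/kernel/mm/transparent_hugepage/enabled
 */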
  
  static ssize_t single_flag_show(struct kobject *kobj,
  				struct kobj_attribute *attr, char *buf,
  				enum transparent_hugepage_flag flag)
  {
	return sprintf(buf, "%d\n",
		       !!test_bit(flag, &transparent_hugepage_flags));
  }

  static ssize_t single_flag_store(struct kobject *kobj,
  				 struct kobj_attribute *attr,
  				 const char *buf, size_t count,
  				 enum transparent_hugepage_flag flag)
  {
  	unsigned long value;
  	int ret;
  
  	ret = kstrtoul(buf, 10, &value);
  	if (ret < 0)
  		return ret;
  	if (value > 1)
  		return -EINVAL;
  
  	if (value)
  		set_bit(flag, &transparent_hugepage_flags);
  	else
  		clear_bit(flag, &transparent_hugepage_flags);
  
  	return count;
  }
  
  /*
   * Currently defrag only disables __GFP_NOWAIT for allocation. A blind
 * __GFP_REPEAT is too aggressive; it's never worth swapping tons of
   * memory just to allocate one more hugepage.
   */
  static ssize_t defrag_show(struct kobject *kobj,
  			   struct kobj_attribute *attr, char *buf)
  {
	if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags))
		return sprintf(buf, "[always] defer madvise never\n");
	if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags))
		return sprintf(buf, "always [defer] madvise never\n");
	else if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags))
		return sprintf(buf, "always defer [madvise] never\n");
	else
		return sprintf(buf, "always defer madvise [never]\n");
  }
  static ssize_t defrag_store(struct kobject *kobj,
  			    struct kobj_attribute *attr,
  			    const char *buf, size_t count)
  {
  	return triple_flag_store(kobj, attr, buf, count,
  				 TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG,
  				 TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG,
  				 TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG);
  }
  static struct kobj_attribute defrag_attr =
  	__ATTR(defrag, 0644, defrag_show, defrag_store);
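/*
 * Likewise, /sys/kernel/mm/transparent_hugepage/defrag accepts "always",
 * "defer", "madvise" or "never".  triple_flag_store() maps "defer" onto
 * TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, so page faults wake kswapd
 * rather than entering direct reclaim/compaction themselves (see
 * alloc_hugepage_direct_gfpmask() below).
 */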
  static ssize_t use_zero_page_show(struct kobject *kobj,
  		struct kobj_attribute *attr, char *buf)
  {
  	return single_flag_show(kobj, attr, buf,
  				TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);
  }
  static ssize_t use_zero_page_store(struct kobject *kobj,
  		struct kobj_attribute *attr, const char *buf, size_t count)
  {
  	return single_flag_store(kobj, attr, buf, count,
  				 TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);
  }
  static struct kobj_attribute use_zero_page_attr =
  	__ATTR(use_zero_page, 0644, use_zero_page_show, use_zero_page_store);
  #ifdef CONFIG_DEBUG_VM
  static ssize_t debug_cow_show(struct kobject *kobj,
  				struct kobj_attribute *attr, char *buf)
  {
  	return single_flag_show(kobj, attr, buf,
  				TRANSPARENT_HUGEPAGE_DEBUG_COW_FLAG);
  }
  static ssize_t debug_cow_store(struct kobject *kobj,
  			       struct kobj_attribute *attr,
  			       const char *buf, size_t count)
  {
  	return single_flag_store(kobj, attr, buf, count,
  				 TRANSPARENT_HUGEPAGE_DEBUG_COW_FLAG);
  }
  static struct kobj_attribute debug_cow_attr =
  	__ATTR(debug_cow, 0644, debug_cow_show, debug_cow_store);
  #endif /* CONFIG_DEBUG_VM */
  
  static struct attribute *hugepage_attr[] = {
  	&enabled_attr.attr,
  	&defrag_attr.attr,
  	&use_zero_page_attr.attr,
  #ifdef CONFIG_DEBUG_VM
  	&debug_cow_attr.attr,
  #endif
  	NULL,
  };
  
  static struct attribute_group hugepage_attr_group = {
  	.attrs = hugepage_attr,
  };
  
  static ssize_t scan_sleep_millisecs_show(struct kobject *kobj,
  					 struct kobj_attribute *attr,
  					 char *buf)
  {
	return sprintf(buf, "%u\n", khugepaged_scan_sleep_millisecs);
  }
  
  static ssize_t scan_sleep_millisecs_store(struct kobject *kobj,
  					  struct kobj_attribute *attr,
  					  const char *buf, size_t count)
  {
  	unsigned long msecs;
  	int err;
  	err = kstrtoul(buf, 10, &msecs);
  	if (err || msecs > UINT_MAX)
  		return -EINVAL;
  
  	khugepaged_scan_sleep_millisecs = msecs;
  	wake_up_interruptible(&khugepaged_wait);
  
  	return count;
  }
  static struct kobj_attribute scan_sleep_millisecs_attr =
  	__ATTR(scan_sleep_millisecs, 0644, scan_sleep_millisecs_show,
  	       scan_sleep_millisecs_store);
  
  static ssize_t alloc_sleep_millisecs_show(struct kobject *kobj,
  					  struct kobj_attribute *attr,
  					  char *buf)
  {
	return sprintf(buf, "%u\n", khugepaged_alloc_sleep_millisecs);
  }
  
  static ssize_t alloc_sleep_millisecs_store(struct kobject *kobj,
  					   struct kobj_attribute *attr,
  					   const char *buf, size_t count)
  {
  	unsigned long msecs;
  	int err;
  	err = kstrtoul(buf, 10, &msecs);
  	if (err || msecs > UINT_MAX)
  		return -EINVAL;
  
  	khugepaged_alloc_sleep_millisecs = msecs;
  	wake_up_interruptible(&khugepaged_wait);
  
  	return count;
  }
  static struct kobj_attribute alloc_sleep_millisecs_attr =
  	__ATTR(alloc_sleep_millisecs, 0644, alloc_sleep_millisecs_show,
  	       alloc_sleep_millisecs_store);
  
  static ssize_t pages_to_scan_show(struct kobject *kobj,
  				  struct kobj_attribute *attr,
  				  char *buf)
  {
	return sprintf(buf, "%u\n", khugepaged_pages_to_scan);
  }
  static ssize_t pages_to_scan_store(struct kobject *kobj,
  				   struct kobj_attribute *attr,
  				   const char *buf, size_t count)
  {
  	int err;
  	unsigned long pages;
  	err = kstrtoul(buf, 10, &pages);
  	if (err || !pages || pages > UINT_MAX)
  		return -EINVAL;
  
  	khugepaged_pages_to_scan = pages;
  
  	return count;
  }
  static struct kobj_attribute pages_to_scan_attr =
  	__ATTR(pages_to_scan, 0644, pages_to_scan_show,
  	       pages_to_scan_store);
  
  static ssize_t pages_collapsed_show(struct kobject *kobj,
  				    struct kobj_attribute *attr,
  				    char *buf)
  {
	return sprintf(buf, "%u\n", khugepaged_pages_collapsed);
  }
  static struct kobj_attribute pages_collapsed_attr =
  	__ATTR_RO(pages_collapsed);
  
  static ssize_t full_scans_show(struct kobject *kobj,
  			       struct kobj_attribute *attr,
  			       char *buf)
  {
	return sprintf(buf, "%u\n", khugepaged_full_scans);
  }
  static struct kobj_attribute full_scans_attr =
  	__ATTR_RO(full_scans);
  
  static ssize_t khugepaged_defrag_show(struct kobject *kobj,
  				      struct kobj_attribute *attr, char *buf)
  {
  	return single_flag_show(kobj, attr, buf,
  				TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG);
  }
  static ssize_t khugepaged_defrag_store(struct kobject *kobj,
  				       struct kobj_attribute *attr,
  				       const char *buf, size_t count)
  {
  	return single_flag_store(kobj, attr, buf, count,
  				 TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG);
  }
  static struct kobj_attribute khugepaged_defrag_attr =
  	__ATTR(defrag, 0644, khugepaged_defrag_show,
  	       khugepaged_defrag_store);
  
  /*
 * max_ptes_none controls if khugepaged should collapse hugepages over
 * any unmapped ptes, in turn potentially increasing the memory
 * footprint of the vmas. When max_ptes_none is 0, khugepaged will not
 * add any additional ptes into an existing mapping, so it doesn't
 * reduce the available free memory in the system as it
 * runs. Increasing max_ptes_none will instead potentially reduce the
 * free memory in the system during the khugepaged scan.
   */
  static ssize_t khugepaged_max_ptes_none_show(struct kobject *kobj,
  					     struct kobj_attribute *attr,
  					     char *buf)
  {
	return sprintf(buf, "%u\n", khugepaged_max_ptes_none);
  }
  static ssize_t khugepaged_max_ptes_none_store(struct kobject *kobj,
  					      struct kobj_attribute *attr,
  					      const char *buf, size_t count)
  {
  	int err;
  	unsigned long max_ptes_none;
  	err = kstrtoul(buf, 10, &max_ptes_none);
  	if (err || max_ptes_none > HPAGE_PMD_NR-1)
  		return -EINVAL;
  
  	khugepaged_max_ptes_none = max_ptes_none;
  
  	return count;
  }
  static struct kobj_attribute khugepaged_max_ptes_none_attr =
  	__ATTR(max_ptes_none, 0644, khugepaged_max_ptes_none_show,
  	       khugepaged_max_ptes_none_store);
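/*
 * Worked example: with 2MiB huge pages HPAGE_PMD_NR is 512 and
 * hugepage_init() defaults max_ptes_none to HPAGE_PMD_NR - 1 = 511, so
 * khugepaged may collapse a 2MiB range even if only a single pte in it is
 * populated, trading up to 511 freshly allocated pages for the TLB benefit.
 * A value of 0 collapses only fully populated ranges.
 */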
  
  static struct attribute *khugepaged_attr[] = {
  	&khugepaged_defrag_attr.attr,
  	&khugepaged_max_ptes_none_attr.attr,
  	&pages_to_scan_attr.attr,
  	&pages_collapsed_attr.attr,
  	&full_scans_attr.attr,
  	&scan_sleep_millisecs_attr.attr,
  	&alloc_sleep_millisecs_attr.attr,
  	NULL,
  };
  
  static struct attribute_group khugepaged_attr_group = {
  	.attrs = khugepaged_attr,
  	.name = "khugepaged",
  };

  static int __init hugepage_init_sysfs(struct kobject **hugepage_kobj)
  {
  	int err;
  	*hugepage_kobj = kobject_create_and_add("transparent_hugepage", mm_kobj);
  	if (unlikely(!*hugepage_kobj)) {
		pr_err("failed to create transparent hugepage kobject\n");
  		return -ENOMEM;
  	}
  	err = sysfs_create_group(*hugepage_kobj, &hugepage_attr_group);
  	if (err) {
		pr_err("failed to register transparent hugepage group\n");
  		goto delete_obj;
  	}
  	err = sysfs_create_group(*hugepage_kobj, &khugepaged_attr_group);
  	if (err) {
		pr_err("failed to register transparent hugepage group\n");
  		goto remove_hp_group;
  	}
  
  	return 0;
  
  remove_hp_group:
  	sysfs_remove_group(*hugepage_kobj, &hugepage_attr_group);
  delete_obj:
  	kobject_put(*hugepage_kobj);
  	return err;
  }
  
  static void __init hugepage_exit_sysfs(struct kobject *hugepage_kobj)
  {
  	sysfs_remove_group(hugepage_kobj, &khugepaged_attr_group);
  	sysfs_remove_group(hugepage_kobj, &hugepage_attr_group);
  	kobject_put(hugepage_kobj);
  }
  #else
  static inline int hugepage_init_sysfs(struct kobject **hugepage_kobj)
  {
  	return 0;
  }
  
  static inline void hugepage_exit_sysfs(struct kobject *hugepage_kobj)
  {
  }
  #endif /* CONFIG_SYSFS */
  
  static int __init hugepage_init(void)
  {
  	int err;
  	struct kobject *hugepage_kobj;
  
  	if (!has_transparent_hugepage()) {
  		transparent_hugepage_flags = 0;
  		return -EINVAL;
  	}
  	khugepaged_pages_to_scan = HPAGE_PMD_NR * 8;
  	khugepaged_max_ptes_none = HPAGE_PMD_NR - 1;
  	/*
  	 * hugepages can't be allocated by the buddy allocator
  	 */
  	MAYBE_BUILD_BUG_ON(HPAGE_PMD_ORDER >= MAX_ORDER);
  	/*
  	 * we use page->mapping and page->index in second tail page
  	 * as list_head: assuming THP order >= 2
  	 */
  	MAYBE_BUILD_BUG_ON(HPAGE_PMD_ORDER < 2);
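	/*
	 * For scale (illustrative): on x86-64 HPAGE_PMD_ORDER is 9
	 * (2MiB huge pages with 4KiB base pages) while MAX_ORDER defaults
	 * to 11, so both build-time checks above hold comfortably there.
	 */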
  	err = hugepage_init_sysfs(&hugepage_kobj);
  	if (err)
  		goto err_sysfs;
  
  	err = khugepaged_slab_init();
  	if (err)
  		goto err_slab;

  	err = register_shrinker(&huge_zero_page_shrinker);
  	if (err)
  		goto err_hzp_shrinker;
  	err = register_shrinker(&deferred_split_shrinker);
  	if (err)
  		goto err_split_shrinker;

  	/*
  	 * By default disable transparent hugepages on smaller systems,
  	 * where the extra memory used could hurt more than TLB overhead
  	 * is likely to save.  The admin can still enable it through /sys.
  	 */
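	/*
	 * "Smaller" here means totalram_pages below 512 << (20 - PAGE_SHIFT),
	 * i.e. 512MiB expressed in pages (131072 pages with 4KiB pages).
	 */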
  	if (totalram_pages < (512 << (20 - PAGE_SHIFT))) {
  		transparent_hugepage_flags = 0;
  		return 0;
  	}

  	err = start_stop_khugepaged();
  	if (err)
  		goto err_khugepaged;

  	return 0;
  err_khugepaged:
  	unregister_shrinker(&deferred_split_shrinker);
  err_split_shrinker:
  	unregister_shrinker(&huge_zero_page_shrinker);
  err_hzp_shrinker:
  	khugepaged_slab_exit();
  err_slab:
  	hugepage_exit_sysfs(hugepage_kobj);
  err_sysfs:
  	return err;
  }
  subsys_initcall(hugepage_init);
  
  static int __init setup_transparent_hugepage(char *str)
  {
  	int ret = 0;
  	if (!str)
  		goto out;
  	if (!strcmp(str, "always")) {
  		set_bit(TRANSPARENT_HUGEPAGE_FLAG,
  			&transparent_hugepage_flags);
  		clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
  			  &transparent_hugepage_flags);
  		ret = 1;
  	} else if (!strcmp(str, "madvise")) {
  		clear_bit(TRANSPARENT_HUGEPAGE_FLAG,
  			  &transparent_hugepage_flags);
  		set_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
  			&transparent_hugepage_flags);
  		ret = 1;
  	} else if (!strcmp(str, "never")) {
  		clear_bit(TRANSPARENT_HUGEPAGE_FLAG,
  			  &transparent_hugepage_flags);
  		clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
  			  &transparent_hugepage_flags);
  		ret = 1;
  	}
  out:
  	if (!ret)
		pr_warn("transparent_hugepage= cannot parse, ignored\n");
  	return ret;
  }
  __setup("transparent_hugepage=", setup_transparent_hugepage);
  pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma)
  {
  	if (likely(vma->vm_flags & VM_WRITE))
  		pmd = pmd_mkwrite(pmd);
  	return pmd;
  }
  static inline pmd_t mk_huge_pmd(struct page *page, pgprot_t prot)
  {
  	pmd_t entry;
  	entry = mk_pmd(page, prot);
  	entry = pmd_mkhuge(entry);
  	return entry;
  }
  static inline struct list_head *page_deferred_list(struct page *page)
  {
  	/*
  	 * ->lru in the tail pages is occupied by compound_head.
  	 * Let's use ->mapping + ->index in the second tail page as list_head.
  	 */
  	return (struct list_head *)&page[2].mapping;
  }
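/*
 * The cast above relies on ->mapping and ->index being adjacent words in
 * struct page (together the size of a list_head) and on the deferred-split
 * list living in the second tail page only, which is why both
 * prep_transhuge_page() below and hugepage_init() insist on THP order >= 2.
 */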
  
  void prep_transhuge_page(struct page *page)
  {
  	/*
	 * we use page->mapping and page->index in second tail page
  	 * as list_head: assuming THP order >= 2
  	 */
  
  	INIT_LIST_HEAD(page_deferred_list(page));
  	set_compound_page_dtor(page, TRANSHUGE_PAGE_DTOR);
  }
  static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
  					struct vm_area_struct *vma,
  					unsigned long address, pmd_t *pmd,
  					struct page *page, gfp_t gfp,
  					unsigned int flags)
  {
  	struct mem_cgroup *memcg;
  	pgtable_t pgtable;
  	spinlock_t *ptl;
  	unsigned long haddr = address & HPAGE_PMD_MASK;

  	VM_BUG_ON_PAGE(!PageCompound(page), page);

  	if (mem_cgroup_try_charge(page, mm, gfp, &memcg, true)) {
  		put_page(page);
  		count_vm_event(THP_FAULT_FALLBACK);
  		return VM_FAULT_FALLBACK;
  	}

  	pgtable = pte_alloc_one(mm, haddr);
  	if (unlikely(!pgtable)) {
  		mem_cgroup_cancel_charge(page, memcg, true);
  		put_page(page);
  		return VM_FAULT_OOM;
  	}
  
  	clear_huge_page(page, haddr, HPAGE_PMD_NR);
  	/*
  	 * The memory barrier inside __SetPageUptodate makes sure that
  	 * clear_huge_page writes become visible before the set_pmd_at()
  	 * write.
  	 */
  	__SetPageUptodate(page);
  	ptl = pmd_lock(mm, pmd);
  	if (unlikely(!pmd_none(*pmd))) {
  		spin_unlock(ptl);
  		mem_cgroup_cancel_charge(page, memcg, true);
  		put_page(page);
  		pte_free(mm, pgtable);
  	} else {
  		pmd_t entry;
  
  		/* Deliver the page fault to userland */
  		if (userfaultfd_missing(vma)) {
  			int ret;
  
  			spin_unlock(ptl);
  			mem_cgroup_cancel_charge(page, memcg, true);
  			put_page(page);
  			pte_free(mm, pgtable);
  			ret = handle_userfault(vma, address, flags,
  					       VM_UFFD_MISSING);
  			VM_BUG_ON(ret & VM_FAULT_FALLBACK);
  			return ret;
  		}
  		entry = mk_huge_pmd(page, vma->vm_page_prot);
  		entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
  		page_add_new_anon_rmap(page, vma, haddr, true);
  		mem_cgroup_commit_charge(page, memcg, false, true);
  		lru_cache_add_active_or_unevictable(page, vma);
  		pgtable_trans_huge_deposit(mm, pmd, pgtable);
  		set_pmd_at(mm, haddr, pmd, entry);
  		add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR);
  		atomic_long_inc(&mm->nr_ptes);
  		spin_unlock(ptl);
  		count_vm_event(THP_FAULT_ALLOC);
  	}
  	return 0;
  }
  /*
   * If THP is set to always then directly reclaim/compact as necessary
   * If set to defer then do no reclaim and defer to khugepaged
   * If set to madvise and the VMA is flagged then directly reclaim/compact
   */
  static inline gfp_t alloc_hugepage_direct_gfpmask(struct vm_area_struct *vma)
  {
  	gfp_t reclaim_flags = 0;
  
  	if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags) &&
  	    (vma->vm_flags & VM_HUGEPAGE))
  		reclaim_flags = __GFP_DIRECT_RECLAIM;
  	else if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags))
  		reclaim_flags = __GFP_KSWAPD_RECLAIM;
  	else if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags))
  		reclaim_flags = __GFP_DIRECT_RECLAIM;
  
  	return GFP_TRANSHUGE | reclaim_flags;
  }
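/*
 * The resulting masks, for reference:
 *
 *	"always"                        GFP_TRANSHUGE | __GFP_DIRECT_RECLAIM
 *	"defer"                         GFP_TRANSHUGE | __GFP_KSWAPD_RECLAIM
 *	"madvise" on a VM_HUGEPAGE vma  GFP_TRANSHUGE | __GFP_DIRECT_RECLAIM
 *	anything else                   GFP_TRANSHUGE (no reclaim at fault time)
 */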
  
  /* Defrag for khugepaged will enter direct reclaim/compaction if necessary */
  static inline gfp_t alloc_hugepage_khugepaged_gfpmask(void)
  {
  	return GFP_TRANSHUGE | (khugepaged_defrag() ? __GFP_DIRECT_RECLAIM : 0);
  }
  /* Caller must hold page table lock. */
  static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm,
  		struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd,
  		struct page *zero_page)
  {
  	pmd_t entry;
  	if (!pmd_none(*pmd))
  		return false;
  	entry = mk_pmd(zero_page, vma->vm_page_prot);
  	entry = pmd_mkhuge(entry);
  	if (pgtable)
  		pgtable_trans_huge_deposit(mm, pmd, pgtable);
  	set_pmd_at(mm, haddr, pmd, entry);
  	atomic_long_inc(&mm->nr_ptes);
  	return true;
  }
  int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
  			       unsigned long address, pmd_t *pmd,
  			       unsigned int flags)
  {
  	gfp_t gfp;
  	struct page *page;
  	unsigned long haddr = address & HPAGE_PMD_MASK;

  	if (haddr < vma->vm_start || haddr + HPAGE_PMD_SIZE > vma->vm_end)
  		return VM_FAULT_FALLBACK;
  	if (unlikely(anon_vma_prepare(vma)))
  		return VM_FAULT_OOM;
  	if (unlikely(khugepaged_enter(vma, vma->vm_flags)))
  		return VM_FAULT_OOM;
  	if (!(flags & FAULT_FLAG_WRITE) && !mm_forbids_zeropage(mm) &&
  			transparent_hugepage_use_zero_page()) {
  		spinlock_t *ptl;
  		pgtable_t pgtable;
  		struct page *zero_page;
  		bool set;
  		int ret;
  		pgtable = pte_alloc_one(mm, haddr);
  		if (unlikely(!pgtable))
  			return VM_FAULT_OOM;
  		zero_page = get_huge_zero_page();
  		if (unlikely(!zero_page)) {
  			pte_free(mm, pgtable);
  			count_vm_event(THP_FAULT_FALLBACK);
  			return VM_FAULT_FALLBACK;
  		}
  		ptl = pmd_lock(mm, pmd);
  		ret = 0;
  		set = false;
  		if (pmd_none(*pmd)) {
  			if (userfaultfd_missing(vma)) {
  				spin_unlock(ptl);
  				ret = handle_userfault(vma, address, flags,
  						       VM_UFFD_MISSING);
  				VM_BUG_ON(ret & VM_FAULT_FALLBACK);
  			} else {
  				set_huge_zero_page(pgtable, mm, vma,
  						   haddr, pmd,
  						   zero_page);
  				spin_unlock(ptl);
  				set = true;
  			}
  		} else
  			spin_unlock(ptl);
  		if (!set) {
  			pte_free(mm, pgtable);
  			put_huge_zero_page();
  		}
  		return ret;
  	}
  	gfp = alloc_hugepage_direct_gfpmask(vma);
  	page = alloc_hugepage_vma(gfp, vma, haddr, HPAGE_PMD_ORDER);
  	if (unlikely(!page)) {
  		count_vm_event(THP_FAULT_FALLBACK);
  		return VM_FAULT_FALLBACK;
  	}
  	prep_transhuge_page(page);
  	return __do_huge_pmd_anonymous_page(mm, vma, address, pmd, page, gfp,
  					    flags);
  }
  static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
  		pmd_t *pmd, pfn_t pfn, pgprot_t prot, bool write)
  {
  	struct mm_struct *mm = vma->vm_mm;
  	pmd_t entry;
  	spinlock_t *ptl;
  
  	ptl = pmd_lock(mm, pmd);
  	entry = pmd_mkhuge(pfn_t_pmd(pfn, prot));
  	if (pfn_t_devmap(pfn))
  		entry = pmd_mkdevmap(entry);
  	if (write) {
  		entry = pmd_mkyoung(pmd_mkdirty(entry));
  		entry = maybe_pmd_mkwrite(entry, vma);
  	}
  	set_pmd_at(mm, addr, pmd, entry);
  	update_mmu_cache_pmd(vma, addr, pmd);
  	spin_unlock(ptl);
  }
  
  int vmf_insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
  			pmd_t *pmd, pfn_t pfn, bool write)
  {
  	pgprot_t pgprot = vma->vm_page_prot;
  	/*
  	 * If we had pmd_special, we could avoid all these restrictions,
  	 * but we need to be consistent with PTEs and architectures that
  	 * can't support a 'special' bit.
  	 */
  	BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)));
  	BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) ==
  						(VM_PFNMAP|VM_MIXEDMAP));
  	BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags));
  	BUG_ON(!pfn_t_devmap(pfn));
  
  	if (addr < vma->vm_start || addr >= vma->vm_end)
  		return VM_FAULT_SIGBUS;
  	if (track_pfn_insert(vma, &pgprot, pfn))
  		return VM_FAULT_SIGBUS;
  	insert_pfn_pmd(vma, addr, pmd, pfn, pgprot, write);
  	return VM_FAULT_NOPAGE;
  }
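/*
 * vmf_insert_pfn_pmd() is the huge-pmd analogue of vm_insert_pfn(): a
 * pmd_fault handler (DAX, for instance) calls it with the pfn it wants
 * mapped, roughly (sketch, error handling omitted):
 *
 *	return vmf_insert_pfn_pmd(vma, addr, pmd, pfn, write);
 */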
  static void touch_pmd(struct vm_area_struct *vma, unsigned long addr,
  		pmd_t *pmd)
  {
  	pmd_t _pmd;
  
  	/*
  	 * We should set the dirty bit only for FOLL_WRITE but for now
  	 * the dirty bit in the pmd is meaningless.  And if the dirty
	 * bit ever becomes meaningful and we only set it with
  	 * FOLL_WRITE, an atomic set_bit will be required on the pmd to
  	 * set the young bit, instead of the current set_pmd_at.
  	 */
  	_pmd = pmd_mkyoung(pmd_mkdirty(*pmd));
  	if (pmdp_set_access_flags(vma, addr & HPAGE_PMD_MASK,
  				pmd, _pmd,  1))
  		update_mmu_cache_pmd(vma, addr, pmd);
  }
  
  struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr,
  		pmd_t *pmd, int flags)
  {
  	unsigned long pfn = pmd_pfn(*pmd);
  	struct mm_struct *mm = vma->vm_mm;
  	struct dev_pagemap *pgmap;
  	struct page *page;
  
  	assert_spin_locked(pmd_lockptr(mm, pmd));
  
  	if (flags & FOLL_WRITE && !pmd_write(*pmd))
  		return NULL;
  
  	if (pmd_present(*pmd) && pmd_devmap(*pmd))
  		/* pass */;
  	else
  		return NULL;
  
  	if (flags & FOLL_TOUCH)
  		touch_pmd(vma, addr, pmd);
  
  	/*
  	 * device mapped pages can only be returned if the
  	 * caller will manage the page reference count.
  	 */
  	if (!(flags & FOLL_GET))
  		return ERR_PTR(-EEXIST);
  
  	pfn += (addr & ~PMD_MASK) >> PAGE_SHIFT;
  	pgmap = get_dev_pagemap(pfn, NULL);
  	if (!pgmap)
  		return ERR_PTR(-EFAULT);
  	page = pfn_to_page(pfn);
  	get_page(page);
  	put_dev_pagemap(pgmap);
  
  	return page;
  }
  int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
  		  pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr,
  		  struct vm_area_struct *vma)
  {
  	spinlock_t *dst_ptl, *src_ptl;
  	struct page *src_page;
  	pmd_t pmd;
  	pgtable_t pgtable = NULL;
  	int ret;
  	if (!vma_is_dax(vma)) {
  		ret = -ENOMEM;
  		pgtable = pte_alloc_one(dst_mm, addr);
  		if (unlikely(!pgtable))
  			goto out;
  	}

  	dst_ptl = pmd_lock(dst_mm, dst_pmd);
  	src_ptl = pmd_lockptr(src_mm, src_pmd);
  	spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
  
  	ret = -EAGAIN;
  	pmd = *src_pmd;
  	if (unlikely(!pmd_trans_huge(pmd) && !pmd_devmap(pmd))) {
  		pte_free(dst_mm, pgtable);
  		goto out_unlock;
  	}
  	/*
  	 * When page table lock is held, the huge zero pmd should not be
  	 * under splitting since we don't split the page itself, only pmd to
  	 * a page table.
  	 */
  	if (is_huge_zero_pmd(pmd)) {
  		struct page *zero_page;
  		/*
  		 * get_huge_zero_page() will never allocate a new page here,
  		 * since we already have a zero page to copy. It just takes a
  		 * reference.
  		 */
  		zero_page = get_huge_zero_page();
  		set_huge_zero_page(pgtable, dst_mm, vma, addr, dst_pmd,
  				zero_page);
  		ret = 0;
  		goto out_unlock;
  	}

  	if (!vma_is_dax(vma)) {
  		/* thp accounting separate from pmd_devmap accounting */
  		src_page = pmd_page(pmd);
  		VM_BUG_ON_PAGE(!PageHead(src_page), src_page);
  		get_page(src_page);
  		page_dup_rmap(src_page, true);
  		add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR);
  		atomic_long_inc(&dst_mm->nr_ptes);
  		pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable);
  	}
  
  	pmdp_set_wrprotect(src_mm, addr, src_pmd);
  	pmd = pmd_mkold(pmd_wrprotect(pmd));
  	set_pmd_at(dst_mm, addr, dst_pmd, pmd);
  
  	ret = 0;
  out_unlock:
  	spin_unlock(src_ptl);
  	spin_unlock(dst_ptl);
  out:
  	return ret;
  }
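/*
 * After a successful copy, parent and child share the same THP through a
 * write-protected, old pmd: pmdp_set_wrprotect() above hits the parent and
 * the copied entry is write-protected before set_pmd_at(), so the first
 * write from either side faults into do_huge_pmd_wp_page() for COW.
 */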
a1dd450bc   Will Deacon   mm: thp: set the ...
1122
1123
1124
1125
1126
1127
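  /*
   * Fault path helper for a huge pmd that only needs the accessed (and
   * possibly dirty) bit set: revalidate the pmd under its lock, then
   * update the access flags and the MMU cache if the entry changed.
   */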
  void huge_pmd_set_accessed(struct mm_struct *mm,
  			   struct vm_area_struct *vma,
  			   unsigned long address,
  			   pmd_t *pmd, pmd_t orig_pmd,
  			   int dirty)
  {
c4088ebdc   Kirill A. Shutemov   mm: convert the r...
1128
  	spinlock_t *ptl;
a1dd450bc   Will Deacon   mm: thp: set the ...
1129
1130
  	pmd_t entry;
  	unsigned long haddr;
c4088ebdc   Kirill A. Shutemov   mm: convert the r...
1131
  	ptl = pmd_lock(mm, pmd);
a1dd450bc   Will Deacon   mm: thp: set the ...
1132
1133
1134
1135
1136
1137
1138
1139
1140
  	if (unlikely(!pmd_same(*pmd, orig_pmd)))
  		goto unlock;
  
  	entry = pmd_mkyoung(orig_pmd);
  	haddr = address & HPAGE_PMD_MASK;
  	if (pmdp_set_access_flags(vma, haddr, pmd, entry, dirty))
  		update_mmu_cache_pmd(vma, address, pmd);
  
  unlock:
c4088ebdc   Kirill A. Shutemov   mm: convert the r...
1141
  	spin_unlock(ptl);
a1dd450bc   Will Deacon   mm: thp: set the ...
1142
  }
71e3aac07   Andrea Arcangeli   thp: transparent ...
1143
1144
1145
1146
1147
1148
1149
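  /*
   * Fallback for write-protect faults when a replacement huge page could
   * not be allocated: copy the old THP into HPAGE_PMD_NR individually
   * charged small pages, then replace the huge pmd with a regular page
   * table mapping them.
   */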
  static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
  					struct vm_area_struct *vma,
  					unsigned long address,
  					pmd_t *pmd, pmd_t orig_pmd,
  					struct page *page,
  					unsigned long haddr)
  {
00501b531   Johannes Weiner   mm: memcontrol: r...
1150
  	struct mem_cgroup *memcg;
c4088ebdc   Kirill A. Shutemov   mm: convert the r...
1151
  	spinlock_t *ptl;
71e3aac07   Andrea Arcangeli   thp: transparent ...
1152
1153
1154
1155
  	pgtable_t pgtable;
  	pmd_t _pmd;
  	int ret = 0, i;
  	struct page **pages;
2ec74c3ef   Sagi Grimberg   mm: move all mmu ...
1156
1157
  	unsigned long mmun_start;	/* For mmu_notifiers */
  	unsigned long mmun_end;		/* For mmu_notifiers */
71e3aac07   Andrea Arcangeli   thp: transparent ...
1158
1159
1160
1161
1162
1163
1164
1165
1166
  
  	pages = kmalloc(sizeof(struct page *) * HPAGE_PMD_NR,
  			GFP_KERNEL);
  	if (unlikely(!pages)) {
  		ret |= VM_FAULT_OOM;
  		goto out;
  	}
  
  	for (i = 0; i < HPAGE_PMD_NR; i++) {
cc5d462f7   Andi Kleen   mm: use __GFP_OTH...
1167
1168
  		pages[i] = alloc_page_vma_node(GFP_HIGHUSER_MOVABLE |
  					       __GFP_OTHER_NODE,
19ee151e1   Andi Kleen   mm: preserve orig...
1169
  					       vma, address, page_to_nid(page));
b9bbfbe30   Andrea Arcangeli   thp: memcg huge m...
1170
  		if (unlikely(!pages[i] ||
00501b531   Johannes Weiner   mm: memcontrol: r...
1171
  			     mem_cgroup_try_charge(pages[i], mm, GFP_KERNEL,
f627c2f53   Kirill A. Shutemov   memcg: adjust to ...
1172
  						   &memcg, false))) {
b9bbfbe30   Andrea Arcangeli   thp: memcg huge m...
1173
  			if (pages[i])
71e3aac07   Andrea Arcangeli   thp: transparent ...
1174
  				put_page(pages[i]);
b9bbfbe30   Andrea Arcangeli   thp: memcg huge m...
1175
  			while (--i >= 0) {
00501b531   Johannes Weiner   mm: memcontrol: r...
1176
1177
  				memcg = (void *)page_private(pages[i]);
  				set_page_private(pages[i], 0);
f627c2f53   Kirill A. Shutemov   memcg: adjust to ...
1178
1179
  				mem_cgroup_cancel_charge(pages[i], memcg,
  						false);
b9bbfbe30   Andrea Arcangeli   thp: memcg huge m...
1180
1181
  				put_page(pages[i]);
  			}
71e3aac07   Andrea Arcangeli   thp: transparent ...
1182
1183
1184
1185
  			kfree(pages);
  			ret |= VM_FAULT_OOM;
  			goto out;
  		}
00501b531   Johannes Weiner   mm: memcontrol: r...
1186
  		set_page_private(pages[i], (unsigned long)memcg);
71e3aac07   Andrea Arcangeli   thp: transparent ...
1187
1188
1189
1190
  	}
  
  	for (i = 0; i < HPAGE_PMD_NR; i++) {
  		copy_user_highpage(pages[i], page + i,
0089e4853   Hillf Danton   mm/huge_memory: f...
1191
  				   haddr + PAGE_SIZE * i, vma);
71e3aac07   Andrea Arcangeli   thp: transparent ...
1192
1193
1194
  		__SetPageUptodate(pages[i]);
  		cond_resched();
  	}
2ec74c3ef   Sagi Grimberg   mm: move all mmu ...
1195
1196
1197
  	mmun_start = haddr;
  	mmun_end   = haddr + HPAGE_PMD_SIZE;
  	mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
c4088ebdc   Kirill A. Shutemov   mm: convert the r...
1198
  	ptl = pmd_lock(mm, pmd);
71e3aac07   Andrea Arcangeli   thp: transparent ...
1199
1200
  	if (unlikely(!pmd_same(*pmd, orig_pmd)))
  		goto out_free_pages;
309381fea   Sasha Levin   mm: dump page whe...
1201
  	VM_BUG_ON_PAGE(!PageHead(page), page);
71e3aac07   Andrea Arcangeli   thp: transparent ...
1202

8809aa2d2   Aneesh Kumar K.V   mm: clarify that ...
1203
  	pmdp_huge_clear_flush_notify(vma, haddr, pmd);
71e3aac07   Andrea Arcangeli   thp: transparent ...
1204
  	/* leave pmd empty until pte is filled */
6b0b50b06   Aneesh Kumar K.V   mm/THP: add pmd a...
1205
  	pgtable = pgtable_trans_huge_withdraw(mm, pmd);
71e3aac07   Andrea Arcangeli   thp: transparent ...
1206
1207
1208
1209
1210
1211
  	pmd_populate(mm, &_pmd, pgtable);
  
  	for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
  		pte_t *pte, entry;
  		entry = mk_pte(pages[i], vma->vm_page_prot);
  		entry = maybe_mkwrite(pte_mkdirty(entry), vma);
00501b531   Johannes Weiner   mm: memcontrol: r...
1212
1213
  		memcg = (void *)page_private(pages[i]);
  		set_page_private(pages[i], 0);
d281ee614   Kirill A. Shutemov   rmap: add argumen...
1214
  		page_add_new_anon_rmap(pages[i], vma, haddr, false);
f627c2f53   Kirill A. Shutemov   memcg: adjust to ...
1215
  		mem_cgroup_commit_charge(pages[i], memcg, false, false);
00501b531   Johannes Weiner   mm: memcontrol: r...
1216
  		lru_cache_add_active_or_unevictable(pages[i], vma);
71e3aac07   Andrea Arcangeli   thp: transparent ...
1217
1218
1219
1220
1221
1222
  		pte = pte_offset_map(&_pmd, haddr);
  		VM_BUG_ON(!pte_none(*pte));
  		set_pte_at(mm, haddr, pte, entry);
  		pte_unmap(pte);
  	}
  	kfree(pages);
71e3aac07   Andrea Arcangeli   thp: transparent ...
1223
1224
  	smp_wmb(); /* make pte visible before pmd */
  	pmd_populate(mm, pmd, pgtable);
d281ee614   Kirill A. Shutemov   rmap: add argumen...
1225
  	page_remove_rmap(page, true);
c4088ebdc   Kirill A. Shutemov   mm: convert the r...
1226
  	spin_unlock(ptl);
71e3aac07   Andrea Arcangeli   thp: transparent ...
1227

2ec74c3ef   Sagi Grimberg   mm: move all mmu ...
1228
  	mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
71e3aac07   Andrea Arcangeli   thp: transparent ...
1229
1230
1231
1232
1233
1234
1235
  	ret |= VM_FAULT_WRITE;
  	put_page(page);
  
  out:
  	return ret;
  
  out_free_pages:
c4088ebdc   Kirill A. Shutemov   mm: convert the r...
1236
  	spin_unlock(ptl);
2ec74c3ef   Sagi Grimberg   mm: move all mmu ...
1237
  	mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
b9bbfbe30   Andrea Arcangeli   thp: memcg huge m...
1238
  	for (i = 0; i < HPAGE_PMD_NR; i++) {
00501b531   Johannes Weiner   mm: memcontrol: r...
1239
1240
  		memcg = (void *)page_private(pages[i]);
  		set_page_private(pages[i], 0);
f627c2f53   Kirill A. Shutemov   memcg: adjust to ...
1241
  		mem_cgroup_cancel_charge(pages[i], memcg, false);
71e3aac07   Andrea Arcangeli   thp: transparent ...
1242
  		put_page(pages[i]);
b9bbfbe30   Andrea Arcangeli   thp: memcg huge m...
1243
  	}
71e3aac07   Andrea Arcangeli   thp: transparent ...
1244
1245
1246
1247
1248
1249
1250
  	kfree(pages);
  	goto out;
  }
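
  /*
   * Handle a write fault against a huge pmd: reuse the page in place when
   * this mapping is the only user, otherwise try to allocate a fresh THP
   * and copy into it, falling back to splitting the pmd (or to
   * do_huge_pmd_wp_page_fallback()) when that allocation fails.
   */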
  
  int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
  			unsigned long address, pmd_t *pmd, pmd_t orig_pmd)
  {
c4088ebdc   Kirill A. Shutemov   mm: convert the r...
1251
  	spinlock_t *ptl;
71e3aac07   Andrea Arcangeli   thp: transparent ...
1252
  	int ret = 0;
93b4796de   Kirill A. Shutemov   thp: do_huge_pmd_...
1253
  	struct page *page = NULL, *new_page;
00501b531   Johannes Weiner   mm: memcontrol: r...
1254
  	struct mem_cgroup *memcg;
71e3aac07   Andrea Arcangeli   thp: transparent ...
1255
  	unsigned long haddr;
2ec74c3ef   Sagi Grimberg   mm: move all mmu ...
1256
1257
  	unsigned long mmun_start;	/* For mmu_notifiers */
  	unsigned long mmun_end;		/* For mmu_notifiers */
3b3636924   Michal Hocko   mm, memcg: sync a...
1258
  	gfp_t huge_gfp;			/* for allocation and charge */
71e3aac07   Andrea Arcangeli   thp: transparent ...
1259

c4088ebdc   Kirill A. Shutemov   mm: convert the r...
1260
  	ptl = pmd_lockptr(mm, pmd);
81d1b09c6   Sasha Levin   mm: convert a few...
1261
  	VM_BUG_ON_VMA(!vma->anon_vma, vma);
93b4796de   Kirill A. Shutemov   thp: do_huge_pmd_...
1262
1263
1264
  	haddr = address & HPAGE_PMD_MASK;
  	if (is_huge_zero_pmd(orig_pmd))
  		goto alloc;
c4088ebdc   Kirill A. Shutemov   mm: convert the r...
1265
  	spin_lock(ptl);
71e3aac07   Andrea Arcangeli   thp: transparent ...
1266
1267
1268
1269
  	if (unlikely(!pmd_same(*pmd, orig_pmd)))
  		goto out_unlock;
  
  	page = pmd_page(orig_pmd);
309381fea   Sasha Levin   mm: dump page whe...
1270
  	VM_BUG_ON_PAGE(!PageCompound(page) || !PageHead(page), page);
1f25fe20a   Kirill A. Shutemov   mm, thp: adjust c...
1271
1272
  	/*
  	 * We can only reuse the page if nobody else maps the huge page or any
6d0a07edd   Andrea Arcangeli   mm: thp: calculat...
1273
  	 * part of it.
1f25fe20a   Kirill A. Shutemov   mm, thp: adjust c...
1274
  	 */
6d0a07edd   Andrea Arcangeli   mm: thp: calculat...
1275
  	if (page_trans_huge_mapcount(page, NULL) == 1) {
71e3aac07   Andrea Arcangeli   thp: transparent ...
1276
1277
1278
1279
  		pmd_t entry;
  		entry = pmd_mkyoung(orig_pmd);
  		entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
  		if (pmdp_set_access_flags(vma, haddr, pmd, entry,  1))
b113da657   David Miller   mm: Add and use u...
1280
  			update_mmu_cache_pmd(vma, address, pmd);
71e3aac07   Andrea Arcangeli   thp: transparent ...
1281
1282
1283
  		ret |= VM_FAULT_WRITE;
  		goto out_unlock;
  	}
ddc58f27f   Kirill A. Shutemov   mm: drop tail pag...
1284
  	get_page(page);
c4088ebdc   Kirill A. Shutemov   mm: convert the r...
1285
  	spin_unlock(ptl);
93b4796de   Kirill A. Shutemov   thp: do_huge_pmd_...
1286
  alloc:
71e3aac07   Andrea Arcangeli   thp: transparent ...
1287
  	if (transparent_hugepage_enabled(vma) &&
077fcf116   Aneesh Kumar K.V   mm/thp: allocate ...
1288
  	    !transparent_hugepage_debug_cow()) {
444eb2a44   Mel Gorman   mm: thp: set THP ...
1289
  		huge_gfp = alloc_hugepage_direct_gfpmask(vma);
3b3636924   Michal Hocko   mm, memcg: sync a...
1290
  		new_page = alloc_hugepage_vma(huge_gfp, vma, haddr, HPAGE_PMD_ORDER);
077fcf116   Aneesh Kumar K.V   mm/thp: allocate ...
1291
  	} else
71e3aac07   Andrea Arcangeli   thp: transparent ...
1292
  		new_page = NULL;
9a982250f   Kirill A. Shutemov   thp: introduce de...
1293
1294
1295
  	if (likely(new_page)) {
  		prep_transhuge_page(new_page);
  	} else {
eecc1e426   Hugh Dickins   thp: fix copy_pag...
1296
  		if (!page) {
78ddc5347   Kirill A. Shutemov   thp: rename split...
1297
  			split_huge_pmd(vma, pmd, address);
e9b71ca91   Kirill A. Shutemov   mm, thp: drop do_...
1298
  			ret |= VM_FAULT_FALLBACK;
93b4796de   Kirill A. Shutemov   thp: do_huge_pmd_...
1299
1300
1301
  		} else {
  			ret = do_huge_pmd_wp_page_fallback(mm, vma, address,
  					pmd, orig_pmd, page, haddr);
9845cbbd1   Kirill A. Shutemov   mm, thp: fix infi...
1302
  			if (ret & VM_FAULT_OOM) {
78ddc5347   Kirill A. Shutemov   thp: rename split...
1303
  				split_huge_pmd(vma, pmd, address);
9845cbbd1   Kirill A. Shutemov   mm, thp: fix infi...
1304
1305
  				ret |= VM_FAULT_FALLBACK;
  			}
ddc58f27f   Kirill A. Shutemov   mm: drop tail pag...
1306
  			put_page(page);
93b4796de   Kirill A. Shutemov   thp: do_huge_pmd_...
1307
  		}
17766dde3   David Rientjes   mm, thp: count th...
1308
  		count_vm_event(THP_FAULT_FALLBACK);
71e3aac07   Andrea Arcangeli   thp: transparent ...
1309
1310
  		goto out;
  	}
f627c2f53   Kirill A. Shutemov   memcg: adjust to ...
1311
1312
  	if (unlikely(mem_cgroup_try_charge(new_page, mm, huge_gfp, &memcg,
  					   true))) {
b9bbfbe30   Andrea Arcangeli   thp: memcg huge m...
1313
  		put_page(new_page);
93b4796de   Kirill A. Shutemov   thp: do_huge_pmd_...
1314
  		if (page) {
78ddc5347   Kirill A. Shutemov   thp: rename split...
1315
  			split_huge_pmd(vma, pmd, address);
ddc58f27f   Kirill A. Shutemov   mm: drop tail pag...
1316
  			put_page(page);
9845cbbd1   Kirill A. Shutemov   mm, thp: fix infi...
1317
  		} else
78ddc5347   Kirill A. Shutemov   thp: rename split...
1318
  			split_huge_pmd(vma, pmd, address);
9845cbbd1   Kirill A. Shutemov   mm, thp: fix infi...
1319
  		ret |= VM_FAULT_FALLBACK;
17766dde3   David Rientjes   mm, thp: count th...
1320
  		count_vm_event(THP_FAULT_FALLBACK);
b9bbfbe30   Andrea Arcangeli   thp: memcg huge m...
1321
1322
  		goto out;
  	}
17766dde3   David Rientjes   mm, thp: count th...
1323
  	count_vm_event(THP_FAULT_ALLOC);
eecc1e426   Hugh Dickins   thp: fix copy_pag...
1324
  	if (!page)
93b4796de   Kirill A. Shutemov   thp: do_huge_pmd_...
1325
1326
1327
  		clear_huge_page(new_page, haddr, HPAGE_PMD_NR);
  	else
  		copy_user_huge_page(new_page, page, haddr, vma, HPAGE_PMD_NR);
71e3aac07   Andrea Arcangeli   thp: transparent ...
1328
  	__SetPageUptodate(new_page);
2ec74c3ef   Sagi Grimberg   mm: move all mmu ...
1329
1330
1331
  	mmun_start = haddr;
  	mmun_end   = haddr + HPAGE_PMD_SIZE;
  	mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
c4088ebdc   Kirill A. Shutemov   mm: convert the r...
1332
  	spin_lock(ptl);
93b4796de   Kirill A. Shutemov   thp: do_huge_pmd_...
1333
  	if (page)
ddc58f27f   Kirill A. Shutemov   mm: drop tail pag...
1334
  		put_page(page);
b9bbfbe30   Andrea Arcangeli   thp: memcg huge m...
1335
  	if (unlikely(!pmd_same(*pmd, orig_pmd))) {
c4088ebdc   Kirill A. Shutemov   mm: convert the r...
1336
  		spin_unlock(ptl);
f627c2f53   Kirill A. Shutemov   memcg: adjust to ...
1337
  		mem_cgroup_cancel_charge(new_page, memcg, true);
71e3aac07   Andrea Arcangeli   thp: transparent ...
1338
  		put_page(new_page);
2ec74c3ef   Sagi Grimberg   mm: move all mmu ...
1339
  		goto out_mn;
b9bbfbe30   Andrea Arcangeli   thp: memcg huge m...
1340
  	} else {
71e3aac07   Andrea Arcangeli   thp: transparent ...
1341
  		pmd_t entry;
3122359a6   Kirill A. Shutemov   thp: move maybe_p...
1342
1343
  		entry = mk_huge_pmd(new_page, vma->vm_page_prot);
  		entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
8809aa2d2   Aneesh Kumar K.V   mm: clarify that ...
1344
  		pmdp_huge_clear_flush_notify(vma, haddr, pmd);
d281ee614   Kirill A. Shutemov   rmap: add argumen...
1345
  		page_add_new_anon_rmap(new_page, vma, haddr, true);
f627c2f53   Kirill A. Shutemov   memcg: adjust to ...
1346
  		mem_cgroup_commit_charge(new_page, memcg, false, true);
00501b531   Johannes Weiner   mm: memcontrol: r...
1347
  		lru_cache_add_active_or_unevictable(new_page, vma);
71e3aac07   Andrea Arcangeli   thp: transparent ...
1348
  		set_pmd_at(mm, haddr, pmd, entry);
b113da657   David Miller   mm: Add and use u...
1349
  		update_mmu_cache_pmd(vma, address, pmd);
eecc1e426   Hugh Dickins   thp: fix copy_pag...
1350
  		if (!page) {
93b4796de   Kirill A. Shutemov   thp: do_huge_pmd_...
1351
  			add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR);
97ae17497   Kirill A. Shutemov   thp: implement re...
1352
1353
  			put_huge_zero_page();
  		} else {
309381fea   Sasha Levin   mm: dump page whe...
1354
  			VM_BUG_ON_PAGE(!PageHead(page), page);
d281ee614   Kirill A. Shutemov   rmap: add argumen...
1355
  			page_remove_rmap(page, true);
93b4796de   Kirill A. Shutemov   thp: do_huge_pmd_...
1356
1357
  			put_page(page);
  		}
71e3aac07   Andrea Arcangeli   thp: transparent ...
1358
1359
  		ret |= VM_FAULT_WRITE;
  	}
c4088ebdc   Kirill A. Shutemov   mm: convert the r...
1360
  	spin_unlock(ptl);
2ec74c3ef   Sagi Grimberg   mm: move all mmu ...
1361
1362
  out_mn:
  	mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
71e3aac07   Andrea Arcangeli   thp: transparent ...
1363
1364
  out:
  	return ret;
2ec74c3ef   Sagi Grimberg   mm: move all mmu ...
1365
  out_unlock:
c4088ebdc   Kirill A. Shutemov   mm: convert the r...
1366
  	spin_unlock(ptl);
2ec74c3ef   Sagi Grimberg   mm: move all mmu ...
1367
  	return ret;
71e3aac07   Andrea Arcangeli   thp: transparent ...
1368
  }
b676b293f   David Rientjes   mm, thp: fix mapp...
1369
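  /*
   * Return the page that backs @addr within a pmd-mapped THP, honouring
   * FOLL_WRITE/FOLL_DUMP/FOLL_NUMA and handling FOLL_TOUCH, FOLL_MLOCK
   * and FOLL_GET along the way. The caller must hold the pmd lock, as
   * asserted below.
   */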
  struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
71e3aac07   Andrea Arcangeli   thp: transparent ...
1370
1371
1372
1373
  				   unsigned long addr,
  				   pmd_t *pmd,
  				   unsigned int flags)
  {
b676b293f   David Rientjes   mm, thp: fix mapp...
1374
  	struct mm_struct *mm = vma->vm_mm;
71e3aac07   Andrea Arcangeli   thp: transparent ...
1375
  	struct page *page = NULL;
c4088ebdc   Kirill A. Shutemov   mm: convert the r...
1376
  	assert_spin_locked(pmd_lockptr(mm, pmd));
71e3aac07   Andrea Arcangeli   thp: transparent ...
1377
1378
1379
  
  	if (flags & FOLL_WRITE && !pmd_write(*pmd))
  		goto out;
85facf257   Kirill A. Shutemov   thp: avoid dumpin...
1380
1381
1382
  	/* Avoid dumping huge zero page */
  	if ((flags & FOLL_DUMP) && is_huge_zero_pmd(*pmd))
  		return ERR_PTR(-EFAULT);
2b4847e73   Mel Gorman   mm: numa: seriali...
1383
  	/* Full NUMA hinting faults to serialise migration in fault paths */
8a0516ed8   Mel Gorman   mm: convert p[te|...
1384
  	if ((flags & FOLL_NUMA) && pmd_protnone(*pmd))
2b4847e73   Mel Gorman   mm: numa: seriali...
1385
  		goto out;
71e3aac07   Andrea Arcangeli   thp: transparent ...
1386
  	page = pmd_page(*pmd);
309381fea   Sasha Levin   mm: dump page whe...
1387
  	VM_BUG_ON_PAGE(!PageHead(page), page);
3565fce3a   Dan Williams   mm, x86: get_user...
1388
1389
  	if (flags & FOLL_TOUCH)
  		touch_pmd(vma, addr, pmd);
de60f5f10   Eric B Munson   mm: introduce VM_...
1390
  	if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) {
e90309c9f   Kirill A. Shutemov   thp: allow mlocke...
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
  		/*
  		 * We don't mlock() pte-mapped THPs. This way we can avoid
  		 * leaking mlocked pages into non-VM_LOCKED VMAs.
  		 *
  		 * In most cases the pmd is the only mapping of the page as we
  		 * break COW for the mlock() -- see gup_flags |= FOLL_WRITE for
  		 * writable private mappings in populate_vma_page_range().
  		 *
  		 * The only scenario when we have the page shared here is if we are
  		 * mlocking a read-only mapping shared over fork(). We skip
  		 * mlocking such pages.
  		 */
  		if (compound_mapcount(page) == 1 && !PageDoubleMap(page) &&
  				page->mapping && trylock_page(page)) {
b676b293f   David Rientjes   mm, thp: fix mapp...
1405
1406
1407
1408
1409
1410
  			lru_add_drain();
  			if (page->mapping)
  				mlock_vma_page(page);
  			unlock_page(page);
  		}
  	}
71e3aac07   Andrea Arcangeli   thp: transparent ...
1411
  	page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT;
309381fea   Sasha Levin   mm: dump page whe...
1412
  	VM_BUG_ON_PAGE(!PageCompound(page), page);
71e3aac07   Andrea Arcangeli   thp: transparent ...
1413
  	if (flags & FOLL_GET)
ddc58f27f   Kirill A. Shutemov   mm: drop tail pag...
1414
  		get_page(page);
71e3aac07   Andrea Arcangeli   thp: transparent ...
1415
1416
1417
1418
  
  out:
  	return page;
  }
d10e63f29   Mel Gorman   mm: numa: Create ...
1419
  /* NUMA hinting page fault entry point for trans huge pmds */
4daae3b4b   Mel Gorman   mm: mempolicy: Us...
1420
1421
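  /*
   * The fault either restores normal protections on the pmd in place
   * (clear_pmdnuma) or, if mpol_misplaced() says the page belongs on
   * another node, hands the THP to migrate_misplaced_transhuge_page()
   * under the page lock, reporting the outcome via task_numa_fault().
   */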
  int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
  				unsigned long addr, pmd_t pmd, pmd_t *pmdp)
d10e63f29   Mel Gorman   mm: numa: Create ...
1422
  {
c4088ebdc   Kirill A. Shutemov   mm: convert the r...
1423
  	spinlock_t *ptl;
b8916634b   Mel Gorman   mm: Prevent paral...
1424
  	struct anon_vma *anon_vma = NULL;
b32967ff1   Mel Gorman   mm: numa: Add THP...
1425
  	struct page *page;
d10e63f29   Mel Gorman   mm: numa: Create ...
1426
  	unsigned long haddr = addr & HPAGE_PMD_MASK;
8191acbd3   Mel Gorman   mm: numa: Sanitiz...
1427
  	int page_nid = -1, this_nid = numa_node_id();
90572890d   Peter Zijlstra   mm: numa: Change ...
1428
  	int target_nid, last_cpupid = -1;
8191acbd3   Mel Gorman   mm: numa: Sanitiz...
1429
1430
  	bool page_locked;
  	bool migrated = false;
b191f9b10   Mel Gorman   mm: numa: preserv...
1431
  	bool was_writable;
6688cc054   Peter Zijlstra   mm: numa: Do not ...
1432
  	int flags = 0;
d10e63f29   Mel Gorman   mm: numa: Create ...
1433

c0e7cad9f   Mel Gorman   mm: numa: add par...
1434
1435
  	/* A PROT_NONE fault should not end up here */
  	BUG_ON(!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)));
c4088ebdc   Kirill A. Shutemov   mm: convert the r...
1436
  	ptl = pmd_lock(mm, pmdp);
d10e63f29   Mel Gorman   mm: numa: Create ...
1437
1438
  	if (unlikely(!pmd_same(pmd, *pmdp)))
  		goto out_unlock;
de466bd62   Mel Gorman   mm: numa: avoid u...
1439
1440
1441
1442
1443
1444
  	/*
  	 * If there are potential migrations, wait for completion and retry
  	 * without disrupting NUMA hinting information. Do not relock and
  	 * check_same as the page may no longer be mapped.
  	 */
  	if (unlikely(pmd_trans_migrating(*pmdp))) {
5d8330621   Mel Gorman   mm: numa: do not ...
1445
  		page = pmd_page(*pmdp);
de466bd62   Mel Gorman   mm: numa: avoid u...
1446
  		spin_unlock(ptl);
5d8330621   Mel Gorman   mm: numa: do not ...
1447
  		wait_on_page_locked(page);
de466bd62   Mel Gorman   mm: numa: avoid u...
1448
1449
  		goto out;
  	}
d10e63f29   Mel Gorman   mm: numa: Create ...
1450
  	page = pmd_page(pmd);
a1a46184e   Mel Gorman   mm: numa: Do not ...
1451
  	BUG_ON(is_huge_zero_page(page));
8191acbd3   Mel Gorman   mm: numa: Sanitiz...
1452
  	page_nid = page_to_nid(page);
90572890d   Peter Zijlstra   mm: numa: Change ...
1453
  	last_cpupid = page_cpupid_last(page);
03c5a6e16   Mel Gorman   mm: numa: Add pte...
1454
  	count_vm_numa_event(NUMA_HINT_FAULTS);
04bb2f947   Rik van Riel   sched/numa: Adjus...
1455
  	if (page_nid == this_nid) {
03c5a6e16   Mel Gorman   mm: numa: Add pte...
1456
  		count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL);
04bb2f947   Rik van Riel   sched/numa: Adjus...
1457
1458
  		flags |= TNF_FAULT_LOCAL;
  	}
4daae3b4b   Mel Gorman   mm: mempolicy: Us...
1459

bea66fbd1   Mel Gorman   mm: numa: group r...
1460
1461
  	/* See similar comment in do_numa_page for explanation */
  	if (!(vma->vm_flags & VM_WRITE))
6688cc054   Peter Zijlstra   mm: numa: Do not ...
1462
1463
1464
  		flags |= TNF_NO_GROUP;
  
  	/*
ff9042b11   Mel Gorman   mm: Wait for THP ...
1465
1466
1467
  	 * Acquire the page lock to serialise THP migrations but avoid dropping
  	 * page_table_lock if at all possible
  	 */
b8916634b   Mel Gorman   mm: Prevent paral...
1468
1469
1470
1471
  	page_locked = trylock_page(page);
  	target_nid = mpol_misplaced(page, vma, haddr);
  	if (target_nid == -1) {
  		/* If the page was locked, there are no parallel migrations */
a54a407fb   Mel Gorman   mm: Close races b...
1472
  		if (page_locked)
b8916634b   Mel Gorman   mm: Prevent paral...
1473
  			goto clear_pmdnuma;
2b4847e73   Mel Gorman   mm: numa: seriali...
1474
  	}
4daae3b4b   Mel Gorman   mm: mempolicy: Us...
1475

de466bd62   Mel Gorman   mm: numa: avoid u...
1476
  	/* Migration could have started since the pmd_trans_migrating check */
2b4847e73   Mel Gorman   mm: numa: seriali...
1477
  	if (!page_locked) {
c4088ebdc   Kirill A. Shutemov   mm: convert the r...
1478
  		spin_unlock(ptl);
b8916634b   Mel Gorman   mm: Prevent paral...
1479
  		wait_on_page_locked(page);
a54a407fb   Mel Gorman   mm: Close races b...
1480
  		page_nid = -1;
b8916634b   Mel Gorman   mm: Prevent paral...
1481
1482
  		goto out;
  	}
2b4847e73   Mel Gorman   mm: numa: seriali...
1483
1484
1485
1486
  	/*
  	 * Page is misplaced. Page lock serialises migrations. Acquire anon_vma
  	 * to serialise splits
  	 */
b8916634b   Mel Gorman   mm: Prevent paral...
1487
  	get_page(page);
c4088ebdc   Kirill A. Shutemov   mm: convert the r...
1488
  	spin_unlock(ptl);
b8916634b   Mel Gorman   mm: Prevent paral...
1489
  	anon_vma = page_lock_anon_vma_read(page);
4daae3b4b   Mel Gorman   mm: mempolicy: Us...
1490

c69307d53   Peter Zijlstra   sched/numa: Fix c...
1491
  	/* Confirm the PMD did not change while page_table_lock was released */
c4088ebdc   Kirill A. Shutemov   mm: convert the r...
1492
  	spin_lock(ptl);
b32967ff1   Mel Gorman   mm: numa: Add THP...
1493
1494
1495
  	if (unlikely(!pmd_same(pmd, *pmdp))) {
  		unlock_page(page);
  		put_page(page);
a54a407fb   Mel Gorman   mm: Close races b...
1496
  		page_nid = -1;
4daae3b4b   Mel Gorman   mm: mempolicy: Us...
1497
  		goto out_unlock;
b32967ff1   Mel Gorman   mm: numa: Add THP...
1498
  	}
ff9042b11   Mel Gorman   mm: Wait for THP ...
1499

c3a489cac   Mel Gorman   mm: numa: ensure ...
1500
1501
1502
1503
1504
1505
  	/* Bail if we fail to protect against THP splits for any reason */
  	if (unlikely(!anon_vma)) {
  		put_page(page);
  		page_nid = -1;
  		goto clear_pmdnuma;
  	}
a54a407fb   Mel Gorman   mm: Close races b...
1506
1507
  	/*
  	 * Migrate the THP to the requested node, returns with page unlocked
8a0516ed8   Mel Gorman   mm: convert p[te|...
1508
  	 * and access rights restored.
a54a407fb   Mel Gorman   mm: Close races b...
1509
  	 */
c4088ebdc   Kirill A. Shutemov   mm: convert the r...
1510
  	spin_unlock(ptl);
b32967ff1   Mel Gorman   mm: numa: Add THP...
1511
  	migrated = migrate_misplaced_transhuge_page(mm, vma,
340ef3902   Hugh Dickins   mm: numa: cleanup...
1512
  				pmdp, pmd, addr, page, target_nid);
6688cc054   Peter Zijlstra   mm: numa: Do not ...
1513
1514
  	if (migrated) {
  		flags |= TNF_MIGRATED;
8191acbd3   Mel Gorman   mm: numa: Sanitiz...
1515
  		page_nid = target_nid;
074c23817   Mel Gorman   mm: numa: slow PT...
1516
1517
  	} else
  		flags |= TNF_MIGRATE_FAIL;
b32967ff1   Mel Gorman   mm: numa: Add THP...
1518

8191acbd3   Mel Gorman   mm: numa: Sanitiz...
1519
  	goto out;
b32967ff1   Mel Gorman   mm: numa: Add THP...
1520
  clear_pmdnuma:
a54a407fb   Mel Gorman   mm: Close races b...
1521
  	BUG_ON(!PageLocked(page));
b191f9b10   Mel Gorman   mm: numa: preserv...
1522
  	was_writable = pmd_write(pmd);
4d9424669   Mel Gorman   mm: convert p[te|...
1523
  	pmd = pmd_modify(pmd, vma->vm_page_prot);
b7b04004e   Mel Gorman   mm: numa: mark hu...
1524
  	pmd = pmd_mkyoung(pmd);
b191f9b10   Mel Gorman   mm: numa: preserv...
1525
1526
  	if (was_writable)
  		pmd = pmd_mkwrite(pmd);
d10e63f29   Mel Gorman   mm: numa: Create ...
1527
  	set_pmd_at(mm, haddr, pmdp, pmd);
d10e63f29   Mel Gorman   mm: numa: Create ...
1528
  	update_mmu_cache_pmd(vma, addr, pmdp);
a54a407fb   Mel Gorman   mm: Close races b...
1529
  	unlock_page(page);
d10e63f29   Mel Gorman   mm: numa: Create ...
1530
  out_unlock:
c4088ebdc   Kirill A. Shutemov   mm: convert the r...
1531
  	spin_unlock(ptl);
b8916634b   Mel Gorman   mm: Prevent paral...
1532
1533
1534
1535
  
  out:
  	if (anon_vma)
  		page_unlock_anon_vma_read(anon_vma);
8191acbd3   Mel Gorman   mm: numa: Sanitiz...
1536
  	if (page_nid != -1)
6688cc054   Peter Zijlstra   mm: numa: Do not ...
1537
  		task_numa_fault(last_cpupid, page_nid, HPAGE_PMD_NR, flags);
8191acbd3   Mel Gorman   mm: numa: Sanitiz...
1538

d10e63f29   Mel Gorman   mm: numa: Create ...
1539
1540
  	return 0;
  }
b8d3c4c30   Minchan Kim   mm/huge_memory.c:...
1541
1542
1543
1544
1545
1546
1547
1548
1549
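  /*
   * MADV_FREE on a pmd-mapped THP: split the huge page when only part of
   * it is covered by the range, otherwise clear its dirty state and the
   * young/dirty bits in the pmd so reclaim can discard it lazily.
   * Returns nonzero when the huge pmd has been dealt with here.
   */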
  int madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
  		pmd_t *pmd, unsigned long addr, unsigned long next)
  {
  	spinlock_t *ptl;
  	pmd_t orig_pmd;
  	struct page *page;
  	struct mm_struct *mm = tlb->mm;
  	int ret = 0;
b6ec57f4b   Kirill A. Shutemov   thp: change pmd_t...
1550
1551
  	ptl = pmd_trans_huge_lock(pmd, vma);
  	if (!ptl)
25eedabe0   Linus Torvalds   vm: fix incorrect...
1552
  		goto out_unlocked;
b8d3c4c30   Minchan Kim   mm/huge_memory.c:...
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
  
  	orig_pmd = *pmd;
  	if (is_huge_zero_pmd(orig_pmd)) {
  		ret = 1;
  		goto out;
  	}
  
  	page = pmd_page(orig_pmd);
  	/*
  	 * If other processes are mapping this page, we can't discard
  	 * the page unless they all do MADV_FREE so let's skip the page.
  	 */
  	if (page_mapcount(page) != 1)
  		goto out;
  
  	if (!trylock_page(page))
  		goto out;
  
  	/*
  	 * If the user wants to discard part of the pages of a THP, split it so
  	 * MADV_FREE will deactivate only them.
  	 */
  	if (next - addr != HPAGE_PMD_SIZE) {
  		get_page(page);
  		spin_unlock(ptl);
  		if (split_huge_page(page)) {
  			put_page(page);
  			unlock_page(page);
  			goto out_unlocked;
  		}
  		put_page(page);
  		unlock_page(page);
  		ret = 1;
  		goto out_unlocked;
  	}
  
  	if (PageDirty(page))
  		ClearPageDirty(page);
  	unlock_page(page);
  
  	if (PageActive(page))
  		deactivate_page(page);
  
  	if (pmd_young(orig_pmd) || pmd_dirty(orig_pmd)) {
  		orig_pmd = pmdp_huge_get_and_clear_full(tlb->mm, addr, pmd,
  			tlb->fullmm);
  		orig_pmd = pmd_mkold(orig_pmd);
  		orig_pmd = pmd_mkclean(orig_pmd);
  
  		set_pmd_at(mm, addr, pmd, orig_pmd);
  		tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
  	}
  	ret = 1;
  out:
  	spin_unlock(ptl);
  out_unlocked:
  	return ret;
  }
71e3aac07   Andrea Arcangeli   thp: transparent ...
1611
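  /*
   * Tear down one huge pmd during unmap: clear the pmd, then release the
   * deposited page table and adjust the counters for the anonymous and
   * huge-zero cases (DAX mappings have neither). Returns 1 if a huge pmd
   * was zapped, 0 if the pmd was not huge.
   */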
  int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
f21760b15   Shaohua Li   thp: add tlb_remo...
1612
  		 pmd_t *pmd, unsigned long addr)
71e3aac07   Andrea Arcangeli   thp: transparent ...
1613
  {
da1467690   Kirill A. Shutemov   thp: fix zap_huge...
1614
  	pmd_t orig_pmd;
bf929152e   Kirill A. Shutemov   mm, thp: change p...
1615
  	spinlock_t *ptl;
71e3aac07   Andrea Arcangeli   thp: transparent ...
1616

b6ec57f4b   Kirill A. Shutemov   thp: change pmd_t...
1617
1618
  	ptl = __pmd_trans_huge_lock(pmd, vma);
  	if (!ptl)
da1467690   Kirill A. Shutemov   thp: fix zap_huge...
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
  		return 0;
  	/*
  	 * For architectures like ppc64 we look at deposited pgtable
  	 * when calling pmdp_huge_get_and_clear. So do the
  	 * pgtable_trans_huge_withdraw after finishing pmdp related
  	 * operations.
  	 */
  	orig_pmd = pmdp_huge_get_and_clear_full(tlb->mm, addr, pmd,
  			tlb->fullmm);
  	tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
  	if (vma_is_dax(vma)) {
  		spin_unlock(ptl);
  		if (is_huge_zero_pmd(orig_pmd))
aa88b68c3   Kirill A. Shutemov   thp: keep huge ze...
1632
  			tlb_remove_page(tlb, pmd_page(orig_pmd));
da1467690   Kirill A. Shutemov   thp: fix zap_huge...
1633
1634
1635
1636
  	} else if (is_huge_zero_pmd(orig_pmd)) {
  		pte_free(tlb->mm, pgtable_trans_huge_withdraw(tlb->mm, pmd));
  		atomic_long_dec(&tlb->mm->nr_ptes);
  		spin_unlock(ptl);
aa88b68c3   Kirill A. Shutemov   thp: keep huge ze...
1637
  		tlb_remove_page(tlb, pmd_page(orig_pmd));
da1467690   Kirill A. Shutemov   thp: fix zap_huge...
1638
1639
  	} else {
  		struct page *page = pmd_page(orig_pmd);
d281ee614   Kirill A. Shutemov   rmap: add argumen...
1640
  		page_remove_rmap(page, true);
da1467690   Kirill A. Shutemov   thp: fix zap_huge...
1641
1642
1643
1644
1645
1646
1647
  		VM_BUG_ON_PAGE(page_mapcount(page) < 0, page);
  		add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR);
  		VM_BUG_ON_PAGE(!PageHead(page), page);
  		pte_free(tlb->mm, pgtable_trans_huge_withdraw(tlb->mm, pmd));
  		atomic_long_dec(&tlb->mm->nr_ptes);
  		spin_unlock(ptl);
  		tlb_remove_page(tlb, page);
025c5b245   Naoya Horiguchi   thp: optimize awa...
1648
  	}
da1467690   Kirill A. Shutemov   thp: fix zap_huge...
1649
  	return 1;
71e3aac07   Andrea Arcangeli   thp: transparent ...
1650
  }
bf8616d5f   Hugh Dickins   huge mm: move_hug...
1651
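  /*
   * mremap() support: move a huge pmd from old_addr to new_addr as a
   * single entry, re-depositing the preallocated page table under the new
   * pmd when the architecture requires it. Returns true on success, false
   * if the caller must fall back to moving individual ptes.
   */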
  bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr,
37a1c49a9   Andrea Arcangeli   thp: mremap suppo...
1652
1653
1654
  		  unsigned long new_addr, unsigned long old_end,
  		  pmd_t *old_pmd, pmd_t *new_pmd)
  {
bf929152e   Kirill A. Shutemov   mm, thp: change p...
1655
  	spinlock_t *old_ptl, *new_ptl;
37a1c49a9   Andrea Arcangeli   thp: mremap suppo...
1656
  	pmd_t pmd;
37a1c49a9   Andrea Arcangeli   thp: mremap suppo...
1657
1658
1659
1660
  	struct mm_struct *mm = vma->vm_mm;
  
  	if ((old_addr & ~HPAGE_PMD_MASK) ||
  	    (new_addr & ~HPAGE_PMD_MASK) ||
bf8616d5f   Hugh Dickins   huge mm: move_hug...
1661
  	    old_end - old_addr < HPAGE_PMD_SIZE)
4b471e889   Kirill A. Shutemov   mm, thp: remove i...
1662
  		return false;
37a1c49a9   Andrea Arcangeli   thp: mremap suppo...
1663
1664
1665
1666
1667
1668
1669
  
  	/*
  	 * The destination pmd shouldn't be established, free_pgtables()
  	 * should have released it.
  	 */
  	if (WARN_ON(!pmd_none(*new_pmd))) {
  		VM_BUG_ON(pmd_trans_huge(*new_pmd));
4b471e889   Kirill A. Shutemov   mm, thp: remove i...
1670
  		return false;
37a1c49a9   Andrea Arcangeli   thp: mremap suppo...
1671
  	}
bf929152e   Kirill A. Shutemov   mm, thp: change p...
1672
1673
1674
1675
  	/*
  	 * We don't have to worry about the ordering of src and dst
  	 * ptlocks because exclusive mmap_sem prevents deadlock.
  	 */
b6ec57f4b   Kirill A. Shutemov   thp: change pmd_t...
1676
1677
  	old_ptl = __pmd_trans_huge_lock(old_pmd, vma);
  	if (old_ptl) {
bf929152e   Kirill A. Shutemov   mm, thp: change p...
1678
1679
1680
  		new_ptl = pmd_lockptr(mm, new_pmd);
  		if (new_ptl != old_ptl)
  			spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
8809aa2d2   Aneesh Kumar K.V   mm: clarify that ...
1681
  		pmd = pmdp_huge_get_and_clear(mm, old_addr, old_pmd);
025c5b245   Naoya Horiguchi   thp: optimize awa...
1682
  		VM_BUG_ON(!pmd_none(*new_pmd));
3592806cf   Kirill A. Shutemov   thp: move preallo...
1683

69a8ec2d8   Kirill A. Shutemov   thp, dax: do not ...
1684
1685
  		if (pmd_move_must_withdraw(new_ptl, old_ptl) &&
  				vma_is_anonymous(vma)) {
b3084f4db   Aneesh Kumar K.V   powerpc/thp: Fix ...
1686
  			pgtable_t pgtable;
3592806cf   Kirill A. Shutemov   thp: move preallo...
1687
1688
  			pgtable = pgtable_trans_huge_withdraw(mm, old_pmd);
  			pgtable_trans_huge_deposit(mm, new_pmd, pgtable);
3592806cf   Kirill A. Shutemov   thp: move preallo...
1689
  		}
b3084f4db   Aneesh Kumar K.V   powerpc/thp: Fix ...
1690
1691
1692
  		set_pmd_at(mm, new_addr, new_pmd, pmd_mksoft_dirty(pmd));
  		if (new_ptl != old_ptl)
  			spin_unlock(new_ptl);
bf929152e   Kirill A. Shutemov   mm, thp: change p...
1693
  		spin_unlock(old_ptl);
4b471e889   Kirill A. Shutemov   mm, thp: remove i...
1694
  		return true;
37a1c49a9   Andrea Arcangeli   thp: mremap suppo...
1695
  	}
4b471e889   Kirill A. Shutemov   mm, thp: remove i...
1696
  	return false;
37a1c49a9   Andrea Arcangeli   thp: mremap suppo...
1697
  }
f123d74ab   Mel Gorman   mm: Only flush TL...
1698
1699
1700
1701
1702
1703
  /*
   * Returns
   *  - 0 if PMD could not be locked
   *  - 1 if PMD was locked but protections unchanged and TLB flush unnecessary
   *  - HPAGE_PMD_NR if protections changed and TLB flush necessary
   */
cd7548ab3   Johannes Weiner   thp: mprotect: tr...
1704
  int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
e944fd67b   Mel Gorman   mm: numa: do not ...
1705
  		unsigned long addr, pgprot_t newprot, int prot_numa)
cd7548ab3   Johannes Weiner   thp: mprotect: tr...
1706
1707
  {
  	struct mm_struct *mm = vma->vm_mm;
bf929152e   Kirill A. Shutemov   mm, thp: change p...
1708
  	spinlock_t *ptl;
cd7548ab3   Johannes Weiner   thp: mprotect: tr...
1709
  	int ret = 0;
b6ec57f4b   Kirill A. Shutemov   thp: change pmd_t...
1710
1711
  	ptl = __pmd_trans_huge_lock(pmd, vma);
  	if (ptl) {
025c5b245   Naoya Horiguchi   thp: optimize awa...
1712
  		pmd_t entry;
b191f9b10   Mel Gorman   mm: numa: preserv...
1713
  		bool preserve_write = prot_numa && pmd_write(*pmd);
ba68bc011   Mel Gorman   mm: thp: Return t...
1714
  		ret = 1;
e944fd67b   Mel Gorman   mm: numa: do not ...
1715
1716
1717
1718
1719
1720
1721
1722
  
  		/*
  		 * Avoid trapping faults against the zero page. The read-only
  		 * data is likely to be read-cached on the local CPU and
  		 * local/remote hits to the zero page are not interesting.
  		 */
  		if (prot_numa && is_huge_zero_pmd(*pmd)) {
  			spin_unlock(ptl);
ba68bc011   Mel Gorman   mm: thp: Return t...
1723
  			return ret;
e944fd67b   Mel Gorman   mm: numa: do not ...
1724
  		}
10c1045f2   Mel Gorman   mm: numa: avoid u...
1725
  		if (!prot_numa || !pmd_protnone(*pmd)) {
8809aa2d2   Aneesh Kumar K.V   mm: clarify that ...
1726
  			entry = pmdp_huge_get_and_clear_notify(mm, addr, pmd);
10c1045f2   Mel Gorman   mm: numa: avoid u...
1727
  			entry = pmd_modify(entry, newprot);
b191f9b10   Mel Gorman   mm: numa: preserv...
1728
1729
  			if (preserve_write)
  				entry = pmd_mkwrite(entry);
10c1045f2   Mel Gorman   mm: numa: avoid u...
1730
1731
  			ret = HPAGE_PMD_NR;
  			set_pmd_at(mm, addr, pmd, entry);
b191f9b10   Mel Gorman   mm: numa: preserv...
1732
  			BUG_ON(!preserve_write && pmd_write(entry));
10c1045f2   Mel Gorman   mm: numa: avoid u...
1733
  		}
bf929152e   Kirill A. Shutemov   mm, thp: change p...
1734
  		spin_unlock(ptl);
025c5b245   Naoya Horiguchi   thp: optimize awa...
1735
1736
1737
1738
1739
1740
  	}
  
  	return ret;
  }
  
  /*
4b471e889   Kirill A. Shutemov   mm, thp: remove i...
1741
   * Returns the page table lock pointer if a given pmd maps a thp, NULL otherwise.
025c5b245   Naoya Horiguchi   thp: optimize awa...
1742
   *
4b471e889   Kirill A. Shutemov   mm, thp: remove i...
1743
1744
   * Note that if it returns the lock pointer, this routine returns without
   * unlocking the page table lock. So callers must unlock it.
025c5b245   Naoya Horiguchi   thp: optimize awa...
1745
   */
b6ec57f4b   Kirill A. Shutemov   thp: change pmd_t...
1746
  spinlock_t *__pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma)
025c5b245   Naoya Horiguchi   thp: optimize awa...
1747
  {
b6ec57f4b   Kirill A. Shutemov   thp: change pmd_t...
1748
1749
  	spinlock_t *ptl;
  	ptl = pmd_lock(vma->vm_mm, pmd);
5c7fb56e5   Dan Williams   mm, dax: dax-pmd ...
1750
  	if (likely(pmd_trans_huge(*pmd) || pmd_devmap(*pmd)))
b6ec57f4b   Kirill A. Shutemov   thp: change pmd_t...
1751
1752
1753
  		return ptl;
  	spin_unlock(ptl);
  	return NULL;
cd7548ab3   Johannes Weiner   thp: mprotect: tr...
1754
  }
9050d7eba   Vlastimil Babka   mm: include VM_MI...
1755
  #define VM_NO_THP (VM_SPECIAL | VM_HUGETLB | VM_SHARED | VM_MAYSHARE)
78f11a255   Andrea Arcangeli   mm: thp: fix /dev...
1756

60ab3244e   Andrea Arcangeli   thp: khugepaged: ...
1757
1758
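  /*
   * Back end for madvise(addr, len, MADV_HUGEPAGE) and MADV_NOHUGEPAGE:
   * flip VM_HUGEPAGE/VM_NOHUGEPAGE in *vm_flags and, for MADV_HUGEPAGE,
   * register the vma with khugepaged right away instead of waiting for a
   * page fault.
   */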
  int hugepage_madvise(struct vm_area_struct *vma,
  		     unsigned long *vm_flags, int advice)
0af4e98b6   Andrea Arcangeli   thp: madvise(MADV...
1759
  {
a664b2d85   Andrea Arcangeli   thp: madvise(MADV...
1760
1761
  	switch (advice) {
  	case MADV_HUGEPAGE:
1e1836e84   Alex Thorlton   mm: revert "thp: ...
1762
1763
1764
1765
1766
1767
1768
1769
1770
  #ifdef CONFIG_S390
  		/*
  		 * qemu blindly sets MADV_HUGEPAGE on all allocations, but s390
  		 * can't handle this properly after s390_enable_sie, so we simply
  		 * ignore the madvise to prevent qemu from causing a SIGSEGV.
  		 */
  		if (mm_has_pgste(vma->vm_mm))
  			return 0;
  #endif
a664b2d85   Andrea Arcangeli   thp: madvise(MADV...
1771
1772
1773
  		/*
  		 * Be somewhat over-protective like KSM for now!
  		 */
1a7636156   Jason J. Herne   mm: loosen MADV_N...
1774
  		if (*vm_flags & VM_NO_THP)
a664b2d85   Andrea Arcangeli   thp: madvise(MADV...
1775
1776
1777
  			return -EINVAL;
  		*vm_flags &= ~VM_NOHUGEPAGE;
  		*vm_flags |= VM_HUGEPAGE;
60ab3244e   Andrea Arcangeli   thp: khugepaged: ...
1778
1779
1780
1781
1782
  		/*
  		 * If the vma becomes good for khugepaged to scan,
  		 * register it here without waiting for a page fault that
  		 * may not happen any time soon.
  		 */
6d50e60cd   David Rientjes   mm, thp: fix coll...
1783
  		if (unlikely(khugepaged_enter_vma_merge(vma, *vm_flags)))
60ab3244e   Andrea Arcangeli   thp: khugepaged: ...
1784
  			return -ENOMEM;
a664b2d85   Andrea Arcangeli   thp: madvise(MADV...
1785
1786
1787
1788
1789
  		break;
  	case MADV_NOHUGEPAGE:
  		/*
  		 * Be somewhat over-protective like KSM for now!
  		 */
1a7636156   Jason J. Herne   mm: loosen MADV_N...
1790
  		if (*vm_flags & VM_NO_THP)
a664b2d85   Andrea Arcangeli   thp: madvise(MADV...
1791
1792
1793
  			return -EINVAL;
  		*vm_flags &= ~VM_HUGEPAGE;
  		*vm_flags |= VM_NOHUGEPAGE;
60ab3244e   Andrea Arcangeli   thp: khugepaged: ...
1794
1795
1796
1797
1798
  		/*
  		 * Setting VM_NOHUGEPAGE will prevent khugepaged from scanning
  		 * this vma even if we leave the mm registered in khugepaged if
  		 * it got registered before VM_NOHUGEPAGE was set.
  		 */
a664b2d85   Andrea Arcangeli   thp: madvise(MADV...
1799
1800
  		break;
  	}
0af4e98b6   Andrea Arcangeli   thp: madvise(MADV...
1801
1802
1803
  
  	return 0;
  }
ba76149f4   Andrea Arcangeli   thp: khugepaged
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
  static int __init khugepaged_slab_init(void)
  {
  	mm_slot_cache = kmem_cache_create("khugepaged_mm_slot",
  					  sizeof(struct mm_slot),
  					  __alignof__(struct mm_slot), 0, NULL);
  	if (!mm_slot_cache)
  		return -ENOMEM;
  
  	return 0;
  }
65ebb64f4   Kirill A. Shutemov   thp: handle error...
1814
1815
1816
1817
  static void __init khugepaged_slab_exit(void)
  {
  	kmem_cache_destroy(mm_slot_cache);
  }
ba76149f4   Andrea Arcangeli   thp: khugepaged
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
  static inline struct mm_slot *alloc_mm_slot(void)
  {
  	if (!mm_slot_cache)	/* initialization failed */
  		return NULL;
  	return kmem_cache_zalloc(mm_slot_cache, GFP_KERNEL);
  }
  
  static inline void free_mm_slot(struct mm_slot *mm_slot)
  {
  	kmem_cache_free(mm_slot_cache, mm_slot);
  }
ba76149f4   Andrea Arcangeli   thp: khugepaged
1829
1830
1831
  static struct mm_slot *get_mm_slot(struct mm_struct *mm)
  {
  	struct mm_slot *mm_slot;
ba76149f4   Andrea Arcangeli   thp: khugepaged
1832

b67bfe0d4   Sasha Levin   hlist: drop the n...
1833
  	hash_for_each_possible(mm_slots_hash, mm_slot, hash, (unsigned long)mm)
ba76149f4   Andrea Arcangeli   thp: khugepaged
1834
1835
  		if (mm == mm_slot->mm)
  			return mm_slot;
43b5fbbd2   Sasha Levin   mm/huge_memory.c:...
1836

ba76149f4   Andrea Arcangeli   thp: khugepaged
1837
1838
1839
1840
1841
1842
  	return NULL;
  }
  
  static void insert_to_mm_slots_hash(struct mm_struct *mm,
  				    struct mm_slot *mm_slot)
  {
ba76149f4   Andrea Arcangeli   thp: khugepaged
1843
  	mm_slot->mm = mm;
43b5fbbd2   Sasha Levin   mm/huge_memory.c:...
1844
  	hash_add(mm_slots_hash, &mm_slot->hash, (long)mm);
ba76149f4   Andrea Arcangeli   thp: khugepaged
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
  }
  
  static inline int khugepaged_test_exit(struct mm_struct *mm)
  {
  	return atomic_read(&mm->mm_users) == 0;
  }
  
  int __khugepaged_enter(struct mm_struct *mm)
  {
  	struct mm_slot *mm_slot;
  	int wakeup;
  
  	mm_slot = alloc_mm_slot();
  	if (!mm_slot)
  		return -ENOMEM;
  
  	/* __khugepaged_exit() must not run from under us */
96dad67ff   Sasha Levin   mm: use VM_BUG_ON...
1862
  	VM_BUG_ON_MM(khugepaged_test_exit(mm), mm);
ba76149f4   Andrea Arcangeli   thp: khugepaged
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
  	if (unlikely(test_and_set_bit(MMF_VM_HUGEPAGE, &mm->flags))) {
  		free_mm_slot(mm_slot);
  		return 0;
  	}
  
  	spin_lock(&khugepaged_mm_lock);
  	insert_to_mm_slots_hash(mm, mm_slot);
  	/*
  	 * Insert just behind the scanning cursor, to let the area settle
  	 * down a little.
  	 */
  	wakeup = list_empty(&khugepaged_scan.mm_head);
  	list_add_tail(&mm_slot->mm_node, &khugepaged_scan.mm_head);
  	spin_unlock(&khugepaged_mm_lock);
  
  	atomic_inc(&mm->mm_count);
  	if (wakeup)
  		wake_up_interruptible(&khugepaged_wait);
  
  	return 0;
  }
6d50e60cd   David Rientjes   mm, thp: fix coll...
1884
1885
  int khugepaged_enter_vma_merge(struct vm_area_struct *vma,
  			       unsigned long vm_flags)
ba76149f4   Andrea Arcangeli   thp: khugepaged
1886
1887
1888
1889
1890
1891
1892
1893
  {
  	unsigned long hstart, hend;
  	if (!vma->anon_vma)
  		/*
  		 * Not yet faulted in so we will register later in the
  		 * page fault if needed.
  		 */
  		return 0;
3486b85a2   Konstantin Khlebnikov   mm/huge_memory: r...
1894
  	if (vma->vm_ops || (vm_flags & VM_NO_THP))
ba76149f4   Andrea Arcangeli   thp: khugepaged
1895
1896
  		/* khugepaged not yet working on file or special mappings */
  		return 0;
ba76149f4   Andrea Arcangeli   thp: khugepaged
1897
1898
1899
  	hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
  	hend = vma->vm_end & HPAGE_PMD_MASK;
  	if (hstart < hend)
6d50e60cd   David Rientjes   mm, thp: fix coll...
1900
  		return khugepaged_enter(vma, vm_flags);
ba76149f4   Andrea Arcangeli   thp: khugepaged
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
  	return 0;
  }
  
  void __khugepaged_exit(struct mm_struct *mm)
  {
  	struct mm_slot *mm_slot;
  	int free = 0;
  
  	spin_lock(&khugepaged_mm_lock);
  	mm_slot = get_mm_slot(mm);
  	if (mm_slot && khugepaged_scan.mm_slot != mm_slot) {
43b5fbbd2   Sasha Levin   mm/huge_memory.c:...
1912
  		hash_del(&mm_slot->hash);
ba76149f4   Andrea Arcangeli   thp: khugepaged
1913
1914
1915
  		list_del(&mm_slot->mm_node);
  		free = 1;
  	}
d788e80a8   Chris Wright   mm/huge_memory.c:...
1916
  	spin_unlock(&khugepaged_mm_lock);
ba76149f4   Andrea Arcangeli   thp: khugepaged
1917
1918
  
  	if (free) {
ba76149f4   Andrea Arcangeli   thp: khugepaged
1919
1920
1921
1922
  		clear_bit(MMF_VM_HUGEPAGE, &mm->flags);
  		free_mm_slot(mm_slot);
  		mmdrop(mm);
  	} else if (mm_slot) {
ba76149f4   Andrea Arcangeli   thp: khugepaged
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
  		/*
  		 * This is required to serialize against
  		 * khugepaged_test_exit() (which is guaranteed to run
  		 * under mmap_sem read mode). Stop here (after we
  		 * return all pagetables will be destroyed) until
  		 * khugepaged has finished working on the pagetables
  		 * under the mmap_sem.
  		 */
  		down_write(&mm->mmap_sem);
  		up_write(&mm->mmap_sem);
d788e80a8   Chris Wright   mm/huge_memory.c:...
1933
  	}
ba76149f4   Andrea Arcangeli   thp: khugepaged
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
  }
  
  static void release_pte_page(struct page *page)
  {
  	/* 0 stands for page_is_file_cache(page) == false */
  	dec_zone_page_state(page, NR_ISOLATED_ANON + 0);
  	unlock_page(page);
  	putback_lru_page(page);
  }
  
  static void release_pte_pages(pte_t *pte, pte_t *_pte)
  {
  	while (--_pte >= pte) {
  		pte_t pteval = *_pte;
ca0984caa   Ebru Akagunduz   mm: incorporate z...
1948
  		if (!pte_none(pteval) && !is_zero_pfn(pte_pfn(pteval)))
ba76149f4   Andrea Arcangeli   thp: khugepaged
1949
1950
1951
  			release_pte_page(pte_page(pteval));
  	}
  }
ba76149f4   Andrea Arcangeli   thp: khugepaged
1952
1953
1954
1955
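  /*
   * Scan the HPAGE_PMD_NR ptes that are about to be collapsed: validate
   * each pte, isolate the mapped pages from the LRU and leave them locked
   * for __collapse_huge_page_copy(). On any failure all pages taken so
   * far are released and 0 is returned.
   */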
  static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
  					unsigned long address,
  					pte_t *pte)
  {
7d2eba055   Ebru Akagunduz   mm: add tracepoin...
1956
  	struct page *page = NULL;
ba76149f4   Andrea Arcangeli   thp: khugepaged
1957
  	pte_t *_pte;
7d2eba055   Ebru Akagunduz   mm: add tracepoin...
1958
  	int none_or_zero = 0, result = 0;
10359213d   Ebru Akagunduz   mm: incorporate r...
1959
  	bool referenced = false, writable = false;
7d2eba055   Ebru Akagunduz   mm: add tracepoin...
1960

ba76149f4   Andrea Arcangeli   thp: khugepaged
1961
1962
1963
  	for (_pte = pte; _pte < pte+HPAGE_PMD_NR;
  	     _pte++, address += PAGE_SIZE) {
  		pte_t pteval = *_pte;
47aee4d8e   Minchan Kim   thp: use is_zero_...
1964
1965
  		if (pte_none(pteval) || (pte_present(pteval) &&
  				is_zero_pfn(pte_pfn(pteval)))) {
c1294d05d   Andrea Arcangeli   userfaultfd: prev...
1966
  			if (!userfaultfd_armed(vma) &&
7d2eba055   Ebru Akagunduz   mm: add tracepoin...
1967
  			    ++none_or_zero <= khugepaged_max_ptes_none) {
ba76149f4   Andrea Arcangeli   thp: khugepaged
1968
  				continue;
7d2eba055   Ebru Akagunduz   mm: add tracepoin...
1969
1970
  			} else {
  				result = SCAN_EXCEED_NONE_PTE;
ba76149f4   Andrea Arcangeli   thp: khugepaged
1971
  				goto out;
7d2eba055   Ebru Akagunduz   mm: add tracepoin...
1972
  			}
ba76149f4   Andrea Arcangeli   thp: khugepaged
1973
  		}
7d2eba055   Ebru Akagunduz   mm: add tracepoin...
1974
1975
  		if (!pte_present(pteval)) {
  			result = SCAN_PTE_NON_PRESENT;
ba76149f4   Andrea Arcangeli   thp: khugepaged
1976
  			goto out;
7d2eba055   Ebru Akagunduz   mm: add tracepoin...
1977
  		}
ba76149f4   Andrea Arcangeli   thp: khugepaged
1978
  		page = vm_normal_page(vma, address, pteval);
7d2eba055   Ebru Akagunduz   mm: add tracepoin...
1979
1980
  		if (unlikely(!page)) {
  			result = SCAN_PAGE_NULL;
ba76149f4   Andrea Arcangeli   thp: khugepaged
1981
  			goto out;
7d2eba055   Ebru Akagunduz   mm: add tracepoin...
1982
  		}
344aa35c2   Bob Liu   thp: clean up __c...
1983

309381fea   Sasha Levin   mm: dump page whe...
1984
1985
1986
  		VM_BUG_ON_PAGE(PageCompound(page), page);
  		VM_BUG_ON_PAGE(!PageAnon(page), page);
  		VM_BUG_ON_PAGE(!PageSwapBacked(page), page);
ba76149f4   Andrea Arcangeli   thp: khugepaged
1987

ba76149f4   Andrea Arcangeli   thp: khugepaged
1988
1989
1990
1991
1992
1993
  		/*
  		 * We can do it before isolate_lru_page because the
  		 * page can't be freed from under us. NOTE: PG_lock
  		 * is needed to serialize against split_huge_page
  		 * when invoked from the VM.
  		 */
7d2eba055   Ebru Akagunduz   mm: add tracepoin...
1994
1995
  		if (!trylock_page(page)) {
  			result = SCAN_PAGE_LOCK;
ba76149f4   Andrea Arcangeli   thp: khugepaged
1996
  			goto out;
7d2eba055   Ebru Akagunduz   mm: add tracepoin...
1997
  		}
10359213d   Ebru Akagunduz   mm: incorporate r...
1998
1999
2000
2001
2002
2003
2004
2005
  
  		/*
  		 * cannot use mapcount: can't collapse if there's a gup pin.
  		 * The page must only be referenced by the scanned process
  		 * and page swap cache.
  		 */
  		if (page_count(page) != 1 + !!PageSwapCache(page)) {
  			unlock_page(page);
7d2eba055   Ebru Akagunduz   mm: add tracepoin...
2006
  			result = SCAN_PAGE_COUNT;
10359213d   Ebru Akagunduz   mm: incorporate r...
2007
2008
2009
2010
2011
  			goto out;
  		}
  		if (pte_write(pteval)) {
  			writable = true;
  		} else {
6d0a07edd   Andrea Arcangeli   mm: thp: calculat...
2012
2013
  			if (PageSwapCache(page) &&
  			    !reuse_swap_page(page, NULL)) {
10359213d   Ebru Akagunduz   mm: incorporate r...
2014
  				unlock_page(page);
7d2eba055   Ebru Akagunduz   mm: add tracepoin...
2015
  				result = SCAN_SWAP_CACHE_PAGE;
10359213d   Ebru Akagunduz   mm: incorporate r...
2016
2017
2018
2019
2020
2021
2022
  				goto out;
  			}
  			/*
  			 * Page is not in the swap cache. It can be collapsed
  			 * into a THP.
  			 */
  		}
ba76149f4   Andrea Arcangeli   thp: khugepaged
2023
2024
2025
2026
2027
2028
  		/*
  		 * Isolate the page to avoid collapsing a hugepage
  		 * currently in use by the VM.
  		 */
  		if (isolate_lru_page(page)) {
  			unlock_page(page);
7d2eba055   Ebru Akagunduz   mm: add tracepoin...
2029
  			result = SCAN_DEL_PAGE_LRU;
ba76149f4   Andrea Arcangeli   thp: khugepaged
2030
2031
2032
2033
  			goto out;
  		}
  		/* 0 stands for page_is_file_cache(page) == false */
  		inc_zone_page_state(page, NR_ISOLATED_ANON + 0);
309381fea   Sasha Levin   mm: dump page whe...
2034
2035
  		VM_BUG_ON_PAGE(!PageLocked(page), page);
  		VM_BUG_ON_PAGE(PageLRU(page), page);
ba76149f4   Andrea Arcangeli   thp: khugepaged
2036
2037
  
  		/* If there is no mapped pte young, don't collapse the page */
33c3fc71c   Vladimir Davydov   mm: introduce idl...
2038
2039
  		if (pte_young(pteval) ||
  		    page_is_young(page) || PageReferenced(page) ||
8ee53820e   Andrea Arcangeli   thp: mmu_notifier...
2040
  		    mmu_notifier_test_young(vma->vm_mm, address))
10359213d   Ebru Akagunduz   mm: incorporate r...
2041
  			referenced = true;
ba76149f4   Andrea Arcangeli   thp: khugepaged
2042
  	}
7d2eba055   Ebru Akagunduz   mm: add tracepoin...
2043
2044
2045
  	if (likely(writable)) {
  		if (likely(referenced)) {
  			result = SCAN_SUCCEED;
16fd0fe4a   yalin wang   mm: fix kernel cr...
2046
  			trace_mm_collapse_huge_page_isolate(page, none_or_zero,
7d2eba055   Ebru Akagunduz   mm: add tracepoin...
2047
2048
2049
2050
2051
2052
  							    referenced, writable, result);
  			return 1;
  		}
  	} else {
  		result = SCAN_PAGE_RO;
  	}
ba76149f4   Andrea Arcangeli   thp: khugepaged
2053
  out:
344aa35c2   Bob Liu   thp: clean up __c...
2054
  	release_pte_pages(pte, _pte);
16fd0fe4a   yalin wang   mm: fix kernel cr...
2055
  	trace_mm_collapse_huge_page_isolate(page, none_or_zero,
7d2eba055   Ebru Akagunduz   mm: add tracepoin...
2056
  					    referenced, writable, result);
344aa35c2   Bob Liu   thp: clean up __c...
2057
  	return 0;
ba76149f4   Andrea Arcangeli   thp: khugepaged
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
  }
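
  /*
   * Copy the contents of the isolated small pages into the new huge page
   * (clearing the destination for none/zero ptes), tearing down the old
   * ptes and rmap as we go and putting the source pages back on the LRU.
   */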
  
  static void __collapse_huge_page_copy(pte_t *pte, struct page *page,
  				      struct vm_area_struct *vma,
  				      unsigned long address,
  				      spinlock_t *ptl)
  {
  	pte_t *_pte;
  	for (_pte = pte; _pte < pte+HPAGE_PMD_NR; _pte++) {
  		pte_t pteval = *_pte;
  		struct page *src_page;
ca0984caa   Ebru Akagunduz   mm: incorporate z...
2069
  		if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) {
ba76149f4   Andrea Arcangeli   thp: khugepaged
2070
2071
  			clear_user_highpage(page, address);
  			add_mm_counter(vma->vm_mm, MM_ANONPAGES, 1);
ca0984caa   Ebru Akagunduz   mm: incorporate z...
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
  			if (is_zero_pfn(pte_pfn(pteval))) {
  				/*
  				 * ptl mostly unnecessary.
  				 */
  				spin_lock(ptl);
  				/*
  				 * paravirt calls inside pte_clear here are
  				 * superfluous.
  				 */
  				pte_clear(vma->vm_mm, address, _pte);
  				spin_unlock(ptl);
  			}
ba76149f4   Andrea Arcangeli   thp: khugepaged
2084
2085
2086
  		} else {
  			src_page = pte_page(pteval);
  			copy_user_highpage(page, src_page, address, vma);
309381fea   Sasha Levin   mm: dump page whe...
2087
  			VM_BUG_ON_PAGE(page_mapcount(src_page) != 1, src_page);
ba76149f4   Andrea Arcangeli   thp: khugepaged
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
  			release_pte_page(src_page);
  			/*
  			 * ptl mostly unnecessary, but preempt has to
  			 * be disabled to update the per-cpu stats
  			 * inside page_remove_rmap().
  			 */
  			spin_lock(ptl);
  			/*
  			 * paravirt calls inside pte_clear here are
  			 * superfluous.
  			 */
  			pte_clear(vma->vm_mm, address, _pte);
d281ee614   Kirill A. Shutemov   rmap: add argumen...
2100
  			page_remove_rmap(src_page, false);
ba76149f4   Andrea Arcangeli   thp: khugepaged
2101
2102
2103
2104
2105
2106
2107
2108
  			spin_unlock(ptl);
  			free_page_and_swap_cache(src_page);
  		}
  
  		address += PAGE_SIZE;
  		page++;
  	}
  }
26234f36e   Xiao Guangrong   thp: introduce kh...
2109
  static void khugepaged_alloc_sleep(void)
ba76149f4   Andrea Arcangeli   thp: khugepaged
2110
  {
bde43c6c9   Petr Mladek   mm/khugepaged: al...
2111
2112
2113
2114
2115
2116
  	DEFINE_WAIT(wait);
  
  	add_wait_queue(&khugepaged_wait, &wait);
  	freezable_schedule_timeout_interruptible(
  		msecs_to_jiffies(khugepaged_alloc_sleep_millisecs));
  	remove_wait_queue(&khugepaged_wait, &wait);
26234f36e   Xiao Guangrong   thp: introduce kh...
2117
  }
ba76149f4   Andrea Arcangeli   thp: khugepaged
2118

9f1b868a1   Bob Liu   mm: thp: khugepag...
2119
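  /*
   * Per-node count of base pages seen while scanning a candidate pmd
   * range; consumed by khugepaged_scan_abort() and
   * khugepaged_find_target_node() below.
   */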
  static int khugepaged_node_load[MAX_NUMNODES];
14a4e2141   David Rientjes   mm, thp: only col...
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
  static bool khugepaged_scan_abort(int nid)
  {
  	int i;
  
  	/*
  	 * If zone_reclaim_mode is disabled, then no extra effort is made to
  	 * allocate memory locally.
  	 */
  	if (!zone_reclaim_mode)
  		return false;
  
  	/* If there is a count for this node already, it must be acceptable */
  	if (khugepaged_node_load[nid])
  		return false;
  
  	for (i = 0; i < MAX_NUMNODES; i++) {
  		if (!khugepaged_node_load[i])
  			continue;
  		if (node_distance(nid, i) > RECLAIM_DISTANCE)
  			return true;
  	}
  	return false;
  }
26234f36e   Xiao Guangrong   thp: introduce kh...
2143
  #ifdef CONFIG_NUMA
9f1b868a1   Bob Liu   mm: thp: khugepag...
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
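  /*
   * Pick the node that contributed the most pages in the last scan,
   * round-robining among nodes with the same count so repeated collapses
   * do not always land on the lowest-numbered node.
   */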
  static int khugepaged_find_target_node(void)
  {
  	static int last_khugepaged_target_node = NUMA_NO_NODE;
  	int nid, target_node = 0, max_value = 0;
  
  	/* find the first node with the highest number of hits */
  	for (nid = 0; nid < MAX_NUMNODES; nid++)
  		if (khugepaged_node_load[nid] > max_value) {
  			max_value = khugepaged_node_load[nid];
  			target_node = nid;
  		}
  
  	/* do some balancing if several nodes have the same hit count */
  	if (target_node <= last_khugepaged_target_node)
  		for (nid = last_khugepaged_target_node + 1; nid < MAX_NUMNODES;
  				nid++)
  			if (max_value == khugepaged_node_load[nid]) {
  				target_node = nid;
  				break;
  			}
  
  	last_khugepaged_target_node = target_node;
  	return target_node;
  }
26234f36e   Xiao Guangrong   thp: introduce kh...
2168
2169
2170
2171
2172
2173
2174
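  /*
   * Descriptive note (summarising the code below): NUMA variant - no page
   * is preallocated here. If the previous allocation attempt failed
   * (*hpage is an ERR_PTR), sleep and allow one retry before giving up;
   * otherwise drop any left-over page so khugepaged_alloc_page() can
   * allocate on the right node.
   */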
  static bool khugepaged_prealloc_page(struct page **hpage, bool *wait)
  {
  	if (IS_ERR(*hpage)) {
  		if (!*wait)
  			return false;
  
  		*wait = false;
e3b4126c5   Xiao Guangrong   thp: khugepaged_p...
2175
  		*hpage = NULL;
26234f36e   Xiao Guangrong   thp: introduce kh...
2176
2177
2178
2179
2180
2181
2182
2183
  		khugepaged_alloc_sleep();
  	} else if (*hpage) {
  		put_page(*hpage);
  		*hpage = NULL;
  	}
  
  	return true;
  }
3b3636924   Michal Hocko   mm, memcg: sync a...
2184
2185
  static struct page *
  khugepaged_alloc_page(struct page **hpage, gfp_t gfp, struct mm_struct *mm,
d6669d689   Aaron Tomlin   thp: remove unuse...
2186
  		       unsigned long address, int node)
26234f36e   Xiao Guangrong   thp: introduce kh...
2187
  {
309381fea   Sasha Levin   mm: dump page whe...
2188
  	VM_BUG_ON_PAGE(*hpage, *hpage);
8b1645685   Vlastimil Babka   mm, THP: don't ho...
2189

ce83d2174   Andrea Arcangeli   thp: allocate mem...
2190
  	/*
8b1645685   Vlastimil Babka   mm, THP: don't ho...
2191
2192
2193
2194
  	 * Before allocating the hugepage, release the mmap_sem read lock.
  	 * The allocation can potentially take a long time if it involves
  	 * sync compaction, and we do not need to hold the mmap_sem during
  	 * that. We will recheck the vma after taking it again in write mode.
ce83d2174   Andrea Arcangeli   thp: allocate mem...
2195
  	 */
8b1645685   Vlastimil Babka   mm, THP: don't ho...
2196
  	up_read(&mm->mmap_sem);
96db800f5   Vlastimil Babka   mm: rename alloc_...
2197
  	*hpage = __alloc_pages_node(node, gfp, HPAGE_PMD_ORDER);
26234f36e   Xiao Guangrong   thp: introduce kh...
2198
  	if (unlikely(!*hpage)) {
81ab4201f   Andi Kleen   mm: add VM counte...
2199
  		count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
ce83d2174   Andrea Arcangeli   thp: allocate mem...
2200
  		*hpage = ERR_PTR(-ENOMEM);
26234f36e   Xiao Guangrong   thp: introduce kh...
2201
  		return NULL;
ce83d2174   Andrea Arcangeli   thp: allocate mem...
2202
  	}
26234f36e   Xiao Guangrong   thp: introduce kh...
2203

9a982250f   Kirill A. Shutemov   thp: introduce de...
2204
  	prep_transhuge_page(*hpage);
65b3c07b4   Xiao Guangrong   thp: fix the coun...
2205
  	count_vm_event(THP_COLLAPSE_ALLOC);
26234f36e   Xiao Guangrong   thp: introduce kh...
2206
2207
2208
  	return *hpage;
  }
  #else
9f1b868a1   Bob Liu   mm: thp: khugepag...
2209
2210
2211
2212
  static int khugepaged_find_target_node(void)
  {
  	return 0;
  }
444eb2a44   Mel Gorman   mm: thp: set THP ...
2213
  static inline struct page *alloc_khugepaged_hugepage(void)
10dc4155c   Bob Liu   mm: thp: cleanup:...
2214
  {
9a982250f   Kirill A. Shutemov   thp: introduce de...
2215
  	struct page *page;
444eb2a44   Mel Gorman   mm: thp: set THP ...
2216
2217
  	page = alloc_pages(alloc_hugepage_khugepaged_gfpmask(),
  			   HPAGE_PMD_ORDER);
9a982250f   Kirill A. Shutemov   thp: introduce de...
2218
2219
2220
  	if (page)
  		prep_transhuge_page(page);
  	return page;
10dc4155c   Bob Liu   mm: thp: cleanup:...
2221
  }
26234f36e   Xiao Guangrong   thp: introduce kh...
2222
2223
2224
2225
2226
  static struct page *khugepaged_alloc_hugepage(bool *wait)
  {
  	struct page *hpage;
  
  	do {
444eb2a44   Mel Gorman   mm: thp: set THP ...
2227
  		hpage = alloc_khugepaged_hugepage();
26234f36e   Xiao Guangrong   thp: introduce kh...
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
  		if (!hpage) {
  			count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
  			if (!*wait)
  				return NULL;
  
  			*wait = false;
  			khugepaged_alloc_sleep();
  		} else
  			count_vm_event(THP_COLLAPSE_ALLOC);
  	} while (unlikely(!hpage) && likely(khugepaged_enabled()));
  
  	return hpage;
  }
  
  static bool khugepaged_prealloc_page(struct page **hpage, bool *wait)
  {
  	if (!*hpage)
  		*hpage = khugepaged_alloc_hugepage(wait);
  
  	if (unlikely(!*hpage))
  		return false;
  
  	return true;
  }
3b3636924   Michal Hocko   mm, memcg: sync a...
2252
2253
  static struct page *
  khugepaged_alloc_page(struct page **hpage, gfp_t gfp, struct mm_struct *mm,
d6669d689   Aaron Tomlin   thp: remove unuse...
2254
  		       unsigned long address, int node)
26234f36e   Xiao Guangrong   thp: introduce kh...
2255
2256
2257
  {
  	up_read(&mm->mmap_sem);
  	VM_BUG_ON(!*hpage);
3b3636924   Michal Hocko   mm, memcg: sync a...
2258

26234f36e   Xiao Guangrong   thp: introduce kh...
2259
2260
  	return  *hpage;
  }
692e0b354   Andrea Arcangeli   mm: thp: optimize...
2261
  #endif
fa475e517   Bob Liu   thp: introduce hu...
2262
2263
2264
2265
2266
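  /*
   * Descriptive note (summarising the code below): check whether khugepaged
   * may collapse pages in this vma. Hugepages must be enabled for it
   * (VM_HUGEPAGE or "always" mode, and not VM_NOHUGEPAGE), it must be a
   * regular anonymous mapping (anon_vma set, no vm_ops), not a temporary
   * stack, and not marked VM_NO_THP.
   */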
  static bool hugepage_vma_check(struct vm_area_struct *vma)
  {
  	if ((!(vma->vm_flags & VM_HUGEPAGE) && !khugepaged_always()) ||
  	    (vma->vm_flags & VM_NOHUGEPAGE))
  		return false;
fa475e517   Bob Liu   thp: introduce hu...
2267
2268
2269
2270
  	if (!vma->anon_vma || vma->vm_ops)
  		return false;
  	if (is_vma_temporary_stack(vma))
  		return false;
3486b85a2   Konstantin Khlebnikov   mm/huge_memory: r...
2271
  	return !(vma->vm_flags & VM_NO_THP);
fa475e517   Bob Liu   thp: introduce hu...
2272
  }
26234f36e   Xiao Guangrong   thp: introduce kh...
2273
2274
2275
2276
2277
2278
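  /*
   * Descriptive note (summarising the code below): collapse the pmd-sized
   * range at @address into a single huge page - allocate the new page
   * (dropping mmap_sem), revalidate the vma with mmap_sem held for write,
   * isolate and copy the existing pages, then install the new page under
   * one huge pmd. On success *hpage is consumed; on failure the memcg
   * charge is cancelled and *hpage is left for the caller to free.
   */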
  static void collapse_huge_page(struct mm_struct *mm,
  				   unsigned long address,
  				   struct page **hpage,
  				   struct vm_area_struct *vma,
  				   int node)
  {
26234f36e   Xiao Guangrong   thp: introduce kh...
2279
2280
2281
2282
  	pmd_t *pmd, _pmd;
  	pte_t *pte;
  	pgtable_t pgtable;
  	struct page *new_page;
c4088ebdc   Kirill A. Shutemov   mm: convert the r...
2283
  	spinlock_t *pmd_ptl, *pte_ptl;
629d9d1ca   Arnd Bergmann   mm: avoid uniniti...
2284
  	int isolated = 0, result = 0;
26234f36e   Xiao Guangrong   thp: introduce kh...
2285
  	unsigned long hstart, hend;
00501b531   Johannes Weiner   mm: memcontrol: r...
2286
  	struct mem_cgroup *memcg;
2ec74c3ef   Sagi Grimberg   mm: move all mmu ...
2287
2288
  	unsigned long mmun_start;	/* For mmu_notifiers */
  	unsigned long mmun_end;		/* For mmu_notifiers */
3b3636924   Michal Hocko   mm, memcg: sync a...
2289
  	gfp_t gfp;
26234f36e   Xiao Guangrong   thp: introduce kh...
2290
2291
  
  	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
3b3636924   Michal Hocko   mm, memcg: sync a...
2292
  	/* Only allocate from the target node */
444eb2a44   Mel Gorman   mm: thp: set THP ...
2293
  	gfp = alloc_hugepage_khugepaged_gfpmask() | __GFP_OTHER_NODE | __GFP_THISNODE;
3b3636924   Michal Hocko   mm, memcg: sync a...
2294

26234f36e   Xiao Guangrong   thp: introduce kh...
2295
  	/* release the mmap_sem read lock. */
d6669d689   Aaron Tomlin   thp: remove unuse...
2296
  	new_page = khugepaged_alloc_page(hpage, gfp, mm, address, node);
7d2eba055   Ebru Akagunduz   mm: add tracepoin...
2297
2298
2299
2300
  	if (!new_page) {
  		result = SCAN_ALLOC_HUGE_PAGE_FAIL;
  		goto out_nolock;
  	}
26234f36e   Xiao Guangrong   thp: introduce kh...
2301

f627c2f53   Kirill A. Shutemov   memcg: adjust to ...
2302
  	if (unlikely(mem_cgroup_try_charge(new_page, mm, gfp, &memcg, true))) {
7d2eba055   Ebru Akagunduz   mm: add tracepoin...
2303
2304
2305
  		result = SCAN_CGROUP_CHARGE_FAIL;
  		goto out_nolock;
  	}
ba76149f4   Andrea Arcangeli   thp: khugepaged
2306
2307
2308
2309
2310
2311
2312
  
  	/*
  	 * Prevent all access to pagetables with the exception of
  	 * gup_fast later handled by the ptep_clear_flush and the VM
  	 * handled by the anon_vma lock + PG_lock.
  	 */
  	down_write(&mm->mmap_sem);
7d2eba055   Ebru Akagunduz   mm: add tracepoin...
2313
2314
  	if (unlikely(khugepaged_test_exit(mm))) {
  		result = SCAN_ANY_PROCESS;
ba76149f4   Andrea Arcangeli   thp: khugepaged
2315
  		goto out;
7d2eba055   Ebru Akagunduz   mm: add tracepoin...
2316
  	}
ba76149f4   Andrea Arcangeli   thp: khugepaged
2317
2318
  
  	vma = find_vma(mm, address);
7d2eba055   Ebru Akagunduz   mm: add tracepoin...
2319
2320
  	if (!vma) {
  		result = SCAN_VMA_NULL;
a8f531ebc   Libin   mm/huge_memory.c:...
2321
  		goto out;
7d2eba055   Ebru Akagunduz   mm: add tracepoin...
2322
  	}
ba76149f4   Andrea Arcangeli   thp: khugepaged
2323
2324
  	hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
  	hend = vma->vm_end & HPAGE_PMD_MASK;
7d2eba055   Ebru Akagunduz   mm: add tracepoin...
2325
2326
  	if (address < hstart || address + HPAGE_PMD_SIZE > hend) {
  		result = SCAN_ADDRESS_RANGE;
ba76149f4   Andrea Arcangeli   thp: khugepaged
2327
  		goto out;
7d2eba055   Ebru Akagunduz   mm: add tracepoin...
2328
2329
2330
  	}
  	if (!hugepage_vma_check(vma)) {
  		result = SCAN_VMA_CHECK;
a7d6e4ecd   Andrea Arcangeli   thp: prevent huge...
2331
  		goto out;
7d2eba055   Ebru Akagunduz   mm: add tracepoin...
2332
  	}
6219049ae   Bob Liu   mm: introduce mm_...
2333
  	pmd = mm_find_pmd(mm, address);
7d2eba055   Ebru Akagunduz   mm: add tracepoin...
2334
2335
  	if (!pmd) {
  		result = SCAN_PMD_NULL;
ba76149f4   Andrea Arcangeli   thp: khugepaged
2336
  		goto out;
7d2eba055   Ebru Akagunduz   mm: add tracepoin...
2337
  	}
ba76149f4   Andrea Arcangeli   thp: khugepaged
2338

4fc3f1d66   Ingo Molnar   mm/rmap, migratio...
2339
  	anon_vma_lock_write(vma->anon_vma);
ba76149f4   Andrea Arcangeli   thp: khugepaged
2340
2341
  
  	pte = pte_offset_map(pmd, address);
c4088ebdc   Kirill A. Shutemov   mm: convert the r...
2342
  	pte_ptl = pte_lockptr(mm, pmd);
ba76149f4   Andrea Arcangeli   thp: khugepaged
2343

2ec74c3ef   Sagi Grimberg   mm: move all mmu ...
2344
2345
2346
  	mmun_start = address;
  	mmun_end   = address + HPAGE_PMD_SIZE;
  	mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
c4088ebdc   Kirill A. Shutemov   mm: convert the r...
2347
  	pmd_ptl = pmd_lock(mm, pmd); /* probably unnecessary */
ba76149f4   Andrea Arcangeli   thp: khugepaged
2348
2349
2350
2351
2352
2353
  	/*
  	 * After this gup_fast can't run anymore. This also removes
  	 * any huge TLB entry from the CPU so we won't allow
  	 * huge and small TLB entries for the same virtual address
  	 * to avoid the risk of CPU bugs in that area.
  	 */
15a25b2ea   Aneesh Kumar K.V   mm/thp: split out...
2354
  	_pmd = pmdp_collapse_flush(vma, address, pmd);
c4088ebdc   Kirill A. Shutemov   mm: convert the r...
2355
  	spin_unlock(pmd_ptl);
2ec74c3ef   Sagi Grimberg   mm: move all mmu ...
2356
  	mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
ba76149f4   Andrea Arcangeli   thp: khugepaged
2357

c4088ebdc   Kirill A. Shutemov   mm: convert the r...
2358
  	spin_lock(pte_ptl);
ba76149f4   Andrea Arcangeli   thp: khugepaged
2359
  	isolated = __collapse_huge_page_isolate(vma, address, pte);
c4088ebdc   Kirill A. Shutemov   mm: convert the r...
2360
  	spin_unlock(pte_ptl);
ba76149f4   Andrea Arcangeli   thp: khugepaged
2361
2362
  
  	if (unlikely(!isolated)) {
453c71926   Johannes Weiner   thp: keep highpte...
2363
  		pte_unmap(pte);
c4088ebdc   Kirill A. Shutemov   mm: convert the r...
2364
  		spin_lock(pmd_ptl);
ba76149f4   Andrea Arcangeli   thp: khugepaged
2365
  		BUG_ON(!pmd_none(*pmd));
7c3425123   Aneesh Kumar K.V   mm/THP: use pmd_p...
2366
2367
2368
2369
2370
2371
  		/*
  		 * We can only use set_pmd_at when establishing
  		 * hugepmds and never for establishing regular pmds that
  		 * points to regular pagetables. Use pmd_populate for that
  		 */
  		pmd_populate(mm, pmd, pmd_pgtable(_pmd));
c4088ebdc   Kirill A. Shutemov   mm: convert the r...
2372
  		spin_unlock(pmd_ptl);
08b52706d   Konstantin Khlebnikov   mm/rmap: rename a...
2373
  		anon_vma_unlock_write(vma->anon_vma);
7d2eba055   Ebru Akagunduz   mm: add tracepoin...
2374
  		result = SCAN_FAIL;
ce83d2174   Andrea Arcangeli   thp: allocate mem...
2375
  		goto out;
ba76149f4   Andrea Arcangeli   thp: khugepaged
2376
2377
2378
2379
2380
2381
  	}
  
  	/*
  	 * All pages are isolated and locked so anon_vma rmap
  	 * can't run anymore.
  	 */
08b52706d   Konstantin Khlebnikov   mm/rmap: rename a...
2382
  	anon_vma_unlock_write(vma->anon_vma);
ba76149f4   Andrea Arcangeli   thp: khugepaged
2383

c4088ebdc   Kirill A. Shutemov   mm: convert the r...
2384
  	__collapse_huge_page_copy(pte, new_page, vma, address, pte_ptl);
453c71926   Johannes Weiner   thp: keep highpte...
2385
  	pte_unmap(pte);
ba76149f4   Andrea Arcangeli   thp: khugepaged
2386
2387
  	__SetPageUptodate(new_page);
  	pgtable = pmd_pgtable(_pmd);
ba76149f4   Andrea Arcangeli   thp: khugepaged
2388

3122359a6   Kirill A. Shutemov   thp: move maybe_p...
2389
2390
  	_pmd = mk_huge_pmd(new_page, vma->vm_page_prot);
  	_pmd = maybe_pmd_mkwrite(pmd_mkdirty(_pmd), vma);
ba76149f4   Andrea Arcangeli   thp: khugepaged
2391
2392
2393
2394
2395
2396
2397
  
  	/*
  	 * spin_lock() below is not the equivalent of smp_wmb(), so
  	 * this is needed to prevent the copy_huge_page writes from becoming
  	 * visible after the set_pmd_at() write.
  	 */
  	smp_wmb();
c4088ebdc   Kirill A. Shutemov   mm: convert the r...
2398
  	spin_lock(pmd_ptl);
ba76149f4   Andrea Arcangeli   thp: khugepaged
2399
  	BUG_ON(!pmd_none(*pmd));
d281ee614   Kirill A. Shutemov   rmap: add argumen...
2400
  	page_add_new_anon_rmap(new_page, vma, address, true);
f627c2f53   Kirill A. Shutemov   memcg: adjust to ...
2401
  	mem_cgroup_commit_charge(new_page, memcg, false, true);
00501b531   Johannes Weiner   mm: memcontrol: r...
2402
  	lru_cache_add_active_or_unevictable(new_page, vma);
fce144b47   Aneesh Kumar K.V   mm/THP: deposit t...
2403
  	pgtable_trans_huge_deposit(mm, pmd, pgtable);
ba76149f4   Andrea Arcangeli   thp: khugepaged
2404
  	set_pmd_at(mm, address, pmd, _pmd);
b113da657   David Miller   mm: Add and use u...
2405
  	update_mmu_cache_pmd(vma, address, pmd);
c4088ebdc   Kirill A. Shutemov   mm: convert the r...
2406
  	spin_unlock(pmd_ptl);
ba76149f4   Andrea Arcangeli   thp: khugepaged
2407
2408
  
  	*hpage = NULL;
420256ef0   Xiao Guangrong   thp: release page...
2409

ba76149f4   Andrea Arcangeli   thp: khugepaged
2410
  	khugepaged_pages_collapsed++;
7d2eba055   Ebru Akagunduz   mm: add tracepoin...
2411
  	result = SCAN_SUCCEED;
ce83d2174   Andrea Arcangeli   thp: allocate mem...
2412
  out_up_write:
ba76149f4   Andrea Arcangeli   thp: khugepaged
2413
  	up_write(&mm->mmap_sem);
7d2eba055   Ebru Akagunduz   mm: add tracepoin...
2414
  	trace_mm_collapse_huge_page(mm, isolated, result);
0bbbc0b33   Andrea Arcangeli   thp: add numa awa...
2415
  	return;
7d2eba055   Ebru Akagunduz   mm: add tracepoin...
2416
2417
2418
  out_nolock:
  	trace_mm_collapse_huge_page(mm, isolated, result);
  	return;
ce83d2174   Andrea Arcangeli   thp: allocate mem...
2419
  out:
f627c2f53   Kirill A. Shutemov   memcg: adjust to ...
2420
  	mem_cgroup_cancel_charge(new_page, memcg, true);
ce83d2174   Andrea Arcangeli   thp: allocate mem...
2421
  	goto out_up_write;
ba76149f4   Andrea Arcangeli   thp: khugepaged
2422
2423
2424
2425
2426
2427
2428
  }
  
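  /*
   * Descriptive note (summarising the code below): scan one pmd-sized range
   * and decide whether it is worth collapsing. None/zero ptes are counted
   * against khugepaged_max_ptes_none; pages must be anonymous, on the LRU,
   * unlocked and not pinned, and at least one pte must be young/referenced
   * with the range writable. Returns 1 after calling collapse_huge_page(),
   * which drops mmap_sem.
   */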
  static int khugepaged_scan_pmd(struct mm_struct *mm,
  			       struct vm_area_struct *vma,
  			       unsigned long address,
  			       struct page **hpage)
  {
ba76149f4   Andrea Arcangeli   thp: khugepaged
2429
2430
  	pmd_t *pmd;
  	pte_t *pte, *_pte;
7d2eba055   Ebru Akagunduz   mm: add tracepoin...
2431
2432
  	int ret = 0, none_or_zero = 0, result = 0;
  	struct page *page = NULL;
ba76149f4   Andrea Arcangeli   thp: khugepaged
2433
2434
  	unsigned long _address;
  	spinlock_t *ptl;
00ef2d2f8   David Rientjes   mm: use NUMA_NO_NODE
2435
  	int node = NUMA_NO_NODE;
10359213d   Ebru Akagunduz   mm: incorporate r...
2436
  	bool writable = false, referenced = false;
ba76149f4   Andrea Arcangeli   thp: khugepaged
2437
2438
  
  	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
6219049ae   Bob Liu   mm: introduce mm_...
2439
  	pmd = mm_find_pmd(mm, address);
7d2eba055   Ebru Akagunduz   mm: add tracepoin...
2440
2441
  	if (!pmd) {
  		result = SCAN_PMD_NULL;
ba76149f4   Andrea Arcangeli   thp: khugepaged
2442
  		goto out;
7d2eba055   Ebru Akagunduz   mm: add tracepoin...
2443
  	}
ba76149f4   Andrea Arcangeli   thp: khugepaged
2444

9f1b868a1   Bob Liu   mm: thp: khugepag...
2445
  	memset(khugepaged_node_load, 0, sizeof(khugepaged_node_load));
ba76149f4   Andrea Arcangeli   thp: khugepaged
2446
2447
2448
2449
  	pte = pte_offset_map_lock(mm, pmd, address, &ptl);
  	for (_address = address, _pte = pte; _pte < pte+HPAGE_PMD_NR;
  	     _pte++, _address += PAGE_SIZE) {
  		pte_t pteval = *_pte;
ca0984caa   Ebru Akagunduz   mm: incorporate z...
2450
  		if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) {
c1294d05d   Andrea Arcangeli   userfaultfd: prev...
2451
  			if (!userfaultfd_armed(vma) &&
7d2eba055   Ebru Akagunduz   mm: add tracepoin...
2452
  			    ++none_or_zero <= khugepaged_max_ptes_none) {
ba76149f4   Andrea Arcangeli   thp: khugepaged
2453
  				continue;
7d2eba055   Ebru Akagunduz   mm: add tracepoin...
2454
2455
  			} else {
  				result = SCAN_EXCEED_NONE_PTE;
ba76149f4   Andrea Arcangeli   thp: khugepaged
2456
  				goto out_unmap;
7d2eba055   Ebru Akagunduz   mm: add tracepoin...
2457
  			}
ba76149f4   Andrea Arcangeli   thp: khugepaged
2458
  		}
7d2eba055   Ebru Akagunduz   mm: add tracepoin...
2459
2460
  		if (!pte_present(pteval)) {
  			result = SCAN_PTE_NON_PRESENT;
ba76149f4   Andrea Arcangeli   thp: khugepaged
2461
  			goto out_unmap;
7d2eba055   Ebru Akagunduz   mm: add tracepoin...
2462
  		}
10359213d   Ebru Akagunduz   mm: incorporate r...
2463
2464
  		if (pte_write(pteval))
  			writable = true;
ba76149f4   Andrea Arcangeli   thp: khugepaged
2465
  		page = vm_normal_page(vma, _address, pteval);
7d2eba055   Ebru Akagunduz   mm: add tracepoin...
2466
2467
  		if (unlikely(!page)) {
  			result = SCAN_PAGE_NULL;
ba76149f4   Andrea Arcangeli   thp: khugepaged
2468
  			goto out_unmap;
7d2eba055   Ebru Akagunduz   mm: add tracepoin...
2469
  		}
b1caa957a   Kirill A. Shutemov   khugepaged: ignor...
2470
2471
2472
2473
2474
2475
  
  		/* TODO: teach khugepaged to collapse THP mapped with pte */
  		if (PageCompound(page)) {
  			result = SCAN_PAGE_COMPOUND;
  			goto out_unmap;
  		}
5c4b4be3b   Andi Kleen   mm: use correct n...
2476
  		/*
9f1b868a1   Bob Liu   mm: thp: khugepag...
2477
2478
2479
2480
  		 * Record which node the original page is from and save this
  		 * information to khugepaged_node_load[].
  		 * Khugepaged will allocate the hugepage from the node that
  		 * has the maximum hit count.
5c4b4be3b   Andi Kleen   mm: use correct n...
2481
  		 */
9f1b868a1   Bob Liu   mm: thp: khugepag...
2482
  		node = page_to_nid(page);
7d2eba055   Ebru Akagunduz   mm: add tracepoin...
2483
2484
  		if (khugepaged_scan_abort(node)) {
  			result = SCAN_SCAN_ABORT;
14a4e2141   David Rientjes   mm, thp: only col...
2485
  			goto out_unmap;
7d2eba055   Ebru Akagunduz   mm: add tracepoin...
2486
  		}
9f1b868a1   Bob Liu   mm: thp: khugepag...
2487
  		khugepaged_node_load[node]++;
7d2eba055   Ebru Akagunduz   mm: add tracepoin...
2488
  		if (!PageLRU(page)) {
0fda2788b   Kirill A. Shutemov   thp: fix typo in ...
2489
  			result = SCAN_PAGE_LRU;
7d2eba055   Ebru Akagunduz   mm: add tracepoin...
2490
2491
2492
2493
  			goto out_unmap;
  		}
  		if (PageLocked(page)) {
  			result = SCAN_PAGE_LOCK;
ba76149f4   Andrea Arcangeli   thp: khugepaged
2494
  			goto out_unmap;
7d2eba055   Ebru Akagunduz   mm: add tracepoin...
2495
2496
2497
2498
2499
  		}
  		if (!PageAnon(page)) {
  			result = SCAN_PAGE_ANON;
  			goto out_unmap;
  		}
10359213d   Ebru Akagunduz   mm: incorporate r...
2500
2501
2502
2503
2504
  		/*
  		 * cannot use mapcount: can't collapse if there's a gup pin.
  		 * The page must only be referenced by the scanned process
  		 * and page swap cache.
  		 */
7d2eba055   Ebru Akagunduz   mm: add tracepoin...
2505
2506
  		if (page_count(page) != 1 + !!PageSwapCache(page)) {
  			result = SCAN_PAGE_COUNT;
ba76149f4   Andrea Arcangeli   thp: khugepaged
2507
  			goto out_unmap;
7d2eba055   Ebru Akagunduz   mm: add tracepoin...
2508
  		}
33c3fc71c   Vladimir Davydov   mm: introduce idl...
2509
2510
  		if (pte_young(pteval) ||
  		    page_is_young(page) || PageReferenced(page) ||
8ee53820e   Andrea Arcangeli   thp: mmu_notifier...
2511
  		    mmu_notifier_test_young(vma->vm_mm, address))
10359213d   Ebru Akagunduz   mm: incorporate r...
2512
  			referenced = true;
ba76149f4   Andrea Arcangeli   thp: khugepaged
2513
  	}
7d2eba055   Ebru Akagunduz   mm: add tracepoin...
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
  	if (writable) {
  		if (referenced) {
  			result = SCAN_SUCCEED;
  			ret = 1;
  		} else {
  			result = SCAN_NO_REFERENCED_PAGE;
  		}
  	} else {
  		result = SCAN_PAGE_RO;
  	}
ba76149f4   Andrea Arcangeli   thp: khugepaged
2524
2525
  out_unmap:
  	pte_unmap_unlock(pte, ptl);
9f1b868a1   Bob Liu   mm: thp: khugepag...
2526
2527
  	if (ret) {
  		node = khugepaged_find_target_node();
ce83d2174   Andrea Arcangeli   thp: allocate mem...
2528
  		/* collapse_huge_page will return with the mmap_sem released */
5c4b4be3b   Andi Kleen   mm: use correct n...
2529
  		collapse_huge_page(mm, address, hpage, vma, node);
9f1b868a1   Bob Liu   mm: thp: khugepag...
2530
  	}
ba76149f4   Andrea Arcangeli   thp: khugepaged
2531
  out:
16fd0fe4a   yalin wang   mm: fix kernel cr...
2532
  	trace_mm_khugepaged_scan_pmd(mm, page, writable, referenced,
7d2eba055   Ebru Akagunduz   mm: add tracepoin...
2533
  				     none_or_zero, result);
ba76149f4   Andrea Arcangeli   thp: khugepaged
2534
2535
2536
2537
2538
2539
  	return ret;
  }
  
  static void collect_mm_slot(struct mm_slot *mm_slot)
  {
  	struct mm_struct *mm = mm_slot->mm;
b9980cdcf   Hugh Dickins   mm: fix UP THP sp...
2540
  	VM_BUG_ON(NR_CPUS != 1 && !spin_is_locked(&khugepaged_mm_lock));
ba76149f4   Andrea Arcangeli   thp: khugepaged
2541
2542
2543
  
  	if (khugepaged_test_exit(mm)) {
  		/* free mm_slot */
43b5fbbd2   Sasha Levin   mm/huge_memory.c:...
2544
  		hash_del(&mm_slot->hash);
ba76149f4   Andrea Arcangeli   thp: khugepaged
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
  		list_del(&mm_slot->mm_node);
  
  		/*
  		 * Not strictly needed because the mm exited already.
  		 *
  		 * clear_bit(MMF_VM_HUGEPAGE, &mm->flags);
  		 */
  
  		/* khugepaged_mm_lock actually not necessary for the below */
  		free_mm_slot(mm_slot);
  		mmdrop(mm);
  	}
  }
  
  static unsigned int khugepaged_scan_mm_slot(unsigned int pages,
  					    struct page **hpage)
2f1da6421   H Hartley Sweeten   mm/huge_memory.c:...
2561
2562
  	__releases(&khugepaged_mm_lock)
  	__acquires(&khugepaged_mm_lock)
ba76149f4   Andrea Arcangeli   thp: khugepaged
2563
2564
2565
2566
2567
2568
2569
  {
  	struct mm_slot *mm_slot;
  	struct mm_struct *mm;
  	struct vm_area_struct *vma;
  	int progress = 0;
  
  	VM_BUG_ON(!pages);
b9980cdcf   Hugh Dickins   mm: fix UP THP sp...
2570
  	VM_BUG_ON(NR_CPUS != 1 && !spin_is_locked(&khugepaged_mm_lock));
ba76149f4   Andrea Arcangeli   thp: khugepaged
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
  
  	if (khugepaged_scan.mm_slot)
  		mm_slot = khugepaged_scan.mm_slot;
  	else {
  		mm_slot = list_entry(khugepaged_scan.mm_head.next,
  				     struct mm_slot, mm_node);
  		khugepaged_scan.address = 0;
  		khugepaged_scan.mm_slot = mm_slot;
  	}
  	spin_unlock(&khugepaged_mm_lock);
  
  	mm = mm_slot->mm;
  	down_read(&mm->mmap_sem);
  	if (unlikely(khugepaged_test_exit(mm)))
  		vma = NULL;
  	else
  		vma = find_vma(mm, khugepaged_scan.address);
  
  	progress++;
  	for (; vma; vma = vma->vm_next) {
  		unsigned long hstart, hend;
  
  		cond_resched();
  		if (unlikely(khugepaged_test_exit(mm))) {
  			progress++;
  			break;
  		}
fa475e517   Bob Liu   thp: introduce hu...
2598
2599
  		if (!hugepage_vma_check(vma)) {
  skip:
ba76149f4   Andrea Arcangeli   thp: khugepaged
2600
2601
2602
  			progress++;
  			continue;
  		}
ba76149f4   Andrea Arcangeli   thp: khugepaged
2603
2604
  		hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
  		hend = vma->vm_end & HPAGE_PMD_MASK;
a7d6e4ecd   Andrea Arcangeli   thp: prevent huge...
2605
2606
2607
2608
  		if (hstart >= hend)
  			goto skip;
  		if (khugepaged_scan.address > hend)
  			goto skip;
ba76149f4   Andrea Arcangeli   thp: khugepaged
2609
2610
  		if (khugepaged_scan.address < hstart)
  			khugepaged_scan.address = hstart;
a7d6e4ecd   Andrea Arcangeli   thp: prevent huge...
2611
  		VM_BUG_ON(khugepaged_scan.address & ~HPAGE_PMD_MASK);
ba76149f4   Andrea Arcangeli   thp: khugepaged
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
  
  		while (khugepaged_scan.address < hend) {
  			int ret;
  			cond_resched();
  			if (unlikely(khugepaged_test_exit(mm)))
  				goto breakouterloop;
  
  			VM_BUG_ON(khugepaged_scan.address < hstart ||
  				  khugepaged_scan.address + HPAGE_PMD_SIZE >
  				  hend);
  			ret = khugepaged_scan_pmd(mm, vma,
  						  khugepaged_scan.address,
  						  hpage);
  			/* move to next address */
  			khugepaged_scan.address += HPAGE_PMD_SIZE;
  			progress += HPAGE_PMD_NR;
  			if (ret)
  				/* we released mmap_sem so break loop */
  				goto breakouterloop_mmap_sem;
  			if (progress >= pages)
  				goto breakouterloop;
  		}
  	}
  breakouterloop:
  	up_read(&mm->mmap_sem); /* exit_mmap will destroy ptes after this */
  breakouterloop_mmap_sem:
  
  	spin_lock(&khugepaged_mm_lock);
a7d6e4ecd   Andrea Arcangeli   thp: prevent huge...
2640
  	VM_BUG_ON(khugepaged_scan.mm_slot != mm_slot);
ba76149f4   Andrea Arcangeli   thp: khugepaged
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
  	/*
  	 * Release the current mm_slot if this mm is about to die, or
  	 * if we scanned all vmas of this mm.
  	 */
  	if (khugepaged_test_exit(mm) || !vma) {
  		/*
  		 * Make sure that if mm_users is reaching zero while
  		 * khugepaged runs here, khugepaged_exit will find
  		 * mm_slot not pointing to the exiting mm.
  		 */
  		if (mm_slot->mm_node.next != &khugepaged_scan.mm_head) {
  			khugepaged_scan.mm_slot = list_entry(
  				mm_slot->mm_node.next,
  				struct mm_slot, mm_node);
  			khugepaged_scan.address = 0;
  		} else {
  			khugepaged_scan.mm_slot = NULL;
  			khugepaged_full_scans++;
  		}
  
  		collect_mm_slot(mm_slot);
  	}
  
  	return progress;
  }
  
  static int khugepaged_has_work(void)
  {
  	return !list_empty(&khugepaged_scan.mm_head) &&
  		khugepaged_enabled();
  }
  
  static int khugepaged_wait_event(void)
  {
  	return !list_empty(&khugepaged_scan.mm_head) ||
2017c0bff   Xiao Guangrong   thp: remove wake_...
2676
  		kthread_should_stop();
ba76149f4   Andrea Arcangeli   thp: khugepaged
2677
  }
d516904bd   Xiao Guangrong   thp: merge page p...
2678
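  /*
   * Descriptive note (summarising the code below): one scan pass of the
   * khugepaged thread - preallocate a hugepage and walk up to
   * khugepaged_pages_to_scan pages worth of registered mms under
   * khugepaged_mm_lock, stopping early if the thread should stop or freeze.
   */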
  static void khugepaged_do_scan(void)
ba76149f4   Andrea Arcangeli   thp: khugepaged
2679
  {
d516904bd   Xiao Guangrong   thp: merge page p...
2680
  	struct page *hpage = NULL;
ba76149f4   Andrea Arcangeli   thp: khugepaged
2681
2682
  	unsigned int progress = 0, pass_through_head = 0;
  	unsigned int pages = khugepaged_pages_to_scan;
d516904bd   Xiao Guangrong   thp: merge page p...
2683
  	bool wait = true;
ba76149f4   Andrea Arcangeli   thp: khugepaged
2684
2685
2686
2687
  
  	barrier(); /* write khugepaged_pages_to_scan to local stack */
  
  	while (progress < pages) {
26234f36e   Xiao Guangrong   thp: introduce kh...
2688
  		if (!khugepaged_prealloc_page(&hpage, &wait))
d516904bd   Xiao Guangrong   thp: merge page p...
2689
  			break;
26234f36e   Xiao Guangrong   thp: introduce kh...
2690

420256ef0   Xiao Guangrong   thp: release page...
2691
  		cond_resched();
ba76149f4   Andrea Arcangeli   thp: khugepaged
2692

cd0924112   Jiri Kosina   thp: cleanup how ...
2693
  		if (unlikely(kthread_should_stop() || try_to_freeze()))
878aee7d6   Andrea Arcangeli   thp: freeze khuge...
2694
  			break;
ba76149f4   Andrea Arcangeli   thp: khugepaged
2695
2696
2697
2698
2699
2700
  		spin_lock(&khugepaged_mm_lock);
  		if (!khugepaged_scan.mm_slot)
  			pass_through_head++;
  		if (khugepaged_has_work() &&
  		    pass_through_head < 2)
  			progress += khugepaged_scan_mm_slot(pages - progress,
d516904bd   Xiao Guangrong   thp: merge page p...
2701
  							    &hpage);
ba76149f4   Andrea Arcangeli   thp: khugepaged
2702
2703
2704
2705
  		else
  			progress = pages;
  		spin_unlock(&khugepaged_mm_lock);
  	}
ba76149f4   Andrea Arcangeli   thp: khugepaged
2706

d516904bd   Xiao Guangrong   thp: merge page p...
2707
2708
  	if (!IS_ERR_OR_NULL(hpage))
  		put_page(hpage);
0bbbc0b33   Andrea Arcangeli   thp: add numa awa...
2709
  }
2017c0bff   Xiao Guangrong   thp: remove wake_...
2710
2711
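  /*
   * Descriptive note (summarising the code below): sleep between scan
   * passes - a freezable timeout of khugepaged_scan_sleep_millisecs while
   * work is still queued, or an indefinite freezable wait until new work
   * arrives or the thread is asked to stop.
   */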
  static void khugepaged_wait_work(void)
  {
2017c0bff   Xiao Guangrong   thp: remove wake_...
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
  	if (khugepaged_has_work()) {
  		if (!khugepaged_scan_sleep_millisecs)
  			return;
  
  		wait_event_freezable_timeout(khugepaged_wait,
  					     kthread_should_stop(),
  			msecs_to_jiffies(khugepaged_scan_sleep_millisecs));
  		return;
  	}
  
  	if (khugepaged_enabled())
  		wait_event_freezable(khugepaged_wait, khugepaged_wait_event());
  }
ba76149f4   Andrea Arcangeli   thp: khugepaged
2725
2726
2727
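  /*
   * Descriptive note (summarising the code below): main loop of the
   * khugepaged kernel thread - alternate scan passes and sleeps until the
   * thread is stopped, then detach and release the mm_slot that was being
   * scanned, if any.
   */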
  static int khugepaged(void *none)
  {
  	struct mm_slot *mm_slot;
878aee7d6   Andrea Arcangeli   thp: freeze khuge...
2728
  	set_freezable();
8698a745d   Dongsheng Yang   sched, treewide: ...
2729
  	set_user_nice(current, MAX_NICE);
ba76149f4   Andrea Arcangeli   thp: khugepaged
2730

b7231789b   Xiao Guangrong   thp: remove khuge...
2731
2732
2733
2734
  	while (!kthread_should_stop()) {
  		khugepaged_do_scan();
  		khugepaged_wait_work();
  	}
ba76149f4   Andrea Arcangeli   thp: khugepaged
2735
2736
2737
2738
2739
2740
2741
  
  	spin_lock(&khugepaged_mm_lock);
  	mm_slot = khugepaged_scan.mm_slot;
  	khugepaged_scan.mm_slot = NULL;
  	if (mm_slot)
  		collect_mm_slot(mm_slot);
  	spin_unlock(&khugepaged_mm_lock);
ba76149f4   Andrea Arcangeli   thp: khugepaged
2742
2743
  	return 0;
  }
eef1b3ba0   Kirill A. Shutemov   thp: implement sp...
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
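  /*
   * Descriptive note (summarising the code below): replace a huge zero page
   * pmd with a page table that maps the small zero page into every pte
   * slot, then drop the reference the huge zero page pmd was holding.
   */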
  static void __split_huge_zero_page_pmd(struct vm_area_struct *vma,
  		unsigned long haddr, pmd_t *pmd)
  {
  	struct mm_struct *mm = vma->vm_mm;
  	pgtable_t pgtable;
  	pmd_t _pmd;
  	int i;
  
  	/* leave pmd empty until pte is filled */
  	pmdp_huge_clear_flush_notify(vma, haddr, pmd);
  
  	pgtable = pgtable_trans_huge_withdraw(mm, pmd);
  	pmd_populate(mm, &_pmd, pgtable);
  
  	for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
  		pte_t *pte, entry;
  		entry = pfn_pte(my_zero_pfn(haddr), vma->vm_page_prot);
  		entry = pte_mkspecial(entry);
  		pte = pte_offset_map(&_pmd, haddr);
  		VM_BUG_ON(!pte_none(*pte));
  		set_pte_at(mm, haddr, pte, entry);
  		pte_unmap(pte);
  	}
  	smp_wmb(); /* make pte visible before pmd */
  	pmd_populate(mm, pmd, pgtable);
  	put_huge_zero_page();
  }
  
  static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
ba9882808   Kirill A. Shutemov   thp: add option t...
2773
  		unsigned long haddr, bool freeze)
eef1b3ba0   Kirill A. Shutemov   thp: implement sp...
2774
2775
2776
2777
2778
  {
  	struct mm_struct *mm = vma->vm_mm;
  	struct page *page;
  	pgtable_t pgtable;
  	pmd_t _pmd;
b8d3c4c30   Minchan Kim   mm/huge_memory.c:...
2779
  	bool young, write, dirty;
2ac015e29   Kirill A. Shutemov   thp: call pmdp_in...
2780
  	unsigned long addr;
eef1b3ba0   Kirill A. Shutemov   thp: implement sp...
2781
2782
2783
2784
2785
  	int i;
  
  	VM_BUG_ON(haddr & ~HPAGE_PMD_MASK);
  	VM_BUG_ON_VMA(vma->vm_start > haddr, vma);
  	VM_BUG_ON_VMA(vma->vm_end < haddr + HPAGE_PMD_SIZE, vma);
5c7fb56e5   Dan Williams   mm, dax: dax-pmd ...
2786
  	VM_BUG_ON(!pmd_trans_huge(*pmd) && !pmd_devmap(*pmd));
eef1b3ba0   Kirill A. Shutemov   thp: implement sp...
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
  
  	count_vm_event(THP_SPLIT_PMD);
  
  	if (vma_is_dax(vma)) {
  		pmd_t _pmd = pmdp_huge_clear_flush_notify(vma, haddr, pmd);
  		if (is_huge_zero_pmd(_pmd))
  			put_huge_zero_page();
  		return;
  	} else if (is_huge_zero_pmd(*pmd)) {
  		return __split_huge_zero_page_pmd(vma, haddr, pmd);
  	}
  
  	page = pmd_page(*pmd);
  	VM_BUG_ON_PAGE(!page_count(page), page);
fe896d187   Joonsoo Kim   mm: introduce pag...
2801
  	page_ref_add(page, HPAGE_PMD_NR - 1);
eef1b3ba0   Kirill A. Shutemov   thp: implement sp...
2802
2803
  	write = pmd_write(*pmd);
  	young = pmd_young(*pmd);
b8d3c4c30   Minchan Kim   mm/huge_memory.c:...
2804
  	dirty = pmd_dirty(*pmd);
eef1b3ba0   Kirill A. Shutemov   thp: implement sp...
2805

c777e2a8b   Aneesh Kumar K.V   powerpc/mm: Fix M...
2806
  	pmdp_huge_split_prepare(vma, haddr, pmd);
eef1b3ba0   Kirill A. Shutemov   thp: implement sp...
2807
2808
  	pgtable = pgtable_trans_huge_withdraw(mm, pmd);
  	pmd_populate(mm, &_pmd, pgtable);
2ac015e29   Kirill A. Shutemov   thp: call pmdp_in...
2809
  	for (i = 0, addr = haddr; i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE) {
eef1b3ba0   Kirill A. Shutemov   thp: implement sp...
2810
2811
2812
2813
2814
2815
  		pte_t entry, *pte;
  		/*
  		 * Note that NUMA hinting access restrictions are not
  		 * transferred to avoid any possibility of altering
  		 * permissions across VMAs.
  		 */
ba9882808   Kirill A. Shutemov   thp: add option t...
2816
2817
2818
2819
2820
2821
  		if (freeze) {
  			swp_entry_t swp_entry;
  			swp_entry = make_migration_entry(page + i, write);
  			entry = swp_entry_to_pte(swp_entry);
  		} else {
  			entry = mk_pte(page + i, vma->vm_page_prot);
b8d3c4c30   Minchan Kim   mm/huge_memory.c:...
2822
  			entry = maybe_mkwrite(entry, vma);
ba9882808   Kirill A. Shutemov   thp: add option t...
2823
2824
2825
2826
2827
  			if (!write)
  				entry = pte_wrprotect(entry);
  			if (!young)
  				entry = pte_mkold(entry);
  		}
b8d3c4c30   Minchan Kim   mm/huge_memory.c:...
2828
2829
  		if (dirty)
  			SetPageDirty(page + i);
2ac015e29   Kirill A. Shutemov   thp: call pmdp_in...
2830
  		pte = pte_offset_map(&_pmd, addr);
eef1b3ba0   Kirill A. Shutemov   thp: implement sp...
2831
  		BUG_ON(!pte_none(*pte));
2ac015e29   Kirill A. Shutemov   thp: call pmdp_in...
2832
  		set_pte_at(mm, addr, pte, entry);
eef1b3ba0   Kirill A. Shutemov   thp: implement sp...
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
  		atomic_inc(&page[i]._mapcount);
  		pte_unmap(pte);
  	}
  
  	/*
  	 * Set PG_double_map before dropping compound_mapcount to avoid
  	 * false-negative page_mapped().
  	 */
  	if (compound_mapcount(page) > 1 && !TestSetPageDoubleMap(page)) {
  		for (i = 0; i < HPAGE_PMD_NR; i++)
  			atomic_inc(&page[i]._mapcount);
  	}
  
  	if (atomic_add_negative(-1, compound_mapcount_ptr(page))) {
  		/* Last compound_mapcount is gone. */
  		__dec_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES);
  		if (TestClearPageDoubleMap(page)) {
  			/* No need in mapcount reference anymore */
  			for (i = 0; i < HPAGE_PMD_NR; i++)
  				atomic_dec(&page[i]._mapcount);
  		}
  	}
  
  	smp_wmb(); /* make pte visible before pmd */
e9b61f198   Kirill A. Shutemov   thp: reintroduce ...
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
  	/*
  	 * Up to this point the pmd is present and huge and userland has the
  	 * whole access to the hugepage during the split (which happens in
  	 * place). If we overwrite the pmd with the not-huge version pointing
  	 * to the pte here (which of course we could if all CPUs were bug
  	 * free), userland could trigger a small page size TLB miss on the
  	 * small sized TLB while the hugepage TLB entry is still established in
  	 * the huge TLB. Some CPUs don't like that.
  	 * See http://support.amd.com/us/Processor_TechDocs/41322.pdf, Erratum
  	 * 383 on page 93. Intel should be safe but also warns that it's
  	 * only safe if the permission and cache attributes of the two entries
  	 * loaded in the two TLBs are identical (which should be the case here).
  	 * But it is generally safer to never allow small and huge TLB entries
  	 * for the same virtual address to be loaded simultaneously. So instead
  	 * of doing "pmd_populate(); flush_pmd_tlb_range();" we first mark the
  	 * current pmd notpresent (atomically because here the pmd_trans_huge
  	 * and pmd_trans_splitting must remain set at all times on the pmd
  	 * until the split is complete for this pmd), then we flush the SMP TLB
  	 * and finally we write the non-huge version of the pmd entry with
  	 * pmd_populate.
  	 */
  	pmdp_invalidate(vma, haddr, pmd);
eef1b3ba0   Kirill A. Shutemov   thp: implement sp...
2879
  	pmd_populate(mm, pmd, pgtable);
e9b61f198   Kirill A. Shutemov   thp: reintroduce ...
2880
2881
  
  	if (freeze) {
2ac015e29   Kirill A. Shutemov   thp: call pmdp_in...
2882
  		for (i = 0; i < HPAGE_PMD_NR; i++) {
e9b61f198   Kirill A. Shutemov   thp: reintroduce ...
2883
2884
2885
2886
  			page_remove_rmap(page + i, false);
  			put_page(page + i);
  		}
  	}
eef1b3ba0   Kirill A. Shutemov   thp: implement sp...
2887
2888
2889
  }
  
  void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
fec89c109   Kirill A. Shutemov   thp: rewrite free...
2890
  		unsigned long address, bool freeze)
eef1b3ba0   Kirill A. Shutemov   thp: implement sp...
2891
2892
2893
2894
2895
2896
2897
  {
  	spinlock_t *ptl;
  	struct mm_struct *mm = vma->vm_mm;
  	unsigned long haddr = address & HPAGE_PMD_MASK;
  
  	mmu_notifier_invalidate_range_start(mm, haddr, haddr + HPAGE_PMD_SIZE);
  	ptl = pmd_lock(mm, pmd);
5c7fb56e5   Dan Williams   mm, dax: dax-pmd ...
2898
  	if (pmd_trans_huge(*pmd)) {
5f7377147   Kirill A. Shutemov   thp: fix deadlock...
2899
  		struct page *page = pmd_page(*pmd);
5c7fb56e5   Dan Williams   mm, dax: dax-pmd ...
2900
  		if (PageMlocked(page))
5f7377147   Kirill A. Shutemov   thp: fix deadlock...
2901
  			clear_page_mlock(page);
5c7fb56e5   Dan Williams   mm, dax: dax-pmd ...
2902
  	} else if (!pmd_devmap(*pmd))
e90309c9f   Kirill A. Shutemov   thp: allow mlocke...
2903
  		goto out;
fec89c109   Kirill A. Shutemov   thp: rewrite free...
2904
  	__split_huge_pmd_locked(vma, pmd, haddr, freeze);
e90309c9f   Kirill A. Shutemov   thp: allow mlocke...
2905
  out:
eef1b3ba0   Kirill A. Shutemov   thp: implement sp...
2906
2907
2908
  	spin_unlock(ptl);
  	mmu_notifier_invalidate_range_end(mm, haddr, haddr + HPAGE_PMD_SIZE);
  }
fec89c109   Kirill A. Shutemov   thp: rewrite free...
2909
2910
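  /*
   * Descriptive note (summarising the code below): split the huge pmd
   * covering @address, if there is one. The walk bails out quietly if no
   * present huge (or devmap) pmd is found, or if @page no longer matches
   * the pmd when setting up migration entries (@freeze).
   */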
  void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address,
  		bool freeze, struct page *page)
94fcc585f   Andrea Arcangeli   thp: avoid breaki...
2911
  {
f72e7dcdd   Hugh Dickins   mm: let mm_find_p...
2912
2913
  	pgd_t *pgd;
  	pud_t *pud;
94fcc585f   Andrea Arcangeli   thp: avoid breaki...
2914
  	pmd_t *pmd;
78ddc5347   Kirill A. Shutemov   thp: rename split...
2915
  	pgd = pgd_offset(vma->vm_mm, address);
f72e7dcdd   Hugh Dickins   mm: let mm_find_p...
2916
2917
2918
2919
2920
2921
2922
2923
  	if (!pgd_present(*pgd))
  		return;
  
  	pud = pud_offset(pgd, address);
  	if (!pud_present(*pud))
  		return;
  
  	pmd = pmd_offset(pud, address);
5c7fb56e5   Dan Williams   mm, dax: dax-pmd ...
2924
  	if (!pmd_present(*pmd) || (!pmd_trans_huge(*pmd) && !pmd_devmap(*pmd)))
94fcc585f   Andrea Arcangeli   thp: avoid breaki...
2925
  		return;
fec89c109   Kirill A. Shutemov   thp: rewrite free...
2926
2927
2928
2929
2930
2931
2932
2933
  
  	/*
  	 * If the caller asks us to set up migration entries, we need a page
  	 * to check the pmd against. Otherwise we can end up replacing the
  	 * wrong page.
  	 */
  	VM_BUG_ON(freeze && !page);
  	if (page && page != pmd_page(*pmd))
  		return;
94fcc585f   Andrea Arcangeli   thp: avoid breaki...
2934
2935
2936
2937
  	/*
  	 * Caller holds the mmap_sem write mode, so a huge pmd cannot
  	 * materialize from under us.
  	 */
fec89c109   Kirill A. Shutemov   thp: rewrite free...
2938
  	__split_huge_pmd(vma, pmd, address, freeze);
94fcc585f   Andrea Arcangeli   thp: avoid breaki...
2939
  }
e1b9996b8   Kirill A. Shutemov   thp: vma_adjust_t...
2940
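  /*
   * Descriptive note (summarising the code below): called while a vma's
   * boundaries are being adjusted - if the new start, the new end, or the
   * following vma's new start lands in the middle of a huge pmd range,
   * split that pmd first so no huge page straddles the new boundary.
   */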
  void vma_adjust_trans_huge(struct vm_area_struct *vma,
94fcc585f   Andrea Arcangeli   thp: avoid breaki...
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
  			     unsigned long start,
  			     unsigned long end,
  			     long adjust_next)
  {
  	/*
  	 * If the new start address isn't hpage aligned and it could
  	 * previously contain a hugepage: check if we need to split
  	 * a huge pmd.
  	 */
  	if (start & ~HPAGE_PMD_MASK &&
  	    (start & HPAGE_PMD_MASK) >= vma->vm_start &&
  	    (start & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= vma->vm_end)
fec89c109   Kirill A. Shutemov   thp: rewrite free...
2953
  		split_huge_pmd_address(vma, start, false, NULL);
94fcc585f   Andrea Arcangeli   thp: avoid breaki...
2954
2955
2956
2957
2958
2959
2960
2961
2962
  
  	/*
  	 * If the new end address isn't hpage aligned and it could
  	 * previously contain a hugepage: check if we need to split
  	 * a huge pmd.
  	 */
  	if (end & ~HPAGE_PMD_MASK &&
  	    (end & HPAGE_PMD_MASK) >= vma->vm_start &&
  	    (end & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= vma->vm_end)
fec89c109   Kirill A. Shutemov   thp: rewrite free...
2963
  		split_huge_pmd_address(vma, end, false, NULL);
94fcc585f   Andrea Arcangeli   thp: avoid breaki...
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
  
  	/*
  	 * If we're also updating the vma->vm_next->vm_start, if the new
  	 * vm_next->vm_start isn't page aligned and it could previously
  	 * contain a hugepage: check if we need to split a huge pmd.
  	 */
  	if (adjust_next > 0) {
  		struct vm_area_struct *next = vma->vm_next;
  		unsigned long nstart = next->vm_start;
  		nstart += adjust_next << PAGE_SHIFT;
  		if (nstart & ~HPAGE_PMD_MASK &&
  		    (nstart & HPAGE_PMD_MASK) >= next->vm_start &&
  		    (nstart & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= next->vm_end)
fec89c109   Kirill A. Shutemov   thp: rewrite free...
2977
  			split_huge_pmd_address(next, nstart, false, NULL);
94fcc585f   Andrea Arcangeli   thp: avoid breaki...
2978
2979
  	}
  }
e9b61f198   Kirill A. Shutemov   thp: reintroduce ...
2980

fec89c109   Kirill A. Shutemov   thp: rewrite free...
2981
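  /*
   * Descriptive note (summarising the code below): unmap a huge page before
   * splitting it. The first try_to_unmap() call also splits the pmd
   * (TTU_SPLIT_HUGE_PMD); the tail pages are then unmapped one by one,
   * leaving migration entries behind so unfreeze_page() can restore the
   * mappings after the split.
   */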
  static void freeze_page(struct page *page)
e9b61f198   Kirill A. Shutemov   thp: reintroduce ...
2982
  {
fec89c109   Kirill A. Shutemov   thp: rewrite free...
2983
2984
2985
  	enum ttu_flags ttu_flags = TTU_MIGRATION | TTU_IGNORE_MLOCK |
  		TTU_IGNORE_ACCESS | TTU_RMAP_LOCKED;
  	int i, ret;
e9b61f198   Kirill A. Shutemov   thp: reintroduce ...
2986
2987
  
  	VM_BUG_ON_PAGE(!PageHead(page), page);
fec89c109   Kirill A. Shutemov   thp: rewrite free...
2988
2989
2990
2991
2992
2993
  	/* We only need TTU_SPLIT_HUGE_PMD once */
  	ret = try_to_unmap(page, ttu_flags | TTU_SPLIT_HUGE_PMD);
  	for (i = 1; !ret && i < HPAGE_PMD_NR; i++) {
  		/* Cut short if the page is unmapped */
  		if (page_count(page) == 1)
  			return;
e9b61f198   Kirill A. Shutemov   thp: reintroduce ...
2994

fec89c109   Kirill A. Shutemov   thp: rewrite free...
2995
  		ret = try_to_unmap(page + i, ttu_flags);
e9b61f198   Kirill A. Shutemov   thp: reintroduce ...
2996
  	}
fec89c109   Kirill A. Shutemov   thp: rewrite free...
2997
  	VM_BUG_ON(ret);
e9b61f198   Kirill A. Shutemov   thp: reintroduce ...
2998
  }
fec89c109   Kirill A. Shutemov   thp: rewrite free...
2999
  static void unfreeze_page(struct page *page)
e9b61f198   Kirill A. Shutemov   thp: reintroduce ...
3000
  {
fec89c109   Kirill A. Shutemov   thp: rewrite free...
3001
  	int i;
e9b61f198   Kirill A. Shutemov   thp: reintroduce ...
3002

fec89c109   Kirill A. Shutemov   thp: rewrite free...
3003
3004
  	for (i = 0; i < HPAGE_PMD_NR; i++)
  		remove_migration_ptes(page + i, page + i, true);
e9b61f198   Kirill A. Shutemov   thp: reintroduce ...
3005
  }
8df651c70   Kirill A. Shutemov   thp: cleanup spli...
3006
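  /*
   * Descriptive note (summarising the code below): turn one tail page of
   * the compound page into an independent page - take a reference for it,
   * copy the relevant flags, ->mapping and ->index from the head, clear its
   * compound_head link and put it on the LRU (or on the caller's list).
   */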
  static void __split_huge_page_tail(struct page *head, int tail,
e9b61f198   Kirill A. Shutemov   thp: reintroduce ...
3007
3008
  		struct lruvec *lruvec, struct list_head *list)
  {
e9b61f198   Kirill A. Shutemov   thp: reintroduce ...
3009
  	struct page *page_tail = head + tail;
8df651c70   Kirill A. Shutemov   thp: cleanup spli...
3010
  	VM_BUG_ON_PAGE(atomic_read(&page_tail->_mapcount) != -1, page_tail);
fe896d187   Joonsoo Kim   mm: introduce pag...
3011
  	VM_BUG_ON_PAGE(page_ref_count(page_tail) != 0, page_tail);
e9b61f198   Kirill A. Shutemov   thp: reintroduce ...
3012
3013
  
  	/*
0139aa7b7   Joonsoo Kim   mm: rename _count...
3014
  	 * tail_page->_refcount is zero and not changing from under us. But
e9b61f198   Kirill A. Shutemov   thp: reintroduce ...
3015
  	 * get_page_unless_zero() may be running from under us on the
8df651c70   Kirill A. Shutemov   thp: cleanup spli...
3016
  	 * tail_page. If we used atomic_set() below instead of atomic_inc(), we
e9b61f198   Kirill A. Shutemov   thp: reintroduce ...
3017
3018
3019
3020
3021
  	 * would then run atomic_set() concurrently with
  	 * get_page_unless_zero(), and atomic_set() is implemented in C not
  	 * using locked ops. spin_unlock on x86 sometimes uses locked ops
  	 * because of PPro errata 66, 92, so unless somebody can guarantee
  	 * atomic_set() here would be safe on all archs (and not only on x86),
8df651c70   Kirill A. Shutemov   thp: cleanup spli...
3022
  	 * it's safer to use atomic_inc().
e9b61f198   Kirill A. Shutemov   thp: reintroduce ...
3023
  	 */
fe896d187   Joonsoo Kim   mm: introduce pag...
3024
  	page_ref_inc(page_tail);
e9b61f198   Kirill A. Shutemov   thp: reintroduce ...
3025
3026
3027
3028
3029
3030
3031
3032
3033
  
  	page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
  	page_tail->flags |= (head->flags &
  			((1L << PG_referenced) |
  			 (1L << PG_swapbacked) |
  			 (1L << PG_mlocked) |
  			 (1L << PG_uptodate) |
  			 (1L << PG_active) |
  			 (1L << PG_locked) |
b8d3c4c30   Minchan Kim   mm/huge_memory.c:...
3034
3035
  			 (1L << PG_unevictable) |
  			 (1L << PG_dirty)));
e9b61f198   Kirill A. Shutemov   thp: reintroduce ...
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
  
  	/*
  	 * After clearing PageTail the gup refcount can be released.
  	 * Page flags also must be visible before we make the page non-compound.
  	 */
  	smp_wmb();
  
  	clear_compound_head(page_tail);
  
  	if (page_is_young(head))
  		set_page_young(page_tail);
  	if (page_is_idle(head))
  		set_page_idle(page_tail);
  
  	/* ->mapping in first tail page is compound_mapcount */
9a982250f   Kirill A. Shutemov   thp: introduce de...
3051
  	VM_BUG_ON_PAGE(tail > 2 && page_tail->mapping != TAIL_MAPPING,
e9b61f198   Kirill A. Shutemov   thp: reintroduce ...
3052
3053
3054
3055
3056
3057
  			page_tail);
  	page_tail->mapping = head->mapping;
  
  	page_tail->index = head->index + tail;
  	page_cpupid_xchg_last(page_tail, page_cpupid_last(head));
  	lru_add_page_tail(head, page_tail, lruvec, list);
e9b61f198   Kirill A. Shutemov   thp: reintroduce ...
3058
3059
3060
3061
3062
3063
3064
  }
  
  static void __split_huge_page(struct page *page, struct list_head *list)
  {
  	struct page *head = compound_head(page);
  	struct zone *zone = page_zone(head);
  	struct lruvec *lruvec;
8df651c70   Kirill A. Shutemov   thp: cleanup spli...
3065
  	int i;
e9b61f198   Kirill A. Shutemov   thp: reintroduce ...
3066
3067
3068
3069
3070
3071
3072
  
  	/* prevent PageLRU to go away from under us, and freeze lru stats */
  	spin_lock_irq(&zone->lru_lock);
  	lruvec = mem_cgroup_page_lruvec(head, zone);
  
  	/* complete memcg works before add pages to LRU */
  	mem_cgroup_split_huge_fixup(head);
e9b61f198   Kirill A. Shutemov   thp: reintroduce ...
3073
  	for (i = HPAGE_PMD_NR - 1; i >= 1; i--)
8df651c70   Kirill A. Shutemov   thp: cleanup spli...
3074
  		__split_huge_page_tail(head, i, lruvec, list);
e9b61f198   Kirill A. Shutemov   thp: reintroduce ...
3075
3076
3077
  
  	ClearPageCompound(head);
  	spin_unlock_irq(&zone->lru_lock);
fec89c109   Kirill A. Shutemov   thp: rewrite free...
3078
  	unfreeze_page(head);
e9b61f198   Kirill A. Shutemov   thp: reintroduce ...
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
  
  	for (i = 0; i < HPAGE_PMD_NR; i++) {
  		struct page *subpage = head + i;
  		if (subpage == page)
  			continue;
  		unlock_page(subpage);
  
  		/*
  		 * Subpages may be freed if there wasn't any mapping,
  		 * for example if add_to_swap() is running on an lru page that
  		 * had its mapping zapped. And freeing these pages
  		 * requires taking the lru_lock so we do the put_page
  		 * of the tail pages after the split is complete.
  		 */
  		put_page(subpage);
  	}
  }
b20ce5e03   Kirill A. Shutemov   mm: prepare page_...
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
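  /*
   * Descriptive note (summarising the code below): total number of mappings
   * of a page. For a THP this is the compound mapcount plus the per-subpage
   * pte mapcounts, with the double-map bias removed when the page is mapped
   * both by pmd and by ptes.
   */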
  int total_mapcount(struct page *page)
  {
  	int i, ret;
  
  	VM_BUG_ON_PAGE(PageTail(page), page);
  
  	if (likely(!PageCompound(page)))
  		return atomic_read(&page->_mapcount) + 1;
  
  	ret = compound_mapcount(page);
  	if (PageHuge(page))
  		return ret;
  	for (i = 0; i < HPAGE_PMD_NR; i++)
  		ret += atomic_read(&page[i]._mapcount) + 1;
  	if (PageDoubleMap(page))
  		ret -= HPAGE_PMD_NR;
  	return ret;
  }
e9b61f198   Kirill A. Shutemov   thp: reintroduce ...
3114
  /*
6d0a07edd   Andrea Arcangeli   mm: thp: calculat...
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
   * This calculates accurately how many mappings a transparent hugepage
   * has (unlike page_mapcount() which isn't fully accurate). This full
   * accuracy is primarily needed to know if copy-on-write faults can
   * reuse the page and change the mapping to read-write instead of
   * copying them. At the same time this returns the total_mapcount too.
   *
   * The function returns the highest mapcount any one of the subpages
   * has. If the return value is one, even if different processes are
   * mapping different subpages of the transparent hugepage, they can
   * all reuse it, because each process is reusing a different subpage.
   *
   * The total_mapcount is instead counting all virtual mappings of the
   * subpages. If the total_mapcount is equal to "one", it tells the
   * caller all mappings belong to the same "mm" and in turn the
   * anon_vma of the transparent hugepage can become the vma->anon_vma
   * local one as no other process may be mapping any of the subpages.
   *
   * It would be more accurate to replace page_mapcount() with
   * page_trans_huge_mapcount(), however we only use
   * page_trans_huge_mapcount() in the copy-on-write faults where we
   * need full accuracy to avoid breaking page pinning, because
   * page_trans_huge_mapcount() is slower than page_mapcount().
   */
  int page_trans_huge_mapcount(struct page *page, int *total_mapcount)
  {
  	int i, ret, _total_mapcount, mapcount;
  
  	/* hugetlbfs shouldn't call it */
  	VM_BUG_ON_PAGE(PageHuge(page), page);
  
  	if (likely(!PageTransCompound(page))) {
  		mapcount = atomic_read(&page->_mapcount) + 1;
  		if (total_mapcount)
  			*total_mapcount = mapcount;
  		return mapcount;
  	}
  
  	page = compound_head(page);
  
  	_total_mapcount = ret = 0;
  	for (i = 0; i < HPAGE_PMD_NR; i++) {
  		mapcount = atomic_read(&page[i]._mapcount) + 1;
  		ret = max(ret, mapcount);
  		_total_mapcount += mapcount;
  	}
  	if (PageDoubleMap(page)) {
  		ret -= 1;
  		_total_mapcount -= HPAGE_PMD_NR;
  	}
  	mapcount = compound_mapcount(page);
  	ret += mapcount;
  	_total_mapcount += mapcount;
  	if (total_mapcount)
  		*total_mapcount = _total_mapcount;
  	return ret;
  }
  
  /*
e9b61f198   Kirill A. Shutemov   thp: reintroduce ...
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
   * This function splits a huge page into normal pages. @page can point to
   * any subpage of the huge page to split. The split does not change the
   * position of @page.
   *
   * Only the caller may hold a pin on @page, otherwise the split fails with
   * -EBUSY. The huge page must be locked.
   *
   * If @list is null, tail pages will be added to the LRU list, otherwise
   * to @list.
   *
   * Both the head page and the tail pages will inherit the mapping, flags,
   * and so on from the hugepage.
   *
   * The GUP pin and PG_locked are transferred to @page. The remaining
   * subpages can be freed if they are not mapped.
   *
   * Returns 0 if the hugepage was split successfully.
   * Returns -EBUSY if the page is pinned or if the anon_vma disappeared
   * from under us.
   */
  int split_huge_page_to_list(struct page *page, struct list_head *list)
  {
  	struct page *head = compound_head(page);
a3d0a9185   Kirill A. Shutemov   thp: make split_q...
3194
  	struct pglist_data *pgdata = NODE_DATA(page_to_nid(head));
e9b61f198   Kirill A. Shutemov   thp: reintroduce ...
3195
3196
  	struct anon_vma *anon_vma;
  	int count, mapcount, ret;
d96543223   Kirill A. Shutemov   thp: increase spl...
3197
  	bool mlocked;
0b9b6fff7   Kirill A. Shutemov   thp: fix interrup...
3198
  	unsigned long flags;
e9b61f198   Kirill A. Shutemov   thp: reintroduce ...
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
  
  	VM_BUG_ON_PAGE(is_huge_zero_page(page), page);
  	VM_BUG_ON_PAGE(!PageAnon(page), page);
  	VM_BUG_ON_PAGE(!PageLocked(page), page);
  	VM_BUG_ON_PAGE(!PageSwapBacked(page), page);
  	VM_BUG_ON_PAGE(!PageCompound(page), page);
  
  	/*
  	 * The caller does not necessarily hold an mmap_sem that would prevent
  	 * the anon_vma disappearing so we first take a reference to it
  	 * and then lock the anon_vma for write. This is similar to
  	 * page_lock_anon_vma_read except the write lock is taken to serialise
  	 * against parallel split or collapse operations.
  	 */
  	anon_vma = page_get_anon_vma(head);
  	if (!anon_vma) {
  		ret = -EBUSY;
  		goto out;
  	}
  	anon_vma_lock_write(anon_vma);
  
  	/*
  	 * Racy check whether we can split the page, before freeze_page()
  	 * splits the PMDs
  	 */
  	if (total_mapcount(head) != page_count(head) - 1) {
  		ret = -EBUSY;
  		goto out_unlock;
  	}
  	mlocked = PageMlocked(page);
  	freeze_page(head);
  	VM_BUG_ON_PAGE(compound_mapcount(head), head);
  
  	/* Make sure the page is not on per-CPU pagevec as it takes pin */
  	if (mlocked)
  		lru_add_drain();
  
  	/* Prevent deferred_split_scan() touching ->_refcount */
  	spin_lock_irqsave(&pgdata->split_queue_lock, flags);
  	count = page_count(head);
  	mapcount = total_mapcount(head);
  	if (!mapcount && count == 1) {
  		if (!list_empty(page_deferred_list(head))) {
  			pgdata->split_queue_len--;
  			list_del(page_deferred_list(head));
  		}
  		spin_unlock_irqrestore(&pgdata->split_queue_lock, flags);
  		__split_huge_page(page, list);
  		ret = 0;
  	} else if (IS_ENABLED(CONFIG_DEBUG_VM) && mapcount) {
  		spin_unlock_irqrestore(&pgdata->split_queue_lock, flags);
  		pr_alert("total_mapcount: %u, page_count(): %u\n",
  				mapcount, count);
  		if (PageTail(page))
  			dump_page(head, NULL);
  		dump_page(page, "total_mapcount(head) > 0");
  		BUG();
  	} else {
  		spin_unlock_irqrestore(&pgdata->split_queue_lock, flags);
  		unfreeze_page(head);
  		ret = -EBUSY;
  	}
  
  out_unlock:
  	anon_vma_unlock_write(anon_vma);
  	put_anon_vma(anon_vma);
  out:
  	count_vm_event(!ret ? THP_SPLIT_PAGE : THP_SPLIT_PAGE_FAILED);
  	return ret;
  }
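  
  /*
   * Usage sketch (not part of the original file): callers in this file, such
   * as deferred_split_scan() and the debugfs knob below, follow roughly this
   * pattern. split_huge_page() is assumed to be the list == NULL convenience
   * wrapper around split_huge_page_to_list() from <linux/huge_mm.h>.
   *
   *	if (get_page_unless_zero(page)) {
   *		lock_page(page);
   *		ret = split_huge_page(page);
   *		unlock_page(page);
   *		put_page(page);
   *	}
   *
   * On success (ret == 0) the former head and tail pages are ordinary pages
   * again, each carrying its own reference count.
   */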
  
  void free_transhuge_page(struct page *page)
  {
  	struct pglist_data *pgdata = NODE_DATA(page_to_nid(page));
  	unsigned long flags;
  
  	spin_lock_irqsave(&pgdata->split_queue_lock, flags);
  	if (!list_empty(page_deferred_list(page))) {
  		pgdata->split_queue_len--;
  		list_del(page_deferred_list(page));
  	}
  	spin_unlock_irqrestore(&pgdata->split_queue_lock, flags);
  	free_compound_page(page);
  }
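  
  /*
   * Sketch of how this destructor is reached (the wiring lives outside this
   * hunk and is an assumption here): prep_transhuge_page() is expected to set
   * the compound destructor to TRANSHUGE_PAGE_DTOR, so the final put_page()
   * on the head page dispatches here through the compound destructor table:
   *
   *	prep_transhuge_page(page);	// set_compound_page_dtor(page,
   *					//	TRANSHUGE_PAGE_DTOR)
   *	...
   *	put_page(page);			// last ref -> free_transhuge_page()
   */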
  
  void deferred_split_huge_page(struct page *page)
  {
  	struct pglist_data *pgdata = NODE_DATA(page_to_nid(page));
  	unsigned long flags;
  
  	VM_BUG_ON_PAGE(!PageTransHuge(page), page);
  
  	spin_lock_irqsave(&pgdata->split_queue_lock, flags);
  	if (list_empty(page_deferred_list(page))) {
  		count_vm_event(THP_DEFERRED_SPLIT_PAGE);
  		list_add_tail(page_deferred_list(page), &pgdata->split_queue);
  		pgdata->split_queue_len++;
  	}
  	spin_unlock_irqrestore(&pgdata->split_queue_lock, flags);
  }
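  
  /*
   * The deferred list itself is carried in the THP's second tail page; the
   * list_entry((void *)pos, struct page, mapping) in deferred_split_scan()
   * below relies on that layout. A minimal sketch of the helper, assuming the
   * definition used by this series in <linux/huge_mm.h>:
   *
   *	static inline struct list_head *page_deferred_list(struct page *page)
   *	{
   *		// ->lru of the tail pages is occupied by compound_head
   *		return (struct list_head *)&page[2].mapping;
   *	}
   */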
  
  static unsigned long deferred_split_count(struct shrinker *shrink,
  		struct shrink_control *sc)
  {
  	struct pglist_data *pgdata = NODE_DATA(sc->nid);
  	return ACCESS_ONCE(pgdata->split_queue_len);
  }
  
  static unsigned long deferred_split_scan(struct shrinker *shrink,
  		struct shrink_control *sc)
  {
  	struct pglist_data *pgdata = NODE_DATA(sc->nid);
  	unsigned long flags;
  	LIST_HEAD(list), *pos, *next;
  	struct page *page;
  	int split = 0;
  
  	spin_lock_irqsave(&pgdata->split_queue_lock, flags);
  	/* Take pin on all head pages to avoid freeing them under us */
  	list_for_each_safe(pos, next, &pgdata->split_queue) {
  		page = list_entry((void *)pos, struct page, mapping);
  		page = compound_head(page);
  		if (get_page_unless_zero(page)) {
  			list_move(page_deferred_list(page), &list);
  		} else {
  			/* We lost race with put_compound_page() */
  			list_del_init(page_deferred_list(page));
  			pgdata->split_queue_len--;
  		}
  		if (!--sc->nr_to_scan)
  			break;
  	}
  	spin_unlock_irqrestore(&pgdata->split_queue_lock, flags);
  
  	list_for_each_safe(pos, next, &list) {
  		page = list_entry((void *)pos, struct page, mapping);
  		lock_page(page);
  		/* split_huge_page() removes page from list on success */
  		if (!split_huge_page(page))
  			split++;
  		unlock_page(page);
  		put_page(page);
  	}
  	spin_lock_irqsave(&pgdata->split_queue_lock, flags);
  	list_splice_tail(&list, &pgdata->split_queue);
  	spin_unlock_irqrestore(&pgdata->split_queue_lock, flags);
  
  	/*
  	 * Stop shrinker if we didn't split any page, but the queue is empty.
  	 * This can happen if pages were freed under us.
  	 */
  	if (!split && list_empty(&pgdata->split_queue))
  		return SHRINK_STOP;
  	return split;
  }
  
  static struct shrinker deferred_split_shrinker = {
  	.count_objects = deferred_split_count,
  	.scan_objects = deferred_split_scan,
  	.seeks = DEFAULT_SEEKS,
  	.flags = SHRINKER_NUMA_AWARE,
  };
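  
  /*
   * Registration sketch (assumed to happen in hugepage_init() elsewhere in
   * this file): the shrinker only becomes active once it is registered, e.g.:
   *
   *	err = register_shrinker(&deferred_split_shrinker);
   *	if (err)
   *		goto err_split_shrinker;
   *
   * With SHRINKER_NUMA_AWARE set, deferred_split_count()/deferred_split_scan()
   * are invoked per node, with sc->nid selecting the pglist_data whose
   * split_queue is drained.
   */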
  
  #ifdef CONFIG_DEBUG_FS
  static int split_huge_pages_set(void *data, u64 val)
  {
  	struct zone *zone;
  	struct page *page;
  	unsigned long pfn, max_zone_pfn;
  	unsigned long total = 0, split = 0;
  
  	if (val != 1)
  		return -EINVAL;
  
  	for_each_populated_zone(zone) {
  		max_zone_pfn = zone_end_pfn(zone);
  		for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) {
  			if (!pfn_valid(pfn))
  				continue;
  
  			page = pfn_to_page(pfn);
  			if (!get_page_unless_zero(page))
  				continue;
  
  			if (zone != page_zone(page))
  				goto next;
  
  			if (!PageHead(page) || !PageAnon(page) ||
  					PageHuge(page))
  				goto next;
  
  			total++;
  			lock_page(page);
  			if (!split_huge_page(page))
  				split++;
  			unlock_page(page);
  next:
  			put_page(page);
  		}
  	}
  
  	pr_info("%lu of %lu THP split\n", split, total);
  
  	return 0;
  }
  DEFINE_SIMPLE_ATTRIBUTE(split_huge_pages_fops, NULL, split_huge_pages_set,
  		"%llu\n");
  
  static int __init split_huge_pages_debugfs(void)
  {
  	void *ret;
  
  	ret = debugfs_create_file("split_huge_pages", 0200, NULL, NULL,
  			&split_huge_pages_fops);
  	if (!ret)
  		pr_warn("Failed to create split_huge_pages in debugfs");
  	return 0;
  }
  late_initcall(split_huge_pages_debugfs);
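  
  /*
   * Usage note (the path assumes debugfs is mounted at /sys/kernel/debug):
   *
   *	# echo 1 > /sys/kernel/debug/split_huge_pages
   *
   * walks every populated zone and tries to split all anonymous THPs; any
   * value other than 1 is rejected with -EINVAL by split_huge_pages_set().
   */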
  #endif