  /*
   *  linux/mm/memory.c
   *
   *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
   */
  
  /*
   * demand-loading started 01.12.91 - seems it is high on the list of
   * things wanted, and it should be easy to implement. - Linus
   */
  
  /*
   * Ok, demand-loading was easy, shared pages a little bit tricker. Shared
   * pages started 02.12.91, seems to work. - Linus.
   *
   * Tested sharing by executing about 30 /bin/sh: under the old kernel it
   * would have taken more than the 6M I have free, but it worked well as
   * far as I could see.
   *
   * Also corrected some "invalidate()"s - I wasn't doing enough of them.
   */
  
  /*
   * Real VM (paging to/from disk) started 18.12.91. Much more work and
   * thought has to go into this. Oh, well..
   * 19.12.91  -  works, somewhat. Sometimes I get faults, don't know why.
   *		Found it. Everything seems to work now.
   * 20.12.91  -  Ok, making the swap-device changeable like the root.
   */
  
  /*
   * 05.04.94  -  Multi-page memory management added for v1.1.
   * 		Idea by Alex Bligh (alex@cconcepts.co.uk)
   *
   * 16.07.99  -  Support of BIGMEM added by Gerhard Wichert, Siemens AG
   *		(Gerhard.Wichert@pdb.siemens.de)
   *
   * Aug/Sep 2004 Changed to four level page tables (Andi Kleen)
   */
  
  #include <linux/kernel_stat.h>
  #include <linux/mm.h>
  #include <linux/hugetlb.h>
  #include <linux/mman.h>
  #include <linux/swap.h>
  #include <linux/highmem.h>
  #include <linux/pagemap.h>
  #include <linux/ksm.h>
  #include <linux/rmap.h>
  #include <linux/export.h>
  #include <linux/delayacct.h>
  #include <linux/init.h>
  #include <linux/pfn_t.h>
  #include <linux/writeback.h>
  #include <linux/memcontrol.h>
  #include <linux/mmu_notifier.h>
  #include <linux/kallsyms.h>
  #include <linux/swapops.h>
  #include <linux/elf.h>
  #include <linux/gfp.h>
  #include <linux/migrate.h>
  #include <linux/string.h>
  #include <linux/dma-debug.h>
  #include <linux/debugfs.h>
  #include <linux/userfaultfd_k.h>

  #include <asm/io.h>
  #include <asm/mmu_context.h>
  #include <asm/pgalloc.h>
  #include <asm/uaccess.h>
  #include <asm/tlb.h>
  #include <asm/tlbflush.h>
  #include <asm/pgtable.h>

  #include "internal.h"

  #ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS
  #warning Unfortunate NUMA and NUMA Balancing config, growing page-frame for last_cpupid.
  #endif

  #ifndef CONFIG_NEED_MULTIPLE_NODES
  /* use the per-pgdat data instead for discontigmem - mbligh */
  unsigned long max_mapnr;
  struct page *mem_map;

  EXPORT_SYMBOL(max_mapnr);
  EXPORT_SYMBOL(mem_map);
  #endif

  /*
   * A number of key systems in x86 including ioremap() rely on the assumption
   * that high_memory defines the upper bound on direct map memory, the end
   * of ZONE_NORMAL.  Under CONFIG_DISCONTIG this means that max_low_pfn and
   * highstart_pfn must be the same; there must be no gap between ZONE_NORMAL
   * and ZONE_HIGHMEM.
   */
  void * high_memory;

  EXPORT_SYMBOL(high_memory);

  /*
   * Randomize the address space (stacks, mmaps, brk, etc.).
   *
   * ( When CONFIG_COMPAT_BRK=y we exclude brk from randomization,
   *   as ancient (libc5 based) binaries can segfault. )
   */
  int randomize_va_space __read_mostly =
  #ifdef CONFIG_COMPAT_BRK
  					1;
  #else
  					2;
  #endif

  static int __init disable_randmaps(char *s)
  {
  	randomize_va_space = 0;
  	return 1;
  }
  __setup("norandmaps", disable_randmaps);
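
  /*
   * Usage sketch (illustrative): the value above backs
   * /proc/sys/kernel/randomize_va_space, and "norandmaps" on the boot command
   * line forces it to 0.  Most consumers only check whether randomization is
   * enabled at all, roughly along the lines of what fs/exec.c does when
   * setting up a new image:
   *
   *	if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space)
   *		current->flags |= PF_RANDOMIZE;
   */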
  unsigned long zero_pfn __read_mostly;
  unsigned long highest_memmap_pfn __read_mostly;

  EXPORT_SYMBOL(zero_pfn);

  /*
   * CONFIG_MMU architectures set up ZERO_PAGE in their paging_init()
   */
  static int __init init_zero_pfn(void)
  {
  	zero_pfn = page_to_pfn(ZERO_PAGE(0));
  	return 0;
  }
  core_initcall(init_zero_pfn);
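
  /*
   * Illustrative note: zero_pfn is the page frame number of the shared
   * read-only zero page.  Code that classifies ptes compares against it via
   * is_zero_pfn(), e.g. the pattern vm_normal_page() uses further down:
   *
   *	if (is_zero_pfn(pte_pfn(pte)))
   *		return NULL;	 the zero page has no interesting struct page
   */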

  #if defined(SPLIT_RSS_COUNTING)
  void sync_mm_rss(struct mm_struct *mm)
  {
  	int i;

  	for (i = 0; i < NR_MM_COUNTERS; i++) {
  		if (current->rss_stat.count[i]) {
  			add_mm_counter(mm, i, current->rss_stat.count[i]);
  			current->rss_stat.count[i] = 0;
  		}
  	}
  	current->rss_stat.events = 0;
  }
  
  static void add_mm_counter_fast(struct mm_struct *mm, int member, int val)
  {
  	struct task_struct *task = current;
  
  	if (likely(task->mm == mm))
  		task->rss_stat.count[member] += val;
  	else
  		add_mm_counter(mm, member, val);
  }
  #define inc_mm_counter_fast(mm, member) add_mm_counter_fast(mm, member, 1)
  #define dec_mm_counter_fast(mm, member) add_mm_counter_fast(mm, member, -1)
  
  /* sync counter once per 64 page faults */
  #define TASK_RSS_EVENTS_THRESH	(64)
  static void check_sync_rss_stat(struct task_struct *task)
  {
  	if (unlikely(task != current))
  		return;
  	if (unlikely(task->rss_stat.events++ > TASK_RSS_EVENTS_THRESH))
  		sync_mm_rss(task->mm);
  }
  #else /* SPLIT_RSS_COUNTING */

  #define inc_mm_counter_fast(mm, member) inc_mm_counter(mm, member)
  #define dec_mm_counter_fast(mm, member) dec_mm_counter(mm, member)

  static void check_sync_rss_stat(struct task_struct *task)
  {
  }
  #endif /* SPLIT_RSS_COUNTING */
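
  /*
   * Sketch of how the split RSS counters are meant to be used (hedged; the
   * real call sites are the fault handlers later in this file): hot paths
   * bump the cheap per-task counter and only occasionally fold it back:
   *
   *	inc_mm_counter_fast(mm, MM_ANONPAGES);
   *	check_sync_rss_stat(current);
   *
   * check_sync_rss_stat() calls sync_mm_rss() once every
   * TASK_RSS_EVENTS_THRESH events, so readers of the mm-wide counters may
   * see values that lag behind by a few pages.
   */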
  
  #ifdef HAVE_GENERIC_MMU_GATHER
  static bool tlb_next_batch(struct mmu_gather *tlb)
  {
  	struct mmu_gather_batch *batch;

  	batch = tlb->active;
  	if (batch->next) {
  		tlb->active = batch->next;
  		return true;
  	}
  	if (tlb->batch_count == MAX_GATHER_BATCH_COUNT)
  		return false;

  	batch = (void *)__get_free_pages(GFP_NOWAIT | __GFP_NOWARN, 0);
  	if (!batch)
  		return false;

  	tlb->batch_count++;
  	batch->next = NULL;
  	batch->nr   = 0;
  	batch->max  = MAX_GATHER_BATCH;

  	tlb->active->next = batch;
  	tlb->active = batch;
  	return true;
  }
  
  /* tlb_gather_mmu
   *	Called to initialize an (on-stack) mmu_gather structure for page-table
   *	tear-down from @mm. The @fullmm argument is used when @mm is without
   *	users and we're going to destroy the full address space (exit/execve).
   */
  void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, unsigned long start, unsigned long end)
  {
  	tlb->mm = mm;

  	/* Is it from 0 to ~0? */
  	tlb->fullmm     = !(start | (end+1));
  	tlb->need_flush_all = 0;
  	tlb->local.next = NULL;
  	tlb->local.nr   = 0;
  	tlb->local.max  = ARRAY_SIZE(tlb->__pages);
  	tlb->active     = &tlb->local;
  	tlb->batch_count = 0;

  #ifdef CONFIG_HAVE_RCU_TABLE_FREE
  	tlb->batch = NULL;
  #endif

  	__tlb_reset_range(tlb);
  }

  static void tlb_flush_mmu_tlbonly(struct mmu_gather *tlb)
  {
  	if (!tlb->end)
  		return;

  	tlb_flush(tlb);
  	mmu_notifier_invalidate_range(tlb->mm, tlb->start, tlb->end);
  #ifdef CONFIG_HAVE_RCU_TABLE_FREE
  	tlb_table_flush(tlb);
  #endif
  	__tlb_reset_range(tlb);
  }

  static void tlb_flush_mmu_free(struct mmu_gather *tlb)
  {
  	struct mmu_gather_batch *batch;

  	for (batch = &tlb->local; batch && batch->nr; batch = batch->next) {
  		free_pages_and_swap_cache(batch->pages, batch->nr);
  		batch->nr = 0;
  	}
  	tlb->active = &tlb->local;
  }

  void tlb_flush_mmu(struct mmu_gather *tlb)
  {
  	tlb_flush_mmu_tlbonly(tlb);
  	tlb_flush_mmu_free(tlb);
  }
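
  /*
   * Why the flush is split (a sketch based on the callers below):
   * zap_pte_range() wants to invalidate the hardware TLB while it still holds
   * the page table lock, but defer freeing the batched pages until after the
   * lock is dropped:
   *
   *	tlb_flush_mmu_tlbonly(tlb);		 under the ptl
   *	pte_unmap_unlock(start_pte, ptl);
   *	tlb_flush_mmu_free(tlb);		 after the ptl is released
   *
   * tlb_flush_mmu() simply does both back to back for callers that don't care.
   */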

  /* tlb_finish_mmu
   *	Called at the end of the shootdown operation to free up any resources
   *	that were required.
   */
  void tlb_finish_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long end)
  {
  	struct mmu_gather_batch *batch, *next;
  
  	tlb_flush_mmu(tlb);
  
  	/* keep the page table cache within bounds */
  	check_pgt_cache();
  
  	for (batch = tlb->local.next; batch; batch = next) {
  		next = batch->next;
  		free_pages((unsigned long)batch, 0);
  	}
  	tlb->local.next = NULL;
  }
  
  /* __tlb_remove_page
   *	Must perform the equivalent to __free_pte(pte_get_and_clear(ptep)), while
   *	handling the additional races in SMP caused by other CPUs caching valid
   *	mappings in their TLBs. Returns the number of free page slots left.
   *	When out of page slots we must call tlb_flush_mmu().
   */
  int __tlb_remove_page(struct mmu_gather *tlb, struct page *page)
  {
  	struct mmu_gather_batch *batch;
  	VM_BUG_ON(!tlb->end);

  	batch = tlb->active;
  	batch->pages[batch->nr++] = page;
  	if (batch->nr == batch->max) {
  		if (!tlb_next_batch(tlb))
  			return 0;
  		batch = tlb->active;
  	}
  	VM_BUG_ON_PAGE(batch->nr > batch->max, page);

  	return batch->max - batch->nr;
  }

  #endif /* HAVE_GENERIC_MMU_GATHER */
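
  /*
   * Typical calling sequence for the generic mmu_gather above (a sketch; the
   * real callers are unmap_region() and exit_mmap() in mm/mmap.c, and details
   * vary between kernel versions):
   *
   *	struct mmu_gather tlb;
   *
   *	tlb_gather_mmu(&tlb, mm, start, end);
   *	unmap_vmas(&tlb, vma, start, end);
   *	free_pgtables(&tlb, vma, floor, ceiling);
   *	tlb_finish_mmu(&tlb, start, end);
   *
   * Pages queued with __tlb_remove_page() are freed in batches by
   * tlb_flush_mmu(), after the TLB entries pointing at them are gone.
   */
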
  #ifdef CONFIG_HAVE_RCU_TABLE_FREE
  
  /*
   * See the comment near struct mmu_table_batch.
   */
  
  static void tlb_remove_table_smp_sync(void *arg)
  {
  	/* Simply deliver the interrupt */
  }
  
  static void tlb_remove_table_one(void *table)
  {
  	/*
  	 * This isn't an RCU grace period and hence the page-tables cannot be
  	 * assumed to be actually RCU-freed.
  	 *
  	 * It is however sufficient for software page-table walkers that rely on
  	 * IRQ disabling. See the comment near struct mmu_table_batch.
  	 */
  	smp_call_function(tlb_remove_table_smp_sync, NULL, 1);
  	__tlb_remove_table(table);
  }
  
  static void tlb_remove_table_rcu(struct rcu_head *head)
  {
  	struct mmu_table_batch *batch;
  	int i;
  
  	batch = container_of(head, struct mmu_table_batch, rcu);
  
  	for (i = 0; i < batch->nr; i++)
  		__tlb_remove_table(batch->tables[i]);
  
  	free_page((unsigned long)batch);
  }
  
  void tlb_table_flush(struct mmu_gather *tlb)
  {
  	struct mmu_table_batch **batch = &tlb->batch;
  
  	if (*batch) {
  		call_rcu_sched(&(*batch)->rcu, tlb_remove_table_rcu);
  		*batch = NULL;
  	}
  }
  
  void tlb_remove_table(struct mmu_gather *tlb, void *table)
  {
  	struct mmu_table_batch **batch = &tlb->batch;

  	/*
  	 * When there's less than two users of this mm there cannot be a
  	 * concurrent page-table walk.
  	 */
  	if (atomic_read(&tlb->mm->mm_users) < 2) {
  		__tlb_remove_table(table);
  		return;
  	}
  
  	if (*batch == NULL) {
  		*batch = (struct mmu_table_batch *)__get_free_page(GFP_NOWAIT | __GFP_NOWARN);
  		if (*batch == NULL) {
  			tlb_remove_table_one(table);
  			return;
  		}
  		(*batch)->nr = 0;
  	}
  	(*batch)->tables[(*batch)->nr++] = table;
  	if ((*batch)->nr == MAX_TABLE_BATCH)
  		tlb_table_flush(tlb);
  }
  #endif /* CONFIG_HAVE_RCU_TABLE_FREE */

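  /*
   * Background for the CONFIG_HAVE_RCU_TABLE_FREE block above (a summary):
   * lockless walkers such as the fast GUP path traverse page tables with
   * interrupts disabled rather than under locks.  Freed tables are therefore
   * batched and released through call_rcu_sched(); when no batch can be
   * allocated, tlb_remove_table_one() falls back to a synchronous IPI
   * broadcast, and smp_call_function(..., 1) returns only after every other
   * CPU has taken the interrupt, i.e. after any irq-disabled walk has ended.
   */
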
  /*
   * Note: this doesn't free the actual pages themselves. That
   * has been handled earlier when unmapping all the memory regions.
   */
  static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd,
  			   unsigned long addr)
  {
  	pgtable_t token = pmd_pgtable(*pmd);
  	pmd_clear(pmd);
  	pte_free_tlb(tlb, token, addr);
  	atomic_long_dec(&tlb->mm->nr_ptes);
  }

  static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
  				unsigned long addr, unsigned long end,
  				unsigned long floor, unsigned long ceiling)
  {
  	pmd_t *pmd;
  	unsigned long next;
  	unsigned long start;

  	start = addr;
  	pmd = pmd_offset(pud, addr);
  	do {
  		next = pmd_addr_end(addr, end);
  		if (pmd_none_or_clear_bad(pmd))
  			continue;
  		free_pte_range(tlb, pmd, addr);
  	} while (pmd++, addr = next, addr != end);

  	start &= PUD_MASK;
  	if (start < floor)
  		return;
  	if (ceiling) {
  		ceiling &= PUD_MASK;
  		if (!ceiling)
  			return;
  	}
  	if (end - 1 > ceiling - 1)
  		return;

  	pmd = pmd_offset(pud, start);
  	pud_clear(pud);
  	pmd_free_tlb(tlb, pmd, start);
  	mm_dec_nr_pmds(tlb->mm);
  }

  static inline void free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
  				unsigned long addr, unsigned long end,
  				unsigned long floor, unsigned long ceiling)
  {
  	pud_t *pud;
  	unsigned long next;
  	unsigned long start;

  	start = addr;
  	pud = pud_offset(pgd, addr);
  	do {
  		next = pud_addr_end(addr, end);
  		if (pud_none_or_clear_bad(pud))
  			continue;
  		free_pmd_range(tlb, pud, addr, next, floor, ceiling);
  	} while (pud++, addr = next, addr != end);

  	start &= PGDIR_MASK;
  	if (start < floor)
  		return;
  	if (ceiling) {
  		ceiling &= PGDIR_MASK;
  		if (!ceiling)
  			return;
  	}
  	if (end - 1 > ceiling - 1)
  		return;

  	pud = pud_offset(pgd, start);
  	pgd_clear(pgd);
  	pud_free_tlb(tlb, pud, start);
  }

  /*
   * This function frees user-level page tables of a process.
   */
  void free_pgd_range(struct mmu_gather *tlb,
  			unsigned long addr, unsigned long end,
  			unsigned long floor, unsigned long ceiling)
  {
  	pgd_t *pgd;
  	unsigned long next;
  
  	/*
  	 * The next few lines have given us lots of grief...
  	 *
  	 * Why are we testing PMD* at this top level?  Because often
  	 * there will be no work to do at all, and we'd prefer not to
  	 * go all the way down to the bottom just to discover that.
  	 *
  	 * Why all these "- 1"s?  Because 0 represents both the bottom
  	 * of the address space and the top of it (using -1 for the
  	 * top wouldn't help much: the masks would do the wrong thing).
  	 * The rule is that addr 0 and floor 0 refer to the bottom of
  	 * the address space, but end 0 and ceiling 0 refer to the top
  	 * Comparisons need to use "end - 1" and "ceiling - 1" (though
  	 * that end 0 case should be mythical).
  	 *
  	 * Wherever addr is brought up or ceiling brought down, we must
  	 * be careful to reject "the opposite 0" before it confuses the
  	 * subsequent tests.  But what about where end is brought down
  	 * by PMD_SIZE below? no, end can't go down to 0 there.
  	 *
  	 * Whereas we round start (addr) and ceiling down, by different
  	 * masks at different levels, in order to test whether a table
  	 * now has no other vmas using it, so can be freed, we don't
  	 * bother to round floor or end up - the tests don't need that.
  	 */

  	addr &= PMD_MASK;
  	if (addr < floor) {
  		addr += PMD_SIZE;
  		if (!addr)
  			return;
  	}
  	if (ceiling) {
  		ceiling &= PMD_MASK;
  		if (!ceiling)
  			return;
  	}
  	if (end - 1 > ceiling - 1)
  		end -= PMD_SIZE;
  	if (addr > end - 1)
  		return;

  	pgd = pgd_offset(tlb->mm, addr);
  	do {
  		next = pgd_addr_end(addr, end);
  		if (pgd_none_or_clear_bad(pgd))
  			continue;
  		free_pud_range(tlb, pgd, addr, next, floor, ceiling);
  	} while (pgd++, addr = next, addr != end);
  }

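  /*
   * Worked example for the floor/ceiling clamping above (illustrative
   * numbers, assuming x86-64 with PMD_SIZE = 2MB): say the vma being torn
   * down spanned 0x00601000-0x00800000, its predecessor ended at 0x00400000
   * (floor) and its successor starts at 0x00a00000 (ceiling).  addr is
   * rounded down to 0x00600000, which is still >= floor, so the pte table
   * covering 0x00600000-0x00800000 can be freed; a table shared with either
   * neighbour would have been skipped by the floor/ceiling checks instead.
   */
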
  void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma,
  		unsigned long floor, unsigned long ceiling)
  {
  	while (vma) {
  		struct vm_area_struct *next = vma->vm_next;
  		unsigned long addr = vma->vm_start;

  		/*
  		 * Hide vma from rmap and truncate_pagecache before freeing
  		 * pgtables
  		 */
  		unlink_anon_vmas(vma);
  		unlink_file_vma(vma);

  		if (is_vm_hugetlb_page(vma)) {
  			hugetlb_free_pgd_range(tlb, addr, vma->vm_end,
  				floor, next? next->vm_start: ceiling);
  		} else {
  			/*
  			 * Optimization: gather nearby vmas into one call down
  			 */
  			while (next && next->vm_start <= vma->vm_end + PMD_SIZE
  			       && !is_vm_hugetlb_page(next)) {
  				vma = next;
  				next = vma->vm_next;
  				unlink_anon_vmas(vma);
  				unlink_file_vma(vma);
  			}
  			free_pgd_range(tlb, addr, vma->vm_end,
  				floor, next? next->vm_start: ceiling);
  		}
  		vma = next;
  	}
  }

  int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address)
  {
  	spinlock_t *ptl;
  	pgtable_t new = pte_alloc_one(mm, address);
  	if (!new)
  		return -ENOMEM;

  	/*
  	 * Ensure all pte setup (eg. pte page lock and page clearing) are
  	 * visible before the pte is made visible to other CPUs by being
  	 * put into page tables.
  	 *
  	 * The other side of the story is the pointer chasing in the page
  	 * table walking code (when walking the page table without locking;
  	 * ie. most of the time). Fortunately, these data accesses consist
  	 * of a chain of data-dependent loads, meaning most CPUs (alpha
  	 * being the notable exception) will already guarantee loads are
  	 * seen in-order. See the alpha page table accessors for the
  	 * smp_read_barrier_depends() barriers in page table walking code.
  	 */
  	smp_wmb(); /* Could be smp_wmb__xxx(before|after)_spin_lock */
  	ptl = pmd_lock(mm, pmd);
  	if (likely(pmd_none(*pmd))) {	/* Has another populated it ? */
  		atomic_long_inc(&mm->nr_ptes);
  		pmd_populate(mm, pmd, new);
  		new = NULL;
  	}
  	spin_unlock(ptl);
  	if (new)
  		pte_free(mm, new);
  	return 0;
  }

  int __pte_alloc_kernel(pmd_t *pmd, unsigned long address)
  {
  	pte_t *new = pte_alloc_one_kernel(&init_mm, address);
  	if (!new)
  		return -ENOMEM;

  	smp_wmb(); /* See comment in __pte_alloc */

  	spin_lock(&init_mm.page_table_lock);
  	if (likely(pmd_none(*pmd))) {	/* Has another populated it ? */
  		pmd_populate_kernel(&init_mm, pmd, new);
  		new = NULL;
  	}
  	spin_unlock(&init_mm.page_table_lock);
  	if (new)
  		pte_free_kernel(&init_mm, new);
  	return 0;
  }
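
  /*
   * Usage sketch (hedged; the authoritative definitions are the
   * pte_alloc_map()/pte_alloc_map_lock() macros in <linux/mm.h>): callers
   * normally reach __pte_alloc() indirectly, along the lines of
   *
   *	if (unlikely(pmd_none(*pmd)) && __pte_alloc(mm, pmd, address))
   *		return VM_FAULT_OOM;
   *	pte = pte_offset_map_lock(mm, pmd, address, &ptl);
   *
   * so a pte page is only allocated on the first fault that touches the pmd.
   */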

  static inline void init_rss_vec(int *rss)
  {
  	memset(rss, 0, sizeof(int) * NR_MM_COUNTERS);
  }

  static inline void add_mm_rss_vec(struct mm_struct *mm, int *rss)
  {
  	int i;

  	if (current->mm == mm)
  		sync_mm_rss(mm);
  	for (i = 0; i < NR_MM_COUNTERS; i++)
  		if (rss[i])
  			add_mm_counter(mm, i, rss[i]);
  }

  /*
   * This function is called to print an error when a bad pte
   * is found. For example, we might have a PFN-mapped pte in
   * a region that doesn't allow it.
   *
   * The calling function must still handle the error.
   */
  static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr,
  			  pte_t pte, struct page *page)
  {
  	pgd_t *pgd = pgd_offset(vma->vm_mm, addr);
  	pud_t *pud = pud_offset(pgd, addr);
  	pmd_t *pmd = pmd_offset(pud, addr);
  	struct address_space *mapping;
  	pgoff_t index;
  	static unsigned long resume;
  	static unsigned long nr_shown;
  	static unsigned long nr_unshown;
  
  	/*
  	 * Allow a burst of 60 reports, then keep quiet for that minute;
  	 * or allow a steady drip of one report per second.
  	 */
  	if (nr_shown == 60) {
  		if (time_before(jiffies, resume)) {
  			nr_unshown++;
  			return;
  		}
  		if (nr_unshown) {
  			pr_alert("BUG: Bad page map: %lu messages suppressed\n",
  				 nr_unshown);
  			nr_unshown = 0;
  		}
  		nr_shown = 0;
  	}
  	if (nr_shown++ == 0)
  		resume = jiffies + 60 * HZ;

  	mapping = vma->vm_file ? vma->vm_file->f_mapping : NULL;
  	index = linear_page_index(vma, addr);
  	pr_alert("BUG: Bad page map in process %s  pte:%08llx pmd:%08llx\n",
  		 current->comm,
  		 (long long)pte_val(pte), (long long)pmd_val(*pmd));
  	if (page)
  		dump_page(page, "bad pte");
  	pr_alert("addr:%p vm_flags:%08lx anon_vma:%p mapping:%p index:%lx\n",
  		 (void *)addr, vma->vm_flags, vma->anon_vma, mapping, index);
  	/*
  	 * Choose text because data symbols depend on CONFIG_KALLSYMS_ALL=y
  	 */
  	pr_alert("file:%pD fault:%pf mmap:%pf readpage:%pf\n",
  		 vma->vm_file,
  		 vma->vm_ops ? vma->vm_ops->fault : NULL,
  		 vma->vm_file ? vma->vm_file->f_op->mmap : NULL,
  		 mapping ? mapping->a_ops->readpage : NULL);
  	dump_stack();
  	add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
  }
  
  /*
   * vm_normal_page -- This function gets the "struct page" associated with a pte.
   *
   * "Special" mappings do not wish to be associated with a "struct page" (either
   * it doesn't exist, or it exists but they don't want to touch it). In this
   * case, NULL is returned here. "Normal" mappings do have a struct page.
   *
   * There are 2 broad cases. Firstly, an architecture may define a pte_special()
   * pte bit, in which case this function is trivial. Secondly, an architecture
   * may not have a spare pte bit, which requires a more complicated scheme,
   * described below.
   *
   * A raw VM_PFNMAP mapping (ie. one that is not COWed) is always considered a
   * special mapping (even if there are underlying and valid "struct pages").
   * COWed pages of a VM_PFNMAP are always normal.
   *
   * The way we recognize COWed pages within VM_PFNMAP mappings is through the
   * rules set up by "remap_pfn_range()": the vma will have the VM_PFNMAP bit
   * set, and the vm_pgoff will point to the first PFN mapped: thus every special
   * mapping will always honor the rule
   *
   *	pfn_of_page == vma->vm_pgoff + ((addr - vma->vm_start) >> PAGE_SHIFT)
   *
   * And for normal mappings this is false.
   *
   * This restricts such mappings to be a linear translation from virtual address
   * to pfn. To get around this restriction, we allow arbitrary mappings so long
   * as the vma is not a COW mapping; in that case, we know that all ptes are
   * special (because none can have been COWed).
   *
   *
   * In order to support COW of arbitrary special mappings, we have VM_MIXEDMAP.
   *
   * VM_MIXEDMAP mappings can likewise contain memory with or without "struct
   * page" backing, however the difference is that _all_ pages with a struct
   * page (that is, those where pfn_valid is true) are refcounted and considered
   * normal pages by the VM. The disadvantage is that pages are refcounted
   * (which can be slower and simply not an option for some PFNMAP users). The
   * advantage is that we don't have to follow the strict linearity rule of
   * PFNMAP mappings in order to support COWable mappings.
   *
   */
  #ifdef __HAVE_ARCH_PTE_SPECIAL
  # define HAVE_PTE_SPECIAL 1
  #else
  # define HAVE_PTE_SPECIAL 0
  #endif
  struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
  				pte_t pte)
  {
  	unsigned long pfn = pte_pfn(pte);

  	if (HAVE_PTE_SPECIAL) {
  		if (likely(!pte_special(pte)))
  			goto check_pfn;
  		if (vma->vm_ops && vma->vm_ops->find_special_page)
  			return vma->vm_ops->find_special_page(vma, addr);
  		if (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP))
  			return NULL;
  		if (!is_zero_pfn(pfn))
  			print_bad_pte(vma, addr, pte, NULL);
  		return NULL;
  	}

  	/* !HAVE_PTE_SPECIAL case follows: */

  	if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) {
  		if (vma->vm_flags & VM_MIXEDMAP) {
  			if (!pfn_valid(pfn))
  				return NULL;
  			goto out;
  		} else {
  			unsigned long off;
  			off = (addr - vma->vm_start) >> PAGE_SHIFT;
  			if (pfn == vma->vm_pgoff + off)
  				return NULL;
  			if (!is_cow_mapping(vma->vm_flags))
  				return NULL;
  		}
  	}

  	if (is_zero_pfn(pfn))
  		return NULL;
  check_pfn:
  	if (unlikely(pfn > highest_memmap_pfn)) {
  		print_bad_pte(vma, addr, pte, NULL);
  		return NULL;
  	}

  	/*
  	 * NOTE! We still have PageReserved() pages in the page tables.
  	 * eg. VDSO mappings can cause them to exist.
  	 */
  out:
  	return pfn_to_page(pfn);
  }
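
  /*
   * Caller's-eye summary of vm_normal_page() (a sketch): anonymous and
   * ordinary file-backed ptes yield their struct page; the zero page, raw
   * VM_PFNMAP mappings set up by remap_pfn_range(), and pte_special()
   * entries yield NULL.  Callers in this file follow the pattern
   *
   *	page = vm_normal_page(vma, addr, pte);
   *	if (page) {
   *		...refcount, rmap and rss accounting on a real page...
   *	}
   *
   * as copy_one_pte() and zap_pte_range() below do.
   */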
  
  /*
   * copy one vm_area from one task to the other. Assumes the page tables
   * already present in the new task to be cleared in the whole range
   * covered by this vma.
   */
  static inline unsigned long
  copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
  		pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *vma,
  		unsigned long addr, int *rss)
  {
  	unsigned long vm_flags = vma->vm_flags;
  	pte_t pte = *src_pte;
  	struct page *page;

  	/* pte contains position in swap or file, so copy. */
  	if (unlikely(!pte_present(pte))) {
  		swp_entry_t entry = pte_to_swp_entry(pte);
  
  		if (likely(!non_swap_entry(entry))) {
  			if (swap_duplicate(entry) < 0)
  				return entry.val;
  
  			/* make sure dst_mm is on swapoff's mmlist. */
  			if (unlikely(list_empty(&dst_mm->mmlist))) {
  				spin_lock(&mmlist_lock);
  				if (list_empty(&dst_mm->mmlist))
  					list_add(&dst_mm->mmlist,
  							&src_mm->mmlist);
  				spin_unlock(&mmlist_lock);
  			}
  			rss[MM_SWAPENTS]++;
  		} else if (is_migration_entry(entry)) {
  			page = migration_entry_to_page(entry);
  			rss[mm_counter(page)]++;
  
  			if (is_write_migration_entry(entry) &&
  					is_cow_mapping(vm_flags)) {
  				/*
  				 * COW mappings require pages in both
  				 * parent and child to be set to read.
  				 */
  				make_migration_entry_read(&entry);
  				pte = swp_entry_to_pte(entry);
  				if (pte_swp_soft_dirty(*src_pte))
  					pte = pte_swp_mksoft_dirty(pte);
  				set_pte_at(src_mm, addr, src_pte, pte);
  			}
  		}
  		goto out_set_pte;
  	}

  	/*
  	 * If it's a COW mapping, write protect it both
  	 * in the parent and the child
  	 */
  	if (is_cow_mapping(vm_flags)) {
  		ptep_set_wrprotect(src_mm, addr, src_pte);
  		pte = pte_wrprotect(pte);
  	}

  	/*
  	 * If it's a shared mapping, mark it clean in
  	 * the child
  	 */
  	if (vm_flags & VM_SHARED)
  		pte = pte_mkclean(pte);
  	pte = pte_mkold(pte);
  
  	page = vm_normal_page(vma, addr, pte);
  	if (page) {
  		get_page(page);
  		page_dup_rmap(page, false);
  		rss[mm_counter(page)]++;
  	}

  out_set_pte:
  	set_pte_at(dst_mm, addr, dst_pte, pte);
  	return 0;
  }
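
  /*
   * Note on the write protection done above: after fork, a private writable
   * mapping leaves both parent and child with read-only ptes pointing at the
   * same page (shared via get_page()/page_dup_rmap()).  The first write from
   * either side faults and is resolved by the copy-on-write handler
   * (do_wp_page(), later in this file), which gives the writer its own copy.
   */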

  static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
  		   pmd_t *dst_pmd, pmd_t *src_pmd, struct vm_area_struct *vma,
  		   unsigned long addr, unsigned long end)
  {
  	pte_t *orig_src_pte, *orig_dst_pte;
  	pte_t *src_pte, *dst_pte;
  	spinlock_t *src_ptl, *dst_ptl;
  	int progress = 0;
  	int rss[NR_MM_COUNTERS];
  	swp_entry_t entry = (swp_entry_t){0};

  again:
  	init_rss_vec(rss);
  	dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl);
  	if (!dst_pte)
  		return -ENOMEM;
  	src_pte = pte_offset_map(src_pmd, addr);
  	src_ptl = pte_lockptr(src_mm, src_pmd);
  	spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
  	orig_src_pte = src_pte;
  	orig_dst_pte = dst_pte;
  	arch_enter_lazy_mmu_mode();

  	do {
  		/*
  		 * We are holding two locks at this point - either of them
  		 * could generate latencies in another task on another CPU.
  		 */
  		if (progress >= 32) {
  			progress = 0;
  			if (need_resched() ||
  			    spin_needbreak(src_ptl) || spin_needbreak(dst_ptl))
  				break;
  		}
  		if (pte_none(*src_pte)) {
  			progress++;
  			continue;
  		}
  		entry.val = copy_one_pte(dst_mm, src_mm, dst_pte, src_pte,
  							vma, addr, rss);
  		if (entry.val)
  			break;
  		progress += 8;
  	} while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end);

  	arch_leave_lazy_mmu_mode();
  	spin_unlock(src_ptl);
  	pte_unmap(orig_src_pte);
  	add_mm_rss_vec(dst_mm, rss);
  	pte_unmap_unlock(orig_dst_pte, dst_ptl);
  	cond_resched();

  	if (entry.val) {
  		if (add_swap_count_continuation(entry, GFP_KERNEL) < 0)
  			return -ENOMEM;
  		progress = 0;
  	}
  	if (addr != end)
  		goto again;
  	return 0;
  }
  
  static inline int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
  		pud_t *dst_pud, pud_t *src_pud, struct vm_area_struct *vma,
  		unsigned long addr, unsigned long end)
  {
  	pmd_t *src_pmd, *dst_pmd;
  	unsigned long next;
  
  	dst_pmd = pmd_alloc(dst_mm, dst_pud, addr);
  	if (!dst_pmd)
  		return -ENOMEM;
  	src_pmd = pmd_offset(src_pud, addr);
  	do {
  		next = pmd_addr_end(addr, end);
  		if (pmd_trans_huge(*src_pmd) || pmd_devmap(*src_pmd)) {
  			int err;
  			VM_BUG_ON(next-addr != HPAGE_PMD_SIZE);
  			err = copy_huge_pmd(dst_mm, src_mm,
  					    dst_pmd, src_pmd, addr, vma);
  			if (err == -ENOMEM)
  				return -ENOMEM;
  			if (!err)
  				continue;
  			/* fall through */
  		}
  		if (pmd_none_or_clear_bad(src_pmd))
  			continue;
  		if (copy_pte_range(dst_mm, src_mm, dst_pmd, src_pmd,
  						vma, addr, next))
  			return -ENOMEM;
  	} while (dst_pmd++, src_pmd++, addr = next, addr != end);
  	return 0;
  }
  
  static inline int copy_pud_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
  		pgd_t *dst_pgd, pgd_t *src_pgd, struct vm_area_struct *vma,
  		unsigned long addr, unsigned long end)
  {
  	pud_t *src_pud, *dst_pud;
  	unsigned long next;
  
  	dst_pud = pud_alloc(dst_mm, dst_pgd, addr);
  	if (!dst_pud)
  		return -ENOMEM;
  	src_pud = pud_offset(src_pgd, addr);
  	do {
  		next = pud_addr_end(addr, end);
  		if (pud_none_or_clear_bad(src_pud))
  			continue;
  		if (copy_pmd_range(dst_mm, src_mm, dst_pud, src_pud,
  						vma, addr, next))
  			return -ENOMEM;
  	} while (dst_pud++, src_pud++, addr = next, addr != end);
  	return 0;
  }
  
  int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
  		struct vm_area_struct *vma)
  {
  	pgd_t *src_pgd, *dst_pgd;
  	unsigned long next;
  	unsigned long addr = vma->vm_start;
  	unsigned long end = vma->vm_end;
  	unsigned long mmun_start;	/* For mmu_notifiers */
  	unsigned long mmun_end;		/* For mmu_notifiers */
  	bool is_cow;
  	int ret;

  	/*
  	 * Don't copy ptes where a page fault will fill them correctly.
  	 * Fork becomes much lighter when there are big shared or private
  	 * readonly mappings. The tradeoff is that copy_page_range is more
  	 * efficient than faulting.
  	 */
  	if (!(vma->vm_flags & (VM_HUGETLB | VM_PFNMAP | VM_MIXEDMAP)) &&
  			!vma->anon_vma)
  		return 0;

  	if (is_vm_hugetlb_page(vma))
  		return copy_hugetlb_page_range(dst_mm, src_mm, vma);

  	if (unlikely(vma->vm_flags & VM_PFNMAP)) {
  		/*
  		 * We do not free on error cases below as remove_vma
  		 * gets called on error from higher level routine
  		 */
  		ret = track_pfn_copy(vma);
  		if (ret)
  			return ret;
  	}

  	/*
  	 * We need to invalidate the secondary MMU mappings only when
  	 * there could be a permission downgrade on the ptes of the
  	 * parent mm. And a permission downgrade will only happen if
  	 * is_cow_mapping() returns true.
  	 */
  	is_cow = is_cow_mapping(vma->vm_flags);
  	mmun_start = addr;
  	mmun_end   = end;
  	if (is_cow)
  		mmu_notifier_invalidate_range_start(src_mm, mmun_start,
  						    mmun_end);

  	ret = 0;
  	dst_pgd = pgd_offset(dst_mm, addr);
  	src_pgd = pgd_offset(src_mm, addr);
  	do {
  		next = pgd_addr_end(addr, end);
  		if (pgd_none_or_clear_bad(src_pgd))
  			continue;
  		if (unlikely(copy_pud_range(dst_mm, src_mm, dst_pgd, src_pgd,
  					    vma, addr, next))) {
  			ret = -ENOMEM;
  			break;
  		}
  	} while (dst_pgd++, src_pgd++, addr = next, addr != end);

  	if (is_cow)
  		mmu_notifier_invalidate_range_end(src_mm, mmun_start, mmun_end);
  	return ret;
  }
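
  /*
   * Usage sketch (hedged): copy_page_range() is called from dup_mmap() in
   * kernel/fork.c for each vma being duplicated, roughly
   *
   *	retval = copy_page_range(mm, oldmm, mpnt);
   *
   * and thanks to the early return above, mappings with no anon_vma and no
   * pfn/mixed backing (i.e. nothing COWed yet) have no ptes copied at all.
   */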

  static unsigned long zap_pte_range(struct mmu_gather *tlb,
  				struct vm_area_struct *vma, pmd_t *pmd,
  				unsigned long addr, unsigned long end,
  				struct zap_details *details)
  {
  	struct mm_struct *mm = tlb->mm;
  	int force_flush = 0;
  	int rss[NR_MM_COUNTERS];
  	spinlock_t *ptl;
  	pte_t *start_pte;
  	pte_t *pte;
  	swp_entry_t entry;

  again:
  	init_rss_vec(rss);
  	start_pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
  	pte = start_pte;
  	arch_enter_lazy_mmu_mode();
  	do {
  		pte_t ptent = *pte;
  		if (pte_none(ptent)) {
  			continue;
  		}

  		if (pte_present(ptent)) {
  			struct page *page;

  			page = vm_normal_page(vma, addr, ptent);
  			if (unlikely(details) && page) {
  				/*
  				 * unmap_shared_mapping_pages() wants to
  				 * invalidate cache without truncating:
  				 * unmap shared but keep private pages.
  				 */
  				if (details->check_mapping &&
  				    details->check_mapping != page->mapping)
  					continue;
  			}
  			ptent = ptep_get_and_clear_full(mm, addr, pte,
  							tlb->fullmm);
  			tlb_remove_tlb_entry(tlb, pte, addr);
  			if (unlikely(!page))
  				continue;

  			if (!PageAnon(page)) {
  				if (pte_dirty(ptent)) {
  					/*
  					 * oom_reaper cannot tear down dirty
  					 * pages
  					 */
  					if (unlikely(details && details->ignore_dirty))
  						continue;
  					force_flush = 1;
  					set_page_dirty(page);
  				}
  				if (pte_young(ptent) &&
  				    likely(!(vma->vm_flags & VM_SEQ_READ)))
  					mark_page_accessed(page);
  			}
  			rss[mm_counter(page)]--;
  			page_remove_rmap(page, false);
  			if (unlikely(page_mapcount(page) < 0))
  				print_bad_pte(vma, addr, ptent, page);
  			if (unlikely(!__tlb_remove_page(tlb, page))) {
  				force_flush = 1;
  				addr += PAGE_SIZE;
  				break;
  			}
  			continue;
  		}
  		/* only check swap_entries if explicitly asked for in details */
  		if (unlikely(details && !details->check_swap_entries))
  			continue;

  		entry = pte_to_swp_entry(ptent);
  		if (!non_swap_entry(entry))
  			rss[MM_SWAPENTS]--;
  		else if (is_migration_entry(entry)) {
  			struct page *page;

  			page = migration_entry_to_page(entry);
  			rss[mm_counter(page)]--;
  		}
  		if (unlikely(!free_swap_and_cache(entry)))
  			print_bad_pte(vma, addr, ptent, NULL);
  		pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
  	} while (pte++, addr += PAGE_SIZE, addr != end);

  	add_mm_rss_vec(mm, rss);
  	arch_leave_lazy_mmu_mode();

  	/* Do the actual TLB flush before dropping ptl */
  	if (force_flush)
  		tlb_flush_mmu_tlbonly(tlb);
  	pte_unmap_unlock(start_pte, ptl);
  
  	/*
  	 * If we forced a TLB flush (either due to running out of
  	 * batch buffers or because we needed to flush dirty TLB
  	 * entries before releasing the ptl), free the batched
  	 * memory too. Restart if we didn't do everything.
  	 */
  	if (force_flush) {
  		force_flush = 0;
  		tlb_flush_mmu_free(tlb);
2b047252d   Linus Torvalds   Fix TLB gather vi...
1120
1121
  
  		if (addr != end)
d16dfc550   Peter Zijlstra   mm: mmu_gather re...
1122
1123
  			goto again;
  	}
51c6f666f   Robin Holt   [PATCH] mm: ZAP_B...
1124
  	return addr;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1125
  }
51c6f666f   Robin Holt   [PATCH] mm: ZAP_B...
1126
  static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
b5810039a   Nick Piggin   [PATCH] core remo...
1127
  				struct vm_area_struct *vma, pud_t *pud,
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1128
  				unsigned long addr, unsigned long end,
97a894136   Peter Zijlstra   mm: Remove i_mmap...
1129
  				struct zap_details *details)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1130
1131
1132
1133
1134
1135
1136
  {
  	pmd_t *pmd;
  	unsigned long next;
  
  	pmd = pmd_offset(pud, addr);
  	do {
  		next = pmd_addr_end(addr, end);
5c7fb56e5   Dan Williams   mm, dax: dax-pmd ...
1137
  		if (pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) {
1a5a9906d   Andrea Arcangeli   mm: thp: fix pmd_...
1138
  			if (next - addr != HPAGE_PMD_SIZE) {
e0897d75f   David Rientjes   mm, thp: print us...
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
  #ifdef CONFIG_DEBUG_VM
  				if (!rwsem_is_locked(&tlb->mm->mmap_sem)) {
  					pr_err("%s: mmap_sem is unlocked! addr=0x%lx end=0x%lx vma->vm_start=0x%lx vma->vm_end=0x%lx
  ",
  						__func__, addr, end,
  						vma->vm_start,
  						vma->vm_end);
  					BUG();
  				}
  #endif
78ddc5347   Kirill A. Shutemov   thp: rename split...
1149
  				split_huge_pmd(vma, pmd, addr);
f21760b15   Shaohua Li   thp: add tlb_remo...
1150
  			} else if (zap_huge_pmd(tlb, vma, pmd, addr))
1a5a9906d   Andrea Arcangeli   mm: thp: fix pmd_...
1151
  				goto next;
71e3aac07   Andrea Arcangeli   thp: transparent ...
1152
1153
  			/* fall through */
  		}
1a5a9906d   Andrea Arcangeli   mm: thp: fix pmd_...
1154
1155
1156
1157
1158
1159
1160
1161
1162
  		/*
  		 * Here there can be other concurrent MADV_DONTNEED or
  		 * trans huge page faults running, and if the pmd is
  		 * none or trans huge it can change under us. This is
  		 * because MADV_DONTNEED holds the mmap_sem in read
  		 * mode.
  		 */
  		if (pmd_none_or_trans_huge_or_clear_bad(pmd))
  			goto next;
97a894136   Peter Zijlstra   mm: Remove i_mmap...
1163
  		next = zap_pte_range(tlb, vma, pmd, addr, next, details);
1a5a9906d   Andrea Arcangeli   mm: thp: fix pmd_...
1164
  next:
97a894136   Peter Zijlstra   mm: Remove i_mmap...
1165
1166
  		cond_resched();
  	} while (pmd++, addr = next, addr != end);
51c6f666f   Robin Holt   [PATCH] mm: ZAP_B...
1167
1168
  
  	return addr;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1169
  }
51c6f666f   Robin Holt   [PATCH] mm: ZAP_B...
1170
  static inline unsigned long zap_pud_range(struct mmu_gather *tlb,
b5810039a   Nick Piggin   [PATCH] core remo...
1171
  				struct vm_area_struct *vma, pgd_t *pgd,
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1172
  				unsigned long addr, unsigned long end,
97a894136   Peter Zijlstra   mm: Remove i_mmap...
1173
  				struct zap_details *details)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1174
1175
1176
1177
1178
1179
1180
  {
  	pud_t *pud;
  	unsigned long next;
  
  	pud = pud_offset(pgd, addr);
  	do {
  		next = pud_addr_end(addr, end);
97a894136   Peter Zijlstra   mm: Remove i_mmap...
1181
  		if (pud_none_or_clear_bad(pud))
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1182
  			continue;
97a894136   Peter Zijlstra   mm: Remove i_mmap...
1183
1184
  		next = zap_pmd_range(tlb, vma, pud, addr, next, details);
  	} while (pud++, addr = next, addr != end);
51c6f666f   Robin Holt   [PATCH] mm: ZAP_B...
1185
1186
  
  	return addr;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1187
  }
aac453635   Michal Hocko   mm, oom: introduc...
1188
  void unmap_page_range(struct mmu_gather *tlb,
038c7aa16   Al Viro   VM: unmap_page_ra...
1189
1190
1191
  			     struct vm_area_struct *vma,
  			     unsigned long addr, unsigned long end,
  			     struct zap_details *details)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1192
1193
1194
  {
  	pgd_t *pgd;
  	unsigned long next;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1195
1196
1197
1198
1199
  	BUG_ON(addr >= end);
  	tlb_start_vma(tlb, vma);
  	pgd = pgd_offset(vma->vm_mm, addr);
  	do {
  		next = pgd_addr_end(addr, end);
97a894136   Peter Zijlstra   mm: Remove i_mmap...
1200
  		if (pgd_none_or_clear_bad(pgd))
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1201
  			continue;
97a894136   Peter Zijlstra   mm: Remove i_mmap...
1202
1203
  		next = zap_pud_range(tlb, vma, pgd, addr, next, details);
  	} while (pgd++, addr = next, addr != end);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1204
1205
  	tlb_end_vma(tlb, vma);
  }
51c6f666f   Robin Holt   [PATCH] mm: ZAP_B...
1206

f5cc4eef9   Al Viro   VM: make zap_page...
1207
1208
1209
  
  static void unmap_single_vma(struct mmu_gather *tlb,
  		struct vm_area_struct *vma, unsigned long start_addr,
4f74d2c8e   Linus Torvalds   vm: remove 'nr_ac...
1210
  		unsigned long end_addr,
f5cc4eef9   Al Viro   VM: make zap_page...
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
  		struct zap_details *details)
  {
  	unsigned long start = max(vma->vm_start, start_addr);
  	unsigned long end;
  
  	if (start >= vma->vm_end)
  		return;
  	end = min(vma->vm_end, end_addr);
  	if (end <= vma->vm_start)
  		return;
cbc91f71b   Srikar Dronamraju   uprobes/core: Dec...
1221
1222
  	if (vma->vm_file)
  		uprobe_munmap(vma, start, end);
b3b9c2932   Konstantin Khlebnikov   mm, x86, pat: rew...
1223
  	if (unlikely(vma->vm_flags & VM_PFNMAP))
5180da410   Suresh Siddha   x86, pat: separat...
1224
  		untrack_pfn(vma, 0, 0);
f5cc4eef9   Al Viro   VM: make zap_page...
1225
1226
1227
1228
1229
1230
1231
  
  	if (start != end) {
  		if (unlikely(is_vm_hugetlb_page(vma))) {
  			/*
  			 * It is undesirable to test vma->vm_file as it
  			 * should be non-null for valid hugetlb area.
  			 * However, vm_file will be NULL in the error
7aa6b4ad5   Davidlohr Bueso   mm/memory.c: upda...
1232
  			 * cleanup path of mmap_region. When
f5cc4eef9   Al Viro   VM: make zap_page...
1233
  			 * hugetlbfs ->mmap method fails,
7aa6b4ad5   Davidlohr Bueso   mm/memory.c: upda...
1234
  			 * mmap_region() nullifies vma->vm_file
f5cc4eef9   Al Viro   VM: make zap_page...
1235
1236
1237
1238
  			 * before calling this function to clean up.
  			 * Since no pte has actually been setup, it is
  			 * safe to do nothing in this case.
  			 */
24669e584   Aneesh Kumar K.V   hugetlb: use mmu_...
1239
  			if (vma->vm_file) {
83cde9e8b   Davidlohr Bueso   mm: use new helpe...
1240
  				i_mmap_lock_write(vma->vm_file->f_mapping);
d833352a4   Mel Gorman   mm: hugetlbfs: cl...
1241
  				__unmap_hugepage_range_final(tlb, vma, start, end, NULL);
83cde9e8b   Davidlohr Bueso   mm: use new helpe...
1242
  				i_mmap_unlock_write(vma->vm_file->f_mapping);
24669e584   Aneesh Kumar K.V   hugetlb: use mmu_...
1243
  			}
f5cc4eef9   Al Viro   VM: make zap_page...
1244
1245
1246
  		} else
  			unmap_page_range(tlb, vma, start, end, details);
  	}
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1247
  }
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1248
1249
  /**
   * unmap_vmas - unmap a range of memory covered by a list of vma's
0164f69d0   Randy Dunlap   mm/memory.c: fix ...
1250
   * @tlb: address of the caller's struct mmu_gather
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1251
1252
1253
   * @vma: the starting vma
   * @start_addr: virtual address at which to start unmapping
   * @end_addr: virtual address at which to end unmapping
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1254
   *
508034a32   Hugh Dickins   [PATCH] mm: unmap...
1255
   * Unmap all pages in the vma list.
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1256
   *
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1257
1258
1259
1260
1261
1262
1263
1264
1265
   * Only addresses between `start' and `end' will be unmapped.
   *
   * The VMA list must be sorted in ascending virtual address order.
   *
   * unmap_vmas() assumes that the caller will flush the whole unmapped address
   * range after unmap_vmas() returns.  So the only responsibility here is to
   * ensure that any thus-far unmapped pages are flushed before unmap_vmas()
   * drops the lock and schedules.
   */
6e8bb0193   Al Viro   VM: make unmap_vm...
1266
  void unmap_vmas(struct mmu_gather *tlb,
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1267
  		struct vm_area_struct *vma, unsigned long start_addr,
4f74d2c8e   Linus Torvalds   vm: remove 'nr_ac...
1268
  		unsigned long end_addr)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1269
  {
cddb8a5c1   Andrea Arcangeli   mmu-notifiers: core
1270
  	struct mm_struct *mm = vma->vm_mm;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1271

cddb8a5c1   Andrea Arcangeli   mmu-notifiers: core
1272
  	mmu_notifier_invalidate_range_start(mm, start_addr, end_addr);
f5cc4eef9   Al Viro   VM: make zap_page...
1273
  	for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next)
4f74d2c8e   Linus Torvalds   vm: remove 'nr_ac...
1274
  		unmap_single_vma(tlb, vma, start_addr, end_addr, NULL);
cddb8a5c1   Andrea Arcangeli   mmu-notifiers: core
1275
  	mmu_notifier_invalidate_range_end(mm, start_addr, end_addr);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1276
1277
1278
1279
1280
  }
  
  /**
   * zap_page_range - remove user pages in a given range
   * @vma: vm_area_struct holding the applicable pages
eb4546bbb   Randy Dunlap   mm/memory.c: fix ...
1281
   * @start: starting address of pages to zap
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1282
   * @size: number of bytes to zap
8a5f14a23   Kirill A. Shutemov   mm: drop support ...
1283
   * @details: details of shared cache invalidation
f5cc4eef9   Al Viro   VM: make zap_page...
1284
1285
   *
   * Caller must protect the VMA list
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1286
   */
7e027b14d   Linus Torvalds   vm: simplify unma...
1287
  void zap_page_range(struct vm_area_struct *vma, unsigned long start,
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1288
1289
1290
  		unsigned long size, struct zap_details *details)
  {
  	struct mm_struct *mm = vma->vm_mm;
d16dfc550   Peter Zijlstra   mm: mmu_gather re...
1291
  	struct mmu_gather tlb;
7e027b14d   Linus Torvalds   vm: simplify unma...
1292
  	unsigned long end = start + size;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1293

1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1294
  	lru_add_drain();
2b047252d   Linus Torvalds   Fix TLB gather vi...
1295
  	tlb_gather_mmu(&tlb, mm, start, end);
365e9c87a   Hugh Dickins   [PATCH] mm: updat...
1296
  	update_hiwater_rss(mm);
7e027b14d   Linus Torvalds   vm: simplify unma...
1297
1298
  	mmu_notifier_invalidate_range_start(mm, start, end);
  	for ( ; vma && vma->vm_start < end; vma = vma->vm_next)
4f74d2c8e   Linus Torvalds   vm: remove 'nr_ac...
1299
  		unmap_single_vma(&tlb, vma, start, end, details);
7e027b14d   Linus Torvalds   vm: simplify unma...
1300
1301
  	mmu_notifier_invalidate_range_end(mm, start, end);
  	tlb_finish_mmu(&tlb, start, end);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1302
  }
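
A minimal usage sketch, not taken from this file: a hypothetical caller that drops every user page backing one whole vma, passing NULL details since no shared-cache invalidation is involved.

  /* Hypothetical caller (assumed name): zap an entire vma's pages. */
  static void my_drop_vma_pages(struct vm_area_struct *vma)
  {
  	/* The caller is assumed to hold mmap_sem, protecting the VMA list. */
  	zap_page_range(vma, vma->vm_start, vma->vm_end - vma->vm_start, NULL);
  }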
c627f9cc0   Jack Steiner   mm: add zap_vma_p...
1303
  /**
f5cc4eef9   Al Viro   VM: make zap_page...
1304
1305
1306
1307
   * zap_page_range_single - remove user pages in a given range
   * @vma: vm_area_struct holding the applicable pages
   * @address: starting address of pages to zap
   * @size: number of bytes to zap
8a5f14a23   Kirill A. Shutemov   mm: drop support ...
1308
   * @details: details of shared cache invalidation
f5cc4eef9   Al Viro   VM: make zap_page...
1309
1310
   *
   * The range must fit into one VMA.
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1311
   */
f5cc4eef9   Al Viro   VM: make zap_page...
1312
  static void zap_page_range_single(struct vm_area_struct *vma, unsigned long address,
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1313
1314
1315
  		unsigned long size, struct zap_details *details)
  {
  	struct mm_struct *mm = vma->vm_mm;
d16dfc550   Peter Zijlstra   mm: mmu_gather re...
1316
  	struct mmu_gather tlb;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1317
  	unsigned long end = address + size;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1318

1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1319
  	lru_add_drain();
2b047252d   Linus Torvalds   Fix TLB gather vi...
1320
  	tlb_gather_mmu(&tlb, mm, address, end);
365e9c87a   Hugh Dickins   [PATCH] mm: updat...
1321
  	update_hiwater_rss(mm);
f5cc4eef9   Al Viro   VM: make zap_page...
1322
  	mmu_notifier_invalidate_range_start(mm, address, end);
4f74d2c8e   Linus Torvalds   vm: remove 'nr_ac...
1323
  	unmap_single_vma(&tlb, vma, address, end, details);
f5cc4eef9   Al Viro   VM: make zap_page...
1324
  	mmu_notifier_invalidate_range_end(mm, address, end);
d16dfc550   Peter Zijlstra   mm: mmu_gather re...
1325
  	tlb_finish_mmu(&tlb, address, end);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1326
  }
c627f9cc0   Jack Steiner   mm: add zap_vma_p...
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
  /**
   * zap_vma_ptes - remove ptes mapping the vma
   * @vma: vm_area_struct holding ptes to be zapped
   * @address: starting address of pages to zap
   * @size: number of bytes to zap
   *
   * This function only unmaps ptes assigned to VM_PFNMAP vmas.
   *
   * The entire address range must be fully contained within the vma.
   *
   * Returns 0 if successful.
   */
  int zap_vma_ptes(struct vm_area_struct *vma, unsigned long address,
  		unsigned long size)
  {
  	if (address < vma->vm_start || address + size > vma->vm_end ||
  	    		!(vma->vm_flags & VM_PFNMAP))
  		return -1;
f5cc4eef9   Al Viro   VM: make zap_page...
1345
  	zap_page_range_single(vma, address, size, NULL);
c627f9cc0   Jack Steiner   mm: add zap_vma_p...
1346
1347
1348
  	return 0;
  }
  EXPORT_SYMBOL_GPL(zap_vma_ptes);
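
A minimal usage sketch, assuming a hypothetical driver that set up a VM_PFNMAP vma and later needs to revoke the user mapping; the my_dev name is an assumption.

  /* Hypothetical caller: revoke a driver-established PFN mapping. */
  static void my_dev_revoke_user_mapping(struct vm_area_struct *vma)
  {
  	/* The vma was created with VM_PFNMAP by the driver's mmap handler. */
  	if (zap_vma_ptes(vma, vma->vm_start, vma->vm_end - vma->vm_start))
  		pr_warn("my_dev: range not suitable for zap_vma_ptes\n");
  }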
25ca1d6c0   Namhyung Kim   mm: wrap get_lock...
1349
  pte_t *__get_locked_pte(struct mm_struct *mm, unsigned long addr,
920c7a5d0   Harvey Harrison   mm: remove fastca...
1350
  			spinlock_t **ptl)
c9cfcddfd   Linus Torvalds   VM: add common he...
1351
1352
1353
1354
  {
  	pgd_t * pgd = pgd_offset(mm, addr);
  	pud_t * pud = pud_alloc(mm, pgd, addr);
  	if (pud) {
49c91fb01   Trond Myklebust   [PATCH] VM: Fix t...
1355
  		pmd_t * pmd = pmd_alloc(mm, pud, addr);
f66055ab6   Andrea Arcangeli   thp: verify pmd_t...
1356
1357
  		if (pmd) {
  			VM_BUG_ON(pmd_trans_huge(*pmd));
c9cfcddfd   Linus Torvalds   VM: add common he...
1358
  			return pte_alloc_map_lock(mm, pmd, addr, ptl);
f66055ab6   Andrea Arcangeli   thp: verify pmd_t...
1359
  		}
c9cfcddfd   Linus Torvalds   VM: add common he...
1360
1361
1362
  	}
  	return NULL;
  }
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1363
  /*
238f58d89   Linus Torvalds   Support strange d...
1364
1365
1366
1367
1368
1369
   * This is the old fallback for page remapping.
   *
   * For historical reasons, it only allows reserved pages. Only
   * old drivers should use this, and they needed to mark their
   * pages reserved for the old functions anyway.
   */
423bad600   Nick Piggin   mm: add vm_insert...
1370
1371
  static int insert_page(struct vm_area_struct *vma, unsigned long addr,
  			struct page *page, pgprot_t prot)
238f58d89   Linus Torvalds   Support strange d...
1372
  {
423bad600   Nick Piggin   mm: add vm_insert...
1373
  	struct mm_struct *mm = vma->vm_mm;
238f58d89   Linus Torvalds   Support strange d...
1374
  	int retval;
c9cfcddfd   Linus Torvalds   VM: add common he...
1375
  	pte_t *pte;
8a9f3ccd2   Balbir Singh   Memory controller...
1376
  	spinlock_t *ptl;
238f58d89   Linus Torvalds   Support strange d...
1377
  	retval = -EINVAL;
a145dd411   Linus Torvalds   VM: add "vm_inser...
1378
  	if (PageAnon(page))
5b4e655e9   KAMEZAWA Hiroyuki   memcg: avoid acco...
1379
  		goto out;
238f58d89   Linus Torvalds   Support strange d...
1380
1381
  	retval = -ENOMEM;
  	flush_dcache_page(page);
c9cfcddfd   Linus Torvalds   VM: add common he...
1382
  	pte = get_locked_pte(mm, addr, &ptl);
238f58d89   Linus Torvalds   Support strange d...
1383
  	if (!pte)
5b4e655e9   KAMEZAWA Hiroyuki   memcg: avoid acco...
1384
  		goto out;
238f58d89   Linus Torvalds   Support strange d...
1385
1386
1387
1388
1389
1390
  	retval = -EBUSY;
  	if (!pte_none(*pte))
  		goto out_unlock;
  
  	/* Ok, finally just insert the thing.. */
  	get_page(page);
eca56ff90   Jerome Marchand   mm, shmem: add in...
1391
  	inc_mm_counter_fast(mm, mm_counter_file(page));
238f58d89   Linus Torvalds   Support strange d...
1392
1393
1394
1395
  	page_add_file_rmap(page);
  	set_pte_at(mm, addr, pte, mk_pte(page, prot));
  
  	retval = 0;
8a9f3ccd2   Balbir Singh   Memory controller...
1396
1397
  	pte_unmap_unlock(pte, ptl);
  	return retval;
238f58d89   Linus Torvalds   Support strange d...
1398
1399
1400
1401
1402
  out_unlock:
  	pte_unmap_unlock(pte, ptl);
  out:
  	return retval;
  }
bfa5bf6d6   Rolf Eike Beer   [PATCH] Add kerne...
1403
1404
1405
1406
1407
1408
  /**
   * vm_insert_page - insert single page into user vma
   * @vma: user vma to map to
   * @addr: target user address of this page
   * @page: source kernel page
   *
a145dd411   Linus Torvalds   VM: add "vm_inser...
1409
1410
1411
1412
1413
1414
   * This allows drivers to insert individual pages they've allocated
   * into a user vma.
   *
   * The page has to be a nice clean _individual_ kernel allocation.
   * If you allocate a compound page, you need to have marked it as
   * such (__GFP_COMP), or manually just split the page up yourself
8dfcc9ba2   Nick Piggin   [PATCH] mm: split...
1415
   * (see split_page()).
a145dd411   Linus Torvalds   VM: add "vm_inser...
1416
1417
1418
1419
1420
1421
1422
1423
   *
   * NOTE! Traditionally this was done with "remap_pfn_range()" which
   * took an arbitrary page protection parameter. This doesn't allow
   * that. Your vma protection will have to be set up correctly, which
   * means that if you want a shared writable mapping, you'd better
   * ask for a shared writable mapping!
   *
   * The page does not need to be reserved.
4b6e1e370   Konstantin Khlebnikov   mm: kill vma flag...
1424
1425
1426
1427
1428
   *
   * Usually this function is called from f_op->mmap() handler
   * under mm->mmap_sem write-lock, so it can change vma->vm_flags.
   * Caller must set VM_MIXEDMAP on vma if it wants to call this
   * function from other places, for example from page-fault handler.
a145dd411   Linus Torvalds   VM: add "vm_inser...
1429
   */
423bad600   Nick Piggin   mm: add vm_insert...
1430
1431
  int vm_insert_page(struct vm_area_struct *vma, unsigned long addr,
  			struct page *page)
a145dd411   Linus Torvalds   VM: add "vm_inser...
1432
1433
1434
1435
1436
  {
  	if (addr < vma->vm_start || addr >= vma->vm_end)
  		return -EFAULT;
  	if (!page_count(page))
  		return -EINVAL;
4b6e1e370   Konstantin Khlebnikov   mm: kill vma flag...
1437
1438
1439
1440
1441
  	if (!(vma->vm_flags & VM_MIXEDMAP)) {
  		BUG_ON(down_read_trylock(&vma->vm_mm->mmap_sem));
  		BUG_ON(vma->vm_flags & VM_PFNMAP);
  		vma->vm_flags |= VM_MIXEDMAP;
  	}
423bad600   Nick Piggin   mm: add vm_insert...
1442
  	return insert_page(vma, addr, page, vma->vm_page_prot);
a145dd411   Linus Torvalds   VM: add "vm_inser...
1443
  }
e3c3374fb   Linus Torvalds   Make vm_insert_pa...
1444
  EXPORT_SYMBOL(vm_insert_page);
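
A minimal sketch of the calling pattern described above, assuming a hypothetical driver whose f_op->mmap handler exposes one pre-allocated page; my_dev_page() is an assumption.

  /* Hypothetical f_op->mmap handler inserting a single driver page. */
  static int my_dev_mmap(struct file *file, struct vm_area_struct *vma)
  {
  	struct page *page = my_dev_page(file);	/* clean, individual page */

  	if (vma->vm_end - vma->vm_start != PAGE_SIZE)
  		return -EINVAL;
  	/* Runs under mmap_sem held for write, so vm_flags may be updated. */
  	return vm_insert_page(vma, vma->vm_start, page);
  }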
a145dd411   Linus Torvalds   VM: add "vm_inser...
1445

423bad600   Nick Piggin   mm: add vm_insert...
1446
  static int insert_pfn(struct vm_area_struct *vma, unsigned long addr,
01c8f1c44   Dan Williams   mm, dax, gpu: con...
1447
  			pfn_t pfn, pgprot_t prot)
423bad600   Nick Piggin   mm: add vm_insert...
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
  {
  	struct mm_struct *mm = vma->vm_mm;
  	int retval;
  	pte_t *pte, entry;
  	spinlock_t *ptl;
  
  	retval = -ENOMEM;
  	pte = get_locked_pte(mm, addr, &ptl);
  	if (!pte)
  		goto out;
  	retval = -EBUSY;
  	if (!pte_none(*pte))
  		goto out_unlock;
  
  	/* Ok, finally just insert the thing.. */
01c8f1c44   Dan Williams   mm, dax, gpu: con...
1463
1464
1465
1466
  	if (pfn_t_devmap(pfn))
  		entry = pte_mkdevmap(pfn_t_pte(pfn, prot));
  	else
  		entry = pte_mkspecial(pfn_t_pte(pfn, prot));
423bad600   Nick Piggin   mm: add vm_insert...
1467
  	set_pte_at(mm, addr, pte, entry);
4b3073e1c   Russell King   MM: Pass a PTE po...
1468
  	update_mmu_cache(vma, addr, pte); /* XXX: why not for insert_page? */
423bad600   Nick Piggin   mm: add vm_insert...
1469
1470
1471
1472
1473
1474
1475
  
  	retval = 0;
  out_unlock:
  	pte_unmap_unlock(pte, ptl);
  out:
  	return retval;
  }
e0dc0d8f4   Nick Piggin   [PATCH] add vm_in...
1476
1477
1478
1479
1480
1481
  /**
   * vm_insert_pfn - insert single pfn into user vma
   * @vma: user vma to map to
   * @addr: target user address of this page
   * @pfn: source kernel pfn
   *
c462f179e   Robert P. J. Day   mm/memory.c: fix ...
1482
   * Similar to vm_insert_page, this allows drivers to insert individual pages
e0dc0d8f4   Nick Piggin   [PATCH] add vm_in...
1483
1484
1485
1486
   * they've allocated into a user vma. Same comments apply.
   *
   * This function should only be called from a vm_ops->fault handler, and
   * in that case the handler should return VM_FAULT_NOPAGE.
0d71d10a4   Nick Piggin   mm: remove nopfn
1487
1488
1489
1490
1491
   *
   * vma cannot be a COW mapping.
   *
   * As this is called only for pages that do not currently exist, we
   * do not need to flush old virtual caches or the TLB.
e0dc0d8f4   Nick Piggin   [PATCH] add vm_in...
1492
1493
   */
  int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr,
423bad600   Nick Piggin   mm: add vm_insert...
1494
  			unsigned long pfn)
e0dc0d8f4   Nick Piggin   [PATCH] add vm_in...
1495
  {
1745cbc5d   Andy Lutomirski   mm: Add vm_insert...
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
  	return vm_insert_pfn_prot(vma, addr, pfn, vma->vm_page_prot);
  }
  EXPORT_SYMBOL(vm_insert_pfn);
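
A minimal sketch of the fault-handler pattern the comment describes, assuming a hypothetical my_dev_base_pfn() helper that resolves the device's backing pfn.

  /* Hypothetical vm_ops->fault handler backed by device memory. */
  static int my_dev_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
  {
  	unsigned long pfn = my_dev_base_pfn(vma) + vmf->pgoff;
  	int ret;

  	ret = vm_insert_pfn(vma, (unsigned long)vmf->virtual_address, pfn);
  	if (ret && ret != -EBUSY)	/* -EBUSY: another fault won the race */
  		return VM_FAULT_SIGBUS;
  	return VM_FAULT_NOPAGE;
  }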
  
  /**
   * vm_insert_pfn_prot - insert single pfn into user vma with specified pgprot
   * @vma: user vma to map to
   * @addr: target user address of this page
   * @pfn: source kernel pfn
   * @pgprot: pgprot flags for the inserted page
   *
   * This is exactly like vm_insert_pfn, except that it allows drivers
   * to override pgprot on a per-page basis.
   *
   * This only makes sense for IO mappings, and it makes no sense for
   * cow mappings.  In general, using multiple vmas is preferable;
   * vm_insert_pfn_prot should only be used if using multiple VMAs is
   * impractical.
   */
  int vm_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr,
  			unsigned long pfn, pgprot_t pgprot)
  {
2ab640379   venkatesh.pallipadi@intel.com   x86: PAT: hooks i...
1518
  	int ret;
7e675137a   Nick Piggin   mm: introduce pte...
1519
1520
1521
1522
1523
1524
  	/*
  	 * Technically, architectures with pte_special can avoid all these
  	 * restrictions (same for remap_pfn_range).  However we would like
  	 * consistency in testing and feature parity among all, so we should
  	 * try to keep these invariants in place for everybody.
  	 */
b379d7901   Jared Hulbert   mm: introduce VM_...
1525
1526
1527
1528
1529
  	BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)));
  	BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) ==
  						(VM_PFNMAP|VM_MIXEDMAP));
  	BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags));
  	BUG_ON((vma->vm_flags & VM_MIXEDMAP) && pfn_valid(pfn));
e0dc0d8f4   Nick Piggin   [PATCH] add vm_in...
1530

423bad600   Nick Piggin   mm: add vm_insert...
1531
1532
  	if (addr < vma->vm_start || addr >= vma->vm_end)
  		return -EFAULT;
f25748e3c   Dan Williams   mm, dax: convert ...
1533
  	if (track_pfn_insert(vma, &pgprot, __pfn_to_pfn_t(pfn, PFN_DEV)))
2ab640379   venkatesh.pallipadi@intel.com   x86: PAT: hooks i...
1534
  		return -EINVAL;
01c8f1c44   Dan Williams   mm, dax, gpu: con...
1535
  	ret = insert_pfn(vma, addr, __pfn_to_pfn_t(pfn, PFN_DEV), pgprot);
2ab640379   venkatesh.pallipadi@intel.com   x86: PAT: hooks i...
1536

2ab640379   venkatesh.pallipadi@intel.com   x86: PAT: hooks i...
1537
  	return ret;
423bad600   Nick Piggin   mm: add vm_insert...
1538
  }
1745cbc5d   Andy Lutomirski   mm: Add vm_insert...
1539
  EXPORT_SYMBOL(vm_insert_pfn_prot);
e0dc0d8f4   Nick Piggin   [PATCH] add vm_in...
1540

423bad600   Nick Piggin   mm: add vm_insert...
1541
  int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
01c8f1c44   Dan Williams   mm, dax, gpu: con...
1542
  			pfn_t pfn)
423bad600   Nick Piggin   mm: add vm_insert...
1543
1544
  {
  	BUG_ON(!(vma->vm_flags & VM_MIXEDMAP));
e0dc0d8f4   Nick Piggin   [PATCH] add vm_in...
1545

423bad600   Nick Piggin   mm: add vm_insert...
1546
1547
  	if (addr < vma->vm_start || addr >= vma->vm_end)
  		return -EFAULT;
e0dc0d8f4   Nick Piggin   [PATCH] add vm_in...
1548

423bad600   Nick Piggin   mm: add vm_insert...
1549
1550
1551
1552
  	/*
  	 * If we don't have pte special, then we have to use the pfn_valid()
  	 * based VM_MIXEDMAP scheme (see vm_normal_page), and thus we *must*
  	 * refcount the page if pfn_valid is true (hence insert_page rather
62eede62d   Hugh Dickins   mm: ZERO_PAGE wit...
1553
1554
  	 * than insert_pfn).  If a zero_pfn were inserted into a VM_MIXEDMAP
  	 * without pte special, it would then be refcounted as a normal page.
423bad600   Nick Piggin   mm: add vm_insert...
1555
  	 */
03fc2da63   Dan Williams   mm: fix pfn_t to ...
1556
  	if (!HAVE_PTE_SPECIAL && !pfn_t_devmap(pfn) && pfn_t_valid(pfn)) {
423bad600   Nick Piggin   mm: add vm_insert...
1557
  		struct page *page;
03fc2da63   Dan Williams   mm: fix pfn_t to ...
1558
1559
1560
1561
1562
1563
  		/*
  		 * At this point we are committed to insert_page()
  		 * regardless of whether the caller specified flags that
  		 * result in pfn_t_has_page() == false.
  		 */
  		page = pfn_to_page(pfn_t_to_pfn(pfn));
423bad600   Nick Piggin   mm: add vm_insert...
1564
1565
1566
  		return insert_page(vma, addr, page, vma->vm_page_prot);
  	}
  	return insert_pfn(vma, addr, pfn, vma->vm_page_prot);
e0dc0d8f4   Nick Piggin   [PATCH] add vm_in...
1567
  }
423bad600   Nick Piggin   mm: add vm_insert...
1568
  EXPORT_SYMBOL(vm_insert_mixed);
e0dc0d8f4   Nick Piggin   [PATCH] add vm_in...
1569

a145dd411   Linus Torvalds   VM: add "vm_inser...
1570
  /*
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1571
1572
1573
1574
1575
1576
1577
1578
1579
   * Maps a range of physical memory into the requested pages. The old
   * mappings are removed. Any references to nonexistent pages result
   * in null mappings (currently treated as "copy-on-access").
   */
  static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd,
  			unsigned long addr, unsigned long end,
  			unsigned long pfn, pgprot_t prot)
  {
  	pte_t *pte;
c74df32c7   Hugh Dickins   [PATCH] mm: ptd_a...
1580
  	spinlock_t *ptl;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1581

c74df32c7   Hugh Dickins   [PATCH] mm: ptd_a...
1582
  	pte = pte_alloc_map_lock(mm, pmd, addr, &ptl);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1583
1584
  	if (!pte)
  		return -ENOMEM;
6606c3e0d   Zachary Amsden   [PATCH] paravirt:...
1585
  	arch_enter_lazy_mmu_mode();
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1586
1587
  	do {
  		BUG_ON(!pte_none(*pte));
7e675137a   Nick Piggin   mm: introduce pte...
1588
  		set_pte_at(mm, addr, pte, pte_mkspecial(pfn_pte(pfn, prot)));
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1589
1590
  		pfn++;
  	} while (pte++, addr += PAGE_SIZE, addr != end);
6606c3e0d   Zachary Amsden   [PATCH] paravirt:...
1591
  	arch_leave_lazy_mmu_mode();
c74df32c7   Hugh Dickins   [PATCH] mm: ptd_a...
1592
  	pte_unmap_unlock(pte - 1, ptl);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
  	return 0;
  }
  
  static inline int remap_pmd_range(struct mm_struct *mm, pud_t *pud,
  			unsigned long addr, unsigned long end,
  			unsigned long pfn, pgprot_t prot)
  {
  	pmd_t *pmd;
  	unsigned long next;
  
  	pfn -= addr >> PAGE_SHIFT;
  	pmd = pmd_alloc(mm, pud, addr);
  	if (!pmd)
  		return -ENOMEM;
f66055ab6   Andrea Arcangeli   thp: verify pmd_t...
1607
  	VM_BUG_ON(pmd_trans_huge(*pmd));
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
  	do {
  		next = pmd_addr_end(addr, end);
  		if (remap_pte_range(mm, pmd, addr, next,
  				pfn + (addr >> PAGE_SHIFT), prot))
  			return -ENOMEM;
  	} while (pmd++, addr = next, addr != end);
  	return 0;
  }
  
  static inline int remap_pud_range(struct mm_struct *mm, pgd_t *pgd,
  			unsigned long addr, unsigned long end,
  			unsigned long pfn, pgprot_t prot)
  {
  	pud_t *pud;
  	unsigned long next;
  
  	pfn -= addr >> PAGE_SHIFT;
  	pud = pud_alloc(mm, pgd, addr);
  	if (!pud)
  		return -ENOMEM;
  	do {
  		next = pud_addr_end(addr, end);
  		if (remap_pmd_range(mm, pud, addr, next,
  				pfn + (addr >> PAGE_SHIFT), prot))
  			return -ENOMEM;
  	} while (pud++, addr = next, addr != end);
  	return 0;
  }
bfa5bf6d6   Rolf Eike Beer   [PATCH] Add kerne...
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
  /**
   * remap_pfn_range - remap kernel memory to userspace
   * @vma: user vma to map to
   * @addr: target user address to start at
   * @pfn: physical address of kernel memory
   * @size: size of map area
   * @prot: page protection flags for this mapping
   *
   *  Note: this is only safe if the mm semaphore is held when called.
   */
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1646
1647
1648
1649
1650
  int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
  		    unsigned long pfn, unsigned long size, pgprot_t prot)
  {
  	pgd_t *pgd;
  	unsigned long next;
2d15cab85   Hugh Dickins   [PATCH] mm: fix r...
1651
  	unsigned long end = addr + PAGE_ALIGN(size);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1652
1653
1654
1655
1656
1657
1658
1659
  	struct mm_struct *mm = vma->vm_mm;
  	int err;
  
  	/*
  	 * Physically remapped pages are special. Tell the
  	 * rest of the world about it:
  	 *   VM_IO tells people not to look at these pages
  	 *	(accesses can have side effects).
6aab341e0   Linus Torvalds   mm: re-architect ...
1660
1661
1662
  	 *   VM_PFNMAP tells the core MM that the base pages are just
  	 *	raw PFN mappings, and do not have a "struct page" associated
  	 *	with them.
314e51b98   Konstantin Khlebnikov   mm: kill vma flag...
1663
1664
1665
1666
  	 *   VM_DONTEXPAND
  	 *      Disable vma merging and expanding with mremap().
  	 *   VM_DONTDUMP
  	 *      Omit vma from core dump, even when VM_IO turned off.
fb155c161   Linus Torvalds   Allow arbitrary s...
1667
1668
1669
1670
  	 *
  	 * There's a horrible special case to handle copy-on-write
  	 * behaviour that some programs depend on. We mark the "original"
  	 * un-COW'ed pages by matching them up with "vma->vm_pgoff".
b3b9c2932   Konstantin Khlebnikov   mm, x86, pat: rew...
1671
  	 * See vm_normal_page() for details.
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1672
  	 */
b3b9c2932   Konstantin Khlebnikov   mm, x86, pat: rew...
1673
1674
1675
  	if (is_cow_mapping(vma->vm_flags)) {
  		if (addr != vma->vm_start || end != vma->vm_end)
  			return -EINVAL;
fb155c161   Linus Torvalds   Allow arbitrary s...
1676
  		vma->vm_pgoff = pfn;
b3b9c2932   Konstantin Khlebnikov   mm, x86, pat: rew...
1677
1678
1679
1680
  	}
  
  	err = track_pfn_remap(vma, &prot, pfn, addr, PAGE_ALIGN(size));
  	if (err)
3c8bb73ac   venkatesh.pallipadi@intel.com   x86: PAT: store v...
1681
  		return -EINVAL;
fb155c161   Linus Torvalds   Allow arbitrary s...
1682

314e51b98   Konstantin Khlebnikov   mm: kill vma flag...
1683
  	vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1684
1685
1686
1687
1688
  
  	BUG_ON(addr >= end);
  	pfn -= addr >> PAGE_SHIFT;
  	pgd = pgd_offset(mm, addr);
  	flush_cache_range(vma, addr, end);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1689
1690
1691
1692
1693
1694
1695
  	do {
  		next = pgd_addr_end(addr, end);
  		err = remap_pud_range(mm, pgd, addr, next,
  				pfn + (addr >> PAGE_SHIFT), prot);
  		if (err)
  			break;
  	} while (pgd++, addr = next, addr != end);
2ab640379   venkatesh.pallipadi@intel.com   x86: PAT: hooks i...
1696
1697
  
  	if (err)
5180da410   Suresh Siddha   x86, pat: separat...
1698
  		untrack_pfn(vma, pfn, PAGE_ALIGN(size));
2ab640379   venkatesh.pallipadi@intel.com   x86: PAT: hooks i...
1699

1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1700
1701
1702
  	return err;
  }
  EXPORT_SYMBOL(remap_pfn_range);
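
A minimal sketch of the usual whole-vma driver pattern, assuming hypothetical MY_CARD_PHYS_BASE/MY_CARD_PHYS_SIZE constants describing the device window.

  /* Hypothetical f_op->mmap handler mapping a fixed physical window. */
  static int my_card_mmap(struct file *file, struct vm_area_struct *vma)
  {
  	unsigned long size = vma->vm_end - vma->vm_start;

  	if (size > MY_CARD_PHYS_SIZE)
  		return -EINVAL;
  	/* mmap_sem is held by the mmap() path, as required above. */
  	return remap_pfn_range(vma, vma->vm_start,
  			       MY_CARD_PHYS_BASE >> PAGE_SHIFT,
  			       size, vma->vm_page_prot);
  }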
b4cbb197c   Linus Torvalds   vm: add vm_iomap_...
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
  /**
   * vm_iomap_memory - remap memory to userspace
   * @vma: user vma to map to
   * @start: start of area
   * @len: size of area
   *
   * This is a simplified io_remap_pfn_range() for common driver use. The
   * driver just needs to give us the physical memory range to be mapped,
   * we'll figure out the rest from the vma information.
   *
   * NOTE! Some drivers might want to tweak vma->vm_page_prot first to get
   * whatever write-combining behaviour they need, or similar.
   */
  int vm_iomap_memory(struct vm_area_struct *vma, phys_addr_t start, unsigned long len)
  {
  	unsigned long vm_len, pfn, pages;
  
  	/* Check that the physical memory area passed in looks valid */
  	if (start + len < start)
  		return -EINVAL;
  	/*
  	 * You *really* shouldn't map things that aren't page-aligned,
  	 * but we've historically allowed it because IO memory might
  	 * just have smaller alignment.
  	 */
  	len += start & ~PAGE_MASK;
  	pfn = start >> PAGE_SHIFT;
  	pages = (len + ~PAGE_MASK) >> PAGE_SHIFT;
  	if (pfn + pages < pfn)
  		return -EINVAL;
  
  	/* We start the mapping 'vm_pgoff' pages into the area */
  	if (vma->vm_pgoff > pages)
  		return -EINVAL;
  	pfn += vma->vm_pgoff;
  	pages -= vma->vm_pgoff;
  
  	/* Can we fit all of the mapping? */
  	vm_len = vma->vm_end - vma->vm_start;
  	if (vm_len >> PAGE_SHIFT > pages)
  		return -EINVAL;
  
  	/* Ok, let it rip */
  	return io_remap_pfn_range(vma, vma->vm_start, pfn, vm_len, vma->vm_page_prot);
  }
  EXPORT_SYMBOL(vm_iomap_memory);
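
A minimal sketch contrasting with the remap_pfn_range() pattern above: the driver only supplies the physical range and the helper derives pfn and length from the vma; struct my_card and its fields are assumptions.

  /* Hypothetical driver state and mmap handler using the simple helper. */
  struct my_card {
  	phys_addr_t mem_start;
  	unsigned long mem_len;
  };

  static int my_card_mmap_simple(struct file *file, struct vm_area_struct *vma)
  {
  	struct my_card *card = file->private_data;

  	return vm_iomap_memory(vma, card->mem_start, card->mem_len);
  }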
aee16b3ce   Jeremy Fitzhardinge   Add apply_to_page...
1749
1750
1751
1752
1753
1754
  static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd,
  				     unsigned long addr, unsigned long end,
  				     pte_fn_t fn, void *data)
  {
  	pte_t *pte;
  	int err;
2f569afd9   Martin Schwidefsky   CONFIG_HIGHPTE vs...
1755
  	pgtable_t token;
949099148   Borislav Petkov   Add unitialized_v...
1756
  	spinlock_t *uninitialized_var(ptl);
aee16b3ce   Jeremy Fitzhardinge   Add apply_to_page...
1757
1758
1759
1760
1761
1762
1763
1764
  
  	pte = (mm == &init_mm) ?
  		pte_alloc_kernel(pmd, addr) :
  		pte_alloc_map_lock(mm, pmd, addr, &ptl);
  	if (!pte)
  		return -ENOMEM;
  
  	BUG_ON(pmd_huge(*pmd));
38e0edb15   Jeremy Fitzhardinge   mm/apply_to_range...
1765
  	arch_enter_lazy_mmu_mode();
2f569afd9   Martin Schwidefsky   CONFIG_HIGHPTE vs...
1766
  	token = pmd_pgtable(*pmd);
aee16b3ce   Jeremy Fitzhardinge   Add apply_to_page...
1767
1768
  
  	do {
c36987e2e   Daisuke Nishimura   mm: don't call pt...
1769
  		err = fn(pte++, token, addr, data);
aee16b3ce   Jeremy Fitzhardinge   Add apply_to_page...
1770
1771
  		if (err)
  			break;
c36987e2e   Daisuke Nishimura   mm: don't call pt...
1772
  	} while (addr += PAGE_SIZE, addr != end);
aee16b3ce   Jeremy Fitzhardinge   Add apply_to_page...
1773

38e0edb15   Jeremy Fitzhardinge   mm/apply_to_range...
1774
  	arch_leave_lazy_mmu_mode();
aee16b3ce   Jeremy Fitzhardinge   Add apply_to_page...
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
  	if (mm != &init_mm)
  		pte_unmap_unlock(pte-1, ptl);
  	return err;
  }
  
  static int apply_to_pmd_range(struct mm_struct *mm, pud_t *pud,
  				     unsigned long addr, unsigned long end,
  				     pte_fn_t fn, void *data)
  {
  	pmd_t *pmd;
  	unsigned long next;
  	int err;
ceb868796   Andi Kleen   hugetlb: introduc...
1787
  	BUG_ON(pud_huge(*pud));
aee16b3ce   Jeremy Fitzhardinge   Add apply_to_page...
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
  	pmd = pmd_alloc(mm, pud, addr);
  	if (!pmd)
  		return -ENOMEM;
  	do {
  		next = pmd_addr_end(addr, end);
  		err = apply_to_pte_range(mm, pmd, addr, next, fn, data);
  		if (err)
  			break;
  	} while (pmd++, addr = next, addr != end);
  	return err;
  }
  
  static int apply_to_pud_range(struct mm_struct *mm, pgd_t *pgd,
  				     unsigned long addr, unsigned long end,
  				     pte_fn_t fn, void *data)
  {
  	pud_t *pud;
  	unsigned long next;
  	int err;
  
  	pud = pud_alloc(mm, pgd, addr);
  	if (!pud)
  		return -ENOMEM;
  	do {
  		next = pud_addr_end(addr, end);
  		err = apply_to_pmd_range(mm, pud, addr, next, fn, data);
  		if (err)
  			break;
  	} while (pud++, addr = next, addr != end);
  	return err;
  }
  
  /*
   * Scan a region of virtual memory, filling in page tables as necessary
   * and calling a provided function on each leaf page table.
   */
  int apply_to_page_range(struct mm_struct *mm, unsigned long addr,
  			unsigned long size, pte_fn_t fn, void *data)
  {
  	pgd_t *pgd;
  	unsigned long next;
57250a5bf   Jeremy Fitzhardinge   mmu-notifiers: re...
1829
  	unsigned long end = addr + size;
aee16b3ce   Jeremy Fitzhardinge   Add apply_to_page...
1830
  	int err;
9cb65bc3b   Mika Penttilä   mm/memory.c: make...
1831
1832
  	if (WARN_ON(addr >= end))
  		return -EINVAL;
aee16b3ce   Jeremy Fitzhardinge   Add apply_to_page...
1833
1834
1835
1836
1837
1838
1839
  	pgd = pgd_offset(mm, addr);
  	do {
  		next = pgd_addr_end(addr, end);
  		err = apply_to_pud_range(mm, pgd, addr, next, fn, data);
  		if (err)
  			break;
  	} while (pgd++, addr = next, addr != end);
57250a5bf   Jeremy Fitzhardinge   mmu-notifiers: re...
1840

aee16b3ce   Jeremy Fitzhardinge   Add apply_to_page...
1841
1842
1843
  	return err;
  }
  EXPORT_SYMBOL_GPL(apply_to_page_range);
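
A minimal sketch of the callback pattern, assuming a hypothetical counter over a kernel virtual range; the callback signature mirrors the fn(pte, token, addr, data) call made in apply_to_pte_range() above.

  /* Hypothetical leaf callback: count present ptes in the walked range. */
  static int count_present_pte(pte_t *pte, pgtable_t token,
  			     unsigned long addr, void *data)
  {
  	unsigned long *count = data;

  	if (pte_present(*pte))
  		(*count)++;
  	return 0;			/* non-zero would abort the walk */
  }

  static unsigned long count_present_ptes(unsigned long addr, unsigned long size)
  {
  	unsigned long count = 0;

  	/* Missing page tables are allocated while walking init_mm. */
  	apply_to_page_range(&init_mm, addr, size, count_present_pte, &count);
  	return count;
  }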
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1844
  /*
9b4bdd2ff   Kirill A. Shutemov   mm: drop support ...
1845
1846
1847
1848
1849
   * handle_pte_fault chooses page fault handler according to an entry which was
   * read non-atomically.  Before making any commitment, on those architectures
   * or configurations (e.g. i386 with PAE) which might give a mix of unmatched
   * parts, do_swap_page must check under lock before unmapping the pte and
   * proceeding (but do_wp_page is only called after already making such a check;
a335b2e17   Ryota Ozaki   mm: Fix out-of-da...
1850
   * and do_anonymous_page can safely check later on).
8f4e2101f   Hugh Dickins   [PATCH] mm: page ...
1851
   */
4c21e2f24   Hugh Dickins   [PATCH] mm: split...
1852
  static inline int pte_unmap_same(struct mm_struct *mm, pmd_t *pmd,
8f4e2101f   Hugh Dickins   [PATCH] mm: page ...
1853
1854
1855
1856
1857
  				pte_t *page_table, pte_t orig_pte)
  {
  	int same = 1;
  #if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT)
  	if (sizeof(pte_t) > sizeof(unsigned long)) {
4c21e2f24   Hugh Dickins   [PATCH] mm: split...
1858
1859
  		spinlock_t *ptl = pte_lockptr(mm, pmd);
  		spin_lock(ptl);
8f4e2101f   Hugh Dickins   [PATCH] mm: page ...
1860
  		same = pte_same(*page_table, orig_pte);
4c21e2f24   Hugh Dickins   [PATCH] mm: split...
1861
  		spin_unlock(ptl);
8f4e2101f   Hugh Dickins   [PATCH] mm: page ...
1862
1863
1864
1865
1866
  	}
  #endif
  	pte_unmap(page_table);
  	return same;
  }
9de455b20   Atsushi Nemoto   [PATCH] Pass vma ...
1867
  static inline void cow_user_page(struct page *dst, struct page *src, unsigned long va, struct vm_area_struct *vma)
6aab341e0   Linus Torvalds   mm: re-architect ...
1868
  {
0abdd7a81   Dan Williams   dma-debug: introd...
1869
  	debug_dma_assert_idle(src);
6aab341e0   Linus Torvalds   mm: re-architect ...
1870
1871
1872
1873
1874
1875
1876
  	/*
  	 * If the source page was a PFN mapping, we don't have
  	 * a "struct page" for it. We do a best-effort copy by
  	 * just copying from the original user address. If that
  	 * fails, we just zero-fill it. Live with it.
  	 */
  	if (unlikely(!src)) {
9b04c5fec   Cong Wang   mm: remove the se...
1877
  		void *kaddr = kmap_atomic(dst);
5d2a2dbbc   Linus Torvalds   cow_user_page: fi...
1878
1879
1880
1881
1882
1883
1884
1885
1886
  		void __user *uaddr = (void __user *)(va & PAGE_MASK);
  
  		/*
  		 * This really shouldn't fail, because the page is there
  		 * in the page tables. But it might just be unreadable,
  		 * in which case we just give up and fill the result with
  		 * zeroes.
  		 */
  		if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE))
3ecb01df3   Jan Beulich   use clear_page()/...
1887
  			clear_page(kaddr);
9b04c5fec   Cong Wang   mm: remove the se...
1888
  		kunmap_atomic(kaddr);
c4ec7b0de   Dmitriy Monakhov   [PATCH] mm: D-cac...
1889
  		flush_dcache_page(dst);
0ed361dec   Nick Piggin   mm: fix PageUptod...
1890
1891
  	} else
  		copy_user_highpage(dst, src, va, vma);
6aab341e0   Linus Torvalds   mm: re-architect ...
1892
  }
c20cd45eb   Michal Hocko   mm: allow GFP_{FS...
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
  static gfp_t __get_fault_gfp_mask(struct vm_area_struct *vma)
  {
  	struct file *vm_file = vma->vm_file;
  
  	if (vm_file)
  		return mapping_gfp_mask(vm_file->f_mapping) | __GFP_FS | __GFP_IO;
  
  	/*
  	 * Special mappings (e.g. VDSO) do not have any file so fake
  	 * a default GFP_KERNEL for them.
  	 */
  	return GFP_KERNEL;
  }
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1906
  /*
fb09a4642   Kirill A. Shutemov   mm: consolidate c...
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
   * Notify the address space that the page is about to become writable so that
   * it can prohibit this or wait for the page to get into an appropriate state.
   *
   * We do this without the lock held, so that it can sleep if it needs to.
   */
  static int do_page_mkwrite(struct vm_area_struct *vma, struct page *page,
  	       unsigned long address)
  {
  	struct vm_fault vmf;
  	int ret;
  
  	vmf.virtual_address = (void __user *)(address & PAGE_MASK);
  	vmf.pgoff = page->index;
  	vmf.flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE;
c20cd45eb   Michal Hocko   mm: allow GFP_{FS...
1921
  	vmf.gfp_mask = __get_fault_gfp_mask(vma);
fb09a4642   Kirill A. Shutemov   mm: consolidate c...
1922
  	vmf.page = page;
2e4cdab05   Matthew Wilcox   mm: allow page fa...
1923
  	vmf.cow_page = NULL;
fb09a4642   Kirill A. Shutemov   mm: consolidate c...
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
  
  	ret = vma->vm_ops->page_mkwrite(vma, &vmf);
  	if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))
  		return ret;
  	if (unlikely(!(ret & VM_FAULT_LOCKED))) {
  		lock_page(page);
  		if (!page->mapping) {
  			unlock_page(page);
  			return 0; /* retry */
  		}
  		ret |= VM_FAULT_LOCKED;
  	} else
  		VM_BUG_ON_PAGE(!PageLocked(page), page);
  	return ret;
  }
  
  /*
4e047f897   Shachar Raindel   mm: refactor do_w...
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
   * Handle write page faults for pages that can be reused in the current vma
   *
   * This can happen either due to the mapping having the VM_SHARED flag set,
   * or due to us being the last reference standing to the page. In either
   * case, all we need to do here is to mark the page as writable and update
   * any related book-keeping.
   */
  static inline int wp_page_reuse(struct mm_struct *mm,
  			struct vm_area_struct *vma, unsigned long address,
  			pte_t *page_table, spinlock_t *ptl, pte_t orig_pte,
  			struct page *page, int page_mkwrite,
  			int dirty_shared)
  	__releases(ptl)
  {
  	pte_t entry;
  	/*
  	 * Clear the page's cpupid information as the existing
  	 * information potentially belongs to a now completely
  	 * unrelated process.
  	 */
  	if (page)
  		page_cpupid_xchg_last(page, (1 << LAST_CPUPID_SHIFT) - 1);
  
  	flush_cache_page(vma, address, pte_pfn(orig_pte));
  	entry = pte_mkyoung(orig_pte);
  	entry = maybe_mkwrite(pte_mkdirty(entry), vma);
  	if (ptep_set_access_flags(vma, address, page_table, entry, 1))
  		update_mmu_cache(vma, address, page_table);
  	pte_unmap_unlock(page_table, ptl);
  
  	if (dirty_shared) {
  		struct address_space *mapping;
  		int dirtied;
  
  		if (!page_mkwrite)
  			lock_page(page);
  
  		dirtied = set_page_dirty(page);
  		VM_BUG_ON_PAGE(PageAnon(page), page);
  		mapping = page->mapping;
  		unlock_page(page);
09cbfeaf1   Kirill A. Shutemov   mm, fs: get rid o...
1982
  		put_page(page);
4e047f897   Shachar Raindel   mm: refactor do_w...
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
  
  		if ((dirtied || page_mkwrite) && mapping) {
  			/*
  			 * Some device drivers do not set page.mapping
  			 * but still dirty their pages
  			 */
  			balance_dirty_pages_ratelimited(mapping);
  		}
  
  		if (!page_mkwrite)
  			file_update_time(vma->vm_file);
  	}
  
  	return VM_FAULT_WRITE;
  }
  
  /*
2f38ab2c3   Shachar Raindel   mm: refactor do_w...
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
   * Handle the case of a page which we actually need to copy to a new page.
   *
   * Called with mmap_sem locked and the old page referenced, but
   * without the ptl held.
   *
   * High level logic flow:
   *
   * - Allocate a page, copy the content of the old page to the new one.
   * - Handle bookkeeping and accounting - cgroups, mmu-notifiers, etc.
   * - Take the PTL. If the pte changed, bail out and release the allocated page
   * - If the pte is still the way we remember it, update the page table and all
   *   relevant references. This includes dropping the reference the page-table
   *   held to the old page, as well as updating the rmap.
   * - In any case, unlock the PTL and drop the reference we took to the old page.
   */
  static int wp_page_copy(struct mm_struct *mm, struct vm_area_struct *vma,
  			unsigned long address, pte_t *page_table, pmd_t *pmd,
  			pte_t orig_pte, struct page *old_page)
  {
  	struct page *new_page = NULL;
  	spinlock_t *ptl = NULL;
  	pte_t entry;
  	int page_copied = 0;
  	const unsigned long mmun_start = address & PAGE_MASK;	/* For mmu_notifiers */
  	const unsigned long mmun_end = mmun_start + PAGE_SIZE;	/* For mmu_notifiers */
  	struct mem_cgroup *memcg;
  
  	if (unlikely(anon_vma_prepare(vma)))
  		goto oom;
  
  	if (is_zero_pfn(pte_pfn(orig_pte))) {
  		new_page = alloc_zeroed_user_highpage_movable(vma, address);
  		if (!new_page)
  			goto oom;
  	} else {
  		new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
  		if (!new_page)
  			goto oom;
  		cow_user_page(new_page, old_page, address, vma);
  	}
2f38ab2c3   Shachar Raindel   mm: refactor do_w...
2040

f627c2f53   Kirill A. Shutemov   memcg: adjust to ...
2041
  	if (mem_cgroup_try_charge(new_page, mm, GFP_KERNEL, &memcg, false))
2f38ab2c3   Shachar Raindel   mm: refactor do_w...
2042
  		goto oom_free_new;
eb3c24f30   Mel Gorman   mm, memcg: Try ch...
2043
  	__SetPageUptodate(new_page);
2f38ab2c3   Shachar Raindel   mm: refactor do_w...
2044
2045
2046
2047
2048
2049
2050
2051
2052
  	mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
  
  	/*
  	 * Re-check the pte - we dropped the lock
  	 */
  	page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
  	if (likely(pte_same(*page_table, orig_pte))) {
  		if (old_page) {
  			if (!PageAnon(old_page)) {
eca56ff90   Jerome Marchand   mm, shmem: add in...
2053
2054
  				dec_mm_counter_fast(mm,
  						mm_counter_file(old_page));
2f38ab2c3   Shachar Raindel   mm: refactor do_w...
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
  				inc_mm_counter_fast(mm, MM_ANONPAGES);
  			}
  		} else {
  			inc_mm_counter_fast(mm, MM_ANONPAGES);
  		}
  		flush_cache_page(vma, address, pte_pfn(orig_pte));
  		entry = mk_pte(new_page, vma->vm_page_prot);
  		entry = maybe_mkwrite(pte_mkdirty(entry), vma);
  		/*
  		 * Clear the pte entry and flush it first, before updating the
  		 * pte with the new entry. This will avoid a race condition
  		 * seen in the presence of one thread doing SMC and another
  		 * thread doing COW.
  		 */
  		ptep_clear_flush_notify(vma, address, page_table);
d281ee614   Kirill A. Shutemov   rmap: add argumen...
2070
  		page_add_new_anon_rmap(new_page, vma, address, false);
f627c2f53   Kirill A. Shutemov   memcg: adjust to ...
2071
  		mem_cgroup_commit_charge(new_page, memcg, false, false);
2f38ab2c3   Shachar Raindel   mm: refactor do_w...
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
  		lru_cache_add_active_or_unevictable(new_page, vma);
  		/*
  		 * We call the notify macro here because, when using secondary
  		 * mmu page tables (such as kvm shadow page tables), we want the
  		 * new page to be mapped directly into the secondary page table.
  		 */
  		set_pte_at_notify(mm, address, page_table, entry);
  		update_mmu_cache(vma, address, page_table);
  		if (old_page) {
  			/*
  			 * Only after switching the pte to the new page may
  			 * we remove the mapcount here. Otherwise another
  			 * process may come and find the rmap count decremented
  			 * before the pte is switched to the new page, and
  			 * "reuse" the old page writing into it while our pte
  			 * here still points into it and can be read by other
  			 * threads.
  			 *
  			 * The critical issue is to order this
  			 * page_remove_rmap with the ptep_clear_flush above.
  			 * Those stores are ordered by (if nothing else,)
  			 * the barrier present in the atomic_add_negative
  			 * in page_remove_rmap.
  			 *
  			 * Then the TLB flush in ptep_clear_flush ensures that
  			 * no process can access the old page before the
  			 * decremented mapcount is visible. And the old page
  			 * cannot be reused until after the decremented
  			 * mapcount is visible. So transitively, TLBs to
  			 * old page will be flushed before it can be reused.
  			 */
d281ee614   Kirill A. Shutemov   rmap: add argumen...
2103
  			page_remove_rmap(old_page, false);
2f38ab2c3   Shachar Raindel   mm: refactor do_w...
2104
2105
2106
2107
2108
2109
  		}
  
  		/* Free the old page.. */
  		new_page = old_page;
  		page_copied = 1;
  	} else {
f627c2f53   Kirill A. Shutemov   memcg: adjust to ...
2110
  		mem_cgroup_cancel_charge(new_page, memcg, false);
2f38ab2c3   Shachar Raindel   mm: refactor do_w...
2111
2112
2113
  	}
  
  	if (new_page)
09cbfeaf1   Kirill A. Shutemov   mm, fs: get rid o...
2114
  		put_page(new_page);
2f38ab2c3   Shachar Raindel   mm: refactor do_w...
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
  
  	pte_unmap_unlock(page_table, ptl);
  	mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
  	if (old_page) {
  		/*
  		 * Don't let another task, with possibly unlocked vma,
  		 * keep the mlocked page.
  		 */
  		if (page_copied && (vma->vm_flags & VM_LOCKED)) {
  			lock_page(old_page);	/* LRU manipulation */
e90309c9f   Kirill A. Shutemov   thp: allow mlocke...
2125
2126
  			if (PageMlocked(old_page))
  				munlock_vma_page(old_page);
2f38ab2c3   Shachar Raindel   mm: refactor do_w...
2127
2128
  			unlock_page(old_page);
  		}
09cbfeaf1   Kirill A. Shutemov   mm, fs: get rid o...
2129
  		put_page(old_page);
2f38ab2c3   Shachar Raindel   mm: refactor do_w...
2130
2131
2132
  	}
  	return page_copied ? VM_FAULT_WRITE : 0;
  oom_free_new:
09cbfeaf1   Kirill A. Shutemov   mm, fs: get rid o...
2133
  	put_page(new_page);
2f38ab2c3   Shachar Raindel   mm: refactor do_w...
2134
2135
  oom:
  	if (old_page)
09cbfeaf1   Kirill A. Shutemov   mm, fs: get rid o...
2136
  		put_page(old_page);
2f38ab2c3   Shachar Raindel   mm: refactor do_w...
2137
2138
  	return VM_FAULT_OOM;
  }
dd9061846   Boaz Harrosh   mm: new pfn_mkwri...
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
  /*
   * Handle write page faults for VM_MIXEDMAP or VM_PFNMAP for a VM_SHARED
   * mapping
   */
  static int wp_pfn_shared(struct mm_struct *mm,
  			struct vm_area_struct *vma, unsigned long address,
  			pte_t *page_table, spinlock_t *ptl, pte_t orig_pte,
  			pmd_t *pmd)
  {
  	if (vma->vm_ops && vma->vm_ops->pfn_mkwrite) {
  		struct vm_fault vmf = {
  			.page = NULL,
  			.pgoff = linear_page_index(vma, address),
  			.virtual_address = (void __user *)(address & PAGE_MASK),
  			.flags = FAULT_FLAG_WRITE | FAULT_FLAG_MKWRITE,
  		};
  		int ret;
  
  		pte_unmap_unlock(page_table, ptl);
  		ret = vma->vm_ops->pfn_mkwrite(vma, &vmf);
  		if (ret & VM_FAULT_ERROR)
  			return ret;
  		page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
  		/*
  		 * We might have raced with another page fault while we
  		 * released the pte_offset_map_lock.
  		 */
  		if (!pte_same(*page_table, orig_pte)) {
  			pte_unmap_unlock(page_table, ptl);
  			return 0;
  		}
  	}
  	return wp_page_reuse(mm, vma, address, page_table, ptl, orig_pte,
  			     NULL, 0, 0);
  }
93e478d4c   Shachar Raindel   mm: refactor do_w...
2174
2175
2176
2177
2178
2179
2180
  static int wp_page_shared(struct mm_struct *mm, struct vm_area_struct *vma,
  			  unsigned long address, pte_t *page_table,
  			  pmd_t *pmd, spinlock_t *ptl, pte_t orig_pte,
  			  struct page *old_page)
  	__releases(ptl)
  {
  	int page_mkwrite = 0;
	get_page(old_page);

	if (vma->vm_ops && vma->vm_ops->page_mkwrite) {
		int tmp;

		pte_unmap_unlock(page_table, ptl);
		tmp = do_page_mkwrite(vma, old_page, address);
		if (unlikely(!tmp || (tmp &
				      (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) {
			put_page(old_page);
			return tmp;
		}
		/*
		 * Since we dropped the lock we need to revalidate
		 * the PTE as someone else may have changed it.  If
		 * they did, we just return, as we can count on the
		 * MMU to tell us if they didn't also make it writable.
		 */
		page_table = pte_offset_map_lock(mm, pmd, address,
						 &ptl);
		if (!pte_same(*page_table, orig_pte)) {
			unlock_page(old_page);
			pte_unmap_unlock(page_table, ptl);
			put_page(old_page);
			return 0;
		}
		page_mkwrite = 1;
	}

	return wp_page_reuse(mm, vma, address, page_table, ptl,
			     orig_pte, old_page, page_mkwrite, 1);
}
/*
 * This routine handles present pages, when users try to write
 * to a shared page. It is done by copying the page to a new address
 * and decrementing the shared-page counter for the old page.
 *
 * Note that this routine assumes that the protection checks have been
 * done by the caller (the low-level page fault routine in most cases).
 * Thus we can safely just mark it writable once we've done any necessary
 * COW.
 *
 * We also mark the page dirty at this point even though the page will
 * change only once the write actually happens. This avoids a few races,
 * and potentially makes it more efficient.
 *
 * We enter with non-exclusive mmap_sem (to exclude vma changes,
 * but allow concurrent faults), with pte both mapped and locked.
 * We return with mmap_sem still held, but pte unmapped and unlocked.
 */
static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
		unsigned long address, pte_t *page_table, pmd_t *pmd,
		spinlock_t *ptl, pte_t orig_pte)
	__releases(ptl)
{
	struct page *old_page;

	old_page = vm_normal_page(vma, address, orig_pte);
	if (!old_page) {
		/*
		 * VM_MIXEDMAP !pfn_valid() case, or VM_SOFTDIRTY clear on a
		 * VM_PFNMAP VMA.
		 *
		 * We should not cow pages in a shared writeable mapping.
		 * Just mark the pages writable and/or call ops->pfn_mkwrite.
		 */
		if ((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
				     (VM_WRITE|VM_SHARED))
			return wp_pfn_shared(mm, vma, address, page_table, ptl,
					     orig_pte, pmd);

		pte_unmap_unlock(page_table, ptl);
		return wp_page_copy(mm, vma, address, page_table, pmd,
				    orig_pte, old_page);
	}

	/*
	 * Take out anonymous pages first, anonymous shared vmas are
	 * not dirty accountable.
	 */
	if (PageAnon(old_page) && !PageKsm(old_page)) {
		if (!trylock_page(old_page)) {
			get_page(old_page);
			pte_unmap_unlock(page_table, ptl);
			lock_page(old_page);
			page_table = pte_offset_map_lock(mm, pmd, address,
							 &ptl);
			if (!pte_same(*page_table, orig_pte)) {
				unlock_page(old_page);
				pte_unmap_unlock(page_table, ptl);
				put_page(old_page);
				return 0;
			}
			put_page(old_page);
		}
		if (reuse_swap_page(old_page)) {
			/*
			 * The page is all ours.  Move it to our anon_vma so
			 * the rmap code will not search our parent or siblings.
			 * Protected against the rmap code by the page lock.
			 */
			page_move_anon_rmap(old_page, vma, address);
			unlock_page(old_page);
			return wp_page_reuse(mm, vma, address, page_table, ptl,
					     orig_pte, old_page, 0, 0);
		}
		unlock_page(old_page);
	} else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
					(VM_WRITE|VM_SHARED))) {
		return wp_page_shared(mm, vma, address, page_table, pmd,
				      ptl, orig_pte, old_page);
	}

	/*
	 * Ok, we need to copy. Oh, well..
	 */
	get_page(old_page);

	pte_unmap_unlock(page_table, ptl);
	return wp_page_copy(mm, vma, address, page_table, pmd,
			    orig_pte, old_page);
}
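/*
 * Worked example (descriptive only): after fork(), parent and child share
 * every anonymous page read-only.  The first write by either task traps
 * into do_wp_page(); if the other task has meanwhile exited or unmapped the
 * page, reuse_swap_page() sees a sole owner and the pte is simply made
 * writable via wp_page_reuse(), otherwise wp_page_copy() allocates a new
 * page and copies the contents (the actual COW).
 */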
static void unmap_mapping_range_vma(struct vm_area_struct *vma,
		unsigned long start_addr, unsigned long end_addr,
		struct zap_details *details)
{
	zap_page_range_single(vma, start_addr, end_addr - start_addr, details);
}

static inline void unmap_mapping_range_tree(struct rb_root *root,
					    struct zap_details *details)
{
	struct vm_area_struct *vma;
	pgoff_t vba, vea, zba, zea;

	vma_interval_tree_foreach(vma, root,
			details->first_index, details->last_index) {

		vba = vma->vm_pgoff;
		vea = vba + vma_pages(vma) - 1;
		zba = details->first_index;
		if (zba < vba)
			zba = vba;
		zea = details->last_index;
		if (zea > vea)
			zea = vea;
		unmap_mapping_range_vma(vma,
			((zba - vba) << PAGE_SHIFT) + vma->vm_start,
			((zea - vba + 1) << PAGE_SHIFT) + vma->vm_start,
				details);
	}
}

/**
 * unmap_mapping_range - unmap the portion of all mmaps in the specified
 * address_space corresponding to the specified page range in the underlying
 * file.
 *
 * @mapping: the address space containing mmaps to be unmapped.
 * @holebegin: byte in first page to unmap, relative to the start of
 * the underlying file.  This will be rounded down to a PAGE_SIZE
 * boundary.  Note that this is different from truncate_pagecache(), which
 * must keep the partial page.  In contrast, we must get rid of
 * partial pages.
 * @holelen: size of prospective hole in bytes.  This will be rounded
 * up to a PAGE_SIZE boundary.  A holelen of zero truncates to the
 * end of the file.
 * @even_cows: 1 when truncating a file, unmap even private COWed pages;
 * but 0 when invalidating pagecache, don't throw away private data.
 */
void unmap_mapping_range(struct address_space *mapping,
		loff_t const holebegin, loff_t const holelen, int even_cows)
{
	struct zap_details details = { };
	pgoff_t hba = holebegin >> PAGE_SHIFT;
	pgoff_t hlen = (holelen + PAGE_SIZE - 1) >> PAGE_SHIFT;

	/* Check for overflow. */
	if (sizeof(holelen) > sizeof(hlen)) {
		long long holeend =
			(holebegin + holelen + PAGE_SIZE - 1) >> PAGE_SHIFT;
		if (holeend & ~(long long)ULONG_MAX)
			hlen = ULONG_MAX - hba + 1;
	}

	details.check_mapping = even_cows? NULL: mapping;
	details.first_index = hba;
	details.last_index = hba + hlen - 1;
	if (details.last_index < details.first_index)
		details.last_index = ULONG_MAX;

	/* DAX uses i_mmap_lock to serialise file truncate vs page fault */
	i_mmap_lock_write(mapping);
	if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap)))
		unmap_mapping_range_tree(&mapping->i_mmap, &details);
	i_mmap_unlock_write(mapping);
}
EXPORT_SYMBOL(unmap_mapping_range);
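/*
 * Usage sketch (illustrative, not part of this file): this mirrors what
 * truncate_pagecache() does when an inode shrinks: unmap the dying range,
 * drop the page cache, then unmap again to catch pages that were COWed or
 * re-faulted while truncation was in progress:
 *
 *	unmap_mapping_range(mapping, holebegin, 0, 1);
 *	truncate_inode_pages(mapping, newsize);
 *	unmap_mapping_range(mapping, holebegin, 0, 1);
 *
 * A holelen of 0 means "to end of file", and even_cows == 1 discards
 * private COWed copies as well, which is what truncation requires.
 */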
/*
 * We enter with non-exclusive mmap_sem (to exclude vma changes,
 * but allow concurrent faults), and pte mapped but not yet locked.
 * We return with pte unmapped and unlocked.
 *
 * We return with the mmap_sem locked or unlocked in the same cases
 * as does filemap_fault().
 */
static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
		unsigned long address, pte_t *page_table, pmd_t *pmd,
		unsigned int flags, pte_t orig_pte)
{
	spinlock_t *ptl;
	struct page *page, *swapcache;
	struct mem_cgroup *memcg;
	swp_entry_t entry;
	pte_t pte;
	int locked;
	int exclusive = 0;
	int ret = 0;

	if (!pte_unmap_same(mm, pmd, page_table, orig_pte))
		goto out;

	entry = pte_to_swp_entry(orig_pte);
	if (unlikely(non_swap_entry(entry))) {
		if (is_migration_entry(entry)) {
			migration_entry_wait(mm, pmd, address);
		} else if (is_hwpoison_entry(entry)) {
			ret = VM_FAULT_HWPOISON;
		} else {
			print_bad_pte(vma, address, orig_pte, NULL);
			ret = VM_FAULT_SIGBUS;
		}
		goto out;
	}
	delayacct_set_flag(DELAYACCT_PF_SWAPIN);
	page = lookup_swap_cache(entry);
	if (!page) {
		page = swapin_readahead(entry,
					GFP_HIGHUSER_MOVABLE, vma, address);
		if (!page) {
			/*
			 * Back out if somebody else faulted in this pte
			 * while we released the pte lock.
			 */
			page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
			if (likely(pte_same(*page_table, orig_pte)))
				ret = VM_FAULT_OOM;
			delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
			goto unlock;
		}

		/* Had to read the page from swap area: Major fault */
		ret = VM_FAULT_MAJOR;
		count_vm_event(PGMAJFAULT);
		mem_cgroup_count_vm_event(mm, PGMAJFAULT);
	} else if (PageHWPoison(page)) {
		/*
		 * hwpoisoned dirty swapcache pages are kept for killing
		 * owner processes (which may be unknown at hwpoison time)
		 */
		ret = VM_FAULT_HWPOISON;
		delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
		swapcache = page;
		goto out_release;
	}

	swapcache = page;
	locked = lock_page_or_retry(page, mm, flags);

	delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
	if (!locked) {
		ret |= VM_FAULT_RETRY;
		goto out_release;
	}

	/*
	 * Make sure try_to_free_swap or reuse_swap_page or swapoff did not
	 * release the swapcache from under us.  The page pin, and pte_same
	 * test below, are not enough to exclude that.  Even if it is still
	 * swapcache, we need to check that the page's swap has not changed.
	 */
	if (unlikely(!PageSwapCache(page) || page_private(page) != entry.val))
		goto out_page;

	page = ksm_might_need_to_copy(page, vma, address);
	if (unlikely(!page)) {
		ret = VM_FAULT_OOM;
		page = swapcache;
		goto out_page;
	}

	if (mem_cgroup_try_charge(page, mm, GFP_KERNEL, &memcg, false)) {
		ret = VM_FAULT_OOM;
		goto out_page;
	}

	/*
	 * Back out if somebody else already faulted in this pte.
	 */
	page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
	if (unlikely(!pte_same(*page_table, orig_pte)))
		goto out_nomap;

	if (unlikely(!PageUptodate(page))) {
		ret = VM_FAULT_SIGBUS;
		goto out_nomap;
	}

	/*
	 * The page isn't present yet, go ahead with the fault.
	 *
	 * Be careful about the sequence of operations here.
	 * To get its accounting right, reuse_swap_page() must be called
	 * while the page is counted on swap but not yet in mapcount i.e.
	 * before page_add_anon_rmap() and swap_free(); try_to_free_swap()
	 * must be called after the swap_free(), or it will never succeed.
	 */

	inc_mm_counter_fast(mm, MM_ANONPAGES);
	dec_mm_counter_fast(mm, MM_SWAPENTS);
	pte = mk_pte(page, vma->vm_page_prot);
	if ((flags & FAULT_FLAG_WRITE) && reuse_swap_page(page)) {
		pte = maybe_mkwrite(pte_mkdirty(pte), vma);
		flags &= ~FAULT_FLAG_WRITE;
		ret |= VM_FAULT_WRITE;
		exclusive = RMAP_EXCLUSIVE;
	}
	flush_icache_page(vma, page);
	if (pte_swp_soft_dirty(orig_pte))
		pte = pte_mksoft_dirty(pte);
	set_pte_at(mm, address, page_table, pte);
	if (page == swapcache) {
		do_page_add_anon_rmap(page, vma, address, exclusive);
		mem_cgroup_commit_charge(page, memcg, true, false);
	} else { /* ksm created a completely new copy */
		page_add_new_anon_rmap(page, vma, address, false);
		mem_cgroup_commit_charge(page, memcg, false, false);
		lru_cache_add_active_or_unevictable(page, vma);
	}

	swap_free(entry);
	if (mem_cgroup_swap_full(page) ||
	    (vma->vm_flags & VM_LOCKED) || PageMlocked(page))
		try_to_free_swap(page);
	unlock_page(page);
	if (page != swapcache) {
		/*
		 * Hold the lock to keep the swap entry from being reused
		 * until we take the PT lock for the pte_same() check
		 * (to avoid false positives from pte_same). For
		 * further safety release the lock after the swap_free
		 * so that the swap count won't change under a
		 * parallel locked swapcache.
		 */
		unlock_page(swapcache);
		put_page(swapcache);
	}

	if (flags & FAULT_FLAG_WRITE) {
		ret |= do_wp_page(mm, vma, address, page_table, pmd, ptl, pte);
		if (ret & VM_FAULT_ERROR)
			ret &= VM_FAULT_ERROR;
		goto out;
	}

	/* No need to invalidate - it was non-present before */
	update_mmu_cache(vma, address, page_table);
unlock:
	pte_unmap_unlock(page_table, ptl);
out:
	return ret;
out_nomap:
	mem_cgroup_cancel_charge(page, memcg, false);
	pte_unmap_unlock(page_table, ptl);
out_page:
	unlock_page(page);
out_release:
	put_page(page);
	if (page != swapcache) {
		unlock_page(swapcache);
		put_page(swapcache);
	}
	return ret;
  }
  
  /*
 * This is like a special single-page "expand_{down|up}wards()",
 * except we must first make sure that 'address{-|+}PAGE_SIZE'
 * doesn't hit another vma.
 */
static inline int check_stack_guard_page(struct vm_area_struct *vma, unsigned long address)
{
	address &= PAGE_MASK;
	if ((vma->vm_flags & VM_GROWSDOWN) && address == vma->vm_start) {
		struct vm_area_struct *prev = vma->vm_prev;

		/*
		 * Is there a mapping abutting this one below?
		 *
		 * That's only ok if it's the same stack mapping
		 * that has gotten split..
		 */
		if (prev && prev->vm_end == address)
			return prev->vm_flags & VM_GROWSDOWN ? 0 : -ENOMEM;

		return expand_downwards(vma, address - PAGE_SIZE);
	}
	if ((vma->vm_flags & VM_GROWSUP) && address + PAGE_SIZE == vma->vm_end) {
		struct vm_area_struct *next = vma->vm_next;

		/* As VM_GROWSDOWN but s/below/above/ */
		if (next && next->vm_start == address + PAGE_SIZE)
			return next->vm_flags & VM_GROWSUP ? 0 : -ENOMEM;

		return expand_upwards(vma, address + PAGE_SIZE);
	}
  	return 0;
  }
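/*
 * Example (descriptive only): for an ordinary downward-growing stack, a
 * fault on the page at vma->vm_start asks expand_downwards() to grow the
 * vma one page lower, so the newly lowest page becomes the next guard.  If
 * an unrelated mapping ends exactly at that address, -ENOMEM is returned
 * and do_anonymous_page() below turns it into VM_FAULT_SIGSEGV.
 */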
  
  /*
 * We enter with non-exclusive mmap_sem (to exclude vma changes,
 * but allow concurrent faults), and pte mapped but not yet locked.
 * We return with mmap_sem still held, but pte unmapped and unlocked.
 */
static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
		unsigned long address, pte_t *page_table, pmd_t *pmd,
		unsigned int flags)
{
	struct mem_cgroup *memcg;
	struct page *page;
	spinlock_t *ptl;
	pte_t entry;

	pte_unmap(page_table);

	/* File mapping without ->vm_ops ? */
	if (vma->vm_flags & VM_SHARED)
		return VM_FAULT_SIGBUS;

	/* Check if we need to add a guard page to the stack */
	if (check_stack_guard_page(vma, address) < 0)
		return VM_FAULT_SIGSEGV;

	/* Use the zero-page for reads */
	if (!(flags & FAULT_FLAG_WRITE) && !mm_forbids_zeropage(mm)) {
		entry = pte_mkspecial(pfn_pte(my_zero_pfn(address),
						vma->vm_page_prot));
		page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
		if (!pte_none(*page_table))
			goto unlock;
		/* Deliver the page fault to userland, check inside PT lock */
		if (userfaultfd_missing(vma)) {
			pte_unmap_unlock(page_table, ptl);
			return handle_userfault(vma, address, flags,
						VM_UFFD_MISSING);
		}
		goto setpte;
	}

	/* Allocate our own private page. */
	if (unlikely(anon_vma_prepare(vma)))
		goto oom;
	page = alloc_zeroed_user_highpage_movable(vma, address);
	if (!page)
		goto oom;

	if (mem_cgroup_try_charge(page, mm, GFP_KERNEL, &memcg, false))
		goto oom_free_page;

	/*
	 * The memory barrier inside __SetPageUptodate makes sure that
	 * preceding stores to the page contents become visible before
	 * the set_pte_at() write.
	 */
	__SetPageUptodate(page);

	entry = mk_pte(page, vma->vm_page_prot);
	if (vma->vm_flags & VM_WRITE)
		entry = pte_mkwrite(pte_mkdirty(entry));

	page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
	if (!pte_none(*page_table))
		goto release;

	/* Deliver the page fault to userland, check inside PT lock */
	if (userfaultfd_missing(vma)) {
		pte_unmap_unlock(page_table, ptl);
		mem_cgroup_cancel_charge(page, memcg, false);
		put_page(page);
		return handle_userfault(vma, address, flags,
					VM_UFFD_MISSING);
	}

	inc_mm_counter_fast(mm, MM_ANONPAGES);
	page_add_new_anon_rmap(page, vma, address, false);
	mem_cgroup_commit_charge(page, memcg, false, false);
	lru_cache_add_active_or_unevictable(page, vma);
setpte:
	set_pte_at(mm, address, page_table, entry);

	/* No need to invalidate - it was non-present before */
	update_mmu_cache(vma, address, page_table);
unlock:
	pte_unmap_unlock(page_table, ptl);
	return 0;
release:
	mem_cgroup_cancel_charge(page, memcg, false);
	put_page(page);
	goto unlock;
oom_free_page:
	put_page(page);
oom:
	return VM_FAULT_OOM;
}
/*
 * The mmap_sem must have been held on entry, and may have been
 * released depending on flags and vma->vm_ops->fault() return value.
 * See filemap_fault() and __lock_page_or_retry().
 */
static int __do_fault(struct vm_area_struct *vma, unsigned long address,
			pgoff_t pgoff, unsigned int flags,
			struct page *cow_page, struct page **page)
{
	struct vm_fault vmf;
	int ret;

	vmf.virtual_address = (void __user *)(address & PAGE_MASK);
	vmf.pgoff = pgoff;
	vmf.flags = flags;
	vmf.page = NULL;
	vmf.gfp_mask = __get_fault_gfp_mask(vma);
	vmf.cow_page = cow_page;

	ret = vma->vm_ops->fault(vma, &vmf);
	if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
		return ret;
	if (!vmf.page)
		goto out;

	if (unlikely(PageHWPoison(vmf.page))) {
		if (ret & VM_FAULT_LOCKED)
			unlock_page(vmf.page);
		put_page(vmf.page);
		return VM_FAULT_HWPOISON;
	}

	if (unlikely(!(ret & VM_FAULT_LOCKED)))
		lock_page(vmf.page);
	else
		VM_BUG_ON_PAGE(!PageLocked(vmf.page), vmf.page);

 out:
	*page = vmf.page;
	return ret;
}
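/*
 * Illustrative sketch (not from this file): __do_fault() is a thin wrapper
 * around vma->vm_ops->fault().  A minimal, hypothetical handler for this
 * kernel generation looks roughly like:
 *
 *	static int example_fault(struct vm_area_struct *vma,
 *				 struct vm_fault *vmf)
 *	{
 *		struct page *page;
 *
 *		page = ...find or create the page for vmf->pgoff...;
 *		if (!page)
 *			return VM_FAULT_OOM;
 *		lock_page(page);
 *		vmf->page = page;
 *		return VM_FAULT_LOCKED;
 *	}
 *
 * Returning VM_FAULT_LOCKED tells __do_fault() the page is already locked,
 * so it skips the lock_page() fallback above.
 */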
  /**
   * do_set_pte - setup new PTE entry for given page and add reverse page mapping.
   *
   * @vma: virtual memory area
   * @address: user virtual address
   * @page: page to map
   * @pte: pointer to target page table entry
   * @write: true, if new entry is writable
   * @anon: true, if it's anonymous page
   *
   * Caller must hold page table lock relevant for @pte.
   *
   * Target users are page handler itself and implementations of
   * vm_ops->map_pages.
   */
  void do_set_pte(struct vm_area_struct *vma, unsigned long address,
		struct page *page, pte_t *pte, bool write, bool anon)
{
	pte_t entry;

	flush_icache_page(vma, page);
	entry = mk_pte(page, vma->vm_page_prot);
	if (write)
		entry = maybe_mkwrite(pte_mkdirty(entry), vma);
	if (anon) {
		inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
		page_add_new_anon_rmap(page, vma, address, false);
	} else {
		inc_mm_counter_fast(vma->vm_mm, mm_counter_file(page));
		page_add_file_rmap(page);
	}
	set_pte_at(vma->vm_mm, address, pte, entry);

	/* no need to invalidate: a not-present page won't be cached */
	update_mmu_cache(vma, address, pte);
}
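/*
 * Usage note (hedged): apart from the fault handlers below, the generic
 * ->map_pages() implementation, filemap_map_pages(), is expected to land
 * here as well, installing read-only file ptes with
 * do_set_pte(vma, addr, page, pte, false, false) for each page it finds
 * up to date in the page cache.
 */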
static unsigned long fault_around_bytes __read_mostly =
	rounddown_pow_of_two(65536);

#ifdef CONFIG_DEBUG_FS
static int fault_around_bytes_get(void *data, u64 *val)
{
	*val = fault_around_bytes;
	return 0;
}

/*
 * fault_around_pages() and fault_around_mask() expect fault_around_bytes
 * rounded down to nearest page order. It's what do_fault_around() expects to
 * see.
 */
static int fault_around_bytes_set(void *data, u64 val)
{
	if (val / PAGE_SIZE > PTRS_PER_PTE)
		return -EINVAL;
	if (val > PAGE_SIZE)
		fault_around_bytes = rounddown_pow_of_two(val);
	else
		fault_around_bytes = PAGE_SIZE; /* rounddown_pow_of_two(0) is undefined */
	return 0;
}
DEFINE_SIMPLE_ATTRIBUTE(fault_around_bytes_fops,
		fault_around_bytes_get, fault_around_bytes_set, "%llu\n");

static int __init fault_around_debugfs(void)
{
	void *ret;

	ret = debugfs_create_file("fault_around_bytes", 0644, NULL, NULL,
			&fault_around_bytes_fops);
	if (!ret)
		pr_warn("Failed to create fault_around_bytes in debugfs");
	return 0;
}
late_initcall(fault_around_debugfs);
#endif
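/*
 * Tuning sketch (illustrative): with CONFIG_DEBUG_FS and debugfs mounted at
 * the usual location, the fault-around window can be changed at runtime:
 *
 *	echo 16384 > /sys/kernel/debug/fault_around_bytes
 *
 * fault_around_bytes_set() rounds the value down to a power of two, and a
 * value that rounds down to a single page effectively disables fault-around
 * (see the "fault_around_bytes >> PAGE_SHIFT > 1" check in do_read_fault()).
 */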

/*
 * do_fault_around() tries to map a few pages around the fault address. The
 * hope is that the pages will be needed soon and this will lower the number
 * of faults to handle.
 *
 * It uses vm_ops->map_pages() to map the pages, which skips the page if it's
 * not ready to be mapped: not up-to-date, locked, etc.
 *
 * This function is called with the page table lock taken. In the split ptlock
 * case the page table lock protects only those entries which belong to
 * the page table corresponding to the fault address.
 *
 * This function doesn't cross the VMA boundaries, in order to call map_pages()
 * only once.
 *
 * fault_around_pages() defines how many pages we'll try to map.
 * do_fault_around() expects it to return a power of two less than or equal to
 * PTRS_PER_PTE.
 *
 * The virtual address of the area that we map is naturally aligned to the
 * fault_around_pages() value (and therefore to page order).  This way it's
 * easier to guarantee that we don't cross page table boundaries.
 */
static void do_fault_around(struct vm_area_struct *vma, unsigned long address,
		pte_t *pte, pgoff_t pgoff, unsigned int flags)
{
	unsigned long start_addr, nr_pages, mask;
	pgoff_t max_pgoff;
	struct vm_fault vmf;
	int off;

	nr_pages = READ_ONCE(fault_around_bytes) >> PAGE_SHIFT;
	mask = ~(nr_pages * PAGE_SIZE - 1) & PAGE_MASK;

	start_addr = max(address & mask, vma->vm_start);
	off = ((address - start_addr) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1);
	pte -= off;
	pgoff -= off;

	/*
	 *  max_pgoff is either end of page table or end of vma
	 *  or fault_around_pages() from pgoff, depending on what is nearest.
	 */
	max_pgoff = pgoff - ((start_addr >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) +
		PTRS_PER_PTE - 1;
	max_pgoff = min3(max_pgoff, vma_pages(vma) + vma->vm_pgoff - 1,
			pgoff + nr_pages - 1);

	/* Check if it makes any sense to call ->map_pages */
	while (!pte_none(*pte)) {
		if (++pgoff > max_pgoff)
			return;
		start_addr += PAGE_SIZE;
		if (start_addr >= vma->vm_end)
			return;
		pte++;
	}

	vmf.virtual_address = (void __user *) start_addr;
	vmf.pte = pte;
	vmf.pgoff = pgoff;
	vmf.max_pgoff = max_pgoff;
	vmf.flags = flags;
	vmf.gfp_mask = __get_fault_gfp_mask(vma);
	vma->vm_ops->map_pages(vma, &vmf);
}
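/*
 * Worked example (illustrative, assuming 4K pages, PTRS_PER_PTE == 512 and
 * the default fault_around_bytes of 65536): nr_pages is 16 and mask is
 * ~0xffffUL & PAGE_MASK.  A fault at 0x7f1234567890 therefore gives
 * start_addr = 0x7f1234560000 (unless clamped to vma->vm_start) and
 * off = 7, so pte and pgoff are wound back seven entries and up to sixteen
 * pages starting at the aligned address are offered to ->map_pages().
 */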
static int do_read_fault(struct mm_struct *mm, struct vm_area_struct *vma,
		unsigned long address, pmd_t *pmd,
		pgoff_t pgoff, unsigned int flags, pte_t orig_pte)
{
	struct page *fault_page;
	spinlock_t *ptl;
	pte_t *pte;
	int ret = 0;

	/*
	 * Let's call ->map_pages() first and use ->fault() as fallback
	 * if page by the offset is not ready to be mapped (cold cache or
	 * something).
	 */
	if (vma->vm_ops->map_pages && fault_around_bytes >> PAGE_SHIFT > 1) {
		pte = pte_offset_map_lock(mm, pmd, address, &ptl);
		do_fault_around(vma, address, pte, pgoff, flags);
		if (!pte_same(*pte, orig_pte))
			goto unlock_out;
		pte_unmap_unlock(pte, ptl);
	}

	ret = __do_fault(vma, address, pgoff, flags, NULL, &fault_page);
	if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
		return ret;

	pte = pte_offset_map_lock(mm, pmd, address, &ptl);
	if (unlikely(!pte_same(*pte, orig_pte))) {
		pte_unmap_unlock(pte, ptl);
		unlock_page(fault_page);
		put_page(fault_page);
		return ret;
	}
	do_set_pte(vma, address, fault_page, pte, false, false);
	unlock_page(fault_page);
unlock_out:
	pte_unmap_unlock(pte, ptl);
	return ret;
}
static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma,
		unsigned long address, pmd_t *pmd,
		pgoff_t pgoff, unsigned int flags, pte_t orig_pte)
{
	struct page *fault_page, *new_page;
	struct mem_cgroup *memcg;
	spinlock_t *ptl;
	pte_t *pte;
	int ret;

	if (unlikely(anon_vma_prepare(vma)))
		return VM_FAULT_OOM;

	new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
	if (!new_page)
		return VM_FAULT_OOM;

	if (mem_cgroup_try_charge(new_page, mm, GFP_KERNEL, &memcg, false)) {
		put_page(new_page);
		return VM_FAULT_OOM;
	}

	ret = __do_fault(vma, address, pgoff, flags, new_page, &fault_page);
	if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
		goto uncharge_out;

	if (fault_page)
		copy_user_highpage(new_page, fault_page, address, vma);
	__SetPageUptodate(new_page);

	pte = pte_offset_map_lock(mm, pmd, address, &ptl);
	if (unlikely(!pte_same(*pte, orig_pte))) {
		pte_unmap_unlock(pte, ptl);
		if (fault_page) {
			unlock_page(fault_page);
			put_page(fault_page);
		} else {
			/*
			 * The fault handler has no page to lock, so it holds
			 * i_mmap_lock for read to protect against truncate.
			 */
			i_mmap_unlock_read(vma->vm_file->f_mapping);
		}
		goto uncharge_out;
	}
	do_set_pte(vma, address, new_page, pte, true, true);
	mem_cgroup_commit_charge(new_page, memcg, false, false);
	lru_cache_add_active_or_unevictable(new_page, vma);
	pte_unmap_unlock(pte, ptl);
	if (fault_page) {
		unlock_page(fault_page);
		put_page(fault_page);
	} else {
		/*
		 * The fault handler has no page to lock, so it holds
		 * i_mmap_lock for read to protect against truncate.
		 */
		i_mmap_unlock_read(vma->vm_file->f_mapping);
	}
	return ret;
uncharge_out:
	mem_cgroup_cancel_charge(new_page, memcg, false);
	put_page(new_page);
	return ret;
}
static int do_shared_fault(struct mm_struct *mm, struct vm_area_struct *vma,
		unsigned long address, pmd_t *pmd,
		pgoff_t pgoff, unsigned int flags, pte_t orig_pte)
{
	struct page *fault_page;
	struct address_space *mapping;
	spinlock_t *ptl;
	pte_t *pte;
	int dirtied = 0;
	int ret, tmp;

	ret = __do_fault(vma, address, pgoff, flags, NULL, &fault_page);
	if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
		return ret;

	/*
	 * Check if the backing address space wants to know that the page is
	 * about to become writable
	 */
	if (vma->vm_ops->page_mkwrite) {
		unlock_page(fault_page);
		tmp = do_page_mkwrite(vma, fault_page, address);
		if (unlikely(!tmp ||
				(tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) {
			put_page(fault_page);
			return tmp;
		}
	}

	pte = pte_offset_map_lock(mm, pmd, address, &ptl);
	if (unlikely(!pte_same(*pte, orig_pte))) {
		pte_unmap_unlock(pte, ptl);
		unlock_page(fault_page);
		put_page(fault_page);
		return ret;
	}
	do_set_pte(vma, address, fault_page, pte, true, false);
	pte_unmap_unlock(pte, ptl);

	if (set_page_dirty(fault_page))
		dirtied = 1;
	/*
	 * Take a local copy of the address_space - page.mapping may be zeroed
	 * by truncate after unlock_page().  The address_space itself remains
	 * pinned by vma->vm_file's reference.  We rely on unlock_page()'s
	 * release semantics to prevent the compiler from undoing this copying.
	 */
	mapping = page_rmapping(fault_page);
	unlock_page(fault_page);
	if ((dirtied || vma->vm_ops->page_mkwrite) && mapping) {
		/*
		 * Some device drivers do not set page.mapping but still
		 * dirty their pages
		 */
		balance_dirty_pages_ratelimited(mapping);
	}

	if (!vma->vm_ops->page_mkwrite)
		file_update_time(vma->vm_file);

	return ret;
}

/*
 * We enter with non-exclusive mmap_sem (to exclude vma changes,
 * but allow concurrent faults).
 * The mmap_sem may have been released depending on flags and our
 * return value.  See filemap_fault() and __lock_page_or_retry().
 */
static int do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
		unsigned long address, pte_t *page_table, pmd_t *pmd,
		unsigned int flags, pte_t orig_pte)
{
	pgoff_t pgoff = linear_page_index(vma, address);

	pte_unmap(page_table);
	/* The VMA was not fully populated on mmap() or missing VM_DONTEXPAND */
	if (!vma->vm_ops->fault)
		return VM_FAULT_SIGBUS;
	if (!(flags & FAULT_FLAG_WRITE))
		return do_read_fault(mm, vma, address, pmd, pgoff, flags,
				orig_pte);
	if (!(vma->vm_flags & VM_SHARED))
		return do_cow_fault(mm, vma, address, pmd, pgoff, flags,
				orig_pte);
	return do_shared_fault(mm, vma, address, pmd, pgoff, flags, orig_pte);
}
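/*
 * Dispatch summary (descriptive only): a read fault on a file mapping goes
 * to do_read_fault(), a write fault on a MAP_PRIVATE file mapping goes to
 * do_cow_fault() (a private copy; later writes hit do_wp_page()), and a
 * write fault on a MAP_SHARED file mapping goes to do_shared_fault(), which
 * may call ->page_mkwrite() and dirties the page.
 */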
static int numa_migrate_prep(struct page *page, struct vm_area_struct *vma,
				unsigned long addr, int page_nid,
				int *flags)
{
	get_page(page);

	count_vm_numa_event(NUMA_HINT_FAULTS);
	if (page_nid == numa_node_id()) {
		count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL);
		*flags |= TNF_FAULT_LOCAL;
	}

	return mpol_misplaced(page, vma, addr);
}

static int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
		   unsigned long addr, pte_t pte, pte_t *ptep, pmd_t *pmd)
{
	struct page *page = NULL;
	spinlock_t *ptl;
	int page_nid = -1;
	int last_cpupid;
	int target_nid;
	bool migrated = false;
	bool was_writable = pte_write(pte);
	int flags = 0;

	/* A PROT_NONE fault should not end up here */
	BUG_ON(!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)));

	/*
	 * The "pte" at this point cannot be used safely without
	 * validation through pte_unmap_same(). It's of NUMA type but
	 * the pfn may be screwed if the read is non atomic.
	 *
	 * We can safely just do a "set_pte_at()", because the old
	 * page table entry is not accessible, so there would be no
	 * concurrent hardware modifications to the PTE.
	 */
	ptl = pte_lockptr(mm, pmd);
	spin_lock(ptl);
	if (unlikely(!pte_same(*ptep, pte))) {
		pte_unmap_unlock(ptep, ptl);
		goto out;
	}

	/* Make it present again */
	pte = pte_modify(pte, vma->vm_page_prot);
	pte = pte_mkyoung(pte);
	if (was_writable)
		pte = pte_mkwrite(pte);
	set_pte_at(mm, addr, ptep, pte);
	update_mmu_cache(vma, addr, ptep);

	page = vm_normal_page(vma, addr, pte);
	if (!page) {
		pte_unmap_unlock(ptep, ptl);
		return 0;
	}

	/* TODO: handle PTE-mapped THP */
	if (PageCompound(page)) {
		pte_unmap_unlock(ptep, ptl);
		return 0;
	}

	/*
	 * Avoid grouping on RO pages in general. RO pages shouldn't hurt as
	 * much anyway since they can be in shared cache state. This misses
	 * the case where a mapping is writable but the process never writes
	 * to it but pte_write gets cleared during protection updates and
	 * pte_dirty has unpredictable behaviour between PTE scan updates,
	 * background writeback, dirty balancing and application behaviour.
	 */
	if (!(vma->vm_flags & VM_WRITE))
		flags |= TNF_NO_GROUP;

	/*
	 * Flag if the page is shared between multiple address spaces. This
	 * is later used when determining whether to group tasks together
	 */
	if (page_mapcount(page) > 1 && (vma->vm_flags & VM_SHARED))
		flags |= TNF_SHARED;

	last_cpupid = page_cpupid_last(page);
	page_nid = page_to_nid(page);
	target_nid = numa_migrate_prep(page, vma, addr, page_nid, &flags);
	pte_unmap_unlock(ptep, ptl);
	if (target_nid == -1) {
		put_page(page);
		goto out;
	}

	/* Migrate to the requested node */
	migrated = migrate_misplaced_page(page, vma, target_nid);
	if (migrated) {
		page_nid = target_nid;
		flags |= TNF_MIGRATED;
	} else
		flags |= TNF_MIGRATE_FAIL;

out:
	if (page_nid != -1)
		task_numa_fault(last_cpupid, page_nid, 1, flags);

	return 0;
}
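
/*
 * For context: the pte_protnone() entries handled above are normally created
 * by the NUMA balancing scanner, which periodically remaps ranges to
 * PROT_NONE so that the next access faults into do_numa_page().  A rough
 * sketch of that producer side, assuming a scanner-chosen vma and a
 * hypothetical [start, end) range (cf. task_numa_work()):
 *
 *	if (vma_migratable(vma))
 *		change_prot_numa(vma, start, end);
 */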

static int create_huge_pmd(struct mm_struct *mm, struct vm_area_struct *vma,
			unsigned long address, pmd_t *pmd, unsigned int flags)
{
	if (vma_is_anonymous(vma))
		return do_huge_pmd_anonymous_page(mm, vma, address, pmd, flags);
	if (vma->vm_ops->pmd_fault)
		return vma->vm_ops->pmd_fault(vma, address, pmd, flags);
	return VM_FAULT_FALLBACK;
}

static int wp_huge_pmd(struct mm_struct *mm, struct vm_area_struct *vma,
			unsigned long address, pmd_t *pmd, pmd_t orig_pmd,
			unsigned int flags)
{
	if (vma_is_anonymous(vma))
		return do_huge_pmd_wp_page(mm, vma, address, pmd, orig_pmd);
	if (vma->vm_ops->pmd_fault)
		return vma->vm_ops->pmd_fault(vma, address, pmd, flags);
	return VM_FAULT_FALLBACK;
}
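
/*
 * The ->pmd_fault() hook dispatched to above is supplied by file-backed
 * mappings that can service huge faults directly (DAX being the main user).
 * A rough sketch of the wiring, with foo_fault/foo_pmd_fault as hypothetical
 * callbacks:
 *
 *	static const struct vm_operations_struct foo_vm_ops = {
 *		.fault		= foo_fault,
 *		.pmd_fault	= foo_pmd_fault,
 *	};
 *
 * A vma whose vm_ops lack ->pmd_fault simply gets VM_FAULT_FALLBACK and is
 * handled with regular ptes.
 */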

/*
 * These routines also need to handle stuff like marking pages dirty
 * and/or accessed for architectures that don't do it in hardware (most
 * RISC architectures).  The early dirtying is also good on the i386.
 *
 * There is also a hook called "update_mmu_cache()" that architectures
 * with external mmu caches can use to update those (ie the Sparc or
 * PowerPC hashed page tables that act as extended TLBs).
 *
 * We enter with non-exclusive mmap_sem (to exclude vma changes,
 * but allow concurrent faults), and pte mapped but not yet locked.
 * We return with pte unmapped and unlocked.
 *
 * The mmap_sem may have been released depending on flags and our
 * return value.  See filemap_fault() and __lock_page_or_retry().
 */
static int handle_pte_fault(struct mm_struct *mm,
		     struct vm_area_struct *vma, unsigned long address,
		     pte_t *pte, pmd_t *pmd, unsigned int flags)
{
	pte_t entry;
	spinlock_t *ptl;

	/*
	 * some architectures can have larger ptes than wordsize,
	 * e.g. ppc44x-defconfig has CONFIG_PTE_64BIT=y and CONFIG_32BIT=y,
	 * so READ_ONCE or ACCESS_ONCE cannot guarantee atomic accesses.
	 * The code below just needs a consistent view for the ifs and
	 * we later double check anyway with the ptl lock held. So here
	 * a barrier will do.
	 */
	entry = *pte;
	barrier();

	if (!pte_present(entry)) {
		if (pte_none(entry)) {
			if (vma_is_anonymous(vma))
				return do_anonymous_page(mm, vma, address,
							 pte, pmd, flags);
			else
				return do_fault(mm, vma, address, pte, pmd,
						flags, entry);
		}
		return do_swap_page(mm, vma, address,
					pte, pmd, flags, entry);
	}

	if (pte_protnone(entry))
		return do_numa_page(mm, vma, address, entry, pte, pmd);

	ptl = pte_lockptr(mm, pmd);
	spin_lock(ptl);
	if (unlikely(!pte_same(*pte, entry)))
		goto unlock;
	if (flags & FAULT_FLAG_WRITE) {
		if (!pte_write(entry))
			return do_wp_page(mm, vma, address,
					pte, pmd, ptl, entry);
		entry = pte_mkdirty(entry);
	}
	entry = pte_mkyoung(entry);
	if (ptep_set_access_flags(vma, address, pte, entry, flags & FAULT_FLAG_WRITE)) {
		update_mmu_cache(vma, address, pte);
	} else {
		/*
		 * This is needed only for protection faults but the arch code
		 * is not yet telling us if this is a protection fault or not.
		 * This still avoids useless tlb flushes for .text page faults
		 * with threads.
		 */
		if (flags & FAULT_FLAG_WRITE)
			flush_tlb_fix_spurious_fault(vma, address);
	}
unlock:
	pte_unmap_unlock(pte, ptl);
	return 0;
}
  
/*
 * By the time we get here, we already hold the mm semaphore
 *
 * The mmap_sem may have been released depending on flags and our
 * return value.  See filemap_fault() and __lock_page_or_retry().
 */
static int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
			     unsigned long address, unsigned int flags)
{
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;

	if (!arch_vma_access_permitted(vma, flags & FAULT_FLAG_WRITE,
					    flags & FAULT_FLAG_INSTRUCTION,
					    flags & FAULT_FLAG_REMOTE))
		return VM_FAULT_SIGSEGV;

	if (unlikely(is_vm_hugetlb_page(vma)))
		return hugetlb_fault(mm, vma, address, flags);

	pgd = pgd_offset(mm, address);
	pud = pud_alloc(mm, pgd, address);
	if (!pud)
		return VM_FAULT_OOM;
	pmd = pmd_alloc(mm, pud, address);
	if (!pmd)
		return VM_FAULT_OOM;
	if (pmd_none(*pmd) && transparent_hugepage_enabled(vma)) {
		int ret = create_huge_pmd(mm, vma, address, pmd, flags);
		if (!(ret & VM_FAULT_FALLBACK))
			return ret;
	} else {
		pmd_t orig_pmd = *pmd;
		int ret;

		barrier();
		if (pmd_trans_huge(orig_pmd) || pmd_devmap(orig_pmd)) {
			unsigned int dirty = flags & FAULT_FLAG_WRITE;

			if (pmd_protnone(orig_pmd))
				return do_huge_pmd_numa_page(mm, vma, address,
							     orig_pmd, pmd);

			if (dirty && !pmd_write(orig_pmd)) {
				ret = wp_huge_pmd(mm, vma, address, pmd,
							orig_pmd, flags);
				if (!(ret & VM_FAULT_FALLBACK))
					return ret;
			} else {
				huge_pmd_set_accessed(mm, vma, address, pmd,
						      orig_pmd, dirty);
				return 0;
			}
		}
	}

	/*
	 * Use pte_alloc() instead of pte_alloc_map, because we can't
	 * run pte_offset_map on the pmd, if a huge pmd could
	 * materialize from under us from a different thread.
	 */
	if (unlikely(pte_alloc(mm, pmd, address)))
		return VM_FAULT_OOM;
	/*
	 * If a huge pmd materialized under us just retry later.  Use
	 * pmd_trans_unstable() instead of pmd_trans_huge() to ensure the pmd
	 * didn't become pmd_trans_huge under us and then back to pmd_none, as
	 * a result of MADV_DONTNEED running immediately after a huge pmd fault
	 * in a different thread of this mm, in turn leading to a misleading
	 * pmd_trans_huge() retval.  All we have to ensure is that it is a
	 * regular pmd that we can walk with pte_offset_map() and we can do that
	 * through an atomic read in C, which is what pmd_trans_unstable()
	 * provides.
	 */
	if (unlikely(pmd_trans_unstable(pmd) || pmd_devmap(*pmd)))
		return 0;
	/*
	 * A regular pmd is established and it can't morph into a huge pmd
	 * from under us anymore at this point because we hold the mmap_sem
	 * read mode and khugepaged takes it in write mode. So now it's
	 * safe to run pte_offset_map().
	 */
	pte = pte_offset_map(pmd, address);

	return handle_pte_fault(mm, vma, address, pte, pmd, flags);
}

/*
 * By the time we get here, we already hold the mm semaphore
 *
 * The mmap_sem may have been released depending on flags and our
 * return value.  See filemap_fault() and __lock_page_or_retry().
 */
int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
		    unsigned long address, unsigned int flags)
{
	int ret;

	__set_current_state(TASK_RUNNING);

	count_vm_event(PGFAULT);
	mem_cgroup_count_vm_event(mm, PGFAULT);

	/* do counter updates before entering really critical section. */
	check_sync_rss_stat(current);

	/*
	 * Enable the memcg OOM handling for faults triggered in user
	 * space.  Kernel faults are handled more gracefully.
	 */
	if (flags & FAULT_FLAG_USER)
		mem_cgroup_oom_enable();

	ret = __handle_mm_fault(mm, vma, address, flags);

	if (flags & FAULT_FLAG_USER) {
		mem_cgroup_oom_disable();
		/*
		 * The task may have entered a memcg OOM situation but
		 * if the allocation error was handled gracefully (no
		 * VM_FAULT_OOM), there is no need to kill anything.
		 * Just clean up the OOM state peacefully.
		 */
		if (task_in_memcg_oom(current) && !(ret & VM_FAULT_OOM))
			mem_cgroup_oom_synchronize(false);
	}

	return ret;
}
EXPORT_SYMBOL_GPL(handle_mm_fault);
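
/*
 * handle_mm_fault() is the entry point used by the architecture page fault
 * handlers and by get_user_pages().  A much-simplified sketch of a caller,
 * assuming the address and flags come from the trap frame and ignoring the
 * retry and fatal-signal handling a real arch handler needs:
 *
 *	down_read(&mm->mmap_sem);
 *	vma = find_vma(mm, address);
 *	if (vma && vma->vm_start <= address) {
 *		fault = handle_mm_fault(mm, vma, address,
 *					FAULT_FLAG_USER | FAULT_FLAG_WRITE);
 *		if (fault & VM_FAULT_OOM)
 *			goto out_of_memory;
 *		else if (fault & (VM_FAULT_SIGBUS | VM_FAULT_SIGSEGV))
 *			goto bad_area;
 *	}
 *	up_read(&mm->mmap_sem);
 *
 * ("out_of_memory" and "bad_area" are hypothetical labels in the arch code.)
 */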

#ifndef __PAGETABLE_PUD_FOLDED
/*
 * Allocate page upper directory.
 * We've already handled the fast-path in-line.
 */
int __pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address)
{
	pud_t *new = pud_alloc_one(mm, address);
	if (!new)
		return -ENOMEM;

	smp_wmb(); /* See comment in __pte_alloc */

	spin_lock(&mm->page_table_lock);
	if (pgd_present(*pgd))		/* Another has populated it */
		pud_free(mm, new);
	else
		pgd_populate(mm, pgd, new);
	spin_unlock(&mm->page_table_lock);
	return 0;
}
#endif /* __PAGETABLE_PUD_FOLDED */
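
/*
 * The "fast-path in-line" referred to above is the pud_alloc() wrapper (in
 * <linux/mm.h>), which checks pgd_none() and only then drops into
 * __pud_alloc().  Fault handlers therefore never call __pud_alloc() directly;
 * they write, roughly:
 *
 *	pud = pud_alloc(mm, pgd, address);
 *	if (!pud)
 *		return VM_FAULT_OOM;
 */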
  
#ifndef __PAGETABLE_PMD_FOLDED
/*
 * Allocate page middle directory.
 * We've already handled the fast-path in-line.
 */
int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
{
	pmd_t *new = pmd_alloc_one(mm, address);
	if (!new)
		return -ENOMEM;

	smp_wmb(); /* See comment in __pte_alloc */

	spin_lock(&mm->page_table_lock);
#ifndef __ARCH_HAS_4LEVEL_HACK
	if (!pud_present(*pud)) {
		mm_inc_nr_pmds(mm);
		pud_populate(mm, pud, new);
	} else	/* Another has populated it */
		pmd_free(mm, new);
#else
	if (!pgd_present(*pud)) {
		mm_inc_nr_pmds(mm);
		pgd_populate(mm, pud, new);
	} else /* Another has populated it */
		pmd_free(mm, new);
#endif /* __ARCH_HAS_4LEVEL_HACK */
	spin_unlock(&mm->page_table_lock);
	return 0;
}
#endif /* __PAGETABLE_PMD_FOLDED */

static int __follow_pte(struct mm_struct *mm, unsigned long address,
		pte_t **ptepp, spinlock_t **ptlp)
{
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *ptep;

	pgd = pgd_offset(mm, address);
	if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
		goto out;

	pud = pud_offset(pgd, address);
	if (pud_none(*pud) || unlikely(pud_bad(*pud)))
		goto out;

	pmd = pmd_offset(pud, address);
	VM_BUG_ON(pmd_trans_huge(*pmd));
	if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd)))
		goto out;

	/* We cannot handle huge page PFN maps. Luckily they don't exist. */
	if (pmd_huge(*pmd))
		goto out;

	ptep = pte_offset_map_lock(mm, pmd, address, ptlp);
	if (!ptep)
		goto out;
	if (!pte_present(*ptep))
		goto unlock;
	*ptepp = ptep;
	return 0;
unlock:
	pte_unmap_unlock(ptep, *ptlp);
out:
	return -EINVAL;
}

static inline int follow_pte(struct mm_struct *mm, unsigned long address,
			     pte_t **ptepp, spinlock_t **ptlp)
{
	int res;

	/* (void) is needed to make gcc happy */
	(void) __cond_lock(*ptlp,
			   !(res = __follow_pte(mm, address, ptepp, ptlp)));
	return res;
}
  /**
   * follow_pfn - look up PFN at a user virtual address
   * @vma: memory mapping
   * @address: user virtual address
   * @pfn: location to store found PFN
   *
   * Only IO mappings and raw PFN mappings are allowed.
   *
   * Returns zero and the pfn at @pfn on success, -ve otherwise.
   */
  int follow_pfn(struct vm_area_struct *vma, unsigned long address,
  	unsigned long *pfn)
  {
  	int ret = -EINVAL;
  	spinlock_t *ptl;
  	pte_t *ptep;
  
  	if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
  		return ret;
  
  	ret = follow_pte(vma->vm_mm, address, &ptep, &ptl);
  	if (ret)
  		return ret;
  	*pfn = pte_pfn(*ptep);
  	pte_unmap_unlock(ptep, ptl);
  	return 0;
  }
  EXPORT_SYMBOL(follow_pfn);
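
/*
 * A rough usage sketch for follow_pfn(), with "vaddr" as a hypothetical user
 * address inside a VM_IO/VM_PFNMAP mapping; the caller must hold mmap_sem and
 * must not assume the pfn stays valid once the lookup has returned:
 *
 *	unsigned long pfn;
 *
 *	down_read(&current->mm->mmap_sem);
 *	vma = find_vma(current->mm, vaddr);
 *	if (vma && vma->vm_start <= vaddr && !follow_pfn(vma, vaddr, &pfn))
 *		pr_debug("pfn for %lx is %lx\n", vaddr, pfn);
 *	up_read(&current->mm->mmap_sem);
 */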

#ifdef CONFIG_HAVE_IOREMAP_PROT
int follow_phys(struct vm_area_struct *vma,
		unsigned long address, unsigned int flags,
		unsigned long *prot, resource_size_t *phys)
{
	int ret = -EINVAL;
	pte_t *ptep, pte;
	spinlock_t *ptl;

	if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
		goto out;

	if (follow_pte(vma->vm_mm, address, &ptep, &ptl))
		goto out;
	pte = *ptep;

	if ((flags & FOLL_WRITE) && !pte_write(pte))
		goto unlock;

	*prot = pgprot_val(pte_pgprot(pte));
	*phys = (resource_size_t)pte_pfn(pte) << PAGE_SHIFT;

	ret = 0;
unlock:
	pte_unmap_unlock(ptep, ptl);
out:
	return ret;
}
  
  int generic_access_phys(struct vm_area_struct *vma, unsigned long addr,
  			void *buf, int len, int write)
  {
  	resource_size_t phys_addr;
  	unsigned long prot = 0;
	void __iomem *maddr;
	int offset = addr & (PAGE_SIZE-1);

	if (follow_phys(vma, addr, write, &prot, &phys_addr))
		return -EINVAL;

	maddr = ioremap_prot(phys_addr, PAGE_ALIGN(len + offset), prot);
	if (write)
		memcpy_toio(maddr + offset, buf, len);
	else
		memcpy_fromio(buf, maddr + offset, len);
	iounmap(maddr);

	return len;
}
EXPORT_SYMBOL_GPL(generic_access_phys);
#endif
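
/*
 * generic_access_phys() is meant to be plugged into the vm_operations_struct
 * of a VM_IO/VM_PFNMAP mapping so that access_process_vm() (and thus ptrace
 * and /proc/pid/mem) can reach the underlying memory or registers.  An
 * illustrative /dev/mem-style wiring:
 *
 *	static const struct vm_operations_struct mmap_mem_ops = {
 *		.access = generic_access_phys,
 *	};
 *
 * (mmap_mem_ops is just a sketch; the hook is picked up through
 * vma->vm_ops->access in __access_remote_vm() below.)
 */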

/*
 * Access another process' address space as given in mm.  If non-NULL, use the
 * given task for page fault accounting.
 */
static int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm,
		unsigned long addr, void *buf, int len, int write)
{
	struct vm_area_struct *vma;
	void *old_buf = buf;

	down_read(&mm->mmap_sem);
	/* ignore errors, just check how much was successfully transferred */
	while (len) {
		int bytes, ret, offset;
		void *maddr;
		struct page *page = NULL;

		ret = get_user_pages_remote(tsk, mm, addr, 1,
				write, 1, &page, &vma);
		if (ret <= 0) {
#ifndef CONFIG_HAVE_IOREMAP_PROT
			break;
#else
			/*
			 * Check if this is a VM_IO | VM_PFNMAP VMA, which
			 * we can access using slightly different code.
			 */
			vma = find_vma(mm, addr);
			if (!vma || vma->vm_start > addr)
				break;
			if (vma->vm_ops && vma->vm_ops->access)
				ret = vma->vm_ops->access(vma, addr, buf,
							  len, write);
			if (ret <= 0)
				break;
			bytes = ret;
#endif
		} else {
			bytes = len;
			offset = addr & (PAGE_SIZE-1);
			if (bytes > PAGE_SIZE-offset)
				bytes = PAGE_SIZE-offset;

			maddr = kmap(page);
			if (write) {
				copy_to_user_page(vma, page, addr,
						  maddr + offset, buf, bytes);
				set_page_dirty_lock(page);
			} else {
				copy_from_user_page(vma, page, addr,
						    buf, maddr + offset, bytes);
			}
			kunmap(page);
			put_page(page);
		}
		len -= bytes;
		buf += bytes;
		addr += bytes;
	}
	up_read(&mm->mmap_sem);

	return buf - old_buf;
}

/**
 * access_remote_vm - access another process' address space
 * @mm:		the mm_struct of the target address space
 * @addr:	start address to access
 * @buf:	source or destination buffer
 * @len:	number of bytes to transfer
 * @write:	whether the access is a write
 *
 * The caller must hold a reference on @mm.
 */
int access_remote_vm(struct mm_struct *mm, unsigned long addr,
		void *buf, int len, int write)
{
	return __access_remote_vm(NULL, mm, addr, buf, len, write);
}

/*
 * Access another process' address space.
 * Source/target buffer must be in kernel space.
 * Do not walk the page table directly; use get_user_pages().
 */
int access_process_vm(struct task_struct *tsk, unsigned long addr,
		void *buf, int len, int write)
{
	struct mm_struct *mm;
	int ret;

	mm = get_task_mm(tsk);
	if (!mm)
		return 0;

	ret = __access_remote_vm(tsk, mm, addr, buf, len, write);
	mmput(mm);

	return ret;
}
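
/*
 * Typical use of access_process_vm() is ptrace-style peeking at another
 * task's memory.  A rough sketch, with "child" and "addr" as hypothetical
 * inputs; the return value is the number of bytes actually copied:
 *
 *	char buf[64];
 *	int got;
 *
 *	got = access_process_vm(child, addr, buf, sizeof(buf), 0);
 *	if (got < sizeof(buf))
 *		; handle a partial or failed read
 */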

/*
 * Print the name of a VMA.
 */
void print_vma_addr(char *prefix, unsigned long ip)
{
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma;

	/*
	 * Do not print if we are in atomic
	 * contexts (in exception stacks, etc.):
	 */
	if (preempt_count())
		return;

	down_read(&mm->mmap_sem);
	vma = find_vma(mm, ip);
	if (vma && vma->vm_file) {
		struct file *f = vma->vm_file;
		char *buf = (char *)__get_free_page(GFP_KERNEL);
		if (buf) {
			char *p;

			p = file_path(f, buf, PAGE_SIZE);
			if (IS_ERR(p))
				p = "?";
			printk("%s%s[%lx+%lx]", prefix, kbasename(p),
					vma->vm_start,
					vma->vm_end - vma->vm_start);
			free_page((unsigned long)buf);
		}
	}
	up_read(&mm->mmap_sem);
}
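
/*
 * print_vma_addr() is mainly used by architecture fault/trap code to append
 * "<binary>[start+size]" to a diagnostic line.  A rough sketch of a caller,
 * with "ip" standing in for the faulting instruction pointer obtained in an
 * arch-specific way:
 *
 *	printk(KERN_INFO "unhandled fault at ip %lx", ip);
 *	print_vma_addr(" in ", ip);
 *	printk("\n");
 */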

#if defined(CONFIG_PROVE_LOCKING) || defined(CONFIG_DEBUG_ATOMIC_SLEEP)
void __might_fault(const char *file, int line)
{
	/*
	 * Some code (nfs/sunrpc) uses socket ops on kernel memory while
	 * holding the mmap_sem, this is safe because kernel memory doesn't
	 * get paged out, therefore we'll never actually fault, and the
	 * below annotations will generate false positives.
	 */
	if (segment_eq(get_fs(), KERNEL_DS))
		return;
	if (pagefault_disabled())
		return;
	__might_sleep(file, line, 0);
#if defined(CONFIG_DEBUG_ATOMIC_SLEEP)
	if (current->mm)
		might_lock_read(&current->mm->mmap_sem);
#endif
}
EXPORT_SYMBOL(__might_fault);
#endif
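
/*
 * __might_fault() is normally reached through the might_fault() helper, which
 * user-access primitives call before touching user memory so that "sleeping
 * while atomic" and mmap_sem lock-ordering bugs show up even when the access
 * happens to be resident.  A rough sketch of such a primitive
 * (foo_copy_to_user is hypothetical; the real copy_to_user() implementations
 * follow this pattern):
 *
 *	unsigned long foo_copy_to_user(void __user *to, const void *from,
 *				       unsigned long n)
 *	{
 *		might_fault();
 *		return __copy_to_user(to, from, n);
 *	}
 */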
  
  #if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLBFS)
  static void clear_gigantic_page(struct page *page,
  				unsigned long addr,
  				unsigned int pages_per_huge_page)
  {
  	int i;
  	struct page *p = page;
  
  	might_sleep();
  	for (i = 0; i < pages_per_huge_page;
  	     i++, p = mem_map_next(p, page, i)) {
  		cond_resched();
  		clear_user_highpage(p, addr + i * PAGE_SIZE);
  	}
  }
  void clear_huge_page(struct page *page,
  		     unsigned long addr, unsigned int pages_per_huge_page)
  {
  	int i;
  
  	if (unlikely(pages_per_huge_page > MAX_ORDER_NR_PAGES)) {
  		clear_gigantic_page(page, addr, pages_per_huge_page);
  		return;
  	}
  
  	might_sleep();
  	for (i = 0; i < pages_per_huge_page; i++) {
  		cond_resched();
  		clear_user_highpage(page + i, addr + i * PAGE_SIZE);
  	}
  }
  
  static void copy_user_gigantic_page(struct page *dst, struct page *src,
  				    unsigned long addr,
  				    struct vm_area_struct *vma,
  				    unsigned int pages_per_huge_page)
  {
  	int i;
  	struct page *dst_base = dst;
  	struct page *src_base = src;
  
  	for (i = 0; i < pages_per_huge_page; ) {
  		cond_resched();
  		copy_user_highpage(dst, src, addr + i*PAGE_SIZE, vma);
  
  		i++;
  		dst = mem_map_next(dst, dst_base, i);
  		src = mem_map_next(src, src_base, i);
  	}
  }
  
  void copy_user_huge_page(struct page *dst, struct page *src,
  			 unsigned long addr, struct vm_area_struct *vma,
  			 unsigned int pages_per_huge_page)
  {
  	int i;
  
  	if (unlikely(pages_per_huge_page > MAX_ORDER_NR_PAGES)) {
  		copy_user_gigantic_page(dst, src, addr, vma,
  					pages_per_huge_page);
  		return;
  	}
  
  	might_sleep();
  	for (i = 0; i < pages_per_huge_page; i++) {
  		cond_resched();
  		copy_user_highpage(dst + i, src + i, addr + i*PAGE_SIZE, vma);
  	}
  }
  #endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */
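
/*
 * clear_huge_page() and copy_user_huge_page() are the helpers the hugetlb and
 * THP fault paths use when an entire huge page has to be zeroed or copied
 * with scheduling points in between.  A rough sketch of a hugetlb-style
 * caller, assuming "h" is the hstate of the mapping and eliding all error
 * handling (see hugetlb_no_page() for the real sequence):
 *
 *	page = alloc_huge_page(vma, address, 0);
 *	clear_huge_page(page, address, pages_per_huge_page(h));
 */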

#if USE_SPLIT_PTE_PTLOCKS && ALLOC_SPLIT_PTLOCKS

static struct kmem_cache *page_ptl_cachep;

void __init ptlock_cache_init(void)
{
	page_ptl_cachep = kmem_cache_create("page->ptl", sizeof(spinlock_t), 0,
			SLAB_PANIC, NULL);
}

bool ptlock_alloc(struct page *page)
{
	spinlock_t *ptl;

	ptl = kmem_cache_alloc(page_ptl_cachep, GFP_KERNEL);
	if (!ptl)
		return false;
	page->ptl = ptl;
	return true;
}

void ptlock_free(struct page *page)
{
	kmem_cache_free(page_ptl_cachep, page->ptl);
}
#endif
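
/*
 * ptlock_alloc()/ptlock_free() back the split page-table-lock machinery when
 * spinlock_t is too large to be embedded in struct page: each pte page then
 * gets a dynamically allocated lock via ptlock_init().  Architectures reach
 * this through pgtable_page_ctor()/pgtable_page_dtor() when they allocate or
 * free a pte page, roughly:
 *
 *	page = alloc_page(GFP_KERNEL | __GFP_ZERO);
 *	if (!page)
 *		return NULL;
 *	if (!pgtable_page_ctor(page)) {
 *		__free_page(page);
 *		return NULL;
 *	}
 */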