Blame view

mm/rmap.c 24.5 KB
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
  /*
   * mm/rmap.c - physical to virtual reverse mappings
   *
   * Copyright 2001, Rik van Riel <riel@conectiva.com.br>
   * Released under the General Public License (GPL).
   *
   * Simple, low overhead reverse mapping scheme.
   * Please try to keep this thing as modular as possible.
   *
   * Provides methods for unmapping each kind of mapped page:
   * the anon methods track anonymous pages, and
   * the file methods track pages belonging to an inode.
   *
   * Original design by Rik van Riel <riel@conectiva.com.br> 2001
   * File methods by Dave McCracken <dmccr@us.ibm.com> 2003, 2004
   * Anonymous methods by Andrea Arcangeli <andrea@suse.de> 2004
   * Contributions by Hugh Dickins <hugh@veritas.com> 2003, 2004
   */
  
  /*
   * Lock ordering in mm:
   *
1b1dcc1b5   Jes Sorensen   [PATCH] mutex sub...
23
   * inode->i_mutex	(while writing or truncating, not reading or faulting)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
24
25
26
   *   inode->i_alloc_sem
   *
   * When a page fault occurs in writing from user to file, down_read
1b1dcc1b5   Jes Sorensen   [PATCH] mutex sub...
27
28
29
   * of mmap_sem nests within i_mutex; in sys_msync, i_mutex nests within
   * down_read of mmap_sem; i_mutex and down_write of mmap_sem are never
   * taken together; in truncation, i_mutex is taken outermost.
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
30
31
32
33
34
   *
   * mm->mmap_sem
   *   page->flags PG_locked (lock_page)
   *     mapping->i_mmap_lock
   *       anon_vma->lock
b8072f099   Hugh Dickins   [PATCH] mm: updat...
35
   *         mm->page_table_lock or pte_lock
053837fce   Nick Piggin   [PATCH] mm: migra...
36
   *           zone->lru_lock (in mark_page_accessed, isolate_lru_page)
5d337b919   Hugh Dickins   [PATCH] swap: swa...
37
   *           swap_lock (in swap_duplicate, swap_info_get)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
38
   *             mmlist_lock (in mmput, drain_mmlist and others)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
   *             mapping->private_lock (in __set_page_dirty_buffers)
   *             inode_lock (in set_page_dirty's __mark_inode_dirty)
   *               sb_lock (within inode_lock in fs/fs-writeback.c)
   *               mapping->tree_lock (widely used, in set_page_dirty,
   *                         in arch-dependent flush_dcache_mmap_lock,
   *                         within inode_lock in __sync_single_inode)
   */
  
  #include <linux/mm.h>
  #include <linux/pagemap.h>
  #include <linux/swap.h>
  #include <linux/swapops.h>
  #include <linux/slab.h>
  #include <linux/init.h>
  #include <linux/rmap.h>
  #include <linux/rcupdate.h>
a48d07afd   Christoph Lameter   [PATCH] Direct Mi...
55
  #include <linux/module.h>
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
56
57
  
  #include <asm/tlbflush.h>
fcc234f88   Pekka Enberg   [PATCH] mm: kill ...
58
  /* Slab cache for struct anon_vma objects; created by anon_vma_init(). */
  struct kmem_cache *anon_vma_cachep;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
59
60
61
  
  /*
   * Debug-only sanity check (compiled in under CONFIG_DEBUG_VM): walk the
   * vma list hanging off find_vma->anon_vma and BUG if find_vma is not on
   * it, or if the walk exceeds 100000 entries.  Without CONFIG_DEBUG_VM
   * this function is empty.
   */
  static inline void validate_anon_vma(struct vm_area_struct *find_vma)
  {
  #ifdef CONFIG_DEBUG_VM
  	struct vm_area_struct *cursor;
  	struct anon_vma *av = find_vma->anon_vma;
  	unsigned int walked = 0;
  	int seen = 0;
  
  	list_for_each_entry(cursor, &av->head, anon_vma_node) {
  		walked++;
  		/* an absurdly long walk is treated as list corruption */
  		BUG_ON(walked > 100000);
  		if (cursor == find_vma)
  			seen = 1;
  	}
  	BUG_ON(!seen);
  #endif
  }
  
  /*
   * Make sure @vma has an anon_vma attached.  Must be called under the
   * mmap_sem.
   *
   * Returns 0 on success (including when an anon_vma was already attached)
   * and -ENOMEM when a fresh anon_vma could not be allocated.
   */
  int anon_vma_prepare(struct vm_area_struct *vma)
  {
  	struct anon_vma *anon_vma = vma->anon_vma;
  	struct anon_vma *allocated = NULL;
  	struct anon_vma *locked = NULL;
  	struct mm_struct *mm;
  
  	might_sleep();
  	if (likely(anon_vma))
  		return 0;
  
  	mm = vma->vm_mm;
  
  	/* Prefer sharing a mergeable anon_vma; otherwise allocate a new one. */
  	anon_vma = find_mergeable_anon_vma(vma);
  	if (anon_vma) {
  		locked = anon_vma;
  		spin_lock(&locked->lock);
  	} else {
  		anon_vma = anon_vma_alloc();
  		if (unlikely(!anon_vma))
  			return -ENOMEM;
  		allocated = anon_vma;
  	}
  
  	/* page_table_lock to protect against threads */
  	spin_lock(&mm->page_table_lock);
  	if (likely(!vma->anon_vma)) {
  		vma->anon_vma = anon_vma;
  		list_add_tail(&vma->anon_vma_node, &anon_vma->head);
  		/* the new object (if any) is now in use: do not free it below */
  		allocated = NULL;
  	}
  	spin_unlock(&mm->page_table_lock);
  
  	if (locked)
  		spin_unlock(&locked->lock);
  	/* vma->anon_vma was already set when rechecked: drop our unused copy */
  	if (unlikely(allocated))
  		anon_vma_free(allocated);
  
  	return 0;
  }
  
  /*
   * Take @next off its anon_vma list.  Both vmas are required to have the
   * same anon_vma (BUG otherwise).  No lock is taken here.
   */
  void __anon_vma_merge(struct vm_area_struct *vma, struct vm_area_struct *next)
  {
  	BUG_ON(vma->anon_vma != next->anon_vma);
  	list_del(&next->anon_vma_node);
  }
  
  void __anon_vma_link(struct vm_area_struct *vma)
  {
  	struct anon_vma *anon_vma = vma->anon_vma;
  
  	if (anon_vma) {
0697212a4   Christoph Lameter   [PATCH] Swapless ...
129
  		list_add_tail(&vma->anon_vma_node, &anon_vma->head);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
130
131
132
133
134
135
136
137
138
139
  		validate_anon_vma(vma);
  	}
  }
  
  /*
   * Append @vma to the list of its anon_vma, if it has one, holding
   * anon_vma->lock around the list update and the debug validation.
   */
  void anon_vma_link(struct vm_area_struct *vma)
  {
  	struct anon_vma *av = vma->anon_vma;
  
  	if (!av)
  		return;
  
  	spin_lock(&av->lock);
  	list_add_tail(&vma->anon_vma_node, &av->head);
  	validate_anon_vma(vma);
  	spin_unlock(&av->lock);
  }
  
  /*
   * Remove @vma from the list of its anon_vma (no-op when it has none) and
   * free the anon_vma if that left its list empty.
   */
  void anon_vma_unlink(struct vm_area_struct *vma)
  {
  	struct anon_vma *av = vma->anon_vma;
  	int now_empty;
  
  	if (av) {
  		spin_lock(&av->lock);
  		validate_anon_vma(vma);
  		list_del(&vma->anon_vma_node);
  
  		/* Sample emptiness under the lock; free only after dropping it. */
  		now_empty = list_empty(&av->head);
  		spin_unlock(&av->lock);
  
  		/* We must garbage collect the anon_vma if it's empty */
  		if (now_empty)
  			anon_vma_free(av);
  	}
  }
fcc234f88   Pekka Enberg   [PATCH] mm: kill ...
165
166
  static void anon_vma_ctor(void *data, struct kmem_cache *cachep,
  			  unsigned long flags)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
  {
  	if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
  						SLAB_CTOR_CONSTRUCTOR) {
  		struct anon_vma *anon_vma = data;
  
  		spin_lock_init(&anon_vma->lock);
  		INIT_LIST_HEAD(&anon_vma->head);
  	}
  }
  
  /*
   * Create the "anon_vma" slab cache with anon_vma_ctor as constructor.
   * The cache is created with SLAB_DESTROY_BY_RCU|SLAB_PANIC.
   */
  void __init anon_vma_init(void)
  {
  	anon_vma_cachep = kmem_cache_create("anon_vma", sizeof(struct anon_vma),
  			0, SLAB_DESTROY_BY_RCU|SLAB_PANIC, anon_vma_ctor, NULL);
  }
  
  /*
   * Getting a lock on a stable anon_vma from a page off the LRU is
   * tricky: page_lock_anon_vma relies on RCU to guard against the races.
   *
   * Returns the page's anon_vma with anon_vma->lock held, or NULL (nothing
   * locked) when page->mapping lacks the PAGE_MAPPING_ANON bit or the page
   * is not mapped.
   */
  static struct anon_vma *page_lock_anon_vma(struct page *page)
  {
  	struct anon_vma *found = NULL;
  	unsigned long raw_mapping;
  
  	rcu_read_lock();
  	raw_mapping = (unsigned long) page->mapping;
  	if ((raw_mapping & PAGE_MAPPING_ANON) && page_mapped(page)) {
  		/* strip the tag bit to recover the anon_vma pointer */
  		found = (struct anon_vma *) (raw_mapping - PAGE_MAPPING_ANON);
  		spin_lock(&found->lock);
  	}
  	rcu_read_unlock();
  	return found;
  }
  
  /*
   * At what user virtual address is page expected in vma?
   *
   * Returns that address, or -EFAULT (as an unsigned long) when the computed
   * address falls outside [vm_start, vm_end).  Only anon pages may take the
   * -EFAULT path; anything else is a BUG.
   */
  static inline unsigned long
  vma_address(struct page *page, struct vm_area_struct *vma)
  {
  	pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
  	unsigned long address;
  
  	address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
  	if (likely(address >= vma->vm_start && address < vma->vm_end))
  		return address;
  
  	/* page should be within any vma from prio_tree_next */
  	BUG_ON(!PageAnon(page));
  	return -EFAULT;
  }
  
  /*
   * At what user virtual address is page expected in vma? checking that the
ee498ed73   Hugh Dickins   [PATCH] unpaged: ...
226
   * page matches the vma: currently only used on anon pages, by unuse_vma;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
227
228
229
230
231
232
233
234
   */
  unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma)
  {
  	if (PageAnon(page)) {
  		/* anon page: it must belong to this vma's anon_vma */
  		if ((void *)vma->anon_vma !=
  		    (void *)page->mapping - PAGE_MAPPING_ANON)
  			return -EFAULT;
  		return vma_address(page, vma);
  	}
  
  	/* file page: needs a mapping, and the vma must not be nonlinear */
  	if (!page->mapping || (vma->vm_flags & VM_NONLINEAR))
  		return -EFAULT;
  
  	/* ... and the vma must map the very file the page belongs to */
  	if (!vma->vm_file ||
  	    vma->vm_file->f_mapping != page->mapping)
  		return -EFAULT;
  
  	return vma_address(page, vma);
  }
  
  /*
81b4082dc   Nikita Danilov   [PATCH] mm: rmap....
244
245
   * Check that @page is mapped at @address into @mm.
   *
b8072f099   Hugh Dickins   [PATCH] mm: updat...
246
   * On success returns with pte mapped and locked.
81b4082dc   Nikita Danilov   [PATCH] mm: rmap....
247
   */
ceffc0785   Carsten Otte   [PATCH] xip: fs/m...
248
  pte_t *page_check_address(struct page *page, struct mm_struct *mm,
  			  unsigned long address, spinlock_t **ptlp)
  {
  	pgd_t *pgd;
  	pud_t *pud;
  	pmd_t *pmd;
  	pte_t *pte;
  	spinlock_t *ptl;
  
  	/* Walk the page tables; give up at the first level not present. */
  	pgd = pgd_offset(mm, address);
  	if (!pgd_present(*pgd))
  		return NULL;
  
  	pud = pud_offset(pgd, address);
  	if (!pud_present(*pud))
  		return NULL;
  
  	pmd = pmd_offset(pud, address);
  	if (!pmd_present(*pmd))
  		return NULL;
  
  	pte = pte_offset_map(pmd, address);
  	/* Make a quick check before getting the lock */
  	if (!pte_present(*pte)) {
  		pte_unmap(pte);
  		return NULL;
  	}
  
  	ptl = pte_lockptr(mm, pmd);
  	spin_lock(ptl);
  	/* Recheck under the lock, and confirm the pte maps this very page. */
  	if (!pte_present(*pte) || page_to_pfn(page) != pte_pfn(*pte)) {
  		pte_unmap_unlock(pte, ptl);
  		return NULL;
  	}
  
  	/* Success: the pte stays mapped and locked; hand the lock to the caller. */
  	*ptlp = ptl;
  	return pte;
  }
  
  /*
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
286
287
288
289
   * Subfunctions of page_referenced: page_referenced_one called
   * repeatedly from either page_referenced_anon or page_referenced_file.
   */
  static int page_referenced_one(struct page *page,
f7b7fd8f3   Rik van Riel   [PATCH] temporari...
290
  	struct vm_area_struct *vma, unsigned int *mapcount)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
291
292
293
  {
  	struct mm_struct *mm = vma->vm_mm;
  	unsigned long address;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
294
  	pte_t *pte;
c0718806c   Hugh Dickins   [PATCH] mm: rmap ...
295
  	spinlock_t *ptl;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
296
  	int referenced = 0;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
297
298
299
  	address = vma_address(page, vma);
  	if (address == -EFAULT)
  		goto out;
c0718806c   Hugh Dickins   [PATCH] mm: rmap ...
300
301
302
  	pte = page_check_address(page, mm, address, &ptl);
  	if (!pte)
  		goto out;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
303

c0718806c   Hugh Dickins   [PATCH] mm: rmap ...
304
305
  	if (ptep_clear_flush_young(vma, address, pte))
  		referenced++;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
306

c0718806c   Hugh Dickins   [PATCH] mm: rmap ...
307
308
  	/* Pretend the page is referenced if the task has the
  	   swap token and is in the middle of a page fault. */
f7b7fd8f3   Rik van Riel   [PATCH] temporari...
309
  	if (mm != current->mm && has_swap_token(mm) &&
c0718806c   Hugh Dickins   [PATCH] mm: rmap ...
310
311
312
313
314
  			rwsem_is_locked(&mm->mmap_sem))
  		referenced++;
  
  	(*mapcount)--;
  	pte_unmap_unlock(pte, ptl);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
315
316
317
  out:
  	return referenced;
  }
f7b7fd8f3   Rik van Riel   [PATCH] temporari...
318
  static int page_referenced_anon(struct page *page)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
319
320
321
322
323
324
325
326
327
328
329
330
  {
  	unsigned int mapcount;
  	struct anon_vma *anon_vma;
  	struct vm_area_struct *vma;
  	int referenced = 0;
  
  	anon_vma = page_lock_anon_vma(page);
  	if (!anon_vma)
  		return referenced;
  
  	mapcount = page_mapcount(page);
  	list_for_each_entry(vma, &anon_vma->head, anon_vma_node) {
f7b7fd8f3   Rik van Riel   [PATCH] temporari...
331
  		referenced += page_referenced_one(page, vma, &mapcount);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
  		if (!mapcount)
  			break;
  	}
  	spin_unlock(&anon_vma->lock);
  	return referenced;
  }
  
  /**
   * page_referenced_file - referenced check for object-based rmap
   * @page: the page we're checking references on.
   *
   * For an object-based mapped page, find all the places it is mapped and
   * check/clear the referenced flag.  This is done by following the page->mapping
   * pointer, then walking the chain of vmas it holds.  It returns the number
   * of references it found.
   *
   * This function is only called from page_referenced for object-based pages.
   */
f7b7fd8f3   Rik van Riel   [PATCH] temporari...
350
  static int page_referenced_file(struct page *page)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
  {
  	unsigned int mapcount;
  	struct address_space *mapping = page->mapping;
  	pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
  	struct vm_area_struct *vma;
  	struct prio_tree_iter iter;
  	int referenced = 0;
  
  	/*
  	 * The caller's checks on page->mapping and !PageAnon have made
  	 * sure that this is a file page: the check for page->mapping
  	 * excludes the case just before it gets set on an anon page.
  	 */
  	BUG_ON(PageAnon(page));
  
  	/*
  	 * The page lock not only makes sure that page->mapping cannot
  	 * suddenly be NULLified by truncation, it makes sure that the
  	 * structure at mapping cannot be freed and reused yet,
  	 * so we can safely take mapping->i_mmap_lock.
  	 */
  	BUG_ON(!PageLocked(page));
  
  	spin_lock(&mapping->i_mmap_lock);
  
  	/*
  	 * i_mmap_lock does not stabilize mapcount at all, but mapcount
  	 * is more likely to be accurate if we note it after spinning.
  	 */
  	mapcount = page_mapcount(page);
  
  	vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
  		if ((vma->vm_flags & (VM_LOCKED|VM_MAYSHARE))
  				  == (VM_LOCKED|VM_MAYSHARE)) {
  			referenced++;
  			break;
  		}
f7b7fd8f3   Rik van Riel   [PATCH] temporari...
388
  		referenced += page_referenced_one(page, vma, &mapcount);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
  		if (!mapcount)
  			break;
  	}
  
  	spin_unlock(&mapping->i_mmap_lock);
  	return referenced;
  }
  
  /**
   * page_referenced - test if the page was referenced
   * @page: the page to test
   * @is_locked: caller holds lock on the page
   *
   * Quick test_and_clear_referenced for all mappings to a page,
   * returns the number of ptes which referenced the page.
   */
f7b7fd8f3   Rik van Riel   [PATCH] temporari...
405
  int page_referenced(struct page *page, int is_locked)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
406
407
  {
  	int referenced = 0;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
408
409
410
411
412
413
414
415
  	if (page_test_and_clear_young(page))
  		referenced++;
  
  	if (TestClearPageReferenced(page))
  		referenced++;
  
  	if (page_mapped(page) && page->mapping) {
  		if (PageAnon(page))
f7b7fd8f3   Rik van Riel   [PATCH] temporari...
416
  			referenced += page_referenced_anon(page);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
417
  		else if (is_locked)
f7b7fd8f3   Rik van Riel   [PATCH] temporari...
418
  			referenced += page_referenced_file(page);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
419
420
421
422
  		else if (TestSetPageLocked(page))
  			referenced++;
  		else {
  			if (page->mapping)
f7b7fd8f3   Rik van Riel   [PATCH] temporari...
423
  				referenced += page_referenced_file(page);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
424
425
426
427
428
  			unlock_page(page);
  		}
  	}
  	return referenced;
  }
d08b3851d   Peter Zijlstra   [PATCH] mm: track...
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
  /*
   * If @page is mapped in @vma through a pte that is dirty or writable,
   * replace that pte with a clean, write-protected copy.
   * Returns 1 when the pte was changed, 0 otherwise.
   */
  static int page_mkclean_one(struct page *page, struct vm_area_struct *vma)
  {
  	struct mm_struct *mm = vma->vm_mm;
  	unsigned long address;
  	spinlock_t *ptl;
  	pte_t *pte, entry;
  	int cleaned = 0;
  
  	address = vma_address(page, vma);
  	if (address == -EFAULT)
  		return 0;
  
  	pte = page_check_address(page, mm, address, &ptl);
  	if (!pte)
  		return 0;
  
  	if (pte_dirty(*pte) || pte_write(*pte)) {
  		entry = ptep_get_and_clear(mm, address, pte);
  		entry = pte_mkclean(entry);
  		entry = pte_wrprotect(entry);
  		ptep_establish(vma, address, pte, entry);
  		lazy_mmu_prot_update(entry);
  		cleaned = 1;
  	}
  
  	pte_unmap_unlock(pte, ptl);
  	return cleaned;
  }
  
  /*
   * Run page_mkclean_one() on every VM_SHARED vma in @mapping's prio tree
   * that covers @page's offset, under mapping->i_mmap_lock.
   * Returns the number of ptes that were cleaned.
   */
  static int page_mkclean_file(struct address_space *mapping, struct page *page)
  {
  	pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
  	struct prio_tree_iter iter;
  	struct vm_area_struct *vma;
  	int cleaned = 0;
  
  	BUG_ON(PageAnon(page));
  
  	spin_lock(&mapping->i_mmap_lock);
  	vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
  		if (!(vma->vm_flags & VM_SHARED))
  			continue;
  		cleaned += page_mkclean_one(page, vma);
  	}
  	spin_unlock(&mapping->i_mmap_lock);
  
  	return cleaned;
  }
  
  /*
   * Clean and write-protect the shared file mappings of a locked page.
   * The page must be locked (BUG otherwise).  Returns the count reported
   * by page_mkclean_file(), or 0 when the page is not mapped or
   * page_mapping() yields no mapping.
   */
  int page_mkclean(struct page *page)
  {
  	struct address_space *mapping;
  
  	BUG_ON(!PageLocked(page));
  
  	if (!page_mapped(page))
  		return 0;
  
  	mapping = page_mapping(page);
  	if (!mapping)
  		return 0;
  
  	return page_mkclean_file(mapping, page);
  }
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
493
  /**
9617d95e6   Nick Piggin   [PATCH] mm: rmap ...
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
   * __page_set_anon_rmap - setup new anonymous rmap
   * @page:	the page to add the mapping to
   * @vma:	the vm area in which the mapping is added
   * @address:	the user virtual address mapped
   */
  static void __page_set_anon_rmap(struct page *page,
  	struct vm_area_struct *vma, unsigned long address)
  {
  	struct anon_vma *anon_vma = vma->anon_vma;
  
  	BUG_ON(!anon_vma);
  	anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
  	page->mapping = (struct address_space *) anon_vma;
  
  	page->index = linear_page_index(vma, address);
a74609faf   Nick Piggin   [PATCH] mm: page_...
509
510
511
512
  	/*
  	 * nr_mapped state can be updated without turning off
  	 * interrupts because it is not modified via interrupt.
  	 */
f3dbd3446   Christoph Lameter   [PATCH] zoned vm ...
513
  	__inc_zone_page_state(page, NR_ANON_PAGES);
9617d95e6   Nick Piggin   [PATCH] mm: rmap ...
514
515
516
  }
  
  /**
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
517
518
519
520
521
   * page_add_anon_rmap - add pte mapping to an anonymous page
   * @page:	the page to add the mapping to
   * @vma:	the vm area in which the mapping is added
   * @address:	the user virtual address mapped
   *
b8072f099   Hugh Dickins   [PATCH] mm: updat...
522
   * The caller needs to hold the pte lock.
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
523
524
525
526
   */
  void page_add_anon_rmap(struct page *page,
  	struct vm_area_struct *vma, unsigned long address)
  {
9617d95e6   Nick Piggin   [PATCH] mm: rmap ...
527
528
  	if (atomic_inc_and_test(&page->_mapcount))
  		__page_set_anon_rmap(page, vma, address);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
529
530
  	/* else checking page index and mapping is racy */
  }
9617d95e6   Nick Piggin   [PATCH] mm: rmap ...
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
  /**
   * page_add_new_anon_rmap - add pte mapping to a new anonymous page
   * @page:	the page to add the mapping to
   * @vma:	the vm area in which the mapping is added
   * @address:	the user virtual address mapped
   *
   * Same as page_add_anon_rmap but must only be called on *new* pages.
   * This means the inc-and-test can be bypassed: _mapcount is set directly
   * to 0 and the anon rmap is set up unconditionally.
   */
  void page_add_new_anon_rmap(struct page *page,
  	struct vm_area_struct *vma, unsigned long address)
  {
  	atomic_set(&page->_mapcount, 0); /* elevate count by 1 (starts at -1) */
  	__page_set_anon_rmap(page, vma, address);
  }
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
546
547
548
549
  /**
   * page_add_file_rmap - add pte mapping to a file page
   * @page: the page to add the mapping to
   *
b8072f099   Hugh Dickins   [PATCH] mm: updat...
550
   * The caller needs to hold the pte lock.
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
551
552
553
   */
  void page_add_file_rmap(struct page *page)
  {
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
554
  	if (atomic_inc_and_test(&page->_mapcount))
65ba55f50   Christoph Lameter   [PATCH] zoned vm ...
555
  		__inc_zone_page_state(page, NR_FILE_MAPPED);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
556
557
558
559
560
561
  }
  
  /**
   * page_remove_rmap - take down pte mapping from a page
   * @page: page to remove mapping from
   *
b8072f099   Hugh Dickins   [PATCH] mm: updat...
562
   * The caller needs to hold the pte lock.
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
563
564
565
   */
  void page_remove_rmap(struct page *page)
  {
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
566
  	if (atomic_add_negative(-1, &page->_mapcount)) {
b7ab795b7   Nick Piggin   [PATCH] mm: more ...
567
568
  #ifdef CONFIG_DEBUG_VM
  		if (unlikely(page_mapcount(page) < 0)) {
ef2bf0dc8   Dave Jones   [PATCH] rmap: add...
569
570
571
572
573
574
575
576
577
  			printk (KERN_EMERG "Eeek! page_mapcount(page) went negative! (%d)
  ", page_mapcount(page));
  			printk (KERN_EMERG "  page->flags = %lx
  ", page->flags);
  			printk (KERN_EMERG "  page->count = %x
  ", page_count(page));
  			printk (KERN_EMERG "  page->mapping = %p
  ", page->mapping);
  		}
b7ab795b7   Nick Piggin   [PATCH] mm: more ...
578
  #endif
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
579
580
581
582
583
584
585
586
587
588
589
590
  		BUG_ON(page_mapcount(page) < 0);
  		/*
  		 * It would be tidy to reset the PageAnon mapping here,
  		 * but that might overwrite a racing page_add_anon_rmap
  		 * which increments mapcount after us but sets mapping
  		 * before us: so leave the reset to free_hot_cold_page,
  		 * and remember that it's only reliable while mapped.
  		 * Leaving it set also helps swapoff to reinstate ptes
  		 * faster for those pages still in swapcache.
  		 */
  		if (page_test_and_clear_dirty(page))
  			set_page_dirty(page);
f3dbd3446   Christoph Lameter   [PATCH] zoned vm ...
591
592
  		__dec_zone_page_state(page,
  				PageAnon(page) ? NR_ANON_PAGES : NR_FILE_MAPPED);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
593
594
595
596
597
598
599
  	}
  }
  
  /*
   * Subfunctions of try_to_unmap: try_to_unmap_one called
   * repeatedly from either try_to_unmap_anon or try_to_unmap_file.
   */
a48d07afd   Christoph Lameter   [PATCH] Direct Mi...
600
  static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
7352349a1   Christoph Lameter   [PATCH] page migr...
601
  				int migration)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
602
603
604
  {
  	struct mm_struct *mm = vma->vm_mm;
  	unsigned long address;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
605
606
  	pte_t *pte;
  	pte_t pteval;
c0718806c   Hugh Dickins   [PATCH] mm: rmap ...
607
  	spinlock_t *ptl;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
608
  	int ret = SWAP_AGAIN;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
609
610
611
  	address = vma_address(page, vma);
  	if (address == -EFAULT)
  		goto out;
c0718806c   Hugh Dickins   [PATCH] mm: rmap ...
612
613
  	pte = page_check_address(page, mm, address, &ptl);
  	if (!pte)
81b4082dc   Nikita Danilov   [PATCH] mm: rmap....
614
  		goto out;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
615
616
617
618
619
620
  
  	/*
  	 * If the page is mlock()d, we cannot swap it out.
  	 * If it's recently referenced (perhaps page_referenced
  	 * skipped over this mm) then we should reactivate it.
  	 */
e6a1530d6   Christoph Lameter   [PATCH] Allow mig...
621
622
  	if (!migration && ((vma->vm_flags & VM_LOCKED) ||
  			(ptep_clear_flush_young(vma, address, pte)))) {
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
623
624
625
  		ret = SWAP_FAIL;
  		goto out_unmap;
  	}
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
626
627
628
629
630
631
632
  	/* Nuke the page table entry. */
  	flush_cache_page(vma, address, page_to_pfn(page));
  	pteval = ptep_clear_flush(vma, address, pte);
  
  	/* Move the dirty bit to the physical page now the pte is gone. */
  	if (pte_dirty(pteval))
  		set_page_dirty(page);
365e9c87a   Hugh Dickins   [PATCH] mm: updat...
633
634
  	/* Update high watermark before we lower rss */
  	update_hiwater_rss(mm);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
635
  	if (PageAnon(page)) {
4c21e2f24   Hugh Dickins   [PATCH] mm: split...
636
  		swp_entry_t entry = { .val = page_private(page) };
0697212a4   Christoph Lameter   [PATCH] Swapless ...
637
638
639
640
641
642
643
644
645
646
647
648
649
  
  		if (PageSwapCache(page)) {
  			/*
  			 * Store the swap location in the pte.
  			 * See handle_pte_fault() ...
  			 */
  			swap_duplicate(entry);
  			if (list_empty(&mm->mmlist)) {
  				spin_lock(&mmlist_lock);
  				if (list_empty(&mm->mmlist))
  					list_add(&mm->mmlist, &init_mm.mmlist);
  				spin_unlock(&mmlist_lock);
  			}
442c9137d   Christoph Lameter   [PATCH] More page...
650
  			dec_mm_counter(mm, anon_rss);
04e62a29b   Christoph Lameter   [PATCH] More page...
651
  #ifdef CONFIG_MIGRATION
0697212a4   Christoph Lameter   [PATCH] Swapless ...
652
653
654
655
656
657
658
659
  		} else {
  			/*
  			 * Store the pfn of the page in a special migration
  			 * pte. do_swap_page() will wait until the migration
  			 * pte is removed and then restart fault handling.
  			 */
  			BUG_ON(!migration);
  			entry = make_migration_entry(page, pte_write(pteval));
04e62a29b   Christoph Lameter   [PATCH] More page...
660
  #endif
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
661
662
663
  		}
  		set_pte_at(mm, address, pte, swp_entry_to_pte(entry));
  		BUG_ON(pte_file(*pte));
4294621f4   Hugh Dickins   [PATCH] mm: rss =...
664
  	} else
04e62a29b   Christoph Lameter   [PATCH] More page...
665
666
667
668
669
670
671
672
  #ifdef CONFIG_MIGRATION
  	if (migration) {
  		/* Establish migration entry for a file page */
  		swp_entry_t entry;
  		entry = make_migration_entry(page, pte_write(pteval));
  		set_pte_at(mm, address, pte, swp_entry_to_pte(entry));
  	} else
  #endif
4294621f4   Hugh Dickins   [PATCH] mm: rss =...
673
  		dec_mm_counter(mm, file_rss);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
674

04e62a29b   Christoph Lameter   [PATCH] More page...
675

1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
676
677
678
679
  	page_remove_rmap(page);
  	page_cache_release(page);
  
  out_unmap:
c0718806c   Hugh Dickins   [PATCH] mm: rmap ...
680
  	pte_unmap_unlock(pte, ptl);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
  out:
  	return ret;
  }
  
  /*
   * objrmap doesn't work for nonlinear VMAs because the assumption that
   * offset-into-file correlates with offset-into-virtual-addresses does not hold.
   * Consequently, given a particular page and its ->index, we cannot locate the
   * ptes which are mapping that page without an exhaustive linear search.
   *
   * So what this code does is a mini "virtual scan" of each nonlinear VMA which
   * maps the file to which the target page belongs.  The ->vm_private_data field
   * holds the current cursor into that scan.  Successive searches will circulate
   * around the vma's virtual address space.
   *
   * So as more replacement pressure is applied to the pages in a nonlinear VMA,
   * more scanning pressure is placed against them as well.   Eventually pages
   * will become fully unmapped and are eligible for eviction.
   *
   * For very sparsely populated VMAs this is a little inefficient - chances are
   * there won't be many ptes located within the scan cluster.  In this case
   * maybe we could scan further - to the end of the pte page, perhaps.
   */
  #define CLUSTER_SIZE	min(32*PAGE_SIZE, PMD_SIZE)
  #define CLUSTER_MASK	(~(CLUSTER_SIZE - 1))
  
  static void try_to_unmap_cluster(unsigned long cursor,
  	unsigned int *mapcount, struct vm_area_struct *vma)
  {
  	struct mm_struct *mm = vma->vm_mm;
  	pgd_t *pgd;
  	pud_t *pud;
  	pmd_t *pmd;
c0718806c   Hugh Dickins   [PATCH] mm: rmap ...
714
  	pte_t *pte;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
715
  	pte_t pteval;
c0718806c   Hugh Dickins   [PATCH] mm: rmap ...
716
  	spinlock_t *ptl;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
717
718
719
  	struct page *page;
  	unsigned long address;
  	unsigned long end;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
720

1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
721
722
723
724
725
726
727
728
729
  	address = (vma->vm_start + cursor) & CLUSTER_MASK;
  	end = address + CLUSTER_SIZE;
  	if (address < vma->vm_start)
  		address = vma->vm_start;
  	if (end > vma->vm_end)
  		end = vma->vm_end;
  
  	pgd = pgd_offset(mm, address);
  	if (!pgd_present(*pgd))
c0718806c   Hugh Dickins   [PATCH] mm: rmap ...
730
  		return;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
731
732
733
  
  	pud = pud_offset(pgd, address);
  	if (!pud_present(*pud))
c0718806c   Hugh Dickins   [PATCH] mm: rmap ...
734
  		return;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
735
736
737
  
  	pmd = pmd_offset(pud, address);
  	if (!pmd_present(*pmd))
c0718806c   Hugh Dickins   [PATCH] mm: rmap ...
738
739
740
  		return;
  
  	pte = pte_offset_map_lock(mm, pmd, address, &ptl);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
741

365e9c87a   Hugh Dickins   [PATCH] mm: updat...
742
743
  	/* Update high watermark before we lower rss */
  	update_hiwater_rss(mm);
c0718806c   Hugh Dickins   [PATCH] mm: rmap ...
744
  	for (; address < end; pte++, address += PAGE_SIZE) {
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
745
746
  		if (!pte_present(*pte))
  			continue;
6aab341e0   Linus Torvalds   mm: re-architect ...
747
748
  		page = vm_normal_page(vma, address, *pte);
  		BUG_ON(!page || PageAnon(page));
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
749
750
751
752
753
  
  		if (ptep_clear_flush_young(vma, address, pte))
  			continue;
  
  		/* Nuke the page table entry. */
eca351336   Ben Collins   [PATCH] Fix missi...
754
  		flush_cache_page(vma, address, pte_pfn(*pte));
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
755
756
757
758
759
760
761
762
763
764
765
766
  		pteval = ptep_clear_flush(vma, address, pte);
  
  		/* If nonlinear, store the file page offset in the pte. */
  		if (page->index != linear_page_index(vma, address))
  			set_pte_at(mm, address, pte, pgoff_to_pte(page->index));
  
  		/* Move the dirty bit to the physical page now the pte is gone. */
  		if (pte_dirty(pteval))
  			set_page_dirty(page);
  
  		page_remove_rmap(page);
  		page_cache_release(page);
4294621f4   Hugh Dickins   [PATCH] mm: rss =...
767
  		dec_mm_counter(mm, file_rss);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
768
769
  		(*mapcount)--;
  	}
c0718806c   Hugh Dickins   [PATCH] mm: rmap ...
770
  	pte_unmap_unlock(pte - 1, ptl);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
771
  }
7352349a1   Christoph Lameter   [PATCH] page migr...
772
  static int try_to_unmap_anon(struct page *page, int migration)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
773
774
775
776
777
778
779
780
781
782
  {
  	struct anon_vma *anon_vma;
  	struct vm_area_struct *vma;
  	int ret = SWAP_AGAIN;
  
  	anon_vma = page_lock_anon_vma(page);
  	if (!anon_vma)
  		return ret;
  
  	list_for_each_entry(vma, &anon_vma->head, anon_vma_node) {
7352349a1   Christoph Lameter   [PATCH] page migr...
783
  		ret = try_to_unmap_one(page, vma, migration);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
  		if (ret == SWAP_FAIL || !page_mapped(page))
  			break;
  	}
  	spin_unlock(&anon_vma->lock);
  	return ret;
  }
  
  /**
   * try_to_unmap_file - unmap file page using the object-based rmap method
   * @page: the page to unmap
   *
   * Find all the mappings of a page using the mapping pointer and the vma chains
   * contained in the address_space struct it points to.
   *
   * This function is only called from try_to_unmap for object-based pages.
   */
7352349a1   Christoph Lameter   [PATCH] page migr...
800
  static int try_to_unmap_file(struct page *page, int migration)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
801
802
803
804
805
806
807
808
809
810
811
812
813
  {
  	struct address_space *mapping = page->mapping;
  	pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
  	struct vm_area_struct *vma;
  	struct prio_tree_iter iter;
  	int ret = SWAP_AGAIN;
  	unsigned long cursor;
  	unsigned long max_nl_cursor = 0;
  	unsigned long max_nl_size = 0;
  	unsigned int mapcount;
  
  	spin_lock(&mapping->i_mmap_lock);
  	vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
7352349a1   Christoph Lameter   [PATCH] page migr...
814
  		ret = try_to_unmap_one(page, vma, migration);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
815
816
817
818
819
820
821
822
823
  		if (ret == SWAP_FAIL || !page_mapped(page))
  			goto out;
  	}
  
  	if (list_empty(&mapping->i_mmap_nonlinear))
  		goto out;
  
  	list_for_each_entry(vma, &mapping->i_mmap_nonlinear,
  						shared.vm_set.list) {
e6a1530d6   Christoph Lameter   [PATCH] Allow mig...
824
  		if ((vma->vm_flags & VM_LOCKED) && !migration)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
  			continue;
  		cursor = (unsigned long) vma->vm_private_data;
  		if (cursor > max_nl_cursor)
  			max_nl_cursor = cursor;
  		cursor = vma->vm_end - vma->vm_start;
  		if (cursor > max_nl_size)
  			max_nl_size = cursor;
  	}
  
  	if (max_nl_size == 0) {	/* any nonlinears locked or reserved */
  		ret = SWAP_FAIL;
  		goto out;
  	}
  
  	/*
  	 * We don't try to search for this page in the nonlinear vmas,
  	 * and page_referenced wouldn't have found it anyway.  Instead
  	 * just walk the nonlinear vmas trying to age and unmap some.
  	 * The mapcount of the page we came in with is irrelevant,
  	 * but even so use it as a guide to how hard we should try?
  	 */
  	mapcount = page_mapcount(page);
  	if (!mapcount)
  		goto out;
  	cond_resched_lock(&mapping->i_mmap_lock);
  
  	max_nl_size = (max_nl_size + CLUSTER_SIZE - 1) & CLUSTER_MASK;
  	if (max_nl_cursor == 0)
  		max_nl_cursor = CLUSTER_SIZE;
  
  	do {
  		list_for_each_entry(vma, &mapping->i_mmap_nonlinear,
  						shared.vm_set.list) {
e6a1530d6   Christoph Lameter   [PATCH] Allow mig...
858
  			if ((vma->vm_flags & VM_LOCKED) && !migration)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
859
860
  				continue;
  			cursor = (unsigned long) vma->vm_private_data;
839b9685e   Hugh Dickins   [PATCH] rmap: don...
861
  			while ( cursor < max_nl_cursor &&
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
  				cursor < vma->vm_end - vma->vm_start) {
  				try_to_unmap_cluster(cursor, &mapcount, vma);
  				cursor += CLUSTER_SIZE;
  				vma->vm_private_data = (void *) cursor;
  				if ((int)mapcount <= 0)
  					goto out;
  			}
  			vma->vm_private_data = (void *) max_nl_cursor;
  		}
  		cond_resched_lock(&mapping->i_mmap_lock);
  		max_nl_cursor += CLUSTER_SIZE;
  	} while (max_nl_cursor <= max_nl_size);
  
  	/*
  	 * Don't loop forever (perhaps all the remaining pages are
  	 * in locked vmas).  Reset cursor on all unreserved nonlinear
  	 * vmas, now forgetting on which ones it had fallen behind.
  	 */
101d2be76   Hugh Dickins   [PATCH] unpaged: ...
880
881
  	list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list)
  		vma->vm_private_data = NULL;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
  out:
  	spin_unlock(&mapping->i_mmap_lock);
  	return ret;
  }
  
  /**
   * try_to_unmap - try to remove all page table mappings to a page
   * @page: the page to get unmapped
   *
   * Tries to remove all the page table entries which are mapping this
   * page, used in the pageout path.  Caller must hold the page lock.
   * Return values are:
   *
   * SWAP_SUCCESS	- we succeeded in removing all mappings
   * SWAP_AGAIN	- we missed a mapping, try again later
   * SWAP_FAIL	- the page is unswappable
   */
7352349a1   Christoph Lameter   [PATCH] page migr...
899
  int try_to_unmap(struct page *page, int migration)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
900
901
  {
  	int ret;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
902
903
904
  	BUG_ON(!PageLocked(page));
  
  	if (PageAnon(page))
7352349a1   Christoph Lameter   [PATCH] page migr...
905
  		ret = try_to_unmap_anon(page, migration);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
906
  	else
7352349a1   Christoph Lameter   [PATCH] page migr...
907
  		ret = try_to_unmap_file(page, migration);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
908
909
910
911
912
  
  	if (!page_mapped(page))
  		ret = SWAP_SUCCESS;
  	return ret;
  }
81b4082dc   Nikita Danilov   [PATCH] mm: rmap....
913