Blame view

mm/rmap.c 34.6 KB
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
  /*
   * mm/rmap.c - physical to virtual reverse mappings
   *
   * Copyright 2001, Rik van Riel <riel@conectiva.com.br>
   * Released under the General Public License (GPL).
   *
   * Simple, low overhead reverse mapping scheme.
   * Please try to keep this thing as modular as possible.
   *
   * Provides methods for unmapping each kind of mapped page:
   * the anon methods track anonymous pages, and
   * the file methods track pages belonging to an inode.
   *
   * Original design by Rik van Riel <riel@conectiva.com.br> 2001
   * File methods by Dave McCracken <dmccr@us.ibm.com> 2003, 2004
   * Anonymous methods by Andrea Arcangeli <andrea@suse.de> 2004
   * Contributions by Hugh Dickins <hugh@veritas.com> 2003, 2004
   */
  
  /*
   * Lock ordering in mm:
   *
1b1dcc1b5   Jes Sorensen   [PATCH] mutex sub...
23
   * inode->i_mutex	(while writing or truncating, not reading or faulting)
82591e6ea   Nick Piggin   [PATCH] mm: more ...
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
   *   inode->i_alloc_sem (vmtruncate_range)
   *   mm->mmap_sem
   *     page->flags PG_locked (lock_page)
   *       mapping->i_mmap_lock
   *         anon_vma->lock
   *           mm->page_table_lock or pte_lock
   *             zone->lru_lock (in mark_page_accessed, isolate_lru_page)
   *             swap_lock (in swap_duplicate, swap_info_get)
   *               mmlist_lock (in mmput, drain_mmlist and others)
   *               mapping->private_lock (in __set_page_dirty_buffers)
   *               inode_lock (in set_page_dirty's __mark_inode_dirty)
   *                 sb_lock (within inode_lock in fs/fs-writeback.c)
   *                 mapping->tree_lock (widely used, in set_page_dirty,
   *                           in arch-dependent flush_dcache_mmap_lock,
   *                           within inode_lock in __sync_single_inode)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
39
40
41
42
43
44
45
46
47
48
   */
  
  #include <linux/mm.h>
  #include <linux/pagemap.h>
  #include <linux/swap.h>
  #include <linux/swapops.h>
  #include <linux/slab.h>
  #include <linux/init.h>
  #include <linux/rmap.h>
  #include <linux/rcupdate.h>
a48d07afd   Christoph Lameter   [PATCH] Direct Mi...
49
  #include <linux/module.h>
7de6b8057   Nick Piggin   [PATCH] mm: more ...
50
  #include <linux/kallsyms.h>
8a9f3ccd2   Balbir Singh   Memory controller...
51
  #include <linux/memcontrol.h>
cddb8a5c1   Andrea Arcangeli   mmu-notifiers: core
52
  #include <linux/mmu_notifier.h>
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
53
54
  
  #include <asm/tlbflush.h>
b291f0003   Nick Piggin   mlock: mlocked pa...
55
  #include "internal.h"
fdd2e5f88   Adrian Bunk   make mm/rmap.c:an...
56
57
58
59
60
61
62
63
64
65
66
  static struct kmem_cache *anon_vma_cachep;
  
  /*
   * Allocate one anon_vma object from the anon_vma_cachep slab cache using
   * GFP_KERNEL.  The result of kmem_cache_alloc() is handed back unchanged;
   * anon_vma_prepare() treats a NULL result as -ENOMEM.
   */
  static inline struct anon_vma *anon_vma_alloc(void)
  {
  	struct anon_vma *fresh;
  
  	fresh = kmem_cache_alloc(anon_vma_cachep, GFP_KERNEL);
  	return fresh;
  }
  
  /* Hand @anon_vma back to the anon_vma_cachep slab cache. */
  static inline void anon_vma_free(struct anon_vma *anon_vma)
  {
  	kmem_cache_free(anon_vma_cachep, anon_vma);
  }
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
67

d9d332e08   Linus Torvalds   anon_vma_prepare:...
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
  /**
   * anon_vma_prepare - attach an anon_vma to a memory region
   * @vma: the memory region in question
   *
   * This makes sure the memory mapping described by 'vma' has
   * an 'anon_vma' attached to it, so that we can associate the
   * anonymous pages mapped into it with that anon_vma.
   *
   * The common case will be that we already have one, but
   * if not we either need to find an adjacent mapping that we
   * can re-use the anon_vma from (very common when the only
   * reason for splitting a vma has been mprotect()), or we
   * allocate a new one.
   *
   * Anon-vma allocations are very subtle, because we may have
   * optimistically looked up an anon_vma in page_lock_anon_vma()
   * and that may actually touch the spinlock even in the newly
   * allocated vma (it depends on RCU to make sure that the
   * anon_vma isn't actually destroyed).
   *
   * As a result, we need to do proper anon_vma locking even
   * for the new allocation. At the same time, we do not want
   * to do any locking for the common case of already having
   * an anon_vma.
   *
   * This must be called with the mmap_sem held for reading.
   */
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
95
96
97
98
99
100
101
  int anon_vma_prepare(struct vm_area_struct *vma)
  {
  	struct anon_vma *anon_vma = vma->anon_vma;
  
  	might_sleep();
  	if (unlikely(!anon_vma)) {
  		struct mm_struct *mm = vma->vm_mm;
d9d332e08   Linus Torvalds   anon_vma_prepare:...
102
  		struct anon_vma *allocated;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
103
104
  
  		anon_vma = find_mergeable_anon_vma(vma);
d9d332e08   Linus Torvalds   anon_vma_prepare:...
105
106
  		allocated = NULL;
  		if (!anon_vma) {
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
107
108
109
110
  			anon_vma = anon_vma_alloc();
  			if (unlikely(!anon_vma))
  				return -ENOMEM;
  			allocated = anon_vma;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
111
  		}
d9d332e08   Linus Torvalds   anon_vma_prepare:...
112
  		spin_lock(&anon_vma->lock);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
113
114
115
116
117
  
  		/* page_table_lock to protect against threads */
  		spin_lock(&mm->page_table_lock);
  		if (likely(!vma->anon_vma)) {
  			vma->anon_vma = anon_vma;
0697212a4   Christoph Lameter   [PATCH] Swapless ...
118
  			list_add_tail(&vma->anon_vma_node, &anon_vma->head);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
119
120
121
  			allocated = NULL;
  		}
  		spin_unlock(&mm->page_table_lock);
d9d332e08   Linus Torvalds   anon_vma_prepare:...
122
  		spin_unlock(&anon_vma->lock);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
  		if (unlikely(allocated))
  			anon_vma_free(allocated);
  	}
  	return 0;
  }
  
  /*
   * Remove @next from the anon_vma list it sits on.  @vma and @next must
   * reference the same anon_vma; anything else is a BUG.  No lock is taken
   * here.
   */
  void __anon_vma_merge(struct vm_area_struct *vma, struct vm_area_struct *next)
  {
  	struct anon_vma *shared = vma->anon_vma;
  
  	BUG_ON(shared != next->anon_vma);
  	list_del(&next->anon_vma_node);
  }
  
  void __anon_vma_link(struct vm_area_struct *vma)
  {
  	struct anon_vma *anon_vma = vma->anon_vma;
30acbabae   Hugh Dickins   mm: kill validate...
138
  	if (anon_vma)
0697212a4   Christoph Lameter   [PATCH] Swapless ...
139
  		list_add_tail(&vma->anon_vma_node, &anon_vma->head);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
140
141
142
143
144
145
146
147
  }
  
  /*
   * Append @vma to the list of its anon_vma (if any) while holding
   * anon_vma->lock.
   */
  void anon_vma_link(struct vm_area_struct *vma)
  {
  	struct anon_vma *anon_vma = vma->anon_vma;
  
  	if (!anon_vma)
  		return;
  
  	spin_lock(&anon_vma->lock);
  	list_add_tail(&vma->anon_vma_node, &anon_vma->head);
  	spin_unlock(&anon_vma->lock);
  }
  
  void anon_vma_unlink(struct vm_area_struct *vma)
  {
  	struct anon_vma *anon_vma = vma->anon_vma;
  	int empty;
  
  	if (!anon_vma)
  		return;
  
  	spin_lock(&anon_vma->lock);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
162
163
164
165
166
167
168
169
170
  	list_del(&vma->anon_vma_node);
  
  	/* We must garbage collect the anon_vma if it's empty */
  	empty = list_empty(&anon_vma->head);
  	spin_unlock(&anon_vma->lock);
  
  	if (empty)
  		anon_vma_free(anon_vma);
  }
51cc50685   Alexey Dobriyan   SL*B: drop kmem c...
171
  static void anon_vma_ctor(void *data)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
172
  {
a35afb830   Christoph Lameter   Remove SLAB_CTOR_...
173
  	struct anon_vma *anon_vma = data;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
174

a35afb830   Christoph Lameter   Remove SLAB_CTOR_...
175
176
  	spin_lock_init(&anon_vma->lock);
  	INIT_LIST_HEAD(&anon_vma->head);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
177
178
179
180
181
  }
  
  /*
   * Create the "anon_vma" slab cache.  SLAB_PANIC is passed, so this
   * function does not check the result itself.  The cache is created with
   * SLAB_DESTROY_BY_RCU, and objects are initialised by anon_vma_ctor().
   */
  void __init anon_vma_init(void)
  {
  	anon_vma_cachep = kmem_cache_create("anon_vma",
  					    sizeof(struct anon_vma),
  					    0,
  					    SLAB_DESTROY_BY_RCU | SLAB_PANIC,
  					    anon_vma_ctor);
  }
  
  /*
   * Getting a lock on a stable anon_vma from a page off the LRU is
   * tricky: page_lock_anon_vma relies on RCU to guard against the races.
   */
af936a160   Lee Schermerhorn   vmscan: unevictab...
189
  struct anon_vma *page_lock_anon_vma(struct page *page)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
190
  {
34bbd7040   Oleg Nesterov   [PATCH] adapt pag...
191
  	struct anon_vma *anon_vma;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
192
193
194
195
196
197
198
199
200
201
202
  	unsigned long anon_mapping;
  
  	rcu_read_lock();
  	anon_mapping = (unsigned long) page->mapping;
  	if (!(anon_mapping & PAGE_MAPPING_ANON))
  		goto out;
  	if (!page_mapped(page))
  		goto out;
  
  	anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON);
  	spin_lock(&anon_vma->lock);
34bbd7040   Oleg Nesterov   [PATCH] adapt pag...
203
  	return anon_vma;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
204
205
  out:
  	rcu_read_unlock();
34bbd7040   Oleg Nesterov   [PATCH] adapt pag...
206
207
  	return NULL;
  }
af936a160   Lee Schermerhorn   vmscan: unevictab...
208
  void page_unlock_anon_vma(struct anon_vma *anon_vma)
34bbd7040   Oleg Nesterov   [PATCH] adapt pag...
209
210
211
  {
  	spin_unlock(&anon_vma->lock);
  	rcu_read_unlock();
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
212
213
214
  }
  
  /*
3ad33b243   Lee Schermerhorn   Migration: find c...
215
216
217
   * At what user virtual address is page expected in @vma?
   * Returns virtual address or -EFAULT if page's index/offset is not
   * within the range mapped by the @vma.
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
218
219
220
221
222
223
224
225
226
   */
  static inline unsigned long
  vma_address(struct page *page, struct vm_area_struct *vma)
  {
  	pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
  	unsigned long address;
  
  	address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
  	if (unlikely(address < vma->vm_start || address >= vma->vm_end)) {
3ad33b243   Lee Schermerhorn   Migration: find c...
227
  		/* page should be within @vma mapping range */
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
228
229
230
231
232
233
234
  		return -EFAULT;
  	}
  	return address;
  }
  
  /*
   * At what user virtual address is page expected in vma? checking that the
ee498ed73   Hugh Dickins   [PATCH] unpaged: ...
235
   * page matches the vma: currently only used on anon pages, by unuse_vma.
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
236
237
238
239
240
241
242
243
   */
  unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma)
  {
  	if (PageAnon(page)) {
  		/* Anonymous page: must hang off this vma's anon_vma. */
  		if ((void *)vma->anon_vma !=
  		    (void *)page->mapping - PAGE_MAPPING_ANON)
  			return -EFAULT;
  		return vma_address(page, vma);
  	}
  
  	/* Pages without a mapping and nonlinear vmas are rejected. */
  	if (!page->mapping || (vma->vm_flags & VM_NONLINEAR))
  		return -EFAULT;
  
  	/* File page: the vma must map the very same address_space. */
  	if (!vma->vm_file ||
  	    vma->vm_file->f_mapping != page->mapping)
  		return -EFAULT;
  
  	return vma_address(page, vma);
  }
  
  /*
81b4082dc   Nikita Danilov   [PATCH] mm: rmap....
253
254
   * Check that @page is mapped at @address into @mm.
   *
479db0bf4   Nick Piggin   mm: dirty page tr...
255
256
257
258
   * If @sync is false, page_check_address may perform a racy check to avoid
   * the page table lock when the pte is not present (helpful when reclaiming
   * highly shared pages).
   *
b8072f099   Hugh Dickins   [PATCH] mm: updat...
259
   * On success returns with pte mapped and locked.
81b4082dc   Nikita Danilov   [PATCH] mm: rmap....
260
   */
ceffc0785   Carsten Otte   [PATCH] xip: fs/m...
261
  pte_t *page_check_address(struct page *page, struct mm_struct *mm,
  			  unsigned long address, spinlock_t **ptlp, int sync)
  {
  	pgd_t *pgd;
  	pud_t *pud;
  	pmd_t *pmd;
  	pte_t *pte;
  	spinlock_t *ptl;
  
  	/* Walk pgd -> pud -> pmd, giving up at the first absent level. */
  	pgd = pgd_offset(mm, address);
  	if (!pgd_present(*pgd))
  		return NULL;
  
  	pud = pud_offset(pgd, address);
  	if (!pud_present(*pud))
  		return NULL;
  
  	pmd = pmd_offset(pud, address);
  	if (!pmd_present(*pmd))
  		return NULL;
  
  	pte = pte_offset_map(pmd, address);
  	/* Make a quick check before getting the lock */
  	if (!(sync || pte_present(*pte))) {
  		pte_unmap(pte);
  		return NULL;
  	}
  
  	ptl = pte_lockptr(mm, pmd);
  	spin_lock(ptl);
  	if (!pte_present(*pte) || page_to_pfn(page) != pte_pfn(*pte)) {
  		/* Under the lock the pte does not map @page after all. */
  		pte_unmap_unlock(pte, ptl);
  		return NULL;
  	}
  
  	/* Success: return with the pte mapped and its lock held. */
  	*ptlp = ptl;
  	return pte;
  }
b291f0003   Nick Piggin   mlock: mlocked pa...
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
  /**
   * page_mapped_in_vma - check whether a page is really mapped in a VMA
   * @page: the page to test
   * @vma: the VMA to test
   *
   * Computes the address @page would have in @vma and looks for a pte there
   * that maps it (page_check_address() with sync set).  Returns 1 if such a
   * pte is found, 0 if the address is outside the VMA or no pte maps the
   * page.  Only valid for normal file or anonymous VMAs.
   */
  static int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma)
  {
  	unsigned long address;
  	pte_t *pte;
  	spinlock_t *ptl;
  	int mapped = 0;
  
  	address = vma_address(page, vma);
  	if (address != -EFAULT) {		/* inside the vma range */
  		pte = page_check_address(page, vma->vm_mm, address, &ptl, 1);
  		if (pte) {			/* the page is in this mm */
  			pte_unmap_unlock(pte, ptl);
  			mapped = 1;
  		}
  	}
  
  	return mapped;
  }
81b4082dc   Nikita Danilov   [PATCH] mm: rmap....
322
  /*
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
323
324
325
326
   * Subfunctions of page_referenced: page_referenced_one called
   * repeatedly from either page_referenced_anon or page_referenced_file.
   */
  static int page_referenced_one(struct page *page,
f7b7fd8f3   Rik van Riel   [PATCH] temporari...
327
  	struct vm_area_struct *vma, unsigned int *mapcount)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
328
329
330
  {
  	struct mm_struct *mm = vma->vm_mm;
  	unsigned long address;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
331
  	pte_t *pte;
c0718806c   Hugh Dickins   [PATCH] mm: rmap ...
332
  	spinlock_t *ptl;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
333
  	int referenced = 0;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
334
335
336
  	address = vma_address(page, vma);
  	if (address == -EFAULT)
  		goto out;
479db0bf4   Nick Piggin   mm: dirty page tr...
337
  	pte = page_check_address(page, mm, address, &ptl, 0);
c0718806c   Hugh Dickins   [PATCH] mm: rmap ...
338
339
  	if (!pte)
  		goto out;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
340

b291f0003   Nick Piggin   mlock: mlocked pa...
341
342
343
344
345
  	/*
  	 * Don't want to elevate referenced for mlocked page that gets this far,
  	 * in order that it progresses to try_to_unmap and is moved to the
  	 * unevictable list.
  	 */
5a9bbdcd2   Hugh Dickins   mm: don't waste s...
346
  	if (vma->vm_flags & VM_LOCKED) {
5a9bbdcd2   Hugh Dickins   mm: don't waste s...
347
  		*mapcount = 1;	/* break early from loop */
b291f0003   Nick Piggin   mlock: mlocked pa...
348
349
350
351
  		goto out_unmap;
  	}
  
  	if (ptep_clear_flush_young_notify(vma, address, pte))
c0718806c   Hugh Dickins   [PATCH] mm: rmap ...
352
  		referenced++;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
353

c0718806c   Hugh Dickins   [PATCH] mm: rmap ...
354
355
  	/* Pretend the page is referenced if the task has the
  	   swap token and is in the middle of a page fault. */
f7b7fd8f3   Rik van Riel   [PATCH] temporari...
356
  	if (mm != current->mm && has_swap_token(mm) &&
c0718806c   Hugh Dickins   [PATCH] mm: rmap ...
357
358
  			rwsem_is_locked(&mm->mmap_sem))
  		referenced++;
b291f0003   Nick Piggin   mlock: mlocked pa...
359
  out_unmap:
c0718806c   Hugh Dickins   [PATCH] mm: rmap ...
360
361
  	(*mapcount)--;
  	pte_unmap_unlock(pte, ptl);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
362
363
364
  out:
  	return referenced;
  }
bed7161a5   Balbir Singh   Memory controller...
365
366
  static int page_referenced_anon(struct page *page,
  				struct mem_cgroup *mem_cont)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
367
368
369
370
371
372
373
374
375
376
377
378
  {
  	unsigned int mapcount;
  	struct anon_vma *anon_vma;
  	struct vm_area_struct *vma;
  	int referenced = 0;
  
  	anon_vma = page_lock_anon_vma(page);
  	if (!anon_vma)
  		return referenced;
  
  	mapcount = page_mapcount(page);
  	list_for_each_entry(vma, &anon_vma->head, anon_vma_node) {
bed7161a5   Balbir Singh   Memory controller...
379
380
381
382
383
  		/*
  		 * If we are reclaiming on behalf of a cgroup, skip
  		 * counting on behalf of references from different
  		 * cgroups
  		 */
bd845e38c   Hugh Dickins   memcg: mm_match_c...
384
  		if (mem_cont && !mm_match_cgroup(vma->vm_mm, mem_cont))
bed7161a5   Balbir Singh   Memory controller...
385
  			continue;
f7b7fd8f3   Rik van Riel   [PATCH] temporari...
386
  		referenced += page_referenced_one(page, vma, &mapcount);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
387
388
389
  		if (!mapcount)
  			break;
  	}
34bbd7040   Oleg Nesterov   [PATCH] adapt pag...
390
391
  
  	page_unlock_anon_vma(anon_vma);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
392
393
394
395
396
397
  	return referenced;
  }
  
  /**
   * page_referenced_file - referenced check for object-based rmap
   * @page: the page we're checking references on.
43d8eac44   Randy Dunlap   mm: rmap kernel-d...
398
   * @mem_cont: target memory controller
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
399
400
401
402
403
404
405
406
   *
   * For an object-based mapped page, find all the places it is mapped and
   * check/clear the referenced flag.  This is done by following the page->mapping
   * pointer, then walking the chain of vmas it holds.  It returns the number
   * of references it found.
   *
   * This function is only called from page_referenced for object-based pages.
   */
bed7161a5   Balbir Singh   Memory controller...
407
408
  static int page_referenced_file(struct page *page,
  				struct mem_cgroup *mem_cont)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
  {
  	unsigned int mapcount;
  	struct address_space *mapping = page->mapping;
  	pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
  	struct vm_area_struct *vma;
  	struct prio_tree_iter iter;
  	int referenced = 0;
  
  	/*
  	 * The caller's checks on page->mapping and !PageAnon have made
  	 * sure that this is a file page: the check for page->mapping
  	 * excludes the case just before it gets set on an anon page.
  	 */
  	BUG_ON(PageAnon(page));
  
  	/*
  	 * The page lock not only makes sure that page->mapping cannot
  	 * suddenly be NULLified by truncation, it makes sure that the
  	 * structure at mapping cannot be freed and reused yet,
  	 * so we can safely take mapping->i_mmap_lock.
  	 */
  	BUG_ON(!PageLocked(page));
  
  	spin_lock(&mapping->i_mmap_lock);
  
  	/*
  	 * i_mmap_lock does not stabilize mapcount at all, but mapcount
  	 * is more likely to be accurate if we note it after spinning.
  	 */
  	mapcount = page_mapcount(page);
  
  	vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
bed7161a5   Balbir Singh   Memory controller...
441
442
443
444
445
  		/*
  		 * If we are reclaiming on behalf of a cgroup, skip
  		 * counting on behalf of references from different
  		 * cgroups
  		 */
bd845e38c   Hugh Dickins   memcg: mm_match_c...
446
  		if (mem_cont && !mm_match_cgroup(vma->vm_mm, mem_cont))
bed7161a5   Balbir Singh   Memory controller...
447
  			continue;
f7b7fd8f3   Rik van Riel   [PATCH] temporari...
448
  		referenced += page_referenced_one(page, vma, &mapcount);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
449
450
451
452
453
454
455
456
457
458
459
460
  		if (!mapcount)
  			break;
  	}
  
  	spin_unlock(&mapping->i_mmap_lock);
  	return referenced;
  }
  
  /**
   * page_referenced - test if the page was referenced
   * @page: the page to test
   * @is_locked: caller holds lock on the page
43d8eac44   Randy Dunlap   mm: rmap kernel-d...
461
   * @mem_cont: target memory controller
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
462
463
464
465
   *
   * Quick test_and_clear_referenced for all mappings to a page,
   * returns the number of ptes which referenced the page.
   */
bed7161a5   Balbir Singh   Memory controller...
466
467
  int page_referenced(struct page *page, int is_locked,
  			struct mem_cgroup *mem_cont)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
468
469
  {
  	int referenced = 0;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
470
471
472
473
474
  	if (TestClearPageReferenced(page))
  		referenced++;
  
  	if (page_mapped(page) && page->mapping) {
  		if (PageAnon(page))
bed7161a5   Balbir Singh   Memory controller...
475
  			referenced += page_referenced_anon(page, mem_cont);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
476
  		else if (is_locked)
bed7161a5   Balbir Singh   Memory controller...
477
  			referenced += page_referenced_file(page, mem_cont);
529ae9aaa   Nick Piggin   mm: rename page t...
478
  		else if (!trylock_page(page))
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
479
480
481
  			referenced++;
  		else {
  			if (page->mapping)
bed7161a5   Balbir Singh   Memory controller...
482
483
  				referenced +=
  					page_referenced_file(page, mem_cont);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
484
485
486
  			unlock_page(page);
  		}
  	}
5b7baf057   Christian Borntraeger   s390: KVM prepara...
487
488
489
  
  	if (page_test_and_clear_young(page))
  		referenced++;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
490
491
  	return referenced;
  }
d08b3851d   Peter Zijlstra   [PATCH] mm: track...
492
493
494
495
  static int page_mkclean_one(struct page *page, struct vm_area_struct *vma)
  {
  	struct mm_struct *mm = vma->vm_mm;
  	unsigned long address;
c2fda5fed   Peter Zijlstra   [PATCH] Fix up pa...
496
  	pte_t *pte;
d08b3851d   Peter Zijlstra   [PATCH] mm: track...
497
498
499
500
501
502
  	spinlock_t *ptl;
  	int ret = 0;
  
  	address = vma_address(page, vma);
  	if (address == -EFAULT)
  		goto out;
479db0bf4   Nick Piggin   mm: dirty page tr...
503
  	pte = page_check_address(page, mm, address, &ptl, 1);
d08b3851d   Peter Zijlstra   [PATCH] mm: track...
504
505
  	if (!pte)
  		goto out;
c2fda5fed   Peter Zijlstra   [PATCH] Fix up pa...
506
507
  	if (pte_dirty(*pte) || pte_write(*pte)) {
  		pte_t entry;
d08b3851d   Peter Zijlstra   [PATCH] mm: track...
508

c2fda5fed   Peter Zijlstra   [PATCH] Fix up pa...
509
  		flush_cache_page(vma, address, pte_pfn(*pte));
cddb8a5c1   Andrea Arcangeli   mmu-notifiers: core
510
  		entry = ptep_clear_flush_notify(vma, address, pte);
c2fda5fed   Peter Zijlstra   [PATCH] Fix up pa...
511
512
  		entry = pte_wrprotect(entry);
  		entry = pte_mkclean(entry);
d6e88e671   Al Viro   [PATCH] page_mkcl...
513
  		set_pte_at(mm, address, pte, entry);
c2fda5fed   Peter Zijlstra   [PATCH] Fix up pa...
514
515
  		ret = 1;
  	}
d08b3851d   Peter Zijlstra   [PATCH] mm: track...
516

d08b3851d   Peter Zijlstra   [PATCH] mm: track...
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
  	pte_unmap_unlock(pte, ptl);
  out:
  	return ret;
  }
  
  /*
   * Run page_mkclean_one() on every VM_SHARED vma of @mapping that covers
   * @page's offset, under mapping->i_mmap_lock.  Returns the number of ptes
   * that were cleaned.
   */
  static int page_mkclean_file(struct address_space *mapping, struct page *page)
  {
  	pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
  	struct vm_area_struct *vma;
  	struct prio_tree_iter iter;
  	int cleaned = 0;
  
  	BUG_ON(PageAnon(page));
  
  	spin_lock(&mapping->i_mmap_lock);
  	vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
  		/* Only shared mappings are write-protected here. */
  		if (!(vma->vm_flags & VM_SHARED))
  			continue;
  		cleaned += page_mkclean_one(page, vma);
  	}
  	spin_unlock(&mapping->i_mmap_lock);
  
  	return cleaned;
  }
  
  /*
   * Clean and write-protect all shared mappings of a locked @page.  Returns
   * 0 when the page is unmapped or has no mapping; otherwise the count from
   * page_mkclean_file(), or 1 when page_test_dirty() reported the page
   * dirty (in which case that state is cleared as well).
   */
  int page_mkclean(struct page *page)
  {
  	struct address_space *mapping;
  	int ret;
  
  	BUG_ON(!PageLocked(page));
  
  	if (!page_mapped(page))
  		return 0;
  
  	mapping = page_mapping(page);
  	if (!mapping)
  		return 0;
  
  	ret = page_mkclean_file(mapping, page);
  	if (page_test_dirty(page)) {
  		page_clear_dirty(page);
  		ret = 1;
  	}
  
  	return ret;
  }
60b59beaf   Jaya Kumar   fbdev: mm: Deferr...
559
  EXPORT_SYMBOL_GPL(page_mkclean);
d08b3851d   Peter Zijlstra   [PATCH] mm: track...
560

1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
561
  /**
43d8eac44   Randy Dunlap   mm: rmap kernel-d...
562
   * __page_set_anon_rmap - setup new anonymous rmap
9617d95e6   Nick Piggin   [PATCH] mm: rmap ...
563
564
565
566
567
568
569
570
571
572
573
574
575
576
   * @page:	the page to add the mapping to
   * @vma:	the vm area in which the mapping is added
   * @address:	the user virtual address mapped
   */
  static void __page_set_anon_rmap(struct page *page,
  	struct vm_area_struct *vma, unsigned long address)
  {
  	struct anon_vma *anon_vma = vma->anon_vma;
  
  	BUG_ON(!anon_vma);
  	anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
  	page->mapping = (struct address_space *) anon_vma;
  
  	page->index = linear_page_index(vma, address);
a74609faf   Nick Piggin   [PATCH] mm: page_...
577
578
579
580
  	/*
  	 * nr_mapped state can be updated without turning off
  	 * interrupts because it is not modified via interrupt.
  	 */
f3dbd3446   Christoph Lameter   [PATCH] zoned vm ...
581
  	__inc_zone_page_state(page, NR_ANON_PAGES);
9617d95e6   Nick Piggin   [PATCH] mm: rmap ...
582
583
584
  }
  
  /**
43d8eac44   Randy Dunlap   mm: rmap kernel-d...
585
   * __page_check_anon_rmap - sanity check anonymous rmap addition
c97a9e10e   Nick Piggin   mm: more rmap che...
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
   * @page:	the page to add the mapping to
   * @vma:	the vm area in which the mapping is added
   * @address:	the user virtual address mapped
   */
  static void __page_check_anon_rmap(struct page *page,
  	struct vm_area_struct *vma, unsigned long address)
  {
  #ifdef CONFIG_DEBUG_VM
  	/*
  	 * The page's anon-rmap details (mapping and index) are guaranteed to
  	 * be set up correctly at this point.
  	 *
  	 * We have exclusion against page_add_anon_rmap because the caller
  	 * always holds the page locked, except if called from page_dup_rmap,
  	 * in which case the page is already known to be setup.
  	 *
  	 * We have exclusion against page_add_new_anon_rmap because those pages
  	 * are initially only visible via the pagetables, and the pte is locked
  	 * over the call to page_add_new_anon_rmap.
  	 */
  	struct address_space *expected;
  
  	/* Same PAGE_MAPPING_ANON tagging as __page_set_anon_rmap(). */
  	expected = (struct address_space *)
  			((void *) vma->anon_vma + PAGE_MAPPING_ANON);
  	BUG_ON(page->mapping != expected);
  	BUG_ON(page->index != linear_page_index(vma, address));
  #endif
  }
  
  /**
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
614
615
616
617
618
   * page_add_anon_rmap - add pte mapping to an anonymous page
   * @page:	the page to add the mapping to
   * @vma:	the vm area in which the mapping is added
   * @address:	the user virtual address mapped
   *
c97a9e10e   Nick Piggin   mm: more rmap che...
619
   * The caller needs to hold the pte lock and the page must be locked.
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
620
621
622
623
   */
  void page_add_anon_rmap(struct page *page,
  	struct vm_area_struct *vma, unsigned long address)
  {
c97a9e10e   Nick Piggin   mm: more rmap che...
624
625
  	VM_BUG_ON(!PageLocked(page));
  	VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end);
9617d95e6   Nick Piggin   [PATCH] mm: rmap ...
626
627
  	if (atomic_inc_and_test(&page->_mapcount))
  		__page_set_anon_rmap(page, vma, address);
69029cd55   KAMEZAWA Hiroyuki   memcg: remove ref...
628
  	else
c97a9e10e   Nick Piggin   mm: more rmap che...
629
  		__page_check_anon_rmap(page, vma, address);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
630
  }
43d8eac44   Randy Dunlap   mm: rmap kernel-d...
631
  /**
9617d95e6   Nick Piggin   [PATCH] mm: rmap ...
632
633
634
635
636
637
638
   * page_add_new_anon_rmap - add pte mapping to a new anonymous page
   * @page:	the page to add the mapping to
   * @vma:	the vm area in which the mapping is added
   * @address:	the user virtual address mapped
   *
   * Same as page_add_anon_rmap but must only be called on *new* pages.
   * This means the inc-and-test can be bypassed.
c97a9e10e   Nick Piggin   mm: more rmap che...
639
   * Page does not have to be locked.
9617d95e6   Nick Piggin   [PATCH] mm: rmap ...
640
641
642
643
   */
  void page_add_new_anon_rmap(struct page *page,
  	struct vm_area_struct *vma, unsigned long address)
  {
c97a9e10e   Nick Piggin   mm: more rmap che...
644
  	BUG_ON(address < vma->vm_start || address >= vma->vm_end);
9617d95e6   Nick Piggin   [PATCH] mm: rmap ...
645
646
647
  	atomic_set(&page->_mapcount, 0); /* elevate count by 1 (starts at -1) */
  	__page_set_anon_rmap(page, vma, address);
  }
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
648
649
650
651
  /**
   * page_add_file_rmap - add pte mapping to a file page
   * @page: the page to add the mapping to
   *
b8072f099   Hugh Dickins   [PATCH] mm: updat...
652
   * The caller needs to hold the pte lock.
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
653
654
655
   */
  void page_add_file_rmap(struct page *page)
  {
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
656
  	if (atomic_inc_and_test(&page->_mapcount))
65ba55f50   Christoph Lameter   [PATCH] zoned vm ...
657
  		__inc_zone_page_state(page, NR_FILE_MAPPED);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
658
  }
c97a9e10e   Nick Piggin   mm: more rmap che...
659
660
661
662
  #ifdef CONFIG_DEBUG_VM
  /**
   * page_dup_rmap - duplicate pte mapping to a page
   * @page:	the page to add the mapping to
43d8eac44   Randy Dunlap   mm: rmap kernel-d...
663
664
   * @vma:	the vm area being duplicated
   * @address:	the user virtual address mapped
c97a9e10e   Nick Piggin   mm: more rmap che...
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
   *
   * For copy_page_range only: minimal extract from page_add_file_rmap /
   * page_add_anon_rmap, avoiding unnecessary tests (already checked) so it's
   * quicker.
   *
   * The caller needs to hold the pte lock.
   */
  void page_dup_rmap(struct page *page, struct vm_area_struct *vma, unsigned long address)
  {
  	/* Duplicating a mapping only makes sense for a mapped page. */
  	BUG_ON(!page_mapcount(page));
  
  	if (PageAnon(page))
  		__page_check_anon_rmap(page, vma, address);
  
  	atomic_inc(&page->_mapcount);
  }
  #endif
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
680
681
682
  /**
   * page_remove_rmap - take down pte mapping from a page
   * @page: page to remove mapping from
43d8eac44   Randy Dunlap   mm: rmap kernel-d...
683
   * @vma: the vm area in which the mapping is removed
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
684
   *
b8072f099   Hugh Dickins   [PATCH] mm: updat...
685
   * The caller needs to hold the pte lock.
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
686
   */
7de6b8057   Nick Piggin   [PATCH] mm: more ...
687
  void page_remove_rmap(struct page *page, struct vm_area_struct *vma)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
688
  {
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
689
  	if (atomic_add_negative(-1, &page->_mapcount)) {
b7ab795b7   Nick Piggin   [PATCH] mm: more ...
690
  		if (unlikely(page_mapcount(page) < 0)) {
ef2bf0dc8   Dave Jones   [PATCH] rmap: add...
691
692
  			printk (KERN_EMERG "Eeek! page_mapcount(page) went negative! (%d)
  ", page_mapcount(page));
7de6b8057   Nick Piggin   [PATCH] mm: more ...
693
694
  			printk (KERN_EMERG "  page pfn = %lx
  ", page_to_pfn(page));
ef2bf0dc8   Dave Jones   [PATCH] rmap: add...
695
696
697
698
699
700
  			printk (KERN_EMERG "  page->flags = %lx
  ", page->flags);
  			printk (KERN_EMERG "  page->count = %x
  ", page_count(page));
  			printk (KERN_EMERG "  page->mapping = %p
  ", page->mapping);
7de6b8057   Nick Piggin   [PATCH] mm: more ...
701
702
  			print_symbol (KERN_EMERG "  vma->vm_ops = %s
  ", (unsigned long)vma->vm_ops);
54cb8821d   Nick Piggin   mm: merge populat...
703
  			if (vma->vm_ops) {
54cb8821d   Nick Piggin   mm: merge populat...
704
705
706
  				print_symbol (KERN_EMERG "  vma->vm_ops->fault = %s
  ", (unsigned long)vma->vm_ops->fault);
  			}
7de6b8057   Nick Piggin   [PATCH] mm: more ...
707
708
709
  			if (vma->vm_file && vma->vm_file->f_op)
  				print_symbol (KERN_EMERG "  vma->vm_file->f_op->mmap = %s
  ", (unsigned long)vma->vm_file->f_op->mmap);
b16bc64d1   Dave Jones   [PATCH] move rmap...
710
  			BUG();
ef2bf0dc8   Dave Jones   [PATCH] rmap: add...
711
  		}
b16bc64d1   Dave Jones   [PATCH] move rmap...
712

1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
713
  		/*
16f8c5b2e   Hugh Dickins   mm: page_remove_r...
714
715
716
717
718
  		 * Now that the last pte has gone, s390 must transfer dirty
  		 * flag from storage key to struct page.  We can usually skip
  		 * this if the page is anon, so about to be freed; but perhaps
  		 * not if it's in swapcache - there might be another pte slot
  		 * containing the swap entry, but page not yet written to swap.
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
719
  		 */
a4b526b3b   Martin Schwidefsky   [S390] Optimize s...
720
721
  		if ((!PageAnon(page) || PageSwapCache(page)) &&
  		    page_test_dirty(page)) {
6c210482a   Martin Schwidefsky   [S390] split page...
722
  			page_clear_dirty(page);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
723
  			set_page_dirty(page);
6c210482a   Martin Schwidefsky   [S390] split page...
724
  		}
5b4e655e9   KAMEZAWA Hiroyuki   memcg: avoid acco...
725
726
  		if (PageAnon(page))
  			mem_cgroup_uncharge_page(page);
f3dbd3446   Christoph Lameter   [PATCH] zoned vm ...
727
  		__dec_zone_page_state(page,
16f8c5b2e   Hugh Dickins   mm: page_remove_r...
728
729
730
731
732
733
734
735
736
737
  			PageAnon(page) ? NR_ANON_PAGES : NR_FILE_MAPPED);
  		/*
  		 * It would be tidy to reset the PageAnon mapping here,
  		 * but that might overwrite a racing page_add_anon_rmap
  		 * which increments mapcount after us but sets mapping
  		 * before us: so leave the reset to free_hot_cold_page,
  		 * and remember that it's only reliable while mapped.
  		 * Leaving it set also helps swapoff to reinstate ptes
  		 * faster for those pages still in swapcache.
  		 */
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
738
739
740
741
742
743
744
  	}
  }
  
  /*
   * Subfunctions of try_to_unmap: try_to_unmap_one called
   * repeatedly from either try_to_unmap_anon or try_to_unmap_file.
   */
a48d07afd   Christoph Lameter   [PATCH] Direct Mi...
745
  static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
7352349a1   Christoph Lameter   [PATCH] page migr...
746
  				int migration)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
747
748
749
  {
  	struct mm_struct *mm = vma->vm_mm;
  	unsigned long address;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
750
751
  	pte_t *pte;
  	pte_t pteval;
c0718806c   Hugh Dickins   [PATCH] mm: rmap ...
752
  	spinlock_t *ptl;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
753
  	int ret = SWAP_AGAIN;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
754
755
756
  	address = vma_address(page, vma);
  	if (address == -EFAULT)
  		goto out;
479db0bf4   Nick Piggin   mm: dirty page tr...
757
  	pte = page_check_address(page, mm, address, &ptl, 0);
c0718806c   Hugh Dickins   [PATCH] mm: rmap ...
758
  	if (!pte)
81b4082dc   Nikita Danilov   [PATCH] mm: rmap....
759
  		goto out;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
760
761
762
763
764
765
  
  	/*
  	 * If the page is mlock()d, we cannot swap it out.
  	 * If it's recently referenced (perhaps page_referenced
  	 * skipped over this mm) then we should reactivate it.
  	 */
b291f0003   Nick Piggin   mlock: mlocked pa...
766
767
768
769
770
771
772
773
774
775
  	if (!migration) {
  		if (vma->vm_flags & VM_LOCKED) {
  			ret = SWAP_MLOCK;
  			goto out_unmap;
  		}
  		if (ptep_clear_flush_young_notify(vma, address, pte)) {
  			ret = SWAP_FAIL;
  			goto out_unmap;
  		}
    	}
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
776

1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
777
778
  	/* Nuke the page table entry. */
  	flush_cache_page(vma, address, page_to_pfn(page));
cddb8a5c1   Andrea Arcangeli   mmu-notifiers: core
779
  	pteval = ptep_clear_flush_notify(vma, address, pte);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
780
781
782
783
  
  	/* Move the dirty bit to the physical page now the pte is gone. */
  	if (pte_dirty(pteval))
  		set_page_dirty(page);
365e9c87a   Hugh Dickins   [PATCH] mm: updat...
784
785
  	/* Update high watermark before we lower rss */
  	update_hiwater_rss(mm);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
786
  	if (PageAnon(page)) {
4c21e2f24   Hugh Dickins   [PATCH] mm: split...
787
  		swp_entry_t entry = { .val = page_private(page) };
0697212a4   Christoph Lameter   [PATCH] Swapless ...
788
789
790
791
792
793
794
795
796
797
798
799
800
  
  		if (PageSwapCache(page)) {
  			/*
  			 * Store the swap location in the pte.
  			 * See handle_pte_fault() ...
  			 */
  			swap_duplicate(entry);
  			if (list_empty(&mm->mmlist)) {
  				spin_lock(&mmlist_lock);
  				if (list_empty(&mm->mmlist))
  					list_add(&mm->mmlist, &init_mm.mmlist);
  				spin_unlock(&mmlist_lock);
  			}
442c9137d   Christoph Lameter   [PATCH] More page...
801
  			dec_mm_counter(mm, anon_rss);
04e62a29b   Christoph Lameter   [PATCH] More page...
802
  #ifdef CONFIG_MIGRATION
0697212a4   Christoph Lameter   [PATCH] Swapless ...
803
804
805
806
807
808
809
810
  		} else {
  			/*
  			 * Store the pfn of the page in a special migration
  			 * pte. do_swap_page() will wait until the migration
  			 * pte is removed and then restart fault handling.
  			 */
  			BUG_ON(!migration);
  			entry = make_migration_entry(page, pte_write(pteval));
04e62a29b   Christoph Lameter   [PATCH] More page...
811
  #endif
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
812
813
814
  		}
  		set_pte_at(mm, address, pte, swp_entry_to_pte(entry));
  		BUG_ON(pte_file(*pte));
4294621f4   Hugh Dickins   [PATCH] mm: rss =...
815
  	} else
04e62a29b   Christoph Lameter   [PATCH] More page...
816
817
818
819
820
821
822
823
  #ifdef CONFIG_MIGRATION
  	if (migration) {
  		/* Establish migration entry for a file page */
  		swp_entry_t entry;
  		entry = make_migration_entry(page, pte_write(pteval));
  		set_pte_at(mm, address, pte, swp_entry_to_pte(entry));
  	} else
  #endif
4294621f4   Hugh Dickins   [PATCH] mm: rss =...
824
  		dec_mm_counter(mm, file_rss);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
825

04e62a29b   Christoph Lameter   [PATCH] More page...
826

7de6b8057   Nick Piggin   [PATCH] mm: more ...
827
  	page_remove_rmap(page, vma);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
828
829
830
  	page_cache_release(page);
  
  out_unmap:
c0718806c   Hugh Dickins   [PATCH] mm: rmap ...
831
  	pte_unmap_unlock(pte, ptl);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
  out:
  	return ret;
  }
  
  /*
   * objrmap doesn't work for nonlinear VMAs because the assumption that
   * offset-into-file correlates with offset-into-virtual-addresses does not hold.
   * Consequently, given a particular page and its ->index, we cannot locate the
   * ptes which are mapping that page without an exhaustive linear search.
   *
   * So what this code does is a mini "virtual scan" of each nonlinear VMA which
   * maps the file to which the target page belongs.  The ->vm_private_data field
   * holds the current cursor into that scan.  Successive searches will circulate
   * around the vma's virtual address space.
   *
   * So as more replacement pressure is applied to the pages in a nonlinear VMA,
   * more scanning pressure is placed against them as well.   Eventually pages
   * will become fully unmapped and are eligible for eviction.
   *
   * For very sparsely populated VMAs this is a little inefficient - chances are
   * there won't be many ptes located within the scan cluster.  In this case
   * maybe we could scan further - to the end of the pte page, perhaps.
b291f0003   Nick Piggin   mlock: mlocked pa...
854
855
856
857
858
   *
   * Mlocked pages:  check VM_LOCKED under mmap_sem held for read, if we can
   * acquire it without blocking.  If vma locked, mlock the pages in the cluster,
   * rather than unmapping them.  If we encounter the "check_page" that vmscan is
   * trying to unmap, return SWAP_MLOCK, else default SWAP_AGAIN.
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
859
860
861
   */
  /*
   * One scan cluster is the smaller of 32 pages and PMD_SIZE.  CLUSTER_MASK
   * rounds an address down to a cluster boundary (this presumes CLUSTER_SIZE
   * is a power of two - TODO confirm for every PMD_SIZE).
   */
  #define CLUSTER_SIZE	min(32*PAGE_SIZE, PMD_SIZE)
  #define CLUSTER_MASK	(~(CLUSTER_SIZE - 1))
b291f0003   Nick Piggin   mlock: mlocked pa...
862
863
  static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount,
  		struct vm_area_struct *vma, struct page *check_page)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
864
865
866
867
868
  {
  	struct mm_struct *mm = vma->vm_mm;
  	pgd_t *pgd;
  	pud_t *pud;
  	pmd_t *pmd;
c0718806c   Hugh Dickins   [PATCH] mm: rmap ...
869
  	pte_t *pte;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
870
  	pte_t pteval;
c0718806c   Hugh Dickins   [PATCH] mm: rmap ...
871
  	spinlock_t *ptl;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
872
873
874
  	struct page *page;
  	unsigned long address;
  	unsigned long end;
b291f0003   Nick Piggin   mlock: mlocked pa...
875
876
  	int ret = SWAP_AGAIN;
  	int locked_vma = 0;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
877

1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
878
879
880
881
882
883
884
885
886
  	address = (vma->vm_start + cursor) & CLUSTER_MASK;
  	end = address + CLUSTER_SIZE;
  	if (address < vma->vm_start)
  		address = vma->vm_start;
  	if (end > vma->vm_end)
  		end = vma->vm_end;
  
  	pgd = pgd_offset(mm, address);
  	if (!pgd_present(*pgd))
b291f0003   Nick Piggin   mlock: mlocked pa...
887
  		return ret;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
888
889
890
  
  	pud = pud_offset(pgd, address);
  	if (!pud_present(*pud))
b291f0003   Nick Piggin   mlock: mlocked pa...
891
  		return ret;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
892
893
894
  
  	pmd = pmd_offset(pud, address);
  	if (!pmd_present(*pmd))
b291f0003   Nick Piggin   mlock: mlocked pa...
895
896
897
898
899
900
901
902
903
904
905
906
  		return ret;
  
  	/*
  	 * MLOCK_PAGES => feature is configured.
  	 * if we can acquire the mmap_sem for read, and vma is VM_LOCKED,
  	 * keep the sem while scanning the cluster for mlocking pages.
  	 */
  	if (MLOCK_PAGES && down_read_trylock(&vma->vm_mm->mmap_sem)) {
  		locked_vma = (vma->vm_flags & VM_LOCKED);
  		if (!locked_vma)
  			up_read(&vma->vm_mm->mmap_sem); /* don't need it */
  	}
c0718806c   Hugh Dickins   [PATCH] mm: rmap ...
907
908
  
  	pte = pte_offset_map_lock(mm, pmd, address, &ptl);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
909

365e9c87a   Hugh Dickins   [PATCH] mm: updat...
910
911
  	/* Update high watermark before we lower rss */
  	update_hiwater_rss(mm);
c0718806c   Hugh Dickins   [PATCH] mm: rmap ...
912
  	for (; address < end; pte++, address += PAGE_SIZE) {
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
913
914
  		if (!pte_present(*pte))
  			continue;
6aab341e0   Linus Torvalds   mm: re-architect ...
915
916
  		page = vm_normal_page(vma, address, *pte);
  		BUG_ON(!page || PageAnon(page));
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
917

b291f0003   Nick Piggin   mlock: mlocked pa...
918
919
920
921
922
923
  		if (locked_vma) {
  			mlock_vma_page(page);   /* no-op if already mlocked */
  			if (page == check_page)
  				ret = SWAP_MLOCK;
  			continue;	/* don't unmap */
  		}
cddb8a5c1   Andrea Arcangeli   mmu-notifiers: core
924
  		if (ptep_clear_flush_young_notify(vma, address, pte))
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
925
926
927
  			continue;
  
  		/* Nuke the page table entry. */
eca351336   Ben Collins   [PATCH] Fix missi...
928
  		flush_cache_page(vma, address, pte_pfn(*pte));
cddb8a5c1   Andrea Arcangeli   mmu-notifiers: core
929
  		pteval = ptep_clear_flush_notify(vma, address, pte);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
930
931
932
933
934
935
936
937
  
  		/* If nonlinear, store the file page offset in the pte. */
  		if (page->index != linear_page_index(vma, address))
  			set_pte_at(mm, address, pte, pgoff_to_pte(page->index));
  
  		/* Move the dirty bit to the physical page now the pte is gone. */
  		if (pte_dirty(pteval))
  			set_page_dirty(page);
7de6b8057   Nick Piggin   [PATCH] mm: more ...
938
  		page_remove_rmap(page, vma);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
939
  		page_cache_release(page);
4294621f4   Hugh Dickins   [PATCH] mm: rss =...
940
  		dec_mm_counter(mm, file_rss);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
941
942
  		(*mapcount)--;
  	}
c0718806c   Hugh Dickins   [PATCH] mm: rmap ...
943
  	pte_unmap_unlock(pte - 1, ptl);
b291f0003   Nick Piggin   mlock: mlocked pa...
944
945
946
  	if (locked_vma)
  		up_read(&vma->vm_mm->mmap_sem);
  	return ret;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
947
  }
b291f0003   Nick Piggin   mlock: mlocked pa...
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
  /*
   * common handling for pages mapped in VM_LOCKED vmas
   */
  static int try_to_mlock_page(struct page *page, struct vm_area_struct *vma)
  {
  	int mlocked = 0;
  
  	if (down_read_trylock(&vma->vm_mm->mmap_sem)) {
  		if (vma->vm_flags & VM_LOCKED) {
  			mlock_vma_page(page);
  			mlocked++;	/* really mlocked the page */
  		}
  		up_read(&vma->vm_mm->mmap_sem);
  	}
  	return mlocked;
  }
  
  /**
   * try_to_unmap_anon - unmap or unlock anonymous page using the object-based
   * rmap method
   * @page: the page to unmap/unlock
   * @unlock:  request for unlock rather than unmap [unlikely]
   * @migration:  unmapping for migration - ignored if @unlock
   *
   * Find all the mappings of a page using the mapping pointer and the vma chains
   * contained in the anon_vma struct it points to.
   *
   * This function is only called from try_to_unmap/try_to_munlock for
   * anonymous pages.
   * When called from try_to_munlock(), the mmap_sem of the mm containing the vma
   * where the page was found will be held for write.  So, we won't recheck
   * vm_flags for that VMA.  That should be OK, because that vma shouldn't be
   * 'LOCKED.
   */
  static int try_to_unmap_anon(struct page *page, int unlock, int migration)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
983
984
985
  {
  	struct anon_vma *anon_vma;
  	struct vm_area_struct *vma;
b291f0003   Nick Piggin   mlock: mlocked pa...
986
  	unsigned int mlocked = 0;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
987
  	int ret = SWAP_AGAIN;
b291f0003   Nick Piggin   mlock: mlocked pa...
988
989
  	if (MLOCK_PAGES && unlikely(unlock))
  		ret = SWAP_SUCCESS;	/* default for try_to_munlock() */
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
990
991
992
993
994
  	anon_vma = page_lock_anon_vma(page);
  	if (!anon_vma)
  		return ret;
  
  	list_for_each_entry(vma, &anon_vma->head, anon_vma_node) {
b291f0003   Nick Piggin   mlock: mlocked pa...
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
  		if (MLOCK_PAGES && unlikely(unlock)) {
  			if (!((vma->vm_flags & VM_LOCKED) &&
  			      page_mapped_in_vma(page, vma)))
  				continue;  /* must visit all unlocked vmas */
  			ret = SWAP_MLOCK;  /* saw at least one mlocked vma */
  		} else {
  			ret = try_to_unmap_one(page, vma, migration);
  			if (ret == SWAP_FAIL || !page_mapped(page))
  				break;
  		}
  		if (ret == SWAP_MLOCK) {
  			mlocked = try_to_mlock_page(page, vma);
  			if (mlocked)
  				break;	/* stop if actually mlocked page */
  		}
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1010
  	}
34bbd7040   Oleg Nesterov   [PATCH] adapt pag...
1011
1012
  
  	page_unlock_anon_vma(anon_vma);
b291f0003   Nick Piggin   mlock: mlocked pa...
1013
1014
1015
1016
1017
  
  	if (mlocked)
  		ret = SWAP_MLOCK;	/* actually mlocked the page */
  	else if (ret == SWAP_MLOCK)
  		ret = SWAP_AGAIN;	/* saw VM_LOCKED vma */
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1018
1019
1020
1021
  	return ret;
  }
  
  /**
b291f0003   Nick Piggin   mlock: mlocked pa...
1022
1023
1024
1025
   * try_to_unmap_file - unmap/unlock file page using the object-based rmap method
   * @page: the page to unmap/unlock
   * @unlock:  request for unlock rather than unmap [unlikely]
   * @migration:  unmapping for migration - ignored if @unlock
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1026
1027
1028
1029
   *
   * Find all the mappings of a page using the mapping pointer and the vma chains
   * contained in the address_space struct it points to.
   *
b291f0003   Nick Piggin   mlock: mlocked pa...
1030
1031
1032
1033
1034
1035
   * This function is only called from try_to_unmap/try_to_munlock for
   * object-based pages.
   * When called from try_to_munlock(), the mmap_sem of the mm containing the vma
   * where the page was found will be held for write.  So, we won't recheck
   * vm_flags for that VMA.  That should be OK, because that vma shouldn't be
   * 'LOCKED.
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1036
   */
b291f0003   Nick Piggin   mlock: mlocked pa...
1037
  static int try_to_unmap_file(struct page *page, int unlock, int migration)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
  {
  	struct address_space *mapping = page->mapping;
  	pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
  	struct vm_area_struct *vma;
  	struct prio_tree_iter iter;
  	int ret = SWAP_AGAIN;
  	unsigned long cursor;
  	unsigned long max_nl_cursor = 0;
  	unsigned long max_nl_size = 0;
  	unsigned int mapcount;
b291f0003   Nick Piggin   mlock: mlocked pa...
1048
1049
1050
1051
  	unsigned int mlocked = 0;
  
  	if (MLOCK_PAGES && unlikely(unlock))
  		ret = SWAP_SUCCESS;	/* default for try_to_munlock() */
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1052
1053
1054
  
  	spin_lock(&mapping->i_mmap_lock);
  	vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
b291f0003   Nick Piggin   mlock: mlocked pa...
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
  		if (MLOCK_PAGES && unlikely(unlock)) {
  			if (!(vma->vm_flags & VM_LOCKED))
  				continue;	/* must visit all vmas */
  			ret = SWAP_MLOCK;
  		} else {
  			ret = try_to_unmap_one(page, vma, migration);
  			if (ret == SWAP_FAIL || !page_mapped(page))
  				goto out;
  		}
  		if (ret == SWAP_MLOCK) {
  			mlocked = try_to_mlock_page(page, vma);
  			if (mlocked)
  				break;  /* stop if actually mlocked page */
  		}
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1069
  	}
b291f0003   Nick Piggin   mlock: mlocked pa...
1070
1071
  	if (mlocked)
  		goto out;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1072
1073
1074
1075
1076
  	if (list_empty(&mapping->i_mmap_nonlinear))
  		goto out;
  
  	list_for_each_entry(vma, &mapping->i_mmap_nonlinear,
  						shared.vm_set.list) {
b291f0003   Nick Piggin   mlock: mlocked pa...
1077
1078
1079
1080
1081
1082
1083
  		if (MLOCK_PAGES && unlikely(unlock)) {
  			if (!(vma->vm_flags & VM_LOCKED))
  				continue;	/* must visit all vmas */
  			ret = SWAP_MLOCK;	/* leave mlocked == 0 */
  			goto out;		/* no need to look further */
  		}
  		if (!MLOCK_PAGES && !migration && (vma->vm_flags & VM_LOCKED))
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1084
1085
1086
1087
1088
1089
1090
1091
  			continue;
  		cursor = (unsigned long) vma->vm_private_data;
  		if (cursor > max_nl_cursor)
  			max_nl_cursor = cursor;
  		cursor = vma->vm_end - vma->vm_start;
  		if (cursor > max_nl_size)
  			max_nl_size = cursor;
  	}
b291f0003   Nick Piggin   mlock: mlocked pa...
1092
  	if (max_nl_size == 0) {	/* all nonlinears locked or reserved ? */
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
  		ret = SWAP_FAIL;
  		goto out;
  	}
  
  	/*
  	 * We don't try to search for this page in the nonlinear vmas,
  	 * and page_referenced wouldn't have found it anyway.  Instead
  	 * just walk the nonlinear vmas trying to age and unmap some.
  	 * The mapcount of the page we came in with is irrelevant,
  	 * but even so use it as a guide to how hard we should try?
  	 */
  	mapcount = page_mapcount(page);
  	if (!mapcount)
  		goto out;
  	cond_resched_lock(&mapping->i_mmap_lock);
  
  	max_nl_size = (max_nl_size + CLUSTER_SIZE - 1) & CLUSTER_MASK;
  	if (max_nl_cursor == 0)
  		max_nl_cursor = CLUSTER_SIZE;
  
  	do {
  		list_for_each_entry(vma, &mapping->i_mmap_nonlinear,
  						shared.vm_set.list) {
b291f0003   Nick Piggin   mlock: mlocked pa...
1116
1117
  			if (!MLOCK_PAGES && !migration &&
  			    (vma->vm_flags & VM_LOCKED))
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1118
1119
  				continue;
  			cursor = (unsigned long) vma->vm_private_data;
839b9685e   Hugh Dickins   [PATCH] rmap: don...
1120
  			while ( cursor < max_nl_cursor &&
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1121
  				cursor < vma->vm_end - vma->vm_start) {
b291f0003   Nick Piggin   mlock: mlocked pa...
1122
1123
1124
1125
  				ret = try_to_unmap_cluster(cursor, &mapcount,
  								vma, page);
  				if (ret == SWAP_MLOCK)
  					mlocked = 2;	/* to return below */
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
  				cursor += CLUSTER_SIZE;
  				vma->vm_private_data = (void *) cursor;
  				if ((int)mapcount <= 0)
  					goto out;
  			}
  			vma->vm_private_data = (void *) max_nl_cursor;
  		}
  		cond_resched_lock(&mapping->i_mmap_lock);
  		max_nl_cursor += CLUSTER_SIZE;
  	} while (max_nl_cursor <= max_nl_size);
  
  	/*
  	 * Don't loop forever (perhaps all the remaining pages are
  	 * in locked vmas).  Reset cursor on all unreserved nonlinear
  	 * vmas, now forgetting on which ones it had fallen behind.
  	 */
101d2be76   Hugh Dickins   [PATCH] unpaged: ...
1142
1143
  	list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list)
  		vma->vm_private_data = NULL;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1144
1145
  out:
  	spin_unlock(&mapping->i_mmap_lock);
b291f0003   Nick Piggin   mlock: mlocked pa...
1146
1147
1148
1149
  	if (mlocked)
  		ret = SWAP_MLOCK;	/* actually mlocked the page */
  	else if (ret == SWAP_MLOCK)
  		ret = SWAP_AGAIN;	/* saw VM_LOCKED vma */
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1150
1151
1152
1153
1154
1155
  	return ret;
  }
  
  /**
   * try_to_unmap - try to remove all page table mappings to a page
   * @page: the page to get unmapped
43d8eac44   Randy Dunlap   mm: rmap kernel-d...
1156
   * @migration: migration flag
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1157
1158
1159
1160
1161
1162
1163
1164
   *
   * Tries to remove all the page table entries which are mapping this
   * page, used in the pageout path.  Caller must hold the page lock.
   * Return values are:
   *
   * SWAP_SUCCESS	- we succeeded in removing all mappings
   * SWAP_AGAIN	- we missed a mapping, try again later
   * SWAP_FAIL	- the page is unswappable
b291f0003   Nick Piggin   mlock: mlocked pa...
1165
   * SWAP_MLOCK	- page is mlocked.
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1166
   */
7352349a1   Christoph Lameter   [PATCH] page migr...
1167
  int try_to_unmap(struct page *page, int migration)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1168
1169
  {
  	int ret;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1170
1171
1172
  	BUG_ON(!PageLocked(page));
  
  	if (PageAnon(page))
b291f0003   Nick Piggin   mlock: mlocked pa...
1173
  		ret = try_to_unmap_anon(page, 0, migration);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1174
  	else
b291f0003   Nick Piggin   mlock: mlocked pa...
1175
1176
  		ret = try_to_unmap_file(page, 0, migration);
  	if (ret != SWAP_MLOCK && !page_mapped(page))
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1177
1178
1179
  		ret = SWAP_SUCCESS;
  	return ret;
  }
81b4082dc   Nikita Danilov   [PATCH] mm: rmap....
1180

b291f0003   Nick Piggin   mlock: mlocked pa...
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
  #ifdef CONFIG_UNEVICTABLE_LRU
  /**
   * try_to_munlock - try to munlock a page
   * @page: the page to be munlocked
   *
   * Called from munlock code.  Checks all of the VMAs mapping the page
   * to make sure nobody else has this page mlocked. The page will be
   * returned with PG_mlocked cleared if no other vmas have it mlocked.
   * The page must be locked and must not be on the LRU.
   *
   * Return values are:
   *
   * SWAP_SUCCESS	- no vma's holding page mlocked.
   * SWAP_AGAIN	- page mapped in mlocked vma -- couldn't acquire mmap sem
   * SWAP_MLOCK	- page is now mlocked.
   */
  int try_to_munlock(struct page *page)
  {
  	VM_BUG_ON(!PageLocked(page) || PageLRU(page));

  	/* Reuse the unmap walkers in "unlock" mode (unlock = 1, migration = 0). */
  	return PageAnon(page) ? try_to_unmap_anon(page, 1, 0)
  			      : try_to_unmap_file(page, 1, 0);
  }
  #endif