/*
 * mm/rmap.c - physical to virtual reverse mappings
 *
 * Copyright 2001, Rik van Riel <riel@conectiva.com.br>
 * Released under the General Public License (GPL).
 *
 * Simple, low overhead reverse mapping scheme.
 * Please try to keep this thing as modular as possible.
 *
 * Provides methods for unmapping each kind of mapped page:
 * the anon methods track anonymous pages, and
 * the file methods track pages belonging to an inode.
 *
 * Original design by Rik van Riel <riel@conectiva.com.br> 2001
 * File methods by Dave McCracken <dmccr@us.ibm.com> 2003, 2004
 * Anonymous methods by Andrea Arcangeli <andrea@suse.de> 2004
 * Contributions by Hugh Dickins 2003, 2004
 */

/*
 * Lock ordering in mm:
 *
 * inode->i_mutex	(while writing or truncating, not reading or faulting)
 *   mm->mmap_lock
 *     page->flags PG_locked (lock_page)	* (see hugetlbfs below)
 *       hugetlbfs_i_mmap_rwsem_key (in huge_pmd_share)
 *         mapping->i_mmap_rwsem
 *           hugetlb_fault_mutex (hugetlbfs specific page fault mutex)
 *           anon_vma->rwsem
 *             mm->page_table_lock or pte_lock
 *               pgdat->lru_lock (in mark_page_accessed, isolate_lru_page)
 *               swap_lock (in swap_duplicate, swap_info_get)
 *                 mmlist_lock (in mmput, drain_mmlist and others)
 *                 mapping->private_lock (in __set_page_dirty_buffers)
 *                   mem_cgroup_{begin,end}_page_stat (memcg->move_lock)
 *                     i_pages lock (widely used)
 *                 inode->i_lock (in set_page_dirty's __mark_inode_dirty)
 *                 bdi.wb->list_lock (in set_page_dirty's __mark_inode_dirty)
 *                   sb_lock (within inode_lock in fs/fs-writeback.c)
 *                   i_pages lock (widely used, in set_page_dirty,
 *                             in arch-dependent flush_dcache_mmap_lock,
 *                             within bdi.wb->list_lock in __sync_single_inode)
 *
 * anon_vma->rwsem,mapping->i_mutex      (memory_failure, collect_procs_anon)
 *   ->tasklist_lock
 *     pte map lock
 *
 * * hugetlbfs PageHuge() pages take locks in this order:
 *         mapping->i_mmap_rwsem
 *           hugetlb_fault_mutex (hugetlbfs specific page fault mutex)
 *             page->flags PG_locked (lock_page)
 */

#include <linux/mm.h>
#include <linux/sched/mm.h>
#include <linux/sched/task.h>
#include <linux/pagemap.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/ksm.h>
#include <linux/rmap.h>
#include <linux/rcupdate.h>
#include <linux/export.h>
#include <linux/memcontrol.h>
#include <linux/mmu_notifier.h>
#include <linux/migrate.h>
#include <linux/hugetlb.h>
#include <linux/huge_mm.h>
#include <linux/backing-dev.h>
#include <linux/page_idle.h>
#include <linux/memremap.h>
#include <linux/userfaultfd_k.h>

#include <asm/tlbflush.h>
#include <trace/events/tlb.h>

#include "internal.h"
static struct kmem_cache *anon_vma_cachep;
static struct kmem_cache *anon_vma_chain_cachep;

static inline struct anon_vma *anon_vma_alloc(void)
{
	struct anon_vma *anon_vma;

	anon_vma = kmem_cache_alloc(anon_vma_cachep, GFP_KERNEL);
	if (anon_vma) {
		atomic_set(&anon_vma->refcount, 1);
		anon_vma->degree = 1;	/* Reference for first vma */
		anon_vma->parent = anon_vma;
		/*
		 * Initialise the anon_vma root to point to itself. If called
		 * from fork, the root will be reset to the parent's anon_vma.
		 */
		anon_vma->root = anon_vma;
	}

	return anon_vma;
}

static inline void anon_vma_free(struct anon_vma *anon_vma)
{
	VM_BUG_ON(atomic_read(&anon_vma->refcount));

	/*
	 * Synchronize against page_lock_anon_vma_read() such that
	 * we can safely hold the lock without the anon_vma getting
	 * freed.
	 *
	 * Relies on the full mb implied by the atomic_dec_and_test() from
	 * put_anon_vma() against the acquire barrier implied by
	 * down_read_trylock() from page_lock_anon_vma_read(). This orders:
	 *
	 * page_lock_anon_vma_read()	VS	put_anon_vma()
	 *   down_read_trylock()		  atomic_dec_and_test()
	 *   LOCK				  MB
	 *   atomic_read()			  rwsem_is_locked()
	 *
	 * LOCK should suffice since the actual taking of the lock must
	 * happen _before_ what follows.
	 */
	might_sleep();
	if (rwsem_is_locked(&anon_vma->root->rwsem)) {
		anon_vma_lock_write(anon_vma);
		anon_vma_unlock_write(anon_vma);
	}
	kmem_cache_free(anon_vma_cachep, anon_vma);
}

static inline struct anon_vma_chain *anon_vma_chain_alloc(gfp_t gfp)
{
	return kmem_cache_alloc(anon_vma_chain_cachep, gfp);
}

static void anon_vma_chain_free(struct anon_vma_chain *anon_vma_chain)
{
	kmem_cache_free(anon_vma_chain_cachep, anon_vma_chain);
}

static void anon_vma_chain_link(struct vm_area_struct *vma,
				struct anon_vma_chain *avc,
				struct anon_vma *anon_vma)
{
	avc->vma = vma;
	avc->anon_vma = anon_vma;
	list_add(&avc->same_vma, &vma->anon_vma_chain);
	anon_vma_interval_tree_insert(avc, &anon_vma->rb_root);
}
/**
 * __anon_vma_prepare - attach an anon_vma to a memory region
 * @vma: the memory region in question
 *
 * This makes sure the memory mapping described by 'vma' has
 * an 'anon_vma' attached to it, so that we can associate the
 * anonymous pages mapped into it with that anon_vma.
 *
 * The common case will be that we already have one, which
 * is handled inline by anon_vma_prepare(). But if
 * not we either need to find an adjacent mapping that we
 * can re-use the anon_vma from (very common when the only
 * reason for splitting a vma has been mprotect()), or we
 * allocate a new one.
 *
 * Anon-vma allocations are very subtle, because we may have
 * optimistically looked up an anon_vma in page_lock_anon_vma_read()
 * and that may actually touch the spinlock even in the newly
 * allocated vma (it depends on RCU to make sure that the
 * anon_vma isn't actually destroyed).
 *
 * As a result, we need to do proper anon_vma locking even
 * for the new allocation. At the same time, we do not want
 * to do any locking for the common case of already having
 * an anon_vma.
 *
 * This must be called with the mmap_lock held for reading.
 */
int __anon_vma_prepare(struct vm_area_struct *vma)
{
	struct mm_struct *mm = vma->vm_mm;
	struct anon_vma *anon_vma, *allocated;
	struct anon_vma_chain *avc;

	might_sleep();

	avc = anon_vma_chain_alloc(GFP_KERNEL);
	if (!avc)
		goto out_enomem;

	anon_vma = find_mergeable_anon_vma(vma);
	allocated = NULL;
	if (!anon_vma) {
		anon_vma = anon_vma_alloc();
		if (unlikely(!anon_vma))
			goto out_enomem_free_avc;
		allocated = anon_vma;
	}

	anon_vma_lock_write(anon_vma);
	/* page_table_lock to protect against threads */
	spin_lock(&mm->page_table_lock);
	if (likely(!vma->anon_vma)) {
		vma->anon_vma = anon_vma;
		anon_vma_chain_link(vma, avc, anon_vma);
		/* vma reference or self-parent link for new root */
		anon_vma->degree++;
		allocated = NULL;
		avc = NULL;
	}
	spin_unlock(&mm->page_table_lock);
	anon_vma_unlock_write(anon_vma);

	if (unlikely(allocated))
		put_anon_vma(allocated);
	if (unlikely(avc))
		anon_vma_chain_free(avc);

	return 0;

 out_enomem_free_avc:
	anon_vma_chain_free(avc);
 out_enomem:
	return -ENOMEM;
}
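
/*
 * Editor's note -- an illustrative sketch, not part of the original file:
 * callers normally use the inline wrapper anon_vma_prepare() (which only
 * falls back to __anon_vma_prepare() when vma->anon_vma is still NULL)
 * before installing a new anonymous page, roughly:
 *
 *	if (unlikely(anon_vma_prepare(vma)))
 *		return VM_FAULT_OOM;
 *	page = alloc_zeroed_user_highpage_movable(vma, address);
 *	...
 *	page_add_new_anon_rmap(page, vma, address, false);
 *
 * The exact caller code varies; this only shows where __anon_vma_prepare()
 * sits in a fault path.
 */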
/*
 * This is a useful helper function for locking the anon_vma root as
 * we traverse the vma->anon_vma_chain, looping over anon_vma's that
 * have the same vma.
 *
 * Such anon_vma's should have the same root, so you'd expect to see
 * just a single mutex_lock for the whole traversal.
 */
static inline struct anon_vma *lock_anon_vma_root(struct anon_vma *root, struct anon_vma *anon_vma)
{
	struct anon_vma *new_root = anon_vma->root;
	if (new_root != root) {
		if (WARN_ON_ONCE(root))
			up_write(&root->rwsem);
		root = new_root;
		down_write(&root->rwsem);
	}
	return root;
}

static inline void unlock_anon_vma_root(struct anon_vma *root)
{
	if (root)
		up_write(&root->rwsem);
}

/*
 * Attach the anon_vmas from src to dst.
 * Returns 0 on success, -ENOMEM on failure.
 *
 * anon_vma_clone() is called by __vma_split(), __split_vma(), copy_vma() and
 * anon_vma_fork(). The first three want an exact copy of src, while the last
 * one, anon_vma_fork(), may try to reuse an existing anon_vma to prevent
 * endless growth of anon_vma. Since dst->anon_vma is set to NULL before call,
 * we can identify this case by checking (!dst->anon_vma && src->anon_vma).
 *
 * If (!dst->anon_vma && src->anon_vma) is true, this function tries to find
 * and reuse existing anon_vma which has no vmas and only one child anon_vma.
 * This prevents degradation of anon_vma hierarchy to endless linear chain in
 * case of constantly forking task. On the other hand, an anon_vma with more
 * than one child isn't reused even if there was no alive vma, thus rmap
 * walker has a good chance of avoiding scanning the whole hierarchy when it
 * searches where page is mapped.
 */
int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src)
{
	struct anon_vma_chain *avc, *pavc;
	struct anon_vma *root = NULL;

	list_for_each_entry_reverse(pavc, &src->anon_vma_chain, same_vma) {
		struct anon_vma *anon_vma;

		avc = anon_vma_chain_alloc(GFP_NOWAIT | __GFP_NOWARN);
		if (unlikely(!avc)) {
			unlock_anon_vma_root(root);
			root = NULL;
			avc = anon_vma_chain_alloc(GFP_KERNEL);
			if (!avc)
				goto enomem_failure;
		}
		anon_vma = pavc->anon_vma;
		root = lock_anon_vma_root(root, anon_vma);
		anon_vma_chain_link(dst, avc, anon_vma);

		/*
		 * Reuse existing anon_vma if its degree is lower than two,
		 * which means it has no vma and only one anon_vma child.
		 *
		 * Do not choose the parent anon_vma, otherwise the first child
		 * will always reuse it. The root anon_vma is never reused:
		 * it has a self-parent reference and at least one child.
		 */
		if (!dst->anon_vma && src->anon_vma &&
		    anon_vma != src->anon_vma && anon_vma->degree < 2)
			dst->anon_vma = anon_vma;
	}
	if (dst->anon_vma)
		dst->anon_vma->degree++;
	unlock_anon_vma_root(root);
	return 0;

 enomem_failure:
	/*
	 * dst->anon_vma is dropped here otherwise its degree can be incorrectly
	 * decremented in unlink_anon_vmas().
	 * We can safely do this because callers of anon_vma_clone() don't care
	 * about dst->anon_vma if anon_vma_clone() failed.
	 */
	dst->anon_vma = NULL;
	unlink_anon_vmas(dst);
	return -ENOMEM;
}
/*
 * Attach vma to its own anon_vma, as well as to the anon_vmas that
 * the corresponding VMA in the parent process is attached to.
 * Returns 0 on success, non-zero on failure.
 */
int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma)
{
	struct anon_vma_chain *avc;
	struct anon_vma *anon_vma;
	int error;

	/* Don't bother if the parent process has no anon_vma here. */
	if (!pvma->anon_vma)
		return 0;

	/* Drop inherited anon_vma, we'll reuse existing or allocate new. */
	vma->anon_vma = NULL;

	/*
	 * First, attach the new VMA to the parent VMA's anon_vmas,
	 * so rmap can find non-COWed pages in child processes.
	 */
	error = anon_vma_clone(vma, pvma);
	if (error)
		return error;

	/* An existing anon_vma has been reused, all done then. */
	if (vma->anon_vma)
		return 0;

	/* Then add our own anon_vma. */
	anon_vma = anon_vma_alloc();
	if (!anon_vma)
		goto out_error;
	avc = anon_vma_chain_alloc(GFP_KERNEL);
	if (!avc)
		goto out_error_free_anon_vma;

	/*
	 * The root anon_vma's spinlock is the lock actually used when we
	 * lock any of the anon_vmas in this anon_vma tree.
	 */
	anon_vma->root = pvma->anon_vma->root;
	anon_vma->parent = pvma->anon_vma;
	/*
	 * With refcounts, an anon_vma can stay around longer than the
	 * process it belongs to. The root anon_vma needs to be pinned until
	 * this anon_vma is freed, because the lock lives in the root.
	 */
	get_anon_vma(anon_vma->root);
	/* Mark this anon_vma as the one where our new (COWed) pages go. */
	vma->anon_vma = anon_vma;
	anon_vma_lock_write(anon_vma);
	anon_vma_chain_link(vma, avc, anon_vma);
	anon_vma->parent->degree++;
	anon_vma_unlock_write(anon_vma);

	return 0;

 out_error_free_anon_vma:
	put_anon_vma(anon_vma);
 out_error:
	unlink_anon_vmas(vma);
	return -ENOMEM;
}
void unlink_anon_vmas(struct vm_area_struct *vma)
{
	struct anon_vma_chain *avc, *next;
	struct anon_vma *root = NULL;

	/*
	 * Unlink each anon_vma chained to the VMA.  This list is ordered
	 * from newest to oldest, ensuring the root anon_vma gets freed last.
	 */
	list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) {
		struct anon_vma *anon_vma = avc->anon_vma;

		root = lock_anon_vma_root(root, anon_vma);
		anon_vma_interval_tree_remove(avc, &anon_vma->rb_root);

		/*
		 * Leave empty anon_vmas on the list - we'll need
		 * to free them outside the lock.
		 */
		if (RB_EMPTY_ROOT(&anon_vma->rb_root.rb_root)) {
			anon_vma->parent->degree--;
			continue;
		}

		list_del(&avc->same_vma);
		anon_vma_chain_free(avc);
	}
	if (vma->anon_vma)
		vma->anon_vma->degree--;
	unlock_anon_vma_root(root);

	/*
	 * Iterate the list once more, it now only contains empty and unlinked
	 * anon_vmas, destroy them. Could not do before due to __put_anon_vma()
	 * needing to write-acquire the anon_vma->root->rwsem.
	 */
	list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) {
		struct anon_vma *anon_vma = avc->anon_vma;

		VM_WARN_ON(anon_vma->degree);
		put_anon_vma(anon_vma);

		list_del(&avc->same_vma);
		anon_vma_chain_free(avc);
	}
}

static void anon_vma_ctor(void *data)
{
	struct anon_vma *anon_vma = data;

	init_rwsem(&anon_vma->rwsem);
	atomic_set(&anon_vma->refcount, 0);
	anon_vma->rb_root = RB_ROOT_CACHED;
}

void __init anon_vma_init(void)
{
	anon_vma_cachep = kmem_cache_create("anon_vma", sizeof(struct anon_vma),
			0, SLAB_TYPESAFE_BY_RCU|SLAB_PANIC|SLAB_ACCOUNT,
			anon_vma_ctor);
	anon_vma_chain_cachep = KMEM_CACHE(anon_vma_chain,
			SLAB_PANIC|SLAB_ACCOUNT);
}

/*
 * Getting a lock on a stable anon_vma from a page off the LRU is tricky!
 *
 * Since there is no serialization whatsoever against page_remove_rmap()
 * the best this function can do is return a locked anon_vma that might
 * have been relevant to this page.
 *
 * The page might have been remapped to a different anon_vma or the anon_vma
 * returned may already be freed (and even reused).
 *
 * In case it was remapped to a different anon_vma, the new anon_vma will be a
 * child of the old anon_vma, and the anon_vma lifetime rules will therefore
 * ensure that any anon_vma obtained from the page will still be valid for as
 * long as we observe page_mapped() [ hence all those page_mapped() tests ].
 *
 * All users of this function must be very careful when walking the anon_vma
 * chain and verify that the page in question is indeed mapped in it
 * [ something equivalent to page_mapped_in_vma() ].
 *
 * Since anon_vma's slab is SLAB_TYPESAFE_BY_RCU and we know from
 * page_remove_rmap() that the anon_vma pointer from page->mapping is valid
 * if there is a mapcount, we can dereference the anon_vma after observing
 * those.
 */
struct anon_vma *page_get_anon_vma(struct page *page)
{
	struct anon_vma *anon_vma = NULL;
	unsigned long anon_mapping;

	rcu_read_lock();
	anon_mapping = (unsigned long)READ_ONCE(page->mapping);
	if ((anon_mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON)
		goto out;
	if (!page_mapped(page))
		goto out;

	anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON);
	if (!atomic_inc_not_zero(&anon_vma->refcount)) {
		anon_vma = NULL;
		goto out;
	}

	/*
	 * If this page is still mapped, then its anon_vma cannot have been
	 * freed.  But if it has been unmapped, we have no security against the
	 * anon_vma structure being freed and reused (for another anon_vma:
	 * SLAB_TYPESAFE_BY_RCU guarantees that - so the atomic_inc_not_zero()
	 * above cannot corrupt).
	 */
	if (!page_mapped(page)) {
		rcu_read_unlock();
		put_anon_vma(anon_vma);
		return NULL;
	}
out:
	rcu_read_unlock();

	return anon_vma;
}
/*
 * Similar to page_get_anon_vma() except it locks the anon_vma.
 *
 * It's a little more complex as it tries to keep the fast path to a single
 * atomic op -- the trylock. If we fail the trylock, we fall back to getting a
 * reference like with page_get_anon_vma() and then block on the mutex.
 */
struct anon_vma *page_lock_anon_vma_read(struct page *page)
{
	struct anon_vma *anon_vma = NULL;
	struct anon_vma *root_anon_vma;
	unsigned long anon_mapping;

	rcu_read_lock();
	anon_mapping = (unsigned long)READ_ONCE(page->mapping);
	if ((anon_mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON)
		goto out;
	if (!page_mapped(page))
		goto out;

	anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON);
	root_anon_vma = READ_ONCE(anon_vma->root);
	if (down_read_trylock(&root_anon_vma->rwsem)) {
		/*
		 * If the page is still mapped, then this anon_vma is still
		 * its anon_vma, and holding the mutex ensures that it will
		 * not go away, see anon_vma_free().
		 */
		if (!page_mapped(page)) {
			up_read(&root_anon_vma->rwsem);
			anon_vma = NULL;
		}
		goto out;
	}

	/* trylock failed, we've got to sleep */
	if (!atomic_inc_not_zero(&anon_vma->refcount)) {
		anon_vma = NULL;
		goto out;
	}

	if (!page_mapped(page)) {
		rcu_read_unlock();
		put_anon_vma(anon_vma);
		return NULL;
	}

	/* we pinned the anon_vma, it's safe to sleep */
	rcu_read_unlock();
	anon_vma_lock_read(anon_vma);

	if (atomic_dec_and_test(&anon_vma->refcount)) {
		/*
		 * Oops, we held the last refcount, release the lock
		 * and bail -- can't simply use put_anon_vma() because
		 * we'll deadlock on the anon_vma_lock_write() recursion.
		 */
		anon_vma_unlock_read(anon_vma);
		__put_anon_vma(anon_vma);
		anon_vma = NULL;
	}

	return anon_vma;

out:
	rcu_read_unlock();
	return anon_vma;
}

void page_unlock_anon_vma_read(struct anon_vma *anon_vma)
{
	anon_vma_unlock_read(anon_vma);
}
#ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
/*
 * Flush TLB entries for recently unmapped pages from remote CPUs. It is
 * important if a PTE was dirty when it was unmapped that it's flushed
 * before any IO is initiated on the page to prevent lost writes. Similarly,
 * it must be flushed before freeing to prevent data leakage.
 */
void try_to_unmap_flush(void)
{
	struct tlbflush_unmap_batch *tlb_ubc = &current->tlb_ubc;

	if (!tlb_ubc->flush_required)
		return;

	arch_tlbbatch_flush(&tlb_ubc->arch);
	tlb_ubc->flush_required = false;
	tlb_ubc->writable = false;
}

/* Flush iff there are potentially writable TLB entries that can race with IO */
void try_to_unmap_flush_dirty(void)
{
	struct tlbflush_unmap_batch *tlb_ubc = &current->tlb_ubc;

	if (tlb_ubc->writable)
		try_to_unmap_flush();
}

static void set_tlb_ubc_flush_pending(struct mm_struct *mm, bool writable)
{
	struct tlbflush_unmap_batch *tlb_ubc = &current->tlb_ubc;

	arch_tlbbatch_add_mm(&tlb_ubc->arch, mm);
	tlb_ubc->flush_required = true;

	/*
	 * Ensure compiler does not re-order the setting of tlb_flush_batched
	 * before the PTE is cleared.
	 */
	barrier();
	mm->tlb_flush_batched = true;

	/*
	 * If the PTE was dirty then it's best to assume it's writable. The
	 * caller must use try_to_unmap_flush_dirty() or try_to_unmap_flush()
	 * before the page is queued for IO.
	 */
	if (writable)
		tlb_ubc->writable = true;
}

/*
 * Returns true if the TLB flush should be deferred to the end of a batch of
 * unmap operations to reduce IPIs.
 */
static bool should_defer_flush(struct mm_struct *mm, enum ttu_flags flags)
{
	bool should_defer = false;

	if (!(flags & TTU_BATCH_FLUSH))
		return false;

	/* If remote CPUs need to be flushed then defer and batch the flush */
	if (cpumask_any_but(mm_cpumask(mm), get_cpu()) < nr_cpu_ids)
		should_defer = true;
	put_cpu();

	return should_defer;
}

/*
 * Reclaim unmaps pages under the PTL but does not flush the TLB prior to
 * releasing the PTL if TLB flushes are batched. It's possible for a parallel
 * operation such as mprotect or munmap to race between reclaim unmapping
 * the page and flushing the page. If this race occurs, it potentially allows
 * access to data via a stale TLB entry. Tracking all mm's that have TLB
 * batching in flight would be expensive during reclaim so instead track
 * whether TLB batching occurred in the past and if so then do a flush here
 * if required. This will cost one additional flush per reclaim cycle paid
 * by the first operation at risk such as mprotect and munmap.
 *
 * This must be called under the PTL so that an access to tlb_flush_batched
 * that is potentially a "reclaim vs mprotect/munmap/etc" race will synchronise
 * via the PTL.
 */
void flush_tlb_batched_pending(struct mm_struct *mm)
{
	if (data_race(mm->tlb_flush_batched)) {
		flush_tlb_mm(mm);

		/*
		 * Do not allow the compiler to re-order the clearing of
		 * tlb_flush_batched before the tlb is flushed.
		 */
		barrier();
		mm->tlb_flush_batched = false;
	}
}
#else
static void set_tlb_ubc_flush_pending(struct mm_struct *mm, bool writable)
{
}

static bool should_defer_flush(struct mm_struct *mm, enum ttu_flags flags)
{
	return false;
}
#endif /* CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH */
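
/*
 * Editor's note -- an illustrative sketch, not part of the original file:
 * the intended ordering when the arch supports batched flushes is that the
 * unmap side defers the IPI while the reclaim side flushes before I/O:
 *
 *	pteval = ptep_get_and_clear(mm, address, pte);
 *	set_tlb_ubc_flush_pending(mm, pte_dirty(pteval));
 *	...
 *	try_to_unmap_flush_dirty();	(before starting writeback)
 *	try_to_unmap_flush();		(before freeing the pages)
 *
 * and any racing mprotect()/munmap() calls flush_tlb_batched_pending() under
 * the PTL. This only summarizes the contract described in the comments above,
 * roughly mirroring how try_to_unmap_one() and the reclaim path use these
 * helpers.
 */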
/*
 * At what user virtual address is page expected in vma?
 * Caller should check the page is actually part of the vma.
 */
unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma)
{
	unsigned long address;
	if (PageAnon(page)) {
		struct anon_vma *page__anon_vma = page_anon_vma(page);
		/*
		 * Note: swapoff's unuse_vma() is more efficient with this
		 * check, and needs it to match anon_vma when KSM is active.
		 */
		if (!vma->anon_vma || !page__anon_vma ||
		    vma->anon_vma->root != page__anon_vma->root)
			return -EFAULT;
	} else if (page->mapping) {
		if (!vma->vm_file || vma->vm_file->f_mapping != page->mapping)
			return -EFAULT;
	} else
		return -EFAULT;
	address = __vma_address(page, vma);
	if (unlikely(address < vma->vm_start || address >= vma->vm_end))
		return -EFAULT;
	return address;
}
pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address)
{
	pgd_t *pgd;
	p4d_t *p4d;
	pud_t *pud;
	pmd_t *pmd = NULL;
	pmd_t pmde;

	pgd = pgd_offset(mm, address);
	if (!pgd_present(*pgd))
		goto out;

	p4d = p4d_offset(pgd, address);
	if (!p4d_present(*p4d))
		goto out;

	pud = pud_offset(p4d, address);
	if (!pud_present(*pud))
		goto out;

	pmd = pmd_offset(pud, address);
	/*
	 * Some THP functions use the sequence pmdp_huge_clear_flush(), set_pmd_at()
	 * without holding anon_vma lock for write.  So when looking for a
	 * genuine pmde (in which to find pte), test present and !THP together.
	 */
	pmde = *pmd;
	barrier();
	if (!pmd_present(pmde) || pmd_trans_huge(pmde))
		pmd = NULL;
out:
	return pmd;
}
struct page_referenced_arg {
	int mapcount;
	int referenced;
	unsigned long vm_flags;
	struct mem_cgroup *memcg;
};
/*
 * arg: page_referenced_arg will be passed to page_referenced_one()
 */
static bool page_referenced_one(struct page *page, struct vm_area_struct *vma,
			unsigned long address, void *arg)
{
	struct page_referenced_arg *pra = arg;
	struct page_vma_mapped_walk pvmw = {
		.page = page,
		.vma = vma,
		.address = address,
	};
	int referenced = 0;

	while (page_vma_mapped_walk(&pvmw)) {
		address = pvmw.address;

		if (vma->vm_flags & VM_LOCKED) {
			page_vma_mapped_walk_done(&pvmw);
			pra->vm_flags |= VM_LOCKED;
			return false; /* To break the loop */
		}

		if (pvmw.pte) {
			if (ptep_clear_flush_young_notify(vma, address,
						pvmw.pte)) {
				/*
				 * Don't treat a reference through
				 * a sequentially read mapping as such.
				 * If the page has been used in another mapping,
				 * we will catch it; if this other mapping is
				 * already gone, the unmap path will have set
				 * PG_referenced or activated the page.
				 */
				if (likely(!(vma->vm_flags & VM_SEQ_READ)))
					referenced++;
			}
		} else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) {
			if (pmdp_clear_flush_young_notify(vma, address,
						pvmw.pmd))
				referenced++;
		} else {
			/* unexpected pmd-mapped page? */
			WARN_ON_ONCE(1);
		}

		pra->mapcount--;
	}

	if (referenced)
		clear_page_idle(page);
	if (test_and_clear_page_young(page))
		referenced++;

	if (referenced) {
		pra->referenced++;
		pra->vm_flags |= vma->vm_flags;
	}

	if (!pra->mapcount)
		return false; /* To break the loop */

	return true;
}

static bool invalid_page_referenced_vma(struct vm_area_struct *vma, void *arg)
{
	struct page_referenced_arg *pra = arg;
	struct mem_cgroup *memcg = pra->memcg;

	if (!mm_match_cgroup(vma->vm_mm, memcg))
		return true;

	return false;
}

/**
 * page_referenced - test if the page was referenced
 * @page: the page to test
 * @is_locked: caller holds lock on the page
 * @memcg: target memory cgroup
 * @vm_flags: collect encountered vma->vm_flags that actually referenced the page
 *
 * Quick test_and_clear_referenced for all mappings to a page,
 * returns the number of ptes which referenced the page.
 */
int page_referenced(struct page *page,
		    int is_locked,
		    struct mem_cgroup *memcg,
		    unsigned long *vm_flags)
{
	int we_locked = 0;
	struct page_referenced_arg pra = {
		.mapcount = total_mapcount(page),
		.memcg = memcg,
	};
	struct rmap_walk_control rwc = {
		.rmap_one = page_referenced_one,
		.arg = (void *)&pra,
		.anon_lock = page_lock_anon_vma_read,
	};

	*vm_flags = 0;
	if (!pra.mapcount)
		return 0;

	if (!page_rmapping(page))
		return 0;

	if (!is_locked && (!PageAnon(page) || PageKsm(page))) {
		we_locked = trylock_page(page);
		if (!we_locked)
			return 1;
	}

	/*
	 * If we are reclaiming on behalf of a cgroup, skip
	 * counting on behalf of references from different
	 * cgroups
	 */
	if (memcg) {
		rwc.invalid_vma = invalid_page_referenced_vma;
	}

	rmap_walk(page, &rwc);
	*vm_flags = pra.vm_flags;

	if (we_locked)
		unlock_page(page);

	return pra.referenced;
}
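
/*
 * Editor's note -- an illustrative sketch, not part of the original file:
 * a reclaim-style caller typically combines the return value with the
 * collected vm_flags, along the lines of:
 *
 *	unsigned long vm_flags;
 *	int referenced = page_referenced(page, 1, memcg, &vm_flags);
 *
 *	if (vm_flags & VM_LOCKED)
 *		keep the page off the LRU (it is mlocked somewhere);
 *	else if (referenced)
 *		give the page another round before reclaiming it;
 *
 * The real policy lives in the page_check_references() logic in mm/vmscan.c;
 * the above only illustrates the calling convention.
 */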
static bool page_mkclean_one(struct page *page, struct vm_area_struct *vma,
			    unsigned long address, void *arg)
{
	struct page_vma_mapped_walk pvmw = {
		.page = page,
		.vma = vma,
		.address = address,
		.flags = PVMW_SYNC,
	};
	struct mmu_notifier_range range;
	int *cleaned = arg;

	/*
	 * We have to assume the worst case, i.e. pmd, for invalidation. Note
	 * that the page cannot be freed from this function.
	 */
	mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_PAGE, 0, vma,
				vma->vm_mm, address,
				min(vma->vm_end, address + page_size(page)));
	mmu_notifier_invalidate_range_start(&range);

	while (page_vma_mapped_walk(&pvmw)) {
		int ret = 0;

		address = pvmw.address;
		if (pvmw.pte) {
			pte_t entry;
			pte_t *pte = pvmw.pte;

			if (!pte_dirty(*pte) && !pte_write(*pte))
				continue;

			flush_cache_page(vma, address, pte_pfn(*pte));
			entry = ptep_clear_flush(vma, address, pte);
			entry = pte_wrprotect(entry);
			entry = pte_mkclean(entry);
			set_pte_at(vma->vm_mm, address, pte, entry);
			ret = 1;
		} else {
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
			pmd_t *pmd = pvmw.pmd;
			pmd_t entry;

			if (!pmd_dirty(*pmd) && !pmd_write(*pmd))
				continue;

			flush_cache_page(vma, address, page_to_pfn(page));
			entry = pmdp_invalidate(vma, address, pmd);
			entry = pmd_wrprotect(entry);
			entry = pmd_mkclean(entry);
			set_pmd_at(vma->vm_mm, address, pmd, entry);
			ret = 1;
#else
			/* unexpected pmd-mapped page? */
			WARN_ON_ONCE(1);
#endif
		}

		/*
		 * No need to call mmu_notifier_invalidate_range() as we are
		 * downgrading page table protection not changing it to point
		 * to a new page.
		 *
		 * See Documentation/vm/mmu_notifier.rst
		 */
		if (ret)
			(*cleaned)++;
	}

	mmu_notifier_invalidate_range_end(&range);

	return true;
}

static bool invalid_mkclean_vma(struct vm_area_struct *vma, void *arg)
{
	if (vma->vm_flags & VM_SHARED)
		return false;

	return true;
}

int page_mkclean(struct page *page)
{
	int cleaned = 0;
	struct address_space *mapping;
	struct rmap_walk_control rwc = {
		.arg = (void *)&cleaned,
		.rmap_one = page_mkclean_one,
		.invalid_vma = invalid_mkclean_vma,
	};

	BUG_ON(!PageLocked(page));

	if (!page_mapped(page))
		return 0;

	mapping = page_mapping(page);
	if (!mapping)
		return 0;

	rmap_walk(page, &rwc);

	return cleaned;
}
EXPORT_SYMBOL_GPL(page_mkclean);

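/*
 * Editor's note -- an illustrative sketch, not part of the original file:
 * page_mkclean() is what write-protects all ptes of a shared file page so
 * that dirtying can be caught again. A writeback-style caller would do,
 * roughly:
 *
 *	lock_page(page);
 *	if (page_mkclean(page))
 *		set_page_dirty(page);	(transfer pte dirty bits to the page)
 *	...
 *	unlock_page(page);
 *
 * which is the pattern used by clear_page_dirty_for_io(); the snippet is only
 * a simplified summary of that contract.
 */
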
/**
 * page_move_anon_rmap - move a page to our anon_vma
 * @page:	the page to move to our anon_vma
 * @vma:	the vma the page belongs to
 *
 * When a page belongs exclusively to one process after a COW event,
 * that page can be moved into the anon_vma that belongs to just that
 * process, so the rmap code will not search the parent or sibling
 * processes.
 */
void page_move_anon_rmap(struct page *page, struct vm_area_struct *vma)
{
	struct anon_vma *anon_vma = vma->anon_vma;

	page = compound_head(page);

	VM_BUG_ON_PAGE(!PageLocked(page), page);
	VM_BUG_ON_VMA(!anon_vma, vma);

	anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
	/*
	 * Ensure that anon_vma and the PAGE_MAPPING_ANON bit are written
	 * simultaneously, so a concurrent reader (eg page_referenced()'s
	 * PageAnon()) will not see one without the other.
	 */
	WRITE_ONCE(page->mapping, (struct address_space *) anon_vma);
}

/**
 * __page_set_anon_rmap - set up new anonymous rmap
 * @page:	Page or Hugepage to add to rmap
 * @vma:	VM area to add page to.
 * @address:	User virtual address of the mapping
 * @exclusive:	the page is exclusively owned by the current process
 */
static void __page_set_anon_rmap(struct page *page,
	struct vm_area_struct *vma, unsigned long address, int exclusive)
{
	struct anon_vma *anon_vma = vma->anon_vma;

	BUG_ON(!anon_vma);

	if (PageAnon(page))
		return;

	/*
	 * If the page isn't exclusively mapped into this vma,
	 * we must use the _oldest_ possible anon_vma for the
	 * page mapping!
	 */
	if (!exclusive)
		anon_vma = anon_vma->root;

	anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
	page->mapping = (struct address_space *) anon_vma;
	page->index = linear_page_index(vma, address);
}

/**
 * __page_check_anon_rmap - sanity check anonymous rmap addition
 * @page:	the page to add the mapping to
 * @vma:	the vm area in which the mapping is added
 * @address:	the user virtual address mapped
 */
static void __page_check_anon_rmap(struct page *page,
	struct vm_area_struct *vma, unsigned long address)
{
	/*
	 * The page's anon-rmap details (mapping and index) are guaranteed to
	 * be set up correctly at this point.
	 *
	 * We have exclusion against page_add_anon_rmap because the caller
	 * always holds the page locked, except if called from page_dup_rmap,
	 * in which case the page is already known to be setup.
	 *
	 * We have exclusion against page_add_new_anon_rmap because those pages
	 * are initially only visible via the pagetables, and the pte is locked
	 * over the call to page_add_new_anon_rmap.
	 */
	VM_BUG_ON_PAGE(page_anon_vma(page)->root != vma->anon_vma->root, page);
	VM_BUG_ON_PAGE(page_to_pgoff(page) != linear_page_index(vma, address),
		       page);
}

/**
 * page_add_anon_rmap - add pte mapping to an anonymous page
 * @page:	the page to add the mapping to
 * @vma:	the vm area in which the mapping is added
 * @address:	the user virtual address mapped
 * @compound:	charge the page as compound or small page
 *
 * The caller needs to hold the pte lock, and the page must be locked in
 * the anon_vma case: to serialize mapping,index checking after setting,
 * and to ensure that PageAnon is not being upgraded racily to PageKsm
 * (but PageKsm is never downgraded to PageAnon).
 */
void page_add_anon_rmap(struct page *page,
	struct vm_area_struct *vma, unsigned long address, bool compound)
{
	do_page_add_anon_rmap(page, vma, address, compound ? RMAP_COMPOUND : 0);
}

/*
 * Special version of the above for do_swap_page, which often runs
 * into pages that are exclusively owned by the current process.
 * Everybody else should continue to use page_add_anon_rmap above.
 */
void do_page_add_anon_rmap(struct page *page,
	struct vm_area_struct *vma, unsigned long address, int flags)
{
	bool compound = flags & RMAP_COMPOUND;
	bool first;

	if (unlikely(PageKsm(page)))
		lock_page_memcg(page);
	else
		VM_BUG_ON_PAGE(!PageLocked(page), page);

	if (compound) {
		atomic_t *mapcount;
		VM_BUG_ON_PAGE(!PageLocked(page), page);
		VM_BUG_ON_PAGE(!PageTransHuge(page), page);
		mapcount = compound_mapcount_ptr(page);
		first = atomic_inc_and_test(mapcount);
	} else {
		first = atomic_inc_and_test(&page->_mapcount);
	}

	if (first) {
		int nr = compound ? thp_nr_pages(page) : 1;
		/*
		 * We use the irq-unsafe __{inc|mod}_zone_page_stat because
		 * these counters are not modified in interrupt context, and
		 * pte lock (a spinlock) is held, which implies preemption
		 * disabled.
		 */
		if (compound)
			__inc_lruvec_page_state(page, NR_ANON_THPS);
		__mod_lruvec_page_state(page, NR_ANON_MAPPED, nr);
	}

	if (unlikely(PageKsm(page))) {
		unlock_page_memcg(page);
		return;
	}

	/* address might be in next vma when migration races vma_adjust */
	if (first)
		__page_set_anon_rmap(page, vma, address,
				flags & RMAP_EXCLUSIVE);
	else
		__page_check_anon_rmap(page, vma, address);
}
/**
 * page_add_new_anon_rmap - add pte mapping to a new anonymous page
 * @page:	the page to add the mapping to
 * @vma:	the vm area in which the mapping is added
 * @address:	the user virtual address mapped
 * @compound:	charge the page as compound or small page
 *
 * Same as page_add_anon_rmap but must only be called on *new* pages.
 * This means the inc-and-test can be bypassed.
 * Page does not have to be locked.
 */
void page_add_new_anon_rmap(struct page *page,
	struct vm_area_struct *vma, unsigned long address, bool compound)
{
	int nr = compound ? thp_nr_pages(page) : 1;

	VM_BUG_ON_VMA(address < vma->vm_start || address >= vma->vm_end, vma);
	__SetPageSwapBacked(page);
	if (compound) {
		VM_BUG_ON_PAGE(!PageTransHuge(page), page);
		/* increment count (starts at -1) */
		atomic_set(compound_mapcount_ptr(page), 0);
		if (hpage_pincount_available(page))
			atomic_set(compound_pincount_ptr(page), 0);
		__inc_lruvec_page_state(page, NR_ANON_THPS);
	} else {
		/* Anon THP always mapped first with PMD */
		VM_BUG_ON_PAGE(PageTransCompound(page), page);
		/* increment count (starts at -1) */
		atomic_set(&page->_mapcount, 0);
	}
	__mod_lruvec_page_state(page, NR_ANON_MAPPED, nr);
	__page_set_anon_rmap(page, vma, address, 1);
}
/**
 * page_add_file_rmap - add pte mapping to a file page
 * @page:	the page to add the mapping to
 * @compound:	charge the page as compound or small page
 *
 * The caller needs to hold the pte lock.
 */
void page_add_file_rmap(struct page *page, bool compound)
{
	int i, nr = 1;

	VM_BUG_ON_PAGE(compound && !PageTransHuge(page), page);
	lock_page_memcg(page);
	if (compound && PageTransHuge(page)) {
		for (i = 0, nr = 0; i < thp_nr_pages(page); i++) {
			if (atomic_inc_and_test(&page[i]._mapcount))
				nr++;
		}
		if (!atomic_inc_and_test(compound_mapcount_ptr(page)))
			goto out;
		if (PageSwapBacked(page))
			__inc_node_page_state(page, NR_SHMEM_PMDMAPPED);
		else
			__inc_node_page_state(page, NR_FILE_PMDMAPPED);
	} else {
		if (PageTransCompound(page) && page_mapping(page)) {
			VM_WARN_ON_ONCE(!PageLocked(page));

			SetPageDoubleMap(compound_head(page));
			if (PageMlocked(page))
				clear_page_mlock(compound_head(page));
		}
		if (!atomic_inc_and_test(&page->_mapcount))
			goto out;
	}
	__mod_lruvec_page_state(page, NR_FILE_MAPPED, nr);
out:
	unlock_page_memcg(page);
}
static void page_remove_file_rmap(struct page *page, bool compound)
{
	int i, nr = 1;

	VM_BUG_ON_PAGE(compound && !PageHead(page), page);

	/* Hugepages are not counted in NR_FILE_MAPPED for now. */
	if (unlikely(PageHuge(page))) {
		/* hugetlb pages are always mapped with pmds */
		atomic_dec(compound_mapcount_ptr(page));
		return;
	}

	/* page still mapped by someone else? */
	if (compound && PageTransHuge(page)) {
		for (i = 0, nr = 0; i < thp_nr_pages(page); i++) {
			if (atomic_add_negative(-1, &page[i]._mapcount))
				nr++;
		}
		if (!atomic_add_negative(-1, compound_mapcount_ptr(page)))
			return;
		if (PageSwapBacked(page))
			__dec_node_page_state(page, NR_SHMEM_PMDMAPPED);
		else
			__dec_node_page_state(page, NR_FILE_PMDMAPPED);
	} else {
		if (!atomic_add_negative(-1, &page->_mapcount))
			return;
	}

	/*
	 * We use the irq-unsafe __{inc|mod}_lruvec_page_state because
	 * these counters are not modified in interrupt context, and
	 * pte lock (a spinlock) is held, which implies preemption disabled.
	 */
	__mod_lruvec_page_state(page, NR_FILE_MAPPED, -nr);

	if (unlikely(PageMlocked(page)))
		clear_page_mlock(page);
}
static void page_remove_anon_compound_rmap(struct page *page)
{
	int i, nr;

	if (!atomic_add_negative(-1, compound_mapcount_ptr(page)))
		return;

	/* Hugepages are not counted in NR_ANON_PAGES for now. */
	if (unlikely(PageHuge(page)))
		return;

	if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE))
		return;

	__dec_lruvec_page_state(page, NR_ANON_THPS);

	if (TestClearPageDoubleMap(page)) {
		/*
		 * Subpages can be mapped with PTEs too. Check how many of
		 * them are still mapped.
		 */
		for (i = 0, nr = 0; i < thp_nr_pages(page); i++) {
			if (atomic_add_negative(-1, &page[i]._mapcount))
				nr++;
		}

		/*
		 * Queue the page for deferred split if at least one small
		 * page of the compound page is unmapped, but at least one
		 * small page is still mapped.
		 */
		if (nr && nr < thp_nr_pages(page))
			deferred_split_huge_page(page);
	} else {
		nr = thp_nr_pages(page);
	}

	if (unlikely(PageMlocked(page)))
		clear_page_mlock(page);

	if (nr)
		__mod_lruvec_page_state(page, NR_ANON_MAPPED, -nr);
}
/**
 * page_remove_rmap - take down pte mapping from a page
 * @page:	page to remove mapping from
 * @compound:	uncharge the page as compound or small page
 *
 * The caller needs to hold the pte lock.
 */
void page_remove_rmap(struct page *page, bool compound)
{
	lock_page_memcg(page);

	if (!PageAnon(page)) {
		page_remove_file_rmap(page, compound);
		goto out;
	}

	if (compound) {
		page_remove_anon_compound_rmap(page);
		goto out;
	}

	/* page still mapped by someone else? */
	if (!atomic_add_negative(-1, &page->_mapcount))
		goto out;

	/*
	 * We use the irq-unsafe __{inc|mod}_zone_page_stat because
	 * these counters are not modified in interrupt context, and
	 * pte lock (a spinlock) is held, which implies preemption disabled.
	 */
	__dec_lruvec_page_state(page, NR_ANON_MAPPED);

	if (unlikely(PageMlocked(page)))
		clear_page_mlock(page);

	if (PageTransCompound(page))
		deferred_split_huge_page(compound_head(page));

	/*
	 * It would be tidy to reset the PageAnon mapping here,
	 * but that might overwrite a racing page_add_anon_rmap
	 * which increments mapcount after us but sets mapping
	 * before us: so leave the reset to free_unref_page,
	 * and remember that it's only reliable while mapped.
	 * Leaving it set also helps swapoff to reinstate ptes
	 * faster for those pages still in swapcache.
	 */
out:
	unlock_page_memcg(page);
}

/*
 * @arg: enum ttu_flags will be passed to this argument
 */
static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
		     unsigned long address, void *arg)
{
	struct mm_struct *mm = vma->vm_mm;
	struct page_vma_mapped_walk pvmw = {
		.page = page,
		.vma = vma,
		.address = address,
	};
	pte_t pteval;
	struct page *subpage;
	bool ret = true;
	struct mmu_notifier_range range;
	enum ttu_flags flags = (enum ttu_flags)(long)arg;

	/* munlock has nothing to gain from examining un-locked vmas */
	if ((flags & TTU_MUNLOCK) && !(vma->vm_flags & VM_LOCKED))
		return true;

	if (IS_ENABLED(CONFIG_MIGRATION) && (flags & TTU_MIGRATION) &&
	    is_zone_device_page(page) && !is_device_private_page(page))
		return true;

	if (flags & TTU_SPLIT_HUGE_PMD) {
		split_huge_pmd_address(vma, address,
				flags & TTU_SPLIT_FREEZE, page);
	}

	/*
	 * For THP, we have to assume the worst case, i.e. pmd, for
	 * invalidation.  For hugetlb, it could be much worse if we need to do
	 * pud invalidation in the case of pmd sharing.
	 *
	 * Note that the page cannot be freed in this function as the call of
	 * try_to_unmap() must hold a reference on the page.
	 */
	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
				address,
				min(vma->vm_end, address + page_size(page)));
	if (PageHuge(page)) {
		/*
		 * If sharing is possible, start and end will be adjusted
		 * accordingly.
		 */
		adjust_range_if_pmd_sharing_possible(vma, &range.start,
						     &range.end);
	}
	mmu_notifier_invalidate_range_start(&range);

	while (page_vma_mapped_walk(&pvmw)) {
#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
		/* PMD-mapped THP migration entry */
		if (!pvmw.pte && (flags & TTU_MIGRATION)) {
			VM_BUG_ON_PAGE(PageHuge(page) || !PageTransCompound(page), page);

			set_pmd_migration_entry(&pvmw, page);
			continue;
		}
#endif

		/*
		 * If the page is mlock()d, we cannot swap it out.
		 * If it's recently referenced (perhaps page_referenced
		 * skipped over this mm) then we should reactivate it.
		 */
		if (!(flags & TTU_IGNORE_MLOCK)) {
			if (vma->vm_flags & VM_LOCKED) {
				/* PTE-mapped THP are never mlocked */
				if (!PageTransCompound(page)) {
					/*
					 * Holding pte lock, we do *not* need
					 * mmap_lock here
					 */
					mlock_vma_page(page);
				}
				ret = false;
				page_vma_mapped_walk_done(&pvmw);
				break;
			}
			if (flags & TTU_MUNLOCK)
				continue;
		}

		/* Unexpected PMD-mapped THP? */
		VM_BUG_ON_PAGE(!pvmw.pte, page);

		subpage = page - page_to_pfn(page) + pte_pfn(*pvmw.pte);
		address = pvmw.address;

		if (PageHuge(page) && !PageAnon(page)) {
			/*
			 * To call huge_pmd_unshare, i_mmap_rwsem must be
			 * held in write mode.  Caller needs to explicitly
			 * do this outside rmap routines.
			 */
			VM_BUG_ON(!(flags & TTU_RMAP_LOCKED));
			if (huge_pmd_unshare(mm, vma, &address, pvmw.pte)) {
				/*
				 * huge_pmd_unshare unmapped an entire PMD
				 * page.  There is no way of knowing exactly
				 * which PMDs may be cached for this mm, so
				 * we must flush them all.  start/end were
				 * already adjusted above to cover this range.
				 */
				flush_cache_range(vma, range.start, range.end);
				flush_tlb_range(vma, range.start, range.end);
				mmu_notifier_invalidate_range(mm, range.start,
							      range.end);

				/*
				 * The ref count of the PMD page was dropped
				 * which is part of the way map counting
				 * is done for shared PMDs.  Return 'true'
				 * here.  When there is no other sharing,
				 * huge_pmd_unshare returns false and we will
				 * unmap the actual page and drop map count
				 * to zero.
				 */
				page_vma_mapped_walk_done(&pvmw);
				break;
			}
		}

		if (IS_ENABLED(CONFIG_MIGRATION) && (flags & TTU_MIGRATION) &&
		    is_zone_device_page(page)) {
			swp_entry_t entry;
			pte_t swp_pte;

			pteval = ptep_get_and_clear(mm, pvmw.address, pvmw.pte);

			/*
			 * Store the pfn of the page in a special migration
			 * pte. do_swap_page() will wait until the migration
			 * pte is removed and then restart fault handling.
			 */
			entry = make_migration_entry(page, 0);
			swp_pte = swp_entry_to_pte(entry);

			/*
			 * pteval maps a zone device page and is therefore
			 * a swap pte.
			 */
			if (pte_swp_soft_dirty(pteval))
				swp_pte = pte_swp_mksoft_dirty(swp_pte);
			if (pte_swp_uffd_wp(pteval))
				swp_pte = pte_swp_mkuffd_wp(swp_pte);
			set_pte_at(mm, pvmw.address, pvmw.pte, swp_pte);
			/*
			 * No need to invalidate here, it will synchronize
			 * against the special swap migration pte.
			 *
			 * The assignment to subpage above was computed from a
			 * swap PTE which results in an invalid pointer.
			 * Since only PAGE_SIZE pages can currently be
			 * migrated, just set it to page. This will need to be
			 * changed when hugepage migrations to device private
			 * memory are supported.
			 */
			subpage = page;
			goto discard;
		}
c7ab0d2fd
|
1473 |
/* Nuke the page table entry. */ |
785373b4c
|
1474 |
flush_cache_page(vma, address, pte_pfn(*pvmw.pte)); |
c7ab0d2fd
|
1475 1476 1477 1478 1479 1480 1481 1482 1483 |
if (should_defer_flush(mm, flags)) { /* * We clear the PTE but do not flush so potentially * a remote CPU could still be writing to the page. * If the entry was previously clean then the * architecture must guarantee that a clear->dirty * transition on a cached TLB entry is written through * and traps if the PTE is unmapped. */ |
785373b4c
|
1484 |
pteval = ptep_get_and_clear(mm, address, pvmw.pte); |
c7ab0d2fd
|
1485 1486 1487 |
set_tlb_ubc_flush_pending(mm, pte_dirty(pteval)); } else { |
785373b4c
|
1488 |
pteval = ptep_clear_flush(vma, address, pvmw.pte); |
c7ab0d2fd
|
1489 |
} |
72b252aed
|
1490 |
|
c7ab0d2fd
|
1491 1492 1493 |
/* Move the dirty bit to the page. Now the pte is gone. */ if (pte_dirty(pteval)) set_page_dirty(page); |
1da177e4c
|
1494 |
|
c7ab0d2fd
|
1495 1496 |
/* Update high watermark before we lower rss */ update_hiwater_rss(mm); |
1da177e4c
|
1497 |
|
c7ab0d2fd
|
1498 |
if (PageHWPoison(page) && !(flags & TTU_IGNORE_HWPOISON)) { |
5fd27b8e7
|
1499 |
pteval = swp_entry_to_pte(make_hwpoison_entry(subpage)); |
c7ab0d2fd
|
1500 |
if (PageHuge(page)) { |
d8c6546b1
|
1501 |
hugetlb_count_sub(compound_nr(page), mm); |
785373b4c
|
1502 |
set_huge_swap_pte_at(mm, address, |
5fd27b8e7
|
1503 1504 |
pvmw.pte, pteval, vma_mmu_pagesize(vma)); |
c7ab0d2fd
|
1505 1506 |
} else { dec_mm_counter(mm, mm_counter(page)); |
785373b4c
|
1507 |
set_pte_at(mm, address, pvmw.pte, pteval); |
c7ab0d2fd
|
1508 |
} |
365e9c87a
|
1509 |
|
bce73e484
|
1510 |
} else if (pte_unused(pteval) && !userfaultfd_armed(vma)) { |
c7ab0d2fd
|
1511 1512 1513 1514 |
/* * The guest indicated that the page content is of no * interest anymore. Simply discard the pte, vmscan * will take care of the rest. |
bce73e484
|
1515 1516 1517 1518 1519 |
* A future reference will then fault in a new zero * page. When userfaultfd is active, we must not drop * this page though, as its main user (postcopy * migration) will not expect userfaults on already * copied pages. |
c7ab0d2fd
|
1520 |
*/ |
eca56ff90
|
1521 |
dec_mm_counter(mm, mm_counter(page)); |
0f10851ea
|
1522 1523 1524 |
/* We have to invalidate as we cleared the pte */ mmu_notifier_invalidate_range(mm, address, address + PAGE_SIZE); |
c7ab0d2fd
|
1525 |
} else if (IS_ENABLED(CONFIG_MIGRATION) && |
b5ff8161e
|
1526 |
(flags & (TTU_MIGRATION|TTU_SPLIT_FREEZE))) { |
c7ab0d2fd
|
1527 1528 |
swp_entry_t entry; pte_t swp_pte; |
ca827d55e
|
1529 1530 1531 1532 1533 1534 1535 |
if (arch_unmap_one(mm, vma, address, pteval) < 0) { set_pte_at(mm, address, pvmw.pte, pteval); ret = false; page_vma_mapped_walk_done(&pvmw); break; } |
c7ab0d2fd
|
1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 |
/* * Store the pfn of the page in a special migration * pte. do_swap_page() will wait until the migration * pte is removed and then restart fault handling. */ entry = make_migration_entry(subpage, pte_write(pteval)); swp_pte = swp_entry_to_pte(entry); if (pte_soft_dirty(pteval)) swp_pte = pte_swp_mksoft_dirty(swp_pte); |
f45ec5ff1
|
1546 1547 |
if (pte_uffd_wp(pteval)) swp_pte = pte_swp_mkuffd_wp(swp_pte); |
785373b4c
|
1548 |
set_pte_at(mm, address, pvmw.pte, swp_pte); |
0f10851ea
|
1549 1550 1551 1552 |
/* * No need to invalidate here it will synchronize on * against the special swap migration pte. */ |
c7ab0d2fd
|
1553 1554 1555 1556 1557 1558 1559 |
} else if (PageAnon(page)) { swp_entry_t entry = { .val = page_private(subpage) }; pte_t swp_pte; /* * Store the swap location in the pte. * See handle_pte_fault() ... */ |
eb94a8784
|
1560 1561 |
if (unlikely(PageSwapBacked(page) != PageSwapCache(page))) { WARN_ON_ONCE(1); |
83612a948
|
1562 |
ret = false; |
369ea8242
|
1563 |
/* We have to invalidate as we cleared the pte */ |
0f10851ea
|
1564 1565 |
mmu_notifier_invalidate_range(mm, address, address + PAGE_SIZE); |
eb94a8784
|
1566 1567 1568 |
page_vma_mapped_walk_done(&pvmw); break; } |
c7ab0d2fd
|
1569 |
|
802a3a92a
|
1570 1571 1572 |
/* MADV_FREE page check */ if (!PageSwapBacked(page)) { if (!PageDirty(page)) { |
0f10851ea
|
1573 1574 1575 |
/* Invalidate as we cleared the pte */ mmu_notifier_invalidate_range(mm, address, address + PAGE_SIZE); |
802a3a92a
|
1576 1577 1578 1579 1580 1581 1582 1583 |
dec_mm_counter(mm, MM_ANONPAGES); goto discard; } /* * If the page was redirtied, it cannot be * discarded. Remap the page to page table. */ |
785373b4c
|
1584 |
set_pte_at(mm, address, pvmw.pte, pteval); |
18863d3a3
|
1585 |
SetPageSwapBacked(page); |
e4b822227
|
1586 |
ret = false; |
802a3a92a
|
1587 1588 |
page_vma_mapped_walk_done(&pvmw); break; |
c7ab0d2fd
|
1589 |
} |
854e9ed09
|
1590 |
|
c7ab0d2fd
|
1591 |
if (swap_duplicate(entry) < 0) { |
785373b4c
|
1592 |
set_pte_at(mm, address, pvmw.pte, pteval); |
e4b822227
|
1593 |
ret = false; |
c7ab0d2fd
|
1594 1595 1596 |
page_vma_mapped_walk_done(&pvmw); break; } |
ca827d55e
|
1597 1598 1599 1600 1601 1602 |
if (arch_unmap_one(mm, vma, address, pteval) < 0) { set_pte_at(mm, address, pvmw.pte, pteval); ret = false; page_vma_mapped_walk_done(&pvmw); break; } |
c7ab0d2fd
|
1603 1604 1605 1606 1607 1608 |
if (list_empty(&mm->mmlist)) { spin_lock(&mmlist_lock); if (list_empty(&mm->mmlist)) list_add(&mm->mmlist, &init_mm.mmlist); spin_unlock(&mmlist_lock); } |
854e9ed09
|
1609 |
dec_mm_counter(mm, MM_ANONPAGES); |
c7ab0d2fd
|
1610 1611 1612 1613 |
inc_mm_counter(mm, MM_SWAPENTS); swp_pte = swp_entry_to_pte(entry); if (pte_soft_dirty(pteval)) swp_pte = pte_swp_mksoft_dirty(swp_pte); |
f45ec5ff1
|
1614 1615 |
if (pte_uffd_wp(pteval)) swp_pte = pte_swp_mkuffd_wp(swp_pte); |
785373b4c
|
1616 |
set_pte_at(mm, address, pvmw.pte, swp_pte); |
0f10851ea
|
1617 1618 1619 1620 1621 |
/* Invalidate as we cleared the pte */ mmu_notifier_invalidate_range(mm, address, address + PAGE_SIZE); } else { /* |
906f9cdfc
|
1622 1623 1624 |
* This is a locked file-backed page, thus it cannot * be removed from the page cache and replaced by a new * page before mmu_notifier_invalidate_range_end, so no |
0f10851ea
|
1625 1626 1627 1628 |
* concurrent thread might update its page table to * point at new page while a device still is using this * page. * |
ad56b738c
|
1629 |
* See Documentation/vm/mmu_notifier.rst |
0f10851ea
|
1630 |
*/ |
c7ab0d2fd
|
1631 |
dec_mm_counter(mm, mm_counter_file(page)); |
0f10851ea
|
1632 |
} |
854e9ed09
|
1633 |
discard: |
0f10851ea
|
1634 1635 1636 1637 1638 |
/* * No need to call mmu_notifier_invalidate_range() it has be * done above for all cases requiring it to happen under page * table lock before mmu_notifier_invalidate_range_end() * |
ad56b738c
|
1639 |
* See Documentation/vm/mmu_notifier.rst |
0f10851ea
|
1640 |
*/ |
c7ab0d2fd
|
1641 1642 |
page_remove_rmap(subpage, PageHuge(page)); put_page(page); |
c7ab0d2fd
|
1643 |
} |
369ea8242
|
1644 |
|
ac46d4f3c
|
1645 |
mmu_notifier_invalidate_range_end(&range); |
369ea8242
|
1646 |
|
caed0f486
|
1647 |
return ret; |
1da177e4c
|
1648 |
} |
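
/*
 * Rough summary of how the TTU flags steer the branches above; this is a
 * reader's aid distilled from the code, not an exhaustive list:
 *
 *	TTU_MUNLOCK			only fix up mlock state, never unmap
 *	TTU_MIGRATION, TTU_SPLIT_FREEZE	replace the PTE with a migration entry
 *	PageHWPoison (and no TTU_IGNORE_HWPOISON)
 *					install a hwpoison swap entry
 *	anonymous, swap-backed page	take a swap reference and install a
 *					swap entry
 *	anonymous MADV_FREE page	drop the PTE only if the page is still
 *					clean, otherwise fail the unmap
 *	file-backed page		drop the PTE and the rss count
 *
 * try_to_unmap_one() is an rmap_one callback: returning false makes the
 * surrounding rmap walk stop early (see rmap_walk_anon()/rmap_walk_file()
 * below), returning true lets it move on to the next VMA.
 */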

static bool invalid_migration_vma(struct vm_area_struct *vma, void *arg)
{
	return vma_is_temporary_stack(vma);
}

static int page_mapcount_is_zero(struct page *page)
{
	return !total_mapcount(page);
}

/**
 * try_to_unmap - try to remove all page table mappings to a page
 * @page: the page to get unmapped
 * @flags: action and flags
 *
 * Tries to remove all the page table entries which are mapping this
 * page, used in the pageout path.  Caller must hold the page lock.
 *
 * If unmap is successful, return true. Otherwise, false.
 */
bool try_to_unmap(struct page *page, enum ttu_flags flags)
{
	struct rmap_walk_control rwc = {
		.rmap_one = try_to_unmap_one,
		.arg = (void *)flags,
		.done = page_mapcount_is_zero,
		.anon_lock = page_lock_anon_vma_read,
	};

	/*
	 * During exec, a temporary VMA is set up and later moved.
	 * The VMA is moved under the anon_vma lock but not the
	 * page tables leading to a race where migration cannot
	 * find the migration ptes. Rather than increasing the
	 * locking requirements of exec(), migration skips
	 * temporary VMAs until after exec() completes.
	 */
	if ((flags & (TTU_MIGRATION|TTU_SPLIT_FREEZE))
	    && !PageKsm(page) && PageAnon(page))
		rwc.invalid_vma = invalid_migration_vma;

	if (flags & TTU_RMAP_LOCKED)
		rmap_walk_locked(page, &rwc);
	else
		rmap_walk(page, &rwc);

	return !page_mapcount(page) ? true : false;
}
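
/*
 * A minimal sketch of a reclaim-style caller, for illustration only (the
 * real user of this path is shrink_page_list() in mm/vmscan.c, and the
 * flags it passes depend on the scan context):
 *
 *	lock_page(page);
 *	if (page_mapped(page) && !try_to_unmap(page, TTU_BATCH_FLUSH)) {
 *		unlock_page(page);
 *		return;		(keep the page, some mapping survived)
 *	}
 *	... proceed to pageout() or free the page ...
 *
 * A false return only means at least one mapping could not be removed,
 * for example because an mlocked VMA still maps the page; it does not
 * indicate an error.
 */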

static int page_not_mapped(struct page *page)
{
	return !page_mapped(page);
}

/**
 * try_to_munlock - try to munlock a page
 * @page: the page to be munlocked
 *
 * Called from munlock code.  Checks all of the VMAs mapping the page
 * to make sure nobody else has this page mlocked. The page will be
 * returned with PG_mlocked cleared if no other vmas have it mlocked.
 */
void try_to_munlock(struct page *page)
{
	struct rmap_walk_control rwc = {
		.rmap_one = try_to_unmap_one,
		.arg = (void *)TTU_MUNLOCK,
		.done = page_not_mapped,
		.anon_lock = page_lock_anon_vma_read,
	};

	VM_BUG_ON_PAGE(!PageLocked(page) || PageLRU(page), page);
	VM_BUG_ON_PAGE(PageCompound(page) && PageDoubleMap(page), page);

	rmap_walk(page, &rwc);
}
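
/*
 * For illustration (a sketch, not a copy of the caller): the munlock path
 * in mm/mlock.c drives this roughly as
 *
 *	if (page_mapcount(page) > 1)
 *		try_to_munlock(page);
 *	if (!PageMlocked(page))
 *		count_vm_event(UNEVICTABLE_PGMUNLOCKED);
 *	putback_lru_page(page);
 *
 * i.e. the walk either finds another VM_LOCKED vma, in which case
 * try_to_unmap_one() re-marks the page via mlock_vma_page(), or it leaves
 * PG_mlocked clear so the page can return to a normal LRU list.
 */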

void __put_anon_vma(struct anon_vma *anon_vma)
{
	struct anon_vma *root = anon_vma->root;

	anon_vma_free(anon_vma);
	if (root != anon_vma && atomic_dec_and_test(&root->refcount))
		anon_vma_free(root);
}

static struct anon_vma *rmap_walk_anon_lock(struct page *page,
					struct rmap_walk_control *rwc)
{
	struct anon_vma *anon_vma;

	if (rwc->anon_lock)
		return rwc->anon_lock(page);

	/*
	 * Note: remove_migration_ptes() cannot use page_lock_anon_vma_read()
	 * because that depends on page_mapped(); but not all its usages
	 * are holding mmap_lock. Users without mmap_lock are required to
	 * take a reference count to prevent the anon_vma disappearing
	 */
	anon_vma = page_anon_vma(page);
	if (!anon_vma)
		return NULL;

	anon_vma_lock_read(anon_vma);
	return anon_vma;
}

/*
 * rmap_walk_anon - do something to anonymous page using the object-based
 * rmap method
 * @page: the page to be handled
 * @rwc: control variable according to each walk type
 *
 * Find all the mappings of a page using the mapping pointer and the vma chains
 * contained in the anon_vma struct it points to.
 *
 * When called from try_to_munlock(), the mmap_lock of the mm containing the vma
 * where the page was found will be held for write.  So, we won't recheck
 * vm_flags for that VMA.  That should be OK, because that vma shouldn't be
 * LOCKED.
 */
static void rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc,
		bool locked)
{
	struct anon_vma *anon_vma;
	pgoff_t pgoff_start, pgoff_end;
	struct anon_vma_chain *avc;

	if (locked) {
		anon_vma = page_anon_vma(page);
		/* anon_vma disappear under us? */
		VM_BUG_ON_PAGE(!anon_vma, page);
	} else {
		anon_vma = rmap_walk_anon_lock(page, rwc);
	}
	if (!anon_vma)
		return;

	pgoff_start = page_to_pgoff(page);
	pgoff_end = pgoff_start + thp_nr_pages(page) - 1;
	anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root,
			pgoff_start, pgoff_end) {
		struct vm_area_struct *vma = avc->vma;
		unsigned long address = vma_address(page, vma);

		cond_resched();

		if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg))
			continue;

		if (!rwc->rmap_one(page, vma, address, rwc->arg))
			break;
		if (rwc->done && rwc->done(page))
			break;
	}

	if (!locked)
		anon_vma_unlock_read(anon_vma);
}

/*
 * rmap_walk_file - do something to file page using the object-based rmap method
 * @page: the page to be handled
 * @rwc: control variable according to each walk type
 *
 * Find all the mappings of a page using the mapping pointer and the vma chains
 * contained in the address_space struct it points to.
 *
 * When called from try_to_munlock(), the mmap_lock of the mm containing the vma
 * where the page was found will be held for write.  So, we won't recheck
 * vm_flags for that VMA.  That should be OK, because that vma shouldn't be
 * LOCKED.
 */
static void rmap_walk_file(struct page *page, struct rmap_walk_control *rwc,
		bool locked)
{
	struct address_space *mapping = page_mapping(page);
	pgoff_t pgoff_start, pgoff_end;
	struct vm_area_struct *vma;

	/*
	 * The page lock not only makes sure that page->mapping cannot
	 * suddenly be NULLified by truncation, it makes sure that the
	 * structure at mapping cannot be freed and reused yet,
	 * so we can safely take mapping->i_mmap_rwsem.
	 */
	VM_BUG_ON_PAGE(!PageLocked(page), page);

	if (!mapping)
		return;

	pgoff_start = page_to_pgoff(page);
	pgoff_end = pgoff_start + thp_nr_pages(page) - 1;
	if (!locked)
		i_mmap_lock_read(mapping);
	vma_interval_tree_foreach(vma, &mapping->i_mmap,
			pgoff_start, pgoff_end) {
		unsigned long address = vma_address(page, vma);

		cond_resched();

		if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg))
			continue;

		if (!rwc->rmap_one(page, vma, address, rwc->arg))
			goto done;
		if (rwc->done && rwc->done(page))
			goto done;
	}

done:
	if (!locked)
		i_mmap_unlock_read(mapping);
}

void rmap_walk(struct page *page, struct rmap_walk_control *rwc)
{
	if (unlikely(PageKsm(page)))
		rmap_walk_ksm(page, rwc);
	else if (PageAnon(page))
		rmap_walk_anon(page, rwc, false);
	else
		rmap_walk_file(page, rwc, false);
}

/* Like rmap_walk, but caller holds relevant rmap lock */
void rmap_walk_locked(struct page *page, struct rmap_walk_control *rwc)
{
	/* no ksm support for now */
	VM_BUG_ON_PAGE(PageKsm(page), page);
	if (PageAnon(page))
		rmap_walk_anon(page, rwc, true);
	else
		rmap_walk_file(page, rwc, true);
}
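
/*
 * The other walkers in this file (page_referenced(), page_mkclean(),
 * try_to_unmap(), ...) are all built the same way: fill in an
 * rmap_walk_control and hand it to rmap_walk().  A minimal sketch of such a
 * walker, using made-up names, would be:
 *
 *	static bool count_one(struct page *page, struct vm_area_struct *vma,
 *			      unsigned long address, void *arg)
 *	{
 *		int *nr = arg;
 *
 *		(*nr)++;
 *		return true;	(true = keep walking, false = stop early)
 *	}
 *
 *	static int count_mapping_vmas(struct page *page)
 *	{
 *		int nr = 0;
 *		struct rmap_walk_control rwc = {
 *			.rmap_one = count_one,
 *			.arg = &nr,
 *		};
 *
 *		rmap_walk(page, &rwc);
 *		return nr;
 *	}
 *
 * The page must be locked, and .rmap_one runs with the anon_vma or
 * i_mmap_rwsem held for read (or with the caller's lock when the _locked
 * variant is used).
 */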

#ifdef CONFIG_HUGETLB_PAGE
/*
 * The following two functions are for anonymous (private mapped) hugepages.
 * Unlike common anonymous pages, anonymous hugepages have no accounting code
 * and no lru code, because we handle hugepages differently from common pages.
 */
void hugepage_add_anon_rmap(struct page *page,
			    struct vm_area_struct *vma, unsigned long address)
{
	struct anon_vma *anon_vma = vma->anon_vma;
	int first;

	BUG_ON(!PageLocked(page));
	BUG_ON(!anon_vma);
	/* address might be in next vma when migration races vma_adjust */
	first = atomic_inc_and_test(compound_mapcount_ptr(page));
	if (first)
		__page_set_anon_rmap(page, vma, address, 0);
}

void hugepage_add_new_anon_rmap(struct page *page,
			struct vm_area_struct *vma, unsigned long address)
{
	BUG_ON(address < vma->vm_start || address >= vma->vm_end);
	atomic_set(compound_mapcount_ptr(page), 0);
	if (hpage_pincount_available(page))
		atomic_set(compound_pincount_ptr(page), 0);
	__page_set_anon_rmap(page, vma, address, 1);
}
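
/*
 * For context (a sketch, not copied from mm/hugetlb.c): the hugetlb fault
 * and COW paths call hugepage_add_new_anon_rmap() right after installing
 * the entry for a freshly allocated private hugepage, roughly
 *
 *	set_huge_pte_at(mm, haddr, ptep, make_huge_pte(vma, new_page, 1));
 *	hugepage_add_new_anon_rmap(new_page, vma, haddr);
 *
 * while hugepage_add_anon_rmap() is used when an already-anonymous hugepage
 * gains a mapping back, e.g. when migration entries are removed.
 */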
#endif /* CONFIG_HUGETLB_PAGE */ |