mm/swap_state.c
// SPDX-License-Identifier: GPL-2.0
/*
 *  linux/mm/swap_state.c
 *
 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 *  Swap reorganised 29.12.95, Stephen Tweedie
 *
 *  Rewritten to use page cache, (C) 1998 Stephen Tweedie
 */
#include <linux/mm.h>
#include <linux/gfp.h>
#include <linux/kernel_stat.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/init.h>
#include <linux/pagemap.h>
#include <linux/backing-dev.h>
#include <linux/blkdev.h>
#include <linux/pagevec.h>
#include <linux/migrate.h>
#include <linux/vmalloc.h>
#include <linux/swap_slots.h>
#include <linux/huge_mm.h>
#include <linux/shmem_fs.h>
#include "internal.h"

/*
 * swapper_space is a fiction, retained to simplify the path through
 * vmscan's shrink_page_list.
 */
static const struct address_space_operations swap_aops = {
	.writepage	= swap_writepage,
	.set_page_dirty	= swap_set_page_dirty,
#ifdef CONFIG_MIGRATION
	.migratepage	= migrate_page,
#endif
};

struct address_space *swapper_spaces[MAX_SWAPFILES] __read_mostly;
static unsigned int nr_swapper_spaces[MAX_SWAPFILES] __read_mostly;
static bool enable_vma_readahead __read_mostly = true;
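
/*
 * Per-VMA readahead state lives in vma->swap_readahead_info: one
 * atomic_long packing the page-aligned fault address, the last readahead
 * window size and a saturating hit count, as encoded by SWAP_RA_VAL()
 * and decoded by the SWAP_RA_* accessors below.
 */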
#define SWAP_RA_WIN_SHIFT	(PAGE_SHIFT / 2)
#define SWAP_RA_HITS_MASK	((1UL << SWAP_RA_WIN_SHIFT) - 1)
#define SWAP_RA_HITS_MAX	SWAP_RA_HITS_MASK
#define SWAP_RA_WIN_MASK	(~PAGE_MASK & ~SWAP_RA_HITS_MASK)

#define SWAP_RA_HITS(v)		((v) & SWAP_RA_HITS_MASK)
#define SWAP_RA_WIN(v)		(((v) & SWAP_RA_WIN_MASK) >> SWAP_RA_WIN_SHIFT)
#define SWAP_RA_ADDR(v)		((v) & PAGE_MASK)

#define SWAP_RA_VAL(addr, win, hits)				\
	(((addr) & PAGE_MASK) |					\
	 (((win) << SWAP_RA_WIN_SHIFT) & SWAP_RA_WIN_MASK) |	\
	 ((hits) & SWAP_RA_HITS_MASK))

/* Initial readahead hits is 4 to start up with a small window */
#define GET_SWAP_RA_VAL(vma)					\
	(atomic_long_read(&(vma)->swap_readahead_info) ? : 4)

#define INC_CACHE_INFO(x)	data_race(swap_cache_info.x++)
#define ADD_CACHE_INFO(x, nr)	data_race(swap_cache_info.x += (nr))

static struct {
	unsigned long add_total;
	unsigned long del_total;
	unsigned long find_success;
	unsigned long find_total;
} swap_cache_info;

static atomic_t swapin_readahead_hits = ATOMIC_INIT(4);

void show_swap_cache_info(void)
{
	printk("%lu pages in swap cache\n", total_swapcache_pages());
	printk("Swap cache stats: add %lu, delete %lu, find %lu/%lu\n",
		swap_cache_info.add_total, swap_cache_info.del_total,
		swap_cache_info.find_success, swap_cache_info.find_total);
	printk("Free swap  = %ldkB\n",
		get_nr_swap_pages() << (PAGE_SHIFT - 10));
	printk("Total swap = %lukB\n", total_swap_pages << (PAGE_SHIFT - 10));
}
void *get_shadow_from_swap_cache(swp_entry_t entry)
{
	struct address_space *address_space = swap_address_space(entry);
	pgoff_t idx = swp_offset(entry);
	struct page *page;

	page = xa_load(&address_space->i_pages, idx);
	if (xa_is_value(page))
		return page;
	return NULL;
}

/*
 * add_to_swap_cache resembles add_to_page_cache_locked on swapper_space,
 * but sets SwapCache flag and private instead of mapping and index.
 */
int add_to_swap_cache(struct page *page, swp_entry_t entry,
			gfp_t gfp, void **shadowp)
{
	struct address_space *address_space = swap_address_space(entry);
	pgoff_t idx = swp_offset(entry);
	XA_STATE_ORDER(xas, &address_space->i_pages, idx, compound_order(page));
	unsigned long i, nr = thp_nr_pages(page);
	void *old;

	VM_BUG_ON_PAGE(!PageLocked(page), page);
	VM_BUG_ON_PAGE(PageSwapCache(page), page);
	VM_BUG_ON_PAGE(!PageSwapBacked(page), page);

	page_ref_add(page, nr);
	SetPageSwapCache(page);

	do {
		xas_lock_irq(&xas);
		xas_create_range(&xas);
		if (xas_error(&xas))
			goto unlock;
		for (i = 0; i < nr; i++) {
			VM_BUG_ON_PAGE(xas.xa_index != idx + i, page);
			old = xas_load(&xas);
			if (xa_is_value(old)) {
				if (shadowp)
					*shadowp = old;
			}
			set_page_private(page + i, entry.val + i);
			xas_store(&xas, page);
			xas_next(&xas);
		}
		address_space->nrpages += nr;
		__mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, nr);
		__mod_lruvec_page_state(page, NR_SWAPCACHE, nr);
		ADD_CACHE_INFO(add_total, nr);
unlock:
		xas_unlock_irq(&xas);
	} while (xas_nomem(&xas, gfp));

	if (!xas_error(&xas))
		return 0;

	ClearPageSwapCache(page);
	page_ref_sub(page, nr);
	return xas_error(&xas);
}

/*
 * This must be called only on pages that have
 * been verified to be in the swap cache.
 */
void __delete_from_swap_cache(struct page *page,
			swp_entry_t entry, void *shadow)
{
	struct address_space *address_space = swap_address_space(entry);
	int i, nr = thp_nr_pages(page);
	pgoff_t idx = swp_offset(entry);
	XA_STATE(xas, &address_space->i_pages, idx);

	VM_BUG_ON_PAGE(!PageLocked(page), page);
	VM_BUG_ON_PAGE(!PageSwapCache(page), page);
	VM_BUG_ON_PAGE(PageWriteback(page), page);

	for (i = 0; i < nr; i++) {
		void *entry = xas_store(&xas, shadow);
		VM_BUG_ON_PAGE(entry != page, entry);
		set_page_private(page + i, 0);
		xas_next(&xas);
	}
	ClearPageSwapCache(page);
	address_space->nrpages -= nr;
	__mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, -nr);
	__mod_lruvec_page_state(page, NR_SWAPCACHE, -nr);
	ADD_CACHE_INFO(del_total, nr);
}

/**
 * add_to_swap - allocate swap space for a page
 * @page: page we want to move to swap
 *
 * Allocate swap space for the page and add the page to the
 * swap cache. Caller needs to hold the page lock.
 */
int add_to_swap(struct page *page)
{
	swp_entry_t entry;
	int err;

	VM_BUG_ON_PAGE(!PageLocked(page), page);
	VM_BUG_ON_PAGE(!PageUptodate(page), page);

	entry = get_swap_page(page);
	if (!entry.val)
		return 0;

	/*
	 * XArray node allocations from PF_MEMALLOC contexts could
	 * completely exhaust the page allocator. __GFP_NOMEMALLOC
	 * stops emergency reserves from being allocated.
	 *
	 * TODO: this could cause a theoretical memory reclaim
	 * deadlock in the swap out path.
	 */
	/*
	 * Add it to the swap cache.
	 */
	err = add_to_swap_cache(page, entry,
			__GFP_HIGH|__GFP_NOMEMALLOC|__GFP_NOWARN, NULL);
	if (err)
		/*
		 * add_to_swap_cache() doesn't return -EEXIST, so we can safely
		 * clear SWAP_HAS_CACHE flag.
		 */
		goto fail;
	/*
	 * Normally the page will be dirtied in unmap because its pte should be
	 * dirty. A special case is MADV_FREE page. The page's pte could have
	 * the dirty bit cleared while its SwapBacked bit is still set, because
	 * clearing the dirty bit and the SwapBacked bit is not done under any
	 * lock. For such a page, unmap will not set the dirty bit, so page
	 * reclaim will not write the page out. This can cause data corruption
	 * when the page is swapped in later. Always setting the dirty bit for
	 * the page solves the problem.
	 */
	set_page_dirty(page);

	return 1;

fail:
	put_swap_page(page, entry);
	return 0;
}

/*
 * This must be called only on pages that have
 * been verified to be in the swap cache and locked.
 * It will never put the page into the free list,
 * the caller has a reference on the page.
 */
void delete_from_swap_cache(struct page *page)
{
	swp_entry_t entry = { .val = page_private(page) };
	struct address_space *address_space = swap_address_space(entry);

	xa_lock_irq(&address_space->i_pages);
	__delete_from_swap_cache(page, entry, NULL);
	xa_unlock_irq(&address_space->i_pages);

	put_swap_page(page, entry);
	page_ref_sub(page, thp_nr_pages(page));
}
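
/*
 * Drop any shadow (workingset) entries stored over the range of swap
 * offsets [begin, end] on device @type, walking the range one swap
 * address space (SWAP_ADDRESS_SPACE_PAGES slots) at a time.
 */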
void clear_shadow_from_swap_cache(int type, unsigned long begin,
				unsigned long end)
{
	unsigned long curr = begin;
	void *old;

	for (;;) {
		swp_entry_t entry = swp_entry(type, curr);
		struct address_space *address_space = swap_address_space(entry);
		XA_STATE(xas, &address_space->i_pages, curr);

		xa_lock_irq(&address_space->i_pages);
		xas_for_each(&xas, old, end) {
			if (!xa_is_value(old))
				continue;
			xas_store(&xas, NULL);
		}
		xa_unlock_irq(&address_space->i_pages);

		/* search the next swapcache until we meet end */
		curr >>= SWAP_ADDRESS_SPACE_SHIFT;
		curr++;
		curr <<= SWAP_ADDRESS_SPACE_SHIFT;
		if (curr > end)
			break;
	}
}

/*
 * If we are the only user, then try to free up the swap cache.
 *
 * It's ok to check for PageSwapCache without the page lock
 * here because we are going to recheck again inside
 * try_to_free_swap() _with_ the lock.
 * 					- Marcelo
 */
void free_swap_cache(struct page *page)
{
	if (PageSwapCache(page) && !page_mapped(page) && trylock_page(page)) {
		try_to_free_swap(page);
		unlock_page(page);
	}
}

/*
 * Perform a free_page(), also freeing any swap cache associated with
 * this page if it is the last user of the page.
 */
void free_page_and_swap_cache(struct page *page)
{
	free_swap_cache(page);
	if (!is_huge_zero_page(page))
		put_page(page);
}

/*
 * Passed an array of pages, drop them all from swapcache and then release
 * them.  They are removed from the LRU and freed if this is their last use.
 */
void free_pages_and_swap_cache(struct page **pages, int nr)
{
	struct page **pagep = pages;
	int i;

	lru_add_drain();
	for (i = 0; i < nr; i++)
		free_swap_cache(pagep[i]);
	release_pages(pagep, nr);
}
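
/*
 * VMA-based readahead is used only when it is enabled via sysfs and no
 * rotational swap device is in use.
 */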
static inline bool swap_use_vma_readahead(void)
{
	return READ_ONCE(enable_vma_readahead) && !atomic_read(&nr_rotate_swap);
}

/*
 * Lookup a swap entry in the swap cache. A found page will be returned
 * unlocked and with its refcount incremented - we rely on the kernel
 * lock getting page table operations atomic even if we drop the page
 * lock before returning.
 */
struct page *lookup_swap_cache(swp_entry_t entry, struct vm_area_struct *vma,
			       unsigned long addr)
{
	struct page *page;
	struct swap_info_struct *si;

	si = get_swap_device(entry);
	if (!si)
		return NULL;
	page = find_get_page(swap_address_space(entry), swp_offset(entry));
	put_swap_device(si);

	INC_CACHE_INFO(find_total);
	if (page) {
		bool vma_ra = swap_use_vma_readahead();
		bool readahead;

		INC_CACHE_INFO(find_success);
		/*
		 * At the moment, we don't support PG_readahead for anon THP
		 * so let's bail out rather than confusing the readahead stat.
		 */
		if (unlikely(PageTransCompound(page)))
			return page;

		readahead = TestClearPageReadahead(page);
		if (vma && vma_ra) {
			unsigned long ra_val;
			int win, hits;

			ra_val = GET_SWAP_RA_VAL(vma);
			win = SWAP_RA_WIN(ra_val);
			hits = SWAP_RA_HITS(ra_val);
			if (readahead)
				hits = min_t(int, hits + 1, SWAP_RA_HITS_MAX);
			atomic_long_set(&vma->swap_readahead_info,
					SWAP_RA_VAL(addr, win, hits));
		}

		if (readahead) {
			count_vm_event(SWAP_RA_HIT);
			if (!vma || !vma_ra)
				atomic_inc(&swapin_readahead_hits);
		}
	}

	return page;
}

/**
 * find_get_incore_page - Find and get a page from the page or swap caches.
 * @mapping: The address_space to search.
 * @index: The page cache index.
 *
 * This differs from find_get_page() in that it will also look for the
 * page in the swap cache.
 *
 * Return: The found page or %NULL.
 */
struct page *find_get_incore_page(struct address_space *mapping, pgoff_t index)
{
	swp_entry_t swp;
	struct swap_info_struct *si;
	struct page *page = pagecache_get_page(mapping, index,
						FGP_ENTRY | FGP_HEAD, 0);

	if (!page)
		return page;
	if (!xa_is_value(page))
		return find_subpage(page, index);
	if (!shmem_mapping(mapping))
		return NULL;

	swp = radix_to_swp_entry(page);
	/* Prevent swapoff from happening to us */
	si = get_swap_device(swp);
	if (!si)
		return NULL;
	page = find_get_page(swap_address_space(swp), swp_offset(swp));
	put_swap_device(si);
	return page;
}
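
/*
 * Allocate a page for @entry, claim the slot with SWAP_HAS_CACHE and add
 * the page to the swap cache.  Returns the page locked, with
 * *new_page_allocated saying whether it was freshly allocated (in which
 * case the caller is expected to start the swap-in I/O) or already found
 * in the cache.
 */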
struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
			struct vm_area_struct *vma, unsigned long addr,
			bool *new_page_allocated)
{
	struct swap_info_struct *si;
	struct page *page;
	void *shadow = NULL;

	*new_page_allocated = false;

	for (;;) {
		int err;
		/*
		 * First check the swap cache.  Since this is normally
		 * called after lookup_swap_cache() failed, re-calling
		 * that would confuse statistics.
		 */
		si = get_swap_device(entry);
		if (!si)
			return NULL;
		page = find_get_page(swap_address_space(entry),
				     swp_offset(entry));
		put_swap_device(si);
		if (page)
			return page;

		/*
		 * Just skip read ahead for unused swap slot.
		 * During swap_off when swap_slot_cache is disabled,
		 * we have to handle the race between putting
		 * swap entry in swap cache and marking swap slot
		 * as SWAP_HAS_CACHE.  That's done in later part of code or
		 * else swap_off will be aborted if we return NULL.
		 */
		if (!__swp_swapcount(entry) && swap_slot_cache_enabled)
			return NULL;

		/*
		 * Get a new page to read into from swap.  Allocate it now,
		 * before marking swap_map SWAP_HAS_CACHE, when -EEXIST will
		 * cause any racers to loop around until we add it to cache.
		 */
		page = alloc_page_vma(gfp_mask, vma, addr);
		if (!page)
			return NULL;

		/*
		 * Swap entry may have been freed since our caller observed it.
		 */
		err = swapcache_prepare(entry);
		if (!err)
			break;

		put_page(page);
		if (err != -EEXIST)
			return NULL;

		/*
		 * We might race against __delete_from_swap_cache(), and
		 * stumble across a swap_map entry whose SWAP_HAS_CACHE
		 * has not yet been cleared.  Or race against another
		 * __read_swap_cache_async(), which has set SWAP_HAS_CACHE
		 * in swap_map, but not yet added its page to swap cache.
		 */
		schedule_timeout_uninterruptible(1);
	}

	/*
	 * The swap entry is ours to swap in. Prepare the new page.
	 */

	__SetPageLocked(page);
	__SetPageSwapBacked(page);

	if (mem_cgroup_swapin_charge_page(page, NULL, gfp_mask, entry))
		goto fail_unlock;

	/* May fail (-ENOMEM) if XArray node allocation failed. */
	if (add_to_swap_cache(page, entry, gfp_mask & GFP_RECLAIM_MASK, &shadow))
		goto fail_unlock;

	mem_cgroup_swapin_uncharge_swap(entry);

	if (shadow)
		workingset_refault(page, shadow);

	/* Caller will initiate read into locked page */
	lru_cache_add(page);
	*new_page_allocated = true;
	return page;

fail_unlock:
	put_swap_page(page, entry);
	unlock_page(page);
	put_page(page);
	return NULL;
}

/*
 * Locate a page of swap in physical memory, reserving swap cache space
 * and reading the disk if it is not already cached.
 * A failure return means that either the page allocation failed or that
 * the swap entry is no longer in use.
 */
struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
		struct vm_area_struct *vma, unsigned long addr, bool do_poll)
{
	bool page_was_allocated;
	struct page *retpage = __read_swap_cache_async(entry, gfp_mask,
			vma, addr, &page_was_allocated);

	if (page_was_allocated)
		swap_readpage(retpage, do_poll);

	return retpage;
}
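
/*
 * Readahead window sizing: grow the window with recent readahead hits,
 * probe with a minimal window when there is no hit history, round up to
 * a power of two, and never shrink below half of the previous window.
 */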
static unsigned int __swapin_nr_pages(unsigned long prev_offset,
				      unsigned long offset,
				      int hits,
				      int max_pages,
				      int prev_win)
{
	unsigned int pages, last_ra;

	/*
	 * This heuristic has been found to work well on both sequential and
	 * random loads, swapping to hard disk or to SSD: please don't ask
	 * what the "+ 2" means, it just happens to work well, that's all.
	 */
	pages = hits + 2;
	if (pages == 2) {
		/*
		 * We can have no readahead hits to judge by: but must not get
		 * stuck here forever, so check for an adjacent offset instead
		 * (and don't even bother to check whether swap type is same).
		 */
		if (offset != prev_offset + 1 && offset != prev_offset - 1)
			pages = 1;
	} else {
		unsigned int roundup = 4;
		while (roundup < pages)
			roundup <<= 1;
		pages = roundup;
	}

	if (pages > max_pages)
		pages = max_pages;

	/* Don't shrink readahead too fast */
	last_ra = prev_win / 2;
	if (pages < last_ra)
		pages = last_ra;

	return pages;
}

static unsigned long swapin_nr_pages(unsigned long offset)
{
	static unsigned long prev_offset;
	unsigned int hits, pages, max_pages;
	static atomic_t last_readahead_pages;

	max_pages = 1 << READ_ONCE(page_cluster);
	if (max_pages <= 1)
		return 1;

	hits = atomic_xchg(&swapin_readahead_hits, 0);
	pages = __swapin_nr_pages(READ_ONCE(prev_offset), offset, hits,
				  max_pages,
				  atomic_read(&last_readahead_pages));
	if (!hits)
		WRITE_ONCE(prev_offset, offset);
	atomic_set(&last_readahead_pages, pages);

	return pages;
}

/**
 * swap_cluster_readahead - swap in pages in hope we need them soon
 * @entry: swap entry of this memory
 * @gfp_mask: memory allocation flags
 * @vmf: fault information
 *
 * Returns the struct page for entry and addr, after queueing swapin.
 *
 * Primitive swap readahead code. We simply read an aligned block of
 * (1 << page_cluster) entries in the swap area. This method is chosen
 * because it doesn't cost us any seek time.  We also make sure to queue
 * the 'original' request together with the readahead ones...
 *
 * This has been extended to use the NUMA policies from the mm triggering
 * the readahead.
 *
 * Caller must hold read mmap_lock if vmf->vma is not NULL.
 */
struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask,
				struct vm_fault *vmf)
{
	struct page *page;
	unsigned long entry_offset = swp_offset(entry);
	unsigned long offset = entry_offset;
	unsigned long start_offset, end_offset;
	unsigned long mask;
	struct swap_info_struct *si = swp_swap_info(entry);
	struct blk_plug plug;
	bool do_poll = true, page_allocated;
	struct vm_area_struct *vma = vmf->vma;
	unsigned long addr = vmf->address;

	mask = swapin_nr_pages(offset) - 1;
	if (!mask)
		goto skip;

	do_poll = false;
	/* Read a page_cluster sized and aligned cluster around offset. */
	start_offset = offset & ~mask;
	end_offset = offset | mask;
	if (!start_offset)	/* First page is swap header. */
		start_offset++;
	if (end_offset >= si->max)
		end_offset = si->max - 1;

	blk_start_plug(&plug);
	for (offset = start_offset; offset <= end_offset ; offset++) {
		/* Ok, do the async read-ahead now */
		page = __read_swap_cache_async(
			swp_entry(swp_type(entry), offset),
			gfp_mask, vma, addr, &page_allocated);
		if (!page)
			continue;
		if (page_allocated) {
			swap_readpage(page, false);
			if (offset != entry_offset) {
				SetPageReadahead(page);
				count_vm_event(SWAP_RA);
			}
		}
		put_page(page);
	}
	blk_finish_plug(&plug);

	lru_add_drain();	/* Push any new pages onto the LRU now */
skip:
	return read_swap_cache_async(entry, gfp_mask, vma, addr, do_poll);
}
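
/*
 * Each swap device gets an array of address_spaces, one per
 * SWAP_ADDRESS_SPACE_PAGES worth of swap slots, so that lookups and lock
 * contention are spread over several smaller trees.
 */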
int init_swap_address_space(unsigned int type, unsigned long nr_pages)
{
	struct address_space *spaces, *space;
	unsigned int i, nr;

	nr = DIV_ROUND_UP(nr_pages, SWAP_ADDRESS_SPACE_PAGES);
	spaces = kvcalloc(nr, sizeof(struct address_space), GFP_KERNEL);
	if (!spaces)
		return -ENOMEM;
	for (i = 0; i < nr; i++) {
		space = spaces + i;
		xa_init_flags(&space->i_pages, XA_FLAGS_LOCK_IRQ);
		atomic_set(&space->i_mmap_writable, 0);
		space->a_ops = &swap_aops;
		/* swap cache doesn't use writeback related tags */
		mapping_set_no_writeback_tags(space);
	}
	nr_swapper_spaces[type] = nr;
	swapper_spaces[type] = spaces;

	return 0;
}

void exit_swap_address_space(unsigned int type)
{
	int i;
	struct address_space *spaces = swapper_spaces[type];

	for (i = 0; i < nr_swapper_spaces[type]; i++)
		VM_WARN_ON_ONCE(!mapping_empty(&spaces[i]));
	kvfree(spaces);
	nr_swapper_spaces[type] = 0;
	swapper_spaces[type] = NULL;
}
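
/*
 * Clamp the readahead window [lpfn, rpfn) to the boundaries of the VMA
 * and of the PMD covering the faulting address, returning the result as
 * virtual page numbers in *start and *end.
 */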
static inline void swap_ra_clamp_pfn(struct vm_area_struct *vma,
				     unsigned long faddr,
				     unsigned long lpfn,
				     unsigned long rpfn,
				     unsigned long *start,
				     unsigned long *end)
{
	*start = max3(lpfn, PFN_DOWN(vma->vm_start),
		      PFN_DOWN(faddr & PMD_MASK));
	*end = min3(rpfn, PFN_DOWN(vma->vm_end),
		    PFN_DOWN((faddr & PMD_MASK) + PMD_SIZE));
}
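
/*
 * Compute the VMA readahead window around the faulting address and
 * record the PTEs covering it in @ra_info; on 32-bit the PTEs are copied
 * out, since the mapped page table may be unmapped again before they are
 * consumed.
 */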
static void swap_ra_info(struct vm_fault *vmf,
			struct vma_swap_readahead *ra_info)
{
	struct vm_area_struct *vma = vmf->vma;
	unsigned long ra_val;
	unsigned long faddr, pfn, fpfn;
	unsigned long start, end;
	pte_t *pte, *orig_pte;
	unsigned int max_win, hits, prev_win, win, left;
#ifndef CONFIG_64BIT
	pte_t *tpte;
#endif
	max_win = 1 << min_t(unsigned int, READ_ONCE(page_cluster),
			     SWAP_RA_ORDER_CEILING);
	if (max_win == 1) {
		ra_info->win = 1;
		return;
	}

	faddr = vmf->address;
	orig_pte = pte = pte_offset_map(vmf->pmd, faddr);

	fpfn = PFN_DOWN(faddr);
	ra_val = GET_SWAP_RA_VAL(vma);
	pfn = PFN_DOWN(SWAP_RA_ADDR(ra_val));
	prev_win = SWAP_RA_WIN(ra_val);
	hits = SWAP_RA_HITS(ra_val);
	ra_info->win = win = __swapin_nr_pages(pfn, fpfn, hits,
					       max_win, prev_win);
	atomic_long_set(&vma->swap_readahead_info,
			SWAP_RA_VAL(faddr, win, 0));
	if (win == 1) {
		pte_unmap(orig_pte);
		return;
	}

	/* Copy the PTEs because the page table may be unmapped */
	if (fpfn == pfn + 1)
		swap_ra_clamp_pfn(vma, faddr, fpfn, fpfn + win, &start, &end);
	else if (pfn == fpfn + 1)
		swap_ra_clamp_pfn(vma, faddr, fpfn - win + 1, fpfn + 1,
				  &start, &end);
	else {
		left = (win - 1) / 2;
		swap_ra_clamp_pfn(vma, faddr, fpfn - left, fpfn + win - left,
				  &start, &end);
	}
	ra_info->nr_pte = end - start;
	ra_info->offset = fpfn - start;
	pte -= ra_info->offset;
#ifdef CONFIG_64BIT
	ra_info->ptes = pte;
#else
	tpte = ra_info->ptes;
	for (pfn = start; pfn != end; pfn++)
		*tpte++ = *pte++;
#endif
	pte_unmap(orig_pte);
}

/**
 * swap_vma_readahead - swap in pages in hope we need them soon
 * @fentry: swap entry of this memory
 * @gfp_mask: memory allocation flags
 * @vmf: fault information
 *
 * Returns the struct page for entry and addr, after queueing swapin.
 *
 * Primitive swap readahead code. We simply read in a few pages whose
 * virtual addresses are around the fault address in the same vma.
 *
 * Caller must hold read mmap_lock if vmf->vma is not NULL.
 *
 */
static struct page *swap_vma_readahead(swp_entry_t fentry, gfp_t gfp_mask,
				       struct vm_fault *vmf)
{
	struct blk_plug plug;
	struct vm_area_struct *vma = vmf->vma;
	struct page *page;
	pte_t *pte, pentry;
	swp_entry_t entry;
	unsigned int i;
	bool page_allocated;
	struct vma_swap_readahead ra_info = {
		.win = 1,
	};

	swap_ra_info(vmf, &ra_info);
	if (ra_info.win == 1)
		goto skip;

	blk_start_plug(&plug);
	for (i = 0, pte = ra_info.ptes; i < ra_info.nr_pte;
	     i++, pte++) {
		pentry = *pte;
		if (pte_none(pentry))
			continue;
		if (pte_present(pentry))
			continue;
		entry = pte_to_swp_entry(pentry);
		if (unlikely(non_swap_entry(entry)))
			continue;
		page = __read_swap_cache_async(entry, gfp_mask, vma,
					       vmf->address, &page_allocated);
		if (!page)
			continue;
		if (page_allocated) {
			swap_readpage(page, false);
			if (i != ra_info.offset) {
				SetPageReadahead(page);
				count_vm_event(SWAP_RA);
			}
		}
		put_page(page);
	}
	blk_finish_plug(&plug);
	lru_add_drain();
skip:
	return read_swap_cache_async(fentry, gfp_mask, vma, vmf->address,
				     ra_info.win == 1);
}

/**
 * swapin_readahead - swap in pages in hope we need them soon
 * @entry: swap entry of this memory
 * @gfp_mask: memory allocation flags
 * @vmf: fault information
 *
 * Returns the struct page for entry and addr, after queueing swapin.
 *
 * This is the main entry point for swap readahead. Depending on
 * configuration, it reads ahead blocks either cluster-based (i.e.
 * physical disk based) or vma-based (i.e. based on virtual addresses
 * around the faulting address).
 */
struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask,
				struct vm_fault *vmf)
{
	return swap_use_vma_readahead() ?
	       swap_vma_readahead(entry, gfp_mask, vmf) :
	       swap_cluster_readahead(entry, gfp_mask, vmf);
}
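
/*
 * Runtime knob: /sys/kernel/mm/swap/vma_ra_enabled switches VMA-based
 * swap readahead on or off.
 */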
#ifdef CONFIG_SYSFS
static ssize_t vma_ra_enabled_show(struct kobject *kobj,
				     struct kobj_attribute *attr, char *buf)
{
	return sysfs_emit(buf, "%s\n",
			  enable_vma_readahead ? "true" : "false");
}
static ssize_t vma_ra_enabled_store(struct kobject *kobj,
				      struct kobj_attribute *attr,
				      const char *buf, size_t count)
{
	if (!strncmp(buf, "true", 4) || !strncmp(buf, "1", 1))
		enable_vma_readahead = true;
	else if (!strncmp(buf, "false", 5) || !strncmp(buf, "0", 1))
		enable_vma_readahead = false;
	else
		return -EINVAL;

	return count;
}
static struct kobj_attribute vma_ra_enabled_attr =
	__ATTR(vma_ra_enabled, 0644, vma_ra_enabled_show,
	       vma_ra_enabled_store);

static struct attribute *swap_attrs[] = {
	&vma_ra_enabled_attr.attr,
	NULL,
};

static const struct attribute_group swap_attr_group = {
	.attrs = swap_attrs,
};

static int __init swap_init_sysfs(void)
{
	int err;
	struct kobject *swap_kobj;

	swap_kobj = kobject_create_and_add("swap", mm_kobj);
	if (!swap_kobj) {
		pr_err("failed to create swap kobject\n");
		return -ENOMEM;
	}
	err = sysfs_create_group(swap_kobj, &swap_attr_group);
	if (err) {
		pr_err("failed to register swap group\n");
		goto delete_obj;
	}
	return 0;

delete_obj:
	kobject_put(swap_kobj);
	return err;
}
subsys_initcall(swap_init_sysfs);
#endif