// SPDX-License-Identifier: GPL-2.0-only
/*
 *  linux/mm/swapfile.c
 *
 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 *  Swap reorganised 29.12.95, Stephen Tweedie
 */
#include <linux/mm.h>
#include <linux/sched/mm.h>
#include <linux/sched/task.h>
#include <linux/hugetlb.h>
#include <linux/mman.h>
#include <linux/slab.h>
#include <linux/kernel_stat.h>
#include <linux/swap.h>
#include <linux/vmalloc.h>
#include <linux/pagemap.h>
#include <linux/namei.h>
#include <linux/shmem_fs.h>
#include <linux/blkdev.h>
#include <linux/random.h>
#include <linux/writeback.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/init.h>
#include <linux/ksm.h>
#include <linux/rmap.h>
#include <linux/security.h>
#include <linux/backing-dev.h>
#include <linux/mutex.h>
#include <linux/capability.h>
#include <linux/syscalls.h>
#include <linux/memcontrol.h>
#include <linux/poll.h>
#include <linux/oom.h>
#include <linux/frontswap.h>
#include <linux/swapfile.h>
#include <linux/export.h>
#include <linux/swap_slots.h>
#include <linux/sort.h>
#include <linux/completion.h>

#include <asm/tlbflush.h>
#include <linux/swapops.h>
#include <linux/swap_cgroup.h>

static bool swap_count_continued(struct swap_info_struct *, pgoff_t,
				 unsigned char);
static void free_swap_count_continuations(struct swap_info_struct *);

DEFINE_SPINLOCK(swap_lock);
static unsigned int nr_swapfiles;
atomic_long_t nr_swap_pages;
/*
 * Some modules use swappable objects and may try to swap them out under
 * memory pressure (via the shrinker). Before doing so, they may wish to
 * check to see if any swap space is available.
 */
EXPORT_SYMBOL_GPL(nr_swap_pages);
/* protected with swap_lock. reading in vm_swap_full() doesn't need lock */
long total_swap_pages;
static int least_priority = -1;

static const char Bad_file[] = "Bad swap file entry ";
static const char Unused_file[] = "Unused swap file entry ";
static const char Bad_offset[] = "Bad swap offset entry ";
static const char Unused_offset[] = "Unused swap offset entry ";

/*
 * all active swap_info_structs
 * protected with swap_lock, and ordered by priority.
 */
PLIST_HEAD(swap_active_head);

/*
 * all available (active, not full) swap_info_structs
 * protected with swap_avail_lock, ordered by priority.
 * This is used by get_swap_page() instead of swap_active_head
 * because swap_active_head includes all swap_info_structs,
 * but get_swap_page() doesn't need to look at full ones.
 * This uses its own lock instead of swap_lock because when a
 * swap_info_struct changes between not-full/full, it needs to
 * add/remove itself to/from this list, but the swap_info_struct->lock
 * is held and the locking order requires swap_lock to be taken
 * before any swap_info_struct->lock.
 */
static struct plist_head *swap_avail_heads;
static DEFINE_SPINLOCK(swap_avail_lock);

struct swap_info_struct *swap_info[MAX_SWAPFILES];

static DEFINE_MUTEX(swapon_mutex);

static DECLARE_WAIT_QUEUE_HEAD(proc_poll_wait);
/* Activity counter to indicate that a swapon or swapoff has occurred */
static atomic_t proc_poll_event = ATOMIC_INIT(0);

atomic_t nr_rotate_swap = ATOMIC_INIT(0);
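
/* Map a swap device number to its swap_info_struct; NULL if out of range
 * or the slot is not in use. */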
static struct swap_info_struct *swap_type_to_swap_info(int type)
{
	if (type >= MAX_SWAPFILES)
		return NULL;
	return READ_ONCE(swap_info[type]);	/* rcu_dereference() */
}

static inline unsigned char swap_count(unsigned char ent)
{
	return ent & ~SWAP_HAS_CACHE;	/* may include COUNT_CONTINUED flag */
}

/* Reclaim the swap entry anyway if possible */
#define TTRS_ANYWAY		0x1
/*
 * Reclaim the swap entry if there are no more mappings of the
 * corresponding page
 */
#define TTRS_UNMAPPED		0x2
/* Reclaim the swap entry if swap is getting full */
#define TTRS_FULL		0x4

/* returns 1 if swap entry is freed */
static int __try_to_reclaim_swap(struct swap_info_struct *si,
				 unsigned long offset, unsigned long flags)
{
	swp_entry_t entry = swp_entry(si->type, offset);
	struct page *page;
	int ret = 0;

	page = find_get_page(swap_address_space(entry), offset);
	if (!page)
		return 0;
	/*
	 * When this function is called from scan_swap_map_slots() it is
	 * being called by vmscan.c while reclaiming pages, so a page lock
	 * may already be held; we must use trylock to avoid deadlock.
	 * This is a special case: in usual operations use
	 * try_to_free_swap() with an explicit lock_page().
	 */
	if (trylock_page(page)) {
		if ((flags & TTRS_ANYWAY) ||
		    ((flags & TTRS_UNMAPPED) && !page_mapped(page)) ||
		    ((flags & TTRS_FULL) && mem_cgroup_swap_full(page)))
			ret = try_to_free_swap(page);
		unlock_page(page);
	}
	put_page(page);
	return ret;
}

static inline struct swap_extent *first_se(struct swap_info_struct *sis)
{
	struct rb_node *rb = rb_first(&sis->swap_extent_root);
	return rb_entry(rb, struct swap_extent, rb_node);
}

static inline struct swap_extent *next_se(struct swap_extent *se)
{
	struct rb_node *rb = rb_next(&se->rb_node);
	return rb ? rb_entry(rb, struct swap_extent, rb_node) : NULL;
}

/*
 * swapon tells the device that all the old swap contents can be discarded,
 * to allow the swap device to optimize its wear-levelling.
 */
static int discard_swap(struct swap_info_struct *si)
{
	struct swap_extent *se;
	sector_t start_block;
	sector_t nr_blocks;
	int err = 0;

	/* Do not discard the swap header page! */
	se = first_se(si);
	start_block = (se->start_block + 1) << (PAGE_SHIFT - 9);
	nr_blocks = ((sector_t)se->nr_pages - 1) << (PAGE_SHIFT - 9);
	if (nr_blocks) {
		err = blkdev_issue_discard(si->bdev, start_block,
				nr_blocks, GFP_KERNEL, 0);
		if (err)
			return err;
		cond_resched();
	}

	for (se = next_se(se); se; se = next_se(se)) {
		start_block = se->start_block << (PAGE_SHIFT - 9);
		nr_blocks = (sector_t)se->nr_pages << (PAGE_SHIFT - 9);

		err = blkdev_issue_discard(si->bdev, start_block,
				nr_blocks, GFP_KERNEL, 0);
		if (err)
			break;

		cond_resched();
	}
	return err;		/* That will often be -EOPNOTSUPP */
}
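
/*
 * Find the swap extent that covers @offset in the rb-tree of extents,
 * which is ordered by start_page.  The caller must ensure the offset
 * is backed by an extent.
 */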
static struct swap_extent *
offset_to_swap_extent(struct swap_info_struct *sis, unsigned long offset)
{
	struct swap_extent *se;
	struct rb_node *rb;

	rb = sis->swap_extent_root.rb_node;
	while (rb) {
		se = rb_entry(rb, struct swap_extent, rb_node);
		if (offset < se->start_page)
			rb = rb->rb_left;
		else if (offset >= se->start_page + se->nr_pages)
			rb = rb->rb_right;
		else
			return se;
	}
	/* It *must* be present */
	BUG();
}
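
/* Return the 512-byte device sector that backs this swapcache page. */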
sector_t swap_page_sector(struct page *page)
{
	struct swap_info_struct *sis = page_swap_info(page);
	struct swap_extent *se;
	sector_t sector;
	pgoff_t offset;

	offset = __page_file_index(page);
	se = offset_to_swap_extent(sis, offset);
	sector = se->start_block + (offset - se->start_page);
	return sector << (PAGE_SHIFT - 9);
}

/*
 * swap allocation tells the device that a cluster of swap can now be
 * discarded, to allow the swap device to optimize its wear-levelling.
 */
static void discard_swap_cluster(struct swap_info_struct *si,
				 pgoff_t start_page, pgoff_t nr_pages)
{
	struct swap_extent *se = offset_to_swap_extent(si, start_page);

	while (nr_pages) {
		pgoff_t offset = start_page - se->start_page;
		sector_t start_block = se->start_block + offset;
		sector_t nr_blocks = se->nr_pages - offset;

		if (nr_blocks > nr_pages)
			nr_blocks = nr_pages;
		start_page += nr_blocks;
		nr_pages -= nr_blocks;

		start_block <<= PAGE_SHIFT - 9;
		nr_blocks <<= PAGE_SHIFT - 9;
		if (blkdev_issue_discard(si->bdev, start_block,
					nr_blocks, GFP_NOIO, 0))
			break;

		se = next_se(se);
	}
}

#ifdef CONFIG_THP_SWAP
#define SWAPFILE_CLUSTER	HPAGE_PMD_NR

#define swap_entry_size(size)	(size)
#else
#define SWAPFILE_CLUSTER	256

/*
 * Define swap_entry_size() as constant to let the compiler optimize
 * out some code if !CONFIG_THP_SWAP
 */
#define swap_entry_size(size)	1
#endif
#define LATENCY_LIMIT		256
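
/*
 * A swap_cluster_info's "data" field is overloaded: it holds the
 * cluster's usage count while the cluster is in use, or the index of the
 * next cluster while it sits on a free/discard list; "flags" records
 * which interpretation applies.
 */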
static inline void cluster_set_flag(struct swap_cluster_info *info,
	unsigned int flag)
{
	info->flags = flag;
}

static inline unsigned int cluster_count(struct swap_cluster_info *info)
{
	return info->data;
}

static inline void cluster_set_count(struct swap_cluster_info *info,
				     unsigned int c)
{
	info->data = c;
}

static inline void cluster_set_count_flag(struct swap_cluster_info *info,
					  unsigned int c, unsigned int f)
{
	info->flags = f;
	info->data = c;
}

static inline unsigned int cluster_next(struct swap_cluster_info *info)
{
	return info->data;
}

static inline void cluster_set_next(struct swap_cluster_info *info,
				    unsigned int n)
{
	info->data = n;
}

static inline void cluster_set_next_flag(struct swap_cluster_info *info,
					 unsigned int n, unsigned int f)
{
	info->flags = f;
	info->data = n;
}

static inline bool cluster_is_free(struct swap_cluster_info *info)
{
	return info->flags & CLUSTER_FLAG_FREE;
}

static inline bool cluster_is_null(struct swap_cluster_info *info)
{
	return info->flags & CLUSTER_FLAG_NEXT_NULL;
}

static inline void cluster_set_null(struct swap_cluster_info *info)
{
	info->flags = CLUSTER_FLAG_NEXT_NULL;
	info->data = 0;
}

static inline bool cluster_is_huge(struct swap_cluster_info *info)
{
	if (IS_ENABLED(CONFIG_THP_SWAP))
		return info->flags & CLUSTER_FLAG_HUGE;
	return false;
}

static inline void cluster_clear_huge(struct swap_cluster_info *info)
{
	info->flags &= ~CLUSTER_FLAG_HUGE;
}

static inline struct swap_cluster_info *lock_cluster(struct swap_info_struct *si,
						     unsigned long offset)
{
	struct swap_cluster_info *ci;

	ci = si->cluster_info;
	if (ci) {
		ci += offset / SWAPFILE_CLUSTER;
		spin_lock(&ci->lock);
	}
	return ci;
}

static inline void unlock_cluster(struct swap_cluster_info *ci)
{
	if (ci)
		spin_unlock(&ci->lock);
}

/*
 * Determine the locking method in use for this device.  Return
 * swap_cluster_info if SSD-style cluster-based locking is in place.
 */
static inline struct swap_cluster_info *lock_cluster_or_swap_info(
		struct swap_info_struct *si, unsigned long offset)
{
	struct swap_cluster_info *ci;

	/* Try to use fine-grained SSD-style locking if available: */
	ci = lock_cluster(si, offset);
	/* Otherwise, fall back to traditional, coarse locking: */
	if (!ci)
		spin_lock(&si->lock);

	return ci;
}

static inline void unlock_cluster_or_swap_info(struct swap_info_struct *si,
					       struct swap_cluster_info *ci)
{
	if (ci)
		unlock_cluster(ci);
	else
		spin_unlock(&si->lock);
}
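
/*
 * The free and discard cluster lists are singly linked through the
 * clusters' "data" fields; a swap_cluster_list just tracks the head and
 * tail of such a chain.
 */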
static inline bool cluster_list_empty(struct swap_cluster_list *list)
{
	return cluster_is_null(&list->head);
}

static inline unsigned int cluster_list_first(struct swap_cluster_list *list)
{
	return cluster_next(&list->head);
}

static void cluster_list_init(struct swap_cluster_list *list)
{
	cluster_set_null(&list->head);
	cluster_set_null(&list->tail);
}

static void cluster_list_add_tail(struct swap_cluster_list *list,
				  struct swap_cluster_info *ci,
				  unsigned int idx)
{
	if (cluster_list_empty(list)) {
		cluster_set_next_flag(&list->head, idx, 0);
		cluster_set_next_flag(&list->tail, idx, 0);
	} else {
		struct swap_cluster_info *ci_tail;
		unsigned int tail = cluster_next(&list->tail);

		/*
		 * Nested cluster lock, but both cluster locks are
		 * only acquired when we held swap_info_struct->lock
		 */
		ci_tail = ci + tail;
		spin_lock_nested(&ci_tail->lock, SINGLE_DEPTH_NESTING);
		cluster_set_next(ci_tail, idx);
		spin_unlock(&ci_tail->lock);
		cluster_set_next_flag(&list->tail, idx, 0);
	}
}

static unsigned int cluster_list_del_first(struct swap_cluster_list *list,
					   struct swap_cluster_info *ci)
{
	unsigned int idx;

	idx = cluster_next(&list->head);
	if (cluster_next(&list->tail) == idx) {
		cluster_set_null(&list->head);
		cluster_set_null(&list->tail);
	} else
		cluster_set_next_flag(&list->head,
				      cluster_next(&ci[idx]), 0);

	return idx;
}

/* Add a cluster to discard list and schedule it to do discard */
static void swap_cluster_schedule_discard(struct swap_info_struct *si,
		unsigned int idx)
{
	/*
	 * If scan_swap_map_slots() can't find a free cluster, it will check
	 * si->swap_map directly. To make sure the discarding cluster isn't
	 * taken by scan_swap_map_slots(), mark the swap entries bad (occupied).
	 * They will be cleared after the discard is done.
	 */
	memset(si->swap_map + idx * SWAPFILE_CLUSTER, SWAP_MAP_BAD,
			SWAPFILE_CLUSTER);

	cluster_list_add_tail(&si->discard_clusters, si->cluster_info, idx);

	schedule_work(&si->discard_work);
}

static void __free_cluster(struct swap_info_struct *si, unsigned long idx)
{
	struct swap_cluster_info *ci = si->cluster_info;

	cluster_set_flag(ci + idx, CLUSTER_FLAG_FREE);
	cluster_list_add_tail(&si->free_clusters, ci, idx);
}

/*
 * Do the discards that have been scheduled. After a cluster discard is
 * finished, the cluster will be added to the free cluster list. The
 * caller should hold si->lock.
 */
static void swap_do_scheduled_discard(struct swap_info_struct *si)
{
	struct swap_cluster_info *info, *ci;
	unsigned int idx;

	info = si->cluster_info;

	while (!cluster_list_empty(&si->discard_clusters)) {
		idx = cluster_list_del_first(&si->discard_clusters, info);
		spin_unlock(&si->lock);

		discard_swap_cluster(si, idx * SWAPFILE_CLUSTER,
				SWAPFILE_CLUSTER);

		spin_lock(&si->lock);
		ci = lock_cluster(si, idx * SWAPFILE_CLUSTER);
		__free_cluster(si, idx);
		memset(si->swap_map + idx * SWAPFILE_CLUSTER,
				0, SWAPFILE_CLUSTER);
		unlock_cluster(ci);
	}
}

static void swap_discard_work(struct work_struct *work)
{
	struct swap_info_struct *si;

	si = container_of(work, struct swap_info_struct, discard_work);

	spin_lock(&si->lock);
	swap_do_scheduled_discard(si);
	spin_unlock(&si->lock);
}
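
/*
 * percpu_ref release callback for si->users: runs once the last
 * reference to the swap device is dropped, letting swapoff proceed.
 */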
static void swap_users_ref_free(struct percpu_ref *ref)
{
	struct swap_info_struct *si;

	si = container_of(ref, struct swap_info_struct, users);
	complete(&si->comp);
}

static void alloc_cluster(struct swap_info_struct *si, unsigned long idx)
{
	struct swap_cluster_info *ci = si->cluster_info;

	VM_BUG_ON(cluster_list_first(&si->free_clusters) != idx);
	cluster_list_del_first(&si->free_clusters, ci);
	cluster_set_count_flag(ci + idx, 0, 0);
}

static void free_cluster(struct swap_info_struct *si, unsigned long idx)
{
	struct swap_cluster_info *ci = si->cluster_info + idx;

	VM_BUG_ON(cluster_count(ci) != 0);
	/*
	 * If the swap is discardable, prepare to discard the cluster
	 * instead of freeing it immediately. The cluster will be freed
	 * after discard.
	 */
	if ((si->flags & (SWP_WRITEOK | SWP_PAGE_DISCARD)) ==
	    (SWP_WRITEOK | SWP_PAGE_DISCARD)) {
		swap_cluster_schedule_discard(si, idx);
		return;
	}

	__free_cluster(si, idx);
}

/*
 * The cluster corresponding to page_nr will be used. The cluster will be
 * removed from free cluster list and its usage counter will be increased.
 */
static void inc_cluster_info_page(struct swap_info_struct *p,
	struct swap_cluster_info *cluster_info, unsigned long page_nr)
{
	unsigned long idx = page_nr / SWAPFILE_CLUSTER;

	if (!cluster_info)
		return;
	if (cluster_is_free(&cluster_info[idx]))
		alloc_cluster(p, idx);

	VM_BUG_ON(cluster_count(&cluster_info[idx]) >= SWAPFILE_CLUSTER);
	cluster_set_count(&cluster_info[idx],
		cluster_count(&cluster_info[idx]) + 1);
}

/*
 * The cluster corresponding to page_nr decreases one usage. If the usage
 * counter becomes 0, which means no page in the cluster is in use, we can
 * optionally discard the cluster and add it to the free cluster list.
 */
static void dec_cluster_info_page(struct swap_info_struct *p,
	struct swap_cluster_info *cluster_info, unsigned long page_nr)
{
	unsigned long idx = page_nr / SWAPFILE_CLUSTER;

	if (!cluster_info)
		return;

	VM_BUG_ON(cluster_count(&cluster_info[idx]) == 0);
	cluster_set_count(&cluster_info[idx],
		cluster_count(&cluster_info[idx]) - 1);
	if (cluster_count(&cluster_info[idx]) == 0)
		free_cluster(p, idx);
}

/*
 * It's possible that scan_swap_map_slots() uses a free cluster in the
 * middle of the free cluster list. Avoid such abuse to prevent list
 * corruption.
 */
static bool
scan_swap_map_ssd_cluster_conflict(struct swap_info_struct *si,
	unsigned long offset)
{
	struct percpu_cluster *percpu_cluster;
	bool conflict;

	offset /= SWAPFILE_CLUSTER;
	conflict = !cluster_list_empty(&si->free_clusters) &&
		offset != cluster_list_first(&si->free_clusters) &&
		cluster_is_free(&si->cluster_info[offset]);

	if (!conflict)
		return false;

	percpu_cluster = this_cpu_ptr(si->percpu_cluster);
	cluster_set_null(&percpu_cluster->index);
	return true;
}

/*
 * Try to get a swap entry from current cpu's swap entry pool (a cluster). This
 * might involve allocating a new cluster for current CPU too.
 */
static bool scan_swap_map_try_ssd_cluster(struct swap_info_struct *si,
	unsigned long *offset, unsigned long *scan_base)
{
	struct percpu_cluster *cluster;
	struct swap_cluster_info *ci;
	unsigned long tmp, max;

new_cluster:
	cluster = this_cpu_ptr(si->percpu_cluster);
	if (cluster_is_null(&cluster->index)) {
		if (!cluster_list_empty(&si->free_clusters)) {
			cluster->index = si->free_clusters.head;
			cluster->next = cluster_next(&cluster->index) *
					SWAPFILE_CLUSTER;
		} else if (!cluster_list_empty(&si->discard_clusters)) {
			/*
			 * we don't have a free cluster, but some clusters are
			 * being discarded; do the discard now and reclaim
			 * them, then reread cluster_next_cpu since we dropped
			 * si->lock
			 */
			swap_do_scheduled_discard(si);
			*scan_base = this_cpu_read(*si->cluster_next_cpu);
			*offset = *scan_base;
			goto new_cluster;
		} else
			return false;
	}

	/*
	 * Other CPUs can use our cluster if they can't find a free cluster;
	 * check if there is still a free entry in the cluster
	 */
	tmp = cluster->next;
	max = min_t(unsigned long, si->max,
		    (cluster_next(&cluster->index) + 1) * SWAPFILE_CLUSTER);
	if (tmp < max) {
		ci = lock_cluster(si, tmp);
		while (tmp < max) {
			if (!si->swap_map[tmp])
				break;
			tmp++;
		}
		unlock_cluster(ci);
	}
	if (tmp >= max) {
		cluster_set_null(&cluster->index);
		goto new_cluster;
	}
	cluster->next = tmp + 1;
	*offset = tmp;
	*scan_base = tmp;
	return true;
}

static void __del_from_avail_list(struct swap_info_struct *p)
{
	int nid;

	for_each_node(nid)
		plist_del(&p->avail_lists[nid], &swap_avail_heads[nid]);
}

static void del_from_avail_list(struct swap_info_struct *p)
{
	spin_lock(&swap_avail_lock);
	__del_from_avail_list(p);
	spin_unlock(&swap_avail_lock);
}
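
/*
 * Account for @nr_entries slots allocated at @offset: update the
 * lowest_bit/highest_bit scan hints and, if the device just became
 * full, drop it from the per-node avail lists.
 */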
static void swap_range_alloc(struct swap_info_struct *si, unsigned long offset,
			     unsigned int nr_entries)
{
	unsigned int end = offset + nr_entries - 1;

	if (offset == si->lowest_bit)
		si->lowest_bit += nr_entries;
	if (end == si->highest_bit)
		WRITE_ONCE(si->highest_bit, si->highest_bit - nr_entries);
	si->inuse_pages += nr_entries;
	if (si->inuse_pages == si->pages) {
		si->lowest_bit = si->max;
		si->highest_bit = 0;
		del_from_avail_list(si);
	}
}

static void add_to_avail_list(struct swap_info_struct *p)
{
	int nid;

	spin_lock(&swap_avail_lock);
	for_each_node(nid) {
		WARN_ON(!plist_node_empty(&p->avail_lists[nid]));
		plist_add(&p->avail_lists[nid], &swap_avail_heads[nid]);
	}
	spin_unlock(&swap_avail_lock);
}
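
/*
 * Account for @nr_entries slots freed at @offset: widen the scan hints,
 * put a formerly full device back on the avail lists, notify the block
 * driver, and invalidate frontswap and swap-cache shadow entries.
 */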
static void swap_range_free(struct swap_info_struct *si, unsigned long offset,
			    unsigned int nr_entries)
{
	unsigned long begin = offset;
	unsigned long end = offset + nr_entries - 1;
	void (*swap_slot_free_notify)(struct block_device *, unsigned long);

	if (offset < si->lowest_bit)
		si->lowest_bit = offset;
	if (end > si->highest_bit) {
		bool was_full = !si->highest_bit;

		WRITE_ONCE(si->highest_bit, end);
		if (was_full && (si->flags & SWP_WRITEOK))
			add_to_avail_list(si);
	}
	atomic_long_add(nr_entries, &nr_swap_pages);
	si->inuse_pages -= nr_entries;
	if (si->flags & SWP_BLKDEV)
		swap_slot_free_notify =
			si->bdev->bd_disk->fops->swap_slot_free_notify;
	else
		swap_slot_free_notify = NULL;
	while (offset <= end) {
		arch_swap_invalidate_page(si->type, offset);
		frontswap_invalidate_page(si->type, offset);
		if (swap_slot_free_notify)
			swap_slot_free_notify(si->bdev, offset);
		offset++;
	}
	clear_shadow_from_swap_cache(si->type, begin, end);
}

static void set_cluster_next(struct swap_info_struct *si, unsigned long next)
{
	unsigned long prev;

	if (!(si->flags & SWP_SOLIDSTATE)) {
		si->cluster_next = next;
		return;
	}

	prev = this_cpu_read(*si->cluster_next_cpu);
	/*
	 * If we crossed into another swap-address-space-sized, aligned
	 * chunk, choose a new chunk at random to reduce lock contention
	 * on the swap address space, if possible.
	 */
	if ((prev >> SWAP_ADDRESS_SPACE_SHIFT) !=
	    (next >> SWAP_ADDRESS_SPACE_SHIFT)) {
		/* No free swap slots available */
		if (si->highest_bit <= si->lowest_bit)
			return;
		next = si->lowest_bit +
			prandom_u32_max(si->highest_bit - si->lowest_bit + 1);
		next = ALIGN_DOWN(next, SWAP_ADDRESS_SPACE_PAGES);
		next = max_t(unsigned int, next, si->lowest_bit);
	}
	this_cpu_write(*si->cluster_next_cpu, next);
}
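
/*
 * Scan the swap map for up to @nr free slots, mark each allocated slot
 * with @usage, and store the entries in @slots[]. Returns the number of
 * slots allocated. Called with si->lock held.
 */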
static int scan_swap_map_slots(struct swap_info_struct *si,
			       unsigned char usage, int nr,
			       swp_entry_t slots[])
{
	struct swap_cluster_info *ci;
	unsigned long offset;
	unsigned long scan_base;
	unsigned long last_in_cluster = 0;
	int latency_ration = LATENCY_LIMIT;
	int n_ret = 0;
	bool scanned_many = false;

	/*
	 * We try to cluster swap pages by allocating them sequentially
	 * in swap.  Once we've allocated SWAPFILE_CLUSTER pages this
	 * way, however, we resort to first-free allocation, starting
	 * a new cluster.  This prevents us from scattering swap pages
	 * all over the entire swap partition, so that we reduce
	 * overall disk seek times between swap pages.  -- sct
	 * But we do now try to find an empty cluster.  -Andrea
	 * And we let swap pages go all over an SSD partition.  Hugh
	 */

	si->flags += SWP_SCANNING;
	/*
	 * Use percpu scan base for SSD to reduce lock contention on
	 * cluster and swap cache.  For HDD, sequential access is more
	 * important.
	 */
	if (si->flags & SWP_SOLIDSTATE)
		scan_base = this_cpu_read(*si->cluster_next_cpu);
	else
		scan_base = si->cluster_next;
	offset = scan_base;

	/* SSD algorithm */
	if (si->cluster_info) {
		if (!scan_swap_map_try_ssd_cluster(si, &offset, &scan_base))
			goto scan;
	} else if (unlikely(!si->cluster_nr--)) {
		if (si->pages - si->inuse_pages < SWAPFILE_CLUSTER) {
			si->cluster_nr = SWAPFILE_CLUSTER - 1;
			goto checks;
		}

		spin_unlock(&si->lock);

		/*
		 * If seek is expensive, start searching for new cluster from
		 * start of partition, to minimize the span of allocated swap.
		 * If seek is cheap, that is the SWP_SOLIDSTATE si->cluster_info
		 * case, just handled by scan_swap_map_try_ssd_cluster() above.
		 */
		scan_base = offset = si->lowest_bit;
		last_in_cluster = offset + SWAPFILE_CLUSTER - 1;

		/* Locate the first empty (unaligned) cluster */
		for (; last_in_cluster <= si->highest_bit; offset++) {
			if (si->swap_map[offset])
				last_in_cluster = offset + SWAPFILE_CLUSTER;
			else if (offset == last_in_cluster) {
				spin_lock(&si->lock);
				offset -= SWAPFILE_CLUSTER - 1;
				si->cluster_next = offset;
				si->cluster_nr = SWAPFILE_CLUSTER - 1;
				goto checks;
			}
			if (unlikely(--latency_ration < 0)) {
				cond_resched();
				latency_ration = LATENCY_LIMIT;
			}
		}

		offset = scan_base;
		spin_lock(&si->lock);
		si->cluster_nr = SWAPFILE_CLUSTER - 1;
	}

checks:
	if (si->cluster_info) {
		while (scan_swap_map_ssd_cluster_conflict(si, offset)) {
			/* take a break if we already got some slots */
			if (n_ret)
				goto done;
			if (!scan_swap_map_try_ssd_cluster(si, &offset,
							&scan_base))
				goto scan;
		}
	}
	if (!(si->flags & SWP_WRITEOK))
		goto no_page;
	if (!si->highest_bit)
		goto no_page;
	if (offset > si->highest_bit)
		scan_base = offset = si->lowest_bit;

	ci = lock_cluster(si, offset);
	/* reuse swap entry of cache-only swap if not busy. */
	if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) {
		int swap_was_freed;

		unlock_cluster(ci);
		spin_unlock(&si->lock);
		swap_was_freed = __try_to_reclaim_swap(si, offset, TTRS_ANYWAY);
		spin_lock(&si->lock);
		/* entry was freed successfully, try to use this again */
		if (swap_was_freed)
			goto checks;
		goto scan; /* check next one */
	}

	if (si->swap_map[offset]) {
		unlock_cluster(ci);
		if (!n_ret)
			goto scan;
		else
			goto done;
	}
	WRITE_ONCE(si->swap_map[offset], usage);
	inc_cluster_info_page(si, si->cluster_info, offset);
	unlock_cluster(ci);

	swap_range_alloc(si, offset, 1);
	slots[n_ret++] = swp_entry(si->type, offset);

	/* got enough slots or reach max slots? */
	if ((n_ret == nr) || (offset >= si->highest_bit))
		goto done;

	/* search for next available slot */

	/* time to take a break? */
	if (unlikely(--latency_ration < 0)) {
		if (n_ret)
			goto done;
		spin_unlock(&si->lock);
		cond_resched();
		spin_lock(&si->lock);
		latency_ration = LATENCY_LIMIT;
	}

	/* try to get more slots in cluster */
	if (si->cluster_info) {
		if (scan_swap_map_try_ssd_cluster(si, &offset, &scan_base))
			goto checks;
	} else if (si->cluster_nr && !si->swap_map[++offset]) {
		/* non-ssd case, still more slots in cluster? */
		--si->cluster_nr;
		goto checks;
	}

	/*
	 * Even if there's no free clusters available (fragmented),
	 * try to scan a little more quickly with lock held unless we
	 * have scanned too many slots already.
	 */
	if (!scanned_many) {
		unsigned long scan_limit;

		if (offset < scan_base)
			scan_limit = scan_base;
		else
			scan_limit = si->highest_bit;
		for (; offset <= scan_limit && --latency_ration > 0;
		     offset++) {
			if (!si->swap_map[offset])
				goto checks;
		}
	}

done:
	set_cluster_next(si, offset + 1);
	si->flags -= SWP_SCANNING;
	return n_ret;

scan:
	spin_unlock(&si->lock);
	while (++offset <= READ_ONCE(si->highest_bit)) {
		if (data_race(!si->swap_map[offset])) {
			spin_lock(&si->lock);
			goto checks;
		}
		if (vm_swap_full() &&
		    READ_ONCE(si->swap_map[offset]) == SWAP_HAS_CACHE) {
			spin_lock(&si->lock);
			goto checks;
		}
		if (unlikely(--latency_ration < 0)) {
			cond_resched();
			latency_ration = LATENCY_LIMIT;
			scanned_many = true;
		}
	}
	offset = si->lowest_bit;
	while (offset < scan_base) {
		if (data_race(!si->swap_map[offset])) {
			spin_lock(&si->lock);
			goto checks;
		}
		if (vm_swap_full() &&
		    READ_ONCE(si->swap_map[offset]) == SWAP_HAS_CACHE) {
			spin_lock(&si->lock);
			goto checks;
		}
		if (unlikely(--latency_ration < 0)) {
			cond_resched();
			latency_ration = LATENCY_LIMIT;
			scanned_many = true;
		}
		offset++;
	}
	spin_lock(&si->lock);

no_page:
	si->flags -= SWP_SCANNING;
	return n_ret;
}
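
/*
 * Allocate a whole free cluster for a huge-page-sized entry
 * (SWAPFILE_CLUSTER slots). Returns 1 on success, 0 if no free cluster
 * is available.
 */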
static int swap_alloc_cluster(struct swap_info_struct *si, swp_entry_t *slot)
{
	unsigned long idx;
	struct swap_cluster_info *ci;
	unsigned long offset;

	/*
	 * Should not even be attempting cluster allocations when huge
	 * page swap is disabled.  Warn and fail the allocation.
	 */
	if (!IS_ENABLED(CONFIG_THP_SWAP)) {
		VM_WARN_ON_ONCE(1);
		return 0;
	}

	if (cluster_list_empty(&si->free_clusters))
		return 0;

	idx = cluster_list_first(&si->free_clusters);
	offset = idx * SWAPFILE_CLUSTER;
	ci = lock_cluster(si, offset);
	alloc_cluster(si, idx);
	cluster_set_count_flag(ci, SWAPFILE_CLUSTER, CLUSTER_FLAG_HUGE);

	memset(si->swap_map + offset, SWAP_HAS_CACHE, SWAPFILE_CLUSTER);
	unlock_cluster(ci);
	swap_range_alloc(si, offset, SWAPFILE_CLUSTER);
	*slot = swp_entry(si->type, offset);

	return 1;
}

static void swap_free_cluster(struct swap_info_struct *si, unsigned long idx)
{
	unsigned long offset = idx * SWAPFILE_CLUSTER;
	struct swap_cluster_info *ci;

	ci = lock_cluster(si, offset);
	memset(si->swap_map + offset, 0, SWAPFILE_CLUSTER);
	cluster_set_count_flag(ci, 0, 0);
	free_cluster(si, idx);
	unlock_cluster(ci);
	swap_range_free(si, offset, SWAPFILE_CLUSTER);
}
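
/*
 * Allocate up to @n_goal swap entries of @entry_size pages each, trying
 * the available swap devices in priority order for this node. Returns
 * the number of entries stored in @swp_entries[].
 */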
int get_swap_pages(int n_goal, swp_entry_t swp_entries[], int entry_size)
{
	unsigned long size = swap_entry_size(entry_size);
	struct swap_info_struct *si, *next;
	long avail_pgs;
	int n_ret = 0;
	int node;

	/* Only single cluster request supported */
	WARN_ON_ONCE(n_goal > 1 && size == SWAPFILE_CLUSTER);

	spin_lock(&swap_avail_lock);
	avail_pgs = atomic_long_read(&nr_swap_pages) / size;
	if (avail_pgs <= 0) {
		spin_unlock(&swap_avail_lock);
		goto noswap;
	}

	n_goal = min3((long)n_goal, (long)SWAP_BATCH, avail_pgs);

	atomic_long_sub(n_goal * size, &nr_swap_pages);

start_over:
	node = numa_node_id();
	plist_for_each_entry_safe(si, next, &swap_avail_heads[node], avail_lists[node]) {
		/* requeue si to after same-priority siblings */
		plist_requeue(&si->avail_lists[node], &swap_avail_heads[node]);
		spin_unlock(&swap_avail_lock);
		spin_lock(&si->lock);
		if (!si->highest_bit || !(si->flags & SWP_WRITEOK)) {
			spin_lock(&swap_avail_lock);
			if (plist_node_empty(&si->avail_lists[node])) {
				spin_unlock(&si->lock);
				goto nextsi;
			}
			WARN(!si->highest_bit,
			     "swap_info %d in list but !highest_bit\n",
			     si->type);
			WARN(!(si->flags & SWP_WRITEOK),
			     "swap_info %d in list but !SWP_WRITEOK\n",
			     si->type);
			__del_from_avail_list(si);
			spin_unlock(&si->lock);
			goto nextsi;
		}
		if (size == SWAPFILE_CLUSTER) {
			if (si->flags & SWP_BLKDEV)
				n_ret = swap_alloc_cluster(si, swp_entries);
		} else
			n_ret = scan_swap_map_slots(si, SWAP_HAS_CACHE,
						    n_goal, swp_entries);
		spin_unlock(&si->lock);
		if (n_ret || size == SWAPFILE_CLUSTER)
			goto check_out;
		pr_debug("scan_swap_map of si %d failed to find offset\n",
			si->type);

		spin_lock(&swap_avail_lock);
nextsi:
		/*
		 * if we got here, it's likely that si was almost full before,
		 * and since scan_swap_map_slots() can drop the si->lock,
		 * multiple callers probably all tried to get a page from the
		 * same si and it filled up before we could get one; or, the si
		 * filled up between us dropping swap_avail_lock and taking
		 * si->lock.  Since we dropped the swap_avail_lock, the
		 * swap_avail_head list may have been modified; so if next is
		 * still in the swap_avail_head list then try it, otherwise
		 * start over if we have not gotten any slots.
		 */
		if (plist_node_empty(&next->avail_lists[node]))
			goto start_over;
	}

	spin_unlock(&swap_avail_lock);

check_out:
	if (n_ret < n_goal)
		atomic_long_add((long)(n_goal - n_ret) * size,
				&nr_swap_pages);
noswap:
	return n_ret;
}
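
/*
 * The *_swap_info_get() helpers below map a swap entry to its
 * swap_info_struct with increasing strictness: __swap_info_get() only
 * validates the device and offset, _swap_info_get() also checks that
 * the entry is in use, and swap_info_get() additionally takes p->lock.
 */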
static struct swap_info_struct *__swap_info_get(swp_entry_t entry)
{
	struct swap_info_struct *p;
	unsigned long offset;

	if (!entry.val)
		goto out;
	p = swp_swap_info(entry);
	if (!p)
		goto bad_nofile;
	if (data_race(!(p->flags & SWP_USED)))
		goto bad_device;
	offset = swp_offset(entry);
	if (offset >= p->max)
		goto bad_offset;
	return p;

bad_offset:
	pr_err("%s: %s%08lx\n", __func__, Bad_offset, entry.val);
	goto out;
bad_device:
	pr_err("%s: %s%08lx\n", __func__, Unused_file, entry.val);
	goto out;
bad_nofile:
	pr_err("%s: %s%08lx\n", __func__, Bad_file, entry.val);
out:
	return NULL;
}

static struct swap_info_struct *_swap_info_get(swp_entry_t entry)
{
	struct swap_info_struct *p;

	p = __swap_info_get(entry);
	if (!p)
		goto out;
	if (data_race(!p->swap_map[swp_offset(entry)]))
		goto bad_free;
	return p;

bad_free:
	pr_err("%s: %s%08lx\n", __func__, Unused_offset, entry.val);
out:
	return NULL;
}

static struct swap_info_struct *swap_info_get(swp_entry_t entry)
{
	struct swap_info_struct *p;

	p = _swap_info_get(entry);
	if (p)
		spin_lock(&p->lock);
	return p;
}
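
/*
 * Like swap_info_get(), but hand the lock over from a previously locked
 * device @q: the lock is only dropped and retaken when @entry lives on
 * a different device. Used when freeing a batch of entries.
 */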
static struct swap_info_struct *swap_info_get_cont(swp_entry_t entry,
					struct swap_info_struct *q)
{
	struct swap_info_struct *p;

	p = _swap_info_get(entry);

	if (p != q) {
		if (q != NULL)
			spin_unlock(&q->lock);
		if (p != NULL)
			spin_lock(&p->lock);
	}
	return p;
}
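
/*
 * Drop one @usage reference from the swap_map entry at @offset and
 * return the remaining count. When both the count and the cache bit
 * reach zero, the slot is parked as SWAP_HAS_CACHE so the swap-slots
 * cache can reclaim it later. The cluster or swap_info lock must be
 * held.
 */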
static unsigned char __swap_entry_free_locked(struct swap_info_struct *p,
					      unsigned long offset,
					      unsigned char usage)
{
	unsigned char count;
	unsigned char has_cache;

	count = p->swap_map[offset];

	has_cache = count & SWAP_HAS_CACHE;
	count &= ~SWAP_HAS_CACHE;

	if (usage == SWAP_HAS_CACHE) {
		VM_BUG_ON(!has_cache);
		has_cache = 0;
	} else if (count == SWAP_MAP_SHMEM) {
		/*
		 * Or we could insist on shmem.c using a special
		 * swap_shmem_free() and free_shmem_swap_and_cache()...
		 */
		count = 0;
	} else if ((count & ~COUNT_CONTINUED) <= SWAP_MAP_MAX) {
		if (count == COUNT_CONTINUED) {
			if (swap_count_continued(p, offset, count))
				count = SWAP_MAP_MAX | COUNT_CONTINUED;
			else
				count = SWAP_MAP_MAX;
		} else
			count--;
	}

	usage = count | has_cache;
	if (usage)
		WRITE_ONCE(p->swap_map[offset], usage);
	else
		WRITE_ONCE(p->swap_map[offset], SWAP_HAS_CACHE);

	return usage;
}

/*
 * Check whether swap entry is valid in the swap device.  If so,
 * return pointer to swap_info_struct, and keep the swap entry valid
 * by preventing the swap device from being swapped off, until
 * put_swap_device() is called.  Otherwise return NULL.
 *
 * Notice that swapoff or swapoff+swapon can still happen before the
 * percpu_ref_tryget_live() in get_swap_device() or after the
 * percpu_ref_put() in put_swap_device() if there isn't any other way
 * to prevent swapoff, such as page lock, page table lock, etc.  The
 * caller must be prepared for that.  For example, the following
 * situation is possible.
 *
 *   CPU1				CPU2
 *   do_swap_page()
 *     ...				swapoff+swapon
 *     __read_swap_cache_async()
 *       swapcache_prepare()
 *         __swap_duplicate()
 *           // check swap_map
 *					// verify PTE not changed
 *
 * In __swap_duplicate(), the swap_map needs to be checked before
 * changing, partly because the specified swap entry may be for another
 * swap device which has been swapped off.  And in do_swap_page(), after
 * the page is read from the swap device, the PTE is verified not
 * changed with the page table locked to check whether the swap device
 * has been swapped off or swapped off and on again.
 */
struct swap_info_struct *get_swap_device(swp_entry_t entry)
{
	struct swap_info_struct *si;
	unsigned long offset;

	if (!entry.val)
		goto out;
	si = swp_swap_info(entry);
	if (!si)
		goto bad_nofile;
	if (!percpu_ref_tryget_live(&si->users))
		goto out;
	/*
	 * Guarantee the si->users are checked before accessing other
	 * fields of swap_info_struct.
	 *
	 * Paired with the spin_unlock() after setup_swap_info() in
	 * enable_swap_info().
	 */
	smp_rmb();
	offset = swp_offset(entry);
	if (offset >= si->max)
		goto put_out;

	return si;
bad_nofile:
	pr_err("%s: %s%08lx\n", __func__, Bad_file, entry.val);
out:
	return NULL;
put_out:
	percpu_ref_put(&si->users);
	return NULL;
}
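
/*
 * Drop one map reference on @entry; if it was the last reference, hand
 * the slot to the per-cpu swap-slots cache for deferred freeing.
 */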
static unsigned char __swap_entry_free(struct swap_info_struct *p,
				       swp_entry_t entry)
{
	struct swap_cluster_info *ci;
	unsigned long offset = swp_offset(entry);
	unsigned char usage;

	ci = lock_cluster_or_swap_info(p, offset);
	usage = __swap_entry_free_locked(p, offset, 1);
	unlock_cluster_or_swap_info(p, ci);
	if (!usage)
		free_swap_slot(entry);

	return usage;
}
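
/*
 * Free a slot whose only remaining reference is the swapcache
 * placeholder (SWAP_HAS_CACHE), returning it to the device.
 */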
static void swap_entry_free(struct swap_info_struct *p, swp_entry_t entry)
{
	struct swap_cluster_info *ci;
	unsigned long offset = swp_offset(entry);
	unsigned char count;

	ci = lock_cluster(p, offset);
	count = p->swap_map[offset];
	VM_BUG_ON(count != SWAP_HAS_CACHE);
	p->swap_map[offset] = 0;
	dec_cluster_info_page(p, p->cluster_info, offset);
	unlock_cluster(ci);

	mem_cgroup_uncharge_swap(entry, 1);
	swap_range_free(p, offset, 1);
}

/*
 * Caller has made sure that the swap device corresponding to entry
 * is still around or has not been recycled.
 */
void swap_free(swp_entry_t entry)
{
	struct swap_info_struct *p;

	p = _swap_info_get(entry);
	if (p)
		__swap_entry_free(p, entry);
}

/*
 * Called after dropping swapcache to decrease refcnt to swap entries.
 */
void put_swap_page(struct page *page, swp_entry_t entry)
{
	unsigned long offset = swp_offset(entry);
	unsigned long idx = offset / SWAPFILE_CLUSTER;
	struct swap_cluster_info *ci;
	struct swap_info_struct *si;
	unsigned char *map;
	unsigned int i, free_entries = 0;
	unsigned char val;
	int size = swap_entry_size(thp_nr_pages(page));

	si = _swap_info_get(entry);
	if (!si)
		return;

	ci = lock_cluster_or_swap_info(si, offset);
	if (size == SWAPFILE_CLUSTER) {
		VM_BUG_ON(!cluster_is_huge(ci));
		map = si->swap_map + offset;
		for (i = 0; i < SWAPFILE_CLUSTER; i++) {
			val = map[i];
			VM_BUG_ON(!(val & SWAP_HAS_CACHE));
			if (val == SWAP_HAS_CACHE)
				free_entries++;
		}
		cluster_clear_huge(ci);
		if (free_entries == SWAPFILE_CLUSTER) {
			unlock_cluster_or_swap_info(si, ci);
			spin_lock(&si->lock);
			mem_cgroup_uncharge_swap(entry, SWAPFILE_CLUSTER);
			swap_free_cluster(si, idx);
			spin_unlock(&si->lock);
			return;
		}
	}
	for (i = 0; i < size; i++, entry.val++) {
		if (!__swap_entry_free_locked(si, offset + i, SWAP_HAS_CACHE)) {
			unlock_cluster_or_swap_info(si, ci);
			free_swap_slot(entry);
			if (i == size - 1)
				return;
			lock_cluster_or_swap_info(si, offset);
		}
	}
	unlock_cluster_or_swap_info(si, ci);
}

#ifdef CONFIG_THP_SWAP
int split_swap_cluster(swp_entry_t entry)
{
	struct swap_info_struct *si;
	struct swap_cluster_info *ci;
	unsigned long offset = swp_offset(entry);

	si = _swap_info_get(entry);
	if (!si)
		return -EBUSY;
	ci = lock_cluster(si, offset);
	cluster_clear_huge(ci);
	unlock_cluster(ci);
	return 0;
}
#endif

static int swp_entry_cmp(const void *ent1, const void *ent2)
{
	const swp_entry_t *e1 = ent1, *e2 = ent2;

	return (int)swp_type(*e1) - (int)swp_type(*e2);
}

void swapcache_free_entries(swp_entry_t *entries, int n)
{
	struct swap_info_struct *p, *prev;
	int i;

	if (n <= 0)
		return;

	prev = NULL;
	p = NULL;

	/*
	 * Sort swap entries by swap device, so each lock is only taken once.
	 * nr_swapfiles isn't absolutely correct, but the overhead of sort() is
	 * so low that it isn't necessary to optimize further.
	 */
	if (nr_swapfiles > 1)
		sort(entries, n, sizeof(entries[0]), swp_entry_cmp, NULL);
	for (i = 0; i < n; ++i) {
		p = swap_info_get_cont(entries[i], prev);
		if (p)
			swap_entry_free(p, entries[i]);
		prev = p;
	}
	if (p)
		spin_unlock(&p->lock);
}

/*
 * How many references to page are currently swapped out?
 * This does not give an exact answer when swap count is continued,
 * but does include the high COUNT_CONTINUED flag to allow for that.
 */
int page_swapcount(struct page *page)
{
	int count = 0;
	struct swap_info_struct *p;
	struct swap_cluster_info *ci;
	swp_entry_t entry;
	unsigned long offset;

	entry.val = page_private(page);
	p = _swap_info_get(entry);
	if (p) {
		offset = swp_offset(entry);
		ci = lock_cluster_or_swap_info(p, offset);
		count = swap_count(p->swap_map[offset]);
		unlock_cluster_or_swap_info(p, ci);
	}
	return count;
}

int __swap_count(swp_entry_t entry)
{
	struct swap_info_struct *si;
	pgoff_t offset = swp_offset(entry);
	int count = 0;

	si = get_swap_device(entry);
	if (si) {
		count = swap_count(si->swap_map[offset]);
		put_swap_device(si);
	}
	return count;
}
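
/*
 * Return the swap count of @entry, taking the cluster lock (or the
 * device lock) around the swap_map read.
 */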
static int swap_swapcount(struct swap_info_struct *si, swp_entry_t entry)
{
	int count = 0;
	pgoff_t offset = swp_offset(entry);
	struct swap_cluster_info *ci;

	ci = lock_cluster_or_swap_info(si, offset);
	count = swap_count(si->swap_map[offset]);
	unlock_cluster_or_swap_info(si, ci);
	return count;
}

/*
 * How many references to @entry are currently swapped out?
 * This does not give an exact answer when swap count is continued,
 * but does include the high COUNT_CONTINUED flag to allow for that.
 */
int __swp_swapcount(swp_entry_t entry)
{
	int count = 0;
	struct swap_info_struct *si;

	si = get_swap_device(entry);
	if (si) {
		count = swap_swapcount(si, entry);
		put_swap_device(si);
	}
	return count;
}

/*
 * How many references to @entry are currently swapped out?
 * This considers COUNT_CONTINUED so it returns an exact answer.
 */
int swp_swapcount(swp_entry_t entry)
{
	int count, tmp_count, n;
	struct swap_info_struct *p;
	struct swap_cluster_info *ci;
	struct page *page;
	pgoff_t offset;
	unsigned char *map;

	p = _swap_info_get(entry);
	if (!p)
		return 0;

	offset = swp_offset(entry);

	ci = lock_cluster_or_swap_info(p, offset);

	count = swap_count(p->swap_map[offset]);
	if (!(count & COUNT_CONTINUED))
		goto out;

	count &= ~COUNT_CONTINUED;
	n = SWAP_MAP_MAX + 1;

	page = vmalloc_to_page(p->swap_map + offset);
	offset &= ~PAGE_MASK;
	VM_BUG_ON(page_private(page) != SWP_CONTINUED);

	do {
		page = list_next_entry(page, lru);
		map = kmap_atomic(page);
		tmp_count = map[offset];
		kunmap_atomic(map);

		count += (tmp_count & ~COUNT_CONTINUED) * n;
		n *= (SWAP_CONT_MAX + 1);
	} while (tmp_count & COUNT_CONTINUED);
out:
	unlock_cluster_or_swap_info(p, ci);
	return count;
}
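
/*
 * Return true if any slot covered by the (possibly huge) swap entry
 * still has a non-zero swap count.
 */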
static bool swap_page_trans_huge_swapped(struct swap_info_struct *si,
					 swp_entry_t entry)
{
	struct swap_cluster_info *ci;
	unsigned char *map = si->swap_map;
	unsigned long roffset = swp_offset(entry);
	unsigned long offset = round_down(roffset, SWAPFILE_CLUSTER);
	int i;
	bool ret = false;

	ci = lock_cluster_or_swap_info(si, offset);
	if (!ci || !cluster_is_huge(ci)) {
		if (swap_count(map[roffset]))
			ret = true;
		goto unlock_out;
	}
	for (i = 0; i < SWAPFILE_CLUSTER; i++) {
		if (swap_count(map[offset + i])) {
			ret = true;
			break;
		}
	}
unlock_out:
	unlock_cluster_or_swap_info(si, ci);
	return ret;
}

static bool page_swapped(struct page *page)
{
	swp_entry_t entry;
	struct swap_info_struct *si;

	if (!IS_ENABLED(CONFIG_THP_SWAP) || likely(!PageTransCompound(page)))
		return page_swapcount(page) != 0;

	page = compound_head(page);
	entry.val = page_private(page);
	si = _swap_info_get(entry);
	if (si)
		return swap_page_trans_huge_swapped(si, entry);
	return false;
}
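
/*
 * For a (possibly THP) page, compute the total map count and total swap
 * count across all subpages, and return the maximum per-subpage
 * mapcount + swapcount, which reuse_swap_page() uses to decide whether
 * the page can be reused.
 */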
static int page_trans_huge_map_swapcount(struct page *page, int *total_mapcount,
					 int *total_swapcount)
{
	int i, map_swapcount, _total_mapcount, _total_swapcount;
	unsigned long offset = 0;
	struct swap_info_struct *si;
	struct swap_cluster_info *ci = NULL;
	unsigned char *map = NULL;
	int mapcount, swapcount = 0;

	/* hugetlbfs shouldn't call it */
	VM_BUG_ON_PAGE(PageHuge(page), page);

	if (!IS_ENABLED(CONFIG_THP_SWAP) || likely(!PageTransCompound(page))) {
		mapcount = page_trans_huge_mapcount(page, total_mapcount);
		if (PageSwapCache(page))
			swapcount = page_swapcount(page);
		if (total_swapcount)
			*total_swapcount = swapcount;
		return mapcount + swapcount;
	}

	page = compound_head(page);

	_total_mapcount = _total_swapcount = map_swapcount = 0;
	if (PageSwapCache(page)) {
		swp_entry_t entry;

		entry.val = page_private(page);
		si = _swap_info_get(entry);
		if (si) {
			map = si->swap_map;
			offset = swp_offset(entry);
		}
	}
	if (map)
		ci = lock_cluster(si, offset);
	for (i = 0; i < HPAGE_PMD_NR; i++) {
		mapcount = atomic_read(&page[i]._mapcount) + 1;
		_total_mapcount += mapcount;
		if (map) {
			swapcount = swap_count(map[offset + i]);
			_total_swapcount += swapcount;
		}
		map_swapcount = max(map_swapcount, mapcount + swapcount);
	}
	unlock_cluster(ci);
	if (PageDoubleMap(page)) {
		map_swapcount -= 1;
		_total_mapcount -= HPAGE_PMD_NR;
	}
	mapcount = compound_mapcount(page);
	map_swapcount += mapcount;
	_total_mapcount += mapcount;
	if (total_mapcount)
		*total_mapcount = _total_mapcount;
	if (total_swapcount)
		*total_swapcount = _total_swapcount;

	return map_swapcount;
}
8334b9622
|
1606 |
/*
 * We can write to an anon page without COW if there are no other references
 * to it.  And as a side-effect, free up its swap: because the old content
 * on disk will never be read, and seeking back there to write new content
 * later would only waste time away from clustering.
 *
 * NOTE: total_map_swapcount should not be relied upon by the caller if
 * reuse_swap_page() returns false, but it may be always overwritten
 * (see the other implementation for CONFIG_SWAP=n).
 */
bool reuse_swap_page(struct page *page, int *total_map_swapcount)
{
	int count, total_mapcount, total_swapcount;

	VM_BUG_ON_PAGE(!PageLocked(page), page);
	if (unlikely(PageKsm(page)))
		return false;
	count = page_trans_huge_map_swapcount(page, &total_mapcount,
					      &total_swapcount);
	if (total_map_swapcount)
		*total_map_swapcount = total_mapcount + total_swapcount;
	if (count == 1 && PageSwapCache(page) &&
	    (likely(!PageTransCompound(page)) ||
	     /* The remaining swap count will be freed soon */
	     total_swapcount == page_swapcount(page))) {
		if (!PageWriteback(page)) {
			page = compound_head(page);
			delete_from_swap_cache(page);
			SetPageDirty(page);
		} else {
			swp_entry_t entry;
			struct swap_info_struct *p;

			entry.val = page_private(page);
			p = swap_info_get(entry);
			if (p->flags & SWP_STABLE_WRITES) {
				spin_unlock(&p->lock);
				return false;
			}
			spin_unlock(&p->lock);
		}
	}

	return count <= 1;
}
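
/*
 * Illustrative sketch (compiled out, not part of the original source): the
 * write-fault decision that reuse_swap_page() serves.
 * handle_write_fault_sketch() is a hypothetical caller, not a kernel
 * function; it only shows the shape of the check.
 */
#if 0
static bool handle_write_fault_sketch(struct page *page)
{
	int total_map_swapcount;

	/* Caller holds the page lock, as reuse_swap_page() requires. */
	if (reuse_swap_page(page, &total_map_swapcount)) {
		/* Sole owner: make the PTE writable in place, no copy. */
		return true;
	}
	/* Shared: the fault handler must copy-on-write instead. */
	return false;
}
#endif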
/*
 * If swap is getting full, or if there are no more mappings of this page,
 * then try_to_free_swap is called to free its swap space.
 */
int try_to_free_swap(struct page *page)
{
	VM_BUG_ON_PAGE(!PageLocked(page), page);

	if (!PageSwapCache(page))
		return 0;
	if (PageWriteback(page))
		return 0;
	if (page_swapped(page))
		return 0;

	/*
	 * Once hibernation has begun to create its image of memory,
	 * there's a danger that one of the calls to try_to_free_swap()
	 * - most probably a call from __try_to_reclaim_swap() while
	 * hibernation is allocating its own swap pages for the image,
	 * but conceivably even a call from memory reclaim - will free
	 * the swap from a page which has already been recorded in the
	 * image as a clean swapcache page, and then reuse its swap for
	 * another page of the image.  On waking from hibernation, the
	 * original page might be freed under memory pressure, then
	 * later read back in from swap, now with the wrong data.
	 *
	 * Hibernation suspends storage while it is writing the image
	 * to disk so check that here.
	 */
	if (pm_suspended_storage())
		return 0;

	page = compound_head(page);
	delete_from_swap_cache(page);
	SetPageDirty(page);
	return 1;
}
/*
 * Free the swap entry like above, but also try to
 * free the page cache entry if it is the last user.
 */
int free_swap_and_cache(swp_entry_t entry)
{
	struct swap_info_struct *p;
	unsigned char count;

	if (non_swap_entry(entry))
		return 1;

	p = _swap_info_get(entry);
	if (p) {
		count = __swap_entry_free(p, entry);
		if (count == SWAP_HAS_CACHE &&
		    !swap_page_trans_huge_swapped(p, entry))
			__try_to_reclaim_swap(p, swp_offset(entry),
					      TTRS_UNMAPPED | TTRS_FULL);
	}
	return p != NULL;
}
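
/*
 * Illustrative sketch (compiled out, not part of the original source): the
 * swp_entry_t round trip used throughout this region.  A swap entry packs a
 * swap "type" (which device) and a page "offset" into a single word;
 * swp_entry(), swp_type() and swp_offset() are the real kernel helpers.
 */
#if 0
static void swp_entry_roundtrip_sketch(void)
{
	swp_entry_t entry = swp_entry(1, 12345);	/* type 1, offset 12345 */

	BUG_ON(swp_type(entry) != 1);
	BUG_ON(swp_offset(entry) != 12345);
}
#endif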
#ifdef CONFIG_HIBERNATION

swp_entry_t get_swap_page_of_type(int type)
{
	struct swap_info_struct *si = swap_type_to_swap_info(type);
	swp_entry_t entry = {0};

	if (!si)
		goto fail;

	/* This is called for allocating swap entry, not cache */
	spin_lock(&si->lock);
	if ((si->flags & SWP_WRITEOK) && scan_swap_map_slots(si, 1, 1, &entry))
		atomic_long_dec(&nr_swap_pages);
	spin_unlock(&si->lock);
fail:
	return entry;
}

/*
 * Find the swap type that corresponds to given device (if any).
 *
 * @offset - number of the PAGE_SIZE-sized block of the device, starting
 * from 0, in which the swap header is expected to be located.
 *
 * This is needed for the suspend to disk (aka swsusp).
 */
int swap_type_of(dev_t device, sector_t offset)
{
	int type;

	if (!device)
		return -1;

	spin_lock(&swap_lock);
	for (type = 0; type < nr_swapfiles; type++) {
		struct swap_info_struct *sis = swap_info[type];

		if (!(sis->flags & SWP_WRITEOK))
			continue;

		if (device == sis->bdev->bd_dev) {
			struct swap_extent *se = first_se(sis);

			if (se->start_block == offset) {
				spin_unlock(&swap_lock);
				return type;
			}
		}
	}
	spin_unlock(&swap_lock);
	return -ENODEV;
}

int find_first_swap(dev_t *device)
{
	int type;

	spin_lock(&swap_lock);
	for (type = 0; type < nr_swapfiles; type++) {
		struct swap_info_struct *sis = swap_info[type];

		if (!(sis->flags & SWP_WRITEOK))
			continue;
		*device = sis->bdev->bd_dev;
		spin_unlock(&swap_lock);
		return type;
	}
	spin_unlock(&swap_lock);
	return -ENODEV;
}

/*
 * Get the (PAGE_SIZE) block corresponding to given offset on the swapdev
 * corresponding to given index in swap_info (swap type).
 */
sector_t swapdev_block(int type, pgoff_t offset)
{
	struct swap_info_struct *si = swap_type_to_swap_info(type);
	struct swap_extent *se;

	if (!si || !(si->flags & SWP_WRITEOK))
		return 0;
	se = offset_to_swap_extent(si, offset);
	return se->start_block + (offset - se->start_page);
}
/*
 * Return either the total number of swap pages of given type, or the number
 * of free pages of that type (depending on @free)
 *
 * This is needed for software suspend
 */
unsigned int count_swap_pages(int type, int free)
{
	unsigned int n = 0;

	spin_lock(&swap_lock);
	if ((unsigned int)type < nr_swapfiles) {
		struct swap_info_struct *sis = swap_info[type];

		spin_lock(&sis->lock);
		if (sis->flags & SWP_WRITEOK) {
			n = sis->pages;
			if (free)
				n -= sis->inuse_pages;
		}
		spin_unlock(&sis->lock);
	}
	spin_unlock(&swap_lock);
	return n;
}
#endif /* CONFIG_HIBERNATION */

static inline int pte_same_as_swp(pte_t pte, pte_t swp_pte)
{
	return pte_same(pte_swp_clear_flags(pte), swp_pte);
}
/*
 * No need to decide whether this PTE shares the swap entry with others,
 * just let do_wp_page work it out if a write is requested later - to
 * force COW, vm_page_prot omits write permission from any private vma.
 */
static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
		unsigned long addr, swp_entry_t entry, struct page *page)
{
	struct page *swapcache;
	spinlock_t *ptl;
	pte_t *pte;
	int ret = 1;

	swapcache = page;
	page = ksm_might_need_to_copy(page, vma, addr);
	if (unlikely(!page))
		return -ENOMEM;

	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
	if (unlikely(!pte_same_as_swp(*pte, swp_entry_to_pte(entry)))) {
		ret = 0;
		goto out;
	}

	dec_mm_counter(vma->vm_mm, MM_SWAPENTS);
	inc_mm_counter(vma->vm_mm, MM_ANONPAGES);
	get_page(page);
	set_pte_at(vma->vm_mm, addr, pte,
		   pte_mkold(mk_pte(page, vma->vm_page_prot)));
	if (page == swapcache) {
		page_add_anon_rmap(page, vma, addr, false);
	} else { /* ksm created a completely new copy */
		page_add_new_anon_rmap(page, vma, addr, false);
		lru_cache_add_inactive_or_unevictable(page, vma);
	}
	swap_free(entry);
out:
	pte_unmap_unlock(pte, ptl);
	if (page != swapcache) {
		unlock_page(page);
		put_page(page);
	}
	return ret;
}

static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
			unsigned long addr, unsigned long end,
			unsigned int type, bool frontswap,
			unsigned long *fs_pages_to_unuse)
{
	struct page *page;
	swp_entry_t entry;
	pte_t *pte;
	struct swap_info_struct *si;
	unsigned long offset;
	int ret = 0;
	volatile unsigned char *swap_map;

	si = swap_info[type];
	pte = pte_offset_map(pmd, addr);
	do {
		if (!is_swap_pte(*pte))
			continue;

		entry = pte_to_swp_entry(*pte);
		if (swp_type(entry) != type)
			continue;

		offset = swp_offset(entry);
		if (frontswap && !frontswap_test(si, offset))
			continue;

		pte_unmap(pte);
		swap_map = &si->swap_map[offset];
		page = lookup_swap_cache(entry, vma, addr);
		if (!page) {
			struct vm_fault vmf = {
				.vma = vma,
				.address = addr,
				.pmd = pmd,
			};

			page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE,
						&vmf);
		}
		if (!page) {
			if (*swap_map == 0 || *swap_map == SWAP_MAP_BAD)
				goto try_next;
			return -ENOMEM;
		}

		lock_page(page);
		wait_on_page_writeback(page);
		ret = unuse_pte(vma, pmd, addr, entry, page);
		if (ret < 0) {
			unlock_page(page);
			put_page(page);
			goto out;
		}

		try_to_free_swap(page);
		unlock_page(page);
		put_page(page);

		if (*fs_pages_to_unuse && !--(*fs_pages_to_unuse)) {
			ret = FRONTSWAP_PAGES_UNUSED;
			goto out;
		}
try_next:
		pte = pte_offset_map(pmd, addr);
	} while (pte++, addr += PAGE_SIZE, addr != end);
	pte_unmap(pte - 1);

	ret = 0;
out:
	return ret;
}

static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud,
				unsigned long addr, unsigned long end,
				unsigned int type, bool frontswap,
				unsigned long *fs_pages_to_unuse)
{
	pmd_t *pmd;
	unsigned long next;
	int ret;

	pmd = pmd_offset(pud, addr);
	do {
		cond_resched();
		next = pmd_addr_end(addr, end);
		if (pmd_none_or_trans_huge_or_clear_bad(pmd))
			continue;
		ret = unuse_pte_range(vma, pmd, addr, next, type,
				      frontswap, fs_pages_to_unuse);
		if (ret)
			return ret;
	} while (pmd++, addr = next, addr != end);
	return 0;
}

static inline int unuse_pud_range(struct vm_area_struct *vma, p4d_t *p4d,
				unsigned long addr, unsigned long end,
				unsigned int type, bool frontswap,
				unsigned long *fs_pages_to_unuse)
{
	pud_t *pud;
	unsigned long next;
	int ret;

	pud = pud_offset(p4d, addr);
	do {
		next = pud_addr_end(addr, end);
		if (pud_none_or_clear_bad(pud))
			continue;
		ret = unuse_pmd_range(vma, pud, addr, next, type,
				      frontswap, fs_pages_to_unuse);
		if (ret)
			return ret;
	} while (pud++, addr = next, addr != end);
	return 0;
}

static inline int unuse_p4d_range(struct vm_area_struct *vma, pgd_t *pgd,
				unsigned long addr, unsigned long end,
				unsigned int type, bool frontswap,
				unsigned long *fs_pages_to_unuse)
{
	p4d_t *p4d;
	unsigned long next;
	int ret;

	p4d = p4d_offset(pgd, addr);
	do {
		next = p4d_addr_end(addr, end);
		if (p4d_none_or_clear_bad(p4d))
			continue;
		ret = unuse_pud_range(vma, p4d, addr, next, type,
				      frontswap, fs_pages_to_unuse);
		if (ret)
			return ret;
	} while (p4d++, addr = next, addr != end);
	return 0;
}

static int unuse_vma(struct vm_area_struct *vma, unsigned int type,
		     bool frontswap, unsigned long *fs_pages_to_unuse)
{
	pgd_t *pgd;
	unsigned long addr, end, next;
	int ret;

	addr = vma->vm_start;
	end = vma->vm_end;

	pgd = pgd_offset(vma->vm_mm, addr);
	do {
		next = pgd_addr_end(addr, end);
		if (pgd_none_or_clear_bad(pgd))
			continue;
		ret = unuse_p4d_range(vma, pgd, addr, next, type,
				      frontswap, fs_pages_to_unuse);
		if (ret)
			return ret;
	} while (pgd++, addr = next, addr != end);
	return 0;
}

static int unuse_mm(struct mm_struct *mm, unsigned int type,
		    bool frontswap, unsigned long *fs_pages_to_unuse)
{
	struct vm_area_struct *vma;
	int ret = 0;

	mmap_read_lock(mm);
	for (vma = mm->mmap; vma; vma = vma->vm_next) {
		if (vma->anon_vma) {
			ret = unuse_vma(vma, type, frontswap,
					fs_pages_to_unuse);
			if (ret)
				break;
		}
		cond_resched();
	}
	mmap_read_unlock(mm);
	return ret;
}
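
/*
 * Illustrative sketch (compiled out, not part of the original source): the
 * page-table walk pattern the unuse_*_range() helpers above all share.
 * Every level iterates its entries with the same do/while shape; only the
 * *_addr_end() helper and the next level down change.  walk_level_sketch()
 * is a hypothetical, condensed stand-in for one level of that walk.
 */
#if 0
static int walk_level_sketch(pgd_t *pgd, unsigned long addr, unsigned long end)
{
	unsigned long next;

	do {
		next = pgd_addr_end(addr, end);	/* clamp to this entry's span */
		if (pgd_none_or_clear_bad(pgd))
			continue;		/* nothing mapped here */
		/* ...recurse into the p4d level for [addr, next)... */
	} while (pgd++, addr = next, addr != end);
	return 0;
}
#endif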
/*
 * Scan swap_map (or frontswap_map if frontswap parameter is true)
 * from current position to next entry still in use.  Return 0
 * if there are no inuse entries after prev till end of the map.
 */
static unsigned int find_next_to_unuse(struct swap_info_struct *si,
					unsigned int prev, bool frontswap)
{
	unsigned int i;
	unsigned char count;

	/*
	 * No need for swap_lock here: we're just looking
	 * for whether an entry is in use, not modifying it; false
	 * hits are okay, and sys_swapoff() has already prevented new
	 * allocations from this area (while holding swap_lock).
	 */
	for (i = prev + 1; i < si->max; i++) {
		count = READ_ONCE(si->swap_map[i]);
		if (count && swap_count(count) != SWAP_MAP_BAD)
			if (!frontswap || frontswap_test(si, i))
				break;
		if ((i % LATENCY_LIMIT) == 0)
			cond_resched();
	}

	if (i == si->max)
		i = 0;

	return i;
}
/*
 * If the boolean frontswap is true, only unuse pages_to_unuse pages;
 * pages_to_unuse==0 means all pages; ignored if frontswap is false
 */
int try_to_unuse(unsigned int type, bool frontswap,
		 unsigned long pages_to_unuse)
{
	struct mm_struct *prev_mm;
	struct mm_struct *mm;
	struct list_head *p;
	int retval = 0;
	struct swap_info_struct *si = swap_info[type];
	struct page *page;
	swp_entry_t entry;
	unsigned int i;

	if (!READ_ONCE(si->inuse_pages))
		return 0;

	if (!frontswap)
		pages_to_unuse = 0;

retry:
	retval = shmem_unuse(type, frontswap, &pages_to_unuse);
	if (retval)
		goto out;

	prev_mm = &init_mm;
	mmget(prev_mm);

	spin_lock(&mmlist_lock);
	p = &init_mm.mmlist;
	while (READ_ONCE(si->inuse_pages) &&
	       !signal_pending(current) &&
	       (p = p->next) != &init_mm.mmlist) {

		mm = list_entry(p, struct mm_struct, mmlist);
		if (!mmget_not_zero(mm))
			continue;
		spin_unlock(&mmlist_lock);
		mmput(prev_mm);
		prev_mm = mm;
		retval = unuse_mm(mm, type, frontswap, &pages_to_unuse);

		if (retval) {
			mmput(prev_mm);
			goto out;
		}

		/*
		 * Make sure that we aren't completely killing
		 * interactive performance.
		 */
		cond_resched();
		spin_lock(&mmlist_lock);
	}
	spin_unlock(&mmlist_lock);

	mmput(prev_mm);

	i = 0;
	while (READ_ONCE(si->inuse_pages) &&
	       !signal_pending(current) &&
	       (i = find_next_to_unuse(si, i, frontswap)) != 0) {

		entry = swp_entry(type, i);
		page = find_get_page(swap_address_space(entry), i);
		if (!page)
			continue;

		/*
		 * It is conceivable that a racing task removed this page from
		 * swap cache just before we acquired the page lock. The page
		 * might even be back in swap cache on another swap area. But
		 * that is okay, try_to_free_swap() only removes stale pages.
		 */
		lock_page(page);
		wait_on_page_writeback(page);
		try_to_free_swap(page);
		unlock_page(page);
		put_page(page);

		/*
		 * For frontswap, we just need to unuse pages_to_unuse, if
		 * it was specified. Need not check frontswap again here as
		 * we already zeroed out pages_to_unuse if not frontswap.
		 */
		if (pages_to_unuse && --pages_to_unuse == 0)
			goto out;
	}

	/*
	 * Lets check again to see if there are still swap entries in the map.
	 * If yes, we would need to do retry the unuse logic again.
	 * Under global memory pressure, swap entries can be reinserted back
	 * into process space after the mmlist loop above passes over them.
	 *
	 * Limit the number of retries? No: when mmget_not_zero() above fails,
	 * that mm is likely to be freeing swap from exit_mmap(), which proceeds
	 * at its own independent pace; and even shmem_writepage() could have
	 * been preempted after get_swap_page(), temporarily hiding that swap.
	 * It's easy and robust (though cpu-intensive) just to keep retrying.
	 */
	if (READ_ONCE(si->inuse_pages)) {
		if (!signal_pending(current))
			goto retry;
		retval = -EINTR;
	}
out:
	return (retval == FRONTSWAP_PAGES_UNUSED) ? 0 : retval;
}
/*
 * After a successful try_to_unuse, if no swap is now in use, we know
 * we can empty the mmlist.  swap_lock must be held on entry and exit.
 * Note that mmlist_lock nests inside swap_lock, and an mm must be
 * added to the mmlist just after page_duplicate - before would be racy.
 */
static void drain_mmlist(void)
{
	struct list_head *p, *next;
	unsigned int type;

	for (type = 0; type < nr_swapfiles; type++)
		if (swap_info[type]->inuse_pages)
			return;
	spin_lock(&mmlist_lock);
	list_for_each_safe(p, next, &init_mm.mmlist)
		list_del_init(p);
	spin_unlock(&mmlist_lock);
}
/*
 * Free all of a swapdev's extent information
 */
static void destroy_swap_extents(struct swap_info_struct *sis)
{
	while (!RB_EMPTY_ROOT(&sis->swap_extent_root)) {
		struct rb_node *rb = sis->swap_extent_root.rb_node;
		struct swap_extent *se = rb_entry(rb, struct swap_extent, rb_node);

		rb_erase(rb, &sis->swap_extent_root);
		kfree(se);
	}

	if (sis->flags & SWP_ACTIVATED) {
		struct file *swap_file = sis->swap_file;
		struct address_space *mapping = swap_file->f_mapping;

		sis->flags &= ~SWP_ACTIVATED;
		if (mapping->a_ops->swap_deactivate)
			mapping->a_ops->swap_deactivate(swap_file);
	}
}

/*
 * Add a block range (and the corresponding page range) into this swapdev's
 * extent tree.
 *
 * This function rather assumes that it is called in ascending page order.
 */
int
add_swap_extent(struct swap_info_struct *sis, unsigned long start_page,
		unsigned long nr_pages, sector_t start_block)
{
	struct rb_node **link = &sis->swap_extent_root.rb_node, *parent = NULL;
	struct swap_extent *se;
	struct swap_extent *new_se;

	/*
	 * place the new node at the right most since the
	 * function is called in ascending page order.
	 */
	while (*link) {
		parent = *link;
		link = &parent->rb_right;
	}

	if (parent) {
		se = rb_entry(parent, struct swap_extent, rb_node);
		BUG_ON(se->start_page + se->nr_pages != start_page);
		if (se->start_block + se->nr_pages == start_block) {
			/* Merge it */
			se->nr_pages += nr_pages;
			return 0;
		}
	}

	/* No merge, insert a new extent. */
	new_se = kmalloc(sizeof(*se), GFP_KERNEL);
	if (new_se == NULL)
		return -ENOMEM;
	new_se->start_page = start_page;
	new_se->nr_pages = nr_pages;
	new_se->start_block = start_block;
	rb_link_node(&new_se->rb_node, parent, link);
	rb_insert_color(&new_se->rb_node, &sis->swap_extent_root);
	return 1;
}
EXPORT_SYMBOL_GPL(add_swap_extent);
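
/*
 * Illustrative sketch (compiled out, not part of the original source) of
 * what an extent buys us: physically adjacent block ranges collapse into
 * one record, and a page offset resolves to a disk block by adding the
 * delta within its extent.  struct extent_sketch and
 * lookup_block_sketch() are simplified, hypothetical stand-ins for the
 * rb-tree version above.
 */
#if 0
struct extent_sketch {
	unsigned long start_page;
	unsigned long nr_pages;
	unsigned long long start_block;
};

/* offset 5 in an extent {start_page=4, start_block=100} maps to block 101 */
static unsigned long long lookup_block_sketch(const struct extent_sketch *se,
					      unsigned long offset)
{
	return se->start_block + (offset - se->start_page);
}
#endif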
/*
 * A `swap extent' is a simple thing which maps a contiguous range of pages
 * onto a contiguous range of disk blocks.  An ordered list of swap extents
 * is built at swapon time and is then used at swap_writepage/swap_readpage
 * time for locating where on disk a page belongs.
 *
 * If the swapfile is an S_ISBLK block device, a single extent is installed.
 * This is done so that the main operating code can treat S_ISBLK and S_ISREG
 * swap files identically.
 *
 * Whether the swapdev is an S_ISREG file or an S_ISBLK blockdev, the swap
 * extent list operates in PAGE_SIZE disk blocks.  Both S_ISREG and S_ISBLK
 * swapfiles are handled *identically* after swapon time.
 *
 * For S_ISREG swapfiles, setup_swap_extents() will walk all the file's blocks
 * and will parse them into an ordered extent list, in PAGE_SIZE chunks.  If
 * some stray blocks are found which do not fall within the PAGE_SIZE alignment
 * requirements, they are simply tossed out - we will never use those blocks
 * for swapping.
 *
 * For all swap devices we set S_SWAPFILE across the life of the swapon.  This
 * prevents users from writing to the swap device, which will corrupt memory.
 *
 * The amount of disk space which a single swap extent represents varies.
 * Typically it is in the 1-4 megabyte range.  So we can have hundreds of
 * extents in the list.  To avoid much list walking, we cache the previous
 * search location in `curr_swap_extent', and start new searches from there.
 * This is extremely effective.  The average number of iterations in
 * map_swap_page() has been measured at about 0.3 per page.  - akpm.
 */
static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span)
{
	struct file *swap_file = sis->swap_file;
	struct address_space *mapping = swap_file->f_mapping;
	struct inode *inode = mapping->host;
	int ret;

	if (S_ISBLK(inode->i_mode)) {
		ret = add_swap_extent(sis, 0, sis->max, 0);
		*span = sis->pages;
		return ret;
	}

	if (mapping->a_ops->swap_activate) {
		ret = mapping->a_ops->swap_activate(sis, swap_file, span);
		if (ret >= 0)
			sis->flags |= SWP_ACTIVATED;
		if (!ret) {
			sis->flags |= SWP_FS_OPS;
			ret = add_swap_extent(sis, 0, sis->max, 0);
			*span = sis->pages;
		}
		return ret;
	}

	return generic_swapfile_activate(sis, swap_file, span);
}
static int swap_node(struct swap_info_struct *p)
{
	struct block_device *bdev;

	if (p->bdev)
		bdev = p->bdev;
	else
		bdev = p->swap_file->f_inode->i_sb->s_bdev;

	return bdev ? bdev->bd_disk->node_id : NUMA_NO_NODE;
}

static void setup_swap_info(struct swap_info_struct *p, int prio,
			    unsigned char *swap_map,
			    struct swap_cluster_info *cluster_info)
{
	int i;

	if (prio >= 0)
		p->prio = prio;
	else
		p->prio = --least_priority;
	/*
	 * the plist prio is negated because plist ordering is
	 * low-to-high, while swap ordering is high-to-low
	 */
	p->list.prio = -p->prio;
	for_each_node(i) {
		if (p->prio >= 0)
			p->avail_lists[i].prio = -p->prio;
		else {
			if (swap_node(p) == i)
				p->avail_lists[i].prio = 1;
			else
				p->avail_lists[i].prio = -p->prio;
		}
	}
	p->swap_map = swap_map;
	p->cluster_info = cluster_info;
}

static void _enable_swap_info(struct swap_info_struct *p)
{
	p->flags |= SWP_WRITEOK;
	atomic_long_add(p->pages, &nr_swap_pages);
	total_swap_pages += p->pages;

	assert_spin_locked(&swap_lock);
	/*
	 * both lists are plists, and thus priority ordered.
	 * swap_active_head needs to be priority ordered for swapoff(),
	 * which on removal of any swap_info_struct with an auto-assigned
	 * (i.e. negative) priority increments the auto-assigned priority
	 * of any lower-priority swap_info_structs.
	 * swap_avail_head needs to be priority ordered for get_swap_page(),
	 * which allocates swap pages from the highest available priority
	 * swap_info_struct.
	 */
	plist_add(&p->list, &swap_active_head);
	add_to_avail_list(p);
}

static void enable_swap_info(struct swap_info_struct *p, int prio,
				unsigned char *swap_map,
				struct swap_cluster_info *cluster_info,
				unsigned long *frontswap_map)
{
	frontswap_init(p->type, frontswap_map);
	spin_lock(&swap_lock);
	spin_lock(&p->lock);
	setup_swap_info(p, prio, swap_map, cluster_info);
	spin_unlock(&p->lock);
	spin_unlock(&swap_lock);
	/*
	 * Finished initializing swap device, now it's safe to reference it.
	 */
	percpu_ref_resurrect(&p->users);
	spin_lock(&swap_lock);
	spin_lock(&p->lock);
	_enable_swap_info(p);
	spin_unlock(&p->lock);
	spin_unlock(&swap_lock);
}

static void reinsert_swap_info(struct swap_info_struct *p)
{
	spin_lock(&swap_lock);
	spin_lock(&p->lock);
	setup_swap_info(p, p->prio, p->swap_map, p->cluster_info);
	_enable_swap_info(p);
	spin_unlock(&p->lock);
	spin_unlock(&swap_lock);
}
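
/*
 * Illustrative sketch (compiled out, not part of the original source) of
 * the priority negation in setup_swap_info() above.  User-visible swap
 * priorities are "higher wins" (10 beats 5), but plists sort ascending, so
 * the stored keys are negated: -10 sorts before -5 and the higher-priority
 * device is reached first.  prio_negation_sketch() is hypothetical.
 */
#if 0
static void prio_negation_sketch(void)
{
	int user_prio_high = 10, user_prio_low = 5;

	/* stored plist keys: -10 < -5, so the prio-10 device sorts first */
	BUG_ON(!(-user_prio_high < -user_prio_low));
}
#endif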
bool has_usable_swap(void)
{
	bool ret = true;

	spin_lock(&swap_lock);
	if (plist_head_empty(&swap_active_head))
		ret = false;
	spin_unlock(&swap_lock);
	return ret;
}
SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
{
	struct swap_info_struct *p = NULL;
	unsigned char *swap_map;
	struct swap_cluster_info *cluster_info;
	unsigned long *frontswap_map;
	struct file *swap_file, *victim;
	struct address_space *mapping;
	struct inode *inode;
	struct filename *pathname;
	int err, found = 0;
	unsigned int old_block_size;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	BUG_ON(!current->mm);

	pathname = getname(specialfile);
	if (IS_ERR(pathname))
		return PTR_ERR(pathname);

	victim = file_open_name(pathname, O_RDWR|O_LARGEFILE, 0);
	err = PTR_ERR(victim);
	if (IS_ERR(victim))
		goto out;

	mapping = victim->f_mapping;
	spin_lock(&swap_lock);
	plist_for_each_entry(p, &swap_active_head, list) {
		if (p->flags & SWP_WRITEOK) {
			if (p->swap_file->f_mapping == mapping) {
				found = 1;
				break;
			}
		}
	}
	if (!found) {
		err = -EINVAL;
		spin_unlock(&swap_lock);
		goto out_dput;
	}
	if (!security_vm_enough_memory_mm(current->mm, p->pages))
		vm_unacct_memory(p->pages);
	else {
		err = -ENOMEM;
		spin_unlock(&swap_lock);
		goto out_dput;
	}
	del_from_avail_list(p);
	spin_lock(&p->lock);
	if (p->prio < 0) {
		struct swap_info_struct *si = p;
		int nid;

		plist_for_each_entry_continue(si, &swap_active_head, list) {
			si->prio++;
			si->list.prio--;
			for_each_node(nid) {
				if (si->avail_lists[nid].prio != 1)
					si->avail_lists[nid].prio--;
			}
		}
		least_priority++;
	}
	plist_del(&p->list, &swap_active_head);
	atomic_long_sub(p->pages, &nr_swap_pages);
	total_swap_pages -= p->pages;
	p->flags &= ~SWP_WRITEOK;
	spin_unlock(&p->lock);
	spin_unlock(&swap_lock);

	disable_swap_slots_cache_lock();
	set_current_oom_origin();
	err = try_to_unuse(p->type, false, 0); /* force unuse all pages */
	clear_current_oom_origin();

	if (err) {
		/* re-insert swap space back into swap_list */
		reinsert_swap_info(p);
		reenable_swap_slots_cache_unlock();
		goto out_dput;
	}

	reenable_swap_slots_cache_unlock();

	/*
	 * Wait for swap operations protected by get/put_swap_device()
	 * to complete.
	 *
	 * We need synchronize_rcu() here to protect the accessing to
	 * the swap cache data structure.
	 */
	percpu_ref_kill(&p->users);
	synchronize_rcu();
	wait_for_completion(&p->comp);

	flush_work(&p->discard_work);

	destroy_swap_extents(p);
	if (p->flags & SWP_CONTINUED)
		free_swap_count_continuations(p);

	if (!p->bdev || !blk_queue_nonrot(bdev_get_queue(p->bdev)))
		atomic_dec(&nr_rotate_swap);

	mutex_lock(&swapon_mutex);
	spin_lock(&swap_lock);
	spin_lock(&p->lock);
	drain_mmlist();

	/* wait for anyone still in scan_swap_map_slots */
	p->highest_bit = 0;		/* cuts scans short */
	while (p->flags >= SWP_SCANNING) {
		spin_unlock(&p->lock);
		spin_unlock(&swap_lock);
		schedule_timeout_uninterruptible(1);
		spin_lock(&swap_lock);
		spin_lock(&p->lock);
	}

	swap_file = p->swap_file;
	old_block_size = p->old_block_size;
	p->swap_file = NULL;
	p->max = 0;
	swap_map = p->swap_map;
	p->swap_map = NULL;
	cluster_info = p->cluster_info;
	p->cluster_info = NULL;
	frontswap_map = frontswap_map_get(p);
	spin_unlock(&p->lock);
	spin_unlock(&swap_lock);
	arch_swap_invalidate_area(p->type);
	frontswap_invalidate_area(p->type);
	frontswap_map_set(p, NULL);
	mutex_unlock(&swapon_mutex);
	free_percpu(p->percpu_cluster);
	p->percpu_cluster = NULL;
	free_percpu(p->cluster_next_cpu);
	p->cluster_next_cpu = NULL;
	vfree(swap_map);
	kvfree(cluster_info);
	kvfree(frontswap_map);
	/* Destroy swap account information */
	swap_cgroup_swapoff(p->type);
	exit_swap_address_space(p->type);

	inode = mapping->host;
	if (S_ISBLK(inode->i_mode)) {
		struct block_device *bdev = I_BDEV(inode);

		set_blocksize(bdev, old_block_size);
		blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
	}

	inode_lock(inode);
	inode->i_flags &= ~S_SWAPFILE;
	inode_unlock(inode);
	filp_close(swap_file, NULL);

	/*
	 * Clear the SWP_USED flag after all resources are freed so that swapon
	 * can reuse this swap_info in alloc_swap_info() safely.  It is ok to
	 * not hold p->lock after we cleared its SWP_WRITEOK.
	 */
	spin_lock(&swap_lock);
	p->flags = 0;
	spin_unlock(&swap_lock);

	err = 0;
	atomic_inc(&proc_poll_event);
	wake_up_interruptible(&proc_poll_wait);

out_dput:
	filp_close(victim, NULL);
out:
	putname(pathname);
	return err;
}
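
/*
 * Illustrative sketch (compiled out, not part of the original source): the
 * user-space entry point for the syscall above.  swapoff(2) takes the path
 * that was given to swapon(2) and needs CAP_SYS_ADMIN; "/dev/sda2" below is
 * a hypothetical device path.
 */
#if 0
#include <stdio.h>
#include <sys/swap.h>

int main(void)
{
	if (swapoff("/dev/sda2") != 0)
		perror("swapoff");
	return 0;
}
#endif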
#ifdef CONFIG_PROC_FS
static __poll_t swaps_poll(struct file *file, poll_table *wait)
{
	struct seq_file *seq = file->private_data;

	poll_wait(file, &proc_poll_wait, wait);

	if (seq->poll_event != atomic_read(&proc_poll_event)) {
		seq->poll_event = atomic_read(&proc_poll_event);
		return EPOLLIN | EPOLLRDNORM | EPOLLERR | EPOLLPRI;
	}

	return EPOLLIN | EPOLLRDNORM;
}
/* iterator */
static void *swap_start(struct seq_file *swap, loff_t *pos)
{
	struct swap_info_struct *si;
	int type;
	loff_t l = *pos;

	mutex_lock(&swapon_mutex);

	if (!l)
		return SEQ_START_TOKEN;

	for (type = 0; (si = swap_type_to_swap_info(type)); type++) {
		if (!(si->flags & SWP_USED) || !si->swap_map)
			continue;
		if (!--l)
			return si;
	}

	return NULL;
}

static void *swap_next(struct seq_file *swap, void *v, loff_t *pos)
{
	struct swap_info_struct *si = v;
	int type;

	if (v == SEQ_START_TOKEN)
		type = 0;
	else
		type = si->type + 1;

	++(*pos);
	for (; (si = swap_type_to_swap_info(type)); type++) {
		if (!(si->flags & SWP_USED) || !si->swap_map)
			continue;
		return si;
	}

	return NULL;
}

static void swap_stop(struct seq_file *swap, void *v)
{
	mutex_unlock(&swapon_mutex);
}

static int swap_show(struct seq_file *swap, void *v)
{
	struct swap_info_struct *si = v;
	struct file *file;
	int len;
	unsigned int bytes, inuse;

	if (si == SEQ_START_TOKEN) {
		seq_puts(swap, "Filename\t\t\t\tType\t\tSize\t\tUsed\t\tPriority\n");
		return 0;
	}

	bytes = si->pages << (PAGE_SHIFT - 10);
	inuse = si->inuse_pages << (PAGE_SHIFT - 10);

	file = si->swap_file;
	len = seq_file_path(swap, file, " \t\n\\");
	seq_printf(swap, "%*s%s\t%u\t%s%u\t%s%d\n",
			len < 40 ? 40 - len : 1, " ",
			S_ISBLK(file_inode(file)->i_mode) ?
				"partition" : "file\t",
			bytes, bytes < 10000000 ? "\t" : "",
			inuse, inuse < 10000000 ? "\t" : "",
			si->prio);
	return 0;
}
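
/*
 * Illustrative sketch (compiled out, not part of the original source): the
 * consumer side of the seq_file above.  The header and per-device rows
 * emitted by swap_show() are exactly what a simple reader of /proc/swaps
 * sees.
 */
#if 0
#include <stdio.h>

int main(void)
{
	char line[256];
	FILE *f = fopen("/proc/swaps", "r");

	if (!f)
		return 1;
	while (fgets(line, sizeof(line), f))	/* header, then one row per device */
		fputs(line, stdout);
	fclose(f);
	return 0;
}
#endif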
static const struct seq_operations swaps_op = {
	.start =	swap_start,
	.next =		swap_next,
	.stop =		swap_stop,
	.show =		swap_show
};

static int swaps_open(struct inode *inode, struct file *file)
{
	struct seq_file *seq;
	int ret;

	ret = seq_open(file, &swaps_op);
	if (ret)
		return ret;

	seq = file->private_data;
	seq->poll_event = atomic_read(&proc_poll_event);
	return 0;
}

static const struct proc_ops swaps_proc_ops = {
	.proc_flags	= PROC_ENTRY_PERMANENT,
	.proc_open	= swaps_open,
	.proc_read	= seq_read,
	.proc_lseek	= seq_lseek,
	.proc_release	= seq_release,
	.proc_poll	= swaps_poll,
};

static int __init procswaps_init(void)
{
	proc_create("swaps", 0, NULL, &swaps_proc_ops);
	return 0;
}
__initcall(procswaps_init);
#endif /* CONFIG_PROC_FS */

#ifdef MAX_SWAPFILES_CHECK
static int __init max_swapfiles_check(void)
{
	MAX_SWAPFILES_CHECK();
	return 0;
}
late_initcall(max_swapfiles_check);
#endif
static struct swap_info_struct *alloc_swap_info(void)
{
	struct swap_info_struct *p;
	struct swap_info_struct *defer = NULL;
	unsigned int type;
	int i;

	p = kvzalloc(struct_size(p, avail_lists, nr_node_ids), GFP_KERNEL);
	if (!p)
		return ERR_PTR(-ENOMEM);

	if (percpu_ref_init(&p->users, swap_users_ref_free,
			    PERCPU_REF_INIT_DEAD, GFP_KERNEL)) {
		kvfree(p);
		return ERR_PTR(-ENOMEM);
	}

	spin_lock(&swap_lock);
	for (type = 0; type < nr_swapfiles; type++) {
		if (!(swap_info[type]->flags & SWP_USED))
			break;
	}
	if (type >= MAX_SWAPFILES) {
		spin_unlock(&swap_lock);
		percpu_ref_exit(&p->users);
		kvfree(p);
		return ERR_PTR(-EPERM);
	}
	if (type >= nr_swapfiles) {
		p->type = type;
		/*
		 * Publish the swap_info_struct after initializing it.
		 * Note that kvzalloc() above zeroes all its fields.
		 */
		smp_store_release(&swap_info[type], p); /* rcu_assign_pointer() */
		nr_swapfiles++;
	} else {
		defer = p;
		p = swap_info[type];
		/*
		 * Do not memset this entry: a racing procfs swap_next()
		 * would be relying on p->type to remain valid.
		 */
	}
	p->swap_extent_root = RB_ROOT;
	plist_node_init(&p->list, 0);
	for_each_node(i)
		plist_node_init(&p->avail_lists[i], 0);
	p->flags = SWP_USED;
	spin_unlock(&swap_lock);
	if (defer) {
		percpu_ref_exit(&defer->users);
		kvfree(defer);
	}
	spin_lock_init(&p->lock);
	spin_lock_init(&p->cont_lock);
	init_completion(&p->comp);

	return p;
}
static int claim_swapfile(struct swap_info_struct *p, struct inode *inode)
{
	int error;

	if (S_ISBLK(inode->i_mode)) {
		p->bdev = blkdev_get_by_dev(inode->i_rdev,
				FMODE_READ | FMODE_WRITE | FMODE_EXCL, p);
		if (IS_ERR(p->bdev)) {
			error = PTR_ERR(p->bdev);
			p->bdev = NULL;
			return error;
		}
		p->old_block_size = block_size(p->bdev);
		error = set_blocksize(p->bdev, PAGE_SIZE);
		if (error < 0)
			return error;
		/*
		 * Zoned block devices contain zones that have a sequential
		 * write only restriction.  Hence zoned block devices are not
		 * suitable for swapping.  Disallow them here.
		 */
		if (blk_queue_is_zoned(p->bdev->bd_disk->queue))
			return -EINVAL;
		p->flags |= SWP_BLKDEV;
	} else if (S_ISREG(inode->i_mode)) {
		p->bdev = inode->i_sb->s_bdev;
	}

	return 0;
}
/*
 * Find out how many pages are allowed for a single swap device. There
 * are two limiting factors:
 * 1) the number of bits for the swap offset in the swp_entry_t type, and
 * 2) the number of bits in the swap pte, as defined by the different
 * architectures.
 *
 * In order to find the largest possible bit mask, a swap entry with
 * swap type 0 and swap offset ~0UL is created, encoded to a swap pte,
 * decoded to a swp_entry_t again, and finally the swap offset is
 * extracted.
 *
 * This will mask all the bits from the initial ~0UL mask that can't
 * be encoded in either the swp_entry_t or the architecture definition
 * of a swap pte.
 */
unsigned long generic_max_swapfile_size(void)
{
	return swp_offset(pte_to_swp_entry(
			swp_entry_to_pte(swp_entry(0, ~0UL)))) + 1;
}

/* Can be overridden by an architecture for additional checks. */
__weak unsigned long max_swapfile_size(void)
{
	return generic_max_swapfile_size();
}
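
/*
 * Illustrative sketch (compiled out, not part of the original source) of
 * the masking trick documented above, using a hypothetical encoding that
 * keeps only the low 50 bits of the swap offset.  Whatever bits of ~0UL
 * survive the encode/decode round trip define the largest representable
 * offset; adding one turns that into a page count.
 */
#if 0
static unsigned long max_offset_sketch(void)
{
	unsigned long mask = (1UL << 50) - 1;	/* hypothetical pte room */
	unsigned long survived = ~0UL & mask;	/* "encode then decode" ~0UL */

	return survived + 1;			/* max number of swap pages */
}
#endif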
static unsigned long read_swap_header(struct swap_info_struct *p,
					union swap_header *swap_header,
					struct inode *inode)
{
	int i;
	unsigned long maxpages;
	unsigned long swapfilepages;
	unsigned long last_page;

	if (memcmp("SWAPSPACE2", swap_header->magic.magic, 10)) {
		pr_err("Unable to find swap-space signature\n");
		return 0;
	}

	/* swap partition endianness hack... */
	if (swab32(swap_header->info.version) == 1) {
		swab32s(&swap_header->info.version);
		swab32s(&swap_header->info.last_page);
		swab32s(&swap_header->info.nr_badpages);
		if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES)
			return 0;
		for (i = 0; i < swap_header->info.nr_badpages; i++)
			swab32s(&swap_header->info.badpages[i]);
	}
	/* Check the swap header's sub-version */
	if (swap_header->info.version != 1) {
		pr_warn("Unable to handle swap header version %d\n",
			swap_header->info.version);
		return 0;
	}

	p->lowest_bit  = 1;
	p->cluster_next = 1;
	p->cluster_nr = 0;

	maxpages = max_swapfile_size();
	last_page = swap_header->info.last_page;
	if (!last_page) {
		pr_warn("Empty swap-file\n");
		return 0;
	}
	if (last_page > maxpages) {
		pr_warn("Truncating oversized swap area, only using %luk out of %luk\n",
			maxpages << (PAGE_SHIFT - 10),
			last_page << (PAGE_SHIFT - 10));
	}
	if (maxpages > last_page) {
		maxpages = last_page + 1;
		/* p->max is an unsigned int: don't overflow it */
		if ((unsigned int)maxpages == 0)
			maxpages = UINT_MAX;
	}
	p->highest_bit = maxpages - 1;

	if (!maxpages)
		return 0;
	swapfilepages = i_size_read(inode) >> PAGE_SHIFT;
	if (swapfilepages && maxpages > swapfilepages) {
		pr_warn("Swap area shorter than signature indicates\n");
		return 0;
	}
	if (swap_header->info.nr_badpages && S_ISREG(inode->i_mode))
		return 0;
	if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES)
		return 0;

	return maxpages;
}
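
/*
 * Illustrative sketch (compiled out, not part of the original source) of
 * the clamping in read_swap_header(): the usable size is the smaller of
 * what the architecture can encode (max_swapfile_size()) and what the
 * on-disk header advertises (last_page + 1, since last_page is a 0-based
 * index).  clamp_maxpages_sketch() is hypothetical.
 */
#if 0
static unsigned long clamp_maxpages_sketch(unsigned long arch_max,
					   unsigned long last_page)
{
	return (last_page + 1 < arch_max) ? last_page + 1 : arch_max;
}
#endif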
#define SWAP_CLUSTER_INFO_COLS						\
	DIV_ROUND_UP(L1_CACHE_BYTES, sizeof(struct swap_cluster_info))
#define SWAP_CLUSTER_SPACE_COLS						\
	DIV_ROUND_UP(SWAP_ADDRESS_SPACE_PAGES, SWAPFILE_CLUSTER)
#define SWAP_CLUSTER_COLS						\
	max_t(unsigned int, SWAP_CLUSTER_INFO_COLS, SWAP_CLUSTER_SPACE_COLS)
static int setup_swap_map_and_extents(struct swap_info_struct *p,
					union swap_header *swap_header,
					unsigned char *swap_map,
					struct swap_cluster_info *cluster_info,
					unsigned long maxpages,
					sector_t *span)
{
	unsigned int j, k;
	unsigned int nr_good_pages;
	int nr_extents;
	unsigned long nr_clusters = DIV_ROUND_UP(maxpages, SWAPFILE_CLUSTER);
	unsigned long col = p->cluster_next / SWAPFILE_CLUSTER % SWAP_CLUSTER_COLS;
	unsigned long i, idx;

	nr_good_pages = maxpages - 1;	/* omit header page */

	cluster_list_init(&p->free_clusters);
	cluster_list_init(&p->discard_clusters);

	for (i = 0; i < swap_header->info.nr_badpages; i++) {
		unsigned int page_nr = swap_header->info.badpages[i];

		if (page_nr == 0 || page_nr > swap_header->info.last_page)
			return -EINVAL;
		if (page_nr < maxpages) {
			swap_map[page_nr] = SWAP_MAP_BAD;
			nr_good_pages--;
			/*
			 * Haven't marked the cluster free yet, no list
			 * operation involved
			 */
			inc_cluster_info_page(p, cluster_info, page_nr);
		}
	}

	/* Haven't marked the cluster free yet, no list operation involved */
	for (i = maxpages; i < round_up(maxpages, SWAPFILE_CLUSTER); i++)
		inc_cluster_info_page(p, cluster_info, i);

	if (nr_good_pages) {
		swap_map[0] = SWAP_MAP_BAD;
		/*
		 * Not mark the cluster free yet, no list
		 * operation involved
		 */
		inc_cluster_info_page(p, cluster_info, 0);
		p->max = maxpages;
		p->pages = nr_good_pages;
		nr_extents = setup_swap_extents(p, span);
		if (nr_extents < 0)
			return nr_extents;
		nr_good_pages = p->pages;
	}
	if (!nr_good_pages) {
		pr_warn("Empty swap-file\n");
		return -EINVAL;
	}

	if (!cluster_info)
		return nr_extents;

	/*
	 * Reduce false cache line sharing between cluster_info and
	 * sharing same address space.
	 */
	for (k = 0; k < SWAP_CLUSTER_COLS; k++) {
		j = (k + col) % SWAP_CLUSTER_COLS;
		for (i = 0; i < DIV_ROUND_UP(nr_clusters, SWAP_CLUSTER_COLS); i++) {
			idx = i * SWAP_CLUSTER_COLS + j;
			if (idx >= nr_clusters)
				continue;
			if (cluster_count(&cluster_info[idx]))
				continue;
			cluster_set_flag(&cluster_info[idx], CLUSTER_FLAG_FREE);
			cluster_list_add_tail(&p->free_clusters, cluster_info,
					      idx);
		}
	}
	return nr_extents;
}
/*
 * Helper to sys_swapon determining if a given swap
 * backing device queue supports DISCARD operations.
 */
static bool swap_discardable(struct swap_info_struct *si)
{
	struct request_queue *q = bdev_get_queue(si->bdev);

	if (!q || !blk_queue_discard(q))
		return false;

	return true;
}
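
/*
 * Illustrative sketch (compiled out, not part of the original source): the
 * user-space companion of the syscall below.  SWAP_FLAG_PREFER encodes an
 * explicit priority into the flags argument, mirroring the decoding at the
 * end of sys_swapon(); "/swapfile" is a hypothetical path.
 */
#if 0
#include <stdio.h>
#include <sys/swap.h>

int main(void)
{
	int flags = SWAP_FLAG_PREFER |
		    ((5 << SWAP_FLAG_PRIO_SHIFT) & SWAP_FLAG_PRIO_MASK);

	if (swapon("/swapfile", flags) != 0)
		perror("swapon");
	return 0;
}
#endif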
SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) { struct swap_info_struct *p; |
91a27b2a7
|
3015 |
struct filename *name; |
53cbb2435
|
3016 3017 |
struct file *swap_file = NULL; struct address_space *mapping; |
51cc3a662
|
3018 |
struct dentry *dentry; |
40531542e
|
3019 |
int prio; |
53cbb2435
|
3020 3021 |
int error; union swap_header *swap_header; |
915d4d7bc
|
3022 |
int nr_extents; |
53cbb2435
|
3023 3024 |
sector_t span; unsigned long maxpages; |
53cbb2435
|
3025 |
unsigned char *swap_map = NULL; |
2a8f94493
|
3026 |
struct swap_cluster_info *cluster_info = NULL; |
38b5faf4b
|
3027 |
unsigned long *frontswap_map = NULL; |
53cbb2435
|
3028 3029 |
struct page *page = NULL; struct inode *inode = NULL; |
7cbf31923
|
3030 |
bool inced_nr_rotate_swap = false; |
53cbb2435
|
3031 |
|
d15cab975
|
3032 3033 |
if (swap_flags & ~SWAP_FLAGS_VALID) return -EINVAL; |
53cbb2435
|
3034 3035 |
if (!capable(CAP_SYS_ADMIN)) return -EPERM; |
a2468cc9b
|
3036 3037 |
if (!swap_avail_heads) return -ENOMEM; |
53cbb2435
|
3038 |
p = alloc_swap_info(); |
2542e5134
|
3039 3040 |
if (IS_ERR(p)) return PTR_ERR(p); |
53cbb2435
|
3041 |
|
815c2c543
|
3042 |
INIT_WORK(&p->discard_work, swap_discard_work); |
1da177e4c
|
3043 |
name = getname(specialfile); |
1da177e4c
|
3044 |
if (IS_ERR(name)) { |
7de7fb6b3
|
3045 |
error = PTR_ERR(name); |
1da177e4c
|
3046 |
name = NULL; |
bd69010b0
|
3047 |
goto bad_swap; |
1da177e4c
|
3048 |
} |
669abf4e5
|
3049 |
swap_file = file_open_name(name, O_RDWR|O_LARGEFILE, 0); |
1da177e4c
|
3050 |
if (IS_ERR(swap_file)) { |
7de7fb6b3
|
3051 |
error = PTR_ERR(swap_file); |
1da177e4c
|
3052 |
swap_file = NULL; |
bd69010b0
|
3053 |
goto bad_swap; |
1da177e4c
|
3054 3055 3056 3057 |
} p->swap_file = swap_file; mapping = swap_file->f_mapping; |
51cc3a662
|
3058 |
dentry = swap_file->f_path.dentry; |
2130781e2
|
3059 |
inode = mapping->host; |
6f179af88
|
3060 |
|
4d0e1e107
|
3061 3062 |
error = claim_swapfile(p, inode); if (unlikely(error)) |
1da177e4c
|
3063 |
goto bad_swap; |
1da177e4c
|
3064 |
|
d795a90e2
|
3065 |
inode_lock(inode); |
51cc3a662
|
3066 3067 3068 3069 |
if (d_unlinked(dentry) || cant_mount(dentry)) { error = -ENOENT; goto bad_swap_unlock_inode; } |
d795a90e2
|
3070 3071 3072 3073 |
if (IS_SWAPFILE(inode)) { error = -EBUSY; goto bad_swap_unlock_inode; } |
1da177e4c
|
3074 3075 3076 3077 3078 |
/* * Read the swap header. */ if (!mapping->a_ops->readpage) { error = -EINVAL; |
d795a90e2
|
3079 |
goto bad_swap_unlock_inode; |
1da177e4c
|
3080 |
} |
090d2b185
|
3081 |
page = read_mapping_page(mapping, 0, swap_file); |
1da177e4c
|
3082 3083 |
if (IS_ERR(page)) { error = PTR_ERR(page); |
d795a90e2
|
3084 |
goto bad_swap_unlock_inode; |
1da177e4c
|
3085 |
} |
81e339712
|
3086 |
swap_header = kmap(page); |
1da177e4c
|
3087 |
|
ca8bd38bf
|
3088 3089 |
maxpages = read_swap_header(p, swap_header, inode); if (unlikely(!maxpages)) { |
1da177e4c
|
3090 |
error = -EINVAL; |
d795a90e2
|
3091 |
goto bad_swap_unlock_inode; |
1da177e4c
|
3092 |
} |
886bb7e9c
|
3093 |
|
81e339712
|
3094 |
/* OK, set up the swap map and apply the bad block list */ |
803d0c835
|
3095 |
swap_map = vzalloc(maxpages); |
81e339712
|
3096 3097 |
if (!swap_map) { error = -ENOMEM; |
d795a90e2
|
3098 |
goto bad_swap_unlock_inode; |
81e339712
|
3099 |
} |
f05714293
|
3100 |
|
1cb039f3d
|
3101 |
if (p->bdev && blk_queue_stable_writes(p->bdev->bd_disk->queue)) |
f05714293
|
3102 |
p->flags |= SWP_STABLE_WRITES; |
a8b456d01
|
3103 |
if (p->bdev && p->bdev->bd_disk->fops->rw_page) |
539a6fea7
|
3104 |
p->flags |= SWP_SYNCHRONOUS_IO; |
2a8f94493
|
3105 |
if (p->bdev && blk_queue_nonrot(bdev_get_queue(p->bdev))) { |
6f179af88
|
3106 |
int cpu; |
235b62176
|
3107 |
unsigned long ci, nr_cluster; |
6f179af88
|
3108 |
|
2a8f94493
|
3109 |
p->flags |= SWP_SOLIDSTATE; |
490705888
|
3110 3111 3112 3113 3114 |
p->cluster_next_cpu = alloc_percpu(unsigned int); if (!p->cluster_next_cpu) { error = -ENOMEM; goto bad_swap_unlock_inode; } |
2a8f94493
|
3115 3116 3117 3118 |
/* * select a random position to start with to help wear leveling * SSD */ |
490705888
|
3119 3120 3121 3122 |
for_each_possible_cpu(cpu) { per_cpu(*p->cluster_next_cpu, cpu) = 1 + prandom_u32_max(p->highest_bit); } |
235b62176
|
3123 |
nr_cluster = DIV_ROUND_UP(maxpages, SWAPFILE_CLUSTER); |
2a8f94493
|
3124 |
|
778e1cdd8
|
3125 |
cluster_info = kvcalloc(nr_cluster, sizeof(*cluster_info), |
54f180d3c
|
3126 |
GFP_KERNEL); |
2a8f94493
|
3127 3128 |
if (!cluster_info) {
        error = -ENOMEM;
d795a90e2
|
3129 |
goto bad_swap_unlock_inode; |
2a8f94493
|
3130 |
} |
235b62176
|
3131 3132 3133 |
for (ci = 0; ci < nr_cluster; ci++)
        spin_lock_init(&((cluster_info + ci)->lock));
ebc2a1a69
|
3134 3135 3136 |
p->percpu_cluster = alloc_percpu(struct percpu_cluster);
if (!p->percpu_cluster) {
        error = -ENOMEM;
d795a90e2
|
3137 |
goto bad_swap_unlock_inode; |
ebc2a1a69
|
3138 |
} |
6f179af88
|
3139 |
for_each_possible_cpu(cpu) { |
ebc2a1a69
|
3140 |
struct percpu_cluster *cluster; |
6f179af88
|
3141 |
cluster = per_cpu_ptr(p->percpu_cluster, cpu); |
ebc2a1a69
|
3142 3143 |
        cluster_set_null(&cluster->index);
}
7cbf31923
|
3144 |
} else { |
81a0298bd
|
3145 |
atomic_inc(&nr_rotate_swap); |
7cbf31923
|
3146 3147 |
        inced_nr_rotate_swap = true;
}
1da177e4c
|
3148 |
|
1421ef3cd
|
3149 3150 |
error = swap_cgroup_swapon(p->type, maxpages);
if (error)
d795a90e2
|
3151 |
goto bad_swap_unlock_inode; |
1421ef3cd
|
3152 |
|
915d4d7bc
|
3153 |
nr_extents = setup_swap_map_and_extents(p, swap_header, swap_map, |
2a8f94493
|
3154 |
cluster_info, maxpages, &span); |
915d4d7bc
|
3155 3156 |
if (unlikely(nr_extents < 0)) {
        error = nr_extents;
d795a90e2
|
3157 |
goto bad_swap_unlock_inode; |
1da177e4c
|
3158 |
} |
38b5faf4b
|
3159 |
/* frontswap enabled? set up bit-per-page map for frontswap */ |
8ea1d2a19
|
3160 |
if (IS_ENABLED(CONFIG_FRONTSWAP)) |
778e1cdd8
|
3161 3162 |
frontswap_map = kvcalloc(BITS_TO_LONGS(maxpages), sizeof(long), |
54f180d3c
|
3163 |
GFP_KERNEL); |
1da177e4c
|
3164 |
|
68d68ff6e
|
3165 |
if (p->bdev && (swap_flags & SWAP_FLAG_DISCARD) && swap_discardable(p)) { |
2a8f94493
|
3166 3167 3168 3169 3170 3171 3172 3173 |
/*
 * When discard is enabled for swap with no particular
 * policy flagged, we set all swap discard flags here in
 * order to sustain backward compatibility with older
 * swapon(8) releases.
 */
p->flags |= (SWP_DISCARDABLE | SWP_AREA_DISCARD |
             SWP_PAGE_DISCARD);
dcf6b7ddd
|
3174 |
|
2a8f94493
|
3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 |
/*
 * By flagging sys_swapon, a sysadmin can tell us to
 * either do single-time area discards only, or to just
 * perform discards for released swap page-clusters.
 * Now it's time to adjust the p->flags accordingly.
 */
if (swap_flags & SWAP_FLAG_DISCARD_ONCE)
        p->flags &= ~SWP_PAGE_DISCARD;
else if (swap_flags & SWAP_FLAG_DISCARD_PAGES)
        p->flags &= ~SWP_AREA_DISCARD;

/* issue a swapon-time discard if it's still required */
if (p->flags & SWP_AREA_DISCARD) {
        int err = discard_swap(p);
        if (unlikely(err))
                pr_err("swapon: discard_swap(%p): %d\n",
                        p, err);
dcf6b7ddd
|
3193 |
} |
20137a490
|
3194 |
} |
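/*
 * For illustration, the resulting discard policy per flag combination
 * (flag names as used above; hypothetical summary, not original text):
 *
 *      SWAP_FLAG_DISCARD                           -> area + page discards
 *      SWAP_FLAG_DISCARD | SWAP_FLAG_DISCARD_ONCE  -> swapon-time area discard only
 *      SWAP_FLAG_DISCARD | SWAP_FLAG_DISCARD_PAGES -> per-cluster discards only
 */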
6a6ba8317
|
3195 |
|
4b3ef9daa
|
3196 3197 |
error = init_swap_address_space(p->type, maxpages);
if (error)
d795a90e2
|
3198 |
goto bad_swap_unlock_inode; |
4b3ef9daa
|
3199 |
|
dc617f29d
|
3200 3201 3202 3203 3204 3205 3206 3207 |
/*
 * Flush any pending IO and dirty mappings before we start using this
 * swap device.
 */
inode->i_flags |= S_SWAPFILE;
error = inode_drain_writes(inode);
if (error) {
        inode->i_flags &= ~S_SWAPFILE;
822bca52e
|
3208 |
goto free_swap_address_space; |
dc617f29d
|
3209 |
} |
fc0abb145
|
3210 |
mutex_lock(&swapon_mutex); |
40531542e
|
3211 |
prio = -1; |
78ecba081
|
3212 |
if (swap_flags & SWAP_FLAG_PREFER) |
40531542e
|
3213 |
prio = |
78ecba081
|
3214 |
(swap_flags & SWAP_FLAG_PRIO_MASK) >> SWAP_FLAG_PRIO_SHIFT; |
2a8f94493
|
3215 |
enable_swap_info(p, prio, swap_map, cluster_info, frontswap_map); |
c69dbfb84
|
3216 |
|
756a025f0
|
3217 3218 |
pr_info("Adding %uk swap on %s. Priority:%d extents:%d across:%lluk %s%s%s%s%s ", |
91a27b2a7
|
3219 |
p->pages<<(PAGE_SHIFT-10), name->name, p->prio, |
c69dbfb84
|
3220 3221 |
        nr_extents, (unsigned long long)span<<(PAGE_SHIFT-10),
        (p->flags & SWP_SOLIDSTATE) ? "SS" : "",
38b5faf4b
|
3222 |
(p->flags & SWP_DISCARDABLE) ? "D" : "", |
dcf6b7ddd
|
3223 3224 |
        (p->flags & SWP_AREA_DISCARD) ? "s" : "",
        (p->flags & SWP_PAGE_DISCARD) ? "c" : "",
38b5faf4b
|
3225 |
(frontswap_map) ? "FS" : ""); |
c69dbfb84
|
3226 |
|
fc0abb145
|
3227 |
mutex_unlock(&swapon_mutex); |
66d7dd518
|
3228 3229 |
atomic_inc(&proc_poll_event);
wake_up_interruptible(&proc_poll_wait);
1da177e4c
|
3230 3231 |
error = 0;
goto out;
822bca52e
|
3232 3233 |
free_swap_address_space:
        exit_swap_address_space(p->type);
d795a90e2
|
3234 3235 |
bad_swap_unlock_inode:
        inode_unlock(inode);
1da177e4c
|
3236 |
bad_swap: |
ebc2a1a69
|
3237 3238 |
free_percpu(p->percpu_cluster);
p->percpu_cluster = NULL;
490705888
|
3239 3240 |
free_percpu(p->cluster_next_cpu);
p->cluster_next_cpu = NULL;
bd69010b0
|
3241 |
if (inode && S_ISBLK(inode->i_mode) && p->bdev) { |
f2090d2df
|
3242 3243 |
        set_blocksize(p->bdev, p->old_block_size);
        blkdev_put(p->bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
1da177e4c
|
3244 |
} |
d795a90e2
|
3245 |
inode = NULL; |
4cd3bb10f
|
3246 |
destroy_swap_extents(p); |
e8e6c2ec4
|
3247 |
swap_cgroup_swapoff(p->type); |
5d337b919
|
3248 |
spin_lock(&swap_lock); |
1da177e4c
|
3249 |
p->swap_file = NULL; |
1da177e4c
|
3250 |
p->flags = 0; |
5d337b919
|
3251 |
spin_unlock(&swap_lock); |
1da177e4c
|
3252 |
vfree(swap_map); |
8606a1a94
|
3253 |
kvfree(cluster_info); |
b6b1fd2a6
|
3254 |
kvfree(frontswap_map); |
7cbf31923
|
3255 3256 |
if (inced_nr_rotate_swap)
        atomic_dec(&nr_rotate_swap);
d795a90e2
|
3257 |
if (swap_file) |
1da177e4c
|
3258 3259 3260 3261 |
        filp_close(swap_file, NULL);
out:
if (page && !IS_ERR(page)) {
        kunmap(page);
09cbfeaf1
|
3262 |
put_page(page); |
1da177e4c
|
3263 3264 3265 |
}
if (name)
        putname(name);
1638045c3
|
3266 |
if (inode) |
5955102c9
|
3267 |
inode_unlock(inode); |
039939a65
|
3268 3269 |
if (!error)
        enable_swap_slots_cache();
1da177e4c
|
3270 3271 3272 3273 3274 |
        return error;
}

void si_swapinfo(struct sysinfo *val)
{
efa90a981
|
3275 |
unsigned int type; |
1da177e4c
|
3276 |
unsigned long nr_to_be_unused = 0; |
5d337b919
|
3277 |
spin_lock(&swap_lock); |
efa90a981
|
3278 3279 3280 3281 3282 |
for (type = 0; type < nr_swapfiles; type++) {
        struct swap_info_struct *si = swap_info[type];

        if ((si->flags & SWP_USED) && !(si->flags & SWP_WRITEOK))
                nr_to_be_unused += si->inuse_pages;
1da177e4c
|
3283 |
} |
ec8acf20a
|
3284 |
val->freeswap = atomic_long_read(&nr_swap_pages) + nr_to_be_unused; |
1da177e4c
|
3285 |
val->totalswap = total_swap_pages + nr_to_be_unused; |
5d337b919
|
3286 |
spin_unlock(&swap_lock); |
1da177e4c
|
3287 3288 3289 3290 3291 |
}

/*
 * Verify that a swap entry is valid and increment its swap map count.
 *
355cfa73d
|
3292 3293 3294 3295 3296 3297 |
 * Returns an error code in the following cases:
 * - success -> 0
 * - swp_entry is invalid -> EINVAL
 * - swp_entry is a migration entry -> EINVAL
 * - a swap-cache reference is requested but there is already one. -> EEXIST
 * - a swap-cache reference is requested but the entry is not used. -> ENOENT
570a335b8
|
3298 |
 * - a swap-mapped reference is requested but needs a continued swap count. -> ENOMEM
1da177e4c
|
3299 |
*/ |
8d69aaee8
|
3300 |
static int __swap_duplicate(swp_entry_t entry, unsigned char usage) |
1da177e4c
|
3301 |
{ |
73c34b6ac
|
3302 |
struct swap_info_struct *p; |
235b62176
|
3303 |
struct swap_cluster_info *ci; |
c10d38cc8
|
3304 |
unsigned long offset; |
8d69aaee8
|
3305 3306 |
unsigned char count;
unsigned char has_cache;
9d9a03340
|
3307 |
int err; |
1da177e4c
|
3308 |
|
eb085574a
|
3309 |
p = get_swap_device(entry); |
c10d38cc8
|
3310 |
if (!p) |
9d9a03340
|
3311 |
return -EINVAL; |
235b62176
|
3312 |
|
eb085574a
|
3313 |
offset = swp_offset(entry); |
235b62176
|
3314 |
ci = lock_cluster_or_swap_info(p, offset); |
355cfa73d
|
3315 |
|
253d553ba
|
3316 |
count = p->swap_map[offset]; |
edfe23dac
|
3317 3318 3319 3320 3321 3322 3323 3324 3325 |
/*
 * swapin_readahead() doesn't check if a swap entry is valid, so the
 * swap entry could be SWAP_MAP_BAD. Check here with lock held.
 */
if (unlikely(swap_count(count) == SWAP_MAP_BAD)) {
        err = -ENOENT;
        goto unlock_out;
}
253d553ba
|
3326 3327 3328 |
has_cache = count & SWAP_HAS_CACHE;
count &= ~SWAP_HAS_CACHE;
err = 0;
355cfa73d
|
3329 |
|
253d553ba
|
3330 |
if (usage == SWAP_HAS_CACHE) { |
355cfa73d
|
3331 3332 |
/* set SWAP_HAS_CACHE if there is no cache and entry is used */ |
253d553ba
|
3333 3334 3335 3336 3337 3338 |
if (!has_cache && count)
        has_cache = SWAP_HAS_CACHE;
else if (has_cache)             /* someone else added cache */
        err = -EEXIST;
else                            /* no users remaining */
        err = -ENOENT;
355cfa73d
|
3339 3340 |
} else if (count || has_cache) { |
253d553ba
|
3341 |
|
570a335b8
|
3342 3343 3344 |
if ((count & ~COUNT_CONTINUED) < SWAP_MAP_MAX)
        count += usage;
else if ((count & ~COUNT_CONTINUED) > SWAP_MAP_MAX)
253d553ba
|
3345 |
err = -EINVAL; |
570a335b8
|
3346 3347 3348 3349 |
else if (swap_count_continued(p, offset, count))
        count = COUNT_CONTINUED;
else
        err = -ENOMEM;
355cfa73d
|
3350 |
} else |
253d553ba
|
3351 |
err = -ENOENT; /* unused swap entry */ |
a449bf58e
|
3352 |
WRITE_ONCE(p->swap_map[offset], count | has_cache); |
253d553ba
|
3353 |
|
355cfa73d
|
3354 |
unlock_out: |
235b62176
|
3355 |
unlock_cluster_or_swap_info(p, ci); |
eb085574a
|
3356 3357 |
if (p)
        put_swap_device(p);
253d553ba
|
3358 |
return err; |
1da177e4c
|
3359 |
} |
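/*
 * Caller sketch (hypothetical, for illustration only): how the error
 * codes documented above are typically distinguished when requesting a
 * swap-cache reference.
 */
#if 0
        switch (__swap_duplicate(entry, SWAP_HAS_CACHE)) {
        case 0:         /* we now own SWAP_HAS_CACHE for this slot */
                break;
        case -EEXIST:   /* raced: someone else already added the cache */
                break;
        case -ENOENT:   /* entry is unused (or SWAP_MAP_BAD) */
                break;
        case -EINVAL:   /* invalid entry, e.g. a migration entry */
                break;
        }
#endif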
253d553ba
|
3360 |
|
355cfa73d
|
3361 |
/* |
aaa468653
|
3362 3363 3364 3365 3366 3367 3368 3369 3370 |
 * Help swapoff by noting that swap entry belongs to shmem/tmpfs
 * (in which case its reference count is never incremented).
 */
void swap_shmem_alloc(swp_entry_t entry)
{
        __swap_duplicate(entry, SWAP_MAP_SHMEM);
}

/*
08259d58e
|
3371 3372 3373 3374 3375 |
 * Increase reference count of swap entry by 1.
 * Returns 0 for success, or -ENOMEM if a swap_count_continuation is required
 * but could not be atomically allocated. Returns 0, just as if it succeeded,
 * if __swap_duplicate() fails for another reason (-EINVAL or -ENOENT), which
 * might occur if a page table entry has got corrupted.
355cfa73d
|
3376 |
*/ |
570a335b8
|
3377 |
int swap_duplicate(swp_entry_t entry) |
355cfa73d
|
3378 |
{ |
570a335b8
|
3379 3380 3381 3382 3383 |
int err = 0;

while (!err && __swap_duplicate(entry, 1) == -ENOMEM)
        err = add_swap_count_continuation(entry, GFP_ATOMIC);
return err;
355cfa73d
|
3384 |
} |
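/*
 * Caller-side sketch (hypothetical): the retry pattern described by the
 * add_swap_count_continuation() comment below. "ptl" is an assumed name
 * for the page table lock the caller holds.
 */
#if 0
        if (swap_duplicate(entry) < 0) {        /* -ENOMEM: continuation needed */
                spin_unlock(ptl);
                if (add_swap_count_continuation(entry, GFP_KERNEL))
                        return -ENOMEM;
                spin_lock(ptl);
                /* re-validate the pte, then call swap_duplicate() again */
        }
#endif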
1da177e4c
|
3385 |
|
cb4b86ba4
|
3386 |
/* |
355cfa73d
|
3387 3388 |
 * @entry: swap entry for which we allocate swap cache.
 *
73c34b6ac
|
3389 |
* Called when allocating swap cache for existing swap entry, |
355cfa73d
|
3390 |
 * This can return error codes. Returns 0 on success.
3eeba1356
|
3391 |
* -EEXIST means there is a swap cache. |
355cfa73d
|
3392 |
* Note: return code is different from swap_duplicate(). |
cb4b86ba4
|
3393 3394 3395 |
 */
int swapcache_prepare(swp_entry_t entry)
{
253d553ba
|
3396 |
return __swap_duplicate(entry, SWAP_HAS_CACHE); |
cb4b86ba4
|
3397 |
} |
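/*
 * Usage sketch (hypothetical, loosely modelled on the swap cache
 * allocation path): claim SWAP_HAS_CACHE before inserting a new page,
 * retrying while another task transiently holds the cache reference.
 */
#if 0
        int err;

        while ((err = swapcache_prepare(entry)) == -EEXIST)
                cond_resched();         /* raced with another cache owner */
        if (err)
                goto fail;              /* e.g. -ENOENT: entry went away */
        /* safe to add the freshly allocated page to the swap cache */
#endif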
0bcac06f2
|
3398 3399 |
struct swap_info_struct *swp_swap_info(swp_entry_t entry)
{
c10d38cc8
|
3400 |
return swap_type_to_swap_info(swp_type(entry)); |
0bcac06f2
|
3401 |
} |
f981c5950
|
3402 3403 |
struct swap_info_struct *page_swap_info(struct page *page)
{
0bcac06f2
|
3404 3405 |
        swp_entry_t entry = { .val = page_private(page) };
        return swp_swap_info(entry);
f981c5950
|
3406 3407 3408 3409 3410 3411 3412 |
}

/*
 * out-of-line __page_file_ methods to avoid include hell.
 */
struct address_space *__page_file_mapping(struct page *page)
{
f981c5950
|
3413 3414 3415 3416 3417 3418 3419 |
        return page_swap_info(page)->swap_file->f_mapping;
}
EXPORT_SYMBOL_GPL(__page_file_mapping);

pgoff_t __page_file_index(struct page *page)
{
        swp_entry_t swap = { .val = page_private(page) };
f981c5950
|
3420 3421 3422 |
        return swp_offset(swap);
}
EXPORT_SYMBOL_GPL(__page_file_index);
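/*
 * Illustration (hypothetical fragment): for a page in the swap cache,
 * page_private() holds its swp_entry_t, so the two helpers above yield
 * the swap file's mapping and the page-sized slot index into it.
 */
#if 0
        struct address_space *mapping = __page_file_mapping(page);
        pgoff_t slot = __page_file_index(page);  /* == swp_offset(entry) */
#endif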
1da177e4c
|
3423 |
/* |
570a335b8
|
3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 |
 * add_swap_count_continuation - called when a swap count is duplicated
 * beyond SWAP_MAP_MAX, it allocates a new page and links that to the entry's
 * page of the original vmalloc'ed swap_map, to hold the continuation count
 * (for that entry and for its neighbouring PAGE_SIZE swap entries). Called
 * again when count is duplicated beyond SWAP_MAP_MAX * SWAP_CONT_MAX, etc.
 *
 * These continuation pages are seldom referenced: the common paths all work
 * on the original swap_map, only referring to a continuation page when the
 * low "digit" of a count is incremented or decremented through SWAP_MAP_MAX.
 *
 * add_swap_count_continuation(, GFP_ATOMIC) can be called while holding
 * page table locks; if it fails, add_swap_count_continuation(, GFP_KERNEL)
 * can be called after dropping locks.
 */
int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask)
{
        struct swap_info_struct *si;
235b62176
|
3441 |
struct swap_cluster_info *ci; |
570a335b8
|
3442 3443 3444 3445 3446 |
        struct page *head;
        struct page *page;
        struct page *list_page;
        pgoff_t offset;
        unsigned char count;
eb085574a
|
3447 |
int ret = 0; |
570a335b8
|
3448 3449 3450 3451 3452 3453 |
        /*
         * When debugging, it's easier to use __GFP_ZERO here; but it's better
         * for latency not to zero a page while GFP_ATOMIC and holding locks.
         */
        page = alloc_page(gfp_mask | __GFP_HIGHMEM);
eb085574a
|
3454 |
si = get_swap_device(entry); |
570a335b8
|
3455 3456 3457 |
        if (!si) {
                /*
                 * An acceptable race has occurred since the failing
eb085574a
|
3458 |
* __swap_duplicate(): the swap device may be swapoff |
570a335b8
|
3459 3460 3461 |
                 */
                goto outer;
        }
eb085574a
|
3462 |
spin_lock(&si->lock); |
570a335b8
|
3463 3464 |
offset = swp_offset(entry); |
235b62176
|
3465 3466 |
ci = lock_cluster(si, offset); |
d8aa24e04
|
3467 |
count = swap_count(si->swap_map[offset]); |
570a335b8
|
3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 |
        if ((count & ~COUNT_CONTINUED) != SWAP_MAP_MAX) {
                /*
                 * The higher the swap count, the more likely it is that tasks
                 * will race to add swap count continuation: we need to avoid
                 * over-provisioning.
                 */
                goto out;
        }

        if (!page) {
eb085574a
|
3479 3480 |
                ret = -ENOMEM;
                goto out;
570a335b8
|
3481 3482 3483 3484 |
        }

        /*
         * We are fortunate that although vmalloc_to_page uses pte_offset_map,
2de1a7e40
|
3485 3486 |
         * no architecture is using highmem pages for kernel page tables: so it
         * will not corrupt the GFP_ATOMIC caller's atomic page table kmaps.
570a335b8
|
3487 3488 3489 |
         */
        head = vmalloc_to_page(si->swap_map + offset);
        offset &= ~PAGE_MASK;
2628bd6fc
|
3490 |
spin_lock(&si->cont_lock); |
570a335b8
|
3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 |
        /*
         * Page allocation does not initialize the page's lru field,
         * but it does always reset its private field.
         */
        if (!page_private(head)) {
                BUG_ON(count & COUNT_CONTINUED);
                INIT_LIST_HEAD(&head->lru);
                set_page_private(head, SWP_CONTINUED);
                si->flags |= SWP_CONTINUED;
        }

        list_for_each_entry(list_page, &head->lru, lru) {
                unsigned char *map;

                /*
                 * If the previous map said no continuation, but we've found
                 * a continuation page, free our allocation and use this one.
                 */
                if (!(count & COUNT_CONTINUED))
2628bd6fc
|
3510 |
goto out_unlock_cont; |
570a335b8
|
3511 |
|
9b04c5fec
|
3512 |
map = kmap_atomic(list_page) + offset; |
570a335b8
|
3513 |
count = *map; |
9b04c5fec
|
3514 |
kunmap_atomic(map); |
570a335b8
|
3515 3516 3517 3518 3519 3520 |
                /*
                 * If this continuation count now has some space in it,
                 * free our allocation and use this one.
                 */
                if ((count & ~COUNT_CONTINUED) != SWAP_CONT_MAX)
2628bd6fc
|
3521 |
goto out_unlock_cont; |
570a335b8
|
3522 3523 3524 3525 |
        }

        list_add_tail(&page->lru, &head->lru);
        page = NULL;                    /* now it's attached, don't free it */
2628bd6fc
|
3526 3527 |
out_unlock_cont:
        spin_unlock(&si->cont_lock);
570a335b8
|
3528 |
out: |
235b62176
|
3529 |
unlock_cluster(ci); |
ec8acf20a
|
3530 |
spin_unlock(&si->lock); |
eb085574a
|
3531 |
put_swap_device(si); |
570a335b8
|
3532 3533 3534 |
outer:
        if (page)
                __free_page(page);
eb085574a
|
3535 |
return ret; |
570a335b8
|
3536 3537 3538 3539 3540 3541 3542 3543 |
}

/*
 * swap_count_continued - when the original swap_map count is incremented
 * from SWAP_MAP_MAX, check if there is already a continuation page to carry
 * into, carry if so, or else fail until a new continuation page is allocated;
 * when the original swap_map count is decremented from 0 with continuation,
 * borrow from the continuation and report whether it still holds more.
235b62176
|
3544 3545 |
 * Called while __swap_duplicate() or swap_entry_free() holds swap or cluster
 * lock.
570a335b8
|
3546 3547 3548 3549 3550 3551 3552 |
 */
static bool swap_count_continued(struct swap_info_struct *si,
                                 pgoff_t offset, unsigned char count)
{
        struct page *head;
        struct page *page;
        unsigned char *map;
2628bd6fc
|
3553 |
bool ret; |
570a335b8
|
3554 3555 3556 3557 3558 3559 |
        head = vmalloc_to_page(si->swap_map + offset);
        if (page_private(head) != SWP_CONTINUED) {
                BUG_ON(count & COUNT_CONTINUED);
                return false;           /* need to add count continuation */
        }
2628bd6fc
|
3560 |
spin_lock(&si->cont_lock); |
570a335b8
|
3561 |
offset &= ~PAGE_MASK; |
213516ac0
|
3562 |
page = list_next_entry(head, lru); |
9b04c5fec
|
3563 |
map = kmap_atomic(page) + offset; |
570a335b8
|
3564 3565 3566 3567 3568 3569 3570 3571 3572 |
        if (count == SWAP_MAP_MAX)      /* initial increment from swap_map */
                goto init_map;          /* jump over SWAP_CONT_MAX checks */

        if (count == (SWAP_MAP_MAX | COUNT_CONTINUED)) { /* incrementing */
                /*
                 * Think of how you add 1 to 999
                 */
                while (*map == (SWAP_CONT_MAX | COUNT_CONTINUED)) {
9b04c5fec
|
3573 |
kunmap_atomic(map); |
213516ac0
|
3574 |
page = list_next_entry(page, lru); |
570a335b8
|
3575 |
BUG_ON(page == head); |
9b04c5fec
|
3576 |
map = kmap_atomic(page) + offset; |
570a335b8
|
3577 3578 |
                }
                if (*map == SWAP_CONT_MAX) {
9b04c5fec
|
3579 |
kunmap_atomic(map); |
213516ac0
|
3580 |
page = list_next_entry(page, lru); |
2628bd6fc
|
3581 3582 3583 3584 |
                        if (page == head) {
                                ret = false;    /* add count continuation */
                                goto out;
                        }
9b04c5fec
|
3585 |
map = kmap_atomic(page) + offset; |
570a335b8
|
3586 3587 3588 |
init_map:               *map = 0;       /* we didn't zero the page */
                }
                *map += 1;
9b04c5fec
|
3589 |
kunmap_atomic(map); |
213516ac0
|
3590 |
while ((page = list_prev_entry(page, lru)) != head) { |
9b04c5fec
|
3591 |
map = kmap_atomic(page) + offset; |
570a335b8
|
3592 |
*map = COUNT_CONTINUED; |
9b04c5fec
|
3593 |
kunmap_atomic(map); |
570a335b8
|
3594 |
} |
2628bd6fc
|
3595 |
ret = true; /* incremented */ |
570a335b8
|
3596 3597 3598 3599 3600 3601 3602 |
        } else {                        /* decrementing */
                /*
                 * Think of how you subtract 1 from 1000
                 */
                BUG_ON(count != COUNT_CONTINUED);
                while (*map == COUNT_CONTINUED) {
9b04c5fec
|
3603 |
kunmap_atomic(map); |
213516ac0
|
3604 |
page = list_next_entry(page, lru); |
570a335b8
|
3605 |
BUG_ON(page == head); |
9b04c5fec
|
3606 |
map = kmap_atomic(page) + offset; |
570a335b8
|
3607 3608 3609 3610 3611 |
                }
                BUG_ON(*map == 0);
                *map -= 1;
                if (*map == 0)
                        count = 0;
9b04c5fec
|
3612 |
kunmap_atomic(map); |
213516ac0
|
3613 |
while ((page = list_prev_entry(page, lru)) != head) { |
9b04c5fec
|
3614 |
map = kmap_atomic(page) + offset; |
570a335b8
|
3615 3616 |
                        *map = SWAP_CONT_MAX | count;
                        count = COUNT_CONTINUED;
9b04c5fec
|
3617 |
kunmap_atomic(map); |
570a335b8
|
3618 |
} |
2628bd6fc
|
3619 |
ret = count == COUNT_CONTINUED; |
570a335b8
|
3620 |
} |
2628bd6fc
|
3621 3622 3623 |
out:
        spin_unlock(&si->cont_lock);
        return ret;
570a335b8
|
3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 |
}

/*
 * free_swap_count_continuations - swapoff free all the continuation pages
 * appended to the swap_map, after swap_map is quiesced, before vfree'ing it.
 */
static void free_swap_count_continuations(struct swap_info_struct *si)
{
        pgoff_t offset;

        for (offset = 0; offset < si->max; offset += PAGE_SIZE) {
                struct page *head;

                head = vmalloc_to_page(si->swap_map + offset);
                if (page_private(head)) {
0d576d20c
|
3638 3639 3640 3641 |
                        struct page *page, *next;

                        list_for_each_entry_safe(page, next, &head->lru, lru) {
                                list_del(&page->lru);
570a335b8
|
3642 3643 3644 3645 3646 |
                                __free_page(page);
                        }
                }
        }
}
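/*
 * Worked example of the continuation "digits" above (illustrative;
 * constant values live in swap.h): once a slot's count climbs past
 * SWAP_MAP_MAX, the swap_map byte stays at SWAP_MAP_MAX | COUNT_CONTINUED
 * and the overflow accumulates in the continuation pages, each holding
 * one higher-order digit of up to SWAP_CONT_MAX. Incrementing carries
 * into the next page and decrementing borrows back, exactly like the
 * 999 + 1 and 1000 - 1 analogies in swap_count_continued().
 */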
a2468cc9b
|
3647 |
|
2cf855837
|
3648 |
#if defined(CONFIG_MEMCG) && defined(CONFIG_BLK_CGROUP) |
01c4b28cd
|
3649 |
void __cgroup_throttle_swaprate(struct page *page, gfp_t gfp_mask) |
2cf855837
|
3650 3651 |
{
        struct swap_info_struct *si, *next;
6caa6a070
|
3652 3653 3654 |
        int nid = page_to_nid(page);

        if (!(gfp_mask & __GFP_IO))
2cf855837
|
3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 |
                return;

        if (!blk_cgroup_congested())
                return;

        /*
         * We've already scheduled a throttle, avoid taking the global swap
         * lock.
         */
        if (current->throttle_queue)
                return;

        spin_lock(&swap_avail_lock);
6caa6a070
|
3668 3669 |
        plist_for_each_entry_safe(si, next, &swap_avail_heads[nid],
                                  avail_lists[nid]) {
2cf855837
|
3670 |
if (si->bdev) { |
6caa6a070
|
3671 |
blkcg_schedule_throttle(bdev_get_queue(si->bdev), true); |
2cf855837
|
3672 3673 3674 3675 3676 3677 |
                        break;
                }
        }
        spin_unlock(&swap_avail_lock);
}
#endif
a2468cc9b
|
3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 |
static int __init swapfile_init(void)
{
        int nid;

        swap_avail_heads = kmalloc_array(nr_node_ids, sizeof(struct plist_head),
                                         GFP_KERNEL);
        if (!swap_avail_heads) {
                pr_emerg("Not enough memory for swap heads, swap is disabled\n");
                return -ENOMEM;
        }

        for_each_node(nid)
                plist_head_init(&swap_avail_heads[nid]);

        return 0;
}
subsys_initcall(swapfile_init);